agda-llvm/report.lagda.tex at master · fredins/agda-llvm · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
% !TEX program = lualatex
\documentclass[9pt, twocolumn]{article}

% \usepackage{a4wide}

%\usepackage[a4paper, total={8.5in, 11in}]{geometry}
% \usepackage[a4paper]{geometry}
\usepackage[a4paper, margin=2.5cm]{geometry}
%\setlength{\textheight}{1.1\textheight}
% Support for Agda code.
\usepackage{agda}

\usepackage{fontspec}
\newfontfamily{\AgdaSerifFont}{Latin Modern Roman}
\newfontfamily{\AgdaSansSerifFont}{Latin Modern Sans}
\newfontfamily{\AgdaTypewriterFont}{Latin Modern Mono}
\renewcommand{\AgdaFontStyle}[1]{{\AgdaSansSerifFont{}#1}}
\renewcommand{\AgdaKeywordFontStyle}[1]{{\AgdaSansSerifFont{}#1}}
\renewcommand{\AgdaStringFontStyle}[1]{{\AgdaTypewriterFont{}#1}}
\renewcommand{\AgdaCommentFontStyle}[1]{{\AgdaTypewriterFont{}#1}}
\renewcommand{\AgdaBoundFontStyle}[1]{\textit{\AgdaSerifFont{}#1}}

% Small underscore
\renewcommand{\AgdaUnderscore}{\texttt{\_}}

% Bibliography
\usepackage[backend=biber, style=apa, natbib=true]{biblatex}
\addbibresource{../bibliography.bib}

% Packages
\usepackage{empheq} % for border around math
\usepackage{listings} % non-agda code blocks
\usepackage{amssymb} % math symbols
\usepackage[]{lmodern} % Latin modern font
\usepackage[labelfont=bf, textfont=it, justification=centering, singlelinecheck=false]{caption}
\usepackage{subcaption}
\usepackage{enumitem} % compact itemize
\usepackage{color}
\newcommand{\hl}[2][lightgray]{\colorbox{#1}{#2}}
\usepackage{bussproofs}
\usepackage{stackengine}
% \usepackage[skip=4pt plus1pt, indent=10pt]{parskip}

\usepackage[framemethod=tikz]{mdframed}
\mdfsetup{%
innertopmargin=1em,
innerbottommargin=1em,
innerleftmargin=1em,
innerrightmargin=1em,
linewidth=0.5pt,
}

% Cosmetics
\linespread{1.2}

\usepackage{newunicodechar}
\usepackage{colonequals}
\newunicodechar{λ}{\ensuremath{\mathnormal{\lambda}}}
\newunicodechar{Γ}{\ensuremath{\mathnormal{\Gamma}}}
\newunicodechar{Δ}{\ensuremath{\mathnormal{\Delta}}}
\newunicodechar{∀}{\ensuremath{\mathnormal{\forall}}}
\newunicodechar{∷}{\ensuremath{\mathnormal{\coloncolon}}}
\newunicodechar{ω}{\ensuremath{\mathnormal{\omega}}}
\newunicodechar{→}{\ensuremath{\mathnormal{\rightarrow}}}
\newunicodechar{⟿}{\ensuremath{\mathnormal{\rightsquigarrow}}}
\newunicodechar{ℕ}{\ensuremath{\mathbb{N}}}

\newunicodechar{₁}{\ensuremath{{}_1}}
\newunicodechar{₂}{\ensuremath{{}_2}}
\newunicodechar{₃}{\ensuremath{{}_3}}
\newunicodechar{₄}{\ensuremath{{}_4}}
\newunicodechar{₅}{\ensuremath{{}_5}}
\newunicodechar{₆}{\ensuremath{{}_6}}
\newunicodechar{₇}{\ensuremath{{}_7}}
\newunicodechar{₈}{\ensuremath{{}_8}}
\newunicodechar{₉}{\ensuremath{{}_9}}

\usepackage{listings}
\lstdefinelanguage{Treeless}{keywords={case, of, let, in}}
\lstdefinelanguage{GRIN}{keywords={case, of, store, update, unit, fetch}}
\lstset{
frame=none,
framexleftmargin=0pt,
framesep=10pt,
stepnumber=1,
numbers=none,
numbersep=7pt,
numberstyle=\ttfamily\scriptsize\color[gray]{0.4},
basicstyle=\linespread{1}\ttfamily\fontseries{l}\selectfont,
keywordstyle=\linespread{1}\ttfamily\fontseries{b}\selectfont,
captionpos=b,
language=GRIN,
commentstyle=\it,
columns=flexible,
keepspaces=true,
mathescape=true,
escapechar=!,
escapeinside={!}{!},
}

%\strutshortanchors{F}
%\def\stackalignment{l}

\newcommand{\refp}[1]{(\ref{#1})}
\newcommand{\jm}[4]{#1 $\mid$ #2 $\vdash$ #3 $\rightsquigarrow$ #4}
\newcommand{\jmdouble}[4]{\ensuremath{#1 \mid #2 \vdash\fCenter \stackengine{\stackgap}{#3\rightsquigarrow}{#4}{U}{l}{F}{F}{S}}}
\newcommand{\jmv}[4]{#1 $\mid$ #2 $\vdash$ #3 $\rightsquigarrow_v$ #4}

\newcommand{\axEmpty}{\Axiom$\fCenter$}
\newcommand{\axLet}[2]{\Axiom$#1\;=\fCenter\;#2$}
\newcommand{\uiLet}[2]{\UnaryInf$#1\;=\fCenter\;#2$}
\newcommand{\axJudgmentV}[4]{\Axiom$#1\;\mid\;#2\;\vdash\fCenter\;#3\;\rightsquigarrow_v\;#4$}
\newcommand{\axJudgment}[4]{\Axiom$#1\;\mid\;#2\;\vdash\fCenter\;#3\;\rightsquigarrow\;#4$}
\newcommand{\uiJudgment}[4]{\UnaryInf$#1\;\mid\;#2\;\vdash\fCenter\;#3\;\rightsquigarrow\;#4$}
\newcommand{\biJudgment}[4]{\BinaryInf$#1\;\mid\;#2\;\vdash\fCenter\;#3\;\rightsquigarrow\;#4$}
\newcommand{\biJudgmentAlignRight}[4]{\BinaryInf$#1\;\mid\;#2\;\vdash\fCenter\;#3\;\rightsquigarrow\;\stackMath\Shortunderstack[l]{#4}$}
\newcommand{\triJudgment}[4]{\TrinaryInf$#1\;\mid\;#2\;\vdash\fCenter\;#3\;\rightsquigarrow\;#4$}
\newcommand{\triJudgmentDouble}[4]{\TrinaryInf$#1\;\mid\;#2\;\vdash\fCenter\;\stackengine{\stackgap}{$#3\rightsquigarrow$}{$#4$}{U}{l}{F}{F}{S}$}

% https://tex.stackexchange.com/questions/87407/iterating-through-comma-separated-arguments
\ExplSyntaxOn
\NewDocumentCommand \sequence { O{\ensuremath{,}} m } { % TODO Decide appropriate spacing
  \clist_use:nn { #2 } { #1 }
}
\ExplSyntaxOff


% Custom type writer listing with equation numbers
\lstnewenvironment{typewriter}[3][none]%
  {\lstset{language=#2, numbers=#1}\minipage{\linewidth-2em}}
  {\refstepcounter{AgdaCodeNumber}\label{#3}%
   \endminipage%
   \AgdaFormatCodeNumber{\theAgdaCodeNumber}
   \newline}%

\title{Precise reference counting for lazy functional languages with interprocedural points-to analysis}
\author{Martin Fredin \\ \texttt{fredinm@chalmers.se}}
\date{2023}

\begin{document}
\maketitle

\begin{code}[hide]
open import Agda.Builtin.Nat using (suc; zero; _+_) renaming (Nat to ℕ)
open import Agda.Builtin.Strict using (primForce)

infixr 5 _∷_
data List A : Set where
  []  : List A
  _∷_ : (x : A) (xs : List A) → List A
\end{code}


% Motivation:
% - Conflict between abstract code and runtime performance
%   • It is important for verified code to be performant:
%       "Such security-critical code often lies on the critical path of larger subsystems; users therefore
%        expect security-critical code to be not only secure and reliable, but also fast" \cite{ho2023}
%   • Many optimizations are hindered by module barriers, e.g. monomorphization, defunctionalization, and inlining.
%   • As a result, polymorphic code are plagued with boxed types, dynamic dispatch and indirect function.
%   • Non-strict evalutation semantics also encourage more allocations and indirect function calls. And purity prevents in-place mutation.
%   • Automatic memory management is often mandatory in high-level languages, which limits throughput and may introduce signficant delays.
%
% - In this paper, we address indirect function calls and automatic memory management.
%
% - We belive it is neccessary to utilse whole program compilation and interprocedural analysis to compile away abstractions. Hence GRIN.
%   Perhaps only focus on memory management and reference counting and mention that we chose GRIN address other concerns such as indirect
%   function calls.
%
% - Precise reference counting is promising technique to enable in-place mutations for pure function languages (pure or immutable),
%   and to avoid delays associtated with tracing-based garbage collectors. However, Perceus have only been applied for strict functional
%   languages.
%
% Contribution:
%   • We demonstrate the applicability of combining non-strict languages with precise reference counting.
%   • We implement a a compiler backend which compiles Agda to GRIN. Then, we perform transformations on the GRIN code to
%     remove indirect function calls by an interprocedural analysis.
%   • We implement a Perceus-like algorithm for the GRIN syntax which is much more low-level then previous implementations.
%   • We implement a GRIN interpreter and a LLVM code generator.
%
% Related Work and Discussions
% - ASAP uses compile-time garbage collection using interprocedural analysis. Would be interesting to apply some of these techniques to reduce reference counting operations.
% - Two objections two whole program compilation and reference counting are long compile times and innability to collect cyclic data structures, respectively.
%   We have been "avoiding" these uses for now. Hopefully, someone will come up with a nice solution. Some reference counting implementations like Swift and Python uses
%   weak references. Koka do not handle cycles (Or doesn't have cycles). Roc does not have cycles. Lean does not have uncontrollable recursion nor mutable references, so it
%   is impossible to create circular references.
% - Our precise reference counting algorithm targets GRIN but perhaps it can also be used in other simliar strict and low-level intermediate languages like the STG.
% - Our implementation is not ready for benchmarks. We would like to compile a larger subset of Agda, implement the rest of optimizing transformations, and reuse-analysis.
% - We would also like to formalize our precise reference counting algorithm.
%

% TODO
% - Focus more on motivation: problem of high-level verified code and efficient evaluation and memory management plays a huge role.
% - Maybe use non-strict/strict instead of lazy/eager
% - Agda have normal order evaluation (This is maybe only true in compile-time normalization). Compiler writers can decide the most
%   efficient evaluation order.
% - Write in maybe abstract and introduction that we "cheat" by transforming non-strictness into a
%   strict language
% - Also write that we implement Perceus for a much more lower-level language (GRIN)
% - Fix bad writing
% - Think about which is the best de Brujin representation

% - Section: introduction
%   • Write a motivation for the work. Why is this important for "outsiders"?
%      - (Non-strictness is nice? Memory management / cache-fredliness is super important. Conflict between nice polymorphic code and performance)
%      - "Such security-critical code often lies on the critical path of larger subsystems; users therefore
%         expect security-critical code to be not only secure and reliable, but also fast" \cite{ho2023}
% - Section: codegen/interpreter
%   • Mention that the layout is simple because we need to give it more thought
%     before implementing the reuse analysis
% - Section: Related work
%

\begin{abstract}
Precise reference counting is a technique by \citeauthor{reinking2021} that uses ownership to deallocate objects as soon as possible.
The algorithm is called Perceus, and as of this writing, it has only been impl\-emented for eager functional languages.
This paper describes the implementation of a new lazy compiler backend for the Agda programming lang\-uage with precise reference counting.
The compiler uses \citeauthor{johnsson1991} and \citeauthor{boquist1999}'s intermediate language GRIN to compile lazy programs.
GRIN uses an inter\-procedural \mbox{points-to} analysis to inline the evaluation of suspended computations.
We extend GRIN with a variant of Perceus, and demonstrate the applic\-ability of combining lazy functional programming with precise reference counting by developing a GRIN interpreter and an LLVM IR code generator.
  Due to GRIN and reference counting, our compiler does not require an external runtime system%
\footnote{Besides initialization code (crt0) and the C standard library, libc.}%
 , which is uncommon for lazy functional languages.%

\end{abstract}

\section{Introduction}

Reference counting \citep{collins1960} is a memory management technique which can detect and free resources as soon as they are no longer needed, allowing memory reuse and destructive updates.
Common reference counting algorithms are easy to implement; each allocation contains an extra field which tracks the number of references to an object.
When the reference count reaches zero, the heap space occupied by the object is reclaimed.
The number of references to an object is updated by interleaved reference counting operations (\texttt{dup} and \texttt{drop}), which increments and decrements the reference count at runtime.
As a result of the interleaved collection strategy, memory usage remain low and the throughput is continuous throughout the computation\footnote{Reference counted programs may introduce pauses similar to tracing garbage collectors. For example, when dropping a long linked list all at once.} \citep{jones1996}.
Still, tracing garbage collectors are usually favored over reference counting implementations due to cheaper allocations, higher throughput, and the ability to collect cyclic data structures.

\citet{reinking2021} reexamine reference counting with a new approach, utilizing static guarantees to make the algorithm \emph{precise} so objects can be deallocated as soon as possible.
They present a formalized algorithm called Perceus, which ensures precision.
Perceus is implemented in the functional language Koka, along with optimizations for reducing reference counting operations and reusing memory.
This builds upon previous work by \citet{ullrich2021} in the Lean programming language and theorem prover.

% No external runtime library
% TODO write: implementation of lazy language have historically required a heavy runtime system.
%             Turners combinatory interpreter, GHC runtime system for evaluation for suspended computations.
%             GRIN is "middle way" language which is able to describe evaluation of suspended computations within
%             the language. Therefore, the only runtime system is the need for a garbage collector. In our work,
%             we remove this restriction by also moving memory management into the language.

Both Koka and Lean are, however, eagerly evaluated.
Lazy languages pose an extra challenge for compiler writers because of the unintuitive control flow.
Additionally, previous implementations of lazy languages require an external runtime system to manage evaluation of suspended compuation and garbage collection \citep{johnsson1984, augustsson1984, jones1992, turner1979}.
\citet{johnsson1991}, presents a method for evaluating suspended computations without an external runtime system.
This is accomplished in the intermediate language GRIN which can express evaluation of suspended computations as a regular procedure.
In this paper, we extend GRIN with a variant of the Perceus algorithm.
This alleviates the need for an external runtime system because memory is automatically managed by interleaved reference counting operations.
We implement these ideas in a new compiler backend for the Agda programming language and proof assistant.

% In this paper, we develop a new lazy compiler backend for the Agda programming language and proof assistant, which uses GRIN extended with a variant of the Perceus algorithm.

% In this paper, we adapt the Perceus algorithm to a new lazy compiler backend for the Agda programming language and proof assistant.
% We use an intermediate languages called GRIN \citep{johnsson1991}, which can express the evaluation of suspended compuations within the language instead of a runtime system.
% As a first step, we transform Agda into an intermediate language called GRIN \citep{johnsson1991}.
% We make the following contributions:
% \begin{itemize}[nosep,topsep=0pt,parsep=0pt,partopsep=0pt]
% \item We adapt the Perceus algorithm to language with explicit memory operations.
% \item We demonstrate the applicability of combining lazy functional programming with precise reference counting by developing a GRIN interpreter and an LLVM code generator.


% Boquist (1999) mentioned the need for default alternatives in section 10.2.1

% "It would be an interesting experiment to be able to write the garbage collector in GRIN just as we can express other parts of the runtime
%  system in GRIN, for example EVAL (see section 4.1)." (Boquist, 1995, p.280)
%\item We have extended the syntax of GRIN language with \emph{default alternatives} for case expressions, and an indexed update operation.
%\item We have extended GRIN case expressions with \emph{default alternatives} which mentioned in future work in \citet{boquist1999}
%\item We give the syntax and semantics of a variant of GRIN. We can describe memory management in our variant of GRIN,
      %an thereby free our implementation for any runtime system similar to how evaluation of suspended computations are done
      %at a language level in GRIN.
%\item We adapt the Perceus algorithm for our variant of GRIN. (lower level, first-order)
%\item We demonstrate the applicability of combining lazy functional programming with precise reference counting.
% \end{itemize}


% Contribution:
%   • We demonstrate the applicability of combining non-strict languages with precise reference counting.
%   • We implement a a compiler backend which compiles Agda to GRIN. Then, we perform transformations on the GRIN code to
%     remove indirect function calls by an interprocedural analysis.
%   • We implement a Perceus-like algorithm for the GRIN syntax which is much more low-level then previous implementations.
%   • We implement a GRIN interpreter and a LLVM code generator.

% Contributions:
%   • We describe a rare reference counting implementation for a lazy functional programming language.
%   • We adapt the Perceus algorithm for a lower level language with explict memory operations.
%   • We demonstrate the applicability of combining lazy functional programming wth precise reference counting.
%   • We extend GRIN to enable memory management within the language.
%\begingroup
%\large{TODO SKRIV RENT}
%\endgroup
%\normalfont
%\begin{itemize}
%\item expand on laziness (see boquist1996). Why is laziness important? Osasaki pure functional data structures.
%\item Lazy languages are hard optimize. For example, common implementations of laziness are implemented we indirect functional calls to
%   evaluated suspended computations (johnsson1984, Peyton Jones 1992). Indirect function calls prevent optimization such as
%   inlining and is proven to cache misses.
%\item In this paper, we develop a new lazy compiler backend for the Agda programming language and proof assistant.
%      Our implementation uses the intermediate language GRIN to compile lazy programs. GRIN uses an interprocedural points-to analysis
%      to eliminate all indirect function calls.
%\item We extend GRIN with variant of the Perceus algorithm for precise reference counting.
%\item Contributions:
%  \begin{itemize}
%  \item Demonstrate the applicability of combining lazy function programming with precise reference counting.
%  \item Adapt the Perceus algorithm to a much lower-level language than previous implementations
%  \item We develop GRIN interpreter and a LLVM code generator to compile GRIN to machine code.
%  \end{itemize}
%\end{itemize}

% \subsection{Syntax}
% TODO
% • We choosed GRIN because of powerful optimizations and simple. But, we belive that, precise reference counting could also be applied
%   to languages like GHC's Core (PJ 96) and STG (92).

\section{Graph Reduction \\ Intermediate Notation}
\label{sec:grin}
In \citeyear{johnsson1991}, \citeauthor{johnsson1991} presented the Graph Reduction Intermediate Notation (GRIN) as an imperative version of the G-machine \citep{johnsson1984}, where lexically scoped variables are stored in registers instead of on the stack.
Later, GRIN was reformulated with a more functional flavor \citep{boquist1995}.
% FIXME previously this was "for the internal representation of Agda"
In this project, we introduce an additional variant of GRIN adapted for a subset of Agda and precise reference counting.
The syntax of our variant is shown in \mbox{Figure \ref{fig:grin-syntax}}.

A defining feature of GRIN is that suspended computations and higher-order functions are defunctionalized \citep{reynolds1972}, by
an interprocedural analysis called the heap points-to analysis \citep{boquist1999}.
As a result, all unknown function calls are eliminated.
Due to defunctionalization, all GRIN values are weak head normal forms.
% FIXME
However, in order to distinguish between complex values (closures) and simple values (literals and data types),
future mentions of \emph{suspended computation} and \emph{weak head normal form} refer to the representation in the corresponding Agda program.


\begingroup
\setlength{\fboxsep}{1em} % padding for border
\begin{figure*}[htbp]
\centering
\begin{empheq}[box=\fbox]{align*}
% TODO fix rest of emph
&\begin{array}{l l l l}
term & ::=    & \emph{term} \; ; \; \lambda \; \emph{lpat} \rightarrow \emph{term}                                      & \; \text{binding} \\
     & \;\; \mid  & \texttt{case} \; \emph{val} \; \texttt{of} \; \emph{term} \; \{\emph{cpat\;\rightarrow\;term}\,\}*  & \; \text{case} \\
     & \;\; \mid  & val \; \{val\}*                                                                                     & \; \text{application} \\
     & \;\; \mid  & \texttt{unit} \; val                                                                                & \; \text{return value} \\
     & \;\; \mid  & \texttt{store} \; val                                                                               & \; \text{allocate new heap node} \\
     & \;\; \mid  & \texttt{fetch} \; \{tag\} \; n \;  \{i\}                                                            & \; \text{load heap node} \\
     & \;\; \mid  & \texttt{update} \; \{tag\} \; n \; \{i\} \; val                                                     & \; \text{overwrite heap node} \\
     & \;\; \mid  & \texttt{unreachable}                                                                                & \; \text{unreachable} \\
\end{array} \\ \\
&\begin{array}{l l l l}
val & ::=     & \emph{tag} \; \{\emph{val}\,\}* & \; \text{constant node} \\
    & \;\; \mid & \emph{x$_n$} \; \{\emph{val}\,\}*   & \; \text{variable node} \\
    & \;\; \mid & \emph{tag}      & \; \text{single tag} \\
    & \;\; \mid & ()              & \; \text{empty} \\
    & \;\; \mid & \emph{lit}      & \; \text{literal} \\
    & \;\; \mid & \emph{x$_n$}        & \; \text{variable (de Bruijn index)} \\
    & \;\; \mid & \emph{def}      & \; \text{function definition} \\
    & \;\; \mid & \emph{prim}     & \; \text{primitive definition} \\
\end{array} \\ \\
&\begin{array}{l l l l}
lpat & ::=       & tag \; \{x\}* & \; \text{constant node pattern} \\
     & \;\; \mid & x \; \{x\}*   & \; \text{variable node pattern} \\
     & \;\; \mid & ()            & \; \text{empty pattern} \\
     & \;\; \mid & x             & \; \text{variable pattern} \\
\end{array} \\ \\
&\begin{array}{l l l l}
cpat & ::=       & tag \; \{x\}*   & \; \text{constant node pattern} \\
     & \;\; \mid & tag             & \; \text{single tag pattern} \\
     & \;\; \mid & lit             & \; \text{literal pattern} \\
\end{array} \\ \\
&\begin{array}{l l l l}
tag  & ::=       & ctag & \; \text{constructor tag} \\
     & \;\; \mid & ftag & \; \text{function tag} \\
     & \;\; \mid & ptag & \; \text{partial application tag} \\
\end{array} \\ \\
&\begin{array}{l l}
\{...\}  & \text{means 0 or 1 times}    \\
\{...\}* & \text{means 0 or more times} \\
\end{array}
\end{empheq}
\caption{GRIN syntax. }
\label{fig:grin-syntax}
\end{figure*}
\endgroup


\subsection{Code generation}
% FIXME beskriv lambda lifting istället.
Currently, our compiler only accepts a subset of Agda which is lambda lifted, first-order, and monomorphic.%
\footnote{%
Some lambda expressions and higher-order functions compile successfully because they are reduced to another representation during optimization.
See Section \ref{sec:discussion-and-future-work} for an example.
}
Following is an Agda program that computes the sum of the first 100 numbers in descending order.
%\begin{figure}[hb!]
\begin{code}[number=agda:program]
downFrom : ℕ → List ℕ
downFrom zero = []
downFrom (suc n) = n ∷ downFrom n

sum : List ℕ → ℕ
sum [] = 0
sum (x ∷ xs) = x + sum xs

main = sum (downFrom 100)
\end{code}
%\caption{Agda example program}
%\label{fig:agda-program}
%\end{figure}

Before generating GRIN code, the program is tranformed into an untyped lambda calculus with let and case expressions called Treeless \citep{hausmann2015}.
The Treeless representation uses administrative normal form.
Hence, case expressions only scrutinize variables, patterns are not nested, and functions are only applied to variables.
Following is the Treeless representation of \AgdaFunction{downFrom} \refp{agda:program}.
The implementation uses de Bruijn indices to represent variables, but this paper uses variable names to make it human-readable.

\begin{typewriter}{Treeless}{treeless:downFrom}
downFrom x7 = case x7 of
  0 → []
  _ → let x5 = 1
          x4 = _-_ x7 x5
          x3 = downFrom x4 in
      _$∷$_ x4 x3
\end{typewriter}
% cite De Bruijn 72
% uneccessary?
% During this phase, we also transform the program so applications only take variables as operands.

% TODO write that GRIN is low level comparatively.
% FIXME monadic operations instead of let expressions???
% TODO Be specific! Don't write "is simliar". Instead write "Translating Treeless to GRIN is straight forward..."

% GRIN is closer to the machine compared to Treeless:
When compiling Treeless to GRIN we need to be explicit about evaluation of suspended compuations and memory operations.
GRIN uses a builtin state monad to sequence compuations.
The monadic operations are \lstinline{unit}, bind (written "\lstinline{;}"), \lstinline{store}, \lstinline{fetch}, and \lstinline{update}.
For example, to compile the Treeless expression
\begin{lstlisting}[language=Treeless, xleftmargin=1em]
let x = $\emph{val}$ in foo x
\end{lstlisting}
lazily, we allocate the value and pass the pointer as an argument to the translated function
% FIXME show in some way that the resulting "foo" is not the same as the original "foo"
\begin{lstlisting}[language=GRIN, xleftmargin=1em]
store $\emph{val}$ ; λ x$_{ptr}$ → foo x$_{ptr}$
\end{lstlisting}
Here, \emph{val} must be a constant node value, which is a tag followed by a sequence of arguments (see \mbox{Figure \ref{fig:grin-syntax}}).
We can pattern match on a tag with the case expression to determine the kind.
Tags are prefixed with either a C if it is a constructor value or an F for suspended function applications.
The node arguments are usually pointers to other heap-allocated nodes, but they can also be unboxed values.
For example, the boxed integer tag \lstinline{Cnat} accepts one unboxed integer.
Following is the corresponding GRIN representation of \lstinline{downFrom} \refp{treeless:downFrom}.
Note the explicit call to \lstinline{eval} prior to the case expression which forces weak head normal form.

\begin{typewriter}{GRIN}{grin:downFrom}
downFrom x7 =
  eval x7 ; λ Cnat x6 →
  case x6 of
    0 → unit (C[])
    _ →
      store (Cnat 1) ; λ x5 →
      store (F_-_ x7 x5) ; λ x4 →
      store (FdownFrom x4) ; λ x3 →
      unit (C_$∷$_ x4 x3)
\end{typewriter}%

\label{sec:grin-interpreter}
\subsection{Interpreter}
The typing of the semantics of our language are presented in \mbox{Figure \ref{fig:grin-semantics}}.
We use two semantic functions, $\mathcal{E}$ and $\mathcal{V}$, to assign semantic values to terms and syntactic values, respectively.
The semantic values are are a subset of the syntactic values with an additional construct representing undefined behaviour ($\bot$).
Moreover, we have a new construct for heap allocated nodes.
A heap allocated node consist of a reference count, a tag, and a sequence of values.

We have implemented the semantics as an interpreter, to test that transformations do not change the semantics of the program.
% FIXME lower level what? nature?
Due to the lower level of GRIN, we can also collect information about resource usage.
For instance, our example program \refp{agda:program} allocates the following nodes: 101 \lstinline{Cnat}, 101 \lstinline{FdownFrom}, 100 \lstinline{F_-_},  and 100 \lstinline{Fsum}.
In total, our program allocates 402 nodes and peak size is 202 nodes.
With this information we can estimate the performance characteristics of the program and identify optimization opportunities (more on this in Section \ref{sec:discussion-and-future-work}).

\begingroup
\setlength{\fboxsep}{1em} % padding for border
\begin{figure}[htbp]
\centering
\begin{empheq}[box=\fbox]{align*}
\begin{array}{l}
\emph{SemVal} = \emph{tag} \; \{\emph{val}\,\}* \mid \emph{tag} \mid \emph{loc} \mid \emph{()} \mid \emph{lit} \mid \bot \\
\emph{HeapNode} = \emph{ref-count} \; \emph{tag} \; \{\emph{val}\,\}* \\
\emph{Stack} = \emph{Var} \rightarrow  \emph{Loc} \\
\emph{Heap} = \emph{Loc} \rightarrow  \emph{HeapNode} \\
\\
\mathcal{E} : \emph{Stack} \rightarrow \emph{Heap} \rightarrow \emph{Term} \rightarrow \emph{SemVal} \times \emph{Heap} \\
\mathcal{V} : \emph{Stack} \rightarrow \emph{Val} \rightarrow \emph{SemVal} \times \emph{Heap} \\
\end{array}
\end{empheq}
\caption{GRIN semantic functions. }
\label{fig:grin-semantics}
\end{figure}
\endgroup


\subsection{Analysis and transformations}
\label{sec:analysis-and-transformations}
The most important GRIN transformation is \emph{eval inlining}.
\lstinline{eval} is a normal GRIN function which forces a suspended computation to weak head normal form.
For instance, "\lstinline{eval x7 ; λ Cnat x6 → ...}" \refp{grin:downFrom} evaluates the value at the pointer \lstinline{x7} to a boxed integer \lstinline{Cnat x6}.
Eval inlining generates a specialized \lstinline{eval} function for each call site.
Following \refp{grin:downFrom-inlined} is the \mbox{eval-inlined} version of \lstinline{downFrom} \refp{grin:downFrom}.
To evaluate a suspended computation, we load the node from the heap using \lstinline{fetch} (line 2).
Then, we pattern match on the possible nodes.
Constructor nodes are already in weak head normal form so they are left unchanged (line 4).
Function nodes are evaluated by applying the corresponding function to the arguments (line 5).
Finally, the heap is updated with the evaluated value (line 7).

\begin{typewriter}[left]{GRIN}{grin:downFrom-inlined}
downFrom x7 =
  (fetch x7 ; λ x34 →
   (case x34 of
      Cnat x36 → unit (Cnat x36)
      F_-_ x37 x38 → _-_ x37 x38
   ) ; λ x35 →
   update x7 x35 ; λ () →
   unit x35
  ) ; λ Cnat x6 →
  case x6 of
    0 → unit (C[])
    _ →
      store (Cnat 1) ; λ x5 →
      store (F_-_ x7 x5) ; λ x4 →
      store (FdownFrom x4) ; λ x3 →
      unit (C_$∷$_ x4 x3)
\end{typewriter}
The pattern-matching step (lines 3-5) requires a set of possible nodes for each abstract heap location.
One option is to pattern match on all the possible nodes, but this does not scale to large programs.
Instead, we use the \emph{heap points-to} analysis \citep{johnsson1991} to create a conservative approximation.%
\footnote{%
The analysis does not consider the context or order (flow) of operations, which means that certain phases of the analysis can be executed in parallell and cached.

}
The analysis is interprocedural, meaning that multiple functions need to be analyzed together.
We will not go into detail about the algorithm, as it is thoroughly described in \citep{boquist1996, johnsson1991}.
Instead, this paper will only provide a general intuition of the algorithm.

Consider the inlined evaluation in \mbox{\lstinline{downFrom} \refp{grin:downFrom-inlined}}.
There are two tags in the case expression: \lstinline{F_-_} (line 4) and \lstinline{Cnat} (line 5).
The former comes from the suspended recursive call inside \lstinline{downFrom} (line 15).
The latter is from the \lstinline{update} operation (line 7), and the suspended call to \lstinline{downFrom} (line 3) in the main function:

\begin{typewriter}[left]{GRIN}{grin:main}
main =
  store (Cnat 100) ; λ x20 →
  store (FdownFrom x20) ; λ x19 →
  sum x19 ; λ Cnat x18 →
  printf x18
\end{typewriter}

\citeauthor{boquist1999}'s thesis presents 24 transformations divided into two groups: simplifying transformations and optimizing
transformations \citep{boquist1999}.
The simplifying transformations are necessary for the LLVM IR code generator and are all implemented, except \emph{inlining calls to apply}
which is used for higher-order functions.
The optimizing transformations significantly alter the program and achieve similar effects to deforestation \citep{wadler1988} and listlessness \citep{wadler1984}.
% FIXME deforestation is the algorithm for getting listlessness/Treelessness
% TODO Instead, mention that it eliminates allocations and unboxing, or constant folding?
Of these, we have only implemented \emph{copy propagation} and \emph{constant propagation}.
This project aims to combine lazy functional programming with precise reference counting.
Producing the most optimized code is out of scope.
However, this is something we would like to explore in future research.

\section{Precise reference counting}
\label{sec:precise}
After the GRIN transformations, we insert reference counting operations to automatically manage memory.
We use an algorithm based on Perceus \citep{reinking2021}.
Perceus is a deterministic syntax-directed algorithm for the linear resource calculus $λ₁$, which is an untyped lambda calculus extended with explicit binds and pattern matching.
Our implementation uses GRIN which have explicit memory operations and different calling conventions.
In this section, we give a brief overview of the algorithm and describe two optimizations: \emph{drop specialization} and \emph{dup/drop fusion}.
We also discuss some challenges when adapting Perceus to GRIN in Section \ref{sec:perceus-challanges}.


The algorithm uses two sets of resource tokens; an owned environment $\Gamma$ and a borrowed environment $\Delta$.
% Explain isomorphism of resource tokens and reference
Elements in the owned environment must be consumed exactly once.
We consume a value by calling the function \lstinline{drop}, or by transferring ownership to another consumer.
An example of this is \lstinline{main} \refp{grin:main} which requires no reference counting operations because it transfers ownership of both of its allocations.
% The allocation
% \begin{lstlisting}[xleftmargin=1em]
% store (Cnat 100) ; λ x20 →
% \end{lstlisting}
% is consumed by the suspended computation
% \begin{lstlisting}[xleftmargin=1em]
% store (FdownFrom x20) ; λ x19 →
% \end{lstlisting}
% which in turn is consumed by
% \begin{lstlisting}[xleftmargin=1em]
% sum x19
% \end{lstlisting}
The allocation "\lstinline{store (Cnat 100) ; λ x20 →}" \mbox{(line 2)} is consumed by the suspended computation "\lstinline{store (FdownFrom x20) ; λ x19 →}" \mbox{(line 3)}, which in turn is consumed by "\lstinline{sum x19}" (line 4).
Elements in the borrowed environment can only be applied to non-consuming operations, such as pattern matching.
We can promote an element from the borrowed environment to the owned environment by calling the function \lstinline{dup}.
\ref{fig:perceus-rules}

\begin{figure*}[hptb]
\begin{mdframed}
\centering
% FETCH-OPAQUE
\AxiomC{}
\RightLabel{\textsc{fetch-opaque}}
\UnaryInfC{\jm{\Delta}{\Gamma}{\lstinline|fetch| x$_m$ [1]}{\lstinline|fetch| x$_m$ [1]}}
\DisplayProof
%\\
\hspace{1em}
% ⊥
\AxiomC{}
\RightLabel{\textsc{unreachable}}
\uiJudgment{\Delta}{\Gamma}{\bot}{\bot}
\DisplayProof
\\
\vspace{1em}
% STORE
\AxiomC{\jmv{\Delta}{\Gamma}{v}{\Gamma$_d$}}
\RightLabel{\textsc{store}}
  \UnaryInfC{\jm{\Delta}{\Gamma}{\lstinline|store|\;v}{dup\;\Gamma$_d$\;;\;\lstinline|store|\;v$^\prime$}}
\DisplayProof
\hspace{1em}
% UNIT
\AxiomC{\jmv{\Delta}{\Gamma}{v}{\Gamma$_d$}}
\RightLabel{\textsc{unit}}
\UnaryInfC{\jm{\Delta}{\Gamma}{\lstinline|unit|\;v}{dup\;\Gamma$_d$\;;\;\lstinline|unit|\;v$^\prime$}}
\DisplayProof
\\
\vspace{1em}
% APP
\axLet{\Gamma_i}%
  {(\Gamma\;-\;\sequence{\Gamma_{i+1},\dots,\Gamma_n})\;\cap\;fv(v_i)}
\axJudgment%
  {\sequence{\Delta,\Gamma_{i+1},\dots,\Gamma_n}}%
  {\Gamma_i}{v_i}{\Gamma_{d_i}}
\RightLabel{\textsc{app}}
\biJudgment{\Delta}{\Gamma}%
  {f\;\{\sequence{v_i,\dots,v_n}\}}%
  {dup\;(\sequence{\Gamma_{d_1},\dots,\Gamma_{d_n}})\;;\;f\;\{\sequence{v_i,\dots,v_n}\}}%
\DisplayProof
\\
\vspace{1em}
% CASE
\axLet{\Gamma_i}{\Gamma\;\cap\;fv(t_i)}
\axLet{\Gamma_d}{\Gamma\;-\;\Gamma_i}
\axJudgment{\Delta}{\Gamma_i}{t_i}{t^\prime_i}
\RightLabel{\textsc{case}}
\triJudgmentDouble{\Delta}{\Gamma}%
  {\mbox{\lstinline|case|}\;v\;t_1\;\{\sequence{tag_2\rightarrow\:t_2,\dots,tag_n\rightarrow\:t_n}\}}%
  {\mbox{\lstinline|case|}\;v\;t^\prime_1\;\{\sequence{tag_2\rightarrow\:drop\;\Gamma_{d_2}\;;\;t^\prime_2,\dots,tag_n\rightarrow\:drop\;\Gamma_{d_n}\;;\;t^\prime_n}\}}
\DisplayProof
\\
\vspace{1em}
% UPDATE
\axJudgmentV{\Delta}{\Gamma}{v}{\Gamma_d}
\axLet{k}{arity(tag)}
\RightLabel{\textsc{update}}
\biJudgmentAlignRight{\Delta}{\Gamma}%
  {\mbox{\lstinline|update|}\;[tag^\prime/tag]\;x_n\;v}%
  {%
    {\mbox{\lstinline|fetch|}\;tag\;x_n\;[2]\;;\;\lambda\rightarrow} %
    {drop\;x_0\;;} %
    {\dots} %
    {\mbox{\lstinline|fetch|}\;tag\;x_{n+k-1}\;[k+1]\;;\;\lambda\rightarrow} %
    {drop\;x_0\;;} %
    {dup\;\Gamma_d\;;} %
    {\mbox{\lstinline|update|}\;[tag^\prime/tag]\;x_{n+k}\;v}%
  }
\DisplayProof
\\
\vspace{1em}
% BIND-FETCH-DUP
\Axiom$x_0\in\fCenter\;fv(t)$
\axLet{\Gamma^\prime}{\sequence{(\Gamma\;\cap\;fv(t)),\{x_0\}}}
\noLine
\uiLet{\Gamma_d}{\Gamma\;-\;fv(t)}
\axJudgment{\Delta}{\Gamma^\prime}{t}{t^\prime}
\RightLabel{\textsc{bind-fetch-dup}}
\triJudgment{\Delta}{\Gamma}%
  {\mbox{\lstinline|fetch|\;tag\;[i]}\;;\;\lambda\rightarrow\;t}%
  {\mbox{\lstinline|fetch|}\;tag\;[i]\;;\;\lambda\rightarrow\;dup\;x_0\;;\;drop\;\Gamma_d\;;\;t^\prime}
\DisplayProof
\\
\vspace{1em}
% BIND-FETCH
\Axiom$x_0\not\in\fCenter\;fv(t)$
\axLet{\Gamma_d}{\Gamma\;-\;fv(t)}
\axJudgment{\sequence{\Delta,\{x_0\}}}{\Gamma\;\cap\;fv(t)}{t}{t^\prime}
\RightLabel{\textsc{bind-fetch}}
\triJudgment{\Delta}{\Gamma}%
  {\mbox{\lstinline|fetch|}\;tag\;x_n\;[i]\;;\;\lambda\rightarrow\;t}%
  {\mbox{\lstinline|fetch|}\;tag\;x_n\;[i]\;;\;\lambda\rightarrow\;drop\;\Gamma_d\;;\;t^\prime}
\DisplayProof
\\
\vspace{1em}
% BIND
\axLet{\Gamma_2}{(\Gamma,\;bv(p))\;\cap\;fv(t_2)}
\noLine
\uiLet{\Gamma_1}{(\Gamma\;\cap\;ov(t_1))\;-\;\Gamma_2}
\noLine
\uiLet{\Gamma_d}{(\Gamma,\;bv(p))\;-\;\Gamma_1,\;\Gamma_2}
\AxiomC{\jm{\Delta,\;\Gamma$_d$,\;\Gamma$_2$}{\Gamma$_1$}{t$_1$}{t$^\prime_1$}}
\AxiomC{\jm{\Delta}{\Gamma$_2$}{t$_2$}{t$^\prime_2$}}
\RightLabel{\textsc{bind}}
\TrinaryInfC{\jm{\Delta}{\Gamma}{t$_1$\;;\;\lambda\;p\;$\rightarrow$\;t$_2$}{t$^\prime_1$\;;\;\lambda\;p\;$\rightarrow$\;drop\;\Gamma$_d$\;;\;t$^\prime_2$}}
\DisplayProof
\end{mdframed}
\caption{Perceus rules}
\label{fig:perceus-rules}
\end{figure*}

Figure \ref{fig:drop-insert} presents the function \lstinline{sum} with reference counting operations inserted and highlighted in gray.
Many aspects of adapting Perceus to GRIN are present in \lstinline{sum}.
On line 2, "\lstinline{fetch x14 [2] ; λ$_\Gamma$ x40 →}" extends the owned environment with a resource token for the variable \lstinline{x40}.
The resource token is subsequently consumed by "\lstinline{downFrom x40 ; λ$_\Delta$ x59 x60 x61 →}", which in turn extends the borrowed environment with tokens for \lstinline{x60}  and \lstinline{x61}.
There is no resource token for \lstinline{x59} because the first argument of a node is always a tag, represented by an unboxed integer stored in registers.
Similarly, unannotated bindings, such as "\lstinline{update x14 (C[]) ; λ () →}" \mbox{(line 6)}, do not extend any of the environments.
Additionally, we can conclude that \lstinline{fetch} and \lstinline{update} must be borrowing operations, because both use \lstinline{x14} prior to it being dropped (line 7).

%\mbox{(line 3)}, which extends borrowed environment with resource tokens to \lstlisting{x60} and \lstlisting{x61}.
% \begin{itemize}[nosep,topsep=0pt,parsep=0pt,partopsep=0pt]
% \item Owning binds: "
% \item Borrowed binds: "\lstinline{downFrom x40 ; λ$_\Delta$ x59 x60 x61 →}" \mbox{(line 3)}
% \item Local binds:
% \end{itemize}
%For example, we can conclude that the operations \lstinline{fetch} and \lstinline{update} must be borrowing operations.
%This is evident because both \lstinline{fetch} and \lstinline{update} use \lstinline{x14} prior to it being explicitly dropped.
% TODO perhaps introduce a new syntax for borrowed and owned lambda $\lambda_{\delta}$
% There a three types of abstractions: Abstractions which binds to the owned environment, abtractions which binds the borrowed envirionment, and abstractions which doesn't bind at all.
%
% Another obeservation is that there are now three kinds of binds.
%We can also conclude that \lstinline{fetch} bind variables that extend the owned environment.
%Meanwhile, the bound variables (as a result) of function applications extend the borrowed environment.

As a result of GRIN's explicit memory operations, we can describe the Perceus primitives \lstinline{dup}, \lstinline{drop}, \lstinline{is-unique}, \lstinline{decref}, and \lstinline{free} with GRIN's existing constructs --- similarly to how GRIN can express evaluation of suspended compuations within the language.
GRIN lacks a primitive for deallocating memory, so \lstinline{free} is just a foreign function call to libc's \lstinline{free}.
However, our implementation of \lstinline{drop} is unsatisfactory.
Nodes of different tags vary in arity and the arguments can be either boxed or unboxed.
Therefore, we need to pattern match on all the possible tags to drop the child references.
This is similar to the problem with a general \lstinline{eval} function (Section \ref{sec:analysis-and-transformations}).
A crucial difference is that \lstinline{drop} is recursive, so we cannot always eliminate the general function by specialization and inlining.
Still, we can specialize the \emph{initial} call to \lstinline{drop} if the tag is known.
The points-to analysis already require us to keep track of the possible nodes of each variable, so in practice we can almost always specialize \lstinline{drop}.

In Figure \ref{fig:drop-spec}, we specialize both calls to \lstinline{drop}.
This eliminates all reference counting operations in the branch for the empty list (lines 5-14).
Then, we push down the \lstinline{dup} operations (lines 16-17) and eliminate \lstinline{dup}/\lstinline{drop} pairs.
This optimization is called \emph{dup/drop fusion} and the result is presented in Figure \ref{fig:dup/drop-fusion}.
For our example program we eliminate the general \lstinline{drop} function completely, but this is not always the case.

\citeauthor{reinking2021} presents an additional optimization called \emph{reuse analysis}, which perfoms destructive updates when possible.
This optimization is not yet implemented in our project.
However, we will definitely implement this optimization in the future because it enables in-place updates for pure functional languages.
This, in turn, reduces the number allocations and deallocations, which is especially important for reference counting implementations as the overhead is proportional to the number of allocations, deallocations, and reference counting operations \citep{wilson1992}.

\begin{figure*}[hp]
\begin{mdframed}
\centering
\setlength{\fboxsep}{0pt} % padding for border
\begin{subfigure}[t]{0.44\textwidth}
\centering
\begin{lstlisting}[numbers=left]
sum x14 =
  fetch x14 [2] ; λ$_\Gamma$ x40 →
  downFrom x40 ; λ$_\Delta$ x59 x60 x61 →
  case x59 of
    [] →
      update x14 (C[]) ; λ () →
      !\hl{drop x14 ; λ () →}!
      unit (Cnat 0)
    _$∷$_ →
      !\hl{dup x61 ; λ () →}!
      !\hl{dup x60 ; λ () →}!
      update x14 (C_$∷$_ x60 x61) ; λ () →
      !\hl{drop x14 ; λ () →}!
      store (Fsum x61) ; λ$_\Gamma$ x10 →
      _+_ x10 x60
\end{lstlisting}
\caption{dup/drop insertion}
\label{fig:drop-insert}
\end{subfigure}\par\medskip
\begin{subfigure}[b]{0.44\textwidth}
\centering
\begin{lstlisting}[numbers=left]
sum x14 =
  fetch x14 [2] ; λ$_\Gamma$ x40 →
  downFrom x40 ; λ$_\Delta$ x59 x60 x61 →
  case x59 of
    [] →
      update x14 (C[]) ; λ () →
      fetch x14 [0] ; λ x499 →
      (case x499 of
         1 → free x14
         _ →
           PSub x499 1 ; λ x498 →
           update x14 [0] x498
      ) ; λ () →
      unit (Cnat 0)
    _$∷$_ →
      !\hl{dup x61 ; λ () →}!
      !\hl{dup x60 ; λ () →}!
      update x14 (C_$∷$_ x60 x61) ; λ () →
      fetch x14 [0] ; λ x501 →
      (case x501 of
         1 →
           !\hl{drop x61 ; λ () →}!
           !\hl{drop x60 ; λ () →}!
           free x14
         _ →
           PSub x501 1 ; λ x500 →
           update x14 [0] x500
      ) ; λ () →
      store (Fsum x61) ; λ$_\Gamma$ x10 →
      _+_ x10 x60
\end{lstlisting}
\caption{drop specialization}
\label{fig:drop-spec}
\end{subfigure}
\begin{subfigure}[b]{0.44\textwidth}
\begin{lstlisting}[showlines=true, numbers=none]
sum x14 =
  fetch x14 [2] ; λ$_\Gamma$ x40 →
  downFrom x40 ; λ$_\Delta$ x59 x60 x61 →
  case x59 of
    [] →
      update x14 (C[]) ; λ () →
      fetch x14 [0] ; λ x499 →
      (case x499 of
         1 → free x14
         _ →
           PSub x499 1 ; λ x498 →
           update x14 [0] x498
      ) ; λ () →
      unit (Cnat 0)
    _$∷$_ →
      update x14 (C_$∷$_ x60 x61) ; λ () →
      fetch x14 [0] ; λ x501 →
      (case x501 of
         1 →
           free x14
         _ →
           !\hl{dup x61 ; λ () →}!
           !\hl{dup x60 ; λ () →}!
           PSub x501 1 ; λ x500 →
           update x14 [0] x500
      ) ; λ () →
      store (Fsum x61) ; λ$_\Gamma$ x10 →
      _+_ x10 x60


\end{lstlisting}
\caption{dup push-down and dup/drop fusion}
\label{fig:dup/drop-fusion}
\end{subfigure}
\end{mdframed}
\caption{Perceus transformations}
\label{fig:perceus}
\end{figure*}


\subsection{Challenges adapting Perceus to GRIN}
\label{sec:perceus-challanges}
One of the goals of GRIN is to improve register utilization for lazy functional languages \mbox{\citep{boquist1999}}.
As such, the result of function applications, \lstinline{fetch}, and \lstinline{unit} are stored in registers.
Registers can either hold regular values, such as integers, or pointers to heap allocated nodes.
Only heap allocated values contain a reference count, so the implementation need to keep track and only reference count variables which are pointers.
For example, primitive subtraction returns an integer stored in registers "\lstinline{PSub x499 1 ; λ x498 →}" \mbox{(Figure \ref{fig:dup/drop-fusion}}, line 11), so \lstinline{x498} should not be reference counted.
This is in contrast to the calc\-ulus which Perceus is defined for, $λ₁$, where all variables are pointers to heap-allocated values \citep{reinking2021}.
Another difference is that functions in GRIN are not allowed to return unboxed pointers.
Moreover, all function calls ret\-urn explicit nodes rather than node variables.
For example: "\lstinline{downFrom x40 ; λ$_\Delta$ x59 x60 x61 →}" (\mbox{Figure \ref{fig:dup/drop-fusion}}, line 3).
This is problematic because \lstinline{x60} and \lstinline{x61} are undefined when \lstinline{downFrom} returns the empty list.
Therefore, we are only allowed to \lstinline{dup}/\lstinline{drop} the variables once the tag is known.

\section{LLVM IR code generation}
\label{sec:llvm-codegen}
% TODO we use the C runtime library for malloc, free, and printf
We have implemented an LLVM code generator to test that our compiler works and that the programs reclaim all the allocated memory.
Our implementation of the code generator is simple.
GRIN does not demand a specific memory layout for the node values.
The only requirements are that tags should be unique and easy to extract \citep{boquist1999}.
We use an array of four 64-bit integer values for all heap-allocated nodes.
The first two cells contain the reference count and the tag, respectively.
The node arguments occupy the rest of the array.

LLVM IR is a statically typed language, while GRIN is untyped.
This discrepancy is not an issue because all functions return nodes, and all variables are pointers or integers.
Hence, we store the pointers as integer values and convert them to the pointer type (\lstinline{ptr}) as required.
However, we will probably develop a type system for GRIN once we switch to a more sophisticated memory layout and implement the \emph{general unboxing} optimization \citep{boquist1999}.

% - Our implementation is not ready for benchmarks. We would like to compile a larger subset of Agda, implement the rest of optimizing transformations, and reuse-analysis.
\section{Discussion and future work}
\label{sec:discussion-and-future-work}
\citet{reinking2021} proves that the Perceus algorithm never drops a live reference, and that all references are freed as soon as they are no longer needed.
We have not proven whether our variant of Perceus maintains these invariants.
Even so, the algorithm has been remarkably dependable in our preliminary tests: our programs never leak memory or dereference dangling pointers, and objects are freed in a timely manner.
On the other hand, we have only tested small programs because our compiler lacks common features such as higher-order functions.
For the same reason, we have no realiable metrics for the performance of the compiled code.

As we mentioned in Section \ref{sec:precise}, the overhead of reference counting is closely related to the number of allocations, deallocations, and reference counting operations.
Laziness tend to encourage more allocations because function arguments are boxed.
To make lazy functional programming with reference counting viable, we belive it is necessary to reduce the number of allocations via aggressive compiler optimization.
Section \ref{sec:grin-interpreter} presents the resource information about our example program \refp{agda:program}.
The program allocates $\mathcal{O}(n)$ nodes with respect to the instance size, and the peak heap size is also $\mathcal{O}(n)$.
Ideally, the compiler should produce a program simliar to the following C program which have no allocations.

\begin{typewriter}{c}{c:program}
int sum_down_from(int n){
  int m = 0;
  for (int i = n - 1; i >= 1; i--)
    m += i;
  return m;
}
\end{typewriter}
To address the total amount of allocations, we would like to implement GRIN optimziations such as \emph{general unboxing} and \emph{late inlining}.
These optimization are sufficient to remove all allocations for a tail recursive variant our example program \refp{agda:program} \citep{boquist1999}.
However, for complex programs, implementing the Perceus reuse-analysis avoids additional allocations by updating nodes \mbox{in-place}.

Space leaks is another issue for lazy functional languages that can degrade performance and cause memory overflows \citep{jones1987}.
For example, our program \refp{agda:program} stack overflows for larger instances.
We can prevent this by making \AgdaFunction{sum} tail recursive and eager in the first argument, by using the \mbox{\AgdaFunction{primForce}} primitive:

\begin{code}[number=agda:final-program]
downFrom′ : ℕ → List ℕ
downFrom′ zero = []
downFrom′ (suc n) = n ∷ downFrom′ n

sum′ : ℕ → List ℕ → ℕ
sum′ m [] = m
sum′ m (n ∷ ns) =
  sum′ (primForce n _+_ m) ns

main′ = sum (downFrom 100)
\end{code}
Evaluating (\AgdaFunction{primForce} \AgdaBound{n} \AgdaFunction{\AgdaUnderscore+\AgdaUnderscore} \AgdaBound{m}) prior to the recursive call, achieves several things.
First, we prevent the stack overflow by not building and then evaluating a large suspended computation.
Second, forcing \AgdaFunction{\AgdaUnderscore+\AgdaUnderscore} drops the last reference count to \AgdaBound{n}, and thus the object is deallocated.
Which in turn, eliminates all out of memory errors and makes the peak heap size $\mathcal{O}(1)$.
As this example demonstrates, selectively applying eager evaluation can drastically change the performance characteristics of our programs.
However, this require a comprehensive understanding of the evaluation model and how it interacts with various compiler optimization.
Instead, we would like to develop a \emph{strictness analysis} \citep{mycroft1980} that can automatically decide when to use eager evaluation.
This is an intresting research opportunity because Agda is a total language so evaluation order does not matter.
Therefore, the analyis can freely mix lazy and eager evaluation without accidentally making a productive program unproductive.
% TODO perhaps add that GRIN also supports both eager and lazy evaluation.

\section{Related work}
Reference counting implementations are uncommon in the literature of lazy functional languages.
\citet{goldberg1988} uses a distributed reference counting scheme, simliar to that of \citet{lermen1986}, in the lazy functional language Alfalfa.
\citet{kaser1992} uses reference counting in a parallel lazy functional language called EQUALS.
They observe that reference counting minimizes memory contention%
\footnote{That is, multiple processes trying to access the same memory at once.} %
because memory deallocations are distributed.
This observervation is further supported by \citet{jones1987, jones1996}.

To our knowledge, precise reference counting which utilizes ownership to free objects as soon as possible have not yet been implemented in any lazy functional language.
However, there have been multiple implementations for eager functional languages \citep{reinking2021, ullrich2021, teeuwissen2023, pinto2023}.
\citeauthor{teeuwissen2023}'s implementation in the Roc programming language is the most simliar to ours.
Their algorithm is also based on Perceus and it is implemented in an intermediate representation with explicit memory operations.
Moreover, they use an defunctionalization algorithm by \citet{brandon2023} to compile higher-order functions.
However, their algorithm specializes higher-order function for each callee.
Compared to using branching, this gives better performance at the cost of larger code size.

The most recent implementation of GRIN is presented by \citet{podlovics2021}.
Among other contributions, they implement an analyis by \citet{turk2010} that creates a bipartite graph of node producers and consumers.
They use this analyis to perform a \emph{dead data eliminatation} optimization.
Finally, \citet{podlovics2021} presents overview of other attempts at using GRIN since \citeauthor{boquist1999}'s thes\-is.


% Since \citeauthor{boquist1999}'s \citeyear{boquist1999} thesis, there have been multiple projects using GRIN.
% Recently, \citet{podlovics2021}
%The most relevant is \citet{podlovics2021}
%They also provide an overview of earlier projects using GRIN.
%
%\citep{mol} created an Agda-to-GRIN compiler in his master thesis.
%The compiler uses the Agda \emph{internal syntax} which is the AST prior to Treeless.
%This made compiling Agda to GRIN significantly more difficult than compiling Treeless to GRIN.
%Mol only compiles Agda to GRIN. Then, the points-to analysis, GRIN transformations, and C code generation are
%carried out by the EHC/UHC compiler, which is a Haskell compiler which uses GRIN as an intermediate language.


% 1. Reference counting in lazy languages. EQUAL.
% 2. Precise reference counting. Perceus. Lean, Koka, Roc.
%    FBIP. Based on Hofmann (2000).
% 3. GRIN. Interprocedural analysis.
% 4. Compile-time garbage collection. Linear types. ASAP.

% To our knowledge, Perceus is the main memory management technique in the Koka \citep{reinking2021}, Lean \citep{TODO}, and Roc \citep{teeuwissen2023}.
% There is also an experimental implementation in OCaml \citep{pinto2023}.
% All of these, are eager functional languages and mostly immutable.
% Our backend uses lazy evaluation.


%\begin{itemize}
%\item GRIN \citep{podlovics2021}