From a2afd4b08dc2b7eada2f95ee95457457a3331344 Mon Sep 17 00:00:00 2001 From: Chloe Brown Date: Tue, 25 Mar 2025 16:52:43 +0000 Subject: Before the big rewrite --- lexer.py | 19 ++++ main.bib | 70 ++++++++++++++ main.tex | 261 +++++++++++++++++++++++++++++++++++++++++++++++++++ sec/compiler.ltx | 18 ++++ sec/embedding.ltx | 65 +++++++++++++ sec/intro.ltx | 103 ++++++++++++++++++++ sec/lang.ltx | 226 ++++++++++++++++++++++++++++++++++++++++++++ sec/lexer.py | 1 + sec/reducer.ltx | 17 ++++ sec/related.ltx | 12 +++ sec/systemt.ltx | 130 +++++++++++++++++++++++++ thmdef-tcolorbox.sty | 9 ++ 12 files changed, 931 insertions(+) create mode 100644 lexer.py create mode 100644 main.bib create mode 100644 main.tex create mode 100644 sec/compiler.ltx create mode 100644 sec/embedding.ltx create mode 100644 sec/intro.ltx create mode 100644 sec/lang.ltx create mode 120000 sec/lexer.py create mode 100644 sec/reducer.ltx create mode 100644 sec/related.ltx create mode 100644 sec/systemt.ltx create mode 100644 thmdef-tcolorbox.sty diff --git a/lexer.py b/lexer.py new file mode 100644 index 0000000..52080b0 --- /dev/null +++ b/lexer.py @@ -0,0 +1,19 @@ +#!/usr/bin/env python3 + +from pygments.lexers.ml import OcamlLexer +from pygments.token import Name, Keyword +from pathlib import Path + +class SystemTLexer(OcamlLexer): + name = 'System T' + aliases = ['syst'] + filenames = ['*.syst'] # just to have one if you whant to use + + EXTRA_KEYWORDS = ['primrec', 'prl', 'prr', 'fold'] + + def get_tokens_unprocessed(self, text): + for index, token, value in OcamlLexer.get_tokens_unprocessed(self, text): + if token is Name and value in self.EXTRA_KEYWORDS: + yield index, Keyword, value + else: + yield index, token, value diff --git a/main.bib b/main.bib new file mode 100644 index 0000000..ee30bc1 --- /dev/null +++ b/main.bib @@ -0,0 +1,70 @@ +@article{DBLP:journals/pacmpl/XueO24, + author = {Xu Xue and + Bruno C. d. S. Oliveira}, + title = {Contextual Typing}, + journal = {Proc. {ACM} Program. Lang.}, + volume = {8}, + number = {{ICFP}}, + pages = {880--908}, + year = {2024}, + doi = {10.1145/3674655}, + timestamp = {Tue, 24 Dec 2024 22:38:35 +0100}, + biburl = {https://dblp.org/rec/journals/pacmpl/XueO24.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@article{DBLP:journals/vlc/GreenP96, + author = {Thomas R. G. Green and + Marian Petre}, + title = {Usability Analysis of Visual Programming Environments: {A} 'Cognitive + Dimensions' Framework}, + journal = {J. Vis. Lang. Comput.}, + volume = {7}, + number = {2}, + pages = {131--174}, + year = {1996}, + doi = {10.1006/JVLC.1996.0009}, + timestamp = {Mon, 29 Jul 2019 16:00:07 +0200}, + biburl = {https://dblp.org/rec/journals/vlc/GreenP96.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@book{DBLP:books/sp/LongleyN15, + author = {John Longley and + Dag Normann}, + title = {Higher-Order Computability}, + series = {Theory and Applications of Computability}, + publisher = {Springer}, + year = {2015}, + doi = {10.1007/978-3-662-47992-6}, + isbn = {978-3-662-47991-9}, + timestamp = {Tue, 16 May 2017 14:01:44 +0200}, + biburl = {https://dblp.org/rec/books/sp/LongleyN15.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@unpublished{Squid:unpublished/McBride01, + author = {Conor McBride}, + title = {The Derivative of a Regular Type is its Type of One-Hole Contexts}, + year = {2001}, + url = {http://strictlypositive.org/diff.pdf}, + urldate = {2025-02-07} +} + +@online{Squid:online/Kiselyov22, + author = {Oleg Kiselyov}, + title = {Simply-typed encodings: {PCF} considered as unexpectedly expressive programming language}, + year = {2022}, + url = {https://www.okmij.org/ftp/Computation/simple-encodings.html}, + urldate = {2024-09-16} +} + +@book{DiscoverEd:book/oc/Cantor52, + author = {Cantor, Georg}, + title = {Contributions to the Founding of the Theory of Transfinite Numbers}, + series = {The Open Court Series of Classics of Science and Philosophy}, + publisher = {Open Court}, + year = {1952}, + booktitle = {Contributions to the Founding of the Theory of Transfinite Numbers}, + timestamp = {Thu, 20 Feb 2025 15:02:44 +0000} +} diff --git a/main.tex b/main.tex new file mode 100644 index 0000000..e96270e --- /dev/null +++ b/main.tex @@ -0,0 +1,261 @@ +\newif\ifacmart +\acmarttrue + +\ifacmart + \documentclass[acmsmall, anonymous, review, natbib=false, pbalance]{acmart} +\else + \documentclass[a4paper]{article} +\fi + +%% Package Imports %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +%% Provided by acmart +\ifacmart\else + \usepackage{amsthm} %% proof environments +\fi + +%% Incompatible with acmart +\ifacmart\else + \usepackage[ + warnings-off={mathtools-colon,mathtools-overbracket} + ]{unicode-math} %% math symbols + \usepackage[subtle, margins=tight]{savetrees} +\fi + +\usepackage[ + datamodel=acmdatamodel, + style=acmnumeric, + giveninits, + doi, + eprint = false +]{biblatex} %% bibliography configuration +\usepackage{booktabs} %% table formatting +\usepackage{ebproof} %% proof trees +\usepackage{expl3} %% Latex 3 +\usepackage{enumitem} %% format contributions list +\usepackage{float} %% forcing appendix figures +\usepackage{hyperref} %% hyperlinks within document +\usepackage{mathtools} %% matrix alignment +\usepackage[outputdir=build]{minted} %% code formatting +\usepackage{stackrel} %% Annotating judgements +\usepackage{subcaption} %% subfigures with captions +\usepackage{subfiles} %% splitting file into sections +\usepackage{thmtools} %% proof environments +\usepackage{tikz} %% diagram +\usepackage[dvipsnames]{xcolor} %% color TODO and FIXME +\usepackage{xr} %% cross reference between sections + +%% After hyperref +\usepackage{cleveref} %% easier citations + +%% Document Metadata %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +\author{Greg Brown} +\ifacmart + \orcid{0009-0000-3401-8637} + \affiliation{% + \institution{University of Edinburgh} + \city{Edinburgh} + \country{UK} + } + \email{greg.brown01@ed.ac.uk}{} +\fi + +\ifacmart + \acmJournal{PACMPL} + %% \acmConference[ICFP '25]{}{January 21, 2025}{Denver, CO, USA} +\fi + +\ifacmart + \setcopyright{none} +\fi + +\title{ARTiST:\ Adding Regular Types in System~T} + +\ifacmart + \keywords{} +\fi + +%% Package Configuration %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +%% biblatex +\addbibresource{\subfix{main.bib}} + +%% hyperref +\ifacmart\else + \makeatletter + \hypersetup{ + pdftitle={\@title}, + pdfauthor={\@author}, + } + \makeatother +\fi + +%% Minted centered +\RecustomVerbatimEnvironment{Verbatim}{BVerbatim}{} + +%% xr +\externaldocument[M-]{\subfix{main}} + +%% Possessive textcite +\DeclareNameWrapperFormat{labelname:poss}{#1's} +\DeclareFieldFormat{shorthand:poss}{% + \ifnameundef{labelname}{#1's}{#1}} +\DeclareFieldFormat{citetitle:poss}{\mkbibemph{#1}'s} +\DeclareFieldFormat{label:poss}{#1's} + +\newrobustcmd*{\posscitealias}{% + \AtNextCite{% + \DeclareNameWrapperAlias{labelname}{labelname:poss}% + \DeclareFieldAlias{shorthand}{shorthand:poss}% + \DeclareFieldAlias{citetitle}{citetitle:poss}% + \DeclareFieldAlias{label}{label:poss}}} + +\newrobustcmd*{\posscite}{% + \posscitealias% + \textcite} + +\newrobustcmd*{\Posscite}{\bibsentence\posscite} + +\newrobustcmd*{\posscites}{% + \posscitealias% + \textcites} + +%% Theorem Environments %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +\ifacmart\else + \declaretheorem[ + style=plain, + numberwithin=section + ]{definition} + \declaretheorem[ + numberwithin=section, + sibling=definition + ]{lemma,corollary,proposition,theorem, conjecture} + \declaretheorem[ + style=remark, + numberwithin=section, + sibling=definition + ]{remark,example} +\fi + +\renewcommand*{\textcitedelim}{\addcomma\addspace\iflastcitekey{\bibstring{and}\addspace}{}} + +%% Custom Macros %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +% Needed for compatibility +\ifacmart\else + \newcommand{\bibliofont}{\footnotesize} +\fi + +% Psuedocode +\newminted[systemt]{'lexer.py:SystemTLexer -x'}{} +\newmintinline[systemtinline]{'lexer.py:SystemTLexer -x'}{} + +% TODOS and such +\newcommand\lang{{\color{Green} ARTiST}} +\newcommand\TODO[1]{{\color{Blue}TODO:\ {#1}}} +\newcommand\FIXME[1]{{\color{Red}FIXME:\ {#1}}} + +% Quantification: e.g. \rangeover{x_i}{i < n} +\newcommand\rangeover[2]{\overrightarrow{#1}} + +% Judgements for typing and such +\newcommand\judgement[4][]{{#2} \vdash^{#1} {#3} : {#4}} +\newcommand\jdgmnt[3]{{#2} \vdash {#3}~\mathrm{#1}} + +%%% Term syntax + +%% Godel encoding +\newcommand\encode[1]{\llcorner {#1} \lrcorner} + +%% Tuples +\DeclarePairedDelimiter\tuple{\langle}{\rangle} +\newcommand\lettm[3]{\mathsf{let}~{#1} \coloneq {#2}~\mathsf{in}~{#3}} + +%% Naturals +\newcommand\nat{\mathsf{N}} +\newcommand\zero{\ensuremath{\mathsf{zero}}} +\newcommand\suc{\ensuremath{\mathsf{suc}}} + +\newcommand\primreckw{\ensuremath{\mathsf{primrec}}} +\NewDocumentCommand\primrec{m m g}{% + \primreckw~{#1}~{#2}\IfValueT{#3}{~{#3}}% + } + +\newcommand\pairingkw{\ensuremath{\mathsf{pair}}} +\NewDocumentCommand\pairing{m m}{% + \pairingkw~{#1}~{#2}% + } + +%% Functors +\newcommand\mapkw{\ensuremath{\mathsf{map}}} +% e.g. \maptm{X}{1 + X} +\newcommand\maptm[2]{\mapkw_{\Lambda {#1}.{#2}}} + +%% Recursive types +\newcommand\foldkw{\ensuremath{\mathsf{fold}}} +\newcommand\foldtm[3]{\foldkw~{#1}~\mathsf{with}~{{#2}.~{#3}}} + +%% Sums +\newcommand\matchtm[2]{\mathsf{match}~{#1}~\mathsf{with}~{#2}} +\newcommand\casetm[5]{\matchtm{#1}{\rangeover{\tuple{{#4}, {#2}}.~{#3}}{{#4}<{#5}}}} + +%%% Substitution +\ExplSyntaxOn +\NewDocumentCommand \squid_subpart:nn {m m} {{#1} \mapsto {#2}} +\NewDocumentCommand \squid_contsub:n {> {\SplitArgument{1}{/}} m} + { \squid_subpart:nn #1 \squid_checksub + } +\NewDocumentCommand \squid_checksub {O{\parallel}} + { + \peek_meaning_ignore_spaces:NTF \bgroup + {#1 \squid_contsub:n} + {]} + } +\NewDocumentCommand \squid_startsub:n {m} + { {#1} [ \squid_checksub [] + } +\NewDocumentCommand \sub {m > {\SplitList{,}} m} + { \squid_startsub:n {#1} #2 + } +\ExplSyntaxOff + +%% Document %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +\begin{document} + +\ifacmart\else + \maketitle{} +\fi + +\begin{abstract} +G\"odel's System T is a simple calculus that performs computation. The absence +of any type abstractions, such as products or sums, along with the limited form +of recursion makes using System T as a target language difficult. In this paper +we introduce \lang{}, a new intermediate language. One can compile \lang{} into +System~T using well-known type encodings, and we have written this algorithm +within \lang{} itself. We also describe a compiler for \lang{} written in Idris +2, featuring an intrinsically well-scoped AST and runtime-erased type-checking +proofs. +\end{abstract} + +\ifacmart + \maketitle{} +\fi + +%% Body + +\subfile{sec/intro.ltx} +\subfile{sec/systemt.ltx} +\subfile{sec/lang.ltx} +\subfile{sec/embedding.ltx} +\subfile{sec/reducer.ltx} +\subfile{sec/compiler.ltx} +\subfile{sec/related.ltx} + +\printbibliography{} + +\appendix + +\end{document} diff --git a/sec/compiler.ltx b/sec/compiler.ltx new file mode 100644 index 0000000..dde2856 --- /dev/null +++ b/sec/compiler.ltx @@ -0,0 +1,18 @@ +\documentclass[../main.tex]{subfiles} + +\begin{document} +\section{Idris 2 Compiler}% +\label{sec:compiler} + +\TODO{ + \begin{itemize} + \item QTT basics + \item intrinsically well-scoped AST + \item bidirectional judgement + \item well-typed data types + \item type checking function + \item irrelevant pattern matching + \end{itemize} +} + +\end{document} diff --git a/sec/embedding.ltx b/sec/embedding.ltx new file mode 100644 index 0000000..722eff1 --- /dev/null +++ b/sec/embedding.ltx @@ -0,0 +1,65 @@ +\documentclass[../main.tex]{subfiles} + +\begin{document} +\section{Embedding in System T}% +\label{sec:embedding} + +A major motivation for \lang{} is the ability to write terms with regular types +and embed them in System~T. We explain how the embedding works, which encodes +products, sums and recursive types as System~T types. + +Every pair of System~T types has a union (in the sense of C). The union has +injections into it and projections out of it. The specification of the union +type, \(A \sqcup B\) for types is: +\[ + \begin{array}{cc} + \mathsf{inl} : A \to A \sqcup B & + \mathsf{inr} : B \to A \sqcup B \\ + \mathsf{prl} : A \sqcup B \to A & + \mathsf{prr} : A \sqcup B \to B \\ + \mathsf{prl}~(\mathsf{inl}~t) = t & + \mathsf{prr}~(\mathsf{inr}~t) = t + \end{array} +\] +\Cref{fig:union-impl} shows \posscite{Squid:online/Kiselyov22} implementation of +unions. We can extend this to n-ary unions \(\bigsqcup_i^n A_i\) in the usual +way. We pick \(\nat\) to represent the empty union arbitrarily. + +With unions, it becomes easy to encode products. +\textcite{Squid:online/Kiselyov22} uses the functional encoding \(\prod_i^n A_i +\coloneq \nat \to \bigsqcup_i^n A_i\), representing the product as a +heterogeneous vector. The \(k\)th component of the product is accessed by +applying the product to \(k\). Tupling does a case analysis of the argument, +returning the corresponding value. + +\textcite{Squid:online/Kiselyov22} also describes how to encode sums using +products and unions. A sum is encoded as a tagged union \(\sum_i^n A_i \coloneq +\nat \times \bigsqcup_i^n A_i\), where the tag indicates which case of the sum +is being stored. Case analysis first inspects the tag to decide which branch to +take, downcasts the value to the correct type, and then performs the branch +action. Note this encoding is not consistent: the empty type is necessarily +inhabited, as all System~T types are. This inconsistency is necessary for the +encoding of recursive types. + +Recursive types are the most difficult to encode. +\textcite{DBLP:books/sp/LongleyN15} describe one strategy: represent values as a +heap. We have the encoding \(\mu X. A \coloneq \nat \times \nat \times (\nat \to +\sub{A}{X/\nat})\). The first component is the ``depth'' of the heap: an upper +bound on the number of pointers to follow on any arbitrary path. The second +component represents the root index of the heap---the address where the data +structure starts. The final component is the heap itself. Each location stores +data associated with its value. Any recursion is replaced with a pointer to a +different location in the heap. + +\TODO{example} + +Folding is easier to compute than rolling. The precise encoding is given below. +We compute the fold across the heap in parallel, iteratively increasing the +depth of paths we can safely use. The initial result is arbitrary: we don't have +any data to operate on. + +\begin{multline*} +\TODO{fold} +\end{multline*} + +\end{document} diff --git a/sec/intro.ltx b/sec/intro.ltx new file mode 100644 index 0000000..00a6d5b --- /dev/null +++ b/sec/intro.ltx @@ -0,0 +1,103 @@ +\documentclass[../main.tex]{subfiles} + +\begin{document} +\section{Introduction}% +\label{sec:intro} + +System~T is an extension of the simply-typed lambda calculus adding natural +numbers and primitive recursion. This minimality makes it an excellent calculus +to model, and yet the calculus is expressive enough to perform a wide array of +useful calculations. This includes most arithmetic, \TODO{middle thing} and even +rewriting of lambda terms. + +The restricted type system of System~T makes expressing complex data difficult. +Often structured data has to be encoded as naturals or first-order functions. +Encoding data within a natural number often makes structural recursion +exceedingly complex. Whilst using first-order functions can make recursion +easier, it is often the case that passing different arguments can return +different shapes of data. + +Consider the following System~T pseudocode: +\begin{listing}[H] + \begin{systemt} + let sum (tree : nat -> nat -> nat -> nat -> nat) : nat = + let depth = tree 0 0 0 0 in + let root = tree 1 0 0 0 in + let heap = tree 2 in + let go : nat -> nat = primrec depth with + Z => fun i => i + | S(acc) => fun i => + let tag = heap i 0 0 in + let v = heap i 1 in + if tag == 0 then + v 0 + else + acc (v 0) + acc (v 1) + \end{systemt} + \vspace{-\baselineskip} +\end{listing} +From the name and type, one can deduce this performs a sum over some tree +structure. With some more thought, you may be able to work out that +\systemtinline{tree} is a binary tree. This information is not at all available +in its type. It is also unclear why \systemtinline{tree 0 0 0 0} should return +the depth of the tree and \systemtinline{tree 1 0 0 0} the root. + +In other functional programming languages, the same sum function would look +something like the following OCaml code: +\begin{listing}[H] + \begin{minted}{ocaml} + let sum tree = match tree with + Leaf(v) -> v + | Branch(left, right) -> sum left + sum right + \end{minted} + \vspace{-\baselineskip} +\end{listing} +In this example it is immediately obvious what the function does. The code is +first order reducing its complexity and the richer type system makes it easier +to identify the role of different variables. These additional features of OCaml +and other such languages improve the closeness of mapping and +role-expressiveness of the code whilst reducing the error-proneness (\TODO{check + this}). This is great for human programmers but this comes at a modelling +cost. Unlike System~T, most functional languages allow non-termination. For +example, OCaml would have no issue with us writing +\mintinline{ocaml}{sum tree + sum right} by mistake. + +In this work we present \lang{}, a new intermediate language that compiles into +System~T. \lang{} extends the simply-typed lambda calculus with the regular +types~\cite{Squid:unpublished/McBride01}: finite products, finite sums and some +inductive types.\@ \lang{} has most of the expressive types we are used to from +functional languages---we can write code similar to the second example---whilst +still being simple enough to represent within System~T. + +The type encodings used to compile \lang{} into System~T are +well-known~\cites{Squid:online/Kiselyov22}{DBLP:books/sp/LongleyN15}. Encoding +products and sums relies on C-style union types, which exist due to System~T +having only naturals and function types. Recursive types are represented using a +heap with an explicit depth, informally described by +\textcite{DBLP:books/sp/LongleyN15}. We have written this compilation algorithm +within \lang{} itself. + +To aid in the development of this program, we also constructed a type checker +for \lang{} in Idris~2~\cite{idris}. The typechecker uses intrisically +well-scoped syntax. The syntax is represented by an indexed data type +\texttt{Term}, where the index describes the variables that are in scope. By +using quantities~\cite{quants} and auto-implicit search, the compiler code is +written in a named style, yet compiles down to using deBruijn indices. Using +names helps to reduce errors in the code, and the compilation to deBruijn +indices means we don't pay any runtime cost. + +%% \subsubsection*{Contributions}% +%% \label{sec:contributions} +%% \begin{itemize} +%% \item Design of new intermediate language \lang{}, featuring iso-recursive +%% regular types +%% \item Algorithm for embedding \lang{} into to System~T, implementable within +%% \lang{} +%% \item Case study implementation of a fuelled self-reducer for System~T, +%% demonstrating improvements in several cognitive dimensions~\cite{cog-dim} for +%% \lang{} compared to working in System~T directly +%% \item Compiler implemented in Idris 2 with intrinsically well-scoped syntax and +%% runtime-erased typechecking proofs. +%% \end{itemize} + +\end{document} diff --git a/sec/lang.ltx b/sec/lang.ltx new file mode 100644 index 0000000..c7851e2 --- /dev/null +++ b/sec/lang.ltx @@ -0,0 +1,226 @@ +\documentclass[../main.tex]{subfiles} + +\begin{document} +\section{Core Language}% +\label{sec:lang} + +In this section, we describe \lang{} a new calculus with higher-order functions +and regular types~\cite{Squid:unpublished/McBride01}. We start by describing the +types within the language: products, sums, recursive types and functions. We +then describe the syntax which is standard for n-ary products and sums. We also +define a \mapkw{} operation over arbitrary (covariant) functors.\@ \mapkw{} is +necessary to define the equational theory of \foldkw{}, the eliminator for +recursive types. The equational theory has \(\beta\) equalities, but no \(\eta\) +equalities, a consequence of the embedding into System~T. + +Types for \lang{}, in \cref{fig:lang-wf}, are given relative to a context of +type variables. The judgement \(\jdgmnt{ty}{\Theta}{A}\) states that \(A\) is a +well-formed type in with variables from context \(\Theta\). Products and sums +are well-formed when each of the n-ary components are well-formed. The recursive +type \(\mu X. A\) is formed from a type \(A\) in a context extended by \(X\). +The other type formation rules ensure that \(X\) is used strictly positively, +and thus that recursive types are well-formed. In particular the rule forming +function types forbids using any type variables on the left of the arrow. This +forbids the use of type variables in negative positions. The function type +formation rule also forbids variables on the right. This is to forbid types +such as \(\mu X. 1 + \nat \to X\) of countable trees, as such a type cannot be +represented by System~T~\cite{proof}. Well-formed types also respect +substitution: +\begin{proposition}[Type Substitution] +Given \(\jdgmnt{ty}{\Theta}{A}\) and \(\jdgmnt{ty}{\Psi}{B_i}\) where \(i\) +ranges from \(0\) to \(\lvert \Theta \rvert\), we have +\(\jdgmnt{ty}{\Psi}{\FIXME{\sub{A}{X_i/B_i}}}\). +\end{proposition} + +\begin{figure} +\[ +\begin{array}{ccccc} + \begin{prooftree} + \hypo{ + X \in \Theta + \vphantom{\jdgmnt{ty}{\Theta}{A}} %% Spooky formatting phantom + } + \infer1{\jdgmnt{ty}{\Theta}{X}} + \end{prooftree} + & + \begin{prooftree} + \hypo{\rangeover{\jdgmnt{ty}{\Theta}{A_i}}{i