Colorized unlambda.sed

# unlambda v1 interpreter in sed 
# 2003-02-19 - Laurent Vogel - http://lvogel/free.fr
# GPL version 2 or later (see http://www.gnu.org/copyleft/gpl.html)

# This is an interpreter for unlambda version 1. 
# Unlambda is an esoteric language invented by David Madore in 1999.
# You should really refer to the unlambda page for the complete description,
# of the language, including a tutorial and other things:
# http://www.eleves.ens.fr:8080/home/madore/programs/unlambda/unlambda.html
# but nevertheless here is a sketchy description of unlambda:
#
# The only data type in unlambda are functions. A function, when applied
# to another function, yields a third function, and so on: 
# `<f><g> is the notation for the application of <f> to <g>. 
# When evaluating `<f><g>, <f> is first evaluated, then <g> 
# (unless <f> did evaluate to the builtin d) then <f> is applied to <g>.
# The unlambda builtin functions are:
#  i  the identity function: `i<a> returns <a>
#  k  the constant generator function: `k<a> returns a function <f>
#     such as `<f><b> returns <a>. In short: ``k<a><b> returns <a>
#  s  the substitution function: ```s<a><b><c> returns ``<a><c>`<b><c>
#  v  a shortcut for ```s``s`kskk``s``s`kskk: `v<a> returns v
#  .x (for any ASCII char x): `.x<a> outputs x then returns <a>
#  r  another notation for .<newline>
#  d  `d<a> does not evaluate <a>, but ``d<a><b> evaluates `<a><b>
#  c  ("call with current continuation") is difficult to explain: 
#     `c<a> returns `<a><cont> for a new function <cont> called a 
#     continuation; if later `<cont><b> is evaluated, then it is as if at 
#     the time `c<a> was evaluated <b> was returned instead of `<a><cont>.
# An unlambda program consists of one expression. Spaces and comments
# (everything between a sharp sign '#' and the end of line) are allowed
# and ignored.
#
# This version of the interpreter does not implement the unlambda v2 
# builtins. There are no unlambda v2 programs small enough anyway.
#
# The following Programs from the CUAN (Comprehensive Unlambda Archive 
# Network) run successfully:
#   Hello, Square, power2, count, count2, trivial, trivial2, trivial3,
#   fibo, Triangle, pattern(*)
#   Jean.Marot/Quine, Quine2
#   Denis.Auroux/q2, q3, q5
#   Panu.Kalliokoski/Quine2
# (*) modified to print stars and newlines. 
# Bigger programs were not tested.

# Sed note
# --------
# Some attempt has been made to make this script portable. It is known to
# run on the following sed interpreters:
# GNU sed 3.02 and newer: if the last line of the unlambda program does 
#   not end with a newline, then no newlines are ever printed. I consider 
#   this to be a bug in GNU sed.
# HHsed: runs up to seven times faster than GNU sed-4.0.5 on my machine, 
#   once the following two lines are commented out in sedexec.c:
#     if (++jumpcnt >= JUMPLIMIT)
#       ABORT2(INFLOOP, lnum);
#   You will also need to increase MAXBUF for some programs.

# When parsing the program, the hold buffer contains: 
# the line number, then a space, then the program parsed up to now
x  

# Initialize the hold buffer
1 s/^/0 /

# Increment line number
s/ /i /
:inc
s/^[^ ]*.i/&0123456789i0_/
s/^i/1/
s/\(.\)i[^_]*\1\(i*.\)[^_]*_/\2/
/^[^ ]*i/ b inc

# Back to the pattern space
x  

# Obtain a newline at the beginning of the pattern space. Successfully
# parsed input will go left of the newline. A space is added first for
# buggy sed whose G does not add a newline over an empty pattern space.
s/^/ /
G  
s/^\([^\n]*\)\(\n\).*/\2\1/

:parse
# Remove space and TABs
s/\(\n\)[ 	]*/\1/
# Remove comments
s/\n#.*//
# Rename '.x' for some special chars x into ',y', y being a letter. 
# Special chars, and reason why:
# - ':@();|<>_':!#{} used as special markers in [^x]* sed patterns
# - '`': made special to allow s/`i//g
/\n\.[:@();|<>_`!#{}]/ {
    s//&:a@b(c)d;e|f<g>h_i`j!k#l{m}n~/
    s/\(\n\)\.\(.\)[^~]*\2\([^~]\)[^~]*~/,\3\1/
}  
# .<newline>
s/\n\.$/r/
# .x in the default case 
s/\(\n\)\(\..\)/\2\1/
# Plain tokens
s/\(\n\)\([cdikrsv`]*\)/\2\1/
# Illegal input
/\n[^cdikrsv.`# 	]/ {
    # remove anything before the first offending char 
    s/.*\n//
    # print an error message including the line number
    x
    s/^\([0-9]*\).*/Syntax error line \1:/p
    x
    q
}  
# Continue while not at the end of this line
/\n./ b parse

# Accumulate to the hold buffer
H  

# Go on parsing while the last line is not reached
$ !d

# Get back program from the hold buffer, and remove the line number.
# The hold buffer will now contain pending output. Since some sed
# interpreters have weird behaviour when the hold buffer is empty, 
# make sure there is always a space in it.
s/.*/ /
x  
s/[^ ]* //
s/\n//g

# Verify that the program contains one, and only one, expression.
# This is done by skipping one expression to the right. After skipping
# error messages are printed if the program is too short or too long,
# else we come back at :valid
s/^/?>/
s/$/|##a/
b skip
:valid

# Initialize the continuation database.
# A continuation is represented in the pattern space as
#   |[I];[N];{A};{B}
# [I] is the continuation unique identifier ([I] matches /^aa*$/);
# [N] is the reference counter of the continuation (matches /^a*$/);
#    [N] is '' when only one continuation reference exists, and it
#    is increased by one 'a' per additional reference. The refcounts
#    are updated when copying or deleting parts of the pattern space.
# {A} and {B} are the 'before' and 'after' part of the unlambda
#    state at the time c was applied (i.e. the unlambda state was
#    {A}`c{expr}{B})
# Continuations are deleted when no longer referenced, but the 
# unique identifier remains in the base in the form  |[i] ready
# to be re-used to serve as the unique identifier of another 
# continuation. For instance, after evaluating `cc
#  `cc -> `c<c1> -> `<c1><c2> -> <c2>
# we obtain the following pattern space:
#   <%aa%|aa;;;|a|
# where 'aa' is the ID of a continuation referenced once (%aa%) 
# and 'a' is a free ID.
# (see also the explanation on continuations below)

# Position the evaluation cursor at the beginning of the unlambda program.
s/^/_/

# Meaning of special chars:
# Evaluation steps
# ----------------
# '_' is the evaluation cursor, at the left of the expression to evaluate. 
# 'x>' is the skip right operator (x is any char except @).
#     branching to :skip will drop '>x' to the right skipping exactly 
#     one (balanced) unlambda expression. 
# '<' is the move left operator. Branching to :left will move '<' to the
#     left until ':' is reached
# ':' is used to block <
#
# They are used as follows: '_' first before the expression to eval.
# during eval it may be necessary to leave a token 'x:' and skip right over
# expressions. We then use 'y>' and branch to :skip which moves '>y'
# farther to the right. '>y' proceeds and ultimately should return with a
# '<' which goes back to some 'z:' yielding 'z:<' which itself eventually:
# - either drops a '_' and branches to :eval
# - or keeps a '<' and branches to :left
# - or goes right by dropping a 't>' and branching to :skip
#
# Because of this, treatment of builtin 'x' will typically involve code
# fragments in different places:
# - handling '_`x' located after :eval
# - handling '>x' located after :skip
# - handling 'x:<' located after :left
#
# At any time there is at most one of '_', '<' and '>' in the pattern space
#
# other special chars
# -------------------
# '(...)' are used to delimit zones to delete (by branching to :delete) or
#     to scan after copy to increase the refcount of any continuation
#     references found in them (by branching to :incref). 
#     A '<' must be present somewhere in the data outside '(...)', as 
#     both :delete and :incref will branch to :left when done.
# '|' is used to delimit continuations in the continuation database.
# ';' is used to delimit fields in the continuation database, and to
#     serve as marker for copies during the execution of complex builtins.
# '#' is used to delimit the skip reference free IDs
# '!' is used by :skip internally
# '{}' are used to note 'skip references', a way of caching the execution
#     of skip to speed things up (three times faster under HHsed)
#
# Encoding of unlambda expressions
# --------------------------------
# '`'            is the apply operator
# cdikrsv        are builtins
# '.x' and ',x'  are the printing builtins
# 'D<expr>'      is the promise `d<expr>
# 'K<expr>'      is the result of evaluating `k<expr>
# 'S<expr>'      is the result of evaluating `s<expr>
# 'T<e1><e2>'    is the result of evaluating ``s<e1><e2>
# '%aaa%'        is a continuation reference

# eval/apply
:eval

# `i is a no-op
s/`i//g

# Uncomment the following line when debugging to print the state from 
# time to time (the state is not printed after each eval, because 
# several evaluations might occur between two jumps to :eval)
#p

# Expressions other than apply evaluate to themselves, with or without
# skip reference
s/_\([^{`]\)/<\1/
s/_\({[^,]*,[^`]\)/<\1/

# kill the skip reference
s/_\(`*\){\([^,]*,\)\(.*\)}\2\([^#]*#\)/_\1\3\4\2/

# <expr> is not evaluated when evaluating `d<expr>
s/_`d/<D/
/</ b left

# ``dAB - go evaluate B with 'e>', leaving '`:' to evaluate `AB
s/_`D/`:e>/
# ``kAB - leave A, go delete B with 'K>'
s/_`K/K>/
# ``sAB - replace S with T, go evaluate B with 'e>'
s/_`S/Te>/
# ```sABC - leave '`:`' behind to form the first ``A of ``AC`BC,
# and go over A activate 'f>' that will transform BC in C`BC
s/_`T/`:`f>/
/>/ b skip

# `k and `s just eval their argument and store it in K<expr> or S<expr>.
# the '<' ultimately resulting from evaluating the argument will move
# through 'K' or 'S', giving 'K<expr>' or 'S<expr>' as return value.
s/_`k/K_/
s/_`s/S_/

# These evaluate their argument, leaving an 'x:' token to do the proper 
# job after '<' returns. See at the bottom of the file, after :left,
# what is done then.
s/_``/`:_`/
s/_`\([rc]\)/\1:_/
# .x becomes x.: to distinguish easily z.:< (.z) from z:< (delete)
s/_`\([.,]\)\(.\)/\2\1:_/
# Prepare leaving 'v', evaluate expr, then delete it using 'z:<'
s/_`v/vz:_/
s/_`\(%a*%\)/\1:_/

/_/ b eval

# Skip right 1 expression. Starting with '>@' we move this token to
# the right, adding one '@' per '`' or 'T', going over any number of
# 'S', 'D', 'K', and removing one @ per builtin, until no more @ is left. 
# Note: since ',' is always followed by a lowercase letter, any number of
# ',' can also be skipped together with 'S', 'D', 'K' (because one cannot
# encounter ',`', ',T' ...)
# Some caching is implemented: long expressions are surrounded by opening
# and closing "skip references": {ab, ... }ab, . At any time, for each 
# skip-ref ID "ab", either there is exactly one opening and one closing
# skip-ref with this ID in the entire pattern space, or there are no 
# skip-refs with this ID and this ID is in the free-skip-ref-ID-list.
# skip-refs are mutated when copying (see :incref below) and go to the
# free list when deleted (see :delete)
#
:skip
# remember the start of the skip
s/>/!>@/
:skiploop

# increase depth for each '`' or 'T'
s/>\(>*@@*\)\([SDK]*[`T]\)/\2>>\1@/
# decrease depth for each builtin
s/>\(>*@*\)@\([SDK,]*[a-z]\)/\2>>\1/
s/>\(>*@*\)@\([SDK]*\..\)/\2>>\1/
s/>\(>*@*\)@\([SDK]*%a*%\)/\2>>\1/

# jump over a skip ref
s/>\(>*@*\)@\([SDK]*\){\([^,]*,\)\(.*\)}\3/\2{\3\4}\3>\1/

# if the overall number of skip steps done this time is big, recurse.
# There is a tradeof here between doing the least steps in :skip
# but increasing the number of skiprefs makes it slower to match
# them in pairs, and also increases the size of the pattern space,
# thus making *all* failing pattern matching slower.
# the number of '>' below can be changed. The optimum value probably 
# depends on the sed interpreter and the machine.
/>>>>>>>>>>>>>>>>>>>>>*@/ {
    s/>>*\(@@*\)/\1!>@/
    b skiploop
}  

# If we haven't finished skipping one expression at the end of the 
# pattern space, either there is a bug in this script, or the program 
# does not start with a complete unlambda expression.
/@|/ {
    s/.*/Syntax error: unexpected end of file/
    q
}  

/>>*@@*[^@]/ b skiploop

# end of recurse
/@!/ {
    # we need a new skip ref. 
    /##/ {
        :newskref
        # copy the next one in place
        s/#\([^#][^;]*\)/\1,&/
        # generate the next one
        s/$/;/
        :inca
        s/#;/#a/
        s/#[^#;]*[^#;];/&abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ;a~/
        s/#\([^#;]*\)\([^#;]\);[^~]*\2\(;*[^~]\)[^~]*~/#\1\3/
        /#[^;]*;/ b inca
        # if coming from incref, go back there
        /^</ b mutate
    }
    # get a skip ref among the free skip ref list and replace the !
    s/@\(@*\)!\([^@!>]*\)>>*\([^#]*#\)\([^,]*,\)/{\4\2}\4>\1\3/
    b skiploop
}  

# end of skip - get the action char from last !
s/\(.\)!\([^>]*\)>*/\2>\1/

# After :skip

# '>e' for eval
s/>e/_/
# Evaluate the expression, then delete it with 'z:<'
s/>K/z:_/

# >f, >g, >h are part of the ```s[A][B][C] evaluation explained below. 
s/>g/g:_/
/_/ b eval

# ```s[A][B][C] evaluation 
# ------------------------
#    _`T[A][B][C] -> `:`f>[A][B][C] -> `:`[A]>f[B][C]
# the remaining task is to evaluate [C] and replace [B][C] by [C]`[B][C]. 
#    >f[B][C] -> f:g>[B][C] -> f:[B]>g[C] -> (eval C) f:[B]g:<[C]
# -> f:[B];h>[C] -> f:[B];[C]>h -> <[C]`[B]([C]) -> (incref) <[C]`[B][C]
# Note that ';' can only be inserted after [C] was evaluated, because 
# evaluating [C] can need ';' too.
/>f/ {
    s//f:g>/
    b skip
}  
/>h/ {
    s//;/
    s/f:\([^;:]*\);\([^;:]*\);/<\2`\1(\2)/
    b incref
}  

# delete
/>z/ {
    s//)</
    b delete
}  

# Continuation (1) - `c<expr>
# ---------------------------
# `c<expr> first evaluates the expression, then surrounds it with ';'
# _`c[E] -> c:_[E] -> (eval E) c:<[E] -> ;[E]c> -> ;[E];
# At this stage the pattern space is either:
#   {A};[E];{B}|...|[I]|...    
# if there is a free ID [I], or:
#   {A};[E];{B}|[I-1];...
# if all IDs from 1 to I-1 are taken.
# {A} and {B} denote the before and after parts of the unlambda state.
# The unlambda state will then become
#   ({A})`:<[E]%[I]%({B})|...
# with a new continuation (added, or replacing the free ID) being:
#   |[I];;{A};{B}
# Finally we jump to incref over ({A}) and ({B}) as they have been copied.
#
/>c/ {
    s//;/
    # try reusing a free ID 
    /^\([^;]*\);\([^;]*\);\([^|]*\)\(.*\)|\(a*\)|/ {
        s//(\1)`:<\2%\5%(\3)\4|\5;;\1;\3|/
        b incref
    }
    # else create our ID as being 1 plus the biggest (leftmost) ID
    s/^\([^;]*\);\([^;]*\);\([^|]*\)|\(a*\)/(\1)`:<\2%\4a%(\3)|\4a;;\1;\3|\4/
    b incref
}  

# Continuation (2) - `<cont><expr>
# --------------------------------
# _`%[I]%[E] -> %[I]%_[E] -> (eval E) %[I]%<[E] -> ;[I];C>[E] -> ;[I];[E];
# At this stage the pattern space is:
#     {a};[I];[E];{b}|...|[I];[N];{A};{B}|...
# where {a} and {b} represent parts of the current unlambda state,
# We first erase {a} and {b}, so that the refcount [N] of continuation [I] 
# has the right value: 
#  -> C:<[I];[E]({a}{b})|...|[I];[N];{A};{B}|...
#  -> C:<[I];[E]|...|[I];[N];{A};{B}|...
# Now, if '[N]' equals '', we will remove the continuation from the base,
# keeping [I] as a free identifier.
#  -> [I];[E]|...|[I];;{A};{B}|...
#  -> {A}<[E]{B}|...|[I]|...
# else if '[N]' is '[n]a', the continuation stays, so me must incref over
# the copied {A} and {B}:
#  -> [I];[E]|...|[I];[n]a;{A};{B}|...
#  -> ({A})<[E]({B})|...|[I];[n];{A};{B}|... 
#
/>C/ {
    s//;/
    # first erase current state
    s/\([^;]*\);\(a*;[^;]*\);\([^|]*\)/C:<\2(\1\3)/
    b delete
}  
# Called before running to check that the program contains exactly one
# expression
/>?/ {
    />?|/ {
        s/>?//
        b valid
    }
    s/.*>?//
    s/|.*//
    # revert ',y' into '.x'
    s/,[a-n]/&:a@b(c)d;e|f<g>h_i`j!k#l{m}n~/g
    s/,\([a-n]\)[^~]*\([^~]\)\1[^~]*~/.\2/g
    # exit with error message
    i\
Syntax error: trailing garbage
    q
}  

b done

# Increment the refcount of any continuation whose reference lays 
# between ( ). Called for each copied zone after copy. Remove ( )
# and branch to :left when done.
:incref
s/(\([^{%)]*%\(a*\)%[^%)]*\)\(.*|\2;\)/\1(\3a/
s/(\([^{%)]*\))/\1/

# skip-ref mutation: {ab,...}ab, -> {cd,...}cd,
/([^{%)]*{/ {
    # generate a new skip-ref if needed
    /##/ {
        s/^/</
        b newskref
        :mutate
        s/^<//
    }
    # replace current skip ref by a new one for the copy
    # the [^)]* is essential here, in case the original is still 
    # on the right from here (needed when copying continuation
    # parts as the continuation refcounts are only updated to
    # the right).
    s/(\([^{%)]*{\)\([^,]*,\)\([^)]*}\)\2\([^#]*#\)\([^,]*,\)/\1\5(\3\5\4/
}  
/(/ b incref
b left

# Delete between ( and ) included. If continuation references are found, 
# decrement their refcount and mark them for deletion too (but keeping
# the free ID) if it goes below zero; branch to :left when done
:delete
s/([^}%)]*%\(a*\)%[^%)]*\(.*|\1;\)a/(\2/
s/([^}%)]*%\(a*\)%[^%)]*\(.*\)|\1;;\([^|]*\)|/(\2|\1(\3)|/
s/([^}%)]*)//
s/([^}%)]*}\([^,]*,\)\([^#]*#\)/(\2\1/
/(/ b delete

# < moves left over commands.
:left
s/:\([^:]*\)</:<\1/
/:</ !b done

# After left
# Delete one expression: 
# z:<[A] -> (z>[A] -> ([A]>z -> ([A])< -> (delete) <
s/z:</(z>/

# Part of ```s[A][B][C] handling, see explanation above.
s/g:</;h>/

# Continuation handling
s/c:</;c>/
s/%\(a*\)%:</;\1;C>/
/>/ b skip

# `: means re-eval ` when next eval is done
/`:</ {
    s//_`/
    b eval
}  

# Output functions all leave a '<'
# Print the hold buffer and empty it.
/r:</ {
    s//</
    x
    s/.//
    p
    s/.*/ /
    x
}  
# Append the char to the hold buffer
/\(.\)\.:</ {
    s//<\1/
    H
    s/<./</
    x
    s/\n[^<]*<\(.\).*/\1/
    x
}  
# Append the converted char to the hold buffer
/\(.\),:</ {
    s//<\1/
    H
    s/<./</
    x
    # handle ',y' -> x conversion
    s/\(\n\)[^<]*<\(.\).*/\1\2:a@b(c)d;e|f<g>h_i`j!k#l{m}n/
    s/\n\(.\).*\(.\)\1.*/\2/
    x
}  

# Part of `<cont><expr> handling - see explanation above
/C:</ {
    # if the reference count does not goes below zero, keep the continuation
    # and incref over the duplicated parts.
    /C:<\(a*;\)\([^|]*\)\(.*\)|\1a\(a*;\)\([^;]*\);\([^|]*\)/ {
        s//(\5)<\2(\6)\3|\1\4\5;\6/
        b incref
    }
    # else remove the continuation but keep the free ID.
    s/C:<\(a*\);\([^|]*\)\(.*\)|\1;;\([^;]*\);\([^|]*\)/\4<\2\5\3|\1/
}  

/</ b left

:done

# flush output if pending
x  
/../ {
    # remove the dumb first char
    s/^ //
    p
}  

# quit 
d  


### colorized by sedsed, a debugger and code formatter for sed scripts
### original script: ftp://quatramaran.ens.fr/pub/madore/unlambda/contrib/unlambda.sed