During my professional career I often met the task of code generation based on some source data. Last time it was analogue of BCL compiler for ExpressionTrees and then C# compiler have done mostly all the required job with parsing expressions. It is time to invert our responsibilities and feel ourselves in the shoes of compiler. As a test animal we’ll take MiniML-like language (without binary/unary operators and with slightly modified syntax).
Parsing
Out first goal is transformation of plain character stream into the structured representation. First of all we’ll declare components of abstract syntax tree (AST)
namespace MiniML
type Ast =
| Let of string * Ast * Ast
| Letrec of string * Ast * Ast
| Var of string
| Integer of int
| Lambda of string * Ast
| Apply of Ast * Ast
| IfThenElse of Ast * Ast * Ast
With tools like fslexx/fsyacc creation of parser becomes a easy task, however today we’ll use another techique named parser combinators and define parser using F# itself. More details about this approach can be found in the documents below:
Disclaimer:Version of parser given in sample is pretty straightforward and ineffective for the sake of clearness
First of all we need to declare basic elements and combinators to create more complex parsers.
namespace MiniML
module BaseParsing =
open System
// result of parsing - either value+remaining part of char stream or evidence of failure
type ParseResult<'T> = Success of 'T * list<char> | Failed
// parser itself - function from list of chars to parsing result
type Parser<'T> = Parser of (list<char> -> ParseResult<'T>)
let apply (Parser p) s = p s
let run p l = apply p (Seq.toList l)
// wraps given value into parser keeping char stream untouched
let one v = Parser( fun cs -> Success(v, cs) )
let failed () = Parser (fun _ -> Failed)
// binds subsequent parsers
let bind p f = Parser (fun cs ->
match apply p cs with
| Success(r, cs2) -> apply (f r) cs2
| Failed -> Failed
)
// (OR combinator) applies first parser, if it fails - applies second
let (<|>) p1 p2 = Parser(fun cs ->
match apply p1 cs with
| Failed -> apply p2 cs
| ok -> ok
)
// (AND combinator) applies first parser to source stream, then applies second to remaining part of stream
let (<&>) p1 p2 = Parser(fun cs ->
match apply p1 cs with
| Success(_, cs2) -> apply p2 cs2
| Failed -> Failed
)
// applies given predicate to first symbol in the stream, on success transforms the symbol with p function
let choose f p = Parser(fun cs ->
match cs with
| c::cs2 when f c -> Success(p c, cs2)
| _ -> Failed
)
// selects digit and returns it as a result
let digit = choose Char.IsDigit id
// selects digit and returns unit
let digitU = choose Char.IsDigit ignore
// selects letter and returns it as a result
let letter = choose Char.IsLetter id
// selects digit and returns unit
let letterU = choose Char.IsLetter ignore
// selects letter or digit and returns it as a result
let letterOrDigit = choose Char.IsLetterOrDigit id
// selects digit and returns unit
let letterOrDigitU = choose Char.IsLetterOrDigit ignore
// selects given char and returns it as a result
let char c = choose ((=) c) id
// selects given char and returns unit
let charU c = choose ((=) c) ignore
// selects whitespace
let ws = choose Char.IsWhiteSpace ignore
// To enable computation expresion syntax
type ParserBuilder() =
member this.Return(v) = one v
member this.Bind(p, f) = bind p f
member this.ReturnFrom(p) = p
member this.Zero() = failed ()
let parser = ParserBuilder()
// applies given parser zero or more times. results are composed with v0 using f (in fold like manner)
let rec zeroOrMany p f v0 = parser {
return! oneOrMany p f v0 <|> one v0
}
// applies given parser one or more times. results are composed with v0 using f (in fold like manner)
and oneOrMany p f v0 = parser {
let! v1 = p
return! zeroOrMany p f (f v0 v1)
}
// repetition of given parser 0 or more times (similar to * in regexes)
let zeroOrManyU p = zeroOrMany p (fun _ _ -> ()) ()
// repetition of given parser 1 or more times (similar to + in regexes)
let oneOrManyU p = oneOrMany p (fun _ _ -> ()) ()
// transforms parser result with f function
let map p f = parser {
let! v = p
return f v
}
Now we can declare language grammar itself
namespace MiniML
module MiniMLParser =
open System
open BaseParsing
// Helpers
// checks if parsed text matches given string
let text (s : string) = Parser(fun cs ->
let rec loop l p =
if p >= s.Length then Success((), l)
else
match l with
| h::tl when h = s.[p] -> loop tl (p + 1)
| _ -> Failed
loop cs 0
)
// defines parser that consumes 0+ whitespaces + specified parser
let ws0Plus p = zeroOrManyU ws <&> p
// defines parser that consumes 1+ whitespaces + specified parser
let ws1Plus = oneOrManyU ws
// matches 0+ whitespaces + specifies string
let ws0Text s = ws0Plus (text s)
let ws1Text s = ws1Plus <&> (text s)
// matches 0+ whitespaces + specified char
let ws0Char c = ws0Plus (charU c)
// list of language keywords
let keywords = set ["let"; "letrec"; "in"; "if"; "then"; "else"]
// zero or more whitespaces + optional '-' + one or more digit
let integer = parser {
let! sign = ws0Plus ((char '-' <&> one -1) <|> (one 1))
let! value = 0 |> oneOrMany digit (fun acc v -> acc * 10 + (int v - int '0'))
return sign * value
}
// zero or more whitespaces + letter + zero or more letters or digits
// result should not be keyword
let identifier = parser {
let! f = ws0Plus letter
let sb = (new Text.StringBuilder()).Append(f)
let! r = sb |> zeroOrMany letterOrDigit (fun acc v -> acc.Append(v))
let text = r.ToString()
if not <| keywords.Contains(text)
then return text
}
let rec expr =
let parseIdentifier = map identifier Var
let parseNumber = map integer Integer
let parseLet, parseLetrec =
let p keyword f = parser {
do! ws0Text keyword <&> ws
let! id = identifier
do! ws0Char '='
let! value = expr
do! ws1Text "in"<&> ws
let! body = expr
return f(id, value, body)
}
p "let" Let, p "letrec" Letrec
let parseLambda = parser {
do! ws0Char '\\'
let! id = identifier
do! ws0Text "->"
let! body = expr
return Lambda(id, body)
}
let parseBracketed = parser {
do! ws0Char '('
let! e = expr
do! ws0Char ')'
return e
}
let parseApply = parser {
let! f = parseIdentifier <|> parseBracketed
return! f |> oneOrMany (parseIdentifier <|> parseNumber <|> parseBracketed) (fun acc e -> Apply(acc, e))
}
let parseIfThenElse = parser {
let body = parseIdentifier <|> parseBracketed <|> parseNumber
do! ws0Text "if" <&> ws
let! cond = parseIdentifier <|> parseBracketed
do! ws1Text "then" <&> ws
let! ifTrue = body
do! ws1Text "else" <&> ws
let! ifFalse = body
return IfThenElse(cond, ifTrue, ifFalse)
}
parseLet <|> parseLetrec <|> parseLambda <|> parseIfThenElse <|> parseApply <|> parseBracketed <|> parseIdentifier <|> parseNumber
let parse s = run expr s
To visualize parsing results we’ll use auxiliary function toString
open MiniML
let rec astToString = function
| Let(name, v, body) ->
sprintf "let %s =
%s in
%s" name (astToString v) (astToString body)
| Letrec(name, v, body) ->
sprintf "letrec %s =
%s in
%s" name (astToString v) (astToString body)
| Apply(arg, r) -> sprintf("(%s %s)") (astToString arg) (astToString r)
| IfThenElse(cond, ifTrue, ifFalse) ->
sprintf "if %s
then %s
else %s" (astToString cond) (astToString ifTrue) (astToString ifFalse)
| Integer(v) -> string v
| Lambda(name, body) -> sprintf "\%s -> %s" name (astToString body)
| Var(name) -> name
let parse s =
match MiniMLParser.parse s with
| BaseParsing.Success(r, []) -> printfn "%s" (astToString r)
| BaseParsing.Success(r, rest) ->
printfn "Warning: remaining chars found %A" rest
printfn "%s" (astToString r)
| BaseParsing.Failed -> printfn "Parsing failed for %s" s
let sources =
[
@"letrec length = \l -> if (null l) then 0 else (add 1 (length (tail l))) in length"
@"letrec fact = \x -> if (eq x 1) then 1 else (mul x (fact (add x -1))) in fact"
@"let f = \x -> \y -> (cons (x y) y) in f"
]
for src in sources do
parse src
(*
letrec length =
\l -> if (null l)
then 0
else ((add 1) (length (tail l))) in
length
letrec fact =
\x -> if ((eq x) 1)
then 1
else ((mul x) (fact ((add x) -1))) in
fact
let f =
\x -> \y -> ((cons (x y)) y) in
f
*)
All right, parsing step is completed but now we are facing another problem: our AST doesn’t contain any information about expression types (except the primitive numeric values). Luckily we know the answer.
Type inference
Wikipedia defines type inference as
Type inference, or implicit typing, refers to the ability to deduce automatically the type of a value in a programming language
In our compiler we’ll implement Hindley Milner type inference algorithm that is used by F# itself (as well as Haskell, Ocaml etc). Also nice and simple explanation can be found in ScalaByExample or here.
namespace MiniML
exception TypeInferenceException of Ast * string
module Types =
open System
type TypeVarName = string
// basic entity in the algorithm
type Type =
| TypeVar of TypeVarName
| Function of Type * Type
| TypeConstructor of string * Type list
override this.ToString() =
match this with
| TypeVar(name) -> name
| Function(arg, res) -> sprintf "(%O -> %O)" arg res
| TypeConstructor(name, args) ->
if List.isEmpty args
then name
else sprintf "%s[%s]" name (String.Join(", ", args |> Seq.map string))
exception UnificationException of Type * Type
// creates fresh type variables
type TypeVarGenerator() =
let n = ref 0
member this.New() =
incr n
TypeVar(sprintf "T%d" !n)
// represents mapping between types
[<AbstractClass>]
type Substitution private() =
static let empty = {new Substitution() with override this.Lookup(tv) = TypeVar(tv) }
static member Empty = empty
abstract Lookup : TypeVarName -> Type
member this.Run(t) =
match t with
| TypeVar tv ->
let substituted = this.Lookup(tv)
if t = substituted then substituted
else this.Run(substituted)
| Function(a, r) ->
Function(this.Run(a), this.Run(r))
| TypeConstructor(name, tyArgs) ->
TypeConstructor(name, tyArgs |> List.map this.Run)
member this.Extend(v : TypeVarName, t : Type) =
{ new Substitution() with override __.Lookup(tv) = if v = tv then t else this.Lookup(tv) }
// type + free type variables in type
// Instance returns new type where all free variables are replaces with fresh type vars
type TypeScheme(tyVars : Set<_>, t, gen : TypeVarGenerator) =
let subst = (Substitution.Empty, tyVars) ||> Seq.fold(fun s tt -> s.Extend(tt, gen.New() ))
let instance = subst.Run(t)
member this.Instance = instance
member this.TypeVariables = tyVars
member this.Type = t
// maps names to type schemes
// contains auxiliary functions that extracts type vars from different entities
module Env =
type Environment = Map<string, TypeScheme>
let rec typeVarsOfType = function
| TypeVar(tv) -> Set.singleton tv
| Function(a, r) -> typeVarsOfType(a) + typeVarsOfType(r)
| TypeConstructor(_, tyArgs) -> (Set.empty, tyArgs) ||> List.fold (fun acc ty -> acc + typeVarsOfType ty)
let typeVarsOfScheme(s : TypeScheme) =
(typeVarsOfType s.Type) - s.TypeVariables
let typeVarsOfEnv(e : Environment) =
let schemes = e |> Map.toSeq |> Seq.map snd
(Set.empty, schemes) ||> Seq.fold (fun acc s -> acc + (typeVarsOfScheme s))
let typeToScheme (env : Environment, t : Type, gen) =
TypeScheme((typeVarsOfType t) - (typeVarsOfEnv env), t, gen)
type Inferencer() =
let nameGenerator = TypeVarGenerator()
// calculates substitution that can convert two specified types into equivalent one
let rec unify(a : Type, b : Type, s : Substitution) =
match(s.Run(a), s.Run(b)) with
| TypeVar(ta), TypeVar(tb) when ta = tb ->
s
| TypeVar(ta), _ when not <| Env.typeVarsOfType(b).Contains(ta) ->
s.Extend(ta, b)
| _, TypeVar(_) ->
unify(b, a, s)
| Function(a1, b1), Function(a2, b2) ->
unify(a1, a2, unify(b1, b2, s))
| TypeConstructor(name1, args1), TypeConstructor(name2, args2) when name1 = name2 ->
(s, args1, args2) |||> List.fold2 (fun subst t1 t2 -> unify(t1, t2, subst))
| x,y -> UnificationException(x,y) |> raise
// predefines types and expressions
let newTypeVar = nameGenerator.New()
let newTypeScheme t = Env.typeToScheme(Map.empty, t, nameGenerator)
let boolean = TypeConstructor("bool", [])
let integer = TypeConstructor("int", [])
let list(t) = TypeConstructor("list", [t])
let builtins =
[
"true", newTypeScheme(boolean)
"false", newTypeScheme(boolean)
"nil", newTypeScheme(list(newTypeVar))
"cons", newTypeScheme(Function(newTypeVar, Function(list(newTypeVar), list(newTypeVar))))
"null", newTypeScheme(Function(list(newTypeVar), boolean))
"eq", newTypeScheme(Function(newTypeVar, Function(newTypeVar, boolean)))
"add", newTypeScheme(Function(integer, Function(integer, integer)))
"mul", newTypeScheme(Function(integer, Function(integer, integer)))
"sub", newTypeScheme(Function(integer, Function(integer, integer)))
"tail", newTypeScheme(Function(list(newTypeVar), list(newTypeVar)))
"head", newTypeScheme(Function(list(newTypeVar), newTypeVar))
] |> Map.ofList
// compute general substitution (based on s) for the pair of ast and baseType
let rec analyze (e : Env.Environment, ast, baseType : Type, s : Substitution) =
try
match ast with
| Integer(v) ->
unify(integer, baseType, s)
| Var(name) ->
if not (e.ContainsKey name)
then failwithf "Name %s no found" name
let schema = e.[name]
unify(schema.Instance, baseType, s)
| Lambda(arg, body) ->
let a = nameGenerator.New()
let b = nameGenerator.New()
let s1 = unify(baseType, Function(a, b), s)
let newEnv = e.Add(arg, TypeScheme(Set.empty, a, nameGenerator))
analyze(newEnv, body, b, s1)
| Apply(f, arg) ->
let a = nameGenerator.New()
let s1 = analyze(e, f, Function(a, baseType), s)
analyze(e, arg, a, s1)
| Let(name, inV, body) ->
let a = nameGenerator.New()
let s1 = analyze(e, inV, a, s)
analyze(e.Add(name, Env.typeToScheme(e, s1.Run(a), nameGenerator)),body, baseType, s1)
| Letrec(name, inV, body) ->
let t = nameGenerator.New()
let newEnv = e.Add(name, TypeScheme(Set.empty, t, nameGenerator))
let s1 = analyze(newEnv, inV, t, s)
analyze(e.Add(name, Env.typeToScheme(e, s1.Run(t), nameGenerator)), body, baseType, s1)
| IfThenElse(cond, ifTrue, ifFalse) ->
let s1 = analyze(e, cond, boolean, s)
let s2 = analyze(e, ifTrue, baseType, s1)
analyze(e, ifFalse, baseType, s2)
with
UnificationException(t1, t2) -> TypeInferenceException(ast, sprintf "Cannot unify %O and %O" t1 t2) |> raise
// renames type variables into good-looking ones
let alpha (t) =
let l = ref 'A'
let map = Collections.Generic.Dictionary<_, _>()
let rec run = function
| TypeVar(name) ->
if not <| map.ContainsKey(name) then
let newName = string (!l)
l := Convert.ToChar(int !l + 1)
map.Add(name, newName)
TypeVar(map.[name])
| Function(arg, res) -> Function(run arg, run res)
| TypeConstructor(name, typeArgs) -> TypeConstructor(name, List.map run typeArgs)
run t
member this.TypeOf(ast) =
let a = nameGenerator.New()
analyze(builtins, ast, a, Substitution.Empty).Run(a) |> alpha
Some samples
open MiniML
let sources =
[
@"if (1) then true else false", "incorrect condition type"
@"if true then 1 else false", "different types for conditional branches"
@"letrec fold =
\f -> \s -> \l -> if (null l) then s else (fold f (f s (head l)) (tail l)) in
fold", "generic fold"
@"letrec fold =
\f -> \s -> \l -> if (null l) then s else (fold f (f s (head l)) (tail l)) in
fold add", "partial application"
@"letrec length = \l -> if (null l) then 0 else (add 1 (length (tail l))) in length", "recursive definition1"
@"letrec fact = \x -> if (eq x 1) then 1 else (mul x (fact (add x -1))) in fact", "factorial"
@"let f = \x -> \y -> (cons (x y) y) in f", ""
]
let rec astToString = function
| Let(name, v, body) ->
sprintf "let %s =
%s in
%s" name (astToString v) (astToString body)
| Letrec(name, v, body) ->
sprintf "letrec %s =
%s in
%s" name (astToString v) (astToString body)
| Apply(arg, r) -> sprintf("(%s %s)") (astToString arg) (astToString r)
| IfThenElse(cond, ifTrue, ifFalse) ->
sprintf "if %s
then %s
else %s" (astToString cond) (astToString ifTrue) (astToString ifFalse)
| Integer(v) -> string v
| Lambda(name, body) -> sprintf "\%s -> %s" name (astToString body)
| Var(name) -> name
let parse (s, comment) =
printfn "===%s" comment
let inferencer = Types.Inferencer()
match MiniMLParser.parse s with
| BaseParsing.Success(r, []) ->
printfn "%s" (astToString r)
let text = try inferencer.TypeOf(r).ToString() with TypeInferenceException(ast, text) -> sprintf "%s: %s" (astToString ast) text
printfn "Type: %s" text
| _ -> printfn "Parsing failed for %s" s
for src in sources do
parse src
(*
===incorrect condition type
if 1
then true
else false
Type: 1: Cannot unify int and bool
===different types for conditional branches
if true
then 1
else false
Type: false: Cannot unify bool and int
===generic fold
letrec fold =
\f -> \s -> \l -> if (null l)
then s
else (((fold f) ((f s) (head l))) (tail l)) in
fold
Type: ((A -> (B -> A)) -> (A -> (list[B] -> A)))
===partial application
letrec fold =
\f -> \s -> \l -> if (null l)
then s
else (((fold f) ((f s) (head l))) (tail l)) in
(fold add)
Type: (int -> (list[int] -> int))
===recursive definition1
letrec length =
\l -> if (null l)
then 0
else ((add 1) (length (tail l))) in
length
Type: (list[A] -> int)
===factorial
letrec fact =
\x -> if ((eq x) 1)
then 1
else ((mul x) (fact ((add x) -1))) in
fact
Type: (int -> int)
===
let f =
\x -> \y -> ((cons (x y)) y) in
f
Type: ((list[A] -> A) -> (list[A] -> list[A]))
*)
Notes:
- Substutution is principal of the algorithm. It is function that maps type variables in given type to other types. Base substitution is Empty that is simple identity mapping. Inferencer.unify method creates general substitution that
- Extend source substitution
- Can convert two specified type into equivalent one
- TypeScheme describes structure of the type. It contains type itself and all type variables that is free in type (not captured from environment)
- Type inference mechanism relies on known information about built-in basic types (defined by builtins map)
- After type inference process is completed we perform alpha conversion for type variables to replace temporary names with nice looking ones