Occasional notes: F# Parsing simple language

During my professional career I often met the task of code generation based on some source data. Last time it was analogue of BCL compiler for ExpressionTrees and then C# compiler have done mostly all the required job with parsing expressions. It is time to invert our responsibilities and feel ourselves in the shoes of compiler. As a test animal we’ll take MiniML-like language (without binary/unary operators and with slightly modified syntax).

Parsing

Out first goal is transformation of plain character stream into the structured representation. First of all we’ll declare components of abstract syntax tree (AST)

namespace MiniML

type Ast = 
    | Let of string * Ast * Ast
    | Letrec of string * Ast * Ast
    | Var of string
    | Integer of int
    | Lambda of string * Ast
    | Apply of Ast * Ast
    | IfThenElse of Ast * Ast * Ast

With tools like fslexx/fsyacc creation of parser becomes a easy task, however today we’ll use another techique named parser combinators and define parser using F# itself. More details about this approach can be found in the documents below:

Disclaimer:Version of parser given in sample is pretty straightforward and ineffective for the sake of clearness

First of all we need to declare basic elements and combinators to create more complex parsers.

namespace MiniML

module BaseParsing = 
    open System

    // result of parsing - either value+remaining part of char stream or evidence of failure
    type ParseResult<'T> = Success of 'T * list<char> | Failed
    // parser itself - function from list of chars to parsing result
    type Parser<'T> = Parser of (list<char> -> ParseResult<'T>)

    let apply (Parser p) s = p s
    let run p l = apply p (Seq.toList l)
    
    // wraps given value into parser keeping char stream untouched
    let one v = Parser( fun cs -> Success(v, cs) )
    let failed () = Parser (fun _ -> Failed)
    
    // binds subsequent parsers
    let bind p f = Parser (fun cs ->
        match apply p cs with
        | Success(r, cs2) -> apply (f r) cs2
        | Failed -> Failed
        )
    // (OR combinator) applies first parser, if it fails - applies second
    let (<|>) p1 p2 = Parser(fun cs ->
        match apply p1 cs with
        | Failed -> apply p2 cs
        | ok -> ok
        )
    // (AND combinator) applies first parser to source stream, then applies second to remaining part of stream
    let (<&>) p1 p2 = Parser(fun cs ->
        match apply p1 cs with
        | Success(_, cs2) -> apply p2 cs2
        | Failed -> Failed
        )
    // applies given predicate to first symbol in the stream, on success transforms the symbol with p function
    let choose f p = Parser(fun cs ->
        match cs with
        | c::cs2 when f c -> Success(p c, cs2)
        | _ -> Failed 
        )
    // selects digit and returns it as a result
    let digit = choose Char.IsDigit id
    // selects digit and returns unit
    let digitU = choose Char.IsDigit ignore
    // selects letter and returns it as a result
    let letter = choose Char.IsLetter id
    // selects digit and returns unit
    let letterU = choose Char.IsLetter ignore
    // selects letter or digit and returns it as a result
    let letterOrDigit = choose Char.IsLetterOrDigit id
    // selects digit and returns unit
    let letterOrDigitU = choose Char.IsLetterOrDigit ignore
    // selects given char and returns it as a result
    let char c = choose ((=) c) id
    // selects given char and returns unit
    let charU c = choose ((=) c) ignore
    // selects whitespace
    let ws = choose Char.IsWhiteSpace ignore
    
    // To enable computation expresion syntax
    type ParserBuilder() =
        member this.Return(v) = one v
        member this.Bind(p, f) = bind p f
        member this.ReturnFrom(p) = p
        member this.Zero() = failed ()
    let parser = ParserBuilder()

    // applies given parser zero or more times. results are composed with v0 using f (in fold like manner)
    let rec zeroOrMany p f v0 = parser {
        return! oneOrMany p f v0 <|> one v0
        }    
    // applies given parser one or more times. results are composed with v0 using f (in fold like manner)
    and oneOrMany p f v0 = parser {
        let! v1 = p
        return! zeroOrMany p f (f v0 v1)
        }

    // repetition of given parser 0 or more times (similar to * in regexes)
    let zeroOrManyU p = zeroOrMany p (fun _ _ -> ()) ()
    // repetition of given parser 1 or more times (similar to + in regexes)
    let oneOrManyU p = oneOrMany p (fun _ _ -> ()) ()
    // transforms parser result with f function
    let map p f = parser {
        let! v = p
        return f v
        }

Now we can declare language grammar itself

namespace MiniML

module MiniMLParser = 
    
    open System
    open BaseParsing

    // Helpers

    // checks if parsed text matches given string
    let text (s : string) = Parser(fun cs ->
        let rec loop l p = 
            if p >= s.Length then Success((), l)
            else 
                match l with
                | h::tl when h = s.[p] -> loop tl (p + 1)
                | _ -> Failed
        loop cs 0
        ) 
    // defines parser that consumes 0+ whitespaces + specified parser
    let ws0Plus p = zeroOrManyU ws <&> p
    // defines parser that consumes 1+ whitespaces + specified parser
    let ws1Plus = oneOrManyU ws
    // matches 0+ whitespaces + specifies string
    let ws0Text s = ws0Plus (text s)
    let ws1Text s = ws1Plus <&> (text s)
    // matches 0+ whitespaces + specified char
    let ws0Char c = ws0Plus (charU c)

    // list of language keywords
    let keywords = set ["let"; "letrec"; "in"; "if"; "then"; "else"]

    // zero or more whitespaces + optional '-' + one or more digit
    let integer = parser {
        let! sign = ws0Plus ((char '-' <&> one -1) <|> (one 1))
        let! value = 0 |> oneOrMany digit (fun acc v -> acc * 10 + (int v - int '0'))
        return sign * value
        }
    
    // zero or more whitespaces + letter + zero or more letters or digits
    // result should not be keyword
    let identifier = parser {
        let! f  = ws0Plus letter
        let sb = (new Text.StringBuilder()).Append(f)
        let! r = sb |> zeroOrMany letterOrDigit (fun acc v -> acc.Append(v))
        let text = r.ToString()
        if not <| keywords.Contains(text)
        then return text
        }

    let rec expr  = 
        let parseIdentifier = map identifier Var
        let parseNumber = map integer Integer

        let parseLet, parseLetrec = 
            let p keyword f = parser {
                do! ws0Text keyword <&> ws
                let! id = identifier
                do! ws0Char '='
                let! value = expr 
                do! ws1Text "in"<&> ws 
                let! body = expr 
                return f(id, value, body)
                }
            p "let" Let, p "letrec" Letrec

        let parseLambda = parser {
            do! ws0Char '\\'
            let! id = identifier
            do! ws0Text "->"
            let! body = expr 
            return Lambda(id, body)
            }

        let parseBracketed = parser {
            do! ws0Char '('
            let! e = expr 
            do! ws0Char ')'
            return e
            }

        let parseApply = parser {
            let! f = parseIdentifier <|> parseBracketed 
            return! f |> oneOrMany (parseIdentifier <|> parseNumber <|> parseBracketed) (fun acc e -> Apply(acc, e))
            }

        let parseIfThenElse = parser {
            let body = parseIdentifier <|> parseBracketed  <|> parseNumber
            
            do! ws0Text "if" <&> ws
            let! cond = parseIdentifier <|> parseBracketed 
            do! ws1Text "then" <&> ws
            let! ifTrue = body
            do! ws1Text "else" <&> ws
            let! ifFalse = body
            return IfThenElse(cond, ifTrue, ifFalse)

            }
    
        parseLet <|> parseLetrec <|> parseLambda <|> parseIfThenElse <|> parseApply <|> parseBracketed <|> parseIdentifier <|> parseNumber
    
    let parse s = run expr s

To visualize parsing results we’ll use auxiliary function toString

open MiniML

let rec astToString = function 
    | Let(name, v, body) -> 
        sprintf "let %s = 
        %s in
        %s" name (astToString v) (astToString body)
    | Letrec(name, v, body) -> 
        sprintf "letrec %s = 
        %s in
        %s" name (astToString v) (astToString body)
    | Apply(arg, r) -> sprintf("(%s %s)") (astToString arg) (astToString r)
    | IfThenElse(cond, ifTrue, ifFalse) ->
        sprintf "if %s
        then %s
        else %s" (astToString cond) (astToString ifTrue) (astToString ifFalse)
    | Integer(v) -> string v
    | Lambda(name, body) -> sprintf "\%s -> %s" name (astToString body)
    | Var(name) -> name

let parse s = 
    match MiniMLParser.parse s with
    | BaseParsing.Success(r, []) -> printfn "%s" (astToString r)
    | BaseParsing.Success(r, rest) ->
        printfn "Warning: remaining chars found %A" rest
        printfn "%s" (astToString r)
    | BaseParsing.Failed -> printfn "Parsing failed for %s" s

let sources = 
    [
    @"letrec length = \l -> if (null l) then 0 else (add 1 (length (tail l))) in length"
    @"letrec fact = \x -> if (eq x 1) then 1 else (mul x (fact (add x -1))) in fact"
    @"let f = \x -> \y -> (cons (x y) y) in f"
    ]

for src in sources do
    parse src
(*
letrec length =
        \l -> if (null l)
        then 0
        else ((add 1) (length (tail l))) in
        length
letrec fact =
        \x -> if ((eq x) 1)
        then 1
        else ((mul x) (fact ((add x) -1))) in
        fact
let f =
        \x -> \y -> ((cons (x y)) y) in
        f
*)

All right, parsing step is completed but now we are facing another problem: our AST doesn’t contain any information about expression types (except the primitive numeric values). Luckily we know the answer.

Type inference

Wikipedia defines type inference as

Type inference, or implicit typing, refers to the ability to deduce automatically the type of a value in a programming language

In our compiler we’ll implement Hindley Milner type inference algorithm that is used by F# itself (as well as Haskell, Ocaml etc). Also nice and simple explanation can be found in ScalaByExample or here.

namespace MiniML

exception TypeInferenceException of Ast * string

module Types = 
    open System

    type TypeVarName = string

    // basic entity in the algorithm
    type Type =
        | TypeVar of TypeVarName
        | Function of Type * Type
        | TypeConstructor of string * Type list
        override this.ToString() =
            match this with
            | TypeVar(name) -> name
            | Function(arg, res) -> sprintf "(%O -> %O)" arg res
            | TypeConstructor(name, args) -> 
                if List.isEmpty args 
                then name
                else sprintf "%s[%s]" name (String.Join(", ", args |> Seq.map string))
    
    exception UnificationException of Type * Type
    
    // creates fresh type variables
    type TypeVarGenerator() = 
        let n = ref 0
        member this.New() = 
            incr n
            TypeVar(sprintf "T%d" !n)
    
    // represents mapping between types
    [<AbstractClass>]
    type Substitution private() = 
        static let empty = {new Substitution() with override this.Lookup(tv) = TypeVar(tv) }
        static member Empty = empty

        abstract Lookup : TypeVarName -> Type
        member this.Run(t) = 
            match t with
            | TypeVar tv ->
                let substituted = this.Lookup(tv)
                if t = substituted then substituted
                else this.Run(substituted)    
            | Function(a, r) -> 
                Function(this.Run(a), this.Run(r))
            | TypeConstructor(name, tyArgs) ->
                TypeConstructor(name, tyArgs |> List.map this.Run)

        member this.Extend(v : TypeVarName, t : Type) = 
            { new Substitution() with override __.Lookup(tv) = if v = tv then t else this.Lookup(tv) }
    
    // type + free type variables in type
    // Instance returns new type where all free variables are replaces with fresh type vars
    type TypeScheme(tyVars : Set<_>, t, gen : TypeVarGenerator) = 
        let subst = (Substitution.Empty, tyVars) ||> Seq.fold(fun s tt -> s.Extend(tt, gen.New() ))
        let instance = subst.Run(t)
        member this.Instance = instance
        member this.TypeVariables = tyVars
        member this.Type = t

    // maps names to type schemes
    // contains auxiliary functions that extracts type vars from different entities
    module Env = 
        type Environment = Map<string, TypeScheme>

        let rec typeVarsOfType = function
            | TypeVar(tv)  -> Set.singleton tv
            | Function(a, r) -> typeVarsOfType(a) + typeVarsOfType(r)
            | TypeConstructor(_, tyArgs) -> (Set.empty, tyArgs) ||> List.fold (fun acc ty -> acc + typeVarsOfType ty)
    
        let typeVarsOfScheme(s : TypeScheme) = 
            (typeVarsOfType s.Type) - s.TypeVariables
    
        let typeVarsOfEnv(e : Environment) = 
            let schemes = e |> Map.toSeq |> Seq.map snd
            (Set.empty, schemes) ||> Seq.fold (fun acc s -> acc + (typeVarsOfScheme s))

        let typeToScheme (env : Environment, t : Type, gen) = 
            TypeScheme((typeVarsOfType t) - (typeVarsOfEnv env), t, gen)
    
    type Inferencer() =
        let nameGenerator = TypeVarGenerator() 

        // calculates substitution that can convert two specified types into equivalent one
        let rec unify(a : Type, b : Type, s : Substitution) =
            match(s.Run(a), s.Run(b)) with
            | TypeVar(ta), TypeVar(tb) when ta = tb -> 
                s
            | TypeVar(ta), _ when not <| Env.typeVarsOfType(b).Contains(ta) -> 
                s.Extend(ta, b)
            | _, TypeVar(_) -> 
                unify(b, a, s) 
            | Function(a1, b1), Function(a2, b2) -> 
                unify(a1, a2, unify(b1, b2, s))
            | TypeConstructor(name1, args1), TypeConstructor(name2, args2) when name1 = name2 ->
                (s, args1, args2) |||> List.fold2 (fun subst t1 t2 -> unify(t1, t2, subst))
            | x,y -> UnificationException(x,y) |> raise

        // predefines types and expressions
        let newTypeVar = nameGenerator.New()
        let newTypeScheme t = Env.typeToScheme(Map.empty, t, nameGenerator)

        let boolean = TypeConstructor("bool", [])
        let integer = TypeConstructor("int", [])
        let list(t) = TypeConstructor("list", [t])
         
        let builtins = 
            [
            "true", newTypeScheme(boolean)
            "false", newTypeScheme(boolean)
            "nil", newTypeScheme(list(newTypeVar))
            "cons", newTypeScheme(Function(newTypeVar, Function(list(newTypeVar), list(newTypeVar))))
            "null", newTypeScheme(Function(list(newTypeVar), boolean))
            "eq", newTypeScheme(Function(newTypeVar, Function(newTypeVar, boolean)))
            "add", newTypeScheme(Function(integer, Function(integer, integer)))
            "mul", newTypeScheme(Function(integer, Function(integer, integer)))
            "sub", newTypeScheme(Function(integer, Function(integer, integer)))
            "tail", newTypeScheme(Function(list(newTypeVar), list(newTypeVar)))
            "head", newTypeScheme(Function(list(newTypeVar), newTypeVar))
            ] |> Map.ofList

        // compute general substitution (based on s) for the pair of ast and baseType
        let rec analyze (e : Env.Environment, ast, baseType : Type, s : Substitution) = 
            try
                match ast with
                | Integer(v) ->
                    unify(integer, baseType, s)
                | Var(name) ->
                    if not (e.ContainsKey name) 
                    then failwithf "Name %s no found" name

                    let schema = e.[name]
                    unify(schema.Instance, baseType, s)
                | Lambda(arg, body) ->
                    let a = nameGenerator.New()
                    let b = nameGenerator.New()

                    let s1 = unify(baseType, Function(a, b), s)
                    let newEnv = e.Add(arg, TypeScheme(Set.empty, a, nameGenerator))
                    analyze(newEnv, body, b, s1)
                | Apply(f, arg) ->
                    let a = nameGenerator.New()
                    let s1 = analyze(e, f, Function(a, baseType), s)
                    analyze(e, arg, a, s1)
                | Let(name, inV, body) ->
                    let a = nameGenerator.New()
                    let s1 = analyze(e, inV, a, s)
                    analyze(e.Add(name, Env.typeToScheme(e, s1.Run(a), nameGenerator)),body, baseType, s1)
                | Letrec(name, inV, body) ->
                    let t = nameGenerator.New()
                    let newEnv = e.Add(name, TypeScheme(Set.empty, t, nameGenerator))
                    let s1 = analyze(newEnv, inV, t, s)
                    analyze(e.Add(name, Env.typeToScheme(e, s1.Run(t), nameGenerator)), body, baseType, s1)               

                | IfThenElse(cond, ifTrue, ifFalse) ->
                    let s1 = analyze(e, cond, boolean, s)
                    let s2 = analyze(e, ifTrue, baseType, s1)
                    analyze(e, ifFalse, baseType, s2)             
            with
                UnificationException(t1, t2) -> TypeInferenceException(ast, sprintf "Cannot unify %O and %O" t1 t2) |> raise

        // renames type variables into good-looking ones
        let alpha (t) = 
            let l = ref 'A'
            let map = Collections.Generic.Dictionary<_, _>()
            let rec run = function
                | TypeVar(name) ->
                    if not <| map.ContainsKey(name) then
                        let newName = string (!l)
                        l := Convert.ToChar(int !l + 1)
                        map.Add(name, newName)
                    TypeVar(map.[name])
                | Function(arg, res) -> Function(run arg, run res)
                | TypeConstructor(name, typeArgs) -> TypeConstructor(name, List.map run typeArgs)
            run t

        member this.TypeOf(ast) =
            let a = nameGenerator.New()
            analyze(builtins, ast, a, Substitution.Empty).Run(a) |> alpha

Some samples

open MiniML

let sources = 
    [
    @"if (1) then true else false", "incorrect condition type"
    @"if true then 1 else false", "different types for conditional branches"
    @"letrec fold = 
        \f -> \s -> \l -> if (null l) then s else (fold f (f s (head l)) (tail l)) in 
        fold", "generic fold"
    @"letrec fold = 
        \f -> \s -> \l -> if (null l) then s else (fold f (f s (head l)) (tail l)) in 
        fold add", "partial application"
    @"letrec length = \l -> if (null l) then 0 else (add 1 (length (tail l))) in length", "recursive definition1"
    @"letrec fact = \x -> if (eq x 1) then 1 else (mul x (fact (add x -1))) in fact", "factorial"
    @"let f = \x -> \y -> (cons (x y) y) in f", ""
    ]

let rec astToString = function 
    | Let(name, v, body) -> 
        sprintf "let %s = 
        %s in
        %s" name (astToString v) (astToString body)
    | Letrec(name, v, body) -> 
        sprintf "letrec %s = 
        %s in
        %s" name (astToString v) (astToString body)
    | Apply(arg, r) -> sprintf("(%s %s)") (astToString arg) (astToString r)
    | IfThenElse(cond, ifTrue, ifFalse) ->
        sprintf "if %s
        then %s
        else %s" (astToString cond) (astToString ifTrue) (astToString ifFalse)
    | Integer(v) -> string v
    | Lambda(name, body) -> sprintf "\%s -> %s" name (astToString body)
    | Var(name) -> name

let parse (s, comment) = 
    printfn "===%s" comment
    let inferencer = Types.Inferencer()
    match MiniMLParser.parse s with
    | BaseParsing.Success(r, []) -> 
        printfn "%s" (astToString r)
        let text = try inferencer.TypeOf(r).ToString() with TypeInferenceException(ast, text) -> sprintf "%s: %s" (astToString ast) text
        printfn "Type: %s" text
    | _ -> printfn "Parsing failed for %s" s

for src in sources do
    parse src
(*
===incorrect condition type
if 1
        then true
        else false
Type: 1: Cannot unify int and bool
===different types for conditional branches
if true
        then 1
        else false
Type: false: Cannot unify bool and int
===generic fold
letrec fold =
        \f -> \s -> \l -> if (null l)
        then s
        else (((fold f) ((f s) (head l))) (tail l)) in
        fold
Type: ((A -> (B -> A)) -> (A -> (list[B] -> A)))
===partial application
letrec fold =
        \f -> \s -> \l -> if (null l)
        then s
        else (((fold f) ((f s) (head l))) (tail l)) in
        (fold add)
Type: (int -> (list[int] -> int))
===recursive definition1
letrec length =
        \l -> if (null l)
        then 0
        else ((add 1) (length (tail l))) in
        length
Type: (list[A] -> int)
===factorial
letrec fact =
        \x -> if ((eq x) 1)
        then 1
        else ((mul x) (fact ((add x) -1))) in
        fact
Type: (int -> int)
===
let f =
        \x -> \y -> ((cons (x y)) y) in
        f
Type: ((list[A] -> A) -> (list[A] -> list[A]))
*)

Notes:

Substutution is principal of the algorithm. It is function that maps type variables in given type to other types. Base substitution is Empty that is simple identity mapping. Inferencer.unify method creates general substitution that

Extend source substitution
Can convert two specified type into equivalent one

TypeScheme describes structure of the type. It contains type itself and all type variables that is free in type (not captured from environment)
Type inference mechanism relies on known information about built-in basic types (defined by builtins map)
After type inference process is completed we perform alpha conversion for type variables to replace temporary names with nice looking ones

Occasional notes

суббота, 29 мая 2010 г.

F# Parsing simple language

Parsing

Type inference

Комментариев нет:

Отправить комментарий

Обо мне

FeedBurner Feed Count

StackOverflow Profile

Архив блога

Ярлыки

Occasional notes

суббота, 29 мая 2010 г.

F# Parsing simple language

Parsing

Type inference

Комментариев нет:

Отправить комментарий

Обо мне

FeedBurner Feed Count

StackOverflow Profile

Архив блога

Ярлыки

суббота, 29 мая 2010 г.