From 2224a46b6fcab8fcb043dacadc869d19168c5b4e Mon Sep 17 00:00:00 2001 From: Matthew Flatt Date: Thu, 14 Feb 2008 14:07:57 +0000 Subject: [PATCH] docs form David Van Horn; Slideshow tweaks svn: r8661 original commit: ca1a9dc8bfc22c376c6c3eeeedfbb52c2ae9b1e8 --- collects/parser-tools/info.ss | 2 +- collects/parser-tools/parser-tools.scrbl | 403 ++++++++++++++--------- 2 files changed, 245 insertions(+), 160 deletions(-) diff --git a/collects/parser-tools/info.ss b/collects/parser-tools/info.ss index 67fcce5..5e433f6 100644 --- a/collects/parser-tools/info.ss +++ b/collects/parser-tools/info.ss @@ -1,3 +1,3 @@ #lang setup/infotab -(define scribblings '(("parser-tools.scrbl" ()))) +(define scribblings '(("parser-tools.scrbl" (multi-page)))) diff --git a/collects/parser-tools/parser-tools.scrbl b/collects/parser-tools/parser-tools.scrbl index 5366d1c..df13918 100644 --- a/collects/parser-tools/parser-tools.scrbl +++ b/collects/parser-tools/parser-tools.scrbl @@ -3,14 +3,19 @@ scribble/struct scribble/xref scribble/bnf - (for-label parser-tools/lex - (prefix-in : parser-tools/lex-sre))) + (for-label scheme/base + scheme/contract + parser-tools/lex + (prefix-in : parser-tools/lex-sre) + parser-tools/yacc)) @title{@bold{Parser Tools}: @exec{lex} and @exec{yacc}-style Parsing} This documentation assumes familiarity with @exec{lex} and @exec{yacc} style lexer and parser generators. +@table-of-contents[] + @; ---------------------------------------------------------------------- @section{Lexers} @@ -22,7 +27,7 @@ style lexer and parser generators. @subsection{Creating a Lexer} @defform/subs[#:literals (repetition union intersection complement concatenation - char-range char-complement char-set + char-range char-complement eof special special-comment) (lexer [trigger action-expr] ...) ([trigger re @@ -39,7 +44,6 @@ style lexer and parser generators. (concatenation re ...) (char-range char char) (char-complement re) - (char-set string) (id datum ...)])]{ Produces a function that takes an input-port, matches the @@ -70,7 +74,6 @@ style lexer and parser generators. a single character string can be used as a @scheme[char].} @item{@scheme[(char-complement re)] --- matches any character not matched by @scheme[re]. The sub-expression must be a set of characters @scheme[re].} - @item{@scheme[(char-set string)] --- matches any character in the string.} @item{@scheme[(id datum ...)] --- expands the @deftech{lexer macro} named @scheme[id]; macros are defined via @scheme[define-lex-trans].} } @@ -276,6 +279,10 @@ error.} @subsection{Lexer Abbreviations and Macros} +@defform[(char-set string)]{ + +A @tech{lexer macro} that matches any character in @scheme[string].} + @defidform[any-char]{A @tech{lexer abbreviation} that matches any character.} @defidform[any-string]{A @tech{lexer abbreviation} that matches any string.} @@ -321,54 +328,7 @@ characters, @scheme[char-lower-case?] characters, etc.} @; ---------------------------------------- -@subsection{Tokens} - -Each @scheme[_action-expr] in a @scheme[lexer] form can produce any -kind of value, but for many purposes, producing a @deftech{token} -value is useful. Tokens are usually necessary for inter-operating with -a parser generated by @scheme[parser-tools/parser], but tokens not be -the right choice when using @scheme[lexer] in other situations. - -@defform[(define-tokens group-id (token-id ...))]{ - - Binds @scheme[group-id] to the group of tokens being defined. 
For
-  each @scheme[token-id], a function
-  @schemeidfont{token-}@scheme[token-id] is created that takes any
-  value and puts it in a token record specific to @scheme[token-id].
-  The token value is inspected using @scheme[token-name] and
-  @scheme[token-value].
-
-  A token cannot be named @schemeidfont{error}, since
-  @schemeidfont{error} it has special use in the parser.}
-
-@defform[(define-empty-tokens group-id (token-id ...) )]{
-
-
-  Like @scheme[define-tokens], except a each token constructor
-  @schemeidfont{token-}@scheme[token-id] take nos arguments and returns
-  @scheme[(@scheme[quote] token-id)].}
-
-
-@defproc[(token-name [t (or/c token? symbol?)]) symbol?]{
-
-  Returns the name of a token that is represented either by a symbol
-  or a token structure.}
-
-
-@defproc[(token-value [t (or/c token? symbol?)]) any/c]{
-
-  Returns the value of a token that is represented either by a symbol
-  or a token structure, returning @scheme[#f] for a symbol token.}
-
-
-@defproc[(token? [v any/c]) boolean?]{
-
-  Returns @scheme[#t] if @scheme[val] is a
-  token structure, @scheme[#f] otherwise.}
-
-@; ----------------------------------------------------------------------
-
-@section{Lex SRE Operators}
+@subsection{Lexer SRE Operators}

 @defmodule[parser-tools/lex-sre]

@@ -442,144 +402,266 @@ characters.}

 @(lex-sre-doc)

+@; ----------------------------------------
+
+@subsection{Lexer Legacy Operators}
+
+@defmodule[parser-tools/lex-plt-v200]
+
+@(define-syntax-rule (lex-v200-doc)
+  (...
+   (begin
+     (require (for-label parser-tools/lex-plt-v200))
+
+@t{The @schememodname[parser-tools/lex-plt-v200] module re-exports
+   @scheme[*], @scheme[+], @scheme[?], and @scheme[&] from
+   @schememodname[parser-tools/lex-sre]. It also re-exports
+   @scheme[:or] as @scheme[:], @scheme[::] as @scheme[|@|], @scheme[:~]
+   as @scheme[^], and @scheme[:/] as @scheme[-].}
+
+@defform[(epsilon)]{
+
+A @tech{lexer macro} that matches an empty sequence.}
+
+@defform[(~ re ...)]{
+
+The same as @scheme[(complement re ...)].})))
+
+@(lex-v200-doc)
+
+@; ----------------------------------------
+
+@subsection{Tokens}
+
+Each @scheme[_action-expr] in a @scheme[lexer] form can produce any
+kind of value, but for many purposes, producing a @deftech{token}
+value is useful. Tokens are usually necessary for inter-operating with
+a parser generated by @scheme[parser-tools/yacc], but tokens may not be
+the right choice when using @scheme[lexer] in other situations.
+
+@defform[(define-tokens group-id (token-id ...))]{
+
+  Binds @scheme[group-id] to the group of tokens being defined. For
+  each @scheme[token-id], a function
+  @schemeidfont{token-}@scheme[token-id] is created that takes any
+  value and puts it in a token record specific to @scheme[token-id].
+  The token value is inspected using @scheme[token-name] and
+  @scheme[token-value].
+
+  A token cannot be named @schemeidfont{error}, since
+  @schemeidfont{error} has a special use in the parser.}
+
+@defform[(define-empty-tokens group-id (token-id ...) )]{
+
+
+  Like @scheme[define-tokens], except that each token constructor
+  @schemeidfont{token-}@scheme[token-id] takes no arguments and returns
+  @scheme[(@scheme[quote] token-id)].}
+
+
+@defproc[(token-name [t (or/c token? symbol?)]) symbol?]{
+
+  Returns the name of a token that is represented either by a symbol
+  or a token structure.}
+
+
+@defproc[(token-value [t (or/c token? symbol?)]) any/c]{
+
+  Returns the value of a token that is represented either by a symbol
+  or a token structure, returning @scheme[#f] for a symbol token.}
+
+
+@defproc[(token? [v any/c]) boolean?]{
+
+  Returns @scheme[#t] if @scheme[v] is a
+  token structure, @scheme[#f] otherwise.}
+
 @; ----------------------------------------------------------------------

 @section{Parsers}

 @defmodule[parser-tools/yacc]

-@defform/subs[(parser clause ...)
-              ([clause ....])]{
-
-  Creates a parser. The clauses may be in any order (as
-  long as there are no duplicates and all non-optional arguments are
-  present).
+@defform/subs[#:literals (grammar tokens start end precs error src-pos
+                          suppress debug yacc-output prec)
+              (parser clause ...)
+              ([clause (grammar (non-terminal-id
+                                 ((grammar-id ...) maybe-prec expr)
+                                 ...)
+                                ...)
+                       (tokens group-id ...)
+                       (start non-terminal-id ...)
+                       (end token-id ...)
+                       (error expr)
+                       (precs (assoc token-id ...) ...)
+                       (src-pos)
+                       (suppress)
+                       (debug filename)
+                       (yacc-output filename)]
+               [maybe-prec code:blank
+                           (prec token-id)]
+               [assoc left right nonassoc])]{
+
+  Creates a parser. The clauses may be in any order, as long as there
+  are no duplicates and all non-@italic{OPTIONAL} declarations are
+  present:

 @itemize{

-   @item{@scheme[(debug filename)] @italic{OPTIONAL}
+   @item{@schemeblock0[(grammar (non-terminal-id
+                                 ((grammar-id ...) maybe-prec expr)
+                                 ...)
+                                ...)]
+
+     Declares the grammar to be parsed. Each @scheme[grammar-id] can
+     be a @scheme[token-id] from a @scheme[group-id] named in a
+     @scheme[tokens] declaration, or it can be a
+     @scheme[non-terminal-id] declared in the @scheme[grammar]
+     declaration. The optional @scheme[prec] declaration works with
+     the @scheme[precs] declaration. The @scheme[expr] is a
+     ``semantic action,'' which is evaluated when the input is found
+     to match its corresponding production.
+
+     Each action is scheme code that has the same scope as its
+     parser's definition, except that the variables @scheme[$1], ...,
+     @schemeidfont{$}@math{n} are bound, where @math{n} is the number
+     of @scheme[grammar-id]s in the corresponding production. Each
+     @schemeidfont{$}@math{i} is bound to the result of the action
+     for the @math{i}@superscript{th} grammar symbol on the right of
+     the production, if that grammar symbol is a non-terminal, or the
+     value stored in the token if the grammar symbol is a terminal.
+     If the @scheme[src-pos] option is present in the parser, then
+     variables @scheme[$1-start-pos], ...,
+     @schemeidfont{$}@math{n}@schemeidfont{-start-pos} and
+     @scheme[$1-end-pos], ...,
+     @schemeidfont{$}@math{n}@schemeidfont{-end-pos} are also
+     available, and they refer to the position structures
+     corresponding to the start and end of the corresponding
+     @scheme[grammar-id]. Grammar symbols defined as empty-tokens
+     have no @schemeidfont{$}@math{i} associated, but do have
+     @schemeidfont{$}@math{i}@schemeidfont{-start-pos} and
+     @schemeidfont{$}@math{i}@schemeidfont{-end-pos}.
-     causes the parser generator to write the LALR table to the file
-     named @filepath{filename} (unless the file exists).
-     @filepath{filename} must be a string. Additionally, if a debug
-     file is specified, when a running generated parser encounters a
-     parse error on some input file, after the user specified error
-     expression returns, the complete parse stack is printed to
-     assist in debugging the grammar of that particular parser. The
-     numbers in the stack printout correspond to the state numbers in
-     the LALR table file.}
-
-   @item{@scheme[(yacc-output filename)] @italic{OPTIONAL}
-     causes the parser generator to write a grammar file in the
-     syntax of YACC/Bison. The file might not be a valid YACC file
-     because the scheme grammar can use symbols that are invalid in
-     C.}
-
-   @item{@scheme[(suppress)] @italic{OPTIONAL}
+     All of the productions for a given non-terminal must be grouped
+     with it. That is, no @scheme[non-terminal-id] may appear twice
+     on the left hand side in a parser.}
-     causes the parser generator not to report shift/reduce or
-     reduce/reduce conflicts.}
-   @item{@scheme[(src-pos)] @italic{OPTIONAL}
+   @item{@scheme[(tokens group-id ...)]
+
+     Declares that all of the tokens defined in each
+     @scheme[group-id] can be used by the parser in the
+     @scheme[grammar] declaration.}
-     causes the generated parser to expect input in the form
-     @scheme[(make-position-token token position position)] instead
-     of simply @scheme[token]. Include this option when using the
-     parser with a lexer generated with @scheme[lexer-src-pos].}
-   @item{@scheme[(error expression)]
+   @item{@scheme[(start non-terminal-id ...)]
+
+     Declares a list of starting non-terminals for the grammar.}
+
+
+   @item{@scheme[(end token-id ...)]
+
+     Specifies a set of tokens from which some member must follow any
+     valid parse. For example, an EOF token would be specified for a
+     parser that parses entire files and a newline token for a parser
+     that parses entire lines individually.}
-     expression should evaluate to a function which will be executed
-     for its side-effect whenever the parser encounters an error. If
-     the @scheme[src-pos] option is present, the function should
-     accept 5 arguments, @schemeblock[(lambda (token-ok token-name
-     token-value start-pos end-pos) ...)]. Otherwise it should
-     accept 3, @schemeblock[(lambda (token-ok token-name token-value)
-     ...)]. The first argument will be @scheme[#f] iff the error is
-     that an invalid token was received. The second and third
+   @item{@scheme[(error expr)]
+
+     The @scheme[expr] should evaluate to a function which will be
+     executed for its side-effect whenever the parser encounters an
+     error.
+
+     If the @scheme[src-pos] declaration is present, the function
+     should accept 5 arguments:
+
+     @schemeblock[(lambda (tok-ok? tok-name tok-value _start-pos _end-pos)
+                    ....)]
+
+     Otherwise it should accept 3:
+
+     @schemeblock[(lambda (tok-ok? tok-name tok-value)
+                    ....)]
+
+     The first argument will be @scheme[#f] if and only if the error
+     is that an invalid token was received. The second and third
     arguments will be the name and the value of the token at which
     the error was detected. The fourth and fifth arguments, if
    present, provide the source positions of that token.}
-   @item{@scheme[(tokens group-name ...)]
-     declares that all of the tokens defined in the groups can be
-     handled by this parser.}
+   @item{@scheme[(precs (assoc token-id ...) ...)]
+         @italic{OPTIONAL}
-   @item{@scheme[(start non-terminal-name ...)]
+     Precedence declarations to resolve shift/reduce and
+     reduce/reduce conflicts as in @exec{yacc}/@exec{bison}. An
+     @scheme[assoc] must be one of @scheme[left], @scheme[right] or
+     @scheme[nonassoc]. States with multiple shift/reduce or
+     reduce/reduce conflicts (or some combination thereof) are not
+     resolved with precedence.}
-     declares a list of starting non-terminals for the grammar.}
+   @item{@scheme[(src-pos)] @italic{OPTIONAL}
-   @item{@scheme[(end token-name ...)]
+     Causes the generated parser to expect input in the form
+     @scheme[(make-position-token _token _start-pos _end-pos)] instead
+     of simply @scheme[_token]. Include this option when using the
+     parser with a lexer generated with @scheme[lexer-src-pos].}
-     specifies a set of tokens from which some member must follow any
-     valid parse. For example an EOF token would be specified for a
-     parser that parses entire files and a @nonterm{newline} token
-     for a parser that parses entire lines individually.}
-   @item{@scheme[(precs (assoc token-name ...) ...)]
-         @italic{OPTIONAL}
+   @item{@scheme[(debug filename)] @italic{OPTIONAL}
-     precedence declarations to resolve shift/reduce and
-     reduce/reduce conflicts as in YACC/BISON. @scheme[assoc] must
-     be one of @scheme[left], @scheme[right] or @scheme[nonassoc].
-     States with multiple shift/reduce or reduce/reduce conflicts or
-     some combination thereof are not resolved with precedence.}
-
-   @item{@schemeblock0[(grammar (non-terminal ((grammar-symbol ...) (prec token-name) expression)
-                                ...)
-                       ...)]
-
-     declares the @scheme[grammar] to be parsed. Each
-     @scheme[grammar-symbol] must be a @scheme[token-name] or
-     @scheme[non-terminal]. The @scheme[prec] declaration is
-     optional. @scheme[expression] is a semantic action which will
-     be evaluated when the input is found to match its corresponding
-     production. Each action is scheme code that has the same scope
-     as its parser's definition, except that the variables
-     @scheme[$1], ..., @scheme[$n] are bound in the expression and
-     may hide outside bindings of @scheme[$1], ... @scheme[$n].
-     @scheme[$x] is bound to the result of the action for the
-     @scheme[$x]@superscript{th} grammar symbol on the right of the
-     production, if that grammar symbol is a non-terminal, or the
-     value stored in the token if the grammar symbol is a terminal.
-     Here @scheme[n] is the number of @scheme[grammar-symbol]s on the
-     right of the production. If the @scheme[src-pos] option is
-     present in the parser, variables @scheme[$1-start-pos], ...,
-     @scheme[$n-start-pos] and @scheme[$1-end-pos], ...,
-     @scheme[$n-end-pos] are also available and refer to the position
-     structures corresponding to the start and end of the
-     corresponding @scheme[grammar-symbol]. Grammar symbols defined
-     as empty-tokens have no @scheme[$n] associated, but do have
-     @scheme[$n-start-pos] and @scheme[$n-end-pos]. All of the
-     productions for a given non-terminal must be grouped with it,
-     i.e., no non-terminal may appear twice on the left hand side in
-     a parser.}
+     Causes the parser generator to write the LALR table to the file
+     named @scheme[filename] (unless the file exists), where
+     @scheme[filename] is a literal string. Additionally, if a debug
+     file is specified, when a running generated parser encounters a
+     parse error on some input file, after the user-specified error
+     expression returns, the complete parse stack is printed to
+     assist in debugging the grammar of that particular parser. The
+     numbers in the stack printout correspond to the state numbers in
+     the LALR table file.}
+
+
+   @item{@scheme[(yacc-output filename)] @italic{OPTIONAL}
+
+     Causes the parser generator to write a grammar file in
+     approximately the syntax of @exec{yacc}/@exec{bison}. The file
+     might not be a valid @exec{yacc} file, because the scheme
+     grammar can use symbols that are invalid in C.}
+
+
+   @item{@scheme[(suppress)] @italic{OPTIONAL}
+
+     Causes the parser generator not to report shift/reduce or
+     reduce/reduce conflicts.}
 }

-The result of a parser expression with one start non-terminal is a
-function, @scheme[f], that takes one argument. This argument must be
-a zero argument function, @scheme[t], that produces successive tokens
-of the input each time it is called. If desired, the @scheme[t] may
-return symbols instead of tokens. The parser will treat symbols as
-tokens of the corresponding name (with @scheme[#f] as a value, so it
-is usual to return symbols only in the case of empty tokens).
-@scheme[f] returns the value associated with the parse tree by the
-semantic actions. If the parser encounters an error, after invoking
-the supplied error function, it will try to use error productions to
-continue parsing. If it cannot, it raises a read error.
-If multiple start non-terminals are provided, the parser expression
-will result in a list of parsing functions (each one will individually
-behave as if it were the result of a parser expression with only one
-start non-terminal), one for each start non-terminal, in the same order.
-Each time the scheme code for a lexer is compiled (e.g. when a
-@filepath{.ss} file containing a @scheme[parser] form is loaded), the
-parser generator is run. To avoid this overhead place the parser into
-a module and compile the module to a @filepath{.zo} bytecode file.}
+  The result of a @scheme[parser] expression with one @scheme[start]
+  non-terminal is a function, @scheme[_parse], that takes one
+  argument. This argument must be a zero argument function,
+  @scheme[_gen], that produces successive tokens of the input each
+  time it is called. If desired, the @scheme[_gen] may return
+  symbols instead of tokens, and the parser will treat symbols as
+  tokens of the corresponding name (with @scheme[#f] as a value, so
+  it is usual to return symbols only in the case of empty tokens).
+  The @scheme[_parse] function returns the value associated with the
+  parse tree by the semantic actions. If the parser encounters an
+  error, after invoking the supplied error function, it will try to
+  use error productions to continue parsing. If it cannot, it
+  raises @scheme[exn:fail:read].
+
+  If multiple non-terminals are provided in @scheme[start], the
+  @scheme[parser] expression produces a list of parsing functions,
+  one for each non-terminal, in the same order. Each parsing function
+  is like the result of a parser expression with only one
+  @scheme[start] non-terminal.
+
+  Each time the scheme code for a @scheme[parser] is compiled
+  (e.g. when a @filepath{.ss} file containing a @scheme[parser] form
+  is loaded), the parser generator is run. To avoid this overhead,
+  place the parser into a module and compile the module to a
+  @filepath{.zo} bytecode file.}

 @; ----------------------------------------------------------------------

 @section{Converting @exec{yacc} or @exec{bison} Grammars}

 @defmodule[parser-tools/yacc-to-scheme]

 @defproc[(trans [file path-string?]) any/c]{

 Reads a C @exec{yacc}/@exec{bison} grammar from @scheme[file] and
 produces an s-expression that represents a scheme parser for use with
 @scheme[parser].

 This function is intended to assist in the manual conversion of
 grammars for use with @scheme[parser], and not as a fully automatic
 conversion tool. It is not entirely robust. For example, if the C
 actions in the original grammar have nested blocks, the tool will
 fail.

 Annotated examples are in the @filepath{examples} subdirectory of the
 @filepath{parser-tools} collection.}

+@; ----------------------------------------------------------------------
+
+@index-section[]
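
As a quick orientation for the documentation patched above, here is a minimal
sketch (not part of the patch itself) of how the documented pieces fit
together: @scheme[define-tokens] and @scheme[define-empty-tokens] declare the
token groups, a @scheme[lexer] turns characters from an input port into those
tokens, and a @scheme[parser] pulls tokens through a zero-argument function.
The token names, the toy arithmetic grammar, and the @scheme[calc] wrapper are
invented for illustration.

#lang scheme
(require parser-tools/lex
         (prefix-in : parser-tools/lex-sre)
         parser-tools/yacc)

;; Token groups: NUM carries a value; the operators and EOF are empty tokens.
(define-tokens value-tokens (NUM))
(define-empty-tokens op-tokens (PLUS TIMES LPAREN RPAREN EOF))

;; A lexer built from the forms documented above; `lexeme' and
;; `input-port' are bound inside each action.
(define calc-lex
  (lexer
   [(:+ (char-range #\0 #\9)) (token-NUM (string->number lexeme))]
   [#\+ (token-PLUS)]
   [#\* (token-TIMES)]
   [#\( (token-LPAREN)]
   [#\) (token-RPAREN)]
   [whitespace (calc-lex input-port)]  ; skip whitespace by re-invoking the lexer
   [(eof) (token-EOF)]))

;; A parser over those token groups; `precs' gives TIMES higher
;; precedence than PLUS, so multiplication binds tighter.
(define calc-parse
  (parser
   (tokens value-tokens op-tokens)
   (start expr)
   (end EOF)
   (error (lambda (tok-ok? tok-name tok-value)
            (error 'calc "unexpected token: ~a" tok-name)))
   (precs (left PLUS)
          (left TIMES))
   (grammar
    (expr [(NUM) $1]
          [(expr PLUS expr) (+ $1 $3)]
          [(expr TIMES expr) (* $1 $3)]
          [(LPAREN expr RPAREN) $2]))))

;; The parser takes a thunk that produces the next token on each call.
(define (calc str)
  (let ([in (open-input-string str)])
    (calc-parse (lambda () (calc-lex in)))))

;; (calc "1+2*(3+4)")  ; => 15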