From db7fa0e156c6dabdf56ac18f7cd0e080745a9ce1 Mon Sep 17 00:00:00 2001 From: Jack Firth Date: Mon, 28 Mar 2022 13:41:22 -0700 Subject: [PATCH] Copy parser tools packages into this repo --- br-parser-tools-doc/LICENSE.txt | 11 + .../br-parser-tools/br-parser-tools.scrbl | 772 +++++++++++++++ br-parser-tools-doc/br-parser-tools/info.rkt | 3 + br-parser-tools-doc/info.rkt | 14 + br-parser-tools-lib/LICENSE.txt | 11 + .../br-parser-tools/cfg-parser.rkt | 876 ++++++++++++++++++ .../br-parser-tools/examples/calc.rkt | 92 ++ .../br-parser-tools/examples/read.rkt | 240 +++++ br-parser-tools-lib/br-parser-tools/info.rkt | 3 + .../br-parser-tools/lex-plt-v200.rkt | 23 + .../br-parser-tools/lex-sre.rkt | 103 ++ br-parser-tools-lib/br-parser-tools/lex.rkt | 370 ++++++++ .../br-parser-tools/private-lex/actions.rkt | 15 + .../br-parser-tools/private-lex/deriv.rkt | 333 +++++++ .../private-lex/error-tests.rkt | 81 ++ .../br-parser-tools/private-lex/front.rkt | 159 ++++ .../br-parser-tools/private-lex/re.rkt | 384 ++++++++ .../br-parser-tools/private-lex/stx.rkt | 183 ++++ .../private-lex/token-syntax.rkt | 7 + .../br-parser-tools/private-lex/token.rkt | 80 ++ .../private-lex/unicode-chars.rkt | 65 ++ .../br-parser-tools/private-lex/util.rkt | 127 +++ .../br-parser-tools/private-yacc/grammar.rkt | 250 +++++ .../br-parser-tools/private-yacc/graph.rkt | 53 ++ .../private-yacc/input-file-parser.rkt | 297 ++++++ .../br-parser-tools/private-yacc/lalr.rkt | 252 +++++ .../br-parser-tools/private-yacc/lr0.rkt | 314 +++++++ .../private-yacc/parser-actions.rkt | 54 ++ .../private-yacc/parser-builder.rkt | 103 ++ .../br-parser-tools/private-yacc/table.rkt | 264 ++++++ .../private-yacc/yacc-helper.rkt | 71 ++ .../br-parser-tools/yacc-to-scheme.rkt | 130 +++ br-parser-tools-lib/br-parser-tools/yacc.rkt | 334 +++++++ br-parser-tools-lib/info.rkt | 9 + br-parser-tools/LICENSE.txt | 11 + br-parser-tools/info.rkt | 12 + 36 files changed, 6106 insertions(+) create mode 100644 br-parser-tools-doc/LICENSE.txt create mode 100644 br-parser-tools-doc/br-parser-tools/br-parser-tools.scrbl create mode 100644 br-parser-tools-doc/br-parser-tools/info.rkt create mode 100644 br-parser-tools-doc/info.rkt create mode 100644 br-parser-tools-lib/LICENSE.txt create mode 100755 br-parser-tools-lib/br-parser-tools/cfg-parser.rkt create mode 100644 br-parser-tools-lib/br-parser-tools/examples/calc.rkt create mode 100644 br-parser-tools-lib/br-parser-tools/examples/read.rkt create mode 100644 br-parser-tools-lib/br-parser-tools/info.rkt create mode 100644 br-parser-tools-lib/br-parser-tools/lex-plt-v200.rkt create mode 100644 br-parser-tools-lib/br-parser-tools/lex-sre.rkt create mode 100644 br-parser-tools-lib/br-parser-tools/lex.rkt create mode 100644 br-parser-tools-lib/br-parser-tools/private-lex/actions.rkt create mode 100644 br-parser-tools-lib/br-parser-tools/private-lex/deriv.rkt create mode 100644 br-parser-tools-lib/br-parser-tools/private-lex/error-tests.rkt create mode 100644 br-parser-tools-lib/br-parser-tools/private-lex/front.rkt create mode 100644 br-parser-tools-lib/br-parser-tools/private-lex/re.rkt create mode 100644 br-parser-tools-lib/br-parser-tools/private-lex/stx.rkt create mode 100644 br-parser-tools-lib/br-parser-tools/private-lex/token-syntax.rkt create mode 100644 br-parser-tools-lib/br-parser-tools/private-lex/token.rkt create mode 100644 br-parser-tools-lib/br-parser-tools/private-lex/unicode-chars.rkt create mode 100644 br-parser-tools-lib/br-parser-tools/private-lex/util.rkt create mode 100644 br-parser-tools-lib/br-parser-tools/private-yacc/grammar.rkt create mode 100644 br-parser-tools-lib/br-parser-tools/private-yacc/graph.rkt create mode 100644 br-parser-tools-lib/br-parser-tools/private-yacc/input-file-parser.rkt create mode 100644 br-parser-tools-lib/br-parser-tools/private-yacc/lalr.rkt create mode 100644 br-parser-tools-lib/br-parser-tools/private-yacc/lr0.rkt create mode 100644 br-parser-tools-lib/br-parser-tools/private-yacc/parser-actions.rkt create mode 100644 br-parser-tools-lib/br-parser-tools/private-yacc/parser-builder.rkt create mode 100644 br-parser-tools-lib/br-parser-tools/private-yacc/table.rkt create mode 100644 br-parser-tools-lib/br-parser-tools/private-yacc/yacc-helper.rkt create mode 100644 br-parser-tools-lib/br-parser-tools/yacc-to-scheme.rkt create mode 100644 br-parser-tools-lib/br-parser-tools/yacc.rkt create mode 100644 br-parser-tools-lib/info.rkt create mode 100644 br-parser-tools/LICENSE.txt create mode 100644 br-parser-tools/info.rkt diff --git a/br-parser-tools-doc/LICENSE.txt b/br-parser-tools-doc/LICENSE.txt new file mode 100644 index 0000000..c424668 --- /dev/null +++ b/br-parser-tools-doc/LICENSE.txt @@ -0,0 +1,11 @@ +parser-tools-doc +Copyright (c) 2010-2014 PLT Design Inc. + +This package is distributed under the GNU Lesser General Public +License (LGPL). This means that you can link this package into proprietary +applications, provided you follow the rules stated in the LGPL. You +can also modify this package; if you distribute a modified version, +you must distribute it under the terms of the LGPL, which in +particular means that you must release the source code for the +modified software. See http://www.gnu.org/copyleft/lesser.html +for more information. diff --git a/br-parser-tools-doc/br-parser-tools/br-parser-tools.scrbl b/br-parser-tools-doc/br-parser-tools/br-parser-tools.scrbl new file mode 100644 index 0000000..1a23b5e --- /dev/null +++ b/br-parser-tools-doc/br-parser-tools/br-parser-tools.scrbl @@ -0,0 +1,772 @@ +#lang scribble/doc +@(require scribble/manual scribble/struct scribble/xref scribble/bnf + (for-label scheme/base + scheme/contract + br-parser-tools/lex + (prefix-in : br-parser-tools/lex-sre) + br-parser-tools/yacc + br-parser-tools/cfg-parser)) + +@title{Parser Tools: @exec{lex} and @exec{yacc}-style Parsing (BR edition)} + +@author["Scott Owens (99%)" "Matthew Butterick (1%)"] + +This documentation assumes familiarity with @exec{lex}- and @exec{yacc}-style lexer and parser generators. + +@margin-note{This is a fork of the @link["https://docs.racket-lang.org/parser-tools"]{@racket[parser-tools]} package. It has a variety of small improvements and bugfixes designed to support the @link["https://docs.racket-lang.org/brag"]{@racket[brag]} parser language, in particular the @racket[srcloc] structure type (e.g., @racket[lexer-srcloc]). But the core lexing and parsing engines are identical.} + + +@table-of-contents[] + +@; ---------------------------------------------------------------------- + +@section{Lexers} + +@section-index["lex"] +@section-index["scanning"] +@section-index["scanner"] + +@defmodule[br-parser-tools/lex] + +@; ---------------------------------------- + +@subsection{Creating a Lexer} + +@defform/subs[#:literals (repetition union intersection complement concatenation + char-range char-complement + eof special special-comment) + (lexer [trigger action-expr] ...) + ([trigger re + (eof) + (special) + (special-comment)] + [re id + string + character + (repetition lo hi re) + (union re ...) + (intersection re ...) + (complement re) + (concatenation re ...) + (char-range char char) + (char-complement re) + (id datum ...)])]{ + + Produces a function that takes an input-port, matches the + @racket[re] patterns against the buffer, and returns the result of + executing the corresponding @racket[action-expr]. When multiple + patterns match, a lexer will choose the longest match, breaking + ties in favor of the rule appearing first. + + @margin-note{The implementation of @racketmodname[syntax-color/racket-lexer] + contains a lexer for the @racketmodname[racket] language. + In addition, files in the @filepath{examples} sub-directory + of the @filepath{br-parser-tools} collection contain + simpler example lexers.} + + An @racket[re] is matched as follows: + + @itemize[ + @item{@racket[id] --- expands to the named @deftech{lexer abbreviation}; + abbreviations are defined via @racket[define-lex-abbrev] or supplied by modules + like @racketmodname[br-parser-tools/lex-sre].} + @item{@racket[string] --- matches the sequence of characters in @racket[string].} + @item{@racket[character] --- matches a literal @racket[character].} + @item{@racket[(repetition lo hi re)] --- matches @racket[re] repeated between @racket[lo] + and @racket[hi] times, inclusive; @racket[hi] can be @racket[+inf.0] for unbounded repetitions.} + @item{@racket[(union re ...)] --- matches if any of the sub-expressions match} + @item{@racket[(intersection re ...)] --- matches if all of the @racket[re]s match.} + @item{@racket[(complement re)] --- matches anything that @racket[re] does not.} + @item{@racket[(concatenation re ...)] --- matches each @racket[re] in succession.} + @item{@racket[(char-range char char)] --- matches any character between the two (inclusive); + a single character string can be used as a @racket[char].} + @item{@racket[(char-complement re)] --- matches any character not matched by @racket[re]. + The sub-expression must be a set of characters @racket[re].} + @item{@racket[(id datum ...)] --- expands the @deftech{lexer macro} named @racket[id]; macros + are defined via @racket[define-lex-trans].} + ] + +Note that both @racket[(concatenation)] and @racket[""] match the +empty string, @racket[(union)] matches nothing, +@racket[(intersection)] matches any string, and +@racket[(char-complement (union))] matches any single character. + +The regular expression language is not designed to be used directly, +but rather as a basis for a user-friendly notation written with +regular expression macros. For example, +@racketmodname[br-parser-tools/lex-sre] supplies operators from Olin +Shivers's SREs, and @racketmodname[br-parser-tools/lex-plt-v200] supplies +(deprecated) operators from the previous version of this library. +Since those libraries provide operators whose names match other Racket +bindings, such as @racket[*] and @racket[+], they normally must be +imported using a prefix: + +@racketblock[ +(require (prefix-in : br-parser-tools/lex-sre)) +] + +The suggested prefix is @racket[:], so that @racket[:*] and +@racket[:+] are imported. Of course, a prefix other than @racket[:] +(such as @racket[re-]) will work too. + +Since negation is not a common operator on regular expressions, here +are a few examples, using @racket[:] prefixed SRE syntax: + +@itemize[ + +@item{@racketblock0[(complement "1")] + + Matches all strings except the string @racket["1"], including + @racket["11"], @racket["111"], @racket["0"], @racket["01"], + @racket[""], and so on.} + +@item{@racketblock0[(complement (:* "1"))] + + Matches all strings that are not sequences of @racket["1"], + including @racket["0"], @racket["00"], @racket["11110"], + @racket["0111"], @racket["11001010"] and so on.} + +@item{@racketblock0[(:& (:: any-string "111" any-string) + (complement (:or (:: any-string "01") (:+ "1"))))] + + Matches all strings that have 3 consecutive ones, but not those that + end in @racket["01"] and not those that are ones only. These + include @racket["1110"], @racket["0001000111"] and @racket["0111"] + but not @racket[""], @racket["11"], @racket["11101"], @racket["111"] + and @racket["11111"].} + +@item{@racketblock0[(:: "/*" (complement (:: any-string "*/" any-string)) "*/")] + + Matches Java/C block comments. @racket["/**/"], + @racket["/******/"], @racket["/*////*/"], @racket["/*asg4*/"] and so + on. It does not match @racket["/**/*/"], @racket["/* */ */"] and so + on. @racket[(:: any-string "*/" any-string)] matches any string + that has a @racket["*/"] in is, so @racket[(complement (:: any-string "*/" + any-string))] matches any string without a @racket["*/"] in it.} + +@item{@racketblock0[(:: "/*" (:* (complement "*/")) "*/")] + + Matches any string that starts with @racket["/*"] and ends with + @racket["*/"], including @racket["/* */ */ */"]. + @racket[(complement "*/")] matches any string except @racket["*/"]. + This includes @racket["*"] and @racket["/"] separately. Thus + @racket[(:* (complement "*/"))] matches @racket["*/"] by first + matching @racket["*"] and then matching @racket["/"]. Any other + string is matched directly by @racket[(complement "*/")]. In other + words, @racket[(:* (complement "xx"))] = @racket[any-string]. It is + usually not correct to place a @racket[:*] around a + @racket[complement].} +] + + + The following binding have special meaning inside of a lexer + action: + + @itemize[ + @item{@racket[start-pos] --- a @racket[position] struct for the first character matched.} + @item{@racket[end-pos] --- a @racket[position] struct for the character after the last character in the match.} + @item{@racket[lexeme] --- the matched string.} + @item{@racket[input-port] --- the input-port being + processed (this is useful for matching input with multiple + lexers).} + @item{@racket[(return-without-pos x)] and @racket[(return-without-srcloc x)] are functions (continuations) that + immediately return the value of @racket[x] from the lexer. This useful + in a src-pos or src-loc lexer to prevent the lexer from adding source + information. For example: + + @racketblock[ + (define get-token + (lexer-srcloc + ... + ((comment) (get-token input-port)) + ...)) + ] + + would wrap the source location information for the comment around + the value of the recursive call. Using + @racket[((comment) (return-without-srcloc (get-token input-port)))] + will cause the value of the recursive call to be returned without + wrapping position around it.} + ] + + The lexer raises an @racket[exn:fail:read] exception if none of the + regular expressions match the input. Hint: If @racket[(any-char + _custom-error-behavior)] is the last rule, then there will always + be a match, and @racket[_custom-error-behavior] is executed to + handle the error situation as desired, only consuming the first + character from the input buffer. + + In addition to returning characters, input + ports can return @racket[eof-object]s. Custom input ports can + also return a @racket[special-comment] value to indicate a + non-textual comment, or return another arbitrary value (a + special). The non-@racket[re] @racket[trigger] forms handle these + cases: + + @itemize[ + + @item{The @racket[(eof)] rule is matched when the input port + returns an @racket[eof-object] value. If no @racket[(eof)] + rule is present, the lexer returns the symbol @racket['eof] + when the port returns an @racket[eof-object] value.} + + @item{The @racket[(special-comment)] rule is matched when the + input port returns a @racket[special-comment] structure. If no + @racket[special-comment] rule is present, the lexer + automatically tries to return the next token from the input + port.} + + @item{The @racket[(special)] rule is matched when the input + port returns a value other than a character, + @racket[eof-object], or @racket[special-comment] structure. If + no @racket[(special)] rule is present, the lexer returns + @racket[(void)].}] + + End-of-files, specials, special-comments and special-errors cannot + be parsed via a rule using an ordinary regular expression + (but dropping down and manipulating the port to handle them + is possible in some situations). + + Since the lexer gets its source information from the port, use + @racket[port-count-lines!] to enable the tracking of line and + column information. Otherwise, the line and column information + will return @racket[#f]. + + When peeking from the input port raises an exception (such as by + an embedded XML editor with malformed syntax), the exception can + be raised before all tokens preceding the exception have been + returned. + + Each time the racket code for a lexer is compiled (e.g. when a + @filepath{.rkt} file containing a @racket[lexer] form is loaded), + the lexer generator is run. To avoid this overhead place the + lexer into a module and compile the module to a @filepath{.zo} + bytecode file.} + +@defform[(lexer-src-pos (trigger action-expr) ...)]{ + +Like @racket[lexer], but for each @racket[_action-result] produced by +an @racket[action-expr], returns @racket[(make-position-token +_action-result start-pos end-pos)] instead of simply +@racket[_action-result].} + +@defform[(lexer-srcloc (trigger action-expr) ...)]{ + +Like @racket[lexer], but for each @racket[_action-result] produced by +an @racket[action-expr], returns @racket[(make-srcloc-token +_action-result lexeme-srcloc)] instead of simply +@racket[_action-result].} + +@deftogether[( +@defidform[start-pos] +@defidform[end-pos] +@defidform[lexeme] +@defidform[lexeme-srcloc] +@defidform[input-port] +@defidform[return-without-pos] +@defidform[return-without-srcloc] +)]{ + +Use of these names outside of a @racket[lexer] action is a syntax +error.} + +@defstruct[position ([offset exact-positive-integer?] + [line exact-positive-integer?] + [col exact-nonnegative-integer?])]{ + + Instances of @racket[position] are bound to @racket[start-pos] and + @racket[end-pos]. The @racket[offset] field contains the offset of + the character in the input. The @racket[line] field contains the + line number of the character. The @racket[col] field contains the + offset in the current line.} + +@defstruct[position-token ([token any/c] + [start-pos position?] + [end-pos position?])]{ + + Lexers created with @racket[lexer-src-pos] return instances of @racket[position-token].} + + +@defstruct[srcloc-token ([token any/c] + [srcloc srcloc?])]{ + + Lexers created with @racket[lexer-srcloc] return instances of @racket[srcloc-token].} + +@defparam[file-path source any/c]{ + + A parameter that the lexer uses as the source location if it + raises a @racket[exn:fail:read] error. Setting this parameter allows + DrRacket, for example, to open the file containing the error.} + +@defparam[lexer-file-path source any/c]{ + +Alias for @racket[file-path].} + + +@; ---------------------------------------- + +@subsection{Lexer Abbreviations and Macros} + +@defform[(char-set string)]{ + +A @tech{lexer macro} that matches any character in @racket[string].} + +@defidform[any-char]{A @tech{lexer abbreviation} that matches any character.} + +@defidform[any-string]{A @tech{lexer abbreviation} that matches any string.} + +@defidform[nothing]{A @tech{lexer abbreviation} that matches no string.} + +@deftogether[( +@defidform[alphabetic] +@defidform[lower-case] +@defidform[upper-case] +@defidform[title-case] +@defidform[numeric] +@defidform[symbolic] +@defidform[punctuation] +@defidform[graphic] +@defidform[whitespace] +@defidform[blank] +@defidform[iso-control] +)]{ + +@tech{Lexer abbreviations} that match @racket[char-alphabetic?] +characters, @racket[char-lower-case?] characters, etc.} + +@defform[(define-lex-abbrev id re)]{ + + Defines a @tech{lexer abbreviation} by associating a regular + expression to be used in place of the @racket[id] in other + regular expression. The definition of name has the same scoping + properties as a other syntactic binding (e.g., it can be exported + from a module).} + +@defform[(define-lex-abbrevs (id re) ...)]{ + + Like @racket[define-lex-abbrev], but defines several @tech{lexer + abbreviations}.} + + +@defform[(define-lex-trans id trans-expr)]{ + + Defines a @tech{lexer macro}, where @racket[trans-expr] produces a + transformer procedure that takes one argument. When @racket[(id + _datum ...)] appears as a regular expression, it is replaced with + the result of applying the transformer to the expression.} + + +@; ---------------------------------------- + +@subsection{Lexer SRE Operators} + +@defmodule[br-parser-tools/lex-sre] + +@; Put the docs in a macro, so that we can bound the scope of +@; the import of `*', etc.: +@(define-syntax-rule (lex-sre-doc) + (... + (begin + (require (for-label br-parser-tools/lex-sre)) + +@defform[(* re ...)]{ + +0 or more occurrences of any @racket[re] pattern.} + +@defform[(+ re ...)]{ + +1 or more occurrences of any @racket[re] pattern.} + +@defform[(? re ...)]{ + +0 or 1 occurrence of any @racket[re] pattern.} + +@defform[(= n re ...)]{ + +Exactly @racket[n] occurrences of any @racket[re] pattern, where +@racket[n] must be a literal exact, non-negative number.} + +@defform[(>= n re ...)]{ + +At least @racket[n] occurrences of any @racket[re] pattern, where +@racket[n] must be a literal exact, non-negative number.} + +@defform[(** n m re ...)]{ + +Between @racket[n] and @racket[m] (inclusive) occurrences of +any @racket[re] pattern, where @racket[n] must be a literal exact, +non-negative number, and @racket[m] must be literally either +@racket[#f], @racket[+inf.0], or an exact, non-negative number; a +@racket[#f] value for @racket[m] is the same as @racket[+inf.0].} + +@defform[(or re ...)]{ + +Same as @racket[(union re ...)].} + +@deftogether[( +@defform[(: re ...)] +@defform[(seq re ...)] +)]{ + +Both forms concatenate the @racket[re]s into a single, indivisible pattern. +In other words, this matches @emph{all} the @racket[re]s in order, whereas @racket[(union re ...)] matches @emph{any} of the @racket[re]s.} + +@defform[(& re ...)]{ + +Intersects the @racket[re]s.} + +@defform[(- re ...)]{ + +The set difference of the @racket[re]s.} + +@defform[(~ re ...)]{ + +Character-set complement, which each @racket[re] must match exactly +one character.} + +@defform[(/ char-or-string ...)]{ + +Character ranges, matching characters between successive pairs of +characters.} + +))) + +@(lex-sre-doc) + +@; ---------------------------------------- + +@subsection{Lexer Legacy Operators} + +@defmodule[br-parser-tools/lex-plt-v200] + +@(define-syntax-rule (lex-v200-doc) + (... + (begin + (require (for-label br-parser-tools/lex-plt-v200)) + +@t{The @racketmodname[br-parser-tools/lex-plt-v200] module re-exports + @racket[*], @racket[+], @racket[?], and @racket[&] from + @racketmodname[br-parser-tools/lex-sre]. It also re-exports + @racket[:or] as @racket[:], @racket[::] as @racket[|@|], @racket[:~] + as @racket[^], and @racket[:/] as @racket[-].} + +@defform[(epsilon)]{ + +A @tech{lexer macro} that matches an empty sequence.} + +@defform[(~ re ...)]{ + +The same as @racket[(complement re ...)].}))) + +@(lex-v200-doc) + +@; ---------------------------------------- + +@subsection{Tokens} + +Each @racket[_action-expr] in a @racket[lexer] form can produce any +kind of value, but for many purposes, producing a @deftech{token} +value is useful. Tokens are usually necessary for inter-operating with +a parser generated by @racket[br-parser-tools/parser], but tokens may not +be the right choice when using @racket[lexer] in other situations. + +@defform[(define-tokens group-id (token-id ...))]{ + + Binds @racket[group-id] to the group of tokens being defined. For + each @racket[token-id], a function + @racketidfont{token-}@racket[token-id] is created that takes any + value and puts it in a token record specific to @racket[token-id]. + The token value is inspected using @racket[token-id] and + @racket[token-value]. + + A token cannot be named @racketidfont{error}, since + @racketidfont{error} it has special use in the parser.} + +@defform[(define-empty-tokens group-id (token-id ...) )]{ + + + Like @racket[define-tokens], except a each token constructor + @racketidfont{token-}@racket[token-id] takes no arguments and returns + @racket[(@#,racket[quote] token-id)].} + + +@defproc[(token-name [t (or/c token? symbol?)]) symbol?]{ + + Returns the name of a token that is represented either by a symbol + or a token structure.} + + +@defproc[(token-value [t (or/c token? symbol?)]) any/c]{ + + Returns the value of a token that is represented either by a symbol + or a token structure, returning @racket[#f] for a symbol token.} + + +@defproc[(token? [v any/c]) boolean?]{ + + Returns @racket[#t] if @racket[val] is a + token structure, @racket[#f] otherwise.} + +@; ---------------------------------------------------------------------- + +@section{LALR(1) Parsers} + +@section-index["yacc"] + +@defmodule[br-parser-tools/yacc] + +@defform/subs[#:literals (grammar tokens start end precs src-pos + suppress debug yacc-output prec) + (parser clause ...) + ([clause (grammar (non-terminal-id + ((grammar-id ...) maybe-prec expr) + ...) + ...) + (tokens group-id ...) + (start non-terminal-id ...) + (end token-id ...) + (@#,racketidfont{error} expr) + (precs (assoc token-id ...) ...) + (src-pos) + (suppress) + (debug filename) + (yacc-output filename)] + [maybe-prec code:blank + (prec token-id)] + [assoc left right nonassoc])]{ + + Creates a parser. The clauses may be in any order, as long as there + are no duplicates and all non-@italic{OPTIONAL} declarations are + present: + + @itemize[ + + @item{@racketblock0[(grammar (non-terminal-id + ((grammar-id ...) maybe-prec expr) + ...) + ...)] + + Declares the grammar to be parsed. Each @racket[grammar-id] can + be a @racket[token-id] from a @racket[group-id] named in a + @racket[tokens] declaration, or it can be a + @racket[non-terminal-id] declared in the @racket[grammar] + declaration. The optional @racket[prec] declaration works with + the @racket[precs] declaration. The @racket[expr] is a + ``semantic action,'' which is evaluated when the input is found + to match its corresponding production. + + Each action is Racket code that has the same scope as its + parser's definition, except that the variables @racket[$1], ..., + @racketidfont{$}@math{i} are bound, where @math{i} is the number + of @racket[grammar-id]s in the corresponding production. Each + @racketidfont{$}@math{k} is bound to the result of the action + for the @math{k}@superscript{th} grammar symbol on the right of + the production, if that grammar symbol is a non-terminal, or the + value stored in the token if the grammar symbol is a terminal. + If the @racket[src-pos] option is present in the parser, then + variables @racket[$1-start-pos], ..., + @racketidfont{$}@math{i}@racketidfont{-start-pos} and + @racket[$1-end-pos], ..., + @racketidfont{$}@math{i}@racketidfont{-end-pos} and are also + available, and they refer to the position structures + corresponding to the start and end of the corresponding + @racket[grammar-symbol]. Grammar symbols defined as empty-tokens + have no @racketidfont{$}@math{k} associated, but do have + @racketidfont{$}@math{k}@racketidfont{-start-pos} and + @racketidfont{$}@math{k}@racketidfont{-end-pos}. + Also @racketidfont{$n-start-pos} and @racketidfont{$n-end-pos} + are bound to the largest start and end positions, (i.e., + @racketidfont{$}@math{i}@racketidfont{-start-pos} and + @racketidfont{$}@math{i}@racketidfont{-end-pos}). + + An @deftech{error production} can be defined by providing + a production of the form @racket[(error α)], where α is a + string of grammar symbols, possibly empty. + + All of the productions for a given non-terminal must be grouped + with it. That is, no @racket[non-terminal-id] may appear twice + on the left hand side in a parser.} + + + @item{@racket[(tokens group-id ...)] + + Declares that all of the tokens defined in each + @racket[group-id]---as bound by @racket[define-tokens] or + @racket[define-empty-tokens]---can be used by the parser in the + @racket[grammar] declaration.} + + + @item{@racket[(start non-terminal-id ...)] + + Declares a list of starting non-terminals for the grammar.} + + + @item{@racket[(end token-id ...)] + + Specifies a set of tokens from which some member must follow any + valid parse. For example, an EOF token would be specified for a + parser that parses entire files and a newline token for a parser + that parses entire lines individually.} + + + @item{@racket[(@#,racketidfont{error} expr)] + + The @racket[expr] should evaluate to a function which will be + executed for its side-effect whenever the parser encounters an + error. + + If the @racket[src-pos] declaration is present, the function + should accept 5 arguments,: + + @racketblock[(lambda (tok-ok? tok-name tok-value _start-pos _end-pos) + ....)] + + Otherwise it should accept 3: + + @racketblock[(lambda (tok-ok? tok-name tok-value) + ....)] + + The first argument will be @racket[#f] if and only if the error + is that an invalid token was received. The second and third + arguments will be the name and the value of the token at which + the error was detected. The fourth and fifth arguments, if + present, provide the source positions of that token.} + + + @item{@racket[(precs (assoc token-id ...) ...)] + @italic{OPTIONAL} + + Precedence declarations to resolve shift/reduce and + reduce/reduce conflicts as in @exec{yacc}/@exec{bison}. An + @racket[assoc] must be one of @racket[left], @racket[right] or + @racket[nonassoc]. States with multiple shift/reduce or + reduce/reduce conflicts (or some combination thereof) are not + resolved with precedence.} + + @item{@racket[(src-pos)] @italic{OPTIONAL} + + Causes the generated parser to expect input in the form + @racket[(make-position-token _token _start-pos _end-pos)] instead + of simply @racket[_token]. Include this option when using the + parser with a lexer generated with @racket[lexer-src-pos].} + + + @item{@racket[(debug filename)] @italic{OPTIONAL} + + Causes the parser generator to write the LALR table to the file + named @racket[filename] (unless the file exists), where + @racket[filename] is a literal string. Additionally, if a debug + file is specified, when a running generated parser encounters a + parse error on some input file, after the user specified error + expression returns, the complete parse stack is printed to + assist in debugging the grammar of that particular parser. The + numbers in the stack printout correspond to the state numbers in + the LALR table file.} + + + @item{@racket[(yacc-output filename)] @italic{OPTIONAL} + + Causes the parser generator to write a grammar file in + approximately the syntax of @exec{yacc}/@exec{bison}. The file + might not be a valid @exec{yacc} file, because the Racket + grammar can use symbols that are invalid in C.} + + + @item{@racket[(suppress)] @italic{OPTIONAL} + + Causes the parser generator not to report shift/reduce or + reduce/reduce conflicts.} + + ] + + The result of a @racket[parser] expression with one @racket[start] + non-terminal is a function, @racket[_parse], that takes one + argument. This argument must be a zero argument function, + @racket[_gen], that produces successive tokens of the input each + time it is called. If desired, the @racket[_gen] may return + symbols instead of tokens, and the parser will treat symbols as + tokens of the corresponding name (with @racket[#f] as a value, so + it is usual to return symbols only in the case of empty tokens). + The @racket[_parse] function returns the value associated with the + parse tree by the semantic actions. If the parser encounters an + error, after invoking the supplied error function, it will try to + use @tech{error production}s to continue parsing. If it cannot, it + raises @racket[exn:fail:read]. + + If multiple non-terminals are provided in @racket[start], the + @racket[parser] expression produces a list of parsing functions, + one for each non-terminal in the same order. Each parsing function + is like the result of a parser expression with only one + @racket[start] non-terminal, + + Each time the Racket code for a @racket[parser] is compiled + (e.g. when a @filepath{.rkt} file containing a @racket[parser] form + is loaded), the parser generator is run. To avoid this overhead + place the parser into a module and compile the module to a + @filepath{.zo} bytecode file.} + + + + +@section{Context-Free Parsers} + +@section-index["cfg-parser"] + +@defmodule[br-parser-tools/cfg-parser]{The @racketmodname[br-parser-tools/cfg-parser] +library provides a parser generator that is an alternative to that of +@racketmodname[br-parser-tools/yacc].} + +@defform/subs[#:literals (grammar tokens start end precs src-pos + suppress debug yacc-output prec) + (cfg-parser clause ...) + ([clause (grammar (non-terminal-id + ((grammar-id ...) maybe-prec expr) + ...) + ...) + (tokens group-id ...) + (start non-terminal-id ...) + (end token-id ...) + (@#,racketidfont{error} expr) + (src-pos)])]{ + + Creates a parser similar to that of @racket[parser]. Unlike @racket[parser], + @racket[cfg-parser], can consume arbitrary and potentially ambiguous context-free + grammars. Its interface is a subset of @racketmodname[br-parser-tools/yacc], with + the following differences: + + @itemize[ + + @item{@racket[(start non-terminal-id)] + + Unlike @racket[parser], @racket[cfg-parser] only allows for + a single non-terminal-id.} + + @item{The @racket[cfg-parser] form does not support the @racket[precs], + @racket[suppress], @racket[debug], or @racket[yacc-output] + options of @racket[parser].} + ] +} + +@; ---------------------------------------------------------------------- + +@section{Converting @exec{yacc} or @exec{bison} Grammars} + +@defmodule[br-parser-tools/yacc-to-scheme] + +@defproc[(trans [file path-string?]) any/c]{ + +Reads a C @exec{yacc}/@exec{bison} grammar from @racket[file] and +produces an s-expression that represents a Racket parser for use with +@racket[parser]. + +This function is intended to assist in the manual conversion of +grammars for use with @racket[parser], and not as a fully automatic +conversion tool. It is not entirely robust. For example, if the C +actions in the original grammar have nested blocks, the tool will fail. + +Annotated examples are in the @filepath{examples} subdirectory of the +@filepath{br-parser-tools} collection.} + +@; ---------------------------------------------------------------------- + +@index-section[] diff --git a/br-parser-tools-doc/br-parser-tools/info.rkt b/br-parser-tools-doc/br-parser-tools/info.rkt new file mode 100644 index 0000000..f219d03 --- /dev/null +++ b/br-parser-tools-doc/br-parser-tools/info.rkt @@ -0,0 +1,3 @@ +#lang info + +(define scribblings '(("br-parser-tools.scrbl" (multi-page) (parsing-library)))) diff --git a/br-parser-tools-doc/info.rkt b/br-parser-tools-doc/info.rkt new file mode 100644 index 0000000..8760588 --- /dev/null +++ b/br-parser-tools-doc/info.rkt @@ -0,0 +1,14 @@ +#lang info + +(define collection 'multi) +(define deps '("base")) +(define build-deps '("scheme-lib" + "racket-doc" + "syntax-color-doc" + "br-parser-tools-lib" + "scribble-lib")) +(define update-implies '("br-parser-tools-lib")) + +(define pkg-desc "documentation part of \"br-parser-tools\"") + +(define pkg-authors '(mflatt)) diff --git a/br-parser-tools-lib/LICENSE.txt b/br-parser-tools-lib/LICENSE.txt new file mode 100644 index 0000000..f6889f1 --- /dev/null +++ b/br-parser-tools-lib/LICENSE.txt @@ -0,0 +1,11 @@ +parser-tools-lib +Copyright (c) 2010-2014 PLT Design Inc. + +This package is distributed under the GNU Lesser General Public +License (LGPL). This means that you can link this package into proprietary +applications, provided you follow the rules stated in the LGPL. You +can also modify this package; if you distribute a modified version, +you must distribute it under the terms of the LGPL, which in +particular means that you must release the source code for the +modified software. See http://www.gnu.org/copyleft/lesser.html +for more information. diff --git a/br-parser-tools-lib/br-parser-tools/cfg-parser.rkt b/br-parser-tools-lib/br-parser-tools/cfg-parser.rkt new file mode 100755 index 0000000..250b019 --- /dev/null +++ b/br-parser-tools-lib/br-parser-tools/cfg-parser.rkt @@ -0,0 +1,876 @@ +#lang racket/base +;; This module implements a parser form like the br-parser-tools's +;; `parser', except that it works on an arbitrary CFG (returning +;; the first sucecssful parse). + +;; I'm pretty sure that this is an implementation of Earley's +;; algorithm. + +;; To a first approximation, it's a backtracking parser. Alternative +;; for a non-terminal are computed in parallel, and multiple attempts +;; to compute the same result block until the first one completes. If +;; you get into deadlock, such as when trying to match +;; := +;; then it means that there's no successful parse, so everything +;; that's blocked fails. + +;; A cache holds the series of results for a particular non-terminal +;; at a particular starting location. (A series is used, instead of a +;; sinlge result, for backtracking.) Otherwise, the parser uses +;; backtracking search. Backtracking is implemented through explicit +;; success and failure continuations. Multiple results for a +;; particular nonterminal and location are kept only when they have +;; different lengths. (Otherwise, in the spirit of finding one +;; successful parse, only the first result is kept.) + +;; The br-parser-tools's `parse' is used to transform tokens in the +;; grammar to tokens specific to this parser. In other words, this +;; parser uses `parser' so that it doesn't have to know anything about +;; tokens. +;; + + + +(require br-parser-tools/yacc + br-parser-tools/lex) + +(require (for-syntax racket/base + syntax/boundmap + br-parser-tools/private-lex/token-syntax)) + +(provide cfg-parser) + +;; A raw token, wrapped so that we can recognize it: +(define-struct tok (name orig-name val start end)) + +;; Represents the thread scheduler: +(define-struct tasks (active active-back waits multi-waits cache progress?)) + +(define-for-syntax make-token-identifier-mapping make-hasheq) +(define-for-syntax (token-identifier-mapping-get t tok [fail #f]) + (if fail + (hash-ref t (syntax-e tok) fail) + (hash-ref t (syntax-e tok)))) + +(define-for-syntax (token-identifier-mapping-put! t tok v) + (hash-set! t (syntax-e tok) v)) + +(define-for-syntax (token-identifier-mapping-map t f) + (hash-map t f)) + +;; Used to calculate information on the grammar, such as whether +;; a particular non-terminal is "simple" instead of recursively defined. +(define-for-syntax (nt-fixpoint nts proc nt-ids patss) + (define (ormap-all val f as bs) + (cond + [(null? as) val] + [else (ormap-all (or (f (car as) (car bs)) val) + f + (cdr as) (cdr bs))])) + (let loop () + (when (ormap-all #f + (λ (nt pats) + (let ([old (bound-identifier-mapping-get nts nt)]) + (let ([new (proc nt pats old)]) + (if (equal? old new) + #f + (begin + (bound-identifier-mapping-put! nts nt new) + #t))))) + nt-ids patss) + (loop)))) + +;; Tries parse-a followed by parse-b. If parse-a is not simple, +;; then after parse-a succeeds once, we parallelize parse-b +;; and trying a second result for parse-a. +(define (parse-and simple-a? parse-a parse-b + stream last-consumed-token depth end success-k fail-k + max-depth tasks) + (define ((mk-got-k success-k fail-k) val stream last-consumed-token depth max-depth tasks next1-k) + (if simple-a? + (parse-b val stream last-consumed-token depth end + (mk-got2-k success-k fail-k next1-k) + (mk-fail2-k success-k fail-k next1-k) + max-depth tasks) + (parallel-or + (λ (success-k fail-k max-depth tasks) + (parse-b val stream last-consumed-token depth end + success-k fail-k + max-depth tasks)) + (λ (success-k fail-k max-depth tasks) + (next1-k (mk-got-k success-k fail-k) + fail-k max-depth tasks)) + success-k fail-k max-depth tasks))) + + (define ((mk-got2-k success-k fail-k next1-k) val stream last-consumed-token depth max-depth tasks next-k) + (success-k val stream last-consumed-token depth max-depth tasks + (λ (success-k fail-k max-depth tasks) + (next-k (mk-got2-k success-k fail-k next1-k) + (mk-fail2-k success-k fail-k next1-k) + max-depth tasks)))) + + (define ((mk-fail2-k success-k fail-k next1-k) max-depth tasks) + (next1-k (mk-got-k success-k fail-k) fail-k max-depth tasks)) + + (parse-a stream last-consumed-token depth end + (mk-got-k success-k fail-k) + fail-k + max-depth tasks)) + +;; Parallel or for non-terminal alternatives +(define (parse-parallel-or parse-a parse-b stream last-consumed-token depth end success-k fail-k max-depth tasks) + (parallel-or (λ (success-k fail-k max-depth tasks) + (parse-a stream last-consumed-token depth end success-k fail-k max-depth tasks)) + (λ (success-k fail-k max-depth tasks) + (parse-b stream last-consumed-token depth end success-k fail-k max-depth tasks)) + success-k fail-k max-depth tasks)) + +;; Generic parallel-or +(define (parallel-or parse-a parse-b success-k fail-k max-depth tasks) + (define answer-key (gensym)) + (define (gota-k val stream last-consumed-token depth max-depth tasks next-k) + (report-answer answer-key + max-depth + tasks + (list val stream last-consumed-token depth next-k))) + (define (faila-k max-depth tasks) + (report-answer answer-key + max-depth + tasks + null)) + (let* ([tasks (queue-task tasks (λ (max-depth tasks) + (parse-a gota-k faila-k max-depth tasks)))] + [tasks (queue-task tasks (λ (max-depth tasks) + (parse-b gota-k faila-k max-depth tasks)))] + [queue-next (λ (next-k tasks) + (queue-task tasks (λ (max-depth tasks) + (next-k gota-k faila-k max-depth tasks))))]) + (define ((mk-got-one immediate-next? get-nth success-k) val stream last-consumed-token depth max-depth tasks next-k) + (let ([tasks (if immediate-next? + (queue-next next-k tasks) + tasks)]) + (success-k val stream last-consumed-token depth max-depth + tasks + (λ (success-k fail-k max-depth tasks) + (let ([tasks (if immediate-next? + tasks + (queue-next next-k tasks))]) + (get-nth max-depth tasks success-k fail-k)))))) + (define (get-first max-depth tasks success-k fail-k) + (wait-for-answer #f max-depth tasks answer-key + (mk-got-one #t get-first success-k) + (λ (max-depth tasks) + (get-second max-depth tasks success-k fail-k)) + #f)) + (define (get-second max-depth tasks success-k fail-k) + (wait-for-answer #f max-depth tasks answer-key + (mk-got-one #f get-second success-k) + fail-k #f)) + (get-first max-depth tasks success-k fail-k))) + +;; Non-terminal alternatives where the first is "simple" can be done +;; sequentially, which is simpler +(define (parse-or parse-a parse-b + stream last-consumed-token depth end success-k fail-k max-depth tasks) + (define ((mk-got-k success-k fail-k) val stream last-consumed-token depth max-depth tasks next-k) + (success-k val stream last-consumed-token depth + max-depth tasks + (λ (success-k fail-k max-depth tasks) + (next-k (mk-got-k success-k fail-k) + (mk-fail-k success-k fail-k) + max-depth tasks)))) + (define ((mk-fail-k success-k fail-k) max-depth tasks) + (parse-b stream last-consumed-token depth end success-k fail-k max-depth tasks)) + (parse-a stream last-consumed-token depth end + (mk-got-k success-k fail-k) + (mk-fail-k success-k fail-k) + max-depth tasks)) + +;; Starts a thread +(define (queue-task tasks t [progress? #t]) + (make-tasks (tasks-active tasks) + (cons t (tasks-active-back tasks)) + (tasks-waits tasks) + (tasks-multi-waits tasks) + (tasks-cache tasks) + (or progress? (tasks-progress? tasks)))) + +;; Reports an answer to a waiting thread: +(define (report-answer answer-key max-depth tasks val) + (define v (hash-ref (tasks-waits tasks) answer-key (λ () #f))) + (if v + (let ([tasks (make-tasks (cons (v val) (tasks-active tasks)) + (tasks-active-back tasks) + (tasks-waits tasks) + (tasks-multi-waits tasks) + (tasks-cache tasks) + #t)]) + (hash-remove! (tasks-waits tasks) answer-key) + (swap-task max-depth tasks)) + ;; We have an answer ready too fast; wait + (swap-task max-depth + (queue-task tasks + (λ (max-depth tasks) + (report-answer answer-key max-depth tasks val)) + #f)))) + +;; Reports an answer to multiple waiting threads: +(define (report-answer-all answer-key max-depth tasks val k) + (define v (hash-ref (tasks-multi-waits tasks) answer-key (λ () null))) + (hash-remove! (tasks-multi-waits tasks) answer-key) + (let ([tasks (make-tasks (append (map (λ (a) (a val)) v) + (tasks-active tasks)) + (tasks-active-back tasks) + (tasks-waits tasks) + (tasks-multi-waits tasks) + (tasks-cache tasks) + #t)]) + (k max-depth tasks))) + +;; Waits for an answer; if `multi?' is #f, this is sole waiter, otherwise +;; there might be many. Use wither #t or #f (and `report-answer' or +;; `report-answer-all', resptively) consistently for a particular answer key. +(define (wait-for-answer multi? max-depth tasks answer-key success-k fail-k deadlock-k) + (let ([wait (λ (val) + (λ (max-depth tasks) + (if val + (if (null? val) + (fail-k max-depth tasks) + (let-values ([(val stream last-consumed-token depth next-k) (apply values val)]) + (success-k val stream last-consumed-token depth max-depth tasks next-k))) + (deadlock-k max-depth tasks))))]) + (if multi? + (hash-set! (tasks-multi-waits tasks) answer-key + (cons wait (hash-ref (tasks-multi-waits tasks) answer-key + (λ () null)))) + (hash-set! (tasks-waits tasks) answer-key wait)) + (let ([tasks (make-tasks (tasks-active tasks) + (tasks-active-back tasks) + (tasks-waits tasks) + (tasks-multi-waits tasks) + (tasks-cache tasks) + #t)]) + (swap-task max-depth tasks)))) + +;; Swap thread +(define (swap-task max-depth tasks) + ;; Swap in first active: + (if (null? (tasks-active tasks)) + (if (tasks-progress? tasks) + (swap-task max-depth + (make-tasks (reverse (tasks-active-back tasks)) + null + (tasks-waits tasks) + (tasks-multi-waits tasks) + (tasks-cache tasks) + #f)) + ;; No progress, so issue failure for all multi-waits + (if (zero? (hash-count (tasks-multi-waits tasks))) + (error 'swap-task "Deadlock") + (swap-task max-depth + (make-tasks (apply + append + (hash-map (tasks-multi-waits tasks) + (λ (k l) + (map (λ (v) (v #f)) l)))) + (tasks-active-back tasks) + (tasks-waits tasks) + (make-hasheq) + (tasks-cache tasks) + #t)))) + (let ([t (car (tasks-active tasks))] + [tasks (make-tasks (cdr (tasks-active tasks)) + (tasks-active-back tasks) + (tasks-waits tasks) + (tasks-multi-waits tasks) + (tasks-cache tasks) + (tasks-progress? tasks))]) + (t max-depth tasks)))) + +;; Finds the symbolic representative of a token class +(define-for-syntax (map-token toks tok) + (car (token-identifier-mapping-get toks tok))) + +(define no-pos-val (make-position #f #f #f)) +(define-for-syntax no-pos + (let ([npv ((syntax-local-certifier) #'no-pos-val)]) + (λ (stx) npv))) +(define-for-syntax ((at-tok-pos sel expr) stx) + #`(let ([v #,expr]) (if v (#,sel v) no-pos-val))) + +;; Builds a matcher for a particular alternative +(define-for-syntax (build-match nts toks pat handle $ctx) + (let loop ([pat pat] + [pos 1]) + (if (null? pat) + #`(success-k #,handle stream last-consumed-token depth max-depth tasks + (λ (success-k fail-k max-depth tasks) + (fail-k max-depth tasks))) + (let ([id (datum->syntax (car pat) (string->symbol (format "$~a" pos)))] + [id-start-pos (datum->syntax (car pat) (string->symbol (format "$~a-start-pos" pos)))] + [id-end-pos (datum->syntax (car pat) (string->symbol (format "$~a-end-pos" pos)))] + [n-end-pos (and (null? (cdr pat)) (datum->syntax (car pat) '$n-end-pos))]) + (cond + [(bound-identifier-mapping-get nts (car pat) (λ () #f)) + ;; Match non-termimal + #`(parse-and + ;; First part is simple? (If so, we don't have to parallelize the `and'.) + #,(let ([l (bound-identifier-mapping-get nts (car pat) (λ () #f))]) + (or (not l) + (andmap values (caddr l)))) + #,(car pat) + (let ([original-stream stream]) + (λ (#,id stream last-consumed-token depth end success-k fail-k max-depth tasks) + (let-syntax ([#,id-start-pos (at-tok-pos #'(if (eq? original-stream stream) + tok-end + tok-start) + #'(if (eq? original-stream stream) + last-consumed-token + (and (pair? original-stream) + (car original-stream))))] + [#,id-end-pos (at-tok-pos #'tok-end #'last-consumed-token)] + #,@(if n-end-pos + #`([#,n-end-pos (at-tok-pos #'tok-end #'last-consumed-token)]) + null)) + #,(loop (cdr pat) (add1 pos))))) + stream last-consumed-token depth + #,(let ([cnt (apply + + (map (λ (item) + (cond + [(bound-identifier-mapping-get nts item (λ () #f)) + => (λ (l) (car l))] + [else 1])) + (cdr pat)))]) + #`(- end #,cnt)) + success-k fail-k max-depth tasks)] + [else + ;; Match token + (let ([tok-id (map-token toks (car pat))]) + #`(if (and (pair? stream) + (eq? '#,tok-id (tok-name (car stream)))) + (let* ([stream-a (car stream)] + [#,id (tok-val stream-a)] + [last-consumed-token (car stream)] + [stream (cdr stream)] + [depth (add1 depth)]) + (let ([max-depth (max max-depth depth)]) + (let-syntax ([#,id-start-pos (at-tok-pos #'tok-start #'stream-a)] + [#,id-end-pos (at-tok-pos #'tok-end #'stream-a)] + #,@(if n-end-pos + #`([#,n-end-pos (at-tok-pos #'tok-end #'stream-a)]) + null)) + #,(loop (cdr pat) (add1 pos))))) + (fail-k max-depth tasks)))]))))) + +;; Starts parsing to match a non-terminal. There's a minor +;; optimization that checks for known starting tokens. Otherwise, +;; use the cache, block if someone else is already trying the match, +;; and cache the result if it's computed. +;; The cache maps nontermial+startingpos+iteration to a result, where +;; the iteration is 0 for the first match attempt, 1 for the second, +;; etc. +(define (parse-nt/share key min-cnt init-tokens stream last-consumed-token depth end max-depth tasks success-k fail-k k) + (if (and (positive? min-cnt) + (pair? stream) + (not (memq (tok-name (car stream)) init-tokens))) + ;; No such leading token; give up + (fail-k max-depth tasks) + ;; Run pattern + (let loop ([n 0] + [success-k success-k] + [fail-k fail-k] + [max-depth max-depth] + [tasks tasks] + [k k]) + (define answer-key (gensym)) + (define table-key (vector key depth n)) + (define old-depth depth) + (define old-stream stream) + #;(printf "Loop ~a\n" table-key) + (cond + [(hash-ref (tasks-cache tasks) table-key (λ () #f)) + => (λ (result) + #;(printf "Reuse ~a\n" table-key) + (result success-k fail-k max-depth tasks))] + [else + #;(printf "Try ~a ~a\n" table-key (map tok-name stream)) + (hash-set! (tasks-cache tasks) table-key + (λ (success-k fail-k max-depth tasks) + #;(printf "Wait ~a ~a\n" table-key answer-key) + (wait-for-answer #t max-depth tasks answer-key success-k fail-k + (λ (max-depth tasks) + #;(printf "Deadlock ~a ~a\n" table-key answer-key) + (fail-k max-depth tasks))))) + (let result-loop ([max-depth max-depth][tasks tasks][k k]) + (define orig-stream stream) + (define (new-got-k val stream last-consumed-token depth max-depth tasks next-k) + ;; Check whether we already have a result that consumed the same amount: + (define result-key (vector #f key old-depth depth)) + (cond + [(hash-ref (tasks-cache tasks) result-key (λ () #f)) + ;; Go for the next-result + (result-loop max-depth + tasks + (λ (end max-depth tasks success-k fail-k) + (next-k success-k fail-k max-depth tasks)))] + [else + #;(printf "Success ~a ~a\n" table-key + (map tok-name (let loop ([d old-depth][s old-stream]) + (if (= d depth) + null + (cons (car s) (loop (add1 d) (cdr s))))))) + (let ([next-k (λ (success-k fail-k max-depth tasks) + (loop (add1 n) + success-k + fail-k + max-depth + tasks + (λ (end max-depth tasks success-k fail-k) + (next-k success-k fail-k max-depth tasks))))]) + (hash-set! (tasks-cache tasks) result-key #t) + (hash-set! (tasks-cache tasks) table-key + (λ (success-k fail-k max-depth tasks) + (success-k val stream last-consumed-token depth max-depth tasks next-k))) + (report-answer-all answer-key + max-depth + tasks + (list val stream last-consumed-token depth next-k) + (λ (max-depth tasks) + (success-k val stream last-consumed-token depth max-depth tasks next-k))))])) + (define (new-fail-k max-depth tasks) + #;(printf "Failure ~a\n" table-key) + (hash-set! (tasks-cache tasks) table-key + (λ (success-k fail-k max-depth tasks) + (fail-k max-depth tasks))) + (report-answer-all answer-key + max-depth + tasks + null + (λ (max-depth tasks) + (fail-k max-depth tasks)))) + (k end max-depth tasks new-got-k new-fail-k))])))) + +;; These temp identifiers can't be `gensym` or `generate-temporary` +;; because they have to be consistent between module loads +;; (IIUC, the parser is multi-threaded, and this approach is not thread-safe) +;; so I see no alternative to the old standby of making them ludicrously unlikely +(define-for-syntax start-id-temp 'start_jihqolbbafscgxvsufnepvmxqipnxgmlpxukmdoqxqzmzgaogaftbkbyqjttwwfimifowdxfyekjiixdmtprfkcvfciraehoeuaz) +(define-for-syntax atok-id-temp 'atok_wrutdjgecmybyfipiwsgjlvsveryodlgassuzcargiuznzgdghrykfqfbwcjgzdhdoeqxcucmtjkuyucskzethozhqkasphdwbht) +(define-syntax (cfg-parser stx) + (syntax-case stx () + [(_ CLAUSE ...) + (let ([clauses (syntax->list #'(CLAUSE ...))]) + (let-values ([(start grammar cfg-error parser-clauses src-pos?) + (let ([all-toks (apply + append + (for/list ([clause (in-list clauses)]) + (syntax-case clause (tokens) + [(tokens T ...) + (apply + append + (for/list ([t (in-list (syntax->list #'(T ...)))]) + (define v (syntax-local-value t (λ () #f))) + (cond + [(terminals-def? v) + (for/list ([v (in-list (syntax->list (terminals-def-t v)))]) + (cons v #f))] + [(e-terminals-def? v) + (for/list ([v (in-list (syntax->list (e-terminals-def-t v)))]) + (cons v #t))] + [else null])))] + [_else null])))] + [all-end-toks (apply + append + (for/list ([clause (in-list clauses)]) + (syntax-case clause (end) + [(end T ...) + (syntax->list #'(T ...))] + [_else null])))]) + (let loop ([clauses clauses] + [cfg-start #f] + [cfg-grammar #f] + [cfg-error #f] + [src-pos? #f] + [parser-clauses null]) + (if (null? clauses) + (values cfg-start + cfg-grammar + cfg-error + (reverse parser-clauses) + src-pos?) + (syntax-case (car clauses) (start error grammar src-pos) + [(start TOK) + (loop (cdr clauses) #'TOK cfg-grammar cfg-error src-pos? parser-clauses)] + [(error EXPR) + (loop (cdr clauses) cfg-start cfg-grammar #'EXPR src-pos? parser-clauses)] + [(grammar [NT [PAT HANDLE0 HANDLE ...] ...] ...) + (let ([nts (make-bound-identifier-mapping)] + [toks (make-token-identifier-mapping)] + [end-toks (make-token-identifier-mapping)] + [nt-ids (syntax->list #'(NT ...))] + [patss (map (λ (stx) + (map syntax->list (syntax->list stx))) + (syntax->list #'((PAT ...) ...)))]) + (for ([nt (in-list nt-ids)]) + (bound-identifier-mapping-put! nts nt (list 0))) + (for ([t (in-list all-end-toks)]) + (token-identifier-mapping-put! end-toks t #t)) + (for ([t (in-list all-toks)] + #:unless (token-identifier-mapping-get end-toks (car t) (λ () #f))) + (define id (gensym (syntax-e (car t)))) + (token-identifier-mapping-put! toks (car t) (cons id (cdr t)))) + ;; Compute min max size for each non-term: + (nt-fixpoint + nts + (λ (nt pats old-list) + (let ([new-cnt + (apply min (for/list ([pat (in-list pats)]) + (for/sum ([elem (in-list pat)]) + (car (bound-identifier-mapping-get + nts elem (λ () (list 1)))))))]) + (if (new-cnt . > . (car old-list)) + (cons new-cnt (cdr old-list)) + old-list))) + nt-ids patss) + ;; Compute set of toks that must appear at the beginning + ;; for a non-terminal + (nt-fixpoint + nts + (λ (nt pats old-list) + (let ([new-list + (apply + append + (for/list ([pat (in-list pats)]) + (let loop ([pat pat]) + (if (pair? pat) + (let ([l (bound-identifier-mapping-get + nts + (car pat) + (λ () + (list 1 (map-token toks (car pat)))))]) + ;; If the non-terminal can match 0 things, + ;; then it might match something from the + ;; next pattern element. Otherwise, it must + ;; match the first element: + (if (zero? (car l)) + (append (cdr l) (loop (cdr pat))) + (cdr l))) + null))))]) + (let ([new (filter (λ (id) + (andmap (λ (id2) + (not (eq? id id2))) + (cdr old-list))) + new-list)]) + (if (pair? new) + ;; Drop dups in new list: + (let ([new (let loop ([new new]) + (if (null? (cdr new)) + new + (if (ormap (λ (id) + (eq? (car new) id)) + (cdr new)) + (loop (cdr new)) + (cons (car new) (loop (cdr new))))))]) + (cons (car old-list) (append new (cdr old-list)))) + old-list)))) + nt-ids patss) + ;; Determine left-recursive clauses: + (for-each (λ (nt pats) + (let ([l (bound-identifier-mapping-get nts nt)]) + (bound-identifier-mapping-put! nts nt (list (car l) + (cdr l) + (map (λ (x) #f) pats))))) + nt-ids patss) + (nt-fixpoint + nts + (λ (nt pats old-list) + (list (car old-list) + (cadr old-list) + (map (λ (pat simple?) + (or simple? + (let ([l (map (λ (elem) + (bound-identifier-mapping-get + nts + elem + (λ () #f))) + pat)]) + (andmap (λ (i) + (or (not i) + (andmap values (caddr i)))) + l)))) + pats (caddr old-list)))) + nt-ids patss) + ;; Build a definition for each non-term: + (loop (cdr clauses) + cfg-start + (map (λ (nt pats handles $ctxs) + (define info (bound-identifier-mapping-get nts nt)) + (list nt + #`(let ([key (gensym '#,nt)]) + (λ (stream last-consumed-token depth end success-k fail-k max-depth tasks) + (parse-nt/share + key #,(car info) '#,(cadr info) stream last-consumed-token depth end + max-depth tasks + success-k fail-k + (λ (end max-depth tasks success-k fail-k) + #,(let loop ([pats pats] + [handles (syntax->list handles)] + [$ctxs (syntax->list $ctxs)] + [simple?s (caddr info)]) + (if (null? pats) + #'(fail-k max-depth tasks) + #`(#,(if (or (null? (cdr pats)) + (car simple?s)) + #'parse-or + #'parse-parallel-or) + (λ (stream last-consumed-token depth end success-k fail-k max-depth tasks) + #,(build-match nts + toks + (car pats) + (car handles) + (car $ctxs))) + (λ (stream last-consumed-token depth end success-k fail-k max-depth tasks) + #,(loop (cdr pats) + (cdr handles) + (cdr $ctxs) + (cdr simple?s))) + stream last-consumed-token depth end success-k fail-k max-depth tasks))))))))) + nt-ids + patss + (syntax->list #'(((begin HANDLE0 HANDLE ...) ...) ...)) + (syntax->list #'((HANDLE0 ...) ...))) + cfg-error + src-pos? + (list* + (with-syntax ([((tok tok-id . $e) ...) + (token-identifier-mapping-map toks + (λ (k v) + (list* k + (car v) + (if (cdr v) + #f + '$1))))] + [(pos ...) + (if src-pos? + #'($1-start-pos $1-end-pos) + #'(#f #f))] + ;; rename `start` and `atok` to temp ids + ;; so that "start" and "atok" can be used as literal string tokens in a grammar. + ;; not sure why this works, but it passes all tests. + [%start start-id-temp] + [%atok atok-id-temp]) + #`(grammar (%start [() null] + [(%atok %start) (cons $1 $2)]) + (%atok [(tok) (make-tok 'tok-id 'tok $e pos ...)] ...))) + (with-syntax ([%start start-id-temp]) + #`(start %start)) + parser-clauses)))] + [(grammar . _) + (raise-syntax-error + #f + "bad grammar clause" + stx + (car clauses))] + [(src-pos) + (loop (cdr clauses) + cfg-start + cfg-grammar + cfg-error + #t + (cons (car clauses) parser-clauses))] + [_else + (loop (cdr clauses) + cfg-start + cfg-grammar + cfg-error + src-pos? + (cons (car clauses) parser-clauses))]))))]) + #`(let ([orig-parse (parser + [error (λ (a b c) + (error 'cfg-parser "unexpected ~a token: ~a" b c))] + . #,parser-clauses)] + [error-proc #,cfg-error]) + (letrec #,grammar + (λ (get-tok) + (let ([tok-list (orig-parse get-tok)]) + (letrec ([success-k + (λ (val stream last-consumed-token depth max-depth tasks next) + (if (null? stream) + val + (next success-k fail-k max-depth tasks)))] + [fail-k (λ (max-depth tasks) + (cond + [(null? tok-list) + (if error-proc + (error-proc #t + 'no-tokens + #f + (make-position #f #f #f) + (make-position #f #f #f)) + (error + 'cfg-parse + "no tokens"))] + [else + (let ([bad-tok (list-ref tok-list + (min (sub1 (length tok-list)) + max-depth))]) + (if error-proc + (error-proc #t + (tok-orig-name bad-tok) + (tok-val bad-tok) + (tok-start bad-tok) + (tok-end bad-tok)) + (error + 'cfg-parse + "failed at ~a" + (tok-val bad-tok))))]))]) + (#,start tok-list + ;; we simulate a token at the very beginning with zero width + ;; for use with the position-generating code (*-start-pos, *-end-pos). + (if (null? tok-list) + (tok #f #f #f + (position 1 + #,(if src-pos? #'1 #'#f) + #,(if src-pos? #'0 #'#f)) + (position 1 + #,(if src-pos? #'1 #'#f) + #,(if src-pos? #'0 #'#f))) + (tok (tok-name (car tok-list)) + (tok-orig-name (car tok-list)) + (tok-val (car tok-list)) + (tok-start (car tok-list)) + (tok-start (car tok-list)))) + 0 + (length tok-list) + success-k + fail-k + 0 + (make-tasks null null + (make-hasheq) (make-hasheq) + (make-hash) #t)))))))))])) + + +(module* test racket/base + (require (submod "..") + br-parser-tools/lex + racket/block + rackunit) + + ;; Test: parsing regular expressions. + ;; Here is a test case on locations: + (block + (define-tokens regexp-tokens (ANCHOR STAR OR LIT LPAREN RPAREN EOF)) + (define lex (lexer-src-pos ["|" (token-OR lexeme)] + ["^" (token-ANCHOR lexeme)] + ["*" (token-STAR lexeme)] + [(repetition 1 +inf.0 alphabetic) (token-LIT lexeme)] + ["(" (token-LPAREN lexeme)] + [")" (token-RPAREN lexeme)] + [whitespace (return-without-pos (lex input-port))] + [(eof) (token-EOF 'eof)])) + (define -parse (cfg-parser + (tokens regexp-tokens) + (start top) + (end EOF) + (src-pos) + (grammar [top [(maybe-anchor regexp) + (cond [$1 + `(anchored ,$2 ,(pos->sexp $1-start-pos) ,(pos->sexp $2-end-pos))] + [else + `(unanchored ,$2 ,(pos->sexp $1-start-pos) ,(pos->sexp $2-end-pos))])]] + [maybe-anchor [(ANCHOR) #t] + [() #f]] + [regexp [(regexp STAR) `(star ,$1 ,(pos->sexp $1-start-pos) ,(pos->sexp $2-end-pos))] + [(regexp OR regexp) `(or ,$1 ,$3 ,(pos->sexp $1-start-pos) ,(pos->sexp $3-end-pos))] + [(LPAREN regexp RPAREN) `(group ,$2 ,(pos->sexp $1-start-pos) ,(pos->sexp $3-end-pos))] + [(LIT) `(lit ,$1 ,(pos->sexp $1-start-pos) ,(pos->sexp $1-end-pos))]]))) + (define (pos->sexp pos) + (position-offset pos)) + + (define (parse s) + (define ip (open-input-string s)) + (port-count-lines! ip) + (-parse (λ () (lex ip)))) + + (check-equal? (parse "abc") + '(unanchored (lit "abc" 1 4) 1 4)) + (check-equal? (parse "a | (b*) | c") + '(unanchored (or (or (lit "a" 1 2) + (group (star (lit "b" 6 7) 6 8) 5 9) + 1 9) + (lit "c" 12 13) + 1 13) + 1 13))) + + + + + + ;; Tests used during development + (define-tokens non-terminals (PLUS MINUS STAR BAR COLON EOF)) + + (define lex + (lexer + ["+" (token-PLUS '+)] + ["-" (token-MINUS '-)] + ["*" (token-STAR '*)] + ["|" (token-BAR '||)] + [":" (token-COLON '|:|)] + [whitespace (lex input-port)] + [(eof) (token-EOF 'eof)])) + + (define parse + (cfg-parser + (tokens non-terminals) + (start ) + (end EOF) + (error (λ (a b stx) + (error 'parse "failed at ~s" stx))) + (grammar [ [(PLUS) "plus"] + [( BAR ) (list $1 $2 $3)] + [( COLON) (list $1)]] + [ [(MINUS) "minus"] + [( STAR) (cons $1 $2)]] + [ [( MINUS) "yes"]] + [ [(PLUS) 'plus] + [(MINUS) 'minus]] + [ [() '0] + [( PLUS) (add1 $1)] + [( PLUS) (add1 $1)]]))) + + (let ([p (open-input-string #;"+*|-|-*|+**" #;"-|+*|+**" + #;"+*|+**|-" #;"-|-*|-|-*" + #;"-|-*|-|-**|-|-*|-|-**" + "-|-*|-|-**|-|-*|-|-***|-|-*|-|-**|-|-*|-|-****|-|-*|-|-**|-|-*|-|-*** + |-|-*|-|-**|-|-*|-|-*****|-|-*|-|-**|-|-*|-|-***|-|-*|-|-**|-|-*|-|-****| + -|-*|-|-**|-|-*|-|-***|-|-*|-|-**|-|-*|-|-*****" + ;; This one fails: + #;"+*")]) + (check-equal? (parse (λ () (lex p))) + '((((((((((("minus" || "minus") . *) || (("minus" || "minus") . *)) . *) || (((("minus" || "minus") . *) || (("minus" || "minus") . *)) . *)) . *) + || + (((((("minus" || "minus") . *) || (("minus" || "minus") . *)) . *) || (((("minus" || "minus") . *) || (("minus" || "minus") . *)) . *)) . *)) + . + *) + || + (((((((("minus" || "minus") . *) || (("minus" || "minus") . *)) . *) || (((("minus" || "minus") . *) || (("minus" || "minus") . *)) . *)) . *) + || + (((((("minus" || "minus") . *) || (("minus" || "minus") . *)) . *) || (((("minus" || "minus") . *) || (("minus" || "minus") . *)) . *)) . *)) + . + *)) + . + *) + || + (((((((((("minus" || "minus") . *) || (("minus" || "minus") . *)) . *) || (((("minus" || "minus") . *) || (("minus" || "minus") . *)) . *)) . *) + || + (((((("minus" || "minus") . *) || (("minus" || "minus") . *)) . *) || (((("minus" || "minus") . *) || (("minus" || "minus") . *)) . *)) . *)) + . + *) + || + (((((((("minus" || "minus") . *) || (("minus" || "minus") . *)) . *) || (((("minus" || "minus") . *) || (("minus" || "minus") . *)) . *)) . *) + || + (((((("minus" || "minus") . *) || (("minus" || "minus") . *)) . *) || (((("minus" || "minus") . *) || (("minus" || "minus") . *)) . *)) . *)) + . + *)) + . + *))))) diff --git a/br-parser-tools-lib/br-parser-tools/examples/calc.rkt b/br-parser-tools-lib/br-parser-tools/examples/calc.rkt new file mode 100644 index 0000000..0d351bd --- /dev/null +++ b/br-parser-tools-lib/br-parser-tools/examples/calc.rkt @@ -0,0 +1,92 @@ +#lang racket/base + +;; An interactive calculator inspired by the calculator example in the bison manual. + + +;; Import the parser and lexer generators. +(require br-parser-tools/yacc + br-parser-tools/lex + (prefix-in : br-parser-tools/lex-sre)) + +(define-tokens value-tokens (NUM VAR FNCT)) +(define-empty-tokens op-tokens (newline = OP CP + - * / ^ EOF NEG)) + +;; A hash table to store variable values in for the calculator +(define vars (make-hash)) + +(define-lex-abbrevs + (lower-letter (:/ "a" "z")) + + (upper-letter (:/ #\A #\Z)) + + ;; (:/ 0 9) would not work because the lexer does not understand numbers. (:/ #\0 #\9) is ok too. + (digit (:/ "0" "9"))) + +(define calc-lex + (lexer + [(eof) 'EOF] + ;; recursively call the lexer on the remaining input after a tab or space. Returning the + ;; result of that operation. This effectively skips all whitespace. + [(:or #\tab #\space) (calc-lex input-port)] + ;; (token-newline) returns 'newline + [#\newline (token-newline)] + ;; Since (token-=) returns '=, just return the symbol directly + [(:or "=" "+" "-" "*" "/" "^") (string->symbol lexeme)] + ["(" 'OP] + [")" 'CP] + ["sin" (token-FNCT sin)] + [(:+ (:or lower-letter upper-letter)) (token-VAR (string->symbol lexeme))] + [(:+ digit) (token-NUM (string->number lexeme))] + [(:: (:+ digit) #\. (:* digit)) (token-NUM (string->number lexeme))])) + + +(define calc-parse + (parser + + (start start) + (end newline EOF) + (tokens value-tokens op-tokens) + (error (lambda (a b c) (void))) + + (precs (right =) + (left - +) + (left * /) + (left NEG) + (right ^)) + + (grammar + + (start [() #f] + ;; If there is an error, ignore everything before the error + ;; and try to start over right after the error + [(error start) $2] + [(exp) $1]) + + (exp [(NUM) $1] + [(VAR) (hash-ref vars $1 (lambda () 0))] + [(VAR = exp) (begin (hash-set! vars $1 $3) + $3)] + [(FNCT OP exp CP) ($1 $3)] + [(exp + exp) (+ $1 $3)] + [(exp - exp) (- $1 $3)] + [(exp * exp) (* $1 $3)] + [(exp / exp) (/ $1 $3)] + [(- exp) (prec NEG) (- $2)] + [(exp ^ exp) (expt $1 $3)] + [(OP exp CP) $2])))) + +;; run the calculator on the given input-port +(define (calc ip) + (port-count-lines! ip) + (let loop () + (define result (calc-parse (λ () (calc-lex ip)))) + (when result + (printf "~a\n" result) + (loop)))) + +(module+ test + (require rackunit) + (check-equal? (let ([o (open-output-string)]) + (parameterize ([current-output-port o]) + (calc (open-input-string "x=1\n(x + 2 * 3) - (1+2)*3"))) + (get-output-string o)) "1\n-2\n")) diff --git a/br-parser-tools-lib/br-parser-tools/examples/read.rkt b/br-parser-tools-lib/br-parser-tools/examples/read.rkt new file mode 100644 index 0000000..b01f77a --- /dev/null +++ b/br-parser-tools-lib/br-parser-tools/examples/read.rkt @@ -0,0 +1,240 @@ +#lang racket/base +;; This implements the equivalent of racket's read-syntax for R5RS scheme. +;; It has not been thoroughly tested. Also it will read an entire file into a +;; list of syntax objects, instead of returning one syntax object at a time + +(require (for-syntax racket/base) + br-parser-tools/lex + (prefix-in : br-parser-tools/lex-sre) + br-parser-tools/yacc + syntax/readerr) + +(define-tokens data (DATUM)) +(define-empty-tokens delim (OP CP HASHOP QUOTE QUASIQUOTE UNQUOTE UNQUOTE-SPLICING DOT EOF)) + +(define scheme-lexer + (lexer-src-pos + + ;; Skip comments, without accumulating extra position information + [(:or scheme-whitespace comment) (return-without-pos (scheme-lexer input-port))] + + ["#t" (token-DATUM #t)] + ["#f" (token-DATUM #f)] + [(:: "#\\" any-char) (token-DATUM (caddr (string->list lexeme)))] + ["#\\space" (token-DATUM #\space)] + ["#\\newline" (token-DATUM #\newline)] + [(:or (:: initial (:* subsequent)) "+" "-" "...") (token-DATUM (string->symbol lexeme))] + [#\" (token-DATUM (list->string (get-string-token input-port)))] + [#\( 'OP] + [#\) 'CP] + [#\[ 'OP] + [#\] 'CP] + ["#(" 'HASHOP] + [num2 (token-DATUM (string->number lexeme 2))] + [num8 (token-DATUM (string->number lexeme 8))] + [num10 (token-DATUM (string->number lexeme 10))] + [num16 (token-DATUM (string->number lexeme 16))] + ["'" 'QUOTE] + ["`" 'QUASIQUOTE] + ["," 'UNQUOTE] + [",@" 'UNQUOTE-SPLICING] + ["." 'DOT] + [(eof) 'EOF])) + +(define get-string-token + (lexer + [(:~ #\" #\\) (cons (car (string->list lexeme)) + (get-string-token input-port))] + [(:: #\\ #\\) (cons #\\ (get-string-token input-port))] + [(:: #\\ #\") (cons #\" (get-string-token input-port))] + [#\" null])) + + +(define-lex-abbrevs + [letter (:or (:/ "a" "z") (:/ #\A #\Z))] + [digit (:/ #\0 #\9)] + [scheme-whitespace (:or #\newline #\return #\tab #\space #\vtab)] + [initial (:or letter (char-set "!$%&*/:<=>?^_~@"))] + [subsequent (:or initial digit (char-set "+-.@"))] + [comment (:: #\; (:* (:~ #\newline)) #\newline)] + + + ;; See ${PLTHOME}/collects/syntax-color/racket-lexer.rkt for an example of + ;; using regexp macros to avoid the cut and paste. + ; [numR (:: prefixR complexR)] + ; [complexR (:or realR + ; (:: realR "@" realR) + ; (:: realR "+" urealR "i") + ; (:: realR "-" urealR "i") + ; (:: realR "+i") + ; (:: realR "-i") + ; (:: "+" urealR "i") + ; (:: "-" urealR "i") + ; (:: "+i") + ; (:: "-i"))] + ; [realR (:: sign urealR)] + ; [urealR (:or uintegerR (:: uintegerR "/" uintegerR) decimalR)] + ; [uintegerR (:: (:+ digitR) (:* #\#))] + ; [prefixR (:or (:: radixR exactness) + ; (:: exactness radixR))] + + [num2 (:: prefix2 complex2)] + [complex2 (:or real2 + (:: real2 "@" real2) + (:: real2 "+" ureal2 "i") + (:: real2 "-" ureal2 "i") + (:: real2 "+i") + (:: real2 "-i") + (:: "+" ureal2 "i") + (:: "-" ureal2 "i") + (:: "+i") + (:: "-i"))] + [real2 (:: sign ureal2)] + [ureal2 (:or uinteger2 (:: uinteger2 "/" uinteger2))] + [uinteger2 (:: (:+ digit2) (:* #\#))] + [prefix2 (:or (:: radix2 exactness) + (:: exactness radix2))] + [radix2 "#b"] + [digit2 (:or "0" "1")] + [num8 (:: prefix8 complex8)] + [complex8 (:or real8 + (:: real8 "@" real8) + (:: real8 "+" ureal8 "i") + (:: real8 "-" ureal8 "i") + (:: real8 "+i") + (:: real8 "-i") + (:: "+" ureal8 "i") + (:: "-" ureal8 "i") + (:: "+i") + (:: "-i"))] + [real8 (:: sign ureal8)] + [ureal8 (:or uinteger8 (:: uinteger8 "/" uinteger8))] + [uinteger8 (:: (:+ digit8) (:* #\#))] + [prefix8 (:or (:: radix8 exactness) + (:: exactness radix8))] + [radix8 "#o"] + [digit8 (:/ "0" "7")] + + [num10 (:: prefix10 complex10)] + [complex10 (:or real10 + (:: real10 "@" real10) + (:: real10 "+" ureal10 "i") + (:: real10 "-" ureal10 "i") + (:: real10 "+i") + (:: real10 "-i") + (:: "+" ureal10 "i") + (:: "-" ureal10 "i") + (:: "+i") + (:: "-i"))] + [real10 (:: sign ureal10)] + [ureal10 (:or uinteger10 (:: uinteger10 "/" uinteger10) decimal10)] + [uinteger10 (:: (:+ digit10) (:* #\#))] + [prefix10 (:or (:: radix10 exactness) + (:: exactness radix10))] + [radix10 (:? "#d")] + [digit10 digit] + [decimal10 (:or (:: uinteger10 suffix) + (:: #\. (:+ digit10) (:* #\#) suffix) + (:: (:+ digit10) #\. (:* digit10) (:* #\#) suffix) + (:: (:+ digit10) (:+ #\#) #\. (:* #\#) suffix))] + + [num16 (:: prefix16 complex16)] + [complex16 (:or real16 + (:: real16 "@" real16) + (:: real16 "+" ureal16 "i") + (:: real16 "-" ureal16 "i") + (:: real16 "+i") + (:: real16 "-i") + (:: "+" ureal16 "i") + (:: "-" ureal16 "i") + "+i" + "-i")] + [real16 (:: sign ureal16)] + [ureal16 (:or uinteger16 (:: uinteger16 "/" uinteger16))] + [uinteger16 (:: (:+ digit16) (:* #\#))] + [prefix16 (:or (:: radix16 exactness) + (:: exactness radix16))] + [radix16 "#x"] + [digit16 (:or digit (:/ #\a #\f) (:/ #\A #\F))] + + + [suffix (:or "" (:: exponent-marker sign (:+ digit10)))] + [exponent-marker (:or "e" "s" "f" "d" "l")] + [sign (:or "" "+" "-")] + [exactness (:or "" "#i" "#e")]) + + +(define stx-for-original-property (read-syntax #f (open-input-string "original"))) + +;; A macro to build the syntax object +(define-syntax (build-so stx) + (syntax-case stx () + ((_ value start end) + (with-syntax ((start-pos (datum->syntax + #'end + (string->symbol + (format "$~a-start-pos" + (syntax->datum #'start))))) + (end-pos (datum->syntax + #'end + (string->symbol + (format "$~a-end-pos" + (syntax->datum #'end))))) + (source (datum->syntax + #'end + 'source-name))) + (syntax + (datum->syntax + #f + value + (list source + (position-line start-pos) + (position-col start-pos) + (position-offset start-pos) + (- (position-offset end-pos) + (position-offset start-pos))) + stx-for-original-property)))))) + +(define (scheme-parser source-name) + (parser + (src-pos) + + (start s) + (end EOF) + (error (lambda (a name val start end) + (raise-read-error + "read-error" + source-name + (position-line start) + (position-col start) + (position-offset start) + (- (position-offset end) + (position-offset start))))) + (tokens data delim) + + + (grammar + + (s [(sexp-list) (reverse $1)]) + + (sexp [(DATUM) (build-so $1 1 1)] + [(OP sexp-list CP) (build-so (reverse $2) 1 3)] + [(HASHOP sexp-list CP) (build-so (list->vector (reverse $2)) 1 3)] + [(QUOTE sexp) (build-so (list 'quote $2) 1 2)] + [(QUASIQUOTE sexp) (build-so (list 'quasiquote $2) 1 2)] + [(UNQUOTE sexp) (build-so (list 'unquote $2) 1 2)] + [(UNQUOTE-SPLICING sexp) (build-so (list 'unquote-splicing $2) 1 2)] + [(OP sexp-list DOT sexp CP) (build-so (append (reverse $2) $4) 1 5)]) + + (sexp-list [() null] + [(sexp-list sexp) (cons $2 $1)])))) + +(define (rs sn ip) + (port-count-lines! ip) + ((scheme-parser sn) (lambda () (scheme-lexer ip)))) + +(define readsyntax + (case-lambda ((sn) (rs sn (current-input-port))) + ((sn ip) (rs sn ip)))) + +(provide (rename-out [readsyntax read-syntax])) diff --git a/br-parser-tools-lib/br-parser-tools/info.rkt b/br-parser-tools-lib/br-parser-tools/info.rkt new file mode 100644 index 0000000..ae66609 --- /dev/null +++ b/br-parser-tools-lib/br-parser-tools/info.rkt @@ -0,0 +1,3 @@ +#lang info + +(define compile-omit-paths '("private-lex/error-tests.rkt")) diff --git a/br-parser-tools-lib/br-parser-tools/lex-plt-v200.rkt b/br-parser-tools-lib/br-parser-tools/lex-plt-v200.rkt new file mode 100644 index 0000000..ce11f2d --- /dev/null +++ b/br-parser-tools-lib/br-parser-tools/lex-plt-v200.rkt @@ -0,0 +1,23 @@ +#lang racket/base +(require (for-syntax racket/base) + br-parser-tools/lex + (prefix-in : br-parser-tools/lex-sre)) + +(provide epsilon ~ + (rename-out [:* *] + [:+ +] + [:? ?] + [:or :] + [:& &] + [:: @] + [:~ ^] + [:/ -])) + +(define-lex-trans (epsilon stx) + (syntax-case stx () + [(_) #'""])) + +(define-lex-trans (~ stx) + (syntax-case stx () + [(_ RE) #'(complement RE)])) + diff --git a/br-parser-tools-lib/br-parser-tools/lex-sre.rkt b/br-parser-tools-lib/br-parser-tools/lex-sre.rkt new file mode 100644 index 0000000..40f2b16 --- /dev/null +++ b/br-parser-tools-lib/br-parser-tools/lex-sre.rkt @@ -0,0 +1,103 @@ +#lang racket/base +(require (for-syntax racket/base) + br-parser-tools/lex) + +(provide (rename-out [sre-* *] + [sre-+ +] + [sre-= =] + [sre->= >=] + [sre-or or] + [sre-- -] + [sre-/ /]) + ? ** : seq & ~ /-only-chars) + +(define-lex-trans (sre-* stx) + (syntax-case stx () + [(_ RE ...) + #'(repetition 0 +inf.0 (union RE ...))])) + +(define-lex-trans (sre-+ stx) + (syntax-case stx () + [(_ RE ...) + #'(repetition 1 +inf.0 (union RE ...))])) + +(define-lex-trans (? stx) + (syntax-case stx () + [(_ RE ...) + #'(repetition 0 1 (union RE ...))])) + +(define-lex-trans (sre-= stx) + (syntax-case stx () + [(_ N RE ...) + #'(repetition N N (union RE ...))])) + +(define-lex-trans (sre->= stx) + (syntax-case stx () + [(_ N RE ...) + #'(repetition N +inf.0 (union RE ...))])) + +(define-lex-trans (** stx) + (syntax-case stx () + [(_ LOW #f RE ...) + #'(** LOW +inf.0 RE ...)] + [(_ LOW HIGH RE ...) + #'(repetition LOW HIGH (union RE ...))])) + +(define-lex-trans (sre-or stx) + (syntax-case stx () + [(_ RE ...) + #'(union RE ...)])) + +(define-lex-trans (: stx) + (syntax-case stx () + [(_ RE ...) + #'(concatenation RE ...)])) + +(define-lex-trans (seq stx) + (syntax-case stx () + [(_ RE ...) + #'(concatenation RE ...)])) + +(define-lex-trans (& stx) + (syntax-case stx () + [(_ RE ...) + #'(intersection RE ...)])) + +(define-lex-trans (~ stx) + (syntax-case stx () + [(_ RE ...) + #'(char-complement (union RE ...))])) + +;; set difference +(define-lex-trans (sre-- stx) + (syntax-case stx () + [(_) + (raise-syntax-error #f + "must have at least one argument" + stx)] + [(_ BIG-RE RE ...) + #'(& BIG-RE (complement (union RE ...)))])) + +(define-lex-trans (sre-/ stx) + (syntax-case stx () + [(_ RANGE ...) + (let ([chars + (apply append (for/list ([r (in-list (syntax->list #'(RANGE ...)))]) + (let ([x (syntax-e r)]) + (cond + [(char? x) (list x)] + [(string? x) (string->list x)] + [else + (raise-syntax-error #f "not a char or string" stx r)]))))]) + (unless (even? (length chars)) + (raise-syntax-error #f "not given an even number of characters" stx)) + #`(/-only-chars #,@chars))])) + +(define-lex-trans (/-only-chars stx) + (syntax-case stx () + [(_ C1 C2) + #'(char-range C1 C2)] + [(_ C1 C2 C ...) + #'(union (char-range C1 C2) (/-only-chars C ...))])) + + diff --git a/br-parser-tools-lib/br-parser-tools/lex.rkt b/br-parser-tools-lib/br-parser-tools/lex.rkt new file mode 100644 index 0000000..c26aeef --- /dev/null +++ b/br-parser-tools-lib/br-parser-tools/lex.rkt @@ -0,0 +1,370 @@ +#lang racket/base + +;; Provides the syntax used to create lexers and the functions needed to +;; create and use the buffer that the lexer reads from. See docs. + +(require (for-syntax racket/list + racket/syntax + syntax/stx + syntax/define + syntax/boundmap + "private-lex/util.rkt" + "private-lex/actions.rkt" + "private-lex/front.rkt" + "private-lex/unicode-chars.rkt" + racket/base + racket/promise)) + +(require racket/stxparam + syntax/readerr + "private-lex/token.rkt") + +(provide lexer lexer-src-pos lexer-srcloc define-lex-abbrev define-lex-abbrevs define-lex-trans + + ;; Dealing with tokens and related structures + define-tokens define-empty-tokens token-name token-value token? + (struct-out position) + (struct-out position-token) + (struct-out srcloc-token) + + ;; File path for highlighting errors while lexing + file-path + lexer-file-path ;; alternate name + + ;; Lex abbrevs for unicode char sets. + any-char any-string nothing alphabetic lower-case upper-case title-case + numeric symbolic punctuation graphic whitespace blank iso-control + + ;; A regular expression operator + char-set) + +;; wrap-action: syntax-object src-pos? -> syntax-object +(define-for-syntax (wrap-action action src-loc-style) + (with-syntax ([action-stx + (cond + [(eq? src-loc-style 'lexer-src-pos) + #`(let/ec ret + (syntax-parameterize + ([return-without-pos (make-rename-transformer #'ret)]) + (make-position-token #,action start-pos end-pos)))] + [(eq? src-loc-style 'lexer-srcloc) + #`(let/ec ret + (syntax-parameterize + ([return-without-srcloc (make-rename-transformer #'ret)]) + (make-srcloc-token #,action lexeme-srcloc)))] + [else action])]) + (syntax/loc action + (λ (start-pos-p end-pos-p lexeme-p input-port-p) + (define lexeme-srcloc-p (make-srcloc (object-name input-port-p) + (position-line start-pos-p) + (position-col start-pos-p) + (position-offset start-pos-p) + (and (number? (position-offset end-pos-p)) + (number? (position-offset start-pos-p)) + (- (position-offset end-pos-p) + (position-offset start-pos-p))))) + (syntax-parameterize + ([start-pos (make-rename-transformer #'start-pos-p)] + [end-pos (make-rename-transformer #'end-pos-p)] + [lexeme (make-rename-transformer #'lexeme-p)] + [input-port (make-rename-transformer #'input-port-p)] + [lexeme-srcloc (make-rename-transformer #'lexeme-srcloc-p)]) + action-stx))))) + +(define-for-syntax (make-lexer-macro caller src-loc-style) + (λ (stx) + (syntax-case stx () + [(_ . RE+ACTS) + (with-disappeared-uses + (let () + (define spec/re-acts (syntax->list #'RE+ACTS)) + (for/and ([x (in-list spec/re-acts)]) + (syntax-case x () + [(RE ACT) #t] + [else (raise-syntax-error caller "not a regular expression / action pair" stx x)])) + (define eof-act (get-special-action spec/re-acts #'eof (case src-loc-style + [(lexer-src-pos) #'(return-without-pos eof)] + [(lexer-srcloc) #'(return-without-srcloc eof)] + [else #'eof]))) + (define spec-act (get-special-action spec/re-acts #'special #'(void))) + (define spec-comment-act (get-special-action spec/re-acts #'special-comment #'#f)) + (define ids (list #'special #'special-comment #'eof)) + (define re-acts (filter (λ (spec/re-act) + (syntax-case spec/re-act () + [((special) act) + (not (ormap + (λ (x) + (and (identifier? #'special) + (module-or-top-identifier=? #'special x))) + ids))] + [_ #t])) spec/re-acts)) + (define names (map (λ (x) (datum->syntax #f (gensym))) re-acts)) + (define acts (map (λ (x) (stx-car (stx-cdr x))) re-acts)) + (define re-actnames (map (λ (re-act name) (list (stx-car re-act) name)) re-acts names)) + (when (null? spec/re-acts) + (raise-syntax-error caller "expected at least one action" stx)) + (define-values (trans start action-names no-look) (build-lexer re-actnames)) + (when (vector-ref action-names start) ;; Start state is final + (unless (and + ;; All the successor states are final + (vector? (vector-ref trans start)) + (andmap (λ (x) (vector-ref action-names (vector-ref x 2))) + (vector->list (vector-ref trans start))) + ;; Each character has a successor state + (let loop ([check 0] + [nexts (vector->list (vector-ref trans start))]) + (cond + [(null? nexts) #f] + [else + (let ([next (car nexts)]) + (and (= (vector-ref next 0) check) + (let ([next-check (vector-ref next 1)]) + (or (>= next-check max-char-num) + (loop (add1 next-check) (cdr nexts))))))]))) + (eprintf "warning: lexer at ~a can accept the empty string\n" stx))) + (with-syntax ([START-STATE-STX start] + [TRANS-TABLE-STX trans] + [NO-LOOKAHEAD-STX no-look] + [(NAME ...) names] + [(ACT ...) (map (λ (a) (wrap-action a src-loc-style)) acts)] + [(ACT-NAME ...) (vector->list action-names)] + [SPEC-ACT-STX (wrap-action spec-act src-loc-style)] + [HAS-COMMENT-ACT?-STX (if (syntax-e spec-comment-act) #t #f)] + [SPEC-COMMENT-ACT-STX (wrap-action spec-comment-act src-loc-style)] + [EOF-ACT-STX (wrap-action eof-act src-loc-style)]) + (syntax/loc stx (let ([NAME ACT] ...) + (let ([proc (lexer-body START-STATE-STX + TRANS-TABLE-STX + (vector ACT-NAME ...) + NO-LOOKAHEAD-STX + SPEC-ACT-STX + HAS-COMMENT-ACT?-STX + SPEC-COMMENT-ACT-STX + EOF-ACT-STX)]) + ;; reverse eta to get named procedures: + (λ (port) (proc port))))))))]))) + +(define-syntax lexer (make-lexer-macro 'lexer #f)) +(define-syntax lexer-src-pos (make-lexer-macro 'lexer-src-pos 'lexer-src-pos)) +(define-syntax lexer-srcloc (make-lexer-macro 'lexer-srcloc 'lexer-srcloc)) + +(define-syntax (define-lex-abbrev stx) + (syntax-case stx () + [(_ NAME RE) (identifier? #'NAME) + (syntax/loc stx + (define-syntax NAME + (make-lex-abbrev (λ () (quote-syntax RE)))))] + [_ (raise-syntax-error 'define-lex-abbrev "form should be (define-lex-abbrev name re)" stx)])) + +(define-syntax (define-lex-abbrevs stx) + (syntax-case stx () + [(_ . XS) + (with-syntax ([(ABBREV ...) (map + (λ (a) + (syntax-case a () + [(NAME RE) (identifier? #'NAME) + (syntax/loc a (define-lex-abbrev NAME RE))] + [_ (raise-syntax-error + #f + "form should be (define-lex-abbrevs (name re) ...)" + stx + a)])) + (syntax->list #'XS))]) + (syntax/loc stx (begin ABBREV ...)))] + [_ (raise-syntax-error #f "form should be (define-lex-abbrevs (name re) ...)" stx)])) + +(define-syntax (define-lex-trans stx) + (syntax-case stx () + [(_ name-form body-form) + (let-values (((name body) + (normalize-definition #'(define-syntax name-form body-form) #'λ))) + + #`(define-syntax #,name + (let ((func #,body)) + (unless (procedure? func) + (raise-syntax-error 'define-lex-trans "expected a procedure as the transformer, got ~e" func)) + (unless (procedure-arity-includes? func 1) + (raise-syntax-error 'define-lex-trans "expected a procedure that accepts 1 argument as the transformer, got ~e" func)) + (make-lex-trans func))))] + [_ + (raise-syntax-error + #f + "form should be (define-lex-trans name transformer)" + stx)])) + + +(define (get-next-state-helper char min max table) + (cond + [(>= min max) #f] + [else + (define try (quotient (+ min max) 2)) + (define el (vector-ref table try)) + (define r1 (vector-ref el 0)) + (define r2 (vector-ref el 1)) + (cond + [(and (>= char r1) (<= char r2)) (vector-ref el 2)] + [(< char r1) (get-next-state-helper char min try table)] + [else (get-next-state-helper char (add1 try) max table)])])) + + + + +(define (get-next-state char table) + (and table (get-next-state-helper char 0 (vector-length table) table))) + +(define ((lexer-body start-state trans-table actions no-lookahead special-action + has-special-comment-action? special-comment-action eof-action) ip) + (define (lexer ip) + (define first-pos (get-position ip)) + (define first-char (peek-char-or-special ip 0)) + ;(printf "(peek-char-or-special port 0) = ~e\n" first-char) + (cond + [(eof-object? first-char) + (do-match ip first-pos eof-action (read-char-or-special ip))] + [(special-comment? first-char) + (read-char-or-special ip) + (cond + (has-special-comment-action? + (do-match ip first-pos special-comment-action #f)) + (else (lexer ip)))] + [(not (char? first-char)) + (do-match ip first-pos special-action (read-char-or-special ip))] + [else + (let lexer-loop ( + ;; current-state + [state start-state] + ;; the character to transition on + [char first-char] + ;; action for the longest match seen thus far + ;; including a match at the current state + [longest-match-action + (vector-ref actions start-state)] + ;; how many bytes precede char + [length-bytes 0] + ;; how many characters have been read + ;; including the one just read + [length-chars 1] + ;; how many characters are in the longest match + [longest-match-length 0]) + (define next-state + (cond + [(not (char? char)) #f] + [else (get-next-state (char->integer char) + (vector-ref trans-table state))])) + (cond + [(not next-state) + (check-match ip first-pos longest-match-length + length-chars longest-match-action)] + [(vector-ref no-lookahead next-state) + (define act (vector-ref actions next-state)) + (check-match ip + first-pos + (if act length-chars longest-match-length) + length-chars + (if act act longest-match-action))] + [else + (define act (vector-ref actions next-state)) + (define next-length-bytes (+ (char-utf-8-length char) length-bytes)) + (define next-char (peek-char-or-special ip next-length-bytes)) + #;(printf "(peek-char-or-special port ~e) = ~e\n" + next-length-bytes next-char) + (lexer-loop next-state + next-char + (if act + act + longest-match-action) + next-length-bytes + (add1 length-chars) + (if act + length-chars + longest-match-length))]))])) + (unless (input-port? ip) + (raise-argument-error 'lexer "input-port?" 0 ip)) + (lexer ip)) + +(define (check-match lb first-pos longest-match-length length longest-match-action) + (unless longest-match-action + (define match (read-string length lb)) + (define end-pos (get-position lb)) + (raise-read-error + (format "lexer: No match found in input starting with: ~v" match) + (file-path) + (position-line first-pos) + (position-col first-pos) + (position-offset first-pos) + (- (position-offset end-pos) (position-offset first-pos)))) + (define match (read-string longest-match-length lb)) + ;(printf "(read-string ~e port) = ~e\n" longest-match-length match) + (do-match lb first-pos longest-match-action match)) + +(define file-path (make-parameter #f)) +(define lexer-file-path file-path) + +(define (do-match ip first-pos action value) + #;(printf "(action ~a ~a ~a ~a)\n" + (position-offset first-pos) (position-offset (get-position ip)) value ip) + (action first-pos (get-position ip) value ip)) + +(define (get-position ip) + (define-values (line col off) (port-next-location ip)) + (make-position off line col)) + +(define-syntax (create-unicode-abbrevs stx) + (syntax-case stx () + [(_ CTXT) + (with-syntax ([(RANGES ...) (for/list ([range (in-list (list (force alphabetic-ranges) + (force lower-case-ranges) + (force upper-case-ranges) + (force title-case-ranges) + (force numeric-ranges) + (force symbolic-ranges) + (force punctuation-ranges) + (force graphic-ranges) + (force whitespace-ranges) + (force blank-ranges) + (force iso-control-ranges)))]) + `(union ,@(map (λ (x) + `(char-range ,(integer->char (car x)) + ,(integer->char (cdr x)))) + range)))] + [(NAMES ...) (for/list ([sym (in-list '(alphabetic + lower-case + upper-case + title-case + numeric + symbolic + punctuation + graphic + whitespace + blank + iso-control))]) + (datum->syntax #'CTXT sym #f))]) + #'(define-lex-abbrevs (NAMES RANGES) ...))])) + +(define-lex-abbrev any-char (char-complement (union))) +(define-lex-abbrev any-string (intersection)) +(define-lex-abbrev nothing (union)) +(create-unicode-abbrevs #'here) + +(define-lex-trans (char-set stx) + (syntax-case stx () + [(_ STR) + (string? (syntax-e #'STR)) + (with-syntax ([(CHAR ...) (string->list (syntax-e #'STR))]) + #'(union CHAR ...))])) + +(define-syntax provide-lex-keyword + (syntax-rules () + [(_ ID ...) + (begin + (define-syntax-parameter ID + (make-set!-transformer + (λ (stx) + (raise-syntax-error + 'provide-lex-keyword + (format "use of a lexer keyword (~a) is not in an appropriate lexer action" 'ID) + stx)))) + ... + (provide ID ...))])) + +(provide-lex-keyword start-pos end-pos lexeme lexeme-srcloc input-port return-without-pos return-without-srcloc) diff --git a/br-parser-tools-lib/br-parser-tools/private-lex/actions.rkt b/br-parser-tools-lib/br-parser-tools/private-lex/actions.rkt new file mode 100644 index 0000000..34b7812 --- /dev/null +++ b/br-parser-tools-lib/br-parser-tools/private-lex/actions.rkt @@ -0,0 +1,15 @@ +#lang racket/base +(provide (all-defined-out)) +(require syntax/stx) + +;; get-special-action: (syntax-object list) syntax-object syntax-object -> syntax-object +;; Returns the first action from a rule of the form ((which-special) action) +(define (get-special-action rules which-special none) + (cond + [(null? rules) none] + [else + (syntax-case (car rules) () + [((special) ACT) + (and (identifier? #'special) (module-or-top-identifier=? #'special which-special)) + #'ACT] + [_ (get-special-action (cdr rules) which-special none)])])) diff --git a/br-parser-tools-lib/br-parser-tools/private-lex/deriv.rkt b/br-parser-tools-lib/br-parser-tools/private-lex/deriv.rkt new file mode 100644 index 0000000..d87e846 --- /dev/null +++ b/br-parser-tools-lib/br-parser-tools/private-lex/deriv.rkt @@ -0,0 +1,333 @@ +#lang racket/base +(require racket/list + (prefix-in is: data/integer-set) + "re.rkt" + "util.rkt") + +(provide build-dfa print-dfa (struct-out dfa)) + +(define e (build-epsilon)) +(define z (build-zero)) + + +;; Don't do anything with this one but extract the chars +(define all-chars (->re `(char-complement (union)) (make-cache))) + +;; get-char-groups : re bool -> (list-of char-setR?) +;; Collects the char-setRs in r that could be used in +;; taking the derivative of r. +(define (get-char-groups r found-negation) + (cond + [(or (eq? r e) (eq? r z)) null] + [(char-setR? r) (list r)] + [(concatR? r) + (if (re-nullable? (concatR-re1 r)) + (append (get-char-groups (concatR-re1 r) found-negation) + (get-char-groups (concatR-re2 r) found-negation)) + (get-char-groups (concatR-re1 r) found-negation))] + [(repeatR? r) + (get-char-groups (repeatR-re r) found-negation)] + [(orR? r) + (apply append (map (λ (x) (get-char-groups x found-negation)) (orR-res r)))] + [(andR? r) + (apply append (map (λ (x) (get-char-groups x found-negation)) (andR-res r)))] + [(negR? r) + (if found-negation + (get-char-groups (negR-re r) #t) + (cons all-chars (get-char-groups (negR-re r) #t)))])) + +(test-block ((c (make-cache)) + (r1 (->re #\1 c)) + (r2 (->re #\2 c))) + ((get-char-groups e #f) null) + ((get-char-groups z #f) null) + ((get-char-groups r1 #f) (list r1)) + ((get-char-groups (->re `(concatenation ,r1 ,r2) c) #f) + (list r1)) + ((get-char-groups (->re `(concatenation ,e ,r2) c) #f) + (list r2)) + ((get-char-groups (->re `(concatenation (repetition 0 +inf.0 ,r1) ,r2) c) #f) + (list r1 r2)) + ((get-char-groups (->re `(repetition 0 +inf.0 ,r1) c) #f) + (list r1)) + ((get-char-groups + (->re `(union (repetition 0 +inf.0 ,r1) + (concatenation (repetition 0 +inf.0 ,r2) "3") "4") c) #f) + (list r1 r2 (->re "3" c) (->re "4" c))) + ((get-char-groups (->re `(complement ,r1) c) #f) + (list all-chars r1)) + ((get-char-groups + (->re `(intersection (repetition 0 +inf.0 ,r1) + (concatenation (repetition 0 +inf.0 ,r2) "3") "4") c) #f) + (list r1 r2 (->re "3" c) (->re "4" c))) + ) +(define loc:member? is:member?) + +;; deriveR : re char cache -> re +(define (deriveR r c cache) + (cond + [(or (eq? r e) (eq? r z)) z] + [(char-setR? r) + (if (loc:member? c (char-setR-chars r)) e z)] + [(concatR? r) + (define r1 (concatR-re1 r)) + (define r2 (concatR-re2 r)) + (define d (build-concat (deriveR r1 c cache) r2 cache)) + (if (re-nullable? r1) + (build-or (list d (deriveR r2 c cache)) cache) + d)] + [(repeatR? r) + (build-concat (deriveR (repeatR-re r) c cache) + (build-repeat (sub1 (repeatR-low r)) + (sub1 (repeatR-high r)) + (repeatR-re r) cache) + cache)] + [(orR? r) + (build-or (map (λ (x) (deriveR x c cache)) + (orR-res r)) + cache)] + [(andR? r) + (build-and (map (λ (x) (deriveR x c cache)) + (andR-res r)) + cache)] + [(negR? r) + (build-neg (deriveR (negR-re r) c cache) cache)])) + +(test-block ((c (make-cache)) + (a (char->integer #\a)) + (b (char->integer #\b)) + (r1 (->re #\a c)) + (r2 (->re `(repetition 0 +inf.0 #\a) c)) + (r3 (->re `(repetition 0 +inf.0 ,r2) c)) + (r4 (->re `(concatenation #\a ,r2) c)) + (r5 (->re `(repetition 0 +inf.0 ,r4) c)) + (r6 (->re `(union ,r5 #\a) c)) + (r7 (->re `(concatenation ,r2 ,r2) c)) + (r8 (->re `(complement ,r4) c)) + (r9 (->re `(intersection ,r2 ,r4) c))) + ((deriveR e a c) z) + ((deriveR z a c) z) + ((deriveR r1 b c) z) + ((deriveR r1 a c) e) + ((deriveR r2 a c) r2) + ((deriveR r2 b c) z) + ((deriveR r3 a c) r2) + ((deriveR r3 b c) z) + ((deriveR r4 a c) r2) + ((deriveR r4 b c) z) + ((deriveR r5 a c) (->re `(concatenation ,r2 ,r5) c)) + ((deriveR r5 b c) z) + ((deriveR r6 a c) (->re `(union (concatenation ,r2 ,r5) "") c)) + ((deriveR r6 b c) z) + ((deriveR r7 a c) (->re `(union (concatenation ,r2 ,r2) ,r2) c)) + ((deriveR r7 b c) z) + ((deriveR r8 a c) (->re `(complement, r2) c)) + ((deriveR r8 b c) (->re `(complement ,z) c)) + ((deriveR r9 a c) r2) + ((deriveR r9 b c) z) + ((deriveR (->re `(repetition 1 2 "ab") c) a c) + (->re `(concatenation "b" (repetition 0 1 "ab")) c))) + +;; An re-action is (cons re action) + +;; derive : (list-of re-action) char cache -> (union (list-of re-action) #f) +;; applies deriveR to all the re-actions's re parts. +;; Returns #f if the derived state is equivalent to z. +(define (derive r c cache) + (define new-r (for/list ([ra (in-list r)]) + (cons (deriveR (car ra) c cache) (cdr ra)))) + (if (andmap (λ (x) (eq? z (car x))) new-r) + #f + new-r)) + +(test-block ((c (make-cache)) + (r1 (->re #\1 c)) + (r2 (->re #\2 c))) + ((derive null (char->integer #\1) c) #f) + ((derive (list (cons r1 1) (cons r2 2)) (char->integer #\1) c) + (list (cons e 1) (cons z 2))) + ((derive (list (cons r1 1) (cons r2 2)) (char->integer #\3) c) #f)) + + +;; get-final : (list-of re-action) -> (union #f syntax-object) +;; An re that accepts e represents a final state. Return the +;; action from the first final state or #f if there is none. +(define (get-final res) + (cond + [(null? res) #f] + [(re-nullable? (caar res)) (cdar res)] + [else (get-final (cdr res))])) + +(test-block ((c->i char->integer) + (c (make-cache)) + (r1 (->re #\a c)) + (r2 (->re #\b c)) + (b (list (cons z 1) (cons z 2) (cons z 3) (cons e 4) (cons z 5))) + (a (list (cons r1 1) (cons r2 2)))) + ((derive null (c->i #\a) c) #f) + ((derive a (c->i #\a) c) (list (cons e 1) (cons z 2))) + ((derive a (c->i #\b) c) (list (cons z 1) (cons e 2))) + ((derive a (c->i #\c) c) #f) + ((derive (list (cons (->re `(union " " "\n" ",") c) 1) + (cons (->re `(concatenation (repetition 0 1 "-") + (repetition 1 +inf.0 (char-range "0" "9"))) c) 2) + (cons (->re `(concatenation "-" (repetition 1 +inf.0 "-")) c) 3) + (cons (->re "[" c) 4) + (cons (->re "]" c) 5)) (c->i #\[) c) + b) + ((get-final a) #f) + ((get-final (list (cons e 1) (cons e 2))) 1) + ((get-final b) 4)) + + +;; A state is (make-state (list-of re-action) nat) +(define-struct state (spec index)) + +;; get->key : re-action -> (list-of nat) +;; states are indexed by the list of indexes of their res +(define (get-key s) + (map (λ (x) (re-index (car x))) s)) + +(define loc:partition is:partition) + +;; compute-chars : (list-of state) -> (list-of char-set) +;; Computed the sets of equivalent characters for taking the +;; derivative of the car of st. Only one derivative per set need to be taken. +(define (compute-chars st) + (cond + [(null? st) null] + [else + (loc:partition (map char-setR-chars + (apply append (map (λ (x) (get-char-groups (car x) #f)) + (state-spec (car st))))))])) + +(test-block ((c (make-cache)) + (c->i char->integer) + (r1 (->re `(char-range #\1 #\4) c)) + (r2 (->re `(char-range #\2 #\3) c))) + ((compute-chars null) null) + ((compute-chars (list (make-state null 1))) null) + ((map is:integer-set-contents + (compute-chars (list (make-state (list (cons r1 1) (cons r2 2)) 2)))) + (list (is:integer-set-contents (is:make-range (c->i #\2) (c->i #\3))) + (is:integer-set-contents (is:union (is:make-range (c->i #\1)) + (is:make-range (c->i #\4))))))) + + +;; A dfa is (make-dfa int int +;; (list-of (cons int syntax-object)) +;; (list-of (cons int (list-of (cons char-set int))))) +;; Each transitions is a state and a list of chars with the state to transition to. +;; The finals and transitions are sorted by state number, and duplicate free. +(define-struct dfa (num-states start-state final-states/actions transitions) #:inspector (make-inspector)) + +(define loc:get-integer is:get-integer) + +;; build-dfa : (list-of re-action) cache -> dfa +(define (build-dfa rs cache) + (let* ([transitions (make-hash)] + [get-state-number (make-counter)] + [start (make-state rs (get-state-number))]) + (cache (cons 'state (get-key rs)) (λ () start)) + (let loop ([old-states (list start)] + [new-states null] + [all-states (list start)] + [cs (compute-chars (list start))]) + (cond + [(and (null? old-states) (null? new-states)) + (make-dfa (get-state-number) (state-index start) + (sort (for*/list ([state (in-list all-states)] + [val (in-value (cons (state-index state) (get-final (state-spec state))))] + #:when (cdr val)) + val) + < #:key car) + (sort (hash-map transitions + (λ (state trans) + (cons (state-index state) + (for/list ([t (in-list trans)]) + (cons (car t) + (state-index (cdr t))))))) + < #:key car))] + [(null? old-states) + (loop new-states null all-states (compute-chars new-states))] + [(null? cs) + (loop (cdr old-states) new-states all-states (compute-chars (cdr old-states)))] + [else + (define state (car old-states)) + (define c (car cs)) + (define new-re (derive (state-spec state) (loc:get-integer c) cache)) + (cond + [new-re + (let* ([new-state? #f] + [new-state (cache (cons 'state (get-key new-re)) + (λ () + (set! new-state? #t) + (make-state new-re (get-state-number))))] + [new-all-states (if new-state? (cons new-state all-states) all-states)]) + (hash-set! transitions + state + (cons (cons c new-state) + (hash-ref transitions state + (λ () null)))) + (cond + [new-state? + (loop old-states (cons new-state new-states) new-all-states (cdr cs))] + [else + (loop old-states new-states new-all-states (cdr cs))]))] + [else (loop old-states new-states all-states (cdr cs))])])))) + +(define (print-dfa x) + (printf "number of states: ~a\n" (dfa-num-states x)) + (printf "start state: ~a\n" (dfa-start-state x)) + (printf "final states: ~a\n" (map car (dfa-final-states/actions x))) + (for-each (λ (trans) + (printf "state: ~a\n" (car trans)) + (for-each (λ (rule) + (printf " -~a-> ~a\n" + (is:integer-set-contents (car rule)) + (cdr rule))) + (cdr trans))) + (dfa-transitions x))) + +(define (build-test-dfa rs) + (define c (make-cache)) + (build-dfa (map (λ (x) (cons (->re x c) 'action)) rs) c)) + + +#| + (define t1 (build-test-dfa null)) + (define t2 (build-test-dfa `(#\a))) + (define t3 (build-test-dfa `(#\a #\b))) + (define t4 (build-test-dfa `((repetition 0 +inf.0 #\a) + (repetition 0 +inf.0 (concatenation #\a #\b))))) + (define t5 (build-test-dfa `((concatenation (repetition 0 +inf.0 (union #\0 #\1)) #\1)))) + (define t6 (build-test-dfa `((repetition 0 +inf.0 (repetition 0 +inf.0 #\a)) + (repetition 0 +inf.0 (concatenation #\b (repetition 1 +inf.0 #\b)))))) + (define t7 (build-test-dfa `((concatenation (repetition 0 +inf.0 #\a) (repetition 0 +inf.0 #\b) + (repetition 0 +inf.0 #\c) (repetition 0 +inf.0 #\d) + (repetition 0 +inf.0 #\e))))) + (define t8 + (build-test-dfa `((concatenation (repetition 0 +inf.0 (union #\a #\b)) #\a (union #\a #\b) + (union #\a #\b) (union #\a #\b) (union #\a #\b))))) + (define t9 (build-test-dfa `((concatenation "/*" + (complement (concatenation (intersection) "*/" (intersection))) + "*/")))) + (define t11 (build-test-dfa `((complement "1")))) + (define t12 (build-test-dfa `((concatenation (intersection (concatenation (repetition 0 +inf.0 "a") "b") + (concatenation "a" (repetition 0 +inf.0 "b"))) + "ab")))) + (define x (build-test-dfa `((union " " "\n" ",") + (concatenation (repetition 0 1 "-") (repetition 1 +inf.0 (char-range "0" "9"))) + (concatenation "-" (repetition 1 +inf.0 "-")) + "[" + "]"))) + (define y (build-test-dfa + `((repetition 1 +inf.0 + (union (concatenation "|" (repetition 0 +inf.0 (char-complement "|")) "|") + (concatenation "|" (repetition 0 +inf.0 (char-complement "|")))))))) + (define t13 (build-test-dfa `((intersection (concatenation (intersection) "111" (intersection)) + (complement (union (concatenation (intersection) "01") + (repetition 1 +inf.0 "1"))))))) + (define t14 (build-test-dfa `((complement "1"))))) + + |# diff --git a/br-parser-tools-lib/br-parser-tools/private-lex/error-tests.rkt b/br-parser-tools-lib/br-parser-tools/private-lex/error-tests.rkt new file mode 100644 index 0000000..7ee2adf --- /dev/null +++ b/br-parser-tools-lib/br-parser-tools/private-lex/error-tests.rkt @@ -0,0 +1,81 @@ +#lang racket/base +(require (for-syntax racket/base) + "../lex.rkt" + rackunit) + +(define-syntax (catch-syn-error stx) + (syntax-case stx () + [(_ arg) + (datum->syntax + #'here + (with-handlers ((exn:fail:syntax? exn-message)) + (syntax-local-expand-expression #'arg) + "not-an-error"))])) + +(check-regexp-match #rx"lex-abbrev" (catch-syn-error (define-lex-abbrev))) +(check-regexp-match #rx"lex-abbrev" (catch-syn-error (define-lex-abbrev a))) +(check-regexp-match #rx"lex-abbrev" (catch-syn-error (define-lex-abbrev (a b) v))) +(check-regexp-match #rx"lex-abbrev" (catch-syn-error (define-lex-abbrev 1 1))) +(check-regexp-match #rx"lex-abbrevs" (catch-syn-error (define-lex-abbrevs ()))) + +(check-regexp-match #rx"lex-trans" (catch-syn-error (define-lex-trans))) + +(check-regexp-match #rx"lexer" (catch-syn-error (lexer))) +(check-regexp-match #rx"lexer" (catch-syn-error (lexer ("a" "b" "c")))) +(check-regexp-match #rx"lexer" (catch-syn-error (lexer ()))) +(check-regexp-match #rx"lexer" (catch-syn-error (lexer ("")))) + +(check-regexp-match #rx"regular-expression" (catch-syn-error (lexer (a 1)))) +(check-regexp-match #rx"regular-expression" (catch-syn-error (lexer ((a) 1)))) +(check-regexp-match #rx"regular-expression" (catch-syn-error (let ((a 1)) (lexer ((a) 1))))) + +(check-regexp-match #rx"regular-expression" + (catch-syn-error (let-syntax ((a 1)) + (lexer ((a) 1))))) + +(check-regexp-match #rx"define-lex-trans" + (catch-syn-error + (let () + (define-lex-trans a 1) + (let () + (lexer ((a) 1)))))) + +;; Detecting mutual recursion cycle: +(check-regexp-match #rx"regular-expression" + (catch-syn-error + (let () + (define-lex-abbrev a b) + (define-lex-abbrev b a) + (let () + (lexer (a 1)))))) + +(check-regexp-match #rx"regular-expression" + (catch-syn-error + (let () + (define-lex-abbrev a (repetition 0 1 b)) + (define-lex-abbrev b (repetition 0 1 a)) + (let () + (lexer (a 1)))))) + +;; Detecting cycle within same abbreviation: +(check-regexp-match #rx"regular-expression" + (catch-syn-error + (let () + (define-lex-abbrev balanced + (union (concatenation "(" balanced ")" balanced) + any-char)) + (lexer + [balanced (string-append lexeme (balanced input-port))] + [(eof) ""])))) + + +(check-regexp-match #rx"regular-expression" (catch-syn-error (lexer (1 1)))) +(check-regexp-match #rx"repetition" (catch-syn-error (lexer ((repetition) 1)))) +(check-regexp-match #rx"repetition" (catch-syn-error (lexer ((repetition #\1 #\1 "3") 1)))) +(check-regexp-match #rx"repetition" (catch-syn-error (lexer ((repetition 1 #\1 "3") 1)))) +(check-regexp-match #rx"repetition" (catch-syn-error (lexer ((repetition 1 0 "3") 1)))) +(check-regexp-match #rx"complement" (catch-syn-error (lexer ((complement) 1)))) +(check-regexp-match #rx"char-range" (catch-syn-error (lexer ((char-range) 1)))) +(check-regexp-match #rx"char-range" (catch-syn-error (lexer ((char-range #\9 #\0) 1)))) +(check-regexp-match #rx"char-complement" (catch-syn-error (lexer ((char-complement) 1)))) +(check-regexp-match #rx"char-complement" (catch-syn-error (lexer ((char-complement (concatenation "1" "2")) 1)))) diff --git a/br-parser-tools-lib/br-parser-tools/private-lex/front.rkt b/br-parser-tools-lib/br-parser-tools/private-lex/front.rkt new file mode 100644 index 0000000..c34eba2 --- /dev/null +++ b/br-parser-tools-lib/br-parser-tools/private-lex/front.rkt @@ -0,0 +1,159 @@ +#lang racket/base +(require racket/base + racket/match + (prefix-in is: data/integer-set) + racket/list + syntax/stx + "util.rkt" + "stx.rkt" + "re.rkt" + "deriv.rkt") + +(provide build-lexer) + +(define-syntax time-label + (syntax-rules () + ((_ l e ...) + (begin + (printf "~a: " l) + (time (begin e ...)))))) + +;; A table is either +;; - (vector-of (union #f nat)) +;; - (vector-of (vector-of (vector nat nat nat))) + +(define loc:integer-set-contents is:integer-set-contents) + +;; dfa->1d-table : dfa -> (same as build-lexer) +(define (dfa->1d-table dfa) + (define state-table (make-vector (dfa-num-states dfa) #f)) + (define transition-cache (make-hasheq)) + (for ([trans (in-list (dfa-transitions dfa))]) + (match-define (cons from-state all-chars/to) trans) + (define flat-all-chars/to + (sort + (for*/list ([chars/to (in-list all-chars/to)] + [char-ranges (in-value (loc:integer-set-contents (car chars/to)))] + [to (in-value (cdr chars/to))] + [char-range (in-list char-ranges)]) + (define entry (vector (car char-range) (cdr char-range) to)) + (hash-ref transition-cache entry (λ () + (hash-set! transition-cache + entry + entry) + entry))) + < #:key (λ (v) (vector-ref v 0)))) + (vector-set! state-table from-state (list->vector flat-all-chars/to))) + state-table) + + +(define loc:foldr is:foldr) + +;; dfa->2d-table : dfa -> (same as build-lexer) +(define (dfa->2d-table dfa) + ;; char-table : (vector-of (union #f nat)) + ;; The lexer table, one entry per state per char. + ;; Each entry specifies a state to transition to. + ;; #f indicates no transition + (define char-table (make-vector (* 256 (dfa-num-states dfa)) #f)) + ;; Fill the char-table vector + (for* ([trans (in-list (dfa-transitions dfa))] + [chars/to (in-list (cdr trans))]) + (define from-state (car trans)) + (define to-state (cdr chars/to)) + (loc:foldr (λ (char _) + (vector-set! char-table + (bitwise-ior + char + (arithmetic-shift from-state 8)) + to-state)) + (void) + (car chars/to))) + char-table) + + +;; dfa->actions : dfa -> (vector-of (union #f syntax-object)) +;; The action for each final state, #f if the state isn't final +(define (dfa->actions dfa) + (define actions (make-vector (dfa-num-states dfa) #f)) + (for ([state/action (in-list (dfa-final-states/actions dfa))]) + (vector-set! actions (car state/action) (cdr state/action))) + actions) + +;; dfa->no-look : dfa -> (vector-of bool) +;; For each state whether the lexer can ignore the next input. +;; It can do this only if there are no transitions out of the +;; current state. +(define (dfa->no-look dfa) + (define no-look (make-vector (dfa-num-states dfa) #t)) + (for ([trans (in-list (dfa-transitions dfa))]) + (vector-set! no-look (car trans) #f)) + no-look) + +(test-block ((d1 (make-dfa 1 1 (list) (list))) + (d2 (make-dfa 4 1 (list (cons 2 2) (cons 3 3)) + (list (cons 1 (list (cons (is:make-range 49 50) 1) + (cons (is:make-range 51) 2))) + (cons 2 (list (cons (is:make-range 49) 3)))))) + (d3 (make-dfa 4 1 (list (cons 2 2) (cons 3 3)) + (list (cons 1 (list (cons (is:make-range 100 200) 0) + (cons (is:make-range 49 50) 1) + (cons (is:make-range 51) 2))) + (cons 2 (list (cons (is:make-range 49) 3))))))) + ((dfa->2d-table d1) (make-vector 256 #f)) + ((dfa->2d-table d2) (let ((v (make-vector 1024 #f))) + (vector-set! v 305 1) + (vector-set! v 306 1) + (vector-set! v 307 2) + (vector-set! v 561 3) + v)) + ((dfa->1d-table d1) (make-vector 1 #f)) + ((dfa->1d-table d2) #(#f + #(#(49 50 1) #(51 51 2)) + #(#(49 49 3)) + #f)) + ((dfa->1d-table d3) #(#f + #(#(49 50 1) #(51 51 2) #(100 200 0)) + #(#(49 49 3)) + #f)) + ((dfa->actions d1) (vector #f)) + ((dfa->actions d2) (vector #f #f 2 3)) + ((dfa->no-look d1) (vector #t)) + ((dfa->no-look d2) (vector #t #f #f #t))) + +;; build-lexer : syntax-object list -> +;; (values table nat (vector-of (union #f syntax-object)) (vector-of bool) (list-of syntax-object)) +;; each syntax object has the form (re action) +(define (build-lexer sos) + (define s-re-acts (for/list ([so (in-list sos)]) + (cons (parse (stx-car so)) + (stx-car (stx-cdr so))))) + (define cache (make-cache)) + (define re-acts (for/list ([s-re-act (in-list s-re-acts)]) + (cons (->re (car s-re-act) cache) + (cdr s-re-act)))) + (define dfa (build-dfa re-acts cache)) + (define table (dfa->1d-table dfa)) + ;(print-dfa dfa) + #;(let ((num-states (vector-length table)) + (num-vectors (length (filter values (vector->list table)))) + (num-entries (apply + (map + (λ (x) (if x (vector-length x) 0)) + (vector->list table)))) + (num-different-entries + (let ((ht (make-hash))) + (for-each + (λ (x) + (when x + (for-each + (λ (y) + (hash-set! ht y #t)) + (vector->list x)))) + (vector->list table)) + (length (hash-table-map ht cons))))) + (printf "~a states, ~aKB\n" + num-states + (/ (* 4.0 (+ 2 num-states (* 2 num-vectors) num-entries + (* 5 num-different-entries))) 1024))) + (values table (dfa-start-state dfa) (dfa->actions dfa) (dfa->no-look dfa))) + diff --git a/br-parser-tools-lib/br-parser-tools/private-lex/re.rkt b/br-parser-tools-lib/br-parser-tools/private-lex/re.rkt new file mode 100644 index 0000000..35b2ced --- /dev/null +++ b/br-parser-tools-lib/br-parser-tools/private-lex/re.rkt @@ -0,0 +1,384 @@ +#lang racket/base +(require racket/list + racket/match + (prefix-in is: data/integer-set) + "util.rkt") + +(provide ->re build-epsilon build-zero build-char-set build-concat + build-repeat build-or build-and build-neg + epsilonR? zeroR? char-setR? concatR? repeatR? orR? andR? negR? + char-setR-chars concatR-re1 concatR-re2 repeatR-re repeatR-low repeatR-high + orR-res andR-res negR-re + re-nullable? re-index) + +;; get-index : -> nat +(define get-index (make-counter)) + +;; An re is either +;; - (make-epsilonR bool nat) +;; - (make-zeroR bool nat) +;; - (make-char-setR bool nat char-set) +;; - (make-concatR bool nat re re) +;; - (make-repeatR bool nat nat nat-or-+inf.0 re) +;; - (make-orR bool nat (list-of re)) Must not directly contain any orRs +;; - (make-andR bool nat (list-of re)) Must not directly contain any andRs +;; - (make-negR bool nat re) +;; +;; Every re must have an index field globally different from all +;; other re index fields. +(define-struct re (nullable? index) #:inspector (make-inspector)) +(define-struct (epsilonR re) () #:inspector (make-inspector)) +(define-struct (zeroR re) () #:inspector (make-inspector)) +(define-struct (char-setR re) (chars) #:inspector (make-inspector)) +(define-struct (concatR re) (re1 re2) #:inspector (make-inspector)) +(define-struct (repeatR re) (low high re) #:inspector (make-inspector)) +(define-struct (orR re) (res) #:inspector (make-inspector)) +(define-struct (andR re) (res) #:inspector (make-inspector)) +(define-struct (negR re) (re) #:inspector (make-inspector)) + +;; e : re +;; The unique epsilon re +(define e (make-epsilonR #t (get-index))) + +;; z : re +;; The unique zero re +(define z (make-zeroR #f (get-index))) + + +;; s-re = char constant +;; | string constant (sequence of characters) +;; | re a precompiled re +;; | (repetition low high s-re) repetition between low and high times (inclusive) +;; | (union s-re ...) +;; | (intersection s-re ...) +;; | (complement s-re) +;; | (concatenation s-re ...) +;; | (char-range rng rng) match any character between two (inclusive) +;; | (char-complement char-set) match any character not listed +;; low = natural-number +;; high = natural-number or +inf.0 +;; rng = char or string with length 1 +;; (concatenation) (repetition 0 0 x), and "" match the empty string. +;; (union) matches no strings. +;; (intersection) matches any string. + +(define loc:make-range is:make-range) +(define loc:union is:union) +(define loc:split is:split) +(define loc:complement is:complement) + +;; ->re : s-re cache -> re +(define (->re exp cache) + (match exp + [(? char?) (build-char-set (loc:make-range (char->integer exp)) cache)] + [(? string?) (->re `(concatenation ,@(string->list exp)) cache)] + [(? re?) exp] + [`(repetition ,low ,high ,r) + (build-repeat low high (->re r cache) cache)] + [`(union ,rs ...) + (build-or (flatten-res (map (λ (r) (->re r cache)) rs) + orR? orR-res loc:union cache) + cache)] + [`(intersection ,rs ...) + (build-and (flatten-res (map (λ (r) (->re r cache)) rs) + andR? andR-res (λ (a b) + (let-values (((i _ __) (loc:split a b))) i)) + cache) + cache)] + [`(complement ,r) (build-neg (->re r cache) cache)] + [`(concatenation ,rs ...) + (foldr (λ (x y) + (build-concat (->re x cache) y cache)) + e + rs)] + [`(char-range ,c1 ,c2) + (let ([i1 (char->integer (if (string? c1) (string-ref c1 0) c1))] + [i2 (char->integer (if (string? c2) (string-ref c2 0) c2))]) + (if (<= i1 i2) + (build-char-set (loc:make-range i1 i2) cache) + z))] + [`(char-complement ,crs ...) + (let ([cs (->re `(union ,@crs) cache)]) + (cond + [(zeroR? cs) (build-char-set (loc:make-range 0 max-char-num) cache)] + [(char-setR? cs) + (build-char-set (loc:complement (char-setR-chars cs) 0 max-char-num) cache)] + [else z]))])) + + + + +;; flatten-res: (list-of re) (re -> bool) (re -> (list-of re)) +;; (char-set char-set -> char-set) cache -> (list-of re) +;; Takes all the char-sets in l and combines them into one char-set using the combine function. +;; Flattens out the values of type?. get-res only needs to function on things type? returns +;; true for. +(define (flatten-res l type? get-res combine cache) + (let loop ([res l] + ;; chars : (union #f char-set) + [chars #f] + [no-chars null]) + (cond + [(null? res) + (if chars + (cons (build-char-set chars cache) no-chars) + no-chars)] + [(char-setR? (car res)) + (if chars + (loop (cdr res) (combine (char-setR-chars (car res)) chars) no-chars) + (loop (cdr res) (char-setR-chars (car res)) no-chars))] + [(type? (car res)) + (loop (append (get-res (car res)) (cdr res)) chars no-chars)] + [else (loop (cdr res) chars (cons (car res) no-chars))]))) + +;; build-epsilon : -> re +(define (build-epsilon) e) + +(define (build-zero) z) + +(define loc:integer-set-contents is:integer-set-contents) + +;; build-char-set : char-set cache -> re +(define (build-char-set cs cache) + (define l (loc:integer-set-contents cs)) + (cond + [(null? l) z] + [else + (cache l + (λ () + (make-char-setR #f (get-index) cs)))])) + + + +;; build-concat : re re cache -> re +(define (build-concat r1 r2 cache) + (cond + [(eq? e r1) r2] + [(eq? e r2) r1] + [(or (eq? z r1) (eq? z r2)) z] + [else + (cache (cons 'concat (cons (re-index r1) (re-index r2))) + (λ () + (make-concatR (and (re-nullable? r1) (re-nullable? r2)) + (get-index) + r1 r2)))])) + +;; build-repeat : nat nat-or-+inf.0 re cache -> re +(define (build-repeat low high r cache) + (let ([low (if (< low 0) 0 low)]) + (cond + [(eq? r e) e] + [(and (= 0 low) (or (= 0 high) (eq? z r))) e] + [(and (= 1 low) (= 1 high)) r] + [(and (repeatR? r) + (eqv? (repeatR-high r) +inf.0) + (or (= 0 (repeatR-low r)) + (= 1 (repeatR-low r)))) + (build-repeat (* low (repeatR-low r)) + +inf.0 + (repeatR-re r) + cache)] + [else + (cache (cons 'repeat (cons low (cons high (re-index r)))) + (λ () + (make-repeatR (or (re-nullable? r) (= 0 low)) (get-index) low high r)))]))) + + +;; build-or : (list-of re) cache -> re +(define (build-or rs cache) + (let ([rs + (filter + (λ (x) (not (eq? x z))) + (do-simple-equiv (replace rs orR? orR-res null) re-index))]) + (cond + [(null? rs) z] + [(null? (cdr rs)) (car rs)] + [(memq (build-neg z cache) rs) (build-neg z cache)] + [else + (cache (cons 'or (map re-index rs)) + (λ () + (make-orR (ormap re-nullable? rs) (get-index) rs)))]))) + +;; build-and : (list-of re) cache -> re +(define (build-and rs cache) + (let ([rs (do-simple-equiv (replace rs andR? andR-res null) re-index)]) + (cond + [(null? rs) (build-neg z cache)] + [(null? (cdr rs)) (car rs)] + [(memq z rs) z] + [else + (cache (cons 'and (map re-index rs)) + (λ () + (make-andR (andmap re-nullable? rs) (get-index) rs)))]))) + +;; build-neg : re cache -> re +(define (build-neg r cache) + (cond + [(negR? r) (negR-re r)] + [else + (cache (cons 'neg (re-index r)) + (λ () + (make-negR (not (re-nullable? r)) (get-index) r)))])) + +;; Tests for the build-functions +(test-block ((c (make-cache)) + (isc is:integer-set-contents) + (r1 (build-char-set (is:make-range (char->integer #\1)) c)) + (r2 (build-char-set (is:make-range (char->integer #\2)) c)) + (r3 (build-char-set (is:make-range (char->integer #\3)) c)) + (rc (build-concat r1 r2 c)) + (rc2 (build-concat r2 r1 c)) + (rr (build-repeat 0 +inf.0 rc c)) + (ro (build-or `(,rr ,rc ,rr) c)) + (ro2 (build-or `(,rc ,rr ,z) c)) + (ro3 (build-or `(,rr ,rc) c)) + (ro4 (build-or `(,(build-or `(,r1 ,r2) c) + ,(build-or `(,r2 ,r3) c)) c)) + (ra (build-and `(,rr ,rc ,rr) c)) + (ra2 (build-and `(,rc ,rr) c)) + (ra3 (build-and `(,rr ,rc) c)) + (ra4 (build-and `(,(build-and `(,r3 ,r2) c) + ,(build-and `(,r2 ,r1) c)) c)) + (rn (build-neg z c)) + (rn2 (build-neg r1 c))) + + ((isc (char-setR-chars r1)) (isc (is:make-range (char->integer #\1)))) + ((isc (char-setR-chars r2)) (isc (is:make-range (char->integer #\2)))) + ((isc (char-setR-chars r3)) (isc (is:make-range (char->integer #\3)))) + ((build-char-set (is:make-range) c) z) + ((build-concat r1 e c) r1) + ((build-concat e r1 c) r1) + ((build-concat r1 z c) z) + ((build-concat z r1 c) z) + ((build-concat r1 r2 c) rc) + ((concatR-re1 rc) r1) + ((concatR-re2 rc) r2) + ((concatR-re1 rc2) r2) + ((concatR-re2 rc2) r1) + (ro ro2) + (ro ro3) + (ro4 (build-or `(,r1 ,r2 ,r3) c)) + ((orR-res ro) (list rc rr)) + ((orR-res ro4) (list r1 r2 r3)) + ((build-or null c) z) + ((build-or `(,r1 ,z) c) r1) + ((build-repeat 0 +inf.0 rc c) rr) + ((build-repeat 0 1 z c) e) + ((build-repeat 0 0 rc c) e) + ((build-repeat 0 +inf.0 z c) e) + ((build-repeat -1 +inf.0 z c) e) + ((build-repeat 0 +inf.0 (build-repeat 0 +inf.0 rc c) c) + (build-repeat 0 +inf.0 rc c)) + ((build-repeat 20 20 (build-repeat 0 +inf.0 rc c) c) + (build-repeat 0 +inf.0 rc c)) + ((build-repeat 20 20 (build-repeat 1 +inf.0 rc c) c) + (build-repeat 20 +inf.0 rc c)) + ((build-repeat 1 1 rc c) rc) + ((repeatR-re rr) rc) + (ra ra2) + (ra ra3) + (ra4 (build-and `(,r1 ,r2 ,r3) c)) + ((andR-res ra) (list rc rr)) + ((andR-res ra4) (list r1 r2 r3)) + ((build-and null c) (build-neg z c)) + ((build-and `(,r1 ,z) c) z) + ((build-and `(,r1) c) r1) + ((build-neg r1 c) (build-neg r1 c)) + ((build-neg (build-neg r1 c) c) r1) + ((negR-re (build-neg r2 c)) r2) + ((re-nullable? r1) #f) + ((re-nullable? rc) #f) + ((re-nullable? (build-concat rr rr c)) #t) + ((re-nullable? rr) #t) + ((re-nullable? (build-repeat 0 1 rc c)) #t) + ((re-nullable? (build-repeat 1 2 rc c)) #f) + ((re-nullable? (build-repeat 1 2 (build-or (list e r1) c) c)) #t) + ((re-nullable? ro) #t) + ((re-nullable? (build-or `(,r1 ,r2) c)) #f) + ((re-nullable? (build-and `(,r1 ,e) c)) #f) + ((re-nullable? (build-and `(,rr ,e) c)) #t) + ((re-nullable? (build-neg r1 c)) #t) + ((re-nullable? (build-neg rr c)) #f)) + +(test-block ((c (make-cache)) + (isc is:integer-set-contents) + (r1 (->re #\1 c)) + (r2 (->re #\2 c)) + (r3-5 (->re '(char-range #\3 #\5) c)) + (r4 (build-or `(,r1 ,r2) c)) + (r5 (->re `(union ,r3-5 #\7) c)) + (r6 (->re #\6 c))) + ((flatten-res null orR? orR-res is:union c) null) + ((isc (char-setR-chars (car (flatten-res `(,r1) orR? orR-res is:union c)))) + (isc (is:make-range (char->integer #\1)))) + ((isc (char-setR-chars (car (flatten-res `(,r4) orR? orR-res is:union c)))) + (isc (is:make-range (char->integer #\1) (char->integer #\2)))) + ((isc (char-setR-chars (car (flatten-res `(,r6 ,r5 ,r4 ,r3-5 ,r2 ,r1) + orR? orR-res is:union c)))) + (isc (is:make-range (char->integer #\1) (char->integer #\7)))) + ((flatten-res `(,r1 ,r2) andR? andR-res (λ (x y) + (let-values (((i _ __) + (is:split x y))) + i)) + c) + (list z))) + +;; ->re +(test-block ((c (make-cache)) + (isc is:integer-set-contents) + (r (->re #\a c)) + (rr (->re `(concatenation ,r ,r) c)) + (rrr (->re `(concatenation ,r ,rr) c)) + (rrr* (->re `(repetition 0 +inf.0 ,rrr) c))) + ((isc (char-setR-chars r)) (isc (is:make-range (char->integer #\a)))) + ((->re "" c) e) + ((->re "asdf" c) (->re `(concatenation #\a #\s #\d #\f) c)) + ((->re r c) r) + ((->re `(repetition 0 +inf.0 ,r) c) (build-repeat 0 +inf.0 r c)) + ((->re `(repetition 1 +inf.0 ,r) c) (build-repeat 1 +inf.0 r c)) + ((->re `(repetition 0 1 ,r) c) (build-repeat 0 1 r c)) + ((->re `(repetition 0 1 ,rrr*) c) rrr*) + ((->re `(union (union (char-range #\a #\c) + (char-complement (char-range #\000 #\110) + (char-range #\112 ,(integer->char max-char-num)))) + (union (repetition 0 +inf.0 #\2))) c) + (build-or (list (build-char-set (is:union (is:make-range 73) + (is:make-range 97 99)) + c) + (build-repeat 0 +inf.0 (build-char-set (is:make-range 50) c) c)) + c)) + ((->re `(union ,rr ,rrr) c) (build-or (list rr rrr) c)) + ((->re `(union ,r) c) r) + ((->re `(union) c) z) + ((->re `(intersection (intersection #\111 + (char-complement (char-range #\000 #\110) + (char-range #\112 ,(integer->char max-char-num)))) + (intersection (repetition 0 +inf.0 #\2))) c) + (build-and (list (build-char-set (is:make-range 73) c) + (build-repeat 0 +inf.0 (build-char-set (is:make-range 50) c) c)) + c)) + ((->re `(intersection (intersection #\000 (char-complement (char-range #\000 #\110) + (char-range #\112 ,(integer->char max-char-num)))) + (intersection (repetition 0 +inf.0 #\2))) c) + z) + ((->re `(intersection ,rr ,rrr) c) (build-and (list rr rrr) c)) + ((->re `(intersection ,r) c) r) + ((->re `(intersection) c) (build-neg z c)) + ((->re `(complement ,r) c) (build-neg r c)) + ((->re `(concatenation) c) e) + ((->re `(concatenation ,rrr*) c) rrr*) + (rr (build-concat r r c)) + ((->re `(concatenation ,r ,rr ,rrr) c) + (build-concat r (build-concat rr rrr c) c)) + ((isc (char-setR-chars (->re `(char-range #\1 #\1) c))) (isc (is:make-range 49))) + ((isc (char-setR-chars (->re `(char-range #\1 #\9) c))) (isc (is:make-range 49 57))) + ((isc (char-setR-chars (->re `(char-range "1" "1") c))) (isc (is:make-range 49))) + ((isc (char-setR-chars (->re `(char-range "1" "9") c))) (isc (is:make-range 49 57))) + ((->re `(char-range "9" "1") c) z) + ((isc (char-setR-chars (->re `(char-complement) c))) + (isc (char-setR-chars (->re `(char-range #\000 ,(integer->char max-char-num)) c)))) + ((isc (char-setR-chars (->re `(char-complement #\001 (char-range #\002 ,(integer->char max-char-num))) c))) + (isc (is:make-range 0))) + ) + + diff --git a/br-parser-tools-lib/br-parser-tools/private-lex/stx.rkt b/br-parser-tools-lib/br-parser-tools/private-lex/stx.rkt new file mode 100644 index 0000000..6b5ab9a --- /dev/null +++ b/br-parser-tools-lib/br-parser-tools/private-lex/stx.rkt @@ -0,0 +1,183 @@ +#lang racket/base +(require "util.rkt" syntax/id-table racket/syntax) +(provide parse) + +(define (bad-args stx num) + (raise-syntax-error #f (format "incorrect number of arguments (should have ~a)" num) stx)) + +;; char-range-arg: syntax-object syntax-object -> nat +;; If c contains is a character or length 1 string, returns the integer +;; for the character. Otherwise raises a syntax error. +(define (char-range-arg stx containing-stx) + (define c (syntax-e stx)) + (cond + [(char? c) (char->integer c)] + [(and (string? c) (= (string-length c) 1)) + (char->integer (string-ref c 0))] + [else + (raise-syntax-error + #f + "not a char or single-char string" + containing-stx stx)])) + +(module+ test + (check-equal? (char-range-arg #'#\1 #'here) (char->integer #\1)) + (check-equal? (char-range-arg #'"1" #'here) (char->integer #\1))) + +(define orig-insp (variable-reference->module-declaration-inspector + (#%variable-reference))) +(define (disarm stx) + (syntax-disarm stx orig-insp)) + +;; parse : syntax-object (box (list-of syntax-object)) -> s-re (see re.rkt) +;; checks for errors and generates the plain s-exp form for s +;; Expands lex-abbrevs and applies lex-trans. +(define (parse stx) + (let loop ([stx stx] + ;; seen-lex-abbrevs: id-table + [seen-lex-abbrevs (make-immutable-free-id-table)]) + (let ([recur (λ (s) + (loop (syntax-rearm s stx) + seen-lex-abbrevs))] + [recur/abbrev (λ (s id) + (loop (syntax-rearm s stx) + (free-id-table-set seen-lex-abbrevs id id)))]) + (syntax-case (disarm stx) (repetition union intersection complement concatenation + char-range char-complement) + [_ + (identifier? stx) + (let ([expansion (syntax-local-value/record stx (λ (v) #t))]) + (unless (lex-abbrev? expansion) + (raise-syntax-error 'regular-expression + "undefined abbreviation" + stx)) + ;; Check for cycles. + (when (free-id-table-ref seen-lex-abbrevs stx (λ () #f)) + (raise-syntax-error 'regular-expression + "illegal lex-abbrev cycle detected" + stx + #f + (list (free-id-table-ref seen-lex-abbrevs stx)))) + (recur/abbrev ((lex-abbrev-get-abbrev expansion)) stx))] + [_ + (or (char? (syntax-e stx)) (string? (syntax-e stx))) + (syntax-e stx)] + [(repetition ARG ...) + (let ([arg-list (syntax->list #'(ARG ...))]) + (unless (= 3 (length arg-list)) + (bad-args stx 2)) + (define low (syntax-e (car arg-list))) + (define high (syntax-e (cadr arg-list))) + (define re (caddr arg-list)) + (unless (and (number? low) (exact? low) (integer? low) (>= low 0)) + (raise-syntax-error #f "not a non-negative exact integer" stx (car arg-list))) + (unless (or (and (number? high) (exact? high) (integer? high) (>= high 0)) + (eqv? high +inf.0)) + (raise-syntax-error #f "not a non-negative exact integer or +inf.0" stx (cadr arg-list))) + (unless (<= low high) + (raise-syntax-error #f "the first argument is not less than or equal to the second argument" stx)) + `(repetition ,low ,high ,(recur re)))] + [(union RE ...) + `(union ,@(map recur (syntax->list #'(RE ...))))] + [(intersection RE ...) + `(intersection ,@(map recur (syntax->list #'(RE ...))))] + [(complement RE ...) + (let ([re-list (syntax->list #'(RE ...))]) + (unless (= 1 (length re-list)) + (bad-args stx 1)) + `(complement ,(recur (car re-list))))] + [(concatenation RE ...) + `(concatenation ,@(map recur (syntax->list #'(RE ...))))] + [(char-range ARG ...) + (let ((arg-list (syntax->list #'(ARG ...)))) + (unless (= 2 (length arg-list)) + (bad-args stx 2)) + (let ([i1 (char-range-arg (car arg-list) stx)] + [i2 (char-range-arg (cadr arg-list) stx)]) + (if (<= i1 i2) + `(char-range ,(integer->char i1) ,(integer->char i2)) + (raise-syntax-error #f "the first argument does not precede or equal second argument" stx))))] + [(char-complement ARG ...) + (let ([arg-list (syntax->list #'(ARG ...))]) + (unless (= 1 (length arg-list)) + (bad-args stx 1)) + (define parsed (recur (car arg-list))) + (unless (char-set? parsed) + (raise-syntax-error #f "not a character set" stx (car arg-list))) + `(char-complement ,parsed))] + ((OP form ...) + (identifier? #'OP) + (let* ([expansion (syntax-local-value/record #'OP (λ (v) #t))]) + (cond + [(lex-trans? expansion) + (recur ((lex-trans-f expansion) (disarm stx)))] + [expansion + (raise-syntax-error 'regular-expression "not a lex-trans" stx)] + [else + (raise-syntax-error 'regular-expression "undefined operator" stx)]))) + [_ (raise-syntax-error 'regular-expression "not a char, string, identifier, or (op args ...)" stx)])))) + + + +;; char-set? : s-re -> bool +;; A char-set is an re that matches only strings of length 1. +;; char-set? is conservative. +(define (char-set? s-re) + (cond + [(char? s-re)] + [(string? s-re) (= (string-length s-re) 1)] + [(list? s-re) (case (car s-re) + [(union intersection) (andmap char-set? (cdr s-re))] + [(char-range char-complement) #t] + [(repetition) (and (= 1 (cadr s-re) (caddr s-re)) (char-set? (cadddr s-re)))] + [(concatenation) (and (= 2 (length s-re)) (char-set? (cadr s-re)))] + (else #f))] + [else #f])) + +(module+ test + (require rackunit) + (check-equal? (char-set? #\a) #t) + (check-equal? (char-set? "12") #f) + (check-equal? (char-set? "1") #t) + (check-equal? (char-set? '(repetition 1 2 #\1)) #f) + (check-equal? (char-set? '(repetition 1 1 "12")) #f) + (check-equal? (char-set? '(repetition 1 1 "1")) #t) + (check-equal? (char-set? '(repetition 6 6 "1")) #f) + (check-equal? (char-set? '(union "1" "2" "3")) #t) + (check-equal? (char-set? '(union "1" "" "3")) #f) + (check-equal? (char-set? '(intersection "1" "2" (union "3" "4"))) #t) + (check-equal? (char-set? '(intersection "1" "")) #f) + (check-equal? (char-set? '(complement "1")) #f) + (check-equal? (char-set? '(concatenation "1" "2")) #f) + (check-equal? (char-set? '(concatenation "" "2")) #f) + (check-equal? (char-set? '(concatenation "1")) #t) + (check-equal? (char-set? '(concatenation "12")) #f) + (check-equal? (char-set? '(char-range #\1 #\2)) #t) + (check-equal? (char-set? '(char-complement #\1)) #t)) + +;; yikes... these test cases all have the wrong arity, now. +;; and by "now", I mean it's been broken since before we +;; moved to git. +(module+ test + (check-equal? (parse #'#\a) #\a) + (check-equal? (parse #'"1") "1") + (check-equal? (parse #'(repetition 1 1 #\1)) + '(repetition 1 1 #\1)) + (check-equal? (parse #'(repetition 0 +inf.0 #\1)) '(repetition 0 +inf.0 #\1)) + (check-equal? (parse #'(union #\1 (union "2") (union))) + '(union #\1 (union "2") (union))) + (check-equal? (parse #'(intersection #\1 (intersection "2") (intersection))) + '(intersection #\1 (intersection "2") (intersection))) + (check-equal? (parse #'(complement (union #\1 #\2))) + '(complement (union #\1 #\2))) + (check-equal? (parse #'(concatenation "1" "2" (concatenation))) + '(concatenation "1" "2" (concatenation))) + (check-equal? (parse #'(char-range "1" #\1)) '(char-range #\1 #\1)) + (check-equal? (parse #'(char-range #\1 "1")) '(char-range #\1 #\1)) + (check-equal? (parse #'(char-range "1" "3")) '(char-range #\1 #\3)) + (check-equal? (parse #'(char-complement (union "1" "2"))) + '(char-complement (union "1" "2"))) + (check-equal? (parse #'(char-complement (repetition 1 1 "1"))) + '(char-complement (repetition 1 1 "1"))) + (check-exn #rx"not a character set" + (λ () (parse #'(char-complement (repetition 6 6 "1")))))) diff --git a/br-parser-tools-lib/br-parser-tools/private-lex/token-syntax.rkt b/br-parser-tools-lib/br-parser-tools/private-lex/token-syntax.rkt new file mode 100644 index 0000000..ccb55cf --- /dev/null +++ b/br-parser-tools-lib/br-parser-tools/private-lex/token-syntax.rkt @@ -0,0 +1,7 @@ +#lang racket/base +(provide make-terminals-def terminals-def-t terminals-def? + make-e-terminals-def e-terminals-def-t e-terminals-def?) + +;; The things needed at compile time to handle definition of tokens +(define-struct terminals-def (t)) +(define-struct e-terminals-def (t)) diff --git a/br-parser-tools-lib/br-parser-tools/private-lex/token.rkt b/br-parser-tools-lib/br-parser-tools/private-lex/token.rkt new file mode 100644 index 0000000..07e1fa5 --- /dev/null +++ b/br-parser-tools-lib/br-parser-tools/private-lex/token.rkt @@ -0,0 +1,80 @@ +#lang racket/base +(require (for-syntax racket/base "token-syntax.rkt")) + +;; Defining tokens + +(provide define-tokens define-empty-tokens make-token token? + (protect-out (rename-out [token-name real-token-name])) + (protect-out (rename-out [token-value real-token-value])) + (rename-out [token-name* token-name][token-value* token-value]) + (struct-out position) + (struct-out position-token) + (struct-out srcloc-token)) + + +;; A token is either +;; - symbol +;; - (make-token symbol any) +(define-struct token (name value) #:inspector (make-inspector)) + +;; token-name*: token -> symbol +(define (token-name* t) + (cond + [(symbol? t) t] + [(token? t) (token-name t)] + [else (raise-type-error 'token-name "symbol or struct:token" 0 t)])) + +;; token-value*: token -> any +(define (token-value* t) + (cond + [(symbol? t) #f] + [(token? t) (token-value t)] + [else (raise-type-error 'token-value "symbol or struct:token" 0 t)])) + +(define-for-syntax (make-ctor-name n) + (datum->syntax n + (string->symbol (format "token-~a" (syntax-e n))) + n + n)) + +(define-for-syntax ((make-define-tokens empty?) stx) + (syntax-case stx () + [(_ NAME (TOKEN ...)) + (andmap identifier? (syntax->list #'(TOKEN ...))) + (with-syntax (((marked-token ...) + (map values #;(make-syntax-introducer) + (syntax->list #'(TOKEN ...))))) + (quasisyntax/loc stx + (begin + (define-syntax NAME + #,(if empty? + #'(make-e-terminals-def (quote-syntax (marked-token ...))) + #'(make-terminals-def (quote-syntax (marked-token ...))))) + #,@(map + (λ (n) + (when (eq? (syntax-e n) 'error) + (raise-syntax-error + #f + "Cannot define a token named error." + stx)) + (if empty? + #`(define (#,(make-ctor-name n)) + '#,n) + #`(define (#,(make-ctor-name n) x) + (make-token '#,n x)))) + (syntax->list #'(TOKEN ...))) + #;(define marked-token #f) #;...)))] + [(_ ...) + (raise-syntax-error #f + "must have the form (define-tokens name (identifier ...)) or (define-empty-tokens name (identifier ...))" + stx)])) + +(define-syntax define-tokens (make-define-tokens #f)) +(define-syntax define-empty-tokens (make-define-tokens #t)) + +(define-struct position (offset line col) #:inspector #f) +(define-struct position-token (token start-pos end-pos) #:inspector #f) + +(define-struct srcloc-token (token srcloc) #:inspector #f) + + diff --git a/br-parser-tools-lib/br-parser-tools/private-lex/unicode-chars.rkt b/br-parser-tools-lib/br-parser-tools/private-lex/unicode-chars.rkt new file mode 100644 index 0000000..d8580b6 --- /dev/null +++ b/br-parser-tools-lib/br-parser-tools/private-lex/unicode-chars.rkt @@ -0,0 +1,65 @@ +#lang racket/base +(require racket/promise "util.rkt") + +(provide (all-defined-out)) + +;; mapped-chars : (listof (list nat nat bool)) +(define mapped-chars (make-known-char-range-list)) + +;; get-chars-for-x : (nat -> bool) (listof (list nat nat bool)) -> (listof (cons nat nat)) +(define (get-chars-for char-x? mapped-chars) + (cond + [(null? mapped-chars) null] + [else + (define range (car mapped-chars)) + (define low (car range)) + (define high (cadr range)) + (define x (char-x? low)) + (cond + [(caddr range) + (if x + (cons (cons low high) (get-chars-for char-x? (cdr mapped-chars))) + (get-chars-for char-x? (cdr mapped-chars)))] + [else + (let loop ([range-start low] + [i (car range)] + [parity x]) + (cond + [(> i high) + (if parity + (cons (cons range-start high) (get-chars-for char-x? (cdr mapped-chars))) + (get-chars-for char-x? (cdr mapped-chars)))] + [(eq? parity (char-x? i)) + (loop range-start (add1 i) parity)] + [parity (cons (cons range-start (sub1 i)) (loop i (add1 i) #f))] + [else (loop i (add1 i) #t)]))])])) + +(define (compute-ranges x?) + (delay (get-chars-for (λ (x) (x? (integer->char x))) mapped-chars))) + +(define alphabetic-ranges (compute-ranges char-alphabetic?)) ;; 325 +(define lower-case-ranges (compute-ranges char-lower-case?)) ;; 405 +(define upper-case-ranges (compute-ranges char-upper-case?)) ;; 380 +(define title-case-ranges (compute-ranges char-title-case?)) ;; 10 +(define numeric-ranges (compute-ranges char-numeric?)) ;; 47 +(define symbolic-ranges (compute-ranges char-symbolic?)) ;; 153 +(define punctuation-ranges (compute-ranges char-punctuation?)) ;; 86 +(define graphic-ranges (compute-ranges char-graphic?)) ;; 401 +(define whitespace-ranges (compute-ranges char-whitespace?)) ;; 10 +(define blank-ranges (compute-ranges char-blank?)) ;; 9 +#;(define hexadecimal-ranges (compute-ranges char-hexadecimal?)) +(define iso-control-ranges (compute-ranges char-iso-control?)) ;; 2 + + + +(module+ test + (require rackunit) + (check-equal? (get-chars-for odd? '()) '()) + (check-equal? (get-chars-for odd? '((1 4 #f) (8 13 #f))) + '((1 . 1) (3 . 3) (9 . 9) (11 . 11) (13 . 13))) + (check-equal? (get-chars-for (λ (x) + (odd? (quotient x 10))) + '((1 5 #t) (17 19 #t) (21 51 #f))) + '((17 . 19) (30 . 39) (50 . 51)))) + + diff --git a/br-parser-tools-lib/br-parser-tools/private-lex/util.rkt b/br-parser-tools-lib/br-parser-tools/private-lex/util.rkt new file mode 100644 index 0000000..bbcb447 --- /dev/null +++ b/br-parser-tools-lib/br-parser-tools/private-lex/util.rkt @@ -0,0 +1,127 @@ +#lang racket/base +(require (for-syntax racket/base)) + +(provide (all-defined-out)) + +(define max-char-num #x10FFFF) + +(define-struct lex-abbrev (get-abbrev)) +(define-struct lex-trans (f)) + +(module+ test + (require rackunit)) + +(define-syntax (test-block stx) + (syntax-case stx () + [(_ defs (code right-ans) ...) + #'(module+ test + (require rackunit) + (let* defs + (let ([real-ans code]) + (check-equal? real-ans right-ans)) ...))])) + +#;(define-syntax test-block + (syntax-rules () + ((_ x ...) (void)))) + + +;; A cache is (X ( -> Y) -> Y) +;; make-cache : -> cache +;; table map Xs to Ys. If key is mapped, its value is returned. +;; Otherwise, build is invoked and its result is placed in the table and +;; returned. +;; Xs are compared with equal? +(define (make-cache) + (let ([table (make-hash)]) + (λ (key build) + (hash-ref table key (λ () + (let ([new (build)]) + (hash-set! table key new) + new)))))) + +(module+ test + (define cache (make-cache)) + (check-equal? (cache '(s 1 2) (λ () 9)) 9) + (check-equal? (cache '(s 2 1) (λ () 8)) 8) + (check-equal? (cache '(s 1 2) (λ () 1)) 9) + (check-equal? (cache (cons 's (cons 0 (cons +inf.0 10))) + (λ () 22)) 22) + (check-equal? (cache (cons 's (cons 0 (cons +inf.0 10))) + (λ () 1)) 22)) + + + +;; make-counter : -> -> nat +;; makes a function that returns a higher number by 1, each time +;; it is called. +(define (make-counter) + (let ([counter 0]) + (λ () + (begin0 + counter + (set! counter (add1 counter)))))) + +(module+ test + (define c (make-counter)) + (define d (make-counter)) + (check-equal? (c) 0) + (check-equal? (d) 0) + (check-equal? (c) 1) + (check-equal? (d) 1) + (check-equal? (c) 2)) + + +;; remove-dups : (list-of X) (X -> number) -> (list-of X) +;; removes the entries from l that have the same index as a +;; previous entry. l must be grouped by indexes. +(define (remove-dups l index acc) + (cond + [(null? l) (reverse acc)] + [(null? acc) (remove-dups (cdr l) index (cons (car l) acc))] + [(= (index (car acc)) (index (car l))) + (remove-dups (cdr l) index acc)] + [else + (remove-dups (cdr l) index (cons (car l) acc))])) + + +(module+ test + (check-equal? (remove-dups '((1 2) (2 2) (1 3) (1 4) + (100 4) (0 5)) cadr null) + '((1 2) (1 3) (1 4) (0 5))) + (check-equal? (remove-dups null error null) null)) + +;; do-simple-equiv : (list-of X) (X -> nat) -> (list-of X) +;; Sorts l according to index and removes the entries with duplicate +;; indexes. +(define (do-simple-equiv l index) + (define ordered (sort l (λ (a b) (< (index a) (index b))))) + (remove-dups ordered index null)) + +(module+ test + (check-equal? (do-simple-equiv '((2 2) (1 4) (1 2) + (100 4) (1 3) (0 5)) + cadr) + '((2 2) (1 3) (1 4) (0 5))) + (check-equal? (do-simple-equiv null error) null)) + +;; replace : (list-of X) (X -> bool) (X -> (list-of X)) (list-of X) -> +;; (list-of X) +;; If (pred? r) for some r in l, splice (get r) in place of r in the resulting +;; list. +(define (replace l pred? get acc) + (cond + [(null? l) acc] + [(pred? (car l)) (replace (cdr l) pred? get (append (get (car l)) acc))] + [else (replace (cdr l) pred? get (cons (car l) acc))])) + + +(module+ test + (check-equal? (replace null void (λ () (list 1)) null) null) + (check-equal? (replace '(1 2 3 4 3 5) + (λ (x) (= x 3)) + (λ (x) (list 1 2 3)) + null) + '(5 1 2 3 4 1 2 3 2 1))) + + + diff --git a/br-parser-tools-lib/br-parser-tools/private-yacc/grammar.rkt b/br-parser-tools-lib/br-parser-tools/private-yacc/grammar.rkt new file mode 100644 index 0000000..07aea77 --- /dev/null +++ b/br-parser-tools-lib/br-parser-tools/private-yacc/grammar.rkt @@ -0,0 +1,250 @@ +#lang racket/base +;; Constructs to create and access grammars, the internal +;; representation of the input to the parser generator. + +(require racket/class + (except-in racket/list remove-duplicates) + "yacc-helper.rkt" + racket/contract) + +;; Each production has a unique index 0 <= index <= number of productions +(define-struct prod (lhs rhs index prec action) #:inspector (make-inspector) #:mutable) + +;; The dot-pos field is the index of the element in the rhs +;; of prod that the dot immediately precedes. +;; Thus 0 <= dot-pos <= (vector-length rhs). +(define-struct item (prod dot-pos) #:inspector (make-inspector)) + +;; gram-sym = (union term? non-term?) +;; Each term has a unique index 0 <= index < number of terms +;; Each non-term has a unique index 0 <= index < number of non-terms +(define-struct term (sym index prec) #:inspector (make-inspector) #:mutable) +(define-struct non-term (sym index) #:inspector (make-inspector) #:mutable) + +;; a precedence declaration. +(define-struct prec (num assoc) #:inspector (make-inspector)) + +(provide/contract + [make-item (prod? (or/c #f natural-number/c) . -> . item?)] + [make-term (symbol? (or/c #f natural-number/c) (or/c prec? #f) . -> . term?)] + [make-non-term (symbol? (or/c #f natural-number/c) . -> . non-term?)] + [make-prec (natural-number/c (or/c 'left 'right 'nonassoc) . -> . prec?)] + [make-prod (non-term? (vectorof (or/c non-term? term?)) + (or/c #f natural-number/c) (or/c #f prec?) syntax? . -> . prod?)]) + +(provide + ;; Things that work on items + start-item? item-prod item->string + sym-at-dot move-dot-right itemstring + non-term? term? non-termbit-vector term-index non-term-index + + ;; Things that work on precs + prec-num prec-assoc + + grammar% + + ;; Things that work on productions + prod-index prod-prec prod-rhs prod-lhs prod-action) + + +;;---------------------- LR items -------------------------- + +;; item bool +;; Lexicographic comparison on two items. +(define (item bool +;; The start production always has index 0 +(define (start-item? i) + (zero? (non-term-index (prod-lhs (item-prod i))))) + + +;; move-dot-right: LR-item -> LR-item | #f +;; moves the dot to the right in the item, unless it is at its +;; rightmost, then it returns false +(define (move-dot-right i) + (cond + [(= (item-dot-pos i) (vector-length (prod-rhs (item-prod i)))) #f] + [else (make-item (item-prod i) + (add1 (item-dot-pos i)))])) + +;; sym-at-dot: LR-item -> gram-sym | #f +;; returns the symbol after the dot in the item or #f if there is none +(define (sym-at-dot i) + (define dp (item-dot-pos i)) + (define rhs (prod-rhs (item-prod i))) + (cond + [(= dp (vector-length rhs)) #f] + [else (vector-ref rhs dp)])) + + +;; print-item: LR-item -> +(define (item->string it) + (define print-sym (λ (i) + (let ((gs (vector-ref (prod-rhs (item-prod it)) i))) + (cond + ((term? gs) (format "~a " (term-sym gs))) + (else (format "~a " (non-term-sym gs))))))) + (string-append + (format "~a -> " (non-term-sym (prod-lhs (item-prod it)))) + (let loop ((i 0)) + (cond + [(= i (vector-length (prod-rhs (item-prod it)))) + (if (= i (item-dot-pos it)) + ". " + "")] + [(= i (item-dot-pos it)) + (string-append ". " (print-sym i) (loop (add1 i)))] + [else (string-append (print-sym i) (loop (add1 i)))])))) + +;; --------------------- Grammar Symbols -------------------------- + +(define (non-termstring gs) + (symbol->string (gram-sym-symbol gs))) + +;; term-list->bit-vector: term list -> int +;; Creates a number where the nth bit is 1 if the term with index n is in +;; the list, and whose nth bit is 0 otherwise +(define (term-list->bit-vector terms) + (if (null? terms) + 0 + (bitwise-ior (arithmetic-shift 1 (term-index (car terms))) + (term-list->bit-vector (cdr terms))))) + + +;; ------------------------- Grammar ------------------------------ + +(define grammar% + (class object% + (super-instantiate ()) + ;; prods: production list list + ;; where there is one production list per non-term + (init prods) + ;; init-prods: production list + ;; The productions parsing can start from + ;; nullable-non-terms is indexed by the non-term-index and is true iff non-term is nullable + (init-field init-prods terms non-terms end-terms) + + ;; list of all productions + (define all-prods (apply append prods)) + (define num-prods (length all-prods)) + (define num-terms (length terms)) + (define num-non-terms (length non-terms)) + + (for ([(nt count) (in-indexed non-terms)]) + (set-non-term-index! nt count)) + + (for ([(t count) (in-indexed terms)]) + (set-term-index! t count)) + + (for ([(prod count) (in-indexed all-prods)]) + (set-prod-index! prod count)) + + ;; indexed by the index of the non-term - contains the list of productions for that non-term + (define nt->prods + (let ((v (make-vector (length prods) #f))) + (for ([prods (in-list prods)]) + (vector-set! v (non-term-index (prod-lhs (car prods))) prods)) + v)) + + (define nullable-non-terms + (nullable all-prods num-non-terms)) + + (define/public (get-num-terms) num-terms) + (define/public (get-num-non-terms) num-non-terms) + + (define/public (get-prods-for-non-term nt) + (vector-ref nt->prods (non-term-index nt))) + (define/public (get-prods) all-prods) + (define/public (get-init-prods) init-prods) + + (define/public (get-terms) terms) + (define/public (get-non-terms) non-terms) + + (define/public (get-num-prods) num-prods) + (define/public (get-end-terms) end-terms) + + (define/public (nullable-non-term? nt) + (vector-ref nullable-non-terms (non-term-index nt))) + + (define/public (nullable-after-dot? item) + (define rhs (prod-rhs (item-prod item))) + (define prod-length (vector-length rhs)) + (let loop ((i (item-dot-pos item))) + (cond + [(< i prod-length) + (and (non-term? (vector-ref rhs i)) + (nullable-non-term? (vector-ref rhs i)) + (loop (add1 i)))] + [(= i prod-length)]))) + + (define/public (nullable-non-term-thunk) + (λ (nt) (nullable-non-term? nt))) + (define/public (nullable-after-dot?-thunk) + (λ (item) (nullable-after-dot? item))))) + + +;; nullable: production list * int -> non-term set +;; determines which non-terminals can derive epsilon +(define (nullable prods num-nts) + (define nullable (make-vector num-nts #f)) + (define added #f) + + ;; possible-nullable: producion list -> production list + ;; Removes all productions that have a terminal + (define (possible-nullable prods) + (for/list ([prod (in-list prods)] + #:when (vector-andmap non-term? (prod-rhs prod))) + prod)) + + ;; set-nullables: production list -> production list + ;; makes one pass through the productions, adding the ones + ;; known to be nullable now to nullable and returning a list + ;; of productions that we don't know about yet. + (define (set-nullables prods) + (cond + [(null? prods) null] + [(vector-ref nullable (gram-sym-index (prod-lhs (car prods)))) + (set-nullables (cdr prods))] + [(vector-andmap (λ (nt) (vector-ref nullable (gram-sym-index nt))) (prod-rhs (car prods))) + (vector-set! nullable (gram-sym-index (prod-lhs (car prods))) #t) + (set! added #t) + (set-nullables (cdr prods))] + [else (cons (car prods) (set-nullables (cdr prods)))])) + (let loop ((P (possible-nullable prods))) + (cond + [(null? P) nullable] + [else + (set! added #f) + (define new-P (set-nullables P)) + (if added + (loop new-P) + nullable)]))) + + + diff --git a/br-parser-tools-lib/br-parser-tools/private-yacc/graph.rkt b/br-parser-tools-lib/br-parser-tools/private-yacc/graph.rkt new file mode 100644 index 0000000..bac6736 --- /dev/null +++ b/br-parser-tools-lib/br-parser-tools/private-yacc/graph.rkt @@ -0,0 +1,53 @@ +#lang racket/base +(provide digraph) + +(define (zero-thunk) 0) + +;; digraph: +;; ('a list) * ('a -> 'a list) * ('a -> 'b) * ('b * 'b -> 'b) * (-> 'b) +;; -> ('a -> 'b) +;; DeRemer and Pennello 1982 +;; Computes (f x) = (f- x) union Union{(f y) | y in (edges x)} +;; We use a hash-table to represent the result function 'a -> 'b set, so +;; the values of type 'a must be comparable with eq?. + +(define (digraph nodes edges f- union fail) + (define results (make-hasheq)) + (define (f x) (hash-ref results x fail)) + ;; Maps elements of 'a to integers. + (define N (make-hasheq)) + (define (get-N x) (hash-ref N x zero-thunk)) + (define (set-N x d) (hash-set! N x d)) + (define stack null) + (define (push x) (set! stack (cons x stack))) + (define (pop) (begin0 + (car stack) + (set! stack (cdr stack)))) + (define (depth) (length stack)) + + ;; traverse: 'a -> + (define (traverse x) + (push x) + (define d (depth)) + (set-N x d) + (hash-set! results x (f- x)) + (for-each (λ (y) + (when (= 0 (get-N y)) + (traverse y)) + (hash-set! results + x + (union (f x) (f y))) + (set-N x (min (get-N x) (get-N y)))) + (edges x)) + (when (= d (get-N x)) + (let loop ([p (pop)]) + (set-N p +inf.0) + (hash-set! results p (f x)) + (when (not (eq? x p)) + (loop (pop)))))) + ;; Will map elements of 'a to 'b sets + (for ([x (in-list nodes)] + #:when (zero? (get-N x))) + (traverse x)) + f) + diff --git a/br-parser-tools-lib/br-parser-tools/private-yacc/input-file-parser.rkt b/br-parser-tools-lib/br-parser-tools/private-yacc/input-file-parser.rkt new file mode 100644 index 0000000..fa99240 --- /dev/null +++ b/br-parser-tools-lib/br-parser-tools/private-yacc/input-file-parser.rkt @@ -0,0 +1,297 @@ +#lang racket/base +(require "yacc-helper.rkt" + "../private-lex/token-syntax.rkt" + "grammar.rkt" + racket/class + racket/contract + (for-template racket/base)) + +;; routines for parsing the input to the parser generator and producing a +;; grammar (See grammar.rkt) + +(define (is-a-grammar%? x) (is-a? x grammar%)) +(provide/contract + [parse-input ((listof identifier?) (listof identifier?) (listof identifier?) + (or/c #f syntax?) syntax? any/c . -> . is-a-grammar%?)] + [get-term-list ((listof identifier?) . -> . (listof identifier?))]) + +(define stx-for-original-property (read-syntax #f (open-input-string "original"))) + +;; get-args: ??? -> (values (listof syntax) (or/c #f (cons integer? stx))) +(define (get-args i rhs src-pos term-defs) + (define empty-table (make-hasheq)) + (define biggest-pos #f) + (hash-set! empty-table 'error #t) + (for* ([td (in-list term-defs)] + [v (in-value (syntax-local-value td))] + #:when (e-terminals-def? v) + [s (in-list (syntax->list (e-terminals-def-t v)))]) + (hash-set! empty-table (syntax->datum s) #t)) + (define args + (let get-args ([i i][rhs rhs]) + (cond + [(null? rhs) null] + [else + (define b (car rhs)) + (define name (if (hash-ref empty-table (syntax->datum (car rhs)) #f) + (gensym) + (string->symbol (format "$~a" i)))) + (cond + [src-pos + (define start-pos-id + (datum->syntax b (string->symbol (format "$~a-start-pos" i)) b stx-for-original-property)) + (define end-pos-id + (datum->syntax b (string->symbol (format "$~a-end-pos" i)) b stx-for-original-property)) + (set! biggest-pos (cons start-pos-id end-pos-id)) + (list* (datum->syntax b name b stx-for-original-property) + start-pos-id + end-pos-id + (get-args (add1 i) (cdr rhs)))] + [else + (list* (datum->syntax b name b stx-for-original-property) + (get-args (add1 i) (cdr rhs)))])]))) + (values args biggest-pos)) + +;; Given the list of terminal symbols and the precedence/associativity definitions, +;; builds terminal structures (See grammar.rkt) +;; build-terms: symbol list * symbol list list -> term list +(define (build-terms term-list precs) + (define counter 0) + ;;(term-list (cons (gensym) term-list)) + ;; Will map a terminal symbol to its precedence/associativity + (define prec-table (make-hasheq)) + + ;; Fill the prec table + (for ([p-decl (in-list precs)]) + (define assoc (car p-decl)) + (for ([term-sym (in-list (cdr p-decl))]) + (hash-set! prec-table term-sym (make-prec counter assoc))) + (set! counter (add1 counter))) + + ;; Build the terminal structures + (for/list ([term-sym (in-list term-list)]) + (make-term term-sym + #f + (hash-ref prec-table term-sym (λ () #f))))) + +;; Retrieves the terminal symbols from a terminals-def (See terminal-syntax.rkt) +;; get-terms-from-def: identifier? -> (listof identifier?) +(define (get-terms-from-def term-syn) + (define t (syntax-local-value term-syn #f)) + (cond + [(terminals-def? t) (syntax->list (terminals-def-t t))] + [(e-terminals-def? t) (syntax->list (e-terminals-def-t t))] + [else + (raise-syntax-error + 'parser-tokens + "undefined token group" + term-syn)])) + +(define (get-term-list term-group-names) + (remove-duplicates + (cons (datum->syntax #f 'error) + (apply append (map get-terms-from-def term-group-names))))) + +(define (parse-input term-defs start ends prec-decls prods src-pos) + (define start-syms (map syntax-e start)) + (define list-of-terms (map syntax-e (get-term-list term-defs))) + (define end-terms + (for/list ([end (in-list ends)]) + (unless (memq (syntax-e end) list-of-terms) + (raise-syntax-error + 'parser-end-tokens + (format "End token ~a not defined as a token" + (syntax-e end)) + end)) + (syntax-e end))) + ;; Get the list of terminals out of input-terms + (define list-of-non-terms + (syntax-case prods () + [((NON-TERM PRODUCTION ...) ...) + (begin + (for ([nts (in-list (syntax->list #'(NON-TERM ...)))] + #:when (memq (syntax->datum nts) list-of-terms)) + (raise-syntax-error + 'parser-non-terminals + (format "~a used as both token and non-terminal" (syntax->datum nts)) + nts)) + (let ([dup (duplicate-list? (syntax->datum #'(NON-TERM ...)))]) + (when dup + (raise-syntax-error + 'parser-non-terminals + (format "non-terminal ~a defined multiple times" dup) + prods))) + (syntax->datum #'(NON-TERM ...)))] + [_ (raise-syntax-error + 'parser-grammar + "Grammar must be of the form (grammar (non-terminal productions ...) ...)" + prods)])) + ;; Check the precedence declarations for errors and turn them into data + (define precs + (syntax-case prec-decls () + [((TYPE TERM ...) ...) + (let ([p-terms (syntax->datum #'(TERM ... ...))]) + (cond + [(duplicate-list? p-terms) => + (λ (d) + (raise-syntax-error + 'parser-precedences + (format "duplicate precedence declaration for token ~a" d) + prec-decls))] + [else (for ([t (in-list (syntax->list #'(TERM ... ...)))] + #:when (not (memq (syntax->datum t) list-of-terms))) + (raise-syntax-error + 'parser-precedences + (format "Precedence declared for non-token ~a" (syntax->datum t)) + t)) + (for ([type (in-list (syntax->list #'(TYPE ...)))] + #:unless (memq (syntax->datum type) `(left right nonassoc))) + (raise-syntax-error + 'parser-precedences + "Associativity must be left, right or nonassoc" + type)) + (syntax->datum prec-decls)]))] + [#f null] + [_ (raise-syntax-error + 'parser-precedences + "Precedence declaration must be of the form (precs (assoc term ...) ...) where assoc is left, right or nonassoc" + prec-decls)])) + + (define terms (build-terms list-of-terms precs)) + (define non-terms (map (λ (non-term) (make-non-term non-term #f)) + list-of-non-terms)) + (define term-table (make-hasheq)) + (define non-term-table (make-hasheq)) + + (for ([t (in-list terms)]) + (hash-set! term-table (gram-sym-symbol t) t)) + + (for ([nt (in-list non-terms)]) + (hash-set! non-term-table (gram-sym-symbol nt) nt)) + + ;; parse-prod: syntax-object -> gram-sym vector + (define (parse-prod prod-so) + (syntax-case prod-so () + [(PROD-RHS-SYM ...) + (andmap identifier? (syntax->list prod-so)) + (begin + (for ([t (in-list (syntax->list prod-so))] + #:when (memq (syntax->datum t) end-terms)) + (raise-syntax-error + 'parser-production-rhs + (format "~a is an end token and cannot be used in a production" (syntax->datum t)) + t)) + (for/vector ([s (in-list (syntax->list prod-so))]) + (cond + [(hash-ref term-table (syntax->datum s) #f)] + [(hash-ref non-term-table (syntax->datum s) #f)] + [else (raise-syntax-error + 'parser-production-rhs + (format "~a is not declared as a terminal or non-terminal" (syntax->datum s)) + s)])))] + [_ (raise-syntax-error + 'parser-production-rhs + "production right-hand-side must have form (symbol ...)" + prod-so)])) + + ;; parse-action: syntax-object * syntax-object -> syntax-object + (define (parse-action rhs act-in) + (define-values (args biggest) (get-args 1 (syntax->list rhs) src-pos term-defs)) + (define act + (if biggest + (with-syntax ([(CAR-BIGGEST . CDR-BIGGEST) biggest] + [$N-START-POS (datum->syntax (car biggest) '$n-start-pos)] + [$N-END-POS (datum->syntax (cdr biggest) '$n-end-pos)] + [ACT-IN act-in]) + #'(let ([$N-START-POS CAR-BIGGEST] + [$N-END-POS CDR-BIGGEST]) + ACT-IN)) + act-in)) + (with-syntax ([ARGS args][ACT act]) + (syntax/loc #'ACT (λ ARGS ACT)))) + + ;; parse-prod+action: non-term * syntax-object -> production + (define (parse-prod+action nt prod-so) + (syntax-case prod-so () + [(PROD-RHS ACTION) + (let ([p (parse-prod #'PROD-RHS)]) + (make-prod + nt + p + #f + (let loop ([i (sub1 (vector-length p))]) + (if (>= i 0) + (let ([gs (vector-ref p i)]) + (if (term? gs) + (term-prec gs) + (loop (sub1 i)))) + #f)) + (parse-action #'PROD-RHS #'ACTION)))] + [(PROD-RHS (PREC TERM) ACTION) + (identifier? #'TERM) + (let ([p (parse-prod #'PROD-RHS)]) + (make-prod + nt + p + #f + (term-prec + (cond + [(hash-ref term-table (syntax->datum #'TERM) #f)] + [else (raise-syntax-error + 'parser-production-rhs + (format + "unrecognized terminal ~a in precedence declaration" + (syntax->datum #'TERM)) + #'TERM)])) + (parse-action #'PROD-RHS #'ACTION)))] + [_ (raise-syntax-error + 'parser-production-rhs + "production must have form [(symbol ...) expression] or [(symbol ...) (prec symbol) expression]" + prod-so)])) + + ;; parse-prod-for-nt: syntax-object -> production list + (define (parse-prods-for-nt prods-so) + (syntax-case prods-so () + [(NT PRODUCTIONS ...) + (positive? (length (syntax->list #'(PRODUCTIONS ...)))) + (let ([nt (hash-ref non-term-table (syntax->datum #'NT))]) + (map (λ (p) (parse-prod+action nt p)) (syntax->list #'(PRODUCTIONS ...))))] + [_ (raise-syntax-error + 'parser-productions + "A production for a non-terminal must be (non-term right-hand-side ...) with at least 1 right hand side" + prods-so)])) + + (for ([sstx (in-list start)] + [ssym (in-list start-syms)] + #:unless (memq ssym list-of-non-terms)) + (raise-syntax-error + 'parser-start + (format "Start symbol ~a not defined as a non-terminal" ssym) + sstx)) + + (define starts (map (λ (x) (make-non-term (gensym) #f)) start-syms)) + (define end-non-terms (map (λ (x) (make-non-term (gensym) #f)) start-syms)) + (define parsed-prods (map parse-prods-for-nt (syntax->list prods))) + (define start-prods (for/list ([start (in-list starts)] + [end-non-term (in-list end-non-terms)]) + (list (make-prod start (vector end-non-term) #f #f #'values)))) + (define new-prods + (append start-prods + (for/list ([end-nt (in-list end-non-terms)] + [start-sym (in-list start-syms)]) + (for/list ([end (in-list end-terms)]) + (make-prod end-nt + (vector + (hash-ref non-term-table start-sym) + (hash-ref term-table end)) + #f + #f + #'values))) + parsed-prods)) + + (make-object grammar% + new-prods + (map car start-prods) + terms + (append starts (append end-non-terms non-terms)) + (map (λ (term-name) (hash-ref term-table term-name)) end-terms))) diff --git a/br-parser-tools-lib/br-parser-tools/private-yacc/lalr.rkt b/br-parser-tools-lib/br-parser-tools/private-yacc/lalr.rkt new file mode 100644 index 0000000..ed21c37 --- /dev/null +++ b/br-parser-tools-lib/br-parser-tools/private-yacc/lalr.rkt @@ -0,0 +1,252 @@ +#lang racket/base +(require "lr0.rkt" + "grammar.rkt" + racket/list + racket/class) + +;; Compute LALR lookaheads from DeRemer and Pennello 1982 + +(provide compute-LA) + +;; compute-DR: LR0-automaton * grammar -> (trans-key -> term set) +;; computes for each state, non-term transition pair, the terminals +;; which can transition out of the resulting state +;; output term set is represented in bit-vector form +(define ((compute-DR a g) tk) + (define r (send a run-automaton (trans-key-st tk) (trans-key-gs tk))) + (term-list->bit-vector + (filter (λ (term) (send a run-automaton r term)) (send g get-terms)))) + +;; compute-reads: +;; LR0-automaton * grammar -> (trans-key -> trans-key list) +(define (compute-reads a g) + (define nullable-non-terms (filter (λ (nt) (send g nullable-non-term? nt)) (send g get-non-terms))) + (λ (tk) + (define r (send a run-automaton (trans-key-st tk) (trans-key-gs tk))) + (for/list ([non-term (in-list nullable-non-terms)] + #:when (send a run-automaton r non-term)) + (make-trans-key r non-term)))) + +;; compute-read: LR0-automaton * grammar -> (trans-key -> term set) +;; output term set is represented in bit-vector form +(define (compute-read a g) + (define dr (compute-DR a g)) + (define reads (compute-reads a g)) + (digraph-tk->terml (send a get-mapped-non-term-keys) + reads + dr + (send a get-num-states))) +;; returns the list of all k such that state k transitions to state start on the +;; transitions in rhs (in order) +(define (run-lr0-backward a rhs dot-pos start num-states) + (let loop ([states (list start)] + [i (sub1 dot-pos)]) + (cond + [(< i 0) states] + [else (loop (send a run-automaton-back states (vector-ref rhs i)) + (sub1 i))]))) + +;; prod->items-for-include: grammar * prod * non-term -> lr0-item list +;; returns the list of all (B -> beta . nt gamma) such that prod = (B -> beta nt gamma) +;; and gamma =>* epsilon +(define (prod->items-for-include g prod nt) + (define rhs (prod-rhs prod)) + (define rhs-l (vector-length rhs)) + (append (if (and (> rhs-l 0) (eq? nt (vector-ref rhs (sub1 rhs-l)))) + (list (make-item prod (sub1 rhs-l))) + null) + (let loop ([i (sub1 rhs-l)]) + (cond + [(and (> i 0) + (non-term? (vector-ref rhs i)) + (send g nullable-non-term? (vector-ref rhs i))) + (if (eq? nt (vector-ref rhs (sub1 i))) + (cons (make-item prod (sub1 i)) + (loop (sub1 i))) + (loop (sub1 i)))] + [else null])))) + +;; prod-list->items-for-include: grammar * prod list * non-term -> lr0-item list +;; return the list of all (B -> beta . nt gamma) such that (B -> beta nt gamma) in prod-list +;; and gamma =>* epsilon +(define (prod-list->items-for-include g prod-list nt) + (apply append (map (λ (prod) (prod->items-for-include g prod nt)) prod-list))) + +;; comput-includes: lr0-automaton * grammar -> (trans-key -> trans-key list) +(define (compute-includes a g) + (define num-states (send a get-num-states)) + (define items-for-input-nt (make-vector (send g get-num-non-terms) null)) + (for ([input-nt (in-list (send g get-non-terms))]) + (vector-set! items-for-input-nt (non-term-index input-nt) + (prod-list->items-for-include g (send g get-prods) input-nt))) + (λ (tk) + (define goal-state (trans-key-st tk)) + (define non-term (trans-key-gs tk)) + (define items (vector-ref items-for-input-nt (non-term-index non-term))) + (trans-key-list-remove-dups + (apply append + (for/list ([item (in-list items)]) + (define prod (item-prod item)) + (define rhs (prod-rhs prod)) + (define lhs (prod-lhs prod)) + (map (λ (state) (make-trans-key state lhs)) + (run-lr0-backward a + rhs + (item-dot-pos item) + goal-state + num-states))))))) + +;; compute-lookback: lr0-automaton * grammar -> (kernel * proc -> trans-key list) +(define (compute-lookback a g) + (define num-states (send a get-num-states)) + (λ (state prod) + (map (λ (k) (make-trans-key k (prod-lhs prod))) + (run-lr0-backward a (prod-rhs prod) (vector-length (prod-rhs prod)) state num-states)))) + +;; compute-follow: LR0-automaton * grammar -> (trans-key -> term set) +;; output term set is represented in bit-vector form +(define (compute-follow a g includes) + (define read (compute-read a g)) + (digraph-tk->terml (send a get-mapped-non-term-keys) + includes + read + (send a get-num-states))) + +;; compute-LA: LR0-automaton * grammar -> kernel * prod -> term set +;; output term set is represented in bit-vector form +(define (compute-LA a g) + (define includes (compute-includes a g)) + (define lookback (compute-lookback a g)) + (define follow (compute-follow a g includes)) + (λ (k p) + (define l (lookback k p)) + (define f (map follow l)) + (apply bitwise-ior (cons 0 f)))) + + +(define (print-DR dr a g) + (print-input-st-sym dr "DR" a g print-output-terms)) +(define (print-Read Read a g) + (print-input-st-sym Read "Read" a g print-output-terms)) +(define (print-includes i a g) + (print-input-st-sym i "includes" a g print-output-st-nt)) +(define (print-lookback l a g) + (print-input-st-prod l "lookback" a g print-output-st-nt)) +(define (print-follow f a g) + (print-input-st-sym f "follow" a g print-output-terms)) +(define (print-LA l a g) + (print-input-st-prod l "LA" a g print-output-terms)) + +(define (print-input-st-sym f name a g print-output) + (printf "~a:\n" name) + (send a for-each-state + (λ (state) + (for-each + (λ (non-term) + (let ([res (f (make-trans-key state non-term))]) + (when (not (null? res)) + (printf "~a(~a, ~a) = ~a\n" + name + state + (gram-sym-symbol non-term) + (print-output res))))) + (send g get-non-terms)))) + (newline)) + +(define (print-input-st-prod f name a g print-output) + (printf "~a:\n" name) + (send a for-each-state + (λ (state) + (for-each + (λ (non-term) + (for-each + (λ (prod) + (let ([res (f state prod)]) + (when (not (null? res)) + (printf "~a(~a, ~a) = ~a\n" + name + (kernel-index state) + (prod-index prod) + (print-output res))))) + (send g get-prods-for-non-term non-term))) + (send g get-non-terms))))) + +(define (print-output-terms r) + (map gram-sym-symbol r)) + +(define (print-output-st-nt r) + (map (λ (p) (list (kernel-index (trans-key-st p)) (gram-sym-symbol (trans-key-gs p)))) r)) + +;; init-tk-map : int -> (vectorof hashtable?) +(define (init-tk-map n) + (define v (make-vector n #f)) + (let loop ([i (sub1 (vector-length v))]) + (when (>= i 0) + (vector-set! v i (make-hasheq)) + (loop (sub1 i)))) + v) + +;; lookup-tk-map : (vectorof (symbol? int hashtable)) -> trans-key? -> int +(define ((lookup-tk-map map) tk) + (define st (trans-key-st tk)) + (define gs (trans-key-gs tk)) + (hash-ref (vector-ref map (kernel-index st)) + (gram-sym-symbol gs) + (λ () 0))) + +;; add-tk-map : (vectorof (symbol? int hashtable)) -> trans-key int -> +(define ((add-tk-map map) tk v) + (define st (trans-key-st tk)) + (define gs (trans-key-gs tk)) + (hash-set! (vector-ref map (kernel-index st)) + (gram-sym-symbol gs) + v)) + +;; digraph-tk->terml: +;; (trans-key list) * (trans-key -> trans-key list) * (trans-key -> term list) * int * int * int +;; -> (trans-key -> term list) +;; DeRemer and Pennello 1982 +;; Computes (f x) = (f- x) union Union{(f y) | y in (edges x)} +;; A specialization of digraph in the file graph.rkt +(define (digraph-tk->terml nodes edges f- num-states) + ;; Will map elements of trans-key to term sets represented as bit vectors + (define results (init-tk-map num-states)) + + ;; Maps elements of trans-keys to integers. + (define N (init-tk-map num-states)) + + (define get-N (lookup-tk-map N)) + (define set-N (add-tk-map N)) + (define get-f (lookup-tk-map results)) + (define set-f (add-tk-map results)) + + (define stack null) + (define (push x) (set! stack (cons x stack))) + (define (pop) (begin0 + (car stack) + (set! stack (cdr stack)))) + (define (depth) (length stack)) + + ;; traverse: 'a -> + (define (traverse x) + (push x) + (let ([d (depth)]) + (set-N x d) + (set-f x (f- x)) + (for-each (λ (y) + (when (= 0 (get-N y)) + (traverse y)) + (set-f x (bitwise-ior (get-f x) (get-f y))) + (set-N x (min (get-N x) (get-N y)))) + (edges x)) + (when (= d (get-N x)) + (let loop ([p (pop)]) + (set-N p +inf.0) + (set-f p (get-f x)) + (unless (equal? x p) + (loop (pop))))))) + + (for ([x (in-list nodes)] + #:when (zero? (get-N x))) + (traverse x)) + get-f) diff --git a/br-parser-tools-lib/br-parser-tools/private-yacc/lr0.rkt b/br-parser-tools-lib/br-parser-tools/private-yacc/lr0.rkt new file mode 100644 index 0000000..a3b1fcc --- /dev/null +++ b/br-parser-tools-lib/br-parser-tools/private-yacc/lr0.rkt @@ -0,0 +1,314 @@ +#lang racket/base +(require "grammar.rkt" + "graph.rkt" + racket/list + racket/class) + +;; Handle the LR0 automaton + +(provide build-lr0-automaton lr0% + (struct-out trans-key) trans-key-list-remove-dups + kernel-items kernel-index) + +;; kernel = (make-kernel (LR1-item list) index) +;; the list must be kept sorted according to item +;; (vectorof (symbol X hashtable)) +(define (build-transition-table num-states assoc) + (define transitions (make-vector num-states #f)) + (let loop ([i (sub1 (vector-length transitions))]) + (when (>= i 0) + (vector-set! transitions i (make-hasheq)) + (loop (sub1 i)))) + (for ([trans-key/kernel (in-list assoc)]) + (define tk (car trans-key/kernel)) + (hash-set! (vector-ref transitions (kernel-index (trans-key-st tk))) + (gram-sym-symbol (trans-key-gs tk)) + (cdr trans-key/kernel))) + transitions) + +;; reverse-assoc : (listof (cons/c trans-key? kernel?)) -> +;; (listof (cons/c trans-key? (listof kernel?))) +(define (reverse-assoc assoc) + (define reverse-hash (make-hash)) + (define (hash-table-add! ht k v) + (hash-set! ht k (cons v (hash-ref ht k (λ () null))))) + (for ([trans-key/kernel (in-list assoc)]) + (define tk (car trans-key/kernel)) + (hash-table-add! reverse-hash + (make-trans-key (cdr trans-key/kernel) + (trans-key-gs tk)) + (trans-key-st tk))) + (hash-map reverse-hash cons)) + + +;; kernel-list-remove-duplicates +;; LR0-automaton = object of class lr0% +(define lr0% + (class object% + (super-instantiate ()) + ;; term-assoc : (listof (cons/c trans-key? kernel?)) + ;; non-term-assoc : (listof (cons/c trans-key? kernel?)) + ;; states : (vectorof kernel?) + ;; epsilons : ??? + (init-field term-assoc non-term-assoc states epsilons) + + (define transitions (build-transition-table (vector-length states) + (append term-assoc non-term-assoc))) + + (define reverse-term-assoc (reverse-assoc term-assoc)) + (define reverse-non-term-assoc (reverse-assoc non-term-assoc)) + (define reverse-transitions + (build-transition-table (vector-length states) + (append reverse-term-assoc reverse-non-term-assoc))) + + (define mapped-non-terms (map car non-term-assoc)) + + (define/public (get-mapped-non-term-keys) + mapped-non-terms) + + (define/public (get-num-states) + (vector-length states)) + + (define/public (get-epsilon-trans) + epsilons) + + (define/public (get-transitions) + (append term-assoc non-term-assoc)) + + ;; for-each-state : (state ->) -> + ;; Iteration over the states in an automaton + (define/public (for-each-state f) + (define num-states (vector-length states)) + (let loop ([i 0]) + (when (< i num-states) + (f (vector-ref states i)) + (loop (add1 i))))) + + ;; run-automaton: kernel? gram-sym? -> (union kernel #f) + ;; returns the state reached from state k on input s, or #f when k + ;; has no transition on s + (define/public (run-automaton k s) + (hash-ref (vector-ref transitions (kernel-index k)) + (gram-sym-symbol s) + (λ () #f))) + + ;; run-automaton-back : (listof kernel?) gram-sym? -> (listof kernel) + ;; returns the list of states that can reach k by transitioning on s. + (define/public (run-automaton-back k s) + (for*/list ([k (in-list k)] + [val (in-list (hash-ref (vector-ref reverse-transitions (kernel-index k)) + (gram-sym-symbol s) + (λ () null)))]) + val)))) + +(define ((union comp (eq? a b) +(define (kernel->string k) + (apply string-append + `("{" ,@(map (λ (i) (string-append (item->string i) ", ")) + (kernel-items k)) + "}"))) + +;; build-LR0-automaton: grammar -> LR0-automaton +;; Constructs the kernels of the sets of LR(0) items of g +(define (build-lr0-automaton grammar) + ; (printf "LR(0) automaton:\n") + (define epsilons (make-hash)) + (define grammar-symbols (append (send grammar get-non-terms) + (send grammar get-terms))) + ;; first-non-term: non-term -> non-term list + ;; given a non-terminal symbol C, return those non-terminal + ;; symbols A s.t. C -> An for some string of terminals and + ;; non-terminals n where -> means a rightmost derivation in many + ;; steps. Assumes that each non-term can be reduced to a string + ;; of terms. + (define first-non-term + (digraph (send grammar get-non-terms) + (λ (nt) + (filter non-term? + (map (λ (prod) (sym-at-dot (make-item prod 0))) + (send grammar get-prods-for-non-term nt)))) + (λ (nt) (list nt)) + (union non-term LR1-item list + ;; Creates a set of items containing i s.t. if A -> n.Xm is in it, + ;; X -> .o is in it too. + (define (LR0-closure i) + (cond + [(null? i) null] + [else + (define next-gsym (sym-at-dot (car i))) + (cond + [(non-term? next-gsym) + (cons (car i) + (append + (for*/list ([non-term (in-list (first-non-term next-gsym))] + [x (in-list (send grammar + get-prods-for-non-term + non-term))]) + (make-item x 0)) + (LR0-closure (cdr i))))] + [else (cons (car i) (LR0-closure (cdr i)))])])) + + ;; maps trans-keys to kernels + (define automaton-term null) + (define automaton-non-term null) + + ;; keeps the kernels we have seen, so we can have a unique + ;; list for each kernel + (define kernels (make-hash)) + (define counter 0) + + ;; goto: LR1-item list -> LR1-item list list + ;; creates new kernels by moving the dot in each item in the + ;; LR0-closure of kernel to the right, and grouping them by + ;; the term/non-term moved over. Returns the kernels not + ;; yet seen, and places the trans-keys into automaton + (define (goto kernel) + ;; maps a gram-syms to a list of items + (define table (make-hasheq)) + + ;; add-item!: + ;; (symbol (listof item) hashtable) item? -> + ;; adds i into the table grouped with the grammar + ;; symbol following its dot + (define (add-item! table i) + (define gs (sym-at-dot i)) + (cond + [gs (define already (hash-ref table (gram-sym-symbol gs) (λ () null))) + (unless (member i already) + (hash-set! table (gram-sym-symbol gs) (cons i already)))] + ((zero? (vector-length (prod-rhs (item-prod i)))) + (define current (hash-ref epsilons kernel (λ () null))) + (hash-set! epsilons kernel (cons i current))))) + + ;; Group the items of the LR0 closure of the kernel + ;; by the character after the dot + (for ([item (in-list (LR0-closure (kernel-items kernel)))]) + (add-item! table item)) + + ;; each group is a new kernel, with the dot advanced. + ;; sorts the items in a kernel so kernels can be compared + ;; with equal? for using the table kernels to make sure + ;; only one representitive of each kernel is created + (define is + (let loop ([gsyms grammar-symbols]) + (cond + [(null? gsyms) null] + [else + (define items (hash-ref table (gram-sym-symbol (car gsyms)) (λ () null))) + (cond + [(null? items) (loop (cdr gsyms))] + [else (cons (list (car gsyms) items) + (loop (cdr gsyms)))])]))) + (filter + values + (for/list ([i (in-list is)]) + (define gs (car i)) + (define items (cadr i)) + (define new #f) + (define new-kernel (sort (filter values (map move-dot-right items)) item ~a on ~a\n" + (kernel->string kernel) + (kernel->string unique-kernel) + (gram-sym-symbol gs)) + (and new unique-kernel)))) + + (define starts (map (λ (init-prod) (list (make-item init-prod 0))) + (send grammar get-init-prods))) + (define startk (for/list ([start (in-list starts)]) + (define k (make-kernel start counter)) + (hash-set! kernels start k) + (set! counter (add1 counter)) + k)) + (define new-kernels (make-queue)) + (let loop ([old-kernels startk] + [seen-kernels null]) + (cond + [(and (empty-queue? new-kernels) (null? old-kernels)) + (make-object lr0% automaton-term automaton-non-term + (list->vector (reverse seen-kernels)) epsilons)] + [(null? old-kernels) (loop (deq! new-kernels) seen-kernels)] + [else + (enq! new-kernels (goto (car old-kernels))) + (loop (cdr old-kernels) (cons (car old-kernels) seen-kernels))]))) + +(define-struct q (f l) #:inspector (make-inspector) #:mutable) +(define (empty-queue? q) (null? (q-f q))) +(define (make-queue) (make-q null null)) + +(define (enq! q i) + (cond + [(empty-queue? q) + (let ([i (mcons i null)]) + (set-q-l! q i) + (set-q-f! q i))] + [else + (set-mcdr! (q-l q) (mcons i null)) + (set-q-l! q (mcdr (q-l q)))])) + + +(define (deq! q) + (begin0 + (mcar (q-f q)) + (set-q-f! q (mcdr (q-f q))))) + + diff --git a/br-parser-tools-lib/br-parser-tools/private-yacc/parser-actions.rkt b/br-parser-tools-lib/br-parser-tools/private-yacc/parser-actions.rkt new file mode 100644 index 0000000..11e4557 --- /dev/null +++ b/br-parser-tools-lib/br-parser-tools/private-yacc/parser-actions.rkt @@ -0,0 +1,54 @@ +#lang racket/base +(require "grammar.rkt") +(provide (except-out (all-defined-out) make-reduce make-reduce*) + (rename-out [make-reduce* make-reduce])) + +;; An action is +;; - (make-shift int) +;; - (make-reduce prod runtime-action) +;; - (make-accept) +;; - (make-goto int) +;; - (no-action) +;; A reduce contains a runtime-reduce so that sharing of the reduces can +;; be easily transferred to sharing of runtime-reduces. + +(define-struct action () #:inspector (make-inspector)) +(define-struct (shift action) (state) #:inspector (make-inspector)) +(define-struct (reduce action) (prod runtime-reduce) #:inspector (make-inspector)) +(define-struct (accept action) () #:inspector (make-inspector)) +(define-struct (goto action) (state) #:inspector (make-inspector)) +(define-struct (no-action action) () #:inspector (make-inspector)) + +(define (make-reduce* p) + (make-reduce p + (vector (prod-index p) + (gram-sym-symbol (prod-lhs p)) + (vector-length (prod-rhs p))))) + +;; A runtime-action is +;; non-negative-int (shift) +;; (vector int symbol int) (reduce) +;; 'accept (accept) +;; negative-int (goto) +;; #f (no-action) + +(define (action->runtime-action a) + (cond + [(shift? a) (shift-state a)] + [(reduce? a) (reduce-runtime-reduce a)] + [(accept? a) 'accept] + [(goto? a) (- (+ (goto-state a) 1))] + [(no-action? a) #f])) + +(define (runtime-shift? x) (and (integer? x) (>= x 0))) +(define runtime-reduce? vector?) +(define (runtime-accept? x) (eq? x 'accept)) +(define (runtime-goto? x) (and (integer? x) (< x 0))) + +(define runtime-shift-state values) +(define (runtime-reduce-prod-num x) (vector-ref x 0)) +(define (runtime-reduce-lhs x) (vector-ref x 1)) +(define (runtime-reduce-rhs-length x) (vector-ref x 2)) +(define (runtime-goto-state x) (- (+ x 1))) + + diff --git a/br-parser-tools-lib/br-parser-tools/private-yacc/parser-builder.rkt b/br-parser-tools-lib/br-parser-tools/private-yacc/parser-builder.rkt new file mode 100644 index 0000000..87638e3 --- /dev/null +++ b/br-parser-tools-lib/br-parser-tools/private-yacc/parser-builder.rkt @@ -0,0 +1,103 @@ +#lang racket/base +(require "input-file-parser.rkt" + "grammar.rkt" + "table.rkt" + racket/class + racket/contract) +(require (for-template racket/base)) + +(provide/contract [build-parser (-> string? any/c any/c + (listof identifier?) + (listof identifier?) + (listof identifier?) + (or/c syntax? #f) + syntax? + (values any/c any/c any/c any/c))]) + +;; fix-check-syntax : (listof identifier?) (listof identifier?) (listof identifier?) +;; (union syntax? false/c) syntax?) -> syntax? +(define (fix-check-syntax input-terms start ends assocs prods) + (define term-binders (get-term-list input-terms)) + (define get-term-binder + (let ([t (make-hasheq)]) + (for ([term (in-list term-binders)]) + (hash-set! t (syntax-e term) term)) + (λ (x) + (define r (hash-ref t (syntax-e x) (λ () #f))) + (if r + (syntax-local-introduce (datum->syntax r (syntax-e x) x x)) + x)))) + (define rhs-list (syntax-case prods () + [((_ RHS ...) ...) (syntax->list #'(RHS ... ...))])) + (with-syntax ([(TMP ...) (map syntax-local-introduce term-binders)] + [(TERM-GROUP ...) + (map (λ (tg) + (syntax-property + (datum->syntax tg #f) + 'disappeared-use + tg)) + input-terms)] + [(END ...) (map get-term-binder ends)] + [(START ...) (map get-term-binder start)] + [(BIND ...) (syntax-case prods () + (((BIND _ ...) ...) + (syntax->list #'(BIND ...))))] + [((BOUND ...) ...) + (map (λ (rhs) + (syntax-case rhs () + [((BOUND ...) (_ PBOUND) __) + (map get-term-binder + (cons #'PBOUND (syntax->list #'(BOUND ...))))] + [((BOUND ...) _) + (map get-term-binder + (syntax->list #'(BOUND ...)))])) + rhs-list)] + [(PREC ...) + (if assocs + (map get-term-binder + (syntax-case assocs () + (((__ TERM ...) ...) + (syntax->list #'(TERM ... ...))))) + null)]) + #`(when #f + (let ((BIND void) ... (TMP void) ...) + (void BOUND ... ... TERM-GROUP ... START ... END ... PREC ...))))) + +(require racket/list "parser-actions.rkt") + +(define (build-parser filename src-pos suppress input-terms start end assocs prods) + (define grammar (parse-input input-terms start end assocs prods src-pos)) + (define table (build-table grammar filename suppress)) + (define all-tokens (make-hasheq)) + (define actions-code `(vector ,@(map prod-action (send grammar get-prods)))) + + (for ([term (in-list (send grammar get-terms))]) + (hash-set! all-tokens (gram-sym-symbol term) #t)) + + #;(let ((num-states (vector-length table)) + (num-gram-syms (+ (send grammar get-num-terms) + (send grammar get-num-non-terms))) + (num-ht-entries (apply + (map length (vector->list table)))) + (num-reduces + (let ((ht (make-hasheq))) + (for-each + (λ (x) + (when (reduce? x) + (hash-set! ht x #t))) + (map cdr (apply append (vector->list table)))) + (length (hash-table-map ht void))))) + (printf "~a states, ~a grammar symbols, ~a hash-table entries, ~a reduces\n" + num-states num-gram-syms num-ht-entries num-reduces) + (printf "~a -- ~aKB, previously ~aKB\n" + (/ (+ 2 num-states + (* 4 num-states) (* 2 1.5 num-ht-entries) + (* 5 num-reduces)) 256.0) + (/ (+ 2 num-states + (* 4 num-states) (* 2 2.3 num-ht-entries) + (* 5 num-reduces)) 256.0) + (/ (+ 2 (* num-states num-gram-syms) (* 5 num-reduces)) 256.0))) + (values table + all-tokens + actions-code + (fix-check-syntax input-terms start end assocs prods))) + diff --git a/br-parser-tools-lib/br-parser-tools/private-yacc/table.rkt b/br-parser-tools-lib/br-parser-tools/private-yacc/table.rkt new file mode 100644 index 0000000..81c9a8d --- /dev/null +++ b/br-parser-tools-lib/br-parser-tools/private-yacc/table.rkt @@ -0,0 +1,264 @@ +#lang racket/base +(require "grammar.rkt" + "lr0.rkt" + "lalr.rkt" + "parser-actions.rkt" + racket/contract + racket/list + racket/class) + +;; Routine to build the LALR table + + +(define (is-a-grammar%? x) (is-a? x grammar%)) +(provide/contract + (build-table (-> is-a-grammar%? string? any/c + (vectorof (listof (cons/c (or/c term? non-term?) action?)))))) + +;; A parse-table is (vectorof (listof (cons/c gram-sym? action))) +;; A grouped-parse-table is (vectorof (listof (cons/c gram-sym? (listof action)))) + +;; make-parse-table : int -> parse-table +(define (make-parse-table num-states) + (make-vector num-states null)) + +;; table-add!: parse-table nat symbol action -> +(define (table-add! table state-index symbol val) + (vector-set! table state-index (cons (cons symbol val) + (vector-ref table state-index)))) + +;; group-table : parse-table -> grouped-parse-table +(define (group-table table) + (list->vector + (for/list ([state-entry (in-list (vector->list table))]) + (define ht (make-hasheq)) + (for* ([gs/actions (in-list state-entry)] + [group (in-value (hash-ref ht (car gs/actions) (λ () null)))] + #:unless (member (cdr gs/actions) group)) + (hash-set! ht (car gs/actions) (cons (cdr gs/actions) group))) + (hash-map ht cons)))) + +;; table-map : (vectorof (listof (cons/c gram-sym? X))) (gram-sym? X -> Y) -> +;; (vectorof (listof (cons/c gram-sym? Y))) +(define (table-map f table) + (list->vector + (for/list ([state-entry (in-list (vector->list table))]) + (for/list ([gs/X (in-list state-entry)]) + (cons (car gs/X) (f (car gs/X) (cdr gs/X))))))) + +(define (bit-vector-for-each f bv) + (let loop ([bv bv] [number 0]) + (cond + [(zero? bv) (void)] + [(= 1 (bitwise-and 1 bv)) + (f number) + (loop (arithmetic-shift bv -1) (add1 number))] + [else (loop (arithmetic-shift bv -1) (add1 number))]))) + + +;; print-entry: symbol action output-port -> +;; prints the action a for lookahead sym to the given port +(define (print-entry sym a port) + (define s "\t~a\t\t\t\t\t~a\t~a\n") + (cond + [(shift? a) (fprintf port s sym "shift" (shift-state a))] + [(reduce? a) (fprintf port s sym "reduce" (prod-index (reduce-prod a)))] + [(accept? a) (fprintf port s sym "accept" "")] + [(goto? a) (fprintf port s sym "goto" (goto-state a))])) + + +;; count: ('a -> bool) * 'a list -> num +;; counts the number of elements in list that satisfy pred +(define (count pred list) + (cond + [(null? list) 0] + [(pred (car list)) (+ 1 (count pred (cdr list)))] + [else (count pred (cdr list))])) + +;; display-parser: LR0-automaton grouped-parse-table (listof prod?) output-port -> +;; Prints out the parser given by table. +(define (display-parser a grouped-table prods port) + (define SR-conflicts 0) + (define RR-conflicts 0) + (for ([prod (in-list prods)]) + (fprintf port + "~a\t~a\t=\t~a\n" + (prod-index prod) + (gram-sym-symbol (prod-lhs prod)) + (map gram-sym-symbol (vector->list (prod-rhs prod))))) + + (send a for-each-state + (λ (state) + (fprintf port "State ~a\n" (kernel-index state)) + (for ([item (in-list (kernel-items state))]) + (fprintf port "\t~a\n" (item->string item))) + (newline port) + (for ([gs/action (in-list (vector-ref grouped-table (kernel-index state)))]) + (define sym (gram-sym-symbol (car gs/action))) + (define act (cdr gs/action)) + (cond + [(null? act) (void)] + [(null? (cdr act)) + (print-entry sym (car act) port)] + [else + (fprintf port "begin conflict:\n") + (when (> (count reduce? act) 1) + (set! RR-conflicts (add1 RR-conflicts))) + (when (> (count shift? act) 0) + (set! SR-conflicts (add1 SR-conflicts))) + (map (λ (x) (print-entry sym x port)) act) + (fprintf port "end conflict\n")])) + (newline port))) + + (when (> SR-conflicts 0) + (fprintf port "~a shift/reduce conflict~a\n" + SR-conflicts + (if (= SR-conflicts 1) "" "s"))) + (when (> RR-conflicts 0) + (fprintf port "~a reduce/reduce conflict~a\n" + RR-conflicts + (if (= RR-conflicts 1) "" "s")))) + +;; resolve-conflict : (listof action?) -> action? bool bool +(define (resolve-conflict actions) + (cond + [(null? actions) (values (make-no-action) #f #f)] + [(null? (cdr actions)) (values (car actions) #f #f)] + [else + (define SR-conflict? (> (count shift? actions) 0)) + (define RR-conflict? (> (count reduce? actions) 1)) + (let loop ((current-guess #f) + (rest actions)) + (cond + [(null? rest) (values current-guess SR-conflict? RR-conflict?)] + [(shift? (car rest)) (values (car rest) SR-conflict? RR-conflict?)] + [(not current-guess) (loop (car rest) (cdr rest))] + [(and (reduce? (car rest)) + (< (prod-index (reduce-prod (car rest))) + (prod-index (reduce-prod current-guess)))) + (loop (car rest) (cdr rest))] + [(accept? (car rest)) + (eprintf "accept/reduce or accept/shift conflicts. Check the grammar for useless cycles of productions\n") + (loop current-guess (cdr rest))] + [else (loop current-guess (cdr rest))]))])) + +;; resolve-conflicts : grouped-parse-table bool -> parse-table +(define (resolve-conflicts grouped-table suppress) + (define SR-conflicts 0) + (define RR-conflicts 0) + (define table (table-map + (λ (gs actions) + (let-values ([(action SR? RR?) + (resolve-conflict actions)]) + (when SR? + (set! SR-conflicts (add1 SR-conflicts))) + (when RR? + (set! RR-conflicts (add1 RR-conflicts))) + action)) + grouped-table)) + (unless suppress + (when (> SR-conflicts 0) + (eprintf "~a shift/reduce conflict~a\n" + SR-conflicts + (if (= SR-conflicts 1) "" "s"))) + (when (> RR-conflicts 0) + (eprintf "~a reduce/reduce conflict~a\n" + RR-conflicts + (if (= RR-conflicts 1) "" "s")))) + table) + + +;; resolve-sr-conflict : (listof action) (union int #f) -> (listof action) +;; Resolves a single shift-reduce conflict, if precedences are in place. +(define (resolve-sr-conflict/prec actions shift-prec) + (define shift (if (shift? (car actions)) + (car actions) + (cadr actions))) + (define reduce (if (shift? (car actions)) + (cadr actions) + (car actions))) + (define reduce-prec (prod-prec (reduce-prod reduce))) + (cond + [(and shift-prec reduce-prec) + (cond + [(< (prec-num shift-prec) (prec-num reduce-prec)) + (list reduce)] + [(> (prec-num shift-prec) (prec-num reduce-prec)) + (list shift)] + [(eq? 'left (prec-assoc shift-prec)) + (list reduce)] + [(eq? 'right (prec-assoc shift-prec)) + (list shift)] + [else null])] + [else actions])) + + +;; resolve-prec-conflicts : parse-table -> grouped-parse-table +(define (resolve-prec-conflicts table) + (table-map + (λ (gs actions) + (cond + [(and (term? gs) + (= 2 (length actions)) + (or (shift? (car actions)) + (shift? (cadr actions)))) + (resolve-sr-conflict/prec actions (term-prec gs))] + [else actions])) + (group-table table))) + +;; build-table: grammar string bool -> parse-table +(define (build-table g file suppress) + (define a (build-lr0-automaton g)) + (define term-vector (list->vector (send g get-terms))) + (define end-terms (send g get-end-terms)) + (define table (make-parse-table (send a get-num-states))) + (define get-lookahead (compute-LA a g)) + (define reduce-cache (make-hash)) + (for ([trans-key/state (in-list (send a get-transitions))]) + (define from-state-index (kernel-index (trans-key-st (car trans-key/state)))) + (define gs (trans-key-gs (car trans-key/state))) + (define to-state (cdr trans-key/state)) + + (table-add! table from-state-index gs + (cond + ((non-term? gs) + (make-goto (kernel-index to-state))) + ((member gs end-terms) + (make-accept)) + (else + (make-shift + (kernel-index to-state)))))) + (send a for-each-state + (λ (state) + (for ([item (in-list (append (hash-ref (send a get-epsilon-trans) state (λ () null)) + (filter (λ (item) + (not (move-dot-right item))) + (kernel-items state))))]) + (let ([item-prod (item-prod item)]) + (bit-vector-for-each + (λ (term-index) + (unless (start-item? item) + (let ((r (hash-ref reduce-cache item-prod + (λ () + (let ((r (make-reduce item-prod))) + (hash-set! reduce-cache item-prod r) + r))))) + (table-add! table + (kernel-index state) + (vector-ref term-vector term-index) + r)))) + (get-lookahead state item-prod)))))) + + (define grouped-table (resolve-prec-conflicts table)) + (unless (string=? file "") + (with-handlers [(exn:fail:filesystem? + (λ (e) + (eprintf + "Cannot write debug output to file \"~a\": ~a\n" + file + (exn-message e))))] + (call-with-output-file file + (λ (port) + (display-parser a grouped-table (send g get-prods) port)) + #:exists 'truncate))) + (resolve-conflicts grouped-table suppress)) diff --git a/br-parser-tools-lib/br-parser-tools/private-yacc/yacc-helper.rkt b/br-parser-tools-lib/br-parser-tools/private-yacc/yacc-helper.rkt new file mode 100644 index 0000000..5f63471 --- /dev/null +++ b/br-parser-tools-lib/br-parser-tools/private-yacc/yacc-helper.rkt @@ -0,0 +1,71 @@ +#lang racket/base +(require (prefix-in rl: racket/list) + "../private-lex/token-syntax.rkt") + +;; General helper routines +(provide duplicate-list? remove-duplicates overlap? vector-andmap display-yacc) + +(define (vector-andmap pred vec) + (for/and ([item (in-vector vec)]) + (pred vec))) + +;; duplicate-list?: symbol list -> #f | symbol +;; returns a symbol that exists twice in l, or false if no such symbol +;; exists +(define (duplicate-list? syms) + (rl:check-duplicates syms eq?)) + +;; remove-duplicates: syntax-object list -> syntax-object list +;; removes the duplicates from the lists +(define (remove-duplicates syms) + (rl:remove-duplicates syms equal? #:key syntax->datum)) + +;; overlap?: symbol list * symbol list -> #f | symbol +;; Returns an symbol in l1 intersect l2, or #f is no such symbol exists +(define (overlap? syms1 syms2) + (for/first ([sym1 (in-list syms1)] + #:when (memq sym1 syms2)) + sym1)) + + +(define (display-yacc grammar tokens start precs port) + (let-syntax ([p (syntax-rules () + ((_ args ...) (fprintf port args ...)))]) + (let* ([tokens (map syntax-local-value tokens)] + [eterms (filter e-terminals-def? tokens)] + [terms (filter terminals-def? tokens)] + [term-table (make-hasheq)] + [display-rhs + (λ (rhs) + (for ([sym (in-list (car rhs))]) + (p "~a " (hash-ref term-table sym (λ () sym)))) + (when (= 3 (length rhs)) + (p "%prec ~a" (cadadr rhs))) + (p "\n"))]) + (for* ([t (in-list eterms)] + [t (in-list (syntax->datum (e-terminals-def-t t)))]) + (hash-set! term-table t (format "'~a'" t))) + (for* ([t (in-list terms)] + [t (in-list (syntax->datum (terminals-def-t t)))]) + (p "%token ~a\n" t) + (hash-set! term-table t (format "~a" t))) + (when precs + (for ([prec (in-list precs)]) + (p "%~a " (car prec)) + (for ([tok (in-list (cdr prec))]) + (p " ~a" (hash-ref term-table tok))) + (p "\n"))) + (p "%start ~a\n" start) + (p "%%\n") + (for ([prod (in-list grammar)]) + (define nt (car prod)) + (p "~a: " nt) + (display-rhs (cadr prod)) + (for ([rhs (in-list (cddr prod))]) + (p "| ") + (display-rhs rhs)) + (p ";\n")) + (p "%%\n")))) + + + diff --git a/br-parser-tools-lib/br-parser-tools/yacc-to-scheme.rkt b/br-parser-tools-lib/br-parser-tools/yacc-to-scheme.rkt new file mode 100644 index 0000000..d0e97fe --- /dev/null +++ b/br-parser-tools-lib/br-parser-tools/yacc-to-scheme.rkt @@ -0,0 +1,130 @@ +#lang racket/base +(require br-parser-tools/lex + (prefix-in : br-parser-tools/lex-sre) + br-parser-tools/yacc + syntax/readerr + racket/list) +(provide trans) + +(define match-double-string + (lexer + [(:+ (:~ #\" #\\)) (append (string->list lexeme) + (match-double-string input-port))] + [(:: #\\ any-char) (cons (string-ref lexeme 1) (match-double-string input-port))] + [#\" null])) + +(define match-single-string + (lexer + [(:+ (:~ #\' #\\)) (append (string->list lexeme) + (match-single-string input-port))] + [(:: #\\ any-char) (cons (string-ref lexeme 1) (match-single-string input-port))] + [#\' null])) + +(define-lex-abbrevs + [letter (:or (:/ "a" "z") (:/ "A" "Z"))] + [digit (:/ "0" "9")] + [initial (:or letter (char-set "!$%&*/<=>?^_~@"))] + [subsequent (:or initial digit (char-set "+-.@"))] + [comment (:: "/*" (complement (:: any-string "*/" any-string)) "*/")]) + +(define-empty-tokens x (EOF PIPE |:| SEMI |%%| %prec)) +(define-tokens y (SYM STRING)) + +(define get-token-grammar + (lexer-src-pos + ["%%" '|%%|] + [":" (string->symbol lexeme)] + ["%prec" (string->symbol lexeme)] + [#\| 'PIPE] + [(:+ (:or #\newline #\tab " " comment (:: "{" (:* (:~ "}")) "}"))) + (return-without-pos (get-token-grammar input-port))] + [#\; 'SEMI] + [#\' (token-STRING (string->symbol (list->string (match-single-string input-port))))] + [#\" (token-STRING (string->symbol (list->string (match-double-string input-port))))] + [(:: initial (:* subsequent)) (token-SYM (string->symbol lexeme))])) + +(define (parse-grammar enter-term enter-empty-term enter-non-term) + (parser + (tokens x y) + (src-pos) + (error (λ (tok-ok tok-name tok-value start-pos end-pos) + (raise-read-error + (format "Error Parsing YACC grammar at token: ~a with value: ~a" tok-name tok-value) + (file-path) + (position-line start-pos) + (position-col start-pos) + (position-offset start-pos) + (- (position-offset end-pos) (position-offset start-pos))))) + + (end |%%|) + (start gram) + (grammar + (gram + ((production) (list $1)) + ((production gram) (cons $1 $2))) + (production + ((SYM |:| prods SEMI) + (begin + (enter-non-term $1) + (cons $1 $3)))) + (prods + ((rhs) (list `(,$1 #f))) + ((rhs prec) (list `(,$1 ,$2 #f))) + ((rhs PIPE prods) (cons `(,$1 #f) $3)) + ((rhs prec PIPE prods) (cons `(,$1 ,$2 #f) $4))) + (prec + ((%prec SYM) + (begin + (enter-term $2) + (list 'prec $2))) + ((%prec STRING) + (begin + (enter-empty-term $2) + (list 'prec $2)))) + (rhs + (() null) + ((SYM rhs) + (begin + (enter-term $1) + (cons $1 $2))) + ((STRING rhs) + (begin + (enter-empty-term $1) + (cons $1 $2))))))) + +(define (symbolstring a) (symbol->string b))) + +(define (trans filename) + (define i (open-input-file filename)) + (define terms (make-hasheq)) + (define eterms (make-hasheq)) + (define nterms (make-hasheq)) + (define (enter-term s) + (when (not (hash-ref nterms s (λ () #f))) + (hash-set! terms s #t))) + (define (enter-empty-term s) + (when (not (hash-ref nterms s (λ () #f))) + (hash-set! eterms s #t))) + (define (enter-non-term s) + (hash-remove! terms s) + (hash-remove! eterms s) + (hash-set! nterms s #t)) + (port-count-lines! i) + (file-path filename) + (regexp-match "%%" i) + (begin0 + (let ([gram ((parse-grammar enter-term enter-empty-term enter-non-term) + (λ () + (let ((t (get-token-grammar i))) + t)))]) + `(begin + (define-tokens t ,(sort (hash-map terms (λ (k v) k)) symbol +;; (vectorof (symbol runtime-action hashtable)) +(define-for-syntax (convert-parse-table table) + (for/vector ([state-entry (in-vector table)]) + (let ([ht (make-hasheq)]) + (for ([gs/action (in-list state-entry)]) + (hash-set! ht + (gram-sym-symbol (car gs/action)) + (action->runtime-action (cdr gs/action)))) + ht))) + +(define-syntax (parser stx) + (syntax-case stx () + [(_ ARGS ...) + (let ([arg-list (syntax->list #'(ARGS ...))] + [src-pos #f] + [debug #f] + [error #f] + [tokens #f] + [start #f] + [end #f] + [precs #f] + [suppress #f] + [grammar #f] + [yacc-output #f]) + (for ([arg (in-list (syntax->list #'(ARGS ...)))]) + (syntax-case* arg (debug error tokens start end precs grammar + suppress src-pos yacc-output) + (λ (a b) (eq? (syntax-e a) (syntax-e b))) + [(debug FILENAME) + (cond + [(not (string? (syntax-e #'FILENAME))) + (raise-syntax-error #f "Debugging filename must be a string" stx #'FILENAME)] + [debug (raise-syntax-error #f "Multiple debug declarations" stx)] + [else (set! debug (syntax-e #'FILENAME))])] + [(suppress) (set! suppress #t)] + [(src-pos) (set! src-pos #t)] + [(error EXPRESSION) + (if error + (raise-syntax-error #f "Multiple error declarations" stx) + (set! error #'EXPRESSION))] + [(tokens DEF ...) + (begin + (when tokens + (raise-syntax-error #f "Multiple tokens declarations" stx)) + (let ((defs (syntax->list #'(DEF ...)))) + (for ([d (in-list defs)] + #:unless (identifier? d)) + (raise-syntax-error #f "Token-group name must be an identifier" stx d)) + (set! tokens defs)))] + [(start symbol ...) + (let ([symbols (syntax->list #'(symbol ...))]) + (for ([sym (in-list symbols)] + #:unless (identifier? sym)) + (raise-syntax-error #f "Start symbol must be a symbol" stx sym)) + (when start + (raise-syntax-error #f "Multiple start declarations" stx)) + (when (null? symbols) + (raise-syntax-error #f "Missing start symbol" stx arg)) + (set! start symbols))] + [(end SYMBOLS ...) + (let ((symbols (syntax->list #'(SYMBOLS ...)))) + (for ([sym (in-list symbols)] + #:unless (identifier? sym)) + (raise-syntax-error #f "End token must be a symbol" stx sym)) + (let ([d (duplicate-list? (map syntax-e symbols))]) + (when d + (raise-syntax-error #f (format "Duplicate end token definition for ~a" d) stx arg)) + (when (null? symbols) + (raise-syntax-error #f "end declaration must contain at least 1 token" stx arg)) + (when end + (raise-syntax-error #f "Multiple end declarations" stx)) + (set! end symbols)))] + [(precs DECLS ...) + (if precs + (raise-syntax-error #f "Multiple precs declarations" stx) + (set! precs (syntax/loc arg (DECLS ...))))] + [(grammar PRODS ...) + (if grammar + (raise-syntax-error #f "Multiple grammar declarations" stx) + (set! grammar (syntax/loc arg (PRODS ...))))] + [(yacc-output FILENAME) + (cond + [(not (string? (syntax-e #'FILENAME))) + (raise-syntax-error #f "Yacc-output filename must be a string" stx #'FILENAME)] + [yacc-output + (raise-syntax-error #f "Multiple yacc-output declarations" stx)] + [else + (set! yacc-output (syntax-e #'FILENAME))])] + [_ (raise-syntax-error #f "argument must match (debug filename), (error expression), (tokens def ...), (start non-term), (end tokens ...), (precs decls ...), or (grammar prods ...)" stx arg)])) + (unless tokens + (raise-syntax-error #f "missing tokens declaration" stx)) + (unless error + (raise-syntax-error #f "missing error declaration" stx)) + (unless grammar + (raise-syntax-error #f "missing grammar declaration" stx)) + (unless end + (raise-syntax-error #f "missing end declaration" stx)) + (unless start + (raise-syntax-error #f "missing start declaration" stx)) + (let-values ([(table all-term-syms actions check-syntax-fix) + (build-parser (if debug debug "") + src-pos + suppress + tokens + start + end + precs + grammar)]) + (when (and yacc-output (not (string=? yacc-output ""))) + (with-handlers [(exn:fail:filesystem? + (λ (e) (eprintf "Cannot write yacc-output to file \"~a\"\n" yacc-output)))] + (call-with-output-file yacc-output + (λ (port) + (display-yacc (syntax->datum grammar) + tokens + (map syntax->datum start) + (and precs (syntax->datum precs)) + port)) + #:exists 'truncate))) + (with-syntax ([check-syntax-fix check-syntax-fix] + [err error] + [ends end] + [starts start] + [debug debug] + [table (convert-parse-table table)] + [all-term-syms all-term-syms] + [actions actions] + [src-pos src-pos]) + #'(begin + check-syntax-fix + (parser-body debug err (quote starts) (quote ends) table all-term-syms actions src-pos)))))] + [_ (raise-syntax-error #f "parser must have the form (parser args ...)" stx)])) + +(define (reduce-stack stack num ret-vals src-pos) + (cond + [(positive? num) + (define top-frame (car stack)) + (let ([ret-vals (if src-pos + (cons (stack-frame-value top-frame) + (cons (stack-frame-start-pos top-frame) + (cons (stack-frame-end-pos top-frame) + ret-vals))) + (cons (stack-frame-value top-frame) ret-vals))]) + (reduce-stack (cdr stack) (sub1 num) ret-vals src-pos))] + [else (values stack ret-vals)])) + +;; extract-helper : (symbol or make-token) any any -> symbol any any any +(define (extract-helper tok v1 v2) + (cond + [(symbol? tok) (values tok #f v1 v2)] + [(token? tok) (values (real-token-name tok) (real-token-value tok) v1 v2)] + [else (raise-argument-error 'parser "(or/c symbol? token?)" 0 tok)])) + +;; well-formed-position-token?: any -> boolean +;; Returns true if pt is a position token whose position-token-token +;; is itself a token or a symbol. +;; This is meant to help raise more precise error messages when +;; a tokenizer produces an erroneous position-token wrapped twice. +;; (as often happens when omitting return-without-pos). +(define (well-formed-token-field? t) + (or (symbol? t) (token? t))) + +(define (well-formed-position-token? pt) + (and (position-token? pt) + (well-formed-token-field? (position-token-token pt)))) + +(define (well-formed-srcloc-token? st) + (and (srcloc-token? st) + (well-formed-token-field? (srcloc-token-token st)))) + +;; extract-src-pos : position-token -> symbol any any any +(define (extract-src-pos ip) + (unless (well-formed-position-token? ip) + (raise-argument-error 'parser "well-formed-position-token?" 0 ip)) + (extract-helper (position-token-token ip) + (position-token-start-pos ip) + (position-token-end-pos ip))) + +(define (extract-srcloc ip) + (unless (well-formed-srcloc-token? ip) + (raise-argument-error 'parser "well-formed-srcloc-token?" 0 ip)) + (define loc (srcloc-token-srcloc ip)) + (extract-helper (srcloc-token-token ip) + (position-token (srcloc-position loc) (srcloc-line loc) (srcloc-column loc)) + (position-token (+ (srcloc-position loc) (srcloc-span loc)) #f #f))) + + +;; extract-no-src-pos : (symbol or make-token) -> symbol any any any +(define (extract-no-src-pos ip) + (extract-helper ip #f #f)) + +(define-struct stack-frame (state value start-pos end-pos) #:inspector (make-inspector)) + +(define (make-empty-stack i) (list (make-stack-frame i #f #f #f))) + + +;; The table is a vector that maps each state to a hash-table that maps a +;; terminal symbol to either an accept, shift, reduce, or goto structure. +; We encode the structures according to the runtime-action data definition in +;; parser-actions.rkt +(define (parser-body debug? err starts ends table all-term-syms actions src-pos) + (local ((define extract + (if src-pos + extract-src-pos + extract-no-src-pos)) + + (define (fix-error stack tok val start-pos end-pos get-token) + (when debug? (pretty-print stack)) + (local ((define (remove-input tok val start-pos end-pos) + (if (memq tok ends) + (raise-read-error "parser: Cannot continue after error" + #f #f #f #f #f) + (let ([a (find-action stack tok val start-pos end-pos)]) + (cond + [(runtime-shift? a) + ;; (printf "shift:~a\n" (runtime-shift-state a)) + (cons (make-stack-frame (runtime-shift-state a) + val + start-pos + end-pos) + stack)] + [else + ;; (printf "discard input:~a\n" tok) + (let-values ([(tok val start-pos end-pos) + (extract (get-token))]) + (remove-input tok val start-pos end-pos))]))))) + (let remove-states () + (let ([a (find-action stack 'error #f start-pos end-pos)]) + (cond + [(runtime-shift? a) + ;; (printf "shift:~a\n" (runtime-shift-state a)) + (set! stack + (cons + (make-stack-frame (runtime-shift-state a) + #f + start-pos + end-pos) + stack)) + (remove-input tok val start-pos end-pos)] + [else + ;; (printf "discard state:~a\n" (car stack)) + (cond + [(< (length stack) 2) + (raise-read-error "parser: Cannot continue after error" + #f #f #f #f #f)] + [else + (set! stack (cdr stack)) + (remove-states)])]))))) + + (define (find-action stack tok val start-pos end-pos) + (unless (hash-ref all-term-syms tok #f) + (if src-pos + (err #f tok val start-pos end-pos) + (err #f tok val)) + (raise-read-error (format "parser: got token of unknown type ~a" tok) + #f #f #f #f #f)) + (hash-ref (vector-ref table (stack-frame-state (car stack))) tok #f)) + + (define ((make-parser start-number) get-token) + (unless (and (procedure? get-token) + (procedure-arity-includes? get-token 0)) + (error 'get-token "expected a nullary procedure, got ~e" get-token)) + (let parsing-loop ([stack (make-empty-stack start-number)] + [ip (get-token)]) + (let-values ([(tok val start-pos end-pos) (extract ip)]) + (let ([action (find-action stack tok val start-pos end-pos)]) + (cond + [(runtime-shift? action) + ;; (printf "shift:~a\n" (runtime-shift-state action)) + (parsing-loop (cons (make-stack-frame (runtime-shift-state action) + val + start-pos + end-pos) + stack) + (get-token))] + [(runtime-reduce? action) + ;; (printf "reduce:~a\n" (runtime-reduce-prod-num action)) + (let-values ([(new-stack args) + (reduce-stack stack + (runtime-reduce-rhs-length action) + null + src-pos)]) + (let ([goto + (runtime-goto-state + (hash-ref + (vector-ref table (stack-frame-state (car new-stack))) + (runtime-reduce-lhs action)))]) + (parsing-loop + (cons + (if src-pos + (make-stack-frame + goto + (apply (vector-ref actions (runtime-reduce-prod-num action)) args) + (if (null? args) start-pos (cadr args)) + (if (null? args) + end-pos + (list-ref args (- (* (runtime-reduce-rhs-length action) 3) 1)))) + (make-stack-frame + goto + (apply (vector-ref actions (runtime-reduce-prod-num action)) args) + #f + #f)) + new-stack) + ip)))] + [(runtime-accept? action) + ;; (printf "accept\n") + (stack-frame-value (car stack))] + [else + (if src-pos + (err #t tok val start-pos end-pos) + (err #t tok val)) + (parsing-loop (fix-error stack tok val start-pos end-pos get-token) + (get-token))])))))) + (cond + [(null? (cdr starts)) (make-parser 0)] + [else + (for/list ([(l i) (in-indexed starts)]) + (make-parser i))]))) diff --git a/br-parser-tools-lib/info.rkt b/br-parser-tools-lib/info.rkt new file mode 100644 index 0000000..1e24a13 --- /dev/null +++ b/br-parser-tools-lib/info.rkt @@ -0,0 +1,9 @@ +#lang info + +(define collection 'multi) +(define deps '("scheme-lib" + "base" + "compatibility-lib")) +(define build-deps '("rackunit-lib")) + +(define pkg-desc "implementation (no documentation) part of \"br-parser-tools\"") diff --git a/br-parser-tools/LICENSE.txt b/br-parser-tools/LICENSE.txt new file mode 100644 index 0000000..d012f58 --- /dev/null +++ b/br-parser-tools/LICENSE.txt @@ -0,0 +1,11 @@ +parser-tools +Copyright (c) 2010-2014 PLT Design Inc. + +This package is distributed under the GNU Lesser General Public +License (LGPL). This means that you can link this package into proprietary +applications, provided you follow the rules stated in the LGPL. You +can also modify this package; if you distribute a modified version, +you must distribute it under the terms of the LGPL, which in +particular means that you must release the source code for the +modified software. See http://www.gnu.org/copyleft/lesser.html +for more information. diff --git a/br-parser-tools/info.rkt b/br-parser-tools/info.rkt new file mode 100644 index 0000000..6a692a8 --- /dev/null +++ b/br-parser-tools/info.rkt @@ -0,0 +1,12 @@ +#lang info + +(define collection 'multi) + +(define deps '("br-parser-tools-lib" + "br-parser-tools-doc")) +(define implies '("br-parser-tools-lib" + "br-parser-tools-doc")) + +(define pkg-desc "Lex- and Yacc-style parsing tools") + +(define pkg-authors '(mflatt))