From 4d349837a605b4345b22da107a8daeb5da0cc695 Mon Sep 17 00:00:00 2001 From: Matthew Butterick Date: Mon, 11 Jun 2018 11:00:40 -0700 Subject: [PATCH] Add curly quantifier notation --- brag/brag.scrbl | 65 +++++++++++++----------- brag/codegen/codegen.rkt | 6 +-- brag/codegen/flatten.rkt | 78 ++++++++++++++++------------- brag/examples/curly-quantifier.rkt | 7 +++ brag/private/colorer.rkt | 2 +- brag/rules/lexer.rkt | 7 +-- brag/rules/parser.rkt | 22 ++++++-- brag/rules/rule-structs.rkt | 3 +- brag/rules/stx.rkt | 4 +- brag/test/test-curly-quantifier.rkt | 23 +++++++++ info.rkt | 11 ++-- 11 files changed, 147 insertions(+), 81 deletions(-) create mode 100644 brag/examples/curly-quantifier.rkt create mode 100755 brag/test/test-curly-quantifier.rkt diff --git a/brag/brag.scrbl b/brag/brag.scrbl index ee720a1..1ca918e 100755 --- a/brag/brag.scrbl +++ b/brag/brag.scrbl @@ -231,7 +231,7 @@ We write a @tt{brag} program as an BNF grammar, where patterns can be: @itemize[ @item{the names of other rules (e.g. @racket[chunk])} @item{literal and symbolic token names (e.g. @racket[";"], @racket[INTEGER])} - @item{quantified patterns (e.g. @litchar{+} to represent one-or-more repetitions)} + @item{quantified patterns (e.g. @litchar{+} to represent one or more repetitions)} ] The result of a @tt{brag} program is a module with a @racket[parse] function that can parse tokens and produce a syntax object as a result. @@ -661,35 +661,41 @@ A program in the @tt{brag} language consists of the language line @litchar{#lang brag}, followed by a collection of @tech{rule}s and @tech{line comment}s. -A @deftech{rule} is a sequence consisting of: a @tech{rule identifier}, a colon -@litchar{":"}, and a @tech{pattern}. +A @deftech{rule} is a sequence consisting of: a @tech{rule identifier}, a separator (either @litchar{":"} or @litchar{"::="}), and a @tech{pattern}. A @deftech{rule identifier} is an @tech{identifier} that is not in upper case. A @deftech{symbolic token identifier} is an @tech{identifier} that is in upper case. -An @deftech{identifier} is a character sequence of letters, numbers, and -characters in @racket["-.!$%&/<=>?^_~@"]. It must not contain -@litchar{*} or @litchar{+}, as those characters are used to denote -quantification. +An @deftech{identifier} is a sequence of letters, numbers, or +characters in the set @racket["-.!$%&/<=>?^_~@"]. It must not contain +@litchar{*}, @litchar{+}, or @litchar|{{}| and @litchar|{}}|, as those characters are used to denote quantification. A @deftech{pattern} is one of the following: @itemize[ @item{an implicit sequence of @tech{pattern}s separated by whitespace} - @item{a terminal: either a literal string or a @tech{symbolic token identifier}. + @item{a @deftech{terminal}: either a literal string or a @tech{symbolic token identifier}. - When used in a pattern, both these terminals will match the same set of inputs. A literal string can match the string itself, or a @racket[token] structure whose type field contains that string (or its symbol form). So @racket["FOO"] would match @racket["FOO"], @racket[(token "FOO" "bar")], or @racket[(token 'FOO "bar")]. A symbolic token identifier can also match the string version of the identifier, or a @racket[token] whose type field is the symbol or string form of the identifier. So @racket[FOO] would also match @racket["FOO"], @racket[(token 'FOO "bar")], or @racket[(token "FOO" "bar")]. (In every case, the value of a token, like @racket["bar"], can be anything, and may or may not be the same as its type.) + When used in a pattern, both kinds of terminals will match the same set of inputs. - Because their underlying meanings are the same, the symbolic token identifier ends up being a notational convenience for readability inside a grammar pattern. Typically, the literal string @racket["FOO"] is used to connote ``match the string @racket["FOO"] exactly'' and the symbolic token identifier @racket[FOO] specially connotes ``match any token of type @racket['FOO]''. + A literal string can match the string itself, or a @racket[token] structure whose type field contains that string (or its symbol form). So @racket["FOO"] in a rule pattern would match the tokens @racket["FOO"], @racket[(token "FOO" "bar")], or @racket[(token 'FOO "bar")]. + + A symbolic token identifier can also match the string version of the identifier, or a @racket[token] whose type field is the symbol or string form of the identifier. So @racket[FOO] in a rule pattern would @emph{also} match the tokens @racket["FOO"], @racket[(token 'FOO "bar")], or @racket[(token "FOO" "bar")]. (In every case, the value of a token, like @racket["bar"], can be anything, and may or may not be the same as its type.) + + Because their underlying meanings are the same, the symbolic token identifier ends up being a notational convenience for readability inside a rule pattern. Typically, the literal string @racket["FOO"] is used to connote ``match the string @racket["FOO"] exactly'' and the symbolic token identifier @racket[FOO] specially connotes ``match a token of type @racket['FOO]''. You @bold{cannot} use the literal string @racket["error"] as a terminal in a grammar, because it's reserved for @tt{brag}. You can, however, adjust your lexer to package it inside a token structure — say, @racket[(token 'ERROR "error")] — and then use the symbolic token identifier @racket[ERROR] in the grammar to match this token structure. } @item{a @tech{rule identifier}} + @item{a @deftech{choice pattern}: a sequence of @tech{pattern}s delimited with @litchar{|} characters.} - @item{a @deftech{quantifed pattern}: a @tech{pattern} followed by either @litchar{*} (``zero or more'') or @litchar{+} (``one or more'')} + + @item{a @deftech{quantified pattern}: a @tech{pattern} followed by either @litchar{*} (``zero or more'') or @litchar{+} (``one or more''). Quantification can also be denoted by integers within curly brackets. So @litchar|{{2}}| means ``exactly 2''; @litchar|{{2,5}}| means ``between 2 and 5, inclusive''; @litchar|{{2,}}| means ``2 or more''; and @litchar|{{,5}}| means ``up to 5''.} + @item{an @deftech{optional pattern}: a @tech{pattern} surrounded by @litchar{[} and @litchar{]}} + @item{an explicit sequence: a @tech{pattern} surrounded by @litchar{(} and @litchar{)}}] A @deftech{line comment} begins with either @litchar{#} or @litchar{;} and @@ -931,7 +937,7 @@ bindings. The most important of these is @racket[parse]: @item{For terminals, the value of the token.} @item{For @tech{rule identifier}s: the associated parse value for the rule.} @item{For @tech{choice pattern}s: the associated parse value for one of the matching subpatterns.} - @item{For @tech{quantifed pattern}s and @tech{optional pattern}s: the corresponding values, spliced into the structure.} + @item{For @tech{quantified pattern}s and @tech{optional pattern}s: the corresponding values, spliced into the structure.} ] Consequently, it's only the presence of @tech{rule identifier}s in a rule's @@ -1090,44 +1096,45 @@ In addition to the exports shown below, the @racketmodname[brag/support] module @defform[(:* re ...)]{ - Repetition of @racket[re] sequence 0 or more times.} +0 or more occurrences of any @racket[re] pattern.} @defform[(:+ re ...)]{ - Repetition of @racket[re] sequence 1 or more times.} +1 or more occurrences of any @racket[re] pattern.} @defform[(:? re ...)]{ - Zero or one occurrence of @racket[re] sequence.} +0 or 1 occurrence of any @racket[re] pattern.} @defform[(:= n re ...)]{ - Exactly @racket[n] occurrences of @racket[re] sequence, where - @racket[n] must be a literal exact, non-negative number.} +Exactly @racket[n] occurrences of any @racket[re] pattern, where +@racket[n] must be a literal exact, non-negative number.} @defform[(:>= n re ...)]{ - At least @racket[n] occurrences of @racket[re] sequence, where - @racket[n] must be a literal exact, non-negative number.} +At least @racket[n] occurrences of any @racket[re] pattern, where +@racket[n] must be a literal exact, non-negative number.} @defform[(:** n m re ...)]{ - Between @racket[n] and @racket[m] (inclusive) occurrences of - @racket[re] sequence, where @racket[n] must be a literal exact, - non-negative number, and @racket[m] must be literally either - @racket[#f], @racket[+inf.0], or an exact, non-negative number; a - @racket[#f] value for @racket[m] is the same as @racket[+inf.0].} +Between @racket[n] and @racket[m] (inclusive) occurrences of +any @racket[re] pattern, where @racket[n] must be a literal exact, +non-negative number, and @racket[m] must be literally either +@racket[#f], @racket[+inf.0], or an exact, non-negative number; a +@racket[#f] value for @racket[m] is the same as @racket[+inf.0].} @defform[(:or re ...)]{ - Same as @racket[(union re ...)].} +Same as @racket[(:union re ...)].} @deftogether[( - @defform[(:: re ...)] - @defform[(:seq re ...)] - )]{ +@defform[(:: re ...)] +@defform[(:seq re ...)] +)]{ - Both forms concatenate the @racket[re]s.} +Both forms concatenate the @racket[re]s into a single, indivisible pattern. +In other words, this matches @emph{all} the @racket[re]s in order, whereas @racket[(:union re ...)] matches @emph{any} of the @racket[re]s.} @defform[(:& re ...)]{ diff --git a/brag/codegen/codegen.rkt b/brag/codegen/codegen.rkt index 14d6697..e2381f8 100755 --- a/brag/codegen/codegen.rkt +++ b/brag/codegen/codegen.rkt @@ -301,7 +301,7 @@ [explicit explicit]) ([v (in-list (syntax->list #'(vals ...)))]) (loop v implicit explicit))] - [(repeat min val) + [(repeat min max val) (loop #'val implicit explicit)] [(maybe val) (loop #'val implicit explicit)] @@ -379,7 +379,7 @@ (for/fold ([acc acc]) ([v (in-list (syntax->list #'(vals ...)))]) (loop v acc))] - [(repeat min val) + [(repeat min max val) (loop #'val acc)] [(maybe val) (loop #'val acc)] @@ -425,7 +425,7 @@ (define a-child (process-pattern v)) (sat:add-child! an-or-node a-child)) an-or-node)] - [(repeat min val) + [(repeat min max val) (syntax-case #'min () [0 (make-leaf)] diff --git a/brag/codegen/flatten.rkt b/brag/codegen/flatten.rkt index ad3ecfc..74f8a0a 100755 --- a/brag/codegen/flatten.rkt +++ b/brag/codegen/flatten.rkt @@ -1,7 +1,7 @@ #lang racket/base (require brag/rules/stx-types + racket/list (for-syntax racket/base)) - (provide flatten-rule flatten-rules prim-rule) @@ -68,67 +68,75 @@ (values (apply append (reverse rules)) (apply append (reverse patterns)))) - (with-syntax ([head (if inferred? #'inferred-prim-rule #'prim-rule)] - [origin (syntax-case a-rule (rule) [(rule name (pat-head rest ...)) #'pat-head])]) + (with-syntax ([HEAD (if inferred? #'inferred-prim-rule #'prim-rule)] + [ORIGIN (syntax-case a-rule (rule) [(rule name (pat-head rest ...)) #'pat-head])]) (syntax-case a-rule (rule) - [(rule name pat) - (syntax-case #'pat (id inferred-id lit token choice repeat maybe seq) + [(rule NAME PAT) + (syntax-case #'PAT (id inferred-id lit token choice repeat maybe seq) ;; The primitive types stay as they are: [(id val) - (list #'(head origin name [pat]))] + (list #'(HEAD ORIGIN NAME [PAT]))] [(inferred-id val reason) - (list #'(head origin name [pat]))] + (list #'(HEAD ORIGIN NAME [PAT]))] [(lit val) - (list #'(head origin name [pat]))] + (list #'(HEAD ORIGIN NAME [PAT]))] [(token val) - (list #'(head origin name [pat]))] + (list #'(HEAD ORIGIN NAME [PAT]))] ;; Everything else might need lifting: - [(choice sub-pat ...) + [(choice SUB-PAT ...) (begin (define-values (inferred-ruless/rev new-sub-patss/rev) (for/fold ([rs '()] [ps '()]) - ([p (syntax->list #'(sub-pat ...))]) + ([p (syntax->list #'(SUB-PAT ...))]) (let-values ([(new-r new-p) (lift-nonprimitive-pattern p)]) (values (cons new-r rs) (cons new-p ps))))) - (with-syntax ([((sub-pat ...) ...) (reverse new-sub-patss/rev)]) - (append (list #'(head origin name [sub-pat ...] ...)) + (with-syntax ([((SUB-PAT ...) ...) (reverse new-sub-patss/rev)]) + (append (list #'(HEAD ORIGIN NAME [SUB-PAT ...] ...)) (apply append (reverse inferred-ruless/rev)))))] - [(repeat min sub-pat) + [(repeat MIN #f SUB-PAT) + ;; indefinite repeat (begin (define-values (inferred-rules new-sub-pats) - (lift-nonprimitive-pattern #'sub-pat)) - (with-syntax ([(sub-pat ...) new-sub-pats]) - (cons (cond [(= (syntax-e #'min) 0) - #`(head origin name - [(inferred-id name repeat) sub-pat ...] - [])] - [(= (syntax-e #'min) 1) - #`(head origin name - [(inferred-id name repeat) sub-pat ...] - [sub-pat ...])]) + (lift-nonprimitive-pattern #'SUB-PAT)) + (with-syntax ([(SUB-PAT ...) new-sub-pats] + [MIN-REPEAT-SUB-PATS (apply append (make-list (syntax-e #'MIN) new-sub-pats))]) + (cons #`(HEAD ORIGIN NAME + [(inferred-id NAME repeat) SUB-PAT ...] + MIN-REPEAT-SUB-PATS) inferred-rules)))] - - [(maybe sub-pat) + + [(repeat MIN MAX SUB-PAT) + ;; finite repeat + (begin + (define min (syntax-e #'MIN)) + (define max (syntax-e #'MAX)) + (define new-rule-stx (with-syntax ([(MIN-SUBPAT ...) (make-list min #'SUB-PAT)] + [(EXTRA-SUBPAT ...) (make-list (- max min) #'SUB-PAT)]) + ;; has to keep the same name to work correctly + #'(rule NAME (seq MIN-SUBPAT ... (maybe EXTRA-SUBPAT) ...)))) + (recur new-rule-stx #f))] + + [(maybe SUB-PAT) (begin (define-values (inferred-rules new-sub-pats) - (lift-nonprimitive-pattern #'sub-pat)) - (with-syntax ([(sub-pat ...) new-sub-pats]) - (cons #'(head origin name - [sub-pat ...] + (lift-nonprimitive-pattern #'SUB-PAT)) + (with-syntax ([(SUB-PAT ...) new-sub-pats]) + (cons #'(HEAD ORIGIN NAME + [SUB-PAT ...] []) inferred-rules)))] - [(seq sub-pat ...) + [(seq SUB-PAT ...) (begin (define-values (inferred-rules new-sub-pats) - (lift-nonprimitive-patterns (syntax->list #'(sub-pat ...)))) - (with-syntax ([(sub-pat ...) new-sub-pats]) - (cons #'(head origin name [sub-pat ...]) + (lift-nonprimitive-patterns (syntax->list #'(SUB-PAT ...)))) + (with-syntax ([(SUB-PAT ...) new-sub-pats]) + (cons #'(HEAD ORIGIN NAME [SUB-PAT ...]) inferred-rules)))])])))) @@ -162,7 +170,7 @@ #t] [(choice sub-pat ...) #f] - [(repeat min val) + [(repeat min max val) #f] [(maybe sub-pat) #f] diff --git a/brag/examples/curly-quantifier.rkt b/brag/examples/curly-quantifier.rkt new file mode 100644 index 0000000..7f0d961 --- /dev/null +++ b/brag/examples/curly-quantifier.rkt @@ -0,0 +1,7 @@ +#lang brag +;; test the curly quantifier +start : a-rule | b-rule | c-rule | d-rule +a-rule : "a"{2} ; exactly 2 +b-rule : "b"{,2} ; up to 2 +c-rule : "c"{2,} ; 2 or more +d-rule : "d"{2,3} ; 2 or 3 \ No newline at end of file diff --git a/brag/private/colorer.rkt b/brag/private/colorer.rkt index 8b71244..eb2449a 100644 --- a/brag/private/colorer.rkt +++ b/brag/private/colorer.rkt @@ -9,7 +9,7 @@ ;; (for DrRacket selections etc) [whitespace (token 'WHITE lexeme)] [(:or (from/to "'" "'") (from/to "\"" "\"")) (token 'LIT lexeme)] - [(:or (char-set "()[]|+*:") hide-char splice-char) (token 'MISC lexeme)] + [(:or (char-set "()[]{}|+*:") hide-char splice-char) (token 'MISC lexeme)] [(:seq (:or "#" ";") (complement (:seq (:* any-char) NL (:* any-char))) (:or NL "")) (token 'COMMENT lexeme)] [id (token 'ID lexeme)] [any-char (token 'OTHER lexeme)])) diff --git a/brag/rules/lexer.rkt b/brag/rules/lexer.rkt index 8665142..9aaec3d 100755 --- a/brag/rules/lexer.rkt +++ b/brag/rules/lexer.rkt @@ -13,8 +13,8 @@ ;; A newline can be any one of the following. (define-lex-abbrev NL (:or "\r\n" "\r" "\n")) -;; chars used for quantifiers & parse-tree filtering -(define-for-syntax quantifiers "+:*") ; colon is reserved to separate rules and productions +;; reserved-chars = chars used for quantifiers & parse-tree filtering +(define-for-syntax quantifiers "+:*{}") ; colon is reserved to separate rules and productions (define-lex-trans reserved-chars (λ(stx) #`(char-set #,(format "~a~a~a" quantifiers hide-char splice-char)))) @@ -64,7 +64,8 @@ (token-SPLICE lexeme)] ["|" (token-PIPE lexeme)] - [(:or "+" "*") + [(:or "+" "*" + (:: "{" (:* digit) (:? (:: "," (:* digit))) "}")) (token-REPEAT lexeme)] [whitespace ;; Skip whitespace diff --git a/brag/rules/parser.rkt b/brag/rules/parser.rkt index 9d25117..f51f9f6 100755 --- a/brag/rules/parser.rkt +++ b/brag/rules/parser.rkt @@ -157,11 +157,25 @@ (cond [(string=? $2 "*") (pattern-repeat (position->pos $1-start-pos) (position->pos $2-end-pos) - 0 $1)] + 0 #f $1)] [(string=? $2 "+") (pattern-repeat (position->pos $1-start-pos) (position->pos $2-end-pos) - 1 $1)] + 1 #f $1)] + [(regexp-match #px"^\\{(\\d+)?(,)?(\\d+)?\\}$" $2) ; "{min,max}" with both min & max optional + => (λ (m) + (match-define (list min-repeat max-repeat) + (match m + [(list _ min range? max) (let ([min (if min (string->number min) 0)]) + (list + min + (cond + [(and range? max) (string->number max)] + [(and (not range?) (not max)) min] ; {3} -> {3,3} + [(not max) #f])))])) + (pattern-repeat (position->pos $1-start-pos) + (position->pos $2-end-pos) + min-repeat max-repeat $1))] [else (error 'grammar-parse "unknown repetition operator ~e" $2)])] [(atomic-pattern) @@ -222,8 +236,8 @@ (pattern-lit start-pos end-pos v (or hide? h))] [(pattern-choice _ _ vs) (pattern-choice start-pos end-pos vs)] - [(pattern-repeat _ _ m v) - (pattern-repeat start-pos end-pos m v)] + [(pattern-repeat _ _ min max v) + (pattern-repeat start-pos end-pos min max v)] [(pattern-maybe _ _ v) (pattern-maybe start-pos end-pos v)] [(pattern-seq _ _ vs) diff --git a/brag/rules/rule-structs.rkt b/brag/rules/rule-structs.rkt index 5b5968e..262de1b 100755 --- a/brag/rules/rule-structs.rkt +++ b/brag/rules/rule-structs.rkt @@ -31,7 +31,8 @@ (struct pattern-choice pattern (vals) #:transparent) -(struct pattern-repeat pattern (min ;; either 0 or 1 +(struct pattern-repeat pattern (min + max val) #:transparent) diff --git a/brag/rules/stx.rkt b/brag/rules/stx.rkt index 9c66685..9d5f5dd 100755 --- a/brag/rules/stx.rkt +++ b/brag/rules/stx.rkt @@ -80,8 +80,8 @@ 'hide hide)] [(struct pattern-choice (start end vals)) (datum->syntax #f`(choice ,@(map recur vals)) source-location)] - [(struct pattern-repeat (start end min val)) - (datum->syntax #f`(repeat ,min ,(recur val)) source-location)] + [(struct pattern-repeat (start end min max val)) + (datum->syntax #f`(repeat ,min ,max ,(recur val)) source-location)] [(struct pattern-maybe (start end val)) (datum->syntax #f`(maybe ,(recur val)) source-location)] [(struct pattern-seq (start end vals)) diff --git a/brag/test/test-curly-quantifier.rkt b/brag/test/test-curly-quantifier.rkt new file mode 100755 index 0000000..c531368 --- /dev/null +++ b/brag/test/test-curly-quantifier.rkt @@ -0,0 +1,23 @@ +#lang racket/base +(require brag/examples/curly-quantifier + brag/support + rackunit) + +(check-exn exn:fail:parsing? (λ () (parse-to-datum "a"))) +(check-equal? (parse-to-datum "aa") '(start (a-rule "a" "a"))) +(check-exn exn:fail:parsing? (λ () (parse-to-datum "aaa"))) + +(check-equal? (parse-to-datum "") '(start (b-rule))) +(check-equal? (parse-to-datum "b") '(start (b-rule "b"))) +(check-equal? (parse-to-datum "bb") '(start (b-rule "b" "b"))) +(check-exn exn:fail:parsing? (λ () (parse-to-datum "bbb"))) + +(check-exn exn:fail:parsing? (λ () (parse-to-datum "c"))) +(check-equal? (parse-to-datum "cc") '(start (c-rule "c" "c"))) +(check-equal? (parse-to-datum "ccc") '(start (c-rule "c" "c" "c"))) +(check-equal? (parse-to-datum "cccc") '(start (c-rule "c" "c" "c" "c"))) + +(check-exn exn:fail:parsing? (λ () (parse-to-datum "d"))) +(check-equal? (parse-to-datum "dd") '(start (d-rule "d" "d"))) +(check-equal? (parse-to-datum "ddd") '(start (d-rule "d" "d" "d"))) +(check-exn exn:fail:parsing? (λ () (parse-to-datum "dddd"))) diff --git a/info.rkt b/info.rkt index 000d84a..0b408bf 100755 --- a/info.rkt +++ b/info.rkt @@ -1,8 +1,13 @@ #lang info (define version "1.0") -(define deps '("base" "br-parser-tools-lib" "rackunit-lib")) -(define build-deps '("at-exp-lib" "br-parser-tools-doc" "racket-doc" - "scribble-lib")) (define collection 'multi) +(define deps '("base" + "br-parser-tools-lib" + "rackunit-lib")) +(define build-deps '("at-exp-lib" + "br-parser-tools-doc" + "racket-doc" + "scribble-lib")) +(define update-implies '("br-parser-tools-lib"))