support empty-set notation

7 years ago · de1c42d4f9
parent cf5e686ea6
commit de1c42d4f9
6 changed files with 49 additions and 6 deletions
--- a/brag/brag.scrbl
+++ b/brag/brag.scrbl
@ -673,6 +673,9 @@ A @deftech{rule identifier} is an @tech{identifier} that is not in upper case.

 A @deftech{symbolic token identifier} is an @tech{identifier} that is in upper case.

+A @deftech{line comment} begins with either @litchar{#} or @litchar{;} and
+continues until the end of the line.
+
 An @deftech{identifier} is a sequence of letters, numbers, or
 characters in the set @racket["-.!$%&/<=>?^_~@"]. It must not contain
@litchar{*}, @litchar{+}, or @litchar|{{}| and @litchar|{}}|, as those characters are used to denote quantification.
@ -680,7 +683,8 @@ characters in the set @racket["-.!$%&/<=>?^_~@"]. It must not contain

 A @deftech{pattern} is one of the following:
@itemize[
- @item{an implicit sequence of @tech{pattern}s separated by whitespace}
+ @item{an implicit sequence of @tech{pattern}s separated by whitespace.}
+
 @item{a @deftech{terminal}: either a literal string or a @tech{symbolic token identifier}. 

  When used in a pattern, both kinds of terminals will match the same set of inputs. 
@ -694,7 +698,7 @@ A @deftech{pattern} is one of the following:
  You @bold{cannot} use the literal string @racket["error"] as a terminal in a grammar, because it's reserved for @tt{brag}. You can, however, adjust your lexer to package it inside a token structure — say, @racket[(token 'ERROR "error")] — and then use the symbolic token identifier @racket[ERROR] in the grammar to match this token structure.
 }

- @item{a @tech{rule identifier}}
+ @item{a @tech{rule identifier}.}
 
 @item{a @deftech{choice pattern}: a sequence of @tech{pattern}s delimited with @litchar{|} characters.}

@ -702,10 +706,10 @@ A @deftech{pattern} is one of the following:
 
 @item{an @deftech{optional pattern}: a @tech{pattern} surrounded by @litchar{[} and @litchar{]}. (The @litchar{?} zero-or-one quantifier means the same thing.)}
 
- @item{an explicit sequence: a @tech{pattern} surrounded by @litchar{(} and @litchar{)}}]
+ @item{an explicit @deftech{sequence}: a @tech{pattern} surrounded by @litchar{(} and @litchar{)}.}

-A @deftech{line comment} begins with either @litchar{#} or @litchar{;} and
-continues till the end of the line.
+ @item{the @deftech{empty set}: a special @tech{pattern} that matches a sequence of zero tokens. When it appears on the right side of a rule, the empty set matches empty input (which trivially contains zero tokens) as well as the ``gap'' between two adjacent tokens (which, less obviously, also contains zero tokens). The empty set can be written as @litchar{[]} (empty square brackets, with no whitespace between them), @litchar{∅} (the Unicode empty-set character), or @litchar{Ø} (the Latin capital letter O with stroke).}
+ ]


 For example, in the following program:
--- a/brag/examples/empty-symbol.rkt
+++ b/brag/examples/empty-symbol.rkt
@ -0,0 +1,6 @@
+#lang brag
+
+top : xs | ys | zs
+xs : [] | "x" xs
+ys : Ø | "y" /ys
+zs : ∅ | "z" @zs
--- a/brag/rules/lexer.rkt
+++ b/brag/rules/lexer.rkt
@ -56,6 +56,7 @@
        (:or (:* (:or "\\\"" esc-chars (:~ "\"" "\\"))) "\\\\")
        "\"")
    (token-LIT (unescape-lexeme lexeme #\"))]
+   [(:or "[]" "Ø" "∅") (token-EMPTY lexeme)]
   ["("
    (token-LPAREN lexeme)]
   ["["
--- a/brag/rules/parser.rkt
+++ b/brag/rules/parser.rkt
@ -24,6 +24,7 @@
         token-ID
         token-LIT
         token-EOF
+         token-EMPTY
         grammar-parser
         
         current-source
@ -53,7 +54,8 @@
                       RULE_HEAD_SPLICED
                       ID
                       LIT
-                       EOF))
+                       EOF
+                       EMPTY))

 (define hide-char #\/)
 (define splice-char #\@)
@ -198,6 +200,14 @@
                      (position->pos $1-end-pos)
                      $1
                      #f))]
+
+     [(EMPTY)
+      (pattern-repeat (position->pos $1-start-pos)
+                      (position->pos $1-end-pos)
+                      0 0 (pattern-lit (position->pos $1-start-pos)
+                                       (position->pos $1-end-pos)
+                                       "" #f)
+                      #f)]
     
     [(LBRACKET pattern RBRACKET)
      (pattern-repeat (position->pos $1-start-pos)
--- a/brag/test/test-all.rkt
+++ b/brag/test/test-all.rkt
@ -7,6 +7,7 @@
         "test-baby-json-hider.rkt"
         "test-curly-quantifier.rkt"
         "test-cutter.rkt"
+         "test-empty-symbol.rkt"
         "test-errors.rkt"
         "test-flatten.rkt"
         "test-lexer.rkt"
--- a/brag/test/test-empty-symbol.rkt
+++ b/brag/test/test-empty-symbol.rkt
@ -0,0 +1,21 @@
+#lang racket/base
+(require brag/examples/empty-symbol
+         brag/support
+         rackunit)
+;; Empty input: the grammar's xs, ys and zs rules each have an empty-set alternative, so any of the three parses is accepted.
+(check-true (and (member (parse-to-datum "") (list '(top (xs)) '(top (ys)) '(top (zs)))) #t))
+
+;; x: plain recursive reference, so every nested xs node (including the final empty one) stays in the datum
+(check-equal? (parse-to-datum "x") '(top (xs "x" (xs))))
+(check-equal? (parse-to-datum "xx") '(top (xs "x" (xs "x" (xs)))))
+(check-equal? (parse-to-datum "xxx") '(top (xs "x" (xs "x" (xs "x" (xs))))))
+
+;; y: the recursive reference is written /ys in the grammar, so only the outermost "y" appears in the datum
+(check-equal? (parse-to-datum "y") '(top (ys "y")))
+(check-equal? (parse-to-datum "yy") '(top (ys "y")))
+(check-equal? (parse-to-datum "yyy") '(top (ys "y")))
+
+;; z: the recursive reference is written @zs in the grammar, so the nested "z"s appear flattened into a single zs node
+(check-equal? (parse-to-datum "z") '(top (zs "z")))
+(check-equal? (parse-to-datum "zz") '(top (zs "z" "z")))
+(check-equal? (parse-to-datum "zzz") '(top (zs "z" "z" "z")))