diff --git a/collects/parser-tools/lex.ss b/collects/parser-tools/lex.ss index 9538131..1c80b1f 100644 --- a/collects/parser-tools/lex.ss +++ b/collects/parser-tools/lex.ss @@ -14,7 +14,7 @@ (lib "cffi.ss" "compiler") "private-lex/token.ss") - (provide lexer lexer-src-pos define-lex-abbrev define-lex-abbrevs + (provide lexer lexer-src-pos define-lex-abbrev define-lex-abbrevs define-lex-trans position-offset position-line position-col position? define-tokens define-empty-tokens token-name token-value token? file-path any-char any-string nothing alphabetic lower-case upper-case title-case @@ -28,15 +28,16 @@ (lambda (stx) (syntax-case stx () ((_) - (raise-syntax-error #f "empty lexer is not allowed" stx)) + (raise-syntax-error #f "accepts the empty string" stx)) ((_ re-act ...) (begin (for-each (lambda (x) (syntax-case x () ((re act) (void)) - (_ (raise-syntax-error 'lexer - "expects regular expression / action pairs" + (_ (raise-syntax-error #f + "not a regular expression / action pair" + stx x)))) (syntax->list (syntax (re-act ...)))) (let* ((spec/re-act-lst @@ -88,13 +89,14 @@ (define-syntax (define-lex-abbrev stx) (syntax-case stx () ((_ name re) + (identifier? (syntax name)) (syntax (define-syntax name (make-lex-abbrev (quote-syntax re))))) (_ (raise-syntax-error #f - "Form should be (define-lex-abbrev name re)" + "form should be (define-lex-abbrev name re)" stx)))) (define-syntax (define-lex-abbrevs stx) @@ -107,8 +109,9 @@ (identifier? (syntax name)) (syntax (define-lex-abbrev name re))) (_ (raise-syntax-error - 'Lexer-abbreviation - "Form should be (identifier value)" + #f + "form should be (define-lex-abbrevs (name re) ...)" + stx a)))) abbrev))) (datum->syntax-object @@ -118,19 +121,19 @@ (_ (raise-syntax-error #f - "Form should be (define-lex-abbrevs (name re) ...)" + "form should be (define-lex-abbrevs (name re) ...)" stx)))) (define-syntax (define-lex-trans stx) (syntax-case stx () ((_ name-form body-form) (let-values (((name body) - (normalize-definition (syntax (define-syntax name-form body-form) #'lambda)))) - #`(define-syntax name (make-lex-trans body)))) + (normalize-definition (syntax (define-syntax name-form body-form)) #'lambda))) + #`(define-syntax #,name (make-lex-trans #,body)))) (_ (raise-syntax-error #f - "Form should be (define-lex-trans name transformer)" + "form should be (define-lex-trans name transformer)" stx)))) diff --git a/collects/parser-tools/private-lex/deriv.ss b/collects/parser-tools/private-lex/deriv.ss index 73b3058..84ae825 100644 --- a/collects/parser-tools/private-lex/deriv.ss +++ b/collects/parser-tools/private-lex/deriv.ss @@ -12,7 +12,7 @@ ;; Don't do anything with this one but extract the chars - (define all-chars (->re `(^) (make-cache))) + (define all-chars (->re `(char-complement (union)) (make-cache))) ;; get-char-groups : re bool -> (list-of char-setR?) ;; Collects the char-setRs in r that could be used in @@ -43,19 +43,23 @@ ((get-char-groups e #f) null) ((get-char-groups z #f) null) ((get-char-groups r1 #f) (list r1)) - ((get-char-groups (->re `(@ ,r1 ,r2) c) #f) + ((get-char-groups (->re `(concatenation ,r1 ,r2) c) #f) (list r1)) - ((get-char-groups (->re `(@ ,e ,r2) c) #f) + ((get-char-groups (->re `(concatenation ,e ,r2) c) #f) (list r2)) - ((get-char-groups (->re `(@ (* ,r1) ,r2) c) #f) + ((get-char-groups (->re `(concatenation (repeat 0 +inf.0 ,r1) ,r2) c) #f) (list r1 r2)) - ((get-char-groups (->re `(* ,r1) c) #f) + ((get-char-groups (->re `(repeat 0 +inf.0 ,r1) c) #f) (list r1)) - ((get-char-groups (->re `(: (* ,r1) (@ (* ,r2) "3") "4") c) #f) + ((get-char-groups + (->re `(union (repeat 0 +inf.0 ,r1) + (concatenation (repeat 0 +inf.0 ,r2) "3") "4") c) #f) (list r1 r2 (->re "3" c) (->re "4" c))) - ((get-char-groups (->re `(~ ,r1) c) #f) + ((get-char-groups (->re `(complement ,r1) c) #f) (list all-chars r1)) - ((get-char-groups (->re `(& (* ,r1) (@ (* ,r2) "3") "4") c) #f) + ((get-char-groups + (->re `(intersection (repeat 0 +inf.0 ,r1) + (concatenation (repeat 0 +inf.0 ,r2) "3") "4") c) #f) (list r1 r2 (->re "3" c) (->re "4" c))) ) (define loc:member? is:member?) @@ -74,7 +78,11 @@ (build-or (list d (deriveR r2 c cache)) cache) d))) ((repeatR? r) - (build-concat (deriveR (repeatR-re r) c cache) r cache)) + (build-concat (deriveR (repeatR-re r) c cache) + (build-repeat (sub1 (repeatR-low r)) + (sub1 (repeatR-high r)) + (repeatR-re r) cache) + cache)) ((orR? r) (build-or (map (lambda (x) (deriveR x c cache)) (orR-res r)) @@ -90,14 +98,14 @@ (a (char->integer #\a)) (b (char->integer #\b)) (r1 (->re #\a c)) - (r2 (->re `(* #\a) c)) - (r3 (->re `(* ,r2) c)) - (r4 (->re `(@ #\a ,r2) c)) - (r5 (->re `(* ,r4) c)) - (r6 (->re `(: ,r5 #\a) c)) - (r7 (->re `(@ ,r2 ,r2) c)) - (r8 (->re `(~ ,r4) c)) - (r9 (->re `(& ,r2 ,r4) c))) + (r2 (->re `(repeat 0 +inf.0 #\a) c)) + (r3 (->re `(repeat 0 +inf.0 ,r2) c)) + (r4 (->re `(concatenation #\a ,r2) c)) + (r5 (->re `(repeat 0 +inf.0 ,r4) c)) + (r6 (->re `(union ,r5 #\a) c)) + (r7 (->re `(concatenation ,r2 ,r2) c)) + (r8 (->re `(complement ,r4) c)) + (r9 (->re `(intersection ,r2 ,r4) c))) ((deriveR e a c) z) ((deriveR z a c) z) ((deriveR r1 b c) z) @@ -108,16 +116,18 @@ ((deriveR r3 b c) z) ((deriveR r4 a c) r2) ((deriveR r4 b c) z) - ((deriveR r5 a c) (->re `(@ ,r2 ,r5) c)) + ((deriveR r5 a c) (->re `(concatenation ,r2 ,r5) c)) ((deriveR r5 b c) z) - ((deriveR r6 a c) (->re `(: (@ ,r2 ,r5) (epsilon)) c)) + ((deriveR r6 a c) (->re `(union (concatenation ,r2 ,r5) "") c)) ((deriveR r6 b c) z) - ((deriveR r7 a c) (->re `(: (@ ,r2 ,r2) ,r2) c)) + ((deriveR r7 a c) (->re `(union (concatenation ,r2 ,r2) ,r2) c)) ((deriveR r7 b c) z) - ((deriveR r8 a c) (->re `(~, r2) c)) - ((deriveR r8 b c) (->re `(~ ,z) c)) + ((deriveR r8 a c) (->re `(complement, r2) c)) + ((deriveR r8 b c) (->re `(complement ,z) c)) ((deriveR r9 a c) r2) - ((deriveR r9 b c) z)) + ((deriveR r9 b c) z) + ((deriveR (->re `(repeat 1 2 "ab") c) a c) + (->re `(concatenation "b" (repeat 0 1 "ab")) c))) ;; An re-action is (cons re action) @@ -161,9 +171,10 @@ ((derive a (c->i #\a) c) (list (cons e 1) (cons z 2))) ((derive a (c->i #\b) c) (list (cons z 1) (cons e 2))) ((derive a (c->i #\c) c) #f) - ((derive (list (cons (->re `(: " " "\n" ",") c) 1) - (cons (->re `(@ (? "-") (+ (- "0" "9"))) c) 2) - (cons (->re `(@ "-" (+ "-")) c) 3) + ((derive (list (cons (->re `(union " " "\n" ",") c) 1) + (cons (->re `(concatenation (repeat 0 1 "-") + (repeat 1 +inf.0 (char-range "0" "9"))) c) 2) + (cons (->re `(concatenation "-" (repeat 1 +inf.0 "-")) c) 3) (cons (->re "[" c) 4) (cons (->re "]" c) 5)) (c->i #\[) c) b) @@ -195,11 +206,12 @@ (test-block ((c (make-cache)) (c->i char->integer) - (r1 (->re `(- #\1 #\4) c)) - (r2 (->re `(- #\2 #\3) c))) + (r1 (->re `(char-range #\1 #\4) c)) + (r2 (->re `(char-range #\2 #\3) c))) ((compute-chars null) null) ((compute-chars (list (make-state null 1))) null) - ((map is:integer-set-contents (compute-chars (list (make-state (list (cons r1 1) (cons r2 2)) 2)))) + ((map is:integer-set-contents + (compute-chars (list (make-state (list (cons r1 1) (cons r2 2)) 2)))) (list (is:integer-set-contents (is:make-range (c->i #\2) (c->i #\3))) (is:integer-set-contents (is:union (is:make-range (c->i #\1)) (is:make-range (c->i #\4))))))) @@ -292,30 +304,35 @@ (define t1 (build-test-dfa null)) (define t2 (build-test-dfa `(#\a))) (define t3 (build-test-dfa `(#\a #\b))) - (define t4 (build-test-dfa `((* #\a) - (* (@ #\a #\b))))) - (define t5 (build-test-dfa `((@ (* (: #\0 #\1)) #\1)))) - (define t6 (build-test-dfa `((* (* #\a)) - (* (@ #\b (* #\b)))))) - (define t7 (build-test-dfa `((@ (* #\a) (* #\b) (* #\c) (* #\d) (* #\e))))) + (define t4 (build-test-dfa `((repeat 0 +inf.0 #\a) + (repeat 0 +inf.0 (concatenation #\a #\b))))) + (define t5 (build-test-dfa `((concatenation (repeat 0 +inf.0 (union #\0 #\1)) #\1)))) + (define t6 (build-test-dfa `((repeat 0 +inf.0 (repeat 0 +inf.0 #\a)) + (repeat 0 +inf.0 (concatenation #\b (repeat 1 +inf.0 #\b)))))) + (define t7 (build-test-dfa `((concatenation (repeat 0 +inf.0 #\a) (repeat 0 +inf.0 #\b) + (repeat 0 +inf.0 #\c) (repeat 0 +inf.0 #\d) + (repeat 0 +inf.0 #\e))))) (define t8 - (build-test-dfa `((@ (* (: #\a #\b)) #\a (: #\a #\b) (: #\a #\b) (: #\a #\b) (: #\a #\b))))) - (define t9 (build-test-dfa `((@ "/*" - (~ (@ (&) "*/" (&))) + (build-test-dfa `((concatenation (repeat 0 +inf.0 (union #\a #\b)) #\a (union #\a #\b) + (union #\a #\b) (union #\a #\b) (union #\a #\b))))) + (define t9 (build-test-dfa `((concatenation "/*" + (complement (concatenation (intersection) "*/" (intersection))) "*/")))) - (define t11 (build-test-dfa `((~ "1")))) - (define t12 (build-test-dfa `((@ (& (@ (* "a") "b") - (@ "a" (* "b"))) - "ab")))) - (define x (build-test-dfa `((: " " "\n" ",") - (@ (? "-") (+ (- "0" "9"))) - (@ "-" (+ "-")) + (define t11 (build-test-dfa `((complement "1")))) + (define t12 (build-test-dfa `((concatenation (intersection (concatenation (repeat 0 +inf.0 "a") "b") + (concatenation "a" (repeat 0 +inf.0 "b"))) + "ab")))) + (define x (build-test-dfa `((union " " "\n" ",") + (concatenation (repeat 0 1 "-") (repeat 1 +inf.0 (char-range "0" "9"))) + (concatenation "-" (repeat 1 +inf.0 "-")) "[" "]"))) - (define y (build-test-dfa `((+ (: (@ "|" (* (^ "|")) "|") - (@ "|" (* (^ "|")))))))) - (define t13 (build-test-dfa `((& (@ (&) "111" (&)) - (~ (: (@ (&) "01") - (+ "1"))))))) + (define y (build-test-dfa + `((repeat 1 +inf.0 + (union (concatenation "|" (repeat 0 +inf.0 (char-complement "|")) "|") + (concatenation "|" (repeat 0 +inf.0 (char-complement "|")))))))) + (define t13 (build-test-dfa `((intersection (concatenation (intersection) "111" (intersection)) + (complement (union (concatenation (intersection) "01") + (repeat 1 +inf.0 "1"))))))) |# ) \ No newline at end of file diff --git a/collects/parser-tools/private-lex/error-tests.ss b/collects/parser-tools/private-lex/error-tests.ss new file mode 100644 index 0000000..6f27da3 --- /dev/null +++ b/collects/parser-tools/private-lex/error-tests.ss @@ -0,0 +1,35 @@ +(define-lex-abbrev) +(define-lex-abbrev a) +(define-lex-abbrev (a b) v) +(define-lex-abbrev 1 1) +(define-lex-abbrevs ()) + +(define-lex-trans) +(define-lex-trans (1 b) 1) + +(lexer) +(lexer ("a" "b" "c")) +(lexer ()) +(lexer ("")) + +(lexer (a 1)) +(lexer ((a) 1)) +(let ((a 1)) + (lexer ((a) 1))) +(let-syntax ((a 1)) + (lexer ((a) 1))) +(let () + (define-lex-trans a 1) + (let () + (lexer ((a) 1)))) + +(lexer (1 1)) +(lexer ((repetition) 1)) +(lexer ((repetition #\1 #\1 "3") 1)) +(lexer ((repetition 1 #\1 "3") 1)) +(lexer ((repetition 1 0 "3") 1)) +(lexer ((complement) 1)) +(lexer ((char-range) 1)) +(lexer ((char-range #\9 #\0) 1)) +(lexer ((char-complement) 1)) +(lexer ((char-complement (concatenation "1" "2")) 1)) \ No newline at end of file diff --git a/collects/parser-tools/private-lex/re.ss b/collects/parser-tools/private-lex/re.ss index 06e775c..6d403f0 100644 --- a/collects/parser-tools/private-lex/re.ss +++ b/collects/parser-tools/private-lex/re.ss @@ -7,8 +7,8 @@ (provide ->re build-epsilon build-zero build-char-set build-concat build-repeat build-or build-and build-neg epsilonR? zeroR? char-setR? concatR? repeatR? orR? andR? negR? - char-setR-chars concatR-re1 concatR-re2 repeatR-re orR-res - andR-res negR-re + char-setR-chars concatR-re1 concatR-re2 repeatR-re repeatR-low repeatR-high + orR-res andR-res negR-re re-nullable? re-index) (define max-char-num #x7FFFFFFF) @@ -21,9 +21,9 @@ ;; - (make-zeroR bool nat) ;; - (make-char-setR bool nat char-set) ;; - (make-concatR bool nat re re) - ;; - (make-repeatR bool nat re) - ;; - (make-orR bool nat (list-of re)) Must not directly contain any orRs - ;; - (make-andR bool nat (list-of re)) Must not directly contain any andRs + ;; - (make-repeatR bool nat nat nat-or-+inf.0 re) + ;; - (make-orR bool nat (list-of re)) Must not directly contain any orRs + ;; - (make-andR bool nat (list-of re)) Must not directly contain any andRs ;; - (make-negR bool nat re) ;; ;; Every re must have an index field globally different from all @@ -33,7 +33,7 @@ (define-struct (zeroR re) () (make-inspector)) (define-struct (char-setR re) (chars) (make-inspector)) (define-struct (concatR re) (re1 re2) (make-inspector)) - (define-struct (repeatR re) (re) (make-inspector)) + (define-struct (repeatR re) (low high re) (make-inspector)) (define-struct (orR re) (res) (make-inspector)) (define-struct (andR re) (res) (make-inspector)) (define-struct (negR re) (re) (make-inspector)) @@ -47,23 +47,23 @@ (define z (make-zeroR #f (get-index))) - ;; s-re = char match the given character - ;; | string match its sequence of characters + ;; s-re = char constant + ;; | string constant (sequence of characters) ;; | re a precompiled re - ;; | (epsilon) match the empty string - ;; | (* s-re) match 0 or more - ;; | (+ s-re) match 1 or more - ;; | (? s-re) match 0 or 1 - ;; | (: s-re ...) match one of the sub-expressions - ;; | (& s-re ...) match iff all sub-expressions match - ;; | (~ s-re) match iff s-re doesn't match - ;; | (@ s-re ...) match each sub-expression in succession - ;; | (- char char) match any character between two (inclusive) - ;; | (^ char_or_range ...) match any character not listed - ;; (The null concatenation `(@) means epsilon as does "". - ;; The null or `(:) means match nothing. The null carat `(^) means match - ;; any character. The null intersection `(&) means match string.) - + ;; | (repeat low high s-re) repetition between low and high times (inclusive) + ;; | (union s-re ...) + ;; | (intersection s-re ...) + ;; | (complement s-re) + ;; | (concatenation s-re ...) + ;; | (char-range rng rng) match any character between two (inclusive) + ;; | (char-complement char-set) match any character not listed + ;; low = natural-number + ;; high = natural-number or +inf.0 + ;; rng = char or string with length 1 + ;; (concatenation) (repeat 0 0 x), and "" match the empty string. + ;; (union) matches no strings. + ;; (intersection) matches any string. + (define loc:make-range is:make-range) (define loc:union is:union) (define loc:split is:split) @@ -73,43 +73,35 @@ (define (->re exp cache) (match exp ((? char?) (build-char-set (loc:make-range (char->integer exp)) cache)) - ((? string?) (->re `(@ ,@(string->list exp)) cache)) + ((? string?) (->re `(concatenation ,@(string->list exp)) cache)) ((? re?) exp) - (`(epsilon) (build-epsilon)) - (`(* ,r) - (build-repeat (->re r cache) cache)) - (`(+ ,r) - (->re `(@ ,r (* ,r)) cache)) - (`(? ,r) - (let ((c (->re r cache))) - (if (re-nullable? c) - c - (build-or (list e c) cache)))) - (`(: ,rs ...) + (`(repeat ,low ,high ,r) + (build-repeat low high (->re r cache) cache)) + (`(union ,rs ...) (build-or (flatten-res (map (lambda (r) (->re r cache)) rs) orR? orR-res loc:union cache) cache)) - (`(& ,rs ...) + (`(intersection ,rs ...) (build-and (flatten-res (map (lambda (r) (->re r cache)) rs) andR? andR-res (lambda (a b) (let-values (((i _ __) (loc:split a b))) i)) cache) cache)) - (`(~ ,r) + (`(complement ,r) (build-neg (->re r cache) cache)) - (`(@ ,rs ...) + (`(concatenation ,rs ...) (foldr (lambda (x y) (build-concat (->re x cache) y cache)) e rs)) - (`(- ,c1 ,c2) + (`(char-range ,c1 ,c2) (let ((i1 (char->integer (if (string? c1) (string-ref c1 0) c1))) (i2 (char->integer (if (string? c2) (string-ref c2 0) c2)))) (if (<= i1 i2) (build-char-set (loc:make-range i1 i2) cache) z))) - (`(^ ,crs ...) - (let ((cs (->re `(: ,@crs) cache))) + (`(char-complement ,crs ...) + (let ((cs (->re `(union ,@crs) cache))) (cond ((zeroR? cs) (build-char-set (loc:make-range 0 max-char-num) cache)) ((char-setR? cs) @@ -174,15 +166,25 @@ (get-index) r1 r2)))))) - ;; build-repeat : re cache -> re - (define (build-repeat r cache) - (cond - ((eq? z r) e) - ((repeatR? r) r) - (else - (cache (cons 'repeat (re-index r)) - (lambda () - (make-repeatR #t (get-index) r)))))) + ;; build-repeat : nat nat-or-+inf.0 re cache -> re + (define (build-repeat low high r cache) + (let ((low (if (< low 0) 0 low))) + (cond + ((eq? r e) e) + ((and (= 0 low) (or (= 0 high) (eq? z r))) e) + ((and (= 1 low) (= 1 high)) r) + ((and (repeatR? r) + (eq? (repeatR-high r) +inf.0) + (or (= 0 (repeatR-low r)) + (= 1 (repeatR-low r)))) + (build-repeat (* low (repeatR-low r)) + +inf.0 + (repeatR-re r) + cache)) + (else + (cache (cons 'repeat (cons low (cons high (re-index r)))) + (lambda () + (make-repeatR (or (re-nullable? r) (= 0 low)) (get-index) low high r))))))) ;; build-or : (list-of re) cache -> re @@ -229,7 +231,7 @@ (r3 (build-char-set (is:make-range (char->integer #\3)) c)) (rc (build-concat r1 r2 c)) (rc2 (build-concat r2 r1 c)) - (rr (build-repeat rc c)) + (rr (build-repeat 0 +inf.0 rc c)) (ro (build-or `(,rr ,rc ,rr) c)) (ro2 (build-or `(,rc ,rr ,z) c)) (ro3 (build-or `(,rr ,rc) c)) @@ -263,9 +265,18 @@ ((orR-res ro4) (list r1 r2 r3)) ((build-or null c) z) ((build-or `(,r1 ,z) c) r1) - ((build-repeat rc c) rr) - ((build-repeat z c) e) - ((build-repeat (build-repeat rc c) c) (build-repeat rc c)) + ((build-repeat 0 +inf.0 rc c) rr) + ((build-repeat 0 1 z c) e) + ((build-repeat 0 0 rc c) e) + ((build-repeat 0 +inf.0 z c) e) + ((build-repeat -1 +inf.0 z c) e) + ((build-repeat 0 +inf.0 (build-repeat 0 +inf.0 rc c) c) + (build-repeat 0 +inf.0 rc c)) + ((build-repeat 20 20 (build-repeat 0 +inf.0 rc c) c) + (build-repeat 0 +inf.0 rc c)) + ((build-repeat 20 20 (build-repeat 1 +inf.0 rc c) c) + (build-repeat 20 +inf.0 rc c)) + ((build-repeat 1 1 rc c) rc) ((repeatR-re rr) rc) (ra ra2) (ra ra3) @@ -282,6 +293,9 @@ ((re-nullable? rc) #f) ((re-nullable? (build-concat rr rr c)) #t) ((re-nullable? rr) #t) + ((re-nullable? (build-repeat 0 1 rc c)) #t) + ((re-nullable? (build-repeat 1 2 rc c)) #f) + ((re-nullable? (build-repeat 1 2 (build-or (list e r1) c) c)) #t) ((re-nullable? ro) #t) ((re-nullable? (build-or `(,r1 ,r2) c)) #f) ((re-nullable? (build-and `(,r1 ,e) c)) #f) @@ -293,16 +307,17 @@ (isc is:integer-set-contents) (r1 (->re #\1 c)) (r2 (->re #\2 c)) - (r3-5 (->re '(- #\3 #\5) c)) + (r3-5 (->re '(char-range #\3 #\5) c)) (r4 (build-or `(,r1 ,r2) c)) - (r5 (->re `(: ,r3-5 #\7) c)) + (r5 (->re `(union ,r3-5 #\7) c)) (r6 (->re #\6 c))) ((flatten-res null orR? orR-res is:union c) null) ((isc (char-setR-chars (car (flatten-res `(,r1) orR? orR-res is:union c)))) (isc (is:make-range (char->integer #\1)))) ((isc (char-setR-chars (car (flatten-res `(,r4) orR? orR-res is:union c)))) (isc (is:make-range (char->integer #\1) (char->integer #\2)))) - ((isc (char-setR-chars (car (flatten-res `(,r6 ,r5 ,r4 ,r3-5 ,r2 ,r1) orR? orR-res is:union c)))) + ((isc (char-setR-chars (car (flatten-res `(,r6 ,r5 ,r4 ,r3-5 ,r2 ,r1) + orR? orR-res is:union c)))) (isc (is:make-range (char->integer #\1) (char->integer #\7)))) ((flatten-res `(,r1 ,r2) andR? andR-res (lambda (x y) (let-values (((i _ __) @@ -315,53 +330,58 @@ (test-block ((c (make-cache)) (isc is:integer-set-contents) (r (->re #\a c)) - (rr (->re `(@ ,r ,r) c)) - (rrr (->re `(@ ,r ,rr) c)) - (rrr* (->re `(* ,rrr) c))) + (rr (->re `(concatenation ,r ,r) c)) + (rrr (->re `(concatenation ,r ,rr) c)) + (rrr* (->re `(repeat 0 +inf.0 ,rrr) c))) ((isc (char-setR-chars r)) (isc (is:make-range (char->integer #\a)))) ((->re "" c) e) - ((->re "asdf" c) (->re `(@ #\a #\s #\d #\f) c)) + ((->re "asdf" c) (->re `(concatenation #\a #\s #\d #\f) c)) ((->re r c) r) - ((->re `(epsilon) c) e) - ((->re `(* ,r) c) (build-repeat r c)) - ((->re `(+ ,r) c) (build-concat r (build-repeat r c) c)) - ((->re `(? ,r) c) (build-or (list e r) c)) - ((->re `(? ,rrr*) c) rrr*) - ((->re `(: (: (- #\a #\c) (^ (- #\000 #\110) (- #\112 #\377))) - (: (* #\2))) c) + ((->re `(repeat 0 +inf.0 ,r) c) (build-repeat 0 +inf.0 r c)) + ((->re `(repeat 1 +inf.0 ,r) c) (build-repeat 1 +inf.0 r c)) + ((->re `(repeat 0 1 ,r) c) (build-repeat 0 1 r c)) + ((->re `(repeat 0 1 ,rrr*) c) rrr*) + ((->re `(union (union (char-range #\a #\c) + (char-complement (char-range #\000 #\110) + (char-range #\112 ,(integer->char max-char-num)))) + (union (repeat 0 +inf.0 #\2))) c) (build-or (list (build-char-set (is:union (is:make-range 73) (is:make-range 97 99)) c) - (build-repeat (build-char-set (is:make-range 50) c) c)) + (build-repeat 0 +inf.0 (build-char-set (is:make-range 50) c) c)) c)) - ((->re `(: ,rr ,rrr) c) (build-or (list rr rrr) c)) - ((->re `(: ,r) c) r) - ((->re `(:) c) z) - ((->re `(& (& #\111 (^ (- #\000 #\110) (- #\112 #\377))) - (& (* #\2))) c) + ((->re `(union ,rr ,rrr) c) (build-or (list rr rrr) c)) + ((->re `(union ,r) c) r) + ((->re `(union) c) z) + ((->re `(intersection (intersection #\111 + (char-complement (char-range #\000 #\110) + (char-range #\112 ,(integer->char max-char-num)))) + (intersection (repeat 0 +inf.0 #\2))) c) (build-and (list (build-char-set (is:make-range 73) c) - (build-repeat (build-char-set (is:make-range 50) c) c)) + (build-repeat 0 +inf.0 (build-char-set (is:make-range 50) c) c)) c)) - ((->re `(& (& #\000 (^ (- #\000 #\110) (- #\112 #\377))) - (& (* #\2))) c) + ((->re `(intersection (intersection #\000 (char-complement (char-range #\000 #\110) + (char-range #\112 ,(integer->char max-char-num)))) + (intersection (repeat 0 +inf.0 #\2))) c) z) - ((->re `(& ,rr ,rrr) c) (build-and (list rr rrr) c)) - ((->re `(& ,r) c) r) - ((->re `(&) c) (build-neg z c)) - ((->re `(~ ,r) c) (build-neg r c)) - ((->re `(@) c) e) - ((->re `(@ ,rrr*) c) rrr*) + ((->re `(intersection ,rr ,rrr) c) (build-and (list rr rrr) c)) + ((->re `(intersection ,r) c) r) + ((->re `(intersection) c) (build-neg z c)) + ((->re `(complement ,r) c) (build-neg r c)) + ((->re `(concatenation) c) e) + ((->re `(concatenation ,rrr*) c) rrr*) (rr (build-concat r r c)) - ((->re `(@ ,r ,rr ,rrr) c) + ((->re `(concatenation ,r ,rr ,rrr) c) (build-concat r (build-concat rr rrr c) c)) - ((isc (char-setR-chars (->re `(- #\1 #\1) c))) (isc (is:make-range 49))) - ((isc (char-setR-chars (->re `(- #\1 #\9) c))) (isc (is:make-range 49 57))) - ((isc (char-setR-chars (->re `(- "1" "1") c))) (isc (is:make-range 49))) - ((isc (char-setR-chars (->re `(- "1" "9") c))) (isc (is:make-range 49 57))) - ((->re `(- "9" "1") c) z) - ((isc (char-setR-chars (->re `(^) c))) - (isc (char-setR-chars (->re `(- #\000 #\377) c)))) - ((isc (char-setR-chars (->re `(^ #\001 (- #\002 #\377)) c))) (isc (is:make-range 0))) + ((isc (char-setR-chars (->re `(char-range #\1 #\1) c))) (isc (is:make-range 49))) + ((isc (char-setR-chars (->re `(char-range #\1 #\9) c))) (isc (is:make-range 49 57))) + ((isc (char-setR-chars (->re `(char-range "1" "1") c))) (isc (is:make-range 49))) + ((isc (char-setR-chars (->re `(char-range "1" "9") c))) (isc (is:make-range 49 57))) + ((->re `(char-range "9" "1") c) z) + ((isc (char-setR-chars (->re `(char-complement) c))) + (isc (char-setR-chars (->re `(char-range #\000 ,(integer->char max-char-num)) c)))) + ((isc (char-setR-chars (->re `(char-complement #\001 (char-range #\002 ,(integer->char max-char-num))) c))) + (isc (is:make-range 0))) ) ) \ No newline at end of file diff --git a/collects/parser-tools/private-lex/stx.ss b/collects/parser-tools/private-lex/stx.ss index 7171a38..f9975f7 100644 --- a/collects/parser-tools/private-lex/stx.ss +++ b/collects/parser-tools/private-lex/stx.ss @@ -2,29 +2,35 @@ (require "util.ss") (provide parse) - - (define (repetition-error stx) + + (define (bad-args stx num) (raise-syntax-error - 'regular-expression - "must be (repetition non-negative-exact-integer non-negative-exact-integer-or-+inf.0 re)" + #f + (format "incorrect number of arguments (should have ~a)" num) stx)) - (define (char-range-error stx) - (raise-syntax-error - 'regular-expression - "must be (char-range char-or-single-char-string char-or-single-char-string)" - stx)) + ;; char-range-arg: syntax-object syntax-object -> nat + ;; If c contains is a character or length 1 string, returns the integer + ;; for the character. Otherwise raises a syntax error. + (define (char-range-arg stx containing-stx) + (let ((c (syntax-e stx))) + (cond + ((char? c) (char->integer c)) + ((and (string? c) (= (string-length c) 1)) + (char->integer (string-ref c 0))) + (else + (raise-syntax-error + #f + "not a char or single-char string" + containing-stx stx))))) + (test-block () + ((char-range-arg #'#\1 #'here) (char->integer #\1)) + ((char-range-arg #'"1" #'here) (char->integer #\1))) - (define (char-range-arg c stx) - (cond - ((char? c) (integer->char c)) - ((and (string? c (= string-length c 1))) - (integer->char (string-ref c 0))) - (else - (char-range-error stx)))) ;; parse : syntax-object -> s-re (see re.ss) ;; checks for errors and generates the plain s-exp form for s + ;; Expands lex-abbrevs and applies lex-trans. (define (parse stx) (syntax-case stx (repetition union intersection complement concatenation char-range char-complement) @@ -35,22 +41,34 @@ (raise-syntax-error 'regular-expression "undefined abbreviation" stx)) - (parse (lex-abbrev-abbrev expand)))) + (parse (lex-abbrev-abbrev expansion)))) (_ (or (char? (syntax-e stx)) (string? (syntax-e stx))) (syntax-e stx)) ((repetition arg ...) (let ((arg-list (syntax->list (syntax (arg ...))))) (unless (= 3 (length arg-list)) - (repetition-error stx)) - (let ((lo-val (car arg-list)) - (hi-val (cadr arg-list)) + (bad-args stx 2)) + (let ((low (syntax-e (car arg-list))) + (high (syntax-e (cadr arg-list))) (re (caddr arg-list))) - (unless (and (exact? lo-val) (integer? lo-val) (> lo-val 0) - (or (and (exact? hi-val) (integer? hi-val) (> hi-val 0)) - (eq? hi-val +inf.0))) - (repetition-error stx)) - `(repetition ,lo-val ,hi-val ,(parse re))))) + (unless (and (number? low) (exact? low) (integer? low) (>= low 0)) + (raise-syntax-error #f + "not a non-negative exact integer" + stx + (car arg-list))) + (unless (or (and (number? high) (exact? high) (integer? high) (>= high 0)) + (eq? high +inf.0)) + (raise-syntax-error #f + "not a non-negative exact integer or +inf.0" + stx + (cadr arg-list))) + (unless (<= low high) + (raise-syntax-error + #f + "the first argument is not less than or equal to the second argument" + stx)) + `(repetition ,low ,high ,(parse re))))) ((union re ...) `(union ,@(map parse (syntax->list (syntax (re ...)))))) ((intersection re ...) @@ -58,66 +76,108 @@ ((complement re ...) (let ((re-list (syntax->list (syntax (re ...))))) (unless (= 1 (length re-list)) - (raise-syntax-error 'regular-expression - "must be (complement re)" - stx)) + (bad-args stx 1)) `(complement ,(parse (car re-list))))) ((concatenation re ...) `(concatenation ,@(map parse (syntax->list (syntax (re ...)))))) ((char-range arg ...) (let ((arg-list (syntax->list (syntax (arg ...))))) (unless (= 2 (length arg-list)) - (char-range-error stx)) + (bad-args stx 2)) (let ((i1 (char-range-arg (car arg-list) stx)) (i2 (char-range-arg (cadr arg-list) stx))) (if (<= i1 i2) `(char-range ,(integer->char i1) ,(integer->char i2)) (raise-syntax-error - 'regular-expression - (format "first argument ~a does not preceed second argument ~a" - (car arg-list) (cdr arg-list)) + #f + "the first argument does not precede or equal second argument" stx))))) ((char-complement arg ...) (let ((arg-list (syntax->list (syntax (arg ...))))) (unless (= 1 (length arg-list)) - (raise-syntax-error - 'regular-expression - "must be (char-complement char-set-re)" - stx)) + (bad-args stx 1)) (let ((parsed (parse (car arg-list)))) - (unless (pure-char? parsed) - (raise-syntax-error - 'regular-expression - "must be (char-complement char-set-re)" - stx)) + (unless (char-set? parsed) + (raise-syntax-error #f + "not a character set" + stx + (car arg-list))) `(char-complement ,parsed)))) ((op form ...) (identifier? (syntax op)) (let ((expansion (syntax-local-value (syntax op) (lambda () #f)))) - (unless (lex-trans? expansion) - (raise-syntax-error 'regular-expression - "undefined operator in" - stx)) - (parse ((lex-trans-f expansion) stx)))) + (cond + ((lex-trans? expansion) + (parse ((lex-trans-f expansion) stx))) + (expansion + (raise-syntax-error 'regular-expression + "not a lex-trans" + stx)) + (else + (raise-syntax-error 'regular-expression + "undefined operator" + stx))))) (_ (raise-syntax-error 'regular-expression - "must be char, string, identifier, or (op args ...)" + "not a char, string, identifier, or (op args ...)" stx)))) - + - (define (pure-char? s-re) + ;; char-set? : s-re -> bool + ;; A char-set is an re that matches only strings of length 1. + ;; char-set? is conservative. + (define (char-set? s-re) (cond ((char? s-re) #t) ((string? s-re) (= (string-length s-re) 1)) ((list? s-re) (let ((op (car s-re))) (case op - ((union intersection) (andmap pure-char? (cdr s-re))) + ((union intersection) (andmap char-set? (cdr s-re))) ((char-range char-complement) #t) + ((repetition) + (and (= (cadr s-re) (caddr s-re)) (char-set? (cadddr s-re)))) + ((concatenation) + (and (= 2 (length s-re)) (char-set? (cadr s-re)))) (else #f)))) (else #f))) - - + (test-block () + ((char-set? #\a) #t) + ((char-set? "12") #f) + ((char-set? "1") #t) + ((char-set? '(repetition 1 2 #\1)) #f) + ((char-set? '(repetition 1 1 "12")) #f) + ((char-set? '(repetition 1 1 "1")) #t) + ((char-set? '(union "1" "2" "3")) #t) + ((char-set? '(union "1" "" "3")) #f) + ((char-set? '(intersection "1" "2" (union "3" "4"))) #t) + ((char-set? '(intersection "1" "")) #f) + ((char-set? '(complement "1")) #f) + ((char-set? '(concatenation "1" "2")) #f) + ((char-set? '(concatenation "" "2")) #f) + ((char-set? '(concatenation "1")) #t) + ((char-set? '(concatenation "12")) #f) + ((char-set? '(char-range #\1 #\2)) #t) + ((char-set? '(char-complement #\1)) #t)) + + (test-block () + ((parse #'#\a) #\a) + ((parse #'"1") "1") + ((parse #'(repetition 1 1 #\1)) '(repetition 1 1 #\1)) + ((parse #'(repetition 0 +inf.0 #\1)) '(repetition 0 +inf.0 #\1)) + ((parse #'(union #\1 (union "2") (union))) + '(union #\1 (union "2") (union))) + ((parse #'(intersection #\1 (intersection "2") (intersection))) + '(intersection #\1 (intersection "2") (intersection))) + ((parse #'(complement (union #\1 #\2))) + '(complement (union #\1 #\2))) + ((parse #'(concatenation "1" "2" (concatenation))) + '(concatenation "1" "2" (concatenation))) + ((parse #'(char-range "1" #\1)) '(char-range #\1 #\1)) + ((parse #'(char-range #\1 "1")) '(char-range #\1 #\1)) + ((parse #'(char-range "1" "3")) '(char-range #\1 #\3)) + ((parse #'(char-complement (union "1" "2"))) + '(char-complement (union "1" "2")))) ) \ No newline at end of file diff --git a/collects/parser-tools/private-lex/token.ss b/collects/parser-tools/private-lex/token.ss index fb78826..7d23f67 100644 --- a/collects/parser-tools/private-lex/token.ss +++ b/collects/parser-tools/private-lex/token.ss @@ -33,6 +33,7 @@ n (string->symbol (format "token-~a" (syntax-object->datum n))) + n n) ,@(if empty? '() '(x))) (make-token ',n ,(if empty? #f 'x))))