*** empty log message ***

original commit: 008977eccd4cccf61a54aa66ec4d07b9e93a8462
tokens
Scott Owens 21 years ago
parent 15a113415c
commit 820407d045

@ -14,7 +14,7 @@
(lib "cffi.ss" "compiler") (lib "cffi.ss" "compiler")
"private-lex/token.ss") "private-lex/token.ss")
(provide lexer lexer-src-pos define-lex-abbrev define-lex-abbrevs (provide lexer lexer-src-pos define-lex-abbrev define-lex-abbrevs define-lex-trans
position-offset position-line position-col position? position-offset position-line position-col position?
define-tokens define-empty-tokens token-name token-value token? file-path define-tokens define-empty-tokens token-name token-value token? file-path
any-char any-string nothing alphabetic lower-case upper-case title-case any-char any-string nothing alphabetic lower-case upper-case title-case
@ -28,15 +28,16 @@
(lambda (stx) (lambda (stx)
(syntax-case stx () (syntax-case stx ()
((_) ((_)
(raise-syntax-error #f "empty lexer is not allowed" stx)) (raise-syntax-error #f "accepts the empty string" stx))
((_ re-act ...) ((_ re-act ...)
(begin (begin
(for-each (for-each
(lambda (x) (lambda (x)
(syntax-case x () (syntax-case x ()
((re act) (void)) ((re act) (void))
(_ (raise-syntax-error 'lexer (_ (raise-syntax-error #f
"expects regular expression / action pairs" "not a regular expression / action pair"
stx
x)))) x))))
(syntax->list (syntax (re-act ...)))) (syntax->list (syntax (re-act ...))))
(let* ((spec/re-act-lst (let* ((spec/re-act-lst
@ -88,13 +89,14 @@
(define-syntax (define-lex-abbrev stx) (define-syntax (define-lex-abbrev stx)
(syntax-case stx () (syntax-case stx ()
((_ name re) ((_ name re)
(identifier? (syntax name))
(syntax (syntax
(define-syntax name (define-syntax name
(make-lex-abbrev (quote-syntax re))))) (make-lex-abbrev (quote-syntax re)))))
(_ (_
(raise-syntax-error (raise-syntax-error
#f #f
"Form should be (define-lex-abbrev name re)" "form should be (define-lex-abbrev name re)"
stx)))) stx))))
(define-syntax (define-lex-abbrevs stx) (define-syntax (define-lex-abbrevs stx)
@ -107,8 +109,9 @@
(identifier? (syntax name)) (identifier? (syntax name))
(syntax (define-lex-abbrev name re))) (syntax (define-lex-abbrev name re)))
(_ (raise-syntax-error (_ (raise-syntax-error
'Lexer-abbreviation #f
"Form should be (identifier value)" "form should be (define-lex-abbrevs (name re) ...)"
stx
a)))) a))))
abbrev))) abbrev)))
(datum->syntax-object (datum->syntax-object
@ -118,19 +121,19 @@
(_ (_
(raise-syntax-error (raise-syntax-error
#f #f
"Form should be (define-lex-abbrevs (name re) ...)" "form should be (define-lex-abbrevs (name re) ...)"
stx)))) stx))))
(define-syntax (define-lex-trans stx) (define-syntax (define-lex-trans stx)
(syntax-case stx () (syntax-case stx ()
((_ name-form body-form) ((_ name-form body-form)
(let-values (((name body) (let-values (((name body)
(normalize-definition (syntax (define-syntax name-form body-form) #'lambda)))) (normalize-definition (syntax (define-syntax name-form body-form)) #'lambda)))
#`(define-syntax name (make-lex-trans body)))) #`(define-syntax #,name (make-lex-trans #,body))))
(_ (_
(raise-syntax-error (raise-syntax-error
#f #f
"Form should be (define-lex-trans name transformer)" "form should be (define-lex-trans name transformer)"
stx)))) stx))))

@ -12,7 +12,7 @@
;; Don't do anything with this one but extract the chars ;; Don't do anything with this one but extract the chars
(define all-chars (->re `(^) (make-cache))) (define all-chars (->re `(char-complement (union)) (make-cache)))
;; get-char-groups : re bool -> (list-of char-setR?) ;; get-char-groups : re bool -> (list-of char-setR?)
;; Collects the char-setRs in r that could be used in ;; Collects the char-setRs in r that could be used in
@ -43,19 +43,23 @@
((get-char-groups e #f) null) ((get-char-groups e #f) null)
((get-char-groups z #f) null) ((get-char-groups z #f) null)
((get-char-groups r1 #f) (list r1)) ((get-char-groups r1 #f) (list r1))
((get-char-groups (->re `(@ ,r1 ,r2) c) #f) ((get-char-groups (->re `(concatenation ,r1 ,r2) c) #f)
(list r1)) (list r1))
((get-char-groups (->re `(@ ,e ,r2) c) #f) ((get-char-groups (->re `(concatenation ,e ,r2) c) #f)
(list r2)) (list r2))
((get-char-groups (->re `(@ (* ,r1) ,r2) c) #f) ((get-char-groups (->re `(concatenation (repeat 0 +inf.0 ,r1) ,r2) c) #f)
(list r1 r2)) (list r1 r2))
((get-char-groups (->re `(* ,r1) c) #f) ((get-char-groups (->re `(repeat 0 +inf.0 ,r1) c) #f)
(list r1)) (list r1))
((get-char-groups (->re `(: (* ,r1) (@ (* ,r2) "3") "4") c) #f) ((get-char-groups
(->re `(union (repeat 0 +inf.0 ,r1)
(concatenation (repeat 0 +inf.0 ,r2) "3") "4") c) #f)
(list r1 r2 (->re "3" c) (->re "4" c))) (list r1 r2 (->re "3" c) (->re "4" c)))
((get-char-groups (->re `(~ ,r1) c) #f) ((get-char-groups (->re `(complement ,r1) c) #f)
(list all-chars r1)) (list all-chars r1))
((get-char-groups (->re `(& (* ,r1) (@ (* ,r2) "3") "4") c) #f) ((get-char-groups
(->re `(intersection (repeat 0 +inf.0 ,r1)
(concatenation (repeat 0 +inf.0 ,r2) "3") "4") c) #f)
(list r1 r2 (->re "3" c) (->re "4" c))) (list r1 r2 (->re "3" c) (->re "4" c)))
) )
(define loc:member? is:member?) (define loc:member? is:member?)
@ -74,7 +78,11 @@
(build-or (list d (deriveR r2 c cache)) cache) (build-or (list d (deriveR r2 c cache)) cache)
d))) d)))
((repeatR? r) ((repeatR? r)
(build-concat (deriveR (repeatR-re r) c cache) r cache)) (build-concat (deriveR (repeatR-re r) c cache)
(build-repeat (sub1 (repeatR-low r))
(sub1 (repeatR-high r))
(repeatR-re r) cache)
cache))
((orR? r) ((orR? r)
(build-or (map (lambda (x) (deriveR x c cache)) (build-or (map (lambda (x) (deriveR x c cache))
(orR-res r)) (orR-res r))
@ -90,14 +98,14 @@
(a (char->integer #\a)) (a (char->integer #\a))
(b (char->integer #\b)) (b (char->integer #\b))
(r1 (->re #\a c)) (r1 (->re #\a c))
(r2 (->re `(* #\a) c)) (r2 (->re `(repeat 0 +inf.0 #\a) c))
(r3 (->re `(* ,r2) c)) (r3 (->re `(repeat 0 +inf.0 ,r2) c))
(r4 (->re `(@ #\a ,r2) c)) (r4 (->re `(concatenation #\a ,r2) c))
(r5 (->re `(* ,r4) c)) (r5 (->re `(repeat 0 +inf.0 ,r4) c))
(r6 (->re `(: ,r5 #\a) c)) (r6 (->re `(union ,r5 #\a) c))
(r7 (->re `(@ ,r2 ,r2) c)) (r7 (->re `(concatenation ,r2 ,r2) c))
(r8 (->re `(~ ,r4) c)) (r8 (->re `(complement ,r4) c))
(r9 (->re `(& ,r2 ,r4) c))) (r9 (->re `(intersection ,r2 ,r4) c)))
((deriveR e a c) z) ((deriveR e a c) z)
((deriveR z a c) z) ((deriveR z a c) z)
((deriveR r1 b c) z) ((deriveR r1 b c) z)
@ -108,16 +116,18 @@
((deriveR r3 b c) z) ((deriveR r3 b c) z)
((deriveR r4 a c) r2) ((deriveR r4 a c) r2)
((deriveR r4 b c) z) ((deriveR r4 b c) z)
((deriveR r5 a c) (->re `(@ ,r2 ,r5) c)) ((deriveR r5 a c) (->re `(concatenation ,r2 ,r5) c))
((deriveR r5 b c) z) ((deriveR r5 b c) z)
((deriveR r6 a c) (->re `(: (@ ,r2 ,r5) (epsilon)) c)) ((deriveR r6 a c) (->re `(union (concatenation ,r2 ,r5) "") c))
((deriveR r6 b c) z) ((deriveR r6 b c) z)
((deriveR r7 a c) (->re `(: (@ ,r2 ,r2) ,r2) c)) ((deriveR r7 a c) (->re `(union (concatenation ,r2 ,r2) ,r2) c))
((deriveR r7 b c) z) ((deriveR r7 b c) z)
((deriveR r8 a c) (->re `(~, r2) c)) ((deriveR r8 a c) (->re `(complement, r2) c))
((deriveR r8 b c) (->re `(~ ,z) c)) ((deriveR r8 b c) (->re `(complement ,z) c))
((deriveR r9 a c) r2) ((deriveR r9 a c) r2)
((deriveR r9 b c) z)) ((deriveR r9 b c) z)
((deriveR (->re `(repeat 1 2 "ab") c) a c)
(->re `(concatenation "b" (repeat 0 1 "ab")) c)))
;; An re-action is (cons re action) ;; An re-action is (cons re action)
@ -161,9 +171,10 @@
((derive a (c->i #\a) c) (list (cons e 1) (cons z 2))) ((derive a (c->i #\a) c) (list (cons e 1) (cons z 2)))
((derive a (c->i #\b) c) (list (cons z 1) (cons e 2))) ((derive a (c->i #\b) c) (list (cons z 1) (cons e 2)))
((derive a (c->i #\c) c) #f) ((derive a (c->i #\c) c) #f)
((derive (list (cons (->re `(: " " "\n" ",") c) 1) ((derive (list (cons (->re `(union " " "\n" ",") c) 1)
(cons (->re `(@ (? "-") (+ (- "0" "9"))) c) 2) (cons (->re `(concatenation (repeat 0 1 "-")
(cons (->re `(@ "-" (+ "-")) c) 3) (repeat 1 +inf.0 (char-range "0" "9"))) c) 2)
(cons (->re `(concatenation "-" (repeat 1 +inf.0 "-")) c) 3)
(cons (->re "[" c) 4) (cons (->re "[" c) 4)
(cons (->re "]" c) 5)) (c->i #\[) c) (cons (->re "]" c) 5)) (c->i #\[) c)
b) b)
@ -195,11 +206,12 @@
(test-block ((c (make-cache)) (test-block ((c (make-cache))
(c->i char->integer) (c->i char->integer)
(r1 (->re `(- #\1 #\4) c)) (r1 (->re `(char-range #\1 #\4) c))
(r2 (->re `(- #\2 #\3) c))) (r2 (->re `(char-range #\2 #\3) c)))
((compute-chars null) null) ((compute-chars null) null)
((compute-chars (list (make-state null 1))) null) ((compute-chars (list (make-state null 1))) null)
((map is:integer-set-contents (compute-chars (list (make-state (list (cons r1 1) (cons r2 2)) 2)))) ((map is:integer-set-contents
(compute-chars (list (make-state (list (cons r1 1) (cons r2 2)) 2))))
(list (is:integer-set-contents (is:make-range (c->i #\2) (c->i #\3))) (list (is:integer-set-contents (is:make-range (c->i #\2) (c->i #\3)))
(is:integer-set-contents (is:union (is:make-range (c->i #\1)) (is:integer-set-contents (is:union (is:make-range (c->i #\1))
(is:make-range (c->i #\4))))))) (is:make-range (c->i #\4)))))))
@ -292,30 +304,35 @@
(define t1 (build-test-dfa null)) (define t1 (build-test-dfa null))
(define t2 (build-test-dfa `(#\a))) (define t2 (build-test-dfa `(#\a)))
(define t3 (build-test-dfa `(#\a #\b))) (define t3 (build-test-dfa `(#\a #\b)))
(define t4 (build-test-dfa `((* #\a) (define t4 (build-test-dfa `((repeat 0 +inf.0 #\a)
(* (@ #\a #\b))))) (repeat 0 +inf.0 (concatenation #\a #\b)))))
(define t5 (build-test-dfa `((@ (* (: #\0 #\1)) #\1)))) (define t5 (build-test-dfa `((concatenation (repeat 0 +inf.0 (union #\0 #\1)) #\1))))
(define t6 (build-test-dfa `((* (* #\a)) (define t6 (build-test-dfa `((repeat 0 +inf.0 (repeat 0 +inf.0 #\a))
(* (@ #\b (* #\b)))))) (repeat 0 +inf.0 (concatenation #\b (repeat 1 +inf.0 #\b))))))
(define t7 (build-test-dfa `((@ (* #\a) (* #\b) (* #\c) (* #\d) (* #\e))))) (define t7 (build-test-dfa `((concatenation (repeat 0 +inf.0 #\a) (repeat 0 +inf.0 #\b)
(repeat 0 +inf.0 #\c) (repeat 0 +inf.0 #\d)
(repeat 0 +inf.0 #\e)))))
(define t8 (define t8
(build-test-dfa `((@ (* (: #\a #\b)) #\a (: #\a #\b) (: #\a #\b) (: #\a #\b) (: #\a #\b))))) (build-test-dfa `((concatenation (repeat 0 +inf.0 (union #\a #\b)) #\a (union #\a #\b)
(define t9 (build-test-dfa `((@ "/*" (union #\a #\b) (union #\a #\b) (union #\a #\b)))))
(~ (@ (&) "*/" (&))) (define t9 (build-test-dfa `((concatenation "/*"
(complement (concatenation (intersection) "*/" (intersection)))
"*/")))) "*/"))))
(define t11 (build-test-dfa `((~ "1")))) (define t11 (build-test-dfa `((complement "1"))))
(define t12 (build-test-dfa `((@ (& (@ (* "a") "b") (define t12 (build-test-dfa `((concatenation (intersection (concatenation (repeat 0 +inf.0 "a") "b")
(@ "a" (* "b"))) (concatenation "a" (repeat 0 +inf.0 "b")))
"ab")))) "ab"))))
(define x (build-test-dfa `((: " " "\n" ",") (define x (build-test-dfa `((union " " "\n" ",")
(@ (? "-") (+ (- "0" "9"))) (concatenation (repeat 0 1 "-") (repeat 1 +inf.0 (char-range "0" "9")))
(@ "-" (+ "-")) (concatenation "-" (repeat 1 +inf.0 "-"))
"[" "["
"]"))) "]")))
(define y (build-test-dfa `((+ (: (@ "|" (* (^ "|")) "|") (define y (build-test-dfa
(@ "|" (* (^ "|")))))))) `((repeat 1 +inf.0
(define t13 (build-test-dfa `((& (@ (&) "111" (&)) (union (concatenation "|" (repeat 0 +inf.0 (char-complement "|")) "|")
(~ (: (@ (&) "01") (concatenation "|" (repeat 0 +inf.0 (char-complement "|"))))))))
(+ "1"))))))) (define t13 (build-test-dfa `((intersection (concatenation (intersection) "111" (intersection))
(complement (union (concatenation (intersection) "01")
(repeat 1 +inf.0 "1")))))))
|# |#
) )

@ -0,0 +1,35 @@
(define-lex-abbrev)
(define-lex-abbrev a)
(define-lex-abbrev (a b) v)
(define-lex-abbrev 1 1)
(define-lex-abbrevs ())
(define-lex-trans)
(define-lex-trans (1 b) 1)
(lexer)
(lexer ("a" "b" "c"))
(lexer ())
(lexer (""))
(lexer (a 1))
(lexer ((a) 1))
(let ((a 1))
(lexer ((a) 1)))
(let-syntax ((a 1))
(lexer ((a) 1)))
(let ()
(define-lex-trans a 1)
(let ()
(lexer ((a) 1))))
(lexer (1 1))
(lexer ((repetition) 1))
(lexer ((repetition #\1 #\1 "3") 1))
(lexer ((repetition 1 #\1 "3") 1))
(lexer ((repetition 1 0 "3") 1))
(lexer ((complement) 1))
(lexer ((char-range) 1))
(lexer ((char-range #\9 #\0) 1))
(lexer ((char-complement) 1))
(lexer ((char-complement (concatenation "1" "2")) 1))

@ -7,8 +7,8 @@
(provide ->re build-epsilon build-zero build-char-set build-concat (provide ->re build-epsilon build-zero build-char-set build-concat
build-repeat build-or build-and build-neg build-repeat build-or build-and build-neg
epsilonR? zeroR? char-setR? concatR? repeatR? orR? andR? negR? epsilonR? zeroR? char-setR? concatR? repeatR? orR? andR? negR?
char-setR-chars concatR-re1 concatR-re2 repeatR-re orR-res char-setR-chars concatR-re1 concatR-re2 repeatR-re repeatR-low repeatR-high
andR-res negR-re orR-res andR-res negR-re
re-nullable? re-index) re-nullable? re-index)
(define max-char-num #x7FFFFFFF) (define max-char-num #x7FFFFFFF)
@ -21,9 +21,9 @@
;; - (make-zeroR bool nat) ;; - (make-zeroR bool nat)
;; - (make-char-setR bool nat char-set) ;; - (make-char-setR bool nat char-set)
;; - (make-concatR bool nat re re) ;; - (make-concatR bool nat re re)
;; - (make-repeatR bool nat re) ;; - (make-repeatR bool nat nat nat-or-+inf.0 re)
;; - (make-orR bool nat (list-of re)) Must not directly contain any orRs ;; - (make-orR bool nat (list-of re)) Must not directly contain any orRs
;; - (make-andR bool nat (list-of re)) Must not directly contain any andRs ;; - (make-andR bool nat (list-of re)) Must not directly contain any andRs
;; - (make-negR bool nat re) ;; - (make-negR bool nat re)
;; ;;
;; Every re must have an index field globally different from all ;; Every re must have an index field globally different from all
@ -33,7 +33,7 @@
(define-struct (zeroR re) () (make-inspector)) (define-struct (zeroR re) () (make-inspector))
(define-struct (char-setR re) (chars) (make-inspector)) (define-struct (char-setR re) (chars) (make-inspector))
(define-struct (concatR re) (re1 re2) (make-inspector)) (define-struct (concatR re) (re1 re2) (make-inspector))
(define-struct (repeatR re) (re) (make-inspector)) (define-struct (repeatR re) (low high re) (make-inspector))
(define-struct (orR re) (res) (make-inspector)) (define-struct (orR re) (res) (make-inspector))
(define-struct (andR re) (res) (make-inspector)) (define-struct (andR re) (res) (make-inspector))
(define-struct (negR re) (re) (make-inspector)) (define-struct (negR re) (re) (make-inspector))
@ -47,22 +47,22 @@
(define z (make-zeroR #f (get-index))) (define z (make-zeroR #f (get-index)))
;; s-re = char match the given character ;; s-re = char constant
;; | string match its sequence of characters ;; | string constant (sequence of characters)
;; | re a precompiled re ;; | re a precompiled re
;; | (epsilon) match the empty string ;; | (repeat low high s-re) repetition between low and high times (inclusive)
;; | (* s-re) match 0 or more ;; | (union s-re ...)
;; | (+ s-re) match 1 or more ;; | (intersection s-re ...)
;; | (? s-re) match 0 or 1 ;; | (complement s-re)
;; | (: s-re ...) match one of the sub-expressions ;; | (concatenation s-re ...)
;; | (& s-re ...) match iff all sub-expressions match ;; | (char-range rng rng) match any character between two (inclusive)
;; | (~ s-re) match iff s-re doesn't match ;; | (char-complement char-set) match any character not listed
;; | (@ s-re ...) match each sub-expression in succession ;; low = natural-number
;; | (- char char) match any character between two (inclusive) ;; high = natural-number or +inf.0
;; | (^ char_or_range ...) match any character not listed ;; rng = char or string with length 1
;; (The null concatenation `(@) means epsilon as does "". ;; (concatenation) (repeat 0 0 x), and "" match the empty string.
;; The null or `(:) means match nothing. The null carat `(^) means match ;; (union) matches no strings.
;; any character. The null intersection `(&) means match string.) ;; (intersection) matches any string.
(define loc:make-range is:make-range) (define loc:make-range is:make-range)
(define loc:union is:union) (define loc:union is:union)
@ -73,43 +73,35 @@
(define (->re exp cache) (define (->re exp cache)
(match exp (match exp
((? char?) (build-char-set (loc:make-range (char->integer exp)) cache)) ((? char?) (build-char-set (loc:make-range (char->integer exp)) cache))
((? string?) (->re `(@ ,@(string->list exp)) cache)) ((? string?) (->re `(concatenation ,@(string->list exp)) cache))
((? re?) exp) ((? re?) exp)
(`(epsilon) (build-epsilon)) (`(repeat ,low ,high ,r)
(`(* ,r) (build-repeat low high (->re r cache) cache))
(build-repeat (->re r cache) cache)) (`(union ,rs ...)
(`(+ ,r)
(->re `(@ ,r (* ,r)) cache))
(`(? ,r)
(let ((c (->re r cache)))
(if (re-nullable? c)
c
(build-or (list e c) cache))))
(`(: ,rs ...)
(build-or (flatten-res (map (lambda (r) (->re r cache)) rs) (build-or (flatten-res (map (lambda (r) (->re r cache)) rs)
orR? orR-res loc:union cache) orR? orR-res loc:union cache)
cache)) cache))
(`(& ,rs ...) (`(intersection ,rs ...)
(build-and (flatten-res (map (lambda (r) (->re r cache)) rs) (build-and (flatten-res (map (lambda (r) (->re r cache)) rs)
andR? andR-res (lambda (a b) andR? andR-res (lambda (a b)
(let-values (((i _ __) (loc:split a b))) i)) (let-values (((i _ __) (loc:split a b))) i))
cache) cache)
cache)) cache))
(`(~ ,r) (`(complement ,r)
(build-neg (->re r cache) cache)) (build-neg (->re r cache) cache))
(`(@ ,rs ...) (`(concatenation ,rs ...)
(foldr (lambda (x y) (foldr (lambda (x y)
(build-concat (->re x cache) y cache)) (build-concat (->re x cache) y cache))
e e
rs)) rs))
(`(- ,c1 ,c2) (`(char-range ,c1 ,c2)
(let ((i1 (char->integer (if (string? c1) (string-ref c1 0) c1))) (let ((i1 (char->integer (if (string? c1) (string-ref c1 0) c1)))
(i2 (char->integer (if (string? c2) (string-ref c2 0) c2)))) (i2 (char->integer (if (string? c2) (string-ref c2 0) c2))))
(if (<= i1 i2) (if (<= i1 i2)
(build-char-set (loc:make-range i1 i2) cache) (build-char-set (loc:make-range i1 i2) cache)
z))) z)))
(`(^ ,crs ...) (`(char-complement ,crs ...)
(let ((cs (->re `(: ,@crs) cache))) (let ((cs (->re `(union ,@crs) cache)))
(cond (cond
((zeroR? cs) (build-char-set (loc:make-range 0 max-char-num) cache)) ((zeroR? cs) (build-char-set (loc:make-range 0 max-char-num) cache))
((char-setR? cs) ((char-setR? cs)
@ -174,15 +166,25 @@
(get-index) (get-index)
r1 r2)))))) r1 r2))))))
;; build-repeat : re cache -> re ;; build-repeat : nat nat-or-+inf.0 re cache -> re
(define (build-repeat r cache) (define (build-repeat low high r cache)
(cond (let ((low (if (< low 0) 0 low)))
((eq? z r) e) (cond
((repeatR? r) r) ((eq? r e) e)
(else ((and (= 0 low) (or (= 0 high) (eq? z r))) e)
(cache (cons 'repeat (re-index r)) ((and (= 1 low) (= 1 high)) r)
(lambda () ((and (repeatR? r)
(make-repeatR #t (get-index) r)))))) (eq? (repeatR-high r) +inf.0)
(or (= 0 (repeatR-low r))
(= 1 (repeatR-low r))))
(build-repeat (* low (repeatR-low r))
+inf.0
(repeatR-re r)
cache))
(else
(cache (cons 'repeat (cons low (cons high (re-index r))))
(lambda ()
(make-repeatR (or (re-nullable? r) (= 0 low)) (get-index) low high r)))))))
;; build-or : (list-of re) cache -> re ;; build-or : (list-of re) cache -> re
@ -229,7 +231,7 @@
(r3 (build-char-set (is:make-range (char->integer #\3)) c)) (r3 (build-char-set (is:make-range (char->integer #\3)) c))
(rc (build-concat r1 r2 c)) (rc (build-concat r1 r2 c))
(rc2 (build-concat r2 r1 c)) (rc2 (build-concat r2 r1 c))
(rr (build-repeat rc c)) (rr (build-repeat 0 +inf.0 rc c))
(ro (build-or `(,rr ,rc ,rr) c)) (ro (build-or `(,rr ,rc ,rr) c))
(ro2 (build-or `(,rc ,rr ,z) c)) (ro2 (build-or `(,rc ,rr ,z) c))
(ro3 (build-or `(,rr ,rc) c)) (ro3 (build-or `(,rr ,rc) c))
@ -263,9 +265,18 @@
((orR-res ro4) (list r1 r2 r3)) ((orR-res ro4) (list r1 r2 r3))
((build-or null c) z) ((build-or null c) z)
((build-or `(,r1 ,z) c) r1) ((build-or `(,r1 ,z) c) r1)
((build-repeat rc c) rr) ((build-repeat 0 +inf.0 rc c) rr)
((build-repeat z c) e) ((build-repeat 0 1 z c) e)
((build-repeat (build-repeat rc c) c) (build-repeat rc c)) ((build-repeat 0 0 rc c) e)
((build-repeat 0 +inf.0 z c) e)
((build-repeat -1 +inf.0 z c) e)
((build-repeat 0 +inf.0 (build-repeat 0 +inf.0 rc c) c)
(build-repeat 0 +inf.0 rc c))
((build-repeat 20 20 (build-repeat 0 +inf.0 rc c) c)
(build-repeat 0 +inf.0 rc c))
((build-repeat 20 20 (build-repeat 1 +inf.0 rc c) c)
(build-repeat 20 +inf.0 rc c))
((build-repeat 1 1 rc c) rc)
((repeatR-re rr) rc) ((repeatR-re rr) rc)
(ra ra2) (ra ra2)
(ra ra3) (ra ra3)
@ -282,6 +293,9 @@
((re-nullable? rc) #f) ((re-nullable? rc) #f)
((re-nullable? (build-concat rr rr c)) #t) ((re-nullable? (build-concat rr rr c)) #t)
((re-nullable? rr) #t) ((re-nullable? rr) #t)
((re-nullable? (build-repeat 0 1 rc c)) #t)
((re-nullable? (build-repeat 1 2 rc c)) #f)
((re-nullable? (build-repeat 1 2 (build-or (list e r1) c) c)) #t)
((re-nullable? ro) #t) ((re-nullable? ro) #t)
((re-nullable? (build-or `(,r1 ,r2) c)) #f) ((re-nullable? (build-or `(,r1 ,r2) c)) #f)
((re-nullable? (build-and `(,r1 ,e) c)) #f) ((re-nullable? (build-and `(,r1 ,e) c)) #f)
@ -293,16 +307,17 @@
(isc is:integer-set-contents) (isc is:integer-set-contents)
(r1 (->re #\1 c)) (r1 (->re #\1 c))
(r2 (->re #\2 c)) (r2 (->re #\2 c))
(r3-5 (->re '(- #\3 #\5) c)) (r3-5 (->re '(char-range #\3 #\5) c))
(r4 (build-or `(,r1 ,r2) c)) (r4 (build-or `(,r1 ,r2) c))
(r5 (->re `(: ,r3-5 #\7) c)) (r5 (->re `(union ,r3-5 #\7) c))
(r6 (->re #\6 c))) (r6 (->re #\6 c)))
((flatten-res null orR? orR-res is:union c) null) ((flatten-res null orR? orR-res is:union c) null)
((isc (char-setR-chars (car (flatten-res `(,r1) orR? orR-res is:union c)))) ((isc (char-setR-chars (car (flatten-res `(,r1) orR? orR-res is:union c))))
(isc (is:make-range (char->integer #\1)))) (isc (is:make-range (char->integer #\1))))
((isc (char-setR-chars (car (flatten-res `(,r4) orR? orR-res is:union c)))) ((isc (char-setR-chars (car (flatten-res `(,r4) orR? orR-res is:union c))))
(isc (is:make-range (char->integer #\1) (char->integer #\2)))) (isc (is:make-range (char->integer #\1) (char->integer #\2))))
((isc (char-setR-chars (car (flatten-res `(,r6 ,r5 ,r4 ,r3-5 ,r2 ,r1) orR? orR-res is:union c)))) ((isc (char-setR-chars (car (flatten-res `(,r6 ,r5 ,r4 ,r3-5 ,r2 ,r1)
orR? orR-res is:union c))))
(isc (is:make-range (char->integer #\1) (char->integer #\7)))) (isc (is:make-range (char->integer #\1) (char->integer #\7))))
((flatten-res `(,r1 ,r2) andR? andR-res (lambda (x y) ((flatten-res `(,r1 ,r2) andR? andR-res (lambda (x y)
(let-values (((i _ __) (let-values (((i _ __)
@ -315,53 +330,58 @@
(test-block ((c (make-cache)) (test-block ((c (make-cache))
(isc is:integer-set-contents) (isc is:integer-set-contents)
(r (->re #\a c)) (r (->re #\a c))
(rr (->re `(@ ,r ,r) c)) (rr (->re `(concatenation ,r ,r) c))
(rrr (->re `(@ ,r ,rr) c)) (rrr (->re `(concatenation ,r ,rr) c))
(rrr* (->re `(* ,rrr) c))) (rrr* (->re `(repeat 0 +inf.0 ,rrr) c)))
((isc (char-setR-chars r)) (isc (is:make-range (char->integer #\a)))) ((isc (char-setR-chars r)) (isc (is:make-range (char->integer #\a))))
((->re "" c) e) ((->re "" c) e)
((->re "asdf" c) (->re `(@ #\a #\s #\d #\f) c)) ((->re "asdf" c) (->re `(concatenation #\a #\s #\d #\f) c))
((->re r c) r) ((->re r c) r)
((->re `(epsilon) c) e) ((->re `(repeat 0 +inf.0 ,r) c) (build-repeat 0 +inf.0 r c))
((->re `(* ,r) c) (build-repeat r c)) ((->re `(repeat 1 +inf.0 ,r) c) (build-repeat 1 +inf.0 r c))
((->re `(+ ,r) c) (build-concat r (build-repeat r c) c)) ((->re `(repeat 0 1 ,r) c) (build-repeat 0 1 r c))
((->re `(? ,r) c) (build-or (list e r) c)) ((->re `(repeat 0 1 ,rrr*) c) rrr*)
((->re `(? ,rrr*) c) rrr*) ((->re `(union (union (char-range #\a #\c)
((->re `(: (: (- #\a #\c) (^ (- #\000 #\110) (- #\112 #\377))) (char-complement (char-range #\000 #\110)
(: (* #\2))) c) (char-range #\112 ,(integer->char max-char-num))))
(union (repeat 0 +inf.0 #\2))) c)
(build-or (list (build-char-set (is:union (is:make-range 73) (build-or (list (build-char-set (is:union (is:make-range 73)
(is:make-range 97 99)) (is:make-range 97 99))
c) c)
(build-repeat (build-char-set (is:make-range 50) c) c)) (build-repeat 0 +inf.0 (build-char-set (is:make-range 50) c) c))
c)) c))
((->re `(: ,rr ,rrr) c) (build-or (list rr rrr) c)) ((->re `(union ,rr ,rrr) c) (build-or (list rr rrr) c))
((->re `(: ,r) c) r) ((->re `(union ,r) c) r)
((->re `(:) c) z) ((->re `(union) c) z)
((->re `(& (& #\111 (^ (- #\000 #\110) (- #\112 #\377))) ((->re `(intersection (intersection #\111
(& (* #\2))) c) (char-complement (char-range #\000 #\110)
(char-range #\112 ,(integer->char max-char-num))))
(intersection (repeat 0 +inf.0 #\2))) c)
(build-and (list (build-char-set (is:make-range 73) c) (build-and (list (build-char-set (is:make-range 73) c)
(build-repeat (build-char-set (is:make-range 50) c) c)) (build-repeat 0 +inf.0 (build-char-set (is:make-range 50) c) c))
c)) c))
((->re `(& (& #\000 (^ (- #\000 #\110) (- #\112 #\377))) ((->re `(intersection (intersection #\000 (char-complement (char-range #\000 #\110)
(& (* #\2))) c) (char-range #\112 ,(integer->char max-char-num))))
(intersection (repeat 0 +inf.0 #\2))) c)
z) z)
((->re `(& ,rr ,rrr) c) (build-and (list rr rrr) c)) ((->re `(intersection ,rr ,rrr) c) (build-and (list rr rrr) c))
((->re `(& ,r) c) r) ((->re `(intersection ,r) c) r)
((->re `(&) c) (build-neg z c)) ((->re `(intersection) c) (build-neg z c))
((->re `(~ ,r) c) (build-neg r c)) ((->re `(complement ,r) c) (build-neg r c))
((->re `(@) c) e) ((->re `(concatenation) c) e)
((->re `(@ ,rrr*) c) rrr*) ((->re `(concatenation ,rrr*) c) rrr*)
(rr (build-concat r r c)) (rr (build-concat r r c))
((->re `(@ ,r ,rr ,rrr) c) ((->re `(concatenation ,r ,rr ,rrr) c)
(build-concat r (build-concat rr rrr c) c)) (build-concat r (build-concat rr rrr c) c))
((isc (char-setR-chars (->re `(- #\1 #\1) c))) (isc (is:make-range 49))) ((isc (char-setR-chars (->re `(char-range #\1 #\1) c))) (isc (is:make-range 49)))
((isc (char-setR-chars (->re `(- #\1 #\9) c))) (isc (is:make-range 49 57))) ((isc (char-setR-chars (->re `(char-range #\1 #\9) c))) (isc (is:make-range 49 57)))
((isc (char-setR-chars (->re `(- "1" "1") c))) (isc (is:make-range 49))) ((isc (char-setR-chars (->re `(char-range "1" "1") c))) (isc (is:make-range 49)))
((isc (char-setR-chars (->re `(- "1" "9") c))) (isc (is:make-range 49 57))) ((isc (char-setR-chars (->re `(char-range "1" "9") c))) (isc (is:make-range 49 57)))
((->re `(- "9" "1") c) z) ((->re `(char-range "9" "1") c) z)
((isc (char-setR-chars (->re `(^) c))) ((isc (char-setR-chars (->re `(char-complement) c)))
(isc (char-setR-chars (->re `(- #\000 #\377) c)))) (isc (char-setR-chars (->re `(char-range #\000 ,(integer->char max-char-num)) c))))
((isc (char-setR-chars (->re `(^ #\001 (- #\002 #\377)) c))) (isc (is:make-range 0))) ((isc (char-setR-chars (->re `(char-complement #\001 (char-range #\002 ,(integer->char max-char-num))) c)))
(isc (is:make-range 0)))
) )
) )

@ -3,28 +3,34 @@
(provide parse) (provide parse)
(define (repetition-error stx) (define (bad-args stx num)
(raise-syntax-error (raise-syntax-error
'regular-expression #f
"must be (repetition non-negative-exact-integer non-negative-exact-integer-or-+inf.0 re)" (format "incorrect number of arguments (should have ~a)" num)
stx)) stx))
(define (char-range-error stx) ;; char-range-arg: syntax-object syntax-object -> nat
(raise-syntax-error ;; If c contains is a character or length 1 string, returns the integer
'regular-expression ;; for the character. Otherwise raises a syntax error.
"must be (char-range char-or-single-char-string char-or-single-char-string)" (define (char-range-arg stx containing-stx)
stx)) (let ((c (syntax-e stx)))
(cond
((char? c) (char->integer c))
((and (string? c) (= (string-length c) 1))
(char->integer (string-ref c 0)))
(else
(raise-syntax-error
#f
"not a char or single-char string"
containing-stx stx)))))
(test-block ()
((char-range-arg #'#\1 #'here) (char->integer #\1))
((char-range-arg #'"1" #'here) (char->integer #\1)))
(define (char-range-arg c stx)
(cond
((char? c) (integer->char c))
((and (string? c (= string-length c 1)))
(integer->char (string-ref c 0)))
(else
(char-range-error stx))))
;; parse : syntax-object -> s-re (see re.ss) ;; parse : syntax-object -> s-re (see re.ss)
;; checks for errors and generates the plain s-exp form for s ;; checks for errors and generates the plain s-exp form for s
;; Expands lex-abbrevs and applies lex-trans.
(define (parse stx) (define (parse stx)
(syntax-case stx (repetition union intersection complement concatenation (syntax-case stx (repetition union intersection complement concatenation
char-range char-complement) char-range char-complement)
@ -35,22 +41,34 @@
(raise-syntax-error 'regular-expression (raise-syntax-error 'regular-expression
"undefined abbreviation" "undefined abbreviation"
stx)) stx))
(parse (lex-abbrev-abbrev expand)))) (parse (lex-abbrev-abbrev expansion))))
(_ (_
(or (char? (syntax-e stx)) (string? (syntax-e stx))) (or (char? (syntax-e stx)) (string? (syntax-e stx)))
(syntax-e stx)) (syntax-e stx))
((repetition arg ...) ((repetition arg ...)
(let ((arg-list (syntax->list (syntax (arg ...))))) (let ((arg-list (syntax->list (syntax (arg ...)))))
(unless (= 3 (length arg-list)) (unless (= 3 (length arg-list))
(repetition-error stx)) (bad-args stx 2))
(let ((lo-val (car arg-list)) (let ((low (syntax-e (car arg-list)))
(hi-val (cadr arg-list)) (high (syntax-e (cadr arg-list)))
(re (caddr arg-list))) (re (caddr arg-list)))
(unless (and (exact? lo-val) (integer? lo-val) (> lo-val 0) (unless (and (number? low) (exact? low) (integer? low) (>= low 0))
(or (and (exact? hi-val) (integer? hi-val) (> hi-val 0)) (raise-syntax-error #f
(eq? hi-val +inf.0))) "not a non-negative exact integer"
(repetition-error stx)) stx
`(repetition ,lo-val ,hi-val ,(parse re))))) (car arg-list)))
(unless (or (and (number? high) (exact? high) (integer? high) (>= high 0))
(eq? high +inf.0))
(raise-syntax-error #f
"not a non-negative exact integer or +inf.0"
stx
(cadr arg-list)))
(unless (<= low high)
(raise-syntax-error
#f
"the first argument is not less than or equal to the second argument"
stx))
`(repetition ,low ,high ,(parse re)))))
((union re ...) ((union re ...)
`(union ,@(map parse (syntax->list (syntax (re ...)))))) `(union ,@(map parse (syntax->list (syntax (re ...))))))
((intersection re ...) ((intersection re ...)
@ -58,66 +76,108 @@
((complement re ...) ((complement re ...)
(let ((re-list (syntax->list (syntax (re ...))))) (let ((re-list (syntax->list (syntax (re ...)))))
(unless (= 1 (length re-list)) (unless (= 1 (length re-list))
(raise-syntax-error 'regular-expression (bad-args stx 1))
"must be (complement re)"
stx))
`(complement ,(parse (car re-list))))) `(complement ,(parse (car re-list)))))
((concatenation re ...) ((concatenation re ...)
`(concatenation ,@(map parse (syntax->list (syntax (re ...)))))) `(concatenation ,@(map parse (syntax->list (syntax (re ...))))))
((char-range arg ...) ((char-range arg ...)
(let ((arg-list (syntax->list (syntax (arg ...))))) (let ((arg-list (syntax->list (syntax (arg ...)))))
(unless (= 2 (length arg-list)) (unless (= 2 (length arg-list))
(char-range-error stx)) (bad-args stx 2))
(let ((i1 (char-range-arg (car arg-list) stx)) (let ((i1 (char-range-arg (car arg-list) stx))
(i2 (char-range-arg (cadr arg-list) stx))) (i2 (char-range-arg (cadr arg-list) stx)))
(if (<= i1 i2) (if (<= i1 i2)
`(char-range ,(integer->char i1) ,(integer->char i2)) `(char-range ,(integer->char i1) ,(integer->char i2))
(raise-syntax-error (raise-syntax-error
'regular-expression #f
(format "first argument ~a does not preceed second argument ~a" "the first argument does not precede or equal second argument"
(car arg-list) (cdr arg-list))
stx))))) stx)))))
((char-complement arg ...) ((char-complement arg ...)
(let ((arg-list (syntax->list (syntax (arg ...))))) (let ((arg-list (syntax->list (syntax (arg ...)))))
(unless (= 1 (length arg-list)) (unless (= 1 (length arg-list))
(raise-syntax-error (bad-args stx 1))
'regular-expression
"must be (char-complement char-set-re)"
stx))
(let ((parsed (parse (car arg-list)))) (let ((parsed (parse (car arg-list))))
(unless (pure-char? parsed) (unless (char-set? parsed)
(raise-syntax-error (raise-syntax-error #f
'regular-expression "not a character set"
"must be (char-complement char-set-re)" stx
stx)) (car arg-list)))
`(char-complement ,parsed)))) `(char-complement ,parsed))))
((op form ...) ((op form ...)
(identifier? (syntax op)) (identifier? (syntax op))
(let ((expansion (syntax-local-value (syntax op) (lambda () #f)))) (let ((expansion (syntax-local-value (syntax op) (lambda () #f))))
(unless (lex-trans? expansion) (cond
(raise-syntax-error 'regular-expression ((lex-trans? expansion)
"undefined operator in" (parse ((lex-trans-f expansion) stx)))
stx)) (expansion
(parse ((lex-trans-f expansion) stx)))) (raise-syntax-error 'regular-expression
"not a lex-trans"
stx))
(else
(raise-syntax-error 'regular-expression
"undefined operator"
stx)))))
(_ (_
(raise-syntax-error (raise-syntax-error
'regular-expression 'regular-expression
"must be char, string, identifier, or (op args ...)" "not a char, string, identifier, or (op args ...)"
stx)))) stx))))
(define (pure-char? s-re) ;; char-set? : s-re -> bool
;; A char-set is an re that matches only strings of length 1.
;; char-set? is conservative.
(define (char-set? s-re)
(cond (cond
((char? s-re) #t) ((char? s-re) #t)
((string? s-re) (= (string-length s-re) 1)) ((string? s-re) (= (string-length s-re) 1))
((list? s-re) ((list? s-re)
(let ((op (car s-re))) (let ((op (car s-re)))
(case op (case op
((union intersection) (andmap pure-char? (cdr s-re))) ((union intersection) (andmap char-set? (cdr s-re)))
((char-range char-complement) #t) ((char-range char-complement) #t)
((repetition)
(and (= (cadr s-re) (caddr s-re)) (char-set? (cadddr s-re))))
((concatenation)
(and (= 2 (length s-re)) (char-set? (cadr s-re))))
(else #f)))) (else #f))))
(else #f))) (else #f)))
(test-block ()
((char-set? #\a) #t)
((char-set? "12") #f)
((char-set? "1") #t)
((char-set? '(repetition 1 2 #\1)) #f)
((char-set? '(repetition 1 1 "12")) #f)
((char-set? '(repetition 1 1 "1")) #t)
((char-set? '(union "1" "2" "3")) #t)
((char-set? '(union "1" "" "3")) #f)
((char-set? '(intersection "1" "2" (union "3" "4"))) #t)
((char-set? '(intersection "1" "")) #f)
((char-set? '(complement "1")) #f)
((char-set? '(concatenation "1" "2")) #f)
((char-set? '(concatenation "" "2")) #f)
((char-set? '(concatenation "1")) #t)
((char-set? '(concatenation "12")) #f)
((char-set? '(char-range #\1 #\2)) #t)
((char-set? '(char-complement #\1)) #t))
(test-block ()
((parse #'#\a) #\a)
((parse #'"1") "1")
((parse #'(repetition 1 1 #\1)) '(repetition 1 1 #\1))
((parse #'(repetition 0 +inf.0 #\1)) '(repetition 0 +inf.0 #\1))
((parse #'(union #\1 (union "2") (union)))
'(union #\1 (union "2") (union)))
((parse #'(intersection #\1 (intersection "2") (intersection)))
'(intersection #\1 (intersection "2") (intersection)))
((parse #'(complement (union #\1 #\2)))
'(complement (union #\1 #\2)))
((parse #'(concatenation "1" "2" (concatenation)))
'(concatenation "1" "2" (concatenation)))
((parse #'(char-range "1" #\1)) '(char-range #\1 #\1))
((parse #'(char-range #\1 "1")) '(char-range #\1 #\1))
((parse #'(char-range "1" "3")) '(char-range #\1 #\3))
((parse #'(char-complement (union "1" "2")))
'(char-complement (union "1" "2"))))
) )

@ -33,6 +33,7 @@
n n
(string->symbol (string->symbol
(format "token-~a" (syntax-object->datum n))) (format "token-~a" (syntax-object->datum n)))
n
n) n)
,@(if empty? '() '(x))) ,@(if empty? '() '(x)))
(make-token ',n ,(if empty? #f 'x)))) (make-token ',n ,(if empty? #f 'x))))

Loading…
Cancel
Save