*** empty log message ***

original commit: 008977eccd4cccf61a54aa66ec4d07b9e93a8462
tokens
Scott Owens 20 years ago
parent 15a113415c
commit 820407d045

@ -14,7 +14,7 @@
(lib "cffi.ss" "compiler")
"private-lex/token.ss")
(provide lexer lexer-src-pos define-lex-abbrev define-lex-abbrevs
(provide lexer lexer-src-pos define-lex-abbrev define-lex-abbrevs define-lex-trans
position-offset position-line position-col position?
define-tokens define-empty-tokens token-name token-value token? file-path
any-char any-string nothing alphabetic lower-case upper-case title-case
@ -28,15 +28,16 @@
(lambda (stx)
(syntax-case stx ()
((_)
(raise-syntax-error #f "empty lexer is not allowed" stx))
(raise-syntax-error #f "accepts the empty string" stx))
((_ re-act ...)
(begin
(for-each
(lambda (x)
(syntax-case x ()
((re act) (void))
(_ (raise-syntax-error 'lexer
"expects regular expression / action pairs"
(_ (raise-syntax-error #f
"not a regular expression / action pair"
stx
x))))
(syntax->list (syntax (re-act ...))))
(let* ((spec/re-act-lst
@ -88,13 +89,14 @@
(define-syntax (define-lex-abbrev stx)
(syntax-case stx ()
((_ name re)
(identifier? (syntax name))
(syntax
(define-syntax name
(make-lex-abbrev (quote-syntax re)))))
(_
(raise-syntax-error
#f
"Form should be (define-lex-abbrev name re)"
"form should be (define-lex-abbrev name re)"
stx))))
(define-syntax (define-lex-abbrevs stx)
@ -107,8 +109,9 @@
(identifier? (syntax name))
(syntax (define-lex-abbrev name re)))
(_ (raise-syntax-error
'Lexer-abbreviation
"Form should be (identifier value)"
#f
"form should be (define-lex-abbrevs (name re) ...)"
stx
a))))
abbrev)))
(datum->syntax-object
@ -118,19 +121,19 @@
(_
(raise-syntax-error
#f
"Form should be (define-lex-abbrevs (name re) ...)"
"form should be (define-lex-abbrevs (name re) ...)"
stx))))
(define-syntax (define-lex-trans stx)
(syntax-case stx ()
((_ name-form body-form)
(let-values (((name body)
(normalize-definition (syntax (define-syntax name-form body-form) #'lambda))))
#`(define-syntax name (make-lex-trans body))))
(normalize-definition (syntax (define-syntax name-form body-form)) #'lambda)))
#`(define-syntax #,name (make-lex-trans #,body))))
(_
(raise-syntax-error
#f
"Form should be (define-lex-trans name transformer)"
"form should be (define-lex-trans name transformer)"
stx))))

@ -12,7 +12,7 @@
;; Don't do anything with this one but extract the chars
(define all-chars (->re `(^) (make-cache)))
(define all-chars (->re `(char-complement (union)) (make-cache)))
;; get-char-groups : re bool -> (list-of char-setR?)
;; Collects the char-setRs in r that could be used in
@ -43,19 +43,23 @@
((get-char-groups e #f) null)
((get-char-groups z #f) null)
((get-char-groups r1 #f) (list r1))
((get-char-groups (->re `(@ ,r1 ,r2) c) #f)
((get-char-groups (->re `(concatenation ,r1 ,r2) c) #f)
(list r1))
((get-char-groups (->re `(@ ,e ,r2) c) #f)
((get-char-groups (->re `(concatenation ,e ,r2) c) #f)
(list r2))
((get-char-groups (->re `(@ (* ,r1) ,r2) c) #f)
((get-char-groups (->re `(concatenation (repeat 0 +inf.0 ,r1) ,r2) c) #f)
(list r1 r2))
((get-char-groups (->re `(* ,r1) c) #f)
((get-char-groups (->re `(repeat 0 +inf.0 ,r1) c) #f)
(list r1))
((get-char-groups (->re `(: (* ,r1) (@ (* ,r2) "3") "4") c) #f)
((get-char-groups
(->re `(union (repeat 0 +inf.0 ,r1)
(concatenation (repeat 0 +inf.0 ,r2) "3") "4") c) #f)
(list r1 r2 (->re "3" c) (->re "4" c)))
((get-char-groups (->re `(~ ,r1) c) #f)
((get-char-groups (->re `(complement ,r1) c) #f)
(list all-chars r1))
((get-char-groups (->re `(& (* ,r1) (@ (* ,r2) "3") "4") c) #f)
((get-char-groups
(->re `(intersection (repeat 0 +inf.0 ,r1)
(concatenation (repeat 0 +inf.0 ,r2) "3") "4") c) #f)
(list r1 r2 (->re "3" c) (->re "4" c)))
)
(define loc:member? is:member?)
@ -74,7 +78,11 @@
(build-or (list d (deriveR r2 c cache)) cache)
d)))
((repeatR? r)
(build-concat (deriveR (repeatR-re r) c cache) r cache))
(build-concat (deriveR (repeatR-re r) c cache)
(build-repeat (sub1 (repeatR-low r))
(sub1 (repeatR-high r))
(repeatR-re r) cache)
cache))
((orR? r)
(build-or (map (lambda (x) (deriveR x c cache))
(orR-res r))
@ -90,14 +98,14 @@
(a (char->integer #\a))
(b (char->integer #\b))
(r1 (->re #\a c))
(r2 (->re `(* #\a) c))
(r3 (->re `(* ,r2) c))
(r4 (->re `(@ #\a ,r2) c))
(r5 (->re `(* ,r4) c))
(r6 (->re `(: ,r5 #\a) c))
(r7 (->re `(@ ,r2 ,r2) c))
(r8 (->re `(~ ,r4) c))
(r9 (->re `(& ,r2 ,r4) c)))
(r2 (->re `(repeat 0 +inf.0 #\a) c))
(r3 (->re `(repeat 0 +inf.0 ,r2) c))
(r4 (->re `(concatenation #\a ,r2) c))
(r5 (->re `(repeat 0 +inf.0 ,r4) c))
(r6 (->re `(union ,r5 #\a) c))
(r7 (->re `(concatenation ,r2 ,r2) c))
(r8 (->re `(complement ,r4) c))
(r9 (->re `(intersection ,r2 ,r4) c)))
((deriveR e a c) z)
((deriveR z a c) z)
((deriveR r1 b c) z)
@ -108,16 +116,18 @@
((deriveR r3 b c) z)
((deriveR r4 a c) r2)
((deriveR r4 b c) z)
((deriveR r5 a c) (->re `(@ ,r2 ,r5) c))
((deriveR r5 a c) (->re `(concatenation ,r2 ,r5) c))
((deriveR r5 b c) z)
((deriveR r6 a c) (->re `(: (@ ,r2 ,r5) (epsilon)) c))
((deriveR r6 a c) (->re `(union (concatenation ,r2 ,r5) "") c))
((deriveR r6 b c) z)
((deriveR r7 a c) (->re `(: (@ ,r2 ,r2) ,r2) c))
((deriveR r7 a c) (->re `(union (concatenation ,r2 ,r2) ,r2) c))
((deriveR r7 b c) z)
((deriveR r8 a c) (->re `(~, r2) c))
((deriveR r8 b c) (->re `(~ ,z) c))
((deriveR r8 a c) (->re `(complement, r2) c))
((deriveR r8 b c) (->re `(complement ,z) c))
((deriveR r9 a c) r2)
((deriveR r9 b c) z))
((deriveR r9 b c) z)
((deriveR (->re `(repeat 1 2 "ab") c) a c)
(->re `(concatenation "b" (repeat 0 1 "ab")) c)))
;; An re-action is (cons re action)
@ -161,9 +171,10 @@
((derive a (c->i #\a) c) (list (cons e 1) (cons z 2)))
((derive a (c->i #\b) c) (list (cons z 1) (cons e 2)))
((derive a (c->i #\c) c) #f)
((derive (list (cons (->re `(: " " "\n" ",") c) 1)
(cons (->re `(@ (? "-") (+ (- "0" "9"))) c) 2)
(cons (->re `(@ "-" (+ "-")) c) 3)
((derive (list (cons (->re `(union " " "\n" ",") c) 1)
(cons (->re `(concatenation (repeat 0 1 "-")
(repeat 1 +inf.0 (char-range "0" "9"))) c) 2)
(cons (->re `(concatenation "-" (repeat 1 +inf.0 "-")) c) 3)
(cons (->re "[" c) 4)
(cons (->re "]" c) 5)) (c->i #\[) c)
b)
@ -195,11 +206,12 @@
(test-block ((c (make-cache))
(c->i char->integer)
(r1 (->re `(- #\1 #\4) c))
(r2 (->re `(- #\2 #\3) c)))
(r1 (->re `(char-range #\1 #\4) c))
(r2 (->re `(char-range #\2 #\3) c)))
((compute-chars null) null)
((compute-chars (list (make-state null 1))) null)
((map is:integer-set-contents (compute-chars (list (make-state (list (cons r1 1) (cons r2 2)) 2))))
((map is:integer-set-contents
(compute-chars (list (make-state (list (cons r1 1) (cons r2 2)) 2))))
(list (is:integer-set-contents (is:make-range (c->i #\2) (c->i #\3)))
(is:integer-set-contents (is:union (is:make-range (c->i #\1))
(is:make-range (c->i #\4)))))))
@ -292,30 +304,35 @@
(define t1 (build-test-dfa null))
(define t2 (build-test-dfa `(#\a)))
(define t3 (build-test-dfa `(#\a #\b)))
(define t4 (build-test-dfa `((* #\a)
(* (@ #\a #\b)))))
(define t5 (build-test-dfa `((@ (* (: #\0 #\1)) #\1))))
(define t6 (build-test-dfa `((* (* #\a))
(* (@ #\b (* #\b))))))
(define t7 (build-test-dfa `((@ (* #\a) (* #\b) (* #\c) (* #\d) (* #\e)))))
(define t4 (build-test-dfa `((repeat 0 +inf.0 #\a)
(repeat 0 +inf.0 (concatenation #\a #\b)))))
(define t5 (build-test-dfa `((concatenation (repeat 0 +inf.0 (union #\0 #\1)) #\1))))
(define t6 (build-test-dfa `((repeat 0 +inf.0 (repeat 0 +inf.0 #\a))
(repeat 0 +inf.0 (concatenation #\b (repeat 1 +inf.0 #\b))))))
(define t7 (build-test-dfa `((concatenation (repeat 0 +inf.0 #\a) (repeat 0 +inf.0 #\b)
(repeat 0 +inf.0 #\c) (repeat 0 +inf.0 #\d)
(repeat 0 +inf.0 #\e)))))
(define t8
(build-test-dfa `((@ (* (: #\a #\b)) #\a (: #\a #\b) (: #\a #\b) (: #\a #\b) (: #\a #\b)))))
(define t9 (build-test-dfa `((@ "/*"
(~ (@ (&) "*/" (&)))
(build-test-dfa `((concatenation (repeat 0 +inf.0 (union #\a #\b)) #\a (union #\a #\b)
(union #\a #\b) (union #\a #\b) (union #\a #\b)))))
(define t9 (build-test-dfa `((concatenation "/*"
(complement (concatenation (intersection) "*/" (intersection)))
"*/"))))
(define t11 (build-test-dfa `((~ "1"))))
(define t12 (build-test-dfa `((@ (& (@ (* "a") "b")
(@ "a" (* "b")))
"ab"))))
(define x (build-test-dfa `((: " " "\n" ",")
(@ (? "-") (+ (- "0" "9")))
(@ "-" (+ "-"))
(define t11 (build-test-dfa `((complement "1"))))
(define t12 (build-test-dfa `((concatenation (intersection (concatenation (repeat 0 +inf.0 "a") "b")
(concatenation "a" (repeat 0 +inf.0 "b")))
"ab"))))
(define x (build-test-dfa `((union " " "\n" ",")
(concatenation (repeat 0 1 "-") (repeat 1 +inf.0 (char-range "0" "9")))
(concatenation "-" (repeat 1 +inf.0 "-"))
"["
"]")))
(define y (build-test-dfa `((+ (: (@ "|" (* (^ "|")) "|")
(@ "|" (* (^ "|"))))))))
(define t13 (build-test-dfa `((& (@ (&) "111" (&))
(~ (: (@ (&) "01")
(+ "1")))))))
(define y (build-test-dfa
`((repeat 1 +inf.0
(union (concatenation "|" (repeat 0 +inf.0 (char-complement "|")) "|")
(concatenation "|" (repeat 0 +inf.0 (char-complement "|"))))))))
(define t13 (build-test-dfa `((intersection (concatenation (intersection) "111" (intersection))
(complement (union (concatenation (intersection) "01")
(repeat 1 +inf.0 "1")))))))
|#
)

@ -0,0 +1,35 @@
(define-lex-abbrev)
(define-lex-abbrev a)
(define-lex-abbrev (a b) v)
(define-lex-abbrev 1 1)
(define-lex-abbrevs ())
(define-lex-trans)
(define-lex-trans (1 b) 1)
(lexer)
(lexer ("a" "b" "c"))
(lexer ())
(lexer (""))
(lexer (a 1))
(lexer ((a) 1))
(let ((a 1))
(lexer ((a) 1)))
(let-syntax ((a 1))
(lexer ((a) 1)))
(let ()
(define-lex-trans a 1)
(let ()
(lexer ((a) 1))))
(lexer (1 1))
(lexer ((repetition) 1))
(lexer ((repetition #\1 #\1 "3") 1))
(lexer ((repetition 1 #\1 "3") 1))
(lexer ((repetition 1 0 "3") 1))
(lexer ((complement) 1))
(lexer ((char-range) 1))
(lexer ((char-range #\9 #\0) 1))
(lexer ((char-complement) 1))
(lexer ((char-complement (concatenation "1" "2")) 1))

@ -7,8 +7,8 @@
(provide ->re build-epsilon build-zero build-char-set build-concat
build-repeat build-or build-and build-neg
epsilonR? zeroR? char-setR? concatR? repeatR? orR? andR? negR?
char-setR-chars concatR-re1 concatR-re2 repeatR-re orR-res
andR-res negR-re
char-setR-chars concatR-re1 concatR-re2 repeatR-re repeatR-low repeatR-high
orR-res andR-res negR-re
re-nullable? re-index)
(define max-char-num #x7FFFFFFF)
@ -21,9 +21,9 @@
;; - (make-zeroR bool nat)
;; - (make-char-setR bool nat char-set)
;; - (make-concatR bool nat re re)
;; - (make-repeatR bool nat re)
;; - (make-orR bool nat (list-of re)) Must not directly contain any orRs
;; - (make-andR bool nat (list-of re)) Must not directly contain any andRs
;; - (make-repeatR bool nat nat nat-or-+inf.0 re)
;; - (make-orR bool nat (list-of re)) Must not directly contain any orRs
;; - (make-andR bool nat (list-of re)) Must not directly contain any andRs
;; - (make-negR bool nat re)
;;
;; Every re must have an index field globally different from all
@ -33,7 +33,7 @@
(define-struct (zeroR re) () (make-inspector))
(define-struct (char-setR re) (chars) (make-inspector))
(define-struct (concatR re) (re1 re2) (make-inspector))
(define-struct (repeatR re) (re) (make-inspector))
(define-struct (repeatR re) (low high re) (make-inspector))
(define-struct (orR re) (res) (make-inspector))
(define-struct (andR re) (res) (make-inspector))
(define-struct (negR re) (re) (make-inspector))
@ -47,23 +47,23 @@
(define z (make-zeroR #f (get-index)))
;; s-re = char match the given character
;; | string match its sequence of characters
;; s-re = char constant
;; | string constant (sequence of characters)
;; | re a precompiled re
;; | (epsilon) match the empty string
;; | (* s-re) match 0 or more
;; | (+ s-re) match 1 or more
;; | (? s-re) match 0 or 1
;; | (: s-re ...) match one of the sub-expressions
;; | (& s-re ...) match iff all sub-expressions match
;; | (~ s-re) match iff s-re doesn't match
;; | (@ s-re ...) match each sub-expression in succession
;; | (- char char) match any character between two (inclusive)
;; | (^ char_or_range ...) match any character not listed
;; (The null concatenation `(@) means epsilon as does "".
;; The null or `(:) means match nothing. The null carat `(^) means match
;; any character. The null intersection `(&) means match string.)
;; | (repeat low high s-re) repetition between low and high times (inclusive)
;; | (union s-re ...)
;; | (intersection s-re ...)
;; | (complement s-re)
;; | (concatenation s-re ...)
;; | (char-range rng rng) match any character between two (inclusive)
;; | (char-complement char-set) match any character not listed
;; low = natural-number
;; high = natural-number or +inf.0
;; rng = char or string with length 1
;; (concatenation) (repeat 0 0 x), and "" match the empty string.
;; (union) matches no strings.
;; (intersection) matches any string.
(define loc:make-range is:make-range)
(define loc:union is:union)
(define loc:split is:split)
@ -73,43 +73,35 @@
(define (->re exp cache)
(match exp
((? char?) (build-char-set (loc:make-range (char->integer exp)) cache))
((? string?) (->re `(@ ,@(string->list exp)) cache))
((? string?) (->re `(concatenation ,@(string->list exp)) cache))
((? re?) exp)
(`(epsilon) (build-epsilon))
(`(* ,r)
(build-repeat (->re r cache) cache))
(`(+ ,r)
(->re `(@ ,r (* ,r)) cache))
(`(? ,r)
(let ((c (->re r cache)))
(if (re-nullable? c)
c
(build-or (list e c) cache))))
(`(: ,rs ...)
(`(repeat ,low ,high ,r)
(build-repeat low high (->re r cache) cache))
(`(union ,rs ...)
(build-or (flatten-res (map (lambda (r) (->re r cache)) rs)
orR? orR-res loc:union cache)
cache))
(`(& ,rs ...)
(`(intersection ,rs ...)
(build-and (flatten-res (map (lambda (r) (->re r cache)) rs)
andR? andR-res (lambda (a b)
(let-values (((i _ __) (loc:split a b))) i))
cache)
cache))
(`(~ ,r)
(`(complement ,r)
(build-neg (->re r cache) cache))
(`(@ ,rs ...)
(`(concatenation ,rs ...)
(foldr (lambda (x y)
(build-concat (->re x cache) y cache))
e
rs))
(`(- ,c1 ,c2)
(`(char-range ,c1 ,c2)
(let ((i1 (char->integer (if (string? c1) (string-ref c1 0) c1)))
(i2 (char->integer (if (string? c2) (string-ref c2 0) c2))))
(if (<= i1 i2)
(build-char-set (loc:make-range i1 i2) cache)
z)))
(`(^ ,crs ...)
(let ((cs (->re `(: ,@crs) cache)))
(`(char-complement ,crs ...)
(let ((cs (->re `(union ,@crs) cache)))
(cond
((zeroR? cs) (build-char-set (loc:make-range 0 max-char-num) cache))
((char-setR? cs)
@ -174,15 +166,25 @@
(get-index)
r1 r2))))))
;; build-repeat : re cache -> re
(define (build-repeat r cache)
(cond
((eq? z r) e)
((repeatR? r) r)
(else
(cache (cons 'repeat (re-index r))
(lambda ()
(make-repeatR #t (get-index) r))))))
;; build-repeat : nat nat-or-+inf.0 re cache -> re
(define (build-repeat low high r cache)
(let ((low (if (< low 0) 0 low)))
(cond
((eq? r e) e)
((and (= 0 low) (or (= 0 high) (eq? z r))) e)
((and (= 1 low) (= 1 high)) r)
((and (repeatR? r)
(eq? (repeatR-high r) +inf.0)
(or (= 0 (repeatR-low r))
(= 1 (repeatR-low r))))
(build-repeat (* low (repeatR-low r))
+inf.0
(repeatR-re r)
cache))
(else
(cache (cons 'repeat (cons low (cons high (re-index r))))
(lambda ()
(make-repeatR (or (re-nullable? r) (= 0 low)) (get-index) low high r)))))))
;; build-or : (list-of re) cache -> re
@ -229,7 +231,7 @@
(r3 (build-char-set (is:make-range (char->integer #\3)) c))
(rc (build-concat r1 r2 c))
(rc2 (build-concat r2 r1 c))
(rr (build-repeat rc c))
(rr (build-repeat 0 +inf.0 rc c))
(ro (build-or `(,rr ,rc ,rr) c))
(ro2 (build-or `(,rc ,rr ,z) c))
(ro3 (build-or `(,rr ,rc) c))
@ -263,9 +265,18 @@
((orR-res ro4) (list r1 r2 r3))
((build-or null c) z)
((build-or `(,r1 ,z) c) r1)
((build-repeat rc c) rr)
((build-repeat z c) e)
((build-repeat (build-repeat rc c) c) (build-repeat rc c))
((build-repeat 0 +inf.0 rc c) rr)
((build-repeat 0 1 z c) e)
((build-repeat 0 0 rc c) e)
((build-repeat 0 +inf.0 z c) e)
((build-repeat -1 +inf.0 z c) e)
((build-repeat 0 +inf.0 (build-repeat 0 +inf.0 rc c) c)
(build-repeat 0 +inf.0 rc c))
((build-repeat 20 20 (build-repeat 0 +inf.0 rc c) c)
(build-repeat 0 +inf.0 rc c))
((build-repeat 20 20 (build-repeat 1 +inf.0 rc c) c)
(build-repeat 20 +inf.0 rc c))
((build-repeat 1 1 rc c) rc)
((repeatR-re rr) rc)
(ra ra2)
(ra ra3)
@ -282,6 +293,9 @@
((re-nullable? rc) #f)
((re-nullable? (build-concat rr rr c)) #t)
((re-nullable? rr) #t)
((re-nullable? (build-repeat 0 1 rc c)) #t)
((re-nullable? (build-repeat 1 2 rc c)) #f)
((re-nullable? (build-repeat 1 2 (build-or (list e r1) c) c)) #t)
((re-nullable? ro) #t)
((re-nullable? (build-or `(,r1 ,r2) c)) #f)
((re-nullable? (build-and `(,r1 ,e) c)) #f)
@ -293,16 +307,17 @@
(isc is:integer-set-contents)
(r1 (->re #\1 c))
(r2 (->re #\2 c))
(r3-5 (->re '(- #\3 #\5) c))
(r3-5 (->re '(char-range #\3 #\5) c))
(r4 (build-or `(,r1 ,r2) c))
(r5 (->re `(: ,r3-5 #\7) c))
(r5 (->re `(union ,r3-5 #\7) c))
(r6 (->re #\6 c)))
((flatten-res null orR? orR-res is:union c) null)
((isc (char-setR-chars (car (flatten-res `(,r1) orR? orR-res is:union c))))
(isc (is:make-range (char->integer #\1))))
((isc (char-setR-chars (car (flatten-res `(,r4) orR? orR-res is:union c))))
(isc (is:make-range (char->integer #\1) (char->integer #\2))))
((isc (char-setR-chars (car (flatten-res `(,r6 ,r5 ,r4 ,r3-5 ,r2 ,r1) orR? orR-res is:union c))))
((isc (char-setR-chars (car (flatten-res `(,r6 ,r5 ,r4 ,r3-5 ,r2 ,r1)
orR? orR-res is:union c))))
(isc (is:make-range (char->integer #\1) (char->integer #\7))))
((flatten-res `(,r1 ,r2) andR? andR-res (lambda (x y)
(let-values (((i _ __)
@ -315,53 +330,58 @@
(test-block ((c (make-cache))
(isc is:integer-set-contents)
(r (->re #\a c))
(rr (->re `(@ ,r ,r) c))
(rrr (->re `(@ ,r ,rr) c))
(rrr* (->re `(* ,rrr) c)))
(rr (->re `(concatenation ,r ,r) c))
(rrr (->re `(concatenation ,r ,rr) c))
(rrr* (->re `(repeat 0 +inf.0 ,rrr) c)))
((isc (char-setR-chars r)) (isc (is:make-range (char->integer #\a))))
((->re "" c) e)
((->re "asdf" c) (->re `(@ #\a #\s #\d #\f) c))
((->re "asdf" c) (->re `(concatenation #\a #\s #\d #\f) c))
((->re r c) r)
((->re `(epsilon) c) e)
((->re `(* ,r) c) (build-repeat r c))
((->re `(+ ,r) c) (build-concat r (build-repeat r c) c))
((->re `(? ,r) c) (build-or (list e r) c))
((->re `(? ,rrr*) c) rrr*)
((->re `(: (: (- #\a #\c) (^ (- #\000 #\110) (- #\112 #\377)))
(: (* #\2))) c)
((->re `(repeat 0 +inf.0 ,r) c) (build-repeat 0 +inf.0 r c))
((->re `(repeat 1 +inf.0 ,r) c) (build-repeat 1 +inf.0 r c))
((->re `(repeat 0 1 ,r) c) (build-repeat 0 1 r c))
((->re `(repeat 0 1 ,rrr*) c) rrr*)
((->re `(union (union (char-range #\a #\c)
(char-complement (char-range #\000 #\110)
(char-range #\112 ,(integer->char max-char-num))))
(union (repeat 0 +inf.0 #\2))) c)
(build-or (list (build-char-set (is:union (is:make-range 73)
(is:make-range 97 99))
c)
(build-repeat (build-char-set (is:make-range 50) c) c))
(build-repeat 0 +inf.0 (build-char-set (is:make-range 50) c) c))
c))
((->re `(: ,rr ,rrr) c) (build-or (list rr rrr) c))
((->re `(: ,r) c) r)
((->re `(:) c) z)
((->re `(& (& #\111 (^ (- #\000 #\110) (- #\112 #\377)))
(& (* #\2))) c)
((->re `(union ,rr ,rrr) c) (build-or (list rr rrr) c))
((->re `(union ,r) c) r)
((->re `(union) c) z)
((->re `(intersection (intersection #\111
(char-complement (char-range #\000 #\110)
(char-range #\112 ,(integer->char max-char-num))))
(intersection (repeat 0 +inf.0 #\2))) c)
(build-and (list (build-char-set (is:make-range 73) c)
(build-repeat (build-char-set (is:make-range 50) c) c))
(build-repeat 0 +inf.0 (build-char-set (is:make-range 50) c) c))
c))
((->re `(& (& #\000 (^ (- #\000 #\110) (- #\112 #\377)))
(& (* #\2))) c)
((->re `(intersection (intersection #\000 (char-complement (char-range #\000 #\110)
(char-range #\112 ,(integer->char max-char-num))))
(intersection (repeat 0 +inf.0 #\2))) c)
z)
((->re `(& ,rr ,rrr) c) (build-and (list rr rrr) c))
((->re `(& ,r) c) r)
((->re `(&) c) (build-neg z c))
((->re `(~ ,r) c) (build-neg r c))
((->re `(@) c) e)
((->re `(@ ,rrr*) c) rrr*)
((->re `(intersection ,rr ,rrr) c) (build-and (list rr rrr) c))
((->re `(intersection ,r) c) r)
((->re `(intersection) c) (build-neg z c))
((->re `(complement ,r) c) (build-neg r c))
((->re `(concatenation) c) e)
((->re `(concatenation ,rrr*) c) rrr*)
(rr (build-concat r r c))
((->re `(@ ,r ,rr ,rrr) c)
((->re `(concatenation ,r ,rr ,rrr) c)
(build-concat r (build-concat rr rrr c) c))
((isc (char-setR-chars (->re `(- #\1 #\1) c))) (isc (is:make-range 49)))
((isc (char-setR-chars (->re `(- #\1 #\9) c))) (isc (is:make-range 49 57)))
((isc (char-setR-chars (->re `(- "1" "1") c))) (isc (is:make-range 49)))
((isc (char-setR-chars (->re `(- "1" "9") c))) (isc (is:make-range 49 57)))
((->re `(- "9" "1") c) z)
((isc (char-setR-chars (->re `(^) c)))
(isc (char-setR-chars (->re `(- #\000 #\377) c))))
((isc (char-setR-chars (->re `(^ #\001 (- #\002 #\377)) c))) (isc (is:make-range 0)))
((isc (char-setR-chars (->re `(char-range #\1 #\1) c))) (isc (is:make-range 49)))
((isc (char-setR-chars (->re `(char-range #\1 #\9) c))) (isc (is:make-range 49 57)))
((isc (char-setR-chars (->re `(char-range "1" "1") c))) (isc (is:make-range 49)))
((isc (char-setR-chars (->re `(char-range "1" "9") c))) (isc (is:make-range 49 57)))
((->re `(char-range "9" "1") c) z)
((isc (char-setR-chars (->re `(char-complement) c)))
(isc (char-setR-chars (->re `(char-range #\000 ,(integer->char max-char-num)) c))))
((isc (char-setR-chars (->re `(char-complement #\001 (char-range #\002 ,(integer->char max-char-num))) c)))
(isc (is:make-range 0)))
)
)

@ -2,29 +2,35 @@
(require "util.ss")
(provide parse)
(define (repetition-error stx)
(define (bad-args stx num)
(raise-syntax-error
'regular-expression
"must be (repetition non-negative-exact-integer non-negative-exact-integer-or-+inf.0 re)"
#f
(format "incorrect number of arguments (should have ~a)" num)
stx))
(define (char-range-error stx)
(raise-syntax-error
'regular-expression
"must be (char-range char-or-single-char-string char-or-single-char-string)"
stx))
;; char-range-arg: syntax-object syntax-object -> nat
;; If c contains is a character or length 1 string, returns the integer
;; for the character. Otherwise raises a syntax error.
(define (char-range-arg stx containing-stx)
(let ((c (syntax-e stx)))
(cond
((char? c) (char->integer c))
((and (string? c) (= (string-length c) 1))
(char->integer (string-ref c 0)))
(else
(raise-syntax-error
#f
"not a char or single-char string"
containing-stx stx)))))
(test-block ()
((char-range-arg #'#\1 #'here) (char->integer #\1))
((char-range-arg #'"1" #'here) (char->integer #\1)))
(define (char-range-arg c stx)
(cond
((char? c) (integer->char c))
((and (string? c (= string-length c 1)))
(integer->char (string-ref c 0)))
(else
(char-range-error stx))))
;; parse : syntax-object -> s-re (see re.ss)
;; checks for errors and generates the plain s-exp form for s
;; Expands lex-abbrevs and applies lex-trans.
(define (parse stx)
(syntax-case stx (repetition union intersection complement concatenation
char-range char-complement)
@ -35,22 +41,34 @@
(raise-syntax-error 'regular-expression
"undefined abbreviation"
stx))
(parse (lex-abbrev-abbrev expand))))
(parse (lex-abbrev-abbrev expansion))))
(_
(or (char? (syntax-e stx)) (string? (syntax-e stx)))
(syntax-e stx))
((repetition arg ...)
(let ((arg-list (syntax->list (syntax (arg ...)))))
(unless (= 3 (length arg-list))
(repetition-error stx))
(let ((lo-val (car arg-list))
(hi-val (cadr arg-list))
(bad-args stx 2))
(let ((low (syntax-e (car arg-list)))
(high (syntax-e (cadr arg-list)))
(re (caddr arg-list)))
(unless (and (exact? lo-val) (integer? lo-val) (> lo-val 0)
(or (and (exact? hi-val) (integer? hi-val) (> hi-val 0))
(eq? hi-val +inf.0)))
(repetition-error stx))
`(repetition ,lo-val ,hi-val ,(parse re)))))
(unless (and (number? low) (exact? low) (integer? low) (>= low 0))
(raise-syntax-error #f
"not a non-negative exact integer"
stx
(car arg-list)))
(unless (or (and (number? high) (exact? high) (integer? high) (>= high 0))
(eq? high +inf.0))
(raise-syntax-error #f
"not a non-negative exact integer or +inf.0"
stx
(cadr arg-list)))
(unless (<= low high)
(raise-syntax-error
#f
"the first argument is not less than or equal to the second argument"
stx))
`(repetition ,low ,high ,(parse re)))))
((union re ...)
`(union ,@(map parse (syntax->list (syntax (re ...))))))
((intersection re ...)
@ -58,66 +76,108 @@
((complement re ...)
(let ((re-list (syntax->list (syntax (re ...)))))
(unless (= 1 (length re-list))
(raise-syntax-error 'regular-expression
"must be (complement re)"
stx))
(bad-args stx 1))
`(complement ,(parse (car re-list)))))
((concatenation re ...)
`(concatenation ,@(map parse (syntax->list (syntax (re ...))))))
((char-range arg ...)
(let ((arg-list (syntax->list (syntax (arg ...)))))
(unless (= 2 (length arg-list))
(char-range-error stx))
(bad-args stx 2))
(let ((i1 (char-range-arg (car arg-list) stx))
(i2 (char-range-arg (cadr arg-list) stx)))
(if (<= i1 i2)
`(char-range ,(integer->char i1) ,(integer->char i2))
(raise-syntax-error
'regular-expression
(format "first argument ~a does not preceed second argument ~a"
(car arg-list) (cdr arg-list))
#f
"the first argument does not precede or equal second argument"
stx)))))
((char-complement arg ...)
(let ((arg-list (syntax->list (syntax (arg ...)))))
(unless (= 1 (length arg-list))
(raise-syntax-error
'regular-expression
"must be (char-complement char-set-re)"
stx))
(bad-args stx 1))
(let ((parsed (parse (car arg-list))))
(unless (pure-char? parsed)
(raise-syntax-error
'regular-expression
"must be (char-complement char-set-re)"
stx))
(unless (char-set? parsed)
(raise-syntax-error #f
"not a character set"
stx
(car arg-list)))
`(char-complement ,parsed))))
((op form ...)
(identifier? (syntax op))
(let ((expansion (syntax-local-value (syntax op) (lambda () #f))))
(unless (lex-trans? expansion)
(raise-syntax-error 'regular-expression
"undefined operator in"
stx))
(parse ((lex-trans-f expansion) stx))))
(cond
((lex-trans? expansion)
(parse ((lex-trans-f expansion) stx)))
(expansion
(raise-syntax-error 'regular-expression
"not a lex-trans"
stx))
(else
(raise-syntax-error 'regular-expression
"undefined operator"
stx)))))
(_
(raise-syntax-error
'regular-expression
"must be char, string, identifier, or (op args ...)"
"not a char, string, identifier, or (op args ...)"
stx))))
(define (pure-char? s-re)
;; char-set? : s-re -> bool
;; A char-set is an re that matches only strings of length 1.
;; char-set? is conservative.
(define (char-set? s-re)
(cond
((char? s-re) #t)
((string? s-re) (= (string-length s-re) 1))
((list? s-re)
(let ((op (car s-re)))
(case op
((union intersection) (andmap pure-char? (cdr s-re)))
((union intersection) (andmap char-set? (cdr s-re)))
((char-range char-complement) #t)
((repetition)
(and (= (cadr s-re) (caddr s-re)) (char-set? (cadddr s-re))))
((concatenation)
(and (= 2 (length s-re)) (char-set? (cadr s-re))))
(else #f))))
(else #f)))
(test-block ()
((char-set? #\a) #t)
((char-set? "12") #f)
((char-set? "1") #t)
((char-set? '(repetition 1 2 #\1)) #f)
((char-set? '(repetition 1 1 "12")) #f)
((char-set? '(repetition 1 1 "1")) #t)
((char-set? '(union "1" "2" "3")) #t)
((char-set? '(union "1" "" "3")) #f)
((char-set? '(intersection "1" "2" (union "3" "4"))) #t)
((char-set? '(intersection "1" "")) #f)
((char-set? '(complement "1")) #f)
((char-set? '(concatenation "1" "2")) #f)
((char-set? '(concatenation "" "2")) #f)
((char-set? '(concatenation "1")) #t)
((char-set? '(concatenation "12")) #f)
((char-set? '(char-range #\1 #\2)) #t)
((char-set? '(char-complement #\1)) #t))
(test-block ()
((parse #'#\a) #\a)
((parse #'"1") "1")
((parse #'(repetition 1 1 #\1)) '(repetition 1 1 #\1))
((parse #'(repetition 0 +inf.0 #\1)) '(repetition 0 +inf.0 #\1))
((parse #'(union #\1 (union "2") (union)))
'(union #\1 (union "2") (union)))
((parse #'(intersection #\1 (intersection "2") (intersection)))
'(intersection #\1 (intersection "2") (intersection)))
((parse #'(complement (union #\1 #\2)))
'(complement (union #\1 #\2)))
((parse #'(concatenation "1" "2" (concatenation)))
'(concatenation "1" "2" (concatenation)))
((parse #'(char-range "1" #\1)) '(char-range #\1 #\1))
((parse #'(char-range #\1 "1")) '(char-range #\1 #\1))
((parse #'(char-range "1" "3")) '(char-range #\1 #\3))
((parse #'(char-complement (union "1" "2")))
'(char-complement (union "1" "2"))))
)

@ -33,6 +33,7 @@
n
(string->symbol
(format "token-~a" (syntax-object->datum n)))
n
n)
,@(if empty? '() '(x)))
(make-token ',n ,(if empty? #f 'x))))

Loading…
Cancel
Save