A new lexer generator backend based on Brzozowski derivatives.

original commit: fb626b980fd3193e34cd3f6fabb1e95bd1aeb08a
tokens
Scott Owens 21 years ago
parent 486e905094
commit 7b918d0164

@ -3,12 +3,13 @@
;; Provides the syntax used to create lexers and the functions needed to ;; Provides the syntax used to create lexers and the functions needed to
;; create and use the buffer that the lexer reads from. See doc.txt. ;; create and use the buffer that the lexer reads from. See doc.txt.
(require-for-syntax "private-lex/generate-code.ss" (require-for-syntax "private-lex/util.ss"
"private-lex/structs.ss") "private-lex/actions.ss"
"private-lex/front.ss")
(require (lib "readerr.ss" "syntax") (require (lib "readerr.ss" "syntax")
"private-lex/token.ss" (lib "cffi.ss" "compiler")
(lib "cffi.ss" "compiler")) "private-lex/token.ss")
(provide lexer lexer-src-pos define-lex-abbrev define-lex-abbrevs (provide lexer lexer-src-pos define-lex-abbrev define-lex-abbrevs
position-offset position-line position-col position? position-offset position-line position-col position?
@ -16,33 +17,6 @@
(define file-path (make-parameter #f)) (define file-path (make-parameter #f))
#;(define-syntaxes (lexer-exp lexer-src-pos-exp)
(let ((build-lexer
(lambda (wrap?)
(lambda (stx)
(syntax-case stx ()
((_)
(raise-syntax-error #f "empty lexer is not allowed" stx))
((_ re-act ...)
(begin
(for-each
(lambda (x)
(syntax-case x ()
((re act) (void))
(_ (raise-syntax-error 'lexer
"expects regular expression / action pairs"
x))))
(syntax->list (syntax (re-act ...))))
(let ((table (generate-table (syntax (re-act ...)) stx)))
(with-syntax ((code (compile-table table))
(actions-stx `(vector ,@(vector->list (table-actions table))))
(wrap? wrap?))
(syntax
(compiled-lexer-body code actions-stx wrap?)))))))))))
(values
(build-lexer #f)
(build-lexer #t))))
(define-syntaxes (lexer lexer-src-pos) (define-syntaxes (lexer lexer-src-pos)
(let ((build-lexer (let ((build-lexer
(lambda (wrap?) (lambda (wrap?)
@ -62,6 +36,8 @@
(syntax->list (syntax (re-act ...)))) (syntax->list (syntax (re-act ...))))
(let* ((spec/re-act-lst (let* ((spec/re-act-lst
(syntax->list (syntax (re-act ...)))) (syntax->list (syntax (re-act ...))))
(eof-act
(get-special-action spec/re-act-lst 'eof #''eof))
(spec-act (spec-act
(get-special-action spec/re-act-lst 'special #'(void))) (get-special-action spec/re-act-lst 'special #'(void)))
(spec-error-act (spec-error-act
@ -70,13 +46,15 @@
(get-special-action spec/re-act-lst 'special-comment #'#f)) (get-special-action spec/re-act-lst 'special-comment #'#f))
(re-act-lst (re-act-lst
(filter-out-specials spec/re-act-lst (filter-out-specials spec/re-act-lst
'(special special-comment special-error)))) '(special special-comment special-error eof))))
(let ((table (generate-table re-act-lst #'here stx))) (let-values (((trans start actions no-look)
(with-syntax ((start-state-stx (table-start table)) (build-lexer re-act-lst)))
(trans-table-stx (table-trans table)) (with-syntax ((start-state-stx start)
(eof-table-stx (table-eof table)) (trans-table-stx trans)
(no-lookahead-stx (table-no-lookahead table)) (no-lookahead-stx no-look)
(actions-stx `(vector ,@(vector->list (table-actions table)))) (actions-stx `(vector ,@(map (lambda (a)
(wrap-action a 'lexeme #'here a))
(vector->list actions))))
(spec-act-stx (spec-act-stx
(wrap-action spec-act 'special #'here spec-act)) (wrap-action spec-act 'special #'here spec-act))
(spec-error-act-stx (spec-error-act-stx
@ -84,17 +62,19 @@
(has-comment-act?-stx (if (syntax-e spec-comment-act) #t #f)) (has-comment-act?-stx (if (syntax-e spec-comment-act) #t #f))
(spec-comment-act-stx (spec-comment-act-stx
(wrap-action spec-comment-act (gensym) #'here spec-comment-act)) (wrap-action spec-comment-act (gensym) #'here spec-comment-act))
(eof-act-stx
(wrap-action eof-act 'lexeme #'here eof-act))
(wrap? wrap?)) (wrap? wrap?))
(syntax (syntax
(lexer-body start-state-stx (lexer-body start-state-stx
trans-table-stx trans-table-stx
eof-table-stx
actions-stx actions-stx
no-lookahead-stx no-lookahead-stx
spec-act-stx spec-act-stx
spec-error-act-stx spec-error-act-stx
has-comment-act?-stx has-comment-act?-stx
spec-comment-act-stx spec-comment-act-stx
eof-act-stx
wrap?)))))))))))) wrap?))))))))))))
(values (values
(build-lexer #f) (build-lexer #f)
@ -136,23 +116,10 @@
"Form should be (define-lex-abbrevs (name re) ...)" "Form should be (define-lex-abbrevs (name re) ...)"
stx)))) stx))))
#;(define (compiled-lexer-body lexer actions wrap?)
(lambda (ip)
(unless (input-port? ip)
(raise-type-error
'lexer
"input-port"
0
ip))
(let ((first-pos (get-position ip)))
(let-values (((longest-match-length length longest-match-action)
(lexer ip peek-string)))
(check-match ip first-pos longest-match-length length
(vector-ref actions longest-match-action) wrap?)))))
(define (lexer-body start-state trans-table eof-table actions no-lookahead (define (lexer-body start-state trans-table actions no-lookahead
special-action special-error-action special-action special-error-action
has-special-comment-action? special-comment-action wrap?) has-special-comment-action? special-comment-action eof-action wrap?)
(letrec ((lexer (letrec ((lexer
(lambda (ip) (lambda (ip)
(unless (input-port? ip) (unless (input-port? ip)
@ -167,9 +134,9 @@
((eq? 'special first-char) ((eq? 'special first-char)
(let* ((comment? #f) (let* ((comment? #f)
(error? #f) (error? #f)
(spec (with-handlers ((special-comment? (spec (with-handlers ((exn:special-comment?
(lambda (x) (set! comment? #t))) (lambda (x) (set! comment? #t)))
(exn:fail? (not-break-exn?
(lambda (ex) (set! error? #t) ex))) (lambda (ex) (set! error? #t) ex)))
(read-char-or-special ip)))) (read-char-or-special ip))))
(cond (cond
@ -181,6 +148,8 @@
(error? special-error-action) (error? special-error-action)
(else special-action)) (else special-action))
spec wrap?))))) spec wrap?)))))
((eof-object? first-char)
(do-match ip first-pos eof-action (read-char-or-special ip) wrap?))
(else (else
(let lexer-loop ( (let lexer-loop (
;; current-state ;; current-state
@ -198,10 +167,8 @@
(longest-match-length 1)) (longest-match-length 1))
(let ((next-state (let ((next-state
(cond (cond
((eof-object? char) ((eof-object? char) #f)
(vector-ref eof-table state)) ((eq? char 'special) #f)
((eq? char 'special)
#f)
(else (else
(vector-ref (vector-ref
trans-table trans-table

@ -0,0 +1,42 @@
(module actions mzscheme
(provide (all-defined))
;; wrap-action: (syntax-object or #f) symbol syntax-object syntax-object -> syntax-object
(define (wrap-action action result-name ctxt loc)
(if action
(let ((parms (datum->syntax-object
action
`(start-pos end-pos ,result-name return-without-pos input-port))))
(datum->syntax-object ctxt
`(lambda ,parms ,action)
loc))
(datum->syntax-object ctxt 'void loc)))
;; get-special-action: (syntax-object list) symbol 'a -> syntax-object or 'a
;; Returns the first action from a rule of the form ((which-special) action)
(define (get-special-action rules which-special none)
(cond
((null? rules) none)
(else
(syntax-case (car rules) ()
(((special) act)
(eq? (syntax-e (syntax special)) which-special)
(syntax act))
(_ (get-special-action (cdr rules) which-special none))))))
;; filter-out-specials: (syntax-object list) (symbol list) -> (syntax-object list)
;; Returns a list missing all the rules of the form ((special) action)
;; where special is a symbol in which specials.
(define (filter-out-specials rules which-specials)
(cond
((null? rules) null)
(else
(syntax-case (car rules) ()
(((special) act)
(memq (syntax-e (syntax special)) which-specials)
(filter-out-specials (cdr rules) which-specials))
(_ (cons (car rules) (filter-out-specials (cdr rules) which-specials)))))))
)

@ -0,0 +1,314 @@
(module deriv mzscheme
(require (lib "list.ss")
"re.ss"
"util.ss")
(provide build-dfa print-dfa (struct dfa (num-states start-state final-states/actions transitions)))
(define e (build-epsilon))
(define z (build-zero))
;; get-char-groups : re -> (list-of char-setR?)
;; Collects the char-setRs in r that could be used in
;; taking the derivative of r.
(define (get-char-groups r)
(cond
((or (eq? r e) (eq? r z)) null)
((char-setR? r) (list r))
((concatR? r)
(if (re-nullable? (concatR-re1 r))
(append (get-char-groups (concatR-re1 r))
(get-char-groups (concatR-re2 r)))
(get-char-groups (concatR-re1 r))))
((repeatR? r)
(get-char-groups (repeatR-re r)))
((orR? r)
(apply append (map get-char-groups (orR-res r))))))
(test-block ((c (make-cache))
(r1 (->re #\1 c))
(r2 (->re #\2 c)))
((get-char-groups e) null)
((get-char-groups z) null)
((get-char-groups r1) (list r1))
((get-char-groups (->re `(@ ,r1 ,r2) c))
(list r1))
((get-char-groups (->re `(@ ,e ,r2) c))
(list r2))
((get-char-groups (->re `(@ (* ,r1) ,r2) c))
(list r1 r2))
((get-char-groups (->re `(* ,r1) c))
(list r1))
((get-char-groups (->re `(: (* ,r1) (@ (* ,r2) "3") "4") c))
(list r1 r2 (->re "3" c) (->re "4" c)))
)
;; A char-set is a (list-of char) that is sorted and duplicate-free
;; partition : (list-of char-set) -> (list-of char-set)
;; The coarsest refinment r of sets such that the char-sets in r
;; are pairwise disjoint.
(define (partition sets)
(cond
((null? sets) null)
(else
(partition1 (car sets) (partition (cdr sets))))))
;; partition1 : char-set (list-of char-set) -> (list-of char-set)
;; All the char-sets in sets must be pairwise disjoint. Splits set
;; against each element in sets.
(define (partition1 set sets)
(cond
((null? set) sets)
((null? sets) (list set))
(else
(let ((set2 (car sets)))
(let-values (((i s1 s2) (split set set2)))
(let ((rest (partition1 s1 (cdr sets))))
(cond
((null? i)
(cons s2 rest))
((null? s2)
(cons i rest))
(else
(cons i (cons s2 rest))))))))))
(test-block ((sl string->list))
((partition null) null)
((partition (list (sl "1234"))) (list (sl "1234")))
((partition (list (sl "1234") (sl "0235")))
(list (sl "23") (sl "05") (sl "14")))
((partition (list (sl "12349") (sl "02359") (sl "67") (sl "29")))
(list (sl "29") (sl "67") (sl "3") (sl "05") (sl "14")))
)
(test-block ((sl string->list))
((partition1 (sl "bcdjw") null) (list (sl "bcdjw")))
((partition1 null null) null)
((partition1 null (list (sl "a") (sl "b") (sl "1")))
(list (sl "a") (sl "b") (sl "1")))
((partition1 (sl "bcdjw")
(list (sl "z")
(sl "ab")
(sl "dj")))
(list (sl "z") (sl "b") (sl "a") (sl "dj") (sl "cw"))))
;; deriveR : re * char cache -> re
(define (deriveR r c cache)
(cond
((or (eq? r e) (eq? r z)) z)
((char-setR? r)
(if (memq c (char-setR-chars r)) e z))
((concatR? r)
(let* ((r1 (concatR-re1 r))
(r2 (concatR-re2 r))
(d (build-concat (deriveR r1 c cache) r2 cache)))
(if (re-nullable? r1)
(build-or (list d (deriveR r2 c cache)) cache)
d)))
((repeatR? r)
(build-concat (deriveR (repeatR-re r) c cache) r cache))
((orR? r)
(build-or (map (lambda (x) (deriveR x c cache))
(orR-res r))
cache))
((andR? r)
(build-and (map (lambda (x) (deriveR x c cache))
(andR-res r))
cache))
((negR? r)
(build-neg (deriveR (negR-re r) c cache) cache))))
(test-block ((c (make-cache))
(r1 (->re #\a c))
(r2 (->re `(* #\a) c))
(r3 (->re `(* ,r2) c))
(r4 (->re `(@ #\a ,r2) c))
(r5 (->re `(* ,r4) c))
(r6 (->re `(: ,r5 #\a) c))
(r7 (->re `(@ ,r2 ,r2) c)))
((deriveR e #\a c) z)
((deriveR z #\a c) z)
((deriveR r1 #\b c) z)
((deriveR r1 #\a c) e)
((deriveR r2 #\a c) r2)
((deriveR r2 #\b c) z)
((deriveR r3 #\a c) (->re `(@ ,r2 ,r3) c))
((deriveR r3 #\b c) z)
((deriveR r4 #\a c) r2)
((deriveR r4 #\b c) z)
((deriveR r5 #\a c) (->re `(@ ,r2 ,r5) c))
((deriveR r5 #\b c) z)
((deriveR r6 #\a c) (->re `(: (@ ,r2 ,r5) (epsilon)) c))
((deriveR r6 #\b c) z)
((deriveR r7 #\a c) (->re `(: (@ ,r2 ,r2) ,r2) c))
((deriveR r7 #\b c) z))
;; An re-action is (cons re action)
;; derive : (list-of re-action) char cache -> (union (list-of re-action) #f)
;; applies deriveR to all the re-actions's re parts.
;; Returns #f if the derived state is equivalent to z.
(define (derive r c cache)
(let ((new-r (map (lambda (ra)
(cons (deriveR (car ra) c cache) (cdr ra)))
r)))
(if (andmap (lambda (x) (eq? z (car x)))
new-r)
#f
new-r)))
(test-block ((c (make-cache))
(r1 (->re #\1 c))
(r2 (->re #\2 c)))
((derive null #\1 c) #f)
((derive (list (cons r1 1) (cons r2 2)) #\1 c)
(list (cons e 1) (cons z 2)))
((derive (list (cons r1 1) (cons r2 2)) #\3 c) #f))
;; get-final : (list-of re-action) -> (union #f syntax-object)
;; An re that accepts e represents a final state. Return the
;; action from the first final state or #f if there is none.
(define (get-final res)
(cond
((null? res) #f)
((re-nullable? (caar res)) (cdar res))
(else (get-final (cdr res)))))
(test-block ((c (make-cache))
(r1 (->re #\a c))
(r2 (->re #\b c))
(a (list (cons r1 1) (cons r2 2))))
((derive null #\a c) #f)
((derive a #\a c) (list (cons e 1) (cons z 2)))
((derive a #\b c) (list (cons z 1) (cons e 2)))
((derive a #\c c) #f)
((get-final a) #f)
((get-final (list (cons e 1) (cons e 2))) 1))
;; A state is (make-state (list-of re-action) nat)
(define-struct state (spec index))
;; get->key : re-action -> (list-of nat)
;; states are indexed by the list of indexes of their res
(define (get-key s)
(map (lambda (x) (re-index (car x))) s))
;; compute-chars : (list-of state) -> (list-of char-set)
;; Computed the sets of equivalent characters for taking the
;; derivative of the car of st. Only one derivative per set need to be taken.
(define (compute-chars st)
(cond
((null? st) null)
(else
(partition (map char-setR-chars
(apply append (map (lambda (x) (get-char-groups (car x)))
(state-spec (car st)))))))))
(test-block ((c (make-cache))
(r1 (->re `(- #\1 #\4) c))
(r2 (->re `(- #\2 #\3) c)))
((compute-chars null) null)
((compute-chars (list (make-state null 1))) null)
((compute-chars (list (make-state (list (cons r1 1) (cons r2 2)) 2)))
(list (list #\2 #\3) (list #\1 #\4))))
;; A dfa is (make-dfa int int
;; (list-of (cons int syntax-object))
;; (list-of (cons int (list-of (cons (list-of char) int)))))
;; Each transitions is a state and a list of chars with the state to transition to.
;; The finals and transitions are sorted by state number, and duplicate free.
(define-struct dfa (num-states start-state final-states/actions transitions))
;; build-dfa : (list-of re-action) cache -> dfa
(define (build-dfa rs cache)
(let* ((transitions (make-hash-table))
(get-state-number (make-counter))
(start (make-state rs (get-state-number))))
(cache (cons 'state (get-key rs)) (lambda () start))
(let loop ((old-states (list start))
(new-states null)
(cs (compute-chars (list start))))
(cond
((and (null? old-states) (null? new-states))
(make-dfa (get-state-number) (state-index start)
(mergesort (filter (lambda (x) (cdr x))
(hash-table-map transitions
(lambda (state _)
(cons (state-index state) (get-final (state-spec state))))))
(lambda (a b) (< (car a) (car b))))
(mergesort (hash-table-map transitions
(lambda (state trans)
(cons (state-index state)
(map (lambda (t)
(cons (car t)
(state-index (cdr t))))
trans))))
(lambda (a b) (< (car a) (car b))))))
((null? old-states)
(loop new-states null (compute-chars new-states)))
((null? cs)
(loop (cdr old-states) new-states (compute-chars (cdr old-states))))
(else
(let* ((state (car old-states))
(c (car cs))
(new-re (derive (state-spec state) (car c) cache)))
(cond
(new-re
(let* ((new-state? #f)
(new-state (cache (cons 'state (get-key new-re))
(lambda ()
(set! new-state? #t)
(make-state new-re (get-state-number))))))
(hash-table-put! transitions
state
(cons (cons c new-state)
(hash-table-get transitions state
(lambda () null))))
(cond
(new-state?
(loop old-states (cons new-state new-states) (cdr cs)))
(else
(loop old-states new-states (cdr cs))))))
(else (loop old-states new-states (cdr cs))))))))))
(define (print-dfa x)
(printf "number of states: ~a~n" (dfa-num-states x))
(printf "start state: ~a~n" (dfa-start-state x))
(printf "final states: ~a~n" (map car (dfa-final-states/actions x)))
(for-each (lambda (trans)
(printf "state: ~a~n" (car trans))
(for-each (lambda (rule)
(printf " -~a-> ~a~n"
(car rule)
(cdr rule)))
(cdr trans)))
(dfa-transitions x)))
(define (build-test-dfa rs)
(let ((c (make-cache)))
(build-dfa (map (lambda (x) (cons (->re x c) 'action))
rs)
c)))
#|
(define t1 (build-test-dfa null))
(define t2 (build-test-dfa `(#\a)))
(define t3 (build-test-dfa `(#\a #\b)))
(define t4 (build-test-dfa `((* #\a)
(* (@ #\a #\b)))))
(define t5 (build-test-dfa `((@ (* (: #\0 #\1)) #\1))))
(define t6 (build-test-dfa `((* (* #\a))
(* (@ #\b (* #\b))))))
(define t7 (build-test-dfa `((@ (* #\a) (* #\b) (* #\c) (* #\d) (* #\e)))))
(define t8
(build-test-dfa `((@ (* (: #\a #\b)) #\a (: #\a #\b) (: #\a #\b) (: #\a #\b) (: #\a #\b)))))
|#
)

@ -0,0 +1,101 @@
(module front mzscheme
(require "util.ss"
"stx.ss"
"re.ss"
"deriv.ss")
(provide build-lexer)
(define-syntax time-label
(syntax-rules ()
((_ l e ...)
(begin
(printf "~a: " l)
(time (begin e ...))))))
;; dfa->table : dfa -> (same as build-lexer)
(define (dfa->table dfa)
(let (
;; no-look : (vector-of bool)
;; For each state whether the lexer can ignore the next input.
;; It can do this only if there are no transitions out of the
;; current state.
(no-look (make-vector (dfa-num-states dfa) #t))
;; actions : (vector-of (union #f syntax-object))
;; The action for each final state, #f if the state isn't final
(actions (make-vector (dfa-num-states dfa) #f))
;; char-table : (vector-of (union #f nat))
;; The lexer table, one entry per state per char.
;; Each entry specifies a state to transition to.
;; #f indicates no transition
(char-table (make-vector (* 256 (dfa-num-states dfa)) #f)))
;; Fill the char-table vector
(for-each
(lambda (trans)
(let ((from-state (car trans)))
(for-each (lambda (chars/to)
(let ((to-state (cdr chars/to)))
(for-each (lambda (char)
(vector-set! char-table
(bitwise-ior
(char->integer char)
(arithmetic-shift from-state 8))
to-state))
(car chars/to))))
(cdr trans))))
(dfa-transitions dfa))
(for-each (lambda (trans)
(vector-set! no-look (car trans) #f))
(dfa-transitions dfa))
(for-each (lambda (state/action)
(vector-set! actions (car state/action) (cdr state/action)))
(dfa-final-states/actions dfa))
(values char-table (dfa-start-state dfa) actions no-look)))
(test-block ()
((call-with-values (lambda ()
(dfa->table (make-dfa 1 1 (list) (list))))
list)
(list (make-vector 256 #f) 1 (vector #f) (make-vector 1 #t)))
((call-with-values (lambda ()
(dfa->table (make-dfa 4 1 (list (cons 2 2) (cons 3 3))
(list (cons 1 (list (cons (list #\1 #\2) 1)
(cons (list #\3) 2)))
(cons 2 (list (cons (list #\1) 3)))))))
list)
(list (let ((v (make-vector 1024 #f)))
(vector-set! v 305 1)
(vector-set! v 306 1)
(vector-set! v 307 2)
(vector-set! v 561 3)
v)
1
(vector #f #f 2 3)
(vector #t #f #f #t))))
;; build-lexer : syntax-object list -> (values (vector-of (union #f nat)) nat (vector-of (union #f syntax-object)) (vector-of bool))
;; each syntax object has the form (re action)
(define (build-lexer sos)
(let* ((s-re-acts (map (lambda (so)
(cons (parse (car (syntax->list so)))
(cadr (syntax->list so))))
sos))
(cache (make-cache))
(re-acts (map (lambda (s-re-act)
(cons (->re (car s-re-act) cache)
(cdr s-re-act)))
s-re-acts))
(dfa (build-dfa re-acts cache)))
;(print-dfa dfa)
(dfa->table dfa)))
)

@ -0,0 +1,301 @@
(module re mzscheme
(require (lib "match.ss")
(lib "list.ss")
"util.ss")
(provide ->re build-epsilon build-zero build-char-set build-concat
build-repeat build-or build-and build-neg
epsilonR? zeroR? char-setR? concatR? repeatR? orR? andR? negR?
char-setR-chars concatR-re1 concatR-re2 repeatR-re orR-res
andR-res negR-re
re-nullable? re-index)
;; get-index : -> nat
(define get-index (make-counter))
;; An re is either
;; - (make-epsilonR bool nat)
;; - (make-zeroR bool nat)
;; - (make-char-setR bool nat (list-of char)) The list must be sorted
;; - (make-concatR bool nat re re)
;; - (make-repeatR bool nat re)
;; - (make-orR bool nat (list-of re)) Must not directly contain any orRs
;; - (make-andR bool nat (list-of re)) Must not directly contain any andRs
;; - (make-negR bool nat re)
;;
;; Every re must have an index field globally different from all
;; other re index fields.
(define-struct re (nullable? index))
(define-struct (epsilonR re) ())
(define-struct (zeroR re) ())
(define-struct (char-setR re) (chars) (make-inspector))
(define-struct (concatR re) (re1 re2) (make-inspector))
(define-struct (repeatR re) (re))
(define-struct (orR re) (res) (make-inspector))
(define-struct (andR re) (res))
(define-struct (negR re) (re))
;; e : re
;; The unique epsilon re
(define e (make-epsilonR #t (get-index)))
;; z : re
;; The unique zero re
(define z (make-zeroR #f (get-index)))
;; s-re = char match the given character
;; | string match its sequence of characters
;; | re a precompiled re
;; | (epsilon) match the empty string
;; | (* s-re) match 0 or more
;; | (+ s-re) match 1 or more
;; | (? s-re) match 0 or 1
;; | (: s-re ...) match one of the sub-expressions
;; | (@ s-re ...) match each sub-expression in succession
;; | (- char char) match any character between two (inclusive)
;; | (^ char_or_range ...1) match any character not listed
;; (the null concatenation `(@) means epsilon as does "".
;; the null or `(:) means match nothing. The null carat `(^) means match
;; any character.)
;; ->re : s-re cache -> re
(define (->re exp cache)
(match exp
((? char?) (build-char-set (list exp) cache))
((? string?) (->re `(@ ,@(string->list exp)) cache))
((? re?) exp)
(`(epsilon) (build-epsilon))
(`(* ,r)
(build-repeat (->re r cache) cache))
(`(+ ,r)
(->re `(@ ,r (* ,r)) cache))
(`(? ,r)
(let ((c (->re r cache)))
(if (re-nullable? c)
c
(build-or (list e c) cache))))
(`(: ,r)
(->re r cache))
(`(: ,rs ...)
(build-or (flatten-res (map (lambda (r) (->re r cache)) rs) cache)
cache))
(`(@ ,rs ...)
(foldr (lambda (x y)
(build-concat (->re x cache) y cache))
e
rs))
(`(- ,c1 ,c2)
(let ((i1 (char->integer (if (string? c1) (string-ref c1 0) c1)))
(i2 (char->integer (if (string? c2) (string-ref c2 0) c2))))
(if (<= i1 i2)
(build-char-set (make-range i1 i2) cache)
z)))
(`(^ ,crs ...)
(let ((cs (->re `(: ,@crs) cache)))
(cond
((zeroR? cs) (build-char-set (make-range 0 255) cache))
((char-setR? cs)
(build-char-set
(let loop ((bad-chars (map char->integer
(char-setR-chars cs)))
(i 0))
(cond
((> i 255) null)
((and (not (null? bad-chars))
(= i (car bad-chars)))
(loop (cdr bad-chars) (add1 i)))
(else
(cons (integer->char i) (loop bad-chars (add1 i))))))
cache))
(else z))))))
;; flatten-res: (list-of re) cache -> (list-of re)
;; Takes all the char-sets in l and combines them into one element.
;; Removes orRs too,
(define (flatten-res l cache)
(let loop ((res l)
(chars null)
(no-chars null))
(cond
((null? res)
(if (null? chars)
no-chars
(cons (build-char-set (mergesort chars char<?) cache) no-chars)))
((char-setR? (car res))
(loop (cdr res) (merge (char-setR-chars (car res)) chars) no-chars))
((orR? (car res))
(loop (append (orR-res (car res)) (cdr res)) chars no-chars))
(else (loop (cdr res) chars (cons (car res) no-chars))))))
;; build-epsilon : -> re
(define (build-epsilon) e)
(define (build-zero) z)
;; build-char-set : (list-of char) cache -> re
;; cs must be sorted
#;(define (build-char-set cs cache)
(cond
((null? cs) z)
(else
(make-char-setR #f (get-index) cs))))
(define (build-char-set cs cache)
(cond
((null? cs) z)
(else
(cache cs
(lambda ()
(make-char-setR #f (get-index) cs))))))
;; build-concat : re re cache -> re
(define (build-concat r1 r2 cache)
(cond
((eq? e r1) r2)
((eq? e r2) r1)
((or (eq? z r1) (eq? z r2)) z)
(else
(let* ((i1 (re-index r1))
(i2 (re-index r2))
(key (if (< i1 i2)
(cons i1 i2)
(cons i2 i1))))
(cache (cons 'concat key)
(lambda ()
(make-concatR (and (re-nullable? r1) (re-nullable? r2))
(get-index)
r1 r2)))))))
;; build-repeat : re cache -> re
(define (build-repeat r cache)
(cache (cons 'repeat (re-index r))
(lambda ()
(make-repeatR #t (get-index) r))))
;; build-or : (list-of re) cache -> re
(define (build-or rs cache)
(let ((rs
(filter
(lambda (x) (not (eq? x z)))
(do-simple-equiv (replace rs orR? orR-res null) re-index))))
(cond
((null? rs) z)
((null? (cdr rs)) (car rs))
(else
(cache (cons 'or (map re-index rs))
(lambda ()
(make-orR (ormap re-nullable? rs) (get-index) rs)))))))
;; build-and : (list-of re) cache -> re
(define (build-and rs cache)
(let ((rs (do-simple-equiv (replace rs andR? andR-res null) rs)))
(cond
((ormap (lambda (x) (eq? x z)) rs) z)
(else
(cache (cons 'and (map re-index rs))
(lambda ()
(make-andR (andmap re-nullable? rs) (get-index) rs)))))))
;; build-neg : re cache -> re
(define (build-neg r cache)
(cache (cons 'neg (re-index r))
(lambda ()
(make-negR (not (re-nullable? r)) (get-index) r))))
(test-block ((c (make-cache))
(r1 (->re #\1 c))
(r2 (->re #\2 c))
(rc (->re `(@ ,r1 ,r2) c))
(rc2 (->re `(@ ,r2 ,r1) c))
(rr (->re `(* ,rc) c))
(ro (->re `(: ,rr ,rc ,rr) c))
(ro2 (->re `(: ,rc ,rr ,z) c))
(ro3 (->re `(: ,rr ,rc) c)))
(rc rc2)
(ro ro2)
(ro ro3)
((->re `(* ,rc) c) rr)
((build-char-set null c) z)
((->re `(@ ,r1 (epsilon)) c) r1)
((->re `(@ (epsilon) ,r1) c) r1)
((->re `(@ ,r1 ,z) c) z)
((->re `(@ ,z ,r1) c) z)
((->re `(@ ,z (epsilon)) c) z)
((->re `(@ (epsilon) ,z) c) z)
((->re `(:) c) z)
((->re `(: ,rr) c) rr)
((build-or `(,z ,r1 ,z) c) r1)
((build-or (list
(build-or (list r1 r2) c)
(build-or (list rc rr) c))
c)
(build-or (list r1 r2 rc rr) c))
((concatR-re1 rc2) r1)
((concatR-re2 rc2) r2)
((orR-res ro) (list rc rr))
((repeatR-re rr) rc)
((re-nullable? r1) #f)
((re-nullable? rc) #f)
((re-nullable? (->re `(@ ,rr ,rr) c)) #t)
((re-nullable? rr) #t)
((re-nullable? ro) #t)
((re-nullable? (->re `(: ,r1 ,r2) c)) #f))
(test-block ((c (make-cache))
(r1 (->re #\1 c))
(r2 (->re #\2 c))
(r3-5 (->re '(- #\3 #\5) c))
(r4 (build-or `(,r1 ,r2) c))
(r5 (->re `(: ,r3-5 #\7) c))
(r6 (->re #\6 c)))
((flatten-res null c) null)
((char-setR-chars (car (flatten-res `(,r1) c))) '(#\1))
((char-setR-chars (car (flatten-res `(,r4) c))) '(#\1 #\2))
((char-setR-chars (car (flatten-res `(,r6 ,r5 ,r4 ,r3-5 ,r2 ,r1) c)))
(string->list "1234567")))
(test-block ((c (make-cache))
(r (->re #\a c))
(rr (->re `(@ ,r ,r) c))
(rrr (->re `(@ ,r ,rr) c))
(rrr* (->re `(* ,rrr) c)))
((char-setR-chars r) '(#\a))
((->re "" c) e)
((->re "asdf" c) (->re `(@ #\a #\s #\d #\f) c))
((->re r c) r)
((->re `(epsilon) c) e)
((->re `(* ,r) c) (build-repeat r c))
((->re `(+ ,r) c) (build-concat r (build-repeat r c) c))
((->re `(? ,r) c) (build-or (list e r) c))
((->re `(? ,rrr*) c) rrr*)
((->re `(: (: (- #\a #\c) (^ (- #\000 #\110) (- #\112 #\377)))
(: (* #\2))) c)
(build-or (list (build-char-set (list #\111 #\a #\b #\c) c)
(build-repeat (build-char-set '(#\2) c) c))
c))
((->re `(: ,rr ,rrr) c) (build-or (list rr rrr) c))
((->re `(: ,r) c) r)
((->re `(:) c) z)
((->re `(@) c) e)
((->re `(@ ,rrr*) c) rrr*)
(rr (build-concat r r c))
((->re `(@ ,r ,rr ,rrr) c)
(build-concat r (build-concat rr rrr c) c))
((char-setR-chars (->re `(- #\1 #\1) c)) '(#\1))
((char-setR-chars (->re `(- #\1 #\9) c)) (string->list "123456789"))
((char-setR-chars (->re `(- "1" "1") c)) '(#\1))
((char-setR-chars (->re `(- "1" "9") c)) (string->list "123456789"))
((->re `(- "9" "1") c) z)
((char-setR-chars (->re `(^) c))
(char-setR-chars (->re `(- #\000 #\377) c)))
((char-setR-chars (->re `(^ #\001 (- #\002 #\377)) c)) `(#\000))
)
)

@ -0,0 +1,104 @@
(module stx mzscheme
(require (lib "stx.ss" "syntax")
"util.ss")
(provide parse)
(define (num-arg-err s expect given)
(raise-syntax-error
'regular-expression
(format "operator expects ~a arguments, given ~a" expect given)
s))
;; parse : syntax-object -> s-re (see re.ss)
;; checks for errors and generates the plain s-exp form for s
(define (parse s)
(let ((s-e (syntax-e s)))
(cond
((char? s-e) s-e)
((string? s-e) s-e)
((symbol? s-e)
(let ((expand (syntax-local-value s (lambda () #f))))
(unless (lex-abbrev? expand)
(raise-syntax-error 'regular-expression "undefined abbreviation" s))
(parse (lex-abbrev-abbrev expand))))
((stx-null? s)
(raise-syntax-error 'regular-expression "invalid regular expression" s))
((stx-list? s)
(let* ((ar (stx->list (stx-cdr s)))
(num-args (length ar)))
(case (syntax-e (stx-car s))
((epsilon) '(epsilon))
((*)
(unless (= num-args 1)
(num-arg-err s 1 num-args))
`(* ,(parse (car ar))))
((+)
(unless (= num-args 1)
(num-arg-err s 1 num-args))
`(+ ,(parse (car ar))))
((?)
(unless (= num-args 1)
(num-arg-err s 1 num-args))
`(? ,(parse (car ar))))
((:) `(: ,@(map parse ar)))
((@) `(@ ,@(map parse ar)))
((-)
(unless (= num-args 2)
(num-arg-err s 2 num-args))
(let ((c1 (parse (car ar)))
(c2 (parse (cadr ar))))
(if (and (or (char? c1) (and (string? c1) (= 1 (string-length c1))))
(or (char? c2) (and (string? c2) (= 1 (string-length c2)))))
(let ((i1 (char->integer (if (char? c1) c1 (string-ref c1 0))))
(i2 (char->integer (if (char? c2) c2 (string-ref c2 0)))))
(if (<= i1 i2)
`(- ,c1 ,c2)
(raise-syntax-error
'regular-expression
(format "first argument ~a does not preceed second argument ~a"
c1 c2)
s)))
(raise-syntax-error
'regular-expression
(format "expects single character arguments, given ~a and ~a"
(syntax-object->datum (car ar))
(syntax-object->datum (cadr ar)))
s))))
((^)
(let ((res (map parse ar)))
(if (not (andmap pure-char? res))
(raise-syntax-error
'regular-expression
(format
"expects single character or character range arguments, given ~a"
(map syntax-object->datum ar))
s))
`(^ ,@res)))
(else
(raise-syntax-error
'regular-expression
"invalid operator"
s)))))
(else
(raise-syntax-error
'regular-expression
"invalid regular expression"
s)))))
(define (pure-char? s-re)
(cond
((char? s-re) #t)
((string? s-re) (= (string-length s-re) 1))
((list? s-re)
(let ((op (car s-re)))
(case op
((: ^) (andmap pure-char? (cdr s-re)))
((-) #t)
(else #f))))
(else #f)))
)

@ -0,0 +1,180 @@
(module util mzscheme
(require (lib "list.ss"))
(provide (all-defined-except split-acc))
(define-struct lex-abbrev (abbrev))
(define-syntax test-block
(syntax-rules ()
((_ defs (code right-ans) ...)
(let* defs
(let ((real-ans code))
(unless (equal? real-ans right-ans)
(printf "Test failed: ~e gave ~e. Expected ~e~n"
'code real-ans 'right-ans))) ...))))
#;(define-syntax test-block
(syntax-rules ()
((_ x ...) (void))))
;; A cache is (X ( -> Y) -> Y)
;; make-cache : -> cache
;; table map Xs to Ys. If key is mapped, its value is returned.
;; Otherwise, build is invoked and its result is placed in the table and
;; returned.
;; Xs are compared with equal?
(define (make-cache)
(let ((table (make-hash-table 'equal)))
(lambda (key build)
(hash-table-get table key
(lambda ()
(let ((new (build)))
(hash-table-put! table key new)
new))))))
(test-block ((cache (make-cache)))
((cache '(1 2) (lambda () 9)) 9)
((cache '(2 1) (lambda () 8)) 8)
((cache '(1 2) (lambda () 1)) 9))
;; make-counter : -> -> nat
;; makes a function that returns a higher number by 1, each time
;; it is called.
(define (make-counter)
(let ((counter 0))
(lambda ()
(begin0
counter
(set! counter (add1 counter))))))
(test-block ((c (make-counter))
(d (make-counter)))
((c) 0)
((d) 0)
((c) 1)
((d) 1)
((c) 2))
;; remove-dups : (list-of X) (X -> number) -> (list-of X)
;; removes the entries from l that have the same index as a
;; previous entry. l must be grouped by indexes.
(define (remove-dups l index acc)
(cond
((null? l) (reverse acc))
((null? acc) (remove-dups (cdr l) index (cons (car l) acc)))
((= (index (car acc)) (index (car l)))
(remove-dups (cdr l) index acc))
(else
(remove-dups (cdr l) index (cons (car l) acc)))))
(test-block ()
((remove-dups '((1 2) (2 2) (1 3) (1 4) (100 4) (0 5)) cadr null)
'((1 2) (1 3) (1 4) (0 5)))
((remove-dups null error null) null))
;; do-simple-equiv : (list-of X) (X -> nat) -> (list-of X)
;; Sorts l according to index and removes the entries with duplicate
;; indexes.
(define (do-simple-equiv l index)
(let ((ordered (mergesort l (lambda (a b) (< (index a) (index b))))))
(remove-dups ordered index null)))
(test-block ()
((do-simple-equiv '((2 2) (1 4) (1 2) (100 4) (1 3) (0 5)) cadr)
'((2 2) (1 3) (1 4) (0 5)))
((do-simple-equiv null error) null))
;; replace : (list-of X) (X -> bool) (X -> (list-of X)) (list-of X) ->
;; (list-of X)
;; If (pred? r) for some r in l, splice (get r) in place of r in the resulting
;; list.
(define (replace l pred? get acc)
(cond
((null? l) acc)
((pred? (car l)) (replace (cdr l) pred? get (append (get (car l)) acc)))
(else (replace (cdr l) pred? get (cons (car l) acc)))))
(test-block ()
((replace null void (lambda () (list 1)) null) null)
((replace '(1 2 3 4 3 5)
(lambda (x) (= x 3))
(lambda (x) (list 1 2 3))
null)
'(5 1 2 3 4 1 2 3 2 1)))
;; make-range : int * int -> char list
;; creates a list of all chars between i and j. i <= j
(define (make-range i j)
(letrec ((make-range
(lambda (i j)
(cond
((= i j) (list (integer->char i)))
(else
(cons (integer->char i) (make-range (add1 i) j)))))))
(make-range i j)))
(test-block ()
((make-range 97 110) (string->list "abcdefghijklmn"))
((make-range 111 111) '(#\o)))
;; merge : (list-of char) (list-of char) -> (list-of char)
;; Combines 2 sorted, duplicate-free lists into 1, removing duplicates.
(define (merge l1 l2)
(cond
((null? l2) l1)
((null? l1) l2)
(else (let ((cl1 (car l1))
(cl2 (car l2)))
(cond
((> (char->integer cl1) (char->integer cl2))
(cons cl2 (merge l1 (cdr l2))))
((< (char->integer cl1) (char->integer cl2))
(cons cl1 (merge (cdr l1) l2)))
(else (merge (cdr l1) l2)))))))
(test-block ()
((merge (string->list "abcd")
(string->list "abde"))
(string->list "abcde"))
((merge null null) null)
((merge null '(#\1)) '(#\1))
((merge '(#\1) null) '(#\1)))
(define (split-acc l1 l2 i l1-i l2-i)
(cond
((null? l1) (values (reverse! i) (reverse! l1-i) (reverse! (append! (reverse l2) l2-i))))
((null? l2) (values (reverse! i) (reverse! (append! (reverse l1) l1-i)) (reverse! l2-i)))
(else (let ((cl1 (car l1))
(cl2 (car l2)))
(cond
((> (char->integer cl1) (char->integer cl2))
(split-acc l1 (cdr l2) i l1-i (cons cl2 l2-i)))
((< (char->integer cl1) (char->integer cl2))
(split-acc (cdr l1) l2 i (cons cl1 l1-i) l2-i))
(else
(split-acc (cdr l1) (cdr l2) (cons cl1 i) l1-i l2-i)))))))
;; split : (list-of char) (list-of char) -> (list-of char) (list-of char) (list-of char)
;; Takes sorted, duplicate-free l1 and l2 and returns (l1 intersect l2),
;; l1 - (l1 intersect l2) and l2 - (l1 intersect l2)
(define (split l1 l2)
(split-acc l1 l2 null null null))
(test-block ()
((let-values (((a b c)
(split (string->list "abcdghjkl")
(string->list "abdeijmn"))))
(list a b c))
(list (string->list "abdj") (string->list "cghkl") (string->list "eimn")))
((let-values (((a b c) (split null null)))
(list a b c)) (list null null null))
((let-values (((a b c) (split '(#\1) null)))
(list a b c)) (list null '(#\1) null))
((let-values (((a b c) (split null '(#\1))))
(list a b c)) (list null null '(#\1))))
)
Loading…
Cancel
Save