A new lexer generator backend based on Brzozowski derivatives.
original commit: fb626b980fd3193e34cd3f6fabb1e95bd1aeb08atokens
parent
486e905094
commit
7b918d0164
@ -0,0 +1,42 @@
|
||||
(module actions mzscheme
|
||||
|
||||
(provide (all-defined))
|
||||
|
||||
;; wrap-action: (syntax-object or #f) symbol syntax-object syntax-object -> syntax-object
|
||||
(define (wrap-action action result-name ctxt loc)
|
||||
(if action
|
||||
(let ((parms (datum->syntax-object
|
||||
action
|
||||
`(start-pos end-pos ,result-name return-without-pos input-port))))
|
||||
(datum->syntax-object ctxt
|
||||
`(lambda ,parms ,action)
|
||||
loc))
|
||||
(datum->syntax-object ctxt 'void loc)))
|
||||
|
||||
|
||||
;; get-special-action: (syntax-object list) symbol 'a -> syntax-object or 'a
|
||||
;; Returns the first action from a rule of the form ((which-special) action)
|
||||
(define (get-special-action rules which-special none)
|
||||
(cond
|
||||
((null? rules) none)
|
||||
(else
|
||||
(syntax-case (car rules) ()
|
||||
(((special) act)
|
||||
(eq? (syntax-e (syntax special)) which-special)
|
||||
(syntax act))
|
||||
(_ (get-special-action (cdr rules) which-special none))))))
|
||||
|
||||
;; filter-out-specials: (syntax-object list) (symbol list) -> (syntax-object list)
|
||||
;; Returns a list missing all the rules of the form ((special) action)
|
||||
;; where special is a symbol in which specials.
|
||||
(define (filter-out-specials rules which-specials)
|
||||
(cond
|
||||
((null? rules) null)
|
||||
(else
|
||||
(syntax-case (car rules) ()
|
||||
(((special) act)
|
||||
(memq (syntax-e (syntax special)) which-specials)
|
||||
(filter-out-specials (cdr rules) which-specials))
|
||||
(_ (cons (car rules) (filter-out-specials (cdr rules) which-specials)))))))
|
||||
|
||||
)
|
@ -0,0 +1,314 @@
|
||||
(module deriv mzscheme
|
||||
|
||||
(require (lib "list.ss")
|
||||
"re.ss"
|
||||
"util.ss")
|
||||
|
||||
(provide build-dfa print-dfa (struct dfa (num-states start-state final-states/actions transitions)))
|
||||
|
||||
(define e (build-epsilon))
|
||||
(define z (build-zero))
|
||||
|
||||
;; get-char-groups : re -> (list-of char-setR?)
|
||||
;; Collects the char-setRs in r that could be used in
|
||||
;; taking the derivative of r.
|
||||
(define (get-char-groups r)
|
||||
(cond
|
||||
((or (eq? r e) (eq? r z)) null)
|
||||
((char-setR? r) (list r))
|
||||
((concatR? r)
|
||||
(if (re-nullable? (concatR-re1 r))
|
||||
(append (get-char-groups (concatR-re1 r))
|
||||
(get-char-groups (concatR-re2 r)))
|
||||
(get-char-groups (concatR-re1 r))))
|
||||
((repeatR? r)
|
||||
(get-char-groups (repeatR-re r)))
|
||||
((orR? r)
|
||||
(apply append (map get-char-groups (orR-res r))))))
|
||||
|
||||
(test-block ((c (make-cache))
|
||||
(r1 (->re #\1 c))
|
||||
(r2 (->re #\2 c)))
|
||||
((get-char-groups e) null)
|
||||
((get-char-groups z) null)
|
||||
((get-char-groups r1) (list r1))
|
||||
((get-char-groups (->re `(@ ,r1 ,r2) c))
|
||||
(list r1))
|
||||
((get-char-groups (->re `(@ ,e ,r2) c))
|
||||
(list r2))
|
||||
((get-char-groups (->re `(@ (* ,r1) ,r2) c))
|
||||
(list r1 r2))
|
||||
((get-char-groups (->re `(* ,r1) c))
|
||||
(list r1))
|
||||
((get-char-groups (->re `(: (* ,r1) (@ (* ,r2) "3") "4") c))
|
||||
(list r1 r2 (->re "3" c) (->re "4" c)))
|
||||
)
|
||||
|
||||
|
||||
;; A char-set is a (list-of char) that is sorted and duplicate-free
|
||||
|
||||
;; partition : (list-of char-set) -> (list-of char-set)
|
||||
;; The coarsest refinment r of sets such that the char-sets in r
|
||||
;; are pairwise disjoint.
|
||||
(define (partition sets)
|
||||
(cond
|
||||
((null? sets) null)
|
||||
(else
|
||||
(partition1 (car sets) (partition (cdr sets))))))
|
||||
|
||||
;; partition1 : char-set (list-of char-set) -> (list-of char-set)
|
||||
;; All the char-sets in sets must be pairwise disjoint. Splits set
|
||||
;; against each element in sets.
|
||||
(define (partition1 set sets)
|
||||
(cond
|
||||
((null? set) sets)
|
||||
((null? sets) (list set))
|
||||
(else
|
||||
(let ((set2 (car sets)))
|
||||
(let-values (((i s1 s2) (split set set2)))
|
||||
(let ((rest (partition1 s1 (cdr sets))))
|
||||
(cond
|
||||
((null? i)
|
||||
(cons s2 rest))
|
||||
((null? s2)
|
||||
(cons i rest))
|
||||
(else
|
||||
(cons i (cons s2 rest))))))))))
|
||||
|
||||
(test-block ((sl string->list))
|
||||
((partition null) null)
|
||||
((partition (list (sl "1234"))) (list (sl "1234")))
|
||||
((partition (list (sl "1234") (sl "0235")))
|
||||
(list (sl "23") (sl "05") (sl "14")))
|
||||
((partition (list (sl "12349") (sl "02359") (sl "67") (sl "29")))
|
||||
(list (sl "29") (sl "67") (sl "3") (sl "05") (sl "14")))
|
||||
)
|
||||
|
||||
(test-block ((sl string->list))
|
||||
((partition1 (sl "bcdjw") null) (list (sl "bcdjw")))
|
||||
((partition1 null null) null)
|
||||
((partition1 null (list (sl "a") (sl "b") (sl "1")))
|
||||
(list (sl "a") (sl "b") (sl "1")))
|
||||
((partition1 (sl "bcdjw")
|
||||
(list (sl "z")
|
||||
(sl "ab")
|
||||
(sl "dj")))
|
||||
(list (sl "z") (sl "b") (sl "a") (sl "dj") (sl "cw"))))
|
||||
|
||||
|
||||
|
||||
;; deriveR : re * char cache -> re
|
||||
(define (deriveR r c cache)
|
||||
(cond
|
||||
((or (eq? r e) (eq? r z)) z)
|
||||
((char-setR? r)
|
||||
(if (memq c (char-setR-chars r)) e z))
|
||||
((concatR? r)
|
||||
(let* ((r1 (concatR-re1 r))
|
||||
(r2 (concatR-re2 r))
|
||||
(d (build-concat (deriveR r1 c cache) r2 cache)))
|
||||
(if (re-nullable? r1)
|
||||
(build-or (list d (deriveR r2 c cache)) cache)
|
||||
d)))
|
||||
((repeatR? r)
|
||||
(build-concat (deriveR (repeatR-re r) c cache) r cache))
|
||||
((orR? r)
|
||||
(build-or (map (lambda (x) (deriveR x c cache))
|
||||
(orR-res r))
|
||||
cache))
|
||||
((andR? r)
|
||||
(build-and (map (lambda (x) (deriveR x c cache))
|
||||
(andR-res r))
|
||||
cache))
|
||||
((negR? r)
|
||||
(build-neg (deriveR (negR-re r) c cache) cache))))
|
||||
|
||||
(test-block ((c (make-cache))
|
||||
(r1 (->re #\a c))
|
||||
(r2 (->re `(* #\a) c))
|
||||
(r3 (->re `(* ,r2) c))
|
||||
(r4 (->re `(@ #\a ,r2) c))
|
||||
(r5 (->re `(* ,r4) c))
|
||||
(r6 (->re `(: ,r5 #\a) c))
|
||||
(r7 (->re `(@ ,r2 ,r2) c)))
|
||||
((deriveR e #\a c) z)
|
||||
((deriveR z #\a c) z)
|
||||
((deriveR r1 #\b c) z)
|
||||
((deriveR r1 #\a c) e)
|
||||
((deriveR r2 #\a c) r2)
|
||||
((deriveR r2 #\b c) z)
|
||||
((deriveR r3 #\a c) (->re `(@ ,r2 ,r3) c))
|
||||
((deriveR r3 #\b c) z)
|
||||
((deriveR r4 #\a c) r2)
|
||||
((deriveR r4 #\b c) z)
|
||||
((deriveR r5 #\a c) (->re `(@ ,r2 ,r5) c))
|
||||
((deriveR r5 #\b c) z)
|
||||
((deriveR r6 #\a c) (->re `(: (@ ,r2 ,r5) (epsilon)) c))
|
||||
((deriveR r6 #\b c) z)
|
||||
((deriveR r7 #\a c) (->re `(: (@ ,r2 ,r2) ,r2) c))
|
||||
((deriveR r7 #\b c) z))
|
||||
|
||||
|
||||
;; An re-action is (cons re action)
|
||||
|
||||
;; derive : (list-of re-action) char cache -> (union (list-of re-action) #f)
|
||||
;; applies deriveR to all the re-actions's re parts.
|
||||
;; Returns #f if the derived state is equivalent to z.
|
||||
(define (derive r c cache)
|
||||
(let ((new-r (map (lambda (ra)
|
||||
(cons (deriveR (car ra) c cache) (cdr ra)))
|
||||
r)))
|
||||
(if (andmap (lambda (x) (eq? z (car x)))
|
||||
new-r)
|
||||
#f
|
||||
new-r)))
|
||||
|
||||
(test-block ((c (make-cache))
|
||||
(r1 (->re #\1 c))
|
||||
(r2 (->re #\2 c)))
|
||||
((derive null #\1 c) #f)
|
||||
((derive (list (cons r1 1) (cons r2 2)) #\1 c)
|
||||
(list (cons e 1) (cons z 2)))
|
||||
((derive (list (cons r1 1) (cons r2 2)) #\3 c) #f))
|
||||
|
||||
|
||||
;; get-final : (list-of re-action) -> (union #f syntax-object)
|
||||
;; An re that accepts e represents a final state. Return the
|
||||
;; action from the first final state or #f if there is none.
|
||||
(define (get-final res)
|
||||
(cond
|
||||
((null? res) #f)
|
||||
((re-nullable? (caar res)) (cdar res))
|
||||
(else (get-final (cdr res)))))
|
||||
|
||||
(test-block ((c (make-cache))
|
||||
(r1 (->re #\a c))
|
||||
(r2 (->re #\b c))
|
||||
(a (list (cons r1 1) (cons r2 2))))
|
||||
((derive null #\a c) #f)
|
||||
((derive a #\a c) (list (cons e 1) (cons z 2)))
|
||||
((derive a #\b c) (list (cons z 1) (cons e 2)))
|
||||
((derive a #\c c) #f)
|
||||
((get-final a) #f)
|
||||
((get-final (list (cons e 1) (cons e 2))) 1))
|
||||
|
||||
|
||||
;; A state is (make-state (list-of re-action) nat)
|
||||
(define-struct state (spec index))
|
||||
|
||||
;; get->key : re-action -> (list-of nat)
|
||||
;; states are indexed by the list of indexes of their res
|
||||
(define (get-key s)
|
||||
(map (lambda (x) (re-index (car x))) s))
|
||||
|
||||
;; compute-chars : (list-of state) -> (list-of char-set)
|
||||
;; Computed the sets of equivalent characters for taking the
|
||||
;; derivative of the car of st. Only one derivative per set need to be taken.
|
||||
(define (compute-chars st)
|
||||
(cond
|
||||
((null? st) null)
|
||||
(else
|
||||
(partition (map char-setR-chars
|
||||
(apply append (map (lambda (x) (get-char-groups (car x)))
|
||||
(state-spec (car st)))))))))
|
||||
|
||||
(test-block ((c (make-cache))
|
||||
(r1 (->re `(- #\1 #\4) c))
|
||||
(r2 (->re `(- #\2 #\3) c)))
|
||||
((compute-chars null) null)
|
||||
((compute-chars (list (make-state null 1))) null)
|
||||
((compute-chars (list (make-state (list (cons r1 1) (cons r2 2)) 2)))
|
||||
(list (list #\2 #\3) (list #\1 #\4))))
|
||||
|
||||
|
||||
;; A dfa is (make-dfa int int
|
||||
;; (list-of (cons int syntax-object))
|
||||
;; (list-of (cons int (list-of (cons (list-of char) int)))))
|
||||
;; Each transitions is a state and a list of chars with the state to transition to.
|
||||
;; The finals and transitions are sorted by state number, and duplicate free.
|
||||
(define-struct dfa (num-states start-state final-states/actions transitions))
|
||||
|
||||
;; build-dfa : (list-of re-action) cache -> dfa
|
||||
(define (build-dfa rs cache)
|
||||
(let* ((transitions (make-hash-table))
|
||||
(get-state-number (make-counter))
|
||||
(start (make-state rs (get-state-number))))
|
||||
(cache (cons 'state (get-key rs)) (lambda () start))
|
||||
(let loop ((old-states (list start))
|
||||
(new-states null)
|
||||
(cs (compute-chars (list start))))
|
||||
(cond
|
||||
((and (null? old-states) (null? new-states))
|
||||
(make-dfa (get-state-number) (state-index start)
|
||||
(mergesort (filter (lambda (x) (cdr x))
|
||||
(hash-table-map transitions
|
||||
(lambda (state _)
|
||||
(cons (state-index state) (get-final (state-spec state))))))
|
||||
(lambda (a b) (< (car a) (car b))))
|
||||
(mergesort (hash-table-map transitions
|
||||
(lambda (state trans)
|
||||
(cons (state-index state)
|
||||
(map (lambda (t)
|
||||
(cons (car t)
|
||||
(state-index (cdr t))))
|
||||
trans))))
|
||||
(lambda (a b) (< (car a) (car b))))))
|
||||
((null? old-states)
|
||||
(loop new-states null (compute-chars new-states)))
|
||||
((null? cs)
|
||||
(loop (cdr old-states) new-states (compute-chars (cdr old-states))))
|
||||
(else
|
||||
(let* ((state (car old-states))
|
||||
(c (car cs))
|
||||
(new-re (derive (state-spec state) (car c) cache)))
|
||||
(cond
|
||||
(new-re
|
||||
(let* ((new-state? #f)
|
||||
(new-state (cache (cons 'state (get-key new-re))
|
||||
(lambda ()
|
||||
(set! new-state? #t)
|
||||
(make-state new-re (get-state-number))))))
|
||||
(hash-table-put! transitions
|
||||
state
|
||||
(cons (cons c new-state)
|
||||
(hash-table-get transitions state
|
||||
(lambda () null))))
|
||||
(cond
|
||||
(new-state?
|
||||
(loop old-states (cons new-state new-states) (cdr cs)))
|
||||
(else
|
||||
(loop old-states new-states (cdr cs))))))
|
||||
(else (loop old-states new-states (cdr cs))))))))))
|
||||
|
||||
(define (print-dfa x)
|
||||
(printf "number of states: ~a~n" (dfa-num-states x))
|
||||
(printf "start state: ~a~n" (dfa-start-state x))
|
||||
(printf "final states: ~a~n" (map car (dfa-final-states/actions x)))
|
||||
(for-each (lambda (trans)
|
||||
(printf "state: ~a~n" (car trans))
|
||||
(for-each (lambda (rule)
|
||||
(printf " -~a-> ~a~n"
|
||||
(car rule)
|
||||
(cdr rule)))
|
||||
(cdr trans)))
|
||||
(dfa-transitions x)))
|
||||
|
||||
(define (build-test-dfa rs)
|
||||
(let ((c (make-cache)))
|
||||
(build-dfa (map (lambda (x) (cons (->re x c) 'action))
|
||||
rs)
|
||||
c)))
|
||||
#|
|
||||
(define t1 (build-test-dfa null))
|
||||
(define t2 (build-test-dfa `(#\a)))
|
||||
(define t3 (build-test-dfa `(#\a #\b)))
|
||||
(define t4 (build-test-dfa `((* #\a)
|
||||
(* (@ #\a #\b)))))
|
||||
(define t5 (build-test-dfa `((@ (* (: #\0 #\1)) #\1))))
|
||||
(define t6 (build-test-dfa `((* (* #\a))
|
||||
(* (@ #\b (* #\b))))))
|
||||
(define t7 (build-test-dfa `((@ (* #\a) (* #\b) (* #\c) (* #\d) (* #\e)))))
|
||||
(define t8
|
||||
(build-test-dfa `((@ (* (: #\a #\b)) #\a (: #\a #\b) (: #\a #\b) (: #\a #\b) (: #\a #\b)))))
|
||||
|#
|
||||
)
|
@ -0,0 +1,101 @@
|
||||
(module front mzscheme
|
||||
(require "util.ss"
|
||||
"stx.ss"
|
||||
"re.ss"
|
||||
"deriv.ss")
|
||||
|
||||
(provide build-lexer)
|
||||
|
||||
(define-syntax time-label
|
||||
(syntax-rules ()
|
||||
((_ l e ...)
|
||||
(begin
|
||||
(printf "~a: " l)
|
||||
(time (begin e ...))))))
|
||||
|
||||
|
||||
;; dfa->table : dfa -> (same as build-lexer)
|
||||
(define (dfa->table dfa)
|
||||
(let (
|
||||
;; no-look : (vector-of bool)
|
||||
;; For each state whether the lexer can ignore the next input.
|
||||
;; It can do this only if there are no transitions out of the
|
||||
;; current state.
|
||||
(no-look (make-vector (dfa-num-states dfa) #t))
|
||||
|
||||
;; actions : (vector-of (union #f syntax-object))
|
||||
;; The action for each final state, #f if the state isn't final
|
||||
(actions (make-vector (dfa-num-states dfa) #f))
|
||||
|
||||
;; char-table : (vector-of (union #f nat))
|
||||
;; The lexer table, one entry per state per char.
|
||||
;; Each entry specifies a state to transition to.
|
||||
;; #f indicates no transition
|
||||
(char-table (make-vector (* 256 (dfa-num-states dfa)) #f)))
|
||||
|
||||
;; Fill the char-table vector
|
||||
(for-each
|
||||
(lambda (trans)
|
||||
(let ((from-state (car trans)))
|
||||
(for-each (lambda (chars/to)
|
||||
(let ((to-state (cdr chars/to)))
|
||||
(for-each (lambda (char)
|
||||
(vector-set! char-table
|
||||
(bitwise-ior
|
||||
(char->integer char)
|
||||
(arithmetic-shift from-state 8))
|
||||
to-state))
|
||||
(car chars/to))))
|
||||
(cdr trans))))
|
||||
(dfa-transitions dfa))
|
||||
|
||||
(for-each (lambda (trans)
|
||||
(vector-set! no-look (car trans) #f))
|
||||
(dfa-transitions dfa))
|
||||
|
||||
(for-each (lambda (state/action)
|
||||
(vector-set! actions (car state/action) (cdr state/action)))
|
||||
(dfa-final-states/actions dfa))
|
||||
|
||||
(values char-table (dfa-start-state dfa) actions no-look)))
|
||||
|
||||
(test-block ()
|
||||
((call-with-values (lambda ()
|
||||
(dfa->table (make-dfa 1 1 (list) (list))))
|
||||
list)
|
||||
(list (make-vector 256 #f) 1 (vector #f) (make-vector 1 #t)))
|
||||
((call-with-values (lambda ()
|
||||
(dfa->table (make-dfa 4 1 (list (cons 2 2) (cons 3 3))
|
||||
(list (cons 1 (list (cons (list #\1 #\2) 1)
|
||||
(cons (list #\3) 2)))
|
||||
(cons 2 (list (cons (list #\1) 3)))))))
|
||||
list)
|
||||
(list (let ((v (make-vector 1024 #f)))
|
||||
(vector-set! v 305 1)
|
||||
(vector-set! v 306 1)
|
||||
(vector-set! v 307 2)
|
||||
(vector-set! v 561 3)
|
||||
v)
|
||||
1
|
||||
(vector #f #f 2 3)
|
||||
(vector #t #f #f #t))))
|
||||
|
||||
;; build-lexer : syntax-object list -> (values (vector-of (union #f nat)) nat (vector-of (union #f syntax-object)) (vector-of bool))
|
||||
;; each syntax object has the form (re action)
|
||||
(define (build-lexer sos)
|
||||
(let* ((s-re-acts (map (lambda (so)
|
||||
(cons (parse (car (syntax->list so)))
|
||||
(cadr (syntax->list so))))
|
||||
sos))
|
||||
|
||||
(cache (make-cache))
|
||||
|
||||
(re-acts (map (lambda (s-re-act)
|
||||
(cons (->re (car s-re-act) cache)
|
||||
(cdr s-re-act)))
|
||||
s-re-acts))
|
||||
|
||||
(dfa (build-dfa re-acts cache)))
|
||||
;(print-dfa dfa)
|
||||
(dfa->table dfa)))
|
||||
)
|
@ -0,0 +1,301 @@
|
||||
(module re mzscheme
|
||||
(require (lib "match.ss")
|
||||
(lib "list.ss")
|
||||
"util.ss")
|
||||
|
||||
(provide ->re build-epsilon build-zero build-char-set build-concat
|
||||
build-repeat build-or build-and build-neg
|
||||
epsilonR? zeroR? char-setR? concatR? repeatR? orR? andR? negR?
|
||||
char-setR-chars concatR-re1 concatR-re2 repeatR-re orR-res
|
||||
andR-res negR-re
|
||||
re-nullable? re-index)
|
||||
|
||||
;; get-index : -> nat
|
||||
(define get-index (make-counter))
|
||||
|
||||
;; An re is either
|
||||
;; - (make-epsilonR bool nat)
|
||||
;; - (make-zeroR bool nat)
|
||||
;; - (make-char-setR bool nat (list-of char)) The list must be sorted
|
||||
;; - (make-concatR bool nat re re)
|
||||
;; - (make-repeatR bool nat re)
|
||||
;; - (make-orR bool nat (list-of re)) Must not directly contain any orRs
|
||||
;; - (make-andR bool nat (list-of re)) Must not directly contain any andRs
|
||||
;; - (make-negR bool nat re)
|
||||
;;
|
||||
;; Every re must have an index field globally different from all
|
||||
;; other re index fields.
|
||||
(define-struct re (nullable? index))
|
||||
(define-struct (epsilonR re) ())
|
||||
(define-struct (zeroR re) ())
|
||||
(define-struct (char-setR re) (chars) (make-inspector))
|
||||
(define-struct (concatR re) (re1 re2) (make-inspector))
|
||||
(define-struct (repeatR re) (re))
|
||||
(define-struct (orR re) (res) (make-inspector))
|
||||
(define-struct (andR re) (res))
|
||||
(define-struct (negR re) (re))
|
||||
|
||||
;; e : re
|
||||
;; The unique epsilon re
|
||||
(define e (make-epsilonR #t (get-index)))
|
||||
|
||||
;; z : re
|
||||
;; The unique zero re
|
||||
(define z (make-zeroR #f (get-index)))
|
||||
|
||||
|
||||
;; s-re = char match the given character
|
||||
;; | string match its sequence of characters
|
||||
;; | re a precompiled re
|
||||
;; | (epsilon) match the empty string
|
||||
;; | (* s-re) match 0 or more
|
||||
;; | (+ s-re) match 1 or more
|
||||
;; | (? s-re) match 0 or 1
|
||||
;; | (: s-re ...) match one of the sub-expressions
|
||||
;; | (@ s-re ...) match each sub-expression in succession
|
||||
;; | (- char char) match any character between two (inclusive)
|
||||
;; | (^ char_or_range ...1) match any character not listed
|
||||
;; (the null concatenation `(@) means epsilon as does "".
|
||||
;; the null or `(:) means match nothing. The null carat `(^) means match
|
||||
;; any character.)
|
||||
|
||||
;; ->re : s-re cache -> re
|
||||
(define (->re exp cache)
|
||||
(match exp
|
||||
((? char?) (build-char-set (list exp) cache))
|
||||
((? string?) (->re `(@ ,@(string->list exp)) cache))
|
||||
((? re?) exp)
|
||||
(`(epsilon) (build-epsilon))
|
||||
(`(* ,r)
|
||||
(build-repeat (->re r cache) cache))
|
||||
(`(+ ,r)
|
||||
(->re `(@ ,r (* ,r)) cache))
|
||||
(`(? ,r)
|
||||
(let ((c (->re r cache)))
|
||||
(if (re-nullable? c)
|
||||
c
|
||||
(build-or (list e c) cache))))
|
||||
(`(: ,r)
|
||||
(->re r cache))
|
||||
(`(: ,rs ...)
|
||||
(build-or (flatten-res (map (lambda (r) (->re r cache)) rs) cache)
|
||||
cache))
|
||||
(`(@ ,rs ...)
|
||||
(foldr (lambda (x y)
|
||||
(build-concat (->re x cache) y cache))
|
||||
e
|
||||
rs))
|
||||
(`(- ,c1 ,c2)
|
||||
(let ((i1 (char->integer (if (string? c1) (string-ref c1 0) c1)))
|
||||
(i2 (char->integer (if (string? c2) (string-ref c2 0) c2))))
|
||||
(if (<= i1 i2)
|
||||
(build-char-set (make-range i1 i2) cache)
|
||||
z)))
|
||||
(`(^ ,crs ...)
|
||||
(let ((cs (->re `(: ,@crs) cache)))
|
||||
(cond
|
||||
((zeroR? cs) (build-char-set (make-range 0 255) cache))
|
||||
((char-setR? cs)
|
||||
(build-char-set
|
||||
(let loop ((bad-chars (map char->integer
|
||||
(char-setR-chars cs)))
|
||||
(i 0))
|
||||
(cond
|
||||
((> i 255) null)
|
||||
((and (not (null? bad-chars))
|
||||
(= i (car bad-chars)))
|
||||
(loop (cdr bad-chars) (add1 i)))
|
||||
(else
|
||||
(cons (integer->char i) (loop bad-chars (add1 i))))))
|
||||
cache))
|
||||
(else z))))))
|
||||
|
||||
|
||||
|
||||
|
||||
;; flatten-res: (list-of re) cache -> (list-of re)
|
||||
;; Takes all the char-sets in l and combines them into one element.
|
||||
;; Removes orRs too,
|
||||
(define (flatten-res l cache)
|
||||
(let loop ((res l)
|
||||
(chars null)
|
||||
(no-chars null))
|
||||
(cond
|
||||
((null? res)
|
||||
(if (null? chars)
|
||||
no-chars
|
||||
(cons (build-char-set (mergesort chars char<?) cache) no-chars)))
|
||||
((char-setR? (car res))
|
||||
(loop (cdr res) (merge (char-setR-chars (car res)) chars) no-chars))
|
||||
((orR? (car res))
|
||||
(loop (append (orR-res (car res)) (cdr res)) chars no-chars))
|
||||
(else (loop (cdr res) chars (cons (car res) no-chars))))))
|
||||
|
||||
;; build-epsilon : -> re
|
||||
(define (build-epsilon) e)
|
||||
|
||||
(define (build-zero) z)
|
||||
|
||||
;; build-char-set : (list-of char) cache -> re
|
||||
;; cs must be sorted
|
||||
#;(define (build-char-set cs cache)
|
||||
(cond
|
||||
((null? cs) z)
|
||||
(else
|
||||
(make-char-setR #f (get-index) cs))))
|
||||
|
||||
(define (build-char-set cs cache)
|
||||
(cond
|
||||
((null? cs) z)
|
||||
(else
|
||||
(cache cs
|
||||
(lambda ()
|
||||
(make-char-setR #f (get-index) cs))))))
|
||||
|
||||
|
||||
|
||||
;; build-concat : re re cache -> re
|
||||
(define (build-concat r1 r2 cache)
|
||||
(cond
|
||||
((eq? e r1) r2)
|
||||
((eq? e r2) r1)
|
||||
((or (eq? z r1) (eq? z r2)) z)
|
||||
(else
|
||||
(let* ((i1 (re-index r1))
|
||||
(i2 (re-index r2))
|
||||
(key (if (< i1 i2)
|
||||
(cons i1 i2)
|
||||
(cons i2 i1))))
|
||||
(cache (cons 'concat key)
|
||||
(lambda ()
|
||||
(make-concatR (and (re-nullable? r1) (re-nullable? r2))
|
||||
(get-index)
|
||||
r1 r2)))))))
|
||||
|
||||
;; build-repeat : re cache -> re
|
||||
(define (build-repeat r cache)
|
||||
(cache (cons 'repeat (re-index r))
|
||||
(lambda ()
|
||||
(make-repeatR #t (get-index) r))))
|
||||
|
||||
|
||||
;; build-or : (list-of re) cache -> re
|
||||
(define (build-or rs cache)
|
||||
(let ((rs
|
||||
(filter
|
||||
(lambda (x) (not (eq? x z)))
|
||||
(do-simple-equiv (replace rs orR? orR-res null) re-index))))
|
||||
(cond
|
||||
((null? rs) z)
|
||||
((null? (cdr rs)) (car rs))
|
||||
(else
|
||||
(cache (cons 'or (map re-index rs))
|
||||
(lambda ()
|
||||
(make-orR (ormap re-nullable? rs) (get-index) rs)))))))
|
||||
|
||||
;; build-and : (list-of re) cache -> re
|
||||
(define (build-and rs cache)
|
||||
(let ((rs (do-simple-equiv (replace rs andR? andR-res null) rs)))
|
||||
(cond
|
||||
((ormap (lambda (x) (eq? x z)) rs) z)
|
||||
(else
|
||||
(cache (cons 'and (map re-index rs))
|
||||
(lambda ()
|
||||
(make-andR (andmap re-nullable? rs) (get-index) rs)))))))
|
||||
|
||||
;; build-neg : re cache -> re
|
||||
(define (build-neg r cache)
|
||||
(cache (cons 'neg (re-index r))
|
||||
(lambda ()
|
||||
(make-negR (not (re-nullable? r)) (get-index) r))))
|
||||
|
||||
(test-block ((c (make-cache))
|
||||
(r1 (->re #\1 c))
|
||||
(r2 (->re #\2 c))
|
||||
(rc (->re `(@ ,r1 ,r2) c))
|
||||
(rc2 (->re `(@ ,r2 ,r1) c))
|
||||
(rr (->re `(* ,rc) c))
|
||||
(ro (->re `(: ,rr ,rc ,rr) c))
|
||||
(ro2 (->re `(: ,rc ,rr ,z) c))
|
||||
(ro3 (->re `(: ,rr ,rc) c)))
|
||||
(rc rc2)
|
||||
(ro ro2)
|
||||
(ro ro3)
|
||||
((->re `(* ,rc) c) rr)
|
||||
((build-char-set null c) z)
|
||||
((->re `(@ ,r1 (epsilon)) c) r1)
|
||||
((->re `(@ (epsilon) ,r1) c) r1)
|
||||
((->re `(@ ,r1 ,z) c) z)
|
||||
((->re `(@ ,z ,r1) c) z)
|
||||
((->re `(@ ,z (epsilon)) c) z)
|
||||
((->re `(@ (epsilon) ,z) c) z)
|
||||
((->re `(:) c) z)
|
||||
((->re `(: ,rr) c) rr)
|
||||
((build-or `(,z ,r1 ,z) c) r1)
|
||||
((build-or (list
|
||||
(build-or (list r1 r2) c)
|
||||
(build-or (list rc rr) c))
|
||||
c)
|
||||
(build-or (list r1 r2 rc rr) c))
|
||||
((concatR-re1 rc2) r1)
|
||||
((concatR-re2 rc2) r2)
|
||||
((orR-res ro) (list rc rr))
|
||||
((repeatR-re rr) rc)
|
||||
((re-nullable? r1) #f)
|
||||
((re-nullable? rc) #f)
|
||||
((re-nullable? (->re `(@ ,rr ,rr) c)) #t)
|
||||
((re-nullable? rr) #t)
|
||||
((re-nullable? ro) #t)
|
||||
((re-nullable? (->re `(: ,r1 ,r2) c)) #f))
|
||||
|
||||
(test-block ((c (make-cache))
|
||||
(r1 (->re #\1 c))
|
||||
(r2 (->re #\2 c))
|
||||
(r3-5 (->re '(- #\3 #\5) c))
|
||||
(r4 (build-or `(,r1 ,r2) c))
|
||||
(r5 (->re `(: ,r3-5 #\7) c))
|
||||
(r6 (->re #\6 c)))
|
||||
((flatten-res null c) null)
|
||||
((char-setR-chars (car (flatten-res `(,r1) c))) '(#\1))
|
||||
((char-setR-chars (car (flatten-res `(,r4) c))) '(#\1 #\2))
|
||||
((char-setR-chars (car (flatten-res `(,r6 ,r5 ,r4 ,r3-5 ,r2 ,r1) c)))
|
||||
(string->list "1234567")))
|
||||
|
||||
(test-block ((c (make-cache))
|
||||
(r (->re #\a c))
|
||||
(rr (->re `(@ ,r ,r) c))
|
||||
(rrr (->re `(@ ,r ,rr) c))
|
||||
(rrr* (->re `(* ,rrr) c)))
|
||||
((char-setR-chars r) '(#\a))
|
||||
((->re "" c) e)
|
||||
((->re "asdf" c) (->re `(@ #\a #\s #\d #\f) c))
|
||||
((->re r c) r)
|
||||
((->re `(epsilon) c) e)
|
||||
((->re `(* ,r) c) (build-repeat r c))
|
||||
((->re `(+ ,r) c) (build-concat r (build-repeat r c) c))
|
||||
((->re `(? ,r) c) (build-or (list e r) c))
|
||||
((->re `(? ,rrr*) c) rrr*)
|
||||
((->re `(: (: (- #\a #\c) (^ (- #\000 #\110) (- #\112 #\377)))
|
||||
(: (* #\2))) c)
|
||||
(build-or (list (build-char-set (list #\111 #\a #\b #\c) c)
|
||||
(build-repeat (build-char-set '(#\2) c) c))
|
||||
c))
|
||||
((->re `(: ,rr ,rrr) c) (build-or (list rr rrr) c))
|
||||
((->re `(: ,r) c) r)
|
||||
((->re `(:) c) z)
|
||||
((->re `(@) c) e)
|
||||
((->re `(@ ,rrr*) c) rrr*)
|
||||
(rr (build-concat r r c))
|
||||
((->re `(@ ,r ,rr ,rrr) c)
|
||||
(build-concat r (build-concat rr rrr c) c))
|
||||
((char-setR-chars (->re `(- #\1 #\1) c)) '(#\1))
|
||||
((char-setR-chars (->re `(- #\1 #\9) c)) (string->list "123456789"))
|
||||
((char-setR-chars (->re `(- "1" "1") c)) '(#\1))
|
||||
((char-setR-chars (->re `(- "1" "9") c)) (string->list "123456789"))
|
||||
((->re `(- "9" "1") c) z)
|
||||
((char-setR-chars (->re `(^) c))
|
||||
(char-setR-chars (->re `(- #\000 #\377) c)))
|
||||
((char-setR-chars (->re `(^ #\001 (- #\002 #\377)) c)) `(#\000))
|
||||
)
|
||||
|
||||
)
|
@ -0,0 +1,104 @@
|
||||
(module stx mzscheme
|
||||
(require (lib "stx.ss" "syntax")
|
||||
"util.ss")
|
||||
|
||||
(provide parse)
|
||||
|
||||
|
||||
(define (num-arg-err s expect given)
|
||||
(raise-syntax-error
|
||||
'regular-expression
|
||||
(format "operator expects ~a arguments, given ~a" expect given)
|
||||
s))
|
||||
|
||||
|
||||
;; parse : syntax-object -> s-re (see re.ss)
|
||||
;; checks for errors and generates the plain s-exp form for s
|
||||
(define (parse s)
|
||||
(let ((s-e (syntax-e s)))
|
||||
(cond
|
||||
((char? s-e) s-e)
|
||||
((string? s-e) s-e)
|
||||
((symbol? s-e)
|
||||
(let ((expand (syntax-local-value s (lambda () #f))))
|
||||
(unless (lex-abbrev? expand)
|
||||
(raise-syntax-error 'regular-expression "undefined abbreviation" s))
|
||||
(parse (lex-abbrev-abbrev expand))))
|
||||
((stx-null? s)
|
||||
(raise-syntax-error 'regular-expression "invalid regular expression" s))
|
||||
((stx-list? s)
|
||||
(let* ((ar (stx->list (stx-cdr s)))
|
||||
(num-args (length ar)))
|
||||
(case (syntax-e (stx-car s))
|
||||
((epsilon) '(epsilon))
|
||||
((*)
|
||||
(unless (= num-args 1)
|
||||
(num-arg-err s 1 num-args))
|
||||
`(* ,(parse (car ar))))
|
||||
((+)
|
||||
(unless (= num-args 1)
|
||||
(num-arg-err s 1 num-args))
|
||||
`(+ ,(parse (car ar))))
|
||||
((?)
|
||||
(unless (= num-args 1)
|
||||
(num-arg-err s 1 num-args))
|
||||
`(? ,(parse (car ar))))
|
||||
((:) `(: ,@(map parse ar)))
|
||||
((@) `(@ ,@(map parse ar)))
|
||||
((-)
|
||||
(unless (= num-args 2)
|
||||
(num-arg-err s 2 num-args))
|
||||
(let ((c1 (parse (car ar)))
|
||||
(c2 (parse (cadr ar))))
|
||||
(if (and (or (char? c1) (and (string? c1) (= 1 (string-length c1))))
|
||||
(or (char? c2) (and (string? c2) (= 1 (string-length c2)))))
|
||||
(let ((i1 (char->integer (if (char? c1) c1 (string-ref c1 0))))
|
||||
(i2 (char->integer (if (char? c2) c2 (string-ref c2 0)))))
|
||||
(if (<= i1 i2)
|
||||
`(- ,c1 ,c2)
|
||||
(raise-syntax-error
|
||||
'regular-expression
|
||||
(format "first argument ~a does not preceed second argument ~a"
|
||||
c1 c2)
|
||||
s)))
|
||||
(raise-syntax-error
|
||||
'regular-expression
|
||||
(format "expects single character arguments, given ~a and ~a"
|
||||
(syntax-object->datum (car ar))
|
||||
(syntax-object->datum (cadr ar)))
|
||||
s))))
|
||||
((^)
|
||||
(let ((res (map parse ar)))
|
||||
(if (not (andmap pure-char? res))
|
||||
(raise-syntax-error
|
||||
'regular-expression
|
||||
(format
|
||||
"expects single character or character range arguments, given ~a"
|
||||
(map syntax-object->datum ar))
|
||||
s))
|
||||
`(^ ,@res)))
|
||||
(else
|
||||
(raise-syntax-error
|
||||
'regular-expression
|
||||
"invalid operator"
|
||||
s)))))
|
||||
(else
|
||||
(raise-syntax-error
|
||||
'regular-expression
|
||||
"invalid regular expression"
|
||||
s)))))
|
||||
|
||||
(define (pure-char? s-re)
|
||||
(cond
|
||||
((char? s-re) #t)
|
||||
((string? s-re) (= (string-length s-re) 1))
|
||||
((list? s-re)
|
||||
(let ((op (car s-re)))
|
||||
(case op
|
||||
((: ^) (andmap pure-char? (cdr s-re)))
|
||||
((-) #t)
|
||||
(else #f))))
|
||||
(else #f)))
|
||||
|
||||
|
||||
)
|
@ -0,0 +1,180 @@
|
||||
(module util mzscheme
|
||||
(require (lib "list.ss"))
|
||||
|
||||
(provide (all-defined-except split-acc))
|
||||
|
||||
(define-struct lex-abbrev (abbrev))
|
||||
|
||||
(define-syntax test-block
|
||||
(syntax-rules ()
|
||||
((_ defs (code right-ans) ...)
|
||||
(let* defs
|
||||
(let ((real-ans code))
|
||||
(unless (equal? real-ans right-ans)
|
||||
(printf "Test failed: ~e gave ~e. Expected ~e~n"
|
||||
'code real-ans 'right-ans))) ...))))
|
||||
|
||||
#;(define-syntax test-block
|
||||
(syntax-rules ()
|
||||
((_ x ...) (void))))
|
||||
|
||||
|
||||
;; A cache is (X ( -> Y) -> Y)
|
||||
;; make-cache : -> cache
|
||||
;; table map Xs to Ys. If key is mapped, its value is returned.
|
||||
;; Otherwise, build is invoked and its result is placed in the table and
|
||||
;; returned.
|
||||
;; Xs are compared with equal?
|
||||
(define (make-cache)
|
||||
(let ((table (make-hash-table 'equal)))
|
||||
(lambda (key build)
|
||||
(hash-table-get table key
|
||||
(lambda ()
|
||||
(let ((new (build)))
|
||||
(hash-table-put! table key new)
|
||||
new))))))
|
||||
(test-block ((cache (make-cache)))
|
||||
((cache '(1 2) (lambda () 9)) 9)
|
||||
((cache '(2 1) (lambda () 8)) 8)
|
||||
((cache '(1 2) (lambda () 1)) 9))
|
||||
|
||||
|
||||
;; make-counter : -> -> nat
|
||||
;; makes a function that returns a higher number by 1, each time
|
||||
;; it is called.
|
||||
(define (make-counter)
|
||||
(let ((counter 0))
|
||||
(lambda ()
|
||||
(begin0
|
||||
counter
|
||||
(set! counter (add1 counter))))))
|
||||
(test-block ((c (make-counter))
|
||||
(d (make-counter)))
|
||||
((c) 0)
|
||||
((d) 0)
|
||||
((c) 1)
|
||||
((d) 1)
|
||||
((c) 2))
|
||||
|
||||
|
||||
;; remove-dups : (list-of X) (X -> number) -> (list-of X)
|
||||
;; removes the entries from l that have the same index as a
|
||||
;; previous entry. l must be grouped by indexes.
|
||||
(define (remove-dups l index acc)
|
||||
(cond
|
||||
((null? l) (reverse acc))
|
||||
((null? acc) (remove-dups (cdr l) index (cons (car l) acc)))
|
||||
((= (index (car acc)) (index (car l)))
|
||||
(remove-dups (cdr l) index acc))
|
||||
(else
|
||||
(remove-dups (cdr l) index (cons (car l) acc)))))
|
||||
|
||||
(test-block ()
|
||||
((remove-dups '((1 2) (2 2) (1 3) (1 4) (100 4) (0 5)) cadr null)
|
||||
'((1 2) (1 3) (1 4) (0 5)))
|
||||
((remove-dups null error null) null))
|
||||
|
||||
|
||||
;; do-simple-equiv : (list-of X) (X -> nat) -> (list-of X)
|
||||
;; Sorts l according to index and removes the entries with duplicate
|
||||
;; indexes.
|
||||
(define (do-simple-equiv l index)
|
||||
(let ((ordered (mergesort l (lambda (a b) (< (index a) (index b))))))
|
||||
(remove-dups ordered index null)))
|
||||
|
||||
(test-block ()
|
||||
((do-simple-equiv '((2 2) (1 4) (1 2) (100 4) (1 3) (0 5)) cadr)
|
||||
'((2 2) (1 3) (1 4) (0 5)))
|
||||
((do-simple-equiv null error) null))
|
||||
|
||||
|
||||
;; replace : (list-of X) (X -> bool) (X -> (list-of X)) (list-of X) ->
|
||||
;; (list-of X)
|
||||
;; If (pred? r) for some r in l, splice (get r) in place of r in the resulting
|
||||
;; list.
|
||||
(define (replace l pred? get acc)
|
||||
(cond
|
||||
((null? l) acc)
|
||||
((pred? (car l)) (replace (cdr l) pred? get (append (get (car l)) acc)))
|
||||
(else (replace (cdr l) pred? get (cons (car l) acc)))))
|
||||
|
||||
(test-block ()
|
||||
((replace null void (lambda () (list 1)) null) null)
|
||||
((replace '(1 2 3 4 3 5)
|
||||
(lambda (x) (= x 3))
|
||||
(lambda (x) (list 1 2 3))
|
||||
null)
|
||||
'(5 1 2 3 4 1 2 3 2 1)))
|
||||
|
||||
;; make-range : int * int -> char list
|
||||
;; creates a list of all chars between i and j. i <= j
|
||||
(define (make-range i j)
|
||||
(letrec ((make-range
|
||||
(lambda (i j)
|
||||
(cond
|
||||
((= i j) (list (integer->char i)))
|
||||
(else
|
||||
(cons (integer->char i) (make-range (add1 i) j)))))))
|
||||
(make-range i j)))
|
||||
(test-block ()
|
||||
((make-range 97 110) (string->list "abcdefghijklmn"))
|
||||
((make-range 111 111) '(#\o)))
|
||||
|
||||
|
||||
|
||||
;; merge : (list-of char) (list-of char) -> (list-of char)
|
||||
;; Combines 2 sorted, duplicate-free lists into 1, removing duplicates.
|
||||
(define (merge l1 l2)
|
||||
(cond
|
||||
((null? l2) l1)
|
||||
((null? l1) l2)
|
||||
(else (let ((cl1 (car l1))
|
||||
(cl2 (car l2)))
|
||||
(cond
|
||||
((> (char->integer cl1) (char->integer cl2))
|
||||
(cons cl2 (merge l1 (cdr l2))))
|
||||
((< (char->integer cl1) (char->integer cl2))
|
||||
(cons cl1 (merge (cdr l1) l2)))
|
||||
(else (merge (cdr l1) l2)))))))
|
||||
(test-block ()
|
||||
((merge (string->list "abcd")
|
||||
(string->list "abde"))
|
||||
(string->list "abcde"))
|
||||
((merge null null) null)
|
||||
((merge null '(#\1)) '(#\1))
|
||||
((merge '(#\1) null) '(#\1)))
|
||||
|
||||
(define (split-acc l1 l2 i l1-i l2-i)
|
||||
(cond
|
||||
((null? l1) (values (reverse! i) (reverse! l1-i) (reverse! (append! (reverse l2) l2-i))))
|
||||
((null? l2) (values (reverse! i) (reverse! (append! (reverse l1) l1-i)) (reverse! l2-i)))
|
||||
(else (let ((cl1 (car l1))
|
||||
(cl2 (car l2)))
|
||||
(cond
|
||||
((> (char->integer cl1) (char->integer cl2))
|
||||
(split-acc l1 (cdr l2) i l1-i (cons cl2 l2-i)))
|
||||
((< (char->integer cl1) (char->integer cl2))
|
||||
(split-acc (cdr l1) l2 i (cons cl1 l1-i) l2-i))
|
||||
(else
|
||||
(split-acc (cdr l1) (cdr l2) (cons cl1 i) l1-i l2-i)))))))
|
||||
|
||||
;; split : (list-of char) (list-of char) -> (list-of char) (list-of char) (list-of char)
|
||||
;; Takes sorted, duplicate-free l1 and l2 and returns (l1 intersect l2),
|
||||
;; l1 - (l1 intersect l2) and l2 - (l1 intersect l2)
|
||||
(define (split l1 l2)
|
||||
(split-acc l1 l2 null null null))
|
||||
|
||||
(test-block ()
|
||||
((let-values (((a b c)
|
||||
(split (string->list "abcdghjkl")
|
||||
(string->list "abdeijmn"))))
|
||||
(list a b c))
|
||||
(list (string->list "abdj") (string->list "cghkl") (string->list "eimn")))
|
||||
((let-values (((a b c) (split null null)))
|
||||
(list a b c)) (list null null null))
|
||||
((let-values (((a b c) (split '(#\1) null)))
|
||||
(list a b c)) (list null '(#\1) null))
|
||||
((let-values (((a b c) (split null '(#\1))))
|
||||
(list a b c)) (list null null '(#\1))))
|
||||
|
||||
)
|
Loading…
Reference in New Issue