From 7b918d0164e4055246db4df1b84a5eb8f42ffc3c Mon Sep 17 00:00:00 2001 From: Scott Owens Date: Sun, 11 Apr 2004 07:01:55 +0000 Subject: [PATCH] A new lexer generator backend based on Brzozowski derivatives. original commit: fb626b980fd3193e34cd3f6fabb1e95bd1aeb08a --- collects/parser-tools/lex.ss | 87 ++--- collects/parser-tools/private-lex/actions.ss | 42 +++ collects/parser-tools/private-lex/deriv.ss | 314 +++++++++++++++++++ collects/parser-tools/private-lex/front.ss | 101 ++++++ collects/parser-tools/private-lex/re.ss | 301 ++++++++++++++++++ collects/parser-tools/private-lex/stx.ss | 104 ++++++ collects/parser-tools/private-lex/util.ss | 180 +++++++++++ 7 files changed, 1069 insertions(+), 60 deletions(-) create mode 100644 collects/parser-tools/private-lex/actions.ss create mode 100644 collects/parser-tools/private-lex/deriv.ss create mode 100644 collects/parser-tools/private-lex/front.ss create mode 100644 collects/parser-tools/private-lex/re.ss create mode 100644 collects/parser-tools/private-lex/stx.ss create mode 100644 collects/parser-tools/private-lex/util.ss diff --git a/collects/parser-tools/lex.ss b/collects/parser-tools/lex.ss index ff3fd86..d883ae6 100644 --- a/collects/parser-tools/lex.ss +++ b/collects/parser-tools/lex.ss @@ -3,12 +3,13 @@ ;; Provides the syntax used to create lexers and the functions needed to ;; create and use the buffer that the lexer reads from. See doc.txt. - (require-for-syntax "private-lex/generate-code.ss" - "private-lex/structs.ss") + (require-for-syntax "private-lex/util.ss" + "private-lex/actions.ss" + "private-lex/front.ss") (require (lib "readerr.ss" "syntax") - "private-lex/token.ss" - (lib "cffi.ss" "compiler")) + (lib "cffi.ss" "compiler") + "private-lex/token.ss") (provide lexer lexer-src-pos define-lex-abbrev define-lex-abbrevs position-offset position-line position-col position? @@ -16,33 +17,6 @@ (define file-path (make-parameter #f)) - #;(define-syntaxes (lexer-exp lexer-src-pos-exp) - (let ((build-lexer - (lambda (wrap?) - (lambda (stx) - (syntax-case stx () - ((_) - (raise-syntax-error #f "empty lexer is not allowed" stx)) - ((_ re-act ...) - (begin - (for-each - (lambda (x) - (syntax-case x () - ((re act) (void)) - (_ (raise-syntax-error 'lexer - "expects regular expression / action pairs" - x)))) - (syntax->list (syntax (re-act ...)))) - (let ((table (generate-table (syntax (re-act ...)) stx))) - (with-syntax ((code (compile-table table)) - (actions-stx `(vector ,@(vector->list (table-actions table)))) - (wrap? wrap?)) - (syntax - (compiled-lexer-body code actions-stx wrap?))))))))))) - (values - (build-lexer #f) - (build-lexer #t)))) - (define-syntaxes (lexer lexer-src-pos) (let ((build-lexer (lambda (wrap?) @@ -62,6 +36,8 @@ (syntax->list (syntax (re-act ...)))) (let* ((spec/re-act-lst (syntax->list (syntax (re-act ...)))) + (eof-act + (get-special-action spec/re-act-lst 'eof #''eof)) (spec-act (get-special-action spec/re-act-lst 'special #'(void))) (spec-error-act @@ -70,13 +46,15 @@ (get-special-action spec/re-act-lst 'special-comment #'#f)) (re-act-lst (filter-out-specials spec/re-act-lst - '(special special-comment special-error)))) - (let ((table (generate-table re-act-lst #'here stx))) - (with-syntax ((start-state-stx (table-start table)) - (trans-table-stx (table-trans table)) - (eof-table-stx (table-eof table)) - (no-lookahead-stx (table-no-lookahead table)) - (actions-stx `(vector ,@(vector->list (table-actions table)))) + '(special special-comment special-error eof)))) + (let-values (((trans start actions no-look) + (build-lexer re-act-lst))) + (with-syntax ((start-state-stx start) + (trans-table-stx trans) + (no-lookahead-stx no-look) + (actions-stx `(vector ,@(map (lambda (a) + (wrap-action a 'lexeme #'here a)) + (vector->list actions)))) (spec-act-stx (wrap-action spec-act 'special #'here spec-act)) (spec-error-act-stx @@ -84,17 +62,19 @@ (has-comment-act?-stx (if (syntax-e spec-comment-act) #t #f)) (spec-comment-act-stx (wrap-action spec-comment-act (gensym) #'here spec-comment-act)) + (eof-act-stx + (wrap-action eof-act 'lexeme #'here eof-act)) (wrap? wrap?)) (syntax (lexer-body start-state-stx trans-table-stx - eof-table-stx actions-stx no-lookahead-stx spec-act-stx spec-error-act-stx has-comment-act?-stx spec-comment-act-stx + eof-act-stx wrap?)))))))))))) (values (build-lexer #f) @@ -136,23 +116,10 @@ "Form should be (define-lex-abbrevs (name re) ...)" stx)))) - #;(define (compiled-lexer-body lexer actions wrap?) - (lambda (ip) - (unless (input-port? ip) - (raise-type-error - 'lexer - "input-port" - 0 - ip)) - (let ((first-pos (get-position ip))) - (let-values (((longest-match-length length longest-match-action) - (lexer ip peek-string))) - (check-match ip first-pos longest-match-length length - (vector-ref actions longest-match-action) wrap?))))) - (define (lexer-body start-state trans-table eof-table actions no-lookahead + (define (lexer-body start-state trans-table actions no-lookahead special-action special-error-action - has-special-comment-action? special-comment-action wrap?) + has-special-comment-action? special-comment-action eof-action wrap?) (letrec ((lexer (lambda (ip) (unless (input-port? ip) @@ -167,9 +134,9 @@ ((eq? 'special first-char) (let* ((comment? #f) (error? #f) - (spec (with-handlers ((special-comment? + (spec (with-handlers ((exn:special-comment? (lambda (x) (set! comment? #t))) - (exn:fail? + (not-break-exn? (lambda (ex) (set! error? #t) ex))) (read-char-or-special ip)))) (cond @@ -181,6 +148,8 @@ (error? special-error-action) (else special-action)) spec wrap?))))) + ((eof-object? first-char) + (do-match ip first-pos eof-action (read-char-or-special ip) wrap?)) (else (let lexer-loop ( ;; current-state @@ -198,10 +167,8 @@ (longest-match-length 1)) (let ((next-state (cond - ((eof-object? char) - (vector-ref eof-table state)) - ((eq? char 'special) - #f) + ((eof-object? char) #f) + ((eq? char 'special) #f) (else (vector-ref trans-table diff --git a/collects/parser-tools/private-lex/actions.ss b/collects/parser-tools/private-lex/actions.ss new file mode 100644 index 0000000..f88344c --- /dev/null +++ b/collects/parser-tools/private-lex/actions.ss @@ -0,0 +1,42 @@ +(module actions mzscheme + + (provide (all-defined)) + + ;; wrap-action: (syntax-object or #f) symbol syntax-object syntax-object -> syntax-object + (define (wrap-action action result-name ctxt loc) + (if action + (let ((parms (datum->syntax-object + action + `(start-pos end-pos ,result-name return-without-pos input-port)))) + (datum->syntax-object ctxt + `(lambda ,parms ,action) + loc)) + (datum->syntax-object ctxt 'void loc))) + + + ;; get-special-action: (syntax-object list) symbol 'a -> syntax-object or 'a + ;; Returns the first action from a rule of the form ((which-special) action) + (define (get-special-action rules which-special none) + (cond + ((null? rules) none) + (else + (syntax-case (car rules) () + (((special) act) + (eq? (syntax-e (syntax special)) which-special) + (syntax act)) + (_ (get-special-action (cdr rules) which-special none)))))) + + ;; filter-out-specials: (syntax-object list) (symbol list) -> (syntax-object list) + ;; Returns a list missing all the rules of the form ((special) action) + ;; where special is a symbol in which specials. + (define (filter-out-specials rules which-specials) + (cond + ((null? rules) null) + (else + (syntax-case (car rules) () + (((special) act) + (memq (syntax-e (syntax special)) which-specials) + (filter-out-specials (cdr rules) which-specials)) + (_ (cons (car rules) (filter-out-specials (cdr rules) which-specials))))))) + + ) \ No newline at end of file diff --git a/collects/parser-tools/private-lex/deriv.ss b/collects/parser-tools/private-lex/deriv.ss new file mode 100644 index 0000000..1b0425a --- /dev/null +++ b/collects/parser-tools/private-lex/deriv.ss @@ -0,0 +1,314 @@ +(module deriv mzscheme + + (require (lib "list.ss") + "re.ss" + "util.ss") + + (provide build-dfa print-dfa (struct dfa (num-states start-state final-states/actions transitions))) + + (define e (build-epsilon)) + (define z (build-zero)) + + ;; get-char-groups : re -> (list-of char-setR?) + ;; Collects the char-setRs in r that could be used in + ;; taking the derivative of r. + (define (get-char-groups r) + (cond + ((or (eq? r e) (eq? r z)) null) + ((char-setR? r) (list r)) + ((concatR? r) + (if (re-nullable? (concatR-re1 r)) + (append (get-char-groups (concatR-re1 r)) + (get-char-groups (concatR-re2 r))) + (get-char-groups (concatR-re1 r)))) + ((repeatR? r) + (get-char-groups (repeatR-re r))) + ((orR? r) + (apply append (map get-char-groups (orR-res r)))))) + + (test-block ((c (make-cache)) + (r1 (->re #\1 c)) + (r2 (->re #\2 c))) + ((get-char-groups e) null) + ((get-char-groups z) null) + ((get-char-groups r1) (list r1)) + ((get-char-groups (->re `(@ ,r1 ,r2) c)) + (list r1)) + ((get-char-groups (->re `(@ ,e ,r2) c)) + (list r2)) + ((get-char-groups (->re `(@ (* ,r1) ,r2) c)) + (list r1 r2)) + ((get-char-groups (->re `(* ,r1) c)) + (list r1)) + ((get-char-groups (->re `(: (* ,r1) (@ (* ,r2) "3") "4") c)) + (list r1 r2 (->re "3" c) (->re "4" c))) + ) + + + ;; A char-set is a (list-of char) that is sorted and duplicate-free + + ;; partition : (list-of char-set) -> (list-of char-set) + ;; The coarsest refinment r of sets such that the char-sets in r + ;; are pairwise disjoint. + (define (partition sets) + (cond + ((null? sets) null) + (else + (partition1 (car sets) (partition (cdr sets)))))) + + ;; partition1 : char-set (list-of char-set) -> (list-of char-set) + ;; All the char-sets in sets must be pairwise disjoint. Splits set + ;; against each element in sets. + (define (partition1 set sets) + (cond + ((null? set) sets) + ((null? sets) (list set)) + (else + (let ((set2 (car sets))) + (let-values (((i s1 s2) (split set set2))) + (let ((rest (partition1 s1 (cdr sets)))) + (cond + ((null? i) + (cons s2 rest)) + ((null? s2) + (cons i rest)) + (else + (cons i (cons s2 rest)))))))))) + + (test-block ((sl string->list)) + ((partition null) null) + ((partition (list (sl "1234"))) (list (sl "1234"))) + ((partition (list (sl "1234") (sl "0235"))) + (list (sl "23") (sl "05") (sl "14"))) + ((partition (list (sl "12349") (sl "02359") (sl "67") (sl "29"))) + (list (sl "29") (sl "67") (sl "3") (sl "05") (sl "14"))) + ) + + (test-block ((sl string->list)) + ((partition1 (sl "bcdjw") null) (list (sl "bcdjw"))) + ((partition1 null null) null) + ((partition1 null (list (sl "a") (sl "b") (sl "1"))) + (list (sl "a") (sl "b") (sl "1"))) + ((partition1 (sl "bcdjw") + (list (sl "z") + (sl "ab") + (sl "dj"))) + (list (sl "z") (sl "b") (sl "a") (sl "dj") (sl "cw")))) + + + + ;; deriveR : re * char cache -> re + (define (deriveR r c cache) + (cond + ((or (eq? r e) (eq? r z)) z) + ((char-setR? r) + (if (memq c (char-setR-chars r)) e z)) + ((concatR? r) + (let* ((r1 (concatR-re1 r)) + (r2 (concatR-re2 r)) + (d (build-concat (deriveR r1 c cache) r2 cache))) + (if (re-nullable? r1) + (build-or (list d (deriveR r2 c cache)) cache) + d))) + ((repeatR? r) + (build-concat (deriveR (repeatR-re r) c cache) r cache)) + ((orR? r) + (build-or (map (lambda (x) (deriveR x c cache)) + (orR-res r)) + cache)) + ((andR? r) + (build-and (map (lambda (x) (deriveR x c cache)) + (andR-res r)) + cache)) + ((negR? r) + (build-neg (deriveR (negR-re r) c cache) cache)))) + + (test-block ((c (make-cache)) + (r1 (->re #\a c)) + (r2 (->re `(* #\a) c)) + (r3 (->re `(* ,r2) c)) + (r4 (->re `(@ #\a ,r2) c)) + (r5 (->re `(* ,r4) c)) + (r6 (->re `(: ,r5 #\a) c)) + (r7 (->re `(@ ,r2 ,r2) c))) + ((deriveR e #\a c) z) + ((deriveR z #\a c) z) + ((deriveR r1 #\b c) z) + ((deriveR r1 #\a c) e) + ((deriveR r2 #\a c) r2) + ((deriveR r2 #\b c) z) + ((deriveR r3 #\a c) (->re `(@ ,r2 ,r3) c)) + ((deriveR r3 #\b c) z) + ((deriveR r4 #\a c) r2) + ((deriveR r4 #\b c) z) + ((deriveR r5 #\a c) (->re `(@ ,r2 ,r5) c)) + ((deriveR r5 #\b c) z) + ((deriveR r6 #\a c) (->re `(: (@ ,r2 ,r5) (epsilon)) c)) + ((deriveR r6 #\b c) z) + ((deriveR r7 #\a c) (->re `(: (@ ,r2 ,r2) ,r2) c)) + ((deriveR r7 #\b c) z)) + + + ;; An re-action is (cons re action) + + ;; derive : (list-of re-action) char cache -> (union (list-of re-action) #f) + ;; applies deriveR to all the re-actions's re parts. + ;; Returns #f if the derived state is equivalent to z. + (define (derive r c cache) + (let ((new-r (map (lambda (ra) + (cons (deriveR (car ra) c cache) (cdr ra))) + r))) + (if (andmap (lambda (x) (eq? z (car x))) + new-r) + #f + new-r))) + + (test-block ((c (make-cache)) + (r1 (->re #\1 c)) + (r2 (->re #\2 c))) + ((derive null #\1 c) #f) + ((derive (list (cons r1 1) (cons r2 2)) #\1 c) + (list (cons e 1) (cons z 2))) + ((derive (list (cons r1 1) (cons r2 2)) #\3 c) #f)) + + + ;; get-final : (list-of re-action) -> (union #f syntax-object) + ;; An re that accepts e represents a final state. Return the + ;; action from the first final state or #f if there is none. + (define (get-final res) + (cond + ((null? res) #f) + ((re-nullable? (caar res)) (cdar res)) + (else (get-final (cdr res))))) + + (test-block ((c (make-cache)) + (r1 (->re #\a c)) + (r2 (->re #\b c)) + (a (list (cons r1 1) (cons r2 2)))) + ((derive null #\a c) #f) + ((derive a #\a c) (list (cons e 1) (cons z 2))) + ((derive a #\b c) (list (cons z 1) (cons e 2))) + ((derive a #\c c) #f) + ((get-final a) #f) + ((get-final (list (cons e 1) (cons e 2))) 1)) + + + ;; A state is (make-state (list-of re-action) nat) + (define-struct state (spec index)) + + ;; get->key : re-action -> (list-of nat) + ;; states are indexed by the list of indexes of their res + (define (get-key s) + (map (lambda (x) (re-index (car x))) s)) + + ;; compute-chars : (list-of state) -> (list-of char-set) + ;; Computed the sets of equivalent characters for taking the + ;; derivative of the car of st. Only one derivative per set need to be taken. + (define (compute-chars st) + (cond + ((null? st) null) + (else + (partition (map char-setR-chars + (apply append (map (lambda (x) (get-char-groups (car x))) + (state-spec (car st))))))))) + + (test-block ((c (make-cache)) + (r1 (->re `(- #\1 #\4) c)) + (r2 (->re `(- #\2 #\3) c))) + ((compute-chars null) null) + ((compute-chars (list (make-state null 1))) null) + ((compute-chars (list (make-state (list (cons r1 1) (cons r2 2)) 2))) + (list (list #\2 #\3) (list #\1 #\4)))) + + + ;; A dfa is (make-dfa int int + ;; (list-of (cons int syntax-object)) + ;; (list-of (cons int (list-of (cons (list-of char) int))))) + ;; Each transitions is a state and a list of chars with the state to transition to. + ;; The finals and transitions are sorted by state number, and duplicate free. + (define-struct dfa (num-states start-state final-states/actions transitions)) + + ;; build-dfa : (list-of re-action) cache -> dfa + (define (build-dfa rs cache) + (let* ((transitions (make-hash-table)) + (get-state-number (make-counter)) + (start (make-state rs (get-state-number)))) + (cache (cons 'state (get-key rs)) (lambda () start)) + (let loop ((old-states (list start)) + (new-states null) + (cs (compute-chars (list start)))) + (cond + ((and (null? old-states) (null? new-states)) + (make-dfa (get-state-number) (state-index start) + (mergesort (filter (lambda (x) (cdr x)) + (hash-table-map transitions + (lambda (state _) + (cons (state-index state) (get-final (state-spec state)))))) + (lambda (a b) (< (car a) (car b)))) + (mergesort (hash-table-map transitions + (lambda (state trans) + (cons (state-index state) + (map (lambda (t) + (cons (car t) + (state-index (cdr t)))) + trans)))) + (lambda (a b) (< (car a) (car b)))))) + ((null? old-states) + (loop new-states null (compute-chars new-states))) + ((null? cs) + (loop (cdr old-states) new-states (compute-chars (cdr old-states)))) + (else + (let* ((state (car old-states)) + (c (car cs)) + (new-re (derive (state-spec state) (car c) cache))) + (cond + (new-re + (let* ((new-state? #f) + (new-state (cache (cons 'state (get-key new-re)) + (lambda () + (set! new-state? #t) + (make-state new-re (get-state-number)))))) + (hash-table-put! transitions + state + (cons (cons c new-state) + (hash-table-get transitions state + (lambda () null)))) + (cond + (new-state? + (loop old-states (cons new-state new-states) (cdr cs))) + (else + (loop old-states new-states (cdr cs)))))) + (else (loop old-states new-states (cdr cs)))))))))) + + (define (print-dfa x) + (printf "number of states: ~a~n" (dfa-num-states x)) + (printf "start state: ~a~n" (dfa-start-state x)) + (printf "final states: ~a~n" (map car (dfa-final-states/actions x))) + (for-each (lambda (trans) + (printf "state: ~a~n" (car trans)) + (for-each (lambda (rule) + (printf " -~a-> ~a~n" + (car rule) + (cdr rule))) + (cdr trans))) + (dfa-transitions x))) + + (define (build-test-dfa rs) + (let ((c (make-cache))) + (build-dfa (map (lambda (x) (cons (->re x c) 'action)) + rs) + c))) + #| + (define t1 (build-test-dfa null)) + (define t2 (build-test-dfa `(#\a))) + (define t3 (build-test-dfa `(#\a #\b))) + (define t4 (build-test-dfa `((* #\a) + (* (@ #\a #\b))))) + (define t5 (build-test-dfa `((@ (* (: #\0 #\1)) #\1)))) + (define t6 (build-test-dfa `((* (* #\a)) + (* (@ #\b (* #\b)))))) + (define t7 (build-test-dfa `((@ (* #\a) (* #\b) (* #\c) (* #\d) (* #\e))))) + (define t8 + (build-test-dfa `((@ (* (: #\a #\b)) #\a (: #\a #\b) (: #\a #\b) (: #\a #\b) (: #\a #\b))))) +|# + ) \ No newline at end of file diff --git a/collects/parser-tools/private-lex/front.ss b/collects/parser-tools/private-lex/front.ss new file mode 100644 index 0000000..05ecdb1 --- /dev/null +++ b/collects/parser-tools/private-lex/front.ss @@ -0,0 +1,101 @@ +(module front mzscheme + (require "util.ss" + "stx.ss" + "re.ss" + "deriv.ss") + + (provide build-lexer) + + (define-syntax time-label + (syntax-rules () + ((_ l e ...) + (begin + (printf "~a: " l) + (time (begin e ...)))))) + + + ;; dfa->table : dfa -> (same as build-lexer) + (define (dfa->table dfa) + (let ( + ;; no-look : (vector-of bool) + ;; For each state whether the lexer can ignore the next input. + ;; It can do this only if there are no transitions out of the + ;; current state. + (no-look (make-vector (dfa-num-states dfa) #t)) + + ;; actions : (vector-of (union #f syntax-object)) + ;; The action for each final state, #f if the state isn't final + (actions (make-vector (dfa-num-states dfa) #f)) + + ;; char-table : (vector-of (union #f nat)) + ;; The lexer table, one entry per state per char. + ;; Each entry specifies a state to transition to. + ;; #f indicates no transition + (char-table (make-vector (* 256 (dfa-num-states dfa)) #f))) + + ;; Fill the char-table vector + (for-each + (lambda (trans) + (let ((from-state (car trans))) + (for-each (lambda (chars/to) + (let ((to-state (cdr chars/to))) + (for-each (lambda (char) + (vector-set! char-table + (bitwise-ior + (char->integer char) + (arithmetic-shift from-state 8)) + to-state)) + (car chars/to)))) + (cdr trans)))) + (dfa-transitions dfa)) + + (for-each (lambda (trans) + (vector-set! no-look (car trans) #f)) + (dfa-transitions dfa)) + + (for-each (lambda (state/action) + (vector-set! actions (car state/action) (cdr state/action))) + (dfa-final-states/actions dfa)) + + (values char-table (dfa-start-state dfa) actions no-look))) + + (test-block () + ((call-with-values (lambda () + (dfa->table (make-dfa 1 1 (list) (list)))) + list) + (list (make-vector 256 #f) 1 (vector #f) (make-vector 1 #t))) + ((call-with-values (lambda () + (dfa->table (make-dfa 4 1 (list (cons 2 2) (cons 3 3)) + (list (cons 1 (list (cons (list #\1 #\2) 1) + (cons (list #\3) 2))) + (cons 2 (list (cons (list #\1) 3))))))) + list) + (list (let ((v (make-vector 1024 #f))) + (vector-set! v 305 1) + (vector-set! v 306 1) + (vector-set! v 307 2) + (vector-set! v 561 3) + v) + 1 + (vector #f #f 2 3) + (vector #t #f #f #t)))) + + ;; build-lexer : syntax-object list -> (values (vector-of (union #f nat)) nat (vector-of (union #f syntax-object)) (vector-of bool)) + ;; each syntax object has the form (re action) + (define (build-lexer sos) + (let* ((s-re-acts (map (lambda (so) + (cons (parse (car (syntax->list so))) + (cadr (syntax->list so)))) + sos)) + + (cache (make-cache)) + + (re-acts (map (lambda (s-re-act) + (cons (->re (car s-re-act) cache) + (cdr s-re-act))) + s-re-acts)) + + (dfa (build-dfa re-acts cache))) + ;(print-dfa dfa) + (dfa->table dfa))) + ) \ No newline at end of file diff --git a/collects/parser-tools/private-lex/re.ss b/collects/parser-tools/private-lex/re.ss new file mode 100644 index 0000000..5122e2a --- /dev/null +++ b/collects/parser-tools/private-lex/re.ss @@ -0,0 +1,301 @@ +(module re mzscheme + (require (lib "match.ss") + (lib "list.ss") + "util.ss") + + (provide ->re build-epsilon build-zero build-char-set build-concat + build-repeat build-or build-and build-neg + epsilonR? zeroR? char-setR? concatR? repeatR? orR? andR? negR? + char-setR-chars concatR-re1 concatR-re2 repeatR-re orR-res + andR-res negR-re + re-nullable? re-index) + + ;; get-index : -> nat + (define get-index (make-counter)) + + ;; An re is either + ;; - (make-epsilonR bool nat) + ;; - (make-zeroR bool nat) + ;; - (make-char-setR bool nat (list-of char)) The list must be sorted + ;; - (make-concatR bool nat re re) + ;; - (make-repeatR bool nat re) + ;; - (make-orR bool nat (list-of re)) Must not directly contain any orRs + ;; - (make-andR bool nat (list-of re)) Must not directly contain any andRs + ;; - (make-negR bool nat re) + ;; + ;; Every re must have an index field globally different from all + ;; other re index fields. + (define-struct re (nullable? index)) + (define-struct (epsilonR re) ()) + (define-struct (zeroR re) ()) + (define-struct (char-setR re) (chars) (make-inspector)) + (define-struct (concatR re) (re1 re2) (make-inspector)) + (define-struct (repeatR re) (re)) + (define-struct (orR re) (res) (make-inspector)) + (define-struct (andR re) (res)) + (define-struct (negR re) (re)) + + ;; e : re + ;; The unique epsilon re + (define e (make-epsilonR #t (get-index))) + + ;; z : re + ;; The unique zero re + (define z (make-zeroR #f (get-index))) + + + ;; s-re = char match the given character + ;; | string match its sequence of characters + ;; | re a precompiled re + ;; | (epsilon) match the empty string + ;; | (* s-re) match 0 or more + ;; | (+ s-re) match 1 or more + ;; | (? s-re) match 0 or 1 + ;; | (: s-re ...) match one of the sub-expressions + ;; | (@ s-re ...) match each sub-expression in succession + ;; | (- char char) match any character between two (inclusive) + ;; | (^ char_or_range ...1) match any character not listed + ;; (the null concatenation `(@) means epsilon as does "". + ;; the null or `(:) means match nothing. The null carat `(^) means match + ;; any character.) + + ;; ->re : s-re cache -> re + (define (->re exp cache) + (match exp + ((? char?) (build-char-set (list exp) cache)) + ((? string?) (->re `(@ ,@(string->list exp)) cache)) + ((? re?) exp) + (`(epsilon) (build-epsilon)) + (`(* ,r) + (build-repeat (->re r cache) cache)) + (`(+ ,r) + (->re `(@ ,r (* ,r)) cache)) + (`(? ,r) + (let ((c (->re r cache))) + (if (re-nullable? c) + c + (build-or (list e c) cache)))) + (`(: ,r) + (->re r cache)) + (`(: ,rs ...) + (build-or (flatten-res (map (lambda (r) (->re r cache)) rs) cache) + cache)) + (`(@ ,rs ...) + (foldr (lambda (x y) + (build-concat (->re x cache) y cache)) + e + rs)) + (`(- ,c1 ,c2) + (let ((i1 (char->integer (if (string? c1) (string-ref c1 0) c1))) + (i2 (char->integer (if (string? c2) (string-ref c2 0) c2)))) + (if (<= i1 i2) + (build-char-set (make-range i1 i2) cache) + z))) + (`(^ ,crs ...) + (let ((cs (->re `(: ,@crs) cache))) + (cond + ((zeroR? cs) (build-char-set (make-range 0 255) cache)) + ((char-setR? cs) + (build-char-set + (let loop ((bad-chars (map char->integer + (char-setR-chars cs))) + (i 0)) + (cond + ((> i 255) null) + ((and (not (null? bad-chars)) + (= i (car bad-chars))) + (loop (cdr bad-chars) (add1 i))) + (else + (cons (integer->char i) (loop bad-chars (add1 i)))))) + cache)) + (else z)))))) + + + + + ;; flatten-res: (list-of re) cache -> (list-of re) + ;; Takes all the char-sets in l and combines them into one element. + ;; Removes orRs too, + (define (flatten-res l cache) + (let loop ((res l) + (chars null) + (no-chars null)) + (cond + ((null? res) + (if (null? chars) + no-chars + (cons (build-char-set (mergesort chars char re + (define (build-epsilon) e) + + (define (build-zero) z) + + ;; build-char-set : (list-of char) cache -> re + ;; cs must be sorted + #;(define (build-char-set cs cache) + (cond + ((null? cs) z) + (else + (make-char-setR #f (get-index) cs)))) + + (define (build-char-set cs cache) + (cond + ((null? cs) z) + (else + (cache cs + (lambda () + (make-char-setR #f (get-index) cs)))))) + + + + ;; build-concat : re re cache -> re + (define (build-concat r1 r2 cache) + (cond + ((eq? e r1) r2) + ((eq? e r2) r1) + ((or (eq? z r1) (eq? z r2)) z) + (else + (let* ((i1 (re-index r1)) + (i2 (re-index r2)) + (key (if (< i1 i2) + (cons i1 i2) + (cons i2 i1)))) + (cache (cons 'concat key) + (lambda () + (make-concatR (and (re-nullable? r1) (re-nullable? r2)) + (get-index) + r1 r2))))))) + + ;; build-repeat : re cache -> re + (define (build-repeat r cache) + (cache (cons 'repeat (re-index r)) + (lambda () + (make-repeatR #t (get-index) r)))) + + + ;; build-or : (list-of re) cache -> re + (define (build-or rs cache) + (let ((rs + (filter + (lambda (x) (not (eq? x z))) + (do-simple-equiv (replace rs orR? orR-res null) re-index)))) + (cond + ((null? rs) z) + ((null? (cdr rs)) (car rs)) + (else + (cache (cons 'or (map re-index rs)) + (lambda () + (make-orR (ormap re-nullable? rs) (get-index) rs))))))) + + ;; build-and : (list-of re) cache -> re + (define (build-and rs cache) + (let ((rs (do-simple-equiv (replace rs andR? andR-res null) rs))) + (cond + ((ormap (lambda (x) (eq? x z)) rs) z) + (else + (cache (cons 'and (map re-index rs)) + (lambda () + (make-andR (andmap re-nullable? rs) (get-index) rs))))))) + + ;; build-neg : re cache -> re + (define (build-neg r cache) + (cache (cons 'neg (re-index r)) + (lambda () + (make-negR (not (re-nullable? r)) (get-index) r)))) + + (test-block ((c (make-cache)) + (r1 (->re #\1 c)) + (r2 (->re #\2 c)) + (rc (->re `(@ ,r1 ,r2) c)) + (rc2 (->re `(@ ,r2 ,r1) c)) + (rr (->re `(* ,rc) c)) + (ro (->re `(: ,rr ,rc ,rr) c)) + (ro2 (->re `(: ,rc ,rr ,z) c)) + (ro3 (->re `(: ,rr ,rc) c))) + (rc rc2) + (ro ro2) + (ro ro3) + ((->re `(* ,rc) c) rr) + ((build-char-set null c) z) + ((->re `(@ ,r1 (epsilon)) c) r1) + ((->re `(@ (epsilon) ,r1) c) r1) + ((->re `(@ ,r1 ,z) c) z) + ((->re `(@ ,z ,r1) c) z) + ((->re `(@ ,z (epsilon)) c) z) + ((->re `(@ (epsilon) ,z) c) z) + ((->re `(:) c) z) + ((->re `(: ,rr) c) rr) + ((build-or `(,z ,r1 ,z) c) r1) + ((build-or (list + (build-or (list r1 r2) c) + (build-or (list rc rr) c)) + c) + (build-or (list r1 r2 rc rr) c)) + ((concatR-re1 rc2) r1) + ((concatR-re2 rc2) r2) + ((orR-res ro) (list rc rr)) + ((repeatR-re rr) rc) + ((re-nullable? r1) #f) + ((re-nullable? rc) #f) + ((re-nullable? (->re `(@ ,rr ,rr) c)) #t) + ((re-nullable? rr) #t) + ((re-nullable? ro) #t) + ((re-nullable? (->re `(: ,r1 ,r2) c)) #f)) + + (test-block ((c (make-cache)) + (r1 (->re #\1 c)) + (r2 (->re #\2 c)) + (r3-5 (->re '(- #\3 #\5) c)) + (r4 (build-or `(,r1 ,r2) c)) + (r5 (->re `(: ,r3-5 #\7) c)) + (r6 (->re #\6 c))) + ((flatten-res null c) null) + ((char-setR-chars (car (flatten-res `(,r1) c))) '(#\1)) + ((char-setR-chars (car (flatten-res `(,r4) c))) '(#\1 #\2)) + ((char-setR-chars (car (flatten-res `(,r6 ,r5 ,r4 ,r3-5 ,r2 ,r1) c))) + (string->list "1234567"))) + + (test-block ((c (make-cache)) + (r (->re #\a c)) + (rr (->re `(@ ,r ,r) c)) + (rrr (->re `(@ ,r ,rr) c)) + (rrr* (->re `(* ,rrr) c))) + ((char-setR-chars r) '(#\a)) + ((->re "" c) e) + ((->re "asdf" c) (->re `(@ #\a #\s #\d #\f) c)) + ((->re r c) r) + ((->re `(epsilon) c) e) + ((->re `(* ,r) c) (build-repeat r c)) + ((->re `(+ ,r) c) (build-concat r (build-repeat r c) c)) + ((->re `(? ,r) c) (build-or (list e r) c)) + ((->re `(? ,rrr*) c) rrr*) + ((->re `(: (: (- #\a #\c) (^ (- #\000 #\110) (- #\112 #\377))) + (: (* #\2))) c) + (build-or (list (build-char-set (list #\111 #\a #\b #\c) c) + (build-repeat (build-char-set '(#\2) c) c)) + c)) + ((->re `(: ,rr ,rrr) c) (build-or (list rr rrr) c)) + ((->re `(: ,r) c) r) + ((->re `(:) c) z) + ((->re `(@) c) e) + ((->re `(@ ,rrr*) c) rrr*) + (rr (build-concat r r c)) + ((->re `(@ ,r ,rr ,rrr) c) + (build-concat r (build-concat rr rrr c) c)) + ((char-setR-chars (->re `(- #\1 #\1) c)) '(#\1)) + ((char-setR-chars (->re `(- #\1 #\9) c)) (string->list "123456789")) + ((char-setR-chars (->re `(- "1" "1") c)) '(#\1)) + ((char-setR-chars (->re `(- "1" "9") c)) (string->list "123456789")) + ((->re `(- "9" "1") c) z) + ((char-setR-chars (->re `(^) c)) + (char-setR-chars (->re `(- #\000 #\377) c))) + ((char-setR-chars (->re `(^ #\001 (- #\002 #\377)) c)) `(#\000)) + ) + + ) \ No newline at end of file diff --git a/collects/parser-tools/private-lex/stx.ss b/collects/parser-tools/private-lex/stx.ss new file mode 100644 index 0000000..7f8fd4b --- /dev/null +++ b/collects/parser-tools/private-lex/stx.ss @@ -0,0 +1,104 @@ +(module stx mzscheme + (require (lib "stx.ss" "syntax") + "util.ss") + + (provide parse) + + + (define (num-arg-err s expect given) + (raise-syntax-error + 'regular-expression + (format "operator expects ~a arguments, given ~a" expect given) + s)) + + + ;; parse : syntax-object -> s-re (see re.ss) + ;; checks for errors and generates the plain s-exp form for s + (define (parse s) + (let ((s-e (syntax-e s))) + (cond + ((char? s-e) s-e) + ((string? s-e) s-e) + ((symbol? s-e) + (let ((expand (syntax-local-value s (lambda () #f)))) + (unless (lex-abbrev? expand) + (raise-syntax-error 'regular-expression "undefined abbreviation" s)) + (parse (lex-abbrev-abbrev expand)))) + ((stx-null? s) + (raise-syntax-error 'regular-expression "invalid regular expression" s)) + ((stx-list? s) + (let* ((ar (stx->list (stx-cdr s))) + (num-args (length ar))) + (case (syntax-e (stx-car s)) + ((epsilon) '(epsilon)) + ((*) + (unless (= num-args 1) + (num-arg-err s 1 num-args)) + `(* ,(parse (car ar)))) + ((+) + (unless (= num-args 1) + (num-arg-err s 1 num-args)) + `(+ ,(parse (car ar)))) + ((?) + (unless (= num-args 1) + (num-arg-err s 1 num-args)) + `(? ,(parse (car ar)))) + ((:) `(: ,@(map parse ar))) + ((@) `(@ ,@(map parse ar))) + ((-) + (unless (= num-args 2) + (num-arg-err s 2 num-args)) + (let ((c1 (parse (car ar))) + (c2 (parse (cadr ar)))) + (if (and (or (char? c1) (and (string? c1) (= 1 (string-length c1)))) + (or (char? c2) (and (string? c2) (= 1 (string-length c2))))) + (let ((i1 (char->integer (if (char? c1) c1 (string-ref c1 0)))) + (i2 (char->integer (if (char? c2) c2 (string-ref c2 0))))) + (if (<= i1 i2) + `(- ,c1 ,c2) + (raise-syntax-error + 'regular-expression + (format "first argument ~a does not preceed second argument ~a" + c1 c2) + s))) + (raise-syntax-error + 'regular-expression + (format "expects single character arguments, given ~a and ~a" + (syntax-object->datum (car ar)) + (syntax-object->datum (cadr ar))) + s)))) + ((^) + (let ((res (map parse ar))) + (if (not (andmap pure-char? res)) + (raise-syntax-error + 'regular-expression + (format + "expects single character or character range arguments, given ~a" + (map syntax-object->datum ar)) + s)) + `(^ ,@res))) + (else + (raise-syntax-error + 'regular-expression + "invalid operator" + s))))) + (else + (raise-syntax-error + 'regular-expression + "invalid regular expression" + s))))) + + (define (pure-char? s-re) + (cond + ((char? s-re) #t) + ((string? s-re) (= (string-length s-re) 1)) + ((list? s-re) + (let ((op (car s-re))) + (case op + ((: ^) (andmap pure-char? (cdr s-re))) + ((-) #t) + (else #f)))) + (else #f))) + + + ) \ No newline at end of file diff --git a/collects/parser-tools/private-lex/util.ss b/collects/parser-tools/private-lex/util.ss new file mode 100644 index 0000000..168a530 --- /dev/null +++ b/collects/parser-tools/private-lex/util.ss @@ -0,0 +1,180 @@ +(module util mzscheme + (require (lib "list.ss")) + + (provide (all-defined-except split-acc)) + + (define-struct lex-abbrev (abbrev)) + + (define-syntax test-block + (syntax-rules () + ((_ defs (code right-ans) ...) + (let* defs + (let ((real-ans code)) + (unless (equal? real-ans right-ans) + (printf "Test failed: ~e gave ~e. Expected ~e~n" + 'code real-ans 'right-ans))) ...)))) + + #;(define-syntax test-block + (syntax-rules () + ((_ x ...) (void)))) + + + ;; A cache is (X ( -> Y) -> Y) + ;; make-cache : -> cache + ;; table map Xs to Ys. If key is mapped, its value is returned. + ;; Otherwise, build is invoked and its result is placed in the table and + ;; returned. + ;; Xs are compared with equal? + (define (make-cache) + (let ((table (make-hash-table 'equal))) + (lambda (key build) + (hash-table-get table key + (lambda () + (let ((new (build))) + (hash-table-put! table key new) + new)))))) + (test-block ((cache (make-cache))) + ((cache '(1 2) (lambda () 9)) 9) + ((cache '(2 1) (lambda () 8)) 8) + ((cache '(1 2) (lambda () 1)) 9)) + + + ;; make-counter : -> -> nat + ;; makes a function that returns a higher number by 1, each time + ;; it is called. + (define (make-counter) + (let ((counter 0)) + (lambda () + (begin0 + counter + (set! counter (add1 counter)))))) + (test-block ((c (make-counter)) + (d (make-counter))) + ((c) 0) + ((d) 0) + ((c) 1) + ((d) 1) + ((c) 2)) + + + ;; remove-dups : (list-of X) (X -> number) -> (list-of X) + ;; removes the entries from l that have the same index as a + ;; previous entry. l must be grouped by indexes. + (define (remove-dups l index acc) + (cond + ((null? l) (reverse acc)) + ((null? acc) (remove-dups (cdr l) index (cons (car l) acc))) + ((= (index (car acc)) (index (car l))) + (remove-dups (cdr l) index acc)) + (else + (remove-dups (cdr l) index (cons (car l) acc))))) + + (test-block () + ((remove-dups '((1 2) (2 2) (1 3) (1 4) (100 4) (0 5)) cadr null) + '((1 2) (1 3) (1 4) (0 5))) + ((remove-dups null error null) null)) + + + ;; do-simple-equiv : (list-of X) (X -> nat) -> (list-of X) + ;; Sorts l according to index and removes the entries with duplicate + ;; indexes. + (define (do-simple-equiv l index) + (let ((ordered (mergesort l (lambda (a b) (< (index a) (index b)))))) + (remove-dups ordered index null))) + + (test-block () + ((do-simple-equiv '((2 2) (1 4) (1 2) (100 4) (1 3) (0 5)) cadr) + '((2 2) (1 3) (1 4) (0 5))) + ((do-simple-equiv null error) null)) + + + ;; replace : (list-of X) (X -> bool) (X -> (list-of X)) (list-of X) -> + ;; (list-of X) + ;; If (pred? r) for some r in l, splice (get r) in place of r in the resulting + ;; list. + (define (replace l pred? get acc) + (cond + ((null? l) acc) + ((pred? (car l)) (replace (cdr l) pred? get (append (get (car l)) acc))) + (else (replace (cdr l) pred? get (cons (car l) acc))))) + + (test-block () + ((replace null void (lambda () (list 1)) null) null) + ((replace '(1 2 3 4 3 5) + (lambda (x) (= x 3)) + (lambda (x) (list 1 2 3)) + null) + '(5 1 2 3 4 1 2 3 2 1))) + + ;; make-range : int * int -> char list + ;; creates a list of all chars between i and j. i <= j + (define (make-range i j) + (letrec ((make-range + (lambda (i j) + (cond + ((= i j) (list (integer->char i))) + (else + (cons (integer->char i) (make-range (add1 i) j))))))) + (make-range i j))) + (test-block () + ((make-range 97 110) (string->list "abcdefghijklmn")) + ((make-range 111 111) '(#\o))) + + + + ;; merge : (list-of char) (list-of char) -> (list-of char) + ;; Combines 2 sorted, duplicate-free lists into 1, removing duplicates. + (define (merge l1 l2) + (cond + ((null? l2) l1) + ((null? l1) l2) + (else (let ((cl1 (car l1)) + (cl2 (car l2))) + (cond + ((> (char->integer cl1) (char->integer cl2)) + (cons cl2 (merge l1 (cdr l2)))) + ((< (char->integer cl1) (char->integer cl2)) + (cons cl1 (merge (cdr l1) l2))) + (else (merge (cdr l1) l2))))))) + (test-block () + ((merge (string->list "abcd") + (string->list "abde")) + (string->list "abcde")) + ((merge null null) null) + ((merge null '(#\1)) '(#\1)) + ((merge '(#\1) null) '(#\1))) + + (define (split-acc l1 l2 i l1-i l2-i) + (cond + ((null? l1) (values (reverse! i) (reverse! l1-i) (reverse! (append! (reverse l2) l2-i)))) + ((null? l2) (values (reverse! i) (reverse! (append! (reverse l1) l1-i)) (reverse! l2-i))) + (else (let ((cl1 (car l1)) + (cl2 (car l2))) + (cond + ((> (char->integer cl1) (char->integer cl2)) + (split-acc l1 (cdr l2) i l1-i (cons cl2 l2-i))) + ((< (char->integer cl1) (char->integer cl2)) + (split-acc (cdr l1) l2 i (cons cl1 l1-i) l2-i)) + (else + (split-acc (cdr l1) (cdr l2) (cons cl1 i) l1-i l2-i))))))) + + ;; split : (list-of char) (list-of char) -> (list-of char) (list-of char) (list-of char) + ;; Takes sorted, duplicate-free l1 and l2 and returns (l1 intersect l2), + ;; l1 - (l1 intersect l2) and l2 - (l1 intersect l2) + (define (split l1 l2) + (split-acc l1 l2 null null null)) + + (test-block () + ((let-values (((a b c) + (split (string->list "abcdghjkl") + (string->list "abdeijmn")))) + (list a b c)) + (list (string->list "abdj") (string->list "cghkl") (string->list "eimn"))) + ((let-values (((a b c) (split null null))) + (list a b c)) (list null null null)) + ((let-values (((a b c) (split '(#\1) null))) + (list a b c)) (list null '(#\1) null)) + ((let-values (((a b c) (split null '(#\1)))) + (list a b c)) (list null null '(#\1)))) + + ) \ No newline at end of file