From 6329271600f6a22e9ab1ec322321f96915010329 Mon Sep 17 00:00:00 2001 From: Jack Firth Date: Sun, 8 May 2022 19:30:42 -0700 Subject: [PATCH] Rework regex API again --- lexer/private/regex-vm.rkt | 239 ++++++++++++++++-------------- lexer/private/regular-match.rkt | 137 +++++++++++++++++ lexer/private/regular-pattern.rkt | 150 +++++++++---------- private/hash.rkt | 19 +++ 4 files changed, 352 insertions(+), 193 deletions(-) create mode 100644 lexer/private/regular-match.rkt create mode 100644 private/hash.rkt diff --git a/lexer/private/regex-vm.rkt b/lexer/private/regex-vm.rkt index db03a93..c6b2a29 100644 --- a/lexer/private/regex-vm.rkt +++ b/lexer/private/regex-vm.rkt @@ -12,7 +12,9 @@ rebellion/collection/vector rebellion/streaming/reducer rebellion/streaming/transducer - syntax/parse/define) + syntax/parse/define + yaragg/lexer/private/regular-match + yaragg/private/hash) (module+ test @@ -28,13 +30,6 @@ ;; https://swtch.com/~rsc/regexp/regexp2.html. -(define (hash->immutable-hash h) - (if (and (hash? h) (immutable? h)) - h - (for/hash ([(k v) (in-hash h)]) - (values k v)))) - - (define (compiled-regex-with-labels program labels) (compiled-regex (for/vector ([instruction (in-vector program)]) @@ -48,7 +43,8 @@ (struct compiled-regex (program) #:transparent - #:guard (λ (instructions _) (sequence->vector instructions))) + #:guard + (λ (instructions _) (sequence->vector instructions))) (struct regex-instruction () #:transparent) @@ -59,24 +55,13 @@ (struct split-instruction regex-instruction (primary-address secondary-address) #:transparent) (struct labeled-jump-instruction regex-instruction (label) #:transparent) (struct labeled-split-instruction regex-instruction (primary-label secondary-label) #:transparent) -(struct match-instruction regex-instruction (mode) #:transparent) -(struct save-instruction regex-instruction (savepoint) #:transparent) +(struct match-instruction regex-instruction () #:transparent) +(struct start-group-instruction regex-instruction (key) #:transparent) +(struct finish-group-instruction regex-instruction (key) #:transparent) (struct fail-instruction regex-instruction () #:transparent) -(define (compiled-regex-savepoint-count compiled) - (transduce (compiled-regex-program compiled) - (filtering save-instruction?) - (mapping save-instruction-savepoint) - #:into into-count)) - - -(struct regex-execution-result (mode savepoints) - #:transparent - #:guard (λ (mode savepoints _) (values mode (sequence->vector savepoints)))) - - -(struct regex-thread (program-counter savepoints) #:transparent) +(struct regex-thread (program-counter captured-groups) #:transparent) (struct thread-list ([size #:mutable] threads-by-priority-order threads-by-program-counter) @@ -92,29 +77,36 @@ (define (thread-list-add! threads thread #:program program #:input input #:input-index i) - (let loop ([thread thread] [i i] [pi i]) + (let loop ([thread thread] [i i] [peek i] [max-peek i]) (define pc (regex-thread-program-counter thread)) - (define savepoints (regex-thread-savepoints thread)) + (define groups (regex-thread-captured-groups thread)) (match (vector-ref program pc) - [(jump-instruction address) (loop (regex-thread address savepoints) i pi)] + [(jump-instruction address) (loop (regex-thread address groups) i peek max-peek)] [(split-instruction primary secondary) - (define secondary-savepoints (vector-copy savepoints)) - (loop (regex-thread primary savepoints) i pi) - (loop (regex-thread secondary secondary-savepoints) i pi)] - [(save-instruction savepoint) - (vector-set! savepoints savepoint i) - (loop (regex-thread (add1 pc) savepoints) i pi)] + (define secondary-groups (captured-groups-builder-copy groups)) + (max (loop (regex-thread primary groups) i peek max-peek) + (loop (regex-thread secondary secondary-groups) i peek max-peek))] + [(start-group-instruction key) + (captured-groups-builder-start-group! groups key i) + (loop (regex-thread (add1 pc) groups) i peek max-peek)] + [(finish-group-instruction key) + (captured-groups-builder-finish-group! groups key i) + (loop (regex-thread (add1 pc) groups) i peek max-peek)] [(peek-instruction expected) - (when (equal? (string-ref input pi) expected) - (loop (regex-thread (add1 pc) savepoints) i (add1 pi)))] - [(reset-peek-instruction) (loop (regex-thread (add1 pc) savepoints) i i)] + (cond + [(equal? (string-ref input peek) expected) + (define next-peek (add1 peek)) + (loop (regex-thread (add1 pc) groups) i next-peek (max max-peek next-peek))] + [else max-peek])] + [(reset-peek-instruction) (loop (regex-thread (add1 pc) groups) i i max-peek)] [_ (define by-pc (thread-list-threads-by-program-counter threads)) (unless (vector-ref by-pc pc) (define size (thread-list-size threads)) (vector-set! by-pc pc thread) (vector-set! (thread-list-threads-by-priority-order threads) size thread) - (set-thread-list-size! threads (add1 size)))]))) + (set-thread-list-size! threads (add1 size))) + max-peek]))) (define (thread-list-clear! threads) @@ -130,63 +122,73 @@ (set-thread-list-size! threads 0)) -(define (compiled-regex-match-string r str) - (define strlen (string-length str)) +(define (compiled-regex-match-string r str [start 0] [end (string-length str)]) (define program (compiled-regex-program r)) - (define savepoint-count (compiled-regex-savepoint-count r)) - (define (make-thread [pc 0] [savepoints (make-vector savepoint-count #false)]) - (regex-thread pc savepoints)) + (define (make-thread [pc 0] [groups (make-captured-groups-builder)]) + (regex-thread pc groups)) (define running-threads (make-thread-list #:program-size (vector-length program))) (define blocked-threads (make-thread-list #:program-size (vector-length program))) - (thread-list-add! running-threads (make-thread) #:program program #:input str #:input-index 0) + (define max-peek + (thread-list-add! running-threads (make-thread) #:program program #:input str #:input-index start)) (for/fold ([running-threads running-threads] [blocked-threads blocked-threads] [last-match #false] - #:result last-match) - ([input-index (in-range 0 (add1 strlen))]) - (let loop ([i 0]) + [max-peek max-peek] + #:result (or last-match (regular-match-failure start (- max-peek start)))) + ([input-index (in-range start (add1 end))]) + (let loop ([i 0] [max-peek max-peek]) (cond [(equal? i (thread-list-size running-threads)) (thread-list-clear! running-threads) - (values blocked-threads running-threads last-match)] + (values blocked-threads running-threads last-match max-peek)] [else (define thread (thread-list-get running-threads i)) (define pc (regex-thread-program-counter thread)) - (define savepoints (regex-thread-savepoints thread)) + (define groups (regex-thread-captured-groups thread)) (match (vector-ref program pc) [(read-instruction c) - (when (and (< input-index strlen) (equal? (string-ref str input-index) c)) - (define next-thread (make-thread (add1 pc) savepoints)) - (thread-list-add! blocked-threads - next-thread - #:program program - #:input str - #:input-index input-index)) - (loop (add1 i))] - [(match-instruction mode) + (cond + [(and (< input-index end) (equal? (string-ref str input-index) c)) + (define next-thread (make-thread (add1 pc) groups)) + (define next-max-peek + (thread-list-add! blocked-threads + next-thread + #:program program + #:input str + #:input-index (add1 input-index))) + (loop (add1 i) (max max-peek next-max-peek))] + [else + (loop (add1 i) (max (add1 input-index) max-peek))])] + [(match-instruction) (thread-list-clear! running-threads) - (values blocked-threads running-threads (regex-execution-result mode savepoints))])])))) + (define resulting-match + (regular-match start input-index + #:peek-distance (- max-peek input-index) + #:groups (build-captured-groups groups))) + (values blocked-threads running-threads resulting-match max-peek)])])))) (module+ test (test-case (name-string match-instruction) - (define r (compiled-regex (list (match-instruction 42)))) - (check-equal? (compiled-regex-match-string r "aaaaaaaa") (regex-execution-result 42 '())) - (check-equal? (compiled-regex-match-string r "") (regex-execution-result 42 '()))) + (define r (compiled-regex (list (match-instruction)))) + (define expected (regular-match 0 0)) + (check-equal? (compiled-regex-match-string r "aaaaaaaa") expected) + (check-equal? (compiled-regex-match-string r "") expected)) (test-case (name-string read-instruction) (test-case "reading one character" - (define r (compiled-regex (list (read-instruction #\a) (match-instruction 0)))) - (check-equal? (compiled-regex-match-string r "a") (regex-execution-result 0 '())) - (check-equal? (compiled-regex-match-string r "aaa") (regex-execution-result 0 '())) - (check-equal? (compiled-regex-match-string r "ab") (regex-execution-result 0 '())) - (check-false (compiled-regex-match-string r "b")) - (check-false (compiled-regex-match-string r "")) - (check-false (compiled-regex-match-string r "ba"))) + (define r (compiled-regex (list (read-instruction #\a) (match-instruction)))) + (define expected (regular-match 0 1)) + (check-equal? (compiled-regex-match-string r "a") expected) + (check-equal? (compiled-regex-match-string r "aaa") expected) + (check-equal? (compiled-regex-match-string r "ab") expected) + (check-equal? (compiled-regex-match-string r "b") (regular-match-failure 0 1)) + (check-equal? (compiled-regex-match-string r "") (regular-match-failure 0 1)) + (check-equal? (compiled-regex-match-string r "ba") (regular-match-failure 0 1))) (test-case "reading multiple characters" (define r @@ -195,79 +197,94 @@ (read-instruction #\a) (read-instruction #\b) (read-instruction #\c) - (match-instruction 0)))) - (check-equal? (compiled-regex-match-string r "abc") (regex-execution-result 0 '())) - (check-equal? (compiled-regex-match-string r "abcxxx") (regex-execution-result 0 '())) - (check-false (compiled-regex-match-string r "cba")) - (check-false (compiled-regex-match-string r "a")) - (check-false (compiled-regex-match-string r "ab")) - (check-false (compiled-regex-match-string r "aaa")) - (check-false (compiled-regex-match-string r "bbb")) - (check-false (compiled-regex-match-string r "ccc")) - (check-false (compiled-regex-match-string r "aabc")))) + (match-instruction)))) + (define expected (regular-match 0 3)) + (check-equal? (compiled-regex-match-string r "abc") expected) + (check-equal? (compiled-regex-match-string r "abcxxx") expected) + (check-equal? (compiled-regex-match-string r "cba") (regular-match-failure 0 1)) + (check-equal? (compiled-regex-match-string r "a") (regular-match-failure 0 2)) + (check-equal? (compiled-regex-match-string r "ab") (regular-match-failure 0 3)) + (check-equal? (compiled-regex-match-string r "aaa") (regular-match-failure 0 2)) + (check-equal? (compiled-regex-match-string r "bbb") (regular-match-failure 0 1)) + (check-equal? (compiled-regex-match-string r "ccc") (regular-match-failure 0 1)) + (check-equal? (compiled-regex-match-string r "aabc") (regular-match-failure 0 2)))) (test-case (name-string jump-instruction) (define r (compiled-regex (list - (jump-instruction 2) (read-instruction #\a) (read-instruction #\b) (match-instruction 0)))) - (check-equal? (compiled-regex-match-string r "b") (regex-execution-result 0 '())) - (check-false (compiled-regex-match-string r "a")) - (check-false (compiled-regex-match-string r "c"))) + (jump-instruction 2) (read-instruction #\a) (read-instruction #\b) (match-instruction)))) + (check-equal? (compiled-regex-match-string r "b") (regular-match 0 1)) + (check-equal? (compiled-regex-match-string r "a") (regular-match-failure 0 1)) + (check-equal? (compiled-regex-match-string r "c") (regular-match-failure 0 1))) - (test-case (name-string split-instruction) + (test-case "group capturing instructions" (define r (compiled-regex (list - (split-instruction 1 3) + (start-group-instruction 'a) + (read-instruction #\a) (read-instruction #\a) - (match-instruction 0) + (read-instruction #\a) + (finish-group-instruction 'a) + (start-group-instruction 'b) + (read-instruction #\b) + (read-instruction #\b) (read-instruction #\b) - (match-instruction 1)))) - (check-equal? (compiled-regex-match-string r "a") (regex-execution-result 0 '())) - (check-equal? (compiled-regex-match-string r "b") (regex-execution-result 1 '())) - (check-equal? (compiled-regex-match-string r "ab") (regex-execution-result 0 '())) - (check-equal? (compiled-regex-match-string r "ba") (regex-execution-result 1 '())) - (check-false (compiled-regex-match-string r "c")) - (check-false (compiled-regex-match-string r ""))) - - (test-case (name-string save-instruction) + (finish-group-instruction 'b) + (match-instruction)))) + (define expected + (regular-match 0 6 + #:groups (hash 'a (list (captured-group 0 3)) 'b (list (captured-group 3 6))))) + (check-equal? (compiled-regex-match-string r "aaabbb") expected)) + + (test-case (name-string split-instruction) (define r (compiled-regex (list - (save-instruction 0) + (split-instruction 1 5) + (start-group-instruction 'a) (read-instruction #\a) - (read-instruction #\a) - (read-instruction #\a) - (save-instruction 1) - (read-instruction #\b) - (read-instruction #\b) + (finish-group-instruction 'a) + (match-instruction) + (start-group-instruction 'b) (read-instruction #\b) - (save-instruction 2) - (match-instruction 0)))) - (check-equal? (compiled-regex-match-string r "aaabbb") (regex-execution-result 0 '(0 2 5)))) + (finish-group-instruction 'b) + (match-instruction)))) + (define a-match (regular-match 0 1 #:groups (hash 'a (list (captured-group 0 1))))) + (define b-match (regular-match 0 1 #:groups (hash 'b (list (captured-group 0 1))))) + (check-equal? (compiled-regex-match-string r "a") a-match) + (check-equal? (compiled-regex-match-string r "b") b-match) + (check-equal? (compiled-regex-match-string r "ab") a-match) + (check-equal? (compiled-regex-match-string r "ba") b-match) + (check-equal? (compiled-regex-match-string r "c") (regular-match-failure 0 1)) + (check-equal? (compiled-regex-match-string r "") (regular-match-failure 0 1))) (test-case (name-string peek-instruction) (define r (compiled-regex (list - (save-instruction 0) + (start-group-instruction 'a) (peek-instruction #\a) (peek-instruction #\b) (peek-instruction #\c) - (save-instruction 1) - (match-instruction 0)))) - (check-equal? (compiled-regex-match-string r "abc") (regex-execution-result 0 '(0 0)))) + (finish-group-instruction 'a) + (match-instruction)))) + (define expected + (regular-match 0 0 #:peek-distance 3 #:groups (hash 'a (list (captured-group 0 0))))) + (check-equal? (compiled-regex-match-string r "abc") expected)) (test-case (name-string reset-peek-instruction) (define r (compiled-regex (list - (save-instruction 0) + (start-group-instruction 'a) (peek-instruction #\a) (peek-instruction #\b) (reset-peek-instruction) (peek-instruction #\a) - (save-instruction 1) - (match-instruction 0)))) - (check-equal? (compiled-regex-match-string r "abc") (regex-execution-result 0 '(0 0))))) + (finish-group-instruction 'a) + (match-instruction)))) + (define expected + (regular-match 0 0 #:peek-distance 2 #:groups (hash 'a (list (captured-group 0 0))))) + (check-equal? (compiled-regex-match-string r "abc") expected))) diff --git a/lexer/private/regular-match.rkt b/lexer/private/regular-match.rkt new file mode 100644 index 0000000..8c42af4 --- /dev/null +++ b/lexer/private/regular-match.rkt @@ -0,0 +1,137 @@ +#lang racket/base + + +(require racket/contract/base) + + +(provide + (contract-out + [regular-match + (->* (exact-nonnegative-integer? exact-nonnegative-integer?) + (#:peek-distance exact-nonnegative-integer? + #:groups (hash/c any/c (sequence/c captured-group?))) + regular-match?)] + [regular-match? predicate/c] + [regular-match-start (-> regular-match? exact-nonnegative-integer?)] + [regular-match-end (-> regular-match? exact-nonnegative-integer?)] + [regular-match-peek-distance (-> regular-match? exact-nonnegative-integer?)] + [regular-match-captured-groups + (-> regular-match? (hash/c any/c sorted-set? #:immutable #true #:flat? #true))] + [regular-match-failure + (-> exact-nonnegative-integer? exact-nonnegative-integer? regular-match-failure?)] + [regular-match-failure? predicate/c] + [regular-match-failure-start (-> regular-match-failure? exact-nonnegative-integer?)] + [regular-match-failure-peek-distance (-> regular-match-failure? exact-nonnegative-integer?)] + [captured-group (-> exact-nonnegative-integer? exact-nonnegative-integer? captured-group?)] + [captured-group? predicate/c] + [captured-group-start (-> captured-group? exact-nonnegative-integer?)] + [captured-group-end (-> captured-group? exact-nonnegative-integer?)] + [make-captured-groups-builder (-> captured-groups-builder?)] + [captured-groups-builder? predicate/c] + [captured-groups-builder-start-group! + (-> captured-groups-builder? any/c exact-nonnegative-integer? captured-groups-builder?)] + [captured-groups-builder-finish-group! + (-> captured-groups-builder? any/c exact-nonnegative-integer? captured-groups-builder?)] + [captured-groups-builder-copy (-> captured-groups-builder? captured-groups-builder?)] + [build-captured-groups + (-> captured-groups-builder? (hash/c any/c sorted-set? #:immutable #true #:flat? #true))])) + + +(require racket/match + racket/sequence + rebellion/base/comparator + rebellion/collection/sorted-set + rebellion/private/static-name + yaragg/private/hash) + + +;@---------------------------------------------------------------------------------------------------- + + +(struct regular-match (start end peek-distance captured-groups) + #:constructor-name constructor:regular-match + #:omit-define-syntaxes + #:transparent + + #:guard + (λ (position span peek-distance captured-groups _) + (define sorted-groups + (for/hash ([(key groups) (in-hash captured-groups)]) + (values key (sequence->sorted-set groups #:comparator captured-group<=>)))) + (values position span peek-distance sorted-groups)) + + #:property prop:custom-print-quotable 'never) + + +(struct regular-match-failure (start peek-distance) #:transparent) + + +(define (regular-match start end + #:peek-distance [peek-distance 0] + #:groups [groups (hash)]) + (constructor:regular-match start end peek-distance groups)) + + +(struct captured-group (start end) #:transparent) + + +(define captured-group<=> + (comparator-chain (comparator-map natural<=> captured-group-start) + (comparator-map natural<=> captured-group-end))) + + +(struct captured-groups-builder (started-groups finished-groups)) + + +(define (make-captured-groups-builder) + (captured-groups-builder (make-hash) (make-hash))) + + +(define (captured-groups-builder-copy builder) + (define started (hash-copy (captured-groups-builder-started-groups builder))) + (define finished (make-hash)) + (for ([(key group-set-builder) (in-hash (captured-groups-builder-finished-groups builder))]) + (define group-set-copy + (sorted-set-builder-add-all + (make-sorted-set-builder captured-group<=>) (build-sorted-set group-set-builder))) + (hash-set! finished key group-set-copy)) + (captured-groups-builder started finished)) + + +(define (captured-groups-builder-start-group! builder key input-position) + (define started (captured-groups-builder-started-groups builder)) + (when (hash-has-key? started key) + (raise-arguments-error (name captured-groups-builder-start-group!) + "already started capturing a group for this capture key" + "capture key" key + "previous start" (hash-ref started key) + "next start" input-position)) + (hash-set! started key input-position) + builder) + + +(define (captured-groups-builder-finish-group! builder key input-position) + (define started (captured-groups-builder-started-groups builder)) + (unless (hash-has-key? started key) + (raise-arguments-error (name captured-groups-builder-finish-group!) + "can't finish a capture group for this key, no group started yet" + "capture key" key + "finish position" input-position)) + (define finished (captured-groups-builder-finished-groups builder)) + (define start (hash-ref started key)) + (hash-remove! started key) + (define group (captured-group start input-position)) + (define groups-for-key (hash-ref! finished key (λ () (make-sorted-set-builder captured-group<=>)))) + (sorted-set-builder-add groups-for-key group) + builder) + + +(define (build-captured-groups builder) + (define started (captured-groups-builder-started-groups builder)) + (unless (hash-empty? (captured-groups-builder-started-groups builder)) + (raise-arguments-error (name build-captured-groups) + "some capture groups were started but not finished" + "unfinished groups" started)) + (define finished (captured-groups-builder-finished-groups builder)) + (for/hash ([(key group-set-builder) (in-hash finished)]) + (values key (build-sorted-set group-set-builder)))) diff --git a/lexer/private/regular-pattern.rkt b/lexer/private/regular-pattern.rkt index a6993e9..f32bda1 100644 --- a/lexer/private/regular-pattern.rkt +++ b/lexer/private/regular-pattern.rkt @@ -10,7 +10,8 @@ [element-pattern (-> any/c regular-pattern?)] [element-string-pattern (-> (sequence/c any/c) regular-pattern?)] [element-set-pattern (-> (sequence/c any/c) regular-pattern?)] - [group-pattern (->* ((sequence/c regular-pattern?)) (#:capture? boolean?) regular-pattern?)] + [group-pattern + (->* ((sequence/c regular-pattern?)) (#:capture-key (not/c #false)) regular-pattern?)] [choice-pattern (-> (sequence/c regular-pattern?) regular-pattern?)] [repetition-pattern (->i ([subpattern regular-pattern?]) @@ -21,14 +22,15 @@ "minimum repetition count cannot be greater than the maximum repetition count" (or (unsupplied-arg? min-count) (unsupplied-arg? max-count) (<= min-count max-count)) [_ regular-pattern?])] + [optional-pattern (->* (regular-pattern?) (#:greedy? boolean?) regular-pattern?)] [lookahead-pattern (-> regular-pattern? regular-pattern?)] - [regular-pattern-compile (-> regular-pattern? compiled-regex?)] - [regular-patterns-compile (-> (sequence/c regular-pattern?) compiled-regex?)])) + [regular-pattern-compile (-> regular-pattern? compiled-regex?)])) (require racket/match racket/sequence racket/set + rebellion/base/option rebellion/collection/vector rebellion/collection/vector/builder rebellion/streaming/transducer @@ -65,15 +67,15 @@ (choice-pattern choices)) -(struct group-pattern regular-pattern (subpatterns capture?) +(struct group-pattern regular-pattern (subpatterns capture-key) #:transparent #:name struct-transformer:group-pattern #:constructor-name constructor:group-pattern - #:guard (λ (subpatterns capture? _) (values (sequence->vector subpatterns) capture?))) + #:guard (λ (subpatterns capture-key _) (values (sequence->vector subpatterns) capture-key))) -(define (group-pattern #:capture? [capture? #false] subpatterns) - (constructor:group-pattern subpatterns capture?)) +(define (group-pattern #:capture-key [capture-key #false] subpatterns) + (constructor:group-pattern subpatterns (falsey->option capture-key))) (struct choice-pattern regular-pattern (choices) @@ -100,20 +102,17 @@ (constructor:repetition-pattern subpattern min-count max-count greedy?)) -(struct lookahead-pattern regular-pattern (subpattern) #:transparent) +(define (optional-pattern subpattern #:greedy? [greedy? #true]) + (repetition-pattern subpattern #:min-count 0 #:max-count 1 #:greedy? greedy?)) -(define (regular-pattern-compile pattern) - (regular-patterns-compile (vector-immutable pattern))) +(struct lookahead-pattern regular-pattern (subpattern) #:transparent) -(define (regular-patterns-compile patterns) - (define pattern-vector (sequence->vector patterns)) - (define pattern-count (vector-length pattern-vector)) +(define (regular-pattern-compile pattern) (define instructions (make-vector-builder)) (define labels (make-hash)) (define instruction-counter 0) - (define savepoint-counter 0) (define label-counter 0) (define (next-label!) @@ -121,11 +120,6 @@ (set! label-counter (add1 next)) next) - (define (next-savepoint!) - (define next savepoint-counter) - (set! savepoint-counter (add1 next)) - next) - (define (label! key) (hash-set! labels key instruction-counter)) @@ -133,8 +127,7 @@ (vector-builder-add instructions instruction) (set! instruction-counter (add1 instruction-counter))) - (define (compile-pattern! pattern pattern-index) - (let loop ([pattern pattern] [peeking? #false]) + (let loop ([pattern pattern] [peeking? #false]) (match pattern [(element-pattern expected) @@ -144,13 +137,14 @@ (loop subpattern #true) (add-instruction! (reset-peek-instruction))] - [(struct-transformer:group-pattern subpatterns capture?) - (when capture? - (add-instruction! (save-instruction (next-savepoint!)))) + [(struct-transformer:group-pattern subpatterns (== absent)) (for ([subpattern (in-vector subpatterns)]) - (loop subpattern peeking?)) - (when capture? - (add-instruction! (save-instruction (next-savepoint!))))] + (loop subpattern peeking?))] + + [(struct-transformer:group-pattern subpatterns (present key)) + (add-instruction! (start-group-instruction key)) + (loop (group-pattern subpatterns) peeking?) + (add-instruction! (finish-group-instruction key))] [(struct-transformer:choice-pattern choices) (define post-choice-label (next-label!)) @@ -198,27 +192,17 @@ (loop subpattern peeking?)) (loop (repetition-pattern subpattern #:max-count (- m n) #:greedy? greedy?) peeking?)])) - (add-instruction! (match-instruction pattern-index))) - - (for ([pattern (in-vector pattern-vector 0 (sub1 pattern-count))] - [i (in-naturals)]) - (define pattern-label (next-label!)) - (define skip-label (next-label!)) - (add-instruction! (labeled-split-instruction pattern-label skip-label)) - (label! pattern-label) - (compile-pattern! pattern i) - (label! skip-label)) - (define last-index (sub1 pattern-count)) - (compile-pattern! (vector-ref pattern-vector last-index) last-index) + (add-instruction! (match-instruction)) + (compiled-regex-with-labels (build-vector instructions) labels)) (module+ test - (test-case (name-string regular-patterns-compile) + (test-case (name-string regular-pattern-compile) (test-case (name-string element-pattern) (define pattern (element-pattern #\a)) - (define expected (compiled-regex (list (read-instruction #\a) (match-instruction 0)))) + (define expected (compiled-regex (list (read-instruction #\a) (match-instruction)))) (check-equal? (regular-pattern-compile pattern) expected)) (define a (element-pattern #\a)) @@ -235,20 +219,20 @@ (read-instruction #\a) (read-instruction #\b) (read-instruction #\c) - (match-instruction 0)))) + (match-instruction)))) (check-equal? (regular-pattern-compile pattern) expected)) (test-case "capturing" - (define pattern (group-pattern (list a b c) #:capture? #true)) + (define pattern (group-pattern (list a b c) #:capture-key 'foo)) (define expected (compiled-regex (list - (save-instruction 0) + (start-group-instruction 'foo) (read-instruction #\a) (read-instruction #\b) (read-instruction #\c) - (save-instruction 1) - (match-instruction 0)))) + (finish-group-instruction 'foo) + (match-instruction)))) (check-equal? (regular-pattern-compile pattern) expected))) (test-case (name-string choice-pattern) @@ -263,7 +247,7 @@ (read-instruction #\b) (jump-instruction 7) (read-instruction #\c) - (match-instruction 0)))) + (match-instruction)))) (check-equal? (regular-pattern-compile pattern) expected)) (test-case (name-string element-string-pattern) @@ -295,7 +279,7 @@ (read-instruction #\b) (read-instruction #\c) (jump-instruction 0) - (match-instruction 0)))) + (match-instruction)))) (check-equal? (regular-pattern-compile pattern) expected)) (test-case "non-greedy without quantifiers" @@ -308,7 +292,7 @@ (read-instruction #\b) (read-instruction #\c) (jump-instruction 0) - (match-instruction 0)))) + (match-instruction)))) (check-equal? (regular-pattern-compile pattern) expected)) (test-case "greedy with minimum quantity" @@ -330,7 +314,7 @@ (read-instruction #\b) (read-instruction #\c) (jump-instruction 9) - (match-instruction 0)))) + (match-instruction)))) (check-equal? (regular-pattern-compile pattern) expected)) (test-case "non-greedy with minimum quantity" @@ -352,7 +336,7 @@ (read-instruction #\b) (read-instruction #\c) (jump-instruction 9) - (match-instruction 0)))) + (match-instruction)))) (check-equal? (regular-pattern-compile pattern) expected)) (test-case "greedy with maximum quantity" @@ -372,7 +356,7 @@ (read-instruction #\a) (read-instruction #\b) (read-instruction #\c) - (match-instruction 0)))) + (match-instruction)))) (check-equal? (regular-pattern-compile pattern) expected)) (test-case "non-greedy with maximum quantity" @@ -392,7 +376,7 @@ (read-instruction #\a) (read-instruction #\b) (read-instruction #\c) - (match-instruction 0)))) + (match-instruction)))) (check-equal? (regular-pattern-compile pattern) expected)) (test-case "greedy with minimum and maximum quantity" @@ -417,7 +401,7 @@ (read-instruction #\a) (read-instruction #\b) (read-instruction #\c) - (match-instruction 0)))) + (match-instruction)))) (check-equal? (regular-pattern-compile pattern) expected)) (test-case "non-greedy with minimum and maximum quantity" @@ -442,9 +426,36 @@ (read-instruction #\a) (read-instruction #\b) (read-instruction #\c) - (match-instruction 0)))) + (match-instruction)))) (check-equal? (regular-pattern-compile pattern) expected))) + (test-case (name-string optional-pattern) + + (test-case "greedy" + (define pattern (optional-pattern abc)) + (define expected + (compiled-regex + (list + (split-instruction 1 4) + (read-instruction #\a) + (read-instruction #\b) + (read-instruction #\c) + (match-instruction)))) + (check-equal? (regular-pattern-compile pattern) expected)) + + (test-case "non-greedy" + (define pattern (optional-pattern abc #:greedy? #false)) + (define expected + (compiled-regex + (list + (split-instruction 4 1) + (read-instruction #\a) + (read-instruction #\b) + (read-instruction #\c) + (match-instruction)))) + (check-equal? (regular-pattern-compile pattern) expected))) + + (test-case (name-string lookahead-pattern) (define pattern (lookahead-pattern abc)) (define expected @@ -454,30 +465,5 @@ (peek-instruction #\b) (peek-instruction #\c) (reset-peek-instruction) - (match-instruction 0)))) - (check-equal? (regular-pattern-compile pattern) expected)) - - (test-case "multiple patterns" - (define patterns - (list - (element-string-pattern "aaa") - (element-string-pattern "bbb") - (element-string-pattern "ccc"))) - (define expected - (compiled-regex - (list - (split-instruction 1 5) - (read-instruction #\a) - (read-instruction #\a) - (read-instruction #\a) - (match-instruction 0) - (split-instruction 6 10) - (read-instruction #\b) - (read-instruction #\b) - (read-instruction #\b) - (match-instruction 1) - (read-instruction #\c) - (read-instruction #\c) - (read-instruction #\c) - (match-instruction 2)))) - (check-equal? (regular-patterns-compile patterns) expected)))) + (match-instruction)))) + (check-equal? (regular-pattern-compile pattern) expected)))) diff --git a/private/hash.rkt b/private/hash.rkt new file mode 100644 index 0000000..d0478db --- /dev/null +++ b/private/hash.rkt @@ -0,0 +1,19 @@ +#lang racket/base + + +(require racket/contract/base) + + +(provide + (contract-out + [hash->immutable-hash (-> hash? (and/c hash? immutable?))])) + + +;@---------------------------------------------------------------------------------------------------- + + +(define (hash->immutable-hash h) + (if (and (hash? h) (immutable? h)) + h + (for/hash ([(k v) (in-hash h)]) + (values k v))))