Rework regex API again

remotes/jackfirth/master
Jack Firth 2 years ago
parent 5a3315d6e8
commit 6329271600

@ -12,7 +12,9 @@
rebellion/collection/vector rebellion/collection/vector
rebellion/streaming/reducer rebellion/streaming/reducer
rebellion/streaming/transducer rebellion/streaming/transducer
syntax/parse/define) syntax/parse/define
yaragg/lexer/private/regular-match
yaragg/private/hash)
(module+ test (module+ test
@ -28,13 +30,6 @@
;; https://swtch.com/~rsc/regexp/regexp2.html. ;; https://swtch.com/~rsc/regexp/regexp2.html.
(define (hash->immutable-hash h)
(if (and (hash? h) (immutable? h))
h
(for/hash ([(k v) (in-hash h)])
(values k v))))
(define (compiled-regex-with-labels program labels) (define (compiled-regex-with-labels program labels)
(compiled-regex (compiled-regex
(for/vector ([instruction (in-vector program)]) (for/vector ([instruction (in-vector program)])
@ -48,7 +43,8 @@
(struct compiled-regex (program) (struct compiled-regex (program)
#:transparent #:transparent
#:guard (λ (instructions _) (sequence->vector instructions))) #:guard
(λ (instructions _) (sequence->vector instructions)))
(struct regex-instruction () #:transparent) (struct regex-instruction () #:transparent)
@ -59,24 +55,13 @@
(struct split-instruction regex-instruction (primary-address secondary-address) #:transparent) (struct split-instruction regex-instruction (primary-address secondary-address) #:transparent)
(struct labeled-jump-instruction regex-instruction (label) #:transparent) (struct labeled-jump-instruction regex-instruction (label) #:transparent)
(struct labeled-split-instruction regex-instruction (primary-label secondary-label) #:transparent) (struct labeled-split-instruction regex-instruction (primary-label secondary-label) #:transparent)
(struct match-instruction regex-instruction (mode) #:transparent) (struct match-instruction regex-instruction () #:transparent)
(struct save-instruction regex-instruction (savepoint) #:transparent) (struct start-group-instruction regex-instruction (key) #:transparent)
(struct finish-group-instruction regex-instruction (key) #:transparent)
(struct fail-instruction regex-instruction () #:transparent) (struct fail-instruction regex-instruction () #:transparent)
(define (compiled-regex-savepoint-count compiled) (struct regex-thread (program-counter captured-groups) #:transparent)
(transduce (compiled-regex-program compiled)
(filtering save-instruction?)
(mapping save-instruction-savepoint)
#:into into-count))
(struct regex-execution-result (mode savepoints)
#:transparent
#:guard (λ (mode savepoints _) (values mode (sequence->vector savepoints))))
(struct regex-thread (program-counter savepoints) #:transparent)
(struct thread-list ([size #:mutable] threads-by-priority-order threads-by-program-counter) (struct thread-list ([size #:mutable] threads-by-priority-order threads-by-program-counter)
@ -92,29 +77,36 @@
(define (thread-list-add! threads thread #:program program #:input input #:input-index i) (define (thread-list-add! threads thread #:program program #:input input #:input-index i)
(let loop ([thread thread] [i i] [pi i]) (let loop ([thread thread] [i i] [peek i] [max-peek i])
(define pc (regex-thread-program-counter thread)) (define pc (regex-thread-program-counter thread))
(define savepoints (regex-thread-savepoints thread)) (define groups (regex-thread-captured-groups thread))
(match (vector-ref program pc) (match (vector-ref program pc)
[(jump-instruction address) (loop (regex-thread address savepoints) i pi)] [(jump-instruction address) (loop (regex-thread address groups) i peek max-peek)]
[(split-instruction primary secondary) [(split-instruction primary secondary)
(define secondary-savepoints (vector-copy savepoints)) (define secondary-groups (captured-groups-builder-copy groups))
(loop (regex-thread primary savepoints) i pi) (max (loop (regex-thread primary groups) i peek max-peek)
(loop (regex-thread secondary secondary-savepoints) i pi)] (loop (regex-thread secondary secondary-groups) i peek max-peek))]
[(save-instruction savepoint) [(start-group-instruction key)
(vector-set! savepoints savepoint i) (captured-groups-builder-start-group! groups key i)
(loop (regex-thread (add1 pc) savepoints) i pi)] (loop (regex-thread (add1 pc) groups) i peek max-peek)]
[(finish-group-instruction key)
(captured-groups-builder-finish-group! groups key i)
(loop (regex-thread (add1 pc) groups) i peek max-peek)]
[(peek-instruction expected) [(peek-instruction expected)
(when (equal? (string-ref input pi) expected) (cond
(loop (regex-thread (add1 pc) savepoints) i (add1 pi)))] [(equal? (string-ref input peek) expected)
[(reset-peek-instruction) (loop (regex-thread (add1 pc) savepoints) i i)] (define next-peek (add1 peek))
(loop (regex-thread (add1 pc) groups) i next-peek (max max-peek next-peek))]
[else max-peek])]
[(reset-peek-instruction) (loop (regex-thread (add1 pc) groups) i i max-peek)]
[_ [_
(define by-pc (thread-list-threads-by-program-counter threads)) (define by-pc (thread-list-threads-by-program-counter threads))
(unless (vector-ref by-pc pc) (unless (vector-ref by-pc pc)
(define size (thread-list-size threads)) (define size (thread-list-size threads))
(vector-set! by-pc pc thread) (vector-set! by-pc pc thread)
(vector-set! (thread-list-threads-by-priority-order threads) size thread) (vector-set! (thread-list-threads-by-priority-order threads) size thread)
(set-thread-list-size! threads (add1 size)))]))) (set-thread-list-size! threads (add1 size)))
max-peek])))
(define (thread-list-clear! threads) (define (thread-list-clear! threads)
@ -130,63 +122,73 @@
(set-thread-list-size! threads 0)) (set-thread-list-size! threads 0))
(define (compiled-regex-match-string r str) (define (compiled-regex-match-string r str [start 0] [end (string-length str)])
(define strlen (string-length str))
(define program (compiled-regex-program r)) (define program (compiled-regex-program r))
(define savepoint-count (compiled-regex-savepoint-count r))
(define (make-thread [pc 0] [savepoints (make-vector savepoint-count #false)]) (define (make-thread [pc 0] [groups (make-captured-groups-builder)])
(regex-thread pc savepoints)) (regex-thread pc groups))
(define running-threads (make-thread-list #:program-size (vector-length program))) (define running-threads (make-thread-list #:program-size (vector-length program)))
(define blocked-threads (make-thread-list #:program-size (vector-length program))) (define blocked-threads (make-thread-list #:program-size (vector-length program)))
(thread-list-add! running-threads (make-thread) #:program program #:input str #:input-index 0) (define max-peek
(thread-list-add! running-threads (make-thread) #:program program #:input str #:input-index start))
(for/fold ([running-threads running-threads] (for/fold ([running-threads running-threads]
[blocked-threads blocked-threads] [blocked-threads blocked-threads]
[last-match #false] [last-match #false]
#:result last-match) [max-peek max-peek]
([input-index (in-range 0 (add1 strlen))]) #:result (or last-match (regular-match-failure start (- max-peek start))))
(let loop ([i 0]) ([input-index (in-range start (add1 end))])
(let loop ([i 0] [max-peek max-peek])
(cond (cond
[(equal? i (thread-list-size running-threads)) [(equal? i (thread-list-size running-threads))
(thread-list-clear! running-threads) (thread-list-clear! running-threads)
(values blocked-threads running-threads last-match)] (values blocked-threads running-threads last-match max-peek)]
[else [else
(define thread (thread-list-get running-threads i)) (define thread (thread-list-get running-threads i))
(define pc (regex-thread-program-counter thread)) (define pc (regex-thread-program-counter thread))
(define savepoints (regex-thread-savepoints thread)) (define groups (regex-thread-captured-groups thread))
(match (vector-ref program pc) (match (vector-ref program pc)
[(read-instruction c) [(read-instruction c)
(when (and (< input-index strlen) (equal? (string-ref str input-index) c)) (cond
(define next-thread (make-thread (add1 pc) savepoints)) [(and (< input-index end) (equal? (string-ref str input-index) c))
(thread-list-add! blocked-threads (define next-thread (make-thread (add1 pc) groups))
next-thread (define next-max-peek
#:program program (thread-list-add! blocked-threads
#:input str next-thread
#:input-index input-index)) #:program program
(loop (add1 i))] #:input str
[(match-instruction mode) #:input-index (add1 input-index)))
(loop (add1 i) (max max-peek next-max-peek))]
[else
(loop (add1 i) (max (add1 input-index) max-peek))])]
[(match-instruction)
(thread-list-clear! running-threads) (thread-list-clear! running-threads)
(values blocked-threads running-threads (regex-execution-result mode savepoints))])])))) (define resulting-match
(regular-match start input-index
#:peek-distance (- max-peek input-index)
#:groups (build-captured-groups groups)))
(values blocked-threads running-threads resulting-match max-peek)])]))))
(module+ test (module+ test
(test-case (name-string match-instruction) (test-case (name-string match-instruction)
(define r (compiled-regex (list (match-instruction 42)))) (define r (compiled-regex (list (match-instruction))))
(check-equal? (compiled-regex-match-string r "aaaaaaaa") (regex-execution-result 42 '())) (define expected (regular-match 0 0))
(check-equal? (compiled-regex-match-string r "") (regex-execution-result 42 '()))) (check-equal? (compiled-regex-match-string r "aaaaaaaa") expected)
(check-equal? (compiled-regex-match-string r "") expected))
(test-case (name-string read-instruction) (test-case (name-string read-instruction)
(test-case "reading one character" (test-case "reading one character"
(define r (compiled-regex (list (read-instruction #\a) (match-instruction 0)))) (define r (compiled-regex (list (read-instruction #\a) (match-instruction))))
(check-equal? (compiled-regex-match-string r "a") (regex-execution-result 0 '())) (define expected (regular-match 0 1))
(check-equal? (compiled-regex-match-string r "aaa") (regex-execution-result 0 '())) (check-equal? (compiled-regex-match-string r "a") expected)
(check-equal? (compiled-regex-match-string r "ab") (regex-execution-result 0 '())) (check-equal? (compiled-regex-match-string r "aaa") expected)
(check-false (compiled-regex-match-string r "b")) (check-equal? (compiled-regex-match-string r "ab") expected)
(check-false (compiled-regex-match-string r "")) (check-equal? (compiled-regex-match-string r "b") (regular-match-failure 0 1))
(check-false (compiled-regex-match-string r "ba"))) (check-equal? (compiled-regex-match-string r "") (regular-match-failure 0 1))
(check-equal? (compiled-regex-match-string r "ba") (regular-match-failure 0 1)))
(test-case "reading multiple characters" (test-case "reading multiple characters"
(define r (define r
@ -195,79 +197,94 @@
(read-instruction #\a) (read-instruction #\a)
(read-instruction #\b) (read-instruction #\b)
(read-instruction #\c) (read-instruction #\c)
(match-instruction 0)))) (match-instruction))))
(check-equal? (compiled-regex-match-string r "abc") (regex-execution-result 0 '())) (define expected (regular-match 0 3))
(check-equal? (compiled-regex-match-string r "abcxxx") (regex-execution-result 0 '())) (check-equal? (compiled-regex-match-string r "abc") expected)
(check-false (compiled-regex-match-string r "cba")) (check-equal? (compiled-regex-match-string r "abcxxx") expected)
(check-false (compiled-regex-match-string r "a")) (check-equal? (compiled-regex-match-string r "cba") (regular-match-failure 0 1))
(check-false (compiled-regex-match-string r "ab")) (check-equal? (compiled-regex-match-string r "a") (regular-match-failure 0 2))
(check-false (compiled-regex-match-string r "aaa")) (check-equal? (compiled-regex-match-string r "ab") (regular-match-failure 0 3))
(check-false (compiled-regex-match-string r "bbb")) (check-equal? (compiled-regex-match-string r "aaa") (regular-match-failure 0 2))
(check-false (compiled-regex-match-string r "ccc")) (check-equal? (compiled-regex-match-string r "bbb") (regular-match-failure 0 1))
(check-false (compiled-regex-match-string r "aabc")))) (check-equal? (compiled-regex-match-string r "ccc") (regular-match-failure 0 1))
(check-equal? (compiled-regex-match-string r "aabc") (regular-match-failure 0 2))))
(test-case (name-string jump-instruction) (test-case (name-string jump-instruction)
(define r (define r
(compiled-regex (compiled-regex
(list (list
(jump-instruction 2) (read-instruction #\a) (read-instruction #\b) (match-instruction 0)))) (jump-instruction 2) (read-instruction #\a) (read-instruction #\b) (match-instruction))))
(check-equal? (compiled-regex-match-string r "b") (regex-execution-result 0 '())) (check-equal? (compiled-regex-match-string r "b") (regular-match 0 1))
(check-false (compiled-regex-match-string r "a")) (check-equal? (compiled-regex-match-string r "a") (regular-match-failure 0 1))
(check-false (compiled-regex-match-string r "c"))) (check-equal? (compiled-regex-match-string r "c") (regular-match-failure 0 1)))
(test-case (name-string split-instruction) (test-case "group capturing instructions"
(define r (define r
(compiled-regex (compiled-regex
(list (list
(split-instruction 1 3) (start-group-instruction 'a)
(read-instruction #\a)
(read-instruction #\a) (read-instruction #\a)
(match-instruction 0) (read-instruction #\a)
(finish-group-instruction 'a)
(start-group-instruction 'b)
(read-instruction #\b)
(read-instruction #\b)
(read-instruction #\b) (read-instruction #\b)
(match-instruction 1)))) (finish-group-instruction 'b)
(check-equal? (compiled-regex-match-string r "a") (regex-execution-result 0 '())) (match-instruction))))
(check-equal? (compiled-regex-match-string r "b") (regex-execution-result 1 '())) (define expected
(check-equal? (compiled-regex-match-string r "ab") (regex-execution-result 0 '())) (regular-match 0 6
(check-equal? (compiled-regex-match-string r "ba") (regex-execution-result 1 '())) #:groups (hash 'a (list (captured-group 0 3)) 'b (list (captured-group 3 6)))))
(check-false (compiled-regex-match-string r "c")) (check-equal? (compiled-regex-match-string r "aaabbb") expected))
(check-false (compiled-regex-match-string r "")))
(test-case (name-string split-instruction)
(test-case (name-string save-instruction)
(define r (define r
(compiled-regex (compiled-regex
(list (list
(save-instruction 0) (split-instruction 1 5)
(start-group-instruction 'a)
(read-instruction #\a) (read-instruction #\a)
(read-instruction #\a) (finish-group-instruction 'a)
(read-instruction #\a) (match-instruction)
(save-instruction 1) (start-group-instruction 'b)
(read-instruction #\b)
(read-instruction #\b)
(read-instruction #\b) (read-instruction #\b)
(save-instruction 2) (finish-group-instruction 'b)
(match-instruction 0)))) (match-instruction))))
(check-equal? (compiled-regex-match-string r "aaabbb") (regex-execution-result 0 '(0 2 5)))) (define a-match (regular-match 0 1 #:groups (hash 'a (list (captured-group 0 1)))))
(define b-match (regular-match 0 1 #:groups (hash 'b (list (captured-group 0 1)))))
(check-equal? (compiled-regex-match-string r "a") a-match)
(check-equal? (compiled-regex-match-string r "b") b-match)
(check-equal? (compiled-regex-match-string r "ab") a-match)
(check-equal? (compiled-regex-match-string r "ba") b-match)
(check-equal? (compiled-regex-match-string r "c") (regular-match-failure 0 1))
(check-equal? (compiled-regex-match-string r "") (regular-match-failure 0 1)))
(test-case (name-string peek-instruction) (test-case (name-string peek-instruction)
(define r (define r
(compiled-regex (compiled-regex
(list (list
(save-instruction 0) (start-group-instruction 'a)
(peek-instruction #\a) (peek-instruction #\a)
(peek-instruction #\b) (peek-instruction #\b)
(peek-instruction #\c) (peek-instruction #\c)
(save-instruction 1) (finish-group-instruction 'a)
(match-instruction 0)))) (match-instruction))))
(check-equal? (compiled-regex-match-string r "abc") (regex-execution-result 0 '(0 0)))) (define expected
(regular-match 0 0 #:peek-distance 3 #:groups (hash 'a (list (captured-group 0 0)))))
(check-equal? (compiled-regex-match-string r "abc") expected))
(test-case (name-string reset-peek-instruction) (test-case (name-string reset-peek-instruction)
(define r (define r
(compiled-regex (compiled-regex
(list (list
(save-instruction 0) (start-group-instruction 'a)
(peek-instruction #\a) (peek-instruction #\a)
(peek-instruction #\b) (peek-instruction #\b)
(reset-peek-instruction) (reset-peek-instruction)
(peek-instruction #\a) (peek-instruction #\a)
(save-instruction 1) (finish-group-instruction 'a)
(match-instruction 0)))) (match-instruction))))
(check-equal? (compiled-regex-match-string r "abc") (regex-execution-result 0 '(0 0))))) (define expected
(regular-match 0 0 #:peek-distance 2 #:groups (hash 'a (list (captured-group 0 0)))))
(check-equal? (compiled-regex-match-string r "abc") expected)))

@ -0,0 +1,137 @@
#lang racket/base
(require racket/contract/base)
(provide
(contract-out
[regular-match
(->* (exact-nonnegative-integer? exact-nonnegative-integer?)
(#:peek-distance exact-nonnegative-integer?
#:groups (hash/c any/c (sequence/c captured-group?)))
regular-match?)]
[regular-match? predicate/c]
[regular-match-start (-> regular-match? exact-nonnegative-integer?)]
[regular-match-end (-> regular-match? exact-nonnegative-integer?)]
[regular-match-peek-distance (-> regular-match? exact-nonnegative-integer?)]
[regular-match-captured-groups
(-> regular-match? (hash/c any/c sorted-set? #:immutable #true #:flat? #true))]
[regular-match-failure
(-> exact-nonnegative-integer? exact-nonnegative-integer? regular-match-failure?)]
[regular-match-failure? predicate/c]
[regular-match-failure-start (-> regular-match-failure? exact-nonnegative-integer?)]
[regular-match-failure-peek-distance (-> regular-match-failure? exact-nonnegative-integer?)]
[captured-group (-> exact-nonnegative-integer? exact-nonnegative-integer? captured-group?)]
[captured-group? predicate/c]
[captured-group-start (-> captured-group? exact-nonnegative-integer?)]
[captured-group-end (-> captured-group? exact-nonnegative-integer?)]
[make-captured-groups-builder (-> captured-groups-builder?)]
[captured-groups-builder? predicate/c]
[captured-groups-builder-start-group!
(-> captured-groups-builder? any/c exact-nonnegative-integer? captured-groups-builder?)]
[captured-groups-builder-finish-group!
(-> captured-groups-builder? any/c exact-nonnegative-integer? captured-groups-builder?)]
[captured-groups-builder-copy (-> captured-groups-builder? captured-groups-builder?)]
[build-captured-groups
(-> captured-groups-builder? (hash/c any/c sorted-set? #:immutable #true #:flat? #true))]))
(require racket/match
racket/sequence
rebellion/base/comparator
rebellion/collection/sorted-set
rebellion/private/static-name
yaragg/private/hash)
;@----------------------------------------------------------------------------------------------------
(struct regular-match (start end peek-distance captured-groups)
#:constructor-name constructor:regular-match
#:omit-define-syntaxes
#:transparent
#:guard
(λ (position span peek-distance captured-groups _)
(define sorted-groups
(for/hash ([(key groups) (in-hash captured-groups)])
(values key (sequence->sorted-set groups #:comparator captured-group<=>))))
(values position span peek-distance sorted-groups))
#:property prop:custom-print-quotable 'never)
(struct regular-match-failure (start peek-distance) #:transparent)
(define (regular-match start end
#:peek-distance [peek-distance 0]
#:groups [groups (hash)])
(constructor:regular-match start end peek-distance groups))
(struct captured-group (start end) #:transparent)
(define captured-group<=>
(comparator-chain (comparator-map natural<=> captured-group-start)
(comparator-map natural<=> captured-group-end)))
(struct captured-groups-builder (started-groups finished-groups))
(define (make-captured-groups-builder)
(captured-groups-builder (make-hash) (make-hash)))
(define (captured-groups-builder-copy builder)
(define started (hash-copy (captured-groups-builder-started-groups builder)))
(define finished (make-hash))
(for ([(key group-set-builder) (in-hash (captured-groups-builder-finished-groups builder))])
(define group-set-copy
(sorted-set-builder-add-all
(make-sorted-set-builder captured-group<=>) (build-sorted-set group-set-builder)))
(hash-set! finished key group-set-copy))
(captured-groups-builder started finished))
(define (captured-groups-builder-start-group! builder key input-position)
(define started (captured-groups-builder-started-groups builder))
(when (hash-has-key? started key)
(raise-arguments-error (name captured-groups-builder-start-group!)
"already started capturing a group for this capture key"
"capture key" key
"previous start" (hash-ref started key)
"next start" input-position))
(hash-set! started key input-position)
builder)
(define (captured-groups-builder-finish-group! builder key input-position)
(define started (captured-groups-builder-started-groups builder))
(unless (hash-has-key? started key)
(raise-arguments-error (name captured-groups-builder-finish-group!)
"can't finish a capture group for this key, no group started yet"
"capture key" key
"finish position" input-position))
(define finished (captured-groups-builder-finished-groups builder))
(define start (hash-ref started key))
(hash-remove! started key)
(define group (captured-group start input-position))
(define groups-for-key (hash-ref! finished key (λ () (make-sorted-set-builder captured-group<=>))))
(sorted-set-builder-add groups-for-key group)
builder)
(define (build-captured-groups builder)
(define started (captured-groups-builder-started-groups builder))
(unless (hash-empty? (captured-groups-builder-started-groups builder))
(raise-arguments-error (name build-captured-groups)
"some capture groups were started but not finished"
"unfinished groups" started))
(define finished (captured-groups-builder-finished-groups builder))
(for/hash ([(key group-set-builder) (in-hash finished)])
(values key (build-sorted-set group-set-builder))))

@ -10,7 +10,8 @@
[element-pattern (-> any/c regular-pattern?)] [element-pattern (-> any/c regular-pattern?)]
[element-string-pattern (-> (sequence/c any/c) regular-pattern?)] [element-string-pattern (-> (sequence/c any/c) regular-pattern?)]
[element-set-pattern (-> (sequence/c any/c) regular-pattern?)] [element-set-pattern (-> (sequence/c any/c) regular-pattern?)]
[group-pattern (->* ((sequence/c regular-pattern?)) (#:capture? boolean?) regular-pattern?)] [group-pattern
(->* ((sequence/c regular-pattern?)) (#:capture-key (not/c #false)) regular-pattern?)]
[choice-pattern (-> (sequence/c regular-pattern?) regular-pattern?)] [choice-pattern (-> (sequence/c regular-pattern?) regular-pattern?)]
[repetition-pattern [repetition-pattern
(->i ([subpattern regular-pattern?]) (->i ([subpattern regular-pattern?])
@ -21,14 +22,15 @@
"minimum repetition count cannot be greater than the maximum repetition count" "minimum repetition count cannot be greater than the maximum repetition count"
(or (unsupplied-arg? min-count) (unsupplied-arg? max-count) (<= min-count max-count)) (or (unsupplied-arg? min-count) (unsupplied-arg? max-count) (<= min-count max-count))
[_ regular-pattern?])] [_ regular-pattern?])]
[optional-pattern (->* (regular-pattern?) (#:greedy? boolean?) regular-pattern?)]
[lookahead-pattern (-> regular-pattern? regular-pattern?)] [lookahead-pattern (-> regular-pattern? regular-pattern?)]
[regular-pattern-compile (-> regular-pattern? compiled-regex?)] [regular-pattern-compile (-> regular-pattern? compiled-regex?)]))
[regular-patterns-compile (-> (sequence/c regular-pattern?) compiled-regex?)]))
(require racket/match (require racket/match
racket/sequence racket/sequence
racket/set racket/set
rebellion/base/option
rebellion/collection/vector rebellion/collection/vector
rebellion/collection/vector/builder rebellion/collection/vector/builder
rebellion/streaming/transducer rebellion/streaming/transducer
@ -65,15 +67,15 @@
(choice-pattern choices)) (choice-pattern choices))
(struct group-pattern regular-pattern (subpatterns capture?) (struct group-pattern regular-pattern (subpatterns capture-key)
#:transparent #:transparent
#:name struct-transformer:group-pattern #:name struct-transformer:group-pattern
#:constructor-name constructor:group-pattern #:constructor-name constructor:group-pattern
#:guard (λ (subpatterns capture? _) (values (sequence->vector subpatterns) capture?))) #:guard (λ (subpatterns capture-key _) (values (sequence->vector subpatterns) capture-key)))
(define (group-pattern #:capture? [capture? #false] subpatterns) (define (group-pattern #:capture-key [capture-key #false] subpatterns)
(constructor:group-pattern subpatterns capture?)) (constructor:group-pattern subpatterns (falsey->option capture-key)))
(struct choice-pattern regular-pattern (choices) (struct choice-pattern regular-pattern (choices)
@ -100,20 +102,17 @@
(constructor:repetition-pattern subpattern min-count max-count greedy?)) (constructor:repetition-pattern subpattern min-count max-count greedy?))
(struct lookahead-pattern regular-pattern (subpattern) #:transparent) (define (optional-pattern subpattern #:greedy? [greedy? #true])
(repetition-pattern subpattern #:min-count 0 #:max-count 1 #:greedy? greedy?))
(define (regular-pattern-compile pattern) (struct lookahead-pattern regular-pattern (subpattern) #:transparent)
(regular-patterns-compile (vector-immutable pattern)))
(define (regular-patterns-compile patterns) (define (regular-pattern-compile pattern)
(define pattern-vector (sequence->vector patterns))
(define pattern-count (vector-length pattern-vector))
(define instructions (make-vector-builder)) (define instructions (make-vector-builder))
(define labels (make-hash)) (define labels (make-hash))
(define instruction-counter 0) (define instruction-counter 0)
(define savepoint-counter 0)
(define label-counter 0) (define label-counter 0)
(define (next-label!) (define (next-label!)
@ -121,11 +120,6 @@
(set! label-counter (add1 next)) (set! label-counter (add1 next))
next) next)
(define (next-savepoint!)
(define next savepoint-counter)
(set! savepoint-counter (add1 next))
next)
(define (label! key) (define (label! key)
(hash-set! labels key instruction-counter)) (hash-set! labels key instruction-counter))
@ -133,8 +127,7 @@
(vector-builder-add instructions instruction) (vector-builder-add instructions instruction)
(set! instruction-counter (add1 instruction-counter))) (set! instruction-counter (add1 instruction-counter)))
(define (compile-pattern! pattern pattern-index) (let loop ([pattern pattern] [peeking? #false])
(let loop ([pattern pattern] [peeking? #false])
(match pattern (match pattern
[(element-pattern expected) [(element-pattern expected)
@ -144,13 +137,14 @@
(loop subpattern #true) (loop subpattern #true)
(add-instruction! (reset-peek-instruction))] (add-instruction! (reset-peek-instruction))]
[(struct-transformer:group-pattern subpatterns capture?) [(struct-transformer:group-pattern subpatterns (== absent))
(when capture?
(add-instruction! (save-instruction (next-savepoint!))))
(for ([subpattern (in-vector subpatterns)]) (for ([subpattern (in-vector subpatterns)])
(loop subpattern peeking?)) (loop subpattern peeking?))]
(when capture?
(add-instruction! (save-instruction (next-savepoint!))))] [(struct-transformer:group-pattern subpatterns (present key))
(add-instruction! (start-group-instruction key))
(loop (group-pattern subpatterns) peeking?)
(add-instruction! (finish-group-instruction key))]
[(struct-transformer:choice-pattern choices) [(struct-transformer:choice-pattern choices)
(define post-choice-label (next-label!)) (define post-choice-label (next-label!))
@ -198,27 +192,17 @@
(loop subpattern peeking?)) (loop subpattern peeking?))
(loop (repetition-pattern subpattern #:max-count (- m n) #:greedy? greedy?) peeking?)])) (loop (repetition-pattern subpattern #:max-count (- m n) #:greedy? greedy?) peeking?)]))
(add-instruction! (match-instruction pattern-index))) (add-instruction! (match-instruction))
(for ([pattern (in-vector pattern-vector 0 (sub1 pattern-count))]
[i (in-naturals)])
(define pattern-label (next-label!))
(define skip-label (next-label!))
(add-instruction! (labeled-split-instruction pattern-label skip-label))
(label! pattern-label)
(compile-pattern! pattern i)
(label! skip-label))
(define last-index (sub1 pattern-count))
(compile-pattern! (vector-ref pattern-vector last-index) last-index)
(compiled-regex-with-labels (build-vector instructions) labels)) (compiled-regex-with-labels (build-vector instructions) labels))
(module+ test (module+ test
(test-case (name-string regular-patterns-compile) (test-case (name-string regular-pattern-compile)
(test-case (name-string element-pattern) (test-case (name-string element-pattern)
(define pattern (element-pattern #\a)) (define pattern (element-pattern #\a))
(define expected (compiled-regex (list (read-instruction #\a) (match-instruction 0)))) (define expected (compiled-regex (list (read-instruction #\a) (match-instruction))))
(check-equal? (regular-pattern-compile pattern) expected)) (check-equal? (regular-pattern-compile pattern) expected))
(define a (element-pattern #\a)) (define a (element-pattern #\a))
@ -235,20 +219,20 @@
(read-instruction #\a) (read-instruction #\a)
(read-instruction #\b) (read-instruction #\b)
(read-instruction #\c) (read-instruction #\c)
(match-instruction 0)))) (match-instruction))))
(check-equal? (regular-pattern-compile pattern) expected)) (check-equal? (regular-pattern-compile pattern) expected))
(test-case "capturing" (test-case "capturing"
(define pattern (group-pattern (list a b c) #:capture? #true)) (define pattern (group-pattern (list a b c) #:capture-key 'foo))
(define expected (define expected
(compiled-regex (compiled-regex
(list (list
(save-instruction 0) (start-group-instruction 'foo)
(read-instruction #\a) (read-instruction #\a)
(read-instruction #\b) (read-instruction #\b)
(read-instruction #\c) (read-instruction #\c)
(save-instruction 1) (finish-group-instruction 'foo)
(match-instruction 0)))) (match-instruction))))
(check-equal? (regular-pattern-compile pattern) expected))) (check-equal? (regular-pattern-compile pattern) expected)))
(test-case (name-string choice-pattern) (test-case (name-string choice-pattern)
@ -263,7 +247,7 @@
(read-instruction #\b) (read-instruction #\b)
(jump-instruction 7) (jump-instruction 7)
(read-instruction #\c) (read-instruction #\c)
(match-instruction 0)))) (match-instruction))))
(check-equal? (regular-pattern-compile pattern) expected)) (check-equal? (regular-pattern-compile pattern) expected))
(test-case (name-string element-string-pattern) (test-case (name-string element-string-pattern)
@ -295,7 +279,7 @@
(read-instruction #\b) (read-instruction #\b)
(read-instruction #\c) (read-instruction #\c)
(jump-instruction 0) (jump-instruction 0)
(match-instruction 0)))) (match-instruction))))
(check-equal? (regular-pattern-compile pattern) expected)) (check-equal? (regular-pattern-compile pattern) expected))
(test-case "non-greedy without quantifiers" (test-case "non-greedy without quantifiers"
@ -308,7 +292,7 @@
(read-instruction #\b) (read-instruction #\b)
(read-instruction #\c) (read-instruction #\c)
(jump-instruction 0) (jump-instruction 0)
(match-instruction 0)))) (match-instruction))))
(check-equal? (regular-pattern-compile pattern) expected)) (check-equal? (regular-pattern-compile pattern) expected))
(test-case "greedy with minimum quantity" (test-case "greedy with minimum quantity"
@ -330,7 +314,7 @@
(read-instruction #\b) (read-instruction #\b)
(read-instruction #\c) (read-instruction #\c)
(jump-instruction 9) (jump-instruction 9)
(match-instruction 0)))) (match-instruction))))
(check-equal? (regular-pattern-compile pattern) expected)) (check-equal? (regular-pattern-compile pattern) expected))
(test-case "non-greedy with minimum quantity" (test-case "non-greedy with minimum quantity"
@ -352,7 +336,7 @@
(read-instruction #\b) (read-instruction #\b)
(read-instruction #\c) (read-instruction #\c)
(jump-instruction 9) (jump-instruction 9)
(match-instruction 0)))) (match-instruction))))
(check-equal? (regular-pattern-compile pattern) expected)) (check-equal? (regular-pattern-compile pattern) expected))
(test-case "greedy with maximum quantity" (test-case "greedy with maximum quantity"
@ -372,7 +356,7 @@
(read-instruction #\a) (read-instruction #\a)
(read-instruction #\b) (read-instruction #\b)
(read-instruction #\c) (read-instruction #\c)
(match-instruction 0)))) (match-instruction))))
(check-equal? (regular-pattern-compile pattern) expected)) (check-equal? (regular-pattern-compile pattern) expected))
(test-case "non-greedy with maximum quantity" (test-case "non-greedy with maximum quantity"
@ -392,7 +376,7 @@
(read-instruction #\a) (read-instruction #\a)
(read-instruction #\b) (read-instruction #\b)
(read-instruction #\c) (read-instruction #\c)
(match-instruction 0)))) (match-instruction))))
(check-equal? (regular-pattern-compile pattern) expected)) (check-equal? (regular-pattern-compile pattern) expected))
(test-case "greedy with minimum and maximum quantity" (test-case "greedy with minimum and maximum quantity"
@ -417,7 +401,7 @@
(read-instruction #\a) (read-instruction #\a)
(read-instruction #\b) (read-instruction #\b)
(read-instruction #\c) (read-instruction #\c)
(match-instruction 0)))) (match-instruction))))
(check-equal? (regular-pattern-compile pattern) expected)) (check-equal? (regular-pattern-compile pattern) expected))
(test-case "non-greedy with minimum and maximum quantity" (test-case "non-greedy with minimum and maximum quantity"
@ -442,9 +426,36 @@
(read-instruction #\a) (read-instruction #\a)
(read-instruction #\b) (read-instruction #\b)
(read-instruction #\c) (read-instruction #\c)
(match-instruction 0)))) (match-instruction))))
(check-equal? (regular-pattern-compile pattern) expected))) (check-equal? (regular-pattern-compile pattern) expected)))
(test-case (name-string optional-pattern)
(test-case "greedy"
(define pattern (optional-pattern abc))
(define expected
(compiled-regex
(list
(split-instruction 1 4)
(read-instruction #\a)
(read-instruction #\b)
(read-instruction #\c)
(match-instruction))))
(check-equal? (regular-pattern-compile pattern) expected))
(test-case "non-greedy"
(define pattern (optional-pattern abc #:greedy? #false))
(define expected
(compiled-regex
(list
(split-instruction 4 1)
(read-instruction #\a)
(read-instruction #\b)
(read-instruction #\c)
(match-instruction))))
(check-equal? (regular-pattern-compile pattern) expected)))
(test-case (name-string lookahead-pattern) (test-case (name-string lookahead-pattern)
(define pattern (lookahead-pattern abc)) (define pattern (lookahead-pattern abc))
(define expected (define expected
@ -454,30 +465,5 @@
(peek-instruction #\b) (peek-instruction #\b)
(peek-instruction #\c) (peek-instruction #\c)
(reset-peek-instruction) (reset-peek-instruction)
(match-instruction 0)))) (match-instruction))))
(check-equal? (regular-pattern-compile pattern) expected)) (check-equal? (regular-pattern-compile pattern) expected))))
(test-case "multiple patterns"
(define patterns
(list
(element-string-pattern "aaa")
(element-string-pattern "bbb")
(element-string-pattern "ccc")))
(define expected
(compiled-regex
(list
(split-instruction 1 5)
(read-instruction #\a)
(read-instruction #\a)
(read-instruction #\a)
(match-instruction 0)
(split-instruction 6 10)
(read-instruction #\b)
(read-instruction #\b)
(read-instruction #\b)
(match-instruction 1)
(read-instruction #\c)
(read-instruction #\c)
(read-instruction #\c)
(match-instruction 2))))
(check-equal? (regular-patterns-compile patterns) expected))))

@ -0,0 +1,19 @@
#lang racket/base
(require racket/contract/base)
(provide
(contract-out
[hash->immutable-hash (-> hash? (and/c hash? immutable?))]))
;@----------------------------------------------------------------------------------------------------
(define (hash->immutable-hash h)
(if (and (hash? h) (immutable? h))
h
(for/hash ([(k v) (in-hash h)])
(values k v))))
Loading…
Cancel
Save