Add regular expression implementation
parent
dfbddc95fb
commit
df3a80c11a
@ -0,0 +1,219 @@
|
||||
#lang racket/base
|
||||
|
||||
|
||||
(require (for-syntax racket/base
|
||||
syntax/parse)
|
||||
racket/match
|
||||
racket/vector
|
||||
rebellion/collection/vector
|
||||
rebellion/streaming/reducer
|
||||
rebellion/streaming/transducer
|
||||
syntax/parse/define)
|
||||
|
||||
|
||||
(module+ test
|
||||
(require (submod "..")
|
||||
rackunit
|
||||
rebellion/private/static-name))
|
||||
|
||||
|
||||
;@----------------------------------------------------------------------------------------------------
|
||||
|
||||
|
||||
;; Based on the "regular expression virtual machine" implementation strategy described in
|
||||
;; https://swtch.com/~rsc/regexp/regexp2.html.
|
||||
|
||||
|
||||
(struct compiled-regex (program)
|
||||
#:transparent
|
||||
#:guard (λ (instructions _) (sequence->vector instructions)))
|
||||
|
||||
|
||||
(struct regex-instruction () #:transparent)
|
||||
(struct read-instruction regex-instruction (expected-char) #:transparent)
|
||||
(struct jump-instruction regex-instruction (address) #:transparent)
|
||||
(struct split-instruction regex-instruction (primary-address secondary-address) #:transparent)
|
||||
(struct match-instruction regex-instruction (mode) #:transparent)
|
||||
(struct save-instruction regex-instruction (savepoint) #:transparent)
|
||||
|
||||
|
||||
(define (compiled-regex-savepoint-count compiled)
|
||||
(transduce (compiled-regex-program compiled)
|
||||
(filtering save-instruction?)
|
||||
(mapping save-instruction-savepoint)
|
||||
#:into into-count))
|
||||
|
||||
|
||||
(struct regex-execution-result (mode savepoints)
|
||||
#:transparent
|
||||
#:guard (λ (mode savepoints _) (values mode (sequence->vector savepoints))))
|
||||
|
||||
|
||||
(struct regex-thread (program-counter savepoints) #:transparent)
|
||||
|
||||
|
||||
(struct thread-list ([size #:mutable] threads-by-priority-order threads-by-program-counter)
|
||||
#:transparent)
|
||||
|
||||
|
||||
(define (make-thread-list #:program-size capacity)
|
||||
(thread-list 0 (make-vector capacity #false) (make-vector capacity #false)))
|
||||
|
||||
|
||||
(define (thread-list-get threads index)
|
||||
(vector-ref (thread-list-threads-by-priority-order threads) index))
|
||||
|
||||
|
||||
(define (thread-list-add! threads thread #:program program #:input-index i)
|
||||
(define pc (regex-thread-program-counter thread))
|
||||
(define savepoints (regex-thread-savepoints thread))
|
||||
(match (vector-ref program pc)
|
||||
[(jump-instruction address)
|
||||
(define new-thread (regex-thread address savepoints))
|
||||
(thread-list-add! threads new-thread #:program program #:input-index i)]
|
||||
[(split-instruction primary secondary)
|
||||
(define primary-thread (regex-thread primary savepoints))
|
||||
(define secondary-thread (regex-thread secondary (vector-copy savepoints)))
|
||||
(thread-list-add! threads primary-thread #:program program #:input-index i)
|
||||
(thread-list-add! threads secondary-thread #:program program #:input-index i)]
|
||||
[(save-instruction savepoint)
|
||||
(vector-set! savepoints savepoint i)
|
||||
(define new-thread (regex-thread (add1 pc) savepoints))
|
||||
(thread-list-add! threads new-thread #:program program #:input-index i)]
|
||||
[_
|
||||
(define by-pc (thread-list-threads-by-program-counter threads))
|
||||
(unless (vector-ref by-pc pc)
|
||||
(define size (thread-list-size threads))
|
||||
(vector-set! by-pc pc thread)
|
||||
(vector-set! (thread-list-threads-by-priority-order threads) size thread)
|
||||
(set-thread-list-size! threads (add1 size)))]))
|
||||
|
||||
|
||||
(define (thread-list-clear! threads)
|
||||
(define by-priority (thread-list-threads-by-priority-order threads))
|
||||
(define by-pc (thread-list-threads-by-program-counter threads))
|
||||
(for ([thread (in-vector by-priority 0 (thread-list-size threads))]
|
||||
[i (in-naturals)])
|
||||
(vector-set! by-pc (regex-thread-program-counter thread) #false)
|
||||
;; TODO: this isn't strictly necessary (setting size to zero makes the by-priority vector's
|
||||
;; contents unreadable anyway) but I'm leaving it in for now as a defense against other bugs, this
|
||||
;; should be removed once tests are passing.
|
||||
(vector-set! by-priority i #false))
|
||||
(set-thread-list-size! threads 0))
|
||||
|
||||
|
||||
(define (compiled-regex-match-string r str)
|
||||
(define strlen (string-length str))
|
||||
(define program (compiled-regex-program r))
|
||||
(define savepoint-count (compiled-regex-savepoint-count r))
|
||||
|
||||
(define (make-thread [pc 0] [savepoints (make-vector savepoint-count #false)])
|
||||
(regex-thread pc savepoints))
|
||||
|
||||
(define running-threads (make-thread-list #:program-size (vector-length program)))
|
||||
(define blocked-threads (make-thread-list #:program-size (vector-length program)))
|
||||
(thread-list-add! running-threads (make-thread) #:program program #:input-index 0)
|
||||
(for/fold ([running-threads running-threads]
|
||||
[blocked-threads blocked-threads]
|
||||
[last-match #false]
|
||||
#:result last-match)
|
||||
([input-index (in-range 0 (add1 strlen))])
|
||||
(let loop ([i 0])
|
||||
(cond
|
||||
[(equal? i (thread-list-size running-threads))
|
||||
(thread-list-clear! running-threads)
|
||||
(values blocked-threads running-threads last-match)]
|
||||
[else
|
||||
(define thread (thread-list-get running-threads i))
|
||||
(define pc (regex-thread-program-counter thread))
|
||||
(define savepoints (regex-thread-savepoints thread))
|
||||
(match (vector-ref program pc)
|
||||
[(read-instruction c)
|
||||
(when (and (< input-index strlen) (equal? (string-ref str input-index) c))
|
||||
(define next-thread (make-thread (add1 pc) savepoints))
|
||||
(thread-list-add! blocked-threads
|
||||
next-thread
|
||||
#:program program
|
||||
#:input-index input-index))
|
||||
(loop (add1 i))]
|
||||
[(match-instruction mode)
|
||||
(thread-list-clear! running-threads)
|
||||
(values blocked-threads running-threads (regex-execution-result mode savepoints))])]))))
|
||||
|
||||
|
||||
(module+ test
|
||||
|
||||
(test-case (name-string match-instruction)
|
||||
(define r (compiled-regex (list (match-instruction 42))))
|
||||
(check-equal? (compiled-regex-match-string r "aaaaaaaa") (regex-execution-result 42 '()))
|
||||
(check-equal? (compiled-regex-match-string r "") (regex-execution-result 42 '())))
|
||||
|
||||
(test-case (name-string read-instruction)
|
||||
|
||||
(test-case "reading one character"
|
||||
(define r (compiled-regex (list (read-instruction #\a) (match-instruction 0))))
|
||||
(check-equal? (compiled-regex-match-string r "a") (regex-execution-result 0 '()))
|
||||
(check-equal? (compiled-regex-match-string r "aaa") (regex-execution-result 0 '()))
|
||||
(check-equal? (compiled-regex-match-string r "ab") (regex-execution-result 0 '()))
|
||||
(check-false (compiled-regex-match-string r "b"))
|
||||
(check-false (compiled-regex-match-string r ""))
|
||||
(check-false (compiled-regex-match-string r "ba")))
|
||||
|
||||
(test-case "reading multiple characters"
|
||||
(define r
|
||||
(compiled-regex
|
||||
(list
|
||||
(read-instruction #\a)
|
||||
(read-instruction #\b)
|
||||
(read-instruction #\c)
|
||||
(match-instruction 0))))
|
||||
(check-equal? (compiled-regex-match-string r "abc") (regex-execution-result 0 '()))
|
||||
(check-equal? (compiled-regex-match-string r "abcxxx") (regex-execution-result 0 '()))
|
||||
(check-false (compiled-regex-match-string r "cba"))
|
||||
(check-false (compiled-regex-match-string r "a"))
|
||||
(check-false (compiled-regex-match-string r "ab"))
|
||||
(check-false (compiled-regex-match-string r "aaa"))
|
||||
(check-false (compiled-regex-match-string r "bbb"))
|
||||
(check-false (compiled-regex-match-string r "ccc"))
|
||||
(check-false (compiled-regex-match-string r "aabc"))))
|
||||
|
||||
(test-case (name-string jump-instruction)
|
||||
(define r
|
||||
(compiled-regex
|
||||
(list
|
||||
(jump-instruction 2) (read-instruction #\a) (read-instruction #\b) (match-instruction 0))))
|
||||
(check-equal? (compiled-regex-match-string r "b") (regex-execution-result 0 '()))
|
||||
(check-false (compiled-regex-match-string r "a"))
|
||||
(check-false (compiled-regex-match-string r "c")))
|
||||
|
||||
(test-case (name-string split-instruction)
|
||||
(define r
|
||||
(compiled-regex
|
||||
(list
|
||||
(split-instruction 1 3)
|
||||
(read-instruction #\a)
|
||||
(match-instruction 0)
|
||||
(read-instruction #\b)
|
||||
(match-instruction 1))))
|
||||
(check-equal? (compiled-regex-match-string r "a") (regex-execution-result 0 '()))
|
||||
(check-equal? (compiled-regex-match-string r "b") (regex-execution-result 1 '()))
|
||||
(check-equal? (compiled-regex-match-string r "ab") (regex-execution-result 0 '()))
|
||||
(check-equal? (compiled-regex-match-string r "ba") (regex-execution-result 1 '()))
|
||||
(check-false (compiled-regex-match-string r "c"))
|
||||
(check-false (compiled-regex-match-string r "")))
|
||||
|
||||
(test-case (name-string save-instruction)
|
||||
(define r
|
||||
(compiled-regex
|
||||
(list
|
||||
(save-instruction 0)
|
||||
(read-instruction #\a)
|
||||
(read-instruction #\a)
|
||||
(read-instruction #\a)
|
||||
(save-instruction 1)
|
||||
(read-instruction #\b)
|
||||
(read-instruction #\b)
|
||||
(read-instruction #\b)
|
||||
(save-instruction 2)
|
||||
(match-instruction 0))))
|
||||
(check-equal? (compiled-regex-match-string r "aaabbb") (regex-execution-result 0 '(0 2 5)))))
|
Loading…
Reference in New Issue