diff --git a/lexer/private/regex-vm.rkt b/lexer/private/regex-vm.rkt new file mode 100644 index 0000000..4680cdf --- /dev/null +++ b/lexer/private/regex-vm.rkt @@ -0,0 +1,219 @@ +#lang racket/base + + +(require (for-syntax racket/base + syntax/parse) + racket/match + racket/vector + rebellion/collection/vector + rebellion/streaming/reducer + rebellion/streaming/transducer + syntax/parse/define) + + +(module+ test + (require (submod "..") + rackunit + rebellion/private/static-name)) + + +;@---------------------------------------------------------------------------------------------------- + + +;; Based on the "regular expression virtual machine" implementation strategy described in +;; https://swtch.com/~rsc/regexp/regexp2.html. + + +(struct compiled-regex (program) + #:transparent + #:guard (λ (instructions _) (sequence->vector instructions))) + + +(struct regex-instruction () #:transparent) +(struct read-instruction regex-instruction (expected-char) #:transparent) +(struct jump-instruction regex-instruction (address) #:transparent) +(struct split-instruction regex-instruction (primary-address secondary-address) #:transparent) +(struct match-instruction regex-instruction (mode) #:transparent) +(struct save-instruction regex-instruction (savepoint) #:transparent) + + +(define (compiled-regex-savepoint-count compiled) + (transduce (compiled-regex-program compiled) + (filtering save-instruction?) + (mapping save-instruction-savepoint) + #:into into-count)) + + +(struct regex-execution-result (mode savepoints) + #:transparent + #:guard (λ (mode savepoints _) (values mode (sequence->vector savepoints)))) + + +(struct regex-thread (program-counter savepoints) #:transparent) + + +(struct thread-list ([size #:mutable] threads-by-priority-order threads-by-program-counter) + #:transparent) + + +(define (make-thread-list #:program-size capacity) + (thread-list 0 (make-vector capacity #false) (make-vector capacity #false))) + + +(define (thread-list-get threads index) + (vector-ref (thread-list-threads-by-priority-order threads) index)) + + +(define (thread-list-add! threads thread #:program program #:input-index i) + (define pc (regex-thread-program-counter thread)) + (define savepoints (regex-thread-savepoints thread)) + (match (vector-ref program pc) + [(jump-instruction address) + (define new-thread (regex-thread address savepoints)) + (thread-list-add! threads new-thread #:program program #:input-index i)] + [(split-instruction primary secondary) + (define primary-thread (regex-thread primary savepoints)) + (define secondary-thread (regex-thread secondary (vector-copy savepoints))) + (thread-list-add! threads primary-thread #:program program #:input-index i) + (thread-list-add! threads secondary-thread #:program program #:input-index i)] + [(save-instruction savepoint) + (vector-set! savepoints savepoint i) + (define new-thread (regex-thread (add1 pc) savepoints)) + (thread-list-add! threads new-thread #:program program #:input-index i)] + [_ + (define by-pc (thread-list-threads-by-program-counter threads)) + (unless (vector-ref by-pc pc) + (define size (thread-list-size threads)) + (vector-set! by-pc pc thread) + (vector-set! (thread-list-threads-by-priority-order threads) size thread) + (set-thread-list-size! threads (add1 size)))])) + + +(define (thread-list-clear! threads) + (define by-priority (thread-list-threads-by-priority-order threads)) + (define by-pc (thread-list-threads-by-program-counter threads)) + (for ([thread (in-vector by-priority 0 (thread-list-size threads))] + [i (in-naturals)]) + (vector-set! by-pc (regex-thread-program-counter thread) #false) + ;; TODO: this isn't strictly necessary (setting size to zero makes the by-priority vector's + ;; contents unreadable anyway) but I'm leaving it in for now as a defense against other bugs, this + ;; should be removed once tests are passing. + (vector-set! by-priority i #false)) + (set-thread-list-size! threads 0)) + + +(define (compiled-regex-match-string r str) + (define strlen (string-length str)) + (define program (compiled-regex-program r)) + (define savepoint-count (compiled-regex-savepoint-count r)) + + (define (make-thread [pc 0] [savepoints (make-vector savepoint-count #false)]) + (regex-thread pc savepoints)) + + (define running-threads (make-thread-list #:program-size (vector-length program))) + (define blocked-threads (make-thread-list #:program-size (vector-length program))) + (thread-list-add! running-threads (make-thread) #:program program #:input-index 0) + (for/fold ([running-threads running-threads] + [blocked-threads blocked-threads] + [last-match #false] + #:result last-match) + ([input-index (in-range 0 (add1 strlen))]) + (let loop ([i 0]) + (cond + [(equal? i (thread-list-size running-threads)) + (thread-list-clear! running-threads) + (values blocked-threads running-threads last-match)] + [else + (define thread (thread-list-get running-threads i)) + (define pc (regex-thread-program-counter thread)) + (define savepoints (regex-thread-savepoints thread)) + (match (vector-ref program pc) + [(read-instruction c) + (when (and (< input-index strlen) (equal? (string-ref str input-index) c)) + (define next-thread (make-thread (add1 pc) savepoints)) + (thread-list-add! blocked-threads + next-thread + #:program program + #:input-index input-index)) + (loop (add1 i))] + [(match-instruction mode) + (thread-list-clear! running-threads) + (values blocked-threads running-threads (regex-execution-result mode savepoints))])])))) + + +(module+ test + + (test-case (name-string match-instruction) + (define r (compiled-regex (list (match-instruction 42)))) + (check-equal? (compiled-regex-match-string r "aaaaaaaa") (regex-execution-result 42 '())) + (check-equal? (compiled-regex-match-string r "") (regex-execution-result 42 '()))) + + (test-case (name-string read-instruction) + + (test-case "reading one character" + (define r (compiled-regex (list (read-instruction #\a) (match-instruction 0)))) + (check-equal? (compiled-regex-match-string r "a") (regex-execution-result 0 '())) + (check-equal? (compiled-regex-match-string r "aaa") (regex-execution-result 0 '())) + (check-equal? (compiled-regex-match-string r "ab") (regex-execution-result 0 '())) + (check-false (compiled-regex-match-string r "b")) + (check-false (compiled-regex-match-string r "")) + (check-false (compiled-regex-match-string r "ba"))) + + (test-case "reading multiple characters" + (define r + (compiled-regex + (list + (read-instruction #\a) + (read-instruction #\b) + (read-instruction #\c) + (match-instruction 0)))) + (check-equal? (compiled-regex-match-string r "abc") (regex-execution-result 0 '())) + (check-equal? (compiled-regex-match-string r "abcxxx") (regex-execution-result 0 '())) + (check-false (compiled-regex-match-string r "cba")) + (check-false (compiled-regex-match-string r "a")) + (check-false (compiled-regex-match-string r "ab")) + (check-false (compiled-regex-match-string r "aaa")) + (check-false (compiled-regex-match-string r "bbb")) + (check-false (compiled-regex-match-string r "ccc")) + (check-false (compiled-regex-match-string r "aabc")))) + + (test-case (name-string jump-instruction) + (define r + (compiled-regex + (list + (jump-instruction 2) (read-instruction #\a) (read-instruction #\b) (match-instruction 0)))) + (check-equal? (compiled-regex-match-string r "b") (regex-execution-result 0 '())) + (check-false (compiled-regex-match-string r "a")) + (check-false (compiled-regex-match-string r "c"))) + + (test-case (name-string split-instruction) + (define r + (compiled-regex + (list + (split-instruction 1 3) + (read-instruction #\a) + (match-instruction 0) + (read-instruction #\b) + (match-instruction 1)))) + (check-equal? (compiled-regex-match-string r "a") (regex-execution-result 0 '())) + (check-equal? (compiled-regex-match-string r "b") (regex-execution-result 1 '())) + (check-equal? (compiled-regex-match-string r "ab") (regex-execution-result 0 '())) + (check-equal? (compiled-regex-match-string r "ba") (regex-execution-result 1 '())) + (check-false (compiled-regex-match-string r "c")) + (check-false (compiled-regex-match-string r ""))) + + (test-case (name-string save-instruction) + (define r + (compiled-regex + (list + (save-instruction 0) + (read-instruction #\a) + (read-instruction #\a) + (read-instruction #\a) + (save-instruction 1) + (read-instruction #\b) + (read-instruction #\b) + (read-instruction #\b) + (save-instruction 2) + (match-instruction 0)))) + (check-equal? (compiled-regex-match-string r "aaabbb") (regex-execution-result 0 '(0 2 5)))))