#lang racket/base (provide (all-defined-out)) (require (for-syntax racket/base syntax/parse) racket/match racket/vector rebellion/collection/vector rebellion/streaming/reducer rebellion/streaming/transducer syntax/parse/define yaragg/lexer/private/regular-match yaragg/private/hash) (module+ test (require (submod "..") rackunit rebellion/private/static-name)) ;@---------------------------------------------------------------------------------------------------- ;; Based on the "regular expression virtual machine" implementation strategy described in ;; https://swtch.com/~rsc/regexp/regexp2.html. (define (compiled-regex-with-labels program labels) (compiled-regex (for/vector ([instruction (in-vector program)]) (match instruction [(labeled-jump-instruction label) (jump-instruction (hash-ref labels label))] [(labeled-split-instruction primary secondary) (split-instruction (hash-ref labels primary) (hash-ref labels secondary))] [other other])))) (struct compiled-regex (program) #:transparent #:guard (λ (instructions _) (sequence->vector instructions))) (struct regex-instruction () #:transparent) (struct read-instruction regex-instruction (expected-char) #:transparent) (struct peek-instruction regex-instruction (expected-char) #:transparent) (struct reset-peek-instruction regex-instruction () #:transparent) (struct jump-instruction regex-instruction (address) #:transparent) (struct split-instruction regex-instruction (primary-address secondary-address) #:transparent) (struct labeled-jump-instruction regex-instruction (label) #:transparent) (struct labeled-split-instruction regex-instruction (primary-label secondary-label) #:transparent) (struct match-instruction regex-instruction () #:transparent) (struct start-group-instruction regex-instruction (key) #:transparent) (struct finish-group-instruction regex-instruction (key) #:transparent) (struct fail-instruction regex-instruction () #:transparent) (struct regex-thread (program-counter captured-groups) #:transparent) (struct thread-list ([size #:mutable] threads-by-priority-order threads-by-program-counter) #:transparent) (define (make-thread-list #:program-size capacity) (thread-list 0 (make-vector capacity #false) (make-vector capacity #false))) (define (thread-list-get threads index) (vector-ref (thread-list-threads-by-priority-order threads) index)) (define (thread-list-add! threads thread #:program program #:input input #:input-index i) (let loop ([thread thread] [i i] [peek i] [max-peek i]) (define pc (regex-thread-program-counter thread)) (define groups (regex-thread-captured-groups thread)) (match (vector-ref program pc) [(jump-instruction address) (loop (regex-thread address groups) i peek max-peek)] [(split-instruction primary secondary) (define secondary-groups (captured-groups-builder-copy groups)) (max (loop (regex-thread primary groups) i peek max-peek) (loop (regex-thread secondary secondary-groups) i peek max-peek))] [(start-group-instruction key) (captured-groups-builder-start-group! groups key i) (loop (regex-thread (add1 pc) groups) i peek max-peek)] [(finish-group-instruction key) (captured-groups-builder-finish-group! groups key i) (loop (regex-thread (add1 pc) groups) i peek max-peek)] [(peek-instruction expected) (cond [(equal? (string-ref input peek) expected) (define next-peek (add1 peek)) (loop (regex-thread (add1 pc) groups) i next-peek (max max-peek next-peek))] [else max-peek])] [(reset-peek-instruction) (loop (regex-thread (add1 pc) groups) i i max-peek)] [_ (define by-pc (thread-list-threads-by-program-counter threads)) (unless (vector-ref by-pc pc) (define size (thread-list-size threads)) (vector-set! by-pc pc thread) (vector-set! (thread-list-threads-by-priority-order threads) size thread) (set-thread-list-size! threads (add1 size))) max-peek]))) (define (thread-list-clear! threads) (define by-priority (thread-list-threads-by-priority-order threads)) (define by-pc (thread-list-threads-by-program-counter threads)) (for ([thread (in-vector by-priority 0 (thread-list-size threads))] [i (in-naturals)]) (vector-set! by-pc (regex-thread-program-counter thread) #false) ;; TODO: this isn't strictly necessary (setting size to zero makes the by-priority vector's ;; contents unreadable anyway) but I'm leaving it in for now as a defense against other bugs, this ;; should be removed once tests are passing. (vector-set! by-priority i #false)) (set-thread-list-size! threads 0)) (define (compiled-regex-match-string r str [start 0] [end (string-length str)]) (define program (compiled-regex-program r)) (define (make-thread [pc 0] [groups (make-captured-groups-builder)]) (regex-thread pc groups)) (define running-threads (make-thread-list #:program-size (vector-length program))) (define blocked-threads (make-thread-list #:program-size (vector-length program))) (define max-peek (thread-list-add! running-threads (make-thread) #:program program #:input str #:input-index start)) (for/fold ([running-threads running-threads] [blocked-threads blocked-threads] [last-match #false] [max-peek max-peek] #:result (or last-match (regular-match-failure start (- max-peek start)))) ([input-index (in-range start (add1 end))]) (let loop ([i 0] [max-peek max-peek]) (cond [(equal? i (thread-list-size running-threads)) (thread-list-clear! running-threads) (values blocked-threads running-threads last-match max-peek)] [else (define thread (thread-list-get running-threads i)) (define pc (regex-thread-program-counter thread)) (define groups (regex-thread-captured-groups thread)) (match (vector-ref program pc) [(read-instruction c) (cond [(and (< input-index end) (equal? (string-ref str input-index) c)) (define next-thread (make-thread (add1 pc) groups)) (define next-max-peek (thread-list-add! blocked-threads next-thread #:program program #:input str #:input-index (add1 input-index))) (loop (add1 i) (max max-peek next-max-peek))] [else (loop (add1 i) (max (add1 input-index) max-peek))])] [(match-instruction) (thread-list-clear! running-threads) (define resulting-match (regular-match start input-index #:peek-distance (- max-peek input-index) #:groups (build-captured-groups groups))) (values blocked-threads running-threads resulting-match max-peek)])])))) (module+ test (test-case (name-string match-instruction) (define r (compiled-regex (list (match-instruction)))) (define expected (regular-match 0 0)) (check-equal? (compiled-regex-match-string r "aaaaaaaa") expected) (check-equal? (compiled-regex-match-string r "") expected)) (test-case (name-string read-instruction) (test-case "reading one character" (define r (compiled-regex (list (read-instruction #\a) (match-instruction)))) (define expected (regular-match 0 1)) (check-equal? (compiled-regex-match-string r "a") expected) (check-equal? (compiled-regex-match-string r "aaa") expected) (check-equal? (compiled-regex-match-string r "ab") expected) (check-equal? (compiled-regex-match-string r "b") (regular-match-failure 0 1)) (check-equal? (compiled-regex-match-string r "") (regular-match-failure 0 1)) (check-equal? (compiled-regex-match-string r "ba") (regular-match-failure 0 1))) (test-case "reading multiple characters" (define r (compiled-regex (list (read-instruction #\a) (read-instruction #\b) (read-instruction #\c) (match-instruction)))) (define expected (regular-match 0 3)) (check-equal? (compiled-regex-match-string r "abc") expected) (check-equal? (compiled-regex-match-string r "abcxxx") expected) (check-equal? (compiled-regex-match-string r "cba") (regular-match-failure 0 1)) (check-equal? (compiled-regex-match-string r "a") (regular-match-failure 0 2)) (check-equal? (compiled-regex-match-string r "ab") (regular-match-failure 0 3)) (check-equal? (compiled-regex-match-string r "aaa") (regular-match-failure 0 2)) (check-equal? (compiled-regex-match-string r "bbb") (regular-match-failure 0 1)) (check-equal? (compiled-regex-match-string r "ccc") (regular-match-failure 0 1)) (check-equal? (compiled-regex-match-string r "aabc") (regular-match-failure 0 2)))) (test-case (name-string jump-instruction) (define r (compiled-regex (list (jump-instruction 2) (read-instruction #\a) (read-instruction #\b) (match-instruction)))) (check-equal? (compiled-regex-match-string r "b") (regular-match 0 1)) (check-equal? (compiled-regex-match-string r "a") (regular-match-failure 0 1)) (check-equal? (compiled-regex-match-string r "c") (regular-match-failure 0 1))) (test-case "group capturing instructions" (define r (compiled-regex (list (start-group-instruction 'a) (read-instruction #\a) (read-instruction #\a) (read-instruction #\a) (finish-group-instruction 'a) (start-group-instruction 'b) (read-instruction #\b) (read-instruction #\b) (read-instruction #\b) (finish-group-instruction 'b) (match-instruction)))) (define expected (regular-match 0 6 #:groups (hash 'a (list (captured-group 0 3)) 'b (list (captured-group 3 6))))) (check-equal? (compiled-regex-match-string r "aaabbb") expected)) (test-case (name-string split-instruction) (define r (compiled-regex (list (split-instruction 1 5) (start-group-instruction 'a) (read-instruction #\a) (finish-group-instruction 'a) (match-instruction) (start-group-instruction 'b) (read-instruction #\b) (finish-group-instruction 'b) (match-instruction)))) (define a-match (regular-match 0 1 #:groups (hash 'a (list (captured-group 0 1))))) (define b-match (regular-match 0 1 #:groups (hash 'b (list (captured-group 0 1))))) (check-equal? (compiled-regex-match-string r "a") a-match) (check-equal? (compiled-regex-match-string r "b") b-match) (check-equal? (compiled-regex-match-string r "ab") a-match) (check-equal? (compiled-regex-match-string r "ba") b-match) (check-equal? (compiled-regex-match-string r "c") (regular-match-failure 0 1)) (check-equal? (compiled-regex-match-string r "") (regular-match-failure 0 1))) (test-case (name-string peek-instruction) (define r (compiled-regex (list (start-group-instruction 'a) (peek-instruction #\a) (peek-instruction #\b) (peek-instruction #\c) (finish-group-instruction 'a) (match-instruction)))) (define expected (regular-match 0 0 #:peek-distance 3 #:groups (hash 'a (list (captured-group 0 0))))) (check-equal? (compiled-regex-match-string r "abc") expected)) (test-case (name-string reset-peek-instruction) (define r (compiled-regex (list (start-group-instruction 'a) (peek-instruction #\a) (peek-instruction #\b) (reset-peek-instruction) (peek-instruction #\a) (finish-group-instruction 'a) (match-instruction)))) (define expected (regular-match 0 0 #:peek-distance 2 #:groups (hash 'a (list (captured-group 0 0))))) (check-equal? (compiled-regex-match-string r "abc") expected)))