#lang racket/base (require yaragg/parser-tools/private-lex/util syntax/id-table racket/syntax) (provide parse) (define (bad-args stx num) (raise-syntax-error #f (format "incorrect number of arguments (should have ~a)" num) stx)) ;; char-range-arg: syntax-object syntax-object -> nat ;; If c contains is a character or length 1 string, returns the integer ;; for the character. Otherwise raises a syntax error. (define (char-range-arg stx containing-stx) (define c (syntax-e stx)) (cond [(char? c) (char->integer c)] [(and (string? c) (= (string-length c) 1)) (char->integer (string-ref c 0))] [else (raise-syntax-error #f "not a char or single-char string" containing-stx stx)])) (module+ test (check-equal? (char-range-arg #'#\1 #'here) (char->integer #\1)) (check-equal? (char-range-arg #'"1" #'here) (char->integer #\1))) (define orig-insp (variable-reference->module-declaration-inspector (#%variable-reference))) (define (disarm stx) (syntax-disarm stx orig-insp)) ;; parse : syntax-object (box (list-of syntax-object)) -> s-re (see re.rkt) ;; checks for errors and generates the plain s-exp form for s ;; Expands lex-abbrevs and applies lex-trans. (define (parse stx) (let loop ([stx stx] ;; seen-lex-abbrevs: id-table [seen-lex-abbrevs (make-immutable-free-id-table)]) (define (recur s) (loop (syntax-rearm s stx) seen-lex-abbrevs)) (define (recur/abbrev s id) (loop (syntax-rearm s stx) (free-id-table-set seen-lex-abbrevs id id))) (syntax-case (disarm stx) (repetition union intersection complement concatenation char-range char-complement) [_ (identifier? stx) (let ([expansion (syntax-local-value/record stx (λ (v) #t))]) (unless (lex-abbrev? expansion) (raise-syntax-error 'regular-expression "undefined abbreviation" stx)) ;; Check for cycles. (when (free-id-table-ref seen-lex-abbrevs stx (λ () #f)) (raise-syntax-error 'regular-expression "illegal lex-abbrev cycle detected" stx #f (list (free-id-table-ref seen-lex-abbrevs stx)))) (recur/abbrev ((lex-abbrev-get-abbrev expansion)) stx))] [_ (or (char? (syntax-e stx)) (string? (syntax-e stx))) (syntax-e stx)] [(repetition ARG ...) (let ([arg-list (syntax->list #'(ARG ...))]) (unless (= 3 (length arg-list)) (bad-args stx 2)) (define low (syntax-e (car arg-list))) (define high (syntax-e (cadr arg-list))) (define re (caddr arg-list)) (unless (and (number? low) (exact? low) (integer? low) (>= low 0)) (raise-syntax-error #f "not a non-negative exact integer" stx (car arg-list))) (unless (or (and (number? high) (exact? high) (integer? high) (>= high 0)) (eqv? high +inf.0)) (raise-syntax-error #f "not a non-negative exact integer or +inf.0" stx (cadr arg-list))) (unless (<= low high) (raise-syntax-error #f "the first argument is not less than or equal to the second argument" stx)) `(repetition ,low ,high ,(recur re)))] [(union RE ...) `(union ,@(map recur (syntax->list #'(RE ...))))] [(intersection RE ...) `(intersection ,@(map recur (syntax->list #'(RE ...))))] [(complement RE ...) (let ([re-list (syntax->list #'(RE ...))]) (unless (= 1 (length re-list)) (bad-args stx 1)) `(complement ,(recur (car re-list))))] [(concatenation RE ...) `(concatenation ,@(map recur (syntax->list #'(RE ...))))] [(char-range ARG ...) (let ((arg-list (syntax->list #'(ARG ...)))) (unless (= 2 (length arg-list)) (bad-args stx 2)) (define i1 (char-range-arg (car arg-list) stx)) (define i2 (char-range-arg (cadr arg-list) stx)) (if (<= i1 i2) `(char-range ,(integer->char i1) ,(integer->char i2)) (raise-syntax-error #f "the first argument does not precede or equal second argument" stx)))] [(char-complement ARG ...) (let ([arg-list (syntax->list #'(ARG ...))]) (unless (= 1 (length arg-list)) (bad-args stx 1)) (define parsed (recur (car arg-list))) (unless (char-set? parsed) (raise-syntax-error #f "not a character set" stx (car arg-list))) `(char-complement ,parsed))] ((OP form ...) (identifier? #'OP) (let* ([expansion (syntax-local-value/record #'OP (λ (v) #t))]) (cond [(lex-trans? expansion) (recur ((lex-trans-f expansion) (disarm stx)))] [expansion (raise-syntax-error 'regular-expression "not a lex-trans" stx)] [else (raise-syntax-error 'regular-expression "undefined operator" stx)]))) [_ (raise-syntax-error 'regular-expression "not a char, string, identifier, or (op args ...)" stx)]))) ;; char-set? : s-re -> bool ;; A char-set is an re that matches only strings of length 1. ;; char-set? is conservative. (define (char-set? s-re) (cond [(char? s-re)] [(string? s-re) (= (string-length s-re) 1)] [(list? s-re) (case (car s-re) [(union intersection) (andmap char-set? (cdr s-re))] [(char-range char-complement) #t] [(repetition) (and (= 1 (cadr s-re) (caddr s-re)) (char-set? (cadddr s-re)))] [(concatenation) (and (= 2 (length s-re)) (char-set? (cadr s-re)))] (else #f))] [else #f])) (module+ test (require rackunit) (check-equal? (char-set? #\a) #t) (check-equal? (char-set? "12") #f) (check-equal? (char-set? "1") #t) (check-equal? (char-set? '(repetition 1 2 #\1)) #f) (check-equal? (char-set? '(repetition 1 1 "12")) #f) (check-equal? (char-set? '(repetition 1 1 "1")) #t) (check-equal? (char-set? '(repetition 6 6 "1")) #f) (check-equal? (char-set? '(union "1" "2" "3")) #t) (check-equal? (char-set? '(union "1" "" "3")) #f) (check-equal? (char-set? '(intersection "1" "2" (union "3" "4"))) #t) (check-equal? (char-set? '(intersection "1" "")) #f) (check-equal? (char-set? '(complement "1")) #f) (check-equal? (char-set? '(concatenation "1" "2")) #f) (check-equal? (char-set? '(concatenation "" "2")) #f) (check-equal? (char-set? '(concatenation "1")) #t) (check-equal? (char-set? '(concatenation "12")) #f) (check-equal? (char-set? '(char-range #\1 #\2)) #t) (check-equal? (char-set? '(char-complement #\1)) #t)) ;; yikes... these test cases all have the wrong arity, now. ;; and by "now", I mean it's been broken since before we ;; moved to git. (module+ test (check-equal? (parse #'#\a) #\a) (check-equal? (parse #'"1") "1") (check-equal? (parse #'(repetition 1 1 #\1)) '(repetition 1 1 #\1)) (check-equal? (parse #'(repetition 0 +inf.0 #\1)) '(repetition 0 +inf.0 #\1)) (check-equal? (parse #'(union #\1 (union "2") (union))) '(union #\1 (union "2") (union))) (check-equal? (parse #'(intersection #\1 (intersection "2") (intersection))) '(intersection #\1 (intersection "2") (intersection))) (check-equal? (parse #'(complement (union #\1 #\2))) '(complement (union #\1 #\2))) (check-equal? (parse #'(concatenation "1" "2" (concatenation))) '(concatenation "1" "2" (concatenation))) (check-equal? (parse #'(char-range "1" #\1)) '(char-range #\1 #\1)) (check-equal? (parse #'(char-range #\1 "1")) '(char-range #\1 #\1)) (check-equal? (parse #'(char-range "1" "3")) '(char-range #\1 #\3)) (check-equal? (parse #'(char-complement (union "1" "2"))) '(char-complement (union "1" "2"))) (check-equal? (parse #'(char-complement (repetition 1 1 "1"))) '(char-complement (repetition 1 1 "1"))) (check-exn #rx"not a character set" (λ () (parse #'(char-complement (repetition 6 6 "1"))))))