diff --git a/brag-lib/brag/codegen/expander.rkt b/brag-lib/brag/codegen/expander.rkt index cf1d137..39690d7 100755 --- a/brag-lib/brag/codegen/expander.rkt +++ b/brag-lib/brag/codegen/expander.rkt @@ -18,7 +18,7 @@ (define-for-syntax (rules->token-types rules) (define-values (implicit-tokens explicit-tokens) (rules-collect-token-types rules)) (remove-duplicates (append (for/list ([it (in-list implicit-tokens)]) - (string->symbol (syntax-e it))) + (string->symbol (syntax-e it))) (map syntax-e explicit-tokens)) eq?)) (define-syntax (brag-module-begin rules-stx) @@ -38,7 +38,7 @@ (with-syntax ([START-ID (first rule-ids)] ; The first rule, by default, is the start rule. [((TOKEN-TYPE . TOKEN-TYPE-CONSTRUCTOR) ...) (for/list ([tt (in-list (rules->token-types rules))]) - (cons tt (string->symbol (format "token-~a" tt))))] + (cons tt (string->symbol (format "token-~a" tt))))] ;; Flatten rules to use the yacc-style ruleset that br-parser-tools supports [GENERATED-RULE-CODES (map flat-rule->yacc-rule (flatten-rules rules))] ;; main exports. Break hygiene so they're also available at top-level / repl @@ -71,38 +71,41 @@ (cons eof token-EOF) (cons 'TOKEN-TYPE TOKEN-TYPE-CONSTRUCTOR) ...))) - (define-syntax (MAKE-RULE-PARSER stx) - (syntax-case stx () - [(_ START-RULE-ID) - (and (identifier? #'START-RULE-ID) (member (syntax-e #'START-RULE-ID) 'RULE-IDS)) + (define-syntax (MAKE-RULE-PARSER rule-id-stx) + (syntax-case rule-id-stx () + [(_ start-rule) + (and (identifier? #'start-rule) + (member (syntax-e #'start-rule) 'RULE-IDS)) ;; The cfg-parser depends on the start-rule provided in (start ...) to have the same ;; context as the rest of this body. Hence RECOLORED-START-RULE - (with-syntax ([RECOLORED-START-RULE (datum->syntax #'RULES-STX (syntax-e #'START-RULE-ID))]) - #'(let () - (define (rule-parser tokenizer) - (define rule-grammar (cfg-parser (tokens enumerated-tokens) - (src-pos) - (start RECOLORED-START-RULE) - (end EOF) - (error the-error-handler) - (grammar . GENERATED-RULE-CODES))) - (define next-token (make-permissive-tokenizer tokenizer all-tokens-hash/mutable)) - ;; here's how we support grammar "cuts" on top rule name - (define parse-tree-stx (rule-grammar next-token)) - (syntax-case parse-tree-stx () - [(TOP-RULE-NAME . _) - (if (eq? (syntax-property #'TOP-RULE-NAME 'hide-or-splice?) 'hide) - (remove-rule-name parse-tree-stx) ; use `remove-rule-name` so we get the same housekeeping - parse-tree-stx)] - [_ (error 'malformed-parse-tree)])) - (case-lambda [(tokenizer) (rule-parser tokenizer)] - [(source tokenizer) - (parameterize ([current-source source]) - (rule-parser tokenizer))])))] + (with-syntax ([RECOLORED-START-RULE (datum->syntax #'RULES-STX (syntax-e #'start-rule))]) + #'(let ([THE-GRAMMAR (cfg-parser (tokens enumerated-tokens) + (src-pos) + (start RECOLORED-START-RULE) + (end EOF) + (error THE-ERROR-HANDLER) + (grammar . GENERATED-RULE-CODES))]) + (procedure-rename + (case-lambda [(tokenizer) + (define next-token + (make-permissive-tokenizer tokenizer all-tokens-hash/mutable)) + ;; little post-processor to support cuts on top rule name + (define parse-tree-stx (THE-GRAMMAR next-token)) + (define top-rule-name-stx (syntax-case parse-tree-stx () + [(TRN . REST) #'TRN] + [_ (error 'malformed-parse-tree)])) + (if (eq? (syntax-property top-rule-name-stx 'hide-or-splice?) 'hide) + ;; use `remove-rule-name` so we get the same housekeeping + (remove-rule-name parse-tree-stx) + parse-tree-stx)] + [(source tokenizer) + (parameterize ([current-source source]) + (PARSE tokenizer))]) + (string->symbol (format "~a-rule-parser" 'start-rule)))))] [(_ not-a-rule-id) (raise-syntax-error #f (format "Rule ~a is not defined in the grammar" (syntax-e #'not-a-rule-id)) - stx)])) + rule-id-stx)])) ;; start-id has to be a value, not an expr, because make-rule-parser is a macro (define PARSE (procedure-rename (MAKE-RULE-PARSER START-ID) 'PARSE)) diff --git a/brag-lib/brag/codegen/runtime.rkt b/brag-lib/brag/codegen/runtime.rkt index d2ef10b..315caf4 100755 --- a/brag-lib/brag/codegen/runtime.rkt +++ b/brag-lib/brag/codegen/runtime.rkt @@ -7,7 +7,7 @@ brag/private/internal-support) -(provide the-error-handler +(provide THE-ERROR-HANDLER make-permissive-tokenizer atomic-datum->syntax positions->srcloc @@ -19,7 +19,7 @@ ;; The level of indirection here is necessary since the yacc grammar wants a ;; function value for the error handler up front. We want to delay that decision ;; till parse time. -(define (the-error-handler tok-ok? tok-name tok-value start-pos end-pos) +(define (THE-ERROR-HANDLER tok-ok? tok-name tok-value start-pos end-pos) (match (positions->srcloc start-pos end-pos) [(list src line col offset span) ((current-parser-error-handler) tok-name diff --git a/brag-lib/brag/examples/codepoints.rkt b/brag-lib/brag/examples/codepoints.rkt new file mode 100644 index 0000000..2a618f8 --- /dev/null +++ b/brag-lib/brag/examples/codepoints.rkt @@ -0,0 +1,6 @@ +#lang brag +start: A c def hello-world +A : "\"\101\\" ; A +c : '\'\U0063\\' ; c +def : "*\u64\\\x65f\"" ; de +hello-world : "\150\145\154\154\157\40\167\157\162\154\144" \ No newline at end of file diff --git a/brag-lib/brag/examples/subrule.rkt b/brag-lib/brag/examples/subrule.rkt deleted file mode 100644 index 7183bfc..0000000 --- a/brag-lib/brag/examples/subrule.rkt +++ /dev/null @@ -1,4 +0,0 @@ -#lang brag - -start: next -next: "0" diff --git a/brag-lib/brag/rules/lexer.rkt b/brag-lib/brag/rules/lexer.rkt index 925e6c9..020e8f9 100755 --- a/brag-lib/brag/rules/lexer.rkt +++ b/brag-lib/brag/rules/lexer.rkt @@ -1,4 +1,4 @@ -#lang racket/base +#lang at-exp racket/base (require (for-syntax racket/base "parser.rkt")) (require br-parser-tools/lex (prefix-in : br-parser-tools/lex-sre) @@ -35,28 +35,65 @@ (define-lex-abbrev esc-chars (union "\\a" "\\b" "\\t" "\\n" "\\v" "\\f" "\\r" "\\e")) -(define (unescape-lexeme lexeme quote-char) - ;; convert the literal string representation back into an escape char with lookup table - (define unescapes (hash "a" 7 "b" 8 "t" 9 "n" 10 "v" 11 "f" 12 "r" 13 "e" 27 "\"" 34 "'" 39 "\\" 92)) - (define pat (regexp (format "(?<=^~a\\\\).(?=~a$)" quote-char quote-char))) - (cond - [(regexp-match pat lexeme) - => (λ (m) (string quote-char (integer->char (hash-ref unescapes (car m))) quote-char))] - [else lexeme])) - +(define (unescape-double-quoted-lexeme lexeme start-pos end-pos) + ;; use `read` so brag strings have all the notational semantics of Racket strings + (with-handlers ([exn:fail:read? + (λ (e) ((current-parser-error-handler) + #f + 'error + lexeme + (position->pos start-pos) + (position->pos end-pos)))]) + (list->string `(#\" ,@(string->list (read (open-input-string lexeme))) #\")))) + +(define (convert-to-double-quoted lexeme) + ;; brag supports single-quoted strings, for some reason + ;; (Racket does not. A single quote denotes a datum) + ;; let's convert a single-quoted string into standard double-quoted style + ;; so we can use Racket's `read` function on it. + ;; and thereby support all the standard Racket string elements: + ;; https://docs.racket-lang.org/reference/reader.html#%28part._parse-string%29 + (define outside-quotes-removed (string-trim lexeme "'")) + (define single-quotes-unescaped (string-replace outside-quotes-removed "\\'" "'")) + (define double-quotes-escaped (string-replace single-quotes-unescaped "\"" "\\\"")) + (define double-quotes-on-ends (string-append "\"" double-quotes-escaped "\"")) + double-quotes-on-ends) + +(define-lex-abbrev backslash "\\") +(define-lex-abbrev single-quote "'") +(define-lex-abbrev escaped-single-quote (:: backslash single-quote)) +(define-lex-abbrev double-quote "\"") +(define-lex-abbrev escaped-double-quote (:: backslash double-quote)) +(define-lex-abbrev escaped-backslash (:: backslash backslash)) (define lex/1 (lexer-src-pos - ;; handle whitespace & escape chars within quotes as literal tokens: "\n" "\t" '\n' '\t' - ;; match the escaped version, and then unescape them before they become token-LITs - [(:: "'" - (:or (:* (:or "\\'" esc-chars (:~ "'" "\\"))) "\\\\") - "'") - (token-LIT (unescape-lexeme lexeme #\'))] - [(:: "\"" - (:or (:* (:or "\\\"" esc-chars (:~ "\"" "\\"))) "\\\\") - "\"") - (token-LIT (unescape-lexeme lexeme #\"))] + [(:: double-quote ;; start with double quote + (intersection ;; two conditions need to be true inside the quotes: + ;; we can have anything except + ;; a plain double-quote (which would close the quote) + ;; plus we specially allow escaped double quotes and backslashes + (:* (:or escaped-double-quote escaped-backslash (:~ double-quote))) + ;; we must forbid one situation with the string \\" + ;; the problem is that it's ambiguous: + ;; it can be lexed as (:: escaped-backlash double-quote) = \\ + " + ;; or (:: backlash escaped-double-quote) = \ + \" + ;; because escapes should be "left associative", + ;; we forbid the second possibility + ;; There are still some weird corner cases but the current tests work. + ;; with single and double quotes in the mix, + ;; I'm not sure how much better this can be. + (complement (:: any-string backslash escaped-double-quote any-string))) + double-quote) ;; end with double quote + (token-LIT (unescape-double-quoted-lexeme lexeme start-pos end-pos))] + ;; single-quoted string follows the same pattern, + ;; but with escaped-single-quote instead of escaped-double-quote + [(:: single-quote + (intersection + (:* (:or escaped-single-quote escaped-backslash (:~ single-quote))) + (complement (:: any-string backslash escaped-single-quote any-string))) + single-quote) + (token-LIT (unescape-double-quoted-lexeme (convert-to-double-quoted lexeme) start-pos end-pos))] [(:or "()" "Ø" "∅") (token-EMPTY lexeme)] ["(" (token-LPAREN lexeme)] diff --git a/brag-lib/brag/test/test-codepoints.rkt b/brag-lib/brag/test/test-codepoints.rkt new file mode 100755 index 0000000..bb3b3b7 --- /dev/null +++ b/brag-lib/brag/test/test-codepoints.rkt @@ -0,0 +1,10 @@ +#lang racket/base + +(require brag/examples/codepoints + rackunit) + +(check-equal? (parse-to-datum '("\"A\\" "'c\\" "*d\\ef\"" "hello world")) + '(start (A "\"A\\") + (c "'c\\") + (def "*d\\ef\"") + (hello-world "hello world"))) diff --git a/brag-lib/brag/test/test-lexer.rkt b/brag-lib/brag/test/test-lexer.rkt index 49f3774..8e7830d 100755 --- a/brag-lib/brag/test/test-lexer.rkt +++ b/brag-lib/brag/test/test-lexer.rkt @@ -51,11 +51,13 @@ (check-equal? (l "]") '(RBRACKET "]" 1 2)) +;; 220111: lexer now converts single-quoted lexemes +;; to standard Racket-style double-quoted string literal (check-equal? (l "'hello'") - '(LIT "'hello'" 1 8)) + '(LIT "\"hello\"" 1 8)) (check-equal? (l "'he\\'llo'") - '(LIT "'he\\'llo'" 1 10)) + '(LIT "\"he'llo\"" 1 10)) (check-equal? (l "/") '(HIDE "/" 1 2)) diff --git a/brag-lib/brag/test/test-make-rule-parser.rkt b/brag-lib/brag/test/test-make-rule-parser.rkt deleted file mode 100644 index 51a47a2..0000000 --- a/brag-lib/brag/test/test-make-rule-parser.rkt +++ /dev/null @@ -1,17 +0,0 @@ -#lang racket/base -(require rackunit - brag/support - brag/examples/subrule) - -(define parse-next (make-rule-parser next)) -(define parse-start (make-rule-parser start)) - -(check-equal? (syntax->datum (parse #f "0")) '(start (next "0"))) -(check-equal? (syntax->datum (parse #f "0")) (syntax->datum (parse "0"))) - -(check-equal? (syntax->datum (parse-start #f "0")) '(start (next "0"))) -(check-equal? (syntax->datum (parse-start #f "0")) (syntax->datum (parse-start "0"))) - -(check-equal? (syntax->datum (parse-next #f "0")) '(next "0")) -(check-equal? (syntax->datum (parse-next #f "0")) (syntax->datum (parse-next "0"))) - diff --git a/brag/brag/brag.scrbl b/brag/brag/brag.scrbl index 2babb1d..6c40f6c 100755 --- a/brag/brag/brag.scrbl +++ b/brag/brag/brag.scrbl @@ -538,24 +538,16 @@ Here's the definition for brag/examples/simple-line-drawing/semantics #:read my-read #:read-syntax my-read-syntax - #:info my-get-info #:whole-body-readers? #t (require brag/examples/simple-line-drawing/lexer - brag/examples/simple-line-drawing/grammar) + brag/examples/simple-line-drawing/grammar) (define (my-read in) - (syntax->datum (my-read-syntax #f in))) + (syntax->datum (my-read-syntax #f in))) (define (my-read-syntax src ip) - (list (parse src (tokenize ip)))) - - (define (my-get-info key default default-filter) - (case key - [(color-lexer) - (dynamic-require 'syntax-color/default-lexer 'default-lexer)] - [else - (default-filter key default)])) + (list (parse src (tokenize ip)))) }| } @@ -572,43 +564,43 @@ compilation: (require (for-syntax racket/base syntax/parse)) (provide #%module-begin - ;; We reuse Racket's treatment of raw datums, specifically - ;; for strings and numbers: - #%datum - - ;; And otherwise, we provide definitions of these three forms. - ;; During compiliation, Racket uses these definitions to - ;; rewrite into for loops, displays, and newlines. - drawing rows chunk) + ;; We reuse Racket's treatment of raw datums, specifically + ;; for strings and numbers: + #%datum + + ;; And otherwise, we provide definitions of these three forms. + ;; During compiliation, Racket uses these definitions to + ;; rewrite into for loops, displays, and newlines. + drawing rows chunk) ;; Define a few compile-time functions to do the syntax rewriting: (begin-for-syntax - (define (compile-drawing drawing-stx) - (syntax-parse drawing-stx - [({~literal drawing} row-stxs ...) - - (syntax/loc drawing-stx - (begin row-stxs ...))])) - - (define (compile-rows row-stx) - (syntax-parse row-stx - [({~literal rows} - ({~literal repeat} repeat-number) - chunks ... - ";") - - (syntax/loc row-stx - (for ([i repeat-number]) - chunks ... - (newline)))])) - - (define (compile-chunk chunk-stx) - (syntax-parse chunk-stx - [({~literal chunk} chunk-size chunk-string) - - (syntax/loc chunk-stx - (for ([k chunk-size]) - (display chunk-string)))]))) + (define (compile-drawing drawing-stx) + (syntax-parse drawing-stx + [({~literal drawing} rows-stxs ...) + + (syntax/loc drawing-stx + (begin rows-stxs ...))])) + + (define (compile-rows rows-stx) + (syntax-parse rows-stx + [({~literal rows} + ({~literal repeat} repeat-number) + chunks ... + ";") + + (syntax/loc rows-stx + (for ([i repeat-number]) + chunks ... + (newline)))])) + + (define (compile-chunk chunk-stx) + (syntax-parse chunk-stx + [({~literal chunk} chunk-size chunk-string) + + (syntax/loc chunk-stx + (for ([k chunk-size]) + (display chunk-string)))]))) ;; Wire up the use of "drawing", "rows", and "chunk" to these