From 6983208426fc4970b3753eea075cc492b64d5103 Mon Sep 17 00:00:00 2001 From: Markus Pfeiffer Date: Tue, 16 Nov 2021 17:35:40 +0000 Subject: [PATCH 1/9] Fix make-rule-parser (#28) When using `make-rule-parser` for any rule that is not the start rule, and applying the resulting parser while giving it a source-path it would just use the entire grammar from the start rule. --- brag-lib/brag/codegen/expander.rkt | 61 ++++++++++---------- brag-lib/brag/codegen/runtime.rkt | 4 +- brag-lib/brag/examples/subrule.rkt | 4 ++ brag-lib/brag/test/test-make-rule-parser.rkt | 17 ++++++ 4 files changed, 52 insertions(+), 34 deletions(-) create mode 100644 brag-lib/brag/examples/subrule.rkt create mode 100644 brag-lib/brag/test/test-make-rule-parser.rkt diff --git a/brag-lib/brag/codegen/expander.rkt b/brag-lib/brag/codegen/expander.rkt index 39690d7..cf1d137 100755 --- a/brag-lib/brag/codegen/expander.rkt +++ b/brag-lib/brag/codegen/expander.rkt @@ -18,7 +18,7 @@ (define-for-syntax (rules->token-types rules) (define-values (implicit-tokens explicit-tokens) (rules-collect-token-types rules)) (remove-duplicates (append (for/list ([it (in-list implicit-tokens)]) - (string->symbol (syntax-e it))) + (string->symbol (syntax-e it))) (map syntax-e explicit-tokens)) eq?)) (define-syntax (brag-module-begin rules-stx) @@ -38,7 +38,7 @@ (with-syntax ([START-ID (first rule-ids)] ; The first rule, by default, is the start rule. [((TOKEN-TYPE . TOKEN-TYPE-CONSTRUCTOR) ...) (for/list ([tt (in-list (rules->token-types rules))]) - (cons tt (string->symbol (format "token-~a" tt))))] + (cons tt (string->symbol (format "token-~a" tt))))] ;; Flatten rules to use the yacc-style ruleset that br-parser-tools supports [GENERATED-RULE-CODES (map flat-rule->yacc-rule (flatten-rules rules))] ;; main exports. Break hygiene so they're also available at top-level / repl @@ -71,41 +71,38 @@ (cons eof token-EOF) (cons 'TOKEN-TYPE TOKEN-TYPE-CONSTRUCTOR) ...))) - (define-syntax (MAKE-RULE-PARSER rule-id-stx) - (syntax-case rule-id-stx () - [(_ start-rule) - (and (identifier? #'start-rule) - (member (syntax-e #'start-rule) 'RULE-IDS)) + (define-syntax (MAKE-RULE-PARSER stx) + (syntax-case stx () + [(_ START-RULE-ID) + (and (identifier? #'START-RULE-ID) (member (syntax-e #'START-RULE-ID) 'RULE-IDS)) ;; The cfg-parser depends on the start-rule provided in (start ...) to have the same ;; context as the rest of this body. Hence RECOLORED-START-RULE - (with-syntax ([RECOLORED-START-RULE (datum->syntax #'RULES-STX (syntax-e #'start-rule))]) - #'(let ([THE-GRAMMAR (cfg-parser (tokens enumerated-tokens) - (src-pos) - (start RECOLORED-START-RULE) - (end EOF) - (error THE-ERROR-HANDLER) - (grammar . GENERATED-RULE-CODES))]) - (procedure-rename - (case-lambda [(tokenizer) - (define next-token - (make-permissive-tokenizer tokenizer all-tokens-hash/mutable)) - ;; little post-processor to support cuts on top rule name - (define parse-tree-stx (THE-GRAMMAR next-token)) - (define top-rule-name-stx (syntax-case parse-tree-stx () - [(TRN . REST) #'TRN] - [_ (error 'malformed-parse-tree)])) - (if (eq? (syntax-property top-rule-name-stx 'hide-or-splice?) 'hide) - ;; use `remove-rule-name` so we get the same housekeeping - (remove-rule-name parse-tree-stx) - parse-tree-stx)] - [(source tokenizer) - (parameterize ([current-source source]) - (PARSE tokenizer))]) - (string->symbol (format "~a-rule-parser" 'start-rule)))))] + (with-syntax ([RECOLORED-START-RULE (datum->syntax #'RULES-STX (syntax-e #'START-RULE-ID))]) + #'(let () + (define (rule-parser tokenizer) + (define rule-grammar (cfg-parser (tokens enumerated-tokens) + (src-pos) + (start RECOLORED-START-RULE) + (end EOF) + (error the-error-handler) + (grammar . GENERATED-RULE-CODES))) + (define next-token (make-permissive-tokenizer tokenizer all-tokens-hash/mutable)) + ;; here's how we support grammar "cuts" on top rule name + (define parse-tree-stx (rule-grammar next-token)) + (syntax-case parse-tree-stx () + [(TOP-RULE-NAME . _) + (if (eq? (syntax-property #'TOP-RULE-NAME 'hide-or-splice?) 'hide) + (remove-rule-name parse-tree-stx) ; use `remove-rule-name` so we get the same housekeeping + parse-tree-stx)] + [_ (error 'malformed-parse-tree)])) + (case-lambda [(tokenizer) (rule-parser tokenizer)] + [(source tokenizer) + (parameterize ([current-source source]) + (rule-parser tokenizer))])))] [(_ not-a-rule-id) (raise-syntax-error #f (format "Rule ~a is not defined in the grammar" (syntax-e #'not-a-rule-id)) - rule-id-stx)])) + stx)])) ;; start-id has to be a value, not an expr, because make-rule-parser is a macro (define PARSE (procedure-rename (MAKE-RULE-PARSER START-ID) 'PARSE)) diff --git a/brag-lib/brag/codegen/runtime.rkt b/brag-lib/brag/codegen/runtime.rkt index 315caf4..d2ef10b 100755 --- a/brag-lib/brag/codegen/runtime.rkt +++ b/brag-lib/brag/codegen/runtime.rkt @@ -7,7 +7,7 @@ brag/private/internal-support) -(provide THE-ERROR-HANDLER +(provide the-error-handler make-permissive-tokenizer atomic-datum->syntax positions->srcloc @@ -19,7 +19,7 @@ ;; The level of indirection here is necessary since the yacc grammar wants a ;; function value for the error handler up front. We want to delay that decision ;; till parse time. -(define (THE-ERROR-HANDLER tok-ok? tok-name tok-value start-pos end-pos) +(define (the-error-handler tok-ok? tok-name tok-value start-pos end-pos) (match (positions->srcloc start-pos end-pos) [(list src line col offset span) ((current-parser-error-handler) tok-name diff --git a/brag-lib/brag/examples/subrule.rkt b/brag-lib/brag/examples/subrule.rkt new file mode 100644 index 0000000..7183bfc --- /dev/null +++ b/brag-lib/brag/examples/subrule.rkt @@ -0,0 +1,4 @@ +#lang brag + +start: next +next: "0" diff --git a/brag-lib/brag/test/test-make-rule-parser.rkt b/brag-lib/brag/test/test-make-rule-parser.rkt new file mode 100644 index 0000000..51a47a2 --- /dev/null +++ b/brag-lib/brag/test/test-make-rule-parser.rkt @@ -0,0 +1,17 @@ +#lang racket/base +(require rackunit + brag/support + brag/examples/subrule) + +(define parse-next (make-rule-parser next)) +(define parse-start (make-rule-parser start)) + +(check-equal? (syntax->datum (parse #f "0")) '(start (next "0"))) +(check-equal? (syntax->datum (parse #f "0")) (syntax->datum (parse "0"))) + +(check-equal? (syntax->datum (parse-start #f "0")) '(start (next "0"))) +(check-equal? (syntax->datum (parse-start #f "0")) (syntax->datum (parse-start "0"))) + +(check-equal? (syntax->datum (parse-next #f "0")) '(next "0")) +(check-equal? (syntax->datum (parse-next #f "0")) (syntax->datum (parse-next "0"))) + -- 2.25.1 From 92b7dcc0676e92ca689b72b9f78aea7e570e0f06 Mon Sep 17 00:00:00 2001 From: "D. Ben Knoble" Date: Tue, 16 Nov 2021 12:56:27 -0500 Subject: [PATCH 2/9] scribble: fix indentation in file examples (#27) I copied in the actual file contents. It would perhaps be safest to do some sort of "read the file and splice the contents in," but I'm not sure how to do that at the moment. --- brag/brag/brag.scrbl | 82 ++++++++++++++++++++++++-------------------- 1 file changed, 45 insertions(+), 37 deletions(-) diff --git a/brag/brag/brag.scrbl b/brag/brag/brag.scrbl index 6c40f6c..2babb1d 100755 --- a/brag/brag/brag.scrbl +++ b/brag/brag/brag.scrbl @@ -538,16 +538,24 @@ Here's the definition for brag/examples/simple-line-drawing/semantics #:read my-read #:read-syntax my-read-syntax + #:info my-get-info #:whole-body-readers? #t (require brag/examples/simple-line-drawing/lexer - brag/examples/simple-line-drawing/grammar) + brag/examples/simple-line-drawing/grammar) (define (my-read in) - (syntax->datum (my-read-syntax #f in))) + (syntax->datum (my-read-syntax #f in))) (define (my-read-syntax src ip) - (list (parse src (tokenize ip)))) + (list (parse src (tokenize ip)))) + + (define (my-get-info key default default-filter) + (case key + [(color-lexer) + (dynamic-require 'syntax-color/default-lexer 'default-lexer)] + [else + (default-filter key default)])) }| } @@ -564,43 +572,43 @@ compilation: (require (for-syntax racket/base syntax/parse)) (provide #%module-begin - ;; We reuse Racket's treatment of raw datums, specifically - ;; for strings and numbers: - #%datum - - ;; And otherwise, we provide definitions of these three forms. - ;; During compiliation, Racket uses these definitions to - ;; rewrite into for loops, displays, and newlines. - drawing rows chunk) + ;; We reuse Racket's treatment of raw datums, specifically + ;; for strings and numbers: + #%datum + + ;; And otherwise, we provide definitions of these three forms. + ;; During compiliation, Racket uses these definitions to + ;; rewrite into for loops, displays, and newlines. + drawing rows chunk) ;; Define a few compile-time functions to do the syntax rewriting: (begin-for-syntax - (define (compile-drawing drawing-stx) - (syntax-parse drawing-stx - [({~literal drawing} rows-stxs ...) - - (syntax/loc drawing-stx - (begin rows-stxs ...))])) - - (define (compile-rows rows-stx) - (syntax-parse rows-stx - [({~literal rows} - ({~literal repeat} repeat-number) - chunks ... - ";") - - (syntax/loc rows-stx - (for ([i repeat-number]) - chunks ... - (newline)))])) - - (define (compile-chunk chunk-stx) - (syntax-parse chunk-stx - [({~literal chunk} chunk-size chunk-string) - - (syntax/loc chunk-stx - (for ([k chunk-size]) - (display chunk-string)))]))) + (define (compile-drawing drawing-stx) + (syntax-parse drawing-stx + [({~literal drawing} row-stxs ...) + + (syntax/loc drawing-stx + (begin row-stxs ...))])) + + (define (compile-rows row-stx) + (syntax-parse row-stx + [({~literal rows} + ({~literal repeat} repeat-number) + chunks ... + ";") + + (syntax/loc row-stx + (for ([i repeat-number]) + chunks ... + (newline)))])) + + (define (compile-chunk chunk-stx) + (syntax-parse chunk-stx + [({~literal chunk} chunk-size chunk-string) + + (syntax/loc chunk-stx + (for ([k chunk-size]) + (display chunk-string)))]))) ;; Wire up the use of "drawing", "rows", and "chunk" to these -- 2.25.1 From 1fdd63cd1768b15df8d802dc1fb2a3dd0350db38 Mon Sep 17 00:00:00 2001 From: Matthew Butterick Date: Tue, 11 Jan 2022 19:47:20 -0800 Subject: [PATCH 3/9] refactor to make things clearer --- brag-lib/brag/rules/lexer.rkt | 34 +++++++++++++++---------------- brag-lib/brag/test/test-lexer.rkt | 6 ++++-- 2 files changed, 21 insertions(+), 19 deletions(-) diff --git a/brag-lib/brag/rules/lexer.rkt b/brag-lib/brag/rules/lexer.rkt index 925e6c9..7d19c95 100755 --- a/brag-lib/brag/rules/lexer.rkt +++ b/brag-lib/brag/rules/lexer.rkt @@ -1,4 +1,4 @@ -#lang racket/base +#lang debug at-exp racket/base (require (for-syntax racket/base "parser.rkt")) (require br-parser-tools/lex (prefix-in : br-parser-tools/lex-sre) @@ -35,28 +35,28 @@ (define-lex-abbrev esc-chars (union "\\a" "\\b" "\\t" "\\n" "\\v" "\\f" "\\r" "\\e")) -(define (unescape-lexeme lexeme quote-char) - ;; convert the literal string representation back into an escape char with lookup table - (define unescapes (hash "a" 7 "b" 8 "t" 9 "n" 10 "v" 11 "f" 12 "r" 13 "e" 27 "\"" 34 "'" 39 "\\" 92)) - (define pat (regexp (format "(?<=^~a\\\\).(?=~a$)" quote-char quote-char))) - (cond - [(regexp-match pat lexeme) - => (λ (m) (string quote-char (integer->char (hash-ref unescapes (car m))) quote-char))] - [else lexeme])) +(require syntax-color/racket-lexer) +(define (unescape-double-quoted-lexeme lexeme) + (list->string `(#\" ,@(string->list (read (open-input-string lexeme))) #\"))) + +(define (convert-to-double-quoted lexeme) + (define outside-quotes-removed (string-trim lexeme "'")) + (define single-quotes-unescaped (string-replace outside-quotes-removed "\\'" "'")) + (define double-quotes-escaped (string-replace single-quotes-unescaped "\"" "\\\"")) + (define double-quotes-on-ends (string-append "\"" double-quotes-escaped "\"")) + double-quotes-on-ends) (define lex/1 (lexer-src-pos ;; handle whitespace & escape chars within quotes as literal tokens: "\n" "\t" '\n' '\t' ;; match the escaped version, and then unescape them before they become token-LITs - [(:: "'" - (:or (:* (:or "\\'" esc-chars (:~ "'" "\\"))) "\\\\") - "'") - (token-LIT (unescape-lexeme lexeme #\'))] - [(:: "\"" - (:or (:* (:or "\\\"" esc-chars (:~ "\"" "\\"))) "\\\\") - "\"") - (token-LIT (unescape-lexeme lexeme #\"))] + [(:or (:: "'\\\\'") ; aka '\\' + (:: "'" (:* (:or "\\'" esc-chars (:~ "'" "\\"))) "'")) + (token-LIT (unescape-double-quoted-lexeme (convert-to-double-quoted lexeme)))] + [(:or (:: "\"\\\\\"") ; aka "\\" + (:: "\"" (:* (:or "\\\"" esc-chars (:~ "\"" "\\"))) "\"")) + (token-LIT (unescape-double-quoted-lexeme lexeme))] [(:or "()" "Ø" "∅") (token-EMPTY lexeme)] ["(" (token-LPAREN lexeme)] diff --git a/brag-lib/brag/test/test-lexer.rkt b/brag-lib/brag/test/test-lexer.rkt index 49f3774..8e7830d 100755 --- a/brag-lib/brag/test/test-lexer.rkt +++ b/brag-lib/brag/test/test-lexer.rkt @@ -51,11 +51,13 @@ (check-equal? (l "]") '(RBRACKET "]" 1 2)) +;; 220111: lexer now converts single-quoted lexemes +;; to standard Racket-style double-quoted string literal (check-equal? (l "'hello'") - '(LIT "'hello'" 1 8)) + '(LIT "\"hello\"" 1 8)) (check-equal? (l "'he\\'llo'") - '(LIT "'he\\'llo'" 1 10)) + '(LIT "\"he'llo\"" 1 10)) (check-equal? (l "/") '(HIDE "/" 1 2)) -- 2.25.1 From 2eb0b5c920d7d7f78338cbf1c26e54c23b68ad10 Mon Sep 17 00:00:00 2001 From: Matthew Butterick Date: Thu, 13 Jan 2022 04:51:47 -0800 Subject: [PATCH 4/9] support codepoints & pass current tests --- brag-lib/brag/examples/codepoints.rkt | 5 +++++ brag-lib/brag/rules/lexer.rkt | 22 ++++++++++++++++++---- 2 files changed, 23 insertions(+), 4 deletions(-) create mode 100644 brag-lib/brag/examples/codepoints.rkt diff --git a/brag-lib/brag/examples/codepoints.rkt b/brag-lib/brag/examples/codepoints.rkt new file mode 100644 index 0000000..c8be6f9 --- /dev/null +++ b/brag-lib/brag/examples/codepoints.rkt @@ -0,0 +1,5 @@ +#lang brag +start: "\101" ; A + | "\U0063" ; c + | "\u64" ; d + | "\x65" ; e \ No newline at end of file diff --git a/brag-lib/brag/rules/lexer.rkt b/brag-lib/brag/rules/lexer.rkt index 7d19c95..8ab5493 100755 --- a/brag-lib/brag/rules/lexer.rkt +++ b/brag-lib/brag/rules/lexer.rkt @@ -47,15 +47,29 @@ (define double-quotes-on-ends (string-append "\"" double-quotes-escaped "\"")) double-quotes-on-ends) +(define-lex-abbrev escaped-single-quote "\\'") +(define-lex-abbrev single-quote "'") +(define-lex-abbrev escaped-double-quote "\\\"") +(define-lex-abbrev double-quote "\"") +(define-lex-abbrev escaped-backslash "\\\\") + (define lex/1 (lexer-src-pos ;; handle whitespace & escape chars within quotes as literal tokens: "\n" "\t" '\n' '\t' ;; match the escaped version, and then unescape them before they become token-LITs - [(:or (:: "'\\\\'") ; aka '\\' - (:: "'" (:* (:or "\\'" esc-chars (:~ "'" "\\"))) "'")) + [(:: single-quote + (:or + (:+ escaped-backslash) ; aka '\\' + (intersection (:* (:or escaped-single-quote (:~ single-quote))) + (complement (:: escaped-backslash any-string)))) + single-quote) (token-LIT (unescape-double-quoted-lexeme (convert-to-double-quoted lexeme)))] - [(:or (:: "\"\\\\\"") ; aka "\\" - (:: "\"" (:* (:or "\\\"" esc-chars (:~ "\"" "\\"))) "\"")) + [(:: double-quote + (:or + (:+ escaped-backslash) ; aka "\\" + (intersection (:* (:or escaped-double-quote (:~ double-quote))) + (complement (:: escaped-backslash any-string)))) + double-quote) (token-LIT (unescape-double-quoted-lexeme lexeme))] [(:or "()" "Ø" "∅") (token-EMPTY lexeme)] ["(" -- 2.25.1 From a49c932a548f40846e54bc27824431154698a60b Mon Sep 17 00:00:00 2001 From: Matthew Butterick Date: Fri, 14 Jan 2022 08:21:13 -0800 Subject: [PATCH 5/9] correct ambiguity with certain escape sequences --- brag-lib/brag/rules/lexer.rkt | 47 +++++++++++++++++++++++------------ 1 file changed, 31 insertions(+), 16 deletions(-) diff --git a/brag-lib/brag/rules/lexer.rkt b/brag-lib/brag/rules/lexer.rkt index 8ab5493..874bce3 100755 --- a/brag-lib/brag/rules/lexer.rkt +++ b/brag-lib/brag/rules/lexer.rkt @@ -38,39 +38,54 @@ (require syntax-color/racket-lexer) (define (unescape-double-quoted-lexeme lexeme) + ;; use `read` so brag strings have all the notational semantics of Racket strings (list->string `(#\" ,@(string->list (read (open-input-string lexeme))) #\"))) (define (convert-to-double-quoted lexeme) + ;; brag supports single-quoted strings, for some reason + ;; (Racket does not. A single quote denotes a datum) + ;; let's convert a single-quoted string into standard double-quoted style + ;; so we can use Racket's `read` function on it. + ;; and thereby support all the standard Racket string elements: + ;; https://docs.racket-lang.org/reference/reader.html#%28part._parse-string%29 (define outside-quotes-removed (string-trim lexeme "'")) (define single-quotes-unescaped (string-replace outside-quotes-removed "\\'" "'")) (define double-quotes-escaped (string-replace single-quotes-unescaped "\"" "\\\"")) (define double-quotes-on-ends (string-append "\"" double-quotes-escaped "\"")) double-quotes-on-ends) -(define-lex-abbrev escaped-single-quote "\\'") +(define-lex-abbrev backslash "\\") (define-lex-abbrev single-quote "'") -(define-lex-abbrev escaped-double-quote "\\\"") +(define-lex-abbrev escaped-single-quote (:: backslash single-quote)) (define-lex-abbrev double-quote "\"") -(define-lex-abbrev escaped-backslash "\\\\") +(define-lex-abbrev escaped-double-quote (:: backslash double-quote)) +(define-lex-abbrev escaped-backslash (:: backslash backslash)) (define lex/1 (lexer-src-pos - ;; handle whitespace & escape chars within quotes as literal tokens: "\n" "\t" '\n' '\t' - ;; match the escaped version, and then unescape them before they become token-LITs + [(:: double-quote ;; start with double quote + (intersection ;; two conditions need to be true inside the quotes: + ;; we can have anything except + ;; a plain double-quote (which would close the quote) + ;; plus we specially allow escaped double quotes and backslashes + (:* (:or escaped-double-quote escaped-backslash (:~ double-quote))) + ;; we must forbid one situation with the string \\" + ;; the problem is that it's ambiguous: + ;; it can be lexed as (:: escaped-backlash double-quote) = \\ + " + ;; or (:: backlash escaped-double-quote) = \ + \" + ;; because escapes should be "left associative", + ;; we forbid the second possibility + (complement (:: any-string backslash escaped-double-quote any-string))) + double-quote) ;; end with double quote + (token-LIT (unescape-double-quoted-lexeme lexeme))] + ;; single-quoted string follows the same pattern, + ;; but with escaped-single-quote instead of escaped-double-quote [(:: single-quote - (:or - (:+ escaped-backslash) ; aka '\\' - (intersection (:* (:or escaped-single-quote (:~ single-quote))) - (complement (:: escaped-backslash any-string)))) + (intersection + (:* (:or escaped-single-quote escaped-backslash (:~ single-quote))) + (complement (:: any-string backslash escaped-single-quote any-string))) single-quote) (token-LIT (unescape-double-quoted-lexeme (convert-to-double-quoted lexeme)))] - [(:: double-quote - (:or - (:+ escaped-backslash) ; aka "\\" - (intersection (:* (:or escaped-double-quote (:~ double-quote))) - (complement (:: escaped-backslash any-string)))) - double-quote) - (token-LIT (unescape-double-quoted-lexeme lexeme))] [(:or "()" "Ø" "∅") (token-EMPTY lexeme)] ["(" (token-LPAREN lexeme)] -- 2.25.1 From b0b073d3934d651b167438a9096ad83008bbba7d Mon Sep 17 00:00:00 2001 From: Matthew Butterick Date: Fri, 14 Jan 2022 08:21:23 -0800 Subject: [PATCH 6/9] simple codepoint test --- brag-lib/brag/examples/codepoints.rkt | 9 +++++---- brag-lib/brag/test/test-codepoints.rkt | 6 ++++++ 2 files changed, 11 insertions(+), 4 deletions(-) create mode 100755 brag-lib/brag/test/test-codepoints.rkt diff --git a/brag-lib/brag/examples/codepoints.rkt b/brag-lib/brag/examples/codepoints.rkt index c8be6f9..f92afa7 100644 --- a/brag-lib/brag/examples/codepoints.rkt +++ b/brag-lib/brag/examples/codepoints.rkt @@ -1,5 +1,6 @@ #lang brag -start: "\101" ; A - | "\U0063" ; c - | "\u64" ; d - | "\x65" ; e \ No newline at end of file +start: A c d e +A : "\101" ; A +c : "\U0063" ; c +d : "\u64" ; d +e : "\x65" ; e \ No newline at end of file diff --git a/brag-lib/brag/test/test-codepoints.rkt b/brag-lib/brag/test/test-codepoints.rkt new file mode 100755 index 0000000..751804f --- /dev/null +++ b/brag-lib/brag/test/test-codepoints.rkt @@ -0,0 +1,6 @@ +#lang racket/base + +(require brag/examples/codepoints + rackunit) + +(check-equal? (parse-to-datum "Acde") '(start (A "A") (c "c") (d "d") (e "e"))) -- 2.25.1 From 3414c5a0081d2fb12c9c4f401ac75cfb738d13fa Mon Sep 17 00:00:00 2001 From: Matthew Butterick Date: Fri, 14 Jan 2022 08:42:25 -0800 Subject: [PATCH 7/9] harder test --- brag-lib/brag/examples/codepoints.rkt | 9 ++++----- brag-lib/brag/test/test-codepoints.rkt | 3 ++- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/brag-lib/brag/examples/codepoints.rkt b/brag-lib/brag/examples/codepoints.rkt index f92afa7..c4be182 100644 --- a/brag-lib/brag/examples/codepoints.rkt +++ b/brag-lib/brag/examples/codepoints.rkt @@ -1,6 +1,5 @@ #lang brag -start: A c d e -A : "\101" ; A -c : "\U0063" ; c -d : "\u64" ; d -e : "\x65" ; e \ No newline at end of file +start: A c def +A : "\"\101\\" ; A +c : '\'\U0063\\' ; c +def : "*\u64\x65f" ; de \ No newline at end of file diff --git a/brag-lib/brag/test/test-codepoints.rkt b/brag-lib/brag/test/test-codepoints.rkt index 751804f..d92652f 100755 --- a/brag-lib/brag/test/test-codepoints.rkt +++ b/brag-lib/brag/test/test-codepoints.rkt @@ -3,4 +3,5 @@ (require brag/examples/codepoints rackunit) -(check-equal? (parse-to-datum "Acde") '(start (A "A") (c "c") (d "d") (e "e"))) +(check-equal? (parse-to-datum '("\"A\\" "'c\\" "*def")) + '(start (A "\"A\\") (c "'c\\") (def "*def"))) -- 2.25.1 From 68935b805e4188c326689d35ad013877557558cc Mon Sep 17 00:00:00 2001 From: Matthew Butterick Date: Fri, 14 Jan 2022 11:51:02 -0800 Subject: [PATCH 8/9] report string-reading error as grammar-parsing error --- brag-lib/brag/rules/lexer.rkt | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/brag-lib/brag/rules/lexer.rkt b/brag-lib/brag/rules/lexer.rkt index 874bce3..9ff7976 100755 --- a/brag-lib/brag/rules/lexer.rkt +++ b/brag-lib/brag/rules/lexer.rkt @@ -35,11 +35,16 @@ (define-lex-abbrev esc-chars (union "\\a" "\\b" "\\t" "\\n" "\\v" "\\f" "\\r" "\\e")) -(require syntax-color/racket-lexer) - -(define (unescape-double-quoted-lexeme lexeme) +(define (unescape-double-quoted-lexeme lexeme start-pos end-pos) ;; use `read` so brag strings have all the notational semantics of Racket strings - (list->string `(#\" ,@(string->list (read (open-input-string lexeme))) #\"))) + (with-handlers ([exn:fail:read? + (λ (e) ((current-parser-error-handler) + #f + 'error + lexeme + (position->pos start-pos) + (position->pos end-pos)))]) + (list->string `(#\" ,@(string->list (read (open-input-string lexeme))) #\")))) (define (convert-to-double-quoted lexeme) ;; brag supports single-quoted strings, for some reason @@ -77,7 +82,9 @@ ;; we forbid the second possibility (complement (:: any-string backslash escaped-double-quote any-string))) double-quote) ;; end with double quote - (token-LIT (unescape-double-quoted-lexeme lexeme))] + (let () + (displayln lexeme) + (token-LIT (unescape-double-quoted-lexeme lexeme start-pos end-pos)))] ;; single-quoted string follows the same pattern, ;; but with escaped-single-quote instead of escaped-double-quote [(:: single-quote @@ -85,7 +92,7 @@ (:* (:or escaped-single-quote escaped-backslash (:~ single-quote))) (complement (:: any-string backslash escaped-single-quote any-string))) single-quote) - (token-LIT (unescape-double-quoted-lexeme (convert-to-double-quoted lexeme)))] + (token-LIT (unescape-double-quoted-lexeme (convert-to-double-quoted lexeme) start-pos end-pos))] [(:or "()" "Ø" "∅") (token-EMPTY lexeme)] ["(" (token-LPAREN lexeme)] -- 2.25.1 From 1b19a1cb0834e1efe4cb5cd46dceeae90a06cd6c Mon Sep 17 00:00:00 2001 From: Matthew Butterick Date: Fri, 14 Jan 2022 23:00:15 -0800 Subject: [PATCH 9/9] more torturous test --- brag-lib/brag/examples/codepoints.rkt | 5 +++-- brag-lib/brag/rules/lexer.rkt | 11 ++++++----- brag-lib/brag/test/test-codepoints.rkt | 7 +++++-- 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/brag-lib/brag/examples/codepoints.rkt b/brag-lib/brag/examples/codepoints.rkt index c4be182..2a618f8 100644 --- a/brag-lib/brag/examples/codepoints.rkt +++ b/brag-lib/brag/examples/codepoints.rkt @@ -1,5 +1,6 @@ #lang brag -start: A c def +start: A c def hello-world A : "\"\101\\" ; A c : '\'\U0063\\' ; c -def : "*\u64\x65f" ; de \ No newline at end of file +def : "*\u64\\\x65f\"" ; de +hello-world : "\150\145\154\154\157\40\167\157\162\154\144" \ No newline at end of file diff --git a/brag-lib/brag/rules/lexer.rkt b/brag-lib/brag/rules/lexer.rkt index 9ff7976..020e8f9 100755 --- a/brag-lib/brag/rules/lexer.rkt +++ b/brag-lib/brag/rules/lexer.rkt @@ -1,4 +1,4 @@ -#lang debug at-exp racket/base +#lang at-exp racket/base (require (for-syntax racket/base "parser.rkt")) (require br-parser-tools/lex (prefix-in : br-parser-tools/lex-sre) @@ -79,12 +79,13 @@ ;; it can be lexed as (:: escaped-backlash double-quote) = \\ + " ;; or (:: backlash escaped-double-quote) = \ + \" ;; because escapes should be "left associative", - ;; we forbid the second possibility + ;; we forbid the second possibility + ;; There are still some weird corner cases but the current tests work. + ;; with single and double quotes in the mix, + ;; I'm not sure how much better this can be. (complement (:: any-string backslash escaped-double-quote any-string))) double-quote) ;; end with double quote - (let () - (displayln lexeme) - (token-LIT (unescape-double-quoted-lexeme lexeme start-pos end-pos)))] + (token-LIT (unescape-double-quoted-lexeme lexeme start-pos end-pos))] ;; single-quoted string follows the same pattern, ;; but with escaped-single-quote instead of escaped-double-quote [(:: single-quote diff --git a/brag-lib/brag/test/test-codepoints.rkt b/brag-lib/brag/test/test-codepoints.rkt index d92652f..bb3b3b7 100755 --- a/brag-lib/brag/test/test-codepoints.rkt +++ b/brag-lib/brag/test/test-codepoints.rkt @@ -3,5 +3,8 @@ (require brag/examples/codepoints rackunit) -(check-equal? (parse-to-datum '("\"A\\" "'c\\" "*def")) - '(start (A "\"A\\") (c "'c\\") (def "*def"))) +(check-equal? (parse-to-datum '("\"A\\" "'c\\" "*d\\ef\"" "hello world")) + '(start (A "\"A\\") + (c "'c\\") + (def "*d\\ef\"") + (hello-world "hello world"))) -- 2.25.1