From 81f4fd25e78ce9cbb0abf39b89d85e07637b4cb5 Mon Sep 17 00:00:00 2001 From: Matthew Butterick Date: Sat, 15 Jan 2022 20:29:40 -0800 Subject: [PATCH] lex double-quoted strings with Racket lexer (#32) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Follow up on an idea in #31: use the Racket lexer to lex double-quoted strings in a grammar, thereby giving them all the semantics of Racket strings and avoiding corner cases. This solution can’t be extended to single-quoted strings, however, because even with readtable tomfoolery to recognize a single quote as an opening delimiter for a string, the string still needs to be closed with a double quote. (I don’t know why this should be so, but it is the documented behavior.) Therefore, single-quoted strings are still subject to the homegrown lexing solution and the flaws therein. Still, I don’t see that single-quoted strings have ever been a documented feature of brag (or its predecessor ragg). Perhaps they could be dropped altogether. For now, this solution is satisfying, because double-quoted strings are the dominant notation, and this PR will make them as good as they can be. 
--- brag-lib/brag/examples/codepoints.rkt | 4 +- brag-lib/brag/rules/lexer.rkt | 59 ++++++++++++++------------ brag-lib/brag/test/test-codepoints.rkt | 4 +- 3 files changed, 35 insertions(+), 32 deletions(-) diff --git a/brag-lib/brag/examples/codepoints.rkt b/brag-lib/brag/examples/codepoints.rkt index 2a618f8..1196aef 100644 --- a/brag-lib/brag/examples/codepoints.rkt +++ b/brag-lib/brag/examples/codepoints.rkt @@ -2,5 +2,5 @@ start: A c def hello-world A : "\"\101\\" ; A c : '\'\U0063\\' ; c -def : "*\u64\\\x65f\"" ; de -hello-world : "\150\145\154\154\157\40\167\157\162\154\144" \ No newline at end of file +def : "*\u64\\\"\\\x65f\"" ; de +hello-world : "\150\145\154\154\157\40\167\157\162\154\144" diff --git a/brag-lib/brag/rules/lexer.rkt b/brag-lib/brag/rules/lexer.rkt index 020e8f9..af52495 100755 --- a/brag-lib/brag/rules/lexer.rkt +++ b/brag-lib/brag/rules/lexer.rkt @@ -1,11 +1,13 @@ -#lang at-exp racket/base +#lang racket/base (require (for-syntax racket/base "parser.rkt")) (require br-parser-tools/lex (prefix-in : br-parser-tools/lex-sre) "parser.rkt" "rule-structs.rkt" (only-in brag/support from/to) - racket/string) + racket/string + syntax-color/racket-lexer + racket/match) (provide lex/1 tokenize) (module+ lex-abbrevs @@ -33,8 +35,6 @@ (define-lex-abbrev id (:& (complement (:+ digit)) (:+ id-char))) (define-lex-abbrev id-separator (:or ":" "::=")) -(define-lex-abbrev esc-chars (union "\\a" "\\b" "\\t" "\\n" "\\v" "\\f" "\\r" "\\e")) - (define (unescape-double-quoted-lexeme lexeme start-pos end-pos) ;; use `read` so brag strings have all the notational semantics of Racket strings (with-handlers ([exn:fail:read? 
@@ -66,35 +66,18 @@ (define-lex-abbrev escaped-double-quote (:: backslash double-quote)) (define-lex-abbrev escaped-backslash (:: backslash backslash)) -(define lex/1 +(define brag-lex (lexer-src-pos - [(:: double-quote ;; start with double quote - (intersection ;; two conditions need to be true inside the quotes: - ;; we can have anything except - ;; a plain double-quote (which would close the quote) - ;; plus we specially allow escaped double quotes and backslashes - (:* (:or escaped-double-quote escaped-backslash (:~ double-quote))) - ;; we must forbid one situation with the string \\" - ;; the problem is that it's ambiguous: - ;; it can be lexed as (:: escaped-backlash double-quote) = \\ + " - ;; or (:: backlash escaped-double-quote) = \ + \" - ;; because escapes should be "left associative", - ;; we forbid the second possibility - ;; There are still some weird corner cases but the current tests work. - ;; with single and double quotes in the mix, - ;; I'm not sure how much better this can be. 
- (complement (:: any-string backslash escaped-double-quote any-string))) - double-quote) ;; end with double quote - (token-LIT (unescape-double-quoted-lexeme lexeme start-pos end-pos))] - ;; single-quoted string follows the same pattern, - ;; but with escaped-single-quote instead of escaped-double-quote + ;; we delegate lexing of double-quoted strings to the Racket lexer (see below) + ;; single-quoted string has to be handled manually (see lex/1 for details) [(:: single-quote (intersection (:* (:or escaped-single-quote escaped-backslash (:~ single-quote))) (complement (:: any-string backslash escaped-single-quote any-string))) single-quote) (token-LIT (unescape-double-quoted-lexeme (convert-to-double-quoted lexeme) start-pos end-pos))] - [(:or "()" "Ø" "∅") (token-EMPTY lexeme)] + [(:or "()" "Ø" "∅") + (token-EMPTY lexeme)] ["(" (token-LPAREN lexeme)] ["[" @@ -146,6 +129,28 @@ (position->pos start-pos) (position->pos end-pos-2)))])) +(define (lex/1-with-racket-lexer ip [conversion-proc values]) + ;; delegate lexing of strings to the default Racket lexer + (define-values (line-start col-start pos-start) (port-next-location ip)) + (define str (read ip)) + (define-values (line-end col-end pos-end) (port-next-location ip)) + (make-position-token (token-LIT (string-append "\"" str "\"")) + (make-position pos-start line-start col-start) + (make-position pos-end line-end col-end))) + +(define (lex/1 ip) + (match (peek-bytes 1 0 ip) + [#"\"" (lex/1-with-racket-lexer ip)] + ;; it would be nice to also handle single-quoted strings with the Racket lexer + ;; but we can only change the opening delimiter with the readtable. + ;; for whatever reason, the closing delimiter still has to be a double quote. + ;; "mapping a character to the same action as a " means that the character starts a string, but the string is still terminated with a closing ". 
" + ;; https://docs.racket-lang.org/reference/readtables.html#%28def._%28%28quote._~23~25kernel%29._make-readtable%29%29 + #;[#"'" (parameterize ([current-readtable (make-readtable (current-readtable) + #\' #\" #f)]) + 'lex-single-quoted-string + (lex/1-with-racket-lexer ip convert-to-double-quoted))] + [_ (brag-lex ip)])) ;; This is the helper for the error production. (define lex-nonwhitespace @@ -157,8 +162,6 @@ [(eof) (values "" end-pos)])) - - ;; position->pos: position -> pos ;; Converts position structures from br-parser-tools/lex to our own pos structures. (define (position->pos a-pos) diff --git a/brag-lib/brag/test/test-codepoints.rkt b/brag-lib/brag/test/test-codepoints.rkt index bb3b3b7..13544f8 100755 --- a/brag-lib/brag/test/test-codepoints.rkt +++ b/brag-lib/brag/test/test-codepoints.rkt @@ -3,8 +3,8 @@ (require brag/examples/codepoints rackunit) -(check-equal? (parse-to-datum '("\"A\\" "'c\\" "*d\\ef\"" "hello world")) +(check-equal? (parse-to-datum '("\"A\\" "'c\\" "*d\\\"\\ef\"" "hello world")) '(start (A "\"A\\") (c "'c\\") - (def "*d\\ef\"") + (def "*d\\\"\\ef\"") (hello-world "hello world")))