From b7bb0aa50ae5707e81102edaba1680ef59fdbb6c Mon Sep 17 00:00:00 2001 From: Matthew Butterick Date: Sat, 15 Jan 2022 12:40:15 -0800 Subject: [PATCH 1/3] use Racket lexer for double-quoted strings --- brag-lib/brag/examples/codepoints.rkt | 4 ++-- brag-lib/brag/rules/lexer.rkt | 27 ++++++++++++++++++++------ brag-lib/brag/test/test-codepoints.rkt | 4 ++-- 3 files changed, 25 insertions(+), 10 deletions(-) diff --git a/brag-lib/brag/examples/codepoints.rkt b/brag-lib/brag/examples/codepoints.rkt index 2a618f8..1196aef 100644 --- a/brag-lib/brag/examples/codepoints.rkt +++ b/brag-lib/brag/examples/codepoints.rkt @@ -2,5 +2,5 @@ start: A c def hello-world A : "\"\101\\" ; A c : '\'\U0063\\' ; c -def : "*\u64\\\x65f\"" ; de -hello-world : "\150\145\154\154\157\40\167\157\162\154\144" \ No newline at end of file +def : "*\u64\\\"\\\x65f\"" ; de +hello-world : "\150\145\154\154\157\40\167\157\162\154\144" diff --git a/brag-lib/brag/rules/lexer.rkt b/brag-lib/brag/rules/lexer.rkt index 020e8f9..8f9abad 100755 --- a/brag-lib/brag/rules/lexer.rkt +++ b/brag-lib/brag/rules/lexer.rkt @@ -1,11 +1,12 @@ -#lang at-exp racket/base +#lang debug racket/base (require (for-syntax racket/base "parser.rkt")) (require br-parser-tools/lex (prefix-in : br-parser-tools/lex-sre) "parser.rkt" "rule-structs.rkt" (only-in brag/support from/to) - racket/string) + racket/string + syntax-color/racket-lexer) (provide lex/1 tokenize) (module+ lex-abbrevs @@ -66,9 +67,10 @@ (define-lex-abbrev escaped-double-quote (:: backslash double-quote)) (define-lex-abbrev escaped-backslash (:: backslash backslash)) -(define lex/1 +(define brag-lex (lexer-src-pos - [(:: double-quote ;; start with double quote + ;; we are delegating lexing of double-quoted strings to the Racket lexer (see below) + #;[(:: double-quote ;; start with double quote (intersection ;; two conditions need to be true inside the quotes: ;; we can have anything except ;; a plain double-quote (which would close the quote) @@ -146,6 +148,21 @@ (position->pos start-pos) (position->pos end-pos-2)))])) +(define (lex/1 ip) + (cond + [(equal? (peek-bytes 1 0 ip) #"\"") + ;; delegate lexing of strings to the default Racket lexer + ;; this doesn't yet work with single-quoted strings + ;; but maybe there's a way to do that by messing with the readtable + (define-values (line-start col-start pos-start) (port-next-location ip)) + (define-values (str type paren also-pos-start also-pos-end) (racket-lexer ip)) + (define-values (line-end col-end pos-end) (port-next-location ip)) + (make-position-token (token-LIT (string-append "\"" + (read (open-input-string str)) + "\"")) + (make-position pos-start line-start col-start) + (make-position pos-end line-end col-end))] + [else (brag-lex ip)])) ;; This is the helper for the error production. (define lex-nonwhitespace @@ -157,8 +174,6 @@ [(eof) (values "" end-pos)])) - - ;; position->pos: position -> pos ;; Converts position structures from br-parser-tools/lex to our own pos structures. (define (position->pos a-pos) diff --git a/brag-lib/brag/test/test-codepoints.rkt b/brag-lib/brag/test/test-codepoints.rkt index bb3b3b7..13544f8 100755 --- a/brag-lib/brag/test/test-codepoints.rkt +++ b/brag-lib/brag/test/test-codepoints.rkt @@ -3,8 +3,8 @@ (require brag/examples/codepoints rackunit) -(check-equal? (parse-to-datum '("\"A\\" "'c\\" "*d\\ef\"" "hello world")) +(check-equal? (parse-to-datum '("\"A\\" "'c\\" "*d\\\"\\ef\"" "hello world")) '(start (A "\"A\\") (c "'c\\") - (def "*d\\ef\"") + (def "*d\\\"\\ef\"") (hello-world "hello world"))) -- 2.25.1 From 977983e0cb9119f434e2352eac2d8254e5a66dfd Mon Sep 17 00:00:00 2001 From: Matthew Butterick Date: Sat, 15 Jan 2022 12:53:39 -0800 Subject: [PATCH 2/3] step toward using readtable --- brag-lib/brag/rules/lexer.rkt | 85 +++++++++-------------------------- 1 file changed, 22 insertions(+), 63 deletions(-) diff --git a/brag-lib/brag/rules/lexer.rkt b/brag-lib/brag/rules/lexer.rkt index 8f9abad..aa9d9a8 100755 --- a/brag-lib/brag/rules/lexer.rkt +++ b/brag-lib/brag/rules/lexer.rkt @@ -6,7 +6,8 @@ "rule-structs.rkt" (only-in brag/support from/to) racket/string - syntax-color/racket-lexer) + syntax-color/racket-lexer + racket/match) (provide lex/1 tokenize) (module+ lex-abbrevs @@ -34,19 +35,6 @@ (define-lex-abbrev id (:& (complement (:+ digit)) (:+ id-char))) (define-lex-abbrev id-separator (:or ":" "::=")) -(define-lex-abbrev esc-chars (union "\\a" "\\b" "\\t" "\\n" "\\v" "\\f" "\\r" "\\e")) - -(define (unescape-double-quoted-lexeme lexeme start-pos end-pos) - ;; use `read` so brag strings have all the notational semantics of Racket strings - (with-handlers ([exn:fail:read? - (λ (e) ((current-parser-error-handler) - #f - 'error - lexeme - (position->pos start-pos) - (position->pos end-pos)))]) - (list->string `(#\" ,@(string->list (read (open-input-string lexeme))) #\")))) - (define (convert-to-double-quoted lexeme) ;; brag supports single-quoted strings, for some reason ;; (Racket does not. A single quote denotes a datum) @@ -60,43 +48,11 @@ (define double-quotes-on-ends (string-append "\"" double-quotes-escaped "\"")) double-quotes-on-ends) -(define-lex-abbrev backslash "\\") -(define-lex-abbrev single-quote "'") -(define-lex-abbrev escaped-single-quote (:: backslash single-quote)) -(define-lex-abbrev double-quote "\"") -(define-lex-abbrev escaped-double-quote (:: backslash double-quote)) -(define-lex-abbrev escaped-backslash (:: backslash backslash)) - (define brag-lex (lexer-src-pos - ;; we are delegating lexing of double-quoted strings to the Racket lexer (see below) - #;[(:: double-quote ;; start with double quote - (intersection ;; two conditions need to be true inside the quotes: - ;; we can have anything except - ;; a plain double-quote (which would close the quote) - ;; plus we specially allow escaped double quotes and backslashes - (:* (:or escaped-double-quote escaped-backslash (:~ double-quote))) - ;; we must forbid one situation with the string \\" - ;; the problem is that it's ambiguous: - ;; it can be lexed as (:: escaped-backlash double-quote) = \\ + " - ;; or (:: backlash escaped-double-quote) = \ + \" - ;; because escapes should be "left associative", - ;; we forbid the second possibility - ;; There are still some weird corner cases but the current tests work. - ;; with single and double quotes in the mix, - ;; I'm not sure how much better this can be. - (complement (:: any-string backslash escaped-double-quote any-string))) - double-quote) ;; end with double quote - (token-LIT (unescape-double-quoted-lexeme lexeme start-pos end-pos))] - ;; single-quoted string follows the same pattern, - ;; but with escaped-single-quote instead of escaped-double-quote - [(:: single-quote - (intersection - (:* (:or escaped-single-quote escaped-backslash (:~ single-quote))) - (complement (:: any-string backslash escaped-single-quote any-string))) - single-quote) - (token-LIT (unescape-double-quoted-lexeme (convert-to-double-quoted lexeme) start-pos end-pos))] - [(:or "()" "Ø" "∅") (token-EMPTY lexeme)] + ;; we delegate lexing of strings to the Racket lexer (see below) + [(:or "()" "Ø" "∅") + (token-EMPTY lexeme)] ["(" (token-LPAREN lexeme)] ["[" @@ -148,21 +104,24 @@ (position->pos start-pos) (position->pos end-pos-2)))])) +(define (lex/1-with-racket-lexer ip [conversion-proc values]) + ;; delegate lexing of strings to the default Racket lexer + (define-values (line-start col-start pos-start) (port-next-location ip)) + (define str (read ip)) + (define-values (line-end col-end pos-end) (port-next-location ip)) + #R str + (make-position-token (token-LIT (string-append "\"" str "\"")) + (make-position pos-start line-start col-start) + (make-position pos-end line-end col-end))) + (define (lex/1 ip) - (cond - [(equal? (peek-bytes 1 0 ip) #"\"") - ;; delegate lexing of strings to the default Racket lexer - ;; this doesn't yet work with single-quoted strings - ;; but maybe there's a way to do that by messing with the readtable - (define-values (line-start col-start pos-start) (port-next-location ip)) - (define-values (str type paren also-pos-start also-pos-end) (racket-lexer ip)) - (define-values (line-end col-end pos-end) (port-next-location ip)) - (make-position-token (token-LIT (string-append "\"" - (read (open-input-string str)) - "\"")) - (make-position pos-start line-start col-start) - (make-position pos-end line-end col-end))] - [else (brag-lex ip)])) + (match (peek-bytes 1 0 ip) + [#"\"" (lex/1-with-racket-lexer ip)] + [#"'" (parameterize ([current-readtable (make-readtable (current-readtable) + #\' #\" #f)]) + #R 'lex-single-quoted-string + (lex/1-with-racket-lexer ip convert-to-double-quoted))] + [_ (brag-lex ip)])) ;; This is the helper for the error production. (define lex-nonwhitespace -- 2.25.1 From afe8dbe8ae00d852ba648c9a8bfc57a7c519ed54 Mon Sep 17 00:00:00 2001 From: Matthew Butterick Date: Sat, 15 Jan 2022 20:21:02 -0800 Subject: [PATCH 3/3] restore treatment of single-quoted strings --- brag-lib/brag/rules/lexer.rkt | 43 +++++++++++++++++++++++++++++------ 1 file changed, 36 insertions(+), 7 deletions(-) diff --git a/brag-lib/brag/rules/lexer.rkt b/brag-lib/brag/rules/lexer.rkt index aa9d9a8..af52495 100755 --- a/brag-lib/brag/rules/lexer.rkt +++ b/brag-lib/brag/rules/lexer.rkt @@ -1,4 +1,4 @@ -#lang debug racket/base +#lang racket/base (require (for-syntax racket/base "parser.rkt")) (require br-parser-tools/lex (prefix-in : br-parser-tools/lex-sre) @@ -35,6 +35,17 @@ (define-lex-abbrev id (:& (complement (:+ digit)) (:+ id-char))) (define-lex-abbrev id-separator (:or ":" "::=")) +(define (unescape-double-quoted-lexeme lexeme start-pos end-pos) + ;; use `read` so brag strings have all the notational semantics of Racket strings + (with-handlers ([exn:fail:read? + (λ (e) ((current-parser-error-handler) + #f + 'error + lexeme + (position->pos start-pos) + (position->pos end-pos)))]) + (list->string `(#\" ,@(string->list (read (open-input-string lexeme))) #\")))) + (define (convert-to-double-quoted lexeme) ;; brag supports single-quoted strings, for some reason ;; (Racket does not. A single quote denotes a datum) @@ -48,9 +59,23 @@ (define double-quotes-on-ends (string-append "\"" double-quotes-escaped "\"")) double-quotes-on-ends) +(define-lex-abbrev backslash "\\") +(define-lex-abbrev single-quote "'") +(define-lex-abbrev escaped-single-quote (:: backslash single-quote)) +(define-lex-abbrev double-quote "\"") +(define-lex-abbrev escaped-double-quote (:: backslash double-quote)) +(define-lex-abbrev escaped-backslash (:: backslash backslash)) + (define brag-lex (lexer-src-pos - ;; we delegate lexing of strings to the Racket lexer (see below) + ;; we delegate lexing of double-quoted strings to the Racket lexer (see below) + ;; single-quoted string has to be handled manually (see lex/1 for details) + [(:: single-quote + (intersection + (:* (:or escaped-single-quote escaped-backslash (:~ single-quote))) + (complement (:: any-string backslash escaped-single-quote any-string))) + single-quote) + (token-LIT (unescape-double-quoted-lexeme (convert-to-double-quoted lexeme) start-pos end-pos))] [(:or "()" "Ø" "∅") (token-EMPTY lexeme)] ["(" @@ -109,7 +134,6 @@ (define-values (line-start col-start pos-start) (port-next-location ip)) (define str (read ip)) (define-values (line-end col-end pos-end) (port-next-location ip)) - #R str (make-position-token (token-LIT (string-append "\"" str "\"")) (make-position pos-start line-start col-start) (make-position pos-end line-end col-end))) @@ -117,10 +141,15 @@ (define (lex/1 ip) (match (peek-bytes 1 0 ip) [#"\"" (lex/1-with-racket-lexer ip)] - [#"'" (parameterize ([current-readtable (make-readtable (current-readtable) - #\' #\" #f)]) - #R 'lex-single-quoted-string - (lex/1-with-racket-lexer ip convert-to-double-quoted))] + ;; it would be nice to also handle single-quoted strings with the Racket lexer + ;; but we can only change the opening delimiter with the readtable. + ;; for whatever reason, the closing delimiter still has to be a double quote. + ;; "mapping a character to the same action as a " means that the character starts a string, but the string is still terminated with a closing ". " + ;; https://docs.racket-lang.org/reference/readtables.html#%28def._%28%28quote._~23~25kernel%29._make-readtable%29%29 + #;[#"'" (parameterize ([current-readtable (make-readtable (current-readtable) + #\' #\" #f)]) + 'lex-single-quoted-string + (lex/1-with-racket-lexer ip convert-to-double-quoted))] [_ (brag-lex ip)])) ;; This is the helper for the error production. -- 2.25.1