4 changed files with 77 additions and 22 deletions
--- a/brag-lib/brag/examples/codepoints.rkt
+++ b/brag-lib/brag/examples/codepoints.rkt
@ -0,0 +1,6 @@
 #lang brag
 start: A c def hello-world
 A : "\"\101\\" ; A
 c : '\'\U0063\\' ; c
 def : "*\u64\\\x65f\"" ; de
 hello-world : "\150\145\154\154\157\40\167\157\162\154\144"
--- a/brag-lib/brag/rules/lexer.rkt
+++ b/brag-lib/brag/rules/lexer.rkt
@ -1,4 +1,4 @@
-#lang racket/base
+#lang at-exp racket/base
 (require (for-syntax racket/base "parser.rkt"))
 (require br-parser-tools/lex
         (prefix-in : br-parser-tools/lex-sre)
@ -35,28 +35,65 @@
 (define-lex-abbrev esc-chars (union "\\a" "\\b" "\\t" "\\n" "\\v" "\\f" "\\r" "\\e"))
-(define (unescape-lexeme lexeme quote-char)
+(define (unescape-double-quoted-lexeme lexeme start-pos end-pos)
-  ;; convert the literal string representation back into an escape char with lookup table
+  ;; use `read` so brag strings have all the notational semantics of Racket strings
-  (define unescapes (hash "a" 7 "b" 8 "t" 9 "n" 10 "v" 11 "f" 12 "r" 13 "e" 27 "\"" 34 "'" 39 "\\" 92))
+  (with-handlers ([exn:fail:read?
-  (define pat (regexp (format "(?<=^~a\\\\).(?=~a$)" quote-char quote-char)))
+                   (λ (e) ((current-parser-error-handler)
-  (cond
+                           #f
-    [(regexp-match pat lexeme)
+                           'error
-     => (λ (m) (string quote-char (integer->char (hash-ref unescapes (car m))) quote-char))]
+                           lexeme
-    [else lexeme]))
+                           (position->pos start-pos)
-
+                           (position->pos end-pos)))])
    (list->string `(#\" ,@(string->list (read (open-input-string lexeme))) #\"))))
 (define (convert-to-double-quoted lexeme)
  ;; brag supports single-quoted strings, for some reason
  ;; (Racket does not. A single quote denotes a datum)
  ;; let's convert a single-quoted string into standard double-quoted style
  ;; so we can use Racket's `read` function on it.
  ;; and thereby support all the standard Racket string elements:
  ;; https://docs.racket-lang.org/reference/reader.html#%28part._parse-string%29
  (define outside-quotes-removed (string-trim lexeme "'"))
  (define single-quotes-unescaped (string-replace outside-quotes-removed "\\'" "'"))
  (define double-quotes-escaped (string-replace single-quotes-unescaped "\"" "\\\""))
  (define double-quotes-on-ends (string-append "\"" double-quotes-escaped "\""))
  double-quotes-on-ends)
 (define-lex-abbrev backslash "\\")
 (define-lex-abbrev single-quote "'")
 (define-lex-abbrev escaped-single-quote (:: backslash single-quote))
 (define-lex-abbrev double-quote "\"")
 (define-lex-abbrev escaped-double-quote (:: backslash double-quote))
 (define-lex-abbrev escaped-backslash (:: backslash backslash))
 (define lex/1
  (lexer-src-pos
-   ;; handle whitespace & escape chars within quotes as literal tokens: "\n" "\t" '\n' '\t'
+   [(:: double-quote ;; start with double quote
-   ;; match the escaped version, and then unescape them before they become token-LITs
+        (intersection ;; two conditions need to be true inside the quotes:
-   [(:: "'"
+         ;; we can have anything except
-        (:or (:* (:or "\\'" esc-chars (:~ "'" "\\"))) "\\\\")
+         ;; a plain double-quote (which would close the quote)
-        "'")
+         ;; plus we specially allow escaped double quotes and backslashes
-    (token-LIT (unescape-lexeme lexeme #\'))]
+         (:* (:or escaped-double-quote escaped-backslash (:~ double-quote)))
-   [(:: "\""
+         ;; we must forbid one situation with the string \\"
-        (:or (:* (:or "\\\"" esc-chars (:~ "\"" "\\"))) "\\\\")
+         ;; the problem is that it's ambiguous:
-        "\"")
+         ;; it can be lexed as (:: escaped-backlash double-quote) = \\ + "
-    (token-LIT (unescape-lexeme lexeme #\"))]
+         ;; or  (:: backlash escaped-double-quote) = \ + \"
         ;; because escapes should be "left associative",
         ;; we forbid the second possibility
         ;; There are still some weird corner cases but the current tests work.
         ;; with single and double quotes in the mix,
         ;; I'm not sure how much better this can be.
         (complement (:: any-string backslash escaped-double-quote any-string)))
        double-quote) ;; end with double quote
    (token-LIT (unescape-double-quoted-lexeme lexeme start-pos end-pos))]
   ;; single-quoted string follows the same pattern,
   ;; but with escaped-single-quote instead of escaped-double-quote
   [(:: single-quote
        (intersection
         (:* (:or escaped-single-quote escaped-backslash (:~ single-quote)))
         (complement (:: any-string backslash escaped-single-quote any-string)))
        single-quote)
    (token-LIT (unescape-double-quoted-lexeme (convert-to-double-quoted lexeme) start-pos end-pos))]
   [(:or "()" "Ø" "∅") (token-EMPTY lexeme)]
   ["("
    (token-LPAREN lexeme)]
--- a/brag-lib/brag/test/test-codepoints.rkt
+++ b/brag-lib/brag/test/test-codepoints.rkt
@ -0,0 +1,10 @@
 #lang racket/base
 (require brag/examples/codepoints
         rackunit)
 (check-equal? (parse-to-datum '("\"A\\" "'c\\" "*d\\ef\"" "hello world"))
              '(start (A "\"A\\")
                      (c "'c\\")
                      (def "*d\\ef\"")
                      (hello-world "hello world")))
--- a/brag-lib/brag/test/test-lexer.rkt
+++ b/brag-lib/brag/test/test-lexer.rkt
@ -51,11 +51,13 @@
 (check-equal? (l "]")
              '(RBRACKET "]" 1 2))
 ;; 220111: lexer now converts single-quoted lexemes
 ;; to standard Racket-style double-quoted string literal
 (check-equal? (l "'hello'")
-              '(LIT "'hello'" 1 8))
+              '(LIT "\"hello\"" 1 8))
 (check-equal? (l "'he\\'llo'")
-              '(LIT "'he\\'llo'" 1 10))
+              '(LIT "\"he'llo\"" 1 10))
 (check-equal? (l "/")
              '(HIDE "/" 1 2))