From d9526112937cd8babd01eed2de868c47dbf9d62a Mon Sep 17 00:00:00 2001
From: Matthew Butterick
Date: Wed, 13 Jun 2018 22:54:36 -0700
Subject: [PATCH] handle escape chars better

---
Review notes (this section is not part of the commit message):
* esc-chars now matches a backslash followed by one escape letter. The
  earlier (char-set "\\a\\b...") form matched any single character of that
  string, so a lone backslash was accepted inside a quoted literal.
* escape-lexeme returns the lexeme unchanged when the escaped character is
  missing from its lookup table, instead of raising from hash-ref.
* Hunk counts and the diffstat were updated to match these edits. The blob
  ids on the lexer.rkt index line predate them.
* Leading whitespace on context lines was reconstructed; apply with
  whitespace tolerance if the context does not match.

 brag/examples/whitespace.rkt  |  4 +++-
 brag/rules/lexer.rkt          | 35 +++++++++++++++++++++++------------
 brag/test/test-whitespace.rkt | 12 ++++++++----
 3 files changed, 34 insertions(+), 17 deletions(-)

diff --git a/brag/examples/whitespace.rkt b/brag/examples/whitespace.rkt
index b76ebf4..9b2101a 100644
--- a/brag/examples/whitespace.rkt
+++ b/brag/examples/whitespace.rkt
@@ -1,6 +1,8 @@
 #lang brag
-start: (tab | space | newline | letter)*
+start: (tab | space | newline | letter | return | all)*
 tab: '\t'
 space: " "
 newline: "\n"
+return : "\r"
+all : "\a" "\b" "\t" "\n" "\v" "\f" "\r" "\e"
 letter: "x" | "y" | "z"
\ No newline at end of file
diff --git a/brag/rules/lexer.rkt b/brag/rules/lexer.rkt
index 9f73e61..4b0da8d 100755
--- a/brag/rules/lexer.rkt
+++ b/brag/rules/lexer.rkt
@@ -32,26 +32,37 @@
 (define-lex-abbrev id (:& (complement (:+ digit)) (:+ id-char)))
 (define-lex-abbrev id-separator (:or ":" "::="))
 
+;; a backslash followed by one of the escape letters: \a \b \t \n \v \f \r \e
+(define-lex-abbrev esc-chars (:: "\\" (char-set "abtnvfre")))
+
+(define (escape-lexeme lexeme quote-char)
+  ;; convert the literal string representation back into an escape char with lookup table
+  ;; only a lexeme holding exactly one escape between its quotes is rewritten;
+  ;; anything else is returned unchanged
+  ;; maybe use `read` instead?
+  (define escapes (hash "a" 7 "b" 8 "t" 9 "n" 10 "v" 11 "f" 12 "r" 13 "e" 27 "\"" 34 "'" 39))
+  (define pat (regexp (format "(?<=^~a\\\\).(?=~a$)" quote-char quote-char)))
+  (cond
+    [(regexp-match pat lexeme)
+     => (λ (m)
+          ;; an escape missing from the table leaves the lexeme as-is instead of raising
+          (define code (hash-ref escapes (car m) #f))
+          (if code (string quote-char (integer->char code) quote-char) lexeme))]
+    [else lexeme]))
+
+
 (define lex/1
   (lexer-src-pos
    ;; handle whitespace chars within quotes as literal tokens: "\n" "\t" '\n' '\t'
    ;; by matching the escaped version, and then unescaping them before they become token-LITs
    [(:: "'"
-        (:* (:or "\\'" "\\n" "\\t" (:~ "'" "\\")))
+        (:* (:or "\\'" esc-chars (:~ "'" "\\")))
         "'")
-    (token-LIT (case lexeme
-                 [("'\\''") "\"'\""]
-                 [("'\\n'") "'\n'"]
-                 [("'\\t'") "'\t'"]
-                 [else lexeme]))]
+    (token-LIT (escape-lexeme lexeme #\'))]
    [(:: "\""
-        (:* (:or "\\\"" "\\n" "\\t" (:~ "\"" "\\")))
+        (:* (:or "\\\"" esc-chars (:~ "\"" "\\")))
         "\"")
-    (token-LIT (case lexeme
-                 [("\"\\\"\"") "\"\"\""]
-                 [("\"\\n\"") "\"\n\""]
-                 [("\"\\t\"") "\"\t\""]
-                 [else lexeme]))]
+    (token-LIT (escape-lexeme lexeme #\"))]
    ["("
     (token-LPAREN lexeme)]
    ["["
diff --git a/brag/test/test-whitespace.rkt b/brag/test/test-whitespace.rkt
index 455ee3f..d753eaa 100755
--- a/brag/test/test-whitespace.rkt
+++ b/brag/test/test-whitespace.rkt
@@ -4,9 +4,13 @@
          rackunit)
 
 (check-equal?
- (parse-to-datum "\ty\n x\tz")
- '(start (tab "\t") (letter "y") (newline "\n") (space " ") (letter "x") (tab "\t") (letter "z")))
+ (parse-to-datum "\ty\n x\tz\r")
+ '(start (tab "\t") (letter "y") (newline "\n") (space " ") (letter "x") (tab "\t") (letter "z") (return "\r")))
 
 (check-equal?
- (parse-to-datum "\t\n \t")
- '(start (tab "\t") (newline "\n") (space " ") (tab "\t")))
+ (parse-to-datum "\t\n \t\r")
+ '(start (tab "\t") (newline "\n") (space " ") (tab "\t") (return "\r")))
+
+(check-equal?
+ (parse-to-datum "\a\b\t\n\v\f\r\e")
+ '(start (all "\a" "\b" "\t" "\n" "\v" "\f" "\r" "\e")))