brag/brag-lib/brag/rules/lexer.rkt

#lang at-exp racket/base
(require (for-syntax racket/base "parser.rkt"))
(require br-parser-tools/lex
         (prefix-in : br-parser-tools/lex-sre)
         "parser.rkt"
         "rule-structs.rkt"
         (only-in brag/support from/to)
         racket/string)

(provide lex/1 tokenize)
(module+ lex-abbrevs
  (provide hide-char splice-char id-char letter digit NL id))

;; NL: one line terminator. Matches any one of CRLF ("\r\n"),
;; a lone carriage return ("\r"), or a lone line feed ("\n").
(define-lex-abbrev NL (:or "\r\n" "\r" "\n"))

;; reserved-chars = chars used for quantifiers & parse-tree filtering.
;; `quantifiers` is defined for-syntax because the lexer transformer below
;; reads it while the pattern is being expanded.
(define-for-syntax quantifiers "+:*?{}") ; colon is reserved to separate rules and productions
;; (reserved-chars) expands to a `char-set` whose string is the quantifier
;; characters followed by the `~a`-formatted values of `hide-char` and
;; `splice-char`.
;; NOTE(review): inside this transformer those two names are presumably the
;; bindings imported by `(require (for-syntax ... "parser.rkt"))` -- confirm
;; against parser.rkt.
(define-lex-trans reserved-chars
  (λ(stx) #`(char-set #,(format "~a~a~a" quantifiers hide-char splice-char))))

;; Lexer transformers that expand to a `char-set` built from the
;; `~a`-formatted value of `hide-char` / `splice-char` respectively.
;; They ignore their syntax argument; the values are read at expansion time.
;; NOTE(review): presumably each value is a single character supplied by
;; parser.rkt -- confirm there.
(define-lex-trans hide-char-trans (λ(stx) #`(char-set #,(format "~a" hide-char))))
(define-lex-trans splice-char-trans (λ(stx) #`(char-set #,(format "~a" splice-char))))

;; Character-class abbreviations used by the lexer rules:
;;   letter      -- ASCII a-z or A-Z
;;   digit       -- ASCII 0-9
;;   id-char     -- a letter, a digit, or one of the listed punctuation
;;                  characters that is NOT in (reserved-chars); the
;;                  intersection with the complement removes reserved ones
;;   hide-char   -- whatever (hide-char-trans) expands to
;;   splice-char -- whatever (splice-char-trans) expands to
(define-lex-abbrevs
  [letter (:or (:/ "a" "z") (:/ #\A #\Z))]
  [digit (:/ #\0 #\9)]
  [id-char (:or letter digit (:& (char-set "+:*@!-.$%&/=?^_~<>") (char-complement (reserved-chars))))]
  [hide-char (hide-char-trans)]
  [splice-char (splice-char-trans)]
  )

;; id: one or more id-chars, excluding strings made up entirely of digits
;; (the intersection with the complement of `(:+ digit)` rules those out).
(define-lex-abbrev id (:& (complement (:+ digit)) (:+ id-char)))
;; id-separator: the token that ends a rule head, either ":" or "::=".
(define-lex-abbrev id-separator (:or ":" "::="))

;; esc-chars: the two-character backslash escapes \a \b \t \n \v \f \r \e.
;; NOTE(review): no rule in the visible part of this file refers to
;; esc-chars, and it is not in the `lex-abbrevs` provide list -- possibly a
;; leftover; confirm before relying on or removing it.
(define-lex-abbrev esc-chars (union "\\a" "\\b" "\\t" "\\n" "\\v" "\\f" "\\r" "\\e"))

;; unescape-double-quoted-lexeme: string position position -> string
;; Decodes the escape sequences in a double-quoted lexeme by handing it to
;; Racket's `read`, so grammar strings get the same notation as Racket
;; strings. The decoded text is returned wrapped in double quotes again.
;; If `read` raises a read error, the current parser error handler is called
;; with the raw lexeme and the span start-pos..end-pos.
(define (unescape-double-quoted-lexeme lexeme start-pos end-pos)
  ;; Reports the offending lexeme and its source span to the error handler.
  (define (report-bad-string _exn)
    ((current-parser-error-handler)
     #f
     'error
     lexeme
     (position->pos start-pos)
     (position->pos end-pos)))
  (with-handlers ([exn:fail:read? report-bad-string])
    (let ([decoded (read (open-input-string lexeme))])
      (string-append "\"" decoded "\""))))

;; convert-to-double-quoted: string -> string
;; brag supports single-quoted strings; Racket does not (a single quote
;; denotes a datum). This rewrites a single-quoted lexeme into standard
;; double-quoted style so Racket's `read` can decode it, which gives
;; single-quoted strings all the standard Racket string elements:
;; https://docs.racket-lang.org/reference/reader.html#%28part._parse-string%29
;;
;; lexeme is the full single-quoted lexeme, including its outer quotes.
;; Returns the same content wrapped in double quotes, where
;;   - the escape \' becomes a plain '
;;   - an unescaped " becomes \"
;;   - every other backslash escape (including \" and \\) is kept as written
;;
;; The content is scanned left to right, one escape pair at a time. Escaping
;; every double quote blindly would turn an already-escaped \" into \\",
;; which `read` takes as an escaped backslash followed by the closing quote,
;; silently dropping the double quote from the literal.
(define (convert-to-double-quoted lexeme)
  ;; string-trim removes at most one quote per side, so an escaped quote
  ;; at the very end of the content survives.
  (define body (string-trim lexeme "'"))
  (define len (string-length body))
  (define out (open-output-string))
  (write-char #\" out)
  (let loop ([i 0])
    (when (< i len)
      (let ([c (string-ref body i)])
        (cond
          ;; a backslash plus the character it escapes is consumed as a pair
          [(and (char=? c #\\) (< (add1 i) len))
           (let ([next (string-ref body (add1 i))])
             (if (char=? next #\')
                 ;; \' needs no escaping inside a double-quoted string
                 (write-char #\' out)
                 (begin (write-char c out)
                        (write-char next out))))
           (loop (+ i 2))]
          ;; a bare double quote would end the string early, so escape it
          [(char=? c #\")
           (write-string "\\\"" out)
           (loop (add1 i))]
          ;; everything else (including a trailing lone backslash) is copied
          [else
           (write-char c out)
           (loop (add1 i))]))))
  (write-char #\" out)
  (get-output-string out))

;; Building blocks for the string-literal rules in lex/1:
;; a lone backslash, each kind of quote, and the two-character sequences
;; formed by a backslash followed by a quote or by another backslash.
(define-lex-abbrev backslash "\\")
(define-lex-abbrev single-quote "'")
(define-lex-abbrev escaped-single-quote (:: backslash single-quote))
(define-lex-abbrev double-quote "\"")
(define-lex-abbrev escaped-double-quote (:: backslash double-quote))
(define-lex-abbrev escaped-backslash (:: backslash backslash))

;; lex/1: input-port -> token
;; Reads one token of a brag grammar from the port. It is built with
;; `lexer-src-pos`, so `start-pos` and `end-pos` are available in every
;; action. Rules that skip input (whitespace, comments, commas) call lex/1
;; again on the same port and hand the result back through
;; `return-without-pos`.
;; NOTE(review): several rules overlap on their first character ("()" vs
;; "(" vs "(*", rule heads vs plain ids); this relies on the lexer
;; preferring the longest match -- see the parser-tools lexer docs.
(define lex/1
  (lexer-src-pos
   [(:: double-quote ;; start with double quote
        (intersection ;; two conditions need to be true inside the quotes:
         ;; we can have anything except
         ;; a plain double-quote (which would close the quote)
         ;; plus we specially allow escaped double quotes and backslashes
         (:* (:or escaped-double-quote escaped-backslash (:~ double-quote)))
         ;; we must forbid one situation with the string \\"
         ;; the problem is that it's ambiguous:
         ;; it can be lexed as (:: escaped-backslash double-quote) = \\ + "
         ;; or  (:: backslash escaped-double-quote) = \ + \"
         ;; because escapes should be "left associative",
         ;; we forbid the second possibility
         ;; There are still some weird corner cases but the current tests work.
         ;; with single and double quotes in the mix,
         ;; I'm not sure how much better this can be.
         (complement (:: any-string backslash escaped-double-quote any-string)))
        double-quote) ;; end with double quote
    (token-LIT (unescape-double-quoted-lexeme lexeme start-pos end-pos))]
   ;; single-quoted string follows the same pattern,
   ;; but with escaped-single-quote instead of escaped-double-quote.
   ;; The lexeme is first rewritten into double-quoted form so the same
   ;; unescaping function can be used for both kinds of string.
   [(:: single-quote
        (intersection
         (:* (:or escaped-single-quote escaped-backslash (:~ single-quote)))
         (complement (:: any-string backslash escaped-single-quote any-string)))
        single-quote)
    (token-LIT (unescape-double-quoted-lexeme (convert-to-double-quoted lexeme) start-pos end-pos))]
   ;; Three spellings of the empty production
   [(:or "()" "Ø" "∅") (token-EMPTY lexeme)]
   ;; Grouping and optional brackets
   ["("
    (token-LPAREN lexeme)]
   ["["
    (token-LBRACKET lexeme)]
   [")"
    (token-RPAREN lexeme)]
   ["]"
    (token-RBRACKET lexeme)]
   ;; Standalone hide / splice markers (not part of a rule head)
   [hide-char
    (token-HIDE lexeme)]
   [splice-char
    (token-SPLICE lexeme)]
   ;; Alternation
   ["|"
    (token-PIPE lexeme)]
   ;; Quantifiers: + * ? or a curly form "{" digits "}" / "{" digits "," digits "}"
   ;; where each run of digits may be empty and the ",digits" part is optional
   [(:or "+" "*" "?"
         (:: "{" (:* digit) (:? (:: "," (:* digit))) "}"))
    (token-REPEAT lexeme)]
   ;; Skip whitespace
   [whitespace
    (return-without-pos (lex/1 input-port))]
   ;; skip multiline comments
   [(from/to "(*" "*)") (return-without-pos (lex/1 input-port))]
   ;; Skip comments up to end of line:
   ;; "#" or ";", then any text containing no newline, then a newline or nothing
   [(:: (:or "#" ";")
        (complement (:: (:* any-char) NL (:* any-char)))
        (:or NL ""))
    (return-without-pos (lex/1 input-port))]
   ;; skip commas (concatenation is implied)
   ["," (return-without-pos (lex/1 input-port))]
   [(eof)
    (token-EOF lexeme)]
   ;; Rule heads: an id, optional whitespace, then the separator.
   ;; The whole match (including whitespace and separator) is the lexeme.
   ;; A leading hide-char or splice-char selects the HIDDEN / SPLICED variant.
   [(:: id (:* whitespace) id-separator)
    (token-RULE_HEAD lexeme)]
   [(:: hide-char id (:* whitespace) id-separator)
    (token-RULE_HEAD_HIDDEN lexeme)]
   [(:: splice-char id (:* whitespace) id-separator)
    (token-RULE_HEAD_SPLICED lexeme)]
   ;; A bare identifier (no separator after it)
   [id
    (token-ID lexeme)]
   
   ;; We call the error handler for everything else:
   ;; the unmatched character plus whatever lex-nonwhitespace reads next
   ;; is reported as one span, from start-pos to the helper's end position.
   [(:: any-char)
    (let-values ([(rest-of-text end-pos-2)
                  (lex-nonwhitespace input-port)])
      ((current-parser-error-handler)
       #f
       'error
       (string-append lexeme rest-of-text)
       (position->pos start-pos)
       (position->pos end-pos-2)))]))


;; lex-nonwhitespace: input-port -> (values string position)
;; Helper for the error production of lex/1. Consumes either the longest run
;; of non-whitespace characters or, failing that, a single character, and
;; returns the consumed text together with the position where it ends.
;; At end of input it returns the empty string and the current end position.
(define lex-nonwhitespace
  (lexer
   [(eof)
    (values "" end-pos)]
   ;; a run of non-whitespace, or else exactly one (whitespace) character
   [(:or (:+ (char-complement whitespace)) any-char)
    (values lexeme end-pos)]))


;; position->pos: position -> pos
;; Repackages a br-parser-tools/lex position structure as one of our own
;; pos structures, carrying over its offset, line and column in that order.
(define (position->pos a-pos)
  (let* ([offset (position-offset a-pos)]
         [line (position-line a-pos)]
         [column (position-col a-pos)])
    (pos offset line column)))


;; tokenize: input-port [#:source any] -> (-> token)
;; Wraps a port in a zero-argument procedure. Every call to that procedure
;; runs lex/1 once on ip, with the `file-path` parameter set to source for
;; the duration of the call. source defaults to the port's object name.
(define (tokenize ip #:source [source (object-name ip)])
  (lambda ()
    (parameterize ([file-path source])
      (lex/1 ip))))
support for codepoint escape sequences in strings (closes #29) (#31) This improves the lexing of escape sequences within strings that appear in a grammar. It relies on Racket’s `read` to interpret these escape sequences rather than a hard-coded hash table. This gives strings in a grammar pretty much the same semantics as standard Racket strings, including support for octal and hex escape sequences for Unicode codepoints. Though this passes all current tests, there are still some oddball corner cases that can be discovered by sticking together certain combinations of escape sequences (backslashes, double quotes, and codepoints). The better solution would be to peek into the input port for a double quote, and if it’s there, use the standard Racket lexer to pull out the string (this lexer already handles the weirdo cases). We can’t do this, however, because brag also supports single-quoted strings, which need to have the same semantics, and the Racket lexer won’t work with those. So I think we’re stuck with the homegrown solution (for consistency with both kinds of quotes) even at the expense of a few unresolved corner cases. Let’s leave that question for another day, as these cases haven’t surfaced in practical use thus far. 2 years ago			`#lang at-exp racket/base`
next 7 years ago			`(require (for-syntax racket/base "parser.rkt"))`
			`(require br-parser-tools/lex`
			`(prefix-in : br-parser-tools/lex-sre)`
			`"parser.rkt"`
			`"rule-structs.rkt"`
add multiline comments; ignore commas 6 years ago			`(only-in brag/support from/to)`
next 7 years ago			`racket/string)`

			`(provide lex/1 tokenize)`
syntax colorer 7 years ago			`(module+ lex-abbrevs`
			`(provide hide-char splice-char id-char letter digit NL id))`
next 7 years ago
			`;; A newline can be any one of the following.`
			`(define-lex-abbrev NL (:or "\r\n" "\r" "\n"))`

Add curly quantifier notation 6 years ago			`;; reserved-chars = chars used for quantifiers & parse-tree filtering`
add ? quantifier 6 years ago			`(define-for-syntax quantifiers "+:*?{}") ; colon is reserved to separate rules and productions`
next 7 years ago			`(define-lex-trans reserved-chars`
			(λ(stx) #`(char-set #,(format "~a~a~a" quantifiers hide-char splice-char))))

			(define-lex-trans hide-char-trans (λ(stx) #`(char-set #,(format "~a" hide-char))))
			(define-lex-trans splice-char-trans (λ(stx) #`(char-set #,(format "~a" splice-char))))

			`(define-lex-abbrevs`
			`[letter (:or (:/ "a" "z") (:/ #\A #\Z))]`
			`[digit (:/ #\0 #\9)]`
			`[id-char (:or letter digit (:& (char-set "+:*@!-.$%&/=?^_~<>") (char-complement (reserved-chars))))]`
			`[hide-char (hide-char-trans)]`
			`[splice-char (splice-char-trans)]`
			`)`

			`(define-lex-abbrev id (:& (complement (:+ digit)) (:+ id-char)))`
permit ::= as rule separator 6 years ago			`(define-lex-abbrev id-separator (:or ":" "::="))`
next 7 years ago
touchup 6 years ago			`(define-lex-abbrev esc-chars (union "\\a" "\\b" "\\t" "\\n" "\\v" "\\f" "\\r" "\\e"))`
handle escape chars better 6 years ago
support for codepoint escape sequences in strings (closes #29) (#31) This improves the lexing of escape sequences within strings that appear in a grammar. It relies on Racket’s `read` to interpret these escape sequences rather than a hard-coded hash table. This gives strings in a grammar pretty much the same semantics as standard Racket strings, including support for octal and hex escape sequences for Unicode codepoints. Though this passes all current tests, there are still some oddball corner cases that can be discovered by sticking together certain combinations of escape sequences (backslashes, double quotes, and codepoints). The better solution would be to peek into the input port for a double quote, and if it’s there, use the standard Racket lexer to pull out the string (this lexer already handles the weirdo cases). We can’t do this, however, because brag also supports single-quoted strings, which need to have the same semantics, and the Racket lexer won’t work with those. So I think we’re stuck with the homegrown solution (for consistency with both kinds of quotes) even at the expense of a few unresolved corner cases. Let’s leave that question for another day, as these cases haven’t surfaced in practical use thus far. 2 years ago			`(define (unescape-double-quoted-lexeme lexeme start-pos end-pos)`
			;; use `read` so brag strings have all the notational semantics of Racket strings
			`(with-handlers ([exn:fail:read?`
			`(λ (e) ((current-parser-error-handler)`
			`#f`
			`'error`
			`lexeme`
			`(position->pos start-pos)`
			`(position->pos end-pos)))])`
			(list->string `(#\" ,@(string->list (read (open-input-string lexeme))) #\"))))
handle escape chars better 6 years ago
support for codepoint escape sequences in strings (closes #29) (#31) This improves the lexing of escape sequences within strings that appear in a grammar. It relies on Racket’s `read` to interpret these escape sequences rather than a hard-coded hash table. This gives strings in a grammar pretty much the same semantics as standard Racket strings, including support for octal and hex escape sequences for Unicode codepoints. Though this passes all current tests, there are still some oddball corner cases that can be discovered by sticking together certain combinations of escape sequences (backslashes, double quotes, and codepoints). The better solution would be to peek into the input port for a double quote, and if it’s there, use the standard Racket lexer to pull out the string (this lexer already handles the weirdo cases). We can’t do this, however, because brag also supports single-quoted strings, which need to have the same semantics, and the Racket lexer won’t work with those. So I think we’re stuck with the homegrown solution (for consistency with both kinds of quotes) even at the expense of a few unresolved corner cases. Let’s leave that question for another day, as these cases haven’t surfaced in practical use thus far. 2 years ago			`(define (convert-to-double-quoted lexeme)`
			`;; brag supports single-quoted strings, for some reason`
			`;; (Racket does not. A single quote denotes a datum)`
			`;; let's convert a single-quoted string into standard double-quoted style`
			;; so we can use Racket's `read` function on it.
			`;; and thereby support all the standard Racket string elements:`
			`;; https://docs.racket-lang.org/reference/reader.html#%28part._parse-string%29`
			`(define outside-quotes-removed (string-trim lexeme "'"))`
			`(define single-quotes-unescaped (string-replace outside-quotes-removed "\\'" "'"))`
			`(define double-quotes-escaped (string-replace single-quotes-unescaped "\"" "\\\""))`
			`(define double-quotes-on-ends (string-append "\"" double-quotes-escaped "\""))`
			`double-quotes-on-ends)`

			`(define-lex-abbrev backslash "\\")`
			`(define-lex-abbrev single-quote "'")`
			`(define-lex-abbrev escaped-single-quote (:: backslash single-quote))`
			`(define-lex-abbrev double-quote "\"")`
			`(define-lex-abbrev escaped-double-quote (:: backslash double-quote))`
			`(define-lex-abbrev escaped-backslash (:: backslash backslash))`
handle escape chars better 6 years ago
next 7 years ago			`(define lex/1`
			`(lexer-src-pos`
support for codepoint escape sequences in strings (closes #29) (#31) This improves the lexing of escape sequences within strings that appear in a grammar. It relies on Racket’s `read` to interpret these escape sequences rather than a hard-coded hash table. This gives strings in a grammar pretty much the same semantics as standard Racket strings, including support for octal and hex escape sequences for Unicode codepoints. Though this passes all current tests, there are still some oddball corner cases that can be discovered by sticking together certain combinations of escape sequences (backslashes, double quotes, and codepoints). The better solution would be to peek into the input port for a double quote, and if it’s there, use the standard Racket lexer to pull out the string (this lexer already handles the weirdo cases). We can’t do this, however, because brag also supports single-quoted strings, which need to have the same semantics, and the Racket lexer won’t work with those. So I think we’re stuck with the homegrown solution (for consistency with both kinds of quotes) even at the expense of a few unresolved corner cases. Let’s leave that question for another day, as these cases haven’t surfaced in practical use thus far. 2 years ago			`[(:: double-quote ;; start with double quote`
			`(intersection ;; two conditions need to be true inside the quotes:`
			`;; we can have anything except`
			`;; a plain double-quote (which would close the quote)`
			`;; plus we specially allow escaped double quotes and backslashes`
			`(:* (:or escaped-double-quote escaped-backslash (:~ double-quote)))`
			`;; we must forbid one situation with the string \\"`
			`;; the problem is that it's ambiguous:`
			`;; it can be lexed as (:: escaped-backlash double-quote) = \\ + "`
			`;; or (:: backlash escaped-double-quote) = \ + \"`
			`;; because escapes should be "left associative",`
			`;; we forbid the second possibility`
			`;; There are still some weird corner cases but the current tests work.`
			`;; with single and double quotes in the mix,`
			`;; I'm not sure how much better this can be.`
			`(complement (:: any-string backslash escaped-double-quote any-string)))`
			`double-quote) ;; end with double quote`
			`(token-LIT (unescape-double-quoted-lexeme lexeme start-pos end-pos))]`
			`;; single-quoted string follows the same pattern,`
			`;; but with escaped-single-quote instead of escaped-double-quote`
			`[(:: single-quote`
			`(intersection`
			`(:* (:or escaped-single-quote escaped-backslash (:~ single-quote)))`
			`(complement (:: any-string backslash escaped-single-quote any-string)))`
			`single-quote)`
			`(token-LIT (unescape-double-quoted-lexeme (convert-to-double-quoted lexeme) start-pos end-pos))]`
empty parens better than empty brackets 6 years ago			`[(:or "()" "Ø" "∅") (token-EMPTY lexeme)]`
next 7 years ago			`["("`
			`(token-LPAREN lexeme)]`
			`["["`
			`(token-LBRACKET lexeme)]`
			`[")"`
			`(token-RPAREN lexeme)]`
			`["]"`
			`(token-RBRACKET lexeme)]`
			`[hide-char`
			`(token-HIDE lexeme)]`
			`[splice-char`
			`(token-SPLICE lexeme)]`
			`["\|"`
			`(token-PIPE lexeme)]`
add ? quantifier 6 years ago			`[(:or "+" "*" "?"`
Add curly quantifier notation 6 years ago			`(:: "{" (:* digit) (:? (:: "," (:* digit))) "}"))`
next 7 years ago			`(token-REPEAT lexeme)]`
add multiline comments; ignore commas 6 years ago			`;; Skip whitespace`
next 7 years ago			`[whitespace`
			`(return-without-pos (lex/1 input-port))]`
add multiline comments; ignore commas 6 years ago			`;; skip multiline comments`
			`[(from/to "(" ")") (return-without-pos (lex/1 input-port))]`
next 7 years ago			`;; Skip comments up to end of line`
syntax colorer 7 years ago			`[(:: (:or "#" ";")`
next 7 years ago			`(complement (:: (:* any-char) NL (:* any-char)))`
			`(:or NL ""))`
syntax colorer 7 years ago			`(return-without-pos (lex/1 input-port))]`
add multiline comments; ignore commas 6 years ago			`;; skip commas (concatenation is implied)`
			`["," (return-without-pos (lex/1 input-port))]`
next 7 years ago			`[(eof)`
			`(token-EOF lexeme)]`
permit ::= as rule separator 6 years ago			`[(:: id (:* whitespace) id-separator)`
next 7 years ago			`(token-RULE_HEAD lexeme)]`
permit ::= as rule separator 6 years ago			`[(:: hide-char id (:* whitespace) id-separator)`
next 7 years ago			`(token-RULE_HEAD_HIDDEN lexeme)]`
permit ::= as rule separator 6 years ago			`[(:: splice-char id (:* whitespace) id-separator)`
next 7 years ago			`(token-RULE_HEAD_SPLICED lexeme)]`
			`[id`
			`(token-ID lexeme)]`

			`;; We call the error handler for everything else:`
			`[(:: any-char)`
			`(let-values ([(rest-of-text end-pos-2)`
			`(lex-nonwhitespace input-port)])`
			`((current-parser-error-handler)`
			`#f`
			`'error`
			`(string-append lexeme rest-of-text)`
			`(position->pos start-pos)`
			`(position->pos end-pos-2)))]))`


			`;; This is the helper for the error production.`
			`(define lex-nonwhitespace`
			`(lexer`
			`[(:+ (char-complement whitespace))`
			`(values lexeme end-pos)]`
			`[any-char`
			`(values lexeme end-pos)]`
			`[(eof)`
			`(values "" end-pos)]))`



			`;; position->pos: position -> pos`
touchup 6 years ago			`;; Converts position structures from br-parser-tools/lex to our own pos structures.`
next 7 years ago			`(define (position->pos a-pos)`
			`(pos (position-offset a-pos)`
			`(position-line a-pos)`
			`(position-col a-pos)))`



			`;; tokenize: input-port -> (-> token)`
touchup 6 years ago			`(define (tokenize ip #:source [source (object-name ip)])`
			`(λ () (parameterize ([file-path source])`
			`(lex/1 ip))))`