You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
305 lines
11 KiB
Racket
305 lines
11 KiB
Racket
#lang racket/base
|
|
(require yaragg/parser-tools/yacc
|
|
yaragg/parser-tools/lex
|
|
racket/match
|
|
yaragg/rules/rule-structs)
|
|
|
|
;; A parser for grammars.
|
|
|
|
(provide hide-char
|
|
splice-char
|
|
tokens
|
|
token-LPAREN
|
|
token-RPAREN
|
|
token-HIDE ; for hider
|
|
token-SPLICE ; for splicer
|
|
token-LBRACKET
|
|
token-RBRACKET
|
|
token-PIPE
|
|
token-REPEAT
|
|
token-RULE_HEAD
|
|
token-RULE_HEAD_HIDDEN
|
|
token-RULE_HEAD_SPLICED
|
|
token-ID
|
|
token-LIT
|
|
token-EOF
|
|
token-EMPTY
|
|
grammar-parser
|
|
|
|
current-source
|
|
current-parser-error-handler
|
|
current-prefix-out
|
|
|
|
[struct-out rule]
|
|
[struct-out lhs-id]
|
|
[struct-out pattern]
|
|
[struct-out pattern-id]
|
|
[struct-out pattern-lit]
|
|
[struct-out pattern-token]
|
|
[struct-out pattern-choice]
|
|
[struct-out pattern-repeat]
|
|
[struct-out pattern-seq])
|
|
|
|
(define-tokens tokens (LPAREN
|
|
RPAREN
|
|
LBRACKET
|
|
RBRACKET
|
|
HIDE
|
|
SPLICE
|
|
PIPE
|
|
REPEAT
|
|
RULE_HEAD
|
|
RULE_HEAD_HIDDEN
|
|
RULE_HEAD_SPLICED
|
|
ID
|
|
LIT
|
|
EOF
|
|
EMPTY))
|
|
|
|
(define hide-char #\/)
|
|
(define splice-char #\@)
|
|
|
|
(define id-separators-pattern "(:|::=)")
|
|
|
|
;; grammar-parser: (-> token) -> (listof rule)
|
|
(define grammar-parser
|
|
(parser
|
|
(tokens tokens)
|
|
(src-pos)
|
|
(start rules)
|
|
(end EOF)
|
|
|
|
(grammar
|
|
[rules
|
|
[(rules*) $1]]
|
|
|
|
[rules*
|
|
[(rule rules*)
|
|
(cons $1 $2)]
|
|
[()
|
|
'()]]
|
|
|
|
;; I have a separate token type for rule identifiers to avoid the
|
|
;; shift/reduce conflict that happens with the implicit sequencing
|
|
;; of top-level rules. i.e. the parser can't currently tell, when
|
|
;; it sees an ID, if it should shift or reduce to a new rule.
|
|
[rule
|
|
[(RULE_HEAD pattern)
|
|
(begin
|
|
(define trimmed (regexp-replace (pregexp (format "\\s*~a$" id-separators-pattern)) $1 ""))
|
|
(rule (position->pos $1-start-pos)
|
|
(position->pos $2-end-pos)
|
|
(lhs-id (position->pos $1-start-pos)
|
|
(pos (+ (position-offset $1-start-pos)
|
|
(string-length trimmed))
|
|
(position-line $1-start-pos)
|
|
(position-col $1-start-pos))
|
|
trimmed
|
|
#f)
|
|
$2))]
|
|
|
|
[(RULE_HEAD_HIDDEN pattern) ; slash indicates hiding
|
|
(begin
|
|
(define trimmed (cadr (regexp-match (pregexp (format "~a(\\S+)\\s*~a$" hide-char id-separators-pattern)) $1)))
|
|
(rule (position->pos $1-start-pos)
|
|
(position->pos $2-end-pos)
|
|
(lhs-id (position->pos $1-start-pos)
|
|
(pos (+ (position-offset $1-start-pos)
|
|
(string-length trimmed)
|
|
(string-length "!"))
|
|
(position-line $1-start-pos)
|
|
(position-col $1-start-pos))
|
|
trimmed
|
|
''hide) ; symbol needs to be double quoted in this case
|
|
$2))]
|
|
|
|
[(RULE_HEAD_SPLICED pattern) ; atsign indicates splicing
|
|
(begin
|
|
(define trimmed (cadr (regexp-match (pregexp (format "~a(\\S+)\\s*~a$" splice-char id-separators-pattern)) $1)))
|
|
(rule (position->pos $1-start-pos)
|
|
(position->pos $2-end-pos)
|
|
(lhs-id (position->pos $1-start-pos)
|
|
(pos (+ (position-offset $1-start-pos)
|
|
(string-length trimmed)
|
|
(string-length "@"))
|
|
(position-line $1-start-pos)
|
|
(position-col $1-start-pos))
|
|
trimmed
|
|
''splice) ; symbol needs to be double quoted in this case
|
|
$2))]]
|
|
|
|
[pattern
|
|
[(implicit-pattern-sequence PIPE pattern)
|
|
(if (pattern-choice? $3)
|
|
(pattern-choice (position->pos $1-start-pos)
|
|
(position->pos $3-end-pos)
|
|
(cons $1 (pattern-choice-vals $3))
|
|
#f)
|
|
(pattern-choice (position->pos $1-start-pos)
|
|
(position->pos $3-end-pos)
|
|
(list $1 $3)
|
|
#f))]
|
|
[(implicit-pattern-sequence)
|
|
$1]]
|
|
|
|
[implicit-pattern-sequence
|
|
[(repeatable-pattern implicit-pattern-sequence)
|
|
(if (pattern-seq? $2)
|
|
(pattern-seq (position->pos $1-start-pos)
|
|
(position->pos $2-end-pos)
|
|
(cons $1 (pattern-seq-vals $2))
|
|
#f)
|
|
(pattern-seq (position->pos $1-start-pos)
|
|
(position->pos $2-end-pos)
|
|
(list $1 $2)
|
|
#f))]
|
|
[(repeatable-pattern)
|
|
$1]]
|
|
|
|
[repeatable-pattern
|
|
[(atomic-pattern REPEAT)
|
|
(let ()
|
|
(match-define (cons min-repeat max-repeat)
|
|
(cond [(string=? $2 "*") (cons 0 #f)]
|
|
[(string=? $2 "+") (cons 1 #f)]
|
|
[(string=? $2 "?") (cons 0 1)]
|
|
[(regexp-match #px"^\\{(\\d+)?(,)?(\\d+)?\\}$" $2) ; "{min,max}" with both min & max optional
|
|
=> (λ (m)
|
|
(match-define (list all min range? max) m)
|
|
(let* ([min (if min (string->number min) 0)]
|
|
[max (cond
|
|
[(and range? max) (string->number max)]
|
|
[(not (or range? max)) (if (zero? min)
|
|
#f ; {} -> {0,}
|
|
min)] ; {3} -> {3,3}
|
|
[else #f])])
|
|
(cons min max)))]
|
|
[else (raise-argument-error 'grammar-parse "unknown repetition operator" $2)]))
|
|
(pattern-repeat (position->pos $1-start-pos)
|
|
(position->pos $2-end-pos)
|
|
min-repeat max-repeat $1
|
|
#f))]
|
|
[(atomic-pattern)
|
|
$1]]
|
|
|
|
[atomic-pattern
|
|
[(LIT)
|
|
(pattern-lit (position->pos $1-start-pos)
|
|
(position->pos $1-end-pos)
|
|
(substring $1 1 (sub1 (string-length $1)))
|
|
#f)]
|
|
|
|
[(ID)
|
|
(if (token-id? $1)
|
|
(pattern-token (position->pos $1-start-pos)
|
|
(position->pos $1-end-pos)
|
|
$1
|
|
#f)
|
|
(pattern-id (position->pos $1-start-pos)
|
|
(position->pos $1-end-pos)
|
|
$1
|
|
#f))]
|
|
|
|
[(EMPTY)
|
|
(pattern-repeat (position->pos $1-start-pos)
|
|
(position->pos $1-end-pos)
|
|
0 0 (pattern-lit (position->pos $1-start-pos)
|
|
(position->pos $1-end-pos)
|
|
"" #f)
|
|
#f)]
|
|
|
|
[(LBRACKET pattern RBRACKET)
|
|
(pattern-repeat (position->pos $1-start-pos)
|
|
(position->pos $3-end-pos)
|
|
0 1 $2
|
|
#f)]
|
|
|
|
[(LPAREN pattern RPAREN)
|
|
(relocate-pattern $2 (position->pos $1-start-pos) (position->pos $3-end-pos))]
|
|
|
|
[(HIDE atomic-pattern)
|
|
(relocate-pattern $2 (position->pos $1-start-pos) (position->pos $2-end-pos) 'hide)]
|
|
|
|
[(SPLICE ID)
|
|
;; only works for nonterminals on the right side
|
|
;; (meaningless with terminals)
|
|
(if (token-id? $2)
|
|
(error 'brag "Can't use splice operator with terminal")
|
|
(pattern-id (position->pos $1-start-pos)
|
|
(position->pos $2-end-pos)
|
|
$2
|
|
'splice))]])
|
|
|
|
|
|
(error (lambda (tok-ok? tok-name tok-value start-pos end-pos)
|
|
((current-parser-error-handler) tok-ok? tok-name tok-value (position->pos start-pos) (position->pos end-pos))))))
|
|
|
|
;; relocate-pattern: pattern -> pattern
|
|
;; Rewrites the pattern's start and end pos accordingly.
|
|
(define (relocate-pattern a-pat start-pos end-pos [hide? #f])
|
|
(match a-pat
|
|
[(pattern-id _ _ v h)
|
|
(pattern-id start-pos end-pos v (or hide? h))]
|
|
[(pattern-token _ _ v h)
|
|
(pattern-token start-pos end-pos v (or hide? h))]
|
|
[(pattern-lit _ _ v h)
|
|
(pattern-lit start-pos end-pos v (or hide? h))]
|
|
[(pattern-choice _ _ vs h)
|
|
(pattern-choice start-pos end-pos vs (or hide? h))]
|
|
[(pattern-repeat _ _ min max v h)
|
|
(pattern-repeat start-pos end-pos min max v (or hide? h))]
|
|
[(pattern-seq _ _ vs h)
|
|
(pattern-seq start-pos end-pos vs (or hide? h))]
|
|
[else
|
|
(error 'relocate-pattern "Internal error when relocating ~s\n" a-pat)]))
|
|
|
|
|
|
; token-id: string -> boolean
|
|
;; Produces true if the id we see should be treated as the name of a token.
|
|
;; By convention, tokens are all upper-cased.
|
|
(define (token-id? id)
|
|
(string=? (string-upcase id)
|
|
id))
|
|
|
|
|
|
|
|
;; position->pos: position -> pos
|
|
;; Coerses position structures from yaragg/parser-tools/lex to our own pos structures.
|
|
(define (position->pos a-pos)
|
|
(pos (position-offset a-pos)
|
|
(position-line a-pos)
|
|
(position-col a-pos)))
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;; During parsing, we should define the source of the input.
|
|
(define current-source (make-parameter #f))
|
|
|
|
(define current-prefix-out (make-parameter #f))
|
|
|
|
|
|
;; When bad things happen, we need to emit errors with source location.
|
|
(struct exn:fail:parse-grammar exn:fail (srclocs)
|
|
#:transparent
|
|
#:property prop:exn:srclocs (lambda (instance)
|
|
(exn:fail:parse-grammar-srclocs instance)))
|
|
|
|
(define current-parser-error-handler
|
|
(make-parameter
|
|
(lambda (tok-ok? tok-name tok-value start-pos end-pos)
|
|
(raise (exn:fail:parse-grammar
|
|
(format "Error while parsing grammar near: ~e [line=~a, column=~a, position=~a]"
|
|
tok-value
|
|
(pos-line start-pos)
|
|
(pos-col start-pos)
|
|
(pos-offset start-pos))
|
|
(current-continuation-marks)
|
|
(list (srcloc (current-source)
|
|
(pos-line start-pos)
|
|
(pos-col start-pos)
|
|
(pos-offset start-pos)
|
|
(and (and (number? (pos-offset end-pos))
|
|
(number? (pos-offset start-pos)))
|
|
(- (pos-offset end-pos)
|
|
(pos-offset start-pos))))))))))
|