Rework the grammar APIs

- Make symbol kind mismatches trigger contract errors in more places.
- Add a notion of "flat" grammars and "normal" grammars, where normal grammars can have recursive expressions for their substitution instead of just a list of symbols.
remotes/jackfirth/master
Jack Firth 2 years ago
parent 95e911dd98
commit e5d7cab7cb

@ -5,24 +5,65 @@
(provide
(struct-out cf-grammar)
(struct-out cf-production-rule)
(contract-out
[cf-grammar-start-rules (-> cf-grammar? (set/c cf-production-rule? #:kind 'immutable))]
[make-cf-grammar (-> #:rules (sequence/c cf-production-rule?) #:start-symbol any/c cf-grammar?)]
[make-cf-production-rule
(-> #:nonterminal any/c #:substitution (sequence/c grammar-symbol?) #:action semantic-action?
cf-production-rule?)]))
[grammar (-> #:rules (sequence/c production-rule?) #:start-symbol nonterminal-symbol? grammar?)]
[grammar? predicate/c]
[production-rule
(-> #:nonterminal nonterminal-symbol?
#:action semantic-action?
#:substitution (or/c production-expression? grammar-symbol?)
production-rule?)]
[production-rule? predicate/c]
[production-expression? predicate/c]
[group-expression (-> (sequence/c (or/c production-expression? grammar-symbol?)) group-expression?)]
[group-expression? predicate/c]
[choice-expression
(-> (sequence/c (or/c production-expression? grammar-symbol?)) choice-expression?)]
[choice-expression? predicate/c]
[repetition-expression
(->* ((or/c production-expression? grammar-symbol?))
(#:min-count exact-nonnegative-integer? #:max-count (or/c exact-nonnegative-integer? +inf.0))
repetition-expression?)]
[repetition-expression? predicate/c]))
(require racket/sequence
(module+ private
(provide
(contract-out
[grammar-flatten (-> grammar? flat-grammar?)]
[flat-grammar
(-> #:rules (sequence/c flat-production-rule?) #:start-symbol nonterminal-symbol? flat-grammar?)]
[flat-grammar? predicate/c]
[flat-grammar-rules (-> flat-grammar? (vectorof flat-production-rule? #:immutable #true))]
[flat-grammar-start-rules (-> flat-grammar? (set/c flat-production-rule? #:kind 'immutable))]
[flat-grammar-start-symbol (-> flat-grammar? any/c)]
[flat-production-rule
(-> #:nonterminal nonterminal-symbol?
#:substitution (sequence/c grammar-symbol?)
#:action semantic-action?
flat-production-rule?)]
[flat-production-rule? predicate/c]
[flat-production-rule-action (-> flat-production-rule? semantic-action?)]
[flat-production-rule-nonterminal (-> flat-production-rule? any/c)]
[flat-production-rule-substitution
(-> flat-production-rule? (vectorof grammar-symbol? #:immutable #true))])))
(require racket/match
racket/sequence
racket/set
rebellion/collection/vector
yaragg/base/derivation
rebellion/collection/vector/builder
yaragg/base/semantic-action
yaragg/base/symbol)
(module+ test
(require (submod "..")
rackunit
rebellion/private/static-name))
;@----------------------------------------------------------------------------------------------------
@ -34,27 +75,254 @@
;; A: the labels that grammar rules may have attached to them via the (Label-Action A) semantic
;; action. These show up in parse tree branches, and can be used to determine which production
;; rule produced a derivation.
(struct cf-grammar (rules start-symbol) #:transparent)
(struct flat-grammar (rules start-symbol)
#:transparent
#:omit-define-syntaxes
#:constructor-name constructor:flat-grammar)
(define (cf-grammar-start-rules grammar)
(define start (cf-grammar-start-symbol grammar))
(for/set ([rule (in-vector (cf-grammar-rules grammar))]
#:when (equal? (cf-production-rule-nonterminal rule) start))
(define (flat-grammar-start-rules grammar)
(define start (flat-grammar-start-symbol grammar))
(for/set ([rule (in-vector (flat-grammar-rules grammar))]
#:when (equal? (flat-production-rule-nonterminal rule) start))
rule))
;; A (Context-Free-Production-Rule T S A) contains a nonterminal symbol of type S, semantic action of
;; type (Semnatic-Action A), and a substitution sequence of (Grammar-Symbol T S) values, stored in an
;; immutable vector.
(struct cf-production-rule (nonterminal action substitution) #:transparent)
(struct flat-production-rule (nonterminal action substitution)
#:transparent
#:omit-define-syntaxes
#:constructor-name constructor:flat-production-rule)
(define (flat-grammar #:rules rules #:start-symbol start)
(constructor:flat-grammar (sequence->vector rules) start))
(define (flat-production-rule #:nonterminal nonterminal
#:action action
#:substitution substitution)
(constructor:flat-production-rule nonterminal action (sequence->vector substitution)))
(struct grammar (rules start-symbol)
#:transparent
#:guard (λ (rules start-symbol _) (values (sequence->vector rules) start-symbol))
#:omit-define-syntaxes
#:constructor-name constructor:grammar)
(define (grammar #:rules rules #:start-symbol start-symbol)
(constructor:grammar rules start-symbol))
(struct production-rule (nonterminal action substitution)
#:transparent
#:omit-define-syntaxes
#:constructor-name constructor:production-rule)
(define (production-rule #:nonterminal nonterminal #:action action #:substitution substitution)
(constructor:production-rule nonterminal action substitution))
(struct production-expression () #:transparent)
(struct group-expression production-expression (subexpressions)
#:transparent
#:guard (λ (subexpressions _) (sequence->vector subexpressions)))
(struct repetition-expression production-expression (subexpression min-count max-count)
#:transparent
#:omit-define-syntaxes
#:constructor-name constructor:repetition-expression)
(define (repetition-expression subexpression #:min-count [min-count 0] #:max-count [max-count +inf.0])
(constructor:repetition-expression subexpression min-count max-count))
(struct choice-expression production-expression (choices)
#:guard (λ (choices _) (sequence->vector choices))
#:transparent)
(struct virtual-symbol (base-symbol counter) #:transparent)
(define (production-rule-flatten rule)
(define original-nonterminal (production-rule-nonterminal rule))
(define new-rules (make-vector-builder))
(define counter 0)
(define (fresh-symbol!)
(define sym
(nonterminal-symbol (virtual-symbol (nonterminal-symbol-value original-nonterminal) counter)))
(set! counter (add1 counter))
sym)
(define (process-top-level-expression expression)
(match expression
[(group-expression subexpressions)
(for/vector ([expression (in-vector subexpressions)])
(process-expression expression))]
[_ (list (process-expression expression))]))
(define (process-expression expression)
(match expression
[(? terminal-symbol?) expression]
[(? nonterminal-symbol?) expression]
[(group-expression subexpressions)
(define subrule-symbol (fresh-symbol!))
(define group-symbols
(for/vector ([subexpr (in-vector subexpressions)])
(process-expression subexpr)))
(define group-rule
(flat-production-rule
#:nonterminal subrule-symbol #:action splice-action #:substitution group-symbols))
(vector-builder-add new-rules group-rule)
subrule-symbol]
[(choice-expression choices)
(define subrule-symbol (fresh-symbol!))
(define choice-symbol-vectors
(for/list ([choice (in-vector choices)])
(process-top-level-expression choice)))
;; We reverse the order the choice symbols are added to ensure the final vector of added rules
;; is in the right order.
(for ([choice-symbols (in-list (reverse choice-symbol-vectors))])
(define choice-rule
(flat-production-rule
#:nonterminal subrule-symbol #:action splice-action #:substitution choice-symbols))
(vector-builder-add new-rules choice-rule))
subrule-symbol]
[(? repetition-expression?)
#:when (and (equal? (repetition-expression-min-count expression) 0)
(equal? (repetition-expression-max-count expression) +inf.0))
(define subexpression (repetition-expression-subexpression expression))
(define subrule-symbol (fresh-symbol!))
(define repetition-symbols (process-top-level-expression subexpression))
(define empty-rule
(flat-production-rule
#:nonterminal subrule-symbol #:action splice-action #:substitution '()))
(define repetition-rule
(flat-production-rule
#:nonterminal subrule-symbol
#:action splice-action
#:substitution
(sequence-append repetition-symbols (list subrule-symbol))))
(vector-builder-add new-rules empty-rule repetition-rule)
subrule-symbol]))
(define processed (process-top-level-expression (production-rule-substitution rule)))
(define top-level-rule
(flat-production-rule
#:nonterminal original-nonterminal
#:action (production-rule-action rule)
#:substitution processed))
(vector-builder-add new-rules top-level-rule)
(vector-reverse (build-vector new-rules)))
(define (vector-reverse vec)
(define size (vector-length vec))
(define copy (make-vector size))
(for/vector ([i (in-range 0 size)])
(define j (- size i 1))
(vector-set! copy j (vector-ref vec i)))
(vector->immutable-vector copy))
(define (grammar-flatten grammar)
(define builder (make-vector-builder))
(for ([rule (in-vector (grammar-rules grammar))])
(vector-builder-add-all builder (production-rule-flatten rule)))
(flat-grammar #:rules (build-vector builder) #:start-symbol (grammar-start-symbol grammar)))
(module+ test
(test-case (name-string production-rule-flatten)
(define x (nonterminal-symbol 'x))
(define x.0 (nonterminal-symbol (virtual-symbol 'x 0)))
(define a (terminal-symbol 'a))
(define b (terminal-symbol 'b))
(define c (terminal-symbol 'c))
(define d (terminal-symbol 'd))
(define (make-cf-grammar #:rules rules #:start-symbol start)
(cf-grammar (sequence->vector rules) start))
;; x: a, (b, c), d
;;
;; ->
;;
;; x: a, x.0, d
;; x.0: b, c (* splice *)
(test-case "grouping"
(define rule
(production-rule
#:nonterminal x
#:action (label-action 'x)
#:substitution (group-expression (list a (group-expression (list b c)) d))))
(define expected-rules
(vector
(flat-production-rule
#:nonterminal x #:action (label-action 'x) #:substitution (list a x.0 d))
(flat-production-rule #:nonterminal x.0 #:action splice-action #:substitution (list b c))))
(check-equal? (production-rule-flatten rule) expected-rules))
;; x: a, (b | c), d
;;
;; ->
;;
;; x: a, x.0, d
;; x.0: b (* splice *)
;; x.0: c (* splice *)
(test-case "choice"
(define rule
(production-rule
#:nonterminal x
#:action (label-action 'x)
#:substitution (group-expression (list a (choice-expression (list b c)) d))))
(define expected-rules
(vector
(flat-production-rule
#:nonterminal x #:action (label-action 'x) #:substitution (list a x.0 d))
(flat-production-rule #:nonterminal x.0 #:action splice-action #:substitution (list b))
(flat-production-rule #:nonterminal x.0 #:action splice-action #:substitution (list c))))
(check-equal? (production-rule-flatten rule) expected-rules))
(define (make-cf-production-rule #:nonterminal nonterminal
#:action action
#:substitution substitution)
(cf-production-rule nonterminal action (sequence->vector substitution)))
;; x: a b*
;;
;; ->
;;
;; x: a x.0
;; x.0: b x.0 (* splice *)
;; x.0: (* splice *)
(test-case "repeating"
(define rule
(production-rule
#:nonterminal x
#:action (label-action 'x)
#:substitution (group-expression (list a (repetition-expression b) c))))
(define expected-rules
(vector
(flat-production-rule
#:nonterminal x #:action (label-action 'x) #:substitution (list a x.0 c))
(flat-production-rule
#:nonterminal x.0
#:action splice-action
#:substitution (list b x.0))
(flat-production-rule
#:nonterminal x.0
#:action splice-action
#:substitution '())))
(check-equal? (production-rule-flatten rule) expected-rules))))

@ -6,7 +6,7 @@
(provide
(contract-out
[earley-parser (-> cf-grammar? parser?)]))
[earley-parser (-> grammar? parser?)]))
(require racket/contract
@ -18,6 +18,7 @@
rebellion/private/guarded-block
yaragg/base/derivation
yaragg/base/grammar
(submod yaragg/base/grammar private)
yaragg/base/semantic-action
yaragg/base/symbol
yaragg/base/token
@ -34,7 +35,8 @@
(define (earley-parser grammar)
(make-parser #:deriver (λ (tokens) (earley-parse grammar tokens))))
(define flat (grammar-flatten grammar))
(make-parser #:deriver (λ (tokens) (earley-parse flat tokens))))
;; The hash keys are sppf-keys and the values are a list of sppf-child-pairs
@ -103,7 +105,7 @@
(guard (complete-sppf-key? key) then
(define tok (vector-ref tokens (complete-sppf-key-input-start key)))
(stream (terminal-derivation (token-value tok))))
(define action (cf-production-rule-action (incomplete-sppf-key-rule key)))
(define action (flat-production-rule-action (incomplete-sppf-key-rule key)))
(define possible-children (possible-children-lists forest key))
(for*/stream ([children (in-stream possible-children)]
[processed-children (in-stream (cartesian-stream (map loop children)))])
@ -119,9 +121,9 @@
(λ (_) 'earley-state)
(λ (this)
(define rule (earley-state-rule this))
(define substitution (cf-production-rule-substitution rule))
(define substitution (flat-production-rule-substitution rule))
(define pos (earley-state-substitution-position this))
(append (list (cf-production-rule-nonterminal rule) '->)
(append (list (nonterminal-symbol-value (flat-production-rule-nonterminal rule)) '->)
(for/list ([sym (in-vector substitution 0 pos)])
(if (terminal-symbol? sym)
(terminal-symbol-value sym)
@ -135,14 +137,14 @@
(define (initial-earley-states grammar)
(for/set ([rule (cf-grammar-start-rules grammar)])
(for/set ([rule (flat-grammar-start-rules grammar)])
(earley-state rule 0 0 #false)))
(define (earley-state-represents-successful-parse? state grammar)
(and (zero? (earley-state-input-position state))
(equal? (cf-production-rule-nonterminal (earley-state-rule state))
(cf-grammar-start-symbol grammar))))
(equal? (flat-production-rule-nonterminal (earley-state-rule state))
(flat-grammar-start-symbol grammar))))
(define (earley-parse grammar token-sequence)
@ -165,7 +167,7 @@
(guard (completed-state? next) then
;; find all states in S(j) of the form (X → α • Y β, j) and add (X → α Y • β, j)
(define j (earley-state-input-position next))
(define completed (cf-production-rule-nonterminal (earley-state-rule next)))
(define completed (flat-production-rule-nonterminal (earley-state-rule next)))
(define parent-states
(if (equal? j k)
(set-union unprocessed processed)
@ -174,7 +176,7 @@
(define symbol (earley-state-next-symbol next))
(guard (nonterminal-symbol? symbol) else
(set))
(predictor-states grammar (nonterminal-symbol-value symbol) k)))
(predictor-states grammar symbol k)))
(define new-unprocessed (set-subtract (set-remove added-states next) processed))
(process-states (set-union (set-rest unprocessed) new-unprocessed) (set-add processed next)))
@ -195,13 +197,13 @@
(define (completed-state? state)
(match-define (earley-state rule substitution-position _ _) state)
(equal? substitution-position
(vector-length (cf-production-rule-substitution rule))))
(vector-length (flat-production-rule-substitution rule))))
(define/contract (earley-state-next-symbol state)
(-> (and/c earley-state? (not/c completed-state?)) grammar-symbol?)
(match-define (earley-state rule substitution-position _ _) state)
(vector-ref (cf-production-rule-substitution rule) substitution-position))
(vector-ref (flat-production-rule-substitution rule) substitution-position))
(define (earley-state-advance-substitution state #:key key)
@ -210,9 +212,8 @@
(define (completer-states completed-nonterminal states completed-key #:forest forest)
(define expected (nonterminal-symbol completed-nonterminal))
(for/set ([s (in-set states)]
#:when (equal? (earley-state-next-symbol s) expected))
#:when (equal? (earley-state-next-symbol s) completed-nonterminal))
(define rule (earley-state-rule s))
(define start (earley-state-input-position s))
(define end (sppf-key-input-end completed-key))
@ -225,8 +226,8 @@
(define (predictor-states grammar nonterminal k)
;; add (Y → • γ, k) for every production in the grammar with Y on the left-hand side
(for/set ([rule (in-vector (cf-grammar-rules grammar))]
#:when (equal? (cf-production-rule-nonterminal rule) nonterminal))
(for/set ([rule (in-vector (flat-grammar-rules grammar))]
#:when (equal? (flat-production-rule-nonterminal rule) nonterminal))
(earley-state rule 0 k #false)))
@ -250,31 +251,25 @@
(test-case "earley-parser integration test"
;; Grammar and input taken from https://en.wikipedia.org/wiki/Earley_parser#Example
(define P-rule
(make-cf-production-rule
#:nonterminal 'P #:action (label-action 'P) #:substitution (list (nonterminal-symbol 'S))))
(define P (nonterminal-symbol 'P))
(define S (nonterminal-symbol 'S))
(define M (nonterminal-symbol 'M))
(define T (nonterminal-symbol 'T))
(define + (terminal-symbol '+))
(define * (terminal-symbol '*))
(define number (terminal-symbol 'number))
(define P-rule (production-rule #:nonterminal P #:action (label-action 'P) #:substitution S))
(define S-rule0
(make-cf-production-rule
#:nonterminal 'S
#:action (label-action 'S0)
#:substitution (list (nonterminal-symbol 'S) (terminal-symbol '+) (nonterminal-symbol 'M))))
(define S-rule1
(make-cf-production-rule
#:nonterminal 'S #:action (label-action 'S1) #:substitution (list (nonterminal-symbol 'M))))
(production-rule
#:nonterminal S #:action (label-action 'S0) #:substitution (group-expression (list S + M))))
(define S-rule1 (production-rule #:nonterminal S #:action (label-action 'S1) #:substitution M))
(define M-rule0
(make-cf-production-rule
#:nonterminal 'M
#:action (label-action 'M0)
#:substitution (list (nonterminal-symbol 'M) (terminal-symbol '*) (nonterminal-symbol 'T))))
(define M-rule1
(make-cf-production-rule
#:nonterminal 'M #:action (label-action 'M1) #:substitution (list (nonterminal-symbol 'T))))
(define T-rule
(make-cf-production-rule
#:nonterminal 'T #:action (label-action 'T) #:substitution (list (terminal-symbol 'number))))
(production-rule
#:nonterminal M #:action (label-action 'M0) #:substitution (group-expression (list M * T))))
(define M-rule1 (production-rule #:nonterminal M #:action (label-action 'M1) #:substitution T))
(define T-rule (production-rule #:nonterminal T #:action (label-action 'T) #:substitution number))
(define arithmetic-grammar
(make-cf-grammar
#:rules (list P-rule S-rule0 S-rule1 M-rule0 M-rule1 T-rule) #:start-symbol 'P))
(grammar #:rules (list P-rule S-rule0 S-rule1 M-rule0 M-rule1 T-rule) #:start-symbol P))
(define parser (earley-parser arithmetic-grammar))
(test-case "datum parser"

Loading…
Cancel
Save