From e5d7cab7cbbcce2791cb4d127cf474664c704c58 Mon Sep 17 00:00:00 2001 From: Jack Firth Date: Thu, 21 Apr 2022 02:00:58 -0700 Subject: [PATCH] Rework the grammar APIs - Make symbol kind mismatches trigger contract errors in more places. - Add a notion of "flat" grammars and "normal" grammars, where normal grammars can have recursive expressions for their substitution instead of just a list of symbols. --- base/grammar.rkt | 310 ++++++++++++++++++++++++++++++++++++++++++---- parser/earley.rkt | 71 +++++------ 2 files changed, 322 insertions(+), 59 deletions(-) diff --git a/base/grammar.rkt b/base/grammar.rkt index 991f26c..1342bc6 100644 --- a/base/grammar.rkt +++ b/base/grammar.rkt @@ -5,24 +5,65 @@ (provide - (struct-out cf-grammar) - (struct-out cf-production-rule) (contract-out - [cf-grammar-start-rules (-> cf-grammar? (set/c cf-production-rule? #:kind 'immutable))] - [make-cf-grammar (-> #:rules (sequence/c cf-production-rule?) #:start-symbol any/c cf-grammar?)] - [make-cf-production-rule - (-> #:nonterminal any/c #:substitution (sequence/c grammar-symbol?) #:action semantic-action? - cf-production-rule?)])) + [grammar (-> #:rules (sequence/c production-rule?) #:start-symbol nonterminal-symbol? grammar?)] + [grammar? predicate/c] + [production-rule + (-> #:nonterminal nonterminal-symbol? + #:action semantic-action? + #:substitution (or/c production-expression? grammar-symbol?) + production-rule?)] + [production-rule? predicate/c] + [production-expression? predicate/c] + [group-expression (-> (sequence/c (or/c production-expression? grammar-symbol?)) group-expression?)] + [group-expression? predicate/c] + [choice-expression + (-> (sequence/c (or/c production-expression? grammar-symbol?)) choice-expression?)] + [choice-expression? predicate/c] + [repetition-expression + (->* ((or/c production-expression? grammar-symbol?)) + (#:min-count exact-nonnegative-integer? #:max-count (or/c exact-nonnegative-integer? +inf.0)) + repetition-expression?)] + [repetition-expression? predicate/c])) -(require racket/sequence +(module+ private + (provide + (contract-out + [grammar-flatten (-> grammar? flat-grammar?)] + [flat-grammar + (-> #:rules (sequence/c flat-production-rule?) #:start-symbol nonterminal-symbol? flat-grammar?)] + [flat-grammar? predicate/c] + [flat-grammar-rules (-> flat-grammar? (vectorof flat-production-rule? #:immutable #true))] + [flat-grammar-start-rules (-> flat-grammar? (set/c flat-production-rule? #:kind 'immutable))] + [flat-grammar-start-symbol (-> flat-grammar? any/c)] + [flat-production-rule + (-> #:nonterminal nonterminal-symbol? + #:substitution (sequence/c grammar-symbol?) + #:action semantic-action? + flat-production-rule?)] + [flat-production-rule? predicate/c] + [flat-production-rule-action (-> flat-production-rule? semantic-action?)] + [flat-production-rule-nonterminal (-> flat-production-rule? any/c)] + [flat-production-rule-substitution + (-> flat-production-rule? (vectorof grammar-symbol? #:immutable #true))]))) + + +(require racket/match + racket/sequence racket/set rebellion/collection/vector - yaragg/base/derivation + rebellion/collection/vector/builder yaragg/base/semantic-action yaragg/base/symbol) +(module+ test + (require (submod "..") + rackunit + rebellion/private/static-name)) + + ;@---------------------------------------------------------------------------------------------------- @@ -34,27 +75,254 @@ ;; A: the labels that grammar rules may have attached to them via the (Label-Action A) semantic ;; action. These show up in parse tree branches, and can be used to determine which production ;; rule produced a derivation. -(struct cf-grammar (rules start-symbol) #:transparent) +(struct flat-grammar (rules start-symbol) + #:transparent + #:omit-define-syntaxes + #:constructor-name constructor:flat-grammar) -(define (cf-grammar-start-rules grammar) - (define start (cf-grammar-start-symbol grammar)) - (for/set ([rule (in-vector (cf-grammar-rules grammar))] - #:when (equal? (cf-production-rule-nonterminal rule) start)) +(define (flat-grammar-start-rules grammar) + (define start (flat-grammar-start-symbol grammar)) + (for/set ([rule (in-vector (flat-grammar-rules grammar))] + #:when (equal? (flat-production-rule-nonterminal rule) start)) rule)) ;; A (Context-Free-Production-Rule T S A) contains a nonterminal symbol of type S, semantic action of ;; type (Semnatic-Action A), and a substitution sequence of (Grammar-Symbol T S) values, stored in an ;; immutable vector. -(struct cf-production-rule (nonterminal action substitution) #:transparent) +(struct flat-production-rule (nonterminal action substitution) + #:transparent + #:omit-define-syntaxes + #:constructor-name constructor:flat-production-rule) + + +(define (flat-grammar #:rules rules #:start-symbol start) + (constructor:flat-grammar (sequence->vector rules) start)) + + +(define (flat-production-rule #:nonterminal nonterminal + #:action action + #:substitution substitution) + (constructor:flat-production-rule nonterminal action (sequence->vector substitution))) + + +(struct grammar (rules start-symbol) + #:transparent + #:guard (λ (rules start-symbol _) (values (sequence->vector rules) start-symbol)) + #:omit-define-syntaxes + #:constructor-name constructor:grammar) + + +(define (grammar #:rules rules #:start-symbol start-symbol) + (constructor:grammar rules start-symbol)) + + +(struct production-rule (nonterminal action substitution) + #:transparent + #:omit-define-syntaxes + #:constructor-name constructor:production-rule) + + +(define (production-rule #:nonterminal nonterminal #:action action #:substitution substitution) + (constructor:production-rule nonterminal action substitution)) + + +(struct production-expression () #:transparent) + + +(struct group-expression production-expression (subexpressions) + #:transparent + #:guard (λ (subexpressions _) (sequence->vector subexpressions))) + + +(struct repetition-expression production-expression (subexpression min-count max-count) + #:transparent + #:omit-define-syntaxes + #:constructor-name constructor:repetition-expression) + + +(define (repetition-expression subexpression #:min-count [min-count 0] #:max-count [max-count +inf.0]) + (constructor:repetition-expression subexpression min-count max-count)) + + +(struct choice-expression production-expression (choices) + #:guard (λ (choices _) (sequence->vector choices)) + #:transparent) + + +(struct virtual-symbol (base-symbol counter) #:transparent) + + +(define (production-rule-flatten rule) + + (define original-nonterminal (production-rule-nonterminal rule)) + + (define new-rules (make-vector-builder)) + (define counter 0) + + (define (fresh-symbol!) + (define sym + (nonterminal-symbol (virtual-symbol (nonterminal-symbol-value original-nonterminal) counter))) + (set! counter (add1 counter)) + sym) + + (define (process-top-level-expression expression) + (match expression + [(group-expression subexpressions) + (for/vector ([expression (in-vector subexpressions)]) + (process-expression expression))] + [_ (list (process-expression expression))])) + + (define (process-expression expression) + (match expression + [(? terminal-symbol?) expression] + [(? nonterminal-symbol?) expression] + + [(group-expression subexpressions) + (define subrule-symbol (fresh-symbol!)) + (define group-symbols + (for/vector ([subexpr (in-vector subexpressions)]) + (process-expression subexpr))) + (define group-rule + (flat-production-rule + #:nonterminal subrule-symbol #:action splice-action #:substitution group-symbols)) + (vector-builder-add new-rules group-rule) + subrule-symbol] + + [(choice-expression choices) + (define subrule-symbol (fresh-symbol!)) + (define choice-symbol-vectors + (for/list ([choice (in-vector choices)]) + (process-top-level-expression choice))) + + ;; We reverse the order the choice symbols are added to ensure the final vector of added rules + ;; is in the right order. + (for ([choice-symbols (in-list (reverse choice-symbol-vectors))]) + (define choice-rule + (flat-production-rule + #:nonterminal subrule-symbol #:action splice-action #:substitution choice-symbols)) + (vector-builder-add new-rules choice-rule)) + subrule-symbol] + + [(? repetition-expression?) + #:when (and (equal? (repetition-expression-min-count expression) 0) + (equal? (repetition-expression-max-count expression) +inf.0)) + (define subexpression (repetition-expression-subexpression expression)) + (define subrule-symbol (fresh-symbol!)) + (define repetition-symbols (process-top-level-expression subexpression)) + (define empty-rule + (flat-production-rule + #:nonterminal subrule-symbol #:action splice-action #:substitution '())) + (define repetition-rule + (flat-production-rule + #:nonterminal subrule-symbol + #:action splice-action + #:substitution + (sequence-append repetition-symbols (list subrule-symbol)))) + (vector-builder-add new-rules empty-rule repetition-rule) + subrule-symbol])) + + (define processed (process-top-level-expression (production-rule-substitution rule))) + + (define top-level-rule + (flat-production-rule + #:nonterminal original-nonterminal + #:action (production-rule-action rule) + #:substitution processed)) + (vector-builder-add new-rules top-level-rule) + (vector-reverse (build-vector new-rules))) + + +(define (vector-reverse vec) + (define size (vector-length vec)) + (define copy (make-vector size)) + (for/vector ([i (in-range 0 size)]) + (define j (- size i 1)) + (vector-set! copy j (vector-ref vec i))) + (vector->immutable-vector copy)) + + +(define (grammar-flatten grammar) + (define builder (make-vector-builder)) + (for ([rule (in-vector (grammar-rules grammar))]) + (vector-builder-add-all builder (production-rule-flatten rule))) + (flat-grammar #:rules (build-vector builder) #:start-symbol (grammar-start-symbol grammar))) + + +(module+ test + (test-case (name-string production-rule-flatten) + (define x (nonterminal-symbol 'x)) + (define x.0 (nonterminal-symbol (virtual-symbol 'x 0))) + (define a (terminal-symbol 'a)) + (define b (terminal-symbol 'b)) + (define c (terminal-symbol 'c)) + (define d (terminal-symbol 'd)) -(define (make-cf-grammar #:rules rules #:start-symbol start) - (cf-grammar (sequence->vector rules) start)) + ;; x: a, (b, c), d + ;; + ;; -> + ;; + ;; x: a, x.0, d + ;; x.0: b, c (* splice *) + (test-case "grouping" + (define rule + (production-rule + #:nonterminal x + #:action (label-action 'x) + #:substitution (group-expression (list a (group-expression (list b c)) d)))) + (define expected-rules + (vector + (flat-production-rule + #:nonterminal x #:action (label-action 'x) #:substitution (list a x.0 d)) + (flat-production-rule #:nonterminal x.0 #:action splice-action #:substitution (list b c)))) + (check-equal? (production-rule-flatten rule) expected-rules)) + ;; x: a, (b | c), d + ;; + ;; -> + ;; + ;; x: a, x.0, d + ;; x.0: b (* splice *) + ;; x.0: c (* splice *) + (test-case "choice" + (define rule + (production-rule + #:nonterminal x + #:action (label-action 'x) + #:substitution (group-expression (list a (choice-expression (list b c)) d)))) + (define expected-rules + (vector + (flat-production-rule + #:nonterminal x #:action (label-action 'x) #:substitution (list a x.0 d)) + (flat-production-rule #:nonterminal x.0 #:action splice-action #:substitution (list b)) + (flat-production-rule #:nonterminal x.0 #:action splice-action #:substitution (list c)))) + (check-equal? (production-rule-flatten rule) expected-rules)) -(define (make-cf-production-rule #:nonterminal nonterminal - #:action action - #:substitution substitution) - (cf-production-rule nonterminal action (sequence->vector substitution))) + ;; x: a b* + ;; + ;; -> + ;; + ;; x: a x.0 + ;; x.0: b x.0 (* splice *) + ;; x.0: (* splice *) + (test-case "repeating" + (define rule + (production-rule + #:nonterminal x + #:action (label-action 'x) + #:substitution (group-expression (list a (repetition-expression b) c)))) + (define expected-rules + (vector + (flat-production-rule + #:nonterminal x #:action (label-action 'x) #:substitution (list a x.0 c)) + (flat-production-rule + #:nonterminal x.0 + #:action splice-action + #:substitution (list b x.0)) + (flat-production-rule + #:nonterminal x.0 + #:action splice-action + #:substitution '()))) + (check-equal? (production-rule-flatten rule) expected-rules)))) diff --git a/parser/earley.rkt b/parser/earley.rkt index 94b5f05..c4be75f 100644 --- a/parser/earley.rkt +++ b/parser/earley.rkt @@ -6,7 +6,7 @@ (provide (contract-out - [earley-parser (-> cf-grammar? parser?)])) + [earley-parser (-> grammar? parser?)])) (require racket/contract @@ -18,6 +18,7 @@ rebellion/private/guarded-block yaragg/base/derivation yaragg/base/grammar + (submod yaragg/base/grammar private) yaragg/base/semantic-action yaragg/base/symbol yaragg/base/token @@ -34,7 +35,8 @@ (define (earley-parser grammar) - (make-parser #:deriver (λ (tokens) (earley-parse grammar tokens)))) + (define flat (grammar-flatten grammar)) + (make-parser #:deriver (λ (tokens) (earley-parse flat tokens)))) ;; The hash keys are sppf-keys and the values are a list of sppf-child-pairs @@ -103,7 +105,7 @@ (guard (complete-sppf-key? key) then (define tok (vector-ref tokens (complete-sppf-key-input-start key))) (stream (terminal-derivation (token-value tok)))) - (define action (cf-production-rule-action (incomplete-sppf-key-rule key))) + (define action (flat-production-rule-action (incomplete-sppf-key-rule key))) (define possible-children (possible-children-lists forest key)) (for*/stream ([children (in-stream possible-children)] [processed-children (in-stream (cartesian-stream (map loop children)))]) @@ -119,9 +121,9 @@ (λ (_) 'earley-state) (λ (this) (define rule (earley-state-rule this)) - (define substitution (cf-production-rule-substitution rule)) + (define substitution (flat-production-rule-substitution rule)) (define pos (earley-state-substitution-position this)) - (append (list (cf-production-rule-nonterminal rule) '->) + (append (list (nonterminal-symbol-value (flat-production-rule-nonterminal rule)) '->) (for/list ([sym (in-vector substitution 0 pos)]) (if (terminal-symbol? sym) (terminal-symbol-value sym) @@ -135,14 +137,14 @@ (define (initial-earley-states grammar) - (for/set ([rule (cf-grammar-start-rules grammar)]) + (for/set ([rule (flat-grammar-start-rules grammar)]) (earley-state rule 0 0 #false))) (define (earley-state-represents-successful-parse? state grammar) (and (zero? (earley-state-input-position state)) - (equal? (cf-production-rule-nonterminal (earley-state-rule state)) - (cf-grammar-start-symbol grammar)))) + (equal? (flat-production-rule-nonterminal (earley-state-rule state)) + (flat-grammar-start-symbol grammar)))) (define (earley-parse grammar token-sequence) @@ -165,7 +167,7 @@ (guard (completed-state? next) then ;; find all states in S(j) of the form (X → α • Y β, j) and add (X → α Y • β, j) (define j (earley-state-input-position next)) - (define completed (cf-production-rule-nonterminal (earley-state-rule next))) + (define completed (flat-production-rule-nonterminal (earley-state-rule next))) (define parent-states (if (equal? j k) (set-union unprocessed processed) @@ -174,7 +176,7 @@ (define symbol (earley-state-next-symbol next)) (guard (nonterminal-symbol? symbol) else (set)) - (predictor-states grammar (nonterminal-symbol-value symbol) k))) + (predictor-states grammar symbol k))) (define new-unprocessed (set-subtract (set-remove added-states next) processed)) (process-states (set-union (set-rest unprocessed) new-unprocessed) (set-add processed next))) @@ -195,13 +197,13 @@ (define (completed-state? state) (match-define (earley-state rule substitution-position _ _) state) (equal? substitution-position - (vector-length (cf-production-rule-substitution rule)))) + (vector-length (flat-production-rule-substitution rule)))) (define/contract (earley-state-next-symbol state) (-> (and/c earley-state? (not/c completed-state?)) grammar-symbol?) (match-define (earley-state rule substitution-position _ _) state) - (vector-ref (cf-production-rule-substitution rule) substitution-position)) + (vector-ref (flat-production-rule-substitution rule) substitution-position)) (define (earley-state-advance-substitution state #:key key) @@ -210,9 +212,8 @@ (define (completer-states completed-nonterminal states completed-key #:forest forest) - (define expected (nonterminal-symbol completed-nonterminal)) (for/set ([s (in-set states)] - #:when (equal? (earley-state-next-symbol s) expected)) + #:when (equal? (earley-state-next-symbol s) completed-nonterminal)) (define rule (earley-state-rule s)) (define start (earley-state-input-position s)) (define end (sppf-key-input-end completed-key)) @@ -225,8 +226,8 @@ (define (predictor-states grammar nonterminal k) ;; add (Y → • γ, k) for every production in the grammar with Y on the left-hand side - (for/set ([rule (in-vector (cf-grammar-rules grammar))] - #:when (equal? (cf-production-rule-nonterminal rule) nonterminal)) + (for/set ([rule (in-vector (flat-grammar-rules grammar))] + #:when (equal? (flat-production-rule-nonterminal rule) nonterminal)) (earley-state rule 0 k #false))) @@ -250,31 +251,25 @@ (test-case "earley-parser integration test" ;; Grammar and input taken from https://en.wikipedia.org/wiki/Earley_parser#Example - (define P-rule - (make-cf-production-rule - #:nonterminal 'P #:action (label-action 'P) #:substitution (list (nonterminal-symbol 'S)))) + (define P (nonterminal-symbol 'P)) + (define S (nonterminal-symbol 'S)) + (define M (nonterminal-symbol 'M)) + (define T (nonterminal-symbol 'T)) + (define + (terminal-symbol '+)) + (define * (terminal-symbol '*)) + (define number (terminal-symbol 'number)) + (define P-rule (production-rule #:nonterminal P #:action (label-action 'P) #:substitution S)) (define S-rule0 - (make-cf-production-rule - #:nonterminal 'S - #:action (label-action 'S0) - #:substitution (list (nonterminal-symbol 'S) (terminal-symbol '+) (nonterminal-symbol 'M)))) - (define S-rule1 - (make-cf-production-rule - #:nonterminal 'S #:action (label-action 'S1) #:substitution (list (nonterminal-symbol 'M)))) + (production-rule + #:nonterminal S #:action (label-action 'S0) #:substitution (group-expression (list S + M)))) + (define S-rule1 (production-rule #:nonterminal S #:action (label-action 'S1) #:substitution M)) (define M-rule0 - (make-cf-production-rule - #:nonterminal 'M - #:action (label-action 'M0) - #:substitution (list (nonterminal-symbol 'M) (terminal-symbol '*) (nonterminal-symbol 'T)))) - (define M-rule1 - (make-cf-production-rule - #:nonterminal 'M #:action (label-action 'M1) #:substitution (list (nonterminal-symbol 'T)))) - (define T-rule - (make-cf-production-rule - #:nonterminal 'T #:action (label-action 'T) #:substitution (list (terminal-symbol 'number)))) + (production-rule + #:nonterminal M #:action (label-action 'M0) #:substitution (group-expression (list M * T)))) + (define M-rule1 (production-rule #:nonterminal M #:action (label-action 'M1) #:substitution T)) + (define T-rule (production-rule #:nonterminal T #:action (label-action 'T) #:substitution number)) (define arithmetic-grammar - (make-cf-grammar - #:rules (list P-rule S-rule0 S-rule1 M-rule0 M-rule1 T-rule) #:start-symbol 'P)) + (grammar #:rules (list P-rule S-rule0 S-rule1 M-rule0 M-rule1 T-rule) #:start-symbol P)) (define parser (earley-parser arithmetic-grammar)) (test-case "datum parser"