diff --git a/grammar.rkt b/grammar.rkt new file mode 100644 index 0000000..14e4aed --- /dev/null +++ b/grammar.rkt @@ -0,0 +1,288 @@ +#lang racket/base + + +(require racket/contract + racket/match + racket/sequence + racket/set + racket/struct + rebellion/collection/vector + rebellion/private/guarded-block) + + +(module+ test + (require (submod "..") + rackunit)) + + +;; Parsing takes a (Grammar T S L) and a sequence of (Token T V) and produces a set of +;; (Parser-Derivation V S L) (also called a "parse forest"). A grammar contains an immutable +;; vector of (Context-Free-Production-Rule T S L) and a start symbol of type S. +;; T: the terminals the grammar parses. Corresponds to the type field of the input tokens. +;; S: the nonterminals the grammar rules are defined in terms of. These show up in parse tree +;; branches. +;; L: the lables that grammar rules may have attached to them. These show up in parse tree +;; branches alongside nonterminals, and can be used to determine which production rule for a +;; particular nonterminal produced a derivation. +(struct context-free-grammar (rules start-symbol) #:transparent) + + +(define (grammar-start-rules grammar) + (define start (context-free-grammar-start-symbol grammar)) + (for/set ([rule (in-vector (context-free-grammar-rules grammar))] + #:when (equal? (context-free-production-rule-nonterminal rule) start)) + rule)) + + +;; A (Token T V) is a tagged value. The grammar rules are defined in terms of the type tag, +;; whereas the value is what appears in leaf nodes of the resulting parse trees. +(struct token (type value) #:transparent) + + +;; A (Context-Free-Production-Rule T S L) contains a nonterminal symbol of type S, a label of type L, +;; and a substitution sequence of (Grammar-Symbol T S) values, stored in an immutable vector. +(struct context-free-production-rule (nonterminal label substitution) #:transparent) + + +;; A (Grammar-Symbol T S) is either a (Terminal-Symbol T) or a (Nonterminal-Symbol S) +(struct grammar-symbol () #:transparent) +(struct terminal-symbol grammar-symbol (value) #:transparent) +(struct nonterminal-symbol grammar-symbol (value) #:transparent) + + +(define (make-grammar #:rules rules #:start-symbol start) + (context-free-grammar (sequence->vector rules) start)) + + +(define (make-rule #:symbol symbol #:substitution substitution #:label label) + (context-free-production-rule symbol label (sequence->vector substitution))) + + +;; A (Parser-Derivation V S L) is either a (Terminal-Derivation V) or a (Nonterminal-Derivation T S L) +(struct parser-derivation () #:transparent) + +;; A (Terminal-Derivation V) represents a terminal that was matched by the grammar. It contains the +;; value of the (Token T V) that was matched. +(struct terminal-derivation parser-derivation (value) #:transparent) + +;; A (Nonterminal-Derivation T S L) represents a nonterminal that was matched by the grammar. It +;; contains the nonterminal symbol of type T that was matched, the label of type L of the specific +;; rule that matched, and an immutable vector of subderivations +(struct nonterminal-derivation parser-derivation (symbol label children) #:transparent) + + +(define (make-nonterminal-derivation symbol label [children '()]) + (nonterminal-derivation symbol label (sequence->vector children))) + + +(define derivation + (case-lambda + [(value) (terminal-derivation value)] + [(symbol label . children) (make-nonterminal-derivation symbol label children)])) + + +(define a (terminal-symbol 'a)) +(define b (terminal-symbol 'b)) +(define S (nonterminal-symbol 'S)) + + +(define as-then-bs-grammar + (make-grammar + #:rules + (list + (make-rule #:symbol (nonterminal-symbol-value S) #:substitution (list a S b) #:label 'recur) + (make-rule #:symbol (nonterminal-symbol-value S) #:substitution (list a b) #:label 'done)) + #:start-symbol (nonterminal-symbol-value S))) + + +(define input + (list (token 'a 'a1) (token 'a 'a2) (token 'a 'a3) (token 'b 'b1) (token 'b 'b2) (token 'b 'b3))) + + +(define expected-parse-tree + (derivation 'S 'recur + (derivation 'a1) + (derivation 'S 'recur + (derivation 'a2) + (derivation 'S 'done (derivation 'a3) (derivation 'b1)) + (derivation 'b2)) + (derivation 'b3))) + + +;; Earley parser + + +(struct earley-state (rule substitution-position input-position) + #:transparent + #:property prop:custom-print-quotable 'never + #:methods gen:custom-write + [(define write-proc + (make-constructor-style-printer + (λ (_) 'earley-state) + (λ (this) + (define rule (earley-state-rule this)) + (define substitution (context-free-production-rule-substitution rule)) + (define pos (earley-state-substitution-position this)) + (list (list (context-free-production-rule-nonterminal rule) '->) + (for/list ([sym (in-vector substitution 0 pos)]) + (if (terminal-symbol? sym) + (terminal-symbol-value sym) + (nonterminal-symbol-value sym))) + (list '•) + (for/list ([sym (in-vector substitution pos)]) + (if (terminal-symbol? sym) + (terminal-symbol-value sym) + (nonterminal-symbol-value sym))) + (earley-state-input-position this)))))]) + + +(define (initial-earley-states grammar) + (for/set ([rule (grammar-start-rules grammar)]) + (earley-state rule 0 0))) + + +(define (earley-parse grammar token-sequence) + (define tokens (sequence->vector token-sequence)) + (define token-count (vector-length tokens)) + (define position-count (add1 token-count)) + (define states (make-vector position-count (set))) + (vector-set! states 0 (initial-earley-states grammar)) + (for ([k (in-range 0 position-count)]) + + ;; Prediction and completion + (define/guard (process-states unprocessed processed) + (guard (set-empty? unprocessed) then + processed) + (define next (set-first unprocessed)) + + (define added-states + (guarded-block + (guard (completed-state? next) then + ;; find all states in S(j) of the form (X → α • Y β, j) and add (X → α Y • β, j) + (define j (earley-state-input-position next)) + (define completed (context-free-production-rule-nonterminal (earley-state-rule next))) + (define parent-states + (if (equal? j k) + (set-union unprocessed processed) + (vector-ref states j))) + (completer-states completed parent-states)) + (define symbol (earley-state-next-symbol next)) + (guard (nonterminal-symbol? symbol) else + (set)) + (predictor-states grammar (nonterminal-symbol-value symbol) k))) + + (define new-unprocessed (set-subtract (set-remove added-states next) processed)) + (process-states (set-union (set-rest unprocessed) new-unprocessed) (set-add processed next))) + + (define processed (process-states (vector-ref states k) (set))) + (vector-set! states k processed) + (unless (equal? k token-count) + (vector-set! states (add1 k) (scanner-states processed k (vector-ref tokens k))))) + states) + + +(define (completed-state? state) + (match-define (earley-state rule substitution-position _) state) + (equal? substitution-position + (vector-length (context-free-production-rule-substitution rule)))) + + +(define/contract (earley-state-next-symbol state) + (-> (and/c earley-state? (not/c completed-state?)) grammar-symbol?) + (match-define (earley-state rule substitution-position _) state) + (vector-ref (context-free-production-rule-substitution rule) substitution-position)) + + +(define (earley-state-advance-substitution state) + (match-define (earley-state rule substitution-position input-position) state) + (earley-state rule (add1 substitution-position) input-position)) + + +(define (completer-states completed-nonterminal states) + (for/set ([s (in-set states)] + #:when (equal? (earley-state-next-symbol s) (nonterminal-symbol completed-nonterminal))) + (earley-state-advance-substitution s))) + + +(define (predictor-states grammar nonterminal k) + ;; add (Y → • γ, k) for every production in the grammar with Y on the left-hand side + (for/set ([rule (in-vector (context-free-grammar-rules grammar))] + #:when (equal? (context-free-production-rule-nonterminal rule) nonterminal)) + (earley-state rule 0 k))) + + +(define (scanner-states states k next-token) + (define type (token-type next-token)) + (define expected (terminal-symbol type)) + (for/set ([s (in-set states)] + #:when (not (completed-state? s)) + #:when (equal? (earley-state-next-symbol s) expected)) + (earley-state-advance-substitution s))) + + + +(module+ test + (test-case "earley-parse integration test" + + ;; Grammar, input, and states taken from https://en.wikipedia.org/wiki/Earley_parser#Example + (define P-rule (make-rule #:symbol 'P #:label 0 #:substitution (list (nonterminal-symbol 'S)))) + (define S-rule0 + (make-rule + #:symbol 'S + #:label 0 + #:substitution (list (nonterminal-symbol 'S) (terminal-symbol '+) (nonterminal-symbol 'M)))) + (define S-rule1 (make-rule #:symbol 'S #:label 1 #:substitution (list (nonterminal-symbol 'M)))) + (define M-rule0 + (make-rule + #:symbol 'M + #:label 0 + #:substitution (list (nonterminal-symbol 'M) (terminal-symbol '*) (nonterminal-symbol 'T)))) + (define M-rule1 (make-rule #:symbol 'M #:label 1 #:substitution (list (nonterminal-symbol 'T)))) + (define T-rule (make-rule #:symbol 'T #:label 0 #:substitution (list (terminal-symbol 'number)))) + (define arithmetic-grammar + (make-grammar #:rules (list P-rule S-rule0 S-rule1 M-rule0 M-rule1 T-rule) #:start-symbol 'P)) + (define input-tokens + (list (token 'number 2) (token '+ '+) (token 'number 3) (token '* '*) (token 'number 4))) + + (define arithmetic-parse-table + (earley-parse arithmetic-grammar input-tokens)) + + (define expected-arithmetic-parse-table + (vector + (set + (earley-state P-rule 0 0) + (earley-state S-rule0 0 0) + (earley-state S-rule1 0 0) + (earley-state M-rule0 0 0) + (earley-state M-rule1 0 0) + (earley-state T-rule 0 0)) + (set + (earley-state T-rule 1 0) + (earley-state M-rule0 1 0) + (earley-state M-rule1 1 0) + (earley-state S-rule0 1 0) + (earley-state S-rule1 1 0) + (earley-state P-rule 1 0)) + (set + (earley-state S-rule0 2 0) + (earley-state M-rule0 0 2) + (earley-state M-rule1 0 2) + (earley-state T-rule 0 2)) + (set + (earley-state T-rule 1 2) + (earley-state M-rule0 1 2) + (earley-state M-rule1 1 2) + (earley-state S-rule0 3 0) + (earley-state S-rule0 1 0) + (earley-state P-rule 1 0)) + (set + (earley-state M-rule0 2 2) + (earley-state T-rule 0 4)) + (set + (earley-state T-rule 1 4) + (earley-state M-rule0 3 2) + (earley-state M-rule0 1 2) + (earley-state S-rule0 3 0) + (earley-state S-rule0 1 0) + (earley-state P-rule 1 0)))) + (check-equal? arithmetic-parse-table expected-arithmetic-parse-table)))