Add new grammar API and Earley parser

remotes/jackfirth/master
Jack Firth 2 years ago
parent 5c953a3d08
commit 14949f8241

@ -0,0 +1,288 @@
#lang racket/base
(require racket/contract
racket/match
racket/sequence
racket/set
racket/struct
rebellion/collection/vector
rebellion/private/guarded-block)
(module+ test
(require (submod "..")
rackunit))
;; Parsing takes a (Grammar T S L) and a sequence of (Token T V) and produces a set of
;; (Parser-Derivation V S L) (also called a "parse forest"). A grammar contains an immutable
;; vector of (Context-Free-Production-Rule T S L) and a start symbol of type S.
;; T: the terminals the grammar parses. Corresponds to the type field of the input tokens.
;; S: the nonterminals the grammar rules are defined in terms of. These show up in parse tree
;; branches.
;; L: the lables that grammar rules may have attached to them. These show up in parse tree
;; branches alongside nonterminals, and can be used to determine which production rule for a
;; particular nonterminal produced a derivation.
(struct context-free-grammar (rules start-symbol) #:transparent)
(define (grammar-start-rules grammar)
(define start (context-free-grammar-start-symbol grammar))
(for/set ([rule (in-vector (context-free-grammar-rules grammar))]
#:when (equal? (context-free-production-rule-nonterminal rule) start))
rule))
;; A (Token T V) is a tagged value. The grammar rules are defined in terms of the type tag,
;; whereas the value is what appears in leaf nodes of the resulting parse trees.
(struct token (type value) #:transparent)
;; A (Context-Free-Production-Rule T S L) contains a nonterminal symbol of type S, a label of type L,
;; and a substitution sequence of (Grammar-Symbol T S) values, stored in an immutable vector.
(struct context-free-production-rule (nonterminal label substitution) #:transparent)
;; A (Grammar-Symbol T S) is either a (Terminal-Symbol T) or a (Nonterminal-Symbol S)
(struct grammar-symbol () #:transparent)
(struct terminal-symbol grammar-symbol (value) #:transparent)
(struct nonterminal-symbol grammar-symbol (value) #:transparent)
(define (make-grammar #:rules rules #:start-symbol start)
(context-free-grammar (sequence->vector rules) start))
(define (make-rule #:symbol symbol #:substitution substitution #:label label)
(context-free-production-rule symbol label (sequence->vector substitution)))
;; A (Parser-Derivation V S L) is either a (Terminal-Derivation V) or a (Nonterminal-Derivation T S L)
(struct parser-derivation () #:transparent)
;; A (Terminal-Derivation V) represents a terminal that was matched by the grammar. It contains the
;; value of the (Token T V) that was matched.
(struct terminal-derivation parser-derivation (value) #:transparent)
;; A (Nonterminal-Derivation T S L) represents a nonterminal that was matched by the grammar. It
;; contains the nonterminal symbol of type T that was matched, the label of type L of the specific
;; rule that matched, and an immutable vector of subderivations
(struct nonterminal-derivation parser-derivation (symbol label children) #:transparent)
(define (make-nonterminal-derivation symbol label [children '()])
(nonterminal-derivation symbol label (sequence->vector children)))
(define derivation
(case-lambda
[(value) (terminal-derivation value)]
[(symbol label . children) (make-nonterminal-derivation symbol label children)]))
(define a (terminal-symbol 'a))
(define b (terminal-symbol 'b))
(define S (nonterminal-symbol 'S))
(define as-then-bs-grammar
(make-grammar
#:rules
(list
(make-rule #:symbol (nonterminal-symbol-value S) #:substitution (list a S b) #:label 'recur)
(make-rule #:symbol (nonterminal-symbol-value S) #:substitution (list a b) #:label 'done))
#:start-symbol (nonterminal-symbol-value S)))
(define input
(list (token 'a 'a1) (token 'a 'a2) (token 'a 'a3) (token 'b 'b1) (token 'b 'b2) (token 'b 'b3)))
(define expected-parse-tree
(derivation 'S 'recur
(derivation 'a1)
(derivation 'S 'recur
(derivation 'a2)
(derivation 'S 'done (derivation 'a3) (derivation 'b1))
(derivation 'b2))
(derivation 'b3)))
;; Earley parser
(struct earley-state (rule substitution-position input-position)
#:transparent
#:property prop:custom-print-quotable 'never
#:methods gen:custom-write
[(define write-proc
(make-constructor-style-printer
(λ (_) 'earley-state)
(λ (this)
(define rule (earley-state-rule this))
(define substitution (context-free-production-rule-substitution rule))
(define pos (earley-state-substitution-position this))
(list (list (context-free-production-rule-nonterminal rule) '->)
(for/list ([sym (in-vector substitution 0 pos)])
(if (terminal-symbol? sym)
(terminal-symbol-value sym)
(nonterminal-symbol-value sym)))
(list ')
(for/list ([sym (in-vector substitution pos)])
(if (terminal-symbol? sym)
(terminal-symbol-value sym)
(nonterminal-symbol-value sym)))
(earley-state-input-position this)))))])
(define (initial-earley-states grammar)
(for/set ([rule (grammar-start-rules grammar)])
(earley-state rule 0 0)))
(define (earley-parse grammar token-sequence)
(define tokens (sequence->vector token-sequence))
(define token-count (vector-length tokens))
(define position-count (add1 token-count))
(define states (make-vector position-count (set)))
(vector-set! states 0 (initial-earley-states grammar))
(for ([k (in-range 0 position-count)])
;; Prediction and completion
(define/guard (process-states unprocessed processed)
(guard (set-empty? unprocessed) then
processed)
(define next (set-first unprocessed))
(define added-states
(guarded-block
(guard (completed-state? next) then
;; find all states in S(j) of the form (X → α • Y β, j) and add (X → α Y • β, j)
(define j (earley-state-input-position next))
(define completed (context-free-production-rule-nonterminal (earley-state-rule next)))
(define parent-states
(if (equal? j k)
(set-union unprocessed processed)
(vector-ref states j)))
(completer-states completed parent-states))
(define symbol (earley-state-next-symbol next))
(guard (nonterminal-symbol? symbol) else
(set))
(predictor-states grammar (nonterminal-symbol-value symbol) k)))
(define new-unprocessed (set-subtract (set-remove added-states next) processed))
(process-states (set-union (set-rest unprocessed) new-unprocessed) (set-add processed next)))
(define processed (process-states (vector-ref states k) (set)))
(vector-set! states k processed)
(unless (equal? k token-count)
(vector-set! states (add1 k) (scanner-states processed k (vector-ref tokens k)))))
states)
(define (completed-state? state)
(match-define (earley-state rule substitution-position _) state)
(equal? substitution-position
(vector-length (context-free-production-rule-substitution rule))))
(define/contract (earley-state-next-symbol state)
(-> (and/c earley-state? (not/c completed-state?)) grammar-symbol?)
(match-define (earley-state rule substitution-position _) state)
(vector-ref (context-free-production-rule-substitution rule) substitution-position))
(define (earley-state-advance-substitution state)
(match-define (earley-state rule substitution-position input-position) state)
(earley-state rule (add1 substitution-position) input-position))
(define (completer-states completed-nonterminal states)
(for/set ([s (in-set states)]
#:when (equal? (earley-state-next-symbol s) (nonterminal-symbol completed-nonterminal)))
(earley-state-advance-substitution s)))
(define (predictor-states grammar nonterminal k)
;; add (Y → • γ, k) for every production in the grammar with Y on the left-hand side
(for/set ([rule (in-vector (context-free-grammar-rules grammar))]
#:when (equal? (context-free-production-rule-nonterminal rule) nonterminal))
(earley-state rule 0 k)))
(define (scanner-states states k next-token)
(define type (token-type next-token))
(define expected (terminal-symbol type))
(for/set ([s (in-set states)]
#:when (not (completed-state? s))
#:when (equal? (earley-state-next-symbol s) expected))
(earley-state-advance-substitution s)))
(module+ test
(test-case "earley-parse integration test"
;; Grammar, input, and states taken from https://en.wikipedia.org/wiki/Earley_parser#Example
(define P-rule (make-rule #:symbol 'P #:label 0 #:substitution (list (nonterminal-symbol 'S))))
(define S-rule0
(make-rule
#:symbol 'S
#:label 0
#:substitution (list (nonterminal-symbol 'S) (terminal-symbol '+) (nonterminal-symbol 'M))))
(define S-rule1 (make-rule #:symbol 'S #:label 1 #:substitution (list (nonterminal-symbol 'M))))
(define M-rule0
(make-rule
#:symbol 'M
#:label 0
#:substitution (list (nonterminal-symbol 'M) (terminal-symbol '*) (nonterminal-symbol 'T))))
(define M-rule1 (make-rule #:symbol 'M #:label 1 #:substitution (list (nonterminal-symbol 'T))))
(define T-rule (make-rule #:symbol 'T #:label 0 #:substitution (list (terminal-symbol 'number))))
(define arithmetic-grammar
(make-grammar #:rules (list P-rule S-rule0 S-rule1 M-rule0 M-rule1 T-rule) #:start-symbol 'P))
(define input-tokens
(list (token 'number 2) (token '+ '+) (token 'number 3) (token '* '*) (token 'number 4)))
(define arithmetic-parse-table
(earley-parse arithmetic-grammar input-tokens))
(define expected-arithmetic-parse-table
(vector
(set
(earley-state P-rule 0 0)
(earley-state S-rule0 0 0)
(earley-state S-rule1 0 0)
(earley-state M-rule0 0 0)
(earley-state M-rule1 0 0)
(earley-state T-rule 0 0))
(set
(earley-state T-rule 1 0)
(earley-state M-rule0 1 0)
(earley-state M-rule1 1 0)
(earley-state S-rule0 1 0)
(earley-state S-rule1 1 0)
(earley-state P-rule 1 0))
(set
(earley-state S-rule0 2 0)
(earley-state M-rule0 0 2)
(earley-state M-rule1 0 2)
(earley-state T-rule 0 2))
(set
(earley-state T-rule 1 2)
(earley-state M-rule0 1 2)
(earley-state M-rule1 1 2)
(earley-state S-rule0 3 0)
(earley-state S-rule0 1 0)
(earley-state P-rule 1 0))
(set
(earley-state M-rule0 2 2)
(earley-state T-rule 0 4))
(set
(earley-state T-rule 1 4)
(earley-state M-rule0 3 2)
(earley-state M-rule0 1 2)
(earley-state S-rule0 3 0)
(earley-state S-rule0 1 0)
(earley-state P-rule 1 0))))
(check-equal? arithmetic-parse-table expected-arithmetic-parse-table)))
Loading…
Cancel
Save