|
|
#lang racket/base
|
|
|
|
|
|
|
|
|
(require racket/contract
|
|
|
racket/match
|
|
|
racket/sequence
|
|
|
racket/set
|
|
|
racket/struct
|
|
|
rebellion/collection/vector
|
|
|
rebellion/private/guarded-block)
|
|
|
|
|
|
|
|
|
(module+ test
|
|
|
(require (submod "..")
|
|
|
rackunit))
|
|
|
|
|
|
|
|
|
;; Parsing takes a (Grammar T S L) and a sequence of (Token T V) and produces a set of
|
|
|
;; (Parser-Derivation V S L) (also called a "parse forest"). A grammar contains an immutable
|
|
|
;; vector of (Context-Free-Production-Rule T S L) and a start symbol of type S.
|
|
|
;; T: the terminals the grammar parses. Corresponds to the type field of the input tokens.
|
|
|
;; S: the nonterminals the grammar rules are defined in terms of. These show up in parse tree
|
|
|
;; branches.
|
|
|
;; L: the lables that grammar rules may have attached to them. These show up in parse tree
|
|
|
;; branches alongside nonterminals, and can be used to determine which production rule for a
|
|
|
;; particular nonterminal produced a derivation.
|
|
|
(struct context-free-grammar (rules start-symbol) #:transparent)
|
|
|
|
|
|
|
|
|
(define (grammar-start-rules grammar)
|
|
|
(define start (context-free-grammar-start-symbol grammar))
|
|
|
(for/set ([rule (in-vector (context-free-grammar-rules grammar))]
|
|
|
#:when (equal? (context-free-production-rule-nonterminal rule) start))
|
|
|
rule))
|
|
|
|
|
|
|
|
|
;; A (Token T V) is a tagged value. The grammar rules are defined in terms of the type tag,
|
|
|
;; whereas the value is what appears in leaf nodes of the resulting parse trees.
|
|
|
(struct token (type value) #:transparent)
|
|
|
|
|
|
|
|
|
;; A (Context-Free-Production-Rule T S L) contains a nonterminal symbol of type S, a label of type L,
|
|
|
;; and a substitution sequence of (Grammar-Symbol T S) values, stored in an immutable vector.
|
|
|
(struct context-free-production-rule (nonterminal label substitution) #:transparent)
|
|
|
|
|
|
|
|
|
;; A (Grammar-Symbol T S) is either a (Terminal-Symbol T) or a (Nonterminal-Symbol S)
|
|
|
(struct grammar-symbol () #:transparent)
|
|
|
(struct terminal-symbol grammar-symbol (value) #:transparent)
|
|
|
(struct nonterminal-symbol grammar-symbol (value) #:transparent)
|
|
|
|
|
|
|
|
|
(define (make-grammar #:rules rules #:start-symbol start)
|
|
|
(context-free-grammar (sequence->vector rules) start))
|
|
|
|
|
|
|
|
|
(define (make-rule #:symbol symbol #:substitution substitution #:label label)
|
|
|
(context-free-production-rule symbol label (sequence->vector substitution)))
|
|
|
|
|
|
|
|
|
;; A (Parser-Derivation V S L) is either a (Terminal-Derivation V) or a (Nonterminal-Derivation T S L)
|
|
|
(struct parser-derivation () #:transparent)
|
|
|
|
|
|
;; A (Terminal-Derivation V) represents a terminal that was matched by the grammar. It contains the
|
|
|
;; value of the (Token T V) that was matched.
|
|
|
(struct terminal-derivation parser-derivation (value) #:transparent)
|
|
|
|
|
|
;; A (Nonterminal-Derivation T S L) represents a nonterminal that was matched by the grammar. It
|
|
|
;; contains the nonterminal symbol of type T that was matched, the label of type L of the specific
|
|
|
;; rule that matched, and an immutable vector of subderivations
|
|
|
(struct nonterminal-derivation parser-derivation (symbol label children) #:transparent)
|
|
|
|
|
|
|
|
|
(define (make-nonterminal-derivation symbol label [children '()])
|
|
|
(nonterminal-derivation symbol label (sequence->vector children)))
|
|
|
|
|
|
|
|
|
(define derivation
|
|
|
(case-lambda
|
|
|
[(value) (terminal-derivation value)]
|
|
|
[(symbol label . children) (make-nonterminal-derivation symbol label children)]))
|
|
|
|
|
|
|
|
|
(define a (terminal-symbol 'a))
|
|
|
(define b (terminal-symbol 'b))
|
|
|
(define S (nonterminal-symbol 'S))
|
|
|
|
|
|
|
|
|
(define as-then-bs-grammar
|
|
|
(make-grammar
|
|
|
#:rules
|
|
|
(list
|
|
|
(make-rule #:symbol (nonterminal-symbol-value S) #:substitution (list a S b) #:label 'recur)
|
|
|
(make-rule #:symbol (nonterminal-symbol-value S) #:substitution (list a b) #:label 'done))
|
|
|
#:start-symbol (nonterminal-symbol-value S)))
|
|
|
|
|
|
|
|
|
(define input
|
|
|
(list (token 'a 'a1) (token 'a 'a2) (token 'a 'a3) (token 'b 'b1) (token 'b 'b2) (token 'b 'b3)))
|
|
|
|
|
|
|
|
|
(define expected-parse-tree
|
|
|
(derivation 'S 'recur
|
|
|
(derivation 'a1)
|
|
|
(derivation 'S 'recur
|
|
|
(derivation 'a2)
|
|
|
(derivation 'S 'done (derivation 'a3) (derivation 'b1))
|
|
|
(derivation 'b2))
|
|
|
(derivation 'b3)))
|
|
|
|
|
|
|
|
|
;; Earley parser
|
|
|
|
|
|
|
|
|
(struct earley-state (rule substitution-position input-position)
|
|
|
#:transparent
|
|
|
#:property prop:custom-print-quotable 'never
|
|
|
#:methods gen:custom-write
|
|
|
[(define write-proc
|
|
|
(make-constructor-style-printer
|
|
|
(λ (_) 'earley-state)
|
|
|
(λ (this)
|
|
|
(define rule (earley-state-rule this))
|
|
|
(define substitution (context-free-production-rule-substitution rule))
|
|
|
(define pos (earley-state-substitution-position this))
|
|
|
(list (list (context-free-production-rule-nonterminal rule) '->)
|
|
|
(for/list ([sym (in-vector substitution 0 pos)])
|
|
|
(if (terminal-symbol? sym)
|
|
|
(terminal-symbol-value sym)
|
|
|
(nonterminal-symbol-value sym)))
|
|
|
(list '•)
|
|
|
(for/list ([sym (in-vector substitution pos)])
|
|
|
(if (terminal-symbol? sym)
|
|
|
(terminal-symbol-value sym)
|
|
|
(nonterminal-symbol-value sym)))
|
|
|
(earley-state-input-position this)))))])
|
|
|
|
|
|
|
|
|
(define (initial-earley-states grammar)
|
|
|
(for/set ([rule (grammar-start-rules grammar)])
|
|
|
(earley-state rule 0 0)))
|
|
|
|
|
|
|
|
|
(define (earley-parse grammar token-sequence)
|
|
|
(define tokens (sequence->vector token-sequence))
|
|
|
(define token-count (vector-length tokens))
|
|
|
(define position-count (add1 token-count))
|
|
|
(define states (make-vector position-count (set)))
|
|
|
(vector-set! states 0 (initial-earley-states grammar))
|
|
|
(for ([k (in-range 0 position-count)])
|
|
|
|
|
|
;; Prediction and completion
|
|
|
(define/guard (process-states unprocessed processed)
|
|
|
(guard (set-empty? unprocessed) then
|
|
|
processed)
|
|
|
(define next (set-first unprocessed))
|
|
|
|
|
|
(define added-states
|
|
|
(guarded-block
|
|
|
(guard (completed-state? next) then
|
|
|
;; find all states in S(j) of the form (X → α • Y β, j) and add (X → α Y • β, j)
|
|
|
(define j (earley-state-input-position next))
|
|
|
(define completed (context-free-production-rule-nonterminal (earley-state-rule next)))
|
|
|
(define parent-states
|
|
|
(if (equal? j k)
|
|
|
(set-union unprocessed processed)
|
|
|
(vector-ref states j)))
|
|
|
(completer-states completed parent-states))
|
|
|
(define symbol (earley-state-next-symbol next))
|
|
|
(guard (nonterminal-symbol? symbol) else
|
|
|
(set))
|
|
|
(predictor-states grammar (nonterminal-symbol-value symbol) k)))
|
|
|
|
|
|
(define new-unprocessed (set-subtract (set-remove added-states next) processed))
|
|
|
(process-states (set-union (set-rest unprocessed) new-unprocessed) (set-add processed next)))
|
|
|
|
|
|
(define processed (process-states (vector-ref states k) (set)))
|
|
|
(vector-set! states k processed)
|
|
|
(unless (equal? k token-count)
|
|
|
(vector-set! states (add1 k) (scanner-states processed k (vector-ref tokens k)))))
|
|
|
states)
|
|
|
|
|
|
|
|
|
(define (completed-state? state)
|
|
|
(match-define (earley-state rule substitution-position _) state)
|
|
|
(equal? substitution-position
|
|
|
(vector-length (context-free-production-rule-substitution rule))))
|
|
|
|
|
|
|
|
|
(define/contract (earley-state-next-symbol state)
|
|
|
(-> (and/c earley-state? (not/c completed-state?)) grammar-symbol?)
|
|
|
(match-define (earley-state rule substitution-position _) state)
|
|
|
(vector-ref (context-free-production-rule-substitution rule) substitution-position))
|
|
|
|
|
|
|
|
|
(define (earley-state-advance-substitution state)
|
|
|
(match-define (earley-state rule substitution-position input-position) state)
|
|
|
(earley-state rule (add1 substitution-position) input-position))
|
|
|
|
|
|
|
|
|
(define (completer-states completed-nonterminal states)
|
|
|
(for/set ([s (in-set states)]
|
|
|
#:when (equal? (earley-state-next-symbol s) (nonterminal-symbol completed-nonterminal)))
|
|
|
(earley-state-advance-substitution s)))
|
|
|
|
|
|
|
|
|
(define (predictor-states grammar nonterminal k)
|
|
|
;; add (Y → • γ, k) for every production in the grammar with Y on the left-hand side
|
|
|
(for/set ([rule (in-vector (context-free-grammar-rules grammar))]
|
|
|
#:when (equal? (context-free-production-rule-nonterminal rule) nonterminal))
|
|
|
(earley-state rule 0 k)))
|
|
|
|
|
|
|
|
|
(define (scanner-states states k next-token)
|
|
|
(define type (token-type next-token))
|
|
|
(define expected (terminal-symbol type))
|
|
|
(for/set ([s (in-set states)]
|
|
|
#:when (not (completed-state? s))
|
|
|
#:when (equal? (earley-state-next-symbol s) expected))
|
|
|
(earley-state-advance-substitution s)))
|
|
|
|
|
|
|
|
|
|
|
|
(module+ test
|
|
|
(test-case "earley-parse integration test"
|
|
|
|
|
|
;; Grammar, input, and states taken from https://en.wikipedia.org/wiki/Earley_parser#Example
|
|
|
(define P-rule (make-rule #:symbol 'P #:label 0 #:substitution (list (nonterminal-symbol 'S))))
|
|
|
(define S-rule0
|
|
|
(make-rule
|
|
|
#:symbol 'S
|
|
|
#:label 0
|
|
|
#:substitution (list (nonterminal-symbol 'S) (terminal-symbol '+) (nonterminal-symbol 'M))))
|
|
|
(define S-rule1 (make-rule #:symbol 'S #:label 1 #:substitution (list (nonterminal-symbol 'M))))
|
|
|
(define M-rule0
|
|
|
(make-rule
|
|
|
#:symbol 'M
|
|
|
#:label 0
|
|
|
#:substitution (list (nonterminal-symbol 'M) (terminal-symbol '*) (nonterminal-symbol 'T))))
|
|
|
(define M-rule1 (make-rule #:symbol 'M #:label 1 #:substitution (list (nonterminal-symbol 'T))))
|
|
|
(define T-rule (make-rule #:symbol 'T #:label 0 #:substitution (list (terminal-symbol 'number))))
|
|
|
(define arithmetic-grammar
|
|
|
(make-grammar #:rules (list P-rule S-rule0 S-rule1 M-rule0 M-rule1 T-rule) #:start-symbol 'P))
|
|
|
(define input-tokens
|
|
|
(list (token 'number 2) (token '+ '+) (token 'number 3) (token '* '*) (token 'number 4)))
|
|
|
|
|
|
(define arithmetic-parse-table
|
|
|
(earley-parse arithmetic-grammar input-tokens))
|
|
|
|
|
|
(define expected-arithmetic-parse-table
|
|
|
(vector
|
|
|
(set
|
|
|
(earley-state P-rule 0 0)
|
|
|
(earley-state S-rule0 0 0)
|
|
|
(earley-state S-rule1 0 0)
|
|
|
(earley-state M-rule0 0 0)
|
|
|
(earley-state M-rule1 0 0)
|
|
|
(earley-state T-rule 0 0))
|
|
|
(set
|
|
|
(earley-state T-rule 1 0)
|
|
|
(earley-state M-rule0 1 0)
|
|
|
(earley-state M-rule1 1 0)
|
|
|
(earley-state S-rule0 1 0)
|
|
|
(earley-state S-rule1 1 0)
|
|
|
(earley-state P-rule 1 0))
|
|
|
(set
|
|
|
(earley-state S-rule0 2 0)
|
|
|
(earley-state M-rule0 0 2)
|
|
|
(earley-state M-rule1 0 2)
|
|
|
(earley-state T-rule 0 2))
|
|
|
(set
|
|
|
(earley-state T-rule 1 2)
|
|
|
(earley-state M-rule0 1 2)
|
|
|
(earley-state M-rule1 1 2)
|
|
|
(earley-state S-rule0 3 0)
|
|
|
(earley-state S-rule0 1 0)
|
|
|
(earley-state P-rule 1 0))
|
|
|
(set
|
|
|
(earley-state M-rule0 2 2)
|
|
|
(earley-state T-rule 0 4))
|
|
|
(set
|
|
|
(earley-state T-rule 1 4)
|
|
|
(earley-state M-rule0 3 2)
|
|
|
(earley-state M-rule0 1 2)
|
|
|
(earley-state S-rule0 3 0)
|
|
|
(earley-state S-rule0 1 0)
|
|
|
(earley-state P-rule 1 0))))
|
|
|
(check-equal? arithmetic-parse-table expected-arithmetic-parse-table)))
|