You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
brag/parser/earley.rkt

298 lines
11 KiB
Racket

#lang racket/base
3 years ago
(require racket/contract/base)
(provide
(contract-out
[earley-parser (-> cf-grammar? parser?)]))
3 years ago
(require racket/contract
racket/match
racket/set
racket/stream
racket/struct
rebellion/collection/vector
rebellion/private/guarded-block
yaragg/base/derivation
3 years ago
yaragg/base/grammar
yaragg/base/semantic-action
yaragg/base/symbol
3 years ago
yaragg/base/token
yaragg/parser
(submod yaragg/parser private))
(module+ test
(require (submod "..")
rackunit))
3 years ago
;@----------------------------------------------------------------------------------------------------
3 years ago
(define (earley-parser grammar)
(make-parser #:deriver (λ (tokens) (earley-parse grammar tokens))))
;; The hash keys are sppf-keys and the values are a list of sppf-child-pairs
(struct sppf-forest (hash))
(define (make-sppf-forest)
(sppf-forest (make-hash)))
(define (sppf-forest-add-node! forest key)
(define h (sppf-forest-hash forest))
(unless (hash-has-key? h key)
(hash-set! h key '())))
(define (sppf-forest-add-child-pair! forest key #:left left-child #:right right-child)
(define pair (sppf-child-pair left-child right-child))
(hash-update! (sppf-forest-hash forest) key (λ (children) (cons pair children)) '()))
;; SPPF trees walk each rule down the left side, and each right child of the left spine corresponds to
;; the rule's components. See https://twitter.com/doitwithalambda/status/1510217894776348681 for a
;; diagram of the binarised parse tree.
(struct sppf-child-pair (left-child right-child) #:transparent)
(struct sppf-key () #:transparent)
(struct complete-sppf-key sppf-key (symbol input-start input-end) #:transparent)
(struct incomplete-sppf-key sppf-key (rule substitution-position input-start input-end)
#:transparent)
(define (sppf-key-input-end key)
(if (complete-sppf-key? key)
(complete-sppf-key-input-end key)
(incomplete-sppf-key-input-end key)))
(define (possible-children-lists forest key)
(define hash (sppf-forest-hash forest))
(let loop ([key key] [right-children '()])
(guarded-block
(guard key else
(stream right-children))
(match-define (list (sppf-child-pair left right) ...) (hash-ref hash key '()))
(apply stream-append
(for/list ([l (in-list left)]
[r (in-list right)])
(loop l (cons r right-children)))))))
(define (cartesian-stream streams)
(define (combine s1 s2)
(for*/stream ([x (in-stream s1)]
[y (in-stream s2)])
(cons x y)))
(foldr combine (stream '()) streams))
(define (sppf-forest-derivations forest key tokens)
(define hash (sppf-forest-hash forest))
(let loop ([key key])
(guarded-block
(guard (complete-sppf-key? key) then
(define tok (vector-ref tokens (complete-sppf-key-input-start key)))
(stream (terminal-derivation (token-value tok))))
(define action (cf-production-rule-action (incomplete-sppf-key-rule key)))
(define possible-children (possible-children-lists forest key))
(for*/stream ([children (in-stream possible-children)]
[processed-children (in-stream (cartesian-stream (map loop children)))])
(nonterminal-derivation action processed-children)))))
(struct earley-state (rule substitution-position input-position key)
#:transparent
#:property prop:custom-print-quotable 'never
#:methods gen:custom-write
[(define write-proc
(make-constructor-style-printer
(λ (_) 'earley-state)
(λ (this)
(define rule (earley-state-rule this))
(define substitution (cf-production-rule-substitution rule))
(define pos (earley-state-substitution-position this))
(append (list (cf-production-rule-nonterminal rule) '->)
(for/list ([sym (in-vector substitution 0 pos)])
(if (terminal-symbol? sym)
(terminal-symbol-value sym)
(nonterminal-symbol-value sym)))
(list ')
(for/list ([sym (in-vector substitution pos)])
(if (terminal-symbol? sym)
(terminal-symbol-value sym)
(nonterminal-symbol-value sym)))
(list (earley-state-input-position this) (earley-state-key this))))))])
(define (initial-earley-states grammar)
(for/set ([rule (cf-grammar-start-rules grammar)])
(earley-state rule 0 0 #false)))
(define (earley-state-represents-successful-parse? state grammar)
(and (zero? (earley-state-input-position state))
(equal? (cf-production-rule-nonterminal (earley-state-rule state))
(cf-grammar-start-symbol grammar))))
(define (earley-parse grammar token-sequence)
(define tokens (sequence->vector token-sequence))
(define token-count (vector-length tokens))
(define position-count (add1 token-count))
(define forest (make-sppf-forest))
(define states (make-vector position-count (set)))
(vector-set! states 0 (initial-earley-states grammar))
(for ([k (in-range 0 position-count)])
;; Prediction and completion
(define/guard (process-states unprocessed processed)
(guard (set-empty? unprocessed) then
processed)
(define next (set-first unprocessed))
(define added-states
(guarded-block
(guard (completed-state? next) then
;; find all states in S(j) of the form (X → α • Y β, j) and add (X → α Y • β, j)
(define j (earley-state-input-position next))
(define completed (cf-production-rule-nonterminal (earley-state-rule next)))
(define parent-states
(if (equal? j k)
(set-union unprocessed processed)
(vector-ref states j)))
(completer-states completed parent-states (earley-state-key next) #:forest forest))
(define symbol (earley-state-next-symbol next))
(guard (nonterminal-symbol? symbol) else
(set))
(predictor-states grammar (nonterminal-symbol-value symbol) k)))
(define new-unprocessed (set-subtract (set-remove added-states next) processed))
(process-states (set-union (set-rest unprocessed) new-unprocessed) (set-add processed next)))
(define processed (process-states (vector-ref states k) (set)))
(vector-set! states k processed)
(unless (equal? k token-count)
(define next-states (scanner-states processed k (vector-ref tokens k) #:forest forest))
(vector-set! states (add1 k) next-states)))
(define last-state-set (vector-ref states (sub1 position-count)))
(for/stream ([s (in-set last-state-set)]
#:when (earley-state-represents-successful-parse? s grammar)
[derivation (in-stream (sppf-forest-derivations forest (earley-state-key s) tokens))])
3 years ago
derivation))
(define (completed-state? state)
(match-define (earley-state rule substitution-position _ _) state)
(equal? substitution-position
(vector-length (cf-production-rule-substitution rule))))
(define/contract (earley-state-next-symbol state)
(-> (and/c earley-state? (not/c completed-state?)) grammar-symbol?)
(match-define (earley-state rule substitution-position _ _) state)
(vector-ref (cf-production-rule-substitution rule) substitution-position))
(define (earley-state-advance-substitution state #:key key)
(match-define (earley-state rule substitution-position input-position _) state)
(earley-state rule (add1 substitution-position) input-position key))
(define (completer-states completed-nonterminal states completed-key #:forest forest)
(define expected (nonterminal-symbol completed-nonterminal))
(for/set ([s (in-set states)]
#:when (equal? (earley-state-next-symbol s) expected))
(define rule (earley-state-rule s))
(define start (earley-state-input-position s))
(define end (sppf-key-input-end completed-key))
(define old-key (earley-state-key s))
(define new-key
(incomplete-sppf-key rule (add1 (earley-state-substitution-position s)) start end))
(sppf-forest-add-child-pair! forest new-key #:left old-key #:right completed-key)
(earley-state-advance-substitution s #:key new-key)))
(define (predictor-states grammar nonterminal k)
;; add (Y → • γ, k) for every production in the grammar with Y on the left-hand side
(for/set ([rule (in-vector (cf-grammar-rules grammar))]
#:when (equal? (cf-production-rule-nonterminal rule) nonterminal))
(earley-state rule 0 k #false)))
(define (scanner-states states k next-token #:forest forest)
(define expected (terminal-symbol (token-type next-token)))
(for/set ([s (in-set states)]
#:when (not (completed-state? s))
#:when (equal? (earley-state-next-symbol s) expected))
(define rule (earley-state-rule s))
(define start (earley-state-input-position s))
(define end (add1 k))
(define old-key (earley-state-key s))
(define new-key
(incomplete-sppf-key rule (add1 (earley-state-substitution-position s)) start end))
(define scanned-key (complete-sppf-key expected k end))
(sppf-forest-add-child-pair! forest new-key #:left old-key #:right scanned-key)
(earley-state-advance-substitution s #:key new-key)))
(module+ test
3 years ago
(test-case "earley-parser integration test"
;; Grammar and input taken from https://en.wikipedia.org/wiki/Earley_parser#Example
(define P-rule
(make-cf-production-rule
#:nonterminal 'P #:action (label-action 'P) #:substitution (list (nonterminal-symbol 'S))))
(define S-rule0
(make-cf-production-rule
#:nonterminal 'S
#:action (label-action 'S0)
#:substitution (list (nonterminal-symbol 'S) (terminal-symbol '+) (nonterminal-symbol 'M))))
(define S-rule1
(make-cf-production-rule
#:nonterminal 'S #:action (label-action 'S1) #:substitution (list (nonterminal-symbol 'M))))
(define M-rule0
(make-cf-production-rule
#:nonterminal 'M
#:action (label-action 'M0)
#:substitution (list (nonterminal-symbol 'M) (terminal-symbol '*) (nonterminal-symbol 'T))))
(define M-rule1
(make-cf-production-rule
#:nonterminal 'M #:action (label-action 'M1) #:substitution (list (nonterminal-symbol 'T))))
(define T-rule
(make-cf-production-rule
#:nonterminal 'T #:action (label-action 'T) #:substitution (list (terminal-symbol 'number))))
(define arithmetic-grammar
(make-cf-grammar
#:rules (list P-rule S-rule0 S-rule1 M-rule0 M-rule1 T-rule) #:start-symbol 'P))
(define parser (earley-parser arithmetic-grammar))
3 years ago
(test-case "datum parser"
(define input-tokens
(list
(token 'number 2) (token '+ 'plus) (token 'number 3) (token '* 'times) (token 'number 4)))
(define expected-arithmetic-parse-tree
'(P (S0 (S1 (M1 (T 2))) plus (M0 (M1 (T 3)) times (T 4)))))
3 years ago
(check-equal? (parse-datum parser input-tokens) expected-arithmetic-parse-tree))
3 years ago
(test-case "syntax parser"
(define input-tokens
(list
(syntax-token 'number 2 #:position 1 #:span 1)
(syntax-token '+ #:position 2 #:span 1)
(syntax-token 'number 3 #:position 3 #:span 1)
(syntax-token '* #:position 4 #:span 1)
(syntax-token 'number 4 #:position 5 #:span 1)))
(check-equal? (syntax->datum (parse-syntax parser input-tokens))
'(P (S0 (S1 (M1 (T 2))) + (M0 (M1 (T 3)) * (T 4))))))))