You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
brag/parser/earley.rkt

296 lines
11 KiB
Racket

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

#lang racket/base
(require racket/contract/base)
(provide
(contract-out
[earley-parser (-> cf-grammar? parser?)]))
(require racket/contract
racket/match
racket/set
racket/stream
racket/struct
rebellion/collection/vector
rebellion/private/guarded-block
yaragg/base/derivation
yaragg/base/grammar
yaragg/base/token
yaragg/parser
(submod yaragg/parser private))
(module+ test
(require (submod "..")
rackunit))
;@----------------------------------------------------------------------------------------------------
(define (earley-parser grammar)
(make-parser #:deriver (λ (tokens) (earley-parse grammar tokens))))
;; The hash keys are sppf-keys and the values are a list of sppf-child-pairs
(struct sppf-forest (hash))
(define (make-sppf-forest)
(sppf-forest (make-hash)))
(define (sppf-forest-add-node! forest key)
(define h (sppf-forest-hash forest))
(unless (hash-has-key? h key)
(hash-set! h key '())))
(define (sppf-forest-add-child-pair! forest key #:left left-child #:right right-child)
(define pair (sppf-child-pair left-child right-child))
(hash-update! (sppf-forest-hash forest) key (λ (children) (cons pair children)) '()))
;; SPPF trees walk each rule down the left side, and each right child of the left spine corresponds to
;; the rule's components. See https://twitter.com/doitwithalambda/status/1510217894776348681 for a
;; diagram of the binarised parse tree.
(struct sppf-child-pair (left-child right-child) #:transparent)
(struct sppf-key () #:transparent)
(struct complete-sppf-key sppf-key (symbol input-start input-end) #:transparent)
(struct incomplete-sppf-key sppf-key (rule substitution-position input-start input-end)
#:transparent)
(define (sppf-key-input-end key)
(if (complete-sppf-key? key)
(complete-sppf-key-input-end key)
(incomplete-sppf-key-input-end key)))
(define (possible-children-lists forest key)
(define hash (sppf-forest-hash forest))
(let loop ([key key] [right-children '()])
(guarded-block
(guard key else
(stream right-children))
(match-define (list (sppf-child-pair left right) ...) (hash-ref hash key '()))
(apply stream-append
(for/list ([l (in-list left)]
[r (in-list right)])
(loop l (cons r right-children)))))))
(define (cartesian-stream streams)
(define (combine s1 s2)
(for*/stream ([x (in-stream s1)]
[y (in-stream s2)])
(cons x y)))
(foldr combine (stream '()) streams))
(define (sppf-forest-derivations forest key tokens)
(define hash (sppf-forest-hash forest))
(let loop ([key key])
(guarded-block
(guard (complete-sppf-key? key) then
(define tok (vector-ref tokens (complete-sppf-key-input-start key)))
(stream (terminal-derivation (token-value tok))))
(define action (cf-production-rule-action (incomplete-sppf-key-rule key)))
(define possible-children (possible-children-lists forest key))
(for*/stream ([children (in-stream possible-children)]
[processed-children (in-stream (cartesian-stream (map loop children)))])
(nonterminal-derivation action processed-children)))))
(struct earley-state (rule substitution-position input-position key)
#:transparent
#:property prop:custom-print-quotable 'never
#:methods gen:custom-write
[(define write-proc
(make-constructor-style-printer
(λ (_) 'earley-state)
(λ (this)
(define rule (earley-state-rule this))
(define substitution (cf-production-rule-substitution rule))
(define pos (earley-state-substitution-position this))
(append (list (cf-production-rule-nonterminal rule) '->)
(for/list ([sym (in-vector substitution 0 pos)])
(if (terminal-symbol? sym)
(terminal-symbol-value sym)
(nonterminal-symbol-value sym)))
(list ')
(for/list ([sym (in-vector substitution pos)])
(if (terminal-symbol? sym)
(terminal-symbol-value sym)
(nonterminal-symbol-value sym)))
(list (earley-state-input-position this) (earley-state-key this))))))])
(define (initial-earley-states grammar)
(for/set ([rule (cf-grammar-start-rules grammar)])
(earley-state rule 0 0 #false)))
(define (earley-state-represents-successful-parse? state grammar)
(and (zero? (earley-state-input-position state))
(equal? (cf-production-rule-nonterminal (earley-state-rule state))
(cf-grammar-start-symbol grammar))))
(define (earley-parse grammar token-sequence)
(define tokens (sequence->vector token-sequence))
(define token-count (vector-length tokens))
(define position-count (add1 token-count))
(define forest (make-sppf-forest))
(define states (make-vector position-count (set)))
(vector-set! states 0 (initial-earley-states grammar))
(for ([k (in-range 0 position-count)])
;; Prediction and completion
(define/guard (process-states unprocessed processed)
(guard (set-empty? unprocessed) then
processed)
(define next (set-first unprocessed))
(define added-states
(guarded-block
(guard (completed-state? next) then
;; find all states in S(j) of the form (X → α • Y β, j) and add (X → α Y • β, j)
(define j (earley-state-input-position next))
(define completed (cf-production-rule-nonterminal (earley-state-rule next)))
(define parent-states
(if (equal? j k)
(set-union unprocessed processed)
(vector-ref states j)))
(completer-states completed parent-states (earley-state-key next) #:forest forest))
(define symbol (earley-state-next-symbol next))
(guard (nonterminal-symbol? symbol) else
(set))
(predictor-states grammar (nonterminal-symbol-value symbol) k)))
(define new-unprocessed (set-subtract (set-remove added-states next) processed))
(process-states (set-union (set-rest unprocessed) new-unprocessed) (set-add processed next)))
(define processed (process-states (vector-ref states k) (set)))
(vector-set! states k processed)
(unless (equal? k token-count)
(define next-states (scanner-states processed k (vector-ref tokens k) #:forest forest))
(vector-set! states (add1 k) next-states)))
(define last-state-set (vector-ref states (sub1 position-count)))
(for/stream ([s (in-set last-state-set)]
#:when (earley-state-represents-successful-parse? s grammar)
[derivation (in-stream (sppf-forest-derivations forest (earley-state-key s) tokens))])
derivation))
(define (completed-state? state)
(match-define (earley-state rule substitution-position _ _) state)
(equal? substitution-position
(vector-length (cf-production-rule-substitution rule))))
(define/contract (earley-state-next-symbol state)
(-> (and/c earley-state? (not/c completed-state?)) grammar-symbol?)
(match-define (earley-state rule substitution-position _ _) state)
(vector-ref (cf-production-rule-substitution rule) substitution-position))
(define (earley-state-advance-substitution state #:key key)
(match-define (earley-state rule substitution-position input-position _) state)
(earley-state rule (add1 substitution-position) input-position key))
(define (completer-states completed-nonterminal states completed-key #:forest forest)
(define expected (nonterminal-symbol completed-nonterminal))
(for/set ([s (in-set states)]
#:when (equal? (earley-state-next-symbol s) expected))
(define rule (earley-state-rule s))
(define start (earley-state-input-position s))
(define end (sppf-key-input-end completed-key))
(define old-key (earley-state-key s))
(define new-key
(incomplete-sppf-key rule (add1 (earley-state-substitution-position s)) start end))
(sppf-forest-add-child-pair! forest new-key #:left old-key #:right completed-key)
(earley-state-advance-substitution s #:key new-key)))
(define (predictor-states grammar nonterminal k)
;; add (Y → • γ, k) for every production in the grammar with Y on the left-hand side
(for/set ([rule (in-vector (cf-grammar-rules grammar))]
#:when (equal? (cf-production-rule-nonterminal rule) nonterminal))
(earley-state rule 0 k #false)))
(define (scanner-states states k next-token #:forest forest)
(define expected (terminal-symbol (token-type next-token)))
(for/set ([s (in-set states)]
#:when (not (completed-state? s))
#:when (equal? (earley-state-next-symbol s) expected))
(define rule (earley-state-rule s))
(define start (earley-state-input-position s))
(define end (add1 k))
(define old-key (earley-state-key s))
(define new-key
(incomplete-sppf-key rule (add1 (earley-state-substitution-position s)) start end))
(define scanned-key (complete-sppf-key expected k end))
(sppf-forest-add-child-pair! forest new-key #:left old-key #:right scanned-key)
(earley-state-advance-substitution s #:key new-key)))
(module+ test
(test-case "earley-parser integration test"
;; Grammar and input taken from https://en.wikipedia.org/wiki/Earley_parser#Example
(define P-rule
(make-cf-production-rule
#:nonterminal 'P #:action (label-action 'P) #:substitution (list (nonterminal-symbol 'S))))
(define S-rule0
(make-cf-production-rule
#:nonterminal 'S
#:action (label-action 'S0)
#:substitution (list (nonterminal-symbol 'S) (terminal-symbol '+) (nonterminal-symbol 'M))))
(define S-rule1
(make-cf-production-rule
#:nonterminal 'S #:action (label-action 'S1) #:substitution (list (nonterminal-symbol 'M))))
(define M-rule0
(make-cf-production-rule
#:nonterminal 'M
#:action (label-action 'M0)
#:substitution (list (nonterminal-symbol 'M) (terminal-symbol '*) (nonterminal-symbol 'T))))
(define M-rule1
(make-cf-production-rule
#:nonterminal 'M #:action (label-action 'M1) #:substitution (list (nonterminal-symbol 'T))))
(define T-rule
(make-cf-production-rule
#:nonterminal 'T #:action (label-action 'T) #:substitution (list (terminal-symbol 'number))))
(define arithmetic-grammar
(make-cf-grammar
#:rules (list P-rule S-rule0 S-rule1 M-rule0 M-rule1 T-rule) #:start-symbol 'P))
(define parser (earley-parser arithmetic-grammar))
(test-case "datum parser"
(define input-tokens
(list
(token 'number 2) (token '+ 'plus) (token 'number 3) (token '* 'times) (token 'number 4)))
(define expected-arithmetic-parse-tree
'(P (S0 (S1 (M1 (T 2))) plus (M0 (M1 (T 3)) times (T 4)))))
(check-equal? (parse-datum parser input-tokens) expected-arithmetic-parse-tree))
(test-case "syntax parser"
(define input-tokens
(list
(syntax-token 'number 2 #:position 1 #:span 1)
(syntax-token '+ #:position 2 #:span 1)
(syntax-token 'number 3 #:position 3 #:span 1)
(syntax-token '* #:position 4 #:span 1)
(syntax-token 'number 4 #:position 5 #:span 1)))
(check-equal? (syntax->datum (parse-syntax parser input-tokens))
'(P (S0 (S1 (M1 (T 2))) + (M0 (M1 (T 3)) * (T 4))))))))