Add a parser API

remotes/jackfirth/master
Jack Firth 2 years ago
parent 605d943d50
commit c65391f703

@ -0,0 +1,67 @@
#lang racket/base
(require racket/contract/base)
(provide
(struct-out terminal-symbol)
(struct-out nonterminal-symbol)
(struct-out context-free-grammar)
(struct-out context-free-production-rule)
(contract-out
[grammar-symbol? predicate/c]
[grammar-start-rules
(-> context-free-grammar? (set/c context-free-production-rule? #:kind 'immutable))]
[make-grammar
(-> #:rules (sequence/c context-free-production-rule?) #:start-symbol any/c context-free-grammar?)]
[make-rule
(-> #:symbol any/c #:substitution (sequence/c grammar-symbol?) #:label any/c
context-free-production-rule?)]))
(require racket/sequence
racket/set
rebellion/collection/vector)
;@----------------------------------------------------------------------------------------------------
;; Parsing takes a (Grammar T S L) and a sequence of (Token T V) and produces a set of
;; (Parser-Derivation V L) (also called a "parse forest"). A grammar contains an immutable
;; vector of (Context-Free-Production-Rule T S L) and a start symbol of type S.
;; T: the terminals the grammar parses. Corresponds to the type field of the input tokens.
;; S: the nonterminals the grammar rules are defined in terms of.
;; L: the labels that grammar rules may have attached to them. These show up in parse tree
;; branches, and can be used to determine which production rule produced a derivation.
(struct context-free-grammar (rules start-symbol) #:transparent)
(define (grammar-start-rules grammar)
(define start (context-free-grammar-start-symbol grammar))
(for/set ([rule (in-vector (context-free-grammar-rules grammar))]
#:when (equal? (context-free-production-rule-nonterminal rule) start))
rule))
;; A (Context-Free-Production-Rule T S L) contains a nonterminal symbol of type S, a label of type L,
;; and a substitution sequence of (Grammar-Symbol T S) values, stored in an immutable vector.
(struct context-free-production-rule (nonterminal label substitution) #:transparent)
;; A (Grammar-Symbol T S) is either a (Terminal-Symbol T) or a (Nonterminal-Symbol S)
(define (grammar-symbol? v)
(or (terminal-symbol? v) (nonterminal-symbol? v)))
(struct terminal-symbol (value) #:transparent)
(struct nonterminal-symbol (value) #:transparent)
(define (make-grammar #:rules rules #:start-symbol start)
(context-free-grammar (sequence->vector rules) start))
(define (make-rule #:symbol symbol #:substitution substitution #:label label)
(context-free-production-rule symbol label (sequence->vector substitution)))

@ -0,0 +1,79 @@
#lang racket/base
(require racket/contract/base)
(provide
(contract-out
[parser? predicate/c]
[parse-datum (-> parser? (sequence/c token?) parser-derivation?)]
[parse-syntax (-> parser? (sequence/c syntax-token?) syntax?)]
[parse-ambiguous-datum (-> parser? (sequence/c token?) (set/c parser-derivation?))]
[parse-ambiguous-syntax (-> parser? (sequence/c syntax-token?) (set/c syntax?))]))
(module+ private
(provide
(contract-out
[make-parser
(-> #:datum-function (-> (sequence/c token?) (stream/c parser-derivation?))
#:syntax-function (-> (sequence/c syntax-token?) (stream/c syntax?))
parser?)])))
(require racket/sequence
racket/set
racket/stream
yaragg/base/token
yaragg/base/derivation)
;@----------------------------------------------------------------------------------------------------
(struct parser (datum-function syntax-function))
(define (make-parser #:datum-function datum-function #:syntax-function syntax-function)
(parser datum-function syntax-function))
(define (parse-ambiguous-syntax p tokens)
(for/set ([stx (in-stream ((parser-syntax-function p) tokens))])
stx))
(define (parse-ambiguous-datum p tokens)
(for/set ([derivation (in-stream ((parser-datum-function p) tokens))])
derivation))
(define (parse-syntax p tokens)
(define stx-stream ((parser-syntax-function p) tokens))
(when (stream-empty? stx-stream)
(raise-arguments-error 'parse-syntax "no parse trees produced" "parser" p "tokens" tokens))
(define stx (stream-first stx-stream))
(unless (stream-empty? (stream-rest stx-stream))
(raise-arguments-error 'parse-syntax
"ambiguous parse, multiple parse trees produced"
"parser" p
"tokens" tokens
"first parse tree" stx
"second parse tree" (stream-first (stream-rest stx-stream))))
stx)
(define (parse-datum p tokens)
(define derivation-stream ((parser-datum-function p) tokens))
(when (stream-empty? derivation-stream)
(raise-arguments-error 'parse-datum "no parse trees produced" "parser" p "tokens" tokens))
(define derivation (stream-first derivation-stream))
(unless (stream-empty? (stream-rest derivation-stream))
(raise-arguments-error 'parse-datum
"ambiguous parse, multiple parse trees produced"
"parser" p
"tokens" tokens
"first parse tree" derivation
"second parse tree" (stream-first (stream-rest derivation-stream))))
derivation)

@ -1,6 +1,14 @@
#lang racket/base #lang racket/base
(require racket/contract/base)
(provide
(contract-out
[earley-parser (-> context-free-grammar? parser?)]))
(require racket/contract (require racket/contract
racket/match racket/match
racket/set racket/set
@ -9,7 +17,10 @@
rebellion/collection/vector rebellion/collection/vector
rebellion/private/guarded-block rebellion/private/guarded-block
yaragg/base/derivation yaragg/base/derivation
yaragg/base/token) yaragg/base/grammar
yaragg/base/token
yaragg/parser
(submod yaragg/parser private))
(module+ test (module+ test
@ -20,43 +31,9 @@
;@---------------------------------------------------------------------------------------------------- ;@----------------------------------------------------------------------------------------------------
;; Parsing takes a (Grammar T S L) and a sequence of (Token T V) and produces a set of (define (earley-parser grammar)
;; (Parser-Derivation V L) (also called a "parse forest"). A grammar contains an immutable (make-parser #:datum-function (λ (tokens) (earley-parse-datum grammar tokens))
;; vector of (Context-Free-Production-Rule T S L) and a start symbol of type S. #:syntax-function (λ (tokens) (earley-parse-syntax grammar tokens))))
;; T: the terminals the grammar parses. Corresponds to the type field of the input tokens.
;; S: the nonterminals the grammar rules are defined in terms of.
;; L: the labels that grammar rules may have attached to them. These show up in parse tree
;; branches, and can be used to determine which production rule produced a derivation.
(struct context-free-grammar (rules start-symbol) #:transparent)
(define (grammar-start-rules grammar)
(define start (context-free-grammar-start-symbol grammar))
(for/set ([rule (in-vector (context-free-grammar-rules grammar))]
#:when (equal? (context-free-production-rule-nonterminal rule) start))
rule))
;; A (Context-Free-Production-Rule T S L) contains a nonterminal symbol of type S, a label of type L,
;; and a substitution sequence of (Grammar-Symbol T S) values, stored in an immutable vector.
(struct context-free-production-rule (nonterminal label substitution) #:transparent)
;; A (Grammar-Symbol T S) is either a (Terminal-Symbol T) or a (Nonterminal-Symbol S)
(struct grammar-symbol () #:transparent)
(struct terminal-symbol grammar-symbol (value) #:transparent)
(struct nonterminal-symbol grammar-symbol (value) #:transparent)
(define (make-grammar #:rules rules #:start-symbol start)
(context-free-grammar (sequence->vector rules) start))
(define (make-rule #:symbol symbol #:substitution substitution #:label label)
(context-free-production-rule symbol label (sequence->vector substitution)))
;; Earley parser
;; The hash keys are sppf-labels and the values are a list of sppf-child-pairs ;; The hash keys are sppf-labels and the values are a list of sppf-child-pairs
@ -167,7 +144,7 @@
(context-free-grammar-start-symbol grammar)))) (context-free-grammar-start-symbol grammar))))
(define (earley-parse grammar token-sequence) (define (earley-parse-datum grammar token-sequence)
(define tokens (sequence->vector token-sequence)) (define tokens (sequence->vector token-sequence))
(define token-count (vector-length tokens)) (define token-count (vector-length tokens))
(define position-count (add1 token-count)) (define position-count (add1 token-count))
@ -268,96 +245,100 @@
(earley-state-advance-substitution s #:key new-key))) (earley-state-advance-substitution s #:key new-key)))
(define (grammar-parse-to-syntax grammar token-sequence) (define (earley-parse-syntax grammar token-sequence)
(define tokens (define tokens
(for/vector ([t token-sequence]) (for/vector ([t token-sequence])
(token (syntax-token-type t) t))) (token (syntax-token-type t) t)))
(for/set ([derivation (in-set (earley-parse grammar tokens))]) (for/set ([derivation (in-set (earley-parse-datum grammar tokens))])
(parser-derivation->syntax derivation))) (parser-derivation->syntax derivation)))
(module+ test (module+ test
(test-case "earley-parse integration test" (test-case "earley-parser integration test"
;; Grammar, input, and states taken from https://en.wikipedia.org/wiki/Earley_parser#Example ;; Grammar and input taken from https://en.wikipedia.org/wiki/Earley_parser#Example
(define P-rule (make-rule #:symbol 'P #:label 'P #:substitution (list (nonterminal-symbol 'S))))
(define S-rule0 (test-case "datum parser"
(make-rule (define P-rule (make-rule #:symbol 'P #:label 'P #:substitution (list (nonterminal-symbol 'S))))
#:symbol 'S (define S-rule0
#:label 'S0 (make-rule
#:substitution (list (nonterminal-symbol 'S) (terminal-symbol '+) (nonterminal-symbol 'M)))) #:symbol 'S
(define S-rule1 (make-rule #:symbol 'S #:label 'S1 #:substitution (list (nonterminal-symbol 'M)))) #:label 'S0
(define M-rule0 #:substitution (list (nonterminal-symbol 'S) (terminal-symbol '+) (nonterminal-symbol 'M))))
(make-rule (define S-rule1
#:symbol 'M (make-rule #:symbol 'S #:label 'S1 #:substitution (list (nonterminal-symbol 'M))))
#:label 'M0 (define M-rule0
#:substitution (list (nonterminal-symbol 'M) (terminal-symbol '*) (nonterminal-symbol 'T)))) (make-rule
(define M-rule1 (make-rule #:symbol 'M #:label 'M1 #:substitution (list (nonterminal-symbol 'T)))) #:symbol 'M
(define T-rule (make-rule #:symbol 'T #:label 'T #:substitution (list (terminal-symbol 'number)))) #:label 'M0
(define arithmetic-grammar #:substitution (list (nonterminal-symbol 'M) (terminal-symbol '*) (nonterminal-symbol 'T))))
(make-grammar #:rules (list P-rule S-rule0 S-rule1 M-rule0 M-rule1 T-rule) #:start-symbol 'P)) (define M-rule1
(define input-tokens (make-rule #:symbol 'M #:label 'M1 #:substitution (list (nonterminal-symbol 'T))))
(list (token 'number 2) (token '+ 'plus) (token 'number 3) (token '* 'times) (token 'number 4))) (define T-rule
(make-rule #:symbol 'T #:label 'T #:substitution (list (terminal-symbol 'number))))
(define arithmetic-parse-forest (define arithmetic-grammar
(earley-parse arithmetic-grammar input-tokens)) (make-grammar #:rules (list P-rule S-rule0 S-rule1 M-rule0 M-rule1 T-rule) #:start-symbol 'P))
(define input-tokens
(define expected-arithmetic-parse-tree (list
(parser-derivation (token 'number 2) (token '+ 'plus) (token 'number 3) (token '* 'times) (token 'number 4)))
'P (define parser (earley-parser arithmetic-grammar))
(parser-derivation (define expected-arithmetic-parse-tree
'S0
(parser-derivation 'S1 (parser-derivation 'M1 (parser-derivation 'T (parser-derivation 2))))
(parser-derivation 'plus)
(parser-derivation (parser-derivation
'M0 'P
(parser-derivation 'M1 (parser-derivation 'T (parser-derivation 3))) (parser-derivation
(parser-derivation 'times) 'S0
(parser-derivation 'T (parser-derivation 4)))))) (parser-derivation 'S1 (parser-derivation 'M1 (parser-derivation 'T (parser-derivation 2))))
(parser-derivation 'plus)
(parser-derivation
'M0
(parser-derivation 'M1 (parser-derivation 'T (parser-derivation 3)))
(parser-derivation 'times)
(parser-derivation 'T (parser-derivation 4))))))
(check-equal? arithmetic-parse-forest (set expected-arithmetic-parse-tree)))) (check-equal? (parse-datum parser input-tokens) expected-arithmetic-parse-tree))
;; Grammar, input, and states taken from https://en.wikipedia.org/wiki/Earley_parser#Example
(define P-rule
(make-rule #:symbol 'P #:label (syntax-label 'P) #:substitution (list (nonterminal-symbol 'S))))
(define S-rule0
(make-rule
#:symbol 'S
#:label (syntax-label 'S0)
#:substitution (list (nonterminal-symbol 'S) (terminal-symbol '+) (nonterminal-symbol 'M))))
(define S-rule1 (test-case "syntax parser"
(make-rule #:symbol 'S #:label (syntax-label 'S1) #:substitution (list (nonterminal-symbol 'M)))) (define P-rule
(make-rule
#:symbol 'P #:label (syntax-label 'P) #:substitution (list (nonterminal-symbol 'S))))
(define M-rule0 (define S-rule0
(make-rule (make-rule
#:symbol 'M #:symbol 'S
#:label (syntax-label 'M0) #:label (syntax-label 'S0)
#:substitution (list (nonterminal-symbol 'M) (terminal-symbol '*) (nonterminal-symbol 'T)))) #:substitution (list (nonterminal-symbol 'S) (terminal-symbol '+) (nonterminal-symbol 'M))))
(define M-rule1 (define S-rule1
(make-rule #:symbol 'M #:label (syntax-label 'M1) #:substitution (list (nonterminal-symbol 'T)))) (make-rule
#:symbol 'S #:label (syntax-label 'S1) #:substitution (list (nonterminal-symbol 'M))))
(define T-rule (define M-rule0
(make-rule #:symbol 'T #:label (syntax-label 'T) #:substitution (list (terminal-symbol 'number)))) (make-rule
#:symbol 'M
#:label (syntax-label 'M0)
#:substitution (list (nonterminal-symbol 'M) (terminal-symbol '*) (nonterminal-symbol 'T))))
(define arithmetic-grammar (define M-rule1
(make-grammar #:rules (list P-rule S-rule0 S-rule1 M-rule0 M-rule1 T-rule) #:start-symbol 'P)) (make-rule
#:symbol 'M #:label (syntax-label 'M1) #:substitution (list (nonterminal-symbol 'T))))
(define T-rule
(make-rule
#:symbol 'T #:label (syntax-label 'T) #:substitution (list (terminal-symbol 'number))))
(define input-tokens (define arithmetic-grammar
(list (make-grammar #:rules (list P-rule S-rule0 S-rule1 M-rule0 M-rule1 T-rule) #:start-symbol 'P))
(syntax-token 'number 2 #:position 1 #:span 1)
(syntax-token '+ #:position 2 #:span 1)
(syntax-token 'number 3 #:position 3 #:span 1)
(syntax-token '* #:position 4 #:span 1)
(syntax-token 'number 4 #:position 5 #:span 1)))
(grammar-parse-to-syntax arithmetic-grammar input-tokens) (define input-tokens
(list
(syntax-token 'number 2 #:position 1 #:span 1)
(syntax-token '+ #:position 2 #:span 1)
(syntax-token 'number 3 #:position 3 #:span 1)
(syntax-token '* #:position 4 #:span 1)
(syntax-token 'number 4 #:position 5 #:span 1)))
(define parser (earley-parser arithmetic-grammar))
(define arithmetic-parse-forest (check-equal? (syntax->datum (parse-syntax parser input-tokens))
(grammar-parse-to-syntax arithmetic-grammar input-tokens)) '(P (S0 (S1 (M1 (T 2))) + (M0 (M1 (T 3)) * (T 4))))))))
Loading…
Cancel
Save