From c65391f703756000d281e9f089d787f11964fb63 Mon Sep 17 00:00:00 2001 From: Jack Firth Date: Mon, 4 Apr 2022 21:45:47 -0700 Subject: [PATCH] Add a parser API --- base/grammar.rkt | 67 ++++++ parser.rkt | 79 +++++++ .../earley.rkt | 205 ++++++++---------- 3 files changed, 239 insertions(+), 112 deletions(-) create mode 100644 base/grammar.rkt create mode 100644 parser.rkt rename private/primitive-grammar.rkt => parser/earley.rkt (64%) diff --git a/base/grammar.rkt b/base/grammar.rkt new file mode 100644 index 0000000..2996e3c --- /dev/null +++ b/base/grammar.rkt @@ -0,0 +1,67 @@ +#lang racket/base + + +(require racket/contract/base) + + +(provide + (struct-out terminal-symbol) + (struct-out nonterminal-symbol) + (struct-out context-free-grammar) + (struct-out context-free-production-rule) + (contract-out + [grammar-symbol? predicate/c] + [grammar-start-rules + (-> context-free-grammar? (set/c context-free-production-rule? #:kind 'immutable))] + [make-grammar + (-> #:rules (sequence/c context-free-production-rule?) #:start-symbol any/c context-free-grammar?)] + [make-rule + (-> #:symbol any/c #:substitution (sequence/c grammar-symbol?) #:label any/c + context-free-production-rule?)])) + + +(require racket/sequence + racket/set + rebellion/collection/vector) + + +;@---------------------------------------------------------------------------------------------------- + + +;; Parsing takes a (Grammar T S L) and a sequence of (Token T V) and produces a set of +;; (Parser-Derivation V L) (also called a "parse forest"). A grammar contains an immutable +;; vector of (Context-Free-Production-Rule T S L) and a start symbol of type S. +;; T: the terminals the grammar parses. Corresponds to the type field of the input tokens. +;; S: the nonterminals the grammar rules are defined in terms of. +;; L: the labels that grammar rules may have attached to them. These show up in parse tree +;; branches, and can be used to determine which production rule produced a derivation. +(struct context-free-grammar (rules start-symbol) #:transparent) + + +(define (grammar-start-rules grammar) + (define start (context-free-grammar-start-symbol grammar)) + (for/set ([rule (in-vector (context-free-grammar-rules grammar))] + #:when (equal? (context-free-production-rule-nonterminal rule) start)) + rule)) + + +;; A (Context-Free-Production-Rule T S L) contains a nonterminal symbol of type S, a label of type L, +;; and a substitution sequence of (Grammar-Symbol T S) values, stored in an immutable vector. +(struct context-free-production-rule (nonterminal label substitution) #:transparent) + + +;; A (Grammar-Symbol T S) is either a (Terminal-Symbol T) or a (Nonterminal-Symbol S) +(define (grammar-symbol? v) + (or (terminal-symbol? v) (nonterminal-symbol? v))) + + +(struct terminal-symbol (value) #:transparent) +(struct nonterminal-symbol (value) #:transparent) + + +(define (make-grammar #:rules rules #:start-symbol start) + (context-free-grammar (sequence->vector rules) start)) + + +(define (make-rule #:symbol symbol #:substitution substitution #:label label) + (context-free-production-rule symbol label (sequence->vector substitution))) diff --git a/parser.rkt b/parser.rkt new file mode 100644 index 0000000..6a7c009 --- /dev/null +++ b/parser.rkt @@ -0,0 +1,79 @@ +#lang racket/base + + +(require racket/contract/base) + + +(provide + (contract-out + [parser? predicate/c] + [parse-datum (-> parser? (sequence/c token?) parser-derivation?)] + [parse-syntax (-> parser? (sequence/c syntax-token?) syntax?)] + [parse-ambiguous-datum (-> parser? (sequence/c token?) (set/c parser-derivation?))] + [parse-ambiguous-syntax (-> parser? (sequence/c syntax-token?) (set/c syntax?))])) + + +(module+ private + (provide + (contract-out + [make-parser + (-> #:datum-function (-> (sequence/c token?) (stream/c parser-derivation?)) + #:syntax-function (-> (sequence/c syntax-token?) (stream/c syntax?)) + parser?)]))) + + +(require racket/sequence + racket/set + racket/stream + yaragg/base/token + yaragg/base/derivation) + + +;@---------------------------------------------------------------------------------------------------- + + +(struct parser (datum-function syntax-function)) + + +(define (make-parser #:datum-function datum-function #:syntax-function syntax-function) + (parser datum-function syntax-function)) + + +(define (parse-ambiguous-syntax p tokens) + (for/set ([stx (in-stream ((parser-syntax-function p) tokens))]) + stx)) + + +(define (parse-ambiguous-datum p tokens) + (for/set ([derivation (in-stream ((parser-datum-function p) tokens))]) + derivation)) + + +(define (parse-syntax p tokens) + (define stx-stream ((parser-syntax-function p) tokens)) + (when (stream-empty? stx-stream) + (raise-arguments-error 'parse-syntax "no parse trees produced" "parser" p "tokens" tokens)) + (define stx (stream-first stx-stream)) + (unless (stream-empty? (stream-rest stx-stream)) + (raise-arguments-error 'parse-syntax + "ambiguous parse, multiple parse trees produced" + "parser" p + "tokens" tokens + "first parse tree" stx + "second parse tree" (stream-first (stream-rest stx-stream)))) + stx) + + +(define (parse-datum p tokens) + (define derivation-stream ((parser-datum-function p) tokens)) + (when (stream-empty? derivation-stream) + (raise-arguments-error 'parse-datum "no parse trees produced" "parser" p "tokens" tokens)) + (define derivation (stream-first derivation-stream)) + (unless (stream-empty? (stream-rest derivation-stream)) + (raise-arguments-error 'parse-datum + "ambiguous parse, multiple parse trees produced" + "parser" p + "tokens" tokens + "first parse tree" derivation + "second parse tree" (stream-first (stream-rest derivation-stream)))) + derivation) diff --git a/private/primitive-grammar.rkt b/parser/earley.rkt similarity index 64% rename from private/primitive-grammar.rkt rename to parser/earley.rkt index c638857..048d23e 100644 --- a/private/primitive-grammar.rkt +++ b/parser/earley.rkt @@ -1,6 +1,14 @@ #lang racket/base +(require racket/contract/base) + + +(provide + (contract-out + [earley-parser (-> context-free-grammar? parser?)])) + + (require racket/contract racket/match racket/set @@ -9,7 +17,10 @@ rebellion/collection/vector rebellion/private/guarded-block yaragg/base/derivation - yaragg/base/token) + yaragg/base/grammar + yaragg/base/token + yaragg/parser + (submod yaragg/parser private)) (module+ test @@ -20,43 +31,9 @@ ;@---------------------------------------------------------------------------------------------------- -;; Parsing takes a (Grammar T S L) and a sequence of (Token T V) and produces a set of -;; (Parser-Derivation V L) (also called a "parse forest"). A grammar contains an immutable -;; vector of (Context-Free-Production-Rule T S L) and a start symbol of type S. -;; T: the terminals the grammar parses. Corresponds to the type field of the input tokens. -;; S: the nonterminals the grammar rules are defined in terms of. -;; L: the labels that grammar rules may have attached to them. These show up in parse tree -;; branches, and can be used to determine which production rule produced a derivation. -(struct context-free-grammar (rules start-symbol) #:transparent) - - -(define (grammar-start-rules grammar) - (define start (context-free-grammar-start-symbol grammar)) - (for/set ([rule (in-vector (context-free-grammar-rules grammar))] - #:when (equal? (context-free-production-rule-nonterminal rule) start)) - rule)) - - -;; A (Context-Free-Production-Rule T S L) contains a nonterminal symbol of type S, a label of type L, -;; and a substitution sequence of (Grammar-Symbol T S) values, stored in an immutable vector. -(struct context-free-production-rule (nonterminal label substitution) #:transparent) - - -;; A (Grammar-Symbol T S) is either a (Terminal-Symbol T) or a (Nonterminal-Symbol S) -(struct grammar-symbol () #:transparent) -(struct terminal-symbol grammar-symbol (value) #:transparent) -(struct nonterminal-symbol grammar-symbol (value) #:transparent) - - -(define (make-grammar #:rules rules #:start-symbol start) - (context-free-grammar (sequence->vector rules) start)) - - -(define (make-rule #:symbol symbol #:substitution substitution #:label label) - (context-free-production-rule symbol label (sequence->vector substitution))) - - -;; Earley parser +(define (earley-parser grammar) + (make-parser #:datum-function (λ (tokens) (earley-parse-datum grammar tokens)) + #:syntax-function (λ (tokens) (earley-parse-syntax grammar tokens)))) ;; The hash keys are sppf-labels and the values are a list of sppf-child-pairs @@ -167,7 +144,7 @@ (context-free-grammar-start-symbol grammar)))) -(define (earley-parse grammar token-sequence) +(define (earley-parse-datum grammar token-sequence) (define tokens (sequence->vector token-sequence)) (define token-count (vector-length tokens)) (define position-count (add1 token-count)) @@ -268,96 +245,100 @@ (earley-state-advance-substitution s #:key new-key))) -(define (grammar-parse-to-syntax grammar token-sequence) +(define (earley-parse-syntax grammar token-sequence) (define tokens (for/vector ([t token-sequence]) (token (syntax-token-type t) t))) - (for/set ([derivation (in-set (earley-parse grammar tokens))]) + (for/set ([derivation (in-set (earley-parse-datum grammar tokens))]) (parser-derivation->syntax derivation))) (module+ test - (test-case "earley-parse integration test" - - ;; Grammar, input, and states taken from https://en.wikipedia.org/wiki/Earley_parser#Example - (define P-rule (make-rule #:symbol 'P #:label 'P #:substitution (list (nonterminal-symbol 'S)))) - (define S-rule0 - (make-rule - #:symbol 'S - #:label 'S0 - #:substitution (list (nonterminal-symbol 'S) (terminal-symbol '+) (nonterminal-symbol 'M)))) - (define S-rule1 (make-rule #:symbol 'S #:label 'S1 #:substitution (list (nonterminal-symbol 'M)))) - (define M-rule0 - (make-rule - #:symbol 'M - #:label 'M0 - #:substitution (list (nonterminal-symbol 'M) (terminal-symbol '*) (nonterminal-symbol 'T)))) - (define M-rule1 (make-rule #:symbol 'M #:label 'M1 #:substitution (list (nonterminal-symbol 'T)))) - (define T-rule (make-rule #:symbol 'T #:label 'T #:substitution (list (terminal-symbol 'number)))) - (define arithmetic-grammar - (make-grammar #:rules (list P-rule S-rule0 S-rule1 M-rule0 M-rule1 T-rule) #:start-symbol 'P)) - (define input-tokens - (list (token 'number 2) (token '+ 'plus) (token 'number 3) (token '* 'times) (token 'number 4))) - - (define arithmetic-parse-forest - (earley-parse arithmetic-grammar input-tokens)) - - (define expected-arithmetic-parse-tree - (parser-derivation - 'P - (parser-derivation - 'S0 - (parser-derivation 'S1 (parser-derivation 'M1 (parser-derivation 'T (parser-derivation 2)))) - (parser-derivation 'plus) + (test-case "earley-parser integration test" + + ;; Grammar and input taken from https://en.wikipedia.org/wiki/Earley_parser#Example + + (test-case "datum parser" + (define P-rule (make-rule #:symbol 'P #:label 'P #:substitution (list (nonterminal-symbol 'S)))) + (define S-rule0 + (make-rule + #:symbol 'S + #:label 'S0 + #:substitution (list (nonterminal-symbol 'S) (terminal-symbol '+) (nonterminal-symbol 'M)))) + (define S-rule1 + (make-rule #:symbol 'S #:label 'S1 #:substitution (list (nonterminal-symbol 'M)))) + (define M-rule0 + (make-rule + #:symbol 'M + #:label 'M0 + #:substitution (list (nonterminal-symbol 'M) (terminal-symbol '*) (nonterminal-symbol 'T)))) + (define M-rule1 + (make-rule #:symbol 'M #:label 'M1 #:substitution (list (nonterminal-symbol 'T)))) + (define T-rule + (make-rule #:symbol 'T #:label 'T #:substitution (list (terminal-symbol 'number)))) + (define arithmetic-grammar + (make-grammar #:rules (list P-rule S-rule0 S-rule1 M-rule0 M-rule1 T-rule) #:start-symbol 'P)) + (define input-tokens + (list + (token 'number 2) (token '+ 'plus) (token 'number 3) (token '* 'times) (token 'number 4))) + (define parser (earley-parser arithmetic-grammar)) + (define expected-arithmetic-parse-tree (parser-derivation - 'M0 - (parser-derivation 'M1 (parser-derivation 'T (parser-derivation 3))) - (parser-derivation 'times) - (parser-derivation 'T (parser-derivation 4)))))) + 'P + (parser-derivation + 'S0 + (parser-derivation 'S1 (parser-derivation 'M1 (parser-derivation 'T (parser-derivation 2)))) + (parser-derivation 'plus) + (parser-derivation + 'M0 + (parser-derivation 'M1 (parser-derivation 'T (parser-derivation 3))) + (parser-derivation 'times) + (parser-derivation 'T (parser-derivation 4)))))) - (check-equal? arithmetic-parse-forest (set expected-arithmetic-parse-tree)))) - - -;; Grammar, input, and states taken from https://en.wikipedia.org/wiki/Earley_parser#Example -(define P-rule - (make-rule #:symbol 'P #:label (syntax-label 'P) #:substitution (list (nonterminal-symbol 'S)))) - -(define S-rule0 - (make-rule - #:symbol 'S - #:label (syntax-label 'S0) - #:substitution (list (nonterminal-symbol 'S) (terminal-symbol '+) (nonterminal-symbol 'M)))) + (check-equal? (parse-datum parser input-tokens) expected-arithmetic-parse-tree)) -(define S-rule1 - (make-rule #:symbol 'S #:label (syntax-label 'S1) #:substitution (list (nonterminal-symbol 'M)))) + (test-case "syntax parser" + (define P-rule + (make-rule + #:symbol 'P #:label (syntax-label 'P) #:substitution (list (nonterminal-symbol 'S)))) -(define M-rule0 - (make-rule - #:symbol 'M - #:label (syntax-label 'M0) - #:substitution (list (nonterminal-symbol 'M) (terminal-symbol '*) (nonterminal-symbol 'T)))) + (define S-rule0 + (make-rule + #:symbol 'S + #:label (syntax-label 'S0) + #:substitution (list (nonterminal-symbol 'S) (terminal-symbol '+) (nonterminal-symbol 'M)))) -(define M-rule1 - (make-rule #:symbol 'M #:label (syntax-label 'M1) #:substitution (list (nonterminal-symbol 'T)))) + (define S-rule1 + (make-rule + #:symbol 'S #:label (syntax-label 'S1) #:substitution (list (nonterminal-symbol 'M)))) -(define T-rule - (make-rule #:symbol 'T #:label (syntax-label 'T) #:substitution (list (terminal-symbol 'number)))) + (define M-rule0 + (make-rule + #:symbol 'M + #:label (syntax-label 'M0) + #:substitution (list (nonterminal-symbol 'M) (terminal-symbol '*) (nonterminal-symbol 'T)))) -(define arithmetic-grammar - (make-grammar #:rules (list P-rule S-rule0 S-rule1 M-rule0 M-rule1 T-rule) #:start-symbol 'P)) + (define M-rule1 + (make-rule + #:symbol 'M #:label (syntax-label 'M1) #:substitution (list (nonterminal-symbol 'T)))) + (define T-rule + (make-rule + #:symbol 'T #:label (syntax-label 'T) #:substitution (list (terminal-symbol 'number)))) -(define input-tokens - (list - (syntax-token 'number 2 #:position 1 #:span 1) - (syntax-token '+ #:position 2 #:span 1) - (syntax-token 'number 3 #:position 3 #:span 1) - (syntax-token '* #:position 4 #:span 1) - (syntax-token 'number 4 #:position 5 #:span 1))) + (define arithmetic-grammar + (make-grammar #:rules (list P-rule S-rule0 S-rule1 M-rule0 M-rule1 T-rule) #:start-symbol 'P)) -(grammar-parse-to-syntax arithmetic-grammar input-tokens) + (define input-tokens + (list + (syntax-token 'number 2 #:position 1 #:span 1) + (syntax-token '+ #:position 2 #:span 1) + (syntax-token 'number 3 #:position 3 #:span 1) + (syntax-token '* #:position 4 #:span 1) + (syntax-token 'number 4 #:position 5 #:span 1))) + (define parser (earley-parser arithmetic-grammar)) -(define arithmetic-parse-forest - (grammar-parse-to-syntax arithmetic-grammar input-tokens)) + (check-equal? (syntax->datum (parse-syntax parser input-tokens)) + '(P (S0 (S1 (M1 (T 2))) + (M0 (M1 (T 3)) * (T 4))))))))