#lang racket/base
(require (for-syntax racket/base syntax/parse)
         yaragg/parser-tools/lex
         racket/string
         (prefix-in : yaragg/parser-tools/lex-sre))
(provide (all-from-out yaragg/parser-tools/lex)
         (all-from-out yaragg/parser-tools/lex-sre)
         [struct-out token-struct]
         token
         [struct-out exn:fail:parsing])

(define (token-print token port mode)
  (write-string (format "~a" (cons 'token-struct
                                   (map (λ (proc) (format "~v" (proc token)))
                                        (list token-struct-type
                                              token-struct-val
                                              token-struct-line
                                              token-struct-column
                                              token-struct-offset
                                              token-struct-span
                                              token-struct-skip?))))
                port))

(struct token-struct (type val offset line column span skip?)
  #:auto-value #f
  #:transparent)

;; Token constructor.
;; This is intended to be a general token structure constructor that's nice
;; to work with.
;; It should cooperate with the tokenizers constructed with make-permissive-tokenizer.
(define (token type ;; (U symbol string)
               [val #f] ;; any
               [srcloc #f]
               #:position [position #f] ;; (U #f number)
               #:line [line #f] ;; (U #f number)
               #:column [column #f] ;; (U #f number)
               #:span [span #f] ;; (U #f number)
               #:skip? [skip? #f]) ;; boolean
  (token-struct (if (string? type) (string->symbol type) type)
                val
                ;; keyword values take precedence over srcloc values
                (or position (and srcloc (srcloc-position srcloc)))
                (or line (and srcloc (srcloc-line srcloc)))
                (or column (and srcloc (srcloc-column srcloc)))
                (or span (and srcloc (srcloc-span srcloc)))
                skip?))

;; When bad things happen, we need to emit errors with source location.
(struct exn:fail:parsing exn:fail (srclocs)
  #:transparent
  #:property prop:exn:srclocs
  (lambda (instance) (exn:fail:parsing-srclocs instance)))

(define (open-input-string-with-locs str)
  (parameterize ([port-count-lines-enabled #t])
    (open-input-string str)))

(provide (rename-out [apply-port-proc apply-lexer])
         apply-port-proc)
(define (apply-port-proc proc [val (current-input-port)])
  (for/list ([t (in-port proc (if (string? val)
                                  (open-input-string-with-locs val)
                                  val))])
    t))

(provide apply-tokenizer-maker
         (rename-out [apply-tokenizer-maker apply-tokenizer]))
(define (apply-tokenizer-maker tokenize [in (current-input-port)])
  (define input-port (if (string? in)
                         (open-input-string-with-locs in)
                         in))
  (define token-producer (tokenize input-port))
  (for/list ([token (in-producer token-producer
                                 (λ (tok)
                                   (define val
                                     (cond
                                       ;; position-tokens are produced by lexer-src-pos,
                                       [(position-token? tok) (position-token-token tok)]
                                       ;; and srcloc-tokens by lexer-srcloc
                                       [(srcloc-token? tok) (srcloc-token-token tok)]
                                       [else tok]))
                                   (or (eof-object? val) (void? val))))])
    token))

(provide apply-colorer)
(define (apply-colorer colorer port-or-string)
  (define p (if (string? port-or-string)
                (open-input-string port-or-string)
                port-or-string))
  (let loop ([acc '()])
    (define-values (lex cat shape start end) (colorer p))
    (if (or (eq? 'eof cat) (eof-object? lex))
        (reverse acc)
        (loop (cons (list lex cat shape start end) acc)))))

(provide trim-ends)
(define (trim-ends left lexeme right)
  (string-trim (string-trim lexeme left #:right? #f) right #:left? #f))

(provide from/to)
(define-lex-trans from/to
  (λ (stx)
    (syntax-parse stx
      [(_ OPEN CLOSE)
       #'(:seq (from/stop-before OPEN CLOSE) CLOSE)])))

(provide from/stop-before)
(define-lex-trans from/stop-before
  (λ (stx)
    (syntax-parse stx
      [(_ OPEN CLOSE)
       ;; the (:seq any-string CLOSE any-string) pattern makes the match non-greedy
       #'(:seq OPEN (complement (:seq any-string CLOSE any-string)))])))
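;; ----------------------------------------------------------------------------
;; Illustrative check (editor's addition, not part of the original module):
;; a minimal sketch of how from/to is typically used inside a lexer. The
;; comment-lexer name, the 'COMMENT and 'CHAR token types, and the sample
;; input are hypothetical.
(module+ test
  (require rackunit)
  (define comment-lexer
    (lexer
     ;; from/to matches OPEN through the *first* CLOSE, inclusive
     [(from/to "/*" "*/") (token 'COMMENT lexeme)]
     [any-char (token 'CHAR lexeme)]))
  ;; the match stops at the first "*/", not the last one
  (check-equal? (token-struct-val (comment-lexer (open-input-string "/* hi */ tail */")))
                "/* hi */"))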
(provide uc+lc)
(define-lex-trans uc+lc
  (λ (stx)
    (syntax-parse stx
      [(_ . STRS)
       (with-syntax ([(UCSTR ...) (map (compose1 string-upcase syntax->datum)
                                       (syntax->list #'STRS))]
                     [(LCSTR ...) (map (compose1 string-downcase syntax->datum)
                                       (syntax->list #'STRS))])
         #'(union (union UCSTR ...) (union LCSTR ...)))])))

;; change names of lexer abbreviations to be consistent with Racket srcloc conventions
(define-syntax-rule (dprt ID-IN ID-OUT)
  (begin
    (provide ID-IN)
    (define-syntax ID-IN (make-rename-transformer (syntax ID-OUT)))))

(dprt lexeme-start start-pos)
(dprt lexeme-end end-pos)
(dprt line position-line)
(dprt col position-col)
(dprt pos position-offset)

(provide span)
(define (span lexeme-start lexeme-end)
  (abs ; thus same result in reverse order
   (- (pos lexeme-end)
      (pos lexeme-start))))
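;; ----------------------------------------------------------------------------
;; Illustrative checks (editor's addition, not part of the original module):
;; minimal sketches of token, trim-ends, apply-port-proc (exported as
;; apply-lexer), and apply-tokenizer-maker. The word-lexer and make-tokenizer
;; names and the sample strings are hypothetical.
(module+ test
  (require rackunit)
  ;; keyword arguments populate the corresponding token-struct fields
  (check-equal? (token-struct-offset (token 'ID "x" #:position 42)) 42)
  ;; trim-ends strips a literal prefix and suffix from a lexeme
  (check-equal? (trim-ends "(" "(hello)" ")") "hello")
  ;; a throwaway lexer used by the checks below
  (define word-lexer
    (lexer
     [whitespace (token 'WS lexeme)]
     [(:+ alphabetic) (token 'WORD lexeme)]))
  ;; apply-port-proc (a.k.a. apply-lexer) applies the lexer repeatedly until eof
  (check-equal? (map token-struct-val (apply-port-proc word-lexer "hi there"))
                '("hi" " " "there"))
  ;; apply-tokenizer-maker expects a function from a port to a token-producing thunk
  (define (make-tokenizer port)
    (λ () (word-lexer port)))
  (check-equal? (length (apply-tokenizer-maker make-tokenizer "ab cd")) 3))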