You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
|
|
|
#lang racket/base
|
|
|
|
(require parser-tools/lex (prefix-in : parser-tools/lex-sre) ragg/support)
|
|
|
|
(provide tokenize)
|
|
|
|
|
|
|
|
;; tokenizer prepares source for parser by
|
|
|
|
;; 1) identifying tokens, the smallest unit of information
|
|
|
|
;; 2) throwing away anything irrelevant (whitespace, comments)
|
|
|
|
;; tokenizer cooperates with the lexer, which is a fancy regular-expression processor
|
|
|
|
|
|
|
|
(define (tokenize ip)
|
|
|
|
(define get-token
|
|
|
|
(lexer
|
|
|
|
[(char-set "><-.,+[]") lexeme]
|
|
|
|
;; todo: try adding support for line comments
|
|
|
|
#;[(:: "#" (:* (complement "\n")) "\n") (token 'comment #:skip? #t)]
|
|
|
|
[whitespace (token 'white #:skip? #t)]
|
|
|
|
;; treat other characters as comments
|
|
|
|
[(char-range #\nul #\~) (token 'ascii #:skip? #t)]
|
|
|
|
[(eof) eof]))
|
|
|
|
|
|
|
|
(define (next-token) (get-token ip))
|
|
|
|
|
|
|
|
next-token)
|
|
|
|
|
|
|
|
(module+ test
|
|
|
|
(require rackunit)
|
|
|
|
(define (test-tokenize str)
|
|
|
|
(define ip (open-input-string str))
|
|
|
|
(define token-producer (tokenize ip))
|
|
|
|
(for/list ([token (in-producer token-producer eof)])
|
|
|
|
token))
|
|
|
|
|
|
|
|
(check-equal? (test-tokenize "+") (list "+")))
|