#lang racket/base
(require parser-tools/lex (prefix-in : parser-tools/lex-sre) ragg/support)
(provide tokenize)

;; The tokenizer prepares source text for the parser by
;; 1) identifying tokens, the smallest meaningful units of the input
;; 2) throwing away anything irrelevant (whitespace, comments)
;; The tokenizer is built on the lexer, which is a fancy regular-expression processor.

;; tokenize : input-port -> (-> any)
;; Builds a token producer over the input port `ip`.  The result is a thunk;
;; each call lexes one more token from `ip` and returns:
;;   - the matched one-character string for any of the characters  > < - . , + [ ]
;;   - the value built by `(token ... #:skip? #t)` for every other character
;;   - `eof` once the port is exhausted
(define (tokenize ip)
  (define get-token
    (lexer
     ;; `lexeme` is the text matched by the rule, here always a single character.
     [(char-set "><-.,+[]") lexeme]
     ;; todo: try adding support for line comments
     #;[(:: "#" (:* (complement "\n")) "\n") (token 'comment #:skip? #t)]
     [whitespace (token 'white #:skip? #t)]
     ;; treat other characters as comments
     [(char-range #\nul #\~) (token 'ascii #:skip? #t)]
     ;; The range above stops at #\~, so DEL and every non-ASCII character
     ;; would otherwise match no rule and make the lexer raise an error.
     ;; This rule is listed after the ASCII one so that all previously
     ;; accepted input still produces exactly the same tokens.
     [any-char (token 'non-ascii #:skip? #t)]
     [(eof) eof]))
  
  ;; Close over `ip` so callers get a zero-argument producer.
  (define (next-token) (get-token ip))
  
  next-token)

(module+ test
  (require rackunit)

  ;; Lex `str` from start to finish and return every token produced, in
  ;; order, stopping at (and excluding) the eof marker.
  (define (test-tokenize str)
    (define next-token (tokenize (open-input-string str)))
    (let loop ([collected '()])
      (define tok (next-token))
      (if (eof-object? tok)
          (reverse collected)
          (loop (cons tok collected)))))

  ;; A single command character comes back as its one-character string.
  (check-equal? (test-tokenize "+") (list "+")))