From f847785c606af7faa8f81e2db98365512f886c47 Mon Sep 17 00:00:00 2001
From: Matthew Butterick <mb@mbtype.com>
Date: Wed, 29 Mar 2017 16:04:18 -0700
Subject: [PATCH] objects & streams

---
 pitfall/pitfall/parse.rkt       | 59 ++++++++++++++++++++++++---------
 pitfall/pitfall/parser-test.rkt |  6 ++++
 pitfall/pitfall/parser.rkt      | 14 +++++---
 pitfall/pitfall/test.rkt        | 46 +++++++++++++++++++++++--
 pitfall/pitfall/tokenizer.rkt   | 54 ++++++++++++++++--------------
 5 files changed, 132 insertions(+), 47 deletions(-)
 create mode 100644 pitfall/pitfall/parser-test.rkt

diff --git a/pitfall/pitfall/parse.rkt b/pitfall/pitfall/parse.rkt
index 4744d598..23015978 100644
--- a/pitfall/pitfall/parse.rkt
+++ b/pitfall/pitfall/parse.rkt
@@ -1,5 +1,5 @@
 #lang at-exp br/quicklang
-(require "parser.rkt" "tokenizer.rkt" gregor)
+(require "parser.rkt" "tokenizer.rkt" gregor racket/bytes)
 (provide (matching-identifiers-out #rx"pf-" (all-defined-out)))
 
 (module+ test (require rackunit))
@@ -7,7 +7,7 @@
 (module+ reader (provide read-syntax))
 
 (define (read-syntax src port)
-  (define parse-tree (parse (make-tokenizer src port)))
+  (define parse-tree (parse (make-tokenizer port src)))
   (strip-bindings
    #`(module pitfall-parse-mod pitfall/parse
        #,parse-tree)))
@@ -24,7 +24,7 @@
 (define (pf-name str)
   (let* ([str (string-trim str "/" #:right? #f)]
          [str (regexp-replace* @pregexp{#(\d\d)} str (λ (m sub) (string (integer->char (string->number sub 16)))))])
-         (string->symbol str)))
+    (string->symbol str)))
 
 (module+ test
   (check-equal? (pf-name "B#45#20NICE") '|BE NICE|))
@@ -34,25 +34,54 @@
     [(andmap byte? (cons arg tail)) (cons arg tail)]
     [(string-prefix? arg "D:")
      #;(parameterize ([current-locale "en"])
-    (parse-date "2015-03-15T02:02:02-04:00" "yyyy-MM-dd'T'HH:mm:ssxxx"))
+         (parse-date "2015-03-15T02:02:02-04:00" "yyyy-MM-dd'T'HH:mm:ssxxx"))
      #f]     
     [else
-     (let* ([str (regexp-replace @regexp{^\((.*)\)$} arg "\\1")]
-            [str (regexp-replace* @pregexp{\\(\\|\))} str "\\1")]
-           [str (regexp-replace* @pregexp{\\(\d\d\d)} str (λ (m sub) (string (integer->char (string->number sub)))))])
+     (let* ([str (regexp-replace @regexp{^\((.*)\)$} arg "\\1")] ; remove parens
+            [str (string-replace str (string-append "\\" "\n") "")]
+            [str (regexp-replace* @pregexp{\\(n|r|t|b|f|\(|\)|\\)} str (λ (m sub)
+                                                                         (case sub
+                                                                           [("n") "\n"]
+                                                                           [("r") "\r"]
+                                                                           [("t") "\t"]
+                                                                           [("b") "\b"]
+                                                                           [("f") "\f"]
+                                                                           [else sub])))]
+            [str (regexp-replace* @pregexp{\\(\d{2,3})} str (λ (m sub) (string (integer->char (string->number sub 8)))))])
        str)]))
 
 (module+ test
-  (check-equal? (pf-string "(Testing)") "Testing")
-  (check-equal? (pf-string "(Test\\)ing)") "Test)ing")
-  (check-equal? (pf-string "(Test\\\\ing)") "Test\\ing")
-  (check-equal? (pf-string "(A\\043B)") "A+B")
-  #;(check-equal? (pf-string "(D:19990209153925-08\'00\')") )
-  #;(check-true (andmap byte? (pf-string "<1C2D3F>")))
-  #;(check-true (andmap byte? (pf-string "<1C 2D 3F>"))))
+  (check-equal? @pf-string{(Testing)} "Testing")
+   (check-equal? (pf-string @string-append{(Test\
+ ing)}) "Testing")
+  (check-equal? @pf-string{(Test\)ing)} "Test)ing")
+  (check-equal? @pf-string{(Test\ning)} "Test\ning")
+  (check-equal? @pf-string{(Test\\ing)} "Test\\ing")
+  (check-equal? @pf-string{(A\53B)} "A+B")
+  (check-equal? @pf-string{(A\053B)} "A+B")
+  #;(check-equal? @pf-string{(D:19990209153925-08\'00\')})
+  #;(check-true (andmap byte? @pf-string{<1C2D3F>}))
+  #;(check-true (andmap byte? @pf-string{<1C 2D 3F>})))
 
 (define (pf-array . xs) xs)
 
 (define (pf-dict . args)
   (apply hash args))
-      
\ No newline at end of file
+
+(struct $stream (dict data) #:transparent)
+(define (pf-stream dict str)
+  (define data (string->bytes/utf-8 str))
+  (when (not (equal? (hash-ref dict 'Length) (bytes-length data)))
+    (raise-argument-error 'pf-stream (format "~a bytes of data" (hash-ref dict 'Length)) (format "~a = ~a" (bytes-length data) data)))
+  ($stream dict data))
+
+(define indirect-objects (make-hash))
+(provide indirect-objects)
+
+(define (pf-indirect-object obj-num generation-num thing)
+  (hash-set! indirect-objects (vector obj-num generation-num) thing))
+
+(define-macro (pf-indirect-object-ref (OBJ-NUM GENERATION-NUM "R"))
+  #'(hash-ref (report indirect-objects) (vector (string->number OBJ-NUM)
+                                       (string->number GENERATION-NUM))))
+  
\ No newline at end of file
diff --git a/pitfall/pitfall/parser-test.rkt b/pitfall/pitfall/parser-test.rkt
new file mode 100644
index 00000000..6fabf73a
--- /dev/null
+++ b/pitfall/pitfall/parser-test.rkt
@@ -0,0 +1,6 @@
+#lang at-exp br
+(require rackunit "parser.rkt" "tokenizer.rkt" brag/support)
+
+(apply-tokenizer-maker make-tokenizer @string-append{(string () here) << /A (B) >>})
+
+#;(parse-to-datum (apply-tokenizer-maker make-tokenizer @string-append{(string () here) << /A (B) >>}))
\ No newline at end of file
diff --git a/pitfall/pitfall/parser.rkt b/pitfall/pitfall/parser.rkt
index ba53d575..4d46e715 100644
--- a/pitfall/pitfall/parser.rkt
+++ b/pitfall/pitfall/parser.rkt
@@ -1,10 +1,14 @@
 #lang brag
 
 pf-program : pf-thing*
-@pf-thing : NULL | CHAR | BOOLEAN | INT | REAL | pf-name | pf-string | pf-array | pf-dict
+@pf-thing : pf-null | CHAR | BOOLEAN | INT | REAL | pf-name | pf-string | pf-array | pf-dict | pf-stream | pf-indirect-object | pf-indirect-object-ref
+@pf-null : NULL
 pf-name : NAME
-pf-string : PAREN-TOK | /LEFT-ANGLE HEX-DIGIT-PAIR+ /RIGHT-ANGLE
-pf-array : /LEFT-BRACKET pf-thing* /RIGHT-BRACKET
-pf-dict : /DOUBLE-LEFT-ANGLE (pf-dict-key pf-dict-value)* /DOUBLE-RIGHT-ANGLE
+pf-string : STRING-TOK | /"<" HEX-DIGIT-PAIR+ /">"
+pf-array : /"[" pf-thing* /"]"
+pf-dict : /"<<" (pf-dict-key pf-dict-value)* /">>"
 @pf-dict-key : pf-thing
-@pf-dict-value : pf-thing
\ No newline at end of file
+@pf-dict-value : pf-thing
+pf-stream : pf-dict STREAM-DATA
+pf-indirect-object : INT INT /"obj" pf-thing /"endobj"
+pf-indirect-object-ref : INDIRECT-OBJECT-REF-TOK
\ No newline at end of file
diff --git a/pitfall/pitfall/test.rkt b/pitfall/pitfall/test.rkt
index 087f55fa..9c9e7b8c 100644
--- a/pitfall/pitfall/test.rkt
+++ b/pitfall/pitfall/test.rkt
@@ -15,19 +15,61 @@ false
 /ThisIsName37
 /Lime#20Green
 /SSCN_SomeSecondClassName
+/Adobe#20Green
+/The_Key_of_F#23_Minor
 (Testing)
 (A\053B)
+
 %(D:19990209153925-08'00')
 <1C2D3F>
 <1C 2D 3F>
 [0 0 612 792]
 [(T) -20.5 (H) 4 (E)]
 [[1 2 3][4 5 6]]
+<<
+/Type /Example
+/Subtype /DictionaryExample
+/Version 0.01
+/IntegerItem 12
+/StringItem (a string)
+/Subdictionary <<
+/Item1 0.4
+/Item2 true
+/LastItem (not!)
+/VeryLastItem (OK) >> >>
+( This string contains \245two octal characters\307 . )
+(Strings may contain balanced parentheses ( ) and special \ncharacters (*!&}^% and so on).)
 % a more human-readable dictionary
 <<
   /Type /Page
   /Author (Leonard Rosenthol)
-  /Resources << /Font [ /F1 /F2 ] >>
+  /Resources 42
 >>
 % stripped
-<</Length 3112/Subtype/XML/Type/Metadata>>
\ No newline at end of file
+<</Length 3112/Subtype/XML/Type/Metadata>>
+
+<<
+  /Type /Xobject /Subtype /Image /Filter /FlateDecode /Length 4 /Height 32 /Width 32
+>>
+stream
+abcd
+endstream
+
+12 0 obj
+( Brillig )
+endobj
+
+8 0 obj
+63
+endobj
+
+7 0 obj
+<< /Length 8 0 R >>
+stream
+BT
+/F1 12 Tf
+72 712 Td
+(A stream with an indirect length) Tj
+ET
+endstream
+endobj
\ No newline at end of file
diff --git a/pitfall/pitfall/tokenizer.rkt b/pitfall/pitfall/tokenizer.rkt
index 6ad2c321..33751114 100644
--- a/pitfall/pitfall/tokenizer.rkt
+++ b/pitfall/pitfall/tokenizer.rkt
@@ -1,40 +1,44 @@
-#lang br
+#lang at-exp br
 (require brag/support)
 (provide make-tokenizer)
 
 (define-lex-abbrev digit (char-set "0123456789"))
 (define-lex-abbrev hex-digit (:or digit (char-set "ABCDEF")))
 (define-lex-abbrev digits (:+ digit))
-(define-lex-abbrev sign (:? (:or "+" "-")))
-(define-lex-abbrev blackspace (:~ whitespace))
-(define-lex-abbrev ascii-char (char-set ascii))
+(define-lex-abbrev optional-sign (:? (:or "+" "-")))
+(define-lex-abbrev pdf-whitespace (char-set "\u0000\t\n\f\r "))
+(define-lex-abbrev pdf-delimiter (char-set "()<>[]{}/%"))
+#;(define-lex-abbrev pdf-reg)
+(define-lex-abbrev blackspace (:~ pdf-whitespace))
+(define-lex-abbrev not-right-paren (:~ ")"))
+(define-lex-abbrev substring (:seq "(" (:* not-right-paren) ")"))
 
 (define-lex-abbrev nonreg-char (:seq "#" hex-digit hex-digit))
 
-(define (make-tokenizer src port)
+(define (make-tokenizer port [src #f])
   (port-count-lines! port)
   (lexer-file-path src)
-  (define lex-once
+  (define lex-one-token
     (lexer-srcloc
      [(eof) eof]
-     [(:or whitespace
-       (from/stop-before "%" "\n"))
-      (token 'IGNORE lexeme #:skip? #t)]
+     [(:seq digits (:+ pdf-whitespace) digits (:+ pdf-whitespace) "R")
+      (begin (println (string-split lexeme))
+      (token 'INDIRECT-OBJECT-REF-TOK (string-split lexeme)))]
+     [(:or pdf-whitespace
+           (from/stop-before "%" #\newline)) (token 'IGNORE lexeme #:skip? #t)]
      [(:or "true" "false") (token 'BOOLEAN (equal? lexeme "true"))]
-     [(:seq sign digits) (token 'INT (string->number lexeme))]
-     [(:seq sign (:or (:seq digits "." (:? digits))
-                      (:seq "." digits)))
+     [(:seq optional-sign digits) (token 'INT (string->number lexeme))]
+     [(:seq optional-sign (:or (:seq digits "." (:? digits))
+                               (:seq "." digits)))
       (token 'REAL (string->number lexeme))]
-     [(:seq "/" (:+ (:or nonreg-char alphabetic "_" numeric)))
-      (token 'NAME lexeme)]
-      ["null" (token 'NULL 'null)]
-      [(from/to "(" ")") (token 'PAREN-TOK lexeme)]
-      [(:seq hex-digit hex-digit) (token 'HEX-DIGIT-PAIR (string->number lexeme 16))]
-      ["<" (token 'LEFT-ANGLE)]
-      [">" (token 'RIGHT-ANGLE)]
-      ["<<" (token 'DOUBLE-LEFT-ANGLE)]
-      [">>" (token 'DOUBLE-RIGHT-ANGLE)]
-      ["[" (token 'LEFT-BRACKET)]
-      ["]" (token 'RIGHT-BRACKET)]
-      [any-char (token 'CHAR lexeme)]))
-(λ () (lex-once port)))
\ No newline at end of file
+     [(from/stop-before "/" (:or pdf-delimiter pdf-whitespace)) (token 'NAME lexeme)]
+     ["null" (token 'NULL 'null)]
+     [(:seq "(" (:* (:or not-right-paren substring)) ")") (token 'STRING-TOK lexeme)]
+     [(:seq hex-digit hex-digit) (token 'HEX-DIGIT-PAIR (string->number lexeme 16))]
+     [(:or "<" ">" "<<" ">>" "[" "]" "obj" "endobj") (token lexeme lexeme)]
+     [(from/to "stream" "endstream") (token 'STREAM-DATA (string-trim (trim-ends "stream" lexeme "endstream")))]
+     [any-char (token 'CHAR lexeme)]))
+  (λ () (lex-one-token port)))
+
+(module+ test
+  (apply-tokenizer-maker make-tokenizer @string-append{(s(t)r) << /A (B) >>}))
\ No newline at end of file