From 266f2f75a8d0d9a7c9cf7b1756c5894b690b59c1 Mon Sep 17 00:00:00 2001 From: Matthew Butterick Date: Mon, 17 Jul 2017 13:45:30 -0700 Subject: [PATCH] parsing --- pitfall/pitfall/old/parse copy.rkt | 98 -------------------------- pitfall/pitfall/parse.rkt | 108 +++++++++++++++++++++++++++++ pitfall/pitfall/test/minimal.pdf | 58 ++++++++++++++++ pitfall/sugar/port.rkt | 7 +- 4 files changed, 171 insertions(+), 100 deletions(-) delete mode 100644 pitfall/pitfall/old/parse copy.rkt create mode 100644 pitfall/pitfall/parse.rkt create mode 100644 pitfall/pitfall/test/minimal.pdf diff --git a/pitfall/pitfall/old/parse copy.rkt b/pitfall/pitfall/old/parse copy.rkt deleted file mode 100644 index 90e19d4f..00000000 --- a/pitfall/pitfall/old/parse copy.rkt +++ /dev/null @@ -1,98 +0,0 @@ -#lang at-exp br/quicklang -(require "parser.rkt" "tokenizer.rkt" "struct.rkt" gregor racket/bytes) -(provide (matching-identifiers-out #rx"pf-" (all-defined-out))) - -(module+ test (require rackunit)) - -(module+ reader (provide read-syntax)) - -(define (read-syntax src port) - ;; use latin-1 reencoding to make one char = one byte (note `latin1` respelling is correct) - (define parse-tree (parse (make-tokenizer (reencode-input-port port "latin1") src))) - (strip-bindings - #`(module pitfall-parse-mod pitfall/parse - #,parse-tree))) - -(define-macro (my-mb ARG ...) - #'(#%module-begin ARG ...)) -(provide (rename-out [my-mb #%module-begin]) - require) - -(provide null) - -(define-macro (pf-program COS-OBJECT ...) - #'(begin COS-OBJECT ...)) - -(define (pf-name str) - (let* ([str (string-trim str "/" #:right? #f)] - [str (regexp-replace* @pregexp{#(\d\d)} str (λ (m sub) (string (integer->char (string->number sub 16)))))]) - (string->symbol str))) - -(module+ test - (check-equal? (pf-name "B#45#20NICE") '|BE NICE|)) - -(define (pf-string arg . tail) - (cond - [(andmap byte? (cons arg tail)) (cons arg tail)] - [(string-prefix? arg "D:") - #;(parameterize ([current-locale "en"]) - (parse-date "2015-03-15T02:02:02-04:00" "yyyy-MM-dd'T'HH:mm:ssxxx")) - #f] - [else - (let* ([str (regexp-replace @regexp{^\((.*)\)$} arg "\\1")] ; remove parens - [str (string-replace str (string-append "\\" "\n") "")] - [str (regexp-replace* @pregexp{\\(n|r|t|b|f|\(|\)|\\)} str (λ (m sub) - (case sub - [("n") "\n"] - [("r") "\r"] - [("t") "\t"] - [("b") "\b"] - [("f") "\f"] - [else sub])))] - [str (regexp-replace* @pregexp{\\(\d{2,3})} str (λ (m sub) (string (integer->char (string->number sub 8)))))]) - (co-string str))])) - -(module+ test - (check-equal? @pf-string{(Testing)} (co-string "Testing")) - (check-equal? (pf-string @string-append{(Test\ - ing)}) (co-string "Testing")) - (check-equal? @pf-string{(Test\)ing)} (co-string "Test)ing")) - (check-equal? @pf-string{(Test\ning)} (co-string "Test\ning")) - (check-equal? @pf-string{(Test\\ing)} (co-string "Test\\ing")) - (check-equal? @pf-string{(A\53B)} (co-string "A+B")) - (check-equal? @pf-string{(A\053B)} (co-string "A+B")) - #;(check-equal? @pf-string{(D:19990209153925-08\'00\')}) - #;(check-true (andmap byte? @pf-string{<1C2D3F>})) - #;(check-true (andmap byte? @pf-string{<1C 2D 3F>}))) - -(define (pf-array . xs) (co-array xs)) - -(define (pf-dict . args) - (co-dict (apply hasheq args))) - - -(define (pf-stream dict str) - (define data (string->bytes/utf-8 str)) - (when (not (equal? (hash-ref (co-dict-dict dict) 'Length) (bytes-length data))) - (raise-argument-error 'pf-stream - (format "~a bytes of data" (hash-ref (co-dict-dict dict) 'Length)) - (format "~a = ~v" (bytes-length data) data))) - (co-stream dict data)) - -(define (pf-indirect-object obj gen thing) - (co-io obj gen thing)) - -(define-macro (pf-indirect-object-ref (OBJ GEN _)) - #'(co-io-ref OBJ GEN)) - -(define (pf-header num) (co-header num)) - -(define (pf-comment text) (co-comment text)) - -(define (pf-trailer dict) - (let ([h (co-dict-dict dict)]) - (unless (and (hash-has-key? h 'Size) (hash-has-key? h 'Root)) - (raise-argument-error 'pf-trailer - "Size and Root keys are required for trailer" - (hash-keys h)))) - (co-trailer dict)) diff --git a/pitfall/pitfall/parse.rkt b/pitfall/pitfall/parse.rkt new file mode 100644 index 00000000..b355404b --- /dev/null +++ b/pitfall/pitfall/parse.rkt @@ -0,0 +1,108 @@ +#lang br +(require racket/file sugar/cache sugar/port racket/dict) + +(define dict-key-pattern #px"^/\\S+") + +(struct ref (id) #:transparent) + +(define (trim-whitespace bs) ; can't use string-trim with bytes + (cadr (regexp-match #px"^\\s*(.*?)\\s*$" bs))) + +(define (parse-dict p) + (define terms (for/list ([next (in-port parse-one p)] + #:break (eq? next 'end-dict)) + next)) + (for/list ([k (in-list terms)] + [v (in-list (cdr terms))] + [i (in-naturals)] + #:when (even? i)) + (cons k v))) + +(define (parse-list p) + (for/list ([next (in-port parse-one p)] + #:break (eq? next 'end-list)) + next)) + +(define (parse-one p) + (let loop ([p-in p]) + (define p (if (input-port? p-in) p-in (open-input-bytes p-in))) + (cond + ;; skip whitespace + [(regexp-try-match #px"^\\s+" p) (loop p)] + ;; indirect reference => reference int + [(regexp-try-match #px"^(\\d+) (\\d+) R" p) => (λ (m) (string->symbol (format "ref-~a" (loop (cadr m)))))] + ;; object => trim and reparse + [(regexp-try-match #px"^\\d+ 0 obj(.*?)endobj" p) => (λ (m) (loop (cadr m)))] + ;; integer string => int + [(regexp-try-match #px"^\\d+" p) => (λ (m) (string->number (bytes->string/utf-8 (car m))))] + ;; dict => dispatch to handler + [(regexp-try-match #rx"^<<" p) (parse-dict p)] + [(regexp-try-match #rx"^>>" p) 'end-dict] + ;; list => unroll + [(regexp-try-match #rx"^\\[" p) (parse-list p)] + [(regexp-try-match #rx"^]" p) 'end-list] + ;; slash-prefixed dictionary key => symbol + [(regexp-try-match dict-key-pattern p) => (λ (m) (string->symbol (string-trim (bytes->string/utf-8 (car m)) "/" #:right? #f)))] + [(regexp-try-match #rx"^startxref" p) (loop p)] ; pulls off next item, which is an integer + [(regexp-try-match #rx"^<([0123456789abcdef]*)>" p) => (λ (m) (cadr m))] + [(eof-object? (peek-bytes 1 0 p)) eof] + [else (report* (file-position p) (peek-bytes 3 0 p)) (error 'unknown-type)]))) + +(struct trailer (dict xref-ptr) #:transparent) +(define (parse-trailer p) + (define eof-position (file-size (object-name p))) + (define trailer-pos (for*/first ([pos (in-range eof-position 0 -1)] + [m (in-value (regexp-try-match "^\ntrailer\n.*" p pos))] + #:when m) + pos)) + (define dict-offset (caar (regexp-match-positions #rx"<<.*?>>" (port-position p trailer-pos)))) + (define trailer-dict (parse-one (port-position p (+ trailer-pos dict-offset)))) + (define xref-ptr (parse-one p)) + (trailer trailer-dict xref-ptr)) + + +(define (parse-xref p offset) + (define bs (car (regexp-match #rx"(?<=xref).*?(?=trailer)" (port-position p offset)))) + (define str (bytes->string/utf-8 bs)) + (for/hash ([k (in-naturals 1)] + [v (in-list (drop (string-split str "\n") 2))]) + (values k (string->number (car (regexp-match #px"^\\d+" v)))))) + +(struct $pdf (port xrefs) #:transparent) +(define/caching (doc-ref pdf idx) + (define offset (hash-ref ($pdf-xrefs pdf) idx)) + (parse-one (port-position ($pdf-port pdf) offset))) + +(define/caching (doc-ref* pdf idx) + ;; resolve nested ref-x pointers + (define visited empty) ; prevent circular references by tracking visited refs + (let loop ([result (doc-ref pdf idx)]) + (cond + [(dict? result) (for/list ([(k v) (in-dict result)]) + (cons k (loop v)))] + [(list? result) (map loop result)] + [(and (symbol? result) (regexp-match #px"^ref-(\\d+)" (symbol->string result))) + => (λ (m) (define next-idx (string->number (cadr m))) + (define old-visited visited) + (set! visited (cons next-idx visited)) + (if (member next-idx old-visited) + result + (loop (doc-ref pdf next-idx))))] + [else result]))) + + +(define (port->pdf p) + ($pdf p (parse-xref p (trailer-xref-ptr (parse-trailer p))))) + +(module+ test + (require rackunit) + (define p (open-input-file "test/minimal.pdf")) + (define pdf (port->pdf p)) + (check-equal? (doc-ref pdf 1) '((Type . Catalog) (Pages . ref-2))) + (check-equal? (doc-ref pdf 2) '((Type . Pages) (Kids ref-3) (Count . 1) (MediaBox 0 0 300 144))) + (check-equal? (doc-ref pdf 3) '((Type . Page) (Parent . ref-2) (Resources (Font (F1 (Type . Font) (Subtype . Type1) (BaseFont . Times-Roman)))) (Contents . ref-4))) + (check-equal? (doc-ref pdf 4) '((Length . 55)))) + + +(define p (open-input-file "test/test-cff/textedit-sample.pdf")) +(define pdf (port->pdf p)) diff --git a/pitfall/pitfall/test/minimal.pdf b/pitfall/pitfall/test/minimal.pdf new file mode 100644 index 00000000..1c641810 --- /dev/null +++ b/pitfall/pitfall/test/minimal.pdf @@ -0,0 +1,58 @@ +%PDF-1.1 +%¥±ë + +1 0 obj + << /Type /Catalog + /Pages 2 0 R + >> +endobj + +2 0 obj + << /Type /Pages + /Kids [3 0 R] + /Count 1 + /MediaBox [0 0 300 144] + >> +endobj + +3 0 obj + << /Type /Page + /Parent 2 0 R + /Resources + << /Font + << /F1 + << /Type /Font + /Subtype /Type1 + /BaseFont /Times-Roman + >> + >> + >> + /Contents 4 0 R + >> +endobj + +4 0 obj + << /Length 55 >> +stream + BT + /F1 18 Tf + 0 0 Td + (Hello World) Tj + ET +endstream +endobj + +xref +0 5 +0000000000 65535 f +0000000018 00000 n +0000000077 00000 n +0000000178 00000 n +0000000457 00000 n +trailer + << /Root 1 0 R + /Size 5 + >> +startxref +565 +%%EOF diff --git a/pitfall/sugar/port.rkt b/pitfall/sugar/port.rkt index 70f9528b..d90b6b31 100644 --- a/pitfall/sugar/port.rkt +++ b/pitfall/sugar/port.rkt @@ -2,8 +2,11 @@ (require racket/port) (provide (all-defined-out) (all-from-out racket/port)) -(define (port-position ip) - (file-position ip)) +(define (port-position ip [where #f]) + (cond + [where (file-position ip where) + ip] + [else (file-position ip)])) (define (set-port-position! ip where) (file-position ip where))