From c2298d2659245c28a30bb6aec397fe851c31b4c0 Mon Sep 17 00:00:00 2001 From: Matthew Butterick Date: Mon, 17 Jul 2017 17:48:51 -0700 Subject: [PATCH] more parsing --- pitfall/pitfall/parse.rkt | 60 ++++++++++++++++++++++++++++++++------- 1 file changed, 49 insertions(+), 11 deletions(-) diff --git a/pitfall/pitfall/parse.rkt b/pitfall/pitfall/parse.rkt index b355404b..163ca9fc 100644 --- a/pitfall/pitfall/parse.rkt +++ b/pitfall/pitfall/parse.rkt @@ -1,7 +1,7 @@ #lang br -(require racket/file sugar/cache sugar/port racket/dict) +(require racket/file sugar/cache sugar/port racket/dict (prefix-in zlib: pitfall/zlib)) -(define dict-key-pattern #px"^/\\S+") +(define dict-key-pattern #px"^/[-:\\w]+") (struct ref (id) #:transparent) @@ -23,6 +23,37 @@ #:break (eq? next 'end-list)) next)) +(define current-pdf (make-parameter #f)) + +(struct $stream (dict bytes) #:transparent) + +(define (stream-decode bs filter) + (case filter + [(FlateDecode) (zlib:inflate bs)] + [else bs])) + +(define (indirect-ref? val) + (and (symbol? val) (regexp-match #rx"^ref-" (symbol->string val)))) + +(define (resolve-ref val) + (doc-ref (current-pdf) (string->number (string-trim (format "~a" val) "ref-")))) + +(define (parse-object p) + (define result (parse-one p)) + (cond + [(regexp-try-match #px"^\\s*stream(\n|\r\n?)" p) + (define stream-dict result) + (define stream-pos (port-position p)) + (define stream-length (let ([val (dict-ref stream-dict 'Length)]) + (if (indirect-ref? val) + (resolve-ref val) + val))) + (define stream-bytes (read-bytes stream-length (port-position p stream-pos))) + ($stream stream-dict (if (dict-has-key? stream-dict 'Filter) + (stream-decode stream-bytes (dict-ref stream-dict 'Filter)) + stream-bytes))] + [else result])) + (define (parse-one p) (let loop ([p-in p]) (define p (if (input-port? p-in) p-in (open-input-bytes p-in))) @@ -32,11 +63,11 @@ ;; indirect reference => reference int [(regexp-try-match #px"^(\\d+) (\\d+) R" p) => (λ (m) (string->symbol (format "ref-~a" (loop (cadr m)))))] ;; object => trim and reparse - [(regexp-try-match #px"^\\d+ 0 obj(.*?)endobj" p) => (λ (m) (loop (cadr m)))] + [(regexp-try-match #px"^\\d+ 0 obj" p) (parse-object p)] ;; integer string => int - [(regexp-try-match #px"^\\d+" p) => (λ (m) (string->number (bytes->string/utf-8 (car m))))] + [(regexp-try-match #px"^(-)?\\d+[.\\d*]?" p) => (λ (m) (string->number (bytes->string/utf-8 (car m))))] ;; dict => dispatch to handler - [(regexp-try-match #rx"^<<" p) (parse-dict p)] + [(regexp-try-match #rx"^<<" p) #;(report 'got-dict) (parse-dict p)] [(regexp-try-match #rx"^>>" p) 'end-dict] ;; list => unroll [(regexp-try-match #rx"^\\[" p) (parse-list p)] @@ -44,7 +75,7 @@ ;; slash-prefixed dictionary key => symbol [(regexp-try-match dict-key-pattern p) => (λ (m) (string->symbol (string-trim (bytes->string/utf-8 (car m)) "/" #:right? #f)))] [(regexp-try-match #rx"^startxref" p) (loop p)] ; pulls off next item, which is an integer - [(regexp-try-match #rx"^<([0123456789abcdef]*)>" p) => (λ (m) (cadr m))] + [(regexp-try-match #rx"^<([0123456789abcdefABCDEF]*)>" p) => (λ (m) (cadr m))] [(eof-object? (peek-bytes 1 0 p)) eof] [else (report* (file-position p) (peek-bytes 3 0 p)) (error 'unknown-type)]))) @@ -52,11 +83,13 @@ (define (parse-trailer p) (define eof-position (file-size (object-name p))) (define trailer-pos (for*/first ([pos (in-range eof-position 0 -1)] - [m (in-value (regexp-try-match "^\ntrailer\n.*" p pos))] + [m (in-value (regexp-try-match #px"^\\s+trailer\\s*" p pos))] #:when m) pos)) + (unless trailer-pos (error 'no-trailer-found)) (define dict-offset (caar (regexp-match-positions #rx"<<.*?>>" (port-position p trailer-pos)))) - (define trailer-dict (parse-one (port-position p (+ trailer-pos dict-offset)))) + (port-position p (+ trailer-pos dict-offset)) + (define trailer-dict (parse-one p)) (define xref-ptr (parse-one p)) (trailer trailer-dict xref-ptr)) @@ -65,13 +98,14 @@ (define bs (car (regexp-match #rx"(?<=xref).*?(?=trailer)" (port-position p offset)))) (define str (bytes->string/utf-8 bs)) (for/hash ([k (in-naturals 1)] - [v (in-list (drop (string-split str "\n") 2))]) + [v (in-list (drop (string-split str #px"\n|\r\n?") 2))]) (values k (string->number (car (regexp-match #px"^\\d+" v)))))) (struct $pdf (port xrefs) #:transparent) (define/caching (doc-ref pdf idx) (define offset (hash-ref ($pdf-xrefs pdf) idx)) - (parse-one (port-position ($pdf-port pdf) offset))) + (and (positive? offset) + (parse-one (port-position ($pdf-port pdf) offset)))) (define/caching (doc-ref* pdf idx) ;; resolve nested ref-x pointers @@ -101,8 +135,12 @@ (check-equal? (doc-ref pdf 1) '((Type . Catalog) (Pages . ref-2))) (check-equal? (doc-ref pdf 2) '((Type . Pages) (Kids ref-3) (Count . 1) (MediaBox 0 0 300 144))) (check-equal? (doc-ref pdf 3) '((Type . Page) (Parent . ref-2) (Resources (Font (F1 (Type . Font) (Subtype . Type1) (BaseFont . Times-Roman)))) (Contents . ref-4))) - (check-equal? (doc-ref pdf 4) '((Length . 55)))) + (check-equal? (doc-ref pdf 4) ($stream '((Length . 55)) #" BT\n /F1 18 Tf\n 0 0 Td\n (Hello World) Tj\n ET"))) (define p (open-input-file "test/test-cff/textedit-sample.pdf")) (define pdf (port->pdf p)) +(current-pdf pdf) +pdf + +(doc-ref pdf 1)