that wasn't too bad

main
Matthew Butterick 6 years ago
parent c0f8084fc9
commit 94fbf2ab94

@ -17,37 +17,73 @@
#'(cond
[(regexp-try-match (pregexp (string-append "^" PAT)) PORT) . REST] ... [else ELSE-CLAUSE]))]))
(define (parse-1 x)
(pat-lex x
["\\s+" (parse-1 x)] ; whitespace
["\\d+ 0 obj" (parse-1 x)] ;; obj name
["(<<\n.*\n>>)\\s*stream\n" ; dict with stream
=> (λ (m)
(define d (parse-1 (open-input-bytes (cadr m))))
(define stream-length
(read (open-input-bytes (cdr (assoc #"/Length" d)))))
(define stream (read-bytes stream-length x))
(append d (list (cons 'stream stream))))]
["<<\n(.*)\n>>" ;; dict
=> (λ (m)
(define items (parse-pdf-bytes (cadr m)))
(unless (even? (length items))
(raise items))
(sort ; put hash into order so it's comparable
(for/list ([kv (in-slice 2 items)])
(apply cons kv))
bytes<?
#:key car))]
["\\[(.*?)\\]" => (compose1 parse-pdf-bytes cadr)] ;; list
["\\d+ 0 R"] ; xref
["[-]?\\d+"] ; number
["\\(.*?\\)"] ; parenstring
["/\\S+"] ; keystring
[else eof]))
(define (between-delimiters bs left right)
(parameterize ([current-input-port (if (input-port? bs) bs (open-input-bytes bs))])
(let loop ([acc null][depth 0])
(cond
[(regexp-try-match (pregexp (string-append "^" (regexp-quote (~a left)))) (current-input-port))
=> (λ (m)
(loop (if (empty? acc)
acc ; impliedly throw away first left delimiter
(cons (car m) acc)) (add1 depth)))]
[(regexp-try-match (pregexp (string-append "^" (regexp-quote (~a right)))) (current-input-port))
=> (λ (m)
(case depth
[(1) (bytes-append* (reverse acc))]
[else (loop (cons (car m) acc) (sub1 depth))]))]
[else
(define bstr (read-byte))
(and (not (eof-object? bstr))
(loop (if (zero? depth)
acc ; impliedly throw away leading non-delimiter bytes
(cons (bytes bstr) acc)) depth))]))))
(module+ test
(require rackunit)
(define bs #"a<<b<<c>>x<<z>>d>>e<<f>>g")
(check-equal? (between-delimiters bs #"<<" #">>") #"b<<c>>x<<z>>d")
(check-equal? (between-delimiters (between-delimiters bs #"<<" #">>") #"<<" #">>") #"c")
(check-false (between-delimiters #"abc" #"<<" #">>"))
(check-equal? (between-delimiters #"[a[b]c]" #"[" #"]") #"a[b]c")
(check-equal? (let ([ip (open-input-bytes #"<</foo 42>>z")])
(parse-1 ip)
(port->bytes ip)) #"z"))
(define (parse-1 ip)
(cond
;; the complication is that arrays & dicts can contain other arrays & dicts
;; so we have to scan ahead in an intelligent way.
[(equal? (peek-bytes 1 0 ip) #"[") ;; array
(parse-pdf-bytes (between-delimiters ip #"[" #"]"))]
[(equal? (peek-bytes 2 0 ip) #"<<") ;; dict, maybe with stream
(define items (parse-pdf-bytes (between-delimiters ip #"<<" #">>")))
(unless (even? (length items)) (raise items))
(define dic
(sort ; put hash into order so it's comparable
(for/list ([kv (in-slice 2 items)])
(apply cons kv))
bytes<?
#:key car))
(cond
[(regexp-try-match #px"^\\s*stream\n" ip)
(define stream-length
(read (open-input-bytes (cdr (assoc #"/Length" dic)))))
(define stream (read-bytes stream-length ip))
(append dic (list (cons 'stream stream)))]
[else dic])]
[else
(pat-lex ip
["\\s+" (parse-1 ip)] ; whitespace
["\\d+ 0 obj" (parse-1 ip)] ;; obj name
["\\d+ 0 R"] ; xref
["[-]?\\d+"] ; number
["\\(.*?\\)"] ; parenstring
["/\\S+"] ; keystring
[else eof])]))
(define (parse-pdf-bytes bs)
(for/list ([tok (in-port parse-1 (open-input-bytes bs))])
tok))
tok))
(define (pdf->dict pdf)
(define pdf-bs (file->bytes pdf))
@ -59,17 +95,20 @@
(sort ; sort by byte offset
(cdr ; drop zeroth record: there is no zeroth object
(for/list ([i (in-range ref-count)])
(cons i (read (open-input-bytes (car (regexp-match #px"\\d{10}" xref-ip)))))))
(cons i (read (open-input-bytes (car (regexp-match #px"\\d{10}" xref-ip)))))))
< #:key cdr)
(list (cons #f xoff))))
(sort ; sort by index
(parameterize ([current-input-port (open-input-bytes pdf-bs)])
(for/list ([(idx start) (in-dict obj-locations)]
[(_ end) (in-dict (cdr obj-locations))])
(cons idx (parse-pdf-bytes (peek-bytes (- end start) start)))))
(cons idx (parse-pdf-bytes (peek-bytes (- end start) start)))))
< #:key car))
(define-simple-check (check-pdfs-equal? ps1 ps2)
(equal? (pdf->dict ps1) (pdf->dict ps2)))
(for ([p (in-directory)]
#:when (path-has-extension? p #"pdf"))
(with-handlers ([exn:fail? (λ (exn) (println (format "~a failed" p)))])
(pdf->dict p)))

@ -1,24 +0,0 @@
#lang debug br
(require racket/bytes)
(define (outer-delimiter bs left right)
(parameterize ([current-input-port (open-input-bytes bs)])
(let loop ([acc null][depth 0])
(cond
[(regexp-try-match (regexp (format "^~a" left)) (current-input-port))
=> (λ (m) (loop (cons (car m) acc) (add1 depth)))]
[(regexp-try-match (regexp (format "^~a" right)) (current-input-port))
=> (λ (m)
(case depth
[(= depth 1) (apply bytes-append (reverse (cons (car m) acc)))]
[else (loop (cons (car m) acc) (sub1 depth))]))]
[else
(define b (read-byte))
(loop (if (empty? acc)
acc ; drop leading non-delimiter bytes
(cons (bytes b) acc)) depth)]))))
(define bs #"aaa<<bbb<<ccc>>xxx<<zzz>>ddd>>eee<<fff>>ggg")
(outer-delimiter bs #"<<" #">>")
Loading…
Cancel
Save