wicked
parent
470d476fbc
commit
943340314d
@ -0,0 +1,69 @@
|
|||||||
|
#lang debug racket
|
||||||
|
(require rackunit)
|
||||||
|
(provide check-pdfs-equal?)
|
||||||
|
|
||||||
|
(define (xref-offset bs)
|
||||||
|
(match (regexp-match #px"(?<=startxref\n)\\d+" bs)
|
||||||
|
[(list val) (read (open-input-bytes val))]
|
||||||
|
[_ (error 'no-xref)]))
|
||||||
|
|
||||||
|
(define-syntax (pat-lex stx)
|
||||||
|
(syntax-case stx (else)
|
||||||
|
[(_ PORT [PAT . REST] ... [else ELSE-CLAUSE])
|
||||||
|
(with-syntax ([(REST ...)
|
||||||
|
(map (λ (stx) (syntax-case stx ()
|
||||||
|
[() #'(=> car)]
|
||||||
|
[_ stx])) (syntax->list #'(REST ...)))])
|
||||||
|
#'(cond
|
||||||
|
[(regexp-try-match (pregexp (string-append "^" PAT)) PORT) . REST] ... [else ELSE-CLAUSE]))]))
|
||||||
|
|
||||||
|
(define (parse-1 x)
|
||||||
|
(pat-lex x
|
||||||
|
["\\s+" (parse-1 x)] ; whitespace
|
||||||
|
["\\d+ 0 obj" (parse-1 x)] ;; obj name
|
||||||
|
["stream\n(.*?)\nendstream" => cadr] ;; stream
|
||||||
|
["<<\n(.*)\n>>" ;; dict
|
||||||
|
=> (λ (m)
|
||||||
|
(define items (parse-pdf-bytes (cadr m)))
|
||||||
|
(unless (even? (length items))
|
||||||
|
(raise items))
|
||||||
|
(sort ; put hash into order so it's comparable
|
||||||
|
(for/list ([kv (in-slice 2 items)])
|
||||||
|
(apply cons kv))
|
||||||
|
bytes<?
|
||||||
|
#:key car))]
|
||||||
|
["\\[(.*?)\\]" => (compose1 parse-pdf-bytes cadr)] ;; list
|
||||||
|
["\\d+ 0 R"] ; xref
|
||||||
|
["[-]?\\d+"] ; number
|
||||||
|
["\\(.*?\\)"] ; parenstring
|
||||||
|
["/\\S+"] ; keystring
|
||||||
|
[else eof]))
|
||||||
|
|
||||||
|
(define (parse-pdf-bytes bs)
|
||||||
|
(for/list ([tok (in-port parse-1 (open-input-bytes bs))])
|
||||||
|
tok))
|
||||||
|
|
||||||
|
(define (pdf->dict pdf)
|
||||||
|
(define pdf-bs (file->bytes pdf))
|
||||||
|
(define xoff (xref-offset pdf-bs))
|
||||||
|
(define xref-ip (open-input-bytes (subbytes pdf-bs (+ xoff (bytes-length #"xref\n0")))))
|
||||||
|
(define ref-count (read xref-ip))
|
||||||
|
(define obj-locations
|
||||||
|
(append
|
||||||
|
(sort ; sort by byte offset
|
||||||
|
(cdr ; drop zeroth record: there is no zeroth object
|
||||||
|
(for/list ([i (in-range ref-count)])
|
||||||
|
(cons i (read (open-input-bytes (car (regexp-match #px"\\d{10}" xref-ip)))))))
|
||||||
|
< #:key cdr)
|
||||||
|
(list (cons #f xoff))))
|
||||||
|
(sort ; sort by index
|
||||||
|
(parameterize ([current-input-port (open-input-bytes pdf-bs)])
|
||||||
|
(for/list ([(idx start) (in-dict obj-locations)]
|
||||||
|
[(_ end) (in-dict (cdr obj-locations))])
|
||||||
|
(cons idx (parse-pdf-bytes (peek-bytes (- end start) start)))))
|
||||||
|
< #:key car))
|
||||||
|
|
||||||
|
(define-simple-check (check-pdfs-equal? ps1 ps2)
|
||||||
|
(equal? (pdf->dict ps1) (pdf->dict ps2)))
|
||||||
|
|
||||||
|
|
@ -1,74 +0,0 @@
|
|||||||
#lang debug racket
|
|
||||||
|
|
||||||
(define pdf (file->bytes "test11rkt.pdf"))
|
|
||||||
(define pdf2 (file->bytes "test11rkt copy.pdf"))
|
|
||||||
|
|
||||||
(define (xref-offset pdf)
|
|
||||||
(match (regexp-match #px"(?<=startxref\n)\\d+" pdf)
|
|
||||||
[(list val) (read (open-input-bytes val))]
|
|
||||||
[_ (error 'no-xref)]))
|
|
||||||
|
|
||||||
(define (parse-1-pdf-item x)
|
|
||||||
(cond
|
|
||||||
;; obj name
|
|
||||||
[(regexp-try-match #px"^\\d+ 0 obj" x) #false]
|
|
||||||
;; stream
|
|
||||||
[(regexp-try-match #px"^stream\n(.*?)\nendstream" x) => cadr]
|
|
||||||
;; dict
|
|
||||||
[(regexp-try-match #px"^<<\n(.*)\n>>" x)
|
|
||||||
=> (λ (m)
|
|
||||||
(define items (parse-pdf-items (cadr m)))
|
|
||||||
(unless (even? (length items))
|
|
||||||
(raise items))
|
|
||||||
(sort
|
|
||||||
(for/list ([kv (in-slice 2 items)])
|
|
||||||
(apply cons kv))
|
|
||||||
bytes<?
|
|
||||||
#:key car))]
|
|
||||||
;; list
|
|
||||||
[(regexp-try-match #px"^\\[(.*?)\\]" x) => (compose1 parse-pdf-items cadr)]
|
|
||||||
;; xref
|
|
||||||
[(regexp-try-match #px"^\\d+ 0 R" x) => car]
|
|
||||||
;; number
|
|
||||||
[(regexp-try-match #px"^[-]?\\d+" x) => car]
|
|
||||||
;; parenstring
|
|
||||||
[(regexp-try-match #px"^\\(.*?\\)" x) => car]
|
|
||||||
;; keystring
|
|
||||||
[(regexp-try-match #px"^/\\S+" x) => car]
|
|
||||||
;; whitespace
|
|
||||||
[(regexp-match #px"\\s" x) #false]
|
|
||||||
[else eof]))
|
|
||||||
|
|
||||||
(define (parse-pdf-items bs)
|
|
||||||
(for/list ([tok (in-port parse-1-pdf-item (open-input-bytes bs))]
|
|
||||||
#:when tok)
|
|
||||||
tok))
|
|
||||||
|
|
||||||
(define (pdf->dict pdf)
|
|
||||||
(define xoff (xref-offset pdf))
|
|
||||||
(define ip (open-input-bytes pdf))
|
|
||||||
(void (read-bytes xoff ip))
|
|
||||||
(define ref-count
|
|
||||||
(match (regexp-match #px"(?<=xref\n0 )\\d+" ip)
|
|
||||||
[(list val) (read (open-input-bytes val))]
|
|
||||||
[_ (error 'no-xref-count)]))
|
|
||||||
(define obj-locations
|
|
||||||
(append
|
|
||||||
;; sort by byte offset
|
|
||||||
(sort
|
|
||||||
(cdr ; drop zeroth record: there is no zeroth object
|
|
||||||
(for/list ([i ref-count])
|
|
||||||
(cons i (read (open-input-bytes (car (regexp-match #px"\\d{10}" ip))))))) < #:key cdr)
|
|
||||||
(list (cons #f xoff))))
|
|
||||||
(define ip2 (open-input-bytes pdf))
|
|
||||||
(sort
|
|
||||||
(hash->list
|
|
||||||
(for/hash ([(idx start) (in-dict obj-locations)]
|
|
||||||
[(_ end) (in-dict (cdr obj-locations))])
|
|
||||||
(values idx (parse-pdf-items (peek-bytes (- end start) start ip2)))))
|
|
||||||
< #:key car))
|
|
||||||
|
|
||||||
(equal?
|
|
||||||
(pdf->dict pdf)
|
|
||||||
(pdf->dict pdf2)
|
|
||||||
)
|
|
Loading…
Reference in New Issue