From 943340314df9837e8ab4980ce8a6a45cbb2a69eb Mon Sep 17 00:00:00 2001 From: Matthew Butterick Date: Sat, 17 Nov 2018 18:43:02 -0800 Subject: [PATCH] wicked --- pitfall/ptest/check-pdf.rkt | 69 ++++++++++++++++++++++++++++++++++ pitfall/ptest/parse-pdf.rkt | 74 ------------------------------------- 2 files changed, 69 insertions(+), 74 deletions(-) create mode 100644 pitfall/ptest/check-pdf.rkt delete mode 100644 pitfall/ptest/parse-pdf.rkt diff --git a/pitfall/ptest/check-pdf.rkt b/pitfall/ptest/check-pdf.rkt new file mode 100644 index 00000000..26ce01b9 --- /dev/null +++ b/pitfall/ptest/check-pdf.rkt @@ -0,0 +1,69 @@ +#lang debug racket +(require rackunit) +(provide check-pdfs-equal?) + +(define (xref-offset bs) + (match (regexp-match #px"(?<=startxref\n)\\d+" bs) + [(list val) (read (open-input-bytes val))] + [_ (error 'no-xref)])) + +(define-syntax (pat-lex stx) + (syntax-case stx (else) + [(_ PORT [PAT . REST] ... [else ELSE-CLAUSE]) + (with-syntax ([(REST ...) + (map (λ (stx) (syntax-case stx () + [() #'(=> car)] + [_ stx])) (syntax->list #'(REST ...)))]) + #'(cond + [(regexp-try-match (pregexp (string-append "^" PAT)) PORT) . REST] ... [else ELSE-CLAUSE]))])) + +(define (parse-1 x) + (pat-lex x + ["\\s+" (parse-1 x)] ; whitespace + ["\\d+ 0 obj" (parse-1 x)] ;; obj name + ["stream\n(.*?)\nendstream" => cadr] ;; stream + ["<<\n(.*)\n>>" ;; dict + => (λ (m) + (define items (parse-pdf-bytes (cadr m))) + (unless (even? (length items)) + (raise items)) + (sort ; put hash into order so it's comparable + (for/list ([kv (in-slice 2 items)]) + (apply cons kv)) + bytes (compose1 parse-pdf-bytes cadr)] ;; list + ["\\d+ 0 R"] ; xref + ["[-]?\\d+"] ; number + ["\\(.*?\\)"] ; parenstring + ["/\\S+"] ; keystring + [else eof])) + +(define (parse-pdf-bytes bs) + (for/list ([tok (in-port parse-1 (open-input-bytes bs))]) + tok)) + +(define (pdf->dict pdf) + (define pdf-bs (file->bytes pdf)) + (define xoff (xref-offset pdf-bs)) + (define xref-ip (open-input-bytes (subbytes pdf-bs (+ xoff (bytes-length #"xref\n0"))))) + (define ref-count (read xref-ip)) + (define obj-locations + (append + (sort ; sort by byte offset + (cdr ; drop zeroth record: there is no zeroth object + (for/list ([i (in-range ref-count)]) + (cons i (read (open-input-bytes (car (regexp-match #px"\\d{10}" xref-ip))))))) + < #:key cdr) + (list (cons #f xoff)))) + (sort ; sort by index + (parameterize ([current-input-port (open-input-bytes pdf-bs)]) + (for/list ([(idx start) (in-dict obj-locations)] + [(_ end) (in-dict (cdr obj-locations))]) + (cons idx (parse-pdf-bytes (peek-bytes (- end start) start))))) + < #:key car)) + +(define-simple-check (check-pdfs-equal? ps1 ps2) + (equal? (pdf->dict ps1) (pdf->dict ps2))) + + diff --git a/pitfall/ptest/parse-pdf.rkt b/pitfall/ptest/parse-pdf.rkt deleted file mode 100644 index 509436d8..00000000 --- a/pitfall/ptest/parse-pdf.rkt +++ /dev/null @@ -1,74 +0,0 @@ -#lang debug racket - -(define pdf (file->bytes "test11rkt.pdf")) -(define pdf2 (file->bytes "test11rkt copy.pdf")) - -(define (xref-offset pdf) - (match (regexp-match #px"(?<=startxref\n)\\d+" pdf) - [(list val) (read (open-input-bytes val))] - [_ (error 'no-xref)])) - -(define (parse-1-pdf-item x) - (cond - ;; obj name - [(regexp-try-match #px"^\\d+ 0 obj" x) #false] - ;; stream - [(regexp-try-match #px"^stream\n(.*?)\nendstream" x) => cadr] - ;; dict - [(regexp-try-match #px"^<<\n(.*)\n>>" x) - => (λ (m) - (define items (parse-pdf-items (cadr m))) - (unless (even? (length items)) - (raise items)) - (sort - (for/list ([kv (in-slice 2 items)]) - (apply cons kv)) - bytes (compose1 parse-pdf-items cadr)] - ;; xref - [(regexp-try-match #px"^\\d+ 0 R" x) => car] - ;; number - [(regexp-try-match #px"^[-]?\\d+" x) => car] - ;; parenstring - [(regexp-try-match #px"^\\(.*?\\)" x) => car] - ;; keystring - [(regexp-try-match #px"^/\\S+" x) => car] - ;; whitespace - [(regexp-match #px"\\s" x) #false] - [else eof])) - -(define (parse-pdf-items bs) - (for/list ([tok (in-port parse-1-pdf-item (open-input-bytes bs))] - #:when tok) - tok)) - -(define (pdf->dict pdf) - (define xoff (xref-offset pdf)) - (define ip (open-input-bytes pdf)) - (void (read-bytes xoff ip)) - (define ref-count - (match (regexp-match #px"(?<=xref\n0 )\\d+" ip) - [(list val) (read (open-input-bytes val))] - [_ (error 'no-xref-count)])) - (define obj-locations - (append - ;; sort by byte offset - (sort - (cdr ; drop zeroth record: there is no zeroth object - (for/list ([i ref-count]) - (cons i (read (open-input-bytes (car (regexp-match #px"\\d{10}" ip))))))) < #:key cdr) - (list (cons #f xoff)))) - (define ip2 (open-input-bytes pdf)) - (sort - (hash->list - (for/hash ([(idx start) (in-dict obj-locations)] - [(_ end) (in-dict (cdr obj-locations))]) - (values idx (parse-pdf-items (peek-bytes (- end start) start ip2))))) - < #:key car)) - -(equal? - (pdf->dict pdf) - (pdf->dict pdf2) - ) \ No newline at end of file