#lang debug racket (require rackunit (prefix-in zlib: fontland/zlib) fontland/table/cff/cff-top) (provide (all-defined-out)) (define (xref-offset bs) (match (regexp-match #px"(?<=startxref\n)\\d+" bs) [(list val) (read (open-input-bytes val))] [_ (error 'no-xref)])) (define-syntax (pat-lex stx) (syntax-case stx (else) [(_ PORT [PAT . REST] ... [else ELSE-CLAUSE]) (with-syntax ([(REST ...) (map (λ (stx) (syntax-case stx () [() #'(=> car)] [_ stx])) (syntax->list #'(REST ...)))]) #'(cond [(regexp-try-match (pregexp (string-append "^" PAT)) PORT) . REST] ... [else ELSE-CLAUSE]))])) (define (between-delimiters bs left right) (parameterize ([current-input-port (if (input-port? bs) bs (open-input-bytes bs))]) (let loop ([acc null][depth 0]) (cond [(regexp-try-match (pregexp (string-append "^" (regexp-quote (~a left)))) (current-input-port)) => (λ (m) (loop (if (empty? acc) acc ; impliedly throw away first left delimiter (cons (car m) acc)) (add1 depth)))] [(regexp-try-match (pregexp (string-append "^" (regexp-quote (~a right)))) (current-input-port)) => (λ (m) (case depth [(1) (bytes-append* (reverse acc))] [else (loop (cons (car m) acc) (sub1 depth))]))] [else (define bstr (read-byte)) (and (not (eof-object? bstr)) (loop (if (zero? depth) acc ; impliedly throw away leading non-delimiter bytes (cons (bytes bstr) acc)) depth))])))) (module+ test (require rackunit) (define bs #"a<>x<>d>>e<>g") (check-equal? (between-delimiters bs #"<<" #">>") #"b<>x<>d") (check-equal? (between-delimiters (between-delimiters bs #"<<" #">>") #"<<" #">>") #"c") (check-false (between-delimiters #"abc" #"<<" #">>")) (check-equal? (between-delimiters #"[a[b]c]" #"[" #"]") #"a[b]c") (check-equal? (let ([ip (open-input-bytes #"<>z")]) (parse-1 ip) (port->bytes ip)) #"z")) (define excluded-keys (list #"/Producer" #"/Creator" #"/CreationDate" #"/ModDate" #"/Keywords" #"/Title" #"/Author" #"/Subject")) (define (parse-1 ip) (cond ;; the complication is that arrays & dicts can contain other arrays & dicts ;; so we have to scan ahead in an intelligent way. [(equal? (peek-bytes 1 0 ip) #"[") ;; array (parse-pdf-bytes (between-delimiters ip #"[" #"]"))] [(equal? (peek-bytes 2 0 ip) #"<<") ;; dict, maybe with stream (define items (parse-pdf-bytes (between-delimiters ip #"<<" #">>"))) (unless (even? (length items)) (raise items)) (define dic (sort ; put hash into order so it's comparable (for/list ([kv (in-slice 2 items)] ;; suppress these keys so we can compare pdfkit & pitfall output #:unless (member (car kv) excluded-keys)) (apply cons kv)) bytes cadr] ; font keystring. prefix is random, so ignore ["/\\S+"] ; keystring [else eof])])) (define (parse-pdf-bytes bs) (for/list ([tok (in-port parse-1 (open-input-bytes bs))]) tok)) (define (pdf->dict pdf) (define pdf-bs (if (bytes? pdf) pdf (file->bytes pdf))) (define xoff (xref-offset pdf-bs)) (define xref-ip (open-input-bytes (subbytes pdf-bs (+ xoff (bytes-length #"xref\n0"))))) (define ref-count (read xref-ip)) (define obj-locations (append (sort ; sort by byte offset (cdr ; drop zeroth record: there is no zeroth object (for/list ([i (in-range ref-count)]) (cons i (read (open-input-bytes (car (regexp-match #px"\\d{10}" xref-ip))))))) < #:key cdr) (list (cons #f xoff)))) (sort ; sort by index (parameterize ([current-input-port (open-input-bytes pdf-bs)]) (for/list ([(idx start) (in-dict obj-locations)] [(_ end) (in-dict (cdr obj-locations))]) (cons idx (car (parse-pdf-bytes (peek-bytes (- end start) start)))))) < #:key car)) (define (dict-compare arg1 arg2 [obj-idx #f]) (define d1 (if (dict? arg1) arg1 (pdf->dict arg1))) (define d2 (if (dict? arg2) arg2 (pdf->dict arg2))) (unless (dict? d1) (error "d1 is not a dict")) (unless (dict? d2) (error "d2 is not a dict")) (unless (= (length d1) (length d2)) (error (format "dict lengths different in d1 (~a) and d2 (~a)" (length d1) (length d2)))) (for/and ([(k1 v1) (in-dict d1)] [(k2 v2) (in-dict d2)]) (define current-object-idx (or obj-idx k1)) (cond [(dict? v1) (dict-compare v1 v2 current-object-idx)] [(not (equal? k1 k2)) (error (format "keys unequal in object ~a: ~a ≠ ~a" current-object-idx k1 k2))] [(not (equal? v1 v2)) (error (format "values unequal in object ~a for key ~e: ~e ≠ ~e" current-object-idx k1 v1 v2))] [else #true]))) (define-simple-check (check-headers-equal? ps1 ps2) (equal? (peek-bytes 14 0 (open-input-file ps1)) (peek-bytes 14 0 (open-input-file ps2)))) (define-simple-check (check-pdfs-equal? ps1 ps2) (dict-compare ps1 ps2)) (define-simple-check (check-font-subsets-equal? f1 f2) (define misses null) (define (dump val) (cond [(promise? val) 'promise-omitted] [(vector? val) (dump (vector->list val))] [(dict? val) (for/list ([(k v) (in-dict (sort (dict->list val) #:key car symboldict f1) 8) 'stream)) (define cfftop1 (dump (send CFFTop x:decode (open-input-bytes ibs1)))) (define ibs2 (dict-ref (dict-ref (pdf->dict f2) 8) 'stream)) (define cfftop2 (dump (send CFFTop x:decode (open-input-bytes ibs2)))) (cmp cfftop1 cfftop2) (check-true (null? misses))) #;(module+ main (for ([p (in-directory)] #:when (path-has-extension? p #"pdf")) (with-handlers ([exn:fail? (λ (exn) (println (format "~a failed" p)))]) (pdf->dict p))))