#lang debug racket (require rackunit) (provide (all-defined-out)) (define (xref-offset bs) (match (regexp-match #px"(?<=startxref\n)\\d+" bs) [(list val) (read (open-input-bytes val))] [_ (error 'no-xref)])) (define-syntax (pat-lex stx) (syntax-case stx (else) [(_ PORT [PAT . REST] ... [else ELSE-CLAUSE]) (with-syntax ([(REST ...) (map (λ (stx) (syntax-case stx () [() #'(=> car)] [_ stx])) (syntax->list #'(REST ...)))]) #'(cond [(regexp-try-match (pregexp (string-append "^" PAT)) PORT) . REST] ... [else ELSE-CLAUSE]))])) (define (between-delimiters bs left right) (parameterize ([current-input-port (if (input-port? bs) bs (open-input-bytes bs))]) (let loop ([acc null][depth 0]) (cond [(regexp-try-match (pregexp (string-append "^" (regexp-quote (~a left)))) (current-input-port)) => (λ (m) (loop (if (empty? acc) acc ; impliedly throw away first left delimiter (cons (car m) acc)) (add1 depth)))] [(regexp-try-match (pregexp (string-append "^" (regexp-quote (~a right)))) (current-input-port)) => (λ (m) (case depth [(1) (bytes-append* (reverse acc))] [else (loop (cons (car m) acc) (sub1 depth))]))] [else (define bstr (read-byte)) (and (not (eof-object? bstr)) (loop (if (zero? depth) acc ; impliedly throw away leading non-delimiter bytes (cons (bytes bstr) acc)) depth))])))) (module+ test (require rackunit) (define bs #"a<>x<>d>>e<>g") (check-equal? (between-delimiters bs #"<<" #">>") #"b<>x<>d") (check-equal? (between-delimiters (between-delimiters bs #"<<" #">>") #"<<" #">>") #"c") (check-false (between-delimiters #"abc" #"<<" #">>")) (check-equal? (between-delimiters #"[a[b]c]" #"[" #"]") #"a[b]c") (check-equal? (let ([ip (open-input-bytes #"<>z")]) (parse-1 ip) (port->bytes ip)) #"z")) (define (parse-1 ip) (cond ;; the complication is that arrays & dicts can contain other arrays & dicts ;; so we have to scan ahead in an intelligent way. [(equal? (peek-bytes 1 0 ip) #"[") ;; array (parse-pdf-bytes (between-delimiters ip #"[" #"]"))] [(equal? (peek-bytes 2 0 ip) #"<<") ;; dict, maybe with stream (define items (parse-pdf-bytes (between-delimiters ip #"<<" #">>"))) (unless (even? (length items)) (raise items)) (define dic (sort ; put hash into order so it's comparable (for/list ([kv (in-slice 2 items)]) (apply cons kv)) bytesdict pdf) (define pdf-bs (file->bytes pdf)) (define xoff (xref-offset pdf-bs)) (define xref-ip (open-input-bytes (subbytes pdf-bs (+ xoff (bytes-length #"xref\n0"))))) (define ref-count (read xref-ip)) (define obj-locations (append (sort ; sort by byte offset (cdr ; drop zeroth record: there is no zeroth object (for/list ([i (in-range ref-count)]) (cons i (read (open-input-bytes (car (regexp-match #px"\\d{10}" xref-ip))))))) < #:key cdr) (list (cons #f xoff)))) (sort ; sort by index (parameterize ([current-input-port (open-input-bytes pdf-bs)]) (for/list ([(idx start) (in-dict obj-locations)] [(_ end) (in-dict (cdr obj-locations))]) (cons idx (parse-pdf-bytes (peek-bytes (- end start) start))))) < #:key car)) (define-simple-check (check-pdfs-equal? ps1 ps2) (equal? (pdf->dict ps1) (pdf->dict ps2))) (for ([p (in-directory)] #:when (path-has-extension? p #"pdf")) (with-handlers ([exn:fail? (λ (exn) (println (format "~a failed" p)))]) (pdf->dict p)))