diff --git a/pitfall/ptest/check-pdf.rkt b/pitfall/ptest/check-pdf.rkt index 4c90520f..35d4a2c0 100644 --- a/pitfall/ptest/check-pdf.rkt +++ b/pitfall/ptest/check-pdf.rkt @@ -17,37 +17,73 @@ #'(cond [(regexp-try-match (pregexp (string-append "^" PAT)) PORT) . REST] ... [else ELSE-CLAUSE]))])) -(define (parse-1 x) - (pat-lex x - ["\\s+" (parse-1 x)] ; whitespace - ["\\d+ 0 obj" (parse-1 x)] ;; obj name - ["(<<\n.*\n>>)\\s*stream\n" ; dict with stream - => (λ (m) - (define d (parse-1 (open-input-bytes (cadr m)))) - (define stream-length - (read (open-input-bytes (cdr (assoc #"/Length" d))))) - (define stream (read-bytes stream-length x)) - (append d (list (cons 'stream stream))))] - ["<<\n(.*)\n>>" ;; dict - => (λ (m) - (define items (parse-pdf-bytes (cadr m))) - (unless (even? (length items)) - (raise items)) - (sort ; put hash into order so it's comparable - (for/list ([kv (in-slice 2 items)]) - (apply cons kv)) - bytes (compose1 parse-pdf-bytes cadr)] ;; list - ["\\d+ 0 R"] ; xref - ["[-]?\\d+"] ; number - ["\\(.*?\\)"] ; parenstring - ["/\\S+"] ; keystring - [else eof])) +(define (between-delimiters bs left right) + (parameterize ([current-input-port (if (input-port? bs) bs (open-input-bytes bs))]) + (let loop ([acc null][depth 0]) + (cond + [(regexp-try-match (pregexp (string-append "^" (regexp-quote (~a left)))) (current-input-port)) + => (λ (m) + (loop (if (empty? acc) + acc ; impliedly throw away first left delimiter + (cons (car m) acc)) (add1 depth)))] + [(regexp-try-match (pregexp (string-append "^" (regexp-quote (~a right)))) (current-input-port)) + => (λ (m) + (case depth + [(1) (bytes-append* (reverse acc))] + [else (loop (cons (car m) acc) (sub1 depth))]))] + [else + (define bstr (read-byte)) + (and (not (eof-object? bstr)) + (loop (if (zero? depth) + acc ; impliedly throw away leading non-delimiter bytes + (cons (bytes bstr) acc)) depth))])))) + +(module+ test + (require rackunit) + (define bs #"a<>x<>d>>e<>g") + (check-equal? (between-delimiters bs #"<<" #">>") #"b<>x<>d") + (check-equal? (between-delimiters (between-delimiters bs #"<<" #">>") #"<<" #">>") #"c") + (check-false (between-delimiters #"abc" #"<<" #">>")) + (check-equal? (between-delimiters #"[a[b]c]" #"[" #"]") #"a[b]c") + (check-equal? (let ([ip (open-input-bytes #"<>z")]) + (parse-1 ip) + (port->bytes ip)) #"z")) + +(define (parse-1 ip) + (cond + ;; the complication is that arrays & dicts can contain other arrays & dicts + ;; so we have to scan ahead in an intelligent way. + [(equal? (peek-bytes 1 0 ip) #"[") ;; array + (parse-pdf-bytes (between-delimiters ip #"[" #"]"))] + [(equal? (peek-bytes 2 0 ip) #"<<") ;; dict, maybe with stream + (define items (parse-pdf-bytes (between-delimiters ip #"<<" #">>"))) + (unless (even? (length items)) (raise items)) + (define dic + (sort ; put hash into order so it's comparable + (for/list ([kv (in-slice 2 items)]) + (apply cons kv)) + bytesdict pdf) (define pdf-bs (file->bytes pdf)) @@ -59,17 +95,20 @@ (sort ; sort by byte offset (cdr ; drop zeroth record: there is no zeroth object (for/list ([i (in-range ref-count)]) - (cons i (read (open-input-bytes (car (regexp-match #px"\\d{10}" xref-ip))))))) + (cons i (read (open-input-bytes (car (regexp-match #px"\\d{10}" xref-ip))))))) < #:key cdr) (list (cons #f xoff)))) (sort ; sort by index (parameterize ([current-input-port (open-input-bytes pdf-bs)]) (for/list ([(idx start) (in-dict obj-locations)] [(_ end) (in-dict (cdr obj-locations))]) - (cons idx (parse-pdf-bytes (peek-bytes (- end start) start))))) + (cons idx (parse-pdf-bytes (peek-bytes (- end start) start))))) < #:key car)) (define-simple-check (check-pdfs-equal? ps1 ps2) (equal? (pdf->dict ps1) (pdf->dict ps2))) - +(for ([p (in-directory)] + #:when (path-has-extension? p #"pdf")) + (with-handlers ([exn:fail? (λ (exn) (println (format "~a failed" p)))]) + (pdf->dict p))) diff --git a/pitfall/ptest/pushy.rkt b/pitfall/ptest/pushy.rkt deleted file mode 100644 index 1fd90676..00000000 --- a/pitfall/ptest/pushy.rkt +++ /dev/null @@ -1,24 +0,0 @@ -#lang debug br -(require racket/bytes) - - -(define (outer-delimiter bs left right) - (parameterize ([current-input-port (open-input-bytes bs)]) - (let loop ([acc null][depth 0]) - (cond - [(regexp-try-match (regexp (format "^~a" left)) (current-input-port)) - => (λ (m) (loop (cons (car m) acc) (add1 depth)))] - [(regexp-try-match (regexp (format "^~a" right)) (current-input-port)) - => (λ (m) - (case depth - [(= depth 1) (apply bytes-append (reverse (cons (car m) acc)))] - [else (loop (cons (car m) acc) (sub1 depth))]))] - [else - (define b (read-byte)) - (loop (if (empty? acc) - acc ; drop leading non-delimiter bytes - (cons (bytes b) acc)) depth)])))) - -(define bs #"aaa<>xxx<>ddd>>eee<>ggg") - -(outer-delimiter bs #"<<" #">>") \ No newline at end of file