parsing
parent
7445bc9e91
commit
266f2f75a8
@ -1,98 +0,0 @@
|
||||
#lang at-exp br/quicklang
|
||||
(require "parser.rkt" "tokenizer.rkt" "struct.rkt" gregor racket/bytes)
|
||||
(provide (matching-identifiers-out #rx"pf-" (all-defined-out)))
|
||||
|
||||
(module+ test (require rackunit))
|
||||
|
||||
(module+ reader (provide read-syntax))
|
||||
|
||||
(define (read-syntax src port)
|
||||
;; use latin-1 reencoding to make one char = one byte (note `latin1` respelling is correct)
|
||||
(define parse-tree (parse (make-tokenizer (reencode-input-port port "latin1") src)))
|
||||
(strip-bindings
|
||||
#`(module pitfall-parse-mod pitfall/parse
|
||||
#,parse-tree)))
|
||||
|
||||
(define-macro (my-mb ARG ...)
|
||||
#'(#%module-begin ARG ...))
|
||||
(provide (rename-out [my-mb #%module-begin])
|
||||
require)
|
||||
|
||||
(provide null)
|
||||
|
||||
(define-macro (pf-program COS-OBJECT ...)
|
||||
#'(begin COS-OBJECT ...))
|
||||
|
||||
(define (pf-name str)
|
||||
(let* ([str (string-trim str "/" #:right? #f)]
|
||||
[str (regexp-replace* @pregexp{#(\d\d)} str (λ (m sub) (string (integer->char (string->number sub 16)))))])
|
||||
(string->symbol str)))
|
||||
|
||||
(module+ test
|
||||
(check-equal? (pf-name "B#45#20NICE") '|BE NICE|))
|
||||
|
||||
(define (pf-string arg . tail)
|
||||
(cond
|
||||
[(andmap byte? (cons arg tail)) (cons arg tail)]
|
||||
[(string-prefix? arg "D:")
|
||||
#;(parameterize ([current-locale "en"])
|
||||
(parse-date "2015-03-15T02:02:02-04:00" "yyyy-MM-dd'T'HH:mm:ssxxx"))
|
||||
#f]
|
||||
[else
|
||||
(let* ([str (regexp-replace @regexp{^\((.*)\)$} arg "\\1")] ; remove parens
|
||||
[str (string-replace str (string-append "\\" "\n") "")]
|
||||
[str (regexp-replace* @pregexp{\\(n|r|t|b|f|\(|\)|\\)} str (λ (m sub)
|
||||
(case sub
|
||||
[("n") "\n"]
|
||||
[("r") "\r"]
|
||||
[("t") "\t"]
|
||||
[("b") "\b"]
|
||||
[("f") "\f"]
|
||||
[else sub])))]
|
||||
[str (regexp-replace* @pregexp{\\(\d{2,3})} str (λ (m sub) (string (integer->char (string->number sub 8)))))])
|
||||
(co-string str))]))
|
||||
|
||||
(module+ test
|
||||
(check-equal? @pf-string{(Testing)} (co-string "Testing"))
|
||||
(check-equal? (pf-string @string-append{(Test\
|
||||
ing)}) (co-string "Testing"))
|
||||
(check-equal? @pf-string{(Test\)ing)} (co-string "Test)ing"))
|
||||
(check-equal? @pf-string{(Test\ning)} (co-string "Test\ning"))
|
||||
(check-equal? @pf-string{(Test\\ing)} (co-string "Test\\ing"))
|
||||
(check-equal? @pf-string{(A\53B)} (co-string "A+B"))
|
||||
(check-equal? @pf-string{(A\053B)} (co-string "A+B"))
|
||||
#;(check-equal? @pf-string{(D:19990209153925-08\'00\')})
|
||||
#;(check-true (andmap byte? @pf-string{<1C2D3F>}))
|
||||
#;(check-true (andmap byte? @pf-string{<1C 2D 3F>})))
|
||||
|
||||
(define (pf-array . xs) (co-array xs))
|
||||
|
||||
(define (pf-dict . args)
|
||||
(co-dict (apply hasheq args)))
|
||||
|
||||
|
||||
(define (pf-stream dict str)
|
||||
(define data (string->bytes/utf-8 str))
|
||||
(when (not (equal? (hash-ref (co-dict-dict dict) 'Length) (bytes-length data)))
|
||||
(raise-argument-error 'pf-stream
|
||||
(format "~a bytes of data" (hash-ref (co-dict-dict dict) 'Length))
|
||||
(format "~a = ~v" (bytes-length data) data)))
|
||||
(co-stream dict data))
|
||||
|
||||
(define (pf-indirect-object obj gen thing)
|
||||
(co-io obj gen thing))
|
||||
|
||||
(define-macro (pf-indirect-object-ref (OBJ GEN _))
|
||||
#'(co-io-ref OBJ GEN))
|
||||
|
||||
(define (pf-header num) (co-header num))
|
||||
|
||||
(define (pf-comment text) (co-comment text))
|
||||
|
||||
(define (pf-trailer dict)
|
||||
(let ([h (co-dict-dict dict)])
|
||||
(unless (and (hash-has-key? h 'Size) (hash-has-key? h 'Root))
|
||||
(raise-argument-error 'pf-trailer
|
||||
"Size and Root keys are required for trailer"
|
||||
(hash-keys h))))
|
||||
(co-trailer dict))
|
@ -0,0 +1,108 @@
|
||||
#lang br
|
||||
(require racket/file sugar/cache sugar/port racket/dict)
|
||||
|
||||
(define dict-key-pattern #px"^/\\S+")
|
||||
|
||||
(struct ref (id) #:transparent)
|
||||
|
||||
(define (trim-whitespace bs) ; can't use string-trim with bytes
|
||||
(cadr (regexp-match #px"^\\s*(.*?)\\s*$" bs)))
|
||||
|
||||
(define (parse-dict p)
|
||||
(define terms (for/list ([next (in-port parse-one p)]
|
||||
#:break (eq? next 'end-dict))
|
||||
next))
|
||||
(for/list ([k (in-list terms)]
|
||||
[v (in-list (cdr terms))]
|
||||
[i (in-naturals)]
|
||||
#:when (even? i))
|
||||
(cons k v)))
|
||||
|
||||
(define (parse-list p)
|
||||
(for/list ([next (in-port parse-one p)]
|
||||
#:break (eq? next 'end-list))
|
||||
next))
|
||||
|
||||
(define (parse-one p)
|
||||
(let loop ([p-in p])
|
||||
(define p (if (input-port? p-in) p-in (open-input-bytes p-in)))
|
||||
(cond
|
||||
;; skip whitespace
|
||||
[(regexp-try-match #px"^\\s+" p) (loop p)]
|
||||
;; indirect reference => reference int
|
||||
[(regexp-try-match #px"^(\\d+) (\\d+) R" p) => (λ (m) (string->symbol (format "ref-~a" (loop (cadr m)))))]
|
||||
;; object => trim and reparse
|
||||
[(regexp-try-match #px"^\\d+ 0 obj(.*?)endobj" p) => (λ (m) (loop (cadr m)))]
|
||||
;; integer string => int
|
||||
[(regexp-try-match #px"^\\d+" p) => (λ (m) (string->number (bytes->string/utf-8 (car m))))]
|
||||
;; dict => dispatch to handler
|
||||
[(regexp-try-match #rx"^<<" p) (parse-dict p)]
|
||||
[(regexp-try-match #rx"^>>" p) 'end-dict]
|
||||
;; list => unroll
|
||||
[(regexp-try-match #rx"^\\[" p) (parse-list p)]
|
||||
[(regexp-try-match #rx"^]" p) 'end-list]
|
||||
;; slash-prefixed dictionary key => symbol
|
||||
[(regexp-try-match dict-key-pattern p) => (λ (m) (string->symbol (string-trim (bytes->string/utf-8 (car m)) "/" #:right? #f)))]
|
||||
[(regexp-try-match #rx"^startxref" p) (loop p)] ; pulls off next item, which is an integer
|
||||
[(regexp-try-match #rx"^<([0123456789abcdef]*)>" p) => (λ (m) (cadr m))]
|
||||
[(eof-object? (peek-bytes 1 0 p)) eof]
|
||||
[else (report* (file-position p) (peek-bytes 3 0 p)) (error 'unknown-type)])))
|
||||
|
||||
(struct trailer (dict xref-ptr) #:transparent)
|
||||
(define (parse-trailer p)
|
||||
(define eof-position (file-size (object-name p)))
|
||||
(define trailer-pos (for*/first ([pos (in-range eof-position 0 -1)]
|
||||
[m (in-value (regexp-try-match "^\ntrailer\n.*" p pos))]
|
||||
#:when m)
|
||||
pos))
|
||||
(define dict-offset (caar (regexp-match-positions #rx"<<.*?>>" (port-position p trailer-pos))))
|
||||
(define trailer-dict (parse-one (port-position p (+ trailer-pos dict-offset))))
|
||||
(define xref-ptr (parse-one p))
|
||||
(trailer trailer-dict xref-ptr))
|
||||
|
||||
|
||||
(define (parse-xref p offset)
|
||||
(define bs (car (regexp-match #rx"(?<=xref).*?(?=trailer)" (port-position p offset))))
|
||||
(define str (bytes->string/utf-8 bs))
|
||||
(for/hash ([k (in-naturals 1)]
|
||||
[v (in-list (drop (string-split str "\n") 2))])
|
||||
(values k (string->number (car (regexp-match #px"^\\d+" v))))))
|
||||
|
||||
(struct $pdf (port xrefs) #:transparent)
|
||||
(define/caching (doc-ref pdf idx)
|
||||
(define offset (hash-ref ($pdf-xrefs pdf) idx))
|
||||
(parse-one (port-position ($pdf-port pdf) offset)))
|
||||
|
||||
(define/caching (doc-ref* pdf idx)
|
||||
;; resolve nested ref-x pointers
|
||||
(define visited empty) ; prevent circular references by tracking visited refs
|
||||
(let loop ([result (doc-ref pdf idx)])
|
||||
(cond
|
||||
[(dict? result) (for/list ([(k v) (in-dict result)])
|
||||
(cons k (loop v)))]
|
||||
[(list? result) (map loop result)]
|
||||
[(and (symbol? result) (regexp-match #px"^ref-(\\d+)" (symbol->string result)))
|
||||
=> (λ (m) (define next-idx (string->number (cadr m)))
|
||||
(define old-visited visited)
|
||||
(set! visited (cons next-idx visited))
|
||||
(if (member next-idx old-visited)
|
||||
result
|
||||
(loop (doc-ref pdf next-idx))))]
|
||||
[else result])))
|
||||
|
||||
|
||||
(define (port->pdf p)
|
||||
($pdf p (parse-xref p (trailer-xref-ptr (parse-trailer p)))))
|
||||
|
||||
(module+ test
|
||||
(require rackunit)
|
||||
(define p (open-input-file "test/minimal.pdf"))
|
||||
(define pdf (port->pdf p))
|
||||
(check-equal? (doc-ref pdf 1) '((Type . Catalog) (Pages . ref-2)))
|
||||
(check-equal? (doc-ref pdf 2) '((Type . Pages) (Kids ref-3) (Count . 1) (MediaBox 0 0 300 144)))
|
||||
(check-equal? (doc-ref pdf 3) '((Type . Page) (Parent . ref-2) (Resources (Font (F1 (Type . Font) (Subtype . Type1) (BaseFont . Times-Roman)))) (Contents . ref-4)))
|
||||
(check-equal? (doc-ref pdf 4) '((Length . 55))))
|
||||
|
||||
|
||||
(define p (open-input-file "test/test-cff/textedit-sample.pdf"))
|
||||
(define pdf (port->pdf p))
|
@ -0,0 +1,58 @@
|
||||
%PDF-1.1
|
||||
%¥±ë
|
||||
|
||||
1 0 obj
|
||||
<< /Type /Catalog
|
||||
/Pages 2 0 R
|
||||
>>
|
||||
endobj
|
||||
|
||||
2 0 obj
|
||||
<< /Type /Pages
|
||||
/Kids [3 0 R]
|
||||
/Count 1
|
||||
/MediaBox [0 0 300 144]
|
||||
>>
|
||||
endobj
|
||||
|
||||
3 0 obj
|
||||
<< /Type /Page
|
||||
/Parent 2 0 R
|
||||
/Resources
|
||||
<< /Font
|
||||
<< /F1
|
||||
<< /Type /Font
|
||||
/Subtype /Type1
|
||||
/BaseFont /Times-Roman
|
||||
>>
|
||||
>>
|
||||
>>
|
||||
/Contents 4 0 R
|
||||
>>
|
||||
endobj
|
||||
|
||||
4 0 obj
|
||||
<< /Length 55 >>
|
||||
stream
|
||||
BT
|
||||
/F1 18 Tf
|
||||
0 0 Td
|
||||
(Hello World) Tj
|
||||
ET
|
||||
endstream
|
||||
endobj
|
||||
|
||||
xref
|
||||
0 5
|
||||
0000000000 65535 f
|
||||
0000000018 00000 n
|
||||
0000000077 00000 n
|
||||
0000000178 00000 n
|
||||
0000000457 00000 n
|
||||
trailer
|
||||
<< /Root 1 0 R
|
||||
/Size 5
|
||||
>>
|
||||
startxref
|
||||
565
|
||||
%%EOF
|
Loading…
Reference in New Issue