You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
272 lines
15 KiB
Racket
272 lines
15 KiB
Racket
#lang racket/base
|
|
(require xml
|
|
txexpr/base
|
|
racket/list
|
|
racket/match
|
|
sugar/list
|
|
sugar/define
|
|
sugar/test
|
|
"setup.rkt"
|
|
"private/splice.rkt"
|
|
"unstable/typography.rkt")
|
|
|
|
(provide (all-from-out "unstable/typography.rkt")) ; bw compat, includes `whitespace?`
|
|
|
|
(define (->list/tx x)
|
|
;; same as ->list but catches special case of single txexpr,
|
|
;; which is itself a list, but in this case should be wrapped into a list,
|
|
;; for use with append-map.
|
|
(cond
|
|
[(txexpr? x) (list x)]
|
|
[(list? x) x]
|
|
[else (list x)]))
|
|
|
|
(define decode-proc-output-contract (or/c txexpr-element? txexpr-elements?))
|
|
|
|
;; decoder wireframe
|
|
(define+provide/contract (decode tx-in
|
|
#:txexpr-tag-proc [txexpr-tag-proc values]
|
|
#:txexpr-attrs-proc [txexpr-attrs-proc values]
|
|
#:txexpr-elements-proc [txexpr-elements-proc values]
|
|
#:txexpr-proc [txexpr-proc values]
|
|
#:block-txexpr-proc [block-txexpr-proc values]
|
|
#:inline-txexpr-proc [inline-txexpr-proc values]
|
|
#:string-proc [string-proc values]
|
|
#:entity-proc [entity-proc values]
|
|
#:cdata-proc [cdata-proc values]
|
|
#:exclude-tags [excluded-tags empty]
|
|
#:exclude-attrs [excluded-attrs empty])
|
|
((xexpr/c)
|
|
(#:txexpr-tag-proc (txexpr-tag? . -> . txexpr-tag?)
|
|
#:txexpr-attrs-proc (txexpr-attrs? . -> . txexpr-attrs?)
|
|
#:txexpr-elements-proc (txexpr-elements? . -> . txexpr-elements?)
|
|
#:txexpr-proc (txexpr? . -> . decode-proc-output-contract)
|
|
#:block-txexpr-proc (block-txexpr? . -> . decode-proc-output-contract)
|
|
#:inline-txexpr-proc (txexpr? . -> . decode-proc-output-contract)
|
|
#:string-proc (string? . -> . decode-proc-output-contract)
|
|
#:entity-proc ((or/c symbol? valid-char?) . -> . decode-proc-output-contract)
|
|
#:cdata-proc (cdata? . -> . decode-proc-output-contract)
|
|
#:exclude-tags txexpr-tags?
|
|
#:exclude-attrs txexpr-attrs?) . ->* . decode-proc-output-contract)
|
|
(let loop ([x tx-in])
|
|
(match x
|
|
[(? txexpr?)
|
|
(define-values (tag attrs elements) (txexpr->values x))
|
|
(cond
|
|
[(or (memq tag excluded-tags)
|
|
(for/or ([attr (in-list attrs)])
|
|
(member attr excluded-attrs))) x] ; because it's excluded
|
|
[else
|
|
;; we apply processing here rather than do recursive descent on the pieces
|
|
;; because if we send them back through loop, certain element types are ambiguous
|
|
;; e.g., ((p "foo")) tests out as both txexpr-attrs and txexpr-elements
|
|
(define decoded-txexpr
|
|
(make-txexpr (txexpr-tag-proc tag)
|
|
(txexpr-attrs-proc attrs)
|
|
(txexpr-elements-proc (append-map (λ (x) (->list/tx (loop x))) elements))))
|
|
(txexpr-proc ((if (block-txexpr? decoded-txexpr)
|
|
block-txexpr-proc
|
|
inline-txexpr-proc) decoded-txexpr))])]
|
|
[(? string?) (string-proc x)]
|
|
[(? symbol?) (entity-proc x)]
|
|
[(? valid-char?) (entity-proc x)]
|
|
[(? cdata?) (cdata-proc x)]
|
|
[else (raise-argument-error 'decode "decodable thing" x)])))
|
|
|
|
(module-test-external
|
|
(require racket/list txexpr racket/function)
|
|
(define (doubler x) (list x x))
|
|
(define (doubletag x) (txexpr (string->symbol (format "~a~a" (get-tag x) (get-tag x))) (get-attrs x) (get-elements x)))
|
|
(check-equal? (decode #:txexpr-elements-proc identity '(p "foo")) '(p "foo"))
|
|
;; can't use doubler on txexpr-elements because it needs a list, not list of lists
|
|
(check-equal? (decode #:txexpr-elements-proc (λ (elems) (append elems elems)) '(p "foo")) '(p "foo" "foo"))
|
|
(check-equal? (decode #:block-txexpr-proc identity '(p "foo")) '(p "foo"))
|
|
(check-equal? (decode #:block-txexpr-proc doubler '(p "foo")) (list '(p "foo") '(p "foo")))
|
|
(check-equal? (decode #:block-txexpr-proc doubler '(p "foo")) (list '(p "foo") '(p "foo")))
|
|
(check-equal? (decode #:txexpr-proc doubletag '(root (p "foo") (b "bar"))) '(rootroot (pp "foo") (bb "bar")))
|
|
(check-equal? (decode #:block-txexpr-proc doubletag '(root (p "foo") (b "bar"))) '(rootroot (pp "foo") (b "bar")))
|
|
(check-equal? (decode #:inline-txexpr-proc doubletag '(root (p "foo") (b "bar"))) '(root (p "foo") (bb "bar")))
|
|
(check-equal? (decode #:inline-txexpr-proc identity '(p (span "foo"))) '(p (span "foo")))
|
|
(check-equal? (decode #:inline-txexpr-proc doubler '(p (span "foo"))) '(p (span "foo") (span "foo")))
|
|
(check-equal? (decode #:string-proc identity '(p (span "foo"))) '(p (span "foo")))
|
|
(check-equal? (decode #:string-proc doubler '(p (span "foo"))) '(p (span "foo" "foo")))
|
|
(check-equal? (decode #:entity-proc identity '(p (span "foo" 'amp))) '(p (span "foo" 'amp)))
|
|
(check-equal? (decode #:entity-proc identity '(p 42)) '(p 42))
|
|
(check-equal? (decode #:entity-proc doubler '(p 42)) '(p 42 42))
|
|
(check-equal? (decode #:entity-proc identity '(p amp)) '(p amp))
|
|
;; next text doesn't work because list of symbol elements is ambiguous with tagged X-expression
|
|
;; is there a general patch for this? maybe, but for now it's better to not patch selectively
|
|
;; otherwise ambiguous expressions will have erratic misbehavior (instead of merely consistent misbehavior)
|
|
;;(check-equal? (decode #:entity-proc doubler '(p amp)) '(p amp amp))
|
|
(check-equal? (decode-elements #:string-proc identity '("foo")) '("foo"))
|
|
(check-equal? (decode-elements #:string-proc doubler '("foo")) '("foo" "foo")))
|
|
|
|
;; it would be nice to not repeat this, but with all the keywords, it's simpler to repeat than do a macro
|
|
(define+provide/contract decode-elements
|
|
((txexpr-elements?)
|
|
(#:txexpr-tag-proc (txexpr-tag? . -> . txexpr-tag?)
|
|
#:txexpr-attrs-proc (txexpr-attrs? . -> . txexpr-attrs?)
|
|
#:txexpr-elements-proc (txexpr-elements? . -> . txexpr-elements?)
|
|
#:txexpr-proc (txexpr? . -> . decode-proc-output-contract)
|
|
#:block-txexpr-proc (block-txexpr? . -> . decode-proc-output-contract)
|
|
#:inline-txexpr-proc (txexpr? . -> . decode-proc-output-contract)
|
|
#:string-proc (string? . -> . decode-proc-output-contract)
|
|
#:entity-proc ((or/c symbol? valid-char?) . -> . decode-proc-output-contract)
|
|
#:cdata-proc (cdata? . -> . decode-proc-output-contract)
|
|
#:exclude-tags txexpr-tags?
|
|
#:exclude-attrs txexpr-attrs?) . ->* . decode-proc-output-contract)
|
|
(make-keyword-procedure
|
|
(λ (kws kwargs . args)
|
|
(define temp-tag (gensym "temp-tag"))
|
|
(define elements (first args))
|
|
(get-elements (keyword-apply decode kws kwargs (list (cons temp-tag elements)))))))
|
|
|
|
(define+provide/contract (block-txexpr? x)
|
|
(any/c . -> . boolean?)
|
|
;; Mostly this is used inside `decode`,
|
|
;; so rather than test for `txexpr?` at the beginning (which is potentially slow)
|
|
;; just look at the tag.
|
|
(and (pair? x) (memq (get-tag x) (setup:block-tags)) #t))
|
|
|
|
(define+provide/contract (decode-linebreaks elems [maybe-linebreak-proc '(br)]
|
|
#:separator [newline (setup:linebreak-separator)])
|
|
((txexpr-elements?)
|
|
((or/c #f txexpr-element?
|
|
(txexpr-element? txexpr-element? . -> . (or/c #f txexpr-element?))) #:separator string?)
|
|
. ->* . txexpr-elements?)
|
|
(unless (string? newline)
|
|
(raise-argument-error 'decode-linebreaks "string" newline))
|
|
(define linebreak-proc (match maybe-linebreak-proc
|
|
[(? procedure? proc) proc]
|
|
[val (λ (e1 e2) val)]))
|
|
(define elems-vec (list->vector elems))
|
|
(filter values
|
|
(for/list ([(elem idx) (in-indexed elems-vec)])
|
|
(cond
|
|
[(zero? idx) elem] ; pass first item
|
|
[(= idx (sub1 (vector-length elems-vec))) elem] ; pass through last item
|
|
[(equal? elem newline)
|
|
(define prev (vector-ref elems-vec (sub1 idx)))
|
|
(define next (vector-ref elems-vec (add1 idx)))
|
|
;; only convert if neither adjacent tag is a block
|
|
;; (because blocks automatically force a newline before & after)
|
|
(if (or (block-txexpr? prev) (block-txexpr? next))
|
|
#false ; flag for filtering
|
|
(linebreak-proc prev next))]
|
|
[else elem]))))
|
|
|
|
(module-test-external
|
|
(check-equal? (decode-linebreaks '("foo" "\n" "bar")) '("foo" (br) "bar"))
|
|
(check-equal? (decode-linebreaks '("foo" "\n" "bar") #f) '("foo" "bar"))
|
|
(check-equal? (decode-linebreaks '("\n" "foo" "\n" "bar" "\n")) '("\n" "foo" (br) "bar" "\n"))
|
|
(check-equal? (decode-linebreaks '((p "foo") "\n" (p "bar"))) '((p "foo") (p "bar")))
|
|
(check-equal? (decode-linebreaks '("foo" "\n" (p "bar"))) '("foo" (p "bar")))
|
|
(check-equal? (decode-linebreaks '("foo" "moo" "bar")) '("foo" "moo" "bar"))
|
|
(check-equal? (decode-linebreaks '("foo" "moo" "bar") "moo") '("foo" "moo" "bar"))
|
|
(check-equal? (decode-linebreaks '("foo" "\n\n" "bar")) '("foo" "\n\n" "bar")))
|
|
|
|
|
|
;; Find adjacent newline characters in a list and merge them into one item
|
|
;; Scribble, by default, makes each newline a separate list item.
|
|
;; Ignore empty strings.
|
|
;; Descend into txexprs.
|
|
(define+provide/contract (merge-newlines x)
|
|
(txexpr-elements? . -> . txexpr-elements?)
|
|
(define newline-pat (regexp (format "^~a+$" (setup:newline))))
|
|
(define (newline? x) (match x
|
|
[(regexp newline-pat) #true]
|
|
[_ #false]))
|
|
(define (merge-newline-slice xs)
|
|
(match xs
|
|
;; if first member of slice is newline, they all are
|
|
[(cons (? newline?) _) (list (apply string-append xs))]
|
|
[_ xs]))
|
|
(define (empty-string? x) (equal? x ""))
|
|
(let loop ([x x])
|
|
(match x
|
|
[(? pair? x) #:when (not (attrs? x))
|
|
(define xs (map loop (filter-not empty-string? x)))
|
|
(append-map merge-newline-slice (slicef xs newline?))]
|
|
[_ x])))
|
|
|
|
(module-test-external
|
|
(require racket/list)
|
|
(check-equal? (merge-newlines empty) empty)
|
|
(check-equal? (merge-newlines '((p ((id "")) "\n" "" "\n"))) '((p ((id "")) "\n\n")))
|
|
(check-equal? (merge-newlines '((p "\n" "" "\n"))) '((p "\n\n")))
|
|
(check-equal? (merge-newlines '(p "\n" "\n" "foo" "\n" "\n\n" "bar" (em "\n" "\n" "\n")))
|
|
'(p "\n\n" "foo" "\n\n\n" "bar" (em "\n\n\n"))))
|
|
|
|
(define+provide/contract (decode-paragraphs elements-in [maybe-wrap-proc 'p]
|
|
#:linebreak-proc [linebreak-proc decode-linebreaks]
|
|
#:force? [force-paragraph #f])
|
|
((txexpr-elements?) ((or/c txexpr-tag? ((listof xexpr?) . -> . txexpr?))
|
|
#:linebreak-proc (txexpr-elements? . -> . txexpr-elements?)
|
|
#:force? boolean?)
|
|
. ->* . txexpr-elements?)
|
|
(define paragraph-separator (setup:paragraph-separator))
|
|
(unless (string? paragraph-separator)
|
|
(raise-argument-error 'decode-paragraphs "string" paragraph-separator))
|
|
|
|
(define (prep-paragraph-flow elems)
|
|
(linebreak-proc (merge-newlines (trimf elems whitespace?))))
|
|
|
|
(define (paragraph-break? x)
|
|
(define paragraph-pattern (pregexp (format "^~a+$" paragraph-separator)))
|
|
(match x
|
|
[(pregexp paragraph-pattern) #true]
|
|
[_ #false]))
|
|
|
|
(define (explicit-or-implicit-paragraph-break? x)
|
|
(or (paragraph-break? x) (block-txexpr? x)))
|
|
|
|
(define wrap-proc (match maybe-wrap-proc
|
|
[(? procedure? proc) proc]
|
|
[_ (λ (elems) (list* maybe-wrap-proc elems))]))
|
|
|
|
(define (wrap-paragraph elems)
|
|
(match elems
|
|
[(list (? block-txexpr?) ...) elems] ; leave a series of block xexprs alone
|
|
[_ (list (wrap-proc elems))])) ; otherwise wrap in p tag
|
|
|
|
(define elements (prep-paragraph-flow elements-in))
|
|
(if (ormap explicit-or-implicit-paragraph-break? elements) ; need this condition to prevent infinite recursion
|
|
;; use `append-map` on `wrap-paragraph` rather than `map` to permit return of multiple elements
|
|
(append-map wrap-paragraph
|
|
(append-map (λ (es) (filter-split es paragraph-break?)) (slicef elements block-txexpr?))) ; split into ¶¶, using both implied and explicit paragraph breaks
|
|
(if force-paragraph
|
|
;; upconverts non-block elements to paragraphs
|
|
(append-map wrap-paragraph (slicef elements block-txexpr?))
|
|
elements)))
|
|
|
|
(module-test-external
|
|
(check-equal? (decode-paragraphs '("First para" "\n\n" "Second para"))
|
|
'((p "First para") (p "Second para")))
|
|
(check-equal? (decode-paragraphs '("First para" "\n\n" "Second para" "\n" "Second line"))
|
|
'((p "First para") (p "Second para" (br) "Second line")))
|
|
(check-equal? (decode-paragraphs '("First para" "\n\n" (div "Second block")))
|
|
'((p "First para") (div "Second block")))
|
|
(check-equal? (decode-paragraphs '((div "First block") "\n\n" (div "Second block")))
|
|
'((div "First block") (div "Second block")))
|
|
(check-equal? (decode-paragraphs '("First para" "\n\n" "Second para") 'ns:p)
|
|
'((ns:p "First para") (ns:p "Second para")))
|
|
(check-equal? (decode-paragraphs '("First para" "\n\n" "Second para" "\n" "Second line")
|
|
#:linebreak-proc (λ (x) (decode-linebreaks x '(newline))))
|
|
'((p "First para") (p "Second para" (newline) "Second line")))
|
|
(check-equal? (decode-paragraphs '("foo" "\n\n" (div "bar") (div "zam")))
|
|
'((p "foo") (div "bar") (div "zam")))
|
|
(check-equal? (decode-paragraphs '("foo" "\n\n" (div "bar") "\n\n" (div "zam")))
|
|
'((p "foo") (div "bar") (div "zam")))
|
|
(check-equal? (decode-paragraphs '("foo")) '("foo"))
|
|
(check-equal? (decode-paragraphs '("foo") #:force? #t) '((p "foo")))
|
|
(check-equal? (decode-paragraphs '((div "foo"))) '((div "foo")))
|
|
(check-equal? (decode-paragraphs '((div "foo")) #:force? #t) '((div "foo")))
|
|
(check-equal? (decode-paragraphs '("foo" "\n\n" (div "bar"))) '((p "foo") (div "bar")))
|
|
(check-equal? (decode-paragraphs '("foo" (div "bar"))) '((p "foo") (div "bar")))
|
|
(check-equal? (decode-paragraphs '("foo" (div "bar")) #:force? #t) '((p "foo") (div "bar")))
|
|
(check-equal? (decode-paragraphs '("foo" (div "bar") "zam")) '((p "foo") (div "bar") (p "zam")))
|
|
(check-equal? (decode-paragraphs '("foo" (span "zing") (div "bar") "zam")) '((p "foo" (span "zing")) (div "bar") (p "zam")))
|
|
(check-equal? (decode-paragraphs '("foo" (span "zing") (div "bar") "zam") #:force? #t) '((p "foo" (span "zing")) (div "bar") (p "zam"))))
|
|
|
|
(define+provide detect-paragraphs decode-paragraphs) ; bw compat
|
|
(define+provide detect-linebreaks decode-linebreaks) ; bw compat |