You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
pollen/decode.rkt

462 lines
24 KiB
Racket

This file contains invisible Unicode characters!

This file contains invisible Unicode characters that may be processed differently from what appears below. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to reveal hidden characters.

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

#lang racket/base
(require xml txexpr racket/string racket/match racket/list (prefix-in html: pollen/html) sugar/list sugar/container sugar/len sugar/define sugar/coerce sugar/test)
(require "debug.rkt" "world.rkt")
(define (symbols? x) (and (list? x) (andmap symbol? x)))
(define+provide (to-string x)
(if (string? x)
x ; fast exit for strings
(with-handlers ([exn:fail? (λ(exn) (error (format "Pollen decoder: can't convert ~v to ~a" x 'string)))])
(cond
[(equal? '() x) ""]
[(symbol? x) (symbol->string x)]
[(number? x) (number->string x)]
[(path? x) (path->string x)]
[(char? x) (format "~a" x)]
[(void? x) ""]
;; todo: guard against weird shit like lists of procedures
[(or (list? x) (hash? x) (vector? x)) (format "~v" x)] ; ok to convert datatypes
[else (error)])))) ; but things like procedures should throw an error
(define decode-proc-output-contract (or/c xexpr/c (non-empty-listof xexpr/c)))
(define (->list/tx x)
;; same as ->list but catches special case of single txexpr,
;; which is itself a list, but in this case should be wrapped into a list,
;; for use with append-map.
(if (txexpr? x)
(list x)
(->list x)))
;; decoder wireframe
(define+provide/contract (decode tx-in
#:txexpr-tag-proc [txexpr-tag-proc (λ(x)x)]
#:txexpr-attrs-proc [txexpr-attrs-proc (λ(x)x)]
#:txexpr-elements-proc [txexpr-elements-proc (λ(x)x)]
#:block-txexpr-proc [block-txexpr-proc (λ(x)x)]
#:inline-txexpr-proc [inline-txexpr-proc (λ(x)x)]
#:string-proc [string-proc (λ(x)x)]
#:entity-proc [entity-proc (λ(x)x)]
#:cdata-proc [cdata-proc (λ(x)x)]
#:exclude-tags [excluded-tags '()]
#:exclude-attrs [excluded-attrs '()])
((xexpr/c)
(#:txexpr-tag-proc (txexpr-tag? . -> . txexpr-tag?)
#:txexpr-attrs-proc (txexpr-attrs? . -> . txexpr-attrs?)
#:txexpr-elements-proc (txexpr-elements? . -> . txexpr-elements?)
#:block-txexpr-proc (block-txexpr? . -> . decode-proc-output-contract)
#:inline-txexpr-proc (txexpr? . -> . decode-proc-output-contract)
#:string-proc (string? . -> . decode-proc-output-contract)
#:entity-proc ((or/c symbol? valid-char?) . -> . decode-proc-output-contract)
#:cdata-proc (cdata? . -> . decode-proc-output-contract)
#:exclude-tags (listof txexpr-tag?)
#:exclude-attrs txexpr-attrs?) . ->* . (or/c xexpr/c (non-empty-listof xexpr/c)))
(let loop ([x tx-in])
(cond
[(txexpr? x) (let-values([(tag attrs elements) (txexpr->values x)])
(if (or (member tag excluded-tags) (ormap (λ(attr) (member attr excluded-attrs)) attrs))
x ; because it's excluded
;; we apply processing here rather than do recursive descent on the pieces
;; because if we send them back through loop, certain element types are ambiguous
;; e.g., ((p "foo")) tests out as both txexpr-attrs and txexpr-elements
(let ([decoded-txexpr
(apply make-txexpr (list (txexpr-tag-proc tag)
(txexpr-attrs-proc attrs)
(txexpr-elements-proc (append-map (compose1 ->list/tx loop) elements))))])
((if (block-txexpr? decoded-txexpr)
block-txexpr-proc
inline-txexpr-proc) decoded-txexpr))))]
[(string? x) (string-proc x)]
[(or (symbol? x) (valid-char? x)) (entity-proc x)]
[(cdata? x) (cdata-proc x)]
[else (error "decode: can't decode" x)])))
(module-test-external
(require racket/list txexpr racket/function)
(define (doubler x) (list x x))
(check-equal? (decode #:txexpr-elements-proc identity '(p "foo")) '(p "foo"))
;; can't use doubler on txexpr-elements because it needs a list, not list of lists
(check-equal? (decode #:txexpr-elements-proc (λ(elems) (append elems elems)) '(p "foo")) '(p "foo" "foo"))
(check-equal? (decode #:block-txexpr-proc identity '(p "foo")) '(p "foo"))
(check-equal? (decode #:block-txexpr-proc doubler '(p "foo")) (list '(p "foo") '(p "foo")))
(check-equal? (decode #:inline-txexpr-proc identity '(p (span "foo"))) '(p (span "foo")))
(check-equal? (decode #:inline-txexpr-proc doubler '(p (span "foo"))) '(p (span "foo") (span "foo")))
(check-equal? (decode #:string-proc identity '(p (span "foo"))) '(p (span "foo")))
(check-equal? (decode #:string-proc doubler '(p (span "foo"))) '(p (span "foo" "foo")))
(check-equal? (decode #:entity-proc identity '(p (span "foo" 'amp))) '(p (span "foo" 'amp)))
(check-equal? (decode #:entity-proc identity '(p 42)) '(p 42))
(check-equal? (decode #:entity-proc doubler '(p 42)) '(p 42 42))
(check-equal? (decode #:entity-proc identity '(p amp)) '(p amp))
;; next text doesn't work because list of symbol elements is ambiguous with tagged X-expression
;; is there a general patch for this? maybe, but for now it's better to not patch selectively
;; otherwise ambiguous expressions will have erratic misbehavior (instead of merely consistent misbehavior)
;;(check-equal? (decode #:entity-proc doubler '(p amp)) '(p amp amp))
(check-equal? (decode-elements #:string-proc identity '("foo")) '("foo"))
(check-equal? (decode-elements #:string-proc doubler '("foo")) '("foo" "foo")))
;; it would be nice to not repeat this, but with all the keywords, it's simpler to repeat than do a macro
(define+provide/contract (decode-elements elements
#:txexpr-tag-proc [txexpr-tag-proc (λ(x)x)]
#:txexpr-attrs-proc [txexpr-attrs-proc (λ(x)x)]
#:txexpr-elements-proc [txexpr-elements-proc (λ(x)x)]
#:block-txexpr-proc [block-txexpr-proc (λ(x)x)]
#:inline-txexpr-proc [inline-txexpr-proc (λ(x)x)]
#:string-proc [string-proc (λ(x)x)]
#:entity-proc [entity-proc (λ(x)x)]
#:cdata-proc [cdata-proc (λ(x)x)]
#:exclude-tags [excluded-tags '()]
#:exclude-attrs [excluded-attrs '()])
((txexpr-elements?)
(#:txexpr-tag-proc (txexpr-tag? . -> . txexpr-tag?)
#:txexpr-attrs-proc (txexpr-attrs? . -> . txexpr-attrs?)
#:txexpr-elements-proc (txexpr-elements? . -> . txexpr-elements?)
#:block-txexpr-proc (block-txexpr? . -> . decode-proc-output-contract)
#:inline-txexpr-proc (txexpr? . -> . decode-proc-output-contract)
#:string-proc (string? . -> . decode-proc-output-contract)
#:entity-proc ((or/c symbol? valid-char?) . -> . decode-proc-output-contract)
#:cdata-proc (cdata? . -> . decode-proc-output-contract)
#:exclude-tags (listof txexpr-tag?)
#:exclude-attrs txexpr-attrs?) . ->* . (or/c xexpr/c (non-empty-listof xexpr/c)))
(define temp-tag (gensym "temp-tag"))
(define decode-result (decode `(temp-tag ,@elements)
#:txexpr-tag-proc txexpr-tag-proc
#:txexpr-attrs-proc txexpr-attrs-proc
#:txexpr-elements-proc txexpr-elements-proc
#:block-txexpr-proc block-txexpr-proc
#:inline-txexpr-proc inline-txexpr-proc
#:string-proc string-proc
#:entity-proc entity-proc
#:cdata-proc cdata-proc
#:exclude-tags excluded-tags
#:exclude-attrs excluded-attrs))
(get-elements decode-result))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Blocks
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; initial set of block tags: from html
(define+provide project-block-tags
(make-parameter html:block-tags))
;; tags are inline unless they're registered as block tags.
(define+provide/contract (block-txexpr? x)
(any/c . -> . boolean?)
(and (txexpr? x) (member (get-tag x) (project-block-tags)) #t))
(define+provide/contract (register-block-tag tag)
(txexpr-tag? . -> . void?)
(project-block-tags (cons tag (project-block-tags))))
(module-test-external
(check-true (begin (register-block-tag 'barfoo) (block-txexpr? '(barfoo "foo")))))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Typography
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(define (make-replacer query+replacement)
(let ([queries (map car query+replacement)]
[replacements (map second query+replacement)])
;; reverse because first in list should be first applied to str (and compose1 works right-to-left)
(apply compose1 (reverse (map (λ(query replacement) (λ(str) (regexp-replace* query str replacement))) queries replacements)))))
(define+provide/contract (smart-dashes str)
(string? . -> . string?)
(define dashes
;; fix em dashes first, else they'll be mistaken for en dashes
;; \\s is whitespace + #\u00A0 is nonbreaking space
'((#px"[\\s#\u00A0]*(---|—)[\\s#\u00A0]*" "") ; em dash
(#px"[\\s#\u00A0]*(--|)[\\s#\u00A0]*" ""))) ; en dash
((make-replacer dashes) str))
(module-test-external
(check-equal? (smart-dashes "I had --- maybe 13 -- 20 --- hob-nobs.") "I had—maybe 1320—hob-nobs.")
(check-equal? (smart-quotes "\"Why,\" she could've asked, \"are we in Oahu watching 'Mame'?\"")
"“Why,” she couldve asked, “are we in Oahu watching Mame?”")
(check-equal? (smart-quotes "\"\'Impossible.\' Yes.\"") "Impossible. Yes.”"))
(define+provide/contract (smart-quotes str)
(string? . -> . string?)
(define quotes
'((#px"(?<=\\w)'(?=\\w)" "") ; apostrophe
(#px"(?<!\\w)'(?=\\S)" "") ; single_at_beginning
(#px"(?<=\\S)'(?!\\w)" "") ; single_at_end
(#px"(?<!\\w)\"(?=\\S)" "") ; double_at_beginning
(#px"(?<=\\S)\"(?!\\w)" ""))) ; double_at_end
((make-replacer quotes) str))
;; insert nbsp between last two words
(define+provide/contract (nonbreaking-last-space x #:nbsp [nbsp (->string #\u00A0)]
#:minimum-word-length [minimum-word-length 6]
#:last-word-proc [last-word-proc (λ(x) x)])
((txexpr?) (#:nbsp string? #:minimum-word-length integer? #:last-word-proc procedure?) . ->* . txexpr?)
;; todo: parameterize this, as it will be different for each project
(define tags-to-pay-attention-to '(p aside)) ; only apply to paragraphs
(define (replace-last-space str)
(if (#\space . in? . str)
(let ([reversed-str-list (reverse (string->list str))]
[reversed-nbsp (reverse (string->list (->string nbsp)))])
(define-values (last-word-chars other-chars)
(splitf-at reversed-str-list (λ(i) (not (eq? i #\space)))))
(define front-chars (if (< (len last-word-chars) minimum-word-length) ; OK for long words to be on their own line
; first char of other-chars will be the space, so use cdr
(string-append (list->string (reverse (cdr other-chars))) (->string nbsp))
(list->string (reverse other-chars))))
(define last-word (list->string (reverse last-word-chars)))
`(,front-chars ,(last-word-proc last-word))) ; don't concatenate last word bc last-word-proc might be a txexpr wrapper
(list str)))
(define (find-last-word-space x) ; recursively traverse xexpr
(cond
[(string? x) (replace-last-space x)] ; todo: this assumes a paragraph only has one string in it.
[(txexpr? x)
(let-values([(tag attr elements) (txexpr->values x)])
(if (> (length elements) 0) ; elements is list of xexprs
(let-values ([(all-but-last last) (split-at elements (sub1 (length elements)))])
(define result (find-last-word-space (car last)))
(define result-items (if (txexpr? result) (list result) result)) ; might be txexpr, or list of new elements
(make-txexpr tag attr `(,@all-but-last ,@result-items)))
x))]
[else x]))
(if ((car x) . in? . tags-to-pay-attention-to)
(find-last-word-space x)
x))
(module-test-external
;; todo: make some tougher tests, it gets flaky with edge cases
(check-equal? (nonbreaking-last-space '(p "Hi there")) '(p "Hi " "there")) ; nbsp in between last two words
(check-equal? (nonbreaking-last-space '(p "Hi there") #:nbsp "Ø") '(p "HiØ" "there")) ; but let's make it visible
(check-equal? (nonbreaking-last-space '(p "Hi there") #:nbsp "_up_") '(p "Hi_up_" "there"))
(check-equal? (nonbreaking-last-space '(p "Hi there") #:nbsp "_up_" #:minimum-word-length 3)
'(p "Hi " "there"))
(check-equal? (nonbreaking-last-space '(p "Hi here" (em "ho there")) #:nbsp "Ø") '(p "Hi here" (em "hoØ" "there"))))
; wrap initial quotes for hanging punctuation
; todo: improve this
; does not handle <p>“<em>thing</em> properly
(define+provide/contract (wrap-hanging-quotes nx
#:single-prepend [single-pp '(squo)]
#:double-prepend [double-pp '(dquo)])
((txexpr?) (#:single-prepend list? #:double-prepend list?) . ->* . txexpr?)
(define two-or-more-char-string? (λ(i) (and (string? i) (>= (len i) 2))))
(define-values (tag attr elements) (txexpr->values nx))
(make-txexpr tag attr
(if (and (list? elements) (not (empty? elements)))
(let ([new-car-elements (match (car elements)
[(? two-or-more-char-string? tcs)
(define str-first (get tcs 0))
(define str-rest (get tcs 1 (len tcs)))
(cond
[(str-first . in? . '("\"" ""))
;; can wrap with any inline tag
;; so that linebreak detection etc still works
`(,@double-pp ,(->string #\“) ,str-rest)]
[(str-first . in? . '("\'" ""))
`(,@single-pp ,(->string #\) ,str-rest)]
[else tcs])]
[(? txexpr? nx) (wrap-hanging-quotes nx)]
[else (car elements)])])
(cons new-car-elements (cdr elements)))
elements)))
(module-test-external
(check-equal? (wrap-hanging-quotes '(p "\"Hi\" there")) '(p (dquo "" "Hi\" there")))
(check-equal? (wrap-hanging-quotes '(p "'Hi' there")) '(p (squo "" "Hi' there")))
(check-equal? (wrap-hanging-quotes '(p "'Hi' there") #:single-prepend '(foo ((bar "ino"))))
'(p (foo ((bar "ino")) "" "Hi' there")))
;; make sure txexpr without elements passes through unscathed
(check-equal? (wrap-hanging-quotes '(div ((style "height:2em")))) '(div ((style "height:2em")))))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Lines, blocks, paragraphs
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; turn the right items into <br> tags
(define+provide/contract (detect-linebreaks xc
#:separator [newline (world:current-linebreak-separator)]
#:insert [linebreak '(br)])
((txexpr-elements?) (#:separator string? #:insert xexpr?) . ->* . txexpr-elements?)
;; todo: should this test be not block + not whitespace?
(define not-block? (λ(i) (not (block-txexpr? i))))
(filter-not empty?
(for/list ([i (in-range (len xc))])
(let ([item (get xc i)])
(cond
;; skip first and last
[(or (= i 0) (= i (sub1 (len xc)))) item]
[(equal? item newline)
(match (get xc (- i 1) (+ i 2)) ; a three-element slice with x[i] in the middle
;; only convert if neither adjacent tag is a block
;; (because blocks automatically force a newline before & after)
[(list (? not-block?) newline (? not-block?)) linebreak]
[else empty])] ; otherwise delete
[else item])))))
(module-test-external
(check-equal? (detect-linebreaks '("foo" "\n" "bar")) '("foo" (br) "bar"))
(check-equal? (detect-linebreaks '("\n" "foo" "\n" "bar" "\n")) '("\n" "foo" (br) "bar" "\n"))
(check-equal? (detect-linebreaks '((p "foo") "\n" (p "bar"))) '((p "foo") (p "bar")))
(check-equal? (detect-linebreaks '("foo" "\n" (p "bar"))) '("foo" (p "bar")))
(check-equal? (detect-linebreaks '("foo" "moo" "bar")) '("foo" "moo" "bar"))
(check-equal? (detect-linebreaks '("foo" "moo" "bar") #:insert "moo") '("foo" "moo" "bar"))
(check-equal? (detect-linebreaks '("foo" "\n\n" "bar")) '("foo" "\n\n" "bar")))
(define+provide/contract (whitespace? x [nbsp? #f])
((any/c)(boolean?) . ->* . coerce/boolean?)
(define pat (pregexp (format "^[\\s~a]+$" (if nbsp? #\u00A0 ""))))
(cond
[(equal? "" x) #t] ; empty string is deemed whitespace
[(or (string? x) (symbol? x)) (regexp-match pat (->string x))]
[(or (list? x) (vector? x)) (and (not (empty? x)) (andmap (λ(i) (whitespace? i nbsp?)) (->list x)))] ; andmap returns #t for empty lists
[else #f]))
(module-test-external
(require racket/format)
(check-true (whitespace? " "))
(check-false (whitespace? (~a #\u00A0)))
(check-true (whitespace/nbsp? (~a #\u00A0)))
(check-true (whitespace/nbsp? (vector (~a #\u00A0))))
(check-false (whitespace? (format " ~a " #\u00A0)))
(check-true (whitespace/nbsp? (format " ~a " #\u00A0))))
(define+provide/contract (whitespace/nbsp? x)
(any/c . -> . coerce/boolean?)
(whitespace? x #t))
;; is x a paragraph break?
(define+provide/contract (paragraph-break? x #:separator [sep (world:current-paragraph-separator)])
((any/c) (#:separator pregexp?) . ->* . coerce/boolean?)
(define paragraph-pattern (pregexp (format "^~a+$" sep)))
(and (string? x) (regexp-match paragraph-pattern x)))
(define (newline? x)
(and (string? x) (equal? (world:current-newline) x)))
(define (not-newline? x)
(not (newline? x)))
(define (do-merge xs [acc '()])
(if (empty? xs)
acc
;; Try to peel the newlines off the front.
(let-values ([(leading-newlines remainder) (splitf-at xs newline?)])
(if (not (empty? leading-newlines)) ; if you got newlines ...
;; combine them into a string and append them to the accumulator,
;; and recurse on the rest
(do-merge remainder (append acc (list (apply string-append leading-newlines))))
;; otherwise peel off elements up to the next newline, append them to accumulator,
;; and recurse on the rest
(do-merge (dropf remainder not-newline?)
(append acc (takef remainder not-newline?)))))))
;; Find adjacent newline characters in a list and merge them into one item
;; Scribble, by default, makes each newline a separate list item
;; In practice, this is worthless.
(define+provide/contract (merge-newlines x)
(txexpr-elements? . -> . txexpr-elements?)
(cond
[(list? x) (do-merge (map merge-newlines x))]
[else x]))
(module-test-external
(check-equal? (merge-newlines '(p "\n" "foo" "\n" "\n" "bar" (em "\n" "\n" "\n")))
'(p "\n" "foo" "\n\n" "bar" (em "\n\n\n"))))
;; detect paragraphs
;; todo: unit tests
(define+provide/contract (detect-paragraphs elements #:tag [tag 'p]
#:separator [sep (world:current-paragraph-separator)]
#:linebreak-proc [linebreak-proc detect-linebreaks]
#:force? [force-paragraph #f])
((txexpr-elements?) (#:tag symbol? #:separator string? #:linebreak-proc (txexpr-elements? . -> . txexpr-elements?) #:force? boolean?)
. ->* . txexpr-elements?)
;; prepare elements for paragraph testing
(define (prep-paragraph-flow elems)
(linebreak-proc (merge-newlines (trimf elems whitespace?))))
(define my-paragraph-break? (λ(x) (and (paragraph-break? x #:separator sep) #t)))
(define (wrap-paragraph elems)
(match elems
[(list (? block-txexpr? bxs) ...) bxs] ; leave a series of block xexprs alone
[else (list (make-txexpr tag empty elems))])) ; otherwise wrap in p tag
(let ([elements (prep-paragraph-flow elements)])
(define explicit-or-implicit-paragraph-break? (λ(x) (or (my-paragraph-break? x) (block-txexpr? x))))
(if (ormap explicit-or-implicit-paragraph-break? elements) ; need this condition to prevent infinite recursion
;; use append-map on wrap-paragraph rather than map to permit return of multiple elements
(append-map wrap-paragraph (append-map (λ(es) (filter-split es my-paragraph-break?)) (slicef elements block-txexpr?))) ; split into ¶¶, using both implied and explicit paragraph breaks
(if force-paragraph
(append-map wrap-paragraph (slicef elements block-txexpr?)) ; upconverts non-block elements to paragraphs
elements))))
(module-test-external
(check-equal? (detect-paragraphs '("First para" "\n\n" "Second para"))
'((p "First para") (p "Second para")))
(check-equal? (detect-paragraphs '("First para" "\n\n" "Second para" "\n" "Second line"))
'((p "First para") (p "Second para" (br) "Second line")))
(check-equal? (detect-paragraphs '("First para" "\n\n" (div "Second block")))
'((p "First para") (div "Second block")))
(check-equal? (detect-paragraphs '((div "First block") "\n\n" (div "Second block")))
'((div "First block") (div "Second block")))
(check-equal? (detect-paragraphs '("First para" "\n\n" "Second para") #:tag 'ns:p)
'((ns:p "First para") (ns:p "Second para")))
(check-equal? (detect-paragraphs '("First para" "\n\n" "Second para" "\n" "Second line")
#:linebreak-proc (λ(x) (detect-linebreaks x #:insert '(newline))))
'((p "First para") (p "Second para" (newline) "Second line")))
(check-equal? (detect-paragraphs '("foo" "\n\n" (div "bar") (div "zam")))
'((p "foo") (div "bar") (div "zam")))
(check-equal? (detect-paragraphs '("foo" "\n\n" (div "bar") "\n\n" (div "zam")))
'((p "foo") (div "bar") (div "zam")))
(check-equal? (detect-paragraphs '("foo")) '("foo"))
(check-equal? (detect-paragraphs '("foo") #:force? #t) '((p "foo")))
(check-equal? (detect-paragraphs '((div "foo"))) '((div "foo")))
(check-equal? (detect-paragraphs '((div "foo")) #:force? #t) '((div "foo")))
(check-equal? (detect-paragraphs '("foo" "\n\n" (div "bar"))) '((p "foo") (div "bar")))
(check-equal? (detect-paragraphs '("foo" (div "bar"))) '((p "foo") (div "bar")))
(check-equal? (detect-paragraphs '("foo" (div "bar")) #:force? #t) '((p "foo") (div "bar")))
(check-equal? (detect-paragraphs '("foo" (div "bar") "zam")) '((p "foo") (div "bar") (p "zam")))
(check-equal? (detect-paragraphs '("foo" (span "zing") (div "bar") "zam")) '((p "foo" (span "zing")) (div "bar") (p "zam")))
(check-equal? (detect-paragraphs '("foo" (span "zing") (div "bar") "zam") #:force? #t) '((p "foo" (span "zing")) (div "bar") (p "zam"))))