diff --git a/decode.rkt b/decode.rkt index 876f8a2..fa193d0 100644 --- a/decode.rkt +++ b/decode.rkt @@ -1,8 +1,7 @@ #lang racket/base -(require xml txexpr sugar/define) -(require "decode/block.rkt" "decode/typography.rkt" "debug.rkt") +(require xml txexpr sugar racket/match racket/list (prefix-in html: css-tools/html)) +(require "debug.rkt" "world.rkt") -(provide (all-from-out "decode/block.rkt" "decode/typography.rkt")) (define+provide (to-string x) @@ -66,8 +65,252 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Blocks +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; initial set of block tags: from html +(define+provide project-block-tags + (make-parameter html:block-tags)) + + +;; tags are inline unless they're registered as block tags. +(define+provide/contract (block-txexpr? x) + (any/c . -> . boolean?) + (and (txexpr? x) (member (get-tag x) (project-block-tags)) #t)) + + +(define+provide/contract (register-block-tag tag) + (txexpr-tag? . -> . void?) + (project-block-tags (cons tag (project-block-tags)))) + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Typography + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + + +(define (make-replacer query+replacement) + (let ([queries (map car query+replacement)] + [replacements (map second query+replacement)]) + ;; reverse because first in list should be first applied to str (and compose1 works right-to-left) + (apply compose1 (reverse (map (λ(query replacement) (λ(str) (regexp-replace* query str replacement))) queries replacements))))) + +(define+provide/contract (smart-dashes str) + (string? . -> . string?) + + (define dashes + ;; fix em dashes first, else they'll be mistaken for en dashes + ;; \\s is whitespace + #\u00A0 is nonbreaking space + '((#px"[\\s#\u00A0]*(---|—)[\\s#\u00A0]*" "—") ; em dash + (#px"[\\s#\u00A0]*(--|–)[\\s#\u00A0]*" "–"))) ; en dash + + + ((make-replacer dashes) str)) + + +(define+provide/contract (smart-quotes str) + (string? . -> . string?) + + (define quotes + '((#px"(?<=\\w)'(?=\\w)" "’") ; apostrophe + (#px"(?string #\u00A0)] + #:minimum-word-length [minimum-word-length 6]) + ((txexpr?) (#:nbsp string? #:minimum-word-length integer?) . ->* . txexpr?) + + ;; todo: parameterize this, as it will be different for each project + (define tags-to-pay-attention-to '(p aside)) ; only apply to paragraphs + + (define (replace-last-space str) + (if (#\space . in? . str) + (let ([reversed-str-list (reverse (string->list str))] + [reversed-nbsp (reverse (string->list (->string nbsp)))]) + (define-values (last-word-chars other-chars) + (splitf-at reversed-str-list (λ(i) (not (eq? i #\space))))) + + (define front-chars (if (< (len last-word-chars) minimum-word-length) ; OK for long words to be on their own line + ; first char of other-chars will be the space, so use cdr + (string-append (list->string (reverse (cdr other-chars))) (->string nbsp)) + (list->string (reverse other-chars)))) + `(,front-chars (span [[pollen "no-hyphens"]] ,(list->string (reverse last-word-chars))))) + (list str))) + + (define (find-last-word-space x) ; recursively traverse xexpr + (cond + [(string? x) (replace-last-space x)] ; todo: this assumes a paragraph only has one string in it. + [(txexpr? x) + (let-values([(tag attr elements) (txexpr->values x)]) + (if (> (length elements) 0) ; elements is list of xexprs + (let-values ([(all-but-last last) (split-at elements (sub1 (length elements)))]) + (define result (find-last-word-space (car last))) + (define result-items (if (txexpr? result) (list result) result)) ; might be txexpr, or list of new elements + (make-txexpr tag attr `(,@all-but-last ,@result-items))) + x))] + [else x])) + + (if ((car x) . in? . tags-to-pay-attention-to) + (find-last-word-space x) + x)) + + +; wrap initial quotes for hanging punctuation +; todo: improve this +; does not handle
“thing properly
+(define+provide/contract (wrap-hanging-quotes nx
+ #:single-prepend [single-pp '(squo)]
+ #:double-prepend [double-pp '(dquo)])
+ ((txexpr?) (#:single-prepend list? #:double-prepend list?) . ->* . txexpr?)
+
+ (define two-or-more-char-string? (λ(i) (and (string? i) (>= (len i) 2))))
+ (define-values (tag attr elements) (txexpr->values nx))
+ (make-txexpr tag attr
+ (if (and (list? elements) (not (empty? elements)))
+ (let ([new-car-elements (match (car elements)
+ [(? two-or-more-char-string? tcs)
+ (define str-first (get tcs 0))
+ (define str-rest (get tcs 1 'end))
+ (cond
+ [(str-first . in? . '("\"" "“"))
+ ;; can wrap with any inline tag
+ ;; so that linebreak detection etc still works
+ `(,@double-pp ,(->string #\“) ,str-rest)]
+ [(str-first . in? . '("\'" "‘"))
+ `(,@single-pp ,(->string #\‘) ,str-rest)]
+ [else tcs])]
+ [(? txexpr? nx) (wrap-hanging-quotes nx)]
+ [else (car elements)])])
+ (cons new-car-elements (cdr elements)))
+ elements)))
+
+
+
+
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Lines, blocks, paragraphs
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+;; turn the right items into
tags
+(define+provide/contract (detect-linebreaks xc
+ #:separator [newline world:linebreak-separator]
+ #:insert [linebreak '(br)])
+ ((txexpr-elements?) (#:separator string? #:insert xexpr?) . ->* . txexpr-elements?)
+ ;; todo: should this test be not block + not whitespace?
+ (define not-block? (λ(i) (not (block-txexpr? i))))
+ (filter-not empty?
+ (for/list ([i (len xc)])
+ (let ([item (get xc i)])
+ (cond
+ ;; skip first and last
+ [(or (= i 0) (= i (sub1 (len xc)))) item]
+ [(equal? item newline)
+ (match (get xc (- i 1) (+ i 2)) ; a three-element slice with x[i] in the middle
+ ;; only convert if neither adjacent tag is a block
+ ;; (because blocks automatically force a newline before & after)
+ [(list (? not-block?) newline (? not-block?)) linebreak]
+ [else empty])] ; otherwise delete
+ [else item])))))
+
+
+
+;; recursive whitespace test
+(define+provide/contract (whitespace? x)
+ (any/c . -> . coerce/boolean?)
+ (cond
+ [(equal? "" x) #t] ; empty string is deemed whitespace
+ [(or (string? x) (symbol? x)) (regexp-match #px"^\\s+$" (->string x))]
+ [(or (list? x) (vector? x)) (andmap whitespace? (->list x))]
+ [else #f]))
+
+
+(define+provide/contract (whitespace/nbsp? x)
+ (any/c . -> . coerce/boolean?)
+ (or (whitespace? x) (equal? (->string x) (->string #\u00A0))))
+
+;; is x a paragraph break?
+(define+provide/contract (paragraph-break? x #:separator [sep world:paragraph-separator])
+ ((any/c) (#:separator pregexp?) . ->* . coerce/boolean?)
+ (define paragraph-pattern (pregexp (format "^~a+$" sep)))
+ (and (string? x) (regexp-match paragraph-pattern x)))
+
+
+
+(define (newline? x)
+ (and (string? x) (equal? world:newline x)))
+(define (not-newline? x)
+ (not (newline? x)))
+
+(define (do-merge xs [acc '()])
+ (if (empty? xs)
+ acc
+ ;; Try to peel the newlines off the front.
+ (let-values ([(leading-newlines remainder) (splitf-at xs newline?)])
+ (if (not (empty? leading-newlines)) ; if you got newlines ...
+ ;; combine them into a string and append them to the accumulator,
+ ;; and recurse on the rest
+ (do-merge remainder (append acc (list (apply string-append leading-newlines))))
+ ;; otherwise peel off elements up to the next newline, append them to accumulator,
+ ;; and recurse on the rest
+ (do-merge (dropf remainder not-newline?)
+ (append acc (takef remainder not-newline?)))))))
+
+
+;; Find adjacent newline characters in a list and merge them into one item
+;; Scribble, by default, makes each newline a separate list item
+;; In practice, this is worthless.
+(define+provide/contract (merge-newlines x)
+ (txexpr-elements? . -> . txexpr-elements?)
+ (cond
+ [(list? x) (do-merge (map merge-newlines x))]
+ [else x]))
+
+
+
+
+
+;; detect paragraphs
+;; todo: unit tests
+(define+provide/contract (detect-paragraphs elements #:tag [tag 'p]
+ #:separator [sep world:paragraph-separator]
+ #:linebreak-proc [linebreak-proc detect-linebreaks])
+ ((txexpr-elements?) (#:tag symbol? #:separator string? #:linebreak-proc (txexpr-elements? . -> . txexpr-elements?))
+ . ->* . txexpr-elements?)
+
+ ;; prepare elements for paragraph testing
+ (define (prep-paragraph-flow xc)
+ (linebreak-proc (merge-newlines (trim xc whitespace?))))
+
+
+ (define my-paragraph-break? (λ(x) (and (paragraph-break? x #:separator sep) #t)))
+
+ (define (wrap-paragraph xc)
+ (match xc
+ [(list (? block-txexpr? bx)) bx] ; leave a single block xexpr alone
+ [else (make-txexpr tag empty xc)])) ; otherwise wrap in p tag
+
+
+ (let ([elements (prep-paragraph-flow elements)])
+ (if (ormap my-paragraph-break? elements) ; need this condition to prevent infinite recursion
+ (map wrap-paragraph (splitf-at* elements my-paragraph-break?)) ; split into ¶¶
+ elements)))
diff --git a/decode/block.rkt b/decode/block.rkt
deleted file mode 100644
index 39d5ef8..0000000
--- a/decode/block.rkt
+++ /dev/null
@@ -1,18 +0,0 @@
-#lang racket/base
-(require (prefix-in html: css-tools/html) sugar/define txexpr)
-
-
-;; initial set of block tags: from html
-(define+provide project-block-tags
- (make-parameter html:block-tags))
-
-
-;; tags are inline unless they're registered as block tags.
-(define+provide/contract (block-txexpr? x)
- (any/c . -> . boolean?)
- (and (txexpr? x) (member (get-tag x) (project-block-tags)) #t))
-
-
-(define+provide/contract (register-block-tag tag)
- (txexpr-tag? . -> . void?)
- (project-block-tags (cons tag (project-block-tags))))
diff --git a/decode/typography.rkt b/decode/typography.rkt
deleted file mode 100644
index 4c309f8..0000000
--- a/decode/typography.rkt
+++ /dev/null
@@ -1,222 +0,0 @@
-#lang racket/base
-(require racket/list racket/match xml)
-(require "block.rkt" "../world.rkt" sugar txexpr)
-
-
-(define (make-replacer query+replacement)
- (let ([queries (map car query+replacement)]
- [replacements (map second query+replacement)])
- ;; reverse because first in list should be first applied to str (and compose1 works right-to-left)
- (apply compose1 (reverse (map (λ(query replacement) (λ(str) (regexp-replace* query str replacement))) queries replacements)))))
-
-(define+provide/contract (smart-dashes str)
- (string? . -> . string?)
-
- (define dashes
- ;; fix em dashes first, else they'll be mistaken for en dashes
- ;; \\s is whitespace + #\u00A0 is nonbreaking space
- '((#px"[\\s#\u00A0]*(---|—)[\\s#\u00A0]*" "—") ; em dash
- (#px"[\\s#\u00A0]*(--|–)[\\s#\u00A0]*" "–"))) ; en dash
-
-
- ((make-replacer dashes) str))
-
-
-(define+provide/contract (smart-quotes str)
- (string? . -> . string?)
-
- (define quotes
- '((#px"(?<=\\w)'(?=\\w)" "’") ; apostrophe
- (#px"(?string #\u00A0)]
- #:minimum-word-length [minimum-word-length 6])
- ((txexpr?) (#:nbsp string? #:minimum-word-length integer?) . ->* . txexpr?)
-
- ;; todo: parameterize this, as it will be different for each project
- (define tags-to-pay-attention-to '(p aside)) ; only apply to paragraphs
-
- (define (replace-last-space str)
- (if (#\space . in? . str)
- (let ([reversed-str-list (reverse (string->list str))]
- [reversed-nbsp (reverse (string->list (->string nbsp)))])
- (define-values (last-word-chars other-chars)
- (splitf-at reversed-str-list (λ(i) (not (eq? i #\space)))))
-
- (define front-chars (if (< (len last-word-chars) minimum-word-length) ; OK for long words to be on their own line
- ; first char of other-chars will be the space, so use cdr
- (string-append (list->string (reverse (cdr other-chars))) (->string nbsp))
- (list->string (reverse other-chars))))
- `(,front-chars (span [[pollen "no-hyphens"]] ,(list->string (reverse last-word-chars)))))
- (list str)))
-
- (define (find-last-word-space x) ; recursively traverse xexpr
- (cond
- [(string? x) (replace-last-space x)] ; todo: this assumes a paragraph only has one string in it.
- [(txexpr? x)
- (let-values([(tag attr elements) (txexpr->values x)])
- (if (> (length elements) 0) ; elements is list of xexprs
- (let-values ([(all-but-last last) (split-at elements (sub1 (length elements)))])
- (define result (find-last-word-space (car last)))
- (define result-items (if (txexpr? result) (list result) result)) ; might be txexpr, or list of new elements
- (make-txexpr tag attr `(,@all-but-last ,@result-items)))
- x))]
- [else x]))
-
- (if ((car x) . in? . tags-to-pay-attention-to)
- (find-last-word-space x)
- x))
-
-
-; wrap initial quotes for hanging punctuation
-; todo: improve this
-; does not handle
“thing properly
-(define+provide/contract (wrap-hanging-quotes nx
- #:single-prepend [single-pp '(squo)]
- #:double-prepend [double-pp '(dquo)])
- ((txexpr?) (#:single-prepend list? #:double-prepend list?) . ->* . txexpr?)
-
- (define two-or-more-char-string? (λ(i) (and (string? i) (>= (len i) 2))))
- (define-values (tag attr elements) (txexpr->values nx))
- (make-txexpr tag attr
- (if (and (list? elements) (not (empty? elements)))
- (let ([new-car-elements (match (car elements)
- [(? two-or-more-char-string? tcs)
- (define str-first (get tcs 0))
- (define str-rest (get tcs 1 'end))
- (cond
- [(str-first . in? . '("\"" "“"))
- ;; can wrap with any inline tag
- ;; so that linebreak detection etc still works
- `(,@double-pp ,(->string #\“) ,str-rest)]
- [(str-first . in? . '("\'" "‘"))
- `(,@single-pp ,(->string #\‘) ,str-rest)]
- [else tcs])]
- [(? txexpr? nx) (wrap-hanging-quotes nx)]
- [else (car elements)])])
- (cons new-car-elements (cdr elements)))
- elements)))
-
-
-
-
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-;; Lines, blocks, paragraphs
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-
-;; turn the right items into
tags
-(define+provide/contract (detect-linebreaks xc
- #:separator [newline world:linebreak-separator]
- #:insert [linebreak '(br)])
- ((txexpr-elements?) (#:separator string? #:insert xexpr?) . ->* . txexpr-elements?)
- ;; todo: should this test be not block + not whitespace?
- (define not-block? (λ(i) (not (block-txexpr? i))))
- (filter-not empty?
- (for/list ([i (len xc)])
- (let ([item (get xc i)])
- (cond
- ;; skip first and last
- [(or (= i 0) (= i (sub1 (len xc)))) item]
- [(equal? item newline)
- (match (get xc (- i 1) (+ i 2)) ; a three-element slice with x[i] in the middle
- ;; only convert if neither adjacent tag is a block
- ;; (because blocks automatically force a newline before & after)
- [(list (? not-block?) newline (? not-block?)) linebreak]
- [else empty])] ; otherwise delete
- [else item])))))
-
-
-
-;; recursive whitespace test
-(define+provide/contract (whitespace? x)
- (any/c . -> . coerce/boolean?)
- (cond
- [(equal? "" x) #t] ; empty string is deemed whitespace
- [(or (string? x) (symbol? x)) (regexp-match #px"^\\s+$" (->string x))]
- [(or (list? x) (vector? x)) (andmap whitespace? (->list x))]
- [else #f]))
-
-
-(define+provide/contract (whitespace/nbsp? x)
- (any/c . -> . coerce/boolean?)
- (or (whitespace? x) (equal? (->string x) (->string #\u00A0))))
-
-;; is x a paragraph break?
-(define+provide/contract (paragraph-break? x #:separator [sep world:paragraph-separator])
- ((any/c) (#:separator pregexp?) . ->* . coerce/boolean?)
- (define paragraph-pattern (pregexp (format "^~a+$" sep)))
- (and (string? x) (regexp-match paragraph-pattern x)))
-
-
-
-(define (newline? x)
- (and (string? x) (equal? world:newline x)))
-(define (not-newline? x)
- (not (newline? x)))
-
-(define (do-merge xs [acc '()])
- (if (empty? xs)
- acc
- ;; Try to peel the newlines off the front.
- (let-values ([(leading-newlines remainder) (splitf-at xs newline?)])
- (if (not (empty? leading-newlines)) ; if you got newlines ...
- ;; combine them into a string and append them to the accumulator,
- ;; and recurse on the rest
- (do-merge remainder (append acc (list (apply string-append leading-newlines))))
- ;; otherwise peel off elements up to the next newline, append them to accumulator,
- ;; and recurse on the rest
- (do-merge (dropf remainder not-newline?)
- (append acc (takef remainder not-newline?)))))))
-
-
-;; Find adjacent newline characters in a list and merge them into one item
-;; Scribble, by default, makes each newline a separate list item
-;; In practice, this is worthless.
-(define+provide/contract (merge-newlines x)
- (txexpr-elements? . -> . txexpr-elements?)
- (cond
- [(list? x) (do-merge (map merge-newlines x))]
- [else x]))
-
-
-
-
-
-;; detect paragraphs
-;; todo: unit tests
-(define+provide/contract (detect-paragraphs elements #:tag [tag 'p]
- #:separator [sep world:paragraph-separator]
- #:linebreak-proc [linebreak-proc detect-linebreaks])
- ((txexpr-elements?) (#:tag symbol? #:separator string? #:linebreak-proc (txexpr-elements? . -> . txexpr-elements?))
- . ->* . txexpr-elements?)
-
- ;; prepare elements for paragraph testing
- (define (prep-paragraph-flow xc)
- (linebreak-proc (merge-newlines (trim xc whitespace?))))
-
-
- (define my-paragraph-break? (λ(x) (and (paragraph-break? x #:separator sep) #t)))
-
- (define (wrap-paragraph xc)
- (match xc
- [(list (? block-txexpr? bx)) bx] ; leave a single block xexpr alone
- [else (make-txexpr tag empty xc)])) ; otherwise wrap in p tag
-
-
- (let ([elements (prep-paragraph-flow elements)])
- (if (ormap my-paragraph-break? elements) ; need this condition to prevent infinite recursion
- (map wrap-paragraph (splitf-at* elements my-paragraph-break?)) ; split into ¶¶
- elements)))
\ No newline at end of file
diff --git a/decode/tests.rkt b/tests/tests-decode.rkt
similarity index 98%
rename from decode/tests.rkt
rename to tests/tests-decode.rkt
index 2ebd952..614458b 100644
--- a/decode/tests.rkt
+++ b/tests/tests-decode.rkt
@@ -1,5 +1,5 @@
#lang racket/base
-(require pollen/decode pollen/predicates)
+(require pollen/decode)
(module+ test (require rackunit))
(module+ test