diff --git a/decode.rkt b/decode.rkt index 06fa4d2..4d128b5 100644 --- a/decode.rkt +++ b/decode.rkt @@ -35,12 +35,12 @@ (#:txexpr-tag-proc (txexpr-tag? . -> . txexpr-tag?) #:txexpr-attrs-proc (txexpr-attrs? . -> . txexpr-attrs?) #:txexpr-elements-proc (txexpr-elements? . -> . txexpr-elements?) - #:block-txexpr-proc (block-txexpr? . -> . block-txexpr?) - #:inline-txexpr-proc (txexpr? . -> . txexpr?) - #:string-proc (string? . -> . string?) - #:symbol-proc (symbol? . -> . symbol?) - #:valid-char-proc (valid-char? . -> . valid-char?) - #:cdata-proc (cdata? . -> . cdata?) + #:block-txexpr-proc (block-txexpr? . -> . xexpr?) + #:inline-txexpr-proc (txexpr? . -> . xexpr?) + #:string-proc (string? . -> . xexpr?) + #:symbol-proc (symbol? . -> . xexpr?) + #:valid-char-proc (valid-char? . -> . xexpr?) + #:cdata-proc (cdata? . -> . xexpr?) #:exclude-tags (listof symbol?) ) . ->* . txexpr?) diff --git a/decode/typography.rkt b/decode/typography.rkt index 928d04f..d2dbbcc 100644 --- a/decode/typography.rkt +++ b/decode/typography.rkt @@ -1,65 +1,45 @@ #lang racket/base -(require racket/match) -(require "../tools.rkt" "block.rkt" sugar txexpr) +(require racket/match xml) +(require "../tools.rkt" "block.rkt" "../world.rkt" sugar txexpr) -(provide (contract-out - [typogrify (string? . -> . string?)] - [nonbreaking-last-space ((txexpr?) (#:nbsp string? #:minimum-word-length integer?) . ->* . txexpr?)] - [wrap-hanging-quotes ((txexpr?) (#:single-prepend list? #:double-prepend list?) . ->* . txexpr?)] - [convert-linebreaks ((txexpr-elements?) (#:newline string?) . ->* . txexpr-elements?)] - [whitespace? (any/c . -> . boolean?)] - [paragraph-break? ((any/c) (#:pattern pregexp?) . ->* . boolean?)] - [merge-newlines (list? . -> . list?)] - [prep-paragraph-flow (txexpr-elements? . -> . txexpr-elements?)] - [wrap-paragraph ((txexpr-elements?) (#:tag symbol?) . ->* . block-txexpr?)] - [detect-paragraphs (txexpr-elements? . -> . 
txexpr-elements?)])) +(define (make-replacer query+replacement) + (let ([queries (map car query+replacement)] + [replacements (map second query+replacement)]) + ;; reverse because first in list should be first applied to str (and compose1 works right-to-left) + (apply compose1 (reverse (map (λ(query replacement) (λ(str) (regexp-replace* query str replacement))) queries replacements))))) -;; This module is a library of functions to be used in building pollen decoders. - - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -;; Typography - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - - - -;; insert typographic niceties -;; ligatures are handled in css -(define (typogrify str) - ;; make set of functions for replacers - (define (make-replacer query replacement) - (λ(str) (regexp-replace* query str replacement))) +(define+provide/contract (smart-dashes str) + (string? . -> . string?) - ;; just store the query strings + replacement strings (define dashes ;; fix em dashes first, else they'll be mistaken for en dashes - ;; [\\s ] is whitespace + #\u00A0 is nonbreaking space + ;; \\s is whitespace + #\u00A0 is nonbreaking space '((#px"[\\s#\u00A0]*(---|—)[\\s#\u00A0]*" "—") ; em dash (#px"[\\s#\u00A0]*(--|–)[\\s#\u00A0]*" "–"))) ; en dash - (define smart-quotes + + ((make-replacer dashes) str)) + + +(define+provide/contract (smart-quotes str) + (string? . -> . string?) + + (define quotes '((#px"(?<=\\w)'(?=\\w)" "’") ; apostrophe (#px"(?string #\u00A0)] - #:minimum-word-length [minimum-word-length 6]) +(define+provide/contract (nonbreaking-last-space x #:nbsp [nbsp (->string #\u00A0)] + #:minimum-word-length [minimum-word-length 6]) + ((txexpr?) (#:nbsp string? #:minimum-word-length integer?) . ->* . txexpr?) ;; todo: parameterize this, as it will be different for each project (define tags-to-pay-attention-to '(p aside)) ; only apply to paragraphs @@ -99,9 +79,10 @@ ; wrap initial quotes for hanging punctuation ; todo: improve this ; does not handle
“thing properly
-(define (wrap-hanging-quotes nx
- #:single-prepend [single-pp '(squo)]
- #:double-prepend [double-pp '(dquo)])
+(define+provide/contract (wrap-hanging-quotes nx
+ #:single-prepend [single-pp '(squo)]
+ #:double-prepend [double-pp '(dquo)])
+ ((txexpr?) (#:single-prepend list? #:double-prepend list?) . ->* . txexpr?)
(define two-or-more-char-string? (λ(i) (and (string? i) (>= (len i) 2))))
(define-values (tag attr elements) (txexpr->values nx))
@@ -137,8 +118,10 @@
;; turn the right items into
tags
-(define (convert-linebreaks xc #:newline [newline "\n"])
-
+(define+provide/contract (convert-linebreaks xc
+ #:separator [newline world:linebreak-separator]
+ #:linebreak [linebreak '(br)])
+ ((txexpr-elements?) (#:separator string? #:linebreak xexpr?) . ->* . txexpr-elements?)
;; todo: should this test be not block + not whitespace?
(define not-block? (λ(i) (not (block-txexpr? i))))
(filter-not empty?
@@ -151,86 +134,85 @@
(match (get xc (- i 1) (+ i 2)) ; a three-element slice with x[i] in the middle
;; only convert if neither adjacent tag is a block
;; (because blocks automatically force a newline before & after)
- [(list (? not-block?) newline (? not-block?)) '(br)]
+ [(list (? not-block?) newline (? not-block?)) linebreak]
[else empty])] ; otherwise delete
[else item])))))
;; recursive whitespace test
-(define (whitespace? x)
+(define+provide/contract (whitespace? x)
+ (any/c . -> . coerce/boolean?)
+ ;; Recursive test: strings and symbols are checked against a whitespace-only pattern,
+ ;; lists and vectors must consist solely of whitespace? members, anything else is #f.
+ ;; The string/symbol branch returns the raw `regexp-match` result (a list or #f),
+ ;; no longer wrapped in `->boolean`; presumably the `coerce/boolean?` output
+ ;; contract turns it into #t / #f at the module boundary -- TODO confirm.
(cond
[(equal? "" x) #t] ; empty string is deemed whitespace
- [(or (string? x) (symbol? x)) (->boolean (regexp-match #px"^\\s+$" (->string x)))]
+ [(or (string? x) (symbol? x)) (regexp-match #px"^\\s+$" (->string x))]
[(or (list? x) (vector? x)) (andmap whitespace? (->list x))]
[else #f]))
;; is x a paragraph break?
-(define (paragraph-break? x #:pattern [paragraph-pattern #px"^\n\n+$"])
-
- (and (string? x) (->boolean (regexp-match paragraph-pattern x))))
+;; Is X a paragraph break? True only for a string that matches ^SEP+$ in full.
+;; The `+` binds to the last regexp atom of SEP, so the default separator
+;; ("\n\n") accepts two or more newlines.
+;; SEP is spliced into the pattern unescaped: regexp metacharacters in it stay live.
+(define (paragraph-break? x #:separator [sep world:paragraph-separator])
+  ; ((any/c) (#:separator pregexp?) . ->* . coerce/boolean?)
+  (define paragraph-pattern (pregexp (format "^~a+$" sep)))
+  ;; `regexp-match?` yields a strict boolean, so this predicate returns #t / #f
+  ;; rather than leaking the list produced by `regexp-match`.
+  (and (string? x) (regexp-match? paragraph-pattern x)))
+;; A newline item is a string equal to world:newline; anything else is not.
+(define (newline? x)
+  (if (string? x)
+      (equal? world:newline x)
+      #f))
+;; Complement of newline?.
+(define (not-newline? x)
+  (if (newline? x) #f #t))
+
+;; Collapse each run of adjacent newline strings in XS into one concatenated string,
+;; leaving every other element untouched and in order. ACC holds already-processed
+;; elements that are emitted ahead of the result (callers normally omit it).
+(define (do-merge xs [acc '()])
+  ;; Accumulate in reverse and flip once at the end: appending to the tail of the
+  ;; accumulator on every step re-copies it each time (quadratic in list length).
+  (let loop ([xs xs] [reversed-acc (reverse acc)])
+    (if (empty? xs)
+        (reverse reversed-acc)
+        ;; Try to peel the newlines off the front.
+        (let-values ([(leading-newlines remainder) (splitf-at xs newline?)])
+          (if (not (empty? leading-newlines)) ; if you got newlines ...
+              ;; combine them into one string, push it, and continue with the rest
+              (loop remainder
+                    (cons (apply string-append leading-newlines) reversed-acc))
+              ;; otherwise peel off everything up to the next newline in one pass
+              ;; (nonempty here, since the head of xs is not a newline)
+              (let-values ([(run rest-xs) (splitf-at remainder not-newline?)])
+                (loop rest-xs (append (reverse run) reversed-acc))))))))
;; Find adjacent newline characters in a list and merge them into one item
;; Scribble, by default, makes each newline a separate list item
;; In practice, this is worthless.
(define (merge-newlines x)
-
- (define (newline? x)
- (and (string? x) (equal? "\n" x)))
- (define (not-newline? x)
- (not (newline? x)))
-
- (define (really-merge-newlines xs [acc '()])
- (if (empty? xs)
- acc
- ;; Try to peel the newlines off the front.
- (let-values ([(leading-newlines remainder) (splitf-at xs newline?)])
- (if (not (empty? leading-newlines)) ; if you got newlines ...
- ;; combine them into a string and append them to the accumulator,
- ;; and recurse on the rest
- (really-merge-newlines remainder (append acc (list (apply string-append leading-newlines))))
- ;; otherwise peel off elements up to the next newline, append them to accumulator,
- ;; and recurse on the rest
- (really-merge-newlines (dropf remainder not-newline?)
- (append acc (takef remainder not-newline?)))))))
-
+ ;; Recursively walk X: every list, at any depth, has its members processed first and
+ ;; is then passed through `do-merge`; non-list values are returned unchanged.
+ ;; Intended contract, kept as a comment: this is a plain `define`, so a bare contract
+ ;; form here would only build and discard a contract object on every (recursive) call.
+ ;; (txexpr-elements? . -> . txexpr-elements?)
(cond
- [(list? x) (really-merge-newlines (map merge-newlines x))]
+ [(list? x) (do-merge (map merge-newlines x))]
[else x]))
-;; todo: add native support for list-xexpr
-;; decode triple newlines to list items
-
-
-;; prepare elements for paragraph testing
-(define (prep-paragraph-flow xc)
-
- (convert-linebreaks (merge-newlines (trim xc whitespace?))))
-
-
-;; apply paragraph tag
-(define (wrap-paragraph xc #:tag [tag 'p])
-
- (match xc
- [(list (? block-txexpr? bx)) bx] ; leave a single block xexpr alone
- [else (make-txexpr tag empty xc)])) ; otherwise wrap in p tag
-
-
-
;; detect paragraphs
;; todo: unit tests
-(define (detect-paragraphs elements)
+;; Split ELEMENTS at paragraph-break strings and wrap each resulting group.
+;; #:tag             tag given to each wrapped group (default 'p)
+;; #:separator       separator string handed to `paragraph-break?`
+;; #:linebreak-proc  procedure applied to the trimmed, newline-merged elements
+;;                   before splitting (default `convert-linebreaks`)
+;; When no element is a paragraph break, the prepared elements are returned unwrapped.
+(define+provide/contract (detect-paragraphs elements #:tag [tag 'p]
+ #:separator [sep world:paragraph-separator]
+ #:linebreak-proc [linebreak-proc convert-linebreaks])
+ ((txexpr-elements?) (#:tag symbol? #:separator string? #:linebreak-proc procedure?)
+ . ->* . txexpr-elements?)
+
+ ;; prepare elements for paragraph testing:
+ ;; trim with `whitespace?`, merge adjacent newlines, then run `linebreak-proc`
+ (define (prep-paragraph-flow xc)
+ (linebreak-proc (merge-newlines (trim xc whitespace?))))
+
+
+ ;; `paragraph-break?` specialized to this call's separator; the `(and ... #t)`
+ ;; guarantees a strict #t / #f whatever `paragraph-break?` itself returns
+ (define my-paragraph-break? (λ(x) (and (paragraph-break? x #:separator sep) #t)))
+
+ (define (wrap-paragraph xc)
+ (match xc
+ [(list (? block-txexpr? bx)) bx] ; leave a single block xexpr alone
+ [else (make-txexpr tag empty xc)])) ; otherwise wrap in `tag`
+
(let ([elements (prep-paragraph-flow elements)])
- (if (ormap paragraph-break? elements) ; need this condition to prevent infinite recursion
- (map wrap-paragraph (splitf-at* elements paragraph-break?)) ; split into ¶¶
+ (if (ormap my-paragraph-break? elements) ; need this condition to prevent infinite recursion
+ (map wrap-paragraph (splitf-at* elements my-paragraph-break?)) ; split into ¶¶
elements)))
\ No newline at end of file
diff --git a/scribblings/decode.scrbl b/scribblings/decode.scrbl
index 88f77fa..23da891 100644
--- a/scribblings/decode.scrbl
+++ b/scribblings/decode.scrbl
@@ -1,9 +1,91 @@
#lang scribble/manual
-@(require scribble/eval pollen/decode pollen/world (for-label racket (except-in pollen #%module-begin) pollen/world pollen/cache pollen/decode txexpr xml pollen/predicates pollen/decode/typography pollen/decode/block))
+@(require scribble/eval pollen/decode pollen/world (for-label racket (except-in pollen #%module-begin) pollen/world pollen/cache pollen/decode txexpr xml pollen/predicates pollen/decode/block))
@(define my-eval (make-base-eval))
-@(my-eval `(require pollen pollen/decode pollen/decode/typography pollen/decode/block))
+@(my-eval `(require pollen pollen/decode pollen/decode/block xml))
+
+@section{Typography}
+@defmodule[pollen/decode/typography]
+
+An assortment of typography & layout functions, designed to be used with @racket[decode]. These aren't hard to write. So if you like these, use them. If not, make your own.
+
+@defproc[
+(smart-quotes
+[str string?])
+string?]
+Convert straight quotes in @racket[_str] to curly according to American English conventions.
+
+@examples[#:eval my-eval
+(define tricky-string
+"\"Why,\" she could've asked, \"are we in O‘ahu watching 'Mame'?\"")
+(display tricky-string)
+(display (smart-quotes tricky-string))
+]
+
+@defproc[
+(smart-dashes
+[str string?])
+string?]
+In @racket[_str], convert three hyphens to an em dash, and two hyphens to an en dash, and remove surrounding spaces.
+
+@examples[#:eval my-eval
+(define tricky-string "I had a few --- OK, like 6--8 --- thin mints.")
+(display tricky-string)
+(display (smart-dashes tricky-string))
+]
+
+
+@defproc[
+(convert-linebreaks
+[tagged-xexpr-elements txexpr-elements?]
+[#:separator linebreak-sep string? world:linebreak-separator]
+[#:linebreak linebreak xexpr? '(br)])
+txexpr-elements?]
+Within @racket[_tagged-xexpr-elements], convert occurrences of @racket[_linebreak-sep] (@racket["\n"] by default) to @racket[_linebreak], but only if @racket[_linebreak-sep] is not adjacent to a block (see @racket[block-txexpr?]). Why? Because block-level elements automatically display on a new line, so adding @racket[_linebreak] would be superfluous. In that case, @racket[_linebreak-sep] just disappears.
+
+@examples[#:eval my-eval
+(convert-linebreaks '(div "Two items:" "\n" (em "Eggs") "\n" (em "Bacon")))
+(convert-linebreaks '(div "Two items:" "\n" (div "Eggs") "\n" (div "Bacon")))
+]
+
+@defproc[
+(whitespace?
+[v any/c])
+boolean?]
+Returns @racket[#t] for any stringlike @racket[_v] that's entirely whitespace, but also the empty string, as well as lists and vectors that are made only of @racket[whitespace?] members.
+
+@examples[#:eval my-eval
+(whitespace? "\n\n ")
+(whitespace? (string->symbol "\n\n "))
+(whitespace? "")
+(whitespace? '("" " " "\n\n\n" " \n"))
+]
+
+@defproc[
+(detect-paragraphs
+[elements txexpr-elements?]
+[#:tag paragraph-tag symbol? 'p]
+[#:separator paragraph-sep string? world:paragraph-separator]
+[#:linebreak-proc linebreak-proc procedure? convert-linebreaks])
+txexpr-elements?]
+Find paragraphs within @racket[_elements], as denoted by @racket[_paragraph-sep], and wrap them with @racket[_paragraph-tag], unless the paragraph consists of a single block (see @racket[block-txexpr?]), because in that case the wrapping is superfluous. As a consequence, if @racket[_paragraph-sep] occurs between two blocks, it's ignored.
+
+The @racket[_paragraph-tag] argument sets the tag used to wrap paragraphs.
+
+The @racket[_linebreak-proc] argument allows you to use a linebreaking procedure other than the usual @racket[convert-linebreaks].
+
+@examples[#:eval my-eval
+(detect-paragraphs '("First para" "\n\n" "Second para"))
+(detect-paragraphs '("First para" "\n\n" "Second para" "\n" "Second line"))
+(detect-paragraphs '("First para" "\n\n" (div "Second block")))
+(detect-paragraphs '((div "First block") "\n\n" (div "Second block")))
+(detect-paragraphs '("First para" "\n\n" "Second para") #:tag 'ns:p)
+(detect-paragraphs '("First para" "\n\n" "Second para" "\n" "Second line")
+#:linebreak-proc (λ(x) (convert-linebreaks x #:linebreak '(newline))))
+
+]
+
@section{Decode}
@@ -15,12 +97,12 @@
[#:txexpr-tag-proc txexpr-tag-proc (txexpr-tag? . -> . txexpr-tag?) (λ(tag) tag)]
[#:txexpr-attrs-proc txexpr-attrs-proc (txexpr-attrs? . -> . txexpr-attrs?) (λ(attrs) attrs)]
[#:txexpr-elements-proc txexpr-elements-proc (txexpr-elements? . -> . txexpr-elements?) (λ(elements) elements)]
-[#:block-txexpr-proc block-txexpr-proc (block-txexpr? . -> . block-txexpr?) (λ(tx) tx)]
-[#:inline-txexpr-proc inline-txexpr-proc (txexpr? . -> . txexpr?) (λ(tx) tx)]
-[#:string-proc string-proc (string? . -> . string?) (λ(str) str)]
-[#:symbol-proc symbol-proc (symbol? . -> . symbol?) (λ(sym) sym)]
-[#:valid-char-proc valid-char-proc (valid-char? . -> . valid-char?) (λ(vc) vc)]
-[#:cdata-proc cdata-proc (cdata? . -> . cdata?) (λ(cdata) cdata)]
+[#:block-txexpr-proc block-txexpr-proc (block-txexpr? . -> . xexpr?) (λ(tx) tx)]
+[#:inline-txexpr-proc inline-txexpr-proc (txexpr? . -> . xexpr?) (λ(tx) tx)]
+[#:string-proc string-proc (string? . -> . xexpr?) (λ(str) str)]
+[#:symbol-proc symbol-proc (symbol? . -> . xexpr?) (λ(sym) sym)]
+[#:valid-char-proc valid-char-proc (valid-char? . -> . xexpr?) (λ(vc) vc)]
+[#:cdata-proc cdata-proc (cdata? . -> . xexpr?) (λ(cdata) cdata)]
[#:exclude-tags tags-to-exclude (listof symbol?) null]
)
txexpr?]
@@ -28,16 +110,26 @@ Recursively process a @racket[_tagged-xexpr], usually the one exported from a Po
@margin-note{This is different from the Scribble approach, where the decoding logic is fixed for every document. In Pollen, you only get the decoding you ask for, and you can customize it to any degree.}
-By default, the @racket[_tagged-xexpr] from a source file is tagged with @racket[root]. Recall from @secref{Pollen mechanics} that any tag can have a function attached to it. So the typical way to use @racket[decode] is to attach your decoding functions to it, and then define @racket[root] to invoke your @racket[decode] function. Then it will be automatically applied to every @racket['doc] during compile.
+By default, the @racket[_tagged-xexpr] from a source file is tagged with @racket[root]. Recall from @secref{Pollen mechanics} that any tag can have a function attached to it. So the typical way to use @racket[decode] is to attach your decoding functions to it, and then define @racket[root] to invoke your @racket[decode] function. Then it will be automatically applied to every @racket['doc] during compile.
+
+For instance, here's how @racket[decode] is attached to @racket['root] in @italic{Butterick's Practical Typography}:
+
+@codeblock|{
+(define (root . items)
+ (decode (make-txexpr 'root null items)
+ #:txexpr-elements-proc detect-paragraphs
+ #:block-txexpr-proc
+ (λ(bx) (wrap-hanging-quotes (nonbreaking-last-space bx)))
+ #:string-proc (compose1 smart-quotes smart-dashes)))}|
-While @racket[decode] presents an imposing list of arguments, you're unlikely to use all of them at once. These represent possibilities, not requirements. Let's see what happens when @racket[decode] is invoked without any of its optional arguments:
+That's it. Which illustrates another important point: even though @racket[decode] presents an imposing list of arguments, you're unlikely to use all of them at once. These represent possibilities, not requirements. To that end, let's see what happens when @racket[decode] is invoked without any of its optional arguments.
@examples[#:eval my-eval
(define tx '(root "I wonder" (em "why") "this works."))
(decode tx)
]
-Right — nothing. That's because the default value for the decoding arguments is the identity function, @racket[(λ(x)x)]. So everything gets passed through intact, until other action is specified.
+Right — nothing. That's because the default value for the decoding arguments is the identity function, @racket[(λ(x)x)]. So all the input gets passed through intact unless another action is specified.
The @racket[_txexpr-tag-proc] argument is a procedure that handles X-expression tags.
@@ -83,8 +175,27 @@ So why do you need @racket[_txexpr-elements-proc]? Because some types of element
]
-The @racket[_block-txexpr-proc] argument is a procedure that operates on tagged X-expressions that are deemed block-level (as opposed to inline) elements. That is, they meet the @racket[block-txexpr?] test. (See also @racket[register-block-tag].)
+The @racket[_block-txexpr-proc] and @racket[_inline-txexpr-proc] arguments are procedures that operate on tagged X-expressions. If the X-expression meets the @racket[block-txexpr?] test, it is processed by @racket[_block-txexpr-proc]. Otherwise, it is processed by @racket[_inline-txexpr-proc]. Thus every tagged X-expression will be handled by one or the other. Of course, if you want block and inline elements to be handled the same way, you can set @racket[_block-txexpr-proc] and @racket[_inline-txexpr-proc] to be the same procedure.
+
+@examples[#:eval my-eval
+(define tx '(div "Please" (em "mind the gap") (h1 "Tuesdays only")))
+(define add-ns (λ(tx) (cons (string->symbol (format "ns:~a" (car tx)))
+(cdr tx))))
+(decode tx #:block-txexpr-proc add-ns)
+(decode tx #:inline-txexpr-proc add-ns)
+(decode tx #:block-txexpr-proc add-ns #:inline-txexpr-proc add-ns)
+]
+
+The @racket[_string-proc], @racket[_symbol-proc], @racket[_valid-char-proc], and @racket[_cdata-proc] arguments are procedures that operate on X-expressions that are strings, symbols, valid-chars, and CDATA, respectively. Deliberately, the output contracts for these procedures accept any kind of X-expression (meaning, the procedure can change the X-expression type).
+@examples[#:eval my-eval
+(define tx `(div "Moe" amp 62 ,(cdata #f #f "3 > 2;")))
+(define rulify (λ(x) '(hr)))
+(decode tx #:string-proc rulify)
+(decode tx #:symbol-proc rulify)
+(decode tx #:valid-char-proc rulify)
+(decode tx #:cdata-proc rulify)
+]
@@ -111,7 +222,7 @@ The @racket[_tags-to-exclude] argument is useful if you're decoding source that'
@section{Blocks}
@defmodule[pollen/decode/block]
-Because it's convenient, Pollen categorizes tagged X-expressions into two categories: @italic{block} and @italic{inline}. Why is it convenient? When decoding, you often want to treat the two categories differently. Not that you have to. But this is how you can.
+Because it's convenient, Pollen categorizes tagged X-expressions into two categories: @italic{block} and @italic{inline}. Why is it convenient? When using @racket[decode], you often want to treat the two categories differently. Not that you have to. But this is how you can.
@defproc[
(register-block-tag
@@ -144,13 +255,11 @@ If you find the idea of registering block tags unbearable, good news. The @racke
]
-
-
@defproc[
(block-txexpr?
[v any/c])
boolean?]
-Predicate that tests whether @racket[_v] is a tagged X-expression, and if so, whether the tag is among the @racket[project-block-tags]. If not, it is treated as inline.
+Predicate that tests whether @racket[_v] is a tagged X-expression, and if so, whether the tag is among the @racket[project-block-tags]. If not, it is treated as inline. To adjust how this test works, use @racket[register-block-tag].
@defparam[project-block-tags block-tags (listof txexpr-tag?)
#:value html-block-tags]{
diff --git a/world.rkt b/world.rkt
index 7305473..9cb68c9 100644
--- a/world.rkt
+++ b/world.rkt
@@ -39,8 +39,9 @@
(define missing-file-boilerplace "#lang pollen\n\n")
-(define line-break "\n")
-(define paragraph-break "\n\n")
+;; Separator strings. `linebreak-separator` is an alias of `newline`, so changing
+;; `newline` changes both.
+;; NOTE(review): the name `newline` presumably shadows the standard `newline` output
+;; procedure inside this module -- confirm nothing here calls it as a procedure.
+(define newline "\n")
+(define linebreak-separator newline)
+(define paragraph-separator "\n\n")
(define output-subdir 'public)