From 4218102aedeaf85f1851b4fe2f094cb9c86fb0e2 Mon Sep 17 00:00:00 2001 From: Matthew Butterick Date: Fri, 21 Feb 2014 14:58:10 -0800 Subject: [PATCH] decode: fast versions --- decode.rkt | 375 +++---------------------------------- decode/fast.rkt | 63 +++++++ decode/tests.rkt | 72 +++++++ decode/typography-fast.rkt | 230 +++++++++++++++++++++++ main.rkt | 2 +- 5 files changed, 395 insertions(+), 347 deletions(-) create mode 100644 decode/fast.rkt create mode 100644 decode/tests.rkt create mode 100644 decode/typography-fast.rkt diff --git a/decode.rkt b/decode.rkt index 5639acf..4ef68d2 100644 --- a/decode.rkt +++ b/decode.rkt @@ -1,347 +1,30 @@ #lang racket/base -(require racket/contract racket/list racket/string racket/match) -(require (only-in xml xexpr/c)) -(require "tools.rkt" "predicates.rkt" sugar txexpr) - -(module+ test (require rackunit)) - -(provide (except-out (all-defined-out) decode register-block-tag)) - -;; general way of coercing to string -(define (to-string x) - (if (string? x) - x ; fast exit for strings - (with-handlers ([exn:fail? (λ(exn) (error (format "Pollen parser: can't convert ~a to ~a" x 'string)))]) - (cond - [(equal? '() x) ""] - [(symbol? x) (symbol->string x)] - [(number? x) (number->string x)] - [(path? x) (path->string x)] - [(char? x) (format "~a" x)] - [else (error)])))) ; put this last so other xexprish things don't get caught - - -;; add a block tag to the list -;; this function is among the predicates because it alters a predicate globally. -(define+provide/contract (register-block-tag tag) - (symbol? . -> . void?) - (append-block-tag tag)) - -(module+ test - (check-true (begin (register-block-tag 'barfoo) (block-xexpr? '(barfoo "foo"))))) - - -;; decoder wireframe -(define+provide/contract (decode nx - #:exclude-xexpr-tags [excluded-xexpr-tags '()] - #:xexpr-tag-proc [xexpr-tag-proc (λ(x)x)] - #:xexpr-attrs-proc [xexpr-attrs-proc (λ(x)x)] - #:xexpr-elements-proc [xexpr-elements-proc (λ(x)x)] - #:block-xexpr-proc [block-xexpr-proc (λ(x)x)] - #:inline-xexpr-proc [inline-xexpr-proc (λ(x)x)] - #:string-proc [string-proc (λ(x)x)]) - ((xexpr/c) ;; use xexpr/c for contract on nx because it gives better error messages - - ;; todo: how to write more specific contracts for these procedures? - ;; e.g., string-proc should be restricted to procs that accept a string as input - ;; and return a string as output - (#:exclude-xexpr-tags list? - #:xexpr-tag-proc procedure? - #:xexpr-attrs-proc procedure? - #:xexpr-elements-proc procedure? - #:block-xexpr-proc procedure? - #:inline-xexpr-proc procedure? - #:string-proc procedure?) - . ->* . txexpr?) - (when (not (txexpr? nx)) - (error (format "decode: ~v not a full txexpr" nx))) - - - (define (&decode x) - (cond - [(txexpr? x) (let-values([(tag attr elements) (txexpr->values x)]) - (if (tag . in? . excluded-xexpr-tags) - x ; let x pass through untouched - (let ([decoded-xexpr (apply make-txexpr - (map &decode (list tag attr elements)))]) - ((if (block-xexpr? decoded-xexpr) - block-xexpr-proc - inline-xexpr-proc) decoded-xexpr))))] - [(txexpr-tag? x) (xexpr-tag-proc x)] - [(txexpr-attrs? x) (xexpr-attrs-proc x)] - ;; need this for operations that may depend on context in list - [(txexpr-elements? x) (map &decode (xexpr-elements-proc x))] - [(string? x) (string-proc x)] - ;; if something has made it through undecoded, that's a problem - [else (error "Can't decode" x)])) - - - (&decode nx)) - - -;; This module is a library of functions to be used in building pollen decoders. - - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -;; Typography - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - - - -;; insert typographic niceties -;; ligatures are handled in css -(define (typogrify str) - (string? . -> . string?) - ;; make set of functions for replacers - (define (make-replacer query replacement) - (λ(str) (regexp-replace* query str replacement))) - - ;; just store the query strings + replacement strings - (define dashes - ;; fix em dashes first, else they'll be mistaken for en dashes - ;; [\\s ] is whitespace + nonbreaking space - '((#px"[\\s ]*(---|—)[\\s ]*" "—") ; em dash - (#px"[\\s ]*(--|–)[\\s ]*" "–"))) ; en dash - - (define smart-quotes - '((#px"(?<=\\w)'(?=\\w)" "’") ; apostrophe - (#px"(?string #\u00A0)] - #:minimum-word-length [minimum-word-length 6]) - ((txexpr?) (#:nbsp string? #:minimum-word-length integer?) . ->* . txexpr?) - ;; todo: parameterize this, as it will be different for each project - (define tags-to-pay-attention-to '(p aside)) ; only apply to paragraphs - - (define (replace-last-space str) - (if (#\space . in? . str) - (let ([reversed-str-list (reverse (string->list str))] - [reversed-nbsp (reverse (string->list nbsp))]) - (define-values (last-word-chars other-chars) - (splitf-at reversed-str-list (λ(i) (not (eq? i #\space))))) - (list->string (reverse (append last-word-chars - ; OK for long words to be on their own line. - (if (< (len last-word-chars) minimum-word-length) - ; first char of other-chars will be the space, so use cdr - (append reversed-nbsp (cdr other-chars)) - other-chars))))) - str)) - - (define (find-last-word-space x) ; recursively traverse xexpr - (cond - [(string? x) (replace-last-space x)] - [(txexpr? x) - (let-values([(tag attr elements) (txexpr->values x)]) - (if (> (length elements) 0) ; elements is list of xexprs - (let-values ([(all-but-last last) (split-at elements (sub1 (length elements)))]) - (make-txexpr tag attr `(,@all-but-last ,(find-last-word-space (car last))))) - x))] - [else x])) - - (if ((car x) . in? . tags-to-pay-attention-to) - (find-last-word-space x) - x)) - -;; todo: make some tougher tests, it gets flaky with edge cases -(module+ test - (check-equal? (nonbreaking-last-space '(p "Hi there")) '(p "Hi there")) ; nbsp in between last two words - (check-equal? (nonbreaking-last-space '(p "Hi there") #:nbsp "Ø") '(p "HiØthere")) ; but let's make it visible - (check-equal? (nonbreaking-last-space '(p "Hi there") #:nbsp "_up_") '(p "Hi_up_there")) - (check-equal? (nonbreaking-last-space '(p "Hi there") #:nbsp "_up_" #:minimum-word-length 3) - '(p "Hi there")) - (check-equal? (nonbreaking-last-space '(p "Hi here" (em "ho there")) #:nbsp "Ø") '(p "Hi here" (em "hoØthere")))) - - -; wrap initial quotes for hanging punctuation -; todo: improve this -; does not handle

thing properly -(define/contract (wrap-hanging-quotes nx - #:single-prepend [single-pp '(squo)] - #:double-prepend [double-pp '(dquo)]) - ((txexpr?) (#:single-prepend list? #:double-prepend list?) . ->* . txexpr?) - (define two-or-more-char-string? (λ(i) (and (string? i) (>= (len i) 2)))) - (define-values (tag attr elements) (txexpr->values nx)) - (make-txexpr tag attr - (if (and (list? elements) (not (empty? elements))) - (let ([new-car-elements (match (car elements) - [(? two-or-more-char-string? tcs) - (define str-first (get tcs 0)) - (define str-rest (get tcs 1 'end)) - (cond - [(str-first . in? . '("\"" "“")) - ;; can wrap with any inline tag - ;; so that linebreak detection etc still works - `(,@double-pp ,(->string #\“) ,str-rest)] - [(str-first . in? . '("\'" "‘")) - `(,@single-pp ,(->string #\‘) ,str-rest)] - [else tcs])] - [(? txexpr? nx) (wrap-hanging-quotes nx)] - [else (car elements)])]) - (cons new-car-elements (cdr elements))) - elements))) - - - -(module+ test - (check-equal? (wrap-hanging-quotes '(p "\"Hi\" there")) '(p (dquo "“" "Hi\" there"))) - (check-equal? (wrap-hanging-quotes '(p "'Hi' there")) '(p (squo "‘" "Hi' there"))) - (check-equal? (wrap-hanging-quotes '(p "'Hi' there") #:single-prepend '(foo ((bar "ino")))) - '(p (foo ((bar "ino")) "‘" "Hi' there"))) - ;; make sure txexpr without elements passes through unscathed - (check-equal? (wrap-hanging-quotes '(div ((style "height:2em")))) '(div ((style "height:2em"))))) - - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -;; Lines, blocks, paragraphs - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - - -;; turn the right items into
tags -(define/contract (convert-linebreaks xc #:newline [newline "\n"]) - ((txexpr-elements?) (#:newline string?) . ->* . txexpr-elements?) - ;; todo: should this test be not block + not whitespace? - (define not-block? (λ(i) (not (block-xexpr? i)))) - (filter-not empty? - (for/list ([i (len xc)]) - (let ([item (get xc i)]) - (cond - ;; skip first and last - [(or (= i 0) (= i (sub1 (len xc)))) item] - [(equal? item newline) - (match (get xc (- i 1) (+ i 2)) ; a three-element slice with x[i] in the middle - ;; only convert if neither adjacent tag is a block - ;; (because blocks automatically force a newline before & after) - [(list (? not-block?) newline (? not-block?)) '(br)] - [else empty])] ; otherwise delete - [else item]))))) - -(module+ test - (check-equal? (convert-linebreaks '("foo" "\n" "bar")) '("foo" (br) "bar")) - (check-equal? (convert-linebreaks '("\n" "foo" "\n" "bar" "\n")) '("\n" "foo" (br) "bar" "\n")) - (check-equal? (convert-linebreaks '((p "foo") "\n" (p "bar"))) '((p "foo") (p "bar"))) - (check-equal? (convert-linebreaks '("foo" "\n" (p "bar"))) '("foo" (p "bar"))) - (check-equal? (convert-linebreaks '("foo" "moo" "bar")) '("foo" "moo" "bar")) - (check-equal? (convert-linebreaks '("foo" "moo" "bar") #:newline "moo") '("foo" (br) "bar")) - (check-equal? (convert-linebreaks '("foo" "\n\n" "bar")) '("foo" "\n\n" "bar"))) - - - -;; recursive whitespace test -(define/contract (whitespace? x) - (any/c . -> . boolean?) - (cond - [(or (string? x) (symbol? x)) (->boolean (regexp-match #px"^\\s+$" (->string x)))] - [(equal? "" x) #t] ; empty string is deemed whitespace - [(or (list? x) (vector? x)) (andmap whitespace? (->list x))] - [else #f])) - - -;; is x a paragraph break? -(define/contract (paragraph-break? x #:pattern [paragraph-pattern #px"^\n\n+$"]) - ((any/c) (#:pattern pregexp?) . ->* . boolean?) - (and (string? x) (->boolean (regexp-match paragraph-pattern x)))) - -(module+ test - (check-false (paragraph-break? "foo")) - (check-false (paragraph-break? "\n")) - (check-false (paragraph-break? "\n \n")) - (check-true (paragraph-break? "\n \n" #:pattern #px"^\n \n$")) - (check-true (paragraph-break? "\n\n")) - (check-true (paragraph-break? "\n\n\n"))) - - - -;; Find adjacent newline characters in a list and merge them into one item -;; Scribble, by default, makes each newline a separate list item -;; In practice, this is worthless. -(define/contract (merge-newlines x) - (list? . -> . list?) - (define (newline? x) - (and (string? x) (equal? "\n" x))) - (define (not-newline? x) - (not (newline? x))) - - (define (really-merge-newlines xs [acc '()]) - (if (empty? xs) - acc - ;; Try to peel the newlines off the front. - (let-values ([(leading-newlines remainder) (splitf-at xs newline?)]) - (if (not (empty? leading-newlines)) ; if you got newlines ... - ;; combine them into a string and append them to the accumulator, - ;; and recurse on the rest - (really-merge-newlines remainder (append acc (list (string-join leading-newlines "")))) - ;; otherwise peel off elements up to the next newline, append them to accumulator, - ;; and recurse on the rest - (really-merge-newlines (dropf remainder not-newline?) - (append acc (takef remainder not-newline?))))))) - - (cond - [(list? x) (really-merge-newlines (map merge-newlines x))] - [else x])) - -(module+ test - (check-equal? (merge-newlines '(p "\n" "foo" "\n" "\n" "bar" (em "\n" "\n" "\n"))) - '(p "\n" "foo" "\n\n" "bar" (em "\n\n\n")))) - - - -;; todo: add native support for list-xexpr -;; decode triple newlines to list items - - -;; prepare elements for paragraph testing -(define/contract (prep-paragraph-flow xc) - (txexpr-elements? . -> . txexpr-elements?) - (convert-linebreaks (merge-newlines (trim xc whitespace?)))) - -(module+ test - (check-equal? (prep-paragraph-flow '("\n" "foo" "\n" "\n" "bar" "\n" "ino" "\n")) - '("foo" "\n\n" "bar" (br) "ino"))) - -;; apply paragraph tag -(define/contract (wrap-paragraph xc #:tag [tag 'p]) - ((txexpr-elements?) (#:tag symbol?) . ->* . block-xexpr?) - (match xc - [(list (? block-xexpr? bx)) bx] ; leave a single block xexpr alone - [else (make-txexpr tag empty xc)])) ; otherwise wrap in p tag - -(module+ test - (check-equal? (wrap-paragraph '("foo" "bar")) '(p "foo" "bar")) - (check-equal? (begin (append-block-tag 'para) (wrap-paragraph #:tag 'para '("foo" "bar"))) - '(para "foo" "bar")) - (check-equal? (wrap-paragraph '((p "bar" "foo"))) '(p "bar" "foo")) - (check-equal? (wrap-paragraph '((div "bar" "foo") "Hi" )) '(p (div "bar" "foo") "Hi"))) - - -;; detect paragraphs -;; todo: unit tests -(define/contract (detect-paragraphs elements) - (txexpr-elements? . -> . txexpr-elements?) - (let ([elements (prep-paragraph-flow elements)]) - (if (ormap paragraph-break? elements) ; need this condition to prevent infinite recursion - (map wrap-paragraph (splitf-at* elements paragraph-break?)) ; split into ¶¶ - elements))) \ No newline at end of file +(require racket/contract xml txexpr) +(require "decode/fast.rkt" "predicates.rkt" "decode/typography-fast.rkt") + +(provide to-string (contract-out [register-block-tag (symbol? . -> . void?)] + [decode ((xexpr/c) ;; use xexpr/c for contract on nx because it gives better error messages + + ;; todo: how to write more specific contracts for these procedures? + ;; e.g., string-proc should be restricted to procs that accept a string as input + ;; and return a string as output + (#:exclude-xexpr-tags list? + #:xexpr-tag-proc procedure? + #:xexpr-attrs-proc procedure? + #:xexpr-elements-proc procedure? + #:block-xexpr-proc procedure? + #:inline-xexpr-proc procedure? + #:string-proc procedure?) + . ->* . txexpr?)] + + [typogrify (string? . -> . string?)] + [nonbreaking-last-space ((txexpr?) (#:nbsp string? #:minimum-word-length integer?) . ->* . txexpr?)] + [wrap-hanging-quotes ((txexpr?) (#:single-prepend list? #:double-prepend list?) . ->* . txexpr?)] + [convert-linebreaks ((txexpr-elements?) (#:newline string?) . ->* . txexpr-elements?)] + [whitespace? (any/c . -> . boolean?)] + [paragraph-break? ((any/c) (#:pattern pregexp?) . ->* . boolean?)] + [merge-newlines (list? . -> . list?)] + [prep-paragraph-flow (txexpr-elements? . -> . txexpr-elements?)] + [wrap-paragraph ((txexpr-elements?) (#:tag symbol?) . ->* . block-xexpr?)] + [detect-paragraphs (txexpr-elements? . -> . txexpr-elements?)] + )) \ No newline at end of file diff --git a/decode/fast.rkt b/decode/fast.rkt new file mode 100644 index 0000000..682d565 --- /dev/null +++ b/decode/fast.rkt @@ -0,0 +1,63 @@ +#lang racket/base +(require racket/match xml) +(require "../tools.rkt" "../predicates.rkt" txexpr/fast "typography-fast.rkt") + + +(provide (all-defined-out) (all-from-out "typography-fast.rkt")) + +;; general way of coercing to string +(define (to-string x) + (if (string? x) + x ; fast exit for strings + (with-handlers ([exn:fail? (λ(exn) (error (format "Pollen parser: can't convert ~a to ~a" x 'string)))]) + (cond + [(equal? '() x) ""] + [(symbol? x) (symbol->string x)] + [(number? x) (number->string x)] + [(path? x) (path->string x)] + [(char? x) (format "~a" x)] + [else (error)])))) ; put this last so other xexprish things don't get caught + + +;; add a block tag to the list +;; this function is among the predicates because it alters a predicate globally. +(define (register-block-tag tag) + (append-block-tag tag)) + + + +;; decoder wireframe +(define (decode nx + #:exclude-xexpr-tags [excluded-xexpr-tags '()] + #:xexpr-tag-proc [xexpr-tag-proc (λ(x)x)] + #:xexpr-attrs-proc [xexpr-attrs-proc (λ(x)x)] + #:xexpr-elements-proc [xexpr-elements-proc (λ(x)x)] + #:block-xexpr-proc [block-xexpr-proc (λ(x)x)] + #:inline-xexpr-proc [inline-xexpr-proc (λ(x)x)] + #:string-proc [string-proc (λ(x)x)]) + + (when (not (txexpr? nx)) + (error (format "decode: ~v not a full txexpr" nx))) + + + (define (&decode x) + (cond + [(txexpr? x) (let-values([(tag attr elements) (txexpr->values x)]) + (if (member tag excluded-xexpr-tags) + x ; let x pass through untouched + (let ([decoded-xexpr (apply make-txexpr + (map &decode (list tag attr elements)))]) + ((if (block-xexpr? decoded-xexpr) + block-xexpr-proc + inline-xexpr-proc) decoded-xexpr))))] + [(txexpr-tag? x) (xexpr-tag-proc x)] + [(txexpr-attrs? x) (xexpr-attrs-proc x)] + ;; need this for operations that may depend on context in list + [(txexpr-elements? x) (map &decode (xexpr-elements-proc x))] + [(string? x) (string-proc x)] + ;; if something has made it through undecoded, that's a problem + [else (error "Can't decode" x)])) + + + (&decode nx)) + diff --git a/decode/tests.rkt b/decode/tests.rkt new file mode 100644 index 0000000..2ebd952 --- /dev/null +++ b/decode/tests.rkt @@ -0,0 +1,72 @@ +#lang racket/base +(require pollen/decode pollen/predicates) +(module+ test (require rackunit)) + +(module+ test + (check-true (begin (register-block-tag 'barfoo) (block-xexpr? '(barfoo "foo")))) + + (check-equal? (typogrify "I had --- maybe 13 -- 20 --- hob-nobs.") "I had—maybe 13–20—hob-nobs.") + (check-equal? (typogrify "\"Why,\" she could've asked, \"are we in O‘ahu watching 'Mame'?\"") + "“Why,” she could’ve asked, “are we in O‘ahu watching ‘Mame’?”")) + + + +;; todo: make some tougher tests, it gets flaky with edge cases +(module+ test + (check-equal? (nonbreaking-last-space '(p "Hi there")) '(p "Hi there")) ; nbsp in between last two words + (check-equal? (nonbreaking-last-space '(p "Hi there") #:nbsp "Ø") '(p "HiØthere")) ; but let's make it visible + (check-equal? (nonbreaking-last-space '(p "Hi there") #:nbsp "_up_") '(p "Hi_up_there")) + (check-equal? (nonbreaking-last-space '(p "Hi there") #:nbsp "_up_" #:minimum-word-length 3) + '(p "Hi there")) + (check-equal? (nonbreaking-last-space '(p "Hi here" (em "ho there")) #:nbsp "Ø") '(p "Hi here" (em "hoØthere")))) + + +(module+ test + (check-equal? (wrap-hanging-quotes '(p "\"Hi\" there")) '(p (dquo "“" "Hi\" there"))) + (check-equal? (wrap-hanging-quotes '(p "'Hi' there")) '(p (squo "‘" "Hi' there"))) + (check-equal? (wrap-hanging-quotes '(p "'Hi' there") #:single-prepend '(foo ((bar "ino")))) + '(p (foo ((bar "ino")) "‘" "Hi' there"))) + ;; make sure txexpr without elements passes through unscathed + (check-equal? (wrap-hanging-quotes '(div ((style "height:2em")))) '(div ((style "height:2em"))))) + + +(module+ test + (check-equal? (convert-linebreaks '("foo" "\n" "bar")) '("foo" (br) "bar")) + (check-equal? (convert-linebreaks '("\n" "foo" "\n" "bar" "\n")) '("\n" "foo" (br) "bar" "\n")) + (check-equal? (convert-linebreaks '((p "foo") "\n" (p "bar"))) '((p "foo") (p "bar"))) + (check-equal? (convert-linebreaks '("foo" "\n" (p "bar"))) '("foo" (p "bar"))) + (check-equal? (convert-linebreaks '("foo" "moo" "bar")) '("foo" "moo" "bar")) + (check-equal? (convert-linebreaks '("foo" "moo" "bar") #:newline "moo") '("foo" (br) "bar")) + (check-equal? (convert-linebreaks '("foo" "\n\n" "bar")) '("foo" "\n\n" "bar"))) + + + +(module+ test + (check-false (paragraph-break? "foo")) + (check-false (paragraph-break? "\n")) + (check-false (paragraph-break? "\n \n")) + (check-true (paragraph-break? "\n \n" #:pattern #px"^\n \n$")) + (check-true (paragraph-break? "\n\n")) + (check-true (paragraph-break? "\n\n\n"))) + + +(module+ test + (check-equal? (merge-newlines '(p "\n" "foo" "\n" "\n" "bar" (em "\n" "\n" "\n"))) + '(p "\n" "foo" "\n\n" "bar" (em "\n\n\n")))) + + + +;; todo: add native support for list-xexpr +;; decode triple newlines to list items + + +(module+ test + (check-equal? (prep-paragraph-flow '("\n" "foo" "\n" "\n" "bar" "\n" "ino" "\n")) + '("foo" "\n\n" "bar" (br) "ino"))) + +(module+ test + (check-equal? (wrap-paragraph '("foo" "bar")) '(p "foo" "bar")) + (check-equal? (begin (append-block-tag 'para) (wrap-paragraph #:tag 'para '("foo" "bar"))) + '(para "foo" "bar")) + (check-equal? (wrap-paragraph '((p "bar" "foo"))) '(p "bar" "foo")) + (check-equal? (wrap-paragraph '((div "bar" "foo") "Hi" )) '(p (div "bar" "foo") "Hi"))) diff --git a/decode/typography-fast.rkt b/decode/typography-fast.rkt new file mode 100644 index 0000000..c5b6132 --- /dev/null +++ b/decode/typography-fast.rkt @@ -0,0 +1,230 @@ +#lang racket/base +(require racket/match) +(require "../tools.rkt" "../predicates.rkt" sugar txexpr/fast) + + +(provide (all-defined-out)) + + +;; This module is a library of functions to be used in building pollen decoders. + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Typography + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + + +;; insert typographic niceties +;; ligatures are handled in css +(define (typogrify str) + ;; make set of functions for replacers + (define (make-replacer query replacement) + (λ(str) (regexp-replace* query str replacement))) + + ;; just store the query strings + replacement strings + (define dashes + ;; fix em dashes first, else they'll be mistaken for en dashes + ;; [\\s ] is whitespace + nonbreaking space + '((#px"[\\s ]*(---|—)[\\s ]*" "—") ; em dash + (#px"[\\s ]*(--|–)[\\s ]*" "–"))) ; en dash + + (define smart-quotes + '((#px"(?<=\\w)'(?=\\w)" "’") ; apostrophe + (#px"(?string #\u00A0)] + #:minimum-word-length [minimum-word-length 6]) + + ;; todo: parameterize this, as it will be different for each project + (define tags-to-pay-attention-to '(p aside)) ; only apply to paragraphs + + (define (replace-last-space str) + (if (#\space . in? . str) + (let ([reversed-str-list (reverse (string->list str))] + [reversed-nbsp (reverse (string->list nbsp))]) + (define-values (last-word-chars other-chars) + (splitf-at reversed-str-list (λ(i) (not (eq? i #\space))))) + (list->string (reverse (append last-word-chars + ; OK for long words to be on their own line. + (if (< (len last-word-chars) minimum-word-length) + ; first char of other-chars will be the space, so use cdr + (append reversed-nbsp (cdr other-chars)) + other-chars))))) + str)) + + (define (find-last-word-space x) ; recursively traverse xexpr + (cond + [(string? x) (replace-last-space x)] + [(txexpr? x) + (let-values([(tag attr elements) (txexpr->values x)]) + (if (> (length elements) 0) ; elements is list of xexprs + (let-values ([(all-but-last last) (split-at elements (sub1 (length elements)))]) + (make-txexpr tag attr `(,@all-but-last ,(find-last-word-space (car last))))) + x))] + [else x])) + + (if ((car x) . in? . tags-to-pay-attention-to) + (find-last-word-space x) + x)) + + + + +; wrap initial quotes for hanging punctuation +; todo: improve this +; does not handle

thing properly +(define (wrap-hanging-quotes nx + #:single-prepend [single-pp '(squo)] + #:double-prepend [double-pp '(dquo)]) + + (define two-or-more-char-string? (λ(i) (and (string? i) (>= (len i) 2)))) + (define-values (tag attr elements) (txexpr->values nx)) + (make-txexpr tag attr + (if (and (list? elements) (not (empty? elements))) + (let ([new-car-elements (match (car elements) + [(? two-or-more-char-string? tcs) + (define str-first (get tcs 0)) + (define str-rest (get tcs 1 'end)) + (cond + [(str-first . in? . '("\"" "“")) + ;; can wrap with any inline tag + ;; so that linebreak detection etc still works + `(,@double-pp ,(->string #\“) ,str-rest)] + [(str-first . in? . '("\'" "‘")) + `(,@single-pp ,(->string #\‘) ,str-rest)] + [else tcs])] + [(? txexpr? nx) (wrap-hanging-quotes nx)] + [else (car elements)])]) + (cons new-car-elements (cdr elements))) + elements))) + + + + + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Lines, blocks, paragraphs + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + +;; turn the right items into
tags +(define (convert-linebreaks xc #:newline [newline "\n"]) + + ;; todo: should this test be not block + not whitespace? + (define not-block? (λ(i) (not (block-xexpr? i)))) + (filter-not empty? + (for/list ([i (len xc)]) + (let ([item (get xc i)]) + (cond + ;; skip first and last + [(or (= i 0) (= i (sub1 (len xc)))) item] + [(equal? item newline) + (match (get xc (- i 1) (+ i 2)) ; a three-element slice with x[i] in the middle + ;; only convert if neither adjacent tag is a block + ;; (because blocks automatically force a newline before & after) + [(list (? not-block?) newline (? not-block?)) '(br)] + [else empty])] ; otherwise delete + [else item]))))) + + + +;; recursive whitespace test +(define (whitespace? x) + + (cond + [(or (string? x) (symbol? x)) (->boolean (regexp-match #px"^\\s+$" (->string x)))] + [(equal? "" x) #t] ; empty string is deemed whitespace + [(or (list? x) (vector? x)) (andmap whitespace? (->list x))] + [else #f])) + + +;; is x a paragraph break? +(define (paragraph-break? x #:pattern [paragraph-pattern #px"^\n\n+$"]) + + (and (string? x) (->boolean (regexp-match paragraph-pattern x)))) + + + + + +;; Find adjacent newline characters in a list and merge them into one item +;; Scribble, by default, makes each newline a separate list item +;; In practice, this is worthless. +(define (merge-newlines x) + + (define (newline? x) + (and (string? x) (equal? "\n" x))) + (define (not-newline? x) + (not (newline? x))) + + (define (really-merge-newlines xs [acc '()]) + (if (empty? xs) + acc + ;; Try to peel the newlines off the front. + (let-values ([(leading-newlines remainder) (splitf-at xs newline?)]) + (if (not (empty? leading-newlines)) ; if you got newlines ... + ;; combine them into a string and append them to the accumulator, + ;; and recurse on the rest + (really-merge-newlines remainder (append acc (list (apply string-append leading-newlines)))) + ;; otherwise peel off elements up to the next newline, append them to accumulator, + ;; and recurse on the rest + (really-merge-newlines (dropf remainder not-newline?) + (append acc (takef remainder not-newline?))))))) + + (cond + [(list? x) (really-merge-newlines (map merge-newlines x))] + [else x])) + + + + +;; todo: add native support for list-xexpr +;; decode triple newlines to list items + + +;; prepare elements for paragraph testing +(define (prep-paragraph-flow xc) + + (convert-linebreaks (merge-newlines (trim xc whitespace?)))) + + +;; apply paragraph tag +(define (wrap-paragraph xc #:tag [tag 'p]) + + (match xc + [(list (? block-xexpr? bx)) bx] ; leave a single block xexpr alone + [else (make-txexpr tag empty xc)])) ; otherwise wrap in p tag + + + + +;; detect paragraphs +;; todo: unit tests +(define (detect-paragraphs elements) + + (let ([elements (prep-paragraph-flow elements)]) + (if (ormap paragraph-break? elements) ; need this condition to prevent infinite recursion + (map wrap-paragraph (splitf-at* elements paragraph-break?)) ; split into ¶¶ + elements))) \ No newline at end of file diff --git a/main.rkt b/main.rkt index 701b164..6b9e44a 100644 --- a/main.rkt +++ b/main.rkt @@ -55,7 +55,7 @@ ;; set up the 'main export - (require pollen/decode) + (require pollen/decode/fast) (define here-ext (car (regexp-match #px"\\w+$" inner-here-path))) (define wants-decoder? (member here-ext (map to-string DECODABLE_EXTENSIONS))) (define main (apply (cond