#lang racket/base
( require xml txexpr sugar racket/match racket/list ( prefix-in html: pollen/html ) )
( require " debug.rkt " " world.rkt " )
( define ( symbols? x ) ( and ( list? x ) ( andmap symbol? x ) ) )
( define+provide ( to-string x )
( if ( string? x )
x ; fast exit for strings
( with-handlers ( [ exn:fail? ( λ ( exn ) ( error ( format " Pollen decoder: can't convert ~v to ~a " x ' string ) ) ) ] )
( cond
[ ( equal? ' ( ) x ) " " ]
[ ( symbol? x ) ( symbol->string x ) ]
[ ( number? x ) ( number->string x ) ]
[ ( path? x ) ( path->string x ) ]
[ ( char? x ) ( format " ~a " x ) ]
[ ( void? x ) " " ]
;; todo: guard against weird shit like lists of procedures
[ ( or ( list? x ) ( hash? x ) ( vector? x ) ) ( format " ~v " x ) ] ; ok to convert datatypes
[ else ( error ) ] ) ) ) ) ; but things like procedures should throw an error
;; decoder wireframe
( define+provide/contract ( decode txexpr
#:txexpr-tag-proc [ txexpr-tag-proc ( λ ( x ) x ) ]
#:txexpr-attrs-proc [ txexpr-attrs-proc ( λ ( x ) x ) ]
#:txexpr-elements-proc [ txexpr-elements-proc ( λ ( x ) x ) ]
#:block-txexpr-proc [ block-txexpr-proc ( λ ( x ) x ) ]
#:inline-txexpr-proc [ inline-txexpr-proc ( λ ( x ) x ) ]
#:string-proc [ string-proc ( λ ( x ) x ) ]
#:symbol-proc [ symbol-proc ( λ ( x ) x ) ]
#:valid-char-proc [ valid-char-proc ( λ ( x ) x ) ]
#:cdata-proc [ cdata-proc ( λ ( x ) x ) ]
#:exclude-tags [ excluded-tags ' ( ) ]
#:exclude-attrs [ excluded-attrs ' ( ) ] )
( ( xexpr/c )
( #:txexpr-tag-proc ( txexpr-tag? . -> . txexpr-tag? )
#:txexpr-attrs-proc ( txexpr-attrs? . -> . txexpr-attrs? )
#:txexpr-elements-proc ( txexpr-elements? . -> . txexpr-elements? )
#:block-txexpr-proc ( block-txexpr? . -> . xexpr? )
#:inline-txexpr-proc ( txexpr? . -> . xexpr? )
#:string-proc ( string? . -> . xexpr? )
#:symbol-proc ( symbol? . -> . xexpr? )
#:valid-char-proc ( valid-char? . -> . xexpr? )
#:cdata-proc ( cdata? . -> . xexpr? )
#:exclude-tags ( listof txexpr-tag? )
#:exclude-attrs txexpr-attrs? ) . ->* . txexpr? )
( let loop ( [ x txexpr ] )
( cond
[ ( txexpr? x ) ( let-values ( [ ( tag attrs elements ) ( txexpr->values x ) ] )
( if ( or ( member tag excluded-tags ) ( ormap ( λ ( attr ) ( member attr excluded-attrs ) ) attrs ) )
x ; because it's excluded
;; we apply processing here rather than do recursive descent on the pieces
;; because if we send them back through loop, certain element types are ambiguous
;; e.g., ((p "foo")) tests out as both txexpr-attrs and txexpr-elements
( let ( [ decoded-txexpr
( apply make-txexpr ( list ( txexpr-tag-proc tag )
( txexpr-attrs-proc attrs )
( map loop ( txexpr-elements-proc elements ) ) ) ) ] )
( ( if ( block-txexpr? decoded-txexpr )
block-txexpr-proc
inline-txexpr-proc ) decoded-txexpr ) ) ) ) ]
[ ( string? x ) ( string-proc x ) ]
[ ( symbol? x ) ( symbol-proc x ) ]
[ ( valid-char? x ) ( valid-char-proc x ) ]
[ ( cdata? x ) ( cdata-proc x ) ]
[ else ( error " decode: can't decode " x ) ] ) ) )
( define+provide/contract ( decode-elements elements
#:txexpr-tag-proc [ txexpr-tag-proc ( λ ( x ) x ) ]
#:txexpr-attrs-proc [ txexpr-attrs-proc ( λ ( x ) x ) ]
#:txexpr-elements-proc [ txexpr-elements-proc ( λ ( x ) x ) ]
#:block-txexpr-proc [ block-txexpr-proc ( λ ( x ) x ) ]
#:inline-txexpr-proc [ inline-txexpr-proc ( λ ( x ) x ) ]
#:string-proc [ string-proc ( λ ( x ) x ) ]
#:symbol-proc [ symbol-proc ( λ ( x ) x ) ]
#:valid-char-proc [ valid-char-proc ( λ ( x ) x ) ]
#:cdata-proc [ cdata-proc ( λ ( x ) x ) ]
#:exclude-tags [ excluded-tags ' ( ) ]
#:exclude-attrs [ excluded-attrs ' ( ) ] )
( ( txexpr-elements? )
( #:txexpr-tag-proc ( txexpr-tag? . -> . txexpr-tag? )
#:txexpr-attrs-proc ( txexpr-attrs? . -> . txexpr-attrs? )
#:txexpr-elements-proc ( txexpr-elements? . -> . txexpr-elements? )
#:block-txexpr-proc ( block-txexpr? . -> . xexpr? )
#:inline-txexpr-proc ( txexpr? . -> . xexpr? )
#:string-proc ( string? . -> . xexpr? )
#:symbol-proc ( symbol? . -> . xexpr? )
#:valid-char-proc ( valid-char? . -> . xexpr? )
#:cdata-proc ( cdata? . -> . xexpr? )
#:exclude-tags ( listof txexpr-tag? )
#:exclude-attrs txexpr-attrs? ) . ->* . txexpr-elements? )
( define temp-tag ( gensym " temp-tag " ) )
( define decode-result ( decode ` ( temp-tag ,@ elements )
#:txexpr-tag-proc txexpr-tag-proc
#:txexpr-attrs-proc txexpr-attrs-proc
#:txexpr-elements-proc txexpr-elements-proc
#:block-txexpr-proc block-txexpr-proc
#:inline-txexpr-proc inline-txexpr-proc
#:string-proc string-proc
#:symbol-proc symbol-proc
#:valid-char-proc valid-char-proc
#:cdata-proc cdata-proc
#:exclude-tags excluded-tags
#:exclude-attrs excluded-attrs ) )
( get-elements decode-result ) )
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Blocks
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; initial set of block tags: from html
( define+provide project-block-tags
( make-parameter html:block-tags ) )
;; tags are inline unless they're registered as block tags.
( define+provide/contract ( block-txexpr? x )
( any/c . -> . boolean? )
( and ( txexpr? x ) ( member ( get-tag x ) ( project-block-tags ) ) #t ) )
( define+provide/contract ( register-block-tag tag )
( txexpr-tag? . -> . void? )
( project-block-tags ( cons tag ( project-block-tags ) ) ) )
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Typography
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
( define ( make-replacer query+replacement )
( let ( [ queries ( map car query+replacement ) ]
[ replacements ( map second query+replacement ) ] )
;; reverse because first in list should be first applied to str (and compose1 works right-to-left)
( apply compose1 ( reverse ( map ( λ ( query replacement ) ( λ ( str ) ( regexp-replace* query str replacement ) ) ) queries replacements ) ) ) ) )
( define+provide/contract ( smart-dashes str )
( string? . -> . string? )
( define dashes
;; fix em dashes first, else they'll be mistaken for en dashes
;; \\s is whitespace + #\u00A0 is nonbreaking space
' ( ( #px"[\\s#\u00A0]*(---|—)[\\s#\u00A0]*" " — " ) ; em dash
( #px"[\\s#\u00A0]*(--|– )[\\s#\u00A0]*" " – " ) ) ) ; en dash
( ( make-replacer dashes ) str ) )
( define+provide/contract ( smart-quotes str )
( string? . -> . string? )
( define quotes
' ( ( #px"(?<=\\w)'(?=\\w)" " ’ " ) ; apostrophe
( #px"(?<!\\w)'(?=\\S)" " ‘ " ) ; single_at_beginning
( #px"(?<=\\S)'(?!\\w)" " ’ " ) ; single_at_end
( #px"(?<!\\w)\"(?=\\S)" " “ " ) ; double_at_beginning
( #px"(?<=\\S)\"(?!\\w)" " ” " ) ) ) ; double_at_end
( ( make-replacer quotes ) str ) )
;; insert nbsp between last two words
( define+provide/contract ( nonbreaking-last-space x #:nbsp [ nbsp ( ->string #\u00A0 ) ]
#:minimum-word-length [ minimum-word-length 6 ]
#:last-word-proc [ last-word-proc ( λ ( x ) x ) ] )
( ( txexpr? ) ( #:nbsp string? #:minimum-word-length integer? #:last-word-proc procedure? ) . ->* . txexpr? )
;; todo: parameterize this, as it will be different for each project
( define tags-to-pay-attention-to ' ( p aside ) ) ; only apply to paragraphs
( define ( replace-last-space str )
( if ( #\space . in? . str )
( let ( [ reversed-str-list ( reverse ( string->list str ) ) ]
[ reversed-nbsp ( reverse ( string->list ( ->string nbsp ) ) ) ] )
( define-values ( last-word-chars other-chars )
( splitf-at reversed-str-list ( λ ( i ) ( not ( eq? i #\space ) ) ) ) )
( define front-chars ( if ( < ( len last-word-chars ) minimum-word-length ) ; OK for long words to be on their own line
; first char of other-chars will be the space, so use cdr
( string-append ( list->string ( reverse ( cdr other-chars ) ) ) ( ->string nbsp ) )
( list->string ( reverse other-chars ) ) ) )
( define last-word ( list->string ( reverse last-word-chars ) ) )
` ( , front-chars , ( last-word-proc last-word ) ) ) ; don't concatenate last word bc last-word-proc might be a txexpr wrapper
( list str ) ) )
( define ( find-last-word-space x ) ; recursively traverse xexpr
( cond
[ ( string? x ) ( replace-last-space x ) ] ; todo: this assumes a paragraph only has one string in it.
[ ( txexpr? x )
( let-values ( [ ( tag attr elements ) ( txexpr->values x ) ] )
( if ( > ( length elements ) 0 ) ; elements is list of xexprs
( let-values ( [ ( all-but-last last ) ( split-at elements ( sub1 ( length elements ) ) ) ] )
( define result ( find-last-word-space ( car last ) ) )
( define result-items ( if ( txexpr? result ) ( list result ) result ) ) ; might be txexpr, or list of new elements
( make-txexpr tag attr ` ( ,@ all-but-last ,@ result-items ) ) )
x ) ) ]
[ else x ] ) )
( if ( ( car x ) . in? . tags-to-pay-attention-to )
( find-last-word-space x )
x ) )
; wrap initial quotes for hanging punctuation
; todo: improve this
; does not handle <p>“<em>thing</em> properly
( define+provide/contract ( wrap-hanging-quotes nx
#:single-prepend [ single-pp ' ( squo ) ]
#:double-prepend [ double-pp ' ( dquo ) ] )
( ( txexpr? ) ( #:single-prepend list? #:double-prepend list? ) . ->* . txexpr? )
( define two-or-more-char-string? ( λ ( i ) ( and ( string? i ) ( >= ( len i ) 2 ) ) ) )
( define-values ( tag attr elements ) ( txexpr->values nx ) )
( make-txexpr tag attr
( if ( and ( list? elements ) ( not ( empty? elements ) ) )
( let ( [ new-car-elements ( match ( car elements )
[ ( ? two-or-more-char-string? tcs )
( define str-first ( get tcs 0 ) )
( define str-rest ( get tcs 1 ( len tcs ) ) )
( cond
[ ( str-first . in? . ' ( " \" " " “ " ) )
;; can wrap with any inline tag
;; so that linebreak detection etc still works
` ( ,@ double-pp , ( ->string #\“ ) , str-rest ) ]
[ ( str-first . in? . ' ( " \' " " ‘ " ) )
` ( ,@ single-pp , ( ->string #\‘ ) , str-rest ) ]
[ else tcs ] ) ]
[ ( ? txexpr? nx ) ( wrap-hanging-quotes nx ) ]
[ else ( car elements ) ] ) ] )
( cons new-car-elements ( cdr elements ) ) )
elements ) ) )
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Lines, blocks, paragraphs
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; turn the right items into <br> tags
( define+provide/contract ( detect-linebreaks xc
#:separator [ newline world:linebreak-separator ]
#:insert [ linebreak ' ( br ) ] )
( ( txexpr-elements? ) ( #:separator string? #:insert xexpr? ) . ->* . txexpr-elements? )
;; todo: should this test be not block + not whitespace?
( define not-block? ( λ ( i ) ( not ( block-txexpr? i ) ) ) )
( filter-not empty?
( for/list ( [ i ( in-range ( len xc ) ) ] )
( let ( [ item ( get xc i ) ] )
( cond
;; skip first and last
[ ( or ( = i 0 ) ( = i ( sub1 ( len xc ) ) ) ) item ]
[ ( equal? item newline )
( match ( get xc ( - i 1 ) ( + i 2 ) ) ; a three-element slice with x[i] in the middle
;; only convert if neither adjacent tag is a block
;; (because blocks automatically force a newline before & after)
[ ( list ( ? not-block? ) newline ( ? not-block? ) ) linebreak ]
[ else empty ] ) ] ; otherwise delete
[ else item ] ) ) ) ) )
( define+provide/contract ( whitespace? x [ nbsp? #f ] )
( ( any/c ) ( boolean? ) . ->* . coerce/boolean? )
( define pat ( pregexp ( format " ^[ \\ s~a]+$ " ( if nbsp? #\u00A0 " " ) ) ) )
( cond
[ ( equal? " " x ) #t ] ; empty string is deemed whitespace
[ ( or ( string? x ) ( symbol? x ) ) ( regexp-match pat ( ->string x ) ) ]
[ ( or ( list? x ) ( vector? x ) ) ( and ( not ( empty? x ) ) ( andmap ( λ ( i ) ( whitespace? i nbsp? ) ) ( ->list x ) ) ) ] ; andmap returns #t for empty lists
[ else #f ] ) )
( define+provide/contract ( whitespace/nbsp? x )
( any/c . -> . coerce/boolean? )
( whitespace? x #t ) )
;; is x a paragraph break?
( define+provide/contract ( paragraph-break? x #:separator [ sep world:paragraph-separator ] )
( ( any/c ) ( #:separator pregexp? ) . ->* . coerce/boolean? )
( define paragraph-pattern ( pregexp ( format " ^~a+$ " sep ) ) )
( and ( string? x ) ( regexp-match paragraph-pattern x ) ) )
( define ( newline? x )
( and ( string? x ) ( equal? world:newline x ) ) )
( define ( not-newline? x )
( not ( newline? x ) ) )
( define ( do-merge xs [ acc ' ( ) ] )
( if ( empty? xs )
acc
;; Try to peel the newlines off the front.
( let-values ( [ ( leading-newlines remainder ) ( splitf-at xs newline? ) ] )
( if ( not ( empty? leading-newlines ) ) ; if you got newlines ...
;; combine them into a string and append them to the accumulator,
;; and recurse on the rest
( do-merge remainder ( append acc ( list ( apply string-append leading-newlines ) ) ) )
;; otherwise peel off elements up to the next newline, append them to accumulator,
;; and recurse on the rest
( do-merge ( dropf remainder not-newline? )
( append acc ( takef remainder not-newline? ) ) ) ) ) ) )
;; Find adjacent newline characters in a list and merge them into one item
;; Scribble, by default, makes each newline a separate list item
;; In practice, this is worthless.
( define+provide/contract ( merge-newlines x )
( txexpr-elements? . -> . txexpr-elements? )
( cond
[ ( list? x ) ( do-merge ( map merge-newlines x ) ) ]
[ else x ] ) )
;; detect paragraphs
;; todo: unit tests
( define+provide/contract ( detect-paragraphs elements #:tag [ tag ' p ]
#:separator [ sep world:paragraph-separator ]
#:linebreak-proc [ linebreak-proc detect-linebreaks ]
#:force? [ force-paragraph #f ] )
( ( txexpr-elements? ) ( #:tag symbol? #:separator string? #:linebreak-proc ( txexpr-elements? . -> . txexpr-elements? ) #:force? boolean? )
. ->* . txexpr-elements? )
;; prepare elements for paragraph testing
( define ( prep-paragraph-flow elems )
( linebreak-proc ( merge-newlines ( trimf elems whitespace? ) ) ) )
( define my-paragraph-break? ( λ ( x ) ( and ( paragraph-break? x #:separator sep ) #t ) ) )
( define ( wrap-paragraph elems )
( match elems
[ ( list ( ? block-txexpr? bxs ) ... ) bxs ] ; leave a series of block xexprs alone
[ else ( list ( make-txexpr tag empty elems ) ) ] ) ) ; otherwise wrap in p tag
( let ( [ elements ( prep-paragraph-flow elements ) ] )
( define explicit-or-implicit-paragraph-break? ( λ ( x ) ( or ( my-paragraph-break? x ) ( block-txexpr? x ) ) ) )
( if ( ormap explicit-or-implicit-paragraph-break? elements ) ; need this condition to prevent infinite recursion
;; use append-map on wrap-paragraph rather than map to permit return of multiple elements
( append-map wrap-paragraph ( append-map ( λ ( es ) ( filter-split es my-paragraph-break? ) ) ( slicef elements block-txexpr? ) ) ) ; split into ¶¶, using both implied and explicit paragraph breaks
( if force-paragraph
( append-map wrap-paragraph ( slicef elements block-txexpr? ) ) ; upconverts non-block elements to paragraphs
elements ) ) ) )