@ -162,14 +162,32 @@
( define ( joiner->string joiner ) ( format " ~a " joiner ) )
( define ( apply-proc proc x [ omit-string ( λ ( x ) #f ) ] [ omit-txexpr ( λ ( x ) #f ) ] [ joiner default-joiner ] )
( define ( apply-proc proc x
[ omit-string ( λ ( x ) #f ) ]
[ omit-txexpr ( λ ( x ) #f ) ]
[ joiner default-joiner ]
#:intercap-min-length [ intercap-min-length #false ] )
( let loop ( [ x x ] )
( cond
[ ( and ( string? x ) ( not ( omit-string x ) ) )
;; handle intercapped words as capitalized pieces
( define letter-before-uc #px"(?<=\\p{Ll})(?=\\p{Lu}\\p{Ll})" ) ; match xXx but not xXX or XXX
( string-join ( for/list ( [ x ( in-list ( string-split x letter-before-uc ) ) ] )
( proc x ) ) ( joiner->string joiner ) ) ]
( define words
( cond
[ intercap-min-length
;; handle intercapped words as a list of subwords,
;; subject to the intercap-min-length
( define zero-length-quantifier " " )
( define letter-before-uc
;; match xXx but not xXX or XXX
( pregexp ( format " (?<= \\ p{L}{~a})(?= \\ p{Lu} \\ p{Ll}{~a}) "
( if ( > intercap-min-length 0 )
intercap-min-length
zero-length-quantifier )
( if ( > intercap-min-length 1 )
( sub1 intercap-min-length )
zero-length-quantifier ) ) ) )
( string-split x letter-before-uc ) ]
[ else ( list x ) ] ) )
( string-join ( map proc words ) ( joiner->string joiner ) ) ]
[ ( and ( txexpr? x ) ( not ( omit-txexpr x ) ) )
( make-txexpr ( get-tag x ) ( get-attrs x ) ( map loop ( get-elements x ) ) ) ]
[ else x ] ) ) )
@ -199,7 +217,8 @@
[ else word ] ) ] ) )
( define ( insert-hyphens text ) ( regexp-replace* word-pattern text replacer ) )
( begin0
( apply-proc insert-hyphens x omit-string? omit-txexpr? joiner )
( apply-proc insert-hyphens x omit-string? omit-txexpr? joiner
#:intercap-min-length min-length )
;; deleting from the main cache is cheaper than having to do two cache lookups for every word
;; (missing words will just be regenerated later)
( for-each ( λ ( ee ) ( remove-exception-word word-cache ee ) ) extra-exceptions ) ) )