#lang racket/base (require racket/string racket/list) (require (planet mb/pollen/hyphenation-data)) (require (planet mb/pollen/readability)) (require (planet mb/pollen/tools)) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; Hyphenate.rkt ;;; Racket port of Ned Batchelder's hyphenate.py ;;; http://nedbatchelder.com/code/modules/hyphenate.html ;;; (in the public domain) ;;; which in turn was an implementation ;;; of the Liang hyphenation algorithm in TeX ;;; (also in the public domain) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (provide (all-defined-out)) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; Exceptions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (define (make-exceptions exception-data) (define (make-key x) (string-replace x "-" "")) (define (make-value x) (list->vector (cons 0 (map (ƒ(x) (int (=str x "-"))) (regexp-split #px"[a-z]" x))))) (make-hash (map (ƒ(x) (cons (make-key x) (make-value x))) exception-data))) ; global data, so this only needs to be defined once (define exceptions (make-exceptions exception-data)) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; Helper functions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (define (make-pattern-tree pattern-data) (define tree (make-hash)) (define (insert-pattern pat) (let* ([chars (regexp-replace* #px"[0-9]" pat "")] [points (map (λ(x) (int x)) (regexp-split #px"[.a-z]" pat))] [tree tree]) (for ([char chars]) (when (not (in? tree char)) (change tree char (make-hash))) (set! tree (get tree char))) (change tree empty points))) (map insert-pattern pattern-data) tree) ; global data, so this only needs to be defined once (define pattern-tree (make-pattern-tree pattern-data)) (define (make-points word) (define (make-zeroes points) ; controls hyphenation zone from edges of word ; todo: parameterize this setting ; todo: does this count end-of-word punctuation? it shouldn't. (map (ƒ(i) (change points i 0)) (list 1 2 (- (len points) 2) (- (len points) 3))) points) (let* ([word (to-lc word)] [points (if (in? exceptions word) (get exceptions word) (let* ([work (str "." word ".")] [points (make-vector (add1 (len work)) 0)]) (for ([i (len work)]) (let ([tree pattern-tree]) (for ([char (get work i 'end)] #:break (not (in? tree char))) (set! tree (get tree char)) (when (in? tree empty) (let ([point (get tree empty)]) (for ([j (len point)]) (change points (+ i j) (max (get points (+ i j)) (get point j))))))))) points))]) ; make-zeroes controls minimum hyphenation distance from edge. ; todo: dropping first 2 elements is needed for mysterious reasons to be documented later ; see python code for why (get (make-zeroes points) 2 'end))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; Main function ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (define (hyphenate-word word #:filter [filter (λ(x)x)]) ; Given a word, returns a list of pieces, ; broken at the possible hyphenation points. (if (or (<= (len word) 4) (filter word)) ; Short words aren't hyphenated. (as-list word) ; Examine the points to build the pieces list. (string-split ; split on whitespace (list->string ; concatenate chars (flatten ; get rid of cons pairs (for/list ([char word] [point (make-points word)]) (if (even? point) char ; even point denotes character (cons char #\ )))))))) ; odd point denotes char + syllable (define (hyphenate-string text #:joiner [joiner (integer->char #x00AD)] #:filter [filter (λ(x)x)]) (regexp-replace* #px"\\w+" text (ƒ(word) (string-join (hyphenate-word word #:filter filter) (as-string joiner))))) (define (capitalized? word) ; match property = \\p ; match unicode uppercase = {Lu} (regexp-match #px"\\p{Lu}" (get word 0))) (define (hyphenate x #:only [only-proc (ƒ(x) x)]) ; recursively hyphenate strings within xexpr (define exclusions '(style script)) ; omit these from ever being hyphenated (define (capitalized-or-ligated? word) ; filter function for hyphenate ; filtering ligatable words because once the soft hyphens go in, ; the browser won't automatically substitute the ligs. ; so it looks weird, because some are ligated and some not. ; not ideal, because it removes hyphenation options but ... whatever (or (capitalized? word) (any (ƒ(lig) (regexp-match lig word)) '("ff" "fi" "fl" "ffi" "ffl")))) (cond ; todo: the only-proc semantics are illogical. ; main issue: keep it out of tags like