|
|
|
@ -1,8 +1,10 @@
|
|
|
|
|
#lang racket/base
|
|
|
|
|
(require racket/string racket/list)
|
|
|
|
|
(require (planet mb/pollen/hyphenation-data))
|
|
|
|
|
(require (planet mb/pollen/readability))
|
|
|
|
|
(require (planet mb/pollen/tools))
|
|
|
|
|
(require racket/string racket/list racket/contract)
|
|
|
|
|
(require "hyphenation-data.rkt")
|
|
|
|
|
(require "../readability.rkt")
|
|
|
|
|
(require "../tools.rkt")
|
|
|
|
|
|
|
|
|
|
(module+ test (require rackunit))
|
|
|
|
|
|
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
|
;;; Hyphenate.rkt
|
|
|
|
@ -25,10 +27,10 @@
|
|
|
|
|
(string-replace x "-" ""))
|
|
|
|
|
|
|
|
|
|
(define (make-value x)
|
|
|
|
|
(list->vector (cons 0 (map (ƒ(x) (int (=str x "-"))) (regexp-split #px"[a-z]" x)))))
|
|
|
|
|
(list->vector (cons 0 (map (λ(x) (if (equal? x "-") 1 0)) (regexp-split #px"[a-z]" x)))))
|
|
|
|
|
|
|
|
|
|
(make-hash
|
|
|
|
|
(map (ƒ(x) (cons (make-key x) (make-value x))) exception-data)))
|
|
|
|
|
(map (λ(x) (cons (make-key x) (make-value x))) exception-data)))
|
|
|
|
|
|
|
|
|
|
; global data, so this only needs to be defined once
|
|
|
|
|
(define exceptions (make-exceptions exception-data))
|
|
|
|
@ -41,10 +43,11 @@
|
|
|
|
|
(define tree (make-hash))
|
|
|
|
|
(define (insert-pattern pat)
|
|
|
|
|
(let* ([chars (regexp-replace* #px"[0-9]" pat "")]
|
|
|
|
|
[points (map (λ(x) (int x)) (regexp-split #px"[.a-z]" pat))]
|
|
|
|
|
;; regexp returns list of strings
|
|
|
|
|
[points (map (λ(x) (if (> (len x) 0) (string->number x) 0)) (regexp-split #px"[.a-z]" pat))]
|
|
|
|
|
[tree tree])
|
|
|
|
|
(for ([char chars])
|
|
|
|
|
(when (not (in? tree char))
|
|
|
|
|
(when (not (char . in? . tree))
|
|
|
|
|
(change tree char (make-hash)))
|
|
|
|
|
(set! tree (get tree char)))
|
|
|
|
|
(change tree empty points)))
|
|
|
|
@ -60,21 +63,21 @@
|
|
|
|
|
; controls hyphenation zone from edges of word
|
|
|
|
|
; todo: parameterize this setting
|
|
|
|
|
; todo: does this count end-of-word punctuation? it shouldn't.
|
|
|
|
|
(map (ƒ(i) (change points i 0)) (list 1 2 (- (len points) 2) (- (len points) 3)))
|
|
|
|
|
(map (λ(i) (change points i 0)) (list 1 2 (- (len points) 2) (- (len points) 3)))
|
|
|
|
|
points)
|
|
|
|
|
|
|
|
|
|
(let* ([word (to-lc word)]
|
|
|
|
|
(let* ([word (string-downcase word)]
|
|
|
|
|
[points
|
|
|
|
|
(if (in? exceptions word)
|
|
|
|
|
(if (word . in? . exceptions)
|
|
|
|
|
(get exceptions word)
|
|
|
|
|
(let* ([work (str "." word ".")]
|
|
|
|
|
(let* ([work (string-append "." (->string word) ".")]
|
|
|
|
|
[points (make-vector (add1 (len work)) 0)])
|
|
|
|
|
(for ([i (len work)])
|
|
|
|
|
(let ([tree pattern-tree])
|
|
|
|
|
(for ([char (get work i 'end)]
|
|
|
|
|
#:break (not (in? tree char)))
|
|
|
|
|
#:break (not (char . in? . tree)))
|
|
|
|
|
(set! tree (get tree char))
|
|
|
|
|
(when (in? tree empty)
|
|
|
|
|
(when (empty . in? . tree)
|
|
|
|
|
(let ([point (get tree empty)])
|
|
|
|
|
(for ([j (len point)])
|
|
|
|
|
(change points (+ i j) (max (get points (+ i j)) (get point j)))))))))
|
|
|
|
@ -92,40 +95,47 @@
|
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(define (hyphenate-word word #:filter [filter (λ(x)x)])
|
|
|
|
|
(define/contract (word->hyphenated-pieces word #:omit [omit? (λ(x) #f)])
|
|
|
|
|
((string?) (#:omit procedure?) . ->* . (listof string?))
|
|
|
|
|
; Given a word, returns a list of pieces,
|
|
|
|
|
; broken at the possible hyphenation points.
|
|
|
|
|
(if (or (<= (len word) 4) (filter word))
|
|
|
|
|
; Short words aren't hyphenated.
|
|
|
|
|
(as-list word)
|
|
|
|
|
(if (or (<= (len word) 4) (omit? word))
|
|
|
|
|
;; boundary conditions:
|
|
|
|
|
;; Short words aren't hyphenated, nor omitted words
|
|
|
|
|
(->list word)
|
|
|
|
|
; Examine the points to build the pieces list.
|
|
|
|
|
(string-split ; split on whitespace
|
|
|
|
|
(list->string ; concatenate chars
|
|
|
|
|
(flatten ; get rid of cons pairs
|
|
|
|
|
(for/list ([char word] [point (make-points word)])
|
|
|
|
|
(for/list ([char word]
|
|
|
|
|
[point (make-points word)])
|
|
|
|
|
(if (even? point)
|
|
|
|
|
char ; even point denotes character
|
|
|
|
|
(cons char #\ )))))))) ; odd point denotes char + syllable
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(define (hyphenate-string text #:joiner [joiner (integer->char #x00AD)] #:filter [filter (λ(x)x)])
|
|
|
|
|
(regexp-replace* #px"\\w+" text (ƒ(word) (string-join (hyphenate-word word #:filter filter) (as-string joiner)))))
|
|
|
|
|
(define (hyphenate-string text #:joiner [joiner (integer->char #x00AD)] #:omit [omit? (λ(x)#f)])
|
|
|
|
|
(regexp-replace* #px"\\w+" text (λ(word) (string-join (word->hyphenated-pieces word #:omit omit?) (->string joiner)))))
|
|
|
|
|
|
|
|
|
|
(define (capitalized? word)
|
|
|
|
|
; match property = \\p
|
|
|
|
|
; match unicode uppercase = {Lu}
|
|
|
|
|
(regexp-match #px"\\p{Lu}" (get word 0)))
|
|
|
|
|
|
|
|
|
|
(define (ligated? word)
|
|
|
|
|
(ormap (λ(lig) (regexp-match lig word)) '("ff" "fi" "fl" "ffi" "ffl")))
|
|
|
|
|
|
|
|
|
|
(define (capitalized-or-ligated? word)
|
|
|
|
|
; filter function for hyphenate
|
|
|
|
|
; filtering ligatable words because once the soft hyphens go in,
|
|
|
|
|
; the browser won't automatically substitute the ligs.
|
|
|
|
|
; so it looks weird, because some are ligated and some not.
|
|
|
|
|
; not ideal, because it removes hyphenation options but ... whatever
|
|
|
|
|
(or (capitalized? word) (ligated? word)))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(define (hyphenate x #:only [only-proc (ƒ(x) x)]) ; recursively hyphenate strings within xexpr
|
|
|
|
|
(define (hyphenate x #:only [only-proc (λ(x) x)]) ; recursively hyphenate strings within xexpr
|
|
|
|
|
(define exclusions '(style script)) ; omit these from ever being hyphenated
|
|
|
|
|
(define (capitalized-or-ligated? word)
|
|
|
|
|
; filter function for hyphenate
|
|
|
|
|
; filtering ligatable words because once the soft hyphens go in,
|
|
|
|
|
; the browser won't automatically substitute the ligs.
|
|
|
|
|
; so it looks weird, because some are ligated and some not.
|
|
|
|
|
; not ideal, because it removes hyphenation options but ... whatever
|
|
|
|
|
(or (capitalized? word) (any (ƒ(lig) (regexp-match lig word)) '("ff" "fi" "fl" "ffi" "ffl"))))
|
|
|
|
|
|
|
|
|
|
(cond
|
|
|
|
|
; todo: the only-proc semantics are illogical.
|
|
|
|
@ -135,30 +145,14 @@
|
|
|
|
|
; Won't it make hyphenation naturally overinclusive?
|
|
|
|
|
; Problem with opt-in: conceals a lot of tags that naturally live inside other tags
|
|
|
|
|
; only reaches text at the "root level" of the tag.
|
|
|
|
|
[(named-xexpr? x) (if (and (only-proc x) (not (in? exclusions (car x))))
|
|
|
|
|
(map-xexpr-content hyphenate x)
|
|
|
|
|
(map-xexpr-content hyphenate x #:only named-xexpr?))] ; only process subxexprs
|
|
|
|
|
[(tagged-xexpr? x) (if (and (only-proc x) (not ((car x) . in? . exclusions)))
|
|
|
|
|
(map-xexpr-elements hyphenate x)
|
|
|
|
|
(map-xexpr-elements (λ(x) (if (tagged-xexpr? x) (hyphenate x) x)) x))] ; only process subxexprs
|
|
|
|
|
|
|
|
|
|
[(string? x)
|
|
|
|
|
; hyphenate everything but last word
|
|
|
|
|
; todo: problem here is that it's string-based, not paragraph based.
|
|
|
|
|
; meaning, the last word of every STRING gets exempted,
|
|
|
|
|
; even if that word doesn't fall at the end of a block.
|
|
|
|
|
; should work the way nonbreak spacer works.
|
|
|
|
|
; todo: question - should hyphenator ignore possible ligature pairs, like fi?
|
|
|
|
|
; because auto ligatures will skip combos with a soft hyphen between
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
; regexp matches everything up to last word, and allows trailing whitespace
|
|
|
|
|
; parenthesized matches become series of lambda arguments. Arity must match
|
|
|
|
|
; [^\\s\u00A0] = characters that are neither whitespace nor nbsp (which is not included in \s)
|
|
|
|
|
; +\\s*$ = catches trailing whitespace up to end
|
|
|
|
|
(regexp-replace #px"(.*?)([^\\s\u00A0]+\\s*$)"
|
|
|
|
|
x
|
|
|
|
|
; by default, filter out capitalized words and words with ligatable combos
|
|
|
|
|
; m0 m1 m2 are the match groups from regexp-replace
|
|
|
|
|
(ƒ(m0 m1 m2) (string-append (hyphenate-string m1 #:filter capitalized-or-ligated?) m2)))]
|
|
|
|
|
[(string? x) (hyphenate-string x)]
|
|
|
|
|
[else x]))
|
|
|
|
|
|
|
|
|
|
(module+ main
|
|
|
|
|
(hyphenate '(p "circular firing squad") #:only (ƒ(xexpr) (in? '(p) (first xexpr)))))
|
|
|
|
|
(module+ test
|
|
|
|
|
(check-equal? (word->hyphenated-pieces "polymorphism") '("poly" "mor" "phism"))
|
|
|
|
|
(check-equal? (hyphenate "circular polymorphism squandering") "cir\u00ADcu\u00ADlar poly\u00ADmor\u00ADphism squan\u00ADder\u00ADing")
|
|
|
|
|
(check-equal? (hyphenate '(p "circular polymorphism")) '(p "cir\u00ADcu\u00ADlar poly\u00ADmor\u00ADphism")))
|