#lang racket/base (require txexpr/base racket/string racket/list) (provide hyphenate unhyphenate word->hyphenation-points exception-word->word+pattern string->hashpair) (module+ test (require rackunit)) ;; module default values (define default-min-length 5) (define default-min-left-length 2) (define default-min-right-length 2) (define default-joiner #\u00AD) (define default-min-hyphen-count 1) (define (exception-word->word+pattern ew) ;; an exception word indicates its breakpoints with added hyphens (define word (string-replace ew "-" "")) ;; pattern has same number of points as word letters. 1 marks hyphenation point; 0 no hyphenation (define points (cdr (map (λ (x) (if (equal? x "-") 1 0)) (regexp-split #px"\\p{L}" ew)))) ;; use list here so we can `apply` in `add-exception-word` (list word points)) (module+ test (check-equal? (exception-word->word+pattern "snôw-man") '("snôwman" (0 0 0 1 0 0 0))) (check-equal? (exception-word->word+pattern "snôwman") '("snôwman" (0 0 0 0 0 0 0))) (check-equal? (exception-word->word+pattern "sn-ôw-ma-n") '("snôwman" (0 1 0 1 0 1 0)))) (define (add-exception-word word-cache word) ;; `hash-set!` not `hash-ref!`, because we want an exception to override an existing value (apply hash-set! word-cache (exception-word->word+pattern word))) (define (remove-exception-word word-cache word) (hash-remove! word-cache (string-replace word "-" ""))) (define (string->natural i) (let* ([result (string->number i)] [result (and result (inexact->exact result))] [result (and (exact-nonnegative-integer? result) result)]) result)) (module+ test (check-equal? (string->natural "Foo") #f) (check-equal? (string->natural "1.0") 1) (check-equal? (string->natural "-3") #f)) (define (string->hashpair pat) (define-values (strs nums) (for/lists (strs nums) ;; using unicode-aware regexps to allow unicode hyphenation patterns ;; a pattern is a list of subpatterns, each of which is a character possibly followed by a number. ;; also, the first position may just have a number. ([subpat (in-list (regexp-match* #px"^\\d?|(\\p{L}|\\p{P})\\d?" pat))]) (define str (cond [(regexp-match #px"(\\p{L}|\\p{P})?" subpat) => car] [else ""])) (define num (cond [(regexp-match #px"\\d" subpat) => (compose1 string->natural car)] [else 0])) (values str num))) (list (string-append* strs) nums)) (module+ test (check-equal? (string->hashpair "2'2") '("'" (2 2))) (check-equal? (string->hashpair ".â4") '(".â" (0 0 4))) (check-equal? (string->hashpair ".ý4") '(".ý" (0 0 4))) (check-equal? (string->hashpair "'ý4") '("'ý" (0 0 4))) (check-equal? (string->hashpair "’ý4") '("’ý" (0 0 4))) (check-equal? (string->hashpair "4ý-") '("ý-" (4 0 0))) (check-equal? (string->hashpair ".ach4") '(".ach" (0 0 0 0 4)))) (define (calculate-max-pattern patterns) ;; each pattern is a list of numbers ;; all the patterns have the same length ;; run max against parallel elements (apply map (λ xs (apply max xs)) patterns)) (module+ test (require rackunit) (check-equal? (calculate-max-pattern '((1 0 0))) '(1 0 0)) (check-equal? (calculate-max-pattern '((1 0 0) (0 1 0))) '(1 1 0)) (check-equal? (calculate-max-pattern '((1 0 0) (0 1 0) (0 0 1))) '(1 1 1)) (check-equal? (calculate-max-pattern '((1 2 3) (2 3 1) (3 1 2))) '(3 3 3))) (define (make-points word word-cache pattern-cache) (hash-ref! word-cache word (λ () ;; dots are used to denote the beginning and end of the word when matching patterns (define boundary-marker ".") (define word-with-boundaries (format "~a~a~a" boundary-marker (string-downcase word) boundary-marker)) (define word-length (string-length word-with-boundaries)) (define default-zero-pattern (make-list (add1 word-length) 0)) ;; walk through all the substrings and see if there's a matching pattern. (define matching-patterns (for*/list ([start (in-range word-length)] [end (in-range start word-length)] [substr (in-value (substring word-with-boundaries start (add1 end)))] [partial-pattern (in-value (hash-ref pattern-cache (string->symbol substr) #f))] #:when partial-pattern) ;; pad out partial-pattern to full length ;; (so we can compare patterns to find max value for each slot) (define left-zeroes (make-list start 0)) (define right-zeroes (make-list (- (add1 word-length) (length partial-pattern) start) 0)) (append left-zeroes partial-pattern right-zeroes))) (define max-pattern (calculate-max-pattern (cons default-zero-pattern matching-patterns))) ;; for point list generated from a pattern, ;; drop first two elements because they represent hyphenation weight ;; before the starting "." and between "." and the first letter. ;; drop last element because it represents hyphen after last "." ;; after you drop these two, then each number corresponds to ;; whether a hyphen goes after that letter. (drop-right (drop max-pattern 2) 1)))) ;; Find hyphenation points in a word. This is not quite synonymous with syllables. (define (word->hyphenation-points word word-cache pattern-cache [min-length #f] [min-left-length #f] [min-right-length #f]) (cond [(< (string-length word) (or min-length default-min-length)) (list word)] [else ;; points is a list corresponding to the letters of the word. ;; to create a no-hyphenation zone of length n, zero out the first n-1 points ;; and the last n points (because the last value in points is always superfluous) (define word-points (let* ([points (make-points word word-cache pattern-cache)] [left-zeroes (min (or min-left-length default-min-left-length) (length points))] [right-zeroes (min (or min-right-length default-min-right-length) (length points))]) (for/list ([(point idx) (in-indexed points)]) (if (<= left-zeroes (add1 idx) (- (length points) right-zeroes)) point 0)))) ;; odd-valued points in the pattern denote hyphenation points (define odd-point-indexes (for/list ([(wp idx) (in-indexed word-points)] #:when (odd? wp)) idx)) ;; the hyphenation goes after the indexed letter, so add1 to the raw points for slicing (define breakpoints (append (list 0) (map add1 odd-point-indexes) (list (string-length word)))) (for/list ([start (in-list breakpoints)] [end (in-list (cdr breakpoints))]) ; shorter list controls exit of loop (substring word start end))])) ;; joiner contract allows char or string; this coerces to string. (define (joiner->string joiner) (format "~a" joiner)) (define (apply-proc proc x [omit-string (λ (x) #f)] [omit-txexpr (λ (x) #f)] [joiner default-joiner] #:intercap-min-length [intercap-min-length #false]) (let loop ([x x]) (cond [(and (string? x) (not (omit-string x))) (define words (cond [intercap-min-length ;; handle intercapped words as a list of subwords, ;; subject to the intercap-min-length (define zero-length-quantifier "") (define letter-before-uc ;; match xXx but not xXX or XXX (pregexp (format "(?<=\\p{L}{~a})(?=\\p{Lu}\\p{Ll}{~a})" (if (> intercap-min-length 0) intercap-min-length zero-length-quantifier) (if (> intercap-min-length 1) (sub1 intercap-min-length) zero-length-quantifier)))) (string-split x letter-before-uc)] [else (list x)])) (string-join (map proc words) (joiner->string joiner))] [(and (txexpr? x) (not (omit-txexpr x))) (make-txexpr (get-tag x) (get-attrs x) (map loop (get-elements x)))] [else x]))) (define (hyphenate word-cache pattern-cache x [joiner default-joiner] #:exceptions [extra-exceptions empty] #:min-length [min-length default-min-length] #:min-left-length [min-left-length default-min-left-length] #:min-right-length [min-right-length default-min-right-length] #:min-hyphens [min-hyphens-added default-min-hyphen-count] #:omit-word [omit-word? (λ (x) #f)] #:omit-string [omit-string? (λ (x) #f)] #:omit-txexpr [omit-txexpr? (λ (x) #f)]) ;; todo?: connect this regexp pattern to the one used in word? predicate (for-each (λ (ee) (add-exception-word word-cache ee)) extra-exceptions) (define word-pattern #px"\\p{L}+") ;; more restrictive than exception-word (define (replacer word . words) (cond [(omit-word? word) word] [else (define hyphenation-points (word->hyphenation-points word word-cache pattern-cache min-length min-left-length min-right-length)) (cond [(>= (sub1 (length hyphenation-points)) min-hyphens-added) (string-join hyphenation-points (joiner->string joiner))] [else word])])) (define (insert-hyphens text) (regexp-replace* word-pattern text replacer)) (begin0 (apply-proc insert-hyphens x omit-string? omit-txexpr? joiner #:intercap-min-length min-length) ;; deleting from the main cache is cheaper than having to do two cache lookups for every word ;; (missing words will just be regenerated later) (for-each (λ (ee) (remove-exception-word word-cache ee)) extra-exceptions))) (define (unhyphenate x [joiner default-joiner] #:omit-word [omit-word? (λ (x) #f)] #:omit-string [omit-string? (λ (x) #f)] #:omit-txexpr [omit-txexpr? (λ (x) #f)]) (define word-pattern (pregexp (format "[\\w~a]+" joiner))) (define (replacer word . words) (if (not (omit-word? word)) (string-replace word (joiner->string joiner) "") word)) (define (remove-hyphens text) (regexp-replace* word-pattern text replacer)) (apply-proc remove-hyphens x omit-string? omit-txexpr?))