#lang racket/base (require racket/string racket/list racket/contract racket/vector racket/bool) (require "hyphenation-patterns.rkt") (module+ test (require rackunit)) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; Hyphenate module ;;; Racket port of Ned Batchelder's hyphenate.py ;;; http://nedbatchelder.com/code/modules/hyphenate.html ;;; (in the public domain) ;;; which in turn was an implementation ;;; of the Liang hyphenation algorithm in TeX ;;; (also in the public domain) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (provide (contract-out [hyphenate ((string?) ((or/c char? string?) #:exceptions (listof word?) #:min-length (or/c integer? false?)) . ->* . string?)]) (contract-out [hyphenatef ((string? procedure?) ((or/c char? string?) #:exceptions (listof word?) #:min-length (or/c integer? false?)) . ->* . string?)]) (contract-out [unhyphenate ((string?) ((or/c char? string?)) . ->* . string?)])) ;; global data, define now but set! them later (because they're potentially big & slow) (define exceptions #f) (define pattern-tree #f) ;; global default values (define default-min-length 5) (define default-joiner (integer->char #x00AD)) ;; Convert the hyphenated pattern into a point array for use later. (define (list->exceptions exn-strings) (define (make-key x) (string-replace x "-" "")) (define (make-value x) (list->vector (cons 0 (map (λ(x) (if (equal? x "-") 1 0)) (regexp-split #px"[a-z]" x))))) (make-hash (map (λ(x) (cons (make-key x) (make-value x))) exn-strings))) ;; A word is a string without whitespace. (define (word? x) (if (regexp-match #px"^\\S+$" x) #t #f)) (module+ test (check-true (word? "Foobar")) (check-true (word? "foobar")) (check-true (word? "foo-bar")) (check-false (word? "foo bar"))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; Helper functions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Convert a pattern like 'a1bc3d4' into a string of chars 'abcd' ;; and a list of points [ 1, 0, 3, 4 ]. (define (make-pattern-tree pattern-data) (define tree (make-hash)) ;; Insert the pattern into the tree. Each character finds a dict ;; another level down in the tree, and leaf nodes have the list of ;; points. (define (insert-pattern pat) (let* ([chars (regexp-replace* #px"[0-9]" pat "")] ;; regexp returns list of strings [points (map (λ(x) (if (> (string-length x) 0) (string->number x) 0)) (regexp-split #px"[.a-z]" pat))] [tree tree]) (for ([char chars]) (when (not (hash-has-key? tree char)) (hash-set! tree char (make-hash))) (set! tree (hash-ref tree char))) (hash-set! tree empty points))) (map insert-pattern pattern-data) tree) (define (make-points word) (define (make-zeroes points) ; controls hyphenation zone from edges of word ; todo: parameterize this setting ; todo: does this count end-of-word punctuation? it shouldn't. (map (λ(i) (vector-set! points i 0)) (list 1 2 (- (vector-length points) 2) (- (vector-length points) 3))) points) (let* ([word (string-downcase word)] [points (if (hash-has-key? exceptions word) (hash-ref exceptions word) (let* ([work (string-append "." word ".")] [points (make-vector (add1 (string-length work)) 0)]) (for ([i (string-length work)]) (let ([tree pattern-tree]) (for ([char (substring work i (string-length work))] #:break (not (hash-has-key? tree char))) (set! tree (hash-ref tree char)) (when (hash-has-key? tree empty) (let ([point (hash-ref tree empty)]) (for ([j (length point)]) (vector-set! points (+ i j) (max (vector-ref points (+ i j)) (list-ref point j))))))))) points))]) ; make-zeroes controls minimum hyphenation distance from edge. ; todo: dropping first 2 elements is needed for mysterious reasons to be documented later (vector-drop (make-zeroes points) 2))) ;; helpful extension of splitf-at (define (splitf-at* xs split-test) (define (trim items test-proc) (dropf-right (dropf items test-proc) test-proc)) (define (&splitf-at* xs [acc '()]) (if (empty? xs) ;; reverse because accumulation is happening backward ;; (because I'm using cons to push latest match onto front of list) (reverse acc) (let-values ([(item rest) ;; drop matching elements from front ;; then split on nonmatching ;; = nonmatching item + other elements (which will start with matching) (splitf-at (dropf xs split-test) (compose1 not split-test))]) ;; recurse, and store new item in accumulator (&splitf-at* rest (cons item acc))))) ;; trim off elements matching split-test (&splitf-at* (trim xs split-test))) ;; Find hyphenatable pieces of a word. This is not quite synonymous with syllables. (define (word->pieces word [min-length default-min-length]) (define (make-pieces word) (define word-dissected (flatten (for/list ([char word] [point (make-points word)]) (if (even? point) char ; even point denotes character (cons char 'syllable))))) ; odd point denotes char + syllable (map list->string (splitf-at* word-dissected symbol?))) (if (and min-length (< (string-length word) min-length)) (list word) (make-pieces word))) ;; joiner contract allows char or string; this coerces to string. (define (joiner->string joiner) (if (char? joiner) (format "~a" joiner) joiner)) ;; Hyphenate using a filter procedure. ;; Theoretically possible to do this externally, ;; but it would just mean doing the regexp-replace twice. (define (hyphenatef text proc [joiner default-joiner] #:exceptions [extra-exceptions '()] #:min-length [min-length default-min-length]) ;; set up module data (set! exceptions (list->exceptions (append default-exceptions extra-exceptions))) (when (not pattern-tree) (set! pattern-tree (make-pattern-tree default-patterns))) (define joiner-string (joiner->string joiner)) (regexp-replace* #px"\\w+" text (λ(word) (if (proc word) (string-join (word->pieces word min-length) joiner-string) word)))) ;; Default hyphenate function. (define (hyphenate text [joiner default-joiner] #:exceptions [extra-exceptions '()] #:min-length [min-length default-min-length]) (hyphenatef text (λ(x) #t) joiner #:exceptions extra-exceptions #:min-length min-length)) (define (unhyphenate text [joiner default-joiner]) (define joiner-string (joiner->string joiner)) (string-replace text joiner-string "")) (module+ test (check-equal? (hyphenate "polymorphism") "poly\u00ADmor\u00ADphism") (check-equal? (hyphenate "polymorphism" #:min-length 100) "polymorphism") (check-equal? (hyphenate "ugly" #:min-length 1) "ug\u00ADly") (check-equal? (unhyphenate "poly\u00ADmor\u00ADphism") "polymorphism") (check-equal? (hyphenatef "polymorphism" (λ(x) #f)) "polymorphism") (check-equal? (hyphenate "polymorphism" #\-) "poly-mor-phism") (check-equal? (hyphenate "polymorphism" "foo") "polyfoomorfoophism") (check-equal? (unhyphenate "polyfoomorfoophism" "foo") "polymorphism") (check-equal? (hyphenate "polymorphism" #\* #:exceptions '("polymo-rphism")) "polymo*rphism") (check-equal? (hyphenate "circular polymorphism squandering") "cir\u00ADcu\u00ADlar poly\u00ADmor\u00ADphism squan\u00ADder\u00ADing") (check-equal? (hyphenate "present project") "present project") ; exception words ;; test these last so exceptions have been set up already (check-equal? (word->pieces "polymorphism") '("poly" "mor" "phism")) (check-equal? (word->pieces "present") '("present"))) ; exception word