rewrite core algorithm in more Racketish way; add caching; add #:min-ends-length keyword argument

main
Matthew Butterick 10 years ago
parent 0d4c0541af
commit 6aa792d89c

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

@ -305,4 +305,15 @@ tbody > tr:first-child > td > .together {
left: 0px; left: 0px;
top: 0px; top: 0px;
z-index: 1; z-index: 1;
} }
/* ---------------------------------------- */
/* For section source modules & tags */
.RPartExplain {
/* Styles the "Link to this section with ..." hint box; the manual's script
   gives this class to the <div> it inserts after a section header. */
background: #eee;
font-size: 0.9rem;
margin-top: 0.2rem;
padding: 0.2rem;
text-align: left;
}

@ -0,0 +1,82 @@
/* For the Racket manual style */
AddOnLoad(function() {
  /* Look for header elements (h1-h5) that carry both the x-source-module
     and x-part-tag attributes. For those elements, add a hidden element
     that explains how to link to the section, and set the element's
     onclick() to display the explanation. The attribute check itself is
     done by AddPartTitleOnClick, so every header is simply passed along. */
  var tag_names = ["h1", "h2", "h3", "h4", "h5"];
  for (var j = 0; j < tag_names.length; j++) {
    /* Declared with `var` so the collection stays local to this callback
       instead of leaking as an implicit global (an error in strict mode). */
    var elems = document.getElementsByTagName(tag_names[j]);
    for (var i = 0; i < elems.length; i++) {
      AddPartTitleOnClick(elems.item(i));
    }
  }
});
/* Attach a click-to-reveal "how to link here" hint to a header element.
 *
 * elem: a DOM element. Nothing happens unless it has both an
 *       "x-source-module" and an "x-part-tag" attribute.
 *
 * Effects when both attributes are present:
 *   - a <div class="RPartExplain">, initially display:none, is inserted
 *     immediately after `elem` in its parent;
 *   - `elem.onclick` is replaced with a handler that toggles that div
 *     between "block" and "none".
 *
 * The hint shows a syntax-colored `@secref[tag #:doc 'module-path]` call,
 * or `@other-doc['module-path]` when the tag is the literal "top" tag,
 * which refers to a whole document.
 */
function AddPartTitleOnClick(elem) {
  var mod_path = elem.getAttribute("x-source-module");
  var tag = elem.getAttribute("x-part-tag");
  /* Headers without both attributes get no hint. */
  if (!mod_path || !tag) return;

  /* Append a non-wrapping <span> with class `cn` and text `str` to `dest`.
     Kept as a function expression at function scope: a function
     declaration nested in an `if` block has engine-dependent semantics
     outside strict mode. */
  var add = function (dest, str, cn) {
    var s = document.createElement("span");
    s.className = cn;
    s.style.whiteSpace = "nowrap";
    s.appendChild(document.createTextNode(str));
    dest.appendChild(s);
  };

  var info = document.createElement("div");
  info.className = "RPartExplain";
  /* The "top" tag (quotes included in the attribute value) refers to a
     whole document: */
  var is_top = (tag === "\"top\"");
  info.appendChild(document.createTextNode("Link to this "
                                           + (is_top ? "document" : "section")
                                           + " with "));
  /* Break `secref` into two lines if the module path and tag
     are long enough: */
  var is_long = (is_top ? false : (mod_path.length + tag.length > 60));
  var line1 = document.createElement("div");
  /* When not long, line2 aliases line1 so everything lands on one line. */
  var line2 = (is_long ? document.createElement("div") : line1);

  /* Construct a `secref` call with suitable syntax coloring: */
  add(line1, "\xA0@", "RktRdr");
  add(line1, (is_top ? "other-doc" : "secref"), "RktSym");
  add(line1, "[", "RktPn");
  if (!is_top) {
    add(line1, tag, "RktVal");
  }
  if (is_long) {
    /* indent second line: */
    add(line2, "\xA0\xA0\xA0\xA0\xA0\xA0\xA0\xA0", "RktPn");
  }
  if (!is_top) {
    add(line2, " #:doc ", "RktPn");
  }
  add(line2, "'", "RktVal");
  add(line2, mod_path, "RktVal");
  add(line2, "]", "RktPn");
  info.appendChild(line1);
  if (is_long) {
    info.appendChild(line2);
  }
  info.style.display = "none";

  /* Add the new element after the header. insertBefore() with a null
     reference node appends at the end, so a header that is the last
     child needs no separate branch. */
  elem.parentNode.insertBefore(info, elem.nextSibling);

  /* Clicking the header toggles the explanation element: */
  elem.onclick = function () {
    info.style.display = (info.style.display === "none") ? "block" : "none";
  };
}

@ -20,7 +20,7 @@ var page_args =
function GetPageArg(key, def) { function GetPageArg(key, def) {
for (var i=0; i<page_args.length; i++) for (var i=0; i<page_args.length; i++)
if (page_args[i][0] == key) return unescape(page_args[i][1]); if (page_args[i][0] == key) return decodeURIComponent(page_args[i][1]);
return def; return def;
} }
@ -28,9 +28,13 @@ function MergePageArgsIntoLink(a) {
if (page_args.length == 0 || if (page_args.length == 0 ||
(!a.attributes["data-pltdoc"]) || (a.attributes["data-pltdoc"].value == "")) (!a.attributes["data-pltdoc"]) || (a.attributes["data-pltdoc"].value == ""))
return; return;
a.href.search(/^([^?#]*)(?:\?([^#]*))?(#.*)?$/); a.href = MergePageArgsIntoUrl(a.href);
}
function MergePageArgsIntoUrl(href) {
href.search(/^([^?#]*)(?:\?([^#]*))?(#.*)?$/);
if (RegExp.$2.length == 0) { if (RegExp.$2.length == 0) {
a.href = RegExp.$1 + "?" + page_query_string + RegExp.$3; return RegExp.$1 + "?" + page_query_string + RegExp.$3;
} else { } else {
// need to merge here, precedence to arguments that exist in `a' // need to merge here, precedence to arguments that exist in `a'
var i, j; var i, j;
@ -47,7 +51,7 @@ function MergePageArgsIntoLink(a) {
if (args[j] == page_args[i][0]) { exists = true; break; } if (args[j] == page_args[i][0]) { exists = true; break; }
if (!exists) str += "&" + page_args[i][0] + "=" + page_args[i][1]; if (!exists) str += "&" + page_args[i][0] + "=" + page_args[i][1];
} }
a.href = prefix + "?" + str + suffix; return prefix + "?" + str + suffix;
} }
} }
@ -127,8 +131,8 @@ function DoSearchKey(event, field, ver, top_path) {
if (event && event.keyCode == 13) { if (event && event.keyCode == 13) {
var u = GetCookie("PLT_Root."+ver, null); var u = GetCookie("PLT_Root."+ver, null);
if (u == null) u = top_path; // default: go to the top path if (u == null) u = top_path; // default: go to the top path
u += "search/index.html?q=" + escape(val); u += "search/index.html?q=" + encodeURIComponent(val);
if (page_query_string) u += "&" + page_query_string; u = MergePageArgsIntoUrl(u);
location = u; location = u;
return false; return false;
} }

@ -4,4 +4,4 @@
; Knuth and Liang's original exception patterns from classic TeX. ; Knuth and Liang's original exception patterns from classic TeX.
; In the public domain. ; In the public domain.
(define default-exceptions (list->vector (map symbol->string '(as-so-ciate as-so-ciates dec-li-na-tion oblig-a-tory phil-an-thropic present presents project projects reci-procity re-cog-ni-zance ref-or-ma-tion ret-ri-bu-tion ta-ble)))) (define default-exceptions '(as-so-ciate as-so-ciates dec-li-na-tion oblig-a-tory phil-an-thropic present presents project projects reci-procity re-cog-ni-zance ref-or-ma-tion ret-ri-bu-tion ta-ble))

@ -1,18 +1,8 @@
#lang racket/base #lang racket/base
(require (for-syntax racket/base)) (require (for-syntax racket/base))
(require racket/string racket/list racket/vector) (require racket/string racket/list racket/bool)
(require "patterns.rkt" "exceptions.rkt" txexpr xml) (require "patterns.rkt" "exceptions.rkt" txexpr xml)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; Hyphenate module
;;; Racket port of Ned Batchelder's hyphenate.py
;;; http://nedbatchelder.com/code/modules/hyphenate.html
;;; (in the public domain)
;;; which in turn was an implementation
;;; of the Liang hyphenation algorithm in TeX
;;; (also in the public domain)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(module+ safe (require racket/contract)) (module+ safe (require racket/contract))
(define-syntax (define+provide+safe stx) (define-syntax (define+provide+safe stx)
@ -28,122 +18,138 @@
(provide (contract-out [name contract]))))])) (provide (contract-out [name contract]))))]))
;; module data, define now but set! them later (because they're potentially big & slow) ;; module data, define now but set! them later (because they're potentially big & slow)
(define exceptions #f) (define patterns #f)
(define pattern-tree #f) (define pattern-cache #f)
;; module default values ;; module default values
(define default-min-length 5) (define default-min-length 5)
(define default-joiner (integer->char #x00AD)) (define default-min-ends-length 2)
(define default-joiner #\u00AD)
;; Store one entry in the module-level pattern-cache hash.
;; pat is a pair: its car is used as the key and its cdr as the value.
;; pattern-cache must already hold a mutable hash when this is called
;; (it is defined as #f until initialize-patterns creates it).
(define (add-pattern-to-cache pat)
(hash-set! pattern-cache (car pat) (cdr pat)))
;; Lazily build the two module-level tables. Each table is created only
;; while its variable is still #f, so repeated calls do not rebuild it.
;; NOTE(review): because pattern-cache is created only once, anything added
;; to it afterwards stays there across later calls -- confirm that is intended.
(define (initialize-patterns)
;; create the cache and seed it with the default exceptions
(when (not pattern-cache)
(set! pattern-cache (make-hash))
(for-each (compose1 add-exception symbol->string) default-exceptions))
;; build the patterns hash from default-patterns: each symbol is converted to
;; a string and then to a key/value pair by string->hashpair
(when (not patterns)
(set! patterns (make-hash (map (compose1 string->hashpair symbol->string) default-patterns)))))
;; Convert the hyphenated pattern into a point array for use later. ;; Convert the hyphenated pattern into a point array for use later.
(define (vector->exceptions exn-strings) (define (add-exception exception)
(define (make-key x) (define (make-key x) (format ".~a." (string-replace x "-" "")))
(string-replace x "-" "")) (define (make-value x) `(0 ,@(map (λ(x) (if (equal? x "-") 1 0)) (regexp-split #px"[a-z]" x)) 0))
(add-pattern-to-cache (cons (make-key exception) (make-value exception)))
(define (make-value x) (void))
(list->vector (cons 0 (map (λ(x) (if (equal? x "-") 1 0)) (regexp-split #px"[a-z]" x)))))
(make-hash (vector->list (vector-map (λ(x) (cons (make-key x) (make-value x))) exn-strings))))
;; An exception-word is a string of word characters or hyphens. ;; An exception-word is a string of word characters or hyphens.
(define (exception-word? x) (define (exception-word? x)
(if (regexp-match #px"^[\\w-]+$" x) #t #f)) (if (regexp-match #px"^[\\w-]+$" x) #t #f))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; Helper functions
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Convert a pattern like 'a1bc3d4' into a string of chars 'abcd' (define (string->hashpair pat)
;; and a list of points [ 1, 0, 3, 4 ]. (define boundary-name ".")
(define (make-pattern-tree pattern-data)
(define tree (make-hash)) ;; first convert the pattern to a list of alternating letters and numbers.
;; insert zeroes where there isn't a number in the pattern.
(define new-pat
(let* ([pat (map (λ(i) (format "~a" i)) (string->list pat))] ; convert to list
[pat (map (λ(i) (or (string->number i) i)) pat)] ; convert numbers
[pat (if (string? (car pat)) (cons 0 pat) pat)] ; add zeroes to front where needed
[pat (if (string? (car (reverse pat))) (reverse (cons 0 (reverse pat))) pat)]) ; and back
(flatten (for/list ([i (length pat)])
(define current (list-ref pat i))
(if (= i (sub1 (length pat)))
current
(let ([next (list-ref pat (add1 i))])
;; insert zeroes where there isn't a number
(if (and (or (equal? current boundary-name) (string? current)) (string? next))
(list current 0)
current)))))))
;; Insert the pattern into the tree. Each character finds a dict ;; then slice out the string & numerical parts to be a key / value pair.
;; another level down in the tree, and leaf nodes have the list of (define-values (value key) (partition number? new-pat))
;; points. (cons (apply string-append key) value))
(define (insert-pattern pat)
;; todo?: filter other characters out of input string?
(let* ([chars (regexp-replace* #px"[0-9]" pat "")]
;; regexp returns list of strings (define (make-points word)
[points (map (λ(x) (if (> (string-length x) 0) (string->number x) 0)) (regexp-split #px"[.a-z]" pat))] ;; walk through all the substrings and see if there's a matching pattern.
[tree tree]) ;; if so, pad it out to full length (so we can (apply map max ...) later on)
(for ([char chars]) (define word-with-dots (format ".~a." (string-downcase word)))
(when (not (hash-has-key? tree char)) (define matching-patterns
(hash-set! tree char (make-hash))) (if (hash-has-key? pattern-cache word-with-dots)
(set! tree (hash-ref tree char))) (list (hash-ref pattern-cache word-with-dots))
(hash-set! tree empty points))) (let ([word-as-list (string->list word-with-dots)])
(vector-map insert-pattern pattern-data) (cons (make-list (add1 (length word-as-list)) 0) ;; ensures there's at least one (null) element in return value
tree) (filter-not void?
(for*/list ([len (length word-as-list)] [index (- (length word-as-list) len)])
(define substring (list->string (take (drop word-as-list index) (add1 len))))
(when (hash-has-key? patterns substring)
(define (make-points word) (define value (hash-ref patterns substring))
;; put together head padding + value + tail padding
(append (make-list index 0) value (make-list (- (add1 (length word-as-list)) (length value) index) 0)))))))))
(define (make-zeroes points) (define max-value-pattern (apply map max matching-patterns))
; controls hyphenation zone from edges of word (add-pattern-to-cache (cons word-with-dots max-value-pattern))
; possible todo: make this user-configurable?
(vector-map (λ(i) (vector-set! points i 0)) (vector 1 2 (- (vector-length points) 2) (- (vector-length points) 3)))
points)
(let* ([word (string-downcase word)] ;; for point list,
[points ;; drop first two elements because they represent hyphenation weight
(if (hash-has-key? exceptions word) ;; before the starting "." and between "." and the first letter.
(hash-ref exceptions word) ;; drop last element because it represents hyphen after last "."
(let* ([work (string-append "." word ".")] ;; after you drop these two, then each number corresponds to
[points (make-vector (add1 (string-length work)) 0)]) ;; whether a hyphen goes after that letter.
(for ([i (string-length work)]) (drop-right (drop max-value-pattern 2) 1))
(let ([tree pattern-tree])
(for ([char (substring work i (string-length work))]
#:break (not (hash-has-key? tree char)))
(set! tree (hash-ref tree char))
(when (hash-has-key? tree empty)
(let ([point (hash-ref tree empty)])
(for ([j (length point)])
(vector-set! points (+ i j) (max (vector-ref points (+ i j)) (list-ref point j)))))))))
points))])
; make-zeroes controls minimum hyphenation distance from edge.
; todo: dropping first 2 elements is needed for mysterious reasons to be documented later
(vector-drop (make-zeroes points) 2)))
;; helpful extension of splitf-at ;; helpful extension of splitf-at
(define (splitf-at* xs split-test) (define (splitf-at* xs split-test)
(define (trim items test-proc) (define (trimf items test-proc)
(dropf-right (dropf items test-proc) test-proc)) (dropf-right (dropf items test-proc) test-proc))
(define (&splitf-at* xs [acc '()]) (define (&splitf-at* xs [acc '()])
(if (empty? xs) (if (empty? xs)
;; reverse because accumulation is happening backward ;; reverse because accumulation is happening backward
;; (because I'm using cons to push latest match onto front of list) ;; (because I'm using cons to push latest match onto front of list)
(reverse acc) (reverse acc)
(let-values ([(item rest) (let-values ([(item rest)
;; drop matching elements from front ;; drop matching elements from front
;; then split on nonmatching ;; then split on nonmatching
;; = nonmatching item + other elements (which will start with matching) ;; = nonmatching item + other elements (which will start with matching)
(splitf-at (dropf xs split-test) (compose1 not split-test))]) (splitf-at (dropf xs split-test) (compose1 not split-test))])
;; recurse, and store new item in accumulator ;; recurse, and store new item in accumulator
(&splitf-at* rest (cons item acc))))) (&splitf-at* rest (cons item acc)))))
;; trim off elements matching split-test ;; trim off elements matching split-test
(&splitf-at* (trim xs split-test))) (&splitf-at* (trimf xs split-test)))
;; Find hyphenation points in a word. This is not quite synonymous with syllables. ;; Find hyphenation points in a word. This is not quite synonymous with syllables.
(define (word->hyphenation-points word [min-length default-min-length]) (define (word->hyphenation-points word [min-length default-min-length] [min-ends-length default-min-ends-length])
(define (add-no-hyphen-zone points)
; points is a list corresponding to the letters of the word.
; to create a no-hyphenation zone of length n, zero out the first n-1 points
; and the last n points (because the last value in points is always superfluous)
(let* ([min-ends-length (or min-ends-length default-min-ends-length)]
[min-ends-length (min min-ends-length (length points))])
(define points-with-zeroes-on-left (append (make-list (sub1 min-ends-length) 0) (drop points (sub1 min-ends-length))))
(define points-with-zeroes-on-left-and-right (append (drop-right points-with-zeroes-on-left min-ends-length) (make-list min-ends-length 0)))
points-with-zeroes-on-left-and-right))
(define (make-pieces word) (define (make-pieces word)
(define word-dissected (flatten (for/list ([char word] (define word-dissected (flatten (for/list ([char word]
[point (make-points word)]) [point (add-no-hyphen-zone (make-points word))])
(if (even? point) (if (even? point)
char ; even point denotes character char ; even point denotes character
(cons char 'syllable))))) ; odd point denotes char + syllable (cons char 'syllable))))) ; odd point denotes char + syllable
(map list->string (splitf-at* word-dissected symbol?))) (map list->string (splitf-at* word-dissected symbol?)))
(if (and min-length (< (string-length word) min-length)) (if (and min-length (< (string-length word) min-length))
(list word) (list word)
(make-pieces word))) (make-pieces word)))
;; joiner contract allows char or string; this coerces to string. ;; joiner contract allows char or string; this coerces to string.
@ -158,41 +164,39 @@
[(and (txexpr? x) (not (omit-txexpr x))) (cons (car x) (map loop (cdr x)))] [(and (txexpr? x) (not (omit-txexpr x))) (cons (car x) (map loop (cdr x)))]
[else x]))) [else x])))
(define+provide+safe (hyphenate x [joiner default-joiner] (define+provide+safe (hyphenate x [joiner default-joiner]
#:exceptions [extra-exceptions '()] #:exceptions [extra-exceptions '()]
#:min-length [min-length default-min-length] #:min-length [min-length default-min-length]
#:omit-word [omit-word? (λ(x) #f)] #:min-ends-length [min-ends-length default-min-ends-length]
#:omit-string [omit-string? (λ(x) #f)] #:omit-word [omit-word? (λ(x) #f)]
#:omit-txexpr [omit-txexpr? (λ(x) #f)]) #:omit-string [omit-string? (λ(x) #f)]
#:omit-txexpr [omit-txexpr? (λ(x) #f)])
((xexpr?) ((or/c char? string?) ((xexpr?) ((or/c char? string?)
#:exceptions (listof exception-word?) #:exceptions (listof exception-word?)
#:min-length (or/c integer? #f) #:min-length (or/c integer? #f)
#:omit-word (string? . -> . any/c) #:omit-word (string? . -> . any/c)
#:omit-string (string? . -> . any/c) #:omit-string (string? . -> . any/c)
#:omit-txexpr (txexpr? . -> . any/c)) . ->* . xexpr/c) #:omit-txexpr (txexpr? . -> . any/c)
#:min-ends-length (or/c integer? #f)) . ->* . xexpr/c)
;; set up module data (initialize-patterns) ; reset everything each time hyphenate is called
;; todo?: change set! to parameterize (for-each add-exception extra-exceptions)
(set! exceptions (vector->exceptions (vector-append default-exceptions (list->vector extra-exceptions))))
(when (not pattern-tree) (set! pattern-tree (make-pattern-tree default-patterns)))
(define joiner-string (joiner->string joiner)) (define joiner-string (joiner->string joiner))
;; todo?: connect this regexp pattern to the one used in word? predicate ;; todo?: connect this regexp pattern to the one used in word? predicate
(define word-pattern #px"\\w+") ;; more restrictive than exception-word (define word-pattern #px"\\w+") ;; more restrictive than exception-word
(define (insert-hyphens text) (define (insert-hyphens text)
(regexp-replace* word-pattern text (λ(word) (if (not (omit-word? word)) (regexp-replace* word-pattern text (λ(word) (if (not (omit-word? word))
(string-join (word->hyphenation-points word min-length) joiner-string) (string-join (word->hyphenation-points word min-length min-ends-length) joiner-string)
word)))) word))))
(apply-proc insert-hyphens x omit-string? omit-txexpr?)) (apply-proc insert-hyphens x omit-string? omit-txexpr?))
(define+provide+safe (unhyphenate x [joiner default-joiner] (define+provide+safe (unhyphenate x [joiner default-joiner]
#:omit-word [omit-word? (λ(x) #f)] #:omit-word [omit-word? (λ(x) #f)]
#:omit-string [omit-string? (λ(x) #f)] #:omit-string [omit-string? (λ(x) #f)]
#:omit-txexpr [omit-txexpr? (λ(x) #f)]) #:omit-txexpr [omit-txexpr? (λ(x) #f)])
((xexpr/c) ((or/c char? string?) ((xexpr/c) ((or/c char? string?)
#:omit-word (string? . -> . any/c) #:omit-word (string? . -> . any/c)
#:omit-string (string? . -> . any/c) #:omit-string (string? . -> . any/c)
@ -201,7 +205,15 @@
(define word-pattern (pregexp (format "[\\w~a]+" joiner))) (define word-pattern (pregexp (format "[\\w~a]+" joiner)))
(define (remove-hyphens text) (define (remove-hyphens text)
(regexp-replace* word-pattern text (λ(word) (if (not (omit-word? word)) (regexp-replace* word-pattern text (λ(word) (if (not (omit-word? word))
(string-replace word (joiner->string joiner) "") (string-replace word (joiner->string joiner) "")
word)))) word))))
(apply-proc remove-hyphens x omit-string? omit-txexpr?)) (apply-proc remove-hyphens x omit-string? omit-txexpr?))
;; main submodule: builds the pattern tables, then hyphenates one long sample
;; word using a visible "-" as the joiner.
(module+ main
(initialize-patterns)
(define t "supercalifragilisticexpialidocious")
(hyphenate t "-"))

File diff suppressed because one or more lines are too long

@ -13,7 +13,7 @@
@defmodule[#:multi (hyphenate (submod hyphenate safe))] @defmodule[#:multi (hyphenate (submod hyphenate safe))]
A simple hyphenation engine that uses the Knuth–Liang hyphenation algorithm originally developed for TeX. This implementation is a port of Ned Batchelder's @link["http://nedbatchelder.com/code/modules/hyphenate.html"]{Python version}. I have added little to their work. Accordingly, I take little credit. A simple hyphenation engine that uses the Knuth–Liang hyphenation algorithm originally developed for TeX. I have added little to their work. Accordingly, I take little credit.
I originally put together this module to handle hyphenation for my web-based book @link["http://practicaltypography.com"]{Butterick's Practical Typography} (which I made with Racket & Scribble). Though support for CSS-based hyphenation in web browsers is @link["http://caniuse.com/#search=hyphen"]{still iffy}, soft hyphens work reliably well. But putting them into the text manually is a drag. Thus a module was born. I originally put together this module to handle hyphenation for my web-based book @link["http://practicaltypography.com"]{Butterick's Practical Typography} (which I made with Racket & Scribble). Though support for CSS-based hyphenation in web browsers is @link["http://caniuse.com/#search=hyphen"]{still iffy}, soft hyphens work reliably well. But putting them into the text manually is a drag. Thus a module was born.
@ -40,6 +40,7 @@ Safe mode enables the function contracts documented below. Use safe mode by impo
[joiner (or/c char? string?) (integer->char #x00AD)] [joiner (or/c char? string?) (integer->char #x00AD)]
[#:exceptions exceptions (listof string?) empty] [#:exceptions exceptions (listof string?) empty]
[#:min-length length (or/c integer? false?) 5] [#:min-length length (or/c integer? false?) 5]
[#:min-ends-length ends-length (or/c integer? false?) 2]
[#:omit-word word-test (string? . -> . any/c) (λ(x) #f)] [#:omit-word word-test (string? . -> . any/c) (λ(x) #f)]
[#:omit-string string-test (string? . -> . any/c) (λ(x) #f)] [#:omit-string string-test (string? . -> . any/c) (λ(x) #f)]
[#:omit-txexpr txexpr-test (txexpr? . -> . any/c) (λ(x) #f)]) [#:omit-txexpr txexpr-test (txexpr? . -> . any/c) (λ(x) #f)])
@ -49,12 +50,22 @@ Hyphenate @racket[_xexpr] by calculating hyphenation points and inserting @racke
@margin-note{The REPL displays a soft hyphen as @code{\u00AD}. But in ordinary use, you'll only see a soft hyphen when it appears at the end of a line or page as part of a hyphenated word. Otherwise it's not displayed. In most of the examples here, I use a standard hyphen for clarity (by adding @code{#\-} as an argument).} @margin-note{The REPL displays a soft hyphen as @code{\u00AD}. But in ordinary use, you'll only see a soft hyphen when it appears at the end of a line or page as part of a hyphenated word. Otherwise it's not displayed. In most of the examples here, I use a standard hyphen for clarity (by adding @code{#\-} as an argument).}
@examples[#:eval my-eval @examples[#:eval my-eval
(hyphenate "ergo polymorphic") (hyphenate "ergo polymorphism")
(hyphenate "ergo polymorphic" #\-) (hyphenate "ergo polymorphism" #\-)
(hyphenate "ergo polymorphic" #:min-length 13) (hyphenate "ergo polymorphism" #:min-length 13)
(hyphenate "ergo polymorphic" #:min-length #f) (hyphenate "ergo polymorphism" #:min-length #f)
] ]
The @racket[#:min-ends-length] keyword argument sets a minimum distance between a potential hyphen and either end of the word. The default is 2 characters. Larger values will reduce hyphens, but also prevent small word breaks. This value will override a smaller @racket[#:min-length] value.
@examples[#:eval my-eval
(hyphenate "ergo polymorphism" #\-)
(hyphenate "ergo polymorphism" #\- #:min-ends-length #f)
(hyphenate "ergo polymorphism" #\- #:min-ends-length 5)
(code:comment @#,t{Words won't be hyphenated because of large #:min-ends-length})
(hyphenate "ergo polymorphism" #\- #:min-length #f #:min-ends-length 15)
]
Because the hyphenation is based on an algorithm rather than a dictionary, it makes good guesses with unusual words: Because the hyphenation is based on an algorithm rather than a dictionary, it makes good guesses with unusual words:
@examples[#:eval my-eval @examples[#:eval my-eval
@ -66,9 +77,9 @@ Because the hyphenation is based on an algorithm rather than a dictionary, it ma
Using the @racket[#:exceptions] keyword, you can pass hyphenation exceptions as a list of words with hyphenation points marked with regular hyphens (@racket["-"]). If an exception word contains no hyphens, that word will never be hyphenated. Using the @racket[#:exceptions] keyword, you can pass hyphenation exceptions as a list of words with hyphenation points marked with regular hyphens (@racket["-"]). If an exception word contains no hyphens, that word will never be hyphenated.
@examples[#:eval my-eval @examples[#:eval my-eval
(hyphenate "polymorphic" #\-) (hyphenate "polymorphism" #\-)
(hyphenate "polymorphic" #\- #:exceptions '("polymo-rphic")) (hyphenate "polymorphism" #\- #:exceptions '("polymo-rphism"))
(hyphenate "polymorphic" #\- #:exceptions '("polymorphic")) (hyphenate "polymorphism" #\- #:exceptions '("polymorphism"))
] ]
Knuth & Liang were sufficiently confident about their algorithm that they originally released it with only 14 exceptions: @italic{associate[s], declination, obligatory, philanthropic, present[s], project[s], reciprocity, recognizance, reformation, retribution}, and @italic{table}. Admirable bravado, but it's not hard to discover others that need adjustment. Knuth & Liang were sufficiently confident about their algorithm that they originally released it with only 14 exceptions: @italic{associate[s], declination, obligatory, philanthropic, present[s], project[s], reciprocity, recognizance, reformation, retribution}, and @italic{table}. Admirable bravado, but it's not hard to discover others that need adjustment.
@ -79,7 +90,7 @@ Knuth & Liang were sufficiently confident about their algorithm that they origin
#:exceptions '("col-umns" "sign-age" "law-yers")) #:exceptions '("col-umns" "sign-age" "law-yers"))
] ]
Overall, my impression is that the Knuth–Liang algorithm is more likely to miss legitimate hyphenation points (i.e., generate false negatives) than create erroneous hyphenation points (i.e., false positives). This is good policy. Perfect hyphenation — that is, hyphenation that represents an exact linguistic syllabification of each word — is superfluous for typesetting. Hyphenation simply seeks to mark possible line-break and page-break locations for whatever layout engine is drawing the text. The ultimate goal is to permit more even text flow. Like horseshoes and hand grenades, close is good enough. And a word wrongly hyphenated is more likely to be noticed by a reader than a word inefficiently hyphenated. The Knuth–Liang algorithm is designed to omit legitimate hyphenation points (i.e., generate false negatives) more often than it creates erroneous hyphenation points (i.e., false positives). This is good policy. Perfect hyphenation — that is, hyphenation that represents an exact linguistic syllabification of each word — is superfluous for typesetting. Hyphenation simply seeks to mark possible line-break and page-break locations for whatever layout engine is drawing the text. The ultimate goal is to permit more even text flow. Like horseshoes and hand grenades, close is good enough. And a word wrongly hyphenated is more likely to be noticed by a reader than a word inefficiently hyphenated.
For this reason, certain words can't be hyphenated algorithmically, because the correct hyphenation depends on meaning, not merely on spelling. For instance: For this reason, certain words can't be hyphenated algorithmically, because the correct hyphenation depends on meaning, not merely on spelling. For instance:

@ -3,14 +3,15 @@
(require/expose "main.rkt" (word->hyphenation-points exception-word?)) (require/expose "main.rkt" (word->hyphenation-points exception-word?))
(check-equal? (hyphenate "edges") "edges") ;; word without matching patterns
(check-equal? (hyphenate "polymorphism") "poly\u00ADmor\u00ADphism") (check-equal? (hyphenate "polymorphism") "poly\u00ADmor\u00ADphism")
(check-equal? (hyphenate "POLYmorPHISM") "POLY\u00ADmor\u00ADPHISM")
(check-equal? (hyphenate "polymorphism" #:min-length 100) "polymorphism") (check-equal? (hyphenate "polymorphism" #:min-length 100) "polymorphism")
(check-equal? (hyphenate "ugly" #:min-length 1) "ug\u00ADly") (check-equal? (hyphenate "ugly" #:min-length 1) "ug\u00ADly")
(check-equal? (unhyphenate "poly\u00ADmor\u00ADphism") "polymorphism") (check-equal? (unhyphenate "poly\u00ADmor\u00ADphism") "polymorphism")
(check-equal? (hyphenate "polymorphism" #\-) "poly-mor-phism") (check-equal? (hyphenate "polymorphism" #\-) "poly-mor-phism")
(check-equal? (hyphenate "polymorphism" "foo") "polyfoomorfoophism") (check-equal? (hyphenate "polymorphism" "foo") "polyfoomorfoophism")
(check-equal? (unhyphenate "polyfoomorfoophism" "foo") "polymorphism") (check-equal? (unhyphenate "polyfoomorfoophism" "foo") "polymorphism")
(check-equal? (hyphenate "polymorphism" #\* #:exceptions '("polymo-rphism")) "polymo*rphism")
(check-equal? (hyphenate "circular polymorphism squandering") "cir\u00ADcu\u00ADlar poly\u00ADmor\u00ADphism squan\u00ADder\u00ADing") (check-equal? (hyphenate "circular polymorphism squandering") "cir\u00ADcu\u00ADlar poly\u00ADmor\u00ADphism squan\u00ADder\u00ADing")
(check-equal? (hyphenate '(p "circular polymorphism" amp (em "squandering"))) '(p "cir\u00ADcu\u00ADlar poly\u00ADmor\u00ADphism" amp (em "squan\u00ADder\u00ADing"))) (check-equal? (hyphenate '(p "circular polymorphism" amp (em "squandering"))) '(p "cir\u00ADcu\u00ADlar poly\u00ADmor\u00ADphism" amp (em "squan\u00ADder\u00ADing")))
(check-equal? (hyphenate "present project") "present project") ; exception words (check-equal? (hyphenate "present project") "present project") ; exception words
@ -56,4 +57,8 @@
'(p (script "tail-feathers") (em "tailfeathers"))) '(p (script "tail-feathers") (em "tailfeathers")))
(check-equal? (unhyphenate '(p "cir-cu-lar poly-mor-phism" "cir-cu-lar poly-mor-phisms") #\- #:omit-string (λ(x) (regexp-match #rx"s$" x))) (check-equal? (unhyphenate '(p "cir-cu-lar poly-mor-phism" "cir-cu-lar poly-mor-phisms") #\- #:omit-string (λ(x) (regexp-match #rx"s$" x)))
'(p "circular polymorphism" "cir-cu-lar poly-mor-phisms")) '(p "circular polymorphism" "cir-cu-lar poly-mor-phisms"))
(check-equal? (hyphenate "polymorphism" #\- #:min-ends-length 5) "polymor-phism")
(check-equal? (hyphenate "polymorphism" #\- #:min-ends-length 7) "polymorphism")
(check-equal? (hyphenate "polymorphism" #\* #:exceptions '("polymo-rphism")) "polymo*rphism")
Loading…
Cancel
Save