add #:min-hyphens option

main
Matthew Butterick 5 years ago
parent 16ce7e19c7
commit 32cca4c6ab

@ -23,7 +23,8 @@
#:omit-string (string? . -> . any/c) #:omit-string (string? . -> . any/c)
#:omit-txexpr (txexpr? . -> . any/c) #:omit-txexpr (txexpr? . -> . any/c)
#:min-left-length (or/c (and/c integer? positive?) #f) #:min-left-length (or/c (and/c integer? positive?) #f)
#:min-right-length (or/c (and/c integer? positive?) #f)) . ->* . xexpr/c)) #:min-right-length (or/c (and/c integer? positive?) #f)
#:min-hyphens (and/c integer? positive?)) . ->* . xexpr/c))
(define (make-hyphenate-function patterns exceptions) (define (make-hyphenate-function patterns exceptions)
(make-keyword-procedure (make-keyword-procedure

@ -11,6 +11,7 @@
(define default-min-left-length 2) (define default-min-left-length 2)
(define default-min-right-length 2) (define default-min-right-length 2)
(define default-joiner #\u00AD) (define default-joiner #\u00AD)
(define default-min-hyphen-count 1)
(define (exception-word->word+pattern ew) (define (exception-word->word+pattern ew)
@ -108,11 +109,11 @@
[substr (in-value (substring word-with-boundaries start (add1 end)))] [substr (in-value (substring word-with-boundaries start (add1 end)))]
[partial-pattern (in-value (hash-ref pattern-cache (string->symbol substr) #f))] [partial-pattern (in-value (hash-ref pattern-cache (string->symbol substr) #f))]
#:when partial-pattern) #:when partial-pattern)
;; pad out partial-pattern to full length ;; pad out partial-pattern to full length
;; (so we can compare patterns to find max value for each slot) ;; (so we can compare patterns to find max value for each slot)
(define left-zeroes (make-list start 0)) (define left-zeroes (make-list start 0))
(define right-zeroes (make-list (- (add1 word-length) (length partial-pattern) start) 0)) (define right-zeroes (make-list (- (add1 word-length) (length partial-pattern) start) 0))
(append left-zeroes partial-pattern right-zeroes))) (append left-zeroes partial-pattern right-zeroes)))
(define max-pattern (calculate-max-pattern (cons default-zero-pattern matching-patterns))) (define max-pattern (calculate-max-pattern (cons default-zero-pattern matching-patterns)))
;; for point list generated from a pattern, ;; for point list generated from a pattern,
;; drop first two elements because they represent hyphenation weight ;; drop first two elements because they represent hyphenation weight
@ -141,20 +142,20 @@
[left-zeroes (min (or min-left-length default-min-left-length) (length points))] [left-zeroes (min (or min-left-length default-min-left-length) (length points))]
[right-zeroes (min (or min-right-length default-min-right-length) (length points))]) [right-zeroes (min (or min-right-length default-min-right-length) (length points))])
(for/list ([(point idx) (in-indexed points)]) (for/list ([(point idx) (in-indexed points)])
(if (<= left-zeroes (add1 idx) (- (length points) right-zeroes)) (if (<= left-zeroes (add1 idx) (- (length points) right-zeroes))
point point
0)))) 0))))
;; odd-valued points in the pattern denote hyphenation points ;; odd-valued points in the pattern denote hyphenation points
(define odd-point-indexes (for/list ([(wp idx) (in-indexed word-points)] (define odd-point-indexes (for/list ([(wp idx) (in-indexed word-points)]
#:when (odd? wp)) #:when (odd? wp))
idx)) idx))
;; the hyphenation goes after the indexed letter, so add1 to the raw points for slicing ;; the hyphenation goes after the indexed letter, so add1 to the raw points for slicing
(define breakpoints (append (list 0) (map add1 odd-point-indexes) (list (string-length word)))) (define breakpoints (append (list 0) (map add1 odd-point-indexes) (list (string-length word))))
(for/list ([start (in-list breakpoints)] (for/list ([start (in-list breakpoints)]
[end (in-list (cdr breakpoints))]) ; shorter list controls exit of loop [end (in-list (cdr breakpoints))]) ; shorter list controls exit of loop
(substring word start end))])) (substring word start end))]))
;; joiner contract allows char or string; this coerces to string. ;; joiner contract allows char or string; this coerces to string.
@ -168,7 +169,7 @@
;; handle intercapped words as capitalized pieces ;; handle intercapped words as capitalized pieces
(define letter-before-uc #px"(?<=\\p{Ll})(?=\\p{Lu}\\p{Ll})") ; match xXx but not xXX or XXX (define letter-before-uc #px"(?<=\\p{Ll})(?=\\p{Lu}\\p{Ll})") ; match xXx but not xXX or XXX
(string-join (for/list ([x (in-list (string-split x letter-before-uc))]) (string-join (for/list ([x (in-list (string-split x letter-before-uc))])
(proc x)) (joiner->string joiner))] (proc x)) (joiner->string joiner))]
[(and (txexpr? x) (not (omit-txexpr x))) [(and (txexpr? x) (not (omit-txexpr x)))
(make-txexpr (get-tag x) (get-attrs x) (map loop (get-elements x)))] (make-txexpr (get-tag x) (get-attrs x) (map loop (get-elements x)))]
[else x]))) [else x])))
@ -179,6 +180,7 @@
#:min-length [min-length default-min-length] #:min-length [min-length default-min-length]
#:min-left-length [min-left-length default-min-left-length] #:min-left-length [min-left-length default-min-left-length]
#:min-right-length [min-right-length default-min-right-length] #:min-right-length [min-right-length default-min-right-length]
#:min-hyphens [min-hyphens-added default-min-hyphen-count]
#:omit-word [omit-word? (λ (x) #f)] #:omit-word [omit-word? (λ (x) #f)]
#:omit-string [omit-string? (λ (x) #f)] #:omit-string [omit-string? (λ (x) #f)]
#:omit-txexpr [omit-txexpr? (λ (x) #f)]) #:omit-txexpr [omit-txexpr? (λ (x) #f)])
@ -187,15 +189,14 @@
(for-each (λ (ee) (add-exception-word word-cache ee)) extra-exceptions) (for-each (λ (ee) (add-exception-word word-cache ee)) extra-exceptions)
(define word-pattern #px"\\p{L}+") ;; more restrictive than exception-word (define word-pattern #px"\\p{L}+") ;; more restrictive than exception-word
(define (replacer word . words) (define (replacer word . words)
(if (omit-word? word) (cond
word [(omit-word? word) word]
(string-join (word->hyphenation-points word [else (define hyphenation-points
word-cache (word->hyphenation-points word word-cache pattern-cache min-length min-left-length min-right-length))
pattern-cache (cond
min-length [(>= (sub1 (length hyphenation-points)) min-hyphens-added)
min-left-length (string-join hyphenation-points (joiner->string joiner))]
min-right-length) [else word])]))
(joiner->string joiner))))
(define (insert-hyphens text) (regexp-replace* word-pattern text replacer)) (define (insert-hyphens text) (regexp-replace* word-pattern text replacer))
(begin0 (begin0
(apply-proc insert-hyphens x omit-string? omit-txexpr? joiner) (apply-proc insert-hyphens x omit-string? omit-txexpr? joiner)

@ -41,6 +41,7 @@ Safe mode enables the function contracts documented below. Use safe mode by impo
[#:min-length length (or/c integer? false?) 5] [#:min-length length (or/c integer? false?) 5]
[#:min-left-length left-length (or/c (and/c integer? positive?) #f) 2] [#:min-left-length left-length (or/c (and/c integer? positive?) #f) 2]
[#:min-right-length right-length (or/c (and/c integer? positive?) #f) 2] [#:min-right-length right-length (or/c (and/c integer? positive?) #f) 2]
[#:min-hyphens min-hyphen-count (and/c integer? positive?) 1]
[#:omit-word word-test (string? . -> . any/c) (λ (x) #f)] [#:omit-word word-test (string? . -> . any/c) (λ (x) #f)]
[#:omit-string string-test (string? . -> . any/c) (λ (x) #f)] [#:omit-string string-test (string? . -> . any/c) (λ (x) #f)]
[#:omit-txexpr txexpr-test (txexpr? . -> . any/c) (λ (x) #f)]) [#:omit-txexpr txexpr-test (txexpr? . -> . any/c) (λ (x) #f)])
@ -50,23 +51,34 @@ Hyphenate @racket[_xexpr] by calculating hyphenation points and inserting @racke
@margin-note{The REPL displays a soft hyphen as @code{\u00AD}. But in ordinary use, you'll only see a soft hyphen when it appears at the end of a line or page as part of a hyphenated word. Otherwise it's not displayed. In most of the examples here, I use a standard hyphen for clarity (by adding @code{#\-} as an argument).} @margin-note{The REPL displays a soft hyphen as @code{\u00AD}. But in ordinary use, you'll only see a soft hyphen when it appears at the end of a line or page as part of a hyphenated word. Otherwise it's not displayed. In most of the examples here, I use a standard hyphen for clarity (by adding @code{#\-} as an argument).}
@examples[#:eval my-eval @examples[#:eval my-eval
(hyphenate "ergo polymorphism") (hyphenate "snowman polymorphism")
(hyphenate "ergo polymorphism" #\-) (hyphenate "snowman polymorphism" #\-)
(hyphenate "ergo polymorphism" #:min-length 13) (hyphenate "snowman polymorphism" #:min-length 13)
(hyphenate "ergo polymorphism" #:min-length #f) (hyphenate "snowman polymorphism" #:min-length #f)
] ]
The @racket[#:min-left-length] and @racket[#:min-right-length] keyword arguments set the minimum distance between a potential hyphen and the left or right ends of the word. The default is 2 characters. Larger values will reduce hyphens, but also prevent small words from breaking. These values will override a smaller @racket[#:min-length] value. The @racket[#:min-left-length] and @racket[#:min-right-length] keyword arguments set the minimum distance between a potential hyphen and the left or right ends of the word. The default is 2 characters. Larger values will reduce hyphens, but also prevent small words from breaking. These values will override a smaller @racket[#:min-length] value.
@examples[#:eval my-eval @examples[#:eval my-eval
(hyphenate "ergo polymorphism" #\-) (hyphenate "snowman polymorphism" #\-)
(hyphenate "ergo polymorphism" #\- #:min-left-length #f) (hyphenate "snowman polymorphism" #\- #:min-left-length #f)
(hyphenate "ergo polymorphism" #\- #:min-length 2 #:min-left-length 5) (hyphenate "snowman polymorphism" #\- #:min-length 2 #:min-left-length 5)
(hyphenate "ergo polymorphism" #\- #:min-right-length 6) (hyphenate "snowman polymorphism" #\- #:min-right-length 6)
(code:comment @#,t{Next words won't be hyphenated becase of large #:min-left-length}) (code:comment @#,t{Next words won't be hyphenated becase of large #:min-left-length})
(hyphenate "ergo polymorphism" #\- #:min-length #f #:min-left-length 15) (hyphenate "snowman polymorphism" #\- #:min-length #f #:min-left-length 15)
] ]
Another way of controlling hyphen frequency is with the @racket[#:min-hyphens] keyword argument, which sets the minimum number of hyphens in a hyphenatable word. (It has no effect on non-hyphenatable words.) The default is 1 hyphen. Larger values will reduce hyphens, but also prevent small words from breaking.
@examples[#:eval my-eval
(hyphenate "snowman polymorphism" #\-)
(hyphenate "snowman polymorphism" #\- #:min-hyphens 1)
(code:comment @#,t{next "snowman" won't be hyphenated becase it doesn't have 2 hyphens})
(hyphenate "snowman polymorphism" #\- #:min-hyphens 2)
(code:comment @#,t{next "polymorphism" won't be hyphenated becase it doesn't have 3 hyphens})
(hyphenate "snowman polymorphism" #\- #:min-hyphens 3)
]
Because the hyphenation is based on an algorithm rather than a dictionary, it makes good guesses with unusual words: Because the hyphenation is based on an algorithm rather than a dictionary, it makes good guesses with unusual words:
@examples[#:eval my-eval @examples[#:eval my-eval

@ -11,9 +11,14 @@
(check-equal? (hyphenate "edges") "edges") ;; word without matching patterns (check-equal? (hyphenate "edges") "edges") ;; word without matching patterns
(check-equal? (hyphenate "polymorphism") "poly\u00ADmor\u00ADphism") (check-equal? (hyphenate "polymorphism") "poly\u00ADmor\u00ADphism")
(check-equal? (hyphenate "polymorphism" #:min-hyphens 1) "poly\u00ADmor\u00ADphism")
(check-equal? (hyphenate "polymorphism" #:min-hyphens 2) "poly\u00ADmor\u00ADphism")
(check-equal? (hyphenate "polymorphism" #:min-hyphens 3) "polymorphism")
(check-equal? (hyphenate "polymorphism" #:min-hyphens 42) "polymorphism")
(check-equal? (hyphenate "POLYmorPHISM") "POLY\u00ADmor\u00ADPHISM") (check-equal? (hyphenate "POLYmorPHISM") "POLY\u00ADmor\u00ADPHISM")
(check-equal? (hyphenate "polymorphism" #:min-length 100) "polymorphism") (check-equal? (hyphenate "polymorphism" #:min-length 100) "polymorphism")
(check-equal? (hyphenate "ugly" #:min-length 1) "ug\u00ADly") (check-equal? (hyphenate "ugly" #:min-length 1) "ug\u00ADly")
(check-equal? (hyphenate "ugly" #:min-length 1 #:min-hyphens 2) "ugly")
(check-equal? (unhyphenate "poly\u00ADmor\u00ADphism") "polymorphism") (check-equal? (unhyphenate "poly\u00ADmor\u00ADphism") "polymorphism")
(check-equal? (hyphenate "polymorphism" #\-) "poly-mor-phism") (check-equal? (hyphenate "polymorphism" #\-) "poly-mor-phism")
(check-equal? (hyphenate "compotumi" #\-) "com-po-tu-mi") (check-equal? (hyphenate "compotumi" #\-) "com-po-tu-mi")
@ -21,6 +26,8 @@
(check-equal? (hyphenate "polymorphism" "foo") "polyfoomorfoophism") (check-equal? (hyphenate "polymorphism" "foo") "polyfoomorfoophism")
(check-equal? (unhyphenate "polyfoomorfoophism" "foo") "polymorphism") (check-equal? (unhyphenate "polyfoomorfoophism" "foo") "polymorphism")
(check-equal? (hyphenate "circular polymorphism squandering") "cir\u00ADcu\u00ADlar poly\u00ADmor\u00ADphism squan\u00ADder\u00ADing") (check-equal? (hyphenate "circular polymorphism squandering") "cir\u00ADcu\u00ADlar poly\u00ADmor\u00ADphism squan\u00ADder\u00ADing")
(check-equal? (hyphenate "circular polymorphism squandering" #:min-hyphens 2) "cir\u00ADcu\u00ADlar poly\u00ADmor\u00ADphism squan\u00ADder\u00ADing")
(check-equal? (hyphenate "circular polymorphism squandering" #:min-hyphens 3) "circular polymorphism squandering")
(check-equal? (hyphenate '(p "circular polymorphism" amp (em "squandering"))) '(p "cir\u00ADcu\u00ADlar poly\u00ADmor\u00ADphism" amp (em "squan\u00ADder\u00ADing"))) (check-equal? (hyphenate '(p "circular polymorphism" amp (em "squandering"))) '(p "cir\u00ADcu\u00ADlar poly\u00ADmor\u00ADphism" amp (em "squan\u00ADder\u00ADing")))
(check-equal? (hyphenate "present project") "present project") ; exception words (check-equal? (hyphenate "present project") "present project") ; exception words
;; test these last so exceptions have been set up already ;; test these last so exceptions have been set up already

Loading…
Cancel
Save