From 32cca4c6abf5c04ce4952f63d843ec0265c20a01 Mon Sep 17 00:00:00 2001 From: Matthew Butterick Date: Sat, 23 Mar 2019 11:39:38 -0700 Subject: [PATCH] add #:min-hyphens option --- hyphenate/hyphenate/private/bootstrap.rkt | 3 +- hyphenate/hyphenate/private/core.rkt | 41 ++++++++++--------- .../hyphenate/scribblings/hyphenate.scrbl | 30 ++++++++++---- hyphenate/hyphenate/tests.rkt | 7 ++++ 4 files changed, 51 insertions(+), 30 deletions(-) diff --git a/hyphenate/hyphenate/private/bootstrap.rkt b/hyphenate/hyphenate/private/bootstrap.rkt index a2aa9939..0e658f27 100644 --- a/hyphenate/hyphenate/private/bootstrap.rkt +++ b/hyphenate/hyphenate/private/bootstrap.rkt @@ -23,7 +23,8 @@ #:omit-string (string? . -> . any/c) #:omit-txexpr (txexpr? . -> . any/c) #:min-left-length (or/c (and/c integer? positive?) #f) - #:min-right-length (or/c (and/c integer? positive?) #f)) . ->* . xexpr/c)) + #:min-right-length (or/c (and/c integer? positive?) #f) + #:min-hyphens (and/c integer? positive?)) . ->* . xexpr/c)) (define (make-hyphenate-function patterns exceptions) (make-keyword-procedure diff --git a/hyphenate/hyphenate/private/core.rkt b/hyphenate/hyphenate/private/core.rkt index d3d86887..85c14fda 100644 --- a/hyphenate/hyphenate/private/core.rkt +++ b/hyphenate/hyphenate/private/core.rkt @@ -11,6 +11,7 @@ (define default-min-left-length 2) (define default-min-right-length 2) (define default-joiner #\u00AD) +(define default-min-hyphen-count 1) (define (exception-word->word+pattern ew) @@ -108,11 +109,11 @@ [substr (in-value (substring word-with-boundaries start (add1 end)))] [partial-pattern (in-value (hash-ref pattern-cache (string->symbol substr) #f))] #:when partial-pattern) - ;; pad out partial-pattern to full length - ;; (so we can compare patterns to find max value for each slot) - (define left-zeroes (make-list start 0)) - (define right-zeroes (make-list (- (add1 word-length) (length partial-pattern) start) 0)) - (append left-zeroes partial-pattern right-zeroes))) + ;; pad out partial-pattern to full length + ;; (so we can compare patterns to find max value for each slot) + (define left-zeroes (make-list start 0)) + (define right-zeroes (make-list (- (add1 word-length) (length partial-pattern) start) 0)) + (append left-zeroes partial-pattern right-zeroes))) (define max-pattern (calculate-max-pattern (cons default-zero-pattern matching-patterns))) ;; for point list generated from a pattern, ;; drop first two elements because they represent hyphenation weight @@ -141,20 +142,20 @@ [left-zeroes (min (or min-left-length default-min-left-length) (length points))] [right-zeroes (min (or min-right-length default-min-right-length) (length points))]) (for/list ([(point idx) (in-indexed points)]) - (if (<= left-zeroes (add1 idx) (- (length points) right-zeroes)) - point - 0)))) + (if (<= left-zeroes (add1 idx) (- (length points) right-zeroes)) + point + 0)))) ;; odd-valued points in the pattern denote hyphenation points (define odd-point-indexes (for/list ([(wp idx) (in-indexed word-points)] #:when (odd? wp)) - idx)) + idx)) ;; the hyphenation goes after the indexed letter, so add1 to the raw points for slicing (define breakpoints (append (list 0) (map add1 odd-point-indexes) (list (string-length word)))) (for/list ([start (in-list breakpoints)] [end (in-list (cdr breakpoints))]) ; shorter list controls exit of loop - (substring word start end))])) + (substring word start end))])) ;; joiner contract allows char or string; this coerces to string. @@ -168,7 +169,7 @@ ;; handle intercapped words as capitalized pieces (define letter-before-uc #px"(?<=\\p{Ll})(?=\\p{Lu}\\p{Ll})") ; match xXx but not xXX or XXX (string-join (for/list ([x (in-list (string-split x letter-before-uc))]) - (proc x)) (joiner->string joiner))] + (proc x)) (joiner->string joiner))] [(and (txexpr? x) (not (omit-txexpr x))) (make-txexpr (get-tag x) (get-attrs x) (map loop (get-elements x)))] [else x]))) @@ -179,6 +180,7 @@ #:min-length [min-length default-min-length] #:min-left-length [min-left-length default-min-left-length] #:min-right-length [min-right-length default-min-right-length] + #:min-hyphens [min-hyphens-added default-min-hyphen-count] #:omit-word [omit-word? (λ (x) #f)] #:omit-string [omit-string? (λ (x) #f)] #:omit-txexpr [omit-txexpr? (λ (x) #f)]) @@ -187,15 +189,14 @@ (for-each (λ (ee) (add-exception-word word-cache ee)) extra-exceptions) (define word-pattern #px"\\p{L}+") ;; more restrictive than exception-word (define (replacer word . words) - (if (omit-word? word) - word - (string-join (word->hyphenation-points word - word-cache - pattern-cache - min-length - min-left-length - min-right-length) - (joiner->string joiner)))) + (cond + [(omit-word? word) word] + [else (define hyphenation-points + (word->hyphenation-points word word-cache pattern-cache min-length min-left-length min-right-length)) + (cond + [(>= (sub1 (length hyphenation-points)) min-hyphens-added) + (string-join hyphenation-points (joiner->string joiner))] + [else word])])) (define (insert-hyphens text) (regexp-replace* word-pattern text replacer)) (begin0 (apply-proc insert-hyphens x omit-string? omit-txexpr? joiner) diff --git a/hyphenate/hyphenate/scribblings/hyphenate.scrbl b/hyphenate/hyphenate/scribblings/hyphenate.scrbl index 0120c6fe..ee945c01 100644 --- a/hyphenate/hyphenate/scribblings/hyphenate.scrbl +++ b/hyphenate/hyphenate/scribblings/hyphenate.scrbl @@ -41,6 +41,7 @@ Safe mode enables the function contracts documented below. Use safe mode by impo [#:min-length length (or/c integer? false?) 5] [#:min-left-length left-length (or/c (and/c integer? positive?) #f) 2] [#:min-right-length right-length (or/c (and/c integer? positive?) #f) 2] +[#:min-hyphens min-hyphen-count (and/c integer? positive?) 1] [#:omit-word word-test (string? . -> . any/c) (λ (x) #f)] [#:omit-string string-test (string? . -> . any/c) (λ (x) #f)] [#:omit-txexpr txexpr-test (txexpr? . -> . any/c) (λ (x) #f)]) @@ -50,23 +51,34 @@ Hyphenate @racket[_xexpr] by calculating hyphenation points and inserting @racke @margin-note{The REPL displays a soft hyphen as @code{\u00AD}. But in ordinary use, you'll only see a soft hyphen when it appears at the end of a line or page as part of a hyphenated word. Otherwise it's not displayed. In most of the examples here, I use a standard hyphen for clarity (by adding @code{#\-} as an argument).} @examples[#:eval my-eval - (hyphenate "ergo polymorphism") - (hyphenate "ergo polymorphism" #\-) - (hyphenate "ergo polymorphism" #:min-length 13) - (hyphenate "ergo polymorphism" #:min-length #f) + (hyphenate "snowman polymorphism") + (hyphenate "snowman polymorphism" #\-) + (hyphenate "snowman polymorphism" #:min-length 13) + (hyphenate "snowman polymorphism" #:min-length #f) ] The @racket[#:min-left-length] and @racket[#:min-right-length] keyword arguments set the minimum distance between a potential hyphen and the left or right ends of the word. The default is 2 characters. Larger values will reduce hyphens, but also prevent small words from breaking. These values will override a smaller @racket[#:min-length] value. @examples[#:eval my-eval - (hyphenate "ergo polymorphism" #\-) - (hyphenate "ergo polymorphism" #\- #:min-left-length #f) - (hyphenate "ergo polymorphism" #\- #:min-length 2 #:min-left-length 5) - (hyphenate "ergo polymorphism" #\- #:min-right-length 6) + (hyphenate "snowman polymorphism" #\-) + (hyphenate "snowman polymorphism" #\- #:min-left-length #f) + (hyphenate "snowman polymorphism" #\- #:min-length 2 #:min-left-length 5) + (hyphenate "snowman polymorphism" #\- #:min-right-length 6) (code:comment @#,t{Next words won't be hyphenated becase of large #:min-left-length}) - (hyphenate "ergo polymorphism" #\- #:min-length #f #:min-left-length 15) + (hyphenate "snowman polymorphism" #\- #:min-length #f #:min-left-length 15) ] +Another way of controlling hyphen frequency is with the @racket[#:min-hyphens] keyword argument, which sets the minimum number of hyphens in a hyphenatable word. (It has no effect on non-hyphenatable words.) The default is 1 hyphen. Larger values will reduce hyphens, but also prevent small words from breaking. + +@examples[#:eval my-eval + (hyphenate "snowman polymorphism" #\-) + (hyphenate "snowman polymorphism" #\- #:min-hyphens 1) + (code:comment @#,t{next "snowman" won't be hyphenated becase it doesn't have 2 hyphens}) + (hyphenate "snowman polymorphism" #\- #:min-hyphens 2) + (code:comment @#,t{next "polymorphism" won't be hyphenated becase it doesn't have 3 hyphens}) + (hyphenate "snowman polymorphism" #\- #:min-hyphens 3) +] + Because the hyphenation is based on an algorithm rather than a dictionary, it makes good guesses with unusual words: @examples[#:eval my-eval diff --git a/hyphenate/hyphenate/tests.rkt b/hyphenate/hyphenate/tests.rkt index fa420882..9652e3c7 100644 --- a/hyphenate/hyphenate/tests.rkt +++ b/hyphenate/hyphenate/tests.rkt @@ -11,9 +11,14 @@ (check-equal? (hyphenate "edges") "edges") ;; word without matching patterns (check-equal? (hyphenate "polymorphism") "poly\u00ADmor\u00ADphism") +(check-equal? (hyphenate "polymorphism" #:min-hyphens 1) "poly\u00ADmor\u00ADphism") +(check-equal? (hyphenate "polymorphism" #:min-hyphens 2) "poly\u00ADmor\u00ADphism") +(check-equal? (hyphenate "polymorphism" #:min-hyphens 3) "polymorphism") +(check-equal? (hyphenate "polymorphism" #:min-hyphens 42) "polymorphism") (check-equal? (hyphenate "POLYmorPHISM") "POLY\u00ADmor\u00ADPHISM") (check-equal? (hyphenate "polymorphism" #:min-length 100) "polymorphism") (check-equal? (hyphenate "ugly" #:min-length 1) "ug\u00ADly") +(check-equal? (hyphenate "ugly" #:min-length 1 #:min-hyphens 2) "ugly") (check-equal? (unhyphenate "poly\u00ADmor\u00ADphism") "polymorphism") (check-equal? (hyphenate "polymorphism" #\-) "poly-mor-phism") (check-equal? (hyphenate "compotumi" #\-) "com-po-tu-mi") @@ -21,6 +26,8 @@ (check-equal? (hyphenate "polymorphism" "foo") "polyfoomorfoophism") (check-equal? (unhyphenate "polyfoomorfoophism" "foo") "polymorphism") (check-equal? (hyphenate "circular polymorphism squandering") "cir\u00ADcu\u00ADlar poly\u00ADmor\u00ADphism squan\u00ADder\u00ADing") +(check-equal? (hyphenate "circular polymorphism squandering" #:min-hyphens 2) "cir\u00ADcu\u00ADlar poly\u00ADmor\u00ADphism squan\u00ADder\u00ADing") +(check-equal? (hyphenate "circular polymorphism squandering" #:min-hyphens 3) "circular polymorphism squandering") (check-equal? (hyphenate '(p "circular polymorphism" amp (em "squandering"))) '(p "cir\u00ADcu\u00ADlar poly\u00ADmor\u00ADphism" amp (em "squan\u00ADder\u00ADing"))) (check-equal? (hyphenate "present project") "present project") ; exception words ;; test these last so exceptions have been set up already