From d74075b724a786539aa867ec9f456043d5070fd9 Mon Sep 17 00:00:00 2001
From: Matthew Butterick
Date: Sat, 6 Aug 2022 12:13:54 -0700
Subject: [PATCH] improve handling of intercapped words
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Intercapped (aka camelCase) words are treated like a list of subwords
joined by hyphens, but only if the pieces meet the usual min-length
threshold. But don’t use this splitting technique when unhyphenating.
---
 hyphenate/hyphenate/private/core.rkt | 31 ++++++++++++++++++++++------
 hyphenate/hyphenate/tests.rkt        |  4 +++-
 2 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/hyphenate/hyphenate/private/core.rkt b/hyphenate/hyphenate/private/core.rkt
index 85c14fda..9ef22945 100644
--- a/hyphenate/hyphenate/private/core.rkt
+++ b/hyphenate/hyphenate/private/core.rkt
@@ -162,14 +162,32 @@
 
 (define (joiner->string joiner) (format "~a" joiner))
 
-(define (apply-proc proc x [omit-string (λ (x) #f)] [omit-txexpr (λ (x) #f)] [joiner default-joiner])
+(define (apply-proc proc x
+                    [omit-string (λ (x) #f)]
+                    [omit-txexpr (λ (x) #f)]
+                    [joiner default-joiner]
+                    #:intercap-min-length [intercap-min-length #false])
   (let loop ([x x])
     (cond
       [(and (string? x) (not (omit-string x)))
-       ;; handle intercapped words as capitalized pieces
-       (define letter-before-uc #px"(?<=\\p{Ll})(?=\\p{Lu}\\p{Ll})") ; match xXx but not xXX or XXX
-       (string-join (for/list ([x (in-list (string-split x letter-before-uc))])
-                      (proc x)) (joiner->string joiner))]
+       (define words
+         (cond
+           [intercap-min-length
+            ;; handle intercapped words as a list of subwords,
+            ;; subject to the intercap-min-length
+            (define zero-length-quantifier "")
+            (define letter-before-uc
+              ;; match xXx but not xXX or XXX
+              (pregexp (format "(?<=\\p{L}{~a})(?=\\p{Lu}\\p{Ll}{~a})"
+                               (if (> intercap-min-length 0)
+                                   intercap-min-length
+                                   zero-length-quantifier)
+                               (if (> intercap-min-length 1)
+                                   (sub1 intercap-min-length)
+                                   zero-length-quantifier))))
+            (string-split x letter-before-uc)]
+           [else (list x)]))
+       (string-join (map proc words) (joiner->string joiner))]
       [(and (txexpr? x) (not (omit-txexpr x)))
        (make-txexpr (get-tag x) (get-attrs x) (map loop (get-elements x)))]
       [else x])))
@@ -199,7 +217,8 @@
          [else word])]))
   (define (insert-hyphens text) (regexp-replace* word-pattern text replacer))
   (begin0
-    (apply-proc insert-hyphens x omit-string? omit-txexpr? joiner)
+    (apply-proc insert-hyphens x omit-string? omit-txexpr? joiner
+                #:intercap-min-length min-length)
     ;; deleting from the main cache is cheaper than having to do two cache lookups for every word
     ;; (missing words will just be regenerated later)
     (for-each (λ (ee) (remove-exception-word word-cache ee)) extra-exceptions)))
diff --git a/hyphenate/hyphenate/tests.rkt b/hyphenate/hyphenate/tests.rkt
index 9652e3c7..4f358100 100644
--- a/hyphenate/hyphenate/tests.rkt
+++ b/hyphenate/hyphenate/tests.rkt
@@ -22,7 +22,9 @@
 (check-equal? (unhyphenate "poly\u00ADmor\u00ADphism") "polymorphism")
 (check-equal? (hyphenate "polymorphism" #\-) "poly-mor-phism")
 (check-equal? (hyphenate "compotumi" #\-) "com-po-tu-mi")
-(check-equal? (hyphenate "CompOtumi" #\-) "Comp-Otu-mi")
+(check-equal? (hyphenate "CompOtumi" #\-) "Com-pO-tu-mi")
+(check-equal? (hyphenate "iMagnificence" #\-) "iMag-nif-i-cence")
+(check-equal? (hyphenate "CompOtumi" #:min-length 4 #\-) "Comp-Otu-mi")
 (check-equal? (hyphenate "polymorphism" "foo") "polyfoomorfoophism")
 (check-equal? (unhyphenate "polyfoomorfoophism" "foo") "polymorphism")
 (check-equal? (hyphenate "circular polymorphism squandering") "cir\u00ADcu\u00ADlar poly\u00ADmor\u00ADphism squan\u00ADder\u00ADing")