You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
typesetting/hyphenate/hyphenate/private/core.rkt

238 lines
11 KiB
Racket

#lang racket/base
(require txexpr/base racket/string racket/list)
(provide hyphenate unhyphenate word->hyphenation-points exception-word->word+pattern string->hashpair)
(module+ test
(require rackunit))
;; module default values
(define default-min-length 5)
(define default-min-left-length 2)
(define default-min-right-length 2)
(define default-joiner #\u00AD)
(define default-min-hyphen-count 1)
(define (exception-word->word+pattern ew)
;; an exception word indicates its breakpoints with added hyphens
(define word (string-replace ew "-" ""))
;; pattern has same number of points as word letters. 1 marks hyphenation point; 0 no hyphenation
(define points
6 years ago
(cdr (map (λ (x) (if (equal? x "-") 1 0)) (regexp-split #px"\\p{L}" ew))))
;; use list here so we can `apply` in `add-exception-word`
(list word points))
(module+ test
(check-equal? (exception-word->word+pattern "snôw-man") '("snôwman" (0 0 0 1 0 0 0)))
(check-equal? (exception-word->word+pattern "snôwman") '("snôwman" (0 0 0 0 0 0 0)))
(check-equal? (exception-word->word+pattern "sn-ôw-ma-n") '("snôwman" (0 1 0 1 0 1 0))))
(define (add-exception-word word-cache word)
;; `hash-set!` not `hash-ref!`, because we want an exception to override an existing value
(apply hash-set! word-cache (exception-word->word+pattern word)))
(define (remove-exception-word word-cache word)
(hash-remove! word-cache (string-replace word "-" "")))
(define (string->natural i)
(let* ([result (string->number i)]
[result (and result (inexact->exact result))]
[result (and (exact-nonnegative-integer? result) result)])
result))
(module+ test
(check-equal? (string->natural "Foo") #f)
(check-equal? (string->natural "1.0") 1)
(check-equal? (string->natural "-3") #f))
(define (string->hashpair pat)
(define-values (strs nums)
(for/lists (strs nums)
;; using unicode-aware regexps to allow unicode hyphenation patterns
8 years ago
;; a pattern is a list of subpatterns, each of which is a character possibly followed by a number.
;; also, the first position may just have a number.
([subpat (in-list (regexp-match* #px"^\\d?|(\\p{L}|\\p{P})\\d?" pat))])
(define str (cond
[(regexp-match #px"(\\p{L}|\\p{P})?" subpat) => car]
[else ""]))
(define num (cond
[(regexp-match #px"\\d" subpat) => (compose1 string->natural car)]
[else 0]))
(values str num)))
(list (string-append* strs) nums))
(module+ test
(check-equal? (string->hashpair "2'2") '("'" (2 2)))
(check-equal? (string->hashpair ".â4") '("" (0 0 4)))
(check-equal? (string->hashpair ".ý4") '("" (0 0 4)))
(check-equal? (string->hashpair "'ý4") '("" (0 0 4)))
(check-equal? (string->hashpair "ý4") '("’ý" (0 0 4)))
(check-equal? (string->hashpair "4ý-") '("ý-" (4 0 0)))
(check-equal? (string->hashpair ".ach4") '(".ach" (0 0 0 0 4))))
(define (calculate-max-pattern patterns)
;; each pattern is a list of numbers
;; all the patterns have the same length
;; run max against parallel elements
(apply map (λ xs (apply max xs)) patterns))
(module+ test
(require rackunit)
(check-equal? (calculate-max-pattern '((1 0 0))) '(1 0 0))
(check-equal? (calculate-max-pattern '((1 0 0) (0 1 0))) '(1 1 0))
(check-equal? (calculate-max-pattern '((1 0 0) (0 1 0) (0 0 1))) '(1 1 1))
(check-equal? (calculate-max-pattern '((1 2 3) (2 3 1) (3 1 2))) '(3 3 3)))
(define (make-points word word-cache pattern-cache)
(hash-ref! word-cache word
8 years ago
(λ ()
;; dots are used to denote the beginning and end of the word when matching patterns
(define boundary-marker ".")
(define word-with-boundaries (format "~a~a~a" boundary-marker (string-downcase word) boundary-marker))
(define word-length (string-length word-with-boundaries))
(define default-zero-pattern (make-list (add1 word-length) 0))
;; walk through all the substrings and see if there's a matching pattern.
(define matching-patterns
(for*/list ([start (in-range word-length)]
[end (in-range start word-length)]
8 years ago
[substr (in-value (substring word-with-boundaries start (add1 end)))]
[partial-pattern (in-value (hash-ref pattern-cache (string->symbol substr) #f))]
#:when partial-pattern)
;; pad out partial-pattern to full length
;; (so we can compare patterns to find max value for each slot)
(define left-zeroes (make-list start 0))
(define right-zeroes (make-list (- (add1 word-length) (length partial-pattern) start) 0))
(append left-zeroes partial-pattern right-zeroes)))
(define max-pattern (calculate-max-pattern (cons default-zero-pattern matching-patterns)))
;; for point list generated from a pattern,
;; drop first two elements because they represent hyphenation weight
;; before the starting "." and between "." and the first letter.
;; drop last element because it represents hyphen after last "."
;; after you drop these two, then each number corresponds to
;; whether a hyphen goes after that letter.
(drop-right (drop max-pattern 2) 1))))
;; Find hyphenation points in a word. This is not quite synonymous with syllables.
(define (word->hyphenation-points word
word-cache pattern-cache
8 years ago
[min-length #f]
[min-left-length #f]
[min-right-length #f])
8 years ago
(cond
[(< (string-length word) (or min-length default-min-length)) (list word)]
[else
;; points is a list corresponding to the letters of the word.
;; to create a no-hyphenation zone of length n, zero out the first n-1 points
;; and the last n points (because the last value in points is always superfluous)
(define word-points
(let* ([points (make-points word word-cache pattern-cache)]
8 years ago
[left-zeroes (min (or min-left-length default-min-left-length) (length points))]
[right-zeroes (min (or min-right-length default-min-right-length) (length points))])
(for/list ([(point idx) (in-indexed points)])
(if (<= left-zeroes (add1 idx) (- (length points) right-zeroes))
point
0))))
8 years ago
;; odd-valued points in the pattern denote hyphenation points
(define odd-point-indexes (for/list ([(wp idx) (in-indexed word-points)]
8 years ago
#:when (odd? wp))
idx))
8 years ago
;; the hyphenation goes after the indexed letter, so add1 to the raw points for slicing
(define breakpoints (append (list 0) (map add1 odd-point-indexes) (list (string-length word))))
(for/list ([start (in-list breakpoints)]
[end (in-list (cdr breakpoints))]) ; shorter list controls exit of loop
(substring word start end))]))
8 years ago
;; joiner contract allows char or string; this coerces to string.
(define (joiner->string joiner) (format "~a" joiner))
8 years ago
(define (apply-proc proc x
[omit-string (λ (x) #f)]
[omit-txexpr (λ (x) #f)]
[joiner default-joiner]
#:intercap-min-length [intercap-min-length #false])
(let loop ([x x])
(cond
[(and (string? x) (not (omit-string x)))
(define words
(cond
[intercap-min-length
;; handle intercapped words as a list of subwords,
;; subject to the intercap-min-length
(define zero-length-quantifier "")
(define letter-before-uc
;; match xXx but not xXX or XXX
(pregexp (format "(?<=\\p{L}{~a})(?=\\p{Lu}\\p{Ll}{~a})"
(if (> intercap-min-length 0)
intercap-min-length
zero-length-quantifier)
(if (> intercap-min-length 1)
(sub1 intercap-min-length)
zero-length-quantifier))))
(string-split x letter-before-uc)]
[else (list x)]))
(string-join (map proc words) (joiner->string joiner))]
[(and (txexpr? x) (not (omit-txexpr x)))
(make-txexpr (get-tag x) (get-attrs x) (map loop (get-elements x)))]
[else x])))
8 years ago
(define (hyphenate word-cache pattern-cache x [joiner default-joiner]
#:exceptions [extra-exceptions empty]
#:min-length [min-length default-min-length]
#:min-left-length [min-left-length default-min-left-length]
#:min-right-length [min-right-length default-min-right-length]
#:min-hyphens [min-hyphens-added default-min-hyphen-count]
6 years ago
#:omit-word [omit-word? (λ (x) #f)]
#:omit-string [omit-string? (λ (x) #f)]
#:omit-txexpr [omit-txexpr? (λ (x) #f)])
;; todo?: connect this regexp pattern to the one used in word? predicate
6 years ago
(for-each (λ (ee) (add-exception-word word-cache ee)) extra-exceptions)
(define word-pattern #px"\\p{L}+") ;; more restrictive than exception-word
(define (replacer word . words)
(cond
[(omit-word? word) word]
[else (define hyphenation-points
(word->hyphenation-points word word-cache pattern-cache min-length min-left-length min-right-length))
(cond
[(>= (sub1 (length hyphenation-points)) min-hyphens-added)
(string-join hyphenation-points (joiner->string joiner))]
[else word])]))
6 years ago
(define (insert-hyphens text) (regexp-replace* word-pattern text replacer))
(begin0
(apply-proc insert-hyphens x omit-string? omit-txexpr? joiner
#:intercap-min-length min-length)
6 years ago
;; deleting from the main cache is cheaper than having to do two cache lookups for every word
;; (missing words will just be regenerated later)
(for-each (λ (ee) (remove-exception-word word-cache ee)) extra-exceptions)))
(define (unhyphenate x [joiner default-joiner]
6 years ago
#:omit-word [omit-word? (λ (x) #f)]
#:omit-string [omit-string? (λ (x) #f)]
#:omit-txexpr [omit-txexpr? (λ (x) #f)])
(define word-pattern (pregexp (format "[\\w~a]+" joiner)))
(define (replacer word . words)
(if (not (omit-word? word))
(string-replace word (joiner->string joiner) "")
word))
8 years ago
(define (remove-hyphens text) (regexp-replace* word-pattern text replacer))
(apply-proc remove-hyphens x omit-string? omit-txexpr?))