better sentence-ending edge cases (fixes #208)

pull/218/head
Matthew Butterick 5 years ago
parent d743f76bd6
commit 3d2dc8ccb8

@ -1 +1 @@
1566862068 1567968586

@ -1,13 +1,22 @@
#lang racket/base #lang racket/base
(require racket/list racket/string sugar/define sugar/test txexpr/base racket/match sugar/unstable/container sugar/coerce sugar/unstable/len "../private/whitespace.rkt") (require racket/list
racket/string
sugar/define
sugar/test
txexpr/base
racket/match
sugar/unstable/container
sugar/coerce
sugar/unstable/len
"../private/whitespace.rkt")
(provide whitespace? whitespace/nbsp?) (provide whitespace? whitespace/nbsp?)
(define (make-replacer query+replacement) (define ((make-replacer query+replacement) str)
(let ([queries (map car query+replacement)] (for/fold ([str str])
[replacements (map second query+replacement)]) ([qr (in-list query+replacement)])
;; reverse because first in list should be first applied to str (and compose1 works right-to-left) (match-define (list query replacement) qr)
(apply compose1 (reverse (map (λ (query replacement) (λ (str) (regexp-replace* query str replacement))) queries replacements))))) (regexp-replace* query str replacement)))
(define+provide/contract (smart-dashes str) (define+provide/contract (smart-dashes str)
(string? . -> . string?) (string? . -> . string?)
@ -36,10 +45,17 @@
#:single-open "" #:single-close "") #:single-open "" #:single-close "")
"«Why,» she couldzingve asked, «are we in Oahu watching Mame") "«Why,» she couldzingve asked, «are we in Oahu watching Mame")
(check-equal? (smart-quotes "\"\'Impossible.\' Yes.\"") "Impossible. Yes.”") (check-equal? (smart-quotes "\"\'Impossible.\' Yes.\"") "Impossible. Yes.”")
(check-equal? (smart-quotes "(\"No.\")") "(“No.”)")
(check-equal? (smart-quotes '(div "don'" (em "t"))) '(div "don" (em "t"))) (check-equal? (smart-quotes '(div "don'" (em "t"))) '(div "don" (em "t")))
(check-equal? (smart-quotes '(div "do '" (em "not'"))) '(div "do " (em "not")))) (check-equal? (smart-quotes '(div "do '" (em "not'"))) '(div "do " (em "not"))))
(define sentence-ender-exceptions (regexp-quote ",.:;?!])}"))
(define (at-beginning-pat str)
(pregexp (format "(?<!\\w)~a(?=\\S)" (regexp-quote str))))
(define (at-end-pat str)
(pregexp (format "(?<=\\S)~a(?!\\w)" (regexp-quote str))))
(define+provide/contract (smart-quotes x (define+provide/contract (smart-quotes x
#:apostrophe [apostrophe-str ""] #:apostrophe [apostrophe-str ""]
#:single-open [single-open-str ""] #:single-open [single-open-str ""]
@ -54,17 +70,18 @@
#:double-close string?) . ->* . (or/c string? txexpr?)) #:double-close string?) . ->* . (or/c string? txexpr?))
(define quotes (define quotes
`((#px"(?<=\\w)'(?=\\w)" ,apostrophe-str) ; apostrophe (list
(#px"(?<!\\w)'(?=[,.:;?!])" ,single-close-str) ; sentence ender on outside exceptions (list #px"(?<=\\w)'(?=\\w)" apostrophe-str) ; apostrophe
(#px"(?<!\\w)'(?=\\S)" ,single-open-str) ; single_at_beginning (list (pregexp (format "(?<!\\w)'(?=[~a])" sentence-ender-exceptions)) single-close-str) ; sentence ender on outside exceptions
(#px"(?<=\\S)'(?!\\w)" ,single-close-str) ; single_at_end (list (at-beginning-pat "'") single-open-str) ; single_at_beginning
(#px"(?<!\\w)\"(?=[,.:;?!])" ,double-close-str) ; sentence ender on outside exceptions (list (at-end-pat "'") single-close-str) ; single_at_end
(#px"(?<!\\w)\"(?=\\S)" ,double-open-str) ; double_at_beginning (list (pregexp (format "(?<!\\w)\"(?=[~a])" sentence-ender-exceptions)) double-close-str) ; sentence ender on outside exceptions
(#px"(?<=\\S)\"(?!\\w)" ,double-close-str))) ; double_at_end (list (at-beginning-pat "\"") double-open-str) ; double_at_beginning
(list (at-end-pat "\"") double-close-str))) ; double_at_end
(cond
[(string? x) ((make-replacer quotes) x)] (match x
[(txexpr? x) [(? string?) ((make-replacer quotes) x)]
[(? txexpr?)
;; convert the quotes as if the txexpr were a flat string, to get proximity right ;; convert the quotes as if the txexpr were a flat string, to get proximity right
;; then replace the actual strings with substrings from this converted result ;; then replace the actual strings with substrings from this converted result
;; todo: handle entities & chars correctly, for now they are ignored ;; todo: handle entities & chars correctly, for now they are ignored
@ -74,16 +91,16 @@
c)) c))
(define offset 0) (define offset 0)
(let loop ([x x]) (let loop ([x x])
(cond (match x
[(txexpr? x) (txexpr (get-tag x) (get-attrs x) (map loop (get-elements x)))] [(? txexpr?) (txexpr (get-tag x) (get-attrs x) (map loop (get-elements x)))]
[(string? x) [(? string?)
(define prev-offset offset) (define prev-offset offset)
(set! offset (+ prev-offset (string-length x))) (set! offset (+ prev-offset (string-length x)))
(list->string (list->string
(for/list ([c (in-vector char-vec prev-offset offset)]) (for/list ([c (in-vector char-vec prev-offset offset)])
c))] c))]
[else x]))] [_ x]))]
[else x])) [_ x]))
; wrap initial quotes for hanging punctuation ; wrap initial quotes for hanging punctuation
; todo: improve this ; todo: improve this
@ -96,23 +113,25 @@
(define two-or-more-char-string? (λ (i) (and (string? i) (>= (string-length i) 2)))) (define two-or-more-char-string? (λ (i) (and (string? i) (>= (string-length i) 2))))
(define-values (tag attr elements) (txexpr->values nx)) (define-values (tag attr elements) (txexpr->values nx))
(make-txexpr tag attr (make-txexpr tag attr
(if (and (list? elements) (not (empty? elements))) (match elements
(let ([new-car-elements (match (car elements) [(cons elem other-elems)
[(? two-or-more-char-string? tcs) (define new-car-elements
(define str-first (get tcs 0)) (match elem
(define str-rest (get tcs 1 (string-length tcs))) [(? two-or-more-char-string? tcs)
(cond (define str-first (get tcs 0))
[(str-first . in? . '("\"" "")) (define str-rest (get tcs 1 (string-length tcs)))
;; can wrap with any inline tag (cond
;; so that linebreak detection etc still works [(str-first . in? . '("\"" ""))
`(,@double-pp ,(->string #\“) ,str-rest)] ;; can wrap with any inline tag
[(str-first . in? . '("\'" "")) ;; so that linebreak detection etc still works
`(,@single-pp ,(->string #\) ,str-rest)] `(,@double-pp ,(->string #\“) ,str-rest)]
[else tcs])] [(str-first . in? . '("\'" ""))
[(? txexpr? nx) (wrap-hanging-quotes nx)] `(,@single-pp ,(->string #\) ,str-rest)]
[else (car elements)])]) [else tcs])]
(cons new-car-elements (cdr elements))) [(? txexpr? nx) (wrap-hanging-quotes nx)]
elements))) [_ elem]))
(cons new-car-elements other-elems)]
[_ elements])))
(module-test-external (module-test-external
(check-equal? (wrap-hanging-quotes '(p "\"Hi\" there")) '(p (dquo "" "Hi\" there"))) (check-equal? (wrap-hanging-quotes '(p "\"Hi\" there")) '(p (dquo "" "Hi\" there")))
@ -133,32 +152,35 @@
(define tags-to-pay-attention-to '(p aside)) ; only apply to paragraphs (define tags-to-pay-attention-to '(p aside)) ; only apply to paragraphs
(define (replace-last-space str) (define (replace-last-space str)
(if (#\space . in? . str) (cond
(let ([reversed-str-list (reverse (string->list str))] [(#\space . in? . str)
[reversed-nbsp (reverse (string->list (->string nbsp)))]) (define reversed-str-list (reverse (string->list str)))
(define-values (last-word-chars other-chars) (define reversed-nbsp (reverse (string->list (->string nbsp))))
(splitf-at reversed-str-list (λ (i) (not (eq? i #\space))))) (define-values (last-word-chars other-chars)
(splitf-at reversed-str-list (λ (i) (not (eq? i #\space)))))
(define front-chars (if (< (len last-word-chars) minimum-word-length) ; OK for long words to be on their own line (define front-chars
; first char of other-chars will be the space, so use cdr (cond
(string-append (list->string (reverse (cdr other-chars))) (->string nbsp)) [(< (len last-word-chars) minimum-word-length) ; OK for long words to be on their own line
(list->string (reverse other-chars)))) ; first char of other-chars will be the space, so use cdr
(define last-word (list->string (reverse last-word-chars))) (string-append (list->string (reverse (cdr other-chars))) (->string nbsp))]
`(,front-chars ,(last-word-proc last-word))) ; don't concatenate last word bc last-word-proc might be a txexpr wrapper [else (list->string (reverse other-chars))]))
(list str))) (define last-word (list->string (reverse last-word-chars)))
`(,front-chars ,(last-word-proc last-word))] ; don't concatenate last word bc last-word-proc might be a txexpr wrapper
[else (list str)]))
(define (find-last-word-space x) ; recursively traverse xexpr (define (find-last-word-space x) ; recursively traverse xexpr
(cond (match x
[(string? x) (replace-last-space x)] ; todo: this assumes a paragraph only has one string in it. [(? string?) (replace-last-space x)] ; todo: this assumes a paragraph only has one string in it.
[(txexpr? x) [(? txexpr?)
(let-values([(tag attr elements) (txexpr->values x)]) (define-values (tag attr elements) (txexpr->values x))
(if (> (length elements) 0) ; elements is list of xexprs (match elements
(let-values ([(all-but-last last) (split-at elements (sub1 (length elements)))]) [(list all-but-last ... last-item) ; elements is list of xexprs
(define result (find-last-word-space (car last))) (define result-items (match (find-last-word-space last-item)
(define result-items (if (txexpr? result) (list result) result)) ; might be txexpr, or list of new elements [(? txexpr? tx) (list tx)]
(make-txexpr tag attr `(,@all-but-last ,@result-items))) [other other])) ; might be txexpr, or list of new elements
x))] (make-txexpr tag attr `(,@all-but-last ,@result-items))]
[else x])) [_ x])]
[_ x]))
(if ((car x) . in? . tags-to-pay-attention-to) (if ((car x) . in? . tags-to-pay-attention-to)
(find-last-word-space x) (find-last-word-space x)

Loading…
Cancel
Save