delete readability file

main
Matthew Butterick 11 years ago
parent b344d1a868
commit 9719e6354b

@ -1,11 +1,11 @@
#lang racket/base
(require racket/string racket/list racket/contract racket/vector racket/bool)
(require "data.rkt" "readability.rkt")
(require "hyphenation-patterns.rkt")
(module+ test (require rackunit))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; Hyphenate.rkt
;;; Hyphenate module
;;; Racket port of Ned Batchelder's hyphenate.py
;;; http://nedbatchelder.com/code/modules/hyphenate.html
;;; (in the public domain)
@ -16,13 +16,13 @@
(provide (contract-out
[hyphenate
((string?) ((or/c char? string?) #:exceptions (listof word?) #:min-length (or/c integer? false?)) . ->* . string?)])
((string?) ((or/c char? string?) #:exceptions (listof word?) #:min-length (or/c integer? false?)) . ->* . string?)])
(contract-out
[hyphenatef
((string? procedure?) ((or/c char? string?) #:exceptions (listof word?) #:min-length (or/c integer? false?)) . ->* . string?)])
((string? procedure?) ((or/c char? string?) #:exceptions (listof word?) #:min-length (or/c integer? false?)) . ->* . string?)])
(contract-out
[unhyphenate
((string?) ((or/c char? string?)) . ->* . string?)]))
((string?) ((or/c char? string?)) . ->* . string?)]))
;; global data, define now but set! them later (because they're potentially big & slow)
(define exceptions #f)
@ -46,7 +46,7 @@
;; A word is a string without whitespace.
(define (word? x)
(->boolean (regexp-match #px"^\\S+$" x)))
(if (regexp-match #px"^\\S+$" x) #t #f))
(module+ test
(check-true (word? "Foobar"))
@ -69,7 +69,7 @@
(define (insert-pattern pat)
(let* ([chars (regexp-replace* #px"[0-9]" pat "")]
;; regexp returns list of strings
[points (map (λ(x) (if (> (len x) 0) (string->number x) 0)) (regexp-split #px"[.a-z]" pat))]
[points (map (λ(x) (if (> (string-length x) 0) (string->number x) 0)) (regexp-split #px"[.a-z]" pat))]
[tree tree])
(for ([char chars])
(when (not (hash-has-key? tree char))
@ -87,23 +87,23 @@
; controls hyphenation zone from edges of word
; todo: parameterize this setting
; todo: does this count end-of-word punctuation? it shouldn't.
(map (λ(i) (vector-set! points i 0)) (list 1 2 (- (len points) 2) (- (len points) 3)))
(map (λ(i) (vector-set! points i 0)) (list 1 2 (- (vector-length points) 2) (- (vector-length points) 3)))
points)
(let* ([word (string-downcase word)]
[points
(if (hash-has-key? exceptions word)
(hash-ref exceptions word)
(let* ([work (string-append "." (->string word) ".")]
[points (make-vector (add1 (len work)) 0)])
(for ([i (len work)])
(let* ([work (string-append "." word ".")]
[points (make-vector (add1 (string-length work)) 0)])
(for ([i (string-length work)])
(let ([tree pattern-tree])
(for ([char (substring work i (len work))]
(for ([char (substring work i (string-length work))]
#:break (not (hash-has-key? tree char)))
(set! tree (hash-ref tree char))
(when (hash-has-key? tree empty)
(let ([point (hash-ref tree empty)])
(for ([j (len point)])
(for ([j (length point)])
(vector-set! points (+ i j) (max (vector-ref points (+ i j)) (list-ref point j)))))))))
points))])
@ -112,6 +112,28 @@
(vector-drop (make-zeroes points) 2)))
;; helpful extension of splitf-at
(define (splitf-at* xs split-test)
(define (trim items test-proc)
(dropf-right (dropf items test-proc) test-proc))
(define (&splitf-at* xs [acc '()])
(if (empty? xs)
;; reverse because accumulation is happening backward
;; (because I'm using cons to push latest match onto front of list)
(reverse acc)
(let-values ([(item rest)
;; drop matching elements from front
;; then split on nonmatching
;; = nonmatching item + other elements (which will start with matching)
(splitf-at (dropf xs split-test) (compose1 not split-test))])
;; recurse, and store new item in accumulator
(&splitf-at* rest (cons item acc)))))
;; trim off elements matching split-test
(&splitf-at* (trim xs split-test)))
;; Find hyphenatable pieces of a word. This is not quite synonymous with syllables.
(define (word->pieces word [min-length default-min-length])
@ -124,11 +146,15 @@
(cons char 'syllable))))) ; odd point denotes char + syllable
(map list->string (splitf-at* word-dissected symbol?)))
(if (and min-length (< (len word) min-length))
(if (and min-length (< (string-length word) min-length))
(list word)
(make-pieces word)))
;; joiner contract allows char or string; this coerces to string.
(define (joiner->string joiner)
(if (char? joiner) (format "~a" joiner) joiner))
;; Hyphenate using a filter procedure.
;; Theoretically possible to do this externally,
;; but it would just mean doing the regexp-replace twice.
@ -136,18 +162,20 @@
;; set up module data
(set! exceptions (list->exceptions (append default-exceptions (map ->string extra-exceptions))))
(set! exceptions (list->exceptions (append default-exceptions extra-exceptions)))
(when (not pattern-tree) (set! pattern-tree (make-pattern-tree default-patterns)))
(regexp-replace* #px"\\w+" text (λ(word) (if (proc word) (string-join (word->pieces word min-length) (->string joiner)) word))))
(define joiner-string (joiner->string joiner))
(regexp-replace* #px"\\w+" text (λ(word) (if (proc word) (string-join (word->pieces word min-length) joiner-string) word))))
;; Default hyphenate function.
(define (hyphenate text [joiner default-joiner] #:exceptions [extra-exceptions '()] #:min-length [min-length default-min-length])
(hyphenatef text (λ(x) #t) joiner #:exceptions extra-exceptions #:min-length min-length))
(define (unhyphenate text [joiner default-joiner])
(string-replace text (->string joiner) ""))
(define (unhyphenate text [joiner default-joiner])
(define joiner-string (joiner->string joiner))
(string-replace text joiner-string ""))
(module+ test

@ -1,153 +0,0 @@
#lang racket/base
(require racket/contract net/url xml)
(require (only-in racket/list empty? range splitf-at dropf dropf-right))
(require (only-in racket/format ~a))
(require (only-in racket/string string-join))
(require (only-in racket/vector vector-member))
(require (only-in racket/set set set->list set?))
(module+ test (require rackunit))
(provide (all-defined-out))
;; general way of coercing to string
(define/contract (->string x)
(any/c . -> . string?)
(cond
[(string? x) x]
[(empty? x) ""]
[(symbol? x) (symbol->string x)]
[(number? x) (number->string x)]
[(path? x) (path->string x)]
[(char? x) (~a x)]
[(xexpr? x) (xexpr->string x)] ; put this last so other xexprish things don't get caught
[else (error (format "Can't make ~a into string" x))]))
(module+ test
(check-equal? (->string "foo") "foo")
(check-equal? (->string '()) "")
(check-equal? (->string 'foo) "foo")
(check-equal? (->string 123) "123")
(define file-name-as-text "foo.txt")
(check-equal? (->string (string->path file-name-as-text)) file-name-as-text)
(check-equal? (->string #\¶) "")
(check-equal? (->string '(foo "bar")) "<foo>bar</foo>"))
;; general way of coercing to a list
(define/contract (->list x)
(any/c . -> . list?)
(cond
[(list? x) x]
[(vector? x) (vector->list x)]
[(set? x) (set->list x)]
[else (list x)]))
(module+ test
(check-equal? (->list '(1 2 3)) '(1 2 3))
(check-equal? (->list (list->vector '(1 2 3))) '(1 2 3))
(check-equal? (->list (set 1 2 3)) '(3 2 1))
(check-equal? (->list "foo") (list "foo")))
;; general way of coercing to vector
(define (->vector x)
(any/c . -> . vector?)
; todo: on bad input, it will pop a list error rather than vector error
(cond
[(vector? x) x]
[else (list->vector (->list x))]))
;; general way of coercing to boolean
(define/contract (->boolean x)
(any/c . -> . boolean?)
;; in Racket, everything but #f is true
(if x #t #f))
(module+ test
(check-true (->boolean #t))
(check-false (->boolean #f))
(check-true (->boolean "#f"))
(check-true (->boolean "foo"))
(check-true (->boolean '()))
(check-true (->boolean '(1 2 3))))
(define/contract (has-length? x)
(any/c . -> . boolean?)
(ormap (λ(proc) (proc x)) (list list? string? symbol? vector? hash? set?)))
;; general way of asking for length
(define/contract (len x)
(has-length? . -> . integer?)
(cond
[(list? x) (length x)]
[(string? x) (string-length x)]
[(symbol? x) (len (->string x))]
[(vector? x) (len (->list x))]
[(hash? x) (len (hash-keys x))]
[(set? x) (len (->list x))]
[else #f]))
(module+ test
(check-equal? (len '(1 2 3)) 3)
(check-not-equal? (len '(1 2)) 3) ; len 2
(check-equal? (len "foo") 3)
(check-not-equal? (len "fo") 3) ; len 2
(check-equal? (len 'foo) 3)
(check-not-equal? (len 'fo) 3) ; len 2
(check-equal? (len (list->vector '(1 2 3))) 3)
(check-not-equal? (len (list->vector '(1 2))) 3) ; len 2
(check-equal? (len (set 1 2 3)) 3)
(check-not-equal? (len (set 1 2)) 3) ; len 2
(check-equal? (len (make-hash '((a . 1) (b . 2) (c . 3)))) 3)
(check-not-equal? (len (make-hash '((a . 1) (b . 2)))) 3)) ; len 2
;; trim from beginning & end of list
(define (trim items test-proc)
(list? procedure? . -> . list?)
(dropf-right (dropf items test-proc) test-proc))
(module+ test
; (check-equal? (trim (list "\n" " " 1 2 3 "\n") whitespace?) '(1 2 3))
(check-equal? (trim (list 1 3 2 4 5 6 8 9 13) odd?) '(2 4 5 6 8)))
;; split list into list of sublists using test-proc
(define/contract (splitf-at* xs split-test)
;; todo: better error message when split-test is not a predicate
(list? predicate/c . -> . (listof list?))
(define (&splitf-at* xs [acc '()]) ; use acc for tail recursion
(if (empty? xs)
;; reverse because accumulation is happening backward
;; (because I'm using cons to push latest match onto front of list)
(reverse acc)
(let-values ([(item rest)
;; drop matching elements from front
;; then split on nonmatching
;; = nonmatching item + other elements (which will start with matching)
(splitf-at (dropf xs split-test) (compose1 not split-test))])
;; recurse, and store new item in accumulator
(&splitf-at* rest (cons item acc)))))
;; trim off elements matching split-test
(&splitf-at* (trim xs split-test)))
(module+ test
; (check-equal? (splitf-at* '("foo" " " "bar" "\n" "\n" "ino") whitespace?) '(("foo")("bar")("ino")))
(check-equal? (splitf-at* '(1 2 3 4 5 6) even?) '((1)(3)(5))))
Loading…
Cancel
Save