master
Matthew Butterick 5 years ago
parent ed368e92d1
commit 556b803572

@ -36,12 +36,21 @@
(define charidx (let () (define charidx (let ()
(unless (file-exists? charidx-file) (unless (file-exists? charidx-file)
(regenerate-char-index!)) (regenerate-char-index!))
(fasl->s-exp (open-input-file charidx-file)))) (list->vector (fasl->s-exp (open-input-file charidx-file)))))
(define (contains-char? charidx-entry c) (define (contains-char? charidx-entry c)
(bitwise-bit-set? charidx-entry (char->bitindex c))) (bitwise-bit-set? charidx-entry (char->bitindex c)))
(define capitalized-mask
(for/sum ([i (in-range 32 59)])
(expt 2 i)))
(define (capitalized? charidx-entry)
;; a cap only appears at the beginning of a word,
;; so it's sufficient to test whether a cap exists in the idx
(positive? (bitwise-and charidx-entry capitalized-mask)))
(module+ test (module+ test
(require rackunit) (require rackunit racket/vector)
(check-equal? (length (filter (λ (ce) (contains-char? ce #\z)) charidx)) 7830) (check-equal? (vector-length (vector-filter (λ (ce) (contains-char? ce #\z)) charidx)) 7830)
(check-equal? (charidx->chars (word->charidx "abuzz")) '(#\a #\b #\u #\z))) (check-equal? (charidx->chars (word->charidx "abuzz")) '(#\a #\b #\u #\z)))

Binary file not shown.

Binary file not shown.

@ -0,0 +1,16 @@
#lang debug racket/base
(require racket/file
racket/fasl
"words.rkt")
(provide (all-defined-out))
(define lengthidx-file "data/lengthidx.rktd")
(define (regenerate-length-index!)
(s-exp->fasl (map string-length usable-words) (open-output-file lengthidx-file #:exists 'replace)))
(define lengthidx (let ()
(unless (file-exists? lengthidx-file)
(regenerate-length-index!))
(list->vector (fasl->s-exp (open-input-file lengthidx-file)))))

@ -2,61 +2,47 @@
(require racket/list (require racket/list
racket/file racket/file
"words.rkt" "words.rkt"
"length-index.rkt"
"char-index.rkt") "char-index.rkt")
(define (wordlist #:letters [letters "etaoinshrdluw"] (define (wordlist #:letters [letters "etaoinshrdluw"]
#:mandatory [mandatory #f] #:mandatory [mandatory #f]
#:min [min-length 3] #:min [min-length 5]
#:max [max-length 10] #:max [max-length 10]
#:hide-plurals [hide-plurals? #f] #:hide-plurals [hide-plurals? #f]
#:proper-names [proper-names? #f] #:proper-names [proper-names? #f]
#:random [random #t] #:random [random #t]
#:max-words [max-words 10] #:max-words [max-words 10]
#:all-caps [all-caps? #f] #:all-caps [all-caps? #f]
#:initial-caps [initial-caps? #f] #:initial-caps [initial-caps? #f])
#:suffix [suffix ""]) (define mandatory-cs (if mandatory (string->list mandatory) null))
;; do local filtering here (i.e., filters that are true per query) (define letter-cs (append (if letters (string->list letters) null) mandatory-cs))
(define taker #f) (for*/fold ([ws null]
(let* ([ws (for/list ([w (in-list (if taker (take usable-words taker) usable-words))] [count 0]
[w-charidx (in-list (if taker (take charidx taker) charidx))] #:result (map (cond
[idx (in-naturals)] [all-caps? string-upcase]
#:when (and [initial-caps? string-titlecase]
;; between min and max length [else values]) ws))
(<= min-length (string-length w) max-length) ([idx (in-list ((if random shuffle values) (range (vector-length usable-words))))]
;; contains each mandatory, case-insensitive [w (in-value (vector-ref usable-words idx))]
;; slow: string match [w-charidx (in-value (vector-ref charidx idx))]
#;(if mandatory [w-lengthidx (in-value (vector-ref lengthidx idx))]
(for/and ([c (in-string mandatory)]) #:break (= count (or max-words +inf.0))
(regexp-match (regexp (format "(?i:~a)" c)) w)) #:when (and
#t) ;; between min and max length
;; fast: index math ((if (<= min-length max-length) <= >=) min-length w-lengthidx max-length)
(or (not mandatory) ;; contains each mandatory, case-insensitive
(for*/or ([mandatory-cs (in-value (string->list mandatory))] (or (not mandatory)
[c (in-list (map char-downcase (charidx->chars w-charidx)))]) (for/or ([c (in-list (map char-downcase (charidx->chars w-charidx)))])
(memv c mandatory-cs))) (memv c mandatory-cs)))
;; contains only letters + mandatory, case-insensitive ;; contains only letters + mandatory, case-insensitive
;; slow: string match (for/and ([c (in-list (map char-downcase (charidx->chars w-charidx)))])
#;(regexp-match (regexp (format "^(?i:[~a]+)$" (string-append letters (or mandatory "")))) w) (memv c letter-cs))
;; fast: index match ;; maybe only proper names
(if proper-names? (capitalized? w-charidx) (not (capitalized? w-charidx)))
(for*/and ([letter-cs (in-value (string->list letters))] ;; maybe hide plurals
[c (in-list (map char-downcase (charidx->chars w-charidx)))]) (if hide-plurals? (not (regexp-match #rx"s$" w)) #t)))
(memv c letter-cs)) (values (cons w ws) (add1 count))))
;; maybe only proper names
(regexp-match (if proper-names? #rx"^[A-Z]" #rx"^[a-z]") w)
;; maybe hide plurals
(if hide-plurals? (not (regexp-match #rx"s$" w)) #t)))
w)]
[ws ((if random shuffle values) ws)]
[ws (if max-words (take ws (min (length ws) max-words)) ws)]
[ws (map (cond
[all-caps? string-upcase]
[initial-caps? string-titlecase]
[else values]) ws)]
[ws (if (positive? (string-length suffix))
(map (λ (w) (string-append w "." suffix)) ws)
ws)])
ws))
(module+ test (module+ test
(time (wordlist))) (time (wordlist)))

@ -1,5 +1,6 @@
#lang debug racket/base #lang debug racket/base
(require racket/file (require racket/file
racket/fasl
racket/string) racket/string)
(provide usable-words) (provide usable-words)
@ -15,7 +16,15 @@
(regexp-match #rx"^[A-Za-z]+$" w) ; no accented letters (regexp-match #rx"^[A-Za-z]+$" w) ; no accented letters
(not (member w omit-words)) ; no bad words (not (member w omit-words)) ; no bad words
)) ))
w)) w))
ws) ws)
(define usable-words (make-wordlist)) (define wordidx-file "data/wordidx.rktd")
(define (regenerate-word-index!)
(s-exp->fasl (make-wordlist) (open-output-file wordidx-file #:exists 'replace)))
(define usable-words (let ()
(unless (file-exists? wordidx-file)
(regenerate-word-index!))
(list->vector (fasl->s-exp (open-input-file wordidx-file)))))
Loading…
Cancel
Save