master
Matthew Butterick 5 years ago
parent ed368e92d1
commit 556b803572

@ -36,12 +36,21 @@
(define charidx (let ()
(unless (file-exists? charidx-file)
(regenerate-char-index!))
(fasl->s-exp (open-input-file charidx-file))))
(list->vector (fasl->s-exp (open-input-file charidx-file)))))
(define (contains-char? charidx-entry c)
(bitwise-bit-set? charidx-entry (char->bitindex c)))
(define capitalized-mask
(for/sum ([i (in-range 32 59)])
(expt 2 i)))
(define (capitalized? charidx-entry)
;; a cap only appears at the beginning of a word,
;; so it's sufficient to test whether a cap exists in the idx
(positive? (bitwise-and charidx-entry capitalized-mask)))
(module+ test
(require rackunit)
(check-equal? (length (filter (λ (ce) (contains-char? ce #\z)) charidx)) 7830)
(require rackunit racket/vector)
(check-equal? (vector-length (vector-filter (λ (ce) (contains-char? ce #\z)) charidx)) 7830)
(check-equal? (charidx->chars (word->charidx "abuzz")) '(#\a #\b #\u #\z)))

Binary file not shown.

Binary file not shown.

@ -0,0 +1,16 @@
#lang debug racket/base
(require racket/file
racket/fasl
"words.rkt")
(provide (all-defined-out))
(define lengthidx-file "data/lengthidx.rktd")
(define (regenerate-length-index!)
(s-exp->fasl (map string-length usable-words) (open-output-file lengthidx-file #:exists 'replace)))
(define lengthidx (let ()
(unless (file-exists? lengthidx-file)
(regenerate-length-index!))
(list->vector (fasl->s-exp (open-input-file lengthidx-file)))))

@ -2,61 +2,47 @@
(require racket/list
racket/file
"words.rkt"
"length-index.rkt"
"char-index.rkt")
(define (wordlist #:letters [letters "etaoinshrdluw"]
#:mandatory [mandatory #f]
#:min [min-length 3]
#:min [min-length 5]
#:max [max-length 10]
#:hide-plurals [hide-plurals? #f]
#:proper-names [proper-names? #f]
#:random [random #t]
#:max-words [max-words 10]
#:all-caps [all-caps? #f]
#:initial-caps [initial-caps? #f]
#:suffix [suffix ""])
;; do local filtering here (i.e., filters that are true per query)
(define taker #f)
(let* ([ws (for/list ([w (in-list (if taker (take usable-words taker) usable-words))]
[w-charidx (in-list (if taker (take charidx taker) charidx))]
[idx (in-naturals)]
#:when (and
;; between min and max length
(<= min-length (string-length w) max-length)
;; contains each mandatory, case-insensitive
;; slow: string match
#;(if mandatory
(for/and ([c (in-string mandatory)])
(regexp-match (regexp (format "(?i:~a)" c)) w))
#t)
;; fast: index math
(or (not mandatory)
(for*/or ([mandatory-cs (in-value (string->list mandatory))]
[c (in-list (map char-downcase (charidx->chars w-charidx)))])
(memv c mandatory-cs)))
;; contains only letters + mandatory, case-insensitive
;; slow: string match
#;(regexp-match (regexp (format "^(?i:[~a]+)$" (string-append letters (or mandatory "")))) w)
;; fast: index match
(for*/and ([letter-cs (in-value (string->list letters))]
[c (in-list (map char-downcase (charidx->chars w-charidx)))])
(memv c letter-cs))
;; maybe only proper names
(regexp-match (if proper-names? #rx"^[A-Z]" #rx"^[a-z]") w)
;; maybe hide plurals
(if hide-plurals? (not (regexp-match #rx"s$" w)) #t)))
w)]
[ws ((if random shuffle values) ws)]
[ws (if max-words (take ws (min (length ws) max-words)) ws)]
[ws (map (cond
[all-caps? string-upcase]
[initial-caps? string-titlecase]
[else values]) ws)]
[ws (if (positive? (string-length suffix))
(map (λ (w) (string-append w "." suffix)) ws)
ws)])
ws))
#:initial-caps [initial-caps? #f])
(define mandatory-cs (if mandatory (string->list mandatory) null))
(define letter-cs (append (if letters (string->list letters) null) mandatory-cs))
(for*/fold ([ws null]
[count 0]
#:result (map (cond
[all-caps? string-upcase]
[initial-caps? string-titlecase]
[else values]) ws))
([idx (in-list ((if random shuffle values) (range (vector-length usable-words))))]
[w (in-value (vector-ref usable-words idx))]
[w-charidx (in-value (vector-ref charidx idx))]
[w-lengthidx (in-value (vector-ref lengthidx idx))]
#:break (= count (or max-words +inf.0))
#:when (and
;; between min and max length
((if (<= min-length max-length) <= >=) min-length w-lengthidx max-length)
;; contains each mandatory, case-insensitive
(or (not mandatory)
(for/or ([c (in-list (map char-downcase (charidx->chars w-charidx)))])
(memv c mandatory-cs)))
;; contains only letters + mandatory, case-insensitive
(for/and ([c (in-list (map char-downcase (charidx->chars w-charidx)))])
(memv c letter-cs))
;; maybe only proper names
(if proper-names? (capitalized? w-charidx) (not (capitalized? w-charidx)))
;; maybe hide plurals
(if hide-plurals? (not (regexp-match #rx"s$" w)) #t)))
(values (cons w ws) (add1 count))))
(module+ test
(time (wordlist)))

@ -1,5 +1,6 @@
#lang debug racket/base
(require racket/file
racket/fasl
racket/string)
(provide usable-words)
@ -15,7 +16,15 @@
(regexp-match #rx"^[A-Za-z]+$" w) ; no accented letters
(not (member w omit-words)) ; no bad words
))
w))
w))
ws)
(define usable-words (make-wordlist))
(define wordidx-file "data/wordidx.rktd")
(define (regenerate-word-index!)
(s-exp->fasl (make-wordlist) (open-output-file wordidx-file #:exists 'replace)))
(define usable-words (let ()
(unless (file-exists? wordidx-file)
(regenerate-word-index!))
(list->vector (fasl->s-exp (open-input-file wordidx-file)))))
Loading…
Cancel
Save