diff --git a/words/char-index.rkt b/words/char-index.rkt index 0df9797..8b0dd81 100644 --- a/words/char-index.rkt +++ b/words/char-index.rkt @@ -36,12 +36,21 @@ (define charidx (let () (unless (file-exists? charidx-file) (regenerate-char-index!)) - (fasl->s-exp (open-input-file charidx-file)))) + (list->vector (fasl->s-exp (open-input-file charidx-file))))) (define (contains-char? charidx-entry c) (bitwise-bit-set? charidx-entry (char->bitindex c))) +(define capitalized-mask + (for/sum ([i (in-range 32 59)]) + (expt 2 i))) + +(define (capitalized? charidx-entry) + ;; a cap only appears at the beginning of a word, + ;; so it's sufficient to test whether a cap exists in the idx + (positive? (bitwise-and charidx-entry capitalized-mask))) + (module+ test - (require rackunit) - (check-equal? (length (filter (λ (ce) (contains-char? ce #\z)) charidx)) 7830) + (require rackunit racket/vector) + (check-equal? (vector-length (vector-filter (λ (ce) (contains-char? ce #\z)) charidx)) 7830) (check-equal? (charidx->chars (word->charidx "abuzz")) '(#\a #\b #\u #\z))) diff --git a/words/data/lengthidx.rktd b/words/data/lengthidx.rktd new file mode 100644 index 0000000..dbf3926 Binary files /dev/null and b/words/data/lengthidx.rktd differ diff --git a/words/data/wordidx.rktd b/words/data/wordidx.rktd new file mode 100644 index 0000000..31d7c9d Binary files /dev/null and b/words/data/wordidx.rktd differ diff --git a/words/length-index.rkt b/words/length-index.rkt new file mode 100644 index 0000000..3af1a02 --- /dev/null +++ b/words/length-index.rkt @@ -0,0 +1,16 @@ +#lang debug racket/base +(require racket/file + racket/fasl + "words.rkt") +(provide (all-defined-out)) + +(define lengthidx-file "data/lengthidx.rktd") + +(define (regenerate-length-index!) + (s-exp->fasl (map string-length usable-words) (open-output-file lengthidx-file #:exists 'replace))) + +(define lengthidx (let () + (unless (file-exists? lengthidx-file) + (regenerate-length-index!)) + (list->vector (fasl->s-exp (open-input-file lengthidx-file))))) + diff --git a/words/main.rkt b/words/main.rkt index db6e325..1cbfd86 100644 --- a/words/main.rkt +++ b/words/main.rkt @@ -2,61 +2,47 @@ (require racket/list racket/file "words.rkt" + "length-index.rkt" "char-index.rkt") (define (wordlist #:letters [letters "etaoinshrdluw"] #:mandatory [mandatory #f] - #:min [min-length 3] + #:min [min-length 5] #:max [max-length 10] #:hide-plurals [hide-plurals? #f] #:proper-names [proper-names? #f] #:random [random #t] #:max-words [max-words 10] #:all-caps [all-caps? #f] - #:initial-caps [initial-caps? #f] - #:suffix [suffix ""]) - ;; do local filtering here (i.e., filters that are true per query) - (define taker #f) - (let* ([ws (for/list ([w (in-list (if taker (take usable-words taker) usable-words))] - [w-charidx (in-list (if taker (take charidx taker) charidx))] - [idx (in-naturals)] - #:when (and - ;; between min and max length - (<= min-length (string-length w) max-length) - ;; contains each mandatory, case-insensitive - ;; slow: string match - #;(if mandatory - (for/and ([c (in-string mandatory)]) - (regexp-match (regexp (format "(?i:~a)" c)) w)) - #t) - ;; fast: index math - (or (not mandatory) - (for*/or ([mandatory-cs (in-value (string->list mandatory))] - [c (in-list (map char-downcase (charidx->chars w-charidx)))]) - (memv c mandatory-cs))) - ;; contains only letters + mandatory, case-insensitive - ;; slow: string match - #;(regexp-match (regexp (format "^(?i:[~a]+)$" (string-append letters (or mandatory "")))) w) - ;; fast: index match - - (for*/and ([letter-cs (in-value (string->list letters))] - [c (in-list (map char-downcase (charidx->chars w-charidx)))]) - (memv c letter-cs)) - ;; maybe only proper names - (regexp-match (if proper-names? #rx"^[A-Z]" #rx"^[a-z]") w) - ;; maybe hide plurals - (if hide-plurals? (not (regexp-match #rx"s$" w)) #t))) - w)] - [ws ((if random shuffle values) ws)] - [ws (if max-words (take ws (min (length ws) max-words)) ws)] - [ws (map (cond - [all-caps? string-upcase] - [initial-caps? string-titlecase] - [else values]) ws)] - [ws (if (positive? (string-length suffix)) - (map (λ (w) (string-append w "." suffix)) ws) - ws)]) - ws)) + #:initial-caps [initial-caps? #f]) + (define mandatory-cs (if mandatory (string->list mandatory) null)) + (define letter-cs (append (if letters (string->list letters) null) mandatory-cs)) + (for*/fold ([ws null] + [count 0] + #:result (map (cond + [all-caps? string-upcase] + [initial-caps? string-titlecase] + [else values]) ws)) + ([idx (in-list ((if random shuffle values) (range (vector-length usable-words))))] + [w (in-value (vector-ref usable-words idx))] + [w-charidx (in-value (vector-ref charidx idx))] + [w-lengthidx (in-value (vector-ref lengthidx idx))] + #:break (= count (or max-words +inf.0)) + #:when (and + ;; between min and max length + ((if (<= min-length max-length) <= >=) min-length w-lengthidx max-length) + ;; contains each mandatory, case-insensitive + (or (not mandatory) + (for/or ([c (in-list (map char-downcase (charidx->chars w-charidx)))]) + (memv c mandatory-cs))) + ;; contains only letters + mandatory, case-insensitive + (for/and ([c (in-list (map char-downcase (charidx->chars w-charidx)))]) + (memv c letter-cs)) + ;; maybe only proper names + (if proper-names? (capitalized? w-charidx) (not (capitalized? w-charidx))) + ;; maybe hide plurals + (if hide-plurals? (not (regexp-match #rx"s$" w)) #t))) + (values (cons w ws) (add1 count)))) (module+ test (time (wordlist))) \ No newline at end of file diff --git a/words/words.rkt b/words/words.rkt index 181e0ee..99622f4 100644 --- a/words/words.rkt +++ b/words/words.rkt @@ -1,5 +1,6 @@ #lang debug racket/base (require racket/file + racket/fasl racket/string) (provide usable-words) @@ -15,7 +16,15 @@ (regexp-match #rx"^[A-Za-z]+$" w) ; no accented letters (not (member w omit-words)) ; no bad words )) - w)) + w)) ws) -(define usable-words (make-wordlist)) \ No newline at end of file +(define wordidx-file "data/wordidx.rktd") + +(define (regenerate-word-index!) + (s-exp->fasl (make-wordlist) (open-output-file wordidx-file #:exists 'replace))) + +(define usable-words (let () + (unless (file-exists? wordidx-file) + (regenerate-word-index!)) + (list->vector (fasl->s-exp (open-input-file wordidx-file))))) \ No newline at end of file