diff --git a/words/char-index.rkt b/words/char-index.rkt deleted file mode 100644 index 17b0af8..0000000 --- a/words/char-index.rkt +++ /dev/null @@ -1,57 +0,0 @@ -#lang debug racket/base -(require racket/file - racket/fasl - racket/vector - "words.rkt") -(provide (all-defined-out)) - -(define (char->bitindex c) - ;; 64-bit layout - ;; __________ZYXWVUTSRQPONMLKJIHGFEDCBA______zyxwvutsrqponmlkjihgfedcba - (cond - [(char<=? #\a c #\z) (- (char->integer c) 97)] ; 97 = (char->integer #\a) - [(char<=? #\A c #\Z) (- (char->integer c) 33)] ; 65 = (char->integer #\A) - [else 0])) - -(define (word->charidx word) - (apply bitwise-ior - (for/list ([c (in-string word)]) - (expt 2 (char->bitindex c))))) - -(define (bitindex->char i) - (cond - [(<= 0 i 26) (integer->char (+ i 97))] - [(<= 32 i 59) (integer->char (+ i 33))] - [else (error 'bong)])) - -(define (charidx->chars int) - (for/list ([i (in-range 64)] - #:when (bitwise-bit-set? int i)) - (bitindex->char i))) - -(define charidx-file "compiled/charidx.rktd") - -(define (regenerate-char-index!) - (s-exp->fasl (vector-map word->charidx usable-words) (open-output-file charidx-file #:exists 'replace))) - -(define charidx (let () - (unless (file-exists? charidx-file) - (regenerate-char-index!)) - (fasl->s-exp (open-input-file charidx-file)))) - -(define (contains-char? charidx-entry c) - (bitwise-bit-set? charidx-entry (char->bitindex c))) - -(define capitalized-mask - (for/sum ([i (in-range 32 59)]) - (expt 2 i))) - -(define (capitalized? charidx-entry) - ;; a cap only appears at the beginning of a word, - ;; so it's sufficient to test whether a cap exists in the idx - (positive? (bitwise-and charidx-entry capitalized-mask))) - -(module+ test - (require rackunit racket/vector) - (check-equal? (vector-length (vector-filter (λ (ce) (contains-char? ce #\z)) charidx)) 7830) - (check-equal? (charidx->chars (word->charidx "abuzz")) '(#\a #\b #\u #\z))) diff --git a/words/index.rkt b/words/index.rkt new file mode 100644 index 0000000..a58cdff --- /dev/null +++ b/words/index.rkt @@ -0,0 +1,68 @@ +#lang debug racket/base +(require racket/file + racket/fasl + racket/runtime-path) +(provide (all-defined-out)) + +(define-runtime-path wordidx-file "compiled/words/words-index.rktd") + +(struct word-rec (word charint length) #:prefab) + +(define (make-word-recs) + (define reverse-string (compose1 list->string reverse string->list)) + (define omit-words (map reverse-string (file->lines "data/omit.rktd"))) + (for/vector ([w (in-lines (open-input-file "data/words.rktd"))] + #:when (and (not (regexp-match "'" w)) ; no apostrophes + (regexp-match #rx"^[A-Za-z]+$" w) ; no accented letters + (not (member w omit-words)))) ; no bad words + (word-rec w + (word->charidx w) + (string-length w)))) + +(define (regenerate-word-index!) + (make-parent-directory* wordidx-file) + (s-exp->fasl + (make-word-recs) + (open-output-file wordidx-file #:exists 'replace))) + +(define wordrecs + (fasl->s-exp (open-input-file (and + (unless (file-exists? wordidx-file) + (regenerate-word-index!)) + wordidx-file)))) + +(define (char->bitindex c) + ;; 64-bit layout + ;; __________ZYXWVUTSRQPONMLKJIHGFEDCBA______zyxwvutsrqponmlkjihgfedcba + (cond + [(char<=? #\a c #\z) (- (char->integer c) 97)] ; 97 = (char->integer #\a) + [(char<=? #\A c #\Z) (- (char->integer c) 33)] ; 65 = (char->integer #\A) + [else 0])) + +(define (word->charidx word) + (apply bitwise-ior + (for/list ([c (in-string word)]) + (expt 2 (char->bitindex c))))) + +(define (bitindex->char i) + (cond + [(<= 0 i 26) (integer->char (+ i 97))] + [(<= 32 i 59) (integer->char (+ i 33))] + [else (error 'bong)])) + +(define (charidx->chars int) + (for/list ([i (in-range 64)] + #:when (bitwise-bit-set? int i)) + (bitindex->char i))) + +(define (contains-char? charidx-entry c) + (bitwise-bit-set? charidx-entry (char->bitindex c))) + +(define capitalized-mask + (for/sum ([i (in-range 32 59)]) + (expt 2 i))) + +(define (capitalized? charidx-entry) + ;; a cap only appears at the beginning of a word, + ;; so it's sufficient to test whether a cap exists in the idx + (positive? (bitwise-and charidx-entry capitalized-mask))) \ No newline at end of file diff --git a/words/length-index.rkt b/words/length-index.rkt deleted file mode 100644 index c862588..0000000 --- a/words/length-index.rkt +++ /dev/null @@ -1,17 +0,0 @@ -#lang debug racket/base -(require racket/file - racket/fasl - racket/vector - "words.rkt") -(provide (all-defined-out)) - -(define lengthidx-file "compiled/lengthidx.rktd") - -(define (regenerate-length-index!) - (s-exp->fasl (vector-map string-length usable-words) (open-output-file lengthidx-file #:exists 'replace))) - -(define lengthidx (let () - (unless (file-exists? lengthidx-file) - (regenerate-length-index!)) - (fasl->s-exp (open-input-file lengthidx-file)))) - diff --git a/words/main.rkt b/words/main.rkt index bf1151e..fe901bb 100644 --- a/words/main.rkt +++ b/words/main.rkt @@ -1,38 +1,48 @@ #lang debug racket/base (require racket/list racket/file - "words.rkt" - "length-index.rkt" - "char-index.rkt") + "index.rkt") -(define (wordlist #:letters [letters "etaoinshrdluw"] - #:mandatory [mandatory #f] - #:min [min-length 5] - #:max [max-length 10] - #:hide-plurals [hide-plurals? #f] - #:proper-names [proper-names? #f] - #:random [random #t] - #:max-words [max-words 10] - #:all-caps [all-caps? #f] - #:initial-caps [initial-caps? #f]) +(define (make-words #:letters [letters "etaoinshrdluw"] + #:mandatory [mandatory #f] + #:min [min-length 5] + #:max [max-length 10] + #:hide-plurals [hide-plurals? #f] + #:proper-names [proper-names? #f] + #:random [random #t] + #:max-words [max-words 10] + #:all-caps [all-caps? #f] + #:initial-caps [initial-caps? #f]) (define mandatory-cs - (if mandatory (remove-duplicates (map char-downcase (string->list mandatory)) char=?) null)) - (define letter-cs (remove-duplicates (append (if letters (map char-downcase (string->list letters)) null) mandatory-cs) char=?)) - (define letter-cs-charidx (word->charidx (list->string letter-cs))) - (for*/fold ([ws null] + (if mandatory (remove-duplicates (for/list ([c (in-string mandatory)]) + (char-downcase c)) char=?) null)) + (define letter-cs-charidx + (word->charidx + (list->string + (remove-duplicates + (append (if letters + (for/list ([c (in-string letters)]) + (char-downcase c)) + null) + mandatory-cs) + char=?)))) + + (define capitalizer (cond + [all-caps? string-upcase] + [initial-caps? string-titlecase] + [else values])) + + (for*/fold ([word-acc null] [count 0] - #:result (map (cond - [all-caps? string-upcase] - [initial-caps? string-titlecase] - [else values]) ws)) - ([idx (in-list ((if random shuffle values) (range (vector-length usable-words))))] - [w (in-value (vector-ref usable-words idx))] - [w-charidx (in-value (vector-ref charidx idx))] - [w-lengthidx (in-value (vector-ref lengthidx idx))] + #:result word-acc) + ([idx (in-list ((if random shuffle values) (range (vector-length wordrecs))))] + [rec (in-value (vector-ref wordrecs idx))] + [w (in-value (word-rec-word rec))] + [w-charidx (in-value (word-rec-charint rec))] #:break (= count (or max-words +inf.0)) #:when (and ;; between min and max length - ((if (<= min-length max-length) <= >=) min-length w-lengthidx max-length) + ((if (<= min-length max-length) <= >=) min-length (word-rec-length rec) max-length) ;; word contains each mandatory char, case-insensitive (or (not mandatory) (for/and ([mc (in-list mandatory-cs)]) @@ -44,10 +54,10 @@ (if proper-names? (capitalized? w-charidx) (not (capitalized? w-charidx))) ;; maybe hide plurals (if hide-plurals? (not (regexp-match #rx"s$" w)) #t))) - (values (cons w ws) (add1 count)))) + (values (cons (capitalizer w) word-acc) (add1 count)))) (module+ test (require rackunit) - (time (wordlist)) - (check-equal? (sort (wordlist #:mandatory "xyz") stringstring reverse string->list)) - -(define omit-words (map reverse-string (file->lines "data/omit.rktd"))) - -(define (make-wordlist) - ;; do global filtering here (i.e., filters that are always true) - (define ws - (for/list ([w (in-lines (open-input-file "data/words.rktd"))] - #:when (and (not (regexp-match "'" w)) ; no apostrophes - (regexp-match #rx"^[A-Za-z]+$" w) ; no accented letters - (not (member w omit-words)) ; no bad words - )) - w)) - ws) - -(define wordidx-file "compiled/wordidx.rktd") - -(define (regenerate-word-index!) - (s-exp->fasl (make-wordlist) (open-output-file wordidx-file #:exists 'replace))) - -(define usable-words (let () - (unless (file-exists? wordidx-file) - (regenerate-word-index!)) - (list->vector (fasl->s-exp (open-input-file wordidx-file))))) \ No newline at end of file