one index

master
Matthew Butterick 5 years ago
parent 9e7fa0d5b4
commit c2f785f7d7

@ -1,57 +0,0 @@
#lang debug racket/base
(require racket/file
racket/fasl
racket/vector
"words.rkt")
(provide (all-defined-out))
(define (char->bitindex c)
;; 64-bit layout
;; __________ZYXWVUTSRQPONMLKJIHGFEDCBA______zyxwvutsrqponmlkjihgfedcba
(cond
[(char<=? #\a c #\z) (- (char->integer c) 97)] ; 97 = (char->integer #\a)
[(char<=? #\A c #\Z) (- (char->integer c) 33)] ; 65 = (char->integer #\A)
[else 0]))
(define (word->charidx word)
(apply bitwise-ior
(for/list ([c (in-string word)])
(expt 2 (char->bitindex c)))))
(define (bitindex->char i)
(cond
[(<= 0 i 26) (integer->char (+ i 97))]
[(<= 32 i 59) (integer->char (+ i 33))]
[else (error 'bong)]))
(define (charidx->chars int)
(for/list ([i (in-range 64)]
#:when (bitwise-bit-set? int i))
(bitindex->char i)))
(define charidx-file "compiled/charidx.rktd")
(define (regenerate-char-index!)
(s-exp->fasl (vector-map word->charidx usable-words) (open-output-file charidx-file #:exists 'replace)))
(define charidx (let ()
(unless (file-exists? charidx-file)
(regenerate-char-index!))
(fasl->s-exp (open-input-file charidx-file))))
(define (contains-char? charidx-entry c)
(bitwise-bit-set? charidx-entry (char->bitindex c)))
(define capitalized-mask
(for/sum ([i (in-range 32 59)])
(expt 2 i)))
(define (capitalized? charidx-entry)
;; a cap only appears at the beginning of a word,
;; so it's sufficient to test whether a cap exists in the idx
(positive? (bitwise-and charidx-entry capitalized-mask)))
(module+ test
(require rackunit racket/vector)
(check-equal? (vector-length (vector-filter (λ (ce) (contains-char? ce #\z)) charidx)) 7830)
(check-equal? (charidx->chars (word->charidx "abuzz")) '(#\a #\b #\u #\z)))

@ -0,0 +1,68 @@
#lang debug racket/base
(require racket/file
racket/fasl
racket/runtime-path)
(provide (all-defined-out))
(define-runtime-path wordidx-file "compiled/words/words-index.rktd")
(struct word-rec (word charint length) #:prefab)
(define (make-word-recs)
(define reverse-string (compose1 list->string reverse string->list))
(define omit-words (map reverse-string (file->lines "data/omit.rktd")))
(for/vector ([w (in-lines (open-input-file "data/words.rktd"))]
#:when (and (not (regexp-match "'" w)) ; no apostrophes
(regexp-match #rx"^[A-Za-z]+$" w) ; no accented letters
(not (member w omit-words)))) ; no bad words
(word-rec w
(word->charidx w)
(string-length w))))
(define (regenerate-word-index!)
(make-parent-directory* wordidx-file)
(s-exp->fasl
(make-word-recs)
(open-output-file wordidx-file #:exists 'replace)))
(define wordrecs
(fasl->s-exp (open-input-file (and
(unless (file-exists? wordidx-file)
(regenerate-word-index!))
wordidx-file))))
(define (char->bitindex c)
;; 64-bit layout
;; __________ZYXWVUTSRQPONMLKJIHGFEDCBA______zyxwvutsrqponmlkjihgfedcba
(cond
[(char<=? #\a c #\z) (- (char->integer c) 97)] ; 97 = (char->integer #\a)
[(char<=? #\A c #\Z) (- (char->integer c) 33)] ; 65 = (char->integer #\A)
[else 0]))
(define (word->charidx word)
(apply bitwise-ior
(for/list ([c (in-string word)])
(expt 2 (char->bitindex c)))))
(define (bitindex->char i)
(cond
[(<= 0 i 26) (integer->char (+ i 97))]
[(<= 32 i 59) (integer->char (+ i 33))]
[else (error 'bong)]))
(define (charidx->chars int)
(for/list ([i (in-range 64)]
#:when (bitwise-bit-set? int i))
(bitindex->char i)))
(define (contains-char? charidx-entry c)
(bitwise-bit-set? charidx-entry (char->bitindex c)))
(define capitalized-mask
(for/sum ([i (in-range 32 59)])
(expt 2 i)))
(define (capitalized? charidx-entry)
;; a cap only appears at the beginning of a word,
;; so it's sufficient to test whether a cap exists in the idx
(positive? (bitwise-and charidx-entry capitalized-mask)))

@ -1,17 +0,0 @@
#lang debug racket/base
(require racket/file
racket/fasl
racket/vector
"words.rkt")
(provide (all-defined-out))
(define lengthidx-file "compiled/lengthidx.rktd")
(define (regenerate-length-index!)
(s-exp->fasl (vector-map string-length usable-words) (open-output-file lengthidx-file #:exists 'replace)))
(define lengthidx (let ()
(unless (file-exists? lengthidx-file)
(regenerate-length-index!))
(fasl->s-exp (open-input-file lengthidx-file))))

@ -1,38 +1,48 @@
#lang debug racket/base
(require racket/list
racket/file
"words.rkt"
"length-index.rkt"
"char-index.rkt")
"index.rkt")
(define (wordlist #:letters [letters "etaoinshrdluw"]
#:mandatory [mandatory #f]
#:min [min-length 5]
#:max [max-length 10]
#:hide-plurals [hide-plurals? #f]
#:proper-names [proper-names? #f]
#:random [random #t]
#:max-words [max-words 10]
#:all-caps [all-caps? #f]
#:initial-caps [initial-caps? #f])
(define (make-words #:letters [letters "etaoinshrdluw"]
#:mandatory [mandatory #f]
#:min [min-length 5]
#:max [max-length 10]
#:hide-plurals [hide-plurals? #f]
#:proper-names [proper-names? #f]
#:random [random #t]
#:max-words [max-words 10]
#:all-caps [all-caps? #f]
#:initial-caps [initial-caps? #f])
(define mandatory-cs
(if mandatory (remove-duplicates (map char-downcase (string->list mandatory)) char=?) null))
(define letter-cs (remove-duplicates (append (if letters (map char-downcase (string->list letters)) null) mandatory-cs) char=?))
(define letter-cs-charidx (word->charidx (list->string letter-cs)))
(for*/fold ([ws null]
(if mandatory (remove-duplicates (for/list ([c (in-string mandatory)])
(char-downcase c)) char=?) null))
(define letter-cs-charidx
(word->charidx
(list->string
(remove-duplicates
(append (if letters
(for/list ([c (in-string letters)])
(char-downcase c))
null)
mandatory-cs)
char=?))))
(define capitalizer (cond
[all-caps? string-upcase]
[initial-caps? string-titlecase]
[else values]))
(for*/fold ([word-acc null]
[count 0]
#:result (map (cond
[all-caps? string-upcase]
[initial-caps? string-titlecase]
[else values]) ws))
([idx (in-list ((if random shuffle values) (range (vector-length usable-words))))]
[w (in-value (vector-ref usable-words idx))]
[w-charidx (in-value (vector-ref charidx idx))]
[w-lengthidx (in-value (vector-ref lengthidx idx))]
#:result word-acc)
([idx (in-list ((if random shuffle values) (range (vector-length wordrecs))))]
[rec (in-value (vector-ref wordrecs idx))]
[w (in-value (word-rec-word rec))]
[w-charidx (in-value (word-rec-charint rec))]
#:break (= count (or max-words +inf.0))
#:when (and
;; between min and max length
((if (<= min-length max-length) <= >=) min-length w-lengthidx max-length)
((if (<= min-length max-length) <= >=) min-length (word-rec-length rec) max-length)
;; word contains each mandatory char, case-insensitive
(or (not mandatory)
(for/and ([mc (in-list mandatory-cs)])
@ -44,10 +54,10 @@
(if proper-names? (capitalized? w-charidx) (not (capitalized? w-charidx)))
;; maybe hide plurals
(if hide-plurals? (not (regexp-match #rx"s$" w)) #t)))
(values (cons w ws) (add1 count))))
(values (cons (capitalizer w) word-acc) (add1 count))))
(module+ test
(require rackunit)
(time (wordlist))
(check-equal? (sort (wordlist #:mandatory "xyz") string<?)
(time (make-words))
(check-equal? (sort (make-words #:mandatory "xyz") string<?)
'("azoxy" "dysoxidize" "isazoxy" "oxytonize" "rhizotaxy" "zootaxy")))

@ -1,30 +0,0 @@
#lang debug racket/base
(require racket/file
racket/fasl
racket/string)
(provide usable-words)
(define reverse-string (compose1 list->string reverse string->list))
(define omit-words (map reverse-string (file->lines "data/omit.rktd")))
(define (make-wordlist)
;; do global filtering here (i.e., filters that are always true)
(define ws
(for/list ([w (in-lines (open-input-file "data/words.rktd"))]
#:when (and (not (regexp-match "'" w)) ; no apostrophes
(regexp-match #rx"^[A-Za-z]+$" w) ; no accented letters
(not (member w omit-words)) ; no bad words
))
w))
ws)
(define wordidx-file "compiled/wordidx.rktd")
(define (regenerate-word-index!)
(s-exp->fasl (make-wordlist) (open-output-file wordidx-file #:exists 'replace)))
(define usable-words (let ()
(unless (file-exists? wordidx-file)
(regenerate-word-index!))
(list->vector (fasl->s-exp (open-input-file wordidx-file)))))
Loading…
Cancel
Save