## Doctest transcript for the AIMA text module: unigram/n-gram word models,
## Viterbi word segmentation, shift-cipher decoding, and a small IR system.

## Create a Unigram text model from the words in the book "Flatland".
>>> flatland = DataFile("flat11.txt").read()
>>> wordseq = words(flatland)
>>> P = UnigramTextModel(wordseq)
## Now do segmentation, using the text model as a prior.
>>> s, p = viterbi_segment('itiseasytoreadwordswithoutspaces', P)
>>> s
['it', 'is', 'easy', 'to', 'read', 'words', 'without', 'spaces']
>>> 1e-30 < p < 1e-20
True
>>> s, p = viterbi_segment('wheninthecourseofhumaneventsitbecomesnecessary', P)
>>> s
['when', 'in', 'the', 'course', 'of', 'human', 'events', 'it', 'becomes', 'necessary']
## Test the decoding system
>>> shift_encode("This is a secret message.", 17)
'Kyzj zj r jvtivk dvjjrxv.'
>>> ring = ShiftDecoder(flatland)
>>> ring.decode('Kyzj zj r jvtivk dvjjrxv.')
'This is a secret message.'
>>> ring.decode(rot13('Hello, world!'))
'Hello, world!'
## CountingProbDist
## Add ten thousand samples of a roll of a die to D.
>>> D = CountingProbDist()
>>> for i in range(10000):
...     D.add(random.choice('123456'))
>>> ps = [D[n] for n in '123456']
>>> 1./7. <= min(ps) <= max(ps) <= 1./5.
True
## demo
## Compare 1-, 2-, and 3-gram word models of the same text.
>>> flatland = DataFile("flat11.txt").read()
>>> wordseq = words(flatland)
>>> P1 = UnigramTextModel(wordseq)
>>> P2 = NgramTextModel(2, wordseq)
>>> P3 = NgramTextModel(3, wordseq)
## Generate random text from the N-gram models
>>> P1.samples(20)
'you thought known but were insides of see in depend by us dodecahedrons just but i words are instead degrees'
>>> P2.samples(20)
'flatland well then can anything else more into the total destruction and circles teach others confine women must be added'
>>> P3.samples(20)
'flatland by edwin a abbott 1884 to the wake of a certificate from nature herself proving the equal sided triangle'
## The most frequent entries in each model
>>> P1.top(10)
[(2081, 'the'), (1479, 'of'), (1021, 'and'), (1008, 'to'), (850, 'a'), (722, 'i'), (640, 'in'), (478, 'that'), (399, 'is'), (348, 'you')]
>>> P2.top(10)
[(368, ('of', 'the')), (152, ('to', 'the')), (152, ('in', 'the')), (86, ('of', 'a')), (80, ('it', 'is')), (71, ('by', 'the')), (68, ('for', 'the')), (68, ('and', 'the')), (62, ('on', 'the')), (60, ('to', 'be'))]
>>> P3.top(10)
[(30, ('a', 'straight', 'line')), (19, ('of', 'three', 'dimensions')), (16, ('the', 'sense', 'of')), (13, ('by', 'the', 'sense')), (13, ('as', 'well', 'as')), (12, ('of', 'the', 'circles')), (12, ('of', 'sight', 'recognition')), (11, ('the', 'number', 'of')), (11, ('that', 'i', 'had')), (11, ('so', 'as', 'to'))]
## Probabilities of some common n-grams
>>> P1['the']
0.061139348356200607
>>> P2[('of', 'the')]
0.010812081325655188
>>> P3[('', '', 'but')]
0.0
>>> P3[('so', 'as', 'to')]
0.00032318721353860618
## Distributions given the previous n-1 words
## NOTE(review): the expected output for the next example appears to be
## missing from this transcript — `.dictionary` on a populated bigram model
## should print a dict of counts. Confirm against the original doctest.
>>> P2.cond_prob['went',].dictionary
>>> P3.cond_prob['in', 'order'].dictionary
{'to': 6}
## Build and test an IR System
>>> uc = UnixConsultant()
>>> uc.present_results("how do I remove a file")
76.83| ../data/man/rm.txt | RM(1) FSF RM(1)
67.83| ../data/man/tar.txt | TAR(1) TAR(1)
67.79| ../data/man/cp.txt | CP(1) FSF CP(1)
66.58| ../data/man/zip.txt | ZIP(1L) ZIP(1L)
64.58| ../data/man/gzip.txt | GZIP(1) GZIP(1)
63.74| ../data/man/pine.txt | pine(1) pine(1)
62.95| ../data/man/shred.txt | SHRED(1) FSF SHRED(1)
57.46| ../data/man/pico.txt | pico(1) pico(1)
43.38| ../data/man/login.txt | LOGIN(1) Linux Programmer's Manual
41.93| ../data/man/ln.txt | LN(1) FSF LN(1)
>>> uc.present_results("how do I delete a file")
75.47| ../data/man/diff.txt | DIFF(1) GNU Tools DIFF(1)
69.12| ../data/man/pine.txt | pine(1) pine(1)
63.56| ../data/man/tar.txt | TAR(1) TAR(1)
60.63| ../data/man/zip.txt | ZIP(1L) ZIP(1L)
57.46| ../data/man/pico.txt | pico(1) pico(1)
51.28| ../data/man/shred.txt | SHRED(1) FSF SHRED(1)
26.72| ../data/man/tr.txt | TR(1) User Commands TR(1)
>>> uc.present_results("email")
18.39| ../data/man/pine.txt | pine(1) pine(1)
12.01| ../data/man/info.txt | INFO(1) FSF INFO(1)
9.89| ../data/man/pico.txt | pico(1) pico(1)
8.73| ../data/man/grep.txt | GREP(1) GREP(1)
8.07| ../data/man/zip.txt | ZIP(1L) ZIP(1L)
>>> uc.present_results("word counts for files")
112.38| ../data/man/grep.txt | GREP(1) GREP(1)
101.84| ../data/man/wc.txt | WC(1) User Commands WC(1)
82.46| ../data/man/find.txt | FIND(1L) FIND(1L)
74.64| ../data/man/du.txt | DU(1) FSF DU(1)
>>> uc.present_results("learn: date")
>>> uc.present_results("2003")
14.58| ../data/man/pine.txt | pine(1) pine(1)
11.62| ../data/man/jar.txt | FASTJAR(1) GNU FASTJAR(1)