7 import importlib
.resources
as pkg_resources
9 import szyfrow
.support
.norms
10 from szyfrow
.support
.utilities
import sanitise
, deduplicate
11 from szyfrow
import language_model_files
def datafile(name, sep='\t'):
    """Read key,value pairs from a data file in the language_model_files package.

    Each line of `name` is split on `sep`; yields [key, count] pairs,
    with the count parsed as an int.

    :param name: file name within the language_model_files package.
    :param sep: field separator (default: tab).
    :yields: two-element lists [key, int(count)].
    """
    with pkg_resources.open_text(language_model_files, name) as f:
        for line in f:
            splits = line.split(sep)
            yield [splits[0], int(splits[1])]
# Unigram letter counts and their normalised (frequency-distribution) form.
english_counts = collections.Counter(dict(datafile('count_1l.txt')))
normalised_english_counts = szyfrow.support.norms.normalise(english_counts)

# Bigram letter counts and their normalised form.
english_bigram_counts = collections.Counter(dict(datafile('count_2l.txt')))
normalised_english_bigram_counts = szyfrow.support.norms.normalise(english_bigram_counts)

# Trigram letter counts and their normalised form.
english_trigram_counts = collections.Counter(dict(datafile('count_3l.txt')))
normalised_english_trigram_counts = szyfrow.support.norms.normalise(english_trigram_counts)

# Keyword list: one word per line, trailing whitespace stripped.
with pkg_resources.open_text(language_model_files, 'words.txt') as f:
    keywords = [line.rstrip() for line in f]
def transpositions_of(keyword):
    """Finds the transpositions given by a keyword. For instance, the keyword
    'clever' rearranges to 'celrv', so the first column (0) stays first, the
    second column (1) moves to third, the third column (2) moves to second,
    and so on.

    If passed a tuple, assume it's already a transposition and just return it.

    >>> transpositions_of('clever')
    (0, 2, 1, 4, 3)
    >>> transpositions_of('fred')
    (3, 2, 0, 1)
    >>> transpositions_of((3, 2, 0, 1))
    (3, 2, 0, 1)
    """
    if isinstance(keyword, tuple):
        # Already a transposition: pass it through unchanged.
        return keyword
    else:
        # Drop repeated letters, then record where each letter of the
        # deduplicated key ends up when the letters are sorted.
        key = deduplicate(keyword)
        transpositions = tuple(key.index(l) for l in sorted(key))
        return transpositions
# Map each distinct transposition to the list of keywords that produce it.
transpositions = collections.defaultdict(list)
for word in keywords:
    transpositions[transpositions_of(word)] += [word]
def weighted_choice(d):
    """Generate random item from a dictionary of item counts.

    Picks a key of `d` at random, with probability proportional to its
    count, by walking the cumulative counts until they pass a uniform
    random target. Returns None for an empty dictionary.
    """
    target = random.uniform(0, sum(d.values()))
    cumulative = 0
    for (l, p) in d.items():
        cumulative += p
        if cumulative > target:
            return l
    # Only reached for an empty dict (or, pathologically, when the
    # target equals the total and no count exceeds it).
    return None
def random_english_letter():
    """Pick one letter at random, weighted by English letter frequencies."""
    distribution = normalised_english_counts
    return weighted_choice(distribution)
81 """Returns all n-grams of a text
83 >>> ngrams(sanitise('the quick brown fox'), 2) # doctest: +NORMALIZE_WHITESPACE
84 ['th', 'he', 'eq', 'qu', 'ui', 'ic', 'ck', 'kb', 'br', 'ro', 'ow', 'wn',
86 >>> ngrams(sanitise('the quick brown fox'), 4) # doctest: +NORMALIZE_WHITESPACE
87 ['theq', 'hequ', 'equi', 'quic', 'uick', 'ickb', 'ckbr', 'kbro', 'brow',
88 'rown', 'ownf', 'wnfo', 'nfox']
90 return [text
[i
:i
+n
] for i
in range(len(text
)-n
+1)]
94 """A probability distribution estimated from counts in datafile.
95 Values are stored and returned as log probabilities.
97 def __init__(self
, data
=[], estimate_of_missing
=None):
98 data1
, data2
= itertools
.tee(data
)
99 self
.total
= sum([d
[1] for d
in data1
])
100 for key
, count
in data2
:
101 self
[key
] = log10(count
/ self
.total
)
102 self
.estimate_of_missing
= estimate_of_missing
or (lambda k
, N
: 1./N
)
103 def __missing__(self
, key
):
104 return self
.estimate_of_missing(key
, self
.total
)
def log_probability_of_unknown_word(key, N):
    """Estimate the probability of an unknown word.

    Longer unknown words are penalised: every letter beyond two scales
    the denominator by another factor of 10**1.4.
    """
    length_penalty = 10**((len(key) - 2) * 1.4)
    return -log10(N * length_penalty)
# Log-probability distributions built from the packaged count files.
# Pw: word unigrams; unknown words get a length-penalised estimate.
Pw = Pdist(datafile('count_1w.txt'), log_probability_of_unknown_word)
# Pl / P2l / P3l: letter unigrams, bigrams, trigrams; unseen n-grams
# contribute 0 to the summed log probability.
Pl = Pdist(datafile('count_1l.txt'), lambda _k, _N: 0)
P2l = Pdist(datafile('count_2l.txt'), lambda _k, _N: 0)
P3l = Pdist(datafile('count_3l.txt'), lambda _k, _N: 0)
117 """The Naive Bayes log probability of a sequence of words.
119 return sum(Pw
[w
.lower()] for w
in words
)
def Pletters(letters):
    """The Naive Bayes log probability of a sequence of letters.
    """
    score = 0
    for letter in letters:
        score += Pl[letter.lower()]
    return score
def Pbigrams(letters):
    """The Naive Bayes log probability of the bigrams formed from a sequence
    of letters.
    """
    score = 0
    for bigram in ngrams(letters, 2):
        score += P2l[bigram]
    return score
def Ptrigrams(letters):
    """The Naive Bayes log probability of the trigrams formed from a sequence
    of letters.
    """
    score = 0
    for trigram in ngrams(letters, 3):
        score += P3l[trigram]
    return score
def cosine_distance_score(text):
    """Finds the dissimilarity of a text to English, using the cosine distance
    of the frequency distribution.

    Computed as 1 minus the cosine similarity between the English letter
    counts and the letter counts of the sanitised text.
    """
    letter_counts = collections.Counter(sanitise(text))
    similarity = szyfrow.support.norms.cosine_similarity(
        english_counts, letter_counts)
    return 1 - similarity
152 if __name__
== "__main__":