import collections
import itertools
import random
from math import log10
import importlib.resources as pkg_resources

import szyfrow.support.norms
from szyfrow.support.utilities import sanitise

from szyfrow import language_model_files

def datafile(name, sep='\t'):
    """Read key,value pairs from file.
    """
    with pkg_resources.open_text(language_model_files, name) as f:
        for line in f:
            splits = line.split(sep)
            yield [splits[0], int(splits[1])]
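
# datafile() yields (token, count) pairs from the tab-separated count files
# bundled with the package; the tables below turn those pairs into letter,
# bigram and trigram frequency models of English.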
english_counts = collections.Counter(dict(datafile('count_1l.txt')))
normalised_english_counts = szyfrow.support.norms.normalise(english_counts)

english_bigram_counts = collections.Counter(dict(datafile('count_2l.txt')))
normalised_english_bigram_counts = szyfrow.support.norms.normalise(
    english_bigram_counts)

english_trigram_counts = collections.Counter(dict(datafile('count_3l.txt')))
normalised_english_trigram_counts = szyfrow.support.norms.normalise(
    english_trigram_counts)

with pkg_resources.open_text(language_model_files, 'words.txt') as f:
    keywords = [line.rstrip() for line in f]

def weighted_choice(d):
    """Generate random item from a dictionary of item counts
    """
    target = random.uniform(0, sum(d.values()))
    cuml = 0.0
    for (l, p) in d.items():
        cuml += p
        if cuml > target:
            return l
    return None

def random_english_letter():
    """Generate a random letter based on English letter counts
    """
    return weighted_choice(normalised_english_counts)
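
# Illustrative note: weighted_choice() samples by walking the cumulative sum
# of the counts, so random_english_letter() returns letters roughly in
# proportion to their English frequency (e.g. 'e' far more often than 'z').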
57 """Returns all n-grams of a text
59 >>> ngrams(sanitise('the quick brown fox'), 2) # doctest: +NORMALIZE_WHITESPACE
60 ['th', 'he', 'eq', 'qu', 'ui', 'ic', 'ck', 'kb', 'br', 'ro', 'ow', 'wn',
62 >>> ngrams(sanitise('the quick brown fox'), 4) # doctest: +NORMALIZE_WHITESPACE
63 ['theq', 'hequ', 'equi', 'quic', 'uick', 'ickb', 'ckbr', 'kbro', 'brow',
64 'rown', 'ownf', 'wnfo', 'nfox']
66 return [text
[i
:i
+n
] for i
in range(len(text
)-n
+1)]
70 """A probability distribution estimated from counts in datafile.
71 Values are stored and returned as log probabilities.
73 def __init__(self
, data
=[], estimate_of_missing
=None):
74 data1
, data2
= itertools
.tee(data
)
75 self
.total
= sum([d
[1] for d
in data1
])
76 for key
, count
in data2
:
77 self
[key
] = log10(count
/ self
.total
)
78 self
.estimate_of_missing
= estimate_of_missing
or (lambda k
, N
: 1./N
)
79 def __missing__(self
, key
):
80 return self
.estimate_of_missing(key
, self
.total
)
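
# Pdist behaves like a dict of log10 probabilities: known keys return
# log10(count / total); unknown keys fall through __missing__ to the supplied
# estimate_of_missing callback. Note the default callback returns a plain
# probability (1/total) rather than a log probability; the instances created
# below all pass explicit estimators, so the default is never used there.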

def log_probability_of_unknown_word(key, N):
    """Estimate the probability of an unknown word.
    """
    return -log10(N * 10**((len(key) - 2) * 1.4))
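
# The unknown-word estimate penalises length: the returned value is
# -(log10(N) + 1.4 * (len(key) - 2)), so each letter beyond the second costs
# a further 1.4 orders of magnitude of probability.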

Pw = Pdist(datafile('count_1w.txt'), log_probability_of_unknown_word)
Pl = Pdist(datafile('count_1l.txt'), lambda _k, _N: 0)
P2l = Pdist(datafile('count_2l.txt'), lambda _k, _N: 0)
P3l = Pdist(datafile('count_3l.txt'), lambda _k, _N: 0)
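
# Pw holds word log probabilities (unknown words get the length-penalised
# estimate above); Pl, P2l and P3l hold letter, bigram and trigram log
# probabilities, where unknown n-grams return 0 and so add nothing to the
# sums computed below.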
93 """The Naive Bayes log probability of a sequence of words.
95 return sum(Pw
[w
.lower()] for w
in words
)

def Pletters(letters):
    """The Naive Bayes log probability of a sequence of letters.
    """
    return sum(Pl[l.lower()] for l in letters)

def Pbigrams(letters):
    """The Naive Bayes log probability of the bigrams formed from a sequence
    of letters.
    """
    return sum(P2l[p] for p in ngrams(letters, 2))

def Ptrigrams(letters):
    """The Naive Bayes log probability of the trigrams formed from a sequence
    of letters.
    """
    return sum(P3l[p] for p in ngrams(letters, 3))
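
# Typical use (a sketch, not part of the original module): these scores rank
# candidate plaintexts, e.g.
#     candidates = ['uryyb jbeyq', 'hello world']
#     best = max(candidates, key=lambda c: Ptrigrams(sanitise(c)))
# Larger (less negative) totals indicate a closer match to English n-gram
# statistics.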

def cosine_distance_score(text):
    """Finds the dissimilarity of a text to English, using the cosine distance
    of the frequency distribution.

    >>> cosine_distance_score('abcabc') # doctest: +ELLIPSIS
    """
    # return szyfrow.support.norms.cosine_distance(english_counts,
    #     collections.Counter(sanitise(text)))
    return 1 - szyfrow.support.norms.cosine_similarity(english_counts,
        collections.Counter(sanitise(text)))
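
# cosine_distance_score is an alternative, non-probabilistic fitness measure:
# 0 means the letter-frequency profile matches English exactly, and values
# closer to 1 mean the text looks very unlike English.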

if __name__ == "__main__":
    import doctest
    doctest.testmod()