4 formats: ipynb,md,py:percent
9 jupytext_version: 1.14.5
11 display_name: Python 3 (ipykernel)
16 # Generate a dictionary of related words
21 from dataclasses import dataclass
22 from typing import Dict, Tuple, List, Set
23 from enum import Enum, auto
26 import multiprocessing
# Filler words that carry no meaning of their own.
stop_words = {
    'my', 'is', 'in', 'within', 'lies', 'and', 'also',
    'always', 'you', 'will', 'find', 'the', 'found', 'but',
}
# Words that negate the clause they appear in.
negative_words = {'not', 'never', 'neither', 'nor'}
37 ordinals : Dict[str, int] = { 'last': -1
# Inverse of `ordinals`: numeric position -> ordinal word. If several words
# share a position, the one appearing last in `ordinals` wins.
reverse_ordinals : Dict[int, str] = {position: name for name, position in ordinals.items()}
# Build the working word list from the system dictionary.
#
# A word is kept when it is all lower-case, purely alphabetic, between 5 and
# 12 characters long, and is neither a stop word nor a negative word. Kept
# words are then reduced to plain ASCII: NFKD normalisation splits accented
# letters into base letter + combining mark, and the marks are dropped by the
# ASCII encode with errors='ignore'.
#
# Each line is stripped once, up front. Lines read from a file keep their
# trailing newline, so comparing the raw line against `stop_words` /
# `negative_words` could never match; the stripped word must be compared.
# The file is opened in a `with` block so the handle is closed after reading.
#
# NOTE(review): the final decode is assumed to be part of the original
# expression (it is needed to yield `str` values); 'ascii' is lossless here
# because the bytes were just ASCII-encoded -- confirm.
with open('/usr/share/dict/british-english') as word_file:
    dictionary : List[str] = [
        unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('ascii')
        for word in (line.strip() for line in word_file)
        if word.islower()
        if word.isalpha()
        if 5 <= len(word) <= 12
        if word not in stop_words
        if word not in negative_words
    ]
70 Some types that will be used throughout the library
74 def edit_distance(s: str, t: str) -> int:
85 [ edit_distance(s[1:], t) + 1
86 , edit_distance(s, t[1:]) + 1
87 , edit_distance(s[1:], t[1:]) + cost
94 # def find_neighbours(word: str, limit: int = 4) -> Tuple[str, List[str]]:
95 def find_neighbours(word, limit=4):
98 for other in dictionary:
101 if (not sword <= soth and
102 not soth <= sword and
103 edit_distance(word, other) <= limit):
# Run find_neighbours over every dictionary word using a pool of worker
# processes. Pool.map blocks until all results are in and returns them as a
# list in the same order as `dictionary`.
with multiprocessing.Pool() as pool:
    # Alternative kept for reference (streams results, arbitrary order):
    # word_other_pairs = pool.imap_unordered(find_neighbours, dictionary, chunksize=5000)
    word_other_pairs = pool.map(func=find_neighbours, iterable=dictionary)
# Persist the results as gzip-compressed text, one line per word:
#   word,neighbour1,neighbour2,...
# A word with no neighbours is written as "word," (empty remainder).
with gzip.open('dictionary_neighbours.txt.gz', 'wt') as out:
    out.writelines(
        f'{word},{",".join(related)}\n'
        for word, related in word_other_pairs
    )