Removed neighbour generation out of the core library
[riddle-generator.git] / riddle_dict_builder.md
1 ---
2 jupyter:
3 jupytext:
4 formats: ipynb,md,py:percent
5 text_representation:
6 extension: .md
7 format_name: markdown
8 format_version: '1.3'
9 jupytext_version: 1.14.5
10 kernelspec:
11 display_name: Python 3 (ipykernel)
12 language: python
13 name: python3
14 ---
15
16 # Generate a dictionary of related words
17
18 ```python
19 import unicodedata
20 import re
21 from dataclasses import dataclass
22 from typing import Dict, Tuple, List, Set
23 from enum import Enum, auto
24 import functools
25 import random
26 import multiprocessing
27 import gzip
28 # import csv
29 ```
30
31 ```python
# Filler words; the word list built later in this file tests against this set.
stop_words = {
    'my', 'is', 'in', 'within', 'lies', 'and', 'also', 'always',
    'you', 'will', 'find', 'the', 'found', 'but',
}
# Negation words; likewise tested against when the word list is built.
negative_words = {'not', 'never', 'neither', 'nor'}
34 ```
35
36 ```python
# Ordinal word -> position number; 'last' is represented by -1.
ordinals : Dict[str, int] = {
    'last': -1,
    'first': 1,
    'second': 2,
    'third': 3,
    'fourth': 4,
    'fifth': 5,
    'sixth': 6,
    'seventh': 7,
    'eighth': 8,
    'ninth': 9,
    'tenth': 10,
    'eleventh': 11,
    'twelfth': 12,
}

# Inverse lookup: position number -> ordinal word.
reverse_ordinals : Dict[int, str] = {
    position: name for name, position in ordinals.items()
}
53 ```
54
55 ```python
# Build the word list: lowercase, purely alphabetic words of 5 to 12 letters
# that are not stop words, negations or ordinals, with accented characters
# folded to plain ASCII.
#
# Each line is stripped once before any test is applied. The raw lines keep
# their trailing newline, so testing them unstripped against the exclusion
# sets could never match and words such as "within", "never" or "first"
# would slip through.
#
# The file is read inside a `with` block so the handle is closed promptly,
# and it is decoded explicitly as UTF-8 rather than via the locale default.
with open('/usr/share/dict/british-english', encoding='utf-8') as word_file:
    dictionary : List[str] = [
        unicodedata.normalize('NFKD', w).encode('ascii', 'ignore').decode('utf-8')
        for w in map(str.strip, word_file)
        if w.islower()
        if w.isalpha()
        if 5 <= len(w) <= 12
        if w not in stop_words
        if w not in negative_words
        if w not in ordinals
    ]
68 ```
69
Helper functions for measuring how similar two words are and for finding each word's neighbours
71
72 ```python
@functools.lru_cache
def edit_distance(s: str, t: str) -> int:
    """Return the Levenshtein edit distance between `s` and `t`.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions needed to turn one string into the other.

    This is computed iteratively, one row of the dynamic-programming table
    at a time, so it does not depend on the cache to avoid repeated work and
    does not recurse. The `lru_cache` decorator is kept only so that the
    function still exposes `cache_info()` / `cache_clear()` and repeated
    identical calls stay cheap.
    """
    if s == "":
        return len(t)
    if t == "":
        return len(s)

    # The distance is symmetric, so iterate over the longer string and keep
    # the rows as short as possible.
    if len(t) > len(s):
        s, t = t, s

    # previous[j] is the distance between the processed prefix of `s` and
    # the first j characters of `t`; initially the prefix of `s` is empty.
    previous = list(range(len(t) + 1))
    for i, s_char in enumerate(s, start=1):
        current = [i]
        for j, t_char in enumerate(t, start=1):
            cost = 0 if s_char == t_char else 1
            current.append(min(
                previous[j] + 1,         # delete s_char
                current[j - 1] + 1,      # insert t_char
                previous[j - 1] + cost,  # substitute (or match)
            ))
        previous = current

    return previous[-1]
91 ```
92
93 ```python
def find_neighbours(word: str, limit: int = 4) -> Tuple[str, List[str]]:
    """Return `word` paired with the dictionary words that are close to it.

    A word from the module-level `dictionary` counts as a neighbour when it
    differs from `word`, neither word's set of letters is a subset of the
    other's, and the edit distance between the two is at most `limit`.

    The result is a `(word, neighbours)` tuple, with the neighbours in
    dictionary order.
    """
    word_letters = set(word)
    neighbours = []
    for other in dictionary:
        if other == word:
            continue
        # The edit distance is never smaller than the difference in length,
        # so such pairs can be rejected without the expensive computation.
        if abs(len(other) - len(word)) > limit:
            continue
        other_letters = set(other)
        if word_letters <= other_letters or other_letters <= word_letters:
            continue
        if edit_distance(word, other) <= limit:
            neighbours.append(other)
    return word, neighbours
106 ```
107
108 ```python
# Find the neighbours of every dictionary word, spreading the calls to
# find_neighbours across a pool of worker processes.
with multiprocessing.Pool() as pool:
    # Alternative kept for reference (lazy, unordered results):
    # word_other_pairs = pool.imap_unordered(find_neighbours, dictionary, chunksize=5000)
    # pool.map waits for all results and returns them as a list in the same
    # order as `dictionary`, so the result is still usable after the pool closes.
    word_other_pairs = pool.map(find_neighbours, dictionary)
112 ```
113
114 ```python
# Write one line per word to a gzip-compressed text file: the word, a comma,
# then its neighbours joined by commas.
with gzip.open('dictionary_neighbours.txt.gz', 'wt') as out:
    out.writelines(
        f'{word},{",".join(related)}\n'
        for word, related in word_other_pairs
    )
118 ```
119
120 ```python
121
122 ```