c945df8ef101137430c3218aa4e65af10e8bcf0c
[ou-summer-of-code-2017.git] / 09-resolving-the-bill / subsequence.ipynb
1 {
2 "cells": [
3 {
4 "cell_type": "markdown",
5 "metadata": {},
6 "source": [
7 "# Subsequence strings\n",
8 "\n",
9 "Given two strings `a` and `b`, is `a` a subsequence of `b`?\n",
10 "\n",
11 "For example,\n",
12 "Given:\n",
13 "a = \"aabcc\",\n",
14 "b = \"dabaabcacb\",\n",
15 "\n",
16 "return True : dAbAaBCaCb\n",
17 "\n"
18 ]
19 },
20 {
21 "cell_type": "code",
22 "execution_count": 54,
23 "metadata": {
24 "collapsed": true
25 },
26 "outputs": [],
27 "source": [
28 "s1 = \"aabcc\"\n",
29 "s2t = \"dabaabcacb\"\n",
30 "s2f = \"dabacc\"\n",
31 "\n",
32 "s2 = s2t"
33 ]
34 },
35 {
36 "cell_type": "markdown",
37 "metadata": {},
38 "source": [
39 "`dp_table[i, j]` is True if first `i` characters of `s1` can be found in first `j` characters of `s2`."
40 ]
41 },
42 {
43 "cell_type": "code",
44 "execution_count": 4,
45 "metadata": {
46 "scrolled": true
47 },
48 "outputs": [
49 {
50 "data": {
51 "text/plain": [
52 "{(0, 0): False,\n",
53 " (0, 1): False,\n",
54 " (0, 2): False,\n",
55 " (0, 3): False,\n",
56 " (0, 4): False,\n",
57 " (0, 5): False,\n",
58 " (0, 6): False,\n",
59 " (0, 7): False,\n",
60 " (0, 8): False,\n",
61 " (0, 9): False,\n",
62 " (0, 10): False,\n",
63 " (1, 0): False,\n",
64 " (1, 1): False,\n",
65 " (1, 2): False,\n",
66 " (1, 3): False,\n",
67 " (1, 4): False,\n",
68 " (1, 5): False,\n",
69 " (1, 6): False,\n",
70 " (1, 7): False,\n",
71 " (1, 8): False,\n",
72 " (1, 9): False,\n",
73 " (1, 10): False,\n",
74 " (2, 0): False,\n",
75 " (2, 1): False,\n",
76 " (2, 2): False,\n",
77 " (2, 3): False,\n",
78 " (2, 4): False,\n",
79 " (2, 5): False,\n",
80 " (2, 6): False,\n",
81 " (2, 7): False,\n",
82 " (2, 8): False,\n",
83 " (2, 9): False,\n",
84 " (2, 10): False,\n",
85 " (3, 0): False,\n",
86 " (3, 1): False,\n",
87 " (3, 2): False,\n",
88 " (3, 3): False,\n",
89 " (3, 4): False,\n",
90 " (3, 5): False,\n",
91 " (3, 6): False,\n",
92 " (3, 7): False,\n",
93 " (3, 8): False,\n",
94 " (3, 9): False,\n",
95 " (3, 10): False,\n",
96 " (4, 0): False,\n",
97 " (4, 1): False,\n",
98 " (4, 2): False,\n",
99 " (4, 3): False,\n",
100 " (4, 4): False,\n",
101 " (4, 5): False,\n",
102 " (4, 6): False,\n",
103 " (4, 7): False,\n",
104 " (4, 8): False,\n",
105 " (4, 9): False,\n",
106 " (4, 10): False,\n",
107 " (5, 0): False,\n",
108 " (5, 1): False,\n",
109 " (5, 2): False,\n",
110 " (5, 3): False,\n",
111 " (5, 4): False,\n",
112 " (5, 5): False,\n",
113 " (5, 6): False,\n",
114 " (5, 7): False,\n",
115 " (5, 8): False,\n",
116 " (5, 9): False,\n",
117 " (5, 10): False}"
118 ]
119 },
120 "execution_count": 4,
121 "metadata": {},
122 "output_type": "execute_result"
123 }
124 ],
125 "source": [
126 "dp_table = {(i, j): False\n",
127 " for i in range(len(s1)+1)\n",
128 " for j in range(len(s2)+1)}\n",
129 "dp_table"
130 ]
131 },
132 {
133 "cell_type": "code",
134 "execution_count": 5,
135 "metadata": {
136 "collapsed": true
137 },
138 "outputs": [],
139 "source": [
140 "def show_table(table):\n",
141 " return '\\n'.join(\n",
142 " ' '.join(str(table[i, j])[0] for j in sorted(set([k[1] for k in table])))\n",
143 " for i in sorted(set([k[0] for k in table]))) "
144 ]
145 },
146 {
147 "cell_type": "code",
148 "execution_count": 6,
149 "metadata": {},
150 "outputs": [
151 {
152 "name": "stdout",
153 "output_type": "stream",
154 "text": [
155 "F F F F F F F F F F F\n",
156 "F F F F F F F F F F F\n",
157 "F F F F F F F F F F F\n",
158 "F F F F F F F F F F F\n",
159 "F F F F F F F F F F F\n",
160 "F F F F F F F F F F F\n"
161 ]
162 }
163 ],
164 "source": [
165 "print(show_table(dp_table))"
166 ]
167 },
168 {
169 "cell_type": "code",
170 "execution_count": 47,
171 "metadata": {},
172 "outputs": [
173 {
174 "name": "stdout",
175 "output_type": "stream",
176 "text": [
177 "aaaa aaababb\n",
178 "aa 0 0 ! ! True\n",
179 "aa 0 1 ! ! True\n",
180 "aa 0 2 ! ! True\n",
181 "aa 0 3 ! ! True\n",
182 "aa 0 4 ! ! True\n",
183 "aa 0 5 ! ! True\n",
184 "aa 0 6 ! ! True\n",
185 "aa 0 7 ! ! True\n",
186 "s1 1 1 a a True\n",
187 "s2 1 2 a a True\n",
188 "s1 1 2 a a True\n",
189 "s2 1 3 a a True\n",
190 "s1 1 3 a a True\n",
191 "s2 1 4 a b True\n",
192 "s2 1 5 a a True\n",
193 "s1 1 5 a a True\n",
194 "s2 1 6 a b True\n",
195 "s2 1 7 a b True\n",
196 "s1 2 2 a a True\n",
197 "s2 2 3 a a True\n",
198 "s1 2 3 a a True\n",
199 "s2 2 4 a b True\n",
200 "s2 2 5 a a True\n",
201 "s1 2 5 a a True\n",
202 "s2 2 6 a b True\n",
203 "s2 2 7 a b True\n",
204 "s1 3 3 a a True\n",
205 "s2 3 4 a b True\n",
206 "s2 3 5 a a True\n",
207 "s1 3 5 a a True\n",
208 "s2 3 6 a b True\n",
209 "s2 3 7 a b True\n",
210 "xx 4 4 a b False\n",
211 "s1 4 5 a a True\n",
212 "s2 4 6 a b True\n",
213 "s2 4 7 a b True\n",
214 "T T T T T T T T\n",
215 "F T T T T T T T\n",
216 "F F T T T T T T\n",
217 "F F F T T T T T\n",
218 "F F F F F T T T\n"
219 ]
220 },
221 {
222 "data": {
223 "text/plain": [
224 "{(1, 1): (0, 0, 'a', 's1'),\n",
225 " (1, 2): (0, 1, 'a', 's1'),\n",
226 " (1, 3): (0, 2, 'a', 's1'),\n",
227 " (1, 4): (1, 3, 'b', 's2'),\n",
228 " (1, 5): (0, 4, 'a', 's1'),\n",
229 " (1, 6): (1, 5, 'b', 's2'),\n",
230 " (1, 7): (1, 6, 'b', 's2'),\n",
231 " (2, 2): (1, 1, 'a', 's1'),\n",
232 " (2, 3): (1, 2, 'a', 's1'),\n",
233 " (2, 4): (2, 3, 'b', 's2'),\n",
234 " (2, 5): (1, 4, 'a', 's1'),\n",
235 " (2, 6): (2, 5, 'b', 's2'),\n",
236 " (2, 7): (2, 6, 'b', 's2'),\n",
237 " (3, 3): (2, 2, 'a', 's1'),\n",
238 " (3, 4): (3, 3, 'b', 's2'),\n",
239 " (3, 5): (2, 4, 'a', 's1'),\n",
240 " (3, 6): (3, 5, 'b', 's2'),\n",
241 " (3, 7): (3, 6, 'b', 's2'),\n",
242 " (4, 5): (3, 4, 'a', 's1'),\n",
243 " (4, 6): (4, 5, 'b', 's2'),\n",
244 " (4, 7): (4, 6, 'b', 's2')}"
245 ]
246 },
247 "execution_count": 47,
248 "metadata": {},
249 "output_type": "execute_result"
250 }
251 ],
252 "source": [
253 "s2 = s2f\n",
254 "\n",
255 "s1 = 'aaaa'\n",
256 "s2 = 'aaababb'\n",
257 "\n",
258 "print(s1, s2)\n",
259 "\n",
260 "dp_table = {(i, j): False\n",
261 " for i in range(len(s1)+1)\n",
262 " for j in range(len(s2)+1)}\n",
263 "\n",
264 "backpointers = {}\n",
265 "\n",
266 "for i in range(len(s1)+1):\n",
267 " for j in range(i, len(s2)+1):\n",
268 " if i == 0 or j == 0:\n",
269 " dp_table[i, j] = True\n",
270 " print('aa', i, j, '!', '!', dp_table[i, j])\n",
271 " else:\n",
272 " # extend by character from s2\n",
273 " if dp_table[i, j-1]:\n",
274 " dp_table[i, j] = True\n",
275 " backpointers[i, j] = (i, j-1, s2[j-1], 's2')\n",
276 " print('s2', i, j, s1[i-1], s2[j-1], dp_table[i, j]) \n",
277 " # extend by character from s1\n",
278 " if dp_table[i-1, j-1] and s1[i-1] == s2[j-1]:\n",
279 " dp_table[i, j] = True\n",
280 " backpointers[i, j] = (i-1, j-1, s1[i-1], 's1') \n",
281 " print('s1', i, j, s1[i-1], s2[j-1], dp_table[i, j])\n",
282 " if not dp_table[i, j]:\n",
283 " print('xx', i, j, s1[i-1], s2[j-1], dp_table[i, j])\n",
284 "\n",
285 "print(show_table(dp_table))\n",
286 "backpointers"
287 ]
288 },
289 {
290 "cell_type": "code",
291 "execution_count": 59,
292 "metadata": {
293 "collapsed": true
294 },
295 "outputs": [],
296 "source": [
297 "def show_backtrace(bps):\n",
298 " i = max([0] + [k[0] for k in bps])\n",
299 " j = max([0] + [k[1] for k in bps])\n",
300 " chars = ''\n",
301 " if (i, j) in bps:\n",
302 " while i != 0 or j != 0:\n",
303 " if bps[i, j][3] == 's1':\n",
304 " chars += bps[i, j][2].upper()\n",
305 " else:\n",
306 " chars += bps[i, j][2]\n",
307 " i, j = bps[i, j][0], bps[i, j][1] \n",
308 " return ''.join(list(reversed(chars)))\n",
309 " else:\n",
310 " return ''"
311 ]
312 },
313 {
314 "cell_type": "code",
315 "execution_count": 60,
316 "metadata": {},
317 "outputs": [
318 {
319 "data": {
320 "text/plain": [
321 "'AAAbAbb'"
322 ]
323 },
324 "execution_count": 60,
325 "metadata": {},
326 "output_type": "execute_result"
327 }
328 ],
329 "source": [
330 "show_backtrace(backpointers)"
331 ]
332 },
333 {
334 "cell_type": "code",
335 "execution_count": 49,
336 "metadata": {},
337 "outputs": [],
338 "source": [
339 "def is_subseq(seq1, seq2, return_backpointers=False, debug=False):\n",
340 " \"\"\"Return true if seq1 is a subsequence of seq2.\n",
341 " If return_backpointers, also return the set of backpointers to\n",
342 " reconstruct the subsequence\"\"\"\n",
343 " \n",
344 " # dp_table[i, j] is True if first i characters of seq1 can\n",
345 " # be found in the first j characters of seq2\n",
346 " \n",
347 " dp_table = {(i, j): False\n",
348 " for i in range(len(seq1)+1)\n",
349 " for j in range(len(seq2)+1)}\n",
350 "\n",
351 " backpointers = {}\n",
352 " \n",
353 " for i in range(len(seq1)+1):\n",
354 " for j in range(i, len(seq2)+1):\n",
355 " if i == 0 or j == 0:\n",
356 " dp_table[i, j] = True\n",
357 " if debug: print('aa', i, j, '!', '!', dp_table[i, j])\n",
358 " else:\n",
359 " # extend by character from s2\n",
360 " if dp_table[i, j-1]:\n",
361 " dp_table[i, j] = True\n",
362 " backpointers[i, j] = (i, j-1, seq2[j-1], 's2')\n",
363 " if debug: print('s2', i, j, seq1[i-1], seq2[j-1], dp_table[i, j]) \n",
364 " # extend by character from s1\n",
365 " if dp_table[i-1, j-1] and seq1[i-1] == seq2[j-1]:\n",
366 " dp_table[i, j] = True\n",
367 " backpointers[i, j] = (i-1, j-1, seq1[i-1], 's1') \n",
368 " if debug: print('s1', i, j, seq1[i-1], seq2[j-1], dp_table[i, j])\n",
369 " if not dp_table[i, j]:\n",
370 " if debug: print('xx', i, j, seq1[i-1], seq2[j-1], dp_table[i, j]) \n",
371 " \n",
372 " if return_backpointers:\n",
373 " return dp_table[len(seq1), len(seq2)], backpointers\n",
374 " else:\n",
375 " return dp_table[len(seq1), len(seq2)]"
376 ]
377 },
378 {
379 "cell_type": "code",
380 "execution_count": 39,
381 "metadata": {},
382 "outputs": [
383 {
384 "data": {
385 "text/plain": [
386 "True"
387 ]
388 },
389 "execution_count": 39,
390 "metadata": {},
391 "output_type": "execute_result"
392 }
393 ],
394 "source": [
395 "is_subseq(s1, s2t)"
396 ]
397 },
398 {
399 "cell_type": "code",
400 "execution_count": 55,
401 "metadata": {},
402 "outputs": [
403 {
404 "data": {
405 "text/plain": [
406 "True"
407 ]
408 },
409 "execution_count": 55,
410 "metadata": {},
411 "output_type": "execute_result"
412 }
413 ],
414 "source": [
415 "tf, bps = is_subseq(s1, s2t, return_backpointers=True)\n",
416 "tf"
417 ]
418 },
419 {
420 "cell_type": "code",
421 "execution_count": 56,
422 "metadata": {},
423 "outputs": [
424 {
425 "data": {
426 "text/plain": [
427 "'AABCaCb'"
428 ]
429 },
430 "execution_count": 56,
431 "metadata": {},
432 "output_type": "execute_result"
433 }
434 ],
435 "source": [
436 "show_backtrace(bps)"
437 ]
438 },
439 {
440 "cell_type": "code",
441 "execution_count": 42,
442 "metadata": {},
443 "outputs": [
444 {
445 "data": {
446 "text/plain": [
447 "False"
448 ]
449 },
450 "execution_count": 42,
451 "metadata": {},
452 "output_type": "execute_result"
453 }
454 ],
455 "source": [
456 "is_subseq(s1, s2f)"
457 ]
458 },
459 {
460 "cell_type": "code",
461 "execution_count": 43,
462 "metadata": {},
463 "outputs": [
464 {
465 "name": "stdout",
466 "output_type": "stream",
467 "text": [
468 "aa 0 0 ! ! True\n",
469 "aa 0 1 ! ! True\n",
470 "aa 0 2 ! ! True\n",
471 "aa 0 3 ! ! True\n",
472 "aa 0 4 ! ! True\n",
473 "aa 0 5 ! ! True\n",
474 "aa 0 6 ! ! True\n",
475 "aa 0 7 ! ! True\n",
476 "s1 1 1 a a True\n",
477 "s2 1 2 a a True\n",
478 "s1 1 2 a a True\n",
479 "s2 1 3 a a True\n",
480 "s1 1 3 a a True\n",
481 "s2 1 4 a b True\n",
482 "s2 1 5 a a True\n",
483 "s1 1 5 a a True\n",
484 "s2 1 6 a b True\n",
485 "s2 1 7 a b True\n",
486 "s1 2 2 a a True\n",
487 "s2 2 3 a a True\n",
488 "s1 2 3 a a True\n",
489 "s2 2 4 a b True\n",
490 "s2 2 5 a a True\n",
491 "s1 2 5 a a True\n",
492 "s2 2 6 a b True\n",
493 "s2 2 7 a b True\n",
494 "s1 3 3 a a True\n",
495 "s2 3 4 a b True\n",
496 "s2 3 5 a a True\n",
497 "s1 3 5 a a True\n",
498 "s2 3 6 a b True\n",
499 "s2 3 7 a b True\n",
500 "xx 4 4 a b False\n",
501 "s1 4 5 a a True\n",
502 "s2 4 6 a b True\n",
503 "s2 4 7 a b True\n"
504 ]
505 },
506 {
507 "data": {
508 "text/plain": [
509 "(True,\n",
510 " {(1, 1): (0, 1, 'a', 's1'),\n",
511 " (1, 2): (0, 2, 'a', 's1'),\n",
512 " (1, 3): (0, 3, 'a', 's1'),\n",
513 " (1, 4): (1, 3, 'b', 's2'),\n",
514 " (1, 5): (0, 5, 'a', 's1'),\n",
515 " (1, 6): (1, 5, 'b', 's2'),\n",
516 " (1, 7): (1, 6, 'b', 's2'),\n",
517 " (2, 2): (1, 2, 'a', 's1'),\n",
518 " (2, 3): (1, 3, 'a', 's1'),\n",
519 " (2, 4): (2, 3, 'b', 's2'),\n",
520 " (2, 5): (1, 5, 'a', 's1'),\n",
521 " (2, 6): (2, 5, 'b', 's2'),\n",
522 " (2, 7): (2, 6, 'b', 's2'),\n",
523 " (3, 3): (2, 3, 'a', 's1'),\n",
524 " (3, 4): (3, 3, 'b', 's2'),\n",
525 " (3, 5): (2, 5, 'a', 's1'),\n",
526 " (3, 6): (3, 5, 'b', 's2'),\n",
527 " (3, 7): (3, 6, 'b', 's2'),\n",
528 " (4, 5): (3, 5, 'a', 's1'),\n",
529 " (4, 6): (4, 5, 'b', 's2'),\n",
530 " (4, 7): (4, 6, 'b', 's2')})"
531 ]
532 },
533 "execution_count": 43,
534 "metadata": {},
535 "output_type": "execute_result"
536 }
537 ],
538 "source": [
539 "is_subseq('aaaa', 'aaababb', return_backpointers=True, debug=True)"
540 ]
541 },
542 {
543 "cell_type": "code",
544 "execution_count": null,
545 "metadata": {
546 "collapsed": true
547 },
548 "outputs": [],
549 "source": []
550 }
551 ],
552 "metadata": {
553 "kernelspec": {
554 "display_name": "Python 3",
555 "language": "python",
556 "name": "python3"
557 },
558 "language_info": {
559 "codemirror_mode": {
560 "name": "ipython",
561 "version": 3
562 },
563 "file_extension": ".py",
564 "mimetype": "text/x-python",
565 "name": "python",
566 "nbconvert_exporter": "python",
567 "pygments_lexer": "ipython3",
568 "version": "3.5.2+"
569 }
570 },
571 "nbformat": 4,
572 "nbformat_minor": 1
573 }