4 "cell_type": "markdown",
7 "# Subsequence strings\n",
9 "Given two strings `a` and `b`, is `a` a subsequence of `b`?\n",
14 "b = \"dabaabcacb\",\n",
16 "return True : dAbAaBCaCb\n",
22 "execution_count": 33,
40 "s2t = \"dabaabcacb\"\n",
47 "cell_type": "markdown",
50 "`dp_table[i, j]` is True if first `i` characters of `s1` can be found in first `j` characters of `s2`."
106 " (3, 10): False,\n",
117 " (4, 10): False,\n",
131 "execution_count": 4,
133 "output_type": "execute_result"
137 "dp_table = {(i, j): False\n",
138 " for i in range(len(s1)+1)\n",
139 " for j in range(len(s2)+1)}\n",
145 "execution_count": 5,
151 "def show_table(table):\n",
152 " return '\\n'.join(\n",
153 " ' '.join(str(table[i, j])[0] for j in sorted(set([k[1] for k in table])))\n",
154 " for i in sorted(set([k[0] for k in table]))) "
159 "execution_count": 6,
164 "output_type": "stream",
166 "F F F F F F F F F F F\n",
167 "F F F F F F F F F F F\n",
168 "F F F F F F F F F F F\n",
169 "F F F F F F F F F F F\n",
170 "F F F F F F F F F F F\n",
171 "F F F F F F F F F F F\n"
176 "print(show_table(dp_table))"
181 "execution_count": 7,
186 "output_type": "stream",
221 "xx 4 4 a b False\n",
235 "{(1, 1): (0, 0, 'a', 's1'),\n",
236 " (1, 2): (0, 1, 'a', 's1'),\n",
237 " (1, 3): (0, 2, 'a', 's1'),\n",
238 " (1, 4): (1, 3, 'b', 's2'),\n",
239 " (1, 5): (0, 4, 'a', 's1'),\n",
240 " (1, 6): (1, 5, 'b', 's2'),\n",
241 " (1, 7): (1, 6, 'b', 's2'),\n",
242 " (2, 2): (1, 1, 'a', 's1'),\n",
243 " (2, 3): (1, 2, 'a', 's1'),\n",
244 " (2, 4): (2, 3, 'b', 's2'),\n",
245 " (2, 5): (1, 4, 'a', 's1'),\n",
246 " (2, 6): (2, 5, 'b', 's2'),\n",
247 " (2, 7): (2, 6, 'b', 's2'),\n",
248 " (3, 3): (2, 2, 'a', 's1'),\n",
249 " (3, 4): (3, 3, 'b', 's2'),\n",
250 " (3, 5): (2, 4, 'a', 's1'),\n",
251 " (3, 6): (3, 5, 'b', 's2'),\n",
252 " (3, 7): (3, 6, 'b', 's2'),\n",
253 " (4, 5): (3, 4, 'a', 's1'),\n",
254 " (4, 6): (4, 5, 'b', 's2'),\n",
255 " (4, 7): (4, 6, 'b', 's2')}"
258 "execution_count": 7,
260 "output_type": "execute_result"
271 "dp_table = {(i, j): False\n",
272 " for i in range(len(s1)+1)\n",
273 " for j in range(len(s2)+1)}\n",
275 "backpointers = {}\n",
277 "for i in range(len(s1)+1):\n",
278 " for j in range(i, len(s2)+1):\n",
279 " if i == 0 or j == 0:\n",
280 " dp_table[i, j] = True\n",
281 " print('aa', i, j, '!', '!', dp_table[i, j])\n",
283 " # extend by character from s2\n",
284 " if dp_table[i, j-1]:\n",
285 " dp_table[i, j] = True\n",
286 " backpointers[i, j] = (i, j-1, s2[j-1], 's2')\n",
287 " print('s2', i, j, s1[i-1], s2[j-1], dp_table[i, j]) \n",
288 " # extend by character from s1\n",
289 " if dp_table[i-1, j-1] and s1[i-1] == s2[j-1]:\n",
290 " dp_table[i, j] = True\n",
291 " backpointers[i, j] = (i-1, j-1, s1[i-1], 's1') \n",
292 " print('s1', i, j, s1[i-1], s2[j-1], dp_table[i, j])\n",
293 " if not dp_table[i, j]:\n",
294 " print('xx', i, j, s1[i-1], s2[j-1], dp_table[i, j])\n",
296 "print(show_table(dp_table))\n",
302 "execution_count": 8,
308 "def show_backtrace(bps):\n",
309 " i = max([0] + [k[0] for k in bps])\n",
310 " j = max([0] + [k[1] for k in bps])\n",
312 " if (i, j) in bps:\n",
313 " while i != 0 and j != 0:\n",
314 " if bps[i, j][3] == 's1':\n",
315 " chars += bps[i, j][2].upper()\n",
317 " chars += bps[i, j][2]\n",
318 " i, j = bps[i, j][0], bps[i, j][1] \n",
319 " return ''.join(list(reversed(chars)))\n",
326 "execution_count": 9,
335 "execution_count": 9,
337 "output_type": "execute_result"
341 "show_backtrace(backpointers)"
346 "execution_count": 11,
352 "def is_subseq(seq1, seq2, return_backpointers=False, return_table=False, debug=False):\n",
353 " \"\"\"Return true if seq1 is a subsequence of seq2.\n",
354 " If return_backpointers, also return the set of backpointers to\n",
355 " reconstruct the subsequence\"\"\"\n",
357 " # dp_table[i, j] is True if first i characters of seq1 can\n",
358 " # be found in the first j characters of seq2\n",
360 " dp_table = {(i, j): False\n",
361 " for i in range(len(seq1)+1)\n",
362 " for j in range(len(seq2)+1)}\n",
364 " backpointers = {}\n",
366 " for i in range(len(seq1)+1):\n",
367 " for j in range(i, len(seq2)+1):\n",
368 " if i == 0 or j == 0:\n",
369 " dp_table[i, j] = True\n",
370 " if debug: print('aa', i, j, '!', '!', dp_table[i, j])\n",
372 " # extend by character from s2\n",
373 " if dp_table[i, j-1]:\n",
374 " dp_table[i, j] = True\n",
375 " backpointers[i, j] = (i, j-1, seq2[j-1], 's2')\n",
376 " if debug: print('s2', i, j, seq1[i-1], seq2[j-1], dp_table[i, j]) \n",
377 " # extend by character from s1\n",
378 " if dp_table[i-1, j-1] and seq1[i-1] == seq2[j-1]:\n",
379 " dp_table[i, j] = True\n",
380 " backpointers[i, j] = (i-1, j-1, seq1[i-1], 's1') \n",
381 " if debug: print('s1', i, j, seq1[i-1], seq2[j-1], dp_table[i, j])\n",
382 " if not dp_table[i, j]:\n",
383 " if debug: print('xx', i, j, seq1[i-1], seq2[j-1], dp_table[i, j]) \n",
385 "# if return_backpointers:\n",
386 "# return dp_table[len(seq1), len(seq2)], backpointers\n",
388 "# return dp_table[len(seq1), len(seq2)]\n",
390 " if return_backpointers or return_table:\n",
391 " retval = [dp_table[len(seq1), len(seq2)]]\n",
392 " if return_backpointers:\n",
393 " retval += [backpointers]\n",
394 " if return_table:\n",
395 " retval += [dp_table]\n",
396 " return tuple(retval)\n",
398 " return dp_table[len(seq1), len(seq2)]"
403 "execution_count": 12,
410 "output_type": "stream",
422 "aa 0 10 ! ! True\n",
423 "xx 1 1 a d False\n",
435 "s2 1 10 a b True\n",
436 "xx 2 2 a a False\n",
437 "xx 2 3 a b False\n",
446 "s2 2 10 a b True\n",
447 "xx 3 3 a b False\n",
448 "xx 3 4 a a False\n",
455 "s2 3 10 a b True\n",
456 "xx 4 4 a a False\n",
457 "xx 4 5 a a False\n",
458 "xx 4 6 a b False\n",
459 "xx 4 7 a c False\n",
471 "execution_count": 12,
473 "output_type": "execute_result"
477 "is_subseq(s1, s2t, debug=True)"
482 "execution_count": 13,
488 "def show_annotated_table(table, bps):\n",
489 " return '\\n'.join(' '.join('.' if (i, j) not in bps else bps[i, j][2] if table[i, j] else '.' for j in sorted(set([k[1] for k in table])))\n",
490 " for i in sorted(set([k[0] for k in table])))"
495 "execution_count": 14,
504 "execution_count": 14,
506 "output_type": "execute_result"
510 "tf, bps, tb = is_subseq(s1, s2t, return_backpointers=True, return_table=True)\n",
516 "execution_count": 15,
525 "execution_count": 15,
527 "output_type": "execute_result"
531 "show_backtrace(bps)"
536 "execution_count": 16,
541 "output_type": "stream",
543 "T T T T T T T T T T T\n",
544 "F F T T T T T T T T T\n",
545 "F F F F T T T T T T T\n",
546 "F F F F F T T T T T T\n",
547 "F F F F F F F F T T T\n"
552 "print(show_table(tb))"
557 "execution_count": 17,
562 "output_type": "stream",
564 ". . . . . . . . . . .\n",
565 ". . a b a a b c a c b\n",
566 ". . . . a a b c a c b\n",
567 ". . . . . a b c a c b\n",
568 ". . . . . . . . a c b\n"
573 "print(show_annotated_table(tb, bps))"
578 "execution_count": 18,
587 "execution_count": 18,
589 "output_type": "execute_result"
593 "len(show_backtrace(bps)) == len(s2t)"
598 "execution_count": 19,
604 "('aaaa', 'dabaabcacb')"
607 "execution_count": 19,
609 "output_type": "execute_result"
618 "execution_count": 20,
624 "{(1, 2): (0, 1, 'a', 's1'),\n",
625 " (1, 3): (1, 2, 'b', 's2'),\n",
626 " (1, 4): (0, 3, 'a', 's1'),\n",
627 " (1, 5): (0, 4, 'a', 's1'),\n",
628 " (1, 6): (1, 5, 'b', 's2'),\n",
629 " (1, 7): (1, 6, 'c', 's2'),\n",
630 " (1, 8): (0, 7, 'a', 's1'),\n",
631 " (1, 9): (1, 8, 'c', 's2'),\n",
632 " (1, 10): (1, 9, 'b', 's2'),\n",
633 " (2, 4): (1, 3, 'a', 's1'),\n",
634 " (2, 5): (1, 4, 'a', 's1'),\n",
635 " (2, 6): (2, 5, 'b', 's2'),\n",
636 " (2, 7): (2, 6, 'c', 's2'),\n",
637 " (2, 8): (1, 7, 'a', 's1'),\n",
638 " (2, 9): (2, 8, 'c', 's2'),\n",
639 " (2, 10): (2, 9, 'b', 's2'),\n",
640 " (3, 5): (2, 4, 'a', 's1'),\n",
641 " (3, 6): (3, 5, 'b', 's2'),\n",
642 " (3, 7): (3, 6, 'c', 's2'),\n",
643 " (3, 8): (2, 7, 'a', 's1'),\n",
644 " (3, 9): (3, 8, 'c', 's2'),\n",
645 " (3, 10): (3, 9, 'b', 's2'),\n",
646 " (4, 8): (3, 7, 'a', 's1'),\n",
647 " (4, 9): (4, 8, 'c', 's2'),\n",
648 " (4, 10): (4, 9, 'b', 's2')}"
651 "execution_count": 20,
653 "output_type": "execute_result"
662 "execution_count": 21,
671 "execution_count": 21,
673 "output_type": "execute_result"
682 "execution_count": 22,
689 "output_type": "stream",
723 "xx 4 4 a b False\n",
733 " {(1, 1): (0, 0, 'a', 's1'),\n",
734 " (1, 2): (0, 1, 'a', 's1'),\n",
735 " (1, 3): (0, 2, 'a', 's1'),\n",
736 " (1, 4): (1, 3, 'b', 's2'),\n",
737 " (1, 5): (0, 4, 'a', 's1'),\n",
738 " (1, 6): (1, 5, 'b', 's2'),\n",
739 " (1, 7): (1, 6, 'b', 's2'),\n",
740 " (2, 2): (1, 1, 'a', 's1'),\n",
741 " (2, 3): (1, 2, 'a', 's1'),\n",
742 " (2, 4): (2, 3, 'b', 's2'),\n",
743 " (2, 5): (1, 4, 'a', 's1'),\n",
744 " (2, 6): (2, 5, 'b', 's2'),\n",
745 " (2, 7): (2, 6, 'b', 's2'),\n",
746 " (3, 3): (2, 2, 'a', 's1'),\n",
747 " (3, 4): (3, 3, 'b', 's2'),\n",
748 " (3, 5): (2, 4, 'a', 's1'),\n",
749 " (3, 6): (3, 5, 'b', 's2'),\n",
750 " (3, 7): (3, 6, 'b', 's2'),\n",
751 " (4, 5): (3, 4, 'a', 's1'),\n",
752 " (4, 6): (4, 5, 'b', 's2'),\n",
753 " (4, 7): (4, 6, 'b', 's2')})"
756 "execution_count": 22,
758 "output_type": "execute_result"
762 "is_subseq('aaaa', 'aaababb', return_backpointers=True, debug=True)"
767 "execution_count": 27,
773 "def is_subseq_recursive(s1, s2):\n",
776 " elif len(s1) > len(s2):\n",
779 " if s1[-1] == s2[-1]:\n",
780 " return is_subseq_recursive(s1[:-1], s2[:-1]) or is_subseq_recursive(s1, s2[:-1])\n",
782 " return is_subseq_recursive(s1, s2[:-1])"
787 "execution_count": 29,
796 "execution_count": 29,
798 "output_type": "execute_result"
802 "is_subseq_recursive(s1, s2t)"
807 "execution_count": 28,
816 "execution_count": 28,
818 "output_type": "execute_result"
822 "is_subseq_recursive(s1, s2f)"
827 "execution_count": 30,
833 "def make_string(length, alphabet=None):\n",
834 " if not alphabet:\n",
835 " alphabet = 'abcdefgh'\n",
836 " return ''.join(random.choice(alphabet) for _ in range(length)) "
841 "execution_count": 31,
847 "def interleave(s1, s2, wander_limit=10, debug=False):\n",
848 " i1 = i2 = wander = 0\n",
849 " interleaved = []\n",
850 " while i1 <= len(s1) and i2 <= len(s2):\n",
851 " if i1 == len(s1):\n",
852 " if debug: print(i1, i2, wander, 'remaining s2', s2[i2:])\n",
853 " interleaved += s2[i2:]\n",
854 " i2 = len(s2) + 1\n",
855 " elif i2 == len(s2):\n",
856 " if debug: print(i1, i2, wander, 'remaining s1', s1[i1:])\n",
857 " interleaved += s1[i1:]\n",
858 " i1 = len(s1) + 1\n",
860 " if wander == wander_limit:\n",
862 " elif wander == -wander_limit:\n",
865 " step = random.choice([+1, -1])\n",
867 " if debug: print(i1, i2, wander, 'adding', s1[i1])\n",
868 " interleaved += s1[i1]\n",
872 " if debug: print(i1, i2, wander, 'adding', s2[i2])\n",
873 " interleaved += s2[i2]\n",
876 " return ''.join(interleaved)"
881 "execution_count": 40,
887 "sl1 = make_string(200)\n",
888 "sl2 = make_string(200)\n",
889 "sl3 = make_string(200)\n",
890 "sl12 = interleave(sl1, sl2)\n",
891 "sl23 = interleave(sl2, sl3)"
896 "execution_count": 41,
905 "execution_count": 41,
907 "output_type": "execute_result"
911 "is_subseq(sl1, sl12), is_subseq(sl1, sl23)"
916 "execution_count": null,
920 "is_subseq_recursive(sl1, sl12), is_subseq_recursive(sl1, sl23)"
925 "execution_count": null,
935 "display_name": "Python 3",
936 "language": "python",
944 "file_extension": ".py",
945 "mimetype": "text/x-python",
947 "nbconvert_exporter": "python",
948 "pygments_lexer": "ipython3",