Added edit distance to Label
[erd-marker.git] / lib / erd_handler / label.rb
# A text label that keeps its original string alongside a list of processed
# segments. The processing methods (#split, #downcase, #stem, #tidy) replace
# the segment list in place and return +self+ so that they can be chained.
class Label
  # Separators used by #split when no regexp is supplied.
  DEFAULT_SEPARATORS = /[\t\n _,.-]+/.freeze
  # Zero-width boundary between a lower-case letter and an upper-case letter.
  CAMEL_CASE_BOUNDARY = /(?<=[a-z])(?=[A-Z])/.freeze
  # Zero-width boundary at the start or the end of a run of digits.
  NUMBER_BOUNDARY = /(?:(?<!\d)(?=\d))|(?:(?<=\d)(?!\d))/.freeze
  private_constant :DEFAULT_SEPARATORS, :CAMEL_CASE_BOUNDARY, :NUMBER_BOUNDARY

  # @return [String] the string the label was created with
  # @return [Array<String>] the current list of processed segments
  attr_reader :original, :processed

  # @param original [String] the raw label text
  def initialize(original = "")
    @original = original
    @processed = [original]
  end

  # Splits every processed segment into smaller segments.
  #
  # @param opts [Regexp, Hash] either a separator regexp (all other options
  #   then take their defaults), or a hash with the keys
  #   +:regexp+ (separator pattern, defaults to whitespace, `_`, `,`, `.`, `-`),
  #   +:camel_case+ (split at lower-to-upper case boundaries, default true) and
  #   +:numbers+ (split digit runs from their neighbours, default true).
  # @return [Label] self
  def split(opts = {})
    if opts.is_a?(Regexp)
      regexp = opts
      split_camel_case = true
      # The Regexp shorthand uses the same defaults as the hash form.
      split_numbers = true
    else
      regexp = opts[:regexp] || DEFAULT_SEPARATORS
      split_camel_case = opts.fetch(:camel_case, true)
      split_numbers = opts.fetch(:numbers, true)
    end

    @processed = @processed.flat_map { |segment| segment.split(regexp) }
    if split_camel_case
      @processed = @processed.flat_map { |segment| segment.split(CAMEL_CASE_BOUNDARY) }
    end
    if split_numbers
      @processed = @processed.flat_map { |segment| segment.split(NUMBER_BOUNDARY) }
    end
    self
  end

  # Lower-cases every processed segment.
  #
  # @return [Label] self
  def downcase
    @processed = @processed.map(&:downcase)
    self
  end

  # Replaces every processed segment with the result of calling +stem+ on it.
  # NOTE(review): String#stem is not defined in this file; presumably it is
  # supplied by a stemming library loaded elsewhere -- confirm.
  #
  # @param gb_english [Boolean] passed straight through to String#stem
  # @return [Label] self
  def stem(gb_english = false)
    @processed = @processed.map { |segment| segment.stem(gb_english) }
    self
  end

  # Applies #split, #downcase and #stem, each with its default arguments.
  #
  # @return [Label] self
  def tidy
    split.downcase.stem
  end

  # Levenshtein (edit) distance between this label's processed segments,
  # joined into one string, and +other_object+.
  #
  # @param other_object [Label, String] a Label (its processed segments are
  #   joined) or a string-like object responding to +length+ and +[]+
  # @return [Integer] the number of single-character substitutions,
  #   deletions and additions separating the two strings
  def levenshtein(other_object)
    other = other_object.is_a?(Label) ? other_object.processed.join : other_object
    own_chars = @processed.join.chars
    n = own_chars.length
    m = other.length
    return m if n.zero?
    return n if m.zero?

    # Only the previous row of the distance table is needed to build the next
    # one. One row per character in other, one column per character in self.
    previous_row = (0..n).to_a
    (1..m).each do |i|
      other_char = other[i - 1]
      current_row = [i]
      (1..n).each do |j|
        cost = own_chars[j - 1] == other_char ? 0 : 1
        current_row << [
          previous_row[j - 1] + cost, # substitution
          previous_row[j] + 1,        # deletion
          current_row[j - 1] + 1      # addition
        ].min
      end
      previous_row = current_row
    end
    previous_row.last
  end

  alias edit_distance levenshtein
end