# Declares read-only accessor methods `original` and `processed`.
3 attr_reader
:original, :processed
# Constructor. `original` is the input value; it defaults to the empty string.
5 def initialize(original
= "")
# Start the processed form as a one-element array holding the whole input.
# NOTE(review): original line 6 is not visible in this extract; presumably it
# assigns @original (there is an attr_reader for it) -- confirm.
7 @processed = [original
]
11 if opts
.class == Regexp
13 split_camel_case
= true
15 regexp
= opts
[:regexp] || /[\t\n _,.-]+/
16 if opts
.has_key
? :camel_case
17 split_camel_case
= opts
[:camel_case]
19 split_camel_case
= true
21 if opts
.has_key
? :numbers
22 split_numbers
= opts
[:numbers]
27 @processed = @processed.map
do |segment
|
32 @processed = @processed.map
do |segment
|
33 segment
.split(/(?<=[a-z])(?=[A-Z])/)
38 @processed = @processed.map
do |segment
|
39 segment
.split(/(?:(?<!\d)(?=\d))|(?:(?<=\d)(?!\d))/)
46 @processed = @processed.map
do |segment
| segment
.downcase
end
# Replaces every element of @processed with the result of calling
# `stem(gb_english)` on that element.
#
# gb_english - passed through unchanged to each element's `stem`; defaults to false.
#
# NOTE(review): assumes the elements respond to `stem(flag)`; that method is not
# defined in this extract -- confirm which library supplies it.
# NOTE(review): the closing `end` of this method and its return value are not
# visible in this extract.
50 def stem(gb_english
= false)
51 @processed = @processed.map
do |segment
| segment
.stem(gb_english
) end
56 self.split
.downcase
.stem
# Edit-distance computation between this object's processed text and
# `other_object`, using a two-dimensional table `d`.
#
# other_object - when it is exactly a Label, its `processed` elements are joined
#                into a single string before comparison.
#
# NOTE(review): several original lines are missing from this extract: the branch
# taken when other_object is not a Label, the definitions of `m` and `n`, the
# loops that bind `i` and `j` for the recurrence, the reduction over the three
# candidates, and the return value. Confirm against the full file.
59 def levenshtein(other_object
)
60 if other_object
.class == Label
# Compare against the other label's processed elements, concatenated.
61 other
= other_object
.processed
.join('')
# This object's processed elements, concatenated.
65 s
= @processed.join('')
# Table of (m+1) rows by (n+1) columns, every cell initialised to 0.
# NOTE(review): presumably m = other.length and n = s.length -- confirm.
71 d
= Array
.new(m
+1) {Array
.new(n
+1, 0)} # one row for each character in other, one column for each character in self
# Base cases: fill row 0 with 0..n and column 0 with 0..m.
73 (0..n
).each
{|i
| d
[0][i
] = i
}
74 (0..m
).each
{|j
| d
[j
][0] = j
}
# Recurrence: each cell is derived from three candidates built from the
# diagonal, upper, and left neighbours. The diagonal candidate adds 0 when the
# corresponding characters of `s` and `other` are equal, otherwise 1.
# NOTE(review): the call that reduces the candidate array (presumably `.min`)
# is on a line not visible here.
77 d
[i
][j
] = [d
[i-1
][j-1
] + ((s
[j-1
] == other
[i-1
]) ? 0 : 1), # substitution
78 d
[i-1
][j
] + 1, # deletion
79 d
[i
][j-1
] + 1 # addition
# `edit_distance` is an alternative name for `levenshtein`.
86 alias :edit_distance :levenshtein