Renamed porter2_module to porter2_constants
[porter2stemmer.git] / lib / porter2.rb
1 # coding: utf-8
2
3 require 'porter2_constants'
4
5 # ==The Porter 2 stemmer
6 #
7 # This is the Porter 2 stemming algorithm, as described at
8 # http://snowball.tartarus.org/algorithms/english/stemmer.html
9 # The original paper is:
10 #
11 # Porter, 1980, "An algorithm for suffix stripping", _Program_, Vol. 14,
12 # no. 3, pp 130-137
13 #
14 # Constants for the stemmer are in the Porter2 module.
15 #
16 # Procedures that implement the stemmer are added to the String class.
17 #
18 # The stemmer algorithm is implemented in the porter2_stem procedure.
19 #
20 # ==Internationalisation
21 # There isn't much, as this is a stemmer that only works for English.
22 #
23 # The +gb_english+ flag to the various procedures allows the stemmer to treat the British
24 # English '-ise' the same as the American English '-ize'.
25 #
26 # ==Longest suffixes
27 # Several places in the algorithm require matching the longest suffix of a word. The
28 # regexp engine in Ruby 1.9 seems to handle alterntives in regexps by finding the
29 # alternative that matches at the first position in the string. As we're only talking
30 # about suffixes, that first match is also the longest suffix. If the regexp engine changes,
31 # this behaviour may change and break the stemmer.
32
33 class String
34 # Tidy up the word before we get down to the algorithm
35 def porter2_tidy
36 preword = self.to_s.strip.downcase
37
38 # map apostrophe-like characters to apostrophes
39 preword.gsub!(/‘/, "'")
40 preword.gsub!(/’/, "'")
41
42 preword
43 end
44
45
46 # Preprocess the word.
47 # Remove any initial ', if present. Then, set initial y, or y after a vowel, to Y
48 #
49 # (The comment to 'establish the regions R1 and R2' in the original description
50 # is an implementation optimisation that identifies where the regions start. As
51 # no modifications are made to the word that affect those positions, you may want
52 # to cache them now. This implementation doesn't do that.)
53 def porter2_preprocess
54 w = self.dup
55
56 # remove any initial apostrophe
57 w.gsub!(/^'*(.)/, '\1')
58
59 # set initial y, or y after a vowel, to Y
60 w.gsub!(/^y/, "Y")
61 w.gsub!(/(#{Porter2::V})y/, '\1Y')
62
63 w
64 end
65
66
67 # R1 is the portion of the word after the first non-vowel after the first vowel
68 # (with words beginning 'gener-', 'commun-', and 'arsen-' treated as special cases
69 def porter2_r1
70 if self =~ /^(gener|commun|arsen)(?<r1>.*)/
71 Regexp.last_match(:r1)
72 else
73 self =~ /#{Porter2::V}#{Porter2::C}(?<r1>.*)$/
74 Regexp.last_match(:r1) || ""
75 end
76 end
77
78
79 # R2 is the portion of R1 (porter2_r1) after the first non-vowel after the first vowel
80 def porter2_r2
81 self.porter2_r1 =~ /#{Porter2::V}#{Porter2::C}(?<r2>.*)$/
82 Regexp.last_match(:r2) || ""
83 end
84
85
86 # Returns true if the word ends with a short syllable
87 def porter2_ends_with_short_syllable?
88 self =~ /#{Porter2::SHORT_SYLLABLE}$/ ? true : false
89 end
90
91
92 # A word is short if it ends in a short syllable, and R1 is null
93 def porter2_is_short_word?
94 self.porter2_ends_with_short_syllable? and self.porter2_r1.empty?
95 end
96
97
98 # Search for the longest among the suffixes,
99 # * '
100 # * 's
101 # * 's'
102 # and remove if found.
103 def porter2_step0
104 self.sub!(/(.)('s'|'s|')$/, '\1') || self
105 end
106
107
108 # Search for the longest among the following suffixes, and perform the action indicated.
109 # sses:: replace by ss
110 # ied, ies:: replace by i if preceded by more than one letter, otherwise by ie
111 # s:: delete if the preceding word part contains a vowel not immediately before the s
112 # us, ss:: do nothing
113 def porter2_step1a
114 if self =~ /sses$/
115 self.sub(/sses$/, 'ss')
116 elsif self =~ /..(ied|ies)$/
117 self.sub(/(ied|ies)$/, 'i')
118 elsif self =~ /(ied|ies)$/
119 self.sub(/(ied|ies)$/, 'ie')
120 elsif self =~ /(us|ss)$/
121 self
122 elsif self =~ /s$/
123 if self =~ /(#{Porter2::V}.+)s$/
124 self.sub(/s$/, '')
125 else
126 self
127 end
128 else
129 self
130 end
131 end
132
133
134 # Search for the longest among the following suffixes, and perform the action indicated.
135 # eed, eedly:: replace by ee if the suffix is also in R1
136 # ed, edly, ing, ingly:: delete if the preceding word part contains a vowel and,
137 # after the deletion:
138 # * if the word ends at, bl or iz: add e, or
139 # * if the word ends with a double: remove the last letter, or
140 # * if the word is short: add e
141 #
142 # (If gb_english is +true+, treat the 'is' suffix as 'iz' above.)
143 def porter2_step1b(gb_english = false)
144 if self =~ /(eed|eedly)$/
145 if self.porter2_r1 =~ /(eed|eedly)$/
146 self.sub(/(eed|eedly)$/, 'ee')
147 else
148 self
149 end
150 else
151 w = self.dup
152 if w =~ /#{Porter2::V}.*(ed|edly|ing|ingly)$/
153 w.sub!(/(ed|edly|ing|ingly)$/, '')
154 if w =~ /(at|lb|iz)$/
155 w += 'e'
156 elsif w =~ /is$/ and gb_english
157 w += 'e'
158 elsif w =~ /#{Porter2::Double}$/
159 w.chop!
160 elsif w.porter2_is_short_word?
161 w += 'e'
162 end
163 end
164 w
165 end
166 end
167
168
169 # Replace a suffix of y or Y by i if it is preceded by a non-vowel which is
170 # not the first letter of the word.
171 def porter2_step1c
172 if self =~ /.+#{Porter2::C}(y|Y)$/
173 self.sub(/(y|Y)$/, 'i')
174 else
175 self
176 end
177 end
178
179
180 # Search for the longest among the suffixes listed in the keys of Porter2::STEP_2_MAPS.
181 # If one is found and that suffix occurs in R1, replace it with the value
182 # found in STEP_2_MAPS.
183 #
184 # (Suffixes 'ogi' and 'li' are treated as special cases in the procedure.)
185 #
186 # (If gb_english is +true+, replace the 'iser' and 'isation' suffixes with
187 # 'ise', similarly to how 'izer' and 'ization' are treated.)
188 def porter2_step2(gb_english = false)
189 r1 = self.porter2_r1
190 s2m = Porter2::STEP_2_MAPS.dup
191 if gb_english
192 s2m["iser"] = "ise"
193 s2m["isation"] = "ise"
194 end
195 step_2_re = Regexp.union(s2m.keys.map {|r| Regexp.new(r + "$")})
196 if self =~ step_2_re
197 if r1 =~ /#{$&}$/
198 self.sub(/#{$&}$/, s2m[$&])
199 else
200 self
201 end
202 elsif r1 =~ /li$/ and self =~ /(#{Porter2::Valid_LI})li$/
203 self.sub(/li$/, '')
204 elsif r1 =~ /ogi$/ and self =~ /logi$/
205 self.sub(/ogi$/, 'og')
206 else
207 self
208 end
209 end
210
211
212 # Search for the longest among the suffixes listed in the keys of Porter2::STEP_3_MAPS.
213 # If one is found and that suffix occurs in R1, replace it with the value
214 # found in STEP_3_MAPS.
215 #
216 # (Suffix 'ative' is treated as a special case in the procedure.)
217 #
218 # (If gb_english is +true+, replace the 'alise' suffix with
219 # 'al', similarly to how 'alize' is treated.)
220 def porter2_step3(gb_english = false)
221 if self =~ /ative$/ and self.porter2_r2 =~ /ative$/
222 self.sub(/ative$/, '')
223 else
224 s3m = Porter2::STEP_3_MAPS.dup
225 if gb_english
226 s3m["alise"] = "al"
227 end
228 step_3_re = Regexp.union(s3m.keys.map {|r| Regexp.new(r + "$")})
229 r1 = self.porter2_r1
230 if self =~ step_3_re and r1 =~ /#{$&}$/
231 self.sub(/#{$&}$/, s3m[$&])
232 else
233 self
234 end
235 end
236 end
237
238
239 # Search for the longest among the suffixes listed in the keys of Porter2::STEP_4_MAPS.
240 # If one is found and that suffix occurs in R2, replace it with the value
241 # found in STEP_4_MAPS.
242 #
243 # (Suffix 'ion' is treated as a special case in the procedure.)
244 #
245 # (If gb_english is +true+, delete the 'ise' suffix if found.)
246 def porter2_step4(gb_english = false)
247 if self.porter2_r2 =~ /ion$/ and self =~ /(s|t)ion$/
248 self.sub(/ion$/, '')
249 else
250 s4m = Porter2::STEP_4_MAPS.dup
251 if gb_english
252 s4m["ise"] = ""
253 end
254 step_4_re = Regexp.union(s4m.keys.map {|r| Regexp.new(r + "$")})
255 r2 = self.porter2_r2
256 if self =~ step_4_re
257 if r2 =~ /#{$&}/
258 self.sub(/#{$&}$/, s4m[$&])
259 else
260 self
261 end
262 else
263 self
264 end
265 end
266 end
267
268
269 # Search for the the following suffixes, and, if found, perform the action indicated.
270 # e:: delete if in R2, or in R1 and not preceded by a short syllable
271 # l:: delete if in R2 and preceded by l
272 def porter2_step5
273 if self =~ /ll$/ and self.porter2_r2 =~ /l$/
274 self.sub(/ll$/, 'l')
275 elsif self =~ /e$/ and self.porter2_r2 =~ /e$/
276 self.sub(/e$/, '')
277 else
278 r1 = self.porter2_r1
279 if self =~ /e$/ and r1 =~ /e$/ and not self =~ /#{Porter2::SHORT_SYLLABLE}e$/
280 self.sub(/e$/, '')
281 else
282 self
283 end
284 end
285 end
286
287
288 # Turn all Y letters into y
289 def porter2_postprocess
290 self.gsub(/Y/, 'y')
291 end
292
293 public
294
295 # Perform the stemming procedure. If +gb_english+ is true, treat '-ise' and similar suffixes
296 # as '-ize' in American English.
297 def porter2_stem(gb_english = false)
298 preword = self.porter2_tidy
299 return preword if preword.length <= 2
300
301 word = preword.porter2_preprocess
302
303 if Porter2::SPECIAL_CASES.has_key? word
304 Porter2::SPECIAL_CASES[word]
305 else
306 w1a = word.porter2_step0.porter2_step1a
307 if Porter2::STEP_1A_SPECIAL_CASES.include? w1a
308 w1a
309 else
310 w1a.porter2_step1b(gb_english).porter2_step1c.porter2_step2(gb_english).porter2_step3(gb_english).porter2_step4(gb_english).porter2_step5.porter2_postprocess
311 end
312 end
313 end
314
315 # A verbose version of porter2_stem that prints the output of each stage to STDOUT
316 def porter2_stem_verbose(gb_english = false)
317 preword = self.porter2_tidy
318 puts "Preword: #{preword}"
319 return preword if preword.length <= 2
320
321 word = preword.porter2_preprocess
322 puts "Preprocessed: #{word}"
323
324 if Porter2::SPECIAL_CASES.has_key? word
325 puts "Returning #{word} as special case #{Porter2::SPECIAL_CASES[word]}"
326 Porter2::SPECIAL_CASES[word]
327 else
328 r1 = word.porter2_r1
329 r2 = word.porter2_r2
330 puts "R1 = #{r1}, R2 = #{r2}"
331
332 w0 = word.porter2_step0 ; puts "After step 0: #{w0} (R1 = #{w0.porter2_r1}, R2 = #{w0.porter2_r2})"
333 w1a = w0.porter2_step1a ; puts "After step 1a: #{w1a} (R1 = #{w1a.porter2_r1}, R2 = #{w1a.porter2_r2})"
334
335 if Porter2::STEP_1A_SPECIAL_CASES.include? w1a
336 puts "Returning #{w1a} as 1a special case"
337 w1a
338 else
339 w1b = w1a.porter2_step1b(gb_english) ; puts "After step 1b: #{w1b} (R1 = #{w1b.porter2_r1}, R2 = #{w1b.porter2_r2})"
340 w1c = w1b.porter2_step1c ; puts "After step 1c: #{w1c} (R1 = #{w1c.porter2_r1}, R2 = #{w1c.porter2_r2})"
341 w2 = w1c.porter2_step2(gb_english) ; puts "After step 2: #{w2} (R1 = #{w2.porter2_r1}, R2 = #{w2.porter2_r2})"
342 w3 = w2.porter2_step3(gb_english) ; puts "After step 3: #{w3} (R1 = #{w3.porter2_r1}, R2 = #{w3.porter2_r2})"
343 w4 = w3.porter2_step4(gb_english) ; puts "After step 4: #{w4} (R1 = #{w4.porter2_r1}, R2 = #{w4.porter2_r2})"
344 w5 = w4.porter2_step5 ; puts "After step 5: #{w5}"
345 wpost = w5.porter2_postprocess ; puts "After postprocess: #{wpost}"
346 wpost
347 end
348 end
349 end
350
351 alias stem porter2_stem
352
353 end
354