2236b12272ad81b2db73a637176d693406b0c761
[porter2stemmer.git] / lib / porter2.rb
1 # coding: utf-8
2
3 # Porter 2 stemmer in Ruby.
4 #
5 # This is the Porter 2 stemming algorithm, as described at
6 # http://snowball.tartarus.org/algorithms/english/stemmer.html
7 # The original paper is:
8 #
9 # Porter, 1980, "An algorithm for suffix stripping", _Program_, Vol. 14,
10 # no. 3, pp 130-137
11
# Porter 2 (Snowball English) stemming, written as a mixin.
#
# Every method operates on +self+ with String methods (+sub+, +=~+, +dup+ ...),
# so the module is meant to be included into String. The individual steps
# return the transformed word and leave the receiver untouched; the public
# entry point is #porter2_stem (aliased as #stem).
module Stemmable
  # A non-vowel
  C = "[^aeiouy]"

  # A vowel
  V = "[aeiouy]"

  # A non-vowel other than w, x, or Y
  CW = "[^aeiouywxY]"

  # Doubles created when adding a suffix: these are undoubled when stemmed
  Double = "(bb|dd|ff|gg|mm|nn|pp|rr|tt)"

  # A valid letter that can come before 'li'
  Valid_LI = "[cdeghkmnrt]"

  # A specification for a short syllable
  SHORT_SYLLABLE = "((#{C}#{V}#{CW})|(^#{V}#{C}))"

  # Suffix transformations used in Step 2.
  # (ogi, li endings dealt with in procedure)
  STEP_2_MAPS = {"tional" => "tion",
                 "enci" => "ence",
                 "anci" => "ance",
                 "abli" => "able",
                 "entli" => "ent",
                 "ization" => "ize",
                 "izer" => "ize",
                 "ational" => "ate",
                 "ation" => "ate",
                 "ator" => "ate",
                 "alism" => "al",
                 "aliti" => "al",
                 "alli" => "al",
                 "fulness" => "ful",
                 "ousli" => "ous",
                 "ousness" => "ous",
                 "iveness" => "ive",
                 "iviti" => "ive",
                 "biliti" => "ble",
                 "bli" => "ble",
                 "fulli" => "ful",
                 "lessli" => "less" }

  # Suffix transformations used in Step 3.
  # (ative ending dealt with in procedure)
  STEP_3_MAPS = {"tional" => "tion",
                 "ational" => "ate",
                 "alize" => "al",
                 "icate" => "ic",
                 "iciti" => "ic",
                 "ical" => "ic",
                 "ful" => "",
                 "ness" => "" }

  # Suffix transformations used in Step 4.
  STEP_4_MAPS = {"al" => "",
                 "ance" => "",
                 "ence" => "",
                 "er" => "",
                 "ic" => "",
                 "able" => "",
                 "ible" => "",
                 "ant" => "",
                 "ement" => "",
                 "ment" => "",
                 "ent" => "",
                 "ism" => "",
                 "ate" => "",
                 "iti" => "",
                 "ous" => "",
                 "ive" => "",
                 "ize" => "" }

  # Special-case stemmings
  SPECIAL_CASES = {"skis" => "ski",
                   "skies" => "sky",

                   "dying" => "die",
                   "lying" => "lie",
                   "tying" => "tie",
                   "idly" => "idl",
                   "gently" => "gentl",
                   "ugly" => "ugli",
                   "early" => "earli",
                   "only" => "onli",
                   "singly" => "singl",

                   "sky" => "sky",
                   "news" => "news",
                   "howe" => "howe",
                   "atlas" => "atlas",
                   "cosmos" => "cosmos",
                   "bias" => "bias",
                   "andes" => "andes" }

  # Special case words to ignore after step 1a.
  STEP_1A_SPECIAL_CASES = %w[ inning outing canning herring earring proceed exceed succeed ]

  # Tidy up the word before we get down to the algorithm: strip surrounding
  # whitespace, downcase, and map typographic quotes to plain apostrophes.
  # Always returns a new String.
  def porter2_tidy
    preword = to_s.strip.downcase

    # map apostrophe-like characters to apostrophes
    preword.gsub!(/‘/, "'")
    preword.gsub!(/’/, "'")

    preword
  end

  # Returns a copy of the word with leading apostrophes removed and every
  # consonant-like y (initial y, or y after a vowel) marked as 'Y'.
  def porter2_preprocess
    w = dup

    # remove any initial apostrophe
    w.gsub!(/^'*(.)/, '\1')

    # set initial y, or y after a vowel, to Y
    w.gsub!(/^y/, "Y")
    w.gsub!(/(#{V})y/, '\1Y')

    w
  end

  # R1: the part of the word after the first non-vowel following a vowel.
  # Words starting with gener, commun or arsen use the remainder after that
  # prefix instead. Returns "" when there is no such region.
  def porter2_r1
    if self =~ /^(gener|commun|arsen)(?<r1>.*)/
      Regexp.last_match(:r1)
    else
      self =~ /#{V}#{C}(?<r1>.*)$/
      Regexp.last_match(:r1) || ""
    end
  end

  # R2: the R1 region of R1 (the same vowel/non-vowel rule applied again).
  # Returns "" when there is no such region.
  def porter2_r2
    porter2_r1 =~ /#{V}#{C}(?<r2>.*)$/
    Regexp.last_match(:r2) || ""
  end

  # A short syllable in a word is either
  # 1. a vowel followed by a non-vowel other than w, x or Y and preceded by a non-vowel, or
  # 2. a vowel at the beginning of the word followed by a non-vowel.
  def porter2_ends_with_short_syllable?
    self =~ /#{SHORT_SYLLABLE}$/ ? true : false
  end

  # A word is short if it ends in a short syllable, and if R1 is null
  def porter2_is_short_word?
    porter2_ends_with_short_syllable? && porter2_r1.empty?
  end

  # Search for the longest among the suffixes,
  # * '
  # * 's
  # * 's'
  # and remove if found.
  #
  # Non-destructive like the other steps: the receiver is never modified, so
  # this is safe to call on frozen strings.
  def step_0
    sub(/(.)('s'|'s|')$/, '\1')
  end

  # Remove plural suffixes
  def step_1a
    if self =~ /sses$/
      sub(/sses$/, 'ss')
    elsif self =~ /..(ied|ies)$/
      # preceded by more than one letter
      sub(/(ied|ies)$/, 'i')
    elsif self =~ /(ied|ies)$/
      sub(/(ied|ies)$/, 'ie')
    elsif self =~ /(us|ss)$/
      self
    elsif self =~ /(#{V}.+)s$/
      # only drop the s when a vowel occurs somewhere before the letter
      # that immediately precedes it
      sub(/s$/, '')
    else
      self
    end
  end

  # Remove the verb endings eed, eedly, ed, edly, ing and ingly.
  #
  # gb_english:: when true, a stem left ending in "is" also gets a trailing
  #              "e" restored.
  def step_1b(gb_english = false)
    if self =~ /(eed|eedly)$/
      # eed/eedly is only rewritten when it lies inside R1
      if porter2_r1 =~ /(eed|eedly)$/
        return sub(/(eed|eedly)$/, 'ee')
      else
        return self
      end
    end

    w = dup
    # the part before the suffix must contain a vowel
    return w unless w =~ /#{V}.*(ed|edly|ing|ingly)$/

    w.sub!(/(ed|edly|ing|ingly)$/, '')
    if w =~ /(at|bl|iz)$/
      # restore the e: luxuriat -> luxuriate, troubl -> trouble
      w + 'e'
    elsif gb_english && w =~ /is$/
      w + 'e'
    elsif w =~ /#{Double}$/
      # undouble: hopp -> hop
      w.chop
    elsif w.porter2_is_short_word?
      # hop -> hope
      w + 'e'
    else
      w
    end
  end


  # Replace a final y or Y by i when it follows a non-vowel that is not the
  # first letter of the word.
  def step_1c
    if self =~ /.+#{C}(y|Y)$/
      sub(/(y|Y)$/, 'i')
    else
      self
    end
  end


  # Apply the STEP_2_MAPS rewrite for the longest matching suffix, provided
  # that suffix lies in R1; otherwise handle the li and ogi endings.
  #
  # gb_english:: when true, the iser and isation endings are also rewritten.
  def step_2(gb_english = false)
    r1 = porter2_r1
    s2m = STEP_2_MAPS.dup
    if gb_english
      s2m["iser"] = "ise"
      s2m["isation"] = "ise"
    end
    step_2_re = Regexp.union(s2m.keys.map {|r| Regexp.new(r + "$")})

    # Every alternative is anchored at the end of the word, so the leftmost
    # match is the longest suffix present.
    suffix = self[step_2_re]
    if suffix
      if r1 =~ /#{suffix}$/
        sub(/#{suffix}$/, s2m[suffix])
      else
        self
      end
    elsif r1 =~ /li$/ && self =~ /(#{Valid_LI})li$/
      sub(/li$/, '')
    elsif r1 =~ /ogi$/ && self =~ /logi$/
      sub(/ogi$/, 'og')
    else
      self
    end
  end


  # Delete ative when it lies in R2; otherwise apply the STEP_3_MAPS rewrite
  # for the longest matching suffix when that suffix lies in R1.
  #
  # gb_english:: when true, the alise ending is also rewritten.
  def step_3(gb_english = false)
    return sub(/ative$/, '') if self =~ /ative$/ && porter2_r2 =~ /ative$/

    s3m = STEP_3_MAPS.dup
    s3m["alise"] = "al" if gb_english
    step_3_re = Regexp.union(s3m.keys.map {|r| Regexp.new(r + "$")})

    suffix = self[step_3_re]
    if suffix && porter2_r1 =~ /#{suffix}$/
      sub(/#{suffix}$/, s3m[suffix])
    else
      self
    end
  end


  # Delete ion when it lies in R2 and follows s or t; otherwise delete the
  # longest matching STEP_4_MAPS suffix when that suffix lies in R2.
  #
  # gb_english:: when true, the ise ending is also deleted.
  def step_4(gb_english = false)
    r2 = porter2_r2
    return sub(/ion$/, '') if r2 =~ /ion$/ && self =~ /(s|t)ion$/

    s4m = STEP_4_MAPS.dup
    s4m["ise"] = "" if gb_english
    step_4_re = Regexp.union(s4m.keys.map {|r| Regexp.new(r + "$")})

    suffix = self[step_4_re]
    # anchored at the end, matching the R1 checks in steps 2 and 3
    if suffix && r2 =~ /#{suffix}$/
      sub(/#{suffix}$/, s4m[suffix])
    else
      self
    end
  end


  # Tidy the ending: ll -> l when the final l is in R2; drop a final e when
  # it is in R2, or in R1 and not preceded by a short syllable.
  def step_5
    r2 = porter2_r2
    if self =~ /ll$/ && r2 =~ /l$/
      sub(/ll$/, 'l')
    elsif self =~ /e$/ && r2 =~ /e$/
      sub(/e$/, '')
    elsif self =~ /e$/ && porter2_r1 =~ /e$/ && self !~ /#{SHORT_SYLLABLE}e$/
      sub(/e$/, '')
    else
      self
    end
  end


  # Turn the 'Y' markers introduced by #porter2_preprocess back into 'y'.
  def porter2_postprocess
    gsub(/Y/, 'y')
  end


  # Stem the word with the Porter 2 algorithm.
  #
  # gb_english:: when true, British -ise/-isation/-iser spellings are
  #              handled alongside the -ize forms.
  #
  # Words of one or two letters (after tidying) are returned unchanged, and
  # words listed in SPECIAL_CASES / STEP_1A_SPECIAL_CASES short-circuit the
  # remaining steps.
  def porter2_stem(gb_english = false)
    preword = porter2_tidy
    return preword if preword.length <= 2

    word = preword.porter2_preprocess
    return SPECIAL_CASES[word] if SPECIAL_CASES.has_key? word

    w1a = word.step_0.step_1a
    return w1a if STEP_1A_SPECIAL_CASES.include? w1a

    w1a.step_1b(gb_english)
       .step_1c
       .step_2(gb_english)
       .step_3(gb_english)
       .step_4(gb_english)
       .step_5
       .porter2_postprocess
  end

  # Same as #porter2_stem, but prints the word and its R1/R2 regions to
  # standard output after every step. Intended for debugging.
  def porter2_stem_verbose(gb_english = false)
    preword = porter2_tidy
    puts "Preword: #{preword}"
    return preword if preword.length <= 2

    word = preword.porter2_preprocess
    puts "Preprocessed: #{word}"

    if SPECIAL_CASES.has_key? word
      puts "Returning #{word} as special case #{SPECIAL_CASES[word]}"
      SPECIAL_CASES[word]
    else
      r1 = word.porter2_r1
      r2 = word.porter2_r2
      puts "R1 = #{r1}, R2 = #{r2}"

      w0 = word.step_0 ; puts "After step 0: #{w0} (R1 = #{w0.porter2_r1}, R2 = #{w0.porter2_r2})"
      w1a = w0.step_1a ; puts "After step 1a: #{w1a} (R1 = #{w1a.porter2_r1}, R2 = #{w1a.porter2_r2})"

      if STEP_1A_SPECIAL_CASES.include? w1a
        puts "Returning #{w1a} as 1a special case"
        w1a
      else
        w1b = w1a.step_1b(gb_english) ; puts "After step 1b: #{w1b} (R1 = #{w1b.porter2_r1}, R2 = #{w1b.porter2_r2})"
        w1c = w1b.step_1c ; puts "After step 1c: #{w1c} (R1 = #{w1c.porter2_r1}, R2 = #{w1c.porter2_r2})"
        w2 = w1c.step_2(gb_english) ; puts "After step 2: #{w2} (R1 = #{w2.porter2_r1}, R2 = #{w2.porter2_r2})"
        w3 = w2.step_3(gb_english) ; puts "After step 3: #{w3} (R1 = #{w3.porter2_r1}, R2 = #{w3.porter2_r2})"
        w4 = w3.step_4(gb_english) ; puts "After step 4: #{w4} (R1 = #{w4.porter2_r1}, R2 = #{w4.porter2_r2})"
        w5 = w4.step_5 ; puts "After step 5: #{w5}"
        wpost = w5.porter2_postprocess ; puts "After postprocess: #{wpost}"
        wpost
      end
    end
  end

  alias stem porter2_stem

end
371
# Mix the stemmer into String so every string responds to #stem,
# #porter2_stem and the individual step methods.
#
# The helper methods are deliberately left public; the once-considered
# restriction was:
#   private :porter2_preprocess, :porter2_r1, :porter2_r2
String.send(:include, Stemmable)