Got the gem layout working
[porter2stemmer.git] / lib / porter2_implementation.rb
# coding: utf-8

# Implementation of the Porter 2 stemmer. String#porter2_stem is the main stemming procedure.
4
class String
  # Tidy up the word before we get down to the algorithm: strip surrounding
  # whitespace, downcase, and map apostrophe-like characters (‘ and ’) to a
  # plain apostrophe. Returns a new string; the receiver is not modified.
  def porter2_tidy
    to_s.strip.downcase.gsub(/[‘’]/, "'")
  end


  # Preprocess the word.
  # Remove any initial ', if present. Then, set initial y, or y after a vowel, to Y
  #
  # (The comment to 'establish the regions R1 and R2' in the original description
  # is an implementation optimisation that identifies where the regions start. As
  # no modifications are made to the word that affect those positions, you may want
  # to cache them now. This implementation doesn't do that.)
  #
  # Returns a new string; the receiver is not modified.
  def porter2_preprocess
    gsub(/^'*(.)/, '\1')                    # remove any initial apostrophes
      .gsub(/^y/, "Y")                      # initial y becomes Y
      .gsub(/(#{Porter2::V})y/, '\1Y')      # y after a vowel becomes Y
  end


  # R1 is the portion of the word after the first non-vowel that follows a vowel
  # (with words beginning 'gener-', 'commun-', and 'arsen-' treated as special
  # cases: for those, R1 is whatever follows the prefix).
  #
  # Returns "" when the word has no such region.
  def porter2_r1
    if self =~ /^(gener|commun|arsen)(?<r1>.*)/
      Regexp.last_match(:r1)
    else
      self =~ /#{Porter2::V}#{Porter2::C}(?<r1>.*)$/
      # Regexp.last_match(:r1) is nil when the match above failed
      Regexp.last_match(:r1) || ""
    end
  end


  # R2 is the portion of R1 (porter2_r1) after the first non-vowel that follows
  # a vowel. Returns "" when R1 has no such region.
  def porter2_r2
    porter2_r1 =~ /#{Porter2::V}#{Porter2::C}(?<r2>.*)$/
    Regexp.last_match(:r2) || ""
  end


  # Returns true if the word ends with a short syllable
  # (as described by Porter2::SHORT_SYLLABLE), false otherwise.
  def porter2_ends_with_short_syllable?
    self =~ /#{Porter2::SHORT_SYLLABLE}$/ ? true : false
  end


  # A word is short if it ends in a short syllable, and R1 is null
  def porter2_is_short_word?
    porter2_ends_with_short_syllable? && porter2_r1.empty?
  end


  # Search for the longest among the suffixes,
  # * '
  # * 's
  # * 's'
  # and remove if found.
  #
  # The suffix is only removed when at least one character precedes it.
  # Returns a new string when a suffix was removed, otherwise the receiver.
  # The receiver is never modified, so this is safe to call on frozen strings.
  def porter2_step0
    sub(/(.)('s'|'s|')$/, '\1')
  end


  # Search for the longest among the following suffixes, and perform the action indicated.
  # sses:: replace by ss
  # ied, ies:: replace by i if preceded by more than one letter, otherwise by ie
  # s:: delete if the preceding word part contains a vowel not immediately before the s
  # us, ss:: do nothing
  def porter2_step1a
    if self =~ /sses$/
      sub(/sses$/, 'ss')
    elsif self =~ /..(ied|ies)$/
      sub(/(ied|ies)$/, 'i')
    elsif self =~ /(ied|ies)$/
      sub(/(ied|ies)$/, 'ie')
    elsif self =~ /(us|ss)$/
      self
    elsif self =~ /#{Porter2::V}.+s$/
      # a vowel, at least one further character, then the final s
      sub(/s$/, '')
    else
      self
    end
  end


  # Search for the longest among the following suffixes, and perform the action indicated.
  # eed, eedly:: replace by ee if the suffix is also in R1
  # ed, edly, ing, ingly:: delete if the preceding word part contains a vowel and,
  #                        after the deletion:
  #                        * if the word ends at, bl or iz: add e, or
  #                        * if the word ends with a double: remove the last letter, or
  #                        * if the word is short: add e
  #
  # (If gb_english is +true+, treat the 'is' suffix as 'iz' above.)
  def porter2_step1b(gb_english = false)
    if self =~ /(eed|eedly)$/
      return porter2_r1 =~ /(eed|eedly)$/ ? sub(/(eed|eedly)$/, 'ee') : self
    end

    w = dup
    if w =~ /#{Porter2::V}.*(ed|edly|ing|ingly)$/
      w.sub!(/(ed|edly|ing|ingly)$/, '')
      if w =~ /(at|bl|iz)$/
        w << 'e'
      elsif gb_english && w =~ /is$/
        w << 'e'
      elsif w =~ /#{Porter2::Double}$/
        w.chop!
      elsif w.porter2_is_short_word?
        w << 'e'
      end
    end
    w
  end


  # Replace a suffix of y or Y by i if it is preceded by a non-vowel which is
  # not the first letter of the word.
  def porter2_step1c
    if self =~ /.+#{Porter2::C}(y|Y)$/
      sub(/(y|Y)$/, 'i')
    else
      self
    end
  end


  # Search for the longest among the suffixes listed in the keys of Porter2::STEP_2_MAPS.
  # If one is found and that suffix occurs in R1, replace it with the value
  # found in STEP_2_MAPS.
  #
  # (Suffixes 'ogi' and 'li' are treated as special cases in the procedure.)
  #
  # (If gb_english is +true+, replace the 'iser' and 'isation' suffixes with
  # 'ise', similarly to how 'izer' and 'ization' are treated.)
  def porter2_step2(gb_english = false)
    r1 = porter2_r1
    s2m = Porter2::STEP_2_MAPS.dup
    if gb_english
      s2m["iser"] = "ise"
      s2m["isation"] = "ise"
    end
    # Every alternative is anchored at the end, so the leftmost match is the
    # longest matching suffix.
    step_2_re = Regexp.union(s2m.keys.map { |r| Regexp.new(r + "$") })
    if self =~ step_2_re
      suffix = Regexp.last_match(0)
      r1 =~ /#{suffix}$/ ? sub(/#{suffix}$/, s2m[suffix]) : self
    elsif r1 =~ /li$/ && self =~ /(#{Porter2::Valid_LI})li$/
      sub(/li$/, '')
    elsif r1 =~ /ogi$/ && self =~ /logi$/
      sub(/ogi$/, 'og')
    else
      self
    end
  end


  # Search for the longest among the suffixes listed in the keys of Porter2::STEP_3_MAPS.
  # If one is found and that suffix occurs in R1, replace it with the value
  # found in STEP_3_MAPS.
  #
  # (Suffix 'ative' is treated as a special case in the procedure: it is
  # deleted when it occurs in R2.)
  #
  # (If gb_english is +true+, replace the 'alise' suffix with
  # 'al', similarly to how 'alize' is treated.)
  def porter2_step3(gb_english = false)
    return sub(/ative$/, '') if self =~ /ative$/ && porter2_r2 =~ /ative$/

    s3m = Porter2::STEP_3_MAPS.dup
    s3m["alise"] = "al" if gb_english
    step_3_re = Regexp.union(s3m.keys.map { |r| Regexp.new(r + "$") })
    return self unless self =~ step_3_re

    suffix = Regexp.last_match(0)
    porter2_r1 =~ /#{suffix}$/ ? sub(/#{suffix}$/, s3m[suffix]) : self
  end


  # Search for the longest among the suffixes listed in the keys of Porter2::STEP_4_MAPS.
  # If one is found and that suffix occurs in R2, replace it with the value
  # found in STEP_4_MAPS.
  #
  # (Suffix 'ion' is treated as a special case in the procedure: it is
  # deleted when it occurs in R2 and is preceded by s or t.)
  #
  # (If gb_english is +true+, delete the 'ise' suffix if found.)
  def porter2_step4(gb_english = false)
    r2 = porter2_r2
    return sub(/ion$/, '') if r2 =~ /ion$/ && self =~ /(s|t)ion$/

    s4m = Porter2::STEP_4_MAPS.dup
    s4m["ise"] = "" if gb_english
    step_4_re = Regexp.union(s4m.keys.map { |r| Regexp.new(r + "$") })
    return self unless self =~ step_4_re

    suffix = Regexp.last_match(0)
    # Anchored at the end, like the R1 checks in steps 2 and 3
    r2 =~ /#{suffix}$/ ? sub(/#{suffix}$/, s4m[suffix]) : self
  end


  # Search for the following suffixes, and, if found, perform the action indicated.
  # e:: delete if in R2, or in R1 and not preceded by a short syllable
  # l:: delete if in R2 and preceded by l
  def porter2_step5
    if self =~ /ll$/ && porter2_r2 =~ /l$/
      sub(/ll$/, 'l')
    elsif self =~ /e$/ && porter2_r2 =~ /e$/
      sub(/e$/, '')
    elsif self =~ /e$/ && porter2_r1 =~ /e$/ && self !~ /#{Porter2::SHORT_SYLLABLE}e$/
      sub(/e$/, '')
    else
      self
    end
  end


  # Turn all Y letters into y
  def porter2_postprocess
    tr('Y', 'y')
  end

  public

  # Perform the stemming procedure. If +gb_english+ is true, treat '-ise' and similar suffixes
  # as '-ize' in American English.
  #
  # Words of two characters or fewer (after tidying) are returned as they are.
  # Words listed in Porter2::SPECIAL_CASES map directly to their stored stem,
  # and words found in Porter2::STEP_1A_SPECIAL_CASES after step 1a are
  # returned without further processing.
  def porter2_stem(gb_english = false)
    preword = porter2_tidy
    return preword if preword.length <= 2

    word = preword.porter2_preprocess
    return Porter2::SPECIAL_CASES[word] if Porter2::SPECIAL_CASES.has_key?(word)

    w1a = word.porter2_step0.porter2_step1a
    return w1a if Porter2::STEP_1A_SPECIAL_CASES.include?(w1a)

    w1a.porter2_step1b(gb_english)
       .porter2_step1c
       .porter2_step2(gb_english)
       .porter2_step3(gb_english)
       .porter2_step4(gb_english)
       .porter2_step5
       .porter2_postprocess
  end

  # A verbose version of porter2_stem that prints the output of each stage to STDOUT
  def porter2_stem_verbose(gb_english = false)
    preword = porter2_tidy
    puts "Preword: #{preword}"
    return preword if preword.length <= 2

    word = preword.porter2_preprocess
    puts "Preprocessed: #{word}"

    if Porter2::SPECIAL_CASES.has_key?(word)
      puts "Returning #{word} as special case #{Porter2::SPECIAL_CASES[word]}"
      return Porter2::SPECIAL_CASES[word]
    end

    r1 = word.porter2_r1
    r2 = word.porter2_r2
    puts "R1 = #{r1}, R2 = #{r2}"

    w0 = word.porter2_step0
    puts "After step 0:  #{w0} (R1 = #{w0.porter2_r1}, R2 = #{w0.porter2_r2})"
    w1a = w0.porter2_step1a
    puts "After step 1a: #{w1a} (R1 = #{w1a.porter2_r1}, R2 = #{w1a.porter2_r2})"

    if Porter2::STEP_1A_SPECIAL_CASES.include?(w1a)
      puts "Returning #{w1a} as 1a special case"
      return w1a
    end

    w1b = w1a.porter2_step1b(gb_english)
    puts "After step 1b: #{w1b} (R1 = #{w1b.porter2_r1}, R2 = #{w1b.porter2_r2})"
    w1c = w1b.porter2_step1c
    puts "After step 1c: #{w1c} (R1 = #{w1c.porter2_r1}, R2 = #{w1c.porter2_r2})"
    w2 = w1c.porter2_step2(gb_english)
    puts "After step 2:  #{w2} (R1 = #{w2.porter2_r1}, R2 = #{w2.porter2_r2})"
    w3 = w2.porter2_step3(gb_english)
    puts "After step 3:  #{w3} (R1 = #{w3.porter2_r1}, R2 = #{w3.porter2_r2})"
    w4 = w3.porter2_step4(gb_english)
    puts "After step 4:  #{w4} (R1 = #{w4.porter2_r1}, R2 = #{w4.porter2_r2})"
    w5 = w4.porter2_step5
    puts "After step 5:  #{w5}"
    wpost = w5.porter2_postprocess
    puts "After postprocess: #{wpost}"
    wpost
  end

  alias stem porter2_stem

end
326