3 require 'porter2_module'
5 # ==The Porter 2 stemmer
7 # This is the Porter 2 stemming algorithm, as described at
8 # http://snowball.tartarus.org/algorithms/english/stemmer.html
9 # The original paper is:
11 # Porter, 1980, "An algorithm for suffix stripping", _Program_, Vol. 14,
14 # Constants for the stemmer are in the Porter2 module.
16 # Procedures that implement the stemmer are added to the String class.
18 # The stemmer algorithm is implemented in the porter2_stem procedure.
20 # ==Internationalisation
21 # There isn't much, as this is a stemmer that only works for English.
23 # The +gb_english+ flag to the various procedures allows the stemmer to treat the British
24 # English '-ise' the same as the American English '-ize'.
27 # Several places in the algorithm require matching the longest suffix of a word. The
28 # regexp engine in Ruby 1.9 seems to handle alterntives in regexps by finding the
29 # alternative that matches at the first position in the string. As we're only talking
30 # about suffixes, that first match is also the longest suffix. If the regexp engine changes,
31 # this behaviour may change and break the stemmer.
34 # Tidy up the word before we get down to the algorithm
36 preword
= self.to_s
.strip
.downcase
38 # map apostrophe-like characters to apostrophes
39 preword
.gsub
!(/‘/, "'")
40 preword
.gsub
!(/’/, "'")
46 # Preprocess the word.
47 # Remove any initial ', if present. Then, set initial y, or y after a vowel, to Y
49 # (The comment to 'establish the regions R1 and R2' in the original description
50 # is an implementation optimisation that identifies where the regions start. As
51 # no modifications are made to the word that affect those positions, you may want
52 # to cache them now. This implementation doesn't do that.)
53 def porter2_preprocess
56 # remove any initial apostrophe
57 w
.gsub
!(/^'*(.)/, '\1')
59 # set initial y, or y after a vowel, to Y
61 w
.gsub
!(/(#{Porter2::V})y/, '\1Y')
67 # R1 is the portion of the word after the first non-vowel after the first vowel
68 # (with words beginning 'gener-', 'commun-', and 'arsen-' treated as special cases
70 if self =~
/^(gener|commun|arsen)(?<r1>.*)/
71 Regexp
.last_match(:r1)
73 self =~
/#{Porter2::V}#{Porter2::C}(?<r1>.*)$/
74 Regexp
.last_match(:r1) || ""
79 # R2 is the portion of R1 (porter2_r1) after the first non-vowel after the first vowel
81 self.porter2_r1
=~
/#{Porter2::V}#{Porter2::C}(?<r2>.*)$/
82 Regexp
.last_match(:r2) || ""
86 # Returns true if the word ends with a short syllable
87 def porter2_ends_with_short_syllable
?
88 self =~
/#{Porter2::SHORT_SYLLABLE}$/ ? true : false
92 # A word is short if it ends in a short syllable, and R1 is null
93 def porter2_is_short_word
?
94 self.porter2_ends_with_short_syllable
? and self.porter2_r1
.empty
?
98 # Search for the longest among the suffixes,
102 # and remove if found.
104 self.sub
!(/(.)('s'|'s|')$/, '\1') || self
108 # Search for the longest among the following suffixes, and perform the action indicated.
109 # sses:: replace by ss
110 # ied, ies:: replace by i if preceded by more than one letter, otherwise by ie
111 # s:: delete if the preceding word part contains a vowel not immediately before the s
112 # us, ss:: do nothing
115 self.sub(/sses$/, 'ss')
116 elsif self =~
/..(ied|ies)$/
117 self.sub(/(ied|ies)$/, 'i')
118 elsif self =~
/(ied|ies)$/
119 self.sub(/(ied|ies)$/, 'ie')
120 elsif self =~
/(us|ss)$/
123 if self =~
/(#{Porter2::V}.+)s$/
134 # Search for the longest among the following suffixes, and perform the action indicated.
135 # eed, eedly:: replace by ee if the suffix is also in R1
136 # ed, edly, ing, ingly:: delete if the preceding word part contains a vowel and,
137 # after the deletion:
138 # * if the word ends at, bl or iz: add e, or
139 # * if the word ends with a double: remove the last letter, or
140 # * if the word is short: add e
142 # (If gb_english is +true+, treat the 'is' suffix as 'iz' above.)
143 def porter2_step1b(gb_english
= false)
144 if self =~
/(eed|eedly)$/
145 if self.porter2_r1
=~
/(eed|eedly)$/
146 self.sub(/(eed|eedly)$/, 'ee')
152 if w
=~
/#{Porter2::V}.*(ed|edly|ing|ingly)$/
153 w
.sub
!(/(ed|edly|ing|ingly)$/, '')
154 if w
=~
/(at|lb|iz)$/
156 elsif w
=~
/is$/ and gb_english
158 elsif w
=~
/#{Porter2::Double}$/
160 elsif w
.porter2_is_short_word
?
169 # Replace a suffix of y or Y by i if it is preceded by a non-vowel which is
170 # not the first letter of the word.
172 if self =~
/.+#{Porter2::C}(y|Y)$/
173 self.sub(/(y|Y)$/, 'i')
180 # Search for the longest among the suffixes listed in the keys of Porter2::STEP_2_MAPS.
181 # If one is found and that suffix occurs in R1, replace it with the value
182 # found in STEP_2_MAPS.
184 # (Suffixes 'ogi' and 'li' are treated as special cases in the procedure.)
186 # (If gb_english is +true+, replace the 'iser' and 'isation' suffixes with
187 # 'ise', similarly to how 'izer' and 'ization' are treated.)
188 def porter2_step2(gb_english
= false)
190 s2m
= Porter2
::STEP_2_MAPS.dup
193 s2m
["isation"] = "ise"
195 step_2_re
= Regexp
.union(s2m
.keys
.map
{|r
| Regexp
.new(r
+ "$")})
198 self.sub(/#{$&}$/, s2m
[$
&])
202 elsif r1
=~
/li$/ and self =~
/(#{Porter2::Valid_LI})li$/
204 elsif r1
=~
/ogi$/ and self =~
/logi$/
205 self.sub(/ogi$/, 'og')
212 # Search for the longest among the suffixes listed in the keys of Porter2::STEP_3_MAPS.
213 # If one is found and that suffix occurs in R1, replace it with the value
214 # found in STEP_3_MAPS.
216 # (Suffix 'ative' is treated as a special case in the procedure.)
218 # (If gb_english is +true+, replace the 'alise' suffix with
219 # 'al', similarly to how 'alize' is treated.)
220 def porter2_step3(gb_english
= false)
221 if self =~
/ative$/ and self.porter2_r2
=~
/ative$/
222 self.sub(/ative$/, '')
224 s3m
= Porter2
::STEP_3_MAPS.dup
228 step_3_re
= Regexp
.union(s3m
.keys
.map
{|r
| Regexp
.new(r
+ "$")})
230 if self =~ step_3_re
and r1
=~
/#{$&}$/
231 self.sub(/#{$&}$/, s3m
[$
&])
239 # Search for the longest among the suffixes listed in the keys of Porter2::STEP_4_MAPS.
240 # If one is found and that suffix occurs in R2, replace it with the value
241 # found in STEP_4_MAPS.
243 # (Suffix 'ion' is treated as a special case in the procedure.)
245 # (If gb_english is +true+, delete the 'ise' suffix if found.)
246 def porter2_step4(gb_english
= false)
247 if self.porter2_r2
=~
/ion$/ and self =~
/(s|t)ion$/
250 s4m
= Porter2
::STEP_4_MAPS.dup
254 step_4_re
= Regexp
.union(s4m
.keys
.map
{|r
| Regexp
.new(r
+ "$")})
258 self.sub(/#{$&}$/, s4m
[$
&])
269 # Search for the the following suffixes, and, if found, perform the action indicated.
270 # e:: delete if in R2, or in R1 and not preceded by a short syllable
271 # l:: delete if in R2 and preceded by l
273 if self =~
/ll$/ and self.porter2_r2
=~
/l$/
275 elsif self =~
/e$/ and self.porter2_r2
=~
/e$/
279 if self =~
/e$/ and r1
=~
/e$/ and not self =~
/#{Porter2::SHORT_SYLLABLE}e$/
288 # Turn all Y letters into y
289 def porter2_postprocess
295 # Perform the stemming procedure. If +gb_english+ is true, treat '-ise' and similar suffixes
296 # as '-ize' in American English.
297 def porter2_stem(gb_english
= false)
298 preword
= self.porter2_tidy
299 return preword
if preword
.length
<= 2
301 word
= preword
.porter2_preprocess
303 if Porter2
::SPECIAL_CASES.has_key
? word
304 Porter2
::SPECIAL_CASES[word
]
306 w1a
= word
.porter2_step0
.porter2_step1a
307 if Porter2
::STEP_1A_SPECIAL_CASES.include? w1a
310 w1a
.porter2_step1b(gb_english
).porter2_step1c
.porter2_step2(gb_english
).porter2_step3(gb_english
).porter2_step4(gb_english
).porter2_step5
.porter2_postprocess
315 # A verbose version of porter2_stem that prints the output of each stage to STDOUT
316 def porter2_stem_verbose(gb_english
= false)
317 preword
= self.porter2_tidy
318 puts
"Preword: #{preword}"
319 return preword
if preword
.length
<= 2
321 word
= preword
.porter2_preprocess
322 puts
"Preprocessed: #{word}"
324 if Porter2
::SPECIAL_CASES.has_key
? word
325 puts
"Returning #{word} as special case #{Porter2::SPECIAL_CASES[word]}"
326 Porter2
::SPECIAL_CASES[word
]
330 puts
"R1 = #{r1}, R2 = #{r2}"
332 w0
= word
.porter2_step0
; puts
"After step 0: #{w0} (R1 = #{w0.porter2_r1}, R2 = #{w0.porter2_r2})"
333 w1a
= w0
.porter2_step1a
; puts
"After step 1a: #{w1a} (R1 = #{w1a.porter2_r1}, R2 = #{w1a.porter2_r2})"
335 if Porter2
::STEP_1A_SPECIAL_CASES.include? w1a
336 puts
"Returning #{w1a} as 1a special case"
339 w1b
= w1a
.porter2_step1b(gb_english
) ; puts
"After step 1b: #{w1b} (R1 = #{w1b.porter2_r1}, R2 = #{w1b.porter2_r2})"
340 w1c
= w1b
.porter2_step1c
; puts
"After step 1c: #{w1c} (R1 = #{w1c.porter2_r1}, R2 = #{w1c.porter2_r2})"
341 w2
= w1c
.porter2_step2(gb_english
) ; puts
"After step 2: #{w2} (R1 = #{w2.porter2_r1}, R2 = #{w2.porter2_r2})"
342 w3
= w2
.porter2_step3(gb_english
) ; puts
"After step 3: #{w3} (R1 = #{w3.porter2_r1}, R2 = #{w3.porter2_r2})"
343 w4
= w3
.porter2_step4(gb_english
) ; puts
"After step 4: #{w4} (R1 = #{w4.porter2_r1}, R2 = #{w4.porter2_r2})"
344 w5
= w4
.porter2_step5
; puts
"After step 5: #{w5}"
345 wpost
= w5
.porter2_postprocess
; puts
"After postprocess: #{wpost}"
351 alias stem porter2_stem