3 # Porter 2 stemmer in Ruby.
5 # This is the Porter 2 stemming algorithm, as described at
6 # http://snowball.tartarus.org/algorithms/english/stemmer.html
7 # The original paper is:
9 # Porter, 1980, "An algorithm for suffix stripping", _Program_, Vol. 14,
19 # A non-vowel other than w, x, or Y
22 # Doubles created when adding a suffix: these are undoubled when stemmed
23 Double
= "(bb|dd|ff|gg|mm|nn|pp|rr|tt)"
25 # A valid letter that can come before 'li'
26 Valid_LI
= "[cdeghkmnrt]"
28 # A specification for a short syllable
29 SHORT_SYLLABLE
= "((#{C}#{V}#{CW})|(^#{V}#{C}))"
31 # Suffix transformations used in Step 2.
32 # (ogi, li endings dealt with in procedure)
33 STEP_2_MAPS
= {"tional" => "tion",
56 # Suffix transformations used in Step 3.
57 # (ative ending dealt with in procedure)
58 STEP_3_MAPS
= {"tional" => "tion",
67 # Suffix transformations used in Step 4.
68 STEP_4_MAPS
= {"al" => "",
86 # Special-case stemmings
87 SPECIAL_CASES
= {"skis" => "ski",
104 "cosmos" => "cosmos",
108 # Special case words to ignore after step 1a.
109 STEP_1A_SPECIAL_CASES
= %w
[ inning outing canning herring earring proceed exceed succeed
]
111 # Tidy up the word before we get down to the algorithm
113 preword
= self.to_s
.strip
.downcase
115 # map apostrophe-like characters to apostrophes
116 preword
.gsub
!(/‘/, "'")
117 preword
.gsub
!(/’/, "'")
122 def porter2_preprocess
125 # remove any initial apostrophe
126 w
.gsub
!(/^'*(.)/, '\1')
128 # set initial y, or y after a vowel, to Y
130 w
.gsub
!(/(#{V})y/, '\1Y')
135 # R1: the part of the word after the first non-vowel that follows a vowel
137 if self =~
/^(gener|commun|arsen)(?<r1>.*)/
138 Regexp
.last_match(:r1)
140 self =~
/#{V}#{C}(?<r1>.*)$/
141 Regexp
.last_match(:r1) || ""
145 # R2: the part of R1 after the first non-vowel that follows a vowel in R1
147 self.porter2_r1
=~
/#{V}#{C}(?<r2>.*)$/
148 Regexp
.last_match(:r2) || ""
151 # A short syllable in a word is either
152 # 1. a vowel followed by a non-vowel other than w, x or Y and preceded by a non-vowel, or
153 # 2. a vowel at the beginning of the word followed by a non-vowel.
154 def porter2_ends_with_short_syllable
?
155 self =~
/#{SHORT_SYLLABLE}$/ ? true : false
158 # A word is short if it ends in a short syllable and its R1 is empty
159 def porter2_is_short_word
?
160 self.porter2_ends_with_short_syllable
? and self.porter2_r1
.empty
?
163 # Search for the longest among the suffixes,
167 # and remove if found.
169 self.sub
!(/(.)('s'|'s|')$/, '\1') || self
172 # Remove plural suffixes
175 self.sub(/sses$/, 'ss')
176 elsif self =~
/..(ied|ies)$/
177 self.sub(/(ied|ies)$/, 'i')
178 elsif self =~
/(ied|ies)$/
179 self.sub(/(ied|ies)$/, 'ie')
180 elsif self =~
/(us|ss)$/
183 if self =~
/(#{V}.+)s$/
193 def step_1b(gb_english
= false)
194 if self =~
/(eed|eedly)$/
195 if self.porter2_r1
=~
/(eed|eedly)$/
196 self.sub(/(eed|eedly)$/, 'ee')
202 if w
=~
/#{V}.*(ed|edly|ing|ingly)$/
203 w
.sub
!(/(ed|edly|ing|ingly)$/, '')
204 if w
=~
/(at|lb|iz)$/
206 elsif w
=~
/is$/ and gb_english
208 elsif w
=~
/#{Double}$/
210 elsif w
.porter2_is_short_word
?
220 if self =~
/.+#{C}(y|Y)$/
221 self.sub(/(y|Y)$/, 'i')
228 def step_2(gb_english
= false)
230 s2m
= STEP_2_MAPS
.dup
233 s2m
["isation"] = "ise"
235 step_2_re
= Regexp
.union(s2m
.keys
.map
{|r
| Regexp
.new(r
+ "$")})
238 self.sub(/#{$&}$/, s2m
[$
&])
242 elsif r1
=~
/li$/ and self =~
/(#{Valid_LI})li$/
244 elsif r1
=~
/ogi$/ and self =~
/logi$/
245 self.sub(/ogi$/, 'og')
252 def step_3(gb_english
= false)
253 if self =~
/ative$/ and self.porter2_r2
=~
/ative$/
254 self.sub(/ative$/, '')
256 s3m
= STEP_3_MAPS
.dup
260 step_3_re
= Regexp
.union(s3m
.keys
.map
{|r
| Regexp
.new(r
+ "$")})
262 if self =~ step_3_re
and r1
=~
/#{$&}$/
263 self.sub(/#{$&}$/, s3m
[$
&])
271 def step_4(gb_english
= false)
272 if self.porter2_r2
=~
/ion$/ and self =~
/(s|t)ion$/
275 s4m
= STEP_4_MAPS
.dup
279 step_4_re
= Regexp
.union(s4m
.keys
.map
{|r
| Regexp
.new(r
+ "$")})
283 self.sub(/#{$&}$/, s4m
[$
&])
295 if self =~
/ll$/ and self.porter2_r2
=~
/l$/
297 elsif self =~
/e$/ and self.porter2_r2
=~
/e$/
301 if self =~
/e$/ and r1
=~
/e$/ and not self =~
/#{SHORT_SYLLABLE}e$/
310 def porter2_postprocess
315 def porter2_stem(gb_english
= false)
316 preword
= self.porter2_tidy
317 return preword
if preword
.length
<= 2
319 word
= preword
.porter2_preprocess
321 if SPECIAL_CASES
.has_key
? word
324 w1a
= word
.step_0
.step_1a
325 if STEP_1A_SPECIAL_CASES
.include? w1a
328 w1a
.step_1b(gb_english
).step_1c
.step_2(gb_english
).step_3(gb_english
).step_4(gb_english
).step_5
.porter2_postprocess
333 def porter2_stem_verbose(gb_english
= false)
334 preword
= self.porter2_tidy
335 puts
"Preword: #{preword}"
336 return preword
if preword
.length
<= 2
338 word
= preword
.porter2_preprocess
339 puts
"Preprocessed: #{word}"
341 if SPECIAL_CASES
.has_key
? word
342 puts
"Returning #{word} as special case #{SPECIAL_CASES[word]}"
347 puts
"R1 = #{r1}, R2 = #{r2}"
349 w0
= word
.step_0
; puts
"After step 0: #{w0} (R1 = #{w0.porter2_r1}, R2 = #{w0.porter2_r2})"
350 w1a
= w0
.step_1a
; puts
"After step 1a: #{w1a} (R1 = #{w1a.porter2_r1}, R2 = #{w1a.porter2_r2})"
352 if STEP_1A_SPECIAL_CASES
.include? w1a
353 puts
"Returning #{w1a} as 1a special case"
356 w1b
= w1a
.step_1b(gb_english
) ; puts
"After step 1b: #{w1b} (R1 = #{w1b.porter2_r1}, R2 = #{w1b.porter2_r2})"
357 w1c
= w1b
.step_1c
; puts
"After step 1c: #{w1c} (R1 = #{w1c.porter2_r1}, R2 = #{w1c.porter2_r2})"
358 w2
= w1c
.step_2(gb_english
) ; puts
"After step 2: #{w2} (R1 = #{w2.porter2_r1}, R2 = #{w2.porter2_r2})"
359 w3
= w2
.step_3(gb_english
) ; puts
"After step 3: #{w3} (R1 = #{w3.porter2_r1}, R2 = #{w3.porter2_r2})"
360 w4
= w3
.step_4(gb_english
) ; puts
"After step 4: #{w4} (R1 = #{w4.porter2_r1}, R2 = #{w4.porter2_r2})"
361 w5
= w4
.step_5
; puts
"After step 5: #{w5}"
362 wpost
= w5
.porter2_postprocess
; puts
"After postprocess: #{wpost}"
368 alias stem porter2_stem
372 # Add stem method to all Strings
376 # private :porter2_preprocess, :porter2_r1, :porter2_r2