3 # Implementation of the Porter 2 stemmer. String#porter2_stem is the main stemming procedure.
6 # Tidy up the word before we get down to the algorithm
8 preword
= self.to_s
.strip
.downcase
10 # map apostrophe-like characters to apostrophes
11 preword
.gsub
!(/‘/, "'")
12 preword
.gsub
!(/’/, "'")
18 # Preprocess the word.
19 # Remove any initial ', if present. Then, set initial y, or y after a vowel, to Y
21 # (The comment to 'establish the regions R1 and R2' in the original description
22 # is an implementation optimisation that identifies where the regions start. As
23 # no modifications are made to the word that affect those positions, you may want
24 # to cache them now. This implementation doesn't do that.)
25 def porter2_preprocess
28 # remove any initial apostrophe
29 w
.gsub
!(/^'*(.)/, '\1')
31 # set initial y, or y after a vowel, to Y
33 w
.gsub
!(/(#{Porter2::V})y/, '\1Y')
39 # R1 is the portion of the word after the first non-vowel after the first vowel
40 # (with words beginning 'gener-', 'commun-', and 'arsen-' treated as special cases)
42 if self =~
/^(gener|commun|arsen)(?<r1>.*)/
43 Regexp
.last_match(:r1)
45 self =~
/#{Porter2::V}#{Porter2::C}(?<r1>.*)$/
46 Regexp
.last_match(:r1) || ""
51 # R2 is the portion of R1 (porter2_r1) after the first non-vowel after the first vowel
53 self.porter2_r1
=~
/#{Porter2::V}#{Porter2::C}(?<r2>.*)$/
54 Regexp
.last_match(:r2) || ""
58 # Returns true if the word ends with a short syllable
59 def porter2_ends_with_short_syllable
?
60 self =~
/#{Porter2::SHORT_SYLLABLE}$/ ? true : false
64 # A word is short if it ends in a short syllable, and R1 is null
65 def porter2_is_short_word
?
66 self.porter2_ends_with_short_syllable
? and self.porter2_r1
.empty
?
70 # Search for the longest among the suffixes,
74 # and remove if found.
76 self.sub
!(/(.)('s'|'s|')$/, '\1') || self
80 # Search for the longest among the following suffixes, and perform the action indicated.
81 # sses:: replace by ss
82 # ied, ies:: replace by i if preceded by more than one letter, otherwise by ie
83 # s:: delete if the preceding word part contains a vowel not immediately before the s
87 self.sub(/sses$/, 'ss')
88 elsif self =~
/..(ied|ies)$/
89 self.sub(/(ied|ies)$/, 'i')
90 elsif self =~
/(ied|ies)$/
91 self.sub(/(ied|ies)$/, 'ie')
92 elsif self =~
/(us|ss)$/
95 if self =~
/(#{Porter2::V}.+)s$/
106 # Search for the longest among the following suffixes, and perform the action indicated.
107 # eed, eedly:: replace by ee if the suffix is also in R1
108 # ed, edly, ing, ingly:: delete if the preceding word part contains a vowel and,
109 # after the deletion:
110 # * if the word ends at, bl or iz: add e, or
111 # * if the word ends with a double: remove the last letter, or
112 # * if the word is short: add e
114 # (If gb_english is +true+, treat the 'is' suffix as 'iz' above.)
115 def porter2_step1b(gb_english
= false)
116 if self =~
/(eed|eedly)$/
117 if self.porter2_r1
=~
/(eed|eedly)$/
118 self.sub(/(eed|eedly)$/, 'ee')
124 if w
=~
/#{Porter2::V}.*(ed|edly|ing|ingly)$/
125 w
.sub
!(/(ed|edly|ing|ingly)$/, '')
126 if w
=~
/(at|lb|iz)$/
128 elsif w
=~
/is$/ and gb_english
130 elsif w
=~
/#{Porter2::Double}$/
132 elsif w
.porter2_is_short_word
?
141 # Replace a suffix of y or Y by i if it is preceded by a non-vowel which is
142 # not the first letter of the word.
144 if self =~
/.+#{Porter2::C}(y|Y)$/
145 self.sub(/(y|Y)$/, 'i')
152 # Search for the longest among the suffixes listed in the keys of Porter2::STEP_2_MAPS.
153 # If one is found and that suffix occurs in R1, replace it with the value
154 # found in STEP_2_MAPS.
156 # (Suffixes 'ogi' and 'li' are treated as special cases in the procedure.)
158 # (If gb_english is +true+, replace the 'iser' and 'isation' suffixes with
159 # 'ise', similarly to how 'izer' and 'ization' are treated.)
160 def porter2_step2(gb_english
= false)
162 s2m
= Porter2
::STEP_2_MAPS.dup
165 s2m
["isation"] = "ise"
167 step_2_re
= Regexp
.union(s2m
.keys
.map
{|r
| Regexp
.new(r
+ "$")})
170 self.sub(/#{$&}$/, s2m
[$
&])
174 elsif r1
=~
/li$/ and self =~
/(#{Porter2::Valid_LI})li$/
176 elsif r1
=~
/ogi$/ and self =~
/logi$/
177 self.sub(/ogi$/, 'og')
184 # Search for the longest among the suffixes listed in the keys of Porter2::STEP_3_MAPS.
185 # If one is found and that suffix occurs in R1, replace it with the value
186 # found in STEP_3_MAPS.
188 # (Suffix 'ative' is treated as a special case in the procedure.)
190 # (If gb_english is +true+, replace the 'alise' suffix with
191 # 'al', similarly to how 'alize' is treated.)
192 def porter2_step3(gb_english
= false)
193 if self =~
/ative$/ and self.porter2_r2
=~
/ative$/
194 self.sub(/ative$/, '')
196 s3m
= Porter2
::STEP_3_MAPS.dup
200 step_3_re
= Regexp
.union(s3m
.keys
.map
{|r
| Regexp
.new(r
+ "$")})
202 if self =~ step_3_re
and r1
=~
/#{$&}$/
203 self.sub(/#{$&}$/, s3m
[$
&])
211 # Search for the longest among the suffixes listed in the keys of Porter2::STEP_4_MAPS.
212 # If one is found and that suffix occurs in R2, replace it with the value
213 # found in STEP_4_MAPS.
215 # (Suffix 'ion' is treated as a special case in the procedure.)
217 # (If gb_english is +true+, delete the 'ise' suffix if found.)
218 def porter2_step4(gb_english
= false)
219 if self.porter2_r2
=~
/ion$/ and self =~
/(s|t)ion$/
222 s4m
= Porter2
::STEP_4_MAPS.dup
226 step_4_re
= Regexp
.union(s4m
.keys
.map
{|r
| Regexp
.new(r
+ "$")})
230 self.sub(/#{$&}$/, s4m
[$
&])
241 # Search for the following suffixes, and, if found, perform the action indicated.
242 # e:: delete if in R2, or in R1 and not preceded by a short syllable
243 # l:: delete if in R2 and preceded by l
245 if self =~
/ll$/ and self.porter2_r2
=~
/l$/
247 elsif self =~
/e$/ and self.porter2_r2
=~
/e$/
251 if self =~
/e$/ and r1
=~
/e$/ and not self =~
/#{Porter2::SHORT_SYLLABLE}e$/
260 # Turn all Y letters into y
261 def porter2_postprocess
267 # Perform the stemming procedure. If +gb_english+ is true, treat '-ise' and similar suffixes
268 # as '-ize' in American English.
269 def porter2_stem(gb_english
= false)
270 preword
= self.porter2_tidy
271 return preword
if preword
.length
<= 2
273 word
= preword
.porter2_preprocess
275 if Porter2
::SPECIAL_CASES.has_key
? word
276 Porter2
::SPECIAL_CASES[word
]
278 w1a
= word
.porter2_step0
.porter2_step1a
279 if Porter2
::STEP_1A_SPECIAL_CASES.include? w1a
282 w1a
.porter2_step1b(gb_english
).porter2_step1c
.porter2_step2(gb_english
).porter2_step3(gb_english
).porter2_step4(gb_english
).porter2_step5
.porter2_postprocess
287 # A verbose version of porter2_stem that prints the output of each stage to STDOUT
288 def porter2_stem_verbose(gb_english
= false)
289 preword
= self.porter2_tidy
290 puts
"Preword: #{preword}"
291 return preword
if preword
.length
<= 2
293 word
= preword
.porter2_preprocess
294 puts
"Preprocessed: #{word}"
296 if Porter2
::SPECIAL_CASES.has_key
? word
297 puts
"Returning #{word} as special case #{Porter2::SPECIAL_CASES[word]}"
298 Porter2
::SPECIAL_CASES[word
]
302 puts
"R1 = #{r1}, R2 = #{r2}"
304 w0
= word
.porter2_step0
; puts
"After step 0: #{w0} (R1 = #{w0.porter2_r1}, R2 = #{w0.porter2_r2})"
305 w1a
= w0
.porter2_step1a
; puts
"After step 1a: #{w1a} (R1 = #{w1a.porter2_r1}, R2 = #{w1a.porter2_r2})"
307 if Porter2
::STEP_1A_SPECIAL_CASES.include? w1a
308 puts
"Returning #{w1a} as 1a special case"
311 w1b
= w1a
.porter2_step1b(gb_english
) ; puts
"After step 1b: #{w1b} (R1 = #{w1b.porter2_r1}, R2 = #{w1b.porter2_r2})"
312 w1c
= w1b
.porter2_step1c
; puts
"After step 1c: #{w1c} (R1 = #{w1c.porter2_r1}, R2 = #{w1c.porter2_r2})"
313 w2
= w1c
.porter2_step2(gb_english
) ; puts
"After step 2: #{w2} (R1 = #{w2.porter2_r1}, R2 = #{w2.porter2_r2})"
314 w3
= w2
.porter2_step3(gb_english
) ; puts
"After step 3: #{w3} (R1 = #{w3.porter2_r1}, R2 = #{w3.porter2_r2})"
315 w4
= w3
.porter2_step4(gb_english
) ; puts
"After step 4: #{w4} (R1 = #{w4.porter2_r1}, R2 = #{w4.porter2_r2})"
316 w5
= w4
.porter2_step5
; puts
"After step 5: #{w5}"
317 wpost
= w5
.porter2_postprocess
; puts
"After postprocess: #{wpost}"
323 alias stem porter2_stem