1 #! /local/ruby/bin/ruby
4 # Porter stemmer in Ruby.
6 # This is the Porter 2 stemming algorithm, as described at
7 # http://snowball.tartarus.org/algorithms/english/stemmer.html
8 # The original paper is:
10 # Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
15 C = "[^aeiouy]" # consonant
16 V = "[aeiouy]" # vowel
17 CW = "[^aeiouywxY]" # a non-vowel other than w, x, or Y
18 Double = "bb|dd|ff|gg|mm|nn|pp|rr|tt" # alternation of doubled-consonant pairs, tested at the end of a stem
19 Valid_LI = "[cdeghkmnrt]" # letters that must precede a final "li" for step_2 to drop it
20 SHORT_SYLLABLE = "((#{C}#{V}#{CW})|(^#{V}#{C}))" # consonant + vowel + CW anywhere, or vowel + consonant at the start
22 STEP_2_MAPS = {"tional" => "tion",
45 # li ending dealt with in procedure
47 STEP_3_MAPS = {"tional" => "tion",
55 # ative ending dealt with in procedure
57 STEP_4_MAPS = {"al" => "",
78 SPECIAL_CASES = {"skis" => "ski",
99 STEP_1A_SPECIAL_CASES = %w[ inning outing canning herring earring proceed exceed succeed ] # whole words checked with include? against the intermediate stem in porter2_stem
102 def porter2_preprocess
103 w = self.to_s.strip.downcase
105 # map apostrophe-like characters to apostrophes
109 # remove any initial apostrophe
112 # set initial y, or y after a vowel, to Y
114 w.gsub!(/(#{V})y/, '\1Y')
119 # R1: the part of the word after the first non-vowel that follows a vowel (after the prefix for words starting gener-, commun- or arsen-)
121 if self =~ /^(gener|commun|arsen)(?<r1>.*)/
122 Regexp.last_match(:r1)
124 self =~ /#{V}#{C}(?<r1>.*)$/
125 Regexp.last_match(:r1) || ""
129 # R2: the part of R1 after the first non-vowel that follows a vowel within R1
131 self.porter2_r1 =~ /#{V}#{C}(?<r2>.*)$/
132 Regexp.last_match(:r2) || ""
135 def porter2_ends_with_short_syllable?
136 self =~ /#{SHORT_SYLLABLE}$/ ? true : false
139 def porter2_is_short_word?(r1)
140 self.porter2_ends_with_short_syllable? and r1.empty?
145 self.sub!(/('s'|'s|')$/, '') || self
148 # Remove plural suffixes
150 self.sub!(/sses$/, 'ss')
151 self.sub!(/^(.)(ies|ied)$/, '\1ie')
152 self.sub!(/^(.+)(ies|ied)$/, '\1i')
153 self.sub!(/^(ies|ied)$/, 'ie')
154 unless self =~ /(ss|us)$/
155 self.sub!(/(#{V}.+)s$/, '\1')
160 def step_1b(r1, gb_english = false)
161 self.sub!(/(eed|eedly)$/, 'ee') if r1 =~ /eed|eedly/
163 if w =~ /#{V}.*(ed|edly|ing|ingly)$/
164 w.sub!(/(ed|edly|ing|ingly)$/, '')
165 if w =~ /(at|lb|iz)$/
167 elsif w =~ /is$/ and gb_english
169 elsif w =~ /#{Double}$/
171 elsif w.porter2_is_short_word?(w.porter2_r1)
179 if self =~ /.+#{C}.*(y|Y)$/
180 self.sub(/(y|Y)$/, 'i')
187 def step_2(gb_english = false)
188 if self =~ /(#{Valid_LI})li$/
189 self.dup.sub(/(#{Valid_LI})li$/, '\1')
191 s2m = STEP_2_MAPS.dup
194 s2m["isation"] = "ise"
196 step_2_re = Regexp.union(s2m.keys.map {|r| Regexp.new(r + "$")})
206 def step_3(r2, gb_english = false)
207 if self =~ /ative$/ and r2 =~ /ative/
208 self.dup.sub(/ative$/, '')
210 s3m = STEP_3_MAPS.dup
214 step_3_re = Regexp.union(s3m.keys.map {|r| Regexp.new(r + "$")})
224 def step_4(r2, gb_english = false)
225 s4m = STEP_4_MAPS.dup
229 suffixes = s4m.keys.sort_by {|s| s.length}.reverse
231 if r2 =~ /#{s}/ and self =~ /#{s}$/
240 if self =~ /ll$/ and r2 =~ /l/
241 self.dup.sub(/ll$/, 'l')
242 elsif self =~ /e$/ and (r2 =~ /e/ or r1 =~ /#{SHORT_SYLLABLE}e/ )
243 self.dup.sub(/e$/, '')
250 def porter2_postprocess
251 self.dup.gsub(/Y/, 'y')
255 def porter2_stem(gb_english = false)
256 word = self.porter2_preprocess
258 if SPECIAL_CASES.has_key? word
264 w1a = word.step_0.step_1a.step_1b(gb_english)
265 if STEP_1A_SPECIAL_CASES.include? w1a
268 w1a.step_1c.step_2(gb_english).step_3(r2, gb_english).step_4(r2, gb_english).step_5(r1, r2)
274 alias stem porter2_stem
278 # Add stem method to all Strings
282 private :porter2_preprocess, :porter2_r1, :porter2_r2