lib/porter2.rb

   1 # coding: utf-8
   2
   3 require 'porter2_module'
   4
   5 # ==The Porter 2 stemmer
   6 #
   7 # This is the Porter 2 stemming algorithm, as described at
   8 # http://snowball.tartarus.org/algorithms/english/stemmer.html
   9 # The original paper is:
  10 #
  11 # Porter, 1980, "An algorithm for suffix stripping", _Program_, Vol. 14,
  12 # no. 3, pp 130-137
  13 #
  14 # Constants for the stemmer are in the Porter2 module.
  15 #
  16 # Procedures that implement the stemmer are added to the String class.
  17 #
  18 # The stemmer algorithm is implemented in the porter2_stem procedure.
  19 #
  20 # ==Internationalisation
  21 # There isn't much, as this is a stemmer that only works for English.
  22 #
  23 # The +gb_english+ flag to the various procedures allows the stemmer to treat the British
  24 # English '-ise' the same as the American English '-ize'.
  25 #
  26 # ==Longest suffixes
  27 # Several places in the algorithm require matching the longest suffix of a word. The
  28 # regexp engine in Ruby 1.9 seems to handle alterntives in regexps by finding the
  29 # alternative that matches at the first position in the string. As we're only talking
  30 # about suffixes, that first match is also the longest suffix. If the regexp engine changes,
  31 # this behaviour may change and break the stemmer.
  32
  33 class String
  34   # Tidy up the word before we get down to the algorithm
  35   def porter2_tidy
  36     preword = self.to_s.strip.downcase
  37
  38     # map apostrophe-like characters to apostrophes
  39     preword.gsub!(/‘/, "'")
  40     preword.gsub!(/’/, "'")
  41
  42     preword
  43   end
  44
  45
  46   # Preprocess the word.
  47   # Remove any initial ', if present. Then, set initial y, or y after a vowel, to Y
  48   #
  49   # (The comment to 'establish the regions R1 and R2' in the original description
  50   # is an implementation optimisation that identifies where the regions start. As
  51   # no modifications are made to the word that affect those positions, you may want
  52   # to cache them now. This implementation doesn't do that.)
  53   def porter2_preprocess
  54     w = self.dup
  55
  56     # remove any initial apostrophe
  57     w.gsub!(/^'*(.)/, '\1')
  58
  59     # set initial y, or y after a vowel, to Y
  60     w.gsub!(/^y/, "Y")
  61     w.gsub!(/(#{Porter2::V})y/, '\1Y')
  62
  63     w
  64   end
  65
  66
  67   # R1 is the portion of the word after the first non-vowel after the first vowel
  68   # (with words beginning 'gener-', 'commun-', and 'arsen-' treated as special cases
  69   def porter2_r1
  70     if self =~ /^(gener|commun|arsen)(?<r1>.*)/
  71       Regexp.last_match(:r1)
  72     else
  73       self =~ /#{Porter2::V}#{Porter2::C}(?<r1>.*)$/
  74       Regexp.last_match(:r1) || ""
  75     end
  76   end
  77
  78
  79   # R2 is the portion of R1 (porter2_r1) after the first non-vowel after the first vowel
  80   def porter2_r2
  81     self.porter2_r1 =~ /#{Porter2::V}#{Porter2::C}(?<r2>.*)$/
  82     Regexp.last_match(:r2) || ""
  83   end
  84
  85
  86   # Returns true if the word ends with a short syllable
  87   def porter2_ends_with_short_syllable?
  88     self =~ /#{Porter2::SHORT_SYLLABLE}$/ ? true : false
  89   end
  90
  91
  92   # A word is short if it ends in a short syllable, and R1 is null
  93   def porter2_is_short_word?
  94     self.porter2_ends_with_short_syllable? and self.porter2_r1.empty?
  95   end
  96
  97
  98   # Search for the longest among the suffixes,
  99   # * '
 100   # * 's
 101   # * 's'
 102   # and remove if found.
 103   def porter2_step0
 104     self.sub!(/(.)('s'|'s|')$/, '\1') || self
 105   end
 106
 107
 108   # Search for the longest among the following suffixes, and perform the action indicated.
 109   # sses:: replace by ss
 110   # ied, ies:: replace by i if preceded by more than one letter, otherwise by ie
 111   # s:: delete if the preceding word part contains a vowel not immediately before the s
 112   # us, ss:: do nothing
 113   def porter2_step1a
 114     if self =~ /sses$/
 115       self.sub(/sses$/, 'ss')
 116     elsif self =~ /..(ied|ies)$/
 117       self.sub(/(ied|ies)$/, 'i')
 118     elsif self =~ /(ied|ies)$/
 119       self.sub(/(ied|ies)$/, 'ie')
 120     elsif self =~ /(us|ss)$/
 121       self
 122     elsif self =~ /s$/
 123       if self =~ /(#{Porter2::V}.+)s$/
 124         self.sub(/s$/, '')
 125       else
 126         self
 127       end
 128     else
 129       self
 130     end
 131   end
 132
 133
 134   # Search for the longest among the following suffixes, and perform the action indicated.
 135   # eed, eedly:: replace by ee if the suffix is also in R1
 136   # ed, edly, ing, ingly:: delete if the preceding word part contains a vowel and,
 137   #                        after the deletion:
 138   #                        * if the word ends at, bl or iz: add e, or
 139   #                        * if the word ends with a double: remove the last letter, or
 140   #                        * if the word is short: add e
 141   #
 142   # (If gb_english is +true+, treat the 'is' suffix as 'iz' above.)
 143   def porter2_step1b(gb_english = false)
 144     if self =~ /(eed|eedly)$/
 145       if self.porter2_r1 =~ /(eed|eedly)$/
 146         self.sub(/(eed|eedly)$/, 'ee')
 147       else
 148         self
 149       end
 150     else
 151       w = self.dup
 152       if w =~ /#{Porter2::V}.*(ed|edly|ing|ingly)$/
 153         w.sub!(/(ed|edly|ing|ingly)$/, '')
 154         if w =~ /(at|lb|iz)$/
 155           w += 'e'
 156         elsif w =~ /is$/ and gb_english
 157           w += 'e'
 158         elsif w =~ /#{Porter2::Double}$/
 159           w.chop!
 160         elsif w.porter2_is_short_word?
 161           w += 'e'
 162         end
 163       end
 164       w
 165     end
 166   end
 167
 168
 169   # Replace a suffix of y or Y by i if it is preceded by a non-vowel which is
 170   # not the first letter of the word.
 171   def porter2_step1c
 172     if self =~ /.+#{Porter2::C}(y|Y)$/
 173       self.sub(/(y|Y)$/, 'i')
 174     else
 175       self
 176     end
 177   end
 178
 179
 180   # Search for the longest among the suffixes listed in the keys of Porter2::STEP_2_MAPS.
 181   # If one is found and that suffix occurs in R1, replace it with the value
 182   # found in STEP_2_MAPS.
 183   #
 184   # (Suffixes 'ogi' and 'li' are treated as special cases in the procedure.)
 185   #
 186   # (If gb_english is +true+, replace the 'iser' and 'isation' suffixes with
 187   # 'ise', similarly to how 'izer' and 'ization' are treated.)
 188   def porter2_step2(gb_english = false)
 189     r1 = self.porter2_r1
 190     s2m = Porter2::STEP_2_MAPS.dup
 191     if gb_english
 192       s2m["iser"] = "ise"
 193       s2m["isation"] = "ise"
 194     end
 195     step_2_re = Regexp.union(s2m.keys.map {|r| Regexp.new(r + "$")})
 196     if self =~ step_2_re
 197       if r1 =~ /#{$&}$/
 198         self.sub(/#{$&}$/, s2m[$&])
 199       else
 200         self
 201       end
 202     elsif r1 =~ /li$/ and self =~ /(#{Porter2::Valid_LI})li$/
 203       self.sub(/li$/, '')
 204     elsif r1 =~ /ogi$/ and self =~ /logi$/
 205       self.sub(/ogi$/, 'og')
 206     else
 207       self
 208     end
 209   end
 210
 211
 212   # Search for the longest among the suffixes listed in the keys of Porter2::STEP_3_MAPS.
 213   # If one is found and that suffix occurs in R1, replace it with the value
 214   # found in STEP_3_MAPS.
 215   #
 216   # (Suffix 'ative' is treated as a special case in the procedure.)
 217   #
 218   # (If gb_english is +true+, replace the 'alise' suffix with
 219   # 'al', similarly to how 'alize' is treated.)
 220   def porter2_step3(gb_english = false)
 221     if self =~ /ative$/ and self.porter2_r2 =~ /ative$/
 222       self.sub(/ative$/, '')
 223     else
 224       s3m = Porter2::STEP_3_MAPS.dup
 225       if gb_english
 226         s3m["alise"] = "al"
 227       end
 228       step_3_re = Regexp.union(s3m.keys.map {|r| Regexp.new(r + "$")})
 229       r1 = self.porter2_r1
 230       if self =~ step_3_re and r1 =~ /#{$&}$/
 231         self.sub(/#{$&}$/, s3m[$&])
 232       else
 233         self
 234       end
 235     end
 236   end
 237
 238
 239   # Search for the longest among the suffixes listed in the keys of Porter2::STEP_4_MAPS.
 240   # If one is found and that suffix occurs in R2, replace it with the value
 241   # found in STEP_4_MAPS.
 242   #
 243   # (Suffix 'ion' is treated as a special case in the procedure.)
 244   #
 245   # (If gb_english is +true+, delete the 'ise' suffix if found.)
 246   def porter2_step4(gb_english = false)
 247     if self.porter2_r2 =~ /ion$/ and self =~ /(s|t)ion$/
 248       self.sub(/ion$/, '')
 249     else
 250       s4m = Porter2::STEP_4_MAPS.dup
 251       if gb_english
 252         s4m["ise"] = ""
 253       end
 254       step_4_re = Regexp.union(s4m.keys.map {|r| Regexp.new(r + "$")})
 255       r2 = self.porter2_r2
 256       if self =~ step_4_re
 257         if r2 =~ /#{$&}/
 258           self.sub(/#{$&}$/, s4m[$&])
 259         else
 260           self
 261         end
 262       else
 263         self
 264       end
 265     end
 266   end
 267
 268
 269   # Search for the the following suffixes, and, if found, perform the action indicated.
 270   # e:: delete if in R2, or in R1 and not preceded by a short syllable
 271   # l:: delete if in R2 and preceded by l
 272   def porter2_step5
 273     if self =~ /ll$/ and self.porter2_r2 =~ /l$/
 274       self.sub(/ll$/, 'l')
 275     elsif self =~ /e$/ and self.porter2_r2 =~ /e$/
 276       self.sub(/e$/, '')
 277     else
 278       r1 = self.porter2_r1
 279       if self =~ /e$/ and r1 =~ /e$/ and not self =~ /#{Porter2::SHORT_SYLLABLE}e$/
 280         self.sub(/e$/, '')
 281       else
 282         self
 283       end
 284     end
 285   end
 286
 287
 288   # Turn all Y letters into y
 289   def porter2_postprocess
 290     self.gsub(/Y/, 'y')
 291   end
 292
 293   public
 294
 295   # Perform the stemming procedure. If +gb_english+ is true, treat '-ise' and similar suffixes
 296   # as '-ize' in American English.
 297   def porter2_stem(gb_english = false)
 298     preword = self.porter2_tidy
 299     return preword if preword.length <= 2
 300
 301     word = preword.porter2_preprocess
 302
 303     if Porter2::SPECIAL_CASES.has_key? word
 304       Porter2::SPECIAL_CASES[word]
 305     else
 306       w1a = word.porter2_step0.porter2_step1a
 307       if Porter2::STEP_1A_SPECIAL_CASES.include? w1a
 308         w1a
 309       else
 310         w1a.porter2_step1b(gb_english).porter2_step1c.porter2_step2(gb_english).porter2_step3(gb_english).porter2_step4(gb_english).porter2_step5.porter2_postprocess
 311       end
 312     end
 313   end
 314
 315   # A verbose version of porter2_stem that prints the output of each stage to STDOUT
 316   def porter2_stem_verbose(gb_english = false)
 317     preword = self.porter2_tidy
 318     puts "Preword: #{preword}"
 319     return preword if preword.length <= 2
 320
 321     word = preword.porter2_preprocess
 322     puts "Preprocessed: #{word}"
 323
 324     if Porter2::SPECIAL_CASES.has_key? word
 325       puts "Returning #{word} as special case #{Porter2::SPECIAL_CASES[word]}"
 326       Porter2::SPECIAL_CASES[word]
 327     else
 328       r1 = word.porter2_r1
 329       r2 = word.porter2_r2
 330       puts "R1 = #{r1}, R2 = #{r2}"
 331
 332       w0 = word.porter2_step0 ; puts "After step 0:  #{w0} (R1 = #{w0.porter2_r1}, R2 = #{w0.porter2_r2})"
 333       w1a = w0.porter2_step1a ; puts "After step 1a: #{w1a} (R1 = #{w1a.porter2_r1}, R2 = #{w1a.porter2_r2})"
 334
 335       if Porter2::STEP_1A_SPECIAL_CASES.include? w1a
 336         puts "Returning #{w1a} as 1a special case"
 337         w1a
 338       else
 339         w1b = w1a.porter2_step1b(gb_english) ; puts "After step 1b: #{w1b} (R1 = #{w1b.porter2_r1}, R2 = #{w1b.porter2_r2})"
 340         w1c = w1b.porter2_step1c ; puts "After step 1c: #{w1c} (R1 = #{w1c.porter2_r1}, R2 = #{w1c.porter2_r2})"
 341         w2 = w1c.porter2_step2(gb_english) ; puts "After step 2:  #{w2} (R1 = #{w2.porter2_r1}, R2 = #{w2.porter2_r2})"
 342         w3 = w2.porter2_step3(gb_english) ; puts "After step 3:  #{w3} (R1 = #{w3.porter2_r1}, R2 = #{w3.porter2_r2})"
 343         w4 = w3.porter2_step4(gb_english) ; puts "After step 4:  #{w4} (R1 = #{w4.porter2_r1}, R2 = #{w4.porter2_r2})"
 344         w5 = w4.porter2_step5 ; puts "After step 5:  #{w5}"
 345         wpost = w5.porter2_postprocess ; puts "After postprocess: #{wpost}"
 346         wpost
 347       end
 348     end
 349   end
 350
 351   alias stem porter2_stem
 352
 353 end
 354