#! /local/ruby/bin/ruby
# coding: utf-8

# Porter stemmer in Ruby.
#
# This is the Porter 2 stemming algorithm, as described at
# http://snowball.tartarus.org/algorithms/english/stemmer.html
# The original paper is:
#
#   Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
#   no. 3, pp 130-137,
#
# Terminology used throughout:
#   * R1 is the part of the word after the first non-vowel that follows a
#     vowel; R2 is the same region computed inside R1.
#   * A suffix of the word is "in R1" (or "in R2") exactly when the region
#     string ends with that suffix.
#   * During stemming a "y" that acts as a consonant is written "Y"; it is
#     turned back into "y" by porter2_postprocess.
#
# None of the step methods modify their receiver; each returns the resulting
# word (the receiver itself when nothing changed).

module Stemmable

  C = "[^aeiouy]"             # consonant
  V = "[aeiouy]"              # vowel
  CW = "[^aeiouywxY]"         # a non-vowel other than w, x, or Y
  Double = "bb|dd|ff|gg|mm|nn|pp|rr|tt"
  Valid_LI = "[cdeghkmnrt]"
  SHORT_SYLLABLE = "((#{C}#{V}#{CW})|(^#{V}#{C}))"

  # The "li" ending is dealt with in step_2 itself. "logi" stands for the
  # rule "ogi -> og when preceded by l".
  STEP_2_MAPS = {"tional" => "tion",
                 "enci" => "ence",
                 "anci" => "ance",
                 "abli" => "able",
                 "entli" => "ent",
                 "ization" => "ize",
                 "izer" => "ize",
                 "ational" => "ate",
                 "ation" => "ate",
                 "ator" => "ate",
                 "alism" => "al",
                 "aliti" => "al",
                 "alli" => "al",
                 "fulness" => "ful",
                 "ousli" => "ous",
                 "ousness" => "ous",
                 "iveness" => "ive",
                 "iviti" => "ive",
                 "biliti" => "ble",
                 "bli" => "ble",
                 "fulli" => "ful",
                 "lessli" => "less",
                 "logi" => "log"
                }.freeze

  # The "ative" ending is dealt with in step_3 itself.
  STEP_3_MAPS = {"tional" => "tion",
                 "ational" => "ate",
                 "alize" => "al",
                 "icate" => "ic",
                 "iciti" => "ic",
                 "ical" => "ic",
                 "ful" => "",
                 "ness" => ""
                }.freeze

  # "sion" and "tion" stand for the rule "ion -> '' when preceded by s or t".
  STEP_4_MAPS = {"al" => "",
                 "ance" => "",
                 "ence" => "",
                 "er" => "",
                 "ic" => "",
                 "able" => "",
                 "ible" => "",
                 "ant" => "",
                 "ement" => "",
                 "ment" => "",
                 "ent" => "",
                 "ism" => "",
                 "ate" => "",
                 "iti" => "",
                 "ous" => "",
                 "ive" => "",
                 "ize" => "",
                 "sion" => "s",
                 "tion" => "t"
                }.freeze

  # Whole words whose stem is looked up rather than computed.
  SPECIAL_CASES = {"skis" => "ski",
                   "skies" => "sky",
                   "dying" => "die",
                   "lying" => "lie",
                   "tying" => "tie",
                   "idly" => "idl",
                   "gently" => "gentl",
                   "ugly" => "ugli",
                   "early" => "earli",
                   "only" => "onli",
                   "singly" => "singl",
                   "sky" => "sky",
                   "news" => "news",
                   "howe" => "howe",
                   "atlas" => "atlas",
                   "cosmos" => "cosmos",
                   "bias" => "bias",
                   "andes" => "andes"
                  }.freeze

  # Words that are left alone if they are what step 1a produces.
  STEP_1A_SPECIAL_CASES = %w[ inning outing canning herring earring proceed exceed succeed ].freeze

  # Normalise the word before stemming: strip, downcase, unify apostrophes,
  # drop leading apostrophes and mark consonant-like y as Y.
  # Returns a new String; the receiver is untouched.
  def porter2_preprocess
    w = to_s.strip.downcase

    # map apostrophe-like characters to apostrophes
    w.gsub!(/‘/, "'")
    w.gsub!(/’/, "'")

    # remove any initial apostrophe
    w.sub!(/\A'+/, "")

    # set initial y, or y after a vowel, to Y
    w.sub!(/\Ay/, "Y")
    w.gsub!(/(#{V})y/, '\1Y')

    w
  end

  # R1: the word after the first non-vowel after the first vowel, or "" if
  # there is no such position. Words starting with gener, commun or arsen
  # have R1 starting immediately after that prefix.
  def porter2_r1
    if self =~ /\A(?:gener|commun|arsen)(?<r1>.*)/
      Regexp.last_match(:r1)
    else
      self =~ /#{V}#{C}(?<r1>.*)\z/
      Regexp.last_match(:r1) || ""
    end
  end

  # R2: R1 of R1, i.e. the part of R1 after the first non-vowel after the
  # first vowel within R1, or "" if there is none.
  def porter2_r2
    porter2_r1 =~ /#{V}#{C}(?<r2>.*)\z/
    Regexp.last_match(:r2) || ""
  end

  # True if the word ends in a short syllable (see SHORT_SYLLABLE).
  def porter2_ends_with_short_syllable?
    self =~ /#{SHORT_SYLLABLE}\z/ ? true : false
  end

  # True if the word ends in a short syllable and the given R1 is empty.
  def porter2_is_short_word?(r1)
    porter2_ends_with_short_syllable? && r1.empty?
  end

  # Step 0: remove the longest of the suffixes ', 's and 's'.
  def step_0
    sub(/(?:'s'|'s|')\z/, '')
  end

  # Step 1a: remove plural-style suffixes.
  #   sses      -> ss
  #   ied, ies  -> i if preceded by more than one letter, otherwise ie
  #   us, ss    -> unchanged
  #   s         -> removed if a vowel occurs earlier than the letter before it
  def step_1a
    return chomp("es") if end_with?("sses")

    ie = /(?:ied|ies)\z/.match(self)
    if ie
      head = ie.pre_match
      return head + (head.length > 1 ? "i" : "ie")
    end

    return self if end_with?("us", "ss")
    return chop if self =~ /#{V}.+s\z/
    self
  end

  # Step 1b: handle eed/eedly and ed/edly/ing/ingly.
  #
  # r1         - R1 of the receiver (a String).
  # gb_english - when true, also restore the "e" after a stem ending in "is".
  #
  # Only the longest matching suffix is considered, so a word ending in
  # "eed" is never treated as ending in "ed".
  def step_1b(r1, gb_english = false)
    eed = /(?:eedly|eed)\z/.match(self)
    if eed
      # replace by "ee" only if the suffix lies in R1
      return r1.end_with?(eed[0]) ? eed.pre_match + "ee" : self
    end

    ending = /(?:ingly|edly|ing|ed)\z/.match(self)
    return self unless ending

    stem = ending.pre_match
    # the part before the suffix must contain a vowel
    return self unless stem =~ /#{V}/

    if stem =~ /(?:at|bl|iz)\z/ || (gb_english && stem.end_with?("is"))
      stem + "e"
    elsif stem =~ /(?:#{Double})\z/
      stem.chop
    elsif stem.porter2_is_short_word?(stem.porter2_r1)
      stem + "e"
    else
      stem
    end
  end

  # Step 1c: replace a final y or Y by i when it directly follows a
  # non-vowel that is not the first letter of the word.
  def step_1c
    self =~ /.#{C}[yY]\z/ ? chop + "i" : self
  end

  # Step 2: find the longest suffix from STEP_2_MAPS (plus "li") and apply
  # its replacement if the suffix is in R1. R1 is computed from the receiver.
  #
  # gb_english - when true, also map "iser" and "isation" to "ise".
  def step_2(gb_english = false)
    maps = STEP_2_MAPS
    maps = maps.merge("iser" => "ise", "isation" => "ise") if gb_english

    suffix = porter2_longest_suffix(maps.keys + ["li"])
    return self unless suffix

    r1 = porter2_r1
    stem = chomp(suffix)

    if suffix == "li"
      # delete li only after a valid li-ending
      r1.end_with?("li") && stem =~ /#{Valid_LI}\z/ ? stem : self
    else
      # for the l+ogi rule only "ogi" has to lie in R1
      in_region = suffix == "logi" ? "ogi" : suffix
      r1.end_with?(in_region) ? stem + maps[suffix] : self
    end
  end

  # Step 3: find the longest suffix from STEP_3_MAPS (plus "ative") and
  # apply its replacement if it is in R1; "ative" must additionally be in R2.
  #
  # r2         - R2 of the receiver (a String).
  # gb_english - when true, also map "alise" to "al".
  def step_3(r2, gb_english = false)
    maps = STEP_3_MAPS
    maps = maps.merge("alise" => "al") if gb_english

    suffix = porter2_longest_suffix(maps.keys + ["ative"])
    return self unless suffix && porter2_r1.end_with?(suffix)

    if suffix == "ative"
      r2.end_with?(suffix) ? chomp(suffix) : self
    else
      chomp(suffix) + maps[suffix]
    end
  end

  # Step 4: find the longest suffix from STEP_4_MAPS and delete it if it is
  # in R2. If the longest suffix is not in R2 nothing is removed, even when
  # a shorter listed suffix would be.
  #
  # r2         - R2 of the receiver (a String).
  # gb_english - when true, "ise" is deleted as well.
  def step_4(r2, gb_english = false)
    maps = STEP_4_MAPS
    maps = maps.merge("ise" => "") if gb_english

    suffix = porter2_longest_suffix(maps.keys)
    return self unless suffix

    # for sion/tion only the "ion" part has to lie in R2
    in_region = %w[sion tion].include?(suffix) ? "ion" : suffix
    r2.end_with?(in_region) ? chomp(suffix) + maps[suffix] : self
  end

  # Step 5: drop a final e if it is in R2, or in R1 and not preceded by a
  # short syllable; drop a final l if it is in R2 and preceded by l.
  #
  # r1, r2 - R1 and R2 of the receiver (Strings).
  def step_5(r1, r2)
    if end_with?("e")
      stem = chop
      if r2.end_with?("e") || (r1.end_with?("e") && !stem.porter2_ends_with_short_syllable?)
        stem
      else
        self
      end
    elsif end_with?("ll") && r2.end_with?("l")
      chop
    else
      self
    end
  end

  # Turn the consonant marker Y back into y.
  def porter2_postprocess
    tr("Y", "y")
  end

  # Return the Porter 2 stem of the word as a new String.
  #
  # gb_english - when true, British "-ise" spellings are handled like their
  #              "-ize" counterparts.
  #
  # R1 and R2 are recomputed from the current word before every step that
  # needs them, because earlier steps change the end of the word.
  def porter2_stem(gb_english = false)
    word = porter2_preprocess
    # dup so that callers cannot modify the lookup table through the result
    return SPECIAL_CASES[word].dup if SPECIAL_CASES.key?(word)
    # words of two letters or fewer are left as they are
    return word.porter2_postprocess if word.length <= 2

    word = word.step_0.step_1a
    return word.porter2_postprocess if STEP_1A_SPECIAL_CASES.include?(word)

    word = word.step_1b(word.porter2_r1, gb_english)
    word = word.step_1c.step_2(gb_english)
    word = word.step_3(word.porter2_r2, gb_english)
    word = word.step_4(word.porter2_r2, gb_english)
    word.step_5(word.porter2_r1, word.porter2_r2).porter2_postprocess
  end

  alias stem porter2_stem

  private

  # The longest of the given suffixes that the word ends with, or nil.
  def porter2_longest_suffix(suffixes)
    suffixes.select { |s| end_with?(s) }.max_by { |s| s.length }
  end

end

# Add stem method to all Strings
class String
  include Stemmable

  # porter2_r1 and porter2_r2 have to stay public: the stemmer calls them on
  # intermediate strings, which Ruby does not allow for private methods.
  private :porter2_preprocess
end