#! /local/ruby/bin/ruby
# coding: utf-8

# Porter stemmer in Ruby.
#
# This is the Porter 2 stemming algorithm, as described at 
# http://snowball.tartarus.org/algorithms/english/stemmer.html
# The original paper is:
#
#   Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
#   no. 3, pp. 130-137.

# Mixin implementing the Porter2 ("Snowball English") stemming algorithm.
#
# Include it in String (or a String-like class) and call #stem. Every step
# is non-destructive: it returns the transformed word and leaves the
# receiver untouched, so frozen strings can be stemmed safely.
#
# Terminology used below:
#   R1 - the part of the word after the first non-vowel that follows a vowel
#   R2 - the same rule applied again inside R1
#   Y  - marker for a "y" that acts as a consonant (set by the preprocessor)
module Stemmable

  C = "[^aeiouy]"        # consonant (the marker letter Y counts as one)
  V = "[aeiouy]"         # vowel
  CW = "[^aeiouywxY]"    # a non-vowel other than w, x, or Y
  Double = "bb|dd|ff|gg|mm|nn|pp|rr|tt"
  Valid_LI = "[cdeghkmnrt]"
  SHORT_SYLLABLE = "((#{C}#{V}#{CW})|(^#{V}#{C}))"

  # Precompiled forms of the patterns above, anchored where the algorithm
  # needs them anchored.
  VOWEL_THEN_CONSONANT = /#{V}#{C}/
  ENDS_WITH_SHORT_SYLLABLE = /#{SHORT_SYLLABLE}$/
  ENDS_WITH_DOUBLE = /(?:#{Double})\z/
  ENDS_WITH_VALID_LI = /#{Valid_LI}li\z/

  STEP_2_MAPS = {"tional" => "tion",
                 "enci" => "ence",
                 "anci" => "ance",
                 "abli" => "able",
                 "entli" => "ent",
                 "ization" => "ize",
                 "izer" => "ize",
                 "ational" => "ate",
                 "ation" => "ate",
                 "ator" => "ate",
                 "alism" => "al",
                 "aliti" => "al",
                 "alli" => "al",
                 "fulness" => "ful",
                 "ousli" => "ous",
                 "ousness" => "ous",
                 "iveness" => "ive",
                 "iviti" => "ive",
                 "biliti" => "ble",
                 "bli" => "ble",
                 "fulli" => "ful",
                 "lessli" => "less",
                 "logi" => "log" }.freeze
                 # li ending dealt with in procedure

  STEP_3_MAPS = {"tional" => "tion",
                 "ational" => "ate",
                 "alize" => "al",
                 "icate" => "ic",
                 "iciti" => "ic",
                 "ical" => "ic",
                 "ful" => "",
                 "ness" => "" }.freeze
                 # ative ending dealt with in procedure

  STEP_4_MAPS = {"al" => "",
                 "ance" => "",
                 "ence" => "",
                 "er" => "",
                 "ic" => "",
                 "able" => "",
                 "ible" => "",
                 "ant" => "",
                 "ement" => "",
                 "ment" => "",
                 "ent" => "",
                 "ism" => "",
                 "ate" => "",
                 "iti" => "",
                 "ous" => "",
                 "ive" => "",
                 "ize" => "",
                 "sion" => "s",
                 "tion" => "t" }.freeze

  # Words whose stem is fixed and bypasses the algorithm entirely.
  SPECIAL_CASES = {"skis" => "ski",
                   "skies" => "sky",

                   "dying" => "die",
                   "lying" => "lie",
                   "tying" => "tie",
                   "idly" => "idl",
                   "gently" => "gentl",
                   "ugly" => "ugli",
                   "early" => "earli",
                   "only" => "onli",
                   "singly" => "singl",

                   "sky" => "sky",
                   "news" => "news",
                   "howe" => "howe",
                   "atlas" => "atlas",
                   "cosmos" => "cosmos",
                   "bias" => "bias",
                   "andes" => "andes" }.freeze

  # Words that are left alone once step 1a has run.
  STEP_1A_SPECIAL_CASES = %w[ inning outing canning herring earring proceed exceed succeed ].freeze

  # --- module-level helpers -------------------------------------------------
  # These take the word as an argument instead of relying on instance
  # methods, so they work on any string regardless of how the including
  # class has set the visibility of the porter2_* methods.

  # Text after the first vowel immediately followed by a non-vowel,
  # or "" when +text+ contains no such pair.
  def self.region_after_vowel_consonant(text)
    match = VOWEL_THEN_CONSONANT.match(text)
    match ? match.post_match : ""
  end

  # R1 of +word+. Words starting gener-, commun- or arsen- are exceptions:
  # their R1 is whatever follows that prefix.
  def self.r1(word)
    prefix = /\A(?:gener|commun|arsen)/.match(word)
    prefix ? prefix.post_match : region_after_vowel_consonant(word)
  end

  # R2 of +word+: the R1 rule applied to R1.
  def self.r2(word)
    region_after_vowel_consonant(r1(word))
  end

  # The longest entry of +suffixes+ that +word+ ends with, or nil.
  def self.longest_suffix(word, suffixes)
    suffixes.select { |suffix| word.end_with?(suffix) }.max_by(&:length)
  end

  # --- instance API -----------------------------------------------------------

  # Normalise the receiver for stemming: strip whitespace, downcase, map
  # typographic apostrophes to ', drop leading apostrophes, and mark each
  # consonantal y (initial, or following a vowel) as Y.
  # Returns a new string.
  def porter2_preprocess
    w = to_s.strip.downcase
    w = w.gsub(/[‘’]/, "'")
    w = w.sub(/\A'+/, "")
    w = w.sub(/\Ay/, "Y")
    w.gsub(/(#{V})y/, '\1Y')
  end

  # The word after the first non-vowel after the first vowel
  def porter2_r1
    Stemmable.r1(self)
  end

  # R1 after the first non-vowel after the first vowel
  def porter2_r2
    Stemmable.r2(self)
  end

  # True when the receiver ends in a short syllable: a vowel between a
  # non-vowel and a non-vowel other than w/x/Y, or a word that is exactly
  # vowel + non-vowel.
  def porter2_ends_with_short_syllable?
    (self =~ ENDS_WITH_SHORT_SYLLABLE) ? true : false
  end

  # True when the receiver ends in a short syllable and +r1+ (its R1) is empty.
  def porter2_is_short_word?(r1)
    porter2_ends_with_short_syllable? && r1.empty?
  end

  # Remove 's suffixes ('s', 's or a bare trailing ').
  def step_0
    sub(/(?:'s'|'s|')\z/, "")
  end

  # Remove plural suffixes:
  #   sses -> ss; ies/ied -> i (ie when at most one letter precedes);
  #   a final s is dropped when a vowel occurs earlier than the letter
  #   just before it; words in us/ss are left alone.
  def step_1a
    return self[0...-2] if end_with?("sses")

    ies = /(?:ies|ied)\z/.match(self)
    if ies
      stem = ies.pre_match
      return stem + (stem.length > 1 ? "i" : "ie")
    end

    return self if end_with?("us", "ss")

    self =~ /#{V}.+s\z/ ? chop : self
  end

  # Handle -eed/-eedly and -ed/-edly/-ing/-ingly.
  #
  # r1         - R1 of the receiver.
  # gb_english - when true, a stem left ending in "is" also regains its e.
  def step_1b(r1, gb_english = false)
    if self =~ /(?:eed|eedly)\z/
      # eed/eedly is the longest match, so the -ed rule below must never see
      # these words (otherwise "feed" would lose its ending).
      return r1.to_s =~ /(?:eed|eedly)\z/ ? sub(/(?:eed|eedly)\z/, "ee") : self
    end

    ending = /(?:ed|edly|ing|ingly)\z/.match(self)
    return self unless ending && ending.pre_match =~ /#{V}/

    stem = ending.pre_match
    if stem =~ /(?:at|bl|iz)\z/ || (gb_english && stem.end_with?("is"))
      stem + "e"
    elsif stem =~ ENDS_WITH_DOUBLE
      stem.chop
    elsif stem =~ ENDS_WITH_SHORT_SYLLABLE && Stemmable.r1(stem).empty?
      stem + "e"   # short word: restore the e ("hop" -> "hope")
    else
      stem
    end
  end

  # Replace a final y/Y by i when the letter before it is a non-vowel that
  # is not the first letter of the word (cry -> cri, but by and say stay).
  def step_1c
    self =~ /.#{C}[yY]\z/ ? chop + "i" : self
  end

  # Apply the STEP_2_MAPS replacement (or the li rule) for the longest
  # matching suffix, provided that suffix lies in R1.
  #
  # gb_english - when true, iser/isation are treated like izer/ization.
  def step_2(gb_english = false)
    mappings = STEP_2_MAPS.merge("li" => "")
    mappings = mappings.merge("iser" => "ise", "isation" => "ise") if gb_english

    suffix = Stemmable.longest_suffix(self, mappings.keys)
    return self unless suffix

    # "logi" encodes the rule "ogi -> og when preceded by l": only the ogi
    # part has to lie in R1.
    required = suffix == "logi" ? "ogi" : suffix
    return self unless porter2_r1.end_with?(required)
    # li is only removed after a valid li-ending letter.
    return self if suffix == "li" && self !~ ENDS_WITH_VALID_LI

    self[0...-suffix.length] + mappings[suffix]
  end

  # Apply the STEP_3_MAPS replacement for the longest matching suffix when it
  # lies in R1; "ative" is removed only when it also lies in R2.
  #
  # r2         - R2 of the receiver.
  # gb_english - when true, alise is treated like alize.
  def step_3(r2, gb_english = false)
    mappings = STEP_3_MAPS.merge("ative" => "")
    mappings = mappings.merge("alise" => "al") if gb_english

    suffix = Stemmable.longest_suffix(self, mappings.keys)
    return self unless suffix && porter2_r1.end_with?(suffix)
    return self if suffix == "ative" && !r2.to_s.end_with?("ative")

    self[0...-suffix.length] + mappings[suffix]
  end

  # Delete the longest STEP_4_MAPS suffix when it lies in R2. If the longest
  # suffix is outside R2 nothing happens; shorter suffixes are not tried.
  #
  # r2         - R2 of the receiver.
  # gb_english - when true, ise is deleted like ize.
  def step_4(r2, gb_english = false)
    mappings = gb_english ? STEP_4_MAPS.merge("ise" => "") : STEP_4_MAPS

    suffix = Stemmable.longest_suffix(self, mappings.keys)
    return self unless suffix

    # sion/tion encode "delete ion when preceded by s or t": only the ion
    # part has to lie in R2.
    required = suffix =~ /\A[st]ion\z/ ? "ion" : suffix
    return self unless r2.to_s.end_with?(required)

    self[0...-suffix.length] + mappings[suffix]
  end

  # Drop a final e that is in R2, or in R1 and not preceded by a short
  # syllable; reduce a final ll to l when the last l is in R2.
  #
  # r1, r2 - R1 and R2 of the receiver.
  def step_5(r1, r2)
    r1 = r1.to_s
    r2 = r2.to_s
    if end_with?("e")
      removable = r2.end_with?("e") ||
                  (r1.end_with?("e") && chop !~ ENDS_WITH_SHORT_SYLLABLE)
      removable ? chop : self
    elsif end_with?("ll") && r2.end_with?("l")
      chop
    else
      self
    end
  end

  # Turn the consonant marker Y back into y.
  def porter2_postprocess
    tr("Y", "y")
  end

  # Return the Porter2 stem of the receiver.
  #
  # gb_english - when true, British -ise/-isation spellings are stemmed like
  #              their -ize/-ization counterparts.
  def porter2_stem(gb_english = false)
    word = porter2_preprocess
    # Words of one or two letters are never stemmed.
    return word.porter2_postprocess if word.length <= 2
    return SPECIAL_CASES[word] if SPECIAL_CASES.key?(word)

    word = word.step_0.step_1a
    return word.porter2_postprocess if STEP_1A_SPECIAL_CASES.include?(word)

    # R1/R2 are re-derived from the current word before each step that needs
    # them, because earlier steps shorten or extend the word.
    word = word.step_1b(Stemmable.r1(word), gb_english).step_1c.step_2(gb_english)
    word = word.step_3(Stemmable.r2(word), gb_english)
    word = word.step_4(Stemmable.r2(word), gb_english)
    word.step_5(Stemmable.r1(word), Stemmable.r2(word)).porter2_postprocess
  end


  alias stem porter2_stem

end

# Add stem method to all Strings
# Add the stemming methods (notably #stem) to all Strings.
class String
  include Stemmable

  # Only the preprocessor is hidden. porter2_r1 and porter2_r2 stay public so
  # they can be called on any string (for example an intermediate stem):
  # Ruby rejects a private method call that has an explicit receiver, which
  # would make such calls raise NoMethodError.
  private :porter2_preprocess
end