Updated the docs
[porter2stemmer.git] / lib / porter2.rb~
diff --git a/lib/porter2.rb~ b/lib/porter2.rb~
deleted file mode 100644 (file)
index cdc0395..0000000
+++ /dev/null
@@ -1,283 +0,0 @@
-#! /local/ruby/bin/ruby\r
-# coding: utf-8\r
-\r
-# Porter stemmer in Ruby.\r
-#\r
-# This is the Porter 2 stemming algorithm, as described at \r
-# http://snowball.tartarus.org/algorithms/english/stemmer.html\r
-# The original paper is:\r
-#\r
-#   Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,\r
-#   no. 3, pp 130-137,\r
-\r
-module Stemmable\r
-\r
-  C = "[^aeiouy]"        # consonant\r
-  V = "[aeiouy]"         # vowel\r
-  CW = "[^aeiouywxY]"    # a non-vowel other than w, x, or Y\r
-  Double = "bb|dd|ff|gg|mm|nn|pp|rr|tt"\r
-  Valid_LI = "[cdeghkmnrt]"\r
-  SHORT_SYLLABLE = "((#{C}#{V}#{CW})|(^#{V}#{C}))"\r
-\r
-  STEP_2_MAPS = {"tional" => "tion",\r
-                "enci" => "ence",\r
-                 "anci" => "ance",\r
-                 "abli" => "able",\r
-                 "entli" => "ent",\r
-                 "ization" => "ize",\r
-                 "izer" => "ize",\r
-                 "ational" => "ate",\r
-                 "ation" => "ate",\r
-                 "ator" => "ate",\r
-                 "alism" => "al",\r
-                 "aliti" => "al",\r
-                 "alli" => "al",\r
-                 "fulness" => "ful",\r
-                 "ousli" => "ous",\r
-                 "ousness" => "ous",\r
-                 "iveness" => "ive",\r
-                 "iviti" => "ive",\r
-                 "biliti" => "ble",\r
-                 "bli" => "ble",\r
-                 "fulli" => "ful",\r
-                 "lessli" => "less",\r
-                 "logi" => "log" }\r
-                 # li ending dealt with in procedure\r
-  \r
-  STEP_3_MAPS = {"tional" => "tion",\r
-                 "ational" => "ate",\r
-                 "alize" => "al",\r
-                 "icate" => "ic",\r
-                 "iciti" => "ic",\r
-                 "ical" => "ic",\r
-                 "ful" => "",\r
-                 "ness" => "" }\r
-                 # ative ending dealt with in procedure\r
-  \r
-   STEP_4_MAPS = {"al" => "",\r
-                 "ance" => "",\r
-                 "ence" => "",\r
-                 "er" => "",\r
-                 "ic" => "",\r
-                 "able" => "",\r
-                 "ible" => "",\r
-                 "ant" => "",\r
-                 "ement" => "",\r
-                 "ment" => "",\r
-                 "ent" => "",\r
-                 "ism" => "",\r
-                 "ate" => "",\r
-                 "iti" => "",\r
-                 "ous" => "",\r
-                 "ive" => "",\r
-                 "ize" => "",\r
-                 "sion" => "s",\r
-                 "tion" => "t" }\r
-  \r
-  \r
-   SPECIAL_CASES = {"skis" => "ski",\r
-                    "skies" => "sky",\r
-                    \r
-                    "dying" => "die",\r
-                    "lying" => "lie",\r
-                    "tying" => "tie",\r
-                    "idly" =>  "idl"\r
-                    "gently" => "gentl",\r
-                    "ugly" => "ugli",\r
-                    "early" => "earli",\r
-                    "only" => "onli",\r
-                    "singly" =>"singl",\r
-                    \r
-                    "sky" => "sky",\r
-                    "news" => "news",\r
-                    "howe" => "howe",\r
-                    "atlas" => "atlas",\r
-                    "cosmos" => "cosmos",\r
-                    "bias" => "bias",\r
-                    "andes" => "andes" }\r
-   \r
-   STEP_1A_SPECIAL_CASES = %w[ inning outing canning  herring earring proceed exceed succeed ]\r
-     \r
-   \r
-  def porter2_preprocess    \r
-    w = self.to_s.strip.downcase\r
-    \r
-    # map apostrophe-like characters to apostrophes\r
-    w.gsub!(/‘/, "'")\r
-    w.gsub!(/’/, "'")\r
-    \r
-    # remove any initial apostrophe\r
-    w.gsub!(/^'*/, "")\r
-    \r
-    # set initial y, or y after a vowel, to Y\r
-    w.gsub!(/^y/, "Y")\r
-    w.gsub!(/(#{V})y/, '\1Y')\r
-    \r
-    w\r
-  end\r
-    \r
-  # The word after the first non-vowel after the first vowel\r
-  def porter2_r1\r
-    if self =~ /^(gener|commun|arsen)(?<r1>.*)/\r
-      Regexp.last_match(:r1)\r
-    else\r
-      self =~ /#{V}#{C}(?<r1>.*)$/\r
-      Regexp.last_match(:r1) || ""\r
-    end\r
-  end\r
-  \r
-  # R1 after the first non-vowel after the first vowel\r
-  def porter2_r2\r
-    self.porter2_r1 =~ /#{V}#{C}(?<r2>.*)$/\r
-    Regexp.last_match(:r2) || ""\r
-  end\r
-  \r
-  def porter2_ends_with_short_syllable?\r
-    self =~ /#{SHORT_SYLLABLE}$/ ? true : false\r
-  end\r
-  \r
-  def porter2_is_short_word?(r1)\r
-    self.porter2_ends_with_short_syllable? and r1.empty?\r
-  end\r
-  \r
-  # Remove 's suffixes\r
-  def step_0\r
-    self.sub!(/('s'|'s|')$/, '') || self\r
-  end\r
-  \r
-  # Remove plural suffixes\r
-  def step_1a\r
-    self.sub!(/sses$/, 'ss')\r
-    self.sub!(/^(.)(ies|ied)$/, '\1ie')\r
-    self.sub!(/^(.+)(ies|ied)$/, '\1i')\r
-    self.sub!(/^(ies|ied)$/, 'ie')\r
-    unless self =~ /(ss|us)$/\r
-      self.sub!(/(#{V}.+)s$/, '\1')\r
-    end\r
-    self\r
-  end\r
-  \r
-  def step_1b(r1, gb_english = false)\r
-    self.sub!(/(eed|eedly)$/, 'ee') if r1 =~ /eed|eedly/\r
-    w = self.dup\r
-    if w =~ /#{V}.*(ed|edly|ing|ingly)$/\r
-      w.sub!(/(ed|edly|ing|ingly)$/, '')\r
-      if w =~ /(at|lb|iz)$/\r
-        w += 'e' \r
-      elsif w =~ /is$/ and gb_english\r
-       w += 'e' \r
-      elsif w =~ /#{Double}$/\r
-       w.chop!\r
-      elsif w.porter2_is_short_word?(w.porter2_r1)\r
-       w += 'e'\r
-      end\r
-    end\r
-    w\r
-  end\r
-  \r
-  def step_1c\r
-    if self =~ /.+#{C}.*(y|Y)$/\r
-      self.sub(/(y|Y)$/, 'i')\r
-    else\r
-      self\r
-    end\r
-  end\r
-  \r
-\r
-  def step_2(gb_english = false)\r
-    if self =~ /(#{Valid_LI})li$/\r
-      self.dup.sub(/(#{Valid_LI})li$/, '\1')\r
-    else\r
-      s2m = STEP_2_MAPS.dup\r
-      if gb_english\r
-       s2m["iser"] = "ise"\r
-       s2m["isation"] = "ise"\r
-      end\r
-      step_2_re = Regexp.union(s2m.keys.map {|r| Regexp.new(r + "$")})\r
-      if self =~ step_2_re\r
-       $` + s2m[$&]\r
-      else\r
-       self\r
-      end\r
-    end\r
-  end\r
-     \r
-  \r
-  def step_3(r2, gb_english = false)\r
-    if self =~ /ative$/ and r2 =~ /ative/\r
-      self.dup.sub(/ative$/, '')\r
-    else\r
-      s3m = STEP_3_MAPS.dup\r
-      if gb_english\r
-       s3m["alise"] = "al"\r
-      end\r
-      step_3_re = Regexp.union(s3m.keys.map {|r| Regexp.new(r + "$")})\r
-      if self =~ step_3_re\r
-       $` + s3m[$&]\r
-      else\r
-       self\r
-      end\r
-    end\r
-  end\r
-  \r
-  \r
-  def step_4(r2, gb_english = false)\r
-    s4m = STEP_4_MAPS.dup\r
-    if gb_english\r
-      s4m["ise"] = ""\r
-    end\r
-    suffixes = s4m.keys.sort_by {|s| s.length}.reverse\r
-    suffixes.each do |s|\r
-      if r2 =~ /#{s}/ and self =~ /#{s}$/ \r
-       return $` + s4m[$&]\r
-      end\r
-    end\r
-    return self\r
-  end\r
-\r
-  \r
-  def step_5(r1, r2)\r
-    if self =~ /ll$/ and r2 =~ /l/\r
-      self.dup.sub(/ll$/, 'l') \r
-    elsif self =~ /e$/ and (r2 =~ /e/ or r1 =~ /#{SHORT_SYLLABLE}e/ )\r
-      self.dup.sub(/e$/, '') \r
-    else\r
-      self\r
-    end\r
-  end\r
-  \r
-  \r
-  def porter2_postprocess\r
-    self.dup.gsub(/Y/, 'y')\r
-  end\r
-\r
-  \r
-  def porter2_stem(gb_english = false)\r
-    word = self.porter2_preprocess\r
-    \r
-    if SPECIAL_CASES.has_key? word\r
-      SPECIAL_CASES[word]\r
-    else\r
-      r1 = word.porter2_r1\r
-      r2 = word.porter2_r2\r
-    \r
-      w1a = word.step_0.step_1a.step_1b(gb_english)\r
-      if STEP_1A_SPECIAL_CASES.include? w1a \r
-       w1a\r
-      else\r
-        w1a.step_1c.step_2(gb_english).step_3(r2, gb_english).step_4(r2, gb_english).step_5(r1, r2)\r
-      end\r
-    end\r
-  end  \r
-  \r
-  \r
-  alias stem porter2_stem\r
-\r
-end\r
-\r
-# Add stem method to all Strings\r
-class String\r
-  include Stemmable\r
-  \r
-  private :porter2_preprocess, :porter2_r1, :porter2_r2\r
-end\r