Gemified
[porter2stemmer.git] / lib / porter2stemmer / constants.rb
diff --git a/lib/porter2stemmer/constants.rb b/lib/porter2stemmer/constants.rb
new file mode 100644 (file)
index 0000000..f123bc5
--- /dev/null
@@ -0,0 +1,114 @@
+# coding: utf-8\r
+\r
+# Constants for the Porter 2 stemmer\r
+module Porter2\r
+\r
+  # A non-vowel: any character other than a e i o u y (so upper-case Y is a non-vowel)\r
+  C = "[^aeiouy]".freeze\r
+\r
+  # A vowel: a e i o u y\r
+  V = "[aeiouy]".freeze\r
+\r
+  # A non-vowel other than w, x, or Y\r
+  CW = "[^aeiouywxY]".freeze\r
+\r
+  # Doubles created when adding a suffix: these are undoubled when stemmed\r
+  Double = "(bb|dd|ff|gg|mm|nn|pp|rr|tt)".freeze\r
+\r
+  # A valid letter that can come before 'li' (or 'ly')\r
+  Valid_LI = "[cdeghkmnrt]".freeze\r
+\r
+  # A specification for a short syllable, built from the C, V and CW patterns.\r
+  #\r
+  # A short syllable in a word is either:\r
+  # 1. a vowel followed by a non-vowel other than w, x or Y and preceded by a non-vowel, or\r
+  # 2. a vowel at the beginning of the word followed by a non-vowel.\r
+  #\r
+  # (The original document is silent on whether sequences of two or more non-vowels make a\r
+  # syllable long. But as this specification is only used to find sequences of non-vowel -\r
+  # vowel - non-vowel - end-of-word, this ambiguity does not have an effect.)\r
+  SHORT_SYLLABLE = "((#{C}#{V}#{CW})|(^#{V}#{C}))".freeze\r
+\r
+  # Suffix transformations used in porter2_step2, as suffix => replacement.\r
+  # (ogi, li endings dealt with in procedure)\r
+  STEP_2_MAPS = {"tional" => "tion",\r
+                 "enci" => "ence",\r
+                 "anci" => "ance",\r
+                 "abli" => "able",\r
+                 "entli" => "ent",\r
+                 "ization" => "ize",\r
+                 "izer" => "ize",\r
+                 "ational" => "ate",\r
+                 "ation" => "ate",\r
+                 "ator" => "ate",\r
+                 "alism" => "al",\r
+                 "aliti" => "al",\r
+                 "alli" => "al",\r
+                 "fulness" => "ful",\r
+                 "ousli" => "ous",\r
+                 "ousness" => "ous",\r
+                 "iveness" => "ive",\r
+                 "iviti" => "ive",\r
+                 "biliti" => "ble",\r
+                 "bli" => "ble",\r
+                 "fulli" => "ful",\r
+                 "lessli" => "less" }.freeze\r
+\r
+  # Suffix transformations used in porter2_step3, as suffix => replacement.\r
+  # (ative ending dealt with in procedure)\r
+  STEP_3_MAPS = {"tional" => "tion",\r
+                 "ational" => "ate",\r
+                 "alize" => "al",\r
+                 "icate" => "ic",\r
+                 "iciti" => "ic",\r
+                 "ical" => "ic",\r
+                 "ful" => "",\r
+                 "ness" => "" }.freeze\r
+\r
+  # Suffix transformations used in porter2_step4: every listed suffix maps to "".\r
+  # (ion ending dealt with in procedure)\r
+  STEP_4_MAPS = {"al" => "",\r
+                 "ance" => "",\r
+                 "ence" => "",\r
+                 "er" => "",\r
+                 "ic" => "",\r
+                 "able" => "",\r
+                 "ible" => "",\r
+                 "ant" => "",\r
+                 "ement" => "",\r
+                 "ment" => "",\r
+                 "ent" => "",\r
+                 "ism" => "",\r
+                 "ate" => "",\r
+                 "iti" => "",\r
+                 "ous" => "",\r
+                 "ive" => "",\r
+                 "ize" => "" }.freeze\r
+\r
+  # Special-case stemmings: whole words mapped directly to a fixed stem\r
+  SPECIAL_CASES = {"skis" => "ski",\r
+                   "skies" => "sky",\r
+\r
+                   "dying" => "die",\r
+                   "lying" => "lie",\r
+                   "tying" => "tie",\r
+                   "idly" => "idl",\r
+                   "gently" => "gentl",\r
+                   "ugly" => "ugli",\r
+                   "early" => "earli",\r
+                   "only" => "onli",\r
+                   "singly" => "singl",\r
+                   # Words whose stem is the word itself\r
+                   "sky" => "sky",\r
+                   "news" => "news",\r
+                   "howe" => "howe",\r
+                   "atlas" => "atlas",\r
+                   "cosmos" => "cosmos",\r
+                   "bias" => "bias",\r
+                   "andes" => "andes" }.freeze\r
+\r
+  # Special case words to stop processing after step 1a.\r
+  STEP_1A_SPECIAL_CASES = %w[inning outing canning herring earring proceed exceed succeed].freeze\r
+\r
+end\r
+\r