X-Git-Url: https://git.njae.me.uk/?a=blobdiff_plain;f=lib%2Fporter2_constants.rb;fp=lib%2Fporter2_constants.rb;h=f123bc5c0716d96f52e10c21f71a15a26c81888c;hb=074ce0bade4a2e3ab2210624ba598cd5edd0bec8;hp=0000000000000000000000000000000000000000;hpb=066fa6953e201272b93df3b32f25fff2e18c4ec7;p=porter2stemmer.git

diff --git a/lib/porter2_constants.rb b/lib/porter2_constants.rb
new file mode 100644
index 0000000..f123bc5
--- /dev/null
+++ b/lib/porter2_constants.rb
@@ -0,0 +1,114 @@
+# coding: utf-8
+
+# Constants for the Porter 2 stemmer
+module Porter2
+
+  # Regexp fragment matching a non-vowel
+  C = "[^aeiouy]".freeze
+
+  # Regexp fragment matching a vowel: a e i o u y
+  V = "[aeiouy]".freeze
+
+  # Regexp fragment matching a non-vowel other than w, x, or Y
+  CW = "[^aeiouywxY]".freeze
+
+  # Doubles created when adding a suffix: these are undoubled when stemmed
+  Double = "(bb|dd|ff|gg|mm|nn|pp|rr|tt)".freeze
+
+  # A valid letter that can come before 'li' (or 'ly')
+  Valid_LI = "[cdeghkmnrt]".freeze
+
+  # A specification for a short syllable, built from the C, V and CW fragments.
+  #
+  # A short syllable in a word is either:
+  # 1. a vowel followed by a non-vowel other than w, x or Y and preceded by a non-vowel, or
+  # 2. a vowel at the beginning of the word followed by a non-vowel.
+  #
+  # (The original document is silent on whether sequences of two or more non-vowels make a
+  # syllable long. But as this specification is only used to find sequences of non-vowel -
+  # vowel - non-vowel - end-of-word, this ambiguity does not have an effect.)
+  SHORT_SYLLABLE = "((#{C}#{V}#{CW})|(^#{V}#{C}))".freeze
+
+  # Suffix transformations used in porter2_step2, as suffix => replacement.
+  # (ogi, li endings dealt with in procedure)
+  STEP_2_MAPS = {"tional" => "tion",
+                 "enci" => "ence",
+                 "anci" => "ance",
+                 "abli" => "able",
+                 "entli" => "ent",
+                 "ization" => "ize",
+                 "izer" => "ize",
+                 "ational" => "ate",
+                 "ation" => "ate",
+                 "ator" => "ate",
+                 "alism" => "al",
+                 "aliti" => "al",
+                 "alli" => "al",
+                 "fulness" => "ful",
+                 "ousli" => "ous",
+                 "ousness" => "ous",
+                 "iveness" => "ive",
+                 "iviti" => "ive",
+                 "biliti" => "ble",
+                 "bli" => "ble",
+                 "fulli" => "ful",
+                 "lessli" => "less" }.freeze
+
+  # Suffix transformations used in porter2_step3, as suffix => replacement.
+  # (ative ending dealt with in procedure)
+  STEP_3_MAPS = {"tional" => "tion",
+                 "ational" => "ate",
+                 "alize" => "al",
+                 "icate" => "ic",
+                 "iciti" => "ic",
+                 "ical" => "ic",
+                 "ful" => "",
+                 "ness" => "" }.freeze
+
+  # Suffix transformations used in porter2_step4; every suffix maps to "" (deleted).
+  # (ion ending dealt with in procedure)
+  STEP_4_MAPS = {"al" => "",
+                 "ance" => "",
+                 "ence" => "",
+                 "er" => "",
+                 "ic" => "",
+                 "able" => "",
+                 "ible" => "",
+                 "ant" => "",
+                 "ement" => "",
+                 "ment" => "",
+                 "ent" => "",
+                 "ism" => "",
+                 "ate" => "",
+                 "iti" => "",
+                 "ous" => "",
+                 "ive" => "",
+                 "ize" => "" }.freeze
+
+  # Special-case stemmings, as word => stem (the last group maps words to themselves)
+  SPECIAL_CASES = {"skis" => "ski",
+                   "skies" => "sky",
+
+                   "dying" => "die",
+                   "lying" => "lie",
+                   "tying" => "tie",
+                   "idly" => "idl",
+                   "gently" => "gentl",
+                   "ugly" => "ugli",
+                   "early" => "earli",
+                   "only" => "onli",
+                   "singly" => "singl",
+
+                   "sky" => "sky",
+                   "news" => "news",
+                   "howe" => "howe",
+                   "atlas" => "atlas",
+                   "cosmos" => "cosmos",
+                   "bias" => "bias",
+                   "andes" => "andes" }.freeze
+
+  # Special case words to stop processing after step 1a.
+  STEP_1A_SPECIAL_CASES = %w[ inning outing canning herring earring proceed exceed succeed ].freeze
+
+end
+