# coding: utf-8

# Recovered from the gitweb blobdiff of lib/porter2_constants.rb (blob f123bc5)
# in porter2stemmer.git; the diff header and leading "+" markers were removed.

# Constants for the Porter 2 stemmer.
#
# The pattern fragments below are plain strings (not Regexp objects) so that
# they can be interpolated into larger regular expressions. The fragments and
# the lookup containers are frozen so the shared tables cannot be modified in
# place by accident. The replacement strings stored as hash values are left
# unfrozen, in case calling code mutates a stem it has looked up.
module Porter2
  # A non-vowel
  C = "[^aeiouy]".freeze

  # A vowel: a e i o u y
  V = "[aeiouy]".freeze

  # A non-vowel other than w, x, or Y
  CW = "[^aeiouywxY]".freeze

  # Doubles created when adding a suffix: these are undoubled when stemmed
  Double = "(bb|dd|ff|gg|mm|nn|pp|rr|tt)".freeze

  # A valid letter that can come before 'li' (or 'ly')
  Valid_LI = "[cdeghkmnrt]".freeze

  # A specification for a short syllable.
  #
  # A short syllable in a word is either:
  # 1. a vowel followed by a non-vowel other than w, x or Y and preceded by
  #    a non-vowel, or
  # 2. a vowel at the beginning of the word followed by a non-vowel.
  #
  # (The original document is silent on whether sequences of two or more
  # non-vowels make a syllable long. But as this specification is only used to
  # find sequences of non-vowel - vowel - non-vowel - end-of-word, this
  # ambiguity does not have an effect.)
  SHORT_SYLLABLE = "((#{C}#{V}#{CW})|(^#{V}#{C}))".freeze

  # Suffix transformations used in porter2_step2, as suffix => replacement.
  # (ogi, li endings dealt with in procedure)
  STEP_2_MAPS = {
    "tional" => "tion",
    "enci" => "ence",
    "anci" => "ance",
    "abli" => "able",
    "entli" => "ent",
    "ization" => "ize",
    "izer" => "ize",
    "ational" => "ate",
    "ation" => "ate",
    "ator" => "ate",
    "alism" => "al",
    "aliti" => "al",
    "alli" => "al",
    "fulness" => "ful",
    "ousli" => "ous",
    "ousness" => "ous",
    "iveness" => "ive",
    "iviti" => "ive",
    "biliti" => "ble",
    "bli" => "ble",
    "fulli" => "ful",
    "lessli" => "less"
  }.freeze

  # Suffix transformations used in porter2_step3, as suffix => replacement.
  # An empty replacement means the suffix is removed.
  # (ative ending dealt with in procedure)
  STEP_3_MAPS = {
    "tional" => "tion",
    "ational" => "ate",
    "alize" => "al",
    "icate" => "ic",
    "iciti" => "ic",
    "ical" => "ic",
    "ful" => "",
    "ness" => ""
  }.freeze

  # Suffix transformations used in porter2_step4; every listed suffix maps to
  # the empty string, i.e. it is removed.
  # (ion ending dealt with in procedure)
  STEP_4_MAPS = {
    "al" => "",
    "ance" => "",
    "ence" => "",
    "er" => "",
    "ic" => "",
    "able" => "",
    "ible" => "",
    "ant" => "",
    "ement" => "",
    "ment" => "",
    "ent" => "",
    "ism" => "",
    "ate" => "",
    "iti" => "",
    "ous" => "",
    "ive" => "",
    "ize" => ""
  }.freeze

  # Special-case stemmings, as word => stem.
  SPECIAL_CASES = {
    "skis" => "ski",
    "skies" => "sky",

    "dying" => "die",
    "lying" => "lie",
    "tying" => "tie",
    "idly" => "idl",
    "gently" => "gentl",
    "ugly" => "ugli",
    "early" => "earli",
    "only" => "onli",
    "singly" => "singl",

    # Words that map to themselves.
    "sky" => "sky",
    "news" => "news",
    "howe" => "howe",
    "atlas" => "atlas",
    "cosmos" => "cosmos",
    "bias" => "bias",
    "andes" => "andes"
  }.freeze

  # Special case words to stop processing after step 1a.
  STEP_1A_SPECIAL_CASES = %w[
    inning outing canning herring earring proceed exceed succeed
  ].freeze
end