From 074ce0bade4a2e3ab2210624ba598cd5edd0bec8 Mon Sep 17 00:00:00 2001 From: Neil Smith Date: Fri, 11 Mar 2011 12:08:28 +0000 Subject: [PATCH] Updated documentation --- Readme.rdoc | 53 +++ doc/Porter2.html | 251 ++++++++++++ doc/Readme_rdoc.html | 165 ++++++++ doc/String.html | 530 ++++++++++++------------- doc/TestPorter2.html | 9 + doc/created.rid | 8 +- doc/index.html | 7 + doc/lib/porter2_constants_rb.html | 55 +++ doc/lib/porter2_implementation_rb.html | 55 +++ doc/lib/porter2_module_rb.html | 55 +++ doc/lib/porter2_rb.html | 4 +- doc/lib/porter2_string_rb.html | 57 +++ lib/porter2.rb | 352 +--------------- lib/porter2_constants.rb | 114 ++++++ lib/porter2_implementation.rb | 326 +++++++++++++++ 15 files changed, 1410 insertions(+), 631 deletions(-) create mode 100644 Readme.rdoc create mode 100644 doc/Porter2.html create mode 100644 doc/Readme_rdoc.html create mode 100644 doc/lib/porter2_constants_rb.html create mode 100644 doc/lib/porter2_implementation_rb.html create mode 100644 doc/lib/porter2_module_rb.html create mode 100644 doc/lib/porter2_string_rb.html create mode 100644 lib/porter2_constants.rb create mode 100644 lib/porter2_implementation.rb diff --git a/Readme.rdoc b/Readme.rdoc new file mode 100644 index 0000000..4ed0e6f --- /dev/null +++ b/Readme.rdoc @@ -0,0 +1,53 @@ +# coding: utf-8 + +# ==The Porter 2 stemmer +# This is the Porter 2 stemming algorithm, as described at +# http://snowball.tartarus.org/algorithms/english/stemmer.html +# The original paper is: +# +# Porter, 1980, "An algorithm for suffix stripping", _Program_, Vol. 14, +# no. 3, pp 130-137 +# +# ==Features of this implementation +# This stemmer is written in pure Ruby, making it easy to modify for language variants. +# For instance, the original Porter stemmer only works for American English and does +# not recognise British English's '-ise' as an alternate spelling of '-ize'. This +# implementation has been extended to handle correctly British English. 
+# +# This stemmer also features a comprehensive test set of over 29,000 words, taken from the +# {Porter 2 stemmer website}[http://snowball.tartarus.org/algorithms/english/stemmer.html]. +# +# ==Files +# Constants for the stemmer are in the Porter2 module. +# +# Procedures that implement the stemmer are added to the String class. +# +# The stemmer algorithm is implemented in the String#porter2_stem procedure. +# +# ==Internationalisation +# There isn't much, as this is a stemmer that only works for English. +# +# The +gb_english+ flag to the various procedures allows the stemmer to treat the British +# English '-ise' the same as the American English '-ize'. +# +# ==Longest suffixes +# Several places in the algorithm require matching the longest suffix of a word. The +# regexp engine in Ruby 1.9 seems to handle alternatives in regexps by finding the +# alternative that matches at the first position in the string. As we're only talking +# about suffixes, that first match is also the longest suffix. If the regexp engine changes, +# this behaviour may change and break the stemmer. +# +# ==Usage +# Call the String#porter2_stem or String#stem methods on a string to return its stem +# "consistency".stem # => "consist" +# "knitting".stem # => "knit" +# "articulated".stem # => "articul" +# "nationalize".stem # => "nation" +# "nationalise".stem # => "nationalis" +# "nationalise".stem(true) # => "nation" +# +# ==Author +# The Porter 2 stemming algorithm was developed by +# {Martin Porter}[http://snowball.tartarus.org/algorithms/english/stemmer.html]. +# This implementation is by {Neil Smith}[http://www.njae.me.uk]. + diff --git a/doc/Porter2.html b/doc/Porter2.html new file mode 100644 index 0000000..9b9e84b --- /dev/null +++ b/doc/Porter2.html @@ -0,0 +1,251 @@ + + + + + + + Module: Porter2 + + + + + + + + + + + +
+
+
+

+ Home + Classes + Methods +

+
+
+ +
+
+

In Files

+ +
+ + +
+ +
+ + + + + + + + + + + + +
+ +
+ + +
+

Files

+ +
+ + +
+

Class Index + [+]

+
+
+ Quicksearch + +
+
+ + + +
+ + +
+
+ +
+

Porter2

+ +
+

+Constants for the Porter 2 stemmer +

+ +
+ + + +
+

Constants

+
+ +
C
+ +

+A non-vowel +

+ + +
V
+ +

+A vowel: a e i o u y +

+ + +
CW
+ +

+A non-vowel other than w, x, or Y +

+ + +
Double
+ +

+Doubles created when adding a suffix: these are undoubled when stemmed +

+ + +
Valid_LI
+ +

+A valid letter that can come before ‘li’ (or ‘ly’) +

+ + +
SHORT_SYLLABLE
+ +

+A specification for a short syllable. +

+

+A short syllable in a word is either: +

+
    +
  1. +a vowel followed by a non-vowel other than w, x or Y and preceded by a +non-vowel, or +

    +
  2. +
  3. +a vowel at the beginning of the word followed by a non-vowel. +

    +
  4. +
+

+(The original document is silent on whether sequences of two or more +non-vowels make a syllable long. But as this specification is only used to +find sequences of non-vowel - vowel - non-vowel - end-of-word, this +ambiguity does not have an effect.) +

+ + +
STEP_2_MAPS
+ +

+Suffix transformations used in porter2_step2. (ogi, li endings dealt with +in procedure) +

+ + +
STEP_3_MAPS
+ +

+Suffix transformations used in porter2_step3. (ative ending dealt with in +procedure) +

+ + +
STEP_4_MAPS
+ +

+Suffix transformations used in porter2_step4. (ion ending dealt with in +procedure) +

+ + +
SPECIAL_CASES
+ +

+Special-case stemmings +

+ + +
STEP_1A_SPECIAL_CASES
+ +

+Special case words to stop processing after step 1a. +

+ + +
+
+ + + + + + + + +
+ + +
+ +

Disabled; run with --debug to generate this.

+ +
+ +
+

[Validate]

+

Generated with the Darkfish + Rdoc Generator 1.1.6.

+
+ + + + diff --git a/doc/Readme_rdoc.html b/doc/Readme_rdoc.html new file mode 100644 index 0000000..ccb076c --- /dev/null +++ b/doc/Readme_rdoc.html @@ -0,0 +1,165 @@ + + + + + + + + File: Readme.rdoc [RDoc Documentation] + + + + + + + + + + +
+
+
+

+ Home + Classes + Methods +

+
+
+ +
+ + +
+

Files

+ +
+ + +
+

Class Index + [+]

+
+
+ Quicksearch + +
+
+ + + +
+ + +
+
+ +
+

The Porter 2 stemmer

+

+This is the Porter 2 stemming algorithm, as described at snowball.tartarus.org/algorithms/english/stemmer.html +The original paper is: +

+

+Porter, 1980, “An algorithm for suffix stripping”, +Program, Vol. 14, no. 3, pp 130-137 +

+

Features of this implementation

+

+This stemmer is written in pure Ruby, making it easy to modify for language +variants. For instance, the original Porter stemmer only works for +American English and does not recognise British English’s +’-ise’ as an alternate spelling of ’-ize’. This +implementation has been extended to handle correctly British English. +

+

+This stemmer also features a comprehensive test set of over 29,000 words, +taken from the Porter +2 stemmer website. +

+

Files

+

+Constants for the stemmer are in the Porter2 +module. +

+

+Procedures that implement the stemmer are added to the String class. +

+

+The stemmer algorithm is implemented in the String#porter2_stem procedure. +

+

Internationalisation

+

+There isn’t much, as this is a stemmer that only works for English. +

+

+The gb_english flag to the various procedures allows the stemmer +to treat the British English ’-ise’ the same as the American +English ’-ize’. +

+

Longest suffixes

+

+Several places in the algorithm require matching the longest suffix of a +word. The regexp engine in Ruby 1.9 seems to handle alternatives in regexps +by finding the alternative that matches at the first position in the +string. As we’re only talking about suffixes, that first match is +also the longest suffix. If the regexp engine changes, this behaviour may +change and break the stemmer. +

+

Usage

+

+Call the String#porter2_stem or String#stem methods on a string to +return its stem +

+
+ "consistency".stem       # => "consist"
+ "knitting".stem          # => "knit"
+ "articulated".stem       # => "articul"
+ "nationalize".stem       # => "nation"
+ "nationalise".stem       # => "nationalis"
+ "nationalise".stem(true) # => "nation"
+
+

Author

+

+The Porter 2 stemming algorithm was developed by Martin +Porter. This implementation is by Neil +Smith. +

+ +
+ +
+

[Validate]

+

Generated with the Darkfish + Rdoc Generator 1.1.6.

+
+ + + diff --git a/doc/String.html b/doc/String.html index e0921af..f04ae9a 100644 --- a/doc/String.html +++ b/doc/String.html @@ -38,8 +38,8 @@
@@ -116,6 +116,15 @@
+
+

Files

+ +
+

Class Index @@ -150,45 +159,10 @@

String

-

The Porter 2 stemmer

-

-This is the Porter 2 stemming algorithm, as described at snowball.tartarus.org/algorithms/english/stemmer.html -The original paper is: -

-

-Porter, 1980, “An algorithm for suffix stripping”, -Program, Vol. 14, no. 3, pp 130-137 -

-

-Constants for the stemmer are in the Porter2 -module. -

-

-Procedures that implement the stemmer are added to the String class. -

-

-The stemmer algorithm is implemented in the porter2_stem procedure. -

-

Internationalisation

-

-There isn’t much, as this is a stemmer that only works for English. -

-

-The gb_english flag to the various procedures allows the stemmer -to treat the British English ’-ise’ the same as the American -English ’-ize’. -

-

Longest suffixes

-

-Several places in the algorithm require matching the longest suffix of a -word. The regexp engine in Ruby 1.9 seems to handle alterntives in regexps -by finding the alternative that matches at the first position in the -string. As we’re only talking about suffixes, that first match is -also the longest suffix. If the regexp engine changes, this behaviour may -change and break the stemmer. +

+Implementation of the Porter 2 stemmer. String#porter2_stem is the +main stemming procedure.

@@ -227,10 +201,10 @@ Returns true if the word ends with a short syllable
-    # File lib/porter2.rb, line 87
-87:   def porter2_ends_with_short_syllable?
-88:     self =~ /#{Porter2::SHORT_SYLLABLE}$/ ? true : false
-89:   end
+ # File lib/porter2_implementation.rb, line 59 +59: def porter2_ends_with_short_syllable? +60: self =~ /#{Porter2::SHORT_SYLLABLE}$/ ? true : false +61: end
@@ -263,10 +237,10 @@ A word is short if it ends in a short syllable, and R1 is null
-    # File lib/porter2.rb, line 93
-93:   def porter2_is_short_word?
-94:     self.porter2_ends_with_short_syllable? and self.porter2_r1.empty?
-95:   end
+ # File lib/porter2_implementation.rb, line 65 +65: def porter2_is_short_word? +66: self.porter2_ends_with_short_syllable? and self.porter2_r1.empty? +67: end
@@ -299,10 +273,10 @@ Turn all Y letters into y
-     # File lib/porter2.rb, line 289
-289:   def porter2_postprocess
-290:     self.gsub(/Y/, 'y')
-291:   end
+ # File lib/porter2_implementation.rb, line 261 +261: def porter2_postprocess +262: self.gsub(/Y/, 'y') +263: end
@@ -343,19 +317,19 @@ doesn’t do that.)
-    # File lib/porter2.rb, line 53
-53:   def porter2_preprocess    
-54:     w = self.dup
-55: 
-56:     # remove any initial apostrophe
-57:     w.gsub!(/^'*(.)/, '\1')
-58:     
-59:     # set initial y, or y after a vowel, to Y
-60:     w.gsub!(/^y/, "Y")
-61:     w.gsub!(/(#{Porter2::V})y/, '\1Y')
-62:     
-63:     w
-64:   end
+ # File lib/porter2_implementation.rb, line 25 +25: def porter2_preprocess +26: w = self.dup +27: +28: # remove any initial apostrophe +29: w.gsub!(/^'*(.)/, '\1') +30: +31: # set initial y, or y after a vowel, to Y +32: w.gsub!(/^y/, "Y") +33: w.gsub!(/(#{Porter2::V})y/, '\1Y') +34: +35: w +36: end
@@ -390,15 +364,15 @@ and ‘arsen-’ treated as special cases
-    # File lib/porter2.rb, line 69
-69:   def porter2_r1
-70:     if self =~ /^(gener|commun|arsen)(?<r1>.*)/
-71:       Regexp.last_match(:r1)
-72:     else
-73:       self =~ /#{Porter2::V}#{Porter2::C}(?<r1>.*)$/
-74:       Regexp.last_match(:r1) || ""
-75:     end
-76:   end
+ # File lib/porter2_implementation.rb, line 41 +41: def porter2_r1 +42: if self =~ /^(gener|commun|arsen)(?<r1>.*)/ +43: Regexp.last_match(:r1) +44: else +45: self =~ /#{Porter2::V}#{Porter2::C}(?<r1>.*)$/ +46: Regexp.last_match(:r1) || "" +47: end +48: end
@@ -433,11 +407,11 @@ non-vowel after the first vowel
-    # File lib/porter2.rb, line 80
-80:   def porter2_r2
-81:     self.porter2_r1 =~ /#{Porter2::V}#{Porter2::C}(?<r2>.*)$/
-82:     Regexp.last_match(:r2) || ""
-83:   end
+ # File lib/porter2_implementation.rb, line 52 +52: def porter2_r2 +53: self.porter2_r1 =~ /#{Porter2::V}#{Porter2::C}(?<r2>.*)$/ +54: Regexp.last_match(:r2) || "" +55: end
@@ -472,24 +446,24 @@ English.
-     # File lib/porter2.rb, line 297
-297:   def porter2_stem(gb_english = false)
-298:     preword = self.porter2_tidy
-299:     return preword if preword.length <= 2
-300: 
-301:     word = preword.porter2_preprocess
-302:     
-303:     if Porter2::SPECIAL_CASES.has_key? word
-304:       Porter2::SPECIAL_CASES[word]
-305:     else
-306:       w1a = word.porter2_step0.porter2_step1a
-307:       if Porter2::STEP_1A_SPECIAL_CASES.include? w1a 
-308:         w1a
-309:       else
-310:         w1a.porter2_step1b(gb_english).porter2_step1c.porter2_step2(gb_english).porter2_step3(gb_english).porter2_step4(gb_english).porter2_step5.porter2_postprocess
-311:       end
-312:     end
-313:   end
+ # File lib/porter2_implementation.rb, line 269 +269: def porter2_stem(gb_english = false) +270: preword = self.porter2_tidy +271: return preword if preword.length <= 2 +272: +273: word = preword.porter2_preprocess +274: +275: if Porter2::SPECIAL_CASES.has_key? word +276: Porter2::SPECIAL_CASES[word] +277: else +278: w1a = word.porter2_step0.porter2_step1a +279: if Porter2::STEP_1A_SPECIAL_CASES.include? w1a +280: w1a +281: else +282: w1a.porter2_step1b(gb_english).porter2_step1c.porter2_step2(gb_english).porter2_step3(gb_english).porter2_step4(gb_english).porter2_step5.porter2_postprocess +283: end +284: end +285: end
@@ -528,41 +502,41 @@ output of each stage to STDOUT
-     # File lib/porter2.rb, line 316
-316:   def porter2_stem_verbose(gb_english = false)
-317:     preword = self.porter2_tidy
-318:     puts "Preword: #{preword}"
-319:     return preword if preword.length <= 2
-320: 
-321:     word = preword.porter2_preprocess
-322:     puts "Preprocessed: #{word}"
-323:     
-324:     if Porter2::SPECIAL_CASES.has_key? word
-325:       puts "Returning #{word} as special case #{Porter2::SPECIAL_CASES[word]}"
-326:       Porter2::SPECIAL_CASES[word]
-327:     else
-328:       r1 = word.porter2_r1
-329:       r2 = word.porter2_r2
-330:       puts "R1 = #{r1}, R2 = #{r2}"
-331:     
-332:       w0 = word.porter2_step0 ; puts "After step 0:  #{w0} (R1 = #{w0.porter2_r1}, R2 = #{w0.porter2_r2})"
-333:       w1a = w0.porter2_step1a ; puts "After step 1a: #{w1a} (R1 = #{w1a.porter2_r1}, R2 = #{w1a.porter2_r2})"
-334:       
-335:       if Porter2::STEP_1A_SPECIAL_CASES.include? w1a
-336:         puts "Returning #{w1a} as 1a special case"
-337:         w1a
-338:       else
-339:         w1b = w1a.porter2_step1b(gb_english) ; puts "After step 1b: #{w1b} (R1 = #{w1b.porter2_r1}, R2 = #{w1b.porter2_r2})"
-340:         w1c = w1b.porter2_step1c ; puts "After step 1c: #{w1c} (R1 = #{w1c.porter2_r1}, R2 = #{w1c.porter2_r2})"
-341:         w2 = w1c.porter2_step2(gb_english) ; puts "After step 2:  #{w2} (R1 = #{w2.porter2_r1}, R2 = #{w2.porter2_r2})"
-342:         w3 = w2.porter2_step3(gb_english) ; puts "After step 3:  #{w3} (R1 = #{w3.porter2_r1}, R2 = #{w3.porter2_r2})"
-343:         w4 = w3.porter2_step4(gb_english) ; puts "After step 4:  #{w4} (R1 = #{w4.porter2_r1}, R2 = #{w4.porter2_r2})"
-344:         w5 = w4.porter2_step5 ; puts "After step 5:  #{w5}"
-345:         wpost = w5.porter2_postprocess ; puts "After postprocess: #{wpost}"
-346:         wpost
-347:       end
-348:     end
-349:   end
+ # File lib/porter2_implementation.rb, line 288 +288: def porter2_stem_verbose(gb_english = false) +289: preword = self.porter2_tidy +290: puts "Preword: #{preword}" +291: return preword if preword.length <= 2 +292: +293: word = preword.porter2_preprocess +294: puts "Preprocessed: #{word}" +295: +296: if Porter2::SPECIAL_CASES.has_key? word +297: puts "Returning #{word} as special case #{Porter2::SPECIAL_CASES[word]}" +298: Porter2::SPECIAL_CASES[word] +299: else +300: r1 = word.porter2_r1 +301: r2 = word.porter2_r2 +302: puts "R1 = #{r1}, R2 = #{r2}" +303: +304: w0 = word.porter2_step0 ; puts "After step 0: #{w0} (R1 = #{w0.porter2_r1}, R2 = #{w0.porter2_r2})" +305: w1a = w0.porter2_step1a ; puts "After step 1a: #{w1a} (R1 = #{w1a.porter2_r1}, R2 = #{w1a.porter2_r2})" +306: +307: if Porter2::STEP_1A_SPECIAL_CASES.include? w1a +308: puts "Returning #{w1a} as 1a special case" +309: w1a +310: else +311: w1b = w1a.porter2_step1b(gb_english) ; puts "After step 1b: #{w1b} (R1 = #{w1b.porter2_r1}, R2 = #{w1b.porter2_r2})" +312: w1c = w1b.porter2_step1c ; puts "After step 1c: #{w1c} (R1 = #{w1c.porter2_r1}, R2 = #{w1c.porter2_r2})" +313: w2 = w1c.porter2_step2(gb_english) ; puts "After step 2: #{w2} (R1 = #{w2.porter2_r1}, R2 = #{w2.porter2_r2})" +314: w3 = w2.porter2_step3(gb_english) ; puts "After step 3: #{w3} (R1 = #{w3.porter2_r1}, R2 = #{w3.porter2_r2})" +315: w4 = w3.porter2_step4(gb_english) ; puts "After step 4: #{w4} (R1 = #{w4.porter2_r1}, R2 = #{w4.porter2_r2})" +316: w5 = w4.porter2_step5 ; puts "After step 5: #{w5}" +317: wpost = w5.porter2_postprocess ; puts "After postprocess: #{wpost}" +318: wpost +319: end +320: end +321: end
@@ -612,10 +586,10 @@ and remove if found.
-     # File lib/porter2.rb, line 103
-103:   def porter2_step0
-104:     self.sub!(/(.)('s'|'s|')$/, '\1') || self
-105:   end
+ # File lib/porter2_implementation.rb, line 75 +75: def porter2_step0 +76: self.sub!(/(.)('s'|'s|')$/, '\1') || self +77: end
@@ -668,26 +642,26 @@ do nothing
-     # File lib/porter2.rb, line 113
-113:   def porter2_step1a
-114:     if self =~ /sses$/
-115:       self.sub(/sses$/, 'ss')
-116:     elsif self =~ /..(ied|ies)$/
-117:       self.sub(/(ied|ies)$/, 'i')
-118:     elsif self =~ /(ied|ies)$/
-119:       self.sub(/(ied|ies)$/, 'ie')
-120:     elsif self =~ /(us|ss)$/
-121:       self
-122:     elsif self =~ /s$/
-123:       if self =~ /(#{Porter2::V}.+)s$/
-124:         self.sub(/s$/, '') 
-125:       else
-126:         self
-127:       end
-128:     else
-129:       self
-130:     end
-131:   end
+ # File lib/porter2_implementation.rb, line 85 + 85: def porter2_step1a + 86: if self =~ /sses$/ + 87: self.sub(/sses$/, 'ss') + 88: elsif self =~ /..(ied|ies)$/ + 89: self.sub(/(ied|ies)$/, 'i') + 90: elsif self =~ /(ied|ies)$/ + 91: self.sub(/(ied|ies)$/, 'ie') + 92: elsif self =~ /(us|ss)$/ + 93: self + 94: elsif self =~ /s$/ + 95: if self =~ /(#{Porter2::V}.+)s$/ + 96: self.sub(/s$/, '') + 97: else + 98: self + 99: end +100: else +101: self +102: end +103: end
@@ -754,31 +728,31 @@ if the word is short: add e
-     # File lib/porter2.rb, line 143
-143:   def porter2_step1b(gb_english = false)
-144:     if self =~ /(eed|eedly)$/
-145:       if self.porter2_r1 =~ /(eed|eedly)$/
-146:         self.sub(/(eed|eedly)$/, 'ee')
-147:       else
-148:         self
-149:       end
-150:     else
-151:       w = self.dup
-152:       if w =~ /#{Porter2::V}.*(ed|edly|ing|ingly)$/
-153:         w.sub!(/(ed|edly|ing|ingly)$/, '')
-154:         if w =~ /(at|lb|iz)$/
-155:           w += 'e' 
-156:         elsif w =~ /is$/ and gb_english
-157:           w += 'e' 
-158:         elsif w =~ /#{Porter2::Double}$/
-159:           w.chop!
-160:         elsif w.porter2_is_short_word?
-161:           w += 'e'
-162:         end
-163:       end
-164:       w
-165:     end
-166:   end
+ # File lib/porter2_implementation.rb, line 115 +115: def porter2_step1b(gb_english = false) +116: if self =~ /(eed|eedly)$/ +117: if self.porter2_r1 =~ /(eed|eedly)$/ +118: self.sub(/(eed|eedly)$/, 'ee') +119: else +120: self +121: end +122: else +123: w = self.dup +124: if w =~ /#{Porter2::V}.*(ed|edly|ing|ingly)$/ +125: w.sub!(/(ed|edly|ing|ingly)$/, '') +126: if w =~ /(at|lb|iz)$/ +127: w += 'e' +128: elsif w =~ /is$/ and gb_english +129: w += 'e' +130: elsif w =~ /#{Porter2::Double}$/ +131: w.chop! +132: elsif w.porter2_is_short_word? +133: w += 'e' +134: end +135: end +136: w +137: end +138: end
@@ -812,14 +786,14 @@ not the first letter of the word.
-     # File lib/porter2.rb, line 171
-171:   def porter2_step1c
-172:     if self =~ /.+#{Porter2::C}(y|Y)$/
-173:       self.sub(/(y|Y)$/, 'i')
-174:     else
-175:       self
-176:     end
-177:   end
+ # File lib/porter2_implementation.rb, line 143 +143: def porter2_step1c +144: if self =~ /.+#{Porter2::C}(y|Y)$/ +145: self.sub(/(y|Y)$/, 'i') +146: else +147: self +148: end +149: end
@@ -863,29 +837,29 @@ cases in the procedure.)
-     # File lib/porter2.rb, line 188
-188:   def porter2_step2(gb_english = false)
-189:     r1 = self.porter2_r1
-190:     s2m = Porter2::STEP_2_MAPS.dup
-191:     if gb_english
-192:       s2m["iser"] = "ise"
-193:       s2m["isation"] = "ise"
-194:     end
-195:     step_2_re = Regexp.union(s2m.keys.map {|r| Regexp.new(r + "$")})
-196:     if self =~ step_2_re
-197:       if r1 =~ /#{$&}$/
-198:         self.sub(/#{$&}$/, s2m[$&])
-199:       else
-200:         self
-201:       end
-202:     elsif r1 =~ /li$/ and self =~ /(#{Porter2::Valid_LI})li$/
-203:       self.sub(/li$/, '')
-204:     elsif r1 =~ /ogi$/ and self =~ /logi$/
-205:       self.sub(/ogi$/, 'og')
-206:     else
-207:       self
-208:     end
-209:   end
+ # File lib/porter2_implementation.rb, line 160 +160: def porter2_step2(gb_english = false) +161: r1 = self.porter2_r1 +162: s2m = Porter2::STEP_2_MAPS.dup +163: if gb_english +164: s2m["iser"] = "ise" +165: s2m["isation"] = "ise" +166: end +167: step_2_re = Regexp.union(s2m.keys.map {|r| Regexp.new(r + "$")}) +168: if self =~ step_2_re +169: if r1 =~ /#{$&}$/ +170: self.sub(/#{$&}$/, s2m[$&]) +171: else +172: self +173: end +174: elsif r1 =~ /li$/ and self =~ /(#{Porter2::Valid_LI})li$/ +175: self.sub(/li$/, '') +176: elsif r1 =~ /ogi$/ and self =~ /logi$/ +177: self.sub(/ogi$/, 'og') +178: else +179: self +180: end +181: end
@@ -927,24 +901,24 @@ with ‘al’, similarly to how ‘alize’ is treated.)
-     # File lib/porter2.rb, line 220
-220:   def porter2_step3(gb_english = false)
-221:     if self =~ /ative$/ and self.porter2_r2 =~ /ative$/
-222:       self.sub(/ative$/, '')
-223:     else
-224:       s3m = Porter2::STEP_3_MAPS.dup
-225:       if gb_english
-226:         s3m["alise"] = "al"
-227:       end
-228:       step_3_re = Regexp.union(s3m.keys.map {|r| Regexp.new(r + "$")})
-229:       r1 = self.porter2_r1
-230:       if self =~ step_3_re and r1 =~ /#{$&}$/ 
-231:         self.sub(/#{$&}$/, s3m[$&])
-232:       else
-233:         self
-234:       end
-235:     end
-236:   end
+ # File lib/porter2_implementation.rb, line 192 +192: def porter2_step3(gb_english = false) +193: if self =~ /ative$/ and self.porter2_r2 =~ /ative$/ +194: self.sub(/ative$/, '') +195: else +196: s3m = Porter2::STEP_3_MAPS.dup +197: if gb_english +198: s3m["alise"] = "al" +199: end +200: step_3_re = Regexp.union(s3m.keys.map {|r| Regexp.new(r + "$")}) +201: r1 = self.porter2_r1 +202: if self =~ step_3_re and r1 =~ /#{$&}$/ +203: self.sub(/#{$&}$/, s3m[$&]) +204: else +205: self +206: end +207: end +208: end
@@ -986,28 +960,28 @@ found.)
-     # File lib/porter2.rb, line 246
-246:   def porter2_step4(gb_english = false)
-247:     if self.porter2_r2 =~ /ion$/ and self =~ /(s|t)ion$/
-248:       self.sub(/ion$/, '')
-249:     else
-250:       s4m = Porter2::STEP_4_MAPS.dup
-251:       if gb_english
-252:         s4m["ise"] = ""
-253:       end
-254:       step_4_re = Regexp.union(s4m.keys.map {|r| Regexp.new(r + "$")})
-255:       r2 = self.porter2_r2
-256:       if self =~ step_4_re
-257:         if r2 =~ /#{$&}/
-258:           self.sub(/#{$&}$/, s4m[$&])
-259:         else
-260:           self
-261:         end
-262:       else
-263:         self
-264:       end
-265:     end
-266:   end
+ # File lib/porter2_implementation.rb, line 218 +218: def porter2_step4(gb_english = false) +219: if self.porter2_r2 =~ /ion$/ and self =~ /(s|t)ion$/ +220: self.sub(/ion$/, '') +221: else +222: s4m = Porter2::STEP_4_MAPS.dup +223: if gb_english +224: s4m["ise"] = "" +225: end +226: step_4_re = Regexp.union(s4m.keys.map {|r| Regexp.new(r + "$")}) +227: r2 = self.porter2_r2 +228: if self =~ step_4_re +229: if r2 =~ /#{$&}/ +230: self.sub(/#{$&}$/, s4m[$&]) +231: else +232: self +233: end +234: else +235: self +236: end +237: end +238: end
@@ -1051,21 +1025,21 @@ delete if in R2 and preceded by l
-     # File lib/porter2.rb, line 272
-272:   def porter2_step5
-273:     if self =~ /ll$/ and self.porter2_r2 =~ /l$/
-274:       self.sub(/ll$/, 'l') 
-275:     elsif self =~ /e$/ and self.porter2_r2 =~ /e$/ 
-276:       self.sub(/e$/, '') 
-277:     else
-278:       r1 = self.porter2_r1
-279:       if self =~ /e$/ and r1 =~ /e$/ and not self =~ /#{Porter2::SHORT_SYLLABLE}e$/
-280:         self.sub(/e$/, '')
-281:       else
-282:         self
-283:       end
-284:     end
-285:   end
+ # File lib/porter2_implementation.rb, line 244 +244: def porter2_step5 +245: if self =~ /ll$/ and self.porter2_r2 =~ /l$/ +246: self.sub(/ll$/, 'l') +247: elsif self =~ /e$/ and self.porter2_r2 =~ /e$/ +248: self.sub(/e$/, '') +249: else +250: r1 = self.porter2_r1 +251: if self =~ /e$/ and r1 =~ /e$/ and not self =~ /#{Porter2::SHORT_SYLLABLE}e$/ +252: self.sub(/e$/, '') +253: else +254: self +255: end +256: end +257: end
@@ -1098,16 +1072,16 @@ Tidy up the word before we get down to the algorithm
-    # File lib/porter2.rb, line 35
-35:   def porter2_tidy
-36:     preword = self.to_s.strip.downcase
-37:     
-38:     # map apostrophe-like characters to apostrophes
-39:     preword.gsub!(/‘/, "'")
-40:     preword.gsub!(/’/, "'")
-41: 
-42:     preword
-43:   end
+ # File lib/porter2_implementation.rb, line 7 + 7: def porter2_tidy + 8: preword = self.to_s.strip.downcase + 9: +10: # map apostrophe-like characters to apostrophes +11: preword.gsub!(/‘/, "'") +12: preword.gsub!(/’/, "'") +13: +14: preword +15: end
diff --git a/doc/TestPorter2.html b/doc/TestPorter2.html index dab7cbe..137ff2a 100644 --- a/doc/TestPorter2.html +++ b/doc/TestPorter2.html @@ -117,6 +117,15 @@
+
+

Files

+ +
+

Class Index diff --git a/doc/created.rid b/doc/created.rid index d9b4f19..a564bce 100644 --- a/doc/created.rid +++ b/doc/created.rid @@ -1,6 +1,8 @@ -Fri, 07 Jan 2011 08:46:50 +0000 +Fri, 11 Feb 2011 13:56:05 +0000 ./test/tc_porter2_parts.rb Wed, 05 Jan 2011 11:38:33 +0000 ./test/ts_porter2.rb Mon, 03 Jan 2011 00:20:11 +0000 ./test/tc_porter2_full.rb Wed, 05 Jan 2011 11:35:59 +0000 -./lib/porter2.rb Fri, 07 Jan 2011 08:46:31 +0000 -./lib/porter2_constants.rb Fri, 07 Jan 2011 08:46:16 +0000 +./lib/porter2.rb Sun, 09 Jan 2011 18:34:08 +0000 +./lib/porter2_constants.rb Sun, 09 Jan 2011 09:20:05 +0000 +./lib/porter2_implementation.rb Sat, 08 Jan 2011 10:20:57 +0000 +./Readme.rdoc Fri, 11 Feb 2011 13:55:53 +0000 diff --git a/doc/index.html b/doc/index.html index 6c609f9..fc697ab 100644 --- a/doc/index.html +++ b/doc/index.html @@ -27,6 +27,13 @@ +

Files

+ +

Classes/Modules

    diff --git a/doc/lib/porter2_constants_rb.html b/doc/lib/porter2_constants_rb.html new file mode 100644 index 0000000..e67bf34 --- /dev/null +++ b/doc/lib/porter2_constants_rb.html @@ -0,0 +1,55 @@ + + + + + + + + File: porter2_constants.rb [RDoc Documentation] + + + + + + + + + + +
    +
    +
    Last Modified
    +
    2011-01-09 09:20:05 +0000
    + + +
    Requires
    +
    +
      + +
    +
    + + + +
    +
    + +
    + +
    +

    Description

    +

    +coding: utf-8 +

    + +
    + +
    + + + diff --git a/doc/lib/porter2_implementation_rb.html b/doc/lib/porter2_implementation_rb.html new file mode 100644 index 0000000..234ea7a --- /dev/null +++ b/doc/lib/porter2_implementation_rb.html @@ -0,0 +1,55 @@ + + + + + + + + File: porter2_implementation.rb [RDoc Documentation] + + + + + + + + + + +
    +
    +
    Last Modified
    +
    2011-01-08 10:20:57 +0000
    + + +
    Requires
    +
    +
      + +
    +
    + + + +
    +
    + +
    + +
    +

    Description

    +

    +coding: utf-8 +

    + +
    + +
    + + + diff --git a/doc/lib/porter2_module_rb.html b/doc/lib/porter2_module_rb.html new file mode 100644 index 0000000..e633525 --- /dev/null +++ b/doc/lib/porter2_module_rb.html @@ -0,0 +1,55 @@ + + + + + + + + File: porter2_module.rb [RDoc Documentation] + + + + + + + + + + +
    +
    +
    Last Modified
    +
    2011-01-05 11:34:03 +0000
    + + +
    Requires
    +
    +
      + +
    +
    + + + +
    +
    + +
    + +
    +

    Description

    +

    +coding: utf-8 +

    + +
    + +
    + + + diff --git a/doc/lib/porter2_rb.html b/doc/lib/porter2_rb.html index d4d44d9..850b2f2 100644 --- a/doc/lib/porter2_rb.html +++ b/doc/lib/porter2_rb.html @@ -24,7 +24,7 @@
    Last Modified
    -
    2011-01-07 08:46:31 +0000
    +
    2011-01-09 18:34:08 +0000
    Requires
    @@ -33,6 +33,8 @@
  • porter2_constants
  • +
  • porter2_implementation
  • +
diff --git a/doc/lib/porter2_string_rb.html b/doc/lib/porter2_string_rb.html new file mode 100644 index 0000000..e41e011 --- /dev/null +++ b/doc/lib/porter2_string_rb.html @@ -0,0 +1,57 @@ + + + + + + + + File: porter2_string.rb [RDoc Documentation] + + + + + + + + + + +
+
+
Last Modified
+
2011-01-05 11:24:47 +0000
+ + +
Requires
+
+
    + +
  • porter2_module
  • + +
+
+ + + +
+
+ +
+ +
+

Description

+

+coding: utf-8 +

+ +
+ +
+ + + diff --git a/lib/porter2.rb b/lib/porter2.rb index e99e358..229128d 100644 --- a/lib/porter2.rb +++ b/lib/porter2.rb @@ -1,354 +1,8 @@ # coding: utf-8 -require 'porter2_constants' - # ==The Porter 2 stemmer -# -# This is the Porter 2 stemming algorithm, as described at -# http://snowball.tartarus.org/algorithms/english/stemmer.html -# The original paper is: -# -# Porter, 1980, "An algorithm for suffix stripping", _Program_, Vol. 14, -# no. 3, pp 130-137 -# -# Constants for the stemmer are in the Porter2 module. -# -# Procedures that implement the stemmer are added to the String class. -# -# The stemmer algorithm is implemented in the porter2_stem procedure. -# -# ==Internationalisation -# There isn't much, as this is a stemmer that only works for English. -# -# The +gb_english+ flag to the various procedures allows the stemmer to treat the British -# English '-ise' the same as the American English '-ize'. -# -# ==Longest suffixes -# Several places in the algorithm require matching the longest suffix of a word. The -# regexp engine in Ruby 1.9 seems to handle alterntives in regexps by finding the -# alternative that matches at the first position in the string. As we're only talking -# about suffixes, that first match is also the longest suffix. If the regexp engine changes, -# this behaviour may change and break the stemmer. - -class String - # Tidy up the word before we get down to the algorithm - def porter2_tidy - preword = self.to_s.strip.downcase - - # map apostrophe-like characters to apostrophes - preword.gsub!(/‘/, "'") - preword.gsub!(/’/, "'") - - preword - end - - - # Preprocess the word. - # Remove any initial ', if present. Then, set initial y, or y after a vowel, to Y - # - # (The comment to 'establish the regions R1 and R2' in the original description - # is an implementation optimisation that identifies where the regions start. As - # no modifications are made to the word that affect those positions, you may want - # to cache them now. 
This implementation doesn't do that.) - def porter2_preprocess - w = self.dup - - # remove any initial apostrophe - w.gsub!(/^'*(.)/, '\1') - - # set initial y, or y after a vowel, to Y - w.gsub!(/^y/, "Y") - w.gsub!(/(#{Porter2::V})y/, '\1Y') - - w - end - - - # R1 is the portion of the word after the first non-vowel after the first vowel - # (with words beginning 'gener-', 'commun-', and 'arsen-' treated as special cases - def porter2_r1 - if self =~ /^(gener|commun|arsen)(?.*)/ - Regexp.last_match(:r1) - else - self =~ /#{Porter2::V}#{Porter2::C}(?.*)$/ - Regexp.last_match(:r1) || "" - end - end - - - # R2 is the portion of R1 (porter2_r1) after the first non-vowel after the first vowel - def porter2_r2 - self.porter2_r1 =~ /#{Porter2::V}#{Porter2::C}(?.*)$/ - Regexp.last_match(:r2) || "" - end - - - # Returns true if the word ends with a short syllable - def porter2_ends_with_short_syllable? - self =~ /#{Porter2::SHORT_SYLLABLE}$/ ? true : false - end - - - # A word is short if it ends in a short syllable, and R1 is null - def porter2_is_short_word? - self.porter2_ends_with_short_syllable? and self.porter2_r1.empty? - end - - - # Search for the longest among the suffixes, - # * ' - # * 's - # * 's' - # and remove if found. - def porter2_step0 - self.sub!(/(.)('s'|'s|')$/, '\1') || self - end - - - # Search for the longest among the following suffixes, and perform the action indicated. 
- # sses:: replace by ss - # ied, ies:: replace by i if preceded by more than one letter, otherwise by ie - # s:: delete if the preceding word part contains a vowel not immediately before the s - # us, ss:: do nothing - def porter2_step1a - if self =~ /sses$/ - self.sub(/sses$/, 'ss') - elsif self =~ /..(ied|ies)$/ - self.sub(/(ied|ies)$/, 'i') - elsif self =~ /(ied|ies)$/ - self.sub(/(ied|ies)$/, 'ie') - elsif self =~ /(us|ss)$/ - self - elsif self =~ /s$/ - if self =~ /(#{Porter2::V}.+)s$/ - self.sub(/s$/, '') - else - self - end - else - self - end - end - - # Search for the longest among the following suffixes, and perform the action indicated. - # eed, eedly:: replace by ee if the suffix is also in R1 - # ed, edly, ing, ingly:: delete if the preceding word part contains a vowel and, - # after the deletion: - # * if the word ends at, bl or iz: add e, or - # * if the word ends with a double: remove the last letter, or - # * if the word is short: add e - # - # (If gb_english is +true+, treat the 'is' suffix as 'iz' above.) - def porter2_step1b(gb_english = false) - if self =~ /(eed|eedly)$/ - if self.porter2_r1 =~ /(eed|eedly)$/ - self.sub(/(eed|eedly)$/, 'ee') - else - self - end - else - w = self.dup - if w =~ /#{Porter2::V}.*(ed|edly|ing|ingly)$/ - w.sub!(/(ed|edly|ing|ingly)$/, '') - if w =~ /(at|lb|iz)$/ - w += 'e' - elsif w =~ /is$/ and gb_english - w += 'e' - elsif w =~ /#{Porter2::Double}$/ - w.chop! - elsif w.porter2_is_short_word? - w += 'e' - end - end - w - end - end - - - # Replace a suffix of y or Y by i if it is preceded by a non-vowel which is - # not the first letter of the word. - def porter2_step1c - if self =~ /.+#{Porter2::C}(y|Y)$/ - self.sub(/(y|Y)$/, 'i') - else - self - end - end - - - # Search for the longest among the suffixes listed in the keys of Porter2::STEP_2_MAPS. - # If one is found and that suffix occurs in R1, replace it with the value - # found in STEP_2_MAPS. 
- # - # (Suffixes 'ogi' and 'li' are treated as special cases in the procedure.) - # - # (If gb_english is +true+, replace the 'iser' and 'isation' suffixes with - # 'ise', similarly to how 'izer' and 'ization' are treated.) - def porter2_step2(gb_english = false) - r1 = self.porter2_r1 - s2m = Porter2::STEP_2_MAPS.dup - if gb_english - s2m["iser"] = "ise" - s2m["isation"] = "ise" - end - step_2_re = Regexp.union(s2m.keys.map {|r| Regexp.new(r + "$")}) - if self =~ step_2_re - if r1 =~ /#{$&}$/ - self.sub(/#{$&}$/, s2m[$&]) - else - self - end - elsif r1 =~ /li$/ and self =~ /(#{Porter2::Valid_LI})li$/ - self.sub(/li$/, '') - elsif r1 =~ /ogi$/ and self =~ /logi$/ - self.sub(/ogi$/, 'og') - else - self - end - end - - - # Search for the longest among the suffixes listed in the keys of Porter2::STEP_3_MAPS. - # If one is found and that suffix occurs in R1, replace it with the value - # found in STEP_3_MAPS. - # - # (Suffix 'ative' is treated as a special case in the procedure.) - # - # (If gb_english is +true+, replace the 'alise' suffix with - # 'al', similarly to how 'alize' is treated.) - def porter2_step3(gb_english = false) - if self =~ /ative$/ and self.porter2_r2 =~ /ative$/ - self.sub(/ative$/, '') - else - s3m = Porter2::STEP_3_MAPS.dup - if gb_english - s3m["alise"] = "al" - end - step_3_re = Regexp.union(s3m.keys.map {|r| Regexp.new(r + "$")}) - r1 = self.porter2_r1 - if self =~ step_3_re and r1 =~ /#{$&}$/ - self.sub(/#{$&}$/, s3m[$&]) - else - self - end - end - end - - - # Search for the longest among the suffixes listed in the keys of Porter2::STEP_4_MAPS. - # If one is found and that suffix occurs in R2, replace it with the value - # found in STEP_4_MAPS. - # - # (Suffix 'ion' is treated as a special case in the procedure.) - # - # (If gb_english is +true+, delete the 'ise' suffix if found.) 
- def porter2_step4(gb_english = false) - if self.porter2_r2 =~ /ion$/ and self =~ /(s|t)ion$/ - self.sub(/ion$/, '') - else - s4m = Porter2::STEP_4_MAPS.dup - if gb_english - s4m["ise"] = "" - end - step_4_re = Regexp.union(s4m.keys.map {|r| Regexp.new(r + "$")}) - r2 = self.porter2_r2 - if self =~ step_4_re - if r2 =~ /#{$&}/ - self.sub(/#{$&}$/, s4m[$&]) - else - self - end - else - self - end - end - end - - - # Search for the the following suffixes, and, if found, perform the action indicated. - # e:: delete if in R2, or in R1 and not preceded by a short syllable - # l:: delete if in R2 and preceded by l - def porter2_step5 - if self =~ /ll$/ and self.porter2_r2 =~ /l$/ - self.sub(/ll$/, 'l') - elsif self =~ /e$/ and self.porter2_r2 =~ /e$/ - self.sub(/e$/, '') - else - r1 = self.porter2_r1 - if self =~ /e$/ and r1 =~ /e$/ and not self =~ /#{Porter2::SHORT_SYLLABLE}e$/ - self.sub(/e$/, '') - else - self - end - end - end - - - # Turn all Y letters into y - def porter2_postprocess - self.gsub(/Y/, 'y') - end - - public - - # Perform the stemming procedure. If +gb_english+ is true, treat '-ise' and similar suffixes - # as '-ize' in American English. - def porter2_stem(gb_english = false) - preword = self.porter2_tidy - return preword if preword.length <= 2 - - word = preword.porter2_preprocess - - if Porter2::SPECIAL_CASES.has_key? word - Porter2::SPECIAL_CASES[word] - else - w1a = word.porter2_step0.porter2_step1a - if Porter2::STEP_1A_SPECIAL_CASES.include? 
w1a - w1a - else - w1a.porter2_step1b(gb_english).porter2_step1c.porter2_step2(gb_english).porter2_step3(gb_english).porter2_step4(gb_english).porter2_step5.porter2_postprocess - end - end - end - - # A verbose version of porter2_stem that prints the output of each stage to STDOUT - def porter2_stem_verbose(gb_english = false) - preword = self.porter2_tidy - puts "Preword: #{preword}" - return preword if preword.length <= 2 - - word = preword.porter2_preprocess - puts "Preprocessed: #{word}" - - if Porter2::SPECIAL_CASES.has_key? word - puts "Returning #{word} as special case #{Porter2::SPECIAL_CASES[word]}" - Porter2::SPECIAL_CASES[word] - else - r1 = word.porter2_r1 - r2 = word.porter2_r2 - puts "R1 = #{r1}, R2 = #{r2}" - - w0 = word.porter2_step0 ; puts "After step 0: #{w0} (R1 = #{w0.porter2_r1}, R2 = #{w0.porter2_r2})" - w1a = w0.porter2_step1a ; puts "After step 1a: #{w1a} (R1 = #{w1a.porter2_r1}, R2 = #{w1a.porter2_r2})" - - if Porter2::STEP_1A_SPECIAL_CASES.include? w1a - puts "Returning #{w1a} as 1a special case" - w1a - else - w1b = w1a.porter2_step1b(gb_english) ; puts "After step 1b: #{w1b} (R1 = #{w1b.porter2_r1}, R2 = #{w1b.porter2_r2})" - w1c = w1b.porter2_step1c ; puts "After step 1c: #{w1c} (R1 = #{w1c.porter2_r1}, R2 = #{w1c.porter2_r2})" - w2 = w1c.porter2_step2(gb_english) ; puts "After step 2: #{w2} (R1 = #{w2.porter2_r1}, R2 = #{w2.porter2_r2})" - w3 = w2.porter2_step3(gb_english) ; puts "After step 3: #{w3} (R1 = #{w3.porter2_r1}, R2 = #{w3.porter2_r2})" - w4 = w3.porter2_step4(gb_english) ; puts "After step 4: #{w4} (R1 = #{w4.porter2_r1}, R2 = #{w4.porter2_r2})" - w5 = w4.porter2_step5 ; puts "After step 5: #{w5}" - wpost = w5.porter2_postprocess ; puts "After postprocess: #{wpost}" - wpost - end - end - end - - alias stem porter2_stem - -end +$:.unshift File.join(File.dirname(__FILE__), "..", "lib") +require 'porter2_constants' +require 'porter2_implementation' diff --git a/lib/porter2_constants.rb b/lib/porter2_constants.rb new file mode 
100644 index 0000000..f123bc5 --- /dev/null +++ b/lib/porter2_constants.rb @@ -0,0 +1,114 @@ +# coding: utf-8 + +# Constants for the Porter 2 stemmer +module Porter2 + + # A non-vowel + C = "[^aeiouy]" + + # A vowel: a e i o u y + V = "[aeiouy]" + + # A non-vowel other than w, x, or Y + CW = "[^aeiouywxY]" + + # Doubles created when adding a suffix: these are undoubled when stemmed + Double = "(bb|dd|ff|gg|mm|nn|pp|rr|tt)" + + # A valid letter that can come before 'li' (or 'ly') + Valid_LI = "[cdeghkmnrt]" + + # A specification for a short syllable. + # + # A short syllable in a word is either: + # 1. a vowel followed by a non-vowel other than w, x or Y and preceded by a non-vowel, or + # 2. a vowel at the beginning of the word followed by a non-vowel. + # + # (The original document is silent on whether sequences of two or more non-vowels make a + # syllable long. But as this specification is only used to find sequences of non-vowel - + # vowel - non-vowel - end-of-word, this ambiguity does not have an effect.) + SHORT_SYLLABLE = "((#{C}#{V}#{CW})|(^#{V}#{C}))" + + # Suffix transformations used in porter2_step2. + # (ogi, li endings dealt with in procedure) + STEP_2_MAPS = {"tional" => "tion", + "enci" => "ence", + "anci" => "ance", + "abli" => "able", + "entli" => "ent", + "ization" => "ize", + "izer" => "ize", + "ational" => "ate", + "ation" => "ate", + "ator" => "ate", + "alism" => "al", + "aliti" => "al", + "alli" => "al", + "fulness" => "ful", + "ousli" => "ous", + "ousness" => "ous", + "iveness" => "ive", + "iviti" => "ive", + "biliti" => "ble", + "bli" => "ble", + "fulli" => "ful", + "lessli" => "less" } + + # Suffix transformations used in porter2_step3. + # (ative ending dealt with in procedure) + STEP_3_MAPS = {"tional" => "tion", + "ational" => "ate", + "alize" => "al", + "icate" => "ic", + "iciti" => "ic", + "ical" => "ic", + "ful" => "", + "ness" => "" } + + # Suffix transformations used in porter2_step4. 
+ # (ion ending dealt with in procedure) + STEP_4_MAPS = {"al" => "", + "ance" => "", + "ence" => "", + "er" => "", + "ic" => "", + "able" => "", + "ible" => "", + "ant" => "", + "ement" => "", + "ment" => "", + "ent" => "", + "ism" => "", + "ate" => "", + "iti" => "", + "ous" => "", + "ive" => "", + "ize" => "" } + + # Special-case stemmings + SPECIAL_CASES = {"skis" => "ski", + "skies" => "sky", + + "dying" => "die", + "lying" => "lie", + "tying" => "tie", + "idly" => "idl", + "gently" => "gentl", + "ugly" => "ugli", + "early" => "earli", + "only" => "onli", + "singly" =>"singl", + + "sky" => "sky", + "news" => "news", + "howe" => "howe", + "atlas" => "atlas", + "cosmos" => "cosmos", + "bias" => "bias", + "andes" => "andes" } + + # Special case words to stop processing after step 1a. + STEP_1A_SPECIAL_CASES = %w[ inning outing canning herring earring proceed exceed succeed ] + +end + diff --git a/lib/porter2_implementation.rb b/lib/porter2_implementation.rb new file mode 100644 index 0000000..906f5bd --- /dev/null +++ b/lib/porter2_implementation.rb @@ -0,0 +1,326 @@ +# coding: utf-8 + +# Implementation of the Porter 2 stemmer. String#porter2_stem is the main stemming procedure. + +class String + # Tidy up the word before we get down to the algorithm + def porter2_tidy + preword = self.to_s.strip.downcase + + # map apostrophe-like characters to apostrophes + preword.gsub!(/‘/, "'") + preword.gsub!(/’/, "'") + + preword + end + + + # Preprocess the word. + # Remove any initial ', if present. Then, set initial y, or y after a vowel, to Y + # + # (The comment to 'establish the regions R1 and R2' in the original description + # is an implementation optimisation that identifies where the regions start. As + # no modifications are made to the word that affect those positions, you may want + # to cache them now. This implementation doesn't do that.) 
+ def porter2_preprocess + w = self.dup + + # remove any initial apostrophe + w.gsub!(/^'*(.)/, '\1') + + # set initial y, or y after a vowel, to Y + w.gsub!(/^y/, "Y") + w.gsub!(/(#{Porter2::V})y/, '\1Y') + + w + end + + + # R1 is the portion of the word after the first non-vowel after the first vowel + # (with words beginning 'gener-', 'commun-', and 'arsen-' treated as special cases + def porter2_r1 + if self =~ /^(gener|commun|arsen)(?.*)/ + Regexp.last_match(:r1) + else + self =~ /#{Porter2::V}#{Porter2::C}(?.*)$/ + Regexp.last_match(:r1) || "" + end + end + + + # R2 is the portion of R1 (porter2_r1) after the first non-vowel after the first vowel + def porter2_r2 + self.porter2_r1 =~ /#{Porter2::V}#{Porter2::C}(?.*)$/ + Regexp.last_match(:r2) || "" + end + + + # Returns true if the word ends with a short syllable + def porter2_ends_with_short_syllable? + self =~ /#{Porter2::SHORT_SYLLABLE}$/ ? true : false + end + + + # A word is short if it ends in a short syllable, and R1 is null + def porter2_is_short_word? + self.porter2_ends_with_short_syllable? and self.porter2_r1.empty? + end + + + # Search for the longest among the suffixes, + # * ' + # * 's + # * 's' + # and remove if found. + def porter2_step0 + self.sub!(/(.)('s'|'s|')$/, '\1') || self + end + + + # Search for the longest among the following suffixes, and perform the action indicated. 
+ # sses:: replace by ss + # ied, ies:: replace by i if preceded by more than one letter, otherwise by ie + # s:: delete if the preceding word part contains a vowel not immediately before the s + # us, ss:: do nothing + def porter2_step1a + if self =~ /sses$/ + self.sub(/sses$/, 'ss') + elsif self =~ /..(ied|ies)$/ + self.sub(/(ied|ies)$/, 'i') + elsif self =~ /(ied|ies)$/ + self.sub(/(ied|ies)$/, 'ie') + elsif self =~ /(us|ss)$/ + self + elsif self =~ /s$/ + if self =~ /(#{Porter2::V}.+)s$/ + self.sub(/s$/, '') + else + self + end + else + self + end + end + + + # Search for the longest among the following suffixes, and perform the action indicated. + # eed, eedly:: replace by ee if the suffix is also in R1 + # ed, edly, ing, ingly:: delete if the preceding word part contains a vowel and, + # after the deletion: + # * if the word ends at, bl or iz: add e, or + # * if the word ends with a double: remove the last letter, or + # * if the word is short: add e + # + # (If gb_english is +true+, treat the 'is' suffix as 'iz' above.) + def porter2_step1b(gb_english = false) + if self =~ /(eed|eedly)$/ + if self.porter2_r1 =~ /(eed|eedly)$/ + self.sub(/(eed|eedly)$/, 'ee') + else + self + end + else + w = self.dup + if w =~ /#{Porter2::V}.*(ed|edly|ing|ingly)$/ + w.sub!(/(ed|edly|ing|ingly)$/, '') + if w =~ /(at|lb|iz)$/ + w += 'e' + elsif w =~ /is$/ and gb_english + w += 'e' + elsif w =~ /#{Porter2::Double}$/ + w.chop! + elsif w.porter2_is_short_word? + w += 'e' + end + end + w + end + end + + + # Replace a suffix of y or Y by i if it is preceded by a non-vowel which is + # not the first letter of the word. + def porter2_step1c + if self =~ /.+#{Porter2::C}(y|Y)$/ + self.sub(/(y|Y)$/, 'i') + else + self + end + end + + + # Search for the longest among the suffixes listed in the keys of Porter2::STEP_2_MAPS. + # If one is found and that suffix occurs in R1, replace it with the value + # found in STEP_2_MAPS. 
+ # + # (Suffixes 'ogi' and 'li' are treated as special cases in the procedure.) + # + # (If gb_english is +true+, replace the 'iser' and 'isation' suffixes with + # 'ise', similarly to how 'izer' and 'ization' are treated.) + def porter2_step2(gb_english = false) + r1 = self.porter2_r1 + s2m = Porter2::STEP_2_MAPS.dup + if gb_english + s2m["iser"] = "ise" + s2m["isation"] = "ise" + end + step_2_re = Regexp.union(s2m.keys.map {|r| Regexp.new(r + "$")}) + if self =~ step_2_re + if r1 =~ /#{$&}$/ + self.sub(/#{$&}$/, s2m[$&]) + else + self + end + elsif r1 =~ /li$/ and self =~ /(#{Porter2::Valid_LI})li$/ + self.sub(/li$/, '') + elsif r1 =~ /ogi$/ and self =~ /logi$/ + self.sub(/ogi$/, 'og') + else + self + end + end + + + # Search for the longest among the suffixes listed in the keys of Porter2::STEP_3_MAPS. + # If one is found and that suffix occurs in R1, replace it with the value + # found in STEP_3_MAPS. + # + # (Suffix 'ative' is treated as a special case in the procedure.) + # + # (If gb_english is +true+, replace the 'alise' suffix with + # 'al', similarly to how 'alize' is treated.) + def porter2_step3(gb_english = false) + if self =~ /ative$/ and self.porter2_r2 =~ /ative$/ + self.sub(/ative$/, '') + else + s3m = Porter2::STEP_3_MAPS.dup + if gb_english + s3m["alise"] = "al" + end + step_3_re = Regexp.union(s3m.keys.map {|r| Regexp.new(r + "$")}) + r1 = self.porter2_r1 + if self =~ step_3_re and r1 =~ /#{$&}$/ + self.sub(/#{$&}$/, s3m[$&]) + else + self + end + end + end + + + # Search for the longest among the suffixes listed in the keys of Porter2::STEP_4_MAPS. + # If one is found and that suffix occurs in R2, replace it with the value + # found in STEP_4_MAPS. + # + # (Suffix 'ion' is treated as a special case in the procedure.) + # + # (If gb_english is +true+, delete the 'ise' suffix if found.) 
+ def porter2_step4(gb_english = false) + if self.porter2_r2 =~ /ion$/ and self =~ /(s|t)ion$/ + self.sub(/ion$/, '') + else + s4m = Porter2::STEP_4_MAPS.dup + if gb_english + s4m["ise"] = "" + end + step_4_re = Regexp.union(s4m.keys.map {|r| Regexp.new(r + "$")}) + r2 = self.porter2_r2 + if self =~ step_4_re + if r2 =~ /#{$&}/ + self.sub(/#{$&}$/, s4m[$&]) + else + self + end + else + self + end + end + end + + + # Search for the the following suffixes, and, if found, perform the action indicated. + # e:: delete if in R2, or in R1 and not preceded by a short syllable + # l:: delete if in R2 and preceded by l + def porter2_step5 + if self =~ /ll$/ and self.porter2_r2 =~ /l$/ + self.sub(/ll$/, 'l') + elsif self =~ /e$/ and self.porter2_r2 =~ /e$/ + self.sub(/e$/, '') + else + r1 = self.porter2_r1 + if self =~ /e$/ and r1 =~ /e$/ and not self =~ /#{Porter2::SHORT_SYLLABLE}e$/ + self.sub(/e$/, '') + else + self + end + end + end + + + # Turn all Y letters into y + def porter2_postprocess + self.gsub(/Y/, 'y') + end + + public + + # Perform the stemming procedure. If +gb_english+ is true, treat '-ise' and similar suffixes + # as '-ize' in American English. + def porter2_stem(gb_english = false) + preword = self.porter2_tidy + return preword if preword.length <= 2 + + word = preword.porter2_preprocess + + if Porter2::SPECIAL_CASES.has_key? word + Porter2::SPECIAL_CASES[word] + else + w1a = word.porter2_step0.porter2_step1a + if Porter2::STEP_1A_SPECIAL_CASES.include? 
w1a + w1a + else + w1a.porter2_step1b(gb_english).porter2_step1c.porter2_step2(gb_english).porter2_step3(gb_english).porter2_step4(gb_english).porter2_step5.porter2_postprocess + end + end + end + + # A verbose version of porter2_stem that prints the output of each stage to STDOUT + def porter2_stem_verbose(gb_english = false) + preword = self.porter2_tidy + puts "Preword: #{preword}" + return preword if preword.length <= 2 + + word = preword.porter2_preprocess + puts "Preprocessed: #{word}" + + if Porter2::SPECIAL_CASES.has_key? word + puts "Returning #{word} as special case #{Porter2::SPECIAL_CASES[word]}" + Porter2::SPECIAL_CASES[word] + else + r1 = word.porter2_r1 + r2 = word.porter2_r2 + puts "R1 = #{r1}, R2 = #{r2}" + + w0 = word.porter2_step0 ; puts "After step 0: #{w0} (R1 = #{w0.porter2_r1}, R2 = #{w0.porter2_r2})" + w1a = w0.porter2_step1a ; puts "After step 1a: #{w1a} (R1 = #{w1a.porter2_r1}, R2 = #{w1a.porter2_r2})" + + if Porter2::STEP_1A_SPECIAL_CASES.include? w1a + puts "Returning #{w1a} as 1a special case" + w1a + else + w1b = w1a.porter2_step1b(gb_english) ; puts "After step 1b: #{w1b} (R1 = #{w1b.porter2_r1}, R2 = #{w1b.porter2_r2})" + w1c = w1b.porter2_step1c ; puts "After step 1c: #{w1c} (R1 = #{w1c.porter2_r1}, R2 = #{w1c.porter2_r2})" + w2 = w1c.porter2_step2(gb_english) ; puts "After step 2: #{w2} (R1 = #{w2.porter2_r1}, R2 = #{w2.porter2_r2})" + w3 = w2.porter2_step3(gb_english) ; puts "After step 3: #{w3} (R1 = #{w3.porter2_r1}, R2 = #{w3.porter2_r2})" + w4 = w3.porter2_step4(gb_english) ; puts "After step 4: #{w4} (R1 = #{w4.porter2_r1}, R2 = #{w4.porter2_r2})" + w5 = w4.porter2_step5 ; puts "After step 5: #{w5}" + wpost = w5.porter2_postprocess ; puts "After postprocess: #{wpost}" + wpost + end + end + end + + alias stem porter2_stem + +end + -- 2.34.1