X-Git-Url: https://git.njae.me.uk/?a=blobdiff_plain;ds=inline;f=doc%2FString.html;h=f04ae9aa7771b3591944e48aa30427256ac4f35d;hb=49dc03eafbc7aa52de0caa3b3c0d5b5f25ce311d;hp=e0921af7f555c9e878e958a8eb0df0b0bf313e72;hpb=a222cc928502d3bb6ecd3d03c532ce7f9804acb4;p=porter2stemmer.git diff --git a/doc/String.html b/doc/String.html index e0921af..f04ae9a 100644 --- a/doc/String.html +++ b/doc/String.html @@ -38,8 +38,8 @@
@@ -116,6 +116,15 @@-This is the Porter 2 stemming algorithm, as described at snowball.tartarus.org/algorithms/english/stemmer.html -The original paper is: -
--Porter, 1980, “An algorithm for suffix stripping”, -Program, Vol. 14, no. 3, pp 130-137 -
--Constants for the stemmer are in the Porter2 -module. -
--Procedures that implement the stemmer are added to the String class. -
--The stemmer algorithm is implemented in the porter2_stem procedure. -
--There isn’t much, as this is a stemmer that only works for English. -
--The gb_english flag to the various procedures allows the stemmer -to treat the British English ’-ise’ the same as the American -English ’-ize’. -
-Several places in the algorithm require matching the longest suffix of a -word. The regexp engine in Ruby 1.9 seems to handle alternatives in regexps -by finding the alternative that matches at the first position in the -string. As we’re only talking about suffixes, that first match is -also the longest suffix. If the regexp engine changes, this behaviour may -change and break the stemmer. +
+Implementation of the Porter 2 stemmer. String#porter2_stem is the +main stemming procedure.
- # File lib/porter2.rb, line 87 -87: def porter2_ends_with_short_syllable? -88: self =~ /#{Porter2::SHORT_SYLLABLE}$/ ? true : false -89: end+ # File lib/porter2_implementation.rb, line 59 +59: def porter2_ends_with_short_syllable? +60: self =~ /#{Porter2::SHORT_SYLLABLE}$/ ? true : false +61: end
- # File lib/porter2.rb, line 93 -93: def porter2_is_short_word? -94: self.porter2_ends_with_short_syllable? and self.porter2_r1.empty? -95: end+ # File lib/porter2_implementation.rb, line 65 +65: def porter2_is_short_word? +66: self.porter2_ends_with_short_syllable? and self.porter2_r1.empty? +67: end
- # File lib/porter2.rb, line 289 -289: def porter2_postprocess -290: self.gsub(/Y/, 'y') -291: end+ # File lib/porter2_implementation.rb, line 261 +261: def porter2_postprocess +262: self.gsub(/Y/, 'y') +263: end
- # File lib/porter2.rb, line 53 -53: def porter2_preprocess -54: w = self.dup -55: -56: # remove any initial apostrophe -57: w.gsub!(/^'*(.)/, '\1') -58: -59: # set initial y, or y after a vowel, to Y -60: w.gsub!(/^y/, "Y") -61: w.gsub!(/(#{Porter2::V})y/, '\1Y') -62: -63: w -64: end+ # File lib/porter2_implementation.rb, line 25 +25: def porter2_preprocess +26: w = self.dup +27: +28: # remove any initial apostrophe +29: w.gsub!(/^'*(.)/, '\1') +30: +31: # set initial y, or y after a vowel, to Y +32: w.gsub!(/^y/, "Y") +33: w.gsub!(/(#{Porter2::V})y/, '\1Y') +34: +35: w +36: end
- # File lib/porter2.rb, line 69 -69: def porter2_r1 -70: if self =~ /^(gener|commun|arsen)(?<r1>.*)/ -71: Regexp.last_match(:r1) -72: else -73: self =~ /#{Porter2::V}#{Porter2::C}(?<r1>.*)$/ -74: Regexp.last_match(:r1) || "" -75: end -76: end+ # File lib/porter2_implementation.rb, line 41 +41: def porter2_r1 +42: if self =~ /^(gener|commun|arsen)(?<r1>.*)/ +43: Regexp.last_match(:r1) +44: else +45: self =~ /#{Porter2::V}#{Porter2::C}(?<r1>.*)$/ +46: Regexp.last_match(:r1) || "" +47: end +48: end
- # File lib/porter2.rb, line 80 -80: def porter2_r2 -81: self.porter2_r1 =~ /#{Porter2::V}#{Porter2::C}(?<r2>.*)$/ -82: Regexp.last_match(:r2) || "" -83: end+ # File lib/porter2_implementation.rb, line 52 +52: def porter2_r2 +53: self.porter2_r1 =~ /#{Porter2::V}#{Porter2::C}(?<r2>.*)$/ +54: Regexp.last_match(:r2) || "" +55: end
- # File lib/porter2.rb, line 297 -297: def porter2_stem(gb_english = false) -298: preword = self.porter2_tidy -299: return preword if preword.length <= 2 -300: -301: word = preword.porter2_preprocess -302: -303: if Porter2::SPECIAL_CASES.has_key? word -304: Porter2::SPECIAL_CASES[word] -305: else -306: w1a = word.porter2_step0.porter2_step1a -307: if Porter2::STEP_1A_SPECIAL_CASES.include? w1a -308: w1a -309: else -310: w1a.porter2_step1b(gb_english).porter2_step1c.porter2_step2(gb_english).porter2_step3(gb_english).porter2_step4(gb_english).porter2_step5.porter2_postprocess -311: end -312: end -313: end+ # File lib/porter2_implementation.rb, line 269 +269: def porter2_stem(gb_english = false) +270: preword = self.porter2_tidy +271: return preword if preword.length <= 2 +272: +273: word = preword.porter2_preprocess +274: +275: if Porter2::SPECIAL_CASES.has_key? word +276: Porter2::SPECIAL_CASES[word] +277: else +278: w1a = word.porter2_step0.porter2_step1a +279: if Porter2::STEP_1A_SPECIAL_CASES.include? w1a +280: w1a +281: else +282: w1a.porter2_step1b(gb_english).porter2_step1c.porter2_step2(gb_english).porter2_step3(gb_english).porter2_step4(gb_english).porter2_step5.porter2_postprocess +283: end +284: end +285: end
- # File lib/porter2.rb, line 316 -316: def porter2_stem_verbose(gb_english = false) -317: preword = self.porter2_tidy -318: puts "Preword: #{preword}" -319: return preword if preword.length <= 2 -320: -321: word = preword.porter2_preprocess -322: puts "Preprocessed: #{word}" -323: -324: if Porter2::SPECIAL_CASES.has_key? word -325: puts "Returning #{word} as special case #{Porter2::SPECIAL_CASES[word]}" -326: Porter2::SPECIAL_CASES[word] -327: else -328: r1 = word.porter2_r1 -329: r2 = word.porter2_r2 -330: puts "R1 = #{r1}, R2 = #{r2}" -331: -332: w0 = word.porter2_step0 ; puts "After step 0: #{w0} (R1 = #{w0.porter2_r1}, R2 = #{w0.porter2_r2})" -333: w1a = w0.porter2_step1a ; puts "After step 1a: #{w1a} (R1 = #{w1a.porter2_r1}, R2 = #{w1a.porter2_r2})" -334: -335: if Porter2::STEP_1A_SPECIAL_CASES.include? w1a -336: puts "Returning #{w1a} as 1a special case" -337: w1a -338: else -339: w1b = w1a.porter2_step1b(gb_english) ; puts "After step 1b: #{w1b} (R1 = #{w1b.porter2_r1}, R2 = #{w1b.porter2_r2})" -340: w1c = w1b.porter2_step1c ; puts "After step 1c: #{w1c} (R1 = #{w1c.porter2_r1}, R2 = #{w1c.porter2_r2})" -341: w2 = w1c.porter2_step2(gb_english) ; puts "After step 2: #{w2} (R1 = #{w2.porter2_r1}, R2 = #{w2.porter2_r2})" -342: w3 = w2.porter2_step3(gb_english) ; puts "After step 3: #{w3} (R1 = #{w3.porter2_r1}, R2 = #{w3.porter2_r2})" -343: w4 = w3.porter2_step4(gb_english) ; puts "After step 4: #{w4} (R1 = #{w4.porter2_r1}, R2 = #{w4.porter2_r2})" -344: w5 = w4.porter2_step5 ; puts "After step 5: #{w5}" -345: wpost = w5.porter2_postprocess ; puts "After postprocess: #{wpost}" -346: wpost -347: end -348: end -349: end+ # File lib/porter2_implementation.rb, line 288 +288: def porter2_stem_verbose(gb_english = false) +289: preword = self.porter2_tidy +290: puts "Preword: #{preword}" +291: return preword if preword.length <= 2 +292: +293: word = preword.porter2_preprocess +294: puts "Preprocessed: #{word}" +295: +296: if Porter2::SPECIAL_CASES.has_key? 
word +297: puts "Returning #{word} as special case #{Porter2::SPECIAL_CASES[word]}" +298: Porter2::SPECIAL_CASES[word] +299: else +300: r1 = word.porter2_r1 +301: r2 = word.porter2_r2 +302: puts "R1 = #{r1}, R2 = #{r2}" +303: +304: w0 = word.porter2_step0 ; puts "After step 0: #{w0} (R1 = #{w0.porter2_r1}, R2 = #{w0.porter2_r2})" +305: w1a = w0.porter2_step1a ; puts "After step 1a: #{w1a} (R1 = #{w1a.porter2_r1}, R2 = #{w1a.porter2_r2})" +306: +307: if Porter2::STEP_1A_SPECIAL_CASES.include? w1a +308: puts "Returning #{w1a} as 1a special case" +309: w1a +310: else +311: w1b = w1a.porter2_step1b(gb_english) ; puts "After step 1b: #{w1b} (R1 = #{w1b.porter2_r1}, R2 = #{w1b.porter2_r2})" +312: w1c = w1b.porter2_step1c ; puts "After step 1c: #{w1c} (R1 = #{w1c.porter2_r1}, R2 = #{w1c.porter2_r2})" +313: w2 = w1c.porter2_step2(gb_english) ; puts "After step 2: #{w2} (R1 = #{w2.porter2_r1}, R2 = #{w2.porter2_r2})" +314: w3 = w2.porter2_step3(gb_english) ; puts "After step 3: #{w3} (R1 = #{w3.porter2_r1}, R2 = #{w3.porter2_r2})" +315: w4 = w3.porter2_step4(gb_english) ; puts "After step 4: #{w4} (R1 = #{w4.porter2_r1}, R2 = #{w4.porter2_r2})" +316: w5 = w4.porter2_step5 ; puts "After step 5: #{w5}" +317: wpost = w5.porter2_postprocess ; puts "After postprocess: #{wpost}" +318: wpost +319: end +320: end +321: end
- # File lib/porter2.rb, line 103 -103: def porter2_step0 -104: self.sub!(/(.)('s'|'s|')$/, '\1') || self -105: end+ # File lib/porter2_implementation.rb, line 75 +75: def porter2_step0 +76: self.sub!(/(.)('s'|'s|')$/, '\1') || self +77: end
- # File lib/porter2.rb, line 113 -113: def porter2_step1a -114: if self =~ /sses$/ -115: self.sub(/sses$/, 'ss') -116: elsif self =~ /..(ied|ies)$/ -117: self.sub(/(ied|ies)$/, 'i') -118: elsif self =~ /(ied|ies)$/ -119: self.sub(/(ied|ies)$/, 'ie') -120: elsif self =~ /(us|ss)$/ -121: self -122: elsif self =~ /s$/ -123: if self =~ /(#{Porter2::V}.+)s$/ -124: self.sub(/s$/, '') -125: else -126: self -127: end -128: else -129: self -130: end -131: end+ # File lib/porter2_implementation.rb, line 85 + 85: def porter2_step1a + 86: if self =~ /sses$/ + 87: self.sub(/sses$/, 'ss') + 88: elsif self =~ /..(ied|ies)$/ + 89: self.sub(/(ied|ies)$/, 'i') + 90: elsif self =~ /(ied|ies)$/ + 91: self.sub(/(ied|ies)$/, 'ie') + 92: elsif self =~ /(us|ss)$/ + 93: self + 94: elsif self =~ /s$/ + 95: if self =~ /(#{Porter2::V}.+)s$/ + 96: self.sub(/s$/, '') + 97: else + 98: self + 99: end +100: else +101: self +102: end +103: end
- # File lib/porter2.rb, line 143 -143: def porter2_step1b(gb_english = false) -144: if self =~ /(eed|eedly)$/ -145: if self.porter2_r1 =~ /(eed|eedly)$/ -146: self.sub(/(eed|eedly)$/, 'ee') -147: else -148: self -149: end -150: else -151: w = self.dup -152: if w =~ /#{Porter2::V}.*(ed|edly|ing|ingly)$/ -153: w.sub!(/(ed|edly|ing|ingly)$/, '') -154: if w =~ /(at|lb|iz)$/ -155: w += 'e' -156: elsif w =~ /is$/ and gb_english -157: w += 'e' -158: elsif w =~ /#{Porter2::Double}$/ -159: w.chop! -160: elsif w.porter2_is_short_word? -161: w += 'e' -162: end -163: end -164: w -165: end -166: end+ # File lib/porter2_implementation.rb, line 115 +115: def porter2_step1b(gb_english = false) +116: if self =~ /(eed|eedly)$/ +117: if self.porter2_r1 =~ /(eed|eedly)$/ +118: self.sub(/(eed|eedly)$/, 'ee') +119: else +120: self +121: end +122: else +123: w = self.dup +124: if w =~ /#{Porter2::V}.*(ed|edly|ing|ingly)$/ +125: w.sub!(/(ed|edly|ing|ingly)$/, '') +126: if w =~ /(at|lb|iz)$/ +127: w += 'e' +128: elsif w =~ /is$/ and gb_english +129: w += 'e' +130: elsif w =~ /#{Porter2::Double}$/ +131: w.chop! +132: elsif w.porter2_is_short_word? +133: w += 'e' +134: end +135: end +136: w +137: end +138: end
- # File lib/porter2.rb, line 171 -171: def porter2_step1c -172: if self =~ /.+#{Porter2::C}(y|Y)$/ -173: self.sub(/(y|Y)$/, 'i') -174: else -175: self -176: end -177: end+ # File lib/porter2_implementation.rb, line 143 +143: def porter2_step1c +144: if self =~ /.+#{Porter2::C}(y|Y)$/ +145: self.sub(/(y|Y)$/, 'i') +146: else +147: self +148: end +149: end
- # File lib/porter2.rb, line 188 -188: def porter2_step2(gb_english = false) -189: r1 = self.porter2_r1 -190: s2m = Porter2::STEP_2_MAPS.dup -191: if gb_english -192: s2m["iser"] = "ise" -193: s2m["isation"] = "ise" -194: end -195: step_2_re = Regexp.union(s2m.keys.map {|r| Regexp.new(r + "$")}) -196: if self =~ step_2_re -197: if r1 =~ /#{$&}$/ -198: self.sub(/#{$&}$/, s2m[$&]) -199: else -200: self -201: end -202: elsif r1 =~ /li$/ and self =~ /(#{Porter2::Valid_LI})li$/ -203: self.sub(/li$/, '') -204: elsif r1 =~ /ogi$/ and self =~ /logi$/ -205: self.sub(/ogi$/, 'og') -206: else -207: self -208: end -209: end+ # File lib/porter2_implementation.rb, line 160 +160: def porter2_step2(gb_english = false) +161: r1 = self.porter2_r1 +162: s2m = Porter2::STEP_2_MAPS.dup +163: if gb_english +164: s2m["iser"] = "ise" +165: s2m["isation"] = "ise" +166: end +167: step_2_re = Regexp.union(s2m.keys.map {|r| Regexp.new(r + "$")}) +168: if self =~ step_2_re +169: if r1 =~ /#{$&}$/ +170: self.sub(/#{$&}$/, s2m[$&]) +171: else +172: self +173: end +174: elsif r1 =~ /li$/ and self =~ /(#{Porter2::Valid_LI})li$/ +175: self.sub(/li$/, '') +176: elsif r1 =~ /ogi$/ and self =~ /logi$/ +177: self.sub(/ogi$/, 'og') +178: else +179: self +180: end +181: end
- # File lib/porter2.rb, line 220 -220: def porter2_step3(gb_english = false) -221: if self =~ /ative$/ and self.porter2_r2 =~ /ative$/ -222: self.sub(/ative$/, '') -223: else -224: s3m = Porter2::STEP_3_MAPS.dup -225: if gb_english -226: s3m["alise"] = "al" -227: end -228: step_3_re = Regexp.union(s3m.keys.map {|r| Regexp.new(r + "$")}) -229: r1 = self.porter2_r1 -230: if self =~ step_3_re and r1 =~ /#{$&}$/ -231: self.sub(/#{$&}$/, s3m[$&]) -232: else -233: self -234: end -235: end -236: end+ # File lib/porter2_implementation.rb, line 192 +192: def porter2_step3(gb_english = false) +193: if self =~ /ative$/ and self.porter2_r2 =~ /ative$/ +194: self.sub(/ative$/, '') +195: else +196: s3m = Porter2::STEP_3_MAPS.dup +197: if gb_english +198: s3m["alise"] = "al" +199: end +200: step_3_re = Regexp.union(s3m.keys.map {|r| Regexp.new(r + "$")}) +201: r1 = self.porter2_r1 +202: if self =~ step_3_re and r1 =~ /#{$&}$/ +203: self.sub(/#{$&}$/, s3m[$&]) +204: else +205: self +206: end +207: end +208: end
- # File lib/porter2.rb, line 246 -246: def porter2_step4(gb_english = false) -247: if self.porter2_r2 =~ /ion$/ and self =~ /(s|t)ion$/ -248: self.sub(/ion$/, '') -249: else -250: s4m = Porter2::STEP_4_MAPS.dup -251: if gb_english -252: s4m["ise"] = "" -253: end -254: step_4_re = Regexp.union(s4m.keys.map {|r| Regexp.new(r + "$")}) -255: r2 = self.porter2_r2 -256: if self =~ step_4_re -257: if r2 =~ /#{$&}/ -258: self.sub(/#{$&}$/, s4m[$&]) -259: else -260: self -261: end -262: else -263: self -264: end -265: end -266: end+ # File lib/porter2_implementation.rb, line 218 +218: def porter2_step4(gb_english = false) +219: if self.porter2_r2 =~ /ion$/ and self =~ /(s|t)ion$/ +220: self.sub(/ion$/, '') +221: else +222: s4m = Porter2::STEP_4_MAPS.dup +223: if gb_english +224: s4m["ise"] = "" +225: end +226: step_4_re = Regexp.union(s4m.keys.map {|r| Regexp.new(r + "$")}) +227: r2 = self.porter2_r2 +228: if self =~ step_4_re +229: if r2 =~ /#{$&}/ +230: self.sub(/#{$&}$/, s4m[$&]) +231: else +232: self +233: end +234: else +235: self +236: end +237: end +238: end
- # File lib/porter2.rb, line 272 -272: def porter2_step5 -273: if self =~ /ll$/ and self.porter2_r2 =~ /l$/ -274: self.sub(/ll$/, 'l') -275: elsif self =~ /e$/ and self.porter2_r2 =~ /e$/ -276: self.sub(/e$/, '') -277: else -278: r1 = self.porter2_r1 -279: if self =~ /e$/ and r1 =~ /e$/ and not self =~ /#{Porter2::SHORT_SYLLABLE}e$/ -280: self.sub(/e$/, '') -281: else -282: self -283: end -284: end -285: end+ # File lib/porter2_implementation.rb, line 244 +244: def porter2_step5 +245: if self =~ /ll$/ and self.porter2_r2 =~ /l$/ +246: self.sub(/ll$/, 'l') +247: elsif self =~ /e$/ and self.porter2_r2 =~ /e$/ +248: self.sub(/e$/, '') +249: else +250: r1 = self.porter2_r1 +251: if self =~ /e$/ and r1 =~ /e$/ and not self =~ /#{Porter2::SHORT_SYLLABLE}e$/ +252: self.sub(/e$/, '') +253: else +254: self +255: end +256: end +257: end
- # File lib/porter2.rb, line 35 -35: def porter2_tidy -36: preword = self.to_s.strip.downcase -37: -38: # map apostrophe-like characters to apostrophes -39: preword.gsub!(/‘/, "'") -40: preword.gsub!(/’/, "'") -41: -42: preword -43: end+ # File lib/porter2_implementation.rb, line 7 + 7: def porter2_tidy + 8: preword = self.to_s.strip.downcase + 9: +10: # map apostrophe-like characters to apostrophes +11: preword.gsub!(/‘/, "'") +12: preword.gsub!(/’/, "'") +13: +14: preword +15: end