Got the gem layout working
[porter2stemmer.git] / lib / porter2.rb~
1 #! /local/ruby/bin/ruby
2 # coding: utf-8
3
4 # Porter stemmer in Ruby.
5 #
6 # This is the Porter 2 stemming algorithm, as described at
7 # http://snowball.tartarus.org/algorithms/english/stemmer.html
8 # The original paper is:
9 #
# Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
# no. 3, pp 130-137.
12
# Porter 2 (Snowball "English") stemmer, written as a mixin for String.
#
# Every step_* method takes the receiver as the current form of the word and
# returns the next form; none of them modifies the receiver.  The regions R1
# and R2 are always derived from the current form of the word.  Because the
# steps only rewrite the end of a word, that yields the same regions as
# fixing them once on the original word.
module Stemmable

  # Regular-expression fragments (kept as strings so they can be combined).
  C = "[^aeiouy]".freeze           # consonant
  V = "[aeiouy]".freeze            # vowel
  CW = "[^aeiouywxY]".freeze       # a non-vowel other than w, x, or Y
  Double = "bb|dd|ff|gg|mm|nn|pp|rr|tt".freeze
  Valid_LI = "[cdeghkmnrt]".freeze
  SHORT_SYLLABLE = "((#{C}#{V}#{CW})|(^#{V}#{C}))".freeze

  # Step 2 suffixes and their replacements.
  # The bare "li" ending is dealt with in the procedure.
  STEP_2_MAPS = {"tional" => "tion",
                 "enci" => "ence",
                 "anci" => "ance",
                 "abli" => "able",
                 "entli" => "ent",
                 "ization" => "ize",
                 "izer" => "ize",
                 "ational" => "ate",
                 "ation" => "ate",
                 "ator" => "ate",
                 "alism" => "al",
                 "aliti" => "al",
                 "alli" => "al",
                 "fulness" => "ful",
                 "ousli" => "ous",
                 "ousness" => "ous",
                 "iveness" => "ive",
                 "iviti" => "ive",
                 "biliti" => "ble",
                 "bli" => "ble",
                 "fulli" => "ful",
                 "lessli" => "less",
                 "logi" => "log" }.freeze

  # Step 3 suffixes and their replacements.
  # The "ative" ending is dealt with in the procedure.
  STEP_3_MAPS = {"tional" => "tion",
                 "ational" => "ate",
                 "alize" => "al",
                 "icate" => "ic",
                 "iciti" => "ic",
                 "ical" => "ic",
                 "ful" => "",
                 "ness" => "" }.freeze

  # Step 4 suffixes.  A non-empty value is the leading part of the suffix
  # that is kept ("sion" => "s" means: delete "ion" when preceded by "s").
  STEP_4_MAPS = {"al" => "",
                 "ance" => "",
                 "ence" => "",
                 "er" => "",
                 "ic" => "",
                 "able" => "",
                 "ible" => "",
                 "ant" => "",
                 "ement" => "",
                 "ment" => "",
                 "ent" => "",
                 "ism" => "",
                 "ate" => "",
                 "iti" => "",
                 "ous" => "",
                 "ive" => "",
                 "ize" => "",
                 "sion" => "s",
                 "tion" => "t" }.freeze

  # Whole words whose stem is given directly.
  SPECIAL_CASES = {"skis" => "ski",
                   "skies" => "sky",

                   "dying" => "die",
                   "lying" => "lie",
                   "tying" => "tie",
                   "idly" => "idl",
                   "gently" => "gentl",
                   "ugly" => "ugli",
                   "early" => "earli",
                   "only" => "onli",
                   "singly" => "singl",

                   "sky" => "sky",
                   "news" => "news",
                   "howe" => "howe",
                   "atlas" => "atlas",
                   "cosmos" => "cosmos",
                   "bias" => "bias",
                   "andes" => "andes" }.freeze

  # Words that are left untouched once step 1a has run.
  STEP_1A_SPECIAL_CASES = %w[ inning outing canning herring earring proceed exceed succeed ].freeze

  # Precompiled patterns built from the fragments above.
  R1_PREFIX_RE = /^(?:gener|commun|arsen)(?<rest>.*)/
  VOWEL_CONSONANT_RE = /#{V}#{C}(?<rest>.*)$/
  VOWEL_RE = /#{V}/
  ENDS_SHORT_SYLLABLE_RE = /#{SHORT_SYLLABLE}$/
  # The group matters: without it only the last alternative is anchored.
  ENDS_DOUBLE_RE = /(?:#{Double})$/
  ENDS_VALID_LI_RE = /#{Valid_LI}$/
  # y or Y preceded by a non-vowel that is not the first letter of the word.
  STEP_1C_RE = /.+#{C}[yY]$/


  # Normalise the receiver before stemming: strip surrounding whitespace,
  # downcase, map curly apostrophes to ', drop leading apostrophes and mark
  # consonantal y (initial, or following a vowel) as Y.
  # Returns a new String.
  def porter2_preprocess
    word = to_s.strip.downcase

    # map apostrophe-like characters to apostrophes
    word = word.gsub(/[‘’]/, "'")

    # remove any initial apostrophe
    word = word.sub(/^'+/, "")

    # set initial y, or y after a vowel, to Y
    word = word.sub(/^y/, "Y")
    word.gsub(/(#{V})y/, '\1Y')
  end

  # R1: the part of the word after the first non-vowel that follows a vowel
  # (or after one of the prefixes gener/commun/arsen).  "" when there is none.
  def porter2_r1
    found = R1_PREFIX_RE.match(self) || VOWEL_CONSONANT_RE.match(self)
    found ? found[:rest] : ""
  end

  # R2: the part of R1 after the first non-vowel that follows a vowel in R1.
  # "" when there is none.
  def porter2_r2
    found = VOWEL_CONSONANT_RE.match(porter2_r1)
    found ? found[:rest] : ""
  end

  # True when the receiver ends in a short syllable.
  def porter2_ends_with_short_syllable?
    (self =~ ENDS_SHORT_SYLLABLE_RE) ? true : false
  end

  # True when the receiver ends in a short syllable and R1 is empty.
  # r1:: the R1 region of the receiver; computed when omitted.
  def porter2_is_short_word?(r1 = porter2_r1)
    porter2_ends_with_short_syllable? && r1.empty?
  end

  # The longest member of +suffixes+ that the receiver ends with, or nil.
  def porter2_longest_suffix(suffixes)
    suffixes.select { |suffix| end_with?(suffix) }.max_by(&:length)
  end

  # Step 0: remove a trailing ', 's or 's'.
  def step_0
    sub(/('s'|'s|')$/, '')
  end

  # Step 1a: remove plural-like suffixes.
  #   sses     -> ss
  #   ies, ied -> i when preceded by more than one letter, otherwise ie
  #   us, ss   -> unchanged
  #   s        -> removed when a vowel occurs somewhere before the letter
  #               that immediately precedes it (gaps -> gap, gas -> gas)
  def step_1a
    if end_with?("sses")
      self[0...-2]
    elsif end_with?("ies", "ied")
      stem = self[0...-3]
      stem + (stem.length > 1 ? "i" : "ie")
    elsif end_with?("us", "ss")
      self
    elsif end_with?("s") && self[0...-2] =~ VOWEL_RE
      self[0...-1]
    else
      self
    end
  end

  # Step 1b: handle eed/eedly and ed/edly/ing/ingly.
  # r1::         the R1 region of the receiver; computed when nil.
  # gb_english:: also restore the e of an "is" stem (e.g. "ising" -> "ise").
  def step_1b(r1 = nil, gb_english = false)
    r1 ||= porter2_r1

    # eed/eedly is the longest match for such words, so they never fall
    # through to the ed/edly rule even when the suffix lies outside R1.
    eed_suffix = porter2_longest_suffix(%w[eedly eed])
    if eed_suffix
      return self unless r1.end_with?(eed_suffix)
      return self[0...-eed_suffix.length] + "ee"
    end

    suffix = porter2_longest_suffix(%w[ingly edly ing ed])
    return self unless suffix

    stem = self[0...-suffix.length]
    # the suffix is only removed when a vowel precedes it
    return self unless stem =~ VOWEL_RE

    if stem.end_with?("at", "bl", "iz") || (gb_english && stem.end_with?("is"))
      stem + "e"
    elsif stem =~ ENDS_DOUBLE_RE
      stem.chop
    elsif stem.porter2_is_short_word?
      stem + "e"
    else
      stem
    end
  end

  # Step 1c: replace a final y or Y by i when it is preceded by a non-vowel
  # that is not the first letter of the word (cry -> cri, by -> by,
  # say -> say).
  def step_1c
    self =~ STEP_1C_RE ? self[0...-1] + "i" : self
  end


  # Step 2: find the longest suffix among STEP_2_MAPS and "li"; act on it
  # only when it lies in R1.
  # gb_english:: also treat iser/isation like izer/ization.
  def step_2(gb_english = false)
    maps = STEP_2_MAPS
    maps = maps.merge("iser" => "ise", "isation" => "ise") if gb_english

    suffix = porter2_longest_suffix(maps.keys + ["li"])
    return self unless suffix

    r1 = porter2_r1
    stem = self[0...-suffix.length]
    case suffix
    when "li"
      # deleted only after a valid li-ending
      (r1.end_with?("li") && stem =~ ENDS_VALID_LI_RE) ? stem : self
    when "logi"
      # the rule is "ogi -> og when preceded by l": only "ogi" must be in R1
      r1.end_with?("ogi") ? stem + maps[suffix] : self
    else
      r1.end_with?(suffix) ? stem + maps[suffix] : self
    end
  end


  # Step 3: find the longest suffix among STEP_3_MAPS and "ative"; act on it
  # only when it lies in R1 ("ative" must lie in R2).
  # r2::         the R2 region of the receiver; computed when nil.
  # gb_english:: also treat alise like alize.
  def step_3(r2 = nil, gb_english = false)
    maps = gb_english ? STEP_3_MAPS.merge("alise" => "al") : STEP_3_MAPS

    suffix = porter2_longest_suffix(maps.keys + ["ative"])
    return self unless suffix

    stem = self[0...-suffix.length]
    if suffix == "ative"
      (r2 || porter2_r2).end_with?("ative") ? stem : self
    else
      porter2_r1.end_with?(suffix) ? stem + maps[suffix] : self
    end
  end


  # Step 4: find the longest suffix among STEP_4_MAPS and delete it when it
  # lies in R2.  Only the longest matching suffix is considered; a shorter
  # one is not tried when the longest fails the R2 test.
  # r2::         the R2 region of the receiver; computed when nil.
  # gb_english:: also treat ise like ize.
  def step_4(r2 = nil, gb_english = false)
    maps = gb_english ? STEP_4_MAPS.merge("ise" => "") : STEP_4_MAPS

    suffix = porter2_longest_suffix(maps.keys)
    return self unless suffix

    r2 ||= porter2_r2
    # Every replacement is a leading part of its suffix; what follows it is
    # the text actually deleted ("ion" for sion/tion) and is what must be
    # in R2.
    removed = suffix[maps[suffix].length..-1]
    r2.end_with?(removed) ? self[0...-removed.length] : self
  end


  # Step 5: delete a final l when it is in R2 and preceded by l; delete a
  # final e when it is in R2, or in R1 and not preceded by a short syllable.
  # r1, r2:: the regions of the receiver; computed when nil.
  def step_5(r1 = nil, r2 = nil)
    r1 ||= porter2_r1
    r2 ||= porter2_r2

    if end_with?("ll")
      r2.end_with?("l") ? chop : self
    elsif end_with?("e")
      stem = chop
      if r2.end_with?("e") ||
         (r1.end_with?("e") && !stem.porter2_ends_with_short_syllable?)
        stem
      else
        self
      end
    else
      self
    end
  end


  # Turn the Y markers set by porter2_preprocess back into y.
  def porter2_postprocess
    tr("Y", "y")
  end


  # Return the Porter 2 stem of the receiver as a new String.
  # gb_english:: also handle the British -ise/-isation/-iser spellings.
  def porter2_stem(gb_english = false)
    word = porter2_preprocess

    # words of one or two letters are left as they are
    return word.porter2_postprocess if word.length <= 2
    # hand back a copy so callers cannot alter the table entry
    return SPECIAL_CASES[word].dup if SPECIAL_CASES.key?(word)

    word = word.step_0.step_1a
    unless STEP_1A_SPECIAL_CASES.include?(word)
      word = word.step_1b(nil, gb_english).
                  step_1c.
                  step_2(gb_english).
                  step_3(nil, gb_english).
                  step_4(nil, gb_english).
                  step_5
    end
    word.porter2_postprocess
  end


  alias stem porter2_stem

end
277
278 # Add stem method to all Strings
class String
  # Mixes the stemmer into every String: String#stem, String#porter2_stem
  # and the individual step_* and porter2_* helper methods.
  #
  # The helpers are deliberately left public.  The stemmer calls
  # porter2_r1 / porter2_r2 on intermediate strings (an explicit receiver
  # other than self), which Ruby rejects with NoMethodError when those
  # methods are private.
  include Stemmable
end