b38016a78ee011d1f76712bc62332fe43cbfeb2b
[porter2stemmer.git] / test / tc_porter2_parts.rb
1 # coding: utf-8
2 # Porter 2 stemmer test file
3 #
4 # This file tests each stage of the stemmer individually.
5
6
7 $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
8
9 require 'test/unit'
10 require 'porter2'
11
12 #class String
13 # public :porter2_preprocess, :porter2_r1, :porter2_r2
14 #end
15
# Unit tests that exercise each stage of the Porter2 stemmer in isolation,
# through the String methods supplied by the porter2 library.
class TestPorter2 < Test::Unit::TestCase

  # Tidying is expected to strip surrounding whitespace, downcase the text and
  # turn typographic apostrophes into plain ASCII apostrophes.
  def test_tidy
    assert_equal "abacde", "abacde".porter2_tidy
    assert_equal "abacde", " abacde ".porter2_tidy
    assert_equal "abacde", "ABACDE".porter2_tidy
    assert_equal "ab'cde", "ab‘cde".porter2_tidy
    assert_equal "ab'cde", "ab’cde".porter2_tidy
    assert_equal "ab'c'de", "ab’c’de".porter2_tidy
    assert_equal "ab'c'de", "ab‘c‘de".porter2_tidy
    assert_equal "''abacde", "’‘abacde".porter2_tidy
  end

  # Preprocessing is expected to drop leading apostrophes and to mark a "y"
  # that starts the word or follows a vowel by upcasing it to "Y".
  def test_preprocess
    assert_equal "abacde", "abacde".porter2_preprocess
    assert_equal "abacde", "''abacde".porter2_preprocess
    assert_equal "ab'c'de", "'ab'c'de".porter2_preprocess
    assert_equal "ab'c'de", "''ab'c'de".porter2_preprocess
    assert_equal "Yabac", "yabac".porter2_preprocess
    assert_equal "aYbc", "aybc".porter2_preprocess
    assert_equal "abacdeY", "abacdey".porter2_preprocess
    assert_equal "abaYde", "abayde".porter2_preprocess
    assert_equal "kabaYde", "kabayde".porter2_preprocess
    assert_equal "'", "'''".porter2_preprocess
  end

  # Expected R1 regions, including words with exceptional prefixes.
  def test_find_R1
    assert_equal "iful", "beautiful".porter2_r1
    assert_equal "y", "beauty".porter2_r1
    assert_equal "", "beau".porter2_r1
    assert_equal "imadversion", "animadversion".porter2_r1
    assert_equal "kled", "sprinkled".porter2_r1
    assert_equal "harist", "eucharist".porter2_r1

    # special cases: for words starting gener-, commun- and arsen- the
    # expected R1 begins immediately after that prefix
    assert_equal "ate", "generate".porter2_r1
    assert_equal "ates", "generates".porter2_r1
    assert_equal "ated", "generated".porter2_r1
    assert_equal "al", "general".porter2_r1
    assert_equal "ally", "generally".porter2_r1
    assert_equal "ic", "generic".porter2_r1
    assert_equal "ically", "generically".porter2_r1
    assert_equal "ous", "generous".porter2_r1
    assert_equal "ously", "generously".porter2_r1

    assert_equal "al", "communal".porter2_r1
    assert_equal "ity", "community".porter2_r1
    assert_equal "e", "commune".porter2_r1

    assert_equal "ic", "arsenic".porter2_r1
    assert_equal "al", "arsenal".porter2_r1
  end

  # Words that should, and should not, be reported as ending in a short
  # syllable.
  def test_ends_with_short_syllable?
    assert_equal true, "rap".porter2_ends_with_short_syllable?
    assert_equal true, "trap".porter2_ends_with_short_syllable?
    assert_equal true, "entrap".porter2_ends_with_short_syllable?
    assert_equal true, "ow".porter2_ends_with_short_syllable?
    assert_equal true, "on".porter2_ends_with_short_syllable?
    assert_equal true, "at".porter2_ends_with_short_syllable?
    assert_equal false, "uproot".porter2_ends_with_short_syllable?
    assert_equal false, "bestow".porter2_ends_with_short_syllable?
    assert_equal false, "disturb".porter2_ends_with_short_syllable?
  end

  # Classification of sample words as short or long; the message names the
  # offending word when a classification is wrong.
  def test_is_short_word?
    short_words = %w[ bed shed shred hop ]
    long_words = %w[ bead embed beds ]
    short_words.each do |w|
      assert_equal true, w.porter2_is_short_word?,
        "#{w} should be short but classified as long"
    end
    long_words.each do |w|
      assert_equal false, w.porter2_is_short_word?,
        "#{w} should be long but classified as short"
    end
  end

  # Expected R2 regions for the same plain words used in the R1 test.
  def test_find_R2
    assert_equal "ul", "beautiful".porter2_r2
    assert_equal "", "beauty".porter2_r2
    assert_equal "", "beau".porter2_r2
    assert_equal "adversion", "animadversion".porter2_r2
    assert_equal "", "sprinkled".porter2_r2
    assert_equal "ist", "eucharist".porter2_r2
  end

  # Step 0: expectations for the trailing apostrophe forms ', 's and 's'.
  # Apostrophes inside the word are left alone; the last three cases pin the
  # results for inputs that are little more than the suffix itself.
  def test_step_0
    assert_equal "abac", "abac".step_0
    assert_equal "abac", "abac'".step_0
    assert_equal "abac", "abac's".step_0
    assert_equal "abac", "abac's'".step_0
    assert_equal "ab'c", "ab'c".step_0
    assert_equal "ab'sc", "ab'sc".step_0
    assert_equal "ab's'c", "ab's'c".step_0
    assert_equal "ab'sc", "ab'sc's".step_0
    assert_equal "'", "'".step_0
    assert_equal "'s", "'s".step_0
    assert_equal "'s", "'s'".step_0
  end

  # Step 1a: expectations for the sses, ies/ied and s endings, including
  # words that must come through unchanged.
  def test_step_1a
    assert_equal "abacde", "abacde".step_1a
    assert_equal "abacess", "abacesses".step_1a
    assert_equal "tie", "ties".step_1a
    assert_equal "tie", "tied".step_1a
    assert_equal "cri", "cries".step_1a
    assert_equal "cri", "cried".step_1a
    assert_equal "gas", "gas".step_1a
    assert_equal "this", "this".step_1a
    assert_equal "gap", "gaps".step_1a
    assert_equal "kiwi", "kiwis".step_1a
    assert_equal "abacus", "abacus".step_1a
    assert_equal "abacess", "abacess".step_1a
  end

  # Step 1b: -ed / -ing handling in default mode and in British English mode
  # (selected by passing true). The two modes are expected to differ only for
  # "atomised".
  def test_step_1b
    assert_equal "abacde", "abacde".step_1b

    # Expectations that hold in both modes.
    shared = { "luxuriated" => "luxuriate", "luxuriating" => "luxuriate",
               "hopping" => "hop", "hopped" => "hop",
               "hoped" => "hope", "hoping" => "hope",
               "atomized" => "atomize",
               "addicted" => "addict", "bleed" => "bleed" }

    words_non_gb = shared.merge("atomised" => "atomis")
    words_non_gb.each do |original, stemmed|
      # Stem once and reuse the value so the failure message reports exactly
      # what was compared.
      actual = original.step_1b
      assert_equal stemmed, actual,
        "#{original} should have stemmed to #{stemmed} but got #{actual} instead"
    end

    words_gb = shared.merge("atomised" => "atomise")
    words_gb.each do |original, stemmed|
      actual = original.step_1b(true)
      assert_equal stemmed, actual,
        "#{original} should have stemmed to #{stemmed} but got #{actual} instead"
    end
  end

  # Step 1c: "cry" is expected to become "cri", while "by" and words ending
  # in a marked "Y" are expected to stay as they are.
  def test_step_1c
    assert_equal "cri", "cry".step_1c
    assert_equal "by", "by".step_1c
    assert_equal "saY", "saY".step_1c
    assert_equal "abbeY", "abbeY".step_1c
  end

  # Step 2: suffix replacements. Passing true selects the english-gb
  # behaviour, in which -isation / -iser are treated like -ization / -izer.
  def test_step_2
    assert_equal "abac", "abac".step_2

    assert_equal "nationalize", "nationalization".step_2
    assert_equal "nationalisate", "nationalisation".step_2
    assert_equal "nationalize", "nationalization".step_2(true)
    assert_equal "nationalise", "nationalisation".step_2(true)
    # Repeat the steps to ensure that the english-gb behaviour isn't sticky
    assert_equal "nationalize", "nationalization".step_2(false)
    assert_equal "nationalisate", "nationalisation".step_2(false)
    assert_equal "nationalize", "nationalization".step_2
    assert_equal "nationalisate", "nationalisation".step_2

    assert_equal "nationalize", "nationalizer".step_2
    assert_equal "nationaliser", "nationaliser".step_2
    assert_equal "nationalize", "nationalizer".step_2(true)
    assert_equal "nationalise", "nationaliser".step_2(true)

    assert_equal "abaction", "abactional".step_2
    assert_equal "abacence", "abacenci".step_2
    assert_equal "abacance", "abacanci".step_2
    assert_equal "abacable", "abacabli".step_2
    assert_equal "abacent", "abacentli".step_2
    assert_equal "abacize", "abacizer".step_2
    assert_equal "abacize", "abacization".step_2
    assert_equal "abacate", "abacational".step_2
    assert_equal "abacate", "abacation".step_2
    assert_equal "abacate", "abacator".step_2
    assert_equal "abacal", "abacalism".step_2
    assert_equal "abacal", "abacaliti".step_2
    assert_equal "abacal", "abacalli".step_2
    assert_equal "abacful", "abacfulness".step_2
    assert_equal "abacous", "abacousli".step_2
    assert_equal "abacous", "abacousness".step_2
    assert_equal "abacive", "abaciveness".step_2
    assert_equal "abacive", "abaciviti".step_2
    assert_equal "abiliti", "abiliti".step_2
    assert_equal "abacble", "abacbiliti".step_2
    assert_equal "abacble", "abacbli".step_2
    assert_equal "abacful", "abacfulli".step_2
    assert_equal "abacless", "abaclessli".step_2
    assert_equal "abaclog", "abaclogi".step_2

    # "li" is expected to be removed after c, d, e, g, h, k, m, n, r and t,
    # but not after a.
    assert_equal "abac", "abacli".step_2
    assert_equal "abd", "abdli".step_2
    assert_equal "abe", "abeli".step_2
    assert_equal "abg", "abgli".step_2
    assert_equal "abh", "abhli".step_2
    assert_equal "abk", "abkli".step_2
    assert_equal "abm", "abmli".step_2
    assert_equal "abn", "abnli".step_2
    assert_equal "abr", "abrli".step_2
    assert_equal "abt", "abtli".step_2
    assert_equal "abali", "abali".step_2

    assert_equal "bad", "badli".step_2
    assert_equal "fluentli", "fluentli".step_2
    assert_equal "geolog", "geologi".step_2
  end

  # Step 3: suffix replacements. Passing true selects the english-gb
  # behaviour, in which -alise is treated like -alize.
  def test_step_3
    assert_equal "abac", "abac".step_3

    assert_equal "national", "nationalize".step_3
    assert_equal "nationalise", "nationalise".step_3
    assert_equal "national", "nationalise".step_3(true)
    # Repeat the steps to ensure that the english-gb behaviour isn't sticky
    assert_equal "national", "nationalize".step_3(false)
    assert_equal "nationalise", "nationalise".step_3(false)
    assert_equal "national", "nationalize".step_3
    assert_equal "nationalise", "nationalise".step_3

    assert_equal "abaction", "abactional".step_3
    assert_equal "abacate", "abacational".step_3
    assert_equal "abacic", "abacicate".step_3
    assert_equal "abacic", "abaciciti".step_3
    assert_equal "abacic", "abacical".step_3
    assert_equal "abac", "abacful".step_3
    assert_equal "abac", "abacness".step_3

    assert_equal "abacabac", "abacabacative".step_3

    assert_equal "dryness", "dryness".step_3
  end

  # Step 4: suffix removals. Passing true selects the english-gb behaviour,
  # in which -ise is treated like -ize.
  def test_step_4
    assert_equal "abac", "abac".step_4

    assert_equal "nation", "nationize".step_4
    assert_equal "nationise", "nationise".step_4
    assert_equal "nation", "nationize".step_4(true)
    assert_equal "nation", "nationise".step_4(true)
    # Repeat the steps to ensure that the english-gb behaviour isn't sticky
    assert_equal "nation", "nationize".step_4(false)
    assert_equal "nationise", "nationise".step_4(false)
    assert_equal "nation", "nationize".step_4
    assert_equal "nationise", "nationise".step_4

    assert_equal "abac", "abacal".step_4
    assert_equal "abac", "abacance".step_4
    assert_equal "abac", "abacence".step_4
    assert_equal "abac", "abacer".step_4
    assert_equal "abac", "abacic".step_4
    assert_equal "abacer", "abacerable".step_4
    assert_equal "abac", "abacible".step_4
    assert_equal "abac", "abacant".step_4
    assert_equal "abac", "abacement".step_4 # Check we handle overlapping suffixes properly
    assert_equal "abacac", "abacacement".step_4
    assert_equal "abacac", "abacacment".step_4
    assert_equal "abac", "abacment".step_4
    assert_equal "abac", "abacent".step_4
    assert_equal "abac", "abacism".step_4
    assert_equal "abac", "abacate".step_4
    assert_equal "abac", "abaciti".step_4
    assert_equal "abac", "abacous".step_4
    assert_equal "abac", "abacive".step_4
    assert_equal "abac", "abacize".step_4
    assert_equal "abacion", "abacion".step_4
    assert_equal "abacs", "abacsion".step_4
    assert_equal "abact", "abaction".step_4
    assert_equal "abction", "abction".step_4
    assert_equal "ablut", "ablution".step_4
    assert_equal "agreement", "agreement".step_4

    assert_equal "abcal", "abcal".step_4 # No removal if suffix isn't in R2
  end

  # Step 5: expectations for dropping a final "l" or "e", including words
  # that must come through unchanged.
  def test_step_5
    assert_equal "abac", "abac".step_5

    assert_equal "abacl", "abacll".step_5
    assert_equal "abcll", "abcll".step_5

    assert_equal "abc", "abc".step_5
    assert_equal "abl", "able".step_5
    assert_equal "abe", "abe".step_5
    assert_equal "abac", "abace".step_5
    assert_equal "bawac", "bawace".step_5
  end

  # Postprocessing is expected to turn every marked "Y" back into "y".
  def test_porter2_postprocess
    assert_equal "abac", "abac".porter2_postprocess
    assert_equal "abacy", "abacy".porter2_postprocess
    assert_equal "abacy", "abacY".porter2_postprocess
    assert_equal "aybcy", "aYbcY".porter2_postprocess
    assert_equal "aybcy", "aYbcy".porter2_postprocess
  end

end