# Porter 2 stemmer test file
#
# This file tests each stage of the stemmer individually.
# Put the sibling "lib" directory at the front of the load path so that
# later requires resolve against the code in this checkout first.
$LOAD_PATH.unshift File.join(File.dirname(__FILE__), "..", "lib")
11 class TestPorter2
< Test
::Unit::TestCase
# porter2_tidy expectations: surrounding whitespace is removed, the word is
# lower-cased, and typographic apostrophes (‘ and ’) come back as a plain "'".
# NOTE(review): the enclosing test method's def/end lines are not visible in
# this view -- confirm they are present in the full file.
assert_equal "abacde", "abacde".porter2_tidy
assert_equal "abacde", " abacde ".porter2_tidy
assert_equal "abacde", "ABACDE".porter2_tidy
assert_equal "ab'cde", "ab‘cde".porter2_tidy
assert_equal "ab'cde", "ab’cde".porter2_tidy
assert_equal "ab'c'de", "ab’c’de".porter2_tidy
assert_equal "ab'c'de", "ab‘c‘de".porter2_tidy
assert_equal "''abacde", "’‘abacde".porter2_tidy
# porter2_preprocess expectations, as pinned by the cases below: leading
# apostrophes are dropped, and a "y" at the start of the word or directly
# after a vowel is returned as "Y" (a "y" after a consonant is left alone).
# NOTE(review): the enclosing test method's def/end lines are not visible in
# this view -- confirm they are present in the full file.
assert_equal "abacde", "abacde".porter2_preprocess
assert_equal "abacde", "''abacde".porter2_preprocess
assert_equal "ab'c'de", "'ab'c'de".porter2_preprocess
assert_equal "ab'c'de", "''ab'c'de".porter2_preprocess
assert_equal "Yabac", "yabac".porter2_preprocess
assert_equal "aYbc", "aybc".porter2_preprocess
assert_equal "abacdeY", "abacdey".porter2_preprocess
assert_equal "abaYde", "abayde".porter2_preprocess
assert_equal "kabaYde", "kabayde".porter2_preprocess
assert_equal "kabyaYde", "kabyayde".porter2_preprocess
assert_equal "'", "'''".porter2_preprocess
# porter2_r1 expectations for ordinary words.
# NOTE(review): the enclosing test method's def/end lines are not visible in
# this view -- confirm they are present in the full file.
assert_equal "iful", "beautiful".porter2_r1
assert_equal "y", "beauty".porter2_r1
assert_equal "", "beau".porter2_r1
assert_equal "imadversion", "animadversion".porter2_r1
assert_equal "kled", "sprinkled".porter2_r1
assert_equal "harist", "eucharist".porter2_r1

# Words beginning "gener": the expected R1 is everything after that prefix.
assert_equal "ate", "generate".porter2_r1
assert_equal "ates", "generates".porter2_r1
assert_equal "ated", "generated".porter2_r1
assert_equal "al", "general".porter2_r1
assert_equal "ally", "generally".porter2_r1
assert_equal "ic", "generic".porter2_r1
assert_equal "ically", "generically".porter2_r1
assert_equal "ous", "generous".porter2_r1
assert_equal "ously", "generously".porter2_r1

# Words beginning "commun": the expected R1 is everything after that prefix.
assert_equal "al", "communal".porter2_r1
assert_equal "ity", "community".porter2_r1
assert_equal "e", "commune".porter2_r1

# Words beginning "arsen": the expected R1 is everything after that prefix.
assert_equal "ic", "arsenic".porter2_r1
assert_equal "al", "arsenal".porter2_r1
# porter2_r2 expectations; an empty string is expected when the word has no
# R2 region (e.g. "beauty", "beau", "sprinkled").
# NOTE(review): the enclosing test method's def/end lines are not visible in
# this view -- confirm they are present in the full file.
assert_equal "ul", "beautiful".porter2_r2
assert_equal "", "beauty".porter2_r2
assert_equal "", "beau".porter2_r2
assert_equal "adversion", "animadversion".porter2_r2
assert_equal "", "sprinkled".porter2_r2
assert_equal "ist", "eucharist".porter2_r2
# Verifies String#porter2_ends_with_short_syllable? returns exactly true for
# words expected to end in a short syllable and exactly false for the rest.
# Each assertion names the offending word so a failure is easy to locate.
def test_ends_with_short_syllable?
  short_endings = %w[rap trap entrap ow on at]
  other_endings = %w[uproot bestow disturb]

  short_endings.each do |w|
    assert_equal true, w.porter2_ends_with_short_syllable?,
                 "#{w} should end with a short syllable but was classified otherwise"
  end

  other_endings.each do |w|
    assert_equal false, w.porter2_ends_with_short_syllable?,
                 "#{w} should not end with a short syllable but was classified as such"
  end
end
# Verifies String#porter2_is_short_word? returns exactly true for each word
# in short_words and exactly false for each word in long_words.
def test_is_short_word?
  short_words = %w[bed shed shred hop]
  long_words = %w[bead embed beds]

  short_words.each do |w|
    assert_equal true, w.porter2_is_short_word?,
                 "#{w} should be short but classified as long"
  end

  long_words.each do |w|
    assert_equal false, w.porter2_is_short_word?,
                 "#{w} should be long but classified as short"
  end
end
# porter2_step0 expectations: a trailing ', 's or 's' is removed, while
# apostrophes elsewhere in the word are kept. The last three cases pin the
# results for inputs that consist of little more than the suffix itself.
# NOTE(review): the enclosing test method's def/end lines are not visible in
# this view -- confirm they are present in the full file.
assert_equal "abac", "abac".porter2_step0
assert_equal "abac", "abac'".porter2_step0
assert_equal "abac", "abac's".porter2_step0
assert_equal "abac", "abac's'".porter2_step0
assert_equal "ab'c", "ab'c".porter2_step0
assert_equal "ab'sc", "ab'sc".porter2_step0
assert_equal "ab's'c", "ab's'c".porter2_step0
assert_equal "ab'sc", "ab'sc's".porter2_step0
assert_equal "'", "'".porter2_step0
assert_equal "'s", "'s".porter2_step0
assert_equal "'s", "'s'".porter2_step0
# porter2_step1a expectations for the sses / ied / ies / s suffix cases,
# including words that must come back unchanged (gas, this, abacus, abacess).
# NOTE(review): the enclosing test method's def/end lines are not visible in
# this view -- confirm they are present in the full file.
assert_equal "abacde", "abacde".porter2_step1a
assert_equal "abacess", "abacesses".porter2_step1a
assert_equal "tie", "ties".porter2_step1a
assert_equal "tie", "tied".porter2_step1a
assert_equal "cri", "cries".porter2_step1a
assert_equal "cri", "cried".porter2_step1a
assert_equal "gas", "gas".porter2_step1a
assert_equal "this", "this".porter2_step1a
assert_equal "gap", "gaps".porter2_step1a
assert_equal "kiwi", "kiwis".porter2_step1a
assert_equal "abacus", "abacus".porter2_step1a
assert_equal "abacess", "abacess".porter2_step1a
# porter2_step1b expectations, first with the default argument and then with
# the flag set to true. The two tables differ only in the expected stem for
# "atomised" ("atomis" by default, "atomise" with the flag).
# The failure message reports the very value that was asserted on; it
# previously re-ran the step with a different argument and could therefore
# print a result other than the one that failed.
# NOTE(review): the enclosing test method's def/end lines are not visible in
# this view -- confirm they are present in the full file.
assert_equal "abacde", "abacde".porter2_step1b

words_non_gb = {
  "luxuriated" => "luxuriate", "luxuriating" => "luxuriate",
  "hopping" => "hop", "hopped" => "hop",
  "hoped" => "hope", "hoping" => "hope",
  "atomized" => "atomize", "atomised" => "atomis",
  "addicted" => "addict", "bleed" => "bleed"
}
words_non_gb.each do |original, stemmed|
  actual = original.porter2_step1b
  assert_equal stemmed, actual,
               "#{original} should have stemmed to #{stemmed} but got #{actual} instead"
end

words_gb = {
  "luxuriated" => "luxuriate", "luxuriating" => "luxuriate",
  "hopping" => "hop", "hopped" => "hop",
  "hoped" => "hope", "hoping" => "hope",
  "atomized" => "atomize", "atomised" => "atomise",
  "addicted" => "addict", "bleed" => "bleed"
}
words_gb.each do |original, stemmed|
  actual = original.porter2_step1b(true)
  assert_equal stemmed, actual,
               "#{original} should have stemmed to #{stemmed} but got #{actual} instead"
end
# porter2_step1c expectations: "cry" becomes "cri", while "by" and words
# ending in an upper-case "Y" are expected back unchanged.
# NOTE(review): the enclosing test method's def/end lines are not visible in
# this view -- confirm they are present in the full file.
assert_equal "cri", "cry".porter2_step1c
assert_equal "by", "by".porter2_step1c
assert_equal "saY", "saY".porter2_step1c
assert_equal "abbeY", "abbeY".porter2_step1c
# porter2_step2 expectations.
# NOTE(review): the enclosing test method's def/end lines are not visible in
# this view -- confirm they are present in the full file.
assert_equal "abac", "abac".porter2_step2

# -ization / -isation with and without the optional flag argument.
assert_equal "nationalize", "nationalization".porter2_step2
assert_equal "nationalisate", "nationalisation".porter2_step2
assert_equal "nationalize", "nationalization".porter2_step2(true)
assert_equal "nationalise", "nationalisation".porter2_step2(true)
# Repeat the steps to ensure that the english-gb behaviour isn't sticky
assert_equal "nationalize", "nationalization".porter2_step2(false)
assert_equal "nationalisate", "nationalisation".porter2_step2(false)
assert_equal "nationalize", "nationalization".porter2_step2
assert_equal "nationalisate", "nationalisation".porter2_step2

# -izer / -iser with and without the optional flag argument.
assert_equal "nationalize", "nationalizer".porter2_step2
assert_equal "nationaliser", "nationaliser".porter2_step2
assert_equal "nationalize", "nationalizer".porter2_step2(true)
assert_equal "nationalise", "nationaliser".porter2_step2(true)

# One case per suffix replacement.
assert_equal "abaction", "abactional".porter2_step2
assert_equal "abacence", "abacenci".porter2_step2
assert_equal "abacance", "abacanci".porter2_step2
assert_equal "abacable", "abacabli".porter2_step2
assert_equal "abacent", "abacentli".porter2_step2
assert_equal "abacize", "abacizer".porter2_step2
assert_equal "abacize", "abacization".porter2_step2
assert_equal "abacate", "abacational".porter2_step2
assert_equal "abacate", "abacation".porter2_step2
assert_equal "abacate", "abacator".porter2_step2
assert_equal "abacal", "abacalism".porter2_step2
assert_equal "abacal", "abacaliti".porter2_step2
assert_equal "abacal", "abacalli".porter2_step2
assert_equal "abacful", "abacfulness".porter2_step2
assert_equal "abacous", "abacousli".porter2_step2
assert_equal "abacous", "abacousness".porter2_step2
assert_equal "abacive", "abaciveness".porter2_step2
assert_equal "abacive", "abaciviti".porter2_step2
assert_equal "abiliti", "abiliti".porter2_step2
assert_equal "abacble", "abacbiliti".porter2_step2
assert_equal "abacble", "abacbli".porter2_step2
assert_equal "abacful", "abacfulli".porter2_step2
assert_equal "abacless", "abaclessli".porter2_step2
assert_equal "abaclog", "abaclogi".porter2_step2

# -li is expected to be removed after c, d, e, g, h, k, m, n, r, t but
# kept after a ("abali").
assert_equal "abac", "abacli".porter2_step2
assert_equal "abd", "abdli".porter2_step2
assert_equal "abe", "abeli".porter2_step2
assert_equal "abg", "abgli".porter2_step2
assert_equal "abh", "abhli".porter2_step2
assert_equal "abk", "abkli".porter2_step2
assert_equal "abm", "abmli".porter2_step2
assert_equal "abn", "abnli".porter2_step2
assert_equal "abr", "abrli".porter2_step2
assert_equal "abt", "abtli".porter2_step2
assert_equal "abali", "abali".porter2_step2

assert_equal "bad", "badli".porter2_step2
assert_equal "fluentli", "fluentli".porter2_step2
assert_equal "geolog", "geologi".porter2_step2
# porter2_step3 expectations.
# NOTE(review): the enclosing test method's def/end lines are not visible in
# this view -- confirm they are present in the full file.
# NOTE(review): the first call passes "" where the other calls pass
# true/false; "" is truthy in Ruby, so this exercises the flag-on path --
# confirm that is intended.
assert_equal "abac", "abac".porter2_step3("")

# -alize / -alise with and without the optional flag argument.
assert_equal "national", "nationalize".porter2_step3
assert_equal "nationalise", "nationalise".porter2_step3
assert_equal "national", "nationalise".porter2_step3(true)
# Repeat the steps to ensure that the english-gb behaviour isn't sticky
assert_equal "national", "nationalize".porter2_step3(false)
assert_equal "nationalise", "nationalise".porter2_step3(false)
assert_equal "national", "nationalize".porter2_step3
assert_equal "nationalise", "nationalise".porter2_step3

# One case per suffix replacement.
assert_equal "abaction", "abactional".porter2_step3
assert_equal "abacate", "abacational".porter2_step3
assert_equal "abacic", "abacicate".porter2_step3
assert_equal "abacic", "abaciciti".porter2_step3
assert_equal "abacic", "abacical".porter2_step3
assert_equal "abac", "abacful".porter2_step3
assert_equal "abac", "abacness".porter2_step3

# NOTE(review): the next two assertions are identical; one of them was
# possibly meant to cover a different input -- confirm.
assert_equal "abacabac", "abacabacative".porter2_step3
assert_equal "abacabac", "abacabacative".porter2_step3

assert_equal "dryness", "dryness".porter2_step3
# porter2_step4 expectations.
# NOTE(review): the enclosing test method's def/end lines are not visible in
# this view -- confirm they are present in the full file.
# NOTE(review): the first call passes "" where the other calls pass
# true/false; "" is truthy in Ruby, so this exercises the flag-on path --
# confirm that is intended.
assert_equal "abac", "abac".porter2_step4("")

# -ize / -ise with and without the optional flag argument.
assert_equal "nation", "nationize".porter2_step4
assert_equal "nationise", "nationise".porter2_step4
assert_equal "nation", "nationize".porter2_step4(true)
assert_equal "nation", "nationise".porter2_step4(true)
assert_equal "nation", "nationize".porter2_step4(false)
assert_equal "nationise", "nationise".porter2_step4(false)
assert_equal "nation", "nationize".porter2_step4
assert_equal "nationise", "nationise".porter2_step4

# One case per removable suffix.
assert_equal "abac", "abacal".porter2_step4
assert_equal "abac", "abacance".porter2_step4
assert_equal "abac", "abacence".porter2_step4
assert_equal "abac", "abacer".porter2_step4
assert_equal "abac", "abacic".porter2_step4
assert_equal "abacer", "abacerable".porter2_step4
assert_equal "abac", "abacible".porter2_step4
assert_equal "abac", "abacant".porter2_step4
assert_equal "abac", "abacement".porter2_step4 # Check we handle overlapping suffixes properly
assert_equal "abacac", "abacacement".porter2_step4
assert_equal "abacac", "abacacment".porter2_step4
assert_equal "abac", "abacment".porter2_step4
assert_equal "abac", "abacent".porter2_step4
assert_equal "abac", "abacism".porter2_step4
assert_equal "abac", "abacate".porter2_step4
assert_equal "abac", "abaciti".porter2_step4
assert_equal "abac", "abacous".porter2_step4
assert_equal "abac", "abacive".porter2_step4
assert_equal "abac", "abacize".porter2_step4

# -ion is expected to be removed only when preceded by s or t.
assert_equal "abacion", "abacion".porter2_step4
assert_equal "abacs", "abacsion".porter2_step4
assert_equal "abact", "abaction".porter2_step4
assert_equal "abction", "abction".porter2_step4
assert_equal "ablut", "ablution".porter2_step4
assert_equal "agreement", "agreement".porter2_step4

assert_equal "abcal", "abcal".porter2_step4 # No removal if suffix isn't in R2
# porter2_step5 expectations for a trailing "l" and a trailing "e".
# NOTE(review): the enclosing test method's def/end lines are not visible in
# this view -- confirm they are present in the full file.
assert_equal "abac", "abac".porter2_step5

# Trailing double "l": one "l" is dropped for "abacll" but not for "abcll".
assert_equal "abacl", "abacll".porter2_step5
assert_equal "abcll", "abcll".porter2_step5

# Trailing "e": dropped for "able", "abace" and "bawace", kept for "abe".
assert_equal "abc", "abc".porter2_step5
assert_equal "abl", "able".porter2_step5
assert_equal "abe", "abe".porter2_step5
assert_equal "abac", "abace".porter2_step5
assert_equal "bawac", "bawace".porter2_step5
299 def test_porter2_postprocess
300 assert_equal
"abac", "abac".porter2_postprocess
301 assert_equal
"abacy", "abacy".porter2_postprocess
302 assert_equal
"abacy", "abacY".porter2_postprocess
303 assert_equal
"aybcy", "aYbcY".porter2_postprocess
304 assert_equal
"aybcy", "aYbcy".porter2_postprocess