Initial commit
[porter2stemmer.git] / test / tc_porter2_parts.rb~
1 # coding: utf-8
2 # Porter stemmer test file
3
4
5 $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
6
7 require 'test/unit'
8 require 'porter2'
9 require 'test_words'
10
# Reopen String so the tests below can call these stemmer helper methods
# directly on string literals. Each listed method is given public visibility.
class String
  [:porter2_preprocess, :porter2_r1, :porter2_r2].each do |helper_name|
    public helper_name
  end
end
14
# Unit tests for the individual parts of the Porter2 stemmer, which are
# exercised here as methods called directly on String instances.
#
# Several steps accept an optional trailing boolean. Judging by the
# "english-gb" comments and the words_gb / words_non_gb fixtures below, passing
# true switches on British-English (-ise / -isation) handling.
class TestPorter2 < Test::Unit::TestCase

  # Preprocessing: lower-cases the word, turns typographic apostrophes into
  # plain ones, drops leading apostrophes, and upper-cases a 'y' that starts
  # the word or follows a vowel (as shown by the expectations below).
  def test_preprocess
    assert_equal "abcde", "abcde".porter2_preprocess
    assert_equal "abcde", "ABCDE".porter2_preprocess
    assert_equal "ab'cde", "ab‘cde".porter2_preprocess
    assert_equal "ab'cde", "ab’cde".porter2_preprocess
    assert_equal "ab'c'de", "ab’c’de".porter2_preprocess
    assert_equal "ab'c'de", "ab‘c‘de".porter2_preprocess
    assert_equal "abcde", "''abcde".porter2_preprocess
    assert_equal "abcde", "’‘abcde".porter2_preprocess
    assert_equal "ab'c'de", "'ab‘c‘de".porter2_preprocess
    assert_equal "ab'c'de", "’‘ab‘c‘de".porter2_preprocess
    assert_equal "Yabc", "yabc".porter2_preprocess
    assert_equal "aYbc", "aybc".porter2_preprocess
    assert_equal "abcdeY", "abcdey".porter2_preprocess
    assert_equal "abaYde", "abayde".porter2_preprocess
    assert_equal "kabaYde", "kabayde".porter2_preprocess
  end

  # R1 region extraction, including the words that need special handling.
  def test_find_R1
    assert_equal "iful", "beautiful".porter2_r1
    assert_equal "y", "beauty".porter2_r1
    assert_equal "", "beau".porter2_r1
    assert_equal "imadversion", "animadversion".porter2_r1
    assert_equal "kled", "sprinkled".porter2_r1
    assert_equal "harist", "eucharist".porter2_r1

    # special cases
    assert_equal "ate", "generate".porter2_r1
    assert_equal "ates", "generates".porter2_r1
    assert_equal "ated", "generated".porter2_r1
    assert_equal "al", "general".porter2_r1
    assert_equal "ally", "generally".porter2_r1
    assert_equal "ic", "generic".porter2_r1
    assert_equal "ically", "generically".porter2_r1
    assert_equal "ous", "generous".porter2_r1
    assert_equal "ously", "generously".porter2_r1

    assert_equal "al", "communal".porter2_r1
    assert_equal "ity", "community".porter2_r1
    assert_equal "e", "commune".porter2_r1

    assert_equal "ic", "arsenic".porter2_r1
    assert_equal "al", "arsenal".porter2_r1
  end

  # Short-syllable detection at the end of a word.
  def test_ends_with_short_syllable?
    assert_equal true, "rap".porter2_ends_with_short_syllable?
    assert_equal true, "trap".porter2_ends_with_short_syllable?
    assert_equal true, "entrap".porter2_ends_with_short_syllable?
    assert_equal true, "ow".porter2_ends_with_short_syllable?
    assert_equal true, "on".porter2_ends_with_short_syllable?
    assert_equal true, "at".porter2_ends_with_short_syllable?
    assert_equal false, "uproot".porter2_ends_with_short_syllable?
    assert_equal false, "bestow".porter2_ends_with_short_syllable?
    assert_equal false, "disturb".porter2_ends_with_short_syllable?
  end

  # Short-word classification; the predicate is handed the word's R1 region.
  def test_is_short_word?
    short_words = %w[ bed shed shred hop ]
    long_words = %w[ bead embed beds ]
    short_words.each do |w|
      r1 = w.porter2_r1
      assert_equal true, w.porter2_is_short_word?(r1),
        "#{w} should be short but classified as long"
    end
    long_words.each do |w|
      r1 = w.porter2_r1
      assert_equal false, w.porter2_is_short_word?(r1),
        "#{w} should be long but classified as short"
    end
  end

  # R2 region extraction.
  def test_find_R2
    assert_equal "ul", "beautiful".porter2_r2
    assert_equal "", "beauty".porter2_r2
    assert_equal "", "beau".porter2_r2
    assert_equal "adversion", "animadversion".porter2_r2
    assert_equal "", "sprinkled".porter2_r2
    assert_equal "ist", "eucharist".porter2_r2
  end

  # Step 0: trailing apostrophe forms (', 's, 's') are stripped; apostrophes
  # elsewhere in the word are left alone.
  def test_step_0
    assert_equal "abc", "abc".step_0
    assert_equal "abc", "abc'".step_0
    assert_equal "abc", "abc's".step_0
    assert_equal "abc", "abc's'".step_0
    assert_equal "ab'c", "ab'c".step_0
    assert_equal "ab'sc", "ab'sc".step_0
    assert_equal "ab's'c", "ab's'c".step_0
    assert_equal "ab'sc", "ab'sc's".step_0
  end

  # Step 1a: plural / -ied / -ies style endings.
  def test_step_1a
    assert_equal "abcde", "abcde".step_1a
    assert_equal "abcess", "abcesses".step_1a
    assert_equal "tie", "ties".step_1a
    assert_equal "tie", "tied".step_1a
    assert_equal "cri", "cries".step_1a
    assert_equal "cri", "cried".step_1a
    assert_equal "gas", "gas".step_1a
    assert_equal "this", "this".step_1a
    assert_equal "gap", "gaps".step_1a
    assert_equal "kiwi", "kiwis".step_1a
    assert_equal "abacus", "abacus".step_1a
    assert_equal "abcess", "abcess".step_1a
  end

  # Step 1b: -ed / -ing style endings. The step receives the word's R1 region
  # and, for the second fixture, the flag set to true. The two fixtures differ
  # only in the expected stem of "atomised".
  def test_step_1b
    assert_equal "abcde", "abcde".step_1b("abcde".porter2_r1)
    words_non_gb = {"luxuriated" => "luxuriate", "luxuriating" => "luxuriate",
      "hopping" => "hop", "hopped" => "hop",
      "hoped" => "hope", "hoping" => "hope",
      "atomized" => "atomize", "atomised" => "atomis" }
    words_non_gb.each do |original, stemmed|
      # Compute once so the failure message reports the value that was compared.
      actual = original.step_1b(original.porter2_r1)
      assert_equal stemmed, actual,
        "#{original} should have stemmed to #{stemmed} but got #{actual} instead"
    end
    words_gb = {"luxuriated" => "luxuriate", "luxuriating" => "luxuriate",
      "hopping" => "hop", "hopped" => "hop",
      "hoped" => "hope", "hoping" => "hope",
      "atomized" => "atomize", "atomised" => "atomise" }
    words_gb.each do |original, stemmed|
      # The message must describe the flag-enabled result, not a second run
      # without the flag.
      actual = original.step_1b(original.porter2_r1, true)
      assert_equal stemmed, actual,
        "#{original} should have stemmed to #{stemmed} but got #{actual} instead"
    end
  end

  # Step 1c: a final 'y' becomes 'i' for "cry" but is kept for "by" and "say".
  def test_step_1c
    assert_equal "cri", "cry".step_1c
    assert_equal "by", "by".step_1c
    assert_equal "say", "say".step_1c
  end

  # Step 2: suffix rewriting, with and without the optional flag.
  def test_step_2
    assert_equal "abc", "abc".step_2

    assert_equal "nationalize", "nationalization".step_2
    assert_equal "nationalisate", "nationalisation".step_2
    assert_equal "nationalize", "nationalization".step_2(true)
    assert_equal "nationalise", "nationalisation".step_2(true)
    # Repeat the steps to ensure that the english-gb behaviour isn't sticky
    assert_equal "nationalize", "nationalization".step_2(false)
    assert_equal "nationalisate", "nationalisation".step_2(false)
    assert_equal "nationalize", "nationalization".step_2
    assert_equal "nationalisate", "nationalisation".step_2

    assert_equal "nationalize", "nationalizer".step_2
    assert_equal "nationaliser", "nationaliser".step_2
    assert_equal "nationalize", "nationalizer".step_2(true)
    assert_equal "nationalise", "nationaliser".step_2(true)

    assert_equal "abction", "abctional".step_2
    assert_equal "abcence", "abcenci".step_2
    assert_equal "abcance", "abcanci".step_2
    assert_equal "abcable", "abcabli".step_2
    assert_equal "abcent", "abcentli".step_2
    assert_equal "abcize", "abcizer".step_2
    assert_equal "abcize", "abcization".step_2
    assert_equal "abcate", "abcational".step_2
    assert_equal "abcate", "abcation".step_2
    assert_equal "abcate", "abcator".step_2
    assert_equal "abcal", "abcalism".step_2
    assert_equal "abcal", "abcaliti".step_2
    assert_equal "abcal", "abcalli".step_2
    assert_equal "abcful", "abcfulness".step_2
    assert_equal "abcous", "abcousli".step_2
    assert_equal "abcous", "abcousness".step_2
    assert_equal "abcive", "abciveness".step_2
    assert_equal "abcive", "abciviti".step_2
    assert_equal "abcble", "abcbiliti".step_2
    assert_equal "abcble", "abcbli".step_2
    assert_equal "abcful", "abcfulli".step_2
    assert_equal "abcless", "abclessli".step_2
    assert_equal "abclog", "abclogi".step_2

    # "li" is removed after each of these consonants but not after a vowel.
    assert_equal "abc", "abcli".step_2
    assert_equal "abd", "abdli".step_2
    assert_equal "abe", "abeli".step_2
    assert_equal "abg", "abgli".step_2
    assert_equal "abh", "abhli".step_2
    assert_equal "abk", "abkli".step_2
    assert_equal "abm", "abmli".step_2
    assert_equal "abn", "abnli".step_2
    assert_equal "abr", "abrli".step_2
    assert_equal "abt", "abtli".step_2
    assert_equal "abali", "abali".step_2
  end

  # Step 3: the first argument is a hand-written region string rather than one
  # derived from the word (presumably standing in for a region such as R1 or
  # R2 -- confirm against the library); the optional second argument is the flag.
  def test_step_3
    assert_equal "abc", "abc".step_3("")

    assert_equal "national", "nationalize".step_3("alize")
    assert_equal "nationalise", "nationalise".step_3("alise")
    assert_equal "national", "nationalise".step_3("alise", true)
    # Repeat the steps to ensure that the english-gb behaviour isn't sticky
    assert_equal "national", "nationalize".step_3("alize", false)
    assert_equal "nationalise", "nationalise".step_3("alise", false)
    assert_equal "national", "nationalize".step_3("alize")
    assert_equal "nationalise", "nationalise".step_3("alise")

    assert_equal "abction", "abctional".step_3("al")
    assert_equal "abcate", "abcational".step_3("ional")
    assert_equal "abcic", "abcicate".step_3("ate")
    assert_equal "abcic", "abciciti".step_3("iti")
    assert_equal "abcic", "abcical".step_3("al")
    assert_equal "abc", "abcful".step_3("")
    assert_equal "abc", "abcness".step_3("")

    assert_equal "abcabc", "abcabcative".step_3("cative")
    assert_equal "abcabc", "abcabcative".step_3("cativealic")
  end

  # Step 4: suffix removal. The first argument is a hand-written region string
  # (the final assertion's comment indicates it plays the role of R2); the
  # optional second argument is the flag.
  def test_step_4
    assert_equal "abc", "abc".step_4("")

    assert_equal "nation", "nationize".step_4("ionizeic")
    assert_equal "nationise", "nationise".step_4("ioniseic")
    assert_equal "nation", "nationize".step_4("ionizeic", true)
    assert_equal "nation", "nationise".step_4("ioniseic", true)
    assert_equal "nation", "nationize".step_4("ionizeic", false)
    assert_equal "nationise", "nationise".step_4("ioniseic", false)
    assert_equal "nation", "nationize".step_4("ionizeic")
    assert_equal "nationise", "nationise".step_4("ioniseic")

    assert_equal "abc", "abcal".step_4("ionalic")
    assert_equal "abc", "abcance".step_4("ance")
    assert_equal "abc", "abcence".step_4("ence")
    assert_equal "abc", "abcer".step_4("erance")
    assert_equal "abc", "abcic".step_4("ic")
    assert_equal "abcer", "abcerable".step_4("erableic")
    assert_equal "abc", "abcible".step_4("eribleic")
    assert_equal "abc", "abcant".step_4("ant")
    assert_equal "abc", "abcement".step_4("ement") # Check we handle overlapping suffixes properly
    assert_equal "abce", "abcement".step_4("ment")
    assert_equal "abc", "abcment".step_4("mentement")
    assert_equal "abc", "abcment".step_4("ement")
    assert_equal "abc", "abcent".step_4("ement")
    assert_equal "abc", "abcent".step_4("ent")
    assert_equal "abc", "abcism".step_4("ism")
    assert_equal "abc", "abcate".step_4("ate")
    assert_equal "abc", "abciti".step_4("ition")
    assert_equal "abc", "abcous".step_4("ously")
    assert_equal "abc", "abcive".step_4("ively")
    assert_equal "abc", "abcize".step_4("ize")
    assert_equal "abcion", "abcion".step_4("ion")
    assert_equal "abcs", "abcsion".step_4("sion")
    assert_equal "abct", "abction".step_4("tion")
    assert_equal "abction", "abction".step_4("ion")

    assert_equal "abcal", "abcal".step_4("") # No removal if suffix isn't in R2
  end

  # Step 5: takes two hand-written region strings (presumably R1 and R2, in
  # that order -- confirm against the library).
  def test_step_5
    assert_equal "abc", "abc".step_5("", "")

    assert_equal "abcl", "abcll".step_5("elele", "ele")
    assert_equal "abcll", "abcll".step_5("ele", "e")

    assert_equal "abce", "abce".step_5("", "")
    assert_equal "abc", "abce".step_5("", "ele")
    assert_equal "abc", "abce".step_5("elele", "")
  end

  # Postprocessing: any upper-case 'Y' is turned back into lower-case 'y'.
  def test_porter2_postprocess
    assert_equal "abc", "abc".porter2_postprocess
    assert_equal "abcy", "abcy".porter2_postprocess
    assert_equal "abcy", "abcY".porter2_postprocess
    assert_equal "aybcy", "aYbcY".porter2_postprocess
    assert_equal "aybcy", "aYbcy".porter2_postprocess
  end

  # End-to-end check: every entry of TEST_WORDS (word => expected stem) must
  # stem to its expected value.
  def test_stemmer
    TEST_WORDS.each do |base, stemmed|
      # The message is built eagerly, so it must only use names that exist in
      # this block; reuse the computed stem instead of stemming again.
      actual = base.stem
      assert_equal stemmed, actual,
        "#{base} should have stemmed to #{stemmed} but got #{actual} instead"
    end
  end

end