1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29 """Porter Stemming Algorithm
30
31 This is the Porter stemming algorithm, ported to Python from the
32 version coded up in ANSI C by the author. It follows the algorithm
33 presented in
34
35 Porter, M. "An algorithm for suffix stripping." Program 14.3 (1980): 130-137.
36
37 only differing from it at the points maked --DEPARTURE-- and --NEW--
38 below.
39
40 For a more faithful version of the Porter algorithm, see
41
42 http://www.tartarus.org/~martin/PorterStemmer/
43
44 Later additions:
45
46 June 2000
47
48 The 'l' of the 'logi' -> 'log' rule is put with the stem, so that
49 short stems like 'geo' 'theo' etc work like 'archaeo' 'philo' etc.
50
51 This follows a suggestion of Barry Wilkins, reasearch student at
52 Birmingham.
53
54
55 February 2000
56
57 the cvc test for not dropping final -e now looks after vc at the
58 beginning of a word, so are, eve, ice, ore, use keep final -e. In this
59 test c is any consonant, including w, x and y. This extension was
60 suggested by Chris Emerson.
61
62 -fully -> -ful treated like -fulness -> -ful, and
63 -tionally -> -tion treated like -tional -> -tion
64
65 both in Step 2. These were suggested by Hiranmay Ghosh, of New Delhi.
66
67 Invariants proceed, succeed, exceed. Also suggested by Hiranmay Ghosh.
68
69 Additional modifications were made to incorperate this module into
70 nltk. All such modifications are marked with \"--NLTK--\". The nltk
71 version of this module is maintained by the NLTK developers, and is
72 available from <http://nltk.sourceforge.net>
73 """
74
75
76
77 __docformat__ = 'plaintext'
78
79 import sys
80 import re
81 import string
82
83
84
85 from nltk_lite.stem import *
86
88
89
90
91 """
92 A word stemmer based on the Porter stemming algorithm.
93
94 Porter, M. \"An algorithm for suffix stripping.\"
95 Program 14.3 (1980): 130-137.
96
97 A few minor modifications have been made to Porter's basic
98 algorithm. See the source code of this module for more
99 information.
100
101 The Porter Stemmer requires that all tokens have string types.
102 """
103
105 """The main part of the stemming algorithm starts here.
106 b is a buffer holding a word to be stemmed. The letters are in b[k0],
107 b[k0+1] ... ending at b[k]. In fact k0 = 0 in this demo program. k is
108 readjusted downwards as the stemming progresses. Zero termination is
109 not in fact used in the algorithm.
110
111 Note that only lower case sequences are stemmed. Forcing to lower case
112 should be done before stem(...) is called.
113 """
114
115 self.b = ""
116 self.k = 0
117 self.k0 = 0
118 self.j = 0
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138 irregular_forms = {
139 "sky" : ["sky", "skies"],
140 "die" : ["dying"],
141 "lie" : ["lying"],
142 "tie" : ["tying"],
143 "news" : ["news"],
144 "inning" : ["innings", "inning"],
145 "outing" : ["outings", "outing"],
146 "canning" : ["cannings", "canning"],
147 "howe" : ["howe"],
148
149
150 "proceed" : ["proceed"],
151 "exceed" : ["exceed"],
152 "succeed" : ["succeed"],
153 }
154
155 self.pool = {}
156 for key in irregular_forms.keys():
157 for val in irregular_forms[key]:
158 self.pool[val] = key
159
161 """cons(i) is TRUE <=> b[i] is a consonant."""
162 if self.b[i] == 'a' or self.b[i] == 'e' or self.b[i] == 'i' or self.b[i] == 'o' or self.b[i] == 'u':
163 return 0
164 if self.b[i] == 'y':
165 if i == self.k0:
166 return 1
167 else:
168 return (not self.cons(i - 1))
169 return 1
170
172 """m() measures the number of consonant sequences between k0 and j.
173 if c is a consonant sequence and v a vowel sequence, and <..>
174 indicates arbitrary presence,
175
176 <c><v> gives 0
177 <c>vc<v> gives 1
178 <c>vcvc<v> gives 2
179 <c>vcvcvc<v> gives 3
180 ....
181 """
182 n = 0
183 i = self.k0
184 while 1:
185 if i > self.j:
186 return n
187 if not self.cons(i):
188 break
189 i = i + 1
190 i = i + 1
191 while 1:
192 while 1:
193 if i > self.j:
194 return n
195 if self.cons(i):
196 break
197 i = i + 1
198 i = i + 1
199 n = n + 1
200 while 1:
201 if i > self.j:
202 return n
203 if not self.cons(i):
204 break
205 i = i + 1
206 i = i + 1
207
209 """vowelinstem() is TRUE <=> k0,...j contains a vowel"""
210 for i in range(self.k0, self.j + 1):
211 if not self.cons(i):
212 return 1
213 return 0
214
216 """doublec(j) is TRUE <=> j,(j-1) contain a double consonant."""
217 if j < (self.k0 + 1):
218 return 0
219 if (self.b[j] != self.b[j-1]):
220 return 0
221 return self.cons(j)
222
224 """cvc(i) is TRUE <=>
225
226 a) ( --NEW--) i == 1, and p[0] p[1] is vowel consonant, or
227
228 b) p[i - 2], p[i - 1], p[i] has the form consonant -
229 vowel - consonant and also if the second c is not w, x or y. this
230 is used when trying to restore an e at the end of a short word.
231 e.g.
232
233 cav(e), lov(e), hop(e), crim(e), but
234 snow, box, tray.
235 """
236 if i == 0: return 0
237 if i == 1: return (not self.cons(0) and self.cons(1))
238 if not self.cons(i) or self.cons(i-1) or not self.cons(i-2): return 0
239
240 ch = self.b[i]
241 if ch == 'w' or ch == 'x' or ch == 'y':
242 return 0
243
244 return 1
245
247 """ends(s) is TRUE <=> k0,...k ends with the string s."""
248 length = len(s)
249 if s[length - 1] != self.b[self.k]:
250 return 0
251 if length > (self.k - self.k0 + 1):
252 return 0
253 if self.b[self.k-length+1:self.k+1] != s:
254 return 0
255 self.j = self.k - length
256 return 1
257
259 """setto(s) sets (j+1),...k to the characters in the string s, readjusting k."""
260 length = len(s)
261 self.b = self.b[:self.j+1] + s + self.b[self.j+length+1:]
262 self.k = self.j + length
263
265 """r(s) is used further down."""
266 if self.m() > 0:
267 self.setto(s)
268
270 """step1ab() gets rid of plurals and -ed or -ing. e.g.
271
272 caresses -> caress
273 ponies -> poni
274 sties -> sti
275 tie -> tie (--NEW--: see below)
276 caress -> caress
277 cats -> cat
278
279 feed -> feed
280 agreed -> agree
281 disabled -> disable
282
283 matting -> mat
284 mating -> mate
285 meeting -> meet
286 milling -> mill
287 messing -> mess
288
289 meetings -> meet
290 """
291 if self.b[self.k] == 's':
292 if self.ends("sses"):
293 self.k = self.k - 2
294 elif self.ends("ies"):
295 if self.j == 0:
296 self.k = self.k - 1
297
298
299 else:
300 self.k = self.k - 2
301 elif self.b[self.k - 1] != 's':
302 self.k = self.k - 1
303
304 if self.ends("ied"):
305 if self.j == 0:
306 self.k = self.k - 1
307 else:
308 self.k = self.k - 2
309
310
311
312 elif self.ends("eed"):
313 if self.m() > 0:
314 self.k = self.k - 1
315 elif (self.ends("ed") or self.ends("ing")) and self.vowelinstem():
316 self.k = self.j
317 if self.ends("at"): self.setto("ate")
318 elif self.ends("bl"): self.setto("ble")
319 elif self.ends("iz"): self.setto("ize")
320 elif self.doublec(self.k):
321 self.k = self.k - 1
322 ch = self.b[self.k]
323 if ch == 'l' or ch == 's' or ch == 'z':
324 self.k = self.k + 1
325 elif (self.m() == 1 and self.cvc(self.k)):
326 self.setto("e")
327
329 """step1c() turns terminal y to i when there is another vowel in the stem.
330 --NEW--: This has been modified from the original Porter algorithm so that y->i
331 is only done when y is preceded by a consonant, but not if the stem
332 is only a single consonant, i.e.
333
334 (*c and not c) Y -> I
335
336 So 'happy' -> 'happi', but
337 'enjoy' -> 'enjoy' etc
338
339 This is a much better rule. Formerly 'enjoy'->'enjoi' and 'enjoyment'->
340 'enjoy'. Step 1c is perhaps done too soon; but with this modification that
341 no longer really matters.
342
343 Also, the removal of the vowelinstem(z) condition means that 'spy', 'fly',
344 'try' ... stem to 'spi', 'fli', 'tri' and conflate with 'spied', 'tried',
345 'flies' ...
346 """
347 if self.ends("y") and self.j > 0 and self.cons(self.k - 1):
348 self.b = self.b[:self.k] + 'i' + self.b[self.k+1:]
349
351 """step2() maps double suffices to single ones.
352 so -ization ( = -ize plus -ation) maps to -ize etc. note that the
353 string before the suffix must give m() > 0.
354 """
355 if self.b[self.k - 1] == 'a':
356 if self.ends("ational"): self.r("ate")
357 elif self.ends("tional"): self.r("tion")
358 elif self.b[self.k - 1] == 'c':
359 if self.ends("enci"): self.r("ence")
360 elif self.ends("anci"): self.r("ance")
361 elif self.b[self.k - 1] == 'e':
362 if self.ends("izer"): self.r("ize")
363 elif self.b[self.k - 1] == 'l':
364 if self.ends("bli"): self.r("ble")
365
366
367 elif self.ends("alli"):
368 if self.m() > 0:
369 self.setto("al")
370 self.step2()
371 elif self.ends("fulli"): self.r("ful")
372 elif self.ends("entli"): self.r("ent")
373 elif self.ends("eli"): self.r("e")
374 elif self.ends("ousli"): self.r("ous")
375 elif self.b[self.k - 1] == 'o':
376 if self.ends("ization"): self.r("ize")
377 elif self.ends("ation"): self.r("ate")
378 elif self.ends("ator"): self.r("ate")
379 elif self.b[self.k - 1] == 's':
380 if self.ends("alism"): self.r("al")
381 elif self.ends("iveness"): self.r("ive")
382 elif self.ends("fulness"): self.r("ful")
383 elif self.ends("ousness"): self.r("ous")
384 elif self.b[self.k - 1] == 't':
385 if self.ends("aliti"): self.r("al")
386 elif self.ends("iviti"): self.r("ive")
387 elif self.ends("biliti"): self.r("ble")
388 elif self.b[self.k - 1] == 'g':
389 if self.ends("logi"):
390 self.j = self.j + 1
391 self.r("og")
392
393
395 """step3() dels with -ic-, -full, -ness etc. similar strategy to step2."""
396 if self.b[self.k] == 'e':
397 if self.ends("icate"): self.r("ic")
398 elif self.ends("ative"): self.r("")
399 elif self.ends("alize"): self.r("al")
400 elif self.b[self.k] == 'i':
401 if self.ends("iciti"): self.r("ic")
402 elif self.b[self.k] == 'l':
403 if self.ends("ical"): self.r("ic")
404 elif self.ends("ful"): self.r("")
405 elif self.b[self.k] == 's':
406 if self.ends("ness"): self.r("")
407
409 """step4() takes off -ant, -ence etc., in context <c>vcvc<v>."""
410 if self.b[self.k - 1] == 'a':
411 if self.ends("al"): pass
412 else: return
413 elif self.b[self.k - 1] == 'c':
414 if self.ends("ance"): pass
415 elif self.ends("ence"): pass
416 else: return
417 elif self.b[self.k - 1] == 'e':
418 if self.ends("er"): pass
419 else: return
420 elif self.b[self.k - 1] == 'i':
421 if self.ends("ic"): pass
422 else: return
423 elif self.b[self.k - 1] == 'l':
424 if self.ends("able"): pass
425 elif self.ends("ible"): pass
426 else: return
427 elif self.b[self.k - 1] == 'n':
428 if self.ends("ant"): pass
429 elif self.ends("ement"): pass
430 elif self.ends("ment"): pass
431 elif self.ends("ent"): pass
432 else: return
433 elif self.b[self.k - 1] == 'o':
434 if self.ends("ion") and (self.b[self.j] == 's' or self.b[self.j] == 't'): pass
435 elif self.ends("ou"): pass
436
437 else: return
438 elif self.b[self.k - 1] == 's':
439 if self.ends("ism"): pass
440 else: return
441 elif self.b[self.k - 1] == 't':
442 if self.ends("ate"): pass
443 elif self.ends("iti"): pass
444 else: return
445 elif self.b[self.k - 1] == 'u':
446 if self.ends("ous"): pass
447 else: return
448 elif self.b[self.k - 1] == 'v':
449 if self.ends("ive"): pass
450 else: return
451 elif self.b[self.k - 1] == 'z':
452 if self.ends("ize"): pass
453 else: return
454 else:
455 return
456 if self.m() > 1:
457 self.k = self.j
458
460 """step5() removes a final -e if m() > 1, and changes -ll to -l if
461 m() > 1.
462 """
463 self.j = self.k
464 if self.b[self.k] == 'e':
465 a = self.m()
466 if a > 1 or (a == 1 and not self.cvc(self.k-1)):
467 self.k = self.k - 1
468 if self.b[self.k] == 'l' and self.doublec(self.k) and self.m() > 1:
469 self.k = self.k -1
470
472 """In stem(p,i,j), p is a char pointer, and the string to be stemmed
473 is from p[i] to p[j] inclusive. Typically i is zero and j is the
474 offset to the last character of a string, (p[j+1] == '\0'). The
475 stemmer adjusts the characters p[i] ... p[j] and returns the new
476 end-point of the string, k. Stemming never increases word length, so
477 i <= k <= j. To turn the stemmer into a module, declare 'stem' as
478 extern, and delete the remainder of this file.
479 """
480
481
482
483 if j == None:
484 j = len(p) - 1
485
486
487 self.b = p
488 self.k = j
489 self.k0 = i
490
491 if self.pool.has_key(self.b[self.k0:self.k+1]):
492 return self.pool[self.b[self.k0:self.k+1]]
493
494 if self.k <= self.k0 + 1:
495 return self.b
496
497
498
499
500
501
502 self.step1ab()
503 self.step1c()
504 self.step2()
505 self.step3()
506 self.step4()
507 self.step5()
508 return self.b[self.k0:self.k+1]
509
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544 - def stem(self, word):
547
548
549
551 return '<Porter Stemmer>'
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
571 """
572 A demonstration of the porter stemmer on a sample from
573 the Penn Treebank corpus.
574 """
575
576 from nltk_lite.corpora import treebank
577 from nltk_lite import stem
578
579 stemmer = stem.Porter()
580
581 i = 0
582 orig = []
583 stemmed = []
584 for sent in treebank.raw():
585 for word in sent:
586 orig.append(word)
587 sword = stemmer.stem(word)
588 stemmed.append(sword)
589 i+=1
590 if i>3: break
591
592
593 results = string.join(stemmed)
594 results = re.sub(r"(.{,70})\s", r'\1\n', results+' ').rstrip()
595
596
597 original = string.join(orig)
598 original = re.sub(r"(.{,70})\s", r'\1\n', original+' ').rstrip()
599
600
601 print '-Original-'.center(70).replace(' ', '*').replace('-', ' ')
602 print original
603 print '-Results-'.center(70).replace(' ', '*').replace('-', ' ')
604 print results
605 print '*'*70
606
607
608
609 if __name__ == '__main__': demo()
610