summaryrefslogtreecommitdiff
path: root/src/Chapter4/tweetlda/PorterStemmer.java
blob: 1a7149e1fe9f33cb7d270112cb4bc136407cc8a6 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
package tweetlda;

import cc.mallet.pipe.Pipe;
import cc.mallet.types.Instance;
import cc.mallet.types.TokenSequence;

/**
 * Pipe that rewrites the text of every token in an instance's
 * {@link TokenSequence} with the output of {@code Stemmer}.
 *
 * <p>Only characters for which {@link Character#isLetter(char)} is true are
 * handed to the stemmer; every other character of the token text is dropped
 * before stemming. The tokens are modified in place.
 *
 * <p>NOTE(review): {@code Stemmer} is declared elsewhere in this package;
 * judging by this class's name it presumably implements Porter's algorithm —
 * confirm against that class. No case folding is done here, so any
 * lower-casing the stemmer expects must happen in an earlier pipe.
 */
public class PorterStemmer extends Pipe {

	private static final long serialVersionUID = 154100332101873830L;

	/**
	 * Stems every token of the given instance in place.
	 *
	 * @param carrier instance whose data is cast to {@link TokenSequence}
	 * @return the same {@code carrier}, with each token's text replaced by
	 *         its stemmed, letters-only form
	 */
	public Instance pipe(Instance carrier){
		final TokenSequence tokens = (TokenSequence) carrier.getData();

		int index = 0;
		while(index < tokens.size()){
			final String original = tokens.get(index).getText();
			tokens.get(index).setText(stemLetters(original));
			index++;
		}

		// Same object that was read above; stored back on the carrier explicitly.
		carrier.setData(tokens);
		return carrier;
	}

	/**
	 * Feeds the letter characters of {@code text} to a fresh {@code Stemmer}
	 * and returns the stemmer's string form after stemming.
	 *
	 * @param text raw token text
	 * @return result of {@code Stemmer.toString()} after {@code stem()}
	 */
	private static String stemLetters(String text){
		// A new stemmer per word, so no state carries over between tokens.
		final Stemmer stemmer = new Stemmer();
		final int length = text.length();
		for(int pos = 0; pos < length; pos++){
			final char current = text.charAt(pos);
			if(!Character.isLetter(current)){
				// Digits, punctuation, whitespace etc. never reach the stemmer.
				continue;
			}
			stemmer.add(current);
		}
		stemmer.stem();
		return stemmer.toString();
	}

}