diff options
Diffstat (limited to 'src/Chapter4/tweetlda/PorterStemmer.java')
-rw-r--r-- | src/Chapter4/tweetlda/PorterStemmer.java | 33 |
1 files changed, 33 insertions, 0 deletions
diff --git a/src/Chapter4/tweetlda/PorterStemmer.java b/src/Chapter4/tweetlda/PorterStemmer.java new file mode 100644 index 0000000..1a7149e --- /dev/null +++ b/src/Chapter4/tweetlda/PorterStemmer.java @@ -0,0 +1,33 @@ +package tweetlda; + +import cc.mallet.pipe.Pipe; +import cc.mallet.types.Instance; +import cc.mallet.types.TokenSequence; + +public class PorterStemmer extends Pipe { + + private static final long serialVersionUID = 154100332101873830L; + + public Instance pipe(Instance carrier){ + TokenSequence ts = (TokenSequence) carrier.getData(); + String word; + Stemmer s; + + for(int i = 0; i < ts.size(); i++){ + word = ts.get(i).getText(); + //stem the word + s = new Stemmer(); + for(char ch : word.toCharArray()){ + if(Character.isLetter(ch)){ + s.add(ch); + } + } + s.stem(); + ts.get(i).setText(s.toString()); + } + carrier.setData(ts); + + return carrier; + } + +} |