summary | refs | log | tree | commit | diff
path: root/src/Chapter4/tweetlda/PorterStemmer.java
diff options
context:
space:
mode:
Diffstat (limited to 'src/Chapter4/tweetlda/PorterStemmer.java')
-rw-r--r-- src/Chapter4/tweetlda/PorterStemmer.java 33
1 files changed, 33 insertions, 0 deletions
diff --git a/src/Chapter4/tweetlda/PorterStemmer.java b/src/Chapter4/tweetlda/PorterStemmer.java
new file mode 100644
index 0000000..1a7149e
--- /dev/null
+++ b/src/Chapter4/tweetlda/PorterStemmer.java
@@ -0,0 +1,33 @@
+package tweetlda;
+
+import cc.mallet.pipe.Pipe;
+import cc.mallet.types.Instance;
+import cc.mallet.types.TokenSequence;
+
+/**
+ * Pipe stage that rewrites the text of every token in an instance's
+ * {@link TokenSequence} with the output of a {@code Stemmer}.
+ *
+ * <p>Only characters for which {@link Character#isLetter(char)} is true are
+ * handed to the stemmer; digits, punctuation and other symbols are dropped.
+ * A token that contains no letters therefore feeds nothing to the stemmer.
+ *
+ * <p>NOTE(review): the characters are passed through without case folding;
+ * presumably an earlier pipe lowercases the tokens -- confirm against the
+ * pipeline that uses this class.
+ */
+public class PorterStemmer extends Pipe {
+
+    private static final long serialVersionUID = 154100332101873830L;
+
+    /**
+     * Stems each token of the carrier in place.
+     *
+     * @param carrier instance whose data must be a {@link TokenSequence};
+     *                any other data type fails the cast with a
+     *                {@link ClassCastException}
+     * @return the same {@code carrier}, with every token's text replaced by
+     *         the stemmer's result
+     */
+    @Override
+    public Instance pipe(Instance carrier) {
+        TokenSequence ts = (TokenSequence) carrier.getData();
+
+        for (int i = 0; i < ts.size(); i++) {
+            String word = ts.get(i).getText();
+
+            // A fresh stemmer per token, so no state carries over between words.
+            Stemmer s = new Stemmer();
+            for (char ch : word.toCharArray()) {
+                // Only letters reach the stemmer; everything else is discarded.
+                if (Character.isLetter(ch)) {
+                    s.add(ch);
+                }
+            }
+            s.stem();
+            ts.get(i).setText(s.toString());
+        }
+
+        // The sequence was modified in place; it is stored back on the carrier
+        // as the original code did.
+        carrier.setData(ts);
+        return carrier;
+    }
+
+}