diff options
author | Peter Wu <peter@lekensteyn.nl> | 2014-04-23 12:22:20 +0200 |
---|---|---|
committer | Peter Wu <peter@lekensteyn.nl> | 2014-04-23 12:22:20 +0200 |
commit | 14d7547cd31c5be878e377a4a5370f604c8d59d4 (patch) | |
tree | 003840f1a21d39b07d45cd3112c38b6eed40e3ab /src/Chapter4/tweetlda/LDA.java | |
download | TwitterDataAnalytics-14d7547cd31c5be878e377a4a5370f604c8d59d4.tar.gz |
Initial commit
build.xml and related files were modified slightly after opening the project in NetBeans 7.4.
Diffstat (limited to 'src/Chapter4/tweetlda/LDA.java')
-rw-r--r-- | src/Chapter4/tweetlda/LDA.java | 89 |
1 files changed, 89 insertions, 0 deletions
package tweetlda;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.TreeSet;
import java.util.regex.Pattern;

import org.json.JSONObject;

import cc.mallet.pipe.CharSequence2TokenSequence;
import cc.mallet.pipe.CharSequenceLowercase;
import cc.mallet.pipe.Pipe;
import cc.mallet.pipe.SerialPipes;
import cc.mallet.pipe.TokenSequence2FeatureSequence;
import cc.mallet.pipe.TokenSequenceRemoveStopwords;
import cc.mallet.pipe.iterator.StringArrayIterator;
import cc.mallet.topics.ParallelTopicModel;
import cc.mallet.types.Alphabet;
import cc.mallet.types.IDSorter;
import cc.mallet.types.InstanceList;

/**
 * Runs LDA topic modeling (via Mallet's {@link ParallelTopicModel}) over the
 * "text" field of tweets stored as one JSON object per line, then prints the
 * top weighted words of each discovered topic to stdout.
 *
 * <p>Usage: {@code java tweetlda.LDA [inputFile.json]}. Requires a
 * {@code stopwords.txt} file in the working directory.
 */
public class LDA {

    /** Stopword list consumed by the Mallet stopword-removal pipe. */
    private static final String STOP_WORDS = "stopwords.txt";
    /** Number of Gibbs-sampling iterations for model estimation. */
    private static final int ITERATIONS = 100;
    /** Worker threads used by ParallelTopicModel. */
    private static final int THREADS = 4;
    /** Number of latent topics to fit. */
    private static final int NUM_TOPICS = 25;
    /** How many top words to print per topic. */
    private static final int NUM_WORDS_TO_ANALYZE = 25;

    public static void main(String[] args) throws Exception {
        ArrayList<Pipe> pipeList = new ArrayList<Pipe>();
        File stopwords = new File(STOP_WORDS);

        // NOTE(review): "testows.json" is the original default and looks like a
        // typo for "tweets.json" — left unchanged to preserve behavior.
        String inputFileName = args.length >= 1 ? args[0] : "testows.json";
        File inputFile = new File(inputFileName);

        // Lowercase, tokenize, remove stopwords, stem, and convert to features.
        pipeList.add((Pipe) new CharSequenceLowercase());
        pipeList.add((Pipe) new CharSequence2TokenSequence(
                Pattern.compile("\\p{L}[\\p{L}\\p{P}]+\\p{L}")));
        pipeList.add((Pipe) new TokenSequenceRemoveStopwords(
                stopwords, "UTF-8", false, false, false));
        pipeList.add((Pipe) new PorterStemmer());
        pipeList.add((Pipe) new TokenSequence2FeatureSequence());

        InstanceList instances = new InstanceList(new SerialPipes(pipeList));

        // Collect the "text" field of each line-delimited JSON tweet.
        // try-with-resources: the original leaked this reader (never closed).
        // UTF-8 is specified explicitly: JSON is UTF-8 and the stopword pipe
        // above already assumes UTF-8; FileReader would use the platform charset.
        LinkedList<String> textList = new LinkedList<String>();
        try (BufferedReader fileReader = new BufferedReader(
                new InputStreamReader(new FileInputStream(inputFile), StandardCharsets.UTF_8))) {
            String line;
            while ((line = fileReader.readLine()) != null) {
                JSONObject elem = new JSONObject(line);
                if (elem.has("text")) {
                    textList.add(elem.getString("text"));
                }
            }
        }

        instances.addThruPipe(
                new StringArrayIterator(textList.toArray(new String[textList.size()])));

        // Fit the topic model.
        ParallelTopicModel model = new ParallelTopicModel(NUM_TOPICS);
        model.addInstances(instances);
        model.setNumThreads(THREADS);
        model.setNumIterations(ITERATIONS);
        model.estimate();

        // The data alphabet maps word IDs back to their string forms.
        Alphabet dataAlphabet = instances.getDataAlphabet();

        // Print each topic as "idx - word:weight, word:weight, ..." where the
        // weight is normalized by the topic's total word weight.
        int topicIdx = 0;
        StringBuilder sb;
        for (TreeSet<IDSorter> set : model.getSortedWords()) {
            sb = new StringBuilder().append(topicIdx);
            sb.append(" - ");
            int j = 0;
            double sum = 0.0;
            for (IDSorter s : set) {
                sum += s.getWeight();
            }
            for (IDSorter s : set) {
                sb.append(dataAlphabet.lookupObject(s.getID()))
                  .append(":")
                  .append(s.getWeight() / sum)
                  .append(", ");
                if (++j >= NUM_WORDS_TO_ANALYZE) {
                    break;
                }
            }
            System.out.println(sb.append("\n").toString());
            topicIdx++;
        }
    }
}