Diffstat (limited to 'src/Chapter4/tweetlda/LDA.java')
-rw-r--r--  src/Chapter4/tweetlda/LDA.java  89
1 file changed, 0 insertions(+), 89 deletions(-)
diff --git a/src/Chapter4/tweetlda/LDA.java b/src/Chapter4/tweetlda/LDA.java
deleted file mode 100644
index ca7f9a3..0000000
--- a/src/Chapter4/tweetlda/LDA.java
+++ /dev/null
@@ -1,89 +0,0 @@
-package tweetlda;
-
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileReader;
-import java.util.ArrayList;
-import java.util.LinkedList;
-import java.util.TreeSet;
-import java.util.regex.Pattern;
-
-import org.json.JSONObject;
-
-import cc.mallet.pipe.CharSequence2TokenSequence;
-import cc.mallet.pipe.CharSequenceLowercase;
-import cc.mallet.pipe.Pipe;
-import cc.mallet.pipe.SerialPipes;
-import cc.mallet.pipe.TokenSequence2FeatureSequence;
-import cc.mallet.pipe.TokenSequenceRemoveStopwords;
-import cc.mallet.pipe.iterator.StringArrayIterator;
-import cc.mallet.topics.ParallelTopicModel;
-import cc.mallet.types.Alphabet;
-import cc.mallet.types.IDSorter;
-import cc.mallet.types.InstanceList;
-
-public class LDA {
-
-    private static final String STOP_WORDS = "stopwords.txt";
-    private static final int ITERATIONS = 100;
-    private static final int THREADS = 4;
-    private static final int NUM_TOPICS = 25;
-    private static final int NUM_WORDS_TO_ANALYZE = 25;
-
-    public static void main(String[] args) throws Exception {
-        ArrayList<Pipe> pipeList = new ArrayList<Pipe>();
-        File stopwords = new File(STOP_WORDS);
-
-        String inputFileName = args.length >= 1 ? args[0] : "testows.json";
-
-        File inputFile = new File(inputFileName);
-
-        // Lowercase, tokenize, remove stopwords, stem, and convert to features
-        pipeList.add((Pipe) new CharSequenceLowercase());
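-        // Tokens must start and end with a letter, with letters or punctuation in between (so every token is at least 3 chars)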
-        pipeList.add((Pipe) new CharSequence2TokenSequence(Pattern.compile("\\p{L}[\\p{L}\\p{P}]+\\p{L}")));
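-        // Stopword args: custom list file, its encoding, then three flags (include MALLET's default list, case sensitive, mark deletions), all disabled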
-        pipeList.add((Pipe) new TokenSequenceRemoveStopwords(stopwords, "UTF-8", false, false, false));
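-        // PorterStemmer is not imported above, so it is presumably a custom Pipe defined alongside this class in the tweetlda package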
-        pipeList.add((Pipe) new PorterStemmer());
-        pipeList.add((Pipe) new TokenSequence2FeatureSequence());
-
-        InstanceList instances = new InstanceList(new SerialPipes(pipeList));
-
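-        // The input is expected to hold one JSON object per line; each object's "text" field becomes one document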
-        BufferedReader fileReader = new BufferedReader(new FileReader(inputFile));
-        LinkedList<String> textList = new LinkedList<String>();
-        String line;
-        while ((line = fileReader.readLine()) != null) {
-            JSONObject elem = new JSONObject(line);
-            if (elem.has("text")) {
-                textList.add(elem.getString("text"));
-            }
-        }
-        fileReader.close();
-
-        instances.addThruPipe(new StringArrayIterator(textList.toArray(new String[textList.size()])));
-
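-        // Train LDA by parallel Gibbs sampling with the configured topic count, thread count, and iterations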
-        ParallelTopicModel model = new ParallelTopicModel(NUM_TOPICS);
-        model.addInstances(instances);
-        model.setNumThreads(THREADS);
-        model.setNumIterations(ITERATIONS);
-        model.estimate();
-
-        // The data alphabet maps word IDs to strings
-        Alphabet dataAlphabet = instances.getDataAlphabet();
-
-        int topicIdx = 0;
-        StringBuilder sb;
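-        // For each topic, print the top NUM_WORDS_TO_ANALYZE words, each weight normalized by the topic's total weight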
-        for (TreeSet<IDSorter> set : model.getSortedWords()) {
-            sb = new StringBuilder().append(topicIdx);
-            sb.append(" - ");
-            int j = 0;
-            double sum = 0.0;
-            for (IDSorter s : set) {
-                sum += s.getWeight();
-            }
-            for (IDSorter s : set) {
-                sb.append(dataAlphabet.lookupObject(s.getID())).append(":").append(s.getWeight() / sum).append(", ");
-                if (++j >= NUM_WORDS_TO_ANALYZE) break;
-            }
-            System.out.println(sb.append("\n").toString());
-            topicIdx++;
-        }
-    }
-}
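
As written, the class expects a stopwords.txt file in the working directory and falls back to testows.json when no input file is passed. A hypothetical invocation, assuming the MALLET and org.json jars on the classpath (jar names here are placeholders, not part of the repository):

    java -cp mallet.jar:mallet-deps.jar:json.jar:. tweetlda.LDA tweets.json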