diff options
author | Peter Wu <peter@lekensteyn.nl> | 2014-04-23 12:22:20 +0200 |
---|---|---|
committer | Peter Wu <peter@lekensteyn.nl> | 2014-04-23 12:22:20 +0200 |
commit | 14d7547cd31c5be878e377a4a5370f604c8d59d4 (patch) | |
tree | 003840f1a21d39b07d45cd3112c38b6eed40e3ab /src/Chapter4/tweetlda/LDA.java | |
download | TwitterDataAnalytics-14d7547cd31c5be878e377a4a5370f604c8d59d4.tar.gz |
Initial commit
build.xml and related files were modified slightly after opening the project in NetBeans 7.4.
Diffstat (limited to 'src/Chapter4/tweetlda/LDA.java')
-rw-r--r-- | src/Chapter4/tweetlda/LDA.java | 89 |
1 files changed, 89 insertions, 0 deletions
package tweetlda;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.TreeSet;
import java.util.regex.Pattern;

import org.json.JSONObject;

import cc.mallet.pipe.CharSequence2TokenSequence;
import cc.mallet.pipe.CharSequenceLowercase;
import cc.mallet.pipe.Pipe;
import cc.mallet.pipe.SerialPipes;
import cc.mallet.pipe.TokenSequence2FeatureSequence;
import cc.mallet.pipe.TokenSequenceRemoveStopwords;
import cc.mallet.pipe.iterator.StringArrayIterator;
import cc.mallet.topics.ParallelTopicModel;
import cc.mallet.types.Alphabet;
import cc.mallet.types.IDSorter;
import cc.mallet.types.InstanceList;

/**
 * Runs LDA topic modeling (via Mallet's {@link ParallelTopicModel}) over the
 * "text" field of tweets stored as one JSON object per line, then prints the
 * top weighted words of each discovered topic to stdout.
 *
 * <p>Usage: {@code java tweetlda.LDA [inputFile.json]}. Requires a
 * {@code stopwords.txt} file in the working directory.
 */
public class LDA {

    /** Stopword list consumed by the Mallet stopword-removal pipe. */
    private static final String STOP_WORDS = "stopwords.txt";
    /** Number of Gibbs-sampling iterations for model estimation. */
    private static final int ITERATIONS = 100;
    /** Worker threads used by ParallelTopicModel. */
    private static final int THREADS = 4;
    /** Number of latent topics to fit. */
    private static final int NUM_TOPICS = 25;
    /** How many top words to print per topic. */
    private static final int NUM_WORDS_TO_ANALYZE = 25;

    public static void main(String[] args) throws Exception {
        ArrayList<Pipe> pipeList = new ArrayList<Pipe>();
        File stopwords = new File(STOP_WORDS);

        // NOTE(review): "testows.json" is the original default and looks like a
        // typo for "tweets.json" — left unchanged to preserve behavior.
        String inputFileName = args.length >= 1 ? args[0] : "testows.json";
        File inputFile = new File(inputFileName);

        // Lowercase, tokenize, remove stopwords, stem, and convert to features.
        pipeList.add((Pipe) new CharSequenceLowercase());
        pipeList.add((Pipe) new CharSequence2TokenSequence(
                Pattern.compile("\\p{L}[\\p{L}\\p{P}]+\\p{L}")));
        pipeList.add((Pipe) new TokenSequenceRemoveStopwords(
                stopwords, "UTF-8", false, false, false));
        pipeList.add((Pipe) new PorterStemmer());
        pipeList.add((Pipe) new TokenSequence2FeatureSequence());

        InstanceList instances = new InstanceList(new SerialPipes(pipeList));

        // Collect the "text" field of each line-delimited JSON tweet.
        // try-with-resources: the original leaked this reader (never closed).
        // UTF-8 is specified explicitly: JSON is UTF-8 and the stopword pipe
        // above already assumes UTF-8; FileReader would use the platform charset.
        LinkedList<String> textList = new LinkedList<String>();
        try (BufferedReader fileReader = new BufferedReader(
                new InputStreamReader(new FileInputStream(inputFile), StandardCharsets.UTF_8))) {
            String line;
            while ((line = fileReader.readLine()) != null) {
                JSONObject elem = new JSONObject(line);
                if (elem.has("text")) {
                    textList.add(elem.getString("text"));
                }
            }
        }

        instances.addThruPipe(
                new StringArrayIterator(textList.toArray(new String[textList.size()])));

        // Fit the topic model.
        ParallelTopicModel model = new ParallelTopicModel(NUM_TOPICS);
        model.addInstances(instances);
        model.setNumThreads(THREADS);
        model.setNumIterations(ITERATIONS);
        model.estimate();

        // The data alphabet maps word IDs back to their string forms.
        Alphabet dataAlphabet = instances.getDataAlphabet();

        // Print each topic as "idx - word:weight, word:weight, ..." where the
        // weight is normalized by the topic's total word weight.
        int topicIdx = 0;
        StringBuilder sb;
        for (TreeSet<IDSorter> set : model.getSortedWords()) {
            sb = new StringBuilder().append(topicIdx);
            sb.append(" - ");
            int j = 0;
            double sum = 0.0;
            for (IDSorter s : set) {
                sum += s.getWeight();
            }
            for (IDSorter s : set) {
                sb.append(dataAlphabet.lookupObject(s.getID()))
                  .append(":")
                  .append(s.getWeight() / sum)
                  .append(", ");
                if (++j >= NUM_WORDS_TO_ANALYZE) {
                    break;
                }
            }
            System.out.println(sb.append("\n").toString());
            topicIdx++;
        }
    }
}