diff options
Diffstat (limited to 'src/Chapter4/tweetlda/LDA.java')
-rw-r--r-- | src/Chapter4/tweetlda/LDA.java | 89 |
1 files changed, 0 insertions, 89 deletions
diff --git a/src/Chapter4/tweetlda/LDA.java b/src/Chapter4/tweetlda/LDA.java deleted file mode 100644 index ca7f9a3..0000000 --- a/src/Chapter4/tweetlda/LDA.java +++ /dev/null @@ -1,89 +0,0 @@ -package tweetlda; - -import java.io.BufferedReader; -import java.io.File; -import java.io.FileReader; -import java.util.ArrayList; -import java.util.LinkedList; -import java.util.TreeSet; -import java.util.regex.Pattern; - -import org.json.JSONObject; - -import cc.mallet.pipe.CharSequence2TokenSequence; -import cc.mallet.pipe.CharSequenceLowercase; -import cc.mallet.pipe.Pipe; -import cc.mallet.pipe.SerialPipes; -import cc.mallet.pipe.TokenSequence2FeatureSequence; -import cc.mallet.pipe.TokenSequenceRemoveStopwords; -import cc.mallet.pipe.iterator.StringArrayIterator; -import cc.mallet.topics.ParallelTopicModel; -import cc.mallet.types.Alphabet; -import cc.mallet.types.IDSorter; -import cc.mallet.types.InstanceList; - -public class LDA { - - private static final String STOP_WORDS = "stopwords.txt"; - private static final int ITERATIONS = 100; - private static final int THREADS = 4; - private static final int NUM_TOPICS = 25; - private static final int NOM_WORDS_TO_ANALYZE = 25; - - public static void main(String[] args) throws Exception { - ArrayList<Pipe> pipeList = new ArrayList<Pipe>(); - File stopwords = new File(STOP_WORDS); - - String inputFileName = args.length >= 1 ? args[0] : "testows.json"; - - File inputFile = new File(inputFileName); - - // Lowercase, tokenize, remove stopwords, stem, and convert to features - pipeList.add((Pipe) new CharSequenceLowercase()); - pipeList.add((Pipe) new CharSequence2TokenSequence(Pattern.compile("\\p{L}[\\p{L}\\p{P}]+\\p{L}"))); - pipeList.add((Pipe) new TokenSequenceRemoveStopwords(stopwords, "UTF-8", false, false, false)); - pipeList.add((Pipe) new PorterStemmer()); - pipeList.add((Pipe) new TokenSequence2FeatureSequence()); - - InstanceList instances = new InstanceList(new SerialPipes(pipeList)); - - BufferedReader fileReader = new BufferedReader(new FileReader(inputFile)); - LinkedList<String> textList = new LinkedList<String>(); - String line; - while((line = fileReader.readLine()) != null){ - JSONObject elem = new JSONObject(line); - if(elem.has("text")){ - textList.add(elem.getString("text")); - } - } - - instances.addThruPipe(new StringArrayIterator(textList.toArray(new String[textList.size()]))); - - ParallelTopicModel model = new ParallelTopicModel(NUM_TOPICS); - model.addInstances(instances); - model.setNumThreads(THREADS); - model.setNumIterations(ITERATIONS); - model.estimate(); - - // The data alphabet maps word IDs to strings - Alphabet dataAlphabet = instances.getDataAlphabet(); - - int topicIdx=0; - StringBuilder sb; - for (TreeSet<IDSorter> set : model.getSortedWords()) { - sb = new StringBuilder().append(topicIdx); - sb.append(" - "); - int j = 0; - double sum = 0.0; - for (IDSorter s : set) { - sum += s.getWeight(); - } - for (IDSorter s : set) { - sb.append(dataAlphabet.lookupObject(s.getID())).append(":").append(s.getWeight() / sum).append(", "); - if (++j >= NOM_WORDS_TO_ANALYZE) break; - } - System.out.println(sb.append("\n").toString()); - topicIdx++; - } - } -} |