diff options
Diffstat (limited to 'src/Chapter4/tweetlda')
-rw-r--r-- | src/Chapter4/tweetlda/LDA.java | 89 | ||||
-rw-r--r-- | src/Chapter4/tweetlda/PorterStemmer.java | 33 | ||||
-rw-r--r-- | src/Chapter4/tweetlda/Stemmer.java | 428 |
3 files changed, 0 insertions, 550 deletions
diff --git a/src/Chapter4/tweetlda/LDA.java b/src/Chapter4/tweetlda/LDA.java deleted file mode 100644 index ca7f9a3..0000000 --- a/src/Chapter4/tweetlda/LDA.java +++ /dev/null @@ -1,89 +0,0 @@ -package tweetlda; - -import java.io.BufferedReader; -import java.io.File; -import java.io.FileReader; -import java.util.ArrayList; -import java.util.LinkedList; -import java.util.TreeSet; -import java.util.regex.Pattern; - -import org.json.JSONObject; - -import cc.mallet.pipe.CharSequence2TokenSequence; -import cc.mallet.pipe.CharSequenceLowercase; -import cc.mallet.pipe.Pipe; -import cc.mallet.pipe.SerialPipes; -import cc.mallet.pipe.TokenSequence2FeatureSequence; -import cc.mallet.pipe.TokenSequenceRemoveStopwords; -import cc.mallet.pipe.iterator.StringArrayIterator; -import cc.mallet.topics.ParallelTopicModel; -import cc.mallet.types.Alphabet; -import cc.mallet.types.IDSorter; -import cc.mallet.types.InstanceList; - -public class LDA { - - private static final String STOP_WORDS = "stopwords.txt"; - private static final int ITERATIONS = 100; - private static final int THREADS = 4; - private static final int NUM_TOPICS = 25; - private static final int NOM_WORDS_TO_ANALYZE = 25; - - public static void main(String[] args) throws Exception { - ArrayList<Pipe> pipeList = new ArrayList<Pipe>(); - File stopwords = new File(STOP_WORDS); - - String inputFileName = args.length >= 1 ? args[0] : "testows.json"; - - File inputFile = new File(inputFileName); - - // Lowercase, tokenize, remove stopwords, stem, and convert to features - pipeList.add((Pipe) new CharSequenceLowercase()); - pipeList.add((Pipe) new CharSequence2TokenSequence(Pattern.compile("\\p{L}[\\p{L}\\p{P}]+\\p{L}"))); - pipeList.add((Pipe) new TokenSequenceRemoveStopwords(stopwords, "UTF-8", false, false, false)); - pipeList.add((Pipe) new PorterStemmer()); - pipeList.add((Pipe) new TokenSequence2FeatureSequence()); - - InstanceList instances = new InstanceList(new SerialPipes(pipeList)); - - BufferedReader fileReader = new BufferedReader(new FileReader(inputFile)); - LinkedList<String> textList = new LinkedList<String>(); - String line; - while((line = fileReader.readLine()) != null){ - JSONObject elem = new JSONObject(line); - if(elem.has("text")){ - textList.add(elem.getString("text")); - } - } - - instances.addThruPipe(new StringArrayIterator(textList.toArray(new String[textList.size()]))); - - ParallelTopicModel model = new ParallelTopicModel(NUM_TOPICS); - model.addInstances(instances); - model.setNumThreads(THREADS); - model.setNumIterations(ITERATIONS); - model.estimate(); - - // The data alphabet maps word IDs to strings - Alphabet dataAlphabet = instances.getDataAlphabet(); - - int topicIdx=0; - StringBuilder sb; - for (TreeSet<IDSorter> set : model.getSortedWords()) { - sb = new StringBuilder().append(topicIdx); - sb.append(" - "); - int j = 0; - double sum = 0.0; - for (IDSorter s : set) { - sum += s.getWeight(); - } - for (IDSorter s : set) { - sb.append(dataAlphabet.lookupObject(s.getID())).append(":").append(s.getWeight() / sum).append(", "); - if (++j >= NOM_WORDS_TO_ANALYZE) break; - } - System.out.println(sb.append("\n").toString()); - topicIdx++; - } - } -} diff --git a/src/Chapter4/tweetlda/PorterStemmer.java b/src/Chapter4/tweetlda/PorterStemmer.java deleted file mode 100644 index 1a7149e..0000000 --- a/src/Chapter4/tweetlda/PorterStemmer.java +++ /dev/null @@ -1,33 +0,0 @@ -package tweetlda; - -import cc.mallet.pipe.Pipe; -import cc.mallet.types.Instance; -import cc.mallet.types.TokenSequence; - -public class PorterStemmer extends Pipe { - - private static final long serialVersionUID = 154100332101873830L; - - public Instance pipe(Instance carrier){ - TokenSequence ts = (TokenSequence) carrier.getData(); - String word; - Stemmer s; - - for(int i = 0; i < ts.size(); i++){ - word = ts.get(i).getText(); - //stem the word - s = new Stemmer(); - for(char ch : word.toCharArray()){ - if(Character.isLetter(ch)){ - s.add(ch); - } - } - s.stem(); - ts.get(i).setText(s.toString()); - } - carrier.setData(ts); - - return carrier; - } - -} diff --git a/src/Chapter4/tweetlda/Stemmer.java b/src/Chapter4/tweetlda/Stemmer.java deleted file mode 100644 index f06dfc6..0000000 --- a/src/Chapter4/tweetlda/Stemmer.java +++ /dev/null @@ -1,428 +0,0 @@ -package tweetlda; - - -/* - - Porter stemmer in Java. The original paper is in - - Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14, - no. 3, pp 130-137, - - See also http://www.tartarus.org/~martin/PorterStemmer - - History: - - Release 1 - - Bug 1 (reported by Gonzalo Parra 16/10/99) fixed as marked below. - The words 'aed', 'eed', 'oed' leave k at 'a' for step 3, and b[k-1] - is then out outside the bounds of b. - - Release 2 - - Similarly, - - Bug 2 (reported by Steve Dyrdahl 22/2/00) fixed as marked below. - 'ion' by itself leaves j = -1 in the test for 'ion' in step 5, and - b[j] is then outside the bounds of b. - - Release 3 - - Considerably revised 4/9/00 in the light of many helpful suggestions - from Brian Goetz of Quiotix Corporation (brian@quiotix.com). - - Release 4 - -*/ - -import java.io.*; - -/** - * Stemmer, implementing the Porter Stemming Algorithm - * - * The Stemmer class transforms a word into its root form. The input - * word can be provided a character at time (by calling add()), or at once - * by calling one of the various stem(something) methods. - */ - -class Stemmer -{ private char[] b; - private int i, /* offset into b */ - i_end, /* offset to end of stemmed word */ - j, k; - private static final int INC = 50; - /* unit of size whereby b is increased */ - public Stemmer() - { b = new char[INC]; - i = 0; - i_end = 0; - } - - /** - * Add a character to the word being stemmed. When you are finished - * adding characters, you can call stem(void) to stem the word. - */ - - public void add(char ch) - { if (i == b.length) - { char[] new_b = new char[i+INC]; - for (int c = 0; c < i; c++) new_b[c] = b[c]; - b = new_b; - } - b[i++] = ch; - } - - - /** Adds wLen characters to the word being stemmed contained in a portion - * of a char[] array. This is like repeated calls of add(char ch), but - * faster. - */ - - public void add(char[] w, int wLen) - { if (i+wLen >= b.length) - { char[] new_b = new char[i+wLen+INC]; - for (int c = 0; c < i; c++) new_b[c] = b[c]; - b = new_b; - } - for (int c = 0; c < wLen; c++) b[i++] = w[c]; - } - - /** - * After a word has been stemmed, it can be retrieved by toString(), - * or a reference to the internal buffer can be retrieved by getResultBuffer - * and getResultLength (which is generally more efficient.) - */ - public String toString() { return new String(b,0,i_end); } - - /** - * Returns the length of the word resulting from the stemming process. - */ - public int getResultLength() { return i_end; } - - /** - * Returns a reference to a character buffer containing the results of - * the stemming process. You also need to consult getResultLength() - * to determine the length of the result. - */ - public char[] getResultBuffer() { return b; } - - /* cons(i) is true <=> b[i] is a consonant. */ - - private final boolean cons(int i) - { switch (b[i]) - { case 'a': case 'e': case 'i': case 'o': case 'u': return false; - case 'y': return (i==0) ? true : !cons(i-1); - default: return true; - } - } - - /* m() measures the number of consonant sequences between 0 and j. if c is - a consonant sequence and v a vowel sequence, and <..> indicates arbitrary - presence, - - <c><v> gives 0 - <c>vc<v> gives 1 - <c>vcvc<v> gives 2 - <c>vcvcvc<v> gives 3 - .... - */ - - private final int m() - { int n = 0; - int i = 0; - while(true) - { if (i > j) return n; - if (! cons(i)) break; i++; - } - i++; - while(true) - { while(true) - { if (i > j) return n; - if (cons(i)) break; - i++; - } - i++; - n++; - while(true) - { if (i > j) return n; - if (! cons(i)) break; - i++; - } - i++; - } - } - - /* vowelinstem() is true <=> 0,...j contains a vowel */ - - private final boolean vowelinstem() - { int i; for (i = 0; i <= j; i++) if (! cons(i)) return true; - return false; - } - - /* doublec(j) is true <=> j,(j-1) contain a double consonant. */ - - private final boolean doublec(int j) - { if (j < 1) return false; - if (b[j] != b[j-1]) return false; - return cons(j); - } - - /* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant - and also if the second c is not w,x or y. this is used when trying to - restore an e at the end of a short word. e.g. - - cav(e), lov(e), hop(e), crim(e), but - snow, box, tray. - - */ - - private final boolean cvc(int i) - { if (i < 2 || !cons(i) || cons(i-1) || !cons(i-2)) return false; - { int ch = b[i]; - if (ch == 'w' || ch == 'x' || ch == 'y') return false; - } - return true; - } - - private final boolean ends(String s) - { int l = s.length(); - int o = k-l+1; - if (o < 0) return false; - for (int i = 0; i < l; i++) if (b[o+i] != s.charAt(i)) return false; - j = k-l; - return true; - } - - /* setto(s) sets (j+1),...k to the characters in the string s, readjusting - k. */ - - private final void setto(String s) - { int l = s.length(); - int o = j+1; - for (int i = 0; i < l; i++) b[o+i] = s.charAt(i); - k = j+l; - } - - /* r(s) is used further down. */ - - private final void r(String s) { if (m() > 0) setto(s); } - - /* step1() gets rid of plurals and -ed or -ing. e.g. - - caresses -> caress - ponies -> poni - ties -> ti - caress -> caress - cats -> cat - - feed -> feed - agreed -> agree - disabled -> disable - - matting -> mat - mating -> mate - meeting -> meet - milling -> mill - messing -> mess - - meetings -> meet - - */ - - private final void step1() - { if (b[k] == 's') - { if (ends("sses")) k -= 2; else - if (ends("ies")) setto("i"); else - if (b[k-1] != 's') k--; - } - if (ends("eed")) { if (m() > 0) k--; } else - if ((ends("ed") || ends("ing")) && vowelinstem()) - { k = j; - if (ends("at")) setto("ate"); else - if (ends("bl")) setto("ble"); else - if (ends("iz")) setto("ize"); else - if (doublec(k)) - { k--; - { int ch = b[k]; - if (ch == 'l' || ch == 's' || ch == 'z') k++; - } - } - else if (m() == 1 && cvc(k)) setto("e"); - } - } - - /* step2() turns terminal y to i when there is another vowel in the stem. */ - - private final void step2() { if (ends("y") && vowelinstem()) b[k] = 'i'; } - - /* step3() maps double suffices to single ones. so -ization ( = -ize plus - -ation) maps to -ize etc. note that the string before the suffix must give - m() > 0. */ - - private final void step3() { if (k == 0) return; /* For Bug 1 */ switch (b[k-1]) - { - case 'a': if (ends("ational")) { r("ate"); break; } - if (ends("tional")) { r("tion"); break; } - break; - case 'c': if (ends("enci")) { r("ence"); break; } - if (ends("anci")) { r("ance"); break; } - break; - case 'e': if (ends("izer")) { r("ize"); break; } - break; - case 'l': if (ends("bli")) { r("ble"); break; } - if (ends("alli")) { r("al"); break; } - if (ends("entli")) { r("ent"); break; } - if (ends("eli")) { r("e"); break; } - if (ends("ousli")) { r("ous"); break; } - break; - case 'o': if (ends("ization")) { r("ize"); break; } - if (ends("ation")) { r("ate"); break; } - if (ends("ator")) { r("ate"); break; } - break; - case 's': if (ends("alism")) { r("al"); break; } - if (ends("iveness")) { r("ive"); break; } - if (ends("fulness")) { r("ful"); break; } - if (ends("ousness")) { r("ous"); break; } - break; - case 't': if (ends("aliti")) { r("al"); break; } - if (ends("iviti")) { r("ive"); break; } - if (ends("biliti")) { r("ble"); break; } - break; - case 'g': if (ends("logi")) { r("log"); break; } - } } - - /* step4() deals with -ic-, -full, -ness etc. similar strategy to step3. */ - - private final void step4() { switch (b[k]) - { - case 'e': if (ends("icate")) { r("ic"); break; } - if (ends("ative")) { r(""); break; } - if (ends("alize")) { r("al"); break; } - break; - case 'i': if (ends("iciti")) { r("ic"); break; } - break; - case 'l': if (ends("ical")) { r("ic"); break; } - if (ends("ful")) { r(""); break; } - break; - case 's': if (ends("ness")) { r(""); break; } - break; - } } - - /* step5() takes off -ant, -ence etc., in context <c>vcvc<v>. */ - - private final void step5() - { if (k == 0) return; /* for Bug 1 */ switch (b[k-1]) - { case 'a': if (ends("al")) break; return; - case 'c': if (ends("ance")) break; - if (ends("ence")) break; return; - case 'e': if (ends("er")) break; return; - case 'i': if (ends("ic")) break; return; - case 'l': if (ends("able")) break; - if (ends("ible")) break; return; - case 'n': if (ends("ant")) break; - if (ends("ement")) break; - if (ends("ment")) break; - /* element etc. not stripped before the m */ - if (ends("ent")) break; return; - case 'o': if (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't')) break; - /* j >= 0 fixes Bug 2 */ - if (ends("ou")) break; return; - /* takes care of -ous */ - case 's': if (ends("ism")) break; return; - case 't': if (ends("ate")) break; - if (ends("iti")) break; return; - case 'u': if (ends("ous")) break; return; - case 'v': if (ends("ive")) break; return; - case 'z': if (ends("ize")) break; return; - default: return; - } - if (m() > 1) k = j; - } - - /* step6() removes a final -e if m() > 1. */ - - private final void step6() - { j = k; - if (b[k] == 'e') - { int a = m(); - if (a > 1 || a == 1 && !cvc(k-1)) k--; - } - if (b[k] == 'l' && doublec(k) && m() > 1) k--; - } - - /** Stem the word placed into the Stemmer buffer through calls to add(). - * Returns true if the stemming process resulted in a word different - * from the input. You can retrieve the result with - * getResultLength()/getResultBuffer() or toString(). - */ - public void stem() - { k = i - 1; - if (k > 1) { step1(); step2(); step3(); step4(); step5(); step6(); } - i_end = k+1; i = 0; - } - - /** Test program for demonstrating the Stemmer. It reads text from a - * a list of files, stems each word, and writes the result to standard - * output. Note that the word stemmed is expected to be in lower case: - * forcing lower case must be done outside the Stemmer class. - * Usage: Stemmer file-name file-name ... - */ - public static void main(String[] args) - { - char[] w = new char[501]; - Stemmer s = new Stemmer(); - for (int i = 0; i < args.length; i++) - try - { - FileInputStream in = new FileInputStream(args[i]); - - try - { while(true) - - { int ch = in.read(); - if (Character.isLetter((char) ch)) - { - int j = 0; - while(true) - { ch = Character.toLowerCase((char) ch); - w[j] = (char) ch; - if (j < 500) j++; - ch = in.read(); - if (!Character.isLetter((char) ch)) - { - /* to test add(char ch) */ - for (int c = 0; c < j; c++) s.add(w[c]); - - /* or, to test add(char[] w, int j) */ - /* s.add(w, j); */ - - s.stem(); - { String u; - - /* and now, to test toString() : */ - u = s.toString(); - - /* to test getResultBuffer(), getResultLength() : */ - /* u = new String(s.getResultBuffer(), 0, s.getResultLength()); */ - - System.out.print(u); - } - break; - } - } - } - if (ch < 0) break; - System.out.print((char)ch); - } - } - catch (IOException e) - { System.out.println("error reading " + args[i]); - break; - } - } - catch (FileNotFoundException e) - { System.out.println("file " + args[i] + " not found"); - break; - } - } -} |