diff options
Diffstat (limited to 'src/Chapter4/classification/bayes')
6 files changed, 0 insertions, 439 deletions
diff --git a/src/Chapter4/classification/bayes/Classification.java b/src/Chapter4/classification/bayes/Classification.java deleted file mode 100644 index ea9aba7..0000000 --- a/src/Chapter4/classification/bayes/Classification.java +++ /dev/null @@ -1,22 +0,0 @@ -package Chapter4.classification.bayes; - -public class Classification { - private String label; - private double confidence; - - public Classification(String label, double confidence){ - this.label = label; - this.confidence = confidence; - } - - public String getLabel() { - return label; - } - public double getConfidence() { - return confidence; - } - - public String toString(){ - return "(" + label + ", " + confidence + ")"; - } -} diff --git a/src/Chapter4/classification/bayes/NBCxv.java b/src/Chapter4/classification/bayes/NBCxv.java deleted file mode 100644 index 5c48e28..0000000 --- a/src/Chapter4/classification/bayes/NBCxv.java +++ /dev/null @@ -1,60 +0,0 @@ -package Chapter4.classification.bayes; - -import java.io.FileReader; -import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.Map; - -import com.google.gson.JsonObject; -import com.google.gson.JsonStreamParser; - -public class NBCxv { - public static void main(String[] args){ - - String filename = args.length >= 1 ? args[0] : "owsemoticons.json"; - - ArrayList<String> allTexts = new ArrayList<String>(); - - try { - //read the file, and train each document - JsonStreamParser parser = new JsonStreamParser(new FileReader(filename)); - JsonObject elem; - while (parser.hasNext()) { - elem = parser.next().getAsJsonObject(); - allTexts.add(elem.get("text").getAsString()); - } - } catch (IOException e) { - e.printStackTrace(); - } - - //do 5-fold cross validation 3 times - Map<Integer, ArrayList<String>> buckets; - int bucketIdx; - NaiveBayesSentimentClassifier nbsc; - for(int i = 0; i < 3; i++){ - - //randomly split the texts into 5 buckets - buckets = new HashMap<Integer, ArrayList<String>>(); - //initialize the 5 buckets - for(int j = 0; j < 5; j++) buckets.put(j, new ArrayList<String>()); - for(String text : allTexts){ - bucketIdx = (int) (Math.random()*5); - buckets.get(bucketIdx).add(text); - } - - for(int j = 0; j < 5; j++){ - //use all but j as the training, use j as the test. - nbsc = new NaiveBayesSentimentClassifier(); - for(int k = 0; k < 5; k++){ - if(k != j){ - nbsc.trainInstances(buckets.get(k)); - } - } - //test with bucket j - - } - } - - } -} diff --git a/src/Chapter4/classification/bayes/NaiveBayesSentimentClassifier.java b/src/Chapter4/classification/bayes/NaiveBayesSentimentClassifier.java deleted file mode 100644 index 923416c..0000000 --- a/src/Chapter4/classification/bayes/NaiveBayesSentimentClassifier.java +++ /dev/null @@ -1,264 +0,0 @@ -package Chapter4.classification.bayes; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.StringTokenizer; - -/** - * This class performs both the training and classification steps of a Naive Bayes Classifier. - * - */ -public class NaiveBayesSentimentClassifier { - //the possible sentiment labels - private static final String[] SENTIMENT_LABELS = {"happy", "sad"}; - //the tokens to look for in labeling the sentiment. - private static final String[] HAPPY_SMILEYS = {":)", ";)", ":D", ":-)", ":o)", ":-D"}; - private static final String[] SAD_SMILEYS = {":(", ":-(", ":'(", ":'-(", "D:"}; - //store these as a set for faster retrieval - private static final Set<String> HAPPY_SMILEY_SET = new HashSet<String>(Arrays.asList(HAPPY_SMILEYS)); - private static final Set<String> SAD_SMILEY_SET = new HashSet<String>(Arrays.asList(SAD_SMILEYS)); - - //counter for the number of times each word has been associated with each sentiment. - private Map<String, Integer[]> sentOccurs; - //counter for the number of times we've seen each sentiment. - private Integer[] sentCount; - - public NaiveBayesSentimentClassifier(){ - //initialize the counters - sentOccurs = new HashMap<String, Integer[]>(); - sentCount = new Integer[SENTIMENT_LABELS.length]; - for(int i = 0; i < SENTIMENT_LABELS.length; i++){ - sentCount[i] = 0; - } - } - - /** - * Tokenize a string. Turns string into list of words based on whitespace, then - * removes stopwords, punctuation, and reduces the word to its stem. - * @param text - * The piece of text - * @return - * Each individual word. - */ - private List<String> getTokens(String text){ - StringTokenizer tokens = new StringTokenizer(text); - ArrayList<String> words = new ArrayList<String>(); - - String tmp; - StringBuilder sb; - while(tokens.hasMoreTokens()){ - sb = new StringBuilder(); - tmp = tokens.nextToken(); - tmp = tmp.toLowerCase(); - - for(char ch : tmp.toCharArray()){ - if(Character.isLetter(ch)){ - sb.append(ch); - } - } - tmp = sb.toString(); - if(tmp.length() > 0 && !StopwordsList.stopwordsSet.contains(tmp)){ - words.add(sb.toString()); - } - } - - return words; - } - - /** - * Checks if tweet has a "label" (emoticon). If so, stores the words in - * the prior. - * @param tweetText - * The text of the document to check. - */ - public void trainInstance(String tweetText){ - //see if the tweet is labeled (i.e. has a smiley) - int tweetLabel = extractLabel(tweetText); - List<String> tokens = getTokens(tweetText); - if(tweetLabel != -1){ - //add these words to the classifier - updateClassifier(tokens, tweetLabel); - } - } - - public String printWordOccurs(int sentIndex, int topN){ - StringBuilder sb = new StringBuilder(); - - WordCountPair wpcset[] = new WordCountPair[sentOccurs.keySet().size()]; - - String s; - int t = 0; - Iterator<String> sIter = sentOccurs.keySet().iterator(); -// int totalCount = 0; -// while(sIter.hasNext()){ -// s = sIter.next(); -// totalCount += sentOccurs.get(s)[sentIndex]; -// } - - sIter = sentOccurs.keySet().iterator(); - while(sIter.hasNext()){ - s = sIter.next(); -// wpcset[t++] = new WordCountPair(s, sentOccurs.get(s)[sentIndex] * 1.0 / totalCount); - wpcset[t++] = new WordCountPair(s, Math.sqrt(sentOccurs.get(s)[sentIndex] * 1.0 )); - } - - Arrays.sort(wpcset); - - double frac; - for(int i = 0; (i < topN || topN <= 0) && i < wpcset.length; i++){ - s = wpcset[i].getWord(); - frac = wpcset[i].getCount(); - - sb.append(s); - sb.append(":"); - sb.append(frac); - sb.append("\n"); - } - - return sb.toString(); - } - - public void trainInstances(List<String> tweetTexts){ - for(String text : tweetTexts){ - trainInstance(text); - } - } - - /** - * Classify a tweet as happy or sad. This ignores the emoticon for demonstration purposes. - * @param tweetText - * The text of the tweet - * @return - * A Classification object that returns the sentiment of the tweet. - */ - public Classification classify(String tweetText){ - //stores the probability of each sentiment being the tweets true sentiment. - double[] labelProbs = new double[SENTIMENT_LABELS.length]; - //tokenize the string - List<String> tokens = getTokens(tweetText); - int maxLabelIdx = 0; - for(int i = 0; i < labelProbs.length; i++){ - //calculate the probability that the tweet has that sentiment. - labelProbs[i] = calcLabelProb(tokens, i); - System.out.println(i + " -> " + labelProbs[i] ); - //keep track of the label probability - maxLabelIdx = labelProbs[i] > labelProbs[maxLabelIdx] ? i : maxLabelIdx; - } - //calc the confidence - double conf = labelProbs[maxLabelIdx]; - labelProbs[maxLabelIdx] = 0; - conf -= sumVector(labelProbs); - - return new Classification(SENTIMENT_LABELS[maxLabelIdx], conf); - } - - private int extractLabel(String tweetText){ - StringTokenizer tokens = new StringTokenizer(tweetText); - while(tokens.hasMoreTokens()){ - String token = tokens.nextToken(); - if(HAPPY_SMILEY_SET.contains(token)){ - return 0; - } - else if(SAD_SMILEY_SET.contains(token)){ - return 1; - } - } - return -1; - } - - /** - * This updates the classifier's probabilites for each word - * with the new piece of text. - * @param tokens - * The tokens in the tweet. - * @param sentIndex - * The sentiment label. - */ - private void updateClassifier(List<String> tokens, int sentIndex){ - for(String token : tokens){ - if(sentOccurs.containsKey(token)){ - sentOccurs.get(token)[sentIndex] ++ ; - } - else{ - //make a new array and put it - Integer[] newArray = {0, 0}; - newArray[sentIndex] ++; - sentOccurs.put(token, newArray); - } - } - //update the overall document count - sentCount[sentIndex]++; - } - - /** - * The probability of the tweet having a given label. - * @param tokens - * The tokens in the tweet. - * @param sentIndex - * The probability we are testing. - * @return - * The probability the tweet has the class label indicated by "sentIndex". - */ - private double calcLabelProb(List<String> tokens, int sentIndex){ - - //calculate the class probabilities - double[] pClass = new double[SENTIMENT_LABELS.length]; - int cSum = sumVector(sentCount); - int totalWordCount = 0; - - for(int i = 0; i < sentCount.length; i++){ - pClass[i] = sentCount[i] * 1.0 / cSum; - } - - for(String word : sentOccurs.keySet()){ - Integer[] wordCt = sentOccurs.get(word); - totalWordCount = sumVector(wordCt); - } - - - double p = 1.0; - boolean foundOne = false; - for(String token : tokens){ - if(sentOccurs.containsKey(token)){ - foundOne = true; - Integer[] probs = sentOccurs.get(token); - double pWordGivenClass = probs[sentIndex] / (double)(sumVector(probs)); - double pWord = sumVector(probs) / totalWordCount; - p *= pWordGivenClass * pClass[sentIndex] / pWord; - } - } - return foundOne ? p : 0.0; - } - - /** - * Helper function to sum the values in a 1D array. - * @param vector - * The 1D array to sum. - * @return - * The sum. - */ - private double sumVector(double[] vector){ - double sum = 0.0; - for(double d : vector) sum += d; - return sum; - } - - /** - * Helper function to sum the values in a 1D array. - * @param vector - * The 1D array to sum. - * @return - * The sum. - */ - private int sumVector(Integer[] vector){ - int sum = 0; - for(int d : vector) sum += d; - return sum; - } -} diff --git a/src/Chapter4/classification/bayes/StopwordsList.java b/src/Chapter4/classification/bayes/StopwordsList.java deleted file mode 100644 index 06edd5a..0000000 --- a/src/Chapter4/classification/bayes/StopwordsList.java +++ /dev/null @@ -1,10 +0,0 @@ -package Chapter4.classification.bayes; - -import java.util.Arrays; -import java.util.HashSet; -import java.util.Set; - -public class StopwordsList { - private static final String[] stopwords = {"a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "can", "did", "do", "does", "doing", "don", "down", "during", "each", "few", "for", "from", "further", "get", "had", "has", "have", "having", "he", "her", "here", "hers", "herself", "him", "himself", "his", "how", "i", "if", "im", "i'm", "in", "into", "is", "it", "its", "itself", "just", "me", "more", "most", "my", "myself", "no", "nor", "not", "now", "of", "off", "on", "once", "only", "or", "other", "our", "ours", "ourselves", "out", "over", "own", "rt", "s", "same", "she", "should", "so", "some", "such", "t", "than", "that", "the", "their", "theirs", "them", "themselves", "then", "there", "these", "they", "this", "those", "through", "to", "too", "under", "until", "up", "us", "very", "was", "we", "were", "what", "when", "where", "which", "while", "who", "whom", "why", "will", "with", "you", "your", "yours", "yourself", "yourselves"}; - public static final Set<String> stopwordsSet = new HashSet<String>(Arrays.asList(stopwords)); -} diff --git a/src/Chapter4/classification/bayes/TestNBC.java b/src/Chapter4/classification/bayes/TestNBC.java deleted file mode 100644 index 7e0e743..0000000 --- a/src/Chapter4/classification/bayes/TestNBC.java +++ /dev/null @@ -1,49 +0,0 @@ -package Chapter4.classification.bayes; - -import java.io.FileReader; -import java.io.IOException; - -import com.google.gson.JsonObject; -import com.google.gson.JsonStreamParser; - -public class TestNBC { - public static void main(String[] args){ - - String filename = args.length >= 1 ? args[0] : "owsemoticons.json"; - - //initialize the sentiment classifier - NaiveBayesSentimentClassifier nbsc = new NaiveBayesSentimentClassifier(); - - try { - //read the file, and train each document - JsonStreamParser parser = new JsonStreamParser(new FileReader(filename)); - JsonObject elem; - String text; - while (parser.hasNext()) { - elem = parser.next().getAsJsonObject(); - text = elem.get("text").getAsString(); - nbsc.trainInstance(text); - } - - //print out the positive and negative dictionary - System.out.println("=== Positive Dictionary ==="); - System.out.println(nbsc.printWordOccurs(0, 25)); - System.out.println("=== Negative Dictionary ==="); - System.out.println(nbsc.printWordOccurs(1, 25)); - - //now go through and classify each line as positive or negative -// parser = new JsonStreamParser(new FileReader(filename)); -// while (parser.hasNext()) { -// elem = parser.next().getAsJsonObject(); -// text = elem.get("text").getAsString(); -// Classification c = nbsc.classify(text); -// System.out.println(c + " -> " + text); -// } - System.out.println(nbsc.classify("I love new york")); - - } catch (IOException e) { - e.printStackTrace(); - } - - } -} diff --git a/src/Chapter4/classification/bayes/WordCountPair.java b/src/Chapter4/classification/bayes/WordCountPair.java deleted file mode 100644 index b96be92..0000000 --- a/src/Chapter4/classification/bayes/WordCountPair.java +++ /dev/null @@ -1,34 +0,0 @@ -package Chapter4.classification.bayes; - -public class WordCountPair implements Comparable<WordCountPair>{ - - - private String word; - private double count; - - public WordCountPair(String word, double count){ - this.word = word; - this.count = count; - } - - public int compareTo(WordCountPair arg0) { - return arg0.count - count < 0 ? -1 : 1; - } - - public String getWord() { - return word; - } - - public void setWord(String word) { - this.word = word; - } - - public double getCount() { - return count; - } - - public void setCount(int count) { - this.count = count; - } - -} |