6 files changed, 0 insertions, 439 deletions
diff --git a/src/Chapter4/classification/bayes/Classification.java b/src/Chapter4/classification/bayes/Classification.java
deleted file mode 100644
index ea9aba7..0000000
--- a/src/Chapter4/classification/bayes/Classification.java
+++ /dev/null
@@ -1,22 +0,0 @@
-package Chapter4.classification.bayes;
-
-public class Classification {
-	private String label;
-	private double confidence;
-	
-	public Classification(String label, double confidence){
-		this.label = label;
-		this.confidence = confidence;
-	}
-	
-	public String getLabel() {
-		return label;
-	}
-	public double getConfidence() {
-		return confidence;
-	}
-	
-	public String toString(){
-		return "(" + label + ", " + confidence + ")";
-	}
-}
diff --git a/src/Chapter4/classification/bayes/NBCxv.java b/src/Chapter4/classification/bayes/NBCxv.java
deleted file mode 100644
index 5c48e28..0000000
--- a/src/Chapter4/classification/bayes/NBCxv.java
+++ /dev/null
@@ -1,60 +0,0 @@
-package Chapter4.classification.bayes;
-
-import java.io.FileReader;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.Map;
-
-import com.google.gson.JsonObject;
-import com.google.gson.JsonStreamParser;
-
-public class NBCxv {
-	public static void main(String[] args){
-			
-		String filename = args.length >= 1 ? args[0] : "owsemoticons.json";
-		
-		ArrayList<String> allTexts = new ArrayList<String>();
-	    
-		try {
-			//read the file, and train each document
-			JsonStreamParser parser = new JsonStreamParser(new FileReader(filename));
-			JsonObject elem;
-			while (parser.hasNext()) {
-				elem = parser.next().getAsJsonObject();
-	            allTexts.add(elem.get("text").getAsString());
-			}
-		} catch (IOException e) {
-			e.printStackTrace();
-		}
-		
-		//do 5-fold cross validation 3 times
-		Map<Integer, ArrayList<String>> buckets;
-		int bucketIdx;
-		NaiveBayesSentimentClassifier nbsc;
-		for(int i = 0; i < 3; i++){
-			
-			//randomly split the texts into 5 buckets
-			buckets = new HashMap<Integer, ArrayList<String>>();
-			//initialize the 5 buckets
-			for(int j = 0; j < 5; j++) buckets.put(j, new ArrayList<String>());
-			for(String text : allTexts){
-				bucketIdx = (int) (Math.random()*5);
-				buckets.get(bucketIdx).add(text);
-			}
-			
-			for(int j = 0; j < 5; j++){
-				//use all but j as the training, use j as the test.
-				nbsc = new NaiveBayesSentimentClassifier();
-				for(int k = 0; k < 5; k++){
-					if(k != j){
-						nbsc.trainInstances(buckets.get(k));
-					}
-				}
-				//test with bucket j
-				
-			}
-		}
-		
-	}
-}
diff --git a/src/Chapter4/classification/bayes/NaiveBayesSentimentClassifier.java b/src/Chapter4/classification/bayes/NaiveBayesSentimentClassifier.java
deleted file mode 100644
index 923416c..0000000
--- a/src/Chapter4/classification/bayes/NaiveBayesSentimentClassifier.java
+++ /dev/null
@@ -1,264 +0,0 @@
-package Chapter4.classification.bayes;
-
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.StringTokenizer;
-
-/**
- * This class performs both the training and classification steps of a Naive Bayes Classifier.
- *
- */
-public class NaiveBayesSentimentClassifier {
-	//the possible sentiment labels
-	private static final String[] SENTIMENT_LABELS = {"happy", "sad"};
-	//the tokens to look for in labeling the sentiment.
-	private static final String[] HAPPY_SMILEYS = {":)", ";)", ":D", ":-)", ":o)", ":-D"};
-	private static final String[] SAD_SMILEYS = {":(", ":-(", ":'(", ":'-(", "D:"};
-	//store these as a set for faster retrieval
-	private static final Set<String> HAPPY_SMILEY_SET = new HashSet<String>(Arrays.asList(HAPPY_SMILEYS));
-	private static final Set<String> SAD_SMILEY_SET = new HashSet<String>(Arrays.asList(SAD_SMILEYS));
-	
-	//counter for the number of times each word has been associated with each sentiment.
-	private Map<String, Integer[]> sentOccurs;
-	//counter for the number of times we've seen each sentiment.
-	private Integer[] sentCount;
-	
-	public NaiveBayesSentimentClassifier(){
-		//initialize the counters
-		sentOccurs = new HashMap<String, Integer[]>();
-		sentCount = new Integer[SENTIMENT_LABELS.length];
-		for(int i = 0; i < SENTIMENT_LABELS.length; i++){
-			sentCount[i] = 0;
-		}
-	}
-	
-	/**
-	 * Tokenize a string. Turns string into list of words based on whitespace, then
-	 * removes stopwords, punctuation, and reduces the word to its stem. 
-	 * @param text
-	 * The piece of text
-	 * @return
-	 * Each individual word.
-	 */
-	private List<String> getTokens(String text){
-		StringTokenizer tokens = new StringTokenizer(text);
-		ArrayList<String> words = new ArrayList<String>();
-		
-		String tmp;
-		StringBuilder sb;
-		while(tokens.hasMoreTokens()){
-			sb = new StringBuilder();
-			tmp = tokens.nextToken();
-			tmp = tmp.toLowerCase();
-			
-			for(char ch : tmp.toCharArray()){
-				if(Character.isLetter(ch)){
-					sb.append(ch);
-				}
-			}
-			tmp = sb.toString();
-			if(tmp.length() > 0 && !StopwordsList.stopwordsSet.contains(tmp)){
-				words.add(sb.toString());
-			}
-		}
-		
-		return words;
-	}
-	
-	/**
-	 * Checks if tweet has a "label" (emoticon). If so, stores the words in
-	 * the prior.
-	 * @param tweetText
-	 * The text of the document to check.
-	 */
-	public void trainInstance(String tweetText){
-		//see if the tweet is labeled (i.e. has a smiley)
-		int tweetLabel = extractLabel(tweetText);
-		List<String> tokens = getTokens(tweetText);
-		if(tweetLabel != -1){
-			//add these words to the classifier
-			updateClassifier(tokens, tweetLabel);
-		}
-	}
-	
-	public String printWordOccurs(int sentIndex, int topN){
-		StringBuilder sb = new StringBuilder();
-		
-		WordCountPair wpcset[] = new WordCountPair[sentOccurs.keySet().size()]; 
-		
-		String s;
-		int t = 0;
-		Iterator<String> sIter = sentOccurs.keySet().iterator();
-//		int totalCount = 0;
-//		while(sIter.hasNext()){
-//			s = sIter.next();
-//			totalCount += sentOccurs.get(s)[sentIndex];
-//		}
-		
-		sIter = sentOccurs.keySet().iterator();
-		while(sIter.hasNext()){
-			s = sIter.next();
-//			wpcset[t++] = new WordCountPair(s, sentOccurs.get(s)[sentIndex] * 1.0 / totalCount);
-			wpcset[t++] = new WordCountPair(s, Math.sqrt(sentOccurs.get(s)[sentIndex] * 1.0 ));
-		}
-		
-		Arrays.sort(wpcset);
-		
-		double frac;
-		for(int i = 0; (i < topN || topN <= 0) && i < wpcset.length; i++){
-			s = wpcset[i].getWord();
-			frac = wpcset[i].getCount();
-			
-			sb.append(s);
-			sb.append(":");
-			sb.append(frac);
-			sb.append("\n");
-		}
-		
-		return sb.toString();
-	}
-	
-	public void trainInstances(List<String> tweetTexts){
-		for(String text : tweetTexts){
-			trainInstance(text);
-		}
-	}
-	
-	/**
-	 * Classify a tweet as happy or sad. This ignores the emoticon for demonstration purposes.
-	 * @param tweetText
-	 * The text of the tweet
-	 * @return
-	 * A Classification object that returns the sentiment of the tweet.
-	 */
-	public Classification classify(String tweetText){
-		//stores the probability of each sentiment being the tweets true sentiment.
-		double[] labelProbs = new double[SENTIMENT_LABELS.length];
-		//tokenize the string
-		List<String> tokens = getTokens(tweetText);		
-		int maxLabelIdx = 0;
-		for(int i = 0; i < labelProbs.length; i++){
-			//calculate the probability that the tweet has that sentiment.
-			labelProbs[i] = calcLabelProb(tokens, i);
-			System.out.println(i + " -> " + labelProbs[i] );
-			//keep track of the label probability
-			maxLabelIdx = labelProbs[i] > labelProbs[maxLabelIdx] ? i : maxLabelIdx;
-		}
-		//calc the confidence
-		double conf = labelProbs[maxLabelIdx];
-		labelProbs[maxLabelIdx] = 0;
-		conf -= sumVector(labelProbs);
-		
-		return new Classification(SENTIMENT_LABELS[maxLabelIdx], conf);
-	}
-	
-	private int extractLabel(String tweetText){
-		StringTokenizer tokens = new StringTokenizer(tweetText);
-		while(tokens.hasMoreTokens()){
-			String token = tokens.nextToken();
-			if(HAPPY_SMILEY_SET.contains(token)){
-				return 0;
-			}
-			else if(SAD_SMILEY_SET.contains(token)){
-				return 1;
-			}
-		}
-		return -1;
-	}
-	
-	/**
-	 * This updates the classifier's probabilites for each word
-	 * with the new piece of text.
-	 * @param tokens
-	 * The tokens in the tweet.
-	 * @param sentIndex
-	 * The sentiment label.
-	 */
-	private void updateClassifier(List<String> tokens, int sentIndex){
-		for(String token : tokens){
-			if(sentOccurs.containsKey(token)){
-				sentOccurs.get(token)[sentIndex] ++ ;
-			}
-			else{
-				//make a new array and put it
-				Integer[] newArray = {0, 0};
-				newArray[sentIndex] ++;
-				sentOccurs.put(token, newArray);
-			}
-		}
-		//update the overall document count
-		sentCount[sentIndex]++;
-	}
-	
-	/**
-	 * The probability of the tweet having a given label.
-	 * @param tokens
-	 * The tokens in the tweet.
-	 * @param sentIndex
-	 * The probability we are testing.
-	 * @return
-	 * The probability the tweet has the class label indicated by "sentIndex".
-	 */
-	private double calcLabelProb(List<String> tokens, int sentIndex){
-		
-		//calculate the class probabilities
-		double[] pClass = new double[SENTIMENT_LABELS.length];
-		int cSum = sumVector(sentCount);
-		int totalWordCount = 0;
-		
-		for(int i = 0; i < sentCount.length; i++){
-			pClass[i] = sentCount[i] * 1.0 / cSum; 
-		}
-		
-		for(String word : sentOccurs.keySet()){
-			Integer[] wordCt = sentOccurs.get(word);
-			totalWordCount = sumVector(wordCt);
-		}
-		
-		
-		double p = 1.0;
-		boolean foundOne = false;
-		for(String token : tokens){
-			if(sentOccurs.containsKey(token)){
-				foundOne = true;
-				Integer[] probs = sentOccurs.get(token);
-				double pWordGivenClass = probs[sentIndex] / (double)(sumVector(probs)); 
-				double pWord = sumVector(probs) / totalWordCount;
-				p *= pWordGivenClass * pClass[sentIndex] / pWord;
-			}
-		}
-		return foundOne ? p : 0.0;
-	}
-	
-	/**
-	 * Helper function to sum the values in a 1D array.
-	 * @param vector
-	 * The 1D array to sum.
-	 * @return
-	 * The sum.
-	 */
-	private double sumVector(double[] vector){
-		double sum = 0.0;
-		for(double d : vector) sum += d;
-		return sum;
-	}
-	
-	/**
-	 * Helper function to sum the values in a 1D array.
-	 * @param vector
-	 * The 1D array to sum.
-	 * @return
-	 * The sum.
-	 */
-	private int sumVector(Integer[] vector){
-		int sum = 0;
-		for(int d : vector) sum += d;
-		return sum;
-	}
-}
diff --git a/src/Chapter4/classification/bayes/StopwordsList.java b/src/Chapter4/classification/bayes/StopwordsList.java
deleted file mode 100644
index 06edd5a..0000000
--- a/src/Chapter4/classification/bayes/StopwordsList.java
+++ /dev/null
@@ -1,10 +0,0 @@
-package Chapter4.classification.bayes;
-
-import java.util.Arrays;
-import java.util.HashSet;
-import java.util.Set;
-
-public class StopwordsList {
-	private static final String[] stopwords = {"a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "can", "did", "do", "does", "doing", "don", "down", "during", "each", "few", "for", "from", "further", "get", "had", "has", "have", "having", "he", "her", "here", "hers", "herself", "him", "himself", "his", "how", "i", "if", "im", "i'm", "in", "into", "is", "it", "its", "itself", "just", "me", "more", "most", "my", "myself", "no", "nor", "not", "now", "of", "off", "on", "once", "only", "or", "other", "our", "ours", "ourselves", "out", "over", "own", "rt", "s", "same", "she", "should", "so", "some", "such", "t", "than", "that", "the", "their", "theirs", "them", "themselves", "then", "there", "these", "they", "this", "those", "through", "to", "too", "under", "until", "up", "us", "very", "was", "we", "were", "what", "when", "where", "which", "while", "who", "whom", "why", "will", "with", "you", "your", "yours", "yourself", "yourselves"};
-	public static final Set<String> stopwordsSet = new HashSet<String>(Arrays.asList(stopwords));
-}
diff --git a/src/Chapter4/classification/bayes/TestNBC.java b/src/Chapter4/classification/bayes/TestNBC.java
deleted file mode 100644
index 7e0e743..0000000
--- a/src/Chapter4/classification/bayes/TestNBC.java
+++ /dev/null
@@ -1,49 +0,0 @@
-package Chapter4.classification.bayes;
-
-import java.io.FileReader;
-import java.io.IOException;
-
-import com.google.gson.JsonObject;
-import com.google.gson.JsonStreamParser;
-
-public class TestNBC {
-	public static void main(String[] args){
-		
-		String filename = args.length >= 1 ? args[0] : "owsemoticons.json";
-		
-		//initialize the sentiment classifier
-		NaiveBayesSentimentClassifier nbsc = new NaiveBayesSentimentClassifier();
-        
-		try {
-			//read the file, and train each document
-			JsonStreamParser parser = new JsonStreamParser(new FileReader(filename));
-			JsonObject elem;
-			String text;
-			while (parser.hasNext()) {
-	            elem = parser.next().getAsJsonObject();
-	            text = elem.get("text").getAsString();
-	            nbsc.trainInstance(text);
-			}
-			
-			//print out the positive and negative dictionary
-			System.out.println("=== Positive Dictionary ===");
-			System.out.println(nbsc.printWordOccurs(0, 25));
-			System.out.println("=== Negative Dictionary ===");
-			System.out.println(nbsc.printWordOccurs(1, 25));
-			
-	        //now go through and classify each line as positive or negative
-//			parser = new JsonStreamParser(new FileReader(filename));
-//			while (parser.hasNext()) {
-//	            elem = parser.next().getAsJsonObject();
-//	            text = elem.get("text").getAsString();
-//	            Classification c = nbsc.classify(text);
-//	            System.out.println(c + " -> " + text);
-//			}
-			System.out.println(nbsc.classify("I love new york"));
-			
-		} catch (IOException e) {
-			e.printStackTrace();
-		}
-		
-	}
-}	
diff --git a/src/Chapter4/classification/bayes/WordCountPair.java b/src/Chapter4/classification/bayes/WordCountPair.java
deleted file mode 100644
index b96be92..0000000
--- a/src/Chapter4/classification/bayes/WordCountPair.java
+++ /dev/null
@@ -1,34 +0,0 @@
-package Chapter4.classification.bayes;
-
-public class WordCountPair implements Comparable<WordCountPair>{
-
-	
-	private String word;
-	private double count;
-	
-	public WordCountPair(String word, double count){
-		this.word = word;
-		this.count = count;
-	}
-	
-	public int compareTo(WordCountPair arg0) {
-		return arg0.count - count < 0 ? -1 : 1;
-	}
-
-	public String getWord() {
-		return word;
-	}
-
-	public void setWord(String word) {
-		this.word = word;
-	}
-
-	public double getCount() {
-		return count;
-	}
-
-	public void setCount(int count) {
-		this.count = count;
-	}
- 
-}