src/Chapter4/classification/bayes/NaiveBayesSentimentClassifier.java


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264

package Chapter4.classification.bayes;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;

/**
 * Performs both the training and the classification steps of a two-label
 * ("happy" / "sad") Naive Bayes style sentiment classifier for tweets.
 *
 * <p>Training is self-labeling: a tweet is only used for training when one of its
 * whitespace-separated tokens is a known emoticon; the first emoticon found decides
 * the label. Emoticon tokens themselves are never turned into word features.
 *
 * <p>Instances hold mutable counters and perform no synchronization.
 */
public class NaiveBayesSentimentClassifier {
	//the possible sentiment labels; an index into this array is the "sentiment index" used throughout
	private static final String[] SENTIMENT_LABELS = {"happy", "sad"};
	//sentiment indices returned by extractLabel
	private static final int HAPPY_INDEX = 0;
	private static final int SAD_INDEX = 1;
	private static final int NO_LABEL = -1;
	//the tokens to look for in labeling the sentiment.
	private static final String[] HAPPY_SMILEYS = {":)", ";)", ":D", ":-)", ":o)", ":-D"};
	private static final String[] SAD_SMILEYS = {":(", ":-(", ":'(", ":'-(", "D:"};
	//store these as a set for faster retrieval
	private static final Set<String> HAPPY_SMILEY_SET = new HashSet<String>(Arrays.asList(HAPPY_SMILEYS));
	private static final Set<String> SAD_SMILEY_SET = new HashSet<String>(Arrays.asList(SAD_SMILEYS));
	
	//word -> number of times the word has been seen under each sentiment index.
	private final Map<String, int[]> sentOccurs;
	//number of training tweets seen for each sentiment index.
	private final int[] sentCount;
	//total number of word occurrences recorded in sentOccurs, over all sentiments.
	private long totalWordCount;
	
	/**
	 * Creates an untrained classifier with all counters at zero.
	 */
	public NaiveBayesSentimentClassifier(){
		sentOccurs = new HashMap<String, int[]>();
		//int arrays are zero-initialized, so no explicit fill is needed
		sentCount = new int[SENTIMENT_LABELS.length];
		totalWordCount = 0;
	}
	
	/**
	 * Tokenize a string. Splits the text on whitespace, drops emoticon tokens,
	 * lower-cases each remaining token, strips every non-letter character from it,
	 * and discards tokens that end up empty or that are stopwords.
	 * No stemming is performed.
	 * @param text
	 * The piece of text
	 * @return
	 * The normalized words, in their original order (duplicates are kept).
	 */
	private List<String> getTokens(String text){
		StringTokenizer tokens = new StringTokenizer(text);
		List<String> words = new ArrayList<String>();
		
		while(tokens.hasMoreTokens()){
			String raw = tokens.nextToken();
			//An emoticon is the training label, not a feature. Without this check the
			//letter stripping below would turn ":D" and "D:" into the word "d" and
			//":o)" into "o", leaking the label into the word counts.
			if(HAPPY_SMILEY_SET.contains(raw) || SAD_SMILEY_SET.contains(raw)){
				continue;
			}
			
			//Locale.ROOT keeps the lowercasing independent of the JVM's default locale
			String lowered = raw.toLowerCase(Locale.ROOT);
			StringBuilder sb = new StringBuilder(lowered.length());
			for(int i = 0; i < lowered.length(); i++){
				char ch = lowered.charAt(i);
				if(Character.isLetter(ch)){
					sb.append(ch);
				}
			}
			
			String word = sb.toString();
			if(word.length() > 0 && !StopwordsList.stopwordsSet.contains(word)){
				words.add(word);
			}
		}
		
		return words;
	}
	
	/**
	 * Checks if tweet has a "label" (emoticon). If so, adds the tweet's words to
	 * the counts for that label; tweets without an emoticon are ignored.
	 * @param tweetText
	 * The text of the document to check. Must not be null.
	 */
	public void trainInstance(String tweetText){
		//see if the tweet is labeled (i.e. has a smiley)
		int tweetLabel = extractLabel(tweetText);
		if(tweetLabel == NO_LABEL){
			//nothing to learn from; skip tokenizing altogether
			return;
		}
		updateClassifier(getTokens(tweetText), tweetLabel);
	}
	
	/**
	 * Builds a textual listing of the words seen during training, one
	 * "word:value" pair per line, where value is the square root of the number of
	 * times the word was seen under the given sentiment.
	 * The lines follow the order produced by sorting the WordCountPair objects
	 * (presumably largest value first -- confirm against WordCountPair.compareTo).
	 * @param sentIndex
	 * The sentiment index (0 = "happy", 1 = "sad") whose counts are reported.
	 * @param topN
	 * The maximum number of lines to produce; zero or a negative value means "all words".
	 * @return
	 * The listing, or an empty string when nothing has been trained yet.
	 */
	public String printWordOccurs(int sentIndex, int topN){
		WordCountPair[] pairs = new WordCountPair[sentOccurs.size()];
		
		int next = 0;
		for(Map.Entry<String, int[]> entry : sentOccurs.entrySet()){
			//the square root dampens the spread between frequent and rare words
			double value = Math.sqrt(entry.getValue()[sentIndex]);
			pairs[next++] = new WordCountPair(entry.getKey(), value);
		}
		
		Arrays.sort(pairs);
		
		int limit = pairs.length;
		if(topN > 0 && topN < limit){
			limit = topN;
		}
		
		StringBuilder sb = new StringBuilder();
		for(int i = 0; i < limit; i++){
			double frac = pairs[i].getCount();
			sb.append(pairs[i].getWord());
			sb.append(":");
			sb.append(frac);
			sb.append("\n");
		}
		
		return sb.toString();
	}
	
	/**
	 * Trains on every text in the list, in order.
	 * @param tweetTexts
	 * The tweet texts; each one is handled exactly as in {@link #trainInstance(String)}.
	 */
	public void trainInstances(List<String> tweetTexts){
		for(String text : tweetTexts){
			trainInstance(text);
		}
	}
	
	/**
	 * Classify a tweet as happy or sad. This ignores the emoticon for demonstration purposes.
	 * The label with the highest score wins; on a tie (including the case where the
	 * tweet contains no word seen in training, so every score is 0) the label with
	 * the lowest index, "happy", is returned.
	 * @param tweetText
	 * The text of the tweet. Must not be null.
	 * @return
	 * A Classification object built from the winning label and a confidence value,
	 * computed as the winning score minus the sum of all other scores.
	 */
	public Classification classify(String tweetText){
		//tokenize the string
		List<String> tokens = getTokens(tweetText);
		//stores the score of each sentiment being the tweet's true sentiment.
		double[] labelProbs = new double[SENTIMENT_LABELS.length];
		int maxLabelIdx = 0;
		for(int i = 0; i < labelProbs.length; i++){
			labelProbs[i] = calcLabelProb(tokens, i);
			//strict comparison: the earliest label wins ties
			if(labelProbs[i] > labelProbs[maxLabelIdx]){
				maxLabelIdx = i;
			}
		}
		
		//calc the confidence: how far the winner is ahead of everything else combined
		double others = 0.0;
		for(int i = 0; i < labelProbs.length; i++){
			if(i != maxLabelIdx){
				others += labelProbs[i];
			}
		}
		double conf = labelProbs[maxLabelIdx] - others;
		
		return new Classification(SENTIMENT_LABELS[maxLabelIdx], conf);
	}
	
	/**
	 * Looks for an emoticon among the whitespace-separated tokens of the text.
	 * The match is exact and case-sensitive, and the first emoticon found decides the label.
	 * @param tweetText
	 * The raw tweet text.
	 * @return
	 * 0 for a happy emoticon, 1 for a sad one, or -1 when the text contains neither.
	 */
	private int extractLabel(String tweetText){
		StringTokenizer tokens = new StringTokenizer(tweetText);
		while(tokens.hasMoreTokens()){
			String token = tokens.nextToken();
			if(HAPPY_SMILEY_SET.contains(token)){
				return HAPPY_INDEX;
			}
			if(SAD_SMILEY_SET.contains(token)){
				return SAD_INDEX;
			}
		}
		return NO_LABEL;
	}
	
	/**
	 * This updates the classifier's counts for each word
	 * with the new piece of text.
	 * @param tokens
	 * The tokens in the tweet.
	 * @param sentIndex
	 * The sentiment label.
	 */
	private void updateClassifier(List<String> tokens, int sentIndex){
		for(String token : tokens){
			int[] counts = sentOccurs.get(token);
			if(counts == null){
				//first sighting of this word: one zeroed slot per sentiment label
				counts = new int[SENTIMENT_LABELS.length];
				sentOccurs.put(token, counts);
			}
			counts[sentIndex]++;
		}
		//every token added exactly one occurrence above
		totalWordCount += tokens.size();
		//update the overall document count
		sentCount[sentIndex]++;
	}
	
	/**
	 * The score of the tweet having a given label. For every token that was seen
	 * in training, the running product is multiplied by
	 * (share of the word's occurrences under the label) * P(label) / P(word),
	 * where P(label) is the fraction of training tweets with that label and P(word)
	 * is the word's share of all recorded word occurrences. Tokens never seen in
	 * training are skipped.
	 * @param tokens
	 * The tokens in the tweet.
	 * @param sentIndex
	 * The sentiment label being tested.
	 * @return
	 * The score for the label indicated by "sentIndex", or 0.0 when none of the
	 * tokens was seen in training. The value is not normalized across labels.
	 */
	private double calcLabelProb(List<String> tokens, int sentIndex){
		//prior of the label: fraction of training tweets that carried it.
		//(NaN when nothing was trained, but then no token is known and 0.0 is returned.)
		double pClass = sentCount[sentIndex] / (double) sumVector(sentCount);
		
		double p = 1.0;
		boolean foundOne = false;
		for(String token : tokens){
			int[] counts = sentOccurs.get(token);
			if(counts == null){
				continue;
			}
			foundOne = true;
			int wordTotal = sumVector(counts);
			//NOTE(review): this is count(word, label) / count(word), kept exactly as the
			//original computed it even though the name suggests P(word | label).
			double pWordGivenClass = counts[sentIndex] / (double) wordTotal;
			//floating-point division over the total of ALL word occurrences; the previous
			//int/int division against a single word's count truncated to 0 and produced
			//Infinity/NaN scores.
			double pWord = wordTotal / (double) totalWordCount;
			p *= pWordGivenClass * pClass / pWord;
		}
		return foundOne ? p : 0.0;
	}
	
	/**
	 * Helper function to sum the values in a 1D array.
	 * @param vector
	 * The 1D array to sum.
	 * @return
	 * The sum.
	 */
	private int sumVector(int[] vector){
		int sum = 0;
		for(int d : vector){
			sum += d;
		}
		return sum;
	}
}