// Source path: root/src/Chapter4/classification/bayes/NBCxv.java
// Blob: 5c48e2850aadb3188c869d948459fa6a388d1123
package Chapter4.classification.bayes;

import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;

import com.google.gson.JsonObject;
import com.google.gson.JsonStreamParser;

/**
 * Command-line driver that sets up repeated k-fold cross validation for a
 * {@link NaiveBayesSentimentClassifier}.
 *
 * <p>The program collects the {@code "text"} member of every JSON object in
 * the input file, then, for each round, randomly distributes the texts over
 * {@link #NUM_FOLDS} buckets and trains a fresh classifier on all buckets but
 * one.
 *
 * <p>NOTE: only the training half of each fold is implemented here; the
 * held-out bucket is never evaluated (see the TODO in {@link #main}).
 */
public class NBCxv {

	/** Number of buckets the texts are split into in each round. */
	private static final int NUM_FOLDS = 5;

	/** Number of times the whole cross validation is repeated. */
	private static final int NUM_ROUNDS = 3;

	/** Input file used when no path is supplied on the command line. */
	private static final String DEFAULT_FILENAME = "owsemoticons.json";

	/**
	 * Entry point.
	 *
	 * @param args optional; {@code args[0]} is the path of the JSON input
	 *             file, defaulting to {@code owsemoticons.json}
	 */
	public static void main(String[] args){

		String filename = args.length >= 1 ? args[0] : DEFAULT_FILENAME;

		ArrayList<String> allTexts = readTexts(filename);
		if(allTexts.isEmpty()){
			//nothing was loaded (unreadable or empty file): training on
			//empty buckets would be meaningless, so stop here.
			System.err.println("No texts loaded from " + filename + "; nothing to cross-validate.");
			return;
		}

		//do NUM_FOLDS-fold cross validation NUM_ROUNDS times
		for(int round = 0; round < NUM_ROUNDS; round++){

			//every round gets its own random split
			Map<Integer, ArrayList<String>> buckets = splitIntoBuckets(allTexts);

			for(int testIdx = 0; testIdx < NUM_FOLDS; testIdx++){
				//use all but testIdx as the training, use testIdx as the test.
				NaiveBayesSentimentClassifier nbsc = new NaiveBayesSentimentClassifier();
				for(int k = 0; k < NUM_FOLDS; k++){
					if(k != testIdx){
						nbsc.trainInstances(buckets.get(k));
					}
				}
				//TODO: test with bucket testIdx -- evaluation of the held-out
				//bucket is not implemented, so the trained classifier is
				//currently discarded.
			}
		}

	}

	/**
	 * Reads the input file as a sequence of JSON objects and collects the
	 * string value of each object's {@code "text"} member.
	 *
	 * <p>The reader is closed when reading finishes, whether normally or with
	 * an exception. An {@link IOException} (for example, a missing file) is
	 * reported on standard error and whatever was collected so far is
	 * returned.
	 *
	 * @param filename path of the JSON file to read
	 * @return the collected texts, in file order; empty if none were read
	 */
	private static ArrayList<String> readTexts(String filename){
		ArrayList<String> texts = new ArrayList<String>();
		//try-with-resources guarantees the FileReader is closed
		try (FileReader reader = new FileReader(filename)) {
			JsonStreamParser parser = new JsonStreamParser(reader);
			while (parser.hasNext()) {
				JsonObject elem = parser.next().getAsJsonObject();
				texts.add(elem.get("text").getAsString());
			}
		} catch (IOException e) {
			System.err.println("Could not read " + filename + ": " + e);
		}
		return texts;
	}

	/**
	 * Randomly distributes the given texts over {@link #NUM_FOLDS} buckets.
	 *
	 * <p>Each text is assigned independently to a uniformly chosen bucket, so
	 * bucket sizes are only approximately equal and a bucket may be empty.
	 *
	 * @param texts the texts to distribute
	 * @return a map from bucket index ({@code 0 .. NUM_FOLDS-1}) to the texts
	 *         assigned to that bucket; every index is present
	 */
	private static Map<Integer, ArrayList<String>> splitIntoBuckets(ArrayList<String> texts){
		Map<Integer, ArrayList<String>> buckets = new HashMap<Integer, ArrayList<String>>();
		//initialize the buckets so every index has a (possibly empty) list
		for(int j = 0; j < NUM_FOLDS; j++){
			buckets.put(j, new ArrayList<String>());
		}
		for(String text : texts){
			//Math.random() is in [0,1), so the index is in [0, NUM_FOLDS)
			int bucketIdx = (int) (Math.random() * NUM_FOLDS);
			buckets.get(bucketIdx).add(text);
		}
		return buckets;
	}
}