summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author: Peter Wu <peter@lekensteyn.nl> 2014-06-11 14:43:58 +0200
committer: Peter Wu <peter@lekensteyn.nl> 2014-06-11 14:43:58 +0200
commit: d489c0b2fb1d0aa40d22eefba5e3bf0194fae710 (patch)
tree: b1a873f9a581f2377631573d24004a1cbec9ad68 /src
parent: 4511cfb3f975088eb4ebd388e00977cc2ed3787d (diff)
parent: 7c501d799ace9ff2cf615a5e91b9040f2311f2c4 (diff)
download: Goldfarmer-d489c0b2fb1d0aa40d22eefba5e3bf0194fae710.tar.gz
Merge branch 'master' of git.lekensteyn.nl:tue/2IOC0-DBL/Goldfarmer
Conflicts: src/main/Analyzor.java src/main/FarmShell.java
Diffstat (limited to 'src')
-rw-r--r-- src/main/Analyzor.java 78
-rw-r--r-- src/main/FarmShell.java 4
2 files changed, 82 insertions, 0 deletions
diff --git a/src/main/Analyzor.java b/src/main/Analyzor.java
index d0a2b31..128b078 100644
--- a/src/main/Analyzor.java
+++ b/src/main/Analyzor.java
@@ -566,6 +566,84 @@ public class Analyzor {
}
writer.close();
}
+
+ /**
+  * Replaces the word in the first column of a CSV file by its category and
+  * writes the result to "categorised.csv" (UTF-8, relative path).
+  *
+  * The mapping is read from "categories.txt" (relative path). Every line of
+  * that file with at least one comma is interpreted as
+  * "category,word1 word2 ..."; text after a second comma is ignored.
+  *
+  * For each data row of the CSV:
+  *  - one output row is written for every distinct category that has a
+  *    listed word occurring as a substring of the first column; the order
+  *    of these rows follows the HashMap iteration order and is unspecified;
+  *  - if no listed word occurs in it, the row is written once, with the
+  *    first column replaced by the category of the entry "#" + word when
+  *    such an entry exists, and unchanged otherwise.
+  * The first line of the CSV (the header) is copied verbatim.
+  *
+  * @param file path of the CSV file to categorize
+  * @throws FileNotFoundException if "categories.txt" or {@code file} cannot
+  *         be opened, or "categorised.csv" cannot be created
+  * @throws UnsupportedEncodingException if UTF-8 is not supported
+  */
+ void categorize(String file) throws FileNotFoundException, UnsupportedEncodingException{
+
+     //get the division in categories
+     HashMap<String,String> toCategory = new HashMap<>();
+     try (Scanner readFile = new Scanner(new FileInputStream("categories.txt"))) {
+         while (readFile.hasNextLine()) {
+             String[] parts = readFile.nextLine().split(",");
+             if (parts.length > 1) {
+                 for (String element : parts[1].split(" ")) {
+                     //consecutive spaces yield empty elements; an empty
+                     //key would be "contained" in every word
+                     if (!element.isEmpty()) {
+                         toCategory.put(element, parts[0]);
+                     }
+                 }
+             }
+         }
+     }
+
+     //NOTE(review): both inputs are read with the platform default charset
+     //while the output is written as UTF-8 - confirm this is intended
+     try (Scanner sc = new Scanner(new File(file));
+             PrintWriter writer = new PrintWriter("categorised.csv", "UTF-8")) {
+
+         //copy the first line; an empty input simply gives an empty output
+         if (sc.hasNextLine()) {
+             writer.println(sc.nextLine());
+         }
+
+         //for every line
+         while (sc.hasNextLine()) {
+             //limit -1 keeps trailing empty columns and guarantees that
+             //values[0] exists, even for a line consisting of commas only
+             String[] values = sc.nextLine().split(",", -1);
+             //remember the original word: values[0] is overwritten below
+             String word = values[0];
+             boolean printed = false;
+
+             //divide into categories
+             //substring
+             HashSet<String> used = new HashSet<>();
+             for (String key : toCategory.keySet()) {
+                 String category = toCategory.get(key);
+                 //add() returns false when the category was already used
+                 if (word.contains(key) && used.add(category)) {
+                     values[0] = category;
+                     writer.println(csvLine(values));
+                     printed = true;
+                 }
+             }
+
+             if (!printed) {
+                 //exact word
+                 String exact = toCategory.get("#" + word);
+                 if (exact != null) {
+                     values[0] = exact;
+                 }
+                 writer.println(csvLine(values));
+             }
+         }
+     }
+ }
+
+ /**
+  * Joins the given values into a single CSV line.
+  *
+  * The values are concatenated in array order with a comma between
+  * neighbours; no quoting or escaping is applied. An empty array gives
+  * the empty string.
+  *
+  * @param values the column values of one row
+  * @return the values separated by commas
+  */
+ String csvLine(String[] values){
+     int last = values.length - 1;
+     StringBuilder joined = new StringBuilder();
+
+     for (int i = 0; i <= last; i++) {
+         joined.append(values[i]);
+         //no separator after the final column
+         if (i != last) {
+             joined.append(",");
+         }
+     }
+
+     return joined.toString();
+ }
//replaces punctuation so it will be splitted
//also removes urls
diff --git a/src/main/FarmShell.java b/src/main/FarmShell.java
index d624a71..61878d7 100644
--- a/src/main/FarmShell.java
+++ b/src/main/FarmShell.java
@@ -145,6 +145,9 @@ public class FarmShell {
case newsspread:
getAnalyzor().newsSpread(params[0]);
break;
+ case categorize:
+ getAnalyzor().categorize(params[0]);
+ break;
case getBrands:
String query = params.length > 0 ? params[0].trim() : "";
String[] args = query.split("\\s+", 2);
@@ -194,6 +197,7 @@ public class FarmShell {
disco("makes a outputfile for disco", 1),
posneg("makes a csv for a histogram for positive or negative tweets", 1),
newsspread("makes a csv for disco to show a news spread process", 1),
+ categorize("categorizes words in csv (first column) as defined in categorize.txt", 1),
exit("Returns to shell"),
help("Get help");