summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authors123188 <s123188@S123188.campus.tue.nl>2014-06-11 12:29:38 +0200
committers123188 <s123188@S123188.campus.tue.nl>2014-06-11 12:29:38 +0200
commita45b8b54685e565eda2a6f81eee10c39974f1175 (patch)
treec664a99976b7305a305263509ba7e00622ad0de9
parent7b476a73d999a9be4e49247873add1c66ef49824 (diff)
downloadGoldfarmer-a45b8b54685e565eda2a6f81eee10c39974f1175.tar.gz
new method for categorizing words
-rw-r--r--src/main/Analyzor.java50
-rw-r--r--src/main/FarmShell.java5
2 files changed, 55 insertions, 0 deletions
diff --git a/src/main/Analyzor.java b/src/main/Analyzor.java
index 7b9def5..7685cdb 100644
--- a/src/main/Analyzor.java
+++ b/src/main/Analyzor.java
@@ -472,6 +472,56 @@ public class Analyzor {
writer.close();
}
+ /**
+  * Copies a CSV file to "categorised.csv", replacing the word in the first
+  * column of every data row by its category.
+  *
+  * The word-to-category mapping is read from "categories.txt" in the working
+  * directory. Each line there is expected to look like "word,category"; lines
+  * with fewer than two comma-separated fields are ignored. The first line of
+  * the input CSV is treated as a header and copied unchanged. Rows whose first
+  * column has no known category are copied unchanged as well. If the input
+  * file is empty, an empty output file is produced.
+  *
+  * @param file path of the CSV file to categorize
+  * @throws FileNotFoundException if "categories.txt" or {@code file} cannot be
+  *         opened, or "categorised.csv" cannot be created
+  * @throws UnsupportedEncodingException if UTF-8 is not supported for the output
+  */
+ void categorize(String file) throws FileNotFoundException, UnsupportedEncodingException {
+
+     //get the division in categories (word -> category)
+     HashMap<String, String> toCategory = new HashMap<>();
+     try (Scanner readFile = new Scanner(new File("categories.txt"))) {
+         while (readFile.hasNextLine()) {
+             //split once per line; only lines with at least "word,category" count
+             String[] parts = readFile.nextLine().split(",");
+             if (parts.length > 1) {
+                 toCategory.put(parts[0], parts[1]);
+             }
+         }
+     }
+
+     //try-with-resources closes the reader and the writer even when an
+     //exception is thrown halfway through the file
+     try (Scanner sc = new Scanner(new File(file));
+             PrintWriter writer = new PrintWriter("categorised.csv", "UTF-8")) {
+
+         //an empty input has no header line to copy
+         if (!sc.hasNextLine()) {
+             return;
+         }
+         //copy the first line
+         writer.println(sc.nextLine());
+
+         //for every line
+         while (sc.hasNextLine()) {
+             //get the values (and so the word); limit -1 keeps trailing
+             //empty columns so the row keeps its number of fields
+             String[] values = sc.nextLine().split(",", -1);
+
+             //divide into categories
+             if (toCategory.containsKey(values[0])) {
+                 values[0] = toCategory.get(values[0]);
+             }
+
+             //rebuild the row and terminate it, so that every record ends up
+             //on its own line in the output
+             StringBuilder row = new StringBuilder();
+             for (int i = 0; i < values.length; i++) {
+                 if (i > 0) {
+                     row.append(",");
+                 }
+                 row.append(values[i]);
+             }
+             writer.println(row.toString());
+         }
+     }
+ }
+
//replaces punctuation so it will be split
//also removes urls
private String splitPunctToWords(String text) {
diff --git a/src/main/FarmShell.java b/src/main/FarmShell.java
index 9342d0b..6fa6d5f 100644
--- a/src/main/FarmShell.java
+++ b/src/main/FarmShell.java
@@ -137,6 +137,7 @@ public class FarmShell {
break;
case timezone:
getAnalyzor().timezone(params[0]);
+ break;
case disco:
getAnalyzor().disco(params[0]);
break;
@@ -146,6 +147,9 @@ public class FarmShell {
case newsspread:
getAnalyzor().newsSpread(params[0]);
break;
+ case categorize:
+ getAnalyzor().categorize(params[0]);
+ break;
case getBrands:
getAnalyzor().getBrands();
break;
@@ -183,6 +187,7 @@ public class FarmShell {
disco("makes a outputfile for disco", 1),
posneg("makes a csv for a histogram for positive or negative tweets", 1),
newsspread("makes a csv for disco to show a news spread process", 1),
+ categorize("categorizes words in a csv as defined in categories.txt", 1),
exit("Returns to shell"),
help("Get help");