Merge branch 'master' of git.lekensteyn.nl:tue/2IOC0-DBL/Goldfarmer

Conflicts: src/main/Analyzor.java src/main/FarmShell.java
author: Peter Wu <peter@lekensteyn.nl> 2014-06-11 14:43:58 +0200
committer: Peter Wu <peter@lekensteyn.nl> 2014-06-11 14:43:58 +0200
commit: d489c0b2fb1d0aa40d22eefba5e3bf0194fae710 (patch)
tree: b1a873f9a581f2377631573d24004a1cbec9ad68 /src
parent: 4511cfb3f975088eb4ebd388e00977cc2ed3787d (diff)
parent: 7c501d799ace9ff2cf615a5e91b9040f2311f2c4 (diff)
download: Goldfarmer-d489c0b2fb1d0aa40d22eefba5e3bf0194fae710.tar.gz
2 files changed, 82 insertions, 0 deletions
diff --git a/src/main/Analyzor.java b/src/main/Analyzor.java
index d0a2b31..128b078 100644
--- a/src/main/Analyzor.java
+++ b/src/main/Analyzor.java
@@ -566,6 +566,84 @@ public class Analyzor {
         }
         writer.close();
     }
+    
+    /**
+     * Rewrites the first column of the CSV {@code file} to a category name and
+     * writes the result to "categorised.csv". Categories are read from
+     * "categories.txt", one "category,key key ..." entry per line. A row is
+     * emitted once per distinct category whose key is a substring of the word;
+     * otherwise it is emitted once, using the exact-word key "#word" if known.
+     * @param file path of the CSV to read; its first line is copied unchanged
+     * @throws FileNotFoundException if an input or the output cannot be opened
+     */
+    void categorize(String file) throws FileNotFoundException, UnsupportedEncodingException{
+        //get the division in categories
+        HashMap<String,String> toCategory = new HashMap<>();
+        try (Scanner readFile = new Scanner(new FileInputStream("categories.txt"))) {
+            while (readFile.hasNextLine()) {
+                String[] parts = readFile.nextLine().split(",");
+                if(parts.length>1){
+                    for(String element : parts[1].split(" ")){
+                        toCategory.put(element, parts[0]);
+                    }
+                }
+            }
+        }
+
+        //read the csv; both files are closed even when processing fails
+        try (Scanner sc = new Scanner(new File(file));
+                PrintWriter writer = new PrintWriter("categorised.csv", "UTF-8")) {
+            //copy the first line (an empty input just yields an empty output)
+            if(sc.hasNextLine()){
+                writer.println(sc.nextLine());
+            }
+
+            //for every line
+            while(sc.hasNextLine()){
+                //get the values (and so the word)
+                String[] values = sc.nextLine().split(",");
+                String word = values[0];
+                boolean printed = false;
+
+                //divide into categories (substring match), once per category
+                HashSet<String> used = new HashSet<>();
+                for(String key : toCategory.keySet()){
+                    String category = toCategory.get(key);
+                    if(word.contains(key) && used.add(category)){
+                        //work on a copy so later keys still see the original word
+                        String[] newValues = values.clone();
+                        newValues[0] = category;
+                        writer.println(csvLine(newValues));
+                        printed = true;
+                    }
+                }
+
+                //no substring matched: fall back to the exact word, then print it
+                if(!printed){
+                    if(toCategory.containsKey("#" + word)){
+                        values[0] = toCategory.get("#" + word);
+                    }
+                    writer.println(csvLine(values));
+                }
+            }
+        }
+    }
+        
+    /**
+     * Joins the values into a single CSV line, separated by commas.
+     * Values are written as-is: nothing is quoted or escaped.
+     */
+    String csvLine(String[] values){
+        //StringBuilder avoids copying the partial line for every value
+        StringBuilder result = new StringBuilder();
+        for(int i = 0; i < values.length; i++){
+            if(i > 0){
+                result.append(',');
+            }
+            result.append(values[i]);
+        }
+        return result.toString();
+    }
 
     //replaces punctuation so it will be splitted
     //also removes urls
diff --git a/src/main/FarmShell.java b/src/main/FarmShell.java
index d624a71..61878d7 100644
--- a/src/main/FarmShell.java
+++ b/src/main/FarmShell.java
@@ -145,6 +145,9 @@ public class FarmShell {
             case newsspread:
                 getAnalyzor().newsSpread(params[0]);
                 break;
+            case categorize:
+                getAnalyzor().categorize(params[0]);
+                break;
             case getBrands:
                 String query = params.length > 0 ? params[0].trim() : "";
                 String[] args = query.split("\\s+", 2);
@@ -194,6 +197,7 @@ public class FarmShell {
         disco("makes a outputfile for disco", 1),
         posneg("makes a csv for a histogram for positive or negative tweets", 1),
         newsspread("makes a csv for disco to show a news spread process", 1),
+        categorize("categorizes words in csv (first column) as defined in categories.txt", 1),
         exit("Returns to shell"),
         help("Get help");
author	Peter Wu <peter@lekensteyn.nl>	2014-06-11 14:43:58 +0200
committer	Peter Wu <peter@lekensteyn.nl>	2014-06-11 14:43:58 +0200
commit	d489c0b2fb1d0aa40d22eefba5e3bf0194fae710 (patch)
tree	b1a873f9a581f2377631573d24004a1cbec9ad68 /src
parent	4511cfb3f975088eb4ebd388e00977cc2ed3787d (diff)
parent	7c501d799ace9ff2cf615a5e91b9040f2311f2c4 (diff)
download	Goldfarmer-d489c0b2fb1d0aa40d22eefba5e3bf0194fae710.tar.gz