diff options
author | Peter Wu <peter@lekensteyn.nl> | 2014-06-11 14:43:58 +0200 |
---|---|---|
committer | Peter Wu <peter@lekensteyn.nl> | 2014-06-11 14:43:58 +0200 |
commit | d489c0b2fb1d0aa40d22eefba5e3bf0194fae710 (patch) | |
tree | b1a873f9a581f2377631573d24004a1cbec9ad68 /src | |
parent | 4511cfb3f975088eb4ebd388e00977cc2ed3787d (diff) | |
parent | 7c501d799ace9ff2cf615a5e91b9040f2311f2c4 (diff) | |
download | Goldfarmer-d489c0b2fb1d0aa40d22eefba5e3bf0194fae710.tar.gz |
Merge branch 'master' of git.lekensteyn.nl:tue/2IOC0-DBL/Goldfarmer
Conflicts:
src/main/Analyzor.java
src/main/FarmShell.java
Diffstat (limited to 'src')
-rw-r--r-- | src/main/Analyzor.java | 78 | ||||
-rw-r--r-- | src/main/FarmShell.java | 4 |
2 files changed, 82 insertions, 0 deletions
diff --git a/src/main/Analyzor.java b/src/main/Analyzor.java index d0a2b31..128b078 100644 --- a/src/main/Analyzor.java +++ b/src/main/Analyzor.java @@ -566,6 +566,84 @@ public class Analyzor { } writer.close(); } + + void categorize(String file) throws FileNotFoundException, UnsupportedEncodingException{ + + //get the division in categories + InputStream inFile = new FileInputStream("categories.txt"); + Scanner readFile = new Scanner(inFile); + HashMap<String,String> toCategory = new HashMap<>(); + + while (readFile.hasNextLine()) { + String line = readFile.nextLine(); + if(line.split(",").length>1){ + for(String element:line.split(",")[1].split(" ")){ + toCategory.put(element, line.split(",")[0]); + } + } + } + + + //read the csv + Scanner sc = new Scanner(new File(file)); + + PrintWriter writer = new PrintWriter("categorised.csv", "UTF-8"); + //copy the first line + writer.println(sc.nextLine()); + + String line; + String[] values; + Boolean printed; + HashSet<String> used; + + //for every line + while(sc.hasNextLine()){ + //get the values (and so the word) + line = sc.nextLine(); + values = line.split(","); + printed = false; + + //divide into categories + //substring + used = new HashSet<>(); + for(String key : toCategory.keySet()){ + if(values[0].contains(key) && !used.contains(toCategory.get(key))){ + used.add(toCategory.get(key)); + String[] newValues = values; + newValues[0] = toCategory.get(key); + //print it + writer.println(csvLine(newValues)); + printed = true; + } + } + //exact word + if(toCategory.containsKey("#" + values[0])){ + values[0] = toCategory.get("#" + values[0]); + } + + //print it + if(!printed){ + writer.println(csvLine(values)); + } + } + writer.close(); + } + + String csvLine(String[] values){ + int length = values.length; + int index = 0; + String result = ""; + + for(String s : values){ + result += s; + if(!(index == length - 1)){ + result += ","; + } + index++; + } + + return result; + } //replaces punctuation so it will be splitted //also removes urls diff --git a/src/main/FarmShell.java b/src/main/FarmShell.java index d624a71..61878d7 100644 --- a/src/main/FarmShell.java +++ b/src/main/FarmShell.java @@ -145,6 +145,9 @@ public class FarmShell { case newsspread: getAnalyzor().newsSpread(params[0]); break; + case categorize: + getAnalyzor().categorize(params[0]); + break; case getBrands: String query = params.length > 0 ? params[0].trim() : ""; String[] args = query.split("\\s+", 2); @@ -194,6 +197,7 @@ public class FarmShell { disco("makes a outputfile for disco", 1), posneg("makes a csv for a histogram for positive or negative tweets", 1), newsspread("makes a csv for disco to show a news spread process", 1), + categorize("categorizes words in csv (first column) as defined in categorize.txt", 1), exit("Returns to shell"), help("Get help"); |