From a45b8b54685e565eda2a6f81eee10c39974f1175 Mon Sep 17 00:00:00 2001
From: s123188
Date: Wed, 11 Jun 2014 12:29:38 +0200
Subject: new method for categorizing words

---
 src/main/Analyzor.java  | 50 +++++++++++++++++++++++++++++++++++++++++++++++++
 src/main/FarmShell.java |  5 +++++
 2 files changed, 55 insertions(+)

diff --git a/src/main/Analyzor.java b/src/main/Analyzor.java
index 7b9def5..7685cdb 100644
--- a/src/main/Analyzor.java
+++ b/src/main/Analyzor.java
@@ -472,6 +472,56 @@ public class Analyzor {
 		writer.close();
 	}
 
+	/**
+	 * Rewrites a CSV file so that the word in the first column of each data
+	 * row is replaced by its category.
+	 *
+	 * The word-to-category mapping is read from "categories.txt" (one
+	 * "word,category" pair per line; lines without a second comma-separated
+	 * field are skipped). The header row is copied unchanged, rows whose first
+	 * column has no mapping are copied unchanged, and the result is written to
+	 * "categorised.csv" as UTF-8.
+	 *
+	 * @param file path of the CSV file to categorize
+	 * @throws FileNotFoundException if an input or the output file cannot be opened
+	 * @throws UnsupportedEncodingException if UTF-8 is not supported
+	 */
+	void categorize(String file) throws FileNotFoundException, UnsupportedEncodingException{
+		//get the division in categories
+		HashMap<String, String> toCategory = new HashMap<>();
+		try (Scanner readFile = new Scanner(new FileInputStream("categories.txt"))) {
+			while (readFile.hasNextLine()) {
+				String[] parts = readFile.nextLine().split(",");
+				if (parts.length > 1) {
+					toCategory.put(parts[0], parts[1]);
+				}
+			}
+		}
+
+		//read the csv; try-with-resources closes both files even on failure
+		try (Scanner sc = new Scanner(new File(file));
+				PrintWriter writer = new PrintWriter("categorised.csv", "UTF-8")) {
+			//copy the first line (an empty input yields an empty output)
+			if (sc.hasNextLine()) {
+				writer.println(sc.nextLine());
+			}
+
+			//for every line
+			while (sc.hasNextLine()) {
+				String line = sc.nextLine();
+
+				//only the first column changes; the rest of the row is copied verbatim
+				int comma = line.indexOf(',');
+				String word = comma < 0 ? line : line.substring(0, comma);
+				String rest = comma < 0 ? "" : line.substring(comma);
+
+				//divide into categories; println ends the row so rows stay on separate lines
+				String category = toCategory.get(word);
+				writer.println((category != null ? category : word) + rest);
+			}
+		}
+	}
+
 	//replaces punctuation so it will be splitted
 	//also removes urls
 	private String splitPunctToWords(String text) {
diff --git a/src/main/FarmShell.java b/src/main/FarmShell.java
index 9342d0b..6fa6d5f 100644
--- a/src/main/FarmShell.java
+++ b/src/main/FarmShell.java
@@ -137,6 +137,7 @@ public class FarmShell {
 				break;
 			case timezone:
 				getAnalyzor().timezone(params[0]);
+				break;
 			case disco:
 				getAnalyzor().disco(params[0]);
 				break;
@@ -146,6 +147,9 @@ public class FarmShell {
 			case newsspread:
 				getAnalyzor().newsSpread(params[0]);
 				break;
+			case categorize:
+				getAnalyzor().categorize(params[0]);
+				break;
 			case getBrands:
 				getAnalyzor().getBrands();
 				break;
@@ -183,6 +187,7 @@ public class FarmShell {
 		disco("makes a outputfile for disco", 1),
 		posneg("makes a csv for a histogram for positive or negative tweets", 1),
 		newsspread("makes a csv for disco to show a news spread process", 1),
+		categorize("categorizes words in a csv as defined in categories.txt", 1),
 		exit("Returns to shell"),
 		help("Get help");
 
-- 
cgit v1.2.1