From 0e0ae624389696732b8358fbe092813c24b2b361 Mon Sep 17 00:00:00 2001 From: Maurice Laveaux Date: Wed, 4 Jun 2014 10:31:38 +0200 Subject: Added better notifications on getBrands. * Reports amount of tweets processed. --- src/main/Analyzor.java | 770 +++++++++++++++++++++++++----------------------- src/main/FarmShell.java | 458 ++++++++++++++-------------- 2 files changed, 643 insertions(+), 585 deletions(-) diff --git a/src/main/Analyzor.java b/src/main/Analyzor.java index b896f62..1560417 100644 --- a/src/main/Analyzor.java +++ b/src/main/Analyzor.java @@ -1,368 +1,402 @@ -package main; - -import analysis.BrandChecker; -import database.NamedPreparedStatement; -import database.QueryUtils; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.io.InputStream; -import java.io.PrintWriter; -import java.io.UnsupportedEncodingException; -import java.sql.Connection; -import java.sql.PreparedStatement; -import java.sql.ResultSet; -import java.sql.SQLException; -import java.util.List; -import java.util.HashMap; -import java.util.Locale; -import java.util.Map.Entry; -import java.util.Scanner; - -/** - * The sentiment analysis class that rates tweets based on a unigram and bigram - * set of weights. - */ -public class Analyzor { - - /** - * The map that matches single words to their weights. - */ - private final HashMap unimap = new HashMap(); - - /** - * The map that matches word pairs to their weights. - */ - private final HashMap bimap = new HashMap(); - - /** - * The results of a query, maybe return from query(). - */ - private ResultSet data; - - /** - * The persistent connection to the database. - */ - private final Connection connection; - - /** - * @param connection An open connection to the database. - */ - public Analyzor(Connection connection) { - this.connection = connection; - } - - /** - * Read the unigram and bigram lexica. - * - * @throws FileNotFoundException - */ - public void readLexicon() throws FileNotFoundException { - if (!unimap.isEmpty()) { - // data is already read. - return; - } - System.err.println("Trying to read lexicons..."); - // A unigram is in the format (WS = whitespace): - // word rating ??? ?? - // A bigram has an two WS-separated words instead of one. - try (Scanner uniScanner = new Scanner(new File("unigrams-pmilexicon.txt")); - Scanner biScanner = new Scanner(new File("bigrams-pmilexicon.txt"));) { - //Fill the map of unigrams - int lineno = 1; - while (uniScanner.hasNext()) { - - String words = uniScanner.next(); - Double d = Double.valueOf(uniScanner.next()); - unimap.put(words.toLowerCase(), d); - if (uniScanner.hasNextLine()) { - uniScanner.nextLine(); - } - lineno++; - - } - - //fill the map of bigrams - while (biScanner.hasNext()) { - String words = biScanner.next() + " " + biScanner.next(); - bimap.put(words.toLowerCase(), Double.valueOf(biScanner.next())); - if (biScanner.hasNextLine()) { - biScanner.nextLine(); - } - } - } - System.err.println("Lexicons are read."); - } - - /** - * Executes a query that the analyzer can analyze. - * - * @param query The query string to execute. - * @throws SQLException When database connection isn't available. - */ - public void query(String query) throws SQLException { - PreparedStatement statement; - //make a connection to the database and execute the query - statement = connection.prepareStatement(query); - data = statement.executeQuery(); - } - - /** - * Run a sentiment analysis and fill the database with the output. - * - * @param query The sql text for the query. - * @throws SQLException - * @throws IOException - */ - public void sentimentAnalysis(String query) throws SQLException, IOException { - query(query); - - //read the lexicons - readLexicon(); - - //go to the start of te dataset - if (data == null) { - System.err.println("data is empty, try querying first"); - return; - } - - Double value; - String text; - - //for all tuples - while (data.next()) { - //get the text - text = data.getString("text"); - text = splitPunctToWords(text); - // test is the tweet text you are going to analyze - String[] words = text.split("\\s+"); // text splitted into separate words - double positiverate = 0; // positive rating - - // Rate the text with unigrams - for (String word : words) { - value = unimap.get(word); - if (value != null) { - positiverate += unimap.get(word); - } - } - // Rate the text with bigrams - for (int i = 0; i < words.length - 1; i++) { - String pair = words[i] + " " + words[i + 1]; - value = bimap.get(pair); - if (value != null) { - positiverate += bimap.get(pair); - } - } - //insert the rating into the database - NamedPreparedStatement m_insertRating; - m_insertRating = new NamedPreparedStatement(connection, QueryUtils.insertRating); - QueryUtils.setInsertParams(m_insertRating, data.getLong("tweetid"), data.getString("brand"), (int) (positiverate * 10)); - m_insertRating.executeUpdate(); - //don't print the rate - //System.out.println(text + ": " + (int) (positiverate * 10)); - } - } - - /** - * Make a wordcloud of the results of some query. - * - * @param query The sql text for a query. - * @throws SQLException - * @throws FileNotFoundException - * @throws UnsupportedEncodingException - */ - public void makeWordCloud(String query) throws SQLException, FileNotFoundException, UnsupportedEncodingException { - - query(query); - //go to the start of the ResultSet data - if (data == null) { - System.err.println("data is empty, try querying first"); - return; - } - - String text; - String brand; - String[] words; - HashMap> wordcloud = new HashMap<>(); - - while (data.next()) { - //get brand - brand=data.getString("brand"); - //make hashmap for each brand - if(!wordcloud.containsKey(brand)){ - wordcloud.put(brand, new HashMap()); - } - //get the text - text = data.getString("text"); - //remove punctuation, convert to lowercase and split on words - text = removePunct(text); - text = text.toLowerCase(); - words = text.split("\\s+"); - //for all words - for (String word : words) { - //if it is empty, a space or a stripe, skip it - if(word.equals("") || word.equals(" ") || word.equals("-")){ - continue; - } - //if the word is already in the map, increment the amount - if(wordcloud.get(brand).containsKey(word)){ - wordcloud.get(brand).put(word, wordcloud.get(brand).get(word) + 1); - } - //if the word is not already in the map, make an entry with amount = 1 - else{ - wordcloud.get(brand).put(word, 1); - } - } - } - //print the words and their frequency in a csv file - mapToCSV(wordcloud, "wordcloud.csv", "brand,word,count"); - } - - //generate csv for disco from the query - public void disco(String query) throws SQLException, FileNotFoundException, UnsupportedEncodingException { - //do the query - query(query); - PrintWriter writer = new PrintWriter("output.csv", "UTF-8"); - //print the first row - for (int i = 1; i < data.getMetaData().getColumnCount(); i++) { - writer.print(data.getMetaData().getColumnLabel(i) + ", "); - } - writer.println(data.getMetaData().getColumnLabel(data.getMetaData().getColumnCount())); - //print the values - while (data.next()) { - for (int i = 1; i < data.getMetaData().getColumnCount(); i++) { - if (data.getObject(i) == null) { - writer.print(", "); - } else { - writer.print(data.getObject(i).toString().replaceAll("[,\n]", " ") + ", "); - } - } - if (data.getObject(data.getMetaData().getColumnCount()) == null) { - writer.println("0"); - } else { - writer.println(data.getObject(data.getMetaData().getColumnCount()).toString().replace(",", " ")); - } - } - writer.close(); - } - - public void getBrands() throws SQLException { - PreparedStatement statement; - //make a connection to the database and execute the query - statement = connection.prepareStatement("delete from mentionsbrand"); - statement.executeUpdate(); - BrandChecker checker = new BrandChecker("brandonlyrules.txt"); - query("select * from tweet"); - NamedPreparedStatement m_insertBrand = new NamedPreparedStatement(connection, QueryUtils.insertBrand); - while (data.next()) { - List brands = checker.getBrands(data.getString("text")); - if (brands.isEmpty()) { - QueryUtils.setInsertBrandParams(m_insertBrand, data.getLong("tweetid"), "no"); - m_insertBrand.executeUpdate(); - } else { - for (String brand : brands) { - QueryUtils.setInsertBrandParams(m_insertBrand, data.getLong("tweetid"), brand); - m_insertBrand.executeUpdate(); - } - } - } - } - - //gets the amount of users that tweet about a brand in a timezone - //makes a csv file timezone, brand, amount - public void timezone(String query) throws SQLException, FileNotFoundException, UnsupportedEncodingException{ - query(query); - - InputStream inFile = new FileInputStream("timezone.txt"); - Scanner readFile = new Scanner(inFile); - HashMap toTimezone = new HashMap<>(); - while (readFile.hasNextLine()) { - String line = readFile.nextLine(); - if(line.split(",").length>1){ - toTimezone.put(line.split(",")[0], line.split(",")[1]); - } - } - - - - //hashmap timezone, brand, amount - HashMap> timeMap = new HashMap<>(); - String timezone; - String brand; - - while(data.next()){ - timezone = data.getString("timezone"); - if (toTimezone.containsKey(timezone)){ - timezone=toTimezone.get(timezone); - } else { - timezone="other"; - } - brand = data.getString("brand"); - //if the timezone is already in the map - if(timeMap.containsKey(timezone)){ - //if the brand for that timezone is already in the map - if(timeMap.get(timezone).containsKey(brand)){ - //increment the amount - timeMap.get(timezone).put(brand, timeMap.get(timezone).get(brand) + 1); - } - //if the brand for that timezone is not yet in the map - else{ - //make a new entry for that brand with amount = 1 - timeMap.get(timezone).put(brand, 1); - } - } - //if the timezone is not yet in the map - else{ - //make a new hashmap for this map and fill it with the brand and the amount - timeMap.put(timezone, new HashMap()); - timeMap.get(timezone).put(brand, 1); - } - } - - - //make the CSV out of the map - mapToCSV(timeMap, "timezone.csv", "timezone,brand,count"); - } - - //replaces punctuation so it will be splitted - //also removes urls - private String splitPunctToWords(String text) { - text = text.replaceAll("https?://\\S*", ""); - text = text.replaceAll("[!?):;\"']", " $0"); - text = text.replaceAll("[.,-](\\s|$)", " $0"); - text = text.replaceAll("\\s[(\"']", "$0 "); - return text; - } - - //removes punctuation - //also removes urls - private String removePunct(String text) { - text = text.replaceAll("https?://\\S*", " "); - text = text.replaceAll("@\\S*", " "); - text = text.replaceAll("[^a-zA-Z0-9#_-]", " "); - return text; - } - - //prints a hashmap into a csv for a html application - //Hashmap> becomes key1, key2, value - //only for String, String, Integer - void mapToCSV(HashMap> map, String fileName, String firstLine) - throws FileNotFoundException, UnsupportedEncodingException{ - - PrintWriter writer = new PrintWriter(fileName, "UTF-8"); - - writer.println(firstLine); - - //loop over brands - for(Entry en : map.entrySet()){ - //loop over words - for(Entry e : map.get(en.getKey()).entrySet()){ - writer.println(en.getKey() + "," + e.getKey() + "," + e.getValue()); - } - } - - writer.close(); - System.out.println("csv file made, please put it next to html file and run this"); - } -} +package main; + +import analysis.BrandChecker; +import database.NamedPreparedStatement; +import database.QueryUtils; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStream; +import java.io.PrintWriter; +import java.io.UnsupportedEncodingException; +import java.sql.Connection; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.util.List; +import java.util.HashMap; +import java.util.Locale; +import java.util.Map.Entry; +import java.util.Scanner; + +/** + * The sentiment analysis class that rates tweets based on a unigram and bigram + * set of weights. + */ +public class Analyzor { + + /** + * The map that matches single words to their weights. + */ + private final HashMap unimap = new HashMap(); + + /** + * The map that matches word pairs to their weights. + */ + private final HashMap bimap = new HashMap(); + + /** + * The results of a query, maybe return from query(). + */ + private ResultSet data; + + /** + * The persistent connection to the database. + */ + private final Connection connection; + + /** + * @param connection An open connection to the database. + */ + public Analyzor(Connection connection) { + this.connection = connection; + } + + /** + * Read the unigram and bigram lexica. + * + * @throws FileNotFoundException + */ + public void readLexicon() throws FileNotFoundException { + if (!unimap.isEmpty()) { + // data is already read. + return; + } + System.err.println("Trying to read lexicons..."); + // A unigram is in the format (WS = whitespace): + // word rating ??? ?? + // A bigram has an two WS-separated words instead of one. + try (Scanner uniScanner = new Scanner(new File("unigrams-pmilexicon.txt")); + Scanner biScanner = new Scanner(new File("bigrams-pmilexicon.txt"));) { + //Fill the map of unigrams + int lineno = 1; + while (uniScanner.hasNext()) { + + String words = uniScanner.next(); + Double d = Double.valueOf(uniScanner.next()); + unimap.put(words.toLowerCase(), d); + if (uniScanner.hasNextLine()) { + uniScanner.nextLine(); + } + lineno++; + + } + + //fill the map of bigrams + while (biScanner.hasNext()) { + String words = biScanner.next() + " " + biScanner.next(); + bimap.put(words.toLowerCase(), Double.valueOf(biScanner.next())); + if (biScanner.hasNextLine()) { + biScanner.nextLine(); + } + } + } + System.err.println("Lexicons are read."); + } + + /** + * Executes a query that the analyzer can analyze. + * + * @param query The query string to execute. + * @throws SQLException When database connection isn't available. + */ + public void query(String query) throws SQLException { + PreparedStatement statement; + //make a connection to the database and execute the query + statement = connection.prepareStatement(query); + data = statement.executeQuery(); + } + + /** + * Run a sentiment analysis and fill the database with the output. + * + * @param query The sql text for the query. + * @throws SQLException + * @throws IOException + */ + public void sentimentAnalysis(String query) throws SQLException, IOException { + query(query); + + //read the lexicons + readLexicon(); + + //go to the start of te dataset + if (data == null) { + System.err.println("data is empty, try querying first"); + return; + } + + Double value; + String text; + + //for all tuples + while (data.next()) { + //get the text + text = data.getString("text"); + text = splitPunctToWords(text); + // test is the tweet text you are going to analyze + String[] words = text.split("\\s+"); // text splitted into separate words + double positiverate = 0; // positive rating + + // Rate the text with unigrams + for (String word : words) { + value = unimap.get(word); + if (value != null) { + positiverate += unimap.get(word); + } + } + // Rate the text with bigrams + for (int i = 0; i < words.length - 1; i++) { + String pair = words[i] + " " + words[i + 1]; + value = bimap.get(pair); + if (value != null) { + positiverate += bimap.get(pair); + } + } + //insert the rating into the database + NamedPreparedStatement m_insertRating; + m_insertRating = new NamedPreparedStatement(connection, QueryUtils.insertRating); + QueryUtils.setInsertParams(m_insertRating, data.getLong("tweetid"), data.getString("brand"), (int) (positiverate * 10)); + m_insertRating.executeUpdate(); + //don't print the rate + //System.out.println(text + ": " + (int) (positiverate * 10)); + } + } + + /** + * Make a wordcloud of the results of some query. + * + * @param query The sql text for a query. + * @throws SQLException + * @throws FileNotFoundException + * @throws UnsupportedEncodingException + */ + public void makeWordCloud(String query) throws SQLException, FileNotFoundException, UnsupportedEncodingException { + + query(query); + //go to the start of the ResultSet data + if (data == null) { + System.err.println("data is empty, try querying first"); + return; + } + + String text; + String brand; + String[] words; + HashMap> wordcloud = new HashMap<>(); + + while (data.next()) { + //get brand + brand=data.getString("brand"); + //make hashmap for each brand + if(!wordcloud.containsKey(brand)){ + wordcloud.put(brand, new HashMap()); + } + //get the text + text = data.getString("text"); + //remove punctuation, convert to lowercase and split on words + text = removePunct(text); + text = text.toLowerCase(); + words = text.split("\\s+"); + //for all words + for (String word : words) { + //if it is empty, a space or a stripe, skip it + if(word.equals("") || word.equals(" ") || word.equals("-")){ + continue; + } + //if the word is already in the map, increment the amount + if(wordcloud.get(brand).containsKey(word)){ + wordcloud.get(brand).put(word, wordcloud.get(brand).get(word) + 1); + } + //if the word is not already in the map, make an entry with amount = 1 + else{ + wordcloud.get(brand).put(word, 1); + } + } + } + //print the words and their frequency in a csv file + mapToCSV(wordcloud, "wordcloud.csv", "brand,word,count"); + } + + //generate csv for disco from the query + public void disco(String query) throws SQLException, FileNotFoundException, UnsupportedEncodingException { + //do the query + query(query); + PrintWriter writer = new PrintWriter("output.csv", "UTF-8"); + //print the first row + for (int i = 1; i < data.getMetaData().getColumnCount(); i++) { + writer.print(data.getMetaData().getColumnLabel(i) + ", "); + } + writer.println(data.getMetaData().getColumnLabel(data.getMetaData().getColumnCount())); + //print the values + while (data.next()) { + for (int i = 1; i < data.getMetaData().getColumnCount(); i++) { + if (data.getObject(i) == null) { + writer.print(", "); + } else { + writer.print(data.getObject(i).toString().replaceAll("[,\n]", " ") + ", "); + } + } + if (data.getObject(data.getMetaData().getColumnCount()) == null) { + writer.println("0"); + } else { + writer.println(data.getObject(data.getMetaData().getColumnCount()).toString().replace(",", " ")); + } + } + writer.close(); + } + + /** + * Obtain the brands of select tweet texts. + * + * @param queryText The rows to select. + * @param reset Whether to reset mentionsbrand. + * @throws SQLException If the query is unsuccesfull. + */ + public void getBrands(String queryText, boolean reset) throws SQLException { + BrandChecker checker = new BrandChecker("brandonlyrules.txt"); + + PreparedStatement statement; + // make a connection to the database and execute the query + if (reset) { + System.out.println("Cleaning old entries of mentionsbrand."); + statement = connection.prepareStatement("delete from mentionsbrand"); + statement.executeUpdate(); + } + + System.out.println("Obtaining all selected entries in tweet."); + if (queryText.isEmpty()) { + query("select * from tweet"); + } else { + query(queryText); + } + System.out.println("Query finished."); + + NamedPreparedStatement insertBrand = new NamedPreparedStatement(connection, QueryUtils.insertBrand); + + int brandCount = 0; + int count = 0; + long timestamp = System.currentTimeMillis(); + + while (data.next()) { + List brands = checker.getBrands(data.getString("text")); + if (brands.isEmpty()) { + brandCount++; + QueryUtils.setInsertBrandParams(insertBrand, data.getLong("tweetid"), "no"); + insertBrand.executeUpdate(); + } else { + brandCount += brands.size(); + for (String brand : brands) { + QueryUtils.setInsertBrandParams(insertBrand, data.getLong("tweetid"), brand); + insertBrand.executeUpdate(); + } + } + + count++; + if (count % 10000 == 0) { + System.out.println("Processed " + count + " tweets in " + (System.currentTimeMillis() - timestamp) + " ms"); + } + } + + System.out.println("Processed " + count + " tweets in " + (System.currentTimeMillis() - timestamp) + " ms"); + System.out.println("Finished getBrands, processed " + count + " number of tweets, added " + brandCount + " brands or no."); + } + + //gets the amount of users that tweet about a brand in a timezone + //makes a csv file timezone, brand, amount + public void timezone(String query) throws SQLException, FileNotFoundException, UnsupportedEncodingException{ + query(query); + + InputStream inFile = new FileInputStream("timezone.txt"); + Scanner readFile = new Scanner(inFile); + HashMap toTimezone = new HashMap<>(); + while (readFile.hasNextLine()) { + String line = readFile.nextLine(); + if(line.split(",").length>1){ + toTimezone.put(line.split(",")[0], line.split(",")[1]); + } + } + + + + //hashmap timezone, brand, amount + HashMap> timeMap = new HashMap<>(); + String timezone; + String brand; + + while(data.next()){ + timezone = data.getString("timezone"); + if (toTimezone.containsKey(timezone)){ + timezone=toTimezone.get(timezone); + } else { + timezone="other"; + } + brand = data.getString("brand"); + //if the timezone is already in the map + if(timeMap.containsKey(timezone)){ + //if the brand for that timezone is already in the map + if(timeMap.get(timezone).containsKey(brand)){ + //increment the amount + timeMap.get(timezone).put(brand, timeMap.get(timezone).get(brand) + 1); + } + //if the brand for that timezone is not yet in the map + else{ + //make a new entry for that brand with amount = 1 + timeMap.get(timezone).put(brand, 1); + } + } + //if the timezone is not yet in the map + else{ + //make a new hashmap for this map and fill it with the brand and the amount + timeMap.put(timezone, new HashMap()); + timeMap.get(timezone).put(brand, 1); + } + } + + + //make the CSV out of the map + mapToCSV(timeMap, "timezone.csv", "timezone,brand,count"); + } + + //replaces punctuation so it will be splitted + //also removes urls + private String splitPunctToWords(String text) { + text = text.replaceAll("https?://\\S*", ""); + text = text.replaceAll("[!?):;\"']", " $0"); + text = text.replaceAll("[.,-](\\s|$)", " $0"); + text = text.replaceAll("\\s[(\"']", "$0 "); + return text; + } + + //removes punctuation + //also removes urls + private String removePunct(String text) { + text = text.replaceAll("https?://\\S*", " "); + text = text.replaceAll("@\\S*", " "); + text = text.replaceAll("[^a-zA-Z0-9#_-]", " "); + return text; + } + + //prints a hashmap into a csv for a html application + //Hashmap> becomes key1, key2, value + //only for String, String, Integer + void mapToCSV(HashMap> map, String fileName, String firstLine) + throws FileNotFoundException, UnsupportedEncodingException{ + + PrintWriter writer = new PrintWriter(fileName, "UTF-8"); + + writer.println(firstLine); + + //loop over brands + for(Entry en : map.entrySet()){ + //loop over words + for(Entry e : map.get(en.getKey()).entrySet()){ + writer.println(en.getKey() + "," + e.getKey() + "," + e.getValue()); + } + } + + writer.close(); + System.out.println("csv file made, please put it next to html file and run this"); + } +} diff --git a/src/main/FarmShell.java b/src/main/FarmShell.java index 6bf350e..1f2835c 100644 --- a/src/main/FarmShell.java +++ b/src/main/FarmShell.java @@ -1,217 +1,241 @@ -package main; - -import database.ConnectionBuilder; -import java.io.IOException; -import java.sql.Connection; -import java.sql.SQLException; -import java.util.Arrays; -import java.util.NoSuchElementException; -import java.util.Scanner; - -/** - * - * @author s123188 - */ -public class FarmShell { - - /** - * A scanner for the stdin. - */ - private final Scanner scanner = new Scanner(System.in); - - private Analyzor cached_analyzor; - private final ConnectionBuilder dbConnectionBuilder; - - FarmShell(ConnectionBuilder dbConnectionBuilder) { - this.dbConnectionBuilder = dbConnectionBuilder; - } - - private void printPrompt() { - System.out.print("$ "); - } - - private Analyzor getAnalyzor() throws SQLException { - if (cached_analyzor == null) { - Connection dbCon = dbConnectionBuilder.create(); - cached_analyzor = new Analyzor(dbCon); - } - return cached_analyzor; - } - - /** - * Processes commands from stdin until the exit command is received or EOF. - */ - public void process_forever() { - System.err.println("Entering interactive shell, type 'help' for help " - + "or 'exit' to leave. '.' repeats the previous interactive " - + "command."); - // print prompt for reading first command - printPrompt(); - String lastLine = ""; - while (scanner.hasNextLine()) { - String line = scanner.nextLine().trim(); - // repeat last command - if (line.equals(".")) { - line = lastLine; - } - if (!execute(line)) { - // requested to terminate - break; - } - if (!line.isEmpty()) { - lastLine = line; - } - // print prompt for reading next line - printPrompt(); - } - } - - /** - * Execute a single commands. - * - * @param cmd A single line of the command. - * @return Whether to continue or exit the application. - */ - public boolean execute(String cmd) { - String[] args = cmd.trim().split("\\s+", 2); - if (!args[0].isEmpty()) { - // non-empty command, let's see whether it makes sense? - return execute(args); - } - return true; - } - - /** - * Executes a command with optional parameters. - * - * @param args An array with the first argument containing the command with - * optional parameters in following arguments. - * @return true if more commands are allowed to be executed, false - * otherwise. - */ - public boolean execute(String[] args) { - try { - Command command = Command.fromString(args[0]); - String[] params = Arrays.copyOfRange(args, 1, args.length); - execute(command, params); - } catch (IllegalArgumentException ex) { - System.err.println(ex.getMessage()); - } catch (IOException ex) { - System.err.println("Command " + args[0] + " failed with " + ex); - ex.printStackTrace(); - } catch (NoSuchElementException ex) { - if ("EXIT NOW".equals(ex.getMessage())) { - // thrown by the "exit" command to signal exit - return false; - } else { - System.err.println("ZOMG SOMETHIGN FAILED: " + ex.getMessage()); - ex.printStackTrace(); - } - } catch (SQLException ex) { - System.err.println("such " + ex); - } - // another satisfied customer, next! - return true; - } - - private void execute(Command command, String[] params) throws SQLException, IOException { - if (params.length < command.getParamCount()) { - throw new IllegalArgumentException("Expected " - + command.getParamCount() + " parameters, got only " - + params.length); - } - switch (command) { - case filterbots: - System.out.println("not yet implemented"); - break; - case sentiment: - // if there is no query, update all unrated items. - if (params.length > 0) { - getAnalyzor().sentimentAnalysis(params[0]); - } else { - getAnalyzor().sentimentAnalysis(""); - } - break; - case wordcloud: - getAnalyzor().makeWordCloud(params[0]); - break; - case timezone: - getAnalyzor().timezone(params[0]); - case disco: - getAnalyzor().disco(params[0]); - break; - case getBrands: - getAnalyzor().getBrands(); - break; - case help: - for (String line : HELP) { - System.out.println(line); - } - for (Command cmd : Command.values()) { - System.out.printf(" %-10s", cmd.name()); - if (!cmd.getDescription().isEmpty()) { - System.out.print(" " + cmd.getDescription()); - } - if (cmd.getParamCount() == 1) { - System.out.print(" (1 arg)"); - } else if (cmd.getParamCount() > 1) { - System.out.printf(" (%d args)", cmd.getParamCount()); - } - System.out.println(); - } - break; - case exit: - throw new NoSuchElementException("EXIT NOW"); - default: - throw new AssertionError(command.name()); - } - } - - enum Command { - - filterbots("marks all users as bot or not", 1), - sentiment("analyzes all tweets on brand positivity (optional arg: tweet/brand selection query)"), - wordcloud("makes a wordcloud of the text of the tweets", 1), - getBrands("fills the database with the brands of a tweet"), - timezone("makes a map per brand for the users", 1), - disco("makes a outputfile for disco", 1), - exit("Returns to shell"), - help("Get help"); - - private final String description; - private final int paramCount; - - Command(String description) { - this.description = description; - this.paramCount = 0; - } - - Command(String description, int paramCount) { - this.description = description; - this.paramCount = paramCount; - } - - public String getDescription() { - return description; - } - - public int getParamCount() { - return paramCount; - } - - public static Command fromString(String command) { - for (Command cmd : values()) { - if (cmd.name().equals(command)) { - return cmd; - } - } - throw new IllegalArgumentException("Unrecognized command. Hint: help"); - } - }; - - private final String[] HELP = new String[]{ - "Interactive TweetShell", - "", - "Available commands:" - }; -} +package main; + +import database.ConnectionBuilder; +import java.io.IOException; +import java.sql.Connection; +import java.sql.SQLException; +import java.util.Arrays; +import java.util.NoSuchElementException; +import java.util.Scanner; + +/** + * + * @author s123188 + */ +public class FarmShell { + + /** + * A scanner for the stdin. + */ + private final Scanner scanner = new Scanner(System.in); + + private Analyzor cached_analyzor; + private final ConnectionBuilder dbConnectionBuilder; + + FarmShell(ConnectionBuilder dbConnectionBuilder) { + this.dbConnectionBuilder = dbConnectionBuilder; + } + + private void printPrompt() { + System.out.print("$ "); + } + + private Analyzor getAnalyzor() throws SQLException { + if (cached_analyzor == null) { + Connection dbCon = dbConnectionBuilder.create(); + cached_analyzor = new Analyzor(dbCon); + } + return cached_analyzor; + } + + /** + * Processes commands from stdin until the exit command is received or EOF. + */ + public void process_forever() { + System.err.println("Entering interactive shell, type 'help' for help " + + "or 'exit' to leave. '.' repeats the previous interactive " + + "command."); + // print prompt for reading first command + printPrompt(); + String lastLine = ""; + while (scanner.hasNextLine()) { + String line = scanner.nextLine().trim(); + // repeat last command + if (line.equals(".")) { + line = lastLine; + } + if (!execute(line)) { + // requested to terminate + break; + } + if (!line.isEmpty()) { + lastLine = line; + } + // print prompt for reading next line + printPrompt(); + } + } + + /** + * Execute a single commands. + * + * @param cmd A single line of the command. + * @return Whether to continue or exit the application. + */ + public boolean execute(String cmd) { + String[] args = cmd.trim().split("\\s+", 2); + if (!args[0].isEmpty()) { + // non-empty command, let's see whether it makes sense? + return execute(args); + } + return true; + } + + /** + * Executes a command with optional parameters. + * + * @param args An array with the first argument containing the command with + * optional parameters in following arguments. + * @return true if more commands are allowed to be executed, false + * otherwise. + */ + public boolean execute(String[] args) { + try { + Command command = Command.fromString(args[0]); + String[] params = Arrays.copyOfRange(args, 1, args.length); + execute(command, params); + } catch (IllegalArgumentException ex) { + System.err.println(ex.getMessage()); + } catch (IOException ex) { + System.err.println("Command " + args[0] + " failed with " + ex); + ex.printStackTrace(); + } catch (NoSuchElementException ex) { + if ("EXIT NOW".equals(ex.getMessage())) { + // thrown by the "exit" command to signal exit + return false; + } else { + System.err.println("ZOMG SOMETHIGN FAILED: " + ex.getMessage()); + ex.printStackTrace(); + } + } catch (SQLException ex) { + System.err.println("such " + ex); + } + // another satisfied customer, next! + return true; + } + + private void execute(Command command, String[] params) throws SQLException, IOException { + if (params.length < command.getParamCount()) { + throw new IllegalArgumentException("Expected " + + command.getParamCount() + " parameters, got only " + + params.length); + } + switch (command) { + case filterbots: + System.out.println("not yet implemented"); + break; + case sentiment: + // if there is no query, update all unrated items. + if (params.length > 0) { + getAnalyzor().sentimentAnalysis(params[0]); + } else { + getAnalyzor().sentimentAnalysis(""); + } + break; + case wordcloud: + getAnalyzor().makeWordCloud(params[0]); + break; + case timezone: + getAnalyzor().timezone(params[0]); + case disco: + getAnalyzor().disco(params[0]); + break; + case getBrands: + String trimmed = params[0].trim(); + String bool = trimmed; + String query = null; + + int index = trimmed.indexOf(" "); + + if (index > -1) { + bool = trimmed.substring(0, index); + query = trimmed.substring(index + 1, trimmed.length()); + } + + boolean reset = false; + if (bool.equals("true")) { + reset = true; + } else if (bool.equals("false")){ + reset = false; + } else { + throw new IllegalArgumentException("getBrands: expected boolean, got " + params[0]); + } + + if (query != null) { + getAnalyzor().getBrands(query, reset); + } else { + getAnalyzor().getBrands("", reset); + } + break; + case help: + for (String line : HELP) { + System.out.println(line); + } + for (Command cmd : Command.values()) { + System.out.printf(" %-10s", cmd.name()); + if (!cmd.getDescription().isEmpty()) { + System.out.print(" " + cmd.getDescription()); + } + if (cmd.getParamCount() == 1) { + System.out.print(" (1 arg)"); + } else if (cmd.getParamCount() > 1) { + System.out.printf(" (%d args)", cmd.getParamCount()); + } + System.out.println(); + } + break; + case exit: + throw new NoSuchElementException("EXIT NOW"); + default: + throw new AssertionError(command.name()); + } + } + + enum Command { + + filterbots("marks all users as bot or not", 1), + sentiment("analyzes all tweets on brand positivity (optional arg: tweet/brand selection query)"), + wordcloud("makes a wordcloud of the text of the tweets", 1), + getBrands("fills the database with the brands of a tweet, arg: bool indicating whether to reset mentionsbrand (optional arg: tweet selection query", 1), + timezone("makes a map per brand for the users", 1), + disco("makes a outputfile for disco", 1), + exit("Returns to shell"), + help("Get help"); + + private final String description; + private final int paramCount; + + Command(String description) { + this.description = description; + this.paramCount = 0; + } + + Command(String description, int paramCount) { + this.description = description; + this.paramCount = paramCount; + } + + public String getDescription() { + return description; + } + + public int getParamCount() { + return paramCount; + } + + public static Command fromString(String command) { + for (Command cmd : values()) { + if (cmd.name().equals(command)) { + return cmd; + } + } + throw new IllegalArgumentException("Unrecognized command. Hint: help"); + } + }; + + private final String[] HELP = new String[]{ + "Interactive TweetShell", + "", + "Available commands:" + }; +} -- cgit v1.2.1 From 7dd0cc0d27e6b741eb29089ef836f98e47cc9329 Mon Sep 17 00:00:00 2001 From: Maurice Laveaux Date: Wed, 4 Jun 2014 10:37:59 +0200 Subject: Removed to many rules that were not needed. --- brandonlyrules.txt | 73 ++++++------------------------------------------------ 1 file changed, 8 insertions(+), 65 deletions(-) diff --git a/brandonlyrules.txt b/brandonlyrules.txt index fe4557d..00e4dab 100755 --- a/brandonlyrules.txt +++ b/brandonlyrules.txt @@ -1,70 +1,13 @@ -samsung - samsung - galaxy -samsung - galaxy - s5,s4,s3,zoom,note -samsung - galaxy,s5 -samsung - samsung,s5 -samsung - galaxy,s4 -samsung - samsung,s4 -samsung - galaxy,s3 -samsung - samsung,s3 -samsung - galaxy,k,zoom -samsung - samsung,k,zoom -samsung - galaxy,note -samsung - samsung,note +samsung - samsung -food - apple - iphone -apple - iphone - 4,4s,5,5s,5c -apple - iphone,4 -apple - iphone4 -apple - iphone,4s -apple - iphone4s -apple - iphone,5 -apple - iphone5 -apple - iphone,5s -apple - iphone5s -apple - iphone,5c -apple - iphone5c +no - apple - iphone +apple - iphone -huawei- huawei - ascend,p6,p7,mini,y300,y530,mate,g700,g510,g6,g525 -huawei - huawei,ascend -huawei - huawei,p6 -huawei - huawei,p7 -huawei - huawei,mini -huawei - huawei,y300 -huawei - huawei,y530 -huawei - huawei,mate -huawei - huawei,g700 -huawei - huawei,g510 -huawei - huawei,g6 -huawei - huawei,g525 +huawei- huawei -sony - sony - xperia,e1,z,z1,z2,compact,ZR,M -sony - sony,xperia -sony - sony,L -sony - sony,E1 -sony - sony,Z -sony - sony,Z1 -sony - sony,Z2 -sony - sony,compact -sony - sony,ZR -sony - sony,M +sony - sony -HTC - htc - one,m8,mini,desire,dual,x,sv -HTC - htc,one -HTC - htc,m8 -HTC - htc,mini -HTC - htc,desire -HTC - htc,x,dual -HTC - htc,sv +HTC - htc -LG - lg - nexus,g2,l70,l90,flex,mini,l9,l7,l5,l3 -LG - nexus,5 -LG - lg,g2 -LG - lg,l70 -LG - lg,l90 -LG - lg,l40 -LG - lg,g,flex -LG - lg,mini -LG - lg,l9 -LG - lg,l7 -LG - lg,l5 -LG - lg,l3 +LG - nexus +LG - lg -- cgit v1.2.1 From 6747da91cba9f90dcc6786cb8979dafb3a44043b Mon Sep 17 00:00:00 2001 From: Maurice Laveaux Date: Wed, 4 Jun 2014 13:37:07 +0200 Subject: get rid of CRLF again. --- nbproject/project.properties | 162 +++++++++++++++++++++---------------------- 1 file changed, 81 insertions(+), 81 deletions(-) diff --git a/nbproject/project.properties b/nbproject/project.properties index b262ab6..ab8ae05 100644 --- a/nbproject/project.properties +++ b/nbproject/project.properties @@ -1,81 +1,81 @@ -annotation.processing.enabled=true -annotation.processing.enabled.in.editor=false -annotation.processing.processors.list= -annotation.processing.run.all.processors=true -annotation.processing.source.output=${build.generated.sources.dir}/ap-source-output -application.title=Goldfarmer -application.vendor=maurice -build.classes.dir=${build.dir}/classes -build.classes.excludes=**/*.java,**/*.form -# This directory is removed when the project is cleaned: -build.dir=build -build.generated.dir=${build.dir}/generated -build.generated.sources.dir=${build.dir}/generated-sources -# Only compile against the classpath explicitly listed here: -build.sysclasspath=ignore -build.test.classes.dir=${build.dir}/test/classes -build.test.results.dir=${build.dir}/test/results -# Uncomment to specify the preferred debugger connection transport: -#debug.transport=dt_socket -debug.classpath=\ - ${run.classpath} -debug.test.classpath=\ - ${run.test.classpath} -# Files in build.classes.dir which should be excluded from distribution jar -dist.archive.excludes= -# This directory is removed when the project is cleaned: -dist.dir=dist -dist.jar=${dist.dir}/Goldfarmer.jar -dist.javadoc.dir=${dist.dir}/javadoc -endorsed.classpath= -excludes= -file.reference.joda-time-2.3.jar=lib/joda-time-2.3.jar -file.reference.postgresql-9.3-1101.jdbc41.jar=lib/postgresql-9.3-1101.jdbc41.jar -includes=** -jar.compress=false -javac.classpath=\ - ${file.reference.joda-time-2.3.jar}:\ - ${file.reference.postgresql-9.3-1101.jdbc41.jar} -# Space-separated list of extra javac options -javac.compilerargs= -javac.deprecation=false -javac.processorpath=\ - ${javac.classpath} -javac.source=1.7 -javac.target=1.7 -javac.test.classpath=\ - ${javac.classpath}:\ - ${build.classes.dir}:\ - ${libs.junit_4.classpath} -javac.test.processorpath=\ - ${javac.test.classpath} -javadoc.additionalparam= -javadoc.author=false -javadoc.encoding=${source.encoding} -javadoc.noindex=false -javadoc.nonavbar=false -javadoc.notree=false -javadoc.private=false -javadoc.splitindex=true -javadoc.use=true -javadoc.version=false -javadoc.windowtitle= -main.class=main.Main -manifest.file=manifest.mf -meta.inf.dir=${src.dir}/META-INF -mkdist.disabled=false -platform.active=default_platform -project.licensePath=./nbproject/licenseheader.txt -run.classpath=\ - ${javac.classpath}:\ - ${build.classes.dir} -# Space-separated list of JVM arguments used when running the project. -# You may also define separate properties like run-sys-prop.name=value instead of -Dname=value. -# To set system properties for unit tests define test-sys-prop.name=value: -run.jvmargs= -run.test.classpath=\ - ${javac.test.classpath}:\ - ${build.test.classes.dir} -source.encoding=UTF-8 -src.dir=src -test.src.dir=test +annotation.processing.enabled=true +annotation.processing.enabled.in.editor=false +annotation.processing.processors.list= +annotation.processing.run.all.processors=true +annotation.processing.source.output=${build.generated.sources.dir}/ap-source-output +application.title=Goldfarmer +application.vendor=maurice +build.classes.dir=${build.dir}/classes +build.classes.excludes=**/*.java,**/*.form +# This directory is removed when the project is cleaned: +build.dir=build +build.generated.dir=${build.dir}/generated +build.generated.sources.dir=${build.dir}/generated-sources +# Only compile against the classpath explicitly listed here: +build.sysclasspath=ignore +build.test.classes.dir=${build.dir}/test/classes +build.test.results.dir=${build.dir}/test/results +# Uncomment to specify the preferred debugger connection transport: +#debug.transport=dt_socket +debug.classpath=\ + ${run.classpath} +debug.test.classpath=\ + ${run.test.classpath} +# Files in build.classes.dir which should be excluded from distribution jar +dist.archive.excludes= +# This directory is removed when the project is cleaned: +dist.dir=dist +dist.jar=${dist.dir}/Goldfarmer.jar +dist.javadoc.dir=${dist.dir}/javadoc +endorsed.classpath= +excludes= +file.reference.joda-time-2.3.jar=lib/joda-time-2.3.jar +file.reference.postgresql-9.3-1101.jdbc41.jar=lib/postgresql-9.3-1101.jdbc41.jar +includes=** +jar.compress=false +javac.classpath=\ + ${file.reference.joda-time-2.3.jar}:\ + ${file.reference.postgresql-9.3-1101.jdbc41.jar} +# Space-separated list of extra javac options +javac.compilerargs= +javac.deprecation=false +javac.processorpath=\ + ${javac.classpath} +javac.source=1.7 +javac.target=1.7 +javac.test.classpath=\ + ${javac.classpath}:\ + ${build.classes.dir}:\ + ${libs.junit_4.classpath} +javac.test.processorpath=\ + ${javac.test.classpath} +javadoc.additionalparam= +javadoc.author=false +javadoc.encoding=${source.encoding} +javadoc.noindex=false +javadoc.nonavbar=false +javadoc.notree=false +javadoc.private=false +javadoc.splitindex=true +javadoc.use=true +javadoc.version=false +javadoc.windowtitle= +main.class=main.Main +manifest.file=manifest.mf +meta.inf.dir=${src.dir}/META-INF +mkdist.disabled=false +platform.active=default_platform +project.licensePath=./nbproject/licenseheader.txt +run.classpath=\ + ${javac.classpath}:\ + ${build.classes.dir} +# Space-separated list of JVM arguments used when running the project. +# You may also define separate properties like run-sys-prop.name=value instead of -Dname=value. +# To set system properties for unit tests define test-sys-prop.name=value: +run.jvmargs= +run.test.classpath=\ + ${javac.test.classpath}:\ + ${build.test.classes.dir} +source.encoding=UTF-8 +src.dir=src +test.src.dir=test -- cgit v1.2.1 From 554f33b510656d5de46eecd5fef11f237fd38043 Mon Sep 17 00:00:00 2001 From: Maurice Laveaux Date: Wed, 4 Jun 2014 14:03:25 +0200 Subject: timezone command occurred twice. --- src/main/FarmShell.java | 1 - 1 file changed, 1 deletion(-) diff --git a/src/main/FarmShell.java b/src/main/FarmShell.java index 5fa468e..9e47a74 100644 --- a/src/main/FarmShell.java +++ b/src/main/FarmShell.java @@ -203,7 +203,6 @@ public class FarmShell { sentiment("analyzes all tweets on brand positivity (optional arg: tweet/brand selection query)"), wordcloud("makes a csv for a wordcloud of the text of the tweets", 1), getBrands("fills the database with the brands of a tweet, arg: bool indicating whether to reset mentionsbrand (optional arg: tweet selection query", 1), - timezone("makes a csv ", 1), timezone("makes a map per brand for the users", 1), disco("makes a outputfile for disco", 1), posneg("makes a csv for a histogram for positive or negative tweets", 1), -- cgit v1.2.1 From 6ed880b928ceaee3935562c2eb975ddaa49a8530 Mon Sep 17 00:00:00 2001 From: Peter Wu Date: Wed, 4 Jun 2014 16:16:41 +0200 Subject: Formatting, remove unused imports --- src/main/Analyzor.java | 225 +++++++++++++++++++++++------------------------- src/main/FarmShell.java | 14 +-- 2 files changed, 116 insertions(+), 123 deletions(-) diff --git a/src/main/Analyzor.java b/src/main/Analyzor.java index 40ec38a..5a201be 100644 --- a/src/main/Analyzor.java +++ b/src/main/Analyzor.java @@ -18,7 +18,6 @@ import java.sql.Timestamp; import java.util.List; import java.util.HashMap; import java.util.HashSet; -import java.util.Locale; import java.util.Map.Entry; import java.util.Scanner; @@ -186,14 +185,14 @@ public class Analyzor { String text; String brand; String[] words; - HashMap> wordcloud = new HashMap<>(); + HashMap> wordcloud = new HashMap<>(); while (data.next()) { //get brand brand = data.getString("brand"); //make hashmap for each brand - if(!wordcloud.containsKey(brand)){ - wordcloud.put(brand, new HashMap()); + if (!wordcloud.containsKey(brand)) { + wordcloud.put(brand, new HashMap()); } //get the text text = data.getString("text"); @@ -204,15 +203,14 @@ public class Analyzor { //for all words for (String word : words) { //if it is empty, a space or a stripe, skip it - if(word.equals("") || word.equals(" ") || word.equals("-")){ + if (word.equals("") || word.equals(" ") || word.equals("-")) { continue; } //if the word is already in the map, increment the amount - if(wordcloud.get(brand).containsKey(word)){ + if (wordcloud.get(brand).containsKey(word)) { wordcloud.get(brand).put(word, wordcloud.get(brand).get(word) + 1); - } - //if the word is not already in the map, make an entry with amount = 1 - else{ + } //if the word is not already in the map, make an entry with amount = 1 + else { wordcloud.get(brand).put(word, 1); } } @@ -251,14 +249,14 @@ public class Analyzor { /** * Obtain the brands of select tweet texts. - * + * * @param queryText The rows to select. * @param reset Whether to reset mentionsbrand. * @throws SQLException If the query is unsuccesfull. */ public void getBrands(String queryText, boolean reset) throws SQLException { BrandChecker checker = new BrandChecker("brandonlyrules.txt"); - + PreparedStatement statement; // make a connection to the database and execute the query if (reset) { @@ -266,7 +264,7 @@ public class Analyzor { statement = connection.prepareStatement("delete from mentionsbrand"); statement.executeUpdate(); } - + System.out.println("Obtaining all selected entries in tweet."); if (queryText.isEmpty()) { query("select * from tweet"); @@ -274,13 +272,13 @@ public class Analyzor { query(queryText); } System.out.println("Query finished."); - + NamedPreparedStatement insertBrand = new NamedPreparedStatement(connection, QueryUtils.insertBrand); - + int brandCount = 0; int count = 0; long timestamp = System.currentTimeMillis(); - + while (data.next()) { List brands = checker.getBrands(data.getString("text")); if (brands.isEmpty()) { @@ -294,112 +292,108 @@ public class Analyzor { insertBrand.executeUpdate(); } } - + count++; if (count % 10000 == 0) { System.out.println("Processed " + count + " tweets in " + (System.currentTimeMillis() - timestamp) + " ms"); } } - - System.out.println("Processed " + count + " tweets in " + (System.currentTimeMillis() - timestamp) + " ms"); + + System.out.println("Processed " + count + " tweets in " + (System.currentTimeMillis() - timestamp) + " ms"); System.out.println("Finished getBrands, processed " + count + " number of tweets, added " + brandCount + " brands or no."); } - + //gets the amount of users that tweet about a brand in a timezone //makes a csv file timezone, brand, amount - public void timezone(String query) throws SQLException, FileNotFoundException, UnsupportedEncodingException{ + public void timezone(String query) throws SQLException, FileNotFoundException, UnsupportedEncodingException { query(query); - + InputStream inFile = new FileInputStream("timezone.txt"); Scanner readFile = new Scanner(inFile); - HashMap toTimezone = new HashMap<>(); + HashMap toTimezone = new HashMap<>(); while (readFile.hasNextLine()) { String line = readFile.nextLine(); - if(line.split(",").length>1){ + if (line.split(",").length > 1) { toTimezone.put(line.split(",")[0], line.split(",")[1]); } } - + //hashmap timezone, brand, amount HashMap> timeMap = new HashMap<>(); String timezone; String brand; - - while(data.next()){ + + while (data.next()) { timezone = data.getString("timezone"); - if (toTimezone.containsKey(timezone)){ - timezone=toTimezone.get(timezone); + if (toTimezone.containsKey(timezone)) { + timezone = toTimezone.get(timezone); } else { - timezone="other"; + timezone = "other"; } brand = data.getString("brand"); - + //if the timezone is already in the map - if(timeMap.containsKey(timezone)){ + if (timeMap.containsKey(timezone)) { //if the brand for that timezone is already in the map - if(timeMap.get(timezone).containsKey(brand)){ + if (timeMap.get(timezone).containsKey(brand)) { //increment the amount timeMap.get(timezone).put(brand, timeMap.get(timezone).get(brand) + 1); - } - //if the brand for that timezone is not yet in the map - else{ + } //if the brand for that timezone is not yet in the map + else { //make a new entry for that brand with amount = 1 timeMap.get(timezone).put(brand, 1); } - } - //if the timezone is not yet in the map - else{ + } //if the timezone is not yet in the map + else { //make a new hashmap for this map and fill it with the brand and the amount timeMap.put(timezone, new HashMap()); timeMap.get(timezone).put(brand, 1); } } - + //make the CSV out of the map ssiMapToCSV(timeMap, "timezone.csv", "timezone,brand,count"); } - + //gets the positivity of the tweets about a brand //makes a csv file for posnegVisualizer - void posNeg(String query) throws SQLException, FileNotFoundException, UnsupportedEncodingException{ + void posNeg(String query) throws SQLException, FileNotFoundException, UnsupportedEncodingException { query(query); - + String brand; int rating; int ratingInterval; - + int intervalSize = 10; //brand, ratingInterval, amount HashMap> posnegMap = new HashMap<>(); /* - the rating interval is given by an integer, which is the result of the - tweets sentiment value divided by interval size rounded down. - This puts all data in boxes for the histogram. - */ - - while(data.next()){ - + the rating interval is given by an integer, which is the result of the + tweets sentiment value divided by interval size rounded down. + This puts all data in boxes for the histogram. + */ + + while (data.next()) { + brand = data.getString("brand"); rating = data.getInt("rating"); //ratingInterval is an integer divisible by intervalSize //if a rating is between a ratingInterval+-0.5*intervalSize, it belongs in that interval - ratingInterval = (rating + (int)(0.5 * intervalSize))/intervalSize*intervalSize; - + ratingInterval = (rating + (int) (0.5 * intervalSize)) / intervalSize * intervalSize; + //if the brand is already in the map - if(posnegMap.containsKey(brand)){ + if (posnegMap.containsKey(brand)) { //if the brand for that brand is already in the map - if(posnegMap.get(brand).containsKey(ratingInterval)){ + if (posnegMap.get(brand).containsKey(ratingInterval)) { //increment the amount posnegMap.get(brand).put(ratingInterval, posnegMap.get(brand).get(ratingInterval) + 1); - } - //if the brand for that brand is not yet in the map - else{ + } //if the brand for that brand is not yet in the map + else { //make a new entry for that brand with amount = 1 posnegMap.get(brand).put(ratingInterval, 1); } - } - //if the brand is not yet in the map - else{ + } //if the brand is not yet in the map + else { //make a new hashmap for this map and fill it with the brand and the amount posnegMap.put(brand, new HashMap()); posnegMap.get(brand).put(ratingInterval, 1); @@ -407,39 +401,39 @@ public class Analyzor { } siiMapToCSV(posnegMap, "posneg.csv", "brand,ratingInterval,count"); } - + /* - makes a csv for disco of a process of news spreading + makes a csv for disco of a process of news spreading - the query should be as follows: - - it should be a union of the following query twice, once with TYPE = retweet, once with TYPE = reply - - pick two tables of tweet (t1 and t2) and one of TYPEof - - t1.tweetid = TYPEof.TYPEonid and t2.tweetid = TYPEof.TYPEid - - t1.tweetid should be named maintweetid - - t2.tweetid should be named TYPEid - - t1.timestamp should be names maintime - - t2.timestamp should be named othertime - - t1.userid should be named mainuserid - - t2.userid should be named otheruserid + the query should be as follows: + - it should be a union of the following query twice, once with TYPE = retweet, once with TYPE = reply + - pick two tables of tweet (t1 and t2) and one of TYPEof + - t1.tweetid = TYPEof.TYPEonid and t2.tweetid = TYPEof.TYPEid + - t1.tweetid should be named maintweetid + - t2.tweetid should be named TYPEid + - t1.timestamp should be names maintime + - t2.timestamp should be named othertime + - t1.userid should be named mainuserid + - t2.userid should be named otheruserid - so the resulting tables should be: - maintweetid, maintime, mainuserid, replyid, retweetid, othertime, otheruserid + so the resulting tables should be: + maintweetid, maintime, mainuserid, replyid, retweetid, othertime, otheruserid - note that one of replyid and retweetid has to be null and the other a long for each row - how to do this: http://stackoverflow.com/questions/2309943/unioning-two-tables-with-different-number-of-columns + note that one of replyid and retweetid has to be null and the other a long for each row + how to do this: http://stackoverflow.com/questions/2309943/unioning-two-tables-with-different-number-of-columns - the csv will contain: tweetID of the replied/retweeted on, reply/retweet, timestamp, tweetid of the reply/retweet, userid - which corresponds to: caseID , activity , timestamp, resource , rescource - */ - void newsSpread(String query) throws SQLException, FileNotFoundException, UnsupportedEncodingException{ + the csv will contain: tweetID of the replied/retweeted on, reply/retweet, timestamp, tweetid of the reply/retweet, userid + which corresponds to: caseID , activity , timestamp, resource , rescource + */ + void newsSpread(String query) throws SQLException, FileNotFoundException, UnsupportedEncodingException { query(query); - + long maintweetID; long replyID; long retweetID; - + //tweetID, set of replyID's HashMap> hasReplies = new HashMap<>(); //tweetID, set of retweetID's @@ -448,20 +442,19 @@ public class Analyzor { HashMap timestamp = new HashMap<>(); //tweetID, its userID HashMap user = new HashMap<>(); - - while(data.next()){ - + + while (data.next()) { + maintweetID = data.getLong("thetweetid"); replyID = data.getLong("replyid"); retweetID = data.getLong("retweetid"); - + //put these in the corresponding maps //note that exact one of the two if statements below will hold - //if the replyID is not null - if(replyID != 0){ + if (replyID != 0) { //if this tweetID has no set yet, make one - if(hasReplies.get(maintweetID) == null){ + if (hasReplies.get(maintweetID) == null) { hasReplies.put(maintweetID, new HashSet()); } //add the replyID to the tweetID @@ -472,9 +465,9 @@ public class Analyzor { user.put(replyID, data.getLong("otheruser")); } //if the retweetID is not null - if(retweetID != 0){ + if (retweetID != 0) { //if this tweetID has no set yet, make one - if(hasRetweets.get(maintweetID) == null){ + if (hasRetweets.get(maintweetID) == null) { hasRetweets.put(maintweetID, new HashSet()); } //add the retweetID to the tweetID @@ -485,27 +478,27 @@ public class Analyzor { user.put(retweetID, data.getLong("otheruser")); } } - + //now use this data to make a csv for disco PrintWriter writer = new PrintWriter("newsSpread.csv", "UTF-8"); //print the first line writer.println("caseID,activity,timestamp,tweet,user"); - + //print all replies - for(Long tweetid : hasReplies.keySet()){ - for(Long replyid : hasReplies.get(tweetid)){ + for (Long tweetid : hasReplies.keySet()) { + for (Long replyid : hasReplies.get(tweetid)) { writer.println(tweetid + ", reply, " + timestamp.get(replyid) + ", " + replyid + ", " + user.get(replyid)); } } //print all retweets - for(Long tweetid : hasRetweets.keySet()){ - for(Long retweetid : hasRetweets.get(tweetid)){ + for (Long tweetid : hasRetweets.keySet()) { + for (Long retweetid : hasRetweets.get(tweetid)) { writer.println(tweetid + ", retweet, " + timestamp.get(retweetid) + ", " + retweetid + ", " + user.get(retweetid)); } } writer.close(); } - + //replaces punctuation so it will be splitted //also removes urls private String splitPunctToWords(String text) { @@ -524,44 +517,44 @@ public class Analyzor { text = text.replaceAll("[^a-zA-Z0-9#_-]", " "); return text; } - + //prints a hashmap into a csv for a html application //Hashmap> becomes key1, key2, value //only for String, String, Integer - void ssiMapToCSV(HashMap> map, String fileName, String firstLine) - throws FileNotFoundException, UnsupportedEncodingException{ - + void ssiMapToCSV(HashMap> map, String fileName, String firstLine) + throws FileNotFoundException, UnsupportedEncodingException { + PrintWriter writer = new PrintWriter(fileName, "UTF-8"); - + writer.println(firstLine); - + //loop over brands - for(Entry en : map.entrySet()){ + for (Entry en : map.entrySet()) { //loop over words - for(Entry e : map.get(en.getKey()).entrySet()){ + for (Entry e : map.get(en.getKey()).entrySet()) { writer.println(en.getKey() + "," + e.getKey() + "," + e.getValue()); } } - + writer.close(); System.out.println("csv file made, please put it next to html file and run this"); } - - void siiMapToCSV(HashMap> map, String fileName, String firstLine) - throws FileNotFoundException, UnsupportedEncodingException{ - + + void siiMapToCSV(HashMap> map, String fileName, String firstLine) + throws FileNotFoundException, UnsupportedEncodingException { + PrintWriter writer = new PrintWriter(fileName, "UTF-8"); - + writer.println(firstLine); - + //loop over brands - for(Entry en : map.entrySet()){ + for (Entry en : map.entrySet()) { //loop over words - for(Entry e : map.get(en.getKey()).entrySet()){ + for (Entry e : map.get(en.getKey()).entrySet()) { writer.println(en.getKey() + "," + e.getKey() + "," + e.getValue()); } } - + writer.close(); System.out.println("csv file made, please put it next to html file and run this"); } diff --git a/src/main/FarmShell.java b/src/main/FarmShell.java index 9e47a74..766e652 100644 --- a/src/main/FarmShell.java +++ b/src/main/FarmShell.java @@ -147,26 +147,26 @@ public class FarmShell { getAnalyzor().newsSpread(params[0]); break; case getBrands: - String trimmed = params[0].trim(); + String trimmed = params[0].trim(); String bool = trimmed; String query = null; - + int index = trimmed.indexOf(" "); - + if (index > -1) { - bool = trimmed.substring(0, index); + bool = trimmed.substring(0, index); query = trimmed.substring(index + 1, trimmed.length()); } - + boolean reset = false; if (bool.equals("true")) { reset = true; - } else if (bool.equals("false")){ + } else if (bool.equals("false")) { reset = false; } else { throw new IllegalArgumentException("getBrands: expected boolean, got " + params[0]); } - + if (query != null) { getAnalyzor().getBrands(query, reset); } else { -- cgit v1.2.1 From e14c671f3cbb56a08d765bd992e4cf774a0d1353 Mon Sep 17 00:00:00 2001 From: Peter Wu Date: Wed, 4 Jun 2014 17:13:19 +0200 Subject: Producer / consumer for getBrands --- src/database/BrandAnalyzerQueue.java | 88 ++++++++++++++++++++++++++++++++++++ src/main/Analyzor.java | 31 ++++++------- 2 files changed, 102 insertions(+), 17 deletions(-) create mode 100644 src/database/BrandAnalyzerQueue.java diff --git a/src/database/BrandAnalyzerQueue.java b/src/database/BrandAnalyzerQueue.java new file mode 100644 index 0000000..d4e4029 --- /dev/null +++ b/src/database/BrandAnalyzerQueue.java @@ -0,0 +1,88 @@ +package database; + +import analysis.BrandChecker; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.util.List; +import java.util.concurrent.ArrayBlockingQueue; +import java.util.concurrent.BlockingQueue; +import java.util.logging.Level; +import java.util.logging.Logger; + +/** + * + * @author Peter Wu + */ +public class BrandAnalyzerQueue implements Runnable { + + private final BrandChecker checker; + private final ResultSet data; + private final BlockingQueue queue; + private volatile boolean last = false; + + public BrandAnalyzerQueue(ResultSet data) { + this.checker = new BrandChecker("brandonlyrules.txt"); + this.data = data; + this.queue = new ArrayBlockingQueue<>(1000); + } + + private Logger getLogger() { + return Logger.getLogger(BrandAnalyzerQueue.class.getName()); + } + + @Override + public void run() { + try { + fillQueue(); + } catch (SQLException ex) { + getLogger().log(Level.SEVERE, "Horrible! Database error", ex); + } catch (InterruptedException ex) { + getLogger().log(Level.SEVERE, "Interrupted!", ex); + } + try { + last = true; + queue.put(new Result(-1, null)); + } catch (InterruptedException ex) { + getLogger().log(Level.SEVERE, "Failed to insert suicide pill!"); + } + } + + private void fillQueue() throws SQLException, InterruptedException { + while (data.next()) { + List brands = checker.getBrands(data.getString("text")); + // if there is no brand, add a dummy so we know it got checked + if (brands.isEmpty()) { + brands.add("no"); + } + long tweetid = data.getLong("tweetid"); + Result result = new Result(tweetid, brands); + queue.put(result); + } + } + + public Result next() { + Result result = null; + try { + if (!last) { + result = queue.take(); + if (result.brands == null) { + result = null; + } + } + } catch (InterruptedException ex) { + getLogger().log(Level.SEVERE, "Interrupted!", ex); + } + return result; + } + + public static class Result { + + public final long tweetid; + public final List brands; + + public Result(long tweetid, List brands) { + this.tweetid = tweetid; + this.brands = brands; + } + } +} diff --git a/src/main/Analyzor.java b/src/main/Analyzor.java index 5a201be..810fc4d 100644 --- a/src/main/Analyzor.java +++ b/src/main/Analyzor.java @@ -1,6 +1,7 @@ package main; import analysis.BrandChecker; +import database.BrandAnalyzerQueue; import database.NamedPreparedStatement; import database.QueryUtils; import java.io.File; @@ -255,8 +256,6 @@ public class Analyzor { * @throws SQLException If the query is unsuccesfull. */ public void getBrands(String queryText, boolean reset) throws SQLException { - BrandChecker checker = new BrandChecker("brandonlyrules.txt"); - PreparedStatement statement; // make a connection to the database and execute the query if (reset) { @@ -278,29 +277,27 @@ public class Analyzor { int brandCount = 0; int count = 0; long timestamp = System.currentTimeMillis(); - - while (data.next()) { - List brands = checker.getBrands(data.getString("text")); - if (brands.isEmpty()) { - brandCount++; - QueryUtils.setInsertBrandParams(insertBrand, data.getLong("tweetid"), "no"); + BrandAnalyzerQueue analyzer = new BrandAnalyzerQueue(data); + BrandAnalyzerQueue.Result result; + new Thread(analyzer).start(); + while ((result = analyzer.next()) != null) { + for (String brand : result.brands) { + QueryUtils.setInsertBrandParams(insertBrand, result.tweetid, brand); insertBrand.executeUpdate(); - } else { - brandCount += brands.size(); - for (String brand : brands) { - QueryUtils.setInsertBrandParams(insertBrand, data.getLong("tweetid"), brand); - insertBrand.executeUpdate(); - } } + brandCount += result.brands.size(); count++; if (count % 10000 == 0) { - System.out.println("Processed " + count + " tweets in " + (System.currentTimeMillis() - timestamp) + " ms"); + System.err.println("Processed " + count + " tweets, inserted " + + brandCount + " in " + ((System.currentTimeMillis() - timestamp) / 1000) + " sec"); } } - System.out.println("Processed " + count + " tweets in " + (System.currentTimeMillis() - timestamp) + " ms"); - System.out.println("Finished getBrands, processed " + count + " number of tweets, added " + brandCount + " brands or no."); + System.err.println("Processed " + count + " tweets in " + + ((System.currentTimeMillis() - timestamp) / 1000) + " sec"); + System.err.println("Finished getBrands, processed " + count + + " number of tweets, added " + brandCount + " brands or no."); } //gets the amount of users that tweet about a brand in a timezone -- cgit v1.2.1