diff options
author | s123188 <s123188@S123188.campus.tue.nl> | 2014-05-27 17:55:08 +0200 |
---|---|---|
committer | s123188 <s123188@S123188.campus.tue.nl> | 2014-05-27 17:55:08 +0200 |
commit | 9969b6a6cbae322680cfcbc27df3d37b0954f00a (patch) | |
tree | e790ccdb63aa581dd61a2a1ce4e3882b40ee3d76 /src | |
parent | 00baf4ffc86dac7b723c4fc3d2c963d1fa84729b (diff) | |
download | Goldfarmer-9969b6a6cbae322680cfcbc27df3d37b0954f00a.tar.gz |
Changed Analyzor.timezone(String query) so that it adds a "legenda" entry to the timezone map, so that the visualizer can display a legend.
Diffstat (limited to 'src')
-rw-r--r-- | src/main/Analyzor.java | 710 |
1 files changed, 361 insertions, 349 deletions
diff --git a/src/main/Analyzor.java b/src/main/Analyzor.java index ffd9a5b..6369ece 100644 --- a/src/main/Analyzor.java +++ b/src/main/Analyzor.java @@ -1,349 +1,361 @@ -package main; - -import analysis.BrandChecker; -import database.NamedPreparedStatement; -import database.QueryUtils; -import java.io.File; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.io.PrintWriter; -import java.io.UnsupportedEncodingException; -import java.sql.Connection; -import java.sql.PreparedStatement; -import java.sql.ResultSet; -import java.sql.SQLException; -import java.util.List; -import java.util.HashMap; -import java.util.Map.Entry; -import java.util.Scanner; - -/** - * The sentiment analysis class that rates tweets based on a unigram and bigram - * set of weights. - */ -public class Analyzor { - - /** - * The map that matches single words to their weights. - */ - private final HashMap<String, Double> unimap = new HashMap(); - - /** - * The map that matches word pairs to their weights. - */ - private final HashMap<String, Double> bimap = new HashMap(); - - /** - * The results of a query, maybe return from query(). - */ - private ResultSet data; - - /** - * The persistent connection to the database. - */ - private final Connection connection; - - /** - * @param connection An open connection to the database. - */ - public Analyzor(Connection connection) { - this.connection = connection; - } - - /** - * Read the unigram and bigram lexica. - * - * @throws FileNotFoundException - */ - public void readLexicon() throws FileNotFoundException { - if (!unimap.isEmpty()) { - // data is already read. - return; - } - System.err.println("Trying to read lexicons..."); - // A unigram is in the format (WS = whitespace): - // word <WS> rating <WS> ??? <WS> ?? - // A bigram has an two WS-separated words instead of one. 
- try (Scanner uniScanner = new Scanner(new File("unigrams-pmilexicon.txt")); - Scanner biScanner = new Scanner(new File("bigrams-pmilexicon.txt"));) { - //Fill the map of unigrams - int lineno = 1; - while (uniScanner.hasNext()) { - - String words = uniScanner.next(); - Double d = Double.valueOf(uniScanner.next()); - unimap.put(words.toLowerCase(), d); - if (uniScanner.hasNextLine()) { - uniScanner.nextLine(); - } - lineno++; - - } - - //fill the map of bigrams - while (biScanner.hasNext()) { - String words = biScanner.next() + " " + biScanner.next(); - bimap.put(words.toLowerCase(), Double.valueOf(biScanner.next())); - if (biScanner.hasNextLine()) { - biScanner.nextLine(); - } - } - } - System.err.println("Lexicons are read."); - } - - /** - * Executes a query that the analyzer can analyze. - * - * @param query The query string to execute. - * @throws SQLException When database connection isn't available. - */ - public void query(String query) throws SQLException { - PreparedStatement statement; - //make a connection to the database and execute the query - statement = connection.prepareStatement(query); - data = statement.executeQuery(); - } - - /** - * Run a sentiment analysis and fill the database with the output. - * - * @param query The sql text for the query. - * @throws SQLException - * @throws IOException - */ - public void sentimentAnalysis(String query) throws SQLException, IOException { - NamedPreparedStatement tweetBrandStmt, updateRating; - - //read the lexicons - readLexicon(); - - // if you ever need to re-apply rating, use something like: - // UPDATE mentionsbrand SET rating = NULL WHERE ... 
- if (query.isEmpty()) { - query = "SELECT t.tweetid, t.text, b.brand FROM tweet t " - + "JOIN mentionsbrand b USING (tweetid) " - + "WHERE b.rating IS NULL"; - } - tweetBrandStmt = new NamedPreparedStatement(connection, - query); - ResultSet tweetBrandResults = tweetBrandStmt.executeQuery(); - - updateRating = new NamedPreparedStatement(connection, - "UPDATE mentionsbrand SET rating = :rating " - + "WHERE tweetid = :tweetid AND brand = :brand"); - - Double value; - String text; - - //for all tuples - while (tweetBrandResults.next()) { - //get the text - text = tweetBrandResults.getString("text"); - text = splitPunctToWords(text); - // test is the tweet text you are going to analyze - String[] words = text.split("\\s+"); // text splitted into separate words - double positiverate = 0; // positive rating - - // Rate the text with unigrams - for (String word : words) { - value = unimap.get(word); - if (value != null) { - positiverate += unimap.get(word); - } - } - // Rate the text with bigrams - for (int i = 0; i < words.length - 1; i++) { - String pair = words[i] + " " + words[i + 1]; - value = bimap.get(pair); - if (value != null) { - positiverate += bimap.get(pair); - } - } - //insert the rating into the database - updateRating.setLong("tweetid", tweetBrandResults.getLong("tweetid")); - updateRating.setString("brand", tweetBrandResults.getString("brand")); - updateRating.setInt("rating", (int) (positiverate * 10)); - updateRating.executeUpdate(); - } - } - - /** - * Make a wordcloud of the results of some query. - * - * @param query The sql text for a query. 
- * @throws SQLException - * @throws FileNotFoundException - * @throws UnsupportedEncodingException - */ - public void makeWordCloud(String query) throws SQLException, FileNotFoundException, UnsupportedEncodingException { - - query(query); - //go to the start of the ResultSet data - if (data == null) { - System.err.println("data is empty, try querying first"); - return; - } - - String text; - String brand; - String[] words; - HashMap<String, HashMap<String, Integer>> wordcloud = new HashMap<>(); - - while (data.next()) { - //get brand - brand = data.getString("brand"); - //make hashmap for each brand - if (!wordcloud.containsKey(brand)) { - wordcloud.put(brand, new HashMap<String, Integer>()); - } - //get the text - text = data.getString("text"); - //remove punctuation, convert to lowercase and split on words - text = removePunct(text); - text = text.toLowerCase(); - words = text.split("\\s+"); - //for all words - for (String word : words) { - //if it is empty, a space or a stripe, skip it - if (word.equals("") || word.equals(" ") || word.equals("-")) { - continue; - } - //if the word is already in the map, increment the amount - if (wordcloud.get(brand).containsKey(word)) { - wordcloud.get(brand).put(word, wordcloud.get(brand).get(word) + 1); - } //if the word is not already in the map, make an entry with amount = 1 - else { - wordcloud.get(brand).put(word, 1); - } - } - } - //print the words and their frequency in a csv file - mapToCSV(wordcloud, "wordcloud.csv", "brand,word,count"); - } - - //generate csv for disco from the query - public void disco(String query) throws SQLException, FileNotFoundException, UnsupportedEncodingException { - //do the query - query(query); - PrintWriter writer = new PrintWriter("output.csv", "UTF-8"); - //print the first row - for (int i = 1; i < data.getMetaData().getColumnCount(); i++) { - writer.print(data.getMetaData().getColumnLabel(i) + ", "); - } - 
writer.println(data.getMetaData().getColumnLabel(data.getMetaData().getColumnCount())); - //print the values - while (data.next()) { - for (int i = 1; i < data.getMetaData().getColumnCount(); i++) { - if (data.getObject(i) == null) { - writer.print(", "); - } else { - writer.print(data.getObject(i).toString().replaceAll("[,\n]", " ") + ", "); - } - } - if (data.getObject(data.getMetaData().getColumnCount()) == null) { - writer.println("0"); - } else { - writer.println(data.getObject(data.getMetaData().getColumnCount()).toString().replace(",", " ")); - } - } - writer.close(); - } - - public void getBrands() throws SQLException { - PreparedStatement statement; - //make a connection to the database and execute the query - statement = connection.prepareStatement("delete from mentionsbrand"); - statement.executeUpdate(); - BrandChecker checker = new BrandChecker("brandonlyrules.txt"); - query("select * from tweet"); - NamedPreparedStatement m_insertBrand = new NamedPreparedStatement(connection, QueryUtils.insertBrand); - while (data.next()) { - List<String> brands = checker.getBrands(data.getString("text")); - if (brands.isEmpty()) { - QueryUtils.setInsertBrandParams(m_insertBrand, data.getLong("tweetid"), "no"); - m_insertBrand.executeUpdate(); - } else { - for (String brand : brands) { - QueryUtils.setInsertBrandParams(m_insertBrand, data.getLong("tweetid"), brand); - m_insertBrand.executeUpdate(); - } - } - } - } - - //gets the amount of users that tweet about a brand in a timezone - //makes a csv file timezone, brand, amount - public void timezone(String query) throws SQLException, FileNotFoundException, UnsupportedEncodingException { - query(query); - //hashmap timezone, brand, amount - HashMap<String, HashMap<String, Integer>> timeMap = new HashMap<>(); - String timezone; - String brand; - - while (data.next()) { - timezone = data.getString("timezone"); - brand = data.getString("brand"); - //if the timezone is already in the map - if 
(timeMap.containsKey(timezone)) { - //if the brand for that timezone is already in the map - if (timeMap.get(timezone).containsKey(brand)) { - //increment the amount - timeMap.get(timezone).put(brand, timeMap.get(timezone).get(brand) + 1); - } //if the brand for that timezone is not yet in the map - else { - //make a new entry for that brand with amount = 1 - timeMap.get(timezone).put(brand, 1); - } - } //if the timezone is not yet in the map - else { - //make a new hashmap for this map and fill it with the brand and the amount - timeMap.put(timezone, new HashMap<String, Integer>()); - timeMap.get(timezone).put(brand, 1); - } - } - //make the CSV out of the map - mapToCSV(timeMap, "timezone.csv", "timezone,brand,count"); - } - - //replaces punctuation so it will be splitted - //also removes urls - private String splitPunctToWords(String text) { - text = text.replaceAll("https?://\\S*", ""); - text = text.replaceAll("[!?):;\"']", " $0"); - text = text.replaceAll("[.,-](\\s|$)", " $0"); - text = text.replaceAll("\\s[(\"']", "$0 "); - return text; - } - - //removes punctuation - //also removes urls - private String removePunct(String text) { - text = text.replaceAll("https?://\\S*", " "); - text = text.replaceAll("@\\S*", " "); - text = text.replaceAll("[^a-zA-Z0-9#_-]", " "); - return text; - } - - //prints a hashmap into a csv for a html application - //Hashmap<key1, HashMap<key2, value>> becomes key1, key2, value - //only for String, String, Integer - void mapToCSV(HashMap<String, HashMap<String, Integer>> map, String fileName, String firstLine) - throws FileNotFoundException, UnsupportedEncodingException { - - PrintWriter writer = new PrintWriter(fileName, "UTF-8"); - - writer.println(firstLine); - - //loop over brands - for (Entry en : map.entrySet()) { - //loop over words - for (Entry e : map.get(en.getKey()).entrySet()) { - writer.println(en.getKey() + "," + e.getKey() + "," + e.getValue()); - } - } - - writer.close(); - System.out.println("csv file made, 
please put it next to html file and run this"); - } -} +package main;
+
+import analysis.BrandChecker;
+import database.NamedPreparedStatement;
+import database.QueryUtils;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.io.UnsupportedEncodingException;
+import java.sql.Connection;
+import java.sql.PreparedStatement;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.util.List;
+import java.util.HashMap;
+import java.util.Map.Entry;
+import java.util.Scanner;
+
+/**
+ * The sentiment analysis class that rates tweets based on a unigram and bigram
+ * set of weights.
+ */
+public class Analyzor {
+
+    /**
+     * Maps single lower-cased words (unigrams) to their weights.
+     * Filled by readLexicon().
+     */
+    private final HashMap<String, Double> unimap = new HashMap<>();
+
+    /**
+     * Maps lower-cased word pairs (bigrams, joined by a single space) to
+     * their weights. Filled by readLexicon().
+     */
+    private final HashMap<String, Double> bimap = new HashMap<>();
+
+    /**
+     * The result set of the most recent call to query().
+     */
+    private ResultSet data;
+
+    /**
+     * The persistent connection to the database.
+     */
+    private final Connection connection;
+
+    /**
+     * Creates an analyzer that runs all of its statements on the given
+     * connection.
+     *
+     * @param connection An open connection to the database.
+     */
+    public Analyzor(Connection connection) {
+        this.connection = connection;
+    }
+
+    /**
+     * Reads the unigram and bigram lexica into unimap and bimap.
+     *
+     * Returns immediately when the unigram map already holds entries, so a
+     * lexicon that was read before is not parsed again.
+     *
+     * @throws FileNotFoundException When "unigrams-pmilexicon.txt" or
+     * "bigrams-pmilexicon.txt" cannot be opened.
+     */
+    public void readLexicon() throws FileNotFoundException {
+        if (!unimap.isEmpty()) {
+            // data is already read.
+            return;
+        }
+        System.err.println("Trying to read lexicons...");
+        // A unigram line has the format (WS = whitespace):
+        //     word <WS> rating <WS> ...
+        // A bigram line has two WS-separated words instead of one.
+        // Only the word(s) and the rating are used; the remainder of each
+        // line is skipped.
+        try (Scanner uniScanner = new Scanner(new File("unigrams-pmilexicon.txt"));
+                Scanner biScanner = new Scanner(new File("bigrams-pmilexicon.txt"))) {
+            // fill the map of unigrams
+            while (uniScanner.hasNext()) {
+                String word = uniScanner.next();
+                Double rating = Double.valueOf(uniScanner.next());
+                unimap.put(word.toLowerCase(), rating);
+                if (uniScanner.hasNextLine()) {
+                    uniScanner.nextLine();
+                }
+            }
+
+            // fill the map of bigrams
+            while (biScanner.hasNext()) {
+                String pair = biScanner.next() + " " + biScanner.next();
+                bimap.put(pair.toLowerCase(), Double.valueOf(biScanner.next()));
+                if (biScanner.hasNextLine()) {
+                    biScanner.nextLine();
+                }
+            }
+        }
+        System.err.println("Lexicons are read.");
+    }
+
+    /**
+     * Runs the given SQL text on the database connection and keeps the
+     * result set in the data field, where the analysis methods read it.
+     *
+     * @param query The query string to execute.
+     * @throws SQLException When preparing or executing the query fails.
+     */
+    public void query(String query) throws SQLException {
+        // prepare the statement on the shared connection and run it right away
+        data = connection.prepareStatement(query).executeQuery();
+    }
+
+    /**
+     * Run a sentiment analysis and fill the database with the output.
+     *
+     * Every row returned by the query is rated by summing the lexicon
+     * weights of all unigrams and bigrams found in its text. The sum is
+     * multiplied by 10, cast to an int and stored as the rating of the
+     * (tweetid, brand) pair in mentionsbrand.
+     *
+     * @param query The sql text for the query; the result must contain the
+     * columns tweetid, text and brand. When empty, all tweet/brand pairs
+     * whose rating is still NULL are selected.
+     * @throws SQLException When a database operation fails.
+     * @throws IOException When a lexicon file cannot be opened.
+     */
+    public void sentimentAnalysis(String query) throws SQLException, IOException {
+        //read the lexicons
+        readLexicon();
+
+        // if you ever need to re-apply rating, use something like:
+        // UPDATE mentionsbrand SET rating = NULL WHERE ...
+        if (query.isEmpty()) {
+            query = "SELECT t.tweetid, t.text, b.brand FROM tweet t "
+                    + "JOIN mentionsbrand b USING (tweetid) "
+                    + "WHERE b.rating IS NULL";
+        }
+        NamedPreparedStatement tweetBrandStmt = new NamedPreparedStatement(connection,
+                query);
+        ResultSet tweetBrandResults = tweetBrandStmt.executeQuery();
+
+        NamedPreparedStatement updateRating = new NamedPreparedStatement(connection,
+                "UPDATE mentionsbrand SET rating = :rating "
+                + "WHERE tweetid = :tweetid AND brand = :brand");
+
+        //for all tuples
+        while (tweetBrandResults.next()) {
+            // text is the tweet text that is going to be analyzed: urls are
+            // removed and punctuation is separated from the words first.
+            // readLexicon() stores its keys in lower case, so the text has to
+            // be lower-cased as well or capitalised words would never match.
+            String text = splitPunctToWords(tweetBrandResults.getString("text")).toLowerCase();
+            String[] words = text.split("\\s+"); // text split into separate words
+            double positiverate = 0; // positive rating
+
+            // Rate the text with unigrams
+            for (String word : words) {
+                Double value = unimap.get(word);
+                if (value != null) {
+                    positiverate += value;
+                }
+            }
+            // Rate the text with bigrams
+            for (int i = 0; i < words.length - 1; i++) {
+                Double value = bimap.get(words[i] + " " + words[i + 1]);
+                if (value != null) {
+                    positiverate += value;
+                }
+            }
+            //insert the rating into the database
+            updateRating.setLong("tweetid", tweetBrandResults.getLong("tweetid"));
+            updateRating.setString("brand", tweetBrandResults.getString("brand"));
+            updateRating.setInt("rating", (int) (positiverate * 10));
+            updateRating.executeUpdate();
+        }
+    }
+
+    /**
+     * Counts, per brand, how often every word occurs in the texts returned
+     * by a query and writes the counts to "wordcloud.csv".
+     *
+     * @param query The sql text for a query; the result must contain the
+     * columns brand and text.
+     * @throws SQLException
+     * @throws FileNotFoundException
+     * @throws UnsupportedEncodingException
+     */
+    public void makeWordCloud(String query) throws SQLException, FileNotFoundException, UnsupportedEncodingException {
+        query(query);
+        // without a result set there is nothing to count
+        if (data == null) {
+            System.err.println("data is empty, try querying first");
+            return;
+        }
+
+        // brand -> (word -> number of occurrences)
+        HashMap<String, HashMap<String, Integer>> wordcloud = new HashMap<>();
+
+        while (data.next()) {
+            // look up the counters of this brand, creating them on first use
+            String brand = data.getString("brand");
+            HashMap<String, Integer> counts = wordcloud.get(brand);
+            if (counts == null) {
+                counts = new HashMap<String, Integer>();
+                wordcloud.put(brand, counts);
+            }
+
+            // strip punctuation, urls and mentions, then lower-case and split
+            String cleaned = removePunct(data.getString("text")).toLowerCase();
+            for (String word : cleaned.split("\\s+")) {
+                // empty strings, single spaces and lone dashes are no words
+                if (word.isEmpty() || " ".equals(word) || "-".equals(word)) {
+                    continue;
+                }
+                // first occurrence counts as 1, later ones increment
+                Integer seen = counts.get(word);
+                counts.put(word, seen == null ? 1 : seen + 1);
+            }
+        }
+        // write the words and their frequencies to a csv file
+        mapToCSV(wordcloud, "wordcloud.csv", "brand,word,count");
+    }
+
+    /**
+     * Generates the csv file "output.csv" for disco from the query.
+     *
+     * The first row holds the column labels of the result, every following
+     * row holds one result row; columns are separated by ", ". Commas and
+     * newlines inside a value are replaced by spaces so that a value can
+     * not break the row layout. A NULL value is written as an empty field,
+     * except in the last column where it is written as "0".
+     *
+     * @param query The sql text for the query to export.
+     * @throws SQLException When running the query or reading a row fails.
+     * @throws FileNotFoundException When "output.csv" cannot be created.
+     * @throws UnsupportedEncodingException When UTF-8 is not supported.
+     */
+    public void disco(String query) throws SQLException, FileNotFoundException, UnsupportedEncodingException {
+        //do the query
+        query(query);
+        // try-with-resources closes the file even when reading a row fails
+        try (PrintWriter writer = new PrintWriter("output.csv", "UTF-8")) {
+            final int columnCount = data.getMetaData().getColumnCount();
+            //print the first row
+            for (int i = 1; i < columnCount; i++) {
+                writer.print(data.getMetaData().getColumnLabel(i) + ", ");
+            }
+            writer.println(data.getMetaData().getColumnLabel(columnCount));
+            //print the values
+            while (data.next()) {
+                for (int i = 1; i < columnCount; i++) {
+                    Object cell = data.getObject(i);
+                    if (cell == null) {
+                        writer.print(", ");
+                    } else {
+                        writer.print(cell.toString().replaceAll("[,\n]", " ") + ", ");
+                    }
+                }
+                // the last column ends the row instead of adding a separator
+                Object last = data.getObject(columnCount);
+                if (last == null) {
+                    writer.println("0");
+                } else {
+                    writer.println(last.toString().replaceAll("[,\n]", " "));
+                }
+            }
+        }
+    }
+
+    /**
+     * Rebuilds the mentionsbrand table from the tweets in the database.
+     *
+     * All rows of mentionsbrand are deleted first. Then every tweet is
+     * checked against the rules in "brandonlyrules.txt" and one row is
+     * inserted for every brand found in its text; a tweet without any brand
+     * gets a single row with the brand "no".
+     *
+     * @throws SQLException When a database operation fails.
+     */
+    public void getBrands() throws SQLException {
+        // empty the table; the statement is closed as soon as it has run
+        try (PreparedStatement statement = connection.prepareStatement("delete from mentionsbrand")) {
+            statement.executeUpdate();
+        }
+        BrandChecker checker = new BrandChecker("brandonlyrules.txt");
+        query("select * from tweet");
+        NamedPreparedStatement insertBrand = new NamedPreparedStatement(connection, QueryUtils.insertBrand);
+        while (data.next()) {
+            long tweetid = data.getLong("tweetid");
+            List<String> brands = checker.getBrands(data.getString("text"));
+            if (brands.isEmpty()) {
+                // "no" marks a tweet that mentions none of the brands
+                QueryUtils.setInsertBrandParams(insertBrand, tweetid, "no");
+                insertBrand.executeUpdate();
+            } else {
+                for (String brand : brands) {
+                    QueryUtils.setInsertBrandParams(insertBrand, tweetid, brand);
+                    insertBrand.executeUpdate();
+                }
+            }
+        }
+    }
+
+    /**
+     * Counts the result rows of a query per timezone and brand and writes
+     * the counts to "timezone.csv" as timezone, brand, count.
+     *
+     * An extra "legenda" entry with a fixed count for each of six brands is
+     * added to the map before it is written.
+     *
+     * @param query The sql text for a query; the result must contain the
+     * columns timezone and brand.
+     * @throws SQLException
+     * @throws FileNotFoundException
+     * @throws UnsupportedEncodingException
+     */
+    public void timezone(String query) throws SQLException, FileNotFoundException, UnsupportedEncodingException {
+        query(query);
+        // timezone -> (brand -> amount)
+        HashMap<String, HashMap<String, Integer>> timeMap = new HashMap<>();
+
+        while (data.next()) {
+            String timezone = data.getString("timezone");
+            String brand = data.getString("brand");
+            // fetch the counters of this timezone, creating them on first use
+            HashMap<String, Integer> brandCounts = timeMap.get(timezone);
+            if (brandCounts == null) {
+                brandCounts = new HashMap<String, Integer>();
+                timeMap.put(timezone, brandCounts);
+            }
+            // a brand that was not seen yet starts at 1, otherwise increment
+            Integer seen = brandCounts.get(brand);
+            brandCounts.put(brand, seen == null ? 1 : seen + 1);
+        }
+
+        // add a "legenda" entry from which the legend of the timezone map is made
+        final int legendaSize = 6000;
+        final int share = legendaSize / 6;
+        HashMap<String, Integer> legenda = new HashMap<String, Integer>();
+        for (String legendaBrand : new String[]{"sony", "lg", "huawei", "htc", "samsung", "apple"}) {
+            legenda.put(legendaBrand, share);
+        }
+        timeMap.put("legenda", legenda);
+
+        // make the CSV out of the map
+        mapToCSV(timeMap, "timezone.csv", "timezone,brand,count");
+    }
+
+    /**
+     * Removes urls and puts spaces around punctuation, so that splitting the
+     * text on whitespace yields the punctuation marks as separate words.
+     *
+     * @param text The text to prepare.
+     * @return The text without urls and with separated punctuation.
+     */
+    private String splitPunctToWords(String text) {
+        return text
+                // drop urls
+                .replaceAll("https?://\\S*", "")
+                // a space in front of ! ? ) : ; " and '
+                .replaceAll("[!?):;\"']", " $0")
+                // a space in front of . , and - when whitespace or the end follows
+                .replaceAll("[.,-](\\s|$)", " $0")
+                // a space behind ( " and ' when whitespace precedes them
+                .replaceAll("\\s[(\"']", "$0 ");
+    }
+
+    /**
+     * Replaces urls, @-mentions and every character other than letters,
+     * digits, '#', '_' and '-' by spaces.
+     *
+     * @param text The text to clean.
+     * @return The cleaned text.
+     */
+    private String removePunct(String text) {
+        return text
+                // urls
+                .replaceAll("https?://\\S*", " ")
+                // @-mentions
+                .replaceAll("@\\S*", " ")
+                // everything that is not a letter, digit, #, _ or -
+                .replaceAll("[^a-zA-Z0-9#_-]", " ");
+    }
+
+    /**
+     * Prints a nested hashmap into a csv file for a html application.
+     *
+     * HashMap&lt;key1, HashMap&lt;key2, value&gt;&gt; becomes one line
+     * "key1,key2,value" per inner entry, below the given header line.
+     *
+     * @param map The nested map to write; only for String, String, Integer.
+     * @param fileName The name of the csv file to create.
+     * @param firstLine The header line that is written first.
+     * @throws FileNotFoundException When the file cannot be created.
+     * @throws UnsupportedEncodingException When UTF-8 is not supported.
+     */
+    void mapToCSV(HashMap<String, HashMap<String, Integer>> map, String fileName, String firstLine)
+            throws FileNotFoundException, UnsupportedEncodingException {
+
+        // try-with-resources closes the file on every path
+        try (PrintWriter writer = new PrintWriter(fileName, "UTF-8")) {
+            writer.println(firstLine);
+
+            //loop over the outer keys
+            for (Entry<String, HashMap<String, Integer>> outer : map.entrySet()) {
+                //loop over the inner keys and their values
+                for (Entry<String, Integer> inner : outer.getValue().entrySet()) {
+                    writer.println(outer.getKey() + "," + inner.getKey() + "," + inner.getValue());
+                }
+            }
+        }
+        System.out.println("csv file made, please put it next to html file and run this");
+    }
+}
|