summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authordaanpeters <d.p.c.h.peters@student.tue.nl>2014-05-28 10:43:37 +0200
committerdaanpeters <d.p.c.h.peters@student.tue.nl>2014-05-28 10:43:37 +0200
commit62f2a42f5d5be918561c9996d7fa77b0ef2a9aa9 (patch)
tree4291993828510f9070a68aaa3810d7c4462d5101
parent01cc77792f3ee3fef1d628e98a9a7be7e96b056f (diff)
parent9969b6a6cbae322680cfcbc27df3d37b0954f00a (diff)
downloadGoldfarmer-62f2a42f5d5be918561c9996d7fa77b0ef2a9aa9.tar.gz
Merge origin/master
Conflicts: src/main/Analyzor.java src/main/FarmShell.java
-rw-r--r--nbproject/configs/such_database.properties2
-rw-r--r--nbproject/project.properties162
-rw-r--r--src/analysis/BrandChecker.java32
-rw-r--r--src/database/NamedPreparedStatement.java11
-rw-r--r--src/database/QueryUtils.java8
-rw-r--r--src/main/Analyzor.java736
-rw-r--r--src/main/FarmShell.java429
-rw-r--r--test/analysis/BrandCheckerTest.java4
8 files changed, 693 insertions, 691 deletions
diff --git a/nbproject/configs/such_database.properties b/nbproject/configs/such_database.properties
index bba41ec..9dffee6 100644
--- a/nbproject/configs/such_database.properties
+++ b/nbproject/configs/such_database.properties
@@ -1 +1 @@
-$label=such database
+$label=such database
diff --git a/nbproject/project.properties b/nbproject/project.properties
index b262ab6..ab8ae05 100644
--- a/nbproject/project.properties
+++ b/nbproject/project.properties
@@ -1,81 +1,81 @@
-annotation.processing.enabled=true
-annotation.processing.enabled.in.editor=false
-annotation.processing.processors.list=
-annotation.processing.run.all.processors=true
-annotation.processing.source.output=${build.generated.sources.dir}/ap-source-output
-application.title=Goldfarmer
-application.vendor=maurice
-build.classes.dir=${build.dir}/classes
-build.classes.excludes=**/*.java,**/*.form
-# This directory is removed when the project is cleaned:
-build.dir=build
-build.generated.dir=${build.dir}/generated
-build.generated.sources.dir=${build.dir}/generated-sources
-# Only compile against the classpath explicitly listed here:
-build.sysclasspath=ignore
-build.test.classes.dir=${build.dir}/test/classes
-build.test.results.dir=${build.dir}/test/results
-# Uncomment to specify the preferred debugger connection transport:
-#debug.transport=dt_socket
-debug.classpath=\
- ${run.classpath}
-debug.test.classpath=\
- ${run.test.classpath}
-# Files in build.classes.dir which should be excluded from distribution jar
-dist.archive.excludes=
-# This directory is removed when the project is cleaned:
-dist.dir=dist
-dist.jar=${dist.dir}/Goldfarmer.jar
-dist.javadoc.dir=${dist.dir}/javadoc
-endorsed.classpath=
-excludes=
-file.reference.joda-time-2.3.jar=lib/joda-time-2.3.jar
-file.reference.postgresql-9.3-1101.jdbc41.jar=lib/postgresql-9.3-1101.jdbc41.jar
-includes=**
-jar.compress=false
-javac.classpath=\
- ${file.reference.joda-time-2.3.jar}:\
- ${file.reference.postgresql-9.3-1101.jdbc41.jar}
-# Space-separated list of extra javac options
-javac.compilerargs=
-javac.deprecation=false
-javac.processorpath=\
- ${javac.classpath}
-javac.source=1.7
-javac.target=1.7
-javac.test.classpath=\
- ${javac.classpath}:\
- ${build.classes.dir}:\
- ${libs.junit_4.classpath}
-javac.test.processorpath=\
- ${javac.test.classpath}
-javadoc.additionalparam=
-javadoc.author=false
-javadoc.encoding=${source.encoding}
-javadoc.noindex=false
-javadoc.nonavbar=false
-javadoc.notree=false
-javadoc.private=false
-javadoc.splitindex=true
-javadoc.use=true
-javadoc.version=false
-javadoc.windowtitle=
-main.class=main.Main
-manifest.file=manifest.mf
-meta.inf.dir=${src.dir}/META-INF
-mkdist.disabled=false
-platform.active=default_platform
-project.licensePath=./nbproject/licenseheader.txt
-run.classpath=\
- ${javac.classpath}:\
- ${build.classes.dir}
-# Space-separated list of JVM arguments used when running the project.
-# You may also define separate properties like run-sys-prop.name=value instead of -Dname=value.
-# To set system properties for unit tests define test-sys-prop.name=value:
-run.jvmargs=
-run.test.classpath=\
- ${javac.test.classpath}:\
- ${build.test.classes.dir}
-source.encoding=UTF-8
-src.dir=src
-test.src.dir=test
+annotation.processing.enabled=true
+annotation.processing.enabled.in.editor=false
+annotation.processing.processors.list=
+annotation.processing.run.all.processors=true
+annotation.processing.source.output=${build.generated.sources.dir}/ap-source-output
+application.title=Goldfarmer
+application.vendor=maurice
+build.classes.dir=${build.dir}/classes
+build.classes.excludes=**/*.java,**/*.form
+# This directory is removed when the project is cleaned:
+build.dir=build
+build.generated.dir=${build.dir}/generated
+build.generated.sources.dir=${build.dir}/generated-sources
+# Only compile against the classpath explicitly listed here:
+build.sysclasspath=ignore
+build.test.classes.dir=${build.dir}/test/classes
+build.test.results.dir=${build.dir}/test/results
+# Uncomment to specify the preferred debugger connection transport:
+#debug.transport=dt_socket
+debug.classpath=\
+ ${run.classpath}
+debug.test.classpath=\
+ ${run.test.classpath}
+# Files in build.classes.dir which should be excluded from distribution jar
+dist.archive.excludes=
+# This directory is removed when the project is cleaned:
+dist.dir=dist
+dist.jar=${dist.dir}/Goldfarmer.jar
+dist.javadoc.dir=${dist.dir}/javadoc
+endorsed.classpath=
+excludes=
+file.reference.joda-time-2.3.jar=lib/joda-time-2.3.jar
+file.reference.postgresql-9.3-1101.jdbc41.jar=lib/postgresql-9.3-1101.jdbc41.jar
+includes=**
+jar.compress=false
+javac.classpath=\
+ ${file.reference.joda-time-2.3.jar}:\
+ ${file.reference.postgresql-9.3-1101.jdbc41.jar}
+# Space-separated list of extra javac options
+javac.compilerargs=
+javac.deprecation=false
+javac.processorpath=\
+ ${javac.classpath}
+javac.source=1.7
+javac.target=1.7
+javac.test.classpath=\
+ ${javac.classpath}:\
+ ${build.classes.dir}:\
+ ${libs.junit_4.classpath}
+javac.test.processorpath=\
+ ${javac.test.classpath}
+javadoc.additionalparam=
+javadoc.author=false
+javadoc.encoding=${source.encoding}
+javadoc.noindex=false
+javadoc.nonavbar=false
+javadoc.notree=false
+javadoc.private=false
+javadoc.splitindex=true
+javadoc.use=true
+javadoc.version=false
+javadoc.windowtitle=
+main.class=main.Main
+manifest.file=manifest.mf
+meta.inf.dir=${src.dir}/META-INF
+mkdist.disabled=false
+platform.active=default_platform
+project.licensePath=./nbproject/licenseheader.txt
+run.classpath=\
+ ${javac.classpath}:\
+ ${build.classes.dir}
+# Space-separated list of JVM arguments used when running the project.
+# You may also define separate properties like run-sys-prop.name=value instead of -Dname=value.
+# To set system properties for unit tests define test-sys-prop.name=value:
+run.jvmargs=
+run.test.classpath=\
+ ${javac.test.classpath}:\
+ ${build.test.classes.dir}
+source.encoding=UTF-8
+src.dir=src
+test.src.dir=test
diff --git a/src/analysis/BrandChecker.java b/src/analysis/BrandChecker.java
index 10e22b4..ee9c7b4 100644
--- a/src/analysis/BrandChecker.java
+++ b/src/analysis/BrandChecker.java
@@ -1,17 +1,3 @@
-/*
- * DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE Version 2, December 2004
- *
- * Copyright (C) 2004 Sam Hocevar
- *
- * Everyone is permitted to copy and distribute verbatim or modified copies
- * of this license document, and changing it is allowed as long as the name is
- * changed.
- *
- * DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE TERMS AND CONDITIONS FOR COPYING,
- * DISTRIBUTION AND MODIFICATION
- *
- * 0. You just DO WHAT THE FUCK YOU WANT TO.
- */
package analysis;
import java.io.FileInputStream;
@@ -93,7 +79,7 @@ public class BrandChecker {
if (line.isEmpty()) {
return;
}
-
+
if (!line.contains("-")) {
System.err.println("illformatted rule: " + line + ", missing -");
} else {
@@ -110,13 +96,13 @@ public class BrandChecker {
// Read the <name> line.
String name = parts[0].trim();
-
+
// Read the positive words.
- String positive = parts[1].replaceAll(" ","");
+ String positive = parts[1].replaceAll(" ", "");
String[] sequence = positive.split(",");
-
+
if (parts.length == 3) {
- String negative = parts[2].replaceAll(" ", "");
+ String negative = parts[2].replaceAll(" ", "");
String[] blacklist = negative.split(",");
ruleset.add(new BrandRule(name, sequence, blacklist));
} else {
@@ -139,7 +125,7 @@ public class BrandChecker {
* The words that should be in the text.
*/
private final HashMap<String, Boolean> names;
-
+
/**
* A blacklist of words that are not interesting.
*/
@@ -164,7 +150,7 @@ public class BrandChecker {
} else {
this.blacklist = null;
}
-
+
for (String name : names) {
this.names.put(name, Boolean.FALSE);
}
@@ -177,7 +163,7 @@ public class BrandChecker {
*/
public boolean analyze(String[] words) {
reset();
-
+
int found = 0;
for (String word : words) {
@@ -201,7 +187,7 @@ public class BrandChecker {
public String getBrand() {
return brand;
}
-
+
private void reset() {
for (String name : this.names.keySet()) {
this.names.put(name, Boolean.FALSE);
diff --git a/src/database/NamedPreparedStatement.java b/src/database/NamedPreparedStatement.java
index ebb775b..9305d32 100644
--- a/src/database/NamedPreparedStatement.java
+++ b/src/database/NamedPreparedStatement.java
@@ -2,6 +2,7 @@ package database;
import java.sql.Connection;
import java.sql.PreparedStatement;
+import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Timestamp;
import java.sql.Types;
@@ -104,4 +105,14 @@ public class NamedPreparedStatement {
throw ex;
}
}
+
+ public ResultSet executeQuery() throws SQLException {
+ try {
+ return getStmt().executeQuery();
+ } catch (SQLException ex) {
+ System.err.println("Query error: " + ex.getMessage());
+ System.err.println(stmt);
+ throw ex;
+ }
+ }
}
diff --git a/src/database/QueryUtils.java b/src/database/QueryUtils.java
index 2cc6fd6..b95903f 100644
--- a/src/database/QueryUtils.java
+++ b/src/database/QueryUtils.java
@@ -1,7 +1,6 @@
package database;
import java.sql.SQLException;
-import java.util.Locale;
/**
* Utilities to create queries.
@@ -9,8 +8,9 @@ import java.util.Locale;
* @author Maurice Laveaux
*/
public class QueryUtils {
- public final static String insertRating
- = buildQuery("mentionsbrand", new String[]{"tweetid","brand"},"tweetid","brand", "rating");
+
+ public final static String insertRating
+ = buildQuery("mentionsbrand", new String[]{"tweetid", "brand"}, "tweetid", "brand", "rating");
public final static String insertProfile
= buildQuery("twitteruser", new String[]{"userid"},
"userid", "displayname", "timezone", "tweetcount", "followercount",
@@ -96,7 +96,7 @@ public class QueryUtils {
statement.setLong("tweetid", tweetid);
statement.setInt("rating", rating);
statement.setString("brand", brand);
-
+
}
public static void setInsertBrandParams(NamedPreparedStatement brandStmt,
diff --git a/src/main/Analyzor.java b/src/main/Analyzor.java
index 58f7dfc..b896f62 100644
--- a/src/main/Analyzor.java
+++ b/src/main/Analyzor.java
@@ -1,368 +1,368 @@
-package main;
-
-import analysis.BrandChecker;
-import database.NamedPreparedStatement;
-import database.QueryUtils;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.PrintWriter;
-import java.io.UnsupportedEncodingException;
-import java.sql.Connection;
-import java.sql.PreparedStatement;
-import java.sql.ResultSet;
-import java.sql.SQLException;
-import java.util.List;
-import java.util.HashMap;
-import java.util.Locale;
-import java.util.Map.Entry;
-import java.util.Scanner;
-
-/**
- * The sentiment analysis class that rates tweets based on a unigram and bigram
- * set of weights.
- */
-public class Analyzor {
-
- /**
- * The map that matches single words to their weights.
- */
- private final HashMap<String, Double> unimap = new HashMap();
-
- /**
- * The map that matches word pairs to their weights.
- */
- private final HashMap<String, Double> bimap = new HashMap();
-
- /**
- * The results of a query, maybe return from query().
- */
- private ResultSet data;
-
- /**
- * The persistent connection to the database.
- */
- private final Connection connection;
-
- /**
- * @param connection An open connection to the database.
- */
- public Analyzor(Connection connection) {
- this.connection = connection;
- }
-
- /**
- * Read the unigram and bigram lexica.
- *
- * @throws FileNotFoundException
- */
- public void readLexicon() throws FileNotFoundException {
- if (!unimap.isEmpty()) {
- // data is already read.
- return;
- }
- System.err.println("Trying to read lexicons...");
- // A unigram is in the format (WS = whitespace):
- // word <WS> rating <WS> ??? <WS> ??
- // A bigram has an two WS-separated words instead of one.
- try (Scanner uniScanner = new Scanner(new File("unigrams-pmilexicon.txt"));
- Scanner biScanner = new Scanner(new File("bigrams-pmilexicon.txt"));) {
- //Fill the map of unigrams
- int lineno = 1;
- while (uniScanner.hasNext()) {
-
- String words = uniScanner.next();
- Double d = Double.valueOf(uniScanner.next());
- unimap.put(words.toLowerCase(), d);
- if (uniScanner.hasNextLine()) {
- uniScanner.nextLine();
- }
- lineno++;
-
- }
-
- //fill the map of bigrams
- while (biScanner.hasNext()) {
- String words = biScanner.next() + " " + biScanner.next();
- bimap.put(words.toLowerCase(), Double.valueOf(biScanner.next()));
- if (biScanner.hasNextLine()) {
- biScanner.nextLine();
- }
- }
- }
- System.err.println("Lexicons are read.");
- }
-
- /**
- * Executes a query that the analyzer can analyze.
- *
- * @param query The query string to execute.
- * @throws SQLException When database connection isn't available.
- */
- public void query(String query) throws SQLException {
- PreparedStatement statement;
- //make a connection to the database and execute the query
- statement = connection.prepareStatement(query);
- data = statement.executeQuery();
- }
-
- /**
- * Run a sentiment analysis and fill the database with the output.
- *
- * @param query The sql text for the query.
- * @throws SQLException
- * @throws IOException
- */
- public void sentimentAnalysis(String query) throws SQLException, IOException {
- query(query);
-
- //read the lexicons
- readLexicon();
-
- //go to the start of te dataset
- if (data == null) {
- System.err.println("data is empty, try querying first");
- return;
- }
-
- Double value;
- String text;
-
- //for all tuples
- while (data.next()) {
- //get the text
- text = data.getString("text");
- text = splitPunctToWords(text);
- // test is the tweet text you are going to analyze
- String[] words = text.split("\\s+"); // text splitted into separate words
- double positiverate = 0; // positive rating
-
- // Rate the text with unigrams
- for (String word : words) {
- value = unimap.get(word);
- if (value != null) {
- positiverate += unimap.get(word);
- }
- }
- // Rate the text with bigrams
- for (int i = 0; i < words.length - 1; i++) {
- String pair = words[i] + " " + words[i + 1];
- value = bimap.get(pair);
- if (value != null) {
- positiverate += bimap.get(pair);
- }
- }
- //insert the rating into the database
- NamedPreparedStatement m_insertRating;
- m_insertRating = new NamedPreparedStatement(connection, QueryUtils.insertRating);
- QueryUtils.setInsertParams(m_insertRating, data.getLong("tweetid"), data.getString("brand"), (int) (positiverate * 10));
- m_insertRating.executeUpdate();
- //don't print the rate
- //System.out.println(text + ": " + (int) (positiverate * 10));
- }
- }
-
- /**
- * Make a wordcloud of the results of some query.
- *
- * @param query The sql text for a query.
- * @throws SQLException
- * @throws FileNotFoundException
- * @throws UnsupportedEncodingException
- */
- public void makeWordCloud(String query) throws SQLException, FileNotFoundException, UnsupportedEncodingException {
-
- query(query);
- //go to the start of the ResultSet data
- if (data == null) {
- System.err.println("data is empty, try querying first");
- return;
- }
-
- String text;
- String brand;
- String[] words;
- HashMap<String,HashMap<String, Integer>> wordcloud = new HashMap<>();
-
- while (data.next()) {
- //get brand
- brand=data.getString("brand");
- //make hashmap for each brand
- if(!wordcloud.containsKey(brand)){
- wordcloud.put(brand, new HashMap<String,Integer>());
- }
- //get the text
- text = data.getString("text");
- //remove punctuation, convert to lowercase and split on words
- text = removePunct(text);
- text = text.toLowerCase();
- words = text.split("\\s+");
- //for all words
- for (String word : words) {
- //if it is empty, a space or a stripe, skip it
- if(word.equals("") || word.equals(" ") || word.equals("-")){
- continue;
- }
- //if the word is already in the map, increment the amount
- if(wordcloud.get(brand).containsKey(word)){
- wordcloud.get(brand).put(word, wordcloud.get(brand).get(word) + 1);
- }
- //if the word is not already in the map, make an entry with amount = 1
- else{
- wordcloud.get(brand).put(word, 1);
- }
- }
- }
- //print the words and their frequency in a csv file
- mapToCSV(wordcloud, "wordcloud.csv", "brand,word,count");
- }
-
- //generate csv for disco from the query
- public void disco(String query) throws SQLException, FileNotFoundException, UnsupportedEncodingException {
- //do the query
- query(query);
- PrintWriter writer = new PrintWriter("output.csv", "UTF-8");
- //print the first row
- for (int i = 1; i < data.getMetaData().getColumnCount(); i++) {
- writer.print(data.getMetaData().getColumnLabel(i) + ", ");
- }
- writer.println(data.getMetaData().getColumnLabel(data.getMetaData().getColumnCount()));
- //print the values
- while (data.next()) {
- for (int i = 1; i < data.getMetaData().getColumnCount(); i++) {
- if (data.getObject(i) == null) {
- writer.print(", ");
- } else {
- writer.print(data.getObject(i).toString().replaceAll("[,\n]", " ") + ", ");
- }
- }
- if (data.getObject(data.getMetaData().getColumnCount()) == null) {
- writer.println("0");
- } else {
- writer.println(data.getObject(data.getMetaData().getColumnCount()).toString().replace(",", " "));
- }
- }
- writer.close();
- }
-
- public void getBrands() throws SQLException {
- PreparedStatement statement;
- //make a connection to the database and execute the query
- statement = connection.prepareStatement("delete from mentionsbrand");
- statement.executeUpdate();
- BrandChecker checker = new BrandChecker("brandonlyrules.txt");
- query("select * from tweet");
- NamedPreparedStatement m_insertBrand = new NamedPreparedStatement(connection, QueryUtils.insertBrand);
- while (data.next()) {
- List<String> brands = checker.getBrands(data.getString("text"));
- if (brands.isEmpty()) {
- QueryUtils.setInsertBrandParams(m_insertBrand, data.getLong("tweetid"), "no");
- m_insertBrand.executeUpdate();
- } else {
- for (String brand : brands) {
- QueryUtils.setInsertBrandParams(m_insertBrand, data.getLong("tweetid"), brand);
- m_insertBrand.executeUpdate();
- }
- }
- }
- }
-
- //gets the amount of users that tweet about a brand in a timezone
- //makes a csv file timezone, brand, amount
- public void timezone(String query) throws SQLException, FileNotFoundException, UnsupportedEncodingException{
- query(query);
-
- InputStream inFile = new FileInputStream("timezone.txt");
- Scanner readFile = new Scanner(inFile);
- HashMap<String,String> toTimezone = new HashMap<>();
- while (readFile.hasNextLine()) {
- String line = readFile.nextLine();
- if(line.split(",").length>1){
- toTimezone.put(line.split(",")[0], line.split(",")[1]);
- }
- }
-
-
-
- //hashmap timezone, brand, amount
- HashMap<String, HashMap<String, Integer>> timeMap = new HashMap<>();
- String timezone;
- String brand;
-
- while(data.next()){
- timezone = data.getString("timezone");
- if (toTimezone.containsKey(timezone)){
- timezone=toTimezone.get(timezone);
- } else {
- timezone="other";
- }
- brand = data.getString("brand");
- //if the timezone is already in the map
- if(timeMap.containsKey(timezone)){
- //if the brand for that timezone is already in the map
- if(timeMap.get(timezone).containsKey(brand)){
- //increment the amount
- timeMap.get(timezone).put(brand, timeMap.get(timezone).get(brand) + 1);
- }
- //if the brand for that timezone is not yet in the map
- else{
- //make a new entry for that brand with amount = 1
- timeMap.get(timezone).put(brand, 1);
- }
- }
- //if the timezone is not yet in the map
- else{
- //make a new hashmap for this map and fill it with the brand and the amount
- timeMap.put(timezone, new HashMap<String, Integer>());
- timeMap.get(timezone).put(brand, 1);
- }
- }
-
-
- //make the CSV out of the map
- mapToCSV(timeMap, "timezone.csv", "timezone,brand,count");
- }
-
- //replaces punctuation so it will be splitted
- //also removes urls
- private String splitPunctToWords(String text) {
- text = text.replaceAll("https?://\\S*", "");
- text = text.replaceAll("[!?):;\"']", " $0");
- text = text.replaceAll("[.,-](\\s|$)", " $0");
- text = text.replaceAll("\\s[(\"']", "$0 ");
- return text;
- }
-
- //removes punctuation
- //also removes urls
- private String removePunct(String text) {
- text = text.replaceAll("https?://\\S*", " ");
- text = text.replaceAll("@\\S*", " ");
- text = text.replaceAll("[^a-zA-Z0-9#_-]", " ");
- return text;
- }
-
- //prints a hashmap into a csv for a html application
- //Hashmap<key1, HashMap<key2, value>> becomes key1, key2, value
- //only for String, String, Integer
- void mapToCSV(HashMap<String, HashMap<String, Integer>> map, String fileName, String firstLine)
- throws FileNotFoundException, UnsupportedEncodingException{
-
- PrintWriter writer = new PrintWriter(fileName, "UTF-8");
-
- writer.println(firstLine);
-
- //loop over brands
- for(Entry en : map.entrySet()){
- //loop over words
- for(Entry e : map.get(en.getKey()).entrySet()){
- writer.println(en.getKey() + "," + e.getKey() + "," + e.getValue());
- }
- }
-
- writer.close();
- System.out.println("csv file made, please put it next to html file and run this");
- }
-}
+package main;
+
+import analysis.BrandChecker;
+import database.NamedPreparedStatement;
+import database.QueryUtils;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.PrintWriter;
+import java.io.UnsupportedEncodingException;
+import java.sql.Connection;
+import java.sql.PreparedStatement;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.util.List;
+import java.util.HashMap;
+import java.util.Locale;
+import java.util.Map.Entry;
+import java.util.Scanner;
+
+/**
+ * The sentiment analysis class that rates tweets based on a unigram and bigram
+ * set of weights.
+ */
+public class Analyzor {
+
+ /**
+ * The map that matches single words to their weights.
+ */
+ private final HashMap<String, Double> unimap = new HashMap();
+
+ /**
+ * The map that matches word pairs to their weights.
+ */
+ private final HashMap<String, Double> bimap = new HashMap();
+
+ /**
+ * The results of a query, maybe return from query().
+ */
+ private ResultSet data;
+
+ /**
+ * The persistent connection to the database.
+ */
+ private final Connection connection;
+
+ /**
+ * @param connection An open connection to the database.
+ */
+ public Analyzor(Connection connection) {
+ this.connection = connection;
+ }
+
+ /**
+ * Read the unigram and bigram lexica.
+ *
+ * @throws FileNotFoundException
+ */
+ public void readLexicon() throws FileNotFoundException {
+ if (!unimap.isEmpty()) {
+ // data is already read.
+ return;
+ }
+ System.err.println("Trying to read lexicons...");
+ // A unigram is in the format (WS = whitespace):
+ // word <WS> rating <WS> ??? <WS> ??
+ // A bigram has an two WS-separated words instead of one.
+ try (Scanner uniScanner = new Scanner(new File("unigrams-pmilexicon.txt"));
+ Scanner biScanner = new Scanner(new File("bigrams-pmilexicon.txt"));) {
+ //Fill the map of unigrams
+ int lineno = 1;
+ while (uniScanner.hasNext()) {
+
+ String words = uniScanner.next();
+ Double d = Double.valueOf(uniScanner.next());
+ unimap.put(words.toLowerCase(), d);
+ if (uniScanner.hasNextLine()) {
+ uniScanner.nextLine();
+ }
+ lineno++;
+
+ }
+
+ //fill the map of bigrams
+ while (biScanner.hasNext()) {
+ String words = biScanner.next() + " " + biScanner.next();
+ bimap.put(words.toLowerCase(), Double.valueOf(biScanner.next()));
+ if (biScanner.hasNextLine()) {
+ biScanner.nextLine();
+ }
+ }
+ }
+ System.err.println("Lexicons are read.");
+ }
+
+ /**
+ * Executes a query that the analyzer can analyze.
+ *
+ * @param query The query string to execute.
+ * @throws SQLException When database connection isn't available.
+ */
+ public void query(String query) throws SQLException {
+ PreparedStatement statement;
+ //make a connection to the database and execute the query
+ statement = connection.prepareStatement(query);
+ data = statement.executeQuery();
+ }
+
+ /**
+ * Run a sentiment analysis and fill the database with the output.
+ *
+ * @param query The sql text for the query.
+ * @throws SQLException
+ * @throws IOException
+ */
+ public void sentimentAnalysis(String query) throws SQLException, IOException {
+ query(query);
+
+ //read the lexicons
+ readLexicon();
+
+ //go to the start of te dataset
+ if (data == null) {
+ System.err.println("data is empty, try querying first");
+ return;
+ }
+
+ Double value;
+ String text;
+
+ //for all tuples
+ while (data.next()) {
+ //get the text
+ text = data.getString("text");
+ text = splitPunctToWords(text);
+ // test is the tweet text you are going to analyze
+ String[] words = text.split("\\s+"); // text splitted into separate words
+ double positiverate = 0; // positive rating
+
+ // Rate the text with unigrams
+ for (String word : words) {
+ value = unimap.get(word);
+ if (value != null) {
+ positiverate += unimap.get(word);
+ }
+ }
+ // Rate the text with bigrams
+ for (int i = 0; i < words.length - 1; i++) {
+ String pair = words[i] + " " + words[i + 1];
+ value = bimap.get(pair);
+ if (value != null) {
+ positiverate += bimap.get(pair);
+ }
+ }
+ //insert the rating into the database
+ NamedPreparedStatement m_insertRating;
+ m_insertRating = new NamedPreparedStatement(connection, QueryUtils.insertRating);
+ QueryUtils.setInsertParams(m_insertRating, data.getLong("tweetid"), data.getString("brand"), (int) (positiverate * 10));
+ m_insertRating.executeUpdate();
+ //don't print the rate
+ //System.out.println(text + ": " + (int) (positiverate * 10));
+ }
+ }
+
+ /**
+ * Make a wordcloud of the results of some query.
+ *
+ * @param query The sql text for a query.
+ * @throws SQLException
+ * @throws FileNotFoundException
+ * @throws UnsupportedEncodingException
+ */
+ public void makeWordCloud(String query) throws SQLException, FileNotFoundException, UnsupportedEncodingException {
+
+ query(query);
+ //go to the start of the ResultSet data
+ if (data == null) {
+ System.err.println("data is empty, try querying first");
+ return;
+ }
+
+ String text;
+ String brand;
+ String[] words;
+ HashMap<String,HashMap<String, Integer>> wordcloud = new HashMap<>();
+
+ while (data.next()) {
+ //get brand
+ brand=data.getString("brand");
+ //make hashmap for each brand
+ if(!wordcloud.containsKey(brand)){
+ wordcloud.put(brand, new HashMap<String,Integer>());
+ }
+ //get the text
+ text = data.getString("text");
+ //remove punctuation, convert to lowercase and split on words
+ text = removePunct(text);
+ text = text.toLowerCase();
+ words = text.split("\\s+");
+ //for all words
+ for (String word : words) {
+ //if it is empty, a space or a stripe, skip it
+ if(word.equals("") || word.equals(" ") || word.equals("-")){
+ continue;
+ }
+ //if the word is already in the map, increment the amount
+ if(wordcloud.get(brand).containsKey(word)){
+ wordcloud.get(brand).put(word, wordcloud.get(brand).get(word) + 1);
+ }
+ //if the word is not already in the map, make an entry with amount = 1
+ else{
+ wordcloud.get(brand).put(word, 1);
+ }
+ }
+ }
+ //print the words and their frequency in a csv file
+ mapToCSV(wordcloud, "wordcloud.csv", "brand,word,count");
+ }
+
+ //generate csv for disco from the query
+ public void disco(String query) throws SQLException, FileNotFoundException, UnsupportedEncodingException {
+ //do the query
+ query(query);
+ PrintWriter writer = new PrintWriter("output.csv", "UTF-8");
+ //print the first row
+ for (int i = 1; i < data.getMetaData().getColumnCount(); i++) {
+ writer.print(data.getMetaData().getColumnLabel(i) + ", ");
+ }
+ writer.println(data.getMetaData().getColumnLabel(data.getMetaData().getColumnCount()));
+ //print the values
+ while (data.next()) {
+ for (int i = 1; i < data.getMetaData().getColumnCount(); i++) {
+ if (data.getObject(i) == null) {
+ writer.print(", ");
+ } else {
+ writer.print(data.getObject(i).toString().replaceAll("[,\n]", " ") + ", ");
+ }
+ }
+ if (data.getObject(data.getMetaData().getColumnCount()) == null) {
+ writer.println("0");
+ } else {
+ writer.println(data.getObject(data.getMetaData().getColumnCount()).toString().replace(",", " "));
+ }
+ }
+ writer.close();
+ }
+
+ public void getBrands() throws SQLException {
+ PreparedStatement statement;
+ //make a connection to the database and execute the query
+ statement = connection.prepareStatement("delete from mentionsbrand");
+ statement.executeUpdate();
+ BrandChecker checker = new BrandChecker("brandonlyrules.txt");
+ query("select * from tweet");
+ NamedPreparedStatement m_insertBrand = new NamedPreparedStatement(connection, QueryUtils.insertBrand);
+ while (data.next()) {
+ List<String> brands = checker.getBrands(data.getString("text"));
+ if (brands.isEmpty()) {
+ QueryUtils.setInsertBrandParams(m_insertBrand, data.getLong("tweetid"), "no");
+ m_insertBrand.executeUpdate();
+ } else {
+ for (String brand : brands) {
+ QueryUtils.setInsertBrandParams(m_insertBrand, data.getLong("tweetid"), brand);
+ m_insertBrand.executeUpdate();
+ }
+ }
+ }
+ }
+
+ //gets the amount of users that tweet about a brand in a timezone
+ //makes a csv file timezone, brand, amount
+ public void timezone(String query) throws SQLException, FileNotFoundException, UnsupportedEncodingException{
+ query(query);
+
+ InputStream inFile = new FileInputStream("timezone.txt");
+ Scanner readFile = new Scanner(inFile);
+ HashMap<String,String> toTimezone = new HashMap<>();
+ while (readFile.hasNextLine()) {
+ String line = readFile.nextLine();
+ if(line.split(",").length>1){
+ toTimezone.put(line.split(",")[0], line.split(",")[1]);
+ }
+ }
+
+
+
+ //hashmap timezone, brand, amount
+ HashMap<String, HashMap<String, Integer>> timeMap = new HashMap<>();
+ String timezone;
+ String brand;
+
+ while(data.next()){
+ timezone = data.getString("timezone");
+ if (toTimezone.containsKey(timezone)){
+ timezone=toTimezone.get(timezone);
+ } else {
+ timezone="other";
+ }
+ brand = data.getString("brand");
+ //if the timezone is already in the map
+ if(timeMap.containsKey(timezone)){
+ //if the brand for that timezone is already in the map
+ if(timeMap.get(timezone).containsKey(brand)){
+ //increment the amount
+ timeMap.get(timezone).put(brand, timeMap.get(timezone).get(brand) + 1);
+ }
+ //if the brand for that timezone is not yet in the map
+ else{
+ //make a new entry for that brand with amount = 1
+ timeMap.get(timezone).put(brand, 1);
+ }
+ }
+ //if the timezone is not yet in the map
+ else{
+ //make a new hashmap for this map and fill it with the brand and the amount
+ timeMap.put(timezone, new HashMap<String, Integer>());
+ timeMap.get(timezone).put(brand, 1);
+ }
+ }
+
+
+ //make the CSV out of the map
+ mapToCSV(timeMap, "timezone.csv", "timezone,brand,count");
+ }
+
+ //replaces punctuation so it will be splitted
+ //also removes urls
+ private String splitPunctToWords(String text) {
+ text = text.replaceAll("https?://\\S*", "");
+ text = text.replaceAll("[!?):;\"']", " $0");
+ text = text.replaceAll("[.,-](\\s|$)", " $0");
+ text = text.replaceAll("\\s[(\"']", "$0 ");
+ return text;
+ }
+
+ //removes punctuation
+ //also removes urls
+ private String removePunct(String text) {
+ text = text.replaceAll("https?://\\S*", " ");
+ text = text.replaceAll("@\\S*", " ");
+ text = text.replaceAll("[^a-zA-Z0-9#_-]", " ");
+ return text;
+ }
+
+ //prints a hashmap into a csv for a html application
+ //Hashmap<key1, HashMap<key2, value>> becomes key1, key2, value
+ //only for String, String, Integer
+ void mapToCSV(HashMap<String, HashMap<String, Integer>> map, String fileName, String firstLine)
+ throws FileNotFoundException, UnsupportedEncodingException{
+
+ PrintWriter writer = new PrintWriter(fileName, "UTF-8");
+
+ writer.println(firstLine);
+
+ //loop over brands
+ for(Entry en : map.entrySet()){
+ //loop over words
+ for(Entry e : map.get(en.getKey()).entrySet()){
+ writer.println(en.getKey() + "," + e.getKey() + "," + e.getValue());
+ }
+ }
+
+ writer.close();
+ System.out.println("csv file made, please put it next to html file and run this");
+ }
+}
diff --git a/src/main/FarmShell.java b/src/main/FarmShell.java
index 8fc515a..6bf350e 100644
--- a/src/main/FarmShell.java
+++ b/src/main/FarmShell.java
@@ -1,212 +1,217 @@
-package main;
-
-import database.ConnectionBuilder;
-import java.io.IOException;
-import java.sql.Connection;
-import java.sql.SQLException;
-import java.util.Arrays;
-import java.util.NoSuchElementException;
-import java.util.Scanner;
-
-/**
- *
- * @author s123188
- */
-public class FarmShell {
-
- /**
- * A scanner for the stdin.
- */
- private final Scanner scanner = new Scanner(System.in);
-
- private Analyzor cached_analyzor;
- private final ConnectionBuilder dbConnectionBuilder;
-
- FarmShell(ConnectionBuilder dbConnectionBuilder) {
- this.dbConnectionBuilder = dbConnectionBuilder;
- }
-
- private void printPrompt() {
- System.out.print("$ ");
- }
-
- private Analyzor getAnalyzor() throws SQLException {
- if (cached_analyzor == null) {
- Connection dbCon = dbConnectionBuilder.create();
- cached_analyzor = new Analyzor(dbCon);
- }
- return cached_analyzor;
- }
-
- /**
- * Processes commands from stdin until the exit command is received or EOF.
- */
- public void process_forever() {
- System.err.println("Entering interactive shell, type 'help' for help "
- + "or 'exit' to leave. '.' repeats the previous interactive "
- + "command.");
- // print prompt for reading first command
- printPrompt();
- String lastLine = "";
- while (scanner.hasNextLine()) {
- String line = scanner.nextLine().trim();
- // repeat last command
- if (line.equals(".")) {
- line = lastLine;
- }
- if (!execute(line)) {
- // requested to terminate
- break;
- }
- if (!line.isEmpty()) {
- lastLine = line;
- }
- // print prompt for reading next line
- printPrompt();
- }
- }
-
- /**
- * Execute a single commands.
- *
- * @param cmd A single line of the command.
- * @return Whether to continue or exit the application.
- */
- public boolean execute(String cmd) {
- String[] args = cmd.trim().split("\\s+", 2);
- if (!args[0].isEmpty()) {
- // non-empty command, let's see whether it makes sense?
- return execute(args);
- }
- return true;
- }
-
- /**
- * Executes a command with optional parameters.
- *
- * @param args An array with the first argument containing the command with
- * optional parameters in following arguments.
- * @return true if more commands are allowed to be executed, false
- * otherwise.
- */
- public boolean execute(String[] args) {
- try {
- Command command = Command.fromString(args[0]);
- String[] params = Arrays.copyOfRange(args, 1, args.length);
- execute(command, params);
- } catch (IllegalArgumentException ex) {
- System.err.println(ex.getMessage());
- } catch (IOException ex) {
- System.err.println("Command " + args[0] + " failed with " + ex);
- ex.printStackTrace();
- } catch (NoSuchElementException ex) {
- if ("EXIT NOW".equals(ex.getMessage())) {
- // thrown by the "exit" command to signal exit
- return false;
- } else {
- System.err.println("ZOMG SOMETHIGN FAILED: " + ex.getMessage());
- ex.printStackTrace();
- }
- } catch (SQLException ex) {
- System.err.println("such " + ex);
- }
- // another satisfied customer, next!
- return true;
- }
-
- private void execute(Command command, String[] params) throws SQLException, IOException {
- if (params.length < command.getParamCount()) {
- throw new IllegalArgumentException("Expected "
- + command.getParamCount() + " parameters, got only "
- + params.length);
- }
- switch (command) {
- case filterbots:
- System.out.println("not yet implemented");
- break;
- case sentiment:
- getAnalyzor().sentimentAnalysis(params[0]);
- break;
- case wordcloud:
- getAnalyzor().makeWordCloud(params[0]);
- break;
- case timezone:
- getAnalyzor().timezone(params[0]);
- case disco:
- getAnalyzor().disco(params[0]);
- break;
- case getBrands:
- getAnalyzor().getBrands();
- break;
- case help:
- for (String line : HELP) {
- System.out.println(line);
- }
- for (Command cmd : Command.values()) {
- System.out.printf(" %-10s", cmd.name());
- if (!cmd.getDescription().isEmpty()) {
- System.out.print(" " + cmd.getDescription());
- }
- if (cmd.getParamCount() == 1) {
- System.out.print(" (1 arg)");
- } else if (cmd.getParamCount() > 1) {
- System.out.printf(" (%d args)", cmd.getParamCount());
- }
- System.out.println();
- }
- break;
- case exit:
- throw new NoSuchElementException("EXIT NOW");
- default:
- throw new AssertionError(command.name());
- }
- }
-
- enum Command {
-
- filterbots("marks all users as bot or not", 1),
- sentiment("analyzes all tweets on positivity (about a brand)", 1),
- wordcloud("makes a wordcloud of the text of the tweets", 1),
- getBrands("fills the database with the brands of a tweet"),
- timezone("makes a map per brand for the users", 1),
- disco("makes a outputfile for disco", 1),
- exit("Returns to shell"),
- help("Get help");
-
- private final String description;
- private final int paramCount;
-
- Command(String description) {
- this.description = description;
- this.paramCount = 0;
- }
-
- Command(String description, int paramCount) {
- this.description = description;
- this.paramCount = paramCount;
- }
-
- public String getDescription() {
- return description;
- }
-
- public int getParamCount() {
- return paramCount;
- }
-
- public static Command fromString(String command) {
- for (Command cmd : values()) {
- if (cmd.name().equals(command)) {
- return cmd;
- }
- }
- throw new IllegalArgumentException("Unrecognized command. Hint: help");
- }
- };
-
- private final String[] HELP = new String[]{
- "Interactive TweetShell",
- "",
- "Available commands:"
- };
-}
+package main;
+
+import database.ConnectionBuilder;
+import java.io.IOException;
+import java.sql.Connection;
+import java.sql.SQLException;
+import java.util.Arrays;
+import java.util.NoSuchElementException;
+import java.util.Scanner;
+
+/**
+ * Interactive shell that reads commands from stdin and dispatches them to an
+ * {@link Analyzor}.
+ *
+ * @author s123188
+ */
+public class FarmShell {
+
+    /**
+     * A scanner for the stdin.
+     */
+    private final Scanner scanner = new Scanner(System.in);
+
+    /**
+     * The analyzor, created on first use by {@link #getAnalyzor()}.
+     */
+    private Analyzor cached_analyzor;
+    /**
+     * Creates the database connection that is handed to the analyzor.
+     */
+    private final ConnectionBuilder dbConnectionBuilder;
+
+    FarmShell(ConnectionBuilder dbConnectionBuilder) {
+        this.dbConnectionBuilder = dbConnectionBuilder;
+    }
+
+    private void printPrompt() {
+        System.out.print("$ ");
+    }
+
+    /**
+     * Returns the analyzor, creating it (and its database connection) the
+     * first time it is requested.
+     *
+     * @return the cached analyzor instance
+     * @throws SQLException if creating the connection or analyzor fails
+     */
+    private Analyzor getAnalyzor() throws SQLException {
+        if (cached_analyzor == null) {
+            Connection dbCon = dbConnectionBuilder.create();
+            cached_analyzor = new Analyzor(dbCon);
+        }
+        return cached_analyzor;
+    }
+
+    /**
+     * Processes commands from stdin until the exit command is received or EOF.
+     */
+    public void process_forever() {
+        System.err.println("Entering interactive shell, type 'help' for help "
+                + "or 'exit' to leave. '.' repeats the previous interactive "
+                + "command.");
+        // print prompt for reading first command
+        printPrompt();
+        String lastLine = "";
+        while (scanner.hasNextLine()) {
+            String line = scanner.nextLine().trim();
+            // repeat last command
+            if (line.equals(".")) {
+                line = lastLine;
+            }
+            if (!execute(line)) {
+                // requested to terminate
+                break;
+            }
+            // empty lines do not overwrite the command that '.' repeats
+            if (!line.isEmpty()) {
+                lastLine = line;
+            }
+            // print prompt for reading next line
+            printPrompt();
+        }
+    }
+
+    /**
+     * Executes a single command line.
+     *
+     * @param cmd A single line of the command.
+     * @return Whether to continue or exit the application.
+     */
+    public boolean execute(String cmd) {
+        // split into the command name and at most one parameter string
+        String[] args = cmd.trim().split("\\s+", 2);
+        if (!args[0].isEmpty()) {
+            // non-empty command, let's see whether it makes sense?
+            return execute(args);
+        }
+        return true;
+    }
+
+    /**
+     * Executes a command with optional parameters.
+     *
+     * @param args An array with the first argument containing the command with
+     * optional parameters in following arguments.
+     * @return true if more commands are allowed to be executed, false
+     * otherwise.
+     */
+    public boolean execute(String[] args) {
+        try {
+            Command command = Command.fromString(args[0]);
+            String[] params = Arrays.copyOfRange(args, 1, args.length);
+            execute(command, params);
+        } catch (IllegalArgumentException ex) {
+            // unknown command or too few parameters
+            System.err.println(ex.getMessage());
+        } catch (IOException ex) {
+            System.err.println("Command " + args[0] + " failed with " + ex);
+            ex.printStackTrace();
+        } catch (NoSuchElementException ex) {
+            if ("EXIT NOW".equals(ex.getMessage())) {
+                // thrown by the "exit" command to signal exit
+                return false;
+            } else {
+                System.err.println("ZOMG SOMETHIGN FAILED: " + ex.getMessage());
+                ex.printStackTrace();
+            }
+        } catch (SQLException ex) {
+            System.err.println("such " + ex);
+        }
+        // another satisfied customer, next!
+        return true;
+    }
+
+    /**
+     * Runs one parsed command.
+     *
+     * @param command the command to run
+     * @param params the parameters given after the command name
+     * @throws IllegalArgumentException if fewer parameters are given than the
+     * command requires
+     * @throws NoSuchElementException with message "EXIT NOW" for the exit
+     * command
+     * @throws SQLException if the analyzor reports a database failure
+     * @throws IOException if the analyzor reports an I/O failure
+     */
+    private void execute(Command command, String[] params) throws SQLException, IOException {
+        if (params.length < command.getParamCount()) {
+            throw new IllegalArgumentException("Expected "
+                    + command.getParamCount() + " parameters, got only "
+                    + params.length);
+        }
+        switch (command) {
+            case filterbots:
+                System.out.println("not yet implemented");
+                break;
+            case sentiment:
+                // if there is no query, update all unrated items.
+                getAnalyzor().sentimentAnalysis(params.length > 0 ? params[0] : "");
+                break;
+            case wordcloud:
+                getAnalyzor().makeWordCloud(params[0]);
+                break;
+            case timezone:
+                getAnalyzor().timezone(params[0]);
+                // without this break the timezone command also ran disco
+                break;
+            case disco:
+                getAnalyzor().disco(params[0]);
+                break;
+            case getBrands:
+                getAnalyzor().getBrands();
+                break;
+            case help:
+                for (String line : HELP) {
+                    System.out.println(line);
+                }
+                // one line per command: name, description, required arg count
+                for (Command cmd : Command.values()) {
+                    System.out.printf(" %-10s", cmd.name());
+                    if (!cmd.getDescription().isEmpty()) {
+                        System.out.print(" " + cmd.getDescription());
+                    }
+                    if (cmd.getParamCount() == 1) {
+                        System.out.print(" (1 arg)");
+                    } else if (cmd.getParamCount() > 1) {
+                        System.out.printf(" (%d args)", cmd.getParamCount());
+                    }
+                    System.out.println();
+                }
+                break;
+            case exit:
+                // caught in execute(String[]) and turned into "stop the shell"
+                throw new NoSuchElementException("EXIT NOW");
+            default:
+                throw new AssertionError(command.name());
+        }
+    }
+
+    /**
+     * The commands understood by the shell, each with a help description and
+     * the minimum number of parameters it requires.
+     */
+    enum Command {
+
+        filterbots("marks all users as bot or not", 1),
+        sentiment("analyzes all tweets on brand positivity (optional arg: tweet/brand selection query)"),
+        wordcloud("makes a wordcloud of the text of the tweets", 1),
+        getBrands("fills the database with the brands of a tweet"),
+        timezone("makes a map per brand for the users", 1),
+        disco("makes a outputfile for disco", 1),
+        exit("Returns to shell"),
+        help("Get help");
+
+        private final String description;
+        private final int paramCount;
+
+        /**
+         * Creates a command that requires no parameters.
+         */
+        Command(String description) {
+            this.description = description;
+            this.paramCount = 0;
+        }
+
+        /**
+         * Creates a command that requires at least paramCount parameters.
+         */
+        Command(String description, int paramCount) {
+            this.description = description;
+            this.paramCount = paramCount;
+        }
+
+        public String getDescription() {
+            return description;
+        }
+
+        public int getParamCount() {
+            return paramCount;
+        }
+
+        /**
+         * Looks up a command by its exact (case-sensitive) name.
+         *
+         * @param command the name as typed by the user
+         * @return the matching command
+         * @throws IllegalArgumentException if no command has that name
+         */
+        public static Command fromString(String command) {
+            for (Command cmd : values()) {
+                if (cmd.name().equals(command)) {
+                    return cmd;
+                }
+            }
+            throw new IllegalArgumentException("Unrecognized command. Hint: help");
+        }
+    }
+
+    /**
+     * Header lines printed by the help command before the command list.
+     */
+    private final String[] HELP = new String[]{
+        "Interactive TweetShell",
+        "",
+        "Available commands:"
+    };
+}
diff --git a/test/analysis/BrandCheckerTest.java b/test/analysis/BrandCheckerTest.java
index 23d8445..f55035b 100644
--- a/test/analysis/BrandCheckerTest.java
+++ b/test/analysis/BrandCheckerTest.java
@@ -82,10 +82,10 @@ public class BrandCheckerTest {
public void testBullshit() {
doTest("This applepie is delicious", new String[]{});
}
-
+
@Test
public void multipleBrands() {
- doTest("This tweet contains both iphone 4s,galaxy s5 and iphone", new String[]{"iphone 4s","galaxy s5"});
+ doTest("This tweet contains both iphone 4s,galaxy s5 and iphone", new String[]{"iphone 4s", "galaxy s5"});
}
}