author     Maurice Laveaux <m.laveaux@student.tue.nl>  2014-05-19 11:13:35 +0200
committer  Maurice Laveaux <m.laveaux@student.tue.nl>  2014-05-19 11:13:35 +0200
commit     b1bbd92fdcf9f52d38852b6c5ae802cc401ff712 (patch)
tree       7ece7058a0ca39d4773b18e70437b8978aa3022a /src
parent     058ef3d39029da492477c3c59f1e405193ca3013 (diff)
parent     1f5085cb29caa40c96878dd0fc4194abdb2cac2b (diff)
download   Goldfarmer-b1bbd92fdcf9f52d38852b6c5ae802cc401ff712.tar.gz
Merge branch 'master' of git.lekensteyn.nl:tue/2IOC0-DBL/Goldfarmer
Diffstat (limited to 'src')
-rw-r--r--  src/database/QueryUtils.java    6
-rw-r--r--  src/main/Analyzor.java        509
-rw-r--r--  src/main/FarmShell.java         4
3 files changed, 255 insertions, 264 deletions
diff --git a/src/database/QueryUtils.java b/src/database/QueryUtils.java
index 9aab081..2cc6fd6 100644
--- a/src/database/QueryUtils.java
+++ b/src/database/QueryUtils.java
@@ -99,4 +99,10 @@ public class QueryUtils {
}
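+ /**
+ * Sets the named parameters of an insertBrand statement.
+ *
+ * @param brandStmt The prepared statement built from QueryUtils.insertBrand.
+ * @param id The tweetid of the tweet in which the brand was found.
+ * @param brand The brand that was mentioned.
+ * @throws SQLException When a parameter cannot be set.
+ */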
+ public static void setInsertBrandParams(NamedPreparedStatement brandStmt,
+ long id, String brand) throws SQLException {
+ brandStmt.setLong("tweetid", id);
+ brandStmt.setString("brand", brand);
+ // TODO: rating (positive)
+ }
}
diff --git a/src/main/Analyzor.java b/src/main/Analyzor.java
index 39df43c..aa2d0a4 100644
--- a/src/main/Analyzor.java
+++ b/src/main/Analyzor.java
@@ -1,264 +1,245 @@
-package main;
-
-import database.NamedPreparedStatement;
-import database.QueryUtils;
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.io.PrintWriter;
-import java.io.UnsupportedEncodingException;
-import java.sql.Connection;
-import java.sql.PreparedStatement;
-import java.sql.ResultSet;
-import java.sql.SQLException;
-import java.util.HashMap;
-import java.util.Scanner;
-
-/**
- * The sentiment analysis class that rates tweets based on a unigram and bigram
- * set of weights.
- */
-public class Analyzor {
-
- /**
- * The map that matches single words to their weights.
- */
- private final HashMap<String, Double> unimap = new HashMap();
-
- /**
- * The map that matches word pairs to their weights.
- */
- private final HashMap<String, Double> bimap = new HashMap();
-
- /**
- * The results of the query.
- */
- private ResultSet data;
-
- /**
- * The existing connection to the database.
- */
- private final Connection connection;
-
- /**
- * @param connection A connection to the database.
- */
- public Analyzor(Connection connection) {
- this.connection = connection;
- }
-
- /**
- * Read the unigram and bigram lexica.
- *
- * @throws FileNotFoundException
- */
- public void readLexicon() throws FileNotFoundException {
- if (!unimap.isEmpty()) {
- // data is already read.
- return;
- }
- System.err.println("Trying to read lexicons...");
- // A unigram is in the format (WS = whitespace):
- // word <WS> rating <WS> ??? <WS> ??
- // A bigram has an two WS-separated words instead of one.
- try (Scanner uniScanner = new Scanner(new File("unigrams-pmilexicon.txt"));
- Scanner biScanner = new Scanner(new File("bigrams-pmilexicon.txt"));) {
- //Fill the map of unigrams
- while (uniScanner.hasNext()) {
- String words = uniScanner.next();
- unimap.put(words.toLowerCase(), uniScanner.nextDouble());
- if (uniScanner.hasNextLine()) {
- uniScanner.nextLine();
- }
- }
-
- //fill the map of bigrams
- while (biScanner.hasNext()) {
- String words = biScanner.next() + " " + biScanner.next();
- bimap.put(words.toLowerCase(), biScanner.nextDouble());
- if (biScanner.hasNextLine()) {
- biScanner.nextLine();
- }
- }
- }
- System.err.println("Lexicons are read.");
- }
-
- /**
- * Executes a query that the analyzer can analyze.
- *
- * @param query The query string to execute.
- * @throws SQLException When database connection isn't available.
- */
- public void query(String query) throws SQLException {
- PreparedStatement statement;
- //make a connection to the database and execute the query
- statement = connection.prepareStatement(query);
- data = statement.executeQuery();
- }
-
- /**
- * Run a sentiment analysis and fill the database with the output.
- *
- * @param query The query to analyze.
- * @throws SQLException
- * @throws IOException
- */
- public void sentimentAnalysis(String query) throws SQLException, IOException {
- query(query);
-
- //read the lexicons
- readLexicon();
-
- //go to the start of te dataset
- if (data == null) {
- System.err.println("data is empty, try querying first");
- return;
- }
-
- Double value;
- String text;
-
- //for all tuples
- while (data.next()) {
- //get the text
- text = data.getString("text");
- text = splitPunctToWords(text);
- // test is the tweet text you are going to analyze
- String[] words = text.split("\\s+"); // text splitted into separate words
- double positiverate = 0; // positive rating
-
- // Rate the text with unigrams
- for (String word : words) {
- value = unimap.get(word);
- if (value != null) {
- positiverate += unimap.get(word);
- }
- }
- // Rate the text with bigrams
- for (int i = 0; i < words.length - 1; i++) {
- String pair = words[i] + " " + words[i + 1];
- value = bimap.get(pair);
- if (value != null) {
- positiverate += bimap.get(pair);
- }
- }
- //insert the rating into the database
- NamedPreparedStatement m_insertRating;
- m_insertRating = new NamedPreparedStatement(connection, QueryUtils.insertRating);
- QueryUtils.setInsertParams(m_insertRating, data.getLong("tweetid"), data.getString("brand"), (int) (positiverate * 10));
- m_insertRating.executeUpdate();
- //don't print the rate
- //System.out.println(text + ": " + (int) (positiverate * 10));
- }
- }
-
- /**
- * Make a word cloud of the results of a query.
- *
- * @param query
- * @throws SQLException
- * @throws FileNotFoundException
- * @throws UnsupportedEncodingException
- */
- public void makeWordCloud(String query) throws SQLException, FileNotFoundException, UnsupportedEncodingException {
-
- query(query);
- //go to the start of the ResultSet data
- if (data == null) {
- System.err.println("data is empty, try querying first");
- return;
- }
-
- String text;
- String[] words;
- Integer value;
- String tweetid;
-
- PrintWriter writer = new PrintWriter("wordcloud.csv", "UTF-8");
- //print the first row
- writer.println("tweetid, word");
-
- while (data.next()) {
- //get the text
- text = data.getString("text");
- //remove punctuation, convert to lowercase and split on words
- text = removePunct(text);
- text = text.toLowerCase();
- words = text.split("\\s+");
- //we use the tweetid as case id
- tweetid = Long.toString(data.getLong("tweetid"));
-
- for (String word : words) {
- writer.println(tweetid + ", " + word);
- }
- }
- //print it in a csv file to put in disco
-
- //print the first row
-
- //print the values
- writer.close();
- }
-
- /**
- * Makes a comma seperated values file from the query and writes it to output.csv
- *
- * @param query The string of the query to execute.
- * @throws SQLException
- * @throws FileNotFoundException
- * @throws UnsupportedEncodingException
- */
- public void disco(String query) throws SQLException, FileNotFoundException, UnsupportedEncodingException {
- //do the query
- query(query);
- PrintWriter writer = new PrintWriter("output.csv", "UTF-8");
- //print the first row
- for (int i = 1; i < data.getMetaData().getColumnCount(); i++) {
- writer.print(data.getMetaData().getColumnLabel(i) + ", ");
- }
- writer.println(data.getMetaData().getColumnLabel(data.getMetaData().getColumnCount()));
- //print the values
- while (data.next()) {
- for (int i = 1; i < data.getMetaData().getColumnCount(); i++) {
- if (data.getObject(i) == null) {
- writer.print(", ");
- } else {
- writer.print(data.getObject(i).toString().replaceAll("[,\n]", " ") + ", ");
- }
- }
- if(data.getObject(data.getMetaData().getColumnCount())==null){
- writer.println("0");
- } else {
- writer.println(data.getObject(data.getMetaData().getColumnCount()).toString().replace(",", " "));
- }
- }
- writer.close();
- }
-
- /**
- * Replaces punctuation so it can be split correctly.
- *
- * @param text The input text
- * @return The text with punctuation replaced.
- */
- private String splitPunctToWords(String text) {
- text = text.replaceAll("https?://\\S*", "");
- text = text.replaceAll("[!?):;\"']", " $0");
- text = text.replaceAll("[.,-](\\s|$)", " $0");
- text = text.replaceAll("\\s[(\"']", "$0 ");
- return text;
- }
-
- /**
- * Removes punctuation and urls.
- *
- * @param text The input text
- * @return The text with punctuation removed.
- */
- private String removePunct(String text) {
- text = text.replaceAll("https?://\\S*", "");
- text = text.replaceAll("[.,!?()-:;\"']", " ");
- return text;
- }
-}
+package main;
+
+import analysis.BrandChecker;
+import database.NamedPreparedStatement;
+import database.QueryUtils;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.io.UnsupportedEncodingException;
+import java.sql.Connection;
+import java.sql.PreparedStatement;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.util.HashMap;
+import java.util.Map.Entry;
+import java.util.Scanner;
+
+/**
+ * The sentiment analysis class that rates tweets based on a unigram and bigram
+ * set of weights.
+ */
+public class Analyzor {
+
+ /**
+ * The map that matches single words to their weights.
+ */
+ private final HashMap<String, Double> unimap = new HashMap<>();
+
+ /**
+ * The map that matches word pairs to their weights.
+ */
+ private final HashMap<String, Double> bimap = new HashMap<>();
+
+ private ResultSet data;
+ private final Connection connection;
+
+ Analyzor(Connection connection) {
+ this.connection = connection;
+ }
+
+ //reads the lexicons
+ void readLexicon() throws FileNotFoundException {
+ if (!unimap.isEmpty()) {
+ // data is already read.
+ return;
+ }
+ System.err.println("Trying to read lexicons...");
+ // A unigram is in the format (WS = whitespace):
+ // word <WS> rating <WS> ??? <WS> ??
+ // A bigram has two WS-separated words instead of one.
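+ // Anything after the rating on a line is skipped when filling the maps.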
+ try (Scanner uniScanner = new Scanner(new File("unigrams-pmilexicon.txt"));
+ Scanner biScanner = new Scanner(new File("bigrams-pmilexicon.txt"));) {
+ //Fill the map of unigrams
+ while (uniScanner.hasNext()) {
+ String words = uniScanner.next();
+ unimap.put(words.toLowerCase(), uniScanner.nextDouble());
+ if (uniScanner.hasNextLine()) {
+ uniScanner.nextLine();
+ }
+ }
+
+ //fill the map of bigrams
+ while (biScanner.hasNext()) {
+ String words = biScanner.next() + " " + biScanner.next();
+ bimap.put(words.toLowerCase(), biScanner.nextDouble());
+ if (biScanner.hasNextLine()) {
+ biScanner.nextLine();
+ }
+ }
+ }
+ System.err.println("Lexicons are read.");
+ }
+
+ /**
+ * Executes a query that the analyzer can analyze.
+ *
+ * @param query The query string to execute.
+ * @throws SQLException When database connection isn't available.
+ */
+ public void query(String query) throws SQLException {
+ PreparedStatement statement;
+ //make a connection to the database and execute the query
+ statement = connection.prepareStatement(query);
+ data = statement.executeQuery();
+ }
+
+ /**
+ * Run a sentiment analysis and fill the database with the output.
+ *
+ * @param query The query to analyze.
+ * @throws SQLException
+ * @throws IOException
+ */
+ public void sentimentAnalysis(String query) throws SQLException, IOException {
+ query(query);
+
+ //read the lexicons
+ readLexicon();
+
+ //make sure there is a query result to analyze
+ if (data == null) {
+ System.err.println("data is empty, try querying first");
+ return;
+ }
+
+ Double value;
+ String text;
+
+ //for all tuples
+ while (data.next()) {
+ //get the text
+ text = data.getString("text");
+ text = splitPunctToWords(text);
+ // text is the tweet text that is going to be analyzed
+ String[] words = text.split("\\s+"); // text split into separate words
+ double positiverate = 0; // positive rating
+
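+ // Example (hypothetical weights): for the text "very good service" the rating
+ // becomes w("very") + w("good") + w("service") + w("very good") + w("good service"),
+ // skipping any unigram or bigram that is not in the lexicons.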
+ // Rate the text with unigrams
+ for (String word : words) {
+ value = unimap.get(word);
+ if (value != null) {
+ positiverate += value;
+ }
+ }
+ // Rate the text with bigrams
+ for (int i = 0; i < words.length - 1; i++) {
+ String pair = words[i] + " " + words[i + 1];
+ value = bimap.get(pair);
+ if (value != null) {
+ positiverate += value;
+ }
+ }
+ //insert the rating into the database
+ NamedPreparedStatement m_insertRating;
+ m_insertRating = new NamedPreparedStatement(connection, QueryUtils.insertRating);
+ QueryUtils.setInsertParams(m_insertRating, data.getLong("tweetid"), data.getString("brand"), (int) (positiverate * 10));
+ m_insertRating.executeUpdate();
+ //don't print the rate
+ //System.out.println(text + ": " + (int) (positiverate * 10));
+ }
+ }
+
+ //makes a wordcloud of the tweets in the ResultSet data
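+ //writes one "tweetid, word" row per word of every tweet to wordcloud.csv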
+ void makeWordCloud(String query) throws SQLException, FileNotFoundException, UnsupportedEncodingException {
+
+ query(query);
+ //make sure there is a query result
+ if (data == null) {
+ System.err.println("data is empty, try querying first");
+ return;
+ }
+
+ String text;
+ String[] words;
+ String tweetid;
+
+ PrintWriter writer = new PrintWriter("wordcloud.csv", "UTF-8");
+ //print the first row
+ writer.println("tweetid, word");
+
+ while (data.next()) {
+ //get the text
+ text = data.getString("text");
+ //remove punctuation, convert to lowercase and split on words
+ text = removePunct(text);
+ text = text.toLowerCase();
+ words = text.split("\\s+");
+ //we use the tweetid as case id
+ tweetid = Long.toString(data.getLong("tweetid"));
+
+ for (String word : words) {
+ writer.println(tweetid + ", " + word);
+ }
+ }
+
+ writer.close();
+ }
+
+ //generates output.csv for disco from the query results
+ public void disco(String query) throws SQLException, FileNotFoundException, UnsupportedEncodingException {
+ //do the query
+ query(query);
+ PrintWriter writer = new PrintWriter("output.csv", "UTF-8");
+ //print the first row
+ for (int i = 1; i < data.getMetaData().getColumnCount(); i++) {
+ writer.print(data.getMetaData().getColumnLabel(i) + ", ");
+ }
+ writer.println(data.getMetaData().getColumnLabel(data.getMetaData().getColumnCount()));
+ //print the values
+ while (data.next()) {
+ for (int i = 1; i < data.getMetaData().getColumnCount(); i++) {
+ if (data.getObject(i) == null) {
+ writer.print(", ");
+ } else {
+ writer.print(data.getObject(i).toString().replaceAll("[,\n]", " ") + ", ");
+ }
+ }
+ if (data.getObject(data.getMetaData().getColumnCount()) == null) {
+ writer.println("0");
+ } else {
+ writer.println(data.getObject(data.getMetaData().getColumnCount()).toString().replace(",", " "));
+ }
+ }
+ writer.close();
+ }
+
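+ /**
+ * Fills the mentionsbrand table: removes the existing rows, then runs the
+ * BrandChecker (constructed with brandrules.txt) on the text of every tweet
+ * and inserts a row for each brand that is found.
+ *
+ * @throws SQLException When the database connection isn't available.
+ */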
+ public void getBrands() throws SQLException {
+ PreparedStatement statement;
+ //make a connection to the database and execute the query
+ statement = connection.prepareStatement("delete from mentionsbrand");
+ statement.executeUpdate();
+ BrandChecker checker = new BrandChecker("brandrules.txt");
+ query("select * from tweet");
+ NamedPreparedStatement m_insertBrand = new NamedPreparedStatement(connection, QueryUtils.insertBrand);
+ while (data.next()) {
+ for (String brand : checker.getBrands(data.getString("text"))) {
+ QueryUtils.setInsertBrandParams(m_insertBrand, data.getLong("tweetid"), brand);
+ m_insertBrand.executeUpdate();
+ }
+ }
+ }
+
+ //inserts spaces around punctuation so that splitting on whitespace keeps
+ //punctuation as separate tokens; also removes urls
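+ // Example (hypothetical input): "Great service!" becomes "Great service !"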
+ private String splitPunctToWords(String text) {
+ text = text.replaceAll("https?://\\S*", "");
+ text = text.replaceAll("[!?):;\"']", " $0");
+ text = text.replaceAll("[.,-](\\s|$)", " $0");
+ text = text.replaceAll("\\s[(\"']", "$0 ");
+ return text;
+ }
+
+ //removes punctuation
+ //also removes urls
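+ // Example (hypothetical input): "Good, cheap phone!" becomes "Good  cheap phone "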
+ private String removePunct(String text) {
+ text = text.replaceAll("https?://\\S*", "");
+ // the '-' is placed last in the class so it matches a literal hyphen, not a ')-:' range
+ text = text.replaceAll("[.,!?():;\"'-]", " ");
+ return text;
+ }
+}
diff --git a/src/main/FarmShell.java b/src/main/FarmShell.java
index 044d3d3..1266fd3 100644
--- a/src/main/FarmShell.java
+++ b/src/main/FarmShell.java
@@ -133,6 +133,9 @@ public class FarmShell {
case disco:
getAnalyzor().disco(params[0]);
break;
+ case getBrands:
+ getAnalyzor().getBrands();
+ break;
case help:
for (String line : HELP) {
System.out.println(line);
@@ -162,6 +165,7 @@ public class FarmShell {
filterbots("marks all users as bot or not", 1),
sentiment("analyzes all tweets on positivity (about a brand)", 1),
wordcloud("makes a wordcloud of the text of the tweets", 1),
+ getBrands("fills the database with the brands mentioned in each tweet"),
disco("makes a outputfile for disco",1),
exit("Returns to shell"),
help("Get help");