From 23580362ed7a9d822cca210510c69b3093bdf053 Mon Sep 17 00:00:00 2001 From: S129778 Date: Mon, 19 May 2014 09:50:27 +0200 Subject: get brands connection to database --- src/database/QueryUtils.java | 6 + src/main/Analyzor.java | 473 ++++++++++++++++++++++--------------------- src/main/FarmShell.java | 4 + 3 files changed, 255 insertions(+), 228 deletions(-) (limited to 'src') diff --git a/src/database/QueryUtils.java b/src/database/QueryUtils.java index 9aab081..2cc6fd6 100644 --- a/src/database/QueryUtils.java +++ b/src/database/QueryUtils.java @@ -99,4 +99,10 @@ public class QueryUtils { } + public static void setInsertBrandParams(NamedPreparedStatement brandStmt, + long id, String brand) throws SQLException { + brandStmt.setLong("tweetid", id); + brandStmt.setString("brand", brand); + // TODO: rating (positive) + } } diff --git a/src/main/Analyzor.java b/src/main/Analyzor.java index 97e781e..2b21fd4 100644 --- a/src/main/Analyzor.java +++ b/src/main/Analyzor.java @@ -1,228 +1,245 @@ -package main; - -import database.NamedPreparedStatement; -import database.QueryUtils; -import java.io.File; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.io.PrintWriter; -import java.io.UnsupportedEncodingException; -import java.sql.Connection; -import java.sql.PreparedStatement; -import java.sql.ResultSet; -import java.sql.SQLException; -import java.util.HashMap; -import java.util.Map.Entry; -import java.util.Scanner; - -/** - * The sentiment analysis class that rates tweets based on a unigram and bigram - * set of weights. - */ -public class Analyzor { - - /** - * The map that matches single words to their weights. - */ - private final HashMap unimap = new HashMap(); - - /** - * The map that matches word pairs to their weights. - */ - private final HashMap bimap = new HashMap(); - - private ResultSet data; - private final Connection connection; - - Analyzor(Connection connection) { - this.connection = connection; - } - - //reads the lexicons - void readLexicon() throws FileNotFoundException { - if (!unimap.isEmpty()) { - // data is already read. - return; - } - System.err.println("Trying to read lexicons..."); - // A unigram is in the format (WS = whitespace): - // word rating ??? ?? - // A bigram has an two WS-separated words instead of one. - try (Scanner uniScanner = new Scanner(new File("unigrams-pmilexicon.txt")); - Scanner biScanner = new Scanner(new File("bigrams-pmilexicon.txt"));) { - //Fill the map of unigrams - while (uniScanner.hasNext()) { - String words = uniScanner.next(); - unimap.put(words.toLowerCase(), uniScanner.nextDouble()); - if (uniScanner.hasNextLine()) { - uniScanner.nextLine(); - } - } - - //fill the map of bigrams - while (biScanner.hasNext()) { - String words = biScanner.next() + " " + biScanner.next(); - bimap.put(words.toLowerCase(), biScanner.nextDouble()); - if (biScanner.hasNextLine()) { - biScanner.nextLine(); - } - } - } - System.err.println("Lexicons are read."); - } - - /** - * Executes a query that the analyzer can analyze. - * - * @param query The query string to execute. - * @throws SQLException When database connection isn't available. - */ - public void query(String query) throws SQLException { - PreparedStatement statement; - //make a connection to the database and execute the query - statement = connection.prepareStatement(query); - data = statement.executeQuery(); - } - - /** - * Run a sentiment analysis and fill the database with the output. - * - * @throws SQLException - * @throws IOException - */ - public void sentimentAnalysis(String query) throws SQLException, IOException { - query(query); - - //read the lexicons - readLexicon(); - - //go to the start of te dataset - if (data == null) { - System.err.println("data is empty, try querying first"); - return; - } - - Double value; - String text; - - //for all tuples - while (data.next()) { - //get the text - text = data.getString("text"); - text = splitPunctToWords(text); - // test is the tweet text you are going to analyze - String[] words = text.split("\\s+"); // text splitted into separate words - double positiverate = 0; // positive rating - - // Rate the text with unigrams - for (String word : words) { - value = unimap.get(word); - if (value != null) { - positiverate += unimap.get(word); - } - } - // Rate the text with bigrams - for (int i = 0; i < words.length - 1; i++) { - String pair = words[i] + " " + words[i + 1]; - value = bimap.get(pair); - if (value != null) { - positiverate += bimap.get(pair); - } - } - //insert the rating into the database - NamedPreparedStatement m_insertRating; - m_insertRating = new NamedPreparedStatement(connection, QueryUtils.insertRating); - QueryUtils.setInsertParams(m_insertRating, data.getLong("tweetid"), data.getString("brand"), (int) (positiverate * 10)); - m_insertRating.executeUpdate(); - //don't print the rate - //System.out.println(text + ": " + (int) (positiverate * 10)); - } - } - - //makes a wordcloud of the tweets in the ResultSet data - void makeWordCloud(String query) throws SQLException, FileNotFoundException, UnsupportedEncodingException { - - query(query); - //go to the start of the ResultSet data - if (data == null) { - System.err.println("data is empty, try querying first"); - return; - } - - String text; - String[] words; - Integer value; - String tweetid; - - PrintWriter writer = new PrintWriter("wordcloud.csv", "UTF-8"); - //print the first row - writer.println("tweetid, word"); - - while (data.next()) { - //get the text - text = data.getString("text"); - //remove punctuation, convert to lowercase and split on words - text = removePunct(text); - text = text.toLowerCase(); - words = text.split("\\s+"); - //we use the tweetid as case id - tweetid = Long.toString(data.getLong("tweetid")); - - for (String word : words) { - writer.println(tweetid + ", " + word); - } - } - //print it in a csv file to put in disco - - //print the first row - - //print the values - writer.close(); - } - - //generate csv for disco from the query - public void disco(String query) throws SQLException, FileNotFoundException, UnsupportedEncodingException { - //do the query - query(query); - PrintWriter writer = new PrintWriter("output.csv", "UTF-8"); - //print the first row - for (int i = 1; i < data.getMetaData().getColumnCount(); i++) { - writer.print(data.getMetaData().getColumnLabel(i) + ", "); - } - writer.println(data.getMetaData().getColumnLabel(data.getMetaData().getColumnCount())); - //print the values - while (data.next()) { - for (int i = 1; i < data.getMetaData().getColumnCount(); i++) { - if (data.getObject(i) == null) { - writer.print(", "); - } else { - writer.print(data.getObject(i).toString().replaceAll("[,\n]", " ") + ", "); - } - } - if(data.getObject(data.getMetaData().getColumnCount())==null){ - writer.println("0"); - } else { - writer.println(data.getObject(data.getMetaData().getColumnCount()).toString().replace(",", " ")); - } - } - writer.close(); - } - - //replaces punctuation so it will be splitted - //also removes urls - private String splitPunctToWords(String text) { - text = text.replaceAll("https?://\\S*", ""); - text = text.replaceAll("[!?):;\"']", " $0"); - text = text.replaceAll("[.,-](\\s|$)", " $0"); - text = text.replaceAll("\\s[(\"']", "$0 "); - return text; - } - - //removes punctuation - //also removes urls - private String removePunct(String text) { - text = text.replaceAll("https?://\\S*", ""); - text = text.replaceAll("[.,!?()-:;\"']", " "); - return text; - } -} +package main; + +import analysis.BrandChecker; +import database.NamedPreparedStatement; +import database.QueryUtils; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.PrintWriter; +import java.io.UnsupportedEncodingException; +import java.sql.Connection; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.util.HashMap; +import java.util.Map.Entry; +import java.util.Scanner; + +/** + * The sentiment analysis class that rates tweets based on a unigram and bigram + * set of weights. + */ +public class Analyzor { + + /** + * The map that matches single words to their weights. + */ + private final HashMap unimap = new HashMap(); + + /** + * The map that matches word pairs to their weights. + */ + private final HashMap bimap = new HashMap(); + + private ResultSet data; + private final Connection connection; + + Analyzor(Connection connection) { + this.connection = connection; + } + + //reads the lexicons + void readLexicon() throws FileNotFoundException { + if (!unimap.isEmpty()) { + // data is already read. + return; + } + System.err.println("Trying to read lexicons..."); + // A unigram is in the format (WS = whitespace): + // word rating ??? ?? + // A bigram has an two WS-separated words instead of one. + try (Scanner uniScanner = new Scanner(new File("unigrams-pmilexicon.txt")); + Scanner biScanner = new Scanner(new File("bigrams-pmilexicon.txt"));) { + //Fill the map of unigrams + while (uniScanner.hasNext()) { + String words = uniScanner.next(); + unimap.put(words.toLowerCase(), uniScanner.nextDouble()); + if (uniScanner.hasNextLine()) { + uniScanner.nextLine(); + } + } + + //fill the map of bigrams + while (biScanner.hasNext()) { + String words = biScanner.next() + " " + biScanner.next(); + bimap.put(words.toLowerCase(), biScanner.nextDouble()); + if (biScanner.hasNextLine()) { + biScanner.nextLine(); + } + } + } + System.err.println("Lexicons are read."); + } + + /** + * Executes a query that the analyzer can analyze. + * + * @param query The query string to execute. + * @throws SQLException When database connection isn't available. + */ + public void query(String query) throws SQLException { + PreparedStatement statement; + //make a connection to the database and execute the query + statement = connection.prepareStatement(query); + data = statement.executeQuery(); + } + + /** + * Run a sentiment analysis and fill the database with the output. + * + * @throws SQLException + * @throws IOException + */ + public void sentimentAnalysis(String query) throws SQLException, IOException { + query(query); + + //read the lexicons + readLexicon(); + + //go to the start of te dataset + if (data == null) { + System.err.println("data is empty, try querying first"); + return; + } + + Double value; + String text; + + //for all tuples + while (data.next()) { + //get the text + text = data.getString("text"); + text = splitPunctToWords(text); + // test is the tweet text you are going to analyze + String[] words = text.split("\\s+"); // text splitted into separate words + double positiverate = 0; // positive rating + + // Rate the text with unigrams + for (String word : words) { + value = unimap.get(word); + if (value != null) { + positiverate += unimap.get(word); + } + } + // Rate the text with bigrams + for (int i = 0; i < words.length - 1; i++) { + String pair = words[i] + " " + words[i + 1]; + value = bimap.get(pair); + if (value != null) { + positiverate += bimap.get(pair); + } + } + //insert the rating into the database + NamedPreparedStatement m_insertRating; + m_insertRating = new NamedPreparedStatement(connection, QueryUtils.insertRating); + QueryUtils.setInsertParams(m_insertRating, data.getLong("tweetid"), data.getString("brand"), (int) (positiverate * 10)); + m_insertRating.executeUpdate(); + //don't print the rate + //System.out.println(text + ": " + (int) (positiverate * 10)); + } + } + + //makes a wordcloud of the tweets in the ResultSet data + void makeWordCloud(String query) throws SQLException, FileNotFoundException, UnsupportedEncodingException { + + query(query); + //go to the start of the ResultSet data + if (data == null) { + System.err.println("data is empty, try querying first"); + return; + } + + String text; + String[] words; + Integer value; + String tweetid; + + PrintWriter writer = new PrintWriter("wordcloud.csv", "UTF-8"); + //print the first row + writer.println("tweetid, word"); + + while (data.next()) { + //get the text + text = data.getString("text"); + //remove punctuation, convert to lowercase and split on words + text = removePunct(text); + text = text.toLowerCase(); + words = text.split("\\s+"); + //we use the tweetid as case id + tweetid = Long.toString(data.getLong("tweetid")); + + for (String word : words) { + writer.println(tweetid + ", " + word); + } + } + //print it in a csv file to put in disco + + //print the first row + + //print the values + writer.close(); + } + + //generate csv for disco from the query + public void disco(String query) throws SQLException, FileNotFoundException, UnsupportedEncodingException { + //do the query + query(query); + PrintWriter writer = new PrintWriter("output.csv", "UTF-8"); + //print the first row + for (int i = 1; i < data.getMetaData().getColumnCount(); i++) { + writer.print(data.getMetaData().getColumnLabel(i) + ", "); + } + writer.println(data.getMetaData().getColumnLabel(data.getMetaData().getColumnCount())); + //print the values + while (data.next()) { + for (int i = 1; i < data.getMetaData().getColumnCount(); i++) { + if (data.getObject(i) == null) { + writer.print(", "); + } else { + writer.print(data.getObject(i).toString().replaceAll("[,\n]", " ") + ", "); + } + } + if(data.getObject(data.getMetaData().getColumnCount())==null){ + writer.println("0"); + } else { + writer.println(data.getObject(data.getMetaData().getColumnCount()).toString().replace(",", " ")); + } + } + writer.close(); + } + + public void getBrands() throws SQLException{ + PreparedStatement statement; + //make a connection to the database and execute the query + statement = connection.prepareStatement("delete from mentionsbrand"); + statement.executeUpdate(); + BrandChecker checker = new BrandChecker("brandrules.txt"); + query("select * from tweet"); + NamedPreparedStatement m_insertBrand = new NamedPreparedStatement(connection, QueryUtils.insertBrand); + while(data.next()){ + for(String brand:checker.getBrands(data.getString("text"))){ + QueryUtils.setInsertBrandParams(m_insertBrand, data.getLong("tweetid"), brand); + m_insertBrand.executeUpdate(); + } + } + } + + //replaces punctuation so it will be splitted + //also removes urls + private String splitPunctToWords(String text) { + text = text.replaceAll("https?://\\S*", ""); + text = text.replaceAll("[!?):;\"']", " $0"); + text = text.replaceAll("[.,-](\\s|$)", " $0"); + text = text.replaceAll("\\s[(\"']", "$0 "); + return text; + } + + //removes punctuation + //also removes urls + private String removePunct(String text) { + text = text.replaceAll("https?://\\S*", ""); + text = text.replaceAll("[.,!?()-:;\"']", " "); + return text; + } +} diff --git a/src/main/FarmShell.java b/src/main/FarmShell.java index 044d3d3..1266fd3 100644 --- a/src/main/FarmShell.java +++ b/src/main/FarmShell.java @@ -133,6 +133,9 @@ public class FarmShell { case disco: getAnalyzor().disco(params[0]); break; + case getBrands: + getAnalyzor().getBrands(); + break; case help: for (String line : HELP) { System.out.println(line); @@ -162,6 +165,7 @@ public class FarmShell { filterbots("marks all users as bot or not", 1), sentiment("analyzes all tweets on positivity (about a brand)", 1), wordcloud("makes a wordcloud of the text of the tweets", 1), + getBrands("fills the database with the brands of a tweet"), disco("makes a outputfile for disco",1), exit("Returns to shell"), help("Get help"); -- cgit v1.2.1