package main;

import analysis.BrandChecker;
import database.NamedPreparedStatement;
import database.QueryUtils;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.ResultSetMetaData;
import java.sql.SQLException;
import java.util.HashMap;
import java.util.List;
import java.util.Map.Entry;
import java.util.Scanner;

/**
 * The sentiment analysis class that rates tweets based on a unigram and
 * bigram set of weights.
 */
public class Analyzor {

    /**
     * The map that matches single words to their weights.
     */
    private final HashMap<String, Double> unimap = new HashMap<>();

    /**
     * The map that matches word pairs to their weights.
     */
    private final HashMap<String, Double> bimap = new HashMap<>();

    /**
     * The result of the most recent query, set by query().
     */
    private ResultSet data;

    /**
     * The persistent connection to the database.
     */
    private final Connection connection;

    /**
     * @param connection An open connection to the database.
     */
    public Analyzor(Connection connection) {
        this.connection = connection;
    }

    /**
     * Reads the unigram and bigram lexica.
     *
     * @throws FileNotFoundException When a lexicon file cannot be found.
     */
    public void readLexicon() throws FileNotFoundException {
        if (!unimap.isEmpty()) {
            // The lexica have already been read.
            return;
        }
        System.err.println("Trying to read lexicons...");
        // A unigram line is a word and its rating, separated by whitespace;
        // any further fields on the line are ignored.
        // A bigram line has two whitespace-separated words instead of one.
        try (Scanner uniScanner = new Scanner(new File("unigrams-pmilexicon.txt"));
                Scanner biScanner = new Scanner(new File("bigrams-pmilexicon.txt"))) {
            // Fill the map of unigrams.
            while (uniScanner.hasNext()) {
                String word = uniScanner.next();
                Double rating = Double.valueOf(uniScanner.next());
                unimap.put(word.toLowerCase(), rating);
                if (uniScanner.hasNextLine()) {
                    uniScanner.nextLine();
                }
            }
            // Fill the map of bigrams.
            while (biScanner.hasNext()) {
                String words = biScanner.next() + " " + biScanner.next();
                bimap.put(words.toLowerCase(), Double.valueOf(biScanner.next()));
                if (biScanner.hasNextLine()) {
                    biScanner.nextLine();
                }
            }
        }
        System.err.println("Lexicons are read.");
    }

    /**
     * Executes a query whose result the analyzer can process.
     *
     * @param query The query string to execute.
     * @throws SQLException When the database connection is not available or
     * the query fails.
     */
    public void query(String query) throws SQLException {
        PreparedStatement statement;
        // Prepare the query on the database connection and execute it.
        statement = connection.prepareStatement(query);
        data = statement.executeQuery();
    }
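    /*
     * Worked example of the rating computed in sentimentAnalysis() below (a
     * sketch with hypothetical lexicon weights, not values taken from the
     * actual lexicon files): for the text "i love this phone", with unigram
     * weights love = 1.6 and phone = 0.4 and bigram weight "love this" = 0.9,
     * the rating is 1.6 + 0.4 + 0.9 = 2.9, and (int) (2.9 * 10) = 29 is the
     * value stored in the database.
     */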
    /**
     * Runs a sentiment analysis and fills the database with the output.
     *
     * @param query The SQL text for the query.
     * @throws SQLException When the database cannot be read or written.
     * @throws IOException When the lexicon files cannot be read.
     */
    public void sentimentAnalysis(String query) throws SQLException, IOException {
        query(query);
        // Read the lexicons.
        readLexicon();
        // Make sure a query result is available.
        if (data == null) {
            System.err.println("data is empty, try querying first");
            return;
        }
        // One insert statement is reused for all tuples.
        NamedPreparedStatement m_insertRating =
                new NamedPreparedStatement(connection, QueryUtils.insertRating);
        Double value;
        String text;
        // For all tuples...
        while (data.next()) {
            // Get the tweet text and split it into separate words.
            text = data.getString("text");
            text = splitPunctToWords(text);
            String[] words = text.split("\\s+");
            double positiverate = 0; // The sentiment rating of this tweet.
            // Rate the text with unigrams.
            for (String word : words) {
                value = unimap.get(word);
                if (value != null) {
                    positiverate += value;
                }
            }
            // Rate the text with bigrams.
            for (int i = 0; i < words.length - 1; i++) {
                String pair = words[i] + " " + words[i + 1];
                value = bimap.get(pair);
                if (value != null) {
                    positiverate += value;
                }
            }
            // Insert the rating into the database.
            QueryUtils.setInsertParams(m_insertRating, data.getLong("tweetid"),
                    data.getString("brand"), (int) (positiverate * 10));
            m_insertRating.executeUpdate();
            // Don't print the rate.
            //System.out.println(text + ": " + (int) (positiverate * 10));
        }
    }

    /**
     * Makes a word cloud of the results of some query.
     *
     * @param query The SQL text for the query.
     * @throws SQLException When the database cannot be read.
     * @throws FileNotFoundException When the output file cannot be created.
     * @throws UnsupportedEncodingException When UTF-8 is not supported.
     */
    public void makeWordCloud(String query) throws SQLException,
            FileNotFoundException, UnsupportedEncodingException {
        query(query);
        // Make sure a query result is available.
        if (data == null) {
            System.err.println("data is empty, try querying first");
            return;
        }
        String text;
        String[] words;
        HashMap<String, Integer> wordcloud = new HashMap<>();
        while (data.next()) {
            // Get the text, remove punctuation, convert to lower case and
            // split it into words.
            text = data.getString("text");
            text = removePunct(text);
            text = text.toLowerCase();
            words = text.split("\\s+");
            // Count the occurrences of every word.
            for (String word : words) {
                if (wordcloud.containsKey(word)) {
                    wordcloud.put(word, wordcloud.get(word) + 1);
                } else {
                    wordcloud.put(word, 1);
                }
            }
        }
        // Print the words and their frequencies to a CSV file.
        try (PrintWriter writer = new PrintWriter("wordcloud.csv", "UTF-8")) {
            writer.println("word,count");
            for (Entry<String, Integer> e : wordcloud.entrySet()) {
                writer.println(e.getKey() + ", " + e.getValue());
            }
        }
        System.out.println("csv file made, please put it next to wordcloud.html and run this");
    }

    /**
     * Generates a CSV file for disco from the results of a query.
     *
     * @param query The SQL text for the query.
     * @throws SQLException When the database cannot be read.
     * @throws FileNotFoundException When the output file cannot be created.
     * @throws UnsupportedEncodingException When UTF-8 is not supported.
     */
    public void disco(String query) throws SQLException, FileNotFoundException,
            UnsupportedEncodingException {
        // Execute the query.
        query(query);
        ResultSetMetaData metadata = data.getMetaData();
        int columnCount = metadata.getColumnCount();
        try (PrintWriter writer = new PrintWriter("output.csv", "UTF-8")) {
            // Print the header row with the column labels.
            for (int i = 1; i < columnCount; i++) {
                writer.print(metadata.getColumnLabel(i) + ", ");
            }
            writer.println(metadata.getColumnLabel(columnCount));
            // Print the values, one row per tuple.
            while (data.next()) {
                for (int i = 1; i < columnCount; i++) {
                    if (data.getObject(i) == null) {
                        writer.print(", ");
                    } else {
                        writer.print(data.getObject(i).toString()
                                .replaceAll("[,\n]", " ") + ", ");
                    }
                }
                // The last column ends the row instead of getting a separator.
                if (data.getObject(columnCount) == null) {
                    writer.println("0");
                } else {
                    writer.println(data.getObject(columnCount).toString()
                            .replace(",", " "));
                }
            }
        }
    }
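    /*
     * Example of the brand tagging done in getBrands() below (the brand names
     * are hypothetical; the real ones depend on brandrules.txt): a tweet whose
     * text matches the rules for both "iphone" and "galaxy" gets two rows in
     * mentionsbrand, one per brand, while a tweet that matches no rule gets a
     * single row with the placeholder brand "no".
     */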
    /**
     * Tags every tweet with the brands it mentions and stores the result in
     * the mentionsbrand table.
     *
     * @throws SQLException When the database cannot be read or written.
     */
    public void getBrands() throws SQLException {
        // Empty the mentionsbrand table before refilling it.
        PreparedStatement statement = connection.prepareStatement("delete from mentionsbrand");
        statement.executeUpdate();
        BrandChecker checker = new BrandChecker("brandrules.txt");
        query("select * from tweet");
        NamedPreparedStatement m_insertBrand =
                new NamedPreparedStatement(connection, QueryUtils.insertBrand);
        while (data.next()) {
            List<String> brands = checker.getBrands(data.getString("text"));
            if (brands.isEmpty()) {
                // No brand found: store the tweet with the placeholder "no".
                QueryUtils.setInsertBrandParams(m_insertBrand, data.getLong("tweetid"), "no");
                m_insertBrand.executeUpdate();
            } else {
                // Store one row per detected brand.
                for (String brand : brands) {
                    QueryUtils.setInsertBrandParams(m_insertBrand, data.getLong("tweetid"), brand);
                    m_insertBrand.executeUpdate();
                }
            }
        }
    }

    /**
     * Inserts whitespace around punctuation so it is split into separate
     * tokens, and removes URLs.
     */
    private String splitPunctToWords(String text) {
        text = text.replaceAll("https?://\\S*", "");
        text = text.replaceAll("[!?):;\"']", " $0");
        text = text.replaceAll("[.,-](\\s|$)", " $0");
        text = text.replaceAll("\\s[(\"']", "$0 ");
        return text;
    }

    /**
     * Removes punctuation, URLs and non-ASCII characters.
     */
    private String removePunct(String text) {
        text = text.replaceAll("https?://\\S*", "");
        text = text.replaceAll("[.,!?();\"'-]", " ");
        text = text.replaceAll("[^\\x00-\\x7F]", " ");
        return text;
    }
}
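/*
 * Minimal usage sketch, not part of the original code. Everything below that
 * is not defined in this file is an assumption: the JDBC URL and the query
 * strings are placeholders, and the lexicon files plus brandrules.txt are
 * expected in the working directory.
 */
class AnalyzorExample {

    public static void main(String[] args) throws Exception {
        // Open a database connection; the URL is a placeholder.
        try (java.sql.Connection connection = java.sql.DriverManager
                .getConnection("jdbc:postgresql://localhost/tweets")) {
            Analyzor analyzor = new Analyzor(connection);
            // Tag every tweet with the brands it mentions (fills mentionsbrand).
            analyzor.getBrands();
            // Rate the tagged tweets; the query must return tweetid, brand and text.
            analyzor.sentimentAnalysis("select t.tweetid, m.brand, t.text "
                    + "from tweet t join mentionsbrand m on m.tweetid = t.tweetid");
            // Write word frequencies for the word cloud to wordcloud.csv.
            analyzor.makeWordCloud("select text from tweet");
        }
    }
}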