Change wordcloud with added brands

author: S129778 <S129778@S129778.campus.tue.nl> 2014-05-21 16:54:34 +0200
committer: S129778 <S129778@S129778.campus.tue.nl> 2014-05-21 16:54:34 +0200
commit: edbab8bc4e3236853fbf16ed1b65a262c178a1c2 (patch)
tree: 5d3803d216ded1a5f6ac63bb87a4d50f9433169c
parent: 4831e2b08b225a30d418197844f3d71e9e0c81cb (diff)
download: Goldfarmer-edbab8bc4e3236853fbf16ed1b65a262c178a1c2.tar.gz
1 files changed, 296 insertions, 285 deletions
diff --git a/src/main/Analyzor.java b/src/main/Analyzor.java
index bbe1b5e..12a6a4c 100644
--- a/src/main/Analyzor.java
+++ b/src/main/Analyzor.java
@@ -1,285 +1,296 @@
-package main;
-
-import analysis.BrandChecker;
-import database.NamedPreparedStatement;
-import database.QueryUtils;
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.io.PrintWriter;
-import java.io.UnsupportedEncodingException;
-import java.sql.Connection;
-import java.sql.PreparedStatement;
-import java.sql.ResultSet;
-import java.sql.SQLException;
-import java.util.List;
-import java.util.HashMap;
-import java.util.Map.Entry;
-import java.util.Scanner;
-
-/**
- * The sentiment analysis class that rates tweets based on a unigram and bigram
- * set of weights.
- */
-public class Analyzor {
-
-    /**
-     * The map that matches single words to their weights.
-     */
-    private final HashMap<String, Double> unimap = new HashMap();
-
-    /**
-     * The map that matches word pairs to their weights.
-     */
-    private final HashMap<String, Double> bimap = new HashMap();
-
-    /**
-     * The results of a query, maybe return from query().
-     */
-    private ResultSet data;
-
-    /**
-     * The persistent connection to the database.
-     */
-    private final Connection connection;
-
-    /**
-     * @param connection An open connection to the database.
-     */
-    public Analyzor(Connection connection) {
-        this.connection = connection;
-    }
-
-    /**
-     * Read the unigram and bigram lexica.
-     *
-     * @throws FileNotFoundException
-     */
-    public void readLexicon() throws FileNotFoundException {
-        if (!unimap.isEmpty()) {
-            // data is already read.
-            return;
-        }
-        System.err.println("Trying to read lexicons...");
-        // A unigram is in the format (WS = whitespace):
-        // word <WS> rating <WS> ??? <WS> ??
-        // A bigram has an two WS-separated words instead of one.
-        try (Scanner uniScanner = new Scanner(new File("unigrams-pmilexicon.txt"));
-                Scanner biScanner = new Scanner(new File("bigrams-pmilexicon.txt"));) {
-            //Fill the map of unigrams
-            int lineno = 1;
-            while (uniScanner.hasNext()) {
-
-                String words = uniScanner.next();
-                Double d = Double.valueOf(uniScanner.next());
-                unimap.put(words.toLowerCase(), d);
-                if (uniScanner.hasNextLine()) {
-                    uniScanner.nextLine();
-                }
-                lineno++;
-
-            }
-
-            //fill the map of bigrams
-            while (biScanner.hasNext()) {
-                String words = biScanner.next() + " " + biScanner.next();
-                bimap.put(words.toLowerCase(), Double.valueOf(biScanner.next()));
-                if (biScanner.hasNextLine()) {
-                    biScanner.nextLine();
-                }
-            }
-        }
-        System.err.println("Lexicons are read.");
-    }
-
-    /**
-     * Executes a query that the analyzer can analyze.
-     *
-     * @param query The query string to execute.
-     * @throws SQLException When database connection isn't available.
-     */
-    public void query(String query) throws SQLException {
-        PreparedStatement statement;
-        //make a connection to the database and execute the query
-        statement = connection.prepareStatement(query);
-        data = statement.executeQuery();
-    }
-
-    /**
-     * Run a sentiment analysis and fill the database with the output.
-     *
-     * @param query The sql text for the query.
-     * @throws SQLException
-     * @throws IOException
-     */
-    public void sentimentAnalysis(String query) throws SQLException, IOException {
-        query(query);
-
-        //read the lexicons
-        readLexicon();
-
-        //go to the start of te dataset
-        if (data == null) {
-            System.err.println("data is empty, try querying first");
-            return;
-        }
-
-        Double value;
-        String text;
-
-        //for all tuples
-        while (data.next()) {
-            //get the text
-            text = data.getString("text");
-            text = splitPunctToWords(text);
-            // test is the tweet text you are going to analyze
-            String[] words = text.split("\\s+"); // text splitted into separate words
-            double positiverate = 0; // positive rating
-
-            // Rate the text with unigrams
-            for (String word : words) {
-                value = unimap.get(word);
-                if (value != null) {
-                    positiverate += unimap.get(word);
-                }
-            }
-            // Rate the text with bigrams
-            for (int i = 0; i < words.length - 1; i++) {
-                String pair = words[i] + " " + words[i + 1];
-                value = bimap.get(pair);
-                if (value != null) {
-                    positiverate += bimap.get(pair);
-                }
-            }
-            //insert the rating into the database
-            NamedPreparedStatement m_insertRating;
-            m_insertRating = new NamedPreparedStatement(connection, QueryUtils.insertRating);
-            QueryUtils.setInsertParams(m_insertRating, data.getLong("tweetid"), data.getString("brand"), (int) (positiverate * 10));
-            m_insertRating.executeUpdate();
-            //don't print the rate
-            //System.out.println(text + ": " + (int) (positiverate * 10));
-        }
-    }
-
-    /**
-     * Make a wordcloud of the results of some query.
-     *
-     * @param query The sql text for a query.
-     * @throws SQLException
-     * @throws FileNotFoundException
-     * @throws UnsupportedEncodingException
-     */
-    public void makeWordCloud(String query) throws SQLException, FileNotFoundException, UnsupportedEncodingException {
-
-        query(query);
-        //go to the start of the ResultSet data
-        if (data == null) {
-            System.err.println("data is empty, try querying first");
-            return;
-        }
-
-        String text;
-        String[] words;
-        HashMap<String, Integer> wordcloud = new HashMap<>();
-
-        while (data.next()) {
-            //get the text
-            text = data.getString("text");
-            //remove punctuation, convert to lowercase and split on words
-            text = removePunct(text);
-            text = text.toLowerCase();
-            words = text.split("\\s+");
-            
-            for (String word : words) {
-                if(wordcloud.containsKey(word)){
-                    wordcloud.put(word, wordcloud.get(word) + 1);
-                }
-                else{
-                    wordcloud.put(word, 1);
-                }
-            }
-        }
-        wordcloud.remove("");
-        //print the words and their frequency in a csv file
-        PrintWriter writer = new PrintWriter("wordcloud.csv", "UTF-8");
-        
-        writer.println("word,count");
-                
-        for(Entry e : wordcloud.entrySet()){
-            writer.println(e.getKey() + "," + e.getValue());
-        }
-        
-        writer.close();
-        System.out.println("csv file made, please put it next to wordcloud.html and run this");
-    }
-
-    //generate csv for disco from the query
-    public void disco(String query) throws SQLException, FileNotFoundException, UnsupportedEncodingException {
-        //do the query
-        query(query);
-        PrintWriter writer = new PrintWriter("output.csv", "UTF-8");
-        //print the first row
-        for (int i = 1; i < data.getMetaData().getColumnCount(); i++) {
-            writer.print(data.getMetaData().getColumnLabel(i) + ", ");
-        }
-        writer.println(data.getMetaData().getColumnLabel(data.getMetaData().getColumnCount()));
-        //print the values
-        while (data.next()) {
-            for (int i = 1; i < data.getMetaData().getColumnCount(); i++) {
-                if (data.getObject(i) == null) {
-                    writer.print(", ");
-                } else {
-                    writer.print(data.getObject(i).toString().replaceAll("[,\n]", " ") + ", ");
-                }
-            }
-            if (data.getObject(data.getMetaData().getColumnCount()) == null) {
-                writer.println("0");
-            } else {
-                writer.println(data.getObject(data.getMetaData().getColumnCount()).toString().replace(",", " "));
-            }
-        }
-        writer.close();
-    }
-
-    public void getBrands() throws SQLException {
-        PreparedStatement statement;
-        //make a connection to the database and execute the query
-        statement = connection.prepareStatement("delete from mentionsbrand");
-        statement.executeUpdate();
-        BrandChecker checker = new BrandChecker("brandrules.txt");
-        query("select * from tweet");
-        NamedPreparedStatement m_insertBrand = new NamedPreparedStatement(connection, QueryUtils.insertBrand);
-        while (data.next()) {
-            List<String> brands = checker.getBrands(data.getString("text"));
-            if (brands.isEmpty()) {
-                QueryUtils.setInsertBrandParams(m_insertBrand, data.getLong("tweetid"), "no");
-                m_insertBrand.executeUpdate();
-            } else {
-                for (String brand : brands) {
-                    QueryUtils.setInsertBrandParams(m_insertBrand, data.getLong("tweetid"), brand);
-                    m_insertBrand.executeUpdate();
-                }
-            }
-
-        }
-    }
-
-    //replaces punctuation so it will be splitted
-    //also removes urls
-    private String splitPunctToWords(String text) {
-        text = text.replaceAll("https?://\\S*", "");
-        text = text.replaceAll("[!?):;\"']", " $0");
-        text = text.replaceAll("[.,-](\\s|$)", " $0");
-        text = text.replaceAll("\\s[(\"']", "$0 ");
-        return text;
-    }
-
-    //removes punctuation
-    //also removes urls
-    private String removePunct(String text) {
-        text = text.replaceAll("https?://\\S*", " ");
-        text = text.replaceAll("@\\S*", " ");
-        text = text.replaceAll("[^a-zA-Z0-9#_-]", " ");
-        return text;
-    }
-}
+package main;
+
+import analysis.BrandChecker;
+import database.NamedPreparedStatement;
+import database.QueryUtils;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.io.UnsupportedEncodingException;
+import java.sql.Connection;
+import java.sql.PreparedStatement;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.util.List;
+import java.util.HashMap;
+import java.util.Map.Entry;
+import java.util.Scanner;
+
+/**
+ * The sentiment analysis class that rates tweets based on a unigram and bigram
+ * set of weights.
+ */
+public class Analyzor {
+
+    /**
+     * Weights of single words, keyed by the lower-cased word.
+     */
+    private final HashMap<String, Double> unimap = new HashMap<>();
+
+    /**
+     * Weights of word pairs, keyed by the lower-cased "first second" pair.
+     */
+    private final HashMap<String, Double> bimap = new HashMap<>();
+
+    /**
+     * The rows of the most recent query(); null until the first query.
+     */
+    private ResultSet data;
+
+    /**
+     * The persistent connection to the database.
+     */
+    private final Connection connection;
+
+    /**
+     * @param connection An open database connection; every statement of this analyzer runs on it.
+     */
+    public Analyzor(Connection connection) {
+        this.connection = connection;
+    }
+
+    /**
+     * Reads the unigram and bigram lexica into memory. The files are read only
+     * once: when the unigram map already has entries the call returns at once.
+     *
+     * Both files are whitespace-separated. A unigram line starts with a word
+     * and its rating, a bigram line with two words and their rating; whatever
+     * follows the rating on a line is skipped.
+     *
+     * @throws FileNotFoundException When either lexicon file cannot be opened.
+     */
+    public void readLexicon() throws FileNotFoundException {
+        if (!unimap.isEmpty()) {
+            // data is already read.
+            return;
+        }
+        System.err.println("Trying to read lexicons...");
+        try (Scanner uniScanner = new Scanner(new File("unigrams-pmilexicon.txt"));
+                Scanner biScanner = new Scanner(new File("bigrams-pmilexicon.txt"))) {
+            // Both files share one layout; only the number of leading words differs.
+            readWeights(uniScanner, 1, unimap);
+            readWeights(biScanner, 2, bimap);
+        }
+        System.err.println("Lexicons are read.");
+    }
+
+    /**
+     * Fills a map from lexicon lines of {@code wordCount} words and a rating.
+     * The words, joined by single spaces and lower-cased, form the key.
+     */
+    private static void readWeights(Scanner in, int wordCount, HashMap<String, Double> target) {
+        while (in.hasNext()) {
+            StringBuilder key = new StringBuilder(in.next());
+            for (int i = 1; i < wordCount; i++) {
+                key.append(' ').append(in.next());
+            }
+            target.put(key.toString().toLowerCase(), Double.valueOf(in.next()));
+            if (in.hasNextLine()) {
+                in.nextLine(); // skip the remaining columns of this line
+            }
+        }
+    }
+
+    /**
+     * Executes a query that the analyzer can analyze, replacing the previous one.
+     * @param query The query string to execute.
+     * @throws SQLException When the database rejects the query or the clean-up.
+     */
+    public void query(String query) throws SQLException {
+        // Release the previous statement; closing it also closes its result set.
+        if (data != null && !data.isClosed() && data.getStatement() != null) {
+            data.getStatement().close();
+        }
+        data = connection.prepareStatement(query).executeQuery();
+    }
+
+    /**
+     * Run a sentiment analysis and fill the database with the output.
+     *
+     * Each row of the query result is rated from its "text" column, and the
+     * rating, times ten and truncated to an int, is handed to the
+     * QueryUtils.insertRating statement with the row's "tweetid" and "brand".
+     *
+     * @param query The sql text for the query.
+     * @throws SQLException When the query or one of the inserts fails.
+     * @throws IOException When a lexicon file cannot be opened.
+     */
+    public void sentimentAnalysis(String query) throws SQLException, IOException {
+        query(query);
+        //read the lexicons
+        readLexicon();
+
+        // One statement serves all rows, the way getBrands() reuses its insert.
+        NamedPreparedStatement m_insertRating = new NamedPreparedStatement(connection, QueryUtils.insertRating);
+
+        //for all tuples
+        while (data.next()) {
+            double positiverate = rate(data.getString("text"));
+            //insert the rating into the database
+            QueryUtils.setInsertParams(m_insertRating, data.getLong("tweetid"), data.getString("brand"), (int) (positiverate * 10));
+            m_insertRating.executeUpdate();
+        }
+    }
+
+    /**
+     * Sums the lexicon weights of the words and adjacent word pairs of a text.
+     *
+     * @param text The tweet text to rate.
+     * @return The summed weight; 0 when nothing in the text is in the lexica.
+     */
+    private double rate(String text) {
+        // The lexicon keys are stored lower-cased, so the words must be too;
+        // otherwise a capitalised word would never find its weight.
+        String[] words = splitPunctToWords(text).toLowerCase().split("\\s+");
+        double positiverate = 0;
+        // Rate the text with unigrams
+        for (String word : words) {
+            Double value = unimap.get(word);
+            if (value != null) {
+                positiverate += value;
+            }
+        }
+        // Rate the text with bigrams
+        for (int i = 0; i < words.length - 1; i++) {
+            Double value = bimap.get(words[i] + " " + words[i + 1]);
+            if (value != null) {
+                positiverate += value;
+            }
+        }
+        return positiverate;
+    }
+
+    /**
+     * Make a wordcloud of the results of some query.
+     *
+     * Counts, per value of the "brand" column, how often each word occurs in
+     * the "text" column of the rows the query returns, and writes the counts
+     * to wordcloud.csv as "brand,word,count" lines.
+     *
+     * @param query The sql text for a query.
+     * @throws SQLException When the query fails or a row cannot be read.
+     * @throws FileNotFoundException When wordcloud.csv cannot be created.
+     * @throws UnsupportedEncodingException When UTF-8 is not supported.
+     */
+    public void makeWordCloud(String query) throws SQLException, FileNotFoundException, UnsupportedEncodingException {
+        query(query);
+
+        HashMap<String, HashMap<String, Integer>> wordcloud = new HashMap<>();
+
+        while (data.next()) {
+            //get brand
+            String brand = data.getString("brand");
+            //make hashmap for each brand
+            HashMap<String, Integer> counts = wordcloud.get(brand);
+            if (counts == null) {
+                counts = new HashMap<>();
+                wordcloud.put(brand, counts);
+            }
+            countWords(data.getString("text"), counts);
+        }
+
+        //print the words and their frequency in a csv file
+        try (PrintWriter writer = new PrintWriter("wordcloud.csv", "UTF-8")) {
+            writer.println("brand,word,count");
+            //loop over brands
+            for (Entry<String, HashMap<String, Integer>> perBrand : wordcloud.entrySet()) {
+                //loop over words
+                for (Entry<String, Integer> e : perBrand.getValue().entrySet()) {
+                    writer.println(perBrand.getKey() + "," + e.getKey() + "," + e.getValue());
+                }
+            }
+        }
+        System.out.println("csv file made, please put it next to wordcloud.html and run this");
+    }
+
+    /**
+     * Adds every word of a text to the given word counts.
+     *
+     * @param text The raw tweet text.
+     * @param counts The word counts to update, keyed by lower-cased word.
+     */
+    private void countWords(String text, HashMap<String, Integer> counts) {
+        //remove punctuation, convert to lowercase and split on words
+        String cleaned = removePunct(text).toLowerCase();
+        for (String word : cleaned.split("\\s+")) {
+            // split() yields "" for an empty text or one that starts with
+            // whitespace; that is not a word, so it is not counted.
+            if (word.isEmpty()) {
+                continue;
+            }
+            Integer seen = counts.get(word);
+            counts.put(word, seen == null ? 1 : seen + 1);
+        }
+    }
+
+    /**
+     * Writes the query result to output.csv for disco: a header of column labels,
+     * then one line per row. Commas in values (and newlines, except in the last
+     * column) become spaces; null becomes an empty field, or "0" when last.
+     *
+     * @param query The sql text for the query.
+     * @throws SQLException When the query fails or a row cannot be read.
+     */
+    public void disco(String query) throws SQLException, FileNotFoundException, UnsupportedEncodingException {
+        query(query);
+        // try-with-resources closes the file even when reading a row fails
+        try (PrintWriter writer = new PrintWriter("output.csv", "UTF-8")) {
+            int last = data.getMetaData().getColumnCount();
+            for (int i = 1; i < last; i++) {
+                writer.print(data.getMetaData().getColumnLabel(i) + ", ");
+            }
+            writer.println(data.getMetaData().getColumnLabel(last));
+            while (data.next()) {
+                for (int i = 1; i < last; i++) {
+                    Object cell = data.getObject(i);
+                    writer.print((cell == null ? "" : cell.toString().replaceAll("[,\n]", " ")) + ", ");
+                }
+                Object tail = data.getObject(last);
+                writer.println(tail == null ? "0" : tail.toString().replace(",", " "));
+            }
+        }
+    }
+
+    /**
+     * Empties mentionsbrand, then runs QueryUtils.insertBrand for every tweet and
+     * each brand the checker finds in its text, or once with "no" if it finds none.
+     */
+    public void getBrands() throws SQLException {
+        try (PreparedStatement statement = connection.prepareStatement("delete from mentionsbrand")) {
+            statement.executeUpdate(); // the statement is closed right after use
+        }
+        BrandChecker checker = new BrandChecker("brandonlyrules.txt");
+        query("select * from tweet");
+        NamedPreparedStatement m_insertBrand = new NamedPreparedStatement(connection, QueryUtils.insertBrand);
+        while (data.next()) {
+            List<String> brands = checker.getBrands(data.getString("text"));
+            if (brands.isEmpty()) {
+                brands = java.util.Collections.singletonList("no");
+            }
+            for (String brand : brands) {
+                QueryUtils.setInsertBrandParams(m_insertBrand, data.getLong("tweetid"), brand);
+                m_insertBrand.executeUpdate();
+            }
+        }
+    }
+
+    /**
+     * Removes URLs and sets punctuation apart with spaces, so that a later
+     * split on whitespace separates the punctuation from the words.
+     */
+    private String splitPunctToWords(String text) {
+        String spaced = text.replaceAll("https?://\\S*", "")
+                .replaceAll("[!?):;\"']", " $0").replaceAll("[.,-](\\s|$)", " $0");
+        return spaced.replaceAll("\\s[(\"']", "$0 ");
+    }
+
+    /**
+     * Blanks out URLs, @-mentions and every character other than ASCII
+     * letters, digits, '#', '_' and '-'.
+     */
+    private String removePunct(String text) {
+        String withoutUrls = text.replaceAll("https?://\\S*", " ");
+        return withoutUrls.replaceAll("@\\S*", " ").replaceAll("[^a-zA-Z0-9#_-]", " ");
+    }
+}
author	S129778 <S129778@S129778.campus.tue.nl>	2014-05-21 16:54:34 +0200
committer	S129778 <S129778@S129778.campus.tue.nl>	2014-05-21 16:54:34 +0200
commit	edbab8bc4e3236853fbf16ed1b65a262c178a1c2 (patch)
tree	5d3803d216ded1a5f6ac63bb87a4d50f9433169c
parent	4831e2b08b225a30d418197844f3d71e9e0c81cb (diff)
download	Goldfarmer-edbab8bc4e3236853fbf16ed1b65a262c178a1c2.tar.gz