author    Maurice Laveaux <m.laveaux@student.tue.nl>    2014-05-15 20:47:17 +0200
committer Maurice Laveaux <m.laveaux@student.tue.nl>    2014-05-15 20:47:17 +0200
commit    d7c4a4ddb0b0fe43e5b02f1748c811a1249dd172 (patch)
tree      cf011338210c048670eb967b7cdc5adc80b8a01b /src
parent    f94162b0c8e6a7b7bd62087f14fcb1c646a6fe84 (diff)
download  Goldfarmer-d7c4a4ddb0b0fe43e5b02f1748c811a1249dd172.tar.gz
Added first version of BrandChecker
TODO:
 * Not yet finalized, need to search through ,.;'/[] etc.
 * Maybe implement searching for words one after the other, first htc then one.
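For context, the rules file that BrandChecker reads is not part of this commit. Based on parseRule below, each line is expected to hold a whitespace-separated word sequence, optionally followed by '-' and a list of blacklisted words. Two purely hypothetical example lines:

    htc one
    samsung galaxy - tab gear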
Diffstat (limited to 'src')
-rw-r--r--  src/analysis/BrandChecker.java  140
-rw-r--r--  src/main/Analyzor.java          402
2 files changed, 334 insertions, 208 deletions
diff --git a/src/analysis/BrandChecker.java b/src/analysis/BrandChecker.java
index 6b57a39..740424c 100644
--- a/src/analysis/BrandChecker.java
+++ b/src/analysis/BrandChecker.java
@@ -17,45 +17,151 @@ package analysis;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
import java.util.List;
-import java.util.logging.Level;
-import java.util.logging.Logger;
+import java.util.Locale;
+import java.util.Scanner;
+import java.util.Set;
/**
* This class obtains a text and returns the brands that are contained in this
- * text.
- *
+ * text. The input file contains lines with [brandname] [extensions] -
+ * [blacklisted words].
+ *
* @author Maurice Laveaux
*/
public class BrandChecker {
-
+
+ /**
+ * A set of rules that determine the brands.
+ */
+ private final ArrayList<BrandRule> ruleset = new ArrayList<>();
+
/**
- * @param filename The filename that contains all the brands.
+ * @param filename The filename that contains all the rules.
*/
- public BrandChecker(final String filename) {
+ public BrandChecker(final String filename) {
try {
readFile(filename);
} catch (FileNotFoundException ex) {
- Logger.getLogger(BrandChecker.class.getName()).log(Level.SEVERE, null, ex);
+ throw new IllegalArgumentException("File named " + filename + " not found.", ex);
}
}
-
+
/**
* Get the brands that are in some text.
- *
+ *
* @param text Any valid text.
- * @return The list of brands that are contained in this text or null.
+ * @return The list of brands contained in this text, possibly empty.
*/
public List<String> getBrands(String text) {
-
-
-
- return null;
+ String[] words = text.toLowerCase().split("\\s+");
+
+ List<String> brands = new ArrayList<>();
+
+ for (BrandRule rule : ruleset) {
+ if (rule.analyze(words)) {
+ brands.add(rule.getBrand());
+ }
+ }
+
+ return brands;
}
-
+
private void readFile(final String filename) throws FileNotFoundException {
-
InputStream inFile = new FileInputStream(filename);
-
+ Scanner readFile = new Scanner(inFile);
+
+ while (readFile.hasNextLine()) {
+ String line = readFile.nextLine();
+
+ parseRule(line.toLowerCase(Locale.ENGLISH));
+ }
+ }
+
+ private void parseRule(String line) {
+ if (line.isEmpty()) {
+ return;
+ }
+
+ if (!line.contains("-")) {
+ // only positive search entries.
+ String[] sequence = line.split("\\s+");
+ String[] blacklist = {};
+ ruleset.add(new BrandRule(line, sequence, blacklist));
+ } else {
+ String[] parts = line.split("-");
+ // positive and negative.
+ if (parts.length < 2) {
+ throw new IllegalArgumentException("Brand rule contained '-' but not two parts.");
+ }
+
+ String[] sequence = parts[0].trim().split("\\s+");
+ String[] blacklist = parts[1].trim().split("\\s+");
+
+ ruleset.add(new BrandRule(parts[0].trim(), sequence, blacklist));
+ }
+
+ }
+
+ private class BrandRule {
+
+ /**
+ * The words that should be in the text.
+ */
+ private final ArrayList<String> names;
+
+ /**
+ * A blacklist of words that are not interesting.
+ */
+ private final Set<String> blacklist;
+
+ /**
+ * The brand name of this rule.
+ */
+ private final String brand;
+
+ /**
+ *
+ * @param brandname The brand name of this rule.
+ * @param names The sequence of words that must all occur in the text.
+ * @param blacklist The blacklisted words.
+ */
+ public BrandRule(final String brandname, final String[] names, final String[] blacklist) {
+ this.brand = brandname;
+ this.names = new ArrayList<>(Arrays.asList(names));
+ this.blacklist = new HashSet<>(Arrays.asList(blacklist));
+ }
+
+ /**
+ * Analyzes whether this rule holds for some text.
+ *
+ * @param words A list of words in a line.
+ * @return true if all rule words occur and no blacklisted word occurs.
+ */
+ public boolean analyze(String[] words) {
+ int found = 0;
+
+ for (String word : words) {
+ if (blacklist.contains(word)) {
+ return false;
+ }
+
+ if (names.contains(word)) {
+ found++;
+ }
+ }
+
+ return found == names.size();
+ }
+
+ public String getBrand() {
+ return brand;
+ }
}
}
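A minimal usage sketch of the new class, assuming a rules file in the hypothetical format shown above is available as brands.txt; the file name, the class name BrandCheckerDemo, and the tweet text are illustrative only and not part of this commit:

    import analysis.BrandChecker;
    import java.util.List;

    public class BrandCheckerDemo {
        public static void main(String[] args) {
            // Throws IllegalArgumentException when the rules file cannot be found.
            BrandChecker checker = new BrandChecker("brands.txt");

            // Matching is lower-cased and whitespace-separated, so "HTC One"
            // matches a rule whose sequence is "htc one".
            List<String> brands = checker.getBrands("the new HTC One is great");
            System.out.println(brands); // e.g. [htc one]
        }
    }

Note that punctuation attached to a word ("One!") would currently prevent a match, which is exactly the first TODO item in the commit message.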
diff --git a/src/main/Analyzor.java b/src/main/Analyzor.java
index 9be1101..2dc482b 100644
--- a/src/main/Analyzor.java
+++ b/src/main/Analyzor.java
@@ -1,191 +1,211 @@
-package main;
-
-import database.NamedPreparedStatement;
-import database.QueryUtils;
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.sql.Connection;
-import java.sql.PreparedStatement;
-import java.sql.ResultSet;
-import java.sql.SQLException;
-import java.util.HashMap;
-import java.util.Scanner;
-
-/**
- * The sentiment analysis class that rates tweets based on a unigram and bigram
- * set of weights.
- */
-public class Analyzor {
-
- /**
- * The map that matches single words to their weights.
- */
- private final HashMap<String, Double> unimap = new HashMap();
-
- /**
- * The map that matches word pairs to their weights.
- */
- private final HashMap<String, Double> bimap = new HashMap();
-
- private ResultSet data;
- private final Connection connection;
-
- Analyzor(Connection connection) {
- this.connection = connection;
- }
-
- //reads the lexicons
- void readLexicon() throws FileNotFoundException {
- if (!unimap.isEmpty()) {
- // data is already read.
- return;
- }
- // A unigram is in the format (WS = whitespace):
- // word <WS> rating <WS> ??? <WS> ??
- // A bigram has an two WS-separated words instead of one.
- try (Scanner uniScanner = new Scanner("unigrams-pmilexicon.txt");
- Scanner biScanner = new Scanner("bigrams-pmilexicon.txt");) {
- //Fill the map of unigrams
- while (uniScanner.hasNext()) {
- String words = uniScanner.next();
- unimap.put(words.toLowerCase(), uniScanner.nextDouble());
- if (uniScanner.hasNextLine()) {
- uniScanner.nextLine();
- }
- }
-
- //fill the map of bigrams
- while (biScanner.hasNext()) {
- String words = biScanner.next() + " " + biScanner.next();
- bimap.put(words.toLowerCase(), biScanner.nextDouble());
- if (biScanner.hasNextLine()) {
- biScanner.nextLine();
- }
- }
- }
- }
-
- /**
- * Executes a query that the analyzer can analyze.
- *
- * @param query The query string to execute.
- * @throws SQLException When database connection isn't available.
- */
- public void query(String query) throws SQLException {
- PreparedStatement statement;
- //make a connection to the database and execute the query
- statement = connection.prepareStatement(query);
- data = statement.executeQuery();
- }
-
- /**
- * Run a sentiment analysis and fill the database with the output.
- *
- * @throws SQLException
- * @throws IOException
- */
- public void sentimentAnalysis(String query) throws SQLException, IOException {
- query(query);
-
- //read the lexicons
- readLexicon();
-
- //go to the start of te dataset
- if (data == null) {
- System.err.println("data is empty, try querying first");
- return;
- }
- data.beforeFirst();
-
- Double value;
- String text;
-
- //for all tuples
- while (data.next()) {
- //get the text
- text = data.getString("text");
- text = splitPunctToWords(text);
- // test is the tweet text you are going to analyze
- String[] words = text.split("\\s+"); // text splitted into separate words
- double positiverate = 0; // positive rating
-
- // Rate the text with unigrams
- for (String word : words) {
- value = unimap.get(word);
- if (value != null) {
- positiverate += unimap.get(word);
- }
- }
- // Rate the text with bigrams
- for (int i = 0; i < words.length - 1; i++) {
- String pair = words[i] + " " + words[i + 1];
- value = bimap.get(pair);
- if (value != null) {
- positiverate += bimap.get(pair);
- }
- }
- //insert the rating into the database
- NamedPreparedStatement m_insertRating;
- m_insertRating = new NamedPreparedStatement(connection, QueryUtils.insertRating);
- QueryUtils.setInsertParams(m_insertRating, data.getLong("tweetid"), data.getString("brand"), (int) (positiverate * 10));
- m_insertRating.executeUpdate();
- //don't print the rate
- //System.out.println(text + ": " + (int) (positiverate * 10));
- }
- }
-
- //makes a wordcloud of the tweets in the ResultSet data
- void makeWordCloud(String query) throws SQLException {
-
- query(query);
- //go to the start of the ResultSet data
- if (data == null) {
- System.err.println("data is empty, try querying first");
- return;
- }
-
- //make the hashmap with the words and their frequency
- HashMap<String, Integer> wordcloud = new HashMap<>();
-
- String text;
- String[] words;
- Integer value;
-
- while (data.next()) {
- //get the text
- text = data.getString("text");
- //remove punctuation, convert to lowercase and split on words
- text = removePunct(text);
- text = text.toLowerCase();
- words = text.split("\\s+");
-
- //count the words
- for (String word : words) {
- value = wordcloud.get(word);
- if (value == null) {
- wordcloud.put(word, 1);
- } else {
- wordcloud.put(word, value++);
- }
- }
- }
- }
-
- //replaces punctuation so it will be splitted
- //also removes urls
- private String splitPunctToWords(String text) {
- text = text.replaceAll("https?://\\S*", "");
- text = text.replaceAll("[!?):;\"']", " $0");
- text = text.replaceAll("[.,-](\\s|$)", " $0");
- text = text.replaceAll("\\s[(\"']", "$0 ");
- return text;
- }
-
- //removes punctuation
- //also removes urls
- private String removePunct(String text) {
- text = text.replaceAll("https?://\\S*", "");
- text = text.replaceAll("[.,!?()-:;\"']", " ");
- return text;
- }
-}
+package main;
+
+import analysis.BrandChecker;
+import database.NamedPreparedStatement;
+import database.QueryUtils;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.sql.Connection;
+import java.sql.PreparedStatement;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Scanner;
+
+/**
+ * The sentiment analysis class that rates tweets based on a unigram and bigram
+ * set of weights.
+ */
+public class Analyzor {
+
+ /**
+ * The map that matches single words to their weights.
+ */
+ private final HashMap<String, Double> unimap = new HashMap<>();
+
+ /**
+ * The map that matches word pairs to their weights.
+ */
+ private final HashMap<String, Double> bimap = new HashMap<>();
+
+ /**
+ * The results of a query.
+ */
+ private ResultSet data;
+
+ /**
+ * The connection to the database.
+ */
+ private final Connection connection;
+
+ /**
+ * Used to determine the brands inside a tweet.
+ */
+ private final BrandChecker brandChecker = new BrandChecker("brands.txt");
+
+ Analyzor(Connection connection) {
+ this.connection = connection;
+ }
+
+ //reads the lexicons
+ void readLexicon() throws FileNotFoundException {
+ if (!unimap.isEmpty()) {
+ // data is already read.
+ return;
+ }
+ // A unigram is in the format (WS = whitespace):
+ // word <WS> rating <WS> ??? <WS> ??
+ // A bigram has two WS-separated words instead of one.
+ try (Scanner uniScanner = new Scanner(new File("unigrams-pmilexicon.txt"));
+ Scanner biScanner = new Scanner(new File("bigrams-pmilexicon.txt"))) {
+ //Fill the map of unigrams
+ while (uniScanner.hasNext()) {
+ String words = uniScanner.next();
+ unimap.put(words.toLowerCase(), uniScanner.nextDouble());
+ if (uniScanner.hasNextLine()) {
+ uniScanner.nextLine();
+ }
+ }
+
+ //fill the map of bigrams
+ while (biScanner.hasNext()) {
+ String words = biScanner.next() + " " + biScanner.next();
+ bimap.put(words.toLowerCase(), biScanner.nextDouble());
+ if (biScanner.hasNextLine()) {
+ biScanner.nextLine();
+ }
+ }
+ }
+ }
+
+ /**
+ * Executes a query that the analyzer can analyze.
+ *
+ * @param query The query string to execute.
+ * @throws SQLException When database connection isn't available.
+ */
+ public void query(String query) throws SQLException {
+ PreparedStatement statement;
+ //make a connection to the database and execute the query
+ statement = connection.prepareStatement(query);
+ data = statement.executeQuery();
+ }
+
+ /**
+ * Run a sentiment analysis and fill the database with the output.
+ *
+ * @param query The query to analyze
+ * @throws SQLException When the database connection is not available.
+ * @throws IOException When the lexicon files cannot be read.
+ */
+ public void sentimentAnalysis(String query) throws SQLException, IOException {
+ query(query);
+
+ //read the lexicons
+ readLexicon();
+
+ //go to the start of the dataset
+ if (data == null) {
+ System.err.println("data is empty, try querying first");
+ return;
+ }
+ data.beforeFirst();
+
+ Double value;
+ String text;
+
+ //for all tuples
+ while (data.next()) {
+ //get the text
+ text = data.getString("text");
+ text = splitPunctToWords(text);
+ // text is the tweet text you are going to analyze
+ String[] words = text.split("\\s+"); // text split into separate words
+ double positiverate = 0; // positive rating
+
+ // Rate the text with unigrams
+ for (String word : words) {
+ value = unimap.get(word);
+ if (value != null) {
+ positiverate += value;
+ }
+ }
+ // Rate the text with bigrams
+ for (int i = 0; i < words.length - 1; i++) {
+ String pair = words[i] + " " + words[i + 1];
+ value = bimap.get(pair);
+ if (value != null) {
+ positiverate += value;
+ }
+ }
+ // Obtain the brands contained in a tweet text.
+ //List<String> brands = brandChecker.getBrands(text);
+
+ // insert the rating into the database
+ NamedPreparedStatement m_insertRating;
+ m_insertRating = new NamedPreparedStatement(connection, QueryUtils.insertRating);
+ QueryUtils.setInsertParams(m_insertRating, data.getLong("tweetid"), data.getString("brand"), (int) (positiverate * 10));
+ m_insertRating.executeUpdate();
+
+
+ //don't print the rate
+ //System.out.println(text + ": " + (int) (positiverate * 10));
+ }
+ }
+
+ //makes a wordcloud of the tweets in the ResultSet data
+ public void makeWordCloud(String query) throws SQLException {
+
+ query(query);
+ //go to the start of the ResultSet data
+ if (data == null) {
+ System.err.println("data is empty, try querying first");
+ return;
+ }
+
+ //make the hashmap with the words and their frequency
+ HashMap<String, Integer> wordcloud = new HashMap<>();
+
+ String text;
+ String[] words;
+ Integer value;
+
+ while (data.next()) {
+ //get the text
+ text = data.getString("text");
+ //remove punctuation, convert to lowercase and split on words
+ text = removePunct(text);
+ text = text.toLowerCase();
+ words = text.split("\\s+");
+
+ //count the words
+ for (String word : words) {
+ value = wordcloud.get(word);
+ if (value == null) {
+ wordcloud.put(word, 1);
+ } else {
+ wordcloud.put(word, value + 1);
+ }
+ }
+ }
+ }
+
+ //separates punctuation from words so the text can be split on whitespace
+ //also removes urls
+ private String splitPunctToWords(String text) {
+ text = text.replaceAll("https?://\\S*", "");
+ text = text.replaceAll("[!?):;\"']", " $0");
+ text = text.replaceAll("[.,-](\\s|$)", " $0");
+ text = text.replaceAll("\\s[(\"']", "$0 ");
+ return text;
+ }
+
+ //removes punctuation
+ //also removes urls
+ private String removePunct(String text) {
+ text = text.replaceAll("https?://\\S*", "");
+ text = text.replaceAll("[.,!?()-:;\"']", " ");
+ return text;
+ }
+}
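A side note on the counting loop in makeWordCloud: if Java 8 is available, the explicit null check in the get/put pattern can be dropped by using Map.merge. A minimal sketch; the sample words are placeholders, not taken from the data set:

    import java.util.HashMap;

    public class WordCountSketch {
        public static void main(String[] args) {
            String[] words = {"htc", "one", "htc"};

            HashMap<String, Integer> wordcloud = new HashMap<>();
            for (String word : words) {
                // Adds 1 to the current count, starting from 1 for unseen words.
                wordcloud.merge(word, 1, Integer::sum);
            }

            System.out.println(wordcloud); // e.g. {one=1, htc=2} (order not guaranteed)
        }
    }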