author    Maurice Laveaux <m.laveaux@student.tue.nl>    2014-05-15 20:47:17 +0200
committer Maurice Laveaux <m.laveaux@student.tue.nl>    2014-05-15 20:47:17 +0200
commit    d7c4a4ddb0b0fe43e5b02f1748c811a1249dd172 (patch)
tree      cf011338210c048670eb967b7cdc5adc80b8a01b /src
parent    f94162b0c8e6a7b7bd62087f14fcb1c646a6fe84 (diff)
download  Goldfarmer-d7c4a4ddb0b0fe43e5b02f1748c811a1249dd172.tar.gz
Added first version of BrandChecker
TODO:
 * Not yet finalized, need to search through ,.;'/[] etc.
 * Maybe implement searching for words one after the other, first htc then one.
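For context, the rules file that BrandChecker reads is not part of this commit. Based on parseRule below, each line is expected to hold a whitespace-separated word sequence, optionally followed by '-' and a list of blacklisted words. Two purely hypothetical example lines:

    htc one
    samsung galaxy - tab gear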
Diffstat (limited to 'src')
-rw-r--r--  src/analysis/BrandChecker.java  140
-rw-r--r--  src/main/Analyzor.java          402
2 files changed, 334 insertions, 208 deletions
diff --git a/src/analysis/BrandChecker.java b/src/analysis/BrandChecker.java
index 6b57a39..740424c 100644
--- a/src/analysis/BrandChecker.java
+++ b/src/analysis/BrandChecker.java
@@ -17,45 +17,151 @@ package analysis;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
import java.util.List;
-import java.util.logging.Level;
-import java.util.logging.Logger;
+import java.util.Locale;
+import java.util.Scanner;
+import java.util.Set;
/**
* This class obtains a text and returns the brands that are contained in this
- * text.
- *
+ * text. The input file contains lines with [brandname] [extensions] -
+ * [blacklisted words].
+ *
* @author Maurice Laveaux
*/
public class BrandChecker {
-
+
+ /**
+ * A set of rules that determine the brands.
+ */
+ private final ArrayList<BrandRule> ruleset = new ArrayList<>();
+
/**
- * @param filename The filename that contains all the brands.
+ * @param filename The filename that contains all the rules.
*/
- public BrandChecker(final String filename) {
+ public BrandChecker(final String filename) {
try {
readFile(filename);
} catch (FileNotFoundException ex) {
- Logger.getLogger(BrandChecker.class.getName()).log(Level.SEVERE, null, ex);
+ throw new IllegalArgumentException("File named " + filename + " not found.", ex);
}
}
-
+
/**
* Get the brands that are in some text.
- *
+ *
* @param text Any valid text.
- * @return The list of brands that are contained in this text or null.
+ * @return The list of brands contained in this text, possibly empty.
*/
public List<String> getBrands(String text) {
-
-
-
- return null;
+ String[] words = text.toLowerCase().split("\\s+");
+
+ List<String> brands = new ArrayList<>();
+
+ for (BrandRule rule : ruleset) {
+ if (rule.analyze(words)) {
+ brands.add(rule.getBrand());
+ }
+ }
+
+ return brands;
}
-
+
private void readFile(final String filename) throws FileNotFoundException {
-
InputStream inFile = new FileInputStream(filename);
-
+ Scanner readFile = new Scanner(inFile);
+
+ while (readFile.hasNextLine()) {
+ String line = readFile.nextLine();
+
+ parseRule(line.toLowerCase(Locale.ENGLISH));
+ }
+ }
+
+ private void parseRule(String line) {
+ if (line.isEmpty()) {
+ return;
+ }
+
+ if (!line.contains("-")) {
+ // only positive search entries.
+ String[] sequence = line.split("\\s+");
+ String[] blacklist = {};
+ ruleset.add(new BrandRule(line, sequence, blacklist));
+ } else {
+ String[] parts = line.split("-");
+ // positive and negative.
+ if (parts.length < 2) {
+ throw new IllegalArgumentException("Brand rule contained '-' but not two parts.");
+ }
+
+ String[] sequence = parts[0].trim().split("\\s+");
+ String[] blacklist = parts[1].trim().split("\\s+");
+
+ ruleset.add(new BrandRule(parts[0].trim(), sequence, blacklist));
+ }
+
+ }
+
+ private class BrandRule {
+
+ /**
+ * The words that should be in the text.
+ */
+ private final ArrayList<String> names;
+
+ /**
+ * A blacklist of words that are not interesting.
+ */
+ private final Set<String> blacklist;
+
+ /**
+ * The brand name of this rule.
+ */
+ private final String brand;
+
+ /**
+ *
+ * @param brandname The brand name of this rule.
+ * @param names The sequence of words that must all occur in the text.
+ * @param blacklist The blacklisted words.
+ */
+ public BrandRule(final String brandname, final String[] names, final String[] blacklist) {
+ this.brand = brandname;
+ this.names = new ArrayList<>(Arrays.asList(names));
+ this.blacklist = new HashSet<>(Arrays.asList(blacklist));
+ }
+
+ /**
+ * Analyzes whether this rule holds for some text.
+ *
+ * @param words A list of words in a line.
+ * @return true if all rule words occur and no blacklisted word occurs.
+ */
+ public boolean analyze(String[] words) {
+ int found = 0;
+
+ for (String word : words) {
+ if (blacklist.contains(word)) {
+ return false;
+ }
+
+ if (names.contains(word)) {
+ found++;
+ }
+ }
+
+ return found == names.size();
+ }
+
+ public String getBrand() {
+ return brand;
+ }
}
}
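A minimal usage sketch of the new class, assuming a rules file in the hypothetical format shown above is available as brands.txt; the file name, the class name BrandCheckerDemo, and the tweet text are illustrative only and not part of this commit:

    import analysis.BrandChecker;
    import java.util.List;

    public class BrandCheckerDemo {
        public static void main(String[] args) {
            // Throws IllegalArgumentException when the rules file cannot be found.
            BrandChecker checker = new BrandChecker("brands.txt");

            // Matching is lower-cased and whitespace-separated, so "HTC One"
            // matches a rule whose sequence is "htc one".
            List<String> brands = checker.getBrands("the new HTC One is great");
            System.out.println(brands); // e.g. [htc one]
        }
    }

Note that punctuation attached to a word ("One!") would currently prevent a match, which is exactly the first TODO item in the commit message.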
diff --git a/src/main/Analyzor.java b/src/main/Analyzor.java
index 9be1101..2dc482b 100644
--- a/src/main/Analyzor.java
+++ b/src/main/Analyzor.java
@@ -1,191 +1,211 @@
-package main;
-
-import database.NamedPreparedStatement;
-import database.QueryUtils;
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.sql.Connection;
-import java.sql.PreparedStatement;
-import java.sql.ResultSet;
-import java.sql.SQLException;
-import java.util.HashMap;
-import java.util.Scanner;
-
-/**
- * The sentiment analysis class that rates tweets based on a unigram and bigram
- * set of weights.
- */
-public class Analyzor {
-
- /**
- * The map that matches single words to their weights.
- */
- private final HashMap<String, Double> unimap = new HashMap();
-
- /**
- * The map that matches word pairs to their weights.
- */
- private final HashMap<String, Double> bimap = new HashMap();
-
- private ResultSet data;
- private final Connection connection;
-
- Analyzor(Connection connection) {
- this.connection = connection;
- }
-
- //reads the lexicons
- void readLexicon() throws FileNotFoundException {
- if (!unimap.isEmpty()) {
- // data is already read.
- return;
- }
- // A unigram is in the format (WS = whitespace):
- // word <WS> rating <WS> ??? <WS> ??
- // A bigram has an two WS-separated words instead of one.
- try (Scanner uniScanner = new Scanner("unigrams-pmilexicon.txt");
- Scanner biScanner = new Scanner("bigrams-pmilexicon.txt");) {
- //Fill the map of unigrams
- while (uniScanner.hasNext()) {
- String words = uniScanner.next();
- unimap.put(words.toLowerCase(), uniScanner.nextDouble());
- if (uniScanner.hasNextLine()) {
- uniScanner.nextLine();
- }
- }
-
- //fill the map of bigrams
- while (biScanner.hasNext()) {
- String words = biScanner.next() + " " + biScanner.next();
- bimap.put(words.toLowerCase(), biScanner.nextDouble());
- if (biScanner.hasNextLine()) {
- biScanner.nextLine();
- }
- }
- }
- }
-
- /**
- * Executes a query that the analyzer can analyze.
- *
- * @param query The query string to execute.
- * @throws SQLException When database connection isn't available.
- */
- public void query(String query) throws SQLException {
- PreparedStatement statement;
- //make a connection to the database and execute the query
- statement = connection.prepareStatement(query);
- data = statement.executeQuery();
- }
-
- /**
- * Run a sentiment analysis and fill the database with the output.
- *
- * @throws SQLException
- * @throws IOException
- */
- public void sentimentAnalysis(String query) throws SQLException, IOException {
- query(query);
-
- //read the lexicons
- readLexicon();
-
- //go to the start of te dataset
- if (data == null) {
- System.err.println("data is empty, try querying first");
- return;
- }
- data.beforeFirst();
-
- Double value;
- String text;
-
- //for all tuples
- while (data.next()) {
- //get the text
- text = data.getString("text");
- text = splitPunctToWords(text);
- // test is the tweet text you are going to analyze
- String[] words = text.split("\\s+"); // text splitted into separate words
- double positiverate = 0; // positive rating
-
- // Rate the text with unigrams
- for (String word : words) {
- value = unimap.get(word);
- if (value != null) {
- positiverate += unimap.get(word);
- }
- }
- // Rate the text with bigrams
- for (int i = 0; i < words.length - 1; i++) {
- String pair = words[i] + " " + words[i + 1];
- value = bimap.get(pair);
- if (value != null) {
- positiverate += bimap.get(pair);
- }
- }
- //insert the rating into the database
- NamedPreparedStatement m_insertRating;
- m_insertRating = new NamedPreparedStatement(connection, QueryUtils.insertRating);
- QueryUtils.setInsertParams(m_insertRating, data.getLong("tweetid"), data.getString("brand"), (int) (positiverate * 10));
- m_insertRating.executeUpdate();
- //don't print the rate
- //System.out.println(text + ": " + (int) (positiverate * 10));
- }
- }
-
- //makes a wordcloud of the tweets in the ResultSet data
- void makeWordCloud(String query) throws SQLException {
-
- query(query);
- //go to the start of the ResultSet data
- if (data == null) {
- System.err.println("data is empty, try querying first");
- return;
- }
-
- //make the hashmap with the words and their frequency
- HashMap<String, Integer> wordcloud = new HashMap<>();
-
- String text;
- String[] words;
- Integer value;
-
- while (data.next()) {
- //get the text
- text = data.getString("text");
- //remove punctuation, convert to lowercase and split on words
- text = removePunct(text);
- text = text.toLowerCase();
- words = text.split("\\s+");
-
- //count the words
- for (String word : words) {
- value = wordcloud.get(word);
- if (value == null) {
- wordcloud.put(word, 1);
- } else {
- wordcloud.put(word, value++);
- }
- }
- }
- }
-
- //replaces punctuation so it will be splitted
- //also removes urls
- private String splitPunctToWords(String text) {
- text = text.replaceAll("https?://\\S*", "");
- text = text.replaceAll("[!?):;\"']", " $0");
- text = text.replaceAll("[.,-](\\s|$)", " $0");
- text = text.replaceAll("\\s[(\"']", "$0 ");
- return text;
- }
-
- //removes punctuation
- //also removes urls
- private String removePunct(String text) {
- text = text.replaceAll("https?://\\S*", "");
- text = text.replaceAll("[.,!?()-:;\"']", " ");
- return text;
- }
-}
+package main;
+
+import analysis.BrandChecker;
+import database.NamedPreparedStatement;
+import database.QueryUtils;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.sql.Connection;
+import java.sql.PreparedStatement;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Scanner;
+
+/**
+ * The sentiment analysis class that rates tweets based on a unigram and bigram
+ * set of weights.
+ */
+public class Analyzor {
+
+ /**
+ * The map that matches single words to their weights.
+ */
+ private final HashMap<String, Double> unimap = new HashMap<>();
+
+ /**
+ * The map that matches word pairs to their weights.
+ */
+ private final HashMap<String, Double> bimap = new HashMap<>();
+
+ /**
+ * The results of a query.
+ */
+ private ResultSet data;
+
+ /**
+ * The connection to the database.
+ */
+ private final Connection connection;
+
+ /**
+ * Used to determine the brands inside a tweet.
+ */
+ private final BrandChecker brandChecker = new BrandChecker("brands.txt");
+
+ Analyzor(Connection connection) {
+ this.connection = connection;
+ }
+
+ //reads the lexicons
+ void readLexicon() throws FileNotFoundException {
+ if (!unimap.isEmpty()) {
+ // data is already read.
+ return;
+ }
+ // A unigram is in the format (WS = whitespace):
+ // word <WS> rating <WS> ??? <WS> ??
+ // A bigram has two WS-separated words instead of one.
+ try (Scanner uniScanner = new Scanner(new File("unigrams-pmilexicon.txt"));
+ Scanner biScanner = new Scanner(new File("bigrams-pmilexicon.txt"))) {
+ //Fill the map of unigrams
+ while (uniScanner.hasNext()) {
+ String words = uniScanner.next();
+ unimap.put(words.toLowerCase(), uniScanner.nextDouble());
+ if (uniScanner.hasNextLine()) {
+ uniScanner.nextLine();
+ }
+ }
+
+ //fill the map of bigrams
+ while (biScanner.hasNext()) {
+ String words = biScanner.next() + " " + biScanner.next();
+ bimap.put(words.toLowerCase(), biScanner.nextDouble());
+ if (biScanner.hasNextLine()) {
+ biScanner.nextLine();
+ }
+ }
+ }
+ }
+
+ /**
+ * Executes a query that the analyzer can analyze.
+ *
+ * @param query The query string to execute.
+ * @throws SQLException When database connection isn't available.
+ */
+ public void query(String query) throws SQLException {
+ PreparedStatement statement;
+ //make a connection to the database and execute the query
+ statement = connection.prepareStatement(query);
+ data = statement.executeQuery();
+ }
+
+ /**
+ * Run a sentiment analysis and fill the database with the output.
+ *
+ * @param query The query to analyze
+ * @throws SQLException When the database connection is not available.
+ * @throws IOException When the lexicon files cannot be read.
+ */
+ public void sentimentAnalysis(String query) throws SQLException, IOException {
+ query(query);
+
+ //read the lexicons
+ readLexicon();
+
+ //go to the start of the dataset
+ if (data == null) {
+ System.err.println("data is empty, try querying first");
+ return;
+ }
+ data.beforeFirst();
+
+ Double value;
+ String text;
+
+ //for all tuples
+ while (data.next()) {
+ //get the text
+ text = data.getString("text");
+ text = splitPunctToWords(text);
+ // text is the tweet text you are going to analyze
+ String[] words = text.split("\\s+"); // text split into separate words
+ double positiverate = 0; // positive rating
+
+ // Rate the text with unigrams
+ for (String word : words) {
+ value = unimap.get(word);
+ if (value != null) {
+ positiverate += value;
+ }
+ }
+ // Rate the text with bigrams
+ for (int i = 0; i < words.length - 1; i++) {
+ String pair = words[i] + " " + words[i + 1];
+ value = bimap.get(pair);
+ if (value != null) {
+ positiverate += value;
+ }
+ }
+ // Obtain the brands contained in a tweet text.
+ //List<String> brands = brandChecker.getBrands(text);
+
+ // insert the rating into the database
+ NamedPreparedStatement m_insertRating;
+ m_insertRating = new NamedPreparedStatement(connection, QueryUtils.insertRating);
+ QueryUtils.setInsertParams(m_insertRating, data.getLong("tweetid"), data.getString("brand"), (int) (positiverate * 10));
+ m_insertRating.executeUpdate();
+
+
+ //don't print the rate
+ //System.out.println(text + ": " + (int) (positiverate * 10));
+ }
+ }
+
+ //makes a wordcloud of the tweets in the ResultSet data
+ public void makeWordCloud(String query) throws SQLException {
+
+ query(query);
+ //go to the start of the ResultSet data
+ if (data == null) {
+ System.err.println("data is empty, try querying first");
+ return;
+ }
+
+ //make the hashmap with the words and their frequency
+ HashMap<String, Integer> wordcloud = new HashMap<>();
+
+ String text;
+ String[] words;
+ Integer value;
+
+ while (data.next()) {
+ //get the text
+ text = data.getString("text");
+ //remove punctuation, convert to lowercase and split on words
+ text = removePunct(text);
+ text = text.toLowerCase();
+ words = text.split("\\s+");
+
+ //count the words
+ for (String word : words) {
+ value = wordcloud.get(word);
+ if (value == null) {
+ wordcloud.put(word, 1);
+ } else {
+ wordcloud.put(word, value + 1);
+ }
+ }
+ }
+ }
+
+ //separates punctuation from words so the text can be split on whitespace
+ //also removes urls
+ private String splitPunctToWords(String text) {
+ text = text.replaceAll("https?://\\S*", "");
+ text = text.replaceAll("[!?):;\"']", " $0");
+ text = text.replaceAll("[.,-](\\s|$)", " $0");
+ text = text.replaceAll("\\s[(\"']", "$0 ");
+ return text;
+ }
+
+ //removes punctuation
+ //also removes urls
+ private String removePunct(String text) {
+ text = text.replaceAll("https?://\\S*", "");
+ text = text.replaceAll("[.,!?()-:;\"']", " ");
+ return text;
+ }
+}
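A side note on the counting loop in makeWordCloud: if Java 8 is available, the explicit null check in the get/put pattern can be dropped by using Map.merge. A minimal sketch; the sample words are placeholders, not taken from the data set:

    import java.util.HashMap;

    public class WordCountSketch {
        public static void main(String[] args) {
            String[] words = {"htc", "one", "htc"};

            HashMap<String, Integer> wordcloud = new HashMap<>();
            for (String word : words) {
                // Adds 1 to the current count, starting from 1 for unseen words.
                wordcloud.merge(word, 1, Integer::sum);
            }

            System.out.println(wordcloud); // e.g. {one=1, htc=2} (order not guaranteed)
        }
    }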