diff options
author | Maurice Laveaux <m.laveaux@student.tue.nl> | 2014-05-15 20:47:17 +0200 |
---|---|---|
committer | Maurice Laveaux <m.laveaux@student.tue.nl> | 2014-05-15 20:47:17 +0200 |
commit | d7c4a4ddb0b0fe43e5b02f1748c811a1249dd172 (patch) | |
tree | cf011338210c048670eb967b7cdc5adc80b8a01b | |
parent | f94162b0c8e6a7b7bd62087f14fcb1c646a6fe84 (diff) | |
download | Goldfarmer-d7c4a4ddb0b0fe43e5b02f1748c811a1249dd172.tar.gz |
Added first version of BrandChecker
TODO * Not yet finalized, need to search through ,.;'/[] etc.
* Maybe implement sequential matching, e.g. first "htc" then "one".
-rwxr-xr-x | brandrules.txt | 54 | ||||
-rw-r--r-- | src/analysis/BrandChecker.java | 140 | ||||
-rw-r--r-- | src/main/Analyzor.java | 402 |
3 files changed, 388 insertions, 208 deletions
diff --git a/brandrules.txt b/brandrules.txt new file mode 100755 index 0000000..ae7d6f9 --- /dev/null +++ b/brandrules.txt @@ -0,0 +1,54 @@ +galaxy s5 - tablet tab
+galaxy s4 - tablet tab
+galaxy s3
+galaxy K zoom
+galaxy note
+samsung note
+samsung zoom
+
+iphone 5
+iphone 5c
+iphone 5s
+iphone 4
+iphone 4s
+
+huawei ascend
+huawei p6
+huawei p7
+huawei mini
+huawei y300
+huawei y530
+huawei mate
+huawei g700
+huawei g510
+huawei g6
+huawei g525
+
+sony xperia
+sony L
+sony E1
+sony Z
+sony Z1
+sony Z2
+sony compact
+sony ZR
+sony M
+
+HTC one - phone
+HTC M8
+HTC mini
+HTC desire
+HTC X dual
+HTC SV
+
+LG Nexus 5
+LG G2
+LG L70
+LG L90
+LG L40
+LG G flex
+LG mini
+LG L9
+LG L7
+LG L5
+LG L3
diff --git a/src/analysis/BrandChecker.java b/src/analysis/BrandChecker.java index 6b57a39..740424c 100644 --- a/src/analysis/BrandChecker.java +++ b/src/analysis/BrandChecker.java @@ -17,45 +17,151 @@ package analysis; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.InputStream; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; import java.util.List; -import java.util.logging.Level; -import java.util.logging.Logger; +import java.util.Locale; +import java.util.Scanner; +import java.util.Set; /** * This class obtains a text and returns the brands that are contained in this - * text. - * + * text. The input file contains lines with [brandname] [extensions] | + * [blacklisted words]. + * * @author Maurice Laveaux */ public class BrandChecker { - + + /** + * A set of rules that determine the brands. + */ + private final ArrayList<BrandRule> ruleset = new ArrayList(); + /** - * @param filename The filename that contains all the brands. + * @param filename The filename that contains all the rules. */ - public BrandChecker(final String filename) { + public BrandChecker(final String filename) { try { readFile(filename); } catch (FileNotFoundException ex) { - Logger.getLogger(BrandChecker.class.getName()).log(Level.SEVERE, null, ex); + throw new IllegalArgumentException("file named " + filename + " not found."); } } - + /** * Get the brands that are in some text. - * + * * @param text Any valid text. * @return The list of brands that are contained in this text or null. 
*/ public List<String> getBrands(String text) { - - - - return null; + String[] words = text.toLowerCase().split("\\s+"); + + List<String> brands = new ArrayList(); + + for (BrandRule rule : ruleset) { + if (rule.analyze(words)) { + brands.add(rule.getBrand()); + } + } + + return brands; } - + private void readFile(final String filename) throws FileNotFoundException { - InputStream inFile = new FileInputStream(filename); - + Scanner readFile = new Scanner(inFile); + + while (readFile.hasNextLine()) { + String line = readFile.nextLine(); + + parseRule(line.toLowerCase(Locale.ENGLISH)); + } + } + + private void parseRule(String line) { + if (line.isEmpty()) { + return; + } + + if (!line.contains("-")) { + // only positive search entries. + String[] sequence = line.split("\\s+"); + String[] blacklist = {""}; + ruleset.add(new BrandRule(line, sequence, blacklist)); + } else { + String[] parts = line.split("-"); + // positive and negative. + if (parts.length < 2) { + throw new IllegalArgumentException("Brand rule contained '-' but not two parts."); + } + + String[] sequence = parts[0].trim().split("\\s+"); + String[] blacklist = parts[1].trim().split("\\s+"); + + ruleset.add(new BrandRule(parts[0].trim(), sequence, blacklist)); + } + + } + + private class BrandRule { + + /** + * The words that should be in the text. + */ + private final ArrayList<String> names; + + /** + * A blacklist of words that are not interesting. + */ + private final Set<String> blacklist; + + /** + * The brand name of this rule. + */ + private final String brand; + + /** + * + * @param brand The brand of this rule. + * @param sequential The sequence of strings to obtain. + * @param blacklist The blacklisted words. 
+ */ + public BrandRule(final String brandname, final String[] names, final String[] blacklist) { + this.brand = brandname; + this.names = new ArrayList(Arrays.asList(names)); + this.blacklist = new HashSet(Arrays.asList(blacklist)); + } + + /** + * Analyzes if this rule is holds for some text. + * + * @param words A list of words in a line. + */ + public boolean analyze(String[] words) { + int found = 0; + + for (String word : words) { + if (blacklist.contains(word)) { + return false; + } + + if (names.contains(word)) { + found++; + } + } + + if (found == names.size()) { + return true; + } + + return false; + } + + public String getBrand() { + return brand; + } } } diff --git a/src/main/Analyzor.java b/src/main/Analyzor.java index 9be1101..2dc482b 100644 --- a/src/main/Analyzor.java +++ b/src/main/Analyzor.java @@ -1,191 +1,211 @@ -package main; - -import database.NamedPreparedStatement; -import database.QueryUtils; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.sql.Connection; -import java.sql.PreparedStatement; -import java.sql.ResultSet; -import java.sql.SQLException; -import java.util.HashMap; -import java.util.Scanner; - -/** - * The sentiment analysis class that rates tweets based on a unigram and bigram - * set of weights. - */ -public class Analyzor { - - /** - * The map that matches single words to their weights. - */ - private final HashMap<String, Double> unimap = new HashMap(); - - /** - * The map that matches word pairs to their weights. - */ - private final HashMap<String, Double> bimap = new HashMap(); - - private ResultSet data; - private final Connection connection; - - Analyzor(Connection connection) { - this.connection = connection; - } - - //reads the lexicons - void readLexicon() throws FileNotFoundException { - if (!unimap.isEmpty()) { - // data is already read. - return; - } - // A unigram is in the format (WS = whitespace): - // word <WS> rating <WS> ??? <WS> ?? 
- // A bigram has an two WS-separated words instead of one. - try (Scanner uniScanner = new Scanner("unigrams-pmilexicon.txt"); - Scanner biScanner = new Scanner("bigrams-pmilexicon.txt");) { - //Fill the map of unigrams - while (uniScanner.hasNext()) { - String words = uniScanner.next(); - unimap.put(words.toLowerCase(), uniScanner.nextDouble()); - if (uniScanner.hasNextLine()) { - uniScanner.nextLine(); - } - } - - //fill the map of bigrams - while (biScanner.hasNext()) { - String words = biScanner.next() + " " + biScanner.next(); - bimap.put(words.toLowerCase(), biScanner.nextDouble()); - if (biScanner.hasNextLine()) { - biScanner.nextLine(); - } - } - } - } - - /** - * Executes a query that the analyzer can analyze. - * - * @param query The query string to execute. - * @throws SQLException When database connection isn't available. - */ - public void query(String query) throws SQLException { - PreparedStatement statement; - //make a connection to the database and execute the query - statement = connection.prepareStatement(query); - data = statement.executeQuery(); - } - - /** - * Run a sentiment analysis and fill the database with the output. 
- * - * @throws SQLException - * @throws IOException - */ - public void sentimentAnalysis(String query) throws SQLException, IOException { - query(query); - - //read the lexicons - readLexicon(); - - //go to the start of te dataset - if (data == null) { - System.err.println("data is empty, try querying first"); - return; - } - data.beforeFirst(); - - Double value; - String text; - - //for all tuples - while (data.next()) { - //get the text - text = data.getString("text"); - text = splitPunctToWords(text); - // test is the tweet text you are going to analyze - String[] words = text.split("\\s+"); // text splitted into separate words - double positiverate = 0; // positive rating - - // Rate the text with unigrams - for (String word : words) { - value = unimap.get(word); - if (value != null) { - positiverate += unimap.get(word); - } - } - // Rate the text with bigrams - for (int i = 0; i < words.length - 1; i++) { - String pair = words[i] + " " + words[i + 1]; - value = bimap.get(pair); - if (value != null) { - positiverate += bimap.get(pair); - } - } - //insert the rating into the database - NamedPreparedStatement m_insertRating; - m_insertRating = new NamedPreparedStatement(connection, QueryUtils.insertRating); - QueryUtils.setInsertParams(m_insertRating, data.getLong("tweetid"), data.getString("brand"), (int) (positiverate * 10)); - m_insertRating.executeUpdate(); - //don't print the rate - //System.out.println(text + ": " + (int) (positiverate * 10)); - } - } - - //makes a wordcloud of the tweets in the ResultSet data - void makeWordCloud(String query) throws SQLException { - - query(query); - //go to the start of the ResultSet data - if (data == null) { - System.err.println("data is empty, try querying first"); - return; - } - - //make the hashmap with the words and their frequency - HashMap<String, Integer> wordcloud = new HashMap<>(); - - String text; - String[] words; - Integer value; - - while (data.next()) { - //get the text - text = data.getString("text"); 
- //remove punctuation, convert to lowercase and split on words - text = removePunct(text); - text = text.toLowerCase(); - words = text.split("\\s+"); - - //count the words - for (String word : words) { - value = wordcloud.get(word); - if (value == null) { - wordcloud.put(word, 1); - } else { - wordcloud.put(word, value++); - } - } - } - } - - //replaces punctuation so it will be splitted - //also removes urls - private String splitPunctToWords(String text) { - text = text.replaceAll("https?://\\S*", ""); - text = text.replaceAll("[!?):;\"']", " $0"); - text = text.replaceAll("[.,-](\\s|$)", " $0"); - text = text.replaceAll("\\s[(\"']", "$0 "); - return text; - } - - //removes punctuation - //also removes urls - private String removePunct(String text) { - text = text.replaceAll("https?://\\S*", ""); - text = text.replaceAll("[.,!?()-:;\"']", " "); - return text; - } -} +package main;
+
import analysis.BrandChecker;
import database.NamedPreparedStatement;
import database.QueryUtils;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.HashMap;
import java.util.List;
import java.util.Scanner;
+
+/**
+ * The sentiment analysis class that rates tweets based on a unigram and bigram
+ * set of weights.
+ */
+public class Analyzor {
+
+ /**
+ * The map that matches single words to their weights.
+ */
+ private final HashMap<String, Double> unimap = new HashMap();
+
+ /**
+ * The map that matches word pairs to their weights.
+ */
+ private final HashMap<String, Double> bimap = new HashMap();
+
+ /**
+ * The results of a query.
+ */
+ private ResultSet data;
+
+ /**
+ * The connection to the database.
+ */
+ private final Connection connection;
+
+ /**
+ * Used to determine the brands inside a tweet.
+ */
+ private final BrandChecker brandChecker = new BrandChecker("brands.txt");
+
+ Analyzor(Connection connection) {
+ this.connection = connection;
+ }
+
+ //reads the lexicons
+ void readLexicon() throws FileNotFoundException {
+ if (!unimap.isEmpty()) {
+ // data is already read.
+ return;
+ }
+ // A unigram is in the format (WS = whitespace):
+ // word <WS> rating <WS> ??? <WS> ??
+ // A bigram has an two WS-separated words instead of one.
+ try (Scanner uniScanner = new Scanner("unigrams-pmilexicon.txt");
+ Scanner biScanner = new Scanner("bigrams-pmilexicon.txt");) {
+ //Fill the map of unigrams
+ while (uniScanner.hasNext()) {
+ String words = uniScanner.next();
+ unimap.put(words.toLowerCase(), uniScanner.nextDouble());
+ if (uniScanner.hasNextLine()) {
+ uniScanner.nextLine();
+ }
+ }
+
+ //fill the map of bigrams
+ while (biScanner.hasNext()) {
+ String words = biScanner.next() + " " + biScanner.next();
+ bimap.put(words.toLowerCase(), biScanner.nextDouble());
+ if (biScanner.hasNextLine()) {
+ biScanner.nextLine();
+ }
+ }
+ }
+ }
+
+ /**
+ * Executes a query that the analyzer can analyze.
+ *
+ * @param query The query string to execute.
+ * @throws SQLException When database connection isn't available.
+ */
+ public void query(String query) throws SQLException {
+ PreparedStatement statement;
+ //make a connection to the database and execute the query
+ statement = connection.prepareStatement(query);
+ data = statement.executeQuery();
+ }
+
+ /**
+ * Run a sentiment analysis and fill the database with the output.
+ *
+ * @param query The query to analyze
+ * @throws SQLException
+ * @throws IOException
+ */
+ public void sentimentAnalysis(String query) throws SQLException, IOException {
+ query(query);
+
+ //read the lexicons
+ readLexicon();
+
+ //go to the start of te dataset
+ if (data == null) {
+ System.err.println("data is empty, try querying first");
+ return;
+ }
+ data.beforeFirst();
+
+ Double value;
+ String text;
+
+ //for all tuples
+ while (data.next()) {
+ //get the text
+ text = data.getString("text");
+ text = splitPunctToWords(text);
+ // test is the tweet text you are going to analyze
+ String[] words = text.split("\\s+"); // text splitted into separate words
+ double positiverate = 0; // positive rating
+
+ // Rate the text with unigrams
+ for (String word : words) {
+ value = unimap.get(word);
+ if (value != null) {
+ positiverate += unimap.get(word);
+ }
+ }
+ // Rate the text with bigrams
+ for (int i = 0; i < words.length - 1; i++) {
+ String pair = words[i] + " " + words[i + 1];
+ value = bimap.get(pair);
+ if (value != null) {
+ positiverate += bimap.get(pair);
+ }
+ }
+ // Obtain the brands contained in a tweet text.
+ //List<String> brands = brandChecker.getBrands(text);
+
+ // insert the rating into the database
+ NamedPreparedStatement m_insertRating;
+ m_insertRating = new NamedPreparedStatement(connection, QueryUtils.insertRating);
+ QueryUtils.setInsertParams(m_insertRating, data.getLong("tweetid"), data.getString("brand"), (int) (positiverate * 10));
+ m_insertRating.executeUpdate();
+
+
+ //don't print the rate
+ //System.out.println(text + ": " + (int) (positiverate * 10));
+ }
+ }
+
+ //makes a wordcloud of the tweets in the ResultSet data
+ public void makeWordCloud(String query) throws SQLException {
+
+ query(query);
+ //go to the start of the ResultSet data
+ if (data == null) {
+ System.err.println("data is empty, try querying first");
+ return;
+ }
+
+ //make the hashmap with the words and their frequency
+ HashMap<String, Integer> wordcloud = new HashMap<>();
+
+ String text;
+ String[] words;
+ Integer value;
+
+ while (data.next()) {
+ //get the text
+ text = data.getString("text");
+ //remove punctuation, convert to lowercase and split on words
+ text = removePunct(text);
+ text = text.toLowerCase();
+ words = text.split("\\s+");
+
+ //count the words
+ for (String word : words) {
+ value = wordcloud.get(word);
+ if (value == null) {
+ wordcloud.put(word, 1);
+ } else {
+ wordcloud.put(word, value++);
+ }
+ }
+ }
+ }
+
+ //replaces punctuation so it will be splitted
+ //also removes urls
+ private String splitPunctToWords(String text) {
+ text = text.replaceAll("https?://\\S*", "");
+ text = text.replaceAll("[!?):;\"']", " $0");
+ text = text.replaceAll("[.,-](\\s|$)", " $0");
+ text = text.replaceAll("\\s[(\"']", "$0 ");
+ return text;
+ }
+
+ //removes punctuation
+ //also removes urls
+ private String removePunct(String text) {
+ text = text.replaceAll("https?://\\S*", "");
+ text = text.replaceAll("[.,!?()-:;\"']", " ");
+ return text;
+ }
+}
|