From d7c4a4ddb0b0fe43e5b02f1748c811a1249dd172 Mon Sep 17 00:00:00 2001 From: Maurice Laveaux Date: Thu, 15 May 2014 20:47:17 +0200 Subject: Added first version of BrandChecker TODO * Not yet finalized, need to search through ,.;'/[] etc. * Maybe implement searching after eachother, first htc then one. --- brandrules.txt | 54 ++++++ src/analysis/BrandChecker.java | 140 ++++++++++++-- src/main/Analyzor.java | 402 +++++++++++++++++++++-------------------- 3 files changed, 388 insertions(+), 208 deletions(-) create mode 100755 brandrules.txt diff --git a/brandrules.txt b/brandrules.txt new file mode 100755 index 0000000..ae7d6f9 --- /dev/null +++ b/brandrules.txt @@ -0,0 +1,54 @@ +galaxy s5 - tablet tab +galaxy s4 - tablet tab +galaxy s3 +galaxy K zoom +galaxy note +samsung note +samsung zoom + +iphone 5 +iphone 5c +iphone 5s +iphone 4 +iphone 4s + +huawei ascend +huawei p6 +huawei p7 +huawei mini +huawei y300 +huawei y530 +huawei mate +huawei g700 +huawei g510 +huawei g6 +huawei g525 + +sony xperia +sony L +sony E1 +sony Z +sony Z1 +sony Z2 +sony compact +sony ZR +sony M + +HTC one - phone +HTC M8 +HTC mini +HTC desire +HTC X dual +HTC SV + +LG Nexus 5 +LG G2 +LG L70 +LG L90 +LG L40 +LG G flex +LG mini +LG L9 +LG L7 +LG L5 +LG L3 diff --git a/src/analysis/BrandChecker.java b/src/analysis/BrandChecker.java index 6b57a39..740424c 100644 --- a/src/analysis/BrandChecker.java +++ b/src/analysis/BrandChecker.java @@ -17,45 +17,151 @@ package analysis; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.InputStream; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; import java.util.List; -import java.util.logging.Level; -import java.util.logging.Logger; +import java.util.Locale; +import java.util.Scanner; +import java.util.Set; /** * This class obtains a text and returns the brands that are contained in this - * text. - * + * text. 
The input file contains lines with [brandname] [extensions] | + * [blacklisted words]. + * * @author Maurice Laveaux */ public class BrandChecker { - + + /** + * A set of rules that determine the brands. + */ + private final ArrayList ruleset = new ArrayList(); + /** - * @param filename The filename that contains all the brands. + * @param filename The filename that contains all the rules. */ - public BrandChecker(final String filename) { + public BrandChecker(final String filename) { try { readFile(filename); } catch (FileNotFoundException ex) { - Logger.getLogger(BrandChecker.class.getName()).log(Level.SEVERE, null, ex); + throw new IllegalArgumentException("file named " + filename + " not found."); } } - + /** * Get the brands that are in some text. - * + * * @param text Any valid text. * @return The list of brands that are contained in this text or null. */ public List getBrands(String text) { - - - - return null; + String[] words = text.toLowerCase().split("\\s+"); + + List brands = new ArrayList(); + + for (BrandRule rule : ruleset) { + if (rule.analyze(words)) { + brands.add(rule.getBrand()); + } + } + + return brands; } - + private void readFile(final String filename) throws FileNotFoundException { - InputStream inFile = new FileInputStream(filename); - + Scanner readFile = new Scanner(inFile); + + while (readFile.hasNextLine()) { + String line = readFile.nextLine(); + + parseRule(line.toLowerCase(Locale.ENGLISH)); + } + } + + private void parseRule(String line) { + if (line.isEmpty()) { + return; + } + + if (!line.contains("-")) { + // only positive search entries. + String[] sequence = line.split("\\s+"); + String[] blacklist = {""}; + ruleset.add(new BrandRule(line, sequence, blacklist)); + } else { + String[] parts = line.split("-"); + // positive and negative. 
+ if (parts.length < 2) { + throw new IllegalArgumentException("Brand rule contained '-' but not two parts."); + } + + String[] sequence = parts[0].trim().split("\\s+"); + String[] blacklist = parts[1].trim().split("\\s+"); + + ruleset.add(new BrandRule(parts[0].trim(), sequence, blacklist)); + } + + } + + private class BrandRule { + + /** + * The words that should be in the text. + */ + private final ArrayList names; + + /** + * A blacklist of words that are not interesting. + */ + private final Set blacklist; + + /** + * The brand name of this rule. + */ + private final String brand; + + /** + * + * @param brandname The brand of this rule. + * @param names The sequence of strings to obtain. + * @param blacklist The blacklisted words. + */ + public BrandRule(final String brandname, final String[] names, final String[] blacklist) { + this.brand = brandname; + this.names = new ArrayList(Arrays.asList(names)); + this.blacklist = new HashSet(Arrays.asList(blacklist)); + } + + /** + * Analyzes whether this rule holds for some text. + * + * @param words A list of words in a line. 
+ */ + public boolean analyze(String[] words) { + int found = 0; + + for (String word : words) { + if (blacklist.contains(word)) { + return false; + } + + if (names.contains(word)) { + found++; + } + } + + if (found == names.size()) { + return true; + } + + return false; + } + + public String getBrand() { + return brand; + } } } diff --git a/src/main/Analyzor.java b/src/main/Analyzor.java index 9be1101..2dc482b 100644 --- a/src/main/Analyzor.java +++ b/src/main/Analyzor.java @@ -1,191 +1,211 @@ -package main; - -import database.NamedPreparedStatement; -import database.QueryUtils; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.sql.Connection; -import java.sql.PreparedStatement; -import java.sql.ResultSet; -import java.sql.SQLException; -import java.util.HashMap; -import java.util.Scanner; - -/** - * The sentiment analysis class that rates tweets based on a unigram and bigram - * set of weights. - */ -public class Analyzor { - - /** - * The map that matches single words to their weights. - */ - private final HashMap unimap = new HashMap(); - - /** - * The map that matches word pairs to their weights. - */ - private final HashMap bimap = new HashMap(); - - private ResultSet data; - private final Connection connection; - - Analyzor(Connection connection) { - this.connection = connection; - } - - //reads the lexicons - void readLexicon() throws FileNotFoundException { - if (!unimap.isEmpty()) { - // data is already read. - return; - } - // A unigram is in the format (WS = whitespace): - // word rating ??? ?? - // A bigram has an two WS-separated words instead of one. 
- try (Scanner uniScanner = new Scanner("unigrams-pmilexicon.txt"); - Scanner biScanner = new Scanner("bigrams-pmilexicon.txt");) { - //Fill the map of unigrams - while (uniScanner.hasNext()) { - String words = uniScanner.next(); - unimap.put(words.toLowerCase(), uniScanner.nextDouble()); - if (uniScanner.hasNextLine()) { - uniScanner.nextLine(); - } - } - - //fill the map of bigrams - while (biScanner.hasNext()) { - String words = biScanner.next() + " " + biScanner.next(); - bimap.put(words.toLowerCase(), biScanner.nextDouble()); - if (biScanner.hasNextLine()) { - biScanner.nextLine(); - } - } - } - } - - /** - * Executes a query that the analyzer can analyze. - * - * @param query The query string to execute. - * @throws SQLException When database connection isn't available. - */ - public void query(String query) throws SQLException { - PreparedStatement statement; - //make a connection to the database and execute the query - statement = connection.prepareStatement(query); - data = statement.executeQuery(); - } - - /** - * Run a sentiment analysis and fill the database with the output. 
- * - * @throws SQLException - * @throws IOException - */ - public void sentimentAnalysis(String query) throws SQLException, IOException { - query(query); - - //read the lexicons - readLexicon(); - - //go to the start of te dataset - if (data == null) { - System.err.println("data is empty, try querying first"); - return; - } - data.beforeFirst(); - - Double value; - String text; - - //for all tuples - while (data.next()) { - //get the text - text = data.getString("text"); - text = splitPunctToWords(text); - // test is the tweet text you are going to analyze - String[] words = text.split("\\s+"); // text splitted into separate words - double positiverate = 0; // positive rating - - // Rate the text with unigrams - for (String word : words) { - value = unimap.get(word); - if (value != null) { - positiverate += unimap.get(word); - } - } - // Rate the text with bigrams - for (int i = 0; i < words.length - 1; i++) { - String pair = words[i] + " " + words[i + 1]; - value = bimap.get(pair); - if (value != null) { - positiverate += bimap.get(pair); - } - } - //insert the rating into the database - NamedPreparedStatement m_insertRating; - m_insertRating = new NamedPreparedStatement(connection, QueryUtils.insertRating); - QueryUtils.setInsertParams(m_insertRating, data.getLong("tweetid"), data.getString("brand"), (int) (positiverate * 10)); - m_insertRating.executeUpdate(); - //don't print the rate - //System.out.println(text + ": " + (int) (positiverate * 10)); - } - } - - //makes a wordcloud of the tweets in the ResultSet data - void makeWordCloud(String query) throws SQLException { - - query(query); - //go to the start of the ResultSet data - if (data == null) { - System.err.println("data is empty, try querying first"); - return; - } - - //make the hashmap with the words and their frequency - HashMap wordcloud = new HashMap<>(); - - String text; - String[] words; - Integer value; - - while (data.next()) { - //get the text - text = data.getString("text"); - //remove 
punctuation, convert to lowercase and split on words - text = removePunct(text); - text = text.toLowerCase(); - words = text.split("\\s+"); - - //count the words - for (String word : words) { - value = wordcloud.get(word); - if (value == null) { - wordcloud.put(word, 1); - } else { - wordcloud.put(word, value++); - } - } - } - } - - //replaces punctuation so it will be splitted - //also removes urls - private String splitPunctToWords(String text) { - text = text.replaceAll("https?://\\S*", ""); - text = text.replaceAll("[!?):;\"']", " $0"); - text = text.replaceAll("[.,-](\\s|$)", " $0"); - text = text.replaceAll("\\s[(\"']", "$0 "); - return text; - } - - //removes punctuation - //also removes urls - private String removePunct(String text) { - text = text.replaceAll("https?://\\S*", ""); - text = text.replaceAll("[.,!?()-:;\"']", " "); - return text; - } -} +package main; + +import analysis.BrandChecker; +import database.NamedPreparedStatement; +import database.QueryUtils; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.sql.Connection; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.util.HashMap; +import java.util.List; +import java.util.Scanner; + +/** + * The sentiment analysis class that rates tweets based on a unigram and bigram + * set of weights. + */ +public class Analyzor { + + /** + * The map that matches single words to their weights. + */ + private final HashMap unimap = new HashMap(); + + /** + * The map that matches word pairs to their weights. + */ + private final HashMap bimap = new HashMap(); + + /** + * The results of a query. + */ + private ResultSet data; + + /** + * The connection to the database. + */ + private final Connection connection; + + /** + * Used to determine the brands inside a tweet. 
+ */ + private final BrandChecker brandChecker = new BrandChecker("brands.txt"); + + Analyzor(Connection connection) { + this.connection = connection; + } + + //reads the lexicons + void readLexicon() throws FileNotFoundException { + if (!unimap.isEmpty()) { + // data is already read. + return; + } + // A unigram is in the format (WS = whitespace): + // word rating ??? ?? + // A bigram has an two WS-separated words instead of one. + try (Scanner uniScanner = new Scanner("unigrams-pmilexicon.txt"); + Scanner biScanner = new Scanner("bigrams-pmilexicon.txt");) { + //Fill the map of unigrams + while (uniScanner.hasNext()) { + String words = uniScanner.next(); + unimap.put(words.toLowerCase(), uniScanner.nextDouble()); + if (uniScanner.hasNextLine()) { + uniScanner.nextLine(); + } + } + + //fill the map of bigrams + while (biScanner.hasNext()) { + String words = biScanner.next() + " " + biScanner.next(); + bimap.put(words.toLowerCase(), biScanner.nextDouble()); + if (biScanner.hasNextLine()) { + biScanner.nextLine(); + } + } + } + } + + /** + * Executes a query that the analyzer can analyze. + * + * @param query The query string to execute. + * @throws SQLException When database connection isn't available. + */ + public void query(String query) throws SQLException { + PreparedStatement statement; + //make a connection to the database and execute the query + statement = connection.prepareStatement(query); + data = statement.executeQuery(); + } + + /** + * Run a sentiment analysis and fill the database with the output. 
+ * + * @param query The query to analyze + * @throws SQLException + * @throws IOException + */ + public void sentimentAnalysis(String query) throws SQLException, IOException { + query(query); + + //read the lexicons + readLexicon(); + + //go to the start of te dataset + if (data == null) { + System.err.println("data is empty, try querying first"); + return; + } + data.beforeFirst(); + + Double value; + String text; + + //for all tuples + while (data.next()) { + //get the text + text = data.getString("text"); + text = splitPunctToWords(text); + // test is the tweet text you are going to analyze + String[] words = text.split("\\s+"); // text splitted into separate words + double positiverate = 0; // positive rating + + // Rate the text with unigrams + for (String word : words) { + value = unimap.get(word); + if (value != null) { + positiverate += unimap.get(word); + } + } + // Rate the text with bigrams + for (int i = 0; i < words.length - 1; i++) { + String pair = words[i] + " " + words[i + 1]; + value = bimap.get(pair); + if (value != null) { + positiverate += bimap.get(pair); + } + } + // Obtain the brands contained in a tweet text. 
+ //List brands = brandChecker.getBrands(text); + + // insert the rating into the database + NamedPreparedStatement m_insertRating; + m_insertRating = new NamedPreparedStatement(connection, QueryUtils.insertRating); + QueryUtils.setInsertParams(m_insertRating, data.getLong("tweetid"), data.getString("brand"), (int) (positiverate * 10)); + m_insertRating.executeUpdate(); + + + //don't print the rate + //System.out.println(text + ": " + (int) (positiverate * 10)); + } + } + + //makes a wordcloud of the tweets in the ResultSet data + public void makeWordCloud(String query) throws SQLException { + + query(query); + //go to the start of the ResultSet data + if (data == null) { + System.err.println("data is empty, try querying first"); + return; + } + + //make the hashmap with the words and their frequency + HashMap wordcloud = new HashMap<>(); + + String text; + String[] words; + Integer value; + + while (data.next()) { + //get the text + text = data.getString("text"); + //remove punctuation, convert to lowercase and split on words + text = removePunct(text); + text = text.toLowerCase(); + words = text.split("\\s+"); + + //count the words + for (String word : words) { + value = wordcloud.get(word); + if (value == null) { + wordcloud.put(word, 1); + } else { + wordcloud.put(word, value++); + } + } + } + } + + //replaces punctuation so it will be splitted + //also removes urls + private String splitPunctToWords(String text) { + text = text.replaceAll("https?://\\S*", ""); + text = text.replaceAll("[!?):;\"']", " $0"); + text = text.replaceAll("[.,-](\\s|$)", " $0"); + text = text.replaceAll("\\s[(\"']", "$0 "); + return text; + } + + //removes punctuation + //also removes urls + private String removePunct(String text) { + text = text.replaceAll("https?://\\S*", ""); + text = text.replaceAll("[.,!?()-:;\"']", " "); + return text; + } +} -- cgit v1.2.1