From e051abbfdbf7ff721bf1318bf0b5939741b1f792 Mon Sep 17 00:00:00 2001 From: Peter Wu Date: Mon, 26 May 2014 11:36:25 +0200 Subject: FKING CRLF --- nbproject/configs/such_database.properties | 2 +- nbproject/project.properties | 162 +++---- src/main/Analyzor.java | 690 ++++++++++++++--------------- 3 files changed, 427 insertions(+), 427 deletions(-) diff --git a/nbproject/configs/such_database.properties b/nbproject/configs/such_database.properties index bba41ec..9dffee6 100644 --- a/nbproject/configs/such_database.properties +++ b/nbproject/configs/such_database.properties @@ -1 +1 @@ -$label=such database +$label=such database diff --git a/nbproject/project.properties b/nbproject/project.properties index b262ab6..ab8ae05 100644 --- a/nbproject/project.properties +++ b/nbproject/project.properties @@ -1,81 +1,81 @@ -annotation.processing.enabled=true -annotation.processing.enabled.in.editor=false -annotation.processing.processors.list= -annotation.processing.run.all.processors=true -annotation.processing.source.output=${build.generated.sources.dir}/ap-source-output -application.title=Goldfarmer -application.vendor=maurice -build.classes.dir=${build.dir}/classes -build.classes.excludes=**/*.java,**/*.form -# This directory is removed when the project is cleaned: -build.dir=build -build.generated.dir=${build.dir}/generated -build.generated.sources.dir=${build.dir}/generated-sources -# Only compile against the classpath explicitly listed here: -build.sysclasspath=ignore -build.test.classes.dir=${build.dir}/test/classes -build.test.results.dir=${build.dir}/test/results -# Uncomment to specify the preferred debugger connection transport: -#debug.transport=dt_socket -debug.classpath=\ - ${run.classpath} -debug.test.classpath=\ - ${run.test.classpath} -# Files in build.classes.dir which should be excluded from distribution jar -dist.archive.excludes= -# This directory is removed when the project is cleaned: -dist.dir=dist -dist.jar=${dist.dir}/Goldfarmer.jar -dist.javadoc.dir=${dist.dir}/javadoc -endorsed.classpath= -excludes= -file.reference.joda-time-2.3.jar=lib/joda-time-2.3.jar -file.reference.postgresql-9.3-1101.jdbc41.jar=lib/postgresql-9.3-1101.jdbc41.jar -includes=** -jar.compress=false -javac.classpath=\ - ${file.reference.joda-time-2.3.jar}:\ - ${file.reference.postgresql-9.3-1101.jdbc41.jar} -# Space-separated list of extra javac options -javac.compilerargs= -javac.deprecation=false -javac.processorpath=\ - ${javac.classpath} -javac.source=1.7 -javac.target=1.7 -javac.test.classpath=\ - ${javac.classpath}:\ - ${build.classes.dir}:\ - ${libs.junit_4.classpath} -javac.test.processorpath=\ - ${javac.test.classpath} -javadoc.additionalparam= -javadoc.author=false -javadoc.encoding=${source.encoding} -javadoc.noindex=false -javadoc.nonavbar=false -javadoc.notree=false -javadoc.private=false -javadoc.splitindex=true -javadoc.use=true -javadoc.version=false -javadoc.windowtitle= -main.class=main.Main -manifest.file=manifest.mf -meta.inf.dir=${src.dir}/META-INF -mkdist.disabled=false -platform.active=default_platform -project.licensePath=./nbproject/licenseheader.txt -run.classpath=\ - ${javac.classpath}:\ - ${build.classes.dir} -# Space-separated list of JVM arguments used when running the project. -# You may also define separate properties like run-sys-prop.name=value instead of -Dname=value. -# To set system properties for unit tests define test-sys-prop.name=value: -run.jvmargs= -run.test.classpath=\ - ${javac.test.classpath}:\ - ${build.test.classes.dir} -source.encoding=UTF-8 -src.dir=src -test.src.dir=test +annotation.processing.enabled=true +annotation.processing.enabled.in.editor=false +annotation.processing.processors.list= +annotation.processing.run.all.processors=true +annotation.processing.source.output=${build.generated.sources.dir}/ap-source-output +application.title=Goldfarmer +application.vendor=maurice +build.classes.dir=${build.dir}/classes +build.classes.excludes=**/*.java,**/*.form +# This directory is removed when the project is cleaned: +build.dir=build +build.generated.dir=${build.dir}/generated +build.generated.sources.dir=${build.dir}/generated-sources +# Only compile against the classpath explicitly listed here: +build.sysclasspath=ignore +build.test.classes.dir=${build.dir}/test/classes +build.test.results.dir=${build.dir}/test/results +# Uncomment to specify the preferred debugger connection transport: +#debug.transport=dt_socket +debug.classpath=\ + ${run.classpath} +debug.test.classpath=\ + ${run.test.classpath} +# Files in build.classes.dir which should be excluded from distribution jar +dist.archive.excludes= +# This directory is removed when the project is cleaned: +dist.dir=dist +dist.jar=${dist.dir}/Goldfarmer.jar +dist.javadoc.dir=${dist.dir}/javadoc +endorsed.classpath= +excludes= +file.reference.joda-time-2.3.jar=lib/joda-time-2.3.jar +file.reference.postgresql-9.3-1101.jdbc41.jar=lib/postgresql-9.3-1101.jdbc41.jar +includes=** +jar.compress=false +javac.classpath=\ + ${file.reference.joda-time-2.3.jar}:\ + ${file.reference.postgresql-9.3-1101.jdbc41.jar} +# Space-separated list of extra javac options +javac.compilerargs= +javac.deprecation=false +javac.processorpath=\ + ${javac.classpath} +javac.source=1.7 +javac.target=1.7 +javac.test.classpath=\ + ${javac.classpath}:\ + ${build.classes.dir}:\ + ${libs.junit_4.classpath} +javac.test.processorpath=\ + ${javac.test.classpath} +javadoc.additionalparam= +javadoc.author=false +javadoc.encoding=${source.encoding} +javadoc.noindex=false +javadoc.nonavbar=false +javadoc.notree=false +javadoc.private=false +javadoc.splitindex=true +javadoc.use=true +javadoc.version=false +javadoc.windowtitle= +main.class=main.Main +manifest.file=manifest.mf +meta.inf.dir=${src.dir}/META-INF +mkdist.disabled=false +platform.active=default_platform +project.licensePath=./nbproject/licenseheader.txt +run.classpath=\ + ${javac.classpath}:\ + ${build.classes.dir} +# Space-separated list of JVM arguments used when running the project. +# You may also define separate properties like run-sys-prop.name=value instead of -Dname=value. +# To set system properties for unit tests define test-sys-prop.name=value: +run.jvmargs= +run.test.classpath=\ + ${javac.test.classpath}:\ + ${build.test.classes.dir} +source.encoding=UTF-8 +src.dir=src +test.src.dir=test diff --git a/src/main/Analyzor.java b/src/main/Analyzor.java index 0c3ede3..9c98a9d 100644 --- a/src/main/Analyzor.java +++ b/src/main/Analyzor.java @@ -1,345 +1,345 @@ -package main; - -import analysis.BrandChecker; -import database.NamedPreparedStatement; -import database.QueryUtils; -import java.io.File; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.io.PrintWriter; -import java.io.UnsupportedEncodingException; -import java.sql.Connection; -import java.sql.PreparedStatement; -import java.sql.ResultSet; -import java.sql.SQLException; -import java.util.List; -import java.util.HashMap; -import java.util.Map.Entry; -import java.util.Scanner; - -/** - * The sentiment analysis class that rates tweets based on a unigram and bigram - * set of weights. - */ -public class Analyzor { - - /** - * The map that matches single words to their weights. - */ - private final HashMap unimap = new HashMap(); - - /** - * The map that matches word pairs to their weights. - */ - private final HashMap bimap = new HashMap(); - - /** - * The results of a query, maybe return from query(). - */ - private ResultSet data; - - /** - * The persistent connection to the database. - */ - private final Connection connection; - - /** - * @param connection An open connection to the database. - */ - public Analyzor(Connection connection) { - this.connection = connection; - } - - /** - * Read the unigram and bigram lexica. - * - * @throws FileNotFoundException - */ - public void readLexicon() throws FileNotFoundException { - if (!unimap.isEmpty()) { - // data is already read. - return; - } - System.err.println("Trying to read lexicons..."); - // A unigram is in the format (WS = whitespace): - // word rating ??? ?? - // A bigram has an two WS-separated words instead of one. - try (Scanner uniScanner = new Scanner(new File("unigrams-pmilexicon.txt")); - Scanner biScanner = new Scanner(new File("bigrams-pmilexicon.txt"));) { - //Fill the map of unigrams - int lineno = 1; - while (uniScanner.hasNext()) { - - String words = uniScanner.next(); - Double d = Double.valueOf(uniScanner.next()); - unimap.put(words.toLowerCase(), d); - if (uniScanner.hasNextLine()) { - uniScanner.nextLine(); - } - lineno++; - - } - - //fill the map of bigrams - while (biScanner.hasNext()) { - String words = biScanner.next() + " " + biScanner.next(); - bimap.put(words.toLowerCase(), Double.valueOf(biScanner.next())); - if (biScanner.hasNextLine()) { - biScanner.nextLine(); - } - } - } - System.err.println("Lexicons are read."); - } - - /** - * Executes a query that the analyzer can analyze. - * - * @param query The query string to execute. - * @throws SQLException When database connection isn't available. - */ - public void query(String query) throws SQLException { - PreparedStatement statement; - //make a connection to the database and execute the query - statement = connection.prepareStatement(query); - data = statement.executeQuery(); - } - - /** - * Run a sentiment analysis and fill the database with the output. - * - * @param query The sql text for the query. - * @throws SQLException - * @throws IOException - */ - public void sentimentAnalysis(String query) throws SQLException, IOException { - query(query); - - //read the lexicons - readLexicon(); - - //go to the start of te dataset - if (data == null) { - System.err.println("data is empty, try querying first"); - return; - } - - Double value; - String text; - - //for all tuples - while (data.next()) { - //get the text - text = data.getString("text"); - text = splitPunctToWords(text); - // test is the tweet text you are going to analyze - String[] words = text.split("\\s+"); // text splitted into separate words - double positiverate = 0; // positive rating - - // Rate the text with unigrams - for (String word : words) { - value = unimap.get(word); - if (value != null) { - positiverate += unimap.get(word); - } - } - // Rate the text with bigrams - for (int i = 0; i < words.length - 1; i++) { - String pair = words[i] + " " + words[i + 1]; - value = bimap.get(pair); - if (value != null) { - positiverate += bimap.get(pair); - } - } - //insert the rating into the database - NamedPreparedStatement m_insertRating; - m_insertRating = new NamedPreparedStatement(connection, QueryUtils.insertRating); - QueryUtils.setInsertParams(m_insertRating, data.getLong("tweetid"), data.getString("brand"), (int) (positiverate * 10)); - m_insertRating.executeUpdate(); - //don't print the rate - //System.out.println(text + ": " + (int) (positiverate * 10)); - } - } - - /** - * Make a wordcloud of the results of some query. - * - * @param query The sql text for a query. - * @throws SQLException - * @throws FileNotFoundException - * @throws UnsupportedEncodingException - */ - public void makeWordCloud(String query) throws SQLException, FileNotFoundException, UnsupportedEncodingException { - - query(query); - //go to the start of the ResultSet data - if (data == null) { - System.err.println("data is empty, try querying first"); - return; - } - - String text; - String brand; - String[] words; - HashMap> wordcloud = new HashMap<>(); - - while (data.next()) { - //get brand - brand=data.getString("brand"); - //make hashmap for each brand - if(!wordcloud.containsKey(brand)){ - wordcloud.put(brand, new HashMap()); - } - //get the text - text = data.getString("text"); - //remove punctuation, convert to lowercase and split on words - text = removePunct(text); - text = text.toLowerCase(); - words = text.split("\\s+"); - //for all words - for (String word : words) { - //if it is empty, a space or a stripe, skip it - if(word.equals("") || word.equals(" ") || word.equals("-")){ - continue; - } - //if the word is already in the map, increment the amount - if(wordcloud.get(brand).containsKey(word)){ - wordcloud.get(brand).put(word, wordcloud.get(brand).get(word) + 1); - } - //if the word is not already in the map, make an entry with amount = 1 - else{ - wordcloud.get(brand).put(word, 1); - } - } - } - //print the words and their frequency in a csv file - mapToCSV(wordcloud, "wordcloud.csv", "brand,word,count"); - } - - //generate csv for disco from the query - public void disco(String query) throws SQLException, FileNotFoundException, UnsupportedEncodingException { - //do the query - query(query); - PrintWriter writer = new PrintWriter("output.csv", "UTF-8"); - //print the first row - for (int i = 1; i < data.getMetaData().getColumnCount(); i++) { - writer.print(data.getMetaData().getColumnLabel(i) + ", "); - } - writer.println(data.getMetaData().getColumnLabel(data.getMetaData().getColumnCount())); - //print the values - while (data.next()) { - for (int i = 1; i < data.getMetaData().getColumnCount(); i++) { - if (data.getObject(i) == null) { - writer.print(", "); - } else { - writer.print(data.getObject(i).toString().replaceAll("[,\n]", " ") + ", "); - } - } - if (data.getObject(data.getMetaData().getColumnCount()) == null) { - writer.println("0"); - } else { - writer.println(data.getObject(data.getMetaData().getColumnCount()).toString().replace(",", " ")); - } - } - writer.close(); - } - - public void getBrands() throws SQLException { - PreparedStatement statement; - //make a connection to the database and execute the query - statement = connection.prepareStatement("delete from mentionsbrand"); - statement.executeUpdate(); - BrandChecker checker = new BrandChecker("brandonlyrules.txt"); - query("select * from tweet"); - NamedPreparedStatement m_insertBrand = new NamedPreparedStatement(connection, QueryUtils.insertBrand); - while (data.next()) { - List brands = checker.getBrands(data.getString("text")); - if (brands.isEmpty()) { - QueryUtils.setInsertBrandParams(m_insertBrand, data.getLong("tweetid"), "no"); - m_insertBrand.executeUpdate(); - } else { - for (String brand : brands) { - QueryUtils.setInsertBrandParams(m_insertBrand, data.getLong("tweetid"), brand); - m_insertBrand.executeUpdate(); - } - } - } - } - - //gets the amount of users that tweet about a brand in a timezone - //makes a csv file timezone, brand, amount - public void timezone(String query) throws SQLException, FileNotFoundException, UnsupportedEncodingException{ - query(query); - //hashmap timezone, brand, amount - HashMap> timeMap = new HashMap<>(); - String timezone; - String brand; - - while(data.next()){ - timezone = data.getString("timezone"); - brand = data.getString("brand"); - //if the timezone is already in the map - if(timeMap.containsKey(timezone)){ - //if the brand for that timezone is already in the map - if(timeMap.get(timezone).containsKey(brand)){ - //increment the amount - timeMap.get(timezone).put(brand, timeMap.get(timezone).get(brand) + 1); - } - //if the brand for that timezone is not yet in the map - else{ - //make a new entry for that brand with amount = 1 - timeMap.get(timezone).put(brand, 1); - } - } - //if the timezone is not yet in the map - else{ - //make a new hashmap for this map and fill it with the brand and the amount - timeMap.put(timezone, new HashMap()); - timeMap.get(timezone).put(brand, 1); - } - } - //make the CSV out of the map - mapToCSV(timeMap, "timezone.csv", "timezone,brand,count"); - } - - //replaces punctuation so it will be splitted - //also removes urls - private String splitPunctToWords(String text) { - text = text.replaceAll("https?://\\S*", ""); - text = text.replaceAll("[!?):;\"']", " $0"); - text = text.replaceAll("[.,-](\\s|$)", " $0"); - text = text.replaceAll("\\s[(\"']", "$0 "); - return text; - } - - //removes punctuation - //also removes urls - private String removePunct(String text) { - text = text.replaceAll("https?://\\S*", " "); - text = text.replaceAll("@\\S*", " "); - text = text.replaceAll("[^a-zA-Z0-9#_-]", " "); - return text; - } - - //prints a hashmap into a csv for a html application - //Hashmap> becomes key1, key2, value - //only for String, String, Integer - void mapToCSV(HashMap> map, String fileName, String firstLine) - throws FileNotFoundException, UnsupportedEncodingException{ - - PrintWriter writer = new PrintWriter(fileName, "UTF-8"); - - writer.println(firstLine); - - //loop over brands - for(Entry en : map.entrySet()){ - //loop over words - for(Entry e : map.get(en.getKey()).entrySet()){ - writer.println(en.getKey() + "," + e.getKey() + "," + e.getValue()); - } - } - - writer.close(); - System.out.println("csv file made, please put it next to html file and run this"); - } -} +package main; + +import analysis.BrandChecker; +import database.NamedPreparedStatement; +import database.QueryUtils; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.PrintWriter; +import java.io.UnsupportedEncodingException; +import java.sql.Connection; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.util.List; +import java.util.HashMap; +import java.util.Map.Entry; +import java.util.Scanner; + +/** + * The sentiment analysis class that rates tweets based on a unigram and bigram + * set of weights. + */ +public class Analyzor { + + /** + * The map that matches single words to their weights. + */ + private final HashMap unimap = new HashMap(); + + /** + * The map that matches word pairs to their weights. + */ + private final HashMap bimap = new HashMap(); + + /** + * The results of a query, maybe return from query(). + */ + private ResultSet data; + + /** + * The persistent connection to the database. + */ + private final Connection connection; + + /** + * @param connection An open connection to the database. + */ + public Analyzor(Connection connection) { + this.connection = connection; + } + + /** + * Read the unigram and bigram lexica. + * + * @throws FileNotFoundException + */ + public void readLexicon() throws FileNotFoundException { + if (!unimap.isEmpty()) { + // data is already read. + return; + } + System.err.println("Trying to read lexicons..."); + // A unigram is in the format (WS = whitespace): + // word rating ??? ?? + // A bigram has an two WS-separated words instead of one. + try (Scanner uniScanner = new Scanner(new File("unigrams-pmilexicon.txt")); + Scanner biScanner = new Scanner(new File("bigrams-pmilexicon.txt"));) { + //Fill the map of unigrams + int lineno = 1; + while (uniScanner.hasNext()) { + + String words = uniScanner.next(); + Double d = Double.valueOf(uniScanner.next()); + unimap.put(words.toLowerCase(), d); + if (uniScanner.hasNextLine()) { + uniScanner.nextLine(); + } + lineno++; + + } + + //fill the map of bigrams + while (biScanner.hasNext()) { + String words = biScanner.next() + " " + biScanner.next(); + bimap.put(words.toLowerCase(), Double.valueOf(biScanner.next())); + if (biScanner.hasNextLine()) { + biScanner.nextLine(); + } + } + } + System.err.println("Lexicons are read."); + } + + /** + * Executes a query that the analyzer can analyze. + * + * @param query The query string to execute. + * @throws SQLException When database connection isn't available. + */ + public void query(String query) throws SQLException { + PreparedStatement statement; + //make a connection to the database and execute the query + statement = connection.prepareStatement(query); + data = statement.executeQuery(); + } + + /** + * Run a sentiment analysis and fill the database with the output. + * + * @param query The sql text for the query. + * @throws SQLException + * @throws IOException + */ + public void sentimentAnalysis(String query) throws SQLException, IOException { + query(query); + + //read the lexicons + readLexicon(); + + //go to the start of te dataset + if (data == null) { + System.err.println("data is empty, try querying first"); + return; + } + + Double value; + String text; + + //for all tuples + while (data.next()) { + //get the text + text = data.getString("text"); + text = splitPunctToWords(text); + // test is the tweet text you are going to analyze + String[] words = text.split("\\s+"); // text splitted into separate words + double positiverate = 0; // positive rating + + // Rate the text with unigrams + for (String word : words) { + value = unimap.get(word); + if (value != null) { + positiverate += unimap.get(word); + } + } + // Rate the text with bigrams + for (int i = 0; i < words.length - 1; i++) { + String pair = words[i] + " " + words[i + 1]; + value = bimap.get(pair); + if (value != null) { + positiverate += bimap.get(pair); + } + } + //insert the rating into the database + NamedPreparedStatement m_insertRating; + m_insertRating = new NamedPreparedStatement(connection, QueryUtils.insertRating); + QueryUtils.setInsertParams(m_insertRating, data.getLong("tweetid"), data.getString("brand"), (int) (positiverate * 10)); + m_insertRating.executeUpdate(); + //don't print the rate + //System.out.println(text + ": " + (int) (positiverate * 10)); + } + } + + /** + * Make a wordcloud of the results of some query. + * + * @param query The sql text for a query. + * @throws SQLException + * @throws FileNotFoundException + * @throws UnsupportedEncodingException + */ + public void makeWordCloud(String query) throws SQLException, FileNotFoundException, UnsupportedEncodingException { + + query(query); + //go to the start of the ResultSet data + if (data == null) { + System.err.println("data is empty, try querying first"); + return; + } + + String text; + String brand; + String[] words; + HashMap> wordcloud = new HashMap<>(); + + while (data.next()) { + //get brand + brand=data.getString("brand"); + //make hashmap for each brand + if(!wordcloud.containsKey(brand)){ + wordcloud.put(brand, new HashMap()); + } + //get the text + text = data.getString("text"); + //remove punctuation, convert to lowercase and split on words + text = removePunct(text); + text = text.toLowerCase(); + words = text.split("\\s+"); + //for all words + for (String word : words) { + //if it is empty, a space or a stripe, skip it + if(word.equals("") || word.equals(" ") || word.equals("-")){ + continue; + } + //if the word is already in the map, increment the amount + if(wordcloud.get(brand).containsKey(word)){ + wordcloud.get(brand).put(word, wordcloud.get(brand).get(word) + 1); + } + //if the word is not already in the map, make an entry with amount = 1 + else{ + wordcloud.get(brand).put(word, 1); + } + } + } + //print the words and their frequency in a csv file + mapToCSV(wordcloud, "wordcloud.csv", "brand,word,count"); + } + + //generate csv for disco from the query + public void disco(String query) throws SQLException, FileNotFoundException, UnsupportedEncodingException { + //do the query + query(query); + PrintWriter writer = new PrintWriter("output.csv", "UTF-8"); + //print the first row + for (int i = 1; i < data.getMetaData().getColumnCount(); i++) { + writer.print(data.getMetaData().getColumnLabel(i) + ", "); + } + writer.println(data.getMetaData().getColumnLabel(data.getMetaData().getColumnCount())); + //print the values + while (data.next()) { + for (int i = 1; i < data.getMetaData().getColumnCount(); i++) { + if (data.getObject(i) == null) { + writer.print(", "); + } else { + writer.print(data.getObject(i).toString().replaceAll("[,\n]", " ") + ", "); + } + } + if (data.getObject(data.getMetaData().getColumnCount()) == null) { + writer.println("0"); + } else { + writer.println(data.getObject(data.getMetaData().getColumnCount()).toString().replace(",", " ")); + } + } + writer.close(); + } + + public void getBrands() throws SQLException { + PreparedStatement statement; + //make a connection to the database and execute the query + statement = connection.prepareStatement("delete from mentionsbrand"); + statement.executeUpdate(); + BrandChecker checker = new BrandChecker("brandonlyrules.txt"); + query("select * from tweet"); + NamedPreparedStatement m_insertBrand = new NamedPreparedStatement(connection, QueryUtils.insertBrand); + while (data.next()) { + List brands = checker.getBrands(data.getString("text")); + if (brands.isEmpty()) { + QueryUtils.setInsertBrandParams(m_insertBrand, data.getLong("tweetid"), "no"); + m_insertBrand.executeUpdate(); + } else { + for (String brand : brands) { + QueryUtils.setInsertBrandParams(m_insertBrand, data.getLong("tweetid"), brand); + m_insertBrand.executeUpdate(); + } + } + } + } + + //gets the amount of users that tweet about a brand in a timezone + //makes a csv file timezone, brand, amount + public void timezone(String query) throws SQLException, FileNotFoundException, UnsupportedEncodingException{ + query(query); + //hashmap timezone, brand, amount + HashMap> timeMap = new HashMap<>(); + String timezone; + String brand; + + while(data.next()){ + timezone = data.getString("timezone"); + brand = data.getString("brand"); + //if the timezone is already in the map + if(timeMap.containsKey(timezone)){ + //if the brand for that timezone is already in the map + if(timeMap.get(timezone).containsKey(brand)){ + //increment the amount + timeMap.get(timezone).put(brand, timeMap.get(timezone).get(brand) + 1); + } + //if the brand for that timezone is not yet in the map + else{ + //make a new entry for that brand with amount = 1 + timeMap.get(timezone).put(brand, 1); + } + } + //if the timezone is not yet in the map + else{ + //make a new hashmap for this map and fill it with the brand and the amount + timeMap.put(timezone, new HashMap()); + timeMap.get(timezone).put(brand, 1); + } + } + //make the CSV out of the map + mapToCSV(timeMap, "timezone.csv", "timezone,brand,count"); + } + + //replaces punctuation so it will be splitted + //also removes urls + private String splitPunctToWords(String text) { + text = text.replaceAll("https?://\\S*", ""); + text = text.replaceAll("[!?):;\"']", " $0"); + text = text.replaceAll("[.,-](\\s|$)", " $0"); + text = text.replaceAll("\\s[(\"']", "$0 "); + return text; + } + + //removes punctuation + //also removes urls + private String removePunct(String text) { + text = text.replaceAll("https?://\\S*", " "); + text = text.replaceAll("@\\S*", " "); + text = text.replaceAll("[^a-zA-Z0-9#_-]", " "); + return text; + } + + //prints a hashmap into a csv for a html application + //Hashmap> becomes key1, key2, value + //only for String, String, Integer + void mapToCSV(HashMap> map, String fileName, String firstLine) + throws FileNotFoundException, UnsupportedEncodingException{ + + PrintWriter writer = new PrintWriter(fileName, "UTF-8"); + + writer.println(firstLine); + + //loop over brands + for(Entry en : map.entrySet()){ + //loop over words + for(Entry e : map.get(en.getKey()).entrySet()){ + writer.println(en.getKey() + "," + e.getKey() + "," + e.getValue()); + } + } + + writer.close(); + System.out.println("csv file made, please put it next to html file and run this"); + } +} -- cgit v1.2.1 From 7df2ae452a984cb12986b33034557476cb4a1536 Mon Sep 17 00:00:00 2001 From: Peter Wu Date: Mon, 26 May 2014 12:02:08 +0200 Subject: Optimize sentiment analysis Do not create a new prepared statement every time. Use an optimized UPDATE query. Drop requirement for supplying a query. --- src/database/NamedPreparedStatement.java | 11 +++++++++++ src/main/Analyzor.java | 33 +++++++++++++++++++------------- src/main/FarmShell.java | 11 ++++++++--- 3 files changed, 39 insertions(+), 16 deletions(-) diff --git a/src/database/NamedPreparedStatement.java b/src/database/NamedPreparedStatement.java index ebb775b..9305d32 100644 --- a/src/database/NamedPreparedStatement.java +++ b/src/database/NamedPreparedStatement.java @@ -2,6 +2,7 @@ package database; import java.sql.Connection; import java.sql.PreparedStatement; +import java.sql.ResultSet; import java.sql.SQLException; import java.sql.Timestamp; import java.sql.Types; @@ -104,4 +105,14 @@ public class NamedPreparedStatement { throw ex; } } + + public ResultSet executeQuery() throws SQLException { + try { + return getStmt().executeQuery(); + } catch (SQLException ex) { + System.err.println("Query error: " + ex.getMessage()); + System.err.println(stmt); + throw ex; + } + } } diff --git a/src/main/Analyzor.java b/src/main/Analyzor.java index 9c98a9d..5385a79 100644 --- a/src/main/Analyzor.java +++ b/src/main/Analyzor.java @@ -113,24 +113,33 @@ public class Analyzor { * @throws IOException */ public void sentimentAnalysis(String query) throws SQLException, IOException { - query(query); + NamedPreparedStatement tweetBrandStmt, updateRating; //read the lexicons readLexicon(); - //go to the start of te dataset - if (data == null) { - System.err.println("data is empty, try querying first"); - return; + // if you ever need to re-apply rating, use something like: + // UPDATE mentionsbrand SET rating = NULL WHERE ... + if (query.isEmpty()) { + query = "SELECT t.tweetid, t.text, b.brand FROM tweet t " + + "JOIN mentionsbrand b USING (tweetid) " + + "WHERE b.rating IS NULL"; } + tweetBrandStmt = new NamedPreparedStatement(connection, + query); + ResultSet tweetBrandResults = tweetBrandStmt.executeQuery(); + + updateRating = new NamedPreparedStatement(connection, + "UPDATE mentionsbrand SET rating = :rating " + + "WHERE tweetid = :tweetid AND brand = :brand"); Double value; String text; //for all tuples - while (data.next()) { + while (tweetBrandResults.next()) { //get the text - text = data.getString("text"); + text = tweetBrandResults.getString("text"); text = splitPunctToWords(text); // test is the tweet text you are going to analyze String[] words = text.split("\\s+"); // text splitted into separate words @@ -152,12 +161,10 @@ public class Analyzor { } } //insert the rating into the database - NamedPreparedStatement m_insertRating; - m_insertRating = new NamedPreparedStatement(connection, QueryUtils.insertRating); - QueryUtils.setInsertParams(m_insertRating, data.getLong("tweetid"), data.getString("brand"), (int) (positiverate * 10)); - m_insertRating.executeUpdate(); - //don't print the rate - //System.out.println(text + ": " + (int) (positiverate * 10)); + updateRating.setLong("tweetid", tweetBrandResults.getLong("tweetid")); + updateRating.setString("brand", tweetBrandResults.getString("brand")); + updateRating.setInt("rating", (int) (positiverate * 10)); + updateRating.executeUpdate(); } } diff --git a/src/main/FarmShell.java b/src/main/FarmShell.java index 1266fd3..ed1a0ff 100644 --- a/src/main/FarmShell.java +++ b/src/main/FarmShell.java @@ -125,7 +125,12 @@ public class FarmShell { System.out.println("not yet implemented"); break; case sentiment: - getAnalyzor().sentimentAnalysis(params[0]); + // if there is no query, update all unrated items. + if (params.length > 0) { + getAnalyzor().sentimentAnalysis(params[0]); + } else { + getAnalyzor().sentimentAnalysis(""); + } break; case wordcloud: getAnalyzor().makeWordCloud(params[0]); @@ -163,10 +168,10 @@ public class FarmShell { enum Command { filterbots("marks all users as bot or not", 1), - sentiment("analyzes all tweets on positivity (about a brand)", 1), + sentiment("analyzes all tweets on brand positivity (optional arg: tweet/brand selection query)"), wordcloud("makes a wordcloud of the text of the tweets", 1), getBrands("fills the database with the brands of a tweet"), - disco("makes a outputfile for disco",1), + disco("makes a outputfile for disco", 1), exit("Returns to shell"), help("Get help"); -- cgit v1.2.1 From 00baf4ffc86dac7b723c4fc3d2c963d1fa84729b Mon Sep 17 00:00:00 2001 From: Peter Wu Date: Mon, 26 May 2014 12:03:33 +0200 Subject: Formatting, drop useless license header Sam Hocevar doesn't know us nor does he have copyright about this work... --- src/analysis/BrandChecker.java | 32 ++++++--------------- src/database/QueryUtils.java | 8 +++--- src/main/Analyzor.java | 57 ++++++++++++++++++------------------- test/analysis/BrandCheckerTest.java | 4 +-- 4 files changed, 42 insertions(+), 59 deletions(-) diff --git a/src/analysis/BrandChecker.java b/src/analysis/BrandChecker.java index 10e22b4..ee9c7b4 100644 --- a/src/analysis/BrandChecker.java +++ b/src/analysis/BrandChecker.java @@ -1,17 +1,3 @@ -/* - * DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE Version 2, December 2004 - * - * Copyright (C) 2004 Sam Hocevar - * - * Everyone is permitted to copy and distribute verbatim or modified copies - * of this license document, and changing it is allowed as long as the name is - * changed. - * - * DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE TERMS AND CONDITIONS FOR COPYING, - * DISTRIBUTION AND MODIFICATION - * - * 0. You just DO WHAT THE FUCK YOU WANT TO. - */ package analysis; import java.io.FileInputStream; @@ -93,7 +79,7 @@ public class BrandChecker { if (line.isEmpty()) { return; } - + if (!line.contains("-")) { System.err.println("illformatted rule: " + line + ", missing -"); } else { @@ -110,13 +96,13 @@ public class BrandChecker { // Read the line. String name = parts[0].trim(); - + // Read the positive words. - String positive = parts[1].replaceAll(" ",""); + String positive = parts[1].replaceAll(" ", ""); String[] sequence = positive.split(","); - + if (parts.length == 3) { - String negative = parts[2].replaceAll(" ", ""); + String negative = parts[2].replaceAll(" ", ""); String[] blacklist = negative.split(","); ruleset.add(new BrandRule(name, sequence, blacklist)); } else { @@ -139,7 +125,7 @@ public class BrandChecker { * The words that should be in the text. */ private final HashMap names; - + /** * A blacklist of words that are not interesting. */ @@ -164,7 +150,7 @@ public class BrandChecker { } else { this.blacklist = null; } - + for (String name : names) { this.names.put(name, Boolean.FALSE); } @@ -177,7 +163,7 @@ public class BrandChecker { */ public boolean analyze(String[] words) { reset(); - + int found = 0; for (String word : words) { @@ -201,7 +187,7 @@ public class BrandChecker { public String getBrand() { return brand; } - + private void reset() { for (String name : this.names.keySet()) { this.names.put(name, Boolean.FALSE); diff --git a/src/database/QueryUtils.java b/src/database/QueryUtils.java index 2cc6fd6..b95903f 100644 --- a/src/database/QueryUtils.java +++ b/src/database/QueryUtils.java @@ -1,7 +1,6 @@ package database; import java.sql.SQLException; -import java.util.Locale; /** * Utilities to create queries. @@ -9,8 +8,9 @@ import java.util.Locale; * @author Maurice Laveaux */ public class QueryUtils { - public final static String insertRating - = buildQuery("mentionsbrand", new String[]{"tweetid","brand"},"tweetid","brand", "rating"); + + public final static String insertRating + = buildQuery("mentionsbrand", new String[]{"tweetid", "brand"}, "tweetid", "brand", "rating"); public final static String insertProfile = buildQuery("twitteruser", new String[]{"userid"}, "userid", "displayname", "timezone", "tweetcount", "followercount", @@ -96,7 +96,7 @@ public class QueryUtils { statement.setLong("tweetid", tweetid); statement.setInt("rating", rating); statement.setString("brand", brand); - + } public static void setInsertBrandParams(NamedPreparedStatement brandStmt, diff --git a/src/main/Analyzor.java b/src/main/Analyzor.java index 5385a79..ffd9a5b 100644 --- a/src/main/Analyzor.java +++ b/src/main/Analyzor.java @@ -188,14 +188,14 @@ public class Analyzor { String text; String brand; String[] words; - HashMap> wordcloud = new HashMap<>(); + HashMap> wordcloud = new HashMap<>(); while (data.next()) { //get brand - brand=data.getString("brand"); + brand = data.getString("brand"); //make hashmap for each brand - if(!wordcloud.containsKey(brand)){ - wordcloud.put(brand, new HashMap()); + if (!wordcloud.containsKey(brand)) { + wordcloud.put(brand, new HashMap()); } //get the text text = data.getString("text"); @@ -206,15 +206,14 @@ public class Analyzor { //for all words for (String word : words) { //if it is empty, a space or a stripe, skip it - if(word.equals("") || word.equals(" ") || word.equals("-")){ + if (word.equals("") || word.equals(" ") || word.equals("-")) { continue; } //if the word is already in the map, increment the amount - if(wordcloud.get(brand).containsKey(word)){ + if (wordcloud.get(brand).containsKey(word)) { wordcloud.get(brand).put(word, wordcloud.get(brand).get(word) + 1); - } - //if the word is not already in the map, make an entry with amount = 1 - else{ + } //if the word is not already in the map, make an entry with amount = 1 + else { wordcloud.get(brand).put(word, 1); } } @@ -275,31 +274,29 @@ public class Analyzor { //gets the amount of users that tweet about a brand in a timezone //makes a csv file timezone, brand, amount - public void timezone(String query) throws SQLException, FileNotFoundException, UnsupportedEncodingException{ + public void timezone(String query) throws SQLException, FileNotFoundException, UnsupportedEncodingException { query(query); //hashmap timezone, brand, amount HashMap> timeMap = new HashMap<>(); String timezone; String brand; - - while(data.next()){ + + while (data.next()) { timezone = data.getString("timezone"); brand = data.getString("brand"); //if the timezone is already in the map - if(timeMap.containsKey(timezone)){ + if (timeMap.containsKey(timezone)) { //if the brand for that timezone is already in the map - if(timeMap.get(timezone).containsKey(brand)){ + if (timeMap.get(timezone).containsKey(brand)) { //increment the amount timeMap.get(timezone).put(brand, timeMap.get(timezone).get(brand) + 1); - } - //if the brand for that timezone is not yet in the map - else{ + } //if the brand for that timezone is not yet in the map + else { //make a new entry for that brand with amount = 1 timeMap.get(timezone).put(brand, 1); } - } - //if the timezone is not yet in the map - else{ + } //if the timezone is not yet in the map + else { //make a new hashmap for this map and fill it with the brand and the amount timeMap.put(timezone, new HashMap()); timeMap.get(timezone).put(brand, 1); @@ -308,7 +305,7 @@ public class Analyzor { //make the CSV out of the map mapToCSV(timeMap, "timezone.csv", "timezone,brand,count"); } - + //replaces punctuation so it will be splitted //also removes urls private String splitPunctToWords(String text) { @@ -327,25 +324,25 @@ public class Analyzor { text = text.replaceAll("[^a-zA-Z0-9#_-]", " "); return text; } - + //prints a hashmap into a csv for a html application //Hashmap> becomes key1, key2, value //only for String, String, Integer - void mapToCSV(HashMap> map, String fileName, String firstLine) - throws FileNotFoundException, UnsupportedEncodingException{ - + void mapToCSV(HashMap> map, String fileName, String firstLine) + throws FileNotFoundException, UnsupportedEncodingException { + PrintWriter writer = new PrintWriter(fileName, "UTF-8"); - + writer.println(firstLine); - + //loop over brands - for(Entry en : map.entrySet()){ + for (Entry en : map.entrySet()) { //loop over words - for(Entry e : map.get(en.getKey()).entrySet()){ + for (Entry e : map.get(en.getKey()).entrySet()) { writer.println(en.getKey() + "," + e.getKey() + "," + e.getValue()); } } - + writer.close(); System.out.println("csv file made, please put it next to html file and run this"); } diff --git a/test/analysis/BrandCheckerTest.java b/test/analysis/BrandCheckerTest.java index 23d8445..f55035b 100644 --- a/test/analysis/BrandCheckerTest.java +++ b/test/analysis/BrandCheckerTest.java @@ -82,10 +82,10 @@ public class BrandCheckerTest { public void testBullshit() { doTest("This applepie is delicious", new String[]{}); } - + @Test public void multipleBrands() { - doTest("This tweet contains both iphone 4s,galaxy s5 and iphone", new String[]{"iphone 4s","galaxy s5"}); + doTest("This tweet contains both iphone 4s,galaxy s5 and iphone", new String[]{"iphone 4s", "galaxy s5"}); } } -- cgit v1.2.1 From 9969b6a6cbae322680cfcbc27df3d37b0954f00a Mon Sep 17 00:00:00 2001 From: s123188 Date: Tue, 27 May 2014 17:55:08 +0200 Subject: changed Analyzor.timezone(String query) so that it adds a legenda "timezone" so that the visualizer can display a legends --- src/main/Analyzor.java | 710 +++++++++++++++++++++++++------------------------ 1 file changed, 361 insertions(+), 349 deletions(-) diff --git a/src/main/Analyzor.java b/src/main/Analyzor.java index ffd9a5b..6369ece 100644 --- a/src/main/Analyzor.java +++ b/src/main/Analyzor.java @@ -1,349 +1,361 @@ -package main; - -import analysis.BrandChecker; -import database.NamedPreparedStatement; -import database.QueryUtils; -import java.io.File; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.io.PrintWriter; -import java.io.UnsupportedEncodingException; -import java.sql.Connection; -import java.sql.PreparedStatement; -import java.sql.ResultSet; -import java.sql.SQLException; -import java.util.List; -import java.util.HashMap; -import java.util.Map.Entry; -import java.util.Scanner; - -/** - * The sentiment analysis class that rates tweets based on a unigram and bigram - * set of weights. - */ -public class Analyzor { - - /** - * The map that matches single words to their weights. - */ - private final HashMap unimap = new HashMap(); - - /** - * The map that matches word pairs to their weights. - */ - private final HashMap bimap = new HashMap(); - - /** - * The results of a query, maybe return from query(). - */ - private ResultSet data; - - /** - * The persistent connection to the database. - */ - private final Connection connection; - - /** - * @param connection An open connection to the database. - */ - public Analyzor(Connection connection) { - this.connection = connection; - } - - /** - * Read the unigram and bigram lexica. - * - * @throws FileNotFoundException - */ - public void readLexicon() throws FileNotFoundException { - if (!unimap.isEmpty()) { - // data is already read. - return; - } - System.err.println("Trying to read lexicons..."); - // A unigram is in the format (WS = whitespace): - // word rating ??? ?? - // A bigram has an two WS-separated words instead of one. - try (Scanner uniScanner = new Scanner(new File("unigrams-pmilexicon.txt")); - Scanner biScanner = new Scanner(new File("bigrams-pmilexicon.txt"));) { - //Fill the map of unigrams - int lineno = 1; - while (uniScanner.hasNext()) { - - String words = uniScanner.next(); - Double d = Double.valueOf(uniScanner.next()); - unimap.put(words.toLowerCase(), d); - if (uniScanner.hasNextLine()) { - uniScanner.nextLine(); - } - lineno++; - - } - - //fill the map of bigrams - while (biScanner.hasNext()) { - String words = biScanner.next() + " " + biScanner.next(); - bimap.put(words.toLowerCase(), Double.valueOf(biScanner.next())); - if (biScanner.hasNextLine()) { - biScanner.nextLine(); - } - } - } - System.err.println("Lexicons are read."); - } - - /** - * Executes a query that the analyzer can analyze. - * - * @param query The query string to execute. - * @throws SQLException When database connection isn't available. - */ - public void query(String query) throws SQLException { - PreparedStatement statement; - //make a connection to the database and execute the query - statement = connection.prepareStatement(query); - data = statement.executeQuery(); - } - - /** - * Run a sentiment analysis and fill the database with the output. - * - * @param query The sql text for the query. - * @throws SQLException - * @throws IOException - */ - public void sentimentAnalysis(String query) throws SQLException, IOException { - NamedPreparedStatement tweetBrandStmt, updateRating; - - //read the lexicons - readLexicon(); - - // if you ever need to re-apply rating, use something like: - // UPDATE mentionsbrand SET rating = NULL WHERE ... - if (query.isEmpty()) { - query = "SELECT t.tweetid, t.text, b.brand FROM tweet t " - + "JOIN mentionsbrand b USING (tweetid) " - + "WHERE b.rating IS NULL"; - } - tweetBrandStmt = new NamedPreparedStatement(connection, - query); - ResultSet tweetBrandResults = tweetBrandStmt.executeQuery(); - - updateRating = new NamedPreparedStatement(connection, - "UPDATE mentionsbrand SET rating = :rating " - + "WHERE tweetid = :tweetid AND brand = :brand"); - - Double value; - String text; - - //for all tuples - while (tweetBrandResults.next()) { - //get the text - text = tweetBrandResults.getString("text"); - text = splitPunctToWords(text); - // test is the tweet text you are going to analyze - String[] words = text.split("\\s+"); // text splitted into separate words - double positiverate = 0; // positive rating - - // Rate the text with unigrams - for (String word : words) { - value = unimap.get(word); - if (value != null) { - positiverate += unimap.get(word); - } - } - // Rate the text with bigrams - for (int i = 0; i < words.length - 1; i++) { - String pair = words[i] + " " + words[i + 1]; - value = bimap.get(pair); - if (value != null) { - positiverate += bimap.get(pair); - } - } - //insert the rating into the database - updateRating.setLong("tweetid", tweetBrandResults.getLong("tweetid")); - updateRating.setString("brand", tweetBrandResults.getString("brand")); - updateRating.setInt("rating", (int) (positiverate * 10)); - updateRating.executeUpdate(); - } - } - - /** - * Make a wordcloud of the results of some query. - * - * @param query The sql text for a query. - * @throws SQLException - * @throws FileNotFoundException - * @throws UnsupportedEncodingException - */ - public void makeWordCloud(String query) throws SQLException, FileNotFoundException, UnsupportedEncodingException { - - query(query); - //go to the start of the ResultSet data - if (data == null) { - System.err.println("data is empty, try querying first"); - return; - } - - String text; - String brand; - String[] words; - HashMap> wordcloud = new HashMap<>(); - - while (data.next()) { - //get brand - brand = data.getString("brand"); - //make hashmap for each brand - if (!wordcloud.containsKey(brand)) { - wordcloud.put(brand, new HashMap()); - } - //get the text - text = data.getString("text"); - //remove punctuation, convert to lowercase and split on words - text = removePunct(text); - text = text.toLowerCase(); - words = text.split("\\s+"); - //for all words - for (String word : words) { - //if it is empty, a space or a stripe, skip it - if (word.equals("") || word.equals(" ") || word.equals("-")) { - continue; - } - //if the word is already in the map, increment the amount - if (wordcloud.get(brand).containsKey(word)) { - wordcloud.get(brand).put(word, wordcloud.get(brand).get(word) + 1); - } //if the word is not already in the map, make an entry with amount = 1 - else { - wordcloud.get(brand).put(word, 1); - } - } - } - //print the words and their frequency in a csv file - mapToCSV(wordcloud, "wordcloud.csv", "brand,word,count"); - } - - //generate csv for disco from the query - public void disco(String query) throws SQLException, FileNotFoundException, UnsupportedEncodingException { - //do the query - query(query); - PrintWriter writer = new PrintWriter("output.csv", "UTF-8"); - //print the first row - for (int i = 1; i < data.getMetaData().getColumnCount(); i++) { - writer.print(data.getMetaData().getColumnLabel(i) + ", "); - } - writer.println(data.getMetaData().getColumnLabel(data.getMetaData().getColumnCount())); - //print the values - while (data.next()) { - for (int i = 1; i < data.getMetaData().getColumnCount(); i++) { - if (data.getObject(i) == null) { - writer.print(", "); - } else { - writer.print(data.getObject(i).toString().replaceAll("[,\n]", " ") + ", "); - } - } - if (data.getObject(data.getMetaData().getColumnCount()) == null) { - writer.println("0"); - } else { - writer.println(data.getObject(data.getMetaData().getColumnCount()).toString().replace(",", " ")); - } - } - writer.close(); - } - - public void getBrands() throws SQLException { - PreparedStatement statement; - //make a connection to the database and execute the query - statement = connection.prepareStatement("delete from mentionsbrand"); - statement.executeUpdate(); - BrandChecker checker = new BrandChecker("brandonlyrules.txt"); - query("select * from tweet"); - NamedPreparedStatement m_insertBrand = new NamedPreparedStatement(connection, QueryUtils.insertBrand); - while (data.next()) { - List brands = checker.getBrands(data.getString("text")); - if (brands.isEmpty()) { - QueryUtils.setInsertBrandParams(m_insertBrand, data.getLong("tweetid"), "no"); - m_insertBrand.executeUpdate(); - } else { - for (String brand : brands) { - QueryUtils.setInsertBrandParams(m_insertBrand, data.getLong("tweetid"), brand); - m_insertBrand.executeUpdate(); - } - } - } - } - - //gets the amount of users that tweet about a brand in a timezone - //makes a csv file timezone, brand, amount - public void timezone(String query) throws SQLException, FileNotFoundException, UnsupportedEncodingException { - query(query); - //hashmap timezone, brand, amount - HashMap> timeMap = new HashMap<>(); - String timezone; - String brand; - - while (data.next()) { - timezone = data.getString("timezone"); - brand = data.getString("brand"); - //if the timezone is already in the map - if (timeMap.containsKey(timezone)) { - //if the brand for that timezone is already in the map - if (timeMap.get(timezone).containsKey(brand)) { - //increment the amount - timeMap.get(timezone).put(brand, timeMap.get(timezone).get(brand) + 1); - } //if the brand for that timezone is not yet in the map - else { - //make a new entry for that brand with amount = 1 - timeMap.get(timezone).put(brand, 1); - } - } //if the timezone is not yet in the map - else { - //make a new hashmap for this map and fill it with the brand and the amount - timeMap.put(timezone, new HashMap()); - timeMap.get(timezone).put(brand, 1); - } - } - //make the CSV out of the map - mapToCSV(timeMap, "timezone.csv", "timezone,brand,count"); - } - - //replaces punctuation so it will be splitted - //also removes urls - private String splitPunctToWords(String text) { - text = text.replaceAll("https?://\\S*", ""); - text = text.replaceAll("[!?):;\"']", " $0"); - text = text.replaceAll("[.,-](\\s|$)", " $0"); - text = text.replaceAll("\\s[(\"']", "$0 "); - return text; - } - - //removes punctuation - //also removes urls - private String removePunct(String text) { - text = text.replaceAll("https?://\\S*", " "); - text = text.replaceAll("@\\S*", " "); - text = text.replaceAll("[^a-zA-Z0-9#_-]", " "); - return text; - } - - //prints a hashmap into a csv for a html application - //Hashmap> becomes key1, key2, value - //only for String, String, Integer - void mapToCSV(HashMap> map, String fileName, String firstLine) - throws FileNotFoundException, UnsupportedEncodingException { - - PrintWriter writer = new PrintWriter(fileName, "UTF-8"); - - writer.println(firstLine); - - //loop over brands - for (Entry en : map.entrySet()) { - //loop over words - for (Entry e : map.get(en.getKey()).entrySet()) { - writer.println(en.getKey() + "," + e.getKey() + "," + e.getValue()); - } - } - - writer.close(); - System.out.println("csv file made, please put it next to html file and run this"); - } -} +package main; + +import analysis.BrandChecker; +import database.NamedPreparedStatement; +import database.QueryUtils; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.PrintWriter; +import java.io.UnsupportedEncodingException; +import java.sql.Connection; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.util.List; +import java.util.HashMap; +import java.util.Map.Entry; +import java.util.Scanner; + +/** + * The sentiment analysis class that rates tweets based on a unigram and bigram + * set of weights. + */ +public class Analyzor { + + /** + * The map that matches single words to their weights. + */ + private final HashMap unimap = new HashMap(); + + /** + * The map that matches word pairs to their weights. + */ + private final HashMap bimap = new HashMap(); + + /** + * The results of a query, maybe return from query(). + */ + private ResultSet data; + + /** + * The persistent connection to the database. + */ + private final Connection connection; + + /** + * @param connection An open connection to the database. + */ + public Analyzor(Connection connection) { + this.connection = connection; + } + + /** + * Read the unigram and bigram lexica. + * + * @throws FileNotFoundException + */ + public void readLexicon() throws FileNotFoundException { + if (!unimap.isEmpty()) { + // data is already read. + return; + } + System.err.println("Trying to read lexicons..."); + // A unigram is in the format (WS = whitespace): + // word rating ??? ?? + // A bigram has an two WS-separated words instead of one. + try (Scanner uniScanner = new Scanner(new File("unigrams-pmilexicon.txt")); + Scanner biScanner = new Scanner(new File("bigrams-pmilexicon.txt"));) { + //Fill the map of unigrams + int lineno = 1; + while (uniScanner.hasNext()) { + + String words = uniScanner.next(); + Double d = Double.valueOf(uniScanner.next()); + unimap.put(words.toLowerCase(), d); + if (uniScanner.hasNextLine()) { + uniScanner.nextLine(); + } + lineno++; + + } + + //fill the map of bigrams + while (biScanner.hasNext()) { + String words = biScanner.next() + " " + biScanner.next(); + bimap.put(words.toLowerCase(), Double.valueOf(biScanner.next())); + if (biScanner.hasNextLine()) { + biScanner.nextLine(); + } + } + } + System.err.println("Lexicons are read."); + } + + /** + * Executes a query that the analyzer can analyze. + * + * @param query The query string to execute. + * @throws SQLException When database connection isn't available. + */ + public void query(String query) throws SQLException { + PreparedStatement statement; + //make a connection to the database and execute the query + statement = connection.prepareStatement(query); + data = statement.executeQuery(); + } + + /** + * Run a sentiment analysis and fill the database with the output. + * + * @param query The sql text for the query. + * @throws SQLException + * @throws IOException + */ + public void sentimentAnalysis(String query) throws SQLException, IOException { + NamedPreparedStatement tweetBrandStmt, updateRating; + + //read the lexicons + readLexicon(); + + // if you ever need to re-apply rating, use something like: + // UPDATE mentionsbrand SET rating = NULL WHERE ... + if (query.isEmpty()) { + query = "SELECT t.tweetid, t.text, b.brand FROM tweet t " + + "JOIN mentionsbrand b USING (tweetid) " + + "WHERE b.rating IS NULL"; + } + tweetBrandStmt = new NamedPreparedStatement(connection, + query); + ResultSet tweetBrandResults = tweetBrandStmt.executeQuery(); + + updateRating = new NamedPreparedStatement(connection, + "UPDATE mentionsbrand SET rating = :rating " + + "WHERE tweetid = :tweetid AND brand = :brand"); + + Double value; + String text; + + //for all tuples + while (tweetBrandResults.next()) { + //get the text + text = tweetBrandResults.getString("text"); + text = splitPunctToWords(text); + // test is the tweet text you are going to analyze + String[] words = text.split("\\s+"); // text splitted into separate words + double positiverate = 0; // positive rating + + // Rate the text with unigrams + for (String word : words) { + value = unimap.get(word); + if (value != null) { + positiverate += unimap.get(word); + } + } + // Rate the text with bigrams + for (int i = 0; i < words.length - 1; i++) { + String pair = words[i] + " " + words[i + 1]; + value = bimap.get(pair); + if (value != null) { + positiverate += bimap.get(pair); + } + } + //insert the rating into the database + updateRating.setLong("tweetid", tweetBrandResults.getLong("tweetid")); + updateRating.setString("brand", tweetBrandResults.getString("brand")); + updateRating.setInt("rating", (int) (positiverate * 10)); + updateRating.executeUpdate(); + } + } + + /** + * Make a wordcloud of the results of some query. + * + * @param query The sql text for a query. + * @throws SQLException + * @throws FileNotFoundException + * @throws UnsupportedEncodingException + */ + public void makeWordCloud(String query) throws SQLException, FileNotFoundException, UnsupportedEncodingException { + + query(query); + //go to the start of the ResultSet data + if (data == null) { + System.err.println("data is empty, try querying first"); + return; + } + + String text; + String brand; + String[] words; + HashMap> wordcloud = new HashMap<>(); + + while (data.next()) { + //get brand + brand = data.getString("brand"); + //make hashmap for each brand + if (!wordcloud.containsKey(brand)) { + wordcloud.put(brand, new HashMap()); + } + //get the text + text = data.getString("text"); + //remove punctuation, convert to lowercase and split on words + text = removePunct(text); + text = text.toLowerCase(); + words = text.split("\\s+"); + //for all words + for (String word : words) { + //if it is empty, a space or a stripe, skip it + if (word.equals("") || word.equals(" ") || word.equals("-")) { + continue; + } + //if the word is already in the map, increment the amount + if (wordcloud.get(brand).containsKey(word)) { + wordcloud.get(brand).put(word, wordcloud.get(brand).get(word) + 1); + } //if the word is not already in the map, make an entry with amount = 1 + else { + wordcloud.get(brand).put(word, 1); + } + } + } + //print the words and their frequency in a csv file + mapToCSV(wordcloud, "wordcloud.csv", "brand,word,count"); + } + + //generate csv for disco from the query + public void disco(String query) throws SQLException, FileNotFoundException, UnsupportedEncodingException { + //do the query + query(query); + PrintWriter writer = new PrintWriter("output.csv", "UTF-8"); + //print the first row + for (int i = 1; i < data.getMetaData().getColumnCount(); i++) { + writer.print(data.getMetaData().getColumnLabel(i) + ", "); + } + writer.println(data.getMetaData().getColumnLabel(data.getMetaData().getColumnCount())); + //print the values + while (data.next()) { + for (int i = 1; i < data.getMetaData().getColumnCount(); i++) { + if (data.getObject(i) == null) { + writer.print(", "); + } else { + writer.print(data.getObject(i).toString().replaceAll("[,\n]", " ") + ", "); + } + } + if (data.getObject(data.getMetaData().getColumnCount()) == null) { + writer.println("0"); + } else { + writer.println(data.getObject(data.getMetaData().getColumnCount()).toString().replace(",", " ")); + } + } + writer.close(); + } + + public void getBrands() throws SQLException { + PreparedStatement statement; + //make a connection to the database and execute the query + statement = connection.prepareStatement("delete from mentionsbrand"); + statement.executeUpdate(); + BrandChecker checker = new BrandChecker("brandonlyrules.txt"); + query("select * from tweet"); + NamedPreparedStatement m_insertBrand = new NamedPreparedStatement(connection, QueryUtils.insertBrand); + while (data.next()) { + List brands = checker.getBrands(data.getString("text")); + if (brands.isEmpty()) { + QueryUtils.setInsertBrandParams(m_insertBrand, data.getLong("tweetid"), "no"); + m_insertBrand.executeUpdate(); + } else { + for (String brand : brands) { + QueryUtils.setInsertBrandParams(m_insertBrand, data.getLong("tweetid"), brand); + m_insertBrand.executeUpdate(); + } + } + } + } + + //gets the amount of users that tweet about a brand in a timezone + //makes a csv file timezone, brand, amount + public void timezone(String query) throws SQLException, FileNotFoundException, UnsupportedEncodingException { + query(query); + //hashmap timezone, brand, amount + HashMap> timeMap = new HashMap<>(); + String timezone; + String brand; + + while (data.next()) { + timezone = data.getString("timezone"); + brand = data.getString("brand"); + //if the timezone is already in the map + if (timeMap.containsKey(timezone)) { + //if the brand for that timezone is already in the map + if (timeMap.get(timezone).containsKey(brand)) { + //increment the amount + timeMap.get(timezone).put(brand, timeMap.get(timezone).get(brand) + 1); + } //if the brand for that timezone is not yet in the map + else { + //make a new entry for that brand with amount = 1 + timeMap.get(timezone).put(brand, 1); + } + } //if the timezone is not yet in the map + else { + //make a new hashmap for this map and fill it with the brand and the amount + timeMap.put(timezone, new HashMap()); + timeMap.get(timezone).put(brand, 1); + } + } + + //add a legenda "timezone" that will make the legenda for the timezone map + final int legendaSize = 6000; + + timeMap.put("legenda" , new HashMap()); + timeMap.get("legenda").put("sony", legendaSize/6); + timeMap.get("legenda").put("lg", legendaSize/6); + timeMap.get("legenda").put("huawei", legendaSize/6); + timeMap.get("legenda").put("htc", legendaSize/6); + timeMap.get("legenda").put("samsung", legendaSize/6); + timeMap.get("legenda").put("apple", legendaSize/6); + + //make the CSV out of the map + mapToCSV(timeMap, "timezone.csv", "timezone,brand,count"); + } + + //replaces punctuation so it will be splitted + //also removes urls + private String splitPunctToWords(String text) { + text = text.replaceAll("https?://\\S*", ""); + text = text.replaceAll("[!?):;\"']", " $0"); + text = text.replaceAll("[.,-](\\s|$)", " $0"); + text = text.replaceAll("\\s[(\"']", "$0 "); + return text; + } + + //removes punctuation + //also removes urls + private String removePunct(String text) { + text = text.replaceAll("https?://\\S*", " "); + text = text.replaceAll("@\\S*", " "); + text = text.replaceAll("[^a-zA-Z0-9#_-]", " "); + return text; + } + + //prints a hashmap into a csv for a html application + //Hashmap> becomes key1, key2, value + //only for String, String, Integer + void mapToCSV(HashMap> map, String fileName, String firstLine) + throws FileNotFoundException, UnsupportedEncodingException { + + PrintWriter writer = new PrintWriter(fileName, "UTF-8"); + + writer.println(firstLine); + + //loop over brands + for (Entry en : map.entrySet()) { + //loop over words + for (Entry e : map.get(en.getKey()).entrySet()) { + writer.println(en.getKey() + "," + e.getKey() + "," + e.getValue()); + } + } + + writer.close(); + System.out.println("csv file made, please put it next to html file and run this"); + } +} -- cgit v1.2.1