diff options
author | Peter Wu <peter@lekensteyn.nl> | 2014-04-23 12:22:20 +0200 |
---|---|---|
committer | Peter Wu <peter@lekensteyn.nl> | 2014-04-23 12:22:20 +0200 |
commit | 14d7547cd31c5be878e377a4a5370f604c8d59d4 (patch) | |
tree | 003840f1a21d39b07d45cd3112c38b6eed40e3ab /src/utils/TextUtils.java | |
download | TwitterDataAnalytics-14d7547cd31c5be878e377a4a5370f604c8d59d4.tar.gz |
Initial commit
build.xml, etc. are modified a bit after opening in Netbeans 7.4.
Diffstat (limited to 'src/utils/TextUtils.java')
-rw-r--r-- | src/utils/TextUtils.java | 212 |
1 files changed, 212 insertions, 0 deletions
diff --git a/src/utils/TextUtils.java b/src/utils/TextUtils.java new file mode 100644 index 0000000..764ce11 --- /dev/null +++ b/src/utils/TextUtils.java @@ -0,0 +1,212 @@ +/* TweetTracker. Copyright (c) Arizona Board of Regents on behalf of Arizona State University + * @author shamanth + */ +package utils; + +import java.io.BufferedReader; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.UnsupportedEncodingException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.logging.Level; +import java.util.logging.Logger; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class TextUtils +{ + //holds a list of stop words to be removed when generating word clouds etc. + HashSet<String> STOPWORDS = new HashSet<String>(); + + String SEPARATOR = " "; + + /** + * Loads the stop words from a file onto a collection. for use by all methods in this class + * @param filename + */ + public void LoadStopWords(String filename) + { + if(!filename.isEmpty()) + { + + BufferedReader bread = null; + try { + bread = new BufferedReader(new InputStreamReader(new FileInputStream(filename), "UTF8")); + String temp = ""; + try { + while ((temp = bread.readLine()) != null) { + if (!temp.isEmpty()) { + String[] stwords = temp.split(","); + for (String t : stwords) { + t = t.toLowerCase(); + if (!STOPWORDS.contains(t)) { + STOPWORDS.add(t); + } + } + } + } + } catch (IOException ex) { + Logger.getLogger(TextUtils.class.getName()).log(Level.SEVERE, null, ex); + } + } catch (UnsupportedEncodingException ex) { + Logger.getLogger(TextUtils.class.getName()).log(Level.SEVERE, null, ex); + } catch (FileNotFoundException ex) { + Logger.getLogger(TextUtils.class.getName()).log(Level.SEVERE, null, ex); + } finally { + try { + bread.close(); + } catch (IOException ex) { + Logger.getLogger(TextUtils.class.getName()).log(Level.SEVERE, null, ex); + } + } + } + } + + /** + * Converts a tweet/text into individual words/tokens. All stopwords are removed and the list also does not contain hyperlinks. + * Splitting is performed on space. + * @param text + * @param ignoreHashtags + * @param ignoreUsernames + * @return a list of words contained in text + */ + public HashMap<String,Integer> TokenizeText(String text, boolean ignoreHashtags, boolean ignoreUsernames) + { + String[] tokens = text.split(SEPARATOR); + HashMap<String,Integer> words = new HashMap<String,Integer>(); + for(String token:tokens) + { + token = token.replaceAll("\"|'|\\.||;|,", ""); + if(token.isEmpty()||token.length()<=2||STOPWORDS.contains(token)||token.startsWith("&")||token.startsWith("http")) + { + continue; + } + else + { + if(ignoreHashtags) + { + if(token.startsWith("#")) + { + continue; + } + } + if(ignoreUsernames) + { + if(token.startsWith("@")) + { + continue; + } + } + if(!words.containsKey(token)) + { + words.put(token,1); + } + else + { + words.put(token, words.get(token)+1); + } + } + } + return words; + } + + /** + * Checks whether the tweet is a retweet based on the presence of the RT pattern as the start of the text. Expects the tweet text to be in lowercase. + * @param text + * @return + */ + public static boolean IsTweetRT(String text) + { + Pattern p = Pattern.compile("^rt @[a-z_0-9]+"); + Matcher m = p.matcher(text); + if(m.find()) + { + return true; + } + return false; + } + + /** + * Checks whether the text contains a hyperlink in the text + * @param text + * @return + */ + public static boolean ContainsURL(String text) + { + Pattern urlpat = Pattern.compile("https?://[a-zA-Z0-9\\./]+"); + Matcher urlmat = urlpat.matcher(text); + if(urlmat.find()) + { + return true; + } + else + return false; + } + + /** + * extracts and returns a list of hashtags from the text + * @param text + * @return + */ + public static ArrayList<String> GetHashTags(String text) + { + Pattern p = Pattern.compile("#[a-zA-Z0-9]+"); + Matcher mat = p.matcher(text); + ArrayList<String> tags = new ArrayList<String>(); + while(mat.find()) + { + String tag = text.substring(mat.start(),mat.end()); + if(!tags.contains(tag.toLowerCase())) + { + tags.add(tag.toLowerCase()); + } + } + return tags; + } + + /** + * Removes LF and CR from the text as well as any quotes and backslashes + * @param text + * @return + */ + public static String GetCleanText(String text) + { + text = text.replaceAll("'|\"|"", ""); + text = text.replaceAll("\\\\", ""); + text = text.replaceAll("\r\n|\n|\r", " "); + text = text.trim(); + return text; + } + + /** + * Removes all patterns that correspond to Retweeted status leaving only original text + * @param tweet + * @return + */ + public static String RemoveRTElements(String tweet) + { + String text = tweet.replaceAll("rt @[a-z_A-Z0-9]+", " "); + text = text.replaceAll("RT @[a-z_A-Z0-9]+", " "); + text = text.replaceAll(":",""); + return text.trim(); + } + + /** + * Removes all hashtags, URLs, and usernames from the tweet text + * @param tweet + * @return + */ + public static String RemoveTwitterElements(String tweet) + { + String temptweet = tweet.replaceAll("#[a-zA-Z_0-9]+", ""); + temptweet = temptweet.replaceAll("https?://[a-zA-Z0-9\\./]+", ""); + temptweet = temptweet.replaceAll("@[a-zA-Z_0-9]+", ""); + temptweet = temptweet.replaceAll("[:?\\.;<>()]", ""); + return temptweet; + } + +} |