Initial commit

build.xml, etc. are modified a bit after opening in Netbeans 7.4.
author: Peter Wu <peter@lekensteyn.nl> 2014-04-23 12:22:20 +0200
committer: Peter Wu <peter@lekensteyn.nl> 2014-04-23 12:22:20 +0200
commit: 14d7547cd31c5be878e377a4a5370f604c8d59d4 (patch)
tree: 003840f1a21d39b07d45cd3112c38b6eed40e3ab /src/utils/TextUtils.java
download: TwitterDataAnalytics-14d7547cd31c5be878e377a4a5370f604c8d59d4.tar.gz
1 files changed, 212 insertions, 0 deletions
diff --git a/src/utils/TextUtils.java b/src/utils/TextUtils.java
new file mode 100644
index 0000000..764ce11
--- /dev/null
+++ b/src/utils/TextUtils.java
@@ -0,0 +1,212 @@
+/* TweetTracker. Copyright (c) Arizona Board of Regents on behalf of Arizona State University
+ * @author shamanth
+ */
+package utils;
+
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.UnsupportedEncodingException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+public class TextUtils
+{
+    //holds a list of stop words to be removed when generating word clouds etc.
+    HashSet<String> STOPWORDS = new HashSet<String>();
+
+    String SEPARATOR = " ";
+
+    //punctuation stripped from every token before it is counted
+    private static final Pattern TOKEN_PUNCTUATION = Pattern.compile("[\"'.;,]");
+    //retweet marker at the very start of (lowercased) tweet text
+    private static final Pattern RT_PREFIX = Pattern.compile("^rt @[a-z_0-9]+");
+    //http or https hyperlink
+    private static final Pattern URL = Pattern.compile("https?://[a-zA-Z0-9\\./]+");
+    //hashtag, underscores included
+    private static final Pattern HASHTAG = Pattern.compile("#[a-zA-Z_0-9]+");
+    //username mention
+    private static final Pattern MENTION = Pattern.compile("@[a-zA-Z_0-9]+");
+    //retweet marker anywhere in the text, but only when it starts at a word boundary
+    private static final Pattern RT_ELEMENT = Pattern.compile("\\b(?:rt|RT) @[a-z_A-Z0-9]+");
+
+    /**
+     * Loads stop words from a UTF-8 text file into STOPWORDS, for use by TokenizeText.
+     * Every line is split on commas; each entry is trimmed and lowercased before it is stored.
+     * I/O failures (including a missing file) are logged and not rethrown, so the words
+     * read before the failure stay loaded.
+     * @param filename path of the stop word file; null or empty loads nothing
+     */
+    public void LoadStopWords(String filename)
+    {
+        if(filename == null || filename.isEmpty())
+        {
+            return;
+        }
+        BufferedReader bread = null;
+        try
+        {
+            bread = new BufferedReader(new InputStreamReader(new FileInputStream(filename), "UTF8"));
+            String line;
+            while((line = bread.readLine()) != null)
+            {
+                for(String word : line.split(","))
+                {
+                    word = word.trim().toLowerCase();
+                    if(!word.isEmpty())
+                    {
+                        STOPWORDS.add(word);
+                    }
+                }
+            }
+        }
+        catch(IOException ex)
+        {
+            //also covers FileNotFoundException and UnsupportedEncodingException
+            Logger.getLogger(TextUtils.class.getName()).log(Level.SEVERE, null, ex);
+        }
+        finally
+        {
+            //bread is still null when the file could not be opened
+            if(bread != null)
+            {
+                try
+                {
+                    bread.close();
+                }
+                catch(IOException ex)
+                {
+                    Logger.getLogger(TextUtils.class.getName()).log(Level.SEVERE, null, ex);
+                }
+            }
+        }
+    }
+
+    /**
+     * Counts the words/tokens of a tweet/text. The text is split on SEPARATOR and quotes,
+     * periods, semicolons and commas are stripped from each token. Tokens that are at most
+     * two characters long, are contained in STOPWORDS, or start with "&" or "http" are
+     * dropped.
+     * Tokens are not lowercased here, while LoadStopWords stores stop words in lowercase.
+     * @param text text to tokenize; null yields an empty map
+     * @param ignoreHashtags when true, tokens starting with "#" are dropped
+     * @param ignoreUsernames when true, tokens starting with "@" are dropped
+     * @return a map from each remaining token to its number of occurrences in text
+     */
+    public HashMap<String,Integer> TokenizeText(String text, boolean ignoreHashtags, boolean ignoreUsernames)
+    {
+        HashMap<String,Integer> words = new HashMap<String,Integer>();
+        if(text == null)
+        {
+            return words;
+        }
+        for(String token : text.split(SEPARATOR))
+        {
+            token = TOKEN_PUNCTUATION.matcher(token).replaceAll("");
+            if(token.length()<=2||STOPWORDS.contains(token)||token.startsWith("&")||token.startsWith("http"))
+            {
+                continue;
+            }
+            if(ignoreHashtags && token.startsWith("#"))
+            {
+                continue;
+            }
+            if(ignoreUsernames && token.startsWith("@"))
+            {
+                continue;
+            }
+            Integer count = words.get(token);
+            words.put(token, count == null ? 1 : count + 1);
+        }
+        return words;
+    }
+
+    /**
+     * Checks whether the tweet is a retweet, i.e. whether the text starts with "rt @username".
+     * Expects the tweet text to be in lowercase.
+     * @param text lowercased tweet text
+     * @return true if text starts with the retweet pattern
+     */
+    public static boolean IsTweetRT(String text)
+    {
+        return RT_PREFIX.matcher(text).find();
+    }
+
+    /**
+     * Checks whether the text contains an http or https hyperlink.
+     * @param text text to search
+     * @return true if at least one hyperlink is found
+     */
+    public static boolean ContainsURL(String text)
+    {
+        return URL.matcher(text).find();
+    }
+
+    /**
+     * Extracts the distinct hashtags of the text, lowercased, in order of first appearance.
+     * @param text text to search
+     * @return the hashtags including the leading "#"; empty if there are none
+     */
+    public static ArrayList<String> GetHashTags(String text)
+    {
+        ArrayList<String> tags = new ArrayList<String>();
+        Matcher mat = HASHTAG.matcher(text);
+        while(mat.find())
+        {
+            String tag = mat.group().toLowerCase();
+            if(!tags.contains(tag))
+            {
+                tags.add(tag);
+            }
+        }
+        return tags;
+    }
+
+    /**
+     * Removes quotes (', " and the &quot; entity) and backslashes from the text, replaces
+     * every line break (CRLF, LF or CR) with a single space and trims the result.
+     * @param text text to clean
+     * @return the cleaned text
+     */
+    public static String GetCleanText(String text)
+    {
+        text = text.replaceAll("'|\"|&quot;", "");
+        text = text.replaceAll("\\\\", "");
+        text = text.replaceAll("\r\n|\n|\r", " ");
+        return text.trim();
+    }
+
+    /**
+     * Removes the retweet markers ("rt @username" / "RT @username") and all colons, leaving
+     * only the original text. A marker is only recognized at a word boundary, so words that
+     * merely end in "rt" (e.g. "start @user") are left intact.
+     * @param tweet tweet text
+     * @return the trimmed text without retweet markers and colons
+     */
+    public static String RemoveRTElements(String tweet)
+    {
+        String text = RT_ELEMENT.matcher(tweet).replaceAll(" ");
+        return text.replaceAll(":","").trim();
+    }
+
+    /**
+     * Removes all hashtags, URLs, and usernames from the tweet text, and afterwards the
+     * punctuation characters : ? . ; < > ( ).
+     * @param tweet tweet text
+     * @return the text without those elements; whitespace is left untouched
+     */
+    public static String RemoveTwitterElements(String tweet)
+    {
+        String temptweet = HASHTAG.matcher(tweet).replaceAll("");
+        temptweet = URL.matcher(temptweet).replaceAll("");
+        temptweet = MENTION.matcher(temptweet).replaceAll("");
+        return temptweet.replaceAll("[:?\\.;<>()]", "");
+    }
+}
author	Peter Wu <peter@lekensteyn.nl>	2014-04-23 12:22:20 +0200
committer	Peter Wu <peter@lekensteyn.nl>	2014-04-23 12:22:20 +0200
commit	14d7547cd31c5be878e377a4a5370f604c8d59d4 (patch)
tree	003840f1a21d39b07d45cd3112c38b6eed40e3ab /src/utils/TextUtils.java
download	TwitterDataAnalytics-14d7547cd31c5be878e377a4a5370f604c8d59d4.tar.gz