summaryrefslogtreecommitdiff
path: root/src/Chapter5/text
diff options
context:
space:
mode:
Diffstat (limited to 'src/Chapter5/text')
-rw-r--r--src/Chapter5/text/EventSummaryExtractor.java269
-rw-r--r--src/Chapter5/text/ExtractTopKeywords.java151
2 files changed, 0 insertions, 420 deletions
diff --git a/src/Chapter5/text/EventSummaryExtractor.java b/src/Chapter5/text/EventSummaryExtractor.java
deleted file mode 100644
index e76f42e..0000000
--- a/src/Chapter5/text/EventSummaryExtractor.java
+++ /dev/null
@@ -1,269 +0,0 @@
-/* TweetTracker. Copyright (c) Arizona Board of Regents on behalf of Arizona State University
- * @author shamanth
- */
-package Chapter5.text;
-
-import Chapter5.support.DateInfo;
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.text.ParseException;
-import java.text.SimpleDateFormat;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Date;
-import java.util.HashMap;
-import java.util.Set;
-import java.util.logging.Level;
-import java.util.logging.Logger;
-import org.json.JSONArray;
-import org.json.JSONException;
-import org.json.JSONObject;
-
-public class EventSummaryExtractor
-{
-
- final String DEF_INFILENAME = "ows.json";
- HashMap<String,ArrayList<String>> CATEGORIES = new HashMap<String,ArrayList<String>>();
- SimpleDateFormat twittersdm = new SimpleDateFormat("EEE MMM dd HH:mm:ss Z yyyy");
- SimpleDateFormat dayhoursdm = new SimpleDateFormat("yyyy-MM-dd:HH");
-// SimpleDateFormat daysdm = new SimpleDateFormat("MM/dd/yyyy");
- SimpleDateFormat hoursdm = new SimpleDateFormat("HH");
-
- /**
- *
- */
- public void InitializeCategories()
- {
- ArrayList<String> people = new ArrayList<String>();
- people.add("protesters");
- people.add("people");
- CATEGORIES.put("People",people);
- ArrayList<String> police = new ArrayList<String>();
- police.add("police");
- police.add("cops");
- police.add("nypd");
- police.add("raid");
- CATEGORIES.put("Police",police);
- ArrayList<String> media = new ArrayList<String>();
- media.add("press");
- media.add("news");
- media.add("media");
- CATEGORIES.put("Media",media);
- ArrayList<String> city = new ArrayList<String>();
- city.add("nyc");
- city.add("zucotti");
- city.add("park");
- CATEGORIES.put("Location",city);
- ArrayList<String> judiciary = new ArrayList<String>();
- judiciary.add("judge");
- judiciary.add("eviction");
- judiciary.add("order");
- judiciary.add("court");
- CATEGORIES.put("Judiciary", judiciary);
- }
-
- /**
- *
- * @param filename
- * @return
- */
- public JSONObject ExtractCategoryTrends(String filename)
- {
- JSONObject result = new JSONObject();
- try {
- BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(filename), "UTF-8"));
- String temp = "";
- Set<String> catkeys = CATEGORIES.keySet();
- HashMap<String,HashMap<String,Integer>> datecount = new HashMap<String,HashMap<String,Integer>>();
- while((temp = br.readLine())!=null)
- {
- Date d = new Date();
- try {
- JSONObject jobj = new JSONObject(temp);
- //Published time
- if(!jobj.isNull("created_at"))
- {
- String time = "";
- try {
- time = jobj.getString("created_at");
- } catch (JSONException ex) {
- Logger.getLogger(EventSummaryExtractor.class.getName()).log(Level.SEVERE, null, ex);
- }
- if(time.isEmpty())
- {
- continue;
- }
- else
- {
- try {
- d = twittersdm.parse(time);
- } catch (ParseException ex) {
- continue;
- }
- }
- }
- else
- if(!jobj.isNull("timestamp"))
- {
- long time = new Date().getTime();
- try{
- time = jobj.getLong("timestamp");
- }catch(JSONException ex)
- {
- ex.printStackTrace();
- }
- d = new Date();
- d.setTime(time);
- }
- String datestr = dayhoursdm.format(d);
- String text = jobj.getString("text").toLowerCase();
-// System.out.println(text);
- for(String key:catkeys)
- {
- ArrayList<String> words = CATEGORIES.get(key);
- for(String word:words)
- {
- if(text.contains(word))
- {
- HashMap<String,Integer> categorycount = new HashMap<String,Integer>();
- if(datecount.containsKey(datestr))
- {
- categorycount = datecount.get(datestr);
- }
- if(categorycount.containsKey(key))
- {
- categorycount.put(key, categorycount.get(key)+1);
- }
- else
- {
- categorycount.put(key, 1);
- }
- //update the categorycount for the specific date
- datecount.put(datestr, categorycount);
- break;
- }
- }
- }
- } catch (JSONException ex) {
- Logger.getLogger(EventSummaryExtractor.class.getName()).log(Level.SEVERE, null, ex);
- }
- }
- //sort the dates
- Set<String> datekeys = datecount.keySet();
- ArrayList<DateInfo> dinfos = new ArrayList<DateInfo>();
- for(String date:datekeys)
- {
- Date d = null;
- try {
- d = dayhoursdm.parse(date);
- } catch (ParseException ex) {
- Logger.getLogger(EventSummaryExtractor.class.getName()).log(Level.SEVERE, null, ex);
- }
- if(d!=null)
- {
- DateInfo info = new DateInfo();
- info.d = d;
- info.catcounts = datecount.get(date);
- dinfos.add(info);
- }
- }
- Collections.sort(dinfos, Collections.reverseOrder());
- try {
- result.put("axisxstep", dinfos.size()-1);
- } catch (JSONException ex) {
- Logger.getLogger(EventSummaryExtractor.class.getName()).log(Level.SEVERE, null, ex);
- }
- try {
- result.put("axisystep", CATEGORIES.size()-1);
- } catch (JSONException ex) {
- Logger.getLogger(EventSummaryExtractor.class.getName()).log(Level.SEVERE, null, ex);
- }
- JSONArray xcoordinates = new JSONArray();
- JSONArray ycoordinates = new JSONArray();
- //now add the data and the axis labels
- JSONArray axisxlabels = new JSONArray();
- JSONArray axisylabels = new JSONArray();
- JSONArray data = new JSONArray();
- for(String key:catkeys)
- {
- axisylabels.put(key);
- }
- //counters to mark the indices of the values added to data field. i is the x coordinate and j is the y coordinate
- int i=0,j=0;
-
- for(DateInfo date:dinfos)
- {
- String strdate = hoursdm.format(date.d);
- axisxlabels.put(strdate);
- HashMap<String,Integer> catcounts = date.catcounts;
- for(String key:catkeys)
- {
- xcoordinates.put(j);
- ycoordinates.put(i++);
- if(catcounts.containsKey(key))
- {
- data.put(catcounts.get(key));
- }
- else
- {
- data.put(0);
- }
- }
- //reset the x coordinate as we move to the next y item
- i=0;
- j++;
- }
- try {
- result.put("xcoordinates", xcoordinates);
- } catch (JSONException ex) {
- Logger.getLogger(EventSummaryExtractor.class.getName()).log(Level.SEVERE, null, ex);
- }
- try {
- result.put("ycoordinates", ycoordinates);
- } catch (JSONException ex) {
- Logger.getLogger(EventSummaryExtractor.class.getName()).log(Level.SEVERE, null, ex);
- }
- try {
- result.put("axisxlabels", axisxlabels);
- } catch (JSONException ex) {
- Logger.getLogger(EventSummaryExtractor.class.getName()).log(Level.SEVERE, null, ex);
- }
- try {
- result.put("axisylabels", axisylabels);
- } catch (JSONException ex) {
- Logger.getLogger(EventSummaryExtractor.class.getName()).log(Level.SEVERE, null, ex);
- }
- try {
- result.put("data", data);
- } catch (JSONException ex) {
- Logger.getLogger(EventSummaryExtractor.class.getName()).log(Level.SEVERE, null, ex);
- }
- br.close();
- } catch (IOException ex) {
- Logger.getLogger(EventSummaryExtractor.class.getName()).log(Level.SEVERE, null, ex);
- }
- return result;
- }
-
- public static void main(String[] args)
- {
- EventSummaryExtractor ese = new EventSummaryExtractor();
- String infilename = ese.DEF_INFILENAME;
- if(args!=null)
- {
- if(args.length>=1&&!args[0].isEmpty())
- {
- File fl = new File(args[0]);
- if(fl.exists())
- {
- infilename = args[0];
- }
- }
- }
- ese.InitializeCategories();
- System.out.println(ese.ExtractCategoryTrends(infilename).toString());
- }
-}
diff --git a/src/Chapter5/text/ExtractTopKeywords.java b/src/Chapter5/text/ExtractTopKeywords.java
deleted file mode 100644
index 8ab412a..0000000
--- a/src/Chapter5/text/ExtractTopKeywords.java
+++ /dev/null
@@ -1,151 +0,0 @@
-/* TweetTracker. Copyright (c) Arizona Board of Regents on behalf of Arizona State University
- * @author shamanth
- */
-package Chapter5.text;
-
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.Set;
-import java.util.logging.Level;
-import java.util.logging.Logger;
-import org.json.JSONArray;
-import org.json.JSONException;
-import org.json.JSONObject;
-import utils.Tags;
-import utils.TextUtils;
-
-public class ExtractTopKeywords
-{
-
- static final String DEF_INFILENAME = "ows.json";
- static final int DEF_K = 60;
-
- /**
- * Extracts the most frequently occurring keywords from the tweets by processing them sequentially. Stopwords are ignored.
- * @param inFilename File containing a list of tweets as JSON objects
- * @param K Count of the top keywords to return
- * @param ignoreHashtags If true, hashtags are not considered while counting the most frequent keywords
- * @param ignoreUsernames If true, usernames are not considered while counting the most frequent keywords
- * @param tu TextUtils object which handles the stopwords
- * @return a JSONArray containing an array of JSONObjects. Each object contains two elements "text" and "size" referring to the word and it's frequency
- */
- public JSONArray GetTopKeywords(String inFilename, int K, boolean ignoreHashtags, boolean ignoreUsernames, TextUtils tu)
- {
- HashMap<String, Integer> words = new HashMap<String,Integer>();
- BufferedReader br = null;
- try{
- br = new BufferedReader(new InputStreamReader(new FileInputStream(inFilename),"UTF-8"));
- String temp = "";
- while((temp = br.readLine())!=null)
- {
- try{
- JSONObject tweetobj = new JSONObject(temp);
- if(!tweetobj.isNull("text"))
- {
- String text = tweetobj.getString("text");
- //System.out.println(text);
- text = text.toLowerCase().replaceAll("\\s+", " ");
- /** Step 1: Tokenize tweets into individual words. and count their frequency in the corpus
- * Remove stop words and special characters. Ignore user names and hashtags if the user chooses to.
- */
- HashMap<String,Integer> tokens = tu.TokenizeText(text,ignoreHashtags,ignoreUsernames);
- Set<String> keys = tokens.keySet();
- for(String key:keys)
- {
- if(words.containsKey(key))
- {
- words.put(key, words.get(key)+tokens.get(key));
- }
- else
- {
- words.put(key, tokens.get(key));
- }
- }
- }
- }catch(JSONException ex)
- {
- ex.printStackTrace();
- }
- }
- }catch(IOException ex)
- {
- ex.printStackTrace();
- }finally{
- try {
- br.close();
- } catch (IOException ex) {
- Logger.getLogger(ExtractTopKeywords.class.getName()).log(Level.SEVERE, null, ex);
- }
- }
- Set<String> keys = words.keySet();
- ArrayList<Tags> tags = new ArrayList<Tags>();
- for(String key:keys)
- {
- Tags tag = new Tags();
- tag.setKey(key);
- tag.setValue(words.get(key));
- tags.add(tag);
- }
- // Step 2: Sort the words in descending order of frequency
- Collections.sort(tags, Collections.reverseOrder());
- JSONArray cloudwords = new JSONArray();
- int numwords = K;
- if(tags.size()<numwords)
- {
- numwords = tags.size();
- }
- for(int i=0;i<numwords;i++)
- {
- JSONObject wordfreq = new JSONObject();
- Tags tag = tags.get(i);
- try{
- wordfreq.put("text", tag.getKey());
- wordfreq.put("size",tag.getValue());
- cloudwords.put(wordfreq);
- }catch(JSONException ex)
- {
- ex.printStackTrace();
- }
- }
- return cloudwords;
- }
-
- public static void main(String[] args)
- {
- ExtractTopKeywords etk = new ExtractTopKeywords();
-
- //Initialize the TextUtils class which handles all the processing of text.
- TextUtils tu = new TextUtils();
- tu.LoadStopWords("C:/tweettracker/stopwords.txt");
- String infilename = DEF_INFILENAME;
- int K = DEF_K;
- if(args!=null)
- {
- if(args.length>=1&&!args[0].isEmpty())
- {
- File fl = new File(args[0]);
- if(fl.exists())
- {
- infilename = args[0];
- }
- }
- if(args.length>=2&&!args[1].isEmpty())
- {
- try{
- K = Integer.parseInt(args[1]);
- }catch(NumberFormatException ex)
- {
- ex.printStackTrace();
- }
- }
- }
- System.out.println(etk.GetTopKeywords(infilename, K, false,true,tu));
- }
-
-}