diff options
Diffstat (limited to 'src/Chapter5/text')
-rw-r--r-- | src/Chapter5/text/EventSummaryExtractor.java | 269 | ||||
-rw-r--r-- | src/Chapter5/text/ExtractTopKeywords.java | 151 |
2 files changed, 0 insertions, 420 deletions
diff --git a/src/Chapter5/text/EventSummaryExtractor.java b/src/Chapter5/text/EventSummaryExtractor.java deleted file mode 100644 index e76f42e..0000000 --- a/src/Chapter5/text/EventSummaryExtractor.java +++ /dev/null @@ -1,269 +0,0 @@ -/* TweetTracker. Copyright (c) Arizona Board of Regents on behalf of Arizona State University - * @author shamanth - */ -package Chapter5.text; - -import Chapter5.support.DateInfo; -import java.io.BufferedReader; -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.io.InputStreamReader; -import java.text.ParseException; -import java.text.SimpleDateFormat; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Date; -import java.util.HashMap; -import java.util.Set; -import java.util.logging.Level; -import java.util.logging.Logger; -import org.json.JSONArray; -import org.json.JSONException; -import org.json.JSONObject; - -public class EventSummaryExtractor -{ - - final String DEF_INFILENAME = "ows.json"; - HashMap<String,ArrayList<String>> CATEGORIES = new HashMap<String,ArrayList<String>>(); - SimpleDateFormat twittersdm = new SimpleDateFormat("EEE MMM dd HH:mm:ss Z yyyy"); - SimpleDateFormat dayhoursdm = new SimpleDateFormat("yyyy-MM-dd:HH"); -// SimpleDateFormat daysdm = new SimpleDateFormat("MM/dd/yyyy"); - SimpleDateFormat hoursdm = new SimpleDateFormat("HH"); - - /** - * - */ - public void InitializeCategories() - { - ArrayList<String> people = new ArrayList<String>(); - people.add("protesters"); - people.add("people"); - CATEGORIES.put("People",people); - ArrayList<String> police = new ArrayList<String>(); - police.add("police"); - police.add("cops"); - police.add("nypd"); - police.add("raid"); - CATEGORIES.put("Police",police); - ArrayList<String> media = new ArrayList<String>(); - media.add("press"); - media.add("news"); - media.add("media"); - CATEGORIES.put("Media",media); - ArrayList<String> city = new ArrayList<String>(); - city.add("nyc"); - city.add("zucotti"); - city.add("park"); - CATEGORIES.put("Location",city); - ArrayList<String> judiciary = new ArrayList<String>(); - judiciary.add("judge"); - judiciary.add("eviction"); - judiciary.add("order"); - judiciary.add("court"); - CATEGORIES.put("Judiciary", judiciary); - } - - /** - * - * @param filename - * @return - */ - public JSONObject ExtractCategoryTrends(String filename) - { - JSONObject result = new JSONObject(); - try { - BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(filename), "UTF-8")); - String temp = ""; - Set<String> catkeys = CATEGORIES.keySet(); - HashMap<String,HashMap<String,Integer>> datecount = new HashMap<String,HashMap<String,Integer>>(); - while((temp = br.readLine())!=null) - { - Date d = new Date(); - try { - JSONObject jobj = new JSONObject(temp); - //Published time - if(!jobj.isNull("created_at")) - { - String time = ""; - try { - time = jobj.getString("created_at"); - } catch (JSONException ex) { - Logger.getLogger(EventSummaryExtractor.class.getName()).log(Level.SEVERE, null, ex); - } - if(time.isEmpty()) - { - continue; - } - else - { - try { - d = twittersdm.parse(time); - } catch (ParseException ex) { - continue; - } - } - } - else - if(!jobj.isNull("timestamp")) - { - long time = new Date().getTime(); - try{ - time = jobj.getLong("timestamp"); - }catch(JSONException ex) - { - ex.printStackTrace(); - } - d = new Date(); - d.setTime(time); - } - String datestr = dayhoursdm.format(d); - String text = jobj.getString("text").toLowerCase(); -// System.out.println(text); - for(String key:catkeys) - { - ArrayList<String> words = CATEGORIES.get(key); - for(String word:words) - { - if(text.contains(word)) - { - HashMap<String,Integer> categorycount = new HashMap<String,Integer>(); - if(datecount.containsKey(datestr)) - { - categorycount = datecount.get(datestr); - } - if(categorycount.containsKey(key)) - { - categorycount.put(key, categorycount.get(key)+1); - } - else - { - categorycount.put(key, 1); - } - //update the categorycount for the specific date - datecount.put(datestr, categorycount); - break; - } - } - } - } catch (JSONException ex) { - Logger.getLogger(EventSummaryExtractor.class.getName()).log(Level.SEVERE, null, ex); - } - } - //sort the dates - Set<String> datekeys = datecount.keySet(); - ArrayList<DateInfo> dinfos = new ArrayList<DateInfo>(); - for(String date:datekeys) - { - Date d = null; - try { - d = dayhoursdm.parse(date); - } catch (ParseException ex) { - Logger.getLogger(EventSummaryExtractor.class.getName()).log(Level.SEVERE, null, ex); - } - if(d!=null) - { - DateInfo info = new DateInfo(); - info.d = d; - info.catcounts = datecount.get(date); - dinfos.add(info); - } - } - Collections.sort(dinfos, Collections.reverseOrder()); - try { - result.put("axisxstep", dinfos.size()-1); - } catch (JSONException ex) { - Logger.getLogger(EventSummaryExtractor.class.getName()).log(Level.SEVERE, null, ex); - } - try { - result.put("axisystep", CATEGORIES.size()-1); - } catch (JSONException ex) { - Logger.getLogger(EventSummaryExtractor.class.getName()).log(Level.SEVERE, null, ex); - } - JSONArray xcoordinates = new JSONArray(); - JSONArray ycoordinates = new JSONArray(); - //now add the data and the axis labels - JSONArray axisxlabels = new JSONArray(); - JSONArray axisylabels = new JSONArray(); - JSONArray data = new JSONArray(); - for(String key:catkeys) - { - axisylabels.put(key); - } - //counters to mark the indices of the values added to data field. i is the x coordinate and j is the y coordinate - int i=0,j=0; - - for(DateInfo date:dinfos) - { - String strdate = hoursdm.format(date.d); - axisxlabels.put(strdate); - HashMap<String,Integer> catcounts = date.catcounts; - for(String key:catkeys) - { - xcoordinates.put(j); - ycoordinates.put(i++); - if(catcounts.containsKey(key)) - { - data.put(catcounts.get(key)); - } - else - { - data.put(0); - } - } - //reset the x coordinate as we move to the next y item - i=0; - j++; - } - try { - result.put("xcoordinates", xcoordinates); - } catch (JSONException ex) { - Logger.getLogger(EventSummaryExtractor.class.getName()).log(Level.SEVERE, null, ex); - } - try { - result.put("ycoordinates", ycoordinates); - } catch (JSONException ex) { - Logger.getLogger(EventSummaryExtractor.class.getName()).log(Level.SEVERE, null, ex); - } - try { - result.put("axisxlabels", axisxlabels); - } catch (JSONException ex) { - Logger.getLogger(EventSummaryExtractor.class.getName()).log(Level.SEVERE, null, ex); - } - try { - result.put("axisylabels", axisylabels); - } catch (JSONException ex) { - Logger.getLogger(EventSummaryExtractor.class.getName()).log(Level.SEVERE, null, ex); - } - try { - result.put("data", data); - } catch (JSONException ex) { - Logger.getLogger(EventSummaryExtractor.class.getName()).log(Level.SEVERE, null, ex); - } - br.close(); - } catch (IOException ex) { - Logger.getLogger(EventSummaryExtractor.class.getName()).log(Level.SEVERE, null, ex); - } - return result; - } - - public static void main(String[] args) - { - EventSummaryExtractor ese = new EventSummaryExtractor(); - String infilename = ese.DEF_INFILENAME; - if(args!=null) - { - if(args.length>=1&&!args[0].isEmpty()) - { - File fl = new File(args[0]); - if(fl.exists()) - { - infilename = args[0]; - } - } - } - ese.InitializeCategories(); - System.out.println(ese.ExtractCategoryTrends(infilename).toString()); - } -} diff --git a/src/Chapter5/text/ExtractTopKeywords.java b/src/Chapter5/text/ExtractTopKeywords.java deleted file mode 100644 index 8ab412a..0000000 --- a/src/Chapter5/text/ExtractTopKeywords.java +++ /dev/null @@ -1,151 +0,0 @@ -/* TweetTracker. Copyright (c) Arizona Board of Regents on behalf of Arizona State University - * @author shamanth - */ -package Chapter5.text; - -import java.io.BufferedReader; -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.io.InputStreamReader; -import java.util.ArrayList; -import java.util.Collections; -import java.util.HashMap; -import java.util.Set; -import java.util.logging.Level; -import java.util.logging.Logger; -import org.json.JSONArray; -import org.json.JSONException; -import org.json.JSONObject; -import utils.Tags; -import utils.TextUtils; - -public class ExtractTopKeywords -{ - - static final String DEF_INFILENAME = "ows.json"; - static final int DEF_K = 60; - - /** - * Extracts the most frequently occurring keywords from the tweets by processing them sequentially. Stopwords are ignored. - * @param inFilename File containing a list of tweets as JSON objects - * @param K Count of the top keywords to return - * @param ignoreHashtags If true, hashtags are not considered while counting the most frequent keywords - * @param ignoreUsernames If true, usernames are not considered while counting the most frequent keywords - * @param tu TextUtils object which handles the stopwords - * @return a JSONArray containing an array of JSONObjects. Each object contains two elements "text" and "size" referring to the word and it's frequency - */ - public JSONArray GetTopKeywords(String inFilename, int K, boolean ignoreHashtags, boolean ignoreUsernames, TextUtils tu) - { - HashMap<String, Integer> words = new HashMap<String,Integer>(); - BufferedReader br = null; - try{ - br = new BufferedReader(new InputStreamReader(new FileInputStream(inFilename),"UTF-8")); - String temp = ""; - while((temp = br.readLine())!=null) - { - try{ - JSONObject tweetobj = new JSONObject(temp); - if(!tweetobj.isNull("text")) - { - String text = tweetobj.getString("text"); - //System.out.println(text); - text = text.toLowerCase().replaceAll("\\s+", " "); - /** Step 1: Tokenize tweets into individual words. and count their frequency in the corpus - * Remove stop words and special characters. Ignore user names and hashtags if the user chooses to. - */ - HashMap<String,Integer> tokens = tu.TokenizeText(text,ignoreHashtags,ignoreUsernames); - Set<String> keys = tokens.keySet(); - for(String key:keys) - { - if(words.containsKey(key)) - { - words.put(key, words.get(key)+tokens.get(key)); - } - else - { - words.put(key, tokens.get(key)); - } - } - } - }catch(JSONException ex) - { - ex.printStackTrace(); - } - } - }catch(IOException ex) - { - ex.printStackTrace(); - }finally{ - try { - br.close(); - } catch (IOException ex) { - Logger.getLogger(ExtractTopKeywords.class.getName()).log(Level.SEVERE, null, ex); - } - } - Set<String> keys = words.keySet(); - ArrayList<Tags> tags = new ArrayList<Tags>(); - for(String key:keys) - { - Tags tag = new Tags(); - tag.setKey(key); - tag.setValue(words.get(key)); - tags.add(tag); - } - // Step 2: Sort the words in descending order of frequency - Collections.sort(tags, Collections.reverseOrder()); - JSONArray cloudwords = new JSONArray(); - int numwords = K; - if(tags.size()<numwords) - { - numwords = tags.size(); - } - for(int i=0;i<numwords;i++) - { - JSONObject wordfreq = new JSONObject(); - Tags tag = tags.get(i); - try{ - wordfreq.put("text", tag.getKey()); - wordfreq.put("size",tag.getValue()); - cloudwords.put(wordfreq); - }catch(JSONException ex) - { - ex.printStackTrace(); - } - } - return cloudwords; - } - - public static void main(String[] args) - { - ExtractTopKeywords etk = new ExtractTopKeywords(); - - //Initialize the TextUtils class which handles all the processing of text. - TextUtils tu = new TextUtils(); - tu.LoadStopWords("C:/tweettracker/stopwords.txt"); - String infilename = DEF_INFILENAME; - int K = DEF_K; - if(args!=null) - { - if(args.length>=1&&!args[0].isEmpty()) - { - File fl = new File(args[0]); - if(fl.exists()) - { - infilename = args[0]; - } - } - if(args.length>=2&&!args[1].isEmpty()) - { - try{ - K = Integer.parseInt(args[1]); - }catch(NumberFormatException ex) - { - ex.printStackTrace(); - } - } - } - System.out.println(etk.GetTopKeywords(infilename, K, false,true,tu)); - } - -} |