summaryrefslogtreecommitdiff
path: root/src/Chapter5/network/CreateD3Network.java
diff options
context:
space:
mode:
Diffstat (limited to 'src/Chapter5/network/CreateD3Network.java')
-rw-r--r--src/Chapter5/network/CreateD3Network.java716
1 files changed, 0 insertions, 716 deletions
diff --git a/src/Chapter5/network/CreateD3Network.java b/src/Chapter5/network/CreateD3Network.java
deleted file mode 100644
index d4c25af..0000000
--- a/src/Chapter5/network/CreateD3Network.java
+++ /dev/null
@@ -1,716 +0,0 @@
-/*
- * To change this template, choose Tools | Templates
- * and open the template in the editor.
- */
-
-package Chapter5.network;
-
-
-import Chapter5.support.HashTagDS;
-import Chapter5.support.NetworkNode;
-import Chapter5.support.NodeIDComparator;
-import Chapter5.support.NodeSizeComparator;
-import Chapter5.support.ToNodeInfo;
-import Chapter5.support.Tweet;
-import java.io.BufferedReader;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.Set;
-import java.util.logging.Level;
-import java.util.logging.Logger;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-import org.json.JSONArray;
-import org.json.JSONException;
-import org.json.JSONObject;
-import utils.TextUtils;
-
-/**
- *
- * @author shamanth
- */
-public class CreateD3Network
-{
- static final String DEF_INFILENAME = "ows.json";
- private String RTPATTERN = "rt @[_a-zA-Z0-9]+";
- private final int DEFAULT_NODE_SIZE = 0;
-// private final int NODE_COUNT_LIMIT = 1;
- //private final String[] node_color_scheme = new String[]{"#FFFFD9","#EDF8B1","#C7E9B4","#7FCDBB","#41B6C4","#1D91C0","#225EA8","#253494","#081D58"};
- //private final String[] node_color_scheme = new String[]{"#A6BDDB","#74A9CF","#3690C0","#0570B0","#045A8D","#023858"};
-
- /**
- * Extracts the users who have been retweeted using the RTPATTERN
- * @param text
- * @return
- */
- public ArrayList<String> GetRTUsers(String text)
- {
- Pattern p = Pattern.compile(RTPATTERN, Pattern.CASE_INSENSITIVE);
- Matcher m = p.matcher(text);
- ArrayList<String> rtusers = new ArrayList<String>();
- while(m.find())
- {
- String nuser = text.substring(m.start(),m.end());
- nuser = nuser.replaceAll("rt @|RT @", "");
-// nuser = nuser.replaceAll("RT @", "");
- rtusers.add(nuser.toLowerCase());
- }
- return rtusers;
- }
-
- /**
- * Identifies the category to which the tweet belongs. Each category is defined by a group of words/hashtags
- * @param tweet
- * @param usercategories
- * @return
- */
- public int GetCategory(String tweet, HashTagDS[] usercategories)
- {
- HashMap<Integer,Integer> categoryvotes = new HashMap<Integer,Integer>();
- tweet = tweet.toLowerCase();
- int i=0;
- for(HashTagDS cat:usercategories)
- {
-
- for(String s :cat.tags)
- {
- if(tweet.indexOf(s)!=-1)
- {
- if(categoryvotes.containsKey(i))
- {
- categoryvotes.put(i, categoryvotes.get(i)+1);
- }
- else
- {
- categoryvotes.put(i, 1);
- }
- }
- }
- i++;
- }
- Set<Integer> keyset = categoryvotes.keySet();
- int maxvote = 0;
- //by default the tweet will be in the first category
- int maxcategoryindex = 0;
- for(int key:keyset)
- {
- if(categoryvotes.get(key)>maxvote)
- {
- maxvote = categoryvotes.get(key);
- maxcategoryindex = key;
- }
- }
- return maxcategoryindex;
- }
-
- /**
- * Converts the input jsonobject containing category descriptions to an array for processing.
- * @param hashtagcoll JSONObject containing the list of hashtags, color, and the topic information
- * @return An array of hashtags
- */
- public HashTagDS[] ConvertJSONArrayToArray(JSONObject hashtagcoll)
- {
- HashTagDS[] hashtags = new HashTagDS[hashtagcoll.length()];
- int j=0;
- try{
- if(hashtagcoll!=null)
- {
- Iterator keyit = hashtagcoll.keys();
- while(keyit.hasNext())
- {
- HashTagDS ht = new HashTagDS();
- JSONObject tags = (JSONObject) hashtagcoll.get((String)keyit.next());
- ht.groupname = keyit.toString();
- ht.color = tags.getString("color");
- JSONArray tagjson = tags.getJSONArray("hts");
- ht.tags = new String[tagjson.length()];
- for(int i=0;i<tagjson.length();i++)
- {
- ht.tags[i] = tagjson.getString(i);
- }
- hashtags[j++] = ht;
- }
- }
- }catch(JSONException ex)
- {
- ex.printStackTrace();
- }
- return hashtags;
- }
-
- /**
- * Identifies the category of a node based on the content of his tweets(each tweet can be assigned a category based on it's text). A simple majority is sufficient to make this decision.
- * @param tnfs
- * @param hashtagarray
- * @return
- */
- public int GetMajorityTopicColor(NetworkNode tnfs,HashTagDS[] hashtagarray)
- {
- HashMap<Integer,Integer> catcount = new HashMap<Integer,Integer>();
- //if the node has no tolinks then look at the node that it retweeted to decide the color of the node
- for(String tweet:tnfs.data)
- {
- int id = this.GetCategory(tweet, hashtagarray);
- if(catcount.containsKey(id))
- {
- catcount.put(id, catcount.get(id)+1);
- }
- else
- catcount.put(id, 1);
- }
- Set<Integer> keys = catcount.keySet();
- int maxcatID = -1;
- int maxcount = 0;
- for(int k:keys)
- {
- if(maxcatID==-1)
- {
- maxcatID = k;
- maxcount = catcount.get(k);
- }
- else
- {
- if(maxcount<catcount.get(k))
- {
- maxcount = catcount.get(k);
- maxcatID = k;
- }
- }
- }
- return maxcatID;
- }
-
- /**
- * Takes as input a JSON file and reads through the file sequentially to process and create a retweet network from the tweets.
- * @param inFilename
- * @param numNodeClasses
- * @param hashtags category info containing hashtags
- * @param num_nodes number of seed nodes to be included in the network
- * @return a JSONObject consisting of nodes and links of the network
- */
- public JSONObject ConvertTweetsToDiffusionPath(String inFilename,int numNodeClasses,
- JSONObject hashtags, int num_nodes)
- {
- HashMap<String,NetworkNode> userconnections = new HashMap<String,NetworkNode>();
-// HashMap<String,Integer> tweet_class_codes = new HashMap<String,Integer>();
-// int tweet_class_counter = 1;
- HashTagDS[] hashtagarray = ConvertJSONArrayToArray(hashtags);
- BufferedReader br = null;
- try{
- br = new BufferedReader(new InputStreamReader(new FileInputStream(inFilename),"UTF-8"));
- String temp = "";
- while((temp = br.readLine())!=null)
- {
- JSONObject tweetobj;
- try {
- tweetobj = new JSONObject(temp);
- } catch (JSONException ex) {
- ex.printStackTrace();
- continue;
- }
- //Extract the tweet first
- Tweet t = new Tweet();
- String text="";
- try {
- text = TextUtils.GetCleanText(tweetobj.getString("text")).toLowerCase();
- } catch (JSONException ex) {
- ex.printStackTrace();
- continue;
- }
- //Check that the tweet matches at least one of the topics
- boolean groupmatch = false;
- for(HashTagDS ht:hashtagarray)
- {
- String[] tags = ht.tags;
- for(String tg:tags)
- {
- if(text.contains(tg))
- {
- groupmatch = true;
- break;
- }
- }
- if(groupmatch)
- {
- break;
- }
- }
- if(!groupmatch)
- {
- continue;
- }
- //
- ArrayList<String> fromusers = new ArrayList<String>();
- if(!tweetobj.isNull("retweeted_status"))
- {
- JSONObject rtstatus;
- try {
- rtstatus = tweetobj.getJSONObject("retweeted_status");
- if(rtstatus.isNull("user"))
- {
- JSONObject rtuserobj = rtstatus.getJSONObject("user");
- try{
- fromusers.add(rtuserobj.get("screen_name").toString());
- }catch(JSONException ex)
- {
- ex.printStackTrace();
- }
- }
- } catch (JSONException ex) {
- Logger.getLogger(CreateD3Network.class.getName()).log(Level.SEVERE, null, ex);
- }
- }
- else
- {
- //use the tweet text to retrieve the pattern "RT @username:"
- fromusers = GetRTUsers(text);
- }
- if(fromusers.isEmpty())
- {
- continue;
- }
-
- //identify the class values to be applied to all the nodes and
- //edges.
-// String prunedtext = TextUtils.RemoveTwitterElements(text);
-// Integer class_code = tweet_class_codes.get(prunedtext);
-// if(class_code==null)
-// {
-// class_code = tweet_class_counter;
-// tweet_class_codes.put(prunedtext, class_code); //set the unique id for this tweet
-// tweet_class_counter++;
-// }
- t.text = TextUtils.RemoveRTElements(text);
- if(!tweetobj.isNull("user"))
- {
- JSONObject userobj;
- try {
- userobj = tweetobj.getJSONObject("user");
- t.user = userobj.getString("screen_name").toLowerCase();
- } catch (JSONException ex) {
- Logger.getLogger(CreateD3Network.class.getName()).log(Level.SEVERE, null, ex);
- }
- }
-// try {
-// t.pubdate = String.valueOf(tweetobj.get("timestamp"));
-// } catch (JSONException ex) {
-// Logger.getLogger(CreateD3Network.class.getName()).log(Level.SEVERE, null, ex);
-// }
- t.catColor = hashtagarray[t.catID].color;
- //update the size of the from fromuser
- int cur_level = 0;
- for(int i=fromusers.size()-1;i>=0;i--)
- {
- String touser = "";
- if(i==0)
- {//if this is the last user in the retweet sequence then use the user of the tweet as the next link
- touser = t.user;
- }
- else
- { //if there are still fromuser in the retweet chain then use them as the next link
- touser = fromusers.get(i-1);
- }
- //don't add any selflinks
- if(fromusers.get(i).equals(touser))
- {
- continue;
- }
- NetworkNode fromuser = null;
- if(userconnections.containsKey(fromusers.get(i)))
- {
- //from node already exists simply add this new connection to it
- fromuser = userconnections.get(fromusers.get(i));
- }
- else
- {
- //the from user was not found. add the node
- fromuser = new NetworkNode();
- // fromuser.id = nodeid++;
- fromuser.username = fromusers.get(i);
- fromuser.tonodes = new ArrayList<ToNodeInfo>();
- fromuser.class_codes = new ArrayList<Integer>();
- fromuser.size = DEFAULT_NODE_SIZE;
- fromuser.level = cur_level;
- fromuser.data = new ArrayList<String>();
- fromuser.data.add(t.text);
- //fromuser.category = ;
- }
-// if(!fromuser.class_codes.contains(class_code))
-// {
-// //add the marker to from node if it does not have it already
-// fromuser.class_codes.add(class_code);
-// }
- //if to node is not in the list then create it
- NetworkNode tonode = null;
- if(!userconnections.containsKey(touser))
- {
- tonode = new NetworkNode();
- // System.out.println(touser+" "+nodeid);
- // tonode.id= nodeid++;
- tonode.username = touser;
- tonode.tonodes= new ArrayList<ToNodeInfo>();
- tonode.class_codes = new ArrayList<Integer>();
- tonode.catID = t.catID;
- tonode.catColor = t.catColor;
- tonode.size = DEFAULT_NODE_SIZE;
- tonode.data= new ArrayList<String>();
- tonode.data.add(t.text);
- tonode.level = cur_level+1;
- //add the classcode to the node if it doesn't already exist
-// if(!tonode.class_codes.contains(class_code))
-// {
-// tonode.class_codes.add(class_code);
-// }
- //add the touser info
- userconnections.put(touser, tonode);
- }
- else
- {
- tonode = userconnections.get(touser);
- tonode.data.add(t.text);
- if(tonode.level<cur_level+1)
- {
- tonode.level = cur_level;
- }
- //add the classcode to the node if it doesn't already exist
-// if(!tonode.class_codes.contains(class_code))
-// {
-// tonode.class_codes.add(class_code);
-// }
- }
- ToNodeInfo inf = new ToNodeInfo();
- inf.tonodeid = tonode.id;
- inf.text = t.text;
-// inf.date = t.pubdate;
-// inf.class_code = class_code;
- inf.tousername = touser;
- inf.catID = t.catID;
- inf.catColor = t.catColor;
- fromuser.tonodes.add(inf);
- //update from node size
- fromuser.size++;
- //add back updated fromuser
- userconnections.put(fromusers.get(i), fromuser);
- //update the level for next iteration
- cur_level++;
- }
- }
- }catch(IOException ex)
- {
- ex.printStackTrace();
- }
- Set<String> keys = userconnections.keySet();
- ArrayList<NetworkNode> returnnodes = new ArrayList<NetworkNode>();
- //its +1 because nodes with size 0 are not going to be used to calculate the class
- int min = DEFAULT_NODE_SIZE+1;
- int max = DEFAULT_NODE_SIZE+1;
- for(String k:keys)
- {
- NetworkNode n = userconnections.get(k);
- int maxcat = GetMajorityTopicColor(n,hashtagarray);
- n.catID = maxcat;
- n.catColor = hashtagarray[maxcat].color;
- userconnections.put(k, n);
- //
-// if(n.size==0)
-// {//mark the node as a zero node
-// n.class_codes.add(-1);
-// }
-// else
-// {
- if(n.size>max)
- {
- max = n.size;
- }
- if(n.size<min)
- {
- min = n.size;
- }
-// }
- returnnodes.add(n);
- }
- //create node groups to assign unique colors to nodes in different Categories based upon the number of connections
- ArrayList<NetworkNode> nodes = ComputeGroupsSqrt(returnnodes, max, min, numNodeClasses);
- Collections.sort(nodes,Collections.reverseOrder(new NodeSizeComparator()));
- //select how many nodes to show.
- int nodes_to_visit = 0;
- if(nodes.size()>=num_nodes)
- {
- nodes_to_visit = num_nodes;
- }
- else
- {
- nodes_to_visit = nodes.size();
- }
-
- HashMap<String,NetworkNode> prunednodes = new HashMap<String,NetworkNode>();
- HashMap<String,Integer> nodeidlist = new HashMap<String,Integer>();
- int nodeid = 0; //node nodeid counter
- for(int k=0;k<nodes_to_visit;k++)
- {
- NetworkNode nd = nodes.get(k);
-// System.out.println("visiting node "+nd.username);
- nd.level = 0;
- HashMap<String,NetworkNode> rtnodes = GetNextHopConnections(userconnections,nd,new HashMap<String,NetworkNode>());
- Set<String> names = rtnodes.keySet();
- for(String n:names)
- {
- if(!prunednodes.containsKey(n))
- {
- NetworkNode newnode = rtnodes.get(n);
- if(newnode.size>0)
- {
- prunednodes.put(n, newnode);
- nodeidlist.put(n, nodeid++);
- }
- }
- }
- }
-
- /** We now have all the nodes of the network. compute their ids sequentially
- * and assign them to the respective nodes. Simultaneously compact the nodes
- * of the network to remove all nodes which have not been retweeted and are
- * of size 0
- */
-
- Set<String> allnodes = prunednodes.keySet();
-// System.out.println(prunednodes.size());
- ArrayList<NetworkNode> finalnodes = new ArrayList<NetworkNode>();
-// HashMap<Integer,ArrayList<Integer>> conninfo = new HashMap<Integer,ArrayList<Integer>>();
- for(String n:allnodes)
- {
- NetworkNode nd = prunednodes.get(n);
- nd.id = nodeidlist.get(nd.username);
- ArrayList<Integer> connids = new ArrayList<Integer>();
-// ArrayList<ToNodeInfo> compact_To_nodes = new ArrayList<ToNodeInfo>();
- int counter = 0;
- for(ToNodeInfo tnf: nd.tonodes)
- {
- //user has never been retweeted. the chain terminates here, so remove it
- if(nodeidlist.containsKey(tnf.tousername))
- {
- tnf.tonodeid = nodeidlist.get(tnf.tousername);
- connids.add(tnf.tonodeid);
- nd.tonodes.set(counter, tnf);
- counter++;
- }
- }
- finalnodes.add(nd);
- //store the connections to compute the clusterids later
-// if(!conninfo.containsKey(nd.id))
-// {
-// conninfo.put(nd.id, connids);
-// }
- }
- //generate the clusterids
-// ArrayList<Integer>[] clusterids = (ArrayList<Integer>[])new ArrayList[allnodes.size()];
-// Set<Integer> idkeys = conninfo.keySet();
-// for(int id:idkeys)
-// {
-// for(int x:conninfo.get(id))
-// {
-// if(clusterids[x]==null)
-// {
-// ArrayList<Integer> toclusterid = new ArrayList<Integer>();
-// toclusterid.add(id);
-// clusterids[x] = toclusterid;
-// }
-// else
-// {
-// ArrayList<Integer> toclusterid = clusterids[x];
-// if(!toclusterid.contains(id))
-// {
-// toclusterid.add(id);
-// clusterids[x] = toclusterid;
-// }
-// }
-// }
-// }
- //now create the final node list with the clusterids
-// for(String n:allnodes)
-// {
-// NetworkNode nd = prunednodes.get(n);
-// ArrayList<Integer> cids = clusterids[nd.id];
-// if(cids!=null)
-// {
-// int size = cids.size();
-// nd.clusterID = new int[size+1];
-// int counter=0;
-// nd.clusterID[counter++] = nd.id;
-// for(int c:cids)
-// {
-// nd.clusterID[counter++] = c;
-// }
-// }
- //System.out.println(nd.class_codes.toString());
-// finalnodes.add(nd);
-// }
- Collections.sort(finalnodes,new NodeIDComparator());
- System.out.println(finalnodes.size());
- for(NetworkNode node:finalnodes)
- {
- System.out.println(node.id+" "+node.username+" "+node.level+" "+node.size+" "+node.catColor+node.data.get(0));
- }
- return GetD3Structure(finalnodes);
- }
-
- /**
- * Creates a D3 representation of the nodes, consisting of two JSONArray a set of nodes and a set of links between the nodes
- * @param finalnodes
- * @return
- */
- public JSONObject GetD3Structure(ArrayList<NetworkNode> finalnodes)
- {
- JSONObject alltweets = new JSONObject();
- try {
- JSONArray nodes = new JSONArray();
- JSONArray links = new JSONArray();
- for (NetworkNode node : finalnodes)
- {
- try {
- //create adjacencies
- JSONArray nodedata = new JSONArray();
- for (ToNodeInfo tnf : node.tonodes) {
- JSONObject jsadj = new JSONObject();
- jsadj.put("source", node.id);
- jsadj.put("target", tnf.tonodeid);
- //weight of the edge
- jsadj.put("value", 1);
- //class code is a unique id corresponding to the text
- jsadj.put("data", tnf.class_code);
- links.put(jsadj);
- //create a data object for the node
- JSONObject jsdata = new JSONObject();
- jsdata.put("tonodeid", tnf.tonodeid);
- jsdata.put("nodefrom", node.username);
- jsdata.put("nodeto", tnf.tousername);
- jsdata.put("tweet", tnf.text);
-// jsdata.put("pubtime", tnf.date);
- //class code for tweet to be used to filter
-// jsdata.put("classcode", tnf.class_code);
- nodedata.put(jsdata);
- }
- //add node
- JSONObject nd = new JSONObject();
- nd.put("name", node.username);
- nd.put("group", node.group);
- nd.put("id", node.id);
- nd.put("size", node.size);
- nd.put("catColor", node.catColor);
- nd.put("catID", node.catID);
- nd.put("data", nodedata);
- nd.put("level", node.level);
- //clusterids for the node
-// JSONArray cids = new JSONArray();
-// if (node.clusterID != null) {
-// for (int code : node.clusterID) {
-// cids.put(code);
-// }
-// } else {
-// cids.put(node.id);
-// }
-// nd.put("clusterids", cids);
- //classcodes for the node
-// JSONArray codes = new JSONArray();
-// for (int c : node.class_codes) {
-// codes.put(c);
-// }
-// nd.put("classcodes", codes);
- nodes.put(nd);
- } catch (JSONException ex) {
- ex.printStackTrace();
- }
- }
- alltweets.put("nodes", nodes);
- alltweets.put("links", links);
- } catch (JSONException ex) {
- Logger.getLogger(CreateD3Network.class.getName()).log(Level.SEVERE, null, ex);
- }
- return alltweets;
- }
-
- /**
- * Recursively traverses the list of nodes to identify all nodes reachable from a starting node.
- * @param userconnections A map containing the usernames as keys and the node information as value
- * @param cur_node Node currently being processed.
- * @param newnodes A list of nodes which can be reached from the current node
- * @return A map of the usernames and the node information for all nodes reachable
- */
- public HashMap<String,NetworkNode> GetNextHopConnections(HashMap<String,NetworkNode> userconnections,NetworkNode cur_node,HashMap<String,NetworkNode> newnodes)
- {
- cur_node.level = cur_node.level+1;
- newnodes.put(cur_node.username,cur_node);
- for(int i=0;i<cur_node.tonodes.size();i++)
- {
- ToNodeInfo tnf = cur_node.tonodes.get(i);
- if(newnodes.containsKey(tnf.tousername))
- {
- continue;
- }
-
- HashMap<String,NetworkNode> rtnodes = GetNextHopConnections(userconnections, userconnections.get(tnf.tousername),newnodes);
- newnodes = rtnodes;
- }
- return newnodes;
- }
-
- /**
- * Divides a list of nodes into groups using the square root binning
- * technique. If a node has size x and there are y groups in total. Then the
- * group of the node is computed as ceil((sqrt(x)/sqrt(max))*y), where max is
- * the size of the largest node.
- * @param nodes A list of nodes
- * @param max The maximum size of a node
- * @param min The minimum size of a node
- * @param noofclasses Number of classes into which the nodes must be classified
- * @return A list of nodes along with their class
- */
- public ArrayList<NetworkNode> ComputeGroupsSqrt(ArrayList<NetworkNode> nodes, int max, int min, int noofclasses)
- {
- ArrayList<NetworkNode> finalnodes = new ArrayList<NetworkNode>();
- for(int i=0;i<nodes.size();i++)
- {
- NetworkNode node = nodes.get(i);
- int color_index = 0;
- if(node.size>0)
- {
- color_index = (int) Math.ceil(((double)Math.sqrt(node.size)/Math.sqrt(max))*noofclasses)-1;
-// node.size = color_index*6;
- }
- node.group = color_index;
- finalnodes.add(node);
- }
- return finalnodes;
- }
-
-
- //DEBUG use only
- public static void main(String[] args)
- {
- try {
- CreateD3Network cdn = new CreateD3Network();
- JSONObject jobj = new JSONObject();
- JSONObject obj = new JSONObject();
- obj.put("color", "#800000");
- JSONArray ja = new JSONArray();
- ja.put("zuccotti");
- obj.put("hts", ja);
- jobj.put("Group 1", obj);
- obj = new JSONObject();
- obj.put("color", "#0FFF00");
- ja = new JSONArray();
- ja.put("#nypd");
- obj.put("hts", ja);
- jobj.put("Group 2", obj);
- String filename = "D:\\Twitter Data Analytics\\Data\\testows.json";
- JSONObject nodes = cdn.ConvertTweetsToDiffusionPath(filename,7, jobj,5);
- } catch (JSONException ex) {
- ex.printStackTrace();
- }
- }
-}