/* * To change this template, choose Tools | Templates * and open the template in the editor. */ package Chapter5.network; import Chapter5.support.HashTagDS; import Chapter5.support.NetworkNode; import Chapter5.support.NodeIDComparator; import Chapter5.support.NodeSizeComparator; import Chapter5.support.ToNodeInfo; import Chapter5.support.Tweet; import java.io.BufferedReader; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.Iterator; import java.util.Set; import java.util.logging.Level; import java.util.logging.Logger; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.json.JSONArray; import org.json.JSONException; import org.json.JSONObject; import utils.TextUtils; /** * * @author shamanth */ public class CreateD3Network { static final String DEF_INFILENAME = "ows.json"; private String RTPATTERN = "rt @[_a-zA-Z0-9]+"; private final int DEFAULT_NODE_SIZE = 0; // private final int NODE_COUNT_LIMIT = 1; //private final String[] node_color_scheme = new String[]{"#FFFFD9","#EDF8B1","#C7E9B4","#7FCDBB","#41B6C4","#1D91C0","#225EA8","#253494","#081D58"}; //private final String[] node_color_scheme = new String[]{"#A6BDDB","#74A9CF","#3690C0","#0570B0","#045A8D","#023858"}; /** * Extracts the users who have been retweeted using the RTPATTERN * @param text * @return */ public ArrayList GetRTUsers(String text) { Pattern p = Pattern.compile(RTPATTERN, Pattern.CASE_INSENSITIVE); Matcher m = p.matcher(text); ArrayList rtusers = new ArrayList(); while(m.find()) { String nuser = text.substring(m.start(),m.end()); nuser = nuser.replaceAll("rt @|RT @", ""); // nuser = nuser.replaceAll("RT @", ""); rtusers.add(nuser.toLowerCase()); } return rtusers; } /** * Identifies the category to which the tweet belongs. Each category is defined by a group of words/hashtags * @param tweet * @param usercategories * @return */ public int GetCategory(String tweet, HashTagDS[] usercategories) { HashMap categoryvotes = new HashMap(); tweet = tweet.toLowerCase(); int i=0; for(HashTagDS cat:usercategories) { for(String s :cat.tags) { if(tweet.indexOf(s)!=-1) { if(categoryvotes.containsKey(i)) { categoryvotes.put(i, categoryvotes.get(i)+1); } else { categoryvotes.put(i, 1); } } } i++; } Set keyset = categoryvotes.keySet(); int maxvote = 0; //by default the tweet will be in the first category int maxcategoryindex = 0; for(int key:keyset) { if(categoryvotes.get(key)>maxvote) { maxvote = categoryvotes.get(key); maxcategoryindex = key; } } return maxcategoryindex; } /** * Converts the input jsonobject containing category descriptions to an array for processing. * @param hashtagcoll JSONObject containing the list of hashtags, color, and the topic information * @return An array of hashtags */ public HashTagDS[] ConvertJSONArrayToArray(JSONObject hashtagcoll) { HashTagDS[] hashtags = new HashTagDS[hashtagcoll.length()]; int j=0; try{ if(hashtagcoll!=null) { Iterator keyit = hashtagcoll.keys(); while(keyit.hasNext()) { HashTagDS ht = new HashTagDS(); JSONObject tags = (JSONObject) hashtagcoll.get((String)keyit.next()); ht.groupname = keyit.toString(); ht.color = tags.getString("color"); JSONArray tagjson = tags.getJSONArray("hts"); ht.tags = new String[tagjson.length()]; for(int i=0;i catcount = new HashMap(); //if the node has no tolinks then look at the node that it retweeted to decide the color of the node for(String tweet:tnfs.data) { int id = this.GetCategory(tweet, hashtagarray); if(catcount.containsKey(id)) { catcount.put(id, catcount.get(id)+1); } else catcount.put(id, 1); } Set keys = catcount.keySet(); int maxcatID = -1; int maxcount = 0; for(int k:keys) { if(maxcatID==-1) { maxcatID = k; maxcount = catcount.get(k); } else { if(maxcount userconnections = new HashMap(); // HashMap tweet_class_codes = new HashMap(); // int tweet_class_counter = 1; HashTagDS[] hashtagarray = ConvertJSONArrayToArray(hashtags); BufferedReader br = null; try{ br = new BufferedReader(new InputStreamReader(new FileInputStream(inFilename),"UTF-8")); String temp = ""; while((temp = br.readLine())!=null) { JSONObject tweetobj; try { tweetobj = new JSONObject(temp); } catch (JSONException ex) { ex.printStackTrace(); continue; } //Extract the tweet first Tweet t = new Tweet(); String text=""; try { text = TextUtils.GetCleanText(tweetobj.getString("text")).toLowerCase(); } catch (JSONException ex) { ex.printStackTrace(); continue; } //Check that the tweet matches at least one of the topics boolean groupmatch = false; for(HashTagDS ht:hashtagarray) { String[] tags = ht.tags; for(String tg:tags) { if(text.contains(tg)) { groupmatch = true; break; } } if(groupmatch) { break; } } if(!groupmatch) { continue; } // ArrayList fromusers = new ArrayList(); if(!tweetobj.isNull("retweeted_status")) { JSONObject rtstatus; try { rtstatus = tweetobj.getJSONObject("retweeted_status"); if(rtstatus.isNull("user")) { JSONObject rtuserobj = rtstatus.getJSONObject("user"); try{ fromusers.add(rtuserobj.get("screen_name").toString()); }catch(JSONException ex) { ex.printStackTrace(); } } } catch (JSONException ex) { Logger.getLogger(CreateD3Network.class.getName()).log(Level.SEVERE, null, ex); } } else { //use the tweet text to retrieve the pattern "RT @username:" fromusers = GetRTUsers(text); } if(fromusers.isEmpty()) { continue; } //identify the class values to be applied to all the nodes and //edges. // String prunedtext = TextUtils.RemoveTwitterElements(text); // Integer class_code = tweet_class_codes.get(prunedtext); // if(class_code==null) // { // class_code = tweet_class_counter; // tweet_class_codes.put(prunedtext, class_code); //set the unique id for this tweet // tweet_class_counter++; // } t.text = TextUtils.RemoveRTElements(text); if(!tweetobj.isNull("user")) { JSONObject userobj; try { userobj = tweetobj.getJSONObject("user"); t.user = userobj.getString("screen_name").toLowerCase(); } catch (JSONException ex) { Logger.getLogger(CreateD3Network.class.getName()).log(Level.SEVERE, null, ex); } } // try { // t.pubdate = String.valueOf(tweetobj.get("timestamp")); // } catch (JSONException ex) { // Logger.getLogger(CreateD3Network.class.getName()).log(Level.SEVERE, null, ex); // } t.catColor = hashtagarray[t.catID].color; //update the size of the from fromuser int cur_level = 0; for(int i=fromusers.size()-1;i>=0;i--) { String touser = ""; if(i==0) {//if this is the last user in the retweet sequence then use the user of the tweet as the next link touser = t.user; } else { //if there are still fromuser in the retweet chain then use them as the next link touser = fromusers.get(i-1); } //don't add any selflinks if(fromusers.get(i).equals(touser)) { continue; } NetworkNode fromuser = null; if(userconnections.containsKey(fromusers.get(i))) { //from node already exists simply add this new connection to it fromuser = userconnections.get(fromusers.get(i)); } else { //the from user was not found. add the node fromuser = new NetworkNode(); // fromuser.id = nodeid++; fromuser.username = fromusers.get(i); fromuser.tonodes = new ArrayList(); fromuser.class_codes = new ArrayList(); fromuser.size = DEFAULT_NODE_SIZE; fromuser.level = cur_level; fromuser.data = new ArrayList(); fromuser.data.add(t.text); //fromuser.category = ; } // if(!fromuser.class_codes.contains(class_code)) // { // //add the marker to from node if it does not have it already // fromuser.class_codes.add(class_code); // } //if to node is not in the list then create it NetworkNode tonode = null; if(!userconnections.containsKey(touser)) { tonode = new NetworkNode(); // System.out.println(touser+" "+nodeid); // tonode.id= nodeid++; tonode.username = touser; tonode.tonodes= new ArrayList(); tonode.class_codes = new ArrayList(); tonode.catID = t.catID; tonode.catColor = t.catColor; tonode.size = DEFAULT_NODE_SIZE; tonode.data= new ArrayList(); tonode.data.add(t.text); tonode.level = cur_level+1; //add the classcode to the node if it doesn't already exist // if(!tonode.class_codes.contains(class_code)) // { // tonode.class_codes.add(class_code); // } //add the touser info userconnections.put(touser, tonode); } else { tonode = userconnections.get(touser); tonode.data.add(t.text); if(tonode.level keys = userconnections.keySet(); ArrayList returnnodes = new ArrayList(); //its +1 because nodes with size 0 are not going to be used to calculate the class int min = DEFAULT_NODE_SIZE+1; int max = DEFAULT_NODE_SIZE+1; for(String k:keys) { NetworkNode n = userconnections.get(k); int maxcat = GetMajorityTopicColor(n,hashtagarray); n.catID = maxcat; n.catColor = hashtagarray[maxcat].color; userconnections.put(k, n); // // if(n.size==0) // {//mark the node as a zero node // n.class_codes.add(-1); // } // else // { if(n.size>max) { max = n.size; } if(n.size nodes = ComputeGroupsSqrt(returnnodes, max, min, numNodeClasses); Collections.sort(nodes,Collections.reverseOrder(new NodeSizeComparator())); //select how many nodes to show. int nodes_to_visit = 0; if(nodes.size()>=num_nodes) { nodes_to_visit = num_nodes; } else { nodes_to_visit = nodes.size(); } HashMap prunednodes = new HashMap(); HashMap nodeidlist = new HashMap(); int nodeid = 0; //node nodeid counter for(int k=0;k rtnodes = GetNextHopConnections(userconnections,nd,new HashMap()); Set names = rtnodes.keySet(); for(String n:names) { if(!prunednodes.containsKey(n)) { NetworkNode newnode = rtnodes.get(n); if(newnode.size>0) { prunednodes.put(n, newnode); nodeidlist.put(n, nodeid++); } } } } /** We now have all the nodes of the network. compute their ids sequentially * and assign them to the respective nodes. Simultaneously compact the nodes * of the network to remove all nodes which have not been retweeted and are * of size 0 */ Set allnodes = prunednodes.keySet(); // System.out.println(prunednodes.size()); ArrayList finalnodes = new ArrayList(); // HashMap> conninfo = new HashMap>(); for(String n:allnodes) { NetworkNode nd = prunednodes.get(n); nd.id = nodeidlist.get(nd.username); ArrayList connids = new ArrayList(); // ArrayList compact_To_nodes = new ArrayList(); int counter = 0; for(ToNodeInfo tnf: nd.tonodes) { //user has never been retweeted. the chain terminates here, so remove it if(nodeidlist.containsKey(tnf.tousername)) { tnf.tonodeid = nodeidlist.get(tnf.tousername); connids.add(tnf.tonodeid); nd.tonodes.set(counter, tnf); counter++; } } finalnodes.add(nd); //store the connections to compute the clusterids later // if(!conninfo.containsKey(nd.id)) // { // conninfo.put(nd.id, connids); // } } //generate the clusterids // ArrayList[] clusterids = (ArrayList[])new ArrayList[allnodes.size()]; // Set idkeys = conninfo.keySet(); // for(int id:idkeys) // { // for(int x:conninfo.get(id)) // { // if(clusterids[x]==null) // { // ArrayList toclusterid = new ArrayList(); // toclusterid.add(id); // clusterids[x] = toclusterid; // } // else // { // ArrayList toclusterid = clusterids[x]; // if(!toclusterid.contains(id)) // { // toclusterid.add(id); // clusterids[x] = toclusterid; // } // } // } // } //now create the final node list with the clusterids // for(String n:allnodes) // { // NetworkNode nd = prunednodes.get(n); // ArrayList cids = clusterids[nd.id]; // if(cids!=null) // { // int size = cids.size(); // nd.clusterID = new int[size+1]; // int counter=0; // nd.clusterID[counter++] = nd.id; // for(int c:cids) // { // nd.clusterID[counter++] = c; // } // } //System.out.println(nd.class_codes.toString()); // finalnodes.add(nd); // } Collections.sort(finalnodes,new NodeIDComparator()); System.out.println(finalnodes.size()); for(NetworkNode node:finalnodes) { System.out.println(node.id+" "+node.username+" "+node.level+" "+node.size+" "+node.catColor+node.data.get(0)); } return GetD3Structure(finalnodes); } /** * Creates a D3 representation of the nodes, consisting of two JSONArray a set of nodes and a set of links between the nodes * @param finalnodes * @return */ public JSONObject GetD3Structure(ArrayList finalnodes) { JSONObject alltweets = new JSONObject(); try { JSONArray nodes = new JSONArray(); JSONArray links = new JSONArray(); for (NetworkNode node : finalnodes) { try { //create adjacencies JSONArray nodedata = new JSONArray(); for (ToNodeInfo tnf : node.tonodes) { JSONObject jsadj = new JSONObject(); jsadj.put("source", node.id); jsadj.put("target", tnf.tonodeid); //weight of the edge jsadj.put("value", 1); //class code is a unique id corresponding to the text jsadj.put("data", tnf.class_code); links.put(jsadj); //create a data object for the node JSONObject jsdata = new JSONObject(); jsdata.put("tonodeid", tnf.tonodeid); jsdata.put("nodefrom", node.username); jsdata.put("nodeto", tnf.tousername); jsdata.put("tweet", tnf.text); // jsdata.put("pubtime", tnf.date); //class code for tweet to be used to filter // jsdata.put("classcode", tnf.class_code); nodedata.put(jsdata); } //add node JSONObject nd = new JSONObject(); nd.put("name", node.username); nd.put("group", node.group); nd.put("id", node.id); nd.put("size", node.size); nd.put("catColor", node.catColor); nd.put("catID", node.catID); nd.put("data", nodedata); nd.put("level", node.level); //clusterids for the node // JSONArray cids = new JSONArray(); // if (node.clusterID != null) { // for (int code : node.clusterID) { // cids.put(code); // } // } else { // cids.put(node.id); // } // nd.put("clusterids", cids); //classcodes for the node // JSONArray codes = new JSONArray(); // for (int c : node.class_codes) { // codes.put(c); // } // nd.put("classcodes", codes); nodes.put(nd); } catch (JSONException ex) { ex.printStackTrace(); } } alltweets.put("nodes", nodes); alltweets.put("links", links); } catch (JSONException ex) { Logger.getLogger(CreateD3Network.class.getName()).log(Level.SEVERE, null, ex); } return alltweets; } /** * Recursively traverses the list of nodes to identify all nodes reachable from a starting node. * @param userconnections A map containing the usernames as keys and the node information as value * @param cur_node Node currently being processed. * @param newnodes A list of nodes which can be reached from the current node * @return A map of the usernames and the node information for all nodes reachable */ public HashMap GetNextHopConnections(HashMap userconnections,NetworkNode cur_node,HashMap newnodes) { cur_node.level = cur_node.level+1; newnodes.put(cur_node.username,cur_node); for(int i=0;i rtnodes = GetNextHopConnections(userconnections, userconnections.get(tnf.tousername),newnodes); newnodes = rtnodes; } return newnodes; } /** * Divides a list of nodes into groups using the square root binning * technique. If a node has size x and there are y groups in total. Then the * group of the node is computed as ceil((sqrt(x)/sqrt(max))*y), where max is * the size of the largest node. * @param nodes A list of nodes * @param max The maximum size of a node * @param min The minimum size of a node * @param noofclasses Number of classes into which the nodes must be classified * @return A list of nodes along with their class */ public ArrayList ComputeGroupsSqrt(ArrayList nodes, int max, int min, int noofclasses) { ArrayList finalnodes = new ArrayList(); for(int i=0;i0) { color_index = (int) Math.ceil(((double)Math.sqrt(node.size)/Math.sqrt(max))*noofclasses)-1; // node.size = color_index*6; } node.group = color_index; finalnodes.add(node); } return finalnodes; } //DEBUG use only public static void main(String[] args) { try { CreateD3Network cdn = new CreateD3Network(); JSONObject jobj = new JSONObject(); JSONObject obj = new JSONObject(); obj.put("color", "#800000"); JSONArray ja = new JSONArray(); ja.put("zuccotti"); obj.put("hts", ja); jobj.put("Group 1", obj); obj = new JSONObject(); obj.put("color", "#0FFF00"); ja = new JSONArray(); ja.put("#nypd"); obj.put("hts", ja); jobj.put("Group 2", obj); String filename = "D:\\Twitter Data Analytics\\Data\\testows.json"; JSONObject nodes = cdn.ConvertTweetsToDiffusionPath(filename,7, jobj,5); } catch (JSONException ex) { ex.printStackTrace(); } } }