From 14d7547cd31c5be878e377a4a5370f604c8d59d4 Mon Sep 17 00:00:00 2001 From: Peter Wu Date: Wed, 23 Apr 2014 12:22:20 +0200 Subject: Initial commit build.xml, etc. are modified a bit after opening in Netbeans 7.4. --- .gitignore | 12 + build.xml | 73 + ...beans-modules-java-j2seproject-copylibstask.jar | Bin 0 -> 22970 bytes lib/collections-generic-4.01.jar | Bin 0 -> 531557 bytes lib/colt-1.2.0.jar | Bin 0 -> 581945 bytes lib/commons-codec-1.7.jar | Bin 0 -> 259600 bytes lib/commons-httpclient-3.1_1.jar | Bin 0 -> 305001 bytes lib/commons-lang-2.6.jar | Bin 0 -> 284220 bytes lib/commons-logging-1.1.1.jar | Bin 0 -> 60841 bytes lib/concurrent-1.3.4.jar | Bin 0 -> 189284 bytes lib/gson-2.2.4.jar | Bin 0 -> 190418 bytes lib/httpclient-4.2.1.jar | Bin 0 -> 427022 bytes lib/httpcore-4.2.1.jar | Bin 0 -> 223374 bytes lib/j3d-core-1.3.1.jar | Bin 0 -> 2513498 bytes lib/jfig-1.5.2.jar | Bin 0 -> 48767 bytes lib/json.jar | Bin 0 -> 93396 bytes lib/jung-3d-2.0.1.jar | Bin 0 -> 73271 bytes lib/jung-3d-demos-2.0.1.jar | Bin 0 -> 3966 bytes lib/jung-algorithms-2.0.1.jar | Bin 0 -> 233113 bytes lib/jung-api-2.0.1.jar | Bin 0 -> 40975 bytes lib/jung-graph-impl-2.0.1.jar | Bin 0 -> 62329 bytes lib/jung-io-2.0.1.jar | Bin 0 -> 79372 bytes lib/jung-jai-2.0.1.jar | Bin 0 -> 20440 bytes lib/jung-jai-samples-2.0.1.jar | Bin 0 -> 46790 bytes lib/jung-samples-2.0.1.jar | Bin 0 -> 811841 bytes lib/jung-visualization-2.0.1.jar | Bin 0 -> 324398 bytes lib/junit/junit-3.8.2-api.zip | Bin 0 -> 72555 bytes lib/junit/junit-3.8.2.jar | Bin 0 -> 118932 bytes lib/junit_4/junit-4.5-api.zip | Bin 0 -> 184067 bytes lib/junit_4/junit-4.5-src.jar | Bin 0 -> 109014 bytes lib/junit_4/junit-4.5.jar | Bin 0 -> 196787 bytes lib/log4j-1.2.15.jar | Bin 0 -> 391834 bytes lib/mallet-deps.jar | Bin 0 -> 2644050 bytes lib/mallet.jar | Bin 0 -> 2125173 bytes lib/nblibraries.properties | 14 + lib/signpost-commonshttp4-1.2.1.2.jar | Bin 0 -> 6512 bytes lib/signpost-core-1.2.1.2.jar | Bin 0 -> 45277 bytes lib/stax-api-1.0.1.jar | Bin 0 -> 26514 bytes lib/vecmath-1.3.1.jar | Bin 0 -> 289881 bytes lib/wstx-asl-3.2.6.jar | Bin 0 -> 520389 bytes manifest.mf | 3 + nbproject/build-impl.xml | 1415 ++++++++++++++++++++ nbproject/genfiles.properties | 8 + nbproject/project.properties | 135 ++ nbproject/project.xml | 16 + .../Location/LocationTranslationExample.java | 124 ++ src/Chapter2/openauthentication/OAuthExample.java | 79 ++ src/Chapter2/restapi/RESTApiExample.java | 676 ++++++++++ src/Chapter2/restapi/RESTSearchExample.java | 311 +++++ src/Chapter2/streamingapi/StreamingApiExample.java | 372 +++++ src/Chapter2/support/APIType.java | 12 + src/Chapter2/support/InfoType.java | 12 + src/Chapter2/support/Location.java | 28 + src/Chapter2/support/OAuthTokenSecret.java | 38 + src/Chapter4/GraphElements/RetweetEdge.java | 53 + src/Chapter4/GraphElements/UserNode.java | 34 + .../examples/BetweennessCentralityExample.java | 31 + .../examples/EigenvectorCentralityExample.java | 36 + .../examples/InDegreeCentralityExample.java | 30 + .../examples/PageRankCentralityExample.java | 39 + .../classification/bayes/Classification.java | 22 + src/Chapter4/classification/bayes/NBCxv.java | 60 + .../bayes/NaiveBayesSentimentClassifier.java | 264 ++++ .../classification/bayes/StopwordsList.java | 10 + src/Chapter4/classification/bayes/TestNBC.java | 49 + .../classification/bayes/WordCountPair.java | 34 + .../graph/visualization/SimpleGraphViewer.java | 86 ++ src/Chapter4/tweetlda/LDA.java | 89 ++ src/Chapter4/tweetlda/PorterStemmer.java | 33 + 
src/Chapter4/tweetlda/Stemmer.java | 428 ++++++ src/Chapter4/util/BetweennessScorer.java | 25 + src/Chapter4/util/EigenVectorScorer.java | 64 + src/Chapter4/util/InDegreeScorer.java | 30 + src/Chapter4/util/TweetFileProcessor.java | 76 ++ src/Chapter4/util/TweetFileToGraph.java | 77 ++ src/Chapter5/network/CreateD3Network.java | 716 ++++++++++ src/Chapter5/network/ExtractUserTagNetwork.java | 173 +++ src/Chapter5/support/DateInfo.java | 30 + src/Chapter5/support/HashTagDS.java | 18 + src/Chapter5/support/NetworkNode.java | 49 + src/Chapter5/support/NodeIDComparator.java | 32 + src/Chapter5/support/NodeSizeComparator.java | 29 + src/Chapter5/support/ToNodeInfo.java | 23 + src/Chapter5/support/Tweet.java | 21 + src/Chapter5/text/EventSummaryExtractor.java | 269 ++++ src/Chapter5/text/ExtractTopKeywords.java | 151 +++ src/Chapter5/trends/ControlChartExample.java | 144 ++ src/Chapter5/trends/DateInfo.java | 29 + src/Chapter5/trends/ExtractDatasetTrend.java | 120 ++ src/Chapter5/trends/SparkLineExample.java | 163 +++ src/Chapter5/trends/TCDateInfo.java | 31 + src/Chapter5/trends/TrendComparisonExample.java | 155 +++ src/utils/OAuthUtils.java | 21 + src/utils/Tags.java | 52 + src/utils/TextUtils.java | 212 +++ streaming/streaming.config | 3 + 96 files changed, 7339 insertions(+) create mode 100644 .gitignore create mode 100644 build.xml create mode 100644 lib/CopyLibs/org-netbeans-modules-java-j2seproject-copylibstask.jar create mode 100644 lib/collections-generic-4.01.jar create mode 100644 lib/colt-1.2.0.jar create mode 100644 lib/commons-codec-1.7.jar create mode 100644 lib/commons-httpclient-3.1_1.jar create mode 100644 lib/commons-lang-2.6.jar create mode 100644 lib/commons-logging-1.1.1.jar create mode 100644 lib/concurrent-1.3.4.jar create mode 100644 lib/gson-2.2.4.jar create mode 100644 lib/httpclient-4.2.1.jar create mode 100644 lib/httpcore-4.2.1.jar create mode 100644 lib/j3d-core-1.3.1.jar create mode 100644 lib/jfig-1.5.2.jar create mode 100644 lib/json.jar create mode 100644 lib/jung-3d-2.0.1.jar create mode 100644 lib/jung-3d-demos-2.0.1.jar create mode 100644 lib/jung-algorithms-2.0.1.jar create mode 100644 lib/jung-api-2.0.1.jar create mode 100644 lib/jung-graph-impl-2.0.1.jar create mode 100644 lib/jung-io-2.0.1.jar create mode 100644 lib/jung-jai-2.0.1.jar create mode 100644 lib/jung-jai-samples-2.0.1.jar create mode 100644 lib/jung-samples-2.0.1.jar create mode 100644 lib/jung-visualization-2.0.1.jar create mode 100644 lib/junit/junit-3.8.2-api.zip create mode 100644 lib/junit/junit-3.8.2.jar create mode 100644 lib/junit_4/junit-4.5-api.zip create mode 100644 lib/junit_4/junit-4.5-src.jar create mode 100644 lib/junit_4/junit-4.5.jar create mode 100644 lib/log4j-1.2.15.jar create mode 100644 lib/mallet-deps.jar create mode 100644 lib/mallet.jar create mode 100644 lib/nblibraries.properties create mode 100644 lib/signpost-commonshttp4-1.2.1.2.jar create mode 100644 lib/signpost-core-1.2.1.2.jar create mode 100644 lib/stax-api-1.0.1.jar create mode 100644 lib/vecmath-1.3.1.jar create mode 100644 lib/wstx-asl-3.2.6.jar create mode 100644 manifest.mf create mode 100644 nbproject/build-impl.xml create mode 100644 nbproject/genfiles.properties create mode 100644 nbproject/project.properties create mode 100644 nbproject/project.xml create mode 100644 src/Chapter2/Location/LocationTranslationExample.java create mode 100644 src/Chapter2/openauthentication/OAuthExample.java create mode 100644 src/Chapter2/restapi/RESTApiExample.java create mode 100644 
src/Chapter2/restapi/RESTSearchExample.java create mode 100644 src/Chapter2/streamingapi/StreamingApiExample.java create mode 100644 src/Chapter2/support/APIType.java create mode 100644 src/Chapter2/support/InfoType.java create mode 100644 src/Chapter2/support/Location.java create mode 100644 src/Chapter2/support/OAuthTokenSecret.java create mode 100644 src/Chapter4/GraphElements/RetweetEdge.java create mode 100644 src/Chapter4/GraphElements/UserNode.java create mode 100644 src/Chapter4/centrality/examples/BetweennessCentralityExample.java create mode 100644 src/Chapter4/centrality/examples/EigenvectorCentralityExample.java create mode 100644 src/Chapter4/centrality/examples/InDegreeCentralityExample.java create mode 100644 src/Chapter4/centrality/examples/PageRankCentralityExample.java create mode 100644 src/Chapter4/classification/bayes/Classification.java create mode 100644 src/Chapter4/classification/bayes/NBCxv.java create mode 100644 src/Chapter4/classification/bayes/NaiveBayesSentimentClassifier.java create mode 100644 src/Chapter4/classification/bayes/StopwordsList.java create mode 100644 src/Chapter4/classification/bayes/TestNBC.java create mode 100644 src/Chapter4/classification/bayes/WordCountPair.java create mode 100644 src/Chapter4/graph/visualization/SimpleGraphViewer.java create mode 100644 src/Chapter4/tweetlda/LDA.java create mode 100644 src/Chapter4/tweetlda/PorterStemmer.java create mode 100644 src/Chapter4/tweetlda/Stemmer.java create mode 100644 src/Chapter4/util/BetweennessScorer.java create mode 100644 src/Chapter4/util/EigenVectorScorer.java create mode 100644 src/Chapter4/util/InDegreeScorer.java create mode 100644 src/Chapter4/util/TweetFileProcessor.java create mode 100644 src/Chapter4/util/TweetFileToGraph.java create mode 100644 src/Chapter5/network/CreateD3Network.java create mode 100644 src/Chapter5/network/ExtractUserTagNetwork.java create mode 100644 src/Chapter5/support/DateInfo.java create mode 100644 src/Chapter5/support/HashTagDS.java create mode 100644 src/Chapter5/support/NetworkNode.java create mode 100644 src/Chapter5/support/NodeIDComparator.java create mode 100644 src/Chapter5/support/NodeSizeComparator.java create mode 100644 src/Chapter5/support/ToNodeInfo.java create mode 100644 src/Chapter5/support/Tweet.java create mode 100644 src/Chapter5/text/EventSummaryExtractor.java create mode 100644 src/Chapter5/text/ExtractTopKeywords.java create mode 100644 src/Chapter5/trends/ControlChartExample.java create mode 100644 src/Chapter5/trends/DateInfo.java create mode 100644 src/Chapter5/trends/ExtractDatasetTrend.java create mode 100644 src/Chapter5/trends/SparkLineExample.java create mode 100644 src/Chapter5/trends/TCDateInfo.java create mode 100644 src/Chapter5/trends/TrendComparisonExample.java create mode 100644 src/utils/OAuthUtils.java create mode 100644 src/utils/Tags.java create mode 100644 src/utils/TextUtils.java create mode 100644 streaming/streaming.config
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..bae604d --- /dev/null +++ b/.gitignore @@ -0,0 +1,12 @@ +/nbproject/private/ +/build/ +/dist/ + +# JaCoCo test coverage tool +.jacocoverage/ +jacoco.exec-* + +# Editor temp files, diff, etc. +*~ +.*.sw? +*.orig
diff --git a/build.xml b/build.xml new file mode 100644 index 0000000..24aba47 --- /dev/null +++ b/build.xml @@ -0,0 +1,73 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- You may freely edit this file. See commented blocks below for --> +<!-- some examples of how to customize the build. --> +<!-- (If you delete it and reopen the project it will be recreated.) --> +<!-- By default, only the Clean and Build commands use this build script. --> +<!-- Commands such as Run, Debug, and Test only use this build script if --> +<!-- the Compile on Save feature is turned off for the project. --> +<project name="TwitterDataAnalytics" default="default" basedir="."> + <description>Builds, tests, and runs the project TwitterDataAnalytics.</description> + <import file="nbproject/build-impl.xml"/> + [... the remaining lines are the standard NetBeans customization-guide comment block; the XML markup was lost in extraction ...] +</project>
diff --git a/lib/CopyLibs/org-netbeans-modules-java-j2seproject-copylibstask.jar b/lib/CopyLibs/org-netbeans-modules-java-j2seproject-copylibstask.jar new file mode 100644 index 0000000..ff1abcc Binary files /dev/null and b/lib/CopyLibs/org-netbeans-modules-java-j2seproject-copylibstask.jar differ
diff --git a/lib/collections-generic-4.01.jar b/lib/collections-generic-4.01.jar new file mode 100644 index 0000000..92d009c Binary files /dev/null and b/lib/collections-generic-4.01.jar differ
diff --git a/lib/colt-1.2.0.jar b/lib/colt-1.2.0.jar new file mode 100644 index 0000000..a7192f6 Binary files /dev/null and b/lib/colt-1.2.0.jar differ
diff --git a/lib/commons-codec-1.7.jar b/lib/commons-codec-1.7.jar new file mode 100644 index 0000000..efa7f72 Binary files /dev/null and b/lib/commons-codec-1.7.jar differ
diff --git a/lib/commons-httpclient-3.1_1.jar b/lib/commons-httpclient-3.1_1.jar new file mode 100644 index 0000000..7c59774 Binary files /dev/null and b/lib/commons-httpclient-3.1_1.jar differ
diff --git a/lib/commons-lang-2.6.jar b/lib/commons-lang-2.6.jar new file mode 100644 index 0000000..98467d3 Binary files /dev/null and b/lib/commons-lang-2.6.jar differ
diff --git a/lib/commons-logging-1.1.1.jar b/lib/commons-logging-1.1.1.jar new file mode 100644 index 0000000..8758a96 Binary files /dev/null and b/lib/commons-logging-1.1.1.jar differ
diff --git a/lib/concurrent-1.3.4.jar b/lib/concurrent-1.3.4.jar new file mode 100644 index 0000000..551f347 Binary files /dev/null and b/lib/concurrent-1.3.4.jar differ
diff --git a/lib/gson-2.2.4.jar b/lib/gson-2.2.4.jar new file mode 100644 index 0000000..9478253 Binary files /dev/null and b/lib/gson-2.2.4.jar differ
diff --git a/lib/httpclient-4.2.1.jar b/lib/httpclient-4.2.1.jar new file mode 100644 index 0000000..1d52333 Binary files /dev/null and b/lib/httpclient-4.2.1.jar differ
diff --git a/lib/httpcore-4.2.1.jar b/lib/httpcore-4.2.1.jar new file mode 100644 index 0000000..16d75e1 Binary files /dev/null and b/lib/httpcore-4.2.1.jar differ
diff --git a/lib/j3d-core-1.3.1.jar b/lib/j3d-core-1.3.1.jar new file mode 100644 index 0000000..cfe6364 Binary files /dev/null and b/lib/j3d-core-1.3.1.jar differ
diff --git a/lib/jfig-1.5.2.jar b/lib/jfig-1.5.2.jar new file mode 100644 index 0000000..d671f83 Binary files /dev/null and b/lib/jfig-1.5.2.jar differ
diff --git a/lib/json.jar b/lib/json.jar new file mode 100644 index 0000000..5a93e51 Binary files /dev/null and b/lib/json.jar differ
diff --git a/lib/jung-3d-2.0.1.jar b/lib/jung-3d-2.0.1.jar new file mode 100644 index 0000000..05c3f18 Binary files /dev/null and b/lib/jung-3d-2.0.1.jar differ
diff --git a/lib/jung-3d-demos-2.0.1.jar b/lib/jung-3d-demos-2.0.1.jar new file mode 100644 index 0000000..10fd834 Binary files /dev/null and b/lib/jung-3d-demos-2.0.1.jar differ
diff --git a/lib/jung-algorithms-2.0.1.jar b/lib/jung-algorithms-2.0.1.jar new file mode 100644 index 0000000..5b98f9c Binary files /dev/null and b/lib/jung-algorithms-2.0.1.jar differ
diff --git a/lib/jung-api-2.0.1.jar b/lib/jung-api-2.0.1.jar new file mode 100644 index 0000000..6dcac89 Binary files /dev/null and b/lib/jung-api-2.0.1.jar differ
diff --git a/lib/jung-graph-impl-2.0.1.jar b/lib/jung-graph-impl-2.0.1.jar new file mode 100644 index 0000000..a64f6f7 Binary files /dev/null and b/lib/jung-graph-impl-2.0.1.jar differ
diff --git a/lib/jung-io-2.0.1.jar b/lib/jung-io-2.0.1.jar new file mode 100644 index 0000000..4059dcd Binary files /dev/null and b/lib/jung-io-2.0.1.jar differ
diff --git a/lib/jung-jai-2.0.1.jar
b/lib/jung-jai-2.0.1.jar new file mode 100644 index 0000000..feeb09d Binary files /dev/null and b/lib/jung-jai-2.0.1.jar differ diff --git a/lib/jung-jai-samples-2.0.1.jar b/lib/jung-jai-samples-2.0.1.jar new file mode 100644 index 0000000..784cd88 Binary files /dev/null and b/lib/jung-jai-samples-2.0.1.jar differ diff --git a/lib/jung-samples-2.0.1.jar b/lib/jung-samples-2.0.1.jar new file mode 100644 index 0000000..838461d Binary files /dev/null and b/lib/jung-samples-2.0.1.jar differ diff --git a/lib/jung-visualization-2.0.1.jar b/lib/jung-visualization-2.0.1.jar new file mode 100644 index 0000000..c611e77 Binary files /dev/null and b/lib/jung-visualization-2.0.1.jar differ diff --git a/lib/junit/junit-3.8.2-api.zip b/lib/junit/junit-3.8.2-api.zip new file mode 100644 index 0000000..6d792fd Binary files /dev/null and b/lib/junit/junit-3.8.2-api.zip differ diff --git a/lib/junit/junit-3.8.2.jar b/lib/junit/junit-3.8.2.jar new file mode 100644 index 0000000..d835872 Binary files /dev/null and b/lib/junit/junit-3.8.2.jar differ diff --git a/lib/junit_4/junit-4.5-api.zip b/lib/junit_4/junit-4.5-api.zip new file mode 100644 index 0000000..5748c44 Binary files /dev/null and b/lib/junit_4/junit-4.5-api.zip differ diff --git a/lib/junit_4/junit-4.5-src.jar b/lib/junit_4/junit-4.5-src.jar new file mode 100644 index 0000000..18774a5 Binary files /dev/null and b/lib/junit_4/junit-4.5-src.jar differ diff --git a/lib/junit_4/junit-4.5.jar b/lib/junit_4/junit-4.5.jar new file mode 100644 index 0000000..83f8bc7 Binary files /dev/null and b/lib/junit_4/junit-4.5.jar differ diff --git a/lib/log4j-1.2.15.jar b/lib/log4j-1.2.15.jar new file mode 100644 index 0000000..c930a6a Binary files /dev/null and b/lib/log4j-1.2.15.jar differ diff --git a/lib/mallet-deps.jar b/lib/mallet-deps.jar new file mode 100644 index 0000000..05517df Binary files /dev/null and b/lib/mallet-deps.jar differ diff --git a/lib/mallet.jar b/lib/mallet.jar new file mode 100644 index 0000000..fb8fef5 Binary files /dev/null and b/lib/mallet.jar differ diff --git a/lib/nblibraries.properties b/lib/nblibraries.properties new file mode 100644 index 0000000..52864ae --- /dev/null +++ b/lib/nblibraries.properties @@ -0,0 +1,14 @@ +libs.CopyLibs.classpath=\ + ${base}/CopyLibs/org-netbeans-modules-java-j2seproject-copylibstask.jar +libs.CopyLibs.displayName=CopyLibs Task +libs.CopyLibs.prop-version=2.0 +libs.junit.classpath=\ + ${base}/junit/junit-3.8.2.jar +libs.junit.javadoc=\ + ${base}/junit/junit-3.8.2-api.zip +libs.junit_4.classpath=\ + ${base}/junit_4/junit-4.5.jar +libs.junit_4.javadoc=\ + ${base}/junit_4/junit-4.5-api.zip +libs.junit_4.src=\ + ${base}/junit_4/junit-4.5-src.jar diff --git a/lib/signpost-commonshttp4-1.2.1.2.jar b/lib/signpost-commonshttp4-1.2.1.2.jar new file mode 100644 index 0000000..fd37cfa Binary files /dev/null and b/lib/signpost-commonshttp4-1.2.1.2.jar differ diff --git a/lib/signpost-core-1.2.1.2.jar b/lib/signpost-core-1.2.1.2.jar new file mode 100644 index 0000000..8871730 Binary files /dev/null and b/lib/signpost-core-1.2.1.2.jar differ diff --git a/lib/stax-api-1.0.1.jar b/lib/stax-api-1.0.1.jar new file mode 100644 index 0000000..d9a1665 Binary files /dev/null and b/lib/stax-api-1.0.1.jar differ diff --git a/lib/vecmath-1.3.1.jar b/lib/vecmath-1.3.1.jar new file mode 100644 index 0000000..fc2244b Binary files /dev/null and b/lib/vecmath-1.3.1.jar differ diff --git a/lib/wstx-asl-3.2.6.jar b/lib/wstx-asl-3.2.6.jar new file mode 100644 index 0000000..aee5f0c Binary files /dev/null and b/lib/wstx-asl-3.2.6.jar 
differ
diff --git a/manifest.mf b/manifest.mf new file mode 100644 index 0000000..1574df4 --- /dev/null +++ b/manifest.mf @@ -0,0 +1,3 @@ +Manifest-Version: 1.0 +X-COMMENT: Main-Class will be added automatically by build +
diff --git a/nbproject/build-impl.xml b/nbproject/build-impl.xml new file mode 100644 index 0000000..7e854aa --- /dev/null +++ b/nbproject/build-impl.xml @@ -0,0 +1,1415 @@ +<!-- *** GENERATED FROM project.xml - DO NOT EDIT *** --> +<!-- *** EDIT ../build.xml INSTEAD *** --> [... 1,415 lines of the standard NetBeans-generated build-impl.xml; the XML markup was lost in extraction and only embedded Ant messages survive, such as "Must set src.dir", "Must set build.dir", "Must select one file in the IDE or set run.class", "No tests executed.", "Some tests failed; see details above.", "To run this application from the command line without Ant, try: java -jar "${dist.jar.resolved}"", and "This target only works when run from inside the NetBeans IDE." ...]
diff --git a/nbproject/genfiles.properties b/nbproject/genfiles.properties new file mode 100644 index 0000000..b42a5d3 --- /dev/null +++ b/nbproject/genfiles.properties @@ -0,0 +1,8 @@ +build.xml.data.CRC32=72787bde +build.xml.script.CRC32=57d18e43 +build.xml.stylesheet.CRC32=8064a381@1.68.1.46 +# This file is used by a NetBeans-based IDE to track changes in generated files such as build-impl.xml. +# Do not edit this file. You may delete it but then the IDE will never regenerate such files for you.
+nbproject/build-impl.xml.data.CRC32=72787bde +nbproject/build-impl.xml.script.CRC32=4304d30d +nbproject/build-impl.xml.stylesheet.CRC32=5a01deb7@1.68.1.46 diff --git a/nbproject/project.properties b/nbproject/project.properties new file mode 100644 index 0000000..e32b494 --- /dev/null +++ b/nbproject/project.properties @@ -0,0 +1,135 @@ +annotation.processing.enabled=true +annotation.processing.enabled.in.editor=false +annotation.processing.processors.list= +annotation.processing.run.all.processors=true +annotation.processing.source.output=${build.generated.sources.dir}/ap-source-output +application.title=TwitterDataAnalytics +application.vendor=skumar34 +build.classes.dir=${build.dir}/classes +build.classes.excludes=**/*.java,**/*.form +# This directory is removed when the project is cleaned: +build.dir=build +build.generated.dir=${build.dir}/generated +build.generated.sources.dir=${build.dir}/generated-sources +# Only compile against the classpath explicitly listed here: +build.sysclasspath=ignore +build.test.classes.dir=${build.dir}/test/classes +build.test.results.dir=${build.dir}/test/results +# Uncomment to specify the preferred debugger connection transport: +#debug.transport=dt_socket +debug.classpath=\ + ${run.classpath} +debug.test.classpath=\ + ${run.test.classpath} +# This directory is removed when the project is cleaned: +dist.dir=dist +dist.jar=${dist.dir}/TwitterDataAnalytics.jar +dist.javadoc.dir=${dist.dir}/javadoc +endorsed.classpath= +excludes= +file.reference.collections-generic-4.01.jar=lib/collections-generic-4.01.jar +file.reference.colt-1.2.0.jar=lib/colt-1.2.0.jar +file.reference.commons-codec-1.7.jar=lib/commons-codec-1.7.jar +file.reference.commons-httpclient-3.1_1.jar=lib/commons-httpclient-3.1_1.jar +file.reference.commons-lang-2.6.jar=lib/commons-lang-2.6.jar +file.reference.commons-logging-1.1.1.jar=lib/commons-logging-1.1.1.jar +file.reference.concurrent-1.3.4.jar=lib/concurrent-1.3.4.jar +file.reference.gson-2.2.4.jar=lib/gson-2.2.4.jar +file.reference.httpclient-4.2.1.jar=lib/httpclient-4.2.1.jar +file.reference.httpcore-4.2.1.jar=lib/httpcore-4.2.1.jar +file.reference.j3d-core-1.3.1.jar=lib/j3d-core-1.3.1.jar +file.reference.jfig-1.5.2.jar=lib/jfig-1.5.2.jar +file.reference.json.jar=lib/json.jar +file.reference.jung-3d-2.0.1.jar=lib/jung-3d-2.0.1.jar +file.reference.jung-3d-demos-2.0.1.jar=lib/jung-3d-demos-2.0.1.jar +file.reference.jung-algorithms-2.0.1.jar=lib/jung-algorithms-2.0.1.jar +file.reference.jung-api-2.0.1.jar=lib/jung-api-2.0.1.jar +file.reference.jung-graph-impl-2.0.1.jar=lib/jung-graph-impl-2.0.1.jar +file.reference.jung-io-2.0.1.jar=lib/jung-io-2.0.1.jar +file.reference.jung-jai-2.0.1.jar=lib/jung-jai-2.0.1.jar +file.reference.jung-jai-samples-2.0.1.jar=lib/jung-jai-samples-2.0.1.jar +file.reference.jung-samples-2.0.1.jar=lib/jung-samples-2.0.1.jar +file.reference.jung-visualization-2.0.1.jar=lib/jung-visualization-2.0.1.jar +file.reference.log4j-1.2.15.jar=lib/log4j-1.2.15.jar +file.reference.mallet-deps.jar=lib/mallet-deps.jar +file.reference.mallet.jar=lib/mallet.jar +file.reference.signpost-commonshttp4-1.2.1.2.jar=lib/signpost-commonshttp4-1.2.1.2.jar +file.reference.signpost-core-1.2.1.2.jar=lib/signpost-core-1.2.1.2.jar +file.reference.stax-api-1.0.1.jar=lib/stax-api-1.0.1.jar +file.reference.TwitterDataAnalytics-src=src +file.reference.vecmath-1.3.1.jar=lib/vecmath-1.3.1.jar +file.reference.wstx-asl-3.2.6.jar=lib/wstx-asl-3.2.6.jar +includes=** +jar.compress=false +javac.classpath=\ + 
${file.reference.collections-generic-4.01.jar}:\ + ${file.reference.colt-1.2.0.jar}:\ + ${file.reference.commons-codec-1.7.jar}:\ + ${file.reference.commons-httpclient-3.1_1.jar}:\ + ${file.reference.commons-lang-2.6.jar}:\ + ${file.reference.commons-logging-1.1.1.jar}:\ + ${file.reference.concurrent-1.3.4.jar}:\ + ${file.reference.gson-2.2.4.jar}:\ + ${file.reference.httpclient-4.2.1.jar}:\ + ${file.reference.httpcore-4.2.1.jar}:\ + ${file.reference.j3d-core-1.3.1.jar}:\ + ${file.reference.jfig-1.5.2.jar}:\ + ${file.reference.json.jar}:\ + ${file.reference.jung-3d-2.0.1.jar}:\ + ${file.reference.jung-3d-demos-2.0.1.jar}:\ + ${file.reference.jung-algorithms-2.0.1.jar}:\ + ${file.reference.jung-api-2.0.1.jar}:\ + ${file.reference.jung-graph-impl-2.0.1.jar}:\ + ${file.reference.jung-io-2.0.1.jar}:\ + ${file.reference.jung-jai-2.0.1.jar}:\ + ${file.reference.jung-jai-samples-2.0.1.jar}:\ + ${file.reference.jung-samples-2.0.1.jar}:\ + ${file.reference.jung-visualization-2.0.1.jar}:\ + ${file.reference.log4j-1.2.15.jar}:\ + ${file.reference.mallet-deps.jar}:\ + ${file.reference.mallet.jar}:\ + ${file.reference.signpost-commonshttp4-1.2.1.2.jar}:\ + ${file.reference.signpost-core-1.2.1.2.jar}:\ + ${file.reference.stax-api-1.0.1.jar}:\ + ${file.reference.vecmath-1.3.1.jar}:\ + ${file.reference.wstx-asl-3.2.6.jar} +# Space-separated list of extra javac options +javac.compilerargs= +javac.deprecation=false +javac.processorpath=\ + ${javac.classpath} +javac.source=1.7 +javac.target=1.7 +javac.test.classpath=\ + ${javac.classpath}:\ + ${build.classes.dir} +javac.test.processorpath=\ + ${javac.test.classpath} +javadoc.additionalparam= +javadoc.author=false +javadoc.encoding=${source.encoding} +javadoc.noindex=false +javadoc.nonavbar=false +javadoc.notree=false +javadoc.private=false +javadoc.splitindex=true +javadoc.use=true +javadoc.version=false +javadoc.windowtitle= +main.class= +manifest.file=manifest.mf +meta.inf.dir=${src.dir}/META-INF +mkdist.disabled=false +platform.active=default_platform +run.classpath=\ + ${javac.classpath}:\ + ${build.classes.dir} +# Space-separated list of JVM arguments used when running the project. +# You may also define separate properties like run-sys-prop.name=value instead of -Dname=value. +# To set system properties for unit tests define test-sys-prop.name=value: +run.jvmargs= +run.test.classpath=\ + ${javac.test.classpath}:\ + ${build.test.classes.dir} +source.encoding=UTF-8 +src.dir=${file.reference.TwitterDataAnalytics-src}
diff --git a/nbproject/project.xml b/nbproject/project.xml new file mode 100644 index 0000000..c85b6f7 --- /dev/null +++ b/nbproject/project.xml @@ -0,0 +1,16 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project xmlns="http://www.netbeans.org/ns/project/1"> + <type>org.netbeans.modules.java.j2seproject</type> + <configuration> + <data xmlns="http://www.netbeans.org/ns/j2se-project/3"> + <name>TwitterDataAnalytics</name> + <source-roots> + <root id="src.dir"/> + </source-roots> + <test-roots/> + </data> + <libraries xmlns="http://www.netbeans.org/ns/ant-project-libraries/1"> + <definitions>.\lib\nblibraries.properties</definitions> + </libraries> + </configuration> +</project>
diff --git a/src/Chapter2/Location/LocationTranslationExample.java b/src/Chapter2/Location/LocationTranslationExample.java new file mode 100644 index 0000000..69178dc --- /dev/null +++ b/src/Chapter2/Location/LocationTranslationExample.java @@ -0,0 +1,124 @@ +/* TweetTracker.
Copyright (c) Arizona Board of Regents on behalf of Arizona State University + * @author shamanth + */ +package Chapter2.Location; + +import Chapter2.support.Location; +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.UnsupportedEncodingException; +import java.net.HttpURLConnection; +import java.net.MalformedURLException; +import java.net.URL; +import java.net.URLConnection; +import java.net.URLEncoder; +import java.util.logging.Level; +import java.util.logging.Logger; +import org.json.JSONArray; +import org.json.JSONException; + +public class LocationTranslationExample +{ + + /** + * Translates a location string to coordinates using the MapQuest Nominatim service + * @param loc + * @return + */ + public Location TranslateLoc(String loc) + { + if(loc!=null&&!loc.isEmpty()) + { + String encodedLoc=""; + try { + //Step 1: Encode the location name + encodedLoc = URLEncoder.encode(loc, "UTF-8"); + } catch (UnsupportedEncodingException ex) { + Logger.getLogger(LocationTranslationExample.class.getName()).log(Level.SEVERE, null, ex); + } + //Step 2: Create a GET request to the MapQuest API with the name of the location + String url= "http://open.mapquestapi.com/nominatim/v1/search?q="+encodedLoc+"&format=json"; + String page = ReadHTML(url); + if(page!=null) + { + try{ + JSONArray results = new JSONArray(page); + if(results.length()>0) + { + //Step 3: Read and extract the coordinates of the location as a JSONObject + Location loca = new Location(results.getJSONObject(0).getDouble("lat"),results.getJSONObject(0).getDouble("lon")); + return loca; + } + }catch(JSONException ex) + { + Logger.getLogger(LocationTranslationExample.class.getName()).log(Level.SEVERE, null, ex); + } + } + } + return null; + } + + /** + * Extracts the HTML content of a URL + * @param url + * @return html page + */ + public String ReadHTML(String url) + { + URLConnection conn = null; + URL theURL = null; + try + { + theURL = new URL(url); + } + catch ( MalformedURLException e) + { + System.out.println("Bad URL: " + theURL); + return null; + } + String page = ""; + try + { + conn = theURL.openConnection(); + HttpURLConnection huc = (HttpURLConnection) conn; + conn.setConnectTimeout(2000); + huc.setRequestProperty("User-Agent", "Mozilla/4.5"); + //Set your email address in the request so MapQuest knows how to reach you in the event of problems + huc.setRequestProperty("Email", "twitterdataanalytics@gmail.com"); + if(huc.getResponseCode()>=400&&huc.getResponseCode()<=404) + { + return null; + } + conn.connect(); + BufferedReader bRead = new BufferedReader(new InputStreamReader((InputStream) conn.getContent())); + String temp=null; + while( (temp= bRead.readLine())!=null) + { + page = page+"\n"+temp; + } + bRead.close(); + } + catch (IOException e) { + //System.out.print("ReadHTML IO Error:" + e.getMessage()+" \n"); + return null; + } + return page; + } + + public static void main(String[] args) + { + LocationTranslationExample lte = new LocationTranslationExample(); + if(args!=null) + { + if(args.length>0) + { + for(int i=0;i Usernames = new ArrayList(); + OAuthConsumer Consumer; + + /** + * Creates an OAuthConsumer with the current consumer & user access tokens and secrets + * @return consumer + */ + public OAuthConsumer GetConsumer() + { + OAuthConsumer consumer = new DefaultOAuthConsumer(utils.OAuthUtils.CONSUMER_KEY,utils.OAuthUtils.CONSUMER_SECRET); +
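//the consumer key/secret (from utils.OAuthUtils) identifies this application, while the user access token/secret loaded in LoadTwitterToken authorizes requests on the user's behalf +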
consumer.setTokenWithSecret(OAuthTokens.getAccessToken(),OAuthTokens.getAccessSecret()); + return consumer; + } + + /** + * Reads the file and loads the users to be crawled + * @param filename + */ + public void ReadUsers(String filename) + { + BufferedReader br = null; + try { + br = new BufferedReader(new InputStreamReader(new FileInputStream(filename), "UTF-8")); + String temp = ""; + while((temp = br.readLine())!=null) + { + if(!temp.isEmpty()) + { + Usernames.add(temp); + } + } + } catch (IOException ex) { + ex.printStackTrace(); + } + finally{ + try { + br.close(); + } catch (IOException ex) { + ex.printStackTrace(); + } + } + } + + /** + * Load the User Access Token and the User Access Secret + */ + public void LoadTwitterToken() + { + //Un-comment before release +// OAuthExample oae = new OAuthExample(); +// OAuthTokens = oae.GetUserAccessKeySecret(); + //Remove before release + OAuthTokens = OAuthExample.DEBUGUserAccessSecret(); + } + + public static void main(String[] args) + { + RESTApiExample rae = new RESTApiExample(); + rae.LoadTwitterToken(); + rae.Consumer = rae.GetConsumer(); +// System.out.println(rae.GetStatuses("twtanalyticsbk")); + System.out.println(rae.GetRateLimitStatus()); +// int apicode = InfoType.PROFILE_INFO; +// String infilename = rae.DEF_FILENAME; +// String outfilename = rae.DEF_OUTFILENAME; +// if(args!=null) +// { +// if(args.length>2) +// { +// apicode = Integer.parseInt(args[2]); +// outfilename = args[1]; +// infilename = args[0]; +// } +// if(args.length>1) +// { +// outfilename = args[1]; +// infilename = args[0]; +// } +// else +// if(args.length>0) +// { +// infilename = args[0]; +// } +// } +// rae.InitializeWriters(outfilename); +// rae.ReadUsers(infilename); +// if(apicode!=InfoType.PROFILE_INFO&&apicode!=InfoType.FOLLOWER_INFO&&apicode!=InfoType.FRIEND_INFO&&apicode!=InfoType.STATUSES_INFO) +// { +// System.out.println("Invalid API type: Use 0 for Profile, 1 for Followers, 2 for Friends, and 3 for Statuses"); +// System.exit(0); +// } +// if(rae.Usernames.size()>0) +// { +// //TO-DO: Print the possible API types and get user selection to crawl the users.
+// rae.LoadTwitterToken(); +// for(String user:rae.Usernames) +// { +// if(apicode==InfoType.PROFILE_INFO) +// { +// JSONObject jobj = rae.GetProfile(user); +// if(jobj!=null&&jobj.length()>0) +// { +// rae.WriteToFile(user, jobj.toString()); +// } +// } +// else +// if(apicode==InfoType.FRIEND_INFO) +// { +// JSONArray statusarr = rae.GetFriends(user); +// if(statusarr.length()>0) +// { +// rae.WriteToFile(user, statusarr.toString()); +// } +// } +// else +// if(apicode == InfoType.FOLLOWER_INFO) +// { +// JSONArray statusarr = rae.GetFollowers(user); +// if(statusarr.length()>0) +// { +// rae.WriteToFile(user, statusarr.toString()); +// } +// } +// else +// if(apicode == InfoType.STATUSES_INFO) +// { +// JSONArray statusarr = rae.GetStatuses(user); +// if(statusarr.length()>0) +// { +// rae.WriteToFile(user, statusarr.toString()); +// } +// } +// } +// } +//// now you can close the files as all the threads have finished +// rae.CleanupAfterFinish(); + } + + /** + * Retrieves the rate limit status of the application + * @return + */ + public JSONObject GetRateLimitStatus() + { + try{ + URL url = new URL("https://api.twitter.com/1.1/application/rate_limit_status.json"); + HttpURLConnection huc = (HttpURLConnection) url.openConnection(); + huc.setReadTimeout(5000); + Consumer.sign(huc); + huc.connect(); + BufferedReader bRead = new BufferedReader(new InputStreamReader((InputStream) huc.getContent())); + StringBuffer page = new StringBuffer(); + String temp= ""; + while((temp = bRead.readLine())!=null) + { + page.append(temp); + } + bRead.close(); + return (new JSONObject(page.toString())); + } catch (JSONException ex) { + Logger.getLogger(RESTApiExample.class.getName()).log(Level.SEVERE, null, ex); + } catch (OAuthCommunicationException ex) { + Logger.getLogger(RESTApiExample.class.getName()).log(Level.SEVERE, null, ex); + } catch (OAuthMessageSignerException ex) { + Logger.getLogger(RESTApiExample.class.getName()).log(Level.SEVERE, null, ex); + } catch (OAuthExpectationFailedException ex) { + Logger.getLogger(RESTApiExample.class.getName()).log(Level.SEVERE, null, ex); + }catch(IOException ex) + { + Logger.getLogger(RESTApiExample.class.getName()).log(Level.SEVERE, null, ex); + } + return null; + } + + /** + * Initialize the file writer + * @param outFilename path of the output file + */ + public void InitializeWriters(String outFilename) { + try { + File fl = new File(outFilename); + if(!fl.exists()) + { + fl.createNewFile(); + } + /** + * Use UTF-8 encoding when saving files to avoid + * losing Unicode characters in the data + */ + OutFileWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outFilename,true),"UTF-8")); + } catch (IOException ex) { + ex.printStackTrace(); + } + } + + /** + * Close the opened filewriter to save the data + */ + public void CleanupAfterFinish() + { + try { + OutFileWriter.close(); + } catch (IOException ex) { + Logger.getLogger(RESTApiExample.class.getName()).log(Level.SEVERE, null, ex); + } + } + + /** + * Writes the retrieved data to the output file + * @param data containing the retrieved information in JSON + * @param user name of the user currently being written + */ + public void WriteToFile(String user, String data) + { + try + { + OutFileWriter.write(data); + OutFileWriter.newLine(); + } catch (IOException ex) { + ex.printStackTrace(); + } + } + + /** + * Retrieves the profile information of the user + * @param username of the user whose profile needs to be retrieved + * @return the profile information as a JSONObject + */ + public
JSONObject GetProfile(String username) + { + BufferedReader bRead = null; + JSONObject profile = null; + try { + System.out.println("Processing profile of "+username); + boolean flag = true; + URL url = new URL("https://api.twitter.com/1.1/users/show.json?screen_name="+username); + HttpURLConnection huc = (HttpURLConnection) url.openConnection(); + huc.setReadTimeout(5000); + // Step 2: Sign the request using the OAuth Secret + Consumer.sign(huc); + huc.connect(); + if(huc.getResponseCode()==404||huc.getResponseCode()==401) + { + System.out.println(huc.getResponseMessage()); + } + else + if(huc.getResponseCode()==500||huc.getResponseCode()==502||huc.getResponseCode()==503) + { + try { + huc.disconnect(); + System.out.println(huc.getResponseMessage()); + Thread.sleep(3000); + } catch (InterruptedException ex) { + ex.printStackTrace(); + } + } + else + // Step 3: If the requests have been exhausted, then wait until the quota is renewed + if(huc.getResponseCode()==429) + { + try { + huc.disconnect(); + Thread.sleep(this.GetWaitTime("/users/show/:id")); + flag = false; + } catch (InterruptedException ex) { + ex.printStackTrace(); + } + } + if(!flag) + { + //recreate the connection because something went wrong the first time. + huc.connect(); + } + StringBuilder content=new StringBuilder(); + if(flag) + { + bRead = new BufferedReader(new InputStreamReader((InputStream) huc.getContent())); + String temp= ""; + while((temp = bRead.readLine())!=null) + { + content.append(temp); + } + } + huc.disconnect(); + try { + profile = new JSONObject(content.toString()); + } catch (JSONException ex) { + ex.printStackTrace(); + } + } catch (OAuthCommunicationException ex) { + ex.printStackTrace(); + } catch (OAuthMessageSignerException ex) { + ex.printStackTrace(); + } catch (OAuthExpectationFailedException ex) { + ex.printStackTrace(); + } catch (IOException ex) { + ex.printStackTrace(); + } + return profile; + } + + /** + * Retrieves the followers of a user + * @param username the name of the user whose followers need to be retrieved + * @return a list of user objects corresponding to the followers of the user + */ + public JSONArray GetFollowers(String username) + { + BufferedReader bRead = null; + JSONArray followers = new JSONArray(); + try { + System.out.println(" followers user = "+username); + long cursor = -1; + while(true) + { + if(cursor==0) + { + break; + } + // Step 1: Create the API request using the supplied username + URL url = new URL("https://api.twitter.com/1.1/followers/list.json?screen_name="+username+"&cursor=" + cursor); + HttpURLConnection huc = (HttpURLConnection) url.openConnection(); + huc.setReadTimeout(5000); + // Step 2: Sign the request using the OAuth Secret + Consumer.sign(huc); + huc.connect(); + if(huc.getResponseCode()==400||huc.getResponseCode()==404) + { + System.out.println(huc.getResponseMessage()); + break; + } + else + if(huc.getResponseCode()==500||huc.getResponseCode()==502||huc.getResponseCode()==503||huc.getResponseCode()==504) + { + try{ + System.out.println(huc.getResponseMessage()); + huc.disconnect(); + Thread.sleep(3000); + continue; + } catch (InterruptedException ex) { + Logger.getLogger(RESTApiExample.class.getName()).log(Level.SEVERE, null, ex); + } + } + else + // Step 3: If the requests have been exhausted, then wait until the quota is renewed + if(huc.getResponseCode()==429) + { + try { + huc.disconnect(); + Thread.sleep(this.GetWaitTime("/followers/list")); + continue; + } catch (InterruptedException ex) {
Logger.getLogger(RESTApiExample.class.getName()).log(Level.SEVERE, null, ex); + } + } + // Step 4: Retrieve the followers list from Twitter + bRead = new BufferedReader(new InputStreamReader((InputStream) huc.getContent())); + StringBuilder content = new StringBuilder(); + String temp = ""; + while((temp = bRead.readLine())!=null) + { + content.append(temp); + } + try { + JSONObject jobj = new JSONObject(content.toString()); + // Step 5: Retrieve the token for the next request + cursor = jobj.getLong("next_cursor"); + JSONArray idlist = jobj.getJSONArray("users"); + if(idlist.length()==0) + { + break; + } + for(int i=0;i queryTerms) + { + String OR_Operator = " OR "; + StringBuffer querystr = new StringBuffer(); + int count = 1; + for(String term:queryTerms) + { + if(count==1) + { + querystr.append(term); + } + else + { + querystr.append(OR_Operator).append(term); + } + count++; + } + return querystr.toString(); + } + + public static void main(String[] args) + { + RESTSearchExample rse = new RESTSearchExample(); + ArrayList<String> queryterms = new ArrayList<String>(); + String outfilename = rse.DEF_FILENAME; + if(args!=null) + { + if(args.length>0) + { + for(int i=0;i Keywords; + HashSet<String> Geoboxes; + HashSet<String> Userids; + final String CONFIG_FILE_PATH = "streaming/streaming.config"; + final String DEF_OUTPATH = "streaming/"; + + /** + * Loads the Twitter access token and secret for a user + */ + public void LoadTwitterToken() + { +// OAuthExample oae = new OAuthExample(); +// OAuthToken = oae.GetUserAccessKeySecret(); + OAuthToken = OAuthExample.DEBUGUserAccessSecret(); + } + + /** + * Creates a connection to the Streaming Filter API + * @param baseUrl the URL for Twitter Filter API + * @param outFilePath Location to place the exported file + */ + public void CreateStreamingConnection(String baseUrl, String outFilePath) + { + HttpClient httpClient = new DefaultHttpClient(); + httpClient.getParams().setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, new Integer(90000)); + //Step 1: Initialize OAuth Consumer + OAuthConsumer consumer = new CommonsHttpOAuthConsumer(OAuthUtils.CONSUMER_KEY,OAuthUtils.CONSUMER_SECRET); + consumer.setTokenWithSecret(OAuthToken.getAccessToken(),OAuthToken.getAccessSecret()); + //Step 2: Create a new HTTP POST request and set parameters + HttpPost httppost = new HttpPost(baseUrl); + try { + httppost.setEntity(new UrlEncodedFormEntity(CreateRequestBody(), "UTF-8")); + } catch (UnsupportedEncodingException ex) { + ex.printStackTrace(); + } + try { + //Step 3: Sign the request + consumer.sign(httppost); + } catch (OAuthMessageSignerException ex) { + ex.printStackTrace(); + } catch (OAuthExpectationFailedException ex) { + ex.printStackTrace(); + } catch (OAuthCommunicationException ex) { + ex.printStackTrace(); + } + HttpResponse response; + InputStream is = null; + try { + //Step 4: Connect to the API + response = httpClient.execute(httppost); + if (response.getStatusLine().getStatusCode()!= HttpStatus.SC_OK) + { + throw new IOException("Got status " +response.getStatusLine().getStatusCode()); + } + else + { + System.out.println(OAuthToken.getAccessToken()+ ": Processing from " + baseUrl); + HttpEntity entity = response.getEntity(); + try { + is = entity.getContent(); + } catch (IOException ex) { + ex.printStackTrace(); + } catch (IllegalStateException ex) { + ex.printStackTrace(); + } + //Step 5: Process the incoming Tweet Stream + this.ProcessTwitterStream(is, outFilePath); + } + } catch (IOException ex) { + ex.printStackTrace(); + }finally { + // Abort the method, otherwise
releaseConnection() will + // attempt to finish reading the never-ending response. + // These methods do not throw exceptions. + if(is!=null) + { + try { + is.close(); + } catch (IOException ex) { + ex.printStackTrace(); + } + } + } + } + + /** + * Processes a stream of tweets and writes them to a file, one tweet per line. Each tweet here is represented by a JSON document. + * @param is input stream already connected to the streaming API + * @param outFilePath file to put the collected tweets in + */ + public void ProcessTwitterStream(InputStream is, String outFilePath) + { + BufferedWriter bwrite = null; + try { + JSONTokener jsonTokener = new JSONTokener(new InputStreamReader(is, "UTF-8")); + ArrayList<JSONObject> rawtweets = new ArrayList<JSONObject>(); + int nooftweetsuploaded = 0; + while (true) { + try { + JSONObject temp = new JSONObject(jsonTokener); + rawtweets.add(temp); +// System.out.println(temp); + if (rawtweets.size() >= RECORDS_TO_PROCESS) + { + Calendar cal = Calendar.getInstance(); + String filename = outFilePath + "tweets_" + cal.getTimeInMillis() + ".json"; + bwrite = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(filename), "UTF-8")); + nooftweetsuploaded += RECORDS_TO_PROCESS; + //Write the collected tweets to a file + for (JSONObject jobj : rawtweets) { + bwrite.write(jobj.toString()); + bwrite.newLine(); + } + System.out.println("Written "+nooftweetsuploaded+" records so far"); + bwrite.close(); + rawtweets.clear(); + } + } catch (JSONException ex) { + ex.printStackTrace(); + } + } + } catch (IOException ex) { + ex.printStackTrace(); + } + } + + public static void main(String[] args) + { + StreamingApiExample sae = new StreamingApiExample(); + sae.LoadTwitterToken(); + //load parameters from a TSV file + String filename = sae.CONFIG_FILE_PATH; + String outfilepath = sae.DEF_OUTPATH; + if(args!=null) + { + if(args.length>0) + { + filename = args[0]; + } + if(args.length>1) + { + File fl = new File(args[1]); + if(fl.exists()&&fl.isDirectory()) + { + outfilepath = args[1]; + } + } + } + sae.ReadParameters(filename); + sae.CreateStreamingConnection("https://stream.twitter.com/1.1/statuses/filter.json", outfilepath); + } + + /** + * Reads the file and loads the parameters to be crawled.
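A hypothetical example of streaming.config (three lines; <TAB> marks a tab character; the values are invented for illustration only): + *   occupywallstreet<TAB>ows<TAB>nypd + *   -97.80,30.25,-97.65,30.35 + *   12345678<TAB>87654321 + *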
The first line holds the track keywords, the second the location bounding boxes, and the third the user IDs to follow; values within a line are tab-separated. + * @param filename + */ + public void ReadParameters(String filename) + { + BufferedReader br = null; + try { + br = new BufferedReader(new InputStreamReader(new FileInputStream(filename), "UTF-8")); + String temp = ""; + int count = 1; + if(Userids==null) + { + Userids = new HashSet<String>(); + } + if(Geoboxes==null) + { + Geoboxes = new HashSet<String>(); + } + if(Keywords==null) + { + Keywords = new HashSet<String>(); + } + while((temp = br.readLine())!=null) + { + if(!temp.isEmpty()) + { + if(count==1) + { + String[] keywords = temp.split("\t"); + HashSet<String> temptags = new HashSet<String>(); + for(String word:keywords) + { + if(!temptags.contains(word)) + { + temptags.add(word); + } + } + FilterKeywords(temptags); + } + else + if(count==2) + { + String[] geoboxes = temp.split("\t"); + HashSet<String> tempboxes = new HashSet<String>(); + for(String box:geoboxes) + { + if(!tempboxes.contains(box)) + { + tempboxes.add(box); + } + } + FilterGeoboxes(tempboxes); + } + else + if(count==3) + { + String[] userids = temp.split("\t"); + HashSet<String> tempids = new HashSet<String>(); + for(String id:userids) + { + if(!tempids.contains(id)) + { + tempids.add(id); + } + } + FilterUserids(tempids); + } + count++; + } + } + } catch (IOException ex) { + ex.printStackTrace(); + } + finally{ + try { + br.close(); + } catch (IOException ex) { + ex.printStackTrace(); + } + } + } + + private void FilterUserids(HashSet<String> userids) + { + if(userids!=null) + { + int maxsize = MAX_USERS; + if(userids.size() geoboxes) + { + if(geoboxes!=null) + { + int maxsize = MAX_GEOBOXES; + if(geoboxes.size() hashtags) + { + if(hashtags!=null) + { + int maxsize = MAX_KEYWORDS; + if(hashtags.size() CreateRequestBody() + { + List<NameValuePair> params = new ArrayList<NameValuePair>(); + if(Userids != null&&Userids.size()>0) + { + params.add(CreateNameValuePair("follow", Userids)); + System.out.println("userids = "+Userids); + } + if (Geoboxes != null&&Geoboxes.size()>0) { + params.add(CreateNameValuePair("locations", Geoboxes)); + System.out.println("locations = "+Geoboxes); + + } + if (Keywords != null&&Keywords.size()>0) { + params.add(CreateNameValuePair("track", Keywords)); + System.out.println("keywords = "+Keywords); + } + return params; + } + + private NameValuePair CreateNameValuePair(String name, Collection<String> items) + { + StringBuilder sb = new StringBuilder(); + boolean needComma = false; + for (String item : items) { + if (needComma) { + sb.append(','); + } + needComma = true; + sb.append(item); + } + return new BasicNameValuePair(name, sb.toString()); + } +}
diff --git a/src/Chapter2/support/APIType.java b/src/Chapter2/support/APIType.java new file mode 100644 index 0000000..94449f8 --- /dev/null +++ b/src/Chapter2/support/APIType.java @@ -0,0 +1,12 @@ +/* TweetTracker. Copyright (c) Arizona Board of Regents on behalf of Arizona State University + * @author shamanth + */ +package Chapter2.support; + +public class APIType +{ + public static String USER_TIMELINE = "/statuses/user_timeline"; + public static String FOLLOWERS = "/followers/list"; + public static String FRIENDS = "/friends/list"; + public static String USER_PROFILE = "/users/show"; +}
diff --git a/src/Chapter2/support/InfoType.java b/src/Chapter2/support/InfoType.java new file mode 100644 index 0000000..42b0334 --- /dev/null +++ b/src/Chapter2/support/InfoType.java @@ -0,0 +1,12 @@ +/* TweetTracker.
Copyright (c) Arizona Board of Regents on behalf of Arizona State University + * @author shamanth + */ +package Chapter2.support; + +public class InfoType +{ + public static final int PROFILE_INFO = 0; + public static final int FOLLOWER_INFO = 1; + public static final int FRIEND_INFO = 2; + public static final int STATUSES_INFO = 3; +} diff --git a/src/Chapter2/support/Location.java b/src/Chapter2/support/Location.java new file mode 100644 index 0000000..7f6234f --- /dev/null +++ b/src/Chapter2/support/Location.java @@ -0,0 +1,28 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. + */ + +package Chapter2.support; + +/** + * + * @author shamanth + */ +public class Location +{ + public Double latitude; + public Double longitude; + + public Location(Double lat,Double lng) + { + latitude = lat; + longitude = lng; + } + + @Override + public String toString() + { + return "Latitude: "+latitude+" & Longitude: "+longitude; + } +} diff --git a/src/Chapter2/support/OAuthTokenSecret.java b/src/Chapter2/support/OAuthTokenSecret.java new file mode 100644 index 0000000..8fee4a8 --- /dev/null +++ b/src/Chapter2/support/OAuthTokenSecret.java @@ -0,0 +1,38 @@ +/* TweetTracker. Copyright (c) Arizona Board of Regents on behalf of Arizona State University + * @author shamanth + */ +package Chapter2.support; + +public class OAuthTokenSecret +{ + String UserAccessToken; + String UserAccessSecret; + + public String getAccessSecret() { + return UserAccessSecret; + } + + public void setAccessSecret(String AccessSecret) { + this.UserAccessSecret = AccessSecret; + } + + public String getAccessToken() { + return UserAccessToken; + } + + public void setAccessToken(String AccessToken) { + this.UserAccessToken = AccessToken; + } + + public OAuthTokenSecret(String token,String secret) + { + this.setAccessToken(token); + this.setAccessSecret(secret); + } + + @Override + public String toString() + { + return "Access Token: "+getAccessToken()+" Access Secret: "+getAccessSecret(); + } +} diff --git a/src/Chapter4/GraphElements/RetweetEdge.java b/src/Chapter4/GraphElements/RetweetEdge.java new file mode 100644 index 0000000..83836a0 --- /dev/null +++ b/src/Chapter4/GraphElements/RetweetEdge.java @@ -0,0 +1,53 @@ +package GraphElements; + + +public class RetweetEdge { + private UserNode to, from; + private int retweetCount; + + public RetweetEdge(UserNode to, UserNode from){ + this.to = to; + this.from = from; + retweetCount = 1; + } + + public void incrementRTCount(){ + retweetCount++; + } + + public UserNode getTo() { + return to; + } + public void setTo(UserNode to) { + this.to = to; + } + public UserNode getFrom() { + return from; + } + public void setFrom(UserNode from) { + this.from = from; + } + public int getRetweetCount() { + return retweetCount; + } + public void setRetweetCount(int retweetCount) { + this.retweetCount = retweetCount; + } + + public boolean equals(Object maybeEdge){ + if(maybeEdge instanceof RetweetEdge){ + RetweetEdge edge = (RetweetEdge) maybeEdge; + return edge.to.equals(to) && edge.from.equals(from); + } + return false; + + } + + public String toString(){ + return from + " -> " + to; + } + + public int hashCode(){ + return toString().hashCode(); + } +} diff --git a/src/Chapter4/GraphElements/UserNode.java b/src/Chapter4/GraphElements/UserNode.java new file mode 100644 index 0000000..fba4419 --- /dev/null +++ b/src/Chapter4/GraphElements/UserNode.java @@ -0,0 +1,34 @@ +package GraphElements; + + + +public class UserNode { + private String username; 
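+ //the Twitter screen name; equals() and hashCode() below key on this field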
+ + public UserNode(String username){ + this.username = username; + } + + public String getUsername() { + return username; + } + + public void setUsername(String username) { + this.username = username; + } + + public boolean equals(Object un){ + if(un instanceof UserNode){ + return username.equals(((UserNode)un).username); + } + return false; + } + + public String toString(){ + return username; + } + + public int hashCode(){ + return username.hashCode(); + } +}
diff --git a/src/Chapter4/centrality/examples/BetweennessCentralityExample.java b/src/Chapter4/centrality/examples/BetweennessCentralityExample.java new file mode 100644 index 0000000..ab9f7e6 --- /dev/null +++ b/src/Chapter4/centrality/examples/BetweennessCentralityExample.java @@ -0,0 +1,31 @@ +package centrality.examples; + +import Chapter4.util.TweetFileToGraph; +import java.io.File; +import GraphElements.RetweetEdge; +import GraphElements.UserNode; +import edu.uci.ics.jung.algorithms.importance.BetweennessCentrality; +import edu.uci.ics.jung.graph.DirectedGraph; + +public class BetweennessCentralityExample { + public static void main(String[] args){ + + File tweetFile; + + if(args.length > 0){ + tweetFile = new File(args[0]); + } + else{ + tweetFile = new File("synthetic_retweet_network.json"); + } + + DirectedGraph<UserNode, RetweetEdge> retweetGraph = TweetFileToGraph.getRetweetNetwork(tweetFile); + + //calculate the betweenness centrality + BetweennessCentrality<UserNode, RetweetEdge> betweenness = new BetweennessCentrality<UserNode, RetweetEdge>(retweetGraph); + + betweenness.evaluate(); + betweenness.printRankings(true, true); + + } +}
diff --git a/src/Chapter4/centrality/examples/EigenvectorCentralityExample.java b/src/Chapter4/centrality/examples/EigenvectorCentralityExample.java new file mode 100644 index 0000000..172dd16 --- /dev/null +++ b/src/Chapter4/centrality/examples/EigenvectorCentralityExample.java @@ -0,0 +1,36 @@ +package centrality.examples; + +import Chapter4.util.TweetFileToGraph; +import java.io.File; +import GraphElements.RetweetEdge; +import GraphElements.UserNode; +import edu.uci.ics.jung.algorithms.scoring.EigenvectorCentrality; +import edu.uci.ics.jung.graph.DirectedGraph; + +public class EigenvectorCentralityExample { + public static void main(String[] args){ + + File tweetFile; + + if(args.length > 0){ + tweetFile = new File(args[0]); + } + else{ + tweetFile = new File("synthetic_retweet_network.json"); + } + + DirectedGraph<UserNode, RetweetEdge> retweetGraph = TweetFileToGraph.getRetweetNetwork(tweetFile); + +// EigenVectorScorer scorer = new EigenVectorScorer(retweetGraph); +// for(UserNode node : retweetGraph.getVertices()){ +// System.out.println(node + " - " + scorer.getVertexScore(node)); +// } + + EigenvectorCentrality<UserNode, RetweetEdge> eig = new EigenvectorCentrality<UserNode, RetweetEdge>(retweetGraph); + eig.evaluate(); + + for(UserNode node : retweetGraph.getVertices()){ + System.out.println(node + " - " + eig.getVertexScore(node)); + } + } +}
diff --git a/src/Chapter4/centrality/examples/InDegreeCentralityExample.java b/src/Chapter4/centrality/examples/InDegreeCentralityExample.java new file mode 100644 index 0000000..6a027ac --- /dev/null +++ b/src/Chapter4/centrality/examples/InDegreeCentralityExample.java @@ -0,0 +1,30 @@ +package Chapter4.centrality.examples; + +import Chapter4.util.TweetFileToGraph; +import java.io.File; +import GraphElements.RetweetEdge; +import GraphElements.UserNode; +import edu.uci.ics.jung.graph.DirectedGraph; + +public class InDegreeCentralityExample { + + public static void main(String[] args){ + + File tweetFile; + + if(args.length > 0){ + tweetFile = new File(args[0]); + } + else{
tweetFile = new File("synthetic_retweet_network.json"); + } + + DirectedGraph retweetGraph = TweetFileToGraph.getRetweetNetwork(tweetFile); + + //calculate the betweenness centrality + for(UserNode node : retweetGraph.getVertices()){ + System.out.println(node + " - " + retweetGraph.getInEdges(node).size()); + } + + } +} diff --git a/src/Chapter4/centrality/examples/PageRankCentralityExample.java b/src/Chapter4/centrality/examples/PageRankCentralityExample.java new file mode 100644 index 0000000..dd44efd --- /dev/null +++ b/src/Chapter4/centrality/examples/PageRankCentralityExample.java @@ -0,0 +1,39 @@ +package Chapter4.centrality.examples; + +import Chapter4.util.TweetFileToGraph; +import java.io.File; +import GraphElements.RetweetEdge; +import GraphElements.UserNode; +import edu.uci.ics.jung.algorithms.scoring.PageRank; +import edu.uci.ics.jung.graph.DirectedGraph; + +public class PageRankCentralityExample { + public static void main(String[] args){ + + File tweetFile; + + if(args.length > 0){ + tweetFile = new File(args[0]); + } + else{ + tweetFile = new File("synthetic_retweet_network.json"); + } + + DirectedGraph retweetGraph = TweetFileToGraph.getRetweetNetwork(tweetFile); + + + PageRank pageRank = new PageRank(retweetGraph, .5); + pageRank.evaluate(); + + for(UserNode node : retweetGraph.getVertices()){ + System.out.println(node + " - " + pageRank.getVertexScore(node)); + } + +// EigenvectorCentrality eig = new EigenvectorCentrality(retweetGraph); +// eig.evaluate(); +// +// for(UserNode node : retweetGraph.getVertices()){ +// System.out.println(node + " - " + eig.getVertexScore(node)); +// } + } +} diff --git a/src/Chapter4/classification/bayes/Classification.java b/src/Chapter4/classification/bayes/Classification.java new file mode 100644 index 0000000..ea9aba7 --- /dev/null +++ b/src/Chapter4/classification/bayes/Classification.java @@ -0,0 +1,22 @@ +package Chapter4.classification.bayes; + +public class Classification { + private String label; + private double confidence; + + public Classification(String label, double confidence){ + this.label = label; + this.confidence = confidence; + } + + public String getLabel() { + return label; + } + public double getConfidence() { + return confidence; + } + + public String toString(){ + return "(" + label + ", " + confidence + ")"; + } +} diff --git a/src/Chapter4/classification/bayes/NBCxv.java b/src/Chapter4/classification/bayes/NBCxv.java new file mode 100644 index 0000000..5c48e28 --- /dev/null +++ b/src/Chapter4/classification/bayes/NBCxv.java @@ -0,0 +1,60 @@ +package Chapter4.classification.bayes; + +import java.io.FileReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Map; + +import com.google.gson.JsonObject; +import com.google.gson.JsonStreamParser; + +public class NBCxv { + public static void main(String[] args){ + + String filename = args.length >= 1 ? 
args[0] : "owsemoticons.json"; + + ArrayList allTexts = new ArrayList(); + + try { + //read the file, and train each document + JsonStreamParser parser = new JsonStreamParser(new FileReader(filename)); + JsonObject elem; + while (parser.hasNext()) { + elem = parser.next().getAsJsonObject(); + allTexts.add(elem.get("text").getAsString()); + } + } catch (IOException e) { + e.printStackTrace(); + } + + //do 5-fold cross validation 3 times + Map> buckets; + int bucketIdx; + NaiveBayesSentimentClassifier nbsc; + for(int i = 0; i < 3; i++){ + + //randomly split the texts into 5 buckets + buckets = new HashMap>(); + //initialize the 5 buckets + for(int j = 0; j < 5; j++) buckets.put(j, new ArrayList()); + for(String text : allTexts){ + bucketIdx = (int) (Math.random()*5); + buckets.get(bucketIdx).add(text); + } + + for(int j = 0; j < 5; j++){ + //use all but j as the training, use j as the test. + nbsc = new NaiveBayesSentimentClassifier(); + for(int k = 0; k < 5; k++){ + if(k != j){ + nbsc.trainInstances(buckets.get(k)); + } + } + //test with bucket j + + } + } + + } +} diff --git a/src/Chapter4/classification/bayes/NaiveBayesSentimentClassifier.java b/src/Chapter4/classification/bayes/NaiveBayesSentimentClassifier.java new file mode 100644 index 0000000..923416c --- /dev/null +++ b/src/Chapter4/classification/bayes/NaiveBayesSentimentClassifier.java @@ -0,0 +1,264 @@ +package Chapter4.classification.bayes; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.StringTokenizer; + +/** + * This class performs both the training and classification steps of a Naive Bayes Classifier. + * + */ +public class NaiveBayesSentimentClassifier { + //the possible sentiment labels + private static final String[] SENTIMENT_LABELS = {"happy", "sad"}; + //the tokens to look for in labeling the sentiment. + private static final String[] HAPPY_SMILEYS = {":)", ";)", ":D", ":-)", ":o)", ":-D"}; + private static final String[] SAD_SMILEYS = {":(", ":-(", ":'(", ":'-(", "D:"}; + //store these as a set for faster retrieval + private static final Set HAPPY_SMILEY_SET = new HashSet(Arrays.asList(HAPPY_SMILEYS)); + private static final Set SAD_SMILEY_SET = new HashSet(Arrays.asList(SAD_SMILEYS)); + + //counter for the number of times each word has been associated with each sentiment. + private Map sentOccurs; + //counter for the number of times we've seen each sentiment. + private Integer[] sentCount; + + public NaiveBayesSentimentClassifier(){ + //initialize the counters + sentOccurs = new HashMap(); + sentCount = new Integer[SENTIMENT_LABELS.length]; + for(int i = 0; i < SENTIMENT_LABELS.length; i++){ + sentCount[i] = 0; + } + } + + /** + * Tokenize a string. Turns string into list of words based on whitespace, then + * removes stopwords, punctuation, and reduces the word to its stem. + * @param text + * The piece of text + * @return + * Each individual word. 
+     */
+    private List<String> getTokens(String text){
+        StringTokenizer tokens = new StringTokenizer(text);
+        ArrayList<String> words = new ArrayList<String>();
+        
+        String tmp;
+        StringBuilder sb;
+        while(tokens.hasMoreTokens()){
+            sb = new StringBuilder();
+            tmp = tokens.nextToken();
+            tmp = tmp.toLowerCase();
+            
+            for(char ch : tmp.toCharArray()){
+                if(Character.isLetter(ch)){
+                    sb.append(ch);
+                }
+            }
+            tmp = sb.toString();
+            if(tmp.length() > 0 && !StopwordsList.stopwordsSet.contains(tmp)){
+                words.add(sb.toString());
+            }
+        }
+        
+        return words;
+    }
+    
+    /**
+     * Checks if the tweet has a "label" (emoticon). If so, its words are
+     * added to the word/sentiment counts.
+     * @param tweetText
+     *   The text of the document to check.
+     */
+    public void trainInstance(String tweetText){
+        //see if the tweet is labeled (i.e. has a smiley)
+        int tweetLabel = extractLabel(tweetText);
+        List<String> tokens = getTokens(tweetText);
+        if(tweetLabel != -1){
+            //add these words to the classifier
+            updateClassifier(tokens, tweetLabel);
+        }
+    }
+    
+    public String printWordOccurs(int sentIndex, int topN){
+        StringBuilder sb = new StringBuilder();
+        
+        WordCountPair wpcset[] = new WordCountPair[sentOccurs.keySet().size()];
+        
+        String s;
+        int t = 0;
+        Iterator<String> sIter = sentOccurs.keySet().iterator();
+//        int totalCount = 0;
+//        while(sIter.hasNext()){
+//            s = sIter.next();
+//            totalCount += sentOccurs.get(s)[sentIndex];
+//        }
+        
+        sIter = sentOccurs.keySet().iterator();
+        while(sIter.hasNext()){
+            s = sIter.next();
+//            wpcset[t++] = new WordCountPair(s, sentOccurs.get(s)[sentIndex] * 1.0 / totalCount);
+            wpcset[t++] = new WordCountPair(s, Math.sqrt(sentOccurs.get(s)[sentIndex] * 1.0 ));
+        }
+        
+        Arrays.sort(wpcset);
+        
+        double frac;
+        for(int i = 0; (i < topN || topN <= 0) && i < wpcset.length; i++){
+            s = wpcset[i].getWord();
+            frac = wpcset[i].getCount();
+            
+            sb.append(s);
+            sb.append(":");
+            sb.append(frac);
+            sb.append("\n");
+        }
+        
+        return sb.toString();
+    }
+    
+    public void trainInstances(List<String> tweetTexts){
+        for(String text : tweetTexts){
+            trainInstance(text);
+        }
+    }
+    
+    /**
+     * Classify a tweet as happy or sad. This ignores the emoticon for demonstration purposes.
+     * @param tweetText
+     *   The text of the tweet
+     * @return
+     *   A Classification object containing the predicted sentiment of the tweet.
+     */
+    public Classification classify(String tweetText){
+        //stores the probability of each sentiment being the tweet's true sentiment.
+        double[] labelProbs = new double[SENTIMENT_LABELS.length];
+        //tokenize the string
+        List<String> tokens = getTokens(tweetText);
+        int maxLabelIdx = 0;
+        for(int i = 0; i < labelProbs.length; i++){
+            //calculate the probability that the tweet has that sentiment.
+            labelProbs[i] = calcLabelProb(tokens, i);
+            System.out.println(i + " -> " + labelProbs[i] );
+            //keep track of the label probability
+            maxLabelIdx = labelProbs[i] > labelProbs[maxLabelIdx] ? i : maxLabelIdx;
+        }
+        //calc the confidence
+        double conf = labelProbs[maxLabelIdx];
+        labelProbs[maxLabelIdx] = 0;
+        conf -= sumVector(labelProbs);
+        
+        return new Classification(SENTIMENT_LABELS[maxLabelIdx], conf);
+    }
+    
+    private int extractLabel(String tweetText){
+        StringTokenizer tokens = new StringTokenizer(tweetText);
+        while(tokens.hasMoreTokens()){
+            String token = tokens.nextToken();
+            if(HAPPY_SMILEY_SET.contains(token)){
+                return 0;
+            }
+            else if(SAD_SMILEY_SET.contains(token)){
+                return 1;
+            }
+        }
+        return -1;
+    }
+    
+    /**
+     * This updates the classifier's probabilities for each word
+     * with the new piece of text.
+     * @param tokens
+     *   The tokens in the tweet. 
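+     *   (as produced by getTokens(String) above).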
+     * @param sentIndex
+     *   The sentiment label.
+     */
+    private void updateClassifier(List<String> tokens, int sentIndex){
+        for(String token : tokens){
+            if(sentOccurs.containsKey(token)){
+                sentOccurs.get(token)[sentIndex] ++ ;
+            }
+            else{
+                //make a new array and put it
+                Integer[] newArray = {0, 0};
+                newArray[sentIndex] ++;
+                sentOccurs.put(token, newArray);
+            }
+        }
+        //update the overall document count
+        sentCount[sentIndex]++;
+    }
+    
+    /**
+     * The probability of the tweet having a given label.
+     * @param tokens
+     *   The tokens in the tweet.
+     * @param sentIndex
+     *   The index of the sentiment label being tested.
+     * @return
+     *   The probability the tweet has the class label indicated by "sentIndex".
+     */
+    private double calcLabelProb(List<String> tokens, int sentIndex){
+        
+        //calculate the class probabilities
+        double[] pClass = new double[SENTIMENT_LABELS.length];
+        int cSum = sumVector(sentCount);
+        int totalWordCount = 0;
+        
+        for(int i = 0; i < sentCount.length; i++){
+            pClass[i] = sentCount[i] * 1.0 / cSum;
+        }
+        
+        //accumulate the total number of word occurrences across the vocabulary
+        for(String word : sentOccurs.keySet()){
+            Integer[] wordCt = sentOccurs.get(word);
+            totalWordCount += sumVector(wordCt);
+        }
+        
+        
+        double p = 1.0;
+        boolean foundOne = false;
+        for(String token : tokens){
+            if(sentOccurs.containsKey(token)){
+                foundOne = true;
+                Integer[] probs = sentOccurs.get(token);
+                double pWordGivenClass = probs[sentIndex] / (double)(sumVector(probs));
+                double pWord = sumVector(probs) / (double) totalWordCount;
+                p *= pWordGivenClass * pClass[sentIndex] / pWord;
+            }
+        }
+        return foundOne ? p : 0.0;
+    }
+    
+    /**
+     * Helper function to sum the values in a 1D array.
+     * @param vector
+     *   The 1D array to sum.
+     * @return
+     *   The sum.
+     */
+    private double sumVector(double[] vector){
+        double sum = 0.0;
+        for(double d : vector) sum += d;
+        return sum;
+    }
+    
+    /**
+     * Helper function to sum the values in a 1D array.
+     * @param vector
+     *   The 1D array to sum.
+     * @return
+     *   The sum. 
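+     *
+     *   e.g. sumVector(new Integer[]{2, 3}) returns 5; it is used above to
+     *   total the per-class word and document counters.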
+ */ + private int sumVector(Integer[] vector){ + int sum = 0; + for(int d : vector) sum += d; + return sum; + } +} diff --git a/src/Chapter4/classification/bayes/StopwordsList.java b/src/Chapter4/classification/bayes/StopwordsList.java new file mode 100644 index 0000000..06edd5a --- /dev/null +++ b/src/Chapter4/classification/bayes/StopwordsList.java @@ -0,0 +1,10 @@ +package Chapter4.classification.bayes; + +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; + +public class StopwordsList { + private static final String[] stopwords = {"a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "can", "did", "do", "does", "doing", "don", "down", "during", "each", "few", "for", "from", "further", "get", "had", "has", "have", "having", "he", "her", "here", "hers", "herself", "him", "himself", "his", "how", "i", "if", "im", "i'm", "in", "into", "is", "it", "its", "itself", "just", "me", "more", "most", "my", "myself", "no", "nor", "not", "now", "of", "off", "on", "once", "only", "or", "other", "our", "ours", "ourselves", "out", "over", "own", "rt", "s", "same", "she", "should", "so", "some", "such", "t", "than", "that", "the", "their", "theirs", "them", "themselves", "then", "there", "these", "they", "this", "those", "through", "to", "too", "under", "until", "up", "us", "very", "was", "we", "were", "what", "when", "where", "which", "while", "who", "whom", "why", "will", "with", "you", "your", "yours", "yourself", "yourselves"}; + public static final Set stopwordsSet = new HashSet(Arrays.asList(stopwords)); +} diff --git a/src/Chapter4/classification/bayes/TestNBC.java b/src/Chapter4/classification/bayes/TestNBC.java new file mode 100644 index 0000000..7e0e743 --- /dev/null +++ b/src/Chapter4/classification/bayes/TestNBC.java @@ -0,0 +1,49 @@ +package Chapter4.classification.bayes; + +import java.io.FileReader; +import java.io.IOException; + +import com.google.gson.JsonObject; +import com.google.gson.JsonStreamParser; + +public class TestNBC { + public static void main(String[] args){ + + String filename = args.length >= 1 ? 
args[0] : "owsemoticons.json"; + + //initialize the sentiment classifier + NaiveBayesSentimentClassifier nbsc = new NaiveBayesSentimentClassifier(); + + try { + //read the file, and train each document + JsonStreamParser parser = new JsonStreamParser(new FileReader(filename)); + JsonObject elem; + String text; + while (parser.hasNext()) { + elem = parser.next().getAsJsonObject(); + text = elem.get("text").getAsString(); + nbsc.trainInstance(text); + } + + //print out the positive and negative dictionary + System.out.println("=== Positive Dictionary ==="); + System.out.println(nbsc.printWordOccurs(0, 25)); + System.out.println("=== Negative Dictionary ==="); + System.out.println(nbsc.printWordOccurs(1, 25)); + + //now go through and classify each line as positive or negative +// parser = new JsonStreamParser(new FileReader(filename)); +// while (parser.hasNext()) { +// elem = parser.next().getAsJsonObject(); +// text = elem.get("text").getAsString(); +// Classification c = nbsc.classify(text); +// System.out.println(c + " -> " + text); +// } + System.out.println(nbsc.classify("I love new york")); + + } catch (IOException e) { + e.printStackTrace(); + } + + } +} diff --git a/src/Chapter4/classification/bayes/WordCountPair.java b/src/Chapter4/classification/bayes/WordCountPair.java new file mode 100644 index 0000000..b96be92 --- /dev/null +++ b/src/Chapter4/classification/bayes/WordCountPair.java @@ -0,0 +1,34 @@ +package Chapter4.classification.bayes; + +public class WordCountPair implements Comparable{ + + + private String word; + private double count; + + public WordCountPair(String word, double count){ + this.word = word; + this.count = count; + } + + public int compareTo(WordCountPair arg0) { + return arg0.count - count < 0 ? -1 : 1; + } + + public String getWord() { + return word; + } + + public void setWord(String word) { + this.word = word; + } + + public double getCount() { + return count; + } + + public void setCount(int count) { + this.count = count; + } + +} diff --git a/src/Chapter4/graph/visualization/SimpleGraphViewer.java b/src/Chapter4/graph/visualization/SimpleGraphViewer.java new file mode 100644 index 0000000..7cb46e4 --- /dev/null +++ b/src/Chapter4/graph/visualization/SimpleGraphViewer.java @@ -0,0 +1,86 @@ +package chapter4.graph.visualization; + +import Chapter4.util.TweetFileToGraph; +import java.awt.Dimension; +import java.awt.Shape; +import java.awt.geom.Ellipse2D; +import java.io.File; + +import javax.swing.JFrame; + +import org.apache.commons.collections15.Transformer; +import GraphElements.RetweetEdge; +import GraphElements.UserNode; +import edu.uci.ics.jung.algorithms.layout.KKLayout; +import edu.uci.ics.jung.algorithms.layout.Layout; +import edu.uci.ics.jung.algorithms.scoring.EigenvectorCentrality; +import edu.uci.ics.jung.graph.DirectedGraph; +import edu.uci.ics.jung.visualization.BasicVisualizationServer; + +public class SimpleGraphViewer { + public static void main(String[] args){ + + File tweetFile; + + if(args.length > 0){ + tweetFile = new File(args[0]); + } + else{ + tweetFile = new File("synthetic_retweet_network.json"); + } + + DirectedGraph retweetGraph = TweetFileToGraph.getRetweetNetwork(tweetFile); + + /* + * Converts a node to its string representation + */ + Transformer stringer = new Transformer(){ + public String transform(UserNode n){ + return n.toString(); + } + }; + + /* + * Calculate the centrality + */ + //calculate the betweenness centrality +// final InDegreeScorer centralityScore = new InDegreeScorer(retweetGraph); +// final 
BetweennessCentrality centralityScore = new BetweennessCentrality(retweetGraph); +// final PageRank centralityScore = new PageRank(retweetGraph, 0.85); + final EigenvectorCentrality centralityScore = new EigenvectorCentrality(retweetGraph); + centralityScore.evaluate(); + + double centralityMax = 0.0f; + for(UserNode node : retweetGraph.getVertices()){ + centralityMax = Math.max(centralityMax, centralityScore.getVertexScore(node)); + } + final double centralityMaxFinal = centralityMax; + + /* + * Sizes a node by some centrality measure + */ + Transformer shaper = new Transformer(){ + public Shape transform(UserNode n){ + System.out.println("User: " + n.getUsername() + " Cent: " + centralityScore.getVertexScore(n) + " Max: " + centralityMaxFinal); + double radius = 50 * (centralityScore.getVertexScore(n)) / centralityMaxFinal; + radius = Math.max(radius, 5.0f); + float fRadius = (float) radius; + return new Ellipse2D.Float(-fRadius/2, -fRadius/2, fRadius, fRadius); + } + }; + + Layout layout = new KKLayout(retweetGraph); + layout.setSize(new Dimension(500, 500)); + + BasicVisualizationServer vv = new BasicVisualizationServer(layout); + vv.setPreferredSize(new Dimension(550, 550)); + vv.getRenderContext().setVertexLabelTransformer(stringer); + vv.getRenderContext().setVertexShapeTransformer(shaper); + + JFrame jframe = new JFrame("Simple Graph View"); + jframe.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE); + jframe.getContentPane().add(vv); + jframe.pack(); + jframe.setVisible(true); + } +} diff --git a/src/Chapter4/tweetlda/LDA.java b/src/Chapter4/tweetlda/LDA.java new file mode 100644 index 0000000..ca7f9a3 --- /dev/null +++ b/src/Chapter4/tweetlda/LDA.java @@ -0,0 +1,89 @@ +package tweetlda; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.util.ArrayList; +import java.util.LinkedList; +import java.util.TreeSet; +import java.util.regex.Pattern; + +import org.json.JSONObject; + +import cc.mallet.pipe.CharSequence2TokenSequence; +import cc.mallet.pipe.CharSequenceLowercase; +import cc.mallet.pipe.Pipe; +import cc.mallet.pipe.SerialPipes; +import cc.mallet.pipe.TokenSequence2FeatureSequence; +import cc.mallet.pipe.TokenSequenceRemoveStopwords; +import cc.mallet.pipe.iterator.StringArrayIterator; +import cc.mallet.topics.ParallelTopicModel; +import cc.mallet.types.Alphabet; +import cc.mallet.types.IDSorter; +import cc.mallet.types.InstanceList; + +public class LDA { + + private static final String STOP_WORDS = "stopwords.txt"; + private static final int ITERATIONS = 100; + private static final int THREADS = 4; + private static final int NUM_TOPICS = 25; + private static final int NOM_WORDS_TO_ANALYZE = 25; + + public static void main(String[] args) throws Exception { + ArrayList pipeList = new ArrayList(); + File stopwords = new File(STOP_WORDS); + + String inputFileName = args.length >= 1 ? 
args[0] : "testows.json"; + + File inputFile = new File(inputFileName); + + // Lowercase, tokenize, remove stopwords, stem, and convert to features + pipeList.add((Pipe) new CharSequenceLowercase()); + pipeList.add((Pipe) new CharSequence2TokenSequence(Pattern.compile("\\p{L}[\\p{L}\\p{P}]+\\p{L}"))); + pipeList.add((Pipe) new TokenSequenceRemoveStopwords(stopwords, "UTF-8", false, false, false)); + pipeList.add((Pipe) new PorterStemmer()); + pipeList.add((Pipe) new TokenSequence2FeatureSequence()); + + InstanceList instances = new InstanceList(new SerialPipes(pipeList)); + + BufferedReader fileReader = new BufferedReader(new FileReader(inputFile)); + LinkedList textList = new LinkedList(); + String line; + while((line = fileReader.readLine()) != null){ + JSONObject elem = new JSONObject(line); + if(elem.has("text")){ + textList.add(elem.getString("text")); + } + } + + instances.addThruPipe(new StringArrayIterator(textList.toArray(new String[textList.size()]))); + + ParallelTopicModel model = new ParallelTopicModel(NUM_TOPICS); + model.addInstances(instances); + model.setNumThreads(THREADS); + model.setNumIterations(ITERATIONS); + model.estimate(); + + // The data alphabet maps word IDs to strings + Alphabet dataAlphabet = instances.getDataAlphabet(); + + int topicIdx=0; + StringBuilder sb; + for (TreeSet set : model.getSortedWords()) { + sb = new StringBuilder().append(topicIdx); + sb.append(" - "); + int j = 0; + double sum = 0.0; + for (IDSorter s : set) { + sum += s.getWeight(); + } + for (IDSorter s : set) { + sb.append(dataAlphabet.lookupObject(s.getID())).append(":").append(s.getWeight() / sum).append(", "); + if (++j >= NOM_WORDS_TO_ANALYZE) break; + } + System.out.println(sb.append("\n").toString()); + topicIdx++; + } + } +} diff --git a/src/Chapter4/tweetlda/PorterStemmer.java b/src/Chapter4/tweetlda/PorterStemmer.java new file mode 100644 index 0000000..1a7149e --- /dev/null +++ b/src/Chapter4/tweetlda/PorterStemmer.java @@ -0,0 +1,33 @@ +package tweetlda; + +import cc.mallet.pipe.Pipe; +import cc.mallet.types.Instance; +import cc.mallet.types.TokenSequence; + +public class PorterStemmer extends Pipe { + + private static final long serialVersionUID = 154100332101873830L; + + public Instance pipe(Instance carrier){ + TokenSequence ts = (TokenSequence) carrier.getData(); + String word; + Stemmer s; + + for(int i = 0; i < ts.size(); i++){ + word = ts.get(i).getText(); + //stem the word + s = new Stemmer(); + for(char ch : word.toCharArray()){ + if(Character.isLetter(ch)){ + s.add(ch); + } + } + s.stem(); + ts.get(i).setText(s.toString()); + } + carrier.setData(ts); + + return carrier; + } + +} diff --git a/src/Chapter4/tweetlda/Stemmer.java b/src/Chapter4/tweetlda/Stemmer.java new file mode 100644 index 0000000..f06dfc6 --- /dev/null +++ b/src/Chapter4/tweetlda/Stemmer.java @@ -0,0 +1,428 @@ +package tweetlda; + + +/* + + Porter stemmer in Java. The original paper is in + + Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14, + no. 3, pp 130-137, + + See also http://www.tartarus.org/~martin/PorterStemmer + + History: + + Release 1 + + Bug 1 (reported by Gonzalo Parra 16/10/99) fixed as marked below. + The words 'aed', 'eed', 'oed' leave k at 'a' for step 3, and b[k-1] + is then out outside the bounds of b. + + Release 2 + + Similarly, + + Bug 2 (reported by Steve Dyrdahl 22/2/00) fixed as marked below. + 'ion' by itself leaves j = -1 in the test for 'ion' in step 5, and + b[j] is then outside the bounds of b. 
+ + Release 3 + + Considerably revised 4/9/00 in the light of many helpful suggestions + from Brian Goetz of Quiotix Corporation (brian@quiotix.com). + + Release 4 + +*/ + +import java.io.*; + +/** + * Stemmer, implementing the Porter Stemming Algorithm + * + * The Stemmer class transforms a word into its root form. The input + * word can be provided a character at time (by calling add()), or at once + * by calling one of the various stem(something) methods. + */ + +class Stemmer +{ private char[] b; + private int i, /* offset into b */ + i_end, /* offset to end of stemmed word */ + j, k; + private static final int INC = 50; + /* unit of size whereby b is increased */ + public Stemmer() + { b = new char[INC]; + i = 0; + i_end = 0; + } + + /** + * Add a character to the word being stemmed. When you are finished + * adding characters, you can call stem(void) to stem the word. + */ + + public void add(char ch) + { if (i == b.length) + { char[] new_b = new char[i+INC]; + for (int c = 0; c < i; c++) new_b[c] = b[c]; + b = new_b; + } + b[i++] = ch; + } + + + /** Adds wLen characters to the word being stemmed contained in a portion + * of a char[] array. This is like repeated calls of add(char ch), but + * faster. + */ + + public void add(char[] w, int wLen) + { if (i+wLen >= b.length) + { char[] new_b = new char[i+wLen+INC]; + for (int c = 0; c < i; c++) new_b[c] = b[c]; + b = new_b; + } + for (int c = 0; c < wLen; c++) b[i++] = w[c]; + } + + /** + * After a word has been stemmed, it can be retrieved by toString(), + * or a reference to the internal buffer can be retrieved by getResultBuffer + * and getResultLength (which is generally more efficient.) + */ + public String toString() { return new String(b,0,i_end); } + + /** + * Returns the length of the word resulting from the stemming process. + */ + public int getResultLength() { return i_end; } + + /** + * Returns a reference to a character buffer containing the results of + * the stemming process. You also need to consult getResultLength() + * to determine the length of the result. + */ + public char[] getResultBuffer() { return b; } + + /* cons(i) is true <=> b[i] is a consonant. */ + + private final boolean cons(int i) + { switch (b[i]) + { case 'a': case 'e': case 'i': case 'o': case 'u': return false; + case 'y': return (i==0) ? true : !cons(i-1); + default: return true; + } + } + + /* m() measures the number of consonant sequences between 0 and j. if c is + a consonant sequence and v a vowel sequence, and <..> indicates arbitrary + presence, + + gives 0 + vc gives 1 + vcvc gives 2 + vcvcvc gives 3 + .... + */ + + private final int m() + { int n = 0; + int i = 0; + while(true) + { if (i > j) return n; + if (! cons(i)) break; i++; + } + i++; + while(true) + { while(true) + { if (i > j) return n; + if (cons(i)) break; + i++; + } + i++; + n++; + while(true) + { if (i > j) return n; + if (! cons(i)) break; + i++; + } + i++; + } + } + + /* vowelinstem() is true <=> 0,...j contains a vowel */ + + private final boolean vowelinstem() + { int i; for (i = 0; i <= j; i++) if (! cons(i)) return true; + return false; + } + + /* doublec(j) is true <=> j,(j-1) contain a double consonant. */ + + private final boolean doublec(int j) + { if (j < 1) return false; + if (b[j] != b[j-1]) return false; + return cons(j); + } + + /* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant + and also if the second c is not w,x or y. this is used when trying to + restore an e at the end of a short word. e.g. 
+ + cav(e), lov(e), hop(e), crim(e), but + snow, box, tray. + + */ + + private final boolean cvc(int i) + { if (i < 2 || !cons(i) || cons(i-1) || !cons(i-2)) return false; + { int ch = b[i]; + if (ch == 'w' || ch == 'x' || ch == 'y') return false; + } + return true; + } + + private final boolean ends(String s) + { int l = s.length(); + int o = k-l+1; + if (o < 0) return false; + for (int i = 0; i < l; i++) if (b[o+i] != s.charAt(i)) return false; + j = k-l; + return true; + } + + /* setto(s) sets (j+1),...k to the characters in the string s, readjusting + k. */ + + private final void setto(String s) + { int l = s.length(); + int o = j+1; + for (int i = 0; i < l; i++) b[o+i] = s.charAt(i); + k = j+l; + } + + /* r(s) is used further down. */ + + private final void r(String s) { if (m() > 0) setto(s); } + + /* step1() gets rid of plurals and -ed or -ing. e.g. + + caresses -> caress + ponies -> poni + ties -> ti + caress -> caress + cats -> cat + + feed -> feed + agreed -> agree + disabled -> disable + + matting -> mat + mating -> mate + meeting -> meet + milling -> mill + messing -> mess + + meetings -> meet + + */ + + private final void step1() + { if (b[k] == 's') + { if (ends("sses")) k -= 2; else + if (ends("ies")) setto("i"); else + if (b[k-1] != 's') k--; + } + if (ends("eed")) { if (m() > 0) k--; } else + if ((ends("ed") || ends("ing")) && vowelinstem()) + { k = j; + if (ends("at")) setto("ate"); else + if (ends("bl")) setto("ble"); else + if (ends("iz")) setto("ize"); else + if (doublec(k)) + { k--; + { int ch = b[k]; + if (ch == 'l' || ch == 's' || ch == 'z') k++; + } + } + else if (m() == 1 && cvc(k)) setto("e"); + } + } + + /* step2() turns terminal y to i when there is another vowel in the stem. */ + + private final void step2() { if (ends("y") && vowelinstem()) b[k] = 'i'; } + + /* step3() maps double suffices to single ones. so -ization ( = -ize plus + -ation) maps to -ize etc. note that the string before the suffix must give + m() > 0. */ + + private final void step3() { if (k == 0) return; /* For Bug 1 */ switch (b[k-1]) + { + case 'a': if (ends("ational")) { r("ate"); break; } + if (ends("tional")) { r("tion"); break; } + break; + case 'c': if (ends("enci")) { r("ence"); break; } + if (ends("anci")) { r("ance"); break; } + break; + case 'e': if (ends("izer")) { r("ize"); break; } + break; + case 'l': if (ends("bli")) { r("ble"); break; } + if (ends("alli")) { r("al"); break; } + if (ends("entli")) { r("ent"); break; } + if (ends("eli")) { r("e"); break; } + if (ends("ousli")) { r("ous"); break; } + break; + case 'o': if (ends("ization")) { r("ize"); break; } + if (ends("ation")) { r("ate"); break; } + if (ends("ator")) { r("ate"); break; } + break; + case 's': if (ends("alism")) { r("al"); break; } + if (ends("iveness")) { r("ive"); break; } + if (ends("fulness")) { r("ful"); break; } + if (ends("ousness")) { r("ous"); break; } + break; + case 't': if (ends("aliti")) { r("al"); break; } + if (ends("iviti")) { r("ive"); break; } + if (ends("biliti")) { r("ble"); break; } + break; + case 'g': if (ends("logi")) { r("log"); break; } + } } + + /* step4() deals with -ic-, -full, -ness etc. similar strategy to step3. 
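+
+      e.g. duplicate -> duplic, hopeful -> hope, goodness -> good.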
*/ + + private final void step4() { switch (b[k]) + { + case 'e': if (ends("icate")) { r("ic"); break; } + if (ends("ative")) { r(""); break; } + if (ends("alize")) { r("al"); break; } + break; + case 'i': if (ends("iciti")) { r("ic"); break; } + break; + case 'l': if (ends("ical")) { r("ic"); break; } + if (ends("ful")) { r(""); break; } + break; + case 's': if (ends("ness")) { r(""); break; } + break; + } } + + /* step5() takes off -ant, -ence etc., in context vcvc. */ + + private final void step5() + { if (k == 0) return; /* for Bug 1 */ switch (b[k-1]) + { case 'a': if (ends("al")) break; return; + case 'c': if (ends("ance")) break; + if (ends("ence")) break; return; + case 'e': if (ends("er")) break; return; + case 'i': if (ends("ic")) break; return; + case 'l': if (ends("able")) break; + if (ends("ible")) break; return; + case 'n': if (ends("ant")) break; + if (ends("ement")) break; + if (ends("ment")) break; + /* element etc. not stripped before the m */ + if (ends("ent")) break; return; + case 'o': if (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't')) break; + /* j >= 0 fixes Bug 2 */ + if (ends("ou")) break; return; + /* takes care of -ous */ + case 's': if (ends("ism")) break; return; + case 't': if (ends("ate")) break; + if (ends("iti")) break; return; + case 'u': if (ends("ous")) break; return; + case 'v': if (ends("ive")) break; return; + case 'z': if (ends("ize")) break; return; + default: return; + } + if (m() > 1) k = j; + } + + /* step6() removes a final -e if m() > 1. */ + + private final void step6() + { j = k; + if (b[k] == 'e') + { int a = m(); + if (a > 1 || a == 1 && !cvc(k-1)) k--; + } + if (b[k] == 'l' && doublec(k) && m() > 1) k--; + } + + /** Stem the word placed into the Stemmer buffer through calls to add(). + * Returns true if the stemming process resulted in a word different + * from the input. You can retrieve the result with + * getResultLength()/getResultBuffer() or toString(). + */ + public void stem() + { k = i - 1; + if (k > 1) { step1(); step2(); step3(); step4(); step5(); step6(); } + i_end = k+1; i = 0; + } + + /** Test program for demonstrating the Stemmer. It reads text from a + * a list of files, stems each word, and writes the result to standard + * output. Note that the word stemmed is expected to be in lower case: + * forcing lower case must be done outside the Stemmer class. + * Usage: Stemmer file-name file-name ... 
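+  * e.g. a file containing "caresses ponies meeting" is printed back as
+  * "caress poni meet" (matching the step1() examples above).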
+ */ + public static void main(String[] args) + { + char[] w = new char[501]; + Stemmer s = new Stemmer(); + for (int i = 0; i < args.length; i++) + try + { + FileInputStream in = new FileInputStream(args[i]); + + try + { while(true) + + { int ch = in.read(); + if (Character.isLetter((char) ch)) + { + int j = 0; + while(true) + { ch = Character.toLowerCase((char) ch); + w[j] = (char) ch; + if (j < 500) j++; + ch = in.read(); + if (!Character.isLetter((char) ch)) + { + /* to test add(char ch) */ + for (int c = 0; c < j; c++) s.add(w[c]); + + /* or, to test add(char[] w, int j) */ + /* s.add(w, j); */ + + s.stem(); + { String u; + + /* and now, to test toString() : */ + u = s.toString(); + + /* to test getResultBuffer(), getResultLength() : */ + /* u = new String(s.getResultBuffer(), 0, s.getResultLength()); */ + + System.out.print(u); + } + break; + } + } + } + if (ch < 0) break; + System.out.print((char)ch); + } + } + catch (IOException e) + { System.out.println("error reading " + args[i]); + break; + } + } + catch (FileNotFoundException e) + { System.out.println("file " + args[i] + " not found"); + break; + } + } +} diff --git a/src/Chapter4/util/BetweennessScorer.java b/src/Chapter4/util/BetweennessScorer.java new file mode 100644 index 0000000..0926d34 --- /dev/null +++ b/src/Chapter4/util/BetweennessScorer.java @@ -0,0 +1,25 @@ +package util; + +import GraphElements.RetweetEdge; +import GraphElements.UserNode; +import edu.uci.ics.jung.algorithms.scoring.VertexScorer; +import edu.uci.ics.jung.algorithms.shortestpath.DijkstraShortestPath; +import edu.uci.ics.jung.graph.Graph; +import edu.uci.ics.jung.graph.Hypergraph; + +public class BetweennessScorer implements VertexScorer{ + + public BetweennessScorer(Hypergraph graph){ + /* + * Step 1: Calculate the shortest path between each pair of nodes. + */ + DijkstraShortestPath paths = new DijkstraShortestPath((Graph) graph); +// paths.getDistance(source, target); + } + + public Double getVertexScore(UserNode arg0) { + // TODO Auto-generated method stub + return null; + } + +} diff --git a/src/Chapter4/util/EigenVectorScorer.java b/src/Chapter4/util/EigenVectorScorer.java new file mode 100644 index 0000000..da0c1a8 --- /dev/null +++ b/src/Chapter4/util/EigenVectorScorer.java @@ -0,0 +1,64 @@ +package Chapter4.util; + +import GraphElements.RetweetEdge; +import GraphElements.UserNode; +import cern.colt.matrix.DoubleMatrix2D; +import cern.colt.matrix.impl.SparseDoubleMatrix2D; +import cern.colt.matrix.linalg.EigenvalueDecomposition; +import edu.uci.ics.jung.algorithms.scoring.VertexScorer; +import edu.uci.ics.jung.graph.Hypergraph; + +/** + * This is a Jung Node Scorer that computes the Eigenvector Centrality for each node. + */ +public class EigenVectorScorer implements VertexScorer { + + private UserNode[] users; + private DoubleMatrix2D eigenVectors; + private int dominantEigenvectorIdx; + + public EigenVectorScorer(Hypergraph graph){ + users = new UserNode[graph.getVertexCount()]; + graph.getVertices().toArray(users); + + /* Step 1: Create the adjacency matrix. + * + * An adjacency matrix is a matrix with N users and N columns, + * where N is the number of nodes in the network. + * An entry in the matrix is 1 when node i connects to node j, + * and 0 otherwise. + */ + SparseDoubleMatrix2D matrix = new SparseDoubleMatrix2D(users.length, users.length); + for(int i = 0; i < users.length; i++){ + for(int j = 0; j < users.length; j++){ + matrix.setQuick(i, j, graph.containsEdge(new RetweetEdge(users[i], users[j])) ? 
1 : 0); + } + } + + /* Step 2: Find the principle eigenvector. + * For more information on eigen-decomposition please see + * http://mathworld.wolfram.com/EigenDecomposition.html + */ + EigenvalueDecomposition eig = new EigenvalueDecomposition(matrix); + DoubleMatrix2D eigenVals = eig.getD(); + eigenVectors = eig.getV(); + + dominantEigenvectorIdx = 0; + for(int i = 1; i < eigenVals.columns(); i++){ + if(eigenVals.getQuick(dominantEigenvectorIdx, dominantEigenvectorIdx) < + eigenVals.getQuick(i, i)){ + dominantEigenvectorIdx = i; + } + } + } + + public Double getVertexScore(UserNode arg0) { + for(int i = 0; i < users.length; i++){ + if(users[i].equals(arg0)){ + return Math.abs(eigenVectors.getQuick(i, dominantEigenvectorIdx)); + } + } + return null; + } + +} diff --git a/src/Chapter4/util/InDegreeScorer.java b/src/Chapter4/util/InDegreeScorer.java new file mode 100644 index 0000000..014adc6 --- /dev/null +++ b/src/Chapter4/util/InDegreeScorer.java @@ -0,0 +1,30 @@ +package Chapter4.util; + +import edu.uci.ics.jung.algorithms.scoring.VertexScorer; +import edu.uci.ics.jung.graph.Hypergraph; + +/** + * This is a Jung Node Scorer that computes the + * In-Degree Centrality of nodes. + */ +public class InDegreeScorer implements VertexScorer{ + + //The graph representation in JUNG. + private Hypergraph graph; + + /** + * Initialize the graph scorer. + * @param graph + * The graph we wish to score. + */ + public InDegreeScorer(Hypergraph graph){ + this.graph = graph; + } + + /** + * @return The In-Degree Centrality of the vertex. + */ + public Double getVertexScore(T node) { + return (double) graph.getInEdges(node).size(); + } +} \ No newline at end of file diff --git a/src/Chapter4/util/TweetFileProcessor.java b/src/Chapter4/util/TweetFileProcessor.java new file mode 100644 index 0000000..9b6b99c --- /dev/null +++ b/src/Chapter4/util/TweetFileProcessor.java @@ -0,0 +1,76 @@ +package Chapter4.util; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.UnsupportedEncodingException; +import java.util.Iterator; +import java.util.logging.Level; +import java.util.logging.Logger; +import org.json.JSONException; +import org.json.JSONObject; + +public class TweetFileProcessor implements Iterator{ + + protected BufferedReader fileBuffer; + protected boolean endOfFile; + protected String nextLine; + + public TweetFileProcessor(File f){ + + endOfFile = false; + + InputStreamReader isr; + BufferedReader br = null; + try { + isr = new InputStreamReader(new FileInputStream(f), "UTF-8"); + br = new BufferedReader(isr); + nextLine = br.readLine(); + } catch (UnsupportedEncodingException e) { + e.printStackTrace(); + endOfFile = true; + } catch (FileNotFoundException e) { + e.printStackTrace(); + endOfFile = true; + } catch (IOException e) { + e.printStackTrace(); + endOfFile = true; + } + finally{ + fileBuffer = br; + } + } + + @Override + public boolean hasNext() { + return !endOfFile; + } + + @Override + public JSONObject next() { + JSONObject obj = null; + try { + obj = new JSONObject(nextLine); + } catch (JSONException ex) { + Logger.getLogger(TweetFileProcessor.class.getName()).log(Level.SEVERE, null, ex); + } + try { + nextLine = fileBuffer.readLine(); + if(nextLine == null){ + endOfFile = true; + } + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + return obj; + } + + @Override + public void remove() throws 
UnsupportedOperationException{
+        throw new UnsupportedOperationException();
+    }
+}
diff --git a/src/Chapter4/util/TweetFileToGraph.java b/src/Chapter4/util/TweetFileToGraph.java
new file mode 100644
index 0000000..6cf2e3a
--- /dev/null
+++ b/src/Chapter4/util/TweetFileToGraph.java
@@ -0,0 +1,77 @@
+package Chapter4.util;
+
+import java.io.File;
+
+import GraphElements.RetweetEdge;
+import GraphElements.UserNode;
+
+import edu.uci.ics.jung.graph.DirectedGraph;
+import edu.uci.ics.jung.graph.DirectedSparseGraph;
+import edu.uci.ics.jung.graph.util.EdgeType;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import org.json.JSONException;
+import org.json.JSONObject;
+
+/**
+ * Some basic functionality to convert files collected
+ * in Chapter 2 to JUNG graphs.
+ */
+public class TweetFileToGraph {
+    
+    public static DirectedGraph<UserNode, RetweetEdge> getRetweetNetwork(File tweetFile){
+        
+        JSONObject tmp;
+        
+        TweetFileProcessor tfp = new TweetFileProcessor(tweetFile);
+        DirectedSparseGraph<UserNode, RetweetEdge> dsg = new DirectedSparseGraph<UserNode, RetweetEdge>();
+        
+        while (tfp.hasNext()){
+            tmp = tfp.next();
+            if(tmp==null)
+            {
+                continue;
+            }
+            //get the author
+            String user=null;
+            try {
+                user = tmp.getJSONObject("user").getString("screen_name");
+            } catch (JSONException ex) {
+                Logger.getLogger(TweetFileToGraph.class.getName()).log(Level.SEVERE, null, ex);
+            }
+            if(user==null)
+            {
+                continue;
+            }
+            //get the retweeted user
+            try{
+                JSONObject retweet = tmp.getJSONObject("retweeted_status");
+                String retweeted_user = retweet.getJSONObject("user").getString("screen_name");
+                
+                //make an edge or increment the weight if it exists.
+                UserNode toUser = new UserNode(retweeted_user);
+                UserNode fromUser = new UserNode(user);
+                
+                dsg.addVertex(toUser);
+                dsg.addVertex(fromUser);
+                
+                RetweetEdge edge = new RetweetEdge(toUser, fromUser);
+                
+                if(dsg.containsEdge(edge)){
+                    dsg.findEdge(fromUser, toUser).incrementRTCount();
+                }
+                else{
+                    dsg.addEdge(edge, fromUser, toUser, EdgeType.DIRECTED);
+                }
+            }
+            catch(JSONException ex){
+                //the tweet is not a retweet. this is not a problem.
+            }
+            
+            
+        }
+        
+        return dsg;
+    }
+}
diff --git a/src/Chapter5/network/CreateD3Network.java b/src/Chapter5/network/CreateD3Network.java
new file mode 100644
index 0000000..d4c25af
--- /dev/null
+++ b/src/Chapter5/network/CreateD3Network.java
@@ -0,0 +1,716 @@
+/*
+ * To change this template, choose Tools | Templates
+ * and open the template in the editor. 
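+ *
+ * CreateD3Network reads a file of tweets and builds a retweet diffusion
+ * network, emitted as D3-ready JSON (a "nodes" array and a "links" array;
+ * see GetD3Structure below).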
+ */ + +package Chapter5.network; + + +import Chapter5.support.HashTagDS; +import Chapter5.support.NetworkNode; +import Chapter5.support.NodeIDComparator; +import Chapter5.support.NodeSizeComparator; +import Chapter5.support.ToNodeInfo; +import Chapter5.support.Tweet; +import java.io.BufferedReader; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Set; +import java.util.logging.Level; +import java.util.logging.Logger; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import org.json.JSONArray; +import org.json.JSONException; +import org.json.JSONObject; +import utils.TextUtils; + +/** + * + * @author shamanth + */ +public class CreateD3Network +{ + static final String DEF_INFILENAME = "ows.json"; + private String RTPATTERN = "rt @[_a-zA-Z0-9]+"; + private final int DEFAULT_NODE_SIZE = 0; +// private final int NODE_COUNT_LIMIT = 1; + //private final String[] node_color_scheme = new String[]{"#FFFFD9","#EDF8B1","#C7E9B4","#7FCDBB","#41B6C4","#1D91C0","#225EA8","#253494","#081D58"}; + //private final String[] node_color_scheme = new String[]{"#A6BDDB","#74A9CF","#3690C0","#0570B0","#045A8D","#023858"}; + + /** + * Extracts the users who have been retweeted using the RTPATTERN + * @param text + * @return + */ + public ArrayList GetRTUsers(String text) + { + Pattern p = Pattern.compile(RTPATTERN, Pattern.CASE_INSENSITIVE); + Matcher m = p.matcher(text); + ArrayList rtusers = new ArrayList(); + while(m.find()) + { + String nuser = text.substring(m.start(),m.end()); + nuser = nuser.replaceAll("rt @|RT @", ""); +// nuser = nuser.replaceAll("RT @", ""); + rtusers.add(nuser.toLowerCase()); + } + return rtusers; + } + + /** + * Identifies the category to which the tweet belongs. Each category is defined by a group of words/hashtags + * @param tweet + * @param usercategories + * @return + */ + public int GetCategory(String tweet, HashTagDS[] usercategories) + { + HashMap categoryvotes = new HashMap(); + tweet = tweet.toLowerCase(); + int i=0; + for(HashTagDS cat:usercategories) + { + + for(String s :cat.tags) + { + if(tweet.indexOf(s)!=-1) + { + if(categoryvotes.containsKey(i)) + { + categoryvotes.put(i, categoryvotes.get(i)+1); + } + else + { + categoryvotes.put(i, 1); + } + } + } + i++; + } + Set keyset = categoryvotes.keySet(); + int maxvote = 0; + //by default the tweet will be in the first category + int maxcategoryindex = 0; + for(int key:keyset) + { + if(categoryvotes.get(key)>maxvote) + { + maxvote = categoryvotes.get(key); + maxcategoryindex = key; + } + } + return maxcategoryindex; + } + + /** + * Converts the input jsonobject containing category descriptions to an array for processing. 
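+     * The expected shape is, e.g.,
+     * {"Group 1": {"color": "#800000", "hts": ["zuccotti"]}}
+     * as constructed in main() below.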
+ * @param hashtagcoll JSONObject containing the list of hashtags, color, and the topic information + * @return An array of hashtags + */ + public HashTagDS[] ConvertJSONArrayToArray(JSONObject hashtagcoll) + { + HashTagDS[] hashtags = new HashTagDS[hashtagcoll.length()]; + int j=0; + try{ + if(hashtagcoll!=null) + { + Iterator keyit = hashtagcoll.keys(); + while(keyit.hasNext()) + { + HashTagDS ht = new HashTagDS(); + JSONObject tags = (JSONObject) hashtagcoll.get((String)keyit.next()); + ht.groupname = keyit.toString(); + ht.color = tags.getString("color"); + JSONArray tagjson = tags.getJSONArray("hts"); + ht.tags = new String[tagjson.length()]; + for(int i=0;i catcount = new HashMap(); + //if the node has no tolinks then look at the node that it retweeted to decide the color of the node + for(String tweet:tnfs.data) + { + int id = this.GetCategory(tweet, hashtagarray); + if(catcount.containsKey(id)) + { + catcount.put(id, catcount.get(id)+1); + } + else + catcount.put(id, 1); + } + Set keys = catcount.keySet(); + int maxcatID = -1; + int maxcount = 0; + for(int k:keys) + { + if(maxcatID==-1) + { + maxcatID = k; + maxcount = catcount.get(k); + } + else + { + if(maxcount userconnections = new HashMap(); +// HashMap tweet_class_codes = new HashMap(); +// int tweet_class_counter = 1; + HashTagDS[] hashtagarray = ConvertJSONArrayToArray(hashtags); + BufferedReader br = null; + try{ + br = new BufferedReader(new InputStreamReader(new FileInputStream(inFilename),"UTF-8")); + String temp = ""; + while((temp = br.readLine())!=null) + { + JSONObject tweetobj; + try { + tweetobj = new JSONObject(temp); + } catch (JSONException ex) { + ex.printStackTrace(); + continue; + } + //Extract the tweet first + Tweet t = new Tweet(); + String text=""; + try { + text = TextUtils.GetCleanText(tweetobj.getString("text")).toLowerCase(); + } catch (JSONException ex) { + ex.printStackTrace(); + continue; + } + //Check that the tweet matches at least one of the topics + boolean groupmatch = false; + for(HashTagDS ht:hashtagarray) + { + String[] tags = ht.tags; + for(String tg:tags) + { + if(text.contains(tg)) + { + groupmatch = true; + break; + } + } + if(groupmatch) + { + break; + } + } + if(!groupmatch) + { + continue; + } + // + ArrayList fromusers = new ArrayList(); + if(!tweetobj.isNull("retweeted_status")) + { + JSONObject rtstatus; + try { + rtstatus = tweetobj.getJSONObject("retweeted_status"); + if(rtstatus.isNull("user")) + { + JSONObject rtuserobj = rtstatus.getJSONObject("user"); + try{ + fromusers.add(rtuserobj.get("screen_name").toString()); + }catch(JSONException ex) + { + ex.printStackTrace(); + } + } + } catch (JSONException ex) { + Logger.getLogger(CreateD3Network.class.getName()).log(Level.SEVERE, null, ex); + } + } + else + { + //use the tweet text to retrieve the pattern "RT @username:" + fromusers = GetRTUsers(text); + } + if(fromusers.isEmpty()) + { + continue; + } + + //identify the class values to be applied to all the nodes and + //edges. 
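+            //(this class-code bookkeeping is left commented out in this
+            //version of the example)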
+// String prunedtext = TextUtils.RemoveTwitterElements(text); +// Integer class_code = tweet_class_codes.get(prunedtext); +// if(class_code==null) +// { +// class_code = tweet_class_counter; +// tweet_class_codes.put(prunedtext, class_code); //set the unique id for this tweet +// tweet_class_counter++; +// } + t.text = TextUtils.RemoveRTElements(text); + if(!tweetobj.isNull("user")) + { + JSONObject userobj; + try { + userobj = tweetobj.getJSONObject("user"); + t.user = userobj.getString("screen_name").toLowerCase(); + } catch (JSONException ex) { + Logger.getLogger(CreateD3Network.class.getName()).log(Level.SEVERE, null, ex); + } + } +// try { +// t.pubdate = String.valueOf(tweetobj.get("timestamp")); +// } catch (JSONException ex) { +// Logger.getLogger(CreateD3Network.class.getName()).log(Level.SEVERE, null, ex); +// } + t.catColor = hashtagarray[t.catID].color; + //update the size of the from fromuser + int cur_level = 0; + for(int i=fromusers.size()-1;i>=0;i--) + { + String touser = ""; + if(i==0) + {//if this is the last user in the retweet sequence then use the user of the tweet as the next link + touser = t.user; + } + else + { //if there are still fromuser in the retweet chain then use them as the next link + touser = fromusers.get(i-1); + } + //don't add any selflinks + if(fromusers.get(i).equals(touser)) + { + continue; + } + NetworkNode fromuser = null; + if(userconnections.containsKey(fromusers.get(i))) + { + //from node already exists simply add this new connection to it + fromuser = userconnections.get(fromusers.get(i)); + } + else + { + //the from user was not found. add the node + fromuser = new NetworkNode(); + // fromuser.id = nodeid++; + fromuser.username = fromusers.get(i); + fromuser.tonodes = new ArrayList(); + fromuser.class_codes = new ArrayList(); + fromuser.size = DEFAULT_NODE_SIZE; + fromuser.level = cur_level; + fromuser.data = new ArrayList(); + fromuser.data.add(t.text); + //fromuser.category = ; + } +// if(!fromuser.class_codes.contains(class_code)) +// { +// //add the marker to from node if it does not have it already +// fromuser.class_codes.add(class_code); +// } + //if to node is not in the list then create it + NetworkNode tonode = null; + if(!userconnections.containsKey(touser)) + { + tonode = new NetworkNode(); + // System.out.println(touser+" "+nodeid); + // tonode.id= nodeid++; + tonode.username = touser; + tonode.tonodes= new ArrayList(); + tonode.class_codes = new ArrayList(); + tonode.catID = t.catID; + tonode.catColor = t.catColor; + tonode.size = DEFAULT_NODE_SIZE; + tonode.data= new ArrayList(); + tonode.data.add(t.text); + tonode.level = cur_level+1; + //add the classcode to the node if it doesn't already exist +// if(!tonode.class_codes.contains(class_code)) +// { +// tonode.class_codes.add(class_code); +// } + //add the touser info + userconnections.put(touser, tonode); + } + else + { + tonode = userconnections.get(touser); + tonode.data.add(t.text); + if(tonode.level keys = userconnections.keySet(); + ArrayList returnnodes = new ArrayList(); + //its +1 because nodes with size 0 are not going to be used to calculate the class + int min = DEFAULT_NODE_SIZE+1; + int max = DEFAULT_NODE_SIZE+1; + for(String k:keys) + { + NetworkNode n = userconnections.get(k); + int maxcat = GetMajorityTopicColor(n,hashtagarray); + n.catID = maxcat; + n.catColor = hashtagarray[maxcat].color; + userconnections.put(k, n); + // +// if(n.size==0) +// {//mark the node as a zero node +// n.class_codes.add(-1); +// } +// else +// { + if(n.size>max) + { + max = 
n.size; + } + if(n.size nodes = ComputeGroupsSqrt(returnnodes, max, min, numNodeClasses); + Collections.sort(nodes,Collections.reverseOrder(new NodeSizeComparator())); + //select how many nodes to show. + int nodes_to_visit = 0; + if(nodes.size()>=num_nodes) + { + nodes_to_visit = num_nodes; + } + else + { + nodes_to_visit = nodes.size(); + } + + HashMap prunednodes = new HashMap(); + HashMap nodeidlist = new HashMap(); + int nodeid = 0; //node nodeid counter + for(int k=0;k rtnodes = GetNextHopConnections(userconnections,nd,new HashMap()); + Set names = rtnodes.keySet(); + for(String n:names) + { + if(!prunednodes.containsKey(n)) + { + NetworkNode newnode = rtnodes.get(n); + if(newnode.size>0) + { + prunednodes.put(n, newnode); + nodeidlist.put(n, nodeid++); + } + } + } + } + + /** We now have all the nodes of the network. compute their ids sequentially + * and assign them to the respective nodes. Simultaneously compact the nodes + * of the network to remove all nodes which have not been retweeted and are + * of size 0 + */ + + Set allnodes = prunednodes.keySet(); +// System.out.println(prunednodes.size()); + ArrayList finalnodes = new ArrayList(); +// HashMap> conninfo = new HashMap>(); + for(String n:allnodes) + { + NetworkNode nd = prunednodes.get(n); + nd.id = nodeidlist.get(nd.username); + ArrayList connids = new ArrayList(); +// ArrayList compact_To_nodes = new ArrayList(); + int counter = 0; + for(ToNodeInfo tnf: nd.tonodes) + { + //user has never been retweeted. the chain terminates here, so remove it + if(nodeidlist.containsKey(tnf.tousername)) + { + tnf.tonodeid = nodeidlist.get(tnf.tousername); + connids.add(tnf.tonodeid); + nd.tonodes.set(counter, tnf); + counter++; + } + } + finalnodes.add(nd); + //store the connections to compute the clusterids later +// if(!conninfo.containsKey(nd.id)) +// { +// conninfo.put(nd.id, connids); +// } + } + //generate the clusterids +// ArrayList[] clusterids = (ArrayList[])new ArrayList[allnodes.size()]; +// Set idkeys = conninfo.keySet(); +// for(int id:idkeys) +// { +// for(int x:conninfo.get(id)) +// { +// if(clusterids[x]==null) +// { +// ArrayList toclusterid = new ArrayList(); +// toclusterid.add(id); +// clusterids[x] = toclusterid; +// } +// else +// { +// ArrayList toclusterid = clusterids[x]; +// if(!toclusterid.contains(id)) +// { +// toclusterid.add(id); +// clusterids[x] = toclusterid; +// } +// } +// } +// } + //now create the final node list with the clusterids +// for(String n:allnodes) +// { +// NetworkNode nd = prunednodes.get(n); +// ArrayList cids = clusterids[nd.id]; +// if(cids!=null) +// { +// int size = cids.size(); +// nd.clusterID = new int[size+1]; +// int counter=0; +// nd.clusterID[counter++] = nd.id; +// for(int c:cids) +// { +// nd.clusterID[counter++] = c; +// } +// } + //System.out.println(nd.class_codes.toString()); +// finalnodes.add(nd); +// } + Collections.sort(finalnodes,new NodeIDComparator()); + System.out.println(finalnodes.size()); + for(NetworkNode node:finalnodes) + { + System.out.println(node.id+" "+node.username+" "+node.level+" "+node.size+" "+node.catColor+node.data.get(0)); + } + return GetD3Structure(finalnodes); + } + + /** + * Creates a D3 representation of the nodes, consisting of two JSONArray a set of nodes and a set of links between the nodes + * @param finalnodes + * @return + */ + public JSONObject GetD3Structure(ArrayList finalnodes) + { + JSONObject alltweets = new JSONObject(); + try { + JSONArray nodes = new JSONArray(); + JSONArray links = new JSONArray(); + for (NetworkNode node : 
finalnodes) + { + try { + //create adjacencies + JSONArray nodedata = new JSONArray(); + for (ToNodeInfo tnf : node.tonodes) { + JSONObject jsadj = new JSONObject(); + jsadj.put("source", node.id); + jsadj.put("target", tnf.tonodeid); + //weight of the edge + jsadj.put("value", 1); + //class code is a unique id corresponding to the text + jsadj.put("data", tnf.class_code); + links.put(jsadj); + //create a data object for the node + JSONObject jsdata = new JSONObject(); + jsdata.put("tonodeid", tnf.tonodeid); + jsdata.put("nodefrom", node.username); + jsdata.put("nodeto", tnf.tousername); + jsdata.put("tweet", tnf.text); +// jsdata.put("pubtime", tnf.date); + //class code for tweet to be used to filter +// jsdata.put("classcode", tnf.class_code); + nodedata.put(jsdata); + } + //add node + JSONObject nd = new JSONObject(); + nd.put("name", node.username); + nd.put("group", node.group); + nd.put("id", node.id); + nd.put("size", node.size); + nd.put("catColor", node.catColor); + nd.put("catID", node.catID); + nd.put("data", nodedata); + nd.put("level", node.level); + //clusterids for the node +// JSONArray cids = new JSONArray(); +// if (node.clusterID != null) { +// for (int code : node.clusterID) { +// cids.put(code); +// } +// } else { +// cids.put(node.id); +// } +// nd.put("clusterids", cids); + //classcodes for the node +// JSONArray codes = new JSONArray(); +// for (int c : node.class_codes) { +// codes.put(c); +// } +// nd.put("classcodes", codes); + nodes.put(nd); + } catch (JSONException ex) { + ex.printStackTrace(); + } + } + alltweets.put("nodes", nodes); + alltweets.put("links", links); + } catch (JSONException ex) { + Logger.getLogger(CreateD3Network.class.getName()).log(Level.SEVERE, null, ex); + } + return alltweets; + } + + /** + * Recursively traverses the list of nodes to identify all nodes reachable from a starting node. + * @param userconnections A map containing the usernames as keys and the node information as value + * @param cur_node Node currently being processed. + * @param newnodes A list of nodes which can be reached from the current node + * @return A map of the usernames and the node information for all nodes reachable + */ + public HashMap GetNextHopConnections(HashMap userconnections,NetworkNode cur_node,HashMap newnodes) + { + cur_node.level = cur_node.level+1; + newnodes.put(cur_node.username,cur_node); + for(int i=0;i rtnodes = GetNextHopConnections(userconnections, userconnections.get(tnf.tousername),newnodes); + newnodes = rtnodes; + } + return newnodes; + } + + /** + * Divides a list of nodes into groups using the square root binning + * technique. If a node has size x and there are y groups in total. Then the + * group of the node is computed as ceil((sqrt(x)/sqrt(max))*y), where max is + * the size of the largest node. 
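+     * e.g. with max = 100 and y = 5 classes, a node of size 25 falls in
+     * group ceil((sqrt(25)/sqrt(100))*5) = ceil(2.5) = 3, which the code
+     * stores zero-based as group 2.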
+     * @param nodes A list of nodes
+     * @param max The maximum size of a node
+     * @param min The minimum size of a node
+     * @param noofclasses Number of classes into which the nodes must be classified
+     * @return A list of nodes along with their class
+     */
+    public ArrayList<NetworkNode> ComputeGroupsSqrt(ArrayList<NetworkNode> nodes, int max, int min, int noofclasses)
+    {
+        ArrayList<NetworkNode> finalnodes = new ArrayList<NetworkNode>();
+        for(int i=0;i<nodes.size();i++)
+        {
+            NetworkNode node = nodes.get(i);
+            int color_index = 0;
+            if(node.size>0)
+            {
+                color_index = (int) Math.ceil(((double)Math.sqrt(node.size)/Math.sqrt(max))*noofclasses)-1;
+//                node.size = color_index*6;
+            }
+            node.group = color_index;
+            finalnodes.add(node);
+        }
+        return finalnodes;
+    }
+
+
+    //DEBUG use only
+    public static void main(String[] args)
+    {
+        try {
+            CreateD3Network cdn = new CreateD3Network();
+            JSONObject jobj = new JSONObject();
+            JSONObject obj = new JSONObject();
+            obj.put("color", "#800000");
+            JSONArray ja = new JSONArray();
+            ja.put("zuccotti");
+            obj.put("hts", ja);
+            jobj.put("Group 1", obj);
+            obj = new JSONObject();
+            obj.put("color", "#0FFF00");
+            ja = new JSONArray();
+            ja.put("#nypd");
+            obj.put("hts", ja);
+            jobj.put("Group 2", obj);
+            String filename = "D:\\Twitter Data Analytics\\Data\\testows.json";
+            JSONObject nodes = cdn.ConvertTweetsToDiffusionPath(filename,7, jobj,5);
+        } catch (JSONException ex) {
+            ex.printStackTrace();
+        }
+    }
+}
diff --git a/src/Chapter5/network/ExtractUserTagNetwork.java b/src/Chapter5/network/ExtractUserTagNetwork.java
new file mode 100644
index 0000000..43ae680
--- /dev/null
+++ b/src/Chapter5/network/ExtractUserTagNetwork.java
@@ -0,0 +1,173 @@
+/* TweetTracker. Copyright (c) Arizona Board of Regents on behalf of Arizona State University
+ * @author shamanth
+ */
+package Chapter5.network;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.util.HashMap;
+import java.util.Set;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import org.json.JSONArray;
+import org.json.JSONException;
+import org.json.JSONObject;
+
+public class ExtractUserTagNetwork
+{
+
+    static final String DEF_INFILENAME = "ows.json";
+
+    /**
+     * Extracts a map of all the hashtags a user has used in their tweets, resulting in a bipartite network. The frequency of each tag is also returned in the form of a map.
+     * @param inFilename File containing a list of tweets as JSON objects
+     * @return A map containing the users as keys and, as values, a map of the hashtags they use along with each tag's frequency.
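+     * For example, a user who tweeted #ows three times and #nypd once would map to
+     * {"some_user": {"#ows": 3, "#nypd": 1}} (the username and counts here are
+     * hypothetical, for illustration only).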
+     */
+    public HashMap<String, HashMap<String, Integer>> ExtractUserHashtagNetwork(String inFilename)
+    {
+        HashMap<String, HashMap<String, Integer>> usertagmap = new HashMap<String, HashMap<String, Integer>>();
+        BufferedReader br = null;
+        try{
+            br = new BufferedReader(new InputStreamReader(new FileInputStream(inFilename),"UTF-8"));
+            String temp = "";
+            while((temp = br.readLine())!=null)
+            {
+                try{
+                    JSONObject tweetobj = new JSONObject(temp);
+                    String text;
+                    String username;
+                    HashMap<String, Integer> tags = new HashMap<String, Integer>();
+                    if(!tweetobj.isNull("entities"))
+                    {
+                        JSONObject entities = tweetobj.getJSONObject("entities");
+                        JSONArray hashtags;
+                        try {
+                            hashtags = entities.getJSONArray("hashtags");
+                            for(int i=0;i<hashtags.length();i++)
+                            {
+                                String tag = hashtags.getJSONObject(i).getString("text").toLowerCase();
+                                if(tags.containsKey(tag))
+                                {
+                                    tags.put(tag, tags.get(tag)+1);
+                                }
+                                else
+                                {
+                                    tags.put(tag, 1);
+                                }
+                            }
+                        } catch (JSONException ex) {
+                            //tweet has no hashtag entities
+                        }
+                    }
+                    else
+                    if(!tweetobj.isNull("text"))
+                    {
+                        //no entities available; fall back to extracting hashtags from the text
+                        text = tweetobj.getString("text").toLowerCase();
+                        tags = ExtractHashTags(text);
+                    }
+                    if(tweetobj.isNull("user"))
+                    {
+                        continue;
+                    }
+                    username = tweetobj.getJSONObject("user").getString("screen_name");
+                    if(usertagmap.containsKey(username))
+                    {
+                        HashMap<String, Integer> usertags = usertagmap.get(username);
+                        Set<String> keys = tags.keySet();
+                        for(String k:keys)
+                        {
+                            if(usertags.containsKey(k))
+                            {
+                                usertags.put(k, usertags.get(k)+tags.get(k));
+                            }
+                            else
+                            {
+                                usertags.put(k, tags.get(k));
+                            }
+                        }
+                        usertagmap.put(username, usertags);
+                    }
+                    else
+                    {
+                        usertagmap.put(username, tags);
+                    }
+                }catch(JSONException ex)
+                {
+                    ex.printStackTrace();
+                }
+            }
+        }catch(IOException ex)
+        {
+            ex.printStackTrace();
+        }finally{
+            try {
+                br.close();
+            } catch (IOException ex) {
+                Logger.getLogger(ExtractUserTagNetwork.class.getName()).log(Level.SEVERE, null, ex);
+            }
+        }
+        return usertagmap;
+    }
+
+    /**
+     * Extracts all the hashtags mentioned in a tweet and creates a map with the frequency of their occurrence.
+     * @param text
+     * @return A map containing the hashtags as keys and their frequency as value
+     */
+    public HashMap<String, Integer> ExtractHashTags(String text)
+    {
+        Pattern p = Pattern.compile("#[a-zA-Z0-9]+");
+        Matcher m = p.matcher(text);
+        HashMap<String, Integer> tags = new HashMap<String, Integer>();
+        while(m.find())
+        {
+            String tag = text.substring(m.start(),m.end()).toLowerCase();
+            if(!tags.containsKey(tag))
+            {
+                tags.put(tag,1);
+            }
+            else
+            {
+                tags.put(tag, tags.get(tag)+1);
+            }
+        }
+        return tags;
+    }
+
+    public static void main(String[] args)
+    {
+        ExtractUserTagNetwork eutn = new ExtractUserTagNetwork();
+
+        String infilename = DEF_INFILENAME;
+        if(args!=null)
+        {
+            if(args.length>=1&&!args[0].isEmpty())
+            {
+                File fl = new File(args[0]);
+                if(fl.exists())
+                {
+                    infilename = args[0];
+                }
+            }
+        }
+        HashMap<String, HashMap<String, Integer>> usertagmap = eutn.ExtractUserHashtagNetwork(infilename);
+        Set<String> keys = usertagmap.keySet();
+        for(String key:keys)
+        {
+            System.out.println(key);
+            HashMap<String, Integer> tags = usertagmap.get(key);
+            Set<String> tagkeys = tags.keySet();
+            for(String tag:tagkeys)
+            {
+                System.out.println(tag+","+tags.get(tag));
+            }
+        }
+    }
+}
diff --git a/src/Chapter5/support/DateInfo.java b/src/Chapter5/support/DateInfo.java
new file mode 100644
index 0000000..9a32d4c
--- /dev/null
+++ b/src/Chapter5/support/DateInfo.java
@@ -0,0 +1,30 @@
+/* TweetTracker. Copyright (c) Arizona Board of Regents on behalf of Arizona State University
+ * @author shamanth
+ */
+package Chapter5.support;
+
+import java.util.Date;
+import java.util.HashMap;
+
+public class DateInfo implements Comparable
+{
+    public Date d;
+    public HashMap<String, Integer> catcounts = new HashMap<String, Integer>();
+
+    public int compareTo(Object o) {
+        DateInfo temp = (DateInfo) o;
+        if(temp.d.after(this.d))
+        {
+            return 1;
+        }
+        else
+        if(temp.d.before(this.d))
+        {
+            return -1;
+        }
+        else
+        {
+            return 0;
+        }
+    }
+}
diff --git a/src/Chapter5/support/HashTagDS.java b/src/Chapter5/support/HashTagDS.java
new file mode 100644
index 0000000..b338b6d
--- /dev/null
+++ b/src/Chapter5/support/HashTagDS.java
@@ -0,0 +1,18 @@
+/*
+ * To change this template, choose Tools | Templates
+ * and open the template in the editor.
+ */
+
+package Chapter5.support;
+
+/**
+ *
+ * @author shamanth
+ */
+public class HashTagDS
+{
+    public String groupname;
+    public String[] tags;
+    public String color;
+
+}
diff --git a/src/Chapter5/support/NetworkNode.java b/src/Chapter5/support/NetworkNode.java
new file mode 100644
index 0000000..4f662e8
--- /dev/null
+++ b/src/Chapter5/support/NetworkNode.java
@@ -0,0 +1,49 @@
+package Chapter5.support;
+
+
+import java.util.ArrayList;
+
+/*
+ * To change this template, choose Tools | Templates
+ * and open the template in the editor.
+ */
+
+/**
+ *
+ * @author shamanth
+ */
+public class NetworkNode
+{
+    public int id;
+    public String username;
+    public int size;
+    public String catColor;
+    public int group;
+//    public int[] clusterID;
+    public int catID;
+//    public double lat;
+//    public double lng;
+    public ArrayList<String> data;
+    public int level;
+    public ArrayList<Integer> class_codes;
+    public ArrayList<ToNodeInfo> tonodes;
+
+    public NetworkNode Copy()
+    {
+        NetworkNode tempnode = new NetworkNode();
+        tempnode.catColor = this.catColor;
+        tempnode.id = this.id;
+        tempnode.username= this.username;
+        tempnode.size = this.size;
+        tempnode.group = this.group;
+//        tempnode.clusterID = this.clusterID;
+        tempnode.catID = this.catID;
+//        tempnode.lat = this.lat;
+//        tempnode.lng = this.lng;
+        tempnode.data = this.data;
+//        tempnode.level = this.level;
+        tempnode.class_codes = this.class_codes;
+        tempnode.tonodes = this.tonodes;
+        return tempnode;
+    }
+}
diff --git a/src/Chapter5/support/NodeIDComparator.java b/src/Chapter5/support/NodeIDComparator.java
new file mode 100644
index 0000000..0b41ae7
--- /dev/null
+++ b/src/Chapter5/support/NodeIDComparator.java
@@ -0,0 +1,32 @@
+/*
+ * To change this template, choose Tools | Templates
+ * and open the template in the editor.
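+ *
+ * Compares NetworkNode objects by id in ascending order; CreateD3Network sorts
+ * its final node list with this comparator before building the D3 structure.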
+ */
+
+package Chapter5.support;
+
+import java.util.Comparator;
+
+/**
+ *
+ * @author shamanth
+ */
+public class NodeIDComparator implements Comparator
+{
+    public int compare(Object o1, Object o2) {
+        int id1 = ((NetworkNode) o1).id;
+        int id2 = ((NetworkNode) o2).id;
+        if(id1>id2)
+        {
+            return 1;
+        }
+        else
+        if(id1<id2)
+        {
+            return -1;
+        }
+        else
+        {
+            return 0;
+        }
+    }
+}
diff --git a/src/Chapter5/support/NodeSizeComparator.java b/src/Chapter5/support/NodeSizeComparator.java
new file mode 100644
--- /dev/null
+++ b/src/Chapter5/support/NodeSizeComparator.java
@@ -0,0 +1,29 @@
+/*
+ * To change this template, choose Tools | Templates
+ * and open the template in the editor.
+ */
+
+package Chapter5.support;
+
+import java.util.Comparator;
+
+/**
+ *
+ * @author shamanth
+ */
+public class NodeSizeComparator implements Comparator
+{
+    public int compare(Object o1, Object o2) {
+        int size1 = ((NetworkNode) o1).size;
+        int size2 = ((NetworkNode) o2).size;
+        if(size1>size2)
+        {
+            return 1;
+        }
+        if(size1<size2)
+        {
+            return -1;
+        }
+        return 0;
+    }
+}
diff --git a/src/Chapter5/support/ToNodeInfo.java b/src/Chapter5/support/ToNodeInfo.java
new file mode 100644
--- /dev/null
+++ b/src/Chapter5/support/ToNodeInfo.java
@@ -0,0 +1,23 @@
+/*
+ * To change this template, choose Tools | Templates
+ * and open the template in the editor.
+ */
+
+package Chapter5.support;
+
+/**
+ *
+ * @author shamanth
+ */
+public class ToNodeInfo
+{
+    public String tousername;
+    public int tonodeid;
+    public int class_code;
+    public String text;
+    public String date;
+}
diff --git a/src/Chapter5/text/EventSummaryExtractor.java b/src/Chapter5/text/EventSummaryExtractor.java
new file mode 100644
--- /dev/null
+++ b/src/Chapter5/text/EventSummaryExtractor.java
@@ -0,0 +1,269 @@
+/* TweetTracker. Copyright (c) Arizona Board of Regents on behalf of Arizona State University
+ * @author shamanth
+ */
+package Chapter5.text;
+
+import Chapter5.support.DateInfo;
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.Set;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import org.json.JSONArray;
+import org.json.JSONException;
+import org.json.JSONObject;
+
+public class EventSummaryExtractor
+{
+    static final String DEF_INFILENAME = "ows.json";
+
+    HashMap<String, ArrayList<String>> CATEGORIES = new HashMap<String, ArrayList<String>>();
+    SimpleDateFormat twittersdm = new SimpleDateFormat("EEE MMM dd HH:mm:ss Z yyyy");
+    SimpleDateFormat dayhoursdm = new SimpleDateFormat("yyyy-MM-dd:HH");
+//    SimpleDateFormat daysdm = new SimpleDateFormat("MM/dd/yyyy");
+    SimpleDateFormat hoursdm = new SimpleDateFormat("HH");
+
+    /**
+     * Populates CATEGORIES with the keyword lists that define each category.
+     */
+    public void InitializeCategories()
+    {
+        ArrayList<String> people = new ArrayList<String>();
+        people.add("protesters");
+        people.add("people");
+        CATEGORIES.put("People",people);
+        ArrayList<String> police = new ArrayList<String>();
+        police.add("police");
+        police.add("cops");
+        police.add("nypd");
+        police.add("raid");
+        CATEGORIES.put("Police",police);
+        ArrayList<String> media = new ArrayList<String>();
+        media.add("press");
+        media.add("news");
+        media.add("media");
+        CATEGORIES.put("Media",media);
+        ArrayList<String> city = new ArrayList<String>();
+        city.add("nyc");
+        city.add("zucotti");
+        city.add("park");
+        CATEGORIES.put("Location",city);
+        ArrayList<String> judiciary = new ArrayList<String>();
+        judiciary.add("judge");
+        judiciary.add("eviction");
+        judiciary.add("order");
+        judiciary.add("court");
+        CATEGORIES.put("Judiciary", judiciary);
+    }
+
+    /**
+     * Counts hourly occurrences of each category's keywords in the tweets and returns them as a JSON object of plot coordinates and axis labels.
+     * @param filename
+     * @return
+     */
+    public JSONObject ExtractCategoryTrends(String filename)
+    {
+        JSONObject result = new JSONObject();
+        try {
+            BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(filename), "UTF-8"));
+            String temp = "";
+            Set<String> catkeys = CATEGORIES.keySet();
+            HashMap<String, HashMap<String, Integer>> datecount = new HashMap<String, HashMap<String, Integer>>();
+            while((temp = br.readLine())!=null)
+            {
+                Date d = new Date();
+                try {
+                    JSONObject jobj = new JSONObject(temp);
+                    //Published time
+                    if(!jobj.isNull("created_at"))
+                    {
+                        String time = "";
+                        try {
+                            time = jobj.getString("created_at");
+                        } catch (JSONException ex) {
+                            Logger.getLogger(EventSummaryExtractor.class.getName()).log(Level.SEVERE, null, ex);
+                        }
+                        if(time.isEmpty())
+                        {
+                            continue;
+                        }
+                        else
+                        {
+                            try {
+                                d = twittersdm.parse(time);
+                            } catch (ParseException ex) {
+                                continue;
+                            }
+                        }
+                    }
+                    else
+                    if(!jobj.isNull("timestamp"))
+                    {
+                        long time = new Date().getTime();
+                        try{
+                            time = jobj.getLong("timestamp");
+                        }catch(JSONException ex)
+                        {
+                            ex.printStackTrace();
+                        }
+                        d = new Date();
+                        d.setTime(time);
+                    }
+                    String datestr = dayhoursdm.format(d);
+                    String text = jobj.getString("text").toLowerCase();
+//                    System.out.println(text);
+                    for(String key:catkeys)
+                    {
+                        ArrayList<String> words = CATEGORIES.get(key);
+                        for(String word:words)
+                        {
+                            if(text.contains(word))
+                            {
+                                HashMap<String, Integer> categorycount = new HashMap<String, Integer>();
+                                if(datecount.containsKey(datestr))
+                                {
+                                    categorycount = datecount.get(datestr);
+                                }
+                                if(categorycount.containsKey(key))
+                                {
+                                    categorycount.put(key, categorycount.get(key)+1);
+                                }
+                                else
+                                {
+                                    categorycount.put(key, 1);
+                                }
+                                //update the categorycount for the specific date
+                                datecount.put(datestr, categorycount);
+                                break;
+                            }
+                        }
+                    }
+                } catch (JSONException ex) {
+                    Logger.getLogger(EventSummaryExtractor.class.getName()).log(Level.SEVERE, null, ex);
+                }
+            }
+            //sort the dates
+            Set<String> datekeys = datecount.keySet();
+            ArrayList<DateInfo> dinfos = new ArrayList<DateInfo>();
+            for(String date:datekeys)
+            {
+                Date d = null;
+                try {
+                    d = dayhoursdm.parse(date);
+                } catch (ParseException ex) {
+                    Logger.getLogger(EventSummaryExtractor.class.getName()).log(Level.SEVERE, null, ex);
+                }
+                if(d!=null)
+                {
+                    DateInfo info = new DateInfo();
+                    info.d = d;
+                    info.catcounts = datecount.get(date);
+                    dinfos.add(info);
+                }
+            }
+            Collections.sort(dinfos, Collections.reverseOrder());
+            try {
+                result.put("axisxstep", dinfos.size()-1);
+            } catch (JSONException ex) {
+                Logger.getLogger(EventSummaryExtractor.class.getName()).log(Level.SEVERE, null, ex);
+            }
+            try {
+                result.put("axisystep", CATEGORIES.size()-1);
+            } catch (JSONException ex) {
+                Logger.getLogger(EventSummaryExtractor.class.getName()).log(Level.SEVERE, null, ex);
+            }
+            JSONArray xcoordinates = new JSONArray();
+            JSONArray ycoordinates = new JSONArray();
+            //now add the data and the axis labels
+            JSONArray axisxlabels = new JSONArray();
+            JSONArray axisylabels = new JSONArray();
+            JSONArray data = new JSONArray();
+            for(String key:catkeys)
+            {
+                axisylabels.put(key);
+            }
+            //counters to mark the indices of the values added to the data field. j is the x coordinate (date) and i is the y coordinate (category)
+            int i=0,j=0;
+
+            for(DateInfo date:dinfos)
+            {
+                String strdate = hoursdm.format(date.d);
+                axisxlabels.put(strdate);
+                HashMap<String, Integer> catcounts = date.catcounts;
+                for(String key:catkeys)
+                {
+                    xcoordinates.put(j);
+                    ycoordinates.put(i++);
+                    if(catcounts.containsKey(key))
+                    {
+                        data.put(catcounts.get(key));
+                    }
+                    else
+                    {
+                        data.put(0);
+                    }
+                }
+                //reset the y coordinate as we move to the next x item
+                i=0;
+                j++;
+            }
+            try {
+                result.put("xcoordinates", xcoordinates);
+            } catch (JSONException ex) {
+                Logger.getLogger(EventSummaryExtractor.class.getName()).log(Level.SEVERE, null, ex);
+            }
+            try {
+                result.put("ycoordinates", ycoordinates);
+            } catch (JSONException ex) {
+                Logger.getLogger(EventSummaryExtractor.class.getName()).log(Level.SEVERE, null, ex);
+            }
+            try {
+                result.put("axisxlabels", axisxlabels);
+            } catch (JSONException ex) {
+                Logger.getLogger(EventSummaryExtractor.class.getName()).log(Level.SEVERE, null, ex);
+            }
+            try {
+                result.put("axisylabels", axisylabels);
+            } catch (JSONException ex) {
+                Logger.getLogger(EventSummaryExtractor.class.getName()).log(Level.SEVERE, null, ex);
+            }
+            try {
+                result.put("data", data);
+            } catch (JSONException ex) {
+                Logger.getLogger(EventSummaryExtractor.class.getName()).log(Level.SEVERE, null, ex);
+            }
+            br.close();
+        } catch (IOException ex) {
+            Logger.getLogger(EventSummaryExtractor.class.getName()).log(Level.SEVERE, null, ex);
+        }
+        return result;
+    }
+
+    public static void main(String[] args)
+    {
+        EventSummaryExtractor ese = new EventSummaryExtractor();
+        String infilename = ese.DEF_INFILENAME;
+        if(args!=null)
+        {
+            if(args.length>=1&&!args[0].isEmpty())
+            {
+                File fl = new File(args[0]);
+                if(fl.exists())
+                {
+                    infilename = args[0];
+                }
+            }
+        }
+        ese.InitializeCategories();
+        System.out.println(ese.ExtractCategoryTrends(infilename).toString());
+    }
+}
diff --git a/src/Chapter5/text/ExtractTopKeywords.java b/src/Chapter5/text/ExtractTopKeywords.java
new file mode 100644
index 0000000..8ab412a
--- /dev/null
+++ b/src/Chapter5/text/ExtractTopKeywords.java
@@ -0,0 +1,151 @@
+/* TweetTracker. Copyright (c) Arizona Board of Regents on behalf of Arizona State University
+ * @author shamanth
+ */
+package Chapter5.text;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Set;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import org.json.JSONArray;
+import org.json.JSONException;
+import org.json.JSONObject;
+import utils.Tags;
+import utils.TextUtils;
+
+public class ExtractTopKeywords
+{
+
+    static final String DEF_INFILENAME = "ows.json";
+    static final int DEF_K = 60;
+
+    /**
+     * Extracts the most frequently occurring keywords from the tweets by processing them sequentially. Stopwords are ignored.
+     * @param inFilename File containing a list of tweets as JSON objects
+     * @param K Count of the top keywords to return
+     * @param ignoreHashtags If true, hashtags are not considered while counting the most frequent keywords
+     * @param ignoreUsernames If true, usernames are not considered while counting the most frequent keywords
+     * @param tu TextUtils object which handles the stopwords
+     * @return a JSONArray containing an array of JSONObjects. Each object contains two elements "text" and "size" referring to the word and its frequency
+     */
+    public JSONArray GetTopKeywords(String inFilename, int K, boolean ignoreHashtags, boolean ignoreUsernames, TextUtils tu)
+    {
+        HashMap<String, Integer> words = new HashMap<String, Integer>();
+        BufferedReader br = null;
+        try{
+            br = new BufferedReader(new InputStreamReader(new FileInputStream(inFilename),"UTF-8"));
+            String temp = "";
+            while((temp = br.readLine())!=null)
+            {
+                try{
+                    JSONObject tweetobj = new JSONObject(temp);
+                    if(!tweetobj.isNull("text"))
+                    {
+                        String text = tweetobj.getString("text");
+                        //System.out.println(text);
+                        text = text.toLowerCase().replaceAll("\\s+", " ");
+                        /** Step 1: Tokenize tweets into individual words and count their frequency in the corpus.
+                         * Remove stop words and special characters. Ignore user names and hashtags if the user chooses to.
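+                         * For example, assuming "the" is in the stopword list, the text
+                         * "the #ows protest continues" tokenizes to {"protest": 1, "continues": 1}
+                         * when hashtags are ignored (a hypothetical tweet, for illustration only).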
+                         */
+                        HashMap<String, Integer> tokens = tu.TokenizeText(text,ignoreHashtags,ignoreUsernames);
+                        Set<String> keys = tokens.keySet();
+                        for(String key:keys)
+                        {
+                            if(words.containsKey(key))
+                            {
+                                words.put(key, words.get(key)+tokens.get(key));
+                            }
+                            else
+                            {
+                                words.put(key, tokens.get(key));
+                            }
+                        }
+                    }
+                }catch(JSONException ex)
+                {
+                    ex.printStackTrace();
+                }
+            }
+        }catch(IOException ex)
+        {
+            ex.printStackTrace();
+        }finally{
+            try {
+                br.close();
+            } catch (IOException ex) {
+                Logger.getLogger(ExtractTopKeywords.class.getName()).log(Level.SEVERE, null, ex);
+            }
+        }
+        Set<String> keys = words.keySet();
+        ArrayList<Tags> tags = new ArrayList<Tags>();
+        for(String key:keys)
+        {
+            Tags tag = new Tags();
+            tag.setKey(key);
+            tag.setValue(words.get(key));
+            tags.add(tag);
+        }
+        // Step 2: Sort the words in descending order of frequency
+        Collections.sort(tags, Collections.reverseOrder());
+        JSONArray cloudwords = new JSONArray();
+        int numwords = K;
+        if(tags.size()<numwords)
+        {
+            numwords = tags.size();
+        }
+        // Step 3: Return the top K words along with their frequencies
+        for(int i=0;i<numwords;i++)
+        {
+            Tags tag = tags.get(i);
+            JSONObject wordobj = new JSONObject();
+            try {
+                wordobj.put("text", tag.getKey());
+                wordobj.put("size", tag.getValue());
+                cloudwords.put(wordobj);
+            } catch (JSONException ex) {
+                ex.printStackTrace();
+            }
+        }
+        return cloudwords;
+    }
+
+    public static void main(String[] args)
+    {
+        ExtractTopKeywords etk = new ExtractTopKeywords();
+        TextUtils tu = new TextUtils();
+        String infilename = DEF_INFILENAME;
+        int K = DEF_K;
+        if(args!=null)
+        {
+            if(args.length>=1&&!args[0].isEmpty())
+            {
+                File fl = new File(args[0]);
+                if(fl.exists())
+                {
+                    infilename = args[0];
+                }
+            }
+            if(args.length>=2&&!args[1].isEmpty())
+            {
+                try{
+                    K = Integer.parseInt(args[1]);
+                }catch(NumberFormatException ex)
+                {
+                    ex.printStackTrace();
+                }
+            }
+        }
+        System.out.println(etk.GetTopKeywords(infilename, K, false,true,tu));
+    }
+
+}
diff --git a/src/Chapter5/trends/ControlChartExample.java b/src/Chapter5/trends/ControlChartExample.java
new file mode 100644
index 0000000..2df814f
--- /dev/null
+++ b/src/Chapter5/trends/ControlChartExample.java
@@ -0,0 +1,144 @@
+/* TweetTracker. Copyright (c) Arizona Board of Regents on behalf of Arizona State University
+ * @author shamanth
+ */
+package Chapter5.trends;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.Set;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import org.json.JSONArray;
+import org.json.JSONException;
+import org.json.JSONObject;
+
+public class ControlChartExample
+{
+    static final String DEF_INFILENAME = "ows.json";
+    static final SimpleDateFormat SDM = new SimpleDateFormat("dd MMM yyyy HH:mm");
+
+    public JSONArray GenerateDataTrend(String inFilename)
+    {
+        BufferedReader br = null;
+        JSONArray result = new JSONArray();
+        HashMap<String, Integer> datecount = new HashMap<String, Integer>();
+        try{
+            br= new BufferedReader(new InputStreamReader(new FileInputStream(inFilename),"UTF-8"));
+            String temp = "";
+            while((temp = br.readLine())!=null)
+            {
+                try {
+                    JSONObject jobj = new JSONObject(temp);
+                    long timestamp = jobj.getLong("timestamp");
+                    Date d = new Date(timestamp);
+                    String strdate = SDM.format(d);
+                    if(datecount.containsKey(strdate))
+                    {
+                        datecount.put(strdate, datecount.get(strdate)+1);
+                    }
+                    else
+                    {
+                        datecount.put(strdate, 1);
+                    }
+                } catch (JSONException ex) {
+                    Logger.getLogger(ControlChartExample.class.getName()).log(Level.SEVERE, null, ex);
+                }
+            }
+            ArrayList<DateInfo> dinfos = new ArrayList<DateInfo>();
+            Set<String> keys = datecount.keySet();
+            for(String key:keys)
+            {
+                DateInfo dinfo = new DateInfo();
+                try {
+                    dinfo.d = SDM.parse(key);
+                } catch (ParseException ex) {
+                    ex.printStackTrace();
+                    continue;
+                }
+                dinfo.count = datecount.get(key);
+                dinfos.add(dinfo);
+            }
+            double mean = this.GetMean(dinfos);
+            double stddev = this.GetStandardDev(dinfos, mean);
+            Collections.sort(dinfos);
+            //Normalize the trend by subtracting the mean and dividing by the standard deviation
+            //to get a distribution with mean 0 and standard deviation 1
+            for(DateInfo dinfo:dinfos)
+            {
+                try{
+                    JSONObject jobj = new JSONObject();
+                    jobj.put("date", SDM.format(dinfo.d));
+                    jobj.put("count", (dinfo.count-mean)/stddev);
+                    jobj.put("mean", 0);
+                    jobj.put("stdev+3", 3);
+                    jobj.put("stdev-3", -3);
+                    result.put(jobj);
+                }catch(JSONException ex)
+                {
+                    ex.printStackTrace();
+                }
+            }
+        }catch(IOException ex)
+        {
+            ex.printStackTrace();
+        }finally{
+            try {
+                br.close();
+            } catch (IOException ex) {
+                Logger.getLogger(ControlChartExample.class.getName()).log(Level.SEVERE, null, ex);
+            }
+        }
+        return result;
+    }
+
+    public double GetStandardDev(ArrayList<DateInfo> dateinfos,double mean)
+    {
+        double intsum = 0;
+        int numperiods = dateinfos.size();
+        for(DateInfo dinfo:dateinfos)
+        {
+            intsum+=Math.pow((dinfo.count - mean),2);
+        }
+//        System.out.println(Math.sqrt((double)intsum/timePeriodCounts.size()));
+        return Math.sqrt((double)intsum/numperiods);
+    }
+
+    public double GetMean(ArrayList<DateInfo> dateinfos)
+    {
+        int numperiods = dateinfos.size();
+        int sum = 0;
+        for(DateInfo dinfo:dateinfos)
+        {
+            sum +=dinfo.count;
+        }
+//        System.out.println((double)sum/numPeriods);
+        return ((double)sum/numperiods);
+    }
+
+    public static void main(String[] args)
+    {
+        ControlChartExample cce = new ControlChartExample();
+        String infilename = DEF_INFILENAME;
+        if(args!=null)
+        {
+            if(args.length>=1&&!args[0].isEmpty())
+            {
+                File fl = new File(args[0]);
+                if(fl.exists())
+                {
+                    infilename = args[0];
+                }
+            }
+        }
+        System.out.println(cce.GenerateDataTrend(infilename));
+    }
+
+}
diff --git a/src/Chapter5/trends/DateInfo.java b/src/Chapter5/trends/DateInfo.java
new file mode 100644
index 0000000..209f4a3
--- /dev/null
+++ b/src/Chapter5/trends/DateInfo.java
@@ -0,0 +1,29 @@
+/* TweetTracker. Copyright (c) Arizona Board of Regents on behalf of Arizona State University
+ * @author shamanth
+ */
+package Chapter5.trends;
+
+import java.util.Date;
+
+public class DateInfo implements Comparable
+{
+    public Date d;
+    public int count;
+
+    public int compareTo(Object o) {
+        DateInfo temp = (DateInfo) o;
+        if(temp.d.after(this.d))
+        {
+            return -1;
+        }
+        else
+        if(temp.d.before(this.d))
+        {
+            return 1;
+        }
+        else
+        {
+            return 0;
+        }
+    }
+}
diff --git a/src/Chapter5/trends/ExtractDatasetTrend.java b/src/Chapter5/trends/ExtractDatasetTrend.java
new file mode 100644
index 0000000..dad7f27
--- /dev/null
+++ b/src/Chapter5/trends/ExtractDatasetTrend.java
@@ -0,0 +1,120 @@
+/* TweetTracker. Copyright (c) Arizona Board of Regents on behalf of Arizona State University
+ * @author shamanth
+ */
+package Chapter5.trends;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.Set;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import org.json.JSONArray;
+import org.json.JSONException;
+import org.json.JSONObject;
+
+public class ExtractDatasetTrend
+{
+    static final String DEF_INFILENAME = "ows.json";
+    // Date pattern used to count the volume of tweets
+    final SimpleDateFormat SDM = new SimpleDateFormat("dd MMM yyyy HH:mm");
+
+    public JSONArray GenerateDataTrend(String inFilename)
+    {
+        BufferedReader br = null;
+        JSONArray result = new JSONArray();
+        HashMap<String, Integer> datecount = new HashMap<String, Integer>();
+        try{
+            br= new BufferedReader(new InputStreamReader(new FileInputStream(inFilename),"UTF-8"));
+            String temp = "";
+            while((temp = br.readLine())!=null)
+            {
+                try {
+                    JSONObject jobj = new JSONObject(temp);
+                    long timestamp = jobj.getLong("timestamp");
+                    Date d = new Date(timestamp);
+                    String strdate = SDM.format(d);
+                    if(datecount.containsKey(strdate))
+                    {
+                        datecount.put(strdate, datecount.get(strdate)+1);
+                    }
+                    else
+                    {
+                        datecount.put(strdate, 1);
+                    }
+                } catch (JSONException ex) {
+                    Logger.getLogger(ExtractDatasetTrend.class.getName()).log(Level.SEVERE, null, ex);
+                }
+            }
+            /** DateInfo consists of a date string and the corresponding count.
+             * It also implements Comparable for sorting by date
+             */
+            ArrayList<DateInfo> dinfos = new ArrayList<DateInfo>();
+            Set<String> keys = datecount.keySet();
+            for(String key:keys)
+            {
+                DateInfo dinfo = new DateInfo();
+                try {
+                    dinfo.d = SDM.parse(key);
+                } catch (ParseException ex) {
+                    ex.printStackTrace();
+                    continue;
+                }
+                dinfo.count = datecount.get(key);
+                dinfos.add(dinfo);
+            }
+            Collections.sort(dinfos);
+            // Format and return the date string and the corresponding count
+            for(DateInfo dinfo:dinfos)
+            {
+                try{
+                    JSONObject jobj = new JSONObject();
+                    jobj.put("date", SDM.format(dinfo.d));
+                    jobj.put("count", dinfo.count);
+                    result.put(jobj);
+                }catch(JSONException ex)
+                {
+                    ex.printStackTrace();
+                }
+            }
+        }catch(IOException ex)
+        {
+            ex.printStackTrace();
+        }finally{
+            try {
+                br.close();
+            } catch (IOException ex) {
+                Logger.getLogger(ExtractDatasetTrend.class.getName()).log(Level.SEVERE, null, ex);
+            }
+        }
+        return result;
+    }
+
+    public static void main(String[] args)
+    {
+        ExtractDatasetTrend edt = new ExtractDatasetTrend();
+
+        String infilename = DEF_INFILENAME;
+        if(args!=null)
+        {
+            if(args.length>=1&&!args[0].isEmpty())
+            {
+                File fl = new File(args[0]);
+                if(fl.exists())
+                {
+                    infilename = args[0];
+                }
+            }
+        }
+        System.out.println(edt.GenerateDataTrend(infilename));
+    }
+
+}
diff --git a/src/Chapter5/trends/SparkLineExample.java b/src/Chapter5/trends/SparkLineExample.java
new file mode 100644
index 0000000..4a0164b
--- /dev/null
+++ b/src/Chapter5/trends/SparkLineExample.java
@@ -0,0 +1,163 @@
+/* TweetTracker. Copyright (c) Arizona Board of Regents on behalf of Arizona State University
+ * @author shamanth
+ */
+package Chapter5.trends;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.Set;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import org.json.JSONArray;
+import org.json.JSONException;
+import org.json.JSONObject;
+
+public class SparkLineExample
+{
+    static final String DEF_INFILENAME = "ows.json";
+    static final SimpleDateFormat SDM = new SimpleDateFormat("dd MMM yyyy HH");
+
+    public JSONObject GenerateDataTrend(String inFilename, ArrayList<String> keywords)
+    {
+        BufferedReader br = null;
+        JSONObject result = new JSONObject();
+        HashMap<String, HashMap<String, Integer>> datecount = new HashMap<String, HashMap<String, Integer>>();
+        try{
+            br= new BufferedReader(new InputStreamReader(new FileInputStream(inFilename),"UTF-8"));
+            String temp = "";
+            while((temp = br.readLine())!=null)
+            {
+                try {
+                    JSONObject jobj = new JSONObject(temp);
+                    String text = jobj.getString("text").toLowerCase();
+                    long timestamp = jobj.getLong("timestamp");
+                    Date d = new Date(timestamp);
+                    String strdate = SDM.format(d);
+                    for(String word:keywords)
+                    {
+                        if(text.contains(word))
+                        {
+                            HashMap<String, Integer> wordcount = new HashMap<String, Integer>();
+                            if(datecount.containsKey(strdate))
+                            {
+                                wordcount = datecount.get(strdate);
+                            }
+                            if(wordcount.containsKey(word))
+                            {
+                                wordcount.put(word, wordcount.get(word)+1);
+                            }
+                            else
+                            {
+                                wordcount.put(word, 1);
+                            }
+                            //update the wordcount for the specific date
+                            datecount.put(strdate, wordcount);
+                        }
+                    }
+                } catch (JSONException ex) {
+                    Logger.getLogger(SparkLineExample.class.getName()).log(Level.SEVERE, null, ex);
+                }
+            }
+            //sort the dates
+            ArrayList<TCDateInfo> dinfos = new ArrayList<TCDateInfo>();
+            Set<String> keys = datecount.keySet();
+            for(String key:keys)
+            {
+                TCDateInfo dinfo = new TCDateInfo();
+                try {
+                    dinfo.d = SDM.parse(key);
+                } catch (ParseException ex) {
+                    ex.printStackTrace();
+                    continue;
+                }
+                dinfo.wordcount = datecount.get(key);
+                dinfos.add(dinfo);
+            }
+            Collections.sort(dinfos);
+            JSONArray[] tseriesvals = new JSONArray[keywords.size()];
+            for(int i=0;i<keywords.size();i++)
+            {
+                tseriesvals[i] = new JSONArray();
+            }
+            for(TCDateInfo date:dinfos)
+            {
+                HashMap<String, Integer> wordcount = date.wordcount;
+                int counter=0;
+                for(String word:keywords)
+                {
+                    if(wordcount.containsKey(word))
+                    {
+                        tseriesvals[counter].put(wordcount.get(word));
+                    }
+                    else
+                    {
+                        tseriesvals[counter].put(0);
+                    }
+                    counter++;
+                }
+            }
+            int counter=0;
+            for(String word:keywords)
+            {
+                try {
+                    result.put(word, tseriesvals[counter]);
+                } catch (JSONException ex) {
+                    Logger.getLogger(SparkLineExample.class.getName()).log(Level.SEVERE, null, ex);
+                }
+                counter++;
+            }
+        }catch(IOException ex)
+        {
+            ex.printStackTrace();
+        }finally{
+            try {
+                br.close();
+            } catch (IOException ex) {
+                Logger.getLogger(SparkLineExample.class.getName()).log(Level.SEVERE, null, ex);
+            }
+        }
+        return result;
+    }
+
+    public static void main(String[] args)
+    {
+        SparkLineExample sle = new SparkLineExample();
+        ArrayList<String> words = new ArrayList<String>();
+        String infilename = DEF_INFILENAME;
+        if(args!=null)
+        {
+            if(args.length>=1&&!args[0].isEmpty())
+            {
+                File fl = new File(args[0]);
+                if(fl.exists())
+                {
+                    infilename = args[0];
+                }
+            }
+            for(int i=1;i<args.length;i++)
+            {
+                if(!args[i].isEmpty())
+                {
+                    words.add(args[i].toLowerCase());
+                }
+            }
+        }
+        System.out.println(sle.GenerateDataTrend(infilename, words));
+    }
+
+}
diff --git a/src/Chapter5/trends/TCDateInfo.java b/src/Chapter5/trends/TCDateInfo.java
new file mode 100644
--- /dev/null
+++ b/src/Chapter5/trends/TCDateInfo.java
@@ -0,0 +1,31 @@
+/* TweetTracker. Copyright (c) Arizona Board of Regents on behalf of Arizona State University
+ * @author shamanth
+ */
+package Chapter5.trends;
+
+import java.util.Date;
+import java.util.HashMap;
+
+public class TCDateInfo implements Comparable
+{
+    public Date d;
+    public HashMap<String, Integer> wordcount = new HashMap<String, Integer>();
+
+    public int compareTo(Object o) {
+        TCDateInfo temp = (TCDateInfo) o;
+        if(temp.d.after(this.d))
+        {
+            return -1;
+        }
+        else
+        if(temp.d.before(this.d))
+        {
+            return 1;
+        }
+        else
+        {
+            return 0;
+        }
+    }
+
+}
diff --git a/src/Chapter5/trends/TrendComparisonExample.java b/src/Chapter5/trends/TrendComparisonExample.java
new file mode 100644
index 0000000..20991cd
--- /dev/null
+++ b/src/Chapter5/trends/TrendComparisonExample.java
@@ -0,0 +1,155 @@
+/* TweetTracker. Copyright (c) Arizona Board of Regents on behalf of Arizona State University
+ * @author shamanth
+ */
+package Chapter5.trends;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.Set;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import org.json.JSONArray;
+import org.json.JSONException;
+import org.json.JSONObject;
+
+public class TrendComparisonExample
+{
+    static final String DEF_INFILENAME = "ows.json";
+    static final SimpleDateFormat SDM = new SimpleDateFormat("dd MMM yyyy HH:mm");
+
+    public JSONArray GenerateDataTrend(String inFilename, ArrayList<String> keywords)
+    {
+        BufferedReader br = null;
+        JSONArray result = new JSONArray();
+        HashMap<String, HashMap<String, Integer>> datecount = new HashMap<String, HashMap<String, Integer>>();
+        try{
+            br= new BufferedReader(new InputStreamReader(new FileInputStream(inFilename),"UTF-8"));
+            String temp = "";
+            while((temp = br.readLine())!=null)
+            {
+                try {
+                    JSONObject jobj = new JSONObject(temp);
+                    String text = jobj.getString("text").toLowerCase();
+                    long timestamp = jobj.getLong("timestamp");
+                    Date d = new Date(timestamp);
+                    String strdate = SDM.format(d);
+                    for(String word:keywords)
+                    {
+                        if(text.contains(word))
+                        {
+                            HashMap<String, Integer> wordcount = new HashMap<String, Integer>();
+                            if(datecount.containsKey(strdate))
+                            {
+                                wordcount = datecount.get(strdate);
+                            }
+                            if(wordcount.containsKey(word))
+                            {
+                                wordcount.put(word, wordcount.get(word)+1);
+                            }
+                            else
+                            {
+                                wordcount.put(word, 1);
+                            }
+                            //update the wordcount for the specific date
+                            datecount.put(strdate, wordcount);
+                        }
+                    }
+                } catch (JSONException ex) {
+                    Logger.getLogger(TrendComparisonExample.class.getName()).log(Level.SEVERE, null, ex);
+                }
+            }
+            //sort the dates
+            ArrayList<TCDateInfo> dinfos = new ArrayList<TCDateInfo>();
+            Set<String> keys = datecount.keySet();
+            for(String key:keys)
+            {
+                TCDateInfo dinfo = new TCDateInfo();
+                try {
+                    dinfo.d = SDM.parse(key);
+                } catch (ParseException ex) {
+                    ex.printStackTrace();
+                    continue;
+                }
+                dinfo.wordcount = datecount.get(key);
+                dinfos.add(dinfo);
+            }
+            Collections.sort(dinfos);
+            //prepare the output
+            for(TCDateInfo date:dinfos)
+            {
+                JSONObject item = new JSONObject();
+                String strdate = SDM.format(date.d);
+                try{
+                    item.put("date",strdate);
+                    HashMap<String, Integer> wordcount = date.wordcount;
+                    for(String word:keywords)
+                    {
+                        if(wordcount.containsKey(word))
+                        {
+                            item.put(word, wordcount.get(word));
+                        }
+                        else
+                        {
+                            item.put(word, 0);
+                        }
+                    }
+                    result.put(item);
+                }catch(JSONException ex)
+                {
+                    ex.printStackTrace();
+                }
+            }
+        }catch(IOException ex)
+        {
+            ex.printStackTrace();
+        }finally{
+            try {
+                br.close();
+            } catch (IOException ex) {
+                Logger.getLogger(TrendComparisonExample.class.getName()).log(Level.SEVERE, null, ex);
+            }
+        }
+        return result;
+    }
+
+    public static void main(String[] args)
+    {
+        TrendComparisonExample tce = new TrendComparisonExample();
+        ArrayList<String> words = new ArrayList<String>();
+        String infilename = DEF_INFILENAME;
+        if(args!=null)
+        {
+            if(args.length>=1&&!args[0].isEmpty())
+            {
+                File fl = new File(args[0]);
+                if(fl.exists())
+                {
+                    infilename = args[0];
+                }
+            }
+            for(int i=1;i<args.length;i++)
+            {
+                if(!args[i].isEmpty())
+                {
+                    words.add(args[i].toLowerCase());
+                }
+            }
+        }
+        System.out.println(tce.GenerateDataTrend(infilename, words));
+    }
+
+}
diff --git a/src/utils/Tags.java b/src/utils/Tags.java
new file mode 100644
--- /dev/null
+++ b/src/utils/Tags.java
@@ -0,0 +1,52 @@
+/* TweetTracker. Copyright (c) Arizona Board of Regents on behalf of Arizona State University
+ * @author shamanth
+ */
+package utils;
+
+public class Tags implements Comparable
+{
+    private String key;
+    private int value;
+
+    public int compareTo(Object o) {
+        Tags tempObject = (Tags) o;
+        if(this.value>tempObject.value)
+            return 1;
+        if(this.value<tempObject.value)
+            return -1;
+        return 0;
+    }
+
+    public String getKey() {
+        return key;
+    }
+
+    public void setKey(String key) {
+        this.key = key;
+    }
+
+    public int getValue() {
+        return value;
+    }
+
+    public void setValue(int value) {
+        this.value = value;
+    }
+}
diff --git a/src/utils/TextUtils.java b/src/utils/TextUtils.java
new file mode 100644
--- /dev/null
+++ b/src/utils/TextUtils.java
@@ -0,0 +1,212 @@
+/* TweetTracker. Copyright (c) Arizona Board of Regents on behalf of Arizona State University
+ * @author shamanth
+ */
+package utils;
+
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.UnsupportedEncodingException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+public class TextUtils
+{
+    HashSet<String> STOPWORDS = new HashSet<String>();
+
+    String SEPARATOR = " ";
+
+    /**
+     * Loads the stop words from a file into a collection for use by all methods in this class
+     * @param filename
+     */
+    public void LoadStopWords(String filename)
+    {
+        if(!filename.isEmpty())
+        {
+
+            BufferedReader bread = null;
+            try {
+                bread = new BufferedReader(new InputStreamReader(new FileInputStream(filename), "UTF8"));
+                String temp = "";
+                try {
+                    while ((temp = bread.readLine()) != null) {
+                        if (!temp.isEmpty()) {
+                            String[] stwords = temp.split(",");
+                            for (String t : stwords) {
+                                t = t.toLowerCase();
+                                if (!STOPWORDS.contains(t)) {
+                                    STOPWORDS.add(t);
+                                }
+                            }
+                        }
+                    }
+                } catch (IOException ex) {
+                    Logger.getLogger(TextUtils.class.getName()).log(Level.SEVERE, null, ex);
+                }
+            } catch (UnsupportedEncodingException ex) {
+                Logger.getLogger(TextUtils.class.getName()).log(Level.SEVERE, null, ex);
+            } catch (FileNotFoundException ex) {
+                Logger.getLogger(TextUtils.class.getName()).log(Level.SEVERE, null, ex);
+            } finally {
+                try {
+                    bread.close();
+                } catch (IOException ex) {
+                    Logger.getLogger(TextUtils.class.getName()).log(Level.SEVERE, null, ex);
+                }
+            }
+        }
+    }
+
+    /**
+     * Converts a tweet/text into individual words/tokens. All stopwords are removed and the list also does not contain hyperlinks.
+     * Splitting is performed on space.
+     * @param text
+     * @param ignoreHashtags
+     * @param ignoreUsernames
+     * @return a map of the words contained in the text and their frequencies
+     */
+    public HashMap<String, Integer> TokenizeText(String text, boolean ignoreHashtags, boolean ignoreUsernames)
+    {
+        String[] tokens = text.split(SEPARATOR);
+        HashMap<String, Integer> words = new HashMap<String, Integer>();
+        for(String token:tokens)
+        {
+            token = token.replaceAll("\"|'|\\.|;|,", "");
+            if(token.isEmpty()||token.length()<=2||STOPWORDS.contains(token)||token.startsWith("&")||token.startsWith("http"))
+            {
+                continue;
+            }
+            else
+            {
+                if(ignoreHashtags)
+                {
+                    if(token.startsWith("#"))
+                    {
+                        continue;
+                    }
+                }
+                if(ignoreUsernames)
+                {
+                    if(token.startsWith("@"))
+                    {
+                        continue;
+                    }
+                }
+                if(!words.containsKey(token))
+                {
+                    words.put(token,1);
+                }
+                else
+                {
+                    words.put(token, words.get(token)+1);
+                }
+            }
+        }
+        return words;
+    }
+
+    /**
+     * Checks whether the tweet is a retweet based on the presence of the RT pattern at the start of the text. Expects the tweet text to be in lowercase.
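+     * For example, IsTweetRT("rt @some_user occupy wall street") returns true, while the
+     * same pattern appearing later in the text does not match because of the ^ anchor
+     * ("some_user" is a hypothetical handle, for illustration only).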
+     * @param text
+     * @return
+     */
+    public static boolean IsTweetRT(String text)
+    {
+        Pattern p = Pattern.compile("^rt @[a-z_0-9]+");
+        Matcher m = p.matcher(text);
+        if(m.find())
+        {
+            return true;
+        }
+        return false;
+    }
+
+    /**
+     * Checks whether the text contains a hyperlink
+     * @param text
+     * @return
+     */
+    public static boolean ContainsURL(String text)
+    {
+        Pattern urlpat = Pattern.compile("https?://[a-zA-Z0-9\\./]+");
+        Matcher urlmat = urlpat.matcher(text);
+        if(urlmat.find())
+        {
+            return true;
+        }
+        else
+            return false;
+    }
+
+    /**
+     * Extracts and returns a list of hashtags from the text
+     * @param text
+     * @return
+     */
+    public static ArrayList<String> GetHashTags(String text)
+    {
+        Pattern p = Pattern.compile("#[a-zA-Z0-9]+");
+        Matcher mat = p.matcher(text);
+        ArrayList<String> tags = new ArrayList<String>();
+        while(mat.find())
+        {
+            String tag = text.substring(mat.start(),mat.end());
+            if(!tags.contains(tag.toLowerCase()))
+            {
+                tags.add(tag.toLowerCase());
+            }
+        }
+        return tags;
+    }
+
+    /**
+     * Removes LF and CR from the text as well as any quotes and backslashes
+     * @param text
+     * @return
+     */
+    public static String GetCleanText(String text)
+    {
+        text = text.replaceAll("'|\"|&quot;", "");
+        text = text.replaceAll("\\\\", "");
+        text = text.replaceAll("\r\n|\n|\r", " ");
+        text = text.trim();
+        return text;
+    }
+
+    /**
+     * Removes all patterns that correspond to a retweeted status, leaving only the original text
+     * @param tweet
+     * @return
+     */
+    public static String RemoveRTElements(String tweet)
+    {
+        String text = tweet.replaceAll("rt @[a-z_A-Z0-9]+", " ");
+        text = text.replaceAll("RT @[a-z_A-Z0-9]+", " ");
+        text = text.replaceAll(":","");
+        return text.trim();
+    }
+
+    /**
+     * Removes all hashtags, URLs, and usernames from the tweet text
+     * @param tweet
+     * @return
+     */
+    public static String RemoveTwitterElements(String tweet)
+    {
+        String temptweet = tweet.replaceAll("#[a-zA-Z_0-9]+", "");
+        temptweet = temptweet.replaceAll("https?://[a-zA-Z0-9\\./]+", "");
+        temptweet = temptweet.replaceAll("@[a-zA-Z_0-9]+", "");
+        temptweet = temptweet.replaceAll("[:?\\.;<>()]", "");
+        return temptweet;
+    }
+
+}
diff --git a/streaming/streaming.config b/streaming/streaming.config
new file mode 100644
index 0000000..d6e27cb
--- /dev/null
+++ b/streaming/streaming.config
@@ -0,0 +1,3 @@
+#morsi #egypt #tahrir #june30 #scaf
+-118.79,32.49,-115.23,34.67
+15127356 20627637
\ No newline at end of file
-- 
cgit v1.2.1