summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/data/User.java6
-rw-r--r--src/database/QueryUtils.java16
-rw-r--r--src/main/DataFiller.java19
3 files changed, 37 insertions, 4 deletions
diff --git a/src/data/User.java b/src/data/User.java
index 3a56582..7df1d4c 100644
--- a/src/data/User.java
+++ b/src/data/User.java
@@ -26,6 +26,12 @@ public class User {
public String description;
public boolean verified;
+ @Override
+ public String toString() {
+     // Serialize this object to pretty-printed JSON via the Gson builder
+     // obtained from TwitterJsonDeserializer.
+     String json = TwitterJsonDeserializer.getGsonBuilder()
+             .setPrettyPrinting()
+             .create()
+             .toJson(this);
+     return json;
+ }
+
public static class Entities {
@ValidatingJsonDeserializer.Validator
diff --git a/src/database/QueryUtils.java b/src/database/QueryUtils.java
index 678df8c..b103a06 100644
--- a/src/database/QueryUtils.java
+++ b/src/database/QueryUtils.java
@@ -93,12 +93,12 @@ public class QueryUtils {
public static void setInsertParams(NamedPreparedStatement tweetStatement,
NamedPreparedStatement profileStatement,
- Tweet tweet) throws SQLException {
+ Tweet tweet, String tweetText) throws SQLException {
tweetStatement.setLong("tweetid", tweet.id);
tweetStatement.setTimestamp("createdat", tweet.created_at);
tweetStatement.setInt("favcount", tweet.favorite_count);
tweetStatement.setLong("retweetcount", tweet.retweet_count);
- tweetStatement.setString("text", tweet.text);
+ tweetStatement.setString("text", tweetText);
if (tweet.coordinates != null) {
float[] coords = tweet.coordinates.coordinates;
String coords_str = String.format("%f,%f", coords[0], coords[1]);
@@ -128,7 +128,8 @@ public class QueryUtils {
profileStatement.setInt("tweetcount", twuser.statuses_count);
profileStatement.setInt("followercount", twuser.followers_count);
profileStatement.setInt("followedcount", twuser.friends_count);
- profileStatement.setString("location", twuser.location);
+ String userLocation = getUserLocation(twuser);
+ profileStatement.setString("location", userLocation);
profileStatement.setString("tweetname", twuser.screen_name);
profileStatement.setTimestamp("createdat", twuser.created_at);
profileStatement.setString("language", twuser.lang);
@@ -142,4 +143,13 @@ public class QueryUtils {
brandStmt.setString("brand", brand);
// TODO: rating (positive)
}
+
+ /**
+  * Returns the location of the given user with all NUL ('\0') characters
+  * removed. A {@code null} location is returned as-is. When a NUL is
+  * found, a warning including the user is printed to standard error.
+  *
+  * NOTE(review): presumably the NUL is stripped because the database
+  * cannot store it in the location column; confirm against the schema.
+  *
+  * @param user The user whose location is read.
+  * @return The location without NUL characters, or null if there is none.
+  */
+ private static String getUserLocation(User user) {
+     String raw = user.location;
+     // Nothing to clean: no location at all, or no NUL inside it.
+     if (raw == null || raw.indexOf('\0') < 0) {
+         return raw;
+     }
+     System.err.println("Warning: \\0 location found for user " + user);
+     return raw.replace("\0", "");
+ }
}
diff --git a/src/main/DataFiller.java b/src/main/DataFiller.java
index 55bc273..437fcdb 100644
--- a/src/main/DataFiller.java
+++ b/src/main/DataFiller.java
@@ -51,14 +51,31 @@ public class DataFiller {
}
}
+ /**
+  * Replaces the three HTML entities {@code &lt;}, {@code &gt;} and
+  * {@code &amp;} in a tweet text by the characters they stand for
+  * ('<', '>' and '&'). Later, this might also strip useless characters.
+  *
+  * @param text The tweet message.
+  * @return A sanitized form of the tweet text.
+  */
+ private String sanitizeTweetText(String text) {
+     // Counts observed in the sample tweets: about 12k contained the
+     // ampersand entity, 2.6k the greater-than entity and 300 the
+     // less-than entity.
+     // Note: time_zone, description, expanded_url, location are unaffected.
+     //
+     // String.replace performs literal substitution, so no regex is
+     // compiled per call. The ampersand entity is decoded last so that a
+     // double-escaped sequence such as "&amp;lt;" becomes "&lt;" and is
+     // not decoded a second time into "<".
+     return text
+             .replace("&lt;", "<")
+             .replace("&gt;", ">")
+             .replace("&amp;", "&");
+ }
+
public void processTweet(Tweet tweet) throws SQLException {
// process retweets first because of the foreign key.
if (tweet.retweeted_status != null) {
processTweet(tweet.retweeted_status);
}
+ String text = sanitizeTweetText(tweet.text);
// ensure that the user and tweet are known before adding relations
- QueryUtils.setInsertParams(m_insertTweet, m_insertProfile, tweet);
+ QueryUtils.setInsertParams(m_insertTweet, m_insertProfile, tweet, text);
m_insertProfile.executeUpdate();
m_insertTweet.executeUpdate();