From dfe27ab80266b7191e36cbc95f3cd9dacafb6c5f Mon Sep 17 00:00:00 2001 From: Peter Wu Date: Sun, 11 May 2014 13:54:59 +0200 Subject: More data sanitization (decode HTML entities, location fix) --- src/data/User.java | 6 ++++++ src/database/QueryUtils.java | 16 +++++++++++++--- src/main/DataFiller.java | 19 ++++++++++++++++++- 3 files changed, 37 insertions(+), 4 deletions(-) diff --git a/src/data/User.java b/src/data/User.java index 3a56582..7df1d4c 100644 --- a/src/data/User.java +++ b/src/data/User.java @@ -26,6 +26,12 @@ public class User { public String description; public boolean verified; + @Override + public String toString() { + return TwitterJsonDeserializer.getGsonBuilder() + .setPrettyPrinting().create().toJson(this); + } + public static class Entities { @ValidatingJsonDeserializer.Validator diff --git a/src/database/QueryUtils.java b/src/database/QueryUtils.java index 678df8c..b103a06 100644 --- a/src/database/QueryUtils.java +++ b/src/database/QueryUtils.java @@ -93,12 +93,12 @@ public class QueryUtils { public static void setInsertParams(NamedPreparedStatement tweetStatement, NamedPreparedStatement profileStatement, - Tweet tweet) throws SQLException { + Tweet tweet, String tweetText) throws SQLException { tweetStatement.setLong("tweetid", tweet.id); tweetStatement.setTimestamp("createdat", tweet.created_at); tweetStatement.setInt("favcount", tweet.favorite_count); tweetStatement.setLong("retweetcount", tweet.retweet_count); - tweetStatement.setString("text", tweet.text); + tweetStatement.setString("text", tweetText); if (tweet.coordinates != null) { float[] coords = tweet.coordinates.coordinates; String coords_str = String.format("%f,%f", coords[0], coords[1]); @@ -128,7 +128,8 @@ public class QueryUtils { profileStatement.setInt("tweetcount", twuser.statuses_count); profileStatement.setInt("followercount", twuser.followers_count); profileStatement.setInt("followedcount", twuser.friends_count); - profileStatement.setString("location", twuser.location); 
+ String userLocation = getUserLocation(twuser); + profileStatement.setString("location", userLocation); profileStatement.setString("tweetname", twuser.screen_name); profileStatement.setTimestamp("createdat", twuser.created_at); profileStatement.setString("language", twuser.lang); @@ -142,4 +143,13 @@ public class QueryUtils { brandStmt.setString("brand", brand); // TODO: rating (positive) } + + private static String getUserLocation(User user) { + String location = user.location; + if (location != null && location.contains("\0")) { + System.err.println("Warning: \\0 location found for user " + user); + location = location.replace("\0", ""); + } + return location; + } } diff --git a/src/main/DataFiller.java b/src/main/DataFiller.java index 55bc273..437fcdb 100644 --- a/src/main/DataFiller.java +++ b/src/main/DataFiller.java @@ -51,14 +51,31 @@ public class DataFiller { } } + /** + * Replaces three HTML entities ('&amp;', '&lt;', '&gt;') by their normal + * forms. Later, this might also strip useless characters. + * + * @param text The tweet message. + * @return A sanitized form of the tweet text. + */ + private String sanitizeTweetText(String text) { + // sample tweets, 12k contained '&amp;', 2.6k '&gt;' and 300 '&lt;'. + // Note: time_zone, description, expanded_url, location are unaffected. + return text + .replaceAll("&lt;", "<") + .replaceAll("&gt;", ">") + .replaceAll("&amp;", "&"); + } + public void processTweet(Tweet tweet) throws SQLException { // process retweets first because of the foreign key. if (tweet.retweeted_status != null) { processTweet(tweet.retweeted_status); } + String text = sanitizeTweetText(tweet.text); // ensure that the user and tweet are known before adding relations - QueryUtils.setInsertParams(m_insertTweet, m_insertProfile, tweet); + QueryUtils.setInsertParams(m_insertTweet, m_insertProfile, tweet, text); m_insertProfile.executeUpdate(); m_insertTweet.executeUpdate(); -- cgit v1.2.1