From dfe27ab80266b7191e36cbc95f3cd9dacafb6c5f Mon Sep 17 00:00:00 2001 From: Peter Wu Date: Sun, 11 May 2014 13:54:59 +0200 Subject: More data sanization (decode HTML entities, location fix) --- src/main/DataFiller.java | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'src/main/DataFiller.java') diff --git a/src/main/DataFiller.java b/src/main/DataFiller.java index 55bc273..437fcdb 100644 --- a/src/main/DataFiller.java +++ b/src/main/DataFiller.java @@ -51,14 +51,31 @@ public class DataFiller { } } + /** + * Replaces three HTML entities ('&amp;', '&lt;', '&gt;') by their normal + * forms. Later, this might also strip useless characters. + * + * @param text The tweet message. + * @return A sanitized form of the tweet text. + */ + private String sanitizeTweetText(String text) { + // sample tweets, 12k contained '&amp;', 2.6k '&gt;' and 300 '&lt;'. + // Note: time_zone, description, expanded_url, location are unaffected. + return text + .replaceAll("&lt;", "<") + .replaceAll("&gt;", ">") + .replaceAll("&amp;", "&"); + } + public void processTweet(Tweet tweet) throws SQLException { // process retweets first because of the foreign key. if (tweet.retweeted_status != null) { processTweet(tweet.retweeted_status); } + String text = sanitizeTweetText(tweet.text); // ensure that the user and tweet are known before adding relations - QueryUtils.setInsertParams(m_insertTweet, m_insertProfile, tweet); + QueryUtils.setInsertParams(m_insertTweet, m_insertProfile, tweet, text); m_insertProfile.executeUpdate(); m_insertTweet.executeUpdate(); -- cgit v1.2.1