diff options
Diffstat (limited to 'src/main/DataFiller.java')
-rw-r--r-- | src/main/DataFiller.java | 19 |
1 files changed, 18 insertions, 1 deletions
diff --git a/src/main/DataFiller.java b/src/main/DataFiller.java index 55bc273..437fcdb 100644 --- a/src/main/DataFiller.java +++ b/src/main/DataFiller.java @@ -51,14 +51,31 @@ public class DataFiller { } } + /** + * Replaces three HTML entities ('&amp;', '&lt;', '&gt;') by their normal + * forms. Later, this might also strip useless characters. + * + * @param text The tweet message. + * @return A sanitized form of the tweet text. + */ + private String sanitizeTweetText(String text) { + // sample tweets, 12k contained '&amp;', 2.6k '&gt;' and 300 '&lt;'. + // Note: time_zone, description, expanded_url, location are unaffected. + return text + .replaceAll("&lt;", "<") + .replaceAll("&gt;", ">") + .replaceAll("&amp;", "&"); + } + public void processTweet(Tweet tweet) throws SQLException { // process retweets first because of the foreign key. if (tweet.retweeted_status != null) { processTweet(tweet.retweeted_status); } + String text = sanitizeTweetText(tweet.text); // ensure that the user and tweet are known before adding relations - QueryUtils.setInsertParams(m_insertTweet, m_insertProfile, tweet); + QueryUtils.setInsertParams(m_insertTweet, m_insertProfile, tweet, text); m_insertProfile.executeUpdate(); m_insertTweet.executeUpdate(); |