summary refs log tree commit diff
path: root/src/main/DataFiller.java
diff options
context:
space:
mode:
Diffstat (limited to 'src/main/DataFiller.java')
-rw-r--r-- src/main/DataFiller.java 19
1 files changed, 18 insertions, 1 deletions
diff --git a/src/main/DataFiller.java b/src/main/DataFiller.java
index 55bc273..437fcdb 100644
--- a/src/main/DataFiller.java
+++ b/src/main/DataFiller.java
@@ -51,14 +51,31 @@ public class DataFiller {
}
}
+ /**
+ * Decodes the HTML entities {@code &lt;}, {@code &gt;} and {@code &amp;} into
+ * {@code <}, {@code >} and {@code &}. Later, this might also strip useless characters.
+ *
+ * @param text The tweet message; it is dereferenced directly, so it must not be null.
+ * @return A sanitized form of the tweet text.
+ */
+ private String sanitizeTweetText(String text) {
+ // In the sample tweets, 12k contained '&amp', 2.6k '>' and 300 '<' (presumably counted in escaped form; confirm).
+ // Note: time_zone, description, expanded_url, location are unaffected.
+ return text
+ .replaceAll("&lt;", "<") // patterns hold no regex metacharacters, so they match literally
+ .replaceAll("&gt;", ">")
+ .replaceAll("&amp;", "&"); // decoded last, so "&amp;lt;" yields "&lt;" rather than "<"
+ }
+
public void processTweet(Tweet tweet) throws SQLException {
// process retweets first because of the foreign key.
if (tweet.retweeted_status != null) {
processTweet(tweet.retweeted_status);
}
+ String text = sanitizeTweetText(tweet.text);
// ensure that the user and tweet are known before adding relations
- QueryUtils.setInsertParams(m_insertTweet, m_insertProfile, tweet);
+ QueryUtils.setInsertParams(m_insertTweet, m_insertProfile, tweet, text);
m_insertProfile.executeUpdate();
m_insertTweet.executeUpdate();