package main; import data.Tweet; import data.User; import database.NamedPreparedStatement; import database.QueryUtils; import java.sql.Connection; import java.sql.SQLException; import java.util.ArrayList; import java.util.List; /** * Process that incoming tweets and fill the database. * * @author Maurice Laveaux */ public class DataFiller { /** * The main database connection to fill. */ private final Connection m_connection; private final NamedPreparedStatement m_insertTweet; private final NamedPreparedStatement m_insertProfile; private final NamedPreparedStatement m_insertBrand; private final NamedPreparedStatement m_insertHash; private final NamedPreparedStatement m_insertTweetUrl; private final NamedPreparedStatement m_insertUserUrl; private final NamedPreparedStatement m_insertMentions; private final int m_category; /** * Create the datafiller object. * * @param connection The database connection to use. * @throws java.sql.SQLException on error preparing the database connection. */ public DataFiller(Connection connection, int category) throws SQLException { m_category = category; m_connection = connection; m_insertTweet = new NamedPreparedStatement(m_connection, QueryUtils.insertTweet); m_insertProfile = new NamedPreparedStatement(m_connection, QueryUtils.insertProfile); m_insertBrand = new NamedPreparedStatement(m_connection, QueryUtils.insertBrand); m_insertHash = new NamedPreparedStatement(m_connection, QueryUtils.insertHash); m_insertTweetUrl = new NamedPreparedStatement(m_connection, QueryUtils.insertTweetUrl); m_insertUserUrl = new NamedPreparedStatement(m_connection, QueryUtils.insertUserUrl); m_insertMentions = new NamedPreparedStatement(m_connection, QueryUtils.insertMentions); // enable transactions connection.setAutoCommit(false); } /** * Replaces three HTML entities ('&', '<', '>') by their normal * forms. Later, this might also strip useless characters. * * @param text The tweet message. * @return A sanitized form of the tweet text. */ private String sanitizeTweetText(String text) { // sample tweets, 12k contained '&', 2.6k '>' and 300 '<'. // Note: time_zone, description, expanded_url, location are unaffected. return text .replaceAll("<", "<") .replaceAll(">", ">") .replaceAll("&", "&"); } public void processTweet(Tweet tweet) throws SQLException { // process retweets first because of the foreign key. if (tweet.retweeted_status != null) { processTweet(tweet.retweeted_status); } /** * Replace all null bytes, non valid UTF-8 * */ if (tweet.text.contains("\0")) { tweet.text = tweet.text.replaceAll("\0", ""); } User user = tweet.user; if (user.name.contains("\0")) { user.name = user.name.replaceAll("\0", ""); } try { // assume that no previous transaction was started. saveTweet(tweet); // concludes the previous query, implicitly starts a new transaction m_connection.commit(); } catch (Exception ex) { m_connection.rollback(); throw ex; } } private void saveTweet(Tweet tweet) throws SQLException { String text = sanitizeTweetText(tweet.text); // ensure that the user and tweet are known before adding relations m_insertTweet.setInt("category", m_category); QueryUtils.setInsertParams(m_insertTweet, m_insertProfile, tweet, text); m_insertProfile.executeUpdate(); m_insertTweet.executeUpdate(); for (Tweet.Hashtag hashtag : tweet.entities.hashtags) { m_insertHash.setLong("tweetid", tweet.id); m_insertHash.setString("hashtag", hashtag.text); m_insertHash.executeUpdate(); } for (Tweet.Url url : tweet.entities.urls) { m_insertTweetUrl.setLong("tweetid", tweet.id); m_insertTweetUrl.setString("url", url.expanded_url); m_insertTweetUrl.executeUpdate(); } for (Tweet.Mention mention : tweet.entities.user_mentions) { m_insertMentions.setLong("tweetid", tweet.id); m_insertMentions.setLong("userid", mention.id); m_insertMentions.executeUpdate(); } User user = tweet.user; if (user.entities != null) { for (Tweet.Url url : tweet.entities.urls) { m_insertUserUrl.setLong("userid", user.id); m_insertUserUrl.setString("url", url.expanded_url); m_insertUserUrl.executeUpdate(); } } // determine the user's perception of the brand /* List brands = getBrands(tweet); for (String brand : brands) { QueryUtils.setInsertBrandParams(m_insertBrand, tweet.id, brand); m_insertBrand.executeUpdate(); } */ } }