summaryrefslogtreecommitdiff
path: root/src/main/DataFiller.java
blob: 803d9261a30181c1484ead3fb2b37124b0f22600 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
package main;

import data.Tweet;
import data.User;
import database.NamedPreparedStatement;
import database.QueryUtils;
import java.sql.Connection;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

/**
 * Processes incoming tweets and fills the database with them.
 *
 * <p>All insert statements are prepared once, in the constructor, on the
 * connection handed in by the caller; {@link #processTweet(Tweet)} reuses them
 * for every tweet.
 *
 * @author Maurice Laveaux
 */
public class DataFiller {

    /**
     * Brand detection table. The first element of each row is the brand name
     * that is stored; the remaining elements are the lower-case keywords that
     * trigger it. Rows are evaluated in order, so this order is also the order
     * of the list returned by {@link #getBrands(Tweet)}.
     */
    private static final String[][] BRAND_KEYWORDS = {
        {"Samsung", "samsung", "galaxy"},
        {"HTC", "htc", "one"},
        {"Apple", "apple", "iphone"},
        {"Sony", "sony", "xperia"},
        {"Huawei", "huawei", "ascend"},
        {"LG", "lg"},
    };

    /**
     * The main database connection to fill.
     */
    private final Connection m_connection;

    private final NamedPreparedStatement m_insertTweet;
    private final NamedPreparedStatement m_insertProfile;
    private final NamedPreparedStatement m_insertBrand;
    private final NamedPreparedStatement m_insertHash;
    private final NamedPreparedStatement m_insertTweetUrl;
    private final NamedPreparedStatement m_insertUserUrl;
    private final NamedPreparedStatement m_insertMentions;

    /**
     * Create the datafiller object.
     *
     * @param connection The database connection to use.
     * @throws java.sql.SQLException on error preparing the database connection.
     */
    public DataFiller(Connection connection) throws SQLException {
        m_connection = connection;
        m_insertTweet = new NamedPreparedStatement(m_connection, QueryUtils.insertTweet);
        m_insertProfile = new NamedPreparedStatement(m_connection, QueryUtils.insertProfile);
        m_insertBrand = new NamedPreparedStatement(m_connection, QueryUtils.insertBrand);
        m_insertHash = new NamedPreparedStatement(m_connection, QueryUtils.insertHash);
        m_insertTweetUrl = new NamedPreparedStatement(m_connection, QueryUtils.insertTweetUrl);
        m_insertUserUrl = new NamedPreparedStatement(m_connection, QueryUtils.insertUserUrl);
        m_insertMentions = new NamedPreparedStatement(m_connection, QueryUtils.insertMentions);
    }

    /**
     * Replaces three HTML entities ({@code &amp;}, {@code &lt;} and
     * {@code &gt;}) by the characters they stand for. Later, this might also
     * strip useless characters.
     *
     * @param text The tweet message.
     * @return A sanitized form of the tweet text.
     */
    private static String sanitizeTweetText(String text) {
        // sample tweets: 12k contained '&amp;', 2.6k '&gt;' and 300 '&lt;'.
        // Note: time_zone, description, expanded_url, location are unaffected.
        // '&amp;' is handled last so that an escaped entity such as
        // '&amp;lt;' becomes '&lt;' instead of being unescaped twice.
        // String.replace is used because these are plain literals, not regexes.
        return text
                .replace("&lt;", "<")
                .replace("&gt;", ">")
                .replace("&amp;", "&");
    }

    /**
     * Stores a tweet, its author and everything that hangs off the tweet
     * (hashtags, URLs, mentions, detected brands).
     *
     * <p>If the tweet is a retweet, the retweeted status is processed first
     * (recursively), because of the foreign key between the two.
     *
     * @param tweet The tweet to store.
     * @throws SQLException if any of the insert statements fails.
     */
    public void processTweet(Tweet tweet) throws SQLException {
        // process retweets first because of the foreign key.
        if (tweet.retweeted_status != null) {
            processTweet(tweet.retweeted_status);
        }

        String text = sanitizeTweetText(tweet.text);
        // ensure that the user and tweet are known before adding relations
        QueryUtils.setInsertParams(m_insertTweet, m_insertProfile, tweet, text);
        m_insertProfile.executeUpdate();
        m_insertTweet.executeUpdate();

        insertHashtags(tweet);
        insertTweetUrls(tweet);
        insertMentions(tweet);
        insertUserUrls(tweet);
        insertBrands(tweet);
    }

    /** Inserts one row per hashtag of the tweet. */
    private void insertHashtags(Tweet tweet) throws SQLException {
        for (Tweet.Hashtag hashtag : tweet.entities.hashtags) {
            m_insertHash.setLong("tweetid", tweet.id);
            m_insertHash.setString("hashtag", hashtag.text);
            m_insertHash.executeUpdate();
        }
    }

    /** Inserts one row per (expanded) URL contained in the tweet. */
    private void insertTweetUrls(Tweet tweet) throws SQLException {
        for (Tweet.Url url : tweet.entities.urls) {
            m_insertTweetUrl.setLong("tweetid", tweet.id);
            m_insertTweetUrl.setString("url", url.expanded_url);
            m_insertTweetUrl.executeUpdate();
        }
    }

    /** Inserts one row per user mentioned in the tweet. */
    private void insertMentions(Tweet tweet) throws SQLException {
        for (Tweet.Mention mention : tweet.entities.user_mentions) {
            m_insertMentions.setLong("tweetid", tweet.id);
            m_insertMentions.setLong("userid", mention.id);
            m_insertMentions.executeUpdate();
        }
    }

    /** Inserts the URL rows for the author of the tweet, if the author has entities. */
    private void insertUserUrls(Tweet tweet) throws SQLException {
        User user = tweet.user;
        if (user.entities != null) {
            // NOTE(review): this is guarded by the *user's* entities but
            // iterates the *tweet's* URLs, so the tweet URLs end up stored as
            // user URLs. Looks like a copy/paste slip; confirm against the
            // User class and iterate the user's own URL list instead.
            for (Tweet.Url url : tweet.entities.urls) {
                m_insertUserUrl.setLong("userid", user.id);
                m_insertUserUrl.setString("url", url.expanded_url);
                m_insertUserUrl.executeUpdate();
            }
        }
    }

    /** Inserts one row per brand detected in the tweet text. */
    private void insertBrands(Tweet tweet) throws SQLException {
        // determine the user's perception of the brand
        List<String> brands = getBrands(tweet);
        for (String brand : brands) {
            QueryUtils.setInsertBrandParams(m_insertBrand, tweet.id, brand);
            m_insertBrand.executeUpdate();
        }
    }

    /**
     * Detects which brands a tweet talks about by looking for keywords in the
     * (unsanitized) tweet text, ignoring case.
     *
     * <p>Matching is a plain substring search, so short keywords also hit
     * inside unrelated words: "one" matches "phone", "lg" matches
     * "algorithm".
     *
     * @param tweet The tweet to inspect.
     * @return The names of the detected brands, in the order of
     *         {@code BRAND_KEYWORDS}; empty if no keyword matched.
     */
    ArrayList<String> getBrands(Tweet tweet) {
        ArrayList<String> result = new ArrayList<>();
        // Locale.ROOT keeps the lower-casing independent of the JVM's default
        // locale (e.g. under a Turkish locale "IPHONE" would otherwise turn
        // into "ıphone" and no longer match).
        String text = tweet.text.toLowerCase(Locale.ROOT);
        for (String[] entry : BRAND_KEYWORDS) {
            // entry[0] is the brand name, entry[1..] are its keywords.
            for (int i = 1; i < entry.length; i++) {
                if (text.contains(entry[i])) {
                    result.add(entry[0]);
                    break;
                }
            }
        }

        // TODO: replace the substring heuristic by something more precise
        // (the short keywords "one" and "lg" cause many false positives).
        return result;
    }
}