summaryrefslogtreecommitdiff
path: root/src/analysis/BrandChecker.java
blob: 10e22b48c3e85efb8e007c95e061cf8deb2824e0 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
/*
 * DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE Version 2, December 2004
 *
 * Copyright (C) 2004 Sam Hocevar
 *
 * Everyone is permitted to copy and distribute verbatim or modified copies 
 * of this license document, and changing it is allowed as long as the name is 
 * changed.
 *
 * DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE TERMS AND CONDITIONS FOR COPYING, 
 * DISTRIBUTION AND MODIFICATION
 *
 * 0. You just DO WHAT THE FUCK YOU WANT TO.
 */
package analysis;

import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Scanner;
import java.util.Set;

/**
 * Finds the brands that are mentioned in a piece of text.
 *
 * <p>The brands are described by a rule file with one rule per line, written
 * as {@code <name> - <positive words> [- <blacklisted words>]}. Both word
 * lists are comma separated. A rule matches a text when every positive word
 * occurs in it and none of the blacklisted words does. Empty lines are
 * skipped and malformed lines are reported on {@code System.err}.
 *
 * <p>Rules are lower-cased when they are read, so the brand names returned by
 * {@link #getBrands(String)} are lower case as well. Because {@code -} is the
 * field separator, a brand name or word cannot itself contain a hyphen.
 *
 * @author Maurice Laveaux
 */
public class BrandChecker {

    /**
     * The rules that determine the brands, in the order they were read.
     */
    private final List<BrandRule> ruleset = new ArrayList<BrandRule>();

    /**
     * Creates a checker from the rules stored in the given file.
     *
     * @param filename The filename that contains all the rules.
     * @throws IllegalArgumentException if the file cannot be opened; the
     * original {@link FileNotFoundException} is attached as the cause.
     */
    public BrandChecker(final String filename) {
        try {
            readFile(filename);
        } catch (FileNotFoundException ex) {
            // Keep the cause so the caller can see why the file was rejected.
            throw new IllegalArgumentException("file named " + filename + " not found.", ex);
        }
    }

    /**
     * Get the brands that are in some text.
     *
     * @param text Any valid text, must not be null.
     * @return The brands whose rule matches this text, in rule file order. The
     * list is empty when nothing matches; it is never null.
     */
    public List<String> getBrands(String text) {
        text = removePunct(text);
        // Same fixed locale as used for the rules, so that matching does not
        // depend on the default locale (e.g. Turkish maps 'I' to a dotless i).
        String[] words = text.toLowerCase(Locale.ENGLISH).split("\\s+");

        List<String> brands = new ArrayList<String>();

        for (BrandRule rule : ruleset) {
            if (rule.analyze(words)) {
                brands.add(rule.getBrand());
            }
        }

        return brands;
    }

    /**
     * Reads the file and parses the rules, which are added to the ruleset.
     *
     * @param filename The file to read the rules from.
     * @throws FileNotFoundException if the file cannot be opened.
     */
    private void readFile(final String filename) throws FileNotFoundException {
        InputStream inFile = new FileInputStream(filename);
        Scanner reader = new Scanner(inFile);

        try {
            while (reader.hasNextLine()) {
                parseRule(reader.nextLine().toLowerCase(Locale.ENGLISH));
            }
        } finally {
            // Closing the scanner also closes the underlying file stream.
            reader.close();
        }
    }

    /**
     * Parses the line and adds the BrandRule to the ruleset.
     *
     * <p>Empty lines are skipped. Lines without a positive word list are
     * reported on {@code System.err} and skipped. When more than three
     * {@code -} separated parts are given, a warning is printed and only the
     * first three parts are used.
     *
     * @param line One (lower-cased) line of the rule file.
     */
    private void parseRule(String line) {
        if (line.isEmpty()) {
            return;
        }

        if (!line.contains("-")) {
            System.err.println("illformatted rule: " + line + ", missing -");
            return;
        }

        String[] parts = line.split("-");
        // At least a name and the positive words are required.
        if (parts.length < 2) {
            System.err.println("illformatted rule: " + line + ", missing <name> - <positive>.");
            return;
        }

        // Anything after the blacklist is dropped, but the rule itself
        // (including its blacklist) is still used.
        if (parts.length > 3) {
            System.err.println("illformatted rule: " + line + ", forth part with - was given thus will be ignored.");
        }

        // Read the <name> part.
        String name = parts[0].trim();

        // Read the positive words; all spaces are removed before splitting.
        String[] sequence = parts[1].replaceAll(" ", "").split(",");

        // Read the optional blacklisted words.
        String[] blacklist = null;
        if (parts.length >= 3) {
            blacklist = parts[2].replaceAll(" ", "").split(",");
        }

        ruleset.add(new BrandRule(name, sequence, blacklist));
    }

    /**
     * Removes punctuation by replacing every character that is not an ASCII
     * letter or digit with a space.
     */
    private String removePunct(String text) {
        return text.replaceAll("[^a-zA-Z0-9]", " ");
    }

    /**
     * A single rule: a brand name, the words that must all be present and the
     * words that must not be present. Its fields are only written in the
     * constructor; {@link #analyze(String[])} keeps its bookkeeping local.
     */
    private static final class BrandRule {

        /**
         * The words that should be in the text.
         */
        private final Set<String> names;

        /**
         * A blacklist of words that are not interesting; empty when the rule
         * has no blacklist.
         */
        private final Set<String> blacklist;

        /**
         * The brand name of this rule.
         */
        private final String brand;

        /**
         *
         * @param brandname The brand of this rule.
         * @param names The words that must all occur in the text.
         * @param blacklist The blacklisted words, or null when there are none.
         */
        public BrandRule(final String brandname, final String[] names, final String[] blacklist) {
            this.brand = brandname;
            this.names = new HashSet<String>(Arrays.asList(names));
            if (blacklist != null) {
                this.blacklist = new HashSet<String>(Arrays.asList(blacklist));
            } else {
                this.blacklist = new HashSet<String>();
            }
        }

        /**
         * Analyzes if this rule holds for some text.
         *
         * @param words A list of words in a line.
         * @return true when none of the words is blacklisted and every
         * required word occurs at least once.
         */
        public boolean analyze(String[] words) {
            // The distinct required words seen so far, local to this call.
            Set<String> seen = new HashSet<String>();

            for (String word : words) {
                if (blacklist.contains(word)) {
                    return false;
                }

                if (names.contains(word)) {
                    seen.add(word);
                }
            }

            return seen.size() == names.size();
        }

        public String getBrand() {
            return brand;
        }
    }
}