summaryrefslogtreecommitdiff
path: root/src/analysis/BrandChecker.java
blob: 740424c7892a544943f994cdb17180de69bdf0de (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
/*
 * DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE Version 2, December 2004
 *
 * Copyright (C) 2004 Sam Hocevar
 *
 * Everyone is permitted to copy and distribute verbatim or modified copies 
 * of this license document, and changing it is allowed as long as the name is 
 * changed.
 *
 * DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE TERMS AND CONDITIONS FOR COPYING, 
 * DISTRIBUTION AND MODIFICATION
 *
 * 0. You just DO WHAT THE FUCK YOU WANT TO.
 */
package analysis;

import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Scanner;
import java.util.Set;

/**
 * Finds the brands that are mentioned in a piece of text.
 * <p>
 * The rules are read from a text file holding one rule per line. A line is
 * either {@code [brand words]} or {@code [brand words] - [blacklisted words]}:
 * the whitespace-separated words in front of the first '-' form the brand and
 * must all occur in the text, while none of the words between the first and
 * the second '-' may occur in it. Anything after a second '-' is ignored.
 * Lines are lower-cased when read and blank lines are skipped.
 *
 * @author Maurice Laveaux
 */
public class BrandChecker {

    /**
     * The rules that determine the brands, in the order they were read.
     */
    private final List<BrandRule> ruleset = new ArrayList<BrandRule>();

    /**
     * Creates a checker from a rule file.
     *
     * @param filename The filename that contains all the rules.
     * @throws IllegalArgumentException if the file cannot be opened or if it
     * contains a malformed rule.
     */
    public BrandChecker(final String filename) {
        try {
            readFile(filename);
        } catch (FileNotFoundException ex) {
            // Keep the original exception as the cause so the reason is not lost.
            throw new IllegalArgumentException("file named " + filename + " not found.", ex);
        }
    }

    /**
     * Get the brands that are in some text.
     *
     * @param text Any valid text, must not be null.
     * @return The brands of all rules that hold for this text, in rule order;
     * an empty list (never null) when no rule holds.
     */
    public List<String> getBrands(String text) {
        // Lower-case with the same locale as the rules so both sides agree
        // regardless of the default locale of the JVM.
        final Set<String> words
                = new HashSet<String>(tokenize(text.toLowerCase(Locale.ENGLISH)));

        final List<String> brands = new ArrayList<String>();
        for (BrandRule rule : ruleset) {
            if (rule.matches(words)) {
                brands.add(rule.getBrand());
            }
        }

        return brands;
    }

    /**
     * Reads all rules of a file into the ruleset.
     *
     * @param filename The file to read, decoded with the platform charset.
     * @throws FileNotFoundException if the file cannot be opened.
     */
    private void readFile(final String filename) throws FileNotFoundException {
        final Scanner scanner = new Scanner(new FileInputStream(filename));
        try {
            while (scanner.hasNextLine()) {
                parseRule(scanner.nextLine().toLowerCase(Locale.ENGLISH));
            }
        } finally {
            // Closing the scanner also closes the underlying file stream.
            scanner.close();
        }
    }

    /**
     * Parses one line of the rule file and adds the resulting rule.
     *
     * @param line A lower-cased line of the rule file.
     * @throws IllegalArgumentException if the line contains a '-' without
     * text on both sides, or if it has no brand words.
     */
    private void parseRule(String line) {
        final String rule = line.trim();
        if (rule.isEmpty()) {
            // Blank lines, including whitespace-only ones, carry no rule.
            return;
        }

        String brand = rule;
        List<String> blacklist = new ArrayList<String>();

        if (rule.contains("-")) {
            // positive and negative.
            final String[] parts = rule.split("-");
            if (parts.length < 2) {
                throw new IllegalArgumentException("Brand rule contained '-' but not two parts.");
            }

            brand = parts[0].trim();
            blacklist = tokenize(parts[1]);
        }

        final List<String> names = tokenize(brand);
        if (names.isEmpty()) {
            // A rule without brand words would hold for every text.
            throw new IllegalArgumentException("Brand rule has no brand name: '" + rule + "'.");
        }

        ruleset.add(new BrandRule(brand, names, blacklist));
    }

    /**
     * Splits a string on whitespace, dropping the empty tokens that
     * {@link String#split(String)} yields for empty or whitespace-led input.
     *
     * @param value The string to split.
     * @return The non-empty words of the string, in order.
     */
    private static List<String> tokenize(final String value) {
        final List<String> tokens = new ArrayList<String>();
        for (String token : value.split("\\s+")) {
            if (!token.isEmpty()) {
                tokens.add(token);
            }
        }
        return tokens;
    }

    /**
     * A single rule: a brand, the words that must be present and the words
     * that must be absent.
     */
    private static final class BrandRule {

        /**
         * The words that should be in the text.
         */
        private final Set<String> names;

        /**
         * A blacklist of words that must not be in the text.
         */
        private final Set<String> blacklist;

        /**
         * The brand name of this rule.
         */
        private final String brand;

        /**
         * @param brandname The brand of this rule.
         * @param names The words that must all be present.
         * @param blacklist The words of which none may be present.
         */
        BrandRule(final String brandname, final List<String> names, final List<String> blacklist) {
            this.brand = brandname;
            this.names = new HashSet<String>(names);
            this.blacklist = new HashSet<String>(blacklist);
        }

        /**
         * Analyzes if this rule holds for some text.
         *
         * @param words The distinct words of the text.
         * @return true if no blacklisted word and every brand word is among
         * the given words.
         */
        boolean matches(final Set<String> words) {
            for (String banned : blacklist) {
                if (words.contains(banned)) {
                    return false;
                }
            }

            // Set containment instead of counting hits: repeated words in the
            // text can neither fake nor prevent a match.
            return words.containsAll(names);
        }

        String getBrand() {
            return brand;
        }
    }
}