From 058ef3d39029da492477c3c59f1e405193ca3013 Mon Sep 17 00:00:00 2001 From: Maurice Laveaux Date: Mon, 19 May 2014 11:11:22 +0200 Subject: Updated the brandchecker code and rules. * Changed the brandrules.txt ruleset to conform the new parsing of rules. * Brandchecker now doesn't count multiple equal words. * Updated the testcases, which don't work for others. --- src/analysis/BrandChecker.java | 91 +++++++++++++++++++++++++++++++----------- 1 file changed, 68 insertions(+), 23 deletions(-) (limited to 'src') diff --git a/src/analysis/BrandChecker.java b/src/analysis/BrandChecker.java index 740424c..5bde55b 100644 --- a/src/analysis/BrandChecker.java +++ b/src/analysis/BrandChecker.java @@ -19,6 +19,7 @@ import java.io.FileNotFoundException; import java.io.InputStream; import java.util.ArrayList; import java.util.Arrays; +import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Locale; @@ -57,6 +58,7 @@ public class BrandChecker { * @return The list of brands that are contained in this text or null. */ public List getBrands(String text) { + text = removePunct(text); String[] words = text.toLowerCase().split("\\s+"); List brands = new ArrayList(); @@ -70,6 +72,9 @@ public class BrandChecker { return brands; } + /** + * Reads the file and parses the rules, which are added to the ruleset. + */ private void readFile(final String filename) throws FileNotFoundException { InputStream inFile = new FileInputStream(filename); Scanner readFile = new Scanner(inFile); @@ -81,29 +86,52 @@ public class BrandChecker { } } + /** + * Parses the line and adds the BrandRule to the ruleset. + */ private void parseRule(String line) { if (line.isEmpty()) { return; } - + if (!line.contains("-")) { - // only positive search entries. - String[] sequence = line.split("\\s+"); - String[] blacklist = {""}; - ruleset.add(new BrandRule(line, sequence, blacklist)); + System.err.println("illformatted rule: " + line + ", missing -"); } else { String[] parts = line.split("-"); // positive and negative. if (parts.length < 2) { - throw new IllegalArgumentException("Brand rule contained '-' but not two parts."); + System.err.println("illformatted rule: " + line + ", missing - ."); + return; } - String[] sequence = parts[0].trim().split("\\s+"); - String[] blacklist = parts[1].trim().split("\\s+"); + if (parts.length > 4) { + System.err.println("illformatted rule: " + line + ", forth part with - was given thus will be ignored."); + } - ruleset.add(new BrandRule(parts[0].trim(), sequence, blacklist)); + // Read the line. + String name = parts[0].trim(); + + // Read the positive words. + String positive = parts[1].replaceAll(" ",""); + String[] sequence = positive.split(","); + + if (parts.length == 3) { + String negative = parts[2].replaceAll(" ", ""); + String[] blacklist = negative.split(","); + ruleset.add(new BrandRule(name, sequence, blacklist)); + } else { + ruleset.add(new BrandRule(name, sequence, null)); + } } + } + /** + * Removes punctuation and urls. + */ + private String removePunct(String text) { + //text = text.replaceAll("https?://\\S*", ""); + text = text.replaceAll(",", " "); + return text; } private class BrandRule { @@ -111,8 +139,8 @@ public class BrandChecker { /** * The words that should be in the text. */ - private final ArrayList names; - + private final HashMap names; + /** * A blacklist of words that are not interesting. */ @@ -131,8 +159,16 @@ public class BrandChecker { */ public BrandRule(final String brandname, final String[] names, final String[] blacklist) { this.brand = brandname; - this.names = new ArrayList(Arrays.asList(names)); - this.blacklist = new HashSet(Arrays.asList(blacklist)); + this.names = new HashMap(); + if (blacklist != null) { + this.blacklist = new HashSet(Arrays.asList(blacklist)); + } else { + this.blacklist = null; + } + + for (String name : names) { + this.names.put(name, Boolean.FALSE); + } } /** @@ -141,27 +177,36 @@ public class BrandChecker { * @param words A list of words in a line. */ public boolean analyze(String[] words) { + reset(); + int found = 0; for (String word : words) { - if (blacklist.contains(word)) { - return false; - } - - if (names.contains(word)) { - found++; + if (blacklist != null) { + if (blacklist.contains(word)) { + return false; + } } - } - if (found == names.size()) { - return true; + if (names.containsKey(word)) { + if (names.get(word) == false) { + found++; + names.put(word, Boolean.TRUE); + } + } } - return false; + return found == names.size(); } public String getBrand() { return brand; } + + private void reset() { + for (String name : this.names.keySet()) { + this.names.put(name, Boolean.FALSE); + } + } } } -- cgit v1.2.1