diff options
author | Maurice Laveaux <m.laveaux@student.tue.nl> | 2014-05-19 11:11:22 +0200 |
---|---|---|
committer | Maurice Laveaux <m.laveaux@student.tue.nl> | 2014-05-19 11:11:22 +0200 |
commit | 058ef3d39029da492477c3c59f1e405193ca3013 (patch) | |
tree | b12d095895b5fd09b98b3d18cd66251a550d5d72 | |
parent | b91387ded4d27fd92f76df61d08896a0317fb4b1 (diff) | |
download | Goldfarmer-058ef3d39029da492477c3c59f1e405193ca3013.tar.gz |
Updated the brandchecker code and rules.
* Changed the brandrules.txt ruleset to conform to
the new parsing of rules.
* BrandChecker no longer counts the same word more than once.
* Updated the testcases, which don't work for others.
-rwxr-xr-x | brandrules.txt | 102 | ||||
-rw-r--r-- | src/analysis/BrandChecker.java | 91 | ||||
-rw-r--r-- | test/analysis/BrandCheckerTest.java | 9 |
3 files changed, 128 insertions, 74 deletions
diff --git a/brandrules.txt b/brandrules.txt index ae7d6f9..f89cddd 100755 --- a/brandrules.txt +++ b/brandrules.txt @@ -1,54 +1,58 @@ -galaxy s5 - tablet tab
-galaxy s4 - tablet tab
-galaxy s3
-galaxy K zoom
-galaxy note
-samsung note
-samsung zoom
+galaxy - galaxy - s5,s4,s3,zoom,note
+galaxy s5 - galaxy,s5
+galaxy s4 - galaxy,s4
+galaxy s3 - galaxy,s3
+galaxy K-Zoom - galaxy,k,zoom
+galaxy note - galaxy,note
-iphone 5
-iphone 5c
-iphone 5s
-iphone 4
-iphone 4s
+iphone - iphone - 4,4s,5,5s,5c
+iphone 4 - iphone,4
+iphone 4s - iphone,4s
+iphone 5 - iphone,5
+iphone 5s - iphone,5s
+iphone 5c - iphone,5c
-huawei ascend
-huawei p6
-huawei p7
-huawei mini
-huawei y300
-huawei y530
-huawei mate
-huawei g700
-huawei g510
-huawei g6
-huawei g525
+huawei - huawei - ascend,p6,p7,mini,y300,y530,mate,g700,g510,g6,g525
+huawei ascend - huawei,ascend
+huawei p6 - huawei,p6
+huawei p7 - huawei,p7
+huawei mini - huawei,mini
+huawei y300 - huawei,y300
+huawei y530 - huawei,y530
+huawei mate - huawei,mate
+huawei g700 - huawei,g700
+huawei g510 - huawei,g510
+huawei g6 - huawei,g6
+huawei g525 - huawei,g525
-sony xperia
-sony L
-sony E1
-sony Z
-sony Z1
-sony Z2
-sony compact
-sony ZR
-sony M
+sony - sony - xperia,L,E1,Z,Z1,Z2,compact,ZR,M
+sony xperia - sony,xperia
+sony L - sony,L
+sony E1 - sony,E1
+sony Z - sony,Z
+sony Z1 - sony,Z1
+sony Z2 - sony,Z2
+sony compact - sony,compact
+sony ZR - sony,ZR
+sony M - sony,M
-HTC one - phone
-HTC M8
-HTC mini
-HTC desire
-HTC X dual
-HTC SV
+HTC - htc - one,m8,mini,desire,dual,x,sv
+HTC one - htc,one
+HTC M8 - htc,m8
+HTC mini - htc,mini
+HTC desire - htc,desire
+HTC X dual - htc,x,dual
+HTC SV - htc,sv
-LG Nexus 5
-LG G2
-LG L70
-LG L90
-LG L40
-LG G flex
-LG mini
-LG L9
-LG L7
-LG L5
-LG L3
+LG - lg - nexus,g2,l70,l90,flex,mini,l9,l7,l5,l3
+LG Nexus 5 - nexus,5
+LG G2 - lg,g2
+LG L70 - lg,l70
+LG L90 - lg,l90
+LG L40 - lg,l40
+LG G flex - lg,g,flex
+LG mini - lg,mini
+LG L9 - lg,l9
+LG L7 - lg,l7
+LG L5 - lg,l5
+LG L3 - lg,l3
diff --git a/src/analysis/BrandChecker.java b/src/analysis/BrandChecker.java index 740424c..5bde55b 100644 --- a/src/analysis/BrandChecker.java +++ b/src/analysis/BrandChecker.java @@ -19,6 +19,7 @@ import java.io.FileNotFoundException; import java.io.InputStream; import java.util.ArrayList; import java.util.Arrays; +import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Locale; @@ -57,6 +58,7 @@ public class BrandChecker { * @return The list of brands that are contained in this text or null. */ public List<String> getBrands(String text) { + text = removePunct(text); String[] words = text.toLowerCase().split("\\s+"); List<String> brands = new ArrayList(); @@ -70,6 +72,9 @@ public class BrandChecker { return brands; } + /** + * Reads the file and parses the rules, which are added to the ruleset. + */ private void readFile(final String filename) throws FileNotFoundException { InputStream inFile = new FileInputStream(filename); Scanner readFile = new Scanner(inFile); @@ -81,29 +86,52 @@ public class BrandChecker { } } + /** + * Parses the line and adds the BrandRule to the ruleset. + */ private void parseRule(String line) { if (line.isEmpty()) { return; } - + if (!line.contains("-")) { - // only positive search entries. - String[] sequence = line.split("\\s+"); - String[] blacklist = {""}; - ruleset.add(new BrandRule(line, sequence, blacklist)); + System.err.println("illformatted rule: " + line + ", missing -"); } else { String[] parts = line.split("-"); // positive and negative. 
if (parts.length < 2) { - throw new IllegalArgumentException("Brand rule contained '-' but not two parts."); + System.err.println("illformatted rule: " + line + ", missing <name> - <positive>."); + return; } - String[] sequence = parts[0].trim().split("\\s+"); - String[] blacklist = parts[1].trim().split("\\s+"); + if (parts.length > 4) { + System.err.println("illformatted rule: " + line + ", forth part with - was given thus will be ignored."); + } - ruleset.add(new BrandRule(parts[0].trim(), sequence, blacklist)); + // Read the <name> line. + String name = parts[0].trim(); + + // Read the positive words. + String positive = parts[1].replaceAll(" ",""); + String[] sequence = positive.split(","); + + if (parts.length == 3) { + String negative = parts[2].replaceAll(" ", ""); + String[] blacklist = negative.split(","); + ruleset.add(new BrandRule(name, sequence, blacklist)); + } else { + ruleset.add(new BrandRule(name, sequence, null)); + } } + } + /** + * Removes punctuation and urls. + */ + private String removePunct(String text) { + //text = text.replaceAll("https?://\\S*", ""); + text = text.replaceAll(",", " "); + return text; } private class BrandRule { @@ -111,8 +139,8 @@ public class BrandChecker { /** * The words that should be in the text. */ - private final ArrayList<String> names; - + private final HashMap<String, Boolean> names; + /** * A blacklist of words that are not interesting. 
*/ @@ -131,8 +159,16 @@ public class BrandChecker { */ public BrandRule(final String brandname, final String[] names, final String[] blacklist) { this.brand = brandname; - this.names = new ArrayList(Arrays.asList(names)); - this.blacklist = new HashSet(Arrays.asList(blacklist)); + this.names = new HashMap(); + if (blacklist != null) { + this.blacklist = new HashSet(Arrays.asList(blacklist)); + } else { + this.blacklist = null; + } + + for (String name : names) { + this.names.put(name, Boolean.FALSE); + } } /** @@ -141,27 +177,36 @@ public class BrandChecker { * @param words A list of words in a line. */ public boolean analyze(String[] words) { + reset(); + int found = 0; for (String word : words) { - if (blacklist.contains(word)) { - return false; - } - - if (names.contains(word)) { - found++; + if (blacklist != null) { + if (blacklist.contains(word)) { + return false; + } } - } - if (found == names.size()) { - return true; + if (names.containsKey(word)) { + if (names.get(word) == false) { + found++; + names.put(word, Boolean.TRUE); + } + } } - return false; + return found == names.size(); } public String getBrand() { return brand; } + + private void reset() { + for (String name : this.names.keySet()) { + this.names.put(name, Boolean.FALSE); + } + } } } diff --git a/test/analysis/BrandCheckerTest.java b/test/analysis/BrandCheckerTest.java index 06818ac..23d8445 100644 --- a/test/analysis/BrandCheckerTest.java +++ b/test/analysis/BrandCheckerTest.java @@ -74,13 +74,18 @@ public class BrandCheckerTest { @Test public void testMultiple() { doTest("QBD - Black in Ear Earphones. 
3.5mm Jack Plug for Apple iPod, " - + "IPhone 4, 4S, 5, 5S, 5C, Ipad Air, Ipad Mini", - new String[]{"iphone 4", "iphone 4s", "iphone 5S", "iphone 5c"}); + + "IPhone 4, 4S, 5, 5S, 5C, Ipad Air, Ipad Mini", + new String[]{"iphone 4", "iphone 4s", "iphone 5s", "iphone 5c", "iphone 5"}); } @Test public void testBullshit() { doTest("This applepie is delicious", new String[]{}); } + + @Test + public void multipleBrands() { + doTest("This tweet contains both iphone 4s,galaxy s5 and iphone", new String[]{"iphone 4s","galaxy s5"}); + } } |