summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorS129778 <S129778@S129778.campus.tue.nl>2014-05-19 11:18:32 +0200
committerS129778 <S129778@S129778.campus.tue.nl>2014-05-19 11:18:32 +0200
commit8dac76108aeee3c155d695897156d624ce88960c (patch)
treef66759e695b7ddfa19fcb6cc1eaed5cb07f34276 /src
parentadb75e8dd80edbd3ad4035a7540af8b060fa5dff (diff)
parentb1bbd92fdcf9f52d38852b6c5ae802cc401ff712 (diff)
downloadGoldfarmer-8dac76108aeee3c155d695897156d624ce88960c.tar.gz
Merge origin/master
Diffstat (limited to 'src')
-rw-r--r--src/analysis/BrandChecker.java91
1 files changed, 68 insertions, 23 deletions
diff --git a/src/analysis/BrandChecker.java b/src/analysis/BrandChecker.java
index 740424c..5bde55b 100644
--- a/src/analysis/BrandChecker.java
+++ b/src/analysis/BrandChecker.java
@@ -19,6 +19,7 @@ import java.io.FileNotFoundException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
+import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
@@ -57,6 +58,7 @@ public class BrandChecker {
* @return The list of brands that are contained in this text or null.
*/
public List<String> getBrands(String text) {
+ text = removePunct(text);
String[] words = text.toLowerCase().split("\\s+");
List<String> brands = new ArrayList();
@@ -70,6 +72,9 @@ public class BrandChecker {
return brands;
}
+ /**
+ * Reads the file and parses the rules, which are added to the ruleset.
+ */
private void readFile(final String filename) throws FileNotFoundException {
InputStream inFile = new FileInputStream(filename);
Scanner readFile = new Scanner(inFile);
@@ -81,29 +86,52 @@ public class BrandChecker {
}
}
+ /**
+ * Parses the line and adds the BrandRule to the ruleset.
+ */
private void parseRule(String line) {
if (line.isEmpty()) {
return;
}
-
+
if (!line.contains("-")) {
- // only positive search entries.
- String[] sequence = line.split("\\s+");
- String[] blacklist = {""};
- ruleset.add(new BrandRule(line, sequence, blacklist));
+ System.err.println("illformatted rule: " + line + ", missing -");
} else {
String[] parts = line.split("-");
// positive and negative.
if (parts.length < 2) {
- throw new IllegalArgumentException("Brand rule contained '-' but not two parts.");
+ System.err.println("illformatted rule: " + line + ", missing <name> - <positive>.");
+ return;
}
- String[] sequence = parts[0].trim().split("\\s+");
- String[] blacklist = parts[1].trim().split("\\s+");
+ if (parts.length > 4) {
+ System.err.println("illformatted rule: " + line + ", forth part with - was given thus will be ignored.");
+ }
- ruleset.add(new BrandRule(parts[0].trim(), sequence, blacklist));
+ // Read the <name> part (the text before the first '-').
+ String name = parts[0].trim();
+
+ // Read the positive words.
+ String positive = parts[1].replaceAll(" ","");
+ String[] sequence = positive.split(",");
+
+ if (parts.length == 3) {
+ String negative = parts[2].replaceAll(" ", "");
+ String[] blacklist = negative.split(",");
+ ruleset.add(new BrandRule(name, sequence, blacklist));
+ } else {
+ ruleset.add(new BrandRule(name, sequence, null));
+ }
}
+ }
+ /**
+ * Replaces every comma in the text with a space; URL removal is currently commented out.
+ */
+ private String removePunct(String text) {
+ //text = text.replaceAll("https?://\\S*", "");
+ text = text.replaceAll(",", " ");
+ return text;
}
private class BrandRule {
@@ -111,8 +139,8 @@ public class BrandChecker {
/**
* The words that should be in the text.
*/
- private final ArrayList<String> names;
-
+ private final HashMap<String, Boolean> names;
+
/**
* A blacklist of words that are not interesting.
*/
@@ -131,8 +159,16 @@ public class BrandChecker {
*/
public BrandRule(final String brandname, final String[] names, final String[] blacklist) {
this.brand = brandname;
- this.names = new ArrayList(Arrays.asList(names));
- this.blacklist = new HashSet(Arrays.asList(blacklist));
+ this.names = new HashMap();
+ if (blacklist != null) {
+ this.blacklist = new HashSet(Arrays.asList(blacklist));
+ } else {
+ this.blacklist = null;
+ }
+
+ for (String name : names) {
+ this.names.put(name, Boolean.FALSE);
+ }
}
/**
@@ -141,27 +177,36 @@ public class BrandChecker {
* @param words A list of words in a line.
*/
public boolean analyze(String[] words) {
+ reset();
+
int found = 0;
for (String word : words) {
- if (blacklist.contains(word)) {
- return false;
- }
-
- if (names.contains(word)) {
- found++;
+ if (blacklist != null) {
+ if (blacklist.contains(word)) {
+ return false;
+ }
}
- }
- if (found == names.size()) {
- return true;
+ if (names.containsKey(word)) {
+ if (names.get(word) == false) {
+ found++;
+ names.put(word, Boolean.TRUE);
+ }
+ }
}
- return false;
+ return found == names.size();
}
public String getBrand() {
return brand;
}
+
+ private void reset() {
+ for (String name : this.names.keySet()) {
+ this.names.put(name, Boolean.FALSE);
+ }
+ }
}
}