/* * DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE Version 2, December 2004 * * Copyright (C) 2004 Sam Hocevar * * Everyone is permitted to copy and distribute verbatim or modified copies * of this license document, and changing it is allowed as long as the name is * changed. * * DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE TERMS AND CONDITIONS FOR COPYING, * DISTRIBUTION AND MODIFICATION * * 0. You just DO WHAT THE FUCK YOU WANT TO. */ package analysis; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.InputStream; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Locale; import java.util.Scanner; import java.util.Set; /** * This class obtains a text and returns the brands that are contained in this * text. The input file contains lines with [brandname] [extensions] | * [blacklisted words]. * * @author Maurice Laveaux */ public class BrandChecker { /** * A set of rules that determine the brands. */ private final ArrayList ruleset = new ArrayList(); /** * @param filename The filename that contains all the rules. */ public BrandChecker(final String filename) { try { readFile(filename); } catch (FileNotFoundException ex) { throw new IllegalArgumentException("file named " + filename + " not found."); } } /** * Get the brands that are in some text. * * @param text Any valid text. * @return The list of brands that are contained in this text or null. */ public List getBrands(String text) { text = removePunct(text); String[] words = text.toLowerCase().split("\\s+"); List brands = new ArrayList(); for (BrandRule rule : ruleset) { if (rule.analyze(words)) { brands.add(rule.getBrand()); } } return brands; } /** * Reads the file and parses the rules, which are added to the ruleset. */ private void readFile(final String filename) throws FileNotFoundException { InputStream inFile = new FileInputStream(filename); Scanner readFile = new Scanner(inFile); while (readFile.hasNextLine()) { String line = readFile.nextLine(); parseRule(line.toLowerCase(Locale.ENGLISH)); } } /** * Parses the line and adds the BrandRule to the ruleset. */ private void parseRule(String line) { if (line.isEmpty()) { return; } if (!line.contains("-")) { System.err.println("illformatted rule: " + line + ", missing -"); } else { String[] parts = line.split("-"); // positive and negative. if (parts.length < 2) { System.err.println("illformatted rule: " + line + ", missing - ."); return; } if (parts.length > 4) { System.err.println("illformatted rule: " + line + ", forth part with - was given thus will be ignored."); } // Read the line. String name = parts[0].trim(); // Read the positive words. String positive = parts[1].replaceAll(" ",""); String[] sequence = positive.split(","); if (parts.length == 3) { String negative = parts[2].replaceAll(" ", ""); String[] blacklist = negative.split(","); ruleset.add(new BrandRule(name, sequence, blacklist)); } else { ruleset.add(new BrandRule(name, sequence, null)); } } } /** * Removes punctuation and urls. */ private String removePunct(String text) { text = text.replaceAll("[.,!?();\"'@#-]", " "); return text; } private class BrandRule { /** * The words that should be in the text. */ private final HashMap names; /** * A blacklist of words that are not interesting. */ private final Set blacklist; /** * The brand name of this rule. */ private final String brand; /** * * @param brand The brand of this rule. * @param sequential The sequence of strings to obtain. * @param blacklist The blacklisted words. */ public BrandRule(final String brandname, final String[] names, final String[] blacklist) { this.brand = brandname; this.names = new HashMap(); if (blacklist != null) { this.blacklist = new HashSet(Arrays.asList(blacklist)); } else { this.blacklist = null; } for (String name : names) { this.names.put(name, Boolean.FALSE); } } /** * Analyzes if this rule is holds for some text. * * @param words A list of words in a line. */ public boolean analyze(String[] words) { reset(); int found = 0; for (String word : words) { if (blacklist != null) { if (blacklist.contains(word)) { return false; } } if (names.containsKey(word)) { if (names.get(word) == false) { found++; names.put(word, Boolean.TRUE); } } } return found == names.size(); } public String getBrand() { return brand; } private void reset() { for (String name : this.names.keySet()) { this.names.put(name, Boolean.FALSE); } } } }