import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;

/**
 * Loads n-gram counts and a vocabulary from two text files in the working
 * directory and answers lookups against them.
 *
 * <p>The count file ({@link #CNTFILE_LOC}) holds one entry per line in the form
 * {@code <count> <n-gram>}, where the n-gram is a space-separated list of
 * words. The vocabulary file ({@link #VOCFILE_LOC}) holds one word per line.
 * Both files are read once, in the constructor.
 */
public class CorpusReader {

    final static String CNTFILE_LOC = "samplecnt.txt";
    final static String VOCFILE_LOC = "samplevoc.txt";

    /** Maps each n-gram (as written in the count file) to its count. */
    private HashMap<String, Integer> ngrams;
    /** Every line of the vocabulary file. */
    private Set<String> vocabulary;
    /** Sum of the counts of all entries whose n-gram contains no space. */
    private int unigramCount = 0;

    /**
     * Creates a reader and immediately loads both data files.
     *
     * @throws IOException if either file is missing, unreadable, or the count
     *         file contains a line without a count/n-gram separator
     * @throws NumberFormatException if a count in the count file is not an integer
     */
    public CorpusReader() throws IOException {
        readNGrams();
        readVocabulary();
    }

    /**
     * Returns the count recorded for the given n-gram in the count file.
     *
     * @param nGram space-separated list of words, e.g. "adopted by him"
     * @return 0 if the n-gram was not found, otherwise its count in the file
     * @throws IllegalArgumentException if {@code nGram} is null or empty
     */
    public int getNGramCount(String nGram) throws NumberFormatException {
        if (nGram == null || nGram.length() == 0) {
            throw new IllegalArgumentException("NGram must be non-empty.");
        }
        Integer value = ngrams.get(nGram);
        return value == null ? 0 : value;
    }

    /**
     * Reads the count file into {@link #ngrams} and accumulates
     * {@link #unigramCount}.
     *
     * @throws FileNotFoundException if the count file does not exist
     * @throws IOException if reading fails or a non-blank line has no space
     *         separating the count from the n-gram
     * @throws NumberFormatException if the leading token of a line is not an integer
     */
    private void readNGrams() throws FileNotFoundException, IOException, NumberFormatException {
        ngrams = new HashMap<>();

        // try-with-resources closes the reader (and the underlying stream)
        // even when a malformed line aborts the load.
        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(new FileInputStream(CNTFILE_LOC)))) {
            String rawLine;
            int lineNumber = 0;
            // readLine() returning null is the reliable end-of-file signal;
            // ready() only says whether a read would block.
            while ((rawLine = in.readLine()) != null) {
                lineNumber++;
                String phrase = rawLine.trim();
                if (phrase.isEmpty()) {
                    // Tolerate blank lines (e.g. a trailing newline at end of file).
                    continue;
                }

                int separator = phrase.indexOf(' ');
                if (separator < 0) {
                    throw new IOException("Malformed line " + lineNumber + " in "
                            + CNTFILE_LOC + ": expected \"<count> <n-gram>\" but found \""
                            + phrase + "\"");
                }
                String countText = phrase.substring(0, separator);
                String nGram = phrase.substring(separator + 1);

                int count;
                try {
                    count = Integer.parseInt(countText);
                } catch (NumberFormatException nfe) {
                    // NumberFormatException has no (message, cause) constructor,
                    // so attach the original failure via initCause.
                    NumberFormatException wrapped =
                            new NumberFormatException("NumberformatError: " + countText);
                    wrapped.initCause(nfe);
                    throw wrapped;
                }

                ngrams.put(nGram, count);
                // An n-gram without an inner space is a single word (unigram).
                if (nGram.indexOf(' ') == -1) {
                    unigramCount += count;
                }
            }
        }
    }

    /**
     * Reads the vocabulary file into {@link #vocabulary}, one entry per line.
     * Lines are stored exactly as read (no trimming).
     *
     * @throws FileNotFoundException if the vocabulary file does not exist
     * @throws IOException if reading fails
     */
    private void readVocabulary() throws FileNotFoundException, IOException {
        vocabulary = new HashSet<>();

        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(new FileInputStream(VOCFILE_LOC)))) {
            String line;
            while ((line = in.readLine()) != null) {
                vocabulary.add(line);
            }
        }
    }

    /**
     * Returns the number of unique words in the vocabulary.
     *
     * @return the number of distinct entries read from the vocabulary file
     */
    public int getVocabularySize() {
        return vocabulary.size();
    }

    /**
     * Returns the subset of the given words that are in the vocabulary.
     *
     * @param set the candidate words; it is not modified
     * @return a new set holding only those candidates found in the vocabulary
     */
    public HashSet<String> inVocabulary(Set<String> set) {
        HashSet<String> h = new HashSet<>(set);
        h.retainAll(vocabulary);
        return h;
    }

    /**
     * Tells whether a single word is in the vocabulary.
     *
     * @param word the word to look up
     * @return {@code true} if the vocabulary contains {@code word}
     */
    public boolean inVocabulary(String word) {
        return vocabulary.contains(word);
    }

    /**
     * Returns a smoothed value based on the number of occurrences of the n-gram
     * in the corpus. The smoothing is the simplest possible: add one.
     *
     * @param nGram space-separated list of words
     * @return the n-gram's count plus one
     * @throws IllegalArgumentException if {@code nGram} is null or empty
     */
    public double getSmoothedCount(String nGram) {
        if (nGram == null || nGram.length() == 0) {
            throw new IllegalArgumentException("NGram must be non-empty.");
        }
        // simplest smoothing implementation: plus 1.
        return getNGramCount(nGram) + 1;
    }

    /**
     * Returns the total count of all unigrams in the count file.
     *
     * @return the sum of the counts of every single-word entry
     */
    public int getUnigramCount() {
        return unigramCount;
    }
}