diff options
Diffstat (limited to 'spellchecker/src/CorpusReader.java')
-rw-r--r-- | spellchecker/src/CorpusReader.java | 44 |
1 files changed, 39 insertions, 5 deletions
diff --git a/spellchecker/src/CorpusReader.java b/spellchecker/src/CorpusReader.java index 686f243..2ad7e85 100644 --- a/spellchecker/src/CorpusReader.java +++ b/spellchecker/src/CorpusReader.java @@ -15,7 +15,7 @@ public class CorpusReader { private HashMap<String, Integer> ngrams; private Set<String> vocabulary; - private int unigramCount = 0; + private int wordCount = 0; public CorpusReader() throws IOException { readNGrams(); @@ -57,9 +57,9 @@ public class CorpusReader { try { count = Integer.parseInt(s1); ngrams.put(s2, count); - // unigram + // Count total number of words in the data set if (s2.indexOf(' ') == -1) { - unigramCount += count; + wordCount += count; } } catch (NumberFormatException nfe) { throw new NumberFormatException("NumberformatError: " + s1); @@ -129,7 +129,41 @@ public class CorpusReader { return smoothedCount; } - public int getUnigramCount() { - return unigramCount; + /** + * Computes the probability P(word|ngram). + * + * @param word the word whose probability is estimated + * @param ngram the preceding n-gram context; an empty string selects the unigram case + * @return the smoothed estimate: the add-1 smoothed unigram frequency when ngram is empty, otherwise the smoothed count of ngram followed by word divided by the count of ngram + */ + public double getNgramProbability(String word, String ngram) { + double a, b; + // special case: unigram has no prior ngram + if (ngram.isEmpty()) { + a = getNGramCount(word); + b = wordCount; + + // apply add-1 smoothing under the assumption that there are many + // unigrams and this does not significantly affect the chance, + // it just ensures that it is non-zero. + return (a + 1) / (b + 1); + } else { + // other ngram cases + a = getNGramCount(ngram + " " + word); + b = getNGramCount(ngram); + + // apply smoothing, but add a smaller number because "b" is + // typically very small. + // TODO: Kneser-Ney smoothing? + return (a + .001) / (b + 1); + } + } + + /** + * Returns the number of words in the corpus text (based on counting + * unigrams). + */ + public double getWordCount() { + return wordCount; } } |