From 5508b2fa5c5b25d97a441aae305cb2f858307d55 Mon Sep 17 00:00:00 2001 From: Peter Wu Date: Thu, 2 Apr 2015 17:31:46 +0200 Subject: Implement generic ngram counting and probability calculation TODO improve smoothing. --- spellchecker/src/CorpusReader.java | 44 ++++++++++++++++++++++++++++---- spellchecker/src/SpellCorrector.java | 49 ++++++++++++++++++++---------------- 2 files changed, 66 insertions(+), 27 deletions(-) diff --git a/spellchecker/src/CorpusReader.java b/spellchecker/src/CorpusReader.java index 686f243..2ad7e85 100644 --- a/spellchecker/src/CorpusReader.java +++ b/spellchecker/src/CorpusReader.java @@ -15,7 +15,7 @@ public class CorpusReader { private HashMap ngrams; private Set vocabulary; - private int unigramCount = 0; + private int wordCount = 0; public CorpusReader() throws IOException { readNGrams(); @@ -57,9 +57,9 @@ public class CorpusReader { try { count = Integer.parseInt(s1); ngrams.put(s2, count); - // unigram + // Count total number of words in the data set if (s2.indexOf(' ') == -1) { - unigramCount += count; + wordCount += count; } } catch (NumberFormatException nfe) { throw new NumberFormatException("NumberformatError: " + s1); @@ -129,7 +129,41 @@ public class CorpusReader { return smoothedCount; } - public int getUnigramCount() { - return unigramCount; + /** + * Computes the probability P(word|ngram). + * + * @param word + * @param ngram + * @return + */ + public double getNgramProbability(String word, String ngram) { + double a, b; + // special case: unigram has no prior ngram + if (ngram.isEmpty()) { + a = getNGramCount(word); + b = wordCount; + + // apply add-1 smoothing under the assumption that there are many + // unigrams and this does not significantly affect the chance, + // it just ensures that it is non-zero. + return (a + 1) / (b + 1); + } else { + // other ngram cases + a = getNGramCount(ngram + " " + word); + b = getNGramCount(ngram); + + // apply smoothing, but add a smaller number because "b" is + // typically very small. + // TODO: Kneser-Ney smoothing? + return (a + .001) / (b + 1); + } + } + + /** + * Returns the number of words in the corpus text (based on counting + * unigrams). + */ + public double getWordCount() { + return wordCount; } } diff --git a/spellchecker/src/SpellCorrector.java b/spellchecker/src/SpellCorrector.java index bdfc8e1..971e91e 100644 --- a/spellchecker/src/SpellCorrector.java +++ b/spellchecker/src/SpellCorrector.java @@ -9,6 +9,11 @@ public class SpellCorrector { final static char[] ALPHABET = "abcdefghijklmnopqrstuvwxyz'".toCharArray(); + /** + * The highest n-gram to look for. 1 gives a unigram, 2 gives a bigram, etc. + */ + private final static int NGRAM_N = 2; + /** * Lambda values for interpolation of n-gram probabilities. The first value * is for unigrams, the second for bigrams, etc. @@ -255,8 +260,7 @@ public class SpellCorrector { */ public double getWordLikelihood(int index, String word, double channel_probability) { - String prev_word, ngram; - double prior, score, p, p_uni, p_bi; + double prior, score; // a suggested word not in the vocabulary is certainly wrong, // changed (or consequentive) words should also not be changed. if (!cr.inVocabulary(word) || words_readonly[index]) { @@ -266,33 +270,34 @@ public class SpellCorrector { assert channel_probability > 0.0; // P(x|w) is given by language model (noisy channel probability). - // Find prior P(w) = (freq(w) + .5) / N. + // Find prior P(w) = (freq(w) + .5) / N (N is number of words). // Then compute candidate score by P(w) * P(x|w) - prior = (cr.getNGramCount(word) + .5) / cr.getUnigramCount(); + prior = (cr.getNGramCount(word) + .5) / cr.getWordCount(); score = prior * channel_probability; - // unigram probability is computed by P(w) = #w / N (no smoothing). - p_uni = cr.getSmoothedCount(word) / cr.getUnigramCount(); - - // Add probability of bi-grams. - // For words u and w, P(w|u) = P(u, w) / P(u). - if (index > 0) { - prev_word = words[index - 1]; - ngram = prev_word + " " + word; - p_bi = cr.getSmoothedCount(ngram) / cr.getSmoothedCount(prev_word); - //System.err.println("W: " + word + " " + score + " " + ngram + " |" + words[index]); - } else { - // no previous word, assume likely. - p_bi = 1; + // compute unigrams + double p = LAMBDAS[0] * cr.getNgramProbability(word, ""); + // compute bigrams, etc. + String ngram = word; + for (int i = 1; i < NGRAM_N; i++) { + // are there actually enough words to compute this metric? + if (index - i >= 0) { + // increase ngram prefix + ngram += " " + words[index - i]; + + // Obtain n-gram probs and combine using interpolation. + p += LAMBDAS[i] * cr.getNgramProbability(word, ngram); + } else { + // no metrics found, cannot deduce much information from it + p += LAMBDAS[i] * .5; + } } - // Now obtain n-gram probabilities. Use interpolation to combine - // unigrams and bigrams. - p = LAMBDAS[0] * p_uni + LAMBDAS[1] * p_bi; - p *= score; + // finally add the score if (DEBUG_SCORE && (word.equals("he") || word.equals("hme") || word.equals("home"))) { - System.err.println(word + " p=" + p + " score=" + score + " uni=" + p_uni + " bi=" + p_bi); + System.err.println(word + " p=" + (p * score) + " score=" + score + " ngram=" + p); } + p *= score; assert p > 0.0 : "failed probability for " + word; return p; } -- cgit v1.2.1