From 864afee73644051b5d4e1b21760f92f5659010da Mon Sep 17 00:00:00 2001
From: Peter Wu
Date: Sat, 4 Apr 2015 10:53:00 +0200
Subject: Do not count unigrams twice

---
 spellchecker/src/SpellCorrector.java | 32 ++++++++++++++------------------
 1 file changed, 14 insertions(+), 18 deletions(-)

diff --git a/spellchecker/src/SpellCorrector.java b/spellchecker/src/SpellCorrector.java
index a435f74..2e03f26 100644
--- a/spellchecker/src/SpellCorrector.java
+++ b/spellchecker/src/SpellCorrector.java
@@ -274,7 +274,7 @@ public class SpellCorrector {
      * position {@code index}.
      */
     public double getWordLikelihood(int index, double channel_probability) {
-        double prior, score, p, igram_p;
+        double prior, p, igram_p;
         String word = words[index];
 
         // a suggested word not in the vocabulary is certainly wrong
@@ -291,23 +291,14 @@ public class SpellCorrector {
             debug_word = "";
         }
 
-        // P(x|w) is given by language model (noisy channel probability).
-        // Find prior P(w) = (freq(w) + .5) / N (N is number of words).
-        // Then compute candidate score by P(w) * P(x|w)
-        // Note: Kernighan uses + .5 here to compensate for unknown words,
-        // but that assigns too much value to the score so do not do that.
-        // (all words are in the vocubalary anyway).
-        prior = (cr.getNGramCount(word) + 0.0) / cr.getWordCount();
-        score = prior * channel_probability;
-
-        // compute unigrams
+        // compute unigram component of language model: P(w)
         igram_p = cr.getNgramProbability(word, "");
-        p = LAMBDAS[0] * igram_p;
+        prior = LAMBDAS[0] * igram_p;
         if (debug_word != null) {
             debug_word += " 1p=" + igram_p;
         }
 
-        // compute bigrams, etc.
+        // compute bigrams (P(w|prefix)), etc.
         String ngram = "";
         for (int i = 1; i < NGRAM_N; i++) {
             // are there actually enough words to compute this metric?
@@ -325,18 +316,23 @@ public class SpellCorrector {
                 // no metrics found, cannot deduce much information from it
                 igram_p = .5;
             }
-            p += LAMBDAS[i] * igram_p;
+            prior += LAMBDAS[i] * igram_p;
             if (debug_word != null) {
                 debug_word += " " + (i + 1) + "p=" + igram_p;
             }
         }
 
-        // finally combine the score with ngram probabilities
-        //p = .1 * score + .9 * p;
-        p *= score;
+        // Finally combine probabilities using the Noisy Channel Model.
+        // P(x|w) is the channel probability, given by the error model.
+        // The prior here is different from Kernighan's article. Instead of
+        // P(w) = (freq(w) + .5) / N (N is number of words), we use an
+        // interpolation of ngram probabilities.
+        // The candidate score is finally computed by P(w) * P(x|w).
+        p = prior * channel_probability;
+
         if (debug_word != null) {
             System.err.println("# " + word + " p=" + p
-                    + " score=" + score + " chan=" + channel_probability
+                    + " chan=" + channel_probability + " prior=" + prior
                     + debug_word);
         }
         assert p > 0.0 : "failed probability for " + word;
--
cgit v1.2.1
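
To make the new scoring concrete, below is a small self-contained Java sketch of the same idea: build the prior as an interpolation of n-gram probabilities (counting the unigram only once) and multiply it by the channel probability P(x|w). The class name, the ngramProbability helper, the toy probability values, and the lambda weights are assumptions made for illustration only; just the LAMBDAS-weighted interpolation and the final P(w) * P(x|w) product correspond to the patched code.

    import java.util.HashMap;
    import java.util.Map;

    public class NoisyChannelSketch {
        // Interpolation weights for the unigram and bigram terms of the prior
        // (the patch calls these LAMBDAS); the values are illustrative only.
        private static final double[] LAMBDAS = {0.3, 0.7};

        // Toy stand-in for a corpus reader's n-gram lookup: returns an assumed
        // probability for the (prefix, word) pair, or a neutral default.
        private static double ngramProbability(Map<String, Double> model,
                                               String word, String prefix) {
            Double p = model.get(prefix + "|" + word);
            return p != null ? p : 0.5;
        }

        public static void main(String[] args) {
            Map<String, Double> model = new HashMap<>();
            model.put("|house", 0.002);    // unigram P(house), empty prefix
            model.put("the|house", 0.04);  // bigram  P(house | the)

            double channelProbability = 0.05;  // P(x|w) from the error model

            // Prior: interpolated n-gram probabilities; the unigram term is
            // counted exactly once, which is the point of this patch.
            double prior = LAMBDAS[0] * ngramProbability(model, "house", "")
                         + LAMBDAS[1] * ngramProbability(model, "house", "the");

            // Candidate score: P(w) * P(x|w).
            double p = prior * channelProbability;
            System.out.printf("prior=%.6f p=%.6f%n", prior, p);
        }
    }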