From dee51f2ec8cdb7d5a61139455219b20e6604f4c2 Mon Sep 17 00:00:00 2001 From: Peter Wu Date: Thu, 2 Apr 2015 17:54:39 +0200 Subject: Improve ngram score debugging details --- spellchecker/src/SpellCorrector.java | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/spellchecker/src/SpellCorrector.java b/spellchecker/src/SpellCorrector.java index f5f5472..b5e383e 100644 --- a/spellchecker/src/SpellCorrector.java +++ b/spellchecker/src/SpellCorrector.java @@ -261,7 +261,7 @@ public class SpellCorrector { */ public double getWordLikelihood(int index, String word, double channel_probability) { - double prior, score; + double prior, score, p, igram_p; // a suggested word not in the vocabulary is certainly wrong, // changed (or consequentive) words should also not be changed. if (!cr.inVocabulary(word) || words_readonly[index]) { @@ -269,6 +269,13 @@ public class SpellCorrector { } assert channel_probability > 0.0; + String debug_word = null; + if (DEBUG_SCORE + && (word.equals("he") + || word.equals("hme") + || word.equals("home"))) { + debug_word = ""; + } // P(x|w) is given by language model (noisy channel probability). // Find prior P(w) = (freq(w) + .5) / N (N is number of words). @@ -277,7 +284,12 @@ public class SpellCorrector { score = prior * channel_probability; // compute unigrams - double p = LAMBDAS[0] * cr.getNgramProbability(word, ""); + igram_p = cr.getNgramProbability(word, ""); + p = LAMBDAS[0] * igram_p; + if (debug_word != null) { + debug_word += " 1p=" + igram_p; + } + // compute bigrams, etc. String ngram = word; for (int i = 1; i < NGRAM_N; i++) { @@ -287,16 +299,23 @@ public class SpellCorrector { ngram += " " + words[index - i]; // Obtain n-gram probs and combine using interpolation. - p += LAMBDAS[i] * cr.getNgramProbability(word, ngram); + igram_p = cr.getNgramProbability(word, ngram); } else { // no metrics found, cannot deduce much information from it - p += LAMBDAS[i] * .5; + igram_p = .5; + } + p += LAMBDAS[i] * igram_p; + if (debug_word != null) { + debug_word += " " + (i + 1) + "p=" + igram_p; } } // finally add the score - if (DEBUG_SCORE && (word.equals("he") || word.equals("hme") || word.equals("home"))) { - System.err.println(word + " p=" + (p * score) + " score=" + score + " ngram=" + p); + if (debug_word != null) { + System.err.println("# " + word + " p=" + (p * score) + + " score=" + score + " chan=" + channel_probability + + " prior=" + prior + + " ngram=" + p + debug_word); } p *= score; assert p > 0.0 : "failed probability for " + word; -- cgit v1.2.1