From 81a6cc4a67c8ac5aa601eaaef728f03245576cb9 Mon Sep 17 00:00:00 2001 From: Peter Wu Date: Thu, 2 Apr 2015 18:05:12 +0200 Subject: Further attempt to improve score: drop .5 correction --- spellchecker/src/SpellCorrector.java | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/spellchecker/src/SpellCorrector.java b/spellchecker/src/SpellCorrector.java index b5e383e..33b804f 100644 --- a/spellchecker/src/SpellCorrector.java +++ b/spellchecker/src/SpellCorrector.java @@ -280,7 +280,10 @@ public class SpellCorrector { // P(x|w) is given by language model (noisy channel probability). // Find prior P(w) = (freq(w) + .5) / N (N is number of words). // Then compute candidate score by P(w) * P(x|w) - prior = (cr.getNGramCount(word) + .5) / cr.getWordCount(); + // Note: Kernighan uses + .5 here to compensate for unknown words, + // but that assigns too much value to the score so do not do that. + // (all words are in the vocabulary anyway). + prior = (cr.getNGramCount(word) + 0.0) / cr.getWordCount(); score = prior * channel_probability; // compute unigrams @@ -310,14 +313,14 @@ public class SpellCorrector { } } - // finally add the score + // finally combine the score with ngram probabilities + //p = .1 * score + .9 * p; + p *= score; if (debug_word != null) { - System.err.println("# " + word + " p=" + (p * score) + System.err.println("# " + word + " p=" + p + " score=" + score + " chan=" + channel_probability - + " prior=" + prior - + " ngram=" + p + debug_word); + + " prior=" + prior + debug_word); } - p *= score; assert p > 0.0 : "failed probability for " + word; return p; } -- cgit v1.2.1