From c6b64fa4c9b363379cbdc470cad782412e2db398 Mon Sep 17 00:00:00 2001 From: Peter Wu Date: Sat, 4 Apr 2015 12:53:45 +0200 Subject: Lambda interpolation works badly It favors large unigram probabilities too much. --- spellchecker/src/SpellCorrector.java | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/spellchecker/src/SpellCorrector.java b/spellchecker/src/SpellCorrector.java index 0bc5b41..fbbfbef 100644 --- a/spellchecker/src/SpellCorrector.java +++ b/spellchecker/src/SpellCorrector.java @@ -16,12 +16,6 @@ public class SpellCorrector { */ private final static int NGRAM_N = 2; - /** - * Lambda values for interpolation of n-gram probabilities. The first value - * is for unigrams, the second for bigrams, etc. - */ - private final static double[] LAMBDAS = new double[]{.25, .75}; - /** * The language model probability for uncorrected words. */ @@ -321,7 +315,7 @@ public class SpellCorrector { // compute unigram component of language model: P(w) igram_p = cr.getNgramProbability(word, ""); - prior = LAMBDAS[0] * igram_p; + prior = igram_p; if (debug_word != null) { debug_word += " 1p=" + igram_p; } @@ -344,7 +338,7 @@ public class SpellCorrector { // no metrics found, cannot deduce much information from it igram_p = .5; } - prior += LAMBDAS[i] * igram_p; + prior *= igram_p; if (debug_word != null) { debug_word += " " + (i + 1) + "p=" + igram_p; } @@ -352,9 +346,7 @@ public class SpellCorrector { // Finally combine probabilities using the Noisy Channel Model. // P(x|w) is given by language model (noisy channel probability). - // The prior here is different from Kernighans article. Instead of - // P(w) = (freq(w) + .5) / N (N is number of words), we use an - // interpolation of ngram probabilities. + // Here the prior is a combination of ngram probabilities. // The candidate score is finally computed by P(w) * P(x|w) p = prior * channel_probability; -- cgit v1.2.1