diff options
author | Peter Wu <peter@lekensteyn.nl> | 2015-04-04 12:53:45 +0200 |
---|---|---|
committer | Peter Wu <peter@lekensteyn.nl> | 2015-04-04 12:53:45 +0200 |
commit | c6b64fa4c9b363379cbdc470cad782412e2db398 (patch) | |
tree | 9f7a1c3aa763af8313ac86589de05d063777bcc5 | |
parent | aed75adb5b127e5943dd16b0e37a4d9b00d2ac12 (diff) | |
download | assignment4-c6b64fa4c9b363379cbdc470cad782412e2db398.tar.gz |
Lambda interpolation works badly
It favors large unigram probabilities too much.
-rw-r--r-- | spellchecker/src/SpellCorrector.java | 14 |
1 files changed, 3 insertions, 11 deletions
diff --git a/spellchecker/src/SpellCorrector.java b/spellchecker/src/SpellCorrector.java index 0bc5b41..fbbfbef 100644 --- a/spellchecker/src/SpellCorrector.java +++ b/spellchecker/src/SpellCorrector.java @@ -17,12 +17,6 @@ public class SpellCorrector { private final static int NGRAM_N = 2; /** - * Lambda values for interpolation of n-gram probabilities. The first value - * is for unigrams, the second for bigrams, etc. - */ - private final static double[] LAMBDAS = new double[]{.25, .75}; - - /** * The language model probability for uncorrected words. */ private final static double LM_PROBABILITY_UNMODIFIED = .95; @@ -321,7 +315,7 @@ public class SpellCorrector { // compute unigram component of language model: P(w) igram_p = cr.getNgramProbability(word, ""); - prior = LAMBDAS[0] * igram_p; + prior = igram_p; if (debug_word != null) { debug_word += " 1p=" + igram_p; } @@ -344,7 +338,7 @@ public class SpellCorrector { // no metrics found, cannot deduce much information from it igram_p = .5; } - prior += LAMBDAS[i] * igram_p; + prior *= igram_p; if (debug_word != null) { debug_word += " " + (i + 1) + "p=" + igram_p; } @@ -352,9 +346,7 @@ public class SpellCorrector { // Finally combine probabilities using the Noisy Channel Model. // P(x|w) is given by language model (noisy channel probability). - // The prior here is different from Kernighans article. Instead of - // P(w) = (freq(w) + .5) / N (N is number of words), we use an - // interpolation of ngram probabilities. + // Here the prior is a combination of ngram probabilities. // The candidate score is finally computed by P(w) * P(x|w) p = prior * channel_probability; |