summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Peter Wu <peter@lekensteyn.nl> 2015-04-04 12:53:45 +0200
committer: Peter Wu <peter@lekensteyn.nl> 2015-04-04 12:53:45 +0200
commit: c6b64fa4c9b363379cbdc470cad782412e2db398 (patch)
tree: 9f7a1c3aa763af8313ac86589de05d063777bcc5
parent: aed75adb5b127e5943dd16b0e37a4d9b00d2ac12 (diff)
download: assignment4-c6b64fa4c9b363379cbdc470cad782412e2db398.tar.gz
Lambda interpolation works badly
It favors large unigram probabilities too much.
-rw-r--r-- spellchecker/src/SpellCorrector.java 14
1 files changed, 3 insertions, 11 deletions
diff --git a/spellchecker/src/SpellCorrector.java b/spellchecker/src/SpellCorrector.java
index 0bc5b41..fbbfbef 100644
--- a/spellchecker/src/SpellCorrector.java
+++ b/spellchecker/src/SpellCorrector.java
@@ -17,12 +17,6 @@ public class SpellCorrector {
private final static int NGRAM_N = 2;
/**
- * Lambda values for interpolation of n-gram probabilities. The first value
- * is for unigrams, the second for bigrams, etc.
- */
- private final static double[] LAMBDAS = new double[]{.25, .75};
-
- /**
* The language model probability for uncorrected words.
*/
private final static double LM_PROBABILITY_UNMODIFIED = .95;
@@ -321,7 +315,7 @@ public class SpellCorrector {
// compute unigram component of language model: P(w)
igram_p = cr.getNgramProbability(word, "");
- prior = LAMBDAS[0] * igram_p;
+ prior = igram_p;
if (debug_word != null) {
debug_word += " 1p=" + igram_p;
}
@@ -344,7 +338,7 @@ public class SpellCorrector {
// no metrics found, cannot deduce much information from it
igram_p = .5;
}
- prior += LAMBDAS[i] * igram_p;
+ prior *= igram_p;
if (debug_word != null) {
debug_word += " " + (i + 1) + "p=" + igram_p;
}
@@ -352,9 +346,7 @@ public class SpellCorrector {
// Finally combine probabilities using the Noisy Channel Model.
// P(x|w) is given by language model (noisy channel probability).
- // The prior here is different from Kernighans article. Instead of
- // P(w) = (freq(w) + .5) / N (N is number of words), we use an
- // interpolation of ngram probabilities.
+ // Here the prior is a combination of ngram probabilities.
// The candidate score is finally computed by P(w) * P(x|w)
p = prior * channel_probability;