summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPeter Wu <peter@lekensteyn.nl>2015-04-04 10:53:00 +0200
committerPeter Wu <peter@lekensteyn.nl>2015-04-04 10:53:00 +0200
commit864afee73644051b5d4e1b21760f92f5659010da (patch)
tree6e7a7cc8f54e4b57d9612fc9f736652ac1482c2e
parentf469c98f2f7c4b0c2c93fc4618b66a6bce725916 (diff)
downloadassignment4-864afee73644051b5d4e1b21760f92f5659010da.tar.gz
Do not count unigrams twice
-rw-r--r--spellchecker/src/SpellCorrector.java32
1 file changed, 14 insertions, 18 deletions
diff --git a/spellchecker/src/SpellCorrector.java b/spellchecker/src/SpellCorrector.java
index a435f74..2e03f26 100644
--- a/spellchecker/src/SpellCorrector.java
+++ b/spellchecker/src/SpellCorrector.java
@@ -274,7 +274,7 @@ public class SpellCorrector {
* position {@code index}.
*/
public double getWordLikelihood(int index, double channel_probability) {
- double prior, score, p, igram_p;
+ double prior, p, igram_p;
String word = words[index];
// a suggested word not in the vocabulary is certainly wrong
@@ -291,23 +291,14 @@ public class SpellCorrector {
debug_word = "";
}
- // P(x|w) is given by language model (noisy channel probability).
- // Find prior P(w) = (freq(w) + .5) / N (N is number of words).
- // Then compute candidate score by P(w) * P(x|w)
- // Note: Kernighan uses + .5 here to compensate for unknown words,
- // but that assigns too much value to the score so do not do that.
- // (all words are in the vocabulary anyway).
- prior = (cr.getNGramCount(word) + 0.0) / cr.getWordCount();
- score = prior * channel_probability;
-
- // compute unigrams
+ // compute unigram component of language model: P(w)
igram_p = cr.getNgramProbability(word, "");
- p = LAMBDAS[0] * igram_p;
+ prior = LAMBDAS[0] * igram_p;
if (debug_word != null) {
debug_word += " 1p=" + igram_p;
}
- // compute bigrams, etc.
+ // compute bigrams (P(w|prefix)), etc.
String ngram = "";
for (int i = 1; i < NGRAM_N; i++) {
// are there actually enough words to compute this metric?
@@ -325,18 +316,23 @@ public class SpellCorrector {
// no metrics found, cannot deduce much information from it
igram_p = .5;
}
- p += LAMBDAS[i] * igram_p;
+ prior += LAMBDAS[i] * igram_p;
if (debug_word != null) {
debug_word += " " + (i + 1) + "p=" + igram_p;
}
}
- // finally combine the score with ngram probabilities
- //p = .1 * score + .9 * p;
- p *= score;
+ // Finally combine probabilities using the Noisy Channel Model.
+ // P(x|w) is given by the noisy channel (error) model; the
+ // language model supplies the prior P(w). The prior here is
+ // different from Kernighan's article. Instead of
+ // P(w) = (freq(w) + .5) / N (N is number of words), we use an
+ // interpolation of ngram probabilities.
+ // The candidate score is finally computed by P(w) * P(x|w)
+ p = prior * channel_probability;
+
if (debug_word != null) {
System.err.println("# " + word + " p=" + p
- + " score=" + score + " chan=" + channel_probability
+ + " chan=" + channel_probability
+ " prior=" + prior + debug_word);
}
assert p > 0.0 : "failed probability for " + word;