summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPeter Wu <peter@lekensteyn.nl>2015-04-02 18:05:12 +0200
committerPeter Wu <peter@lekensteyn.nl>2015-04-02 18:05:12 +0200
commit81a6cc4a67c8ac5aa601eaaef728f03245576cb9 (patch)
tree4d2cb45c6cd5f55afd63b1b59aea933d0b7bc8aa
parentdee51f2ec8cdb7d5a61139455219b20e6604f4c2 (diff)
downloadassignment4-81a6cc4a67c8ac5aa601eaaef728f03245576cb9.tar.gz
Further attempt to improve score: drop .5 correction
-rw-r--r--spellchecker/src/SpellCorrector.java15
1 files changed, 9 insertions, 6 deletions
diff --git a/spellchecker/src/SpellCorrector.java b/spellchecker/src/SpellCorrector.java
index b5e383e..33b804f 100644
--- a/spellchecker/src/SpellCorrector.java
+++ b/spellchecker/src/SpellCorrector.java
@@ -280,7 +280,10 @@ public class SpellCorrector {
// P(x|w) is given by language model (noisy channel probability).
// Find prior P(w) = (freq(w) + .5) / N (N is number of words).
// Then compute candidate score by P(w) * P(x|w)
- prior = (cr.getNGramCount(word) + .5) / cr.getWordCount();
+ // Note: Kernighan uses + .5 here to compensate for unknown words,
+ // but that assigns too much value to the score so do not do that.
+ // (all words are in the vocabulary anyway).
+ prior = (cr.getNGramCount(word) + 0.0) / cr.getWordCount();
score = prior * channel_probability;
// compute unigrams
@@ -310,14 +313,14 @@ public class SpellCorrector {
}
}
- // finally add the score
+ // finally combine the score with ngram probabilities
+ //p = .1 * score + .9 * p;
+ p *= score;
if (debug_word != null) {
- System.err.println("# " + word + " p=" + (p * score)
+ System.err.println("# " + word + " p=" + p
+ " score=" + score + " chan=" + channel_probability
- + " prior=" + prior
- + " ngram=" + p + debug_word);
+ + " prior=" + prior + debug_word);
}
- p *= score;
assert p > 0.0 : "failed probability for " + word;
return p;
}