Further attempt to improve score: drop .5 correction

author: Peter Wu <peter@lekensteyn.nl> 2015-04-02 18:05:12 +0200
committer: Peter Wu <peter@lekensteyn.nl> 2015-04-02 18:05:12 +0200
commit: 81a6cc4a67c8ac5aa601eaaef728f03245576cb9 (patch)
tree: 4d2cb45c6cd5f55afd63b1b59aea933d0b7bc8aa
parent: dee51f2ec8cdb7d5a61139455219b20e6604f4c2 (diff)
download: assignment4-81a6cc4a67c8ac5aa601eaaef728f03245576cb9.tar.gz
1 files changed, 9 insertions, 6 deletions
diff --git a/spellchecker/src/SpellCorrector.java b/spellchecker/src/SpellCorrector.java
index b5e383e..33b804f 100644
--- a/spellchecker/src/SpellCorrector.java
+++ b/spellchecker/src/SpellCorrector.java
@@ -280,7 +280,10 @@ public class SpellCorrector {
             // P(x|w) is given by language model (noisy channel probability).
             // Find prior P(w) = (freq(w) + .5) / N (N is number of words).
             // Then compute candidate score by P(w) * P(x|w)
-            prior = (cr.getNGramCount(word) + .5) / cr.getWordCount();
+            // Note: Kernighan uses + .5 here to compensate for unknown words,
+            // but that assigns too much value to the score so do not do that.
+            // (all words are in the vocabulary anyway).
+            prior = (cr.getNGramCount(word) + 0.0) / cr.getWordCount();
             score = prior * channel_probability;
 
             // compute unigrams
@@ -310,14 +313,14 @@ public class SpellCorrector {
                 }
             }
 
-            // finally add the score
+            // finally combine the score with ngram probabilities
+            //p = .1 * score + .9 * p;
+            p *= score;
             if (debug_word != null) {
-                System.err.println("# " + word + " p=" + (p * score)
+                System.err.println("# " + word + " p=" + p
                         + " score=" + score + " chan=" + channel_probability
-                        + " prior=" + prior
-                        + " ngram=" + p + debug_word);
+                        + " prior=" + prior + debug_word);
             }
-            p *= score;
             assert p > 0.0 : "failed probability for " + word;
             return p;
         }
author	Peter Wu <peter@lekensteyn.nl>	2015-04-02 18:05:12 +0200
committer	Peter Wu <peter@lekensteyn.nl>	2015-04-02 18:05:12 +0200
commit	81a6cc4a67c8ac5aa601eaaef728f03245576cb9 (patch)
tree	4d2cb45c6cd5f55afd63b1b59aea933d0b7bc8aa
parent	dee51f2ec8cdb7d5a61139455219b20e6604f4c2 (diff)
download	assignment4-81a6cc4a67c8ac5aa601eaaef728f03245576cb9.tar.gz