summaryrefslogtreecommitdiff
path: root/spellchecker/src/SpellCorrector.java
diff options
context:
space:
mode:
Diffstat (limited to 'spellchecker/src/SpellCorrector.java')
-rw-r--r--spellchecker/src/SpellCorrector.java49
1 file changed, 27 insertions, 22 deletions
diff --git a/spellchecker/src/SpellCorrector.java b/spellchecker/src/SpellCorrector.java
index bdfc8e1..971e91e 100644
--- a/spellchecker/src/SpellCorrector.java
+++ b/spellchecker/src/SpellCorrector.java
@@ -10,6 +10,11 @@ public class SpellCorrector {
final static char[] ALPHABET = "abcdefghijklmnopqrstuvwxyz'".toCharArray();
/**
+ * The highest n-gram to look for. 1 gives a unigram, 2 gives a bigram, etc.
+ */
+ private final static int NGRAM_N = 2;
+
+ /**
* Lambda values for interpolation of n-gram probabilities. The first value
* is for unigrams, the second for bigrams, etc.
*/
@@ -255,8 +260,7 @@ public class SpellCorrector {
*/
public double getWordLikelihood(int index, String word,
double channel_probability) {
- String prev_word, ngram;
- double prior, score, p, p_uni, p_bi;
+ double prior, score;
// a suggested word not in the vocabulary is certainly wrong,
// changed (or consecutive) words should also not be changed.
if (!cr.inVocabulary(word) || words_readonly[index]) {
@@ -266,33 +270,34 @@ public class SpellCorrector {
assert channel_probability > 0.0;
// P(x|w) is given by language model (noisy channel probability).
- // Find prior P(w) = (freq(w) + .5) / N.
+ // Find prior P(w) = (freq(w) + .5) / N (N is number of words).
// Then compute candidate score by P(w) * P(x|w)
- prior = (cr.getNGramCount(word) + .5) / cr.getUnigramCount();
+ prior = (cr.getNGramCount(word) + .5) / cr.getWordCount();
score = prior * channel_probability;
- // unigram probability is computed by P(w) = #w / N (no smoothing).
- p_uni = cr.getSmoothedCount(word) / cr.getUnigramCount();
-
- // Add probability of bi-grams.
- // For words u and w, P(w|u) = P(u, w) / P(u).
- if (index > 0) {
- prev_word = words[index - 1];
- ngram = prev_word + " " + word;
- p_bi = cr.getSmoothedCount(ngram) / cr.getSmoothedCount(prev_word);
- //System.err.println("W: " + word + " " + score + " " + ngram + " |" + words[index]);
- } else {
- // no previous word, assume likely.
- p_bi = 1;
+ // compute unigrams
+ double p = LAMBDAS[0] * cr.getNgramProbability(word, "");
+ // compute bigrams, etc.
+ String ngram = word;
+ for (int i = 1; i < NGRAM_N; i++) {
+ // are there actually enough words to compute this metric?
+ if (index - i >= 0) {
+ // increase ngram prefix
+ ngram += " " + words[index - i];
+
+ // Obtain n-gram probs and combine using interpolation.
+ p += LAMBDAS[i] * cr.getNgramProbability(word, ngram);
+ } else {
+ // no metrics found, cannot deduce much information from it
+ p += LAMBDAS[i] * .5;
+ }
}
- // Now obtain n-gram probabilities. Use interpolation to combine
- // unigrams and bigrams.
- p = LAMBDAS[0] * p_uni + LAMBDAS[1] * p_bi;
- p *= score;
+ // finally add the score
if (DEBUG_SCORE && (word.equals("he") || word.equals("hme") || word.equals("home"))) {
- System.err.println(word + " p=" + p + " score=" + score + " uni=" + p_uni + " bi=" + p_bi);
+ System.err.println(word + " p=" + (p * score) + " score=" + score + " ngram=" + p);
}
+ p *= score;
assert p > 0.0 : "failed probability for " + word;
return p;
}