From 6d0e2e299835eca671951120e72ad5898866839f Mon Sep 17 00:00:00 2001
From: Peter Wu <peter@lekensteyn.nl>
Date: Fri, 3 Apr 2015 15:12:45 +0200
Subject: Fix ngram PREFIX condition, improve ngram prob debug

---
 spellchecker/src/CorpusReader.java   | 9 ++++++---
 spellchecker/src/SpellCorrector.java | 8 ++++++--
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/spellchecker/src/CorpusReader.java b/spellchecker/src/CorpusReader.java
index 2ad7e85..2e0a855 100644
--- a/spellchecker/src/CorpusReader.java
+++ b/spellchecker/src/CorpusReader.java
@@ -137,7 +137,8 @@ public class CorpusReader {
      * @return
      */
     public double getNgramProbability(String word, String ngram) {
-        double a, b;
+        double a, b, p;
+
         // special case: unigram has no prior ngram
         if (ngram.isEmpty()) {
             a = getNGramCount(word);
@@ -146,7 +147,7 @@ public class CorpusReader {
             // apply add-1 smoothing under the assumption that there are many
             // unigrams and this does not significantly affect the chance,
             // it just ensures that it is non-zero.
-            return (a + 1) / (b + 1);
+            p = (a + 1) / (b + 1);
         } else {
             // other ngram cases
             a = getNGramCount(ngram + " " + word);
@@ -155,8 +156,10 @@ public class CorpusReader {
             // apply smoothing, but add a smaller number because "b" is
             // typically very small.
             // TODO: Kneser-Ney smoothing?
-            return (a + .001) / (b + 1);
+            p = (a + .001) / (b + 1);
         }
+
+        return p;
     }
 
     /**
diff --git a/spellchecker/src/SpellCorrector.java b/spellchecker/src/SpellCorrector.java
index 33b804f..2bc788b 100644
--- a/spellchecker/src/SpellCorrector.java
+++ b/spellchecker/src/SpellCorrector.java
@@ -294,12 +294,16 @@ public class SpellCorrector {
             }
 
             // compute bigrams, etc.
-            String ngram = word;
+            String ngram = "";
             for (int i = 1; i < NGRAM_N; i++) {
                 // are there actually enough words to compute this metric?
                 if (index - i >= 0) {
                     // increase ngram prefix
-                    ngram += " " + words[index - i];
+                    if (ngram.isEmpty()) {
+                        ngram = words[index - i];
+                    } else {
+                        ngram = words[index - i] + " " + ngram;
+                    }
 
                     // Obtain n-gram probs and combine using interpolation.
                     igram_p = cr.getNgramProbability(word, ngram);
-- 
cgit v1.2.1