From 2957a918e369a32e51ea9c8b7b06063b5d7c09b6 Mon Sep 17 00:00:00 2001
From: Peter Wu <peter@lekensteyn.nl>
Date: Thu, 2 Apr 2015 11:13:29 +0200
Subject: Remove debug output that was accidentally added twice

---
 spellchecker/src/SpellCorrector.java | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/spellchecker/src/SpellCorrector.java b/spellchecker/src/SpellCorrector.java
index 65865b9..9028fdc 100644
--- a/spellchecker/src/SpellCorrector.java
+++ b/spellchecker/src/SpellCorrector.java
@@ -338,17 +338,6 @@ public class SpellCorrector {
                 old_evaluation = evaluateWord(-1);
             }
 
-            if (DEBUG_SCORE) {
-                System.err.println();
-                System.err.println("Word: " + words[index] + " -> " + word);
-                System.err.println("Word score: " + word_likelihoods[index] + " -> " + score);
-                System.err.println("Phrase score: " + evaluateWord(-1));
-                for (int i = 0; i < words.length; i++) {
-                    System.err.println(String.format("%28s %s", words[i], word_likelihoods[i]));
-                }
-                System.err.println();
-            }
-
             // save the word and its associated score
             assert word_likelihoods[index] < score :
                     "The score should only get better for word " + word
-- 
cgit v1.2.1


From be34f8b98aa4942953deb383ea61156ee1bf20b1 Mon Sep 17 00:00:00 2001
From: Peter Wu <peter@lekensteyn.nl>
Date: Thu, 2 Apr 2015 11:19:53 +0200
Subject: Split interpolated probability p into p_uni and p_bi

---
 spellchecker/src/SpellCorrector.java | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/spellchecker/src/SpellCorrector.java b/spellchecker/src/SpellCorrector.java
index 9028fdc..475e889 100644
--- a/spellchecker/src/SpellCorrector.java
+++ b/spellchecker/src/SpellCorrector.java
@@ -222,7 +222,7 @@ public class SpellCorrector {
         public double getWordLikelihood(int index, String word,
                 double channel_probability) {
             String prev_word, ngram;
-            double prior, score, p;
+            double prior, score, p, p_uni, p_bi;
             // a suggested word not in the vocabulary is certainly wrong,
             // changed (or consequentive) words should also not be changed.
             if (!cr.inVocabulary(word) || words_readonly[index]) {
@@ -237,21 +237,23 @@ public class SpellCorrector {
             prior = (cr.getNGramCount(word) + .5) / cr.getUnigramCount();
             score = prior * channel_probability;
 
-            // Now obtain n-gram probabilities. Use interpolation to combine
-            // unigrams and bigrams.
             // unigram probability is computed by P(w) = #w / N (no smoothing).
-            p = LAMBDAS[0] * cr.getSmoothedCount(word) / cr.getUnigramCount();
+            p_uni = cr.getSmoothedCount(word) / cr.getUnigramCount();
 
             // Add probability of bi-grams.
             // For words u and w, P(w|u) = P(u, w) / P(u).
             if (index > 0) {
                 prev_word = words[index - 1];
                 ngram = prev_word + " " + word;
-                p += LAMBDAS[1] * cr.getSmoothedCount(ngram) / cr.getSmoothedCount(prev_word);
+                p_bi = cr.getSmoothedCount(ngram) / cr.getSmoothedCount(prev_word);
                 //System.err.println("W: " + word + " " + score + " " + ngram + " |" + words[index]);
+            } else {
+                p_bi = 0;
             }
 
-            // Combine the candidate score with the n-gram probabilities.
+            // Now obtain n-gram probabilities. Use interpolation to combine
+            // unigrams and bigrams.
+            p = LAMBDAS[0] * p_uni + LAMBDAS[1] * p_bi;
             p *= score;
             assert p > 0.0 : "failed probability for " + word;
             return p;
-- 
cgit v1.2.1


From a9839f2a780c9ddf117ad3cf39daadd5333112ee Mon Sep 17 00:00:00 2001
From: Peter Wu <peter@lekensteyn.nl>
Date: Thu, 2 Apr 2015 11:41:02 +0200
Subject: Tweak bigram smoothing: add .01 instead of 1

---
 spellchecker/src/CorpusReader.java | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/spellchecker/src/CorpusReader.java b/spellchecker/src/CorpusReader.java
index 8281210..686f243 100644
--- a/spellchecker/src/CorpusReader.java
+++ b/spellchecker/src/CorpusReader.java
@@ -119,7 +119,8 @@ public class CorpusReader {
         // probability as result.
         if (NGram.indexOf(' ') != -1) {
             // bigram, must be the nominator
-            smoothedCount += 1;
+            // we do not have a lot of bigrams
+            smoothedCount += .01;
         } else {
             // unigram, must be the denominator
             smoothedCount += 1;
-- 
cgit v1.2.1


From 87b788adb8426d1ca0655ee049a68893bd54ca1b Mon Sep 17 00:00:00 2001
From: Peter Wu <peter@lekensteyn.nl>
Date: Thu, 2 Apr 2015 11:41:12 +0200
Subject: Fix output printing: print result instead of s0

---
 spellchecker/src/SpellChecker.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spellchecker/src/SpellChecker.java b/spellchecker/src/SpellChecker.java
index 55f8f6b..533dc02 100644
--- a/spellchecker/src/SpellChecker.java
+++ b/spellchecker/src/SpellChecker.java
@@ -40,7 +40,7 @@ public class SpellChecker {
                 System.out.println("Answer: " + result);
                 System.out.println();
             } else {
-                System.out.println(s0);
+                System.out.println(result);
             }
         }
     }
-- 
cgit v1.2.1


From 7b56b731f525087c25b0e479077edac3674d2274 Mon Sep 17 00:00:00 2001
From: Peter Wu <peter@lekensteyn.nl>
Date: Thu, 2 Apr 2015 11:45:04 +0200
Subject: Use max instead of sum when combining channel probabilities

---
 spellchecker/src/SpellCorrector.java | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/spellchecker/src/SpellCorrector.java b/spellchecker/src/SpellCorrector.java
index 475e889..e4086aa 100644
--- a/spellchecker/src/SpellCorrector.java
+++ b/spellchecker/src/SpellCorrector.java
@@ -86,11 +86,10 @@ public class SpellCorrector {
             // add-one smoothing
             p_channel = (correctionCount + 1) / (errorCount + 1);
 
-            // TODO: take the max instead of addition?
-            // Sum the probabilities as independent modifications can result in
-            // the same word ("acess" -> "access" by "a|ac", "e|ce").
+            // while we could sum here, it does not make sense for the
+            // probability. Use the probability of the most likely change type.
             double p = candidates.getOrDefault(word2, 0.0);
-            p += p_channel;
+            p = Math.max(p, p_channel);
 
             candidates.put(word2, p);
         };
-- 
cgit v1.2.1


From f7c1881a84d468f8377b2165964911561452fef1 Mon Sep 17 00:00:00 2001
From: Peter Wu <peter@lekensteyn.nl>
Date: Thu, 2 Apr 2015 11:45:31 +0200
Subject: Debug and lambda tweaks

Damn, greedy does not work....
---
 spellchecker/src/SpellCorrector.java | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/spellchecker/src/SpellCorrector.java b/spellchecker/src/SpellCorrector.java
index e4086aa..dd6e73a 100644
--- a/spellchecker/src/SpellCorrector.java
+++ b/spellchecker/src/SpellCorrector.java
@@ -13,7 +13,7 @@ public class SpellCorrector {
      * Lambda values for interpolation of n-gram probabilities. The first value
      * is for unigrams, the second for bigrams, etc.
      */
-    private final static double[] LAMBDAS = new double[]{.5, .5};
+    private final static double[] LAMBDAS = new double[]{.25, .75};
     /**
      * The language model probability for uncorrected words.
      */
@@ -247,13 +247,17 @@ public class SpellCorrector {
                 p_bi = cr.getSmoothedCount(ngram) / cr.getSmoothedCount(prev_word);
                 //System.err.println("W: " + word + " " + score + " " + ngram + " |" + words[index]);
             } else {
-                p_bi = 0;
+                // no previous word, assume likely.
+                p_bi = 1;
             }
 
             // Now obtain n-gram probabilities. Use interpolation to combine
             // unigrams and bigrams.
             p = LAMBDAS[0] * p_uni + LAMBDAS[1] * p_bi;
             p *= score;
+            if (DEBUG_SCORE && (word.equals("he") || word.equals("hme") || word.equals("home"))) {
+                System.err.println(word + " p=" + p + " score=" + score + " uni=" + p_uni + " bi=" + p_bi);
+            }
             assert p > 0.0 : "failed probability for " + word;
             return p;
         }
-- 
cgit v1.2.1