Merge branch 'master' of git.lekensteyn.nl:tue/2ID90-AI/assignment4

author: Wilco <brouwerwj@gmail.com> 2015-04-02 11:49:14 +0200
committer: Wilco <brouwerwj@gmail.com> 2015-04-02 11:49:14 +0200
commit: b0d65f43ee3ff6cf16174ebb89bcc9fd6933ff07 (patch)
tree: 23d26880c2a4f42ad157af6da3e7bd10656bce87
parent: d6fe6c9fd2523ea19f80a1ca523e76719951b00c (diff)
parent: f7c1881a84d468f8377b2165964911561452fef1 (diff)
download: assignment4-b0d65f43ee3ff6cf16174ebb89bcc9fd6933ff07.tar.gz
3 files changed, 19 insertions, 24 deletions
diff --git a/spellchecker/src/CorpusReader.java b/spellchecker/src/CorpusReader.java
index 8281210..686f243 100644
--- a/spellchecker/src/CorpusReader.java
+++ b/spellchecker/src/CorpusReader.java
@@ -119,7 +119,8 @@ public class CorpusReader {
         // probability as result.
         if (NGram.indexOf(' ') != -1) {
             // bigram, must be the nominator
-            smoothedCount += 1;
+            // bigrams are sparse in the corpus, so smooth them with a much smaller constant
+            smoothedCount += .01;
         } else {
             // unigram, must be the denominator
             smoothedCount += 1;
diff --git a/spellchecker/src/SpellChecker.java b/spellchecker/src/SpellChecker.java
index 55f8f6b..533dc02 100644
--- a/spellchecker/src/SpellChecker.java
+++ b/spellchecker/src/SpellChecker.java
@@ -40,7 +40,7 @@ public class SpellChecker {
                 System.out.println("Answer: " + result);
                 System.out.println();
             } else {
-                System.out.println(s0);
+                System.out.println(result);
             }
         }
     }
diff --git a/spellchecker/src/SpellCorrector.java b/spellchecker/src/SpellCorrector.java
index 65865b9..dd6e73a 100644
--- a/spellchecker/src/SpellCorrector.java
+++ b/spellchecker/src/SpellCorrector.java
@@ -13,7 +13,7 @@ public class SpellCorrector {
      * Lambda values for interpolation of n-gram probabilities. The first value
      * is for unigrams, the second for bigrams, etc.
      */
-    private final static double[] LAMBDAS = new double[]{.5, .5};
+    private final static double[] LAMBDAS = new double[]{.25, .75};
     /**
      * The language model probability for uncorrected words.
      */
@@ -86,11 +86,10 @@ public class SpellCorrector {
             // add-one smoothing
             p_channel = (correctionCount + 1) / (errorCount + 1);
 
-            // TODO: take the max instead of addition?
-            // Sum the probabilities as independent modifications can result in
-            // the same word ("acess" -> "access" by "a|ac", "e|ce").
+            // We could sum here, but the sum would no longer be a meaningful
+            // probability. Keep the probability of the most likely change type.
             double p = candidates.getOrDefault(word2, 0.0);
-            p += p_channel;
+            p = Math.max(p, p_channel);
 
             candidates.put(word2, p);
         };
@@ -222,7 +221,7 @@ public class SpellCorrector {
         public double getWordLikelihood(int index, String word,
                 double channel_probability) {
             String prev_word, ngram;
-            double prior, score, p;
+            double prior, score, p, p_uni, p_bi;
             // a suggested word not in the vocabulary is certainly wrong,
             // changed (or consequentive) words should also not be changed.
             if (!cr.inVocabulary(word) || words_readonly[index]) {
@@ -237,22 +236,28 @@ public class SpellCorrector {
             prior = (cr.getNGramCount(word) + .5) / cr.getUnigramCount();
             score = prior * channel_probability;
 
-            // Now obtain n-gram probabilities. Use interpolation to combine
-            // unigrams and bigrams.
             // unigram probability is computed by P(w) = #w / N (no smoothing).
-            p = LAMBDAS[0] * cr.getSmoothedCount(word) / cr.getUnigramCount();
+            p_uni = cr.getSmoothedCount(word) / cr.getUnigramCount();
 
             // Add probability of bi-grams.
             // For words u and w, P(w|u) = P(u, w) / P(u).
             if (index > 0) {
                 prev_word = words[index - 1];
                 ngram = prev_word + " " + word;
-                p += LAMBDAS[1] * cr.getSmoothedCount(ngram) / cr.getSmoothedCount(prev_word);
+                p_bi = cr.getSmoothedCount(ngram) / cr.getSmoothedCount(prev_word);
                 //System.err.println("W: " + word + " " + score + " " + ngram + " |" + words[index]);
+            } else {
+                // no previous word, so assume the bigram is likely (p_bi = 1).
+                p_bi = 1;
             }
 
-            // Combine the candidate score with the n-gram probabilities.
+            // Now obtain n-gram probabilities. Use interpolation to combine
+            // unigrams and bigrams.
+            p = LAMBDAS[0] * p_uni + LAMBDAS[1] * p_bi;
             p *= score;
+            if (DEBUG_SCORE && (word.equals("he") || word.equals("hme") || word.equals("home"))) {
+                System.err.println(word + " p=" + p + " score=" + score + " uni=" + p_uni + " bi=" + p_bi);
+            }
             assert p > 0.0 : "failed probability for " + word;
             return p;
         }
@@ -338,17 +343,6 @@ public class SpellCorrector {
                 old_evaluation = evaluateWord(-1);
             }
 
-            if (DEBUG_SCORE) {
-                System.err.println();
-                System.err.println("Word: " + words[index] + " -> " + word);
-                System.err.println("Word score: " + word_likelihoods[index] + " -> " + score);
-                System.err.println("Phrase score: " + evaluateWord(-1));
-                for (int i = 0; i < words.length; i++) {
-                    System.err.println(String.format("%28s %s", words[i], word_likelihoods[i]));
-                }
-                System.err.println();
-            }
-
             // save the word and its associated score
             assert word_likelihoods[index] < score :
                     "The score should only get better for word " + word
author	Wilco <brouwerwj@gmail.com>	2015-04-02 11:49:14 +0200
committer	Wilco <brouwerwj@gmail.com>	2015-04-02 11:49:14 +0200
commit	b0d65f43ee3ff6cf16174ebb89bcc9fd6933ff07 (patch)
tree	23d26880c2a4f42ad157af6da3e7bd10656bce87
parent	d6fe6c9fd2523ea19f80a1ca523e76719951b00c (diff)
parent	f7c1881a84d468f8377b2165964911561452fef1 (diff)
download	assignment4-b0d65f43ee3ff6cf16174ebb89bcc9fd6933ff07.tar.gz