From 2957a918e369a32e51ea9c8b7b06063b5d7c09b6 Mon Sep 17 00:00:00 2001 From: Peter Wu Date: Thu, 2 Apr 2015 11:13:29 +0200 Subject: Accidentally added two times debug --- spellchecker/src/SpellCorrector.java | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/spellchecker/src/SpellCorrector.java b/spellchecker/src/SpellCorrector.java index 65865b9..9028fdc 100644 --- a/spellchecker/src/SpellCorrector.java +++ b/spellchecker/src/SpellCorrector.java @@ -338,17 +338,6 @@ public class SpellCorrector { old_evaluation = evaluateWord(-1); } - if (DEBUG_SCORE) { - System.err.println(); - System.err.println("Word: " + words[index] + " -> " + word); - System.err.println("Word score: " + word_likelihoods[index] + " -> " + score); - System.err.println("Phrase score: " + evaluateWord(-1)); - for (int i = 0; i < words.length; i++) { - System.err.println(String.format("%28s %s", words[i], word_likelihoods[i])); - } - System.err.println(); - } - // save the word and its associated score assert word_likelihoods[index] < score : "The score should only get better for word " + word -- cgit v1.2.1 From be34f8b98aa4942953deb383ea61156ee1bf20b1 Mon Sep 17 00:00:00 2001 From: Peter Wu Date: Thu, 2 Apr 2015 11:19:53 +0200 Subject: Split lambda p --- spellchecker/src/SpellCorrector.java | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/spellchecker/src/SpellCorrector.java b/spellchecker/src/SpellCorrector.java index 9028fdc..475e889 100644 --- a/spellchecker/src/SpellCorrector.java +++ b/spellchecker/src/SpellCorrector.java @@ -222,7 +222,7 @@ public class SpellCorrector { public double getWordLikelihood(int index, String word, double channel_probability) { String prev_word, ngram; - double prior, score, p; + double prior, score, p, p_uni, p_bi; // a suggested word not in the vocabulary is certainly wrong, // changed (or consequentive) words should also not be changed. if (!cr.inVocabulary(word) || words_readonly[index]) { @@ -237,21 +237,23 @@ public class SpellCorrector { prior = (cr.getNGramCount(word) + .5) / cr.getUnigramCount(); score = prior * channel_probability; - // Now obtain n-gram probabilities. Use interpolation to combine - // unigrams and bigrams. // unigram probability is computed by P(w) = #w / N (no smoothing). - p = LAMBDAS[0] * cr.getSmoothedCount(word) / cr.getUnigramCount(); + p_uni = cr.getSmoothedCount(word) / cr.getUnigramCount(); // Add probability of bi-grams. // For words u and w, P(w|u) = P(u, w) / P(u). if (index > 0) { prev_word = words[index - 1]; ngram = prev_word + " " + word; - p += LAMBDAS[1] * cr.getSmoothedCount(ngram) / cr.getSmoothedCount(prev_word); + p_bi = cr.getSmoothedCount(ngram) / cr.getSmoothedCount(prev_word); //System.err.println("W: " + word + " " + score + " " + ngram + " |" + words[index]); + } else { + p_bi = 0; } - // Combine the candidate score with the n-gram probabilities. + // Now obtain n-gram probabilities. Use interpolation to combine + // unigrams and bigrams. + p = LAMBDAS[0] * p_uni + LAMBDAS[1] * p_bi; p *= score; assert p > 0.0 : "failed probability for " + word; return p; -- cgit v1.2.1 From a9839f2a780c9ddf117ad3cf39daadd5333112ee Mon Sep 17 00:00:00 2001 From: Peter Wu Date: Thu, 2 Apr 2015 11:41:02 +0200 Subject: bigram smooth tweak --- spellchecker/src/CorpusReader.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spellchecker/src/CorpusReader.java b/spellchecker/src/CorpusReader.java index 8281210..686f243 100644 --- a/spellchecker/src/CorpusReader.java +++ b/spellchecker/src/CorpusReader.java @@ -119,7 +119,8 @@ public class CorpusReader { // probability as result. if (NGram.indexOf(' ') != -1) { // bigram, must be the nominator - smoothedCount += 1; + // we do not have a lot of bigrams + smoothedCount += .01; } else { // unigram, must be the denominator smoothedCount += 1; -- cgit v1.2.1 From 87b788adb8426d1ca0655ee049a68893bd54ca1b Mon Sep 17 00:00:00 2001 From: Peter Wu Date: Thu, 2 Apr 2015 11:41:12 +0200 Subject: fix output printing --- spellchecker/src/SpellChecker.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spellchecker/src/SpellChecker.java b/spellchecker/src/SpellChecker.java index 55f8f6b..533dc02 100644 --- a/spellchecker/src/SpellChecker.java +++ b/spellchecker/src/SpellChecker.java @@ -40,7 +40,7 @@ public class SpellChecker { System.out.println("Answer: " + result); System.out.println(); } else { - System.out.println(s0); + System.out.println(result); } } } -- cgit v1.2.1 From 7b56b731f525087c25b0e479077edac3674d2274 Mon Sep 17 00:00:00 2001 From: Peter Wu Date: Thu, 2 Apr 2015 11:45:04 +0200 Subject: use max instead of sum --- spellchecker/src/SpellCorrector.java | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/spellchecker/src/SpellCorrector.java b/spellchecker/src/SpellCorrector.java index 475e889..e4086aa 100644 --- a/spellchecker/src/SpellCorrector.java +++ b/spellchecker/src/SpellCorrector.java @@ -86,11 +86,10 @@ public class SpellCorrector { // add-one smoothing p_channel = (correctionCount + 1) / (errorCount + 1); - // TODO: take the max instead of addition? - // Sum the probabilities as independent modifications can result in - // the same word ("acess" -> "access" by "a|ac", "e|ce"). + // while we could sum here, it does not make sense for the + // probability. Use the probability of the most likely change type. double p = candidates.getOrDefault(word2, 0.0); - p += p_channel; + p = Math.max(p, p_channel); candidates.put(word2, p); }; -- cgit v1.2.1 From f7c1881a84d468f8377b2165964911561452fef1 Mon Sep 17 00:00:00 2001 From: Peter Wu Date: Thu, 2 Apr 2015 11:45:31 +0200 Subject: Debug and lambda tweaks Damn, greedy does not work.... --- spellchecker/src/SpellCorrector.java | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/spellchecker/src/SpellCorrector.java b/spellchecker/src/SpellCorrector.java index e4086aa..dd6e73a 100644 --- a/spellchecker/src/SpellCorrector.java +++ b/spellchecker/src/SpellCorrector.java @@ -13,7 +13,7 @@ public class SpellCorrector { * Lambda values for interpolation of n-gram probabilities. The first value * is for unigrams, the second for bigrams, etc. */ - private final static double[] LAMBDAS = new double[]{.5, .5}; + private final static double[] LAMBDAS = new double[]{.25, .75}; /** * The language model probability for uncorrected words. */ @@ -247,13 +247,17 @@ public class SpellCorrector { p_bi = cr.getSmoothedCount(ngram) / cr.getSmoothedCount(prev_word); //System.err.println("W: " + word + " " + score + " " + ngram + " |" + words[index]); } else { - p_bi = 0; + // no previous word, assume likely. + p_bi = 1; } // Now obtain n-gram probabilities. Use interpolation to combine // unigrams and bigrams. p = LAMBDAS[0] * p_uni + LAMBDAS[1] * p_bi; p *= score; + if (DEBUG_SCORE && (word.equals("he") || word.equals("hme") || word.equals("home"))) { + System.err.println(word + " p=" + p + " score=" + score + " uni=" + p_uni + " bi=" + p_bi); + } assert p > 0.0 : "failed probability for " + word; return p; } -- cgit v1.2.1