From 1d223946bd0f71e9bf905d4e5fd034a8aab9d56f Mon Sep 17 00:00:00 2001 From: Peter Wu Date: Thu, 2 Apr 2015 10:30:03 +0200 Subject: Consider the probability of the whole sentence --- spellchecker/src/SpellCorrector.java | 55 ++++++++++++++++++++++++++---------- 1 file changed, 40 insertions(+), 15 deletions(-) diff --git a/spellchecker/src/SpellCorrector.java b/spellchecker/src/SpellCorrector.java index 4978f4e..ef67622 100644 --- a/spellchecker/src/SpellCorrector.java +++ b/spellchecker/src/SpellCorrector.java @@ -48,7 +48,7 @@ public class SpellCorrector { rater.saveSuggestion(); // TODO: make this nicer words = rater.getBestSentence(); - System.err.println("Got new sentence: " + String.join(" ", words)); + System.err.println("Suggest: " + String.join(" ", words)); } else { System.err.println("No suggestion found."); break; @@ -197,7 +197,7 @@ public class SpellCorrector { */ private final boolean[] words_readonly; - private double best_likelihood = Double.MIN_VALUE; + private double best_sentence_probability; private WordModification best_modification; public SentenceRater(String[] words) { @@ -208,6 +208,9 @@ public class SpellCorrector { word_likelihoods[i] = getWordLikelihood(i, words[i], LM_PROBABILITY_UNMODIFIED); } + // determine the rating of the current sentence without ignoring + // words. + best_sentence_probability = evaluateWord(-1); } /** @@ -234,6 +237,7 @@ public class SpellCorrector { // Now obtain n-gram probabilities. Use interpolation to combine // unigrams and bigrams. + // unigram probability is computed by P(w) = #w / N (no smoothing). p = LAMBDAS[0] * cr.getSmoothedCount(word) / cr.getUnigramCount(); // Add probability of bi-grams. @@ -242,6 +246,7 @@ public class SpellCorrector { prev_word = words[index - 1]; ngram = prev_word + " " + word; p += LAMBDAS[1] * cr.getSmoothedCount(ngram) / cr.getSmoothedCount(prev_word); + //System.err.println("W: " + word + " " + score + " " + ngram + " |" + words[index]); } // Combine the candidate score with the n-gram probabilities. @@ -256,19 +261,28 @@ public class SpellCorrector { * remembered (the sentence itself will not be modified). */ public void tryWord(int index, String word, double channel_probability) { - double likelihood; + double score, p; - // try the modification, calculate the result and restore. - likelihood = getWordLikelihood(index, word, channel_probability); + // find score and see how it affects the whole sentence. + score = getWordLikelihood(index, word, channel_probability); + p = score * evaluateWord(index); - // look for the word which increases the likelihood the most - // (that is, the difference of the old and new likelihood). - likelihood -= word_likelihoods[index]; + if (p > best_sentence_probability) { + best_sentence_probability = p; + best_modification = new WordModification(index, word, score); + } + } - if (likelihood > best_likelihood) { - best_likelihood = likelihood; - best_modification = new WordModification(index, word); + private double evaluateWord(int ignoreIndex) { + double p = 1; + // calculate the probability of the combination of all word + // probabilities. + for (int i = 0; i < words.length; i++) { + if (ignoreIndex != i) { + p *= word_likelihoods[i]; + } } + return p; } /** @@ -297,7 +311,17 @@ public class SpellCorrector { public void saveSuggestion() { int index = best_modification.index; String word = best_modification.word; + double score = best_modification.score; + + // save the word and its associated score + assert word_likelihoods[index] < score : + "The score should only get better for word " + word + + ". Change: " + word_likelihoods[index] + " -> " + score; words[index] = word; + word_likelihoods[index] = score; + + // if this was the best word, do not change it now. The context + // should also not change. if (index > 0) { words_readonly[index - 1] = true; } @@ -305,20 +329,21 @@ public class SpellCorrector { if (index + 1 < words.length) { words_readonly[index + 1] = true; } + + // change was applied, forget suggestion. best_modification = null; - best_likelihood = Double.MIN_VALUE; - word_likelihoods[index] = getWordLikelihood(index, word, - LM_PROBABILITY_UNMODIFIED); } private class WordModification { private final int index; private final String word; + private final double score; - public WordModification(int index, String word) { + public WordModification(int index, String word, double score) { this.index = index; this.word = word; + this.score = score; } } } -- cgit v1.2.1