diff options
-rw-r--r-- | spellchecker/src/CorpusReader.java | 3 | ||||
-rw-r--r-- | spellchecker/src/SpellChecker.java | 2 | ||||
-rw-r--r-- | spellchecker/src/SpellCorrector.java | 38 |
3 files changed, 19 insertions, 24 deletions
diff --git a/spellchecker/src/CorpusReader.java b/spellchecker/src/CorpusReader.java index 8281210..686f243 100644 --- a/spellchecker/src/CorpusReader.java +++ b/spellchecker/src/CorpusReader.java @@ -119,7 +119,8 @@ public class CorpusReader { // probability as result. if (NGram.indexOf(' ') != -1) { // bigram, must be the nominator - smoothedCount += 1; + // we do not have a lot of bigrams + smoothedCount += .01; } else { // unigram, must be the denominator smoothedCount += 1; diff --git a/spellchecker/src/SpellChecker.java b/spellchecker/src/SpellChecker.java index 55f8f6b..533dc02 100644 --- a/spellchecker/src/SpellChecker.java +++ b/spellchecker/src/SpellChecker.java @@ -40,7 +40,7 @@ public class SpellChecker { System.out.println("Answer: " + result); System.out.println(); } else { - System.out.println(s0); + System.out.println(result); } } } diff --git a/spellchecker/src/SpellCorrector.java b/spellchecker/src/SpellCorrector.java index 65865b9..dd6e73a 100644 --- a/spellchecker/src/SpellCorrector.java +++ b/spellchecker/src/SpellCorrector.java @@ -13,7 +13,7 @@ public class SpellCorrector { * Lambda values for interpolation of n-gram probabilities. The first value * is for unigrams, the second for bigrams, etc. */ - private final static double[] LAMBDAS = new double[]{.5, .5}; + private final static double[] LAMBDAS = new double[]{.25, .75}; /** * The language model probability for uncorrected words. */ @@ -86,11 +86,10 @@ public class SpellCorrector { // add-one smoothing p_channel = (correctionCount + 1) / (errorCount + 1); - // TODO: take the max instead of addition? - // Sum the probabilities as independent modifications can result in - // the same word ("acess" -> "access" by "a|ac", "e|ce"). + // while we could sum here, it does not make sense for the + // probability. Use the probability of the most likely change type. double p = candidates.getOrDefault(word2, 0.0); - p += p_channel; + p = Math.max(p, p_channel); candidates.put(word2, p); }; @@ -222,7 +221,7 @@ public class SpellCorrector { public double getWordLikelihood(int index, String word, double channel_probability) { String prev_word, ngram; - double prior, score, p; + double prior, score, p, p_uni, p_bi; // a suggested word not in the vocabulary is certainly wrong, // changed (or consequentive) words should also not be changed. if (!cr.inVocabulary(word) || words_readonly[index]) { @@ -237,22 +236,28 @@ public class SpellCorrector { prior = (cr.getNGramCount(word) + .5) / cr.getUnigramCount(); score = prior * channel_probability; - // Now obtain n-gram probabilities. Use interpolation to combine - // unigrams and bigrams. // unigram probability is computed by P(w) = #w / N (no smoothing). - p = LAMBDAS[0] * cr.getSmoothedCount(word) / cr.getUnigramCount(); + p_uni = cr.getSmoothedCount(word) / cr.getUnigramCount(); // Add probability of bi-grams. // For words u and w, P(w|u) = P(u, w) / P(u). if (index > 0) { prev_word = words[index - 1]; ngram = prev_word + " " + word; - p += LAMBDAS[1] * cr.getSmoothedCount(ngram) / cr.getSmoothedCount(prev_word); + p_bi = cr.getSmoothedCount(ngram) / cr.getSmoothedCount(prev_word); //System.err.println("W: " + word + " " + score + " " + ngram + " |" + words[index]); + } else { + // no previous word, assume likely. + p_bi = 1; } - // Combine the candidate score with the n-gram probabilities. + // Now obtain n-gram probabilities. Use interpolation to combine + // unigrams and bigrams. + p = LAMBDAS[0] * p_uni + LAMBDAS[1] * p_bi; p *= score; + if (DEBUG_SCORE && (word.equals("he") || word.equals("hme") || word.equals("home"))) { + System.err.println(word + " p=" + p + " score=" + score + " uni=" + p_uni + " bi=" + p_bi); + } assert p > 0.0 : "failed probability for " + word; return p; } @@ -338,17 +343,6 @@ public class SpellCorrector { old_evaluation = evaluateWord(-1); } - if (DEBUG_SCORE) { - System.err.println(); - System.err.println("Word: " + words[index] + " -> " + word); - System.err.println("Word score: " + word_likelihoods[index] + " -> " + score); - System.err.println("Phrase score: " + evaluateWord(-1)); - for (int i = 0; i < words.length; i++) { - System.err.println(String.format("%28s %s", words[i], word_likelihoods[i])); - } - System.err.println(); - } - // save the word and its associated score assert word_likelihoods[index] < score : "The score should only get better for word " + word |