summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPeter Wu <peter@lekensteyn.nl>2015-04-02 01:52:55 +0200
committerPeter Wu <peter@lekensteyn.nl>2015-04-02 01:52:55 +0200
commit4615358ae4f3aab0e18bb0b290e7c0caaa7441ea (patch)
tree939f8bd1085e1a9bb9d36d09a14981681979b926
parentf24d257cc729199f6513a34f93545569c6f6589c (diff)
downloadassignment4-4615358ae4f3aab0e18bb0b290e7c0caaa7441ea.tar.gz
Fix probability calculations
Replace monkey patching by an actual implementation that correctly combines channel probability (language model), prior and interpolation.
-rw-r--r--spellchecker/src/CorpusReader.java21
-rw-r--r--spellchecker/src/SpellCorrector.java60
2 files changed, 45 insertions, 36 deletions
diff --git a/spellchecker/src/CorpusReader.java b/spellchecker/src/CorpusReader.java
index f9df1ca..8281210 100644
--- a/spellchecker/src/CorpusReader.java
+++ b/spellchecker/src/CorpusReader.java
@@ -115,19 +115,14 @@ public class CorpusReader {
double smoothedCount = getNGramCount(NGram);
- int n_words = NGram.split(" ").length + 1;
- switch (n_words) {
- case 1: // unigram
- smoothedCount += 1.0;
- break;
- case 2: // bigram
- smoothedCount += 2.0;
- break;
- case 3: // trigram
- smoothedCount += 4.0;
- break;
- default:
- throw new AssertionError("Unknown n-gram with n=" + n_words);
+ // The caller invokes func(bigram) / func(unigram) and expects a
+ // probability as result.
+ if (NGram.indexOf(' ') != -1) {
+ // bigram, must be the nominator
+ smoothedCount += 1;
+ } else {
+ // unigram, must be the denominator
+ smoothedCount += 1;
}
return smoothedCount;
diff --git a/spellchecker/src/SpellCorrector.java b/spellchecker/src/SpellCorrector.java
index 64a35e4..ab41c3d 100644
--- a/spellchecker/src/SpellCorrector.java
+++ b/spellchecker/src/SpellCorrector.java
@@ -9,6 +9,16 @@ public class SpellCorrector {
final static char[] ALPHABET = "abcdefghijklmnopqrstuvwxyz'".toCharArray();
+ /**
+ * Lambda values for interpolation of n-gram probabilities. The first value
+ * is for unigrams, the second for bigrams, etc.
+ */
+ private final static double[] LAMBDAS = new double[]{.5, .5};
+ /**
+ * The language model probability for uncorrected words.
+ */
+ private final static double LM_PROBABILITY_UNMODIFIED = .95;
+
public SpellCorrector(CorpusReader cr, ConfusionMatrixReader cmr) {
this.cr = cr;
this.cmr = cmr;
@@ -74,6 +84,7 @@ public class SpellCorrector {
// add-one smoothing
p_channel = (correctionCount + 1) / (errorCount + 1);
+ // TODO: take the max instead of addition?
// Sum the probabilities as independent modifications can result in
// the same word ("acess" -> "access" by "a|ac", "e|ce").
double p = candidates.getOrDefault(word2, 0.0);
@@ -194,7 +205,8 @@ public class SpellCorrector {
this.word_likelihoods = new double[words.length];
this.words_readonly = new boolean[words.length];
for (int i = 0; i < words.length; i++) {
- word_likelihoods[i] = getWordLikelihood(i, words[i], 0.5);
+ word_likelihoods[i] = getWordLikelihood(i, words[i],
+ LM_PROBABILITY_UNMODIFIED);
}
}
@@ -204,37 +216,38 @@ public class SpellCorrector {
*/
public double getWordLikelihood(int index, String word,
double channel_probability) {
- String old_word = words[index];
- double likelihood;
+ String prev_word, ngram;
+ double prior, score, p;
// a suggested word not in the vocabulary is certainly wrong,
// changed (or consequentive) words should also not be changed.
if (!cr.inVocabulary(word) || words_readonly[index]) {
return 0.0;
}
- // Find prior
- likelihood = (cr.getNGramCount(words[index]) + .5) / cr.getUnigramCount();
- // Add probability of bi-grams.
- if (index > 0) {
- String str;
- str = words[index - 1] + " " + words[index];
- likelihood *= (cr.getNGramCount(str) + .5) / cr.getNGramCount(words[index - 1]);
- }
+ assert channel_probability > 0.0;
- if (words[index].equals(word)) {
- // TODO: tune and magic number
- likelihood *= .80;
- } else {
- likelihood *= channel_probability;
- }
+ // P(x|w) is given by language model (noisy channel probability).
+ // Find prior P(w) = (freq(w) + .5) / N.
+ // Then compute candidate score by P(w) * P(x|w)
+ prior = (cr.getNGramCount(word) + .5) / cr.getUnigramCount();
+ score = prior * channel_probability;
- // old word is unknown, let's make sure to change it first.
- if (!cr.inVocabulary(old_word)) {
- likelihood += 10;
+ // Now obtain n-gram probabilities. Use interpolation to combine
+ // unigrams and bigrams.
+ p = LAMBDAS[0] * cr.getSmoothedCount(word);
+
+ // Add probability of bi-grams.
+ // For words u and w, P(w|u) = P(u, w) / P(u).
+ if (index > 0) {
+ prev_word = words[index - 1];
+ ngram = prev_word + " " + word;
+ p += LAMBDAS[1] * cr.getSmoothedCount(ngram) / cr.getSmoothedCount(prev_word);
}
- assert likelihood > 0.0 : "failed probability for " + word;
- return likelihood;
+ // Combine the candidate score with the n-gram probabilities.
+ p *= score;
+ assert p > 0.0 : "failed probability for " + word;
+ return p;
}
/**
@@ -290,7 +303,8 @@ public class SpellCorrector {
}
best_modification = null;
best_likelihood = Double.MIN_VALUE;
- word_likelihoods[index] = getWordLikelihood(index, word, 0.5);
+ word_likelihoods[index] = getWordLikelihood(index, word,
+ LM_PROBABILITY_UNMODIFIED);
}
private class WordModification {