summaryrefslogtreecommitdiff
path: root/spellchecker/src/SpellCorrector.java
diff options
context:
space:
mode:
Diffstat (limited to 'spellchecker/src/SpellCorrector.java')
-rw-r--r--spellchecker/src/SpellCorrector.java49
1 file changed, 27 insertions, 22 deletions
diff --git a/spellchecker/src/SpellCorrector.java b/spellchecker/src/SpellCorrector.java
index bdfc8e1..971e91e 100644
--- a/spellchecker/src/SpellCorrector.java
+++ b/spellchecker/src/SpellCorrector.java
@@ -10,6 +10,11 @@ public class SpellCorrector {
final static char[] ALPHABET = "abcdefghijklmnopqrstuvwxyz'".toCharArray();
/**
+ * The highest n-gram to look for. 1 gives a unigram, 2 gives a bigram, etc.
+ */
+ private final static int NGRAM_N = 2;
+
+ /**
* Lambda values for interpolation of n-gram probabilities. The first value
* is for unigrams, the second for bigrams, etc.
*/
@@ -255,8 +260,7 @@ public class SpellCorrector {
*/
public double getWordLikelihood(int index, String word,
double channel_probability) {
- String prev_word, ngram;
- double prior, score, p, p_uni, p_bi;
+ double prior, score;
// a suggested word not in the vocabulary is certainly wrong,
// changed (or consecutive) words should also not be changed.
if (!cr.inVocabulary(word) || words_readonly[index]) {
@@ -266,33 +270,34 @@ public class SpellCorrector {
assert channel_probability > 0.0;
// P(x|w) is given by language model (noisy channel probability).
- // Find prior P(w) = (freq(w) + .5) / N.
+ // Find prior P(w) = (freq(w) + .5) / N (N is number of words).
// Then compute candidate score by P(w) * P(x|w)
- prior = (cr.getNGramCount(word) + .5) / cr.getUnigramCount();
+ prior = (cr.getNGramCount(word) + .5) / cr.getWordCount();
score = prior * channel_probability;
- // unigram probability is computed by P(w) = #w / N (no smoothing).
- p_uni = cr.getSmoothedCount(word) / cr.getUnigramCount();
-
- // Add probability of bi-grams.
- // For words u and w, P(w|u) = P(u, w) / P(u).
- if (index > 0) {
- prev_word = words[index - 1];
- ngram = prev_word + " " + word;
- p_bi = cr.getSmoothedCount(ngram) / cr.getSmoothedCount(prev_word);
- //System.err.println("W: " + word + " " + score + " " + ngram + " |" + words[index]);
- } else {
- // no previous word, assume likely.
- p_bi = 1;
+ // compute unigrams
+ double p = LAMBDAS[0] * cr.getNgramProbability(word, "");
+ // compute bigrams, etc.
+ String ngram = word;
+ for (int i = 1; i < NGRAM_N; i++) {
+ // are there actually enough words to compute this metric?
+ if (index - i >= 0) {
+ // increase ngram prefix
+ ngram += " " + words[index - i];
+
+ // Obtain n-gram probs and combine using interpolation.
+ p += LAMBDAS[i] * cr.getNgramProbability(word, ngram);
+ } else {
+ // no metrics found, cannot deduce much information from it
+ p += LAMBDAS[i] * .5;
+ }
}
- // Now obtain n-gram probabilities. Use interpolation to combine
- // unigrams and bigrams.
- p = LAMBDAS[0] * p_uni + LAMBDAS[1] * p_bi;
- p *= score;
+ // finally add the score
if (DEBUG_SCORE && (word.equals("he") || word.equals("hme") || word.equals("home"))) {
- System.err.println(word + " p=" + p + " score=" + score + " uni=" + p_uni + " bi=" + p_bi);
+ System.err.println(word + " p=" + (p * score) + " score=" + score + " ngram=" + p);
}
+ p *= score;
assert p > 0.0 : "failed probability for " + word;
return p;
}