summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Peter Wu <peter@lekensteyn.nl> 2015-04-02 17:54:39 +0200
committer Peter Wu <peter@lekensteyn.nl> 2015-04-02 17:54:39 +0200
commit dee51f2ec8cdb7d5a61139455219b20e6604f4c2 (patch)
tree ad37788fc7ea0b3eb88f93af9fa0fd90614e9ea5
parent d303a5bde5002d2099958bcac6838b5bc463a623 (diff)
download assignment4-dee51f2ec8cdb7d5a61139455219b20e6604f4c2.tar.gz
Improve ngram score debugging details
-rw-r--r-- spellchecker/src/SpellCorrector.java 31
1 files changed, 25 insertions, 6 deletions
diff --git a/spellchecker/src/SpellCorrector.java b/spellchecker/src/SpellCorrector.java
index f5f5472..b5e383e 100644
--- a/spellchecker/src/SpellCorrector.java
+++ b/spellchecker/src/SpellCorrector.java
@@ -261,7 +261,7 @@ public class SpellCorrector {
*/
public double getWordLikelihood(int index, String word,
double channel_probability) {
- double prior, score;
+ double prior, score, p, igram_p;
// a suggested word not in the vocabulary is certainly wrong,
// changed (or consecutive) words should also not be changed.
if (!cr.inVocabulary(word) || words_readonly[index]) {
@@ -269,6 +269,13 @@ public class SpellCorrector {
}
assert channel_probability > 0.0;
+ String debug_word = null;
+ if (DEBUG_SCORE
+ && (word.equals("he")
+ || word.equals("hme")
+ || word.equals("home"))) {
+ debug_word = "";
+ }
// P(x|w) is given by language model (noisy channel probability).
// Find prior P(w) = (freq(w) + .5) / N (N is number of words).
@@ -277,7 +284,12 @@ public class SpellCorrector {
score = prior * channel_probability;
// compute unigrams
- double p = LAMBDAS[0] * cr.getNgramProbability(word, "");
+ igram_p = cr.getNgramProbability(word, "");
+ p = LAMBDAS[0] * igram_p;
+ if (debug_word != null) {
+ debug_word += " 1p=" + igram_p;
+ }
+
// compute bigrams, etc.
String ngram = word;
for (int i = 1; i < NGRAM_N; i++) {
@@ -287,16 +299,23 @@ public class SpellCorrector {
ngram += " " + words[index - i];
// Obtain n-gram probs and combine using interpolation.
- p += LAMBDAS[i] * cr.getNgramProbability(word, ngram);
+ igram_p = cr.getNgramProbability(word, ngram);
} else {
// no metrics found, cannot deduce much information from it
- p += LAMBDAS[i] * .5;
+ igram_p = .5;
+ }
+ p += LAMBDAS[i] * igram_p;
+ if (debug_word != null) {
+ debug_word += " " + (i + 1) + "p=" + igram_p;
}
}
// finally add the score
- if (DEBUG_SCORE && (word.equals("he") || word.equals("hme") || word.equals("home"))) {
- System.err.println(word + " p=" + (p * score) + " score=" + score + " ngram=" + p);
+ if (debug_word != null) {
+ System.err.println("# " + word + " p=" + (p * score)
+ + " score=" + score + " chan=" + channel_probability
+ + " prior=" + prior
+ + " ngram=" + p + debug_word);
}
p *= score;
assert p > 0.0 : "failed probability for " + word;