summary refs log tree commit diff
diff options
context:
space:
mode:
author Wilco <brouwerwj@gmail.com> 2015-04-02 11:49:14 +0200
committer Wilco <brouwerwj@gmail.com> 2015-04-02 11:49:14 +0200
commit b0d65f43ee3ff6cf16174ebb89bcc9fd6933ff07 (patch)
tree 23d26880c2a4f42ad157af6da3e7bd10656bce87
parent d6fe6c9fd2523ea19f80a1ca523e76719951b00c (diff)
parent f7c1881a84d468f8377b2165964911561452fef1 (diff)
download assignment4-b0d65f43ee3ff6cf16174ebb89bcc9fd6933ff07.tar.gz
Merge branch 'master' of git.lekensteyn.nl:tue/2ID90-AI/assignment4
-rw-r--r-- spellchecker/src/CorpusReader.java 3
-rw-r--r-- spellchecker/src/SpellChecker.java 2
-rw-r--r-- spellchecker/src/SpellCorrector.java 38
3 files changed, 19 insertions, 24 deletions
diff --git a/spellchecker/src/CorpusReader.java b/spellchecker/src/CorpusReader.java
index 8281210..686f243 100644
--- a/spellchecker/src/CorpusReader.java
+++ b/spellchecker/src/CorpusReader.java
@@ -119,7 +119,8 @@ public class CorpusReader {
// probability as result.
if (NGram.indexOf(' ') != -1) {
// bigram, must be the nominator
- smoothedCount += 1;
+ // we do not have a lot of bigrams
+ smoothedCount += .01;
} else {
// unigram, must be the denominator
smoothedCount += 1;
diff --git a/spellchecker/src/SpellChecker.java b/spellchecker/src/SpellChecker.java
index 55f8f6b..533dc02 100644
--- a/spellchecker/src/SpellChecker.java
+++ b/spellchecker/src/SpellChecker.java
@@ -40,7 +40,7 @@ public class SpellChecker {
System.out.println("Answer: " + result);
System.out.println();
} else {
- System.out.println(s0);
+ System.out.println(result);
}
}
}
diff --git a/spellchecker/src/SpellCorrector.java b/spellchecker/src/SpellCorrector.java
index 65865b9..dd6e73a 100644
--- a/spellchecker/src/SpellCorrector.java
+++ b/spellchecker/src/SpellCorrector.java
@@ -13,7 +13,7 @@ public class SpellCorrector {
* Lambda values for interpolation of n-gram probabilities. The first value
* is for unigrams, the second for bigrams, etc.
*/
- private final static double[] LAMBDAS = new double[]{.5, .5};
+ private final static double[] LAMBDAS = new double[]{.25, .75};
/**
* The language model probability for uncorrected words.
*/
@@ -86,11 +86,10 @@ public class SpellCorrector {
// add-one smoothing
p_channel = (correctionCount + 1) / (errorCount + 1);
- // TODO: take the max instead of addition?
- // Sum the probabilities as independent modifications can result in
- // the same word ("acess" -> "access" by "a|ac", "e|ce").
+ // while we could sum here, it does not make sense for the
+ // probability. Use the probability of the most likely change type.
double p = candidates.getOrDefault(word2, 0.0);
- p += p_channel;
+ p = Math.max(p, p_channel);
candidates.put(word2, p);
};
@@ -222,7 +221,7 @@ public class SpellCorrector {
public double getWordLikelihood(int index, String word,
double channel_probability) {
String prev_word, ngram;
- double prior, score, p;
+ double prior, score, p, p_uni, p_bi;
// a suggested word not in the vocabulary is certainly wrong,
// changed (or consequentive) words should also not be changed.
if (!cr.inVocabulary(word) || words_readonly[index]) {
@@ -237,22 +236,28 @@ public class SpellCorrector {
prior = (cr.getNGramCount(word) + .5) / cr.getUnigramCount();
score = prior * channel_probability;
- // Now obtain n-gram probabilities. Use interpolation to combine
- // unigrams and bigrams.
// unigram probability is computed by P(w) = #w / N (no smoothing).
- p = LAMBDAS[0] * cr.getSmoothedCount(word) / cr.getUnigramCount();
+ p_uni = cr.getSmoothedCount(word) / cr.getUnigramCount();
// Add probability of bi-grams.
// For words u and w, P(w|u) = P(u, w) / P(u).
if (index > 0) {
prev_word = words[index - 1];
ngram = prev_word + " " + word;
- p += LAMBDAS[1] * cr.getSmoothedCount(ngram) / cr.getSmoothedCount(prev_word);
+ p_bi = cr.getSmoothedCount(ngram) / cr.getSmoothedCount(prev_word);
//System.err.println("W: " + word + " " + score + " " + ngram + " |" + words[index]);
+ } else {
+ // no previous word, assume likely.
+ p_bi = 1;
}
- // Combine the candidate score with the n-gram probabilities.
+ // Now obtain n-gram probabilities. Use interpolation to combine
+ // unigrams and bigrams.
+ p = LAMBDAS[0] * p_uni + LAMBDAS[1] * p_bi;
p *= score;
+ if (DEBUG_SCORE && (word.equals("he") || word.equals("hme") || word.equals("home"))) {
+ System.err.println(word + " p=" + p + " score=" + score + " uni=" + p_uni + " bi=" + p_bi);
+ }
assert p > 0.0 : "failed probability for " + word;
return p;
}
@@ -338,17 +343,6 @@ public class SpellCorrector {
old_evaluation = evaluateWord(-1);
}
- if (DEBUG_SCORE) {
- System.err.println();
- System.err.println("Word: " + words[index] + " -> " + word);
- System.err.println("Word score: " + word_likelihoods[index] + " -> " + score);
- System.err.println("Phrase score: " + evaluateWord(-1));
- for (int i = 0; i < words.length; i++) {
- System.err.println(String.format("%28s %s", words[i], word_likelihoods[i]));
- }
- System.err.println();
- }
-
// save the word and its associated score
assert word_likelihoods[index] < score :
"The score should only get better for word " + word