summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPeter Wu <peter@lekensteyn.nl>2015-04-01 09:22:16 +0200
committerPeter Wu <peter@lekensteyn.nl>2015-04-01 09:22:16 +0200
commitce91b395fd0139f7e9faba09f5294e6abfaec591 (patch)
treef6d74861672f8763e27b29e504f2ab346a0fcb82
parente9edaa64ba165db40329412026380a8640511160 (diff)
downloadassignment4-ce91b395fd0139f7e9faba09f5294e6abfaec591.tar.gz
SpellChecker: FUBARED
This is the submitted version, it contains random attempts and a fix (actually not that sure that this is the submitted version...). The Double.MIN_VALUE fix is probably missing.
-rw-r--r--spellchecker/src/SpellCorrector.java65
1 files changed, 34 insertions, 31 deletions
diff --git a/spellchecker/src/SpellCorrector.java b/spellchecker/src/SpellCorrector.java
index 179206f..64a35e4 100644
--- a/spellchecker/src/SpellCorrector.java
+++ b/spellchecker/src/SpellCorrector.java
@@ -71,14 +71,8 @@ public class SpellCorrector {
correctionCount = (double) cmr.getConfusionCount(error, correct);
errorCount = cmr.getErrorsCount(error);
- // is this a known correction?
- if (errorCount == 0.0) {
- // no,
- p_channel = 0.1;
- } else {
- // yes,
- p_channel = correctionCount / errorCount;
- }
+ // add-one smoothing
+ p_channel = (correctionCount + 1) / (errorCount + 1);
// Sum the probabilities as independent modifications can result in
// the same word ("acess" -> "access" by "a|ac", "e|ce").
@@ -140,7 +134,7 @@ public class SpellCorrector {
private void makeWordDeletion(String word,
TriFunction<String, String, String> push) {
- // Generare words by deletion of a character. Requires at least two
+ // Generate words by deletion of a character. Requires at least two
// characters in the word (an empty string is invalid).
// |pe]in -> [e]in, p|ei]n -> p[i]n, pe|in] -> pe[n], pe[i|n] -> pe[i].
if (word.length() > 1) {
@@ -190,7 +184,7 @@ public class SpellCorrector {
/**
* Words that cannot be modified in further iterations.
*/
- private final boolean[] words_locked;
+ private final boolean[] words_readonly;
private double best_likelihood = Double.MIN_VALUE;
private WordModification best_modification;
@@ -198,9 +192,9 @@ public class SpellCorrector {
public SentenceRater(String[] words) {
this.words = words.clone();
this.word_likelihoods = new double[words.length];
- this.words_locked = new boolean[words.length];
+ this.words_readonly = new boolean[words.length];
for (int i = 0; i < words.length; i++) {
- word_likelihoods[i] = getWordProbability(i, words[i], .5);
+ word_likelihoods[i] = getWordLikelihood(i, words[i], 0.5);
}
}
@@ -208,32 +202,39 @@ public class SpellCorrector {
* Calculates the probability that the word {@code word} is valid at
* position {@code index}.
*/
- public double getWordProbability(int index, String word,
+ public double getWordLikelihood(int index, String word,
double channel_probability) {
- double probability;
- // a word not in the vocabulary is certainly wrong,
+ String old_word = words[index];
+ double likelihood;
+ // a suggested word not in the vocabulary is certainly wrong,
// changed (or consequentive) words should also not be changed.
- if (!cr.inVocabulary(word) || words_locked[index]) {
+ if (!cr.inVocabulary(word) || words_readonly[index]) {
return 0.0;
}
- // Calculate probability of unigram.
- double p = cr.getSmoothedCount(words[index]);
- p *= channel_probability;
+ // Find prior
+ likelihood = (cr.getNGramCount(words[index]) + .5) / cr.getUnigramCount();
// Add probability of bi-grams.
if (index > 0) {
- // TODO bigrams
+ String str;
+ str = words[index - 1] + " " + words[index];
+ likelihood *= (cr.getNGramCount(str) + .5) / cr.getNGramCount(words[index - 1]);
}
- probability = p;
if (words[index].equals(word)) {
// TODO: tune and magic number
- probability *= .80;
+ likelihood *= .80;
} else {
- probability *= channel_probability;
+ likelihood *= channel_probability;
}
- // TODO: what if this is 0?
- return probability;
+
+ // old word is unknown, let's make sure to change it first.
+ if (!cr.inVocabulary(old_word)) {
+ likelihood += 10;
+ }
+
+ assert likelihood > 0.0 : "failed probability for " + word;
+ return likelihood;
}
/**
@@ -245,7 +246,7 @@ public class SpellCorrector {
double likelihood;
// try the modification, calculate the result and restore.
- likelihood = getWordProbability(index, word, channel_probability);
+ likelihood = getWordLikelihood(index, word, channel_probability);
if (likelihood > best_likelihood) {
best_likelihood = likelihood;
@@ -278,16 +279,18 @@ public class SpellCorrector {
*/
public void saveSuggestion() {
int index = best_modification.index;
- words[index] = best_modification.word;
+ String word = best_modification.word;
+ words[index] = word;
if (index > 0) {
- words_locked[index - 1] = true;
+ words_readonly[index - 1] = true;
}
- words_locked[index] = true;
+ words_readonly[index] = true;
if (index + 1 < words.length) {
- words_locked[index + 1] = true;
+ words_readonly[index + 1] = true;
}
best_modification = null;
- // TODO: save likelihood
+ best_likelihood = Double.MIN_VALUE;
+ word_likelihoods[index] = getWordLikelihood(index, word, 0.5);
}
private class WordModification {