diff options
author | Peter Wu <peter@lekensteyn.nl> | 2015-04-04 14:11:54 +0200 |
---|---|---|
committer | Peter Wu <peter@lekensteyn.nl> | 2015-04-04 14:11:54 +0200 |
commit | 9d7d38f1850680fadc8ce9f1d40106393135f7a9 (patch) | |
tree | 67d1e810914c8270bdb631459d03b30ce200475a /spellchecker | |
parent | 59d3cd598e0c54464005f7ff19ad9a32ea936117 (diff) | |
download | assignment4-9d7d38f1850680fadc8ce9f1d40106393135f7a9.tar.gz |
Adjust probability for larger typos count
Diffstat (limited to 'spellchecker')
-rw-r--r-- | spellchecker/src/SpellCorrector.java | 37 |
1 files changed, 35 insertions, 2 deletions
diff --git a/spellchecker/src/SpellCorrector.java b/spellchecker/src/SpellCorrector.java index a015ac2..39ded2e 100644 --- a/spellchecker/src/SpellCorrector.java +++ b/spellchecker/src/SpellCorrector.java @@ -22,10 +22,21 @@ public class SpellCorrector { private final static double LM_PROBABILITY_UNMODIFIED = .95; /** - * The maximum number of misspelled words to look for. + * The maximum number of misspelled words to look for. This value must be + * the upper bound for the number of typos. If there are more typos, the + * algorithm will fail with the automatic non-word detection unless you + * modify {@code calcNonWordProbability()} to set a saner value. */ private final static int MAX_TYPOS = 2; + /** + * The probability for very impossible words. Do not set this too high or it + * will be seen as a better word. Do not set it too low or a combination of + * multiple unlikely words will always yield a zero probability, making it + * impossible to compare sentences. + */ + private double probability_non_word; + private final boolean DEBUG_SCORE = System.getenv("DEBUG_SCORE") != null; public SpellCorrector(CorpusReader cr, ConfusionMatrixReader cmr) { @@ -33,6 +44,27 @@ public class SpellCorrector { this.cmr = cmr; } + private double calcNonWordProbability(String[] words) { + double p, exp; + // Determine the smallest possible non-word probability depending + // on the words length and the number of typos to be found. + // Assume that the average worst case probability for a word is + // 10^-15. Share the remaining number space of the number of typos. + exp = Math.floor( + (Math.log10(Double.MIN_VALUE) + 15 * words.length) / MAX_TYPOS) - 1; + + // In case there are many words, the probability may become too high. + // Therefore override it. 
+ if (exp > -10) { + p = 1e-10; + } else { + p = Math.pow(10, exp); + } + debugPrint("Non-word probability for \"" + String.join(" ", words) + + "\": " + p + " (calculated exp=" + exp + ")"); + return p; + } + private void debugPrint(String str) { // print debugging information if NO_PEACH is set. if (System.getenv("NO_PEACH") != null) { @@ -121,6 +153,7 @@ public class SpellCorrector { String[] words = phrase.split(" "); + probability_non_word = calcNonWordProbability(words); SentenceRater rater = findBetterWord(words, new HashSet<>()); // if a better sentence is found, use it. if (rater != null) { @@ -293,7 +326,7 @@ public class SpellCorrector { for (double score : probabilities) { if (score == 0) { // Non-existing words are really bad. - p *= 1e-99; + p *= probability_non_word; continue; } p *= score; |