From 9d7d38f1850680fadc8ce9f1d40106393135f7a9 Mon Sep 17 00:00:00 2001 From: Peter Wu Date: Sat, 4 Apr 2015 14:11:54 +0200 Subject: Adjust probability for larger typos count --- spellchecker/src/SpellCorrector.java | 37 ++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/spellchecker/src/SpellCorrector.java b/spellchecker/src/SpellCorrector.java index a015ac2..39ded2e 100644 --- a/spellchecker/src/SpellCorrector.java +++ b/spellchecker/src/SpellCorrector.java @@ -22,10 +22,21 @@ public class SpellCorrector { private final static double LM_PROBABILITY_UNMODIFIED = .95; /** - * The maximum number of misspelled words to look for. + * The maximum number of misspelled words to look for. This value must be + * the upper bound for the number of typos. If there are more typos, the + * algorithm will fail with the automatic non-word detection unless you + * modify {@code calcNonWordProbability()} to set a saner value. */ private final static int MAX_TYPOS = 2; + /** + * The probability for very impossible words. Do not set this too high or it + * will be seen as a better word. Do not set it too low or a combination of + * multiple unlikely words will always yield a zero probability, making it + * impossible to compare sentences. + */ + private double probability_non_word; + private final boolean DEBUG_SCORE = System.getenv("DEBUG_SCORE") != null; public SpellCorrector(CorpusReader cr, ConfusionMatrixReader cmr) { @@ -33,6 +44,27 @@ public class SpellCorrector { this.cmr = cmr; } + private double calcNonWordProbability(String[] words) { + double p, exp; + // Determine the smallest possible non-word probability depending + // on the words length and the number of typos to be found. + // Assume that the average worst case probability for a word is + // 10^-15. Share the remaining number space of the number of typos. 
+ exp = Math.floor( + (Math.log10(Double.MIN_VALUE) + 15 * words.length) / MAX_TYPOS) - 1; + + // In case there are many words, the probability may become too high. + // Therefore override it. + if (exp > -10) { + p = 1e-10; + } else { + p = Math.pow(10, exp); + } + debugPrint("Non-word probability for \"" + String.join(" ", words) + + "\": " + p + " (calculated exp=" + exp + ")"); + return p; + } + private void debugPrint(String str) { // print debugging information if NO_PEACH is set. if (System.getenv("NO_PEACH") != null) { @@ -121,6 +153,7 @@ public class SpellCorrector { String[] words = phrase.split(" "); + probability_non_word = calcNonWordProbability(words); SentenceRater rater = findBetterWord(words, new HashSet<>()); // if a better sentence is found, use it. if (rater != null) { @@ -293,7 +326,7 @@ public class SpellCorrector { for (double score : probabilities) { if (score == 0) { // Non-existing words are really bad. - p *= 1e-99; + p *= probability_non_word; continue; } p *= score; -- cgit v1.2.1