summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPeter Wu <peter@lekensteyn.nl>2015-04-04 14:11:54 +0200
committerPeter Wu <peter@lekensteyn.nl>2015-04-04 14:11:54 +0200
commit9d7d38f1850680fadc8ce9f1d40106393135f7a9 (patch)
tree67d1e810914c8270bdb631459d03b30ce200475a
parent59d3cd598e0c54464005f7ff19ad9a32ea936117 (diff)
downloadassignment4-9d7d38f1850680fadc8ce9f1d40106393135f7a9.tar.gz
Adjust probability for larger typos count
-rw-r--r--spellchecker/src/SpellCorrector.java37
1 files changed, 35 insertions, 2 deletions
diff --git a/spellchecker/src/SpellCorrector.java b/spellchecker/src/SpellCorrector.java
index a015ac2..39ded2e 100644
--- a/spellchecker/src/SpellCorrector.java
+++ b/spellchecker/src/SpellCorrector.java
@@ -22,10 +22,21 @@ public class SpellCorrector {
private final static double LM_PROBABILITY_UNMODIFIED = .95;
/**
- * The maximum number of misspelled words to look for.
+ * The maximum number of misspelled words to look for. This value must be
+ * the upper bound for the number of typos. If there are more typos, the
+ * algorithm will fail with the automatic non-word detection unless you
+ * modify {@code calcNonWordProbability()} to set a saner value.
*/
private final static int MAX_TYPOS = 2;
+ /**
+ * The probability for non-existing words. Do not set this too high or it
+ * will be seen as a better word. Do not set it too low or a combination of
+ * multiple unlikely words will always yield a zero probability, making it
+ * impossible to compare sentences.
+ */
+ private double probability_non_word;
+
private final boolean DEBUG_SCORE = System.getenv("DEBUG_SCORE") != null;
public SpellCorrector(CorpusReader cr, ConfusionMatrixReader cmr) {
@@ -33,6 +44,27 @@ public class SpellCorrector {
this.cmr = cmr;
}
+ private double calcNonWordProbability(String[] words) {
+ double p, exp;
+ // Determine the smallest usable non-word probability (as 10^exp) from
+ // the number of words and the maximum number of typos to be found.
+ // Assume that the average worst case probability for a word is
+ // 10^-15. Divide the remaining exponent range by the number of typos.
+ exp = Math.floor(
+ (Math.log10(Double.MIN_VALUE) + 15 * words.length) / MAX_TYPOS) - 1;
+
+ // In case there are many words, the exponent may become too high
+ // (above -10). Therefore cap the probability at 1e-10.
+ if (exp > -10) {
+ p = 1e-10;
+ } else {
+ p = Math.pow(10, exp);
+ }
+ debugPrint("Non-word probability for \"" + String.join(" ", words)
+ + "\": " + p + " (calculated exp=" + exp + ")");
+ return p;
+ }
+
private void debugPrint(String str) {
// print debugging information if NO_PEACH is set.
if (System.getenv("NO_PEACH") != null) {
@@ -121,6 +153,7 @@ public class SpellCorrector {
String[] words = phrase.split(" ");
+ probability_non_word = calcNonWordProbability(words);
SentenceRater rater = findBetterWord(words, new HashSet<>());
// if a better sentence is found, use it.
if (rater != null) {
@@ -293,7 +326,7 @@ public class SpellCorrector {
for (double score : probabilities) {
if (score == 0) {
// Non-existing words are really bad.
- p *= 1e-99;
+ p *= probability_non_word;
continue;
}
p *= score;