From 719cdadfdee023fe742ba76cf5b404602519497d Mon Sep 17 00:00:00 2001 From: Peter Wu Date: Thu, 2 Apr 2015 16:02:47 +0200 Subject: Remove broken greedy approach, fix corrections limit --- spellchecker/src/SpellCorrector.java | 56 ++++++++++++------------------------ 1 file changed, 18 insertions(+), 38 deletions(-) diff --git a/spellchecker/src/SpellCorrector.java b/spellchecker/src/SpellCorrector.java index 8cbc8ca..bdfc8e1 100644 --- a/spellchecker/src/SpellCorrector.java +++ b/spellchecker/src/SpellCorrector.java @@ -14,11 +14,17 @@ public class SpellCorrector { * is for unigrams, the second for bigrams, etc. */ private final static double[] LAMBDAS = new double[]{.25, .75}; + /** * The language model probability for uncorrected words. */ private final static double LM_PROBABILITY_UNMODIFIED = .95; + /** + * The maximum number of misspelled words to look for. + */ + private final static int MAX_TYPOS = 2; + private final boolean DEBUG_SCORE = System.getenv("DEBUG_SCORE") != null; public SpellCorrector(CorpusReader cr, ConfusionMatrixReader cmr) { @@ -27,41 +33,13 @@ public class SpellCorrector { } /** - * Broken greedy approach, not used. - * - * @param words - */ - private void greedyApproach(String[] words) { - SentenceRater rater = new SentenceRater(words); - - // Try to change 1 word and see whether the suggestions are OK. - for (int attempt = 0; attempt < 3; attempt++) { - for (int i = 0; i < words.length; i++) { - final int word_index = i; - String old_word = words[word_index]; - // try to find a better suggestion for this word. - Map candidates = getCandidateWords(old_word); - candidates.forEach((word, channel_probability) -> { - rater.tryWord(word_index, word, channel_probability); - }); - }; - if (rater.hasBetterSuggestion()) { - rater.saveSuggestion(); - // TODO: make this nicer - words = rater.getBestSentence(); - System.err.println("Suggest: " + String.join(" ", words)); - } else { - System.err.println("No suggestion found."); - break; - } - } - } - - /** + * Tries to find up to {@code maxTypos} number of misspelled words. * - * @param level The number of words that have been modified so far. + * @param maxTypos The maximum number of look erroneous words for. + * @return The resulting rater containing probabilities and the corrected + * sentence or null if no suggestions are found. */ - private SentenceRater nonGreedyApproach(String[] words, int level) { + private SentenceRater findCorrected(String[] words, int maxTypos) { SentenceRater rater = new SentenceRater(words); // find best word @@ -78,12 +56,13 @@ public class SpellCorrector { // if a better word was found, use the change if (rater.hasBetterSuggestion()) { rater.saveSuggestion(); - // if there are less than 3 typos, hunt for more typos - if (level < 3) { + // If some other errors are still possible, hunt for those! + if (maxTypos > 1) { SentenceRater subResult; - subResult = nonGreedyApproach(rater.words.clone(), level + 1); - // check if other typos found + subResult = findCorrected(rater.getBestSentence(), maxTypos - 1); + // Did it find a corrected word? if (subResult != null) { + // Yes it did. Is this the best one so far? if (subResult.evaluateWord(-1) > rater.evaluateWord(-1)) { System.err.println("Subresult is better!"); return subResult; @@ -107,7 +86,8 @@ public class SpellCorrector { String[] words = phrase.split(" "); - SentenceRater rater = nonGreedyApproach(words, 0); + SentenceRater rater = findCorrected(words, MAX_TYPOS); + // if a better sentence is found, use it. if (rater != null) { words = rater.getBestSentence(); } -- cgit v1.2.1