From adb460408017c6344ee0f836aa56ce25143d9d6b Mon Sep 17 00:00:00 2001 From: Peter Wu Date: Wed, 1 Apr 2015 05:13:41 +0200 Subject: Initial attempt at rating sentences --- spellchecker/src/SpellCorrector.java | 151 +++++++++++++++++++++++++++++++++-- 1 file changed, 145 insertions(+), 6 deletions(-) (limited to 'spellchecker') diff --git a/spellchecker/src/SpellCorrector.java b/spellchecker/src/SpellCorrector.java index 92f7479..179206f 100644 --- a/spellchecker/src/SpellCorrector.java +++ b/spellchecker/src/SpellCorrector.java @@ -20,12 +20,32 @@ public class SpellCorrector { } String[] words = phrase.split(" "); - String finalSuggestion = ""; - /** - * CODE TO BE ADDED * - */ - return finalSuggestion.trim(); + SentenceRater rater = new SentenceRater(words); + + // Try to change 1 word and see whether the suggestions are OK. + for (int attempt = 0; attempt < 3; attempt++) { + for (int i = 0; i < words.length; i++) { + final int word_index = i; + String old_word = words[word_index]; + // try to find a better suggestion for this word. + Map candidates = getCandidateWords(old_word); + candidates.forEach((word, channel_probability) -> { + rater.tryWord(word_index, word, channel_probability); + }); + }; + if (rater.hasBetterSuggestion()) { + rater.saveSuggestion(); + // TODO: make this nicer + words = rater.getBestSentence(); + System.err.println("Got new sentence: " + String.join(" ", words)); + } else { + System.err.println("No suggestion found."); + break; + } + } + + return String.join(" ", words); } /** @@ -54,7 +74,7 @@ public class SpellCorrector { // is this a known correction? if (errorCount == 0.0) { // no, - p_channel = 0.0; + p_channel = 0.1; } else { // yes, p_channel = correctionCount / errorCount; @@ -162,4 +182,123 @@ public class SpellCorrector { push.call(head + transposed + tail, middle, transposed); } } + + private class SentenceRater { + + private final String[] words; + private final double[] word_likelihoods; + /** + * Words that cannot be modified in further iterations. + */ + private final boolean[] words_locked; + + private double best_likelihood = Double.MIN_VALUE; + private WordModification best_modification; + + public SentenceRater(String[] words) { + this.words = words.clone(); + this.word_likelihoods = new double[words.length]; + this.words_locked = new boolean[words.length]; + for (int i = 0; i < words.length; i++) { + word_likelihoods[i] = getWordProbability(i, words[i], .5); + } + } + + /** + * Calculates the probability that the word {@code word} is valid at + * position {@code index}. + */ + public double getWordProbability(int index, String word, + double channel_probability) { + double probability; + // a word not in the vocabulary is certainly wrong, + // changed (or consequentive) words should also not be changed. + if (!cr.inVocabulary(word) || words_locked[index]) { + return 0.0; + } + + // Calculate probability of unigram. + double p = cr.getSmoothedCount(words[index]); + p *= channel_probability; + // Add probability of bi-grams. + if (index > 0) { + // TODO bigrams + } + probability = p; + + if (words[index].equals(word)) { + // TODO: tune and magic number + probability *= .80; + } else { + probability *= channel_probability; + } + // TODO: what if this is 0? + return probability; + } + + /** + * Tries to modify word at position {@code index} to word and calculates + * the likelihood of the word. The best single-word modification is + * remembered (the sentence itself will not be modified). + */ + public void tryWord(int index, String word, double channel_probability) { + double likelihood; + + // try the modification, calculate the result and restore. + likelihood = getWordProbability(index, word, channel_probability); + + if (likelihood > best_likelihood) { + best_likelihood = likelihood; + best_modification = new WordModification(index, word); + } + } + + /** + * Returns the sentence, possibly changed. + */ + public String[] getBestSentence() { + String[] new_words = words.clone(); + if (best_modification != null) { + new_words[best_modification.index] = best_modification.word; + } + return new_words; + } + + /** + * Returns true if it is likely that a word in the sentence can be + * corrected. + */ + public boolean hasBetterSuggestion() { + return best_modification != null; + } + + /** + * Save the best suggestion so far, ensuring that future suggestions + * won't overwrite this one. + */ + public void saveSuggestion() { + int index = best_modification.index; + words[index] = best_modification.word; + if (index > 0) { + words_locked[index - 1] = true; + } + words_locked[index] = true; + if (index + 1 < words.length) { + words_locked[index + 1] = true; + } + best_modification = null; + // TODO: save likelihood + } + + private class WordModification { + + private final int index; + private final String word; + + public WordModification(int index, String word) { + this.index = index; + this.word = word; + } + } + } } -- cgit v1.2.1