From 82eadc91f0f697103e965bd3600475df60e8282f Mon Sep 17 00:00:00 2001 From: Peter Wu Date: Wed, 1 Apr 2015 02:54:48 +0200 Subject: Calculate noisy channel probability --- spellchecker/src/ConfusionMatrixReader.java | 11 +++++++++++ spellchecker/src/SpellCorrector.java | 18 ++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/spellchecker/src/ConfusionMatrixReader.java b/spellchecker/src/ConfusionMatrixReader.java index c9e79ab..90a8cef 100644 --- a/spellchecker/src/ConfusionMatrixReader.java +++ b/spellchecker/src/ConfusionMatrixReader.java @@ -60,4 +60,15 @@ public class ConfusionMatrixReader { Integer count = confusionMatrix.get(error + "|" + correct); return count == null ? 0 : count; } + + /** + * Given the error {@code error}, find the number of occurrences of the + * error. + * + * @param error the error string used as the lookup key in {@code countMatrix} + * @return the value recorded for {@code error} in {@code countMatrix}, or 0 when it has no entry + */ + public int getErrorsCount(String error) { + return countMatrix.getOrDefault(error, 0); + } } diff --git a/spellchecker/src/SpellCorrector.java b/spellchecker/src/SpellCorrector.java index 6c9225b..3fbd8cc 100644 --- a/spellchecker/src/SpellCorrector.java +++ b/spellchecker/src/SpellCorrector.java @@ -53,7 +53,25 @@ public class SpellCorrector { return; } + // Find the channel model probability (probability of the edit). + // P(x|w) = "corrections count given error" / "errors count" + double correctionCount, errorCount, p_channel; + correctionCount = (double) cmr.getConfusionCount(error, correct); + errorCount = cmr.getErrorsCount(error); + + // is this a known correction? + if (errorCount == 0.0) { + // no, + p_channel = 0.0; + } else { + // yes, + p_channel = correctionCount / errorCount; + } + + // Sum the probabilities as independent modifications can result in + // the same word ("acess" -> "access" by "a|ac", "e|ce"). double p = candidates.getOrDefault(word2, 0.0); + p += p_channel; candidates.put(word2, p); }; -- cgit v1.2.1