From 6d0e2e299835eca671951120e72ad5898866839f Mon Sep 17 00:00:00 2001 From: Peter Wu Date: Fri, 3 Apr 2015 15:12:45 +0200 Subject: Fix ngram PREFIX condition, improve ngram prob debug --- spellchecker/src/CorpusReader.java | 9 ++++++--- spellchecker/src/SpellCorrector.java | 8 ++++++-- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/spellchecker/src/CorpusReader.java b/spellchecker/src/CorpusReader.java index 2ad7e85..2e0a855 100644 --- a/spellchecker/src/CorpusReader.java +++ b/spellchecker/src/CorpusReader.java @@ -137,7 +137,8 @@ public class CorpusReader { * @return */ public double getNgramProbability(String word, String ngram) { - double a, b; + double a, b, p; + // special case: unigram has no prior ngram if (ngram.isEmpty()) { a = getNGramCount(word); @@ -146,7 +147,7 @@ public class CorpusReader { // apply add-1 smoothing under the assumption that there are many // unigrams and this does not significantly affect the chance, // it just ensures that it is non-zero. - return (a + 1) / (b + 1); + p = (a + 1) / (b + 1); } else { // other ngram cases a = getNGramCount(ngram + " " + word); @@ -155,8 +156,10 @@ public class CorpusReader { // apply smoothing, but add a smaller number because "b" is // typically very small. // TODO: Kneser-Ney smoothing? - return (a + .001) / (b + 1); + p = (a + .001) / (b + 1); } + + return p; } /** diff --git a/spellchecker/src/SpellCorrector.java b/spellchecker/src/SpellCorrector.java index 33b804f..2bc788b 100644 --- a/spellchecker/src/SpellCorrector.java +++ b/spellchecker/src/SpellCorrector.java @@ -294,12 +294,16 @@ public class SpellCorrector { } // compute bigrams, etc. - String ngram = word; + String ngram = ""; for (int i = 1; i < NGRAM_N; i++) { // are there actually enough words to compute this metric? if (index - i >= 0) { // increase ngram prefix - ngram += " " + words[index - i]; + if (ngram.isEmpty()) { + ngram = words[index - i]; + } else { + ngram = words[index - i] + " " + ngram; + } // Obtain n-gram probs and combine using interpolation. igram_p = cr.getNgramProbability(word, ngram); -- cgit v1.2.1