diff options
author | Peter Wu <peter@lekensteyn.nl> | 2015-04-03 15:12:45 +0200 |
---|---|---|
committer | Peter Wu <peter@lekensteyn.nl> | 2015-04-03 15:12:45 +0200 |
commit | 6d0e2e299835eca671951120e72ad5898866839f (patch) | |
tree | b0e7068abcfa36dcbe41dbf25b836edeb6c39b3d | |
parent | 81a6cc4a67c8ac5aa601eaaef728f03245576cb9 (diff) | |
download | assignment4-6d0e2e299835eca671951120e72ad5898866839f.tar.gz |
Fix ngram PREFIX condition, improve ngram prob debug
-rw-r--r-- | spellchecker/src/CorpusReader.java | 9 | ||||
-rw-r--r-- | spellchecker/src/SpellCorrector.java | 8 |
2 files changed, 12 insertions, 5 deletions
```diff
diff --git a/spellchecker/src/CorpusReader.java b/spellchecker/src/CorpusReader.java
index 2ad7e85..2e0a855 100644
--- a/spellchecker/src/CorpusReader.java
+++ b/spellchecker/src/CorpusReader.java
@@ -137,7 +137,8 @@ public class CorpusReader {
      * @return
      */
     public double getNgramProbability(String word, String ngram) {
-        double a, b;
+        double a, b, p;
+
         // special case: unigram has no prior ngram
         if (ngram.isEmpty()) {
             a = getNGramCount(word);
@@ -146,7 +147,7 @@ public class CorpusReader {
             // apply add-1 smoothing under the assumption that there are many
             // unigrams and this does not significantly affect the chance,
             // it just ensures that it is non-zero.
-            return (a + 1) / (b + 1);
+            p = (a + 1) / (b + 1);
         } else {
             // other ngram cases
             a = getNGramCount(ngram + " " + word);
@@ -155,8 +156,10 @@ public class CorpusReader {
             // apply smoothing, but add a smaller number because "b" is
             // typically very small.
             // TODO: Kneser-Ney smoothing?
-            return (a + .001) / (b + 1);
+            p = (a + .001) / (b + 1);
         }
+
+        return p;
     }
 
     /**
diff --git a/spellchecker/src/SpellCorrector.java b/spellchecker/src/SpellCorrector.java
index 33b804f..2bc788b 100644
--- a/spellchecker/src/SpellCorrector.java
+++ b/spellchecker/src/SpellCorrector.java
@@ -294,12 +294,16 @@ public class SpellCorrector {
             }
 
             // compute bigrams, etc.
-            String ngram = word;
+            String ngram = "";
             for (int i = 1; i < NGRAM_N; i++) {
                 // are there actually enough words to compute this metric?
                 if (index - i >= 0) {
                     // increase ngram prefix
-                    ngram += " " + words[index - i];
+                    if (ngram.isEmpty()) {
+                        ngram = words[index - i];
+                    } else {
+                        ngram = words[index - i] + " " + ngram;
+                    }
 
                     // Obtain n-gram probs and combine using interpolation.
                     igram_p = cr.getNgramProbability(word, ngram);
```