summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Peter Wu <peter@lekensteyn.nl> 2015-04-03 15:12:45 +0200
committer: Peter Wu <peter@lekensteyn.nl> 2015-04-03 15:12:45 +0200
commit: 6d0e2e299835eca671951120e72ad5898866839f (patch)
tree: b0e7068abcfa36dcbe41dbf25b836edeb6c39b3d
parent: 81a6cc4a67c8ac5aa601eaaef728f03245576cb9 (diff)
download: assignment4-6d0e2e299835eca671951120e72ad5898866839f.tar.gz
Fix ngram PREFIX condition, improve ngram prob debug
-rw-r--r--  spellchecker/src/CorpusReader.java   9
-rw-r--r--  spellchecker/src/SpellCorrector.java 8
2 files changed, 12 insertions, 5 deletions
diff --git a/spellchecker/src/CorpusReader.java b/spellchecker/src/CorpusReader.java
index 2ad7e85..2e0a855 100644
--- a/spellchecker/src/CorpusReader.java
+++ b/spellchecker/src/CorpusReader.java
@@ -137,7 +137,8 @@ public class CorpusReader {
* @return
*/
public double getNgramProbability(String word, String ngram) {
- double a, b;
+ double a, b, p;
+
// special case: unigram has no prior ngram
if (ngram.isEmpty()) {
a = getNGramCount(word);
@@ -146,7 +147,7 @@ public class CorpusReader {
// apply add-1 smoothing under the assumption that there are many
// unigrams and this does not significantly affect the chance,
// it just ensures that it is non-zero.
- return (a + 1) / (b + 1);
+ p = (a + 1) / (b + 1);
} else {
// other ngram cases
a = getNGramCount(ngram + " " + word);
@@ -155,8 +156,10 @@ public class CorpusReader {
// apply smoothing, but add a smaller number because "b" is
// typically very small.
// TODO: Kneser-Ney smoothing?
- return (a + .001) / (b + 1);
+ p = (a + .001) / (b + 1);
}
+
+ return p;
}
/**
diff --git a/spellchecker/src/SpellCorrector.java b/spellchecker/src/SpellCorrector.java
index 33b804f..2bc788b 100644
--- a/spellchecker/src/SpellCorrector.java
+++ b/spellchecker/src/SpellCorrector.java
@@ -294,12 +294,16 @@ public class SpellCorrector {
}
// compute bigrams, etc.
- String ngram = word;
+ String ngram = "";
for (int i = 1; i < NGRAM_N; i++) {
// are there actually enough words to compute this metric?
if (index - i >= 0) {
// increase ngram prefix
- ngram += " " + words[index - i];
+ if (ngram.isEmpty()) {
+ ngram = words[index - i];
+ } else {
+ ngram = words[index - i] + " " + ngram;
+ }
// Obtain n-gram probs and combine using interpolation.
igram_p = cr.getNgramProbability(word, ngram);