diff options
author | Peter Wu <peter@lekensteyn.nl> | 2015-04-01 09:20:19 +0200 |
---|---|---|
committer | Peter Wu <peter@lekensteyn.nl> | 2015-04-01 09:20:19 +0200 |
commit | 023487875139e3e8acd52298979b7dd903d70ed5 (patch) | |
tree | 5f89f0c26bbc17651226751a564f1ad8ad77b516 | |
parent | 815478980f4a881d1efbbd6b7d2441ec47376c7d (diff) | |
download | assignment4-023487875139e3e8acd52298979b7dd903d70ed5.tar.gz |
Rate multi n-grams higher
-rw-r--r-- | spellchecker/src/CorpusReader.java | 20 |
1 file changed, 16 insertions, 4 deletions
```diff
diff --git a/spellchecker/src/CorpusReader.java b/spellchecker/src/CorpusReader.java
index f815cfd..f9df1ca 100644
--- a/spellchecker/src/CorpusReader.java
+++ b/spellchecker/src/CorpusReader.java
@@ -113,10 +113,22 @@ public class CorpusReader {
             throw new IllegalArgumentException("NGram must be non-empty.");
         }
 
-        double smoothedCount = 0.0;
-
-        // simplest smoothing implementation: plus 1.
-        smoothedCount = getNGramCount(NGram) + 1;
+        double smoothedCount = getNGramCount(NGram);
+
+        int n_words = NGram.split(" ").length + 1;
+        switch (n_words) {
+        case 1: // unigram
+            smoothedCount += 1.0;
+            break;
+        case 2: // bigram
+            smoothedCount += 2.0;
+            break;
+        case 3: // trigram
+            smoothedCount += 4.0;
+            break;
+        default:
+            throw new AssertionError("Unknown n-gram with n=" + n_words);
+        }
 
         return smoothedCount;
     }
```