From 023487875139e3e8acd52298979b7dd903d70ed5 Mon Sep 17 00:00:00 2001 From: Peter Wu Date: Wed, 1 Apr 2015 09:20:19 +0200 Subject: Rate multi n-grams higher --- spellchecker/src/CorpusReader.java | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/spellchecker/src/CorpusReader.java b/spellchecker/src/CorpusReader.java index f815cfd..f9df1ca 100644 --- a/spellchecker/src/CorpusReader.java +++ b/spellchecker/src/CorpusReader.java @@ -113,10 +113,22 @@ public class CorpusReader { throw new IllegalArgumentException("NGram must be non-empty."); } - double smoothedCount = 0.0; - - // simplest smoothing implementation: plus 1. - smoothedCount = getNGramCount(NGram) + 1; + double smoothedCount = getNGramCount(NGram); + + int n_words = NGram.split(" ").length + 1; + switch (n_words) { + case 1: // unigram + smoothedCount += 1.0; + break; + case 2: // bigram + smoothedCount += 2.0; + break; + case 3: // trigram + smoothedCount += 4.0; + break; + default: + throw new AssertionError("Unknown n-gram with n=" + n_words); + } return smoothedCount; } -- cgit v1.2.1