From 5508b2fa5c5b25d97a441aae305cb2f858307d55 Mon Sep 17 00:00:00 2001
From: Peter Wu <peter@lekensteyn.nl>
Date: Thu, 2 Apr 2015 17:31:46 +0200
Subject: Implement generic ngram counting and probability calculation

TODO improve smoothing.
---
 spellchecker/src/CorpusReader.java   | 44 ++++++++++++++++++++++++++++----
 spellchecker/src/SpellCorrector.java | 49 ++++++++++++++++++++----------------
 2 files changed, 66 insertions(+), 27 deletions(-)
diff --git a/spellchecker/src/CorpusReader.java b/spellchecker/src/CorpusReader.java
index 686f243..2ad7e85 100644
--- a/spellchecker/src/CorpusReader.java
+++ b/spellchecker/src/CorpusReader.java
@@ -15,7 +15,7 @@ public class CorpusReader {
 
     private HashMap<String, Integer> ngrams;
     private Set<String> vocabulary;
-    private int unigramCount = 0;
+    private int wordCount = 0;
 
     public CorpusReader() throws IOException {
         readNGrams();
@@ -57,9 +57,9 @@ public class CorpusReader {
             try {
                 count = Integer.parseInt(s1);
                 ngrams.put(s2, count);
-                // unigram
+                // Count total number of words in the data set
                 if (s2.indexOf(' ') == -1) {
-                    unigramCount += count;
+                    wordCount += count;
                 }
             } catch (NumberFormatException nfe) {
                 throw new NumberFormatException("NumberformatError: " + s1);
@@ -129,7 +129,41 @@ public class CorpusReader {
         return smoothedCount;
     }
 
-    public int getUnigramCount() {
-        return unigramCount;
+    /**
+     * Computes the probability P(word|ngram).
+     *
+     * @param word
+     * @param ngram
+     * @return
+     */
+    public double getNgramProbability(String word, String ngram) {
+        double a, b;
+        // special case: unigram has no prior ngram
+        if (ngram.isEmpty()) {
+            a = getNGramCount(word);
+            b = wordCount;
+
+            // apply add-1 smoothing under the assumption that there are many
+            // unigrams and this does not significantly affect the chance,
+            // it just ensures that it is non-zero.
+            return (a + 1) / (b + 1);
+        } else {
+            // other ngram cases
+            a = getNGramCount(ngram + " " + word);
+            b = getNGramCount(ngram);
+
+            // apply smoothing, but add a smaller number because "b" is
+            // typically very small.
+            // TODO: Kneser-Ney smoothing?
+            return (a + .001) / (b + 1);
+        }
+    }
+
+    /**
+     * Returns the number of words in the corpus text (based on counting
+     * unigrams).
+     */
+    public double getWordCount() {
+        return wordCount;
     }
 }
diff --git a/spellchecker/src/SpellCorrector.java b/spellchecker/src/SpellCorrector.java
index bdfc8e1..971e91e 100644
--- a/spellchecker/src/SpellCorrector.java
+++ b/spellchecker/src/SpellCorrector.java
@@ -9,6 +9,11 @@ public class SpellCorrector {
 
     final static char[] ALPHABET = "abcdefghijklmnopqrstuvwxyz'".toCharArray();
 
+    /**
+     * The highest n-gram to look for. 1 gives a unigram, 2 gives a bigram, etc.
+     */
+    private final static int NGRAM_N = 2;
+
     /**
      * Lambda values for interpolation of n-gram probabilities. The first value
      * is for unigrams, the second for bigrams, etc.
@@ -255,8 +260,7 @@ public class SpellCorrector {
          */
         public double getWordLikelihood(int index, String word,
                 double channel_probability) {
-            String prev_word, ngram;
-            double prior, score, p, p_uni, p_bi;
+            double prior, score;
             // a suggested word not in the vocabulary is certainly wrong,
             // changed (or consequentive) words should also not be changed.
             if (!cr.inVocabulary(word) || words_readonly[index]) {
@@ -266,33 +270,34 @@ public class SpellCorrector {
             assert channel_probability > 0.0;
 
             // P(x|w) is given by language model (noisy channel probability).
-            // Find prior P(w) = (freq(w) + .5) / N.
+            // Find prior P(w) = (freq(w) + .5) / N (N is number of words).
             // Then compute candidate score by P(w) * P(x|w)
-            prior = (cr.getNGramCount(word) + .5) / cr.getUnigramCount();
+            prior = (cr.getNGramCount(word) + .5) / cr.getWordCount();
             score = prior * channel_probability;
 
-            // unigram probability is computed by P(w) = #w / N (no smoothing).
-            p_uni = cr.getSmoothedCount(word) / cr.getUnigramCount();
-
-            // Add probability of bi-grams.
-            // For words u and w, P(w|u) = P(u, w) / P(u).
-            if (index > 0) {
-                prev_word = words[index - 1];
-                ngram = prev_word + " " + word;
-                p_bi = cr.getSmoothedCount(ngram) / cr.getSmoothedCount(prev_word);
-                //System.err.println("W: " + word + " " + score + " " + ngram + " |" + words[index]);
-            } else {
-                // no previous word, assume likely.
-                p_bi = 1;
+            // compute unigrams
+            double p = LAMBDAS[0] * cr.getNgramProbability(word, "");
+            // compute bigrams, etc.
+            String ngram = word;
+            for (int i = 1; i < NGRAM_N; i++) {
+                // are there actually enough words to compute this metric?
+                if (index - i >= 0) {
+                    // increase ngram prefix
+                    ngram += " " + words[index - i];
+
+                    // Obtain n-gram probs and combine using interpolation.
+                    p += LAMBDAS[i] * cr.getNgramProbability(word, ngram);
+                } else {
+                    // no metrics found, cannot deduce much information from it
+                    p += LAMBDAS[i] * .5;
+                }
             }
 
-            // Now obtain n-gram probabilities. Use interpolation to combine
-            // unigrams and bigrams.
-            p = LAMBDAS[0] * p_uni + LAMBDAS[1] * p_bi;
-            p *= score;
+            // finally add the score
             if (DEBUG_SCORE && (word.equals("he") || word.equals("hme") || word.equals("home"))) {
-                System.err.println(word + " p=" + p + " score=" + score + " uni=" + p_uni + " bi=" + p_bi);
+                System.err.println(word + " p=" + (p * score) + " score=" + score + " ngram=" + p);
             }
+            p *= score;
             assert p > 0.0 : "failed probability for " + word;
             return p;
         }
-- 
cgit v1.2.1