1 files changed, 39 insertions, 5 deletions
diff --git a/spellchecker/src/CorpusReader.java b/spellchecker/src/CorpusReader.java
index 686f243..2ad7e85 100644
--- a/spellchecker/src/CorpusReader.java
+++ b/spellchecker/src/CorpusReader.java
@@ -15,7 +15,7 @@ public class CorpusReader {
 
     private HashMap<String, Integer> ngrams;
     private Set<String> vocabulary;
-    private int unigramCount = 0;
+    private int wordCount = 0;
 
     public CorpusReader() throws IOException {
         readNGrams();
@@ -57,9 +57,9 @@ public class CorpusReader {
             try {
                 count = Integer.parseInt(s1);
                 ngrams.put(s2, count);
-                // unigram
+                // Count total number of words in the data set
                 if (s2.indexOf(' ') == -1) {
-                    unigramCount += count;
+                    wordCount += count;
                 }
             } catch (NumberFormatException nfe) {
                 throw new NumberFormatException("NumberformatError: " + s1);
@@ -129,7 +129,41 @@ public class CorpusReader {
         return smoothedCount;
     }
 
-    public int getUnigramCount() {
-        return unigramCount;
+    /**
+     * Estimates a smoothed conditional probability P(word|ngram) from counts.
+     *
+     * @param word the word whose probability is estimated
+     * @param ngram the preceding context; an empty string selects the unigram case
+     * @return the ratio (count + constant) / (context count + 1), see below
+     */
+    public double getNgramProbability(String word, String ngram) {
+        double a, b;
+        // unigram case: no context, so relate the word count to wordCount
+        if (ngram.isEmpty()) {
+            a = getNGramCount(word);
+            b = wordCount;
+
+            // Add one to numerator and denominator so an unseen word does not
+            // yield zero (assuming getNGramCount is non-negative - confirm).
+            // NOTE(review): textbook add-one would divide by b + vocabulary size.
+            return (a + 1) / (b + 1);
+        } else {
+            // context case: count of "ngram word" relative to count of "ngram"
+            a = getNGramCount(ngram + " " + word);
+            b = getNGramCount(ngram);
+
+            // Add only .001 to the numerator because "b" is typically very
+            // small. NOTE(review): result looks unnormalised over the vocabulary.
+            // TODO: Kneser-Ney smoothing?
+            return (a + .001) / (b + 1);
+        }
+    }
+
+    /**
+     * Returns the summed counts of all space-free (unigram) corpus entries,
+     * i.e. the int field wordCount widened to a double.
+     */
+    public double getWordCount() {
+        return wordCount;
     }
 }