summaryrefslogtreecommitdiff
path: root/spellchecker/src/CorpusReader.java
diff options
context:
space:
mode:
Diffstat (limited to 'spellchecker/src/CorpusReader.java')
-rw-r--r--spellchecker/src/CorpusReader.java44
1 files changed, 39 insertions, 5 deletions
diff --git a/spellchecker/src/CorpusReader.java b/spellchecker/src/CorpusReader.java
index 686f243..2ad7e85 100644
--- a/spellchecker/src/CorpusReader.java
+++ b/spellchecker/src/CorpusReader.java
@@ -15,7 +15,7 @@ public class CorpusReader {
private HashMap<String, Integer> ngrams;
private Set<String> vocabulary;
- private int unigramCount = 0;
+ private int wordCount = 0;
public CorpusReader() throws IOException {
readNGrams();
@@ -57,9 +57,9 @@ public class CorpusReader {
try {
count = Integer.parseInt(s1);
ngrams.put(s2, count);
- // unigram
+ // Count total number of words in the data set
if (s2.indexOf(' ') == -1) {
- unigramCount += count;
+ wordCount += count;
}
} catch (NumberFormatException nfe) {
throw new NumberFormatException("NumberformatError: " + s1);
@@ -129,7 +129,41 @@ public class CorpusReader {
return smoothedCount;
}
- public int getUnigramCount() {
- return unigramCount;
+ /**
+ * Computes a smoothed estimate of the probability P(word|ngram).
+ *
+ * @param word the word whose probability is estimated
+ * @param ngram the preceding n-gram; pass an empty string for the unigram case
+ * @return the smoothed ratio of the two counts; see the branches below
+ */
+ public double getNgramProbability(String word, String ngram) {
+ double a, b;
+ // special case: unigram has no prior ngram
+ if (ngram.isEmpty()) {
+ a = getNGramCount(word);
+ b = wordCount;
+
+ // apply add-1 smoothing under the assumption that there are many
+ // unigrams and this does not significantly affect the chance,
+ // it just ensures that it is non-zero.
+ return (a + 1) / (b + 1);
+ } else {
+ // other ngram cases: count of "ngram word" relative to count of "ngram"
+ a = getNGramCount(ngram + " " + word);
+ b = getNGramCount(ngram);
+
+ // apply smoothing, but add a smaller number to the numerator because
+ // "b" is typically very small; the +1 also avoids dividing by zero.
+ // TODO: Kneser-Ney smoothing?
+ return (a + .001) / (b + 1);
+ }
+ }
+
+ /**
+ * Returns the total word count of the corpus: the summed counts of all
+ * entries without a space (unigrams), widened from int to double.
+ */
+ public double getWordCount() {
+ return wordCount;
}
}