summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPeter Wu <peter@lekensteyn.nl>2015-04-02 17:31:46 +0200
committerPeter Wu <peter@lekensteyn.nl>2015-04-02 17:31:46 +0200
commit5508b2fa5c5b25d97a441aae305cb2f858307d55 (patch)
tree907eda8aa1bed9386c0e149271a4597efa329a28
parent719cdadfdee023fe742ba76cf5b404602519497d (diff)
downloadassignment4-5508b2fa5c5b25d97a441aae305cb2f858307d55.tar.gz
Implement generic ngram counting and probability calculation
TODO improve smoothing.
-rw-r--r--spellchecker/src/CorpusReader.java44
-rw-r--r--spellchecker/src/SpellCorrector.java49
2 files changed, 66 insertions, 27 deletions
diff --git a/spellchecker/src/CorpusReader.java b/spellchecker/src/CorpusReader.java
index 686f243..2ad7e85 100644
--- a/spellchecker/src/CorpusReader.java
+++ b/spellchecker/src/CorpusReader.java
@@ -15,7 +15,7 @@ public class CorpusReader {
private HashMap<String, Integer> ngrams;
private Set<String> vocabulary;
- private int unigramCount = 0;
+ private int wordCount = 0;
public CorpusReader() throws IOException {
readNGrams();
@@ -57,9 +57,9 @@ public class CorpusReader {
try {
count = Integer.parseInt(s1);
ngrams.put(s2, count);
- // unigram
+ // Count total number of words in the data set
if (s2.indexOf(' ') == -1) {
- unigramCount += count;
+ wordCount += count;
}
} catch (NumberFormatException nfe) {
throw new NumberFormatException("NumberformatError: " + s1);
@@ -129,7 +129,41 @@ public class CorpusReader {
return smoothedCount;
}
- public int getUnigramCount() {
- return unigramCount;
+ /**
+ * Computes the probability P(word|ngram).
+ *
+ * @param word
+ * @param ngram
+ * @return
+ */
+ public double getNgramProbability(String word, String ngram) {
+ double a, b;
+ // special case: unigram has no prior ngram
+ if (ngram.isEmpty()) {
+ a = getNGramCount(word);
+ b = wordCount;
+
+ // apply add-1 smoothing under the assumption that there are many
+ // unigrams and this does not significantly affect the chance,
+ // it just ensures that it is non-zero.
+ return (a + 1) / (b + 1);
+ } else {
+ // other ngram cases
+ a = getNGramCount(ngram + " " + word);
+ b = getNGramCount(ngram);
+
+ // apply smoothing, but add a smaller number because "b" is
+ // typically very small.
+ // TODO: Kneser-Ney smoothing?
+ return (a + .001) / (b + 1);
+ }
+ }
+
+ /**
+ * Returns the number of words in the corpus text (based on counting
+ * unigrams).
+ */
+ public double getWordCount() {
+ return wordCount;
}
}
diff --git a/spellchecker/src/SpellCorrector.java b/spellchecker/src/SpellCorrector.java
index bdfc8e1..971e91e 100644
--- a/spellchecker/src/SpellCorrector.java
+++ b/spellchecker/src/SpellCorrector.java
@@ -10,6 +10,11 @@ public class SpellCorrector {
final static char[] ALPHABET = "abcdefghijklmnopqrstuvwxyz'".toCharArray();
/**
+ * The highest n-gram to look for. 1 gives a unigram, 2 gives a bigram, etc.
+ */
+ private final static int NGRAM_N = 2;
+
+ /**
* Lambda values for interpolation of n-gram probabilities. The first value
* is for unigrams, the second for bigrams, etc.
*/
@@ -255,8 +260,7 @@ public class SpellCorrector {
*/
public double getWordLikelihood(int index, String word,
double channel_probability) {
- String prev_word, ngram;
- double prior, score, p, p_uni, p_bi;
+ double prior, score;
// a suggested word not in the vocabulary is certainly wrong,
// changed (or consecutive) words should also not be changed.
if (!cr.inVocabulary(word) || words_readonly[index]) {
@@ -266,33 +270,34 @@ public class SpellCorrector {
assert channel_probability > 0.0;
// P(x|w) is given by language model (noisy channel probability).
- // Find prior P(w) = (freq(w) + .5) / N.
+ // Find prior P(w) = (freq(w) + .5) / N (N is number of words).
// Then compute candidate score by P(w) * P(x|w)
- prior = (cr.getNGramCount(word) + .5) / cr.getUnigramCount();
+ prior = (cr.getNGramCount(word) + .5) / cr.getWordCount();
score = prior * channel_probability;
- // unigram probability is computed by P(w) = #w / N (no smoothing).
- p_uni = cr.getSmoothedCount(word) / cr.getUnigramCount();
-
- // Add probability of bi-grams.
- // For words u and w, P(w|u) = P(u, w) / P(u).
- if (index > 0) {
- prev_word = words[index - 1];
- ngram = prev_word + " " + word;
- p_bi = cr.getSmoothedCount(ngram) / cr.getSmoothedCount(prev_word);
- //System.err.println("W: " + word + " " + score + " " + ngram + " |" + words[index]);
- } else {
- // no previous word, assume likely.
- p_bi = 1;
+ // compute unigrams
+ double p = LAMBDAS[0] * cr.getNgramProbability(word, "");
+ // compute bigrams, etc.
+ String ngram = word;
+ for (int i = 1; i < NGRAM_N; i++) {
+ // are there actually enough words to compute this metric?
+ if (index - i >= 0) {
+ // increase ngram prefix
+ ngram += " " + words[index - i];
+
+ // Obtain n-gram probs and combine using interpolation.
+ p += LAMBDAS[i] * cr.getNgramProbability(word, ngram);
+ } else {
+ // no metrics found, cannot deduce much information from it
+ p += LAMBDAS[i] * .5;
+ }
}
- // Now obtain n-gram probabilities. Use interpolation to combine
- // unigrams and bigrams.
- p = LAMBDAS[0] * p_uni + LAMBDAS[1] * p_bi;
- p *= score;
+ // finally add the score
if (DEBUG_SCORE && (word.equals("he") || word.equals("hme") || word.equals("home"))) {
- System.err.println(word + " p=" + p + " score=" + score + " uni=" + p_uni + " bi=" + p_bi);
+ System.err.println(word + " p=" + (p * score) + " score=" + score + " ngram=" + p);
}
+ p *= score;
assert p > 0.0 : "failed probability for " + word;
return p;
}