From c42186671c9808a7192504f5556c48718607f7d3 Mon Sep 17 00:00:00 2001
From: Peter Wu
Date: Wed, 1 Apr 2015 09:19:46 +0200
Subject: Count unigrams

---
 spellchecker/src/CorpusReader.java | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/spellchecker/src/CorpusReader.java b/spellchecker/src/CorpusReader.java
index 536a41b..f815cfd 100644
--- a/spellchecker/src/CorpusReader.java
+++ b/spellchecker/src/CorpusReader.java
@@ -15,6 +15,7 @@ public class CorpusReader {
 
     private HashMap ngrams;
     private Set vocabulary;
+    private int unigramCount = 0;
 
     public CorpusReader() throws IOException {
         readNGrams();
@@ -56,6 +57,10 @@ public class CorpusReader {
                 try {
                     count = Integer.parseInt(s1);
                     ngrams.put(s2, count);
+                    // unigram
+                    if (s2.indexOf(' ') == -1) {
+                        unigramCount += count;
+                    }
                 } catch (NumberFormatException nfe) {
                     throw new NumberFormatException("NumberformatError: " + s1);
                 }
@@ -115,4 +120,8 @@ public class CorpusReader {
 
         return smoothedCount;
     }
+
+    public int getUnigramCount() {
+        return unigramCount;
+    }
 }
-- 
cgit v1.2.1