blob: f815cfd7914658c9a1d3dde73a2c822743738cb9 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
|
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;
public class CorpusReader {
final static String CNTFILE_LOC = "samplecnt.txt";
final static String VOCFILE_LOC = "samplevoc.txt";
private HashMap<String, Integer> ngrams;
private Set<String> vocabulary;
private int unigramCount = 0;
public CorpusReader() throws IOException {
readNGrams();
readVocabulary();
}
/**
* Returns the n-gram count of <NGram> in the file
*
*
* @param nGram : space-separated list of words, e.g. "adopted by him"
* @return 0 if <NGram> cannot be found, otherwise count of <NGram> in file
*/
public int getNGramCount(String nGram) throws NumberFormatException {
if (nGram == null || nGram.length() == 0) {
throw new IllegalArgumentException("NGram must be non-empty.");
}
Integer value = ngrams.get(nGram);
return value == null ? 0 : value;
}
private void readNGrams() throws
FileNotFoundException, IOException, NumberFormatException {
ngrams = new HashMap<>();
FileInputStream fis;
fis = new FileInputStream(CNTFILE_LOC);
BufferedReader in = new BufferedReader(new InputStreamReader(fis));
while (in.ready()) {
String phrase = in.readLine().trim();
String s1, s2;
int j = phrase.indexOf(" ");
s1 = phrase.substring(0, j);
s2 = phrase.substring(j + 1, phrase.length());
int count = 0;
try {
count = Integer.parseInt(s1);
ngrams.put(s2, count);
// unigram
if (s2.indexOf(' ') == -1) {
unigramCount += count;
}
} catch (NumberFormatException nfe) {
throw new NumberFormatException("NumberformatError: " + s1);
}
}
}
private void readVocabulary() throws FileNotFoundException, IOException {
vocabulary = new HashSet<>();
FileInputStream fis = new FileInputStream(VOCFILE_LOC);
BufferedReader in = new BufferedReader(new InputStreamReader(fis));
while (in.ready()) {
String line = in.readLine();
vocabulary.add(line);
}
}
/**
* Returns the size of the number of unique words in the dataset
*
* @return the size of the number of unique words in the dataset
*/
public int getVocabularySize() {
return vocabulary.size();
}
/**
* Returns the subset of words in set that are in the vocabulary
*
* @param set
* @return
*/
public HashSet<String> inVocabulary(Set<String> set) {
HashSet<String> h = new HashSet<>(set);
h.retainAll(vocabulary);
return h;
}
public boolean inVocabulary(String word) {
return vocabulary.contains(word);
}
/**
* Returns a smoothed value based on the number of occurrences of the n-gram
* in the corpus.
*/
public double getSmoothedCount(String NGram) {
if (NGram == null || NGram.length() == 0) {
throw new IllegalArgumentException("NGram must be non-empty.");
}
double smoothedCount = 0.0;
// simplest smoothing implementation: plus 1.
smoothedCount = getNGramCount(NGram) + 1;
return smoothedCount;
}
public int getUnigramCount() {
return unigramCount;
}
}
|