summaryrefslogtreecommitdiff
path: root/spellchecker/src/SpellCorrector.java
blob: 8ebb82995ed64e5df633585a55a3237312da2d98 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94

import java.util.HashSet;

public class SpellCorrector {

    final private CorpusReader cr;
    final private ConfusionMatrixReader cmr;

    final static char[] ALPHABET = "abcdefghijklmnopqrstuvwxyz'".toCharArray();

    public SpellCorrector(CorpusReader cr, ConfusionMatrixReader cmr) {
        this.cr = cr;
        this.cmr = cmr;
    }

    public String correctPhrase(String phrase) {
        if (phrase == null || phrase.length() == 0) {
            throw new IllegalArgumentException("phrase must be non-empty.");
        }

        String[] words = phrase.split(" ");
        String finalSuggestion = "";

        /**
         * CODE TO BE ADDED *
         */
        return finalSuggestion.trim();
    }

    public double calculateChannelModelProbability(String suggested, String incorrect) {
        /**
         * CODE TO BE ADDED *
         */

        return 0.0;
    }

    /**
     * Gets all candidate words, resulting from a single addition, deletion.
     * substitution or transposition.
     *
     * @param word The word to find candidates for.
     * @return A set of words with candidate words.
     */
    public HashSet<String> getCandidateWords(String word) {
        HashSet<String> ListOfWords = new HashSet<>();

        // generate words by insertion of a character
        for (int i = 0; i <= word.length(); i++) {
            // the word is split into [0..i] [i..n]
            // if i == word.length, then the last part is empty
            String head = word.substring(0, i);
            String tail = i < word.length() ? word.substring(i) : "";
            for (char c : ALPHABET) {
                // insertion of a single character
                ListOfWords.add(head + c + tail);
            }
        }

        for (int i = 0; i < word.length(); i++) {
            // the word is split into [0..i] [i..i+1] [i+1..n]
            // if i == word.length() - 1, then the tail is empty.
            String head = word.substring(0, i);
            String tail = word.substring(i + 1);

            for (char c : ALPHABET) {
                // substitution
                ListOfWords.add(head + c + tail);
            }

            // deletion of a single character (prevent adding empty words)
            if (word.length() > 1) {
                ListOfWords.add(head + tail);
            }
        }

        // The Damerau-Levenshtein distance also includes transposition to
        // account for spelling mistakes. This loop transposes the previous
        // character with the current index.
        for (int i = 1; i < word.length(); i++) {
            // the word is split into [0..i-1] [i-1..i] [i..i+1] [i+1..n]
            // if i == word.length() - 1, then the tail is empty.
            String head = word.substring(0, i - 1);
            char c1 = word.charAt(i - 1);
            char c2 = word.charAt(i);
            String tail = i + 1 < word.length() ? word.substring(i + 1) : "";

            // transposition of two adjacent letters
            ListOfWords.add(head + c2 + c1 + tail);
        }

        return cr.inVocabulary(ListOfWords);
    }
}