From e8f5721b7955e83562b8364b2717b9ef0983fac4 Mon Sep 17 00:00:00 2001 From: Peter Wu Date: Sat, 4 Apr 2015 16:16:17 +0200 Subject: Extend report --- report.pdf | Bin 142274 -> 175996 bytes report.tex | 112 ++++++++++++++++++++++++++++++----- spellchecker/src/SpellCorrector.java | 4 ++ 3 files changed, 101 insertions(+), 15 deletions(-) diff --git a/report.pdf b/report.pdf index 0e62ce1..a757392 100644 Binary files a/report.pdf and b/report.pdf differ diff --git a/report.tex b/report.tex index 84aef14..14f982d 100644 --- a/report.tex +++ b/report.tex @@ -9,6 +9,8 @@ \usepackage{listings} \usepackage{xcolor} \usepackage{color,courier} +\usepackage{etoolbox} +\patchcmd{\thebibliography}{\section*}{\section}{}{} \definecolor{dkgreen}{rgb}{0,0.55,0} \definecolor{gray}{rgb}{0.5,0.5,0.5} @@ -82,8 +84,8 @@ listing below). Once a rating is found which is higher than the initial one, the word is replaced. When a word was modified, the hunt for more faulty words continues until all of -them (less than three) are found. Otherwise, it is assumed that the sentence -has no more spelling errors. +them (at most \texttt{MAX\_TYPOS} which is 2 by default) are found. Otherwise, +it is assumed that the sentence has no more spelling errors. Pseudo-code: @@ -218,13 +220,14 @@ The channel model probability $P(x|w)$ is obtained via $getCandidateWords()$ while the language model probability $P(w)$ depends on the frequency of prior n-grams from a corpus. The combination gives the word score $P(x|w) * P(w)$. -The prior n-grams are combined by interpolation with lambda conditionals on -context \cite{interpolation}. -Intuitively bigrams (or trigrams) are more precise than unigrams because these -are more specific and use more details from the context. However, there are also -less bigrams and even less trigrams to get information about. This could cause a -skew in the results, so while it should be weighted more than unigrams, the -lambda weight parameter should not get too high. 
+Initially we used interpolation with lambda conditionals on context
+\cite{interpolation} to combine the n-gram probabilities, but this gave bad
+results because it put too much weight on high unigram probabilities. So
+instead the language model probability is now the result of combining the
+n-gram probabilities by simple multiplication. The weight of bigrams is not
+changed now. Since there are fewer bigrams in the corpus, they will naturally be
+ranked higher than unigrams. Note that the algorithm can also be scaled to use
+trigrams, fourgrams, etc.\ just by modifying the $NGRAM\_N$ parameter.
 
 Due to the usage of n-grams, the score of a sentence as a whole must be
 considered. This is the case because the $n - 1$ words before and after a word
@@ -235,17 +238,96 @@ words which are certainly incorrect (not in the vocabulary) should not be
 ranked with 0, but with a very small number (such as $10^{-99}$). This ensures
 that words with at least two errors have the chance to be seen as a better
 result.
 
-\section{Results and Explanation}
+Pseudo-code to summarize the evaluation:
+\begin{lstlisting}[language=python]
+def evaluateSentence(words, correctedWordIndex):
+    p = 1
+    # Simply multiply all word probabilities to obtain a sentence rating
+    for wordIndex, word in enumerate(words):
+        p *= getWordLikelihood(words, wordIndex, correctedWordIndex)
+    return p
+
+def getWordLikelihood(words, wordIndex, correctedWordIndex):
+    word = words[wordIndex]
+
+    if wordIndex == correctedWordIndex:
+        # If this is the possibly misspelled word, use the channel
+        # probability to compare multiple candidates.
+        p = channelProbs[word]
+    else:
+        # For other words, assume that they are correct and use a
+        # high probability
+        p = LM_PROBABILITY_UNMODIFIED
+
+    # Calculate P(w), or, how often does it occur in the corpus
+    # Apply add-1 smoothing under the assumption that there are many
+    # unigrams and this does not significantly affect the chance,
+    # it just ensures that it is non-zero.
+    p *= (getNGramCount(word) + 1) / (unigramCount + 1)
+
+    # multiply by n-gram probabilities
+    for n in range(1, NGRAM\_N):
+        # calculate P(word|prefix)
+        p *= ngramProbability(word, words[wordIndex-n..wordIndex])
+
+    return p
+
+# Calculates P(word|prefix) = #word / #(prefix word)
+def ngramProbability(word, prefix):
+    a = getNGramCount(word)
+    b = getNGramCount(prefix " " word)
+
+    # apply smoothing, but add a smaller number because "b" is
+    # typically very small.
+    return (a + 0.001) / (b + 1)
+\end{lstlisting}
 
-\section{Additional Resources}
+\section{Steps to reproduce}
+\begin{enumerate}
+\item Obtain the source code of the SpellChecker project.
+\item Run \texttt{ant run} from the project directory to compile and execute the
+program.
+\item At the \texttt{run:} step, type the sentence to be corrected and press
+Enter. This should have at most \texttt{MAX\_TYPOS} errors (defaulting to two,
+but changeable in the source code of \texttt{SpellCorrector.java}).
+\item The corrected sentence is printed as \texttt{Answer: }.
+\end{enumerate}
+
+A mode exists where you can keep inputting lines and for each line, an
+answer will be displayed. This can be achieved by setting the environment
+variable \texttt{NO\_PEACH} prior to executing \texttt{ant run}.
+
+\section{Results and Explanation}
+The algorithm can be tuned with the following parameters:
+\begin{description}
+\item[NGRAM\_N] The largest $n$ (as in $n$-gram) that should be checked for.
+This depends on the input file \textit{samplecnt.txt} (which is also dependent
+on \textit{samplevoc.txt}).
+\item[MAX\_TYPOS] This parameter is set to 2 for the task, but it could be
+increased for other situations. The worst-case running time will exponentially
+increase with this parameter.
+%\item[LM\_PROBABILITY\_UNMODIFIED] Words that possibly have an error are
+%influenced by the channel probability. For other words (those that are not being
+%considered for correction), this factor can be .... (I think that the factor
+%can be removed from the code since it only decreases the probability...)
+% todo you can set it to just 1... it does not matter since we are just trying
+% to find the best correction for a word
+\end{description}
+
+% todo include results, it looks good but needs to be extended!!!
+The program was tested against the \textit{test-sentences.txt} dataset for which
+it only failed to correct \textit{the development of diabetes \textbf{u}s
+present in mice that \textbf{h}arry a transgen(\textbf{e})} (errors or additions
+are emphasized).
+% add reason!
+
+With the initial ngram text file, it failed to correct \textit{kind retards} to
+\textit{kind regards}.
+% add reason!
 
 \section{Statement of the Contributions}
 Peter wrote most of the Spell Checker implementation and report.
 
-\section{Manual}
-
-\section{References}
-
 \begin{thebibliography}{9}
 \bibitem{kernighan}
 Kernighan et al.
diff --git a/spellchecker/src/SpellCorrector.java b/spellchecker/src/SpellCorrector.java
index f3e504d..875213f 100644
--- a/spellchecker/src/SpellCorrector.java
+++ b/spellchecker/src/SpellCorrector.java
@@ -13,6 +13,7 @@ public class SpellCorrector {
 
     /**
      * The highest n-gram to look for. 1 gives a unigram, 2 gives a bigram, etc.
+     * Be sure that the CorpusReader actually provides that many details!
      */
     private final static int NGRAM_N = 2;
 
@@ -335,6 +336,9 @@ public class SpellCorrector {
             }
             p *= score;
         }
+        // this can happen if probability_non_word or
+        // LM_PROBABILITY_UNMODIFIED are badly chosen (too small).
+ assert p > 0 : "The combined probabilities were too small"; return p; } -- cgit v1.2.1