From b9070693776c448b542c5f7d7f4a0882a89fbab0 Mon Sep 17 00:00:00 2001
From: Peter Wu <peter@lekensteyn.nl>
Date: Mon, 16 Jan 2017 07:35:02 +0100
Subject: NFA: finish 2

---
 part2.tex | 184 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 182 insertions(+), 2 deletions(-)

diff --git a/part2.tex b/part2.tex
index 9d925ed..4c5d6de 100644
--- a/part2.tex
+++ b/part2.tex
@@ -290,10 +290,189 @@ for which the only non-empty words in $L(M)$ of length $<$ 5 are $aa$, $bb$,
 $aba$, $baa$, $abab$, $babb$ and $bbba$.
 
 \subsection*{Solution:}
+Model the NFA as $M = \{Q, \Sigma, \Delta, q_0, F\}$ where $Q=\{q_0, \ldots,
+q_{m-1}\}$ is the set of states, $\Sigma$ is the alphabet, $\Delta$ is the
+transition function $\Delta: Q \times \Sigma \to P(Q)$, $q_0 \in Q$ is the
+initial state and $F \subseteq Q$ is the set of final (accepting) states. The
+list of accepted words is $L(M)$.
+Let $S_{i,j}$ be the mapping to a state in $Q$ from character index $0 \leq j$
+of word $w_i \in L(M)$.
+
+For the given scenario, we also define \emph{bad} words (words with a length
+smaller than five, but not in the accepted language):
+\begin{align}
+    W' = \{ w |
+        w \in \Sigma^*
+        \land 1 \leq \#w \leq 4
+        \land w \not\in L(M)
+    \}
+\end{align}
+
+To solve the problem, we first try to find a NFA for a $m$ states and then
+gradually decrement it until we obtain \texttt{unsat}. An overview of our
+approach (a more detailed description will follow):
+\begin{enumerate}
+\item For each valid word $w_i$ in $L(M)$:
+    \begin{enumerate}
+        \item Add new states for each letter $j$ of the word $w_i$: $S_{i,j}$.
+            Note that these states use different variables ($S$) than the states
+            of the NFA ($Q$).
+        \item Add new transitions to the formula, starting from the common
+            initial state $q_0$. The next states are $S_{i,0}$, $S_{i,1}$, etc.
+            This expresses that the input $w_i$ is indeed accepted by $M$.
+        \item Add a constraint to the formula to express that the last letter
+            ends up in a final state.
+    \end{enumerate}
+\item For every \emph{bad} word, add an implication to the formula: if
+    the word exists, then the last state of this word must not be accepting.
+\item Finally map each of the states $S_{i,j}$ to a state in $Q$ via
+    conjunctions (for each word) consisting of disjunctions (for each state
+    $S_{i,j}$) that try to match with any state in $Q$.
+\end{enumerate}
+
+First we require that every non-empty word $w_i$ is indeed accepted as input: a
+path exists from the initial state to the final state, following each letter one
+by one:
+\begin{align}
+    \bigwedge_{w_i \in L(M)}
+    \Delta(q_0, w_{i,0}) = S_{i,0}
+    \land
+    (\bigwedge_{j=1}^{\#w_i}
+    \Delta(S_{i,j-1}, w_{i,j-1}) = S_{i,j})
+    \land
+    S_{i,\#w_i} \in F
+\end{align}
+
+The above expression assigns values to $S_{i,j}$, making it easier to put
+restrictions on the ``characters'' of each word (for example, requiring that the
+word originates from the initial state $q_0$ and that the last state is
+accepting). To make it more useful, it should be mapped somehow to $Q$ so we can
+obtain the desired NFA. For simplicity, let's assume that each character can be
+assigned to any state in $Q$:
+\begin{align}
+    \bigwedge_{w_i \in L(M)}
+    \bigwedge_{j=0}^{\#w_i}
+    \bigvee_{q \in Q} S_{i,j} = q
+\end{align}
+Note that this includes all possible states in $Q$, including the initial state
+$q_0$. Being the initial state does not exclude the state from receiving
+incoming transitions.
+
+Finally we have to exclude transitions, otherwise even a single, initial and
+final state would be permitted (with transitions to itself). Since loops are
+possible and transitions to any earlier state can occur, it is not sufficient
+to (for example) just exclude input that differ only in the trailing character.
+Therefore we check all possible \emph{bad words} $W'$.
+\begin{align}
+    \bigwedge_{w_i \in W'}
+        \Delta(q_0, w_{i,0}) = T_{i,0}
+        \land
+        (\bigwedge_{j=1}^{\#w_i}
+        \Delta(T_{i,j-1}, w_{i,j-1}) = T_{i,j})
+    \implies
+    T_{i,\#w_i} \not\in F
+\end{align}
+
+\textbf{Implementation notes}:
+we combined the above three ingredients into a conjunction and implemented it
+via a Python script that generates a SMT. To allow for concise SMT without
+exploding in the number of variables, we used functions. These have no limits in
+their domain or range so we have to add manual checks for these. For the states
+$S_{i,j}$ that match good words and for states $T_{i,j}$ that match bad words,
+we implemented range checks like $T_{i,j} \in Q$.
+(Actually, we set $q_i = i$ for $0 \leq i < m$, so the above range check simply
+checks whether the values are within that range.)
+These checks are either added to the global formula (for the good words) or
+before the implication (for the bad words).
+
+The generated SMT program has a shape which is shown in Listing~\ref{lst:nfa}.
+It runs within a second, even for $m=1000$. By gradually reducing it as stated
+before, we found that for $m=7$ no automaton was found (\texttt{unsat}) but for
+$m=8$, a valid result was obtained (see Figure~\ref{fig:nfa}). Note that the Z3
+output (Listing~\ref{lst:nfaZ3}) contains out-of-range and out-of-domain values
+because the $trans$ ($\Delta$) function for example is used with arbitrary
+values before the implication. When an implication evaluates to false, then the
+solver still keeps the garbage values even if it remains unused (its exact value
+does not influence the result of the evaluation).
+
+\begin{lstlisting}[
+    caption={Generated SMT program for NFA with $m=8$.},
+    label={lst:nfa},basicstyle=\scriptsize,language=lisp
+]
+(benchmark test.smt
+:logic QF_UFLIA
+:extrafuns (
+(q0 Int) (q1 Int) (q2 Int) (q3 Int) (q4 Int) (q5 Int) (q6 Int) (q7 Int)
+(s0_0 Int) (s0_1 Int) (s1_0 Int) (s1_1 Int) (s2_0 Int) (s2_1 Int) (s2_2 Int)
+(s3_0 Int) (s3_1 Int) (s3_2 Int) (s4_0 Int) (s4_1 Int) (s4_2 Int) (s4_3 Int)
+(s5_0 Int) (s5_1 Int) (s5_2 Int) (s5_3 Int) (s6_0 Int) (s6_1 Int) (s6_2 Int)
+(s6_3 Int)
+(trans Int Int Int)
+(final Int Bool)
+)
+:formula (and
+(= q0 0) (= q1 1) (= q2 2) (= q3 3) (= q4 4) (= q5 5) (= q6 6) (= q7 7)
+(= (trans q0 0) s0_0)
+(= (trans s0_0 0) s0_1)
+(final s0_1)
+...
+(= (trans q0 1) s6_0)
+(= (trans s6_0 1) s6_1)
+(= (trans s6_1 1) s6_2)
+(= (trans s6_2 0) s6_3)
+(final s6_3)
+(=> (and (>= (trans q0 0) 0) (< (trans q0 0) 8)) (not (final (trans q0 0))))
+(=> (and (>= (trans q0 1) 0) (< (trans q0 1) 8)) (not (final (trans q0 1))))
+(=> (and (>= (trans          q0  0) 0) (< (trans          q0  0) 8)
+         (>= (trans (trans q0 0) 1) 0) (< (trans (trans q0 0) 1) 8))
+    (not (final (trans (trans q0 0) 1))))
+...
+(or (= q0 s0_0) (= q1 s0_0) ... (= q7 s0_0))
+...
+(or (= q0 s6_3) (= q1 s6_3) ... (= q7 s6_3))
+))
+\end{lstlisting}
 
+\begin{minipage}{.35\textwidth}
+\begin{lstlisting}[
+    caption={Output of Z3 for the generated SMT in Listing~\ref{lst:nfa}.},
+    label={lst:nfaZ3},basicstyle=\scriptsize,language=lisp
+]
+sat
+s6_3 -> 5
+s6_2 -> 1
+...
+
+s0_0 -> 4
+q7 -> 7
+q6 -> 6
+...
+q0 -> 0
+trans -> {
+  0 0 -> 4
+  4 0 -> 5
+...
+  3 0 -> (- 5921)
+  (- 1) 0 -> (- 8946)
+...
+  else -> 5
+}
+final -> {
+  2 -> true
+  5 -> true
+...
+  1 -> false
+...
+  (- 1324) -> false
+  else -> false
+}
+\end{lstlisting}
+\end{minipage}
+\hfill
+\begin{minipage}{.55\textwidth}
 \begin{figure}[H]
 \centering
-\begin{tikzpicture}[>=latex,line join=bevel,scale=.8]
+\begin{tikzpicture}[>=latex,line join=bevel,scale=.6]
 %%
 \node (q1) at (349.84bp,107.75bp) [draw,circle] {q1};
   \node (q0) at (63.348bp,142.75bp) [draw,circle] {q0};
@@ -331,8 +510,9 @@ $aba$, $baa$, $abab$, $babb$ and $bbba$.
   \draw (200.59bp,68.248bp) node {b};
 %
 \end{tikzpicture}
+\caption{\label{fig:nfa}Resulting NFA for $m=8$.}
 \end{figure}
-
+\end{minipage}
 
 \section*{Problem: }
 The goal of this problem is to exploit the power of the recommended tools rather
-- 
cgit v1.2.1