From 8a9dacb5e1ee825e9e26692e93c1d20e0acd6a3e Mon Sep 17 00:00:00 2001
From: Jim Martens
Date: Thu, 5 Sep 2019 16:11:43 +0200
Subject: [PATCH] Added results tables and text about vanilla SSD results

Signed-off-by: Jim Martens
---
 body.tex | 76 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 75 insertions(+), 1 deletion(-)

diff --git a/body.tex b/body.tex
index 70a907c..d729a91 100644
--- a/body.tex
+++ b/body.tex
@@ -550,6 +550,9 @@ increases linearly with more forward passes.
 These detections have to be decoded first. Afterwards, all detections are
 thrown away which do not pass a confidence threshold for the class with the
 highest prediction probability.
+Additionally, all detections with a winning prediction of background are discarded.
+However, to test the impact of this step, an alternative has been implemented as well:
+only detections with a background prediction of 0.8 or higher are discarded.
 The remaining detections are partitioned into observations to further reduce
 the size of the output, and to identify uncertainty. This is accomplished by
 calculating the
@@ -643,7 +646,7 @@ to vanilla SSD with 0.01 confidence threshold; this comparison
 investigates the effect of the per class confidence threshold on the object
 detection performance.
-Bayesian SSD was run with 0.5 confidence threshold and compared
+Bayesian SSD was run with 0.2 confidence threshold and compared
 to vanilla SSD with 0.2 confidence threshold.
 Coupled with the entropy threshold, this comparison shows how uncertain the
 network is. If it is very certain the dropout sampling should have no
@@ -655,6 +658,77 @@ from 0.1 to 2.5 as specified in Miller et al.~\cite{Miller2018}.
 
 \section{Results}
 
+Results in this section are presented both for micro and macro averaging.
+In macro averaging, for example, the precision values of each class are added
+up and then divided by the number of classes. Conversely, for micro averaging
+the precision is calculated across all classes directly. Each method has a
+specific effect: macro averaging weights every class equally, while micro
+averaging weights every detection equally. Both yield largely identical results
+when the classes are balanced, that is, when every class has about the same
+number of detections. In the case of a class imbalance, however, macro
+averaging favours classes with few detections, whereas micro averaging favours
+classes with many detections.
+
+In both cases, vanilla SSD with a per-class confidence threshold of 0.2
+performs best (see tables \ref{tab:results-micro} and \ref{tab:results-macro}):
+it reaches a maximum \(F_1\) score of 0.376/0.375 (micro/macro throughout),
+compared to both vanilla SSD with a per-class threshold of 0.01 (0.255/0.370)
+and vanilla SSD with entropy thresholding (0.255/0.370).
+It also has the fewest open set errors (2939/1218 versus 3176/1426 and 3168/1373)
+and the best recall (0.382/0.338 versus 0.214/0.328 and 0.214/0.329).
+Under micro averaging it achieves the best precision as well (0.372 versus
+0.318 and 0.318); under macro averaging, vanilla SSD with entropy test has the
+best precision (0.425 versus 0.424 and 0.424).
+This shows that a higher per-class confidence threshold removes many bad
+detections and therefore improves the end result considerably. These
+comparisons also show that the network is not very uncertain: the
+best-performing entropy threshold is no better than the corresponding vanilla
+SSD without an entropy threshold. Therefore, the per-class confidence threshold
+is far more important for the result in this case.
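+
+% Hedged sketch: the notation \(TP_c\), \(FP_c\), and \(C\) below is assumed
+% rather than taken from the surrounding text; the formulas only restate the
+% verbal description of the two averaging schemes given at the start of this
+% section.
+Formally, with per-class true positives \(TP_c\) and false positives \(FP_c\)
+over \(C\) classes, the two averaged precision values are computed as
+\[
+  \mathrm{Prec}_{\mathrm{macro}} = \frac{1}{C}\sum_{c=1}^{C}\frac{TP_c}{TP_c + FP_c},
+  \qquad
+  \mathrm{Prec}_{\mathrm{micro}} = \frac{\sum_{c=1}^{C} TP_c}{\sum_{c=1}^{C}\left(TP_c + FP_c\right)}.
+\]
+Recall and the \(F_1\) score are averaged analogously.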
+
+\begin{table}
+  \begin{tabular}{rcccc}
+    \hline
+    Method & max & abs OSE & Recall & Precision\\
+    (Forward Passes) & \(F_1\) Score & \multicolumn{3}{c}{at max \(F_1\) point} \\
+    \hline
+    vanilla SSD - 0.01 conf & 0.255 & 3176 & 0.214 & 0.318 \\
+    vanilla SSD - 0.2 conf & \textbf{0.376} & \textbf{2939} & \textbf{0.382} & \textbf{0.372} \\
+    SSD with Entropy test - 0.01 conf & 0.255 & 3168 & 0.214 & 0.318 \\
+    % entropy thresh: 2.4 for vanilla SSD is best
+    \hline
+    Bayesian SSD - no bg - 0.2 conf \; 10 & 0.003 & 2145 & 0.005 & 0.002 \\
+    no bg \(>\) 0.8 conf - 0.2 conf \; 10 & 0.003 & 151 & 0.004 & 0.003 \\
+    % entropy thresh: 1.2 for Bayesian - 2 is best, 0.4 for 3
+    \hline
+  \end{tabular}
+  \caption{Results for micro averaging. SSD with Entropy test and Bayesian SSD are reported with
+  their best-performing entropy threshold: vanilla SSD with Entropy test performed best with an
+  entropy threshold of 2.4, Bayesian SSD with all background detections discarded performed best
+  with 1.2, and Bayesian SSD that discards only background predictions of 0.8 or higher performed
+  best with 0.4.}
+  \label{tab:results-micro}
+\end{table}
+
+\begin{table}
+  \begin{tabular}{rcccc}
+    \hline
+    Method & max & abs OSE & Recall & Precision\\
+    (Forward Passes) & \(F_1\) Score & \multicolumn{3}{c}{at max \(F_1\) point} \\
+    \hline
+    vanilla SSD - 0.01 conf & 0.370 & 1426 & 0.328 & 0.424 \\
+    vanilla SSD - 0.2 conf & \textbf{0.375} & \textbf{1218} & \textbf{0.338} & 0.424 \\
+    SSD with Entropy test - 0.01 conf & 0.370 & 1373 & 0.329 & \textbf{0.425} \\
+    % entropy thresh: 1.7 for vanilla SSD is best
+    \hline
+    Bayesian SSD - no bg - 0.2 conf \; 10 & 0.002 & 1784 & 0.005 & 0.002 \\
+    no bg \(>\) 0.8 conf - 0.2 conf \; 10 & 0.002 & 122 & 0.003 & 0.002 \\
+    % entropy thresh: 1.2 for Bayesian - 2 is best, 0.4 for 3
+    \hline
+  \end{tabular}
+  \caption{Results for macro averaging. SSD with Entropy test and Bayesian SSD are reported with
+  their best-performing entropy threshold: vanilla SSD with Entropy test performed best with an
+  entropy threshold of 1.7, Bayesian SSD with all background detections discarded performed best
+  with 1.2, and Bayesian SSD that discards only background predictions of 0.8 or higher performed
+  best with 0.4.}
+  \label{tab:results-macro}
+\end{table}
+
 \chapter{Discussion}
 \label{chap:discussion}