From 08f26fa9e0c3b679f654e3cad244327901ee7ee9 Mon Sep 17 00:00:00 2001
From: Jim Martens
Date: Thu, 4 Apr 2019 17:21:29 +0200
Subject: [PATCH] Extracted preparation of MNIST data into separate file

Signed-off-by: Jim Martens
---
 src/twomartens/masterthesis/aae/data.py  | 102 +++++++++++++++++++++++
 src/twomartens/masterthesis/aae/train.py |  82 +-----------------
 2 files changed, 103 insertions(+), 81 deletions(-)
 create mode 100644 src/twomartens/masterthesis/aae/data.py

diff --git a/src/twomartens/masterthesis/aae/data.py b/src/twomartens/masterthesis/aae/data.py
new file mode 100644
index 0000000..72cb05d
--- /dev/null
+++ b/src/twomartens/masterthesis/aae/data.py
@@ -0,0 +1,102 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright 2019 Jim Martens
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pickle
+from typing import Sequence
+from typing import Tuple
+
+import numpy as np
+import tensorflow as tf
+
+K = tf.keras.backend
+
+
+def prepare_training_data(test_fold_id: int,
+                          inlier_classes: Sequence[int],
+                          total_classes: int,
+                          fold_prefix: str = 'data/data_fold_',
+                          batch_size: int = 128,
+                          folds: int = 5) -> Tuple[tf.data.Dataset, tf.data.Dataset]:
+    """
+    Prepares the MNIST training data.
+
+    Args:
+        test_fold_id: id of test fold
+        inlier_classes: list of class ids that are considered inliers
+        total_classes: total number of classes
+        fold_prefix: the prefix for the fold pickle files (default: 'data/data_fold_')
+        batch_size: size of batch (default: 128)
+        folds: number of folds (default: 5)
+
+    Returns:
+        A tuple (train dataset, valid dataset)
+    """
+    # prepare data
+    mnist_train = []
+    mnist_valid = []
+
+    for i in range(folds):
+        if i != test_fold_id:  # exclude testing fold, representing 20% of each class
+            with open(f"{fold_prefix}{i:d}.pkl", 'rb') as pkl:
+                fold = pickle.load(pkl)
+            if len(mnist_valid) == 0:  # single out one fold, comprising 20% of each class
+                mnist_valid = fold
+            else:  # form train set from remaining folds, comprising 60% of each class
+                mnist_train += fold
+
+    outlier_classes = []
+    for i in range(total_classes):
+        if i not in inlier_classes:
+            outlier_classes.append(i)
+
+    # keep only train classes
+    mnist_train = [x for x in mnist_train if x[0] in inlier_classes]
+
+    def _list_of_pairs_to_numpy(list_of_pairs: Sequence[Tuple[int, np.ndarray]]) -> Tuple[np.ndarray, np.ndarray]:
+        """
+        Converts a list of pairs to a numpy array.
+ + Args: + list_of_pairs: list of pairs + + Returns: + tuple (feature array, label array) + """ + return np.asarray([x[1] for x in list_of_pairs], np.float32), np.asarray([x[0] for x in list_of_pairs], np.int) + + mnist_train_x, mnist_train_y = _list_of_pairs_to_numpy(mnist_train) + mnist_valid_x, mnist_valid_y = _list_of_pairs_to_numpy(mnist_valid) + + # get dataset + train_dataset = tf.data.Dataset.from_tensor_slices((mnist_train_x, mnist_train_y)) + train_dataset = train_dataset.shuffle(mnist_train_x.shape[0]).batch(batch_size, + drop_remainder=True).map(_normalize) + valid_dataset = tf.data.Dataset.from_tensor_slices((mnist_valid_x, mnist_valid_y)) + valid_dataset = valid_dataset.shuffle(mnist_valid_x.shape[0]).batch(batch_size, + drop_remainder=True).map(_normalize) + + return train_dataset, valid_dataset + + +def _normalize(feature: tf.Tensor, label: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]: + """ + Normalizes a tensor from a 0-255 range to a 0-1 range and adds one dimension. + + :param feature: tensor to be normalized + :param label: label tensor + :return: normalized tensor + """ + return K.expand_dims(tf.divide(feature, 255.0)), label diff --git a/src/twomartens/masterthesis/aae/train.py b/src/twomartens/masterthesis/aae/train.py index cfe31c0..7bfa1ad 100644 --- a/src/twomartens/masterthesis/aae/train.py +++ b/src/twomartens/masterthesis/aae/train.py @@ -29,13 +29,10 @@ Functions: """ import os -import pickle import time from typing import Dict -from typing import Sequence from typing import Tuple -import numpy as np import tensorflow as tf from tensorflow.python.ops import summary_ops_v2 @@ -49,73 +46,6 @@ tfe = tf.contrib.eager LOG_FREQUENCY: int = 10 -def prepare_training_data(test_fold_id: int, - inlier_classes: Sequence[int], - total_classes: int, - fold_prefix: str = 'data/data_fold_', - batch_size: int = 128, - folds: int = 5) -> Tuple[tf.data.Dataset, tf.data.Dataset]: - """ - Prepares the MNIST training data. 
- - Args: - test_fold_id: id of test fold - inlier_classes: list of class ids that are considered inliers - total_classes: total number of classes - fold_prefix: the prefix for the fold pickle files (default: 'data/data_fold_') - batch_size: size of batch (default: 128) - folds: number of folds (default: 5) - - Returns: - A tuple (train dataset, valid dataset) - """ - # prepare data - mnist_train = [] - mnist_valid = [] - - for i in range(folds): - if i != test_fold_id: # exclude testing fold, representing 20% of each class - with open(f"{fold_prefix}{i:d}.pkl", 'rb') as pkl: - fold = pickle.load(pkl) - if len(mnist_valid) == 0: # single out one fold, comprising 20% of each class - mnist_valid = fold - else: # form train set from remaining folds, comprising 60% of each class - mnist_train += fold - - outlier_classes = [] - for i in range(total_classes): - if i not in inlier_classes: - outlier_classes.append(i) - - # keep only train classes - mnist_train = [x for x in mnist_train if x[0] in inlier_classes] - - def _list_of_pairs_to_numpy(list_of_pairs: Sequence[Tuple[int, np.ndarray]]) -> Tuple[np.ndarray, np.ndarray]: - """ - Converts a list of pairs to a numpy array. 
- - Args: - list_of_pairs: list of pairs - - Returns: - tuple (feature array, label array) - """ - return np.asarray([x[1] for x in list_of_pairs], np.float32), np.asarray([x[0] for x in list_of_pairs], np.int) - - mnist_train_x, mnist_train_y = _list_of_pairs_to_numpy(mnist_train) - mnist_valid_x, mnist_valid_y = _list_of_pairs_to_numpy(mnist_valid) - - # get dataset - train_dataset = tf.data.Dataset.from_tensor_slices((mnist_train_x, mnist_train_y)) - train_dataset = train_dataset.shuffle(mnist_train_x.shape[0]).batch(batch_size, - drop_remainder=True).map(_normalize) - valid_dataset = tf.data.Dataset.from_tensor_slices((mnist_valid_x, mnist_valid_y)) - valid_dataset = valid_dataset.shuffle(mnist_valid_x.shape[0]).batch(batch_size, - drop_remainder=True).map(_normalize) - - return train_dataset, valid_dataset - - def train_simple(dataset: tf.data.Dataset, iteration: int, weights_prefix: str, @@ -323,18 +253,8 @@ def _train_enc_dec_step_simple(encoder: model.Encoder, decoder: model.Decoder, return reconstruction_loss, x_decoded -def _normalize(feature: tf.Tensor, label: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]: - """ - Normalizes a tensor from a 0-255 range to a 0-1 range and adds one dimension. - - :param feature: tensor to be normalized - :param label: label tensor - :return: normalized tensor - """ - return K.expand_dims(tf.divide(feature, 255.0)), label - - if __name__ == "__main__": + from twomartens.masterthesis.aae.data import prepare_training_data tf.enable_eager_execution() inlier_classes = [3] iteration = 1