From 08f26fa9e0c3b679f654e3cad244327901ee7ee9 Mon Sep 17 00:00:00 2001
From: Jim Martens
Date: Thu, 4 Apr 2019 17:21:29 +0200
Subject: [PATCH] Extracted preparation of MNIST data into separate file

Signed-off-by: Jim Martens
---
 src/twomartens/masterthesis/aae/data.py  | 102 +++++++++++++++++++++++
 src/twomartens/masterthesis/aae/train.py |  82 +-----------------
 2 files changed, 103 insertions(+), 81 deletions(-)
 create mode 100644 src/twomartens/masterthesis/aae/data.py

diff --git a/src/twomartens/masterthesis/aae/data.py b/src/twomartens/masterthesis/aae/data.py
new file mode 100644
index 0000000..72cb05d
--- /dev/null
+++ b/src/twomartens/masterthesis/aae/data.py
@@ -0,0 +1,102 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright 2019 Jim Martens
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pickle
+from typing import Sequence
+from typing import Tuple
+
+import numpy as np
+import tensorflow as tf
+
+K = tf.keras.backend
+
+
+def prepare_training_data(test_fold_id: int,
+                          inlier_classes: Sequence[int],
+                          total_classes: int,
+                          fold_prefix: str = 'data/data_fold_',
+                          batch_size: int = 128,
+                          folds: int = 5) -> Tuple[tf.data.Dataset, tf.data.Dataset]:
+    """
+    Prepares the MNIST training data.
+
+    Args:
+        test_fold_id: id of test fold
+        inlier_classes: list of class ids that are considered inliers
+        total_classes: total number of classes
+        fold_prefix: the prefix for the fold pickle files (default: 'data/data_fold_')
+        batch_size: size of batch (default: 128)
+        folds: number of folds (default: 5)
+
+    Returns:
+        A tuple (train dataset, valid dataset)
+    """
+    # prepare data
+    mnist_train = []
+    mnist_valid = []
+
+    for i in range(folds):
+        if i != test_fold_id:  # exclude testing fold, representing 20% of each class
+            with open(f"{fold_prefix}{i:d}.pkl", 'rb') as pkl:
+                fold = pickle.load(pkl)
+            if len(mnist_valid) == 0:  # single out one fold, comprising 20% of each class
+                mnist_valid = fold
+            else:  # form train set from remaining folds, comprising 60% of each class
+                mnist_train += fold
+
+    outlier_classes = []
+    for i in range(total_classes):
+        if i not in inlier_classes:
+            outlier_classes.append(i)
+
+    # keep only train classes
+    mnist_train = [x for x in mnist_train if x[0] in inlier_classes]
+
+    def _list_of_pairs_to_numpy(list_of_pairs: Sequence[Tuple[int, np.ndarray]]) -> Tuple[np.ndarray, np.ndarray]:
+        """
+        Converts a list of pairs to a numpy array.
+ + Args: + list_of_pairs: list of pairs + + Returns: + tuple (feature array, label array) + """ + return np.asarray([x[1] for x in list_of_pairs], np.float32), np.asarray([x[0] for x in list_of_pairs], np.int) + + mnist_train_x, mnist_train_y = _list_of_pairs_to_numpy(mnist_train) + mnist_valid_x, mnist_valid_y = _list_of_pairs_to_numpy(mnist_valid) + + # get dataset + train_dataset = tf.data.Dataset.from_tensor_slices((mnist_train_x, mnist_train_y)) + train_dataset = train_dataset.shuffle(mnist_train_x.shape[0]).batch(batch_size, + drop_remainder=True).map(_normalize) + valid_dataset = tf.data.Dataset.from_tensor_slices((mnist_valid_x, mnist_valid_y)) + valid_dataset = valid_dataset.shuffle(mnist_valid_x.shape[0]).batch(batch_size, + drop_remainder=True).map(_normalize) + + return train_dataset, valid_dataset + + +def _normalize(feature: tf.Tensor, label: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]: + """ + Normalizes a tensor from a 0-255 range to a 0-1 range and adds one dimension. + + :param feature: tensor to be normalized + :param label: label tensor + :return: normalized tensor + """ + return K.expand_dims(tf.divide(feature, 255.0)), label diff --git a/src/twomartens/masterthesis/aae/train.py b/src/twomartens/masterthesis/aae/train.py index cfe31c0..7bfa1ad 100644 --- a/src/twomartens/masterthesis/aae/train.py +++ b/src/twomartens/masterthesis/aae/train.py @@ -29,13 +29,10 @@ Functions: """ import os -import pickle import time from typing import Dict -from typing import Sequence from typing import Tuple -import numpy as np import tensorflow as tf from tensorflow.python.ops import summary_ops_v2 @@ -49,73 +46,6 @@ tfe = tf.contrib.eager LOG_FREQUENCY: int = 10 -def prepare_training_data(test_fold_id: int, - inlier_classes: Sequence[int], - total_classes: int, - fold_prefix: str = 'data/data_fold_', - batch_size: int = 128, - folds: int = 5) -> Tuple[tf.data.Dataset, tf.data.Dataset]: - """ - Prepares the MNIST training data. 
- - Args: - test_fold_id: id of test fold - inlier_classes: list of class ids that are considered inliers - total_classes: total number of classes - fold_prefix: the prefix for the fold pickle files (default: 'data/data_fold_') - batch_size: size of batch (default: 128) - folds: number of folds (default: 5) - - Returns: - A tuple (train dataset, valid dataset) - """ - # prepare data - mnist_train = [] - mnist_valid = [] - - for i in range(folds): - if i != test_fold_id: # exclude testing fold, representing 20% of each class - with open(f"{fold_prefix}{i:d}.pkl", 'rb') as pkl: - fold = pickle.load(pkl) - if len(mnist_valid) == 0: # single out one fold, comprising 20% of each class - mnist_valid = fold - else: # form train set from remaining folds, comprising 60% of each class - mnist_train += fold - - outlier_classes = [] - for i in range(total_classes): - if i not in inlier_classes: - outlier_classes.append(i) - - # keep only train classes - mnist_train = [x for x in mnist_train if x[0] in inlier_classes] - - def _list_of_pairs_to_numpy(list_of_pairs: Sequence[Tuple[int, np.ndarray]]) -> Tuple[np.ndarray, np.ndarray]: - """ - Converts a list of pairs to a numpy array. 
- - Args: - list_of_pairs: list of pairs - - Returns: - tuple (feature array, label array) - """ - return np.asarray([x[1] for x in list_of_pairs], np.float32), np.asarray([x[0] for x in list_of_pairs], np.int) - - mnist_train_x, mnist_train_y = _list_of_pairs_to_numpy(mnist_train) - mnist_valid_x, mnist_valid_y = _list_of_pairs_to_numpy(mnist_valid) - - # get dataset - train_dataset = tf.data.Dataset.from_tensor_slices((mnist_train_x, mnist_train_y)) - train_dataset = train_dataset.shuffle(mnist_train_x.shape[0]).batch(batch_size, - drop_remainder=True).map(_normalize) - valid_dataset = tf.data.Dataset.from_tensor_slices((mnist_valid_x, mnist_valid_y)) - valid_dataset = valid_dataset.shuffle(mnist_valid_x.shape[0]).batch(batch_size, - drop_remainder=True).map(_normalize) - - return train_dataset, valid_dataset - - def train_simple(dataset: tf.data.Dataset, iteration: int, weights_prefix: str, @@ -323,18 +253,8 @@ def _train_enc_dec_step_simple(encoder: model.Encoder, decoder: model.Decoder, return reconstruction_loss, x_decoded -def _normalize(feature: tf.Tensor, label: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]: - """ - Normalizes a tensor from a 0-255 range to a 0-1 range and adds one dimension. - - :param feature: tensor to be normalized - :param label: label tensor - :return: normalized tensor - """ - return K.expand_dims(tf.divide(feature, 255.0)), label - - if __name__ == "__main__": + from twomartens.masterthesis.aae.data import prepare_training_data tf.enable_eager_execution() inlier_classes = [3] iteration = 1