Extracted preparation of MNIST data into separate file

Signed-off-by: Jim Martens <github@2martens.de>
This commit is contained in:
2019-04-04 17:21:29 +02:00
parent 403577da0a
commit 08f26fa9e0
2 changed files with 103 additions and 81 deletions

View File

@ -0,0 +1,102 @@
# -*- coding: utf-8 -*-
#
# Copyright 2019 Jim Martens
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pickle
from typing import Sequence
from typing import Tuple
import numpy as np
import tensorflow as tf
K = tf.keras.backend
def prepare_training_data(test_fold_id: int,
inlier_classes: Sequence[int],
total_classes: int,
fold_prefix: str = 'data/data_fold_',
batch_size: int = 128,
folds: int = 5) -> Tuple[tf.data.Dataset, tf.data.Dataset]:
"""
Prepares the MNIST training data.
Args:
test_fold_id: id of test fold
inlier_classes: list of class ids that are considered inliers
total_classes: total number of classes
fold_prefix: the prefix for the fold pickle files (default: 'data/data_fold_')
batch_size: size of batch (default: 128)
folds: number of folds (default: 5)
Returns:
A tuple (train dataset, valid dataset)
"""
# prepare data
mnist_train = []
mnist_valid = []
for i in range(folds):
if i != test_fold_id: # exclude testing fold, representing 20% of each class
with open(f"{fold_prefix}{i:d}.pkl", 'rb') as pkl:
fold = pickle.load(pkl)
if len(mnist_valid) == 0: # single out one fold, comprising 20% of each class
mnist_valid = fold
else: # form train set from remaining folds, comprising 60% of each class
mnist_train += fold
outlier_classes = []
for i in range(total_classes):
if i not in inlier_classes:
outlier_classes.append(i)
# keep only train classes
mnist_train = [x for x in mnist_train if x[0] in inlier_classes]
def _list_of_pairs_to_numpy(list_of_pairs: Sequence[Tuple[int, np.ndarray]]) -> Tuple[np.ndarray, np.ndarray]:
"""
Converts a list of pairs to a numpy array.
Args:
list_of_pairs: list of pairs
Returns:
tuple (feature array, label array)
"""
return np.asarray([x[1] for x in list_of_pairs], np.float32), np.asarray([x[0] for x in list_of_pairs], np.int)
mnist_train_x, mnist_train_y = _list_of_pairs_to_numpy(mnist_train)
mnist_valid_x, mnist_valid_y = _list_of_pairs_to_numpy(mnist_valid)
# get dataset
train_dataset = tf.data.Dataset.from_tensor_slices((mnist_train_x, mnist_train_y))
train_dataset = train_dataset.shuffle(mnist_train_x.shape[0]).batch(batch_size,
drop_remainder=True).map(_normalize)
valid_dataset = tf.data.Dataset.from_tensor_slices((mnist_valid_x, mnist_valid_y))
valid_dataset = valid_dataset.shuffle(mnist_valid_x.shape[0]).batch(batch_size,
drop_remainder=True).map(_normalize)
return train_dataset, valid_dataset
def _normalize(feature: tf.Tensor, label: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
"""
Normalizes a tensor from a 0-255 range to a 0-1 range and adds one dimension.
:param feature: tensor to be normalized
:param label: label tensor
:return: normalized tensor
"""
return K.expand_dims(tf.divide(feature, 255.0)), label

View File

@ -29,13 +29,10 @@ Functions:
""" """
import os import os
import pickle
import time import time
from typing import Dict from typing import Dict
from typing import Sequence
from typing import Tuple from typing import Tuple
import numpy as np
import tensorflow as tf import tensorflow as tf
from tensorflow.python.ops import summary_ops_v2 from tensorflow.python.ops import summary_ops_v2
@ -49,73 +46,6 @@ tfe = tf.contrib.eager
LOG_FREQUENCY: int = 10 LOG_FREQUENCY: int = 10
def prepare_training_data(test_fold_id: int,
inlier_classes: Sequence[int],
total_classes: int,
fold_prefix: str = 'data/data_fold_',
batch_size: int = 128,
folds: int = 5) -> Tuple[tf.data.Dataset, tf.data.Dataset]:
"""
Prepares the MNIST training data.
Args:
test_fold_id: id of test fold
inlier_classes: list of class ids that are considered inliers
total_classes: total number of classes
fold_prefix: the prefix for the fold pickle files (default: 'data/data_fold_')
batch_size: size of batch (default: 128)
folds: number of folds (default: 5)
Returns:
A tuple (train dataset, valid dataset)
"""
# prepare data
mnist_train = []
mnist_valid = []
for i in range(folds):
if i != test_fold_id: # exclude testing fold, representing 20% of each class
with open(f"{fold_prefix}{i:d}.pkl", 'rb') as pkl:
fold = pickle.load(pkl)
if len(mnist_valid) == 0: # single out one fold, comprising 20% of each class
mnist_valid = fold
else: # form train set from remaining folds, comprising 60% of each class
mnist_train += fold
outlier_classes = []
for i in range(total_classes):
if i not in inlier_classes:
outlier_classes.append(i)
# keep only train classes
mnist_train = [x for x in mnist_train if x[0] in inlier_classes]
def _list_of_pairs_to_numpy(list_of_pairs: Sequence[Tuple[int, np.ndarray]]) -> Tuple[np.ndarray, np.ndarray]:
"""
Converts a list of pairs to a numpy array.
Args:
list_of_pairs: list of pairs
Returns:
tuple (feature array, label array)
"""
return np.asarray([x[1] for x in list_of_pairs], np.float32), np.asarray([x[0] for x in list_of_pairs], np.int)
mnist_train_x, mnist_train_y = _list_of_pairs_to_numpy(mnist_train)
mnist_valid_x, mnist_valid_y = _list_of_pairs_to_numpy(mnist_valid)
# get dataset
train_dataset = tf.data.Dataset.from_tensor_slices((mnist_train_x, mnist_train_y))
train_dataset = train_dataset.shuffle(mnist_train_x.shape[0]).batch(batch_size,
drop_remainder=True).map(_normalize)
valid_dataset = tf.data.Dataset.from_tensor_slices((mnist_valid_x, mnist_valid_y))
valid_dataset = valid_dataset.shuffle(mnist_valid_x.shape[0]).batch(batch_size,
drop_remainder=True).map(_normalize)
return train_dataset, valid_dataset
def train_simple(dataset: tf.data.Dataset, def train_simple(dataset: tf.data.Dataset,
iteration: int, iteration: int,
weights_prefix: str, weights_prefix: str,
@ -323,18 +253,8 @@ def _train_enc_dec_step_simple(encoder: model.Encoder, decoder: model.Decoder,
return reconstruction_loss, x_decoded return reconstruction_loss, x_decoded
def _normalize(feature: tf.Tensor, label: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
"""
Normalizes a tensor from a 0-255 range to a 0-1 range and adds one dimension.
:param feature: tensor to be normalized
:param label: label tensor
:return: normalized tensor
"""
return K.expand_dims(tf.divide(feature, 255.0)), label
if __name__ == "__main__": if __name__ == "__main__":
from twomartens.masterthesis.aae.data import prepare_training_data
tf.enable_eager_execution() tf.enable_eager_execution()
inlier_classes = [3] inlier_classes = [3]
iteration = 1 iteration = 1