Extracted preparation of MNIST data into separate file
Signed-off-by: Jim Martens <github@2martens.de>
This commit is contained in:
102
src/twomartens/masterthesis/aae/data.py
Normal file
102
src/twomartens/masterthesis/aae/data.py
Normal file
@ -0,0 +1,102 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
#
|
||||||
|
# Copyright 2019 Jim Martens
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import pickle
|
||||||
|
from typing import Sequence
|
||||||
|
from typing import Tuple
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import tensorflow as tf
|
||||||
|
|
||||||
|
K = tf.keras.backend
|
||||||
|
|
||||||
|
|
||||||
|
def prepare_training_data(test_fold_id: int,
                          inlier_classes: Sequence[int],
                          total_classes: int,
                          fold_prefix: str = 'data/data_fold_',
                          batch_size: int = 128,
                          folds: int = 5) -> Tuple[tf.data.Dataset, tf.data.Dataset]:
    """
    Prepares the MNIST training data.

    Loads the pickled data folds from disk, excludes the test fold, holds the
    first remaining fold out as the validation set, keeps only inlier classes
    in the training set, and wraps both sets in shuffled, batched, normalized
    ``tf.data`` pipelines.

    Args:
        test_fold_id: id of test fold (excluded entirely here)
        inlier_classes: list of class ids that are considered inliers
        total_classes: total number of classes
        fold_prefix: the prefix for the fold pickle files (default: 'data/data_fold_')
        batch_size: size of batch (default: 128)
        folds: number of folds (default: 5)

    Returns:
        A tuple (train dataset, valid dataset)
    """
    # prepare data
    mnist_train = []
    mnist_valid = []

    for i in range(folds):
        if i != test_fold_id:  # exclude testing fold, representing 20% of each class
            with open(f"{fold_prefix}{i:d}.pkl", 'rb') as pkl:
                fold = pickle.load(pkl)
                if not mnist_valid:  # single out one fold, comprising 20% of each class
                    mnist_valid = fold
                else:  # form train set from remaining folds, comprising 60% of each class
                    mnist_train += fold

    # NOTE(review): the original also built an `outlier_classes` list here
    # (all ids in range(total_classes) not in inlier_classes) but never used
    # it; removed as dead code. `total_classes` is kept in the signature for
    # backward compatibility.

    # keep only train classes
    mnist_train = [x for x in mnist_train if x[0] in inlier_classes]

    def _list_of_pairs_to_numpy(list_of_pairs: Sequence[Tuple[int, np.ndarray]]) -> Tuple[np.ndarray, np.ndarray]:
        """
        Converts a list of (label, image) pairs to a pair of numpy arrays.

        Args:
            list_of_pairs: list of (label, image) pairs

        Returns:
            tuple (feature array, label array)
        """
        # `np.int` was deprecated in NumPy 1.20 and removed in 1.24; the
        # builtin `int` yields the same platform-default integer dtype.
        return (np.asarray([x[1] for x in list_of_pairs], np.float32),
                np.asarray([x[0] for x in list_of_pairs], int))

    mnist_train_x, mnist_train_y = _list_of_pairs_to_numpy(mnist_train)
    mnist_valid_x, mnist_valid_y = _list_of_pairs_to_numpy(mnist_valid)

    # get dataset; shuffle buffer covers the whole set for a full shuffle,
    # partial final batches are dropped so all batches have a static shape
    train_dataset = tf.data.Dataset.from_tensor_slices((mnist_train_x, mnist_train_y))
    train_dataset = train_dataset.shuffle(mnist_train_x.shape[0]).batch(batch_size,
                                                                        drop_remainder=True).map(_normalize)
    valid_dataset = tf.data.Dataset.from_tensor_slices((mnist_valid_x, mnist_valid_y))
    valid_dataset = valid_dataset.shuffle(mnist_valid_x.shape[0]).batch(batch_size,
                                                                        drop_remainder=True).map(_normalize)

    return train_dataset, valid_dataset
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize(feature: tf.Tensor, label: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
    """
    Scales a feature tensor from the 0-255 range into 0-1 and appends one axis.

    :param feature: tensor to be normalized
    :param label: label tensor, passed through unchanged
    :return: tuple of (normalized feature with trailing axis added, label)
    """
    rescaled = feature / 255.0
    # trailing axis (e.g. a channel dimension) appended, matching
    # K.expand_dims' default axis of -1
    return tf.expand_dims(rescaled, axis=-1), label
|
||||||
@ -29,13 +29,10 @@ Functions:
|
|||||||
|
|
||||||
"""
|
"""
|
||||||
import os
|
import os
|
||||||
import pickle
|
|
||||||
import time
|
import time
|
||||||
from typing import Dict
|
from typing import Dict
|
||||||
from typing import Sequence
|
|
||||||
from typing import Tuple
|
from typing import Tuple
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
import tensorflow as tf
|
import tensorflow as tf
|
||||||
from tensorflow.python.ops import summary_ops_v2
|
from tensorflow.python.ops import summary_ops_v2
|
||||||
|
|
||||||
@ -49,73 +46,6 @@ tfe = tf.contrib.eager
|
|||||||
LOG_FREQUENCY: int = 10
|
LOG_FREQUENCY: int = 10
|
||||||
|
|
||||||
|
|
||||||
def prepare_training_data(test_fold_id: int,
|
|
||||||
inlier_classes: Sequence[int],
|
|
||||||
total_classes: int,
|
|
||||||
fold_prefix: str = 'data/data_fold_',
|
|
||||||
batch_size: int = 128,
|
|
||||||
folds: int = 5) -> Tuple[tf.data.Dataset, tf.data.Dataset]:
|
|
||||||
"""
|
|
||||||
Prepares the MNIST training data.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
test_fold_id: id of test fold
|
|
||||||
inlier_classes: list of class ids that are considered inliers
|
|
||||||
total_classes: total number of classes
|
|
||||||
fold_prefix: the prefix for the fold pickle files (default: 'data/data_fold_')
|
|
||||||
batch_size: size of batch (default: 128)
|
|
||||||
folds: number of folds (default: 5)
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
A tuple (train dataset, valid dataset)
|
|
||||||
"""
|
|
||||||
# prepare data
|
|
||||||
mnist_train = []
|
|
||||||
mnist_valid = []
|
|
||||||
|
|
||||||
for i in range(folds):
|
|
||||||
if i != test_fold_id: # exclude testing fold, representing 20% of each class
|
|
||||||
with open(f"{fold_prefix}{i:d}.pkl", 'rb') as pkl:
|
|
||||||
fold = pickle.load(pkl)
|
|
||||||
if len(mnist_valid) == 0: # single out one fold, comprising 20% of each class
|
|
||||||
mnist_valid = fold
|
|
||||||
else: # form train set from remaining folds, comprising 60% of each class
|
|
||||||
mnist_train += fold
|
|
||||||
|
|
||||||
outlier_classes = []
|
|
||||||
for i in range(total_classes):
|
|
||||||
if i not in inlier_classes:
|
|
||||||
outlier_classes.append(i)
|
|
||||||
|
|
||||||
# keep only train classes
|
|
||||||
mnist_train = [x for x in mnist_train if x[0] in inlier_classes]
|
|
||||||
|
|
||||||
def _list_of_pairs_to_numpy(list_of_pairs: Sequence[Tuple[int, np.ndarray]]) -> Tuple[np.ndarray, np.ndarray]:
|
|
||||||
"""
|
|
||||||
Converts a list of pairs to a numpy array.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
list_of_pairs: list of pairs
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
tuple (feature array, label array)
|
|
||||||
"""
|
|
||||||
return np.asarray([x[1] for x in list_of_pairs], np.float32), np.asarray([x[0] for x in list_of_pairs], np.int)
|
|
||||||
|
|
||||||
mnist_train_x, mnist_train_y = _list_of_pairs_to_numpy(mnist_train)
|
|
||||||
mnist_valid_x, mnist_valid_y = _list_of_pairs_to_numpy(mnist_valid)
|
|
||||||
|
|
||||||
# get dataset
|
|
||||||
train_dataset = tf.data.Dataset.from_tensor_slices((mnist_train_x, mnist_train_y))
|
|
||||||
train_dataset = train_dataset.shuffle(mnist_train_x.shape[0]).batch(batch_size,
|
|
||||||
drop_remainder=True).map(_normalize)
|
|
||||||
valid_dataset = tf.data.Dataset.from_tensor_slices((mnist_valid_x, mnist_valid_y))
|
|
||||||
valid_dataset = valid_dataset.shuffle(mnist_valid_x.shape[0]).batch(batch_size,
|
|
||||||
drop_remainder=True).map(_normalize)
|
|
||||||
|
|
||||||
return train_dataset, valid_dataset
|
|
||||||
|
|
||||||
|
|
||||||
def train_simple(dataset: tf.data.Dataset,
|
def train_simple(dataset: tf.data.Dataset,
|
||||||
iteration: int,
|
iteration: int,
|
||||||
weights_prefix: str,
|
weights_prefix: str,
|
||||||
@ -323,18 +253,8 @@ def _train_enc_dec_step_simple(encoder: model.Encoder, decoder: model.Decoder,
|
|||||||
return reconstruction_loss, x_decoded
|
return reconstruction_loss, x_decoded
|
||||||
|
|
||||||
|
|
||||||
def _normalize(feature: tf.Tensor, label: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
|
|
||||||
"""
|
|
||||||
Normalizes a tensor from a 0-255 range to a 0-1 range and adds one dimension.
|
|
||||||
|
|
||||||
:param feature: tensor to be normalized
|
|
||||||
:param label: label tensor
|
|
||||||
:return: normalized tensor
|
|
||||||
"""
|
|
||||||
return K.expand_dims(tf.divide(feature, 255.0)), label
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
from twomartens.masterthesis.aae.data import prepare_training_data
|
||||||
tf.enable_eager_execution()
|
tf.enable_eager_execution()
|
||||||
inlier_classes = [3]
|
inlier_classes = [3]
|
||||||
iteration = 1
|
iteration = 1
|
||||||
|
|||||||
Reference in New Issue
Block a user