Removed double iteration over epochs

1. The dataset was repeated nr_epochs times.
2. The loop over the dataset was also run nr_epochs times.

Consequence: each checkpointed "epoch" was in fact nr_epochs real epochs long.
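For illustration, a minimal sketch of the double iteration described above, assuming eager execution; the toy dataset and its size are invented for the example and not taken from the repository:

import tensorflow as tf

nr_epochs = 3
dataset = tf.data.Dataset.range(4)       # toy dataset: 4 samples per epoch
repeated = dataset.repeat(nr_epochs)     # (1) dataset repeated nr_epochs times

# (2) the training code additionally looped over it nr_epochs times,
# so every checkpointed "epoch" iterated over nr_epochs * 4 samples.
for epoch in range(nr_epochs):
    samples_seen = sum(1 for _ in repeated)
    print(f"epoch {epoch}: {samples_seen} samples")   # prints 12, not 4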

Signed-off-by: Jim Martens <github@2martens.de>
2019-06-06 11:06:59 +02:00
parent c893954120
commit 110e098d78
3 changed files with 80 additions and 53 deletions


@@ -60,9 +60,11 @@ def _ssd_train(args: argparse.Namespace) -> None:
     with open(f"{args.ground_truth_path}/instances.bin", "rb") as file:
         instances = pickle.load(file)
-    scenenet_data, nr_digits = data.load_scenenet_data(file_names_photos, instances, args.coco_path,
-                                                       batch_size=batch_size,
-                                                       resized_shape=(image_size, image_size))
+    scenenet_data, nr_digits, length_dataset = \
+        data.load_scenenet_data(file_names_photos, instances, args.coco_path,
+                                batch_size=batch_size,
+                                resized_shape=(image_size, image_size),
+                                mode="training")
     del file_names_photos, instances
     use_summary_writer = summary_ops_v2.create_file_writer(
@@ -71,13 +73,17 @@ def _ssd_train(args: argparse.Namespace) -> None:
     if args.debug:
         with use_summary_writer.as_default():
-            ssd.train(scenenet_data, args.iteration, use_dropout, weights_prefix=weights_path,
+            ssd.train(scenenet_data, args.iteration, use_dropout, length_dataset,
+                      weights_prefix=weights_path,
                       weights_path=pre_trained_weights_file, batch_size=batch_size,
-                      nr_epochs=args.num_epochs)
+                      nr_epochs=args.num_epochs,
+                      verbose=args.verbose)
     else:
-        ssd.train(scenenet_data, args.iteration, use_dropout, weights_prefix=weights_path,
+        ssd.train(scenenet_data, args.iteration, use_dropout, length_dataset,
+                  weights_prefix=weights_path,
                   weights_path=pre_trained_weights_file, batch_size=batch_size,
-                  nr_epochs=args.num_epochs)
+                  nr_epochs=args.num_epochs,
+                  verbose=args.verbose)
 def _auto_encoder_train(args: argparse.Namespace) -> None:
@@ -245,9 +251,10 @@ def _ssd_val(args: argparse.Namespace) -> None:
     with open(f"{args.ground_truth_path}/instances.bin", "rb") as file:
         instances = pickle.load(file)
-    scenenet_data, nr_digits = data.load_scenenet_data(file_names_photos, instances, args.coco_path,
-                                                       batch_size=batch_size,
-                                                       resized_shape=(image_size, image_size))
+    scenenet_data, nr_digits, length_dataset = \
+        data.load_scenenet_data(file_names_photos, instances, args.coco_path,
+                                batch_size=batch_size,
+                                resized_shape=(image_size, image_size))
     del file_names_photos, instances
     use_summary_writer = summary_ops_v2.create_file_writer(


@@ -231,7 +231,8 @@ def load_scenenet_data(photo_paths: Sequence[Sequence[str]],
                        instances: Sequence[Sequence[Sequence[dict]]],
                        coco_path: str,
                        num_epochs: int = 1, batch_size: int = 32,
-                       resized_shape: Sequence[int] = (256, 256)) -> Tuple[tf.data.Dataset, int]:
+                       resized_shape: Sequence[int] = (256, 256),
+                       mode: str = "inference") -> Tuple[tf.data.Dataset, int, int]:
     """
     Loads the SceneNet RGB-D data and returns a data set.
@@ -242,10 +243,12 @@ def load_scenenet_data(photo_paths: Sequence[Sequence[str]],
         num_epochs: number of epochs to use
         batch_size: size of every batch
         resized_shape: shape of input images to SSD
+        mode: one of "inference" or "training"
     Returns:
         scenenet data set
         number of digits required to print largest batch number
+        length of dataset
     """
     trajectories = zip(photo_paths, instances)
     final_image_paths = []
@@ -292,14 +295,17 @@ def load_scenenet_data(photo_paths: Sequence[Sequence[str]],
     path_dataset = tf.data.Dataset.from_tensor_slices(final_image_paths)
     label_dataset = tf.data.Dataset.from_tensor_slices(real_final_labels)
     dataset = tf.data.Dataset.zip((path_dataset, label_dataset))
-    dataset = dataset.repeat(num_epochs)
+    if mode == "inference":
+        dataset = dataset.repeat(num_epochs)
+    elif mode == "training":
+        dataset = dataset.apply(tf.data.experimental.shuffle_and_repeat(length_dataset, num_epochs))
     dataset = dataset.batch(batch_size=batch_size)
     dataset = dataset.map(_load_images_ssd_callback(resized_shape))
     dataset = dataset.prefetch(1)
     nr_digits = math.ceil(math.log10(math.ceil((length_dataset * num_epochs) / batch_size)))
-    return dataset, nr_digits
+    return dataset, nr_digits, length_dataset
 def _load_images_ssd_callback(resized_shape: Sequence[int]) \
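As a note on the new mode switch: in "training" mode the pipeline shuffles with a buffer spanning the whole dataset and repeats it num_epochs times in a single transformation, while "inference" keeps the plain deterministic repeat. A condensed sketch of that branch follows; build_pipeline and its parameters are illustrative names, only the shuffle_and_repeat/repeat calls mirror the diff:

import tensorflow as tf

def build_pipeline(dataset: tf.data.Dataset, length_dataset: int,
                   num_epochs: int, batch_size: int,
                   mode: str = "inference") -> tf.data.Dataset:
    if mode == "training":
        # shuffle across the full dataset, then repeat for all epochs
        dataset = dataset.apply(
            tf.data.experimental.shuffle_and_repeat(length_dataset, num_epochs))
    else:
        # inference: keep the original order
        dataset = dataset.repeat(num_epochs)
    return dataset.batch(batch_size).prefetch(1)

In later TensorFlow releases shuffle_and_repeat is deprecated in favour of chaining dataset.shuffle(length_dataset).repeat(num_epochs), which serves the same purpose here.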


@@ -34,6 +34,7 @@ Functions:
     predict(...): runs trained SSD/DropoutSSD on a given data set
     train(...): trains the SSD/DropoutSSD on a given data set
 """
+import math
 import os
 import pickle
 import time
@@ -341,6 +342,7 @@ def _get_observations(detections: Sequence[Sequence[np.ndarray]]) -> List[List[np.ndarray]]:
 def train(dataset: tf.data.Dataset,
           iteration: int,
           use_dropout: bool,
+          length_dataset: int,
           weights_prefix: str,
           weights_path: Optional[str] = None,
           verbose: Optional[bool] = False,
@@ -360,6 +362,7 @@ def train(dataset: tf.data.Dataset,
         dataset: the training data set
         iteration: identifier for current training run
         use_dropout: if True, the DropoutSSD will be used
+        length_dataset: specifies number of images in data set
         weights_prefix: prefix for weights directory
         weights_path: path to the pre-trained SSD weights
         verbose: if True, progress is printed to the standard output
@@ -412,31 +415,14 @@ def train(dataset: tf.data.Dataset,
                                              [1.0, 2.0, 0.5],
                                              [1.0, 2.0, 0.5]])
-    def _get_last_epoch(epoch_var: tf.Variable, **kwargs) -> int:
-        return int(epoch_var)
-    last_epoch = _get_last_epoch(**checkpointables)
-    previous_epochs = 0
-    if last_epoch != -1:
-        previous_epochs = last_epoch + 1
     with summary_ops_v2.always_record_summaries():
         summary_ops_v2.scalar(name='learning_rate', tensor=checkpointables['learning_rate_var'],
                               step=checkpointables['global_step'])
+    nr_batches_per_epoch = int(math.ceil(length_dataset / float(batch_size)))
-    for epoch in range(nr_epochs - previous_epochs):
-        _epoch = epoch + previous_epochs
-        outputs = _train_one_epoch(_epoch, dataset, input_encoder, **checkpointables)
-        if verbose:
-            print((
-                f"[{_epoch + 1:d}/{nr_epochs:d} - "
-                f"train time: {outputs['per_epoch_time']:.2f}, "
-                f"SSD loss: {outputs['ssd_loss']:.3f}, "
-            ))
-        # save weights at end of epoch
-        checkpoint.save(checkpoint_prefix)
+    _train_epochs(nr_batches_per_epoch, nr_epochs, dataset, input_encoder,
+                  checkpoint, checkpoint_prefix, verbose=verbose, **checkpointables)
     if verbose:
         print("Training finished!... save model weights")
@@ -445,24 +431,34 @@ def train(dataset: tf.data.Dataset,
     checkpoint.save(checkpoint_prefix)
-def _train_one_epoch(epoch: int,
-                     dataset: tf.data.Dataset,
-                     input_encoder: ssd_input_encoder.SSDInputEncoder,
-                     ssd: tf.keras.Model,
-                     ssd_optimizer: tf.train.Optimizer,
-                     global_step: tf.Variable,
-                     epoch_var: tf.Variable,
-                     learning_rate_var: tf.Variable) -> Dict[str, float]:
+def _train_epochs(nr_batches_per_epoch: int,
+                  nr_epochs: int,
+                  dataset: tf.data.Dataset,
+                  input_encoder: ssd_input_encoder.SSDInputEncoder,
+                  checkpoint: tf.train.Checkpoint,
+                  checkpoint_prefix: str,
+                  ssd: tf.keras.Model,
+                  ssd_optimizer: tf.train.Optimizer,
+                  global_step: tf.Variable,
+                  epoch_var: tf.Variable,
+                  learning_rate_var: tf.Variable,
+                  verbose: bool) -> None:
     with summary_ops_v2.always_record_summaries():
+        epoch = 0
+        batch_counter = 0
         epoch_var.assign(epoch)
-        epoch_start_time = time.time()
+        epoch_start_time = None
         # define loss variables
         ssd_loss_avg = tfe.metrics.Mean(name='ssd_loss', dtype=tf.float32)
         # go through data set
         for x, y in dataset:
+            if batch_counter == 0:
+                # epoch starts
+                epoch_start_time = time.time()
             labels = []
             for i in range(y.shape[0]):
                 image_labels = np.asarray(y[i])
@@ -476,17 +472,35 @@ def _train_one_epoch(epoch: int,
                                              global_step=global_step)
             ssd_loss_avg(ssd_train_loss)
             global_step.assign_add(1)
+            batch_counter += 1
+            if batch_counter == nr_batches_per_epoch:
+                # one epoch is over
+                epoch_end_time = time.time()
+                per_epoch_time = epoch_end_time - epoch_start_time
+                # final losses of epoch
+                outputs = {
+                    'ssd_loss': ssd_loss_avg.result(False),
+                    'per_epoch_time': per_epoch_time,
+                }
+                if verbose:
+                    print((
+                        f"[{epoch + 1:d}/{nr_epochs:d} - "
+                        f"train time: {outputs['per_epoch_time']:.2f}, "
+                        f"SSD loss: {outputs['ssd_loss']:.3f}, "
+                    ))
+                # save weights at end of epoch
+                checkpoint.save(checkpoint_prefix)
+                epoch += 1
+                batch_counter = 0
-        epoch_end_time = time.time()
-        per_epoch_time = epoch_end_time - epoch_start_time
-        # final losses of epoch
-        outputs = {
-            'ssd_loss': ssd_loss_avg.result(False),
-            'per_epoch_time': per_epoch_time,
-        }
-        return outputs
 def _train_ssd_step(ssd: tf.keras.Model,
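A compact sketch of the epoch bookkeeping that _train_epochs introduces: the dataset arrives already repeated nr_epochs times, so a single pass counts batches and treats every nr_batches_per_epoch batches as one epoch, reporting and checkpointing exactly once per boundary. The names train_single_pass, train_step and save_checkpoint are placeholders for this example, not functions from the repository:

import math
import time

def train_single_pass(batches, length_dataset: int, batch_size: int, nr_epochs: int,
                      train_step=lambda batch: None, save_checkpoint=lambda: None):
    nr_batches_per_epoch = int(math.ceil(length_dataset / float(batch_size)))
    epoch = 0
    batch_counter = 0
    epoch_start_time = None
    for batch in batches:                     # dataset is already repeated nr_epochs times
        if batch_counter == 0:
            epoch_start_time = time.time()    # epoch starts
        train_step(batch)                     # placeholder for the per-batch SSD update
        batch_counter += 1
        if batch_counter == nr_batches_per_epoch:
            # one epoch is over: report and checkpoint once
            per_epoch_time = time.time() - epoch_start_time
            print(f"[{epoch + 1}/{nr_epochs}] train time: {per_epoch_time:.2f}s")
            save_checkpoint()
            epoch += 1
            batch_counter = 0

# example: 10 items = 2 epochs of 5 batches at batch_size 1
train_single_pass(range(10), length_dataset=5, batch_size=1, nr_epochs=2)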