Removed double-iteration over epochs
Two problems were fixed: 1. The dataset was repeated nr_epochs times. 2. The loop over the dataset was also repeated nr_epochs times. Consequence: each checkpointed epoch was in fact nr_epochs epochs long. Signed-off-by: Jim Martens <github@2martens.de>
This commit is contained in:
@ -60,9 +60,11 @@ def _ssd_train(args: argparse.Namespace) -> None:
|
||||
with open(f"{args.ground_truth_path}/instances.bin", "rb") as file:
|
||||
instances = pickle.load(file)
|
||||
|
||||
scenenet_data, nr_digits = data.load_scenenet_data(file_names_photos, instances, args.coco_path,
|
||||
scenenet_data, nr_digits, length_dataset = \
|
||||
data.load_scenenet_data(file_names_photos, instances, args.coco_path,
|
||||
batch_size=batch_size,
|
||||
resized_shape=(image_size, image_size))
|
||||
resized_shape=(image_size, image_size),
|
||||
mode="training")
|
||||
del file_names_photos, instances
|
||||
|
||||
use_summary_writer = summary_ops_v2.create_file_writer(
|
||||
@ -71,13 +73,17 @@ def _ssd_train(args: argparse.Namespace) -> None:
|
||||
|
||||
if args.debug:
|
||||
with use_summary_writer.as_default():
|
||||
ssd.train(scenenet_data, args.iteration, use_dropout, weights_prefix=weights_path,
|
||||
ssd.train(scenenet_data, args.iteration, use_dropout, length_dataset,
|
||||
weights_prefix=weights_path,
|
||||
weights_path=pre_trained_weights_file, batch_size=batch_size,
|
||||
nr_epochs=args.num_epochs)
|
||||
nr_epochs=args.num_epochs,
|
||||
verbose=args.verbose)
|
||||
else:
|
||||
ssd.train(scenenet_data, args.iteration, use_dropout, weights_prefix=weights_path,
|
||||
ssd.train(scenenet_data, args.iteration, use_dropout, length_dataset,
|
||||
weights_prefix=weights_path,
|
||||
weights_path=pre_trained_weights_file, batch_size=batch_size,
|
||||
nr_epochs=args.num_epochs)
|
||||
nr_epochs=args.num_epochs,
|
||||
verbose=args.verbose)
|
||||
|
||||
|
||||
def _auto_encoder_train(args: argparse.Namespace) -> None:
|
||||
@ -245,7 +251,8 @@ def _ssd_val(args: argparse.Namespace) -> None:
|
||||
with open(f"{args.ground_truth_path}/instances.bin", "rb") as file:
|
||||
instances = pickle.load(file)
|
||||
|
||||
scenenet_data, nr_digits = data.load_scenenet_data(file_names_photos, instances, args.coco_path,
|
||||
scenenet_data, nr_digits, length_dataset = \
|
||||
data.load_scenenet_data(file_names_photos, instances, args.coco_path,
|
||||
batch_size=batch_size,
|
||||
resized_shape=(image_size, image_size))
|
||||
del file_names_photos, instances
|
||||
|
||||
@ -231,7 +231,8 @@ def load_scenenet_data(photo_paths: Sequence[Sequence[str]],
|
||||
instances: Sequence[Sequence[Sequence[dict]]],
|
||||
coco_path: str,
|
||||
num_epochs: int = 1, batch_size: int = 32,
|
||||
resized_shape: Sequence[int] = (256, 256)) -> Tuple[tf.data.Dataset, int]:
|
||||
resized_shape: Sequence[int] = (256, 256),
|
||||
mode: str = "inference") -> Tuple[tf.data.Dataset, int, int]:
|
||||
"""
|
||||
Loads the SceneNet RGB-D data and returns a data set.
|
||||
|
||||
@ -242,10 +243,12 @@ def load_scenenet_data(photo_paths: Sequence[Sequence[str]],
|
||||
num_epochs: number of epochs to use
|
||||
batch_size: size of every batch
|
||||
resized_shape: shape of input images to SSD
|
||||
mode: one of "inference" or "training"
|
||||
|
||||
Returns:
|
||||
scenenet data set
|
||||
number of digits required to print largest batch number
|
||||
length of dataset
|
||||
"""
|
||||
trajectories = zip(photo_paths, instances)
|
||||
final_image_paths = []
|
||||
@ -292,14 +295,17 @@ def load_scenenet_data(photo_paths: Sequence[Sequence[str]],
|
||||
path_dataset = tf.data.Dataset.from_tensor_slices(final_image_paths)
|
||||
label_dataset = tf.data.Dataset.from_tensor_slices(real_final_labels)
|
||||
dataset = tf.data.Dataset.zip((path_dataset, label_dataset))
|
||||
if mode == "inference":
|
||||
dataset = dataset.repeat(num_epochs)
|
||||
elif mode == "training":
|
||||
dataset = dataset.apply(tf.data.experimental.shuffle_and_repeat(length_dataset, num_epochs))
|
||||
dataset = dataset.batch(batch_size=batch_size)
|
||||
dataset = dataset.map(_load_images_ssd_callback(resized_shape))
|
||||
dataset = dataset.prefetch(1)
|
||||
|
||||
nr_digits = math.ceil(math.log10(math.ceil((length_dataset * num_epochs) / batch_size)))
|
||||
|
||||
return dataset, nr_digits
|
||||
return dataset, nr_digits, length_dataset
|
||||
|
||||
|
||||
def _load_images_ssd_callback(resized_shape: Sequence[int]) \
|
||||
|
||||
@ -34,6 +34,7 @@ Functions:
|
||||
predict(...): runs trained SSD/DropoutSSD on a given data set
|
||||
train(...): trains the SSD/DropoutSSD on a given data set
|
||||
"""
|
||||
import math
|
||||
import os
|
||||
import pickle
|
||||
import time
|
||||
@ -341,6 +342,7 @@ def _get_observations(detections: Sequence[Sequence[np.ndarray]]) -> List[List[n
|
||||
def train(dataset: tf.data.Dataset,
|
||||
iteration: int,
|
||||
use_dropout: bool,
|
||||
length_dataset: int,
|
||||
weights_prefix: str,
|
||||
weights_path: Optional[str] = None,
|
||||
verbose: Optional[bool] = False,
|
||||
@ -360,6 +362,7 @@ def train(dataset: tf.data.Dataset,
|
||||
dataset: the training data set
|
||||
iteration: identifier for current training run
|
||||
use_dropout: if True, the DropoutSSD will be used
|
||||
length_dataset: specifies number of images in data set
|
||||
weights_prefix: prefix for weights directory
|
||||
weights_path: path to the pre-trained SSD weights
|
||||
verbose: if True, progress is printed to the standard output
|
||||
@ -412,31 +415,14 @@ def train(dataset: tf.data.Dataset,
|
||||
[1.0, 2.0, 0.5],
|
||||
[1.0, 2.0, 0.5]])
|
||||
|
||||
def _get_last_epoch(epoch_var: tf.Variable, **kwargs) -> int:
|
||||
return int(epoch_var)
|
||||
|
||||
last_epoch = _get_last_epoch(**checkpointables)
|
||||
previous_epochs = 0
|
||||
if last_epoch != -1:
|
||||
previous_epochs = last_epoch + 1
|
||||
|
||||
with summary_ops_v2.always_record_summaries():
|
||||
summary_ops_v2.scalar(name='learning_rate', tensor=checkpointables['learning_rate_var'],
|
||||
step=checkpointables['global_step'])
|
||||
|
||||
for epoch in range(nr_epochs - previous_epochs):
|
||||
_epoch = epoch + previous_epochs
|
||||
outputs = _train_one_epoch(_epoch, dataset, input_encoder, **checkpointables)
|
||||
nr_batches_per_epoch = int(math.ceil(length_dataset / float(batch_size)))
|
||||
|
||||
if verbose:
|
||||
print((
|
||||
f"[{_epoch + 1:d}/{nr_epochs:d} - "
|
||||
f"train time: {outputs['per_epoch_time']:.2f}, "
|
||||
f"SSD loss: {outputs['ssd_loss']:.3f}, "
|
||||
))
|
||||
|
||||
# save weights at end of epoch
|
||||
checkpoint.save(checkpoint_prefix)
|
||||
_train_epochs(nr_batches_per_epoch, nr_epochs, dataset, input_encoder,
|
||||
checkpoint, checkpoint_prefix, verbose=verbose, **checkpointables)
|
||||
|
||||
if verbose:
|
||||
print("Training finished!... save model weights")
|
||||
@ -445,24 +431,34 @@ def train(dataset: tf.data.Dataset,
|
||||
checkpoint.save(checkpoint_prefix)
|
||||
|
||||
|
||||
def _train_one_epoch(epoch: int,
|
||||
def _train_epochs(nr_batches_per_epoch: int,
|
||||
nr_epochs: int,
|
||||
dataset: tf.data.Dataset,
|
||||
input_encoder: ssd_input_encoder.SSDInputEncoder,
|
||||
checkpoint: tf.train.Checkpoint,
|
||||
checkpoint_prefix: str,
|
||||
ssd: tf.keras.Model,
|
||||
ssd_optimizer: tf.train.Optimizer,
|
||||
global_step: tf.Variable,
|
||||
epoch_var: tf.Variable,
|
||||
learning_rate_var: tf.Variable) -> Dict[str, float]:
|
||||
learning_rate_var: tf.Variable,
|
||||
verbose: bool) -> None:
|
||||
|
||||
with summary_ops_v2.always_record_summaries():
|
||||
epoch = 0
|
||||
batch_counter = 0
|
||||
epoch_var.assign(epoch)
|
||||
epoch_start_time = time.time()
|
||||
epoch_start_time = None
|
||||
|
||||
# define loss variables
|
||||
ssd_loss_avg = tfe.metrics.Mean(name='ssd_loss', dtype=tf.float32)
|
||||
|
||||
# go through data set
|
||||
for x, y in dataset:
|
||||
if batch_counter == 0:
|
||||
# epoch starts
|
||||
epoch_start_time = time.time()
|
||||
|
||||
labels = []
|
||||
for i in range(y.shape[0]):
|
||||
image_labels = np.asarray(y[i])
|
||||
@ -477,6 +473,10 @@ def _train_one_epoch(epoch: int,
|
||||
ssd_loss_avg(ssd_train_loss)
|
||||
global_step.assign_add(1)
|
||||
|
||||
batch_counter += 1
|
||||
|
||||
if batch_counter == nr_batches_per_epoch:
|
||||
# one epoch is over
|
||||
epoch_end_time = time.time()
|
||||
per_epoch_time = epoch_end_time - epoch_start_time
|
||||
|
||||
@ -486,7 +486,21 @@ def _train_one_epoch(epoch: int,
|
||||
'per_epoch_time': per_epoch_time,
|
||||
}
|
||||
|
||||
return outputs
|
||||
if verbose:
|
||||
print((
|
||||
f"[{epoch + 1:d}/{nr_epochs:d} - "
|
||||
f"train time: {outputs['per_epoch_time']:.2f}, "
|
||||
f"SSD loss: {outputs['ssd_loss']:.3f}, "
|
||||
))
|
||||
|
||||
# save weights at end of epoch
|
||||
checkpoint.save(checkpoint_prefix)
|
||||
|
||||
epoch += 1
|
||||
|
||||
batch_counter = 0
|
||||
|
||||
|
||||
|
||||
|
||||
def _train_ssd_step(ssd: tf.keras.Model,
|
||||
|
||||
Reference in New Issue
Block a user