Removed double-iteration over epochs
1. The dataset was repeated nr_epochs times.
2. The loop over the dataset was repeated nr_epochs times.

Consequence: each checkpointed epoch was in fact nr_epochs epochs long.

Signed-off-by: Jim Martens <github@2martens.de>
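To make the effect concrete, here is a minimal standalone sketch (plain Python with made-up numbers, not project code) of the interaction described above: when the input pipeline already repeats the data nr_epochs times and the training driver iterates nr_epochs times on top of that, every checkpointed "epoch" consumes nr_epochs passes over the data.

    # Illustration only: toy stand-ins for the dataset and the epoch settings.
    nr_epochs = 3
    samples = list(range(4))            # a toy "dataset" of 4 samples

    repeated = samples * nr_epochs      # the pipeline already repeats the data

    consumed = 0
    for _ in range(nr_epochs):          # the driver loops over epochs as well...
        for _ in repeated:              # ...but each loop drains ALL the repeats
            consumed += 1

    print(consumed)                     # 36 = 3 * (3 * 4): nine passes instead of three,
                                        # i.e. each checkpointed epoch is nr_epochs epochs long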
@@ -60,9 +60,11 @@ def _ssd_train(args: argparse.Namespace) -> None:
     with open(f"{args.ground_truth_path}/instances.bin", "rb") as file:
         instances = pickle.load(file)
 
-    scenenet_data, nr_digits = data.load_scenenet_data(file_names_photos, instances, args.coco_path,
-                                                       batch_size=batch_size,
-                                                       resized_shape=(image_size, image_size))
+    scenenet_data, nr_digits, length_dataset = \
+        data.load_scenenet_data(file_names_photos, instances, args.coco_path,
+                                batch_size=batch_size,
+                                resized_shape=(image_size, image_size),
+                                mode="training")
     del file_names_photos, instances
 
     use_summary_writer = summary_ops_v2.create_file_writer(
@@ -71,13 +73,17 @@ def _ssd_train(args: argparse.Namespace) -> None:
 
     if args.debug:
         with use_summary_writer.as_default():
-            ssd.train(scenenet_data, args.iteration, use_dropout, weights_prefix=weights_path,
+            ssd.train(scenenet_data, args.iteration, use_dropout, length_dataset,
+                      weights_prefix=weights_path,
                       weights_path=pre_trained_weights_file, batch_size=batch_size,
-                      nr_epochs=args.num_epochs)
+                      nr_epochs=args.num_epochs,
+                      verbose=args.verbose)
     else:
-        ssd.train(scenenet_data, args.iteration, use_dropout, weights_prefix=weights_path,
+        ssd.train(scenenet_data, args.iteration, use_dropout, length_dataset,
+                  weights_prefix=weights_path,
                   weights_path=pre_trained_weights_file, batch_size=batch_size,
-                  nr_epochs=args.num_epochs)
+                  nr_epochs=args.num_epochs,
+                  verbose=args.verbose)
 
 
 def _auto_encoder_train(args: argparse.Namespace) -> None:
@@ -245,9 +251,10 @@ def _ssd_val(args: argparse.Namespace) -> None:
     with open(f"{args.ground_truth_path}/instances.bin", "rb") as file:
         instances = pickle.load(file)
 
-    scenenet_data, nr_digits = data.load_scenenet_data(file_names_photos, instances, args.coco_path,
-                                                       batch_size=batch_size,
-                                                       resized_shape=(image_size, image_size))
+    scenenet_data, nr_digits, length_dataset = \
+        data.load_scenenet_data(file_names_photos, instances, args.coco_path,
+                                batch_size=batch_size,
+                                resized_shape=(image_size, image_size))
     del file_names_photos, instances
 
     use_summary_writer = summary_ops_v2.create_file_writer(
@@ -231,7 +231,8 @@ def load_scenenet_data(photo_paths: Sequence[Sequence[str]],
                        instances: Sequence[Sequence[Sequence[dict]]],
                        coco_path: str,
                        num_epochs: int = 1, batch_size: int = 32,
-                       resized_shape: Sequence[int] = (256, 256)) -> Tuple[tf.data.Dataset, int]:
+                       resized_shape: Sequence[int] = (256, 256),
+                       mode: str = "inference") -> Tuple[tf.data.Dataset, int, int]:
     """
     Loads the SceneNet RGB-D data and returns a data set.
 
@@ -242,10 +243,12 @@ def load_scenenet_data(photo_paths: Sequence[Sequence[str]],
         num_epochs: number of epochs to use
         batch_size: size of every batch
         resized_shape: shape of input images to SSD
+        mode: one of "inference" or "training"
 
     Returns:
         scenenet data set
         number of digits required to print largest batch number
+        length of dataset
     """
     trajectories = zip(photo_paths, instances)
     final_image_paths = []
@@ -292,14 +295,17 @@ def load_scenenet_data(photo_paths: Sequence[Sequence[str]],
     path_dataset = tf.data.Dataset.from_tensor_slices(final_image_paths)
     label_dataset = tf.data.Dataset.from_tensor_slices(real_final_labels)
     dataset = tf.data.Dataset.zip((path_dataset, label_dataset))
-    dataset = dataset.repeat(num_epochs)
+    if mode == "inference":
+        dataset = dataset.repeat(num_epochs)
+    elif mode == "training":
+        dataset = dataset.apply(tf.data.experimental.shuffle_and_repeat(length_dataset, num_epochs))
     dataset = dataset.batch(batch_size=batch_size)
     dataset = dataset.map(_load_images_ssd_callback(resized_shape))
     dataset = dataset.prefetch(1)
 
     nr_digits = math.ceil(math.log10(math.ceil((length_dataset * num_epochs) / batch_size)))
 
-    return dataset, nr_digits
+    return dataset, nr_digits, length_dataset
 
 
 def _load_images_ssd_callback(resized_shape: Sequence[int]) \
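For orientation, a condensed sketch of the two pipeline modes added above (illustrative only, not the project code; it assumes eager execution, which is the default in TF 2.x, and uses the unfused shuffle().repeat() pair rather than the tf.data.experimental.shuffle_and_repeat transform from the diff):

    import math

    import tensorflow as tf

    def build_pipeline(length_dataset, num_epochs, batch_size, mode="inference"):
        # Stand-in for the zipped (path, label) dataset of the real loader.
        dataset = tf.data.Dataset.range(length_dataset)
        if mode == "inference":
            dataset = dataset.repeat(num_epochs)                          # fixed order
        elif mode == "training":
            dataset = dataset.shuffle(length_dataset).repeat(num_epochs)  # reshuffled every epoch
        return dataset.batch(batch_size)

    # The pipeline itself now yields all num_epochs passes, so the training loop
    # must not repeat the epochs a second time.  With length_dataset divisible by
    # batch_size, epoch boundaries line up exactly with batch boundaries.
    batches = sum(1 for _ in build_pipeline(length_dataset=12, num_epochs=3, batch_size=4, mode="training"))
    print(batches, math.ceil(12 / 4) * 3)   # 9 9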
@@ -34,6 +34,7 @@ Functions:
     predict(...): runs trained SSD/DropoutSSD on a given data set
     train(...): trains the SSD/DropoutSSD on a given data set
 """
+import math
 import os
 import pickle
 import time
@@ -341,6 +342,7 @@ def _get_observations(detections: Sequence[Sequence[np.ndarray]]) -> List[List[n
 def train(dataset: tf.data.Dataset,
           iteration: int,
           use_dropout: bool,
+          length_dataset: int,
           weights_prefix: str,
           weights_path: Optional[str] = None,
           verbose: Optional[bool] = False,
@@ -360,6 +362,7 @@ def train(dataset: tf.data.Dataset,
         dataset: the training data set
         iteration: identifier for current training run
         use_dropout: if True, the DropoutSSD will be used
+        length_dataset: specifies number of images in data set
         weights_prefix: prefix for weights directory
         weights_path: path to the pre-trained SSD weights
         verbose: if True, progress is printed to the standard output
@@ -412,31 +415,14 @@ def train(dataset: tf.data.Dataset,
                                      [1.0, 2.0, 0.5],
                                      [1.0, 2.0, 0.5]])
 
-    def _get_last_epoch(epoch_var: tf.Variable, **kwargs) -> int:
-        return int(epoch_var)
-
-    last_epoch = _get_last_epoch(**checkpointables)
-    previous_epochs = 0
-    if last_epoch != -1:
-        previous_epochs = last_epoch + 1
-
     with summary_ops_v2.always_record_summaries():
         summary_ops_v2.scalar(name='learning_rate', tensor=checkpointables['learning_rate_var'],
                               step=checkpointables['global_step'])
 
-    for epoch in range(nr_epochs - previous_epochs):
-        _epoch = epoch + previous_epochs
-        outputs = _train_one_epoch(_epoch, dataset, input_encoder, **checkpointables)
-
-        if verbose:
-            print((
-                f"[{_epoch + 1:d}/{nr_epochs:d} - "
-                f"train time: {outputs['per_epoch_time']:.2f}, "
-                f"SSD loss: {outputs['ssd_loss']:.3f}, "
-            ))
-
-        # save weights at end of epoch
-        checkpoint.save(checkpoint_prefix)
+    nr_batches_per_epoch = int(math.ceil(length_dataset / float(batch_size)))
+
+    _train_epochs(nr_batches_per_epoch, nr_epochs, dataset, input_encoder,
+                  checkpoint, checkpoint_prefix, verbose=verbose, **checkpointables)
 
     if verbose:
         print("Training finished!... save model weights")
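The bookkeeping that replaces the outer epoch loop reduces to one division; a small worked example with hypothetical values (not taken from the project):

    import math

    length_dataset = 1000                       # hypothetical number of images
    batch_size = 32
    nr_epochs = 4

    nr_batches_per_epoch = int(math.ceil(length_dataset / float(batch_size)))
    total_batches = nr_batches_per_epoch * nr_epochs

    # _train_epochs consumes all total_batches in a single pass over the dataset
    # and treats every nr_batches_per_epoch-th batch as an epoch boundary.
    print(nr_batches_per_epoch, total_batches)  # 32 128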
@@ -445,24 +431,34 @@ def train(dataset: tf.data.Dataset,
     checkpoint.save(checkpoint_prefix)
 
 
-def _train_one_epoch(epoch: int,
-                     dataset: tf.data.Dataset,
-                     input_encoder: ssd_input_encoder.SSDInputEncoder,
-                     ssd: tf.keras.Model,
-                     ssd_optimizer: tf.train.Optimizer,
-                     global_step: tf.Variable,
-                     epoch_var: tf.Variable,
-                     learning_rate_var: tf.Variable) -> Dict[str, float]:
+def _train_epochs(nr_batches_per_epoch: int,
+                  nr_epochs: int,
+                  dataset: tf.data.Dataset,
+                  input_encoder: ssd_input_encoder.SSDInputEncoder,
+                  checkpoint: tf.train.Checkpoint,
+                  checkpoint_prefix: str,
+                  ssd: tf.keras.Model,
+                  ssd_optimizer: tf.train.Optimizer,
+                  global_step: tf.Variable,
+                  epoch_var: tf.Variable,
+                  learning_rate_var: tf.Variable,
+                  verbose: bool) -> None:
 
     with summary_ops_v2.always_record_summaries():
+        epoch = 0
+        batch_counter = 0
         epoch_var.assign(epoch)
-        epoch_start_time = time.time()
+        epoch_start_time = None
 
         # define loss variables
         ssd_loss_avg = tfe.metrics.Mean(name='ssd_loss', dtype=tf.float32)
 
         # go through data set
         for x, y in dataset:
+            if batch_counter == 0:
+                # epoch starts
+                epoch_start_time = time.time()
+
             labels = []
             for i in range(y.shape[0]):
                 image_labels = np.asarray(y[i])
@@ -476,17 +472,35 @@ def _train_one_epoch(epoch: int,
                                              global_step=global_step)
             ssd_loss_avg(ssd_train_loss)
             global_step.assign_add(1)
 
-        epoch_end_time = time.time()
-        per_epoch_time = epoch_end_time - epoch_start_time
-
-        # final losses of epoch
-        outputs = {
-            'ssd_loss': ssd_loss_avg.result(False),
-            'per_epoch_time': per_epoch_time,
-        }
-
-        return outputs
+            batch_counter += 1
+
+            if batch_counter == nr_batches_per_epoch:
+                # one epoch is over
+                epoch_end_time = time.time()
+                per_epoch_time = epoch_end_time - epoch_start_time
+
+                # final losses of epoch
+                outputs = {
+                    'ssd_loss': ssd_loss_avg.result(False),
+                    'per_epoch_time': per_epoch_time,
+                }
+
+                if verbose:
+                    print((
+                        f"[{epoch + 1:d}/{nr_epochs:d} - "
+                        f"train time: {outputs['per_epoch_time']:.2f}, "
+                        f"SSD loss: {outputs['ssd_loss']:.3f}, "
+                    ))
+
+                # save weights at end of epoch
+                checkpoint.save(checkpoint_prefix)
+
+                epoch += 1
+
+                batch_counter = 0
+
+
 
 
 def _train_ssd_step(ssd: tf.keras.Model,
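Stripped of the TensorFlow specifics, the epoch bookkeeping introduced in _train_epochs is a plain counter pattern; a dependency-free sketch (hypothetical helper names, not the project's training step):

    import time

    def run_epochs(batches, nr_batches_per_epoch, nr_epochs, train_step, save_checkpoint, verbose=False):
        """Single pass over an already-repeated batch stream with per-epoch bookkeeping."""
        epoch = 0
        batch_counter = 0
        epoch_start_time = None
        for batch in batches:
            if batch_counter == 0:
                epoch_start_time = time.time()      # epoch starts
            train_step(batch)
            batch_counter += 1
            if batch_counter == nr_batches_per_epoch:
                # one epoch is over: report, checkpoint, reset the counter
                if verbose:
                    print(f"[{epoch + 1:d}/{nr_epochs:d}] "
                          f"train time: {time.time() - epoch_start_time:.2f}")
                save_checkpoint()
                epoch += 1
                batch_counter = 0

    # Hypothetical usage with stand-in callables:
    run_epochs(batches=range(6), nr_batches_per_epoch=2, nr_epochs=3,
               train_step=lambda b: None, save_checkpoint=lambda: None, verbose=True)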