diff --git a/src/twomartens/masterthesis/cli.py b/src/twomartens/masterthesis/cli.py index 4858f51..404e2cc 100644 --- a/src/twomartens/masterthesis/cli.py +++ b/src/twomartens/masterthesis/cli.py @@ -60,9 +60,11 @@ def _ssd_train(args: argparse.Namespace) -> None: with open(f"{args.ground_truth_path}/instances.bin", "rb") as file: instances = pickle.load(file) - scenenet_data, nr_digits = data.load_scenenet_data(file_names_photos, instances, args.coco_path, - batch_size=batch_size, - resized_shape=(image_size, image_size)) + scenenet_data, nr_digits, length_dataset = \ + data.load_scenenet_data(file_names_photos, instances, args.coco_path, + batch_size=batch_size, + resized_shape=(image_size, image_size), + mode="training") del file_names_photos, instances use_summary_writer = summary_ops_v2.create_file_writer( @@ -71,13 +73,17 @@ def _ssd_train(args: argparse.Namespace) -> None: if args.debug: with use_summary_writer.as_default(): - ssd.train(scenenet_data, args.iteration, use_dropout, weights_prefix=weights_path, + ssd.train(scenenet_data, args.iteration, use_dropout, length_dataset, + weights_prefix=weights_path, weights_path=pre_trained_weights_file, batch_size=batch_size, - nr_epochs=args.num_epochs) + nr_epochs=args.num_epochs, + verbose=args.verbose) else: - ssd.train(scenenet_data, args.iteration, use_dropout, weights_prefix=weights_path, + ssd.train(scenenet_data, args.iteration, use_dropout, length_dataset, + weights_prefix=weights_path, weights_path=pre_trained_weights_file, batch_size=batch_size, - nr_epochs=args.num_epochs) + nr_epochs=args.num_epochs, + verbose=args.verbose) def _auto_encoder_train(args: argparse.Namespace) -> None: @@ -245,9 +251,10 @@ def _ssd_val(args: argparse.Namespace) -> None: with open(f"{args.ground_truth_path}/instances.bin", "rb") as file: instances = pickle.load(file) - scenenet_data, nr_digits = data.load_scenenet_data(file_names_photos, instances, args.coco_path, - batch_size=batch_size, - resized_shape=(image_size, image_size)) + scenenet_data, nr_digits, length_dataset = \ + data.load_scenenet_data(file_names_photos, instances, args.coco_path, + batch_size=batch_size, + resized_shape=(image_size, image_size)) del file_names_photos, instances use_summary_writer = summary_ops_v2.create_file_writer( diff --git a/src/twomartens/masterthesis/data.py b/src/twomartens/masterthesis/data.py index b0068e2..e216ff7 100644 --- a/src/twomartens/masterthesis/data.py +++ b/src/twomartens/masterthesis/data.py @@ -231,7 +231,8 @@ def load_scenenet_data(photo_paths: Sequence[Sequence[str]], instances: Sequence[Sequence[Sequence[dict]]], coco_path: str, num_epochs: int = 1, batch_size: int = 32, - resized_shape: Sequence[int] = (256, 256)) -> Tuple[tf.data.Dataset, int]: + resized_shape: Sequence[int] = (256, 256), + mode: str = "inference") -> Tuple[tf.data.Dataset, int, int]: """ Loads the SceneNet RGB-D data and returns a data set. @@ -242,10 +243,12 @@ def load_scenenet_data(photo_paths: Sequence[Sequence[str]], num_epochs: number of epochs to use batch_size: size of every batch resized_shape: shape of input images to SSD + mode: one of "inference" or "training" Returns: scenenet data set number of digits required to print largest batch number + length of dataset """ trajectories = zip(photo_paths, instances) final_image_paths = [] @@ -292,14 +295,17 @@ def load_scenenet_data(photo_paths: Sequence[Sequence[str]], path_dataset = tf.data.Dataset.from_tensor_slices(final_image_paths) label_dataset = tf.data.Dataset.from_tensor_slices(real_final_labels) dataset = tf.data.Dataset.zip((path_dataset, label_dataset)) - dataset = dataset.repeat(num_epochs) + if mode == "inference": + dataset = dataset.repeat(num_epochs) + elif mode == "training": + dataset = dataset.apply(tf.data.experimental.shuffle_and_repeat(length_dataset, num_epochs)) dataset = dataset.batch(batch_size=batch_size) dataset = dataset.map(_load_images_ssd_callback(resized_shape)) dataset = dataset.prefetch(1) nr_digits = math.ceil(math.log10(math.ceil((length_dataset * num_epochs) / batch_size))) - return dataset, nr_digits + return dataset, nr_digits, length_dataset def _load_images_ssd_callback(resized_shape: Sequence[int]) \ diff --git a/src/twomartens/masterthesis/ssd.py b/src/twomartens/masterthesis/ssd.py index dc0ab41..50cc42f 100644 --- a/src/twomartens/masterthesis/ssd.py +++ b/src/twomartens/masterthesis/ssd.py @@ -34,6 +34,7 @@ Functions: predict(...): runs trained SSD/DropoutSSD on a given data set train(...): trains the SSD/DropoutSSD on a given data set """ +import math import os import pickle import time @@ -341,6 +342,7 @@ def _get_observations(detections: Sequence[Sequence[np.ndarray]]) -> List[List[n def train(dataset: tf.data.Dataset, iteration: int, use_dropout: bool, + length_dataset: int, weights_prefix: str, weights_path: Optional[str] = None, verbose: Optional[bool] = False, @@ -360,6 +362,7 @@ def train(dataset: tf.data.Dataset, dataset: the training data set iteration: identifier for current training run use_dropout: if True, the DropoutSSD will be used + length_dataset: specifies number of images in data set weights_prefix: prefix for weights directory weights_path: path to the pre-trained SSD weights verbose: if True, progress is printed to the standard output @@ -412,31 +415,14 @@ def train(dataset: tf.data.Dataset, [1.0, 2.0, 0.5], [1.0, 2.0, 0.5]]) - def _get_last_epoch(epoch_var: tf.Variable, **kwargs) -> int: - return int(epoch_var) - - last_epoch = _get_last_epoch(**checkpointables) - previous_epochs = 0 - if last_epoch != -1: - previous_epochs = last_epoch + 1 - with summary_ops_v2.always_record_summaries(): summary_ops_v2.scalar(name='learning_rate', tensor=checkpointables['learning_rate_var'], step=checkpointables['global_step']) + + nr_batches_per_epoch = int(math.ceil(length_dataset / float(batch_size))) - for epoch in range(nr_epochs - previous_epochs): - _epoch = epoch + previous_epochs - outputs = _train_one_epoch(_epoch, dataset, input_encoder, **checkpointables) - - if verbose: - print(( - f"[{_epoch + 1:d}/{nr_epochs:d} - " - f"train time: {outputs['per_epoch_time']:.2f}, " - f"SSD loss: {outputs['ssd_loss']:.3f}, " - )) - - # save weights at end of epoch - checkpoint.save(checkpoint_prefix) + _train_epochs(nr_batches_per_epoch, nr_epochs, dataset, input_encoder, + checkpoint, checkpoint_prefix, verbose=verbose, **checkpointables) if verbose: print("Training finished!... save model weights") @@ -445,24 +431,34 @@ def train(dataset: tf.data.Dataset, checkpoint.save(checkpoint_prefix) -def _train_one_epoch(epoch: int, - dataset: tf.data.Dataset, - input_encoder: ssd_input_encoder.SSDInputEncoder, - ssd: tf.keras.Model, - ssd_optimizer: tf.train.Optimizer, - global_step: tf.Variable, - epoch_var: tf.Variable, - learning_rate_var: tf.Variable) -> Dict[str, float]: +def _train_epochs(nr_batches_per_epoch: int, + nr_epochs: int, + dataset: tf.data.Dataset, + input_encoder: ssd_input_encoder.SSDInputEncoder, + checkpoint: tf.train.Checkpoint, + checkpoint_prefix: str, + ssd: tf.keras.Model, + ssd_optimizer: tf.train.Optimizer, + global_step: tf.Variable, + epoch_var: tf.Variable, + learning_rate_var: tf.Variable, + verbose: bool) -> None: with summary_ops_v2.always_record_summaries(): + epoch = 0 + batch_counter = 0 epoch_var.assign(epoch) - epoch_start_time = time.time() + epoch_start_time = None # define loss variables ssd_loss_avg = tfe.metrics.Mean(name='ssd_loss', dtype=tf.float32) # go through data set for x, y in dataset: + if batch_counter == 0: + # epoch starts + epoch_start_time = time.time() + labels = [] for i in range(y.shape[0]): image_labels = np.asarray(y[i]) @@ -476,17 +472,35 @@ def _train_one_epoch(epoch: int, global_step=global_step) ssd_loss_avg(ssd_train_loss) global_step.assign_add(1) + + batch_counter += 1 + + if batch_counter == nr_batches_per_epoch: + # one epoch is over + epoch_end_time = time.time() + per_epoch_time = epoch_end_time - epoch_start_time + + # final losses of epoch + outputs = { + 'ssd_loss': ssd_loss_avg.result(False), + 'per_epoch_time': per_epoch_time, + } + + if verbose: + print(( + f"[{epoch + 1:d}/{nr_epochs:d} - " + f"train time: {outputs['per_epoch_time']:.2f}, " + f"SSD loss: {outputs['ssd_loss']:.3f}, " + )) + + # save weights at end of epoch + checkpoint.save(checkpoint_prefix) + + epoch += 1 + + batch_counter = 0 - epoch_end_time = time.time() - per_epoch_time = epoch_end_time - epoch_start_time - - # final losses of epoch - outputs = { - 'ssd_loss': ssd_loss_avg.result(False), - 'per_epoch_time': per_epoch_time, - } - - return outputs + def _train_ssd_step(ssd: tf.keras.Model,