diff --git a/src/twomartens/masterthesis/ssd.py b/src/twomartens/masterthesis/ssd.py
index 7e64332..c839dab 100644
--- a/src/twomartens/masterthesis/ssd.py
+++ b/src/twomartens/masterthesis/ssd.py
@@ -23,25 +23,36 @@ Attributes:
     DROPOUT_RATE: rate for dropping weights
     IOU_THRESHOLD: threshold for required overlap with ground truth bounding box
     TOP_K: maximum number of predictions kept for each batch item after non-maximum suppression
+    LOG_FREQUENCY: number of steps that must pass before logging happens
 
 Classes:
     ``DropoutSSD``: wraps Dropout SSD 300 model
     ``SSD``: wraps vanilla SSD 300 model
 """
+import os
+import time
+from typing import Dict
 from typing import Optional
 
 import tensorflow as tf
+from tensorflow.python.ops import summary_ops_v2
 
+from twomartens.masterthesis.ssd_keras.keras_loss_function import keras_ssd_loss
 from twomartens.masterthesis.ssd_keras.models import keras_ssd300
 from twomartens.masterthesis.ssd_keras.models import keras_ssd300_dropout
 
+K = tf.keras.backend
+tfe = tf.contrib.eager
+
 IMAGE_SIZE = (240, 320, 3)  # TODO check with SceneNet RGB-D
 N_CLASSES = 80
 DROPOUT_RATE = 0.5
 IOU_THRESHOLD = 0.45
 TOP_K = 200
+LOG_FREQUENCY = 10
+
 
 class SSD:
     """
@@ -86,3 +97,170 @@ class DropoutSSD:
 
     def __call__(self, inputs: tf.Tensor, *args, **kwargs) -> tf.Tensor:
         return self._model(inputs)
+
+
+def train(dataset: tf.data.Dataset,
+          iteration: int,
+          use_dropout: bool,
+          weights_prefix: str,
+          weights_path: Optional[str] = None,
+          verbose: Optional[bool] = False,
+          batch_size: Optional[int] = 128,
+          nr_epochs: Optional[int] = 80,
+          lr: Optional[float] = 0.002) -> None:
+    """
+    Trains the SSD on the given data set.
+
+    This function provides early stopping and creates checkpoints after every
+    epoch as well as after finishing training. When called again with the same
+    ``iteration``, training tries to continue where it left off by restoring
+    a saved checkpoint. The loss values are provided as scalar summaries.
+
+    Args:
+        dataset: the training data set
+        iteration: identifier for current training run
+        use_dropout: if True, the DropoutSSD will be used
+        weights_prefix: prefix for weights directory
+        weights_path: path to the pre-trained SSD weights
+        verbose: if True, progress is printed to the standard output
+        batch_size: size of each batch
+        nr_epochs: number of epochs to train
+        lr: initial learning rate
+    """
+
+    # define checkpointed tensors and variables
+    checkpointables = {
+        'learning_rate_var': K.variable(lr),
+    }
+    # model
+    if use_dropout:
+        checkpointables.update({
+            'ssd': DropoutSSD(mode='training', weights_path=weights_path)
+        })
+    else:
+        checkpointables.update({
+            'ssd': SSD(mode='training', weights_path=weights_path)
+        })
+
+    checkpointables.update({
+        # optimizer
+        'ssd_optimizer': tf.train.AdamOptimizer(learning_rate=checkpointables['learning_rate_var'],
+                                                beta1=0.5, beta2=0.999),
+        # global step counter
+        'global_step': tf.train.get_or_create_global_step(),
+        # epoch counter used to resume training at the right epoch
+        'epoch_var': K.variable(-1, dtype=tf.int64)
+    })
+
+    # checkpoint
+    checkpoint_dir = os.path.join(weights_prefix, str(iteration) + '/')
+    os.makedirs(checkpoint_dir, exist_ok=True)
+    checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt')
+    latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
+    checkpoint = tf.train.Checkpoint(**checkpointables)
+    checkpoint.restore(latest_checkpoint)
+
+    def _get_last_epoch(epoch_var: tf.Variable, **kwargs) -> int:
+        return int(epoch_var)
+
+    last_epoch = _get_last_epoch(**checkpointables)
+    previous_epochs = 0
+    if last_epoch != -1:
+        previous_epochs = last_epoch + 1
+
+    with summary_ops_v2.always_record_summaries():
+        summary_ops_v2.scalar(name='learning_rate', tensor=checkpointables['learning_rate_var'],
+                              step=checkpointables['global_step'])
+
+    for epoch in range(nr_epochs - previous_epochs):
+        _epoch = epoch + previous_epochs
+        outputs = _train_one_epoch(_epoch, dataset, **checkpointables)
+
+        if verbose:
+            print((
+                f"[{_epoch + 1:d}/{nr_epochs:d}] - "
+                f"train time: {outputs['per_epoch_time']:.2f}, "
+                f"SSD loss: {outputs['ssd_loss']:.3f}"
+            ))
+
+        # save weights at end of epoch
+        checkpoint.save(checkpoint_prefix)
+
+    if verbose:
+        print("Training finished! Saving model weights...")
+
+    # save trained models
+    checkpoint.save(checkpoint_prefix)
+
+
+def _train_one_epoch(epoch: int,
+                     dataset: tf.data.Dataset,
+                     ssd: tf.keras.Model,
+                     ssd_optimizer: tf.train.Optimizer,
+                     global_step: tf.Variable,
+                     epoch_var: tf.Variable,
+                     **kwargs) -> Dict[str, float]:  # absorbs remaining checkpointables
+
+    with summary_ops_v2.always_record_summaries():
+        epoch_var.assign(epoch)
+        epoch_start_time = time.time()
+
+        # define loss variables
+        ssd_loss_avg = tfe.metrics.Mean(name='ssd_loss', dtype=tf.float32)
+
+        # go through data set
+        for x, y in dataset:
+            ssd_train_loss = _train_ssd_step(ssd=ssd,
+                                             optimizer=ssd_optimizer,
+                                             inputs=x,
+                                             ground_truth=y,
+                                             global_step=global_step)
+            ssd_loss_avg(ssd_train_loss)
+            # global_step is incremented by the optimizer inside _train_ssd_step
+
+        epoch_end_time = time.time()
+        per_epoch_time = epoch_end_time - epoch_start_time
+
+        # final losses of epoch
+        outputs = {
+            'ssd_loss': ssd_loss_avg.result(False),
+            'per_epoch_time': per_epoch_time,
+        }
+
+    return outputs
+
+
+def _train_ssd_step(ssd: tf.keras.Model,
+                    optimizer: tf.train.Optimizer,
+                    inputs: tf.Tensor,
+                    ground_truth: tf.Tensor,
+                    global_step: tf.Variable) -> tf.Tensor:
+    """
+    Trains the SSD model for one step (one batch).
+
+    :param ssd: instance of the SSD model
+    :param optimizer: instance of chosen optimizer
+    :param inputs: inputs from data set
+    :param ground_truth: ground truth from data set
+    :param global_step: the global step variable
+    :return: the calculated loss
+    """
+    with tf.GradientTape() as tape:
+        predictions = ssd(inputs)
+        loss = keras_ssd_loss.SSDLoss()
+        batch_size = tf.shape(predictions)[0]
+        ssd_loss = loss.compute_loss(ground_truth, predictions) / tf.to_float(batch_size)
+
+    ssd_grads = tape.gradient(ssd_loss, ssd.trainable_variables)
+    if int(global_step % LOG_FREQUENCY) == 0:
+        summary_ops_v2.scalar(name='ssd_loss', tensor=ssd_loss, step=global_step)
+
+        for grad, variable in zip(ssd_grads, ssd.trainable_variables):
+            summary_ops_v2.histogram(name='gradients/' + variable.name, tensor=tf.math.l2_normalize(grad),
+                                     step=global_step)
+            summary_ops_v2.histogram(name='variables/' + variable.name, tensor=tf.math.l2_normalize(variable),
+                                     step=global_step)
+
+    optimizer.apply_gradients(zip(ssd_grads, ssd.trainable_variables),
+                              global_step=global_step)
+
+    return ssd_loss
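
For context, a minimal sketch of how the new ``train`` function might be driven from calling code. This is an illustration under assumptions, not part of the patch: ``build_training_dataset`` is a hypothetical helper standing in for whatever pipeline yields batches of (image, encoded ground truth) pairs, and the log directory, weights prefix, and weights file name are placeholders. Eager execution and a default summary writer are set up because the loop iterates the dataset directly and logs through ``summary_ops_v2``.

import tensorflow as tf

from twomartens.masterthesis import ssd

tf.enable_eager_execution()  # train() iterates the tf.data.Dataset eagerly

# hypothetical helper: yields batches of (image, encoded ground truth) pairs
# matching the SSD 300 input encoding used by the ssd_keras models
dataset = build_training_dataset()

# summary_ops_v2.scalar/histogram write to the current default summary writer
writer = tf.contrib.summary.create_file_writer('logs/ssd/0')  # placeholder log dir
with writer.as_default():
    ssd.train(dataset,
              iteration=0,
              use_dropout=True,
              weights_prefix='weights/ssd',       # placeholder weights directory
              weights_path='ssd300_weights.h5',   # placeholder pre-trained weights
              verbose=True,
              nr_epochs=2)

# calling train() again with iteration=0 restores the latest checkpoint from
# weights/ssd/0/ and resumes at the epoch stored in epoch_var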