Implemented training function for SSD
Signed-off-by: Jim Martens <github@2martens.de>
This commit is contained in:
Attributes:
    DROPOUT_RATE: rate for dropping weights
    IOU_THRESHOLD: threshold for required overlap with ground truth bounding box
    TOP_K: maximum number of predictions kept for each batch item after non-maximum suppression
    LOG_FREQUENCY: number of steps that must pass before logging happens

Classes:
    ``DropoutSSD``: wraps Dropout SSD 300 model
    ``SSD``: wraps vanilla SSD 300 model
"""
|
import os
|
||||||
|
import time
|
||||||
|
from typing import Dict
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
import tensorflow as tf
|
import tensorflow as tf
|
||||||
|
from tensorflow.python.ops import summary_ops_v2
|
||||||
|
|
||||||
|
from twomartens.masterthesis.ssd_keras.keras_loss_function import keras_ssd_loss
|
||||||
from twomartens.masterthesis.ssd_keras.models import keras_ssd300
|
from twomartens.masterthesis.ssd_keras.models import keras_ssd300
|
||||||
from twomartens.masterthesis.ssd_keras.models import keras_ssd300_dropout
|
from twomartens.masterthesis.ssd_keras.models import keras_ssd300_dropout
|
||||||
|
|
||||||
|
K = tf.keras.backend
|
||||||
|
tfe = tf.contrib.eager
|
||||||
|
|
||||||
IMAGE_SIZE = (240, 320, 3) # TODO check with SceneNet RGB-D
|
IMAGE_SIZE = (240, 320, 3) # TODO check with SceneNet RGB-D
|
||||||
N_CLASSES = 80
|
N_CLASSES = 80
|
||||||
DROPOUT_RATE = 0.5
|
DROPOUT_RATE = 0.5
|
||||||
IOU_THRESHOLD = 0.45
|
IOU_THRESHOLD = 0.45
|
||||||
TOP_K = 200
|
TOP_K = 200
|
||||||
|
|
||||||
|
LOG_FREQUENCY = 10
|
||||||
|
|
||||||
|
|
||||||
class SSD:
|
class SSD:
|
||||||
"""
|
"""
|
||||||
@ -86,3 +97,170 @@ class DropoutSSD:
|
|||||||
|
|
||||||
def __call__(self, inputs: tf.Tensor, *args, **kwargs) -> tf.Tensor:
    """
    Forwards ``inputs`` through the wrapped Keras model.

    Args:
        inputs: input tensor passed straight to the underlying model

    Returns:
        the output tensor of the wrapped model
    """
    # extra *args/**kwargs are accepted for interface compatibility but ignored
    return self._model(inputs)
|
|
||||||
|
|
||||||
|
def train(dataset: tf.data.Dataset,
          iteration: int,
          use_dropout: bool,
          weights_prefix: str,
          weights_path: Optional[str] = None,
          verbose: Optional[bool] = False,
          batch_size: Optional[int] = 128,
          nr_epochs: Optional[int] = 80,
          lr: Optional[float] = 0.002) -> None:
    """
    Trains the SSD on the given data set.

    This function creates checkpoints after every
    epoch as well as after finishing training. When starting
    this function with the same ``iteration`` then the training will try to
    continue where it ended last time by restoring a saved checkpoint.
    The loss values are provided as scalar summaries.

    NOTE(review): the original docstring claimed early stopping, but no
    early-stopping logic is present in this function — training always runs
    the remaining ``nr_epochs`` epochs.

    Args:
        dataset: the training data set
        iteration: identifier for current training run
        use_dropout: if True, the DropoutSSD will be used
        weights_prefix: prefix for weights directory
        weights_path: path to the pre-trained SSD weights
        verbose: if True, progress is printed to the standard output
        batch_size: size of each batch
            (NOTE(review): currently unused inside this function — confirm
            whether batching is expected to happen in ``dataset`` instead)
        nr_epochs: number of epochs to train
        lr: initial learning rate
    """
    # define checkpointed tensors and variables; everything in this dict is
    # handed to tf.train.Checkpoint below, so save/restore covers all of it
    checkpointables = {
        'learning_rate_var': K.variable(lr),
    }
    # model
    if use_dropout:
        checkpointables.update({
            'ssd': DropoutSSD(mode='training', weights_path=weights_path)
        })
    else:
        checkpointables.update({
            'ssd': SSD(mode='training', weights_path=weights_path)
        })

    checkpointables.update({
        # optimizer
        'ssd_optimizer': tf.train.AdamOptimizer(learning_rate=checkpointables['learning_rate_var'],
                                                beta1=0.5, beta2=0.999),
        # global step counter
        'global_step': tf.train.get_or_create_global_step(),
        # -1 marks "no epoch finished yet"; used to detect a resumed run
        'epoch_var': K.variable(-1, dtype=tf.int64)
    })

    # checkpoint: one sub-directory per iteration so runs don't clobber each other
    checkpoint_dir = os.path.join(weights_prefix, str(iteration) + '/')
    os.makedirs(checkpoint_dir, exist_ok=True)
    checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt')
    latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
    checkpoint = tf.train.Checkpoint(**checkpointables)
    # restore is a no-op when latest_checkpoint is None (fresh run)
    checkpoint.restore(latest_checkpoint)

    def _get_last_epoch(epoch_var: tf.Variable, **kwargs) -> int:
        # picks epoch_var out of checkpointables; the rest lands in **kwargs
        return int(epoch_var)

    last_epoch = _get_last_epoch(**checkpointables)
    previous_epochs = 0
    if last_epoch != -1:
        # resumed run: skip the epochs that were already completed
        previous_epochs = last_epoch + 1

    with summary_ops_v2.always_record_summaries():
        # logged once per call, not per step
        summary_ops_v2.scalar(name='learning_rate', tensor=checkpointables['learning_rate_var'],
                              step=checkpointables['global_step'])

    for epoch in range(nr_epochs - previous_epochs):
        # _epoch is the absolute epoch index across resumed runs
        _epoch = epoch + previous_epochs
        outputs = _train_one_epoch(_epoch, dataset, **checkpointables)

        if verbose:
            print((
                f"[{_epoch + 1:d}/{nr_epochs:d} - "
                f"train time: {outputs['per_epoch_time']:.2f}, "
                f"SSD loss: {outputs['ssd_loss']:.3f}, "
            ))

        # save weights at end of epoch
        checkpoint.save(checkpoint_prefix)

    if verbose:
        print("Training finished!... save model weights")

    # save trained models
    # NOTE(review): this duplicates the per-epoch save for the final epoch
    checkpoint.save(checkpoint_prefix)
||||||
|
def _train_one_epoch(epoch: int,
                     dataset: tf.data.Dataset,
                     ssd: tf.keras.Model,
                     ssd_optimizer: tf.train.Optimizer,
                     global_step: tf.Variable,
                     epoch_var: tf.Variable) -> Dict[str, float]:
    """
    Runs one full training pass over the data set.

    Records the current epoch in ``epoch_var`` (so checkpoints can resume),
    trains the SSD one batch at a time, and averages the per-step losses.

    Args:
        epoch: absolute index of the epoch being trained
        dataset: the training data set yielding (inputs, ground truth) pairs
        ssd: the SSD model to train
        ssd_optimizer: optimizer used for the SSD
        global_step: global step counter, incremented once per batch
        epoch_var: checkpointed variable holding the last trained epoch

    Returns:
        dict with the mean 'ssd_loss' of the epoch and the wall-clock
        'per_epoch_time' in seconds
    """
    with summary_ops_v2.always_record_summaries():
        epoch_var.assign(epoch)
        start_time = time.time()

        # running mean over all batch losses of this epoch
        loss_metric = tfe.metrics.Mean(name='ssd_loss', dtype=tf.float32)

        for batch_inputs, batch_ground_truth in dataset:
            step_loss = _train_ssd_step(ssd=ssd,
                                        optimizer=ssd_optimizer,
                                        inputs=batch_inputs,
                                        ground_truth=batch_ground_truth,
                                        global_step=global_step)
            loss_metric(step_loss)
            global_step.assign_add(1)

        # final losses of epoch
        return {
            'ssd_loss': loss_metric.result(False),
            'per_epoch_time': time.time() - start_time,
        }
|
def _train_ssd_step(ssd: tf.keras.Model,
                    optimizer: tf.train.Optimizer,
                    inputs: tf.Tensor,
                    ground_truth: tf.Tensor,
                    global_step: tf.Variable) -> tf.Tensor:
    """
    Trains the SSD model for one step (one batch).

    Computes the SSD loss under a gradient tape, periodically logs the loss
    plus gradient/variable histograms, and applies the gradients.

    Args:
        ssd: instance of the SSD model
        optimizer: instance of chosen optimizer
        inputs: inputs from data set
        ground_truth: ground truth from data set
        global_step: the global step variable

    Returns:
        the calculated loss
    """
    with tf.GradientTape() as tape:
        predictions = ssd(inputs)
        loss_fn = keras_ssd_loss.SSDLoss()
        # normalize the summed loss by the number of items in the batch
        nr_batch_items = tf.shape(predictions)[0]
        ssd_loss = loss_fn.compute_loss(ground_truth, predictions) / tf.to_float(nr_batch_items)

    gradients = tape.gradient(ssd_loss, ssd.trainable_variables)

    # only log every LOG_FREQUENCY steps to keep the summaries small
    if int(global_step % LOG_FREQUENCY) == 0:
        summary_ops_v2.scalar(name='ssd_loss', tensor=ssd_loss, step=global_step)

        for gradient, variable in zip(gradients, ssd.trainable_variables):
            summary_ops_v2.histogram(name='gradients/' + variable.name,
                                     tensor=tf.math.l2_normalize(gradient),
                                     step=global_step)
            summary_ops_v2.histogram(name='variables/' + variable.name,
                                     tensor=tf.math.l2_normalize(variable),
                                     step=global_step)

    optimizer.apply_gradients(zip(gradients, ssd.trainable_variables),
                              global_step=global_step)

    return ssd_loss
|||||||
Reference in New Issue
Block a user