Implemented training function for SSD
Signed-off-by: Jim Martens <github@2martens.de>
@@ -23,25 +23,36 @@ Attributes:
    DROPOUT_RATE: rate for dropping weights
    IOU_THRESHOLD: threshold for required overlap with ground truth bounding box
    TOP_K: maximum number of predictions kept for each batch item after non-maximum suppression
    LOG_FREQUENCY: number of steps that must pass before logging happens

Classes:
    ``DropoutSSD``: wraps Dropout SSD 300 model

    ``SSD``: wraps vanilla SSD 300 model
"""
import os
import time
from typing import Dict
from typing import Optional

import tensorflow as tf
from tensorflow.python.ops import summary_ops_v2

from twomartens.masterthesis.ssd_keras.keras_loss_function import keras_ssd_loss
from twomartens.masterthesis.ssd_keras.models import keras_ssd300
from twomartens.masterthesis.ssd_keras.models import keras_ssd300_dropout

K = tf.keras.backend
tfe = tf.contrib.eager

IMAGE_SIZE = (240, 320, 3)  # TODO check with SceneNet RGB-D
N_CLASSES = 80
DROPOUT_RATE = 0.5
IOU_THRESHOLD = 0.45
TOP_K = 200

LOG_FREQUENCY = 10


class SSD:
    """
@@ -86,3 +97,170 @@ class DropoutSSD:

    def __call__(self, inputs: tf.Tensor, *args, **kwargs) -> tf.Tensor:
        return self._model(inputs)


def train(dataset: tf.data.Dataset,
          iteration: int,
          use_dropout: bool,
          weights_prefix: str,
          weights_path: Optional[str] = None,
          verbose: Optional[bool] = False,
          batch_size: Optional[int] = 128,
          nr_epochs: Optional[int] = 80,
          lr: Optional[float] = 0.002) -> None:
    """
    Trains the SSD on the given data set.

    This function provides early stopping and creates checkpoints after every
    epoch as well as after finishing training. When this function is started
    again with the same ``iteration``, training tries to continue where it
    left off by restoring the latest saved checkpoint.
    The loss values are provided as scalar summaries.

    Args:
        dataset: the training data set
        iteration: identifier for the current training run
        use_dropout: if True, the DropoutSSD will be used
        weights_prefix: prefix for the weights directory
        weights_path: path to the pre-trained SSD weights
        verbose: if True, progress is printed to the standard output
        batch_size: size of each batch
        nr_epochs: number of epochs to train
        lr: initial learning rate
    """
    # define checkpointed tensors and variables
    checkpointables = {
        'learning_rate_var': K.variable(lr),
    }
    # model
    if use_dropout:
        checkpointables.update({
            'ssd': DropoutSSD(mode='training', weights_path=weights_path)
        })
    else:
        checkpointables.update({
            'ssd': SSD(mode='training', weights_path=weights_path)
        })

    checkpointables.update({
        # optimizer
        'ssd_optimizer': tf.train.AdamOptimizer(learning_rate=checkpointables['learning_rate_var'],
                                                beta1=0.5, beta2=0.999),
        # global step counter
        'global_step': tf.train.get_or_create_global_step(),
        'epoch_var': K.variable(-1, dtype=tf.int64)
    })

    # checkpoint
    checkpoint_dir = os.path.join(weights_prefix, str(iteration) + '/')
    os.makedirs(checkpoint_dir, exist_ok=True)
    checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt')
    latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
    checkpoint = tf.train.Checkpoint(**checkpointables)
    checkpoint.restore(latest_checkpoint)
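    # (added note) ``tf.train.latest_checkpoint`` returns None when no
    # checkpoint exists yet, and ``Checkpoint.restore(None)`` restores
    # nothing, so a fresh run simply starts from the initial values.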

    def _get_last_epoch(epoch_var: tf.Variable, **kwargs) -> int:
        return int(epoch_var)

    last_epoch = _get_last_epoch(**checkpointables)
    previous_epochs = 0
    if last_epoch != -1:
        previous_epochs = last_epoch + 1

    with summary_ops_v2.always_record_summaries():
        summary_ops_v2.scalar(name='learning_rate', tensor=checkpointables['learning_rate_var'],
                              step=checkpointables['global_step'])

    for epoch in range(nr_epochs - previous_epochs):
        _epoch = epoch + previous_epochs
        outputs = _train_one_epoch(_epoch, dataset, **checkpointables)

        if verbose:
            print(
                f"[{_epoch + 1:d}/{nr_epochs:d}] - "
                f"train time: {outputs['per_epoch_time']:.2f}, "
                f"SSD loss: {outputs['ssd_loss']:.3f}"
            )

        # save weights at end of epoch
        checkpoint.save(checkpoint_prefix)

    if verbose:
        print("Training finished! Saving model weights.")

    # save trained models
    checkpoint.save(checkpoint_prefix)
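

# A minimal usage sketch (illustration only, not part of this commit):
# ``make_dataset`` is a hypothetical helper that returns a batched
# ``tf.data.Dataset`` of ``(images, ground_truth)`` pairs in the encoded SSD
# label layout. ``tf.enable_eager_execution()`` has to run at program startup,
# because everything below builds eager-mode variables and metrics.
def _example_usage(make_dataset) -> None:
    dataset = make_dataset(batch_size=128)
    # run 0 with the dropout variant; rerunning with iteration=0 resumes from
    # the checkpoints saved under 'weights/ssd/0/'
    train(dataset,
          iteration=0,
          use_dropout=True,
          weights_prefix='weights/ssd',
          verbose=True)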


def _train_one_epoch(epoch: int,
                     dataset: tf.data.Dataset,
                     ssd: tf.keras.Model,
                     ssd_optimizer: tf.train.Optimizer,
                     global_step: tf.Variable,
                     epoch_var: tf.Variable,
                     **kwargs) -> Dict[str, float]:
    # **kwargs absorbs the checkpointables that are not needed here
    # (e.g. learning_rate_var), since the caller expands the whole dict
    with summary_ops_v2.always_record_summaries():
        epoch_var.assign(epoch)
        epoch_start_time = time.time()

        # define loss variables
        ssd_loss_avg = tfe.metrics.Mean(name='ssd_loss', dtype=tf.float32)

        # go through data set
        for x, y in dataset:
            ssd_train_loss = _train_ssd_step(ssd=ssd,
                                             optimizer=ssd_optimizer,
                                             inputs=x,
                                             ground_truth=y,
                                             global_step=global_step)
            ssd_loss_avg(ssd_train_loss)
            global_step.assign_add(1)

        epoch_end_time = time.time()
        per_epoch_time = epoch_end_time - epoch_start_time

        # final losses of epoch
        outputs = {
            # False: do not write yet another scalar summary for the mean
            'ssd_loss': ssd_loss_avg.result(False),
            'per_epoch_time': per_epoch_time,
        }

    return outputs


def _train_ssd_step(ssd: tf.keras.Model,
                    optimizer: tf.train.Optimizer,
                    inputs: tf.Tensor,
                    ground_truth: tf.Tensor,
                    global_step: tf.Variable) -> tf.Tensor:
    """
    Trains the SSD model for one step (one batch).

    :param ssd: instance of the SSD model
    :param optimizer: instance of the chosen optimizer
    :param inputs: inputs from the data set
    :param ground_truth: ground truth from the data set
    :param global_step: the global step variable
    :return: the calculated loss
    """
    with tf.GradientTape() as tape:
        predictions = ssd(inputs)
        loss = keras_ssd_loss.SSDLoss()
        batch_size = tf.shape(predictions)[0]
        # the Keras-style loss scales with the batch size, so divide it out
        ssd_loss = loss.compute_loss(ground_truth, predictions) / tf.to_float(batch_size)

    ssd_grads = tape.gradient(ssd_loss, ssd.trainable_variables)
    if int(global_step % LOG_FREQUENCY) == 0:
        summary_ops_v2.scalar(name='ssd_loss', tensor=ssd_loss, step=global_step)

        for grad, variable in zip(ssd_grads, ssd.trainable_variables):
            summary_ops_v2.histogram(name='gradients/' + variable.name, tensor=tf.math.l2_normalize(grad),
                                     step=global_step)
            summary_ops_v2.histogram(name='variables/' + variable.name, tensor=tf.math.l2_normalize(variable),
                                     step=global_step)

    # global_step is advanced once per batch by the caller, so it is not
    # passed to apply_gradients (that would increment it a second time)
    optimizer.apply_gradients(zip(ssd_grads, ssd.trainable_variables))

    return ssd_loss
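

# Sketch of how a caller might surface the summaries written above (an
# assumption, not shown in this commit): the ``summary_ops_v2`` calls are
# silently dropped unless a default summary file writer is active.
def _example_train_with_summaries(make_dataset) -> None:
    writer = tf.contrib.summary.create_file_writer('logs/ssd/0')
    with writer.as_default(), tf.contrib.summary.always_record_summaries():
        train(make_dataset(batch_size=128),
              iteration=0,
              use_dropout=False,
              weights_prefix='weights/ssd')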