Implemented training function for SSD
Signed-off-by: Jim Martens <github@2martens.de>
This commit is contained in:
Attributes:
    DROPOUT_RATE: rate for dropping weights
    IOU_THRESHOLD: threshold for required overlap with ground truth bounding box
    TOP_K: maximum number of predictions kept for each batch item after non-maximum suppression
    LOG_FREQUENCY: number of steps that must pass before logging happens

Classes:
    ``DropoutSSD``: wraps Dropout SSD 300 model
    ``SSD``: wraps vanilla SSD 300 model
"""
|
import os
|
||||||
|
import time
|
||||||
|
from typing import Dict
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
import tensorflow as tf
|
import tensorflow as tf
|
||||||
|
from tensorflow.python.ops import summary_ops_v2
|
||||||
|
|
||||||
|
from twomartens.masterthesis.ssd_keras.keras_loss_function import keras_ssd_loss
|
||||||
from twomartens.masterthesis.ssd_keras.models import keras_ssd300
|
from twomartens.masterthesis.ssd_keras.models import keras_ssd300
|
||||||
from twomartens.masterthesis.ssd_keras.models import keras_ssd300_dropout
|
from twomartens.masterthesis.ssd_keras.models import keras_ssd300_dropout
|
||||||
|
|
||||||
|
K = tf.keras.backend
|
||||||
|
tfe = tf.contrib.eager
|
||||||
|
|
||||||
IMAGE_SIZE = (240, 320, 3) # TODO check with SceneNet RGB-D
|
IMAGE_SIZE = (240, 320, 3) # TODO check with SceneNet RGB-D
|
||||||
N_CLASSES = 80
|
N_CLASSES = 80
|
||||||
DROPOUT_RATE = 0.5
|
DROPOUT_RATE = 0.5
|
||||||
IOU_THRESHOLD = 0.45
|
IOU_THRESHOLD = 0.45
|
||||||
TOP_K = 200
|
TOP_K = 200
|
||||||
|
|
||||||
|
LOG_FREQUENCY = 10
|
||||||
|
|
||||||
|
|
||||||
class SSD:
|
class SSD:
|
||||||
"""
|
"""
|
||||||
@ -86,3 +97,170 @@ class DropoutSSD:
|
|||||||
|
|
||||||
def __call__(self, inputs: tf.Tensor, *args, **kwargs) -> tf.Tensor:
    """
    Forwards ``inputs`` through the wrapped Keras model.

    Args:
        inputs: input tensor passed straight to the underlying model

    Returns:
        the output tensor of the wrapped model
    """
    # extra *args/**kwargs are accepted for interface compatibility but ignored
    return self._model(inputs)
|
|
||||||
|
|
||||||
|
def train(dataset: tf.data.Dataset,
          iteration: int,
          use_dropout: bool,
          weights_prefix: str,
          weights_path: Optional[str] = None,
          verbose: Optional[bool] = False,
          batch_size: Optional[int] = 128,
          nr_epochs: Optional[int] = 80,
          lr: Optional[float] = 0.002) -> None:
    """
    Trains the SSD on the given data set.

    This function creates checkpoints after every
    epoch as well as after finishing training. When starting
    this function with the same ``iteration`` then the training will try to
    continue where it ended last time by restoring a saved checkpoint.
    The loss values are provided as scalar summaries.

    NOTE(review): the original docstring claimed early stopping, but no
    early-stopping logic is present in this function — training always runs
    the remaining ``nr_epochs`` epochs.

    Args:
        dataset: the training data set
        iteration: identifier for current training run
        use_dropout: if True, the DropoutSSD will be used
        weights_prefix: prefix for weights directory
        weights_path: path to the pre-trained SSD weights
        verbose: if True, progress is printed to the standard output
        batch_size: size of each batch
            (NOTE(review): currently unused inside this function — confirm
            whether batching is expected to happen in ``dataset`` instead)
        nr_epochs: number of epochs to train
        lr: initial learning rate
    """
    # define checkpointed tensors and variables; everything in this dict is
    # handed to tf.train.Checkpoint below, so save/restore covers all of it
    checkpointables = {
        'learning_rate_var': K.variable(lr),
    }
    # model
    if use_dropout:
        checkpointables.update({
            'ssd': DropoutSSD(mode='training', weights_path=weights_path)
        })
    else:
        checkpointables.update({
            'ssd': SSD(mode='training', weights_path=weights_path)
        })

    checkpointables.update({
        # optimizer
        'ssd_optimizer': tf.train.AdamOptimizer(learning_rate=checkpointables['learning_rate_var'],
                                                beta1=0.5, beta2=0.999),
        # global step counter
        'global_step': tf.train.get_or_create_global_step(),
        # -1 marks "no epoch finished yet"; used to detect a resumed run
        'epoch_var': K.variable(-1, dtype=tf.int64)
    })

    # checkpoint: one sub-directory per iteration so runs don't clobber each other
    checkpoint_dir = os.path.join(weights_prefix, str(iteration) + '/')
    os.makedirs(checkpoint_dir, exist_ok=True)
    checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt')
    latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
    checkpoint = tf.train.Checkpoint(**checkpointables)
    # restore is a no-op when latest_checkpoint is None (fresh run)
    checkpoint.restore(latest_checkpoint)

    def _get_last_epoch(epoch_var: tf.Variable, **kwargs) -> int:
        # picks epoch_var out of checkpointables; the rest lands in **kwargs
        return int(epoch_var)

    last_epoch = _get_last_epoch(**checkpointables)
    previous_epochs = 0
    if last_epoch != -1:
        # resumed run: skip the epochs that were already completed
        previous_epochs = last_epoch + 1

    with summary_ops_v2.always_record_summaries():
        # logged once per call, not per step
        summary_ops_v2.scalar(name='learning_rate', tensor=checkpointables['learning_rate_var'],
                              step=checkpointables['global_step'])

    for epoch in range(nr_epochs - previous_epochs):
        # _epoch is the absolute epoch index across resumed runs
        _epoch = epoch + previous_epochs
        outputs = _train_one_epoch(_epoch, dataset, **checkpointables)

        if verbose:
            print((
                f"[{_epoch + 1:d}/{nr_epochs:d} - "
                f"train time: {outputs['per_epoch_time']:.2f}, "
                f"SSD loss: {outputs['ssd_loss']:.3f}, "
            ))

        # save weights at end of epoch
        checkpoint.save(checkpoint_prefix)

    if verbose:
        print("Training finished!... save model weights")

    # save trained models
    # NOTE(review): this duplicates the per-epoch save for the final epoch
    checkpoint.save(checkpoint_prefix)
||||||
|
def _train_one_epoch(epoch: int,
                     dataset: tf.data.Dataset,
                     ssd: tf.keras.Model,
                     ssd_optimizer: tf.train.Optimizer,
                     global_step: tf.Variable,
                     epoch_var: tf.Variable) -> Dict[str, float]:
    """
    Runs one full training pass over the data set.

    Records the current epoch in ``epoch_var`` (so checkpoints can resume),
    trains the SSD one batch at a time, and averages the per-step losses.

    Args:
        epoch: absolute index of the epoch being trained
        dataset: the training data set yielding (inputs, ground truth) pairs
        ssd: the SSD model to train
        ssd_optimizer: optimizer used for the SSD
        global_step: global step counter, incremented once per batch
        epoch_var: checkpointed variable holding the last trained epoch

    Returns:
        dict with the mean 'ssd_loss' of the epoch and the wall-clock
        'per_epoch_time' in seconds
    """
    with summary_ops_v2.always_record_summaries():
        epoch_var.assign(epoch)
        start_time = time.time()

        # running mean over all batch losses of this epoch
        loss_metric = tfe.metrics.Mean(name='ssd_loss', dtype=tf.float32)

        for batch_inputs, batch_ground_truth in dataset:
            step_loss = _train_ssd_step(ssd=ssd,
                                        optimizer=ssd_optimizer,
                                        inputs=batch_inputs,
                                        ground_truth=batch_ground_truth,
                                        global_step=global_step)
            loss_metric(step_loss)
            global_step.assign_add(1)

        # final losses of epoch
        return {
            'ssd_loss': loss_metric.result(False),
            'per_epoch_time': time.time() - start_time,
        }
|
def _train_ssd_step(ssd: tf.keras.Model,
                    optimizer: tf.train.Optimizer,
                    inputs: tf.Tensor,
                    ground_truth: tf.Tensor,
                    global_step: tf.Variable) -> tf.Tensor:
    """
    Trains the SSD model for one step (one batch).

    Computes the SSD loss under a gradient tape, periodically logs the loss
    plus gradient/variable histograms, and applies the gradients.

    Args:
        ssd: instance of the SSD model
        optimizer: instance of chosen optimizer
        inputs: inputs from data set
        ground_truth: ground truth from data set
        global_step: the global step variable

    Returns:
        the calculated loss
    """
    with tf.GradientTape() as tape:
        predictions = ssd(inputs)
        loss_fn = keras_ssd_loss.SSDLoss()
        # normalize the summed loss by the number of items in the batch
        nr_batch_items = tf.shape(predictions)[0]
        ssd_loss = loss_fn.compute_loss(ground_truth, predictions) / tf.to_float(nr_batch_items)

    gradients = tape.gradient(ssd_loss, ssd.trainable_variables)

    # only log every LOG_FREQUENCY steps to keep the summaries small
    if int(global_step % LOG_FREQUENCY) == 0:
        summary_ops_v2.scalar(name='ssd_loss', tensor=ssd_loss, step=global_step)

        for gradient, variable in zip(gradients, ssd.trainable_variables):
            summary_ops_v2.histogram(name='gradients/' + variable.name,
                                     tensor=tf.math.l2_normalize(gradient),
                                     step=global_step)
            summary_ops_v2.histogram(name='variables/' + variable.name,
                                     tensor=tf.math.l2_normalize(variable),
                                     step=global_step)

    optimizer.apply_gradients(zip(gradients, ssd.trainable_variables),
                              global_step=global_step)

    return ssd_loss
|||||||
Reference in New Issue
Block a user