486 lines
22 KiB
Python
486 lines
22 KiB
Python
# -*- coding: utf-8 -*-
|
|
|
|
# Copyright 2018 Timon Brüning, Inga Kempfert, Anne Kunstmann, Jim Martens,
|
|
# Marius Pierenkemper, Yanneck Reiss
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
"""
|
|
Functionality to evaluate results of networks.
|
|
|
|
Functions:
|
|
get_number_gt_per_class(...): calculates the number of ground truth boxes per class
|
|
prepare-predictions(...): prepares the predictions for further processing
|
|
match_predictions(...): matches predictions to ground truth boxes
|
|
get_precision_recall(...): computes the precision and recall values and returns them
|
|
get_f1_score(...): computes the F1 score for every class
|
|
get_mean_average_precisions(...): computes the mean average precision for each class and returns them
|
|
get_mean_average_precision(...): computes the mean average precision over all classes and returns it
|
|
"""
|
|
from typing import Sequence, Union, Tuple, List
|
|
|
|
import numpy as np
|
|
|
|
|
|
def get_number_gt_per_class(labels: Sequence[Sequence[Sequence[int]]],
|
|
nr_classes: int) -> np.ndarray:
|
|
"""
|
|
Calculates the number of ground truth boxes per class and returns result.
|
|
|
|
Args:
|
|
labels: list of labels per image
|
|
nr_classes: number of classes
|
|
|
|
Returns:
|
|
numpy array with respective counts
|
|
"""
|
|
number_gt_per_class = np.zeros(shape=(nr_classes + 1), dtype=np.int)
|
|
label_range = range(len(labels))
|
|
|
|
# iterate over images
|
|
for i in label_range:
|
|
boxes = np.asarray(labels[i])
|
|
|
|
# iterate over boxes in image
|
|
for j in range(boxes.shape[0]):
|
|
class_id = int(boxes[j, 0])
|
|
if class_id > nr_classes:
|
|
continue
|
|
number_gt_per_class[class_id] += 1
|
|
|
|
return number_gt_per_class
|
|
|
|
|
|
def prepare_predictions(predictions: Sequence[Sequence[Sequence[Union[int, float]]]],
|
|
nr_classes: int) -> \
|
|
List[List[Tuple[int, float, int, int, int, int]]]:
|
|
"""
|
|
Prepares the predictions for further processing.
|
|
|
|
Args:
|
|
predictions: list of predictions per image
|
|
nr_classes: number of classes
|
|
|
|
Returns:
|
|
list of predictions per class
|
|
"""
|
|
results = [list() for _ in range(nr_classes + 1)]
|
|
# index positions for bounding box coordinates
|
|
xmin = 2
|
|
ymin = 3
|
|
xmax = 4
|
|
ymax = 5
|
|
|
|
for i, batch_item in enumerate(predictions):
|
|
image_id = i
|
|
|
|
for box in batch_item:
|
|
if len(box) == 7:
|
|
# entropy is in box list
|
|
xmin += 1
|
|
ymin += 1
|
|
xmax += 1
|
|
ymax += 1
|
|
|
|
class_id = int(box[0])
|
|
# Round the box coordinates to reduce the required memory.
|
|
confidence = box[1]
|
|
xmin = round(box[2])
|
|
ymin = round(box[3])
|
|
xmax = round(box[4])
|
|
ymax = round(box[5])
|
|
prediction = (image_id, confidence, xmin, ymin, xmax, ymax)
|
|
# Append the predicted box to the results list for its class.
|
|
results[class_id].append(prediction)
|
|
|
|
return results
|
|
|
|
|
|
def match_predictions(predictions: Sequence[Sequence[Tuple[int, float, int, int, int, int]]],
|
|
labels: Sequence[Sequence[Sequence[int]]],
|
|
iou_func: callable,
|
|
nr_classes: int,
|
|
iou_threshold: float,
|
|
border_pixels: str = "include",
|
|
sorting_algorithm: str = "quicksort") -> Tuple[List[np.ndarray], List[np.ndarray],
|
|
List[np.ndarray], List[np.ndarray],
|
|
np.ndarray, np.ndarray,
|
|
np.ndarray, np.ndarray]:
|
|
"""
|
|
Matches predictions to ground truth boxes.
|
|
|
|
Args:
|
|
predictions: list of predictions
|
|
labels: list of labels per image
|
|
iou_func: function to calculate the intersection over union
|
|
nr_classes: number of classes
|
|
iou_threshold: only matches higher than this value will be considered
|
|
border_pixels: How to treat the border pixels of the bounding boxes.
|
|
Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong
|
|
to the boxes. If 'exclude', the border pixels do not belong to the boxes.
|
|
If 'half', then one of each of the two horizontal and vertical borders belong
|
|
to the boxes, but not the other.
|
|
sorting_algorithm: Which sorting algorithm the matching algorithm should use. This
|
|
argument accepts any valid sorting algorithm for Numpy's `argsort()` function.
|
|
You will usually want to choose between 'quicksort' (fastest and most memory efficient,
|
|
but not stable) and 'mergesort' (slight slower and less memory efficient, but stable).
|
|
The official Matlab evaluation algorithm uses a stable sorting algorithm, so this algorithm
|
|
is only guaranteed to behave identically if you choose 'mergesort' as the sorting algorithm,
|
|
but it will almost always behave identically even if you choose 'quicksort' (but no guarantees).
|
|
|
|
Returns:
|
|
true positives, false positives, cumulative true positives, and cumulative false positives for
|
|
each class, open set error as defined by Miller et al, cumulative open set error,
|
|
cumulative true positives and cumulative false positives over all classes
|
|
"""
|
|
true_positives = [[]] # The false positives for each class, sorted by descending confidence.
|
|
false_positives = [[]] # The true positives for each class, sorted by descending confidence.
|
|
cumulative_true_positives = [[]]
|
|
cumulative_false_positives = [[]]
|
|
most_predictions = -1
|
|
|
|
for class_id in range(1, nr_classes + 1):
|
|
nr_predictions = len(predictions[class_id])
|
|
if nr_predictions > most_predictions:
|
|
most_predictions = nr_predictions
|
|
|
|
open_set_error = np.zeros(most_predictions, dtype=np.int)
|
|
true_positives_micro = np.zeros(most_predictions, dtype=np.int)
|
|
false_positives_micro = np.zeros(most_predictions, dtype=np.int)
|
|
|
|
for class_id in range(1, nr_classes + 1):
|
|
predictions_class = predictions[class_id]
|
|
|
|
# Store the matching results in these lists:
|
|
true_pos = np.zeros(len(predictions_class),
|
|
dtype=np.int) # 1 for every prediction that is a true positive, 0 otherwise
|
|
false_pos = np.zeros(len(predictions_class),
|
|
dtype=np.int) # 1 for every prediction that is a false positive, 0 otherwise
|
|
|
|
# In case there are no predictions at all for this class, we're done here.
|
|
if len(predictions_class) == 0:
|
|
true_positives.append(true_pos)
|
|
false_positives.append(false_pos)
|
|
cumulative_true_pos = np.cumsum(true_pos) # Cumulative sums of the true positives
|
|
cumulative_false_pos = np.cumsum(false_pos) # Cumulative sums of the false positives
|
|
cumulative_true_positives.append(cumulative_true_pos)
|
|
cumulative_false_positives.append(cumulative_false_pos)
|
|
continue
|
|
|
|
# Convert the predictions list for this class into a structured array so that we can sort it by confidence.
|
|
|
|
# Create the data type for the structured array.
|
|
preds_data_type = np.dtype([('image_id', np.int32),
|
|
('confidence', 'f4'),
|
|
('xmin', 'f4'),
|
|
('ymin', 'f4'),
|
|
('xmax', 'f4'),
|
|
('ymax', 'f4')])
|
|
# Create the structured array
|
|
predictions_class = np.array(predictions_class, dtype=preds_data_type)
|
|
# Sort the detections by decreasing confidence.
|
|
descending_indices = np.argsort(-predictions_class['confidence'], kind=sorting_algorithm)
|
|
predictions_sorted = predictions_class[descending_indices]
|
|
|
|
# Keep track of which ground truth boxes were already matched to a detection.
|
|
gt_matched = {}
|
|
|
|
for i in range(len(predictions_class)):
|
|
prediction = predictions_sorted[i]
|
|
image_id = prediction['image_id']
|
|
# Convert the structured array element to a regular array
|
|
pred_box = np.asarray(list(prediction[['xmin', 'ymin', 'xmax', 'ymax']]))
|
|
|
|
# Get the relevant ground truth boxes for this prediction,
|
|
# i.e. all ground truth boxes that match the prediction's
|
|
# image ID and class ID.
|
|
|
|
gt = labels[image_id]
|
|
gt = np.asarray(gt)
|
|
class_mask = gt[:, 0] == class_id
|
|
gt = gt[class_mask]
|
|
|
|
if gt.size == 0:
|
|
# If the image doesn't contain any objects of this class,
|
|
# the prediction becomes a false positive.
|
|
false_pos[i] = 1
|
|
false_positives_micro[i] += 1
|
|
open_set_error[i] += 1
|
|
continue
|
|
|
|
# Compute the IoU of this prediction with all ground truth boxes of the same class.
|
|
overlaps = iou_func(boxes1=gt[:, [1, 2, 3, 4]],
|
|
boxes2=pred_box,
|
|
coords='corners',
|
|
mode='element-wise',
|
|
border_pixels=border_pixels)
|
|
|
|
# For each detection, match the ground truth box with the highest overlap.
|
|
# It's possible that the same ground truth box will be matched to multiple
|
|
# detections.
|
|
gt_match_index = np.argmax(overlaps)
|
|
gt_match_overlap = overlaps[gt_match_index]
|
|
|
|
if gt_match_overlap < iou_threshold:
|
|
# False positive, IoU threshold violated:
|
|
# Those predictions whose matched overlap is below the threshold become
|
|
# false positives.
|
|
false_pos[i] = 1
|
|
false_positives_micro[i] += 1
|
|
else:
|
|
if image_id not in gt_matched:
|
|
# True positive:
|
|
# If the matched ground truth box for this prediction hasn't been matched to a
|
|
# different prediction already, we have a true positive.
|
|
true_pos[i] = 1
|
|
true_positives_micro[i] += 1
|
|
gt_matched[image_id] = np.zeros(shape=(gt.shape[0]), dtype=np.bool)
|
|
gt_matched[image_id][gt_match_index] = True
|
|
elif not gt_matched[image_id][gt_match_index]:
|
|
# True positive:
|
|
# If the matched ground truth box for this prediction hasn't been matched to a
|
|
# different prediction already, we have a true positive.
|
|
true_pos[i] = 1
|
|
true_positives_micro[i] += 1
|
|
gt_matched[image_id][gt_match_index] = True
|
|
else:
|
|
# False positive, duplicate detection:
|
|
# If the matched ground truth box for this prediction has already been matched
|
|
# to a different prediction previously, it is a duplicate detection for an
|
|
# already detected object, which counts as a false positive.
|
|
false_pos[i] = 1
|
|
false_positives_micro[i] += 1
|
|
|
|
true_positives.append(true_pos)
|
|
false_positives.append(false_pos)
|
|
|
|
cumulative_true_pos = np.cumsum(true_pos) # Cumulative sums of the true positives
|
|
cumulative_false_pos = np.cumsum(false_pos) # Cumulative sums of the false positives
|
|
|
|
cumulative_true_positives.append(cumulative_true_pos)
|
|
cumulative_false_positives.append(cumulative_false_pos)
|
|
|
|
cumulative_open_set_error = np.cumsum(open_set_error)
|
|
cumulative_false_positives_micro = np.cumsum(false_positives_micro)
|
|
cumulative_true_positives_micro = np.cumsum(true_positives_micro)
|
|
|
|
return (
|
|
true_positives, false_positives, cumulative_true_positives, cumulative_false_positives,
|
|
open_set_error, cumulative_open_set_error,
|
|
cumulative_true_positives_micro, cumulative_false_positives_micro
|
|
)
|
|
|
|
|
|
def get_precision_recall(number_gt_per_class: np.ndarray,
|
|
cumulative_true_positives: Sequence[np.ndarray],
|
|
cumulative_false_positives: Sequence[np.ndarray],
|
|
cumulative_true_positives_micro: np.ndarray,
|
|
cumulative_false_positives_micro: np.ndarray,
|
|
nr_classes: int) -> Tuple[List[np.ndarray], List[np.ndarray],
|
|
np.ndarray, np.ndarray,
|
|
np.ndarray, np.ndarray]:
|
|
"""
|
|
Computes the precision and recall values and returns them.
|
|
|
|
Args:
|
|
number_gt_per_class: number of ground truth bounding boxes per class
|
|
cumulative_true_positives: cumulative true positives per class
|
|
cumulative_false_positives: cumulative false positives per class
|
|
cumulative_true_positives_micro: cumulative true positives over all classes
|
|
cumulative_false_positives_micro: cumulative false positives over all classes
|
|
nr_classes: number of classes
|
|
|
|
Returns:
|
|
cumulative precisions and cumulative recalls per class,
|
|
micro averaged precision/recall, and
|
|
macro averaged precision/recall
|
|
"""
|
|
cumulative_precisions = [[]]
|
|
cumulative_recalls = [[]]
|
|
cumulative_precision_micro = np.zeros(cumulative_true_positives_micro.shape, dtype=np.float)
|
|
cumulative_recall_micro = np.zeros(cumulative_true_positives_micro.shape, dtype=np.float)
|
|
cumulative_precision_macro = np.zeros_like(cumulative_precision_micro)
|
|
cumulative_recall_macro = np.zeros_like(cumulative_recall_micro)
|
|
total_number_gt = 0
|
|
number_of_nonzero_classes = 0
|
|
|
|
# Iterate over all classes.
|
|
for class_id in range(1, nr_classes + 1):
|
|
|
|
if number_gt_per_class[class_id] == 0:
|
|
cumulative_precisions.append([])
|
|
cumulative_recalls.append([])
|
|
continue
|
|
|
|
tp = cumulative_true_positives[class_id]
|
|
fp = cumulative_false_positives[class_id]
|
|
|
|
cumulative_precision = np.where(tp + fp > 0, tp / (tp + fp), 0) # 1D array with shape `(num_predictions,)`
|
|
number_gt = number_gt_per_class[class_id]
|
|
total_number_gt += number_gt
|
|
cumulative_recall = tp / number_gt # 1D array with shape `(num_predictions,)`
|
|
|
|
cumulative_precisions.append(cumulative_precision)
|
|
cumulative_recalls.append(cumulative_recall)
|
|
|
|
diff_to_largest_class = cumulative_precision_micro.shape[0] - cumulative_precision.shape[0]
|
|
if diff_to_largest_class:
|
|
highest_precision = cumulative_precision[-1] if cumulative_precision.shape[0] else 0
|
|
highest_recall = cumulative_recall[-1] if cumulative_recall.shape[0] else 0
|
|
repeated_last_precision = np.tile(highest_precision, diff_to_largest_class)
|
|
repeated_last_recall = np.tile(highest_recall, diff_to_largest_class)
|
|
extended_precision = np.concatenate((cumulative_precision, repeated_last_precision))
|
|
extended_recall = np.concatenate((cumulative_recall, repeated_last_recall))
|
|
cumulative_precision_macro += extended_precision
|
|
cumulative_recall_macro += extended_recall
|
|
else:
|
|
cumulative_precision_macro += cumulative_precision
|
|
cumulative_recall_macro += cumulative_recall
|
|
|
|
number_of_nonzero_classes += 1
|
|
|
|
# calculate micro averaged precision and recall
|
|
tp = cumulative_true_positives_micro
|
|
fp = cumulative_false_positives_micro
|
|
cumulative_precision_micro = np.where(tp + fp > 0, tp / (tp + fp), 0)
|
|
cumulative_recall_micro = tp / total_number_gt
|
|
|
|
# calculate macro averaged precision and recall
|
|
cumulative_precision_macro /= number_of_nonzero_classes
|
|
cumulative_recall_macro /= number_of_nonzero_classes
|
|
|
|
return (cumulative_precisions, cumulative_recalls,
|
|
cumulative_precision_micro, cumulative_recall_micro,
|
|
cumulative_precision_macro, cumulative_recall_macro
|
|
)
|
|
|
|
|
|
def get_f1_score(cumulative_precisions: List[np.ndarray],
|
|
cumulative_recalls: List[np.ndarray],
|
|
cumulative_precision_micro: np.ndarray,
|
|
cumulative_recall_micro: np.ndarray,
|
|
cumulative_precision_macro: np.ndarray,
|
|
cumulative_recall_macro: np.ndarray,
|
|
nr_classes: int) -> Tuple[List[np.ndarray],
|
|
np.ndarray, np.ndarray]:
|
|
"""
|
|
Computes the F1 score for every class.
|
|
|
|
Args:
|
|
cumulative_precisions: cumulative precisions for each class
|
|
cumulative_recalls: cumulative recalls for each class
|
|
cumulative_precision_micro: cumulative precision micro averaged
|
|
cumulative_recall_micro: cumulative recall micro averaged
|
|
cumulative_precision_macro: cumulative precision macro averaged
|
|
cumulative_recall_macro: cumulative recall macro averaged
|
|
nr_classes: number of classes
|
|
|
|
Returns:
|
|
cumulative F1 score per class,
|
|
cumulative F1 score micro averaged, cumulative F1 score macro averaged
|
|
"""
|
|
cumulative_f1_scores = [[]]
|
|
|
|
# iterate over all classes
|
|
for class_id in range(1, nr_classes + 1):
|
|
cumulative_precision = cumulative_precisions[class_id]
|
|
cumulative_recall = cumulative_recalls[class_id]
|
|
if not np.count_nonzero(cumulative_precision + cumulative_recall):
|
|
cumulative_f1_scores.append([])
|
|
continue
|
|
f1_score = 2 * ((cumulative_precision * cumulative_recall) / (cumulative_precision + cumulative_recall + 0.001))
|
|
cumulative_f1_scores.append(f1_score)
|
|
|
|
f1_score_micro = 2 * ((cumulative_precision_micro * cumulative_recall_micro) /
|
|
(cumulative_precision_micro + cumulative_recall_micro + 0.001))
|
|
f1_score_macro = 2 * ((cumulative_precision_macro * cumulative_recall_macro) /
|
|
(cumulative_precision_macro + cumulative_recall_macro + 0.001))
|
|
|
|
return cumulative_f1_scores, f1_score_micro, f1_score_macro
|
|
|
|
|
|
def get_mean_average_precisions(cumulative_precisions: List[np.ndarray],
|
|
cumulative_recalls: List[np.ndarray],
|
|
nr_classes: int) -> List[float]:
|
|
"""
|
|
Computes the mean average precision for each class and returns them.
|
|
|
|
Args:
|
|
cumulative_precisions: cumulative precisions for each class
|
|
cumulative_recalls: cumulative recalls for each class
|
|
nr_classes: number of classes
|
|
|
|
Returns:
|
|
average precision per class
|
|
"""
|
|
average_precisions = [0.0]
|
|
|
|
# Iterate over all classes.
|
|
for class_id in range(1, nr_classes + 1):
|
|
|
|
cumulative_precision = cumulative_precisions[class_id]
|
|
cumulative_recall = cumulative_recalls[class_id]
|
|
|
|
# We will compute the precision at all unique recall values.
|
|
unique_recalls, unique_recall_indices, unique_recall_counts = np.unique(cumulative_recall,
|
|
return_index=True,
|
|
return_counts=True)
|
|
|
|
# Store the maximal precision for each recall value and the absolute difference
|
|
# between any two unique recall values in the lists below. The products of these
|
|
# two numbers constitute the rectangular areas whose sum will be our numerical
|
|
# integral.
|
|
maximal_precisions = np.zeros_like(unique_recalls)
|
|
recall_deltas = np.zeros_like(unique_recalls)
|
|
|
|
# Iterate over all unique recall values in reverse order. This saves a lot of computation:
|
|
# For each unique recall value `r`, we want to get the maximal precision value obtained
|
|
# for any recall value `r* >= r`. Once we know the maximal precision for the last `k` recall
|
|
# values after a given iteration, then in the next iteration, in order compute the maximal
|
|
# precisions for the last `l > k` recall values, we only need to compute the maximal precision
|
|
# for `l - k` recall values and then take the maximum between that and the previously computed
|
|
# maximum instead of computing the maximum over all `l` values.
|
|
# We skip the very last recall value, since the precision after the last recall value
|
|
# 1.0 is defined to be zero.
|
|
for i in range(len(unique_recalls) - 2, -1, -1):
|
|
begin = unique_recall_indices[i]
|
|
end = unique_recall_indices[i + 1]
|
|
# When computing the maximal precisions, use the maximum of the previous iteration to
|
|
# avoid unnecessary repeated computation over the same precision values.
|
|
# The maximal precisions are the heights of the rectangle areas of our integral under
|
|
# the precision-recall curve.
|
|
maximal_precisions[i] = np.maximum(np.amax(cumulative_precision[begin:end]),
|
|
maximal_precisions[i + 1])
|
|
# The differences between two adjacent recall values are the widths of our rectangle areas.
|
|
recall_deltas[i] = unique_recalls[i + 1] - unique_recalls[i]
|
|
|
|
average_precision = np.sum(maximal_precisions * recall_deltas)
|
|
average_precisions.append(average_precision)
|
|
|
|
return average_precisions
|
|
|
|
|
|
def get_mean_average_precision(average_precisions: List[float]) -> float:
|
|
"""
|
|
Computes the mean average precision over all classes and returns it.
|
|
|
|
Args:
|
|
average_precisions: list of average precisions per class
|
|
|
|
Returns:
|
|
mean average precision over all classes
|
|
"""
|
|
average_precisions = np.copy(average_precisions)
|
|
average_precisions = average_precisions[average_precisions != 0.0]
|
|
return np.average(average_precisions[:])
|