masterthesis/src/twomartens/masterthesis/evaluate.py

# -*- coding: utf-8 -*-
# Copyright 2018 Timon Brüning, Inga Kempfert, Anne Kunstmann, Jim Martens,
# Marius Pierenkemper, Yanneck Reiss
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Functionality to evaluate results of networks.
Functions:
get_number_gt_per_class(...): calculates the number of ground truth boxes per class
prepare-predictions(...): prepares the predictions for further processing
match_predictions(...): matches predictions to ground truth boxes
get_precision_recall(...): computes the precision and recall values and returns them
get_f1_score(...): computes the F1 score for every class
get_mean_average_precisions(...): computes the mean average precision for each class and returns them
get_mean_average_precision(...): computes the mean average precision over all classes and returns it
"""
from typing import Sequence, Union, Tuple, List
import numpy as np
def get_number_gt_per_class(labels: Sequence[Sequence[Sequence[int]]],
nr_classes: int) -> np.ndarray:
"""
    Calculates the number of ground truth boxes per class and returns the result.
Args:
labels: list of labels per image
nr_classes: number of classes
Returns:
numpy array with respective counts
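
    Example (illustrative sketch; assumes the label format
    [class_id, xmin, ymin, xmax, ymax] that the rest of this module expects):
        >>> labels = [[[1, 10, 10, 50, 50], [2, 5, 5, 20, 20]],
        ...           [[1, 0, 0, 30, 30]]]
        >>> get_number_gt_per_class(labels, nr_classes=2)
        array([0, 2, 1])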
"""
    number_gt_per_class = np.zeros(shape=(nr_classes + 1), dtype=int)
label_range = range(len(labels))
# iterate over images
for i in label_range:
boxes = np.asarray(labels[i])
# iterate over boxes in image
for j in range(boxes.shape[0]):
class_id = int(boxes[j, 0])
if class_id > nr_classes:
continue
number_gt_per_class[class_id] += 1
return number_gt_per_class
def prepare_predictions(predictions: Sequence[Sequence[Sequence[Union[int, float]]]],
nr_classes: int) -> \
List[List[Tuple[int, float, int, int, int, int]]]:
"""
Prepares the predictions for further processing.
Args:
predictions: list of predictions per image
nr_classes: number of classes
Returns:
list of predictions per class
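
    Example (illustrative sketch; assumes the prediction format
    [class_id, confidence, xmin, ymin, xmax, ymax] per box):
        >>> preds = [[[1, 0.9, 10.2, 10.7, 50.1, 49.6]], []]
        >>> prepare_predictions(preds, nr_classes=1)
        [[], [(0, 0.9, 10, 11, 50, 50)]]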
"""
results = [list() for _ in range(nr_classes + 1)]
    for i, batch_item in enumerate(predictions):
        image_id = i
        for box in batch_item:
            # If the box additionally carries an entropy value, the bounding box
            # coordinates are shifted by one index position (3-6 instead of 2-5).
            offset = 1 if len(box) == 7 else 0
            class_id = int(box[0])
            confidence = box[1]
            # Round the box coordinates to reduce the required memory.
            xmin = round(box[2 + offset])
            ymin = round(box[3 + offset])
            xmax = round(box[4 + offset])
            ymax = round(box[5 + offset])
            prediction = (image_id, confidence, xmin, ymin, xmax, ymax)
            # Append the predicted box to the results list for its class.
            results[class_id].append(prediction)
    return results
def match_predictions(predictions: Sequence[Sequence[Tuple[int, float, int, int, int, int]]],
labels: Sequence[Sequence[Sequence[int]]],
iou_func: callable,
nr_classes: int,
iou_threshold: float,
border_pixels: str = "include",
sorting_algorithm: str = "quicksort") -> Tuple[List[np.ndarray], List[np.ndarray],
List[np.ndarray], List[np.ndarray],
np.ndarray, np.ndarray,
np.ndarray, np.ndarray]:
"""
Matches predictions to ground truth boxes.
Args:
predictions: list of predictions
labels: list of labels per image
iou_func: function to calculate the intersection over union
nr_classes: number of classes
iou_threshold: only matches higher than this value will be considered
border_pixels: How to treat the border pixels of the bounding boxes.
Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong
to the boxes. If 'exclude', the border pixels do not belong to the boxes.
If 'half', then one of each of the two horizontal and vertical borders belong
to the boxes, but not the other.
sorting_algorithm: Which sorting algorithm the matching algorithm should use. This
argument accepts any valid sorting algorithm for Numpy's `argsort()` function.
You will usually want to choose between 'quicksort' (fastest and most memory efficient,
            but not stable) and 'mergesort' (slightly slower and less memory efficient, but stable).
The official Matlab evaluation algorithm uses a stable sorting algorithm, so this algorithm
is only guaranteed to behave identically if you choose 'mergesort' as the sorting algorithm,
but it will almost always behave identically even if you choose 'quicksort' (but no guarantees).
Returns:
true positives, false positives, cumulative true positives, and cumulative false positives for
        each class, open set error as defined by Miller et al., cumulative open set error,
cumulative true positives and cumulative false positives over all classes
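
    Example (illustrative sketch; iou is a minimal, hypothetical IoU implementation
    that only honours the keyword interface used by this function and ignores
    coords, mode, and border_pixels):
        >>> def iou(boxes1, boxes2, coords, mode, border_pixels):
        ...     x1 = np.maximum(boxes1[:, 0], boxes2[0])
        ...     y1 = np.maximum(boxes1[:, 1], boxes2[1])
        ...     x2 = np.minimum(boxes1[:, 2], boxes2[2])
        ...     y2 = np.minimum(boxes1[:, 3], boxes2[3])
        ...     intersection = np.clip(x2 - x1, 0, None) * np.clip(y2 - y1, 0, None)
        ...     union = ((boxes1[:, 2] - boxes1[:, 0]) * (boxes1[:, 3] - boxes1[:, 1])
        ...              + (boxes2[2] - boxes2[0]) * (boxes2[3] - boxes2[1]) - intersection)
        ...     return intersection / union
        >>> labels = [[[1, 0, 0, 100, 100]]]
        >>> preds = prepare_predictions([[[1, 0.9, 0, 0, 100, 100]]], nr_classes=1)
        >>> tp, fp, *_ = match_predictions(preds, labels, iou,
        ...                                nr_classes=1, iou_threshold=0.5)
        >>> int(tp[1][0]), int(fp[1][0])
        (1, 0)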
"""
    true_positives = [[]]  # The true positives for each class, sorted by descending confidence.
    false_positives = [[]]  # The false positives for each class, sorted by descending confidence.
cumulative_true_positives = [[]]
cumulative_false_positives = [[]]
most_predictions = -1
for class_id in range(1, nr_classes + 1):
nr_predictions = len(predictions[class_id])
if nr_predictions > most_predictions:
most_predictions = nr_predictions
    open_set_error = np.zeros(most_predictions, dtype=int)
    true_positives_micro = np.zeros(most_predictions, dtype=int)
    false_positives_micro = np.zeros(most_predictions, dtype=int)
for class_id in range(1, nr_classes + 1):
predictions_class = predictions[class_id]
# Store the matching results in these lists:
        true_pos = np.zeros(len(predictions_class),
                            dtype=int)  # 1 for every prediction that is a true positive, 0 otherwise
        false_pos = np.zeros(len(predictions_class),
                             dtype=int)  # 1 for every prediction that is a false positive, 0 otherwise
# In case there are no predictions at all for this class, we're done here.
if len(predictions_class) == 0:
true_positives.append(true_pos)
false_positives.append(false_pos)
cumulative_true_pos = np.cumsum(true_pos) # Cumulative sums of the true positives
cumulative_false_pos = np.cumsum(false_pos) # Cumulative sums of the false positives
cumulative_true_positives.append(cumulative_true_pos)
cumulative_false_positives.append(cumulative_false_pos)
continue
# Convert the predictions list for this class into a structured array so that we can sort it by confidence.
# Create the data type for the structured array.
preds_data_type = np.dtype([('image_id', np.int32),
('confidence', 'f4'),
('xmin', 'f4'),
('ymin', 'f4'),
('xmax', 'f4'),
('ymax', 'f4')])
# Create the structured array
predictions_class = np.array(predictions_class, dtype=preds_data_type)
# Sort the detections by decreasing confidence.
descending_indices = np.argsort(-predictions_class['confidence'], kind=sorting_algorithm)
predictions_sorted = predictions_class[descending_indices]
# Keep track of which ground truth boxes were already matched to a detection.
gt_matched = {}
for i in range(len(predictions_class)):
prediction = predictions_sorted[i]
image_id = prediction['image_id']
# Convert the structured array element to a regular array
pred_box = np.asarray(list(prediction[['xmin', 'ymin', 'xmax', 'ymax']]))
# Get the relevant ground truth boxes for this prediction,
# i.e. all ground truth boxes that match the prediction's
# image ID and class ID.
gt = labels[image_id]
gt = np.asarray(gt)
class_mask = gt[:, 0] == class_id
gt = gt[class_mask]
if gt.size == 0:
# If the image doesn't contain any objects of this class,
# the prediction becomes a false positive.
false_pos[i] = 1
false_positives_micro[i] += 1
open_set_error[i] += 1
continue
# Compute the IoU of this prediction with all ground truth boxes of the same class.
overlaps = iou_func(boxes1=gt[:, [1, 2, 3, 4]],
boxes2=pred_box,
coords='corners',
mode='element-wise',
border_pixels=border_pixels)
# For each detection, match the ground truth box with the highest overlap.
# It's possible that the same ground truth box will be matched to multiple
# detections.
gt_match_index = np.argmax(overlaps)
gt_match_overlap = overlaps[gt_match_index]
if gt_match_overlap < iou_threshold:
# False positive, IoU threshold violated:
# Those predictions whose matched overlap is below the threshold become
# false positives.
false_pos[i] = 1
false_positives_micro[i] += 1
else:
if image_id not in gt_matched:
# True positive:
# If the matched ground truth box for this prediction hasn't been matched to a
# different prediction already, we have a true positive.
true_pos[i] = 1
true_positives_micro[i] += 1
                    gt_matched[image_id] = np.zeros(shape=(gt.shape[0]), dtype=bool)
gt_matched[image_id][gt_match_index] = True
elif not gt_matched[image_id][gt_match_index]:
# True positive:
# If the matched ground truth box for this prediction hasn't been matched to a
# different prediction already, we have a true positive.
true_pos[i] = 1
true_positives_micro[i] += 1
gt_matched[image_id][gt_match_index] = True
else:
# False positive, duplicate detection:
# If the matched ground truth box for this prediction has already been matched
# to a different prediction previously, it is a duplicate detection for an
# already detected object, which counts as a false positive.
false_pos[i] = 1
false_positives_micro[i] += 1
true_positives.append(true_pos)
false_positives.append(false_pos)
cumulative_true_pos = np.cumsum(true_pos) # Cumulative sums of the true positives
cumulative_false_pos = np.cumsum(false_pos) # Cumulative sums of the false positives
cumulative_true_positives.append(cumulative_true_pos)
cumulative_false_positives.append(cumulative_false_pos)
cumulative_open_set_error = np.cumsum(open_set_error)
cumulative_false_positives_micro = np.cumsum(false_positives_micro)
cumulative_true_positives_micro = np.cumsum(true_positives_micro)
return (
true_positives, false_positives, cumulative_true_positives, cumulative_false_positives,
open_set_error, cumulative_open_set_error,
cumulative_true_positives_micro, cumulative_false_positives_micro
)
def get_precision_recall(number_gt_per_class: np.ndarray,
cumulative_true_positives: Sequence[np.ndarray],
cumulative_false_positives: Sequence[np.ndarray],
cumulative_true_positives_micro: np.ndarray,
cumulative_false_positives_micro: np.ndarray,
nr_classes: int) -> Tuple[List[np.ndarray], List[np.ndarray],
np.ndarray, np.ndarray,
np.ndarray, np.ndarray]:
"""
Computes the precision and recall values and returns them.
Args:
number_gt_per_class: number of ground truth bounding boxes per class
cumulative_true_positives: cumulative true positives per class
cumulative_false_positives: cumulative false positives per class
cumulative_true_positives_micro: cumulative true positives over all classes
cumulative_false_positives_micro: cumulative false positives over all classes
nr_classes: number of classes
Returns:
cumulative precisions and cumulative recalls per class,
micro averaged precision/recall, and
macro averaged precision/recall
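
    Example (illustrative sketch for a single foreground class with three predictions,
    two of which are true positives):
        >>> n_gt = np.array([0, 2])
        >>> cum_tp = [[], np.array([1, 1, 2])]
        >>> cum_fp = [[], np.array([0, 1, 1])]
        >>> prec, rec, *_ = get_precision_recall(n_gt, cum_tp, cum_fp,
        ...                                      np.array([1, 1, 2]), np.array([0, 1, 1]),
        ...                                      nr_classes=1)
        >>> np.round(prec[1], 3).tolist()
        [1.0, 0.5, 0.667]
        >>> rec[1].tolist()
        [0.5, 0.5, 1.0]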
"""
cumulative_precisions = [[]]
cumulative_recalls = [[]]
    cumulative_precision_micro = np.zeros(cumulative_true_positives_micro.shape, dtype=float)
    cumulative_recall_micro = np.zeros(cumulative_true_positives_micro.shape, dtype=float)
cumulative_precision_macro = np.zeros_like(cumulative_precision_micro)
cumulative_recall_macro = np.zeros_like(cumulative_recall_micro)
total_number_gt = 0
number_of_nonzero_classes = 0
# Iterate over all classes.
for class_id in range(1, nr_classes + 1):
if number_gt_per_class[class_id] == 0:
cumulative_precisions.append([])
cumulative_recalls.append([])
continue
tp = cumulative_true_positives[class_id]
fp = cumulative_false_positives[class_id]
cumulative_precision = np.where(tp + fp > 0, tp / (tp + fp), 0) # 1D array with shape `(num_predictions,)`
number_gt = number_gt_per_class[class_id]
total_number_gt += number_gt
cumulative_recall = tp / number_gt # 1D array with shape `(num_predictions,)`
cumulative_precisions.append(cumulative_precision)
cumulative_recalls.append(cumulative_recall)
diff_to_largest_class = cumulative_precision_micro.shape[0] - cumulative_precision.shape[0]
if diff_to_largest_class:
highest_precision = cumulative_precision[-1] if cumulative_precision.shape[0] else 0
highest_recall = cumulative_recall[-1] if cumulative_recall.shape[0] else 0
repeated_last_precision = np.tile(highest_precision, diff_to_largest_class)
repeated_last_recall = np.tile(highest_recall, diff_to_largest_class)
extended_precision = np.concatenate((cumulative_precision, repeated_last_precision))
extended_recall = np.concatenate((cumulative_recall, repeated_last_recall))
cumulative_precision_macro += extended_precision
cumulative_recall_macro += extended_recall
else:
cumulative_precision_macro += cumulative_precision
cumulative_recall_macro += cumulative_recall
number_of_nonzero_classes += 1
# calculate micro averaged precision and recall
tp = cumulative_true_positives_micro
fp = cumulative_false_positives_micro
cumulative_precision_micro = np.where(tp + fp > 0, tp / (tp + fp), 0)
cumulative_recall_micro = tp / total_number_gt
# calculate macro averaged precision and recall
cumulative_precision_macro /= number_of_nonzero_classes
cumulative_recall_macro /= number_of_nonzero_classes
return (cumulative_precisions, cumulative_recalls,
cumulative_precision_micro, cumulative_recall_micro,
cumulative_precision_macro, cumulative_recall_macro
)
def get_f1_score(cumulative_precisions: List[np.ndarray],
cumulative_recalls: List[np.ndarray],
cumulative_precision_micro: np.ndarray,
cumulative_recall_micro: np.ndarray,
cumulative_precision_macro: np.ndarray,
cumulative_recall_macro: np.ndarray,
nr_classes: int) -> Tuple[List[np.ndarray],
np.ndarray, np.ndarray]:
"""
Computes the F1 score for every class.
Args:
cumulative_precisions: cumulative precisions for each class
cumulative_recalls: cumulative recalls for each class
cumulative_precision_micro: cumulative precision micro averaged
cumulative_recall_micro: cumulative recall micro averaged
cumulative_precision_macro: cumulative precision macro averaged
cumulative_recall_macro: cumulative recall macro averaged
nr_classes: number of classes
Returns:
cumulative F1 score per class,
cumulative F1 score micro averaged, cumulative F1 score macro averaged
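
    Example (illustrative sketch; the small 0.001 term in the denominator keeps the
    values slightly below the exact F1 score of 2/3):
        >>> prec = [[], np.array([1.0, 0.5])]
        >>> rec = [[], np.array([0.5, 1.0])]
        >>> f1, f1_micro, f1_macro = get_f1_score(prec, rec,
        ...                                       np.array([1.0, 0.5]), np.array([0.5, 1.0]),
        ...                                       np.array([1.0, 0.5]), np.array([0.5, 1.0]),
        ...                                       nr_classes=1)
        >>> np.round(f1[1], 3).tolist()
        [0.666, 0.666]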
"""
cumulative_f1_scores = [[]]
# iterate over all classes
for class_id in range(1, nr_classes + 1):
cumulative_precision = cumulative_precisions[class_id]
cumulative_recall = cumulative_recalls[class_id]
if not np.count_nonzero(cumulative_precision + cumulative_recall):
cumulative_f1_scores.append([])
continue
f1_score = 2 * ((cumulative_precision * cumulative_recall) / (cumulative_precision + cumulative_recall + 0.001))
cumulative_f1_scores.append(f1_score)
f1_score_micro = 2 * ((cumulative_precision_micro * cumulative_recall_micro) /
(cumulative_precision_micro + cumulative_recall_micro + 0.001))
f1_score_macro = 2 * ((cumulative_precision_macro * cumulative_recall_macro) /
(cumulative_precision_macro + cumulative_recall_macro + 0.001))
return cumulative_f1_scores, f1_score_micro, f1_score_macro
def get_mean_average_precisions(cumulative_precisions: List[np.ndarray],
cumulative_recalls: List[np.ndarray],
nr_classes: int) -> List[float]:
"""
    Computes the average precision for each class and returns them.
Args:
cumulative_precisions: cumulative precisions for each class
cumulative_recalls: cumulative recalls for each class
nr_classes: number of classes
Returns:
average precision per class
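
    Example (illustrative sketch, reusing the cumulative precision/recall values
    from the get_precision_recall example):
        >>> prec = [[], np.array([1.0, 0.5, 0.667])]
        >>> rec = [[], np.array([0.5, 0.5, 1.0])]
        >>> aps = get_mean_average_precisions(prec, rec, nr_classes=1)
        >>> float(aps[1])
        0.5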
"""
average_precisions = [0.0]
# Iterate over all classes.
for class_id in range(1, nr_classes + 1):
cumulative_precision = cumulative_precisions[class_id]
cumulative_recall = cumulative_recalls[class_id]
# We will compute the precision at all unique recall values.
unique_recalls, unique_recall_indices, unique_recall_counts = np.unique(cumulative_recall,
return_index=True,
return_counts=True)
# Store the maximal precision for each recall value and the absolute difference
# between any two unique recall values in the lists below. The products of these
# two numbers constitute the rectangular areas whose sum will be our numerical
# integral.
maximal_precisions = np.zeros_like(unique_recalls)
recall_deltas = np.zeros_like(unique_recalls)
# Iterate over all unique recall values in reverse order. This saves a lot of computation:
# For each unique recall value `r`, we want to get the maximal precision value obtained
# for any recall value `r* >= r`. Once we know the maximal precision for the last `k` recall
        # values after a given iteration, then in the next iteration, in order to compute the maximal
# precisions for the last `l > k` recall values, we only need to compute the maximal precision
# for `l - k` recall values and then take the maximum between that and the previously computed
# maximum instead of computing the maximum over all `l` values.
# We skip the very last recall value, since the precision after the last recall value
# 1.0 is defined to be zero.
for i in range(len(unique_recalls) - 2, -1, -1):
begin = unique_recall_indices[i]
end = unique_recall_indices[i + 1]
# When computing the maximal precisions, use the maximum of the previous iteration to
# avoid unnecessary repeated computation over the same precision values.
# The maximal precisions are the heights of the rectangle areas of our integral under
# the precision-recall curve.
maximal_precisions[i] = np.maximum(np.amax(cumulative_precision[begin:end]),
maximal_precisions[i + 1])
# The differences between two adjacent recall values are the widths of our rectangle areas.
recall_deltas[i] = unique_recalls[i + 1] - unique_recalls[i]
average_precision = np.sum(maximal_precisions * recall_deltas)
average_precisions.append(average_precision)
return average_precisions
def get_mean_average_precision(average_precisions: List[float]) -> float:
"""
Computes the mean average precision over all classes and returns it.
Args:
average_precisions: list of average precisions per class
Returns:
mean average precision over all classes
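
    Example (illustrative sketch; the leading 0.0 is the background placeholder
    produced by get_mean_average_precisions and is ignored here):
        >>> get_mean_average_precision([0.0, 0.5, 0.75])
        0.625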
"""
    average_precisions = np.copy(average_precisions)
    # Ignore entries that are exactly 0.0, i.e. the background placeholder and
    # classes without any area under the precision-recall curve.
    average_precisions = average_precisions[average_precisions != 0.0]
    return float(np.average(average_precisions))