Implemented micro and macro averaged metrics
Signed-off-by: Jim Martens <github@2martens.de>
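
For readers unfamiliar with the two averaging schemes this commit adds: micro averaging pools true/false positive counts across all classes before computing precision and recall, while macro averaging computes the metric per class and then averages the per-class values. A minimal NumPy sketch with made-up counts (illustrative only, not code from this repository):

```python
import numpy as np

# Final true/false positive counts for three hypothetical classes.
tp = np.array([80, 10, 5])
fp = np.array([20, 40, 5])

# Macro average: precision per class first, then the mean of those values.
per_class_precision = tp / (tp + fp)           # [0.8, 0.2, 0.5]
macro_precision = per_class_precision.mean()   # 0.5

# Micro average: pool all counts first, then compute a single precision.
micro_precision = tp.sum() / (tp.sum() + fp.sum())  # 95 / 160 = 0.59375

print(macro_precision, micro_precision)
```

Micro averaging weights every detection equally (large classes dominate); macro averaging weights every class equally (rare classes count as much as common ones).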
@@ -312,16 +312,25 @@ def _ssd_evaluate(args: argparse.Namespace) -> None:
     true_positives, false_positives, \
     cum_true_positives, cum_false_positives, \
-    open_set_error, cumulative_open_set_error = evaluate.match_predictions(predictions_per_class, labels,
-                                                                           bounding_box_utils.iou,
-                                                                           nr_classes, iou_threshold)
+    open_set_error, cumulative_open_set_error, \
+    cum_true_positives_overall, cum_false_positives_overall = evaluate.match_predictions(predictions_per_class,
+                                                                                         labels,
+                                                                                         bounding_box_utils.iou,
+                                                                                         nr_classes, iou_threshold)
 
-    cum_precisions, cum_recalls = evaluate.get_precision_recall(number_gt_per_class,
-                                                                cum_true_positives,
-                                                                cum_false_positives,
-                                                                nr_classes)
+    cum_precisions, cum_recalls, \
+    cum_precisions_micro, cum_recalls_micro, \
+    cum_precisions_macro, cum_recalls_macro = evaluate.get_precision_recall(number_gt_per_class,
+                                                                            cum_true_positives,
+                                                                            cum_false_positives,
+                                                                            cum_true_positives_overall,
+                                                                            cum_false_positives_overall,
+                                                                            nr_classes)
 
-    f1_scores = evaluate.get_f1_score(cum_precisions, cum_recalls, nr_classes)
+    f1_scores, f1_scores_micro, f1_scores_macro = evaluate.get_f1_score(cum_precisions, cum_recalls,
+                                                                        cum_precisions_micro, cum_recalls_micro,
+                                                                        cum_precisions_macro, cum_recalls_macro,
+                                                                        nr_classes)
     average_precisions = evaluate.get_mean_average_precisions(cum_precisions, cum_recalls, nr_classes)
     mean_average_precision = evaluate.get_mean_average_precision(average_precisions)
 
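The matching step above is driven by `bounding_box_utils.iou` and `iou_threshold`. As a rough standalone sketch of the quantity being thresholded (the project's actual `bounding_box_utils.iou` is assumed to be batched and more general):

```python
import numpy as np

# Minimal IoU sketch for two boxes in (xmin, ymin, xmax, ymax) format.
def iou(box_a: np.ndarray, box_b: np.ndarray) -> float:
    inter_w = max(0.0, min(box_a[2], box_b[2]) - max(box_a[0], box_b[0]))
    inter_h = max(0.0, min(box_a[3], box_b[3]) - max(box_a[1], box_b[1]))
    inter = inter_w * inter_h
    area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
    area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
    return inter / (area_a + area_b - inter)

print(iou(np.array([0, 0, 2, 2]), np.array([1, 1, 3, 3])))  # 1/7 = 0.142...
```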
@@ -329,9 +338,17 @@ def _ssd_evaluate(args: argparse.Namespace) -> None:
                                          false_positives,
                                          cum_true_positives,
                                          cum_false_positives,
+                                         cum_true_positives_overall,
+                                         cum_false_positives_overall,
                                          cum_precisions,
                                          cum_recalls,
+                                         cum_precisions_micro,
+                                         cum_recalls_micro,
+                                         cum_precisions_macro,
+                                         cum_recalls_macro,
                                          f1_scores,
+                                         f1_scores_micro,
+                                         f1_scores_macro,
                                          average_precisions,
                                          mean_average_precision,
                                          open_set_error,
@@ -868,9 +885,17 @@ def _ssd_evaluate_get_results(true_positives: Sequence[np.ndarray],
                               false_positives: Sequence[np.ndarray],
                               cum_true_positives: Sequence[np.ndarray],
                               cum_false_positives: Sequence[np.ndarray],
+                              cum_true_positives_micro: np.ndarray,
+                              cum_false_positives_micro: np.ndarray,
                               cum_precisions: Sequence[np.ndarray],
                               cum_recalls: Sequence[np.ndarray],
+                              cum_precision_micro: np.ndarray,
+                              cum_recall_micro: np.ndarray,
+                              cum_precision_macro: np.ndarray,
+                              cum_recall_macro: np.ndarray,
                               f1_scores: Sequence[np.ndarray],
+                              f1_scores_micro: np.ndarray,
+                              f1_scores_macro: np.ndarray,
                               average_precisions: Sequence[float],
                               mean_average_precision: float,
                               open_set_error: np.ndarray,
@@ -881,9 +906,17 @@ def _ssd_evaluate_get_results(true_positives: Sequence[np.ndarray],
         "false_positives": false_positives,
         "cumulative_true_positives": cum_true_positives,
         "cumulative_false_positives": cum_false_positives,
+        "cumulative_true_positives_micro": cum_true_positives_micro,
+        "cumulative_false_positives_micro": cum_false_positives_micro,
         "cumulative_precisions": cum_precisions,
         "cumulative_recalls": cum_recalls,
+        "cumulative_precision_micro": cum_precision_micro,
+        "cumulative_recall_micro": cum_recall_micro,
+        "cumulative_precision_macro": cum_precision_macro,
+        "cumulative_recall_macro": cum_recall_macro,
         "f1_scores": f1_scores,
+        "f1_scores_micro": f1_scores_micro,
+        "f1_scores_macro": f1_scores_macro,
         "mean_average_precisions": average_precisions,
         "mean_average_precision": mean_average_precision,
         "open_set_error": open_set_error,
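A downstream consumer could read the final micro-averaged operating point out of the dict assembled above. A small sketch with placeholder arrays (the key names come from this diff; the values are made up):

```python
import numpy as np

# Hypothetical stand-in for the dict built by _ssd_evaluate_get_results.
results = {
    "cumulative_precision_micro": np.array([1.0, 0.5, 0.66]),
    "cumulative_recall_micro": np.array([0.1, 0.1, 0.2]),
    "f1_scores_micro": np.array([0.18, 0.17, 0.31]),
}

# The last element of each cumulative array covers all detections, i.e. the
# operating point at the lowest confidence threshold.
print(results["cumulative_precision_micro"][-1])  # 0.66
print(results["f1_scores_micro"][-1])             # 0.31
```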
@@ -101,6 +101,7 @@ def match_predictions(predictions: Sequence[Sequence[Tuple[int, float, float, in
                       border_pixels: str = "include",
                       sorting_algorithm: str = "quicksort") -> Tuple[List[np.ndarray], List[np.ndarray],
                                                                      List[np.ndarray], List[np.ndarray],
+                                                                     np.ndarray, np.ndarray,
                                                                      np.ndarray, np.ndarray]:
     """
     Matches predictions to ground truth boxes.
@@ -126,7 +127,8 @@ def match_predictions(predictions: Sequence[Sequence[Tuple[int, float, float, in
 
     Returns:
         true positives, false positives, cumulative true positives, and cumulative false positives for
-        each class, open set error as defined by Miller et al, cumulative open set error
+        each class, open set error as defined by Miller et al, cumulative open set error,
+        cumulative true positives and cumulative false positives over all classes
     """
     true_positives = [[]]  # The true positives for each class, sorted by descending confidence.
     false_positives = [[]]  # The false positives for each class, sorted by descending confidence.
@@ -140,7 +142,9 @@ def match_predictions(predictions: Sequence[Sequence[Tuple[int, float, float, in
             most_predictions = nr_predictions
 
     open_set_error = np.zeros(most_predictions, dtype=np.int)
+    true_positives_micro = np.zeros(most_predictions, dtype=np.int)
+    false_positives_micro = np.zeros(most_predictions, dtype=np.int)
 
     for class_id in range(1, nr_classes + 1):
         predictions_class = predictions[class_id]
 
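The two `_micro` arrays added here count hits and misses per detection rank across all classes; further down they are turned into cumulative curves with `np.cumsum`. A toy illustration of that cumulative counting (made-up detections):

```python
import numpy as np

# Detections sorted by descending confidence; 1 marks a correct detection.
true_pos = np.array([1, 0, 1, 1, 0])
false_pos = 1 - true_pos

cum_tp = np.cumsum(true_pos)    # [1, 1, 2, 3, 3]
cum_fp = np.cumsum(false_pos)   # [0, 1, 1, 1, 2]

# Precision after the first k detections, one value per rank:
print(cum_tp / (cum_tp + cum_fp))  # [1.0, 0.5, 0.667, 0.75, 0.6]
```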
@@ -198,6 +202,7 @@ def match_predictions(predictions: Sequence[Sequence[Tuple[int, float, float, in
                 # If the image doesn't contain any objects of this class,
                 # the prediction becomes a false positive.
                 false_pos[i] = 1
+                false_positives_micro[i] += 1
                 open_set_error[i] += 1
                 continue
 
@@ -219,12 +224,14 @@ def match_predictions(predictions: Sequence[Sequence[Tuple[int, float, float, in
                 # Those predictions whose matched overlap is below the threshold become
                 # false positives.
                 false_pos[i] = 1
+                false_positives_micro[i] += 1
             else:
                 if image_id not in gt_matched:
                     # True positive:
                     # If the matched ground truth box for this prediction hasn't been matched to a
                     # different prediction already, we have a true positive.
                     true_pos[i] = 1
+                    true_positives_micro[i] += 1
                     gt_matched[image_id] = np.zeros(shape=(gt.shape[0]), dtype=np.bool)
                     gt_matched[image_id][gt_match_index] = True
                 elif not gt_matched[image_id][gt_match_index]:
@@ -232,6 +239,7 @@ def match_predictions(predictions: Sequence[Sequence[Tuple[int, float, float, in
                     # If the matched ground truth box for this prediction hasn't been matched to a
                     # different prediction already, we have a true positive.
                     true_pos[i] = 1
+                    true_positives_micro[i] += 1
                     gt_matched[image_id][gt_match_index] = True
                 else:
                     # False positive, duplicate detection:
@@ -239,6 +247,7 @@ def match_predictions(predictions: Sequence[Sequence[Tuple[int, float, float, in
                     # to a different prediction previously, it is a duplicate detection for an
                     # already detected object, which counts as a false positive.
                     false_pos[i] = 1
+                    false_positives_micro[i] += 1
 
         true_positives.append(true_pos)
         false_positives.append(false_pos)
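The `gt_matched` bookkeeping above credits each ground truth box at most once; any later detection of an already matched box counts as a false positive. A toy version of that rule (hypothetical indices, not the repository's data):

```python
import numpy as np

# One flag per ground truth box; once a detection claims a box,
# later detections of the same box become duplicates.
gt_matched = np.zeros(3, dtype=bool)

best_gt_per_detection = [0, 0, 2]  # best-overlap gt index per detection
for det, gt_idx in enumerate(best_gt_per_detection):
    if not gt_matched[gt_idx]:
        gt_matched[gt_idx] = True
        print(f"detection {det}: true positive")
    else:
        print(f"detection {det}: duplicate -> false positive")
```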
@@ -250,17 +259,24 @@ def match_predictions(predictions: Sequence[Sequence[Tuple[int, float, float, in
         cumulative_false_positives.append(cumulative_false_pos)
 
     cumulative_open_set_error = np.cumsum(open_set_error)
+    cumulative_false_positives_micro = np.cumsum(false_positives_micro)
+    cumulative_true_positives_micro = np.cumsum(true_positives_micro)
 
     return (
        true_positives, false_positives, cumulative_true_positives, cumulative_false_positives,
-       open_set_error, cumulative_open_set_error
+       open_set_error, cumulative_open_set_error,
+       cumulative_true_positives_micro, cumulative_false_positives_micro
    )
 
 
 def get_precision_recall(number_gt_per_class: np.ndarray,
                          cumulative_true_positives: Sequence[np.ndarray],
                          cumulative_false_positives: Sequence[np.ndarray],
-                         nr_classes: int) -> Tuple[List[np.ndarray], List[np.ndarray]]:
+                         cumulative_true_positives_micro: np.ndarray,
+                         cumulative_false_positives_micro: np.ndarray,
+                         nr_classes: int) -> Tuple[List[np.ndarray], List[np.ndarray],
+                                                   np.ndarray, np.ndarray,
+                                                   np.ndarray, np.ndarray]:
     """
     Computes the precision and recall values and returns them.
 
@@ -268,13 +284,23 @@ def get_precision_recall(number_gt_per_class: np.ndarray,
         number_gt_per_class: number of ground truth bounding boxes per class
         cumulative_true_positives: cumulative true positives per class
         cumulative_false_positives: cumulative false positives per class
+        cumulative_true_positives_micro: cumulative true positives over all classes
+        cumulative_false_positives_micro: cumulative false positives over all classes
         nr_classes: number of classes
 
     Returns:
-        cumulative precisions and cumulative recalls per class
+        cumulative precisions and cumulative recalls per class,
+        micro averaged precision/recall, and
+        macro averaged precision/recall
     """
     cumulative_precisions = [[]]
     cumulative_recalls = [[]]
+    cumulative_precision_micro = np.zeros_like(cumulative_true_positives_micro)
+    cumulative_recall_micro = np.zeros_like(cumulative_true_positives_micro)
+    cumulative_precision_macro = np.zeros_like(cumulative_precision_micro)
+    cumulative_recall_macro = np.zeros_like(cumulative_recall_micro)
+    total_number_gt = 0
+    number_of_nonzero_classes = 0
 
     # Iterate over all classes.
     for class_id in range(1, nr_classes + 1):
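One caveat worth noting about the initialization above (an observation, not part of the commit): `np.zeros_like` inherits the integer dtype of the cumulative count arrays, so in-place addition of float precision values into these buffers can truncate or raise, depending on the NumPy version. An explicit float accumulator avoids that:

```python
import numpy as np

counts = np.cumsum(np.array([1, 0, 1]))    # int dtype, like the cumsum output above
acc = np.zeros_like(counts, dtype=float)   # explicit float accumulator

acc += np.array([1.0, 0.5, 0.66])          # safe in-place addition
print(acc)
```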
@@ -288,27 +314,66 @@ def get_precision_recall(number_gt_per_class: np.ndarray,
         fp = cumulative_false_positives[class_id]
 
         cumulative_precision = np.where(tp + fp > 0, tp / (tp + fp), 0)  # 1D array with shape `(num_predictions,)`
-        cumulative_recall = tp / number_gt_per_class[class_id]  # 1D array with shape `(num_predictions,)`
+        number_gt = number_gt_per_class[class_id]
+        total_number_gt += number_gt
+        cumulative_recall = tp / number_gt  # 1D array with shape `(num_predictions,)`
 
         cumulative_precisions.append(cumulative_precision)
         cumulative_recalls.append(cumulative_recall)
 
+        diff_to_largest_class = cumulative_precision_micro.shape[0] - cumulative_precision.shape[0]
+        if diff_to_largest_class:
+            repeated_last_precision = np.tile(cumulative_precision[-1], diff_to_largest_class)
+            repeated_last_recall = np.tile(cumulative_recall[-1], diff_to_largest_class)
+            extended_precision = np.concatenate((cumulative_precision, repeated_last_precision))
+            extended_recall = np.concatenate((cumulative_recall, repeated_last_recall))
+            cumulative_precision_macro += extended_precision
+            cumulative_recall_macro += extended_recall
+        else:
+            cumulative_precision_macro += cumulative_precision
+            cumulative_recall_macro += cumulative_recall
+
+        number_of_nonzero_classes += 1
+
+    # calculate micro averaged precision and recall
+    tp = cumulative_true_positives_micro
+    fp = cumulative_false_positives_micro
+    cumulative_precision_micro = np.where(tp + fp > 0, tp / (tp + fp), 0)
+    cumulative_recall_micro = tp / total_number_gt
 
-    return cumulative_precisions, cumulative_recalls
+    # calculate macro averaged precision and recall
+    cumulative_precision_macro /= number_of_nonzero_classes
+    cumulative_recall_macro /= number_of_nonzero_classes
+
+    return (cumulative_precisions, cumulative_recalls,
+            cumulative_precision_micro, cumulative_recall_micro,
+            cumulative_precision_macro, cumulative_recall_macro
+            )
 
 
 def get_f1_score(cumulative_precisions: List[np.ndarray],
                  cumulative_recalls: List[np.ndarray],
-                 nr_classes: int) -> List[np.ndarray]:
+                 cumulative_precision_micro: np.ndarray,
+                 cumulative_recall_micro: np.ndarray,
+                 cumulative_precision_macro: np.ndarray,
+                 cumulative_recall_macro: np.ndarray,
+                 nr_classes: int) -> Tuple[List[np.ndarray],
+                                           np.ndarray, np.ndarray]:
     """
     Computes the F1 score for every class.
 
     Args:
         cumulative_precisions: cumulative precisions for each class
         cumulative_recalls: cumulative recalls for each class
+        cumulative_precision_micro: cumulative precision micro averaged
+        cumulative_recall_micro: cumulative recall micro averaged
+        cumulative_precision_macro: cumulative precision macro averaged
+        cumulative_recall_macro: cumulative recall macro averaged
         nr_classes: number of classes
 
     Returns:
-        cumulative F1 score per class
+        cumulative F1 score per class,
+        cumulative F1 score micro averaged, cumulative F1 score macro averaged
     """
     cumulative_f1_scores = [[]]
 
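The `np.tile`/`np.concatenate` branch above pads shorter per-class curves by repeating their last value, so every class contributes a value at every detection rank of the largest class before the macro sum. A toy version of that padding (illustrative values):

```python
import numpy as np

longest = 5
precision_short = np.array([1.0, 0.5, 0.66])

# Extend the shorter curve by repeating its final value.
diff = longest - precision_short.shape[0]
padded = np.concatenate((precision_short, np.tile(precision_short[-1], diff))) \
    if diff else precision_short
print(padded)  # [1.0, 0.5, 0.66, 0.66, 0.66]
```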
@@ -321,8 +386,13 @@ def get_f1_score(cumulative_precisions: List[np.ndarray],
             continue
         f1_score = 2 * ((cumulative_precision * cumulative_recall) / (cumulative_precision + cumulative_recall + 0.001))
         cumulative_f1_scores.append(f1_score)
 
-    return cumulative_f1_scores
+    f1_score_micro = 2 * ((cumulative_precision_micro * cumulative_recall_micro) /
+                          (cumulative_precision_micro + cumulative_recall_micro + 0.001))
+    f1_score_macro = 2 * ((cumulative_precision_macro * cumulative_recall_macro) /
+                          (cumulative_precision_macro + cumulative_recall_macro + 0.001))
+
+    return cumulative_f1_scores, f1_score_micro, f1_score_macro
 
 
 def get_mean_average_precisions(cumulative_precisions: List[np.ndarray],
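The F1 computation above adds 0.001 to the denominator so the division stays defined when precision and recall are both zero. A toy check with made-up values:

```python
import numpy as np

precision = np.array([1.0, 0.5, 0.0])
recall = np.array([0.1, 0.2, 0.0])

# Same smoothed harmonic mean as in the diff.
f1 = 2 * ((precision * recall) / (precision + recall + 0.001))
print(f1)  # last entry is 0.0 instead of NaN
```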