diff --git a/src/twomartens/masterthesis/cli.py b/src/twomartens/masterthesis/cli.py
index 2891314..a922d51 100644
--- a/src/twomartens/masterthesis/cli.py
+++ b/src/twomartens/masterthesis/cli.py
@@ -312,16 +312,25 @@ def _ssd_evaluate(args: argparse.Namespace) -> None:
     true_positives, false_positives, \
     cum_true_positives, cum_false_positives, \
-    open_set_error, cumulative_open_set_error = evaluate.match_predictions(predictions_per_class, labels,
-                                                                           bounding_box_utils.iou,
-                                                                           nr_classes, iou_threshold)
+    open_set_error, cumulative_open_set_error, \
+    cum_true_positives_overall, cum_false_positives_overall = evaluate.match_predictions(predictions_per_class,
+                                                                                         labels,
+                                                                                         bounding_box_utils.iou,
+                                                                                         nr_classes, iou_threshold)
 
-    cum_precisions, cum_recalls = evaluate.get_precision_recall(number_gt_per_class,
-                                                                cum_true_positives,
-                                                                cum_false_positives,
-                                                                nr_classes)
+    cum_precisions, cum_recalls, \
+    cum_precisions_micro, cum_recalls_micro, \
+    cum_precisions_macro, cum_recalls_macro = evaluate.get_precision_recall(number_gt_per_class,
+                                                                            cum_true_positives,
+                                                                            cum_false_positives,
+                                                                            cum_true_positives_overall,
+                                                                            cum_false_positives_overall,
+                                                                            nr_classes)
 
-    f1_scores = evaluate.get_f1_score(cum_precisions, cum_recalls, nr_classes)
+    f1_scores, f1_scores_micro, f1_scores_macro = evaluate.get_f1_score(cum_precisions, cum_recalls,
+                                                                        cum_precisions_micro, cum_recalls_micro,
+                                                                        cum_precisions_macro, cum_recalls_macro,
+                                                                        nr_classes)
     average_precisions = evaluate.get_mean_average_precisions(cum_precisions, cum_recalls, nr_classes)
     mean_average_precision = evaluate.get_mean_average_precision(average_precisions)
@@ -329,9 +338,17 @@ def _ssd_evaluate(args: argparse.Namespace) -> None:
                                          false_positives,
                                          cum_true_positives,
                                          cum_false_positives,
+                                         cum_true_positives_overall,
+                                         cum_false_positives_overall,
                                          cum_precisions,
                                          cum_recalls,
+                                         cum_precisions_micro,
+                                         cum_recalls_micro,
+                                         cum_precisions_macro,
+                                         cum_recalls_macro,
                                          f1_scores,
+                                         f1_scores_micro,
+                                         f1_scores_macro,
                                          average_precisions,
                                          mean_average_precision,
                                          open_set_error,
@@ -868,9 +885,17 @@ def _ssd_evaluate_get_results(true_positives: Sequence[np.ndarray],
                               false_positives: Sequence[np.ndarray],
                               cum_true_positives: Sequence[np.ndarray],
                               cum_false_positives: Sequence[np.ndarray],
+                              cum_true_positives_micro: np.ndarray,
+                              cum_false_positives_micro: np.ndarray,
                               cum_precisions: Sequence[np.ndarray],
                               cum_recalls: Sequence[np.ndarray],
+                              cum_precision_micro: np.ndarray,
+                              cum_recall_micro: np.ndarray,
+                              cum_precision_macro: np.ndarray,
+                              cum_recall_macro: np.ndarray,
                               f1_scores: Sequence[np.ndarray],
+                              f1_scores_micro: np.ndarray,
+                              f1_scores_macro: np.ndarray,
                               average_precisions: Sequence[float],
                               mean_average_precision: float,
                               open_set_error: np.ndarray,
@@ -881,9 +906,17 @@ def _ssd_evaluate_get_results(true_positives: Sequence[np.ndarray],
         "false_positives": false_positives,
         "cumulative_true_positives": cum_true_positives,
         "cumulative_false_positives": cum_false_positives,
+        "cumulative_true_positives_micro": cum_true_positives_micro,
+        "cumulative_false_positives_micro": cum_false_positives_micro,
         "cumulative_precisions": cum_precisions,
         "cumulative_recalls": cum_recalls,
+        "cumulative_precision_micro": cum_precision_micro,
+        "cumulative_recall_micro": cum_recall_micro,
+        "cumulative_precision_macro": cum_precision_macro,
+        "cumulative_recall_macro": cum_recall_macro,
         "f1_scores": f1_scores,
+        "f1_scores_micro": f1_scores_micro,
+        "f1_scores_macro": f1_scores_macro,
         "mean_average_precisions": average_precisions,
         "mean_average_precision": mean_average_precision,
         "open_set_error": open_set_error,
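Reviewer note: the cli.py hunks above only unpack the new return values and thread them through to the results dict; the averaging itself happens in evaluate.py below. As orientation, a minimal sketch (toy counts, not taken from the patch) of how micro and macro averaging differ at a single cut-off:

```python
import numpy as np

# hypothetical final cumulative counts for two classes
tp = np.array([90, 10])   # true positives per class
fp = np.array([10, 30])   # false positives per class

per_class_precision = tp / (tp + fp)            # [0.9, 0.25]
macro_precision = per_class_precision.mean()    # 0.575: every class weighs equally
micro_precision = tp.sum() / (tp + fp).sum()    # 100/140 ~= 0.714: large classes dominate
```

Micro averaging pools all detections before dividing; macro averaging averages the per-class ratios. The patch computes both along the whole cumulative curve rather than at a single point.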
diff --git a/src/twomartens/masterthesis/evaluate.py b/src/twomartens/masterthesis/evaluate.py
index 863cba4..d3b850f 100644
--- a/src/twomartens/masterthesis/evaluate.py
+++ b/src/twomartens/masterthesis/evaluate.py
@@ -101,6 +101,7 @@ def match_predictions(predictions: Sequence[Sequence[Tuple[int, float, float, in
                       border_pixels: str = "include",
                       sorting_algorithm: str = "quicksort") -> Tuple[List[np.ndarray], List[np.ndarray],
                                                                      List[np.ndarray], List[np.ndarray],
+                                                                     np.ndarray, np.ndarray,
                                                                      np.ndarray, np.ndarray]:
     """
     Matches predictions to ground truth boxes.
@@ -126,7 +127,8 @@
     Returns:
         true positives, false positives, cumulative true positives, and cumulative false positives for
-        each class, open set error as defined by Miller et al, cumulative open set error
+        each class, open set error as defined by Miller et al., cumulative open set error,
+        cumulative true positives and cumulative false positives over all classes
     """
     true_positives = [[]]  # The true positives for each class, sorted by descending confidence.
     false_positives = [[]]  # The false positives for each class, sorted by descending confidence.
@@ -140,7 +142,9 @@
         most_predictions = nr_predictions
 
     open_set_error = np.zeros(most_predictions, dtype=np.int)
-    
+    true_positives_micro = np.zeros(most_predictions, dtype=np.int)
+    false_positives_micro = np.zeros(most_predictions, dtype=np.int)
+    
     for class_id in range(1, nr_classes + 1):
         predictions_class = predictions[class_id]
@@ -198,6 +202,7 @@
                 # If the image doesn't contain any objects of this class,
                 # the prediction becomes a false positive.
                 false_pos[i] = 1
+                false_positives_micro[i] += 1
                 open_set_error[i] += 1
                 continue
@@ -219,12 +224,14 @@
                     # Those predictions whose matched overlap is below the threshold become
                     # false positives.
                     false_pos[i] = 1
+                    false_positives_micro[i] += 1
                 else:
                     if image_id not in gt_matched:
                         # True positive:
                         # If the matched ground truth box for this prediction hasn't been matched to a
                         # different prediction already, we have a true positive.
                         true_pos[i] = 1
+                        true_positives_micro[i] += 1
                         gt_matched[image_id] = np.zeros(shape=(gt.shape[0]), dtype=np.bool)
                         gt_matched[image_id][gt_match_index] = True
                     elif not gt_matched[image_id][gt_match_index]:
@@ -232,6 +239,7 @@
                         # If the matched ground truth box for this prediction hasn't been matched to a
                         # different prediction already, we have a true positive.
                         true_pos[i] = 1
+                        true_positives_micro[i] += 1
                         gt_matched[image_id][gt_match_index] = True
                     else:
                         # False positive, duplicate detection:
                         # If the matched ground truth box for this prediction has been matched
                         # to a different prediction previously, it is a duplicate detection for an
                         # already detected object, which counts as a false positive.
                         false_pos[i] = 1
+                        false_positives_micro[i] += 1
 
         true_positives.append(true_pos)
         false_positives.append(false_pos)
@@ -250,17 +259,24 @@
         cumulative_false_positives.append(cumulative_false_pos)
 
     cumulative_open_set_error = np.cumsum(open_set_error)
+    cumulative_false_positives_micro = np.cumsum(false_positives_micro)
+    cumulative_true_positives_micro = np.cumsum(true_positives_micro)
 
     return (
         true_positives, false_positives,
         cumulative_true_positives, cumulative_false_positives,
-        open_set_error, cumulative_open_set_error
+        open_set_error, cumulative_open_set_error,
+        cumulative_true_positives_micro, cumulative_false_positives_micro
     )
 
 
 def get_precision_recall(number_gt_per_class: np.ndarray,
                          cumulative_true_positives: Sequence[np.ndarray],
                          cumulative_false_positives: Sequence[np.ndarray],
-                         nr_classes: int) -> Tuple[List[np.ndarray], List[np.ndarray]]:
+                         cumulative_true_positives_micro: np.ndarray,
+                         cumulative_false_positives_micro: np.ndarray,
+                         nr_classes: int) -> Tuple[List[np.ndarray], List[np.ndarray],
+                                                   np.ndarray, np.ndarray,
+                                                   np.ndarray, np.ndarray]:
     """
     Computes the precision and recall values and returns them.
@@ -268,13 +284,23 @@
         number_gt_per_class: number of ground truth bounding boxes per class
         cumulative_true_positives: cumulative true positives per class
         cumulative_false_positives: cumulative false positives per class
+        cumulative_true_positives_micro: cumulative true positives over all classes
+        cumulative_false_positives_micro: cumulative false positives over all classes
         nr_classes: number of classes
 
     Returns:
-        cumulative precisions and cumulative recalls per class
+        cumulative precisions and cumulative recalls per class,
+        micro-averaged precision/recall, and
+        macro-averaged precision/recall
     """
     cumulative_precisions = [[]]
     cumulative_recalls = [[]]
+    # float dtype is required here: with the integer dtype inherited from the cumsum
+    # arrays, the in-place additions in the macro-averaging branch below would raise
+    # a casting error
+    cumulative_precision_micro = np.zeros_like(cumulative_true_positives_micro, dtype=np.float)
+    cumulative_recall_micro = np.zeros_like(cumulative_true_positives_micro, dtype=np.float)
+    cumulative_precision_macro = np.zeros_like(cumulative_precision_micro)
+    cumulative_recall_macro = np.zeros_like(cumulative_recall_micro)
+    total_number_gt = 0
+    number_of_nonzero_classes = 0
 
     # Iterate over all classes.
     for class_id in range(1, nr_classes + 1):
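Reviewer note: the `_micro` arrays initialised above are cumulative counts over the detections of all classes combined. A minimal standalone sketch (invented indicator vectors) of the curves they feed into:

```python
import numpy as np

# five detections sorted by descending confidence: 1 = true positive, 0 = false positive
tp_indicators = np.array([1, 1, 0, 1, 0])
fp_indicators = 1 - tp_indicators
total_gt = 4  # ground truth boxes over all classes

cum_tp = np.cumsum(tp_indicators)             # [1, 2, 2, 3, 3]
cum_fp = np.cumsum(fp_indicators)             # [0, 0, 1, 1, 2]
precision_micro = cum_tp / (cum_tp + cum_fp)  # [1.0, 1.0, 0.67, 0.75, 0.6]
recall_micro = cum_tp / total_gt              # [0.25, 0.5, 0.5, 0.75, 0.75]
```

One caveat: `match_predictions` increments `true_positives_micro[i]`/`false_positives_micro[i]` at the within-class rank `i`, so position `k` of the micro curve pools the top-`k` detections of every class rather than the top-`k` detections overall; the sketch assumes a single pooled ranking for simplicity.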
@@ -288,27 +314,66 @@ def get_precision_recall(number_gt_per_class,
         fp = cumulative_false_positives[class_id]
         cumulative_precision = np.where(tp + fp > 0, tp / (tp + fp), 0)  # 1D array with shape `(num_predictions,)`
-        cumulative_recall = tp / number_gt_per_class[class_id]  # 1D array with shape `(num_predictions,)`
+        number_gt = number_gt_per_class[class_id]
+        total_number_gt += number_gt
+        cumulative_recall = tp / number_gt  # 1D array with shape `(num_predictions,)`
         cumulative_precisions.append(cumulative_precision)
         cumulative_recalls.append(cumulative_recall)
+
+        # pad shorter per-class curves with their final value so that every curve
+        # has the length of the class with the most predictions
+        diff_to_largest_class = cumulative_precision_micro.shape[0] - cumulative_precision.shape[0]
+        if diff_to_largest_class:
+            repeated_last_precision = np.tile(cumulative_precision[-1], diff_to_largest_class)
+            repeated_last_recall = np.tile(cumulative_recall[-1], diff_to_largest_class)
+            extended_precision = np.concatenate((cumulative_precision, repeated_last_precision))
+            extended_recall = np.concatenate((cumulative_recall, repeated_last_recall))
+            cumulative_precision_macro += extended_precision
+            cumulative_recall_macro += extended_recall
+        else:
+            cumulative_precision_macro += cumulative_precision
+            cumulative_recall_macro += cumulative_recall
+
+        number_of_nonzero_classes += 1
+
+    # calculate micro-averaged precision and recall
+    tp = cumulative_true_positives_micro
+    fp = cumulative_false_positives_micro
+    cumulative_precision_micro = np.where(tp + fp > 0, tp / (tp + fp), 0)
+    cumulative_recall_micro = tp / total_number_gt
 
-    return cumulative_precisions, cumulative_recalls
+    # calculate macro-averaged precision and recall
+    cumulative_precision_macro /= number_of_nonzero_classes
+    cumulative_recall_macro /= number_of_nonzero_classes
+
+    return (cumulative_precisions, cumulative_recalls,
+            cumulative_precision_micro, cumulative_recall_micro,
+            cumulative_precision_macro, cumulative_recall_macro
+            )
 
 
 def get_f1_score(cumulative_precisions: List[np.ndarray],
                  cumulative_recalls: List[np.ndarray],
-                 nr_classes: int) -> List[np.ndarray]:
+                 cumulative_precision_micro: np.ndarray,
+                 cumulative_recall_micro: np.ndarray,
+                 cumulative_precision_macro: np.ndarray,
+                 cumulative_recall_macro: np.ndarray,
+                 nr_classes: int) -> Tuple[List[np.ndarray],
+                                           np.ndarray, np.ndarray]:
     """
     Computes the F1 score for every class.
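Reviewer note: the `diff_to_largest_class` branch above exists because each per-class curve has as many entries as that class has predictions, so shorter curves must be brought to a common length before they can be summed element-wise. A self-contained sketch of the padding idea with invented values:

```python
import numpy as np

curve_a = np.array([1.0, 0.5, 0.67, 0.75])  # class with four predictions
curve_b = np.array([1.0, 1.0])              # class with only two predictions

diff = curve_a.shape[0] - curve_b.shape[0]
curve_b_ext = np.concatenate((curve_b, np.tile(curve_b[-1], diff)))  # [1.0, 1.0, 1.0, 1.0]

macro_curve = (curve_a + curve_b_ext) / 2   # element-wise macro average
```

Repeating the final value models a class that makes no further predictions: its precision and recall simply stay frozen at their last observed level.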
     Args:
         cumulative_precisions: cumulative precisions for each class
         cumulative_recalls: cumulative recalls for each class
+        cumulative_precision_micro: micro-averaged cumulative precision
+        cumulative_recall_micro: micro-averaged cumulative recall
+        cumulative_precision_macro: macro-averaged cumulative precision
+        cumulative_recall_macro: macro-averaged cumulative recall
         nr_classes: number of classes
 
     Returns:
-        cumulative F1 score per class
+        cumulative F1 score per class,
+        micro-averaged cumulative F1 score, macro-averaged cumulative F1 score
     """
     cumulative_f1_scores = [[]]
@@ -321,8 +386,13 @@ def get_f1_score(cumulative_precisions: List[np.ndarray],
             continue
         f1_score = 2 * ((cumulative_precision * cumulative_recall) / (cumulative_precision + cumulative_recall + 0.001))
         cumulative_f1_scores.append(f1_score)
+
+    f1_score_micro = 2 * ((cumulative_precision_micro * cumulative_recall_micro) /
+                          (cumulative_precision_micro + cumulative_recall_micro + 0.001))
+    f1_score_macro = 2 * ((cumulative_precision_macro * cumulative_recall_macro) /
+                          (cumulative_precision_macro + cumulative_recall_macro + 0.001))
 
-    return cumulative_f1_scores
+    return cumulative_f1_scores, f1_score_micro, f1_score_macro
 
 
 def get_mean_average_precisions(cumulative_precisions: List[np.ndarray],
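Reviewer note: all three F1 variants use the same smoothed formula. A standalone sketch (assumed micro-averaged input curves) of what the `0.001` term does:

```python
import numpy as np

precision_micro = np.array([1.0, 0.5, 0.0])
recall_micro = np.array([0.25, 0.25, 0.0])

# exact F1 would be 2*p*r / (p + r), which is undefined where p == r == 0;
# the epsilon keeps every position defined (0 / 0.001 == 0) at the cost of a
# slightly smaller score everywhere else
f1_micro = 2 * (precision_micro * recall_micro) / (precision_micro + recall_micro + 0.001)
```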