Commit a538696

Merge pull request #45 from Pandoro/fork/ap_computation
Fixes the "wrong" computation of the AP score for newer sklearn versions.
2 parents: 8d4dca0 + 047b989 · commit: a538696

File tree

2 files changed: +56 -1 lines changed


README.md (+3 lines)

@@ -273,6 +273,9 @@ The evaluation code in this repository simply uses the scikit-learn code, and th
 Unfortunately, almost no paper mentions which code-base they used and how they computed `mAP` scores, so comparison is difficult.
 Other frameworks have [the same problem](https://github.com/Cysu/open-reid/issues/50), but we expect many not to be aware of this.
 
+We provide evaluation code that computes the mAP as done by the Market-1501 MATLAB evaluation script, independent of the scikit-learn version.
+This can be used by providing the `--use_market_ap` flag when running `evaluate.py`.
+
 # Independent re-implementations
 
 These are the independent re-implementations of our paper that we are aware of,

evaluate.py (+53 -1 lines)

@@ -51,6 +51,52 @@
     '--batch_size', default=256, type=common.positive_int,
     help='Batch size used during evaluation, adapt based on your memory usage.')
 
+parser.add_argument(
+    '--use_market_ap', action='store_true', default=False,
+    help='When this flag is provided, the average precision is computed exactly'
+         ' as done by the Market-1501 evaluation script, rather than the '
+         'default scikit-learn implementation that gives slightly different '
+         'scores.')
+
+
+def average_precision_score_market(y_true, y_score):
+    """Compute average precision (AP) from prediction scores.
+
+    This is a replacement for the scikit-learn version which, while likely more
+    correct, does not follow the same protocol as used in the default Market-1501
+    evaluation that first introduced this score to the person ReID field.
+
+    Args:
+        y_true (array): The binary labels for all data points.
+        y_score (array): The predicted scores for each sample for all data
+            points.
+
+    Raises:
+        ValueError: If the lengths of the labels and scores do not match.
+
+    Returns:
+        A float representing the average precision given the predictions.
+    """
+
+    if len(y_true) != len(y_score):
+        raise ValueError('The lengths of the labels and predictions must match, '
+                         'got lengths y_true: {} and y_score: {}'.format(
+                             len(y_true), len(y_score)))
+
+    # Mergesort is used since it is a stable sorting algorithm. This is
+    # important to compute consistent and correct scores.
+    y_true_sorted = y_true[np.argsort(-y_score, kind='mergesort')]
+
+    tp = np.cumsum(y_true_sorted)
+    total_true = np.sum(y_true_sorted)
+    recall = tp / total_true
+    recall = np.insert(recall, 0, 0.)
+    precision = tp / np.arange(1, len(tp) + 1)
+    precision = np.insert(precision, 0, 1.)
+    ap = np.sum(np.diff(recall) * ((precision[1:] + precision[:-1]) / 2))
+
+    return ap
+
 
 def main():
     # Verify that parameters are set correctly.
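For context (not part of the commit): a minimal, self-contained sketch contrasting this trapezoidal AP with scikit-learn's `average_precision_score` on a made-up toy ranking. The helper name `market_style_ap` and the example labels and scores are purely illustrative.

```python
import numpy as np
from sklearn.metrics import average_precision_score


def market_style_ap(y_true, y_score):
    # Sort labels by descending score; mergesort is stable, so ties keep their order.
    order = np.argsort(-np.asarray(y_score, dtype=float), kind='mergesort')
    y_true_sorted = np.asarray(y_true)[order]
    tp = np.cumsum(y_true_sorted)
    recall = np.insert(tp / tp[-1], 0, 0.)
    precision = np.insert(tp / np.arange(1, len(tp) + 1), 0, 1.)
    # Trapezoidal integration of the precision-recall curve, as in the MATLAB script.
    return np.sum(np.diff(recall) * (precision[1:] + precision[:-1]) / 2)


# Toy example: three matches among six ranked gallery candidates.
y_true = np.array([1, 0, 1, 1, 0, 0])
y_score = np.array([0.9, 0.8, 0.7, 0.6, 0.5, 0.4])
print('Market-style AP:', market_style_ap(y_true, y_score))
print('scikit-learn AP:', average_precision_score(y_true, y_score))
```

Newer scikit-learn versions compute AP as a step-wise sum over the precision-recall curve rather than with trapezoidal interpolation, so the two printed values generally differ slightly; that discrepancy is exactly what this commit works around.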
@@ -83,6 +129,12 @@ def main():
 
     batch_distances = loss.cdist(batch_embs, gallery_embs, metric=args.metric)
 
+    # Check if we should use Market-1501 specific average precision computation.
+    if args.use_market_ap:
+        average_precision = average_precision_score_market
+    else:
+        average_precision = average_precision_score
+
     # Loop over the query embeddings and compute their APs and the CMC curve.
     aps = []
     cmc = np.zeros(len(gallery_pids), dtype=np.int32)
@@ -117,7 +169,7 @@
             # it won't change anything.
             scores = 1 / (1 + distances)
             for i in range(len(distances)):
-                ap = average_precision_score(pid_matches[i], scores[i])
+                ap = average_precision(pid_matches[i], scores[i])
 
                 if np.isnan(ap):
                     print()
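As an aside (not part of the commit): the `scores = 1 / (1 + distances)` line and the "it won't change anything" comment rely on AP depending only on the ranking of the scores. A tiny check of that, with made-up distances and match labels:

```python
import numpy as np
from sklearn.metrics import average_precision_score

# Hypothetical distances of five gallery images to one query, and whether each matches its ID.
distances = np.array([0.2, 1.5, 0.7, 3.0, 0.1])
pid_matches = np.array([1, 0, 1, 0, 1])

# 1 / (1 + d) is strictly decreasing in d, so it induces the same ranking as -d.
ap_transformed = average_precision_score(pid_matches, 1 / (1 + distances))
ap_negated = average_precision_score(pid_matches, -distances)
assert np.isclose(ap_transformed, ap_negated)
```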
