9. Benchmarking Bounding Box Detection Using Evaluation Data#
Written by Tiankang Xie
In this tutorial we will demonstrate how to evaluate py-feat's bounding box (face detection) algorithms with evaluation data.
import pickle
import numpy as np
import os
from scipy.io import loadmat
from tqdm import tqdm
from PIL import Image, ImageDraw
import matplotlib.pyplot as plt
import pandas as pd
import torchvision.transforms as transforms
import torch
import glob
from feat import Detector
from feat.facepose_detectors.img2pose.img2pose_test import Img2Pose
The benchmark script was originally provided by the WIDER FACE authors in MATLAB. We borrow code from https://github.com/wondervictor/WiderFace-Evaluation, which is a Python reimplementation of the original MATLAB code.
# Copyright (c) OpenMMLab. All rights reserved.
def voc_ap(rec, prec):
# correct AP calculation
# first append sentinel values at the end
mrec = np.concatenate(([0.], rec, [1.]))
mpre = np.concatenate(([0.], prec, [0.]))
# compute the precision envelope
for i in range(mpre.size - 1, 0, -1):
mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])
# to calculate area under PR curve, look for points
# where X axis (recall) changes value
i = np.where(mrec[1:] != mrec[:-1])[0]
# and sum (\Delta recall) * prec
ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])
return ap
def dataset_pr_info(thresh_num, pr_curve, count_face):
_pr_curve = np.zeros((thresh_num, 2))
for i in range(thresh_num):
_pr_curve[i, 0] = pr_curve[i, 1] / pr_curve[i, 0]
_pr_curve[i, 1] = pr_curve[i, 1] / count_face
return _pr_curve
def img_pr_info(thresh_num, pred_info, proposal_list, pred_recall):
pr_info = np.zeros((thresh_num, 2)).astype('float')
for t in range(thresh_num):
thresh = 1 - (t+1)/thresh_num
r_index = np.where(pred_info[:, 4] >= thresh)[0]
if len(r_index) == 0:
pr_info[t, 0] = 0
pr_info[t, 1] = 0
else:
r_index = r_index[-1]
p_index = np.where(proposal_list[:r_index+1] == 1)[0]
pr_info[t, 0] = len(p_index)
pr_info[t, 1] = pred_recall[r_index]
return pr_info
def bbox_overlaps(bboxes1,
bboxes2,
mode='iou',
eps=1e-6,
use_legacy_coordinate=False):
"""Calculate the ious between each bbox of bboxes1 and bboxes2.
Args:
bboxes1 (ndarray): Shape (n, 4)
bboxes2 (ndarray): Shape (k, 4)
mode (str): IOU (intersection over union) or IOF (intersection
over foreground)
use_legacy_coordinate (bool): Whether to use coordinate system in
mmdet v1.x, which means width and height should be
calculated as 'x2 - x1 + 1' and 'y2 - y1 + 1' respectively.
Note that when the function is used in `VOCDataset`, it should be
True to align with the official implementation
`http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCdevkit_18-May-2011.tar`
Default: False.
Returns:
ious (ndarray): Shape (n, k)
"""
assert mode in ['iou', 'iof']
if not use_legacy_coordinate:
extra_length = 0.
else:
extra_length = 1.
bboxes1 = bboxes1.astype(np.float32)
bboxes2 = bboxes2.astype(np.float32)
rows = bboxes1.shape[0]
cols = bboxes2.shape[0]
ious = np.zeros((rows, cols), dtype=np.float32)
if rows * cols == 0:
return ious
exchange = False
if bboxes1.shape[0] > bboxes2.shape[0]:
bboxes1, bboxes2 = bboxes2, bboxes1
ious = np.zeros((cols, rows), dtype=np.float32)
exchange = True
area1 = (bboxes1[:, 2] - bboxes1[:, 0] + extra_length) * (
bboxes1[:, 3] - bboxes1[:, 1] + extra_length)
area2 = (bboxes2[:, 2] - bboxes2[:, 0] + extra_length) * (
bboxes2[:, 3] - bboxes2[:, 1] + extra_length)
for i in range(bboxes1.shape[0]):
x_start = np.maximum(bboxes1[i, 0], bboxes2[:, 0])
y_start = np.maximum(bboxes1[i, 1], bboxes2[:, 1])
x_end = np.minimum(bboxes1[i, 2], bboxes2[:, 2])
y_end = np.minimum(bboxes1[i, 3], bboxes2[:, 3])
overlap = np.maximum(x_end - x_start + extra_length, 0) * np.maximum(
y_end - y_start + extra_length, 0)
if mode == 'iou':
union = area1[i] + area2 - overlap
else:
union = area1[i] if not exchange else area2
union = np.maximum(union, eps)
ious[i, :] = overlap / union
if exchange:
ious = ious.T
return ious
def image_eval(pred, gt, ignore, iou_thresh):
""" single image evaluation
pred: Nx5
gt: Nx4
ignore: array with one entry per gt box; 1 = evaluate this box, 0 = ignore it
"""
_pred = pred.copy()
_gt = gt.copy()
pred_recall = np.zeros(_pred.shape[0])
recall_list = np.zeros(_gt.shape[0])
proposal_list = np.ones(_pred.shape[0])
# _pred[:, 2] = _pred[:, 2] + _pred[:, 0]
# _pred[:, 3] = _pred[:, 3] + _pred[:, 1]
_gt[:, 2] = _gt[:, 2] + _gt[:, 0]
_gt[:, 3] = _gt[:, 3] + _gt[:, 1]
overlaps = bbox_overlaps(_pred[:, :4], _gt)
for h in range(_pred.shape[0]):
gt_overlap = overlaps[h]
max_overlap, max_idx = gt_overlap.max(), gt_overlap.argmax()
if max_overlap >= iou_thresh:
if ignore[max_idx] == 0:
recall_list[max_idx] = -1
proposal_list[h] = -1
elif recall_list[max_idx] == 0:
recall_list[max_idx] = 1
r_keep_index = np.where(recall_list == 1)[0]
pred_recall[h] = len(r_keep_index)
return pred_recall, proposal_list
def get_gt_boxes(gt_dir):
""" gt dir: (wider_face_val.mat, wider_easy_val.mat, wider_medium_val.mat, wider_hard_val.mat)"""
gt_mat = loadmat(os.path.join(gt_dir, 'wider_face_val.mat'))
hard_mat = loadmat(os.path.join(gt_dir, 'wider_hard_val.mat'))
medium_mat = loadmat(os.path.join(gt_dir, 'wider_medium_val.mat'))
easy_mat = loadmat(os.path.join(gt_dir, 'wider_easy_val.mat'))
facebox_list = gt_mat['face_bbx_list']
event_list = gt_mat['event_list']
file_list = gt_mat['file_list']
hard_gt_list = hard_mat['gt_list']
medium_gt_list = medium_mat['gt_list']
easy_gt_list = easy_mat['gt_list']
return facebox_list, event_list, file_list, hard_gt_list, medium_gt_list, easy_gt_list
def load_preds(pred_dir='/Storage/Projects/pyfeat_testing/Data/WIDER_BBOX_IMG2POSE/preds.pkl'):
with open(pred_dir, 'rb') as fp:
all_imgs, all_pred_vals = pickle.load(fp)
boxes = dict()
for i, img_name in enumerate(all_imgs):
event_name = [mai for mai in img_name.split('/') if '--' in mai][0]
if event_name not in boxes:
boxes[event_name] = {}
pred_box = np.array(all_pred_vals[i])
boxes[event_name][os.path.basename(img_name).rstrip('.jpg')] = pred_box
return boxes
def norm_score(pred):
""" norm score
pred {key: [[x1,y1,x2,y2,s]]}
"""
max_score = 0
min_score = 1
for _, k in pred.items():
for _, v in k.items():
if len(v.shape) == 0 or len(v) == 0:
continue
_min = np.min(v[:, -1])
_max = np.max(v[:, -1])
max_score = max(_max, max_score)
min_score = min(_min, min_score)
diff = max_score - min_score
for _, k in pred.items():
for _, v in k.items():
if len(v.shape) == 0 or len(v) == 0:
continue
v[:, -1] = (v[:, -1] - min_score)/diff
def print_ap_scores(result_fp):
iou_thresh=0.5
pred = load_preds(pred_dir=result_fp) # Where you save the result in the extract_bbox_img2Pose.py script
norm_score(pred)
facebox_list, event_list, file_list, hard_gt_list, medium_gt_list, easy_gt_list = get_gt_boxes(gt_dir='/Storage/Data/wider/wider_face_split/') # Where ground-truth are stored
event_num = len(event_list)
thresh_num = 1000
settings = ['easy', 'medium', 'hard']
setting_gts = [easy_gt_list, medium_gt_list, hard_gt_list]
aps = []
for setting_id in range(3):
# different setting
gt_list = setting_gts[setting_id]
count_face = 0
pr_curve = np.zeros((thresh_num, 2)).astype('float')
# settings order: [easy, medium, hard]
pbar = tqdm(range(event_num))
for i in pbar:
pbar.set_description('Processing {}'.format(settings[setting_id]))
event_name = str(event_list[i][0][0])
img_list = file_list[i][0]
pred_list = pred[event_name]
sub_gt_list = gt_list[i][0]
# img_pr_info_list = np.zeros((len(img_list), thresh_num, 2))
gt_bbx_list = facebox_list[i][0]
for j in range(len(img_list)):
pred_info = pred_list[str(img_list[j][0][0])]
gt_boxes = gt_bbx_list[j][0].astype('float')
keep_index = sub_gt_list[j][0]
count_face += len(keep_index)
if len(gt_boxes) == 0 or len(pred_info.shape) == 0 or len(pred_info) == 0:
continue
ignore = np.zeros(gt_boxes.shape[0])
if len(keep_index) != 0:
ignore[keep_index-1] = 1
pred_recall, proposal_list = image_eval(pred_info, gt_boxes, ignore, iou_thresh)
_img_pr_info = img_pr_info(thresh_num, pred_info, proposal_list, pred_recall)
pr_curve += _img_pr_info
pr_curve = dataset_pr_info(thresh_num, pr_curve, count_face)
propose = pr_curve[:, 0]
recall = pr_curve[:, 1]
ap = voc_ap(recall, propose)
aps.append(ap)
print("==================== Results ====================")
print("Easy Val AP: {}".format(aps[0]))
print("Medium Val AP: {}".format(aps[1]))
print("Hard Val AP: {}".format(aps[2]))
print("=================================================")
return aps
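To make the evaluation helpers above more concrete, here is a quick sanity check of bbox_overlaps and voc_ap on made-up values (illustrative only; it assumes the cells above have been run):
# Illustrative sanity check of the evaluation helpers (made-up boxes and scores)
toy_gt = np.array([[10., 10., 20., 20.]])          # one ground-truth box in x1, y1, x2, y2 format
toy_pred = np.array([[12., 12., 22., 22., 0.9]])   # one prediction; last column is the confidence score
print(bbox_overlaps(toy_pred[:, :4], toy_gt))      # IoU = 64 / 136, roughly 0.47
# AP of a toy two-point precision-recall curve: precision 1.0 up to recall 0.5, then 0.5
print(voc_ap(rec=np.array([0.5, 1.0]), prec=np.array([1.0, 0.5])))  # 0.5*1.0 + 0.5*0.5 = 0.75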
We provide the path to the WIDER dataset, the ground truth labels, and the directory where the results will be saved. The dataset and ground truth labels can be downloaded at http://shuoyang1213.me/WIDERFACE/
data_dir = '/Storage/Data/wider/'
true_result_dir = '/Storage/Data/wider/wider_face_split/'
save_result_dir = '/Storage/Projects/pyfeat_testing/Data_Eshin/facebox_test/'
all_imgs = glob.glob(data_dir+'WIDER_val/images/**/*.jpg')
1. Test of FaceBoxes#
detector = Detector(face_model='faceboxes',emotion_model='resmasknet', landmark_model='mobilefacenet', au_model='xgb', device='cpu')
all_pred_vals = []
for img in tqdm(all_imgs):
im1 = Image.open(img)
face_aus = detector.detect_faces(im1)
all_pred_vals.append(face_aus[0])
with open(save_result_dir+'FaceBoxes_bench_results.pkl', 'wb') as fp:
pickle.dump((all_imgs, all_pred_vals), fp)
100%|██████████| 3226/3226 [02:32<00:00, 21.11it/s]
facebox_normal = print_ap_scores(result_fp=save_result_dir+'FaceBoxes_bench_results.pkl')
Processing easy: 100%|██████████| 61/61 [00:11<00:00, 5.33it/s]
Processing medium: 100%|██████████| 61/61 [00:11<00:00, 5.33it/s]
Processing hard: 100%|██████████| 61/61 [00:11<00:00, 5.31it/s]
==================== Results ====================
Easy Val AP: 0.5368750176845414
Medium Val AP: 0.34812514764839486
Hard Val AP: 0.14662014664396028
=================================================
2. Test of MTCNN#
detector = Detector(face_model='mtcnn',emotion_model='resmasknet', landmark_model='mobilefacenet', au_model='xgb', device='cpu')
all_pred_vals = []
for img in tqdm(all_imgs):
im1 = Image.open(img)
face_aus = detector.detect_faces(im1)
all_pred_vals.append(face_aus[0])
with open(save_result_dir+'MTCNN_bench_results.pkl', 'wb') as fp:
pickle.dump((all_imgs, all_pred_vals), fp)
/home/tiankang/anaconda3/envs/py39/lib/python3.9/site-packages/torchvision/models/_utils.py:135: UserWarning: Using 'backbone_name' as positional parameter(s) is deprecated since 0.13 and may be removed in the future. Please use keyword parameter(s) instead.
warnings.warn(
/home/tiankang/anaconda3/envs/py39/lib/python3.9/site-packages/torchvision/models/_utils.py:223: UserWarning: Arguments other than a weight enum or `None` for 'weights' are deprecated since 0.13 and may be removed in the future. The current behavior is equivalent to passing `weights=ResNet18_Weights.IMAGENET1K_V1`. You can also use `weights=ResNet18_Weights.DEFAULT` to get the most up-to-date weights.
warnings.warn(msg)
100%|██████████| 3226/3226 [07:13<00:00, 7.44it/s]
mtcnn_normal = print_ap_scores(result_fp=save_result_dir+'MTCNN_bench_results.pkl')
Processing easy: 100%|██████████| 61/61 [00:16<00:00, 3.77it/s]
Processing medium: 100%|██████████| 61/61 [00:16<00:00, 3.75it/s]
Processing hard: 100%|██████████| 61/61 [00:16<00:00, 3.75it/s]
==================== Results ====================
Easy Val AP: 0.7248933447919402
Medium Val AP: 0.7175922904388756
Hard Val AP: 0.47326227608164284
=================================================
3. Test of RetinaFace#
detector = Detector(face_model='retinaface',emotion_model='resmasknet', landmark_model='mobilefacenet', au_model='xgb', device='cpu')
all_pred_vals = []
for img in tqdm(all_imgs):
im1 = Image.open(img)
face_aus = detector.detect_faces(im1)
all_pred_vals.append(face_aus[0])
with open(save_result_dir+'RetinaFace_bench_results.pkl', 'wb') as fp:
pickle.dump((all_imgs, all_pred_vals), fp)
/home/tiankang/anaconda3/envs/py39/lib/python3.9/site-packages/torchvision/models/_utils.py:135: UserWarning: Using 'backbone_name' as positional parameter(s) is deprecated since 0.13 and may be removed in the future. Please use keyword parameter(s) instead.
warnings.warn(
/home/tiankang/anaconda3/envs/py39/lib/python3.9/site-packages/torchvision/models/_utils.py:223: UserWarning: Arguments other than a weight enum or `None` for 'weights' are deprecated since 0.13 and may be removed in the future. The current behavior is equivalent to passing `weights=ResNet18_Weights.IMAGENET1K_V1`. You can also use `weights=ResNet18_Weights.DEFAULT` to get the most up-to-date weights.
warnings.warn(msg)
6%|▌ | 178/3226 [00:22<05:13, 9.72it/s]
retinaface_normal = print_ap_scores(result_fp=save_result_dir+'RetinaFace_bench_results.pkl')
4. Test of Img2Pose unconstrained#
Img2Pose models are heavy both in architecture and in number of hyperparameters. We advise using different parameter combinations for different settings, especially for the constrained vs. unconstrained models#
from feat.facepose_detectors.img2pose.img2pose_test import Img2Pose
from torch.utils.data import Dataset, DataLoader
class GenericImageDataset(Dataset):
"""Face Landmarks dataset."""
def __init__(self, file_paths, transform=None):
"""
Args:
file_paths (list): List of paths to the images.
transform (callable, optional): Optional transform to be applied
on a sample.
"""
self.filePaths = file_paths
self.transform = transform
def __len__(self):
return len(self.filePaths)
def __getitem__(self, idx):
img = Image.open(self.filePaths[idx])
if self.transform:
img = self.transform(img)
return img
imclassifier = Img2Pose(constrained=False, detection_threshold=0.25, rpn_pre_nms_top_n_test=6000, rpn_post_nms_top_n_test=1000)
img_trans = transforms.Compose([
transforms.ToTensor()
])
dataset = GenericImageDataset(all_imgs, transform=img_trans)
dataloader = DataLoader(dataset, batch_size=1, shuffle=False, num_workers=1)
all_pred_vals = []
for i_batch, sample_batched in enumerate(tqdm(dataloader)):
preds = imclassifier(sample_batched)
all_pred_vals.append(preds[0][0]) # Append Face Bounding Box
# Save Result
with open(save_result_dir+'Img2poseuncon_bench_results.pkl', 'wb') as fp:
pickle.dump((all_imgs, all_pred_vals), fp)
100%|██████████| 3226/3226 [01:48<00:00, 29.82it/s]
img2pose_uncon_normal = print_ap_scores(result_fp=save_result_dir+'Img2poseuncon_bench_results.pkl')
Processing easy: 100%|██████████| 61/61 [00:19<00:00, 3.18it/s]
Processing medium: 100%|██████████| 61/61 [00:19<00:00, 3.20it/s]
Processing hard: 100%|██████████| 61/61 [00:18<00:00, 3.21it/s]
==================== Results ====================
Easy Val AP: 0.8563027227316843
Medium Val AP: 0.8136765059086696
Hard Val AP: 0.5739961257745989
=================================================
5. Test of Img2Pose constrained#
imclassifier = Img2Pose(constrained=True, detection_threshold=0.25, rpn_pre_nms_top_n_test=2000, rpn_post_nms_top_n_test=200)
img_trans = transforms.Compose([
transforms.ToTensor()
])
dataset = GenericImageDataset(all_imgs, transform=img_trans)
dataloader = DataLoader(dataset, batch_size=1, shuffle=False, num_workers=1)
all_pred_vals = []
for i_batch, sample_batched in enumerate(tqdm(dataloader)):
preds = imclassifier(sample_batched)
all_pred_vals.append(preds[0][0]) # Append Face Bounding Box
# Save Result
with open(save_result_dir+'Img2posecon_bench_results.pkl', 'wb') as fp:
pickle.dump((all_imgs, all_pred_vals), fp)
100%|██████████| 3226/3226 [01:13<00:00, 43.72it/s]
img2pose_con_normal = print_ap_scores(result_fp=save_result_dir+'Img2posecon_bench_results.pkl')
Processing easy: 100%|██████████| 61/61 [00:16<00:00, 3.60it/s]
Processing medium: 100%|██████████| 61/61 [00:17<00:00, 3.56it/s]
Processing hard: 100%|██████████| 61/61 [00:17<00:00, 3.58it/s]
==================== Results ====================
Easy Val AP: 0.6470359115773076
Medium Val AP: 0.5878560483278932
Hard Val AP: 0.32415904798673495
=================================================