Extracting labels and landmarks

written by Eshin Jolly

This notebook demonstrates how we extracted labels and landmarks from the EmotioNet dataset. You’ll want to repeat this for the DISFA Plus and BP4D datasets prior to training the AU visualization model.

Imports, paths, and helper functions

Make sure to adjust data paths as needed. By default this notebook assumes datasets are in the data/datasets folder at the root of this repository.

from PIL import Image, ImageOps
import math, cv2, csv
from scipy.spatial import ConvexHull
from skimage.morphology.convex_hull import grid_points_in_poly
from feat import Detector
import os, glob, pandas as pd, numpy as np
import matplotlib.pyplot as plt
from skimage import data, exposure
from skimage.feature import hog
from tqdm import tqdm

# Locate the datasets folder: resolve the current working directory (the
# empty path resolves to it), climb two directory levels, then descend into
# data/datasets.
_working_dir = os.path.realpath("")
_two_levels_up = os.path.dirname(os.path.dirname(_working_dir))
data_dir = os.path.join(_two_levels_up, "data", "datasets")

def padding(img, expected_size):
    """Centre ``img`` on a square canvas by adding a border around it.

    Args:
        img: Image object exposing ``size`` as ``(width, height)``; it is
            handed to ``ImageOps.expand`` unchanged.
        expected_size: Target side length in pixels, used for both width
            and height.

    Returns:
        The image returned by ``ImageOps.expand``.
    """
    extra_w = expected_size - img.size[0]
    extra_h = expected_size - img.size[1]

    # Split the extra space as evenly as possible; when the difference is odd
    # the right / bottom side receives the additional pixel.
    left, top = extra_w // 2, extra_h // 2
    border = (left, top, extra_w - left, extra_h - top)
    return ImageOps.expand(img, border)


def resize_with_padding(img, expected_size):
    """Fit ``img`` inside ``expected_size`` and pad it to exactly that size.

    The image is first passed through ``thumbnail`` with the target size and
    the result is then centred on a canvas of ``expected_size`` with
    ``ImageOps.expand``.

    Args:
        img: PIL image. It is not modified; the work is done on a copy.
        expected_size: Two-item sequence whose items are compared with
            ``img.size[0]`` and ``img.size[1]`` respectively, i.e. it uses the
            same ordering as ``img.size``.

    Returns:
        A new padded image of size ``expected_size``.
    """
    # ``thumbnail`` operates in place, so work on a copy to avoid silently
    # shrinking the image object owned by the caller.
    img = img.copy()
    img.thumbnail((expected_size[0], expected_size[1]))

    delta_width = expected_size[0] - img.size[0]
    delta_height = expected_size[1] - img.size[1]

    # Split the extra space as evenly as possible; when the difference is odd
    # the right / bottom side receives the additional pixel.
    pad_width = delta_width // 2
    pad_height = delta_height // 2
    border = (
        pad_width,
        pad_height,
        delta_width - pad_width,
        delta_height - pad_height,
    )
    return ImageOps.expand(img, border)


def align_face_68pts(img, img_land, box_enlarge, img_size=112):
    """Rotate, scale and crop a face so that the eyes are level.

    A similarity transform is built from the two eye centres (rotation) and
    from the bounding box of five anchor points (scale and translation), and
    is applied both to the image and to every landmark.

    Args:
        img: Image array handed to ``cv2.warpAffine``.
        img_land: Flat sequence of landmark coordinates laid out as
            ``[x0, y0, x1, y1, ...]``. Landmarks 36-41 and 42-47 are averaged
            into the left and right eye centres; landmarks 30, 48 and 54
            complete the anchor set (nose tip and mouth corners in the
            68-point scheme).
        box_enlarge: Multiplier applied to the larger side of the anchor
            bounding box; larger values leave more context around the face.
        img_size: Side length in pixels of the square output image.

    Returns:
        Tuple ``(aligned_img, new_land)``: the warped ``img_size`` x
        ``img_size`` image and an ``(n_landmarks, 2)`` integer array with the
        landmarks in the coordinates of the aligned image.

    Raises:
        ValueError: If the two eye centres coincide, in which case the
            rotation is undefined.
    """
    coords = np.asarray(img_land, dtype=float).reshape(-1, 2)

    left_eye = coords[36:42].mean(axis=0)
    right_eye = coords[42:48].mean(axis=0)

    delta_x = right_eye[0] - left_eye[0]
    delta_y = right_eye[1] - left_eye[1]
    eye_dist = math.sqrt(delta_x * delta_x + delta_y * delta_y)
    if eye_dist == 0:
        raise ValueError("eye centres coincide; cannot estimate face rotation")
    sin_val = delta_y / eye_dist
    cos_val = delta_x / eye_dist

    # Rotation (homogeneous coordinates) that makes the eye line horizontal.
    rotation = np.array(
        [[cos_val, sin_val, 0], [-sin_val, cos_val, 0], [0, 0, 1]]
    )

    anchors = np.array(
        [
            [left_eye[0], left_eye[1], 1],
            [right_eye[0], right_eye[1], 1],
            [coords[30, 0], coords[30, 1], 1],
            [coords[48, 0], coords[48, 1], 1],
            [coords[54, 0], coords[54, 1], 1],
        ]
    )
    rotated = (rotation @ anchors.T).T
    xs, ys = rotated[:, 0], rotated[:, 1]

    # Centre and half side length of the (enlarged) square crop, measured in
    # the rotated coordinate frame.
    cx = float(xs.max() + xs.min()) * 0.5
    cy = float(ys.max() + ys.min()) * 0.5
    half_size = 0.5 * box_enlarge * max(
        float(xs.max() - xs.min()), float(ys.max() - ys.min())
    )

    # Map the crop [c - half_size, c + half_size] onto [0, img_size - 1].
    scale = (img_size - 1) / 2.0 / half_size
    scaling = np.array(
        [
            [scale, 0, scale * (half_size - cx)],
            [0, scale, scale * (half_size - cy)],
            [0, 0, 1],
        ]
    )
    transform = scaling @ rotation

    # ``flags`` has to be passed by keyword: the fourth positional parameter
    # of cv2.warpAffine is the output array ``dst``, not the interpolation.
    aligned_img = cv2.warpAffine(
        img,
        transform[0:2, :],
        (img_size, img_size),
        flags=cv2.INTER_LINEAR,
        borderValue=(128, 128, 128),
    )

    # Apply the same transform to every landmark (homogeneous coordinates).
    land_3d = np.ones((len(coords), 3))
    land_3d[:, 0:2] = coords
    new_land = (transform @ land_3d.T).T[:, 0:2].astype(int)
    return aligned_img, new_land


def extract_hog(image, detector):
    """Compute HOG features and aligned landmarks for the first detected face.

    The image is read from disk, the first face reported by ``detector`` is
    located, its landmarks are detected, and the face is aligned with
    ``align_face_68pts`` (``box_enlarge=2.5``, default output size). Pixels
    outside the landmark mask are zeroed before the HOG descriptor is
    computed.

    Args:
        image: Path of the image file, read with ``cv2.imread``.
        detector: Object providing ``detect_faces`` and ``detect_landmarks``
            (a ``feat.Detector`` in this notebook).

    Returns:
        Tuple ``(fd, hog_image, points)``: the HOG feature vector, the HOG
        visualisation image, and the landmarks in aligned-image coordinates
        as returned by ``align_face_68pts``.

    Raises:
        OSError: If the image cannot be read.
    """
    im = cv2.imread(image)
    if im is None:
        # cv2.imread signals a missing or undecodable file by returning None
        # instead of raising; fail here with a clear message.
        raise OSError(f"could not read image: {image}")

    # Only the first detected face is used.
    detected_faces = np.array(detector.detect_faces(im)[0])
    if np.any(detected_faces < 0):
        # A negative value means the face box extends past the image border:
        # pad the image and run face detection again on the padded version.
        # NOTE(review): ``orig_size`` comes from the array shape, while
        # ``resize_with_padding`` compares the target size with PIL's
        # ``img.size`` -- confirm the two use the same axis order for
        # non-square images.
        orig_size = np.array(im).shape
        if np.where(detected_faces < 0)[0][0] == 1:
            new_size = (
                orig_size[0],
                int(orig_size[1] + 2 * abs(detected_faces[detected_faces < 0][0])),
            )
        else:
            new_size = (
                int(orig_size[0] + 2 * abs(detected_faces[detected_faces < 0][0])),
                orig_size[1],
            )
        im = resize_with_padding(Image.fromarray(im), new_size)
        im = np.asarray(im)
        detected_faces = np.array(detector.detect_faces(np.array(im))[0])
    detected_faces = detected_faces.astype(int)
    points = detector.detect_landmarks(np.array(im), [detected_faces])[0].astype(int)

    aligned_img, points = align_face_68pts(im, points.flatten(), 2.5)

    # Mask of the pixels inside the convex hull of the aligned landmarks.
    hull = ConvexHull(points)
    mask = grid_points_in_poly(
        shape=np.array(aligned_img).shape,
        verts=list(
            zip(points[hull.vertices][:, 1], points[hull.vertices][:, 0])
        ),  # verts are given as (row, col), i.e. (y, x)
    )

    # Additionally keep the rectangle spanning from the top image edge down
    # to the higher of landmarks 0 and 16, between their x positions
    # (presumably to retain the forehead region).
    mask[0 : np.min([points[0][1], points[16][1]]), points[0][0] : points[16][0]] = True
    aligned_img[~mask] = 0

    # NOTE(review): ``multichannel`` is deprecated in newer scikit-image
    # releases in favour of ``channel_axis`` -- confirm against the pinned
    # version before upgrading.
    fd, hog_image = hog(
        aligned_img,
        orientations=8,
        pixels_per_cell=(8, 8),
        cells_per_block=(2, 2),
        visualize=True,
        multichannel=True,
    )

    return fd, hog_image, points

Example extraction on EmotioNet

You’ll need to run this for DISFA Plus and BP4D as well in order to train our AU visualization model.

# Dataset folder (under ``data_dir``) and the output files written to the
# current working directory.
# NOTE(review): the folder is spelled "EmotionNet" while the label file below
# is named "EmotioNet_..." -- confirm the folder name on disk.
dataset = "EmotionNet"
labels_filename = "emotionet_labels.csv"
landmarks_filename = "emotionet_landmarks.csv"

detector = Detector(face_model="retinaface", landmark_model="mobilenet")
EmotioNet_images = np.sort(glob.glob(os.path.join(data_dir, dataset, "imgs", "*.jpg")))

labels = pd.read_csv(
    os.path.join(data_dir, dataset, "labels", "EmotioNet_FACS_aws_2020_24600.csv")
)
labels = labels.dropna(axis=0)

# Strip quotes and spaces from the AU column names. Renaming in a single step
# keeps a column whose name is already clean; assigning the cleaned name and
# then dropping the original would delete such a column.
au_renames = {
    col: col.replace("'", "").replace('"', "").replace(" ", "")
    for col in labels.columns
    if "AU" in col
}
labels = labels.rename(columns=au_renames)

# Reduce each URL to its file name so rows can be looked up by image name.
labels = labels.assign(
    URL=labels.URL.apply(lambda x: x.split("/")[-1].replace("'", ""))
)
labels = labels.set_index("URL")
labels = labels.drop(columns=["URL orig"])

# Action units whose labels are exported, in the column order used for the
# labels CSV.
aus_to_train = [
    f"AU{number}"
    for number in (1, 2, 4, 5, 6, 9, 10, 12, 15, 17, 18, 20, 24, 25, 26, 28, 43)
]

# Landmark columns: all x coordinates first (x_0 .. x_67), then all y
# coordinates (y_0 .. y_67).
landmark_cols = [f"{axis}_{i}" for axis in ("x", "y") for i in range(68)]

# Start both output files from scratch, each containing only its header row.
for _out_path, _header in (
    (labels_filename, ["URL"] + aus_to_train),
    (landmarks_filename, landmark_cols),
):
    with open(_out_path, "w", newline="") as csvfile:
        writer = csv.writer(csvfile, delimiter=",")
        writer.writerow(_header)

# Extract landmarks for every image and append one row per image to each CSV.
# Rows are appended image by image so partial results survive an interrupted
# run; an image that fails is reported and skipped.
for image in tqdm(EmotioNet_images):
    try:
        imageURL = os.path.split(image)[-1]
        label = labels.loc[imageURL][aus_to_train]
        _, _, points = extract_hog(image, detector=detector)

        # Build both rows before touching the files so the two CSVs stay
        # aligned row for row.
        label_row = [imageURL] + list(label.values)
        landmark_row = points.T.flatten()  # all x values, then all y values

        with open(labels_filename, "a+", newline="") as csvfile:
            writer = csv.writer(csvfile, delimiter=",")
            writer.writerow(label_row)
        with open(landmarks_filename, "a+", newline="") as csvfile:
            writer = csv.writer(csvfile, delimiter=",")
            writer.writerow(landmark_row)
    except Exception as err:
        # ``Exception`` rather than a bare ``except`` so that Ctrl-C
        # (KeyboardInterrupt) can still stop the loop; the reason is reported
        # to make failures diagnosable.
        print(f"failed {image}: {err!r}")