# pigo/examples/talk_detector/talkdet.py

from ctypes import *

import subprocess
import numpy as np
import os
import cv2
import time

# Build the Go detector as a C shared library and load it through ctypes.
# The command is passed as an argument list (no shell involved) and a failed
# build raises here instead of surfacing later as a LoadLibrary error.
subprocess.check_call(
    ['go', 'build', '-o', 'talkdet.so', '-buildmode=c-shared', 'talkdet.go'])
pigo = cdll.LoadLibrary('./talkdet.so')
try:
    # Remove the build artifact once the library has been loaded.
    os.remove('talkdet.so')
except OSError:
    # Best-effort cleanup: a file that cannot be removed must not stop the demo.
    pass

# Number of rows in the result buffer read back from FindFaces.
MAX_NDETS = 2024
# Number of integer values stored in each row of that buffer.
ARRAY_DIM = 6

# A mouth reading below this value is treated as an open mouth.
MOUTH_AR_THRESH = 0.2
# Closed-mouth readings tolerated in a row before the talking state is cleared.
MOUTH_AR_CONSEC_FRAMES = 5

def verify_alpha_channel(frame):
    """Return *frame* with an alpha channel, converting BGR input to BGRA.

    Args:
        frame: image array laid out as (height, width, channels).

    Returns:
        The same object when it already carries four channels, otherwise the
        result of ``cv2.cvtColor(frame, cv2.COLOR_BGR2BGRA)``.
    """
    # The channel count lives at shape index 2; index 3 does not exist for a
    # (height, width, channels) array, so it cannot be used to detect alpha.
    if frame.ndim == 3 and frame.shape[2] == 4:
        return frame
    return cv2.cvtColor(frame, cv2.COLOR_BGR2BGRA)

def alpha_blend(frame_1, frame_2, mask):
    """Mix two frames element-wise, weighted by *mask*.

    The mask is divided by 255 to obtain a weight ``w``; the result is
    ``frame_1 * (1 - w) + frame_2 * w`` passed through
    ``cv2.convertScaleAbs``.
    """
    weight = mask / 255.0
    first_part = frame_1 * (1 - weight)
    second_part = frame_2 * weight
    return cv2.convertScaleAbs(first_part + second_part)

def apply_circle_focus_blur(frame, x, y, dim, blur):
    """Blur *frame* everywhere except a soft-edged disc centred on (x, y).

    Args:
        frame: BGR image (a frame that already has four channels is accepted
            as far as ``verify_alpha_channel`` allows).
        x, y: centre of the disc that stays sharp.
        dim: size from which the disc radius is derived (``dim / 1.5``).
        blur: side length of the square box-filter kernel.

    Returns:
        The blended image converted back with ``cv2.COLOR_BGRA2BGR``.
    """
    frame = verify_alpha_channel(frame)
    height, width, _ = frame.shape

    # White disc on a black four-channel canvas marks the region to keep.
    mask = np.zeros((height, width, 4), dtype='uint8')
    cv2.circle(mask, (int(x), int(y)), int(dim/1.5),
               (255, 255, 255), -1, cv2.LINE_AA)

    kernel = (blur, blur)
    # borderType has to be given by keyword: the third positional parameter
    # of cv2.blur is the output array (dst), not the border mode.
    mask = cv2.blur(mask, kernel, borderType=cv2.BORDER_DEFAULT)
    blured = cv2.blur(frame, kernel, borderType=cv2.BORDER_DEFAULT)

    # Inverting the mask makes the blurred copy dominate outside the disc.
    blended = alpha_blend(frame, blured, 255-mask)
    return cv2.cvtColor(blended, cv2.COLOR_BGRA2BGR)

class GoPixelSlice(Structure):
    """ctypes structure used to hand a pixel buffer to the Go library.

    Maps to the C type ``struct { void *data; GoInt len; GoInt cap; }``.
    """

    _fields_ = [
        ("pixels", POINTER(c_ubyte)),  # void *data
        ("len", c_longlong),           # GoInt len
        ("cap", c_longlong),           # GoInt cap
    ]

# Obtain the camera pixels and transfer them to Go through ctypes
def process_frame(pixs):
    """Run the exported Go ``FindFaces`` function on one frame.

    Args:
        pixs: flat sequence of 8-bit pixel values.  A contiguous
            ``numpy.uint8`` array is used as-is; anything else is converted
            to one first.

    Returns:
        A list of 1-D integer arrays holding ``ARRAY_DIM`` values each (the
        caller unpacks them as row, col, scale, q, det_type, mouth_ar), or
        ``None`` when the library returns a null pointer.
    """
    # Hand the array's own memory to Go instead of unpacking every pixel
    # into a freshly built ctypes array.
    buf = np.ascontiguousarray(pixs, dtype=np.uint8)
    pixels = buf.ctypes.data_as(POINTER(c_ubyte))

    faces = GoPixelSlice(pixels, buf.size, buf.size)
    pigo.FindFaces.argtypes = [GoPixelSlice]
    pigo.FindFaces.restype = c_void_p

    # Call the exported FindFaces function from Go.
    address = pigo.FindFaces(faces)
    if not address:
        return None

    # View the returned memory as a MAX_NDETS x ARRAY_DIM table of integers.
    buffarr = ((c_longlong * ARRAY_DIM) * MAX_NDETS).from_address(address)
    res = np.ndarray(buffer=buffarr, dtype=c_longlong,
                     shape=(MAX_NDETS, ARRAY_DIM,))

    # The first value of the buffer array represents the number of detections.
    dets_len = int(res[0][0])

    # Rows reported per detected face.  The code has always multiplied by 19,
    # while the earlier comment spoke of 18 detection points (face, pupils
    # and facial landmark points) -- TODO confirm against talkdet.go.
    rows_per_face = 19
    # Never read past the table, and treat a negative count as "nothing".
    count = max(0, min(dets_len * rows_per_face, MAX_NDETS - 1))

    # Skip the header row and copy only the rows in use, so the result does
    # not alias the memory owned by the Go side.
    return list(res[1:1 + count].copy())

# Initialize the camera.
cap = cv2.VideoCapture(0)
if not cap.isOpened():
    # Without a device every later read fails; stop with a readable message
    # rather than with an indexing error on a missing frame.
    raise SystemExit('talkdet: unable to open camera 0')
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)

# Changing the camera resolution introduces a short delay in the camera
# initialization.  For this reason the detection process is delayed by a
# fraction of a second.
time.sleep(0.4)

# Overlay toggles; the main loop flips them on the 'w', 'e' and 'r' keys.
showFaceDet = False
showPupil = True
showLandmarkPoints = True

# Talk-detection state carried from one reading to the next.
counter = 0      # closed-mouth readings seen since the mouth was last open
talking = False  # True while the subject is considered to be talking

# Main loop: grab a frame, run the detector, draw the overlays and react to
# key presses.  The finally clause releases the camera and closes the windows
# even when the loop is left through an exception.
try:
    while True:
        ret, frame = cap.read()
        if not ret or frame is None:
            # A failed grab yields no image; leave the loop instead of
            # crashing on the indexing below.
            print('talkdet: failed to read a frame from the camera')
            break

        # Only channel index 1 of the frame is handed to the detector;
        # flatten() returns a contiguous one-dimensional copy.
        pixs = frame[:, :, 1].flatten()

        # Verify that the camera is initialized by checking that the pixel
        # array is not all zeros.
        if np.any(pixs):
            dets = process_frame(pixs)  # pixs needs to be numpy.uint8 array

            if dets is not None:
                face_col, face_row, face_dim = 0, 0, 0
                # The detected faces occupy the first positions of the
                # result, ahead of the rows that refer to them.
                for row, col, scale, q, det_type, mouth_ar in dets:
                    if q <= 50:
                        continue  # skip low-quality rows

                    if det_type == 0:  # 0 == face;
                        face_col, face_row, face_dim = col, row, scale
                        if showFaceDet:
                            # OpenCV needs integer pixel coordinates.
                            half = int(scale) // 2
                            cv2.rectangle(
                                frame,
                                (int(col) - half, int(row) - half),
                                (int(col) + half, int(row) + half),
                                (0, 0, 255), 2)
                    elif det_type == 1:  # 1 == pupil;
                        if showPupil:
                            cv2.circle(frame, (int(col), int(row)), 4, (0, 0, 255), -1, 8, 0)
                    elif det_type == 2:  # 2 == facial landmark;
                        if showLandmarkPoints:
                            cv2.circle(frame, (int(col), int(row)), 4, (0, 255, 0), -1, 8, 0)
                    elif det_type == 3:
                        # NOTE(review): the rows are read as 64-bit integers,
                        # so comparing with 0.2 amounts to mouth_ar <= 0 --
                        # confirm the scaling used by talkdet.go.
                        if mouth_ar < MOUTH_AR_THRESH:  # mouth is open
                            talking = True
                            counter = 0
                        else:  # mouth is closed
                            if counter < MOUTH_AR_CONSEC_FRAMES:
                                counter += 1
                            else:
                                talking = False
                                counter = 0

                        if talking and counter < MOUTH_AR_CONSEC_FRAMES:
                            frame = apply_circle_focus_blur(frame, face_col, face_row, face_dim, 25)
                            cv2.putText(frame, "Bla bla bla...", (10, 30),
                                        cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2)

        cv2.imshow('', frame)

        # Mask once to the low byte before comparing with the key codes.
        key = cv2.waitKey(1) & 0xFF
        if key == ord('q'):
            break
        elif key == ord('w'):
            showFaceDet = not showFaceDet
        elif key == ord('e'):
            showPupil = not showPupil
        elif key == ord('r'):
            showLandmarkPoints = not showLandmarkPoints
finally:
    cap.release()
    cv2.destroyAllWindows()