Compare commits

...

3 Commits

Author SHA1 Message Date
Blake Blackshear
8507bbbb31 make object processor resilient to plasma failures 2020-03-13 16:35:58 -05:00
Blake Blackshear
b6fcb88e5c remove sharedarray references 2020-03-13 15:50:27 -05:00
Blake Blackshear
d3cd4afa65 handle various scenarios with external process failures 2020-03-09 21:12:19 -05:00
6 changed files with 148 additions and 55 deletions

View File

@@ -25,7 +25,6 @@ RUN apt -qq update && apt -qq install --no-install-recommends -y \
imutils \
scipy \
&& python3.7 -m pip install -U \
SharedArray \
Flask \
paho-mqtt \
PyYAML \

View File

@@ -1,4 +1,7 @@
import os
import sys
import traceback
import signal
import cv2
import time
import datetime
@@ -58,14 +61,23 @@ GLOBAL_OBJECT_CONFIG = CONFIG.get('objects', {})
WEB_PORT = CONFIG.get('web_port', 5000)
DEBUG = (CONFIG.get('debug', '0') == '1')
def start_plasma_store():
plasma_cmd = ['plasma_store', '-m', '400000000', '-s', '/tmp/plasma']
plasma_process = sp.Popen(plasma_cmd, stdout=sp.DEVNULL)
time.sleep(1)
rc = plasma_process.poll()
if rc is not None:
return None
return plasma_process
class CameraWatchdog(threading.Thread):
def __init__(self, camera_processes, config, tflite_process, tracked_objects_queue, object_processor):
def __init__(self, camera_processes, config, tflite_process, tracked_objects_queue, plasma_process):
threading.Thread.__init__(self)
self.camera_processes = camera_processes
self.config = config
self.tflite_process = tflite_process
self.tracked_objects_queue = tracked_objects_queue
self.object_processor = object_processor
self.plasma_process = plasma_process
def run(self):
time.sleep(10)
@@ -73,12 +85,25 @@ class CameraWatchdog(threading.Thread):
# wait a bit before checking
time.sleep(30)
# check the plasma process
rc = self.plasma_process.poll()
if rc != None:
print(f"plasma_process exited unexpectedly with {rc}")
self.plasma_process = start_plasma_store()
time.sleep(10)
# check the detection process
if (self.tflite_process.detection_start.value > 0.0 and
datetime.datetime.now().timestamp() - self.tflite_process.detection_start.value > 10):
print("Detection appears to be stuck. Restarting detection process")
self.tflite_process.start_or_restart()
time.sleep(30)
elif not self.tflite_process.detect_process.is_alive():
print("Detection appears to have stopped. Restarting detection process")
self.tflite_process.start_or_restart()
time.sleep(30)
# check the camera processes
for name, camera_process in self.camera_processes.items():
process = camera_process['process']
if not process.is_alive():
@@ -86,14 +111,33 @@ class CameraWatchdog(threading.Thread):
camera_process['fps'].value = float(self.config[name]['fps'])
camera_process['skipped_fps'].value = 0.0
camera_process['detection_fps'].value = 0.0
camera_process['read_start'].value = 0.0
camera_process['ffmpeg_pid'].value = 0
process = mp.Process(target=track_camera, args=(name, self.config[name], FFMPEG_DEFAULT_CONFIG, GLOBAL_OBJECT_CONFIG,
self.tflite_process.detection_queue, self.tracked_objects_queue,
camera_process['fps'], camera_process['skipped_fps'], camera_process['detection_fps']))
camera_process['fps'], camera_process['skipped_fps'], camera_process['detection_fps'],
camera_process['read_start'], camera_process['ffmpeg_pid']))
process.daemon = True
camera_process['process'] = process
process.start()
print(f"Camera_process started for {name}: {process.pid}")
if (camera_process['read_start'].value > 0.0 and
datetime.datetime.now().timestamp() - camera_process['read_start'].value > 10):
print(f"Process for {name} has been reading from ffmpeg for over 10 seconds long. Killing ffmpeg...")
ffmpeg_pid = camera_process['ffmpeg_pid'].value
if ffmpeg_pid != 0:
try:
os.kill(ffmpeg_pid, signal.SIGTERM)
except OSError:
print(f"Unable to terminate ffmpeg with pid {ffmpeg_pid}")
time.sleep(10)
try:
os.kill(ffmpeg_pid, signal.SIGKILL)
print(f"Unable to kill ffmpeg with pid {ffmpeg_pid}")
except OSError:
pass
def main():
# connect to mqtt and setup last will
def on_connect(client, userdata, flags, rc):
@@ -117,14 +161,7 @@ def main():
client.connect(MQTT_HOST, MQTT_PORT, 60)
client.loop_start()
# start plasma store
plasma_cmd = ['plasma_store', '-m', '400000000', '-s', '/tmp/plasma']
plasma_process = sp.Popen(plasma_cmd, stdout=sp.DEVNULL)
time.sleep(1)
rc = plasma_process.poll()
if rc is not None:
raise RuntimeError("plasma_store exited unexpectedly with "
"code %d" % (rc,))
plasma_process = start_plasma_store()
##
# Setup config defaults for cameras
@@ -135,7 +172,7 @@ def main():
}
# Queue for cameras to push tracked objects to
tracked_objects_queue = mp.Queue()
tracked_objects_queue = mp.SimpleQueue()
# Start the shared tflite process
tflite_process = EdgeTPUProcess()
@@ -146,11 +183,14 @@ def main():
camera_processes[name] = {
'fps': mp.Value('d', float(config['fps'])),
'skipped_fps': mp.Value('d', 0.0),
'detection_fps': mp.Value('d', 0.0)
'detection_fps': mp.Value('d', 0.0),
'read_start': mp.Value('d', 0.0),
'ffmpeg_pid': mp.Value('i', 0)
}
camera_process = mp.Process(target=track_camera, args=(name, config, FFMPEG_DEFAULT_CONFIG, GLOBAL_OBJECT_CONFIG,
tflite_process.detection_queue, tracked_objects_queue,
camera_processes[name]['fps'], camera_processes[name]['skipped_fps'], camera_processes[name]['detection_fps']))
tflite_process.detection_queue, tracked_objects_queue, camera_processes[name]['fps'],
camera_processes[name]['skipped_fps'], camera_processes[name]['detection_fps'],
camera_processes[name]['read_start'], camera_processes[name]['ffmpeg_pid']))
camera_process.daemon = True
camera_processes[name]['process'] = camera_process
@@ -161,7 +201,7 @@ def main():
object_processor = TrackedObjectProcessor(CONFIG['cameras'], client, MQTT_TOPIC_PREFIX, tracked_objects_queue)
object_processor.start()
camera_watchdog = CameraWatchdog(camera_processes, CONFIG['cameras'], tflite_process, tracked_objects_queue, object_processor)
camera_watchdog = CameraWatchdog(camera_processes, CONFIG['cameras'], tflite_process, tracked_objects_queue, plasma_process)
camera_watchdog.start()
# create a flask app that encodes frames a mjpeg on demand
@@ -174,6 +214,23 @@ def main():
# return a healh
return "Frigate is running. Alive and healthy!"
@app.route('/debug/stack')
def processor_stack():
frame = sys._current_frames().get(object_processor.ident, None)
if frame:
return "<br>".join(traceback.format_stack(frame)), 200
else:
return "no frame found", 200
@app.route('/debug/print_stack')
def print_stack():
pid = int(request.args.get('pid', 0))
if pid == 0:
return "missing pid", 200
else:
os.kill(pid, signal.SIGUSR1)
return "check logs", 200
@app.route('/debug/stats')
def stats():
stats = {}
@@ -185,21 +242,22 @@ def main():
stats[name] = {
'fps': round(camera_stats['fps'].value, 2),
'skipped_fps': round(camera_stats['skipped_fps'].value, 2),
'detection_fps': round(camera_stats['detection_fps'].value, 2)
'detection_fps': round(camera_stats['detection_fps'].value, 2),
'read_start': camera_stats['read_start'].value,
'pid': camera_stats['process'].pid,
'ffmpeg_pid': camera_stats['ffmpeg_pid'].value
}
stats['coral'] = {
'fps': round(total_detection_fps, 2),
'inference_speed': round(tflite_process.avg_inference_speed.value*1000, 2),
'detection_queue': tflite_process.detection_queue.qsize(),
'detection_start': tflite_process.detection_start.value
'detection_start': tflite_process.detection_start.value,
'pid': tflite_process.detect_process.pid
}
rc = plasma_process.poll()
rc = camera_watchdog.plasma_process.poll()
stats['plasma_store_rc'] = rc
stats['tracked_objects_queue'] = tracked_objects_queue.qsize()
return jsonify(stats)
@app.route('/<camera_name>/<label>/best.jpg')

View File

@@ -3,11 +3,10 @@ import datetime
import hashlib
import multiprocessing as mp
import numpy as np
import SharedArray as sa
import pyarrow.plasma as plasma
import tflite_runtime.interpreter as tflite
from tflite_runtime.interpreter import load_delegate
from frigate.util import EventsPerSecond
from frigate.util import EventsPerSecond, listen
def load_labels(path, encoding='utf-8'):
"""Loads labels from file (with or without index numbers).
@@ -64,6 +63,7 @@ class ObjectDetector():
def run_detector(detection_queue, avg_speed, start):
print(f"Starting detection process: {os.getpid()}")
listen()
plasma_client = plasma.connect("/tmp/plasma")
object_detector = ObjectDetector()
@@ -87,7 +87,7 @@ def run_detector(detection_queue, avg_speed, start):
class EdgeTPUProcess():
def __init__(self):
self.detection_queue = mp.Queue()
self.detection_queue = mp.SimpleQueue()
self.avg_inference_speed = mp.Value('d', 0.01)
self.detection_start = mp.Value('d', 0.0)
self.detect_process = None

View File

@@ -1,6 +1,7 @@
import json
import hashlib
import datetime
import time
import copy
import cv2
import threading
@@ -8,7 +9,6 @@ import numpy as np
from collections import Counter, defaultdict
import itertools
import pyarrow.plasma as plasma
import SharedArray as sa
import matplotlib.pyplot as plt
from frigate.util import draw_box_with_label
from frigate.edgetpu import load_labels
@@ -29,7 +29,6 @@ class TrackedObjectProcessor(threading.Thread):
self.client = client
self.topic_prefix = topic_prefix
self.tracked_objects_queue = tracked_objects_queue
self.plasma_client = plasma.connect("/tmp/plasma")
self.camera_data = defaultdict(lambda: {
'best_objects': {},
'object_status': defaultdict(lambda: defaultdict(lambda: 'OFF')),
@@ -47,7 +46,34 @@ class TrackedObjectProcessor(threading.Thread):
def get_current_frame(self, camera):
return self.camera_data[camera]['current_frame']
def connect_plasma_client(self):
while True:
try:
self.plasma_client = plasma.connect("/tmp/plasma")
return
except:
print(f"TrackedObjectProcessor: unable to connect plasma client")
time.sleep(10)
def get_from_plasma(self, object_id):
while True:
try:
return self.plasma_client.get(object_id, timeout_ms=0)
except:
self.connect_plasma_client()
time.sleep(1)
def delete_from_plasma(self, object_ids):
while True:
try:
self.plasma_client.delete(object_ids)
return
except:
self.connect_plasma_client()
time.sleep(1)
def run(self):
self.connect_plasma_client()
while True:
camera, frame_time, tracked_objects = self.tracked_objects_queue.get()
@@ -62,7 +88,7 @@ class TrackedObjectProcessor(threading.Thread):
object_id_hash = hashlib.sha1(str.encode(f"{camera}{frame_time}"))
object_id_bytes = object_id_hash.digest()
object_id = plasma.ObjectID(object_id_bytes)
current_frame = self.plasma_client.get(object_id, timeout_ms=0)
current_frame = self.get_from_plasma(object_id)
if not current_frame is plasma.ObjectNotAvailable:
# draw the bounding boxes on the frame
@@ -93,7 +119,7 @@ class TrackedObjectProcessor(threading.Thread):
# store the object id, so you can delete it at the next loop
previous_object_id = self.camera_data[camera]['object_id']
if not previous_object_id is None:
self.plasma_client.delete([previous_object_id])
self.delete_from_plasma([previous_object_id])
self.camera_data[camera]['object_id'] = object_id
###

View File

@@ -1,4 +1,6 @@
import datetime
import signal
import traceback
import collections
import numpy as np
import cv2
@@ -127,3 +129,9 @@ class EventsPerSecond:
now = datetime.datetime.now().timestamp()
seconds = min(now-self._start, last_n_seconds)
return len([t for t in self._timestamps if t > (now-last_n_seconds)]) / seconds
def print_stack(sig, frame):
traceback.print_stack(frame)
def listen():
signal.signal(signal.SIGUSR1, print_stack)

View File

@@ -10,12 +10,11 @@ import subprocess as sp
import numpy as np
import hashlib
import pyarrow.plasma as plasma
import SharedArray as sa
import copy
import itertools
import json
from collections import defaultdict
from frigate.util import draw_box_with_label, area, calculate_region, clipped, intersection_over_union, intersection, EventsPerSecond
from frigate.util import draw_box_with_label, area, calculate_region, clipped, intersection_over_union, intersection, EventsPerSecond, listen
from frigate.objects import ObjectTracker
from frigate.edgetpu import RemoteObjectDetector
from frigate.motion import MotionDetector
@@ -98,28 +97,33 @@ def create_tensor_input(frame, region):
# Expand dimensions since the model expects images to have shape: [1, 300, 300, 3]
return np.expand_dims(cropped_frame, axis=0)
def start_or_restart_ffmpeg(ffmpeg_cmd, frame_size, ffmpeg_process=None):
def start_or_restart_ffmpeg(ffmpeg_cmd, frame_size, pid, ffmpeg_process=None):
if not ffmpeg_process is None:
print("Terminating the existing ffmpeg process...")
ffmpeg_process.terminate()
try:
print("Waiting for ffmpeg to exit gracefully...")
ffmpeg_process.wait(timeout=30)
ffmpeg_process.communicate(timeout=30)
except sp.TimeoutExpired:
print("FFmpeg didnt exit. Force killing...")
ffmpeg_process.kill()
ffmpeg_process.wait()
ffmpeg_process.communicate()
ffmpeg_process = None
print("Creating ffmpeg process...")
print(" ".join(ffmpeg_cmd))
return sp.Popen(ffmpeg_cmd, stdout = sp.PIPE, bufsize=frame_size*10)
process = sp.Popen(ffmpeg_cmd, stdout = sp.PIPE, bufsize=frame_size*10)
pid.value = process.pid
return process
def track_camera(name, config, ffmpeg_global_config, global_objects_config, detection_queue, detected_objects_queue, fps, skipped_fps, detection_fps):
def track_camera(name, config, ffmpeg_global_config, global_objects_config, detection_queue, detected_objects_queue, fps, skipped_fps, detection_fps, read_start, ffmpeg_pid):
print(f"Starting process for {name}: {os.getpid()}")
listen()
# Merge the ffmpeg config with the global config
ffmpeg = config.get('ffmpeg', {})
ffmpeg_input = get_ffmpeg_input(ffmpeg['input'])
ffmpeg_restart_delay = ffmpeg.get('restart_delay', 0)
ffmpeg_global_args = ffmpeg.get('global_args', ffmpeg_global_config['global_args'])
ffmpeg_hwaccel_args = ffmpeg.get('hwaccel_args', ffmpeg_global_config['hwaccel_args'])
ffmpeg_input_args = ffmpeg.get('input_args', ffmpeg_global_config['input_args'])
@@ -154,12 +158,7 @@ def track_camera(name, config, ffmpeg_global_config, global_objects_config, dete
frame_size = frame_shape[0] * frame_shape[1] * frame_shape[2]
try:
sa.delete(name)
except:
pass
frame = sa.create(name, shape=frame_shape, dtype=np.uint8)
frame = np.zeros(frame_shape, np.uint8)
# load in the mask for object detection
if 'mask' in config:
@@ -176,7 +175,7 @@ def track_camera(name, config, ffmpeg_global_config, global_objects_config, dete
object_tracker = ObjectTracker(10)
ffmpeg_process = start_or_restart_ffmpeg(ffmpeg_cmd, frame_size)
ffmpeg_process = start_or_restart_ffmpeg(ffmpeg_cmd, frame_size, ffmpeg_pid)
plasma_client = plasma.connect("/tmp/plasma")
frame_num = 0
@@ -187,19 +186,22 @@ def track_camera(name, config, ffmpeg_global_config, global_objects_config, dete
skipped_fps_tracker.start()
object_detector.fps.start()
while True:
start = datetime.datetime.now().timestamp()
rc = ffmpeg_process.poll()
if rc != None:
print(f"{name}: ffmpeg_process exited unexpectedly with {rc}")
print(f"Letting {name} rest for {ffmpeg_restart_delay} seconds before restarting...")
time.sleep(ffmpeg_restart_delay)
ffmpeg_process = start_or_restart_ffmpeg(ffmpeg_cmd, frame_size, ffmpeg_pid, ffmpeg_process)
time.sleep(10)
read_start.value = datetime.datetime.now().timestamp()
frame_bytes = ffmpeg_process.stdout.read(frame_size)
duration = datetime.datetime.now().timestamp()-start
duration = datetime.datetime.now().timestamp()-read_start.value
read_start.value = 0.0
avg_wait = (avg_wait*99+duration)/100
if not frame_bytes:
rc = ffmpeg_process.poll()
if rc is not None:
print(f"{name}: ffmpeg_process exited unexpectedly with {rc}")
ffmpeg_process = start_or_restart_ffmpeg(ffmpeg_cmd, frame_size, ffmpeg_process)
time.sleep(10)
else:
print(f"{name}: ffmpeg_process is still running but didnt return any bytes")
if len(frame_bytes) == 0:
print(f"{name}: ffmpeg_process didnt return any bytes")
continue
# limit frame rate