Compare commits

...

3 Commits

Author SHA1 Message Date
Blake Blackshear
8507bbbb31 make object processor resilient to plasma failures 2020-03-13 16:35:58 -05:00
Blake Blackshear
b6fcb88e5c remove sharedarray references 2020-03-13 15:50:27 -05:00
Blake Blackshear
d3cd4afa65 handle various scenarios with external process failures 2020-03-09 21:12:19 -05:00
6 changed files with 148 additions and 55 deletions

View File

@@ -25,7 +25,6 @@ RUN apt -qq update && apt -qq install --no-install-recommends -y \
imutils \ imutils \
scipy \ scipy \
&& python3.7 -m pip install -U \ && python3.7 -m pip install -U \
SharedArray \
Flask \ Flask \
paho-mqtt \ paho-mqtt \
PyYAML \ PyYAML \

View File

@@ -1,4 +1,7 @@
import os import os
import sys
import traceback
import signal
import cv2 import cv2
import time import time
import datetime import datetime
@@ -58,14 +61,23 @@ GLOBAL_OBJECT_CONFIG = CONFIG.get('objects', {})
WEB_PORT = CONFIG.get('web_port', 5000) WEB_PORT = CONFIG.get('web_port', 5000)
DEBUG = (CONFIG.get('debug', '0') == '1') DEBUG = (CONFIG.get('debug', '0') == '1')
def start_plasma_store():
plasma_cmd = ['plasma_store', '-m', '400000000', '-s', '/tmp/plasma']
plasma_process = sp.Popen(plasma_cmd, stdout=sp.DEVNULL)
time.sleep(1)
rc = plasma_process.poll()
if rc is not None:
return None
return plasma_process
class CameraWatchdog(threading.Thread): class CameraWatchdog(threading.Thread):
def __init__(self, camera_processes, config, tflite_process, tracked_objects_queue, object_processor): def __init__(self, camera_processes, config, tflite_process, tracked_objects_queue, plasma_process):
threading.Thread.__init__(self) threading.Thread.__init__(self)
self.camera_processes = camera_processes self.camera_processes = camera_processes
self.config = config self.config = config
self.tflite_process = tflite_process self.tflite_process = tflite_process
self.tracked_objects_queue = tracked_objects_queue self.tracked_objects_queue = tracked_objects_queue
self.object_processor = object_processor self.plasma_process = plasma_process
def run(self): def run(self):
time.sleep(10) time.sleep(10)
@@ -73,12 +85,25 @@ class CameraWatchdog(threading.Thread):
# wait a bit before checking # wait a bit before checking
time.sleep(30) time.sleep(30)
# check the plasma process
rc = self.plasma_process.poll()
if rc != None:
print(f"plasma_process exited unexpectedly with {rc}")
self.plasma_process = start_plasma_store()
time.sleep(10)
# check the detection process
if (self.tflite_process.detection_start.value > 0.0 and if (self.tflite_process.detection_start.value > 0.0 and
datetime.datetime.now().timestamp() - self.tflite_process.detection_start.value > 10): datetime.datetime.now().timestamp() - self.tflite_process.detection_start.value > 10):
print("Detection appears to be stuck. Restarting detection process") print("Detection appears to be stuck. Restarting detection process")
self.tflite_process.start_or_restart() self.tflite_process.start_or_restart()
time.sleep(30) time.sleep(30)
elif not self.tflite_process.detect_process.is_alive():
print("Detection appears to have stopped. Restarting detection process")
self.tflite_process.start_or_restart()
time.sleep(30)
# check the camera processes
for name, camera_process in self.camera_processes.items(): for name, camera_process in self.camera_processes.items():
process = camera_process['process'] process = camera_process['process']
if not process.is_alive(): if not process.is_alive():
@@ -86,14 +111,33 @@ class CameraWatchdog(threading.Thread):
camera_process['fps'].value = float(self.config[name]['fps']) camera_process['fps'].value = float(self.config[name]['fps'])
camera_process['skipped_fps'].value = 0.0 camera_process['skipped_fps'].value = 0.0
camera_process['detection_fps'].value = 0.0 camera_process['detection_fps'].value = 0.0
camera_process['read_start'].value = 0.0
camera_process['ffmpeg_pid'].value = 0
process = mp.Process(target=track_camera, args=(name, self.config[name], FFMPEG_DEFAULT_CONFIG, GLOBAL_OBJECT_CONFIG, process = mp.Process(target=track_camera, args=(name, self.config[name], FFMPEG_DEFAULT_CONFIG, GLOBAL_OBJECT_CONFIG,
self.tflite_process.detection_queue, self.tracked_objects_queue, self.tflite_process.detection_queue, self.tracked_objects_queue,
camera_process['fps'], camera_process['skipped_fps'], camera_process['detection_fps'])) camera_process['fps'], camera_process['skipped_fps'], camera_process['detection_fps'],
camera_process['read_start'], camera_process['ffmpeg_pid']))
process.daemon = True process.daemon = True
camera_process['process'] = process camera_process['process'] = process
process.start() process.start()
print(f"Camera_process started for {name}: {process.pid}") print(f"Camera_process started for {name}: {process.pid}")
if (camera_process['read_start'].value > 0.0 and
datetime.datetime.now().timestamp() - camera_process['read_start'].value > 10):
print(f"Process for {name} has been reading from ffmpeg for over 10 seconds long. Killing ffmpeg...")
ffmpeg_pid = camera_process['ffmpeg_pid'].value
if ffmpeg_pid != 0:
try:
os.kill(ffmpeg_pid, signal.SIGTERM)
except OSError:
print(f"Unable to terminate ffmpeg with pid {ffmpeg_pid}")
time.sleep(10)
try:
os.kill(ffmpeg_pid, signal.SIGKILL)
print(f"Unable to kill ffmpeg with pid {ffmpeg_pid}")
except OSError:
pass
def main(): def main():
# connect to mqtt and setup last will # connect to mqtt and setup last will
def on_connect(client, userdata, flags, rc): def on_connect(client, userdata, flags, rc):
@@ -117,14 +161,7 @@ def main():
client.connect(MQTT_HOST, MQTT_PORT, 60) client.connect(MQTT_HOST, MQTT_PORT, 60)
client.loop_start() client.loop_start()
# start plasma store plasma_process = start_plasma_store()
plasma_cmd = ['plasma_store', '-m', '400000000', '-s', '/tmp/plasma']
plasma_process = sp.Popen(plasma_cmd, stdout=sp.DEVNULL)
time.sleep(1)
rc = plasma_process.poll()
if rc is not None:
raise RuntimeError("plasma_store exited unexpectedly with "
"code %d" % (rc,))
## ##
# Setup config defaults for cameras # Setup config defaults for cameras
@@ -135,7 +172,7 @@ def main():
} }
# Queue for cameras to push tracked objects to # Queue for cameras to push tracked objects to
tracked_objects_queue = mp.Queue() tracked_objects_queue = mp.SimpleQueue()
# Start the shared tflite process # Start the shared tflite process
tflite_process = EdgeTPUProcess() tflite_process = EdgeTPUProcess()
@@ -146,11 +183,14 @@ def main():
camera_processes[name] = { camera_processes[name] = {
'fps': mp.Value('d', float(config['fps'])), 'fps': mp.Value('d', float(config['fps'])),
'skipped_fps': mp.Value('d', 0.0), 'skipped_fps': mp.Value('d', 0.0),
'detection_fps': mp.Value('d', 0.0) 'detection_fps': mp.Value('d', 0.0),
'read_start': mp.Value('d', 0.0),
'ffmpeg_pid': mp.Value('i', 0)
} }
camera_process = mp.Process(target=track_camera, args=(name, config, FFMPEG_DEFAULT_CONFIG, GLOBAL_OBJECT_CONFIG, camera_process = mp.Process(target=track_camera, args=(name, config, FFMPEG_DEFAULT_CONFIG, GLOBAL_OBJECT_CONFIG,
tflite_process.detection_queue, tracked_objects_queue, tflite_process.detection_queue, tracked_objects_queue, camera_processes[name]['fps'],
camera_processes[name]['fps'], camera_processes[name]['skipped_fps'], camera_processes[name]['detection_fps'])) camera_processes[name]['skipped_fps'], camera_processes[name]['detection_fps'],
camera_processes[name]['read_start'], camera_processes[name]['ffmpeg_pid']))
camera_process.daemon = True camera_process.daemon = True
camera_processes[name]['process'] = camera_process camera_processes[name]['process'] = camera_process
@@ -161,7 +201,7 @@ def main():
object_processor = TrackedObjectProcessor(CONFIG['cameras'], client, MQTT_TOPIC_PREFIX, tracked_objects_queue) object_processor = TrackedObjectProcessor(CONFIG['cameras'], client, MQTT_TOPIC_PREFIX, tracked_objects_queue)
object_processor.start() object_processor.start()
camera_watchdog = CameraWatchdog(camera_processes, CONFIG['cameras'], tflite_process, tracked_objects_queue, object_processor) camera_watchdog = CameraWatchdog(camera_processes, CONFIG['cameras'], tflite_process, tracked_objects_queue, plasma_process)
camera_watchdog.start() camera_watchdog.start()
# create a flask app that encodes frames a mjpeg on demand # create a flask app that encodes frames a mjpeg on demand
@@ -174,6 +214,23 @@ def main():
# return a healh # return a healh
return "Frigate is running. Alive and healthy!" return "Frigate is running. Alive and healthy!"
@app.route('/debug/stack')
def processor_stack():
frame = sys._current_frames().get(object_processor.ident, None)
if frame:
return "<br>".join(traceback.format_stack(frame)), 200
else:
return "no frame found", 200
@app.route('/debug/print_stack')
def print_stack():
pid = int(request.args.get('pid', 0))
if pid == 0:
return "missing pid", 200
else:
os.kill(pid, signal.SIGUSR1)
return "check logs", 200
@app.route('/debug/stats') @app.route('/debug/stats')
def stats(): def stats():
stats = {} stats = {}
@@ -185,21 +242,22 @@ def main():
stats[name] = { stats[name] = {
'fps': round(camera_stats['fps'].value, 2), 'fps': round(camera_stats['fps'].value, 2),
'skipped_fps': round(camera_stats['skipped_fps'].value, 2), 'skipped_fps': round(camera_stats['skipped_fps'].value, 2),
'detection_fps': round(camera_stats['detection_fps'].value, 2) 'detection_fps': round(camera_stats['detection_fps'].value, 2),
'read_start': camera_stats['read_start'].value,
'pid': camera_stats['process'].pid,
'ffmpeg_pid': camera_stats['ffmpeg_pid'].value
} }
stats['coral'] = { stats['coral'] = {
'fps': round(total_detection_fps, 2), 'fps': round(total_detection_fps, 2),
'inference_speed': round(tflite_process.avg_inference_speed.value*1000, 2), 'inference_speed': round(tflite_process.avg_inference_speed.value*1000, 2),
'detection_queue': tflite_process.detection_queue.qsize(), 'detection_start': tflite_process.detection_start.value,
'detection_start': tflite_process.detection_start.value 'pid': tflite_process.detect_process.pid
} }
rc = plasma_process.poll() rc = camera_watchdog.plasma_process.poll()
stats['plasma_store_rc'] = rc stats['plasma_store_rc'] = rc
stats['tracked_objects_queue'] = tracked_objects_queue.qsize()
return jsonify(stats) return jsonify(stats)
@app.route('/<camera_name>/<label>/best.jpg') @app.route('/<camera_name>/<label>/best.jpg')

View File

@@ -3,11 +3,10 @@ import datetime
import hashlib import hashlib
import multiprocessing as mp import multiprocessing as mp
import numpy as np import numpy as np
import SharedArray as sa
import pyarrow.plasma as plasma import pyarrow.plasma as plasma
import tflite_runtime.interpreter as tflite import tflite_runtime.interpreter as tflite
from tflite_runtime.interpreter import load_delegate from tflite_runtime.interpreter import load_delegate
from frigate.util import EventsPerSecond from frigate.util import EventsPerSecond, listen
def load_labels(path, encoding='utf-8'): def load_labels(path, encoding='utf-8'):
"""Loads labels from file (with or without index numbers). """Loads labels from file (with or without index numbers).
@@ -64,6 +63,7 @@ class ObjectDetector():
def run_detector(detection_queue, avg_speed, start): def run_detector(detection_queue, avg_speed, start):
print(f"Starting detection process: {os.getpid()}") print(f"Starting detection process: {os.getpid()}")
listen()
plasma_client = plasma.connect("/tmp/plasma") plasma_client = plasma.connect("/tmp/plasma")
object_detector = ObjectDetector() object_detector = ObjectDetector()
@@ -87,7 +87,7 @@ def run_detector(detection_queue, avg_speed, start):
class EdgeTPUProcess(): class EdgeTPUProcess():
def __init__(self): def __init__(self):
self.detection_queue = mp.Queue() self.detection_queue = mp.SimpleQueue()
self.avg_inference_speed = mp.Value('d', 0.01) self.avg_inference_speed = mp.Value('d', 0.01)
self.detection_start = mp.Value('d', 0.0) self.detection_start = mp.Value('d', 0.0)
self.detect_process = None self.detect_process = None

View File

@@ -1,6 +1,7 @@
import json import json
import hashlib import hashlib
import datetime import datetime
import time
import copy import copy
import cv2 import cv2
import threading import threading
@@ -8,7 +9,6 @@ import numpy as np
from collections import Counter, defaultdict from collections import Counter, defaultdict
import itertools import itertools
import pyarrow.plasma as plasma import pyarrow.plasma as plasma
import SharedArray as sa
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
from frigate.util import draw_box_with_label from frigate.util import draw_box_with_label
from frigate.edgetpu import load_labels from frigate.edgetpu import load_labels
@@ -29,7 +29,6 @@ class TrackedObjectProcessor(threading.Thread):
self.client = client self.client = client
self.topic_prefix = topic_prefix self.topic_prefix = topic_prefix
self.tracked_objects_queue = tracked_objects_queue self.tracked_objects_queue = tracked_objects_queue
self.plasma_client = plasma.connect("/tmp/plasma")
self.camera_data = defaultdict(lambda: { self.camera_data = defaultdict(lambda: {
'best_objects': {}, 'best_objects': {},
'object_status': defaultdict(lambda: defaultdict(lambda: 'OFF')), 'object_status': defaultdict(lambda: defaultdict(lambda: 'OFF')),
@@ -47,7 +46,34 @@ class TrackedObjectProcessor(threading.Thread):
def get_current_frame(self, camera): def get_current_frame(self, camera):
return self.camera_data[camera]['current_frame'] return self.camera_data[camera]['current_frame']
def connect_plasma_client(self):
while True:
try:
self.plasma_client = plasma.connect("/tmp/plasma")
return
except:
print(f"TrackedObjectProcessor: unable to connect plasma client")
time.sleep(10)
def get_from_plasma(self, object_id):
while True:
try:
return self.plasma_client.get(object_id, timeout_ms=0)
except:
self.connect_plasma_client()
time.sleep(1)
def delete_from_plasma(self, object_ids):
while True:
try:
self.plasma_client.delete(object_ids)
return
except:
self.connect_plasma_client()
time.sleep(1)
def run(self): def run(self):
self.connect_plasma_client()
while True: while True:
camera, frame_time, tracked_objects = self.tracked_objects_queue.get() camera, frame_time, tracked_objects = self.tracked_objects_queue.get()
@@ -62,7 +88,7 @@ class TrackedObjectProcessor(threading.Thread):
object_id_hash = hashlib.sha1(str.encode(f"{camera}{frame_time}")) object_id_hash = hashlib.sha1(str.encode(f"{camera}{frame_time}"))
object_id_bytes = object_id_hash.digest() object_id_bytes = object_id_hash.digest()
object_id = plasma.ObjectID(object_id_bytes) object_id = plasma.ObjectID(object_id_bytes)
current_frame = self.plasma_client.get(object_id, timeout_ms=0) current_frame = self.get_from_plasma(object_id)
if not current_frame is plasma.ObjectNotAvailable: if not current_frame is plasma.ObjectNotAvailable:
# draw the bounding boxes on the frame # draw the bounding boxes on the frame
@@ -93,7 +119,7 @@ class TrackedObjectProcessor(threading.Thread):
# store the object id, so you can delete it at the next loop # store the object id, so you can delete it at the next loop
previous_object_id = self.camera_data[camera]['object_id'] previous_object_id = self.camera_data[camera]['object_id']
if not previous_object_id is None: if not previous_object_id is None:
self.plasma_client.delete([previous_object_id]) self.delete_from_plasma([previous_object_id])
self.camera_data[camera]['object_id'] = object_id self.camera_data[camera]['object_id'] = object_id
### ###

View File

@@ -1,4 +1,6 @@
import datetime import datetime
import signal
import traceback
import collections import collections
import numpy as np import numpy as np
import cv2 import cv2
@@ -127,3 +129,9 @@ class EventsPerSecond:
now = datetime.datetime.now().timestamp() now = datetime.datetime.now().timestamp()
seconds = min(now-self._start, last_n_seconds) seconds = min(now-self._start, last_n_seconds)
return len([t for t in self._timestamps if t > (now-last_n_seconds)]) / seconds return len([t for t in self._timestamps if t > (now-last_n_seconds)]) / seconds
def print_stack(sig, frame):
traceback.print_stack(frame)
def listen():
signal.signal(signal.SIGUSR1, print_stack)

View File

@@ -10,12 +10,11 @@ import subprocess as sp
import numpy as np import numpy as np
import hashlib import hashlib
import pyarrow.plasma as plasma import pyarrow.plasma as plasma
import SharedArray as sa
import copy import copy
import itertools import itertools
import json import json
from collections import defaultdict from collections import defaultdict
from frigate.util import draw_box_with_label, area, calculate_region, clipped, intersection_over_union, intersection, EventsPerSecond from frigate.util import draw_box_with_label, area, calculate_region, clipped, intersection_over_union, intersection, EventsPerSecond, listen
from frigate.objects import ObjectTracker from frigate.objects import ObjectTracker
from frigate.edgetpu import RemoteObjectDetector from frigate.edgetpu import RemoteObjectDetector
from frigate.motion import MotionDetector from frigate.motion import MotionDetector
@@ -98,28 +97,33 @@ def create_tensor_input(frame, region):
# Expand dimensions since the model expects images to have shape: [1, 300, 300, 3] # Expand dimensions since the model expects images to have shape: [1, 300, 300, 3]
return np.expand_dims(cropped_frame, axis=0) return np.expand_dims(cropped_frame, axis=0)
def start_or_restart_ffmpeg(ffmpeg_cmd, frame_size, ffmpeg_process=None): def start_or_restart_ffmpeg(ffmpeg_cmd, frame_size, pid, ffmpeg_process=None):
if not ffmpeg_process is None: if not ffmpeg_process is None:
print("Terminating the existing ffmpeg process...") print("Terminating the existing ffmpeg process...")
ffmpeg_process.terminate() ffmpeg_process.terminate()
try: try:
print("Waiting for ffmpeg to exit gracefully...") print("Waiting for ffmpeg to exit gracefully...")
ffmpeg_process.wait(timeout=30) ffmpeg_process.communicate(timeout=30)
except sp.TimeoutExpired: except sp.TimeoutExpired:
print("FFmpeg didnt exit. Force killing...") print("FFmpeg didnt exit. Force killing...")
ffmpeg_process.kill() ffmpeg_process.kill()
ffmpeg_process.wait() ffmpeg_process.communicate()
ffmpeg_process = None
print("Creating ffmpeg process...") print("Creating ffmpeg process...")
print(" ".join(ffmpeg_cmd)) print(" ".join(ffmpeg_cmd))
return sp.Popen(ffmpeg_cmd, stdout = sp.PIPE, bufsize=frame_size*10) process = sp.Popen(ffmpeg_cmd, stdout = sp.PIPE, bufsize=frame_size*10)
pid.value = process.pid
return process
def track_camera(name, config, ffmpeg_global_config, global_objects_config, detection_queue, detected_objects_queue, fps, skipped_fps, detection_fps): def track_camera(name, config, ffmpeg_global_config, global_objects_config, detection_queue, detected_objects_queue, fps, skipped_fps, detection_fps, read_start, ffmpeg_pid):
print(f"Starting process for {name}: {os.getpid()}") print(f"Starting process for {name}: {os.getpid()}")
listen()
# Merge the ffmpeg config with the global config # Merge the ffmpeg config with the global config
ffmpeg = config.get('ffmpeg', {}) ffmpeg = config.get('ffmpeg', {})
ffmpeg_input = get_ffmpeg_input(ffmpeg['input']) ffmpeg_input = get_ffmpeg_input(ffmpeg['input'])
ffmpeg_restart_delay = ffmpeg.get('restart_delay', 0)
ffmpeg_global_args = ffmpeg.get('global_args', ffmpeg_global_config['global_args']) ffmpeg_global_args = ffmpeg.get('global_args', ffmpeg_global_config['global_args'])
ffmpeg_hwaccel_args = ffmpeg.get('hwaccel_args', ffmpeg_global_config['hwaccel_args']) ffmpeg_hwaccel_args = ffmpeg.get('hwaccel_args', ffmpeg_global_config['hwaccel_args'])
ffmpeg_input_args = ffmpeg.get('input_args', ffmpeg_global_config['input_args']) ffmpeg_input_args = ffmpeg.get('input_args', ffmpeg_global_config['input_args'])
@@ -154,12 +158,7 @@ def track_camera(name, config, ffmpeg_global_config, global_objects_config, dete
frame_size = frame_shape[0] * frame_shape[1] * frame_shape[2] frame_size = frame_shape[0] * frame_shape[1] * frame_shape[2]
try: frame = np.zeros(frame_shape, np.uint8)
sa.delete(name)
except:
pass
frame = sa.create(name, shape=frame_shape, dtype=np.uint8)
# load in the mask for object detection # load in the mask for object detection
if 'mask' in config: if 'mask' in config:
@@ -176,7 +175,7 @@ def track_camera(name, config, ffmpeg_global_config, global_objects_config, dete
object_tracker = ObjectTracker(10) object_tracker = ObjectTracker(10)
ffmpeg_process = start_or_restart_ffmpeg(ffmpeg_cmd, frame_size) ffmpeg_process = start_or_restart_ffmpeg(ffmpeg_cmd, frame_size, ffmpeg_pid)
plasma_client = plasma.connect("/tmp/plasma") plasma_client = plasma.connect("/tmp/plasma")
frame_num = 0 frame_num = 0
@@ -187,19 +186,22 @@ def track_camera(name, config, ffmpeg_global_config, global_objects_config, dete
skipped_fps_tracker.start() skipped_fps_tracker.start()
object_detector.fps.start() object_detector.fps.start()
while True: while True:
start = datetime.datetime.now().timestamp() rc = ffmpeg_process.poll()
if rc != None:
print(f"{name}: ffmpeg_process exited unexpectedly with {rc}")
print(f"Letting {name} rest for {ffmpeg_restart_delay} seconds before restarting...")
time.sleep(ffmpeg_restart_delay)
ffmpeg_process = start_or_restart_ffmpeg(ffmpeg_cmd, frame_size, ffmpeg_pid, ffmpeg_process)
time.sleep(10)
read_start.value = datetime.datetime.now().timestamp()
frame_bytes = ffmpeg_process.stdout.read(frame_size) frame_bytes = ffmpeg_process.stdout.read(frame_size)
duration = datetime.datetime.now().timestamp()-start duration = datetime.datetime.now().timestamp()-read_start.value
read_start.value = 0.0
avg_wait = (avg_wait*99+duration)/100 avg_wait = (avg_wait*99+duration)/100
if not frame_bytes: if len(frame_bytes) == 0:
rc = ffmpeg_process.poll() print(f"{name}: ffmpeg_process didnt return any bytes")
if rc is not None:
print(f"{name}: ffmpeg_process exited unexpectedly with {rc}")
ffmpeg_process = start_or_restart_ffmpeg(ffmpeg_cmd, frame_size, ffmpeg_process)
time.sleep(10)
else:
print(f"{name}: ffmpeg_process is still running but didnt return any bytes")
continue continue
# limit frame rate # limit frame rate