mirror of
https://github.com/livepeer/lpms
synced 2025-09-26 19:51:36 +08:00
Revert "Remove scene detection code (#377)"
This reverts commit 98566e26c0.
This commit is contained in:
5
.github/runner/Dockerfile
vendored
5
.github/runner/Dockerfile
vendored
@@ -37,6 +37,11 @@ RUN cd /home/devops && mkdir actions-runner && cd actions-runner \
|
||||
&& curl -O -L https://github.com/actions/runner/releases/download/v${RUNNER_VERSION}/actions-runner-linux-x64-${RUNNER_VERSION}.tar.gz \
|
||||
&& tar xzf ./actions-runner-linux-x64-${RUNNER_VERSION}.tar.gz && chown -R devops ~devops
|
||||
|
||||
RUN LIBTENSORFLOW_VERSION=2.6.3 \
|
||||
&& curl -LO https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-gpu-linux-x86_64-${LIBTENSORFLOW_VERSION}.tar.gz \
|
||||
&& sudo tar -C /usr/local -xzf libtensorflow-gpu-linux-x86_64-${LIBTENSORFLOW_VERSION}.tar.gz \
|
||||
&& sudo ldconfig
|
||||
|
||||
# Add mime type for ts
|
||||
RUN sudo echo '<?xml version="1.0" encoding="UTF-8"?><mime-info xmlns="http://www.freedesktop.org/standards/shared-mime-info"><mime-type type="video/mp2t"><comment>ts</comment><glob pattern="*.ts"/></mime-type></mime-info>'>>/usr/share/mime/packages/custom_mime_type.xml
|
||||
RUN sudo update-mime-database /usr/share/mime
|
||||
|
111
cmd/scenedetection/scenedetection.go
Normal file
111
cmd/scenedetection/scenedetection.go
Normal file
@@ -0,0 +1,111 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/livepeer/lpms/ffmpeg"
|
||||
)
|
||||
|
||||
func validRenditions() []string {
|
||||
valids := make([]string, len(ffmpeg.VideoProfileLookup))
|
||||
for p, _ := range ffmpeg.VideoProfileLookup {
|
||||
valids = append(valids, p)
|
||||
}
|
||||
return valids
|
||||
}
|
||||
|
||||
func main() {
|
||||
if len(os.Args) <= 4 {
|
||||
//0,1 input.mp4 P720p25fps16x9,P720p30fps4x3 nv 0
|
||||
panic("Usage:<dnn init deviceid> <input file> <output renditions, comma separated> <sw/nv>")
|
||||
}
|
||||
str2accel := func(inp string) (ffmpeg.Acceleration, string) {
|
||||
if inp == "nv" {
|
||||
return ffmpeg.Nvidia, "nv"
|
||||
}
|
||||
return ffmpeg.Software, "sw"
|
||||
}
|
||||
str2profs := func(inp string) []ffmpeg.VideoProfile {
|
||||
profs := []ffmpeg.VideoProfile{}
|
||||
strs := strings.Split(inp, ",")
|
||||
for _, k := range strs {
|
||||
p, ok := ffmpeg.VideoProfileLookup[k]
|
||||
if !ok {
|
||||
panic(fmt.Sprintf("Invalid rendition %s. Valid renditions are:\n%s", k, validRenditions()))
|
||||
}
|
||||
profs = append(profs, p)
|
||||
}
|
||||
return profs
|
||||
}
|
||||
deviceid := os.Args[1]
|
||||
fname := os.Args[2]
|
||||
profiles := str2profs(os.Args[3])
|
||||
accel, lbl := str2accel(os.Args[4])
|
||||
|
||||
var dev string
|
||||
if accel == ffmpeg.Nvidia {
|
||||
if len(os.Args) <= 5 {
|
||||
panic("Expected device number")
|
||||
}
|
||||
dev = os.Args[5]
|
||||
}
|
||||
ffmpeg.InitFFmpeg()
|
||||
|
||||
t := time.Now()
|
||||
tc, err := ffmpeg.NewTranscoderWithDetector(&ffmpeg.DSceneAdultSoccer, deviceid)
|
||||
defer tc.StopTranscoder()
|
||||
end := time.Now()
|
||||
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
fmt.Printf("InitFFmpegWithDetectorProfile time %0.4v\n", end.Sub(t).Seconds())
|
||||
|
||||
profs2opts := func(profs []ffmpeg.VideoProfile) []ffmpeg.TranscodeOptions {
|
||||
opts := []ffmpeg.TranscodeOptions{}
|
||||
for i := range profs {
|
||||
o := ffmpeg.TranscodeOptions{
|
||||
Oname: fmt.Sprintf("out_%s_%d_out.mkv", lbl, i),
|
||||
Profile: profs[i],
|
||||
Accel: accel,
|
||||
}
|
||||
opts = append(opts, o)
|
||||
}
|
||||
//add detection profile
|
||||
detectorProfile := ffmpeg.DSceneAdultSoccer
|
||||
detectorProfile.SampleRate = 100
|
||||
o := ffmpeg.TranscodeOptions{
|
||||
Oname: fmt.Sprintf("out_dnn.mkv"),
|
||||
Profile: ffmpeg.P144p30fps16x9,
|
||||
Detector: &detectorProfile,
|
||||
Accel: accel,
|
||||
}
|
||||
opts = append(opts, o)
|
||||
return opts
|
||||
}
|
||||
options := profs2opts(profiles)
|
||||
|
||||
t = time.Now()
|
||||
fmt.Printf("Setting fname %s encoding %d renditions with %v\n", fname, len(options), lbl)
|
||||
res, err := tc.Transcode(&ffmpeg.TranscodeOptionsIn{
|
||||
Fname: fname,
|
||||
Accel: accel,
|
||||
Device: dev,
|
||||
}, options)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
end = time.Now()
|
||||
fmt.Printf("profile=input frames=%v pixels=%v\n", res.Decoded.Frames, res.Decoded.Pixels)
|
||||
for i, r := range res.Encoded {
|
||||
if r.DetectData != nil {
|
||||
fmt.Printf("profile=%v frames=%v pixels=%v detectdata= %v\n", options[i].Profile, r.Frames, r.Pixels, r.DetectData)
|
||||
} else {
|
||||
fmt.Printf("profile=%v frames=%v pixels=%v\n", options[i].Profile, r.Frames, r.Pixels)
|
||||
}
|
||||
}
|
||||
fmt.Printf("Transcoding time %0.4v\n", end.Sub(t).Seconds())
|
||||
}
|
@@ -64,7 +64,9 @@ func main() {
|
||||
o := ffmpeg.TranscodeOptions{
|
||||
Oname: fmt.Sprintf("out_%s_%d_out.mp4", lbl, i),
|
||||
Profile: profs[i],
|
||||
Accel: accel,
|
||||
// Uncomment the following to test scene classifier
|
||||
// Detector: &ffmpeg.DSceneAdultSoccer,
|
||||
Accel: accel,
|
||||
}
|
||||
o.From = *from
|
||||
o.To = *to
|
||||
@@ -97,7 +99,11 @@ func main() {
|
||||
end := time.Now()
|
||||
fmt.Printf("profile=input frames=%v pixels=%v\n", res.Decoded.Frames, res.Decoded.Pixels)
|
||||
for i, r := range res.Encoded {
|
||||
fmt.Printf("profile=%v frames=%v pixels=%v\n", profiles[i].Name, r.Frames, r.Pixels)
|
||||
if r.DetectData != nil {
|
||||
fmt.Printf("profile=%v frames=%v pixels=%v detectdata=%v\n", profiles[i].Name, r.Frames, r.Pixels, r.DetectData)
|
||||
} else {
|
||||
fmt.Printf("profile=%v frames=%v pixels=%v\n", profiles[i].Name, r.Frames, r.Pixels)
|
||||
}
|
||||
}
|
||||
fmt.Printf("Transcoding time %0.4v\n", end.Sub(t).Seconds())
|
||||
}
|
||||
|
@@ -1520,6 +1520,61 @@ func TestTranscoder_CompareVideo(t *testing.T) {
|
||||
compareVideo(t, Software)
|
||||
}
|
||||
|
||||
// detectionFreq segments a test clip into four .ts files, then transcodes
// each segment, attaching a scene-classification output only to the
// even-numbered segments, and checks that detection data is returned for
// exactly those segments. deviceid is forwarded to the detector backend
// ("-1" is used by the software-path test; "0" by the Nvidia test).
func detectionFreq(t *testing.T, accel Acceleration, deviceid string) {
	run, dir := setupTest(t)
	defer os.RemoveAll(dir)
	cmd := `
	# run segmenter and sanity check frame counts . Hardcode for now.
	ffmpeg -loglevel warning -i "$1"/../transcoder/test.ts -c:a copy -c:v copy -f hls test.m3u8
	ffprobe -loglevel warning -select_streams v -count_frames -show_streams test0.ts | grep nb_read_frames=120
	ffprobe -loglevel warning -select_streams v -count_frames -show_streams test1.ts | grep nb_read_frames=120
	ffprobe -loglevel warning -select_streams v -count_frames -show_streams test2.ts | grep nb_read_frames=120
	ffprobe -loglevel warning -select_streams v -count_frames -show_streams test3.ts | grep nb_read_frames=120
	`
	run(cmd)

	InitFFmpeg()
	tc, err := NewTranscoderWithDetector(&DSceneAdultSoccer, deviceid)
	// A nil transcoder here usually means the native model file was not
	// found/loadable; the log hint points at the native-side message.
	require.NotNil(t, tc, "look for `Failed to load native model` logs above")
	if err != nil {
		t.Error(err)
	} else {
		defer tc.StopTranscoder()
		// Test encoding with only seg0 and seg2 under detection
		prof := P144p30fps16x9
		for i := 0; i < 4; i++ {
			in := &TranscodeOptionsIn{
				Fname: fmt.Sprintf("%s/test%d.ts", dir, i),
				Accel: accel,
			}
			// Every segment gets one plain encode output...
			out := []TranscodeOptions{
				{
					Oname:   fmt.Sprintf("%s/out%d.ts", dir, i),
					Profile: prof,
					Accel:   accel,
				},
			}
			// ...and even segments additionally get a detector-only output.
			if i%2 == 0 {
				out = append(out, TranscodeOptions{
					Detector: &DSceneAdultSoccer,
					Accel:    accel,
				})
			}
			res, err := tc.Transcode(in, out)
			if err != nil {
				t.Error(err)
			}
			// Detection data must be present on the second output of the
			// even segments; its absence means the detector did not run.
			if i%2 == 0 && (len(res.Encoded) < 2 || res.Encoded[1].DetectData == nil) {
				t.Error("No detect data returned for detection profile")
			}
		}
	}
}
|
||||
|
||||
// TestTranscoder_DetectionFreq exercises detectionFreq on the software
// (CPU) path, passing device id "-1" — presumably "no GPU"; confirm
// against the detector backend's device handling.
func TestTranscoder_DetectionFreq(t *testing.T) {
	detectionFreq(t, Software, "-1")
}
|
||||
|
||||
func discontinuityAudioSegment(t *testing.T, accel Acceleration) {
|
||||
run, dir := setupTest(t)
|
||||
defer os.RemoveAll(dir)
|
||||
|
69
ffmpeg/detector.go
Normal file
69
ffmpeg/detector.go
Normal file
@@ -0,0 +1,69 @@
|
||||
package ffmpeg
|
||||
|
||||
// DetectorType enumerates the kinds of content detectors LPMS supports.
type DetectorType int

const (
	// SceneClassification runs a DNN scene classifier over sampled frames.
	// Fix: the constant is now explicitly typed DetectorType (it was a
	// bare untyped iota), so Type() results compare type-safely.
	SceneClassification DetectorType = iota
	// Example for future:
	// ObjectDetection
)

// DetectorProfile describes the configuration of a content detector.
type DetectorProfile interface {
	Type() DetectorType
}

// DetectorClass identifies one class a detector can report.
type DetectorClass struct {
	ID   int    // unique ID within LPMS per class
	Name string // unique Name within LPMS per class
}

// SceneClassificationProfile configures a scene-classification model:
// where to load it, its input/output tensor names, how often to sample
// frames, and the classes it reports.
type SceneClassificationProfile struct {
	SampleRate uint   // classify every SampleRate-th frame
	ModelPath  string // path to the serialized model (.pb)
	Input      string // model input tensor name
	Output     string // model output tensor name
	Classes    []DetectorClass
}

// Type implements DetectorProfile.
func (p *SceneClassificationProfile) Type() DetectorType {
	return SceneClassification
}

// Built-in scene-classification profiles.
var (
	DSceneAdultSoccer = SceneClassificationProfile{
		SampleRate: 30,
		ModelPath:  "tasmodel.pb",
		Input:      "input_1",
		Output:     "Identity",
		Classes:    []DetectorClass{{ID: 0, Name: "adult"}, {ID: 1, Name: "soccer"}},
	}
	DSceneViolence = SceneClassificationProfile{
		SampleRate: 30,
		ModelPath:  "tviomodel.pb",
		Input:      "input_1",
		Output:     "reshape_3/Reshape",
		Classes:    []DetectorClass{{ID: 2, Name: "violence"}},
	}
)

// SceneClassificationProfileLookup maps a class name to the profile that
// reports that class.
var SceneClassificationProfileLookup = map[string]SceneClassificationProfile{
	"adult":    DSceneAdultSoccer,
	"soccer":   DSceneAdultSoccer,
	"violence": DSceneViolence,
}

// DetectorClassIDLookup maps a class name to its LPMS-wide class ID,
// matching the IDs declared in the profiles above.
var DetectorClassIDLookup = map[string]int{
	"adult":    0,
	"soccer":   1,
	"violence": 2,
}

// DetectData is the result payload produced by a detector.
type DetectData interface {
	Type() DetectorType
}

// SceneClassificationData maps a class ID to its (averaged) probability.
type SceneClassificationData map[int]float64

// Type implements DetectData.
func (scd SceneClassificationData) Type() DetectorType {
	return SceneClassification
}
|
@@ -222,6 +222,11 @@ int open_output(struct output_ctx *octx, struct input_ctx *ictx)
|
||||
|
||||
// add video encoder if a decoder exists and this output requires one
|
||||
if (ictx->vc && needs_decoder(octx->video->name)) {
|
||||
if (octx->dnn_filtergraph && !ictx->vc->hw_frames_ctx) {
|
||||
// swap filtergraph with the pre-initialized DNN filtergraph for SW
|
||||
// for HW we handle it later during filter re-init
|
||||
octx->vf.graph = *octx->dnn_filtergraph;
|
||||
}
|
||||
ret = init_video_filters(ictx, octx);
|
||||
if (ret < 0) LPMS_ERR(open_output_err, "Unable to open video filter");
|
||||
|
||||
@@ -430,6 +435,32 @@ int mux(AVPacket *pkt, AVRational tb, struct output_ctx *octx, AVStream *ost)
|
||||
return av_interleaved_write_frame(octx->oc, pkt);
|
||||
}
|
||||
|
||||
// Accumulate per-class probabilities from the lvpdnn filter metadata
// attached to frame inf into octx->res->probs, and count the frame.
// Returns -1 when inf is NULL (callers use this to detect flushing),
// 0 otherwise.
static int getmetadatainf(AVFrame *inf, struct output_ctx *octx)
{
  if(inf == NULL) return -1;
  char classinfo[128] = {0,};
  AVDictionaryEntry *element = NULL;
  AVDictionary *metadata = inf->metadata;

  if(metadata != NULL) {
    element = av_dict_get(metadata, LVPDNN_FILTER_META, element, 0);
    if(element != NULL && element->value != NULL) {
      // Fix: bound the copy. The metadata value comes from the filter and
      // its length is not under our control; the original strcpy could
      // overflow the 128-byte stack buffer.
      strncpy(classinfo, element->value, sizeof(classinfo) - 1);
      classinfo[sizeof(classinfo) - 1] = '\0';
      if(strlen(classinfo) > 0) {
        char * token = strtok(classinfo, ",");
        int cid = 0;
        // Fix: clamp cid so a malformed value with too many comma-separated
        // fields cannot write past probs[MAX_CLASSIFY_SIZE].
        while( token != NULL && cid < MAX_CLASSIFY_SIZE ) {
          octx->res->probs[cid] += atof(token);
          token = strtok(NULL, ",");
          cid++;
        }
        octx->res->frames++;
      }
    }
  }
  return 0;
}
|
||||
|
||||
static int calc_signature(AVFrame *inf, struct output_ctx *octx)
|
||||
{
|
||||
int ret = 0;
|
||||
@@ -521,11 +552,19 @@ int process_out(struct input_ctx *ictx, struct output_ctx *octx, AVCodecContext
|
||||
octx->next_kf_pts = frame->pts + octx->gop_pts_len;
|
||||
}
|
||||
|
||||
if(octx->is_dnn_profile) {
|
||||
ret = getmetadatainf(frame, octx);
|
||||
if(ret == -1 && frame == NULL) {
|
||||
// Return EOF in case of flushing procedure
|
||||
ret = AVERROR_EOF;
|
||||
}
|
||||
} else {
|
||||
if(is_video && frame != NULL && octx->sfilters != NULL) {
|
||||
ret = calc_signature(frame, octx);
|
||||
if(ret < 0) LPMS_WARN("Could not calculate signature value for frame");
|
||||
}
|
||||
ret = encode(encoder, frame, octx, ost);
|
||||
}
|
||||
skip:
|
||||
av_frame_unref(frame);
|
||||
// For HW we keep the encoder open so will only get EAGAIN.
|
||||
|
@@ -39,6 +39,7 @@ var ErrTranscoderPrf = errors.New("TranscoderUnrecognizedProfile")
|
||||
var ErrTranscoderGOP = errors.New("TranscoderInvalidGOP")
|
||||
var ErrTranscoderDev = errors.New("TranscoderIncompatibleDevices")
|
||||
var ErrEmptyData = errors.New("EmptyData")
|
||||
var ErrDNNInitialize = errors.New("DetectorInitializationError")
|
||||
var ErrSignCompare = errors.New("InvalidSignData")
|
||||
var ErrTranscoderPixelformat = errors.New("TranscoderInvalidPixelformat")
|
||||
var ErrVideoCompare = errors.New("InvalidVideoData")
|
||||
@@ -102,6 +103,7 @@ type TranscodeOptionsIn struct {
|
||||
type TranscodeOptions struct {
|
||||
Oname string
|
||||
Profile VideoProfile
|
||||
Detector DetectorProfile
|
||||
Accel Acceleration
|
||||
Device string
|
||||
CalcSign bool
|
||||
@@ -114,8 +116,9 @@ type TranscodeOptions struct {
|
||||
}
|
||||
|
||||
type MediaInfo struct {
|
||||
Frames int
|
||||
Pixels int64
|
||||
Frames int
|
||||
Pixels int64
|
||||
DetectData DetectData
|
||||
}
|
||||
|
||||
type TranscodeResults struct {
|
||||
@@ -618,6 +621,14 @@ func createCOutputParams(input *TranscodeOptionsIn, ps []TranscodeOptions) ([]C.
|
||||
params := make([]C.output_params, len(ps))
|
||||
finalizer := func() { destroyCOutputParams(params) }
|
||||
for i, p := range ps {
|
||||
if p.Detector != nil {
|
||||
// We don't do any encoding for detector profiles
|
||||
// Adding placeholder values to pass checks for these everywhere
|
||||
p.Oname = "/dev/null"
|
||||
p.Profile = P144p30fps16x9
|
||||
p.Muxer = ComponentOptions{Name: "mpegts"}
|
||||
}
|
||||
|
||||
param := p.Profile
|
||||
w, h, err := VideoProfileResolution(param)
|
||||
if err != nil {
|
||||
@@ -667,7 +678,18 @@ func createCOutputParams(input *TranscodeOptionsIn, ps []TranscodeOptions) ([]C.
|
||||
filters += fmt.Sprintf(",fps=%d/%d", param.Framerate, param.FramerateDen)
|
||||
fps = C.AVRational{num: C.int(param.Framerate), den: C.int(param.FramerateDen)}
|
||||
}
|
||||
|
||||
// if has a detector profile, ignore all video options
|
||||
if p.Detector != nil {
|
||||
switch p.Detector.Type() {
|
||||
case SceneClassification:
|
||||
detectorProfile := p.Detector.(*SceneClassificationProfile)
|
||||
// Set samplerate using select filter to prevent unnecessary HW->SW copying
|
||||
filters = fmt.Sprintf("select='not(mod(n\\,%v))'", detectorProfile.SampleRate)
|
||||
if input.Accel != Software {
|
||||
filters += ",hwdownload,format=nv12"
|
||||
}
|
||||
}
|
||||
}
|
||||
// Set video encoder options
|
||||
// TODO understand how h264 profiles and GOP setting works for
|
||||
// NETINT encoder, and make sure we change relevant things here
|
||||
@@ -787,13 +809,17 @@ func createCOutputParams(input *TranscodeOptionsIn, ps []TranscodeOptions) ([]C.
|
||||
fromMs := int(p.From.Milliseconds())
|
||||
toMs := int(p.To.Milliseconds())
|
||||
vfilt := C.CString(filters)
|
||||
isDNN := C.int(0)
|
||||
if p.Detector != nil {
|
||||
isDNN = C.int(1)
|
||||
}
|
||||
oname := C.CString(p.Oname)
|
||||
xcoderOutParams := C.CString(xcoderOutParamsStr)
|
||||
params[i] = C.output_params{fname: oname, fps: fps,
|
||||
w: C.int(w), h: C.int(h), bitrate: C.int(bitrate),
|
||||
gop_time: C.int(gopMs), from: C.int(fromMs), to: C.int(toMs),
|
||||
muxer: muxOpts, audio: audioOpts, video: vidOpts,
|
||||
vfilters: vfilt, sfilters: nil, xcoderParams: xcoderOutParams}
|
||||
vfilters: vfilt, sfilters: nil, is_dnn: isDNN, xcoderParams: xcoderOutParams}
|
||||
if p.CalcSign {
|
||||
//signfilter string
|
||||
escapedOname := ffmpegStrEscape(p.Oname)
|
||||
@@ -996,6 +1022,18 @@ func (t *Transcoder) Transcode(input *TranscodeOptionsIn, ps []TranscodeOptions)
|
||||
Frames: int(r.frames),
|
||||
Pixels: int64(r.pixels),
|
||||
}
|
||||
// add detect result
|
||||
if ps[i].Detector != nil {
|
||||
switch ps[i].Detector.Type() {
|
||||
case SceneClassification:
|
||||
detector := ps[i].Detector.(*SceneClassificationProfile)
|
||||
res := make(SceneClassificationData)
|
||||
for j, class := range detector.Classes {
|
||||
res[class.ID] = float64(r.probs[j])
|
||||
}
|
||||
tr[i].DetectData = res
|
||||
}
|
||||
}
|
||||
}
|
||||
dec := MediaInfo{
|
||||
Frames: int(decoded.frames),
|
||||
@@ -1050,6 +1088,32 @@ func InitFFmpeg() {
|
||||
InitFFmpegWithLogLevel(FFLogWarning)
|
||||
}
|
||||
|
||||
// NewTranscoderWithDetector allocates a Transcoder whose native DNN
// filtergraph is pre-built from the given detector profile, using the
// backend configuration for deviceid. It returns ErrDNNInitialize when
// the detector type is unsupported or the native side fails to create
// the transcode handle (e.g. the model cannot be loaded).
func NewTranscoderWithDetector(detector DetectorProfile, deviceid string) (*Transcoder, error) {
	switch detector.Type() {
	case SceneClassification:
		detectorProfile := detector.(*SceneClassificationProfile)
		backendConfigs := createBackendConfig(deviceid)
		// C strings are heap-allocated by CString and must be freed here;
		// NOTE(review): this assumes the native side copies what it needs
		// during lpms_transcode_new_with_dnn — confirm against
		// create_dnn_filtergraph.
		dnnOpt := &C.lvpdnn_opts{
			modelpath:       C.CString(detectorProfile.ModelPath),
			inputname:       C.CString(detectorProfile.Input),
			outputname:      C.CString(detectorProfile.Output),
			backend_configs: C.CString(backendConfigs),
		}
		defer C.free(unsafe.Pointer(dnnOpt.modelpath))
		defer C.free(unsafe.Pointer(dnnOpt.inputname))
		defer C.free(unsafe.Pointer(dnnOpt.outputname))
		defer C.free(unsafe.Pointer(dnnOpt.backend_configs))
		handle := C.lpms_transcode_new_with_dnn(dnnOpt)
		if handle != nil {
			return &Transcoder{
				handle: handle,
				mu:     &sync.Mutex{},
			}, nil
		}
	}
	// Reached when the detector type is unknown or the native handle is nil.
	return nil, ErrDNNInitialize
}
|
||||
|
||||
func createBackendConfig(deviceid string) string {
|
||||
configProto := &pb.ConfigProto{GpuOptions: &pb.GPUOptions{AllowGrowth: true}}
|
||||
bytes, err := proto.Marshal(configProto)
|
||||
|
@@ -106,6 +106,20 @@ int init_video_filters(struct input_ctx *ictx, struct output_ctx *octx)
|
||||
ret = filtergraph_parser(vf, filters_descr, &inputs, &outputs);
|
||||
if (ret < 0) LPMS_ERR(vf_init_cleanup, "Unable to parse video filters desc");
|
||||
|
||||
if (octx->is_dnn_profile && vf->graph == *octx->dnn_filtergraph) {
|
||||
// Try to find DNN filter in the pre-initialized graph
|
||||
AVFilterContext *dnn_filter = avfilter_graph_get_filter(vf->graph, "livepeer_dnn");
|
||||
if (!dnn_filter) {
|
||||
ret = AVERROR_FILTER_NOT_FOUND;
|
||||
LPMS_ERR(vf_init_cleanup, "Unable to find DNN filter inside filtergraph");
|
||||
}
|
||||
// Place DNN filter in correct position, i.e. just before the sink
|
||||
assert(vf->sink_ctx->nb_inputs == 1);
|
||||
ret = avfilter_insert_filter(vf->sink_ctx->inputs[0], dnn_filter, 0, 0);
|
||||
// Take ownership of the filtergraph from the thread/output_ctx
|
||||
*octx->dnn_filtergraph = NULL;
|
||||
}
|
||||
|
||||
ret = avfilter_graph_config(vf->graph, NULL);
|
||||
if (ret < 0) LPMS_ERR(vf_init_cleanup, "Unable configure video filtergraph");
|
||||
|
||||
@@ -279,6 +293,10 @@ int filtergraph_write(AVFrame *inf, struct input_ctx *ictx, struct output_ctx *o
|
||||
if (is_video && inf && inf->hw_frames_ctx && filter->hwframes &&
|
||||
inf->hw_frames_ctx->data != filter->hwframes) {
|
||||
free_filter(&octx->vf); // XXX really should flush filter first
|
||||
if (octx->dnn_filtergraph) {
|
||||
// swap filtergraph with the pre-initialized DNN filtergraph
|
||||
octx->vf.graph = *octx->dnn_filtergraph;
|
||||
}
|
||||
ret = init_video_filters(ictx, octx);
|
||||
if (ret < 0) return lpms_ERR_FILTERS;
|
||||
}
|
||||
|
@@ -68,6 +68,9 @@ struct output_ctx {
|
||||
int64_t clip_from, clip_to, clip_from_pts, clip_to_pts, clip_started, clip_start_pts, clip_start_pts_found; // for clipping
|
||||
int64_t clip_audio_from_pts, clip_audio_to_pts, clip_audio_start_pts, clip_audio_start_pts_found; // for clipping
|
||||
|
||||
AVFilterGraph **dnn_filtergraph;
|
||||
int is_dnn_profile; //if not dnn profile: 0
|
||||
|
||||
output_results *res; // data to return for this output
|
||||
char *xcoderParams;
|
||||
};
|
||||
|
@@ -728,6 +728,10 @@ func TestNvidia_CompareVideo(t *testing.T) {
|
||||
compareVideo(t, Nvidia)
|
||||
}
|
||||
|
||||
// TestNvidia_DetectionFreq exercises detectionFreq on the Nvidia
// (GPU-accelerated) path, using GPU device "0".
func TestNvidia_DetectionFreq(t *testing.T) {
	detectionFreq(t, Nvidia, "0")
}
|
||||
|
||||
func portraitTest(t *testing.T, input string, checkResults bool, profiles []VideoProfile) error {
|
||||
wd, err := os.Getwd()
|
||||
require.NoError(t, err)
|
||||
|
@@ -76,6 +76,8 @@ struct transcode_thread {
|
||||
struct input_ctx ictx;
|
||||
struct output_ctx outputs[MAX_OUTPUT_SIZE];
|
||||
|
||||
AVFilterGraph *dnn_filtergraph;
|
||||
|
||||
int nb_outputs;
|
||||
};
|
||||
|
||||
@@ -213,6 +215,10 @@ int transcode_init(struct transcode_thread *h, input_params *inp,
|
||||
octx->vfilters = params[i].vfilters;
|
||||
octx->sfilters = params[i].sfilters;
|
||||
octx->xcoderParams = params[i].xcoderParams;
|
||||
if (params[i].is_dnn && h->dnn_filtergraph != NULL) {
|
||||
octx->is_dnn_profile = params[i].is_dnn;
|
||||
octx->dnn_filtergraph = &h->dnn_filtergraph;
|
||||
}
|
||||
if (params[i].bitrate) octx->bitrate = params[i].bitrate;
|
||||
if (params[i].fps.den) octx->fps = params[i].fps;
|
||||
if (params[i].gop_time) octx->gop_time = params[i].gop_time;
|
||||
@@ -569,10 +575,16 @@ int flush_all_outputs(struct transcode_thread *h)
|
||||
// just flush muxer, but do not write trailer and close
|
||||
av_interleaved_write_frame(h->outputs[i].oc, NULL);
|
||||
} else {
|
||||
if(h->outputs[i].is_dnn_profile == 0) {
|
||||
// this will flush video and audio streams, flush muxer, write trailer
|
||||
// and close
|
||||
ret = flush_outputs(ictx, h->outputs + i);
|
||||
if (ret < 0) LPMS_ERR_RETURN("Unable to fully flush outputs")
|
||||
} else if(h->outputs[i].is_dnn_profile && h->outputs[i].res->frames > 0) {
|
||||
for (int j = 0; j < MAX_CLASSIFY_SIZE; j++) {
|
||||
h->outputs[i].res->probs[j] = h->outputs[i].res->probs[j] / h->outputs[i].res->frames;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -863,8 +875,15 @@ whileloop_end:
|
||||
|
||||
// flush outputs
|
||||
for (int i = 0; i < nb_outputs; i++) {
|
||||
if(outputs[i].is_dnn_profile == 0/* && outputs[i].has_output > 0*/) {
|
||||
ret = flush_outputs(ictx, &outputs[i]);
|
||||
if (ret < 0) LPMS_ERR(transcode_cleanup, "Unable to fully flush outputs")
|
||||
}
|
||||
else if(outputs[i].is_dnn_profile && outputs[i].res->frames > 0) {
|
||||
for (int j = 0; j < MAX_CLASSIFY_SIZE; j++) {
|
||||
outputs[i].res->probs[j] = outputs[i].res->probs[j] / outputs[i].res->frames;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
transcode_cleanup:
|
||||
@@ -910,6 +929,7 @@ int lpms_transcode(input_params *inp, output_params *params,
|
||||
if (h->nb_outputs != nb_outputs) {
|
||||
#define MAX(x, y) (((x) > (y)) ? (x) : (y))
|
||||
#define MIN(x, y) (((x) < (y)) ? (x) : (y))
|
||||
bool only_detector_diff = true;
|
||||
// MA: we have a problem here. Consider first configuration with 1 output,
|
||||
// and second one with 2 outputs. When transcode_thread was created
|
||||
// (in lpms_transcode_new) all the outputs were cleared with zeros. Then,
|
||||
@@ -924,7 +944,15 @@ int lpms_transcode(input_params *inp, output_params *params,
|
||||
// approach doesn't work if the "new" configuration has more outputs than
|
||||
// old one, even if "added" outputs are actually dnn outputs.
|
||||
// make sure only detection related outputs are changed
|
||||
for (int i = MIN(nb_outputs, h->nb_outputs); i < MAX(nb_outputs, h->nb_outputs); i++) {
|
||||
if (!h->outputs[i].is_dnn_profile)
|
||||
only_detector_diff = false;
|
||||
}
|
||||
if (only_detector_diff) {
|
||||
h->nb_outputs = nb_outputs;
|
||||
} else {
|
||||
return lpms_ERR_OUTPUTS;
|
||||
}
|
||||
#undef MAX
|
||||
#undef MIN
|
||||
}
|
||||
@@ -975,9 +1003,66 @@ void lpms_transcode_stop(struct transcode_thread *handle) {
|
||||
free_output(&handle->outputs[i]);
|
||||
}
|
||||
|
||||
if (handle->dnn_filtergraph) avfilter_graph_free(&handle->dnn_filtergraph);
|
||||
|
||||
free(handle);
|
||||
}
|
||||
|
||||
// Build a standalone filtergraph containing a single "livepeer_dnn"
// filter configured from dnn_opts (model path, input/output tensor names,
// backend configs). Returns the configured graph on success, or NULL on
// failure; LPMS_ERR logs and jumps to create_dnn_error, which frees the
// partially-built graph (the filter context is owned by the graph and is
// freed with it).
static AVFilterGraph * create_dnn_filtergraph(lvpdnn_opts *dnn_opts)
{
  const AVFilter *filter = NULL;
  AVFilterContext *filter_ctx = NULL;
  AVFilterGraph *graph_ctx = NULL;
  int ret = 0;
  char errstr[1024];
  char *filter_name = "livepeer_dnn";
  char filter_args[512];
  snprintf(filter_args, sizeof filter_args, "model=%s:input=%s:output=%s:backend_configs=%s",
      dnn_opts->modelpath, dnn_opts->inputname, dnn_opts->outputname, dnn_opts->backend_configs);

  /* allocate graph */
  graph_ctx = avfilter_graph_alloc();
  if (!graph_ctx)
    LPMS_ERR(create_dnn_error, "Unable to open DNN filtergraph");

  /* get a corresponding filter and open it */
  if (!(filter = avfilter_get_by_name(filter_name))) {
    snprintf(errstr, sizeof errstr, "Unrecognized filter with name '%s'\n", filter_name);
    LPMS_ERR(create_dnn_error, errstr);
  }

  /* open filter and add it to the graph */
  if (!(filter_ctx = avfilter_graph_alloc_filter(graph_ctx, filter, filter_name))) {
    snprintf(errstr, sizeof errstr, "Impossible to open filter with name '%s'\n", filter_name);
    LPMS_ERR(create_dnn_error, errstr);
  }
  if (avfilter_init_str(filter_ctx, filter_args) < 0) {
    snprintf(errstr, sizeof errstr, "Impossible to init filter '%s' with arguments '%s'\n", filter_name, filter_args);
    LPMS_ERR(create_dnn_error, errstr);
  }

  return graph_ctx;

create_dnn_error:
  avfilter_graph_free(&graph_ctx);
  return NULL;
}
|
||||
|
||||
// Allocate a transcode_thread whose DNN filtergraph is pre-built from
// dnn_opts. Returns NULL when allocation or graph creation fails; on
// graph-creation failure the thread struct is released before returning.
struct transcode_thread* lpms_transcode_new_with_dnn(lvpdnn_opts *dnn_opts)
{
  struct transcode_thread *h = malloc(sizeof (struct transcode_thread));
  if (!h) return NULL;
  memset(h, 0, sizeof *h);

  AVFilterGraph *graph = create_dnn_filtergraph(dnn_opts);
  if (!graph) {
    free(h);
    return NULL;
  }
  h->dnn_filtergraph = graph;
  return h;
}
|
||||
|
||||
void lpms_transcode_discontinuity(struct transcode_thread *handle) {
|
||||
if (!handle)
|
||||
return;
|
||||
|
@@ -31,6 +31,7 @@ typedef struct {
|
||||
char *sfilters;
|
||||
int w, h, bitrate, gop_time, from, to;
|
||||
AVRational fps;
|
||||
int is_dnn;
|
||||
char *xcoderParams;
|
||||
component_opts muxer;
|
||||
component_opts audio;
|
||||
@@ -58,11 +59,22 @@ typedef struct {
|
||||
} input_params;
|
||||
|
||||
#define MAX_CLASSIFY_SIZE 10
|
||||
#define LVPDNN_FILTER_NAME "lvpdnn"
|
||||
#define LVPDNN_FILTER_META "lavfi.lvpdnn.text"
|
||||
#define MAX_OUTPUT_SIZE 10
|
||||
|
||||
typedef struct {
|
||||
char *modelpath;
|
||||
char *inputname;
|
||||
char *outputname;
|
||||
char *backend_configs;
|
||||
} lvpdnn_opts;
|
||||
|
||||
typedef struct {
|
||||
int frames;
|
||||
int64_t pixels;
|
||||
//for scene classification
|
||||
float probs[MAX_CLASSIFY_SIZE];//probability
|
||||
} output_results;
|
||||
|
||||
enum LPMSLogLevel {
|
||||
@@ -81,6 +93,7 @@ void lpms_init(enum LPMSLogLevel max_level);
|
||||
int lpms_transcode(input_params *inp, output_params *params, output_results *results, int nb_outputs, output_results *decoded_results, int use_new);
|
||||
int lpms_transcode_reopen_demux(input_params *inp);
|
||||
struct transcode_thread* lpms_transcode_new();
|
||||
struct transcode_thread* lpms_transcode_new_with_dnn(lvpdnn_opts *dnn_opts);
|
||||
void lpms_transcode_stop(struct transcode_thread* handle);
|
||||
void lpms_transcode_discontinuity(struct transcode_thread *handle);
|
||||
|
||||
|
Reference in New Issue
Block a user