mirror of
https://github.com/livepeer/lpms
synced 2025-09-26 19:51:36 +08:00
Use ffmpeg parser for H.264 (#431)
Fixes a number of things, including an LPMS crash, choppy video quality, green screens during rotation, and frame counts that were inconsistent with software decoding. We also apparently gained GPU support for MPEG2 decoding along the way. This is a massive change: with the ffmpeg hwaccel API we can no longer add outputs up front, so we have to wait until we receive a decoded video frame before adding outputs. That in turn means properly queuing up audio and draining everything in the same order.
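The core of the change is that ordering: buffer demuxed items until the first decoded video frame lets us open outputs, then drain the backlog in arrival order. A minimal, self-contained sketch of that pattern using libavutil's AVFifo directly (illustrative only, not LPMS code; build with `cc demo.c $(pkg-config --cflags --libs libavutil)`):

```c
#include <stdio.h>
#include <libavutil/fifo.h>

typedef struct { int seq; int is_video; } item;

int main(void)
{
    // Auto-growing FIFO of fixed-size items, as in ffmpeg/queue.c below
    AVFifo *q = av_fifo_alloc2(4, sizeof(item), AV_FIFO_FLAG_AUTO_GROW);
    if (!q) return 1;

    int outputs_ready = 0;
    for (int seq = 0; seq < 6; seq++) {
        item it = { seq, seq == 3 }; // pretend item 3 is the first decoded video frame
        if (!outputs_ready) {
            av_fifo_write(q, &it, 1);           // buffer until outputs can be opened
            if (it.is_video) outputs_ready = 1; // first video frame: "open outputs" here
            continue;
        }
        item out;
        while (av_fifo_read(q, &out, 1) >= 0)   // drain the backlog in arrival order
            printf("process queued item %d (video=%d)\n", out.seq, out.is_video);
        printf("process live item %d (video=%d)\n", it.seq, it.is_video);
    }
    av_fifo_freep2(&q);
    return 0;
}
```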
BIN data/bad-cuvid.ts (new binary file, not shown)
BIN data/broken-h264-parser.ts (new binary file, not shown)
@@ -188,38 +188,17 @@ enum AVPixelFormat hw2pixfmt(AVCodecContext *ctx)
   return AV_PIX_FMT_NONE;
 }
 
 /**
  * Callback to negotiate the pixel format for AVCodecContext.
  */
-static enum AVPixelFormat get_hw_pixfmt(AVCodecContext *vc, const enum AVPixelFormat *pix_fmts)
+static enum AVPixelFormat get_hw_format(AVCodecContext *ctx,
+                                        const enum AVPixelFormat *pix_fmts)
 {
-  AVHWFramesContext *frames;
-  int ret = 0;
+  const enum AVPixelFormat *p;
+  const enum AVPixelFormat hw_pix_fmt = hw2pixfmt(ctx);
 
-  // XXX Ideally this would be auto initialized by the HW device ctx
-  // However the initialization doesn't occur in time to set up filters
-  // So we do it here. Also see avcodec_get_hw_frames_parameters
-  av_buffer_unref(&vc->hw_frames_ctx);
-  vc->hw_frames_ctx = av_hwframe_ctx_alloc(vc->hw_device_ctx);
-  if (!vc->hw_frames_ctx) LPMS_ERR(pixfmt_cleanup, "Unable to allocate hwframe context for decoding");
+  for (p = pix_fmts; *p != -1; p++) {
+    if (*p == hw_pix_fmt) return *p;
+  }
 
-  frames = (AVHWFramesContext*)vc->hw_frames_ctx->data;
-  frames->format = hw2pixfmt(vc);
-  frames->sw_format = vc->sw_pix_fmt;
-  frames->width = vc->width;
-  frames->height = vc->height;
-
-  // May want to allocate extra HW frames if we encounter samples where
-  // the defaults are insufficient. Raising this increases GPU memory usage
-  // For now, the defaults seem OK.
-  //vc->extra_hw_frames = 16 + 1; // H.264 max refs
-
-  ret = av_hwframe_ctx_init(vc->hw_frames_ctx);
-  if (AVERROR(ENOSYS) == ret) ret = lpms_ERR_INPUT_PIXFMT; // most likely
-  if (ret < 0) LPMS_ERR(pixfmt_cleanup, "Unable to initialize a hardware frame pool");
-  return frames->format;
-
-pixfmt_cleanup:
+  fprintf(stderr, "Failed to get HW surface format.\n");
   return AV_PIX_FMT_NONE;
 }
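For context, the new callback follows FFmpeg's standard hwaccel negotiation pattern: the decoder calls `get_format` with the formats it can produce and the callback picks the hardware surface format, leaving the frame pool to libavcodec instead of the manual av_hwframe_ctx_alloc/init dance deleted above. A hedged sketch of how such a callback is typically wired up (function names here are illustrative, not from this repo):

```c
#include <libavcodec/avcodec.h>
#include <libavutil/hwcontext.h>

static enum AVPixelFormat pick_cuda_format(AVCodecContext *ctx,
                                           const enum AVPixelFormat *pix_fmts)
{
    // FFmpeg passes a terminated list of candidate formats; accept the
    // CUDA hw surface if it is offered, otherwise fail the negotiation.
    for (const enum AVPixelFormat *p = pix_fmts; *p != AV_PIX_FMT_NONE; p++)
        if (*p == AV_PIX_FMT_CUDA) return *p;
    return AV_PIX_FMT_NONE;
}

int open_hw_decoder(AVCodecContext *vc, const AVCodec *codec)
{
    // Create the device context once; libavcodec sizes and initializes
    // the hw frame pool on its own when decoding starts.
    int ret = av_hwdevice_ctx_create(&vc->hw_device_ctx,
                                     AV_HWDEVICE_TYPE_CUDA, NULL, NULL, 0);
    if (ret < 0) return ret;
    vc->get_format = pick_cuda_format; // invoked during open / first decode
    return avcodec_open2(vc, codec, NULL);
}
```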
@@ -253,38 +232,6 @@ open_audio_err:
   return ret;
 }
 
-char* get_hw_decoder(int ff_codec_id, int hw_type)
-{
-  switch (hw_type) {
-    case AV_HWDEVICE_TYPE_CUDA:
-      switch (ff_codec_id) {
-        case AV_CODEC_ID_H264:
-          return "h264_cuvid";
-        case AV_CODEC_ID_HEVC:
-          return "hevc_cuvid";
-        case AV_CODEC_ID_VP8:
-          return "vp8_cuvid";
-        case AV_CODEC_ID_VP9:
-          return "vp9_cuvid";
-        default:
-          return "";
-      }
-    case AV_HWDEVICE_TYPE_MEDIACODEC:
-      switch (ff_codec_id) {
-        case AV_CODEC_ID_H264:
-          return "h264_ni_dec";
-        case AV_CODEC_ID_HEVC:
-          return "h265_ni_dec";
-        case AV_CODEC_ID_VP8:
-          return "";
-        case AV_CODEC_ID_VP9:
-          return "";
-        default:
-          return "";
-      }
-  }
-}
-
 int open_video_decoder(input_params *params, struct input_ctx *ctx)
 {
   int ret = 0;
@@ -298,14 +245,6 @@ int open_video_decoder(input_params *params, struct input_ctx *ctx)
     LPMS_WARN("No video stream found in input");
   } else {
     if (params->hw_type > AV_HWDEVICE_TYPE_NONE) {
-      char* decoder_name = get_hw_decoder(codec->id, params->hw_type);
-      if (!*decoder_name) {
-        ret = lpms_ERR_INPUT_CODEC;
-        LPMS_ERR(open_decoder_err, "Input codec does not support hardware acceleration");
-      }
-      const AVCodec *c = avcodec_find_decoder_by_name(decoder_name);
-      if (c) codec = c;
-      else LPMS_WARN("Nvidia decoder not found; defaulting to software");
       if (AV_PIX_FMT_YUV420P != ic->streams[ctx->vi]->codecpar->format &&
           AV_PIX_FMT_YUVJ420P != ic->streams[ctx->vi]->codecpar->format) {
         // TODO check whether the color range is truncated if yuvj420p is used
@@ -330,13 +269,19 @@ int open_video_decoder(input_params *params, struct input_ctx *ctx)
       ret = av_hwdevice_ctx_create(&ctx->hw_device_ctx, params->hw_type, params->device, NULL, 0);
       if (ret < 0) LPMS_ERR(open_decoder_err, "Unable to open hardware context for decoding")
       vc->hw_device_ctx = av_buffer_ref(ctx->hw_device_ctx);
-      vc->get_format = get_hw_pixfmt;
+      vc->get_format = get_hw_format;
     }
     ctx->hw_type = params->hw_type;
     vc->pkt_timebase = ic->streams[ctx->vi]->time_base;
     av_opt_set(vc->priv_data, "xcoder-params", ctx->xcoderParams, 0);
     ret = avcodec_open2(vc, codec, opts);
     if (ret < 0) LPMS_ERR(open_decoder_err, "Unable to open video decoder");
+    if (params->hw_type > AV_HWDEVICE_TYPE_NONE) {
+      if (AV_PIX_FMT_NONE == hw2pixfmt(vc)) {
+        ret = lpms_ERR_INPUT_CODEC;
+        LPMS_ERR(open_decoder_err, "Input codec does not support hardware acceleration");
+      }
+    }
   }
 
   return 0;
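The post-open check above leans on hw2pixfmt() to decide whether the chosen codec has hardware support at all, replacing the deleted get_hw_decoder() name whitelist. hw2pixfmt's body isn't part of this diff; a plausible sketch of such a helper using FFmpeg's codec hw-config introspection (an assumption, not the repo's actual implementation):

```c
#include <libavcodec/avcodec.h>

// Walk the codec's advertised hw configs and return the pixel format
// matching the device type in use; AV_PIX_FMT_NONE means unsupported.
static enum AVPixelFormat hw_pixfmt_for(const AVCodec *codec,
                                        enum AVHWDeviceType type)
{
    for (int i = 0;; i++) {
        const AVCodecHWConfig *cfg = avcodec_get_hw_config(codec, i);
        if (!cfg) return AV_PIX_FMT_NONE; // no such hw support for this codec
        if ((cfg->methods & AV_CODEC_HW_CONFIG_METHOD_HW_DEVICE_CTX) &&
            cfg->device_type == type)
            return cfg->pix_fmt;
    }
}
```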
@@ -66,7 +66,6 @@ enum AVPixelFormat hw2pixfmt(AVCodecContext *ctx);
 int open_input(input_params *params, struct input_ctx *ctx);
 int open_video_decoder(input_params *params, struct input_ctx *ctx);
 int open_audio_decoder(input_params *params, struct input_ctx *ctx);
-char* get_hw_decoder(int ff_codec_id, int hw_type);
 void free_input(struct input_ctx *inctx);
 
 // Utility functions
@@ -224,7 +224,7 @@ int open_output(struct output_ctx *octx, struct input_ctx *ictx)
 
   // add video encoder if a decoder exists and this output requires one
   if (ictx->vc && needs_decoder(octx->video->name)) {
-    ret = init_video_filters(ictx, octx);
+    ret = init_video_filters(ictx, octx, NULL);
     if (ret < 0) LPMS_ERR(open_output_err, "Unable to open video filter");
 
     codec = avcodec_find_encoder_by_name(octx->video->name);
@@ -296,6 +296,8 @@ int open_output(struct output_ctx *octx, struct input_ctx *ictx)
     if (ret < 0) LPMS_ERR(open_output_err, "Unable to open signature filter");
   }
 
+  octx->initialized = 1;
+
   return 0;
 
 open_output_err:
@@ -521,7 +523,7 @@ int mux(AVPacket *pkt, AVRational tb, struct output_ctx *octx, AVStream *ost)
 static int calc_signature(AVFrame *inf, struct output_ctx *octx)
 {
   int ret = 0;
-  if (inf->hw_frames_ctx && octx->sf.hwframes && inf->hw_frames_ctx->data != octx->sf.hwframes) {
+  if (inf->hw_frames_ctx && octx->sf.hw_frames_ctx && inf->hw_frames_ctx->data != octx->sf.hw_frames_ctx->data) {
     free_filter(&octx->sf);
     ret = init_signature_filters(octx, inf);
     if (ret < 0) return lpms_ERR_FILTERS;
@@ -2218,23 +2218,7 @@ func runRotationTests(t *testing.T, accel Acceleration) {
 	`
 	}
 
-	// TODO figure out why cpu/gpu are different
-	if accel == Nvidia {
-		cmd = cmd + `
-		cat <<-EOF1 > expected.dims
-			115 256,144
-			120 146,260
-			125 256,144
-		EOF1
-
-		cat <<-EOF2 > expected-30fps.dims
-			58 256,144
-			60 146,260
-			63 256,144
-		EOF2
-		`
-	} else {
-		cmd = cmd + `
+	cmd = cmd + `
 	cat <<-EOF1 > expected.dims
 		120 256,144
 		120 146,260
@@ -2246,10 +2230,7 @@ func runRotationTests(t *testing.T, accel Acceleration) {
 		60 146,260
 		61 256,144
 	EOF2
-	`
-	}
 
-	cmd = cmd + `
 	diff -u expected.dims out.dims
 	diff -u expected-30fps.dims out-30fps.dims
 	`
@@ -2299,9 +2280,7 @@ func runRotationTests(t *testing.T, accel Acceleration) {
 	}})
 	require.NoError(t, err)
 
-	// TODO figure out why nvidia is different; green screen?
-	if accel == Software {
-		cmd = `
+	cmd = `
 	cat out-test-0.ts out-transposed.ts out-test-2.ts > out-test-concat.ts
 	ffprobe -show_entries frame=pts,pkt_dts,duration,pict_type,width,height -of csv out-test-concat.ts > out-test-concat.framedata
 
@@ -2317,8 +2296,7 @@ func runRotationTests(t *testing.T, accel Acceleration) {
 	# this does not line up
 	#diff -u out-test-concat-30fps.framedata out-double-rotated-30fps.framedata
 	`
-		run(cmd)
-	}
+	run(cmd)
 
 	// check single rotations
 	res, err = Transcode3(
@@ -2344,21 +2322,7 @@ func runRotationTests(t *testing.T, accel Acceleration) {
 	ffprobe -show_entries frame=height,width -of csv=p=0 out-single-rotated-30fps.ts | sed 's/,$//g' | uniq -c | sed 's/^ *//g' > single-out-30fps.dims
 	`
 
-	// TODO figure out why cpu/gpu are different
-	if accel == Nvidia {
-		cmd = cmd + `
-		cat <<-EOF1 > single-expected.dims
-			115 256,144
-			125 146,260
-		EOF1
-
-		cat <<-EOF2 > single-expected-30fps.dims
-			58 256,144
-			63 146,260
-		EOF2
-		`
-	} else {
-		cmd = cmd + `
+	cmd = cmd + `
 	cat <<-EOF1 > single-expected.dims
 		120 256,144
 		120 146,260
@@ -2368,10 +2332,7 @@ func runRotationTests(t *testing.T, accel Acceleration) {
 		60 256,144
 		61 146,260
 	EOF2
-	`
-	}
 
-	cmd = cmd + `
 	diff -u single-expected.dims single-out.dims
 	diff -u single-expected-30fps.dims single-out-30fps.dims
 	`
@@ -47,7 +47,7 @@ int filtergraph_parser(struct filter_ctx *fctx, char* filters_descr, AVFilterInO
   return ret;
 }
 
-int init_video_filters(struct input_ctx *ictx, struct output_ctx *octx)
+int init_video_filters(struct input_ctx *ictx, struct output_ctx *octx, AVFrame *inf)
 {
   char args[512];
   int ret = 0;
@@ -92,8 +92,9 @@ int init_video_filters(struct input_ctx *ictx, struct output_ctx *octx)
   if (ictx->vc && ictx->vc->hw_frames_ctx) {
     // XXX a bit problematic in that it's set before decoder is fully ready
     AVBufferSrcParameters *srcpar = av_buffersrc_parameters_alloc();
-    srcpar->hw_frames_ctx = ictx->vc->hw_frames_ctx;
-    vf->hwframes = ictx->vc->hw_frames_ctx->data;
+    AVBufferRef *hw_frames_ctx = inf && inf->hw_frames_ctx ? inf->hw_frames_ctx : ictx->vc->hw_frames_ctx;
+    srcpar->hw_frames_ctx = hw_frames_ctx;
+    av_buffer_replace(&vf->hw_frames_ctx, hw_frames_ctx);
     av_buffersrc_parameters_set(vf->src_ctx, srcpar);
     av_freep(&srcpar);
   }
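The switch from the raw `hwframes` data pointer to an owned AVBufferRef relies on av_buffer_replace() refcounting: the filter now keeps the GPU frame pool alive on its own, even if the decoder swaps out its hw_frames_ctx mid-stream. A tiny sketch of the pattern (assumes FFmpeg >= 4.3, where av_buffer_replace is available):

```c
#include <libavutil/buffer.h>

// Keep our own reference to a shared buffer (e.g. a hw frame pool) so it
// survives even if the producer unrefs or replaces its copy later.
static int retain_pool(AVBufferRef **own, AVBufferRef *shared)
{
    // av_buffer_replace() unrefs *own (if set) and points it at a fresh
    // reference to shared; passing shared == NULL just releases *own.
    return av_buffer_replace(own, shared);
}

// On teardown, mirror free_filter() below: av_buffer_unref(&own_ref);
```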
@@ -243,13 +244,13 @@ int init_signature_filters(struct output_ctx *octx, AVFrame *inf)
   if (octx->vc && inf && inf->hw_frames_ctx) {
     AVBufferSrcParameters *srcpar = av_buffersrc_parameters_alloc();
     srcpar->hw_frames_ctx = inf->hw_frames_ctx;
-    sf->hwframes = inf->hw_frames_ctx->data;
+    av_buffer_replace(&sf->hw_frames_ctx, inf->hw_frames_ctx);
     av_buffersrc_parameters_set(sf->src_ctx, srcpar);
     av_freep(&srcpar);
   } else if (octx->vc && octx->vc->hw_frames_ctx) {
     AVBufferSrcParameters *srcpar = av_buffersrc_parameters_alloc();
     srcpar->hw_frames_ctx = octx->vc->hw_frames_ctx;
-    sf->hwframes = octx->vc->hw_frames_ctx->data;
+    av_buffer_replace(&sf->hw_frames_ctx, octx->vc->hw_frames_ctx);
     av_buffersrc_parameters_set(sf->src_ctx, srcpar);
     av_freep(&srcpar);
   }
@@ -288,8 +289,8 @@ int filtergraph_write(AVFrame *inf, struct input_ctx *ictx, struct output_ctx *o
   // before the decoder is fully ready, and the decoder may change HW params
   // XXX: Unclear if this path is hit on all devices
   if (is_video && inf && (
-      (inf->hw_frames_ctx && filter->hwframes &&
-       inf->hw_frames_ctx->data != filter->hwframes) ||
+      (inf->hw_frames_ctx && filter->hw_frames_ctx &&
+       inf->hw_frames_ctx->data != filter->hw_frames_ctx->data) ||
       (filter->src_ctx->nb_outputs > 0 &&
        filter->src_ctx->outputs[0]->w != inf->width &&
        filter->src_ctx->outputs[0]->h != inf->height))) {
@@ -326,7 +327,7 @@ int filtergraph_write(AVFrame *inf, struct input_ctx *ictx, struct output_ctx *o
     ret = 0;
 
     free_filter(&octx->vf);
-    ret = init_video_filters(ictx, octx);
+    ret = init_video_filters(ictx, octx, inf);
     if (ret < 0) return lpms_ERR_FILTERS;
   }
 
@@ -411,5 +412,6 @@ void free_filter(struct filter_ctx *filter)
 {
   if (filter->frame) av_frame_free(&filter->frame);
   if (filter->graph) avfilter_graph_free(&filter->graph);
+  if (filter->hw_frames_ctx) av_buffer_unref(&filter->hw_frames_ctx);
   memset(filter, 0, sizeof(struct filter_ctx));
 }
@@ -11,7 +11,7 @@ struct filter_ctx {
   AVFilterContext *sink_ctx;
   AVFilterContext *src_ctx;
 
-  uint8_t *hwframes; // GPU frame pool data
+  AVBufferRef *hw_frames_ctx; // GPU frame pool data
 
   // Input timebase for this filter
   AVRational time_base;
@@ -46,6 +46,7 @@ struct filter_ctx {
 };
 
 struct output_ctx {
+  int initialized; // whether this output is ready
   char *fname; // required output file name
   char *vfilters; // required output video filters
   char *sfilters; // required output signature filters
@@ -82,7 +83,7 @@ struct output_ctx {
   char *xcoderParams;
 };
 
-int init_video_filters(struct input_ctx *ictx, struct output_ctx *octx);
+int init_video_filters(struct input_ctx *ictx, struct output_ctx *octx, AVFrame *inf);
 int init_audio_filters(struct input_ctx *ictx, struct output_ctx *octx);
 int init_signature_filters(struct output_ctx *octx, AVFrame *inf);
 int filtergraph_write(AVFrame *inf, struct input_ctx *ictx, struct output_ctx *octx, struct filter_ctx *filter, int is_video);
@@ -23,15 +23,15 @@ func TestNvidia_BadCodecs(t *testing.T) {
 	run, dir := setupTest(t)
 	defer os.RemoveAll(dir)
 
-	fname := dir + "/mpeg2.ts"
+	fname := dir + "/test.flv"
 	oname := dir + "/out.ts"
 	prof := P240p30fps16x9
 
 	cmd := `
 	cp "$1/../transcoder/test.ts" test.ts
-	# Generate an input file that uses unsupported codec MPEG2 and sanity check
-	ffmpeg -loglevel warning -i test.ts -an -c:v mpeg2video -t 1 mpeg2.ts
-	ffprobe -loglevel warning mpeg2.ts -show_streams | grep codec_name=mpeg2video
+	# Generate an input file that uses unsupported codec FLV and sanity check
+	ffmpeg -loglevel warning -i test.ts -an -c:v flv -t 1 test.flv
+	ffprobe -loglevel warning test.flv -show_streams | grep codec_name=flv
 	`
 	run(cmd)
 
@@ -603,6 +603,7 @@ func TestNvidia_API_AlternatingTimestamps(t *testing.T) {
 	tc := NewTranscoder()
 	idx := []int{1, 0, 3, 2}
 	for _, i := range idx {
+		// TODO this breaks with nvidia acceleration on the input!
 		in := &TranscodeOptionsIn{Fname: fmt.Sprintf("%s/out_%d.ts", dir, i)}
 		out := []TranscodeOptions{{
 			Oname: fmt.Sprintf("%s/%d.md5", dir, i),
@@ -806,3 +807,54 @@ func TestNvidia_Metadata(t *testing.T) {
 	// with nvenc we reopen the outputs so exercise that
 	runTestTranscoder_Metadata(t, Nvidia)
 }
+
+func TestNvidia_H264Parser(t *testing.T) {
+	// this sample breaks with the cuvid (nvidia) h264 parser, so ensure the ffmpeg parser is used
+
+	run, dir := setupTest(t)
+	defer os.RemoveAll(dir)
+
+	res, err := Transcode3(&TranscodeOptionsIn{
+		Fname: "../data/bad-cuvid.ts",
+		Accel: Nvidia,
+	}, []TranscodeOptions{{
+		Oname:   dir + "/out.ts",
+		Profile: P240p30fps16x9,
+		Accel:   Nvidia,
+	}})
+	require.Nil(t, err)
+	require.Equal(t, 500, res.Decoded.Frames)
+
+	_, err = Transcode3(&TranscodeOptionsIn{
+		Fname: "../data/broken-h264-parser.ts",
+		Accel: Nvidia,
+	}, []TranscodeOptions{{
+		Oname:   dir + "/nv.ts",
+		Profile: P240p30fps16x9,
+		Accel:   Nvidia,
+	}})
+	require.Nil(t, err)
+
+	_, err = Transcode3(&TranscodeOptionsIn{
+		Fname: "../data/broken-h264-parser.ts",
+		Accel: Software,
+	}, []TranscodeOptions{{
+		Oname:   dir + "/sw.ts",
+		Profile: P240p30fps16x9,
+		Accel:   Software,
+	}})
+	require.Nil(t, err)
+
+	// TODO nvidia is one frame offset (first frame seems to be duplicated)
+	// and this leads to a poor ssim score compared to software encoding.
+	// Figure out why this is! Ideally scores should be >99 for ALL frames
+	cmd := `
+	# check image quality
+	ffmpeg -loglevel warning -i sw.ts -i nv.ts \
+	  -lavfi "[0:v][1:v]ssim=stats.log" -f null -
+	grep -Po 'All:\K\d+.\d+' stats.log | \
+	  awk '{ if ($1 < 0.90) count=count+1 } END{ exit count > 30 }'
+	`
+	run(cmd)
+}
ffmpeg/queue.c (new file, 88 lines)
@@ -0,0 +1,88 @@
+#include "queue.h"
+
+/**
+ * Queue for buffering frames and packets while the hardware video decoder initializes
+ */
+
+/**
+ * Each queue item holds both an AVPacket* and an AVFrame*.
+ */
+typedef struct {
+  AVPacket *pkt;
+  AVFrame *frame;
+  int decoder_return;
+} queue_item;
+
+AVFifo* queue_create()
+{
+  // Create a FIFO that can hold 8 items initially, each of size queue_item,
+  // and auto-grow as needed.
+  return av_fifo_alloc2(8, sizeof(queue_item), AV_FIFO_FLAG_AUTO_GROW);
+}
+
+void queue_free(AVFifo **fifo)
+{
+  if (!fifo || !*fifo) return;
+
+  // Drain everything still in the FIFO
+  queue_item item;
+  memset(&item, 0, sizeof(item));
+  while (av_fifo_read(*fifo, &item, 1) >= 0) {
+    if (item.pkt) av_packet_free(&item.pkt);
+    if (item.frame) av_frame_free(&item.frame);
+  }
+
+  av_fifo_freep2(fifo); // Frees the buffer & sets *fifo = NULL
+}
+
+int queue_write(AVFifo *fifo, const AVPacket *pkt, const AVFrame *frame, int decoder_return)
+{
+  if (!fifo) return AVERROR(EINVAL);
+
+  queue_item item;
+  memset(&item, 0, sizeof(item));
+
+  item.decoder_return = decoder_return;
+
+  // Create a new packet reference if needed
+  if (pkt) {
+    item.pkt = av_packet_clone(pkt);
+    if (!item.pkt) return AVERROR(EINVAL);
+  }
+
+  // Create a new frame reference if needed
+  if (frame) {
+    item.frame = av_frame_clone(frame);
+    if (!item.frame) {
+      av_packet_free(&item.pkt);
+      return AVERROR(EINVAL);
+    }
+  }
+
+  return av_fifo_write(fifo, &item, 1);
+}
+
+int queue_read(AVFifo *fifo, AVFrame *out_frame, AVPacket *out_pkt, int *stream_index, int *decoder_return)
+{
+  if (!fifo) return AVERROR(EINVAL);
+
+  queue_item item;
+  int ret = av_fifo_read(fifo, &item, 1);
+  if (ret < 0) return ret;
+
+  // Transfer ownership
+  if (out_pkt && item.pkt) {
+    *stream_index = item.pkt->stream_index;
+    av_packet_move_ref(out_pkt, item.pkt);
+  }
+  av_packet_free(&item.pkt);
+
+  if (out_frame && item.frame) {
+    av_frame_move_ref(out_frame, item.frame);
+  }
+  av_frame_free(&item.frame);
+
+  *decoder_return = item.decoder_return;
+
+  return 0;
+}
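A hedged usage sketch for this queue API (error handling trimmed, surrounding decode loop assumed): the caller keeps ownership of what it writes, since queue_write clones, and takes ownership of what it reads, since queue_read moves the references out.

```c
#include "queue.h"

static void queue_roundtrip(const AVPacket *pkt, const AVFrame *frame)
{
    AVFifo *q = queue_create();
    if (!q) return;

    // queue_write clones pkt/frame, so the caller's copies stay valid
    if (queue_write(q, pkt, frame, 0) < 0) {
        queue_free(&q);
        return;
    }

    AVPacket *out_pkt = av_packet_alloc();
    AVFrame *out_frame = av_frame_alloc();
    int stream_index = -1, dec_ret = 0;

    // queue_read moves ownership into out_pkt / out_frame
    while (out_pkt && out_frame &&
           queue_read(q, out_frame, out_pkt, &stream_index, &dec_ret) >= 0) {
        // ... replay the packet/frame in arrival order ...
        av_packet_unref(out_pkt);
        av_frame_unref(out_frame);
    }

    av_packet_free(&out_pkt);
    av_frame_free(&out_frame);
    queue_free(&q); // also drains anything left unread
}
```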
ffmpeg/queue.h (new file, 8 lines)
@@ -0,0 +1,8 @@
+
+#include <libavutil/fifo.h>
+#include <libavcodec/avcodec.h>
+
+AVFifo* queue_create();
+void queue_free(AVFifo **fifo);
+int queue_write(AVFifo *fifo, const AVPacket *pkt, const AVFrame *frame, int decoder_return);
+int queue_read(AVFifo *fifo, AVFrame *out_frame, AVPacket *out_pkt, int *stream_index, int *decoder_return);
@@ -3,6 +3,7 @@
 #include "filter.h"
 #include "encoder.h"
 #include "logging.h"
+#include "queue.h"
 
 #include <libavcodec/avcodec.h>
 #include <libavformat/avformat.h>
@@ -236,19 +237,13 @@ int transcode_init(struct transcode_thread *h, input_params *inp,
     octx->dv = ictx->vi < 0 || is_drop(octx->video->name);
     octx->da = ictx->ai < 0 || is_drop(octx->audio->name);
     octx->res = &results[i];
+    octx->initialized = h->initialized && (AV_HWDEVICE_TYPE_NONE != octx->hw_type || ictx->transmuxing);
 
-    // first segment of a stream, need to initialize output HW context
-    // XXX valgrind this line up
+    // either first segment of a GPU stream or a CPU stream
     // when transmuxing we're opening output with first segment, but closing it
     // only when lpms_transcode_stop called, so we don't want to re-open it
     // on subsequent segments
-    if (!h->initialized || (AV_HWDEVICE_TYPE_NONE == octx->hw_type && !ictx->transmuxing)) {
-      ret = open_output(octx, ictx);
-      if (ret < 0) LPMS_ERR(transcode_cleanup, "Unable to open output");
-      if (ictx->transmuxing) {
-        octx->oc->flags |= AVFMT_FLAG_FLUSH_PACKETS;
-        octx->oc->flush_packets = 1;
-      }
+    if (!octx->initialized) {
+      continue;
     }
 
@@ -325,14 +320,18 @@ int transcode(struct transcode_thread *h,
   int ret = 0;
   AVPacket *ipkt = NULL;
   AVFrame *dframe = NULL;
+  AVFifo *frame_queue = NULL;
   struct input_ctx *ictx = &h->ictx;
   struct output_ctx *outputs = h->outputs;
   int nb_outputs = h->nb_outputs;
+  int outputs_ready = 0, hit_eof = 0;
 
   ipkt = av_packet_alloc();
   if (!ipkt) LPMS_ERR(transcode_cleanup, "Unable to allocate packet");
   dframe = av_frame_alloc();
   if (!dframe) LPMS_ERR(transcode_cleanup, "Unable to allocate frame");
+  frame_queue = queue_create();
+  if (!frame_queue) LPMS_ERR(transcode_cleanup, "Unable to allocate audio queue");
 
   while (1) {
     // DEMUXING & DECODING
@@ -342,10 +341,19 @@ int transcode(struct transcode_thread *h,
     int stream_index = -1;
 
     av_frame_unref(dframe);
-    ret = process_in(ictx, dframe, ipkt, &stream_index);
+
+    // Check if we have any queued frames and if not, process normally
+    int queue_ret = 0;
+    if (outputs_ready || hit_eof) {
+      queue_ret = queue_read(frame_queue, dframe, ipkt, &stream_index, &ret);
+    }
+    if (!outputs_ready || queue_ret < 0) {
+      ret = process_in(ictx, dframe, ipkt, &stream_index);
+    }
     if (ret == AVERROR_EOF) {
-      // no more processing, go for flushes
-      break;
+      if (!outputs_ready) hit_eof = 1; // Set flag to force opening all outputs
+      else break;
     }
     else if (lpms_ERR_PACKET_ONLY == ret) ; // keep going for stream copy
     else if (ret == AVERROR(EAGAIN)) ; // this is a-ok
@@ -353,6 +361,53 @@ int transcode(struct transcode_thread *h,
       LPMS_ERR(transcode_cleanup, "Could not decode; No keyframes in input");
     } else if (ret < 0) LPMS_ERR(transcode_cleanup, "Could not decode; stopping");
 
+    // This is for the case when we _are_ decoding but frame is not complete yet
+    // So for example multislice h.264 picture without all slices fed in.
+    // IMPORTANT: this should also be false if we are transmuxing, and it is not
+    // so, at least not automatically, because then process_in returns 0 and not
+    // lpms_ERR_PACKET_ONLY
+    has_frame = lpms_ERR_PACKET_ONLY != ret;
+
+    // Open outputs. Do this here because we can't initialize a hw encoder
+    // until we first receive a hw-decoded frame
+    int is_ready = 1, set_outputs = 0, packet_ret = ret, is_eof = AVERROR_EOF == ret;
+    for (int i = 0; !outputs_ready && i < nb_outputs; i++) {
+      struct output_ctx *octx = &outputs[i];
+      if (!octx->initialized) {
+        // only open output if any of the following are true:
+        if ((ictx->vi >= 0 && stream_index == ictx->vi) || // is a video frame
+            ictx->vi < 0 || // input does not have video
+            octx->dv || // video is being dropped from output
+            is_eof) { // eof was hit, so force opening outputs
+          ret = open_output(octx, ictx);
+          if (ret < 0) LPMS_ERR(transcode_cleanup, "Unable to open output");
+        }
+        if (ictx->transmuxing) {
+          octx->oc->flags |= AVFMT_FLAG_FLUSH_PACKETS;
+          octx->oc->flush_packets = 1;
+        }
+        set_outputs = 1;
+      }
+      is_ready = is_ready && octx->initialized;
+    }
+    outputs_ready = is_ready;
+    if (set_outputs) {
+      int output_frame = has_frame;
+      // We add both video / audio streams simultaneously.
+      // Since video is always added first if present, queue up audio
+      // until we receive a video frame from the decoder
+      if (stream_index == ictx->vi) {
+        // width / height will be zero for pure streamcopy (no decoding)
+        output_frame = has_frame && dframe->width && dframe->height;
+      } else if (stream_index == ictx->ai) {
+        output_frame = has_frame && dframe->nb_samples;
+      }
+      ret = queue_write(frame_queue, is_eof ? NULL : ipkt, output_frame ? dframe : NULL, packet_ret);
+      if (ret < 0) LPMS_ERR(transcode_cleanup, "Unable to queue packet");
+      goto whileloop_end;
+    }
+
     // So here we have several possibilities:
     // ipkt: usually it will be here, but if we are decoding, and if we reached
     // end of stream, it may be so that draining of the decoder produces frames
@@ -362,13 +417,6 @@ int transcode(struct transcode_thread *h,
 
     ist = ictx->ic->streams[stream_index];
 
-    // This is for the case when we _are_ decoding but frame is not complete yet
-    // So for example multislice h.264 picture without all slices fed in.
-    // IMPORTANT: this should also be false if we are transmuxing, and it is not
-    // so, at least not automatically, because then process_in returns 0 and not
-    // lpms_ERR_PACKET_ONLY
-    has_frame = lpms_ERR_PACKET_ONLY != ret;
-
     // Now apart from if (is_flush_frame(dframe)) goto whileloop_end; statement
     // this code just updates has_frame properly for video and audio, updates
     // statistics for video and audio and sets last_frame
@@ -516,6 +564,7 @@ whileloop_end:
 transcode_cleanup:
   if (dframe) av_frame_free(&dframe);
   if (ipkt) av_packet_free(&ipkt); // needed for early exits
+  if (frame_queue) queue_free(&frame_queue);
   return transcode_shutdown(h, ret);
 }