From c38bc5634d8eb4540cb8dfa616eafcc0b3c85e59 Mon Sep 17 00:00:00 2001
From: Ting Fu
Date: Thu, 6 May 2021 16:46:10 +0800
Subject: dnn/vf_dnn_detect.c: add tensorflow output parse support

The testing model is an official TensorFlow model from the github repo; please refer to
https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/tf2_detection_zoo.md
to download the detection model you need.

For example, local testing was carried out with 'ssd_mobilenet_v2_coco_2018_03_29.tar.gz',
using the dog image at
https://github.com/tensorflow/models/blob/master/research/object_detection/test_images/image1.jpg

Testing command is:
./ffmpeg -i image1.jpg -vf dnn_detect=dnn_backend=tensorflow:input=image_tensor:output=\
"num_detections&detection_scores&detection_classes&detection_boxes":model=ssd_mobilenet_v2_coco.pb,\
showinfo -f null -

We will see a result similar to the one below:
[Parsed_showinfo_1 @ 0x33e65f0] side data - detection bounding boxes:
[Parsed_showinfo_1 @ 0x33e65f0] source: ssd_mobilenet_v2_coco.pb
[Parsed_showinfo_1 @ 0x33e65f0] index: 0, region: (382, 60) -> (1005, 593), label: 18, confidence: 9834/10000.
[Parsed_showinfo_1 @ 0x33e65f0] index: 1, region: (12, 8) -> (328, 549), label: 18, confidence: 8555/10000.
[Parsed_showinfo_1 @ 0x33e65f0] index: 2, region: (293, 7) -> (682, 458), label: 1, confidence: 8033/10000.
[Parsed_showinfo_1 @ 0x33e65f0] index: 3, region: (342, 0) -> (690, 325), label: 1, confidence: 5878/10000.

There are two boxes of dog with scores 98.34% & 85.55% and two boxes of person with scores 80.33% & 58.78%.

Signed-off-by: Ting Fu
Signed-off-by: Guo, Yejun
---
 libavfilter/vf_dnn_detect.c | 114 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 113 insertions(+), 1 deletion(-)

(limited to 'libavfilter/vf_dnn_detect.c')

diff --git a/libavfilter/vf_dnn_detect.c b/libavfilter/vf_dnn_detect.c
index 7d39acb653..d23e30aedd 100644
--- a/libavfilter/vf_dnn_detect.c
+++ b/libavfilter/vf_dnn_detect.c
@@ -48,6 +48,9 @@ typedef struct DnnDetectContext {
 #define FLAGS AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM
 static const AVOption dnn_detect_options[] = {
     { "dnn_backend", "DNN backend", OFFSET(backend_type), AV_OPT_TYPE_INT, { .i64 = 2 }, INT_MIN, INT_MAX, FLAGS, "backend" },
+#if (CONFIG_LIBTENSORFLOW == 1)
+    { "tensorflow", "tensorflow backend flag", 0, AV_OPT_TYPE_CONST, { .i64 = 1 }, 0, 0, FLAGS, "backend" },
+#endif
 #if (CONFIG_LIBOPENVINO == 1)
     { "openvino", "openvino backend flag", 0, AV_OPT_TYPE_CONST, { .i64 = 2 }, 0, 0, FLAGS, "backend" },
 #endif
@@ -59,7 +62,7 @@ static const AVOption dnn_detect_options[] = {
 
 AVFILTER_DEFINE_CLASS(dnn_detect);
 
-static int dnn_detect_post_proc(AVFrame *frame, DNNData *output, uint32_t nb, AVFilterContext *filter_ctx)
+static int dnn_detect_post_proc_ov(AVFrame *frame, DNNData *output, AVFilterContext *filter_ctx)
 {
     DnnDetectContext *ctx = filter_ctx->priv;
     float conf_threshold = ctx->confidence;
@@ -136,6 +139,115 @@ static int dnn_detect_post_proc(AVFrame *frame, DNNData *output, uint32_t nb, AV
     return 0;
 }
 
+/**
+ * Convert TensorFlow object-detection outputs into bounding-box side data.
+ *
+ * Outputs are read as output[0] = proposal count (stored as a float),
+ * output[1] = scores, output[2] = class ids and output[3] = boxes laid out
+ * as (y0, x0, y1, x1); box coordinates are multiplied by the frame size.
+ *
+ * @return 0 on success or when no proposal reaches the confidence threshold,
+ *         -1 if the frame already carries bounding boxes or the side data
+ *         cannot be created
+ */
+static int dnn_detect_post_proc_tf(AVFrame *frame, DNNData *output, AVFilterContext *filter_ctx)
+{
+    DnnDetectContext *ctx = filter_ctx->priv;
+    int proposal_count;
+    float conf_threshold = ctx->confidence;
+    float *conf, *position, *label_id, x0, y0, x1, y1;
+    int nb_bboxes = 0;
+    AVFrameSideData *sd;
+    AVDetectionBBox *bbox;
+    AVDetectionBBoxHeader *header;
+
+    proposal_count = *(float *)(output[0].data);
+    conf           = output[1].data;
+    position       = output[3].data;
+    label_id       = output[2].data;
+
+    sd = av_frame_get_side_data(frame, AV_FRAME_DATA_DETECTION_BBOXES);
+    if (sd) {
+        av_log(filter_ctx, AV_LOG_ERROR, "already have dnn bounding boxes in side data.\n");
+        return -1;
+    }
+
+    /* first pass: count the proposals that will be kept */
+    for (int i = 0; i < proposal_count; ++i) {
+        if (conf[i] < conf_threshold)
+            continue;
+        nb_bboxes++;
+    }
+
+    if (nb_bboxes == 0) {
+        av_log(filter_ctx, AV_LOG_VERBOSE, "nothing detected in this frame.\n");
+        return 0;
+    }
+
+    header = av_detection_bbox_create_side_data(frame, nb_bboxes);
+    if (!header) {
+        av_log(filter_ctx, AV_LOG_ERROR, "failed to create side data with %d bounding boxes\n", nb_bboxes);
+        return -1;
+    }
+
+    av_strlcpy(header->source, ctx->dnnctx.model_filename, sizeof(header->source));
+
+    for (int i = 0; i < proposal_count; ++i) {
+        if (conf[i] < conf_threshold) {
+            continue;
+        }
+
+        y0 = position[i * 4];
+        x0 = position[i * 4 + 1];
+        y1 = position[i * 4 + 2];
+        x1 = position[i * 4 + 3];
+
+        /*
+         * nb_bboxes counts the boxes still to be written, so this fills the
+         * side data slots in order. Indexing by the proposal index i would
+         * skip slots and run past the allocated boxes whenever a rejected
+         * proposal precedes a kept one.
+         */
+        bbox = av_get_detection_bbox(header, header->nb_bboxes - nb_bboxes);
+
+        bbox->x = (int)(x0 * frame->width);
+        bbox->w = (int)(x1 * frame->width) - bbox->x;
+        bbox->y = (int)(y0 * frame->height);
+        bbox->h = (int)(y1 * frame->height) - bbox->y;
+
+        bbox->detect_confidence = av_make_q((int)(conf[i] * 10000), 10000);
+        bbox->classify_count = 0;
+
+        if (ctx->labels && label_id[i] < ctx->label_count) {
+            av_strlcpy(bbox->detect_label, ctx->labels[(int)label_id[i]], sizeof(bbox->detect_label));
+        } else {
+            snprintf(bbox->detect_label, sizeof(bbox->detect_label), "%d", (int)label_id[i]);
+        }
+
+        nb_bboxes--;
+        if (nb_bboxes == 0) {
+            break;
+        }
+    }
+    return 0;
+}
+
+/* Dispatch detection post-processing to the parser of the configured DNN backend. */
+static int dnn_detect_post_proc(AVFrame *frame, DNNData *output, uint32_t nb, AVFilterContext *filter_ctx)
+{
+    DnnDetectContext *ctx = filter_ctx->priv;
+    DnnContext *dnn_ctx = &ctx->dnnctx;
+    switch (dnn_ctx->backend_type) {
+    case DNN_OV:
+        return dnn_detect_post_proc_ov(frame, output, filter_ctx);
+    case DNN_TF:
+        return dnn_detect_post_proc_tf(frame, output, filter_ctx);
+    default:
+        avpriv_report_missing_feature(filter_ctx, "Current dnn backend does not support detect filter\n");
+        return AVERROR(EINVAL);
+    }
+}
+
 static void free_detect_labels(DnnDetectContext *ctx)
 {
     for (int i = 0; i < ctx->label_count; i++) {
-- 
cgit v1.2.3