From d943330dc01f4d222dc1127d6216cd683dc41ee0 Mon Sep 17 00:00:00 2001 From: Lenny Wang Date: Sat, 14 Dec 2013 05:11:00 -0600 Subject: lavfi/deshake_opencl: optimze transform filter Reviewed-by: Wei Gao Signed-off-by: Michael Niedermayer --- libavfilter/deshake.h | 9 +- libavfilter/deshake_opencl.c | 81 +++++++----- libavfilter/deshake_opencl.h | 7 + libavfilter/deshake_opencl_kernel.h | 254 +++++++++++++++++++----------------- 4 files changed, 188 insertions(+), 163 deletions(-) (limited to 'libavfilter') diff --git a/libavfilter/deshake.h b/libavfilter/deshake.h index 5792973957..615953cfe3 100644 --- a/libavfilter/deshake.h +++ b/libavfilter/deshake.h @@ -1,5 +1,6 @@ /* * Copyright (C) 2013 Wei Gao + * Copyright (C) 2013 Lenny Wang * * This file is part of FFmpeg. * @@ -57,12 +58,8 @@ typedef struct { typedef struct { cl_command_queue command_queue; cl_program program; - cl_kernel kernel; - size_t matrix_size; - float matrix_y[9]; - float matrix_uv[9]; - cl_mem cl_matrix_y; - cl_mem cl_matrix_uv; + cl_kernel kernel_luma; + cl_kernel kernel_chroma; int in_plane_size[8]; int out_plane_size[8]; int plane_num; diff --git a/libavfilter/deshake_opencl.c b/libavfilter/deshake_opencl.c index e4e4df19e8..caf2bf2e2d 100644 --- a/libavfilter/deshake_opencl.c +++ b/libavfilter/deshake_opencl.c @@ -1,5 +1,6 @@ /* * Copyright (C) 2013 Wei Gao + * Copyright (C) 2013 Lenny Wang * * This file is part of FFmpeg. * @@ -29,8 +30,8 @@ #include "deshake_opencl.h" #include "libavutil/opencl_internal.h" -#define MATRIX_SIZE 6 #define PLANE_NUM 3 +#define ROUND_TO_16(a) ((((a - 1)/16)+1)*16) int ff_opencl_transform(AVFilterContext *ctx, int width, int height, int cw, int ch, @@ -39,29 +40,40 @@ int ff_opencl_transform(AVFilterContext *ctx, enum FillMethod fill, AVFrame *in, AVFrame *out) { int ret = 0; - const size_t global_work_size = width * height + 2 * ch * cw; cl_int status; DeshakeContext *deshake = ctx->priv; - FFOpenclParam opencl_param = {0}; - - opencl_param.ctx = ctx; - opencl_param.kernel = deshake->opencl_ctx.kernel; - ret = av_opencl_buffer_write(deshake->opencl_ctx.cl_matrix_y, (uint8_t *)matrix_y, deshake->opencl_ctx.matrix_size * sizeof(cl_float)); - if (ret < 0) - return ret; - ret = av_opencl_buffer_write(deshake->opencl_ctx.cl_matrix_uv, (uint8_t *)matrix_uv, deshake->opencl_ctx.matrix_size * sizeof(cl_float)); - if (ret < 0) - return ret; + float4 packed_matrix_lu = {matrix_y[0], matrix_y[1], matrix_y[2], matrix_y[5]}; + float4 packed_matrix_ch = {matrix_uv[0], matrix_uv[1], matrix_uv[2], matrix_uv[5]}; + size_t global_worksize_lu[2] = {(size_t)ROUND_TO_16(width), (size_t)ROUND_TO_16(height)}; + size_t global_worksize_ch[2] = {(size_t)ROUND_TO_16(cw), (size_t)(2*ROUND_TO_16(ch))}; + size_t local_worksize[2] = {16, 16}; + FFOpenclParam param_lu = {0}; + FFOpenclParam param_ch = {0}; + param_lu.ctx = param_ch.ctx = ctx; + param_lu.kernel = deshake->opencl_ctx.kernel_luma; + param_ch.kernel = deshake->opencl_ctx.kernel_chroma; if ((unsigned int)interpolate > INTERPOLATE_BIQUADRATIC) { av_log(ctx, AV_LOG_ERROR, "Selected interpolate method is invalid\n"); return AVERROR(EINVAL); } - ret = ff_opencl_set_parameter(&opencl_param, + ret = ff_opencl_set_parameter(¶m_lu, + FF_OPENCL_PARAM_INFO(deshake->opencl_ctx.cl_inbuf), + FF_OPENCL_PARAM_INFO(deshake->opencl_ctx.cl_outbuf), + FF_OPENCL_PARAM_INFO(packed_matrix_lu), + FF_OPENCL_PARAM_INFO(interpolate), + FF_OPENCL_PARAM_INFO(fill), + FF_OPENCL_PARAM_INFO(in->linesize[0]), + FF_OPENCL_PARAM_INFO(out->linesize[0]), + FF_OPENCL_PARAM_INFO(height), + FF_OPENCL_PARAM_INFO(width), + NULL); + if (ret < 0) + return ret; + ret = ff_opencl_set_parameter(¶m_ch, FF_OPENCL_PARAM_INFO(deshake->opencl_ctx.cl_inbuf), FF_OPENCL_PARAM_INFO(deshake->opencl_ctx.cl_outbuf), - FF_OPENCL_PARAM_INFO(deshake->opencl_ctx.cl_matrix_y), - FF_OPENCL_PARAM_INFO(deshake->opencl_ctx.cl_matrix_uv), + FF_OPENCL_PARAM_INFO(packed_matrix_ch), FF_OPENCL_PARAM_INFO(interpolate), FF_OPENCL_PARAM_INFO(fill), FF_OPENCL_PARAM_INFO(in->linesize[0]), @@ -76,13 +88,15 @@ int ff_opencl_transform(AVFilterContext *ctx, if (ret < 0) return ret; status = clEnqueueNDRangeKernel(deshake->opencl_ctx.command_queue, - deshake->opencl_ctx.kernel, 1, NULL, - &global_work_size, NULL, 0, NULL, NULL); + deshake->opencl_ctx.kernel_luma, 2, NULL, + global_worksize_lu, local_worksize, 0, NULL, NULL); + status |= clEnqueueNDRangeKernel(deshake->opencl_ctx.command_queue, + deshake->opencl_ctx.kernel_chroma, 2, NULL, + global_worksize_ch, local_worksize, 0, NULL, NULL); if (status != CL_SUCCESS) { av_log(ctx, AV_LOG_ERROR, "OpenCL run kernel error occurred: %s\n", av_opencl_errstr(status)); return AVERROR_EXTERNAL; } - clFinish(deshake->opencl_ctx.command_queue); ret = av_opencl_buffer_read_image(out->data, deshake->opencl_ctx.out_plane_size, deshake->opencl_ctx.plane_num, deshake->opencl_ctx.cl_outbuf, deshake->opencl_ctx.cl_outbuf_size); @@ -98,16 +112,7 @@ int ff_opencl_deshake_init(AVFilterContext *ctx) ret = av_opencl_init(NULL); if (ret < 0) return ret; - deshake->opencl_ctx.matrix_size = MATRIX_SIZE; - deshake->opencl_ctx.plane_num = PLANE_NUM; - ret = av_opencl_buffer_create(&deshake->opencl_ctx.cl_matrix_y, - deshake->opencl_ctx.matrix_size*sizeof(cl_float), CL_MEM_READ_ONLY, NULL); - if (ret < 0) - return ret; - ret = av_opencl_buffer_create(&deshake->opencl_ctx.cl_matrix_uv, - deshake->opencl_ctx.matrix_size*sizeof(cl_float), CL_MEM_READ_ONLY, NULL); - if (ret < 0) - return ret; + deshake->opencl_ctx.plane_num = PLANE_NUM; deshake->opencl_ctx.command_queue = av_opencl_get_command_queue(); if (!deshake->opencl_ctx.command_queue) { av_log(ctx, AV_LOG_ERROR, "Unable to get OpenCL command queue in filter 'deshake'\n"); @@ -118,10 +123,19 @@ int ff_opencl_deshake_init(AVFilterContext *ctx) av_log(ctx, AV_LOG_ERROR, "OpenCL failed to compile program 'avfilter_transform'\n"); return AVERROR(EINVAL); } - if (!deshake->opencl_ctx.kernel) { - deshake->opencl_ctx.kernel = clCreateKernel(deshake->opencl_ctx.program, "avfilter_transform", &ret); + if (!deshake->opencl_ctx.kernel_luma) { + deshake->opencl_ctx.kernel_luma = clCreateKernel(deshake->opencl_ctx.program, + "avfilter_transform_luma", &ret); + if (ret != CL_SUCCESS) { + av_log(ctx, AV_LOG_ERROR, "OpenCL failed to create kernel 'avfilter_transform_luma'\n"); + return AVERROR(EINVAL); + } + } + if (!deshake->opencl_ctx.kernel_chroma) { + deshake->opencl_ctx.kernel_chroma = clCreateKernel(deshake->opencl_ctx.program, + "avfilter_transform_chroma", &ret); if (ret != CL_SUCCESS) { - av_log(ctx, AV_LOG_ERROR, "OpenCL failed to create kernel 'avfilter_transform'\n"); + av_log(ctx, AV_LOG_ERROR, "OpenCL failed to create kernel 'avfilter_transform_chroma'\n"); return AVERROR(EINVAL); } } @@ -133,9 +147,8 @@ void ff_opencl_deshake_uninit(AVFilterContext *ctx) DeshakeContext *deshake = ctx->priv; av_opencl_buffer_release(&deshake->opencl_ctx.cl_inbuf); av_opencl_buffer_release(&deshake->opencl_ctx.cl_outbuf); - av_opencl_buffer_release(&deshake->opencl_ctx.cl_matrix_y); - av_opencl_buffer_release(&deshake->opencl_ctx.cl_matrix_uv); - clReleaseKernel(deshake->opencl_ctx.kernel); + clReleaseKernel(deshake->opencl_ctx.kernel_luma); + clReleaseKernel(deshake->opencl_ctx.kernel_chroma); clReleaseProgram(deshake->opencl_ctx.program); deshake->opencl_ctx.command_queue = NULL; av_opencl_uninit(); diff --git a/libavfilter/deshake_opencl.h b/libavfilter/deshake_opencl.h index 30d17d4426..5b0a2414b8 100644 --- a/libavfilter/deshake_opencl.h +++ b/libavfilter/deshake_opencl.h @@ -23,6 +23,13 @@ #include "deshake.h" +typedef struct { + float x; + float y; + float z; + float w; +} float4; + int ff_opencl_deshake_init(AVFilterContext *ctx); void ff_opencl_deshake_uninit(AVFilterContext *ctx); diff --git a/libavfilter/deshake_opencl_kernel.h b/libavfilter/deshake_opencl_kernel.h index ca0bf839b1..dd45d6f60b 100644 --- a/libavfilter/deshake_opencl_kernel.h +++ b/libavfilter/deshake_opencl_kernel.h @@ -1,5 +1,6 @@ /* * Copyright (C) 2013 Wei Gao + * Copyright (C) 2013 Lenny Wang * * * This file is part of FFmpeg. @@ -25,16 +26,16 @@ #include "libavutil/opencl.h" const char *ff_kernel_deshake_opencl = AV_OPENCL_KERNEL( - -inline unsigned char pixel(global const unsigned char *src, float x, float y, +inline unsigned char pixel(global const unsigned char *src, int x, int y, int w, int h,int stride, unsigned char def) { - return (x < 0 || y < 0 || x >= w || y >= h) ? def : src[(int)x + (int)y * stride]; + return (x < 0 || y < 0 || x >= w || y >= h) ? def : src[x + y * stride]; } + unsigned char interpolate_nearest(float x, float y, global const unsigned char *src, int width, int height, int stride, unsigned char def) { - return pixel(src, (int)(x + 0.5), (int)(y + 0.5), width, height, stride, def); + return pixel(src, (int)(x + 0.5f), (int)(y + 0.5f), width, height, stride, def); } unsigned char interpolate_bilinear(float x, float y, global const unsigned char *src, @@ -42,21 +43,18 @@ unsigned char interpolate_bilinear(float x, float y, global const unsigned char { int x_c, x_f, y_c, y_f; int v1, v2, v3, v4; + x_f = (int)x; + y_f = (int)y; + x_c = x_f + 1; + y_c = y_f + 1; - if (x < -1 || x > width || y < -1 || y > height) { + if (x_f < -1 || x_f > width || y_f < -1 || y_f > height) { return def; } else { - x_f = (int)x; - x_c = x_f + 1; - - y_f = (int)y; - y_c = y_f + 1; - - v1 = pixel(src, x_c, y_c, width, height, stride, def); + v4 = pixel(src, x_f, y_f, width, height, stride, def); v2 = pixel(src, x_c, y_f, width, height, stride, def); v3 = pixel(src, x_f, y_c, width, height, stride, def); - v4 = pixel(src, x_f, y_f, width, height, stride, def); - + v1 = pixel(src, x_c, y_c, width, height, stride, def); return (v1*(x - x_f)*(y - y_f) + v2*((x - x_f)*(y_c - y)) + v3*(x_c - x)*(y - y_f) + v4*((x_c - x)*(y_c - y))); } @@ -68,19 +66,18 @@ unsigned char interpolate_biquadratic(float x, float y, global const unsigned ch int x_c, x_f, y_c, y_f; unsigned char v1, v2, v3, v4; float f1, f2, f3, f4; + x_f = (int)x; + y_f = (int)y; + x_c = x_f + 1; + y_c = y_f + 1; - if (x < - 1 || x > width || y < -1 || y > height) + if (x_f < - 1 || x_f > width || y_f < -1 || y_f > height) return def; else { - x_f = (int)x; - x_c = x_f + 1; - y_f = (int)y; - y_c = y_f + 1; - - v1 = pixel(src, x_c, y_c, width, height, stride, def); + v4 = pixel(src, x_f, y_f, width, height, stride, def); v2 = pixel(src, x_c, y_f, width, height, stride, def); v3 = pixel(src, x_f, y_c, width, height, stride, def); - v4 = pixel(src, x_f, y_f, width, height, stride, def); + v1 = pixel(src, x_c, y_c, width, height, stride, def); f1 = 1 - sqrt((x_c - x) * (y_c - y)); f2 = 1 - sqrt((x_c - x) * (y - y_f)); @@ -107,109 +104,120 @@ inline int mirror(int v, int m) return v; } -kernel void avfilter_transform(global unsigned char *src, - global unsigned char *dst, - global float *matrix, - global float *matrix2, - int interpolate, - int fillmethod, - int src_stride_lu, - int dst_stride_lu, - int src_stride_ch, - int dst_stride_ch, - int height, - int width, - int ch, - int cw) +kernel void avfilter_transform_luma(global unsigned char *src, + global unsigned char *dst, + float4 matrix, + int interpolate, + int fill, + int src_stride_lu, + int dst_stride_lu, + int height, + int width) { - int global_id = get_global_id(0); - - global unsigned char *dst_y = dst; - global unsigned char *dst_u = dst_y + height * dst_stride_lu; - global unsigned char *dst_v = dst_u + ch * dst_stride_ch; - - global unsigned char *src_y = src; - global unsigned char *src_u = src_y + height * src_stride_lu; - global unsigned char *src_v = src_u + ch * src_stride_ch; - - global unsigned char *tempdst; - global unsigned char *tempsrc; - - int x; - int y; - float x_s; - float y_s; - int tempsrc_stride; - int tempdst_stride; - int temp_height; - int temp_width; - int curpos; - unsigned char def = 0; - if (global_id < width*height) { - y = global_id/width; - x = global_id%width; - x_s = x * matrix[0] + y * matrix[1] + matrix[2]; - y_s = x * matrix[3] + y * matrix[4] + matrix[5]; - tempdst = dst_y; - tempsrc = src_y; - tempsrc_stride = src_stride_lu; - tempdst_stride = dst_stride_lu; - temp_height = height; - temp_width = width; - } else if ((global_id >= width*height)&&(global_id < width*height + ch*cw)) { - y = (global_id - width*height)/cw; - x = (global_id - width*height)%cw; - x_s = x * matrix2[0] + y * matrix2[1] + matrix2[2]; - y_s = x * matrix2[3] + y * matrix2[4] + matrix2[5]; - tempdst = dst_u; - tempsrc = src_u; - tempsrc_stride = src_stride_ch; - tempdst_stride = dst_stride_ch; - temp_height = ch; - temp_width = cw; - } else { - y = (global_id - width*height - ch*cw)/cw; - x = (global_id - width*height - ch*cw)%cw; - x_s = x * matrix2[0] + y * matrix2[1] + matrix2[2]; - y_s = x * matrix2[3] + y * matrix2[4] + matrix2[5]; - tempdst = dst_v; - tempsrc = src_v; - tempsrc_stride = src_stride_ch; - tempdst_stride = dst_stride_ch; - temp_height = ch; - temp_width = cw; - } - curpos = y * tempdst_stride + x; - switch (fillmethod) { - case 0: //FILL_BLANK - def = 0; - break; - case 1: //FILL_ORIGINAL - def = tempsrc[y*tempsrc_stride+x]; - break; - case 2: //FILL_CLAMP - y_s = clipf(y_s, 0, temp_height - 1); - x_s = clipf(x_s, 0, temp_width - 1); - def = tempsrc[(int)y_s * tempsrc_stride + (int)x_s]; - break; - case 3: //FILL_MIRROR - y_s = mirror(y_s,temp_height - 1); - x_s = mirror(x_s,temp_width - 1); - def = tempsrc[(int)y_s * tempsrc_stride + (int)x_s]; - break; + int x = get_global_id(0); + int y = get_global_id(1); + int idx_dst = y * dst_stride_lu + x; + unsigned char def = 0; + float x_s = x * matrix.x + y * matrix.y + matrix.z; + float y_s = x * (-matrix.y) + y * matrix.x + matrix.w; + + if (x < width && y < height) { + switch (fill) { + case 0: //FILL_BLANK + def = 0; + break; + case 1: //FILL_ORIGINAL + def = src[y*src_stride_lu + x]; + break; + case 2: //FILL_CLAMP + y_s = clipf(y_s, 0, height - 1); + x_s = clipf(x_s, 0, width - 1); + def = src[(int)y_s * src_stride_lu + (int)x_s]; + break; + case 3: //FILL_MIRROR + y_s = mirror(y_s, height - 1); + x_s = mirror(x_s, width - 1); + def = src[(int)y_s * src_stride_lu + (int)x_s]; + break; + } + switch (interpolate) { + case 0: //INTERPOLATE_NEAREST + dst[idx_dst] = interpolate_nearest(x_s, y_s, src, width, height, src_stride_lu, def); + break; + case 1: //INTERPOLATE_BILINEAR + dst[idx_dst] = interpolate_bilinear(x_s, y_s, src, width, height, src_stride_lu, def); + break; + case 2: //INTERPOLATE_BIQUADRATIC + dst[idx_dst] = interpolate_biquadratic(x_s, y_s, src, width, height, src_stride_lu, def); + break; + default: + return; + } } - switch (interpolate) { - case 0: //INTERPOLATE_NEAREST - tempdst[curpos] = interpolate_nearest(x_s, y_s, tempsrc, temp_width, temp_height, tempsrc_stride, def); - break; - case 1: //INTERPOLATE_BILINEAR - tempdst[curpos] = interpolate_bilinear(x_s, y_s, tempsrc, temp_width, temp_height, tempsrc_stride, def); - break; - case 2: //INTERPOLATE_BIQUADRATIC - tempdst[curpos] = interpolate_biquadratic(x_s, y_s, tempsrc, temp_width, temp_height, tempsrc_stride, def); - break; - default: - return; +} + +kernel void avfilter_transform_chroma(global unsigned char *src, + global unsigned char *dst, + float4 matrix, + int interpolate, + int fill, + int src_stride_lu, + int dst_stride_lu, + int src_stride_ch, + int dst_stride_ch, + int height, + int width, + int ch, + int cw) +{ + + int x = get_global_id(0); + int y = get_global_id(1); + int pad_ch = get_global_size(1)>>1; + global unsigned char *dst_u = dst + height * dst_stride_lu; + global unsigned char *src_u = src + height * src_stride_lu; + global unsigned char *dst_v = dst_u + ch * dst_stride_ch; + global unsigned char *src_v = src_u + ch * src_stride_ch; + src = y < pad_ch ? src_u : src_v; + dst = y < pad_ch ? dst_u : dst_v; + y = select(y - pad_ch, y, y < pad_ch); + float x_s = x * matrix.x + y * matrix.y + matrix.z; + float y_s = x * (-matrix.y) + y * matrix.x + matrix.w; + int idx_dst = y * dst_stride_ch + x; + unsigned char def; + + if (x < cw && y < ch) { + switch (fill) { + case 0: //FILL_BLANK + def = 0; + break; + case 1: //FILL_ORIGINAL + def = src[y*src_stride_ch + x]; + break; + case 2: //FILL_CLAMP + y_s = clipf(y_s, 0, ch - 1); + x_s = clipf(x_s, 0, cw - 1); + def = src[(int)y_s * src_stride_ch + (int)x_s]; + break; + case 3: //FILL_MIRROR + y_s = mirror(y_s, ch - 1); + x_s = mirror(x_s, cw - 1); + def = src[(int)y_s * src_stride_ch + (int)x_s]; + break; + } + switch (interpolate) { + case 0: //INTERPOLATE_NEAREST + dst[idx_dst] = interpolate_nearest(x_s, y_s, src, cw, ch, src_stride_ch, def); + break; + case 1: //INTERPOLATE_BILINEAR + dst[idx_dst] = interpolate_bilinear(x_s, y_s, src, cw, ch, src_stride_ch, def); + break; + case 2: //INTERPOLATE_BIQUADRATIC + dst[idx_dst] = interpolate_biquadratic(x_s, y_s, src, cw, ch, src_stride_ch, def); + break; + default: + return; + } } } ); -- cgit v1.2.3