/*****************************************************************************
 * slicetype-cl.c: OpenCL slicetype decision code (lowres lookahead)
 *****************************************************************************
 * Copyright (C) 2012-2022 x264 project
 *
 * Authors: Steve Borho <steve@borho.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "common/common.h"
#include "macroblock.h"
#include "me.h"
#include "slicetype-cl.h"

#if HAVE_OPENCL
#ifdef _WIN32
#include <windows.h>
#endif

#define x264_weights_analyse x264_template(weights_analyse)
void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead );

/* We define CL_QUEUE_THREAD_HANDLE_AMD here because it is not defined
 * in the OpenCL headers shipped with NVIDIA drivers.  We need to be
 * able to compile on an NVIDIA machine and run optimally on an AMD GPU. */
#define CL_QUEUE_THREAD_HANDLE_AMD 0x403E
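/* Every OpenCL entry point below is called through the OCLCHECK() wrapper.
 * On the first failing call it logs the error, permanently disables OpenCL
 * (b_opencl = 0, b_fatal_error = 1) so the lookahead falls back to the C
 * path, and returns -1 from the enclosing function.  An illustrative
 * expansion (assuming locals `ocl` and `status`, as used throughout):
 *
 *     OCLCHECK( clFinish, h->opencl.queue );
 *         // expands to roughly:
 *     if( h->opencl.b_fatal_error )
 *         return -1;
 *     status = ocl->clFinish( h->opencl.queue );
 *     if( status != CL_SUCCESS )
 *     {
 *         ... log "clFinish error", disable OpenCL ...
 *         return -1;
 *     }
 */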
#define OCLCHECK( method, ... )\
do\
{\
    if( h->opencl.b_fatal_error )\
        return -1;\
    status = ocl->method( __VA_ARGS__ );\
    if( status != CL_SUCCESS ) {\
        h->param.b_opencl = 0;\
        h->opencl.b_fatal_error = 1;\
        x264_log( h, X264_LOG_ERROR, # method " error '%d'\n", status );\
        return -1;\
    }\
} while( 0 )

void x264_opencl_flush( x264_t *h )
{
    x264_opencl_function_t *ocl = h->opencl.ocl;

    ocl->clFinish( h->opencl.queue );

    /* Finish copies from the GPU by copying from the page-locked buffer to
     * their final destination */
    for( int i = 0; i < h->opencl.num_copies; i++ )
        memcpy( h->opencl.copies[i].dest, h->opencl.copies[i].src, h->opencl.copies[i].bytes );
    h->opencl.num_copies = 0;
    h->opencl.pl_occupancy = 0;
}

static void *opencl_alloc_locked( x264_t *h, int bytes )
{
    if( h->opencl.pl_occupancy + bytes >= PAGE_LOCKED_BUF_SIZE )
        x264_opencl_flush( h );
    assert( bytes < PAGE_LOCKED_BUF_SIZE );
    char *ptr = h->opencl.page_locked_ptr + h->opencl.pl_occupancy;
    h->opencl.pl_occupancy += bytes;
    return ptr;
}

int x264_opencl_lowres_init( x264_t *h, x264_frame_t *fenc, int lambda )
{
    if( fenc->b_intra_calculated )
        return 0;
    fenc->b_intra_calculated = 1;

    x264_opencl_function_t *ocl = h->opencl.ocl;
    int luma_length = fenc->i_stride[0] * fenc->i_lines[0];

#define CREATEBUF( out, flags, size )\
    out = ocl->clCreateBuffer( h->opencl.context, (flags), (size), NULL, &status );\
    if( status != CL_SUCCESS ) { h->param.b_opencl = 0; x264_log( h, X264_LOG_ERROR, "clCreateBuffer error '%d'\n", status ); return -1; }
#define CREATEIMAGE( out, flags, pf, width, height )\
    out = ocl->clCreateImage2D( h->opencl.context, (flags), &pf, width, height, 0, NULL, &status );\
    if( status != CL_SUCCESS ) { h->param.b_opencl = 0; x264_log( h, X264_LOG_ERROR, "clCreateImage2D error '%d'\n", status ); return -1; }

    int mb_count = h->mb.i_mb_count;
    cl_int status;

    if( !h->opencl.lowres_mv_costs )
    {
        /* Allocate shared memory buffers */
        int width = h->mb.i_mb_width * 8 * SIZEOF_PIXEL;
        int height = h->mb.i_mb_height * 8 * SIZEOF_PIXEL;

        cl_image_format pixel_format;
        pixel_format.image_channel_order = CL_R;
        pixel_format.image_channel_data_type = CL_UNSIGNED_INT32;
        CREATEIMAGE( h->opencl.weighted_luma_hpel, CL_MEM_READ_WRITE, pixel_format, width, height );

        for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
        {
            pixel_format.image_channel_order = CL_RGBA;
            pixel_format.image_channel_data_type = CL_UNSIGNED_INT8;
            CREATEIMAGE( h->opencl.weighted_scaled_images[i], CL_MEM_READ_WRITE, pixel_format, width, height );
            width >>= 1;
            height >>= 1;
        }

        CREATEBUF( h->opencl.lowres_mv_costs, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) );
        CREATEBUF( h->opencl.lowres_costs[0], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) );
        CREATEBUF( h->opencl.lowres_costs[1], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) );
        CREATEBUF( h->opencl.mv_buffers[0], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 );
        CREATEBUF( h->opencl.mv_buffers[1], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 );
        CREATEBUF( h->opencl.mvp_buffer, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 );
        CREATEBUF( h->opencl.frame_stats[0], CL_MEM_WRITE_ONLY, 4 * sizeof(int) );
        CREATEBUF( h->opencl.frame_stats[1], CL_MEM_WRITE_ONLY, 4 * sizeof(int) );
        CREATEBUF( h->opencl.row_satds[0], CL_MEM_WRITE_ONLY, h->mb.i_mb_height * sizeof(int) );
        CREATEBUF( h->opencl.row_satds[1], CL_MEM_WRITE_ONLY, h->mb.i_mb_height * sizeof(int) );
        CREATEBUF( h->opencl.luma_16x16_image[0], CL_MEM_READ_ONLY, luma_length );
        CREATEBUF( h->opencl.luma_16x16_image[1], CL_MEM_READ_ONLY, luma_length );
    }
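    /* Per-frame GPU buffers are allocated below on first use.  The MV and
     * MV-cost buffers get an (i_bframe + 1) multiplier: a frame may be
     * searched against references up to i_bframe + 1 frames away in either
     * direction, and each reference distance occupies its own slot (see the
     * mvs_offset arithmetic in x264_opencl_motionsearch()). */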
    if( !fenc->opencl.intra_cost )
    {
        /* Allocate per-frame buffers */
        int width = h->mb.i_mb_width * 8 * SIZEOF_PIXEL;
        int height = h->mb.i_mb_height * 8 * SIZEOF_PIXEL;

        cl_image_format pixel_format;
        pixel_format.image_channel_order = CL_R;
        pixel_format.image_channel_data_type = CL_UNSIGNED_INT32;
        CREATEIMAGE( fenc->opencl.luma_hpel, CL_MEM_READ_WRITE, pixel_format, width, height );

        for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
        {
            pixel_format.image_channel_order = CL_RGBA;
            pixel_format.image_channel_data_type = CL_UNSIGNED_INT8;
            CREATEIMAGE( fenc->opencl.scaled_image2Ds[i], CL_MEM_READ_WRITE, pixel_format, width, height );
            width >>= 1;
            height >>= 1;
        }
        CREATEBUF( fenc->opencl.inv_qscale_factor, CL_MEM_READ_ONLY, mb_count * sizeof(int16_t) );
        CREATEBUF( fenc->opencl.intra_cost, CL_MEM_WRITE_ONLY, mb_count * sizeof(int16_t) );
        CREATEBUF( fenc->opencl.lowres_mvs0, CL_MEM_READ_WRITE, mb_count * 2 * sizeof(int16_t) * (h->param.i_bframe + 1) );
        CREATEBUF( fenc->opencl.lowres_mvs1, CL_MEM_READ_WRITE, mb_count * 2 * sizeof(int16_t) * (h->param.i_bframe + 1) );
        CREATEBUF( fenc->opencl.lowres_mv_costs0, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * (h->param.i_bframe + 1) );
        CREATEBUF( fenc->opencl.lowres_mv_costs1, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * (h->param.i_bframe + 1) );
    }
#undef CREATEBUF
#undef CREATEIMAGE

    /* Copy image to the GPU, downscale to unpadded 8x8, then continue for all scales */

    char *locked = opencl_alloc_locked( h, luma_length );
    memcpy( locked, fenc->plane[0], luma_length );
    OCLCHECK( clEnqueueWriteBuffer, h->opencl.queue, h->opencl.luma_16x16_image[h->opencl.last_buf], CL_FALSE, 0, luma_length, locked, 0, NULL, NULL );

    size_t gdim[2];
    if( h->param.rc.i_aq_mode && fenc->i_inv_qscale_factor )
    {
        int size = h->mb.i_mb_count * sizeof(int16_t);
        locked = opencl_alloc_locked( h, size );
        memcpy( locked, fenc->i_inv_qscale_factor, size );
        OCLCHECK( clEnqueueWriteBuffer, h->opencl.queue, fenc->opencl.inv_qscale_factor, CL_FALSE, 0, size, locked, 0, NULL, NULL );
    }
    else
    {
        /* Fill fenc->opencl.inv_qscale_factor with NOP (256) */
        cl_uint arg = 0;
        int16_t value = 256;
        OCLCHECK( clSetKernelArg, h->opencl.memset_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor );
        OCLCHECK( clSetKernelArg, h->opencl.memset_kernel, arg++, sizeof(int16_t), &value );
        gdim[0] = h->mb.i_mb_count;
        OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.memset_kernel, 1, NULL, gdim, NULL, 0, NULL, NULL );
    }

    int stride = fenc->i_stride[0];
    cl_uint arg = 0;
    OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &h->opencl.luma_16x16_image[h->opencl.last_buf] );
    OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
    OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &fenc->opencl.luma_hpel );
    OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(int), &stride );
    gdim[0] = 8 * h->mb.i_mb_width;
    gdim[1] = 8 * h->mb.i_mb_height;
    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.downscale_hpel_kernel, 2, NULL, gdim, NULL, 0, NULL, NULL );
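    /* Generate the remaining scales of the image pyramid.  scaled_image2Ds[0]
     * holds the unpadded lowres plane (8x8 pixels per MB) written above; each
     * pass halves both dimensions and stops once either dimension would fall
     * below 16.  The coarse scales seed the hierarchical motion search in
     * x264_opencl_motionsearch(). */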
    for( int i = 0; i < NUM_IMAGE_SCALES - 1; i++ )
    {
        /* Workaround for AMD Southern Island:
         *
         * Alternate kernel instances.  No perf impact to this, so we do it for
         * all GPUs.  It prevents the same kernel from being enqueued
         * back-to-back, avoiding a dependency calculation bug in the driver. */
        cl_kernel kern = i & 1 ? h->opencl.downscale_kernel1 : h->opencl.downscale_kernel2;

        arg = 0;
        OCLCHECK( clSetKernelArg, kern, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[i] );
        OCLCHECK( clSetKernelArg, kern, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[i+1] );
        gdim[0] >>= 1;
        gdim[1] >>= 1;
        if( gdim[0] < 16 || gdim[1] < 16 )
            break;
        OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, kern, 2, NULL, gdim, NULL, 0, NULL, NULL );
    }

    size_t ldim[2];
    gdim[0] = ((h->mb.i_mb_width + 31)>>5)<<5;
    gdim[1] = 8*h->mb.i_mb_height;
    ldim[0] = 32;
    ldim[1] = 8;
    arg = 0;

    /* For presets slow, slower, and placebo, check all 10 intra modes that the
     * C lookahead supports.  For faster presets, only check the most frequent 8
     * modes */
    int slow = h->param.analyse.i_subpel_refine > 7;
    OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
    OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.intra_cost );
    OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
    OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(int), &lambda );
    OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
    OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(int), &slow );
    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.intra_kernel, 2, NULL, gdim, ldim, 0, NULL, NULL );

    gdim[0] = 256;
    gdim[1] = h->mb.i_mb_height;
    ldim[0] = 256;
    ldim[1] = 1;
    arg = 0;
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.intra_cost );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &h->opencl.row_satds[h->opencl.last_buf] );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.rowsum_intra_kernel, 2, NULL, gdim, ldim, 0, NULL, NULL );

    if( h->opencl.num_copies >= MAX_FINISH_COPIES - 4 )
        x264_opencl_flush( h );
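    /* Readback pattern used for all results in this file: enqueue a
     * non-blocking read (CL_FALSE) into the page-locked staging buffer and
     * record a pending copy.  x264_opencl_flush() later calls clFinish() and
     * memcpy()s each record to its final destination, so the command queue is
     * never stalled by a blocking read mid-frame. */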
    int size = h->mb.i_mb_count * sizeof(int16_t);
    locked = opencl_alloc_locked( h, size );
    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, fenc->opencl.intra_cost, CL_FALSE, 0, size, locked, 0, NULL, NULL );
    h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_costs[0][0];
    h->opencl.copies[h->opencl.num_copies].src = locked;
    h->opencl.copies[h->opencl.num_copies].bytes = size;
    h->opencl.num_copies++;

    size = h->mb.i_mb_height * sizeof(int);
    locked = opencl_alloc_locked( h, size );
    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.row_satds[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
    h->opencl.copies[h->opencl.num_copies].dest = fenc->i_row_satds[0][0];
    h->opencl.copies[h->opencl.num_copies].src = locked;
    h->opencl.copies[h->opencl.num_copies].bytes = size;
    h->opencl.num_copies++;

    size = sizeof(int) * 4;
    locked = opencl_alloc_locked( h, size );
    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.frame_stats[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
    h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est[0][0];
    h->opencl.copies[h->opencl.num_copies].src = locked;
    h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
    h->opencl.num_copies++;
    h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est_aq[0][0];
    h->opencl.copies[h->opencl.num_copies].src = locked + sizeof(int);
    h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
    h->opencl.num_copies++;

    h->opencl.last_buf = !h->opencl.last_buf;
    return 0;
}

/* This function was tested empirically on a number of AMD and NV GPUs.  Making a
 * function which returns perfect launch dimensions is impossible; some
 * applications will have self-tuning code to try many possible variables and
 * measure the runtime.  Here we simply make an educated guess based on what we
 * know GPUs typically prefer. */
static void optimal_launch_dims( x264_t *h, size_t *gdims, size_t *ldims, const cl_kernel kernel, const cl_device_id device )
{
    x264_opencl_function_t *ocl = h->opencl.ocl;
    size_t max_work_group = 256;    /* reasonable defaults for OpenCL 1.0 devices, below APIs may fail */
    size_t preferred_multiple = 64;
    cl_uint num_cus = 6;

    ocl->clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &max_work_group, NULL );
    ocl->clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &preferred_multiple, NULL );
    ocl->clGetDeviceInfo( device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &num_cus, NULL );

    ldims[0] = preferred_multiple;
    ldims[1] = 8;

    /* make ldims[1] an even divisor of gdims[1] */
    while( gdims[1] & (ldims[1] - 1) )
    {
        ldims[0] <<= 1;
        ldims[1] >>= 1;
    }

    /* make total ldims fit under the max work-group dimensions for the device */
    while( ldims[0] * ldims[1] > max_work_group )
    {
        if( (ldims[0] <= preferred_multiple) && (ldims[1] > 1) )
            ldims[1] >>= 1;
        else
            ldims[0] >>= 1;
    }

    if( ldims[0] > gdims[0] )
    {
        /* remove preferred multiples until we're close to gdims[0] */
        while( gdims[0] + preferred_multiple < ldims[0] )
            ldims[0] -= preferred_multiple;
        gdims[0] = ldims[0];
    }
    else
    {
        /* make gdims an even multiple of ldims */
        gdims[0] = (gdims[0]+ldims[0]-1)/ldims[0];
        gdims[0] *= ldims[0];
    }

    /* make ldims smaller to spread work across compute units */
    while( (gdims[0]/ldims[0]) * (gdims[1]/ldims[1]) * 2 <= num_cus )
    {
        if( ldims[0] > preferred_multiple )
            ldims[0] >>= 1;
        else if( ldims[1] > 1 )
            ldims[1] >>= 1;
        else
            break;
    }

    /* for smaller GPUs, try not to abuse their texture cache */
    if( num_cus == 6 && ldims[0] == 64 && ldims[1] == 4 )
        ldims[0] = 32;
}
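/* A worked example of the heuristic above (illustrative numbers, not taken
 * from any particular device): a 1920x1088 frame has 120x68 MBs, so the HME
 * kernel below requests gdims = { 480, 68 } (4 threads per MB).  With
 * preferred_multiple = 64 and max_work_group = 256:
 *   - start at ldims = { 64, 8 }; 68 is not a multiple of 8, so trade a
 *     factor of 2 between dimensions: ldims = { 128, 4 } (68 divides by 4);
 *   - 128 * 4 = 512 > 256, so halve dimension 0: ldims = { 64, 4 };
 *   - gdims[0] = 480 rounds up to 512, an even multiple of ldims[0].
 * The result is 8 * 17 = 136 work-groups, more than enough to occupy the
 * compute units (on a small 6-CU device the final tweak would further cut
 * ldims[0] to 32). */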
int x264_opencl_motionsearch( x264_t *h, x264_frame_t **frames, int b, int ref, int b_islist1, int lambda, const x264_weight_t *w )
{
    x264_opencl_function_t *ocl = h->opencl.ocl;
    x264_frame_t *fenc = frames[b];
    x264_frame_t *fref = frames[ref];

    cl_mem ref_scaled_images[NUM_IMAGE_SCALES];
    cl_mem ref_luma_hpel;
    cl_int status;

    if( w && w->weightfn )
    {
        size_t gdims[2];

        gdims[0] = 8 * h->mb.i_mb_width;
        gdims[1] = 8 * h->mb.i_mb_height;

        /* WeightP: Perform a filter on fref->opencl.scaled_image2Ds[] and fref->opencl.luma_hpel */
        for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
        {
            cl_uint arg = 0;
            OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(cl_mem), &fref->opencl.scaled_image2Ds[i] );
            OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(cl_mem), &h->opencl.weighted_scaled_images[i] );
            OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(int32_t), &w->i_offset );
            OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(int32_t), &w->i_scale );
            OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(int32_t), &w->i_denom );
            OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.weightp_scaled_images_kernel, 2, NULL, gdims, NULL, 0, NULL, NULL );

            gdims[0] >>= 1;
            gdims[1] >>= 1;
            if( gdims[0] < 16 || gdims[1] < 16 )
                break;
        }

        cl_uint arg = 0;
        gdims[0] = 8 * h->mb.i_mb_width;
        gdims[1] = 8 * h->mb.i_mb_height;

        OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(cl_mem), &fref->opencl.luma_hpel );
        OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(cl_mem), &h->opencl.weighted_luma_hpel );
        OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(int32_t), &w->i_offset );
        OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(int32_t), &w->i_scale );
        OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(int32_t), &w->i_denom );
        OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.weightp_hpel_kernel, 2, NULL, gdims, NULL, 0, NULL, NULL );

        /* Use weighted reference planes for motion search */
        for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
            ref_scaled_images[i] = h->opencl.weighted_scaled_images[i];
        ref_luma_hpel = h->opencl.weighted_luma_hpel;
    }
    else
    {
        /* Use unweighted reference planes for motion search */
        for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
            ref_scaled_images[i] = fref->opencl.scaled_image2Ds[i];
        ref_luma_hpel = fref->opencl.luma_hpel;
    }

    const int num_iterations[NUM_IMAGE_SCALES] = { 1, 1, 2, 3 };
    int b_first_iteration = 1;
    int b_reverse_references = 1;
    int A = 1;

    int mb_per_group = 0;
    int cost_local_size = 0;
    int mvc_local_size = 0;
    int mb_width;

    size_t gdims[2];
    size_t ldims[2];

    /* scale 0 is 8x8 */
    for( int scale = NUM_IMAGE_SCALES-1; scale >= 0; scale-- )
    {
        mb_width = h->mb.i_mb_width >> scale;
        gdims[0] = mb_width;
        gdims[1] = h->mb.i_mb_height >> scale;
        if( gdims[0] < 2 || gdims[1] < 2 )
            continue;
        gdims[0] <<= 2;
        optimal_launch_dims( h, gdims, ldims, h->opencl.hme_kernel, h->opencl.device );

        mb_per_group = (ldims[0] >> 2) * ldims[1];
        cost_local_size = 4 * mb_per_group * sizeof(int16_t);
        mvc_local_size = 4 * mb_per_group * sizeof(int16_t) * 2;
        int scaled_me_range = h->param.analyse.i_me_range >> scale;

        int b_shift_index = 1;
        cl_uint arg = 0;
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[scale] );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &ref_scaled_images[scale] );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &h->opencl.mv_buffers[A] );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &h->opencl.mv_buffers[!A] );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_mv_costs );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), (void*)&h->opencl.mvp_buffer );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, cost_local_size, NULL );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, mvc_local_size, NULL );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &mb_width );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &lambda );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &scaled_me_range );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &scale );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &b_shift_index );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &b_first_iteration );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &b_reverse_references );
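        /* Run the search at this scale.  The two mv_buffers are used in
         * ping-pong fashion: each pass consumes vectors from one buffer and
         * emits refined vectors into the other, then the roles swap (A = !A).
         * b_shift_index is set only for the first pass at each scale,
         * presumably directing the kernel to rescale the vectors inherited
         * from the coarser scale into this scale's coordinates. */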
        for( int iter = 0; iter < num_iterations[scale]; iter++ )
        {
            OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.hme_kernel, 2, NULL, gdims, ldims, 0, NULL, NULL );

            b_shift_index = 0;
            b_first_iteration = 0;

            /* alternate top-left vs bot-right MB references at lower scales, so
             * motion field smooths more quickly. */
            if( scale > 2 )
                b_reverse_references ^= 1;
            else
                b_reverse_references = 0;
            A = !A;

            OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, 2, sizeof(cl_mem), &h->opencl.mv_buffers[A] );
            OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, 3, sizeof(cl_mem), &h->opencl.mv_buffers[!A] );
            OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg - 3, sizeof(int), &b_shift_index );
            OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg - 2, sizeof(int), &b_first_iteration );
            OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg - 1, sizeof(int), &b_reverse_references );
        }
    }

    int satd_local_size = mb_per_group * sizeof(uint32_t) * 16;
    cl_uint arg = 0;
    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &ref_luma_hpel );
    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &h->opencl.mv_buffers[A] );
    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_mv_costs );
    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, cost_local_size, NULL );
    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, satd_local_size, NULL );
    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, mvc_local_size, NULL );

    if( b_islist1 )
    {
        OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs1 );
        OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs1 );
    }
    else
    {
        OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs0 );
        OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs0 );
    }

    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &mb_width );
    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &lambda );
    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &b );
    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &ref );
    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &b_islist1 );

    if( h->opencl.b_device_AMD_SI )
    {
        /* workaround for AMD Southern Island driver scheduling bug (fixed in
         * July 2012), perform meaningless small copy to add a data dependency */
        OCLCHECK( clEnqueueCopyBuffer, h->opencl.queue, h->opencl.mv_buffers[A], h->opencl.mv_buffers[!A], 0, 0, 20, 0, NULL, NULL );
    }

    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.subpel_refine_kernel, 2, NULL, gdims, ldims, 0, NULL, NULL );
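    /* Queue an asynchronous readback of the refined vectors.  The GPU-side
     * lowres_mvs0/1 buffers pack one full MV field per reference distance, so
     * the byte offset mvlen * (distance - 1) selects this (b, ref) pair's
     * slot, matching the CPU-side fenc->lowres_mvs[list][distance - 1]
     * indexing. */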
    int mvlen = 2 * sizeof(int16_t) * h->mb.i_mb_count;

    if( h->opencl.num_copies >= MAX_FINISH_COPIES - 1 )
        x264_opencl_flush( h );

    char *locked = opencl_alloc_locked( h, mvlen );
    h->opencl.copies[h->opencl.num_copies].src = locked;
    h->opencl.copies[h->opencl.num_copies].bytes = mvlen;

    if( b_islist1 )
    {
        int mvs_offset = mvlen * (ref - b - 1);
        OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, fenc->opencl.lowres_mvs1, CL_FALSE, mvs_offset, mvlen, locked, 0, NULL, NULL );
        h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_mvs[1][ref - b - 1];
    }
    else
    {
        int mvs_offset = mvlen * (b - ref - 1);
        OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, fenc->opencl.lowres_mvs0, CL_FALSE, mvs_offset, mvlen, locked, 0, NULL, NULL );
        h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_mvs[0][b - ref - 1];
    }

    h->opencl.num_copies++;

    return 0;
}

int x264_opencl_finalize_cost( x264_t *h, int lambda, x264_frame_t **frames, int p0, int p1, int b, int dist_scale_factor )
{
    x264_opencl_function_t *ocl = h->opencl.ocl;
    cl_int status;
    x264_frame_t *fenc = frames[b];
    x264_frame_t *fref0 = frames[p0];
    x264_frame_t *fref1 = frames[p1];

    int bipred_weight = h->param.analyse.b_weighted_bipred ? 64 - (dist_scale_factor >> 2) : 32;

    /* Tasks for this kernel:
     * 1. Select least cost mode (intra, ref0, ref1)
     *    list_used 0, 1, 2, or 3.  if B frame, do not allow intra
     * 2. if B frame, try bidir predictions.
     * 3. lowres_costs[i_mb_xy] = X264_MIN( bcost, LOWRES_COST_MASK ) + (list_used << LOWRES_COST_SHIFT); */
    size_t gdims[2] = { h->mb.i_mb_width, h->mb.i_mb_height };
    size_t ldim_bidir[2];
    size_t *ldims = NULL;
    int cost_local_size = 4;
    int satd_local_size = 4;
    if( b < p1 )
    {
        /* For B frames, use 4 threads per MB for BIDIR checks */
        ldims = ldim_bidir;
        gdims[0] <<= 2;
        optimal_launch_dims( h, gdims, ldims, h->opencl.mode_select_kernel, h->opencl.device );
        int mb_per_group = (ldims[0] >> 2) * ldims[1];
        cost_local_size = 4 * mb_per_group * sizeof(int16_t);
        satd_local_size = 16 * mb_per_group * sizeof(uint32_t);
    }
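    /* A note on bipred_weight computed above: with weighted biprediction
     * enabled, this mirrors the H.264 implicit weighting the C lookahead
     * uses.  dist_scale_factor is approximately 256*(b-p0)/(p1-p0) (see
     * x264_opencl_precalculate_frame_cost() below), so dist_scale_factor >> 2
     * acts as the list-1 weight out of 64, leaving 64 minus that for list 0;
     * without weighting, both references weigh equally (32/64). */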
    cl_uint arg = 0;
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fref0->opencl.luma_hpel );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fref1->opencl.luma_hpel );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs0 );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs1 );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fref1->opencl.lowres_mvs0 );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs0 );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs1 );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.intra_cost );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_costs[h->opencl.last_buf] );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, cost_local_size, NULL );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, satd_local_size, NULL );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &bipred_weight );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &dist_scale_factor );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &b );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &p0 );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &p1 );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &lambda );
    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.mode_select_kernel, 2, NULL, gdims, ldims, 0, NULL, NULL );

    /* Sum costs across rows, atomicAdd down frame */
    size_t gdim[2] = { 256, h->mb.i_mb_height };
    size_t ldim[2] = { 256, 1 };
    arg = 0;
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_costs[h->opencl.last_buf] );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &h->opencl.row_satds[h->opencl.last_buf] );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &h->param.i_bframe_bias );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &b );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &p0 );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &p1 );
    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.rowsum_inter_kernel, 2, NULL, gdim, ldim, 0, NULL, NULL );

    if( h->opencl.num_copies >= MAX_FINISH_COPIES - 4 )
        x264_opencl_flush( h );

    int size = h->mb.i_mb_count * sizeof(int16_t);
    char *locked = opencl_alloc_locked( h, size );
    h->opencl.copies[h->opencl.num_copies].src = locked;
    h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_costs[b - p0][p1 - b];
    h->opencl.copies[h->opencl.num_copies].bytes = size;
    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.lowres_costs[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
    h->opencl.num_copies++;

    size = h->mb.i_mb_height * sizeof(int);
    locked = opencl_alloc_locked( h, size );
    h->opencl.copies[h->opencl.num_copies].src = locked;
    h->opencl.copies[h->opencl.num_copies].dest = fenc->i_row_satds[b - p0][p1 - b];
    h->opencl.copies[h->opencl.num_copies].bytes = size;
    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.row_satds[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
    h->opencl.num_copies++;

    size = 4 * sizeof(int);
    locked = opencl_alloc_locked( h, size );
    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.frame_stats[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
    h->opencl.last_buf = !h->opencl.last_buf;

    /* The frame_stats buffer appears to be laid out as four ints:
     * [0] = total cost estimate, [1] = AQ-weighted cost estimate,
     * [2] = intra MB count (meaningful only for P frames); the copies
     * below unpack it accordingly. */
    h->opencl.copies[h->opencl.num_copies].src = locked;
    h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est[b - p0][p1 - b];
    h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
    h->opencl.num_copies++;

    h->opencl.copies[h->opencl.num_copies].src = locked + sizeof(int);
    h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est_aq[b - p0][p1 - b];
    h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
    h->opencl.num_copies++;

    if( b == p1 ) // P frames only
    {
        h->opencl.copies[h->opencl.num_copies].src = locked + 2 * sizeof(int);
        h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_intra_mbs[b - p0];
        h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
        h->opencl.num_copies++;
    }

    return 0;
}
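/* x264_opencl_slicetype_prep() primes the GPU before slicetype decisions:
 * intra costs are computed for every frame in the lookahead window, and with
 * trellis B-adapt every forward/backward lowres motion search the trellis
 * might consult is enqueued up front, so later frame-cost queries are largely
 * served from precomputed results. */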
void x264_opencl_slicetype_prep( x264_t *h, x264_frame_t **frames, int num_frames, int lambda )
{
    if( h->param.b_opencl )
    {
#ifdef _WIN32
        /* Temporarily boost priority of this lookahead thread and the OpenCL
         * driver's thread until the end of this function.  On AMD GPUs this
         * greatly reduces the latency of enqueuing kernels and getting results
         * on Windows. */
        HANDLE id = GetCurrentThread();
        h->opencl.lookahead_thread_pri = GetThreadPriority( id );
        SetThreadPriority( id, THREAD_PRIORITY_ABOVE_NORMAL );
        x264_opencl_function_t *ocl = h->opencl.ocl;
        cl_int status = ocl->clGetCommandQueueInfo( h->opencl.queue, CL_QUEUE_THREAD_HANDLE_AMD, sizeof(HANDLE), &id, NULL );
        if( status == CL_SUCCESS )
        {
            h->opencl.opencl_thread_pri = GetThreadPriority( id );
            SetThreadPriority( id, THREAD_PRIORITY_ABOVE_NORMAL );
        }
#endif

        /* precalculate intra and I frames */
        for( int i = 0; i <= num_frames; i++ )
            x264_opencl_lowres_init( h, frames[i], lambda );
        x264_opencl_flush( h );

        if( h->param.i_bframe_adaptive == X264_B_ADAPT_TRELLIS && h->param.i_bframe )
        {
            /* For trellis B-Adapt, precompute exhaustive motion searches */
            for( int b = 0; b <= num_frames; b++ )
            {
                for( int j = 1; j < h->param.i_bframe; j++ )
                {
                    int p0 = b - j;
                    if( p0 >= 0 && frames[b]->lowres_mvs[0][b-p0-1][0][0] == 0x7FFF )
                    {
                        const x264_weight_t *w = x264_weight_none;

                        if( h->param.analyse.i_weighted_pred )
                        {
                            x264_emms();
                            x264_weights_analyse( h, frames[b], frames[p0], 1 );
                            w = frames[b]->weight[0];
                        }
                        frames[b]->lowres_mvs[0][b-p0-1][0][0] = 0;
                        x264_opencl_motionsearch( h, frames, b, p0, 0, lambda, w );
                    }
                    int p1 = b + j;
                    if( p1 <= num_frames && frames[b]->lowres_mvs[1][p1-b-1][0][0] == 0x7FFF )
                    {
                        frames[b]->lowres_mvs[1][p1-b-1][0][0] = 0;
                        x264_opencl_motionsearch( h, frames, b, p1, 1, lambda, NULL );
                    }
                }
            }

            x264_opencl_flush( h );
        }
    }
}

void x264_opencl_slicetype_end( x264_t *h )
{
#ifdef _WIN32
    if( h->param.b_opencl )
    {
        HANDLE id = GetCurrentThread();
        SetThreadPriority( id, h->opencl.lookahead_thread_pri );

        x264_opencl_function_t *ocl = h->opencl.ocl;
        cl_int status = ocl->clGetCommandQueueInfo( h->opencl.queue, CL_QUEUE_THREAD_HANDLE_AMD, sizeof(HANDLE), &id, NULL );
        if( status == CL_SUCCESS )
            SetThreadPriority( id, h->opencl.opencl_thread_pri );
    }
#endif
}

int x264_opencl_precalculate_frame_cost( x264_t *h, x264_frame_t **frames, int lambda, int p0, int p1, int b )
{
    if( (frames[b]->i_cost_est[b-p0][p1-b] >= 0) || (b == p0 && b == p1) )
        return 0;
    else
    {
        int do_search[2];
        int dist_scale_factor = 128;
        const x264_weight_t *w = x264_weight_none;

        // avoid duplicating work
        frames[b]->i_cost_est[b-p0][p1-b] = 0;

        do_search[0] = b != p0 && frames[b]->lowres_mvs[0][b-p0-1][0][0] == 0x7FFF;
        do_search[1] = b != p1 && frames[b]->lowres_mvs[1][p1-b-1][0][0] == 0x7FFF;
        if( do_search[0] )
        {
            if( h->param.analyse.i_weighted_pred && b == p1 )
            {
                x264_emms();
                x264_weights_analyse( h, frames[b], frames[p0], 1 );
                w = frames[b]->weight[0];
            }
            frames[b]->lowres_mvs[0][b-p0-1][0][0] = 0;
        }
        if( do_search[1] )
            frames[b]->lowres_mvs[1][p1-b-1][0][0] = 0;

        if( b == p1 )
            frames[b]->i_intra_mbs[b-p0] = 0;
        if( p1 != p0 )
            dist_scale_factor = ( ((b-p0) << 8) + ((p1-p0) >> 1) ) / (p1-p0);

        frames[b]->i_cost_est[b-p0][p1-b] = 0;
        frames[b]->i_cost_est_aq[b-p0][p1-b] = 0;

        x264_opencl_lowres_init( h, frames[b], lambda );

        if( do_search[0] )
        {
            x264_opencl_lowres_init( h, frames[p0], lambda );
            x264_opencl_motionsearch( h, frames, b, p0, 0, lambda, w );
        }
        if( do_search[1] )
        {
            x264_opencl_lowres_init( h, frames[p1], lambda );
            x264_opencl_motionsearch( h, frames, b, p1, 1, lambda, NULL );
        }
        x264_opencl_finalize_cost( h, lambda, frames, p0, p1, b, dist_scale_factor );
        return 1;
    }
}

#endif