OSDN Git Service

Use single GPE context for the optimization for 8bit/10bit scaling/CSC
author: Xiang, Haihao <haihao.xiang@intel.com>
Fri, 4 Aug 2017 01:30:45 +0000 (09:30 +0800)
committer: Xiang, Haihao <haihao.xiang@intel.com>
Thu, 17 Aug 2017 04:09:31 +0000 (12:09 +0800)
The interface description table can be used for multiple kernels, and we can
use the interface offset to specify which kernel is used

Signed-off-by: Xiang, Haihao <haihao.xiang@intel.com>
src/gen75_picture_process.c
src/gen8_post_processing.c
src/gen9_post_processing.c
src/i965_post_processing.h
src/intel_common_vpp_internal.c

index 9ee23ba..5a9ce49 100644 (file)
@@ -294,7 +294,7 @@ gen75_proc_picture(VADriverContextP ctx,
         gpe_proc_ctx = (struct i965_proc_context *)proc_ctx->vpp_fmt_cvt_ctx;
         assert(gpe_proc_ctx != NULL); // gpe_proc_ctx must be a non-NULL pointer
 
-        if ((gpe_proc_ctx->pp_context.scaling_8bit_initialized & VPPGPE_8BIT_420) &&
+        if ((gpe_proc_ctx->pp_context.scaling_gpe_context_initialized & VPPGPE_8BIT_8BIT) &&
             (obj_dst_surf->fourcc == VA_FOURCC_NV12) &&
             pipeline_param->output_background_color)
             gen8plus_vpp_clear_surface(ctx,
index 09cc876..033b50e 100644 (file)
@@ -1527,14 +1527,9 @@ static void
 gen8_post_processing_context_finalize(VADriverContextP ctx,
                                       struct i965_post_processing_context *pp_context)
 {
-    if (pp_context->scaling_context_initialized) {
-        gen8_gpe_context_destroy(&pp_context->scaling_10bit_context);
-        pp_context->scaling_context_initialized = 0;
-    }
-
-    if (pp_context->scaling_8bit_initialized & VPPGPE_8BIT_420) {
-        gen8_gpe_context_destroy(&pp_context->scaling_yuv420p8_context);
-        pp_context->scaling_8bit_initialized &= ~(VPPGPE_8BIT_420);
+    if (pp_context->scaling_gpe_context_initialized) {
+        gen8_gpe_context_destroy(&pp_context->scaling_gpe_context);
+        pp_context->scaling_gpe_context_initialized = 0;
     }
 
     if (pp_context->vebox_proc_ctx) {
@@ -1677,7 +1672,7 @@ gen8_post_processing_context_init(VADriverContextP ctx,
      * I420 ->I420
      * I420 ->NV12
      */
-    gpe_context = &pp_context->scaling_yuv420p8_context;
+    gpe_context = &pp_context->scaling_gpe_context;
     memset(&scaling_kernel, 0, sizeof(scaling_kernel));
     scaling_kernel.bin = pp_yuv420p8_scaling_gen8;
     scaling_kernel.size = sizeof(pp_yuv420p8_scaling_gen8);
@@ -1712,7 +1707,8 @@ gen8_post_processing_context_init(VADriverContextP ctx,
     gpe_context->vfe_state.gpgpu_mode = 0;
 
     gen8_gpe_context_init(ctx, gpe_context);
-    pp_context->scaling_8bit_initialized = VPPGPE_8BIT_420;
+    pp_context->scaling_gpe_context_initialized |= VPPGPE_8BIT_8BIT;
+
     return;
 }
 
@@ -2078,10 +2074,10 @@ gen8_yuv420p8_scaling_post_processing(
     if (!pp_context || !src_surface || !src_rect || !dst_surface || !dst_rect)
         return VA_STATUS_ERROR_INVALID_PARAMETER;
 
-    if (!(pp_context->scaling_8bit_initialized & VPPGPE_8BIT_420))
+    if (!(pp_context->scaling_gpe_context_initialized & VPPGPE_8BIT_8BIT))
         return VA_STATUS_ERROR_UNIMPLEMENTED;
 
-    gpe_context = &pp_context->scaling_yuv420p8_context;
+    gpe_context = &pp_context->scaling_gpe_context;
 
     gen8_gpe_context_init(ctx, gpe_context);
     gen8_vpp_scaling_sample_state(ctx, gpe_context, src_rect, dst_rect);
@@ -2102,7 +2098,7 @@ gen8_yuv420p8_scaling_post_processing(
     kernel_walker_param.no_dependency = 1;
 
     intel_vpp_init_media_object_walker_parameter(&kernel_walker_param, &media_object_walker_param);
-
+    media_object_walker_param.interface_offset = 0;
     gen8_run_kernel_media_object_walker(ctx, pp_context->batch,
                                         gpe_context,
                                         &media_object_walker_param);
index bf7fd0e..baae0d0 100644 (file)
@@ -118,6 +118,24 @@ static const uint32_t pp_yuv420p8_scaling_gen9[][4] = {
 #include "shaders/post_processing/gen9/conv_nv12.g9b"
 };
 
+struct i965_kernel pp_common_scaling_gen9[] = {
+    {
+        "10bit to 10bit",
+        0,
+        pp_10bit_scaling_gen9,
+        sizeof(pp_10bit_scaling_gen9),
+        NULL,
+    },
+
+    {
+        "8bit to 8bit",
+        1,
+        pp_yuv420p8_scaling_gen9,
+        sizeof(pp_yuv420p8_scaling_gen9),
+        NULL,
+    },
+};
+
 static struct pp_module pp_modules_gen9[] = {
     {
         {
@@ -499,20 +517,16 @@ gen9_post_processing_context_init(VADriverContextP ctx,
     struct i965_driver_data *i965 = i965_driver_data(ctx);
     struct i965_post_processing_context *pp_context = data;
     struct i965_gpe_context *gpe_context;
-    struct i965_kernel scaling_kernel;
 
     gen8_post_processing_context_common_init(ctx, data, pp_modules_gen9, ARRAY_ELEMS(pp_modules_gen9), batch);
     avs_init_state(&pp_context->pp_avs_context.state, &gen9_avs_config);
 
     pp_context->intel_post_processing = gen9_post_processing;
 
-    gpe_context = &pp_context->scaling_10bit_context;
-    memset(&scaling_kernel, 0, sizeof(scaling_kernel));
-    scaling_kernel.bin = pp_10bit_scaling_gen9;
-    scaling_kernel.size = sizeof(pp_10bit_scaling_gen9);
-    gen8_gpe_load_kernels(ctx, gpe_context, &scaling_kernel, 1);
+    gpe_context = &pp_context->scaling_gpe_context;
+    gen8_gpe_load_kernels(ctx, gpe_context, pp_common_scaling_gen9, ARRAY_ELEMS(pp_common_scaling_gen9));
     gpe_context->idrt.entry_size = ALIGN(sizeof(struct gen8_interface_descriptor_data), 64);
-    gpe_context->idrt.max_entries = 1;
+    gpe_context->idrt.max_entries = ALIGN(ARRAY_ELEMS(pp_common_scaling_gen9), 2);
     gpe_context->sampler.entry_size = ALIGN(sizeof(struct gen8_sampler_state), 64);
     gpe_context->sampler.max_entries = 1;
     gpe_context->curbe.length = ALIGN(sizeof(struct scaling_input_parameter), 64);
@@ -537,46 +551,8 @@ gen9_post_processing_context_init(VADriverContextP ctx,
     gpe_context->vfe_state.gpgpu_mode = 0;
 
     gen8_gpe_context_init(ctx, gpe_context);
-    pp_context->scaling_context_initialized = 1;
-
-    /* initialize the YUV420 8-Bit scaling context. The below is supported.
-     * NV12 ->NV12
-     * NV12 ->I420
-     * I420 ->I420
-     * I420 ->NV12
-     */
-    gpe_context = &pp_context->scaling_yuv420p8_context;
-    memset(&scaling_kernel, 0, sizeof(scaling_kernel));
-    scaling_kernel.bin = pp_yuv420p8_scaling_gen9;
-    scaling_kernel.size = sizeof(pp_yuv420p8_scaling_gen9);
-    gen8_gpe_load_kernels(ctx, gpe_context, &scaling_kernel, 1);
-    gpe_context->idrt.entry_size = ALIGN(sizeof(struct gen8_interface_descriptor_data), 64);
-    gpe_context->idrt.max_entries = 1;
-    gpe_context->sampler.entry_size = ALIGN(sizeof(struct gen8_sampler_state), 64);
-    gpe_context->sampler.max_entries = 1;
-    gpe_context->curbe.length = ALIGN(sizeof(struct scaling_input_parameter), 32);
-
-    gpe_context->surface_state_binding_table.max_entries = MAX_SCALING_SURFACES;
-    gpe_context->surface_state_binding_table.binding_table_offset = 0;
-    gpe_context->surface_state_binding_table.surface_state_offset = ALIGN(MAX_SCALING_SURFACES * 4, 64);
-    gpe_context->surface_state_binding_table.length = ALIGN(MAX_SCALING_SURFACES * 4, 64) + ALIGN(MAX_SCALING_SURFACES * SURFACE_STATE_PADDED_SIZE_GEN9, 64);
-
-    if (i965->intel.eu_total > 0) {
-        gpe_context->vfe_state.max_num_threads = i965->intel.eu_total * 6;
-    } else {
-        if (i965->intel.has_bsd2)
-            gpe_context->vfe_state.max_num_threads = 300;
-        else
-            gpe_context->vfe_state.max_num_threads = 60;
-    }
+    pp_context->scaling_gpe_context_initialized |= (VPPGPE_8BIT_8BIT | VPPGPE_10BIT_10BIT);
 
-    gpe_context->vfe_state.curbe_allocation_size = 37;
-    gpe_context->vfe_state.urb_entry_size = 16;
-    gpe_context->vfe_state.num_urb_entries = 127;
-    gpe_context->vfe_state.gpgpu_mode = 0;
-
-    gen8_gpe_context_init(ctx, gpe_context);
-    pp_context->scaling_8bit_initialized = VPPGPE_8BIT_420;
     return;
 }
 
@@ -909,10 +885,10 @@ gen9_p010_scaling_post_processing(
     if (!pp_context || !src_surface || !src_rect || !dst_surface || !dst_rect)
         return VA_STATUS_ERROR_INVALID_PARAMETER;
 
-    if (!pp_context->scaling_context_initialized)
+    if (!(pp_context->scaling_gpe_context_initialized & VPPGPE_10BIT_10BIT))
         return VA_STATUS_ERROR_UNIMPLEMENTED;
 
-    gpe_context = &pp_context->scaling_10bit_context;
+    gpe_context = &pp_context->scaling_gpe_context;
 
     gen8_gpe_context_init(ctx, gpe_context);
     gen9_vpp_scaling_sample_state(ctx, gpe_context, src_rect, dst_rect);
@@ -933,7 +909,7 @@ gen9_p010_scaling_post_processing(
     kernel_walker_param.no_dependency = 1;
 
     intel_vpp_init_media_object_walker_parameter(&kernel_walker_param, &media_object_walker_param);
-
+    media_object_walker_param.interface_offset = 0;
     gen9_run_kernel_media_object_walker(ctx, pp_context->batch,
                                         gpe_context,
                                         &media_object_walker_param);
@@ -1134,10 +1110,10 @@ gen9_yuv420p8_scaling_post_processing(
     if (!pp_context || !src_surface || !src_rect || !dst_surface || !dst_rect)
         return VA_STATUS_ERROR_INVALID_PARAMETER;
 
-    if (!(pp_context->scaling_8bit_initialized & VPPGPE_8BIT_420))
+    if (!(pp_context->scaling_gpe_context_initialized & VPPGPE_8BIT_8BIT))
         return VA_STATUS_ERROR_UNIMPLEMENTED;
 
-    gpe_context = &pp_context->scaling_yuv420p8_context;
+    gpe_context = &pp_context->scaling_gpe_context;
 
     gen8_gpe_context_init(ctx, gpe_context);
     gen9_vpp_scaling_sample_state(ctx, gpe_context, src_rect, dst_rect);
@@ -1158,7 +1134,7 @@ gen9_yuv420p8_scaling_post_processing(
     kernel_walker_param.no_dependency = 1;
 
     intel_vpp_init_media_object_walker_parameter(&kernel_walker_param, &media_object_walker_param);
-
+    media_object_walker_param.interface_offset = 1;
     gen9_run_kernel_media_object_walker(ctx, pp_context->batch,
                                         gpe_context,
                                         &media_object_walker_param);
index fa2337d..4f16a3c 100644 (file)
@@ -586,13 +586,14 @@ struct i965_post_processing_context {
                      struct i965_post_processing_context *pp_context);
 
 
-    struct i965_gpe_context scaling_10bit_context;
-    int scaling_context_initialized;
-    struct i965_gpe_context scaling_yuv420p8_context;
-#define VPPGPE_8BIT_420    (1 << 0)
-#define VPPGPE_8BIT_422    (1 << 1)
-#define VPPGPE_8BIT_444    (1 << 2)
-    unsigned int scaling_8bit_initialized;
+    struct i965_gpe_context scaling_gpe_context;
+
+#define VPPGPE_8BIT_8BIT        (1 << 0)
+#define VPPGPE_8BIT_10BIT       (1 << 1)
+#define VPPGPE_10BIT_10BIT      (1 << 2)
+#define VPPGPE_10BIT_8BIT       (1 << 3)
+
+    unsigned int scaling_gpe_context_initialized;
 };
 
 struct i965_proc_context {
index c6b4565..33edee9 100644 (file)
@@ -132,7 +132,7 @@ intel_common_scaling_post_processing(VADriverContextP ctx,
         scale_flag = 0;
 
     if (((scale_flag & MASK_CSC) == SCALE_10BIT_10BIT_420) &&
-        pp_context->scaling_context_initialized) {
+        (pp_context->scaling_gpe_context_initialized & VPPGPE_10BIT_10BIT)) {
         unsigned int tmp_width, tmp_x;
 
         tmp_x = ALIGN_FLOOR(dst_rect->x, 2);
@@ -148,7 +148,7 @@ intel_common_scaling_post_processing(VADriverContextP ctx,
     }
 
     if (((scale_flag & MASK_CSC) == SCALE_8BIT_8BIT_420) &&
-        (pp_context->scaling_8bit_initialized & VPPGPE_8BIT_420)) {
+        (pp_context->scaling_gpe_context_initialized & VPPGPE_8BIT_8BIT)) {
 
         tmp_x = ALIGN_FLOOR(dst_rect->x, 4);
         tmp_width = dst_rect->x + dst_rect->width - tmp_x;