Use a single GPE context for the optimized 8-bit/10-bit scaling/CSC

author Xiang, Haihao <haihao.xiang@intel.com>

Fri, 4 Aug 2017 01:30:45 +0000 (09:30 +0800)

committer Xiang, Haihao <haihao.xiang@intel.com>

Thu, 17 Aug 2017 04:09:31 +0000 (12:09 +0800)
author Xiang, Haihao <haihao.xiang@intel.com>
Fri, 4 Aug 2017 01:30:45 +0000 (09:30 +0800)
committer Xiang, Haihao <haihao.xiang@intel.com>
Thu, 17 Aug 2017 04:09:31 +0000 (12:09 +0800)
diff --git a/src/gen75_picture_process.c b/src/gen75_picture_process.c

index 9ee23ba..5a9ce49 100644 (file)
--- a/src/gen75_picture_process.c
+++ b/src/gen75_picture_process.c
@@ -294,7 +294,7 @@ gen75_proc_picture(VADriverContextP ctx,
          gpe_proc_ctx = (struct i965_proc_context *)proc_ctx->vpp_fmt_cvt_ctx;
          assert(gpe_proc_ctx != NULL); // gpe_proc_ctx must be a non-NULL pointer
  
-        if ((gpe_proc_ctx->pp_context.scaling_8bit_initialized & VPPGPE_8BIT_420) &&
+        if ((gpe_proc_ctx->pp_context.scaling_gpe_context_initialized & VPPGPE_8BIT_8BIT) &&
              (obj_dst_surf->fourcc == VA_FOURCC_NV12) &&
              pipeline_param->output_background_color)
              gen8plus_vpp_clear_surface(ctx,
diff --git a/src/gen8_post_processing.c b/src/gen8_post_processing.c

index 09cc876..033b50e 100644 (file)
--- a/src/gen8_post_processing.c
+++ b/src/gen8_post_processing.c
@@ -1527,14 +1527,9 @@ static void
  gen8_post_processing_context_finalize(VADriverContextP ctx,
                                        struct i965_post_processing_context *pp_context)
  {
-    if (pp_context->scaling_context_initialized) {
-        gen8_gpe_context_destroy(&pp_context->scaling_10bit_context);
-        pp_context->scaling_context_initialized = 0;
-    }
-
-    if (pp_context->scaling_8bit_initialized & VPPGPE_8BIT_420) {
-        gen8_gpe_context_destroy(&pp_context->scaling_yuv420p8_context);
-        pp_context->scaling_8bit_initialized &= ~(VPPGPE_8BIT_420);
+    if (pp_context->scaling_gpe_context_initialized) {
+        gen8_gpe_context_destroy(&pp_context->scaling_gpe_context);
+        pp_context->scaling_gpe_context_initialized = 0;
      }
  
      if (pp_context->vebox_proc_ctx) {
@@ -1677,7 +1672,7 @@ gen8_post_processing_context_init(VADriverContextP ctx,
       * I420 ->I420
       * I420 ->NV12
       */
-    gpe_context = &pp_context->scaling_yuv420p8_context;
+    gpe_context = &pp_context->scaling_gpe_context;
      memset(&scaling_kernel, 0, sizeof(scaling_kernel));
      scaling_kernel.bin = pp_yuv420p8_scaling_gen8;
      scaling_kernel.size = sizeof(pp_yuv420p8_scaling_gen8);
@@ -1712,7 +1707,8 @@ gen8_post_processing_context_init(VADriverContextP ctx,
      gpe_context->vfe_state.gpgpu_mode = 0;
  
      gen8_gpe_context_init(ctx, gpe_context);
-    pp_context->scaling_8bit_initialized = VPPGPE_8BIT_420;
+    pp_context->scaling_gpe_context_initialized |= VPPGPE_8BIT_8BIT;
+
      return;
  }
  
@@ -2078,10 +2074,10 @@ gen8_yuv420p8_scaling_post_processing(
      if (!pp_context || !src_surface || !src_rect || !dst_surface || !dst_rect)
          return VA_STATUS_ERROR_INVALID_PARAMETER;
  
-    if (!(pp_context->scaling_8bit_initialized & VPPGPE_8BIT_420))
+    if (!(pp_context->scaling_gpe_context_initialized & VPPGPE_8BIT_8BIT))
          return VA_STATUS_ERROR_UNIMPLEMENTED;
  
-    gpe_context = &pp_context->scaling_yuv420p8_context;
+    gpe_context = &pp_context->scaling_gpe_context;
  
      gen8_gpe_context_init(ctx, gpe_context);
      gen8_vpp_scaling_sample_state(ctx, gpe_context, src_rect, dst_rect);
@@ -2102,7 +2098,7 @@ gen8_yuv420p8_scaling_post_processing(
      kernel_walker_param.no_dependency = 1;
  
      intel_vpp_init_media_object_walker_parameter(&kernel_walker_param, &media_object_walker_param);
-
+    media_object_walker_param.interface_offset = 0;
      gen8_run_kernel_media_object_walker(ctx, pp_context->batch,
                                          gpe_context,
                                          &media_object_walker_param);
diff --git a/src/gen9_post_processing.c b/src/gen9_post_processing.c

index bf7fd0e..baae0d0 100644 (file)
--- a/src/gen9_post_processing.c
+++ b/src/gen9_post_processing.c
@@ -118,6 +118,24 @@ static const uint32_t pp_yuv420p8_scaling_gen9[][4] = {
  #include "shaders/post_processing/gen9/conv_nv12.g9b"
  };
  
+struct i965_kernel pp_common_scaling_gen9[] = {
+    {
+        "10bit to 10bit",
+        0,
+        pp_10bit_scaling_gen9,
+        sizeof(pp_10bit_scaling_gen9),
+        NULL,
+    },
+
+    {
+        "8bit to 8bit",
+        1,
+        pp_yuv420p8_scaling_gen9,
+        sizeof(pp_yuv420p8_scaling_gen9),
+        NULL,
+    },
+};
+
  static struct pp_module pp_modules_gen9[] = {
      {
          {
@@ -499,20 +517,16 @@ gen9_post_processing_context_init(VADriverContextP ctx,
      struct i965_driver_data *i965 = i965_driver_data(ctx);
      struct i965_post_processing_context *pp_context = data;
      struct i965_gpe_context *gpe_context;
-    struct i965_kernel scaling_kernel;
  
      gen8_post_processing_context_common_init(ctx, data, pp_modules_gen9, ARRAY_ELEMS(pp_modules_gen9), batch);
      avs_init_state(&pp_context->pp_avs_context.state, &gen9_avs_config);
  
      pp_context->intel_post_processing = gen9_post_processing;
  
-    gpe_context = &pp_context->scaling_10bit_context;
-    memset(&scaling_kernel, 0, sizeof(scaling_kernel));
-    scaling_kernel.bin = pp_10bit_scaling_gen9;
-    scaling_kernel.size = sizeof(pp_10bit_scaling_gen9);
-    gen8_gpe_load_kernels(ctx, gpe_context, &scaling_kernel, 1);
+    gpe_context = &pp_context->scaling_gpe_context;
+    gen8_gpe_load_kernels(ctx, gpe_context, pp_common_scaling_gen9, ARRAY_ELEMS(pp_common_scaling_gen9));
      gpe_context->idrt.entry_size = ALIGN(sizeof(struct gen8_interface_descriptor_data), 64);
-    gpe_context->idrt.max_entries = 1;
+    gpe_context->idrt.max_entries = ALIGN(ARRAY_ELEMS(pp_common_scaling_gen9), 2);
      gpe_context->sampler.entry_size = ALIGN(sizeof(struct gen8_sampler_state), 64);
      gpe_context->sampler.max_entries = 1;
      gpe_context->curbe.length = ALIGN(sizeof(struct scaling_input_parameter), 64);
@@ -537,46 +551,8 @@ gen9_post_processing_context_init(VADriverContextP ctx,
      gpe_context->vfe_state.gpgpu_mode = 0;
  
      gen8_gpe_context_init(ctx, gpe_context);
-    pp_context->scaling_context_initialized = 1;
-
-    /* initialize the YUV420 8-Bit scaling context. The below is supported.
-     * NV12 ->NV12
-     * NV12 ->I420
-     * I420 ->I420
-     * I420 ->NV12
-     */
-    gpe_context = &pp_context->scaling_yuv420p8_context;
-    memset(&scaling_kernel, 0, sizeof(scaling_kernel));
-    scaling_kernel.bin = pp_yuv420p8_scaling_gen9;
-    scaling_kernel.size = sizeof(pp_yuv420p8_scaling_gen9);
-    gen8_gpe_load_kernels(ctx, gpe_context, &scaling_kernel, 1);
-    gpe_context->idrt.entry_size = ALIGN(sizeof(struct gen8_interface_descriptor_data), 64);
-    gpe_context->idrt.max_entries = 1;
-    gpe_context->sampler.entry_size = ALIGN(sizeof(struct gen8_sampler_state), 64);
-    gpe_context->sampler.max_entries = 1;
-    gpe_context->curbe.length = ALIGN(sizeof(struct scaling_input_parameter), 32);
-
-    gpe_context->surface_state_binding_table.max_entries = MAX_SCALING_SURFACES;
-    gpe_context->surface_state_binding_table.binding_table_offset = 0;
-    gpe_context->surface_state_binding_table.surface_state_offset = ALIGN(MAX_SCALING_SURFACES * 4, 64);
-    gpe_context->surface_state_binding_table.length = ALIGN(MAX_SCALING_SURFACES * 4, 64) + ALIGN(MAX_SCALING_SURFACES * SURFACE_STATE_PADDED_SIZE_GEN9, 64);
-
-    if (i965->intel.eu_total > 0) {
-        gpe_context->vfe_state.max_num_threads = i965->intel.eu_total * 6;
-    } else {
-        if (i965->intel.has_bsd2)
-            gpe_context->vfe_state.max_num_threads = 300;
-        else
-            gpe_context->vfe_state.max_num_threads = 60;
-    }
+    pp_context->scaling_gpe_context_initialized |= (VPPGPE_8BIT_8BIT | VPPGPE_10BIT_10BIT);
  
-    gpe_context->vfe_state.curbe_allocation_size = 37;
-    gpe_context->vfe_state.urb_entry_size = 16;
-    gpe_context->vfe_state.num_urb_entries = 127;
-    gpe_context->vfe_state.gpgpu_mode = 0;
-
-    gen8_gpe_context_init(ctx, gpe_context);
-    pp_context->scaling_8bit_initialized = VPPGPE_8BIT_420;
      return;
  }
  
@@ -909,10 +885,10 @@ gen9_p010_scaling_post_processing(
      if (!pp_context || !src_surface || !src_rect || !dst_surface || !dst_rect)
          return VA_STATUS_ERROR_INVALID_PARAMETER;
  
-    if (!pp_context->scaling_context_initialized)
+    if (!(pp_context->scaling_gpe_context_initialized & VPPGPE_10BIT_10BIT))
          return VA_STATUS_ERROR_UNIMPLEMENTED;
  
-    gpe_context = &pp_context->scaling_10bit_context;
+    gpe_context = &pp_context->scaling_gpe_context;
  
      gen8_gpe_context_init(ctx, gpe_context);
      gen9_vpp_scaling_sample_state(ctx, gpe_context, src_rect, dst_rect);
@@ -933,7 +909,7 @@ gen9_p010_scaling_post_processing(
      kernel_walker_param.no_dependency = 1;
  
      intel_vpp_init_media_object_walker_parameter(&kernel_walker_param, &media_object_walker_param);
-
+    media_object_walker_param.interface_offset = 0;
      gen9_run_kernel_media_object_walker(ctx, pp_context->batch,
                                          gpe_context,
                                          &media_object_walker_param);
@@ -1134,10 +1110,10 @@ gen9_yuv420p8_scaling_post_processing(
      if (!pp_context || !src_surface || !src_rect || !dst_surface || !dst_rect)
          return VA_STATUS_ERROR_INVALID_PARAMETER;
  
-    if (!(pp_context->scaling_8bit_initialized & VPPGPE_8BIT_420))
+    if (!(pp_context->scaling_gpe_context_initialized & VPPGPE_8BIT_8BIT))
          return VA_STATUS_ERROR_UNIMPLEMENTED;
  
-    gpe_context = &pp_context->scaling_yuv420p8_context;
+    gpe_context = &pp_context->scaling_gpe_context;
  
      gen8_gpe_context_init(ctx, gpe_context);
      gen9_vpp_scaling_sample_state(ctx, gpe_context, src_rect, dst_rect);
@@ -1158,7 +1134,7 @@ gen9_yuv420p8_scaling_post_processing(
      kernel_walker_param.no_dependency = 1;
  
      intel_vpp_init_media_object_walker_parameter(&kernel_walker_param, &media_object_walker_param);
-
+    media_object_walker_param.interface_offset = 1;
      gen9_run_kernel_media_object_walker(ctx, pp_context->batch,
                                          gpe_context,
                                          &media_object_walker_param);
diff --git a/src/i965_post_processing.h b/src/i965_post_processing.h

index fa2337d..4f16a3c 100644 (file)
--- a/src/i965_post_processing.h
+++ b/src/i965_post_processing.h
@@ -586,13 +586,14 @@ struct i965_post_processing_context {
                       struct i965_post_processing_context *pp_context);
  
  
-    struct i965_gpe_context scaling_10bit_context;
-    int scaling_context_initialized;
-    struct i965_gpe_context scaling_yuv420p8_context;
-#define VPPGPE_8BIT_420    (1 << 0)
-#define VPPGPE_8BIT_422    (1 << 1)
-#define VPPGPE_8BIT_444    (1 << 2)
-    unsigned int scaling_8bit_initialized;
+    struct i965_gpe_context scaling_gpe_context;
+
+#define VPPGPE_8BIT_8BIT        (1 << 0)
+#define VPPGPE_8BIT_10BIT       (1 << 1)
+#define VPPGPE_10BIT_10BIT      (1 << 2)
+#define VPPGPE_10BIT_8BIT       (1 << 3)
+
+    unsigned int scaling_gpe_context_initialized;
  };
  
  struct i965_proc_context {
diff --git a/src/intel_common_vpp_internal.c b/src/intel_common_vpp_internal.c

index c6b4565..33edee9 100644 (file)
--- a/src/intel_common_vpp_internal.c
+++ b/src/intel_common_vpp_internal.c
@@ -132,7 +132,7 @@ intel_common_scaling_post_processing(VADriverContextP ctx,
          scale_flag = 0;
  
      if (((scale_flag & MASK_CSC) == SCALE_10BIT_10BIT_420) &&
-        pp_context->scaling_context_initialized) {
+        (pp_context->scaling_gpe_context_initialized & VPPGPE_10BIT_10BIT)) {
          unsigned int tmp_width, tmp_x;
  
          tmp_x = ALIGN_FLOOR(dst_rect->x, 2);
@@ -148,7 +148,7 @@ intel_common_scaling_post_processing(VADriverContextP ctx,
      }
  
      if (((scale_flag & MASK_CSC) == SCALE_8BIT_8BIT_420) &&
-        (pp_context->scaling_8bit_initialized & VPPGPE_8BIT_420)) {
+        (pp_context->scaling_gpe_context_initialized & VPPGPE_8BIT_8BIT)) {
  
          tmp_x = ALIGN_FLOOR(dst_rect->x, 4);
          tmp_width = dst_rect->x + dst_rect->width - tmp_x;
author	Xiang, Haihao <haihao.xiang@intel.com>
	Fri, 4 Aug 2017 01:30:45 +0000 (09:30 +0800)
committer	Xiang, Haihao <haihao.xiang@intel.com>
	Thu, 17 Aug 2017 04:09:31 +0000 (12:09 +0800)
src/gen75_picture_process.c		patch | blob | history
src/gen8_post_processing.c		patch | blob | history
src/gen9_post_processing.c		patch | blob | history
src/i965_post_processing.h		patch | blob | history
src/intel_common_vpp_internal.c		patch | blob | history