OSDN Git Service

intel-vaapi-driver 1.8.1.pre1
[android-x86/hardware-intel-common-vaapi.git] / src / gen9_mfc_hevc.c
1 /*
2  * Copyright © 2014 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the
6  * "Software"), to deal in the Software without restriction, including
7  * without limitation the rights to use, copy, modify, merge, publish,
8  * distribute, sub license, and/or sell copies of the Software, and to
9  * permit persons to whom the Software is furnished to do so, subject to
10  * the following conditions:
11  *
12  * The above copyright notice and this permission notice (including the
13  * next paragraph) shall be included in all copies or substantial portions
14  * of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
17  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
19  * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
20  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
21  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
22  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23  *
24  * Authors:
25  *    Qu Pengfei <Pengfei.Qu@intel.com>
26  *
27  */
28
29 #include <stdio.h>
30 #include <stdlib.h>
31 #include <string.h>
32 #include <math.h>
33 #include <assert.h>
34
35 #include "intel_batchbuffer.h"
36 #include "i965_defines.h"
37 #include "i965_structs.h"
38 #include "i965_drv_video.h"
39 #include "i965_encoder.h"
40 #include "i965_encoder_utils.h"
41 #include "gen9_mfc.h"
42 #include "gen6_vme.h"
43 #include "intel_media.h"
44
/* Outcome of a BRC (bit-rate control) HRD buffer check after encoding a
 * frame; the *_WITH_*_QP variants indicate the violation persists even at
 * the QP limit.  NOTE(review): semantics inferred from the names — confirm
 * against the BRC update code that produces these values. */
typedef enum _gen6_brc_status {
    BRC_NO_HRD_VIOLATION = 0,
    BRC_UNDERFLOW = 1,
    BRC_OVERFLOW = 2,
    BRC_UNDERFLOW_WITH_MAX_QP = 3,
    BRC_OVERFLOW_WITH_MIN_QP = 4,
} gen6_brc_status;
52
53 /* BRC define */
/*
 * Clamp x into [min, max], assigning the result back to x.
 * x is evaluated multiple times, so the argument must not have side
 * effects (no x++).  Wrapped in do { ... } while (0) so the macro expands
 * to exactly one statement and is safe in unbraced if/else bodies
 * (the previous bare-brace form broke `if (c) BRC_CLIP(...); else ...`).
 */
#define BRC_CLIP(x, min, max)                                           \
    do {                                                                \
        x = (((x) > (max)) ? (max) : (((x) < (min)) ? (min) : (x)));    \
    } while (0)
58
#define BRC_P_B_QP_DIFF 4   /* QP delta between B and P slices */
#define BRC_I_P_QP_DIFF 2   /* QP delta between P and I slices */
#define BRC_I_B_QP_DIFF (BRC_I_P_QP_DIFF + BRC_P_B_QP_DIFF) /* QP delta between B and I slices */

#define BRC_PWEIGHT 0.6  /* weight if P slice with comparison to I slice */
#define BRC_BWEIGHT 0.25 /* weight if B slice with comparison to I slice */

#define BRC_QP_MAX_CHANGE 5 /* maximum qp modification per BRC adjustment */
#define BRC_CY 0.1 /* weight coefficient — NOTE(review): original comment was truncated ("weight for"); confirm meaning at the use site */
#define BRC_CX_UNDERFLOW 5.  /* curve parameter used on HRD underflow — TODO confirm at use site */
#define BRC_CX_OVERFLOW -4.  /* curve parameter used on HRD overflow — TODO confirm at use site */

#define BRC_PI_0_5 1.5707963267948966192313216916398 /* pi / 2 */
72
73 /* intel buffer write */
/*
 * (Re)allocate a driver-managed encoder buffer: drop any previously held
 * bo and allocate a fresh one of `size` bytes with 4KB (0x1000) alignment.
 * Requires a local `i965` driver-data pointer in scope at the call site.
 * The assert only guards allocation failure in debug builds.
 *
 * The stray semicolon that used to follow while(0) has been removed so
 * the macro expands to exactly one statement (safe in unbraced if/else).
 */
#define ALLOC_ENCODER_BUFFER(gen_buffer, string, size) do {     \
        dri_bo_unreference(gen_buffer->bo);                     \
        gen_buffer->bo = dri_bo_alloc(i965->intel.bufmgr,       \
                                      string,                   \
                                      size,                     \
                                      0x1000);                  \
        assert(gen_buffer->bo);                                 \
    } while (0)
82
83
/*
 * Emit a 64-bit buffer address (2 DWs) into the BCS batch: a relocation
 * when buf_bo is non-NULL, two zero DWs otherwise.  When `ma` is non-zero
 * a third DW carrying the memory address attributes (MOCS) is appended.
 * `is_target` marks the bo as written by the GPU (write domain set).
 * Relies on local `batch` and `i965` variables at the expansion site.
 */
#define OUT_BUFFER_X(buf_bo, is_target, ma)  do {                         \
        if (buf_bo) {                                                   \
            OUT_BCS_RELOC64(batch,                                        \
                          buf_bo,                                       \
                          I915_GEM_DOMAIN_INSTRUCTION,                       \
                          is_target ? I915_GEM_DOMAIN_INSTRUCTION : 0,       \
                          0);                                           \
        } else {                                                        \
            OUT_BCS_BATCH(batch, 0);                                    \
            OUT_BCS_BATCH(batch, 0);                                    \
        }                                                               \
        if (ma)                                                         \
            OUT_BCS_BATCH(batch, i965->intel.mocs_state);                                    \
    } while (0)

/* MA = address + memory-attribute DW (3 DWs), NMA = address only (2 DWs);
 * TARGET = GPU write target, REFERENCE = read-only input. */
#define OUT_BUFFER_MA_TARGET(buf_bo)       OUT_BUFFER_X(buf_bo, 1, 1)
#define OUT_BUFFER_MA_REFERENCE(buf_bo)    OUT_BUFFER_X(buf_bo, 0, 1)
#define OUT_BUFFER_NMA_TARGET(buf_bo)      OUT_BUFFER_X(buf_bo, 1, 0)
#define OUT_BUFFER_NMA_REFERENCE(buf_bo)   OUT_BUFFER_X(buf_bo, 0, 0)
103
104
/* Media-pipeline binding-table layout (gen8-style surface state records) */
#define SURFACE_STATE_PADDED_SIZE               SURFACE_STATE_PADDED_SIZE_GEN8
#define SURFACE_STATE_OFFSET(index)             (SURFACE_STATE_PADDED_SIZE * index)
#define BINDING_TABLE_OFFSET(index)             (SURFACE_STATE_OFFSET(MAX_MEDIA_SURFACES_GEN6) + sizeof(unsigned int) * index)

#define HCP_SOFTWARE_SKYLAKE    1

#define NUM_HCPE_KERNEL 2

/* Inter-prediction mode/shape fields — NOTE(review): these appear to
 * mirror the VME output encoding; confirm against gen6_vme before reuse. */
#define     INTER_MODE_MASK     0x03
#define     INTER_8X8       0x03
#define     INTER_16X8      0x01
#define     INTER_8X16      0x02
#define     SUBMB_SHAPE_MASK    0x00FF00

#define     INTER_MV8       (4 << 20)
#define     INTER_MV32      (6 << 20)
121
122
123 /* HEVC */
124
125 /* utils */
126 static void
127 hevc_gen_default_iq_matrix_encoder(VAQMatrixBufferHEVC *iq_matrix)
128 {
129     /* Flat_4x4_16 */
130     memset(&iq_matrix->scaling_lists_4x4, 16, sizeof(iq_matrix->scaling_lists_4x4));
131
132     /* Flat_8x8_16 */
133     memset(&iq_matrix->scaling_lists_8x8, 16, sizeof(iq_matrix->scaling_lists_8x8));
134
135     /* Flat_16x16_16 */
136     memset(&iq_matrix->scaling_lists_16x16, 16, sizeof(iq_matrix->scaling_lists_16x16));
137
138     /* Flat_32x32_16 */
139     memset(&iq_matrix->scaling_lists_32x32, 16, sizeof(iq_matrix->scaling_lists_32x32));
140
141     /* Flat_16x16_dc_16 */
142     memset(&iq_matrix->scaling_list_dc_16x16, 16, sizeof(iq_matrix->scaling_list_dc_16x16));
143
144     /* Flat_32x32_dc_16 */
145     memset(&iq_matrix->scaling_list_dc_32x32, 16, sizeof(iq_matrix->scaling_list_dc_32x32));
146 }
147
148 /* HEVC picture and slice state related */
149
150 static void
151 gen9_hcpe_pipe_mode_select(VADriverContextP ctx,
152                            int standard_select,
153                            struct intel_encoder_context *encoder_context)
154 {
155     struct i965_driver_data *i965 = i965_driver_data(ctx);
156     struct intel_batchbuffer *batch = encoder_context->base.batch;
157
158     assert(standard_select == HCP_CODEC_HEVC);
159
160     if(IS_KBL(i965->intel.device_info))
161     {
162         BEGIN_BCS_BATCH(batch, 6);
163
164         OUT_BCS_BATCH(batch, HCP_PIPE_MODE_SELECT | (6 - 2));
165     }
166     else
167     {
168         BEGIN_BCS_BATCH(batch, 4);
169
170         OUT_BCS_BATCH(batch, HCP_PIPE_MODE_SELECT | (4 - 2));
171     }
172
173     OUT_BCS_BATCH(batch,
174                   (standard_select << 5) |
175                   (0 << 3) | /* disable Pic Status / Error Report */
176                   HCP_CODEC_SELECT_ENCODE);
177     OUT_BCS_BATCH(batch, 0);
178     OUT_BCS_BATCH(batch, 0);
179
180     if(IS_KBL(i965->intel.device_info))
181     {
182         OUT_BCS_BATCH(batch, 0);
183         OUT_BCS_BATCH(batch, 0);
184     }
185
186     ADVANCE_BCS_BATCH(batch);
187 }
188
189 static void
190 gen9_hcpe_surface_state(VADriverContextP ctx, struct encode_state *encode_state,
191                         struct intel_encoder_context *encoder_context)
192 {
193     struct intel_batchbuffer *batch = encoder_context->base.batch;
194     struct object_surface *obj_surface = encode_state->reconstructed_object;
195     struct gen9_hcpe_context *mfc_context = encoder_context->mfc_context;
196     VAEncSequenceParameterBufferHEVC *pSequenceParameter = (VAEncSequenceParameterBufferHEVC *)encode_state->seq_param_ext->buffer;
197     unsigned int surface_format = SURFACE_FORMAT_PLANAR_420_8;
198
199     /* to do */
200     unsigned int y_cb_offset;
201
202     assert(obj_surface);
203
204     if((pSequenceParameter->seq_fields.bits.bit_depth_luma_minus8 > 0)
205         || (pSequenceParameter->seq_fields.bits.bit_depth_chroma_minus8 > 0))
206     {
207         assert(obj_surface->fourcc == VA_FOURCC_P010);
208         surface_format = SURFACE_FORMAT_P010;
209     }
210
211     y_cb_offset = obj_surface->y_cb_offset;
212
213     BEGIN_BCS_BATCH(batch, 3);
214     OUT_BCS_BATCH(batch, HCP_SURFACE_STATE | (3 - 2));
215     OUT_BCS_BATCH(batch,
216                   (1 << 28) |                   /* surface id */
217                   (mfc_context->surface_state.w_pitch - 1));    /* pitch - 1 */
218     OUT_BCS_BATCH(batch,
219                   surface_format << 28 |
220                   y_cb_offset);
221     ADVANCE_BCS_BATCH(batch);
222
223     BEGIN_BCS_BATCH(batch, 3);
224     OUT_BCS_BATCH(batch, HCP_SURFACE_STATE | (3 - 2));
225     OUT_BCS_BATCH(batch,
226                   (0 << 28) |                   /* surface id */
227                   (mfc_context->surface_state.w_pitch - 1));    /* pitch - 1 */
228     OUT_BCS_BATCH(batch,
229                   surface_format << 28 |
230                   y_cb_offset);
231     ADVANCE_BCS_BATCH(batch);
232 }
233
234 static void
235 gen9_hcpe_pipe_buf_addr_state(VADriverContextP ctx, struct encode_state *encode_state,
236                               struct intel_encoder_context *encoder_context)
237 {
238     struct i965_driver_data *i965 = i965_driver_data(ctx);
239     struct intel_batchbuffer *batch = encoder_context->base.batch;
240     struct gen9_hcpe_context *mfc_context = encoder_context->mfc_context;
241     struct object_surface *obj_surface;
242     GenHevcSurface *hcpe_hevc_surface;
243     dri_bo *bo;
244     unsigned int i;
245
246     if(IS_KBL(i965->intel.device_info))
247     {
248         BEGIN_BCS_BATCH(batch, 104);
249
250         OUT_BCS_BATCH(batch, HCP_PIPE_BUF_ADDR_STATE | (104 - 2));
251     }
252     else
253     {
254         BEGIN_BCS_BATCH(batch, 95);
255
256         OUT_BCS_BATCH(batch, HCP_PIPE_BUF_ADDR_STATE | (95 - 2));
257     }
258
259     obj_surface = encode_state->reconstructed_object;
260     assert(obj_surface && obj_surface->bo);
261     hcpe_hevc_surface = obj_surface->private_data;
262     assert(hcpe_hevc_surface && hcpe_hevc_surface->motion_vector_temporal_bo);
263
264     OUT_BUFFER_MA_TARGET(obj_surface->bo); /* DW 1..3 */
265     OUT_BUFFER_MA_TARGET(mfc_context->deblocking_filter_line_buffer.bo);/* DW 4..6 */
266     OUT_BUFFER_MA_TARGET(mfc_context->deblocking_filter_tile_line_buffer.bo); /* DW 7..9 */
267     OUT_BUFFER_MA_TARGET(mfc_context->deblocking_filter_tile_column_buffer.bo); /* DW 10..12 */
268     OUT_BUFFER_MA_TARGET(mfc_context->metadata_line_buffer.bo);         /* DW 13..15 */
269     OUT_BUFFER_MA_TARGET(mfc_context->metadata_tile_line_buffer.bo);    /* DW 16..18 */
270     OUT_BUFFER_MA_TARGET(mfc_context->metadata_tile_column_buffer.bo);  /* DW 19..21 */
271     OUT_BUFFER_MA_TARGET(mfc_context->sao_line_buffer.bo);              /* DW 22..24 */
272     OUT_BUFFER_MA_TARGET(mfc_context->sao_tile_line_buffer.bo);         /* DW 25..27 */
273     OUT_BUFFER_MA_TARGET(mfc_context->sao_tile_column_buffer.bo);       /* DW 28..30 */
274     OUT_BUFFER_MA_TARGET(hcpe_hevc_surface->motion_vector_temporal_bo); /* DW 31..33 */
275     OUT_BUFFER_MA_TARGET(NULL); /* DW 34..36, reserved */
276
277     /* here only max 8 reference allowed */
278     for (i = 0; i < ARRAY_ELEMS(mfc_context->reference_surfaces); i++) {
279         bo = mfc_context->reference_surfaces[i].bo;
280
281         if (bo) {
282             OUT_BUFFER_NMA_REFERENCE(bo);
283         } else
284             OUT_BUFFER_NMA_REFERENCE(NULL);
285     }
286     OUT_BCS_BATCH(batch, 0);    /* DW 53, memory address attributes */
287
288     OUT_BUFFER_MA_TARGET(mfc_context->uncompressed_picture_source.bo); /* DW 54..56, uncompressed picture source */
289     OUT_BUFFER_MA_TARGET(NULL); /* DW 57..59, ignore  */
290     OUT_BUFFER_MA_TARGET(NULL); /* DW 60..62, ignore  */
291     OUT_BUFFER_MA_TARGET(NULL); /* DW 63..65, ignore  */
292
293     for (i = 0; i < ARRAY_ELEMS(mfc_context->current_collocated_mv_temporal_buffer) - 1; i++) {
294         bo = mfc_context->current_collocated_mv_temporal_buffer[i].bo;
295
296         if (bo) {
297             OUT_BUFFER_NMA_REFERENCE(bo);
298         } else
299             OUT_BUFFER_NMA_REFERENCE(NULL);
300     }
301     OUT_BCS_BATCH(batch, 0);    /* DW 82, memory address attributes */
302
303     OUT_BUFFER_MA_TARGET(NULL);    /* DW 83..85, ignore for HEVC */
304     OUT_BUFFER_MA_TARGET(NULL);    /* DW 86..88, ignore for HEVC */
305     OUT_BUFFER_MA_TARGET(NULL);    /* DW 89..91, ignore for HEVC */
306     OUT_BUFFER_MA_TARGET(NULL);    /* DW 92..94, ignore for HEVC */
307
308     if(IS_KBL(i965->intel.device_info))
309     {
310         for(i = 0;i < 9;i++)
311             OUT_BCS_BATCH(batch, 0);
312     }
313
314     ADVANCE_BCS_BATCH(batch);
315 }
316
/*
 * Emit HCP_IND_OBJ_BASE_ADDR_STATE (14 DWs): point the HCP pipe at the
 * indirect CU object stream and at the PAK-BSE (bitstream output) buffer,
 * programming both the start offset and the upper-bound (end) offset of
 * the latter via relocations into the same bo.
 */
static void
gen9_hcpe_ind_obj_base_addr_state(VADriverContextP ctx,
                                  struct intel_encoder_context *encoder_context)
{
    struct i965_driver_data *i965 = i965_driver_data(ctx);
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    struct gen9_hcpe_context *mfc_context = encoder_context->mfc_context;

    BEGIN_BCS_BATCH(batch, 14);

    OUT_BCS_BATCH(batch, HCP_IND_OBJ_BASE_ADDR_STATE | (14 - 2));
    OUT_BUFFER_MA_REFERENCE(NULL);                 /* DW 1..3, ignored for the encoder */
    OUT_BUFFER_NMA_REFERENCE(NULL);                /* DW 4..5, Upper Bound */
    OUT_BUFFER_MA_TARGET(mfc_context->hcp_indirect_cu_object.bo);                 /* DW 6..8, CU */
    /* DW 9..11, PAK-BSE: bitstream output base, starting at `offset` */
    OUT_BCS_RELOC64(batch,
                  mfc_context->hcp_indirect_pak_bse_object.bo,
                  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  mfc_context->hcp_indirect_pak_bse_object.offset);
    OUT_BCS_BATCH(batch, i965->intel.mocs_state);
    /* DW 12..13, PAK-BSE upper bound (end_offset into the same bo) */
    OUT_BCS_RELOC64(batch,
                  mfc_context->hcp_indirect_pak_bse_object.bo,
                  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  mfc_context->hcp_indirect_pak_bse_object.end_offset);

    ADVANCE_BCS_BATCH(batch);
}
345
346 static void
347 gen9_hcpe_fqm_state(VADriverContextP ctx,
348                     int size_id,
349                     int color_component,
350                     int pred_type,
351                     int dc,
352                     unsigned int *fqm,
353                     int fqm_length,
354                     struct intel_encoder_context *encoder_context)
355 {
356     struct intel_batchbuffer *batch = encoder_context->base.batch;
357     unsigned int fqm_buffer[32];
358
359     assert(fqm_length <= 32);
360     assert(sizeof(*fqm) == 4);
361     memset(fqm_buffer, 0, sizeof(fqm_buffer));
362     memcpy(fqm_buffer, fqm, fqm_length * 4);
363
364     BEGIN_BCS_BATCH(batch, 34);
365
366     OUT_BCS_BATCH(batch, HCP_FQM_STATE | (34 - 2));
367     OUT_BCS_BATCH(batch,
368                   dc << 16 |
369                   color_component << 3 |
370                   size_id << 1 |
371                   pred_type);
372     intel_batchbuffer_data(batch, fqm_buffer, 32 * 4);
373
374     ADVANCE_BCS_BATCH(batch);
375 }
376
377
378 static void
379 gen9_hcpe_hevc_fqm_state(VADriverContextP ctx, struct intel_encoder_context *encoder_context)
380 {
381     unsigned int qm[32] = {
382         0x10001000, 0x10001000, 0x10001000, 0x10001000,
383         0x10001000, 0x10001000, 0x10001000, 0x10001000,
384         0x10001000, 0x10001000, 0x10001000, 0x10001000,
385         0x10001000, 0x10001000, 0x10001000, 0x10001000,
386         0x10001000, 0x10001000, 0x10001000, 0x10001000,
387         0x10001000, 0x10001000, 0x10001000, 0x10001000,
388         0x10001000, 0x10001000, 0x10001000, 0x10001000,
389         0x10001000, 0x10001000, 0x10001000, 0x10001000
390     };
391
392     gen9_hcpe_fqm_state(ctx,
393                         0, 0, 0, 0,
394                         qm, 8,
395                         encoder_context);
396     gen9_hcpe_fqm_state(ctx,
397                         0, 0, 1, 0,
398                         qm, 8,
399                         encoder_context);
400     gen9_hcpe_fqm_state(ctx,
401                         1, 0, 0, 0,
402                         qm, 32,
403                         encoder_context);
404     gen9_hcpe_fqm_state(ctx,
405                         1, 0, 1, 0,
406                         qm, 32,
407                         encoder_context);
408     gen9_hcpe_fqm_state(ctx,
409                         2, 0, 0, 0x1000,
410                         qm, 0,
411                         encoder_context);
412     gen9_hcpe_fqm_state(ctx,
413                         2, 0, 1, 0x1000,
414                         qm, 0,
415                         encoder_context);
416     gen9_hcpe_fqm_state(ctx,
417                         3, 0, 0, 0x1000,
418                         qm, 0,
419                         encoder_context);
420     gen9_hcpe_fqm_state(ctx,
421                         3, 0, 1, 0x1000,
422                         qm, 0,
423                         encoder_context);
424 }
425
426 static void
427 gen9_hcpe_qm_state(VADriverContextP ctx,
428                    int size_id,
429                    int color_component,
430                    int pred_type,
431                    int dc,
432                    unsigned int *qm,
433                    int qm_length,
434                    struct intel_encoder_context *encoder_context)
435 {
436     struct intel_batchbuffer *batch = encoder_context->base.batch;
437     unsigned int qm_buffer[16];
438
439     assert(qm_length <= 16);
440     assert(sizeof(*qm) == 4);
441     memset(qm_buffer, 0, sizeof(qm_buffer));
442     memcpy(qm_buffer, qm, qm_length * 4);
443
444     BEGIN_BCS_BATCH(batch, 18);
445
446     OUT_BCS_BATCH(batch, HCP_QM_STATE | (18 - 2));
447     OUT_BCS_BATCH(batch,
448                   dc << 5 |
449                   color_component << 3 |
450                   size_id << 1 |
451                   pred_type);
452     intel_batchbuffer_data(batch, qm_buffer, 16 * 4);
453
454     ADVANCE_BCS_BATCH(batch);
455 }
456
457 static void
458 gen9_hcpe_hevc_qm_state(VADriverContextP ctx, struct intel_encoder_context *encoder_context)
459 {
460
461     int i;
462
463     unsigned int qm[16] = {
464         0x10101010, 0x10101010, 0x10101010, 0x10101010,
465         0x10101010, 0x10101010, 0x10101010, 0x10101010,
466         0x10101010, 0x10101010, 0x10101010, 0x10101010,
467         0x10101010, 0x10101010, 0x10101010, 0x10101010
468     };
469
470     for (i = 0; i < 6; i++) {
471         gen9_hcpe_qm_state(ctx,
472                            0, i % 3, i / 3, 0,
473                            qm, 4,
474                            encoder_context);
475     }
476
477     for (i = 0; i < 6; i++) {
478         gen9_hcpe_qm_state(ctx,
479                            1, i % 3, i / 3, 0,
480                            qm, 16,
481                            encoder_context);
482     }
483
484     for (i = 0; i < 6; i++) {
485         gen9_hcpe_qm_state(ctx,
486                            2, i % 3, i / 3, 16,
487                            qm, 16,
488                            encoder_context);
489     }
490
491     for (i = 0; i < 2; i++) {
492         gen9_hcpe_qm_state(ctx,
493                            3, 0, i % 2, 16,
494                            qm, 16,
495                            encoder_context);
496     }
497 }
498
/*
 * Emit HCP_PIC_STATE for the current frame: picture geometry in min-CB
 * units, coding-tool flags taken from the VA-API sequence/picture
 * parameters, and the PAK bitrate / frame-size control fields.
 * KBL uses the 31-DW form of the command; other gen9 parts use 19 DWs.
 */
static void
gen9_hcpe_hevc_pic_state(VADriverContextP ctx, struct encode_state *encode_state,
                         struct intel_encoder_context *encoder_context)
{
    struct i965_driver_data *i965 = i965_driver_data(ctx);
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    struct gen9_hcpe_context *mfc_context = encoder_context->mfc_context;
    VAEncPictureParameterBufferHEVC *pic_param ;
    VAEncSequenceParameterBufferHEVC *seq_param ;

    int max_pcm_size_minus3 = 0, min_pcm_size_minus3 = 0;
    int pcm_sample_bit_depth_luma_minus1 = 7, pcm_sample_bit_depth_chroma_minus1 = 7;
    /*
     * 7.4.3.1
     *
     * When not present, the value of loop_filter_across_tiles_enabled_flag
     * is inferred to be equal to 1.
     * (This encoder nevertheless forces the flag to 0 further below.)
     */
    int loop_filter_across_tiles_enabled_flag = 0;
    pic_param = (VAEncPictureParameterBufferHEVC *)encode_state->pic_param_ext->buffer;
    seq_param = (VAEncSequenceParameterBufferHEVC *)encode_state->seq_param_ext->buffer;

    int log2_cu_size = seq_param->log2_min_luma_coding_block_size_minus3 + 3;
    int log2_ctb_size =  seq_param->log2_diff_max_min_luma_coding_block_size + log2_cu_size;
    int ctb_size = 1 << log2_ctb_size;
    /* raw bits in one CTU: 8 bits x 3/2 samples per pixel (4:2:0) x ctb^2 */
    double rawctubits = 8 * 3 * ctb_size * ctb_size / 2.0;
    /* cap any single CTU at 5/3 of its raw size */
    int maxctubits = (int)(5 * rawctubits / 3) ;
    double bitrate = (double)encoder_context->brc.bits_per_second[0];
    double framebitrate = bitrate / 32 / 8; //32 byte unit
    int minframebitrate = 0;//(int) (framebitrate * 3 / 10);
    int maxframebitrate = (int)(framebitrate * 10 / 10);
    int maxdeltaframebitrate = 0x1c5c; //(int) (framebitrate * 1/ 10);
    int mindeltaframebitrate = 0; //(int) (framebitrate * 1/ 10);
    int minframesize = 0;//(int)(rawframebits * 1/50);

    if (seq_param->seq_fields.bits.pcm_enabled_flag) {
        max_pcm_size_minus3 = seq_param->log2_max_pcm_luma_coding_block_size_minus3;
        min_pcm_size_minus3 = seq_param->log2_min_pcm_luma_coding_block_size_minus3;
        pcm_sample_bit_depth_luma_minus1 = (seq_param->pcm_sample_bit_depth_luma_minus1 & 0x0f);
        pcm_sample_bit_depth_chroma_minus1 = (seq_param->pcm_sample_bit_depth_chroma_minus1 & 0x0f);
    } else {
        max_pcm_size_minus3 = MIN(seq_param->log2_min_luma_coding_block_size_minus3 + seq_param->log2_diff_max_min_luma_coding_block_size, 2);
    }

    if (pic_param->pic_fields.bits.tiles_enabled_flag)
        loop_filter_across_tiles_enabled_flag = pic_param->pic_fields.bits.loop_filter_across_tiles_enabled_flag;

    /* set zero for encoder */
    loop_filter_across_tiles_enabled_flag = 0;

    if(IS_KBL(i965->intel.device_info))
    {
        BEGIN_BCS_BATCH(batch, 31);

        OUT_BCS_BATCH(batch, HCP_PIC_STATE | (31 - 2));
    }
    else
    {
        BEGIN_BCS_BATCH(batch, 19);

        OUT_BCS_BATCH(batch, HCP_PIC_STATE | (19 - 2));
    }

    /* DW 1: frame size in min coding blocks */
    OUT_BCS_BATCH(batch,
                  mfc_context->pic_size.picture_height_in_min_cb_minus1 << 16 |
                  0 << 14 |
                  mfc_context->pic_size.picture_width_in_min_cb_minus1);
    /* DW 2: transform / coding block size limits */
    OUT_BCS_BATCH(batch,
                  max_pcm_size_minus3 << 10 |
                  min_pcm_size_minus3 << 8 |
                  (seq_param->log2_min_transform_block_size_minus2 +
                   seq_param->log2_diff_max_min_transform_block_size) << 6 |
                  seq_param->log2_min_transform_block_size_minus2 << 4 |
                  (seq_param->log2_min_luma_coding_block_size_minus3 +
                   seq_param->log2_diff_max_min_luma_coding_block_size) << 2 |
                  seq_param->log2_min_luma_coding_block_size_minus3);
    OUT_BCS_BATCH(batch, 0); /* DW 3, ignored */
    /* DW 4: coding-tool enable flags */
    OUT_BCS_BATCH(batch,
                  (IS_KBL(i965->intel.device_info)? 1 : 0) << 27 | /* CU packet structure is 0 for SKL */
                  seq_param->seq_fields.bits.strong_intra_smoothing_enabled_flag << 26 |
                  pic_param->pic_fields.bits.transquant_bypass_enabled_flag << 25 |
                  seq_param->seq_fields.bits.amp_enabled_flag << 23 |
                  pic_param->pic_fields.bits.transform_skip_enabled_flag << 22 |
                  0 << 21 | /* 0 for encoder !(pic_param->decoded_curr_pic.flags & VA_PICTURE_HEVC_BOTTOM_FIELD)*/
                  0 << 20 |     /* 0 for encoder !!(pic_param->decoded_curr_pic.flags & VA_PICTURE_HEVC_FIELD_PIC)*/
                  pic_param->pic_fields.bits.weighted_pred_flag << 19 |
                  pic_param->pic_fields.bits.weighted_bipred_flag << 18 |
                  pic_param->pic_fields.bits.tiles_enabled_flag << 17 |                 /* 0 for encoder */
                  pic_param->pic_fields.bits.entropy_coding_sync_enabled_flag << 16 |
                  loop_filter_across_tiles_enabled_flag << 15 |
                  pic_param->pic_fields.bits.sign_data_hiding_enabled_flag << 13 |  /* 0 for encoder */
                  pic_param->log2_parallel_merge_level_minus2 << 10 |               /* 0 for encoder */
                  pic_param->pic_fields.bits.constrained_intra_pred_flag << 9 |     /* 0 for encoder */
                  seq_param->seq_fields.bits.pcm_loop_filter_disabled_flag << 8 |
                  (pic_param->diff_cu_qp_delta_depth & 0x03) << 6 |                 /* 0 for encoder */
                  pic_param->pic_fields.bits.cu_qp_delta_enabled_flag << 5 |        /* 0 for encoder */
                  seq_param->seq_fields.bits.pcm_enabled_flag << 4 |
                  seq_param->seq_fields.bits.sample_adaptive_offset_enabled_flag << 3 | /* 0 for encoder */
                  0);
    /* DW 5: bit depths, transform hierarchy, chroma QP offsets */
    OUT_BCS_BATCH(batch,
                  seq_param->seq_fields.bits.bit_depth_luma_minus8 << 27 |                 /* 10 bit for KBL+*/
                  seq_param->seq_fields.bits.bit_depth_chroma_minus8 << 24 |                 /* 10 bit for KBL+ */
                  pcm_sample_bit_depth_luma_minus1 << 20 |
                  pcm_sample_bit_depth_chroma_minus1 << 16 |
                  seq_param->max_transform_hierarchy_depth_inter << 13 |    /*  for encoder */
                  seq_param->max_transform_hierarchy_depth_intra << 10 |    /*  for encoder */
                  (pic_param->pps_cr_qp_offset & 0x1f) << 5 |
                  (pic_param->pps_cb_qp_offset & 0x1f));
    OUT_BCS_BATCH(batch,
                  0 << 29 | /* must be 0 for encoder */
                  maxctubits); /* DW 6, max LCU bit size allowed for encoder  */
    OUT_BCS_BATCH(batch,
                  0 << 31 | /* frame bitrate max unit */
                  maxframebitrate); /* DW 7, frame bitrate max 0:13   */
    OUT_BCS_BATCH(batch,
                  0 << 31 | /* frame bitrate min unit */
                  minframebitrate); /* DW 8, frame bitrate min 0:13   */
    OUT_BCS_BATCH(batch,
                  maxdeltaframebitrate << 16 | /* frame bitrate max delta ,help to select deltaQP of slice*/
                  mindeltaframebitrate); /* DW 9,(0,14) frame bitrate min delta ,help to select deltaQP of slice*/
    OUT_BCS_BATCH(batch, 0x07050402);   /* DW 10, frame delta qp max */
    OUT_BCS_BATCH(batch, 0x0d0b0908);
    OUT_BCS_BATCH(batch, 0);    /* DW 12, frame delta qp min */
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0x04030200);   /* DW 14, frame delta qp max range  */
    OUT_BCS_BATCH(batch, 0x100c0806);   /* DW 15 */
    OUT_BCS_BATCH(batch, 0x04030200);   /* DW 16, frame delta qp min range  */
    OUT_BCS_BATCH(batch, 0x100c0806);
    OUT_BCS_BATCH(batch,
                  0 << 30 |
                  minframesize);    /* DW 18, min frame size units */

    /* DW 19..30: KBL-only trailing DWs, zeroed */
    if(IS_KBL(i965->intel.device_info))
    {
        int i = 0;

        for(i = 0;i < 12;i++)
            OUT_BCS_BATCH(batch, 0);
    }

    ADVANCE_BCS_BATCH(batch);
}
641
642
643 static void
644 gen9_hcpe_hevc_insert_object(VADriverContextP ctx, struct intel_encoder_context *encoder_context,
645                              unsigned int *insert_data, int lenght_in_dws, int data_bits_in_last_dw,
646                              int skip_emul_byte_count, int is_last_header, int is_end_of_slice, int emulation_flag,
647                              struct intel_batchbuffer *batch)
648 {
649     if (batch == NULL)
650         batch = encoder_context->base.batch;
651
652     if (data_bits_in_last_dw == 0)
653         data_bits_in_last_dw = 32;
654
655     BEGIN_BCS_BATCH(batch, lenght_in_dws + 2);
656
657     OUT_BCS_BATCH(batch, HCP_INSERT_PAK_OBJECT | (lenght_in_dws + 2 - 2));
658     OUT_BCS_BATCH(batch,
659                   (0 << 31) |   /* inline payload */
660                   (0 << 16) |   /* always start at offset 0 */
661                   (0 << 15) |   /* HeaderLengthExcludeFrmSize */
662                   (data_bits_in_last_dw << 8) |
663                   (skip_emul_byte_count << 4) |
664                   (!!emulation_flag << 3) |
665                   ((!!is_last_header) << 2) |
666                   ((!!is_end_of_slice) << 1) |
667                   (0 << 0));    /* Reserved */
668     intel_batchbuffer_data(batch, insert_data, lenght_in_dws * 4);
669
670     ADVANCE_BCS_BATCH(batch);
671 }
672 /*
673 // To be do: future
674 static uint8_t
675 intel_get_ref_idx_state_1(VAPictureHEVC *va_pic, unsigned int frame_store_id)
676 {
677     unsigned int is_long_term =
678         !!(va_pic->flags & VA_PICTURE_HEVC_LONG_TERM_REFERENCE);
679     unsigned int is_top_field =
680         !!!(va_pic->flags & VA_PICTURE_HEVC_BOTTOM_FIELD);
681     unsigned int is_bottom_field =
682         !!(va_pic->flags & VA_PICTURE_HEVC_BOTTOM_FIELD);
683
684     return ((is_long_term                         << 6) |
685             ((is_top_field ^ is_bottom_field ^ 1) << 5) |
686             (frame_store_id                       << 1) |
687             ((is_top_field ^ 1) & is_bottom_field));
688 }
689 */
/*
 * Emit HCP_REF_IDX_STATE (18 DWs) for one reference picture list
 * (list 0 or 1): the active-reference count plus a 16-entry table of
 * per-reference DWs (field flags, long-term flag, DPB frame index and
 * the POC delta to the current picture).
 *
 * NOTE(review): every active entry is programmed with the single
 * frame_index of the reference the VME stage actually used, not the
 * per-entry DPB slot — verify this is intentional for multi-ref lists.
 */
static void
gen9_hcpe_ref_idx_state_1(struct intel_batchbuffer *batch,
                          int list,
                          struct intel_encoder_context *encoder_context,
                          struct encode_state *encode_state)
{
    int i;
    VAEncPictureParameterBufferHEVC *pic_param = (VAEncPictureParameterBufferHEVC *)encode_state->pic_param_ext->buffer;
    VAEncSliceParameterBufferHEVC *slice_param = (VAEncSliceParameterBufferHEVC *)encode_state->slice_params_ext[0]->buffer;
    uint8_t num_ref_minus1 = (list ? slice_param->num_ref_idx_l1_active_minus1 : slice_param->num_ref_idx_l0_active_minus1);
    VAPictureHEVC *ref_list = (list ? slice_param->ref_pic_list1 : slice_param->ref_pic_list0);
    struct gen6_vme_context *vme_context = encoder_context->vme_context;
    struct object_surface *obj_surface;
    int frame_index;

    /* VME reference index for this list (low byte); only 0..3 are valid */
    int ref_idx_l0 = (vme_context->ref_index_in_mb[list] & 0xff);

    if (ref_idx_l0 > 3) {
        WARN_ONCE("ref_idx_l0 is out of range\n");
        ref_idx_l0 = 0;
    }

    /* Locate the DPB slot of the reference picture the VME stage used */
    obj_surface = vme_context->used_reference_objects[list];
    frame_index = -1;
    for (i = 0; i < 16; i++) {
        if (obj_surface &&
            obj_surface == encode_state->reference_objects[i]) {
            frame_index = i;
            break;
        }
    }
    if (frame_index == -1) {
        WARN_ONCE("RefPicList 0 or 1 is not found in DPB!\n");
    }

    BEGIN_BCS_BATCH(batch, 18);

    OUT_BCS_BATCH(batch, HCP_REF_IDX_STATE | (18 - 2));
    OUT_BCS_BATCH(batch,
                  num_ref_minus1 << 1 |
                  list);

    for (i = 0; i < 16; i++) {
        if (i < MIN((num_ref_minus1 + 1), 15)) {
            VAPictureHEVC *ref_pic = &ref_list[i];
            VAPictureHEVC *curr_pic = &pic_param->decoded_curr_pic;

            OUT_BCS_BATCH(batch,
                          1 << 15 |         /* bottom_field_flag 0 */
                          0 << 14 |         /* field_pic_flag 0 */
                          !!(ref_pic->flags & VA_PICTURE_HEVC_LONG_TERM_REFERENCE) << 13 |  /* short term is 1 */
                          0 << 12 | /* disable WP */
                          0 << 11 | /* disable WP */
                          frame_index << 8 |
                          (CLAMP(-128, 127, curr_pic->pic_order_cnt - ref_pic->pic_order_cnt) & 0xff));
        } else {
            /* inactive entries are zeroed */
            OUT_BCS_BATCH(batch, 0);
        }
    }

    ADVANCE_BCS_BATCH(batch);
}
752
753 void
754 intel_hcpe_hevc_ref_idx_state(VADriverContextP ctx,
755                               struct encode_state *encode_state,
756                               struct intel_encoder_context *encoder_context
757                              )
758 {
759     struct intel_batchbuffer *batch = encoder_context->base.batch;
760     VAEncSliceParameterBufferHEVC *slice_param = (VAEncSliceParameterBufferHEVC *)encode_state->slice_params_ext[0]->buffer;
761
762     if (slice_param->slice_type == HEVC_SLICE_I)
763         return;
764
765     gen9_hcpe_ref_idx_state_1(batch, 0, encoder_context, encode_state);
766
767     if (slice_param->slice_type == HEVC_SLICE_P)
768         return;
769
770     gen9_hcpe_ref_idx_state_1(batch, 1, encoder_context, encode_state);
771 }
772
/*
 * Emit one HCP_SLICE_STATE command for the given slice: the slice's
 * start/end CTB coordinates, QP and QP offsets, the per-slice coding
 * flags (SAO, deblocking, temporal MVP, CABAC init, ...) and the PAK
 * insertion controls (header/tail insertion, cabac-zero-word).
 * KBL takes an 11-DW form of the command; other gens take 9 DWs.
 */
static void
gen9_hcpe_hevc_slice_state(VADriverContextP ctx,
                           VAEncPictureParameterBufferHEVC *pic_param,
                           VAEncSliceParameterBufferHEVC *slice_param,
                           struct encode_state *encode_state,
                           struct intel_encoder_context *encoder_context,
                           struct intel_batchbuffer *batch)
{
    struct i965_driver_data *i965 = i965_driver_data(ctx);
    VAEncSequenceParameterBufferHEVC *pSequenceParameter = (VAEncSequenceParameterBufferHEVC *)encode_state->seq_param_ext->buffer;
    int slice_type = slice_param->slice_type;

    int log2_cu_size = pSequenceParameter->log2_min_luma_coding_block_size_minus3 + 3;
    int log2_ctb_size = pSequenceParameter->log2_diff_max_min_luma_coding_block_size + log2_cu_size;
    int ctb_size = 1 << log2_ctb_size;
    int width_in_ctb = (pSequenceParameter->pic_width_in_luma_samples + ctb_size - 1) / ctb_size;
    int height_in_ctb = (pSequenceParameter->pic_height_in_luma_samples + ctb_size - 1) / ctb_size;
    /* last slice iff it ends exactly at the frame's CTB count */
    int last_slice = (((slice_param->slice_segment_address + slice_param->num_ctu_in_slice) == (width_in_ctb * height_in_ctb)) ? 1 : 0);

    int slice_hor_pos, slice_ver_pos, next_slice_hor_pos, next_slice_ver_pos;

    /* convert linear CTB addresses into (x, y) CTB coordinates */
    slice_hor_pos = slice_param->slice_segment_address % width_in_ctb;
    slice_ver_pos = slice_param->slice_segment_address / width_in_ctb;

    next_slice_hor_pos = (slice_param->slice_segment_address + slice_param->num_ctu_in_slice) % width_in_ctb;
    next_slice_ver_pos = (slice_param->slice_segment_address + slice_param->num_ctu_in_slice) / width_in_ctb;

    /* only support multi slice begin from row start address */
    assert((slice_param->slice_segment_address % width_in_ctb) == 0);

    if (last_slice == 1) {
        if (slice_param->slice_segment_address == 0) {
            /* single-slice frame: "next" position is one row past the end */
            next_slice_hor_pos = 0;
            next_slice_ver_pos = height_in_ctb;
        } else {
            next_slice_hor_pos = 0;
            next_slice_ver_pos = 0;
        }
    }

    if(IS_KBL(i965->intel.device_info))
    {
        BEGIN_BCS_BATCH(batch, 11);

        OUT_BCS_BATCH(batch, HCP_SLICE_STATE | (11 - 2));
    }
    else
    {
        BEGIN_BCS_BATCH(batch, 9);

        OUT_BCS_BATCH(batch, HCP_SLICE_STATE | (9 - 2));
    }

    /* DW1: start CTB coordinate of this slice */
    OUT_BCS_BATCH(batch,
                  slice_ver_pos << 16 |
                  slice_hor_pos);
    /* DW2: start CTB coordinate of the next slice (or end-of-frame marker) */
    OUT_BCS_BATCH(batch,
                  next_slice_ver_pos << 16 |
                  next_slice_hor_pos);
    /* DW3: slice QP (pic_init_qp + delta), chroma QP offsets, type, flags */
    OUT_BCS_BATCH(batch,
                  (slice_param->slice_cr_qp_offset & 0x1f) << 17 |
                  (slice_param->slice_cb_qp_offset & 0x1f) << 12 |
                  (pic_param->pic_init_qp + slice_param->slice_qp_delta) << 6 |
                  slice_param->slice_fields.bits.slice_temporal_mvp_enabled_flag << 5 |
                  slice_param->slice_fields.bits.dependent_slice_segment_flag << 4 |
                  last_slice << 2 |
                  slice_type);
    /* DW4: merge candidates, CABAC init, weight denominators, SAO and
     * deblocking controls */
    OUT_BCS_BATCH(batch,
                  0 << 26 |
                  (slice_param->max_num_merge_cand - 1)  << 23 |
                  slice_param->slice_fields.bits.cabac_init_flag << 22 |
                  slice_param->luma_log2_weight_denom << 19 |
                  (slice_param->luma_log2_weight_denom + slice_param->delta_chroma_log2_weight_denom) << 16 |
                  slice_param->slice_fields.bits.collocated_from_l0_flag << 15 |
                  (slice_type != HEVC_SLICE_B) << 14 |
                  slice_param->slice_fields.bits.mvd_l1_zero_flag << 13 |
                  slice_param->slice_fields.bits.slice_sao_luma_flag << 12 |
                  slice_param->slice_fields.bits.slice_sao_chroma_flag << 11 |
                  slice_param->slice_fields.bits.slice_loop_filter_across_slices_enabled_flag << 10 |
                  (slice_param->slice_beta_offset_div2 & 0xf) << 5 |
                  (slice_param->slice_tc_offset_div2 & 0xf) << 1 |
                  slice_param->slice_fields.bits.slice_deblocking_filter_disabled_flag);
    OUT_BCS_BATCH(batch, 0); /* DW 5 ,ignore for encoder.*/
    /* DW6: rounding controls for intra/inter (4/4) */
    OUT_BCS_BATCH(batch,
                  4 << 26 |
                  4 << 20 |
                  0);
    /* DW7: PAK bitstream insertion controls */
    OUT_BCS_BATCH(batch,
                  1 << 10 |  /* header insertion enable */
                  1 << 9  |  /* slice data enable */
                  1 << 8  |  /* tail insertion enable, must at end of frame, not slice */
                  1 << 2  |  /* RBSP or EBSP, EmulationByteSliceInsertEnable */
                  1 << 1  |  /* cabacZeroWordInsertionEnable */
                  0);        /* Ignored for decoding */
    OUT_BCS_BATCH(batch, 0); /* PAK-BSE data start offset */

    /* KBL's longer command form carries two extra DWs */
    if(IS_KBL(i965->intel.device_info))
    {
        OUT_BCS_BATCH(batch, 0);
        OUT_BCS_BATCH(batch, 0);
    }

    ADVANCE_BCS_BATCH(batch);
}
877
878 /* HEVC pipe line related */
879 static void gen9_hcpe_hevc_pipeline_picture_programing(VADriverContextP ctx,
880         struct encode_state *encode_state,
881         struct intel_encoder_context *encoder_context)
882 {
883     struct gen9_hcpe_context *mfc_context = encoder_context->mfc_context;
884
885     mfc_context->pipe_mode_select(ctx, HCP_CODEC_HEVC, encoder_context);
886     mfc_context->set_surface_state(ctx, encode_state, encoder_context);
887     gen9_hcpe_pipe_buf_addr_state(ctx, encode_state, encoder_context);
888     mfc_context->ind_obj_base_addr_state(ctx, encoder_context);
889
890     mfc_context->qm_state(ctx, encoder_context);
891     mfc_context->fqm_state(ctx, encoder_context);
892     mfc_context->pic_state(ctx, encode_state, encoder_context);
893     intel_hcpe_hevc_ref_idx_state(ctx, encode_state, encoder_context);
894 }
895
896 static void gen9_hcpe_init(VADriverContextP ctx,
897                            struct encode_state *encode_state,
898                            struct intel_encoder_context *encoder_context)
899 {
900     /* to do */
901     struct i965_driver_data *i965 = i965_driver_data(ctx);
902     struct gen9_hcpe_context *mfc_context = encoder_context->mfc_context;
903     VAEncSequenceParameterBufferHEVC *pSequenceParameter = (VAEncSequenceParameterBufferHEVC *)encode_state->seq_param_ext->buffer;
904     VAEncSliceParameterBufferHEVC *slice_param = (VAEncSliceParameterBufferHEVC *)encode_state->slice_params_ext[0]->buffer;
905     dri_bo *bo;
906     int i, size = 0;
907     int slice_batchbuffer_size;
908     int slice_type = slice_param->slice_type;
909     int is_inter = (slice_type != HEVC_SLICE_I);
910
911     int log2_cu_size = pSequenceParameter->log2_min_luma_coding_block_size_minus3 + 3;
912     int log2_ctb_size = pSequenceParameter->log2_diff_max_min_luma_coding_block_size + log2_cu_size;
913     int ctb_size = 1 << log2_ctb_size;
914     int cu_size  = 1 << log2_cu_size;
915
916     int width_in_ctb  = ALIGN(pSequenceParameter->pic_width_in_luma_samples , ctb_size) / ctb_size;
917     int height_in_ctb = ALIGN(pSequenceParameter->pic_height_in_luma_samples, ctb_size) / ctb_size;
918     int width_in_cu  = ALIGN(pSequenceParameter->pic_width_in_luma_samples , cu_size) / cu_size;
919     int height_in_cu = ALIGN(pSequenceParameter->pic_height_in_luma_samples, cu_size) / cu_size;
920     int width_in_mb  = ALIGN(pSequenceParameter->pic_width_in_luma_samples , 16) / 16;
921     int height_in_mb = ALIGN(pSequenceParameter->pic_height_in_luma_samples, 16) / 16;
922
923     int num_cu_record = 64;
924     int size_shift = 3;
925
926     if((pSequenceParameter->seq_fields.bits.bit_depth_luma_minus8 > 0)
927         || (pSequenceParameter->seq_fields.bits.bit_depth_chroma_minus8 > 0))
928         size_shift = 2;
929
930     if (log2_ctb_size == 5) num_cu_record = 16;
931     else if (log2_ctb_size == 4) num_cu_record = 4;
932     else if (log2_ctb_size == 6) num_cu_record = 64;
933
934     /* frame size in samples, cu,ctu, mb */
935     mfc_context->pic_size.picture_width_in_samples = pSequenceParameter->pic_width_in_luma_samples;
936     mfc_context->pic_size.picture_height_in_samples = pSequenceParameter->pic_height_in_luma_samples;
937     mfc_context->pic_size.ctb_size = ctb_size;
938     mfc_context->pic_size.picture_width_in_ctbs = width_in_ctb;
939     mfc_context->pic_size.picture_height_in_ctbs = height_in_ctb;
940     mfc_context->pic_size.min_cb_size = cu_size;
941     mfc_context->pic_size.picture_width_in_min_cb_minus1 = width_in_cu - 1;
942     mfc_context->pic_size.picture_height_in_min_cb_minus1 = height_in_cu - 1;
943     mfc_context->pic_size.picture_width_in_mbs = width_in_mb;
944     mfc_context->pic_size.picture_height_in_mbs = height_in_mb;
945
946     slice_batchbuffer_size = 64 * width_in_ctb * width_in_ctb + 4096 +
947                              (SLICE_HEADER + SLICE_TAIL) * encode_state->num_slice_params_ext;
948
949     /*Encode common setup for HCP*/
950     /*deblocking */
951     dri_bo_unreference(mfc_context->deblocking_filter_line_buffer.bo);
952     mfc_context->deblocking_filter_line_buffer.bo = NULL;
953
954     dri_bo_unreference(mfc_context->deblocking_filter_tile_line_buffer.bo);
955     mfc_context->deblocking_filter_tile_line_buffer.bo = NULL;
956
957     dri_bo_unreference(mfc_context->deblocking_filter_tile_column_buffer.bo);
958     mfc_context->deblocking_filter_tile_column_buffer.bo = NULL;
959
960     /* input source */
961     dri_bo_unreference(mfc_context->uncompressed_picture_source.bo);
962     mfc_context->uncompressed_picture_source.bo = NULL;
963
964     /* metadata */
965     dri_bo_unreference(mfc_context->metadata_line_buffer.bo);
966     mfc_context->metadata_line_buffer.bo = NULL;
967
968     dri_bo_unreference(mfc_context->metadata_tile_line_buffer.bo);
969     mfc_context->metadata_tile_line_buffer.bo = NULL;
970
971     dri_bo_unreference(mfc_context->metadata_tile_column_buffer.bo);
972     mfc_context->metadata_tile_column_buffer.bo = NULL;
973
974     /* sao */
975     dri_bo_unreference(mfc_context->sao_line_buffer.bo);
976     mfc_context->sao_line_buffer.bo = NULL;
977
978     dri_bo_unreference(mfc_context->sao_tile_line_buffer.bo);
979     mfc_context->sao_tile_line_buffer.bo = NULL;
980
981     dri_bo_unreference(mfc_context->sao_tile_column_buffer.bo);
982     mfc_context->sao_tile_column_buffer.bo = NULL;
983
984     /* mv temporal buffer */
985     for (i = 0; i < NUM_HCP_CURRENT_COLLOCATED_MV_TEMPORAL_BUFFERS; i++) {
986         if (mfc_context->current_collocated_mv_temporal_buffer[i].bo != NULL)
987             dri_bo_unreference(mfc_context->current_collocated_mv_temporal_buffer[i].bo);
988         mfc_context->current_collocated_mv_temporal_buffer[i].bo = NULL;
989     }
990
991     /* reference */
992     for (i = 0; i < MAX_HCP_REFERENCE_SURFACES; i++) {
993         if (mfc_context->reference_surfaces[i].bo != NULL)
994             dri_bo_unreference(mfc_context->reference_surfaces[i].bo);
995         mfc_context->reference_surfaces[i].bo = NULL;
996     }
997
998     /* indirect data CU recording */
999     dri_bo_unreference(mfc_context->hcp_indirect_cu_object.bo);
1000     mfc_context->hcp_indirect_cu_object.bo = NULL;
1001
1002     dri_bo_unreference(mfc_context->hcp_indirect_pak_bse_object.bo);
1003     mfc_context->hcp_indirect_pak_bse_object.bo = NULL;
1004
1005     /* Current internal buffer for HCP */
1006
1007     size = ALIGN(pSequenceParameter->pic_width_in_luma_samples, 32) >> size_shift;
1008     size <<= 6;
1009     ALLOC_ENCODER_BUFFER((&mfc_context->deblocking_filter_line_buffer), "line buffer", size);
1010     ALLOC_ENCODER_BUFFER((&mfc_context->deblocking_filter_tile_line_buffer), "tile line buffer", size);
1011
1012     size = ALIGN(pSequenceParameter->pic_height_in_luma_samples + 6 * width_in_ctb, 32) >> size_shift;
1013     size <<= 6;
1014     ALLOC_ENCODER_BUFFER((&mfc_context->deblocking_filter_tile_column_buffer), "tile column buffer", size);
1015
1016     if (is_inter) {
1017         size = (((pSequenceParameter->pic_width_in_luma_samples + 15) >> 4) * 188 + 9 * width_in_ctb + 1023) >> 9;
1018         size <<= 6;
1019         ALLOC_ENCODER_BUFFER((&mfc_context->metadata_line_buffer), "metadata line buffer", size);
1020
1021         size = (((pSequenceParameter->pic_width_in_luma_samples + 15) >> 4) * 172 + 9 * width_in_ctb + 1023) >> 9;
1022         size <<= 6;
1023         ALLOC_ENCODER_BUFFER((&mfc_context->metadata_tile_line_buffer), "metadata tile line buffer", size);
1024
1025         size = (((pSequenceParameter->pic_height_in_luma_samples + 15) >> 4) * 176 + 89 * width_in_ctb + 1023) >> 9;
1026         size <<= 6;
1027         ALLOC_ENCODER_BUFFER((&mfc_context->metadata_tile_column_buffer), "metadata tile column buffer", size);
1028     } else {
1029         size = (pSequenceParameter->pic_width_in_luma_samples + 8 * width_in_ctb + 1023) >> 9;
1030         size <<= 6;
1031         ALLOC_ENCODER_BUFFER((&mfc_context->metadata_line_buffer), "metadata line buffer", size);
1032
1033         size = (pSequenceParameter->pic_width_in_luma_samples + 16 * width_in_ctb + 1023) >> 9;
1034         size <<= 6;
1035         ALLOC_ENCODER_BUFFER((&mfc_context->metadata_tile_line_buffer), "metadata tile line buffer", size);
1036
1037         size = (pSequenceParameter->pic_height_in_luma_samples + 8 * height_in_ctb + 1023) >> 9;
1038         size <<= 6;
1039         ALLOC_ENCODER_BUFFER((&mfc_context->metadata_tile_column_buffer), "metadata tile column buffer", size);
1040     }
1041
1042     size = ALIGN(((pSequenceParameter->pic_width_in_luma_samples >> 1) + 3 * width_in_ctb), 16) >> size_shift;
1043     size <<= 6;
1044     ALLOC_ENCODER_BUFFER((&mfc_context->sao_line_buffer), "sao line buffer", size);
1045
1046     size = ALIGN(((pSequenceParameter->pic_width_in_luma_samples >> 1) + 6 * width_in_ctb), 16) >> size_shift;
1047     size <<= 6;
1048     ALLOC_ENCODER_BUFFER((&mfc_context->sao_tile_line_buffer), "sao tile line buffer", size);
1049
1050     size = ALIGN(((pSequenceParameter->pic_height_in_luma_samples >> 1) + 6 * height_in_ctb), 16) >> size_shift;
1051     size <<= 6;
1052     ALLOC_ENCODER_BUFFER((&mfc_context->sao_tile_column_buffer), "sao tile column buffer", size);
1053
1054     /////////////////////
1055     dri_bo_unreference(mfc_context->hcp_indirect_cu_object.bo);
1056     bo = dri_bo_alloc(i965->intel.bufmgr,
1057                       "Indirect data CU Buffer",
1058                       width_in_ctb * height_in_ctb * num_cu_record * 16 * 4,
1059                       0x1000);
1060     assert(bo);
1061     mfc_context->hcp_indirect_cu_object.bo = bo;
1062
1063     /* to do pak bse object buffer */
1064     /* to do current collocated mv temporal buffer */
1065
1066     dri_bo_unreference(mfc_context->hcp_batchbuffer_surface.bo);
1067     mfc_context->hcp_batchbuffer_surface.bo = NULL;
1068
1069     dri_bo_unreference(mfc_context->aux_batchbuffer_surface.bo);
1070     mfc_context->aux_batchbuffer_surface.bo = NULL;
1071
1072     if (mfc_context->aux_batchbuffer)
1073         intel_batchbuffer_free(mfc_context->aux_batchbuffer);
1074
1075     mfc_context->aux_batchbuffer = intel_batchbuffer_new(&i965->intel, I915_EXEC_BSD, slice_batchbuffer_size);
1076     mfc_context->aux_batchbuffer_surface.bo = mfc_context->aux_batchbuffer->buffer;
1077     dri_bo_reference(mfc_context->aux_batchbuffer_surface.bo);
1078     mfc_context->aux_batchbuffer_surface.pitch = 16;
1079     mfc_context->aux_batchbuffer_surface.num_blocks = mfc_context->aux_batchbuffer->size / 16;
1080     mfc_context->aux_batchbuffer_surface.size_block = 16;
1081 }
1082
1083 static VAStatus gen9_hcpe_run(VADriverContextP ctx,
1084                               struct encode_state *encode_state,
1085                               struct intel_encoder_context *encoder_context)
1086 {
1087     struct intel_batchbuffer *batch = encoder_context->base.batch;
1088
1089     intel_batchbuffer_flush(batch);     //run the pipeline
1090
1091     return VA_STATUS_SUCCESS;
1092 }
1093
1094
1095 static VAStatus
1096 gen9_hcpe_stop(VADriverContextP ctx,
1097                struct encode_state *encode_state,
1098                struct intel_encoder_context *encoder_context,
1099                int *encoded_bits_size)
1100 {
1101     VAStatus vaStatus = VA_STATUS_ERROR_UNKNOWN;
1102     VAEncPictureParameterBufferHEVC *pPicParameter = (VAEncPictureParameterBufferHEVC *)encode_state->pic_param_ext->buffer;
1103     VACodedBufferSegment *coded_buffer_segment;
1104
1105     vaStatus = i965_MapBuffer(ctx, pPicParameter->coded_buf, (void **)&coded_buffer_segment);
1106     assert(vaStatus == VA_STATUS_SUCCESS);
1107     *encoded_bits_size = coded_buffer_segment->size * 8;
1108     i965_UnmapBuffer(ctx, pPicParameter->coded_buf);
1109
1110     return VA_STATUS_SUCCESS;
1111 }
1112
1113
/*
 * Count how many leading bytes of a packed header the PAK hardware
 * must copy verbatim (skip emulation-prevention processing for):
 * any zero padding before the start code, the 3- or 4-byte start code
 * itself, and the two-byte HEVC NAL unit header.  Returns 0 (and
 * warns) if no 000001 / 00000001 start code is found.
 */
int intel_hevc_find_skipemulcnt(unsigned char *buf, int bits_length)
{
    int i, found;
    int leading_zero_cnt, byte_length, zero_byte;
    int nal_unit_type;
    int skip_cnt = 0;

#define NAL_UNIT_TYPE_MASK 0x7e
#define HW_MAX_SKIP_LENGTH 15

    /* round the bit length up to a DW boundary, then convert to bytes */
    byte_length = ALIGN(bits_length, 32) >> 3;


    /* scan for the 000001 (3-byte) or 00000001 (4-byte) start prefix */
    leading_zero_cnt = 0;
    found = 0;
    for (i = 0; i < byte_length - 4; i++) {
        if (((buf[i] == 0) && (buf[i + 1] == 0) && (buf[i + 2] == 1)) ||
            ((buf[i] == 0) && (buf[i + 1] == 0) && (buf[i + 2] == 0) && (buf[i + 3] == 1))) {
            found = 1;
            break;
        }
        leading_zero_cnt++;
    }
    if (!found) {
        /* warning message is complained. But anyway it will be inserted. */
        WARN_ONCE("Invalid packed header data. "
                  "Can't find the 000001 start_prefix code\n");
        return 0;
    }
    i = leading_zero_cnt;

    /* 4-byte start code carries one extra zero byte before 001 */
    zero_byte = 0;
    if (!((buf[i] == 0) && (buf[i + 1] == 0) && (buf[i + 2] == 1)))
        zero_byte = 1;

    skip_cnt = leading_zero_cnt + zero_byte + 3;

    /* the unit header byte is accounted */
    /* NOTE(review): 0x7e keeps nal_unit_type left-shifted by 1 bit
     * (HEVC stores it in bits [6:1]), so the 14/20/21 comparison below
     * tests shifted values — harmless since that branch is empty. */
    nal_unit_type = (buf[skip_cnt]) & NAL_UNIT_TYPE_MASK;
    skip_cnt += 1;
    skip_cnt += 1;  /* two bytes length of nal headers in hevc */

    if (nal_unit_type == 14 || nal_unit_type == 20 || nal_unit_type == 21) {
        /* more unit header bytes are accounted for MVC/SVC */
        //skip_cnt += 3;
    }
    if (skip_cnt > HW_MAX_SKIP_LENGTH) {
        WARN_ONCE("Too many leading zeros are padded for packed data. "
                  "It is beyond the HW range.!!!\n");
    }
    return skip_cnt;
}
1167
1168 #ifdef HCP_SOFTWARE_SKYLAKE
1169
1170 static int
1171 gen9_hcpe_hevc_pak_object(VADriverContextP ctx, int lcu_x, int lcu_y, int isLast_ctb,
1172                           struct intel_encoder_context *encoder_context,
1173                           int cu_count_in_lcu, unsigned int split_coding_unit_flag,
1174                           struct intel_batchbuffer *batch)
1175 {
1176     struct i965_driver_data *i965 = i965_driver_data(ctx);
1177     int len_in_dwords = 3;
1178
1179     if(IS_KBL(i965->intel.device_info))
1180         len_in_dwords = 5;
1181
1182     if (batch == NULL)
1183         batch = encoder_context->base.batch;
1184
1185     BEGIN_BCS_BATCH(batch, len_in_dwords);
1186
1187     OUT_BCS_BATCH(batch, HCP_PAK_OBJECT | (len_in_dwords - 2));
1188     OUT_BCS_BATCH(batch,
1189                   (((isLast_ctb > 0) ? 1 : 0) << 31) |  /* last ctb?*/
1190                   ((cu_count_in_lcu - 1) << 24) |           /* No motion vector */
1191                   split_coding_unit_flag);
1192
1193     OUT_BCS_BATCH(batch, (lcu_y << 16) | lcu_x);        /* LCU  for Y*/
1194
1195     if(IS_KBL(i965->intel.device_info))
1196     {
1197         OUT_BCS_BATCH(batch, 0);
1198         OUT_BCS_BATCH(batch, 0);
1199     }
1200
1201     ADVANCE_BCS_BATCH(batch);
1202
1203     return len_in_dwords;
1204 }
1205
1206 #define     AVC_INTRA_RDO_OFFSET    4
1207 #define     AVC_INTER_RDO_OFFSET    10
1208 #define     AVC_INTER_MSG_OFFSET    8
1209 #define     AVC_INTER_MV_OFFSET     48
1210 #define     AVC_RDO_MASK            0xFFFF
1211
1212 #define     AVC_INTRA_MODE_MASK     0x30
1213 #define     AVC_INTRA_16X16         0x00
1214 #define     AVC_INTRA_8X8           0x01
1215 #define     AVC_INTRA_4X4           0x02
1216
1217 #define     AVC_INTER_MODE_MASK     0x03
1218 #define     AVC_INTER_8X8           0x03
1219 #define     AVC_INTER_8X16          0x02
1220 #define     AVC_INTER_16X8          0x01
1221 #define     AVC_INTER_16X16         0x00
1222 #define     AVC_SUBMB_SHAPE_MASK    0x00FF00
1223
1224 /* VME output message, write back message */
1225 #define     AVC_INTER_SUBMB_PRE_MODE_MASK       0x00ff0000
1226 #define     AVC_SUBMB_SHAPE_MASK    0x00FF00
1227
1228 /* here 1 MB = 1CU = 16x16 */
/* here 1 MB = 1CU = 16x16 */
/*
 * Translate one intra macroblock of AVC VME output into a 16-DW HEVC
 * CU record inside the indirect CU object (1 MB == 1 CU == 16x16).
 * AVC 16x16/8x8 intra modes are remapped to HEVC angular/DC/planar
 * modes (4x4 is treated like 8x8); all motion-vector and reference
 * fields are written as zero for intra CUs.
 *
 * NOTE(review): mb_x/mb_y/ctb_width_in_mb/slice_type are currently
 * unused and mb_address_in_ctb is fixed at 0, so only cu_index
 * offsets the record within the CTB — confirm against callers.
 */
static void
gen9_hcpe_hevc_fill_indirect_cu_intra(VADriverContextP ctx,
                                      struct encode_state *encode_state,
                                      struct intel_encoder_context *encoder_context,
                                      int qp, unsigned int *msg,
                                      int ctb_x, int ctb_y,
                                      int mb_x, int mb_y,
                                      int ctb_width_in_mb, int width_in_ctb, int num_cu_record, int slice_type,int cu_index,int index)
{
    /* here cu == mb, so we use mb address as the cu address */
    /* to fill the indirect cu by the vme out */
    /* AVC intra prediction mode number -> closest HEVC intra mode */
    static int intra_mode_8x8_avc2hevc[9] = {26, 10, 1, 34, 18, 24, 13, 28, 8};
    static int intra_mode_16x16_avc2hevc[4] = {26, 10, 1, 34};
    struct gen9_hcpe_context *mfc_context = encoder_context->mfc_context;
    unsigned char * cu_record_ptr = NULL;
    unsigned int * cu_msg = NULL;
    /* byte offset of this CU's 16-DW record within the indirect object */
    int ctb_address = (ctb_y * width_in_ctb + ctb_x) * num_cu_record;
    int mb_address_in_ctb = 0;
    int cu_address = (ctb_address + mb_address_in_ctb + cu_index) * 16 * 4;
    int zero = 0;
    int is_inter = 0;
    int intraMbMode = 0;
    int cu_part_mode = 0;
    int intraMode[4];
    int inerpred_idc = 0;
    int intra_chroma_mode = 5;
    int cu_size = 1;
    int tu_size = 0x55;
    int tu_count = 4;
    int chroma_mode_remap[4]={5,4,3,2};

    /* intra CU: mark every prediction as "no inter prediction" */
    if (!is_inter) inerpred_idc = 0xff;

    intraMbMode = (msg[0] & AVC_INTRA_MODE_MASK) >> 4;

    /* remap AVC chroma mode to the HEVC chroma mode encoding */
    intra_chroma_mode = (msg[3] & 0x3);
    intra_chroma_mode =  chroma_mode_remap[intra_chroma_mode];
    if (intraMbMode == AVC_INTRA_16X16) {
        cu_part_mode = 0; //2Nx2N
        cu_size = 1;
        tu_size = 0x55;
        tu_count = 4;
        intraMode[0] = intra_mode_16x16_avc2hevc[msg[1] & 0xf];
        intraMode[1] = intra_mode_16x16_avc2hevc[msg[1] & 0xf];
        intraMode[2] = intra_mode_16x16_avc2hevc[msg[1] & 0xf];
        intraMode[3] = intra_mode_16x16_avc2hevc[msg[1] & 0xf];
    } else if (intraMbMode == AVC_INTRA_8X8) {
        cu_part_mode = 0; //2Nx2N
        cu_size = 0;
        tu_size = 0;
        tu_count = 4;
        /* one 4-bit AVC mode per 8x8 block, selected by index */
        intraMode[0] = intra_mode_8x8_avc2hevc[msg[1] >> (index << 2) & 0xf];
        intraMode[1] = intra_mode_8x8_avc2hevc[msg[1] >> (index << 2) & 0xf];
        intraMode[2] = intra_mode_8x8_avc2hevc[msg[1] >> (index << 2) & 0xf];
        intraMode[3] = intra_mode_8x8_avc2hevc[msg[1] >> (index << 2) & 0xf];

    } else { // for 4x4 to use 8x8 replace
        cu_part_mode = 3; //NxN
        cu_size = 0;
        tu_size = 0;
        tu_count = 4;
        /* four 4-bit AVC modes packed per 16 bits, one per 4x4 sub-block */
        intraMode[0] = intra_mode_8x8_avc2hevc[msg[1] >> ((index << 4) + 0) & 0xf];
        intraMode[1] = intra_mode_8x8_avc2hevc[msg[1] >> ((index << 4) + 4) & 0xf];
        intraMode[2] = intra_mode_8x8_avc2hevc[msg[1] >> ((index << 4) + 8) & 0xf];
        intraMode[3] = intra_mode_8x8_avc2hevc[msg[1] >> ((index << 4) + 12) & 0xf];

    }

    /* NOTE(review): assumes the indirect CU bo is mapped (->virtual valid) */
    cu_record_ptr = (unsigned char *)mfc_context->hcp_indirect_cu_object.bo->virtual;
    /* get the mb info from the vme out */
    cu_msg = (unsigned int *)(cu_record_ptr + cu_address);

    /* DW0: prediction type, QP, chroma mode, partitioning, CU size */
    cu_msg[0] = (inerpred_idc << 24 |   /* interpred_idc[3:0][1:0] */
                 zero << 23 |   /* reserved */
                 qp << 16 | /* CU_qp */
                 zero << 11 |   /* reserved */
                 intra_chroma_mode << 8 |   /* intra_chroma_mode */
                 zero << 7 |    /* IPCM_enable , reserved for SKL*/
                 cu_part_mode << 4 |    /* cu_part_mode */
                 zero << 3 |    /* cu_transquant_bypass_flag */
                 is_inter << 2 |    /* cu_pred_mode :intra 1,inter 1*/
                 cu_size          /* cu_size */
                );
    /* DW1: four luma intra modes, one per partition */
    cu_msg[1] = (zero << 30 |   /* reserved  */
                 intraMode[3] << 24 |   /* intra_mode */
                 zero << 22 |   /* reserved  */
                 intraMode[2] << 16 |   /* intra_mode */
                 zero << 14 |   /* reserved  */
                 intraMode[1] << 8 |    /* intra_mode */
                 zero << 6 |    /* reserved  */
                 intraMode[0]           /* intra_mode */
                );
    /* l0: 4 MV (x,y); l1; 4 MV (x,y) */
    cu_msg[2] = (zero << 16 |   /* mvx_l0[1]  */
                 zero           /* mvx_l0[0] */
                );
    cu_msg[3] = (zero << 16 |   /* mvx_l0[3]  */
                 zero           /* mvx_l0[2] */
                );
    cu_msg[4] = (zero << 16 |   /* mvy_l0[1]  */
                 zero           /* mvy_l0[0] */
                );
    cu_msg[5] = (zero << 16 |   /* mvy_l0[3]  */
                 zero           /* mvy_l0[2] */
                );

    cu_msg[6] = (zero << 16 |   /* mvx_l1[1]  */
                 zero           /* mvx_l1[0] */
                );
    cu_msg[7] = (zero << 16 |   /* mvx_l1[3]  */
                 zero           /* mvx_l1[2] */
                );
    cu_msg[8] = (zero << 16 |   /* mvy_l1[1]  */
                 zero           /* mvy_l1[0] */
                );
    cu_msg[9] = (zero << 16 |   /* mvy_l1[3]  */
                 zero           /* mvy_l1[2] */
                );

    cu_msg[10] = (zero << 28 |  /* ref_idx_l1[3]  */
                  zero << 24 |  /* ref_idx_l1[2] */
                  zero << 20 |  /* ref_idx_l1[1]  */
                  zero << 16 |  /* ref_idx_l1[0] */
                  zero << 12 |  /* ref_idx_l0[3]  */
                  zero << 8 |   /* ref_idx_l0[2] */
                  zero << 4 |   /* ref_idx_l0[1]  */
                  zero          /* ref_idx_l0[0] */
                 );

    cu_msg[11] = tu_size; /* tu_size 00000000 00000000 00000000 10101010  or 0x0*/
    cu_msg[12] = ((tu_count - 1) << 28 | /* tu count - 1 */
                  zero << 16 |  /* reserved  */
                  zero          /* tu_xform_Yskip[15:0] */
                 );
    cu_msg[13] = (zero << 16 |  /* tu_xform_Vskip[15:0]  */
                  zero          /* tu_xform_Uskip[15:0] */
                 );
    cu_msg[14] = zero ;
    cu_msg[15] = zero ;
}
1369
1370 /* here 1 MB = 1CU = 16x16 */
1371 static void
1372 gen9_hcpe_hevc_fill_indirect_cu_inter(VADriverContextP ctx,
1373                                       struct encode_state *encode_state,
1374                                       struct intel_encoder_context *encoder_context,
1375                                       int qp, unsigned int *msg,
1376                                       int ctb_x, int ctb_y,
1377                                       int mb_x, int mb_y,
1378                                       int ctb_width_in_mb, int width_in_ctb, int num_cu_record, int slice_type, int cu_index,int index)
1379 {
1380     /* here cu == mb, so we use mb address as the cu address */
1381     /* to fill the indirect cu by the vme out */
1382     struct gen9_hcpe_context *mfc_context = encoder_context->mfc_context;
1383     struct gen6_vme_context *vme_context = encoder_context->vme_context;
1384     unsigned char * cu_record_ptr = NULL;
1385     unsigned int * cu_msg = NULL;
1386     int ctb_address = (ctb_y * width_in_ctb + ctb_x) * num_cu_record;
1387     int mb_address_in_ctb = 0;
1388     int cu_address = (ctb_address + mb_address_in_ctb + cu_index) * 16 * 4;
1389     int zero = 0;
1390     int cu_part_mode = 0;
1391     int submb_pre_mode = 0;
1392     int is_inter = 1;
1393     int cu_size = 1;
1394     int tu_size = 0x55;
1395     int tu_count = 4;
1396     int inter_mode = 0;
1397
1398     unsigned int *mv_ptr;
1399     {
1400         inter_mode = (msg[0] & AVC_INTER_MODE_MASK);
1401         submb_pre_mode = (msg[1] & AVC_INTER_SUBMB_PRE_MODE_MASK) >> 16;
1402 #define MSG_MV_OFFSET   4
1403         mv_ptr = msg + MSG_MV_OFFSET;
1404         /* MV of VME output is based on 16 sub-blocks. So it is necessary
1405         * to convert them to be compatible with the format of AVC_PAK
1406         * command.
1407         */
1408         /* 0/2/4/6/8... : l0, 1/3/5/7...: l1 ; now it only support 16x16,16x8,8x16,8x8*/
1409
1410         if (inter_mode == AVC_INTER_16X16) {
1411             mv_ptr[4] = mv_ptr[0];
1412             mv_ptr[5] = mv_ptr[1];
1413             mv_ptr[2] = mv_ptr[0];
1414             mv_ptr[3] = mv_ptr[1];
1415             mv_ptr[6] = mv_ptr[0];
1416             mv_ptr[7] = mv_ptr[1];
1417             cu_part_mode = 0;
1418             cu_size = 1;
1419             tu_size = 0x55;
1420             tu_count = 4;
1421         } else if (inter_mode == AVC_INTER_8X16) {
1422             mv_ptr[4] = mv_ptr[0];
1423             mv_ptr[5] = mv_ptr[1];
1424             mv_ptr[2] = mv_ptr[8];
1425             mv_ptr[3] = mv_ptr[9];
1426             mv_ptr[6] = mv_ptr[8];
1427             mv_ptr[7] = mv_ptr[9];
1428             cu_part_mode = 1;
1429             cu_size = 1;
1430             tu_size = 0x55;
1431             tu_count = 4;
1432         } else if (inter_mode == AVC_INTER_16X8) {
1433             mv_ptr[2] = mv_ptr[0];
1434             mv_ptr[3] = mv_ptr[1];
1435             mv_ptr[4] = mv_ptr[16];
1436             mv_ptr[5] = mv_ptr[17];
1437             mv_ptr[6] = mv_ptr[24];
1438             mv_ptr[7] = mv_ptr[25];
1439             cu_part_mode = 2;
1440             cu_size = 1;
1441             tu_size = 0x55;
1442             tu_count = 4;
1443         }else if(inter_mode == AVC_INTER_8X8) {
1444             mv_ptr[0] = mv_ptr[index * 8 + 0 ];
1445             mv_ptr[1] = mv_ptr[index * 8 + 1 ];
1446             mv_ptr[2] = mv_ptr[index * 8 + 0 ];
1447             mv_ptr[3] = mv_ptr[index * 8 + 1 ];
1448             mv_ptr[4] = mv_ptr[index * 8 + 0 ];
1449             mv_ptr[5] = mv_ptr[index * 8 + 1 ];
1450             mv_ptr[6] = mv_ptr[index * 8 + 0 ];
1451             mv_ptr[7] = mv_ptr[index * 8 + 1 ];
1452             cu_part_mode = 0;
1453             cu_size = 0;
1454             tu_size = 0x0;
1455             tu_count = 4;
1456
1457         }else
1458         {
1459             mv_ptr[4] = mv_ptr[0];
1460             mv_ptr[5] = mv_ptr[1];
1461             mv_ptr[2] = mv_ptr[0];
1462             mv_ptr[3] = mv_ptr[1];
1463             mv_ptr[6] = mv_ptr[0];
1464             mv_ptr[7] = mv_ptr[1];
1465             cu_part_mode = 0;
1466             cu_size = 1;
1467             tu_size = 0x55;
1468             tu_count = 4;
1469
1470         }
1471     }
1472
1473     cu_record_ptr = (unsigned char *)mfc_context->hcp_indirect_cu_object.bo->virtual;
1474     /* get the mb info from the vme out */
1475     cu_msg = (unsigned int *)(cu_record_ptr + cu_address);
1476
1477     cu_msg[0] = (submb_pre_mode << 24 | /* interpred_idc[3:0][1:0] */
1478                  zero << 23 |   /* reserved */
1479                  qp << 16 | /* CU_qp */
1480                  zero << 11 |   /* reserved */
1481                  5 << 8 |   /* intra_chroma_mode */
1482                  zero << 7 |    /* IPCM_enable , reserved for SKL*/
1483                  cu_part_mode << 4 |    /* cu_part_mode */
1484                  zero << 3 |    /* cu_transquant_bypass_flag */
1485                  is_inter << 2 |    /* cu_pred_mode :intra 1,inter 1*/
1486                  cu_size          /* cu_size */
1487                 );
1488     cu_msg[1] = (zero << 30 |   /* reserved  */
1489                  zero << 24 |   /* intra_mode */
1490                  zero << 22 |   /* reserved  */
1491                  zero << 16 |   /* intra_mode */
1492                  zero << 14 |   /* reserved  */
1493                  zero << 8 |    /* intra_mode */
1494                  zero << 6 |    /* reserved  */
1495                  zero           /* intra_mode */
1496                 );
1497     /* l0: 4 MV (x,y); l1; 4 MV (x,y) */
1498     cu_msg[2] = ((mv_ptr[2] & 0xffff) << 16 |   /* mvx_l0[1]  */
1499                  (mv_ptr[0] & 0xffff)           /* mvx_l0[0] */
1500                 );
1501     cu_msg[3] = ((mv_ptr[6] & 0xffff) << 16 |   /* mvx_l0[3]  */
1502                  (mv_ptr[4] & 0xffff)           /* mvx_l0[2] */
1503                 );
1504     cu_msg[4] = ((mv_ptr[2] & 0xffff0000) |         /* mvy_l0[1]  */
1505                  (mv_ptr[0] & 0xffff0000) >> 16     /* mvy_l0[0] */
1506                 );
1507     cu_msg[5] = ((mv_ptr[6] & 0xffff0000) |         /* mvy_l0[3]  */
1508                  (mv_ptr[4] & 0xffff0000) >> 16     /* mvy_l0[2] */
1509                 );
1510
1511     cu_msg[6] = ((mv_ptr[3] & 0xffff) << 16 |   /* mvx_l1[1]  */
1512                  (mv_ptr[1] & 0xffff)           /* mvx_l1[0] */
1513                 );
1514     cu_msg[7] = ((mv_ptr[7] & 0xffff) << 16 |   /* mvx_l1[3]  */
1515                  (mv_ptr[5] & 0xffff)           /* mvx_l1[2] */
1516                 );
1517     cu_msg[8] = ((mv_ptr[3] & 0xffff0000) |         /* mvy_l1[1]  */
1518                  (mv_ptr[1] & 0xffff0000) >> 16     /* mvy_l1[0] */
1519                 );
1520     cu_msg[9] = ((mv_ptr[7] & 0xffff0000) |         /* mvy_l1[3]  */
1521                  (mv_ptr[5] & 0xffff0000) >> 16     /* mvy_l1[2] */
1522                 );
1523
1524     cu_msg[10] = (((vme_context->ref_index_in_mb[1] >> 24) & 0xf) << 28 |   /* ref_idx_l1[3]  */
1525                   ((vme_context->ref_index_in_mb[1] >> 16) & 0xf) << 24 |   /* ref_idx_l1[2] */
1526                   ((vme_context->ref_index_in_mb[1] >> 8) & 0xf) << 20 |    /* ref_idx_l1[1]  */
1527                   ((vme_context->ref_index_in_mb[1] >> 0) & 0xf) << 16 |    /* ref_idx_l1[0] */
1528                   ((vme_context->ref_index_in_mb[0] >> 24) & 0xf) << 12 |   /* ref_idx_l0[3]  */
1529                   ((vme_context->ref_index_in_mb[0] >> 16) & 0xf) << 8  |   /* ref_idx_l0[2] */
1530                   ((vme_context->ref_index_in_mb[0] >> 8) & 0xf) << 4 |     /* ref_idx_l0[1]  */
1531                   ((vme_context->ref_index_in_mb[0] >> 0) & 0xf)            /* ref_idx_l0[0] */
1532                  );
1533
1534     cu_msg[11] = tu_size; /* tu_size 00000000 00000000 00000000 10101010  or 0x0*/
1535     cu_msg[12] = ((tu_count - 1) << 28 | /* tu count - 1 */
1536                   zero << 16 |  /* reserved  */
1537                   zero          /* tu_xform_Yskip[15:0] */
1538                  );
1539     cu_msg[13] = (zero << 16 |  /* tu_xform_Vskip[15:0]  */
1540                   zero          /* tu_xform_Uskip[15:0] */
1541                  );
1542     cu_msg[14] = zero ;
1543     cu_msg[15] = zero ;
1544 }
1545
1546 #define HEVC_SPLIT_CU_FLAG_64_64 ((0x1<<20)|(0xf<<16)|(0x0<<12)|(0x0<<8)|(0x0<<4)|(0x0))
1547 #define HEVC_SPLIT_CU_FLAG_32_32 ((0x1<<20)|(0x0<<16)|(0x0<<12)|(0x0<<8)|(0x0<<4)|(0x0))
1548 #define HEVC_SPLIT_CU_FLAG_16_16 ((0x0<<20)|(0x0<<16)|(0x0<<12)|(0x0<<8)|(0x0<<4)|(0x0))
1549 #define HEVC_SPLIT_CU_FLAG_8_8   ((0x1<<20)|(0x0<<16)|(0x0<<12)|(0x0<<8)|(0x0<<4)|(0x0))
1550
1551
1552 void
1553 intel_hevc_slice_insert_packed_data(VADriverContextP ctx,
1554                                     struct encode_state *encode_state,
1555                                     struct intel_encoder_context *encoder_context,
1556                                     int slice_index,
1557                                     struct intel_batchbuffer *slice_batch)
1558 {
1559     int count, i, start_index;
1560     unsigned int length_in_bits;
1561     VAEncPackedHeaderParameterBuffer *param = NULL;
1562     unsigned int *header_data = NULL;
1563     struct gen9_hcpe_context *mfc_context = encoder_context->mfc_context;
1564     int slice_header_index;
1565
1566     if (encode_state->slice_header_index[slice_index] == 0)
1567         slice_header_index = -1;
1568     else
1569         slice_header_index = (encode_state->slice_header_index[slice_index] & SLICE_PACKED_DATA_INDEX_MASK);
1570
1571     count = encode_state->slice_rawdata_count[slice_index];
1572     start_index = (encode_state->slice_rawdata_index[slice_index] & SLICE_PACKED_DATA_INDEX_MASK);
1573
1574     for (i = 0; i < count; i++) {
1575         unsigned int skip_emul_byte_cnt;
1576
1577         header_data = (unsigned int *)encode_state->packed_header_data_ext[start_index + i]->buffer;
1578
1579         param = (VAEncPackedHeaderParameterBuffer *)
1580                 (encode_state->packed_header_params_ext[start_index + i]->buffer);
1581
1582         /* skip the slice header packed data type as it is lastly inserted */
1583         if (param->type == VAEncPackedHeaderSlice)
1584             continue;
1585
1586         length_in_bits = param->bit_length;
1587
1588         skip_emul_byte_cnt = intel_hevc_find_skipemulcnt((unsigned char *)header_data, length_in_bits);
1589
1590         /* as the slice header is still required, the last header flag is set to
1591          * zero.
1592          */
1593         mfc_context->insert_object(ctx,
1594                                    encoder_context,
1595                                    header_data,
1596                                    ALIGN(length_in_bits, 32) >> 5,
1597                                    length_in_bits & 0x1f,
1598                                    skip_emul_byte_cnt,
1599                                    0,
1600                                    0,
1601                                    !param->has_emulation_bytes,
1602                                    slice_batch);
1603     }
1604
1605     if (slice_header_index == -1) {
1606         unsigned char *slice_header = NULL;
1607         int slice_header_length_in_bits = 0;
1608         VAEncSequenceParameterBufferHEVC *pSequenceParameter = (VAEncSequenceParameterBufferHEVC *)encode_state->seq_param_ext->buffer;
1609         VAEncPictureParameterBufferHEVC *pPicParameter = (VAEncPictureParameterBufferHEVC *)encode_state->pic_param_ext->buffer;
1610         VAEncSliceParameterBufferHEVC *pSliceParameter = (VAEncSliceParameterBufferHEVC *)encode_state->slice_params_ext[slice_index]->buffer;
1611
1612         /* For the Normal HEVC */
1613         slice_header_length_in_bits = build_hevc_slice_header(pSequenceParameter,
1614                                       pPicParameter,
1615                                       pSliceParameter,
1616                                       &slice_header,
1617                                       0);
1618         mfc_context->insert_object(ctx, encoder_context,
1619                                    (unsigned int *)slice_header,
1620                                    ALIGN(slice_header_length_in_bits, 32) >> 5,
1621                                    slice_header_length_in_bits & 0x1f,
1622                                    5,  /* first 6 bytes are start code + nal unit type */
1623                                    1, 0, 1, slice_batch);
1624         free(slice_header);
1625     } else {
1626         unsigned int skip_emul_byte_cnt;
1627
1628         header_data = (unsigned int *)encode_state->packed_header_data_ext[slice_header_index]->buffer;
1629
1630         param = (VAEncPackedHeaderParameterBuffer *)
1631                 (encode_state->packed_header_params_ext[slice_header_index]->buffer);
1632         length_in_bits = param->bit_length;
1633
1634         /* as the slice header is the last header data for one slice,
1635          * the last header flag is set to one.
1636          */
1637         skip_emul_byte_cnt = intel_hevc_find_skipemulcnt((unsigned char *)header_data, length_in_bits);
1638
1639         mfc_context->insert_object(ctx,
1640                                    encoder_context,
1641                                    header_data,
1642                                    ALIGN(length_in_bits, 32) >> 5,
1643                                    length_in_bits & 0x1f,
1644                                    skip_emul_byte_cnt,
1645                                    1,
1646                                    0,
1647                                    !param->has_emulation_bytes,
1648                                    slice_batch);
1649     }
1650
1651     return;
1652 }
1653
1654 static void
1655 gen9_hcpe_hevc_pipeline_slice_programing(VADriverContextP ctx,
1656         struct encode_state *encode_state,
1657         struct intel_encoder_context *encoder_context,
1658         int slice_index,
1659         struct intel_batchbuffer *slice_batch)
1660 {
1661     struct gen9_hcpe_context *mfc_context = encoder_context->mfc_context;
1662     struct gen6_vme_context *vme_context = encoder_context->vme_context;
1663     VAEncSequenceParameterBufferHEVC *pSequenceParameter = (VAEncSequenceParameterBufferHEVC *)encode_state->seq_param_ext->buffer;
1664     VAEncPictureParameterBufferHEVC *pPicParameter = (VAEncPictureParameterBufferHEVC *)encode_state->pic_param_ext->buffer;
1665     VAEncSliceParameterBufferHEVC *pSliceParameter = (VAEncSliceParameterBufferHEVC *)encode_state->slice_params_ext[slice_index]->buffer;
1666     int qp_slice = pPicParameter->pic_init_qp + pSliceParameter->slice_qp_delta;
1667     unsigned int rate_control_mode = encoder_context->rate_control_mode;
1668     //unsigned char *slice_header = NULL;         // for future use
1669     //int slice_header_length_in_bits = 0;
1670     unsigned int tail_data[] = { 0x0, 0x0 };
1671     int slice_type = pSliceParameter->slice_type;
1672
1673     int log2_cu_size = pSequenceParameter->log2_min_luma_coding_block_size_minus3 + 3;
1674     int log2_ctb_size = pSequenceParameter->log2_diff_max_min_luma_coding_block_size + log2_cu_size;
1675     int ctb_size = 1 << log2_ctb_size;
1676     int width_in_ctb = (pSequenceParameter->pic_width_in_luma_samples + ctb_size - 1) / ctb_size;
1677     int height_in_ctb = (pSequenceParameter->pic_height_in_luma_samples + ctb_size - 1) / ctb_size;
1678     int last_slice = (pSliceParameter->slice_segment_address + pSliceParameter->num_ctu_in_slice) == (width_in_ctb * height_in_ctb);
1679     int ctb_width_in_mb = (ctb_size + 15) / 16;
1680     int i_ctb, ctb_x, ctb_y;
1681     unsigned int split_coding_unit_flag = 0;
1682     int width_in_mbs = (pSequenceParameter->pic_width_in_luma_samples + 15) / 16;
1683     int row_pad_flag = (pSequenceParameter->pic_height_in_luma_samples % ctb_size)> 0 ? 1:0;
1684     int col_pad_flag = (pSequenceParameter->pic_width_in_luma_samples % ctb_size)> 0 ? 1:0;
1685
1686     int is_intra = (slice_type == HEVC_SLICE_I);
1687     unsigned int *msg = NULL;
1688     unsigned char *msg_ptr = NULL;
1689     int macroblock_address = 0;
1690     int num_cu_record = 64;
1691     int cu_count = 1;
1692     int tmp_mb_mode = 0;
1693     int mb_x = 0, mb_y = 0;
1694     int mb_addr = 0;
1695     int cu_index = 0;
1696     int inter_rdo, intra_rdo;
1697     int qp;
1698     int drop_cu_row_in_last_mb = 0;
1699     int drop_cu_column_in_last_mb = 0;
1700
1701     if (log2_ctb_size == 5) num_cu_record = 16;
1702     else if (log2_ctb_size == 4) num_cu_record = 4;
1703     else if (log2_ctb_size == 6) num_cu_record = 64;
1704
1705     qp = qp_slice;
1706     if (rate_control_mode == VA_RC_CBR) {
1707         qp = mfc_context->bit_rate_control_context[slice_type].QpPrimeY;
1708         if(slice_type == HEVC_SLICE_B) {
1709             if(pSequenceParameter->ip_period == 1)
1710             {
1711                 qp = mfc_context->bit_rate_control_context[HEVC_SLICE_P].QpPrimeY;
1712
1713             }else if(mfc_context->vui_hrd.i_frame_number % pSequenceParameter->ip_period == 1){
1714                 qp = mfc_context->bit_rate_control_context[HEVC_SLICE_P].QpPrimeY;
1715             }
1716         }
1717         if (encode_state->slice_header_index[slice_index] == 0) {
1718             pSliceParameter->slice_qp_delta = qp - pPicParameter->pic_init_qp;
1719         }
1720     }
1721
1722     /* only support for 8-bit pixel bit-depth */
1723     assert(pSequenceParameter->seq_fields.bits.bit_depth_luma_minus8 >= 0 && pSequenceParameter->seq_fields.bits.bit_depth_luma_minus8 <= 2);
1724     assert(pSequenceParameter->seq_fields.bits.bit_depth_chroma_minus8 >= 0 && pSequenceParameter->seq_fields.bits.bit_depth_chroma_minus8 <= 2);
1725     assert(pPicParameter->pic_init_qp >= 0 && pPicParameter->pic_init_qp < 52);
1726     assert(qp >= 0 && qp < 52);
1727
1728     {
1729         gen9_hcpe_hevc_slice_state(ctx,
1730                                    pPicParameter,
1731                                    pSliceParameter,
1732                                    encode_state, encoder_context,
1733                                    slice_batch);
1734
1735         if (slice_index == 0)
1736             intel_hcpe_hevc_pipeline_header_programing(ctx, encode_state, encoder_context, slice_batch);
1737
1738         intel_hevc_slice_insert_packed_data(ctx, encode_state, encoder_context, slice_index, slice_batch);
1739
1740         /*
1741         slice_header_length_in_bits = build_hevc_slice_header(pSequenceParameter, pPicParameter, pSliceParameter, &slice_header, slice_index);
1742         int skip_emul_byte_cnt = intel_hevc_find_skipemulcnt((unsigned char *)slice_header, slice_header_length_in_bits);
1743
1744         mfc_context->insert_object(ctx, encoder_context,
1745                                    (unsigned int *)slice_header, ALIGN(slice_header_length_in_bits, 32) >> 5, slice_header_length_in_bits & 0x1f,
1746                                     skip_emul_byte_cnt,
1747                                     1, 0, 1, slice_batch);
1748         free(slice_header);
1749         */
1750     }
1751
1752
1753
1754     split_coding_unit_flag = (ctb_width_in_mb == 4) ? HEVC_SPLIT_CU_FLAG_64_64 : ((ctb_width_in_mb == 2) ? HEVC_SPLIT_CU_FLAG_32_32 : HEVC_SPLIT_CU_FLAG_16_16);
1755
1756     dri_bo_map(vme_context->vme_output.bo , 1);
1757     msg_ptr = (unsigned char *)vme_context->vme_output.bo->virtual;
1758     dri_bo_map(mfc_context->hcp_indirect_cu_object.bo , 1);
1759
1760     for (i_ctb = pSliceParameter->slice_segment_address;i_ctb < pSliceParameter->slice_segment_address + pSliceParameter->num_ctu_in_slice; i_ctb++) {
1761         int last_ctb = (i_ctb == (pSliceParameter->slice_segment_address + pSliceParameter->num_ctu_in_slice - 1));
1762         int ctb_height_in_mb_internal = ctb_width_in_mb;
1763         int ctb_width_in_mb_internal = ctb_width_in_mb;
1764         int max_cu_num_in_mb = 4;
1765
1766         ctb_x = i_ctb % width_in_ctb;
1767         ctb_y = i_ctb / width_in_ctb;
1768
1769         drop_cu_row_in_last_mb = 0;
1770         drop_cu_column_in_last_mb = 0;
1771
1772         if(ctb_y == (height_in_ctb - 1) && row_pad_flag)
1773         {
1774             ctb_height_in_mb_internal = (pSequenceParameter->pic_height_in_luma_samples - (ctb_y * ctb_size) + 15)/16;
1775
1776             if((log2_cu_size == 3) && (pSequenceParameter->pic_height_in_luma_samples % 16))
1777                 drop_cu_row_in_last_mb = (16 - (pSequenceParameter->pic_height_in_luma_samples % 16))>>log2_cu_size;
1778         }
1779
1780         if(ctb_x == (width_in_ctb - 1) && col_pad_flag)
1781         {
1782             ctb_width_in_mb_internal = (pSequenceParameter->pic_width_in_luma_samples - (ctb_x * ctb_size) + 15) / 16;
1783
1784             if((log2_cu_size == 3) && (pSequenceParameter->pic_width_in_luma_samples % 16))
1785                 drop_cu_column_in_last_mb = (16 - (pSequenceParameter->pic_width_in_luma_samples % 16))>>log2_cu_size;
1786         }
1787
1788         mb_x = 0;
1789         mb_y = 0;
1790         macroblock_address = ctb_y * width_in_mbs * ctb_width_in_mb + ctb_x * ctb_width_in_mb;
1791         split_coding_unit_flag = ((ctb_width_in_mb == 2) ? HEVC_SPLIT_CU_FLAG_32_32 : HEVC_SPLIT_CU_FLAG_16_16);
1792         cu_count = 1;
1793         cu_index = 0;
1794         mb_addr = 0;
1795         msg = NULL;
1796         for (mb_y = 0; mb_y < ctb_height_in_mb_internal; mb_y++)
1797         {
1798             mb_addr = macroblock_address + mb_y * width_in_mbs ;
1799             for (mb_x = 0; mb_x < ctb_width_in_mb_internal; mb_x++)
1800             {
1801                 max_cu_num_in_mb = 4;
1802                 if(drop_cu_row_in_last_mb && (mb_y == ctb_height_in_mb_internal - 1))
1803                     max_cu_num_in_mb /= 2;
1804
1805                 if(drop_cu_column_in_last_mb && (mb_x == ctb_width_in_mb_internal - 1))
1806                     max_cu_num_in_mb /= 2;
1807
1808                 /* get the mb info from the vme out */
1809                 msg = (unsigned int *)(msg_ptr + mb_addr * vme_context->vme_output.size_block);
1810
1811                 inter_rdo = msg[AVC_INTER_RDO_OFFSET] & AVC_RDO_MASK;
1812                 intra_rdo = msg[AVC_INTRA_RDO_OFFSET] & AVC_RDO_MASK;
1813                 /*fill to indirect cu */
1814                 /*to do */
1815                 if (is_intra || intra_rdo < inter_rdo) {
1816                     /* fill intra cu */
1817                     tmp_mb_mode = (msg[0] & AVC_INTRA_MODE_MASK) >> 4;
1818                     if(max_cu_num_in_mb < 4){
1819                         if(tmp_mb_mode == AVC_INTRA_16X16)
1820                         {
1821                             msg[0] = (msg[0] & !AVC_INTRA_MODE_MASK) | (AVC_INTRA_8X8<<4);
1822                             tmp_mb_mode = AVC_INTRA_8X8;
1823                         }
1824
1825                         gen9_hcpe_hevc_fill_indirect_cu_intra(ctx, encode_state, encoder_context, qp, msg, ctb_x, ctb_y, mb_x, mb_y, ctb_width_in_mb, width_in_ctb, num_cu_record, slice_type,cu_index++,0);
1826                         if(--max_cu_num_in_mb > 0)
1827                             gen9_hcpe_hevc_fill_indirect_cu_intra(ctx, encode_state, encoder_context, qp, msg, ctb_x, ctb_y, mb_x, mb_y, ctb_width_in_mb, width_in_ctb, num_cu_record, slice_type,cu_index++,2);
1828
1829                         if(ctb_width_in_mb == 2)
1830                             split_coding_unit_flag |= 0x1 << (mb_x + mb_y * ctb_width_in_mb + 16);
1831                         else if(ctb_width_in_mb == 1)
1832                             split_coding_unit_flag |= 0x1 << 20;
1833                     }
1834                     else if(tmp_mb_mode == AVC_INTRA_16X16) {
1835                         gen9_hcpe_hevc_fill_indirect_cu_intra(ctx, encode_state, encoder_context, qp, msg, ctb_x, ctb_y, mb_x, mb_y, ctb_width_in_mb, width_in_ctb, num_cu_record, slice_type,cu_index++,0);
1836                     } else { // for 4x4 to use 8x8 replace
1837                         gen9_hcpe_hevc_fill_indirect_cu_intra(ctx, encode_state, encoder_context, qp, msg, ctb_x, ctb_y, mb_x, mb_y, ctb_width_in_mb, width_in_ctb, num_cu_record, slice_type,cu_index++,0);
1838                         gen9_hcpe_hevc_fill_indirect_cu_intra(ctx, encode_state, encoder_context, qp, msg, ctb_x, ctb_y, mb_x, mb_y, ctb_width_in_mb, width_in_ctb, num_cu_record, slice_type,cu_index++,1);
1839                         gen9_hcpe_hevc_fill_indirect_cu_intra(ctx, encode_state, encoder_context, qp, msg, ctb_x, ctb_y, mb_x, mb_y, ctb_width_in_mb, width_in_ctb, num_cu_record, slice_type,cu_index++,2);
1840                         gen9_hcpe_hevc_fill_indirect_cu_intra(ctx, encode_state, encoder_context, qp, msg, ctb_x, ctb_y, mb_x, mb_y, ctb_width_in_mb, width_in_ctb, num_cu_record, slice_type,cu_index++,3);
1841                         if(ctb_width_in_mb == 2)
1842                             split_coding_unit_flag |= 0x1 << (mb_x + mb_y * ctb_width_in_mb + 16);
1843                         else if(ctb_width_in_mb == 1)
1844                             split_coding_unit_flag |= 0x1 << 20;
1845                     }
1846                 } else {
1847                     msg += AVC_INTER_MSG_OFFSET;
1848                     /* fill inter cu */
1849                     tmp_mb_mode = msg[0] & AVC_INTER_MODE_MASK;
1850                     if(max_cu_num_in_mb < 4)
1851                     {
1852                         if(tmp_mb_mode != AVC_INTER_8X8)
1853                         {
1854                             msg[0] = (msg[0] & !AVC_INTER_MODE_MASK) | AVC_INTER_8X8;
1855                             tmp_mb_mode = AVC_INTER_8X8;
1856                         }
1857                         gen9_hcpe_hevc_fill_indirect_cu_inter(ctx, encode_state, encoder_context, qp, msg, ctb_x, ctb_y, mb_x, mb_y, ctb_width_in_mb, width_in_ctb, num_cu_record, slice_type,cu_index++,0);
1858                         if(--max_cu_num_in_mb > 0)
1859                             gen9_hcpe_hevc_fill_indirect_cu_inter(ctx, encode_state, encoder_context, qp, msg, ctb_x, ctb_y, mb_x, mb_y, ctb_width_in_mb, width_in_ctb, num_cu_record, slice_type,cu_index++,1);
1860
1861                         if(ctb_width_in_mb == 2)
1862                             split_coding_unit_flag |= 0x1 << (mb_x + mb_y * ctb_width_in_mb + 16);
1863                         else if(ctb_width_in_mb == 1)
1864                             split_coding_unit_flag |= 0x1 << 20;
1865                     }
1866                     else if (tmp_mb_mode == AVC_INTER_8X8){
1867                         gen9_hcpe_hevc_fill_indirect_cu_inter(ctx, encode_state, encoder_context, qp, msg, ctb_x, ctb_y, mb_x, mb_y, ctb_width_in_mb, width_in_ctb, num_cu_record, slice_type,cu_index++,0);
1868                         gen9_hcpe_hevc_fill_indirect_cu_inter(ctx, encode_state, encoder_context, qp, msg, ctb_x, ctb_y, mb_x, mb_y, ctb_width_in_mb, width_in_ctb, num_cu_record, slice_type,cu_index++,1);
1869                         gen9_hcpe_hevc_fill_indirect_cu_inter(ctx, encode_state, encoder_context, qp, msg, ctb_x, ctb_y, mb_x, mb_y, ctb_width_in_mb, width_in_ctb, num_cu_record, slice_type,cu_index++,2);
1870                         gen9_hcpe_hevc_fill_indirect_cu_inter(ctx, encode_state, encoder_context, qp, msg, ctb_x, ctb_y, mb_x, mb_y, ctb_width_in_mb, width_in_ctb, num_cu_record, slice_type,cu_index++,3);
1871                         if(ctb_width_in_mb == 2)
1872                             split_coding_unit_flag |= 0x1 << (mb_x + mb_y * ctb_width_in_mb + 16);
1873                         else if(ctb_width_in_mb == 1)
1874                             split_coding_unit_flag |= 0x1 << 20;
1875
1876                     }else if(tmp_mb_mode == AVC_INTER_16X16 ||
1877                         tmp_mb_mode == AVC_INTER_8X16 ||
1878                         tmp_mb_mode == AVC_INTER_16X8) {
1879                         gen9_hcpe_hevc_fill_indirect_cu_inter(ctx, encode_state, encoder_context, qp, msg, ctb_x, ctb_y, mb_x, mb_y, ctb_width_in_mb, width_in_ctb, num_cu_record, slice_type,cu_index++,0);
1880                     }
1881                 }
1882                 mb_addr++;
1883             }
1884         }
1885
1886         cu_count = cu_index;
1887         // PAK object fill accordingly.
1888         gen9_hcpe_hevc_pak_object(ctx, ctb_x, ctb_y, last_ctb, encoder_context, cu_count, split_coding_unit_flag, slice_batch);
1889     }
1890
1891     dri_bo_unmap(mfc_context->hcp_indirect_cu_object.bo);
1892     dri_bo_unmap(vme_context->vme_output.bo);
1893
1894     if (last_slice) {
1895         mfc_context->insert_object(ctx, encoder_context,
1896                                    tail_data, 2, 8,
1897                                    2, 1, 1, 0, slice_batch);
1898     } else {
1899         mfc_context->insert_object(ctx, encoder_context,
1900                                    tail_data, 1, 8,
1901                                    1, 1, 1, 0, slice_batch);
1902     }
1903 }
1904
1905 static dri_bo *
1906 gen9_hcpe_hevc_software_batchbuffer(VADriverContextP ctx,
1907                                     struct encode_state *encode_state,
1908                                     struct intel_encoder_context *encoder_context)
1909 {
1910     struct gen9_hcpe_context *mfc_context = encoder_context->mfc_context;
1911     struct intel_batchbuffer *batch;
1912     dri_bo *batch_bo;
1913     int i;
1914
1915     batch = mfc_context->aux_batchbuffer;
1916     batch_bo = batch->buffer;
1917
1918     for (i = 0; i < encode_state->num_slice_params_ext; i++) {
1919         gen9_hcpe_hevc_pipeline_slice_programing(ctx, encode_state, encoder_context, i, batch);
1920     }
1921
1922     intel_batchbuffer_align(batch, 8);
1923
1924     BEGIN_BCS_BATCH(batch, 2);
1925     OUT_BCS_BATCH(batch, 0);
1926     OUT_BCS_BATCH(batch, MI_BATCH_BUFFER_END);
1927     ADVANCE_BCS_BATCH(batch);
1928
1929     dri_bo_reference(batch_bo);
1930     intel_batchbuffer_free(batch);
1931     mfc_context->aux_batchbuffer = NULL;
1932
1933     return batch_bo;
1934 }
1935
1936 #else
1937
1938 #endif
1939
1940 static void
1941 gen9_hcpe_hevc_pipeline_programing(VADriverContextP ctx,
1942                                    struct encode_state *encode_state,
1943                                    struct intel_encoder_context *encoder_context)
1944 {
1945     struct i965_driver_data *i965 = i965_driver_data(ctx);
1946     struct intel_batchbuffer *batch = encoder_context->base.batch;
1947     dri_bo *slice_batch_bo;
1948
1949 #ifdef HCP_SOFTWARE_SKYLAKE
1950     slice_batch_bo = gen9_hcpe_hevc_software_batchbuffer(ctx, encode_state, encoder_context);
1951 #else
1952     slice_batch_bo = gen9_hcpe_hevc_hardware_batchbuffer(ctx, encode_state, encoder_context);
1953 #endif
1954
1955     // begin programing
1956     if (i965->intel.has_bsd2)
1957         intel_batchbuffer_start_atomic_bcs_override(batch, 0x4000, BSD_RING0);
1958     else
1959         intel_batchbuffer_start_atomic_bcs(batch, 0x4000);
1960     intel_batchbuffer_emit_mi_flush(batch);
1961
1962     // picture level programing
1963     gen9_hcpe_hevc_pipeline_picture_programing(ctx, encode_state, encoder_context);
1964
1965     BEGIN_BCS_BATCH(batch, 3);
1966     OUT_BCS_BATCH(batch, MI_BATCH_BUFFER_START | (1 << 8) | (1 << 0));
1967     OUT_BCS_RELOC64(batch,
1968                   slice_batch_bo,
1969                   I915_GEM_DOMAIN_COMMAND, 0,
1970                   0);
1971     ADVANCE_BCS_BATCH(batch);
1972
1973     // end programing
1974     intel_batchbuffer_end_atomic(batch);
1975
1976     dri_bo_unreference(slice_batch_bo);
1977 }
1978
1979 void intel_hcpe_hevc_pipeline_header_programing(VADriverContextP ctx,
1980         struct encode_state *encode_state,
1981         struct intel_encoder_context *encoder_context,
1982         struct intel_batchbuffer *slice_batch)
1983 {
1984     struct gen9_hcpe_context *mfc_context = encoder_context->mfc_context;
1985     int idx = va_enc_packed_type_to_idx(VAEncPackedHeaderHEVC_VPS);
1986     unsigned int skip_emul_byte_cnt;
1987
1988     if (encode_state->packed_header_data[idx]) {
1989         VAEncPackedHeaderParameterBuffer *param = NULL;
1990         unsigned int *header_data = (unsigned int *)encode_state->packed_header_data[idx]->buffer;
1991         unsigned int length_in_bits;
1992
1993         assert(encode_state->packed_header_param[idx]);
1994         param = (VAEncPackedHeaderParameterBuffer *)encode_state->packed_header_param[idx]->buffer;
1995         length_in_bits = param->bit_length;
1996
1997         skip_emul_byte_cnt = intel_hevc_find_skipemulcnt((unsigned char *)header_data, length_in_bits);
1998         mfc_context->insert_object(ctx,
1999                                    encoder_context,
2000                                    header_data,
2001                                    ALIGN(length_in_bits, 32) >> 5,
2002                                    length_in_bits & 0x1f,
2003                                    skip_emul_byte_cnt,
2004                                    0,
2005                                    0,
2006                                    !param->has_emulation_bytes,
2007                                    slice_batch);
2008     }
2009
2010     idx = va_enc_packed_type_to_idx(VAEncPackedHeaderHEVC_VPS) + 1; // index to SPS
2011
2012     if (encode_state->packed_header_data[idx]) {
2013         VAEncPackedHeaderParameterBuffer *param = NULL;
2014         unsigned int *header_data = (unsigned int *)encode_state->packed_header_data[idx]->buffer;
2015         unsigned int length_in_bits;
2016
2017         assert(encode_state->packed_header_param[idx]);
2018         param = (VAEncPackedHeaderParameterBuffer *)encode_state->packed_header_param[idx]->buffer;
2019         length_in_bits = param->bit_length;
2020
2021         skip_emul_byte_cnt = intel_hevc_find_skipemulcnt((unsigned char *)header_data, length_in_bits);
2022         mfc_context->insert_object(ctx,
2023                                    encoder_context,
2024                                    header_data,
2025                                    ALIGN(length_in_bits, 32) >> 5,
2026                                    length_in_bits & 0x1f,
2027                                    skip_emul_byte_cnt,
2028                                    0,
2029                                    0,
2030                                    !param->has_emulation_bytes,
2031                                    slice_batch);
2032     }
2033
2034     idx = va_enc_packed_type_to_idx(VAEncPackedHeaderHEVC_PPS);
2035
2036     if (encode_state->packed_header_data[idx]) {
2037         VAEncPackedHeaderParameterBuffer *param = NULL;
2038         unsigned int *header_data = (unsigned int *)encode_state->packed_header_data[idx]->buffer;
2039         unsigned int length_in_bits;
2040
2041         assert(encode_state->packed_header_param[idx]);
2042         param = (VAEncPackedHeaderParameterBuffer *)encode_state->packed_header_param[idx]->buffer;
2043         length_in_bits = param->bit_length;
2044
2045         skip_emul_byte_cnt = intel_hevc_find_skipemulcnt((unsigned char *)header_data, length_in_bits);
2046
2047         mfc_context->insert_object(ctx,
2048                                    encoder_context,
2049                                    header_data,
2050                                    ALIGN(length_in_bits, 32) >> 5,
2051                                    length_in_bits & 0x1f,
2052                                    skip_emul_byte_cnt,
2053                                    0,
2054                                    0,
2055                                    !param->has_emulation_bytes,
2056                                    slice_batch);
2057     }
2058
2059     idx = va_enc_packed_type_to_idx(VAEncPackedHeaderHEVC_SEI);
2060
2061     if (encode_state->packed_header_data[idx]) {
2062         VAEncPackedHeaderParameterBuffer *param = NULL;
2063         unsigned int *header_data = (unsigned int *)encode_state->packed_header_data[idx]->buffer;
2064         unsigned int length_in_bits;
2065
2066         assert(encode_state->packed_header_param[idx]);
2067         param = (VAEncPackedHeaderParameterBuffer *)encode_state->packed_header_param[idx]->buffer;
2068         length_in_bits = param->bit_length;
2069
2070         skip_emul_byte_cnt = intel_hevc_find_skipemulcnt((unsigned char *)header_data, length_in_bits);
2071         mfc_context->insert_object(ctx,
2072                                    encoder_context,
2073                                    header_data,
2074                                    ALIGN(length_in_bits, 32) >> 5,
2075                                    length_in_bits & 0x1f,
2076                                    skip_emul_byte_cnt,
2077                                    0,
2078                                    0,
2079                                    !param->has_emulation_bytes,
2080                                    slice_batch);
2081     }
2082 }
2083
/*
 * Per-frame PAK preparation: bind every input/output buffer object needed
 * for the upcoming HCP encode of one frame into mfc_context.
 *
 * Each dri_bo stored here gets an extra reference; the matching
 * unreferences happen elsewhere (presumably in the per-frame init /
 * context destroy paths — confirm against gen9_hcpe_init()).
 * Always returns VA_STATUS_SUCCESS.
 */
VAStatus intel_hcpe_hevc_prepare(VADriverContextP ctx,
                                 struct encode_state *encode_state,
                                 struct intel_encoder_context *encoder_context)
{
    struct gen9_hcpe_context *mfc_context = encoder_context->mfc_context;
    struct object_surface *obj_surface;
    struct object_buffer *obj_buffer;
    GenHevcSurface *hevc_encoder_surface;
    dri_bo *bo;
    VAStatus vaStatus = VA_STATUS_SUCCESS;
    int i;
    struct i965_coded_buffer_segment *coded_buffer_segment;

    /*Setup all the input&output object*/

    /* Setup current frame and current direct mv buffer*/
    obj_surface = encode_state->reconstructed_object;

    hevc_encoder_surface = (GenHevcSurface *) obj_surface->private_data;
    assert(hevc_encoder_surface);

    if (hevc_encoder_surface) {
        /* Force a fresh P010->NV12 conversion and frame-store binding for
         * this frame; the current frame's MV buffer goes in the last
         * collocated-MV slot. */
        hevc_encoder_surface->has_p010_to_nv12_done=0;
        hevc_encoder_surface->base.frame_store_id = -1;
        mfc_context->current_collocated_mv_temporal_buffer[NUM_HCP_CURRENT_COLLOCATED_MV_TEMPORAL_BUFFERS - 1].bo = hevc_encoder_surface->motion_vector_temporal_bo;
        dri_bo_reference(hevc_encoder_surface->motion_vector_temporal_bo);
    }

    /* orig_* is the requested picture size; width/height is the padded
     * surface allocation pitch. */
    mfc_context->surface_state.width = obj_surface->orig_width;
    mfc_context->surface_state.height = obj_surface->orig_height;
    mfc_context->surface_state.w_pitch = obj_surface->width;
    mfc_context->surface_state.h_pitch = obj_surface->height;

    /* Setup reference frames and direct mv buffers*/
    for (i = 0; i < MAX_HCP_REFERENCE_SURFACES; i++) {
        obj_surface = encode_state->reference_objects[i];

        if (obj_surface && obj_surface->bo) {
            mfc_context->reference_surfaces[i].bo = obj_surface->bo;
            dri_bo_reference(obj_surface->bo);

            /* Check MV temporal buffer */
            hevc_encoder_surface = (GenHevcSurface *) obj_surface->private_data;
            assert(hevc_encoder_surface);

            if (hevc_encoder_surface) {
                hevc_encoder_surface->base.frame_store_id = -1;
                /* Setup MV temporal buffer */
                mfc_context->current_collocated_mv_temporal_buffer[i].bo = hevc_encoder_surface->motion_vector_temporal_bo;
                dri_bo_reference(hevc_encoder_surface->motion_vector_temporal_bo);
            }
        } else {
            /* Reference list is packed from the front; first hole ends it. */
            break;
        }
    }


    mfc_context->uncompressed_picture_source.bo = encode_state->input_yuv_object->bo;
    dri_bo_reference(mfc_context->uncompressed_picture_source.bo);

    /* Coded buffer: the PAK BSE output starts after the driver-private
     * header and is clamped below the buffer end (4K aligned). */
    obj_buffer = encode_state->coded_buf_object;
    bo = obj_buffer->buffer_store->bo;
    mfc_context->hcp_indirect_pak_bse_object.bo = bo;
    mfc_context->hcp_indirect_pak_bse_object.offset = I965_CODEDBUFFER_HEADER_SIZE;
    mfc_context->hcp_indirect_pak_bse_object.end_offset = ALIGN(obj_buffer->size_element - 0x1000, 0x1000);
    dri_bo_reference(mfc_context->hcp_indirect_pak_bse_object.bo);

    /* Reset the coded-buffer segment header so the app sees a fresh,
     * unmapped buffer tagged with this codec. */
    dri_bo_map(bo, 1);
    coded_buffer_segment = (struct i965_coded_buffer_segment *)(bo->virtual);
    coded_buffer_segment->mapped = 0;
    coded_buffer_segment->codec = encoder_context->codec;
    dri_bo_unmap(bo);

    return vaStatus;
}
2159
2160 /* HEVC BRC related */
2161
2162 static void
2163 intel_hcpe_bit_rate_control_context_init(struct encode_state *encode_state,
2164                                          struct intel_encoder_context *encoder_context)
2165 {
2166     struct gen9_hcpe_context *mfc_context = encoder_context->mfc_context;
2167     VAEncSequenceParameterBufferHEVC *pSequenceParameter = (VAEncSequenceParameterBufferHEVC *)encode_state->seq_param_ext->buffer;
2168     int ctb_size = 16;
2169     int width_in_mbs = (pSequenceParameter->pic_width_in_luma_samples + ctb_size - 1) / ctb_size;
2170     int height_in_mbs = (pSequenceParameter->pic_height_in_luma_samples + ctb_size - 1) / ctb_size;
2171
2172     double fps = (double)encoder_context->brc.framerate[0].num / (double)encoder_context->brc.framerate[0].den;
2173     double bitrate = encoder_context->brc.bits_per_second[0];
2174     int inter_mb_size = bitrate * 1.0 / (fps + 4.0) / width_in_mbs / height_in_mbs;
2175     int intra_mb_size = inter_mb_size * 5.0;
2176     int i;
2177
2178     mfc_context->bit_rate_control_context[HEVC_SLICE_I].target_mb_size = intra_mb_size;
2179     mfc_context->bit_rate_control_context[HEVC_SLICE_I].target_frame_size = intra_mb_size * width_in_mbs * height_in_mbs;
2180     mfc_context->bit_rate_control_context[HEVC_SLICE_P].target_mb_size = inter_mb_size;
2181     mfc_context->bit_rate_control_context[HEVC_SLICE_P].target_frame_size = inter_mb_size * width_in_mbs * height_in_mbs;
2182     mfc_context->bit_rate_control_context[HEVC_SLICE_B].target_mb_size = inter_mb_size;
2183     mfc_context->bit_rate_control_context[HEVC_SLICE_B].target_frame_size = inter_mb_size * width_in_mbs * height_in_mbs;
2184
2185     for (i = 0 ; i < 3; i++) {
2186         mfc_context->bit_rate_control_context[i].QpPrimeY = 26;
2187         mfc_context->bit_rate_control_context[i].MaxQpNegModifier = 6;
2188         mfc_context->bit_rate_control_context[i].MaxQpPosModifier = 6;
2189         mfc_context->bit_rate_control_context[i].GrowInit = 6;
2190         mfc_context->bit_rate_control_context[i].GrowResistance = 4;
2191         mfc_context->bit_rate_control_context[i].ShrinkInit = 6;
2192         mfc_context->bit_rate_control_context[i].ShrinkResistance = 4;
2193
2194         mfc_context->bit_rate_control_context[i].Correct[0] = 8;
2195         mfc_context->bit_rate_control_context[i].Correct[1] = 4;
2196         mfc_context->bit_rate_control_context[i].Correct[2] = 2;
2197         mfc_context->bit_rate_control_context[i].Correct[3] = 2;
2198         mfc_context->bit_rate_control_context[i].Correct[4] = 4;
2199         mfc_context->bit_rate_control_context[i].Correct[5] = 8;
2200     }
2201
2202     mfc_context->bit_rate_control_context[HEVC_SLICE_I].TargetSizeInWord = (intra_mb_size + 16) / 16;
2203     mfc_context->bit_rate_control_context[HEVC_SLICE_P].TargetSizeInWord = (inter_mb_size + 16) / 16;
2204     mfc_context->bit_rate_control_context[HEVC_SLICE_B].TargetSizeInWord = (inter_mb_size + 16) / 16;
2205
2206     mfc_context->bit_rate_control_context[HEVC_SLICE_I].MaxSizeInWord = mfc_context->bit_rate_control_context[HEVC_SLICE_I].TargetSizeInWord * 1.5;
2207     mfc_context->bit_rate_control_context[HEVC_SLICE_P].MaxSizeInWord = mfc_context->bit_rate_control_context[HEVC_SLICE_P].TargetSizeInWord * 1.5;
2208     mfc_context->bit_rate_control_context[HEVC_SLICE_B].MaxSizeInWord = mfc_context->bit_rate_control_context[HEVC_SLICE_B].TargetSizeInWord * 1.5;
2209 }
2210
/*
 * Initialize global BRC state: per-slice-type target frame sizes from the
 * GOP structure, the HRD buffer size/fullness model, and a starting QP
 * derived from the average bits-per-frame. Runs at startup and again on a
 * BRC reset request.
 */
static void intel_hcpe_brc_init(struct encode_state *encode_state,
                                struct intel_encoder_context* encoder_context)
{
    struct gen9_hcpe_context *mfc_context = encoder_context->mfc_context;
    VAEncSequenceParameterBufferHEVC *pSequenceParameter = (VAEncSequenceParameterBufferHEVC *)encode_state->seq_param_ext->buffer;

    double bitrate = (double)encoder_context->brc.bits_per_second[0];
    double framerate = (double)encoder_context->brc.framerate[0].num / (double)encoder_context->brc.framerate[0].den;
    int inum = 1, pnum = 0, bnum = 0; /* Gop structure: number of I, P, B frames in the Gop. */
    int intra_period = pSequenceParameter->intra_period;
    int ip_period = pSequenceParameter->ip_period;
    /* Rough per-frame sizes (bits) at QP 1 and QP 51 for 8-bit 4:2:0,
     * used below to interpolate a plausible starting QP. */
    double qp1_size = 0.1 * 8 * 3 * pSequenceParameter->pic_width_in_luma_samples * pSequenceParameter->pic_height_in_luma_samples / 2;
    double qp51_size = 0.001 * 8 * 3 * pSequenceParameter->pic_width_in_luma_samples * pSequenceParameter->pic_height_in_luma_samples / 2;
    double bpf;
    /* HRD buffer bounds expressed in seconds of bitrate; default 8 s. */
    int ratio_min = 1;
    int ratio_max = 32;
    int ratio = 8;
    double buffer_size = 0;
    int bpp = 1;

    /* >8-bit content stores samples in 16-bit containers. */
    if((pSequenceParameter->seq_fields.bits.bit_depth_luma_minus8 > 0) ||
        (pSequenceParameter->seq_fields.bits.bit_depth_chroma_minus8 > 0))
        bpp = 2;

    /* Scale the QP size anchors before they are used for buffer_capacity
     * and the starting-QP interpolation below. */
    qp1_size = qp1_size * bpp;
    qp51_size = qp51_size * bpp;

    if (pSequenceParameter->ip_period) {
        pnum = (intra_period + ip_period - 1) / ip_period - 1;
        bnum = intra_period - inum - pnum;
    }

    mfc_context->brc.mode = encoder_context->rate_control_mode;

    /* Split the GOP bit budget between slice types with fixed weights. */
    mfc_context->brc.target_frame_size[HEVC_SLICE_I] = (int)((double)((bitrate * intra_period) / framerate) /
            (double)(inum + BRC_PWEIGHT * pnum + BRC_BWEIGHT * bnum));
    mfc_context->brc.target_frame_size[HEVC_SLICE_P] = BRC_PWEIGHT * mfc_context->brc.target_frame_size[HEVC_SLICE_I];
    mfc_context->brc.target_frame_size[HEVC_SLICE_B] = BRC_BWEIGHT * mfc_context->brc.target_frame_size[HEVC_SLICE_I];

    mfc_context->brc.gop_nums[HEVC_SLICE_I] = inum;
    mfc_context->brc.gop_nums[HEVC_SLICE_P] = pnum;
    mfc_context->brc.gop_nums[HEVC_SLICE_B] = bnum;

    bpf = mfc_context->brc.bits_per_frame = bitrate / framerate;

    if (!encoder_context->brc.hrd_buffer_size)
    {
        /* App gave no HRD buffer size: default to 8 seconds of bitrate,
         * starting half full. */
        mfc_context->hrd.buffer_size = bitrate * ratio;
        mfc_context->hrd.current_buffer_fullness =
            (double)(bitrate * ratio/2 < mfc_context->hrd.buffer_size) ?
            bitrate * ratio/2 : mfc_context->hrd.buffer_size / 2.;
    }else
    {
        /* Clamp the app-supplied buffer to [1, 32] seconds of bitrate. */
        buffer_size = (double)encoder_context->brc.hrd_buffer_size;
        if(buffer_size < bitrate * ratio_min)
        {
            buffer_size = bitrate * ratio_min;
        }else if (buffer_size > bitrate * ratio_max)
        {
            buffer_size = bitrate * ratio_max ;
        }
        mfc_context->hrd.buffer_size =buffer_size;
        if(encoder_context->brc.hrd_initial_buffer_fullness)
        {
            /* Honor the app's initial fullness unless it exceeds the
             * (possibly clamped) buffer size. */
            mfc_context->hrd.current_buffer_fullness =
                (double)(encoder_context->brc.hrd_initial_buffer_fullness < mfc_context->hrd.buffer_size) ?
                encoder_context->brc.hrd_initial_buffer_fullness : mfc_context->hrd.buffer_size / 2.;
        }else
        {
            mfc_context->hrd.current_buffer_fullness = mfc_context->hrd.buffer_size / 2.;

        }
    }

    mfc_context->hrd.target_buffer_fullness = (double)mfc_context->hrd.buffer_size / 2.;
    /* Capacity in units of worst-case (QP 1) frames. */
    mfc_context->hrd.buffer_capacity = (double)mfc_context->hrd.buffer_size / qp1_size;
    mfc_context->hrd.violation_noted = 0;

    /* Linear interpolation of the starting QP between the QP-1 and QP-51
     * frame size anchors. */
    if ((bpf > qp51_size) && (bpf < qp1_size)) {
        mfc_context->bit_rate_control_context[HEVC_SLICE_P].QpPrimeY = 51 - 50 * (bpf - qp51_size) / (qp1_size - qp51_size);
    } else if (bpf >= qp1_size)
        mfc_context->bit_rate_control_context[HEVC_SLICE_P].QpPrimeY = 1;
    else if (bpf <= qp51_size)
        mfc_context->bit_rate_control_context[HEVC_SLICE_P].QpPrimeY = 51;

    mfc_context->bit_rate_control_context[HEVC_SLICE_I].QpPrimeY = mfc_context->bit_rate_control_context[HEVC_SLICE_P].QpPrimeY;
    mfc_context->bit_rate_control_context[HEVC_SLICE_B].QpPrimeY = mfc_context->bit_rate_control_context[HEVC_SLICE_I].QpPrimeY;

    /* Keep the initial QPs inside conservative per-type ranges. */
    BRC_CLIP(mfc_context->bit_rate_control_context[HEVC_SLICE_I].QpPrimeY, 1, 36);
    BRC_CLIP(mfc_context->bit_rate_control_context[HEVC_SLICE_P].QpPrimeY, 1, 40);
    BRC_CLIP(mfc_context->bit_rate_control_context[HEVC_SLICE_B].QpPrimeY, 1, 45);
}
2303
2304 int intel_hcpe_update_hrd(struct encode_state *encode_state,
2305                           struct gen9_hcpe_context *mfc_context,
2306                           int frame_bits)
2307 {
2308     double prev_bf = mfc_context->hrd.current_buffer_fullness;
2309
2310     mfc_context->hrd.current_buffer_fullness -= frame_bits;
2311
2312     if (mfc_context->hrd.buffer_size > 0 && mfc_context->hrd.current_buffer_fullness <= 0.) {
2313         mfc_context->hrd.current_buffer_fullness = prev_bf;
2314         return BRC_UNDERFLOW;
2315     }
2316
2317     mfc_context->hrd.current_buffer_fullness += mfc_context->brc.bits_per_frame;
2318     if (mfc_context->hrd.buffer_size > 0 && mfc_context->hrd.current_buffer_fullness > mfc_context->hrd.buffer_size) {
2319         if (mfc_context->brc.mode == VA_RC_VBR)
2320             mfc_context->hrd.current_buffer_fullness = mfc_context->hrd.buffer_size;
2321         else {
2322             mfc_context->hrd.current_buffer_fullness = prev_bf;
2323             return BRC_OVERFLOW;
2324         }
2325     }
2326     return BRC_NO_HRD_VIOLATION;
2327 }
2328
/*
 * Post-encode BRC step: given the number of bits the frame actually
 * produced, update the HRD model and predict the QP for the next frame of
 * each slice type.
 *
 * Returns a gen6_brc_status code; on plain BRC_UNDERFLOW/BRC_OVERFLOW the
 * caller re-encodes the frame with the adjusted QP, on the *_WITH_MAX_QP/
 * *_WITH_MIN_QP variants no further repair is possible.
 */
int intel_hcpe_brc_postpack(struct encode_state *encode_state,
                            struct gen9_hcpe_context *mfc_context,
                            int frame_bits)
{
    gen6_brc_status sts = BRC_NO_HRD_VIOLATION;
    VAEncSequenceParameterBufferHEVC *pSequenceParameter = (VAEncSequenceParameterBufferHEVC *)encode_state->seq_param_ext->buffer;
    VAEncSliceParameterBufferHEVC *pSliceParameter = (VAEncSliceParameterBufferHEVC *)encode_state->slice_params_ext[0]->buffer;
    int slicetype = pSliceParameter->slice_type;
    int qpi = mfc_context->bit_rate_control_context[HEVC_SLICE_I].QpPrimeY;
    int qpp = mfc_context->bit_rate_control_context[HEVC_SLICE_P].QpPrimeY;
    int qpb = mfc_context->bit_rate_control_context[HEVC_SLICE_B].QpPrimeY;
    int qp; // quantizer of previously encoded slice of current type
    int qpn; // predicted quantizer for next frame of current type in integer format
    double qpf; // predicted quantizer for next frame of current type in float format
    double delta_qp; // QP correction
    int target_frame_size, frame_size_next;
    /* Notes:
     *  x - how far we are from HRD buffer borders
     *  y - how far we are from target HRD buffer fullness
     */
    double x, y;
    double frame_size_alpha;

    /* Account certain B frames as P for BRC purposes (ip_period == 1
     * streams, or the B frame at the start of a mini-GOP). */
    if(slicetype == HEVC_SLICE_B) {
        if(pSequenceParameter->ip_period == 1)
        {
            slicetype = HEVC_SLICE_P;
        }else if(mfc_context->vui_hrd.i_frame_number % pSequenceParameter->ip_period == 1){
            slicetype = HEVC_SLICE_P;
        }
    }

    qp = mfc_context->bit_rate_control_context[slicetype].QpPrimeY;

    target_frame_size = mfc_context->brc.target_frame_size[slicetype];
    if (mfc_context->hrd.buffer_capacity < 5)
        frame_size_alpha = 0;
    else
        frame_size_alpha = (double)mfc_context->brc.gop_nums[slicetype];
    if (frame_size_alpha > 30) frame_size_alpha = 30;
    /* Blend the size miss (target - actual) into the next frame's target;
     * a larger alpha damps the correction. */
    frame_size_next = target_frame_size + (double)(target_frame_size - frame_bits) /
                      (double)(frame_size_alpha + 1.);

    /* frame_size_next: avoiding negative number and too small value */
    if ((double)frame_size_next < (double)(target_frame_size * 0.25))
        frame_size_next = (int)((double)target_frame_size * 0.25);

    /* First-order model: produced bits scale roughly inversely with QP. */
    qpf = (double)qp * target_frame_size / frame_size_next;
    qpn = (int)(qpf + 0.5);

    if (qpn == qp) {
        /* setting qpn we round qpf making mistakes: now we are trying to compensate this */
        mfc_context->brc.qpf_rounding_accumulator += qpf - qpn;
        if (mfc_context->brc.qpf_rounding_accumulator > 1.0) {
            qpn++;
            mfc_context->brc.qpf_rounding_accumulator = 0.;
        } else if (mfc_context->brc.qpf_rounding_accumulator < -1.0) {
            qpn--;
            mfc_context->brc.qpf_rounding_accumulator = 0.;
        }
    }
    /* making sure that QP is not changing too fast */
    if ((qpn - qp) > BRC_QP_MAX_CHANGE) qpn = qp + BRC_QP_MAX_CHANGE;
    else if ((qpn - qp) < -BRC_QP_MAX_CHANGE) qpn = qp - BRC_QP_MAX_CHANGE;
    /* making sure that with QP predictions we did do not leave QPs range */
    BRC_CLIP(qpn, 1, 51);

    /* checking whether HRD compliance is still met */
    sts = intel_hcpe_update_hrd(encode_state, mfc_context, frame_bits);

    /* calculating QP delta as some function of distance from the target
     * buffer fullness (x in [-1, 1]) and from the buffer borders (y) */
    x = mfc_context->hrd.target_buffer_fullness - mfc_context->hrd.current_buffer_fullness;
    if (x > 0) {
        x /= mfc_context->hrd.target_buffer_fullness;
        y = mfc_context->hrd.current_buffer_fullness;
    } else {
        x /= (mfc_context->hrd.buffer_size - mfc_context->hrd.target_buffer_fullness);
        y = mfc_context->hrd.buffer_size - mfc_context->hrd.current_buffer_fullness;
    }
    if (y < 0.01) y = 0.01;
    if (x > 1) x = 1;
    else if (x < -1) x = -1;

    /* exp(-1/y) fades the correction out near the buffer border; the sine
     * term supplies sign and magnitude of the push toward the target. */
    delta_qp = BRC_QP_MAX_CHANGE * exp(-1 / y) * sin(BRC_PI_0_5 * x);
    qpn = (int)(qpn + delta_qp + 0.5);

    /* making sure that with QP predictions we did do not leave QPs range */
    BRC_CLIP(qpn, 1, 51);

    if (sts == BRC_NO_HRD_VIOLATION) { // no HRD violation
        /* correcting QPs of slices of other types so the usual I < P < B
         * QP ordering is roughly maintained */
        if (slicetype == HEVC_SLICE_P) {
            if (abs(qpn + BRC_P_B_QP_DIFF - qpb) > 2)
                mfc_context->bit_rate_control_context[HEVC_SLICE_B].QpPrimeY += (qpn + BRC_P_B_QP_DIFF - qpb) >> 1;
            if (abs(qpn - BRC_I_P_QP_DIFF - qpi) > 2)
                mfc_context->bit_rate_control_context[HEVC_SLICE_I].QpPrimeY += (qpn - BRC_I_P_QP_DIFF - qpi) >> 1;
        } else if (slicetype == HEVC_SLICE_I) {
            if (abs(qpn + BRC_I_B_QP_DIFF - qpb) > 4)
                mfc_context->bit_rate_control_context[HEVC_SLICE_B].QpPrimeY += (qpn + BRC_I_B_QP_DIFF - qpb) >> 2;
            if (abs(qpn + BRC_I_P_QP_DIFF - qpp) > 2)
                mfc_context->bit_rate_control_context[HEVC_SLICE_P].QpPrimeY += (qpn + BRC_I_P_QP_DIFF - qpp) >> 2;
        } else { // HEVC_SLICE_B
            if (abs(qpn - BRC_P_B_QP_DIFF - qpp) > 2)
                mfc_context->bit_rate_control_context[HEVC_SLICE_P].QpPrimeY += (qpn - BRC_P_B_QP_DIFF - qpp) >> 1;
            if (abs(qpn - BRC_I_B_QP_DIFF - qpi) > 4)
                mfc_context->bit_rate_control_context[HEVC_SLICE_I].QpPrimeY += (qpn - BRC_I_B_QP_DIFF - qpi) >> 2;
        }
        BRC_CLIP(mfc_context->bit_rate_control_context[HEVC_SLICE_I].QpPrimeY, 1, 51);
        BRC_CLIP(mfc_context->bit_rate_control_context[HEVC_SLICE_P].QpPrimeY, 1, 51);
        BRC_CLIP(mfc_context->bit_rate_control_context[HEVC_SLICE_B].QpPrimeY, 1, 51);
    } else if (sts == BRC_UNDERFLOW) { // underflow
        if (qpn <= qp) qpn = qp + 1;
        if (qpn > 51) {
            qpn = 51;
            sts = BRC_UNDERFLOW_WITH_MAX_QP; //underflow with maxQP
        }
    } else if (sts == BRC_OVERFLOW) {
        if (qpn >= qp) qpn = qp - 1;
        if (qpn < 1) { // < 0 (?) overflow with minQP
            qpn = 1;
            sts = BRC_OVERFLOW_WITH_MIN_QP; // bit stuffing to be done
        }
    }

    mfc_context->bit_rate_control_context[slicetype].QpPrimeY = qpn;

    return sts;
}
2457
2458 static void intel_hcpe_hrd_context_init(struct encode_state *encode_state,
2459                                         struct intel_encoder_context *encoder_context)
2460 {
2461     struct gen9_hcpe_context *mfc_context = encoder_context->mfc_context;
2462     unsigned int rate_control_mode = encoder_context->rate_control_mode;
2463     unsigned int target_bit_rate = encoder_context->brc.bits_per_second[0];
2464
2465     // current we only support CBR mode.
2466     if (rate_control_mode == VA_RC_CBR) {
2467         mfc_context->vui_hrd.i_bit_rate_value = target_bit_rate >> 10;
2468         mfc_context->vui_hrd.i_cpb_size_value = (target_bit_rate * 8) >> 10;
2469         mfc_context->vui_hrd.i_initial_cpb_removal_delay = mfc_context->vui_hrd.i_cpb_size_value * 0.5 * 1024 / target_bit_rate * 90000;
2470         mfc_context->vui_hrd.i_cpb_removal_delay = 2;
2471         mfc_context->vui_hrd.i_frame_number = 0;
2472
2473         mfc_context->vui_hrd.i_initial_cpb_removal_delay_length = 24;
2474         mfc_context->vui_hrd.i_cpb_removal_delay_length = 24;
2475         mfc_context->vui_hrd.i_dpb_output_delay_length = 24;
2476     }
2477
2478 }
2479
2480 void
2481 intel_hcpe_hrd_context_update(struct encode_state *encode_state,
2482                               struct gen9_hcpe_context *mfc_context)
2483 {
2484     mfc_context->vui_hrd.i_frame_number++;
2485 }
2486
2487 int intel_hcpe_interlace_check(VADriverContextP ctx,
2488                                struct encode_state *encode_state,
2489                                struct intel_encoder_context *encoder_context)
2490 {
2491     VAEncSliceParameterBufferHEVC *pSliceParameter;
2492     VAEncSequenceParameterBufferHEVC *pSequenceParameter = (VAEncSequenceParameterBufferHEVC *)encode_state->seq_param_ext->buffer;
2493     int log2_cu_size = pSequenceParameter->log2_min_luma_coding_block_size_minus3 + 3;
2494     int log2_ctb_size = pSequenceParameter->log2_diff_max_min_luma_coding_block_size + log2_cu_size;
2495     int ctb_size = 1 << log2_ctb_size;
2496     int width_in_ctb = (pSequenceParameter->pic_width_in_luma_samples + ctb_size - 1) / ctb_size;
2497     int height_in_ctb = (pSequenceParameter->pic_height_in_luma_samples + ctb_size - 1) / ctb_size;
2498     int i;
2499     int ctbCount = 0;
2500
2501     for (i = 0; i < encode_state->num_slice_params_ext; i++) {
2502         pSliceParameter = (VAEncSliceParameterBufferHEVC *)encode_state->slice_params_ext[i]->buffer;
2503         ctbCount += pSliceParameter->num_ctu_in_slice;
2504     }
2505
2506     if (ctbCount == (width_in_ctb * height_in_ctb))
2507         return 0;
2508
2509     return 1;
2510 }
2511
2512 void intel_hcpe_brc_prepare(struct encode_state *encode_state,
2513                             struct intel_encoder_context *encoder_context)
2514 {
2515     unsigned int rate_control_mode = encoder_context->rate_control_mode;
2516     struct gen9_hcpe_context *mfc_context = encoder_context->mfc_context;
2517
2518     if (rate_control_mode == VA_RC_CBR) {
2519         bool brc_updated;
2520         assert(encoder_context->codec != CODEC_MPEG2);
2521
2522         brc_updated = encoder_context->brc.need_reset;
2523
2524         /*Programing bit rate control */
2525         if ((mfc_context->bit_rate_control_context[HEVC_SLICE_I].MaxSizeInWord == 0) ||
2526             brc_updated) {
2527             intel_hcpe_bit_rate_control_context_init(encode_state, encoder_context);
2528             intel_hcpe_brc_init(encode_state, encoder_context);
2529         }
2530
2531         /*Programing HRD control */
2532         if ((mfc_context->vui_hrd.i_cpb_size_value == 0) || brc_updated)
2533             intel_hcpe_hrd_context_init(encode_state, encoder_context);
2534     }
2535 }
2536
2537 /* HEVC interface API for encoder */
2538
2539 static VAStatus
2540 gen9_hcpe_hevc_encode_picture(VADriverContextP ctx,
2541                               struct encode_state *encode_state,
2542                               struct intel_encoder_context *encoder_context)
2543 {
2544     struct gen9_hcpe_context *hcpe_context = encoder_context->mfc_context;
2545     unsigned int rate_control_mode = encoder_context->rate_control_mode;
2546     int current_frame_bits_size;
2547     int sts;
2548
2549     for (;;) {
2550         gen9_hcpe_init(ctx, encode_state, encoder_context);
2551         intel_hcpe_hevc_prepare(ctx, encode_state, encoder_context);
2552         /*Programing bcs pipeline*/
2553         gen9_hcpe_hevc_pipeline_programing(ctx, encode_state, encoder_context); //filling the pipeline
2554         gen9_hcpe_run(ctx, encode_state, encoder_context);
2555         if (rate_control_mode == VA_RC_CBR /*|| rate_control_mode == VA_RC_VBR*/) {
2556             gen9_hcpe_stop(ctx, encode_state, encoder_context, &current_frame_bits_size);
2557             sts = intel_hcpe_brc_postpack(encode_state, hcpe_context, current_frame_bits_size);
2558             if (sts == BRC_NO_HRD_VIOLATION) {
2559                 intel_hcpe_hrd_context_update(encode_state, hcpe_context);
2560                 break;
2561             } else if (sts == BRC_OVERFLOW_WITH_MIN_QP || sts == BRC_UNDERFLOW_WITH_MAX_QP) {
2562                 if (!hcpe_context->hrd.violation_noted) {
2563                     fprintf(stderr, "Unrepairable %s!\n", (sts == BRC_OVERFLOW_WITH_MIN_QP) ? "overflow" : "underflow");
2564                     hcpe_context->hrd.violation_noted = 1;
2565                 }
2566                 return VA_STATUS_SUCCESS;
2567             }
2568         } else {
2569             break;
2570         }
2571     }
2572
2573     return VA_STATUS_SUCCESS;
2574 }
2575
2576 void
2577 gen9_hcpe_context_destroy(void *context)
2578 {
2579     struct gen9_hcpe_context *hcpe_context = context;
2580     int i;
2581
2582     dri_bo_unreference(hcpe_context->deblocking_filter_line_buffer.bo);
2583     hcpe_context->deblocking_filter_line_buffer.bo = NULL;
2584
2585     dri_bo_unreference(hcpe_context->deblocking_filter_tile_line_buffer.bo);
2586     hcpe_context->deblocking_filter_tile_line_buffer.bo = NULL;
2587
2588     dri_bo_unreference(hcpe_context->deblocking_filter_tile_column_buffer.bo);
2589     hcpe_context->deblocking_filter_tile_column_buffer.bo = NULL;
2590
2591     dri_bo_unreference(hcpe_context->uncompressed_picture_source.bo);
2592     hcpe_context->uncompressed_picture_source.bo = NULL;
2593
2594     dri_bo_unreference(hcpe_context->metadata_line_buffer.bo);
2595     hcpe_context->metadata_line_buffer.bo = NULL;
2596
2597     dri_bo_unreference(hcpe_context->metadata_tile_line_buffer.bo);
2598     hcpe_context->metadata_tile_line_buffer.bo = NULL;
2599
2600     dri_bo_unreference(hcpe_context->metadata_tile_column_buffer.bo);
2601     hcpe_context->metadata_tile_column_buffer.bo = NULL;
2602
2603     dri_bo_unreference(hcpe_context->sao_line_buffer.bo);
2604     hcpe_context->sao_line_buffer.bo = NULL;
2605
2606     dri_bo_unreference(hcpe_context->sao_tile_line_buffer.bo);
2607     hcpe_context->sao_tile_line_buffer.bo = NULL;
2608
2609     dri_bo_unreference(hcpe_context->sao_tile_column_buffer.bo);
2610     hcpe_context->sao_tile_column_buffer.bo = NULL;
2611
2612     /* mv temporal buffer */
2613     for (i = 0; i < NUM_HCP_CURRENT_COLLOCATED_MV_TEMPORAL_BUFFERS; i++) {
2614         if (hcpe_context->current_collocated_mv_temporal_buffer[i].bo != NULL)
2615             dri_bo_unreference(hcpe_context->current_collocated_mv_temporal_buffer[i].bo);
2616         hcpe_context->current_collocated_mv_temporal_buffer[i].bo = NULL;
2617     }
2618
2619     for (i = 0; i < MAX_HCP_REFERENCE_SURFACES; i++) {
2620         dri_bo_unreference(hcpe_context->reference_surfaces[i].bo);
2621         hcpe_context->reference_surfaces[i].bo = NULL;
2622     }
2623
2624     dri_bo_unreference(hcpe_context->hcp_indirect_cu_object.bo);
2625     hcpe_context->hcp_indirect_cu_object.bo = NULL;
2626
2627     dri_bo_unreference(hcpe_context->hcp_indirect_pak_bse_object.bo);
2628     hcpe_context->hcp_indirect_pak_bse_object.bo = NULL;
2629
2630     dri_bo_unreference(hcpe_context->hcp_batchbuffer_surface.bo);
2631     hcpe_context->hcp_batchbuffer_surface.bo = NULL;
2632
2633     dri_bo_unreference(hcpe_context->aux_batchbuffer_surface.bo);
2634     hcpe_context->aux_batchbuffer_surface.bo = NULL;
2635
2636     if (hcpe_context->aux_batchbuffer)
2637         intel_batchbuffer_free(hcpe_context->aux_batchbuffer);
2638
2639     hcpe_context->aux_batchbuffer = NULL;
2640
2641     free(hcpe_context);
2642 }
2643
2644 VAStatus gen9_hcpe_pipeline(VADriverContextP ctx,
2645                             VAProfile profile,
2646                             struct encode_state *encode_state,
2647                             struct intel_encoder_context *encoder_context)
2648 {
2649     VAStatus vaStatus;
2650
2651     switch (profile) {
2652     case VAProfileHEVCMain:
2653     case VAProfileHEVCMain10:
2654         vaStatus = gen9_hcpe_hevc_encode_picture(ctx, encode_state, encoder_context);
2655         break;
2656
2657     default:
2658         vaStatus = VA_STATUS_ERROR_UNSUPPORTED_PROFILE;
2659         break;
2660     }
2661
2662     return vaStatus;
2663 }
2664
2665 Bool gen9_hcpe_context_init(VADriverContextP ctx, struct intel_encoder_context *encoder_context)
2666 {
2667     struct gen9_hcpe_context *hcpe_context = calloc(1, sizeof(struct gen9_hcpe_context));
2668
2669     assert(hcpe_context);
2670     hcpe_context->pipe_mode_select = gen9_hcpe_pipe_mode_select;
2671     hcpe_context->set_surface_state = gen9_hcpe_surface_state;
2672     hcpe_context->ind_obj_base_addr_state = gen9_hcpe_ind_obj_base_addr_state;
2673     hcpe_context->pic_state = gen9_hcpe_hevc_pic_state;
2674     hcpe_context->qm_state = gen9_hcpe_hevc_qm_state;
2675     hcpe_context->fqm_state = gen9_hcpe_hevc_fqm_state;
2676     hcpe_context->insert_object = gen9_hcpe_hevc_insert_object;
2677     hcpe_context->buffer_suface_setup = gen8_gpe_buffer_suface_setup;
2678
2679     encoder_context->mfc_context = hcpe_context;
2680     encoder_context->mfc_context_destroy = gen9_hcpe_context_destroy;
2681     encoder_context->mfc_pipeline = gen9_hcpe_pipeline;
2682     encoder_context->mfc_brc_prepare = intel_hcpe_brc_prepare;
2683
2684     hevc_gen_default_iq_matrix_encoder(&hcpe_context->iq_matrix_hevc);
2685
2686     return True;
2687 }