OSDN Git Service

VDENC: Fix the incorrect shift in REF_IDX_STATE command
[android-x86/hardware-intel-common-vaapi.git] / src / gen8_mfc.c
1 /*
2  * Copyright © 2012 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the
6  * "Software"), to deal in the Software without restriction, including
7  * without limitation the rights to use, copy, modify, merge, publish,
8  * distribute, sub license, and/or sell copies of the Software, and to
9  * permit persons to whom the Software is furnished to do so, subject to
10  * the following conditions:
11  *
12  * The above copyright notice and this permission notice (including the
13  * next paragraph) shall be included in all copies or substantial portions
14  * of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
17  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
19  * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
20  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
21  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
22  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23  *
24  * Authors:
25  *    Zhao Yakui <yakui.zhao@intel.com>
26  *    Xiang Haihao <haihao.xiang@intel.com>
27  *
28  */
29
30 #include <stdio.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #include <math.h>
34 #include <assert.h>
35
36 #include "intel_batchbuffer.h"
37 #include "i965_defines.h"
38 #include "i965_structs.h"
39 #include "i965_drv_video.h"
40 #include "i965_encoder.h"
41 #include "i965_encoder_utils.h"
42 #include "gen6_mfc.h"
43 #include "gen6_vme.h"
44 #include "intel_media.h"
45 #include <va/va_enc_jpeg.h>
46 #include "vp8_probs.h"
47
48 #define SURFACE_STATE_PADDED_SIZE               SURFACE_STATE_PADDED_SIZE_GEN8
49 #define SURFACE_STATE_OFFSET(index)             (SURFACE_STATE_PADDED_SIZE * index)
50 #define BINDING_TABLE_OFFSET(index)             (SURFACE_STATE_OFFSET(MAX_MEDIA_SURFACES_GEN6) + sizeof(unsigned int) * index)
51
52 #define MFC_SOFTWARE_BATCH      0
53
54 #define B0_STEP_REV             2
55 #define IS_STEPPING_BPLUS(i965) ((i965->intel.revision) >= B0_STEP_REV)
56
57 //Zigzag scan order of the Luma and Chroma components
58 //Note: Jpeg Spec ISO/IEC 10918-1, Figure A.6 shows the zigzag order differently.
59 //The Spec is trying to show the zigzag pattern with number positions. The below
60 //table will use the pattern shown by A.6 and map the position of the elements in the array
61 static const uint32_t zigzag_direct[64] = {
62     0,   1,  8, 16,  9,  2,  3, 10,
63     17, 24, 32, 25, 18, 11,  4,  5,
64     12, 19, 26, 33, 40, 48, 41, 34,
65     27, 20, 13,  6,  7, 14, 21, 28,
66     35, 42, 49, 56, 57, 50, 43, 36,
67     29, 22, 15, 23, 30, 37, 44, 51,
68     58, 59, 52, 45, 38, 31, 39, 46,
69     53, 60, 61, 54, 47, 55, 62, 63
70 };
71
72 //Default Luminance quantization table
73 //Source: Jpeg Spec ISO/IEC 10918-1, Annex K, Table K.1
74 static const uint8_t jpeg_luma_quant[64] = {
75     16, 11, 10, 16, 24,  40,  51,  61,
76     12, 12, 14, 19, 26,  58,  60,  55,
77     14, 13, 16, 24, 40,  57,  69,  56,
78     14, 17, 22, 29, 51,  87,  80,  62,
79     18, 22, 37, 56, 68,  109, 103, 77,
80     24, 35, 55, 64, 81,  104, 113, 92,
81     49, 64, 78, 87, 103, 121, 120, 101,
82     72, 92, 95, 98, 112, 100, 103, 99    
83 };
84
85 //Default Chroma quantization table
86 //Source: Jpeg Spec ISO/IEC 10918-1, Annex K, Table K.2
87 static const uint8_t jpeg_chroma_quant[64] = {
88     17, 18, 24, 47, 99, 99, 99, 99,
89     18, 21, 26, 66, 99, 99, 99, 99,
90     24, 26, 56, 99, 99, 99, 99, 99,
91     47, 66, 99, 99, 99, 99, 99, 99,
92     99, 99, 99, 99, 99, 99, 99, 99,
93     99, 99, 99, 99, 99, 99, 99, 99,
94     99, 99, 99, 99, 99, 99, 99, 99,
95     99, 99, 99, 99, 99, 99, 99, 99
96 };
97
98
/* Map VAAPI huffman-table index (0 = luma, 1 = chroma) to the MFX
 * hardware huffman table id used when programming JPEG huffman state. */
99 static const int va_to_gen7_jpeg_hufftable[2] = {
100     MFX_HUFFTABLE_ID_Y,
101     MFX_HUFFTABLE_ID_UV
102 };
103
/* Raw instruction dwords for the MFC AVC batch-buffer media kernel,
 * generated at build time and included as initializer data.
 * The .g8b binary targets Gen8, the .g9b binary targets Gen9. */
104 static const uint32_t gen8_mfc_batchbuffer_avc[][4] = {
105 #include "shaders/utils/mfc_batchbuffer_hsw.g8b"
106 };
107
108 static const uint32_t gen9_mfc_batchbuffer_avc[][4] = {
109 #include "shaders/utils/mfc_batchbuffer_hsw.g9b"
110 };
111
/* Kernel descriptor table (name, interface id, binary, size, bo) for the
 * Gen8 MFC batch-buffer kernel; consumed by the GPE context setup. */
112 static struct i965_kernel gen8_mfc_kernels[] = {
113     {
114         "MFC AVC INTRA BATCHBUFFER ",
115         MFC_BATCHBUFFER_AVC_INTRA,
116         gen8_mfc_batchbuffer_avc,
117         sizeof(gen8_mfc_batchbuffer_avc),
118         NULL
119     },
120 };
121
/* Same descriptor but pointing at the Gen9 kernel binary. */
122 static struct i965_kernel gen9_mfc_kernels[] = {
123     {
124         "MFC AVC INTRA BATCHBUFFER ",
125         MFC_BATCHBUFFER_AVC_INTRA,
126         gen9_mfc_batchbuffer_avc,
127         sizeof(gen9_mfc_batchbuffer_avc),
128         NULL
129     },
130 };
131
/* Flat AVC quantizer matrix: every byte is 16 (0x10), packed four
 * coefficients per dword, as consumed by gen8_mfc_qm_state(). */
132 static const uint32_t qm_flat[16] = {
133     0x10101010, 0x10101010, 0x10101010, 0x10101010,
134     0x10101010, 0x10101010, 0x10101010, 0x10101010,
135     0x10101010, 0x10101010, 0x10101010, 0x10101010,
136     0x10101010, 0x10101010, 0x10101010, 0x10101010
137 };
138
/* Flat forward quantizer matrix: every 16-bit entry is 0x1000
 * (65536 / 16, matching the flat qm above), packed two per dword,
 * as consumed by gen8_mfc_fqm_state(). */
139 static const uint32_t fqm_flat[32] = {
140     0x10001000, 0x10001000, 0x10001000, 0x10001000,
141     0x10001000, 0x10001000, 0x10001000, 0x10001000,
142     0x10001000, 0x10001000, 0x10001000, 0x10001000,
143     0x10001000, 0x10001000, 0x10001000, 0x10001000,
144     0x10001000, 0x10001000, 0x10001000, 0x10001000,
145     0x10001000, 0x10001000, 0x10001000, 0x10001000,
146     0x10001000, 0x10001000, 0x10001000, 0x10001000,
147     0x10001000, 0x10001000, 0x10001000, 0x10001000
148 };
149
/* NOTE(review): these look like decodes of the VME-reported macroblock
 * inter prediction mode / sub-macroblock shape fields — confirm against
 * the gen6_vme output layout before relying on them elsewhere. */
150 #define         INTER_MODE_MASK         0x03
151 #define         INTER_8X8               0x03
152 #define         INTER_16X8              0x01
153 #define         INTER_8X16              0x02
154 #define         SUBMB_SHAPE_MASK        0x00FF00
155 #define         INTER_16X16             0x00
156
157 #define         INTER_MV8               (4 << 20)
158 #define         INTER_MV32              (6 << 20)
159
160
161 static void
162 gen8_mfc_pipe_mode_select(VADriverContextP ctx,
163                           int standard_select,
164                           struct intel_encoder_context *encoder_context)
165 {
166     struct intel_batchbuffer *batch = encoder_context->base.batch;
167     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
168
169     assert(standard_select == MFX_FORMAT_MPEG2 ||
170            standard_select == MFX_FORMAT_AVC   ||
171            standard_select == MFX_FORMAT_JPEG  ||
172            standard_select == MFX_FORMAT_VP8);
173
174     BEGIN_BCS_BATCH(batch, 5);
175
176     OUT_BCS_BATCH(batch, MFX_PIPE_MODE_SELECT | (5 - 2));
177     OUT_BCS_BATCH(batch,
178                   (MFX_LONG_MODE << 17) | /* Must be long format for encoder */
179                   (MFD_MODE_VLD << 15) | /* VLD mode */
180                   (0 << 10) | /* Stream-Out Enable */
181                   ((!!mfc_context->post_deblocking_output.bo) << 9)  | /* Post Deblocking Output */
182                   ((!!mfc_context->pre_deblocking_output.bo) << 8)  | /* Pre Deblocking Output */
183                   (0 << 6)  | /* frame statistics stream-out enable*/
184                   (0 << 5)  | /* not in stitch mode */
185                   (1 << 4)  | /* encoding mode */
186                   (standard_select << 0));  /* standard select: avc or mpeg2 or jpeg*/
187     OUT_BCS_BATCH(batch,
188                   (0 << 7)  | /* expand NOA bus flag */
189                   (0 << 6)  | /* disable slice-level clock gating */
190                   (0 << 5)  | /* disable clock gating for NOA */
191                   (0 << 4)  | /* terminate if AVC motion and POC table error occurs */
192                   (0 << 3)  | /* terminate if AVC mbdata error occurs */
193                   (0 << 2)  | /* terminate if AVC CABAC/CAVLC decode error occurs */
194                   (0 << 1)  |
195                   (0 << 0));
196     OUT_BCS_BATCH(batch, 0);
197     OUT_BCS_BATCH(batch, 0);
198
199     ADVANCE_BCS_BATCH(batch);
200 }
201
202 static void
203 gen8_mfc_surface_state(VADriverContextP ctx, struct intel_encoder_context *encoder_context)
204 {
205     struct intel_batchbuffer *batch = encoder_context->base.batch;
206     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
207
208     BEGIN_BCS_BATCH(batch, 6);
209
210     OUT_BCS_BATCH(batch, MFX_SURFACE_STATE | (6 - 2));
211     OUT_BCS_BATCH(batch, 0);
212     OUT_BCS_BATCH(batch,
213                   ((mfc_context->surface_state.height - 1) << 18) |
214                   ((mfc_context->surface_state.width - 1) << 4));
215     OUT_BCS_BATCH(batch,
216                   (MFX_SURFACE_PLANAR_420_8 << 28) | /* 420 planar YUV surface */
217                   (1 << 27) | /* must be 1 for interleave U/V, hardware requirement */
218                   (0 << 22) | /* surface object control state, FIXME??? */
219                   ((mfc_context->surface_state.w_pitch - 1) << 3) | /* pitch */
220                   (0 << 2)  | /* must be 0 for interleave U/V */
221                   (1 << 1)  | /* must be tiled */
222                   (I965_TILEWALK_YMAJOR << 0));  /* tile walk, TILEWALK_YMAJOR */
223     OUT_BCS_BATCH(batch,
224                   (0 << 16) |                                                           /* must be 0 for interleave U/V */
225                   (mfc_context->surface_state.h_pitch));                /* y offset for U(cb) */
226     OUT_BCS_BATCH(batch, 0);
227
228     ADVANCE_BCS_BATCH(batch);
229 }
230
/*
 * Emit MFX_IND_OBJ_BASE_ADDR_STATE (26 DWords): programs the indirect
 * object base addresses for the MFX engine — the VME motion-vector input
 * object and the PAK-BSE object that receives the compressed bitstream.
 * JPEG has no VME stage, so its MV object fields are zeroed and its BSE
 * base starts at the configured offset; VP8 additionally programs the
 * bitstream upper bound in DW4-5.
 */
231 static void
232 gen8_mfc_ind_obj_base_addr_state(VADriverContextP ctx,
233                                  struct intel_encoder_context *encoder_context)
234 {
235     struct i965_driver_data *i965 = i965_driver_data(ctx);
236     struct intel_batchbuffer *batch = encoder_context->base.batch;
237     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
238     struct gen6_vme_context *vme_context = encoder_context->vme_context;
239     int vme_size;
240     unsigned int bse_offset;
241
242     BEGIN_BCS_BATCH(batch, 26);
243
244     OUT_BCS_BATCH(batch, MFX_IND_OBJ_BASE_ADDR_STATE | (26 - 2));
245     /* the DW1-3 is for the MFX indirect bitstream offset */
246     OUT_BCS_BATCH(batch, 0);
247     OUT_BCS_BATCH(batch, 0);
248     OUT_BCS_BATCH(batch, 0);
249
250     /* the DW4-5 is the MFX upper bound */
251     if (encoder_context->codec == CODEC_VP8) {
252         OUT_BCS_RELOC64(batch,
253                 mfc_context->mfc_indirect_pak_bse_object.bo,
254                 I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
255                 mfc_context->mfc_indirect_pak_bse_object.end_offset);
256     } else {
257         OUT_BCS_BATCH(batch, 0);
258         OUT_BCS_BATCH(batch, 0);
259     }
260
261     if(encoder_context->codec != CODEC_JPEG) {
262         vme_size = vme_context->vme_output.size_block * vme_context->vme_output.num_blocks;
263         /* the DW6-10 is for MFX Indirect MV Object Base Address */
264         OUT_BCS_RELOC64(batch, vme_context->vme_output.bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
265         OUT_BCS_BATCH(batch, i965->intel.mocs_state);
266         OUT_BCS_RELOC64(batch, vme_context->vme_output.bo, I915_GEM_DOMAIN_INSTRUCTION, 0, vme_size);
267     } else {
268         /* No VME for JPEG */
269         OUT_BCS_BATCH(batch, 0);
270         OUT_BCS_BATCH(batch, 0);
271         OUT_BCS_BATCH(batch, 0);
272         OUT_BCS_BATCH(batch, 0);
273         OUT_BCS_BATCH(batch, 0);
274     }
275
276     /* the DW11-15 is for MFX IT-COFF. Not used on encoder */
277     OUT_BCS_BATCH(batch, 0);
278     OUT_BCS_BATCH(batch, 0);
279     OUT_BCS_BATCH(batch, 0);
280     OUT_BCS_BATCH(batch, 0);
281     OUT_BCS_BATCH(batch, 0);
282
283     /* the DW16-20 is for MFX indirect DBLK. Not used on encoder */
284     OUT_BCS_BATCH(batch, 0);
285     OUT_BCS_BATCH(batch, 0);
286     OUT_BCS_BATCH(batch, 0);
287     OUT_BCS_BATCH(batch, 0);
288     OUT_BCS_BATCH(batch, 0);
289
290     /* the DW21-25 is for MFC Indirect PAK-BSE Object Base Address for Encoder*/        
/* JPEG writes its bitstream starting at the object's configured offset;
 * all other codecs start at the beginning of the buffer. */
291     bse_offset = (encoder_context->codec == CODEC_JPEG) ? (mfc_context->mfc_indirect_pak_bse_object.offset) : 0;
292     OUT_BCS_RELOC64(batch,
293                   mfc_context->mfc_indirect_pak_bse_object.bo,
294                   I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
295                   bse_offset);
296     OUT_BCS_BATCH(batch, i965->intel.mocs_state);
297         
298     OUT_BCS_RELOC64(batch,
299                   mfc_context->mfc_indirect_pak_bse_object.bo,
300                   I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
301                   mfc_context->mfc_indirect_pak_bse_object.end_offset);
302
303     ADVANCE_BCS_BATCH(batch);
304 }
305
/*
 * Emit MFX_AVC_IMG_STATE (16 DWords): per-picture AVC encoding parameters —
 * frame size in macroblocks, QP offsets, weighted prediction, entropy coding
 * mode (CAVLC/CABAC), transform mode, and MB rate-control defaults.  Frame
 * dimensions are derived from the surface state, rounded up to whole MBs.
 */
306 static void
307 gen8_mfc_avc_img_state(VADriverContextP ctx, struct encode_state *encode_state,  
308                        struct intel_encoder_context *encoder_context)
309 {
310     struct intel_batchbuffer *batch = encoder_context->base.batch;
311     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
312     VAEncPictureParameterBufferH264 *pPicParameter = (VAEncPictureParameterBufferH264 *)encode_state->pic_param_ext->buffer;
313
314     int width_in_mbs = (mfc_context->surface_state.width + 15) / 16;
315     int height_in_mbs = (mfc_context->surface_state.height + 15) / 16;
316
317     BEGIN_BCS_BATCH(batch, 16);
318
319     OUT_BCS_BATCH(batch, MFX_AVC_IMG_STATE | (16 - 2));
320     /*DW1. MB setting of frame */
321     OUT_BCS_BATCH(batch,
322                   ((width_in_mbs * height_in_mbs - 1) & 0xFFFF));
323     OUT_BCS_BATCH(batch, 
324                   ((height_in_mbs - 1) << 16) | 
325                   ((width_in_mbs - 1) << 0));
326     /* DW3 QP setting */
327     OUT_BCS_BATCH(batch, 
328                   (0 << 24) |   /* Second Chroma QP Offset */
329                   (0 << 16) |   /* Chroma QP Offset */
330                   (0 << 14) |   /* Max-bit conformance Intra flag */
331                   (0 << 13) |   /* Max Macroblock size conformance Inter flag */
332                   (pPicParameter->pic_fields.bits.weighted_pred_flag << 12) |   /*Weighted_Pred_Flag */
333                   (pPicParameter->pic_fields.bits.weighted_bipred_idc << 10) |  /* Weighted_BiPred_Idc */
334                   (0 << 8)  |   /* FIXME: Image Structure */
335                   (0 << 0) );   /* Current Decoded Image Frame Store ID, reserved in Encode mode */
336     OUT_BCS_BATCH(batch,
337                   (0 << 16) |   /* Minimum Frame size */
338                   (0 << 15) |   /* Disable reading of Macroblock Status Buffer */
339                   (0 << 14) |   /* Load BitStream Pointer only once, 1 slice 1 frame */
340                   (0 << 13) |   /* CABAC 0 word insertion test enable */
341                   (1 << 12) |   /* MVUnpackedEnable,compliant to DXVA */
342                   (1 << 10) |   /* Chroma Format IDC, 4:2:0 */
343                   (0 << 8)  |   /* FIXME: MbMvFormatFlag */
344                   (pPicParameter->pic_fields.bits.entropy_coding_mode_flag << 7)  |   /*0:CAVLC encoding mode,1:CABAC*/
345                   (0 << 6)  |   /* Only valid for VLD decoding mode */
346                   (0 << 5)  |   /* Constrained Intra Prediction Flag, from PPS */
347                   (0 << 4)  |   /* Direct 8x8 inference flag */
348                   (pPicParameter->pic_fields.bits.transform_8x8_mode_flag << 3)  |   /*8x8 or 4x4 IDCT Transform Mode Flag*/
349                   (1 << 2)  |   /* Frame MB only flag */
350                   (0 << 1)  |   /* MBAFF mode is in active */
351                   (0 << 0));    /* Field picture flag */
352     /* DW5 Trellis quantization */
353     OUT_BCS_BATCH(batch, 0);    /* Mainly about MB rate control and debug, just ignoring */
354     OUT_BCS_BATCH(batch,        /* Inter and Intra Conformance Max size limit */
355                   (0xBB8 << 16) |       /* InterMbMaxSz */
356                   (0xEE8) );            /* IntraMbMaxSz */
357     OUT_BCS_BATCH(batch, 0);            /* Reserved */
358     /* DW8. QP delta */
359     OUT_BCS_BATCH(batch, 0);            /* Slice QP Delta for bitrate control */
360     OUT_BCS_BATCH(batch, 0);            /* Slice QP Delta for bitrate control */
361     /* DW10. Bit setting for MB */
362     OUT_BCS_BATCH(batch, 0x8C000000);
363     OUT_BCS_BATCH(batch, 0x00010000);
364     /* DW12. */
365     OUT_BCS_BATCH(batch, 0);
366     OUT_BCS_BATCH(batch, 0x02010100);
367     /* DW14. For short format */
368     OUT_BCS_BATCH(batch, 0);
369     OUT_BCS_BATCH(batch, 0);
370
371     ADVANCE_BCS_BATCH(batch);
372 }
373
374 static void
375 gen8_mfc_qm_state(VADriverContextP ctx,
376                   int qm_type,
377                   const uint32_t *qm,
378                   int qm_length,
379                   struct intel_encoder_context *encoder_context)
380 {
381     struct intel_batchbuffer *batch = encoder_context->base.batch;
382     unsigned int qm_buffer[16];
383
384     assert(qm_length <= 16);
385     assert(sizeof(*qm) == 4);
386     memcpy(qm_buffer, qm, qm_length * 4);
387
388     BEGIN_BCS_BATCH(batch, 18);
389     OUT_BCS_BATCH(batch, MFX_QM_STATE | (18 - 2));
390     OUT_BCS_BATCH(batch, qm_type << 0);
391     intel_batchbuffer_data(batch, qm_buffer, 16 * 4);
392     ADVANCE_BCS_BATCH(batch);
393 }
394
395 static void
396 gen8_mfc_avc_qm_state(VADriverContextP ctx,
397                       struct encode_state *encode_state,
398                       struct intel_encoder_context *encoder_context)
399 {
400     const unsigned int *qm_4x4_intra;
401     const unsigned int *qm_4x4_inter;
402     const unsigned int *qm_8x8_intra;
403     const unsigned int *qm_8x8_inter;
404     VAEncSequenceParameterBufferH264 *pSeqParameter =
405         (VAEncSequenceParameterBufferH264 *)encode_state->seq_param_ext->buffer;
406     VAEncPictureParameterBufferH264 *pPicParameter =
407         (VAEncPictureParameterBufferH264 *)encode_state->pic_param_ext->buffer;
408
409     if (!pSeqParameter->seq_fields.bits.seq_scaling_matrix_present_flag
410         && !pPicParameter->pic_fields.bits.pic_scaling_matrix_present_flag) {
411         qm_4x4_intra = qm_4x4_inter = qm_8x8_intra = qm_8x8_inter = qm_flat;
412     } else {
413         VAIQMatrixBufferH264 *qm;
414         assert(encode_state->q_matrix && encode_state->q_matrix->buffer);
415         qm = (VAIQMatrixBufferH264 *)encode_state->q_matrix->buffer;
416         qm_4x4_intra = (unsigned int *)qm->ScalingList4x4[0];
417         qm_4x4_inter = (unsigned int *)qm->ScalingList4x4[3];
418         qm_8x8_intra = (unsigned int *)qm->ScalingList8x8[0];
419         qm_8x8_inter = (unsigned int *)qm->ScalingList8x8[1];
420     }
421
422     gen8_mfc_qm_state(ctx, MFX_QM_AVC_4X4_INTRA_MATRIX, qm_4x4_intra, 12, encoder_context);
423     gen8_mfc_qm_state(ctx, MFX_QM_AVC_4X4_INTER_MATRIX, qm_4x4_inter, 12, encoder_context);
424     gen8_mfc_qm_state(ctx, MFX_QM_AVC_8x8_INTRA_MATRIX, qm_8x8_intra, 16, encoder_context);
425     gen8_mfc_qm_state(ctx, MFX_QM_AVC_8x8_INTER_MATRIX, qm_8x8_inter, 16, encoder_context);
426 }
427
428 static void
429 gen8_mfc_fqm_state(VADriverContextP ctx,
430                    int fqm_type,
431                    const uint32_t *fqm,
432                    int fqm_length,
433                    struct intel_encoder_context *encoder_context)
434 {
435     struct intel_batchbuffer *batch = encoder_context->base.batch;
436     unsigned int fqm_buffer[32];
437
438     assert(fqm_length <= 32);
439     assert(sizeof(*fqm) == 4);
440     memcpy(fqm_buffer, fqm, fqm_length * 4);
441
442     BEGIN_BCS_BATCH(batch, 34);
443     OUT_BCS_BATCH(batch, MFX_FQM_STATE | (34 - 2));
444     OUT_BCS_BATCH(batch, fqm_type << 0);
445     intel_batchbuffer_data(batch, fqm_buffer, 32 * 4);
446     ADVANCE_BCS_BATCH(batch);
447 }
448
/*
 * Convert a quantizer matrix into its forward (reciprocal) form:
 * fqm[r][c] = 65536 / qm[c][r].  Note the transposed indexing of qm
 * relative to fqm.  qm entries must be non-zero; an entry of 1 yields
 * 65536, which truncates to 0 in the uint16_t output.
 */
static void
gen8_mfc_avc_fill_fqm(uint8_t *qm, uint16_t *fqm, int len)
{
    int row, col;

    for (row = 0; row < len; row++) {
        for (col = 0; col < len; col++)
            fqm[row * len + col] = (1 << 16) / qm[col * len + row];
    }
}
457
458 static void
459 gen8_mfc_avc_fqm_state(VADriverContextP ctx,
460                        struct encode_state *encode_state,
461                        struct intel_encoder_context *encoder_context)
462 {
463     VAEncSequenceParameterBufferH264 *pSeqParameter =
464         (VAEncSequenceParameterBufferH264 *)encode_state->seq_param_ext->buffer;
465     VAEncPictureParameterBufferH264 *pPicParameter =
466         (VAEncPictureParameterBufferH264 *)encode_state->pic_param_ext->buffer;
467
468     if (!pSeqParameter->seq_fields.bits.seq_scaling_matrix_present_flag
469         && !pPicParameter->pic_fields.bits.pic_scaling_matrix_present_flag) {
470         gen8_mfc_fqm_state(ctx, MFX_QM_AVC_4X4_INTRA_MATRIX, fqm_flat, 24, encoder_context);
471         gen8_mfc_fqm_state(ctx, MFX_QM_AVC_4X4_INTER_MATRIX, fqm_flat, 24, encoder_context);
472         gen8_mfc_fqm_state(ctx, MFX_QM_AVC_8x8_INTRA_MATRIX, fqm_flat, 32, encoder_context);
473         gen8_mfc_fqm_state(ctx, MFX_QM_AVC_8x8_INTER_MATRIX, fqm_flat, 32, encoder_context);
474     } else {
475         int i;
476         uint32_t fqm[32];
477         VAIQMatrixBufferH264 *qm;
478         assert(encode_state->q_matrix && encode_state->q_matrix->buffer);
479         qm = (VAIQMatrixBufferH264 *)encode_state->q_matrix->buffer;
480
481         for (i = 0; i < 3; i++)
482             gen8_mfc_avc_fill_fqm(qm->ScalingList4x4[i], (uint16_t *)fqm + 16 * i, 4);
483         gen8_mfc_fqm_state(ctx, MFX_QM_AVC_4X4_INTRA_MATRIX, fqm, 24, encoder_context);
484
485         for (i = 3; i < 6; i++)
486             gen8_mfc_avc_fill_fqm(qm->ScalingList4x4[i], (uint16_t *)fqm + 16 * (i - 3), 4);
487         gen8_mfc_fqm_state(ctx, MFX_QM_AVC_4X4_INTER_MATRIX, fqm, 24, encoder_context);
488
489         gen8_mfc_avc_fill_fqm(qm->ScalingList8x8[0], (uint16_t *)fqm, 8);
490         gen8_mfc_fqm_state(ctx, MFX_QM_AVC_8x8_INTRA_MATRIX, fqm, 32, encoder_context);
491
492         gen8_mfc_avc_fill_fqm(qm->ScalingList8x8[1], (uint16_t *)fqm, 8);
493         gen8_mfc_fqm_state(ctx, MFX_QM_AVC_8x8_INTER_MATRIX, fqm, 32, encoder_context);
494     }
495 }
496
497 static void
498 gen8_mfc_avc_insert_object(VADriverContextP ctx, struct intel_encoder_context *encoder_context,
499                            unsigned int *insert_data, int lenght_in_dws, int data_bits_in_last_dw,
500                            int skip_emul_byte_count, int is_last_header, int is_end_of_slice, int emulation_flag,
501                            struct intel_batchbuffer *batch)
502 {
503     if (batch == NULL)
504         batch = encoder_context->base.batch;
505
506     if (data_bits_in_last_dw == 0)
507         data_bits_in_last_dw = 32;
508
509     BEGIN_BCS_BATCH(batch, lenght_in_dws + 2);
510
511     OUT_BCS_BATCH(batch, MFX_INSERT_OBJECT | (lenght_in_dws + 2 - 2));
512     OUT_BCS_BATCH(batch,
513                   (0 << 16) |   /* always start at offset 0 */
514                   (data_bits_in_last_dw << 8) |
515                   (skip_emul_byte_count << 4) |
516                   (!!emulation_flag << 3) |
517                   ((!!is_last_header) << 2) |
518                   ((!!is_end_of_slice) << 1) |
519                   (0 << 0));    /* FIXME: ??? */
520     intel_batchbuffer_data(batch, insert_data, lenght_in_dws * 4);
521
522     ADVANCE_BCS_BATCH(batch);
523 }
524
525
/*
 * Per-frame MFC (re)initialization: derives the frame size in macroblocks
 * from the codec-specific parameter buffer, releases all buffer objects
 * held from the previous frame, and allocates the scratch buffers the MFX
 * engine needs (intra row store, MB status, deblocking filter row store,
 * BSD/MPC row store) plus the auxiliary slice batch buffer.  Finally
 * initializes the GPE context for the media kernels.
 */
526 static void gen8_mfc_init(VADriverContextP ctx,
527                           struct encode_state *encode_state,
528                           struct intel_encoder_context *encoder_context)
529 {
530     struct i965_driver_data *i965 = i965_driver_data(ctx);
531     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
532     dri_bo *bo;
533     int i;
534     int width_in_mbs = 0;
535     int height_in_mbs = 0;
536     int slice_batchbuffer_size;
537
/* Frame dimensions come from different parameter buffers per codec. */
538     if (encoder_context->codec == CODEC_H264 ||
539         encoder_context->codec == CODEC_H264_MVC) {
540         VAEncSequenceParameterBufferH264 *pSequenceParameter = (VAEncSequenceParameterBufferH264 *)encode_state->seq_param_ext->buffer;
541         width_in_mbs = pSequenceParameter->picture_width_in_mbs;
542         height_in_mbs = pSequenceParameter->picture_height_in_mbs;
543     } else if (encoder_context->codec == CODEC_MPEG2) {
544         VAEncSequenceParameterBufferMPEG2 *pSequenceParameter = (VAEncSequenceParameterBufferMPEG2 *)encode_state->seq_param_ext->buffer;
545
546         assert(encoder_context->codec == CODEC_MPEG2);
547
548         width_in_mbs = ALIGN(pSequenceParameter->picture_width, 16) / 16;
549         height_in_mbs = ALIGN(pSequenceParameter->picture_height, 16) / 16;
550     } else {
551         assert(encoder_context->codec == CODEC_JPEG);
552         VAEncPictureParameterBufferJPEG *pic_param = (VAEncPictureParameterBufferJPEG *)encode_state->pic_param_ext->buffer;
553
554         width_in_mbs = ALIGN(pic_param->picture_width, 16) / 16;
555         height_in_mbs = ALIGN(pic_param->picture_height, 16) / 16;
556     }
557
/* 64 bytes per MB plus fixed overhead and per-slice header/tail room. */
558     slice_batchbuffer_size = 64 * width_in_mbs * height_in_mbs + 4096 +
559                 (SLICE_HEADER + SLICE_TAIL) * encode_state->num_slice_params_ext;
560
561     /*Encode common setup for MFC*/
562     dri_bo_unreference(mfc_context->post_deblocking_output.bo);
563     mfc_context->post_deblocking_output.bo = NULL;
564
565     dri_bo_unreference(mfc_context->pre_deblocking_output.bo);
566     mfc_context->pre_deblocking_output.bo = NULL;
567
568     dri_bo_unreference(mfc_context->uncompressed_picture_source.bo);
569     mfc_context->uncompressed_picture_source.bo = NULL;
570
571     dri_bo_unreference(mfc_context->mfc_indirect_pak_bse_object.bo); 
572     mfc_context->mfc_indirect_pak_bse_object.bo = NULL;
573
574     for (i = 0; i < NUM_MFC_DMV_BUFFERS; i++){
575         if (mfc_context->direct_mv_buffers[i].bo != NULL)
576             dri_bo_unreference(mfc_context->direct_mv_buffers[i].bo);
577         mfc_context->direct_mv_buffers[i].bo = NULL;
578     }
579
580     for (i = 0; i < MAX_MFC_REFERENCE_SURFACES; i++){
581         if (mfc_context->reference_surfaces[i].bo != NULL)
582             dri_bo_unreference(mfc_context->reference_surfaces[i].bo);
583         mfc_context->reference_surfaces[i].bo = NULL;  
584     }
585
/* Scratch buffers sized from the frame's MB dimensions. */
586     dri_bo_unreference(mfc_context->intra_row_store_scratch_buffer.bo);
587     bo = dri_bo_alloc(i965->intel.bufmgr,
588                       "Buffer",
589                       width_in_mbs * 64,
590                       64);
591     assert(bo);
592     mfc_context->intra_row_store_scratch_buffer.bo = bo;
593
594     dri_bo_unreference(mfc_context->macroblock_status_buffer.bo);
595     bo = dri_bo_alloc(i965->intel.bufmgr,
596                       "Buffer",
597                       width_in_mbs * height_in_mbs * 16,
598                       64);
599     assert(bo);
600     mfc_context->macroblock_status_buffer.bo = bo;
601
602     dri_bo_unreference(mfc_context->deblocking_filter_row_store_scratch_buffer.bo);
603     bo = dri_bo_alloc(i965->intel.bufmgr,
604                       "Buffer",
605                       4 * width_in_mbs * 64,  /* 4 * width_in_mbs * 64 */
606                       64);
607     assert(bo);
608     mfc_context->deblocking_filter_row_store_scratch_buffer.bo = bo;
609
610     dri_bo_unreference(mfc_context->bsd_mpc_row_store_scratch_buffer.bo);
611     bo = dri_bo_alloc(i965->intel.bufmgr,
612                       "Buffer",
613                       2 * width_in_mbs * 64, /* 2 * width_in_mbs * 64 */
614                       0x1000);
615     assert(bo);
616     mfc_context->bsd_mpc_row_store_scratch_buffer.bo = bo;
617
618     dri_bo_unreference(mfc_context->mfc_batchbuffer_surface.bo);
619     mfc_context->mfc_batchbuffer_surface.bo = NULL;
620
621     dri_bo_unreference(mfc_context->aux_batchbuffer_surface.bo);
622     mfc_context->aux_batchbuffer_surface.bo = NULL;
623
624     if (mfc_context->aux_batchbuffer)
625         intel_batchbuffer_free(mfc_context->aux_batchbuffer);
626
/* The aux batch buffer doubles as a surface (16 bytes per block). */
627     mfc_context->aux_batchbuffer = intel_batchbuffer_new(&i965->intel, I915_EXEC_BSD, slice_batchbuffer_size);
628     mfc_context->aux_batchbuffer_surface.bo = mfc_context->aux_batchbuffer->buffer;
629     dri_bo_reference(mfc_context->aux_batchbuffer_surface.bo);
630     mfc_context->aux_batchbuffer_surface.pitch = 16;
631     mfc_context->aux_batchbuffer_surface.num_blocks = mfc_context->aux_batchbuffer->size / 16;
632     mfc_context->aux_batchbuffer_surface.size_block = 16;
633
634     gen8_gpe_context_init(ctx, &mfc_context->gpe_context);
635 }
636
/*
 * Emit MFX_PIPE_BUF_ADDR_STATE (61 DWords): programs every buffer address
 * the MFX pipeline uses for this frame — pre/post deblocking outputs, the
 * uncompressed source, MB status/stream-out, the row-store scratch buffers,
 * and the 16 reference picture slots.  Each address is a 2-DWord relocation
 * followed (per group) by a MOCS cache-control DWord.  Unused slots are
 * written as zero.
 */
637 static void
638 gen8_mfc_pipe_buf_addr_state(VADriverContextP ctx,
639                              struct intel_encoder_context *encoder_context)
640 {
641     struct i965_driver_data *i965 = i965_driver_data(ctx);
642     struct intel_batchbuffer *batch = encoder_context->base.batch;
643     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
644     int i;
645
646     BEGIN_BCS_BATCH(batch, 61);
647
648     OUT_BCS_BATCH(batch, MFX_PIPE_BUF_ADDR_STATE | (61 - 2));
649
650     /* the DW1-3 is for pre_deblocking */
651     if (mfc_context->pre_deblocking_output.bo)
652         OUT_BCS_RELOC64(batch, mfc_context->pre_deblocking_output.bo,
653                       I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
654                       0);
655     else {
656         OUT_BCS_BATCH(batch, 0);
657         OUT_BCS_BATCH(batch, 0);                                                                                        /* pre output addr   */
658
659     }
660     OUT_BCS_BATCH(batch, i965->intel.mocs_state);
661     /* the DW4-6 is for the post_deblocking */
662
663     if (mfc_context->post_deblocking_output.bo)
664         OUT_BCS_RELOC64(batch, mfc_context->post_deblocking_output.bo,
665                       I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
666                       0);                                                                                       /* post output addr  */ 
667     else {
668         OUT_BCS_BATCH(batch, 0);
669         OUT_BCS_BATCH(batch, 0);
670     }
671     
672     OUT_BCS_BATCH(batch, i965->intel.mocs_state);
673
674     /* the DW7-9 is for the uncompressed_picture */
675     OUT_BCS_RELOC64(batch, mfc_context->uncompressed_picture_source.bo,
676                   I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
677                   0); /* uncompressed data */
678
679     OUT_BCS_BATCH(batch, i965->intel.mocs_state);
680
681     /* the DW10-12 is for the mb status */
682     OUT_BCS_RELOC64(batch, mfc_context->macroblock_status_buffer.bo,
683                   I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
684                   0); /* StreamOut data*/
685     
686     OUT_BCS_BATCH(batch, i965->intel.mocs_state);
687
688     /* the DW13-15 is for the intra_row_store_scratch */
689     OUT_BCS_RELOC64(batch, mfc_context->intra_row_store_scratch_buffer.bo,
690                   I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
691                   0);   
692
693     OUT_BCS_BATCH(batch, i965->intel.mocs_state);
694
695     /* the DW16-18 is for the deblocking filter */
696     OUT_BCS_RELOC64(batch, mfc_context->deblocking_filter_row_store_scratch_buffer.bo,
697                   I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
698                   0);
699
700     OUT_BCS_BATCH(batch, i965->intel.mocs_state);
701
702     /* the DW 19-50 is for Reference pictures*/
703     for (i = 0; i < ARRAY_ELEMS(mfc_context->reference_surfaces); i++) {
704         if ( mfc_context->reference_surfaces[i].bo != NULL) {
705             OUT_BCS_RELOC64(batch, mfc_context->reference_surfaces[i].bo,
706                           I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
707                           0);                   
708         } else {
709             OUT_BCS_BATCH(batch, 0);
710             OUT_BCS_BATCH(batch, 0);
711         }
712
713     }
714
/* DW51: one MOCS DWord shared by all reference picture addresses. */
715     OUT_BCS_BATCH(batch, i965->intel.mocs_state);
716
717     /* The DW 52-54 is for the MB status buffer */
718     OUT_BCS_RELOC64(batch, mfc_context->macroblock_status_buffer.bo,
719                   I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
720                   0);                                                                                   /* Macroblock status buffer*/
721         
722     OUT_BCS_BATCH(batch, i965->intel.mocs_state);
723
724     /* the DW 55-57 is the ILDB buffer */
725     OUT_BCS_BATCH(batch, 0);
726     OUT_BCS_BATCH(batch, 0);
727     OUT_BCS_BATCH(batch, 0);
728
729     /* the DW 58-60 is the second ILDB buffer */
730     OUT_BCS_BATCH(batch, 0);
731     OUT_BCS_BATCH(batch, 0);
732     OUT_BCS_BATCH(batch, 0);
733
734     ADVANCE_BCS_BATCH(batch);
735 }
736
737 static void
738 gen8_mfc_avc_directmode_state(VADriverContextP ctx,
739                               struct intel_encoder_context *encoder_context)
740 {
741     struct i965_driver_data *i965 = i965_driver_data(ctx);
742     struct intel_batchbuffer *batch = encoder_context->base.batch;
743     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
744
745     int i;
746
747     BEGIN_BCS_BATCH(batch, 71);
748
749     OUT_BCS_BATCH(batch, MFX_AVC_DIRECTMODE_STATE | (71 - 2));
750
751     /* Reference frames and Current frames */
752     /* the DW1-32 is for the direct MV for reference */
753     for(i = 0; i < NUM_MFC_DMV_BUFFERS - 2; i += 2) {
754         if ( mfc_context->direct_mv_buffers[i].bo != NULL) { 
755             OUT_BCS_RELOC64(batch, mfc_context->direct_mv_buffers[i].bo,
756                           I915_GEM_DOMAIN_INSTRUCTION, 0,
757                           0);
758         } else {
759             OUT_BCS_BATCH(batch, 0);
760             OUT_BCS_BATCH(batch, 0);
761         }
762     }
763     
764     OUT_BCS_BATCH(batch, i965->intel.mocs_state);
765
766     /* the DW34-36 is the MV for the current reference */
767     OUT_BCS_RELOC64(batch, mfc_context->direct_mv_buffers[NUM_MFC_DMV_BUFFERS - 2].bo,
768                   I915_GEM_DOMAIN_INSTRUCTION, 0,
769                   0);
770
771     OUT_BCS_BATCH(batch, i965->intel.mocs_state);
772
773     /* POL list */
774     for(i = 0; i < 32; i++) {
775         OUT_BCS_BATCH(batch, i/2);
776     }
777     OUT_BCS_BATCH(batch, 0);
778     OUT_BCS_BATCH(batch, 0);
779
780     ADVANCE_BCS_BATCH(batch);
781 }
782
783
784 static void
785 gen8_mfc_bsp_buf_base_addr_state(VADriverContextP ctx,
786                                  struct intel_encoder_context *encoder_context)
787 {
788     struct i965_driver_data *i965 = i965_driver_data(ctx);
789     struct intel_batchbuffer *batch = encoder_context->base.batch;
790     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
791
792     BEGIN_BCS_BATCH(batch, 10);
793
794     OUT_BCS_BATCH(batch, MFX_BSP_BUF_BASE_ADDR_STATE | (10 - 2));
795     OUT_BCS_RELOC64(batch, mfc_context->bsd_mpc_row_store_scratch_buffer.bo,
796                   I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
797                   0);
798     OUT_BCS_BATCH(batch, i965->intel.mocs_state);
799         
800     /* the DW4-6 is for MPR Row Store Scratch Buffer Base Address */
801     OUT_BCS_BATCH(batch, 0);
802     OUT_BCS_BATCH(batch, 0);
803     OUT_BCS_BATCH(batch, 0);
804
805     /* the DW7-9 is for Bitplane Read Buffer Base Address */
806     OUT_BCS_BATCH(batch, 0);
807     OUT_BCS_BATCH(batch, 0);
808     OUT_BCS_BATCH(batch, 0);
809
810     ADVANCE_BCS_BATCH(batch);
811 }
812
813
/*
 * Program all per-picture MFX state for AVC encode, in the order the
 * hardware expects: pipe mode, surfaces, indirect objects, buffer
 * addresses, image/quantization state, direct-mode state and finally
 * the reference index lists. The indirect calls go through the
 * mfc_context vtable so Gen-specific overrides apply.
 */
static void gen8_mfc_avc_pipeline_picture_programing( VADriverContextP ctx,
                                                      struct encode_state *encode_state,
                                                      struct intel_encoder_context *encoder_context)
{
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;

    mfc_context->pipe_mode_select(ctx, MFX_FORMAT_AVC, encoder_context);
    mfc_context->set_surface_state(ctx, encoder_context);
    mfc_context->ind_obj_base_addr_state(ctx, encoder_context);
    gen8_mfc_pipe_buf_addr_state(ctx, encoder_context);
    gen8_mfc_bsp_buf_base_addr_state(ctx, encoder_context);
    mfc_context->avc_img_state(ctx, encode_state, encoder_context);
    mfc_context->avc_qm_state(ctx, encode_state, encoder_context);
    mfc_context->avc_fqm_state(ctx, encode_state, encoder_context);
    gen8_mfc_avc_directmode_state(ctx, encoder_context); 
    intel_mfc_avc_ref_idx_state(ctx, encode_state, encoder_context);
}
831
832
833 static VAStatus gen8_mfc_run(VADriverContextP ctx, 
834                              struct encode_state *encode_state,
835                              struct intel_encoder_context *encoder_context)
836 {
837     struct intel_batchbuffer *batch = encoder_context->base.batch;
838
839     intel_batchbuffer_flush(batch);             //run the pipeline
840
841     return VA_STATUS_SUCCESS;
842 }
843
844
845 static VAStatus
846 gen8_mfc_stop(VADriverContextP ctx, 
847               struct encode_state *encode_state,
848               struct intel_encoder_context *encoder_context,
849               int *encoded_bits_size)
850 {
851     VAStatus vaStatus = VA_STATUS_ERROR_UNKNOWN;
852     VAEncPictureParameterBufferH264 *pPicParameter = (VAEncPictureParameterBufferH264 *)encode_state->pic_param_ext->buffer;
853     VACodedBufferSegment *coded_buffer_segment;
854     
855     vaStatus = i965_MapBuffer(ctx, pPicParameter->coded_buf, (void **)&coded_buffer_segment);
856     assert(vaStatus == VA_STATUS_SUCCESS);
857     *encoded_bits_size = coded_buffer_segment->size * 8;
858     i965_UnmapBuffer(ctx, pPicParameter->coded_buf);
859
860     return VA_STATUS_SUCCESS;
861 }
862
863
864 static void
865 gen8_mfc_avc_slice_state(VADriverContextP ctx,
866                          VAEncPictureParameterBufferH264 *pic_param,
867                          VAEncSliceParameterBufferH264 *slice_param,
868                          struct encode_state *encode_state,
869                          struct intel_encoder_context *encoder_context,
870                          int rate_control_enable,
871                          int qp,
872                          struct intel_batchbuffer *batch)
873 {
874     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
875     int width_in_mbs = (mfc_context->surface_state.width + 15) / 16;
876     int height_in_mbs = (mfc_context->surface_state.height + 15) / 16;
877     int beginmb = slice_param->macroblock_address;
878     int endmb = beginmb + slice_param->num_macroblocks;
879     int beginx = beginmb % width_in_mbs;
880     int beginy = beginmb / width_in_mbs;
881     int nextx =  endmb % width_in_mbs;
882     int nexty = endmb / width_in_mbs;
883     int slice_type = intel_avc_enc_slice_type_fixup(slice_param->slice_type);
884     int last_slice = (endmb == (width_in_mbs * height_in_mbs));
885     int maxQpN, maxQpP;
886     unsigned char correct[6], grow, shrink;
887     int i;
888     int weighted_pred_idc = 0;
889     unsigned int luma_log2_weight_denom = slice_param->luma_log2_weight_denom;
890     unsigned int chroma_log2_weight_denom = slice_param->chroma_log2_weight_denom;
891     int num_ref_l0 = 0, num_ref_l1 = 0;
892
893     if (batch == NULL)
894         batch = encoder_context->base.batch;
895
896     if (slice_type == SLICE_TYPE_I) {
897         luma_log2_weight_denom = 0;
898         chroma_log2_weight_denom = 0;
899     } else if (slice_type == SLICE_TYPE_P) {
900         weighted_pred_idc = pic_param->pic_fields.bits.weighted_pred_flag;
901         num_ref_l0 = pic_param->num_ref_idx_l0_active_minus1 + 1;
902
903         if (slice_param->num_ref_idx_active_override_flag)
904             num_ref_l0 = slice_param->num_ref_idx_l0_active_minus1 + 1;
905     } else if (slice_type == SLICE_TYPE_B) {
906         weighted_pred_idc = pic_param->pic_fields.bits.weighted_bipred_idc;
907         num_ref_l0 = pic_param->num_ref_idx_l0_active_minus1 + 1;
908         num_ref_l1 = pic_param->num_ref_idx_l1_active_minus1 + 1;
909
910         if (slice_param->num_ref_idx_active_override_flag) {
911             num_ref_l0 = slice_param->num_ref_idx_l0_active_minus1 + 1;
912             num_ref_l1 = slice_param->num_ref_idx_l1_active_minus1 + 1;
913         }
914
915         if (weighted_pred_idc == 2) {
916             /* 8.4.3 - Derivation process for prediction weights (8-279) */
917             luma_log2_weight_denom = 5;
918             chroma_log2_weight_denom = 5;
919         }
920     }
921
922     maxQpN = mfc_context->bit_rate_control_context[slice_type].MaxQpNegModifier;
923     maxQpP = mfc_context->bit_rate_control_context[slice_type].MaxQpPosModifier;
924
925     for (i = 0; i < 6; i++)
926         correct[i] = mfc_context->bit_rate_control_context[slice_type].Correct[i];
927
928     grow = mfc_context->bit_rate_control_context[slice_type].GrowInit + 
929         (mfc_context->bit_rate_control_context[slice_type].GrowResistance << 4);
930     shrink = mfc_context->bit_rate_control_context[slice_type].ShrinkInit + 
931         (mfc_context->bit_rate_control_context[slice_type].ShrinkResistance << 4);
932
933     BEGIN_BCS_BATCH(batch, 11);;
934
935     OUT_BCS_BATCH(batch, MFX_AVC_SLICE_STATE | (11 - 2) );
936     OUT_BCS_BATCH(batch, slice_type);                   /*Slice Type: I:P:B Slice*/
937
938     OUT_BCS_BATCH(batch,
939                   (num_ref_l0 << 16) |
940                   (num_ref_l1 << 24) |
941                   (chroma_log2_weight_denom << 8) |
942                   (luma_log2_weight_denom << 0));
943
944     OUT_BCS_BATCH(batch, 
945                   (weighted_pred_idc << 30) |
946                   (slice_param->direct_spatial_mv_pred_flag<<29) |             /*Direct Prediction Type*/
947                   (slice_param->disable_deblocking_filter_idc << 27) |
948                   (slice_param->cabac_init_idc << 24) |
949                   (qp<<16) |                    /*Slice Quantization Parameter*/
950                   ((slice_param->slice_beta_offset_div2 & 0xf) << 8) |
951                   ((slice_param->slice_alpha_c0_offset_div2 & 0xf) << 0));
952     OUT_BCS_BATCH(batch,
953                   (beginy << 24) |                      /*First MB X&Y , the begin postion of current slice*/
954                   (beginx << 16) |
955                   slice_param->macroblock_address );
956     OUT_BCS_BATCH(batch, (nexty << 16) | nextx);                       /*Next slice first MB X&Y*/
957     OUT_BCS_BATCH(batch, 
958                   (0/*rate_control_enable*/ << 31) |            /*in CBR mode RateControlCounterEnable = enable*/
959                   (1 << 30) |           /*ResetRateControlCounter*/
960                   (0 << 28) |           /*RC Triggle Mode = Always Rate Control*/
961                   (4 << 24) |     /*RC Stable Tolerance, middle level*/
962                   (0/*rate_control_enable*/ << 23) |     /*RC Panic Enable*/                 
963                   (0 << 22) |     /*QP mode, don't modfiy CBP*/
964                   (0 << 21) |     /*MB Type Direct Conversion Enabled*/ 
965                   (0 << 20) |     /*MB Type Skip Conversion Enabled*/ 
966                   (last_slice << 19) |     /*IsLastSlice*/
967                   (0 << 18) |   /*BitstreamOutputFlag Compressed BitStream Output Disable Flag 0:enable 1:disable*/
968                   (1 << 17) |       /*HeaderPresentFlag*/       
969                   (1 << 16) |       /*SliceData PresentFlag*/
970                   (1 << 15) |       /*TailPresentFlag*/
971                   (1 << 13) |       /*RBSP NAL TYPE*/   
972                   (0 << 12) );    /*CabacZeroWordInsertionEnable*/
973     OUT_BCS_BATCH(batch, mfc_context->mfc_indirect_pak_bse_object.offset);
974     OUT_BCS_BATCH(batch,
975                   (maxQpN << 24) |     /*Target QP - 24 is lowest QP*/ 
976                   (maxQpP << 16) |     /*Target QP + 20 is highest QP*/
977                   (shrink << 8)  |
978                   (grow << 0));   
979     OUT_BCS_BATCH(batch,
980                   (correct[5] << 20) |
981                   (correct[4] << 16) |
982                   (correct[3] << 12) |
983                   (correct[2] << 8) |
984                   (correct[1] << 4) |
985                   (correct[0] << 0));
986     OUT_BCS_BATCH(batch, 0);
987
988     ADVANCE_BCS_BATCH(batch);
989 }
990
991 #define    AVC_INTRA_RDO_OFFSET    4
992 #define    AVC_INTER_RDO_OFFSET    10
993 #define    AVC_INTER_MSG_OFFSET    8
994 #define    AVC_INTER_MV_OFFSET     48
995 #define    AVC_RDO_MASK            0xFFFF
996
997 static int
998 gen8_mfc_avc_pak_object_intra(VADriverContextP ctx, int x, int y, int end_mb,
999                               int qp,unsigned int *msg,
1000                               struct intel_encoder_context *encoder_context,
1001                               unsigned char target_mb_size, unsigned char max_mb_size,
1002                               struct intel_batchbuffer *batch)
1003 {
1004     int len_in_dwords = 12;
1005     unsigned int intra_msg;
1006 #define         INTRA_MSG_FLAG          (1 << 13)
1007 #define         INTRA_MBTYPE_MASK       (0x1F0000)
1008     if (batch == NULL)
1009         batch = encoder_context->base.batch;
1010
1011     BEGIN_BCS_BATCH(batch, len_in_dwords);
1012
1013     intra_msg = msg[0] & 0xC0FF;
1014     intra_msg |= INTRA_MSG_FLAG;
1015     intra_msg |= ((msg[0] & INTRA_MBTYPE_MASK) >> 8);
1016     OUT_BCS_BATCH(batch, MFC_AVC_PAK_OBJECT | (len_in_dwords - 2));
1017     OUT_BCS_BATCH(batch, 0);
1018     OUT_BCS_BATCH(batch, 0);
1019     OUT_BCS_BATCH(batch, 
1020                   (0 << 24) |           /* PackedMvNum, Debug*/
1021                   (0 << 20) |           /* No motion vector */
1022                   (1 << 19) |           /* CbpDcY */
1023                   (1 << 18) |           /* CbpDcU */
1024                   (1 << 17) |           /* CbpDcV */
1025                   intra_msg);
1026
1027     OUT_BCS_BATCH(batch, (0xFFFF << 16) | (y << 8) | x);                /* Code Block Pattern for Y*/
1028     OUT_BCS_BATCH(batch, 0x000F000F);                                                   /* Code Block Pattern */                
1029     OUT_BCS_BATCH(batch, (0 << 27) | (end_mb << 26) | qp);      /* Last MB */
1030
1031     /*Stuff for Intra MB*/
1032     OUT_BCS_BATCH(batch, msg[1]);                       /* We using Intra16x16 no 4x4 predmode*/        
1033     OUT_BCS_BATCH(batch, msg[2]);       
1034     OUT_BCS_BATCH(batch, msg[3]&0xFF);  
1035     
1036     /*MaxSizeInWord and TargetSzieInWord*/
1037     OUT_BCS_BATCH(batch, (max_mb_size << 24) |
1038                   (target_mb_size << 16) );
1039
1040     OUT_BCS_BATCH(batch, 0);
1041
1042     ADVANCE_BCS_BATCH(batch);
1043
1044     return len_in_dwords;
1045 }
1046
1047 static int
1048 gen8_mfc_avc_pak_object_inter(VADriverContextP ctx, int x, int y, int end_mb, int qp,
1049                               unsigned int *msg, unsigned int offset,
1050                               struct intel_encoder_context *encoder_context,
1051                               unsigned char target_mb_size,unsigned char max_mb_size, int slice_type,
1052                               struct intel_batchbuffer *batch)
1053 {
1054     struct gen6_vme_context *vme_context = encoder_context->vme_context;
1055     int len_in_dwords = 12;
1056     unsigned int inter_msg = 0;
1057     if (batch == NULL)
1058         batch = encoder_context->base.batch;
1059     {
1060 #define MSG_MV_OFFSET   4
1061         unsigned int *mv_ptr;
1062         mv_ptr = msg + MSG_MV_OFFSET;
1063         /* MV of VME output is based on 16 sub-blocks. So it is necessary
1064          * to convert them to be compatible with the format of AVC_PAK
1065          * command.
1066          */
1067         if ((msg[0] & INTER_MODE_MASK) == INTER_8X16) {
1068             /* MV[0] and MV[2] are replicated */
1069             mv_ptr[4] = mv_ptr[0];
1070             mv_ptr[5] = mv_ptr[1];
1071             mv_ptr[2] = mv_ptr[8];
1072             mv_ptr[3] = mv_ptr[9];
1073             mv_ptr[6] = mv_ptr[8];
1074             mv_ptr[7] = mv_ptr[9];
1075         } else if ((msg[0] & INTER_MODE_MASK) == INTER_16X8) {
1076             /* MV[0] and MV[1] are replicated */
1077             mv_ptr[2] = mv_ptr[0];
1078             mv_ptr[3] = mv_ptr[1];
1079             mv_ptr[4] = mv_ptr[16];
1080             mv_ptr[5] = mv_ptr[17];
1081             mv_ptr[6] = mv_ptr[24];
1082             mv_ptr[7] = mv_ptr[25];
1083         } else if (((msg[0] & INTER_MODE_MASK) == INTER_8X8) &&
1084                    !(msg[1] & SUBMB_SHAPE_MASK)) {
1085             /* Don't touch MV[0] or MV[1] */
1086             mv_ptr[2] = mv_ptr[8];
1087             mv_ptr[3] = mv_ptr[9];
1088             mv_ptr[4] = mv_ptr[16];
1089             mv_ptr[5] = mv_ptr[17];
1090             mv_ptr[6] = mv_ptr[24];
1091             mv_ptr[7] = mv_ptr[25];
1092         }
1093     }
1094
1095     BEGIN_BCS_BATCH(batch, len_in_dwords);
1096
1097     OUT_BCS_BATCH(batch, MFC_AVC_PAK_OBJECT | (len_in_dwords - 2));
1098
1099     inter_msg = 32;
1100     /* MV quantity */
1101     if ((msg[0] & INTER_MODE_MASK) == INTER_8X8) {
1102         if (msg[1] & SUBMB_SHAPE_MASK)
1103             inter_msg = 128;
1104     }
1105     OUT_BCS_BATCH(batch, inter_msg);         /* 32 MV*/
1106     OUT_BCS_BATCH(batch, offset);
1107     inter_msg = msg[0] & (0x1F00FFFF);
1108     inter_msg |= INTER_MV8;
1109     inter_msg |= ((1 << 19) | (1 << 18) | (1 << 17));
1110     if (((msg[0] & INTER_MODE_MASK) == INTER_8X8) &&
1111         (msg[1] & SUBMB_SHAPE_MASK)) {
1112         inter_msg |= INTER_MV32;
1113     }
1114
1115     OUT_BCS_BATCH(batch, inter_msg);
1116
1117     OUT_BCS_BATCH(batch, (0xFFFF<<16) | (y << 8) | x);        /* Code Block Pattern for Y*/
1118     OUT_BCS_BATCH(batch, 0x000F000F);                         /* Code Block Pattern */  
1119 #if 0 
1120     if ( slice_type == SLICE_TYPE_B) {
1121         OUT_BCS_BATCH(batch, (0xF<<28) | (end_mb << 26) | qp);  /* Last MB */
1122     } else {
1123         OUT_BCS_BATCH(batch, (end_mb << 26) | qp);      /* Last MB */
1124     }
1125 #else
1126     OUT_BCS_BATCH(batch, (end_mb << 26) | qp);  /* Last MB */
1127 #endif
1128
1129     inter_msg = msg[1] >> 8;
1130     /*Stuff for Inter MB*/
1131     OUT_BCS_BATCH(batch, inter_msg);        
1132     OUT_BCS_BATCH(batch, vme_context->ref_index_in_mb[0]);
1133     OUT_BCS_BATCH(batch, vme_context->ref_index_in_mb[1]);
1134
1135     /*MaxSizeInWord and TargetSzieInWord*/
1136     OUT_BCS_BATCH(batch, (max_mb_size << 24) |
1137                   (target_mb_size << 16) );
1138
1139     OUT_BCS_BATCH(batch, 0x0);    
1140
1141     ADVANCE_BCS_BATCH(batch);
1142
1143     return len_in_dwords;
1144 }
1145
1146 static void 
1147 gen8_mfc_avc_pipeline_slice_programing(VADriverContextP ctx,
1148                                        struct encode_state *encode_state,
1149                                        struct intel_encoder_context *encoder_context,
1150                                        int slice_index,
1151                                        struct intel_batchbuffer *slice_batch)
1152 {
1153     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
1154     struct gen6_vme_context *vme_context = encoder_context->vme_context;
1155     VAEncSequenceParameterBufferH264 *pSequenceParameter = (VAEncSequenceParameterBufferH264 *)encode_state->seq_param_ext->buffer;
1156     VAEncPictureParameterBufferH264 *pPicParameter = (VAEncPictureParameterBufferH264 *)encode_state->pic_param_ext->buffer;
1157     VAEncSliceParameterBufferH264 *pSliceParameter = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[slice_index]->buffer; 
1158     unsigned int *msg = NULL, offset = 0;
1159     unsigned char *msg_ptr = NULL;
1160     int width_in_mbs = (mfc_context->surface_state.width + 15) / 16;
1161     int height_in_mbs = (mfc_context->surface_state.height + 15) / 16;
1162     int last_slice = (pSliceParameter->macroblock_address + pSliceParameter->num_macroblocks) == (width_in_mbs * height_in_mbs);
1163     int i,x,y;
1164     int qp = pPicParameter->pic_init_qp + pSliceParameter->slice_qp_delta;
1165     unsigned int rate_control_mode = encoder_context->rate_control_mode;
1166     unsigned int tail_data[] = { 0x0, 0x0 };
1167     int slice_type = intel_avc_enc_slice_type_fixup(pSliceParameter->slice_type);
1168     int is_intra = slice_type == SLICE_TYPE_I;
1169     int qp_slice;
1170     int qp_mb;
1171
1172     qp_slice = qp;
1173     if (rate_control_mode != VA_RC_CQP) {
1174         qp = mfc_context->brc.qp_prime_y[encoder_context->layer.curr_frame_layer_id][slice_type];
1175         if (encode_state->slice_header_index[slice_index] == 0) {
1176             pSliceParameter->slice_qp_delta = qp - pPicParameter->pic_init_qp;
1177             qp_slice = qp;
1178         }
1179     }
1180
1181     /* only support for 8-bit pixel bit-depth */
1182     assert(pSequenceParameter->bit_depth_luma_minus8 == 0);
1183     assert(pSequenceParameter->bit_depth_chroma_minus8 == 0);
1184     assert(pPicParameter->pic_init_qp >= 0 && pPicParameter->pic_init_qp < 52);
1185     assert(qp >= 0 && qp < 52);
1186
1187     gen8_mfc_avc_slice_state(ctx,
1188                              pPicParameter,
1189                              pSliceParameter,
1190                              encode_state, encoder_context,
1191                              (rate_control_mode != VA_RC_CQP), qp_slice, slice_batch);
1192
1193     if ( slice_index == 0)
1194         intel_mfc_avc_pipeline_header_programing(ctx, encode_state, encoder_context, slice_batch);
1195
1196     intel_avc_slice_insert_packed_data(ctx, encode_state, encoder_context, slice_index, slice_batch);
1197
1198     dri_bo_map(vme_context->vme_output.bo , 1);
1199     msg_ptr = (unsigned char *)vme_context->vme_output.bo->virtual;
1200
1201     if (is_intra) {
1202         msg = (unsigned int *) (msg_ptr + pSliceParameter->macroblock_address * vme_context->vme_output.size_block);
1203     } else {
1204         msg = (unsigned int *) (msg_ptr + pSliceParameter->macroblock_address * vme_context->vme_output.size_block);
1205     }
1206    
1207     for (i = pSliceParameter->macroblock_address; 
1208          i < pSliceParameter->macroblock_address + pSliceParameter->num_macroblocks; i++) {
1209         int last_mb = (i == (pSliceParameter->macroblock_address + pSliceParameter->num_macroblocks - 1) );
1210         x = i % width_in_mbs;
1211         y = i / width_in_mbs;
1212         msg = (unsigned int *) (msg_ptr + i * vme_context->vme_output.size_block);
1213         if (vme_context->roi_enabled) {
1214             qp_mb = *(vme_context->qp_per_mb + i);
1215         } else
1216             qp_mb = qp;
1217
1218         if (is_intra) {
1219             assert(msg);
1220             gen8_mfc_avc_pak_object_intra(ctx, x, y, last_mb, qp_mb, msg, encoder_context, 0, 0, slice_batch);
1221         } else {
1222             int inter_rdo, intra_rdo;
1223             inter_rdo = msg[AVC_INTER_RDO_OFFSET] & AVC_RDO_MASK;
1224             intra_rdo = msg[AVC_INTRA_RDO_OFFSET] & AVC_RDO_MASK;
1225             offset = i * vme_context->vme_output.size_block + AVC_INTER_MV_OFFSET;
1226             if (intra_rdo < inter_rdo) { 
1227                 gen8_mfc_avc_pak_object_intra(ctx, x, y, last_mb, qp_mb, msg, encoder_context, 0, 0, slice_batch);
1228             } else {
1229                 msg += AVC_INTER_MSG_OFFSET;
1230                 gen8_mfc_avc_pak_object_inter(ctx, x, y, last_mb, qp_mb, msg, offset, encoder_context, 0, 0, pSliceParameter->slice_type, slice_batch);
1231             }
1232         }
1233     }
1234    
1235     dri_bo_unmap(vme_context->vme_output.bo);
1236
1237     if ( last_slice ) {    
1238         mfc_context->insert_object(ctx, encoder_context,
1239                                    tail_data, 2, 8,
1240                                    2, 1, 1, 0, slice_batch);
1241     } else {
1242         mfc_context->insert_object(ctx, encoder_context,
1243                                    tail_data, 1, 8,
1244                                    1, 1, 1, 0, slice_batch);
1245     }
1246 }
1247
/*
 * Build the PAK slice commands on the CPU into the aux batchbuffer and
 * return its backing BO.
 *
 * Ownership: the returned dri_bo carries a reference taken here; the
 * caller must unreference it. The aux batchbuffer wrapper itself is
 * freed and detached from the context — the reference MUST be taken
 * before intel_batchbuffer_free() so the BO survives.
 */
static dri_bo *
gen8_mfc_avc_software_batchbuffer(VADriverContextP ctx,
                                  struct encode_state *encode_state,
                                  struct intel_encoder_context *encoder_context)
{
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
    struct intel_batchbuffer *batch;
    dri_bo *batch_bo;
    int i;

    batch = mfc_context->aux_batchbuffer;
    batch_bo = batch->buffer;
    /* One pass per slice parameter buffer */
    for (i = 0; i < encode_state->num_slice_params_ext; i++) {
        gen8_mfc_avc_pipeline_slice_programing(ctx, encode_state, encoder_context, i, batch);
    }

    intel_batchbuffer_align(batch, 8);
    
    /* Terminate the second-level batch */
    BEGIN_BCS_BATCH(batch, 2);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, MI_BATCH_BUFFER_END);
    ADVANCE_BCS_BATCH(batch);

    /* Keep the BO alive past the batchbuffer teardown */
    dri_bo_reference(batch_bo);
    intel_batchbuffer_free(batch);
    mfc_context->aux_batchbuffer = NULL;

    return batch_bo;
}
1277
1278
1279 static void
1280 gen8_mfc_batchbuffer_surfaces_input(VADriverContextP ctx,
1281                                     struct encode_state *encode_state,
1282                                     struct intel_encoder_context *encoder_context)
1283 {
1284     struct gen6_vme_context *vme_context = encoder_context->vme_context;
1285     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
1286
1287     assert(vme_context->vme_output.bo);
1288     mfc_context->buffer_suface_setup(ctx,
1289                                      &mfc_context->gpe_context,
1290                                      &vme_context->vme_output,
1291                                      BINDING_TABLE_OFFSET(BIND_IDX_VME_OUTPUT),
1292                                      SURFACE_STATE_OFFSET(BIND_IDX_VME_OUTPUT));
1293 }
1294
1295 static void
1296 gen8_mfc_batchbuffer_surfaces_output(VADriverContextP ctx,
1297                                      struct encode_state *encode_state,
1298                                      struct intel_encoder_context *encoder_context)
1299 {
1300     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
1301     assert(mfc_context->aux_batchbuffer_surface.bo);
1302     mfc_context->buffer_suface_setup(ctx,
1303                                      &mfc_context->gpe_context,
1304                                      &mfc_context->aux_batchbuffer_surface,
1305                                      BINDING_TABLE_OFFSET(BIND_IDX_MFC_BATCHBUFFER),
1306                                      SURFACE_STATE_OFFSET(BIND_IDX_MFC_BATCHBUFFER));
1307 }
1308
/* Set up both surfaces used by the MFC batchbuffer-generation kernel:
 * the VME output (kernel input) and the aux batchbuffer (kernel output). */
static void
gen8_mfc_batchbuffer_surfaces_setup(VADriverContextP ctx, 
                                    struct encode_state *encode_state,
                                    struct intel_encoder_context *encoder_context)
{
    gen8_mfc_batchbuffer_surfaces_input(ctx, encode_state, encoder_context);
    gen8_mfc_batchbuffer_surfaces_output(ctx, encode_state, encoder_context);
}
1317
1318 static void
1319 gen8_mfc_batchbuffer_idrt_setup(VADriverContextP ctx, 
1320                                 struct encode_state *encode_state,
1321                                 struct intel_encoder_context *encoder_context)
1322 {
1323     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
1324     struct gen8_interface_descriptor_data *desc;
1325     int i;
1326     dri_bo *bo;
1327     unsigned char *desc_ptr;
1328
1329     bo = mfc_context->gpe_context.idrt.bo;
1330     dri_bo_map(bo, 1);
1331     assert(bo->virtual);
1332     desc_ptr = (unsigned char *)bo->virtual + mfc_context->gpe_context.idrt.offset;
1333
1334     desc = (struct gen8_interface_descriptor_data *)desc_ptr;
1335
1336     for (i = 0; i < mfc_context->gpe_context.num_kernels; i++) {
1337         struct i965_kernel *kernel;
1338         kernel = &mfc_context->gpe_context.kernels[i];
1339         assert(sizeof(*desc) == 32);
1340         /*Setup the descritor table*/
1341         memset(desc, 0, sizeof(*desc));
1342         desc->desc0.kernel_start_pointer = kernel->kernel_offset >> 6;
1343         desc->desc3.sampler_count = 0;
1344         desc->desc3.sampler_state_pointer = 0;
1345         desc->desc4.binding_table_entry_count = 1;
1346         desc->desc4.binding_table_pointer = (BINDING_TABLE_OFFSET(0) >> 5);
1347         desc->desc5.constant_urb_entry_read_offset = 0;
1348         desc->desc5.constant_urb_entry_read_length = 4;
1349
1350                 
1351         desc++;
1352     }
1353
1354     dri_bo_unmap(bo);
1355
1356     return;
1357 }
1358
1359 static void
1360 gen8_mfc_batchbuffer_constant_setup(VADriverContextP ctx, 
1361                                     struct encode_state *encode_state,
1362                                     struct intel_encoder_context *encoder_context)
1363 {
1364     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
1365     
1366     (void)mfc_context;
1367 }
1368
1369 #define AVC_PAK_LEN_IN_BYTE     48
1370 #define AVC_PAK_LEN_IN_OWORD    3
1371
1372 static void
1373 gen8_mfc_batchbuffer_emit_object_command(struct intel_batchbuffer *batch,
1374                                           uint32_t intra_flag,
1375                                           int head_offset,
1376                                           int number_mb_cmds,
1377                                           int slice_end_x,
1378                                           int slice_end_y,
1379                                           int mb_x,
1380                                           int mb_y,
1381                                           int width_in_mbs,
1382                                           int qp,
1383                                           uint32_t fwd_ref,
1384                                           uint32_t bwd_ref)
1385 {
1386     uint32_t temp_value;
1387     BEGIN_BATCH(batch, 14);
1388     
1389     OUT_BATCH(batch, CMD_MEDIA_OBJECT | (14 - 2));
1390     OUT_BATCH(batch, 0);
1391     OUT_BATCH(batch, 0);
1392     OUT_BATCH(batch, 0);
1393     OUT_BATCH(batch, 0);
1394     OUT_BATCH(batch, 0);
1395    
1396     /*inline data */
1397     OUT_BATCH(batch, head_offset / 16);
1398     OUT_BATCH(batch, (intra_flag) | (qp << 16));
1399     temp_value = (mb_x | (mb_y << 8) | (width_in_mbs << 16));
1400     OUT_BATCH(batch, temp_value);
1401
1402     OUT_BATCH(batch, number_mb_cmds);
1403
1404     OUT_BATCH(batch,
1405               ((slice_end_y << 8) | (slice_end_x)));
1406     OUT_BATCH(batch, fwd_ref);
1407     OUT_BATCH(batch, bwd_ref);
1408
1409     OUT_BATCH(batch, MI_NOOP);
1410
1411     ADVANCE_BATCH(batch);
1412 }
1413
/*
 * Emit the CMD_MEDIA_OBJECT commands that drive the PAK batchbuffer
 * kernel for one AVC slice.
 *
 * The slice is walked in groups of at most max_mb_cmds macroblocks; for
 * each group one media object is emitted whose inline data carries the
 * group start position, size, QP and reference indices.  head_offset
 * tracks where the kernel writes the generated PAK objects in the aux
 * batchbuffer and advances by AVC_PAK_LEN_IN_BYTE per macroblock.
 */
static void
gen8_mfc_avc_batchbuffer_slice_command(VADriverContextP ctx,
                                        struct intel_encoder_context *encoder_context,
                                        VAEncSliceParameterBufferH264 *slice_param,
                                        int head_offset,
                                        int qp,
                                        int last_slice)
{
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    struct gen6_vme_context *vme_context = encoder_context->vme_context;
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
    int width_in_mbs = (mfc_context->surface_state.width + 15) / 16;
    int total_mbs = slice_param->num_macroblocks;
    int slice_type = intel_avc_enc_slice_type_fixup(slice_param->slice_type);
    int number_mb_cmds = 128;
    int starting_offset = 0;
    int mb_x, mb_y;
    int last_mb, slice_end_x, slice_end_y;
    int remaining_mb = total_mbs;
    uint32_t fwd_ref , bwd_ref, mb_flag;
    char tmp_qp;
    int number_roi_mbs, max_mb_cmds, i;

    /* MB coordinates of the last macroblock of the slice */
    last_mb = slice_param->macroblock_address + total_mbs - 1;
    slice_end_x = last_mb % width_in_mbs;
    slice_end_y = last_mb / width_in_mbs;

    if (slice_type == SLICE_TYPE_I) {
        /* Intra slices carry no reference indices; flag the group intra */
        fwd_ref = 0;
        bwd_ref = 0;
        mb_flag = 1;
    } else {
        fwd_ref = vme_context->ref_index_in_mb[0];
        bwd_ref = vme_context->ref_index_in_mb[1];
        mb_flag = 0;
    }

    /* Pick a group size roughly proportional to the frame width so the
     * kernel receives reasonably sized chunks of work */
    if (width_in_mbs >= 100) {
        number_mb_cmds = width_in_mbs / 5;
    } else if (width_in_mbs >= 80) {
        number_mb_cmds = width_in_mbs / 4;
    } else if (width_in_mbs >= 60) {
        number_mb_cmds = width_in_mbs / 3;
    } else if (width_in_mbs >= 40) {
        number_mb_cmds = width_in_mbs / 2;
    } else {
        number_mb_cmds = width_in_mbs;
    }

    max_mb_cmds = number_mb_cmds;

    do {
        mb_x = (slice_param->macroblock_address + starting_offset) % width_in_mbs;
        mb_y = (slice_param->macroblock_address + starting_offset) / width_in_mbs;

        number_mb_cmds = max_mb_cmds;
        if (vme_context->roi_enabled) {
            /* With ROI the QP may differ per MB: shrink the group to the
             * run of macroblocks sharing the QP of the first one, so every
             * emitted group has a uniform QP */
            number_roi_mbs = 1;
            tmp_qp = *(vme_context->qp_per_mb + starting_offset);
            for (i = 1; i < max_mb_cmds; i++) {
                if (tmp_qp != *(vme_context->qp_per_mb + starting_offset + i))
                    break;

                number_roi_mbs++;
            }

            number_mb_cmds = number_roi_mbs;
            qp = tmp_qp;
        }

        /* Never run past the end of the slice */
        if (number_mb_cmds >= remaining_mb) {
            number_mb_cmds = remaining_mb;
        }

        gen8_mfc_batchbuffer_emit_object_command(batch,
                                                  mb_flag,
                                                  head_offset,
                                                  number_mb_cmds,
                                                  slice_end_x,
                                                  slice_end_y,
                                                  mb_x,
                                                  mb_y,
                                                  width_in_mbs,
                                                  qp,
                                                  fwd_ref,
                                                  bwd_ref);

        head_offset += (number_mb_cmds * AVC_PAK_LEN_IN_BYTE);
        remaining_mb -= number_mb_cmds;
        starting_offset += number_mb_cmds;
    } while (remaining_mb > 0);
}
1507
/*
 * Program everything needed for one AVC slice into the aux batchbuffer:
 * slice state, (for the first slice) the pipeline headers, any packed
 * data, a reserved region that the PAK batchbuffer kernel later fills
 * with per-MB PAK objects, and the aligned tail data.
 */
static void
gen8_mfc_avc_batchbuffer_slice(VADriverContextP ctx,
                                struct encode_state *encode_state,
                                struct intel_encoder_context *encoder_context,
                                int slice_index)
{
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
    struct intel_batchbuffer *slice_batch = mfc_context->aux_batchbuffer;
    VAEncSequenceParameterBufferH264 *pSequenceParameter = (VAEncSequenceParameterBufferH264 *)encode_state->seq_param_ext->buffer;
    VAEncPictureParameterBufferH264 *pPicParameter = (VAEncPictureParameterBufferH264 *)encode_state->pic_param_ext->buffer;
    VAEncSliceParameterBufferH264 *pSliceParameter = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[slice_index]->buffer;
    int width_in_mbs = (mfc_context->surface_state.width + 15) / 16;
    int height_in_mbs = (mfc_context->surface_state.height + 15) / 16;
    int last_slice = (pSliceParameter->macroblock_address + pSliceParameter->num_macroblocks) == (width_in_mbs * height_in_mbs);
    int qp = pPicParameter->pic_init_qp + pSliceParameter->slice_qp_delta;
    unsigned int rate_control_mode = encoder_context->rate_control_mode;
    unsigned int tail_data[] = { 0x0, 0x0 };
    long head_offset;
    int slice_type = intel_avc_enc_slice_type_fixup(pSliceParameter->slice_type);
    int qp_slice;

    qp_slice = qp;
    if (rate_control_mode != VA_RC_CQP) {
        /* BRC: use the QP the rate control computed for this temporal
         * layer and slice type */
        qp = mfc_context->brc.qp_prime_y[encoder_context->layer.curr_frame_layer_id][slice_type];
        if (encode_state->slice_header_index[slice_index] == 0) {
            /* Driver-generated slice header: propagate the BRC QP into
             * the slice header's QP delta */
            pSliceParameter->slice_qp_delta = qp - pPicParameter->pic_init_qp;
            qp_slice = qp;
        }
    }

    /* only support for 8-bit pixel bit-depth */
    assert(pSequenceParameter->bit_depth_luma_minus8 == 0);
    assert(pSequenceParameter->bit_depth_chroma_minus8 == 0);
    assert(pPicParameter->pic_init_qp >= 0 && pPicParameter->pic_init_qp < 52);
    assert(qp >= 0 && qp < 52);

    gen8_mfc_avc_slice_state(ctx,
                              pPicParameter,
                              pSliceParameter,
                              encode_state,
                              encoder_context,
                              (rate_control_mode != VA_RC_CQP),
                              qp_slice,
                              slice_batch);

    if (slice_index == 0)
        intel_mfc_avc_pipeline_header_programing(ctx, encode_state, encoder_context, slice_batch);

    intel_avc_slice_insert_packed_data(ctx, encode_state, encoder_context, slice_index, slice_batch);

    intel_batchbuffer_align(slice_batch, 64); /* aligned by an Cache-line */
    head_offset = intel_batchbuffer_used_size(slice_batch);

    /* Reserve room for this slice's PAK objects; the batchbuffer kernel
     * launched later writes them starting at head_offset */
    slice_batch->ptr += pSliceParameter->num_macroblocks * AVC_PAK_LEN_IN_BYTE;

    gen8_mfc_avc_batchbuffer_slice_command(ctx,
                                            encoder_context,
                                            pSliceParameter,
                                            head_offset,
                                            qp,
                                            last_slice);


    /* Aligned for tail */
    intel_batchbuffer_align(slice_batch, 64); /* aligned by Cache-line */
    if (last_slice) {
        /* Tail data for the final slice of the frame (two dwords, with
         * different flags than the per-slice case) */
        mfc_context->insert_object(ctx,
                                   encoder_context,
                                   tail_data,
                                   2,
                                   8,
                                   2,
                                   1,
                                   1,
                                   0,
                                   slice_batch);
    } else {
        /* Per-slice tail data (single dword) */
        mfc_context->insert_object(ctx,
                                   encoder_context,
                                   tail_data,
                                   1,
                                   8,
                                   1,
                                   1,
                                   1,
                                   0,
                                   slice_batch);
    }

    return;
}
1599
/*
 * Run the media (GPGPU) pipeline that produces the PAK slice
 * batchbuffer: set up the GPE pipeline, emit the per-slice media
 * objects, terminate and release the aux batchbuffer, then flush.
 */
static void
gen8_mfc_avc_batchbuffer_pipeline(VADriverContextP ctx,
                                  struct encode_state *encode_state,
                                  struct intel_encoder_context *encoder_context)
{
    struct i965_driver_data *i965 = i965_driver_data(ctx);
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    int i;

    intel_batchbuffer_start_atomic(batch, 0x4000);

    /* Gen9 has its own GPE pipeline setup/teardown pair */
    if (IS_GEN9(i965->intel.device_info))
        gen9_gpe_pipeline_setup(ctx, &mfc_context->gpe_context, batch);
    else
        gen8_gpe_pipeline_setup(ctx, &mfc_context->gpe_context, batch);

    for ( i = 0; i < encode_state->num_slice_params_ext; i++) {
        gen8_mfc_avc_batchbuffer_slice(ctx, encode_state, encoder_context, i);
    }
    {
        struct intel_batchbuffer *slice_batch = mfc_context->aux_batchbuffer;

        /* Terminate the second-level (slice) batchbuffer */
        intel_batchbuffer_align(slice_batch, 8);
        BEGIN_BCS_BATCH(slice_batch, 2);
        OUT_BCS_BATCH(slice_batch, 0);
        OUT_BCS_BATCH(slice_batch, MI_BATCH_BUFFER_END);
        ADVANCE_BCS_BATCH(slice_batch);

        BEGIN_BATCH(batch, 2);
        OUT_BATCH(batch, CMD_MEDIA_STATE_FLUSH);
        OUT_BATCH(batch, 0);
        ADVANCE_BATCH(batch);

        /* Free only the batchbuffer wrapper; the underlying BO is
         * presumably kept alive via aux_batchbuffer_surface (see
         * gen8_mfc_avc_hardware_batchbuffer) — confirm */
        intel_batchbuffer_free(slice_batch);
        mfc_context->aux_batchbuffer = NULL;
    }

    if (IS_GEN9(i965->intel.device_info))
        gen9_gpe_pipeline_end(ctx, &mfc_context->gpe_context, batch);

    intel_batchbuffer_end_atomic(batch);
    intel_batchbuffer_flush(batch);

}
1645
/*
 * Build the AVC PAK batchbuffer with the GPGPU path: set up the surface
 * states, the interface descriptor table (IDRT) and the constant buffer
 * for the batchbuffer kernel, then run the media pipeline that emits
 * the PAK commands.
 */
static void
gen8_mfc_build_avc_batchbuffer(VADriverContextP ctx,
                               struct encode_state *encode_state,
                               struct intel_encoder_context *encoder_context)
{
    gen8_mfc_batchbuffer_surfaces_setup(ctx, encode_state, encoder_context);
    gen8_mfc_batchbuffer_idrt_setup(ctx, encode_state, encoder_context);
    gen8_mfc_batchbuffer_constant_setup(ctx, encode_state, encoder_context);
    gen8_mfc_avc_batchbuffer_pipeline(ctx, encode_state, encoder_context);
}
1656
/*
 * GPU ("hardware") path for generating the slice batchbuffer.  Builds
 * the PAK commands into the aux batchbuffer surface and returns its BO
 * with an extra reference that the caller must drop (see
 * gen8_mfc_avc_pipeline_programing).
 *
 * NOTE(review): the BO is re-read from aux_batchbuffer_surface after
 * the build instead of being cached in a local — keep it that way in
 * case the build path replaces the surface BO.
 */
static dri_bo *
gen8_mfc_avc_hardware_batchbuffer(VADriverContextP ctx,
                                  struct encode_state *encode_state,
                                  struct intel_encoder_context *encoder_context)
{
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;

    dri_bo_reference(mfc_context->aux_batchbuffer_surface.bo);
    gen8_mfc_build_avc_batchbuffer(ctx, encode_state, encoder_context);

    return mfc_context->aux_batchbuffer_surface.bo;
}
1669
/*
 * Program the whole BCS (MFX) pipeline for one AVC frame: build the
 * slice batchbuffer (software or hardware path), emit the picture-level
 * MFX state, then chain into the slice batchbuffer with
 * MI_BATCH_BUFFER_START.
 */
static void
gen8_mfc_avc_pipeline_programing(VADriverContextP ctx,
                                 struct encode_state *encode_state,
                                 struct intel_encoder_context *encoder_context)
{
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    dri_bo *slice_batch_bo;

    /* Field/interlaced encoding is not implemented */
    if ( intel_mfc_interlace_check(ctx, encode_state, encoder_context) ) {
        fprintf(stderr, "Current VA driver don't support interlace mode!\n");
        assert(0);
        return;
    }

    if (encoder_context->soft_batch_force)
        slice_batch_bo = gen8_mfc_avc_software_batchbuffer(ctx, encode_state, encoder_context);
    else
        slice_batch_bo = gen8_mfc_avc_hardware_batchbuffer(ctx, encode_state, encoder_context);


    // begin programing
    intel_batchbuffer_start_atomic_bcs(batch, 0x4000);
    intel_batchbuffer_emit_mi_flush(batch);

    // picture level programing
    gen8_mfc_avc_pipeline_picture_programing(ctx, encode_state, encoder_context);

    /* Jump into the slice batchbuffer.  NOTE(review): (1 << 8) and
     * (1 << 0) look like the second-level-batch and address-space bits
     * of MI_BATCH_BUFFER_START — confirm against the PRM */
    BEGIN_BCS_BATCH(batch, 3);
    OUT_BCS_BATCH(batch, MI_BATCH_BUFFER_START | (1 << 8) | (1 << 0));
    OUT_BCS_RELOC64(batch,
                  slice_batch_bo,
                  I915_GEM_DOMAIN_COMMAND, 0,
                  0);
    ADVANCE_BCS_BATCH(batch);

    // end programing
    intel_batchbuffer_end_atomic(batch);

    /* Drop the reference taken by the batchbuffer-builder */
    dri_bo_unreference(slice_batch_bo);
}
1710
1711
/*
 * AVC encode entry point for one frame.
 *
 * Under CQP the init/prepare/PAK sequence runs exactly once.  Under
 * CBR/VBR the coded frame size is fed back to the BRC after each PAK
 * run and the frame is re-encoded until no HRD violation remains; an
 * overflow at minimum QP (or underflow at maximum QP) cannot be
 * repaired, so it is reported once per context and the frame accepted.
 */
static VAStatus
gen8_mfc_avc_encode_picture(VADriverContextP ctx,
                            struct encode_state *encode_state,
                            struct intel_encoder_context *encoder_context)
{
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
    unsigned int rate_control_mode = encoder_context->rate_control_mode;
    int current_frame_bits_size;
    int sts;

    for (;;) {
        gen8_mfc_init(ctx, encode_state, encoder_context);
        intel_mfc_avc_prepare(ctx, encode_state, encoder_context);
        /*Programing bcs pipeline*/
        gen8_mfc_avc_pipeline_programing(ctx, encode_state, encoder_context);   //filling the pipeline
        gen8_mfc_run(ctx, encode_state, encoder_context);
        if (rate_control_mode == VA_RC_CBR || rate_control_mode == VA_RC_VBR) {
            /* presumably reads back the coded frame size after the PAK
             * run completes — see gen8_mfc_stop */
            gen8_mfc_stop(ctx, encode_state, encoder_context, &current_frame_bits_size);
            sts = intel_mfc_brc_postpack(encode_state, encoder_context, current_frame_bits_size);
            if (sts == BRC_NO_HRD_VIOLATION) {
                intel_mfc_hrd_context_update(encode_state, mfc_context);
                break;
            }
            else if (sts == BRC_OVERFLOW_WITH_MIN_QP || sts == BRC_UNDERFLOW_WITH_MAX_QP) {
                /* Unrepairable: warn only once per context, keep the frame */
                if (!mfc_context->hrd.violation_noted) {
                    fprintf(stderr, "Unrepairable %s!\n", (sts == BRC_OVERFLOW_WITH_MIN_QP)? "overflow": "underflow");
                    mfc_context->hrd.violation_noted = 1;
                }
                return VA_STATUS_SUCCESS;
            }
        } else {
            break;
        }
    }

    return VA_STATUS_SUCCESS;
}
1749
1750 /*
1751  * MPEG-2
1752  */
1753
/* Map the VA MPEG-2 picture_type index (0 = I, 1 = P, 2 = B) to the
 * picture coding type value packed into MFX_MPEG2_PIC_STATE */
static const int
va_to_gen8_mpeg2_picture_type[3] = {
    1,  /* I */
    2,  /* P */
    3   /* B */
};
1760
/*
 * Emit MFX_MPEG2_PIC_STATE, translating the VA MPEG-2 picture
 * parameters (f_codes, picture coding extension bits, picture type,
 * frame dimensions in MBs) into the 13-dword command.
 */
static void
gen8_mfc_mpeg2_pic_state(VADriverContextP ctx,
                         struct intel_encoder_context *encoder_context,
                         struct encode_state *encode_state)
{
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
    VAEncPictureParameterBufferMPEG2 *pic_param;
    int width_in_mbs = (mfc_context->surface_state.width + 15) / 16;
    int height_in_mbs = (mfc_context->surface_state.height + 15) / 16;
    VAEncSliceParameterBufferMPEG2 *slice_param = NULL;

    assert(encode_state->pic_param_ext && encode_state->pic_param_ext->buffer);
    pic_param = (VAEncPictureParameterBufferMPEG2 *)encode_state->pic_param_ext->buffer;
    /* only the first slice's quantiser_scale_code is inspected below */
    slice_param = (VAEncSliceParameterBufferMPEG2 *)encode_state->slice_params_ext[0]->buffer;

    BEGIN_BCS_BATCH(batch, 13);
    OUT_BCS_BATCH(batch, MFX_MPEG2_PIC_STATE | (13 - 2));
    OUT_BCS_BATCH(batch,
                  (pic_param->f_code[1][1] & 0xf) << 28 | /* f_code[1][1] */
                  (pic_param->f_code[1][0] & 0xf) << 24 | /* f_code[1][0] */
                  (pic_param->f_code[0][1] & 0xf) << 20 | /* f_code[0][1] */
                  (pic_param->f_code[0][0] & 0xf) << 16 | /* f_code[0][0] */
                  pic_param->picture_coding_extension.bits.intra_dc_precision << 14 |
                  pic_param->picture_coding_extension.bits.picture_structure << 12 |
                  pic_param->picture_coding_extension.bits.top_field_first << 11 |
                  pic_param->picture_coding_extension.bits.frame_pred_frame_dct << 10 |
                  pic_param->picture_coding_extension.bits.concealment_motion_vectors << 9 |
                  pic_param->picture_coding_extension.bits.q_scale_type << 8 |
                  pic_param->picture_coding_extension.bits.intra_vlc_format << 7 |
                  pic_param->picture_coding_extension.bits.alternate_scan << 6);
    OUT_BCS_BATCH(batch,
                  0 << 14 |     /* LoadSlicePointerFlag, 0 means only loading bitstream pointer once */
                  va_to_gen8_mpeg2_picture_type[pic_param->picture_type] << 9 |
                  0);
    OUT_BCS_BATCH(batch,
                  1 << 31 |     /* slice concealment */
                  (height_in_mbs - 1) << 16 |
                  (width_in_mbs - 1));

    /* NOTE(review): magic bit pattern for high quantiser scale codes —
     * presumably tuning thresholds for the PAK; confirm against the PRM */
    if (slice_param && slice_param->quantiser_scale_code >= 14)
        OUT_BCS_BATCH(batch, (3 << 1) | (1 << 4) | (5 << 8) | (1 << 12));
    else
        OUT_BCS_BATCH(batch, 0);

    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch,
                  0xFFF << 16 | /* InterMBMaxSize */
                  0xFFF << 0 |  /* IntraMBMaxSize */
                  0);
    /* Remaining dwords left zero */
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    ADVANCE_BCS_BATCH(batch);
}
1819
1820 static void
1821 gen8_mfc_mpeg2_qm_state(VADriverContextP ctx, struct intel_encoder_context *encoder_context)
1822 {
1823     unsigned char intra_qm[64] = {
1824         8, 16, 19, 22, 26, 27, 29, 34,
1825         16, 16, 22, 24, 27, 29, 34, 37,
1826         19, 22, 26, 27, 29, 34, 34, 38,
1827         22, 22, 26, 27, 29, 34, 37, 40,
1828         22, 26, 27, 29, 32, 35, 40, 48,
1829         26, 27, 29, 32, 35, 40, 48, 58,
1830         26, 27, 29, 34, 38, 46, 56, 69,
1831         27, 29, 35, 38, 46, 56, 69, 83
1832     };
1833
1834     unsigned char non_intra_qm[64] = {
1835         16, 16, 16, 16, 16, 16, 16, 16,
1836         16, 16, 16, 16, 16, 16, 16, 16,
1837         16, 16, 16, 16, 16, 16, 16, 16,
1838         16, 16, 16, 16, 16, 16, 16, 16,
1839         16, 16, 16, 16, 16, 16, 16, 16,
1840         16, 16, 16, 16, 16, 16, 16, 16,
1841         16, 16, 16, 16, 16, 16, 16, 16,
1842         16, 16, 16, 16, 16, 16, 16, 16
1843     };
1844
1845     gen8_mfc_qm_state(ctx, MFX_QM_MPEG_INTRA_QUANTIZER_MATRIX, (unsigned int *)intra_qm, 16, encoder_context);
1846     gen8_mfc_qm_state(ctx, MFX_QM_MPEG_NON_INTRA_QUANTIZER_MATRIX, (unsigned int *)non_intra_qm, 16,encoder_context);
1847 }
1848
1849 static void
1850 gen8_mfc_mpeg2_fqm_state(VADriverContextP ctx, struct intel_encoder_context *encoder_context)
1851 {
1852     unsigned short intra_fqm[64] = {
1853         65536/0x8, 65536/0x10, 65536/0x13, 65536/0x16, 65536/0x16, 65536/0x1a, 65536/0x1a, 65536/0x1b,
1854         65536/0x10, 65536/0x10, 65536/0x16, 65536/0x16, 65536/0x1a, 65536/0x1b, 65536/0x1b, 65536/0x1d,
1855         65536/0x13, 65536/0x16, 65536/0x1a, 65536/0x1a, 65536/0x1b, 65536/0x1d, 65536/0x1d, 65536/0x23,
1856         65536/0x16, 65536/0x18, 65536/0x1b, 65536/0x1b, 65536/0x13, 65536/0x20, 65536/0x22, 65536/0x26,
1857         65536/0x1a, 65536/0x1b, 65536/0x13, 65536/0x13, 65536/0x20, 65536/0x23, 65536/0x26, 65536/0x2e,
1858         65536/0x1b, 65536/0x1d, 65536/0x22, 65536/0x22, 65536/0x23, 65536/0x28, 65536/0x2e, 65536/0x38,
1859         65536/0x1d, 65536/0x22, 65536/0x22, 65536/0x25, 65536/0x28, 65536/0x30, 65536/0x38, 65536/0x45,
1860         65536/0x22, 65536/0x25, 65536/0x26, 65536/0x28, 65536/0x30, 65536/0x3a, 65536/0x45, 65536/0x53,
1861     };
1862
1863     unsigned short non_intra_fqm[64] = {
1864         0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000,
1865         0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000,
1866         0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000,
1867         0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000,
1868         0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000,
1869         0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000,
1870         0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000,
1871         0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000,
1872     };
1873
1874     gen8_mfc_fqm_state(ctx, MFX_QM_MPEG_INTRA_QUANTIZER_MATRIX, (unsigned int *)intra_fqm, 32, encoder_context);
1875     gen8_mfc_fqm_state(ctx, MFX_QM_MPEG_NON_INTRA_QUANTIZER_MATRIX, (unsigned int *)non_intra_fqm, 32, encoder_context);
1876 }
1877
/*
 * Emit MFC_MPEG2_SLICEGROUP_STATE for a slice group that starts at MB
 * (x, y); (next_x, next_y) identifies the following group's start per
 * the command layout below.
 */
static void
gen8_mfc_mpeg2_slicegroup_state(VADriverContextP ctx,
                                struct intel_encoder_context *encoder_context,
                                int x, int y,
                                int next_x, int next_y,
                                int is_fisrt_slice_group,
                                int is_last_slice_group,
                                int intra_slice,
                                int qp,
                                struct intel_batchbuffer *batch)
{
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;

    if (batch == NULL)
        batch = encoder_context->base.batch;

    BEGIN_BCS_BATCH(batch, 8);

    OUT_BCS_BATCH(batch, MFC_MPEG2_SLICEGROUP_STATE | (8 - 2));
    OUT_BCS_BATCH(batch,
                  0 << 31 |                             /* MbRateCtrlFlag */
                  !!is_last_slice_group << 19 |         /* IsLastSliceGrp */
                  1 << 17 |                             /* Insert Header before the first slice group data */
                  1 << 16 |                             /* SliceData PresentFlag: always 1 */
                  1 << 15 |                             /* TailPresentFlag: always 1 */
                  0 << 14 |                             /* FirstSliceHdrDisabled: slice header for each slice */
                  !!intra_slice << 13 |                 /* IntraSlice */
                  !!intra_slice << 12 |                 /* IntraSliceFlag */
                  0);
    OUT_BCS_BATCH(batch,
                  next_y << 24 |
                  next_x << 16 |
                  y << 8 |
                  x << 0 |
                  0);
    OUT_BCS_BATCH(batch, qp);   /* FIXME: SliceGroupQp */
    /* bitstream pointer is only loaded once for the first slice of a frame when
     * LoadSlicePointerFlag is 0
     */
    OUT_BCS_BATCH(batch, mfc_context->mfc_indirect_pak_bse_object.offset);
    OUT_BCS_BATCH(batch, 0);    /* FIXME: */
    OUT_BCS_BATCH(batch, 0);    /* FIXME: CorrectPoints */
    OUT_BCS_BATCH(batch, 0);    /* FIXME: CVxxx */

    ADVANCE_BCS_BATCH(batch);
}
1924
/*
 * Emit one MFC_MPEG2_PAK_OBJECT for an intra-coded macroblock (all four
 * motion vector dwords are zero) and return the command length in
 * dwords so the caller can track the batch usage.
 */
static int
gen8_mfc_mpeg2_pak_object_intra(VADriverContextP ctx,
                                struct intel_encoder_context *encoder_context,
                                int x, int y,
                                int first_mb_in_slice,
                                int last_mb_in_slice,
                                int first_mb_in_slice_group,
                                int last_mb_in_slice_group,
                                int mb_type,
                                int qp_scale_code,
                                int coded_block_pattern,
                                unsigned char target_size_in_word,
                                unsigned char max_size_in_word,
                                struct intel_batchbuffer *batch)
{
    int len_in_dwords = 9;

    if (batch == NULL)
        batch = encoder_context->base.batch;

    BEGIN_BCS_BATCH(batch, len_in_dwords);

    OUT_BCS_BATCH(batch, MFC_MPEG2_PAK_OBJECT | (len_in_dwords - 2));
    OUT_BCS_BATCH(batch,
                  0 << 24 |     /* PackedMvNum */
                  0 << 20 |     /* MvFormat */
                  7 << 17 |     /* CbpDcY/CbpDcU/CbpDcV */
                  0 << 15 |     /* TransformFlag: frame DCT */
                  0 << 14 |     /* FieldMbFlag */
                  1 << 13 |     /* IntraMbFlag */
                  mb_type << 8 |   /* MbType: Intra */
                  0 << 2 |      /* SkipMbFlag */
                  0 << 0 |      /* InterMbMode */
                  0);
    OUT_BCS_BATCH(batch, y << 16 | x);  /* MB position in MB units */
    OUT_BCS_BATCH(batch,
                  max_size_in_word << 24 |
                  target_size_in_word << 16 |
                  coded_block_pattern << 6 |      /* CBP */
                  0);
    OUT_BCS_BATCH(batch,
                  last_mb_in_slice << 31 |
                  first_mb_in_slice << 30 |
                  0 << 27 |     /* EnableCoeffClamp */
                  last_mb_in_slice_group << 26 |
                  0 << 25 |     /* MbSkipConvDisable */
                  first_mb_in_slice_group << 24 |
                  0 << 16 |     /* MvFieldSelect */
                  qp_scale_code << 0 |
                  0);
    OUT_BCS_BATCH(batch, 0);    /* MV[0][0] */
    OUT_BCS_BATCH(batch, 0);    /* MV[1][0] */
    OUT_BCS_BATCH(batch, 0);    /* MV[0][1] */
    OUT_BCS_BATCH(batch, 0);    /* MV[1][1] */

    ADVANCE_BCS_BATCH(batch);

    return len_in_dwords;
}
1984
1985 /* Byte offset */
1986 #define MPEG2_INTER_MV_OFFSET   48 
1987
/* Motion vector clamping ranges indexed by f_code */
static struct _mv_ranges
{
    int low;    /* in the unit of 1/2 pixel */
    int high;   /* in the unit of 1/2 pixel */
} mv_ranges[] = {
    {0, 0},
    {-16, 15},
    {-32, 31},
    {-64, 63},
    {-128, 127},
    {-256, 255},
    {-512, 511},
    {-1024, 1023},
    {-2048, 2047},
    {-4096, 4095}
};

/*
 * Sanitize a motion vector component (half-pel units) for the
 * macroblock at position `pos` (MB units) along an axis of size
 * `display_max` pixels: vectors that would reference samples outside
 * the picture are zeroed, then the result is clamped to the range
 * representable with the given f_code (valid values 1..9; any other
 * f_code skips the clamp).
 */
static int
mpeg2_motion_vector(int mv, int pos, int display_max, int f_code)
{
    const int mb_start = pos * 16 * 2;         /* MB leading edge, half-pel */
    const int mb_end = (pos + 1) * 16 * 2;     /* MB trailing edge, half-pel */

    /* Drop vectors pointing outside the picture */
    if (mv + mb_start < 0 || mv + mb_end > display_max * 2)
        mv = 0;

    if (f_code > 0 && f_code < 10) {
        const struct _mv_ranges *range = &mv_ranges[f_code];

        if (mv < range->low)
            mv = range->low;
        else if (mv > range->high)
            mv = range->high;
    }

    return mv;
}
2022
2023 static int
2024 gen8_mfc_mpeg2_pak_object_inter(VADriverContextP ctx,
2025                                 struct encode_state *encode_state,
2026                                 struct intel_encoder_context *encoder_context,
2027                                 unsigned int *msg,
2028                                 int width_in_mbs, int height_in_mbs,
2029                                 int x, int y,
2030                                 int first_mb_in_slice,
2031                                 int last_mb_in_slice,
2032                                 int first_mb_in_slice_group,
2033                                 int last_mb_in_slice_group,
2034                                 int qp_scale_code,
2035                                 unsigned char target_size_in_word,
2036                                 unsigned char max_size_in_word,
2037                                 struct intel_batchbuffer *batch)
2038 {
2039     VAEncPictureParameterBufferMPEG2 *pic_param = (VAEncPictureParameterBufferMPEG2 *)encode_state->pic_param_ext->buffer;
2040     int len_in_dwords = 9;
2041     short *mvptr, mvx0, mvy0, mvx1, mvy1;
2042     
2043     if (batch == NULL)
2044         batch = encoder_context->base.batch;
2045
2046     mvptr = (short *)((unsigned char *)msg + MPEG2_INTER_MV_OFFSET);;
2047     mvx0 = mpeg2_motion_vector(mvptr[0] / 2, x, width_in_mbs * 16, pic_param->f_code[0][0]);
2048     mvy0 = mpeg2_motion_vector(mvptr[1] / 2, y, height_in_mbs * 16, pic_param->f_code[0][0]);
2049     mvx1 = mpeg2_motion_vector(mvptr[2] / 2, x, width_in_mbs * 16, pic_param->f_code[1][0]);
2050     mvy1 = mpeg2_motion_vector(mvptr[3] / 2, y, height_in_mbs * 16, pic_param->f_code[1][0]);
2051
2052     BEGIN_BCS_BATCH(batch, len_in_dwords);
2053
2054     OUT_BCS_BATCH(batch, MFC_MPEG2_PAK_OBJECT | (len_in_dwords - 2));
2055     OUT_BCS_BATCH(batch,
2056                   2 << 24 |     /* PackedMvNum */
2057                   7 << 20 |     /* MvFormat */
2058                   7 << 17 |     /* CbpDcY/CbpDcU/CbpDcV */
2059                   0 << 15 |     /* TransformFlag: frame DCT */
2060                   0 << 14 |     /* FieldMbFlag */
2061                   0 << 13 |     /* IntraMbFlag */
2062                   1 << 8 |      /* MbType: Frame-based */
2063                   0 << 2 |      /* SkipMbFlag */
2064                   0 << 0 |      /* InterMbMode */
2065                   0);
2066     OUT_BCS_BATCH(batch, y << 16 | x);
2067     OUT_BCS_BATCH(batch,
2068                   max_size_in_word << 24 |
2069                   target_size_in_word << 16 |
2070                   0x3f << 6 |   /* CBP */
2071                   0);
2072     OUT_BCS_BATCH(batch,
2073                   last_mb_in_slice << 31 |
2074                   first_mb_in_slice << 30 |
2075                   0 << 27 |     /* EnableCoeffClamp */
2076                   last_mb_in_slice_group << 26 |
2077                   0 << 25 |     /* MbSkipConvDisable */
2078                   first_mb_in_slice_group << 24 |
2079                   0 << 16 |     /* MvFieldSelect */
2080                   qp_scale_code << 0 |
2081                   0);
2082
2083     OUT_BCS_BATCH(batch, (mvx0 & 0xFFFF) | mvy0 << 16);    /* MV[0][0] */
2084     OUT_BCS_BATCH(batch, (mvx1 & 0xFFFF) | mvy1 << 16);    /* MV[1][0] */
2085     OUT_BCS_BATCH(batch, 0);    /* MV[0][1] */
2086     OUT_BCS_BATCH(batch, 0);    /* MV[1][1] */
2087
2088     ADVANCE_BCS_BATCH(batch);
2089
2090     return len_in_dwords;
2091 }
2092
2093 static void
2094 intel_mfc_mpeg2_pipeline_header_programing(VADriverContextP ctx,
2095                                            struct encode_state *encode_state,
2096                                            struct intel_encoder_context *encoder_context,
2097                                            struct intel_batchbuffer *slice_batch)
2098 {
2099     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
2100     int idx = va_enc_packed_type_to_idx(VAEncPackedHeaderMPEG2_SPS);
2101
2102     if (encode_state->packed_header_data[idx]) {
2103         VAEncPackedHeaderParameterBuffer *param = NULL;
2104         unsigned int *header_data = (unsigned int *)encode_state->packed_header_data[idx]->buffer;
2105         unsigned int length_in_bits;
2106
2107         assert(encode_state->packed_header_param[idx]);
2108         param = (VAEncPackedHeaderParameterBuffer *)encode_state->packed_header_param[idx]->buffer;
2109         length_in_bits = param->bit_length;
2110
2111         mfc_context->insert_object(ctx,
2112                                    encoder_context,
2113                                    header_data,
2114                                    ALIGN(length_in_bits, 32) >> 5,
2115                                    length_in_bits & 0x1f,
2116                                    5,   /* FIXME: check it */
2117                                    0,
2118                                    0,
2119                                    0,   /* Needn't insert emulation bytes for MPEG-2 */
2120                                    slice_batch);
2121     }
2122
2123     idx = va_enc_packed_type_to_idx(VAEncPackedHeaderMPEG2_PPS);
2124
2125     if (encode_state->packed_header_data[idx]) {
2126         VAEncPackedHeaderParameterBuffer *param = NULL;
2127         unsigned int *header_data = (unsigned int *)encode_state->packed_header_data[idx]->buffer;
2128         unsigned int length_in_bits;
2129
2130         assert(encode_state->packed_header_param[idx]);
2131         param = (VAEncPackedHeaderParameterBuffer *)encode_state->packed_header_param[idx]->buffer;
2132         length_in_bits = param->bit_length;
2133
2134         mfc_context->insert_object(ctx,
2135                                    encoder_context,
2136                                    header_data,
2137                                    ALIGN(length_in_bits, 32) >> 5,
2138                                    length_in_bits & 0x1f,
2139                                    5,   /* FIXME: check it */
2140                                    0,
2141                                    0,
2142                                    0,   /* Needn't insert emulation bytes for MPEG-2 */
2143                                    slice_batch);
2144     }
2145 }
2146
/*
 * Program one MPEG-2 slice group into the software slice batch.
 *
 * Emits the slice-group state, the packed SPS/PPS headers (first group
 * only), a leading '00' delimiter, then one PAK object per macroblock
 * chosen from the VME output (intra, or intra/inter by comparing RDO
 * costs), and finally a section or picture tail delimiter.
 *
 * next_slice_group_param is the following group's slice parameter, or
 * NULL for the last group of the picture (which selects the picture
 * tail delimiter path below).
 */
static void 
gen8_mfc_mpeg2_pipeline_slice_group(VADriverContextP ctx,
                                    struct encode_state *encode_state,
                                    struct intel_encoder_context *encoder_context,
                                    int slice_index,
                                    VAEncSliceParameterBufferMPEG2 *next_slice_group_param,
                                    struct intel_batchbuffer *slice_batch)
{
    struct gen6_vme_context *vme_context = encoder_context->vme_context;
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
    VAEncSequenceParameterBufferMPEG2 *seq_param = (VAEncSequenceParameterBufferMPEG2 *)encode_state->seq_param_ext->buffer;
    VAEncSliceParameterBufferMPEG2 *slice_param = NULL;
    /* MPEG-2 end-of-picture delimiter bytes, zero-padded to 8. */
    unsigned char tail_delimiter[] = {MPEG2_DELIMITER0, MPEG2_DELIMITER1, MPEG2_DELIMITER2, MPEG2_DELIMITER3, MPEG2_DELIMITER4, 0, 0, 0};
    unsigned char section_delimiter[] = {0x0, 0x0, 0x0, 0x0};
    int width_in_mbs = ALIGN(seq_param->picture_width, 16) / 16;
    int height_in_mbs = ALIGN(seq_param->picture_height, 16) / 16;
    int i, j;
    int h_start_pos, v_start_pos, h_next_start_pos, v_next_start_pos;
    unsigned int *msg = NULL;
    unsigned char *msg_ptr = NULL;

    /* Convert the linear macroblock address into (x, y) MB coordinates. */
    slice_param = (VAEncSliceParameterBufferMPEG2 *)encode_state->slice_params_ext[slice_index]->buffer;
    h_start_pos = slice_param->macroblock_address % width_in_mbs;
    v_start_pos = slice_param->macroblock_address / width_in_mbs;
    /* An MPEG-2 slice must not wrap past the end of a macroblock row. */
    assert(h_start_pos + slice_param->num_macroblocks <= width_in_mbs);

    /* Map the VME output so per-MB decision data can be read below. */
    dri_bo_map(vme_context->vme_output.bo , 0);
    msg_ptr = (unsigned char *)vme_context->vme_output.bo->virtual;

    if (next_slice_group_param) {
        h_next_start_pos = next_slice_group_param->macroblock_address % width_in_mbs;
        v_next_start_pos = next_slice_group_param->macroblock_address / width_in_mbs;
    } else {
        /* Last group: the "next" position is the end of the picture. */
        h_next_start_pos = 0;
        v_next_start_pos = height_in_mbs;
    }

    gen8_mfc_mpeg2_slicegroup_state(ctx,
                                    encoder_context,
                                    h_start_pos,
                                    v_start_pos,
                                    h_next_start_pos,
                                    v_next_start_pos,
                                    slice_index == 0,
                                    next_slice_group_param == NULL,
                                    slice_param->is_intra_slice,
                                    slice_param->quantiser_scale_code,
                                    slice_batch);

    /* Packed SPS/PPS headers go in front of the first slice group only. */
    if (slice_index == 0) 
        intel_mfc_mpeg2_pipeline_header_programing(ctx, encode_state, encoder_context, slice_batch);

    /* Insert '00' to make sure the header is valid */
    mfc_context->insert_object(ctx,
                               encoder_context,
                               (unsigned int*)section_delimiter,
                               1,
                               8,   /* 8bits in the last DWORD */
                               1,   /* 1 byte */
                               1,
                               0,
                               0,
                               slice_batch);

    for (i = 0; i < encode_state->slice_params_ext[slice_index]->num_elements; i++) {
        /* PAK for each macroblocks */
        for (j = 0; j < slice_param->num_macroblocks; j++) {
            int h_pos = (slice_param->macroblock_address + j) % width_in_mbs;
            int v_pos = (slice_param->macroblock_address + j) / width_in_mbs;
            int first_mb_in_slice = (j == 0);
            int last_mb_in_slice = (j == slice_param->num_macroblocks - 1);
            int first_mb_in_slice_group = (i == 0 && j == 0);
            int last_mb_in_slice_group = (i == encode_state->slice_params_ext[slice_index]->num_elements - 1 &&
                                          j == slice_param->num_macroblocks - 1);

            /* VME output record for this macroblock. */
            msg = (unsigned int *)(msg_ptr + (slice_param->macroblock_address + j) * vme_context->vme_output.size_block);

            if (slice_param->is_intra_slice) {
                gen8_mfc_mpeg2_pak_object_intra(ctx,
                                                encoder_context,
                                                h_pos, v_pos,
                                                first_mb_in_slice,
                                                last_mb_in_slice,
                                                first_mb_in_slice_group,
                                                last_mb_in_slice_group,
                                                0x1a,
                                                slice_param->quantiser_scale_code,
                                                0x3f,
                                                0,
                                                0xff,
                                                slice_batch);
            } else {
                /* Inter slice: pick intra vs. inter coding per MB by
                 * comparing the RDO costs the VME kernel wrote out.
                 * NOTE(review): the AVC_*_RDO_OFFSET constants are reused
                 * here, so the MPEG-2 VME message layout presumably
                 * matches the AVC one at these offsets — confirm. */
                int inter_rdo, intra_rdo;
                inter_rdo = msg[AVC_INTER_RDO_OFFSET] & AVC_RDO_MASK;
                intra_rdo = msg[AVC_INTRA_RDO_OFFSET] & AVC_RDO_MASK;

                if (intra_rdo < inter_rdo) 
                    gen8_mfc_mpeg2_pak_object_intra(ctx,
                                                     encoder_context,
                                                     h_pos, v_pos,
                                                     first_mb_in_slice,
                                                     last_mb_in_slice,
                                                     first_mb_in_slice_group,
                                                     last_mb_in_slice_group,
                                                     0x1a,
                                                     slice_param->quantiser_scale_code,
                                                     0x3f,
                                                     0,
                                                     0xff,
                                                     slice_batch);
                else
                    gen8_mfc_mpeg2_pak_object_inter(ctx,
                                                encode_state,
                                                encoder_context,
                                                msg,
                                                width_in_mbs, height_in_mbs,
                                                h_pos, v_pos,
                                                first_mb_in_slice,
                                                last_mb_in_slice,
                                                first_mb_in_slice_group,
                                                last_mb_in_slice_group,
                                                slice_param->quantiser_scale_code,
                                                0,
                                                0xff,
                                                slice_batch);
            }
        }

        /* Advance to the next slice of this group. */
        slice_param++;
    }

    dri_bo_unmap(vme_context->vme_output.bo);

    /* tail data */
    if (next_slice_group_param == NULL) { /* end of a picture */
        mfc_context->insert_object(ctx,
                                   encoder_context,
                                   (unsigned int *)tail_delimiter,
                                   2,
                                   8,   /* 8bits in the last DWORD */
                                   5,   /* 5 bytes */
                                   1,
                                   1,
                                   0,
                                   slice_batch);
    } else {        /* end of a slice group */
        mfc_context->insert_object(ctx,
                                   encoder_context,
                                   (unsigned int *)section_delimiter,
                                   1,
                                   8,   /* 8bits in the last DWORD */
                                   1,   /* 1 byte */
                                   1,
                                   1,
                                   0,
                                   slice_batch);
    }
}
2305
2306 /* 
2307  * A batch buffer for all slices, including slice state, 
2308  * slice insert object and slice pak object commands
2309  *
2310  */
2311 static dri_bo *
2312 gen8_mfc_mpeg2_software_slice_batchbuffer(VADriverContextP ctx,
2313                                           struct encode_state *encode_state,
2314                                           struct intel_encoder_context *encoder_context)
2315 {
2316     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
2317     struct intel_batchbuffer *batch;
2318     VAEncSliceParameterBufferMPEG2 *next_slice_group_param = NULL;
2319     dri_bo *batch_bo;
2320     int i;
2321
2322     batch = mfc_context->aux_batchbuffer;
2323     batch_bo = batch->buffer;
2324
2325     for (i = 0; i < encode_state->num_slice_params_ext; i++) {
2326         if (i == encode_state->num_slice_params_ext - 1)
2327             next_slice_group_param = NULL;
2328         else
2329             next_slice_group_param = (VAEncSliceParameterBufferMPEG2 *)encode_state->slice_params_ext[i + 1]->buffer;
2330
2331         gen8_mfc_mpeg2_pipeline_slice_group(ctx, encode_state, encoder_context, i, next_slice_group_param, batch);
2332     }
2333
2334     intel_batchbuffer_align(batch, 8);
2335     
2336     BEGIN_BCS_BATCH(batch, 2);
2337     OUT_BCS_BATCH(batch, 0);
2338     OUT_BCS_BATCH(batch, MI_BATCH_BUFFER_END);
2339     ADVANCE_BCS_BATCH(batch);
2340
2341     dri_bo_reference(batch_bo);
2342     intel_batchbuffer_free(batch);
2343     mfc_context->aux_batchbuffer = NULL;
2344
2345     return batch_bo;
2346 }
2347
/*
 * Emit all picture-level MFX commands for MPEG-2 encoding, in the
 * order the hardware requires: pipe mode select, surface and buffer
 * base-address state, picture state, and the quantizer matrices
 * (inverse and forward).
 */
static void
gen8_mfc_mpeg2_pipeline_picture_programing(VADriverContextP ctx,
                                           struct encode_state *encode_state,
                                           struct intel_encoder_context *encoder_context)
{
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;

    mfc_context->pipe_mode_select(ctx, MFX_FORMAT_MPEG2, encoder_context);
    mfc_context->set_surface_state(ctx, encoder_context);
    mfc_context->ind_obj_base_addr_state(ctx, encoder_context);
    gen8_mfc_pipe_buf_addr_state(ctx, encoder_context);
    gen8_mfc_bsp_buf_base_addr_state(ctx, encoder_context);
    gen8_mfc_mpeg2_pic_state(ctx, encoder_context, encode_state);
    gen8_mfc_mpeg2_qm_state(ctx, encoder_context);
    gen8_mfc_mpeg2_fqm_state(ctx, encoder_context);
}
2364
/*
 * Assemble the full MPEG-2 encode command stream: build the software
 * slice batch first, then program the picture-level state into the
 * main BCS batch and chain to the slice batch with
 * MI_BATCH_BUFFER_START.
 */
static void
gen8_mfc_mpeg2_pipeline_programing(VADriverContextP ctx,
                                   struct encode_state *encode_state,
                                   struct intel_encoder_context *encoder_context)
{
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    dri_bo *slice_batch_bo;

    /* Built before starting the atomic section since it maps/unmaps bos. */
    slice_batch_bo = gen8_mfc_mpeg2_software_slice_batchbuffer(ctx, encode_state, encoder_context);

    // begin programing
    intel_batchbuffer_start_atomic_bcs(batch, 0x4000); 
    intel_batchbuffer_emit_mi_flush(batch);
    
    // picture level programing
    gen8_mfc_mpeg2_pipeline_picture_programing(ctx, encode_state, encoder_context);

    /* Chain to the second-level slice batch buffer.
     * NOTE(review): (1 << 8) | (1 << 0) are MI_BATCH_BUFFER_START mode
     * flags — presumably second-level/non-secure batch; confirm against
     * the PRM for this command. */
    BEGIN_BCS_BATCH(batch, 4);
    OUT_BCS_BATCH(batch, MI_BATCH_BUFFER_START | (1 << 8) | (1 << 0));
    OUT_BCS_RELOC64(batch,
                  slice_batch_bo,
                  I915_GEM_DOMAIN_COMMAND, 0, 
                  0);
    OUT_BCS_BATCH(batch, 0);
    ADVANCE_BCS_BATCH(batch);

    // end programing
    intel_batchbuffer_end_atomic(batch);

    /* The kernel holds its own reference while the batch executes. */
    dri_bo_unreference(slice_batch_bo);
}
2396
/*
 * Bind all surfaces and buffers needed for one MPEG-2 encode:
 * the reconstructed output surface, forward/backward reference
 * surfaces, the raw input surface and the coded (bitstream) buffer.
 * Each bo stored in mfc_context takes an extra reference here.
 *
 * Returns VA_STATUS_SUCCESS (no failure paths in this function).
 */
static VAStatus
intel_mfc_mpeg2_prepare(VADriverContextP ctx, 
                        struct encode_state *encode_state,
                        struct intel_encoder_context *encoder_context)
{
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
    struct object_surface *obj_surface; 
    struct object_buffer *obj_buffer;
    struct i965_coded_buffer_segment *coded_buffer_segment;
    VAStatus vaStatus = VA_STATUS_SUCCESS;
    dri_bo *bo;
    int i;

    /* reconstructed surface */
    obj_surface = encode_state->reconstructed_object;
    /* Make sure the surface has NV12 backing storage allocated. */
    i965_check_alloc_surface_bo(ctx, obj_surface, 1, VA_FOURCC_NV12, SUBSAMPLE_YUV420);
    mfc_context->pre_deblocking_output.bo = obj_surface->bo;
    dri_bo_reference(mfc_context->pre_deblocking_output.bo);
    mfc_context->surface_state.width = obj_surface->orig_width;
    mfc_context->surface_state.height = obj_surface->orig_height;
    mfc_context->surface_state.w_pitch = obj_surface->width;
    mfc_context->surface_state.h_pitch = obj_surface->height;

    /* forward reference */
    obj_surface = encode_state->reference_objects[0];

    if (obj_surface && obj_surface->bo) {
        mfc_context->reference_surfaces[0].bo = obj_surface->bo;
        dri_bo_reference(mfc_context->reference_surfaces[0].bo);
    } else
        mfc_context->reference_surfaces[0].bo = NULL;

    /* backward reference */
    obj_surface = encode_state->reference_objects[1];

    if (obj_surface && obj_surface->bo) {
        mfc_context->reference_surfaces[1].bo = obj_surface->bo;
        dri_bo_reference(mfc_context->reference_surfaces[1].bo);
    } else {
        /* No backward reference (P-frame): alias the forward one. */
        mfc_context->reference_surfaces[1].bo = mfc_context->reference_surfaces[0].bo;

        if (mfc_context->reference_surfaces[1].bo)
            dri_bo_reference(mfc_context->reference_surfaces[1].bo);
    }

    /* Fill the remaining reference slots by alternating the two real
     * references (even slots -> [0], odd slots -> [1]). */
    for (i = 2; i < ARRAY_ELEMS(mfc_context->reference_surfaces); i++) {
        mfc_context->reference_surfaces[i].bo = mfc_context->reference_surfaces[i & 1].bo;

        if (mfc_context->reference_surfaces[i].bo)
            dri_bo_reference(mfc_context->reference_surfaces[i].bo);
    }
    
    /* input YUV surface */
    obj_surface = encode_state->input_yuv_object;
    mfc_context->uncompressed_picture_source.bo = obj_surface->bo;
    dri_bo_reference(mfc_context->uncompressed_picture_source.bo);

    /* coded buffer */
    obj_buffer = encode_state->coded_buf_object;
    bo = obj_buffer->buffer_store->bo;
    mfc_context->mfc_indirect_pak_bse_object.bo = bo;
    /* Bitstream starts after the driver-private header; keep a page of
     * slack at the end of the buffer. */
    mfc_context->mfc_indirect_pak_bse_object.offset = I965_CODEDBUFFER_HEADER_SIZE;
    mfc_context->mfc_indirect_pak_bse_object.end_offset = ALIGN(obj_buffer->size_element - 0x1000, 0x1000);
    dri_bo_reference(mfc_context->mfc_indirect_pak_bse_object.bo);

    /* set the internal flag to 0 to indicate the coded size is unknown */
    dri_bo_map(bo, 1);
    coded_buffer_segment = (struct i965_coded_buffer_segment *)bo->virtual;
    coded_buffer_segment->mapped = 0;
    coded_buffer_segment->codec = encoder_context->codec;
    dri_bo_unmap(bo);

    return vaStatus;
}
2471
/*
 * Top-level MPEG-2 PAK entry point: (re)initialize the MFC context,
 * bind all surfaces/buffers for this frame, program the BCS command
 * stream and submit it to the hardware.
 */
static VAStatus
gen8_mfc_mpeg2_encode_picture(VADriverContextP ctx, 
                              struct encode_state *encode_state,
                              struct intel_encoder_context *encoder_context)
{
    gen8_mfc_init(ctx, encode_state, encoder_context);
    intel_mfc_mpeg2_prepare(ctx, encode_state, encoder_context);
    /*Programing bcs pipeline*/
    gen8_mfc_mpeg2_pipeline_programing(ctx, encode_state, encoder_context);
    gen8_mfc_run(ctx, encode_state, encoder_context);

    return VA_STATUS_SUCCESS;
}
2485
2486 /* JPEG encode methods */
2487
2488 static VAStatus
2489 intel_mfc_jpeg_prepare(VADriverContextP ctx, 
2490                         struct encode_state *encode_state,
2491                         struct intel_encoder_context *encoder_context)
2492 {
2493     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
2494     struct object_surface *obj_surface; 
2495     struct object_buffer *obj_buffer;
2496     struct i965_coded_buffer_segment *coded_buffer_segment;
2497     VAStatus vaStatus = VA_STATUS_SUCCESS;
2498     dri_bo *bo;
2499    
2500     /* input YUV surface */
2501     obj_surface = encode_state->input_yuv_object;
2502     mfc_context->uncompressed_picture_source.bo = obj_surface->bo;
2503     dri_bo_reference(mfc_context->uncompressed_picture_source.bo);
2504
2505     /* coded buffer */
2506     obj_buffer = encode_state->coded_buf_object;
2507     bo = obj_buffer->buffer_store->bo;
2508     mfc_context->mfc_indirect_pak_bse_object.bo = bo;
2509     mfc_context->mfc_indirect_pak_bse_object.offset = I965_CODEDBUFFER_HEADER_SIZE;
2510     mfc_context->mfc_indirect_pak_bse_object.end_offset = ALIGN(obj_buffer->size_element - 0x1000, 0x1000);
2511     dri_bo_reference(mfc_context->mfc_indirect_pak_bse_object.bo);
2512
2513     /* set the internal flag to 0 to indicate the coded size is unknown */
2514     dri_bo_map(bo, 1);
2515     coded_buffer_segment = (struct i965_coded_buffer_segment *)bo->virtual;
2516     coded_buffer_segment->mapped = 0;
2517     coded_buffer_segment->codec = encoder_context->codec;
2518     dri_bo_unmap(bo);
2519
2520     return vaStatus;
2521 }
2522
2523
2524 static void 
2525 gen8_mfc_jpeg_set_surface_state(VADriverContextP ctx,
2526                         struct intel_encoder_context *encoder_context,
2527                         struct encode_state *encode_state)
2528 {
2529     struct intel_batchbuffer *batch = encoder_context->base.batch;
2530     struct object_surface *obj_surface = encode_state->input_yuv_object;
2531     unsigned int input_fourcc;
2532     unsigned int y_cb_offset;
2533     unsigned int y_cr_offset;
2534     unsigned int surface_format;
2535
2536     assert(obj_surface);
2537
2538     y_cb_offset = obj_surface->y_cb_offset;
2539     y_cr_offset = obj_surface->y_cr_offset;
2540     input_fourcc = obj_surface->fourcc;
2541
2542     surface_format = (obj_surface->fourcc == VA_FOURCC_Y800) ?
2543         MFX_SURFACE_MONOCHROME : MFX_SURFACE_PLANAR_420_8;
2544         
2545         
2546      switch (input_fourcc) {
2547         case VA_FOURCC_Y800: {
2548             surface_format = MFX_SURFACE_MONOCHROME;
2549             break;
2550         }
2551         case VA_FOURCC_NV12: { 
2552             surface_format = MFX_SURFACE_PLANAR_420_8;
2553             break;
2554         }      
2555         case VA_FOURCC_UYVY: { 
2556             surface_format = MFX_SURFACE_YCRCB_SWAPY;
2557             break;
2558         }
2559         case VA_FOURCC_YUY2: { 
2560             surface_format = MFX_SURFACE_YCRCB_NORMAL;
2561             break;
2562         }
2563         case VA_FOURCC_RGBA:
2564         case VA_FOURCC_444P: {
2565             surface_format = MFX_SURFACE_R8G8B8A8_UNORM;
2566             break;
2567         }
2568     }
2569
2570     BEGIN_BCS_BATCH(batch, 6);
2571
2572     OUT_BCS_BATCH(batch, MFX_SURFACE_STATE | (6 - 2));
2573     OUT_BCS_BATCH(batch, 0);
2574     OUT_BCS_BATCH(batch,
2575                   ((obj_surface->orig_height - 1) << 18) |
2576                   ((obj_surface->orig_width - 1) << 4));
2577     OUT_BCS_BATCH(batch,
2578                   (surface_format << 28) | /* Surface Format */
2579                   (0 << 27) | /* must be 1 for interleave U/V, hardware requirement for AVC/VC1/MPEG and 0 for JPEG */
2580                   (0 << 22) | /* surface object control state, FIXME??? */
2581                   ((obj_surface->width - 1) << 3) | /* pitch */
2582                   (0 << 2)  | /* must be 0 for interleave U/V */
2583                   (1 << 1)  | /* must be tiled */
2584                   (I965_TILEWALK_YMAJOR << 0));  /* tile walk, TILEWALK_YMAJOR */
2585     OUT_BCS_BATCH(batch,
2586                   (0 << 16) | /* X offset for U(Cb), must be 0 */
2587                   (y_cb_offset << 0)); /* Y offset for U(Cb) */
2588     OUT_BCS_BATCH(batch,
2589                   (0 << 16) | /* X offset for V(Cr), must be 0 */
2590                   (y_cr_offset << 0)); /* Y offset for V(Cr), must be 0 for video codec, non-zoeo for JPEG */
2591                  
2592
2593     ADVANCE_BCS_BATCH(batch);
2594 }
2595
/*
 * Program MFX_JPEG_PIC_STATE.
 *
 * Derives, from the input surface FOURCC and the picture dimensions:
 * the input surface format, the output MCU structure, the frame size
 * in 8x8 blocks, and the number of valid pixels in the last MCU of
 * each row/column (used by the hardware to pad partial MCUs).
 */
static void
gen8_mfc_jpeg_pic_state(VADriverContextP ctx,
                        struct intel_encoder_context *encoder_context,
                        struct encode_state *encode_state)
{
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    struct object_surface *obj_surface = encode_state->input_yuv_object;
    VAEncPictureParameterBufferJPEG *pic_param;
    unsigned int  surface_format;
    unsigned int  frame_width_in_blks;
    unsigned int  frame_height_in_blks;
    unsigned int  pixels_in_horizontal_lastMCU;
    unsigned int  pixels_in_vertical_lastMCU;
    unsigned int  input_surface_format;
    unsigned int  output_mcu_format;
    unsigned int  picture_width;
    unsigned int  picture_height;  

    assert(encode_state->pic_param_ext && encode_state->pic_param_ext->buffer);
    assert(obj_surface);
    pic_param = (VAEncPictureParameterBufferJPEG *)encode_state->pic_param_ext->buffer;
    surface_format = obj_surface->fourcc;
    picture_width = pic_param->picture_width;
    picture_height = pic_param->picture_height;
    
    /* Map the input FOURCC to the JPEG input surface format and the
     * MCU (chroma subsampling) structure of the output stream.
     * Unknown formats fall back to NV12 / YUV420. */
    switch (surface_format) {
        case VA_FOURCC_Y800: {
            input_surface_format = JPEG_ENC_SURFACE_Y8; 
            output_mcu_format = JPEG_ENC_MCU_YUV400;
            break;
        }
        case VA_FOURCC_NV12: { 
            input_surface_format = JPEG_ENC_SURFACE_NV12; 
            output_mcu_format = JPEG_ENC_MCU_YUV420; 
            break;
        }      
        case VA_FOURCC_UYVY: { 
            input_surface_format = JPEG_ENC_SURFACE_UYVY; 
            output_mcu_format = JPEG_ENC_MCU_YUV422H_2Y; 
            break;
        }
        case VA_FOURCC_YUY2: { 
            input_surface_format = JPEG_ENC_SURFACE_YUY2; 
            output_mcu_format = JPEG_ENC_MCU_YUV422H_2Y; 
            break;
        }

        case VA_FOURCC_RGBA:
        case VA_FOURCC_444P: { 
            input_surface_format = JPEG_ENC_SURFACE_RGB; 
            output_mcu_format = JPEG_ENC_MCU_RGB; 
            break;
        }
        default : {
            input_surface_format = JPEG_ENC_SURFACE_NV12; 
            output_mcu_format = JPEG_ENC_MCU_YUV420;
            break;
        }
    }

    /* Compute the frame size in blocks and the partial-MCU pixel counts.
     * MCU size is (8*H1) x (8*V1); the "% MCU" remainders below are 0
     * when the picture is an exact multiple of the MCU size, which the
     * hardware takes to mean "last MCU is full".
     * (All output_mcu_format values set above are covered; this switch
     * has no default, so the earlier NV12 fallback guarantees the
     * variables are always initialized.) */
    switch (output_mcu_format) {
        
        case JPEG_ENC_MCU_YUV400:
        case JPEG_ENC_MCU_RGB: {
            pixels_in_horizontal_lastMCU = (picture_width % 8);
            pixels_in_vertical_lastMCU = (picture_height % 8); 

            //H1=1,V1=1 for YUV400 and YUV444. So, compute these values accordingly
            frame_width_in_blks = ((picture_width + 7) / 8); 
            frame_height_in_blks = ((picture_height + 7) / 8);
            break;
        }
        
        case JPEG_ENC_MCU_YUV420: {        
            /* Odd dimensions are rounded up to the next even pixel for
             * 2x2-subsampled chroma before taking the MCU remainder. */
            if((picture_width % 2) == 0) 
                pixels_in_horizontal_lastMCU = picture_width % 16; 
            else 
                pixels_in_horizontal_lastMCU   = ((picture_width % 16) + 1) % 16; 
            
            if((picture_height % 2) == 0) 
                pixels_in_vertical_lastMCU     = picture_height % 16; 
            else 
                pixels_in_vertical_lastMCU   = ((picture_height % 16) + 1) % 16; 

            //H1=2,V1=2 for YUV420. So, compute these values accordingly
            frame_width_in_blks = ((picture_width + 15) / 16) * 2;
            frame_height_in_blks = ((picture_height + 15) / 16) * 2;
            break;
        }
        
        case JPEG_ENC_MCU_YUV422H_2Y: {
            if(picture_width % 2 == 0) 
                pixels_in_horizontal_lastMCU = picture_width % 16; 
            else 
                pixels_in_horizontal_lastMCU = ((picture_width % 16) + 1) % 16; 
            
            pixels_in_vertical_lastMCU = picture_height % 8;
            
            //H1=2,V1=1 for YUV422H_2Y. So, compute these values accordingly
            frame_width_in_blks = ((picture_width + 15) / 16) * 2;
            frame_height_in_blks = ((picture_height + 7) / 8);
            break;            
        }       
    } //end of switch
   
    BEGIN_BCS_BATCH(batch, 3);
    /* DWORD 0 */
    OUT_BCS_BATCH(batch, MFX_JPEG_PIC_STATE | (3 - 2)); 
    /* DWORD 1 */
    OUT_BCS_BATCH(batch,
                  ( pixels_in_horizontal_lastMCU << 26) |    /* Pixels In Horizontal Last MCU */
                  ( pixels_in_vertical_lastMCU << 21)   |    /* Pixels In Vertical Last MCU */
                  ( input_surface_format << 8)          |    /* Input Surface format */
                  ( output_mcu_format << 0));                /* Output MCU Structure */
    /* DWORD 2 */
    OUT_BCS_BATCH(batch,
                  ((frame_height_in_blks - 1) << 16)    |   /* Frame Height In Blks Minus 1 */
                  (JPEG_ENC_ROUND_QUANT_DEFAULT  << 13) |   /* Rounding Quant set to default value 0 */
                  ((frame_width_in_blks - 1) << 0));        /* Frame Width In Blks Minus 1 */
    ADVANCE_BCS_BATCH(batch);
}
2718
/*
 * Pack reciprocal quantizer values into dwords for MFX_QM_STATE (FQM).
 *
 * The PAK hardware expects forward-quantization values as 1/Q in 16-bit
 * fixed point (65535/Q), two values per dword: the even-indexed entry in
 * the low word and the odd-indexed entry in the high word, 32 dwords for
 * a 64-entry matrix.
 *
 * raster_qm: 64 quantizer values, expected range [1, 255] (callers clamp
 *            before calling; a 0 entry is guarded here to avoid division
 *            by zero).
 * dword_qm:  output array of 32 packed dwords.
 */
static void 
get_reciprocal_dword_qm(unsigned char *raster_qm, uint32_t *dword_qm)
{
    int i;
    /* Must be an unsigned 16-bit type: 65535/1 == 65535 does not fit in
     * a signed short, and the resulting negative value would sign-extend
     * and corrupt the high word when OR-ed into the packed dword below. */
    uint16_t reciprocal_qm[64];

    for (i = 0; i < 64; i++) {
        unsigned int q = raster_qm[i] ? raster_qm[i] : 1;

        reciprocal_qm[i] = 65535 / q;
    }

    /* Two reciprocals per dword: low word = even entry, high = odd. */
    for (i = 0; i < 64; i += 2)
        dword_qm[i / 2] = ((uint32_t)reciprocal_qm[i + 1] << 16) | reciprocal_qm[i];
}
2736
2737
2738 static void 
2739 gen8_mfc_jpeg_fqm_state(VADriverContextP ctx,
2740                         struct intel_encoder_context *encoder_context,
2741                         struct encode_state *encode_state)
2742 {
2743     unsigned int quality = 0;
2744     uint32_t temp, i = 0, j = 0, dword_qm[32];
2745     VAEncPictureParameterBufferJPEG *pic_param;
2746     VAQMatrixBufferJPEG *qmatrix;
2747     unsigned char raster_qm[64], column_raster_qm[64];
2748     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
2749     
2750     assert(encode_state->pic_param_ext && encode_state->pic_param_ext->buffer);
2751     pic_param = (VAEncPictureParameterBufferJPEG *)encode_state->pic_param_ext->buffer;
2752     quality = pic_param->quality;
2753     
2754     //If the app sends the qmatrix, use it, buffer it for using it with the next frames 
2755     //The app can send qmatrix for the first frame and not send for the subsequent frames
2756     if(encode_state->q_matrix && encode_state->q_matrix->buffer) {
2757         qmatrix = (VAQMatrixBufferJPEG *)encode_state->q_matrix->buffer;
2758
2759         mfc_context->buffered_qmatrix.load_lum_quantiser_matrix = 1;
2760         memcpy(mfc_context->buffered_qmatrix.lum_quantiser_matrix, qmatrix->lum_quantiser_matrix, 64 * (sizeof(unsigned char)));
2761
2762         if(pic_param->num_components > 1) {
2763             mfc_context->buffered_qmatrix.load_chroma_quantiser_matrix = 1;
2764             memcpy(mfc_context->buffered_qmatrix.chroma_quantiser_matrix, qmatrix->chroma_quantiser_matrix, 64 * (sizeof(unsigned char)));
2765         } else {
2766             mfc_context->buffered_qmatrix.load_chroma_quantiser_matrix = 0;
2767         }
2768
2769     } else {
2770         //If the app doesnt send the qmatrix, use the buffered/default qmatrix
2771         qmatrix = &mfc_context->buffered_qmatrix;
2772         qmatrix->load_lum_quantiser_matrix = 1;
2773         qmatrix->load_chroma_quantiser_matrix = (pic_param->num_components > 1) ? 1 : 0;
2774     }   
2775
2776
2777     //As per the design, normalization of the quality factor and scaling of the Quantization tables
2778     //based on the quality factor needs to be done in the driver before sending the values to the HW.
2779     //But note, the driver expects the scaled quantization tables (as per below logic) to be sent as
2780     //packed header information. The packed header is written as the header of the jpeg file. This
2781     //header information is used to decode the jpeg file. So, it is the app's responsibility to send
2782     //the correct header information (See build_packed_jpeg_header_buffer() in jpegenc.c in LibVa on
2783     //how to do this). QTables can be different for different applications. If no tables are provided,
2784     //the default tables in the driver are used.
2785
2786     //Normalization of the quality factor
2787     if (quality > 100) quality=100;
2788     if (quality == 0)  quality=1;
2789     quality = (quality < 50) ? (5000/quality) : (200 - (quality*2)); 
2790     
2791     //Step 1. Apply Quality factor and clip to range [1, 255] for luma and chroma Quantization matrices
2792     //Step 2. HW expects the 1/Q[i] values in the qm sent, so get reciprocals
2793     //Step 3. HW also expects 32 dwords, hence combine 2 (1/Q) values into 1 dword
2794     //Step 4. Send the Quantization matrix to the HW, use gen8_mfc_fqm_state
2795     
2796     //For luma (Y or R)
2797     if(qmatrix->load_lum_quantiser_matrix) {
2798         //apply quality to lum_quantiser_matrix
2799         for(i=0; i < 64; i++) {
2800             temp = (qmatrix->lum_quantiser_matrix[i] * quality)/100;
2801             //clamp to range [1,255]
2802             temp = (temp > 255) ? 255 : temp;
2803             temp = (temp < 1) ? 1 : temp;
2804             qmatrix->lum_quantiser_matrix[i] = (unsigned char)temp;
2805         }       
2806         
2807         //For VAAPI, the VAQMatrixBuffer needs to be in zigzag order. 
2808         //The App should send it in zigzag. Now, the driver has to extract the raster from it. 
2809         for (j = 0; j < 64; j++)
2810             raster_qm[zigzag_direct[j]] = qmatrix->lum_quantiser_matrix[j];
2811
2812         //Convert the raster order(row-ordered) to the column-raster (column by column).
2813         //To be consistent with the other encoders, send it in column order.
2814         //Need to double check if our HW expects col or row raster.
2815         for (j = 0; j < 64; j++) {
2816             int row = j / 8, col = j % 8;
2817             column_raster_qm[col * 8 + row] = raster_qm[j];
2818         }
2819         
2820         //Convert to raster QM to reciprocal. HW expects values in reciprocal.
2821         get_reciprocal_dword_qm(column_raster_qm, dword_qm);
2822         
2823         //send the luma qm to the command buffer
2824         gen8_mfc_fqm_state(ctx, MFX_QM_JPEG_LUMA_Y_QUANTIZER_MATRIX, dword_qm, 32, encoder_context);
2825     } 
2826     
2827     //For Chroma, if chroma exists (Cb, Cr or G, B)
2828     if(qmatrix->load_chroma_quantiser_matrix) {
2829         //apply quality to chroma_quantiser_matrix
2830         for(i=0; i < 64; i++) {
2831             temp = (qmatrix->chroma_quantiser_matrix[i] * quality)/100;
2832             //clamp to range [1,255]
2833             temp = (temp > 255) ? 255 : temp;
2834             temp = (temp < 1) ? 1 : temp;
2835             qmatrix->chroma_quantiser_matrix[i] = (unsigned char)temp;
2836         }
2837         
2838         //For VAAPI, the VAQMatrixBuffer needs to be in zigzag order. 
2839         //The App should send it in zigzag. Now, the driver has to extract the raster from it. 
2840         for (j = 0; j < 64; j++)
2841             raster_qm[zigzag_direct[j]] = qmatrix->chroma_quantiser_matrix[j];
2842         
2843         //Convert the raster order(row-ordered) to the column-raster (column by column).
2844         //To be consistent with the other encoders, send it in column order.
2845         //Need to double check if our HW expects col or row raster.
2846         for (j = 0; j < 64; j++) {
2847             int row = j / 8, col = j % 8;
2848             column_raster_qm[col * 8 + row] = raster_qm[j];
2849         }
2850
2851
2852         //Convert to raster QM to reciprocal. HW expects values in reciprocal.
2853         get_reciprocal_dword_qm(column_raster_qm, dword_qm);
2854
2855         //send the same chroma qm to the command buffer (for both U,V or G,B)
2856         gen8_mfc_fqm_state(ctx, MFX_QM_JPEG_CHROMA_CB_QUANTIZER_MATRIX, dword_qm, 32, encoder_context);
2857         gen8_mfc_fqm_state(ctx, MFX_QM_JPEG_CHROMA_CR_QUANTIZER_MATRIX, dword_qm, 32, encoder_context);        
2858     }
2859 }
2860
2861
//Translation of JPEG spec Table K.5 into code: convert a HUFFVAL byte
//(run/size pair) into a linear index for the code-word and code-size tables.
uint8_t map_huffval_to_index(uint8_t huff_val)
{
    // High nibble = zero-run length, low nibble = magnitude category.
    // Each run length owns 10 slots; values of 0xF0 (ZRL) and above are
    // shifted up by one extra slot.
    uint8_t run = (huff_val >> 4) & 0x0F;
    uint8_t category = huff_val & 0x0F;
    uint8_t zrl_offset = (huff_val < 0xF0) ? 0 : 1;

    return (uint8_t)(zrl_offset + run * 0xA + category);
}
2876
2877
//Implementation of Flow chart Annex C - Figure C.1.
//bits[i-1] holds the number of codes of length i (i = 1..16); expand it
//into a flat list of code sizes, terminated by a 0 entry. *lastK receives
//the number of codes generated (index of the 0 terminator).
static void
generate_huffman_codesizes_table(uint8_t *bits, uint8_t *huff_size_table, uint8_t *lastK)
{
    uint8_t k = 0;

    for (uint8_t len = 1; len <= 16; len++) {
        for (uint8_t count = 0; count < bits[len - 1]; count++)
            huff_size_table[k++] = len;
    }

    huff_size_table[k] = 0;
    *lastK = k;
}
2897
//Implementation of Flow chart Annex C - Figure C.2.
//Assign canonical Huffman code words from the 0-terminated size table:
//consecutive values within each code length, with the counter doubled
//(left-shifted) each time the length increases.
static void
generate_huffman_codes_table(uint8_t *huff_size_table, uint16_t *huff_code_table)
{
    uint16_t next_code = 0;
    uint8_t idx = 0;
    uint8_t cur_size = huff_size_table[0];

    while (huff_size_table[idx] != 0) {

        for (; huff_size_table[idx] == cur_size; idx++) {
            // A Huffman code can never be 0xFFFF; substitute 0 if reached.
            if (next_code == 0xFFFF)
                next_code = 0x0000;

            huff_code_table[idx] = next_code;
            next_code++;
        }

        next_code <<= 1;
        cur_size++;
    }
}
2925
//Implementation of Flow chart Annex C - Figure C.3.
//Reorder the size/code tables from generation order into the run/size
//indexed layout (via map_huffval_to_index). type == 0 selects the
//12-entry DC layout, anything else the 162-entry AC layout.
static void
generate_ordered_codes_table(uint8_t *huff_vals, uint8_t *huff_size_table, uint16_t *huff_code_table, uint8_t type, uint8_t lastK)
{
    const uint8_t table_len = (type == 0) ? 12 : 162;
    uint8_t ordered_sizes[table_len];
    uint16_t ordered_codes[table_len];
    uint8_t k = 0;

    memset(ordered_sizes, 0, sizeof(ordered_sizes));
    memset(ordered_codes, 0, sizeof(ordered_codes));

    do {
        uint8_t idx = map_huffval_to_index(huff_vals[k]);
        ordered_codes[idx] = huff_code_table[k];
        ordered_sizes[idx] = huff_size_table[k];
        k++;
    } while (k < lastK);

    //Copy the reordered tables back over the caller's buffers.
    memcpy(huff_size_table, ordered_sizes, sizeof(uint8_t) * table_len);
    memcpy(huff_code_table, ordered_codes, sizeof(uint16_t) * table_len);
}
2949
2950
2951 //This method converts the huffman table to code words which is needed by the HW
2952 //Flowcharts from Jpeg Spec Annex C - Figure C.1, Figure C.2, Figure C.3 are used here
2953 static void
2954 convert_hufftable_to_codes(VAHuffmanTableBufferJPEGBaseline *huff_buffer, uint32_t *table, uint8_t type, uint8_t index)
2955 {
2956     uint8_t lastK = 0, i=0; 
2957     uint8_t huff_val_size = 0;
2958     uint8_t *huff_bits, *huff_vals;
2959
2960     huff_val_size = (type == 0) ? 12 : 162; 
2961     uint8_t huff_size_table[huff_val_size+1]; //The +1 for adding 0 at the end of huff_val_size
2962     uint16_t huff_code_table[huff_val_size];
2963
2964     memset(huff_size_table, 0, sizeof(huff_size_table));
2965     memset(huff_code_table, 0, sizeof(huff_code_table));
2966
2967     huff_bits = (type == 0) ? (huff_buffer->huffman_table[index].num_dc_codes) : (huff_buffer->huffman_table[index].num_ac_codes);
2968     huff_vals = (type == 0) ? (huff_buffer->huffman_table[index].dc_values) : (huff_buffer->huffman_table[index].ac_values);
2969     
2970
2971     //Generation of table of Huffman code sizes
2972     generate_huffman_codesizes_table(huff_bits, huff_size_table, &lastK);
2973        
2974     //Generation of table of Huffman codes
2975     generate_huffman_codes_table(huff_size_table, huff_code_table);
2976        
2977     //Ordering procedure for encoding procedure code tables
2978     generate_ordered_codes_table(huff_vals, huff_size_table, huff_code_table, type, lastK);
2979
2980     //HW expects Byte0: Code length; Byte1,Byte2: Code Word, Byte3: Dummy
2981     //Since IA is littlended, &, | and << accordingly to store the values in the DWord.
2982     for(i=0; i<huff_val_size; i++) {
2983         table[i] = 0;
2984         table[i] = ((huff_size_table[i] & 0xFF) | ((huff_code_table[i] & 0xFFFF) << 8));
2985     }
2986
2987 }
2988
2989 //send the huffman table using MFC_JPEG_HUFF_TABLE_STATE
2990 static void
2991 gen8_mfc_jpeg_huff_table_state(VADriverContextP ctx,
2992                                            struct encode_state *encode_state,
2993                                            struct intel_encoder_context *encoder_context,
2994                                            int num_tables)
2995 {
2996     VAHuffmanTableBufferJPEGBaseline *huff_buffer;
2997     struct intel_batchbuffer *batch = encoder_context->base.batch;
2998     uint8_t index;
2999     uint32_t dc_table[12], ac_table[162]; 
3000     
3001     assert(encode_state->huffman_table && encode_state->huffman_table->buffer);
3002     huff_buffer = (VAHuffmanTableBufferJPEGBaseline *)encode_state->huffman_table->buffer;
3003
3004     memset(dc_table, 0, 12);
3005     memset(ac_table, 0, 162);
3006
3007     for (index = 0; index < num_tables; index++) {
3008         int id = va_to_gen7_jpeg_hufftable[index];
3009  
3010         if (!huff_buffer->load_huffman_table[index])
3011             continue;
3012      
3013         //load DC table with 12 DWords
3014         convert_hufftable_to_codes(huff_buffer, dc_table, 0, index);  //0 for Dc
3015
3016         //load AC table with 162 DWords 
3017         convert_hufftable_to_codes(huff_buffer, ac_table, 1, index);  //1 for AC 
3018
3019         BEGIN_BCS_BATCH(batch, 176);
3020         OUT_BCS_BATCH(batch, MFC_JPEG_HUFF_TABLE_STATE | (176 - 2));
3021         OUT_BCS_BATCH(batch, id); //Huff table id
3022
3023         //DWord 2 - 13 has DC_TABLE
3024         intel_batchbuffer_data(batch, dc_table, 12*4);
3025
3026         //Dword 14 -175 has AC_TABLE
3027         intel_batchbuffer_data(batch, ac_table, 162*4);
3028         ADVANCE_BCS_BATCH(batch);
3029     }    
3030 }
3031
3032
3033 //This method is used to compute the MCU count used for setting MFC_JPEG_SCAN_OBJECT
3034 static void get_Y_sampling_factors(uint32_t surface_format, uint8_t *h_factor, uint8_t *v_factor)
3035
3036     switch (surface_format) {
3037         case VA_FOURCC_Y800: {
3038             (* h_factor) = 1; 
3039             (* v_factor) = 1;
3040             break;
3041         }
3042         case VA_FOURCC_NV12: { 
3043             (* h_factor) = 2;             
3044             (* v_factor) = 2;
3045             break;
3046         }      
3047         case VA_FOURCC_UYVY: { 
3048             (* h_factor) = 2; 
3049             (* v_factor) = 1;
3050             break;
3051         }
3052         case VA_FOURCC_YUY2: { 
3053             (* h_factor) = 2; 
3054             (* v_factor) = 1;
3055             break;
3056         }
3057         case VA_FOURCC_RGBA:
3058         case VA_FOURCC_444P: { 
3059             (* h_factor) = 1; 
3060             (* v_factor) = 1;
3061             break;
3062         }
3063         default : { //May be  have to insert error handling here. For now just use as below
3064             (* h_factor) = 1; 
3065             (* v_factor) = 1;
3066             break;
3067         }
3068     }
3069 }
3070
//set MFC_JPEG_SCAN_OBJECT
//Emits the single scan object for the frame: total MCU count, restart
//interval, and the per-component Huffman AC/DC table selector bitmasks.
static void
gen8_mfc_jpeg_scan_object(VADriverContextP ctx,
                                           struct encode_state *encode_state,
                                           struct intel_encoder_context *encoder_context)
{
    uint32_t mcu_count, surface_format, Mx, My;
    uint8_t i, horizontal_sampling_factor, vertical_sampling_factor, huff_ac_table=0, huff_dc_table=0;
    uint8_t is_last_scan = 1;    //Jpeg has only 1 scan per frame. When last scan, HW inserts EOI code.
    uint8_t head_present_flag=1; //Header has tables and app data
    uint16_t num_components, restart_interval;   //Specifies number of MCUs in an ECS.
    VAEncSliceParameterBufferJPEG *slice_param;
    VAEncPictureParameterBufferJPEG *pic_param;

    struct intel_batchbuffer *batch = encoder_context->base.batch;
    struct object_surface *obj_surface = encode_state->input_yuv_object;

    assert(encode_state->slice_params_ext[0] && encode_state->slice_params_ext[0]->buffer);
    assert(encode_state->pic_param_ext && encode_state->pic_param_ext->buffer);
    assert(obj_surface);
    pic_param = (VAEncPictureParameterBufferJPEG *)encode_state->pic_param_ext->buffer;
    slice_param = (VAEncSliceParameterBufferJPEG *)encode_state->slice_params_ext[0]->buffer;
    surface_format = obj_surface->fourcc;

    //Luma sampling factors depend on the input FOURCC (e.g. 2x2 for NV12)
    get_Y_sampling_factors(surface_format, &horizontal_sampling_factor, &vertical_sampling_factor);

    // Mx = #MCUs in a row, My = #MCUs in a column
    // (an MCU covers sampling_factor * 8 pixels per axis; round up partials)
    Mx = (pic_param->picture_width + (horizontal_sampling_factor*8 -1))/(horizontal_sampling_factor*8);
    My = (pic_param->picture_height + (vertical_sampling_factor*8 -1))/(vertical_sampling_factor*8);
    mcu_count = (Mx * My);

    num_components = pic_param->num_components;
    restart_interval = slice_param->restart_interval;

    //Depending on number of components and values set for table selectors,
    //only those bits are set in 24:22 for AC table, 20:18 for DC table
    for(i=0; i<num_components; i++) {
        huff_ac_table |= ((slice_param->components[i].ac_table_selector)<<i);
        huff_dc_table |= ((slice_param->components[i].dc_table_selector)<<i);
    }


    BEGIN_BCS_BATCH(batch, 3);
    /* DWORD 0 */
    OUT_BCS_BATCH(batch, MFC_JPEG_SCAN_OBJECT | (3 - 2));
    /* DWORD 1 */
    OUT_BCS_BATCH(batch, mcu_count << 0);       //MCU Count
    /* DWORD 2 */
    OUT_BCS_BATCH(batch,
                  (huff_ac_table << 22)     |   //Huffman AC Table
                  (huff_dc_table << 18)     |   //Huffman DC Table
                  (head_present_flag << 17) |   //Head present flag
                  (is_last_scan << 16)      |   //Is last scan
                  (restart_interval << 0));     //Restart Interval
    ADVANCE_BCS_BATCH(batch);
}
3127
//Send raw bitstream bytes (e.g. packed headers) to the output using
//MFX_INSERT_OBJECT. insert_data holds length_in_dws DWords; only
//data_bits_in_last_dw bits of the last DWord are valid (0 means all 32).
//is_last_header / is_end_of_slice set the corresponding HW flags.
static void
gen8_mfc_jpeg_pak_insert_object(struct intel_encoder_context *encoder_context, unsigned int *insert_data,
                                int length_in_dws, int data_bits_in_last_dw, int is_last_header,
                                int is_end_of_slice)
{
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    assert(batch);

    //HW convention: 0 valid bits in the last DWord means a full 32 bits
    if (data_bits_in_last_dw == 0)
        data_bits_in_last_dw = 32;

    BEGIN_BCS_BATCH(batch, length_in_dws + 2);

    OUT_BCS_BATCH(batch, MFX_INSERT_OBJECT | (length_in_dws + 2 - 2));
    //DWord 1
    OUT_BCS_BATCH(batch,
                  (0 << 16) |                    //DataByteOffset 0 for JPEG Encoder
                  (0 << 15) |                    //HeaderLengthExcludeFrmSize 0 for JPEG Encoder
                  (data_bits_in_last_dw << 8) |  //DataBitsInLastDW
                  (0 << 4) |                     //SkipEmulByteCount 0 for JPEG Encoder
                  (0 << 3) |                     //EmulationFlag 0 for JPEG Encoder
                  ((!!is_last_header) << 2) |    //LastHeaderFlag
                  ((!!is_end_of_slice) << 1) |   //EndOfSliceFlag
                  (1 << 0));                     //BitstreamStartReset 1 for JPEG Encoder
    //Data Payload
    intel_batchbuffer_data(batch, insert_data, length_in_dws*4);

    ADVANCE_BCS_BATCH(batch);
}
3157
3158
3159 //send the jpeg headers to HW using MFX_PAK_INSERT_OBJECT
3160 static void
3161 gen8_mfc_jpeg_add_headers(VADriverContextP ctx,
3162                                            struct encode_state *encode_state,
3163                                            struct intel_encoder_context *encoder_context)
3164 {
3165     if (encode_state->packed_header_data_ext) {
3166         VAEncPackedHeaderParameterBuffer *param = NULL;
3167         unsigned int *header_data = (unsigned int *)(*encode_state->packed_header_data_ext)->buffer;
3168         unsigned int length_in_bits;
3169
3170         param = (VAEncPackedHeaderParameterBuffer *)(*encode_state->packed_header_params_ext)->buffer;
3171         length_in_bits = param->bit_length;
3172
3173         gen8_mfc_jpeg_pak_insert_object(encoder_context, 
3174                                         header_data, 
3175                                         ALIGN(length_in_bits, 32) >> 5,
3176                                         length_in_bits & 0x1f,
3177                                         1,
3178                                         1);
3179     }
3180 }
3181
3182 //Initialize the buffered_qmatrix with the default qmatrix in the driver.
3183 //If the app sends the qmatrix, this will be replaced with the one app sends.
3184 static void 
3185 jpeg_init_default_qmatrix(VADriverContextP ctx, struct intel_encoder_context *encoder_context)
3186 {
3187     int i=0;
3188     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
3189  
3190     //Load the the QM in zigzag order. If app sends QM, it is always in zigzag order.
3191     for(i=0; i<64; i++)
3192        mfc_context->buffered_qmatrix.lum_quantiser_matrix[i] = jpeg_luma_quant[zigzag_direct[i]];
3193
3194     for(i=0; i<64; i++)
3195         mfc_context->buffered_qmatrix.chroma_quantiser_matrix[i] = jpeg_chroma_quant[zigzag_direct[i]];
3196 }    
3197  
/* This is at the picture level */
//Programs the full MFX JPEG command sequence for one frame: pipe/surface
//setup, buffer addresses, picture state, quantization matrices, Huffman
//tables, the scan object and finally the packed headers.
static void
gen8_mfc_jpeg_pipeline_picture_programing(VADriverContextP ctx,
                                           struct encode_state *encode_state,
                                           struct intel_encoder_context *encoder_context)
{
    int i, j, component, max_selector = 0;
    VAEncSliceParameterBufferJPEG *slice_param;

    gen8_mfc_pipe_mode_select(ctx, MFX_FORMAT_JPEG, encoder_context);
    gen8_mfc_jpeg_set_surface_state(ctx, encoder_context, encode_state);
    gen8_mfc_pipe_buf_addr_state(ctx, encoder_context);
    gen8_mfc_ind_obj_base_addr_state(ctx, encoder_context);
    gen8_mfc_bsp_buf_base_addr_state(ctx, encoder_context);
    gen8_mfc_jpeg_pic_state(ctx, encoder_context, encode_state);

    //do the slice level encoding here
    gen8_mfc_jpeg_fqm_state(ctx, encoder_context, encode_state);

    //Find the largest Huffman table selector referenced by any component in
    //any slice; max_selector+1 tables must be loaded into the HW. (JPEG has
    //a single scan; the nested loops just mirror the other codecs' shape.)
    for(i = 0; i < encode_state->num_slice_params_ext; i++) {
        assert(encode_state->slice_params_ext && encode_state->slice_params_ext[i]->buffer);
        slice_param = (VAEncSliceParameterBufferJPEG *)encode_state->slice_params_ext[i]->buffer;

        for(j = 0; j < encode_state->slice_params_ext[i]->num_elements; j++) {

            for(component = 0; component < slice_param->num_components; component++) {
                if(max_selector < slice_param->components[component].dc_table_selector)
                    max_selector = slice_param->components[component].dc_table_selector;

                if (max_selector < slice_param->components[component].ac_table_selector)
                    max_selector = slice_param->components[component].ac_table_selector;
            }

            slice_param++;
        }
    }

    //only selectors 0 and 1 are valid (two table slots)
    assert(max_selector < 2);
    //send the huffman table using MFC_JPEG_HUFF_TABLE
    gen8_mfc_jpeg_huff_table_state(ctx, encode_state, encoder_context, max_selector+1);
    //set MFC_JPEG_SCAN_OBJECT
    gen8_mfc_jpeg_scan_object(ctx, encode_state, encoder_context);
    //add headers using MFX_PAK_INSERT_OBJECT (it is refered as MFX_INSERT_OBJECT in this driver code)
    gen8_mfc_jpeg_add_headers(ctx, encode_state, encoder_context);

}
3245
//Builds the complete BCS batch for one JPEG frame: an MI flush followed by
//all picture-level commands, inside an atomic batch section of 0x4000 DWords.
static void
gen8_mfc_jpeg_pipeline_programing(VADriverContextP ctx,
                                   struct encode_state *encode_state,
                                   struct intel_encoder_context *encoder_context)
{
    struct intel_batchbuffer *batch = encoder_context->base.batch;

    // begin programing
    intel_batchbuffer_start_atomic_bcs(batch, 0x4000);
    intel_batchbuffer_emit_mi_flush(batch);

    // picture level programing
    gen8_mfc_jpeg_pipeline_picture_programing(ctx, encode_state, encoder_context);

    // end programing
    intel_batchbuffer_end_atomic(batch);

}
3264
3265
//Top-level JPEG encode entry point: initialize MFC state, prepare the
//JPEG-specific resources, program the BCS pipeline and kick off execution.
static VAStatus
gen8_mfc_jpeg_encode_picture(VADriverContextP ctx,
                              struct encode_state *encode_state,
                              struct intel_encoder_context *encoder_context)
{
    gen8_mfc_init(ctx, encode_state, encoder_context);
    intel_mfc_jpeg_prepare(ctx, encode_state, encoder_context);
    /*Programing bcs pipeline*/
    gen8_mfc_jpeg_pipeline_programing(ctx, encode_state, encoder_context);
    gen8_mfc_run(ctx, encode_state, encoder_context);

    return VA_STATUS_SUCCESS;
}
3279
//Estimate the VP8 quantizer index needed to hit target_frame_size: scan the
//vp8_bits_per_mb lookup table from the lowest allowed qindex upward and pick
//the first index whose per-MB cost fits the per-MB budget (or the closer of
//the two neighboring indices). Values are kept in 512-scaled units to match
//the table.
static int gen8_mfc_vp8_qindex_estimate(struct encode_state *encode_state,
                                        struct gen6_mfc_context *mfc_context,
                                        int target_frame_size,
                                        int is_key_frame)
{
    VAEncSequenceParameterBufferVP8 *seq_param = (VAEncSequenceParameterBufferVP8 *)encode_state->seq_param_ext->buffer;
    VAEncPictureParameterBufferVP8 *pic_param = (VAEncPictureParameterBufferVP8 *)encode_state->pic_param_ext->buffer;
    unsigned int max_qindex = pic_param->clamp_qindex_high;
    unsigned int min_qindex = pic_param->clamp_qindex_low;
    int width_in_mbs = ALIGN(seq_param->frame_width, 16) / 16;
    int height_in_mbs = ALIGN(seq_param->frame_height, 16) / 16;
    int target_mb_size;
    int last_size_gap  = -1;  // gap at the previous (too large) qindex, -1 = none yet
    int per_mb_size_at_qindex;
    int target_qindex = min_qindex, i;

    /* make sure would not overflow: divide before shifting when the
     * frame size is large enough that << 9 could wrap */
    if (target_frame_size >= (0x7fffffff >> 9))
        target_mb_size = (target_frame_size / width_in_mbs / height_in_mbs) << 9;
    else
        target_mb_size = (target_frame_size << 9) / width_in_mbs / height_in_mbs;

    for (i = min_qindex; i <= max_qindex; i++) {
        //row 0 of the table is for key frames, row 1 for inter frames
        per_mb_size_at_qindex = vp8_bits_per_mb[!is_key_frame][i];
        target_qindex = i;
        if (per_mb_size_at_qindex <= target_mb_size) {
            //step back one qindex if the previous (larger) size was
            //actually closer to the target than this one
            if (target_mb_size - per_mb_size_at_qindex < last_size_gap)
                target_qindex--;
            break;
        }
        else
            last_size_gap = per_mb_size_at_qindex - target_mb_size;
    }

    return target_qindex;
}
3316
//Initialize VP8 bit rate control state from the sequence parameters and the
//negotiated bitrate/framerate: per-slice-type target frame sizes, initial
//QP estimates and the HRD buffer model.
static void gen8_mfc_vp8_brc_init(struct encode_state *encode_state,
                               struct intel_encoder_context* encoder_context)
{
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
    VAEncSequenceParameterBufferVP8 *seq_param = (VAEncSequenceParameterBufferVP8 *)encode_state->seq_param_ext->buffer;
    double bitrate = encoder_context->brc.bits_per_second[0];
    double framerate = (double)encoder_context->brc.framerate[0].num / (double)encoder_context->brc.framerate[0].den;
    int inum = 1, pnum = 0;  //number of I and P frames per GOP
    int intra_period = seq_param->intra_period;
    int width_in_mbs = ALIGN(seq_param->frame_width, 16) / 16;
    int height_in_mbs = ALIGN(seq_param->frame_height, 16) / 16;
    int max_frame_size =  (vp8_bits_per_mb[0][0] >> 9) * width_in_mbs * height_in_mbs;/* vp8_bits_per_mb entries are scaled by 512 */

    pnum = intra_period  - 1;

    mfc_context->brc.mode = encoder_context->rate_control_mode;

    //Split the GOP bit budget between the I frame and the P frames,
    //weighting P frames by BRC_PWEIGHT
    mfc_context->brc.target_frame_size[0][SLICE_TYPE_I] = (int)((double)((bitrate * intra_period) / framerate) /
                                                             (double)(inum + BRC_PWEIGHT * pnum ));
    mfc_context->brc.target_frame_size[0][SLICE_TYPE_P] = BRC_PWEIGHT * mfc_context->brc.target_frame_size[0][SLICE_TYPE_I];

    mfc_context->brc.gop_nums[0][SLICE_TYPE_I] = inum;
    mfc_context->brc.gop_nums[0][SLICE_TYPE_P] = pnum;

    mfc_context->brc.bits_per_frame[0] = bitrate / framerate;

    //initial QP estimates from the target frame sizes
    mfc_context->brc.qp_prime_y[0][SLICE_TYPE_I] = gen8_mfc_vp8_qindex_estimate(encode_state,
                                                                                mfc_context,
                                                                                mfc_context->brc.target_frame_size[0][SLICE_TYPE_I],
                                                                                1);
    mfc_context->brc.qp_prime_y[0][SLICE_TYPE_P] = gen8_mfc_vp8_qindex_estimate(encode_state,
                                                                                mfc_context,
                                                                                mfc_context->brc.target_frame_size[0][SLICE_TYPE_P],
                                                                                0);

    //HRD model: default buffer size is one second's worth of bits, and the
    //initial fullness defaults to half the buffer if not supplied/valid
    if (encoder_context->brc.hrd_buffer_size)
        mfc_context->hrd.buffer_size[0] = (double)encoder_context->brc.hrd_buffer_size;
    else
        mfc_context->hrd.buffer_size[0] = bitrate;
    if (encoder_context->brc.hrd_initial_buffer_fullness &&
        encoder_context->brc.hrd_initial_buffer_fullness < mfc_context->hrd.buffer_size[0])
        mfc_context->hrd.current_buffer_fullness[0] = (double)encoder_context->brc.hrd_initial_buffer_fullness;
    else
        mfc_context->hrd.current_buffer_fullness[0] = mfc_context->hrd.buffer_size[0] / 2.0;
    mfc_context->hrd.target_buffer_fullness[0] = (double)mfc_context->hrd.buffer_size[0] / 2.0;
    mfc_context->hrd.buffer_capacity[0] = (double)mfc_context->hrd.buffer_size[0] / max_frame_size;
    mfc_context->hrd.violation_noted = 0;
}
3365
//Post-encode VP8 bit rate control update: given the actual number of bits
//produced for the frame (frame_bits), predict the quantizer index for the
//next frame of the same type, correct it for HRD buffer fullness, and
//return the HRD violation status (gen6_brc_status).
static int gen8_mfc_vp8_brc_postpack(struct encode_state *encode_state,
                           struct intel_encoder_context *encoder_context,
                           int frame_bits)
{
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
    gen6_brc_status sts = BRC_NO_HRD_VIOLATION;
    VAEncPictureParameterBufferVP8 *pic_param = (VAEncPictureParameterBufferVP8 *)encode_state->pic_param_ext->buffer;
    int is_key_frame = !pic_param->pic_flags.bits.frame_type;
    int slicetype = (is_key_frame ? SLICE_TYPE_I : SLICE_TYPE_P);
    int qpi = mfc_context->brc.qp_prime_y[0][SLICE_TYPE_I];
    int qpp = mfc_context->brc.qp_prime_y[0][SLICE_TYPE_P];
    int qp; // quantizer of previously encoded slice of current type
    int qpn; // predicted quantizer for next frame of current type in integer format
    double qpf; // predicted quantizer for next frame of current type in float format
    double delta_qp; // QP correction
    int target_frame_size, frame_size_next;
    /* Notes:
     *  x - how far we are from HRD buffer borders
     *  y - how far we are from target HRD buffer fullness
     */
    double x, y;
    double frame_size_alpha;
    unsigned int max_qindex = pic_param->clamp_qindex_high;
    unsigned int min_qindex = pic_param->clamp_qindex_low;

    qp = mfc_context->brc.qp_prime_y[0][slicetype];

    //Smooth the size prediction: alpha grows with the per-type GOP frame
    //count (capped at 30); small HRD buffers get no smoothing
    target_frame_size = mfc_context->brc.target_frame_size[0][slicetype];
    if (mfc_context->hrd.buffer_capacity[0] < 5)
        frame_size_alpha = 0;
    else
        frame_size_alpha = (double)mfc_context->brc.gop_nums[0][slicetype];
    if (frame_size_alpha > 30) frame_size_alpha = 30;
    frame_size_next = target_frame_size + (double)(target_frame_size - frame_bits) /
        (double)(frame_size_alpha + 1.);

    /* frame_size_next: avoiding negative number and too small value */
    if ((double)frame_size_next < (double)(target_frame_size * 0.25))
        frame_size_next = (int)((double)target_frame_size * 0.25);

    //Scale the current QP by the ratio of target size to predicted size
    //(bits assumed roughly inversely proportional to QP)
    qpf = (double)qp * target_frame_size / frame_size_next;
    qpn = (int)(qpf + 0.5);

    if (qpn == qp) {
        /* setting qpn we round qpf making mistakes: now we are trying to compensate this */
        mfc_context->brc.qpf_rounding_accumulator[0] += qpf - qpn;
        if (mfc_context->brc.qpf_rounding_accumulator[0] > 1.0) {
            qpn++;
            mfc_context->brc.qpf_rounding_accumulator[0] = 0.;
        } else if (mfc_context->brc.qpf_rounding_accumulator[0] < -1.0) {
            qpn--;
            mfc_context->brc.qpf_rounding_accumulator[0] = 0.;
        }
    }

    /* making sure that QP is not changing too fast */
    if ((qpn - qp) > BRC_QP_MAX_CHANGE) qpn = qp + BRC_QP_MAX_CHANGE;
    else if ((qpn - qp) < -BRC_QP_MAX_CHANGE) qpn = qp - BRC_QP_MAX_CHANGE;
    /* making sure that with QP predictions we did do not leave QPs range */
    BRC_CLIP(qpn, min_qindex, max_qindex);

    /* checking whether HRD compliance is still met */
    sts = intel_mfc_update_hrd(encode_state, encoder_context, frame_bits);

    /* calculating QP delta as some function*/
    //x: normalized distance from target fullness (sign gives direction),
    //y: distance from the nearest buffer border
    x = mfc_context->hrd.target_buffer_fullness[0] - mfc_context->hrd.current_buffer_fullness[0];
    if (x > 0) {
        x /= mfc_context->hrd.target_buffer_fullness[0];
        y = mfc_context->hrd.current_buffer_fullness[0];
    }
    else {
        x /= (mfc_context->hrd.buffer_size[0] - mfc_context->hrd.target_buffer_fullness[0]);
        y = mfc_context->hrd.buffer_size[0] - mfc_context->hrd.current_buffer_fullness[0];
    }
    if (y < 0.01) y = 0.01;
    if (x > 1) x = 1;
    else if (x < -1) x = -1;

    delta_qp = BRC_QP_MAX_CHANGE*exp(-1/y)*sin(BRC_PI_0_5 * x);
    qpn = (int)(qpn + delta_qp + 0.5);

    /* making sure that with QP predictions we did do not leave QPs range */
    BRC_CLIP(qpn, min_qindex, max_qindex);

    if (sts == BRC_NO_HRD_VIOLATION) { // no HRD violation
        /* correcting QPs of slices of other types */
        if (!is_key_frame) {
            if (abs(qpn - BRC_I_P_QP_DIFF - qpi) > 4)
                mfc_context->brc.qp_prime_y[0][SLICE_TYPE_I] += (qpn - BRC_I_P_QP_DIFF - qpi) >> 2;
        } else {
            if (abs(qpn + BRC_I_P_QP_DIFF - qpp) > 4)
                mfc_context->brc.qp_prime_y[0][SLICE_TYPE_P] += (qpn + BRC_I_P_QP_DIFF - qpp) >> 2;
        }
        BRC_CLIP(mfc_context->brc.qp_prime_y[0][SLICE_TYPE_I], min_qindex, max_qindex);
        BRC_CLIP(mfc_context->brc.qp_prime_y[0][SLICE_TYPE_P], min_qindex, max_qindex);
    } else if (sts == BRC_UNDERFLOW) { // underflow
        if (qpn <= qp) qpn = qp + 2;
        if (qpn > max_qindex) {
            qpn = max_qindex;
            sts = BRC_UNDERFLOW_WITH_MAX_QP; //underflow with maxQP
        }
    } else if (sts == BRC_OVERFLOW) {
        if (qpn >= qp) qpn = qp - 2;
        if (qpn < min_qindex) { // < 0 (?) overflow with minQP
            qpn = min_qindex;
            sts = BRC_OVERFLOW_WITH_MIN_QP; // bit stuffing to be done
        }
    }

    mfc_context->brc.qp_prime_y[0][slicetype] = qpn;

    return sts;
}
3479
//Initialize the VUI HRD (hypothetical reference decoder) parameters for VP8.
//Only CBR is handled; other rate control modes leave the HRD state untouched.
static void gen8_mfc_vp8_hrd_context_init(struct encode_state *encode_state,
                                       struct intel_encoder_context *encoder_context)
{
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
    unsigned int rate_control_mode = encoder_context->rate_control_mode;
    int target_bit_rate = encoder_context->brc.bits_per_second[0];

    // current we only support CBR mode.
    if (rate_control_mode == VA_RC_CBR) {
        mfc_context->vui_hrd.i_bit_rate_value = target_bit_rate >> 10;
        // initial removal delay in 90 kHz ticks — NOTE(review): this formula
        // mirrors the H.264 path in this driver; verify units before relying
        // on the exact value
        mfc_context->vui_hrd.i_initial_cpb_removal_delay = ((target_bit_rate * 8) >> 10) * 0.5 * 1024 / target_bit_rate * 90000;
        mfc_context->vui_hrd.i_cpb_removal_delay = 2;
        mfc_context->vui_hrd.i_frame_number = 0;

        // 24-bit fields for the removal/output delay syntax elements
        mfc_context->vui_hrd.i_initial_cpb_removal_delay_length = 24;
        mfc_context->vui_hrd.i_cpb_removal_delay_length = 24;
        mfc_context->vui_hrd.i_dpb_output_delay_length = 24;
    }

}
3500
3501 static void gen8_mfc_vp8_hrd_context_update(struct encode_state *encode_state,
3502                              struct gen6_mfc_context *mfc_context)
3503 {
3504     mfc_context->vui_hrd.i_frame_number++;
3505 }
3506
3507 static void gen8_mfc_vp8_brc_prepare(struct encode_state *encode_state,
3508                            struct intel_encoder_context *encoder_context)
3509 {
3510     unsigned int rate_control_mode = encoder_context->rate_control_mode;
3511
3512     if (rate_control_mode == VA_RC_CBR) {
3513         bool brc_updated;
3514         assert(encoder_context->codec != CODEC_MPEG2);
3515
3516         brc_updated = encoder_context->brc.need_reset;
3517
3518         /*Programing bit rate control */
3519         if (brc_updated) {
3520             gen8_mfc_vp8_brc_init(encode_state, encoder_context);
3521         }
3522
3523         /*Programing HRD control */
3524         if (brc_updated)
3525             gen8_mfc_vp8_hrd_context_init(encode_state, encoder_context);
3526     }
3527 }
3528
3529 static void vp8_enc_state_init(struct gen6_mfc_context *mfc_context,
3530                                VAEncPictureParameterBufferVP8 *pic_param,
3531                                VAQMatrixBufferVP8 *q_matrix)
3532 {
3533
3534     int is_key_frame = !pic_param->pic_flags.bits.frame_type;
3535     unsigned char *coeff_probs_stream_in_buffer;
3536     
3537     mfc_context->vp8_state.frame_header_lf_update_pos = 0;
3538     mfc_context->vp8_state.frame_header_qindex_update_pos = 0;
3539     mfc_context->vp8_state.frame_header_token_update_pos = 0;
3540     mfc_context->vp8_state.frame_header_bin_mv_upate_pos = 0;
3541
3542     mfc_context->vp8_state.prob_skip_false = 255;
3543     memset(mfc_context->vp8_state.mb_segment_tree_probs, 0, sizeof(mfc_context->vp8_state.mb_segment_tree_probs));
3544     memcpy(mfc_context->vp8_state.mv_probs, vp8_default_mv_context, sizeof(mfc_context->vp8_state.mv_probs));
3545     
3546     if (is_key_frame) {
3547         memcpy(mfc_context->vp8_state.y_mode_probs, vp8_kf_ymode_prob, sizeof(mfc_context->vp8_state.y_mode_probs));
3548         memcpy(mfc_context->vp8_state.uv_mode_probs, vp8_kf_uv_mode_prob, sizeof(mfc_context->vp8_state.uv_mode_probs));
3549
3550         mfc_context->vp8_state.prob_intra = 255;
3551         mfc_context->vp8_state.prob_last = 128;
3552         mfc_context->vp8_state.prob_gf = 128;
3553     } else {
3554         memcpy(mfc_context->vp8_state.y_mode_probs, vp8_ymode_prob, sizeof(mfc_context->vp8_state.y_mode_probs));
3555         memcpy(mfc_context->vp8_state.uv_mode_probs, vp8_uv_mode_prob, sizeof(mfc_context->vp8_state.uv_mode_probs));
3556
3557         mfc_context->vp8_state.prob_intra = 63;
3558         mfc_context->vp8_state.prob_last = 128;
3559         mfc_context->vp8_state.prob_gf = 128;
3560     }
3561     
3562     mfc_context->vp8_state.prob_skip_false = vp8_base_skip_false_prob[q_matrix->quantization_index[0]];
3563   
3564     dri_bo_map(mfc_context->vp8_state.coeff_probs_stream_in_bo, 1);
3565     coeff_probs_stream_in_buffer = (unsigned char *)mfc_context->vp8_state.coeff_probs_stream_in_bo->virtual;
3566     assert(coeff_probs_stream_in_buffer);
3567     memcpy(coeff_probs_stream_in_buffer, vp8_default_coef_probs, sizeof(vp8_default_coef_probs));
3568     dri_bo_unmap(mfc_context->vp8_state.coeff_probs_stream_in_bo);
3569 }
3570
/*
 * Placeholder for inter-frame probability adaptation.  Currently a no-op:
 * mode/token probabilities are not yet updated between frames.
 */
static void vp8_enc_state_update(struct gen6_mfc_context *mfc_context,
                                 VAQMatrixBufferVP8 *q_matrix)
{

    /* TODO: some other probabilities need to be updated */
}
3577
3578 extern void binarize_vp8_frame_header(VAEncSequenceParameterBufferVP8 *seq_param,
3579                            VAEncPictureParameterBufferVP8 *pic_param,
3580                            VAQMatrixBufferVP8 *q_matrix,
3581                            struct gen6_mfc_context *mfc_context,
3582                            struct intel_encoder_context *encoder_context);
3583
3584 static void vp8_enc_frame_header_binarize(struct encode_state *encode_state,
3585                                           struct intel_encoder_context *encoder_context,
3586                                           struct gen6_mfc_context *mfc_context)
3587 {
3588     VAEncSequenceParameterBufferVP8 *seq_param = (VAEncSequenceParameterBufferVP8 *)encode_state->seq_param_ext->buffer;
3589     VAEncPictureParameterBufferVP8 *pic_param = (VAEncPictureParameterBufferVP8 *)encode_state->pic_param_ext->buffer;
3590     VAQMatrixBufferVP8 *q_matrix = (VAQMatrixBufferVP8 *)encode_state->q_matrix->buffer;
3591     unsigned char *frame_header_buffer;
3592
3593     binarize_vp8_frame_header(seq_param, pic_param, q_matrix, mfc_context, encoder_context);
3594  
3595     dri_bo_map(mfc_context->vp8_state.frame_header_bo, 1);
3596     frame_header_buffer = (unsigned char *)mfc_context->vp8_state.frame_header_bo->virtual;
3597     assert(frame_header_buffer);
3598     memcpy(frame_header_buffer, mfc_context->vp8_state.vp8_frame_header, (mfc_context->vp8_state.frame_header_bit_count + 7) / 8);
3599     free(mfc_context->vp8_state.vp8_frame_header);
3600     dri_bo_unmap(mfc_context->vp8_state.frame_header_bo);
3601 }
3602
3603 #define MAX_VP8_FRAME_HEADER_SIZE              0x2000
3604 #define VP8_TOKEN_STATISTICS_BUFFER_SIZE       0x2000
3605
3606 static void gen8_mfc_vp8_init(VADriverContextP ctx,
3607                           struct encode_state *encode_state,
3608                           struct intel_encoder_context *encoder_context)
3609 {
3610     struct i965_driver_data *i965 = i965_driver_data(ctx);
3611     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
3612     dri_bo *bo;
3613     int i;
3614     int width_in_mbs = 0;
3615     int height_in_mbs = 0;
3616     int slice_batchbuffer_size;
3617     int is_key_frame, slice_type, rate_control_mode;
3618
3619     VAEncSequenceParameterBufferVP8 *pSequenceParameter = (VAEncSequenceParameterBufferVP8 *)encode_state->seq_param_ext->buffer;
3620     VAEncPictureParameterBufferVP8 *pic_param = (VAEncPictureParameterBufferVP8 *)encode_state->pic_param_ext->buffer;
3621     VAQMatrixBufferVP8 *q_matrix = (VAQMatrixBufferVP8 *)encode_state->q_matrix->buffer;
3622
3623     width_in_mbs = ALIGN(pSequenceParameter->frame_height, 16) / 16;
3624     height_in_mbs = ALIGN(pSequenceParameter->frame_height, 16) / 16;
3625
3626     is_key_frame = !pic_param->pic_flags.bits.frame_type;
3627     slice_type = (is_key_frame ? SLICE_TYPE_I : SLICE_TYPE_P);
3628     rate_control_mode = encoder_context->rate_control_mode;
3629
3630     if (rate_control_mode == VA_RC_CBR) {
3631         q_matrix->quantization_index[0] = mfc_context->brc.qp_prime_y[0][slice_type];
3632         for (i = 1; i < 4; i++)
3633             q_matrix->quantization_index[i] = q_matrix->quantization_index[0];
3634         for (i = 0; i < 5; i++)
3635             q_matrix->quantization_index_delta[i] = 0;
3636     }
3637
3638     slice_batchbuffer_size = 64 * width_in_mbs * height_in_mbs + 4096 +
3639         (SLICE_HEADER + SLICE_TAIL);
3640
3641     /*Encode common setup for MFC*/
3642     dri_bo_unreference(mfc_context->post_deblocking_output.bo);
3643     mfc_context->post_deblocking_output.bo = NULL;
3644
3645     dri_bo_unreference(mfc_context->pre_deblocking_output.bo);
3646     mfc_context->pre_deblocking_output.bo = NULL;
3647
3648     dri_bo_unreference(mfc_context->uncompressed_picture_source.bo);
3649     mfc_context->uncompressed_picture_source.bo = NULL;
3650
3651     dri_bo_unreference(mfc_context->mfc_indirect_pak_bse_object.bo);
3652     mfc_context->mfc_indirect_pak_bse_object.bo = NULL;
3653
3654     for (i = 0; i < NUM_MFC_DMV_BUFFERS; i++){
3655         if ( mfc_context->direct_mv_buffers[i].bo != NULL)
3656             dri_bo_unreference(mfc_context->direct_mv_buffers[i].bo);
3657         mfc_context->direct_mv_buffers[i].bo = NULL;
3658     }
3659
3660     for (i = 0; i < MAX_MFC_REFERENCE_SURFACES; i++){
3661         if (mfc_context->reference_surfaces[i].bo != NULL)
3662             dri_bo_unreference(mfc_context->reference_surfaces[i].bo);
3663         mfc_context->reference_surfaces[i].bo = NULL;
3664     }
3665
3666     dri_bo_unreference(mfc_context->intra_row_store_scratch_buffer.bo);
3667     bo = dri_bo_alloc(i965->intel.bufmgr,
3668                       "Buffer",
3669                       width_in_mbs * 64 * 16,
3670                       64);
3671     assert(bo);
3672     mfc_context->intra_row_store_scratch_buffer.bo = bo;
3673
3674     dri_bo_unreference(mfc_context->macroblock_status_buffer.bo);
3675     bo = dri_bo_alloc(i965->intel.bufmgr,
3676                       "Buffer",
3677                       width_in_mbs * height_in_mbs * 16,
3678                       64);
3679     assert(bo);
3680     mfc_context->macroblock_status_buffer.bo = bo;
3681
3682     dri_bo_unreference(mfc_context->deblocking_filter_row_store_scratch_buffer.bo);
3683     bo = dri_bo_alloc(i965->intel.bufmgr,
3684                       "Buffer",
3685                       16 * width_in_mbs * 64,  /* 16 * width_in_mbs * 64 */
3686                       64);
3687     assert(bo);
3688     mfc_context->deblocking_filter_row_store_scratch_buffer.bo = bo;
3689
3690     dri_bo_unreference(mfc_context->bsd_mpc_row_store_scratch_buffer.bo);
3691     bo = dri_bo_alloc(i965->intel.bufmgr,
3692                       "Buffer",
3693                       16 * width_in_mbs * 64, /* 16 * width_in_mbs * 64 */
3694                       0x1000);
3695     assert(bo);
3696     mfc_context->bsd_mpc_row_store_scratch_buffer.bo = bo;
3697
3698     dri_bo_unreference(mfc_context->mfc_batchbuffer_surface.bo);
3699     mfc_context->mfc_batchbuffer_surface.bo = NULL;
3700
3701     dri_bo_unreference(mfc_context->aux_batchbuffer_surface.bo);
3702     mfc_context->aux_batchbuffer_surface.bo = NULL;
3703
3704     if (mfc_context->aux_batchbuffer) {
3705         intel_batchbuffer_free(mfc_context->aux_batchbuffer);
3706         mfc_context->aux_batchbuffer = NULL;
3707     }
3708
3709     mfc_context->aux_batchbuffer = intel_batchbuffer_new(&i965->intel, I915_EXEC_BSD, slice_batchbuffer_size);
3710     mfc_context->aux_batchbuffer_surface.bo = mfc_context->aux_batchbuffer->buffer;
3711     dri_bo_reference(mfc_context->aux_batchbuffer_surface.bo);
3712     mfc_context->aux_batchbuffer_surface.pitch = 16;
3713     mfc_context->aux_batchbuffer_surface.num_blocks = mfc_context->aux_batchbuffer->size / 16;
3714     mfc_context->aux_batchbuffer_surface.size_block = 16;
3715
3716     gen8_gpe_context_init(ctx, &mfc_context->gpe_context);
3717
3718     /* alloc vp8 encoding buffers*/
3719     dri_bo_unreference(mfc_context->vp8_state.frame_header_bo);
3720     bo = dri_bo_alloc(i965->intel.bufmgr,
3721                       "Buffer",
3722                       MAX_VP8_FRAME_HEADER_SIZE,
3723                       0x1000);
3724     assert(bo);
3725     mfc_context->vp8_state.frame_header_bo = bo;
3726
3727     mfc_context->vp8_state.intermediate_buffer_max_size = width_in_mbs * height_in_mbs * 384 * 9;
3728     for(i = 0; i < 8; i++) {
3729         mfc_context->vp8_state.intermediate_partition_offset[i] = width_in_mbs * height_in_mbs * 384 * (i + 1);
3730     }
3731     dri_bo_unreference(mfc_context->vp8_state.intermediate_bo);
3732     bo = dri_bo_alloc(i965->intel.bufmgr,
3733                       "Buffer",
3734                       mfc_context->vp8_state.intermediate_buffer_max_size,
3735                       0x1000);
3736     assert(bo);
3737     mfc_context->vp8_state.intermediate_bo = bo;
3738
3739     dri_bo_unreference(mfc_context->vp8_state.stream_out_bo);
3740     bo = dri_bo_alloc(i965->intel.bufmgr,
3741                       "Buffer",
3742                       width_in_mbs * height_in_mbs * 16,
3743                       0x1000);
3744     assert(bo);
3745     mfc_context->vp8_state.stream_out_bo = bo;
3746
3747     dri_bo_unreference(mfc_context->vp8_state.coeff_probs_stream_in_bo);
3748     bo = dri_bo_alloc(i965->intel.bufmgr,
3749                       "Buffer",
3750                       sizeof(vp8_default_coef_probs),
3751                       0x1000);
3752     assert(bo);
3753     mfc_context->vp8_state.coeff_probs_stream_in_bo = bo;
3754
3755     dri_bo_unreference(mfc_context->vp8_state.token_statistics_bo);
3756     bo = dri_bo_alloc(i965->intel.bufmgr,
3757                       "Buffer",
3758                       VP8_TOKEN_STATISTICS_BUFFER_SIZE,
3759                       0x1000);
3760     assert(bo);
3761     mfc_context->vp8_state.token_statistics_bo = bo;
3762
3763     dri_bo_unreference(mfc_context->vp8_state.mpc_row_store_bo);
3764     bo = dri_bo_alloc(i965->intel.bufmgr,
3765                       "Buffer",
3766                       width_in_mbs * 16 * 64,
3767                       0x1000);
3768     assert(bo);
3769     mfc_context->vp8_state.mpc_row_store_bo = bo;
3770
3771     vp8_enc_state_init(mfc_context, pic_param, q_matrix);
3772     vp8_enc_frame_header_binarize(encode_state, encoder_context, mfc_context);
3773 }
3774
/*
 * Bind the per-frame surfaces and buffers for VP8 PAK: the reconstructed
 * surface (pre- or post-deblocking slot depending on loop filter level),
 * reference frames, the input YUV surface, and the coded output buffer.
 * Each binding takes its own dri_bo reference; gen8_mfc_vp8_init() releases
 * them on the next frame.  Always returns VA_STATUS_SUCCESS.
 */
static VAStatus
intel_mfc_vp8_prepare(VADriverContextP ctx,
                        struct encode_state *encode_state,
                        struct intel_encoder_context *encoder_context)
{
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
    struct object_surface *obj_surface;
    struct object_buffer *obj_buffer;
    struct i965_coded_buffer_segment *coded_buffer_segment;
    VAEncPictureParameterBufferVP8 *pic_param = (VAEncPictureParameterBufferVP8 *)encode_state->pic_param_ext->buffer;
    VAStatus vaStatus = VA_STATUS_SUCCESS;
    dri_bo *bo;
    int i;

    /* reconstructed surface */
    obj_surface = encode_state->reconstructed_object;
    i965_check_alloc_surface_bo(ctx, obj_surface, 1, VA_FOURCC('N','V','1','2'), SUBSAMPLE_YUV420);
    /* With loop filter level 0 the filter is effectively off, so the
     * reconstructed output is taken from the pre-deblocking slot. */
    if (pic_param->loop_filter_level[0] == 0) {
        mfc_context->pre_deblocking_output.bo = obj_surface->bo;
        dri_bo_reference(mfc_context->pre_deblocking_output.bo);
    } else {
        mfc_context->post_deblocking_output.bo = obj_surface->bo;
        dri_bo_reference(mfc_context->post_deblocking_output.bo);
    }

    mfc_context->surface_state.width = obj_surface->orig_width;
    mfc_context->surface_state.height = obj_surface->orig_height;
    mfc_context->surface_state.w_pitch = obj_surface->width;
    mfc_context->surface_state.h_pitch = obj_surface->height;

    /* set vp8 reference frames */
    for (i = 0; i < ARRAY_ELEMS(mfc_context->reference_surfaces); i++) {
        obj_surface = encode_state->reference_objects[i];

        if (obj_surface && obj_surface->bo) {
            mfc_context->reference_surfaces[i].bo = obj_surface->bo;
            dri_bo_reference(mfc_context->reference_surfaces[i].bo);
        } else {
            mfc_context->reference_surfaces[i].bo = NULL;
        }
    }

    /* input YUV surface */
    obj_surface = encode_state->input_yuv_object;
    mfc_context->uncompressed_picture_source.bo = obj_surface->bo;
    dri_bo_reference(mfc_context->uncompressed_picture_source.bo);

    /* coded buffer: bitstream lands after the driver's header segment,
     * with the end clamped to a page-aligned boundary below buffer size */
    obj_buffer = encode_state->coded_buf_object;
    bo = obj_buffer->buffer_store->bo;
    mfc_context->mfc_indirect_pak_bse_object.bo = bo;
    mfc_context->mfc_indirect_pak_bse_object.offset = I965_CODEDBUFFER_HEADER_SIZE;
    mfc_context->mfc_indirect_pak_bse_object.end_offset = ALIGN(obj_buffer->size_element - 0x1000, 0x1000);
    dri_bo_reference(mfc_context->mfc_indirect_pak_bse_object.bo);

    /* The VP8 final frame shares the coded buffer bo (extra reference). */
    dri_bo_unreference(mfc_context->vp8_state.final_frame_bo);
    mfc_context->vp8_state.final_frame_bo = mfc_context->mfc_indirect_pak_bse_object.bo;
    mfc_context->vp8_state.final_frame_byte_offset = I965_CODEDBUFFER_HEADER_SIZE;
    dri_bo_reference(mfc_context->vp8_state.final_frame_bo);

    /* set the internal flag to 0 to indicate the coded size is unknown */
    dri_bo_map(bo, 1);
    coded_buffer_segment = (struct i965_coded_buffer_segment *)bo->virtual;
    coded_buffer_segment->mapped = 0;
    coded_buffer_segment->codec = encoder_context->codec;
    dri_bo_unmap(bo);

    return vaStatus;
}
3844
/*
 * Emit the MFX_VP8_ENCODER_CFG command (30 DWs).  Configures the RC pass,
 * statistics outputs, per-MB bit limits, frame geometry, and the frame
 * header bit count / update positions produced by the binarizer.
 * The DW count and order are fixed by the hardware command layout;
 * do not reorder these writes.
 */
static void
gen8_mfc_vp8_encoder_cfg(VADriverContextP ctx, 
                         struct encode_state *encode_state,
                         struct intel_encoder_context *encoder_context)
{
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
    VAEncSequenceParameterBufferVP8 *seq_param = (VAEncSequenceParameterBufferVP8 *)encode_state->seq_param_ext->buffer;
    VAEncPictureParameterBufferVP8 *pic_param = (VAEncPictureParameterBufferVP8 *)encode_state->pic_param_ext->buffer;

    BEGIN_BCS_BATCH(batch, 30);
    OUT_BCS_BATCH(batch, MFX_VP8_ENCODER_CFG | (30 - 2)); /* SKL should be 31-2 ? */

    OUT_BCS_BATCH(batch,
                  0 << 9 | /* compressed bitstream output disable */
                  1 << 7 | /* disable per-segment delta qindex and loop filter in RC */
                  1 << 6 | /* RC initial pass */
                  0 << 4 | /* upate segment feature date flag */
                  1 << 3 | /* bitstream statistics output enable */
                  1 << 2 | /* token statistics output enable */
                  0 << 1 | /* final bitstream output disable */
                  0 << 0); /*DW1*/
    
    OUT_BCS_BATCH(batch, 0); /*DW2*/

    OUT_BCS_BATCH(batch, 
                  0xfff << 16 | /* max intra mb bit count limit */
                  0xfff << 0  /* max inter mb bit count limit */
                  ); /*DW3*/

    /* DW4-DW21: RC tuning fields, left at hardware defaults (zero). */
    OUT_BCS_BATCH(batch, 0); /*DW4*/
    OUT_BCS_BATCH(batch, 0); /*DW5*/
    OUT_BCS_BATCH(batch, 0); /*DW6*/
    OUT_BCS_BATCH(batch, 0); /*DW7*/
    OUT_BCS_BATCH(batch, 0); /*DW8*/
    OUT_BCS_BATCH(batch, 0); /*DW9*/
    OUT_BCS_BATCH(batch, 0); /*DW10*/
    OUT_BCS_BATCH(batch, 0); /*DW11*/
    OUT_BCS_BATCH(batch, 0); /*DW12*/
    OUT_BCS_BATCH(batch, 0); /*DW13*/
    OUT_BCS_BATCH(batch, 0); /*DW14*/
    OUT_BCS_BATCH(batch, 0); /*DW15*/
    OUT_BCS_BATCH(batch, 0); /*DW16*/
    OUT_BCS_BATCH(batch, 0); /*DW17*/
    OUT_BCS_BATCH(batch, 0); /*DW18*/
    OUT_BCS_BATCH(batch, 0); /*DW19*/
    OUT_BCS_BATCH(batch, 0); /*DW20*/
    OUT_BCS_BATCH(batch, 0); /*DW21*/

    OUT_BCS_BATCH(batch, 
                 pic_param->pic_flags.bits.show_frame << 23 |
                 pic_param->pic_flags.bits.version << 20
                 ); /*DW22*/

    /* DW23: scaled height in the upper half-word, scaled width in the lower. */
    OUT_BCS_BATCH(batch,
                 (seq_param->frame_height_scale << 14 | seq_param->frame_height) << 16 |
                 (seq_param->frame_width_scale << 14 | seq_param->frame_width) << 0
                 );

    /*DW24*/
    OUT_BCS_BATCH(batch, mfc_context->vp8_state.frame_header_bit_count); /* frame header bit count */

    /*DW25*/
    OUT_BCS_BATCH(batch, mfc_context->vp8_state.frame_header_qindex_update_pos); /* frame header bin buffer qindex update pointer */

    /*DW26*/
    OUT_BCS_BATCH(batch, mfc_context->vp8_state.frame_header_lf_update_pos); /* frame header bin buffer loop filter update pointer*/

    /*DW27*/
    OUT_BCS_BATCH(batch, mfc_context->vp8_state.frame_header_token_update_pos); /* frame header bin buffer token update pointer */

    /*DW28*/
    OUT_BCS_BATCH(batch, mfc_context->vp8_state.frame_header_bin_mv_upate_pos); /*frame header bin buffer mv update pointer */

    /*DW29*/
    OUT_BCS_BATCH(batch, 0);

    ADVANCE_BCS_BATCH(batch);
}
3924
/*
 * Emit the MFX_VP8_PIC_STATE command (38 DWs): frame dimensions in MBs,
 * picture flags, loop filter levels, quantizer indices and deltas, and the
 * probability tables maintained in vp8_state.  The DW layout is fixed by
 * the hardware; do not reorder these writes.
 */
static void
gen8_mfc_vp8_pic_state(VADriverContextP ctx,
                       struct encode_state *encode_state,
                       struct intel_encoder_context *encoder_context)
{
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
    VAEncSequenceParameterBufferVP8 *seq_param = (VAEncSequenceParameterBufferVP8 *)encode_state->seq_param_ext->buffer;
    VAEncPictureParameterBufferVP8 *pic_param = (VAEncPictureParameterBufferVP8 *)encode_state->pic_param_ext->buffer;
    VAQMatrixBufferVP8 *q_matrix = (VAQMatrixBufferVP8 *)encode_state->q_matrix->buffer;
    int i, j, log2num;

    log2num = pic_param->pic_flags.bits.num_token_partitions;

    /*update mode and token probs*/
    vp8_enc_state_update(mfc_context, q_matrix);

    BEGIN_BCS_BATCH(batch, 38);
    OUT_BCS_BATCH(batch, MFX_VP8_PIC_STATE | (38 - 2));
    /* DW1: frame size in macroblocks, minus one in each dimension. */
    OUT_BCS_BATCH(batch,
                  (ALIGN(seq_param->frame_height, 16) / 16 - 1) << 16 |
                  (ALIGN(seq_param->frame_width, 16) / 16 - 1) << 0);
 
    OUT_BCS_BATCH(batch,
                  log2num << 24 |
                  pic_param->sharpness_level << 16 |
                  pic_param->pic_flags.bits.sign_bias_alternate << 13 |
                  pic_param->pic_flags.bits.sign_bias_golden << 12 |
                  pic_param->pic_flags.bits.loop_filter_adj_enable << 11 |
                  pic_param->pic_flags.bits.mb_no_coeff_skip << 10 |
                  pic_param->pic_flags.bits.update_mb_segmentation_map << 9 |
                  pic_param->pic_flags.bits.segmentation_enabled << 8 |
                  !pic_param->pic_flags.bits.frame_type << 5 | /* 0 indicate an intra frame in VP8 stream/spec($9.1)*/
                  (pic_param->pic_flags.bits.version / 2) << 4 |
                  (pic_param->pic_flags.bits.version == 3) << 1 | /* full pixel mode for version 3 */
                  !!pic_param->pic_flags.bits.version << 0); /* version 0: 6 tap */
 
    /* DW3: per-segment loop filter levels. */
    OUT_BCS_BATCH(batch,
                  pic_param->loop_filter_level[3] << 24 |
                  pic_param->loop_filter_level[2] << 16 |
                  pic_param->loop_filter_level[1] <<  8 |
                  pic_param->loop_filter_level[0] <<  0);

    /* DW4: per-segment quantizer indices. */
    OUT_BCS_BATCH(batch,
                  q_matrix->quantization_index[3] << 24 |
                  q_matrix->quantization_index[2] << 16 |
                  q_matrix->quantization_index[1] <<  8 |
                  q_matrix->quantization_index[0] << 0);

    /* DW5-DW6: qindex deltas in sign/magnitude form; the >> 15 extracts
     * the sign bit of the 16-bit delta, abs() gives the magnitude. */
    OUT_BCS_BATCH(batch,
                 ((unsigned short)(q_matrix->quantization_index_delta[4]) >> 15) << 28 | 
                 abs(q_matrix->quantization_index_delta[4]) << 24 |
                 ((unsigned short)(q_matrix->quantization_index_delta[3]) >> 15) << 20 | 
                 abs(q_matrix->quantization_index_delta[3]) << 16 |
                 ((unsigned short)(q_matrix->quantization_index_delta[2]) >> 15) << 12 | 
                 abs(q_matrix->quantization_index_delta[2]) << 8 |
                 ((unsigned short)(q_matrix->quantization_index_delta[1]) >> 15) << 4 | 
                 abs(q_matrix->quantization_index_delta[1]) << 0);

    OUT_BCS_BATCH(batch,
                 ((unsigned short)(q_matrix->quantization_index_delta[0]) >> 15) << 4 |
                 abs(q_matrix->quantization_index_delta[0]) << 0);
    
    /* DW7: clamp range for the quantizer index. */
    OUT_BCS_BATCH(batch,
                 pic_param->clamp_qindex_high << 8 |
                 pic_param->clamp_qindex_low << 0);

    /* DW8-DW18: all-ones fill (hardware defaults for these fields). */
    for (i = 8; i < 19; i++) {
         OUT_BCS_BATCH(batch, 0xffffffff);
    }

    /* DW19: MB segment tree probabilities. */
    OUT_BCS_BATCH(batch,
                  mfc_context->vp8_state.mb_segment_tree_probs[2] << 16 |
                  mfc_context->vp8_state.mb_segment_tree_probs[1] <<  8 |
                  mfc_context->vp8_state.mb_segment_tree_probs[0] <<  0);

    OUT_BCS_BATCH(batch,
                  mfc_context->vp8_state.prob_skip_false << 24 |
                  mfc_context->vp8_state.prob_intra      << 16 |
                  mfc_context->vp8_state.prob_last       <<  8 |
                  mfc_context->vp8_state.prob_gf         <<  0);

    OUT_BCS_BATCH(batch,
                  mfc_context->vp8_state.y_mode_probs[3] << 24 |
                  mfc_context->vp8_state.y_mode_probs[2] << 16 |
                  mfc_context->vp8_state.y_mode_probs[1] <<  8 |
                  mfc_context->vp8_state.y_mode_probs[0] <<  0);

    OUT_BCS_BATCH(batch,
                  mfc_context->vp8_state.uv_mode_probs[2] << 16 |
                  mfc_context->vp8_state.uv_mode_probs[1] <<  8 |
                  mfc_context->vp8_state.uv_mode_probs[0] <<  0);
    
    /* MV update value, DW23-DW32 */
    for (i = 0; i < 2; i++) {
        for (j = 0; j < 20; j += 4) {
            /* Only 19 MV probs per set; the 20th byte is padded with 0. */
            OUT_BCS_BATCH(batch,
                          (j + 3 == 19 ? 0 : mfc_context->vp8_state.mv_probs[i][j + 3]) << 24 |
                          mfc_context->vp8_state.mv_probs[i][j + 2] << 16 |
                          mfc_context->vp8_state.mv_probs[i][j + 1] <<  8 |
                          mfc_context->vp8_state.mv_probs[i][j + 0] <<  0);
        }
    }

    /* DW33-DW34: loop filter deltas, masked to 7 bits per field. */
    OUT_BCS_BATCH(batch,
                  (pic_param->ref_lf_delta[3] & 0x7f) << 24 |
                  (pic_param->ref_lf_delta[2] & 0x7f) << 16 |
                  (pic_param->ref_lf_delta[1] & 0x7f) <<  8 |
                  (pic_param->ref_lf_delta[0] & 0x7f) <<  0);

    OUT_BCS_BATCH(batch,
                  (pic_param->mode_lf_delta[3] & 0x7f) << 24 |
                  (pic_param->mode_lf_delta[2] & 0x7f) << 16 |
                  (pic_param->mode_lf_delta[1] & 0x7f) <<  8 |
                  (pic_param->mode_lf_delta[0] & 0x7f) <<  0);

    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);

    ADVANCE_BCS_BATCH(batch);
}
4047
/*
 * Emit a 3-DW buffer entry: a 64-bit relocation (or two zero DWs when the
 * buffer is absent) followed by the MOCS DW.  Relies on `batch` and `i965`
 * being in scope at the expansion site.  Wrapped in do { } while (0) so it
 * behaves as a single statement even inside an unbraced if/else.
 */
#define OUT_VP8_BUFFER(bo, offset)                                      \
    do {                                                                \
        if (bo)                                                         \
            OUT_BCS_RELOC64(batch,                                      \
                            (bo),                                       \
                            I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, \
                            (offset));                                  \
        else {                                                          \
            OUT_BCS_BATCH(batch, 0);                                    \
            OUT_BCS_BATCH(batch, 0);                                    \
        }                                                               \
        OUT_BCS_BATCH(batch, i965->intel.mocs_state);                   \
    } while (0)
4059
/*
 * Emit the MFX_VP8_BSP_BUF_BASE_ADDR_STATE command (32 DWs): base addresses
 * for the frame header, intermediate token partitions (with their offsets
 * and max size), final frame output, stream-out, coefficient probability
 * stream-in, token statistics, and MPC row store buffers.
 */
static void 
gen8_mfc_vp8_bsp_buf_base_addr_state(VADriverContextP ctx, 
                                     struct encode_state *encode_state,
                                     struct intel_encoder_context *encoder_context)
{
    struct i965_driver_data *i965 = i965_driver_data(ctx);
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;

    BEGIN_BCS_BATCH(batch, 32);
    OUT_BCS_BATCH(batch, MFX_VP8_BSP_BUF_BASE_ADDR_STATE | (32 - 2));

    OUT_VP8_BUFFER(mfc_context->vp8_state.frame_header_bo, 0);

    /* Intermediate buffer plus the eight token-partition offsets. */
    OUT_VP8_BUFFER(mfc_context->vp8_state.intermediate_bo, 0);
    OUT_BCS_BATCH(batch, mfc_context->vp8_state.intermediate_partition_offset[0]);
    OUT_BCS_BATCH(batch, mfc_context->vp8_state.intermediate_partition_offset[1]);
    OUT_BCS_BATCH(batch, mfc_context->vp8_state.intermediate_partition_offset[2]);
    OUT_BCS_BATCH(batch, mfc_context->vp8_state.intermediate_partition_offset[3]);
    OUT_BCS_BATCH(batch, mfc_context->vp8_state.intermediate_partition_offset[4]);
    OUT_BCS_BATCH(batch, mfc_context->vp8_state.intermediate_partition_offset[5]);
    OUT_BCS_BATCH(batch, mfc_context->vp8_state.intermediate_partition_offset[6]);
    OUT_BCS_BATCH(batch, mfc_context->vp8_state.intermediate_partition_offset[7]);
    OUT_BCS_BATCH(batch, mfc_context->vp8_state.intermediate_buffer_max_size);

    /* Final bitstream lands after the driver's coded-buffer header. */
    OUT_VP8_BUFFER(mfc_context->vp8_state.final_frame_bo, I965_CODEDBUFFER_HEADER_SIZE);
    OUT_BCS_BATCH(batch, 0);

    OUT_VP8_BUFFER(mfc_context->vp8_state.stream_out_bo, 0);
    OUT_VP8_BUFFER(mfc_context->vp8_state.coeff_probs_stream_in_bo, 0);
    OUT_VP8_BUFFER(mfc_context->vp8_state.token_statistics_bo, 0);
    OUT_VP8_BUFFER(mfc_context->vp8_state.mpc_row_store_bo, 0);

    ADVANCE_BCS_BATCH(batch);
}
4095
/*
 * Program the full per-picture MFX state for VP8 PAK.  The commands must be
 * emitted in this order: pipe mode select, surface and indirect-object
 * state, buffer base addresses, then picture state and encoder config.
 */
static void
gen8_mfc_vp8_pipeline_picture_programing(VADriverContextP ctx,
                                           struct encode_state *encode_state,
                                           struct intel_encoder_context *encoder_context)
{
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;

    mfc_context->pipe_mode_select(ctx, MFX_FORMAT_VP8, encoder_context);
    mfc_context->set_surface_state(ctx, encoder_context);
    mfc_context->ind_obj_base_addr_state(ctx, encoder_context);
    gen8_mfc_pipe_buf_addr_state(ctx, encoder_context);
    gen8_mfc_bsp_buf_base_addr_state(ctx, encoder_context);
    gen8_mfc_vp8_bsp_buf_base_addr_state(ctx, encode_state, encoder_context);
    gen8_mfc_vp8_pic_state(ctx, encode_state,encoder_context);
    gen8_mfc_vp8_encoder_cfg(ctx, encode_state, encoder_context);
}
4112
/* Maps a VME intra 16x16 (or chroma) prediction mode index to the
 * corresponding PAK mode code. */
static const unsigned char
vp8_intra_mb_mode_map[VME_MB_INTRA_MODE_COUNT] = {
    PAK_V_PRED,
    PAK_H_PRED,
    PAK_DC_PRED,
    PAK_TM_PRED
};
4120
/* Maps a VME intra 4x4 sub-block prediction mode index to the
 * corresponding PAK block mode code. */
static const unsigned char
vp8_intra_block_mode_map[VME_B_INTRA_MODE_COUNT] = {
    PAK_B_VE_PRED,
    PAK_B_HE_PRED,
    PAK_B_DC_PRED,
    PAK_B_LD_PRED,
    PAK_B_RD_PRED,
    PAK_B_VR_PRED,
    PAK_B_HD_PRED,
    PAK_B_VL_PRED,
    PAK_B_HU_PRED
};
4133
4134 static int inline gen8_mfc_vp8_intra_mb_mode_map(unsigned int vme_pred_mode, int is_luma_4x4)
4135 {
4136     unsigned int i, pak_pred_mode = 0;
4137     unsigned int vme_sub_blocks_pred_mode[8], pak_sub_blocks_pred_mode[8]; /* 8 blocks's intra mode */
4138
4139     if (!is_luma_4x4) {
4140         pak_pred_mode = vp8_intra_mb_mode_map[vme_pred_mode & 0x3];
4141     } else {
4142         for (i = 0; i < 8; i++) { 
4143             vme_sub_blocks_pred_mode[i] = ((vme_pred_mode >> (4 * i)) & 0xf);
4144             assert(vme_sub_blocks_pred_mode[i] < VME_B_INTRA_MODE_COUNT);
4145             pak_sub_blocks_pred_mode[i] = vp8_intra_block_mode_map[vme_sub_blocks_pred_mode[i]];
4146             pak_pred_mode |= (pak_sub_blocks_pred_mode[i] << (4 * i));
4147         }
4148     }
4149
4150     return pak_pred_mode;
4151 }
4152 static void
4153 gen8_mfc_vp8_pak_object_intra(VADriverContextP ctx, 
4154                               struct intel_encoder_context *encoder_context,
4155                               unsigned int *msg,
4156                               int x, int y,
4157                               struct intel_batchbuffer *batch)
4158 {
4159     unsigned int vme_intra_mb_mode, vme_chroma_pred_mode;
4160     unsigned int pak_intra_mb_mode, pak_chroma_pred_mode;
4161     unsigned int vme_luma_pred_mode[2], pak_luma_pred_mode[2];
4162
4163     if (batch == NULL)
4164         batch = encoder_context->base.batch;
4165
4166     vme_intra_mb_mode = ((msg[0] & 0x30) >> 4);
4167     assert((vme_intra_mb_mode == 0) || (vme_intra_mb_mode == 2)); //vp8 only support intra_16x16 and intra_4x4
4168     pak_intra_mb_mode = (vme_intra_mb_mode >> 1);
4169
4170     vme_luma_pred_mode[0] = msg[1];
4171     vme_luma_pred_mode[1] = msg[2];
4172     vme_chroma_pred_mode = msg[3] & 0x3;
4173
4174     pak_luma_pred_mode[0] = gen8_mfc_vp8_intra_mb_mode_map(vme_luma_pred_mode[0], pak_intra_mb_mode);
4175     pak_luma_pred_mode[1] = gen8_mfc_vp8_intra_mb_mode_map(vme_luma_pred_mode[1], pak_intra_mb_mode);
4176     pak_chroma_pred_mode = gen8_mfc_vp8_intra_mb_mode_map(vme_chroma_pred_mode, 0);
4177
4178     BEGIN_BCS_BATCH(batch, 7);
4179
4180     OUT_BCS_BATCH(batch, MFX_VP8_PAK_OBJECT | (7 - 2));
4181     OUT_BCS_BATCH(batch, 0);
4182     OUT_BCS_BATCH(batch, 0);
4183     OUT_BCS_BATCH(batch,
4184                   (0 << 20) |                    /* mv format: intra mb */
4185                   (0 << 18) |                    /* Segment ID */
4186                   (0 << 17) |                    /* disable coeff clamp */
4187                   (1 << 13) |                    /* intra mb flag */
4188                   (0 << 11) |                    /* refer picture select: last frame */
4189                   (pak_intra_mb_mode << 8) |     /* mb type */
4190                   (pak_chroma_pred_mode << 4) |  /* mb uv mode */
4191                   (0 << 2) |                     /* skip mb flag: disable */
4192                   0);
4193
4194     OUT_BCS_BATCH(batch, (y << 16) | x);
4195     OUT_BCS_BATCH(batch, pak_luma_pred_mode[0]);
4196     OUT_BCS_BATCH(batch, pak_luma_pred_mode[1]);
4197
4198     ADVANCE_BCS_BATCH(batch);
4199 }
4200
/*
 * Emit one MFX_VP8_PAK_OBJECT command for an inter macroblock.
 *
 * msg points at the (mapped) VME output record for this macroblock and
 * is rewritten in place so that its first 16 dwords all hold the same
 * doubled motion vector.  offset is the byte offset of that record in
 * the VME output buffer; x/y are the macroblock coordinates.  When
 * batch is NULL the encoder's default batch buffer is used.
 */
static void
gen8_mfc_vp8_pak_object_inter(VADriverContextP ctx, 
                              struct intel_encoder_context *encoder_context,
                              unsigned int *msg,
                              int offset,
                              int x, int y,
                              struct intel_batchbuffer *batch)
{
    int i;

    if (batch == NULL)
        batch = encoder_context->base.batch;

    /* only support inter_16x16 now */
    assert((msg[AVC_INTER_MSG_OFFSET] & INTER_MODE_MASK) == INTER_16X16);
    /* For inter_16x16 all 16 MVs must be identical; move the MV to the
     * start of the VME MB record so the offset handed to the PAK command
     * stays 64-byte aligned.  Per the VP8 spec, all luma motion vectors
     * are stored doubled.
     */
    msg[0] = (((msg[AVC_INTER_MV_OFFSET/4] & 0xffff0000) << 1) | ((msg[AVC_INTER_MV_OFFSET/4] << 1) & 0xffff));

    for (i = 1; i < 16; i++) {
        msg[i] = msg[0];
    }

    BEGIN_BCS_BATCH(batch, 7);

    OUT_BCS_BATCH(batch, MFX_VP8_PAK_OBJECT | (7 - 2));
    OUT_BCS_BATCH(batch,
                  (0 << 29) |           /* enable inline mv data: disable */
                  64);
    OUT_BCS_BATCH(batch,
                  offset);
    OUT_BCS_BATCH(batch,
                  (4 << 20) |           /* mv format: inter */
                  (0 << 18) |           /* Segment ID */
                  (0 << 17) |           /* coeff clamp: disable */
                  (0 << 13) |           /* intra mb flag: inter mb */
                  (0 << 11) |           /* refer picture select: last frame */
                  (0 << 8) |            /* mb type: 16x16 */
                  (0 << 4) |            /* mb uv mode: dc_pred */
                  (0 << 2) |            /* skip mb flag: disable */
                  0);

    OUT_BCS_BATCH(batch, (y << 16) | x);

    /*new mv*/
    OUT_BCS_BATCH(batch, 0x8);
    OUT_BCS_BATCH(batch, 0x8);

    ADVANCE_BCS_BATCH(batch);
}
4253
/*
 * Walk every macroblock of the frame and emit one PAK object command
 * per MB into slice_batch, choosing intra vs. inter from the VME output.
 *
 * For intra (key) frames every MB is coded intra.  For inter frames the
 * cheaper of the intra/inter RDO costs reported by VME decides per MB.
 * The VME output bo stays mapped for the whole walk.
 */
static void
gen8_mfc_vp8_pak_pipeline(VADriverContextP ctx,
                          struct encode_state *encode_state,
                          struct intel_encoder_context *encoder_context,
                          struct intel_batchbuffer *slice_batch)
{
    struct gen6_vme_context *vme_context = encoder_context->vme_context;
    VAEncSequenceParameterBufferVP8 *seq_param = (VAEncSequenceParameterBufferVP8 *)encode_state->seq_param_ext->buffer;
    VAEncPictureParameterBufferVP8 *pic_param = (VAEncPictureParameterBufferVP8 *)encode_state->pic_param_ext->buffer;
    int width_in_mbs = ALIGN(seq_param->frame_width, 16) / 16;
    int height_in_mbs = ALIGN(seq_param->frame_height, 16) / 16;
    unsigned int *msg = NULL;
    unsigned char *msg_ptr = NULL;
    unsigned int i, offset, is_intra_frame;

    /* frame_type == 0 means a key (intra) frame */
    is_intra_frame = !pic_param->pic_flags.bits.frame_type;

    dri_bo_map(vme_context->vme_output.bo , 1);
    msg_ptr = (unsigned char *)vme_context->vme_output.bo->virtual;

    for( i = 0; i < width_in_mbs * height_in_mbs; i++) {
        int h_pos = i % width_in_mbs;
        int v_pos = i / width_in_mbs;
        /* one fixed-size VME output record per macroblock */
        msg = (unsigned int *) (msg_ptr + i * vme_context->vme_output.size_block);

        if (is_intra_frame) {
            gen8_mfc_vp8_pak_object_intra(ctx,
                    encoder_context,
                    msg,
                    h_pos, v_pos,
                    slice_batch);
        } else {
            int inter_rdo, intra_rdo;
            inter_rdo = msg[AVC_INTER_RDO_OFFSET] & AVC_RDO_MASK;
            intra_rdo = msg[AVC_INTRA_RDO_OFFSET] & AVC_RDO_MASK;

            if (intra_rdo < inter_rdo) {
                gen8_mfc_vp8_pak_object_intra(ctx,
                        encoder_context,
                        msg,
                        h_pos, v_pos,
                        slice_batch);
            } else {
                offset = i * vme_context->vme_output.size_block;
                gen8_mfc_vp8_pak_object_inter(ctx,
                        encoder_context,
                        msg,
                        offset,
                        h_pos, v_pos,
                        slice_batch);
            }
        }
    }

    dri_bo_unmap(vme_context->vme_output.bo);
}
4310
/*
 * Build a second-level batch buffer holding all vp8 pak object commands
 * for the frame, terminated with MI_BATCH_BUFFER_END.
 *
 * Ownership of the returned bo is transferred to the caller, which must
 * unreference it; the aux batchbuffer itself is freed here and cleared
 * on the context.
 */
static dri_bo *
gen8_mfc_vp8_software_batchbuffer(VADriverContextP ctx,
                                          struct encode_state *encode_state,
                                          struct intel_encoder_context *encoder_context)
{
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
    struct intel_batchbuffer *batch;
    dri_bo *batch_bo;

    batch = mfc_context->aux_batchbuffer;
    batch_bo = batch->buffer;

    gen8_mfc_vp8_pak_pipeline(ctx, encode_state, encoder_context, batch);

    intel_batchbuffer_align(batch, 8);

    /* pad with a noop dword, then terminate the batch */
    BEGIN_BCS_BATCH(batch, 2);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, MI_BATCH_BUFFER_END);
    ADVANCE_BCS_BATCH(batch);

    /* keep the bo alive for the caller after the batchbuffer is freed */
    dri_bo_reference(batch_bo);
    intel_batchbuffer_free(batch);
    mfc_context->aux_batchbuffer = NULL;

    return batch_bo;
}
4341
/*
 * Top-level BCS programming for one VP8 frame: build the second-level
 * batch containing all PAK object commands, then emit the picture-level
 * state followed by an MI_BATCH_BUFFER_START that chains to it.
 */
static void
gen8_mfc_vp8_pipeline_programing(VADriverContextP ctx,
                                   struct encode_state *encode_state,
                                   struct intel_encoder_context *encoder_context)
{
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    dri_bo *slice_batch_bo;

    slice_batch_bo = gen8_mfc_vp8_software_batchbuffer(ctx, encode_state, encoder_context);

    // begin programing
    intel_batchbuffer_start_atomic_bcs(batch, 0x4000);
    intel_batchbuffer_emit_mi_flush(batch);

    // picture level programing
    gen8_mfc_vp8_pipeline_picture_programing(ctx, encode_state, encoder_context);

    /* chain to the second-level batch holding the PAK object commands */
    BEGIN_BCS_BATCH(batch, 4);
    OUT_BCS_BATCH(batch, MI_BATCH_BUFFER_START | (1 << 8) | (1 << 0));
    OUT_BCS_RELOC64(batch,
                  slice_batch_bo,
                  I915_GEM_DOMAIN_COMMAND, 0,
                  0);
    OUT_BCS_BATCH(batch, 0);
    ADVANCE_BCS_BATCH(batch);

    // end programing
    intel_batchbuffer_end_atomic(batch);

    dri_bo_unreference(slice_batch_bo);
}
4373
/*
 * Read the VP8 token statistics written back by the hardware, compute
 * the total coded frame size in bytes, and store it in the coded buffer
 * segment of the final frame bo.
 *
 * Returns the coded size in bytes.
 */
static int gen8_mfc_calc_vp8_coded_buffer_size(VADriverContextP ctx,
                          struct encode_state *encode_state,
                          struct intel_encoder_context *encoder_context)
{
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
    VAEncPictureParameterBufferVP8 *pic_param = (VAEncPictureParameterBufferVP8 *)encode_state->pic_param_ext->buffer;
    unsigned char is_intra_frame = !pic_param->pic_flags.bits.frame_type;
    unsigned int *vp8_encoding_status, i, first_partition_bytes, token_partition_bytes, vp8_coded_bytes;

    int partition_num = 1 << pic_param->pic_flags.bits.num_token_partitions;

    first_partition_bytes = token_partition_bytes = vp8_coded_bytes = 0;

    dri_bo_map(mfc_context->vp8_state.token_statistics_bo, 0);

    vp8_encoding_status = (unsigned int *)mfc_context->vp8_state.token_statistics_bo->virtual;
    /* the status entries hold bit counts; round each partition up to bytes */
    first_partition_bytes = (vp8_encoding_status[0] + 7) / 8;

    for (i = 1; i <= partition_num; i++) 
        token_partition_bytes += (vp8_encoding_status[i] + 7) / 8;

    /* coded_bytes includes P0~P8 partition bytes + uncompressed data chunk
     * bytes + partition_size bytes in the bitstream + 3 extra bytes:
     * the last partition size in the vp8 status buffer seems smaller than
     * reality, so add 3 extra bytes. */
    vp8_coded_bytes = first_partition_bytes + token_partition_bytes + (3 + 7 * !!is_intra_frame) + (partition_num - 1) * 3 + 3;

    dri_bo_unmap(mfc_context->vp8_state.token_statistics_bo);

    dri_bo_map(mfc_context->vp8_state.final_frame_bo, 0);
    struct i965_coded_buffer_segment *coded_buffer_segment = (struct i965_coded_buffer_segment *)(mfc_context->vp8_state.final_frame_bo->virtual);
    coded_buffer_segment->base.size = vp8_coded_bytes;
    dri_bo_unmap(mfc_context->vp8_state.final_frame_bo);

    return vp8_coded_bytes;
}
4408
/*
 * Encode one VP8 frame: initialize per-frame state, program and run the
 * BCS pipeline, then apply CBR bit-rate control on the coded size.
 *
 * Always returns VA_STATUS_SUCCESS; an unrepairable HRD violation is
 * reported to stderr only once per context.
 */
static VAStatus
gen8_mfc_vp8_encode_picture(VADriverContextP ctx,
                              struct encode_state *encode_state,
                              struct intel_encoder_context *encoder_context)
{
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
    unsigned int rate_control_mode = encoder_context->rate_control_mode;
    int current_frame_bits_size;
    int sts;

    gen8_mfc_vp8_init(ctx, encode_state, encoder_context);
    intel_mfc_vp8_prepare(ctx, encode_state, encoder_context);
    /*Programing bcs pipeline*/
    gen8_mfc_vp8_pipeline_programing(ctx, encode_state, encoder_context);
    gen8_mfc_run(ctx, encode_state, encoder_context);
    /* coded size in bits for the BRC pass */
    current_frame_bits_size = 8 * gen8_mfc_calc_vp8_coded_buffer_size(ctx, encode_state, encoder_context);

    if (rate_control_mode == VA_RC_CBR /*|| rate_control_mode == VA_RC_VBR*/) {
        sts = gen8_mfc_vp8_brc_postpack(encode_state, encoder_context, current_frame_bits_size);
        if (sts == BRC_NO_HRD_VIOLATION) {
            gen8_mfc_vp8_hrd_context_update(encode_state, mfc_context);
        }
        else if (sts == BRC_OVERFLOW_WITH_MIN_QP || sts == BRC_UNDERFLOW_WITH_MAX_QP) {
            if (!mfc_context->hrd.violation_noted) {
                fprintf(stderr, "Unrepairable %s!\n", (sts == BRC_OVERFLOW_WITH_MIN_QP)? "overflow": "underflow");
                mfc_context->hrd.violation_noted = 1;
            }
            return VA_STATUS_SUCCESS;
        }
    }

    return VA_STATUS_SUCCESS;
}
4442
4443 static void
4444 gen8_mfc_context_destroy(void *context)
4445 {
4446     struct gen6_mfc_context *mfc_context = context;
4447     int i;
4448
4449     dri_bo_unreference(mfc_context->post_deblocking_output.bo);
4450     mfc_context->post_deblocking_output.bo = NULL;
4451
4452     dri_bo_unreference(mfc_context->pre_deblocking_output.bo);
4453     mfc_context->pre_deblocking_output.bo = NULL;
4454
4455     dri_bo_unreference(mfc_context->uncompressed_picture_source.bo);
4456     mfc_context->uncompressed_picture_source.bo = NULL;
4457
4458     dri_bo_unreference(mfc_context->mfc_indirect_pak_bse_object.bo); 
4459     mfc_context->mfc_indirect_pak_bse_object.bo = NULL;
4460
4461     for (i = 0; i < NUM_MFC_DMV_BUFFERS; i++){
4462         dri_bo_unreference(mfc_context->direct_mv_buffers[i].bo);
4463         mfc_context->direct_mv_buffers[i].bo = NULL;
4464     }
4465
4466     dri_bo_unreference(mfc_context->intra_row_store_scratch_buffer.bo);
4467     mfc_context->intra_row_store_scratch_buffer.bo = NULL;
4468
4469     dri_bo_unreference(mfc_context->macroblock_status_buffer.bo);
4470     mfc_context->macroblock_status_buffer.bo = NULL;
4471
4472     dri_bo_unreference(mfc_context->deblocking_filter_row_store_scratch_buffer.bo);
4473     mfc_context->deblocking_filter_row_store_scratch_buffer.bo = NULL;
4474
4475     dri_bo_unreference(mfc_context->bsd_mpc_row_store_scratch_buffer.bo);
4476     mfc_context->bsd_mpc_row_store_scratch_buffer.bo = NULL;
4477
4478
4479     for (i = 0; i < MAX_MFC_REFERENCE_SURFACES; i++){
4480         dri_bo_unreference(mfc_context->reference_surfaces[i].bo);
4481         mfc_context->reference_surfaces[i].bo = NULL;  
4482     }
4483
4484     gen8_gpe_context_destroy(&mfc_context->gpe_context);
4485
4486     dri_bo_unreference(mfc_context->mfc_batchbuffer_surface.bo);
4487     mfc_context->mfc_batchbuffer_surface.bo = NULL;
4488
4489     dri_bo_unreference(mfc_context->aux_batchbuffer_surface.bo);
4490     mfc_context->aux_batchbuffer_surface.bo = NULL;
4491
4492     if (mfc_context->aux_batchbuffer)
4493         intel_batchbuffer_free(mfc_context->aux_batchbuffer);
4494
4495     mfc_context->aux_batchbuffer = NULL;
4496
4497     dri_bo_unreference(mfc_context->vp8_state.coeff_probs_stream_in_bo);
4498     mfc_context->vp8_state.coeff_probs_stream_in_bo = NULL;
4499
4500     dri_bo_unreference(mfc_context->vp8_state.final_frame_bo);
4501     mfc_context->vp8_state.final_frame_bo = NULL;
4502
4503     dri_bo_unreference(mfc_context->vp8_state.frame_header_bo);
4504     mfc_context->vp8_state.frame_header_bo = NULL;
4505
4506     dri_bo_unreference(mfc_context->vp8_state.intermediate_bo);
4507     mfc_context->vp8_state.intermediate_bo = NULL;
4508
4509     dri_bo_unreference(mfc_context->vp8_state.mpc_row_store_bo);
4510     mfc_context->vp8_state.mpc_row_store_bo = NULL;
4511
4512     dri_bo_unreference(mfc_context->vp8_state.stream_out_bo);
4513     mfc_context->vp8_state.stream_out_bo = NULL;
4514
4515     dri_bo_unreference(mfc_context->vp8_state.token_statistics_bo);
4516     mfc_context->vp8_state.token_statistics_bo = NULL;
4517
4518     free(mfc_context);
4519 }
4520
4521 static VAStatus gen8_mfc_pipeline(VADriverContextP ctx,
4522                                   VAProfile profile,
4523                                   struct encode_state *encode_state,
4524                                   struct intel_encoder_context *encoder_context)
4525 {
4526     VAStatus vaStatus;
4527
4528     switch (profile) {
4529     case VAProfileH264ConstrainedBaseline:
4530     case VAProfileH264Main:
4531     case VAProfileH264High:
4532     case VAProfileH264MultiviewHigh:
4533     case VAProfileH264StereoHigh:
4534         vaStatus = gen8_mfc_avc_encode_picture(ctx, encode_state, encoder_context);
4535         break;
4536
4537         /* FIXME: add for other profile */
4538     case VAProfileMPEG2Simple:
4539     case VAProfileMPEG2Main:
4540         vaStatus = gen8_mfc_mpeg2_encode_picture(ctx, encode_state, encoder_context);
4541         break;
4542
4543     case VAProfileJPEGBaseline:
4544         jpeg_init_default_qmatrix(ctx, encoder_context);
4545         vaStatus = gen8_mfc_jpeg_encode_picture(ctx, encode_state, encoder_context);
4546         break;
4547  
4548     case VAProfileVP8Version0_3:
4549         vaStatus = gen8_mfc_vp8_encode_picture(ctx, encode_state, encoder_context);
4550         break;
4551  
4552     default:
4553         vaStatus = VA_STATUS_ERROR_UNSUPPORTED_PROFILE;
4554         break;
4555     }
4556
4557     return vaStatus;
4558 }
4559
4560 extern Bool i965_encoder_vp8_pak_context_init(VADriverContextP ctx, struct intel_encoder_context *encoder_context);
4561
/*
 * Allocate and initialize the gen8 MFC context: GPE state and kernels,
 * plus the codec-independent MFX hooks on the encoder context.
 *
 * On Cherryview, VP8 is handled by a dedicated PAK context instead.
 * Returns True on success.
 */
Bool gen8_mfc_context_init(VADriverContextP ctx, struct intel_encoder_context *encoder_context)
{
    struct i965_driver_data *i965 = i965_driver_data(ctx);
    struct gen6_mfc_context *mfc_context;

    if (IS_CHERRYVIEW(i965->intel.device_info) && encoder_context->codec == CODEC_VP8)
        return i965_encoder_vp8_pak_context_init(ctx, encoder_context);

    mfc_context = calloc(1, sizeof(struct gen6_mfc_context));
    assert(mfc_context);
    mfc_context->gpe_context.surface_state_binding_table.length = (SURFACE_STATE_PADDED_SIZE + sizeof(unsigned int)) * MAX_MEDIA_SURFACES_GEN6;

    mfc_context->gpe_context.idrt.entry_size = ALIGN(sizeof(struct gen8_interface_descriptor_data), 64);
    mfc_context->gpe_context.idrt.max_entries = MAX_INTERFACE_DESC_GEN6;
    mfc_context->gpe_context.curbe.length = 32 * 4;
    mfc_context->gpe_context.sampler.entry_size = 0;
    mfc_context->gpe_context.sampler.max_entries = 0;

    /* scale the thread count with the available EUs when known */
    if (i965->intel.eu_total > 0)
        mfc_context->gpe_context.vfe_state.max_num_threads = 6 * i965->intel.eu_total;
    else
        mfc_context->gpe_context.vfe_state.max_num_threads = 60 - 1;

    mfc_context->gpe_context.vfe_state.num_urb_entries = 16;
    mfc_context->gpe_context.vfe_state.gpgpu_mode = 0;
    mfc_context->gpe_context.vfe_state.urb_entry_size = 59 - 1;
    mfc_context->gpe_context.vfe_state.curbe_allocation_size = 37 - 1;

    /* gen9 and gen8 share this context but use different kernel binaries */
    if (IS_GEN9(i965->intel.device_info)) {
        gen8_gpe_load_kernels(ctx,
                          &mfc_context->gpe_context,
                          gen9_mfc_kernels,
                          1);
    } else {
        gen8_gpe_load_kernels(ctx,
                          &mfc_context->gpe_context,
                          gen8_mfc_kernels,
                          1);
    }

    /* common MFX hooks shared by all codecs on gen8 */
    mfc_context->pipe_mode_select = gen8_mfc_pipe_mode_select;
    mfc_context->set_surface_state = gen8_mfc_surface_state;
    mfc_context->ind_obj_base_addr_state = gen8_mfc_ind_obj_base_addr_state;
    mfc_context->avc_img_state = gen8_mfc_avc_img_state;
    mfc_context->avc_qm_state = gen8_mfc_avc_qm_state;
    mfc_context->avc_fqm_state = gen8_mfc_avc_fqm_state;
    mfc_context->insert_object = gen8_mfc_avc_insert_object;
    mfc_context->buffer_suface_setup = gen8_gpe_buffer_suface_setup;

    encoder_context->mfc_context = mfc_context;
    encoder_context->mfc_context_destroy = gen8_mfc_context_destroy;
    encoder_context->mfc_pipeline = gen8_mfc_pipeline;

    /* VP8 uses its own BRC preparation path */
    if (encoder_context->codec == CODEC_VP8)
        encoder_context->mfc_brc_prepare = gen8_mfc_vp8_brc_prepare;
    else
        encoder_context->mfc_brc_prepare = intel_mfc_brc_prepare;

    return True;
}