OSDN Git Service

Follow the HW spec to configure the buffer cache on Gen9+
[android-x86/hardware-intel-common-vaapi.git] / src / gen8_mfc.c
1 /*
2  * Copyright © 2012 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the
6  * "Software"), to deal in the Software without restriction, including
7  * without limitation the rights to use, copy, modify, merge, publish,
8  * distribute, sub license, and/or sell copies of the Software, and to
9  * permit persons to whom the Software is furnished to do so, subject to
10  * the following conditions:
11  *
12  * The above copyright notice and this permission notice (including the
13  * next paragraph) shall be included in all copies or substantial portions
14  * of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
17  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
19  * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
20  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
21  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
22  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23  *
24  * Authors:
25  *    Zhao Yakui <yakui.zhao@intel.com>
26  *    Xiang Haihao <haihao.xiang@intel.com>
27  *
28  */
29
30 #include <stdio.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #include <math.h>
34 #include <assert.h>
35
36 #include "intel_batchbuffer.h"
37 #include "i965_defines.h"
38 #include "i965_structs.h"
39 #include "i965_drv_video.h"
40 #include "i965_encoder.h"
41 #include "i965_encoder_utils.h"
42 #include "gen6_mfc.h"
43 #include "gen6_vme.h"
44 #include "intel_media.h"
45 #include <va/va_enc_jpeg.h>
46 #include "vp8_probs.h"
47
/* Layout of the media (GPE) binding table used by the MFC batchbuffer
 * kernels: padded surface states first, then the binding-table entries. */
#define SURFACE_STATE_PADDED_SIZE               SURFACE_STATE_PADDED_SIZE_GEN8
#define SURFACE_STATE_OFFSET(index)             (SURFACE_STATE_PADDED_SIZE * index)
#define BINDING_TABLE_OFFSET(index)             (SURFACE_STATE_OFFSET(MAX_MEDIA_SURFACES_GEN6) + sizeof(unsigned int) * index)

/* Non-zero selects the CPU path for building slice batchbuffers instead of
 * the GPU media kernel. */
#define MFC_SOFTWARE_BATCH      0

/* Stepping check: revision >= 2 means B0 stepping or later. */
#define B0_STEP_REV             2
#define IS_STEPPING_BPLUS(i965) ((i965->intel.revision) >= B0_STEP_REV)
56
//Zigzag scan order of the the Luma and Chroma components
//Note: Jpeg Spec ISO/IEC 10918-1, Figure A.6 shows the zigzag order differently.
//The Spec is trying to show the zigzag pattern with number positions. The below
//table will use the pattern shown by A.6 and map the position of the elements in the array
//(i.e. zigzag_direct[k] is the raster-order index of the k-th coefficient
//in zigzag order).
static const uint32_t zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
71
//Default Luminance quantization table, in raster order
//Source: Jpeg Spec ISO/IEC 10918-1, Annex K, Table K.1
static const uint8_t jpeg_luma_quant[64] = {
    16, 11, 10, 16, 24,  40,  51,  61,
    12, 12, 14, 19, 26,  58,  60,  55,
    14, 13, 16, 24, 40,  57,  69,  56,
    14, 17, 22, 29, 51,  87,  80,  62,
    18, 22, 37, 56, 68,  109, 103, 77,
    24, 35, 55, 64, 81,  104, 113, 92,
    49, 64, 78, 87, 103, 121, 120, 101,
    72, 92, 95, 98, 112, 100, 103, 99
};
84
//Default Chroma quantization table, in raster order
//Source: Jpeg Spec ISO/IEC 10918-1, Annex K, Table K.2
static const uint8_t jpeg_chroma_quant[64] = {
    17, 18, 24, 47, 99, 99, 99, 99,
    18, 21, 26, 66, 99, 99, 99, 99,
    24, 26, 56, 99, 99, 99, 99, 99,
    47, 66, 99, 99, 99, 99, 99, 99,
    99, 99, 99, 99, 99, 99, 99, 99,
    99, 99, 99, 99, 99, 99, 99, 99,
    99, 99, 99, 99, 99, 99, 99, 99,
    99, 99, 99, 99, 99, 99, 99, 99
};
97
98
/* Map a VA-API JPEG huffman table index (0 = luma, 1 = chroma) to the
 * MFX hardware huffman table id. */
static const int va_to_gen7_jpeg_hufftable[2] = {
    MFX_HUFFTABLE_ID_Y,
    MFX_HUFFTABLE_ID_UV
};
103
/* Precompiled Gen8 media kernel that expands VME output into AVC PAK
 * batchbuffer commands (binary generated from the HSW shader source). */
static const uint32_t gen8_mfc_batchbuffer_avc[][4] = {
#include "shaders/utils/mfc_batchbuffer_hsw.g8b"
};
107
/* Gen9 build of the same AVC PAK batchbuffer kernel. */
static const uint32_t gen9_mfc_batchbuffer_avc[][4] = {
#include "shaders/utils/mfc_batchbuffer_hsw.g9b"
};
111
/* Kernel table for the GPU-assisted AVC batchbuffer path on Gen8. */
static struct i965_kernel gen8_mfc_kernels[] = {
    {
        "MFC AVC INTRA BATCHBUFFER ",
        MFC_BATCHBUFFER_AVC_INTRA,
        gen8_mfc_batchbuffer_avc,
        sizeof(gen8_mfc_batchbuffer_avc),
        NULL
    },
};
121
/* Kernel table for the GPU-assisted AVC batchbuffer path on Gen9. */
static struct i965_kernel gen9_mfc_kernels[] = {
    {
        "MFC AVC INTRA BATCHBUFFER ",
        MFC_BATCHBUFFER_AVC_INTRA,
        gen9_mfc_batchbuffer_avc,
        sizeof(gen9_mfc_batchbuffer_avc),
        NULL
    },
};
131
/* Flat AVC quantization matrix: every byte is 16, packed four scaling-list
 * bytes per DWORD for MFX_QM_STATE. */
static const uint32_t qm_flat[16] = {
    0x10101010, 0x10101010, 0x10101010, 0x10101010,
    0x10101010, 0x10101010, 0x10101010, 0x10101010,
    0x10101010, 0x10101010, 0x10101010, 0x10101010,
    0x10101010, 0x10101010, 0x10101010, 0x10101010
};
138
/* Flat AVC forward quantization matrix: every 16-bit entry is 0x1000,
 * matching (1 << 16) / 16 as computed by gen8_mfc_avc_fill_fqm, packed two
 * entries per DWORD for MFX_FQM_STATE. */
static const uint32_t fqm_flat[32] = {
    0x10001000, 0x10001000, 0x10001000, 0x10001000,
    0x10001000, 0x10001000, 0x10001000, 0x10001000,
    0x10001000, 0x10001000, 0x10001000, 0x10001000,
    0x10001000, 0x10001000, 0x10001000, 0x10001000,
    0x10001000, 0x10001000, 0x10001000, 0x10001000,
    0x10001000, 0x10001000, 0x10001000, 0x10001000,
    0x10001000, 0x10001000, 0x10001000, 0x10001000,
    0x10001000, 0x10001000, 0x10001000, 0x10001000
};
149
/* Macroblock inter-prediction mode fields as found in VME output records. */
#define         INTER_MODE_MASK         0x03
#define         INTER_8X8               0x03
#define         INTER_16X8              0x01
#define         INTER_8X16              0x02
#define         SUBMB_SHAPE_MASK        0x00FF00
#define         INTER_16X16             0x00

/* NOTE(review): values placed in bits 22:20 — presumably the MV count field
 * of the PAK object command; confirm against the MFX PAK object spec. */
#define         INTER_MV8               (4 << 20)
#define         INTER_MV32              (6 << 20)
159
160
/*
 * Emit MFX_PIPE_MODE_SELECT (5 DWORDs): put the MFX engine into encode mode
 * for the selected standard (AVC, MPEG-2, JPEG or VP8) and enable whichever
 * of the pre-/post-deblocking outputs has a buffer attached on the context.
 * The DWORD order below follows the hardware command layout and must not be
 * rearranged.
 */
static void
gen8_mfc_pipe_mode_select(VADriverContextP ctx,
                          int standard_select,
                          struct intel_encoder_context *encoder_context)
{
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;

    assert(standard_select == MFX_FORMAT_MPEG2 ||
           standard_select == MFX_FORMAT_AVC   ||
           standard_select == MFX_FORMAT_JPEG  ||
           standard_select == MFX_FORMAT_VP8);

    BEGIN_BCS_BATCH(batch, 5);

    OUT_BCS_BATCH(batch, MFX_PIPE_MODE_SELECT | (5 - 2));
    OUT_BCS_BATCH(batch,
                  (MFX_LONG_MODE << 17) | /* Must be long format for encoder */
                  (MFD_MODE_VLD << 15) | /* VLD mode */
                  (0 << 10) | /* Stream-Out Enable */
                  ((!!mfc_context->post_deblocking_output.bo) << 9)  | /* Post Deblocking Output */
                  ((!!mfc_context->pre_deblocking_output.bo) << 8)  | /* Pre Deblocking Output */
                  (0 << 6)  | /* frame statistics stream-out enable*/
                  (0 << 5)  | /* not in stitch mode */
                  (1 << 4)  | /* encoding mode */
                  (standard_select << 0));  /* standard select: avc or mpeg2 or jpeg*/
    OUT_BCS_BATCH(batch,
                  (0 << 7)  | /* expand NOA bus flag */
                  (0 << 6)  | /* disable slice-level clock gating */
                  (0 << 5)  | /* disable clock gating for NOA */
                  (0 << 4)  | /* terminate if AVC motion and POC table error occurs */
                  (0 << 3)  | /* terminate if AVC mbdata error occurs */
                  (0 << 2)  | /* terminate if AVC CABAC/CAVLC decode error occurs */
                  (0 << 1)  |
                  (0 << 0));
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);

    ADVANCE_BCS_BATCH(batch);
}
201
/*
 * Emit MFX_SURFACE_STATE (6 DWORDs) describing the reconstructed/source
 * picture: NV12 (planar 4:2:0, interleaved U/V), Y-tiled, with dimensions
 * and pitch taken from mfc_context->surface_state.  h_pitch in DW4 is the
 * Y offset (in rows) of the UV plane.
 */
static void
gen8_mfc_surface_state(VADriverContextP ctx, struct intel_encoder_context *encoder_context)
{
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;

    BEGIN_BCS_BATCH(batch, 6);

    OUT_BCS_BATCH(batch, MFX_SURFACE_STATE | (6 - 2));
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch,
                  ((mfc_context->surface_state.height - 1) << 18) |
                  ((mfc_context->surface_state.width - 1) << 4));
    OUT_BCS_BATCH(batch,
                  (MFX_SURFACE_PLANAR_420_8 << 28) | /* 420 planar YUV surface */
                  (1 << 27) | /* must be 1 for interleave U/V, hardware requirement */
                  (0 << 22) | /* surface object control state, FIXME??? */
                  ((mfc_context->surface_state.w_pitch - 1) << 3) | /* pitch */
                  (0 << 2)  | /* must be 0 for interleave U/V */
                  (1 << 1)  | /* must be tiled */
                  (I965_TILEWALK_YMAJOR << 0));  /* tile walk, TILEWALK_YMAJOR */
    OUT_BCS_BATCH(batch,
                  (0 << 16) |                                                           /* must be 0 for interleave U/V */
                  (mfc_context->surface_state.h_pitch));                /* y offset for U(cb) */
    OUT_BCS_BATCH(batch, 0);

    ADVANCE_BCS_BATCH(batch);
}
230
/*
 * Emit MFX_IND_OBJ_BASE_ADDR_STATE (26 DWORDs): point the MFX engine at the
 * indirect objects used during encoding — the VME output buffer it reads
 * MV/MB records from (skipped for JPEG, which has no VME stage) and the
 * PAK-BSE object the compressed bitstream is written to.
 */
static void
gen8_mfc_ind_obj_base_addr_state(VADriverContextP ctx,
                                 struct intel_encoder_context *encoder_context)
{
    struct i965_driver_data *i965 = i965_driver_data(ctx);
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
    struct gen6_vme_context *vme_context = encoder_context->vme_context;
    int vme_size;
    unsigned int bse_offset;

    BEGIN_BCS_BATCH(batch, 26);

    OUT_BCS_BATCH(batch, MFX_IND_OBJ_BASE_ADDR_STATE | (26 - 2));
    /* the DW1-3 is for the MFX indirect bistream offset */
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);

    /* the DW4-5 is the MFX upper bound */
    if (encoder_context->codec == CODEC_VP8) {
        /* VP8 bounds the bitstream object with its end offset */
        OUT_BCS_RELOC(batch,
                mfc_context->mfc_indirect_pak_bse_object.bo,
                I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                mfc_context->mfc_indirect_pak_bse_object.end_offset);
        OUT_BCS_BATCH(batch, 0);
    } else {
        OUT_BCS_BATCH(batch, 0);
        OUT_BCS_BATCH(batch, 0);
    }

    if(encoder_context->codec != CODEC_JPEG) {
        vme_size = vme_context->vme_output.size_block * vme_context->vme_output.num_blocks;
        /* the DW6-10 is for MFX Indirect MV Object Base Address */
        OUT_BCS_RELOC(batch, vme_context->vme_output.bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
        OUT_BCS_BATCH(batch, 0);
        OUT_BCS_BATCH(batch, i965->intel.mocs_state);
        OUT_BCS_RELOC(batch, vme_context->vme_output.bo, I915_GEM_DOMAIN_INSTRUCTION, 0, vme_size);
        OUT_BCS_BATCH(batch, 0);
    } else {
        /* No VME for JPEG */
        OUT_BCS_BATCH(batch, 0);
        OUT_BCS_BATCH(batch, 0);
        OUT_BCS_BATCH(batch, 0);
        OUT_BCS_BATCH(batch, 0);
        OUT_BCS_BATCH(batch, 0);
    }

    /* the DW11-15 is for MFX IT-COFF. Not used on encoder */
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);

    /* the DW16-20 is for MFX indirect DBLK. Not used on encoder */
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);

    /* the DW21-25 is for MFC Indirect PAK-BSE Object Base Address for Encoder*/
    /* JPEG writes the bitstream at a non-zero offset inside the object */
    bse_offset = (encoder_context->codec == CODEC_JPEG) ? (mfc_context->mfc_indirect_pak_bse_object.offset) : 0;
    OUT_BCS_RELOC(batch,
                  mfc_context->mfc_indirect_pak_bse_object.bo,
                  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  bse_offset);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, i965->intel.mocs_state);

    OUT_BCS_RELOC(batch,
                  mfc_context->mfc_indirect_pak_bse_object.bo,
                  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  mfc_context->mfc_indirect_pak_bse_object.end_offset);
    OUT_BCS_BATCH(batch, 0);

    ADVANCE_BCS_BATCH(batch);
}
310
/*
 * Emit MFX_AVC_IMG_STATE (16 DWORDs): per-picture AVC encode parameters —
 * frame size in macroblocks, QP offsets, prediction/entropy flags taken
 * from the VA picture parameter buffer, and the conformance size limits.
 * Progressive frame encoding only (frame-MB-only is hard-wired in DW4).
 */
static void
gen8_mfc_avc_img_state(VADriverContextP ctx, struct encode_state *encode_state,
                       struct intel_encoder_context *encoder_context)
{
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
    VAEncPictureParameterBufferH264 *pPicParameter = (VAEncPictureParameterBufferH264 *)encode_state->pic_param_ext->buffer;

    int width_in_mbs = (mfc_context->surface_state.width + 15) / 16;
    int height_in_mbs = (mfc_context->surface_state.height + 15) / 16;

    BEGIN_BCS_BATCH(batch, 16);

    OUT_BCS_BATCH(batch, MFX_AVC_IMG_STATE | (16 - 2));
    /*DW1. MB setting of frame */
    OUT_BCS_BATCH(batch,
                  ((width_in_mbs * height_in_mbs - 1) & 0xFFFF));
    OUT_BCS_BATCH(batch,
                  ((height_in_mbs - 1) << 16) |
                  ((width_in_mbs - 1) << 0));
    /* DW3 QP setting */
    OUT_BCS_BATCH(batch,
                  (0 << 24) |   /* Second Chroma QP Offset */
                  (0 << 16) |   /* Chroma QP Offset */
                  (0 << 14) |   /* Max-bit conformance Intra flag */
                  (0 << 13) |   /* Max Macroblock size conformance Inter flag */
                  (pPicParameter->pic_fields.bits.weighted_pred_flag << 12) |   /*Weighted_Pred_Flag */
                  (pPicParameter->pic_fields.bits.weighted_bipred_idc << 10) |  /* Weighted_BiPred_Idc */
                  (0 << 8)  |   /* FIXME: Image Structure */
                  (0 << 0) );   /* Current Decoed Image Frame Store ID, reserved in Encode mode */
    OUT_BCS_BATCH(batch,
                  (0 << 16) |   /* Mininum Frame size */
                  (0 << 15) |   /* Disable reading of Macroblock Status Buffer */
                  (0 << 14) |   /* Load BitStream Pointer only once, 1 slic 1 frame */
                  (0 << 13) |   /* CABAC 0 word insertion test enable */
                  (1 << 12) |   /* MVUnpackedEnable,compliant to DXVA */
                  (1 << 10) |   /* Chroma Format IDC, 4:2:0 */
                  (0 << 8)  |   /* FIXME: MbMvFormatFlag */
                  (pPicParameter->pic_fields.bits.entropy_coding_mode_flag << 7)  |   /*0:CAVLC encoding mode,1:CABAC*/
                  (0 << 6)  |   /* Only valid for VLD decoding mode */
                  (0 << 5)  |   /* Constrained Intra Predition Flag, from PPS */
                  (0 << 4)  |   /* Direct 8x8 inference flag */
                  (pPicParameter->pic_fields.bits.transform_8x8_mode_flag << 3)  |   /*8x8 or 4x4 IDCT Transform Mode Flag*/
                  (1 << 2)  |   /* Frame MB only flag */
                  (0 << 1)  |   /* MBAFF mode is in active */
                  (0 << 0));    /* Field picture flag */
    /* DW5 Trellis quantization */
    OUT_BCS_BATCH(batch, 0);    /* Mainly about MB rate control and debug, just ignoring */
    OUT_BCS_BATCH(batch,        /* Inter and Intra Conformance Max size limit */
                  (0xBB8 << 16) |       /* InterMbMaxSz */
                  (0xEE8) );            /* IntraMbMaxSz */
    OUT_BCS_BATCH(batch, 0);            /* Reserved */
    /* DW8. QP delta */
    OUT_BCS_BATCH(batch, 0);            /* Slice QP Delta for bitrate control */
    OUT_BCS_BATCH(batch, 0);            /* Slice QP Delta for bitrate control */
    /* DW10. Bit setting for MB */
    OUT_BCS_BATCH(batch, 0x8C000000);
    OUT_BCS_BATCH(batch, 0x00010000);
    /* DW12. */
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0x02010100);
    /* DW14. For short format */
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);

    ADVANCE_BCS_BATCH(batch);
}
378
379 static void
380 gen8_mfc_qm_state(VADriverContextP ctx,
381                   int qm_type,
382                   const uint32_t *qm,
383                   int qm_length,
384                   struct intel_encoder_context *encoder_context)
385 {
386     struct intel_batchbuffer *batch = encoder_context->base.batch;
387     unsigned int qm_buffer[16];
388
389     assert(qm_length <= 16);
390     assert(sizeof(*qm) == 4);
391     memcpy(qm_buffer, qm, qm_length * 4);
392
393     BEGIN_BCS_BATCH(batch, 18);
394     OUT_BCS_BATCH(batch, MFX_QM_STATE | (18 - 2));
395     OUT_BCS_BATCH(batch, qm_type << 0);
396     intel_batchbuffer_data(batch, qm_buffer, 16 * 4);
397     ADVANCE_BCS_BATCH(batch);
398 }
399
/*
 * Program the four AVC quantization matrices (4x4 intra/inter, 8x8
 * intra/inter).  Uses the flat default matrix unless the sequence or
 * picture parameters declare a scaling matrix, in which case the lists are
 * taken from the application-supplied VAIQMatrixBufferH264.
 *
 * NOTE(review): the uint8_t scaling lists are reinterpreted as DWORD
 * streams via pointer casts — four list bytes per DWORD, relying on
 * little-endian layout and adequate alignment of the VA buffer.
 */
static void
gen8_mfc_avc_qm_state(VADriverContextP ctx,
                      struct encode_state *encode_state,
                      struct intel_encoder_context *encoder_context)
{
    const unsigned int *qm_4x4_intra;
    const unsigned int *qm_4x4_inter;
    const unsigned int *qm_8x8_intra;
    const unsigned int *qm_8x8_inter;
    VAEncSequenceParameterBufferH264 *pSeqParameter =
        (VAEncSequenceParameterBufferH264 *)encode_state->seq_param_ext->buffer;
    VAEncPictureParameterBufferH264 *pPicParameter =
        (VAEncPictureParameterBufferH264 *)encode_state->pic_param_ext->buffer;

    if (!pSeqParameter->seq_fields.bits.seq_scaling_matrix_present_flag
        && !pPicParameter->pic_fields.bits.pic_scaling_matrix_present_flag) {
        qm_4x4_intra = qm_4x4_inter = qm_8x8_intra = qm_8x8_inter = qm_flat;
    } else {
        VAIQMatrixBufferH264 *qm;
        assert(encode_state->q_matrix && encode_state->q_matrix->buffer);
        qm = (VAIQMatrixBufferH264 *)encode_state->q_matrix->buffer;
        qm_4x4_intra = (unsigned int *)qm->ScalingList4x4[0];
        qm_4x4_inter = (unsigned int *)qm->ScalingList4x4[3];
        qm_8x8_intra = (unsigned int *)qm->ScalingList8x8[0];
        qm_8x8_inter = (unsigned int *)qm->ScalingList8x8[1];
    }

    /* 4x4 lists are 12 DWORDs (3 lists x 16 bytes); 8x8 lists are 16 DWORDs */
    gen8_mfc_qm_state(ctx, MFX_QM_AVC_4X4_INTRA_MATRIX, qm_4x4_intra, 12, encoder_context);
    gen8_mfc_qm_state(ctx, MFX_QM_AVC_4X4_INTER_MATRIX, qm_4x4_inter, 12, encoder_context);
    gen8_mfc_qm_state(ctx, MFX_QM_AVC_8x8_INTRA_MATRIX, qm_8x8_intra, 16, encoder_context);
    gen8_mfc_qm_state(ctx, MFX_QM_AVC_8x8_INTER_MATRIX, qm_8x8_inter, 16, encoder_context);
}
432
433 static void
434 gen8_mfc_fqm_state(VADriverContextP ctx,
435                    int fqm_type,
436                    const uint32_t *fqm,
437                    int fqm_length,
438                    struct intel_encoder_context *encoder_context)
439 {
440     struct intel_batchbuffer *batch = encoder_context->base.batch;
441     unsigned int fqm_buffer[32];
442
443     assert(fqm_length <= 32);
444     assert(sizeof(*fqm) == 4);
445     memcpy(fqm_buffer, fqm, fqm_length * 4);
446
447     BEGIN_BCS_BATCH(batch, 34);
448     OUT_BCS_BATCH(batch, MFX_FQM_STATE | (34 - 2));
449     OUT_BCS_BATCH(batch, fqm_type << 0);
450     intel_batchbuffer_data(batch, fqm_buffer, 32 * 4);
451     ADVANCE_BCS_BATCH(batch);
452 }
453
/*
 * Convert a len x len scaling list into the hardware's forward quantization
 * matrix: each output entry is the fixed-point reciprocal (1 << 16) / qm,
 * and the matrix is transposed on the way through (output (row, col) reads
 * input (col, row)).  The quotient is truncated to 16 bits on store, so a
 * scaling value of 1 yields 0.
 */
static void
gen8_mfc_avc_fill_fqm(uint8_t *qm, uint16_t *fqm, int len)
{
    int row, col;

    for (row = 0; row < len; row++) {
        for (col = 0; col < len; col++) {
            /* transposed fixed-point reciprocal */
            fqm[row * len + col] = (uint16_t)((1 << 16) / qm[col * len + row]);
        }
    }
}
462
/*
 * Program the four AVC forward quantization matrices.  With no scaling
 * matrix present, the flat table is used directly; otherwise each FQM is
 * derived from the application scaling lists via gen8_mfc_avc_fill_fqm
 * (transposed fixed-point reciprocals), staged in a local DWORD buffer.
 */
static void
gen8_mfc_avc_fqm_state(VADriverContextP ctx,
                       struct encode_state *encode_state,
                       struct intel_encoder_context *encoder_context)
{
    VAEncSequenceParameterBufferH264 *pSeqParameter =
        (VAEncSequenceParameterBufferH264 *)encode_state->seq_param_ext->buffer;
    VAEncPictureParameterBufferH264 *pPicParameter =
        (VAEncPictureParameterBufferH264 *)encode_state->pic_param_ext->buffer;

    if (!pSeqParameter->seq_fields.bits.seq_scaling_matrix_present_flag
        && !pPicParameter->pic_fields.bits.pic_scaling_matrix_present_flag) {
        gen8_mfc_fqm_state(ctx, MFX_QM_AVC_4X4_INTRA_MATRIX, fqm_flat, 24, encoder_context);
        gen8_mfc_fqm_state(ctx, MFX_QM_AVC_4X4_INTER_MATRIX, fqm_flat, 24, encoder_context);
        gen8_mfc_fqm_state(ctx, MFX_QM_AVC_8x8_INTRA_MATRIX, fqm_flat, 32, encoder_context);
        gen8_mfc_fqm_state(ctx, MFX_QM_AVC_8x8_INTER_MATRIX, fqm_flat, 32, encoder_context);
    } else {
        int i;
        uint32_t fqm[32];
        VAIQMatrixBufferH264 *qm;
        assert(encode_state->q_matrix && encode_state->q_matrix->buffer);
        qm = (VAIQMatrixBufferH264 *)encode_state->q_matrix->buffer;

        /* 3 intra 4x4 lists -> 48 uint16 entries = 24 DWORDs */
        for (i = 0; i < 3; i++)
            gen8_mfc_avc_fill_fqm(qm->ScalingList4x4[i], (uint16_t *)fqm + 16 * i, 4);
        gen8_mfc_fqm_state(ctx, MFX_QM_AVC_4X4_INTRA_MATRIX, fqm, 24, encoder_context);

        /* 3 inter 4x4 lists (indices 3..5) */
        for (i = 3; i < 6; i++)
            gen8_mfc_avc_fill_fqm(qm->ScalingList4x4[i], (uint16_t *)fqm + 16 * (i - 3), 4);
        gen8_mfc_fqm_state(ctx, MFX_QM_AVC_4X4_INTER_MATRIX, fqm, 24, encoder_context);

        /* 8x8 lists: 64 uint16 entries = 32 DWORDs each */
        gen8_mfc_avc_fill_fqm(qm->ScalingList8x8[0], (uint16_t *)fqm, 8);
        gen8_mfc_fqm_state(ctx, MFX_QM_AVC_8x8_INTRA_MATRIX, fqm, 32, encoder_context);

        gen8_mfc_avc_fill_fqm(qm->ScalingList8x8[1], (uint16_t *)fqm, 8);
        gen8_mfc_fqm_state(ctx, MFX_QM_AVC_8x8_INTER_MATRIX, fqm, 32, encoder_context);
    }
}
501
/*
 * Emit MFX_INSERT_OBJECT: splice raw bits (headers, SEI, tail padding) into
 * the output bitstream.
 *
 * insert_data:          payload, "lenght_in_dws" (sic) DWORDs long.
 * data_bits_in_last_dw: valid bits in the final DWORD; 0 means a full 32.
 * skip_emul_byte_count: leading bytes exempt from emulation prevention.
 * is_last_header / is_end_of_slice / emulation_flag: control bits in DW1.
 * batch:                target batchbuffer; NULL selects the default BCS
 *                       batch of the encoder context.
 */
static void
gen8_mfc_avc_insert_object(VADriverContextP ctx, struct intel_encoder_context *encoder_context,
                           unsigned int *insert_data, int lenght_in_dws, int data_bits_in_last_dw,
                           int skip_emul_byte_count, int is_last_header, int is_end_of_slice, int emulation_flag,
                           struct intel_batchbuffer *batch)
{
    if (batch == NULL)
        batch = encoder_context->base.batch;

    if (data_bits_in_last_dw == 0)
        data_bits_in_last_dw = 32;

    BEGIN_BCS_BATCH(batch, lenght_in_dws + 2);

    OUT_BCS_BATCH(batch, MFX_INSERT_OBJECT | (lenght_in_dws + 2 - 2));
    OUT_BCS_BATCH(batch,
                  (0 << 16) |   /* always start at offset 0 */
                  (data_bits_in_last_dw << 8) |
                  (skip_emul_byte_count << 4) |
                  (!!emulation_flag << 3) |
                  ((!!is_last_header) << 2) |
                  ((!!is_end_of_slice) << 1) |
                  (0 << 0));    /* FIXME: ??? */
    intel_batchbuffer_data(batch, insert_data, lenght_in_dws * 4);

    ADVANCE_BCS_BATCH(batch);
}
529
530
/*
 * Per-frame (re)initialization of the MFC encode context: derive the frame
 * size in macroblocks from the codec-specific parameter buffer, release
 * every buffer still referenced from the previous frame, then allocate the
 * row-store / scratch buffers the PAK pipeline needs and a fresh auxiliary
 * slice batchbuffer.
 */
static void gen8_mfc_init(VADriverContextP ctx,
                          struct encode_state *encode_state,
                          struct intel_encoder_context *encoder_context)
{
    struct i965_driver_data *i965 = i965_driver_data(ctx);
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
    dri_bo *bo;
    int i;
    int width_in_mbs = 0;
    int height_in_mbs = 0;
    int slice_batchbuffer_size;

    if (encoder_context->codec == CODEC_H264 ||
        encoder_context->codec == CODEC_H264_MVC) {
        /* H.264 sequence parameters already carry the size in MB units */
        VAEncSequenceParameterBufferH264 *pSequenceParameter = (VAEncSequenceParameterBufferH264 *)encode_state->seq_param_ext->buffer;
        width_in_mbs = pSequenceParameter->picture_width_in_mbs;
        height_in_mbs = pSequenceParameter->picture_height_in_mbs;
    } else if (encoder_context->codec == CODEC_MPEG2) {
        VAEncSequenceParameterBufferMPEG2 *pSequenceParameter = (VAEncSequenceParameterBufferMPEG2 *)encode_state->seq_param_ext->buffer;

        assert(encoder_context->codec == CODEC_MPEG2);

        /* MPEG-2 parameters are in pixels; round up to whole macroblocks */
        width_in_mbs = ALIGN(pSequenceParameter->picture_width, 16) / 16;
        height_in_mbs = ALIGN(pSequenceParameter->picture_height, 16) / 16;
    } else {
        assert(encoder_context->codec == CODEC_JPEG);
        VAEncPictureParameterBufferJPEG *pic_param = (VAEncPictureParameterBufferJPEG *)encode_state->pic_param_ext->buffer;

        width_in_mbs = ALIGN(pic_param->picture_width, 16) / 16;
        height_in_mbs = ALIGN(pic_param->picture_height, 16) / 16;
    }

    /* worst-case per-MB command space plus per-slice header/tail overhead */
    slice_batchbuffer_size = 64 * width_in_mbs * height_in_mbs + 4096 +
                (SLICE_HEADER + SLICE_TAIL) * encode_state->num_slice_params_ext;

    /*Encode common setup for MFC*/
    dri_bo_unreference(mfc_context->post_deblocking_output.bo);
    mfc_context->post_deblocking_output.bo = NULL;

    dri_bo_unreference(mfc_context->pre_deblocking_output.bo);
    mfc_context->pre_deblocking_output.bo = NULL;

    dri_bo_unreference(mfc_context->uncompressed_picture_source.bo);
    mfc_context->uncompressed_picture_source.bo = NULL;

    dri_bo_unreference(mfc_context->mfc_indirect_pak_bse_object.bo);
    mfc_context->mfc_indirect_pak_bse_object.bo = NULL;

    for (i = 0; i < NUM_MFC_DMV_BUFFERS; i++){
        if (mfc_context->direct_mv_buffers[i].bo != NULL)
            dri_bo_unreference(mfc_context->direct_mv_buffers[i].bo);
        mfc_context->direct_mv_buffers[i].bo = NULL;
    }

    for (i = 0; i < MAX_MFC_REFERENCE_SURFACES; i++){
        if (mfc_context->reference_surfaces[i].bo != NULL)
            dri_bo_unreference(mfc_context->reference_surfaces[i].bo);
        mfc_context->reference_surfaces[i].bo = NULL;
    }

    /* one 64-byte row-store entry per MB column */
    dri_bo_unreference(mfc_context->intra_row_store_scratch_buffer.bo);
    bo = dri_bo_alloc(i965->intel.bufmgr,
                      "Buffer",
                      width_in_mbs * 64,
                      64);
    assert(bo);
    mfc_context->intra_row_store_scratch_buffer.bo = bo;

    /* 16 bytes of status per macroblock */
    dri_bo_unreference(mfc_context->macroblock_status_buffer.bo);
    bo = dri_bo_alloc(i965->intel.bufmgr,
                      "Buffer",
                      width_in_mbs * height_in_mbs * 16,
                      64);
    assert(bo);
    mfc_context->macroblock_status_buffer.bo = bo;

    dri_bo_unreference(mfc_context->deblocking_filter_row_store_scratch_buffer.bo);
    bo = dri_bo_alloc(i965->intel.bufmgr,
                      "Buffer",
                      4 * width_in_mbs * 64,  /* 4 * width_in_mbs * 64 */
                      64);
    assert(bo);
    mfc_context->deblocking_filter_row_store_scratch_buffer.bo = bo;

    dri_bo_unreference(mfc_context->bsd_mpc_row_store_scratch_buffer.bo);
    bo = dri_bo_alloc(i965->intel.bufmgr,
                      "Buffer",
                      2 * width_in_mbs * 64, /* 2 * width_in_mbs * 64 */
                      0x1000);
    assert(bo);
    mfc_context->bsd_mpc_row_store_scratch_buffer.bo = bo;

    dri_bo_unreference(mfc_context->mfc_batchbuffer_surface.bo);
    mfc_context->mfc_batchbuffer_surface.bo = NULL;

    dri_bo_unreference(mfc_context->aux_batchbuffer_surface.bo);
    mfc_context->aux_batchbuffer_surface.bo = NULL;

    if (mfc_context->aux_batchbuffer)
        intel_batchbuffer_free(mfc_context->aux_batchbuffer);

    /* NOTE(review): intel_batchbuffer_new result is assumed non-NULL here
     * (dereferenced immediately below without a check). */
    mfc_context->aux_batchbuffer = intel_batchbuffer_new(&i965->intel, I915_EXEC_BSD, slice_batchbuffer_size);
    mfc_context->aux_batchbuffer_surface.bo = mfc_context->aux_batchbuffer->buffer;
    dri_bo_reference(mfc_context->aux_batchbuffer_surface.bo);
    mfc_context->aux_batchbuffer_surface.pitch = 16;
    mfc_context->aux_batchbuffer_surface.num_blocks = mfc_context->aux_batchbuffer->size / 16;
    mfc_context->aux_batchbuffer_surface.size_block = 16;

    gen8_gpe_context_init(ctx, &mfc_context->gpe_context);
}
641
/*
 * Emit MFX_PIPE_BUF_ADDR_STATE (61 DWORDs): hand the MFX engine every
 * frame-level buffer address — pre/post deblocking outputs, source picture,
 * MB status, row-store scratch buffers, and the 16 reference picture slots
 * (DW19-50, two DWORDs each).  The DWORD layout is fixed by the hardware
 * command format; unused ILDB slots at the end stay zero.
 */
static void
gen8_mfc_pipe_buf_addr_state(VADriverContextP ctx,
                             struct intel_encoder_context *encoder_context)
{
    struct i965_driver_data *i965 = i965_driver_data(ctx);
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
    int i;

    BEGIN_BCS_BATCH(batch, 61);

    OUT_BCS_BATCH(batch, MFX_PIPE_BUF_ADDR_STATE | (61 - 2));

    /* the DW1-3 is for pre_deblocking */
    if (mfc_context->pre_deblocking_output.bo)
        OUT_BCS_RELOC(batch, mfc_context->pre_deblocking_output.bo,
                      I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                      0);
    else
        OUT_BCS_BATCH(batch, 0);                                                                                        /* pre output addr   */

    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, i965->intel.mocs_state);
    /* the DW4-6 is for the post_deblocking */

    if (mfc_context->post_deblocking_output.bo)
        OUT_BCS_RELOC(batch, mfc_context->post_deblocking_output.bo,
                      I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                      0);                                                                                       /* post output addr  */
    else
        OUT_BCS_BATCH(batch, 0);

    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, i965->intel.mocs_state);

    /* the DW7-9 is for the uncompressed_picture */
    OUT_BCS_RELOC(batch, mfc_context->uncompressed_picture_source.bo,
                  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  0); /* uncompressed data */

    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, i965->intel.mocs_state);

    /* the DW10-12 is for the mb status */
    OUT_BCS_RELOC(batch, mfc_context->macroblock_status_buffer.bo,
                  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  0); /* StreamOut data*/

    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, i965->intel.mocs_state);

    /* the DW13-15 is for the intra_row_store_scratch */
    OUT_BCS_RELOC(batch, mfc_context->intra_row_store_scratch_buffer.bo,
                  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  0);

    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, i965->intel.mocs_state);

    /* the DW16-18 is for the deblocking filter */
    OUT_BCS_RELOC(batch, mfc_context->deblocking_filter_row_store_scratch_buffer.bo,
                  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  0);

    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, i965->intel.mocs_state);

    /* the DW 19-50 is for Reference pictures*/
    for (i = 0; i < ARRAY_ELEMS(mfc_context->reference_surfaces); i++) {
        if ( mfc_context->reference_surfaces[i].bo != NULL) {
            OUT_BCS_RELOC(batch, mfc_context->reference_surfaces[i].bo,
                          I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                          0);
        } else {
            OUT_BCS_BATCH(batch, 0);
        }

        OUT_BCS_BATCH(batch, 0);
    }

    /* DW51: one shared MOCS value for all reference surfaces */
    OUT_BCS_BATCH(batch, i965->intel.mocs_state);

    /* The DW 52-54 is for the MB status buffer */
    OUT_BCS_RELOC(batch, mfc_context->macroblock_status_buffer.bo,
                  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  0);                                                                                   /* Macroblock status buffer*/

    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, i965->intel.mocs_state);

    /* the DW 55-57 is the ILDB buffer */
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);

    /* the DW 58-60 is the second ILDB buffer */
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);

    ADVANCE_BCS_BATCH(batch);
}
744
/*
 * Emit MFX_AVC_DIRECTMODE_STATE (71 DWs): direct-MV buffer addresses
 * (two DWs per buffer) for the reference frames, the MV write buffer
 * for the current frame, and a 32-entry per-reference list.
 */
static void
gen8_mfc_avc_directmode_state(VADriverContextP ctx,
                              struct intel_encoder_context *encoder_context)
{
    struct i965_driver_data *i965 = i965_driver_data(ctx);
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;

    int i;

    BEGIN_BCS_BATCH(batch, 71);

    OUT_BCS_BATCH(batch, MFX_AVC_DIRECTMODE_STATE | (71 - 2));

    /* Reference frames and Current frames */
    /* the DW1-32 is for the direct MV for reference */
    for(i = 0; i < NUM_MFC_DMV_BUFFERS - 2; i += 2) {
        if ( mfc_context->direct_mv_buffers[i].bo != NULL) {
            OUT_BCS_RELOC(batch, mfc_context->direct_mv_buffers[i].bo,
                          I915_GEM_DOMAIN_INSTRUCTION, 0,
                          0);
            OUT_BCS_BATCH(batch, 0);    /* upper 32 bits of the 64-bit address */
        } else {
            OUT_BCS_BATCH(batch, 0);
            OUT_BCS_BATCH(batch, 0);
        }
    }

    /* DW33: memory object control state for the direct-MV buffers */
    OUT_BCS_BATCH(batch, i965->intel.mocs_state);

    /* the DW34-36 is the MV for the current reference */
    OUT_BCS_RELOC(batch, mfc_context->direct_mv_buffers[NUM_MFC_DMV_BUFFERS - 2].bo,
                  I915_GEM_DOMAIN_INSTRUCTION, 0,
                  0);

    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, i965->intel.mocs_state);

    /* DW37-70: per-reference list, two entries per reference, then two
     * trailing zero DWs.  NOTE(review): original comment said "POL list";
     * presumably the POC (picture order count) list -- confirm against
     * the MFX_AVC_DIRECTMODE_STATE definition in the PRM. */
    for(i = 0; i < 32; i++) {
        OUT_BCS_BATCH(batch, i/2);
    }
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);

    ADVANCE_BCS_BATCH(batch);
}
792
793
/*
 * Emit MFX_BSP_BUF_BASE_ADDR_STATE (10 DWs).  Only the BSD/MPC row
 * store scratch buffer is programmed for encoding; the MPR row store
 * and bitplane read buffer addresses are left zero.
 */
static void
gen8_mfc_bsp_buf_base_addr_state(VADriverContextP ctx,
                                 struct intel_encoder_context *encoder_context)
{
    struct i965_driver_data *i965 = i965_driver_data(ctx);
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;

    BEGIN_BCS_BATCH(batch, 10);

    OUT_BCS_BATCH(batch, MFX_BSP_BUF_BASE_ADDR_STATE | (10 - 2));
    /* DW1-3: BSD/MPC row store scratch buffer address + MOCS */
    OUT_BCS_RELOC(batch, mfc_context->bsd_mpc_row_store_scratch_buffer.bo,
                  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, i965->intel.mocs_state);

    /* the DW4-6 is for MPR Row Store Scratch Buffer Base Address */
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);

    /* the DW7-9 is for Bitplane Read Buffer Base Address */
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);

    ADVANCE_BCS_BATCH(batch);
}
823
824
/*
 * Program the per-picture MFX state for an AVC encode: pipe mode,
 * surface and buffer base addresses, image state, quantization
 * matrices, direct-mode buffers and the reference index lists.
 * The commands are emitted in the order the MFX pipeline expects.
 */
static void gen8_mfc_avc_pipeline_picture_programing( VADriverContextP ctx,
                                                      struct encode_state *encode_state,
                                                      struct intel_encoder_context *encoder_context)
{
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;

    mfc_context->pipe_mode_select(ctx, MFX_FORMAT_AVC, encoder_context);
    mfc_context->set_surface_state(ctx, encoder_context);
    mfc_context->ind_obj_base_addr_state(ctx, encoder_context);
    gen8_mfc_pipe_buf_addr_state(ctx, encoder_context);
    gen8_mfc_bsp_buf_base_addr_state(ctx, encoder_context);
    mfc_context->avc_img_state(ctx, encode_state, encoder_context);
    mfc_context->avc_qm_state(ctx, encode_state, encoder_context);
    mfc_context->avc_fqm_state(ctx, encode_state, encoder_context);
    gen8_mfc_avc_directmode_state(ctx, encoder_context);
    intel_mfc_avc_ref_idx_state(ctx, encode_state, encoder_context);
}
842
843
844 static VAStatus gen8_mfc_run(VADriverContextP ctx, 
845                              struct encode_state *encode_state,
846                              struct intel_encoder_context *encoder_context)
847 {
848     struct intel_batchbuffer *batch = encoder_context->base.batch;
849
850     intel_batchbuffer_flush(batch);             //run the pipeline
851
852     return VA_STATUS_SUCCESS;
853 }
854
855
856 static VAStatus
857 gen8_mfc_stop(VADriverContextP ctx, 
858               struct encode_state *encode_state,
859               struct intel_encoder_context *encoder_context,
860               int *encoded_bits_size)
861 {
862     VAStatus vaStatus = VA_STATUS_ERROR_UNKNOWN;
863     VAEncPictureParameterBufferH264 *pPicParameter = (VAEncPictureParameterBufferH264 *)encode_state->pic_param_ext->buffer;
864     VACodedBufferSegment *coded_buffer_segment;
865     
866     vaStatus = i965_MapBuffer(ctx, pPicParameter->coded_buf, (void **)&coded_buffer_segment);
867     assert(vaStatus == VA_STATUS_SUCCESS);
868     *encoded_bits_size = coded_buffer_segment->size * 8;
869     i965_UnmapBuffer(ctx, pPicParameter->coded_buf);
870
871     return VA_STATUS_SUCCESS;
872 }
873
874
875 static void
876 gen8_mfc_avc_slice_state(VADriverContextP ctx,
877                          VAEncPictureParameterBufferH264 *pic_param,
878                          VAEncSliceParameterBufferH264 *slice_param,
879                          struct encode_state *encode_state,
880                          struct intel_encoder_context *encoder_context,
881                          int rate_control_enable,
882                          int qp,
883                          struct intel_batchbuffer *batch)
884 {
885     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
886     int width_in_mbs = (mfc_context->surface_state.width + 15) / 16;
887     int height_in_mbs = (mfc_context->surface_state.height + 15) / 16;
888     int beginmb = slice_param->macroblock_address;
889     int endmb = beginmb + slice_param->num_macroblocks;
890     int beginx = beginmb % width_in_mbs;
891     int beginy = beginmb / width_in_mbs;
892     int nextx =  endmb % width_in_mbs;
893     int nexty = endmb / width_in_mbs;
894     int slice_type = intel_avc_enc_slice_type_fixup(slice_param->slice_type);
895     int last_slice = (endmb == (width_in_mbs * height_in_mbs));
896     int maxQpN, maxQpP;
897     unsigned char correct[6], grow, shrink;
898     int i;
899     int weighted_pred_idc = 0;
900     unsigned int luma_log2_weight_denom = slice_param->luma_log2_weight_denom;
901     unsigned int chroma_log2_weight_denom = slice_param->chroma_log2_weight_denom;
902     int num_ref_l0 = 0, num_ref_l1 = 0;
903
904     if (batch == NULL)
905         batch = encoder_context->base.batch;
906
907     if (slice_type == SLICE_TYPE_I) {
908         luma_log2_weight_denom = 0;
909         chroma_log2_weight_denom = 0;
910     } else if (slice_type == SLICE_TYPE_P) {
911         weighted_pred_idc = pic_param->pic_fields.bits.weighted_pred_flag;
912         num_ref_l0 = pic_param->num_ref_idx_l0_active_minus1 + 1;
913
914         if (slice_param->num_ref_idx_active_override_flag)
915             num_ref_l0 = slice_param->num_ref_idx_l0_active_minus1 + 1;
916     } else if (slice_type == SLICE_TYPE_B) {
917         weighted_pred_idc = pic_param->pic_fields.bits.weighted_bipred_idc;
918         num_ref_l0 = pic_param->num_ref_idx_l0_active_minus1 + 1;
919         num_ref_l1 = pic_param->num_ref_idx_l1_active_minus1 + 1;
920
921         if (slice_param->num_ref_idx_active_override_flag) {
922             num_ref_l0 = slice_param->num_ref_idx_l0_active_minus1 + 1;
923             num_ref_l1 = slice_param->num_ref_idx_l1_active_minus1 + 1;
924         }
925
926         if (weighted_pred_idc == 2) {
927             /* 8.4.3 - Derivation process for prediction weights (8-279) */
928             luma_log2_weight_denom = 5;
929             chroma_log2_weight_denom = 5;
930         }
931     }
932
933     maxQpN = mfc_context->bit_rate_control_context[slice_type].MaxQpNegModifier;
934     maxQpP = mfc_context->bit_rate_control_context[slice_type].MaxQpPosModifier;
935
936     for (i = 0; i < 6; i++)
937         correct[i] = mfc_context->bit_rate_control_context[slice_type].Correct[i];
938
939     grow = mfc_context->bit_rate_control_context[slice_type].GrowInit + 
940         (mfc_context->bit_rate_control_context[slice_type].GrowResistance << 4);
941     shrink = mfc_context->bit_rate_control_context[slice_type].ShrinkInit + 
942         (mfc_context->bit_rate_control_context[slice_type].ShrinkResistance << 4);
943
944     BEGIN_BCS_BATCH(batch, 11);;
945
946     OUT_BCS_BATCH(batch, MFX_AVC_SLICE_STATE | (11 - 2) );
947     OUT_BCS_BATCH(batch, slice_type);                   /*Slice Type: I:P:B Slice*/
948
949     OUT_BCS_BATCH(batch,
950                   (num_ref_l0 << 16) |
951                   (num_ref_l1 << 24) |
952                   (chroma_log2_weight_denom << 8) |
953                   (luma_log2_weight_denom << 0));
954
955     OUT_BCS_BATCH(batch, 
956                   (weighted_pred_idc << 30) |
957                   (slice_param->direct_spatial_mv_pred_flag<<29) |             /*Direct Prediction Type*/
958                   (slice_param->disable_deblocking_filter_idc << 27) |
959                   (slice_param->cabac_init_idc << 24) |
960                   (qp<<16) |                    /*Slice Quantization Parameter*/
961                   ((slice_param->slice_beta_offset_div2 & 0xf) << 8) |
962                   ((slice_param->slice_alpha_c0_offset_div2 & 0xf) << 0));
963     OUT_BCS_BATCH(batch,
964                   (beginy << 24) |                      /*First MB X&Y , the begin postion of current slice*/
965                   (beginx << 16) |
966                   slice_param->macroblock_address );
967     OUT_BCS_BATCH(batch, (nexty << 16) | nextx);                       /*Next slice first MB X&Y*/
968     OUT_BCS_BATCH(batch, 
969                   (0/*rate_control_enable*/ << 31) |            /*in CBR mode RateControlCounterEnable = enable*/
970                   (1 << 30) |           /*ResetRateControlCounter*/
971                   (0 << 28) |           /*RC Triggle Mode = Always Rate Control*/
972                   (4 << 24) |     /*RC Stable Tolerance, middle level*/
973                   (0/*rate_control_enable*/ << 23) |     /*RC Panic Enable*/                 
974                   (0 << 22) |     /*QP mode, don't modfiy CBP*/
975                   (0 << 21) |     /*MB Type Direct Conversion Enabled*/ 
976                   (0 << 20) |     /*MB Type Skip Conversion Enabled*/ 
977                   (last_slice << 19) |     /*IsLastSlice*/
978                   (0 << 18) |   /*BitstreamOutputFlag Compressed BitStream Output Disable Flag 0:enable 1:disable*/
979                   (1 << 17) |       /*HeaderPresentFlag*/       
980                   (1 << 16) |       /*SliceData PresentFlag*/
981                   (1 << 15) |       /*TailPresentFlag*/
982                   (1 << 13) |       /*RBSP NAL TYPE*/   
983                   (0 << 12) );    /*CabacZeroWordInsertionEnable*/
984     OUT_BCS_BATCH(batch, mfc_context->mfc_indirect_pak_bse_object.offset);
985     OUT_BCS_BATCH(batch,
986                   (maxQpN << 24) |     /*Target QP - 24 is lowest QP*/ 
987                   (maxQpP << 16) |     /*Target QP + 20 is highest QP*/
988                   (shrink << 8)  |
989                   (grow << 0));   
990     OUT_BCS_BATCH(batch,
991                   (correct[5] << 20) |
992                   (correct[4] << 16) |
993                   (correct[3] << 12) |
994                   (correct[2] << 8) |
995                   (correct[1] << 4) |
996                   (correct[0] << 0));
997     OUT_BCS_BATCH(batch, 0);
998
999     ADVANCE_BCS_BATCH(batch);
1000 }
1001
1002 #define    AVC_INTRA_RDO_OFFSET    4
1003 #define    AVC_INTER_RDO_OFFSET    10
1004 #define    AVC_INTER_MSG_OFFSET    8
1005 #define    AVC_INTER_MV_OFFSET     48
1006 #define    AVC_RDO_MASK            0xFFFF
1007
/*
 * Emit one MFC_AVC_PAK_OBJECT (12 DWs) for an intra-coded macroblock
 * at MB coordinates (x, y).
 *
 * msg points at the VME output record for this MB: msg[0] carries the
 * MB mode/type bits (repacked below into the PAK inline-data layout),
 * msg[1..3] carry the intra prediction mode data.  end_mb marks the
 * last MB of the slice.  Returns the command length in DWords.
 */
static int
gen8_mfc_avc_pak_object_intra(VADriverContextP ctx, int x, int y, int end_mb,
                              int qp,unsigned int *msg,
                              struct intel_encoder_context *encoder_context,
                              unsigned char target_mb_size, unsigned char max_mb_size,
                              struct intel_batchbuffer *batch)
{
    int len_in_dwords = 12;
    unsigned int intra_msg;
#define         INTRA_MSG_FLAG          (1 << 13)
#define         INTRA_MBTYPE_MASK       (0x1F0000)
    if (batch == NULL)
        batch = encoder_context->base.batch;

    BEGIN_BCS_BATCH(batch, len_in_dwords);

    /* Repack the VME MB mode word: keep the low bits, set the intra
     * flag and move the MB type field down into the PAK position. */
    intra_msg = msg[0] & 0xC0FF;
    intra_msg |= INTRA_MSG_FLAG;
    intra_msg |= ((msg[0] & INTRA_MBTYPE_MASK) >> 8);
    OUT_BCS_BATCH(batch, MFC_AVC_PAK_OBJECT | (len_in_dwords - 2));
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch,
                  (0 << 24) |           /* PackedMvNum, Debug*/
                  (0 << 20) |           /* No motion vector */
                  (1 << 19) |           /* CbpDcY */
                  (1 << 18) |           /* CbpDcU */
                  (1 << 17) |           /* CbpDcV */
                  intra_msg);

    OUT_BCS_BATCH(batch, (0xFFFF << 16) | (y << 8) | x);                /* Code Block Pattern for Y*/
    OUT_BCS_BATCH(batch, 0x000F000F);                                                   /* Code Block Pattern */
    OUT_BCS_BATCH(batch, (0 << 27) | (end_mb << 26) | qp);      /* Last MB */

    /*Stuff for Intra MB*/
    OUT_BCS_BATCH(batch, msg[1]);                       /* We using Intra16x16 no 4x4 predmode*/
    OUT_BCS_BATCH(batch, msg[2]);
    OUT_BCS_BATCH(batch, msg[3]&0xFF);

    /*MaxSizeInWord and TargetSizeInWord*/
    OUT_BCS_BATCH(batch, (max_mb_size << 24) |
                  (target_mb_size << 16) );

    OUT_BCS_BATCH(batch, 0);

    ADVANCE_BCS_BATCH(batch);

    return len_in_dwords;
}
1057
1058 static int
1059 gen8_mfc_avc_pak_object_inter(VADriverContextP ctx, int x, int y, int end_mb, int qp,
1060                               unsigned int *msg, unsigned int offset,
1061                               struct intel_encoder_context *encoder_context,
1062                               unsigned char target_mb_size,unsigned char max_mb_size, int slice_type,
1063                               struct intel_batchbuffer *batch)
1064 {
1065     struct gen6_vme_context *vme_context = encoder_context->vme_context;
1066     int len_in_dwords = 12;
1067     unsigned int inter_msg = 0;
1068     if (batch == NULL)
1069         batch = encoder_context->base.batch;
1070     {
1071 #define MSG_MV_OFFSET   4
1072         unsigned int *mv_ptr;
1073         mv_ptr = msg + MSG_MV_OFFSET;
1074         /* MV of VME output is based on 16 sub-blocks. So it is necessary
1075          * to convert them to be compatible with the format of AVC_PAK
1076          * command.
1077          */
1078         if ((msg[0] & INTER_MODE_MASK) == INTER_8X16) {
1079             /* MV[0] and MV[2] are replicated */
1080             mv_ptr[4] = mv_ptr[0];
1081             mv_ptr[5] = mv_ptr[1];
1082             mv_ptr[2] = mv_ptr[8];
1083             mv_ptr[3] = mv_ptr[9];
1084             mv_ptr[6] = mv_ptr[8];
1085             mv_ptr[7] = mv_ptr[9];
1086         } else if ((msg[0] & INTER_MODE_MASK) == INTER_16X8) {
1087             /* MV[0] and MV[1] are replicated */
1088             mv_ptr[2] = mv_ptr[0];
1089             mv_ptr[3] = mv_ptr[1];
1090             mv_ptr[4] = mv_ptr[16];
1091             mv_ptr[5] = mv_ptr[17];
1092             mv_ptr[6] = mv_ptr[24];
1093             mv_ptr[7] = mv_ptr[25];
1094         } else if (((msg[0] & INTER_MODE_MASK) == INTER_8X8) &&
1095                    !(msg[1] & SUBMB_SHAPE_MASK)) {
1096             /* Don't touch MV[0] or MV[1] */
1097             mv_ptr[2] = mv_ptr[8];
1098             mv_ptr[3] = mv_ptr[9];
1099             mv_ptr[4] = mv_ptr[16];
1100             mv_ptr[5] = mv_ptr[17];
1101             mv_ptr[6] = mv_ptr[24];
1102             mv_ptr[7] = mv_ptr[25];
1103         }
1104     }
1105
1106     BEGIN_BCS_BATCH(batch, len_in_dwords);
1107
1108     OUT_BCS_BATCH(batch, MFC_AVC_PAK_OBJECT | (len_in_dwords - 2));
1109
1110     inter_msg = 32;
1111     /* MV quantity */
1112     if ((msg[0] & INTER_MODE_MASK) == INTER_8X8) {
1113         if (msg[1] & SUBMB_SHAPE_MASK)
1114             inter_msg = 128;
1115     }
1116     OUT_BCS_BATCH(batch, inter_msg);         /* 32 MV*/
1117     OUT_BCS_BATCH(batch, offset);
1118     inter_msg = msg[0] & (0x1F00FFFF);
1119     inter_msg |= INTER_MV8;
1120     inter_msg |= ((1 << 19) | (1 << 18) | (1 << 17));
1121     if (((msg[0] & INTER_MODE_MASK) == INTER_8X8) &&
1122         (msg[1] & SUBMB_SHAPE_MASK)) {
1123         inter_msg |= INTER_MV32;
1124     }
1125
1126     OUT_BCS_BATCH(batch, inter_msg);
1127
1128     OUT_BCS_BATCH(batch, (0xFFFF<<16) | (y << 8) | x);        /* Code Block Pattern for Y*/
1129     OUT_BCS_BATCH(batch, 0x000F000F);                         /* Code Block Pattern */  
1130 #if 0 
1131     if ( slice_type == SLICE_TYPE_B) {
1132         OUT_BCS_BATCH(batch, (0xF<<28) | (end_mb << 26) | qp);  /* Last MB */
1133     } else {
1134         OUT_BCS_BATCH(batch, (end_mb << 26) | qp);      /* Last MB */
1135     }
1136 #else
1137     OUT_BCS_BATCH(batch, (end_mb << 26) | qp);  /* Last MB */
1138 #endif
1139
1140     inter_msg = msg[1] >> 8;
1141     /*Stuff for Inter MB*/
1142     OUT_BCS_BATCH(batch, inter_msg);        
1143     OUT_BCS_BATCH(batch, vme_context->ref_index_in_mb[0]);
1144     OUT_BCS_BATCH(batch, vme_context->ref_index_in_mb[1]);
1145
1146     /*MaxSizeInWord and TargetSzieInWord*/
1147     OUT_BCS_BATCH(batch, (max_mb_size << 24) |
1148                   (target_mb_size << 16) );
1149
1150     OUT_BCS_BATCH(batch, 0x0);    
1151
1152     ADVANCE_BCS_BATCH(batch);
1153
1154     return len_in_dwords;
1155 }
1156
1157 static void 
1158 gen8_mfc_avc_pipeline_slice_programing(VADriverContextP ctx,
1159                                        struct encode_state *encode_state,
1160                                        struct intel_encoder_context *encoder_context,
1161                                        int slice_index,
1162                                        struct intel_batchbuffer *slice_batch)
1163 {
1164     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
1165     struct gen6_vme_context *vme_context = encoder_context->vme_context;
1166     VAEncSequenceParameterBufferH264 *pSequenceParameter = (VAEncSequenceParameterBufferH264 *)encode_state->seq_param_ext->buffer;
1167     VAEncPictureParameterBufferH264 *pPicParameter = (VAEncPictureParameterBufferH264 *)encode_state->pic_param_ext->buffer;
1168     VAEncSliceParameterBufferH264 *pSliceParameter = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[slice_index]->buffer; 
1169     unsigned int *msg = NULL, offset = 0;
1170     unsigned char *msg_ptr = NULL;
1171     int width_in_mbs = (mfc_context->surface_state.width + 15) / 16;
1172     int height_in_mbs = (mfc_context->surface_state.height + 15) / 16;
1173     int last_slice = (pSliceParameter->macroblock_address + pSliceParameter->num_macroblocks) == (width_in_mbs * height_in_mbs);
1174     int i,x,y;
1175     int qp = pPicParameter->pic_init_qp + pSliceParameter->slice_qp_delta;
1176     unsigned int rate_control_mode = encoder_context->rate_control_mode;
1177     unsigned int tail_data[] = { 0x0, 0x0 };
1178     int slice_type = intel_avc_enc_slice_type_fixup(pSliceParameter->slice_type);
1179     int is_intra = slice_type == SLICE_TYPE_I;
1180     int qp_slice;
1181     int qp_mb;
1182
1183     qp_slice = qp;
1184     if (rate_control_mode != VA_RC_CQP) {
1185         qp = mfc_context->brc.qp_prime_y[encoder_context->layer.curr_frame_layer_id][slice_type];
1186         if (encode_state->slice_header_index[slice_index] == 0) {
1187             pSliceParameter->slice_qp_delta = qp - pPicParameter->pic_init_qp;
1188             qp_slice = qp;
1189         }
1190     }
1191
1192     /* only support for 8-bit pixel bit-depth */
1193     assert(pSequenceParameter->bit_depth_luma_minus8 == 0);
1194     assert(pSequenceParameter->bit_depth_chroma_minus8 == 0);
1195     assert(pPicParameter->pic_init_qp >= 0 && pPicParameter->pic_init_qp < 52);
1196     assert(qp >= 0 && qp < 52);
1197
1198     gen8_mfc_avc_slice_state(ctx,
1199                              pPicParameter,
1200                              pSliceParameter,
1201                              encode_state, encoder_context,
1202                              (rate_control_mode != VA_RC_CQP), qp_slice, slice_batch);
1203
1204     if ( slice_index == 0)
1205         intel_mfc_avc_pipeline_header_programing(ctx, encode_state, encoder_context, slice_batch);
1206
1207     intel_avc_slice_insert_packed_data(ctx, encode_state, encoder_context, slice_index, slice_batch);
1208
1209     dri_bo_map(vme_context->vme_output.bo , 1);
1210     msg_ptr = (unsigned char *)vme_context->vme_output.bo->virtual;
1211
1212     if (is_intra) {
1213         msg = (unsigned int *) (msg_ptr + pSliceParameter->macroblock_address * vme_context->vme_output.size_block);
1214     } else {
1215         msg = (unsigned int *) (msg_ptr + pSliceParameter->macroblock_address * vme_context->vme_output.size_block);
1216     }
1217    
1218     for (i = pSliceParameter->macroblock_address; 
1219          i < pSliceParameter->macroblock_address + pSliceParameter->num_macroblocks; i++) {
1220         int last_mb = (i == (pSliceParameter->macroblock_address + pSliceParameter->num_macroblocks - 1) );
1221         x = i % width_in_mbs;
1222         y = i / width_in_mbs;
1223         msg = (unsigned int *) (msg_ptr + i * vme_context->vme_output.size_block);
1224         if (vme_context->roi_enabled) {
1225             qp_mb = *(vme_context->qp_per_mb + i);
1226         } else
1227             qp_mb = qp;
1228
1229         if (is_intra) {
1230             assert(msg);
1231             gen8_mfc_avc_pak_object_intra(ctx, x, y, last_mb, qp_mb, msg, encoder_context, 0, 0, slice_batch);
1232         } else {
1233             int inter_rdo, intra_rdo;
1234             inter_rdo = msg[AVC_INTER_RDO_OFFSET] & AVC_RDO_MASK;
1235             intra_rdo = msg[AVC_INTRA_RDO_OFFSET] & AVC_RDO_MASK;
1236             offset = i * vme_context->vme_output.size_block + AVC_INTER_MV_OFFSET;
1237             if (intra_rdo < inter_rdo) { 
1238                 gen8_mfc_avc_pak_object_intra(ctx, x, y, last_mb, qp_mb, msg, encoder_context, 0, 0, slice_batch);
1239             } else {
1240                 msg += AVC_INTER_MSG_OFFSET;
1241                 gen8_mfc_avc_pak_object_inter(ctx, x, y, last_mb, qp_mb, msg, offset, encoder_context, 0, 0, pSliceParameter->slice_type, slice_batch);
1242             }
1243         }
1244     }
1245    
1246     dri_bo_unmap(vme_context->vme_output.bo);
1247
1248     if ( last_slice ) {    
1249         mfc_context->insert_object(ctx, encoder_context,
1250                                    tail_data, 2, 8,
1251                                    2, 1, 1, 0, slice_batch);
1252     } else {
1253         mfc_context->insert_object(ctx, encoder_context,
1254                                    tail_data, 1, 8,
1255                                    1, 1, 1, 0, slice_batch);
1256     }
1257 }
1258
1259 static dri_bo *
1260 gen8_mfc_avc_software_batchbuffer(VADriverContextP ctx,
1261                                   struct encode_state *encode_state,
1262                                   struct intel_encoder_context *encoder_context)
1263 {
1264     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
1265     struct intel_batchbuffer *batch;
1266     dri_bo *batch_bo;
1267     int i;
1268
1269     batch = mfc_context->aux_batchbuffer;
1270     batch_bo = batch->buffer;
1271     for (i = 0; i < encode_state->num_slice_params_ext; i++) {
1272         gen8_mfc_avc_pipeline_slice_programing(ctx, encode_state, encoder_context, i, batch);
1273     }
1274
1275     intel_batchbuffer_align(batch, 8);
1276     
1277     BEGIN_BCS_BATCH(batch, 2);
1278     OUT_BCS_BATCH(batch, 0);
1279     OUT_BCS_BATCH(batch, MI_BATCH_BUFFER_END);
1280     ADVANCE_BCS_BATCH(batch);
1281
1282     dri_bo_reference(batch_bo);
1283     intel_batchbuffer_free(batch);
1284     mfc_context->aux_batchbuffer = NULL;
1285
1286     return batch_bo;
1287 }
1288
1289
1290 static void
1291 gen8_mfc_batchbuffer_surfaces_input(VADriverContextP ctx,
1292                                     struct encode_state *encode_state,
1293                                     struct intel_encoder_context *encoder_context)
1294 {
1295     struct gen6_vme_context *vme_context = encoder_context->vme_context;
1296     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
1297
1298     assert(vme_context->vme_output.bo);
1299     mfc_context->buffer_suface_setup(ctx,
1300                                      &mfc_context->gpe_context,
1301                                      &vme_context->vme_output,
1302                                      BINDING_TABLE_OFFSET(BIND_IDX_VME_OUTPUT),
1303                                      SURFACE_STATE_OFFSET(BIND_IDX_VME_OUTPUT));
1304 }
1305
1306 static void
1307 gen8_mfc_batchbuffer_surfaces_output(VADriverContextP ctx,
1308                                      struct encode_state *encode_state,
1309                                      struct intel_encoder_context *encoder_context)
1310 {
1311     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
1312     assert(mfc_context->aux_batchbuffer_surface.bo);
1313     mfc_context->buffer_suface_setup(ctx,
1314                                      &mfc_context->gpe_context,
1315                                      &mfc_context->aux_batchbuffer_surface,
1316                                      BINDING_TABLE_OFFSET(BIND_IDX_MFC_BATCHBUFFER),
1317                                      SURFACE_STATE_OFFSET(BIND_IDX_MFC_BATCHBUFFER));
1318 }
1319
/*
 * Bind both surfaces used by the MFC batchbuffer kernel: the VME
 * output (kernel input) and the auxiliary batchbuffer (kernel output).
 */
static void
gen8_mfc_batchbuffer_surfaces_setup(VADriverContextP ctx, 
                                    struct encode_state *encode_state,
                                    struct intel_encoder_context *encoder_context)
{
    gen8_mfc_batchbuffer_surfaces_input(ctx, encode_state, encoder_context);
    gen8_mfc_batchbuffer_surfaces_output(ctx, encode_state, encoder_context);
}
1328
1329 static void
1330 gen8_mfc_batchbuffer_idrt_setup(VADriverContextP ctx, 
1331                                 struct encode_state *encode_state,
1332                                 struct intel_encoder_context *encoder_context)
1333 {
1334     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
1335     struct gen8_interface_descriptor_data *desc;
1336     int i;
1337     dri_bo *bo;
1338     unsigned char *desc_ptr;
1339
1340     bo = mfc_context->gpe_context.idrt.bo;
1341     dri_bo_map(bo, 1);
1342     assert(bo->virtual);
1343     desc_ptr = (unsigned char *)bo->virtual + mfc_context->gpe_context.idrt.offset;
1344
1345     desc = (struct gen8_interface_descriptor_data *)desc_ptr;
1346
1347     for (i = 0; i < mfc_context->gpe_context.num_kernels; i++) {
1348         struct i965_kernel *kernel;
1349         kernel = &mfc_context->gpe_context.kernels[i];
1350         assert(sizeof(*desc) == 32);
1351         /*Setup the descritor table*/
1352         memset(desc, 0, sizeof(*desc));
1353         desc->desc0.kernel_start_pointer = kernel->kernel_offset >> 6;
1354         desc->desc3.sampler_count = 0;
1355         desc->desc3.sampler_state_pointer = 0;
1356         desc->desc4.binding_table_entry_count = 1;
1357         desc->desc4.binding_table_pointer = (BINDING_TABLE_OFFSET(0) >> 5);
1358         desc->desc5.constant_urb_entry_read_offset = 0;
1359         desc->desc5.constant_urb_entry_read_length = 4;
1360
1361                 
1362         desc++;
1363     }
1364
1365     dri_bo_unmap(bo);
1366
1367     return;
1368 }
1369
1370 static void
1371 gen8_mfc_batchbuffer_constant_setup(VADriverContextP ctx, 
1372                                     struct encode_state *encode_state,
1373                                     struct intel_encoder_context *encoder_context)
1374 {
1375     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
1376     
1377     (void)mfc_context;
1378 }
1379
1380 #define AVC_PAK_LEN_IN_BYTE     48
1381 #define AVC_PAK_LEN_IN_OWORD    3
1382
1383 static void
1384 gen8_mfc_batchbuffer_emit_object_command(struct intel_batchbuffer *batch,
1385                                           uint32_t intra_flag,
1386                                           int head_offset,
1387                                           int number_mb_cmds,
1388                                           int slice_end_x,
1389                                           int slice_end_y,
1390                                           int mb_x,
1391                                           int mb_y,
1392                                           int width_in_mbs,
1393                                           int qp,
1394                                           uint32_t fwd_ref,
1395                                           uint32_t bwd_ref)
1396 {
1397     uint32_t temp_value;
1398     BEGIN_BATCH(batch, 14);
1399     
1400     OUT_BATCH(batch, CMD_MEDIA_OBJECT | (14 - 2));
1401     OUT_BATCH(batch, 0);
1402     OUT_BATCH(batch, 0);
1403     OUT_BATCH(batch, 0);
1404     OUT_BATCH(batch, 0);
1405     OUT_BATCH(batch, 0);
1406    
1407     /*inline data */
1408     OUT_BATCH(batch, head_offset / 16);
1409     OUT_BATCH(batch, (intra_flag) | (qp << 16));
1410     temp_value = (mb_x | (mb_y << 8) | (width_in_mbs << 16));
1411     OUT_BATCH(batch, temp_value);
1412
1413     OUT_BATCH(batch, number_mb_cmds);
1414
1415     OUT_BATCH(batch,
1416               ((slice_end_y << 8) | (slice_end_x)));
1417     OUT_BATCH(batch, fwd_ref);
1418     OUT_BATCH(batch, bwd_ref);
1419
1420     OUT_BATCH(batch, MI_NOOP);
1421
1422     ADVANCE_BATCH(batch);
1423 }
1424
/*
 * Split one H.264 slice into a series of MEDIA_OBJECT commands for the
 * PAK batchbuffer kernel.  The run length per object is a heuristic
 * based on picture width; when ROI (per-MB QP) is enabled a run is
 * further limited to consecutive MBs sharing the same QP so a single
 * QP value can be passed per object.
 */
static void
gen8_mfc_avc_batchbuffer_slice_command(VADriverContextP ctx,
                                        struct intel_encoder_context *encoder_context,
                                        VAEncSliceParameterBufferH264 *slice_param,
                                        int head_offset,
                                        int qp,
                                        int last_slice)  /* NOTE(review): currently unused */
{
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    struct gen6_vme_context *vme_context = encoder_context->vme_context;
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
    int width_in_mbs = (mfc_context->surface_state.width + 15) / 16;
    int total_mbs = slice_param->num_macroblocks;
    int slice_type = intel_avc_enc_slice_type_fixup(slice_param->slice_type);
    int number_mb_cmds = 128;   /* dead initializer: overwritten below before first use */
    int starting_offset = 0;
    int mb_x, mb_y;
    int last_mb, slice_end_x, slice_end_y;
    int remaining_mb = total_mbs;
    uint32_t fwd_ref , bwd_ref, mb_flag;
    char tmp_qp;    /* NOTE(review): plain char — assumes QP values fit 0..127 */
    int number_roi_mbs, max_mb_cmds, i;

    /* coordinates of the last MB in the slice, passed to every object */
    last_mb = slice_param->macroblock_address + total_mbs - 1;
    slice_end_x = last_mb % width_in_mbs;
    slice_end_y = last_mb / width_in_mbs;

    if (slice_type == SLICE_TYPE_I) {
        fwd_ref = 0;
        bwd_ref = 0;
        mb_flag = 1;    /* intra flag handed to the kernel */
    } else {
        fwd_ref = vme_context->ref_index_in_mb[0];
        bwd_ref = vme_context->ref_index_in_mb[1];
        mb_flag = 0;
    }

    /* Heuristic: wider pictures get proportionally shorter MB runs so
     * more MEDIA_OBJECTs (threads) are issued per row. */
    if (width_in_mbs >= 100) {
        number_mb_cmds = width_in_mbs / 5;
    } else if (width_in_mbs >= 80) {
        number_mb_cmds = width_in_mbs / 4;
    } else if (width_in_mbs >= 60) {
        number_mb_cmds = width_in_mbs / 3;
    } else if (width_in_mbs >= 40) {
        number_mb_cmds = width_in_mbs / 2;
    } else {
        number_mb_cmds = width_in_mbs;
    }

    max_mb_cmds = number_mb_cmds;

    do {
        /* first MB of this run, in MB coordinates */
        mb_x = (slice_param->macroblock_address + starting_offset) % width_in_mbs;
        mb_y = (slice_param->macroblock_address + starting_offset) / width_in_mbs;

        number_mb_cmds = max_mb_cmds;
        if (vme_context->roi_enabled) {

            /* shrink the run to MBs that share the same per-MB QP */
            number_roi_mbs = 1;
            tmp_qp = *(vme_context->qp_per_mb + starting_offset);
            for (i = 1; i < max_mb_cmds; i++) {
                if (tmp_qp != *(vme_context->qp_per_mb + starting_offset + i))
                    break;

                number_roi_mbs++;
            }

            number_mb_cmds = number_roi_mbs;
            qp = tmp_qp;
        }

        /* never run past the end of the slice */
        if (number_mb_cmds >= remaining_mb) {
            number_mb_cmds = remaining_mb;
        }

        gen8_mfc_batchbuffer_emit_object_command(batch,
                                                  mb_flag,
                                                  head_offset,
                                                  number_mb_cmds,
                                                  slice_end_x,
                                                  slice_end_y,
                                                  mb_x,
                                                  mb_y,
                                                  width_in_mbs,
                                                  qp,
                                                  fwd_ref,
                                                  bwd_ref);

        /* advance the output offset by the PAK bytes this run produces */
        head_offset += (number_mb_cmds * AVC_PAK_LEN_IN_BYTE);
        remaining_mb -= number_mb_cmds;
        starting_offset += number_mb_cmds;
    } while (remaining_mb > 0);
}
1518
/*
 * Prepare one H.264 slice inside the aux (slice) batchbuffer: slice
 * state, the pipeline headers for the first slice, packed user data,
 * a reserved region where the GPU kernel will deposit the per-MB PAK
 * commands, the MEDIA_OBJECTs driving that kernel, and tail padding.
 */
static void
gen8_mfc_avc_batchbuffer_slice(VADriverContextP ctx,
                                struct encode_state *encode_state,
                                struct intel_encoder_context *encoder_context,
                                int slice_index)
{
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
    struct intel_batchbuffer *slice_batch = mfc_context->aux_batchbuffer;
    VAEncSequenceParameterBufferH264 *pSequenceParameter = (VAEncSequenceParameterBufferH264 *)encode_state->seq_param_ext->buffer;
    VAEncPictureParameterBufferH264 *pPicParameter = (VAEncPictureParameterBufferH264 *)encode_state->pic_param_ext->buffer;
    VAEncSliceParameterBufferH264 *pSliceParameter = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[slice_index]->buffer;
    int width_in_mbs = (mfc_context->surface_state.width + 15) / 16;
    int height_in_mbs = (mfc_context->surface_state.height + 15) / 16;
    /* true when this slice ends exactly at the last MB of the picture */
    int last_slice = (pSliceParameter->macroblock_address + pSliceParameter->num_macroblocks) == (width_in_mbs * height_in_mbs);
    int qp = pPicParameter->pic_init_qp + pSliceParameter->slice_qp_delta;
    unsigned int rate_control_mode = encoder_context->rate_control_mode;
    unsigned int tail_data[] = { 0x0, 0x0 };
    long head_offset;
    int slice_type = intel_avc_enc_slice_type_fixup(pSliceParameter->slice_type);
    int qp_slice;

    qp_slice = qp;
    if (rate_control_mode != VA_RC_CQP) {
        /* BRC supplies the actual QP for this frame layer / slice type */
        qp = mfc_context->brc.qp_prime_y[encoder_context->layer.curr_frame_layer_id][slice_type];
        if (encode_state->slice_header_index[slice_index] == 0) {
            /* driver-generated slice header: patch the delta so the
             * bitstream header matches the BRC-chosen QP */
            pSliceParameter->slice_qp_delta = qp - pPicParameter->pic_init_qp;
            qp_slice = qp;
        }
    }

    /* only support for 8-bit pixel bit-depth */
    assert(pSequenceParameter->bit_depth_luma_minus8 == 0);
    assert(pSequenceParameter->bit_depth_chroma_minus8 == 0);
    assert(pPicParameter->pic_init_qp >= 0 && pPicParameter->pic_init_qp < 52);
    assert(qp >= 0 && qp < 52);

    gen8_mfc_avc_slice_state(ctx,
                              pPicParameter,
                              pSliceParameter,
                              encode_state,
                              encoder_context,
                              (rate_control_mode != VA_RC_CQP),
                              qp_slice,
                              slice_batch);

    if (slice_index == 0)
        intel_mfc_avc_pipeline_header_programing(ctx, encode_state, encoder_context, slice_batch);

    intel_avc_slice_insert_packed_data(ctx, encode_state, encoder_context, slice_index, slice_batch);

    intel_batchbuffer_align(slice_batch, 64); /* cache-line aligned */
    head_offset = intel_batchbuffer_used_size(slice_batch);

    /* Reserve space: the hardware batchbuffer kernel writes
     * AVC_PAK_LEN_IN_BYTE bytes of PAK commands per macroblock here. */
    slice_batch->ptr += pSliceParameter->num_macroblocks * AVC_PAK_LEN_IN_BYTE;

    gen8_mfc_avc_batchbuffer_slice_command(ctx,
                                            encoder_context,
                                            pSliceParameter,
                                            head_offset,
                                            qp,
                                            last_slice);


    /* Aligned for tail */
    intel_batchbuffer_align(slice_batch, 64); /* cache-line aligned */
    if (last_slice) {
        /* two tail dwords after the last slice of the picture */
        mfc_context->insert_object(ctx,
                                   encoder_context,
                                   tail_data,
                                   2,
                                   8,
                                   2,
                                   1,
                                   1,
                                   0,
                                   slice_batch);
    } else {
        /* a single tail dword between slices */
        mfc_context->insert_object(ctx,
                                   encoder_context,
                                   tail_data,
                                   1,
                                   8,
                                   1,
                                   1,
                                   1,
                                   0,
                                   slice_batch);
    }

    return;
}
1610
/*
 * Run the media (GPGPU) pipeline that fills the aux batchbuffer with
 * PAK commands: set up the GPE pipeline, emit one batch of
 * MEDIA_OBJECTs per slice, terminate the aux batch with
 * MI_BATCH_BUFFER_END, then flush and submit the render batch.
 */
static void
gen8_mfc_avc_batchbuffer_pipeline(VADriverContextP ctx,
                                  struct encode_state *encode_state,
                                  struct intel_encoder_context *encoder_context)
{
    struct i965_driver_data *i965 = i965_driver_data(ctx);
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    int i;

    intel_batchbuffer_start_atomic(batch, 0x4000);

    if (IS_GEN9(i965->intel.device_info))
        gen9_gpe_pipeline_setup(ctx, &mfc_context->gpe_context, batch);
    else
        gen8_gpe_pipeline_setup(ctx, &mfc_context->gpe_context, batch);

    for ( i = 0; i < encode_state->num_slice_params_ext; i++) {
        gen8_mfc_avc_batchbuffer_slice(ctx, encode_state, encoder_context, i);
    }
    {
        struct intel_batchbuffer *slice_batch = mfc_context->aux_batchbuffer;

        /* close the aux batch: a padding dword, then the end marker */
        intel_batchbuffer_align(slice_batch, 8);
        BEGIN_BCS_BATCH(slice_batch, 2);
        OUT_BCS_BATCH(slice_batch, 0);
        OUT_BCS_BATCH(slice_batch, MI_BATCH_BUFFER_END);
        ADVANCE_BCS_BATCH(slice_batch);

        BEGIN_BATCH(batch, 2);
        OUT_BATCH(batch, CMD_MEDIA_STATE_FLUSH);
        OUT_BATCH(batch, 0);
        ADVANCE_BATCH(batch);

        /* the wrapper is freed here; its bo lives on via the extra
         * reference taken in gen8_mfc_avc_hardware_batchbuffer() */
        intel_batchbuffer_free(slice_batch);
        mfc_context->aux_batchbuffer = NULL;
    }

    if (IS_GEN9(i965->intel.device_info))
        gen9_gpe_pipeline_end(ctx, &mfc_context->gpe_context, batch);

    intel_batchbuffer_end_atomic(batch);
    intel_batchbuffer_flush(batch);

}
1656
/*
 * Build the AVC slice batchbuffer with the GPU kernel: program the
 * kernel's surfaces, interface descriptors and constants, then run the
 * media pipeline that generates the PAK commands.
 */
static void
gen8_mfc_build_avc_batchbuffer(VADriverContextP ctx,
                               struct encode_state *encode_state,
                               struct intel_encoder_context *encoder_context)
{
    gen8_mfc_batchbuffer_surfaces_setup(ctx, encode_state, encoder_context);
    gen8_mfc_batchbuffer_idrt_setup(ctx, encode_state, encoder_context);
    gen8_mfc_batchbuffer_constant_setup(ctx, encode_state, encoder_context);
    gen8_mfc_avc_batchbuffer_pipeline(ctx, encode_state, encoder_context);
}
1667
/*
 * Generate the slice batchbuffer on the GPU and return its buffer
 * object.  An extra reference is taken before building (the build path
 * frees the aux batchbuffer wrapper); the caller is responsible for
 * dropping the returned reference.
 */
static dri_bo *
gen8_mfc_avc_hardware_batchbuffer(VADriverContextP ctx,
                                  struct encode_state *encode_state,
                                  struct intel_encoder_context *encoder_context)
{
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;

    dri_bo_reference(mfc_context->aux_batchbuffer_surface.bo);
    gen8_mfc_build_avc_batchbuffer(ctx, encode_state, encoder_context);

    return mfc_context->aux_batchbuffer_surface.bo;
}
1680
/*
 * Program the BCS (MFX) ring for one AVC frame: emit the picture-level
 * state, then chain to a second-level batch buffer containing all the
 * slice/MB PAK commands, built either in software or by the GPU kernel.
 */
static void
gen8_mfc_avc_pipeline_programing(VADriverContextP ctx,
                                 struct encode_state *encode_state,
                                 struct intel_encoder_context *encoder_context)
{
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    dri_bo *slice_batch_bo;

    if ( intel_mfc_interlace_check(ctx, encode_state, encoder_context) ) {
        fprintf(stderr, "Current VA driver don't support interlace mode!\n");
        assert(0);
        return;
    }

    if (encoder_context->soft_batch_force)
        slice_batch_bo = gen8_mfc_avc_software_batchbuffer(ctx, encode_state, encoder_context);
    else
        slice_batch_bo = gen8_mfc_avc_hardware_batchbuffer(ctx, encode_state, encoder_context);


    // begin programing
    intel_batchbuffer_start_atomic_bcs(batch, 0x4000);
    intel_batchbuffer_emit_mi_flush(batch);

    // picture level programing
    gen8_mfc_avc_pipeline_picture_programing(ctx, encode_state, encoder_context);

    /* chain to the slice batch; (1 << 8) marks it as a second-level
     * batch so execution returns here on its MI_BATCH_BUFFER_END */
    BEGIN_BCS_BATCH(batch, 3);
    OUT_BCS_BATCH(batch, MI_BATCH_BUFFER_START | (1 << 8) | (1 << 0));
    OUT_BCS_RELOC(batch,
                  slice_batch_bo,
                  I915_GEM_DOMAIN_COMMAND, 0,
                  0);
    OUT_BCS_BATCH(batch, 0);
    ADVANCE_BCS_BATCH(batch);

    // end programing
    intel_batchbuffer_end_atomic(batch);

    /* drop the reference returned by the batchbuffer builder */
    dri_bo_unreference(slice_batch_bo);
}
1722
1723
/*
 * Encode one AVC frame.  Under CBR/VBR the frame is re-encoded with
 * updated BRC parameters until the rate control reports no HRD
 * violation; an unrepairable overflow/underflow (already at min/max QP)
 * is reported once on stderr and then accepted as-is.
 */
static VAStatus
gen8_mfc_avc_encode_picture(VADriverContextP ctx,
                            struct encode_state *encode_state,
                            struct intel_encoder_context *encoder_context)
{
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
    unsigned int rate_control_mode = encoder_context->rate_control_mode;
    int current_frame_bits_size;
    int sts;

    for (;;) {
        gen8_mfc_init(ctx, encode_state, encoder_context);
        intel_mfc_avc_prepare(ctx, encode_state, encoder_context);
        /*Programing bcs pipeline*/
        gen8_mfc_avc_pipeline_programing(ctx, encode_state, encoder_context);   //filling the pipeline
        gen8_mfc_run(ctx, encode_state, encoder_context);
        if (rate_control_mode == VA_RC_CBR || rate_control_mode == VA_RC_VBR) {
            /* measure the produced frame and let BRC judge it */
            gen8_mfc_stop(ctx, encode_state, encoder_context, &current_frame_bits_size);
            sts = intel_mfc_brc_postpack(encode_state, encoder_context, current_frame_bits_size);
            if (sts == BRC_NO_HRD_VIOLATION) {
                intel_mfc_hrd_context_update(encode_state, mfc_context);
                break;
            }
            else if (sts == BRC_OVERFLOW_WITH_MIN_QP || sts == BRC_UNDERFLOW_WITH_MAX_QP) {
                /* QP already at its limit: warn once and keep the frame */
                if (!mfc_context->hrd.violation_noted) {
                    fprintf(stderr, "Unrepairable %s!\n", (sts == BRC_OVERFLOW_WITH_MIN_QP)? "overflow": "underflow");
                    mfc_context->hrd.violation_noted = 1;
                }
                return VA_STATUS_SUCCESS;
            }
            /* otherwise: BRC adjusted QP, loop and re-encode the frame */
        } else {
            break;
        }
    }

    return VA_STATUS_SUCCESS;
}
1761
1762 /*
1763  * MPEG-2
1764  */
1765
/* Map the VA-API MPEG-2 picture_type index (0 = I, 1 = P, 2 = B) to the
 * picture coding type value programmed into MFX_MPEG2_PIC_STATE. */
static const int
va_to_gen8_mpeg2_picture_type[3] = {
    1,  /* I */
    2,  /* P */
    3   /* B */
};
1772
/*
 * Emit MFX_MPEG2_PIC_STATE for encoding: f_codes and picture coding
 * extension flags, picture type, frame dimensions in MBs, and the
 * per-MB size limits used by the PAK unit.
 */
static void
gen8_mfc_mpeg2_pic_state(VADriverContextP ctx,
                         struct intel_encoder_context *encoder_context,
                         struct encode_state *encode_state)
{
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
    VAEncPictureParameterBufferMPEG2 *pic_param;
    int width_in_mbs = (mfc_context->surface_state.width + 15) / 16;
    int height_in_mbs = (mfc_context->surface_state.height + 15) / 16;
    VAEncSliceParameterBufferMPEG2 *slice_param = NULL;

    assert(encode_state->pic_param_ext && encode_state->pic_param_ext->buffer);
    pic_param = (VAEncPictureParameterBufferMPEG2 *)encode_state->pic_param_ext->buffer;
    /* only the first slice's quantiser is inspected below */
    slice_param = (VAEncSliceParameterBufferMPEG2 *)encode_state->slice_params_ext[0]->buffer;

    BEGIN_BCS_BATCH(batch, 13);
    OUT_BCS_BATCH(batch, MFX_MPEG2_PIC_STATE | (13 - 2));
    OUT_BCS_BATCH(batch,
                  (pic_param->f_code[1][1] & 0xf) << 28 | /* f_code[1][1] */
                  (pic_param->f_code[1][0] & 0xf) << 24 | /* f_code[1][0] */
                  (pic_param->f_code[0][1] & 0xf) << 20 | /* f_code[0][1] */
                  (pic_param->f_code[0][0] & 0xf) << 16 | /* f_code[0][0] */
                  pic_param->picture_coding_extension.bits.intra_dc_precision << 14 |
                  pic_param->picture_coding_extension.bits.picture_structure << 12 |
                  pic_param->picture_coding_extension.bits.top_field_first << 11 |
                  pic_param->picture_coding_extension.bits.frame_pred_frame_dct << 10 |
                  pic_param->picture_coding_extension.bits.concealment_motion_vectors << 9 |
                  pic_param->picture_coding_extension.bits.q_scale_type << 8 |
                  pic_param->picture_coding_extension.bits.intra_vlc_format << 7 |
                  pic_param->picture_coding_extension.bits.alternate_scan << 6);
    OUT_BCS_BATCH(batch,
                  0 << 14 |     /* LoadSlicePointerFlag, 0 means only loading bitstream pointer once */
                  va_to_gen8_mpeg2_picture_type[pic_param->picture_type] << 9 |
                  0);
    OUT_BCS_BATCH(batch,
                  1 << 31 |     /* slice concealment */
                  (height_in_mbs - 1) << 16 |
                  (width_in_mbs - 1));

    /* NOTE(review): magic dword for high quantiser scales — presumably
     * a PAK tuning value; verify against the MFX_MPEG2_PIC_STATE spec */
    if (slice_param && slice_param->quantiser_scale_code >= 14)
        OUT_BCS_BATCH(batch, (3 << 1) | (1 << 4) | (5 << 8) | (1 << 12));
    else
        OUT_BCS_BATCH(batch, 0);

    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch,
                  0xFFF << 16 | /* InterMBMaxSize */
                  0xFFF << 0 |  /* IntraMBMaxSize */
                  0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    ADVANCE_BCS_BATCH(batch);
}
1831
1832 static void
1833 gen8_mfc_mpeg2_qm_state(VADriverContextP ctx, struct intel_encoder_context *encoder_context)
1834 {
1835     unsigned char intra_qm[64] = {
1836         8, 16, 19, 22, 26, 27, 29, 34,
1837         16, 16, 22, 24, 27, 29, 34, 37,
1838         19, 22, 26, 27, 29, 34, 34, 38,
1839         22, 22, 26, 27, 29, 34, 37, 40,
1840         22, 26, 27, 29, 32, 35, 40, 48,
1841         26, 27, 29, 32, 35, 40, 48, 58,
1842         26, 27, 29, 34, 38, 46, 56, 69,
1843         27, 29, 35, 38, 46, 56, 69, 83
1844     };
1845
1846     unsigned char non_intra_qm[64] = {
1847         16, 16, 16, 16, 16, 16, 16, 16,
1848         16, 16, 16, 16, 16, 16, 16, 16,
1849         16, 16, 16, 16, 16, 16, 16, 16,
1850         16, 16, 16, 16, 16, 16, 16, 16,
1851         16, 16, 16, 16, 16, 16, 16, 16,
1852         16, 16, 16, 16, 16, 16, 16, 16,
1853         16, 16, 16, 16, 16, 16, 16, 16,
1854         16, 16, 16, 16, 16, 16, 16, 16
1855     };
1856
1857     gen8_mfc_qm_state(ctx, MFX_QM_MPEG_INTRA_QUANTIZER_MATRIX, (unsigned int *)intra_qm, 16, encoder_context);
1858     gen8_mfc_qm_state(ctx, MFX_QM_MPEG_NON_INTRA_QUANTIZER_MATRIX, (unsigned int *)non_intra_qm, 16,encoder_context);
1859 }
1860
1861 static void
1862 gen8_mfc_mpeg2_fqm_state(VADriverContextP ctx, struct intel_encoder_context *encoder_context)
1863 {
1864     unsigned short intra_fqm[64] = {
1865         65536/0x8, 65536/0x10, 65536/0x13, 65536/0x16, 65536/0x16, 65536/0x1a, 65536/0x1a, 65536/0x1b,
1866         65536/0x10, 65536/0x10, 65536/0x16, 65536/0x16, 65536/0x1a, 65536/0x1b, 65536/0x1b, 65536/0x1d,
1867         65536/0x13, 65536/0x16, 65536/0x1a, 65536/0x1a, 65536/0x1b, 65536/0x1d, 65536/0x1d, 65536/0x23,
1868         65536/0x16, 65536/0x18, 65536/0x1b, 65536/0x1b, 65536/0x13, 65536/0x20, 65536/0x22, 65536/0x26,
1869         65536/0x1a, 65536/0x1b, 65536/0x13, 65536/0x13, 65536/0x20, 65536/0x23, 65536/0x26, 65536/0x2e,
1870         65536/0x1b, 65536/0x1d, 65536/0x22, 65536/0x22, 65536/0x23, 65536/0x28, 65536/0x2e, 65536/0x38,
1871         65536/0x1d, 65536/0x22, 65536/0x22, 65536/0x25, 65536/0x28, 65536/0x30, 65536/0x38, 65536/0x45,
1872         65536/0x22, 65536/0x25, 65536/0x26, 65536/0x28, 65536/0x30, 65536/0x3a, 65536/0x45, 65536/0x53,
1873     };
1874
1875     unsigned short non_intra_fqm[64] = {
1876         0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000,
1877         0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000,
1878         0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000,
1879         0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000,
1880         0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000,
1881         0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000,
1882         0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000,
1883         0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000,
1884     };
1885
1886     gen8_mfc_fqm_state(ctx, MFX_QM_MPEG_INTRA_QUANTIZER_MATRIX, (unsigned int *)intra_fqm, 32, encoder_context);
1887     gen8_mfc_fqm_state(ctx, MFX_QM_MPEG_NON_INTRA_QUANTIZER_MATRIX, (unsigned int *)non_intra_fqm, 32, encoder_context);
1888 }
1889
/*
 * Emit MFC_MPEG2_SLICEGROUP_STATE for one slice group: flags, the MB
 * coordinates of this group's start and of the next group's start, the
 * group QP, and the output bitstream offset.
 */
static void
gen8_mfc_mpeg2_slicegroup_state(VADriverContextP ctx,
                                struct intel_encoder_context *encoder_context,
                                int x, int y,
                                int next_x, int next_y,
                                int is_fisrt_slice_group,  /* NOTE(review): unused (name also misspelled) */
                                int is_last_slice_group,
                                int intra_slice,
                                int qp,
                                struct intel_batchbuffer *batch)
{
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;

    if (batch == NULL)
        batch = encoder_context->base.batch;

    BEGIN_BCS_BATCH(batch, 8);

    OUT_BCS_BATCH(batch, MFC_MPEG2_SLICEGROUP_STATE | (8 - 2));
    OUT_BCS_BATCH(batch,
                  0 << 31 |                             /* MbRateCtrlFlag */
                  !!is_last_slice_group << 19 |         /* IsLastSliceGrp */
                  1 << 17 |                             /* Insert Header before the first slice group data */
                  1 << 16 |                             /* SliceData PresentFlag: always 1 */
                  1 << 15 |                             /* TailPresentFlag: always 1 */
                  0 << 14 |                             /* FirstSliceHdrDisabled: slice header for each slice */
                  !!intra_slice << 13 |                 /* IntraSlice */
                  !!intra_slice << 12 |                 /* IntraSliceFlag */
                  0);
    OUT_BCS_BATCH(batch,
                  next_y << 24 |
                  next_x << 16 |
                  y << 8 |
                  x << 0 |
                  0);
    OUT_BCS_BATCH(batch, qp);   /* FIXME: SliceGroupQp */
    /* bitstream pointer is only loaded once for the first slice of a frame when
     * LoadSlicePointerFlag is 0
     */
    OUT_BCS_BATCH(batch, mfc_context->mfc_indirect_pak_bse_object.offset);
    OUT_BCS_BATCH(batch, 0);    /* FIXME: */
    OUT_BCS_BATCH(batch, 0);    /* FIXME: CorrectPoints */
    OUT_BCS_BATCH(batch, 0);    /* FIXME: CVxxx */

    ADVANCE_BCS_BATCH(batch);
}
1936
/*
 * Emit an MFC_MPEG2_PAK_OBJECT for one intra-coded macroblock.
 * All motion-vector dwords are zero for intra MBs.
 * Returns the command length in dwords so the caller can track batch
 * usage.
 */
static int
gen8_mfc_mpeg2_pak_object_intra(VADriverContextP ctx,
                                struct intel_encoder_context *encoder_context,
                                int x, int y,
                                int first_mb_in_slice,
                                int last_mb_in_slice,
                                int first_mb_in_slice_group,
                                int last_mb_in_slice_group,
                                int mb_type,
                                int qp_scale_code,
                                int coded_block_pattern,
                                unsigned char target_size_in_word,
                                unsigned char max_size_in_word,
                                struct intel_batchbuffer *batch)
{
    int len_in_dwords = 9;

    if (batch == NULL)
        batch = encoder_context->base.batch;

    BEGIN_BCS_BATCH(batch, len_in_dwords);

    OUT_BCS_BATCH(batch, MFC_MPEG2_PAK_OBJECT | (len_in_dwords - 2));
    OUT_BCS_BATCH(batch,
                  0 << 24 |     /* PackedMvNum */
                  0 << 20 |     /* MvFormat */
                  7 << 17 |     /* CbpDcY/CbpDcU/CbpDcV */
                  0 << 15 |     /* TransformFlag: frame DCT */
                  0 << 14 |     /* FieldMbFlag */
                  1 << 13 |     /* IntraMbFlag */
                  mb_type << 8 |   /* MbType: Intra */
                  0 << 2 |      /* SkipMbFlag */
                  0 << 0 |      /* InterMbMode */
                  0);
    OUT_BCS_BATCH(batch, y << 16 | x);      /* MB position */
    OUT_BCS_BATCH(batch,
                  max_size_in_word << 24 |
                  target_size_in_word << 16 |
                  coded_block_pattern << 6 |      /* CBP */
                  0);
    OUT_BCS_BATCH(batch,
                  last_mb_in_slice << 31 |
                  first_mb_in_slice << 30 |
                  0 << 27 |     /* EnableCoeffClamp */
                  last_mb_in_slice_group << 26 |
                  0 << 25 |     /* MbSkipConvDisable */
                  first_mb_in_slice_group << 24 |
                  0 << 16 |     /* MvFieldSelect */
                  qp_scale_code << 0 |
                  0);
    OUT_BCS_BATCH(batch, 0);    /* MV[0][0] */
    OUT_BCS_BATCH(batch, 0);    /* MV[1][0] */
    OUT_BCS_BATCH(batch, 0);    /* MV[0][1] */
    OUT_BCS_BATCH(batch, 0);    /* MV[1][1] */

    ADVANCE_BCS_BATCH(batch);

    return len_in_dwords;
}
1996
1997 /* Byte offset */
1998 #define MPEG2_INTER_MV_OFFSET   48 
1999
/* Legal motion-vector range per MPEG-2 f_code, in half-pel units. */
static struct _mv_ranges
{
    int low;    /* in the unit of 1/2 pixel */
    int high;   /* in the unit of 1/2 pixel */
} mv_ranges[] = {
    {0, 0},             /* f_code 0 is invalid: no clamping applied */
    {-16, 15},
    {-32, 31},
    {-64, 63},
    {-128, 127},
    {-256, 255},
    {-512, 511},
    {-1024, 1023},
    {-2048, 2047},
    {-4096, 4095}
};

/*
 * Sanitize a half-pel motion vector for the MB at position `pos`
 * (in MB units along one axis): zero it if it points outside the
 * picture (`display_max` is the picture size in pixels along that
 * axis), then clamp it to the range allowed by `f_code`.
 */
static int
mpeg2_motion_vector(int mv, int pos, int display_max, int f_code)
{
    int below_start = (mv + pos * 16 * 2) < 0;
    int past_end = (mv + (pos + 1) * 16 * 2) > (display_max * 2);

    if (below_start || past_end)
        mv = 0;

    if (f_code <= 0 || f_code >= 10)
        return mv;

    if (mv < mv_ranges[f_code].low)
        return mv_ranges[f_code].low;

    if (mv > mv_ranges[f_code].high)
        return mv_ranges[f_code].high;

    return mv;
}
2034
2035 static int
2036 gen8_mfc_mpeg2_pak_object_inter(VADriverContextP ctx,
2037                                 struct encode_state *encode_state,
2038                                 struct intel_encoder_context *encoder_context,
2039                                 unsigned int *msg,
2040                                 int width_in_mbs, int height_in_mbs,
2041                                 int x, int y,
2042                                 int first_mb_in_slice,
2043                                 int last_mb_in_slice,
2044                                 int first_mb_in_slice_group,
2045                                 int last_mb_in_slice_group,
2046                                 int qp_scale_code,
2047                                 unsigned char target_size_in_word,
2048                                 unsigned char max_size_in_word,
2049                                 struct intel_batchbuffer *batch)
2050 {
2051     VAEncPictureParameterBufferMPEG2 *pic_param = (VAEncPictureParameterBufferMPEG2 *)encode_state->pic_param_ext->buffer;
2052     int len_in_dwords = 9;
2053     short *mvptr, mvx0, mvy0, mvx1, mvy1;
2054     
2055     if (batch == NULL)
2056         batch = encoder_context->base.batch;
2057
2058     mvptr = (short *)((unsigned char *)msg + MPEG2_INTER_MV_OFFSET);;
2059     mvx0 = mpeg2_motion_vector(mvptr[0] / 2, x, width_in_mbs * 16, pic_param->f_code[0][0]);
2060     mvy0 = mpeg2_motion_vector(mvptr[1] / 2, y, height_in_mbs * 16, pic_param->f_code[0][0]);
2061     mvx1 = mpeg2_motion_vector(mvptr[2] / 2, x, width_in_mbs * 16, pic_param->f_code[1][0]);
2062     mvy1 = mpeg2_motion_vector(mvptr[3] / 2, y, height_in_mbs * 16, pic_param->f_code[1][0]);
2063
2064     BEGIN_BCS_BATCH(batch, len_in_dwords);
2065
2066     OUT_BCS_BATCH(batch, MFC_MPEG2_PAK_OBJECT | (len_in_dwords - 2));
2067     OUT_BCS_BATCH(batch,
2068                   2 << 24 |     /* PackedMvNum */
2069                   7 << 20 |     /* MvFormat */
2070                   7 << 17 |     /* CbpDcY/CbpDcU/CbpDcV */
2071                   0 << 15 |     /* TransformFlag: frame DCT */
2072                   0 << 14 |     /* FieldMbFlag */
2073                   0 << 13 |     /* IntraMbFlag */
2074                   1 << 8 |      /* MbType: Frame-based */
2075                   0 << 2 |      /* SkipMbFlag */
2076                   0 << 0 |      /* InterMbMode */
2077                   0);
2078     OUT_BCS_BATCH(batch, y << 16 | x);
2079     OUT_BCS_BATCH(batch,
2080                   max_size_in_word << 24 |
2081                   target_size_in_word << 16 |
2082                   0x3f << 6 |   /* CBP */
2083                   0);
2084     OUT_BCS_BATCH(batch,
2085                   last_mb_in_slice << 31 |
2086                   first_mb_in_slice << 30 |
2087                   0 << 27 |     /* EnableCoeffClamp */
2088                   last_mb_in_slice_group << 26 |
2089                   0 << 25 |     /* MbSkipConvDisable */
2090                   first_mb_in_slice_group << 24 |
2091                   0 << 16 |     /* MvFieldSelect */
2092                   qp_scale_code << 0 |
2093                   0);
2094
2095     OUT_BCS_BATCH(batch, (mvx0 & 0xFFFF) | mvy0 << 16);    /* MV[0][0] */
2096     OUT_BCS_BATCH(batch, (mvx1 & 0xFFFF) | mvy1 << 16);    /* MV[1][0] */
2097     OUT_BCS_BATCH(batch, 0);    /* MV[0][1] */
2098     OUT_BCS_BATCH(batch, 0);    /* MV[1][1] */
2099
2100     ADVANCE_BCS_BATCH(batch);
2101
2102     return len_in_dwords;
2103 }
2104
2105 static void
2106 intel_mfc_mpeg2_pipeline_header_programing(VADriverContextP ctx,
2107                                            struct encode_state *encode_state,
2108                                            struct intel_encoder_context *encoder_context,
2109                                            struct intel_batchbuffer *slice_batch)
2110 {
2111     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
2112     int idx = va_enc_packed_type_to_idx(VAEncPackedHeaderMPEG2_SPS);
2113
2114     if (encode_state->packed_header_data[idx]) {
2115         VAEncPackedHeaderParameterBuffer *param = NULL;
2116         unsigned int *header_data = (unsigned int *)encode_state->packed_header_data[idx]->buffer;
2117         unsigned int length_in_bits;
2118
2119         assert(encode_state->packed_header_param[idx]);
2120         param = (VAEncPackedHeaderParameterBuffer *)encode_state->packed_header_param[idx]->buffer;
2121         length_in_bits = param->bit_length;
2122
2123         mfc_context->insert_object(ctx,
2124                                    encoder_context,
2125                                    header_data,
2126                                    ALIGN(length_in_bits, 32) >> 5,
2127                                    length_in_bits & 0x1f,
2128                                    5,   /* FIXME: check it */
2129                                    0,
2130                                    0,
2131                                    0,   /* Needn't insert emulation bytes for MPEG-2 */
2132                                    slice_batch);
2133     }
2134
2135     idx = va_enc_packed_type_to_idx(VAEncPackedHeaderMPEG2_PPS);
2136
2137     if (encode_state->packed_header_data[idx]) {
2138         VAEncPackedHeaderParameterBuffer *param = NULL;
2139         unsigned int *header_data = (unsigned int *)encode_state->packed_header_data[idx]->buffer;
2140         unsigned int length_in_bits;
2141
2142         assert(encode_state->packed_header_param[idx]);
2143         param = (VAEncPackedHeaderParameterBuffer *)encode_state->packed_header_param[idx]->buffer;
2144         length_in_bits = param->bit_length;
2145
2146         mfc_context->insert_object(ctx,
2147                                    encoder_context,
2148                                    header_data,
2149                                    ALIGN(length_in_bits, 32) >> 5,
2150                                    length_in_bits & 0x1f,
2151                                    5,   /* FIXME: check it */
2152                                    0,
2153                                    0,
2154                                    0,   /* Needn't insert emulation bytes for MPEG-2 */
2155                                    slice_batch);
2156     }
2157 }
2158
/*
 * Program one MPEG-2 slice group into the slice batch: slice-group state,
 * packed headers (first group only), a section delimiter, one PAK object
 * per macroblock, and the section/tail delimiter.
 *
 * For non-intra slices the per-MB VME output is consulted to decide
 * between intra and inter coding based on the RDO costs.
 */
static void 
gen8_mfc_mpeg2_pipeline_slice_group(VADriverContextP ctx,
                                    struct encode_state *encode_state,
                                    struct intel_encoder_context *encoder_context,
                                    int slice_index,
                                    VAEncSliceParameterBufferMPEG2 *next_slice_group_param,
                                    struct intel_batchbuffer *slice_batch)
{
    struct gen6_vme_context *vme_context = encoder_context->vme_context;
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
    VAEncSequenceParameterBufferMPEG2 *seq_param = (VAEncSequenceParameterBufferMPEG2 *)encode_state->seq_param_ext->buffer;
    VAEncSliceParameterBufferMPEG2 *slice_param = NULL;
    unsigned char tail_delimiter[] = {MPEG2_DELIMITER0, MPEG2_DELIMITER1, MPEG2_DELIMITER2, MPEG2_DELIMITER3, MPEG2_DELIMITER4, 0, 0, 0};
    unsigned char section_delimiter[] = {0x0, 0x0, 0x0, 0x0};
    int width_in_mbs = ALIGN(seq_param->picture_width, 16) / 16;
    int height_in_mbs = ALIGN(seq_param->picture_height, 16) / 16;
    int i, j;
    int h_start_pos, v_start_pos, h_next_start_pos, v_next_start_pos;
    unsigned int *msg = NULL;
    unsigned char *msg_ptr = NULL;

    /* MB coordinates of the group's first slice.
     * NOTE(review): the start position and the assert below are derived
     * from the first slice element only; assumes a slice never crosses a
     * macroblock row — confirm for multi-element groups. */
    slice_param = (VAEncSliceParameterBufferMPEG2 *)encode_state->slice_params_ext[slice_index]->buffer;
    h_start_pos = slice_param->macroblock_address % width_in_mbs;
    v_start_pos = slice_param->macroblock_address / width_in_mbs;
    assert(h_start_pos + slice_param->num_macroblocks <= width_in_mbs);

    /* Map the VME output so per-MB decision data can be read below */
    dri_bo_map(vme_context->vme_output.bo , 0);
    msg_ptr = (unsigned char *)vme_context->vme_output.bo->virtual;

    /* Start position of the next group; for the last group point one
     * full row past the bottom of the picture. */
    if (next_slice_group_param) {
        h_next_start_pos = next_slice_group_param->macroblock_address % width_in_mbs;
        v_next_start_pos = next_slice_group_param->macroblock_address / width_in_mbs;
    } else {
        h_next_start_pos = 0;
        v_next_start_pos = height_in_mbs;
    }

    gen8_mfc_mpeg2_slicegroup_state(ctx,
                                    encoder_context,
                                    h_start_pos,
                                    v_start_pos,
                                    h_next_start_pos,
                                    v_next_start_pos,
                                    slice_index == 0,
                                    next_slice_group_param == NULL,
                                    slice_param->is_intra_slice,
                                    slice_param->quantiser_scale_code,
                                    slice_batch);

    /* Packed SPS/PPS headers go ahead of the first slice group only */
    if (slice_index == 0) 
        intel_mfc_mpeg2_pipeline_header_programing(ctx, encode_state, encoder_context, slice_batch);

    /* Insert '00' to make sure the header is valid */
    mfc_context->insert_object(ctx,
                               encoder_context,
                               (unsigned int*)section_delimiter,
                               1,
                               8,   /* 8bits in the last DWORD */
                               1,   /* 1 byte */
                               1,
                               0,
                               0,
                               slice_batch);

    for (i = 0; i < encode_state->slice_params_ext[slice_index]->num_elements; i++) {
        /* PAK for each macroblocks */
        for (j = 0; j < slice_param->num_macroblocks; j++) {
            int h_pos = (slice_param->macroblock_address + j) % width_in_mbs;
            int v_pos = (slice_param->macroblock_address + j) / width_in_mbs;
            int first_mb_in_slice = (j == 0);
            int last_mb_in_slice = (j == slice_param->num_macroblocks - 1);
            int first_mb_in_slice_group = (i == 0 && j == 0);
            int last_mb_in_slice_group = (i == encode_state->slice_params_ext[slice_index]->num_elements - 1 &&
                                          j == slice_param->num_macroblocks - 1);

            /* VME output record for this MB.
             * NOTE(review): the offsets are named AVC_* — the MPEG-2 VME
             * output apparently reuses the AVC message layout; verify
             * against the VME kernel. */
            msg = (unsigned int *)(msg_ptr + (slice_param->macroblock_address + j) * vme_context->vme_output.size_block);

            if (slice_param->is_intra_slice) {
                gen8_mfc_mpeg2_pak_object_intra(ctx,
                                                encoder_context,
                                                h_pos, v_pos,
                                                first_mb_in_slice,
                                                last_mb_in_slice,
                                                first_mb_in_slice_group,
                                                last_mb_in_slice_group,
                                                0x1a,
                                                slice_param->quantiser_scale_code,
                                                0x3f,
                                                0,
                                                0xff,
                                                slice_batch);
            } else {
                /* Pick intra coding for this MB when its intra RDO cost
                 * beats the inter cost */
                int inter_rdo, intra_rdo;
                inter_rdo = msg[AVC_INTER_RDO_OFFSET] & AVC_RDO_MASK;
                intra_rdo = msg[AVC_INTRA_RDO_OFFSET] & AVC_RDO_MASK;

                if (intra_rdo < inter_rdo) 
                    gen8_mfc_mpeg2_pak_object_intra(ctx,
                                                     encoder_context,
                                                     h_pos, v_pos,
                                                     first_mb_in_slice,
                                                     last_mb_in_slice,
                                                     first_mb_in_slice_group,
                                                     last_mb_in_slice_group,
                                                     0x1a,
                                                     slice_param->quantiser_scale_code,
                                                     0x3f,
                                                     0,
                                                     0xff,
                                                     slice_batch);
                else
                    gen8_mfc_mpeg2_pak_object_inter(ctx,
                                                encode_state,
                                                encoder_context,
                                                msg,
                                                width_in_mbs, height_in_mbs,
                                                h_pos, v_pos,
                                                first_mb_in_slice,
                                                last_mb_in_slice,
                                                first_mb_in_slice_group,
                                                last_mb_in_slice_group,
                                                slice_param->quantiser_scale_code,
                                                0,
                                                0xff,
                                                slice_batch);
            }
        }

        /* Advance to the next slice element inside this group */
        slice_param++;
    }

    dri_bo_unmap(vme_context->vme_output.bo);

    /* tail data */
    if (next_slice_group_param == NULL) { /* end of a picture */
        mfc_context->insert_object(ctx,
                                   encoder_context,
                                   (unsigned int *)tail_delimiter,
                                   2,
                                   8,   /* 8bits in the last DWORD */
                                   5,   /* 5 bytes */
                                   1,
                                   1,
                                   0,
                                   slice_batch);
    } else {        /* end of a slice group */
        mfc_context->insert_object(ctx,
                                   encoder_context,
                                   (unsigned int *)section_delimiter,
                                   1,
                                   8,   /* 8bits in the last DWORD */
                                   1,   /* 1 byte */
                                   1,
                                   1,
                                   0,
                                   slice_batch);
    }
}
2317
2318 /* 
2319  * A batch buffer for all slices, including slice state, 
2320  * slice insert object and slice pak object commands
2321  *
2322  */
2323 static dri_bo *
2324 gen8_mfc_mpeg2_software_slice_batchbuffer(VADriverContextP ctx,
2325                                           struct encode_state *encode_state,
2326                                           struct intel_encoder_context *encoder_context)
2327 {
2328     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
2329     struct intel_batchbuffer *batch;
2330     VAEncSliceParameterBufferMPEG2 *next_slice_group_param = NULL;
2331     dri_bo *batch_bo;
2332     int i;
2333
2334     batch = mfc_context->aux_batchbuffer;
2335     batch_bo = batch->buffer;
2336
2337     for (i = 0; i < encode_state->num_slice_params_ext; i++) {
2338         if (i == encode_state->num_slice_params_ext - 1)
2339             next_slice_group_param = NULL;
2340         else
2341             next_slice_group_param = (VAEncSliceParameterBufferMPEG2 *)encode_state->slice_params_ext[i + 1]->buffer;
2342
2343         gen8_mfc_mpeg2_pipeline_slice_group(ctx, encode_state, encoder_context, i, next_slice_group_param, batch);
2344     }
2345
2346     intel_batchbuffer_align(batch, 8);
2347     
2348     BEGIN_BCS_BATCH(batch, 2);
2349     OUT_BCS_BATCH(batch, 0);
2350     OUT_BCS_BATCH(batch, MI_BATCH_BUFFER_END);
2351     ADVANCE_BCS_BATCH(batch);
2352
2353     dri_bo_reference(batch_bo);
2354     intel_batchbuffer_free(batch);
2355     mfc_context->aux_batchbuffer = NULL;
2356
2357     return batch_bo;
2358 }
2359
/*
 * Emit the picture-level MFX commands for an MPEG-2 encode.  The
 * statement order is the required MFX programming sequence: pipe mode
 * select first, then surface state and buffer base addresses, then
 * picture state and the quantizer-matrix commands — do not reorder.
 */
static void
gen8_mfc_mpeg2_pipeline_picture_programing(VADriverContextP ctx,
                                           struct encode_state *encode_state,
                                           struct intel_encoder_context *encoder_context)
{
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;

    mfc_context->pipe_mode_select(ctx, MFX_FORMAT_MPEG2, encoder_context);
    mfc_context->set_surface_state(ctx, encoder_context);
    mfc_context->ind_obj_base_addr_state(ctx, encoder_context);
    gen8_mfc_pipe_buf_addr_state(ctx, encoder_context);
    gen8_mfc_bsp_buf_base_addr_state(ctx, encoder_context);
    gen8_mfc_mpeg2_pic_state(ctx, encoder_context, encode_state);
    gen8_mfc_mpeg2_qm_state(ctx, encoder_context);
    gen8_mfc_mpeg2_fqm_state(ctx, encoder_context);
}
2376
/*
 * Top-level BCS programming for one MPEG-2 picture: build the per-slice
 * second-level batch first, then emit picture-level state on the primary
 * batch and chain into the slice batch.
 */
static void
gen8_mfc_mpeg2_pipeline_programing(VADriverContextP ctx,
                                   struct encode_state *encode_state,
                                   struct intel_encoder_context *encoder_context)
{
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    dri_bo *slice_batch_bo;

    /* Second-level batch with all slice commands; we own one reference */
    slice_batch_bo = gen8_mfc_mpeg2_software_slice_batchbuffer(ctx, encode_state, encoder_context);

    // begin programing
    intel_batchbuffer_start_atomic_bcs(batch, 0x4000); 
    intel_batchbuffer_emit_mi_flush(batch);
    
    // picture level programing
    gen8_mfc_mpeg2_pipeline_picture_programing(ctx, encode_state, encoder_context);

    /* Chain to the slice batch.
     * NOTE(review): bit 8 appears to select a second-level batch buffer
     * and bit 0 the PPGTT address space — confirm against the Gen8 PRM
     * description of MI_BATCH_BUFFER_START. */
    BEGIN_BCS_BATCH(batch, 4);
    OUT_BCS_BATCH(batch, MI_BATCH_BUFFER_START | (1 << 8) | (1 << 0));
    OUT_BCS_RELOC(batch,
                  slice_batch_bo,
                  I915_GEM_DOMAIN_COMMAND, 0, 
                  0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    ADVANCE_BCS_BATCH(batch);

    // end programing
    intel_batchbuffer_end_atomic(batch);

    /* Drop the reference taken by the slice-batch builder */
    dri_bo_unreference(slice_batch_bo);
}
2409
2410 static VAStatus
2411 intel_mfc_mpeg2_prepare(VADriverContextP ctx, 
2412                         struct encode_state *encode_state,
2413                         struct intel_encoder_context *encoder_context)
2414 {
2415     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
2416     struct object_surface *obj_surface; 
2417     struct object_buffer *obj_buffer;
2418     struct i965_coded_buffer_segment *coded_buffer_segment;
2419     VAStatus vaStatus = VA_STATUS_SUCCESS;
2420     dri_bo *bo;
2421     int i;
2422
2423     /* reconstructed surface */
2424     obj_surface = encode_state->reconstructed_object;
2425     i965_check_alloc_surface_bo(ctx, obj_surface, 1, VA_FOURCC_NV12, SUBSAMPLE_YUV420);
2426     mfc_context->pre_deblocking_output.bo = obj_surface->bo;
2427     dri_bo_reference(mfc_context->pre_deblocking_output.bo);
2428     mfc_context->surface_state.width = obj_surface->orig_width;
2429     mfc_context->surface_state.height = obj_surface->orig_height;
2430     mfc_context->surface_state.w_pitch = obj_surface->width;
2431     mfc_context->surface_state.h_pitch = obj_surface->height;
2432
2433     /* forward reference */
2434     obj_surface = encode_state->reference_objects[0];
2435
2436     if (obj_surface && obj_surface->bo) {
2437         mfc_context->reference_surfaces[0].bo = obj_surface->bo;
2438         dri_bo_reference(mfc_context->reference_surfaces[0].bo);
2439     } else
2440         mfc_context->reference_surfaces[0].bo = NULL;
2441
2442     /* backward reference */
2443     obj_surface = encode_state->reference_objects[1];
2444
2445     if (obj_surface && obj_surface->bo) {
2446         mfc_context->reference_surfaces[1].bo = obj_surface->bo;
2447         dri_bo_reference(mfc_context->reference_surfaces[1].bo);
2448     } else {
2449         mfc_context->reference_surfaces[1].bo = mfc_context->reference_surfaces[0].bo;
2450
2451         if (mfc_context->reference_surfaces[1].bo)
2452             dri_bo_reference(mfc_context->reference_surfaces[1].bo);
2453     }
2454
2455     for (i = 2; i < ARRAY_ELEMS(mfc_context->reference_surfaces); i++) {
2456         mfc_context->reference_surfaces[i].bo = mfc_context->reference_surfaces[i & 1].bo;
2457
2458         if (mfc_context->reference_surfaces[i].bo)
2459             dri_bo_reference(mfc_context->reference_surfaces[i].bo);
2460     }
2461     
2462     /* input YUV surface */
2463     obj_surface = encode_state->input_yuv_object;
2464     mfc_context->uncompressed_picture_source.bo = obj_surface->bo;
2465     dri_bo_reference(mfc_context->uncompressed_picture_source.bo);
2466
2467     /* coded buffer */
2468     obj_buffer = encode_state->coded_buf_object;
2469     bo = obj_buffer->buffer_store->bo;
2470     mfc_context->mfc_indirect_pak_bse_object.bo = bo;
2471     mfc_context->mfc_indirect_pak_bse_object.offset = I965_CODEDBUFFER_HEADER_SIZE;
2472     mfc_context->mfc_indirect_pak_bse_object.end_offset = ALIGN(obj_buffer->size_element - 0x1000, 0x1000);
2473     dri_bo_reference(mfc_context->mfc_indirect_pak_bse_object.bo);
2474
2475     /* set the internal flag to 0 to indicate the coded size is unknown */
2476     dri_bo_map(bo, 1);
2477     coded_buffer_segment = (struct i965_coded_buffer_segment *)bo->virtual;
2478     coded_buffer_segment->mapped = 0;
2479     coded_buffer_segment->codec = encoder_context->codec;
2480     dri_bo_unmap(bo);
2481
2482     return vaStatus;
2483 }
2484
/*
 * Entry point for encoding one MPEG-2 picture on the gen8 MFC.  The call
 * sequence is fixed: HW context init, buffer preparation, BCS pipeline
 * programming, then submission.
 *
 * Always returns VA_STATUS_SUCCESS; the preparation step's status is not
 * propagated here.
 */
static VAStatus
gen8_mfc_mpeg2_encode_picture(VADriverContextP ctx, 
                              struct encode_state *encode_state,
                              struct intel_encoder_context *encoder_context)
{
    gen8_mfc_init(ctx, encode_state, encoder_context);
    intel_mfc_mpeg2_prepare(ctx, encode_state, encoder_context);
    /*Programing bcs pipeline*/
    gen8_mfc_mpeg2_pipeline_programing(ctx, encode_state, encoder_context);
    gen8_mfc_run(ctx, encode_state, encoder_context);

    return VA_STATUS_SUCCESS;
}
2498
2499 /* JPEG encode methods */
2500
2501 static VAStatus
2502 intel_mfc_jpeg_prepare(VADriverContextP ctx, 
2503                         struct encode_state *encode_state,
2504                         struct intel_encoder_context *encoder_context)
2505 {
2506     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
2507     struct object_surface *obj_surface; 
2508     struct object_buffer *obj_buffer;
2509     struct i965_coded_buffer_segment *coded_buffer_segment;
2510     VAStatus vaStatus = VA_STATUS_SUCCESS;
2511     dri_bo *bo;
2512    
2513     /* input YUV surface */
2514     obj_surface = encode_state->input_yuv_object;
2515     mfc_context->uncompressed_picture_source.bo = obj_surface->bo;
2516     dri_bo_reference(mfc_context->uncompressed_picture_source.bo);
2517
2518     /* coded buffer */
2519     obj_buffer = encode_state->coded_buf_object;
2520     bo = obj_buffer->buffer_store->bo;
2521     mfc_context->mfc_indirect_pak_bse_object.bo = bo;
2522     mfc_context->mfc_indirect_pak_bse_object.offset = I965_CODEDBUFFER_HEADER_SIZE;
2523     mfc_context->mfc_indirect_pak_bse_object.end_offset = ALIGN(obj_buffer->size_element - 0x1000, 0x1000);
2524     dri_bo_reference(mfc_context->mfc_indirect_pak_bse_object.bo);
2525
2526     /* set the internal flag to 0 to indicate the coded size is unknown */
2527     dri_bo_map(bo, 1);
2528     coded_buffer_segment = (struct i965_coded_buffer_segment *)bo->virtual;
2529     coded_buffer_segment->mapped = 0;
2530     coded_buffer_segment->codec = encoder_context->codec;
2531     dri_bo_unmap(bo);
2532
2533     return vaStatus;
2534 }
2535
2536
2537 static void 
2538 gen8_mfc_jpeg_set_surface_state(VADriverContextP ctx,
2539                         struct intel_encoder_context *encoder_context,
2540                         struct encode_state *encode_state)
2541 {
2542     struct intel_batchbuffer *batch = encoder_context->base.batch;
2543     struct object_surface *obj_surface = encode_state->input_yuv_object;
2544     unsigned int input_fourcc;
2545     unsigned int y_cb_offset;
2546     unsigned int y_cr_offset;
2547     unsigned int surface_format;
2548
2549     assert(obj_surface);
2550
2551     y_cb_offset = obj_surface->y_cb_offset;
2552     y_cr_offset = obj_surface->y_cr_offset;
2553     input_fourcc = obj_surface->fourcc;
2554
2555     surface_format = (obj_surface->fourcc == VA_FOURCC_Y800) ?
2556         MFX_SURFACE_MONOCHROME : MFX_SURFACE_PLANAR_420_8;
2557         
2558         
2559      switch (input_fourcc) {
2560         case VA_FOURCC_Y800: {
2561             surface_format = MFX_SURFACE_MONOCHROME;
2562             break;
2563         }
2564         case VA_FOURCC_NV12: { 
2565             surface_format = MFX_SURFACE_PLANAR_420_8;
2566             break;
2567         }      
2568         case VA_FOURCC_UYVY: { 
2569             surface_format = MFX_SURFACE_YCRCB_SWAPY;
2570             break;
2571         }
2572         case VA_FOURCC_YUY2: { 
2573             surface_format = MFX_SURFACE_YCRCB_NORMAL;
2574             break;
2575         }
2576         case VA_FOURCC_RGBA:
2577         case VA_FOURCC_444P: {
2578             surface_format = MFX_SURFACE_R8G8B8A8_UNORM;
2579             break;
2580         }
2581     }
2582
2583     BEGIN_BCS_BATCH(batch, 6);
2584
2585     OUT_BCS_BATCH(batch, MFX_SURFACE_STATE | (6 - 2));
2586     OUT_BCS_BATCH(batch, 0);
2587     OUT_BCS_BATCH(batch,
2588                   ((obj_surface->orig_height - 1) << 18) |
2589                   ((obj_surface->orig_width - 1) << 4));
2590     OUT_BCS_BATCH(batch,
2591                   (surface_format << 28) | /* Surface Format */
2592                   (0 << 27) | /* must be 1 for interleave U/V, hardware requirement for AVC/VC1/MPEG and 0 for JPEG */
2593                   (0 << 22) | /* surface object control state, FIXME??? */
2594                   ((obj_surface->width - 1) << 3) | /* pitch */
2595                   (0 << 2)  | /* must be 0 for interleave U/V */
2596                   (1 << 1)  | /* must be tiled */
2597                   (I965_TILEWALK_YMAJOR << 0));  /* tile walk, TILEWALK_YMAJOR */
2598     OUT_BCS_BATCH(batch,
2599                   (0 << 16) | /* X offset for U(Cb), must be 0 */
2600                   (y_cb_offset << 0)); /* Y offset for U(Cb) */
2601     OUT_BCS_BATCH(batch,
2602                   (0 << 16) | /* X offset for V(Cr), must be 0 */
2603                   (y_cr_offset << 0)); /* Y offset for V(Cr), must be 0 for video codec, non-zoeo for JPEG */
2604                  
2605
2606     ADVANCE_BCS_BATCH(batch);
2607 }
2608
/*
 * Emit MFX_JPEG_PIC_STATE.  Derives the MCU geometry — 8x8 blocks per
 * frame and the pixel counts of the partial last MCU at the right/bottom
 * edge — from the input surface fourcc and the picture dimensions the
 * application supplied.
 */
static void
gen8_mfc_jpeg_pic_state(VADriverContextP ctx,
                        struct intel_encoder_context *encoder_context,
                        struct encode_state *encode_state)
{
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    struct object_surface *obj_surface = encode_state->input_yuv_object;
    VAEncPictureParameterBufferJPEG *pic_param;
    unsigned int  surface_format;
    unsigned int  frame_width_in_blks;
    unsigned int  frame_height_in_blks;
    unsigned int  pixels_in_horizontal_lastMCU;
    unsigned int  pixels_in_vertical_lastMCU;
    unsigned int  input_surface_format;
    unsigned int  output_mcu_format;
    unsigned int  picture_width;
    unsigned int  picture_height;  

    assert(encode_state->pic_param_ext && encode_state->pic_param_ext->buffer);
    assert(obj_surface);
    pic_param = (VAEncPictureParameterBufferJPEG *)encode_state->pic_param_ext->buffer;
    surface_format = obj_surface->fourcc;
    picture_width = pic_param->picture_width;
    picture_height = pic_param->picture_height;
    
    /* Map the input fourcc to the HW input surface format and the MCU
     * structure of the output stream; unknown fourccs fall back to
     * NV12 / YUV420. */
    switch (surface_format) {
        case VA_FOURCC_Y800: {
            input_surface_format = JPEG_ENC_SURFACE_Y8; 
            output_mcu_format = JPEG_ENC_MCU_YUV400;
            break;
        }
        case VA_FOURCC_NV12: { 
            input_surface_format = JPEG_ENC_SURFACE_NV12; 
            output_mcu_format = JPEG_ENC_MCU_YUV420; 
            break;
        }      
        case VA_FOURCC_UYVY: { 
            input_surface_format = JPEG_ENC_SURFACE_UYVY; 
            output_mcu_format = JPEG_ENC_MCU_YUV422H_2Y; 
            break;
        }
        case VA_FOURCC_YUY2: { 
            input_surface_format = JPEG_ENC_SURFACE_YUY2; 
            output_mcu_format = JPEG_ENC_MCU_YUV422H_2Y; 
            break;
        }

        case VA_FOURCC_RGBA:
        case VA_FOURCC_444P: { 
            input_surface_format = JPEG_ENC_SURFACE_RGB; 
            output_mcu_format = JPEG_ENC_MCU_RGB; 
            break;
        }
        default : {
            input_surface_format = JPEG_ENC_SURFACE_NV12; 
            output_mcu_format = JPEG_ENC_MCU_YUV420;
            break;
        }
    }

    /* Compute blocks per frame and last-MCU pixel counts per MCU type.
     * Every output_mcu_format value produced above is covered here, so
     * the four geometry variables are always initialized.  For 4:2:0 and
     * 4:2:2, odd dimensions are rounded up to even before taking the
     * modulo. */
    switch (output_mcu_format) {
        
        case JPEG_ENC_MCU_YUV400:
        case JPEG_ENC_MCU_RGB: {
            pixels_in_horizontal_lastMCU = (picture_width % 8);
            pixels_in_vertical_lastMCU = (picture_height % 8); 

            //H1=1,V1=1 for YUV400 and YUV444. So, compute these values accordingly
            frame_width_in_blks = ((picture_width + 7) / 8); 
            frame_height_in_blks = ((picture_height + 7) / 8);
            break;
        }
        
        case JPEG_ENC_MCU_YUV420: {        
            if((picture_width % 2) == 0) 
                pixels_in_horizontal_lastMCU = picture_width % 16; 
            else 
                pixels_in_horizontal_lastMCU   = ((picture_width % 16) + 1) % 16; 
            
            if((picture_height % 2) == 0) 
                pixels_in_vertical_lastMCU     = picture_height % 16; 
            else 
                pixels_in_vertical_lastMCU   = ((picture_height % 16) + 1) % 16; 

            //H1=2,V1=2 for YUV420. So, compute these values accordingly
            frame_width_in_blks = ((picture_width + 15) / 16) * 2;
            frame_height_in_blks = ((picture_height + 15) / 16) * 2;
            break;
        }
        
        case JPEG_ENC_MCU_YUV422H_2Y: {
            if(picture_width % 2 == 0) 
                pixels_in_horizontal_lastMCU = picture_width % 16; 
            else 
                pixels_in_horizontal_lastMCU = ((picture_width % 16) + 1) % 16; 
            
            pixels_in_vertical_lastMCU = picture_height % 8;
            
            //H1=2,V1=1 for YUV422H_2Y. So, compute these values accordingly
            frame_width_in_blks = ((picture_width + 15) / 16) * 2;
            frame_height_in_blks = ((picture_height + 7) / 8);
            break;            
        }       
    } //end of switch
   
    BEGIN_BCS_BATCH(batch, 3);
    /* DWORD 0 */
    OUT_BCS_BATCH(batch, MFX_JPEG_PIC_STATE | (3 - 2)); 
    /* DWORD 1 */
    OUT_BCS_BATCH(batch,
                  ( pixels_in_horizontal_lastMCU << 26) |    /* Pixels In Horizontal Last MCU */
                  ( pixels_in_vertical_lastMCU << 21)   |    /* Pixels In Vertical Last MCU */
                  ( input_surface_format << 8)          |    /* Input Surface format */
                  ( output_mcu_format << 0));                /* Output MCU Structure */
    /* DWORD 2 */
    OUT_BCS_BATCH(batch,
                  ((frame_height_in_blks - 1) << 16)    |   /* Frame Height In Blks Minus 1 */
                  (JPEG_ENC_ROUND_QUANT_DEFAULT  << 13) |   /* Rounding Quant set to default value 0 */
                  ((frame_width_in_blks - 1) << 0));        /* Frame Width In Blks Minus 1 */
    ADVANCE_BCS_BATCH(batch);
}
2731
/*
 * Convert a 64-entry quantization matrix (caller guarantees values in
 * [1, 255]) into the 32-dword reciprocal form the HW expects: each entry
 * becomes floor(65535 / Q[i]) and consecutive pairs are packed into one
 * dword with the odd-indexed entry in the high 16 bits.
 *
 * The reciprocals must be unsigned 16-bit: with a signed 'short',
 * Q[i] == 1 yields 65535 which wraps to -1, and the sign extension
 * during integer promotion in the '<< 16 |' packing clobbers the other
 * half-word of the dword.
 */
static void 
get_reciprocal_dword_qm(unsigned char *raster_qm, uint32_t *dword_qm)
{
    int i;
    unsigned short reciprocal_qm[64];

    for (i = 0; i < 64; i++)
        reciprocal_qm[i] = 65535 / raster_qm[i];

    /* Pack two 16-bit reciprocals per dword */
    for (i = 0; i < 32; i++)
        dword_qm[i] = ((uint32_t)reciprocal_qm[2 * i + 1] << 16) |
                      reciprocal_qm[2 * i];
}
2749
2750
2751 static void 
2752 gen8_mfc_jpeg_fqm_state(VADriverContextP ctx,
2753                         struct intel_encoder_context *encoder_context,
2754                         struct encode_state *encode_state)
2755 {
2756     unsigned int quality = 0;
2757     uint32_t temp, i = 0, j = 0, dword_qm[32];
2758     VAEncPictureParameterBufferJPEG *pic_param;
2759     VAQMatrixBufferJPEG *qmatrix;
2760     unsigned char raster_qm[64], column_raster_qm[64];
2761     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
2762     
2763     assert(encode_state->pic_param_ext && encode_state->pic_param_ext->buffer);
2764     pic_param = (VAEncPictureParameterBufferJPEG *)encode_state->pic_param_ext->buffer;
2765     quality = pic_param->quality;
2766     
2767     //If the app sends the qmatrix, use it, buffer it for using it with the next frames 
2768     //The app can send qmatrix for the first frame and not send for the subsequent frames
2769     if(encode_state->q_matrix && encode_state->q_matrix->buffer) {
2770         qmatrix = (VAQMatrixBufferJPEG *)encode_state->q_matrix->buffer;
2771
2772         mfc_context->buffered_qmatrix.load_lum_quantiser_matrix = 1;
2773         memcpy(mfc_context->buffered_qmatrix.lum_quantiser_matrix, qmatrix->lum_quantiser_matrix, 64 * (sizeof(unsigned char)));
2774
2775         if(pic_param->num_components > 1) {
2776             mfc_context->buffered_qmatrix.load_chroma_quantiser_matrix = 1;
2777             memcpy(mfc_context->buffered_qmatrix.chroma_quantiser_matrix, qmatrix->chroma_quantiser_matrix, 64 * (sizeof(unsigned char)));
2778         } else {
2779             mfc_context->buffered_qmatrix.load_chroma_quantiser_matrix = 0;
2780         }
2781
2782     } else {
2783         //If the app doesnt send the qmatrix, use the buffered/default qmatrix
2784         qmatrix = &mfc_context->buffered_qmatrix;
2785         qmatrix->load_lum_quantiser_matrix = 1;
2786         qmatrix->load_chroma_quantiser_matrix = (pic_param->num_components > 1) ? 1 : 0;
2787     }   
2788
2789
2790     //As per the design, normalization of the quality factor and scaling of the Quantization tables
2791     //based on the quality factor needs to be done in the driver before sending the values to the HW.
2792     //But note, the driver expects the scaled quantization tables (as per below logic) to be sent as
2793     //packed header information. The packed header is written as the header of the jpeg file. This
2794     //header information is used to decode the jpeg file. So, it is the app's responsibility to send
2795     //the correct header information (See build_packed_jpeg_header_buffer() in jpegenc.c in LibVa on
2796     //how to do this). QTables can be different for different applications. If no tables are provided,
2797     //the default tables in the driver are used.
2798
2799     //Normalization of the quality factor
2800     if (quality > 100) quality=100;
2801     if (quality == 0)  quality=1;
2802     quality = (quality < 50) ? (5000/quality) : (200 - (quality*2)); 
2803     
2804     //Step 1. Apply Quality factor and clip to range [1, 255] for luma and chroma Quantization matrices
2805     //Step 2. HW expects the 1/Q[i] values in the qm sent, so get reciprocals
2806     //Step 3. HW also expects 32 dwords, hence combine 2 (1/Q) values into 1 dword
2807     //Step 4. Send the Quantization matrix to the HW, use gen8_mfc_fqm_state
2808     
2809     //For luma (Y or R)
2810     if(qmatrix->load_lum_quantiser_matrix) {
2811         //apply quality to lum_quantiser_matrix
2812         for(i=0; i < 64; i++) {
2813             temp = (qmatrix->lum_quantiser_matrix[i] * quality)/100;
2814             //clamp to range [1,255]
2815             temp = (temp > 255) ? 255 : temp;
2816             temp = (temp < 1) ? 1 : temp;
2817             qmatrix->lum_quantiser_matrix[i] = (unsigned char)temp;
2818         }       
2819         
2820         //For VAAPI, the VAQMatrixBuffer needs to be in zigzag order. 
2821         //The App should send it in zigzag. Now, the driver has to extract the raster from it. 
2822         for (j = 0; j < 64; j++)
2823             raster_qm[zigzag_direct[j]] = qmatrix->lum_quantiser_matrix[j];
2824
2825         //Convert the raster order(row-ordered) to the column-raster (column by column).
2826         //To be consistent with the other encoders, send it in column order.
2827         //Need to double check if our HW expects col or row raster.
2828         for (j = 0; j < 64; j++) {
2829             int row = j / 8, col = j % 8;
2830             column_raster_qm[col * 8 + row] = raster_qm[j];
2831         }
2832         
2833         //Convert to raster QM to reciprocal. HW expects values in reciprocal.
2834         get_reciprocal_dword_qm(column_raster_qm, dword_qm);
2835         
2836         //send the luma qm to the command buffer
2837         gen8_mfc_fqm_state(ctx, MFX_QM_JPEG_LUMA_Y_QUANTIZER_MATRIX, dword_qm, 32, encoder_context);
2838     } 
2839     
2840     //For Chroma, if chroma exists (Cb, Cr or G, B)
2841     if(qmatrix->load_chroma_quantiser_matrix) {
2842         //apply quality to chroma_quantiser_matrix
2843         for(i=0; i < 64; i++) {
2844             temp = (qmatrix->chroma_quantiser_matrix[i] * quality)/100;
2845             //clamp to range [1,255]
2846             temp = (temp > 255) ? 255 : temp;
2847             temp = (temp < 1) ? 1 : temp;
2848             qmatrix->chroma_quantiser_matrix[i] = (unsigned char)temp;
2849         }
2850         
2851         //For VAAPI, the VAQMatrixBuffer needs to be in zigzag order. 
2852         //The App should send it in zigzag. Now, the driver has to extract the raster from it. 
2853         for (j = 0; j < 64; j++)
2854             raster_qm[zigzag_direct[j]] = qmatrix->chroma_quantiser_matrix[j];
2855         
2856         //Convert the raster order(row-ordered) to the column-raster (column by column).
2857         //To be consistent with the other encoders, send it in column order.
2858         //Need to double check if our HW expects col or row raster.
2859         for (j = 0; j < 64; j++) {
2860             int row = j / 8, col = j % 8;
2861             column_raster_qm[col * 8 + row] = raster_qm[j];
2862         }
2863
2864
2865         //Convert to raster QM to reciprocal. HW expects values in reciprocal.
2866         get_reciprocal_dword_qm(column_raster_qm, dword_qm);
2867
2868         //send the same chroma qm to the command buffer (for both U,V or G,B)
2869         gen8_mfc_fqm_state(ctx, MFX_QM_JPEG_CHROMA_CB_QUANTIZER_MATRIX, dword_qm, 32, encoder_context);
2870         gen8_mfc_fqm_state(ctx, MFX_QM_JPEG_CHROMA_CR_QUANTIZER_MATRIX, dword_qm, 32, encoder_context);        
2871     }
2872 }
2873
2874
//Translation of Table K.5 into code: maps a JPEG huffval (packed run/size
//byte) from the Huffman table buffer onto the linear index used by the
//coefficient and size tables. Values >= 0xF0 (run of 15) shift by one extra
//slot so the ZRL/high-run codes land past the regular run*10+size grid.
uint8_t map_huffval_to_index(uint8_t huff_val)
{
    uint8_t run = (huff_val >> 4) & 0x0F;    //upper nibble: zero-run length
    uint8_t size = huff_val & 0x0F;          //lower nibble: coefficient size
    uint8_t offset = (huff_val < 0xF0) ? 0 : 1;

    return (uint8_t)((run * 0xA) + size + offset);
}
2889
2890
//Implementation of Flow chart Annex C - Figure C.1: expand the BITS list
//(bits[L-1] = number of codes of length L, L in 1..16) into a flat table of
//code sizes. The table is zero-terminated and *lastK receives the number of
//entries written (the count of codes in the table).
static void
generate_huffman_codesizes_table(uint8_t *bits, uint8_t *huff_size_table, uint8_t *lastK)
{
    uint8_t code_length, count, k = 0;

    //For each code length, emit that many entries of that size.
    for (code_length = 1; code_length <= 16; code_length++) {
        for (count = 1; count <= bits[code_length - 1]; count++)
            huff_size_table[k++] = code_length;
    }

    huff_size_table[k] = 0;  //terminator expected by Figure C.2
    *lastK = k;
}
2910
//Implementation of Flow chart Annex C - Figure C.2: assign canonical Huffman
//code words to the zero-terminated size table produced by Figure C.1.
//Codes of equal size are consecutive integers; the code is left-shifted each
//time the size grows by one bit.
static void
generate_huffman_codes_table(uint8_t *huff_size_table, uint16_t *huff_code_table)
{
    uint8_t k = 0;
    uint16_t code = 0;
    uint8_t current_size = huff_size_table[0];

    while (huff_size_table[k] != 0) {

        while (huff_size_table[k] == current_size) {

            //A Huffman code can never be 0xFFFF; substitute 0 in that case.
            if (code == 0xFFFF)
                code = 0x0000;

            huff_code_table[k] = code;
            code++;
            k++;
        }

        code <<= 1;       //move to the next (longer) code length
        current_size++;
    }
}
2938
//Implementation of Flow chart Annex C - Figure C.3: reorder the sequential
//size/code tables so each entry sits at the slot given by its huffval
//(via map_huffval_to_index). The reordered tables are written back in place.
//type selects the table width: 0 = DC (12 entries), otherwise AC (162).
static void
generate_ordered_codes_table(uint8_t *huff_vals, uint8_t *huff_size_table, uint16_t *huff_code_table, uint8_t type, uint8_t lastK)
{
    uint8_t k = 0, slot = 0;
    const uint8_t huff_val_size = (type == 0) ? 12 : 162;
    uint8_t ordered_sizes[huff_val_size];
    uint16_t ordered_codes[huff_val_size];

    memset(ordered_sizes, 0, sizeof(ordered_sizes));
    memset(ordered_codes, 0, sizeof(ordered_codes));

    //Scatter each sequential code/size pair into its Table K.5 slot.
    do {
        slot = map_huffval_to_index(huff_vals[k]);
        ordered_codes[slot] = huff_code_table[k];
        ordered_sizes[slot] = huff_size_table[k];
        k++;
    } while (k < lastK);

    memcpy(huff_size_table, ordered_sizes, sizeof(uint8_t) * huff_val_size);
    memcpy(huff_code_table, ordered_codes, sizeof(uint16_t) * huff_val_size);
}
2962
2963
//This method converts the huffman table to code words which is needed by the HW
//Flowcharts from Jpeg Spec Annex C - Figure C.1, Figure C.2, Figure C.3 are used here
//
//huff_buffer: app-supplied VA huffman table buffer
//table:       output DWord array; caller provides 12 entries for DC (type 0)
//             or 162 entries for AC (type != 0)
//type:        0 = DC table, otherwise AC table
//index:       which huffman_table[] entry of huff_buffer to convert
static void
convert_hufftable_to_codes(VAHuffmanTableBufferJPEGBaseline *huff_buffer, uint32_t *table, uint8_t type, uint8_t index)
{
    uint8_t lastK = 0, i=0; 
    uint8_t huff_val_size = 0;
    uint8_t *huff_bits, *huff_vals;

    //12 possible DC values vs 162 possible AC run/size values (Table K.5)
    huff_val_size = (type == 0) ? 12 : 162; 
    uint8_t huff_size_table[huff_val_size+1]; //The +1 for adding 0 at the end of huff_val_size
    uint16_t huff_code_table[huff_val_size];

    memset(huff_size_table, 0, sizeof(huff_size_table));
    memset(huff_code_table, 0, sizeof(huff_code_table));

    //Select the BITS/HUFFVAL lists of either the DC or the AC table
    huff_bits = (type == 0) ? (huff_buffer->huffman_table[index].num_dc_codes) : (huff_buffer->huffman_table[index].num_ac_codes);
    huff_vals = (type == 0) ? (huff_buffer->huffman_table[index].dc_values) : (huff_buffer->huffman_table[index].ac_values);
    

    //Generation of table of Huffman code sizes (Annex C, Figure C.1)
    generate_huffman_codesizes_table(huff_bits, huff_size_table, &lastK);
       
    //Generation of table of Huffman codes (Annex C, Figure C.2)
    generate_huffman_codes_table(huff_size_table, huff_code_table);
       
    //Ordering procedure for encoding procedure code tables (Annex C, Figure C.3)
    generate_ordered_codes_table(huff_vals, huff_size_table, huff_code_table, type, lastK);

    //HW expects Byte0: Code length; Byte1,Byte2: Code Word, Byte3: Dummy
    //Since IA is little endian, use &, | and << accordingly to store the values in the DWord.
    for(i=0; i<huff_val_size; i++) {
        table[i] = 0;
        table[i] = ((huff_size_table[i] & 0xFF) | ((huff_code_table[i] & 0xFFFF) << 8));
    }

}
3001
3002 //send the huffman table using MFC_JPEG_HUFF_TABLE_STATE
3003 static void
3004 gen8_mfc_jpeg_huff_table_state(VADriverContextP ctx,
3005                                            struct encode_state *encode_state,
3006                                            struct intel_encoder_context *encoder_context,
3007                                            int num_tables)
3008 {
3009     VAHuffmanTableBufferJPEGBaseline *huff_buffer;
3010     struct intel_batchbuffer *batch = encoder_context->base.batch;
3011     uint8_t index;
3012     uint32_t dc_table[12], ac_table[162]; 
3013     
3014     assert(encode_state->huffman_table && encode_state->huffman_table->buffer);
3015     huff_buffer = (VAHuffmanTableBufferJPEGBaseline *)encode_state->huffman_table->buffer;
3016
3017     memset(dc_table, 0, 12);
3018     memset(ac_table, 0, 162);
3019
3020     for (index = 0; index < num_tables; index++) {
3021         int id = va_to_gen7_jpeg_hufftable[index];
3022  
3023         if (!huff_buffer->load_huffman_table[index])
3024             continue;
3025      
3026         //load DC table with 12 DWords
3027         convert_hufftable_to_codes(huff_buffer, dc_table, 0, index);  //0 for Dc
3028
3029         //load AC table with 162 DWords 
3030         convert_hufftable_to_codes(huff_buffer, ac_table, 1, index);  //1 for AC 
3031
3032         BEGIN_BCS_BATCH(batch, 176);
3033         OUT_BCS_BATCH(batch, MFC_JPEG_HUFF_TABLE_STATE | (176 - 2));
3034         OUT_BCS_BATCH(batch, id); //Huff table id
3035
3036         //DWord 2 - 13 has DC_TABLE
3037         intel_batchbuffer_data(batch, dc_table, 12*4);
3038
3039         //Dword 14 -175 has AC_TABLE
3040         intel_batchbuffer_data(batch, ac_table, 162*4);
3041         ADVANCE_BCS_BATCH(batch);
3042     }    
3043 }
3044
3045
3046 //This method is used to compute the MCU count used for setting MFC_JPEG_SCAN_OBJECT
3047 static void get_Y_sampling_factors(uint32_t surface_format, uint8_t *h_factor, uint8_t *v_factor)
3048
3049     switch (surface_format) {
3050         case VA_FOURCC_Y800: {
3051             (* h_factor) = 1; 
3052             (* v_factor) = 1;
3053             break;
3054         }
3055         case VA_FOURCC_NV12: { 
3056             (* h_factor) = 2;             
3057             (* v_factor) = 2;
3058             break;
3059         }      
3060         case VA_FOURCC_UYVY: { 
3061             (* h_factor) = 2; 
3062             (* v_factor) = 1;
3063             break;
3064         }
3065         case VA_FOURCC_YUY2: { 
3066             (* h_factor) = 2; 
3067             (* v_factor) = 1;
3068             break;
3069         }
3070         case VA_FOURCC_RGBA:
3071         case VA_FOURCC_444P: { 
3072             (* h_factor) = 1; 
3073             (* v_factor) = 1;
3074             break;
3075         }
3076         default : { //May be  have to insert error handling here. For now just use as below
3077             (* h_factor) = 1; 
3078             (* v_factor) = 1;
3079             break;
3080         }
3081     }
3082 }
3083
//set MFC_JPEG_SCAN_OBJECT
//
//Programs the single JPEG scan: computes the MCU count from the picture
//dimensions and the Y sampling factors, then packs the huffman table
//selectors, header flags and restart interval into the 3-DWord command.
static void
gen8_mfc_jpeg_scan_object(VADriverContextP ctx,
                                           struct encode_state *encode_state,
                                           struct intel_encoder_context *encoder_context)
{
    uint32_t mcu_count, surface_format, Mx, My;
    uint8_t i, horizontal_sampling_factor, vertical_sampling_factor, huff_ac_table=0, huff_dc_table=0;
    uint8_t is_last_scan = 1;    //Jpeg has only 1 scan per frame. When last scan, HW inserts EOI code.
    uint8_t head_present_flag=1; //Header has tables and app data 
    uint16_t num_components, restart_interval;   //Specifies number of MCUs in an ECS.
    VAEncSliceParameterBufferJPEG *slice_param;
    VAEncPictureParameterBufferJPEG *pic_param;
    
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    struct object_surface *obj_surface = encode_state->input_yuv_object;
    
    assert(encode_state->slice_params_ext[0] && encode_state->slice_params_ext[0]->buffer);
    assert(encode_state->pic_param_ext && encode_state->pic_param_ext->buffer);
    assert(obj_surface);
    pic_param = (VAEncPictureParameterBufferJPEG *)encode_state->pic_param_ext->buffer;
    slice_param = (VAEncSliceParameterBufferJPEG *)encode_state->slice_params_ext[0]->buffer;
    surface_format = obj_surface->fourcc;
    
    get_Y_sampling_factors(surface_format, &horizontal_sampling_factor, &vertical_sampling_factor);
    
    // Mx = #MCUs in a row, My = #MCUs in a column (each MCU covers
    // sampling_factor*8 pixels; round up partial MCUs at the edges)
    Mx = (pic_param->picture_width + (horizontal_sampling_factor*8 -1))/(horizontal_sampling_factor*8);
    My = (pic_param->picture_height + (vertical_sampling_factor*8 -1))/(vertical_sampling_factor*8);
    mcu_count = (Mx * My);
 
    num_components = pic_param->num_components;    
    restart_interval = slice_param->restart_interval;
    
    //Depending on number of components and values set for table selectors, 
    //only those bits are set in 24:22 for AC table, 20:18 for DC table
    //(one selector bit per component, shifted by the component index)
    for(i=0; i<num_components; i++) {
        huff_ac_table |= ((slice_param->components[i].ac_table_selector)<<i);
        huff_dc_table |= ((slice_param->components[i].dc_table_selector)<<i);
    }
    
    
    BEGIN_BCS_BATCH(batch, 3);
    /* DWORD 0 */
    OUT_BCS_BATCH(batch, MFC_JPEG_SCAN_OBJECT | (3 - 2)); 
    /* DWORD 1 */
    OUT_BCS_BATCH(batch, mcu_count << 0);       //MCU Count
    /* DWORD 2 */
    OUT_BCS_BATCH(batch,
                  (huff_ac_table << 22)     |   //Huffman AC Table
                  (huff_dc_table << 18)     |   //Huffman DC Table
                  (head_present_flag << 17) |   //Head present flag
                  (is_last_scan << 16)      |   //Is last scan
                  (restart_interval << 0));     //Restart Interval
    ADVANCE_BCS_BATCH(batch);
}
3140
3141 static void
3142 gen8_mfc_jpeg_pak_insert_object(struct intel_encoder_context *encoder_context, unsigned int *insert_data, 
3143                                 int length_in_dws, int data_bits_in_last_dw, int is_last_header, 
3144                                 int is_end_of_slice)
3145 {
3146     struct intel_batchbuffer *batch = encoder_context->base.batch;
3147     assert(batch);
3148     
3149     if (data_bits_in_last_dw == 0)
3150         data_bits_in_last_dw = 32;
3151
3152     BEGIN_BCS_BATCH(batch, length_in_dws + 2);
3153
3154     OUT_BCS_BATCH(batch, MFX_INSERT_OBJECT | (length_in_dws + 2 - 2));
3155     //DWord 1
3156     OUT_BCS_BATCH(batch,
3157                   (0 << 16) |                    //DataByteOffset 0 for JPEG Encoder
3158                   (0 << 15) |                    //HeaderLengthExcludeFrmSize 0 for JPEG Encoder
3159                   (data_bits_in_last_dw << 8) |  //DataBitsInLastDW
3160                   (0 << 4) |                     //SkipEmulByteCount 0 for JPEG Encoder
3161                   (0 << 3) |                     //EmulationFlag 0 for JPEG Encoder
3162                   ((!!is_last_header) << 2) |    //LastHeaderFlag
3163                   ((!!is_end_of_slice) << 1) |   //EndOfSliceFlag
3164                   (1 << 0));                     //BitstreamStartReset 1 for JPEG Encoder
3165     //Data Paylaod
3166     intel_batchbuffer_data(batch, insert_data, length_in_dws*4);
3167
3168     ADVANCE_BCS_BATCH(batch);
3169 }
3170
3171
3172 //send the jpeg headers to HW using MFX_PAK_INSERT_OBJECT
3173 static void
3174 gen8_mfc_jpeg_add_headers(VADriverContextP ctx,
3175                                            struct encode_state *encode_state,
3176                                            struct intel_encoder_context *encoder_context)
3177 {
3178     if (encode_state->packed_header_data_ext) {
3179         VAEncPackedHeaderParameterBuffer *param = NULL;
3180         unsigned int *header_data = (unsigned int *)(*encode_state->packed_header_data_ext)->buffer;
3181         unsigned int length_in_bits;
3182
3183         param = (VAEncPackedHeaderParameterBuffer *)(*encode_state->packed_header_params_ext)->buffer;
3184         length_in_bits = param->bit_length;
3185
3186         gen8_mfc_jpeg_pak_insert_object(encoder_context, 
3187                                         header_data, 
3188                                         ALIGN(length_in_bits, 32) >> 5,
3189                                         length_in_bits & 0x1f,
3190                                         1,
3191                                         1);
3192     }
3193 }
3194
3195 //Initialize the buffered_qmatrix with the default qmatrix in the driver.
3196 //If the app sends the qmatrix, this will be replaced with the one app sends.
3197 static void 
3198 jpeg_init_default_qmatrix(VADriverContextP ctx, struct intel_encoder_context *encoder_context)
3199 {
3200     int i=0;
3201     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
3202  
3203     //Load the the QM in zigzag order. If app sends QM, it is always in zigzag order.
3204     for(i=0; i<64; i++)
3205        mfc_context->buffered_qmatrix.lum_quantiser_matrix[i] = jpeg_luma_quant[zigzag_direct[i]];
3206
3207     for(i=0; i<64; i++)
3208         mfc_context->buffered_qmatrix.chroma_quantiser_matrix[i] = jpeg_chroma_quant[zigzag_direct[i]];
3209 }    
3210  
/* This is at the picture level */
/* Programs the full JPEG picture: pipe mode, surface, buffer addresses,
 * pic state, quantization matrices, huffman tables, scan object and the
 * app-supplied packed headers — in the order the HW requires. */
static void
gen8_mfc_jpeg_pipeline_picture_programing(VADriverContextP ctx,
                                           struct encode_state *encode_state,
                                           struct intel_encoder_context *encoder_context)
{
    int i, j, component, max_selector = 0;
    VAEncSliceParameterBufferJPEG *slice_param;
    
    gen8_mfc_pipe_mode_select(ctx, MFX_FORMAT_JPEG, encoder_context);
    gen8_mfc_jpeg_set_surface_state(ctx, encoder_context, encode_state);
    gen8_mfc_pipe_buf_addr_state(ctx, encoder_context);
    gen8_mfc_ind_obj_base_addr_state(ctx, encoder_context);
    gen8_mfc_bsp_buf_base_addr_state(ctx, encoder_context);
    gen8_mfc_jpeg_pic_state(ctx, encoder_context, encode_state);
    
    //do the slice level encoding here
    gen8_mfc_jpeg_fqm_state(ctx, encoder_context, encode_state);

    //Scan every slice/component to find the highest huffman table selector
    //used, which determines how many tables must be sent to the HW.
    //I dont think I need this for loop. Just to be consistent with other encoding logic...
    for(i = 0; i < encode_state->num_slice_params_ext; i++) {
        assert(encode_state->slice_params_ext && encode_state->slice_params_ext[i]->buffer);
        slice_param = (VAEncSliceParameterBufferJPEG *)encode_state->slice_params_ext[i]->buffer;
        
        for(j = 0; j < encode_state->slice_params_ext[i]->num_elements; j++) {
            
            for(component = 0; component < slice_param->num_components; component++) {
                if(max_selector < slice_param->components[component].dc_table_selector)
                    max_selector = slice_param->components[component].dc_table_selector;
                
                if (max_selector < slice_param->components[component].ac_table_selector)
                    max_selector = slice_param->components[component].ac_table_selector;
            }
            
            slice_param++;
        }
    }    

    //Baseline JPEG allows at most 2 huffman table pairs (selectors 0 and 1)
    assert(max_selector < 2);
    //send the huffman table using MFC_JPEG_HUFF_TABLE
    gen8_mfc_jpeg_huff_table_state(ctx, encode_state, encoder_context, max_selector+1);
    //set MFC_JPEG_SCAN_OBJECT
    gen8_mfc_jpeg_scan_object(ctx, encode_state, encoder_context);
    //add headers using MFX_PAK_INSERT_OBJECT (it is refered as MFX_INSERT_OBJECT in this driver code)
    gen8_mfc_jpeg_add_headers(ctx, encode_state, encoder_context);
       
}
3258
3259 static void
3260 gen8_mfc_jpeg_pipeline_programing(VADriverContextP ctx,
3261                                    struct encode_state *encode_state,
3262                                    struct intel_encoder_context *encoder_context)
3263 {
3264     struct intel_batchbuffer *batch = encoder_context->base.batch;
3265     
3266     // begin programing
3267     intel_batchbuffer_start_atomic_bcs(batch, 0x4000); 
3268     intel_batchbuffer_emit_mi_flush(batch);
3269     
3270     // picture level programing
3271     gen8_mfc_jpeg_pipeline_picture_programing(ctx, encode_state, encoder_context);
3272
3273     // end programing
3274     intel_batchbuffer_end_atomic(batch);
3275
3276 }
3277
3278
/* Entry point for encoding one JPEG picture: (re)initialize the MFC
 * context, prepare the JPEG-specific resources, program the BCS batch
 * and kick off execution. Always reports success; failures surface via
 * the asserts in the programming path. */
static VAStatus
gen8_mfc_jpeg_encode_picture(VADriverContextP ctx, 
                              struct encode_state *encode_state,
                              struct intel_encoder_context *encoder_context)
{
    gen8_mfc_init(ctx, encode_state, encoder_context);
    intel_mfc_jpeg_prepare(ctx, encode_state, encoder_context);
    /*Programing bcs pipeline*/
    gen8_mfc_jpeg_pipeline_programing(ctx, encode_state, encoder_context);
    gen8_mfc_run(ctx, encode_state, encoder_context);

    return VA_STATUS_SUCCESS;
}
3292
3293 static int gen8_mfc_vp8_qindex_estimate(struct encode_state *encode_state,
3294                                         struct gen6_mfc_context *mfc_context,
3295                                         int target_frame_size,
3296                                         int is_key_frame)
3297 {
3298     VAEncSequenceParameterBufferVP8 *seq_param = (VAEncSequenceParameterBufferVP8 *)encode_state->seq_param_ext->buffer;
3299     VAEncPictureParameterBufferVP8 *pic_param = (VAEncPictureParameterBufferVP8 *)encode_state->pic_param_ext->buffer;
3300     unsigned int max_qindex = pic_param->clamp_qindex_high;
3301     unsigned int min_qindex = pic_param->clamp_qindex_low;
3302     int width_in_mbs = ALIGN(seq_param->frame_width, 16) / 16;
3303     int height_in_mbs = ALIGN(seq_param->frame_height, 16) / 16;
3304     int target_mb_size;
3305     int last_size_gap  = -1;
3306     int per_mb_size_at_qindex;
3307     int target_qindex = min_qindex, i;
3308
3309     /* make sure would not overflow*/
3310     if (target_frame_size >= (0x7fffffff >> 9))
3311         target_mb_size = (target_frame_size / width_in_mbs / height_in_mbs) << 9;
3312     else
3313         target_mb_size = (target_frame_size << 9) / width_in_mbs / height_in_mbs;
3314
3315     for (i = min_qindex; i <= max_qindex; i++) {
3316         per_mb_size_at_qindex = vp8_bits_per_mb[!is_key_frame][i];
3317         target_qindex = i;
3318         if (per_mb_size_at_qindex <= target_mb_size) {
3319             if (target_mb_size - per_mb_size_at_qindex < last_size_gap)
3320                 target_qindex--;
3321             break;
3322         }
3323         else
3324             last_size_gap = per_mb_size_at_qindex - target_mb_size;
3325     }
3326
3327     return target_qindex;
3328 }
3329
/* Initialize VP8 bit-rate-control state: per-GOP target frame sizes for
 * I and P frames, initial qindex estimates, and the HRD buffer model. */
static void gen8_mfc_vp8_brc_init(struct encode_state *encode_state,
                               struct intel_encoder_context* encoder_context)
{
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
    VAEncSequenceParameterBufferVP8 *seq_param = (VAEncSequenceParameterBufferVP8 *)encode_state->seq_param_ext->buffer;
    double bitrate = encoder_context->brc.bits_per_second[0];
    double framerate = (double)encoder_context->brc.framerate[0].num / (double)encoder_context->brc.framerate[0].den;
    int inum = 1, pnum = 0;   /* one I frame per GOP; the rest are P frames */
    int intra_period = seq_param->intra_period;
    int width_in_mbs = ALIGN(seq_param->frame_width, 16) / 16;
    int height_in_mbs = ALIGN(seq_param->frame_height, 16) / 16;
    int max_frame_size =  (vp8_bits_per_mb[0][0] >> 9) * width_in_mbs * height_in_mbs;/* vp8_bits_per_mb table mutilpled 512 */

    pnum = intra_period  - 1;

    mfc_context->brc.mode = encoder_context->rate_control_mode;

    /* Split the GOP bit budget between the I frame and the P frames,
     * weighting P frames by BRC_PWEIGHT relative to the I frame. */
    mfc_context->brc.target_frame_size[0][SLICE_TYPE_I] = (int)((double)((bitrate * intra_period) / framerate) /
                                                             (double)(inum + BRC_PWEIGHT * pnum ));
    mfc_context->brc.target_frame_size[0][SLICE_TYPE_P] = BRC_PWEIGHT * mfc_context->brc.target_frame_size[0][SLICE_TYPE_I];

    mfc_context->brc.gop_nums[0][SLICE_TYPE_I] = inum;
    mfc_context->brc.gop_nums[0][SLICE_TYPE_P] = pnum;

    mfc_context->brc.bits_per_frame[0] = bitrate / framerate;

    /* Seed the qindex for each frame type from the bits-per-MB table. */
    mfc_context->brc.qp_prime_y[0][SLICE_TYPE_I] = gen8_mfc_vp8_qindex_estimate(encode_state,
                                                                                mfc_context,
                                                                                mfc_context->brc.target_frame_size[0][SLICE_TYPE_I],
                                                                                1);
    mfc_context->brc.qp_prime_y[0][SLICE_TYPE_P] = gen8_mfc_vp8_qindex_estimate(encode_state,
                                                                                mfc_context,
                                                                                mfc_context->brc.target_frame_size[0][SLICE_TYPE_P],
                                                                                0);

    /* HRD buffer: default to one second of data when the app gives no size;
     * start half-full unless a valid initial fullness was supplied. */
    if (encoder_context->brc.hrd_buffer_size)
        mfc_context->hrd.buffer_size[0] = (double)encoder_context->brc.hrd_buffer_size;
    else
        mfc_context->hrd.buffer_size[0] = bitrate;
    if (encoder_context->brc.hrd_initial_buffer_fullness &&
        encoder_context->brc.hrd_initial_buffer_fullness < mfc_context->hrd.buffer_size[0])
        mfc_context->hrd.current_buffer_fullness[0] = (double)encoder_context->brc.hrd_initial_buffer_fullness;
    else
        mfc_context->hrd.current_buffer_fullness[0] = mfc_context->hrd.buffer_size[0] / 2.0;
    mfc_context->hrd.target_buffer_fullness[0] = (double)mfc_context->hrd.buffer_size[0] / 2.0;
    mfc_context->hrd.buffer_capacity[0] = (double)mfc_context->hrd.buffer_size[0] / max_frame_size;
    mfc_context->hrd.violation_noted = 0;
}
3378
/* Post-encode BRC update for VP8: given the actual size of the just-encoded
 * frame (frame_bits), predict the quantizer for the next frame of the same
 * type, apply an HRD-fullness-based correction, and clamp everything to the
 * picture's qindex range. Returns the HRD status (gen6_brc_status). */
static int gen8_mfc_vp8_brc_postpack(struct encode_state *encode_state,
                           struct intel_encoder_context *encoder_context,
                           int frame_bits)
{
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
    gen6_brc_status sts = BRC_NO_HRD_VIOLATION;
    VAEncPictureParameterBufferVP8 *pic_param = (VAEncPictureParameterBufferVP8 *)encode_state->pic_param_ext->buffer;
    int is_key_frame = !pic_param->pic_flags.bits.frame_type;
    int slicetype = (is_key_frame ? SLICE_TYPE_I : SLICE_TYPE_P);
    int qpi = mfc_context->brc.qp_prime_y[0][SLICE_TYPE_I];
    int qpp = mfc_context->brc.qp_prime_y[0][SLICE_TYPE_P];
    int qp; // quantizer of previously encoded slice of current type
    int qpn; // predicted quantizer for next frame of current type in integer format
    double qpf; // predicted quantizer for next frame of current type in float format
    double delta_qp; // QP correction
    int target_frame_size, frame_size_next;
    /* Notes:
     *  x - how far we are from HRD buffer borders
     *  y - how far we are from target HRD buffer fullness
     */
    double x, y;
    double frame_size_alpha;
    unsigned int max_qindex = pic_param->clamp_qindex_high;
    unsigned int min_qindex = pic_param->clamp_qindex_low;

    qp = mfc_context->brc.qp_prime_y[0][slicetype];

    /* Smooth the next-frame size prediction: the larger alpha is, the
     * slower we chase the target (alpha comes from the GOP structure). */
    target_frame_size = mfc_context->brc.target_frame_size[0][slicetype];
    if (mfc_context->hrd.buffer_capacity[0] < 5)
        frame_size_alpha = 0;
    else
        frame_size_alpha = (double)mfc_context->brc.gop_nums[0][slicetype];
    if (frame_size_alpha > 30) frame_size_alpha = 30;
    frame_size_next = target_frame_size + (double)(target_frame_size - frame_bits) /
        (double)(frame_size_alpha + 1.);

    /* frame_size_next: avoiding negative number and too small value */
    if ((double)frame_size_next < (double)(target_frame_size * 0.25))
        frame_size_next = (int)((double)target_frame_size * 0.25);

    /* Predict the next QP assuming bits scale inversely with QP. */
    qpf = (double)qp * target_frame_size / frame_size_next;
    qpn = (int)(qpf + 0.5);

    if (qpn == qp) {
        /* setting qpn we round qpf making mistakes: now we are trying to compensate this */
        mfc_context->brc.qpf_rounding_accumulator[0] += qpf - qpn;
        if (mfc_context->brc.qpf_rounding_accumulator[0] > 1.0) {
            qpn++;
            mfc_context->brc.qpf_rounding_accumulator[0] = 0.;
        } else if (mfc_context->brc.qpf_rounding_accumulator[0] < -1.0) {
            qpn--;
            mfc_context->brc.qpf_rounding_accumulator[0] = 0.;
        }
    }

    /* making sure that QP is not changing too fast */
    if ((qpn - qp) > BRC_QP_MAX_CHANGE) qpn = qp + BRC_QP_MAX_CHANGE;
    else if ((qpn - qp) < -BRC_QP_MAX_CHANGE) qpn = qp - BRC_QP_MAX_CHANGE;
    /* making sure that with QP predictions we did do not leave QPs range */
    BRC_CLIP(qpn, min_qindex, max_qindex);

    /* checking wthether HRD compliance is still met */
    sts = intel_mfc_update_hrd(encode_state, encoder_context, frame_bits);

    /* calculating QP delta as some function*/
    x = mfc_context->hrd.target_buffer_fullness[0] - mfc_context->hrd.current_buffer_fullness[0];
    if (x > 0) {
        x /= mfc_context->hrd.target_buffer_fullness[0];
        y = mfc_context->hrd.current_buffer_fullness[0];
    }
    else {
        x /= (mfc_context->hrd.buffer_size[0] - mfc_context->hrd.target_buffer_fullness[0]);
        y = mfc_context->hrd.buffer_size[0] - mfc_context->hrd.current_buffer_fullness[0];
    }
    if (y < 0.01) y = 0.01;
    if (x > 1) x = 1;
    else if (x < -1) x = -1;

    /* Smooth sinusoidal correction: pushes QP up when the buffer is
     * draining and down when it is filling, damped by exp(-1/y). */
    delta_qp = BRC_QP_MAX_CHANGE*exp(-1/y)*sin(BRC_PI_0_5 * x);
    qpn = (int)(qpn + delta_qp + 0.5);

    /* making sure that with QP predictions we did do not leave QPs range */
    BRC_CLIP(qpn, min_qindex, max_qindex);

    if (sts == BRC_NO_HRD_VIOLATION) { // no HRD violation
        /* correcting QPs of slices of other types */
        if (!is_key_frame) {
            if (abs(qpn - BRC_I_P_QP_DIFF - qpi) > 4)
                mfc_context->brc.qp_prime_y[0][SLICE_TYPE_I] += (qpn - BRC_I_P_QP_DIFF - qpi) >> 2;
        } else {
            if (abs(qpn + BRC_I_P_QP_DIFF - qpp) > 4)
                mfc_context->brc.qp_prime_y[0][SLICE_TYPE_P] += (qpn + BRC_I_P_QP_DIFF - qpp) >> 2;
        }
        BRC_CLIP(mfc_context->brc.qp_prime_y[0][SLICE_TYPE_I], min_qindex, max_qindex);
        BRC_CLIP(mfc_context->brc.qp_prime_y[0][SLICE_TYPE_P], min_qindex, max_qindex);
    } else if (sts == BRC_UNDERFLOW) { // underflow
        if (qpn <= qp) qpn = qp + 2;
        if (qpn > max_qindex) {
            qpn = max_qindex;
            sts = BRC_UNDERFLOW_WITH_MAX_QP; //underflow with maxQP
        }
    } else if (sts == BRC_OVERFLOW) {
        if (qpn >= qp) qpn = qp - 2;
        if (qpn < min_qindex) { // < 0 (?) overflow with minQP
            qpn = min_qindex;
            sts = BRC_OVERFLOW_WITH_MIN_QP; // bit stuffing to be done
        }
    }

    mfc_context->brc.qp_prime_y[0][slicetype] = qpn;

    return sts;
}
3492
3493 static void gen8_mfc_vp8_hrd_context_init(struct encode_state *encode_state,
3494                                        struct intel_encoder_context *encoder_context)
3495 {
3496     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
3497     unsigned int rate_control_mode = encoder_context->rate_control_mode;
3498     int target_bit_rate = encoder_context->brc.bits_per_second[0];
3499
3500     // current we only support CBR mode.
3501     if (rate_control_mode == VA_RC_CBR) {
3502         mfc_context->vui_hrd.i_bit_rate_value = target_bit_rate >> 10;
3503         mfc_context->vui_hrd.i_initial_cpb_removal_delay = ((target_bit_rate * 8) >> 10) * 0.5 * 1024 / target_bit_rate * 90000;
3504         mfc_context->vui_hrd.i_cpb_removal_delay = 2;
3505         mfc_context->vui_hrd.i_frame_number = 0;
3506
3507         mfc_context->vui_hrd.i_initial_cpb_removal_delay_length = 24;
3508         mfc_context->vui_hrd.i_cpb_removal_delay_length = 24;
3509         mfc_context->vui_hrd.i_dpb_output_delay_length = 24;
3510     }
3511
3512 }
3513
3514 static void gen8_mfc_vp8_hrd_context_update(struct encode_state *encode_state,
3515                              struct gen6_mfc_context *mfc_context)
3516 {
3517     mfc_context->vui_hrd.i_frame_number++;
3518 }
3519
3520 static void gen8_mfc_vp8_brc_prepare(struct encode_state *encode_state,
3521                            struct intel_encoder_context *encoder_context)
3522 {
3523     unsigned int rate_control_mode = encoder_context->rate_control_mode;
3524
3525     if (rate_control_mode == VA_RC_CBR) {
3526         bool brc_updated;
3527         assert(encoder_context->codec != CODEC_MPEG2);
3528
3529         brc_updated = encoder_context->brc.need_reset;
3530
3531         /*Programing bit rate control */
3532         if (brc_updated) {
3533             gen8_mfc_vp8_brc_init(encode_state, encoder_context);
3534         }
3535
3536         /*Programing HRD control */
3537         if (brc_updated)
3538             gen8_mfc_vp8_hrd_context_init(encode_state, encoder_context);
3539     }
3540 }
3541
3542 static void vp8_enc_state_init(struct gen6_mfc_context *mfc_context,
3543                                VAEncPictureParameterBufferVP8 *pic_param,
3544                                VAQMatrixBufferVP8 *q_matrix)
3545 {
3546
3547     int is_key_frame = !pic_param->pic_flags.bits.frame_type;
3548     unsigned char *coeff_probs_stream_in_buffer;
3549     
3550     mfc_context->vp8_state.frame_header_lf_update_pos = 0;
3551     mfc_context->vp8_state.frame_header_qindex_update_pos = 0;
3552     mfc_context->vp8_state.frame_header_token_update_pos = 0;
3553     mfc_context->vp8_state.frame_header_bin_mv_upate_pos = 0;
3554
3555     mfc_context->vp8_state.prob_skip_false = 255;
3556     memset(mfc_context->vp8_state.mb_segment_tree_probs, 0, sizeof(mfc_context->vp8_state.mb_segment_tree_probs));
3557     memcpy(mfc_context->vp8_state.mv_probs, vp8_default_mv_context, sizeof(mfc_context->vp8_state.mv_probs));
3558     
3559     if (is_key_frame) {
3560         memcpy(mfc_context->vp8_state.y_mode_probs, vp8_kf_ymode_prob, sizeof(mfc_context->vp8_state.y_mode_probs));
3561         memcpy(mfc_context->vp8_state.uv_mode_probs, vp8_kf_uv_mode_prob, sizeof(mfc_context->vp8_state.uv_mode_probs));
3562
3563         mfc_context->vp8_state.prob_intra = 255;
3564         mfc_context->vp8_state.prob_last = 128;
3565         mfc_context->vp8_state.prob_gf = 128;
3566     } else {
3567         memcpy(mfc_context->vp8_state.y_mode_probs, vp8_ymode_prob, sizeof(mfc_context->vp8_state.y_mode_probs));
3568         memcpy(mfc_context->vp8_state.uv_mode_probs, vp8_uv_mode_prob, sizeof(mfc_context->vp8_state.uv_mode_probs));
3569
3570         mfc_context->vp8_state.prob_intra = 63;
3571         mfc_context->vp8_state.prob_last = 128;
3572         mfc_context->vp8_state.prob_gf = 128;
3573     }
3574     
3575     mfc_context->vp8_state.prob_skip_false = vp8_base_skip_false_prob[q_matrix->quantization_index[0]];
3576   
3577     dri_bo_map(mfc_context->vp8_state.coeff_probs_stream_in_bo, 1);
3578     coeff_probs_stream_in_buffer = (unsigned char *)mfc_context->vp8_state.coeff_probs_stream_in_bo->virtual;
3579     assert(coeff_probs_stream_in_buffer);
3580     memcpy(coeff_probs_stream_in_buffer, vp8_default_coef_probs, sizeof(vp8_default_coef_probs));
3581     dri_bo_unmap(mfc_context->vp8_state.coeff_probs_stream_in_bo);
3582 }
3583
/*
 * Update the VP8 probability state between frames.
 *
 * Currently an intentional no-op: only the defaults installed by
 * vp8_enc_state_init() are used.  NOTE(review): adaptive mode/token
 * probability updates across frames are not implemented yet.
 */
static void vp8_enc_state_update(struct gen6_mfc_context *mfc_context,
                                 VAQMatrixBufferVP8 *q_matrix)
{

    /*some other probabilities need to be updated*/
}
3590
3591 extern void binarize_vp8_frame_header(VAEncSequenceParameterBufferVP8 *seq_param,
3592                            VAEncPictureParameterBufferVP8 *pic_param,
3593                            VAQMatrixBufferVP8 *q_matrix,
3594                            struct gen6_mfc_context *mfc_context,
3595                            struct intel_encoder_context *encoder_context);
3596
3597 static void vp8_enc_frame_header_binarize(struct encode_state *encode_state,
3598                                           struct intel_encoder_context *encoder_context,
3599                                           struct gen6_mfc_context *mfc_context)
3600 {
3601     VAEncSequenceParameterBufferVP8 *seq_param = (VAEncSequenceParameterBufferVP8 *)encode_state->seq_param_ext->buffer;
3602     VAEncPictureParameterBufferVP8 *pic_param = (VAEncPictureParameterBufferVP8 *)encode_state->pic_param_ext->buffer;
3603     VAQMatrixBufferVP8 *q_matrix = (VAQMatrixBufferVP8 *)encode_state->q_matrix->buffer;
3604     unsigned char *frame_header_buffer;
3605
3606     binarize_vp8_frame_header(seq_param, pic_param, q_matrix, mfc_context, encoder_context);
3607  
3608     dri_bo_map(mfc_context->vp8_state.frame_header_bo, 1);
3609     frame_header_buffer = (unsigned char *)mfc_context->vp8_state.frame_header_bo->virtual;
3610     assert(frame_header_buffer);
3611     memcpy(frame_header_buffer, mfc_context->vp8_state.vp8_frame_header, (mfc_context->vp8_state.frame_header_bit_count + 7) / 8);
3612     free(mfc_context->vp8_state.vp8_frame_header);
3613     dri_bo_unmap(mfc_context->vp8_state.frame_header_bo);
3614 }
3615
3616 #define MAX_VP8_FRAME_HEADER_SIZE              0x2000
3617 #define VP8_TOKEN_STATISTICS_BUFFER_SIZE       0x2000
3618
3619 static void gen8_mfc_vp8_init(VADriverContextP ctx,
3620                           struct encode_state *encode_state,
3621                           struct intel_encoder_context *encoder_context)
3622 {
3623     struct i965_driver_data *i965 = i965_driver_data(ctx);
3624     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
3625     dri_bo *bo;
3626     int i;
3627     int width_in_mbs = 0;
3628     int height_in_mbs = 0;
3629     int slice_batchbuffer_size;
3630     int is_key_frame, slice_type, rate_control_mode;
3631
3632     VAEncSequenceParameterBufferVP8 *pSequenceParameter = (VAEncSequenceParameterBufferVP8 *)encode_state->seq_param_ext->buffer;
3633     VAEncPictureParameterBufferVP8 *pic_param = (VAEncPictureParameterBufferVP8 *)encode_state->pic_param_ext->buffer;
3634     VAQMatrixBufferVP8 *q_matrix = (VAQMatrixBufferVP8 *)encode_state->q_matrix->buffer;
3635
3636     width_in_mbs = ALIGN(pSequenceParameter->frame_height, 16) / 16;
3637     height_in_mbs = ALIGN(pSequenceParameter->frame_height, 16) / 16;
3638
3639     is_key_frame = !pic_param->pic_flags.bits.frame_type;
3640     slice_type = (is_key_frame ? SLICE_TYPE_I : SLICE_TYPE_P);
3641     rate_control_mode = encoder_context->rate_control_mode;
3642
3643     if (rate_control_mode == VA_RC_CBR) {
3644         q_matrix->quantization_index[0] = mfc_context->brc.qp_prime_y[0][slice_type];
3645         for (i = 1; i < 4; i++)
3646             q_matrix->quantization_index[i] = q_matrix->quantization_index[0];
3647         for (i = 0; i < 5; i++)
3648             q_matrix->quantization_index_delta[i] = 0;
3649     }
3650
3651     slice_batchbuffer_size = 64 * width_in_mbs * height_in_mbs + 4096 +
3652         (SLICE_HEADER + SLICE_TAIL);
3653
3654     /*Encode common setup for MFC*/
3655     dri_bo_unreference(mfc_context->post_deblocking_output.bo);
3656     mfc_context->post_deblocking_output.bo = NULL;
3657
3658     dri_bo_unreference(mfc_context->pre_deblocking_output.bo);
3659     mfc_context->pre_deblocking_output.bo = NULL;
3660
3661     dri_bo_unreference(mfc_context->uncompressed_picture_source.bo);
3662     mfc_context->uncompressed_picture_source.bo = NULL;
3663
3664     dri_bo_unreference(mfc_context->mfc_indirect_pak_bse_object.bo);
3665     mfc_context->mfc_indirect_pak_bse_object.bo = NULL;
3666
3667     for (i = 0; i < NUM_MFC_DMV_BUFFERS; i++){
3668         if ( mfc_context->direct_mv_buffers[i].bo != NULL)
3669             dri_bo_unreference(mfc_context->direct_mv_buffers[i].bo);
3670         mfc_context->direct_mv_buffers[i].bo = NULL;
3671     }
3672
3673     for (i = 0; i < MAX_MFC_REFERENCE_SURFACES; i++){
3674         if (mfc_context->reference_surfaces[i].bo != NULL)
3675             dri_bo_unreference(mfc_context->reference_surfaces[i].bo);
3676         mfc_context->reference_surfaces[i].bo = NULL;
3677     }
3678
3679     dri_bo_unreference(mfc_context->intra_row_store_scratch_buffer.bo);
3680     bo = dri_bo_alloc(i965->intel.bufmgr,
3681                       "Buffer",
3682                       width_in_mbs * 64 * 16,
3683                       64);
3684     assert(bo);
3685     mfc_context->intra_row_store_scratch_buffer.bo = bo;
3686
3687     dri_bo_unreference(mfc_context->macroblock_status_buffer.bo);
3688     bo = dri_bo_alloc(i965->intel.bufmgr,
3689                       "Buffer",
3690                       width_in_mbs * height_in_mbs * 16,
3691                       64);
3692     assert(bo);
3693     mfc_context->macroblock_status_buffer.bo = bo;
3694
3695     dri_bo_unreference(mfc_context->deblocking_filter_row_store_scratch_buffer.bo);
3696     bo = dri_bo_alloc(i965->intel.bufmgr,
3697                       "Buffer",
3698                       16 * width_in_mbs * 64,  /* 16 * width_in_mbs * 64 */
3699                       64);
3700     assert(bo);
3701     mfc_context->deblocking_filter_row_store_scratch_buffer.bo = bo;
3702
3703     dri_bo_unreference(mfc_context->bsd_mpc_row_store_scratch_buffer.bo);
3704     bo = dri_bo_alloc(i965->intel.bufmgr,
3705                       "Buffer",
3706                       16 * width_in_mbs * 64, /* 16 * width_in_mbs * 64 */
3707                       0x1000);
3708     assert(bo);
3709     mfc_context->bsd_mpc_row_store_scratch_buffer.bo = bo;
3710
3711     dri_bo_unreference(mfc_context->mfc_batchbuffer_surface.bo);
3712     mfc_context->mfc_batchbuffer_surface.bo = NULL;
3713
3714     dri_bo_unreference(mfc_context->aux_batchbuffer_surface.bo);
3715     mfc_context->aux_batchbuffer_surface.bo = NULL;
3716
3717     if (mfc_context->aux_batchbuffer) {
3718         intel_batchbuffer_free(mfc_context->aux_batchbuffer);
3719         mfc_context->aux_batchbuffer = NULL;
3720     }
3721
3722     mfc_context->aux_batchbuffer = intel_batchbuffer_new(&i965->intel, I915_EXEC_BSD, slice_batchbuffer_size);
3723     mfc_context->aux_batchbuffer_surface.bo = mfc_context->aux_batchbuffer->buffer;
3724     dri_bo_reference(mfc_context->aux_batchbuffer_surface.bo);
3725     mfc_context->aux_batchbuffer_surface.pitch = 16;
3726     mfc_context->aux_batchbuffer_surface.num_blocks = mfc_context->aux_batchbuffer->size / 16;
3727     mfc_context->aux_batchbuffer_surface.size_block = 16;
3728
3729     gen8_gpe_context_init(ctx, &mfc_context->gpe_context);
3730
3731     /* alloc vp8 encoding buffers*/
3732     dri_bo_unreference(mfc_context->vp8_state.frame_header_bo);
3733     bo = dri_bo_alloc(i965->intel.bufmgr,
3734                       "Buffer",
3735                       MAX_VP8_FRAME_HEADER_SIZE,
3736                       0x1000);
3737     assert(bo);
3738     mfc_context->vp8_state.frame_header_bo = bo;
3739
3740     mfc_context->vp8_state.intermediate_buffer_max_size = width_in_mbs * height_in_mbs * 384 * 9;
3741     for(i = 0; i < 8; i++) {
3742         mfc_context->vp8_state.intermediate_partition_offset[i] = width_in_mbs * height_in_mbs * 384 * (i + 1);
3743     }
3744     dri_bo_unreference(mfc_context->vp8_state.intermediate_bo);
3745     bo = dri_bo_alloc(i965->intel.bufmgr,
3746                       "Buffer",
3747                       mfc_context->vp8_state.intermediate_buffer_max_size,
3748                       0x1000);
3749     assert(bo);
3750     mfc_context->vp8_state.intermediate_bo = bo;
3751
3752     dri_bo_unreference(mfc_context->vp8_state.stream_out_bo);
3753     bo = dri_bo_alloc(i965->intel.bufmgr,
3754                       "Buffer",
3755                       width_in_mbs * height_in_mbs * 16,
3756                       0x1000);
3757     assert(bo);
3758     mfc_context->vp8_state.stream_out_bo = bo;
3759
3760     dri_bo_unreference(mfc_context->vp8_state.coeff_probs_stream_in_bo);
3761     bo = dri_bo_alloc(i965->intel.bufmgr,
3762                       "Buffer",
3763                       sizeof(vp8_default_coef_probs),
3764                       0x1000);
3765     assert(bo);
3766     mfc_context->vp8_state.coeff_probs_stream_in_bo = bo;
3767
3768     dri_bo_unreference(mfc_context->vp8_state.token_statistics_bo);
3769     bo = dri_bo_alloc(i965->intel.bufmgr,
3770                       "Buffer",
3771                       VP8_TOKEN_STATISTICS_BUFFER_SIZE,
3772                       0x1000);
3773     assert(bo);
3774     mfc_context->vp8_state.token_statistics_bo = bo;
3775
3776     dri_bo_unreference(mfc_context->vp8_state.mpc_row_store_bo);
3777     bo = dri_bo_alloc(i965->intel.bufmgr,
3778                       "Buffer",
3779                       width_in_mbs * 16 * 64,
3780                       0x1000);
3781     assert(bo);
3782     mfc_context->vp8_state.mpc_row_store_bo = bo;
3783
3784     vp8_enc_state_init(mfc_context, pic_param, q_matrix);
3785     vp8_enc_frame_header_binarize(encode_state, encoder_context, mfc_context);
3786 }
3787
/*
 * Bind all per-frame surfaces and buffers into the MFC context:
 * reconstructed output, reference frames, input YUV and the coded
 * (output bitstream) buffer.  Each binding takes an extra dri_bo
 * reference, balanced by the unreference calls in gen8_mfc_vp8_init()
 * on the next frame.
 */
static VAStatus
intel_mfc_vp8_prepare(VADriverContextP ctx,
                        struct encode_state *encode_state,
                        struct intel_encoder_context *encoder_context)
{
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
    struct object_surface *obj_surface;
    struct object_buffer *obj_buffer;
    struct i965_coded_buffer_segment *coded_buffer_segment;
    VAEncPictureParameterBufferVP8 *pic_param = (VAEncPictureParameterBufferVP8 *)encode_state->pic_param_ext->buffer;
    VAStatus vaStatus = VA_STATUS_SUCCESS;
    dri_bo *bo;
    int i;

    /* reconstructed surface: route around the deblocker when loop
     * filtering is disabled (level 0). */
    obj_surface = encode_state->reconstructed_object;
    i965_check_alloc_surface_bo(ctx, obj_surface, 1, VA_FOURCC('N','V','1','2'), SUBSAMPLE_YUV420);
    if (pic_param->loop_filter_level[0] == 0) {
        mfc_context->pre_deblocking_output.bo = obj_surface->bo;
        dri_bo_reference(mfc_context->pre_deblocking_output.bo);
    } else {
        mfc_context->post_deblocking_output.bo = obj_surface->bo;
        dri_bo_reference(mfc_context->post_deblocking_output.bo);
    }

    mfc_context->surface_state.width = obj_surface->orig_width;
    mfc_context->surface_state.height = obj_surface->orig_height;
    mfc_context->surface_state.w_pitch = obj_surface->width;
    mfc_context->surface_state.h_pitch = obj_surface->height;

    /* set vp8 reference frames; missing slots are left NULL. */
    for (i = 0; i < ARRAY_ELEMS(mfc_context->reference_surfaces); i++) {
        obj_surface = encode_state->reference_objects[i];

        if (obj_surface && obj_surface->bo) {
            mfc_context->reference_surfaces[i].bo = obj_surface->bo;
            dri_bo_reference(mfc_context->reference_surfaces[i].bo);
        } else {
            mfc_context->reference_surfaces[i].bo = NULL;
        }
    }

    /* input YUV surface */
    obj_surface = encode_state->input_yuv_object;
    mfc_context->uncompressed_picture_source.bo = obj_surface->bo;
    dri_bo_reference(mfc_context->uncompressed_picture_source.bo);

    /* coded buffer: payload starts after the driver's header segment. */
    obj_buffer = encode_state->coded_buf_object;
    bo = obj_buffer->buffer_store->bo;
    mfc_context->mfc_indirect_pak_bse_object.bo = bo;
    mfc_context->mfc_indirect_pak_bse_object.offset = I965_CODEDBUFFER_HEADER_SIZE;
    mfc_context->mfc_indirect_pak_bse_object.end_offset = ALIGN(obj_buffer->size_element - 0x1000, 0x1000);
    dri_bo_reference(mfc_context->mfc_indirect_pak_bse_object.bo);

    /* The final VP8 frame is written directly into the coded buffer. */
    dri_bo_unreference(mfc_context->vp8_state.final_frame_bo);
    mfc_context->vp8_state.final_frame_bo = mfc_context->mfc_indirect_pak_bse_object.bo;
    mfc_context->vp8_state.final_frame_byte_offset = I965_CODEDBUFFER_HEADER_SIZE;
    dri_bo_reference(mfc_context->vp8_state.final_frame_bo);

    /* set the internal flag to 0 to indicate the coded size is unknown */
    dri_bo_map(bo, 1);
    coded_buffer_segment = (struct i965_coded_buffer_segment *)bo->virtual;
    coded_buffer_segment->mapped = 0;
    coded_buffer_segment->codec = encoder_context->codec;
    dri_bo_unmap(bo);

    return vaStatus;
}
3857
/*
 * Emit the MFX_VP8_ENCODER_CFG command (30 DWs on Gen8), which programs
 * the hardware's VP8 rate-control/statistics behaviour and tells it the
 * frame-header bit count plus the in-header patch positions produced by
 * vp8_enc_frame_header_binarize().  DW-by-DW meanings are noted inline.
 */
static void
gen8_mfc_vp8_encoder_cfg(VADriverContextP ctx, 
                         struct encode_state *encode_state,
                         struct intel_encoder_context *encoder_context)
{
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
    VAEncSequenceParameterBufferVP8 *seq_param = (VAEncSequenceParameterBufferVP8 *)encode_state->seq_param_ext->buffer;
    VAEncPictureParameterBufferVP8 *pic_param = (VAEncPictureParameterBufferVP8 *)encode_state->pic_param_ext->buffer;

    BEGIN_BCS_BATCH(batch, 30);
    OUT_BCS_BATCH(batch, MFX_VP8_ENCODER_CFG | (30 - 2)); /* SKL should be 31-2 ? */

    OUT_BCS_BATCH(batch,
                  0 << 9 | /* compressed bitstream output disable */
                  1 << 7 | /* disable per-segment delta qindex and loop filter in RC */
                  1 << 6 | /* RC initial pass */
                  0 << 4 | /* upate segment feature date flag */
                  1 << 3 | /* bitstream statistics output enable */
                  1 << 2 | /* token statistics output enable */
                  0 << 1 | /* final bitstream output disable */
                  0 << 0); /*DW1*/
    
    OUT_BCS_BATCH(batch, 0); /*DW2*/

    OUT_BCS_BATCH(batch, 
                  0xfff << 16 | /* max intra mb bit count limit */
                  0xfff << 0  /* max inter mb bit count limit */
                  ); /*DW3*/

    /* DW4-DW21: rate-control thresholds/deltas, unused in this mode. */
    OUT_BCS_BATCH(batch, 0); /*DW4*/
    OUT_BCS_BATCH(batch, 0); /*DW5*/
    OUT_BCS_BATCH(batch, 0); /*DW6*/
    OUT_BCS_BATCH(batch, 0); /*DW7*/
    OUT_BCS_BATCH(batch, 0); /*DW8*/
    OUT_BCS_BATCH(batch, 0); /*DW9*/
    OUT_BCS_BATCH(batch, 0); /*DW10*/
    OUT_BCS_BATCH(batch, 0); /*DW11*/
    OUT_BCS_BATCH(batch, 0); /*DW12*/
    OUT_BCS_BATCH(batch, 0); /*DW13*/
    OUT_BCS_BATCH(batch, 0); /*DW14*/
    OUT_BCS_BATCH(batch, 0); /*DW15*/
    OUT_BCS_BATCH(batch, 0); /*DW16*/
    OUT_BCS_BATCH(batch, 0); /*DW17*/
    OUT_BCS_BATCH(batch, 0); /*DW18*/
    OUT_BCS_BATCH(batch, 0); /*DW19*/
    OUT_BCS_BATCH(batch, 0); /*DW20*/
    OUT_BCS_BATCH(batch, 0); /*DW21*/

    OUT_BCS_BATCH(batch, 
                 pic_param->pic_flags.bits.show_frame << 23 |
                 pic_param->pic_flags.bits.version << 20
                 ); /*DW22*/

    /* DW23: scaled frame dimensions, packed height:width. */
    OUT_BCS_BATCH(batch,
                 (seq_param->frame_height_scale << 14 | seq_param->frame_height) << 16 |
                 (seq_param->frame_width_scale << 14 | seq_param->frame_width) << 0
                 );

    /*DW24*/
    OUT_BCS_BATCH(batch, mfc_context->vp8_state.frame_header_bit_count); /* frame header bit count */

    /*DW25*/
    OUT_BCS_BATCH(batch, mfc_context->vp8_state.frame_header_qindex_update_pos); /* frame header bin buffer qindex update pointer */

    /*DW26*/
    OUT_BCS_BATCH(batch, mfc_context->vp8_state.frame_header_lf_update_pos); /* frame header bin buffer loop filter update pointer*/

    /*DW27*/
    OUT_BCS_BATCH(batch, mfc_context->vp8_state.frame_header_token_update_pos); /* frame header bin buffer token update pointer */

    /*DW28*/
    OUT_BCS_BATCH(batch, mfc_context->vp8_state.frame_header_bin_mv_upate_pos); /*frame header bin buffer mv update pointer */

    /*DW29*/
    OUT_BCS_BATCH(batch, 0);

    ADVANCE_BCS_BATCH(batch);
}
3937
/*
 * Emit the MFX_VP8_PIC_STATE command (38 DWs): frame geometry, picture
 * flags, loop-filter levels, quantizer indices/deltas and the complete
 * probability state (segment tree, skip/intra/last/golden, Y/UV modes
 * and MV contexts) for the current frame.
 */
static void
gen8_mfc_vp8_pic_state(VADriverContextP ctx,
                       struct encode_state *encode_state,
                       struct intel_encoder_context *encoder_context)
{
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
    VAEncSequenceParameterBufferVP8 *seq_param = (VAEncSequenceParameterBufferVP8 *)encode_state->seq_param_ext->buffer;
    VAEncPictureParameterBufferVP8 *pic_param = (VAEncPictureParameterBufferVP8 *)encode_state->pic_param_ext->buffer;
    VAQMatrixBufferVP8 *q_matrix = (VAQMatrixBufferVP8 *)encode_state->q_matrix->buffer;
    int i, j, log2num;

    log2num = pic_param->pic_flags.bits.num_token_partitions;

    /*update mode and token probs*/
    vp8_enc_state_update(mfc_context, q_matrix);

    BEGIN_BCS_BATCH(batch, 38);
    OUT_BCS_BATCH(batch, MFX_VP8_PIC_STATE | (38 - 2));
    /* DW1: frame size in MBs minus one, height:width. */
    OUT_BCS_BATCH(batch,
                  (ALIGN(seq_param->frame_height, 16) / 16 - 1) << 16 |
                  (ALIGN(seq_param->frame_width, 16) / 16 - 1) << 0);
 
    OUT_BCS_BATCH(batch,
                  log2num << 24 |
                  pic_param->sharpness_level << 16 |
                  pic_param->pic_flags.bits.sign_bias_alternate << 13 |
                  pic_param->pic_flags.bits.sign_bias_golden << 12 |
                  pic_param->pic_flags.bits.loop_filter_adj_enable << 11 |
                  pic_param->pic_flags.bits.mb_no_coeff_skip << 10 |
                  pic_param->pic_flags.bits.update_mb_segmentation_map << 9 |
                  pic_param->pic_flags.bits.segmentation_enabled << 8 |
                  !pic_param->pic_flags.bits.frame_type << 5 | /* 0 indicate an intra frame in VP8 stream/spec($9.1)*/
                  (pic_param->pic_flags.bits.version / 2) << 4 |
                  (pic_param->pic_flags.bits.version == 3) << 1 | /* full pixel mode for version 3 */
                  !!pic_param->pic_flags.bits.version << 0); /* version 0: 6 tap */
 
    /* DW3: per-segment loop filter levels. */
    OUT_BCS_BATCH(batch,
                  pic_param->loop_filter_level[3] << 24 |
                  pic_param->loop_filter_level[2] << 16 |
                  pic_param->loop_filter_level[1] <<  8 |
                  pic_param->loop_filter_level[0] <<  0);

    /* DW4: per-segment luma AC quantizer indices. */
    OUT_BCS_BATCH(batch,
                  q_matrix->quantization_index[3] << 24 |
                  q_matrix->quantization_index[2] << 16 |
                  q_matrix->quantization_index[1] <<  8 |
                  q_matrix->quantization_index[0] << 0);

    /* DW5-DW6: qindex deltas in sign-magnitude form; the >>15 of the
     * value cast to unsigned short extracts the sign bit (assumes a
     * 16-bit short). */
    OUT_BCS_BATCH(batch,
                 ((unsigned short)(q_matrix->quantization_index_delta[4]) >> 15) << 28 | 
                 abs(q_matrix->quantization_index_delta[4]) << 24 |
                 ((unsigned short)(q_matrix->quantization_index_delta[3]) >> 15) << 20 | 
                 abs(q_matrix->quantization_index_delta[3]) << 16 |
                 ((unsigned short)(q_matrix->quantization_index_delta[2]) >> 15) << 12 | 
                 abs(q_matrix->quantization_index_delta[2]) << 8 |
                 ((unsigned short)(q_matrix->quantization_index_delta[1]) >> 15) << 4 | 
                 abs(q_matrix->quantization_index_delta[1]) << 0);

    OUT_BCS_BATCH(batch,
                 ((unsigned short)(q_matrix->quantization_index_delta[0]) >> 15) << 4 |
                 abs(q_matrix->quantization_index_delta[0]) << 0);
    
    /* DW7: quantizer clamp range. */
    OUT_BCS_BATCH(batch,
                 pic_param->clamp_qindex_high << 8 |
                 pic_param->clamp_qindex_low << 0);

    /* DW8-DW18: all-ones fill; presumably segment/threshold fields not
     * used by this driver — TODO confirm against the PRM. */
    for (i = 8; i < 19; i++) {
         OUT_BCS_BATCH(batch, 0xffffffff);
    }

    OUT_BCS_BATCH(batch,
                  mfc_context->vp8_state.mb_segment_tree_probs[2] << 16 |
                  mfc_context->vp8_state.mb_segment_tree_probs[1] <<  8 |
                  mfc_context->vp8_state.mb_segment_tree_probs[0] <<  0);

    OUT_BCS_BATCH(batch,
                  mfc_context->vp8_state.prob_skip_false << 24 |
                  mfc_context->vp8_state.prob_intra      << 16 |
                  mfc_context->vp8_state.prob_last       <<  8 |
                  mfc_context->vp8_state.prob_gf         <<  0);

    OUT_BCS_BATCH(batch,
                  mfc_context->vp8_state.y_mode_probs[3] << 24 |
                  mfc_context->vp8_state.y_mode_probs[2] << 16 |
                  mfc_context->vp8_state.y_mode_probs[1] <<  8 |
                  mfc_context->vp8_state.y_mode_probs[0] <<  0);

    OUT_BCS_BATCH(batch,
                  mfc_context->vp8_state.uv_mode_probs[2] << 16 |
                  mfc_context->vp8_state.uv_mode_probs[1] <<  8 |
                  mfc_context->vp8_state.uv_mode_probs[0] <<  0);
    
    /* MV update value, DW23-DW32 */
    for (i = 0; i < 2; i++) {
        for (j = 0; j < 20; j += 4) {
            OUT_BCS_BATCH(batch,
                          (j + 3 == 19 ? 0 : mfc_context->vp8_state.mv_probs[i][j + 3]) << 24 |
                          mfc_context->vp8_state.mv_probs[i][j + 2] << 16 |
                          mfc_context->vp8_state.mv_probs[i][j + 1] <<  8 |
                          mfc_context->vp8_state.mv_probs[i][j + 0] <<  0);
        }
    }

    /* DW33-DW34: loop-filter deltas, 7-bit sign-magnitude fields. */
    OUT_BCS_BATCH(batch,
                  (pic_param->ref_lf_delta[3] & 0x7f) << 24 |
                  (pic_param->ref_lf_delta[2] & 0x7f) << 16 |
                  (pic_param->ref_lf_delta[1] & 0x7f) <<  8 |
                  (pic_param->ref_lf_delta[0] & 0x7f) <<  0);

    OUT_BCS_BATCH(batch,
                  (pic_param->mode_lf_delta[3] & 0x7f) << 24 |
                  (pic_param->mode_lf_delta[2] & 0x7f) << 16 |
                  (pic_param->mode_lf_delta[1] & 0x7f) <<  8 |
                  (pic_param->mode_lf_delta[0] & 0x7f) <<  0);

    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);

    ADVANCE_BCS_BATCH(batch);
}
4060
/*
 * Emit a 3-DW buffer address: relocation (or 0 when the bo is absent),
 * the second address DW, then the MOCS/attribute DW.  Wrapped in
 * do { } while (0) so the multi-statement macro expands safely inside
 * unbraced if/else bodies (the original if/else-plus-trailing-statements
 * form would mis-parse there).
 */
#define OUT_VP8_BUFFER(bo, offset)                                      \
    do {                                                                \
        if (bo)                                                         \
            OUT_BCS_RELOC(batch,                                        \
                          bo,                                           \
                          I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, \
                          offset);                                      \
        else                                                            \
            OUT_BCS_BATCH(batch, 0);                                    \
        OUT_BCS_BATCH(batch, 0);                                        \
        OUT_BCS_BATCH(batch, i965->intel.mocs_state);                   \
    } while (0)
4071
/*
 * Emit MFX_VP8_BSP_BUF_BASE_ADDR_STATE (32 DWs): base addresses of the
 * frame-header, intermediate (with its 8 partition offsets and max
 * size), final-frame, stream-out, coefficient-probability stream-in,
 * token-statistics and MPC row-store buffers.  Each OUT_VP8_BUFFER
 * expands to a 3-DW address/MOCS triplet.
 */
static void 
gen8_mfc_vp8_bsp_buf_base_addr_state(VADriverContextP ctx, 
                                     struct encode_state *encode_state,
                                     struct intel_encoder_context *encoder_context)
{
    struct i965_driver_data *i965 = i965_driver_data(ctx);
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;

    BEGIN_BCS_BATCH(batch, 32);
    OUT_BCS_BATCH(batch, MFX_VP8_BSP_BUF_BASE_ADDR_STATE | (32 - 2));

    OUT_VP8_BUFFER(mfc_context->vp8_state.frame_header_bo, 0);

    OUT_VP8_BUFFER(mfc_context->vp8_state.intermediate_bo, 0);
    OUT_BCS_BATCH(batch, mfc_context->vp8_state.intermediate_partition_offset[0]);
    OUT_BCS_BATCH(batch, mfc_context->vp8_state.intermediate_partition_offset[1]);
    OUT_BCS_BATCH(batch, mfc_context->vp8_state.intermediate_partition_offset[2]);
    OUT_BCS_BATCH(batch, mfc_context->vp8_state.intermediate_partition_offset[3]);
    OUT_BCS_BATCH(batch, mfc_context->vp8_state.intermediate_partition_offset[4]);
    OUT_BCS_BATCH(batch, mfc_context->vp8_state.intermediate_partition_offset[5]);
    OUT_BCS_BATCH(batch, mfc_context->vp8_state.intermediate_partition_offset[6]);
    OUT_BCS_BATCH(batch, mfc_context->vp8_state.intermediate_partition_offset[7]);
    OUT_BCS_BATCH(batch, mfc_context->vp8_state.intermediate_buffer_max_size);

    /* Final frame lands in the coded buffer, after the driver header. */
    OUT_VP8_BUFFER(mfc_context->vp8_state.final_frame_bo, I965_CODEDBUFFER_HEADER_SIZE);
    OUT_BCS_BATCH(batch, 0);

    OUT_VP8_BUFFER(mfc_context->vp8_state.stream_out_bo, 0);
    OUT_VP8_BUFFER(mfc_context->vp8_state.coeff_probs_stream_in_bo, 0);
    OUT_VP8_BUFFER(mfc_context->vp8_state.token_statistics_bo, 0);
    OUT_VP8_BUFFER(mfc_context->vp8_state.mpc_row_store_bo, 0);

    ADVANCE_BCS_BATCH(batch);
}
4107
/*
 * Program the full per-picture MFX state for VP8 encoding, in the order
 * the hardware expects: pipe mode, surface, indirect-object and buffer
 * base addresses, then the VP8-specific BSP buffers, picture state and
 * encoder configuration.
 */
static void
gen8_mfc_vp8_pipeline_picture_programing(VADriverContextP ctx,
                                           struct encode_state *encode_state,
                                           struct intel_encoder_context *encoder_context)
{
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;

    mfc_context->pipe_mode_select(ctx, MFX_FORMAT_VP8, encoder_context);
    mfc_context->set_surface_state(ctx, encoder_context);
    mfc_context->ind_obj_base_addr_state(ctx, encoder_context);
    gen8_mfc_pipe_buf_addr_state(ctx, encoder_context);
    gen8_mfc_bsp_buf_base_addr_state(ctx, encoder_context);
    gen8_mfc_vp8_bsp_buf_base_addr_state(ctx, encode_state, encoder_context);
    gen8_mfc_vp8_pic_state(ctx, encode_state,encoder_context);
    gen8_mfc_vp8_encoder_cfg(ctx, encode_state, encoder_context);
}
4124
/* Translation table: VME intra 16x16 prediction mode index -> PAK
 * prediction-mode encoding used by the VP8 hardware. */
static const unsigned char
vp8_intra_mb_mode_map[VME_MB_INTRA_MODE_COUNT] = {
    PAK_V_PRED,
    PAK_H_PRED,
    PAK_DC_PRED,
    PAK_TM_PRED
};
4132
/* Translation table: VME intra 4x4 sub-block prediction mode index ->
 * PAK sub-block prediction-mode encoding. */
static const unsigned char
vp8_intra_block_mode_map[VME_B_INTRA_MODE_COUNT] = {
    PAK_B_VE_PRED,
    PAK_B_HE_PRED,
    PAK_B_DC_PRED,
    PAK_B_LD_PRED,
    PAK_B_RD_PRED,
    PAK_B_VR_PRED,
    PAK_B_HD_PRED,
    PAK_B_VL_PRED,
    PAK_B_HU_PRED
};
4145
4146 static int inline gen8_mfc_vp8_intra_mb_mode_map(unsigned int vme_pred_mode, int is_luma_4x4)
4147 {
4148     unsigned int i, pak_pred_mode = 0;
4149     unsigned int vme_sub_blocks_pred_mode[8], pak_sub_blocks_pred_mode[8]; /* 8 blocks's intra mode */
4150
4151     if (!is_luma_4x4) {
4152         pak_pred_mode = vp8_intra_mb_mode_map[vme_pred_mode & 0x3];
4153     } else {
4154         for (i = 0; i < 8; i++) { 
4155             vme_sub_blocks_pred_mode[i] = ((vme_pred_mode >> (4 * i)) & 0xf);
4156             assert(vme_sub_blocks_pred_mode[i] < VME_B_INTRA_MODE_COUNT);
4157             pak_sub_blocks_pred_mode[i] = vp8_intra_block_mode_map[vme_sub_blocks_pred_mode[i]];
4158             pak_pred_mode |= (pak_sub_blocks_pred_mode[i] << (4 * i));
4159         }
4160     }
4161
4162     return pak_pred_mode;
4163 }
/*
 * Emit one MFX_VP8_PAK_OBJECT command for an intra-coded macroblock at
 * MB coordinates (x, y).
 *
 * msg points at the per-MB VME output record: msg[0] bits 5:4 carry the
 * intra MB type (0 = intra_16x16, 2 = intra_4x4 -- the only two types VP8
 * supports), msg[1]/msg[2] the packed luma prediction modes, and msg[3]
 * bits 1:0 the chroma prediction mode. The VME mode codes are remapped to
 * the PAK encoding before being written into the command.
 *
 * If batch is NULL the commands go into the encoder's default batch.
 */
static void
gen8_mfc_vp8_pak_object_intra(VADriverContextP ctx, 
                              struct intel_encoder_context *encoder_context,
                              unsigned int *msg,
                              int x, int y,
                              struct intel_batchbuffer *batch)
{
    unsigned int vme_intra_mb_mode, vme_chroma_pred_mode;
    unsigned int pak_intra_mb_mode, pak_chroma_pred_mode;
    unsigned int vme_luma_pred_mode[2], pak_luma_pred_mode[2];

    if (batch == NULL)
        batch = encoder_context->base.batch;

    vme_intra_mb_mode = ((msg[0] & 0x30) >> 4);
    assert((vme_intra_mb_mode == 0) || (vme_intra_mb_mode == 2)); //vp8 only support intra_16x16 and intra_4x4
    pak_intra_mb_mode = (vme_intra_mb_mode >> 1);  /* 0 = intra_16x16, 1 = intra_4x4 */

    vme_luma_pred_mode[0] = msg[1];
    vme_luma_pred_mode[1] = msg[2];
    vme_chroma_pred_mode = msg[3] & 0x3;

    /* Chroma is always a whole-block mode, hence is_luma_4x4 == 0 */
    pak_luma_pred_mode[0] = gen8_mfc_vp8_intra_mb_mode_map(vme_luma_pred_mode[0], pak_intra_mb_mode);
    pak_luma_pred_mode[1] = gen8_mfc_vp8_intra_mb_mode_map(vme_luma_pred_mode[1], pak_intra_mb_mode);
    pak_chroma_pred_mode = gen8_mfc_vp8_intra_mb_mode_map(vme_chroma_pred_mode, 0);

    BEGIN_BCS_BATCH(batch, 7);

    OUT_BCS_BATCH(batch, MFX_VP8_PAK_OBJECT | (7 - 2));
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch,
                  (0 << 20) |                    /* mv format: intra mb */
                  (0 << 18) |                    /* Segment ID */
                  (0 << 17) |                    /* disable coeff clamp */
                  (1 << 13) |                    /* intra mb flag */
                  (0 << 11) |                    /* refer picture select: last frame */
                  (pak_intra_mb_mode << 8) |     /* mb type */
                  (pak_chroma_pred_mode << 4) |  /* mb uv mode */
                  (0 << 2) |                     /* skip mb flag: disable */
                  0);

    OUT_BCS_BATCH(batch, (y << 16) | x);
    OUT_BCS_BATCH(batch, pak_luma_pred_mode[0]);
    OUT_BCS_BATCH(batch, pak_luma_pred_mode[1]);

    ADVANCE_BCS_BATCH(batch);
}
4212
/*
 * Emit one MFX_VP8_PAK_OBJECT command for an inter-coded macroblock at
 * MB coordinates (x, y). Only the inter_16x16 partition type is
 * supported; the MV data is fetched indirectly from the VME output
 * buffer at byte position `offset'.
 *
 * Note: this rewrites the start of the VME record in place so the
 * hardware's indirect fetch sees 16 identical, VP8-scaled MVs at a
 * 64-byte-aligned address.
 *
 * If batch is NULL the commands go into the encoder's default batch.
 */
static void
gen8_mfc_vp8_pak_object_inter(VADriverContextP ctx, 
                              struct intel_encoder_context *encoder_context,
                              unsigned int *msg,
                              int offset,
                              int x, int y,
                              struct intel_batchbuffer *batch)
{
    int i;

    if (batch == NULL)
        batch = encoder_context->base.batch;

    /* only support inter_16x16 now */
    assert((msg[AVC_INTER_MSG_OFFSET] & INTER_MODE_MASK) == INTER_16X16);
    /* for inter_16x16, all 16 MVs should be the same,
     * and move the mv to the vme mb start address to make sure the offset is 64-byte aligned;
     * per the vp8 spec, all vp8 luma motion vectors are stored doubled
     */
    msg[0] = (((msg[AVC_INTER_MV_OFFSET/4] & 0xffff0000) << 1) | ((msg[AVC_INTER_MV_OFFSET/4] << 1) & 0xffff));

    /* replicate the (doubled) MV across all 16 sub-block slots */
    for (i = 1; i < 16; i++) {
        msg[i] = msg[0];
    }
    
    BEGIN_BCS_BATCH(batch, 7);

    OUT_BCS_BATCH(batch, MFX_VP8_PAK_OBJECT | (7 - 2));
    OUT_BCS_BATCH(batch,
                  (0 << 29) |           /* enable inline mv data: disable */
                  64);                  /* indirect MV data length in bytes */
    OUT_BCS_BATCH(batch,
                  offset);              /* indirect MV data start address */
    OUT_BCS_BATCH(batch,
                  (4 << 20) |           /* mv format: inter */
                  (0 << 18) |           /* Segment ID */
                  (0 << 17) |           /* coeff clamp: disable */
                  (0 << 13) |           /* intra mb flag: inter mb */
                  (0 << 11) |           /* refer picture select: last frame */
                  (0 << 8) |            /* mb type: 16x16 */
                  (0 << 4) |            /* mb uv mode: dc_pred */
                  (0 << 2) |            /* skip mb flag: disable */
                  0);

    OUT_BCS_BATCH(batch, (y << 16) | x);

    /*new mv*/
    OUT_BCS_BATCH(batch, 0x8);
    OUT_BCS_BATCH(batch, 0x8);

    ADVANCE_BCS_BATCH(batch);
}
4265
/*
 * Walk every macroblock of the frame and emit one MFX_VP8_PAK_OBJECT per
 * MB into slice_batch, choosing intra vs. inter coding from the VME
 * output: key frames are all-intra; on inter frames the mode with the
 * lower RDO cost wins.
 */
static void
gen8_mfc_vp8_pak_pipeline(VADriverContextP ctx,
                          struct encode_state *encode_state,
                          struct intel_encoder_context *encoder_context,
                          struct intel_batchbuffer *slice_batch)
{
    struct gen6_vme_context *vme_context = encoder_context->vme_context;
    VAEncSequenceParameterBufferVP8 *seq_param = (VAEncSequenceParameterBufferVP8 *)encode_state->seq_param_ext->buffer;
    VAEncPictureParameterBufferVP8 *pic_param = (VAEncPictureParameterBufferVP8 *)encode_state->pic_param_ext->buffer;
    int width_in_mbs = ALIGN(seq_param->frame_width, 16) / 16;
    int height_in_mbs = ALIGN(seq_param->frame_height, 16) / 16;
    unsigned int *msg = NULL;
    unsigned char *msg_ptr = NULL;
    unsigned int i, offset, is_intra_frame;

    is_intra_frame = !pic_param->pic_flags.bits.frame_type;  /* frame_type == 0 means key frame */

    dri_bo_map(vme_context->vme_output.bo , 1);
    msg_ptr = (unsigned char *)vme_context->vme_output.bo->virtual;

    /* One fixed-size VME output record per MB, in raster order */
    for( i = 0; i < width_in_mbs * height_in_mbs; i++) {
        int h_pos = i % width_in_mbs;
        int v_pos = i / width_in_mbs;
        msg = (unsigned int *) (msg_ptr + i * vme_context->vme_output.size_block);
        
        if (is_intra_frame) {
            gen8_mfc_vp8_pak_object_intra(ctx,
                    encoder_context,
                    msg,
                    h_pos, v_pos,
                    slice_batch);
        } else {
            int inter_rdo, intra_rdo;
            inter_rdo = msg[AVC_INTER_RDO_OFFSET] & AVC_RDO_MASK;
            intra_rdo = msg[AVC_INTRA_RDO_OFFSET] & AVC_RDO_MASK;

            /* pick the cheaper mode by rate-distortion cost */
            if (intra_rdo < inter_rdo) {
                gen8_mfc_vp8_pak_object_intra(ctx,
                        encoder_context,
                        msg,
                        h_pos, v_pos,
                        slice_batch);
            } else {
                /* inter path fetches the MVs indirectly from this MB's record */
                offset = i * vme_context->vme_output.size_block;
                gen8_mfc_vp8_pak_object_inter(ctx,
                        encoder_context,
                        msg,
                        offset,
                        h_pos, v_pos,
                        slice_batch);
            }
        }
    }

    dri_bo_unmap(vme_context->vme_output.bo);
}
4322
4323 /*
4324  * A batch buffer for vp8 pak object commands
4325  */
4326 static dri_bo *
4327 gen8_mfc_vp8_software_batchbuffer(VADriverContextP ctx,
4328                                           struct encode_state *encode_state,
4329                                           struct intel_encoder_context *encoder_context)
4330 {
4331     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
4332     struct intel_batchbuffer *batch;
4333     dri_bo *batch_bo;
4334
4335     batch = mfc_context->aux_batchbuffer;
4336     batch_bo = batch->buffer;
4337
4338     gen8_mfc_vp8_pak_pipeline(ctx, encode_state, encoder_context, batch);
4339
4340     intel_batchbuffer_align(batch, 8);
4341
4342     BEGIN_BCS_BATCH(batch, 2);
4343     OUT_BCS_BATCH(batch, 0);
4344     OUT_BCS_BATCH(batch, MI_BATCH_BUFFER_END);
4345     ADVANCE_BCS_BATCH(batch);
4346
4347     dri_bo_reference(batch_bo);
4348     intel_batchbuffer_free(batch);
4349     mfc_context->aux_batchbuffer = NULL;
4350
4351     return batch_bo;
4352 }
4353
/*
 * Program the full VP8 PAK sequence on the BCS ring: first build the
 * per-MB command buffer in software, then emit the picture-level state
 * followed by an MI_BATCH_BUFFER_START that chains to the per-MB batch.
 */
static void
gen8_mfc_vp8_pipeline_programing(VADriverContextP ctx,
                                   struct encode_state *encode_state,
                                   struct intel_encoder_context *encoder_context)
{
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    dri_bo *slice_batch_bo;

    slice_batch_bo = gen8_mfc_vp8_software_batchbuffer(ctx, encode_state, encoder_context);

    // begin programing
    intel_batchbuffer_start_atomic_bcs(batch, 0x4000);
    intel_batchbuffer_emit_mi_flush(batch);

    // picture level programing
    gen8_mfc_vp8_pipeline_picture_programing(ctx, encode_state, encoder_context);

    /* chain to the software-built per-MB batch
     * (bit 8 presumably selects 2nd-level batch mode -- confirm vs. BSpec) */
    BEGIN_BCS_BATCH(batch, 4);
    OUT_BCS_BATCH(batch, MI_BATCH_BUFFER_START | (1 << 8) | (1 << 0));
    OUT_BCS_RELOC(batch,
                  slice_batch_bo,
                  I915_GEM_DOMAIN_COMMAND, 0,
                  0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    ADVANCE_BCS_BATCH(batch);

    // end programing
    intel_batchbuffer_end_atomic(batch);

    dri_bo_unreference(slice_batch_bo);
}
4386
4387 static int gen8_mfc_calc_vp8_coded_buffer_size(VADriverContextP ctx,
4388                           struct encode_state *encode_state,
4389                           struct intel_encoder_context *encoder_context)
4390 {
4391     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
4392     VAEncPictureParameterBufferVP8 *pic_param = (VAEncPictureParameterBufferVP8 *)encode_state->pic_param_ext->buffer;
4393     unsigned char is_intra_frame = !pic_param->pic_flags.bits.frame_type;
4394     unsigned int *vp8_encoding_status, i, first_partition_bytes, token_partition_bytes, vp8_coded_bytes;
4395     
4396     int partition_num = 1 << pic_param->pic_flags.bits.num_token_partitions;
4397
4398     first_partition_bytes = token_partition_bytes = vp8_coded_bytes = 0;
4399
4400     dri_bo_map(mfc_context->vp8_state.token_statistics_bo, 0);
4401
4402     vp8_encoding_status = (unsigned int *)mfc_context->vp8_state.token_statistics_bo->virtual;
4403     first_partition_bytes = (vp8_encoding_status[0] + 7) / 8;
4404
4405     for (i = 1; i <= partition_num; i++) 
4406         token_partition_bytes += (vp8_encoding_status[i] + 7) / 8;
4407
4408     /*coded_bytes includes P0~P8 partitions bytes + uncompresse date bytes + partion_size bytes in bitstream + 3 extra bytes */
4409     /*it seems the last partition size in vp8 status buffer is smaller than reality. so add 3 extra bytes */
4410     vp8_coded_bytes = first_partition_bytes + token_partition_bytes + (3 + 7 * !!is_intra_frame) + (partition_num - 1) * 3 + 3;
4411
4412     dri_bo_unmap(mfc_context->vp8_state.token_statistics_bo);
4413
4414     dri_bo_map(mfc_context->vp8_state.final_frame_bo, 0);
4415     struct i965_coded_buffer_segment *coded_buffer_segment = (struct i965_coded_buffer_segment *)(mfc_context->vp8_state.final_frame_bo->virtual);
4416     coded_buffer_segment->base.size = vp8_coded_bytes;
4417     dri_bo_unmap(mfc_context->vp8_state.final_frame_bo);
4418
4419     return vp8_coded_bytes;
4420 }
4421
/*
 * Top-level VP8 encode entry point: initialize per-frame state, program
 * the BCS pipeline, run the hardware, then compute the coded size and
 * apply CBR rate control.
 *
 * Note: HRD violations never fail the call. An unrepairable overflow or
 * underflow is logged to stderr once per context and the function still
 * returns VA_STATUS_SUCCESS.
 */
static VAStatus
gen8_mfc_vp8_encode_picture(VADriverContextP ctx,
                              struct encode_state *encode_state,
                              struct intel_encoder_context *encoder_context)
{
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
    unsigned int rate_control_mode = encoder_context->rate_control_mode;
    int current_frame_bits_size;
    int sts;

    gen8_mfc_vp8_init(ctx, encode_state, encoder_context);
    intel_mfc_vp8_prepare(ctx, encode_state, encoder_context);
    /*Programing bcs pipeline*/
    gen8_mfc_vp8_pipeline_programing(ctx, encode_state, encoder_context);
    gen8_mfc_run(ctx, encode_state, encoder_context);
    current_frame_bits_size = 8 * gen8_mfc_calc_vp8_coded_buffer_size(ctx, encode_state, encoder_context);

    if (rate_control_mode == VA_RC_CBR /*|| rate_control_mode == VA_RC_VBR*/) {
        sts = gen8_mfc_vp8_brc_postpack(encode_state, encoder_context, current_frame_bits_size);
        if (sts == BRC_NO_HRD_VIOLATION) {
            gen8_mfc_vp8_hrd_context_update(encode_state, mfc_context);
        }
        else if (sts == BRC_OVERFLOW_WITH_MIN_QP || sts == BRC_UNDERFLOW_WITH_MAX_QP) {
            /* QP is already at its limit; report once, then give up quietly */
            if (!mfc_context->hrd.violation_noted) {
                fprintf(stderr, "Unrepairable %s!\n", (sts == BRC_OVERFLOW_WITH_MIN_QP)? "overflow": "underflow");
                mfc_context->hrd.violation_noted = 1;
            }
            return VA_STATUS_SUCCESS;
        }
    }

    return VA_STATUS_SUCCESS;
}
4455
4456 static void
4457 gen8_mfc_context_destroy(void *context)
4458 {
4459     struct gen6_mfc_context *mfc_context = context;
4460     int i;
4461
4462     dri_bo_unreference(mfc_context->post_deblocking_output.bo);
4463     mfc_context->post_deblocking_output.bo = NULL;
4464
4465     dri_bo_unreference(mfc_context->pre_deblocking_output.bo);
4466     mfc_context->pre_deblocking_output.bo = NULL;
4467
4468     dri_bo_unreference(mfc_context->uncompressed_picture_source.bo);
4469     mfc_context->uncompressed_picture_source.bo = NULL;
4470
4471     dri_bo_unreference(mfc_context->mfc_indirect_pak_bse_object.bo); 
4472     mfc_context->mfc_indirect_pak_bse_object.bo = NULL;
4473
4474     for (i = 0; i < NUM_MFC_DMV_BUFFERS; i++){
4475         dri_bo_unreference(mfc_context->direct_mv_buffers[i].bo);
4476         mfc_context->direct_mv_buffers[i].bo = NULL;
4477     }
4478
4479     dri_bo_unreference(mfc_context->intra_row_store_scratch_buffer.bo);
4480     mfc_context->intra_row_store_scratch_buffer.bo = NULL;
4481
4482     dri_bo_unreference(mfc_context->macroblock_status_buffer.bo);
4483     mfc_context->macroblock_status_buffer.bo = NULL;
4484
4485     dri_bo_unreference(mfc_context->deblocking_filter_row_store_scratch_buffer.bo);
4486     mfc_context->deblocking_filter_row_store_scratch_buffer.bo = NULL;
4487
4488     dri_bo_unreference(mfc_context->bsd_mpc_row_store_scratch_buffer.bo);
4489     mfc_context->bsd_mpc_row_store_scratch_buffer.bo = NULL;
4490
4491
4492     for (i = 0; i < MAX_MFC_REFERENCE_SURFACES; i++){
4493         dri_bo_unreference(mfc_context->reference_surfaces[i].bo);
4494         mfc_context->reference_surfaces[i].bo = NULL;  
4495     }
4496
4497     gen8_gpe_context_destroy(&mfc_context->gpe_context);
4498
4499     dri_bo_unreference(mfc_context->mfc_batchbuffer_surface.bo);
4500     mfc_context->mfc_batchbuffer_surface.bo = NULL;
4501
4502     dri_bo_unreference(mfc_context->aux_batchbuffer_surface.bo);
4503     mfc_context->aux_batchbuffer_surface.bo = NULL;
4504
4505     if (mfc_context->aux_batchbuffer)
4506         intel_batchbuffer_free(mfc_context->aux_batchbuffer);
4507
4508     mfc_context->aux_batchbuffer = NULL;
4509
4510     dri_bo_unreference(mfc_context->vp8_state.coeff_probs_stream_in_bo);
4511     mfc_context->vp8_state.coeff_probs_stream_in_bo = NULL;
4512
4513     dri_bo_unreference(mfc_context->vp8_state.final_frame_bo);
4514     mfc_context->vp8_state.final_frame_bo = NULL;
4515
4516     dri_bo_unreference(mfc_context->vp8_state.frame_header_bo);
4517     mfc_context->vp8_state.frame_header_bo = NULL;
4518
4519     dri_bo_unreference(mfc_context->vp8_state.intermediate_bo);
4520     mfc_context->vp8_state.intermediate_bo = NULL;
4521
4522     dri_bo_unreference(mfc_context->vp8_state.mpc_row_store_bo);
4523     mfc_context->vp8_state.mpc_row_store_bo = NULL;
4524
4525     dri_bo_unreference(mfc_context->vp8_state.stream_out_bo);
4526     mfc_context->vp8_state.stream_out_bo = NULL;
4527
4528     dri_bo_unreference(mfc_context->vp8_state.token_statistics_bo);
4529     mfc_context->vp8_state.token_statistics_bo = NULL;
4530
4531     free(mfc_context);
4532 }
4533
/*
 * Dispatch an encode request to the codec-specific PAK implementation
 * selected by the VA profile. Returns VA_STATUS_ERROR_UNSUPPORTED_PROFILE
 * for profiles this backend cannot encode.
 */
static VAStatus gen8_mfc_pipeline(VADriverContextP ctx,
                                  VAProfile profile,
                                  struct encode_state *encode_state,
                                  struct intel_encoder_context *encoder_context)
{
    VAStatus vaStatus;

    switch (profile) {
    case VAProfileH264ConstrainedBaseline:
    case VAProfileH264Main:
    case VAProfileH264High:
    case VAProfileH264MultiviewHigh:
    case VAProfileH264StereoHigh:
        vaStatus = gen8_mfc_avc_encode_picture(ctx, encode_state, encoder_context);
        break;

        /* FIXME: add for other profile */
    case VAProfileMPEG2Simple:
    case VAProfileMPEG2Main:
        vaStatus = gen8_mfc_mpeg2_encode_picture(ctx, encode_state, encoder_context);
        break;

    case VAProfileJPEGBaseline:
        jpeg_init_default_qmatrix(ctx, encoder_context);
        vaStatus = gen8_mfc_jpeg_encode_picture(ctx, encode_state, encoder_context);
        break;
 
    case VAProfileVP8Version0_3:
        vaStatus = gen8_mfc_vp8_encode_picture(ctx, encode_state, encoder_context);
        break;
 
    default:
        vaStatus = VA_STATUS_ERROR_UNSUPPORTED_PROFILE;
        break;
    }

    return vaStatus;
}
4572
4573 Bool gen8_mfc_context_init(VADriverContextP ctx, struct intel_encoder_context *encoder_context)
4574 {
4575     struct i965_driver_data *i965 = i965_driver_data(ctx);
4576     struct gen6_mfc_context *mfc_context = calloc(1, sizeof(struct gen6_mfc_context));
4577
4578     assert(mfc_context);
4579     mfc_context->gpe_context.surface_state_binding_table.length = (SURFACE_STATE_PADDED_SIZE + sizeof(unsigned int)) * MAX_MEDIA_SURFACES_GEN6;
4580
4581     mfc_context->gpe_context.idrt.entry_size = ALIGN(sizeof(struct gen8_interface_descriptor_data), 64);
4582     mfc_context->gpe_context.idrt.max_entries = MAX_INTERFACE_DESC_GEN6;
4583     mfc_context->gpe_context.curbe.length = 32 * 4;
4584     mfc_context->gpe_context.sampler.entry_size = 0;
4585     mfc_context->gpe_context.sampler.max_entries = 0;
4586
4587     if (i965->intel.eu_total > 0)
4588         mfc_context->gpe_context.vfe_state.max_num_threads = 6 * i965->intel.eu_total;
4589     else
4590         mfc_context->gpe_context.vfe_state.max_num_threads = 60 - 1;
4591
4592     mfc_context->gpe_context.vfe_state.num_urb_entries = 16;
4593     mfc_context->gpe_context.vfe_state.gpgpu_mode = 0;
4594     mfc_context->gpe_context.vfe_state.urb_entry_size = 59 - 1;
4595     mfc_context->gpe_context.vfe_state.curbe_allocation_size = 37 - 1;
4596
4597     if (IS_GEN9(i965->intel.device_info)) {
4598         gen8_gpe_load_kernels(ctx,
4599                           &mfc_context->gpe_context,
4600                           gen9_mfc_kernels,
4601                           1);
4602     } else {
4603         gen8_gpe_load_kernels(ctx,
4604                           &mfc_context->gpe_context,
4605                           gen8_mfc_kernels,
4606                           1);
4607     }
4608
4609     mfc_context->pipe_mode_select = gen8_mfc_pipe_mode_select;
4610     mfc_context->set_surface_state = gen8_mfc_surface_state;
4611     mfc_context->ind_obj_base_addr_state = gen8_mfc_ind_obj_base_addr_state;
4612     mfc_context->avc_img_state = gen8_mfc_avc_img_state;
4613     mfc_context->avc_qm_state = gen8_mfc_avc_qm_state;
4614     mfc_context->avc_fqm_state = gen8_mfc_avc_fqm_state;
4615     mfc_context->insert_object = gen8_mfc_avc_insert_object;
4616     mfc_context->buffer_suface_setup = gen8_gpe_buffer_suface_setup;
4617
4618     encoder_context->mfc_context = mfc_context;
4619     encoder_context->mfc_context_destroy = gen8_mfc_context_destroy;
4620     encoder_context->mfc_pipeline = gen8_mfc_pipeline;
4621
4622     if (encoder_context->codec == CODEC_VP8)
4623         encoder_context->mfc_brc_prepare = gen8_mfc_vp8_brc_prepare;
4624     else
4625         encoder_context->mfc_brc_prepare = intel_mfc_brc_prepare;
4626
4627     return True;
4628 }