OSDN Git Service

Encoding: H264 uses the GPU to construct the PAK obj command on Gen8+
[android-x86/hardware-intel-common-vaapi.git] / src / gen8_mfc.c
1 /*
2  * Copyright © 2012 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the
6  * "Software"), to deal in the Software without restriction, including
7  * without limitation the rights to use, copy, modify, merge, publish,
8  * distribute, sub license, and/or sell copies of the Software, and to
9  * permit persons to whom the Software is furnished to do so, subject to
10  * the following conditions:
11  *
12  * The above copyright notice and this permission notice (including the
13  * next paragraph) shall be included in all copies or substantial portions
14  * of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
17  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
19  * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
20  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
21  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
22  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23  *
24  * Authors:
25  *    Zhao Yakui <yakui.zhao@intel.com>
26  *    Xiang Haihao <haihao.xiang@intel.com>
27  *
28  */
29
30 #include <stdio.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #include <math.h>
34 #include <assert.h>
35
36 #include "intel_batchbuffer.h"
37 #include "i965_defines.h"
38 #include "i965_structs.h"
39 #include "i965_drv_video.h"
40 #include "i965_encoder.h"
41 #include "i965_encoder_utils.h"
42 #include "gen6_mfc.h"
43 #include "gen6_vme.h"
44 #include "intel_media.h"
45 #include <va/va_enc_jpeg.h>
46 #include "vp8_probs.h"
47
/* Surface-state / binding-table layout for the Gen8 media pipeline */
#define SURFACE_STATE_PADDED_SIZE               SURFACE_STATE_PADDED_SIZE_GEN8
#define SURFACE_STATE_OFFSET(index)             (SURFACE_STATE_PADDED_SIZE * index)
#define BINDING_TABLE_OFFSET(index)             (SURFACE_STATE_OFFSET(MAX_MEDIA_SURFACES_GEN6) + sizeof(unsigned int) * index)

/* When 1, build the per-MB PAK batch on the CPU instead of via the GPU kernel */
#define MFC_SOFTWARE_BATCH      0

/* Stepping check: revision >= B0 */
#define B0_STEP_REV             2
#define IS_STEPPING_BPLUS(i965) ((i965->intel.revision) >= B0_STEP_REV)
56
//Zigzag scan order of the Luma and Chroma components
//Note: Jpeg Spec ISO/IEC 10918-1, Figure A.6 shows the zigzag order differently.
//The Spec is trying to show the zigzag pattern with number positions. The below
//table will use the pattern shown by A.6 and map the position of the elements in the array
static const uint32_t zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};

//Default Luminance quantization table
//Source: Jpeg Spec ISO/IEC 10918-1, Annex K, Table K.1
static const uint8_t jpeg_luma_quant[64] = {
    16, 11, 10, 16, 24,  40,  51,  61,
    12, 12, 14, 19, 26,  58,  60,  55,
    14, 13, 16, 24, 40,  57,  69,  56,
    14, 17, 22, 29, 51,  87,  80,  62,
    18, 22, 37, 56, 68,  109, 103, 77,
    24, 35, 55, 64, 81,  104, 113, 92,
    49, 64, 78, 87, 103, 121, 120, 101,
    72, 92, 95, 98, 112, 100, 103, 99
};

//Default Chroma quantization table
//Source: Jpeg Spec ISO/IEC 10918-1, Annex K, Table K.2
static const uint8_t jpeg_chroma_quant[64] = {
    17, 18, 24, 47, 99, 99, 99, 99,
    18, 21, 26, 66, 99, 99, 99, 99,
    24, 26, 56, 99, 99, 99, 99, 99,
    47, 66, 99, 99, 99, 99, 99, 99,
    99, 99, 99, 99, 99, 99, 99, 99,
    99, 99, 99, 99, 99, 99, 99, 99,
    99, 99, 99, 99, 99, 99, 99, 99,
    99, 99, 99, 99, 99, 99, 99, 99
};

/* Map VAAPI huffman table index (0 = luma, 1 = chroma) to the MFX table id */
static const int va_to_gen7_jpeg_hufftable[2] = {
    MFX_HUFFTABLE_ID_Y,
    MFX_HUFFTABLE_ID_UV
};
103
/* Pre-compiled GPU shader that assembles the AVC PAK object batch buffer
 * on the GPU (Gen8 binary) */
static const uint32_t gen8_mfc_batchbuffer_avc[][4] = {
#include "shaders/utils/mfc_batchbuffer_hsw.g8b"
};

/* Same shader, compiled for Gen9 */
static const uint32_t gen9_mfc_batchbuffer_avc[][4] = {
#include "shaders/utils/mfc_batchbuffer_hsw.g9b"
};

/* Kernel descriptor table for the Gen8 GPE context */
static struct i965_kernel gen8_mfc_kernels[] = {
    {
        "MFC AVC INTRA BATCHBUFFER ",
        MFC_BATCHBUFFER_AVC_INTRA,
        gen8_mfc_batchbuffer_avc,
        sizeof(gen8_mfc_batchbuffer_avc),
        NULL
    },
};

/* Kernel descriptor table for the Gen9 GPE context */
static struct i965_kernel gen9_mfc_kernels[] = {
    {
        "MFC AVC INTRA BATCHBUFFER ",
        MFC_BATCHBUFFER_AVC_INTRA,
        gen9_mfc_batchbuffer_avc,
        sizeof(gen9_mfc_batchbuffer_avc),
        NULL
    },
};
131
132
/* VME output macroblock mode / sub-macroblock shape decoding helpers */
#define         INTER_MODE_MASK         0x03
#define         INTER_8X8               0x03
#define         INTER_16X8              0x01
#define         INTER_8X16              0x02
#define         SUBMB_SHAPE_MASK        0x00FF00
#define         INTER_16X16             0x00

/* MV count fields (bits 22:20) for 8 or 32 motion vectors per MB */
#define         INTER_MV8               (4 << 20)
#define         INTER_MV32              (6 << 20)
142
143
/*
 * Emit MFX_PIPE_MODE_SELECT, putting the MFX engine into encode mode for
 * the chosen standard (AVC, MPEG2, JPEG or VP8).  Pre/post deblocking
 * stream-out is enabled based on which output BO the context holds.
 * Must be the first MFX command of the picture-level programming sequence.
 */
static void
gen8_mfc_pipe_mode_select(VADriverContextP ctx,
                          int standard_select,
                          struct intel_encoder_context *encoder_context)
{
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;

    assert(standard_select == MFX_FORMAT_MPEG2 ||
           standard_select == MFX_FORMAT_AVC   ||
           standard_select == MFX_FORMAT_JPEG  ||
           standard_select == MFX_FORMAT_VP8);

    BEGIN_BCS_BATCH(batch, 5);

    OUT_BCS_BATCH(batch, MFX_PIPE_MODE_SELECT | (5 - 2));
    OUT_BCS_BATCH(batch,
                  (MFX_LONG_MODE << 17) | /* Must be long format for encoder */
                  (MFD_MODE_VLD << 15) | /* VLD mode */
                  (0 << 10) | /* Stream-Out Enable */
                  ((!!mfc_context->post_deblocking_output.bo) << 9)  | /* Post Deblocking Output */
                  ((!!mfc_context->pre_deblocking_output.bo) << 8)  | /* Pre Deblocking Output */
                  (0 << 6)  | /* frame statistics stream-out enable*/
                  (0 << 5)  | /* not in stitch mode */
                  (1 << 4)  | /* encoding mode */
                  (standard_select << 0));  /* standard select: avc or mpeg2 or jpeg*/
    OUT_BCS_BATCH(batch,
                  (0 << 7)  | /* expand NOA bus flag */
                  (0 << 6)  | /* disable slice-level clock gating */
                  (0 << 5)  | /* disable clock gating for NOA */
                  (0 << 4)  | /* terminate if AVC motion and POC table error occurs */
                  (0 << 3)  | /* terminate if AVC mbdata error occurs */
                  (0 << 2)  | /* terminate if AVC CABAC/CAVLC decode error occurs */
                  (0 << 1)  |
                  (0 << 0));
    /* DW3-4: reserved */
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);

    ADVANCE_BCS_BATCH(batch);
}
184
/*
 * Emit MFX_SURFACE_STATE describing the tiled NV12 (planar 4:2:0,
 * interleaved U/V) picture surface.  Dimensions and pitches are read from
 * mfc_context->surface_state, which the per-codec setup fills in earlier.
 */
static void
gen8_mfc_surface_state(VADriverContextP ctx, struct intel_encoder_context *encoder_context)
{
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;

    BEGIN_BCS_BATCH(batch, 6);

    OUT_BCS_BATCH(batch, MFX_SURFACE_STATE | (6 - 2));
    OUT_BCS_BATCH(batch, 0);
    /* DW2: width/height, each stored as (value - 1) */
    OUT_BCS_BATCH(batch,
                  ((mfc_context->surface_state.height - 1) << 18) |
                  ((mfc_context->surface_state.width - 1) << 4));
    OUT_BCS_BATCH(batch,
                  (MFX_SURFACE_PLANAR_420_8 << 28) | /* 420 planar YUV surface */
                  (1 << 27) | /* must be 1 for interleave U/V, hardware requirement */
                  (0 << 22) | /* surface object control state, FIXME??? */
                  ((mfc_context->surface_state.w_pitch - 1) << 3) | /* pitch */
                  (0 << 2)  | /* must be 0 for interleave U/V */
                  (1 << 1)  | /* must be tiled */
                  (I965_TILEWALK_YMAJOR << 0));  /* tile walk, TILEWALK_YMAJOR */
    OUT_BCS_BATCH(batch,
                  (0 << 16) |                                                           /* must be 0 for interleave U/V */
                  (mfc_context->surface_state.h_pitch));                /* y offset for U(cb) */
    OUT_BCS_BATCH(batch, 0);

    ADVANCE_BCS_BATCH(batch);
}
213
/*
 * Emit MFX_IND_OBJ_BASE_ADDR_STATE: base addresses and upper bounds for the
 * indirect objects the PAK engine reads (VME motion-vector output) and
 * writes (the compressed bitstream, "PAK-BSE").  JPEG has no VME stage, so
 * its MV slots are zeroed; VP8 additionally programs the MFX upper bound.
 */
static void
gen8_mfc_ind_obj_base_addr_state(VADriverContextP ctx,
                                 struct intel_encoder_context *encoder_context)
{
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
    struct gen6_vme_context *vme_context = encoder_context->vme_context;
    int vme_size;
    unsigned int bse_offset;

    BEGIN_BCS_BATCH(batch, 26);

    OUT_BCS_BATCH(batch, MFX_IND_OBJ_BASE_ADDR_STATE | (26 - 2));
    /* the DW1-3 is for the MFX indirect bistream offset */
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);

    /* the DW4-5 is the MFX upper bound */
    if (encoder_context->codec == CODEC_VP8) {
        OUT_BCS_RELOC(batch,
                mfc_context->mfc_indirect_pak_bse_object.bo,
                I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                mfc_context->mfc_indirect_pak_bse_object.end_offset);
        OUT_BCS_BATCH(batch, 0);
    } else {
        OUT_BCS_BATCH(batch, 0);
        OUT_BCS_BATCH(batch, 0);
    }

    if(encoder_context->codec != CODEC_JPEG) {
        /* Total size of the VME stream-out buffer (MV records) */
        vme_size = vme_context->vme_output.size_block * vme_context->vme_output.num_blocks;
        /* the DW6-10 is for MFX Indirect MV Object Base Address */
        OUT_BCS_RELOC(batch, vme_context->vme_output.bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
        OUT_BCS_BATCH(batch, 0);
        OUT_BCS_BATCH(batch, 0);
        /* upper bound = base + vme_size */
        OUT_BCS_RELOC(batch, vme_context->vme_output.bo, I915_GEM_DOMAIN_INSTRUCTION, 0, vme_size);
        OUT_BCS_BATCH(batch, 0);
    } else {
        /* No VME for JPEG */
        OUT_BCS_BATCH(batch, 0);
        OUT_BCS_BATCH(batch, 0);
        OUT_BCS_BATCH(batch, 0);
        OUT_BCS_BATCH(batch, 0);
        OUT_BCS_BATCH(batch, 0);
    }

    /* the DW11-15 is for MFX IT-COFF. Not used on encoder */
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);

    /* the DW16-20 is for MFX indirect DBLK. Not used on encoder */
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);

    /* the DW21-25 is for MFC Indirect PAK-BSE Object Base Address for Encoder*/
    /* JPEG writes the bitstream at a non-zero offset inside the BSE object */
    bse_offset = (encoder_context->codec == CODEC_JPEG) ? (mfc_context->mfc_indirect_pak_bse_object.offset) : 0;
    OUT_BCS_RELOC(batch,
                  mfc_context->mfc_indirect_pak_bse_object.bo,
                  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  bse_offset);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);

    /* upper bound of the PAK-BSE object */
    OUT_BCS_RELOC(batch,
                  mfc_context->mfc_indirect_pak_bse_object.bo,
                  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  mfc_context->mfc_indirect_pak_bse_object.end_offset);
    OUT_BCS_BATCH(batch, 0);

    ADVANCE_BCS_BATCH(batch);
}
292
/*
 * Emit MFX_AVC_IMG_STATE: per-picture AVC encode parameters (frame size in
 * MBs, entropy mode, transform flags, conformance limits).  Picture flags
 * come from the VAAPI H.264 picture parameter buffer; the frame is always
 * programmed as progressive, frame-MB-only here.
 */
static void
gen8_mfc_avc_img_state(VADriverContextP ctx, struct encode_state *encode_state,
                       struct intel_encoder_context *encoder_context)
{
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
    VAEncPictureParameterBufferH264 *pPicParameter = (VAEncPictureParameterBufferH264 *)encode_state->pic_param_ext->buffer;

    int width_in_mbs = (mfc_context->surface_state.width + 15) / 16;
    int height_in_mbs = (mfc_context->surface_state.height + 15) / 16;

    BEGIN_BCS_BATCH(batch, 16);

    OUT_BCS_BATCH(batch, MFX_AVC_IMG_STATE | (16 - 2));
    /*DW1. MB setting of frame */
    OUT_BCS_BATCH(batch,
                  ((width_in_mbs * height_in_mbs - 1) & 0xFFFF));
    /* DW2: frame dimensions in MBs, each minus one */
    OUT_BCS_BATCH(batch,
                  ((height_in_mbs - 1) << 16) |
                  ((width_in_mbs - 1) << 0));
    /* DW3 QP setting */
    OUT_BCS_BATCH(batch,
                  (0 << 24) |   /* Second Chroma QP Offset */
                  (0 << 16) |   /* Chroma QP Offset */
                  (0 << 14) |   /* Max-bit conformance Intra flag */
                  (0 << 13) |   /* Max Macroblock size conformance Inter flag */
                  (pPicParameter->pic_fields.bits.weighted_pred_flag << 12) |   /*Weighted_Pred_Flag */
                  (pPicParameter->pic_fields.bits.weighted_bipred_idc << 10) |  /* Weighted_BiPred_Idc */
                  (0 << 8)  |   /* FIXME: Image Structure */
                  (0 << 0) );   /* Current Decoed Image Frame Store ID, reserved in Encode mode */
    OUT_BCS_BATCH(batch,
                  (0 << 16) |   /* Mininum Frame size */
                  (0 << 15) |   /* Disable reading of Macroblock Status Buffer */
                  (0 << 14) |   /* Load BitStream Pointer only once, 1 slic 1 frame */
                  (0 << 13) |   /* CABAC 0 word insertion test enable */
                  (1 << 12) |   /* MVUnpackedEnable,compliant to DXVA */
                  (1 << 10) |   /* Chroma Format IDC, 4:2:0 */
                  (0 << 8)  |   /* FIXME: MbMvFormatFlag */
                  (pPicParameter->pic_fields.bits.entropy_coding_mode_flag << 7)  |   /*0:CAVLC encoding mode,1:CABAC*/
                  (0 << 6)  |   /* Only valid for VLD decoding mode */
                  (0 << 5)  |   /* Constrained Intra Predition Flag, from PPS */
                  (0 << 4)  |   /* Direct 8x8 inference flag */
                  (pPicParameter->pic_fields.bits.transform_8x8_mode_flag << 3)  |   /*8x8 or 4x4 IDCT Transform Mode Flag*/
                  (1 << 2)  |   /* Frame MB only flag */
                  (0 << 1)  |   /* MBAFF mode is in active */
                  (0 << 0));    /* Field picture flag */
    /* DW5 Trellis quantization */
    OUT_BCS_BATCH(batch, 0);    /* Mainly about MB rate control and debug, just ignoring */
    OUT_BCS_BATCH(batch,        /* Inter and Intra Conformance Max size limit */
                  (0xBB8 << 16) |       /* InterMbMaxSz */
                  (0xEE8) );            /* IntraMbMaxSz */
    OUT_BCS_BATCH(batch, 0);            /* Reserved */
    /* DW8. QP delta */
    OUT_BCS_BATCH(batch, 0);            /* Slice QP Delta for bitrate control */
    OUT_BCS_BATCH(batch, 0);            /* Slice QP Delta for bitrate control */
    /* DW10. Bit setting for MB */
    OUT_BCS_BATCH(batch, 0x8C000000);
    OUT_BCS_BATCH(batch, 0x00010000);
    /* DW12. */
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0x02010100);
    /* DW14. For short format */
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);

    ADVANCE_BCS_BATCH(batch);
}
360
361 static void
362 gen8_mfc_qm_state(VADriverContextP ctx,
363                   int qm_type,
364                   unsigned int *qm,
365                   int qm_length,
366                   struct intel_encoder_context *encoder_context)
367 {
368     struct intel_batchbuffer *batch = encoder_context->base.batch;
369     unsigned int qm_buffer[16];
370
371     assert(qm_length <= 16);
372     assert(sizeof(*qm) == 4);
373     memcpy(qm_buffer, qm, qm_length * 4);
374
375     BEGIN_BCS_BATCH(batch, 18);
376     OUT_BCS_BATCH(batch, MFX_QM_STATE | (18 - 2));
377     OUT_BCS_BATCH(batch, qm_type << 0);
378     intel_batchbuffer_data(batch, qm_buffer, 16 * 4);
379     ADVANCE_BCS_BATCH(batch);
380 }
381
382 static void
383 gen8_mfc_avc_qm_state(VADriverContextP ctx, struct intel_encoder_context *encoder_context)
384 {
385     unsigned int qm[16] = {
386         0x10101010, 0x10101010, 0x10101010, 0x10101010,
387         0x10101010, 0x10101010, 0x10101010, 0x10101010,
388         0x10101010, 0x10101010, 0x10101010, 0x10101010,
389         0x10101010, 0x10101010, 0x10101010, 0x10101010
390     };
391
392     gen8_mfc_qm_state(ctx, MFX_QM_AVC_4X4_INTRA_MATRIX, qm, 12, encoder_context);
393     gen8_mfc_qm_state(ctx, MFX_QM_AVC_4X4_INTER_MATRIX, qm, 12, encoder_context);
394     gen8_mfc_qm_state(ctx, MFX_QM_AVC_8x8_INTRA_MATRIX, qm, 16, encoder_context);
395     gen8_mfc_qm_state(ctx, MFX_QM_AVC_8x8_INTER_MATRIX, qm, 16, encoder_context);
396 }
397
398 static void
399 gen8_mfc_fqm_state(VADriverContextP ctx,
400                    int fqm_type,
401                    unsigned int *fqm,
402                    int fqm_length,
403                    struct intel_encoder_context *encoder_context)
404 {
405     struct intel_batchbuffer *batch = encoder_context->base.batch;
406     unsigned int fqm_buffer[32];
407
408     assert(fqm_length <= 32);
409     assert(sizeof(*fqm) == 4);
410     memcpy(fqm_buffer, fqm, fqm_length * 4);
411
412     BEGIN_BCS_BATCH(batch, 34);
413     OUT_BCS_BATCH(batch, MFX_FQM_STATE | (34 - 2));
414     OUT_BCS_BATCH(batch, fqm_type << 0);
415     intel_batchbuffer_data(batch, fqm_buffer, 32 * 4);
416     ADVANCE_BCS_BATCH(batch);
417 }
418
419 static void
420 gen8_mfc_avc_fqm_state(VADriverContextP ctx, struct intel_encoder_context *encoder_context)
421 {
422     unsigned int qm[32] = {
423         0x10001000, 0x10001000, 0x10001000, 0x10001000,
424         0x10001000, 0x10001000, 0x10001000, 0x10001000,
425         0x10001000, 0x10001000, 0x10001000, 0x10001000,
426         0x10001000, 0x10001000, 0x10001000, 0x10001000,
427         0x10001000, 0x10001000, 0x10001000, 0x10001000,
428         0x10001000, 0x10001000, 0x10001000, 0x10001000,
429         0x10001000, 0x10001000, 0x10001000, 0x10001000,
430         0x10001000, 0x10001000, 0x10001000, 0x10001000
431     };
432
433     gen8_mfc_fqm_state(ctx, MFX_QM_AVC_4X4_INTRA_MATRIX, qm, 24, encoder_context);
434     gen8_mfc_fqm_state(ctx, MFX_QM_AVC_4X4_INTER_MATRIX, qm, 24, encoder_context);
435     gen8_mfc_fqm_state(ctx, MFX_QM_AVC_8x8_INTRA_MATRIX, qm, 32, encoder_context);
436     gen8_mfc_fqm_state(ctx, MFX_QM_AVC_8x8_INTER_MATRIX, qm, 32, encoder_context);
437 }
438
439 static void
440 gen8_mfc_avc_insert_object(VADriverContextP ctx, struct intel_encoder_context *encoder_context,
441                            unsigned int *insert_data, int lenght_in_dws, int data_bits_in_last_dw,
442                            int skip_emul_byte_count, int is_last_header, int is_end_of_slice, int emulation_flag,
443                            struct intel_batchbuffer *batch)
444 {
445     if (batch == NULL)
446         batch = encoder_context->base.batch;
447
448     if (data_bits_in_last_dw == 0)
449         data_bits_in_last_dw = 32;
450
451     BEGIN_BCS_BATCH(batch, lenght_in_dws + 2);
452
453     OUT_BCS_BATCH(batch, MFX_INSERT_OBJECT | (lenght_in_dws + 2 - 2));
454     OUT_BCS_BATCH(batch,
455                   (0 << 16) |   /* always start at offset 0 */
456                   (data_bits_in_last_dw << 8) |
457                   (skip_emul_byte_count << 4) |
458                   (!!emulation_flag << 3) |
459                   ((!!is_last_header) << 2) |
460                   ((!!is_end_of_slice) << 1) |
461                   (0 << 0));    /* FIXME: ??? */
462     intel_batchbuffer_data(batch, insert_data, lenght_in_dws * 4);
463
464     ADVANCE_BCS_BATCH(batch);
465 }
466
467
/*
 * Per-frame (re)initialization of the MFC context: computes the frame size
 * in macroblocks from the codec-specific sequence/picture parameters,
 * releases all buffers held from the previous frame, and allocates the
 * row-store / status scratch buffers plus the auxiliary slice batchbuffer.
 */
static void gen8_mfc_init(VADriverContextP ctx,
                          struct encode_state *encode_state,
                          struct intel_encoder_context *encoder_context)
{
    struct i965_driver_data *i965 = i965_driver_data(ctx);
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
    dri_bo *bo;
    int i;
    int width_in_mbs = 0;
    int height_in_mbs = 0;
    int slice_batchbuffer_size;

    /* Frame size in MBs comes from different parameter buffers per codec */
    if (encoder_context->codec == CODEC_H264 ||
        encoder_context->codec == CODEC_H264_MVC) {
        VAEncSequenceParameterBufferH264 *pSequenceParameter = (VAEncSequenceParameterBufferH264 *)encode_state->seq_param_ext->buffer;
        width_in_mbs = pSequenceParameter->picture_width_in_mbs;
        height_in_mbs = pSequenceParameter->picture_height_in_mbs;
    } else if (encoder_context->codec == CODEC_MPEG2) {
        VAEncSequenceParameterBufferMPEG2 *pSequenceParameter = (VAEncSequenceParameterBufferMPEG2 *)encode_state->seq_param_ext->buffer;

        assert(encoder_context->codec == CODEC_MPEG2);

        width_in_mbs = ALIGN(pSequenceParameter->picture_width, 16) / 16;
        height_in_mbs = ALIGN(pSequenceParameter->picture_height, 16) / 16;
    } else {
        assert(encoder_context->codec == CODEC_JPEG);
        VAEncPictureParameterBufferJPEG *pic_param = (VAEncPictureParameterBufferJPEG *)encode_state->pic_param_ext->buffer;

        width_in_mbs = ALIGN(pic_param->picture_width, 16) / 16;
        height_in_mbs = ALIGN(pic_param->picture_height, 16) / 16;
    }

    /* 64 bytes per MB of PAK commands, plus slack and per-slice header/tail */
    slice_batchbuffer_size = 64 * width_in_mbs * height_in_mbs + 4096 +
                (SLICE_HEADER + SLICE_TAIL) * encode_state->num_slice_params_ext;

    /*Encode common setup for MFC*/
    dri_bo_unreference(mfc_context->post_deblocking_output.bo);
    mfc_context->post_deblocking_output.bo = NULL;

    dri_bo_unreference(mfc_context->pre_deblocking_output.bo);
    mfc_context->pre_deblocking_output.bo = NULL;

    dri_bo_unreference(mfc_context->uncompressed_picture_source.bo);
    mfc_context->uncompressed_picture_source.bo = NULL;

    dri_bo_unreference(mfc_context->mfc_indirect_pak_bse_object.bo);
    mfc_context->mfc_indirect_pak_bse_object.bo = NULL;

    for (i = 0; i < NUM_MFC_DMV_BUFFERS; i++){
        if (mfc_context->direct_mv_buffers[i].bo != NULL)
            dri_bo_unreference(mfc_context->direct_mv_buffers[i].bo);
        mfc_context->direct_mv_buffers[i].bo = NULL;
    }

    for (i = 0; i < MAX_MFC_REFERENCE_SURFACES; i++){
        if (mfc_context->reference_surfaces[i].bo != NULL)
            dri_bo_unreference(mfc_context->reference_surfaces[i].bo);
        mfc_context->reference_surfaces[i].bo = NULL;
    }

    /* Intra row-store scratch: 64 bytes per MB column */
    dri_bo_unreference(mfc_context->intra_row_store_scratch_buffer.bo);
    bo = dri_bo_alloc(i965->intel.bufmgr,
                      "Buffer",
                      width_in_mbs * 64,
                      64);
    assert(bo);
    mfc_context->intra_row_store_scratch_buffer.bo = bo;

    /* Macroblock status stream-out: 16 bytes per MB */
    dri_bo_unreference(mfc_context->macroblock_status_buffer.bo);
    bo = dri_bo_alloc(i965->intel.bufmgr,
                      "Buffer",
                      width_in_mbs * height_in_mbs * 16,
                      64);
    assert(bo);
    mfc_context->macroblock_status_buffer.bo = bo;

    dri_bo_unreference(mfc_context->deblocking_filter_row_store_scratch_buffer.bo);
    bo = dri_bo_alloc(i965->intel.bufmgr,
                      "Buffer",
                      4 * width_in_mbs * 64,  /* 4 * width_in_mbs * 64 */
                      64);
    assert(bo);
    mfc_context->deblocking_filter_row_store_scratch_buffer.bo = bo;

    dri_bo_unreference(mfc_context->bsd_mpc_row_store_scratch_buffer.bo);
    bo = dri_bo_alloc(i965->intel.bufmgr,
                      "Buffer",
                      2 * width_in_mbs * 64, /* 2 * width_in_mbs * 64 */
                      0x1000);
    assert(bo);
    mfc_context->bsd_mpc_row_store_scratch_buffer.bo = bo;

    dri_bo_unreference(mfc_context->mfc_batchbuffer_surface.bo);
    mfc_context->mfc_batchbuffer_surface.bo = NULL;

    dri_bo_unreference(mfc_context->aux_batchbuffer_surface.bo);
    mfc_context->aux_batchbuffer_surface.bo = NULL;

    if (mfc_context->aux_batchbuffer)
        intel_batchbuffer_free(mfc_context->aux_batchbuffer);

    /* Auxiliary batchbuffer used to assemble per-slice PAK commands; it is
     * also exposed as a media surface (16-byte blocks) for the GPU kernel */
    mfc_context->aux_batchbuffer = intel_batchbuffer_new(&i965->intel, I915_EXEC_BSD, slice_batchbuffer_size);
    mfc_context->aux_batchbuffer_surface.bo = mfc_context->aux_batchbuffer->buffer;
    dri_bo_reference(mfc_context->aux_batchbuffer_surface.bo);
    mfc_context->aux_batchbuffer_surface.pitch = 16;
    mfc_context->aux_batchbuffer_surface.num_blocks = mfc_context->aux_batchbuffer->size / 16;
    mfc_context->aux_batchbuffer_surface.size_block = 16;

    gen8_gpe_context_init(ctx, &mfc_context->gpe_context);
}
578
/*
 * Emit MFX_PIPE_BUF_ADDR_STATE: all the frame-level buffer addresses the
 * MFX engine needs (pre/post deblocking outputs, source picture, MB status,
 * row-store scratches, and the reference picture list).  Unused slots are
 * programmed as zero.
 */
static void
gen8_mfc_pipe_buf_addr_state(VADriverContextP ctx,
                             struct intel_encoder_context *encoder_context)
{
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
    int i;

    BEGIN_BCS_BATCH(batch, 61);

    OUT_BCS_BATCH(batch, MFX_PIPE_BUF_ADDR_STATE | (61 - 2));

    /* the DW1-3 is for pre_deblocking */
    if (mfc_context->pre_deblocking_output.bo)
        OUT_BCS_RELOC(batch, mfc_context->pre_deblocking_output.bo,
                      I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                      0);
    else
        OUT_BCS_BATCH(batch, 0);                                                                                        /* pre output addr   */

    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    /* the DW4-6 is for the post_deblocking */

    if (mfc_context->post_deblocking_output.bo)
        OUT_BCS_RELOC(batch, mfc_context->post_deblocking_output.bo,
                      I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                      0);                                                                                       /* post output addr  */
    else
        OUT_BCS_BATCH(batch, 0);

    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);

    /* the DW7-9 is for the uncompressed_picture */
    OUT_BCS_RELOC(batch, mfc_context->uncompressed_picture_source.bo,
                  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  0); /* uncompressed data */

    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);

    /* the DW10-12 is for the mb status */
    OUT_BCS_RELOC(batch, mfc_context->macroblock_status_buffer.bo,
                  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  0); /* StreamOut data*/

    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);

    /* the DW13-15 is for the intra_row_store_scratch */
    OUT_BCS_RELOC(batch, mfc_context->intra_row_store_scratch_buffer.bo,
                  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  0);

    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);

    /* the DW16-18 is for the deblocking filter */
    OUT_BCS_RELOC(batch, mfc_context->deblocking_filter_row_store_scratch_buffer.bo,
                  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  0);

    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);

    /* the DW 19-50 is for Reference pictures*/
    for (i = 0; i < ARRAY_ELEMS(mfc_context->reference_surfaces); i++) {
        if ( mfc_context->reference_surfaces[i].bo != NULL) {
            OUT_BCS_RELOC(batch, mfc_context->reference_surfaces[i].bo,
                          I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                          0);
        } else {
            OUT_BCS_BATCH(batch, 0);
        }

        OUT_BCS_BATCH(batch, 0);
    }

    /* DW51: reserved */
    OUT_BCS_BATCH(batch, 0);

    /* The DW 52-54 is for the MB status buffer */
    OUT_BCS_RELOC(batch, mfc_context->macroblock_status_buffer.bo,
                  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  0);                                                                                   /* Macroblock status buffer*/

    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);

    /* the DW 55-57 is the ILDB buffer */
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);

    /* the DW 58-60 is the second ILDB buffer */
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);

    ADVANCE_BCS_BATCH(batch);
}
680
/*
 * Emit MFX_AVC_DIRECTMODE_STATE: direct-MV buffer addresses for the
 * reference frames and the current frame, followed by the POC list.
 * The last two DMV buffer slots are reserved for the current picture.
 */
static void
gen8_mfc_avc_directmode_state(VADriverContextP ctx,
                              struct intel_encoder_context *encoder_context)
{
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;

    int i;

    BEGIN_BCS_BATCH(batch, 71);

    OUT_BCS_BATCH(batch, MFX_AVC_DIRECTMODE_STATE | (71 - 2));

    /* Reference frames and Current frames */
    /* the DW1-32 is for the direct MV for reference */
    for(i = 0; i < NUM_MFC_DMV_BUFFERS - 2; i += 2) {
        if ( mfc_context->direct_mv_buffers[i].bo != NULL) {
            OUT_BCS_RELOC(batch, mfc_context->direct_mv_buffers[i].bo,
                          I915_GEM_DOMAIN_INSTRUCTION, 0,
                          0);
            OUT_BCS_BATCH(batch, 0);
        } else {
            OUT_BCS_BATCH(batch, 0);
            OUT_BCS_BATCH(batch, 0);
        }
    }

    /* DW33: reserved */
    OUT_BCS_BATCH(batch, 0);

    /* the DW34-36 is the MV for the current frame */
    OUT_BCS_RELOC(batch, mfc_context->direct_mv_buffers[NUM_MFC_DMV_BUFFERS - 2].bo,
                  I915_GEM_DOMAIN_INSTRUCTION, 0,
                  0);

    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);

    /* POC list (original comment said "POL" — presumably the picture order
     * count list; each pair of entries shares a value) */
    for(i = 0; i < 32; i++) {
        OUT_BCS_BATCH(batch, i/2);
    }
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);

    ADVANCE_BCS_BATCH(batch);
}
727
728
/*
 * Emit MFX_BSP_BUF_BASE_ADDR_STATE: only the BSD/MPC row-store scratch
 * buffer is programmed; the MPR row-store and bitplane-read buffers are
 * unused by the encoder and left zero.
 */
static void
gen8_mfc_bsp_buf_base_addr_state(VADriverContextP ctx,
                                 struct intel_encoder_context *encoder_context)
{
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;

    BEGIN_BCS_BATCH(batch, 10);

    OUT_BCS_BATCH(batch, MFX_BSP_BUF_BASE_ADDR_STATE | (10 - 2));
    /* DW1-3: BSD/MPC row-store scratch buffer */
    OUT_BCS_RELOC(batch, mfc_context->bsd_mpc_row_store_scratch_buffer.bo,
                  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);

    /* the DW4-6 is for MPR Row Store Scratch Buffer Base Address */
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);

    /* the DW7-9 is for Bitplane Read Buffer Base Address */
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);

    ADVANCE_BCS_BATCH(batch);
}
757
758
759 static void gen8_mfc_avc_pipeline_picture_programing( VADriverContextP ctx,
760                                                       struct encode_state *encode_state,
761                                                       struct intel_encoder_context *encoder_context)
762 {
763     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
764
765     mfc_context->pipe_mode_select(ctx, MFX_FORMAT_AVC, encoder_context);
766     mfc_context->set_surface_state(ctx, encoder_context);
767     mfc_context->ind_obj_base_addr_state(ctx, encoder_context);
768     gen8_mfc_pipe_buf_addr_state(ctx, encoder_context);
769     gen8_mfc_bsp_buf_base_addr_state(ctx, encoder_context);
770     mfc_context->avc_img_state(ctx, encode_state, encoder_context);
771     mfc_context->avc_qm_state(ctx, encoder_context);
772     mfc_context->avc_fqm_state(ctx, encoder_context);
773     gen8_mfc_avc_directmode_state(ctx, encoder_context); 
774     intel_mfc_avc_ref_idx_state(ctx, encode_state, encoder_context);
775 }
776
777
778 static VAStatus gen8_mfc_run(VADriverContextP ctx, 
779                              struct encode_state *encode_state,
780                              struct intel_encoder_context *encoder_context)
781 {
782     struct intel_batchbuffer *batch = encoder_context->base.batch;
783
784     intel_batchbuffer_flush(batch);             //run the pipeline
785
786     return VA_STATUS_SUCCESS;
787 }
788
789
790 static VAStatus
791 gen8_mfc_stop(VADriverContextP ctx, 
792               struct encode_state *encode_state,
793               struct intel_encoder_context *encoder_context,
794               int *encoded_bits_size)
795 {
796     VAStatus vaStatus = VA_STATUS_ERROR_UNKNOWN;
797     VAEncPictureParameterBufferH264 *pPicParameter = (VAEncPictureParameterBufferH264 *)encode_state->pic_param_ext->buffer;
798     VACodedBufferSegment *coded_buffer_segment;
799     
800     vaStatus = i965_MapBuffer(ctx, pPicParameter->coded_buf, (void **)&coded_buffer_segment);
801     assert(vaStatus == VA_STATUS_SUCCESS);
802     *encoded_bits_size = coded_buffer_segment->size * 8;
803     i965_UnmapBuffer(ctx, pPicParameter->coded_buf);
804
805     return VA_STATUS_SUCCESS;
806 }
807
808
809 static void
810 gen8_mfc_avc_slice_state(VADriverContextP ctx,
811                          VAEncPictureParameterBufferH264 *pic_param,
812                          VAEncSliceParameterBufferH264 *slice_param,
813                          struct encode_state *encode_state,
814                          struct intel_encoder_context *encoder_context,
815                          int rate_control_enable,
816                          int qp,
817                          struct intel_batchbuffer *batch)
818 {
819     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
820     int width_in_mbs = (mfc_context->surface_state.width + 15) / 16;
821     int height_in_mbs = (mfc_context->surface_state.height + 15) / 16;
822     int beginmb = slice_param->macroblock_address;
823     int endmb = beginmb + slice_param->num_macroblocks;
824     int beginx = beginmb % width_in_mbs;
825     int beginy = beginmb / width_in_mbs;
826     int nextx =  endmb % width_in_mbs;
827     int nexty = endmb / width_in_mbs;
828     int slice_type = intel_avc_enc_slice_type_fixup(slice_param->slice_type);
829     int last_slice = (endmb == (width_in_mbs * height_in_mbs));
830     int maxQpN, maxQpP;
831     unsigned char correct[6], grow, shrink;
832     int i;
833     int weighted_pred_idc = 0;
834     unsigned int luma_log2_weight_denom = slice_param->luma_log2_weight_denom;
835     unsigned int chroma_log2_weight_denom = slice_param->chroma_log2_weight_denom;
836     int num_ref_l0 = 0, num_ref_l1 = 0;
837
838     if (batch == NULL)
839         batch = encoder_context->base.batch;
840
841     if (slice_type == SLICE_TYPE_I) {
842         luma_log2_weight_denom = 0;
843         chroma_log2_weight_denom = 0;
844     } else if (slice_type == SLICE_TYPE_P) {
845         weighted_pred_idc = pic_param->pic_fields.bits.weighted_pred_flag;
846         num_ref_l0 = pic_param->num_ref_idx_l0_active_minus1 + 1;
847
848         if (slice_param->num_ref_idx_active_override_flag)
849             num_ref_l0 = slice_param->num_ref_idx_l0_active_minus1 + 1;
850     } else if (slice_type == SLICE_TYPE_B) {
851         weighted_pred_idc = pic_param->pic_fields.bits.weighted_bipred_idc;
852         num_ref_l0 = pic_param->num_ref_idx_l0_active_minus1 + 1;
853         num_ref_l1 = pic_param->num_ref_idx_l1_active_minus1 + 1;
854
855         if (slice_param->num_ref_idx_active_override_flag) {
856             num_ref_l0 = slice_param->num_ref_idx_l0_active_minus1 + 1;
857             num_ref_l1 = slice_param->num_ref_idx_l1_active_minus1 + 1;
858         }
859
860         if (weighted_pred_idc == 2) {
861             /* 8.4.3 - Derivation process for prediction weights (8-279) */
862             luma_log2_weight_denom = 5;
863             chroma_log2_weight_denom = 5;
864         }
865     }
866
867     maxQpN = mfc_context->bit_rate_control_context[slice_type].MaxQpNegModifier;
868     maxQpP = mfc_context->bit_rate_control_context[slice_type].MaxQpPosModifier;
869
870     for (i = 0; i < 6; i++)
871         correct[i] = mfc_context->bit_rate_control_context[slice_type].Correct[i];
872
873     grow = mfc_context->bit_rate_control_context[slice_type].GrowInit + 
874         (mfc_context->bit_rate_control_context[slice_type].GrowResistance << 4);
875     shrink = mfc_context->bit_rate_control_context[slice_type].ShrinkInit + 
876         (mfc_context->bit_rate_control_context[slice_type].ShrinkResistance << 4);
877
878     BEGIN_BCS_BATCH(batch, 11);;
879
880     OUT_BCS_BATCH(batch, MFX_AVC_SLICE_STATE | (11 - 2) );
881     OUT_BCS_BATCH(batch, slice_type);                   /*Slice Type: I:P:B Slice*/
882
883     OUT_BCS_BATCH(batch,
884                   (num_ref_l0 << 16) |
885                   (num_ref_l1 << 24) |
886                   (chroma_log2_weight_denom << 8) |
887                   (luma_log2_weight_denom << 0));
888
889     OUT_BCS_BATCH(batch, 
890                   (weighted_pred_idc << 30) |
891                   (slice_param->direct_spatial_mv_pred_flag<<29) |             /*Direct Prediction Type*/
892                   (slice_param->disable_deblocking_filter_idc << 27) |
893                   (slice_param->cabac_init_idc << 24) |
894                   (qp<<16) |                    /*Slice Quantization Parameter*/
895                   ((slice_param->slice_beta_offset_div2 & 0xf) << 8) |
896                   ((slice_param->slice_alpha_c0_offset_div2 & 0xf) << 0));
897     OUT_BCS_BATCH(batch,
898                   (beginy << 24) |                      /*First MB X&Y , the begin postion of current slice*/
899                   (beginx << 16) |
900                   slice_param->macroblock_address );
901     OUT_BCS_BATCH(batch, (nexty << 16) | nextx);                       /*Next slice first MB X&Y*/
902     OUT_BCS_BATCH(batch, 
903                   (0/*rate_control_enable*/ << 31) |            /*in CBR mode RateControlCounterEnable = enable*/
904                   (1 << 30) |           /*ResetRateControlCounter*/
905                   (0 << 28) |           /*RC Triggle Mode = Always Rate Control*/
906                   (4 << 24) |     /*RC Stable Tolerance, middle level*/
907                   (0/*rate_control_enable*/ << 23) |     /*RC Panic Enable*/                 
908                   (0 << 22) |     /*QP mode, don't modfiy CBP*/
909                   (0 << 21) |     /*MB Type Direct Conversion Enabled*/ 
910                   (0 << 20) |     /*MB Type Skip Conversion Enabled*/ 
911                   (last_slice << 19) |     /*IsLastSlice*/
912                   (0 << 18) |   /*BitstreamOutputFlag Compressed BitStream Output Disable Flag 0:enable 1:disable*/
913                   (1 << 17) |       /*HeaderPresentFlag*/       
914                   (1 << 16) |       /*SliceData PresentFlag*/
915                   (1 << 15) |       /*TailPresentFlag*/
916                   (1 << 13) |       /*RBSP NAL TYPE*/   
917                   (0 << 12) );    /*CabacZeroWordInsertionEnable*/
918     OUT_BCS_BATCH(batch, mfc_context->mfc_indirect_pak_bse_object.offset);
919     OUT_BCS_BATCH(batch,
920                   (maxQpN << 24) |     /*Target QP - 24 is lowest QP*/ 
921                   (maxQpP << 16) |     /*Target QP + 20 is highest QP*/
922                   (shrink << 8)  |
923                   (grow << 0));   
924     OUT_BCS_BATCH(batch,
925                   (correct[5] << 20) |
926                   (correct[4] << 16) |
927                   (correct[3] << 12) |
928                   (correct[2] << 8) |
929                   (correct[1] << 4) |
930                   (correct[0] << 0));
931     OUT_BCS_BATCH(batch, 0);
932
933     ADVANCE_BCS_BATCH(batch);
934 }
935
/* Dword offsets (and RDO-cost mask) used to pick fields out of one VME
 * output record when deciding intra vs. inter per macroblock; see
 * gen8_mfc_avc_pipeline_slice_programing(). */
#define    AVC_INTRA_RDO_OFFSET    4
#define    AVC_INTER_RDO_OFFSET    10
#define    AVC_INTER_MSG_OFFSET    8
#define    AVC_INTER_MV_OFFSET     48
#define    AVC_RDO_MASK            0xFFFF
941
942 #if MFC_SOFTWARE_BATCH
943
/*
 * Emit one MFC_AVC_PAK_OBJECT (12 DWs) for an intra macroblock at MB
 * coordinates (x, y).  msg points at the VME output record for this MB;
 * end_mb marks the last MB of the slice.  Writes into batch (or the
 * encoder context's default batch when batch is NULL) and returns the
 * command length in dwords.
 */
static int
gen8_mfc_avc_pak_object_intra(VADriverContextP ctx, int x, int y, int end_mb,
                              int qp,unsigned int *msg,
                              struct intel_encoder_context *encoder_context,
                              unsigned char target_mb_size, unsigned char max_mb_size,
                              struct intel_batchbuffer *batch)
{
    int len_in_dwords = 12;
    unsigned int intra_msg;
#define         INTRA_MSG_FLAG          (1 << 13)
#define         INTRA_MBTYPE_MASK       (0x1F0000)
    if (batch == NULL)
        batch = encoder_context->base.batch;

    BEGIN_BCS_BATCH(batch, len_in_dwords);

    /* Repack the VME MB mode word: keep the low flag bits, force the
     * intra flag, and move the MB type field down into PAK position. */
    intra_msg = msg[0] & 0xC0FF;
    intra_msg |= INTRA_MSG_FLAG;
    intra_msg |= ((msg[0] & INTRA_MBTYPE_MASK) >> 8);
    OUT_BCS_BATCH(batch, MFC_AVC_PAK_OBJECT | (len_in_dwords - 2));
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch,
                  (0 << 24) |           /* PackedMvNum, Debug*/
                  (0 << 20) |           /* No motion vector */
                  (1 << 19) |           /* CbpDcY */
                  (1 << 18) |           /* CbpDcU */
                  (1 << 17) |           /* CbpDcV */
                  intra_msg);

    OUT_BCS_BATCH(batch, (0xFFFF << 16) | (y << 8) | x);                /* Code Block Pattern for Y*/
    OUT_BCS_BATCH(batch, 0x000F000F);                                                   /* Code Block Pattern */
    OUT_BCS_BATCH(batch, (0 << 27) | (end_mb << 26) | qp);      /* Last MB */

    /*Stuff for Intra MB*/
    OUT_BCS_BATCH(batch, msg[1]);                       /* We using Intra16x16 no 4x4 predmode*/
    OUT_BCS_BATCH(batch, msg[2]);
    OUT_BCS_BATCH(batch, msg[3]&0xFF);

    /*MaxSizeInWord and TargetSizeInWord (0 = no per-MB size control)*/
    OUT_BCS_BATCH(batch, (max_mb_size << 24) |
                  (target_mb_size << 16) );

    OUT_BCS_BATCH(batch, 0);

    ADVANCE_BCS_BATCH(batch);

    return len_in_dwords;
}
993
/*
 * Emit one MFC_AVC_PAK_OBJECT (12 DWs) for an inter macroblock.
 * msg points at this MB's VME output record (and is modified in place
 * to rearrange MVs); offset is the byte offset of the MV data inside
 * the indirect MV buffer.  Returns the command length in dwords.
 */
static int
gen8_mfc_avc_pak_object_inter(VADriverContextP ctx, int x, int y, int end_mb, int qp,
                              unsigned int *msg, unsigned int offset,
                              struct intel_encoder_context *encoder_context,
                              unsigned char target_mb_size,unsigned char max_mb_size, int slice_type,
                              struct intel_batchbuffer *batch)
{
    struct gen6_vme_context *vme_context = encoder_context->vme_context;
    int len_in_dwords = 12;
    unsigned int inter_msg = 0;
    if (batch == NULL)
        batch = encoder_context->base.batch;
    {
#define MSG_MV_OFFSET   4
        unsigned int *mv_ptr;
        mv_ptr = msg + MSG_MV_OFFSET;
        /* MV of VME output is based on 16 sub-blocks. So it is necessary
         * to convert them to be compatible with the format of AVC_PAK
         * command.
         */
        if ((msg[0] & INTER_MODE_MASK) == INTER_8X16) {
            /* MV[0] and MV[2] are replicated */
            mv_ptr[4] = mv_ptr[0];
            mv_ptr[5] = mv_ptr[1];
            mv_ptr[2] = mv_ptr[8];
            mv_ptr[3] = mv_ptr[9];
            mv_ptr[6] = mv_ptr[8];
            mv_ptr[7] = mv_ptr[9];
        } else if ((msg[0] & INTER_MODE_MASK) == INTER_16X8) {
            /* MV[0] and MV[1] are replicated */
            mv_ptr[2] = mv_ptr[0];
            mv_ptr[3] = mv_ptr[1];
            mv_ptr[4] = mv_ptr[16];
            mv_ptr[5] = mv_ptr[17];
            mv_ptr[6] = mv_ptr[24];
            mv_ptr[7] = mv_ptr[25];
        } else if (((msg[0] & INTER_MODE_MASK) == INTER_8X8) &&
                   !(msg[1] & SUBMB_SHAPE_MASK)) {
            /* Don't touch MV[0] or MV[1] */
            mv_ptr[2] = mv_ptr[8];
            mv_ptr[3] = mv_ptr[9];
            mv_ptr[4] = mv_ptr[16];
            mv_ptr[5] = mv_ptr[17];
            mv_ptr[6] = mv_ptr[24];
            mv_ptr[7] = mv_ptr[25];
        }
    }

    BEGIN_BCS_BATCH(batch, len_in_dwords);

    OUT_BCS_BATCH(batch, MFC_AVC_PAK_OBJECT | (len_in_dwords - 2));

    /* DW1: MV count — 32 normally, 128 when 8x8 with sub-partitions */
    inter_msg = 32;
    /* MV quantity */
    if ((msg[0] & INTER_MODE_MASK) == INTER_8X8) {
        if (msg[1] & SUBMB_SHAPE_MASK)
            inter_msg = 128;
    }
    OUT_BCS_BATCH(batch, inter_msg);         /* 32 MV*/
    OUT_BCS_BATCH(batch, offset);
    /* DW3: MB mode word with inter flags and forced DC CBP bits */
    inter_msg = msg[0] & (0x1F00FFFF);
    inter_msg |= INTER_MV8;
    inter_msg |= ((1 << 19) | (1 << 18) | (1 << 17));
    if (((msg[0] & INTER_MODE_MASK) == INTER_8X8) &&
        (msg[1] & SUBMB_SHAPE_MASK)) {
        inter_msg |= INTER_MV32;
    }

    OUT_BCS_BATCH(batch, inter_msg);

    OUT_BCS_BATCH(batch, (0xFFFF<<16) | (y << 8) | x);        /* Code Block Pattern for Y*/
    OUT_BCS_BATCH(batch, 0x000F000F);                         /* Code Block Pattern */
#if 0
    if ( slice_type == SLICE_TYPE_B) {
        OUT_BCS_BATCH(batch, (0xF<<28) | (end_mb << 26) | qp);  /* Last MB */
    } else {
        OUT_BCS_BATCH(batch, (end_mb << 26) | qp);      /* Last MB */
    }
#else
    OUT_BCS_BATCH(batch, (end_mb << 26) | qp);  /* Last MB */
#endif

    inter_msg = msg[1] >> 8;
    /*Stuff for Inter MB*/
    OUT_BCS_BATCH(batch, inter_msg);
    OUT_BCS_BATCH(batch, vme_context->ref_index_in_mb[0]);
    OUT_BCS_BATCH(batch, vme_context->ref_index_in_mb[1]);

    /*MaxSizeInWord and TargetSizeInWord (0 = no per-MB size control)*/
    OUT_BCS_BATCH(batch, (max_mb_size << 24) |
                  (target_mb_size << 16) );

    OUT_BCS_BATCH(batch, 0x0);

    ADVANCE_BCS_BATCH(batch);

    return len_in_dwords;
}
1092
1093 static void 
1094 gen8_mfc_avc_pipeline_slice_programing(VADriverContextP ctx,
1095                                        struct encode_state *encode_state,
1096                                        struct intel_encoder_context *encoder_context,
1097                                        int slice_index,
1098                                        struct intel_batchbuffer *slice_batch)
1099 {
1100     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
1101     struct gen6_vme_context *vme_context = encoder_context->vme_context;
1102     VAEncSequenceParameterBufferH264 *pSequenceParameter = (VAEncSequenceParameterBufferH264 *)encode_state->seq_param_ext->buffer;
1103     VAEncPictureParameterBufferH264 *pPicParameter = (VAEncPictureParameterBufferH264 *)encode_state->pic_param_ext->buffer;
1104     VAEncSliceParameterBufferH264 *pSliceParameter = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[slice_index]->buffer; 
1105     unsigned int *msg = NULL, offset = 0;
1106     unsigned char *msg_ptr = NULL;
1107     int width_in_mbs = (mfc_context->surface_state.width + 15) / 16;
1108     int height_in_mbs = (mfc_context->surface_state.height + 15) / 16;
1109     int last_slice = (pSliceParameter->macroblock_address + pSliceParameter->num_macroblocks) == (width_in_mbs * height_in_mbs);
1110     int i,x,y;
1111     int qp = pPicParameter->pic_init_qp + pSliceParameter->slice_qp_delta;
1112     unsigned int rate_control_mode = encoder_context->rate_control_mode;
1113     unsigned int tail_data[] = { 0x0, 0x0 };
1114     int slice_type = intel_avc_enc_slice_type_fixup(pSliceParameter->slice_type);
1115     int is_intra = slice_type == SLICE_TYPE_I;
1116     int qp_slice;
1117
1118     qp_slice = qp;
1119     if (rate_control_mode == VA_RC_CBR) {
1120         qp = mfc_context->bit_rate_control_context[slice_type].QpPrimeY;
1121         if (encode_state->slice_header_index[slice_index] == 0) {
1122             pSliceParameter->slice_qp_delta = qp - pPicParameter->pic_init_qp;
1123             qp_slice = qp;
1124         }
1125     }
1126
1127     /* only support for 8-bit pixel bit-depth */
1128     assert(pSequenceParameter->bit_depth_luma_minus8 == 0);
1129     assert(pSequenceParameter->bit_depth_chroma_minus8 == 0);
1130     assert(pPicParameter->pic_init_qp >= 0 && pPicParameter->pic_init_qp < 52);
1131     assert(qp >= 0 && qp < 52);
1132
1133     gen8_mfc_avc_slice_state(ctx,
1134                              pPicParameter,
1135                              pSliceParameter,
1136                              encode_state, encoder_context,
1137                              (rate_control_mode == VA_RC_CBR), qp_slice, slice_batch);
1138
1139     if ( slice_index == 0)
1140         intel_mfc_avc_pipeline_header_programing(ctx, encode_state, encoder_context, slice_batch);
1141
1142     intel_avc_slice_insert_packed_data(ctx, encode_state, encoder_context, slice_index, slice_batch);
1143
1144     dri_bo_map(vme_context->vme_output.bo , 1);
1145     msg_ptr = (unsigned char *)vme_context->vme_output.bo->virtual;
1146
1147     if (is_intra) {
1148         msg = (unsigned int *) (msg_ptr + pSliceParameter->macroblock_address * vme_context->vme_output.size_block);
1149     } else {
1150         msg = (unsigned int *) (msg_ptr + pSliceParameter->macroblock_address * vme_context->vme_output.size_block);
1151     }
1152    
1153     for (i = pSliceParameter->macroblock_address; 
1154          i < pSliceParameter->macroblock_address + pSliceParameter->num_macroblocks; i++) {
1155         int last_mb = (i == (pSliceParameter->macroblock_address + pSliceParameter->num_macroblocks - 1) );
1156         x = i % width_in_mbs;
1157         y = i / width_in_mbs;
1158         msg = (unsigned int *) (msg_ptr + i * vme_context->vme_output.size_block);
1159
1160         if (is_intra) {
1161             assert(msg);
1162             gen8_mfc_avc_pak_object_intra(ctx, x, y, last_mb, qp, msg, encoder_context, 0, 0, slice_batch);
1163         } else {
1164             int inter_rdo, intra_rdo;
1165             inter_rdo = msg[AVC_INTER_RDO_OFFSET] & AVC_RDO_MASK;
1166             intra_rdo = msg[AVC_INTRA_RDO_OFFSET] & AVC_RDO_MASK;
1167             offset = i * vme_context->vme_output.size_block + AVC_INTER_MV_OFFSET;
1168             if (intra_rdo < inter_rdo) { 
1169                 gen8_mfc_avc_pak_object_intra(ctx, x, y, last_mb, qp, msg, encoder_context, 0, 0, slice_batch);
1170             } else {
1171                 msg += AVC_INTER_MSG_OFFSET;
1172                 gen8_mfc_avc_pak_object_inter(ctx, x, y, last_mb, qp, msg, offset, encoder_context, 0, 0, pSliceParameter->slice_type, slice_batch);
1173             }
1174         }
1175     }
1176    
1177     dri_bo_unmap(vme_context->vme_output.bo);
1178
1179     if ( last_slice ) {    
1180         mfc_context->insert_object(ctx, encoder_context,
1181                                    tail_data, 2, 8,
1182                                    2, 1, 1, 0, slice_batch);
1183     } else {
1184         mfc_context->insert_object(ctx, encoder_context,
1185                                    tail_data, 1, 8,
1186                                    1, 1, 1, 0, slice_batch);
1187     }
1188 }
1189
1190 static dri_bo *
1191 gen8_mfc_avc_software_batchbuffer(VADriverContextP ctx,
1192                                   struct encode_state *encode_state,
1193                                   struct intel_encoder_context *encoder_context)
1194 {
1195     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
1196     struct intel_batchbuffer *batch;
1197     dri_bo *batch_bo;
1198     int i;
1199
1200     batch = mfc_context->aux_batchbuffer;
1201     batch_bo = batch->buffer;
1202     for (i = 0; i < encode_state->num_slice_params_ext; i++) {
1203         gen8_mfc_avc_pipeline_slice_programing(ctx, encode_state, encoder_context, i, batch);
1204     }
1205
1206     intel_batchbuffer_align(batch, 8);
1207     
1208     BEGIN_BCS_BATCH(batch, 2);
1209     OUT_BCS_BATCH(batch, 0);
1210     OUT_BCS_BATCH(batch, MI_BATCH_BUFFER_END);
1211     ADVANCE_BCS_BATCH(batch);
1212
1213     dri_bo_reference(batch_bo);
1214     intel_batchbuffer_free(batch);
1215     mfc_context->aux_batchbuffer = NULL;
1216
1217     return batch_bo;
1218 }
1219
1220 #else
1221
1222 static void
1223 gen8_mfc_batchbuffer_surfaces_input(VADriverContextP ctx,
1224                                     struct encode_state *encode_state,
1225                                     struct intel_encoder_context *encoder_context)
1226 {
1227     struct gen6_vme_context *vme_context = encoder_context->vme_context;
1228     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
1229
1230     assert(vme_context->vme_output.bo);
1231     mfc_context->buffer_suface_setup(ctx,
1232                                      &mfc_context->gpe_context,
1233                                      &vme_context->vme_output,
1234                                      BINDING_TABLE_OFFSET(BIND_IDX_VME_OUTPUT),
1235                                      SURFACE_STATE_OFFSET(BIND_IDX_VME_OUTPUT));
1236 }
1237
1238 static void
1239 gen8_mfc_batchbuffer_surfaces_output(VADriverContextP ctx,
1240                                      struct encode_state *encode_state,
1241                                      struct intel_encoder_context *encoder_context)
1242 {
1243     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
1244     assert(mfc_context->aux_batchbuffer_surface.bo);
1245     mfc_context->buffer_suface_setup(ctx,
1246                                      &mfc_context->gpe_context,
1247                                      &mfc_context->aux_batchbuffer_surface,
1248                                      BINDING_TABLE_OFFSET(BIND_IDX_MFC_BATCHBUFFER),
1249                                      SURFACE_STATE_OFFSET(BIND_IDX_MFC_BATCHBUFFER));
1250 }
1251
1252 static void
1253 gen8_mfc_batchbuffer_surfaces_setup(VADriverContextP ctx, 
1254                                     struct encode_state *encode_state,
1255                                     struct intel_encoder_context *encoder_context)
1256 {
1257     gen8_mfc_batchbuffer_surfaces_input(ctx, encode_state, encoder_context);
1258     gen8_mfc_batchbuffer_surfaces_output(ctx, encode_state, encoder_context);
1259 }
1260
1261 static void
1262 gen8_mfc_batchbuffer_idrt_setup(VADriverContextP ctx, 
1263                                 struct encode_state *encode_state,
1264                                 struct intel_encoder_context *encoder_context)
1265 {
1266     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
1267     struct gen8_interface_descriptor_data *desc;
1268     int i;
1269     dri_bo *bo;
1270     unsigned char *desc_ptr;
1271
1272     bo = mfc_context->gpe_context.dynamic_state.bo;
1273     dri_bo_map(bo, 1);
1274     assert(bo->virtual);
1275     desc_ptr = (unsigned char *)bo->virtual + mfc_context->gpe_context.idrt_offset;
1276
1277     desc = (struct gen8_interface_descriptor_data *)desc_ptr;
1278
1279     for (i = 0; i < mfc_context->gpe_context.num_kernels; i++) {
1280         struct i965_kernel *kernel;
1281         kernel = &mfc_context->gpe_context.kernels[i];
1282         assert(sizeof(*desc) == 32);
1283         /*Setup the descritor table*/
1284         memset(desc, 0, sizeof(*desc));
1285         desc->desc0.kernel_start_pointer = kernel->kernel_offset >> 6;
1286         desc->desc3.sampler_count = 0;
1287         desc->desc3.sampler_state_pointer = 0;
1288         desc->desc4.binding_table_entry_count = 1;
1289         desc->desc4.binding_table_pointer = (BINDING_TABLE_OFFSET(0) >> 5);
1290         desc->desc5.constant_urb_entry_read_offset = 0;
1291         desc->desc5.constant_urb_entry_read_length = 4;
1292
1293                 
1294         desc++;
1295     }
1296
1297     dri_bo_unmap(bo);
1298
1299     return;
1300 }
1301
1302 static void
1303 gen8_mfc_batchbuffer_constant_setup(VADriverContextP ctx, 
1304                                     struct encode_state *encode_state,
1305                                     struct intel_encoder_context *encoder_context)
1306 {
1307     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
1308     
1309     (void)mfc_context;
1310 }
1311
/* Size of one GPU-constructed AVC PAK object command:
 * 48 bytes == 3 OWords (16 bytes each). */
#define AVC_PAK_LEN_IN_BYTE     48
#define AVC_PAK_LEN_IN_OWORD    3
1314
1315 static void
1316 gen8_mfc_batchbuffer_emit_object_command(struct intel_batchbuffer *batch,
1317                                           uint32_t intra_flag,
1318                                           int head_offset,
1319                                           int number_mb_cmds,
1320                                           int slice_end_x,
1321                                           int slice_end_y,
1322                                           int mb_x,
1323                                           int mb_y,
1324                                           int width_in_mbs,
1325                                           int qp,
1326                                           uint32_t fwd_ref,
1327                                           uint32_t bwd_ref)
1328 {
1329     uint32_t temp_value;
1330     BEGIN_BATCH(batch, 14);
1331     
1332     OUT_BATCH(batch, CMD_MEDIA_OBJECT | (14 - 2));
1333     OUT_BATCH(batch, 0);
1334     OUT_BATCH(batch, 0);
1335     OUT_BATCH(batch, 0);
1336     OUT_BATCH(batch, 0);
1337     OUT_BATCH(batch, 0);
1338    
1339     /*inline data */
1340     OUT_BATCH(batch, head_offset / 16);
1341     OUT_BATCH(batch, (intra_flag) | (qp << 16));
1342     temp_value = (mb_x | (mb_y << 8) | (width_in_mbs << 16));
1343     OUT_BATCH(batch, temp_value);
1344
1345     OUT_BATCH(batch, number_mb_cmds);
1346
1347     OUT_BATCH(batch,
1348               ((slice_end_y << 8) | (slice_end_x)));
1349     OUT_BATCH(batch, fwd_ref);
1350     OUT_BATCH(batch, bwd_ref);
1351
1352     OUT_BATCH(batch, MI_NOOP);
1353
1354     ADVANCE_BATCH(batch);
1355 }
1356
1357 static void
1358 gen8_mfc_avc_batchbuffer_slice_command(VADriverContextP ctx,
1359                                         struct intel_encoder_context *encoder_context,
1360                                         VAEncSliceParameterBufferH264 *slice_param,
1361                                         int head_offset,
1362                                         int qp,
1363                                         int last_slice)
1364 {
1365     struct intel_batchbuffer *batch = encoder_context->base.batch;
1366     struct gen6_vme_context *vme_context = encoder_context->vme_context;
1367     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
1368     int width_in_mbs = (mfc_context->surface_state.width + 15) / 16;
1369     int total_mbs = slice_param->num_macroblocks;
1370     int slice_type = intel_avc_enc_slice_type_fixup(slice_param->slice_type);
1371     int number_mb_cmds = 128;
1372     int starting_offset = 0;
1373     int mb_x, mb_y;
1374     int last_mb, slice_end_x, slice_end_y;
1375     int remaining_mb = total_mbs;
1376     uint32_t fwd_ref , bwd_ref, mb_flag;
1377
1378     last_mb = slice_param->macroblock_address + total_mbs - 1;
1379     slice_end_x = last_mb % width_in_mbs;
1380     slice_end_y = last_mb / width_in_mbs;
1381
1382     if (slice_type == SLICE_TYPE_I) {
1383         fwd_ref = 0;
1384         bwd_ref = 0;
1385         mb_flag = 1;
1386     } else {
1387         fwd_ref = vme_context->ref_index_in_mb[0];
1388         bwd_ref = vme_context->ref_index_in_mb[1];
1389         mb_flag = 0;
1390     }
1391
1392     if (width_in_mbs >= 100) {
1393         number_mb_cmds = width_in_mbs / 5;
1394     } else if (width_in_mbs >= 80) {
1395         number_mb_cmds = width_in_mbs / 4;
1396     } else if (width_in_mbs >= 60) {
1397         number_mb_cmds = width_in_mbs / 3;
1398     } else if (width_in_mbs >= 40) {
1399         number_mb_cmds = width_in_mbs / 2;
1400     } else {
1401         number_mb_cmds = width_in_mbs;
1402     }
1403
1404     do {
1405         if (number_mb_cmds >= remaining_mb) {
1406                 number_mb_cmds = remaining_mb;
1407         }
1408         mb_x = (slice_param->macroblock_address + starting_offset) % width_in_mbs;
1409         mb_y = (slice_param->macroblock_address + starting_offset) / width_in_mbs;
1410
1411         gen8_mfc_batchbuffer_emit_object_command(batch,
1412                                                   mb_flag,
1413                                                   head_offset,
1414                                                   number_mb_cmds,
1415                                                   slice_end_x,
1416                                                   slice_end_y,
1417                                                   mb_x,
1418                                                   mb_y,
1419                                                   width_in_mbs,
1420                                                   qp,
1421                                                   fwd_ref,
1422                                                   bwd_ref);
1423
1424         head_offset += (number_mb_cmds * AVC_PAK_LEN_IN_BYTE);
1425         remaining_mb -= number_mb_cmds;
1426         starting_offset += number_mb_cmds;
1427     } while (remaining_mb > 0);
1428 }
1429
/*
 * Fill the aux (slice) batchbuffer for one H.264 slice:
 * MFX slice state, per-frame packed headers (first slice only), packed
 * slice data, a reserved window that the media kernel later fills with
 * the PAK object commands, and the tail padding.
 */
static void
gen8_mfc_avc_batchbuffer_slice(VADriverContextP ctx,
                                struct encode_state *encode_state,
                                struct intel_encoder_context *encoder_context,
                                int slice_index)
{
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
    struct intel_batchbuffer *slice_batch = mfc_context->aux_batchbuffer;
    VAEncSequenceParameterBufferH264 *pSequenceParameter = (VAEncSequenceParameterBufferH264 *)encode_state->seq_param_ext->buffer;
    VAEncPictureParameterBufferH264 *pPicParameter = (VAEncPictureParameterBufferH264 *)encode_state->pic_param_ext->buffer;
    VAEncSliceParameterBufferH264 *pSliceParameter = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[slice_index]->buffer;
    int width_in_mbs = (mfc_context->surface_state.width + 15) / 16;
    int height_in_mbs = (mfc_context->surface_state.height + 15) / 16;
    /* Last slice == the slice whose macroblocks reach the end of the picture. */
    int last_slice = (pSliceParameter->macroblock_address + pSliceParameter->num_macroblocks) == (width_in_mbs * height_in_mbs);
    int qp = pPicParameter->pic_init_qp + pSliceParameter->slice_qp_delta;
    unsigned int rate_control_mode = encoder_context->rate_control_mode;
    unsigned int tail_data[] = { 0x0, 0x0 };
    long head_offset;
    int slice_type = intel_avc_enc_slice_type_fixup(pSliceParameter->slice_type);
    int qp_slice;

    /* Under CBR the BRC-chosen QP overrides the application QP; the slice
     * header delta is only rewritten when the driver generates the slice
     * header itself (slice_header_index == 0). */
    qp_slice = qp;
    if (rate_control_mode == VA_RC_CBR) {
        qp = mfc_context->bit_rate_control_context[slice_type].QpPrimeY;
        if (encode_state->slice_header_index[slice_index] == 0) {
            pSliceParameter->slice_qp_delta = qp - pPicParameter->pic_init_qp;
            qp_slice = qp;
        }
    }

    /* only support for 8-bit pixel bit-depth */
    assert(pSequenceParameter->bit_depth_luma_minus8 == 0);
    assert(pSequenceParameter->bit_depth_chroma_minus8 == 0);
    assert(pPicParameter->pic_init_qp >= 0 && pPicParameter->pic_init_qp < 52);
    assert(qp >= 0 && qp < 52);

    gen8_mfc_avc_slice_state(ctx,
                              pPicParameter,
                              pSliceParameter,
                              encode_state,
                              encoder_context,
                              (rate_control_mode == VA_RC_CBR),
                              qp_slice,
                              slice_batch);

    /* SPS/PPS and other per-frame headers go in front of the first slice only. */
    if (slice_index == 0)
        intel_mfc_avc_pipeline_header_programing(ctx, encode_state, encoder_context, slice_batch);

    intel_avc_slice_insert_packed_data(ctx, encode_state, encoder_context, slice_index, slice_batch);

    intel_batchbuffer_align(slice_batch, 16); /* aligned by an Oword */
    head_offset = intel_batchbuffer_used_size(slice_batch);

    /* Reserve one AVC_PAK_LEN_IN_BYTE record per macroblock at head_offset;
     * the GPU writes the PAK objects there (see the slice command below). */
    slice_batch->ptr += pSliceParameter->num_macroblocks * AVC_PAK_LEN_IN_BYTE;

    gen8_mfc_avc_batchbuffer_slice_command(ctx,
                                            encoder_context,
                                            pSliceParameter,
                                            head_offset,
                                            qp,
                                            last_slice);


    /* Aligned for tail */
    intel_batchbuffer_align(slice_batch, 16); /* aligned by an Oword */
    if (last_slice) {
        /* Final-slice tail: two zero DWORDs. */
        mfc_context->insert_object(ctx,
                                   encoder_context,
                                   tail_data,
                                   2,
                                   8,
                                   2,
                                   1,
                                   1,
                                   0,
                                   slice_batch);
    } else {
        /* Inter-slice tail: one zero DWORD. */
        mfc_context->insert_object(ctx,
                                   encoder_context,
                                   tail_data,
                                   1,
                                   8,
                                   1,
                                   1,
                                   1,
                                   0,
                                   slice_batch);
    }

    return;
}
1521
/*
 * Run the GPGPU pipeline that builds the AVC PAK batchbuffer:
 * set up the GPE pipeline, dispatch one media object per slice, then
 * terminate the slice batch with MI_BATCH_BUFFER_END and flush.
 */
static void
gen8_mfc_avc_batchbuffer_pipeline(VADriverContextP ctx,
                                  struct encode_state *encode_state,
                                  struct intel_encoder_context *encoder_context)
{
    struct i965_driver_data *i965 = i965_driver_data(ctx);
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    int i;

    intel_batchbuffer_start_atomic(batch, 0x4000);

    /* Gen9 needs its own pipeline setup/teardown pair. */
    if (IS_GEN9(i965->intel.device_info))
        gen9_gpe_pipeline_setup(ctx, &mfc_context->gpe_context, batch);
    else
        gen8_gpe_pipeline_setup(ctx, &mfc_context->gpe_context, batch);

    for ( i = 0; i < encode_state->num_slice_params_ext; i++) {
        gen8_mfc_avc_batchbuffer_slice(ctx, encode_state, encoder_context, i);
    }
    {
        struct intel_batchbuffer *slice_batch = mfc_context->aux_batchbuffer;

        /* Close the aux batch: pad to a QWord, then emit the end marker so
         * the BCS ring can chain into it safely. */
        intel_batchbuffer_align(slice_batch, 8);
        BEGIN_BCS_BATCH(slice_batch, 2);
        OUT_BCS_BATCH(slice_batch, 0);
        OUT_BCS_BATCH(slice_batch, MI_BATCH_BUFFER_END);
        ADVANCE_BCS_BATCH(slice_batch);

        BEGIN_BATCH(batch, 2);
        OUT_BATCH(batch, CMD_MEDIA_STATE_FLUSH);
        OUT_BATCH(batch, 0);
        ADVANCE_BATCH(batch);
    }

    intel_batchbuffer_end_atomic(batch);
    intel_batchbuffer_flush(batch);

    if (IS_GEN9(i965->intel.device_info))
        gen9_gpe_pipeline_end(ctx, &mfc_context->gpe_context, batch);
}
1563
/*
 * Build the AVC PAK batchbuffer on the GPU.
 * The three setup steps (binding table/surfaces, interface descriptors,
 * constant buffer) must run before the pipeline dispatch.
 */
static void
gen8_mfc_build_avc_batchbuffer(VADriverContextP ctx, 
                               struct encode_state *encode_state,
                               struct intel_encoder_context *encoder_context)
{
    gen8_mfc_batchbuffer_surfaces_setup(ctx, encode_state, encoder_context);
    gen8_mfc_batchbuffer_idrt_setup(ctx, encode_state, encoder_context);
    gen8_mfc_batchbuffer_constant_setup(ctx, encode_state, encoder_context);
    gen8_mfc_avc_batchbuffer_pipeline(ctx, encode_state, encoder_context);
}
1574
1575 static dri_bo *
1576 gen8_mfc_avc_hardware_batchbuffer(VADriverContextP ctx,
1577                                   struct encode_state *encode_state,
1578                                   struct intel_encoder_context *encoder_context)
1579 {
1580     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
1581
1582     dri_bo_reference(mfc_context->aux_batchbuffer_surface.bo);
1583     gen8_mfc_build_avc_batchbuffer(ctx, encode_state, encoder_context);
1584
1585     return mfc_context->aux_batchbuffer_surface.bo;
1586 }
1587
1588 #endif
1589
/*
 * Program the whole BCS (MFX) pipeline for one AVC frame: build the
 * per-slice batchbuffer (software or GPU path), emit the picture-level
 * state, then chain into the slice batch via MI_BATCH_BUFFER_START.
 */
static void
gen8_mfc_avc_pipeline_programing(VADriverContextP ctx,
                                 struct encode_state *encode_state,
                                 struct intel_encoder_context *encoder_context)
{
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    dri_bo *slice_batch_bo;

    /* Interlaced content is not supported by this encoder path. */
    if ( intel_mfc_interlace_check(ctx, encode_state, encoder_context) ) {
        fprintf(stderr, "Current VA driver don't support interlace mode!\n");
        assert(0);
        return; 
    }

#if MFC_SOFTWARE_BATCH
    slice_batch_bo = gen8_mfc_avc_software_batchbuffer(ctx, encode_state, encoder_context);
#else
    slice_batch_bo = gen8_mfc_avc_hardware_batchbuffer(ctx, encode_state, encoder_context);
#endif

    // begin programing
    intel_batchbuffer_start_atomic_bcs(batch, 0x4000); 
    intel_batchbuffer_emit_mi_flush(batch);
    
    // picture level programing
    gen8_mfc_avc_pipeline_picture_programing(ctx, encode_state, encoder_context);

    /* Chain to the second-level (slice) batchbuffer.
     * NOTE(review): bits 8 and 0 of MI_BATCH_BUFFER_START select the
     * second-level/PPGTT addressing mode — verify against the PRM. */
    BEGIN_BCS_BATCH(batch, 3);
    OUT_BCS_BATCH(batch, MI_BATCH_BUFFER_START | (1 << 8) | (1 << 0));
    OUT_BCS_RELOC(batch,
                  slice_batch_bo,
                  I915_GEM_DOMAIN_COMMAND, 0, 
                  0);
    OUT_BCS_BATCH(batch, 0);
    ADVANCE_BCS_BATCH(batch);

    // end programing
    intel_batchbuffer_end_atomic(batch);

    /* Drops the reference taken by the batchbuffer builder above. */
    dri_bo_unreference(slice_batch_bo);
}
1631
1632
/*
 * Encode one AVC picture.
 *
 * In CBR mode the frame is re-encoded in a loop until the BRC reports no
 * HRD violation (or the violation is unrepairable even at the QP limits,
 * in which case it is logged once and the frame is accepted as-is).
 * In other rate-control modes a single pass is performed.
 *
 * Returns VA_STATUS_SUCCESS in all cases.
 */
static VAStatus
gen8_mfc_avc_encode_picture(VADriverContextP ctx, 
                            struct encode_state *encode_state,
                            struct intel_encoder_context *encoder_context)
{
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
    unsigned int rate_control_mode = encoder_context->rate_control_mode;
    int current_frame_bits_size;
    int sts;
 
    for (;;) {
        gen8_mfc_init(ctx, encode_state, encoder_context);
        intel_mfc_avc_prepare(ctx, encode_state, encoder_context);
        /*Programing bcs pipeline*/
        gen8_mfc_avc_pipeline_programing(ctx, encode_state, encoder_context);   //filling the pipeline
        gen8_mfc_run(ctx, encode_state, encoder_context);
        if (rate_control_mode == VA_RC_CBR /*|| rate_control_mode == VA_RC_VBR*/) {
            /* Read back the actual frame size and let the BRC decide
             * whether the frame fits the HRD buffer model. */
            gen8_mfc_stop(ctx, encode_state, encoder_context, &current_frame_bits_size);
            sts = intel_mfc_brc_postpack(encode_state, mfc_context, current_frame_bits_size);
            if (sts == BRC_NO_HRD_VIOLATION) {
                intel_mfc_hrd_context_update(encode_state, mfc_context);
                break;
            }
            else if (sts == BRC_OVERFLOW_WITH_MIN_QP || sts == BRC_UNDERFLOW_WITH_MAX_QP) {
                /* QP already at its limit; warn once per context and give up
                 * on repairing this violation. */
                if (!mfc_context->hrd.violation_noted) {
                    fprintf(stderr, "Unrepairable %s!\n", (sts == BRC_OVERFLOW_WITH_MIN_QP)? "overflow": "underflow");
                    mfc_context->hrd.violation_noted = 1;
                }
                return VA_STATUS_SUCCESS;
            }
            /* Otherwise the BRC adjusted the QP; re-encode the frame. */
        } else {
            break;
        }
    }

    return VA_STATUS_SUCCESS;
}
1670
1671 /*
1672  * MPEG-2
1673  */
1674
/* Map VAEncPictureType (0=I, 1=P, 2=B) to the MFX_MPEG2_PIC_STATE
 * picture coding type field value. */
static const int
va_to_gen8_mpeg2_picture_type[3] = {
    1,  /* I */
    2,  /* P */
    3   /* B */
};
1681
/*
 * Emit MFX_MPEG2_PIC_STATE: pack the f_codes and the
 * picture-coding-extension flags from the VA picture parameter buffer
 * into the 13-DWORD hardware picture state.
 */
static void
gen8_mfc_mpeg2_pic_state(VADriverContextP ctx,
                         struct intel_encoder_context *encoder_context,
                         struct encode_state *encode_state)
{
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
    VAEncPictureParameterBufferMPEG2 *pic_param;
    int width_in_mbs = (mfc_context->surface_state.width + 15) / 16;
    int height_in_mbs = (mfc_context->surface_state.height + 15) / 16;
    VAEncSliceParameterBufferMPEG2 *slice_param = NULL;

    assert(encode_state->pic_param_ext && encode_state->pic_param_ext->buffer);
    pic_param = (VAEncPictureParameterBufferMPEG2 *)encode_state->pic_param_ext->buffer;
    /* Only the first slice's quantiser scale is inspected below. */
    slice_param = (VAEncSliceParameterBufferMPEG2 *)encode_state->slice_params_ext[0]->buffer;

    BEGIN_BCS_BATCH(batch, 13);
    OUT_BCS_BATCH(batch, MFX_MPEG2_PIC_STATE | (13 - 2));
    OUT_BCS_BATCH(batch,
                  (pic_param->f_code[1][1] & 0xf) << 28 | /* f_code[1][1] */
                  (pic_param->f_code[1][0] & 0xf) << 24 | /* f_code[1][0] */
                  (pic_param->f_code[0][1] & 0xf) << 20 | /* f_code[0][1] */
                  (pic_param->f_code[0][0] & 0xf) << 16 | /* f_code[0][0] */
                  pic_param->picture_coding_extension.bits.intra_dc_precision << 14 |
                  pic_param->picture_coding_extension.bits.picture_structure << 12 |
                  pic_param->picture_coding_extension.bits.top_field_first << 11 |
                  pic_param->picture_coding_extension.bits.frame_pred_frame_dct << 10 |
                  pic_param->picture_coding_extension.bits.concealment_motion_vectors << 9 |
                  pic_param->picture_coding_extension.bits.q_scale_type << 8 |
                  pic_param->picture_coding_extension.bits.intra_vlc_format << 7 | 
                  pic_param->picture_coding_extension.bits.alternate_scan << 6);
    OUT_BCS_BATCH(batch,
                  0 << 14 |     /* LoadSlicePointerFlag, 0 means only loading bitstream pointer once */
                  va_to_gen8_mpeg2_picture_type[pic_param->picture_type] << 9 |
                  0);
    OUT_BCS_BATCH(batch,
                  1 << 31 |     /* slice concealment */
                  (height_in_mbs - 1) << 16 |
                  (width_in_mbs - 1));

    /* NOTE(review): magic bit pattern for high quantiser scale codes —
     * meaning not derivable from this file; verify against the PRM. */
    if (slice_param && slice_param->quantiser_scale_code >= 14)
        OUT_BCS_BATCH(batch, (3 << 1) | (1 << 4) | (5 << 8) | (1 << 12));
    else
        OUT_BCS_BATCH(batch, 0);

    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch,
                  0xFFF << 16 | /* InterMBMaxSize */
                  0xFFF << 0 |  /* IntraMBMaxSize */
                  0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    ADVANCE_BCS_BATCH(batch);
}
1740
/*
 * Program the MPEG-2 quantizer matrices (QM) into the MFX engine.
 * Uses the default intra matrix from ISO/IEC 13818-2 and a flat (all 16)
 * non-intra matrix; 64 bytes each, sent as 16 DWORDs.
 */
static void
gen8_mfc_mpeg2_qm_state(VADriverContextP ctx, struct intel_encoder_context *encoder_context)
{
    /* Default MPEG-2 intra quantizer matrix (ISO/IEC 13818-2, 6.3.11). */
    unsigned char intra_qm[64] = {
        8, 16, 19, 22, 26, 27, 29, 34,
        16, 16, 22, 24, 27, 29, 34, 37,
        19, 22, 26, 27, 29, 34, 34, 38,
        22, 22, 26, 27, 29, 34, 37, 40,
        22, 26, 27, 29, 32, 35, 40, 48,
        26, 27, 29, 32, 35, 40, 48, 58,
        26, 27, 29, 34, 38, 46, 56, 69,
        27, 29, 35, 38, 46, 56, 69, 83
    };

    /* Default non-intra matrix: flat 16. */
    unsigned char non_intra_qm[64] = {
        16, 16, 16, 16, 16, 16, 16, 16,
        16, 16, 16, 16, 16, 16, 16, 16,
        16, 16, 16, 16, 16, 16, 16, 16,
        16, 16, 16, 16, 16, 16, 16, 16,
        16, 16, 16, 16, 16, 16, 16, 16,
        16, 16, 16, 16, 16, 16, 16, 16,
        16, 16, 16, 16, 16, 16, 16, 16,
        16, 16, 16, 16, 16, 16, 16, 16
    };

    gen8_mfc_qm_state(ctx, MFX_QM_MPEG_INTRA_QUANTIZER_MATRIX, (unsigned int *)intra_qm, 16, encoder_context);
    gen8_mfc_qm_state(ctx, MFX_QM_MPEG_NON_INTRA_QUANTIZER_MATRIX, (unsigned int *)non_intra_qm, 16,encoder_context);
}
1769
1770 static void
1771 gen8_mfc_mpeg2_fqm_state(VADriverContextP ctx, struct intel_encoder_context *encoder_context)
1772 {
1773     unsigned short intra_fqm[64] = {
1774         65536/0x8, 65536/0x10, 65536/0x13, 65536/0x16, 65536/0x16, 65536/0x1a, 65536/0x1a, 65536/0x1b,
1775         65536/0x10, 65536/0x10, 65536/0x16, 65536/0x16, 65536/0x1a, 65536/0x1b, 65536/0x1b, 65536/0x1d,
1776         65536/0x13, 65536/0x16, 65536/0x1a, 65536/0x1a, 65536/0x1b, 65536/0x1d, 65536/0x1d, 65536/0x23,
1777         65536/0x16, 65536/0x18, 65536/0x1b, 65536/0x1b, 65536/0x13, 65536/0x20, 65536/0x22, 65536/0x26,
1778         65536/0x1a, 65536/0x1b, 65536/0x13, 65536/0x13, 65536/0x20, 65536/0x23, 65536/0x26, 65536/0x2e,
1779         65536/0x1b, 65536/0x1d, 65536/0x22, 65536/0x22, 65536/0x23, 65536/0x28, 65536/0x2e, 65536/0x38,
1780         65536/0x1d, 65536/0x22, 65536/0x22, 65536/0x25, 65536/0x28, 65536/0x30, 65536/0x38, 65536/0x45,
1781         65536/0x22, 65536/0x25, 65536/0x26, 65536/0x28, 65536/0x30, 65536/0x3a, 65536/0x45, 65536/0x53,
1782     };
1783
1784     unsigned short non_intra_fqm[64] = {
1785         0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000,
1786         0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000,
1787         0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000,
1788         0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000,
1789         0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000,
1790         0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000,
1791         0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000,
1792         0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000,
1793     };
1794
1795     gen8_mfc_fqm_state(ctx, MFX_QM_MPEG_INTRA_QUANTIZER_MATRIX, (unsigned int *)intra_fqm, 32, encoder_context);
1796     gen8_mfc_fqm_state(ctx, MFX_QM_MPEG_NON_INTRA_QUANTIZER_MATRIX, (unsigned int *)non_intra_fqm, 32, encoder_context);
1797 }
1798
1799 static void
1800 gen8_mfc_mpeg2_slicegroup_state(VADriverContextP ctx,
1801                                 struct intel_encoder_context *encoder_context,
1802                                 int x, int y,
1803                                 int next_x, int next_y,
1804                                 int is_fisrt_slice_group,
1805                                 int is_last_slice_group,
1806                                 int intra_slice,
1807                                 int qp,
1808                                 struct intel_batchbuffer *batch)
1809 {
1810     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
1811
1812     if (batch == NULL)
1813         batch = encoder_context->base.batch;
1814
1815     BEGIN_BCS_BATCH(batch, 8);
1816
1817     OUT_BCS_BATCH(batch, MFC_MPEG2_SLICEGROUP_STATE | (8 - 2));
1818     OUT_BCS_BATCH(batch,
1819                   0 << 31 |                             /* MbRateCtrlFlag */
1820                   !!is_last_slice_group << 19 |         /* IsLastSliceGrp */
1821                   1 << 17 |                             /* Insert Header before the first slice group data */
1822                   1 << 16 |                             /* SliceData PresentFlag: always 1 */
1823                   1 << 15 |                             /* TailPresentFlag: always 1 */
1824                   0 << 14 |                             /* FirstSliceHdrDisabled: slice header for each slice */
1825                   !!intra_slice << 13 |                 /* IntraSlice */
1826                   !!intra_slice << 12 |                 /* IntraSliceFlag */
1827                   0);
1828     OUT_BCS_BATCH(batch,
1829                   next_y << 24 |
1830                   next_x << 16 |
1831                   y << 8 |
1832                   x << 0 |
1833                   0);
1834     OUT_BCS_BATCH(batch, qp);   /* FIXME: SliceGroupQp */
1835     /* bitstream pointer is only loaded once for the first slice of a frame when 
1836      * LoadSlicePointerFlag is 0
1837      */
1838     OUT_BCS_BATCH(batch, mfc_context->mfc_indirect_pak_bse_object.offset);
1839     OUT_BCS_BATCH(batch, 0);    /* FIXME: */
1840     OUT_BCS_BATCH(batch, 0);    /* FIXME: CorrectPoints */
1841     OUT_BCS_BATCH(batch, 0);    /* FIXME: CVxxx */
1842
1843     ADVANCE_BCS_BATCH(batch);
1844 }
1845
/*
 * Emit one MFC_MPEG2_PAK_OBJECT for an intra macroblock at MB position
 * (x, y).  All four motion vectors are zero (intra).
 *
 * Returns the command length in DWORDs so the caller can track batch
 * space consumption.
 */
static int
gen8_mfc_mpeg2_pak_object_intra(VADriverContextP ctx,
                                struct intel_encoder_context *encoder_context,
                                int x, int y,
                                int first_mb_in_slice,
                                int last_mb_in_slice,
                                int first_mb_in_slice_group,
                                int last_mb_in_slice_group,
                                int mb_type,
                                int qp_scale_code,
                                int coded_block_pattern,
                                unsigned char target_size_in_word,
                                unsigned char max_size_in_word,
                                struct intel_batchbuffer *batch)
{
    int len_in_dwords = 9;

    if (batch == NULL)
        batch = encoder_context->base.batch;

    BEGIN_BCS_BATCH(batch, len_in_dwords);

    OUT_BCS_BATCH(batch, MFC_MPEG2_PAK_OBJECT | (len_in_dwords - 2));
    OUT_BCS_BATCH(batch,
                  0 << 24 |     /* PackedMvNum */
                  0 << 20 |     /* MvFormat */
                  7 << 17 |     /* CbpDcY/CbpDcU/CbpDcV */
                  0 << 15 |     /* TransformFlag: frame DCT */
                  0 << 14 |     /* FieldMbFlag */
                  1 << 13 |     /* IntraMbFlag */
                  mb_type << 8 |   /* MbType: Intra */
                  0 << 2 |      /* SkipMbFlag */
                  0 << 0 |      /* InterMbMode */
                  0);
    OUT_BCS_BATCH(batch, y << 16 | x);  /* MB position in MB units */
    OUT_BCS_BATCH(batch,
                  max_size_in_word << 24 |
                  target_size_in_word << 16 |
                  coded_block_pattern << 6 |      /* CBP */
                  0);
    OUT_BCS_BATCH(batch,
                  last_mb_in_slice << 31 |
                  first_mb_in_slice << 30 |
                  0 << 27 |     /* EnableCoeffClamp */
                  last_mb_in_slice_group << 26 |
                  0 << 25 |     /* MbSkipConvDisable */
                  first_mb_in_slice_group << 24 |
                  0 << 16 |     /* MvFieldSelect */
                  qp_scale_code << 0 |
                  0);
    OUT_BCS_BATCH(batch, 0);    /* MV[0][0] */
    OUT_BCS_BATCH(batch, 0);    /* MV[1][0] */
    OUT_BCS_BATCH(batch, 0);    /* MV[0][1] */
    OUT_BCS_BATCH(batch, 0);    /* MV[1][1] */

    ADVANCE_BCS_BATCH(batch);

    return len_in_dwords;
}
1905
1906 /* Byte offset */
1907 #define MPEG2_INTER_MV_OFFSET   48 
1908
/* Legal motion-vector range per MPEG-2 f_code (ISO/IEC 13818-2,
 * Table 7-8): [-2^(f_code+3), 2^(f_code+3) - 1] in half-pel units.
 * Index 0 is unused (f_code == 0 is forbidden).  Read-only, so const. */
static const struct _mv_ranges
{
    int low;    /* in the unit of 1/2 pixel */
    int high;   /* in the unit of 1/2 pixel */
} mv_ranges[] = {
    {0, 0},
    {-16, 15},
    {-32, 31},
    {-64, 63},
    {-128, 127},
    {-256, 255},
    {-512, 511},
    {-1024, 1023},
    {-2048, 2047},
    {-4096, 4095}
};
1925
1926 static int
1927 mpeg2_motion_vector(int mv, int pos, int display_max, int f_code)
1928 {
1929     if (mv + pos * 16 * 2 < 0 ||
1930         mv + (pos + 1) * 16 * 2 > display_max * 2)
1931         mv = 0;
1932
1933     if (f_code > 0 && f_code < 10) {
1934         if (mv < mv_ranges[f_code].low)
1935             mv = mv_ranges[f_code].low;
1936
1937         if (mv > mv_ranges[f_code].high)
1938             mv = mv_ranges[f_code].high;
1939     }
1940
1941     return mv;
1942 }
1943
1944 static int
1945 gen8_mfc_mpeg2_pak_object_inter(VADriverContextP ctx,
1946                                 struct encode_state *encode_state,
1947                                 struct intel_encoder_context *encoder_context,
1948                                 unsigned int *msg,
1949                                 int width_in_mbs, int height_in_mbs,
1950                                 int x, int y,
1951                                 int first_mb_in_slice,
1952                                 int last_mb_in_slice,
1953                                 int first_mb_in_slice_group,
1954                                 int last_mb_in_slice_group,
1955                                 int qp_scale_code,
1956                                 unsigned char target_size_in_word,
1957                                 unsigned char max_size_in_word,
1958                                 struct intel_batchbuffer *batch)
1959 {
1960     VAEncPictureParameterBufferMPEG2 *pic_param = (VAEncPictureParameterBufferMPEG2 *)encode_state->pic_param_ext->buffer;
1961     int len_in_dwords = 9;
1962     short *mvptr, mvx0, mvy0, mvx1, mvy1;
1963     
1964     if (batch == NULL)
1965         batch = encoder_context->base.batch;
1966
1967     mvptr = (short *)((unsigned char *)msg + MPEG2_INTER_MV_OFFSET);;
1968     mvx0 = mpeg2_motion_vector(mvptr[0] / 2, x, width_in_mbs * 16, pic_param->f_code[0][0]);
1969     mvy0 = mpeg2_motion_vector(mvptr[1] / 2, y, height_in_mbs * 16, pic_param->f_code[0][0]);
1970     mvx1 = mpeg2_motion_vector(mvptr[2] / 2, x, width_in_mbs * 16, pic_param->f_code[1][0]);
1971     mvy1 = mpeg2_motion_vector(mvptr[3] / 2, y, height_in_mbs * 16, pic_param->f_code[1][0]);
1972
1973     BEGIN_BCS_BATCH(batch, len_in_dwords);
1974
1975     OUT_BCS_BATCH(batch, MFC_MPEG2_PAK_OBJECT | (len_in_dwords - 2));
1976     OUT_BCS_BATCH(batch,
1977                   2 << 24 |     /* PackedMvNum */
1978                   7 << 20 |     /* MvFormat */
1979                   7 << 17 |     /* CbpDcY/CbpDcU/CbpDcV */
1980                   0 << 15 |     /* TransformFlag: frame DCT */
1981                   0 << 14 |     /* FieldMbFlag */
1982                   0 << 13 |     /* IntraMbFlag */
1983                   1 << 8 |      /* MbType: Frame-based */
1984                   0 << 2 |      /* SkipMbFlag */
1985                   0 << 0 |      /* InterMbMode */
1986                   0);
1987     OUT_BCS_BATCH(batch, y << 16 | x);
1988     OUT_BCS_BATCH(batch,
1989                   max_size_in_word << 24 |
1990                   target_size_in_word << 16 |
1991                   0x3f << 6 |   /* CBP */
1992                   0);
1993     OUT_BCS_BATCH(batch,
1994                   last_mb_in_slice << 31 |
1995                   first_mb_in_slice << 30 |
1996                   0 << 27 |     /* EnableCoeffClamp */
1997                   last_mb_in_slice_group << 26 |
1998                   0 << 25 |     /* MbSkipConvDisable */
1999                   first_mb_in_slice_group << 24 |
2000                   0 << 16 |     /* MvFieldSelect */
2001                   qp_scale_code << 0 |
2002                   0);
2003
2004     OUT_BCS_BATCH(batch, (mvx0 & 0xFFFF) | mvy0 << 16);    /* MV[0][0] */
2005     OUT_BCS_BATCH(batch, (mvx1 & 0xFFFF) | mvy1 << 16);    /* MV[1][0] */
2006     OUT_BCS_BATCH(batch, 0);    /* MV[0][1] */
2007     OUT_BCS_BATCH(batch, 0);    /* MV[1][1] */
2008
2009     ADVANCE_BCS_BATCH(batch);
2010
2011     return len_in_dwords;
2012 }
2013
2014 static void
2015 intel_mfc_mpeg2_pipeline_header_programing(VADriverContextP ctx,
2016                                            struct encode_state *encode_state,
2017                                            struct intel_encoder_context *encoder_context,
2018                                            struct intel_batchbuffer *slice_batch)
2019 {
2020     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
2021     int idx = va_enc_packed_type_to_idx(VAEncPackedHeaderMPEG2_SPS);
2022
2023     if (encode_state->packed_header_data[idx]) {
2024         VAEncPackedHeaderParameterBuffer *param = NULL;
2025         unsigned int *header_data = (unsigned int *)encode_state->packed_header_data[idx]->buffer;
2026         unsigned int length_in_bits;
2027
2028         assert(encode_state->packed_header_param[idx]);
2029         param = (VAEncPackedHeaderParameterBuffer *)encode_state->packed_header_param[idx]->buffer;
2030         length_in_bits = param->bit_length;
2031
2032         mfc_context->insert_object(ctx,
2033                                    encoder_context,
2034                                    header_data,
2035                                    ALIGN(length_in_bits, 32) >> 5,
2036                                    length_in_bits & 0x1f,
2037                                    5,   /* FIXME: check it */
2038                                    0,
2039                                    0,
2040                                    0,   /* Needn't insert emulation bytes for MPEG-2 */
2041                                    slice_batch);
2042     }
2043
2044     idx = va_enc_packed_type_to_idx(VAEncPackedHeaderMPEG2_PPS);
2045
2046     if (encode_state->packed_header_data[idx]) {
2047         VAEncPackedHeaderParameterBuffer *param = NULL;
2048         unsigned int *header_data = (unsigned int *)encode_state->packed_header_data[idx]->buffer;
2049         unsigned int length_in_bits;
2050
2051         assert(encode_state->packed_header_param[idx]);
2052         param = (VAEncPackedHeaderParameterBuffer *)encode_state->packed_header_param[idx]->buffer;
2053         length_in_bits = param->bit_length;
2054
2055         mfc_context->insert_object(ctx,
2056                                    encoder_context,
2057                                    header_data,
2058                                    ALIGN(length_in_bits, 32) >> 5,
2059                                    length_in_bits & 0x1f,
2060                                    5,   /* FIXME: check it */
2061                                    0,
2062                                    0,
2063                                    0,   /* Needn't insert emulation bytes for MPEG-2 */
2064                                    slice_batch);
2065     }
2066 }
2067
2068 static void 
2069 gen8_mfc_mpeg2_pipeline_slice_group(VADriverContextP ctx,
2070                                     struct encode_state *encode_state,
2071                                     struct intel_encoder_context *encoder_context,
2072                                     int slice_index,
2073                                     VAEncSliceParameterBufferMPEG2 *next_slice_group_param,
2074                                     struct intel_batchbuffer *slice_batch)
2075 {
2076     struct gen6_vme_context *vme_context = encoder_context->vme_context;
2077     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
2078     VAEncSequenceParameterBufferMPEG2 *seq_param = (VAEncSequenceParameterBufferMPEG2 *)encode_state->seq_param_ext->buffer;
2079     VAEncSliceParameterBufferMPEG2 *slice_param = NULL;
2080     unsigned char tail_delimiter[] = {MPEG2_DELIMITER0, MPEG2_DELIMITER1, MPEG2_DELIMITER2, MPEG2_DELIMITER3, MPEG2_DELIMITER4, 0, 0, 0};
2081     unsigned char section_delimiter[] = {0x0, 0x0, 0x0, 0x0};
2082     int width_in_mbs = ALIGN(seq_param->picture_width, 16) / 16;
2083     int height_in_mbs = ALIGN(seq_param->picture_height, 16) / 16;
2084     int i, j;
2085     int h_start_pos, v_start_pos, h_next_start_pos, v_next_start_pos;
2086     unsigned int *msg = NULL;
2087     unsigned char *msg_ptr = NULL;
2088
2089     slice_param = (VAEncSliceParameterBufferMPEG2 *)encode_state->slice_params_ext[slice_index]->buffer;
2090     h_start_pos = slice_param->macroblock_address % width_in_mbs;
2091     v_start_pos = slice_param->macroblock_address / width_in_mbs;
2092     assert(h_start_pos + slice_param->num_macroblocks <= width_in_mbs);
2093
2094     dri_bo_map(vme_context->vme_output.bo , 0);
2095     msg_ptr = (unsigned char *)vme_context->vme_output.bo->virtual;
2096
2097     if (next_slice_group_param) {
2098         h_next_start_pos = next_slice_group_param->macroblock_address % width_in_mbs;
2099         v_next_start_pos = next_slice_group_param->macroblock_address / width_in_mbs;
2100     } else {
2101         h_next_start_pos = 0;
2102         v_next_start_pos = height_in_mbs;
2103     }
2104
2105     gen8_mfc_mpeg2_slicegroup_state(ctx,
2106                                     encoder_context,
2107                                     h_start_pos,
2108                                     v_start_pos,
2109                                     h_next_start_pos,
2110                                     v_next_start_pos,
2111                                     slice_index == 0,
2112                                     next_slice_group_param == NULL,
2113                                     slice_param->is_intra_slice,
2114                                     slice_param->quantiser_scale_code,
2115                                     slice_batch);
2116
2117     if (slice_index == 0) 
2118         intel_mfc_mpeg2_pipeline_header_programing(ctx, encode_state, encoder_context, slice_batch);
2119
2120     /* Insert '00' to make sure the header is valid */
2121     mfc_context->insert_object(ctx,
2122                                encoder_context,
2123                                (unsigned int*)section_delimiter,
2124                                1,
2125                                8,   /* 8bits in the last DWORD */
2126                                1,   /* 1 byte */
2127                                1,
2128                                0,
2129                                0,
2130                                slice_batch);
2131
2132     for (i = 0; i < encode_state->slice_params_ext[slice_index]->num_elements; i++) {
2133         /* PAK for each macroblocks */
2134         for (j = 0; j < slice_param->num_macroblocks; j++) {
2135             int h_pos = (slice_param->macroblock_address + j) % width_in_mbs;
2136             int v_pos = (slice_param->macroblock_address + j) / width_in_mbs;
2137             int first_mb_in_slice = (j == 0);
2138             int last_mb_in_slice = (j == slice_param->num_macroblocks - 1);
2139             int first_mb_in_slice_group = (i == 0 && j == 0);
2140             int last_mb_in_slice_group = (i == encode_state->slice_params_ext[slice_index]->num_elements - 1 &&
2141                                           j == slice_param->num_macroblocks - 1);
2142
2143             msg = (unsigned int *)(msg_ptr + (slice_param->macroblock_address + j) * vme_context->vme_output.size_block);
2144
2145             if (slice_param->is_intra_slice) {
2146                 gen8_mfc_mpeg2_pak_object_intra(ctx,
2147                                                 encoder_context,
2148                                                 h_pos, v_pos,
2149                                                 first_mb_in_slice,
2150                                                 last_mb_in_slice,
2151                                                 first_mb_in_slice_group,
2152                                                 last_mb_in_slice_group,
2153                                                 0x1a,
2154                                                 slice_param->quantiser_scale_code,
2155                                                 0x3f,
2156                                                 0,
2157                                                 0xff,
2158                                                 slice_batch);
2159             } else {
2160                 int inter_rdo, intra_rdo;
2161                 inter_rdo = msg[AVC_INTER_RDO_OFFSET] & AVC_RDO_MASK;
2162                 intra_rdo = msg[AVC_INTRA_RDO_OFFSET] & AVC_RDO_MASK;
2163
2164                 if (intra_rdo < inter_rdo) 
2165                     gen8_mfc_mpeg2_pak_object_intra(ctx,
2166                                                      encoder_context,
2167                                                      h_pos, v_pos,
2168                                                      first_mb_in_slice,
2169                                                      last_mb_in_slice,
2170                                                      first_mb_in_slice_group,
2171                                                      last_mb_in_slice_group,
2172                                                      0x1a,
2173                                                      slice_param->quantiser_scale_code,
2174                                                      0x3f,
2175                                                      0,
2176                                                      0xff,
2177                                                      slice_batch);
2178                 else
2179                     gen8_mfc_mpeg2_pak_object_inter(ctx,
2180                                                 encode_state,
2181                                                 encoder_context,
2182                                                 msg,
2183                                                 width_in_mbs, height_in_mbs,
2184                                                 h_pos, v_pos,
2185                                                 first_mb_in_slice,
2186                                                 last_mb_in_slice,
2187                                                 first_mb_in_slice_group,
2188                                                 last_mb_in_slice_group,
2189                                                 slice_param->quantiser_scale_code,
2190                                                 0,
2191                                                 0xff,
2192                                                 slice_batch);
2193             }
2194         }
2195
2196         slice_param++;
2197     }
2198
2199     dri_bo_unmap(vme_context->vme_output.bo);
2200
2201     /* tail data */
2202     if (next_slice_group_param == NULL) { /* end of a picture */
2203         mfc_context->insert_object(ctx,
2204                                    encoder_context,
2205                                    (unsigned int *)tail_delimiter,
2206                                    2,
2207                                    8,   /* 8bits in the last DWORD */
2208                                    5,   /* 5 bytes */
2209                                    1,
2210                                    1,
2211                                    0,
2212                                    slice_batch);
2213     } else {        /* end of a lsice group */
2214         mfc_context->insert_object(ctx,
2215                                    encoder_context,
2216                                    (unsigned int *)section_delimiter,
2217                                    1,
2218                                    8,   /* 8bits in the last DWORD */
2219                                    1,   /* 1 byte */
2220                                    1,
2221                                    1,
2222                                    0,
2223                                    slice_batch);
2224     }
2225 }
2226
2227 /* 
2228  * A batch buffer for all slices, including slice state, 
2229  * slice insert object and slice pak object commands
2230  *
2231  */
2232 static dri_bo *
2233 gen8_mfc_mpeg2_software_slice_batchbuffer(VADriverContextP ctx,
2234                                           struct encode_state *encode_state,
2235                                           struct intel_encoder_context *encoder_context)
2236 {
2237     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
2238     struct intel_batchbuffer *batch;
2239     VAEncSliceParameterBufferMPEG2 *next_slice_group_param = NULL;
2240     dri_bo *batch_bo;
2241     int i;
2242
2243     batch = mfc_context->aux_batchbuffer;
2244     batch_bo = batch->buffer;
2245
2246     for (i = 0; i < encode_state->num_slice_params_ext; i++) {
2247         if (i == encode_state->num_slice_params_ext - 1)
2248             next_slice_group_param = NULL;
2249         else
2250             next_slice_group_param = (VAEncSliceParameterBufferMPEG2 *)encode_state->slice_params_ext[i + 1]->buffer;
2251
2252         gen8_mfc_mpeg2_pipeline_slice_group(ctx, encode_state, encoder_context, i, next_slice_group_param, batch);
2253     }
2254
2255     intel_batchbuffer_align(batch, 8);
2256     
2257     BEGIN_BCS_BATCH(batch, 2);
2258     OUT_BCS_BATCH(batch, 0);
2259     OUT_BCS_BATCH(batch, MI_BATCH_BUFFER_END);
2260     ADVANCE_BCS_BATCH(batch);
2261
2262     dri_bo_reference(batch_bo);
2263     intel_batchbuffer_free(batch);
2264     mfc_context->aux_batchbuffer = NULL;
2265
2266     return batch_bo;
2267 }
2268
/* Picture-level MFX programming for MPEG-2 encode: pipe mode select,
 * surface and indirect-object/buffer addresses, picture state, then the
 * (forward-)quantiser matrix states.  The emission order follows the
 * MFX programming sequence and must not be rearranged. */
static void
gen8_mfc_mpeg2_pipeline_picture_programing(VADriverContextP ctx,
                                           struct encode_state *encode_state,
                                           struct intel_encoder_context *encoder_context)
{
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;

    mfc_context->pipe_mode_select(ctx, MFX_FORMAT_MPEG2, encoder_context);
    mfc_context->set_surface_state(ctx, encoder_context);
    mfc_context->ind_obj_base_addr_state(ctx, encoder_context);
    gen8_mfc_pipe_buf_addr_state(ctx, encoder_context);
    gen8_mfc_bsp_buf_base_addr_state(ctx, encoder_context);
    gen8_mfc_mpeg2_pic_state(ctx, encoder_context, encode_state);
    gen8_mfc_mpeg2_qm_state(ctx, encoder_context);
    gen8_mfc_mpeg2_fqm_state(ctx, encoder_context);
}
2285
/* Build the complete BCS command stream for one MPEG-2 picture: the
 * per-slice commands go into a separate bo which is chained from the
 * main batch with MI_BATCH_BUFFER_START. */
static void
gen8_mfc_mpeg2_pipeline_programing(VADriverContextP ctx,
                                   struct encode_state *encode_state,
                                   struct intel_encoder_context *encoder_context)
{
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    dri_bo *slice_batch_bo;

    /* Generate all slice-level commands first, into their own bo. */
    slice_batch_bo = gen8_mfc_mpeg2_software_slice_batchbuffer(ctx, encode_state, encoder_context);

    // begin programing
    intel_batchbuffer_start_atomic_bcs(batch, 0x4000);
    intel_batchbuffer_emit_mi_flush(batch);

    // picture level programing
    gen8_mfc_mpeg2_pipeline_picture_programing(ctx, encode_state, encoder_context);

    /* Chain to the slice batch.
     * NOTE(review): (1 << 8) looks like the second-level-batch bit and
     * (1 << 0) the address-space select -- confirm against the Gen8 PRM. */
    BEGIN_BCS_BATCH(batch, 4);
    OUT_BCS_BATCH(batch, MI_BATCH_BUFFER_START | (1 << 8) | (1 << 0));
    OUT_BCS_RELOC(batch,
                  slice_batch_bo,
                  I915_GEM_DOMAIN_COMMAND, 0,
                  0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    ADVANCE_BCS_BATCH(batch);

    // end programing
    intel_batchbuffer_end_atomic(batch);

    /* The relocation above keeps the bo referenced by the kernel;
     * drop the local reference taken in the slice-batch builder. */
    dri_bo_unreference(slice_batch_bo);
}
2318
2319 static VAStatus
2320 intel_mfc_mpeg2_prepare(VADriverContextP ctx, 
2321                         struct encode_state *encode_state,
2322                         struct intel_encoder_context *encoder_context)
2323 {
2324     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
2325     struct object_surface *obj_surface; 
2326     struct object_buffer *obj_buffer;
2327     struct i965_coded_buffer_segment *coded_buffer_segment;
2328     VAStatus vaStatus = VA_STATUS_SUCCESS;
2329     dri_bo *bo;
2330     int i;
2331
2332     /* reconstructed surface */
2333     obj_surface = encode_state->reconstructed_object;
2334     i965_check_alloc_surface_bo(ctx, obj_surface, 1, VA_FOURCC_NV12, SUBSAMPLE_YUV420);
2335     mfc_context->pre_deblocking_output.bo = obj_surface->bo;
2336     dri_bo_reference(mfc_context->pre_deblocking_output.bo);
2337     mfc_context->surface_state.width = obj_surface->orig_width;
2338     mfc_context->surface_state.height = obj_surface->orig_height;
2339     mfc_context->surface_state.w_pitch = obj_surface->width;
2340     mfc_context->surface_state.h_pitch = obj_surface->height;
2341
2342     /* forward reference */
2343     obj_surface = encode_state->reference_objects[0];
2344
2345     if (obj_surface && obj_surface->bo) {
2346         mfc_context->reference_surfaces[0].bo = obj_surface->bo;
2347         dri_bo_reference(mfc_context->reference_surfaces[0].bo);
2348     } else
2349         mfc_context->reference_surfaces[0].bo = NULL;
2350
2351     /* backward reference */
2352     obj_surface = encode_state->reference_objects[1];
2353
2354     if (obj_surface && obj_surface->bo) {
2355         mfc_context->reference_surfaces[1].bo = obj_surface->bo;
2356         dri_bo_reference(mfc_context->reference_surfaces[1].bo);
2357     } else {
2358         mfc_context->reference_surfaces[1].bo = mfc_context->reference_surfaces[0].bo;
2359
2360         if (mfc_context->reference_surfaces[1].bo)
2361             dri_bo_reference(mfc_context->reference_surfaces[1].bo);
2362     }
2363
2364     for (i = 2; i < ARRAY_ELEMS(mfc_context->reference_surfaces); i++) {
2365         mfc_context->reference_surfaces[i].bo = mfc_context->reference_surfaces[i & 1].bo;
2366
2367         if (mfc_context->reference_surfaces[i].bo)
2368             dri_bo_reference(mfc_context->reference_surfaces[i].bo);
2369     }
2370     
2371     /* input YUV surface */
2372     obj_surface = encode_state->input_yuv_object;
2373     mfc_context->uncompressed_picture_source.bo = obj_surface->bo;
2374     dri_bo_reference(mfc_context->uncompressed_picture_source.bo);
2375
2376     /* coded buffer */
2377     obj_buffer = encode_state->coded_buf_object;
2378     bo = obj_buffer->buffer_store->bo;
2379     mfc_context->mfc_indirect_pak_bse_object.bo = bo;
2380     mfc_context->mfc_indirect_pak_bse_object.offset = I965_CODEDBUFFER_HEADER_SIZE;
2381     mfc_context->mfc_indirect_pak_bse_object.end_offset = ALIGN(obj_buffer->size_element - 0x1000, 0x1000);
2382     dri_bo_reference(mfc_context->mfc_indirect_pak_bse_object.bo);
2383
2384     /* set the internal flag to 0 to indicate the coded size is unknown */
2385     dri_bo_map(bo, 1);
2386     coded_buffer_segment = (struct i965_coded_buffer_segment *)bo->virtual;
2387     coded_buffer_segment->mapped = 0;
2388     coded_buffer_segment->codec = encoder_context->codec;
2389     dri_bo_unmap(bo);
2390
2391     return vaStatus;
2392 }
2393
/* Top-level MPEG-2 encode entry point for one picture: initialize MFC
 * state, pin all surfaces/buffers, build the BCS command stream and
 * submit it. */
static VAStatus
gen8_mfc_mpeg2_encode_picture(VADriverContextP ctx, 
                              struct encode_state *encode_state,
                              struct intel_encoder_context *encoder_context)
{
    gen8_mfc_init(ctx, encode_state, encoder_context);
    intel_mfc_mpeg2_prepare(ctx, encode_state, encoder_context);
    /* Program the BCS (MFX) pipeline. */
    gen8_mfc_mpeg2_pipeline_programing(ctx, encode_state, encoder_context);
    gen8_mfc_run(ctx, encode_state, encoder_context);

    return VA_STATUS_SUCCESS;
}
2407
2408 /* JPEG encode methods */
2409
2410 static VAStatus
2411 intel_mfc_jpeg_prepare(VADriverContextP ctx, 
2412                         struct encode_state *encode_state,
2413                         struct intel_encoder_context *encoder_context)
2414 {
2415     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
2416     struct object_surface *obj_surface; 
2417     struct object_buffer *obj_buffer;
2418     struct i965_coded_buffer_segment *coded_buffer_segment;
2419     VAStatus vaStatus = VA_STATUS_SUCCESS;
2420     dri_bo *bo;
2421    
2422     /* input YUV surface */
2423     obj_surface = encode_state->input_yuv_object;
2424     mfc_context->uncompressed_picture_source.bo = obj_surface->bo;
2425     dri_bo_reference(mfc_context->uncompressed_picture_source.bo);
2426
2427     /* coded buffer */
2428     obj_buffer = encode_state->coded_buf_object;
2429     bo = obj_buffer->buffer_store->bo;
2430     mfc_context->mfc_indirect_pak_bse_object.bo = bo;
2431     mfc_context->mfc_indirect_pak_bse_object.offset = I965_CODEDBUFFER_HEADER_SIZE;
2432     mfc_context->mfc_indirect_pak_bse_object.end_offset = ALIGN(obj_buffer->size_element - 0x1000, 0x1000);
2433     dri_bo_reference(mfc_context->mfc_indirect_pak_bse_object.bo);
2434
2435     /* set the internal flag to 0 to indicate the coded size is unknown */
2436     dri_bo_map(bo, 1);
2437     coded_buffer_segment = (struct i965_coded_buffer_segment *)bo->virtual;
2438     coded_buffer_segment->mapped = 0;
2439     coded_buffer_segment->codec = encoder_context->codec;
2440     dri_bo_unmap(bo);
2441
2442     return vaStatus;
2443 }
2444
2445
2446 static void 
2447 gen8_mfc_jpeg_set_surface_state(VADriverContextP ctx,
2448                         struct intel_encoder_context *encoder_context,
2449                         struct encode_state *encode_state)
2450 {
2451     struct intel_batchbuffer *batch = encoder_context->base.batch;
2452     struct object_surface *obj_surface = encode_state->input_yuv_object;
2453     unsigned int input_fourcc;
2454     unsigned int y_cb_offset;
2455     unsigned int y_cr_offset;
2456     unsigned int surface_format;
2457
2458     assert(obj_surface);
2459
2460     y_cb_offset = obj_surface->y_cb_offset;
2461     y_cr_offset = obj_surface->y_cr_offset;
2462     input_fourcc = obj_surface->fourcc;
2463
2464     surface_format = (obj_surface->fourcc == VA_FOURCC_Y800) ?
2465         MFX_SURFACE_MONOCHROME : MFX_SURFACE_PLANAR_420_8;
2466         
2467         
2468      switch (input_fourcc) {
2469         case VA_FOURCC_Y800: {
2470             surface_format = MFX_SURFACE_MONOCHROME;
2471             break;
2472         }
2473         case VA_FOURCC_NV12: { 
2474             surface_format = MFX_SURFACE_PLANAR_420_8;
2475             break;
2476         }      
2477         case VA_FOURCC_UYVY: { 
2478             surface_format = MFX_SURFACE_YCRCB_SWAPY;
2479             break;
2480         }
2481         case VA_FOURCC_YUY2: { 
2482             surface_format = MFX_SURFACE_YCRCB_NORMAL;
2483             break;
2484         }
2485         case VA_FOURCC_RGBA:
2486         case VA_FOURCC_444P: {
2487             surface_format = MFX_SURFACE_R8G8B8A8_UNORM;
2488             break;
2489         }
2490     }
2491
2492     BEGIN_BCS_BATCH(batch, 6);
2493
2494     OUT_BCS_BATCH(batch, MFX_SURFACE_STATE | (6 - 2));
2495     OUT_BCS_BATCH(batch, 0);
2496     OUT_BCS_BATCH(batch,
2497                   ((obj_surface->orig_height - 1) << 18) |
2498                   ((obj_surface->orig_width - 1) << 4));
2499     OUT_BCS_BATCH(batch,
2500                   (surface_format << 28) | /* Surface Format */
2501                   (0 << 27) | /* must be 1 for interleave U/V, hardware requirement for AVC/VC1/MPEG and 0 for JPEG */
2502                   (0 << 22) | /* surface object control state, FIXME??? */
2503                   ((obj_surface->width - 1) << 3) | /* pitch */
2504                   (0 << 2)  | /* must be 0 for interleave U/V */
2505                   (1 << 1)  | /* must be tiled */
2506                   (I965_TILEWALK_YMAJOR << 0));  /* tile walk, TILEWALK_YMAJOR */
2507     OUT_BCS_BATCH(batch,
2508                   (0 << 16) | /* X offset for U(Cb), must be 0 */
2509                   (y_cb_offset << 0)); /* Y offset for U(Cb) */
2510     OUT_BCS_BATCH(batch,
2511                   (0 << 16) | /* X offset for V(Cr), must be 0 */
2512                   (y_cr_offset << 0)); /* Y offset for V(Cr), must be 0 for video codec, non-zoeo for JPEG */
2513                  
2514
2515     ADVANCE_BCS_BATCH(batch);
2516 }
2517
/* Emit MFX_JPEG_PIC_STATE: derives the HW input surface format, the
 * output MCU structure, the partial-MCU pixel counts and the frame
 * dimensions in 8x8 blocks from the input surface FOURCC and the app's
 * picture parameters. */
static void
gen8_mfc_jpeg_pic_state(VADriverContextP ctx,
                        struct intel_encoder_context *encoder_context,
                        struct encode_state *encode_state)
{
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    struct object_surface *obj_surface = encode_state->input_yuv_object;
    VAEncPictureParameterBufferJPEG *pic_param;
    unsigned int  surface_format;
    unsigned int  frame_width_in_blks;
    unsigned int  frame_height_in_blks;
    unsigned int  pixels_in_horizontal_lastMCU;
    unsigned int  pixels_in_vertical_lastMCU;
    unsigned int  input_surface_format;
    unsigned int  output_mcu_format;
    unsigned int  picture_width;
    unsigned int  picture_height;  

    assert(encode_state->pic_param_ext && encode_state->pic_param_ext->buffer);
    assert(obj_surface);
    pic_param = (VAEncPictureParameterBufferJPEG *)encode_state->pic_param_ext->buffer;
    surface_format = obj_surface->fourcc;
    picture_width = pic_param->picture_width;
    picture_height = pic_param->picture_height;
    
    /* Map the input FOURCC to the HW's input surface format and output
     * MCU structure; unknown formats fall back to NV12/YUV420. */
    switch (surface_format) {
        case VA_FOURCC_Y800: {
            input_surface_format = JPEG_ENC_SURFACE_Y8; 
            output_mcu_format = JPEG_ENC_MCU_YUV400;
            break;
        }
        case VA_FOURCC_NV12: { 
            input_surface_format = JPEG_ENC_SURFACE_NV12; 
            output_mcu_format = JPEG_ENC_MCU_YUV420; 
            break;
        }      
        case VA_FOURCC_UYVY: { 
            input_surface_format = JPEG_ENC_SURFACE_UYVY; 
            output_mcu_format = JPEG_ENC_MCU_YUV422H_2Y; 
            break;
        }
        case VA_FOURCC_YUY2: { 
            input_surface_format = JPEG_ENC_SURFACE_YUY2; 
            output_mcu_format = JPEG_ENC_MCU_YUV422H_2Y; 
            break;
        }

        case VA_FOURCC_RGBA:
        case VA_FOURCC_444P: { 
            input_surface_format = JPEG_ENC_SURFACE_RGB; 
            output_mcu_format = JPEG_ENC_MCU_RGB; 
            break;
        }
        default : {
            input_surface_format = JPEG_ENC_SURFACE_NV12; 
            output_mcu_format = JPEG_ENC_MCU_YUV420;
            break;
        }
    }

    /* Compute the number of valid pixels in the last (possibly partial)
     * MCU and the frame size in 8x8 blocks; every output_mcu_format set
     * by the switch above is covered here. */
    switch (output_mcu_format) {
        
        case JPEG_ENC_MCU_YUV400:
        case JPEG_ENC_MCU_RGB: {
            pixels_in_horizontal_lastMCU = (picture_width % 8);
            pixels_in_vertical_lastMCU = (picture_height % 8); 

            //H1=1,V1=1 for YUV400 and YUV444. So, compute these values accordingly
            frame_width_in_blks = ((picture_width + 7) / 8); 
            frame_height_in_blks = ((picture_height + 7) / 8);
            break;
        }
        
        case JPEG_ENC_MCU_YUV420: {        
            /* Odd dimensions are rounded up to the next even pixel
             * count -- presumably because H1=2/V1=2 subsampling pairs
             * luma samples; confirm against the PRM. */
            if((picture_width % 2) == 0) 
                pixels_in_horizontal_lastMCU = picture_width % 16; 
            else 
                pixels_in_horizontal_lastMCU   = ((picture_width % 16) + 1) % 16; 
            
            if((picture_height % 2) == 0) 
                pixels_in_vertical_lastMCU     = picture_height % 16; 
            else 
                pixels_in_vertical_lastMCU   = ((picture_height % 16) + 1) % 16; 

            //H1=2,V1=2 for YUV420. So, compute these values accordingly
            frame_width_in_blks = ((picture_width + 15) / 16) * 2;
            frame_height_in_blks = ((picture_height + 15) / 16) * 2;
            break;
        }
        
        case JPEG_ENC_MCU_YUV422H_2Y: {
            if(picture_width % 2 == 0) 
                pixels_in_horizontal_lastMCU = picture_width % 16; 
            else 
                pixels_in_horizontal_lastMCU = ((picture_width % 16) + 1) % 16; 
            
            pixels_in_vertical_lastMCU = picture_height % 8;
            
            //H1=2,V1=1 for YUV422H_2Y. So, compute these values accordingly
            frame_width_in_blks = ((picture_width + 15) / 16) * 2;
            frame_height_in_blks = ((picture_height + 7) / 8);
            break;            
        }       
    } //end of switch
   
    BEGIN_BCS_BATCH(batch, 3);
    /* DWORD 0 */
    OUT_BCS_BATCH(batch, MFX_JPEG_PIC_STATE | (3 - 2)); 
    /* DWORD 1 */
    OUT_BCS_BATCH(batch,
                  ( pixels_in_horizontal_lastMCU << 26) |    /* Pixels In Horizontal Last MCU */
                  ( pixels_in_vertical_lastMCU << 21)   |    /* Pixels In Vertical Last MCU */
                  ( input_surface_format << 8)          |    /* Input Surface format */
                  ( output_mcu_format << 0));                /* Output MCU Structure */
    /* DWORD 2 */
    OUT_BCS_BATCH(batch,
                  ((frame_height_in_blks - 1) << 16)    |   /* Frame Height In Blks Minus 1 */
                  (JPEG_ENC_ROUND_QUANT_DEFAULT  << 13) |   /* Rounding Quant set to default value 0 */
                  ((frame_width_in_blks - 1) << 0));        /* Frame Width In Blks Minus 1 */
    ADVANCE_BCS_BATCH(batch);
}
2640
/*
 * Convert a 64-entry quantiser matrix into the 32-dword reciprocal
 * layout the HW expects: each entry becomes 65535/Q (a Q0.16-style
 * reciprocal) and two consecutive 16-bit reciprocals are packed per
 * dword, the even index in the low half-word.
 *
 * Fix: the reciprocals were stored in a signed `short`, so 65535/1
 * became -1; the later promotion sign-extended it to 0xFFFFFFFF,
 * clobbering the partner half-word (and left-shifting a negative value
 * is undefined behaviour).  Use uint16_t and explicit widening.
 */
static void 
get_reciprocal_dword_qm(unsigned char *raster_qm, uint32_t *dword_qm)
{
    int i;
    uint16_t reciprocal_qm[64];

    for (i = 0; i < 64; i++) {
        /* Callers clamp Q to [1, 255]; guard against 0 anyway so a bad
         * table cannot cause a division by zero. */
        unsigned int q = raster_qm[i] ? raster_qm[i] : 1;

        reciprocal_qm[i] = (uint16_t)(65535 / q);
    }

    /* Pack pairs: dword j holds entries 2j (low) and 2j+1 (high). */
    for (i = 0; i < 64; i += 2)
        dword_qm[i / 2] = ((uint32_t)reciprocal_qm[i + 1] << 16) | reciprocal_qm[i];
}
2658
2659
2660 static void 
2661 gen8_mfc_jpeg_fqm_state(VADriverContextP ctx,
2662                         struct intel_encoder_context *encoder_context,
2663                         struct encode_state *encode_state)
2664 {
2665     unsigned int quality = 0;
2666     uint32_t temp, i = 0, j = 0, dword_qm[32];
2667     VAEncPictureParameterBufferJPEG *pic_param;
2668     VAQMatrixBufferJPEG *qmatrix;
2669     unsigned char raster_qm[64], column_raster_qm[64];
2670     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
2671     
2672     assert(encode_state->pic_param_ext && encode_state->pic_param_ext->buffer);
2673     pic_param = (VAEncPictureParameterBufferJPEG *)encode_state->pic_param_ext->buffer;
2674     quality = pic_param->quality;
2675     
2676     //If the app sends the qmatrix, use it, buffer it for using it with the next frames 
2677     //The app can send qmatrix for the first frame and not send for the subsequent frames
2678     if(encode_state->q_matrix && encode_state->q_matrix->buffer) {
2679         qmatrix = (VAQMatrixBufferJPEG *)encode_state->q_matrix->buffer;
2680
2681         mfc_context->buffered_qmatrix.load_lum_quantiser_matrix = 1;
2682         memcpy(mfc_context->buffered_qmatrix.lum_quantiser_matrix, qmatrix->lum_quantiser_matrix, 64 * (sizeof(unsigned char)));
2683
2684         if(pic_param->num_components > 1) {
2685             mfc_context->buffered_qmatrix.load_chroma_quantiser_matrix = 1;
2686             memcpy(mfc_context->buffered_qmatrix.chroma_quantiser_matrix, qmatrix->chroma_quantiser_matrix, 64 * (sizeof(unsigned char)));
2687         } else {
2688             mfc_context->buffered_qmatrix.load_chroma_quantiser_matrix = 0;
2689         }
2690
2691     } else {
2692         //If the app doesnt send the qmatrix, use the buffered/default qmatrix
2693         qmatrix = &mfc_context->buffered_qmatrix;
2694         qmatrix->load_lum_quantiser_matrix = 1;
2695         qmatrix->load_chroma_quantiser_matrix = (pic_param->num_components > 1) ? 1 : 0;
2696     }   
2697
2698
2699     //As per the design, normalization of the quality factor and scaling of the Quantization tables
2700     //based on the quality factor needs to be done in the driver before sending the values to the HW.
2701     //But note, the driver expects the scaled quantization tables (as per below logic) to be sent as
2702     //packed header information. The packed header is written as the header of the jpeg file. This
2703     //header information is used to decode the jpeg file. So, it is the app's responsibility to send
2704     //the correct header information (See build_packed_jpeg_header_buffer() in jpegenc.c in LibVa on
2705     //how to do this). QTables can be different for different applications. If no tables are provided,
2706     //the default tables in the driver are used.
2707
2708     //Normalization of the quality factor
2709     if (quality > 100) quality=100;
2710     if (quality == 0)  quality=1;
2711     quality = (quality < 50) ? (5000/quality) : (200 - (quality*2)); 
2712     
2713     //Step 1. Apply Quality factor and clip to range [1, 255] for luma and chroma Quantization matrices
2714     //Step 2. HW expects the 1/Q[i] values in the qm sent, so get reciprocals
2715     //Step 3. HW also expects 32 dwords, hence combine 2 (1/Q) values into 1 dword
2716     //Step 4. Send the Quantization matrix to the HW, use gen8_mfc_fqm_state
2717     
2718     //For luma (Y or R)
2719     if(qmatrix->load_lum_quantiser_matrix) {
2720         //apply quality to lum_quantiser_matrix
2721         for(i=0; i < 64; i++) {
2722             temp = (qmatrix->lum_quantiser_matrix[i] * quality)/100;
2723             //clamp to range [1,255]
2724             temp = (temp > 255) ? 255 : temp;
2725             temp = (temp < 1) ? 1 : temp;
2726             qmatrix->lum_quantiser_matrix[i] = (unsigned char)temp;
2727         }       
2728         
2729         //For VAAPI, the VAQMatrixBuffer needs to be in zigzag order. 
2730         //The App should send it in zigzag. Now, the driver has to extract the raster from it. 
2731         for (j = 0; j < 64; j++)
2732             raster_qm[zigzag_direct[j]] = qmatrix->lum_quantiser_matrix[j];
2733
2734         //Convert the raster order(row-ordered) to the column-raster (column by column).
2735         //To be consistent with the other encoders, send it in column order.
2736         //Need to double check if our HW expects col or row raster.
2737         for (j = 0; j < 64; j++) {
2738             int row = j / 8, col = j % 8;
2739             column_raster_qm[col * 8 + row] = raster_qm[j];
2740         }
2741         
2742         //Convert to raster QM to reciprocal. HW expects values in reciprocal.
2743         get_reciprocal_dword_qm(column_raster_qm, dword_qm);
2744         
2745         //send the luma qm to the command buffer
2746         gen8_mfc_fqm_state(ctx, MFX_QM_JPEG_LUMA_Y_QUANTIZER_MATRIX, dword_qm, 32, encoder_context);
2747     } 
2748     
2749     //For Chroma, if chroma exists (Cb, Cr or G, B)
2750     if(qmatrix->load_chroma_quantiser_matrix) {
2751         //apply quality to chroma_quantiser_matrix
2752         for(i=0; i < 64; i++) {
2753             temp = (qmatrix->chroma_quantiser_matrix[i] * quality)/100;
2754             //clamp to range [1,255]
2755             temp = (temp > 255) ? 255 : temp;
2756             temp = (temp < 1) ? 1 : temp;
2757             qmatrix->chroma_quantiser_matrix[i] = (unsigned char)temp;
2758         }
2759         
2760         //For VAAPI, the VAQMatrixBuffer needs to be in zigzag order. 
2761         //The App should send it in zigzag. Now, the driver has to extract the raster from it. 
2762         for (j = 0; j < 64; j++)
2763             raster_qm[zigzag_direct[j]] = qmatrix->chroma_quantiser_matrix[j];
2764         
2765         //Convert the raster order(row-ordered) to the column-raster (column by column).
2766         //To be consistent with the other encoders, send it in column order.
2767         //Need to double check if our HW expects col or row raster.
2768         for (j = 0; j < 64; j++) {
2769             int row = j / 8, col = j % 8;
2770             column_raster_qm[col * 8 + row] = raster_qm[j];
2771         }
2772
2773
2774         //Convert to raster QM to reciprocal. HW expects values in reciprocal.
2775         get_reciprocal_dword_qm(column_raster_qm, dword_qm);
2776
2777         //send the same chroma qm to the command buffer (for both U,V or G,B)
2778         gen8_mfc_fqm_state(ctx, MFX_QM_JPEG_CHROMA_CB_QUANTIZER_MATRIX, dword_qm, 32, encoder_context);
2779         gen8_mfc_fqm_state(ctx, MFX_QM_JPEG_CHROMA_CR_QUANTIZER_MATRIX, dword_qm, 32, encoder_context);        
2780     }
2781 }
2782
2783
//Translation of Table K.5 into code: convert a Huffman value (an
//RRRRSSSS run/size byte) from the huffval buffer into the index used
//by the coefficient and size tables.
uint8_t map_huffval_to_index(uint8_t huff_val) 
{
    uint8_t run = (huff_val >> 4) & 0x0F;
    uint8_t size = huff_val & 0x0F;
    uint8_t index = run * 0xA + size;

    /* Values of 0xF0 and above occupy one extra slot in the table. */
    if (huff_val >= 0xF0)
        index++;

    return index;
}
2798
2799
//Implementation of Flow chart Annex C - Figure C.1: expand the BITS
//list (bits[l-1] = number of codes of length l, l = 1..16) into a
//per-symbol code-size table, zero-terminated; *lastK gets the symbol
//count.
static void
generate_huffman_codesizes_table(uint8_t *bits, uint8_t *huff_size_table, uint8_t *lastK) 
{
    uint8_t len, n, k = 0;

    for (len = 1; len <= 16; len++) {
        for (n = 0; n < bits[len - 1]; n++)
            huff_size_table[k++] = len;
    }

    huff_size_table[k] = 0;   /* terminator, consumed by Figure C.2 */
    *lastK = k;
}
2819
//Implementation of Flow chart Annex C - Figure C.2: assign canonical
//code words to the zero-terminated code-size table from Figure C.1.
static void
generate_huffman_codes_table(uint8_t *huff_size_table, uint16_t *huff_code_table)
{
    uint8_t k = 0;
    uint16_t next_code = 0;
    uint8_t cur_size = huff_size_table[0];

    while (huff_size_table[k] != 0) {
        /* Hand out consecutive codes to every symbol of this size.
         * (Runs zero times when no symbol has the current length.) */
        while (huff_size_table[k] == cur_size) {
            /* A Huffman code can never be 0xFFFF; replace it with 0. */
            if (next_code == 0xFFFF)
                next_code = 0;

            huff_code_table[k++] = next_code++;
        }

        /* Next code length: append a zero bit. */
        next_code <<= 1;
        cur_size++;
    }
}
2847
//Implementation of Flow chart Annex C - Figure C.3
//Reorders the sequentially generated code/size pairs into symbol order,
//scattering each pair into the slot given by map_huffval_to_index.
//type 0 = DC table (12 entries), otherwise AC table (162 entries).
static void
generate_ordered_codes_table(uint8_t *huff_vals, uint8_t *huff_size_table, uint16_t *huff_code_table, uint8_t type, uint8_t lastK)
{
    uint8_t k = 0;

    const uint8_t table_len = (type == 0) ? 12 : 162;
    uint8_t size_by_symbol[table_len];
    uint16_t code_by_symbol[table_len];

    memset(size_by_symbol, 0, sizeof(size_by_symbol));
    memset(code_by_symbol, 0, sizeof(code_by_symbol));

    /* do-while preserved: the original always processes at least one entry. */
    do {
        uint8_t slot = map_huffval_to_index(huff_vals[k]);
        code_by_symbol[slot] = huff_code_table[k];
        size_by_symbol[slot] = huff_size_table[k];
        k++;
    } while (k < lastK);

    /* Copy the reordered tables back in place over the inputs. */
    memcpy(huff_size_table, size_by_symbol, sizeof(uint8_t) * table_len);
    memcpy(huff_code_table, code_by_symbol, sizeof(uint16_t) * table_len);
}
2871
2872
2873 //This method converts the huffman table to code words which is needed by the HW
2874 //Flowcharts from Jpeg Spec Annex C - Figure C.1, Figure C.2, Figure C.3 are used here
2875 static void
2876 convert_hufftable_to_codes(VAHuffmanTableBufferJPEGBaseline *huff_buffer, uint32_t *table, uint8_t type, uint8_t index)
2877 {
2878     uint8_t lastK = 0, i=0; 
2879     uint8_t huff_val_size = 0;
2880     uint8_t *huff_bits, *huff_vals;
2881
2882     huff_val_size = (type == 0) ? 12 : 162; 
2883     uint8_t huff_size_table[huff_val_size+1]; //The +1 for adding 0 at the end of huff_val_size
2884     uint16_t huff_code_table[huff_val_size];
2885
2886     memset(huff_size_table, 0, sizeof(huff_size_table));
2887     memset(huff_code_table, 0, sizeof(huff_code_table));
2888
2889     huff_bits = (type == 0) ? (huff_buffer->huffman_table[index].num_dc_codes) : (huff_buffer->huffman_table[index].num_ac_codes);
2890     huff_vals = (type == 0) ? (huff_buffer->huffman_table[index].dc_values) : (huff_buffer->huffman_table[index].ac_values);
2891     
2892
2893     //Generation of table of Huffman code sizes
2894     generate_huffman_codesizes_table(huff_bits, huff_size_table, &lastK);
2895        
2896     //Generation of table of Huffman codes
2897     generate_huffman_codes_table(huff_size_table, huff_code_table);
2898        
2899     //Ordering procedure for encoding procedure code tables
2900     generate_ordered_codes_table(huff_vals, huff_size_table, huff_code_table, type, lastK);
2901
2902     //HW expects Byte0: Code length; Byte1,Byte2: Code Word, Byte3: Dummy
2903     //Since IA is littlended, &, | and << accordingly to store the values in the DWord.
2904     for(i=0; i<huff_val_size; i++) {
2905         table[i] = 0;
2906         table[i] = ((huff_size_table[i] & 0xFF) | ((huff_code_table[i] & 0xFFFF) << 8));
2907     }
2908
2909 }
2910
2911 //send the huffman table using MFC_JPEG_HUFF_TABLE_STATE
2912 static void
2913 gen8_mfc_jpeg_huff_table_state(VADriverContextP ctx,
2914                                            struct encode_state *encode_state,
2915                                            struct intel_encoder_context *encoder_context,
2916                                            int num_tables)
2917 {
2918     VAHuffmanTableBufferJPEGBaseline *huff_buffer;
2919     struct intel_batchbuffer *batch = encoder_context->base.batch;
2920     uint8_t index;
2921     uint32_t dc_table[12], ac_table[162]; 
2922     
2923     assert(encode_state->huffman_table && encode_state->huffman_table->buffer);
2924     huff_buffer = (VAHuffmanTableBufferJPEGBaseline *)encode_state->huffman_table->buffer;
2925
2926     memset(dc_table, 0, 12);
2927     memset(ac_table, 0, 162);
2928
2929     for (index = 0; index < num_tables; index++) {
2930         int id = va_to_gen7_jpeg_hufftable[index];
2931  
2932         if (!huff_buffer->load_huffman_table[index])
2933             continue;
2934      
2935         //load DC table with 12 DWords
2936         convert_hufftable_to_codes(huff_buffer, dc_table, 0, index);  //0 for Dc
2937
2938         //load AC table with 162 DWords 
2939         convert_hufftable_to_codes(huff_buffer, ac_table, 1, index);  //1 for AC 
2940
2941         BEGIN_BCS_BATCH(batch, 176);
2942         OUT_BCS_BATCH(batch, MFC_JPEG_HUFF_TABLE_STATE | (176 - 2));
2943         OUT_BCS_BATCH(batch, id); //Huff table id
2944
2945         //DWord 2 - 13 has DC_TABLE
2946         intel_batchbuffer_data(batch, dc_table, 12*4);
2947
2948         //Dword 14 -175 has AC_TABLE
2949         intel_batchbuffer_data(batch, ac_table, 162*4);
2950         ADVANCE_BCS_BATCH(batch);
2951     }    
2952 }
2953
2954
2955 //This method is used to compute the MCU count used for setting MFC_JPEG_SCAN_OBJECT
2956 static void get_Y_sampling_factors(uint32_t surface_format, uint8_t *h_factor, uint8_t *v_factor)
2957
2958     switch (surface_format) {
2959         case VA_FOURCC_Y800: {
2960             (* h_factor) = 1; 
2961             (* v_factor) = 1;
2962             break;
2963         }
2964         case VA_FOURCC_NV12: { 
2965             (* h_factor) = 2;             
2966             (* v_factor) = 2;
2967             break;
2968         }      
2969         case VA_FOURCC_UYVY: { 
2970             (* h_factor) = 2; 
2971             (* v_factor) = 1;
2972             break;
2973         }
2974         case VA_FOURCC_YUY2: { 
2975             (* h_factor) = 2; 
2976             (* v_factor) = 1;
2977             break;
2978         }
2979         case VA_FOURCC_RGBA:
2980         case VA_FOURCC_444P: { 
2981             (* h_factor) = 1; 
2982             (* v_factor) = 1;
2983             break;
2984         }
2985         default : { //May be  have to insert error handling here. For now just use as below
2986             (* h_factor) = 1; 
2987             (* v_factor) = 1;
2988             break;
2989         }
2990     }
2991 }
2992
//set MFC_JPEG_SCAN_OBJECT
//Emits the 3-DWord MFC_JPEG_SCAN_OBJECT command describing the single scan
//of the frame: total MCU count, huffman table selectors per component,
//restart interval, and the last-scan / header-present flags.
static void
gen8_mfc_jpeg_scan_object(VADriverContextP ctx,
                                           struct encode_state *encode_state,
                                           struct intel_encoder_context *encoder_context)
{
    uint32_t mcu_count, surface_format, Mx, My;
    uint8_t i, horizontal_sampling_factor, vertical_sampling_factor, huff_ac_table=0, huff_dc_table=0;
    uint8_t is_last_scan = 1;    //Jpeg has only 1 scan per frame. When last scan, HW inserts EOI code.
    uint8_t head_present_flag=1; //Header has tables and app data 
    uint16_t num_components, restart_interval;   //restart_interval specifies number of MCUs in an ECS.
    VAEncSliceParameterBufferJPEG *slice_param;
    VAEncPictureParameterBufferJPEG *pic_param;
    
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    struct object_surface *obj_surface = encode_state->input_yuv_object;
    
    assert(encode_state->slice_params_ext[0] && encode_state->slice_params_ext[0]->buffer);
    assert(encode_state->pic_param_ext && encode_state->pic_param_ext->buffer);
    assert(obj_surface);
    pic_param = (VAEncPictureParameterBufferJPEG *)encode_state->pic_param_ext->buffer;
    slice_param = (VAEncSliceParameterBufferJPEG *)encode_state->slice_params_ext[0]->buffer;
    surface_format = obj_surface->fourcc;
    
    //Luma sampling factors determine the MCU size (h*8 x v*8 pixels).
    get_Y_sampling_factors(surface_format, &horizontal_sampling_factor, &vertical_sampling_factor);
    
    // Mx = #MCUs in a row, My = #MCUs in a column (both rounded up)
    Mx = (pic_param->picture_width + (horizontal_sampling_factor*8 -1))/(horizontal_sampling_factor*8);
    My = (pic_param->picture_height + (vertical_sampling_factor*8 -1))/(vertical_sampling_factor*8);
    mcu_count = (Mx * My);
 
    num_components = pic_param->num_components;    
    restart_interval = slice_param->restart_interval;
    
    //Depending on number of components and values set for table selectors, 
    //only those bits are set in 24:22 for AC table, 20:18 for DC table
    //(one selector bit per component, shifted by component index).
    for(i=0; i<num_components; i++) {
        huff_ac_table |= ((slice_param->components[i].ac_table_selector)<<i);
        huff_dc_table |= ((slice_param->components[i].dc_table_selector)<<i);
    }
    
    
    BEGIN_BCS_BATCH(batch, 3);
    /* DWORD 0 */
    OUT_BCS_BATCH(batch, MFC_JPEG_SCAN_OBJECT | (3 - 2)); 
    /* DWORD 1 */
    OUT_BCS_BATCH(batch, mcu_count << 0);       //MCU Count
    /* DWORD 2 */
    OUT_BCS_BATCH(batch,
                  (huff_ac_table << 22)     |   //Huffman AC Table
                  (huff_dc_table << 18)     |   //Huffman DC Table
                  (head_present_flag << 17) |   //Head present flag
                  (is_last_scan << 16)      |   //Is last scan
                  (restart_interval << 0));     //Restart Interval
    ADVANCE_BCS_BATCH(batch);
}
3049
//Emit an MFX_PAK_INSERT_OBJECT command carrying raw header bytes into the
//output bitstream.
//  insert_data:          payload, length_in_dws DWords
//  data_bits_in_last_dw: valid bits in the final DWord (0 means a full 32)
//  is_last_header:       sets the LastHeaderFlag bit in DWord 1
//  is_end_of_slice:      sets the EndOfSliceFlag bit in DWord 1
static void
gen8_mfc_jpeg_pak_insert_object(struct intel_encoder_context *encoder_context, unsigned int *insert_data, 
                                int length_in_dws, int data_bits_in_last_dw, int is_last_header, 
                                int is_end_of_slice)
{
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    assert(batch);
    
    //A value of 0 means the last DWord is fully used.
    if (data_bits_in_last_dw == 0)
        data_bits_in_last_dw = 32;

    BEGIN_BCS_BATCH(batch, length_in_dws + 2);

    OUT_BCS_BATCH(batch, MFX_INSERT_OBJECT | (length_in_dws + 2 - 2));
    //DWord 1
    OUT_BCS_BATCH(batch,
                  (0 << 16) |                    //DataByteOffset 0 for JPEG Encoder
                  (0 << 15) |                    //HeaderLengthExcludeFrmSize 0 for JPEG Encoder
                  (data_bits_in_last_dw << 8) |  //DataBitsInLastDW
                  (0 << 4) |                     //SkipEmulByteCount 0 for JPEG Encoder
                  (0 << 3) |                     //EmulationFlag 0 for JPEG Encoder
                  ((!!is_last_header) << 2) |    //LastHeaderFlag
                  ((!!is_end_of_slice) << 1) |   //EndOfSliceFlag
                  (1 << 0));                     //BitstreamStartReset 1 for JPEG Encoder
    //Data Payload
    intel_batchbuffer_data(batch, insert_data, length_in_dws*4);

    ADVANCE_BCS_BATCH(batch);
}
3079
3080
3081 //send the jpeg headers to HW using MFX_PAK_INSERT_OBJECT
3082 static void
3083 gen8_mfc_jpeg_add_headers(VADriverContextP ctx,
3084                                            struct encode_state *encode_state,
3085                                            struct intel_encoder_context *encoder_context)
3086 {
3087     if (encode_state->packed_header_data_ext) {
3088         VAEncPackedHeaderParameterBuffer *param = NULL;
3089         unsigned int *header_data = (unsigned int *)(*encode_state->packed_header_data_ext)->buffer;
3090         unsigned int length_in_bits;
3091
3092         param = (VAEncPackedHeaderParameterBuffer *)(*encode_state->packed_header_params_ext)->buffer;
3093         length_in_bits = param->bit_length;
3094
3095         gen8_mfc_jpeg_pak_insert_object(encoder_context, 
3096                                         header_data, 
3097                                         ALIGN(length_in_bits, 32) >> 5,
3098                                         length_in_bits & 0x1f,
3099                                         1,
3100                                         1);
3101     }
3102 }
3103
3104 //Initialize the buffered_qmatrix with the default qmatrix in the driver.
3105 //If the app sends the qmatrix, this will be replaced with the one app sends.
3106 static void 
3107 jpeg_init_default_qmatrix(VADriverContextP ctx, struct intel_encoder_context *encoder_context)
3108 {
3109     int i=0;
3110     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
3111  
3112     //Load the the QM in zigzag order. If app sends QM, it is always in zigzag order.
3113     for(i=0; i<64; i++)
3114        mfc_context->buffered_qmatrix.lum_quantiser_matrix[i] = jpeg_luma_quant[zigzag_direct[i]];
3115
3116     for(i=0; i<64; i++)
3117         mfc_context->buffered_qmatrix.chroma_quantiser_matrix[i] = jpeg_chroma_quant[zigzag_direct[i]];
3118 }    
3119  
3120 /* This is at the picture level */
3121 static void
3122 gen8_mfc_jpeg_pipeline_picture_programing(VADriverContextP ctx,
3123                                            struct encode_state *encode_state,
3124                                            struct intel_encoder_context *encoder_context)
3125 {
3126     int i, j, component, max_selector = 0;
3127     VAEncSliceParameterBufferJPEG *slice_param;
3128     
3129     gen8_mfc_pipe_mode_select(ctx, MFX_FORMAT_JPEG, encoder_context);
3130     gen8_mfc_jpeg_set_surface_state(ctx, encoder_context, encode_state);
3131     gen8_mfc_pipe_buf_addr_state(ctx, encoder_context);
3132     gen8_mfc_ind_obj_base_addr_state(ctx, encoder_context);
3133     gen8_mfc_bsp_buf_base_addr_state(ctx, encoder_context);
3134     gen8_mfc_jpeg_pic_state(ctx, encoder_context, encode_state);
3135     
3136     //do the slice level encoding here
3137     gen8_mfc_jpeg_fqm_state(ctx, encoder_context, encode_state);
3138
3139     //I dont think I need this for loop. Just to be consistent with other encoding logic...
3140     for(i = 0; i < encode_state->num_slice_params_ext; i++) {
3141         assert(encode_state->slice_params && encode_state->slice_params_ext[i]->buffer);
3142         slice_param = (VAEncSliceParameterBufferJPEG *)encode_state->slice_params_ext[i]->buffer;
3143         
3144         for(j = 0; j < encode_state->slice_params_ext[i]->num_elements; j++) {
3145             
3146             for(component = 0; component < slice_param->num_components; component++) {
3147                 if(max_selector < slice_param->components[component].dc_table_selector)
3148                     max_selector = slice_param->components[component].dc_table_selector;
3149                 
3150                 if (max_selector < slice_param->components[component].ac_table_selector)
3151                     max_selector = slice_param->components[component].ac_table_selector;
3152             }
3153             
3154             slice_param++;
3155         }
3156     }    
3157
3158     assert(max_selector < 2);
3159     //send the huffman table using MFC_JPEG_HUFF_TABLE
3160     gen8_mfc_jpeg_huff_table_state(ctx, encode_state, encoder_context, max_selector+1);
3161     //set MFC_JPEG_SCAN_OBJECT
3162     gen8_mfc_jpeg_scan_object(ctx, encode_state, encoder_context);
3163     //add headers using MFX_PAK_INSERT_OBJECT (it is refered as MFX_INSERT_OBJECT in this driver code)
3164     gen8_mfc_jpeg_add_headers(ctx, encode_state, encoder_context);
3165        
3166 }
3167
//Top-level JPEG batch programming: wraps the picture-level programming in
//an atomic BCS batch section with a leading MI_FLUSH.
static void
gen8_mfc_jpeg_pipeline_programing(VADriverContextP ctx,
                                   struct encode_state *encode_state,
                                   struct intel_encoder_context *encoder_context)
{
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    
    // begin programing (0x4000 is the reserved batch size — same value the
    // other gen8 pipelines pass; presumably bytes)
    intel_batchbuffer_start_atomic_bcs(batch, 0x4000); 
    intel_batchbuffer_emit_mi_flush(batch);
    
    // picture level programing
    gen8_mfc_jpeg_pipeline_picture_programing(ctx, encode_state, encoder_context);

    // end programing
    intel_batchbuffer_end_atomic(batch);

}
3186
3187
//Entry point for encoding one JPEG picture: initialize MFC state, prepare
//the JPEG-specific buffers, program the BCS pipeline, then kick the batch.
//Always returns VA_STATUS_SUCCESS; failures surface via asserts upstream.
static VAStatus
gen8_mfc_jpeg_encode_picture(VADriverContextP ctx, 
                              struct encode_state *encode_state,
                              struct intel_encoder_context *encoder_context)
{
    gen8_mfc_init(ctx, encode_state, encoder_context);
    intel_mfc_jpeg_prepare(ctx, encode_state, encoder_context);
    /*Programing bcs pipeline*/
    gen8_mfc_jpeg_pipeline_programing(ctx, encode_state, encoder_context);
    gen8_mfc_run(ctx, encode_state, encoder_context);

    return VA_STATUS_SUCCESS;
}
3201
3202 static int gen8_mfc_vp8_qindex_estimate(struct encode_state *encode_state,
3203                                         struct gen6_mfc_context *mfc_context,
3204                                         int target_frame_size,
3205                                         int is_key_frame)
3206 {
3207     VAEncSequenceParameterBufferVP8 *seq_param = (VAEncSequenceParameterBufferVP8 *)encode_state->seq_param_ext->buffer;
3208     VAEncPictureParameterBufferVP8 *pic_param = (VAEncPictureParameterBufferVP8 *)encode_state->pic_param_ext->buffer;
3209     unsigned int max_qindex = pic_param->clamp_qindex_high;
3210     unsigned int min_qindex = pic_param->clamp_qindex_low;
3211     int width_in_mbs = ALIGN(seq_param->frame_width, 16) / 16;
3212     int height_in_mbs = ALIGN(seq_param->frame_height, 16) / 16;
3213     int target_mb_size;
3214     int last_size_gap  = -1;
3215     int per_mb_size_at_qindex;
3216     int target_qindex = min_qindex, i;
3217
3218     /* make sure would not overflow*/
3219     if (target_frame_size >= (0x7fffffff >> 9))
3220         target_mb_size = (target_frame_size / width_in_mbs / height_in_mbs) << 9;
3221     else
3222         target_mb_size = (target_frame_size << 9) / width_in_mbs / height_in_mbs;
3223
3224     for (i = min_qindex; i <= max_qindex; i++) {
3225         per_mb_size_at_qindex = vp8_bits_per_mb[!is_key_frame][i];
3226         target_qindex = i;
3227         if (per_mb_size_at_qindex <= target_mb_size) {
3228             if (target_mb_size - per_mb_size_at_qindex < last_size_gap)
3229                 target_qindex--;
3230             break;
3231         }
3232         else
3233             last_size_gap = per_mb_size_at_qindex - target_mb_size;
3234     }
3235
3236     return target_qindex;
3237 }
3238
3239 static void
3240 gen8_mfc_vp8_bit_rate_control_context_init(struct encode_state *encode_state,
3241                                         struct gen6_mfc_context *mfc_context)
3242 {
3243     VAEncSequenceParameterBufferVP8 *seq_param = (VAEncSequenceParameterBufferVP8 *)encode_state->seq_param_ext->buffer;
3244     VAEncMiscParameterBuffer *misc_param_frame_rate_buffer = (VAEncMiscParameterBuffer*)encode_state->misc_param[VAEncMiscParameterTypeFrameRate]->buffer;
3245     VAEncMiscParameterFrameRate* param_frame_rate = (VAEncMiscParameterFrameRate*)misc_param_frame_rate_buffer->data;
3246     int width_in_mbs = ALIGN(seq_param->frame_width, 16) / 16;
3247     int height_in_mbs = ALIGN(seq_param->frame_height, 16) / 16;
3248     float fps = param_frame_rate->framerate;
3249     int inter_mb_size = seq_param->bits_per_second * 1.0 / (fps+4.0) / width_in_mbs / height_in_mbs;
3250     int intra_mb_size = inter_mb_size * 5.0;
3251
3252     mfc_context->bit_rate_control_context[SLICE_TYPE_I].target_mb_size = intra_mb_size;
3253     mfc_context->bit_rate_control_context[SLICE_TYPE_I].target_frame_size = intra_mb_size * width_in_mbs * height_in_mbs;
3254     mfc_context->bit_rate_control_context[SLICE_TYPE_P].target_mb_size = inter_mb_size;
3255     mfc_context->bit_rate_control_context[SLICE_TYPE_P].target_frame_size = inter_mb_size * width_in_mbs * height_in_mbs;
3256
3257     mfc_context->bit_rate_control_context[SLICE_TYPE_I].TargetSizeInWord = (intra_mb_size + 16)/ 16;
3258     mfc_context->bit_rate_control_context[SLICE_TYPE_P].TargetSizeInWord = (inter_mb_size + 16)/ 16;
3259
3260     mfc_context->bit_rate_control_context[SLICE_TYPE_I].MaxSizeInWord = mfc_context->bit_rate_control_context[SLICE_TYPE_I].TargetSizeInWord * 1.5;
3261     mfc_context->bit_rate_control_context[SLICE_TYPE_P].MaxSizeInWord = mfc_context->bit_rate_control_context[SLICE_TYPE_P].TargetSizeInWord * 1.5;
3262 }
3263
3264 static void gen8_mfc_vp8_brc_init(struct encode_state *encode_state,
3265                                struct intel_encoder_context* encoder_context)
3266 {
3267     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
3268     VAEncSequenceParameterBufferVP8 *seq_param = (VAEncSequenceParameterBufferVP8 *)encode_state->seq_param_ext->buffer;
3269     VAEncMiscParameterBuffer* misc_param_hrd = (VAEncMiscParameterBuffer*)encode_state->misc_param[VAEncMiscParameterTypeHRD]->buffer;
3270     VAEncMiscParameterHRD* param_hrd = (VAEncMiscParameterHRD*)misc_param_hrd->data;
3271     VAEncMiscParameterBuffer* misc_param_frame_rate_buffer = (VAEncMiscParameterBuffer*)encode_state->misc_param[VAEncMiscParameterTypeFrameRate]->buffer;
3272     VAEncMiscParameterFrameRate* param_frame_rate = (VAEncMiscParameterFrameRate*)misc_param_frame_rate_buffer->data;
3273     double bitrate = seq_param->bits_per_second;
3274     unsigned int frame_rate = param_frame_rate->framerate;
3275     int inum = 1, pnum = 0;
3276     int intra_period = seq_param->intra_period;
3277     int width_in_mbs = ALIGN(seq_param->frame_width, 16) / 16;
3278     int height_in_mbs = ALIGN(seq_param->frame_height, 16) / 16;
3279     int max_frame_size =  (vp8_bits_per_mb[0][0] >> 9) * width_in_mbs * height_in_mbs;/* vp8_bits_per_mb table mutilpled 512 */
3280
3281     pnum = intra_period  - 1;
3282
3283     mfc_context->brc.mode = encoder_context->rate_control_mode;
3284
3285     mfc_context->brc.target_frame_size[SLICE_TYPE_I] = (int)((double)((bitrate * intra_period)/frame_rate) /
3286                                                              (double)(inum + BRC_PWEIGHT * pnum ));
3287     mfc_context->brc.target_frame_size[SLICE_TYPE_P] = BRC_PWEIGHT * mfc_context->brc.target_frame_size[SLICE_TYPE_I];
3288
3289     mfc_context->brc.gop_nums[SLICE_TYPE_I] = inum;
3290     mfc_context->brc.gop_nums[SLICE_TYPE_P] = pnum;
3291
3292     mfc_context->brc.bits_per_frame = bitrate/frame_rate;
3293
3294     mfc_context->bit_rate_control_context[SLICE_TYPE_I].QpPrimeY = gen8_mfc_vp8_qindex_estimate(encode_state,
3295                                                                    mfc_context,
3296                                                                    mfc_context->brc.target_frame_size[SLICE_TYPE_I],
3297                                                                    1);
3298     mfc_context->bit_rate_control_context[SLICE_TYPE_P].QpPrimeY = gen8_mfc_vp8_qindex_estimate(encode_state,
3299                                                                    mfc_context,
3300                                                                    mfc_context->brc.target_frame_size[SLICE_TYPE_P],
3301                                                                    0);
3302
3303     mfc_context->hrd.buffer_size = (double)param_hrd->buffer_size;
3304     mfc_context->hrd.current_buffer_fullness =
3305         (double)(param_hrd->initial_buffer_fullness < mfc_context->hrd.buffer_size)?
3306         param_hrd->initial_buffer_fullness: mfc_context->hrd.buffer_size/2.;
3307     mfc_context->hrd.target_buffer_fullness = (double)mfc_context->hrd.buffer_size/2.;
3308     mfc_context->hrd.buffer_capacity = (double)mfc_context->hrd.buffer_size/max_frame_size;
3309     mfc_context->hrd.violation_noted = 0;
3310 }
3311
//Per-frame BRC update: given the actual encoded size (frame_bits), predict
//the quantizer index for the next frame of the same type, smooth it, clamp
//it to the picture's qindex clamps, and correct it for HRD buffer over- or
//underflow. Returns a gen6_brc_status code describing any HRD violation.
static int gen8_mfc_vp8_brc_postpack(struct encode_state *encode_state,
                           struct gen6_mfc_context *mfc_context,
                           int frame_bits)
{
    gen6_brc_status sts = BRC_NO_HRD_VIOLATION;
    VAEncPictureParameterBufferVP8 *pic_param = (VAEncPictureParameterBufferVP8 *)encode_state->pic_param_ext->buffer;
    int is_key_frame = !pic_param->pic_flags.bits.frame_type;
    int slicetype = (is_key_frame ? SLICE_TYPE_I : SLICE_TYPE_P);
    int qpi = mfc_context->bit_rate_control_context[SLICE_TYPE_I].QpPrimeY;
    int qpp = mfc_context->bit_rate_control_context[SLICE_TYPE_P].QpPrimeY;
    int qp; // quantizer of previously encoded slice of current type
    int qpn; // predicted quantizer for next frame of current type in integer format
    double qpf; // predicted quantizer for next frame of current type in float format
    double delta_qp; // QP correction
    int target_frame_size, frame_size_next;
    /* Notes:
     *  x - how far we are from HRD buffer borders
     *  y - how far we are from target HRD buffer fullness
     */
    double x, y;
    double frame_size_alpha;
    unsigned int max_qindex = pic_param->clamp_qindex_high;
    unsigned int min_qindex = pic_param->clamp_qindex_low;

    qp = mfc_context->bit_rate_control_context[slicetype].QpPrimeY;

    // Blend the miss (target - actual) into the next frame's size target;
    // alpha grows with GOP length so long GOPs converge more gently.
    target_frame_size = mfc_context->brc.target_frame_size[slicetype];
    if (mfc_context->hrd.buffer_capacity < 5)
        frame_size_alpha = 0;
    else
        frame_size_alpha = (double)mfc_context->brc.gop_nums[slicetype];
    if (frame_size_alpha > 30) frame_size_alpha = 30;
    frame_size_next = target_frame_size + (double)(target_frame_size - frame_bits) /
        (double)(frame_size_alpha + 1.);

    /* frame_size_next: avoiding negative number and too small value */
    if ((double)frame_size_next < (double)(target_frame_size * 0.25))
        frame_size_next = (int)((double)target_frame_size * 0.25);

    // Assume bits scale roughly linearly with QP for a small step.
    qpf = (double)qp * target_frame_size / frame_size_next;
    qpn = (int)(qpf + 0.5);

    if (qpn == qp) {
        /* setting qpn we round qpf making mistakes: now we are trying to compensate this */
        mfc_context->brc.qpf_rounding_accumulator += qpf - qpn;
        if (mfc_context->brc.qpf_rounding_accumulator > 1.0) {
            qpn++;
            mfc_context->brc.qpf_rounding_accumulator = 0.;
        } else if (mfc_context->brc.qpf_rounding_accumulator < -1.0) {
            qpn--;
            mfc_context->brc.qpf_rounding_accumulator = 0.;
        }
    }

    /* making sure that QP is not changing too fast */
    if ((qpn - qp) > BRC_QP_MAX_CHANGE) qpn = qp + BRC_QP_MAX_CHANGE;
    else if ((qpn - qp) < -BRC_QP_MAX_CHANGE) qpn = qp - BRC_QP_MAX_CHANGE;
    /* making sure that with QP predictions we do not leave QPs range */
    BRC_CLIP(qpn, min_qindex, max_qindex);

    /* checking whether HRD compliance is still met */
    sts = intel_mfc_update_hrd(encode_state, mfc_context, frame_bits);

    /* calculating QP delta as some function of buffer fullness:
     * x in [-1,1] is the normalized distance from the target fullness,
     * y is the remaining headroom toward the nearer buffer border */
    x = mfc_context->hrd.target_buffer_fullness - mfc_context->hrd.current_buffer_fullness;
    if (x > 0) {
        x /= mfc_context->hrd.target_buffer_fullness;
        y = mfc_context->hrd.current_buffer_fullness;
    }
    else {
        x /= (mfc_context->hrd.buffer_size - mfc_context->hrd.target_buffer_fullness);
        y = mfc_context->hrd.buffer_size - mfc_context->hrd.current_buffer_fullness;
    }
    if (y < 0.01) y = 0.01;
    if (x > 1) x = 1;
    else if (x < -1) x = -1;

    // exp(-1/y) damps the correction when headroom is large;
    // sin(pi/2 * x) gives a smooth, sign-preserving push toward the target.
    delta_qp = BRC_QP_MAX_CHANGE*exp(-1/y)*sin(BRC_PI_0_5 * x);
    qpn = (int)(qpn + delta_qp + 0.5);

    /* making sure that with QP predictions we do not leave QPs range */
    BRC_CLIP(qpn, min_qindex, max_qindex);

    if (sts == BRC_NO_HRD_VIOLATION) { // no HRD violation
        /* correcting QPs of slices of other types */
        if (!is_key_frame) {
            if (abs(qpn - BRC_I_P_QP_DIFF - qpi) > 4)
                mfc_context->bit_rate_control_context[SLICE_TYPE_I].QpPrimeY += (qpn - BRC_I_P_QP_DIFF - qpi) >> 2;
        } else {
            if (abs(qpn + BRC_I_P_QP_DIFF - qpp) > 4)
                mfc_context->bit_rate_control_context[SLICE_TYPE_P].QpPrimeY += (qpn + BRC_I_P_QP_DIFF - qpp) >> 2;
        }
        BRC_CLIP(mfc_context->bit_rate_control_context[SLICE_TYPE_I].QpPrimeY, min_qindex, max_qindex);
        BRC_CLIP(mfc_context->bit_rate_control_context[SLICE_TYPE_P].QpPrimeY, min_qindex, max_qindex);
    } else if (sts == BRC_UNDERFLOW) { // underflow
        if (qpn <= qp) qpn = qp + 2;
        if (qpn > max_qindex) {
            qpn = max_qindex;
            sts = BRC_UNDERFLOW_WITH_MAX_QP; //underflow with maxQP
        }
    } else if (sts == BRC_OVERFLOW) {
        if (qpn >= qp) qpn = qp - 2;
        if (qpn < min_qindex) { // < 0 (?) overflow with minQP
            qpn = min_qindex;
            sts = BRC_OVERFLOW_WITH_MIN_QP; // bit stuffing to be done
        }
    }

    mfc_context->bit_rate_control_context[slicetype].QpPrimeY = qpn;

    return sts;
}
3424
3425 static void gen8_mfc_vp8_hrd_context_init(struct encode_state *encode_state,
3426                                        struct intel_encoder_context *encoder_context)
3427 {
3428     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
3429     VAEncSequenceParameterBufferVP8 *seq_param = (VAEncSequenceParameterBufferVP8 *)encode_state->seq_param_ext->buffer;
3430     unsigned int rate_control_mode = encoder_context->rate_control_mode;
3431     int target_bit_rate = seq_param->bits_per_second;
3432
3433     // current we only support CBR mode.
3434     if (rate_control_mode == VA_RC_CBR) {
3435         mfc_context->vui_hrd.i_bit_rate_value = target_bit_rate >> 10;
3436         mfc_context->vui_hrd.i_cpb_size_value = (target_bit_rate * 8) >> 10;
3437         mfc_context->vui_hrd.i_initial_cpb_removal_delay = mfc_context->vui_hrd.i_cpb_size_value * 0.5 * 1024 / target_bit_rate * 90000;
3438         mfc_context->vui_hrd.i_cpb_removal_delay = 2;
3439         mfc_context->vui_hrd.i_frame_number = 0;
3440
3441         mfc_context->vui_hrd.i_initial_cpb_removal_delay_length = 24;
3442         mfc_context->vui_hrd.i_cpb_removal_delay_length = 24;
3443         mfc_context->vui_hrd.i_dpb_output_delay_length = 24;
3444     }
3445
3446 }
3447
//Advance the HRD frame counter after each encoded frame.
static void gen8_mfc_vp8_hrd_context_update(struct encode_state *encode_state,
                             struct gen6_mfc_context *mfc_context)
{
    mfc_context->vui_hrd.i_frame_number++;
}
3453
3454 /*
3455  * Check whether the parameters related with CBR are updated and decide whether
3456  * it needs to reinitialize the configuration related with CBR.
3457  * Currently it will check the following parameters:
3458  *      bits_per_second
3459  *      frame_rate
3460  *      gop_configuration(intra_period, ip_period, intra_idr_period)
3461  */
3462 static bool gen8_mfc_vp8_brc_updated_check(struct encode_state *encode_state,
3463                            struct intel_encoder_context *encoder_context)
3464 {
3465     unsigned int rate_control_mode = encoder_context->rate_control_mode;
3466     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
3467     double cur_fps, cur_bitrate;
3468     VAEncSequenceParameterBufferVP8 *seq_param = (VAEncSequenceParameterBufferVP8 *)encode_state->seq_param_ext->buffer;
3469     VAEncMiscParameterBuffer *misc_param_frame_rate_buf = (VAEncMiscParameterBuffer*)encode_state->misc_param[VAEncMiscParameterTypeFrameRate]->buffer;
3470     VAEncMiscParameterFrameRate *param_frame_rate = (VAEncMiscParameterFrameRate*)misc_param_frame_rate_buf->data;
3471     unsigned int frame_rate = param_frame_rate->framerate;
3472
3473     if (rate_control_mode != VA_RC_CBR) {
3474         return false;
3475     }
3476
3477     cur_bitrate = seq_param->bits_per_second;
3478     cur_fps = frame_rate;
3479
3480     if ((cur_bitrate == mfc_context->brc.saved_bps) &&
3481         (cur_fps == mfc_context->brc.saved_fps) &&
3482         (seq_param->intra_period == mfc_context->brc.saved_intra_period)) {
3483         /* the parameters related with CBR are not updaetd */
3484         return false;
3485     }
3486
3487     mfc_context->brc.saved_intra_period = seq_param->intra_period;
3488     mfc_context->brc.saved_fps = cur_fps;
3489     mfc_context->brc.saved_bps = cur_bitrate;
3490     return true;
3491 }
3492
3493 static void gen8_mfc_vp8_brc_prepare(struct encode_state *encode_state,
3494                            struct intel_encoder_context *encoder_context)
3495 {
3496     unsigned int rate_control_mode = encoder_context->rate_control_mode;
3497     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
3498
3499     if (rate_control_mode == VA_RC_CBR) {
3500         bool brc_updated;
3501         assert(encoder_context->codec != CODEC_MPEG2);
3502
3503         brc_updated = gen8_mfc_vp8_brc_updated_check(encode_state, encoder_context);
3504
3505         /*Programing bit rate control */
3506         if ((mfc_context->bit_rate_control_context[SLICE_TYPE_I].MaxSizeInWord == 0) ||
3507              brc_updated) {
3508             gen8_mfc_vp8_bit_rate_control_context_init(encode_state, mfc_context);
3509             gen8_mfc_vp8_brc_init(encode_state, encoder_context);
3510         }
3511
3512         /*Programing HRD control */
3513         if ((mfc_context->vui_hrd.i_cpb_size_value == 0) || brc_updated )
3514             gen8_mfc_vp8_hrd_context_init(encode_state, encoder_context);
3515     }
3516 }
3517
3518 static void vp8_enc_state_init(struct gen6_mfc_context *mfc_context,
3519                                VAEncPictureParameterBufferVP8 *pic_param,
3520                                VAQMatrixBufferVP8 *q_matrix)
3521 {
3522
3523     int is_key_frame = !pic_param->pic_flags.bits.frame_type;
3524     unsigned char *coeff_probs_stream_in_buffer;
3525     
3526     mfc_context->vp8_state.frame_header_lf_update_pos = 0;
3527     mfc_context->vp8_state.frame_header_qindex_update_pos = 0;
3528     mfc_context->vp8_state.frame_header_token_update_pos = 0;
3529     mfc_context->vp8_state.frame_header_bin_mv_upate_pos = 0;
3530
3531     mfc_context->vp8_state.prob_skip_false = 255;
3532     memset(mfc_context->vp8_state.mb_segment_tree_probs, 0, sizeof(mfc_context->vp8_state.mb_segment_tree_probs));
3533     memcpy(mfc_context->vp8_state.mv_probs, vp8_default_mv_context, sizeof(mfc_context->vp8_state.mv_probs));
3534     
3535     if (is_key_frame) {
3536         memcpy(mfc_context->vp8_state.y_mode_probs, vp8_kf_ymode_prob, sizeof(mfc_context->vp8_state.y_mode_probs));
3537         memcpy(mfc_context->vp8_state.uv_mode_probs, vp8_kf_uv_mode_prob, sizeof(mfc_context->vp8_state.uv_mode_probs));
3538
3539         mfc_context->vp8_state.prob_intra = 255;
3540         mfc_context->vp8_state.prob_last = 128;
3541         mfc_context->vp8_state.prob_gf = 128;
3542     } else {
3543         memcpy(mfc_context->vp8_state.y_mode_probs, vp8_ymode_prob, sizeof(mfc_context->vp8_state.y_mode_probs));
3544         memcpy(mfc_context->vp8_state.uv_mode_probs, vp8_uv_mode_prob, sizeof(mfc_context->vp8_state.uv_mode_probs));
3545
3546         mfc_context->vp8_state.prob_intra = 63;
3547         mfc_context->vp8_state.prob_last = 128;
3548         mfc_context->vp8_state.prob_gf = 128;
3549     }
3550     
3551     mfc_context->vp8_state.prob_skip_false = vp8_base_skip_false_prob[q_matrix->quantization_index[0]];
3552   
3553     dri_bo_map(mfc_context->vp8_state.coeff_probs_stream_in_bo, 1);
3554     coeff_probs_stream_in_buffer = (unsigned char *)mfc_context->vp8_state.coeff_probs_stream_in_bo->virtual;
3555     assert(coeff_probs_stream_in_buffer);
3556     memcpy(coeff_probs_stream_in_buffer, vp8_default_coef_probs, sizeof(vp8_default_coef_probs));
3557     dri_bo_unmap(mfc_context->vp8_state.coeff_probs_stream_in_bo);
3558 }
3559
/*
 * Placeholder for inter-frame probability adaptation.  Currently every
 * frame restarts from the defaults set in vp8_enc_state_init(), so no
 * state is updated here yet; both parameters are intentionally unused.
 */
static void vp8_enc_state_update(struct gen6_mfc_context *mfc_context,
                                 VAQMatrixBufferVP8 *q_matrix)
{

    /*some other probabilities need to be updated*/
}
3566
3567 extern void binarize_vp8_frame_header(VAEncSequenceParameterBufferVP8 *seq_param,
3568                            VAEncPictureParameterBufferVP8 *pic_param,
3569                            VAQMatrixBufferVP8 *q_matrix,
3570                            struct gen6_mfc_context *mfc_context,
3571                            struct intel_encoder_context *encoder_context);
3572
3573 static void vp8_enc_frame_header_binarize(struct encode_state *encode_state,
3574                                           struct intel_encoder_context *encoder_context,
3575                                           struct gen6_mfc_context *mfc_context)
3576 {
3577     VAEncSequenceParameterBufferVP8 *seq_param = (VAEncSequenceParameterBufferVP8 *)encode_state->seq_param_ext->buffer;
3578     VAEncPictureParameterBufferVP8 *pic_param = (VAEncPictureParameterBufferVP8 *)encode_state->pic_param_ext->buffer;
3579     VAQMatrixBufferVP8 *q_matrix = (VAQMatrixBufferVP8 *)encode_state->q_matrix->buffer;
3580     unsigned char *frame_header_buffer;
3581
3582     binarize_vp8_frame_header(seq_param, pic_param, q_matrix, mfc_context, encoder_context);
3583  
3584     dri_bo_map(mfc_context->vp8_state.frame_header_bo, 1);
3585     frame_header_buffer = (unsigned char *)mfc_context->vp8_state.frame_header_bo->virtual;
3586     assert(frame_header_buffer);
3587     memcpy(frame_header_buffer, mfc_context->vp8_state.vp8_frame_header, (mfc_context->vp8_state.frame_header_bit_count + 7) / 8);
3588     dri_bo_unmap(mfc_context->vp8_state.frame_header_bo);
3589 }
3590
3591 #define MAX_VP8_FRAME_HEADER_SIZE              0x2000
3592 #define VP8_TOKEN_STATISTICS_BUFFER_SIZE       0x2000
3593
3594 static void gen8_mfc_vp8_init(VADriverContextP ctx,
3595                           struct encode_state *encode_state,
3596                           struct intel_encoder_context *encoder_context)
3597 {
3598     struct i965_driver_data *i965 = i965_driver_data(ctx);
3599     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
3600     dri_bo *bo;
3601     int i;
3602     int width_in_mbs = 0;
3603     int height_in_mbs = 0;
3604     int slice_batchbuffer_size;
3605     int is_key_frame, slice_type, rate_control_mode;
3606
3607     VAEncSequenceParameterBufferVP8 *pSequenceParameter = (VAEncSequenceParameterBufferVP8 *)encode_state->seq_param_ext->buffer;
3608     VAEncPictureParameterBufferVP8 *pic_param = (VAEncPictureParameterBufferVP8 *)encode_state->pic_param_ext->buffer;
3609     VAQMatrixBufferVP8 *q_matrix = (VAQMatrixBufferVP8 *)encode_state->q_matrix->buffer;
3610
3611     width_in_mbs = ALIGN(pSequenceParameter->frame_height, 16) / 16;
3612     height_in_mbs = ALIGN(pSequenceParameter->frame_height, 16) / 16;
3613
3614     is_key_frame = !pic_param->pic_flags.bits.frame_type;
3615     slice_type = (is_key_frame ? SLICE_TYPE_I : SLICE_TYPE_P);
3616     rate_control_mode = encoder_context->rate_control_mode;
3617
3618     if (rate_control_mode == VA_RC_CBR) {
3619         q_matrix->quantization_index[0] = mfc_context->bit_rate_control_context[slice_type].QpPrimeY;
3620         for (i = 1; i < 4; i++)
3621             q_matrix->quantization_index[i] = q_matrix->quantization_index[0];
3622         for (i = 0; i < 5; i++)
3623             q_matrix->quantization_index_delta[i] = 0;
3624     }
3625
3626     slice_batchbuffer_size = 64 * width_in_mbs * height_in_mbs + 4096 +
3627         (SLICE_HEADER + SLICE_TAIL);
3628
3629     /*Encode common setup for MFC*/
3630     dri_bo_unreference(mfc_context->post_deblocking_output.bo);
3631     mfc_context->post_deblocking_output.bo = NULL;
3632
3633     dri_bo_unreference(mfc_context->pre_deblocking_output.bo);
3634     mfc_context->pre_deblocking_output.bo = NULL;
3635
3636     dri_bo_unreference(mfc_context->uncompressed_picture_source.bo);
3637     mfc_context->uncompressed_picture_source.bo = NULL;
3638
3639     dri_bo_unreference(mfc_context->mfc_indirect_pak_bse_object.bo);
3640     mfc_context->mfc_indirect_pak_bse_object.bo = NULL;
3641
3642     for (i = 0; i < NUM_MFC_DMV_BUFFERS; i++){
3643         if ( mfc_context->direct_mv_buffers[i].bo != NULL)
3644             dri_bo_unreference(mfc_context->direct_mv_buffers[i].bo);
3645         mfc_context->direct_mv_buffers[i].bo = NULL;
3646     }
3647
3648     for (i = 0; i < MAX_MFC_REFERENCE_SURFACES; i++){
3649         if (mfc_context->reference_surfaces[i].bo != NULL)
3650             dri_bo_unreference(mfc_context->reference_surfaces[i].bo);
3651         mfc_context->reference_surfaces[i].bo = NULL;
3652     }
3653
3654     dri_bo_unreference(mfc_context->intra_row_store_scratch_buffer.bo);
3655     bo = dri_bo_alloc(i965->intel.bufmgr,
3656                       "Buffer",
3657                       width_in_mbs * 64 * 16,
3658                       64);
3659     assert(bo);
3660     mfc_context->intra_row_store_scratch_buffer.bo = bo;
3661
3662     dri_bo_unreference(mfc_context->macroblock_status_buffer.bo);
3663     bo = dri_bo_alloc(i965->intel.bufmgr,
3664                       "Buffer",
3665                       width_in_mbs * height_in_mbs * 16,
3666                       64);
3667     assert(bo);
3668     mfc_context->macroblock_status_buffer.bo = bo;
3669
3670     dri_bo_unreference(mfc_context->deblocking_filter_row_store_scratch_buffer.bo);
3671     bo = dri_bo_alloc(i965->intel.bufmgr,
3672                       "Buffer",
3673                       16 * width_in_mbs * 64,  /* 16 * width_in_mbs * 64 */
3674                       64);
3675     assert(bo);
3676     mfc_context->deblocking_filter_row_store_scratch_buffer.bo = bo;
3677
3678     dri_bo_unreference(mfc_context->bsd_mpc_row_store_scratch_buffer.bo);
3679     bo = dri_bo_alloc(i965->intel.bufmgr,
3680                       "Buffer",
3681                       16 * width_in_mbs * 64, /* 16 * width_in_mbs * 64 */
3682                       0x1000);
3683     assert(bo);
3684     mfc_context->bsd_mpc_row_store_scratch_buffer.bo = bo;
3685
3686     dri_bo_unreference(mfc_context->mfc_batchbuffer_surface.bo);
3687     mfc_context->mfc_batchbuffer_surface.bo = NULL;
3688
3689     dri_bo_unreference(mfc_context->aux_batchbuffer_surface.bo);
3690     mfc_context->aux_batchbuffer_surface.bo = NULL;
3691
3692     if (mfc_context->aux_batchbuffer) {
3693         intel_batchbuffer_free(mfc_context->aux_batchbuffer);
3694         mfc_context->aux_batchbuffer = NULL;
3695     }
3696
3697     mfc_context->aux_batchbuffer = intel_batchbuffer_new(&i965->intel, I915_EXEC_BSD, slice_batchbuffer_size);
3698     mfc_context->aux_batchbuffer_surface.bo = mfc_context->aux_batchbuffer->buffer;
3699     dri_bo_reference(mfc_context->aux_batchbuffer_surface.bo);
3700     mfc_context->aux_batchbuffer_surface.pitch = 16;
3701     mfc_context->aux_batchbuffer_surface.num_blocks = mfc_context->aux_batchbuffer->size / 16;
3702     mfc_context->aux_batchbuffer_surface.size_block = 16;
3703
3704     gen8_gpe_context_init(ctx, &mfc_context->gpe_context);
3705
3706     /* alloc vp8 encoding buffers*/
3707     dri_bo_unreference(mfc_context->vp8_state.frame_header_bo);
3708     bo = dri_bo_alloc(i965->intel.bufmgr,
3709                       "Buffer",
3710                       MAX_VP8_FRAME_HEADER_SIZE,
3711                       0x1000);
3712     assert(bo);
3713     mfc_context->vp8_state.frame_header_bo = bo;
3714
3715     mfc_context->vp8_state.intermediate_buffer_max_size = width_in_mbs * height_in_mbs * 384 * 9;
3716     for(i = 0; i < 8; i++) {
3717         mfc_context->vp8_state.intermediate_partition_offset[i] = width_in_mbs * height_in_mbs * 384 * (i + 1);
3718     }
3719     dri_bo_unreference(mfc_context->vp8_state.intermediate_bo);
3720     bo = dri_bo_alloc(i965->intel.bufmgr,
3721                       "Buffer",
3722                       mfc_context->vp8_state.intermediate_buffer_max_size,
3723                       0x1000);
3724     assert(bo);
3725     mfc_context->vp8_state.intermediate_bo = bo;
3726
3727     dri_bo_unreference(mfc_context->vp8_state.stream_out_bo);
3728     bo = dri_bo_alloc(i965->intel.bufmgr,
3729                       "Buffer",
3730                       width_in_mbs * height_in_mbs * 16,
3731                       0x1000);
3732     assert(bo);
3733     mfc_context->vp8_state.stream_out_bo = bo;
3734
3735     dri_bo_unreference(mfc_context->vp8_state.coeff_probs_stream_in_bo);
3736     bo = dri_bo_alloc(i965->intel.bufmgr,
3737                       "Buffer",
3738                       sizeof(vp8_default_coef_probs),
3739                       0x1000);
3740     assert(bo);
3741     mfc_context->vp8_state.coeff_probs_stream_in_bo = bo;
3742
3743     dri_bo_unreference(mfc_context->vp8_state.token_statistics_bo);
3744     bo = dri_bo_alloc(i965->intel.bufmgr,
3745                       "Buffer",
3746                       VP8_TOKEN_STATISTICS_BUFFER_SIZE,
3747                       0x1000);
3748     assert(bo);
3749     mfc_context->vp8_state.token_statistics_bo = bo;
3750
3751     dri_bo_unreference(mfc_context->vp8_state.mpc_row_store_bo);
3752     bo = dri_bo_alloc(i965->intel.bufmgr,
3753                       "Buffer",
3754                       width_in_mbs * 16 * 64,
3755                       0x1000);
3756     assert(bo);
3757     mfc_context->vp8_state.mpc_row_store_bo = bo;
3758
3759     vp8_enc_state_init(mfc_context, pic_param, q_matrix);
3760     vp8_enc_frame_header_binarize(encode_state, encoder_context, mfc_context);
3761 }
3762
3763 static VAStatus
3764 intel_mfc_vp8_prepare(VADriverContextP ctx,
3765                         struct encode_state *encode_state,
3766                         struct intel_encoder_context *encoder_context)
3767 {
3768     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
3769     struct object_surface *obj_surface;
3770     struct object_buffer *obj_buffer;
3771     struct i965_coded_buffer_segment *coded_buffer_segment;
3772     VAEncPictureParameterBufferVP8 *pic_param = (VAEncPictureParameterBufferVP8 *)encode_state->pic_param_ext->buffer;
3773     VAStatus vaStatus = VA_STATUS_SUCCESS;
3774     dri_bo *bo;
3775     int i;
3776
3777     /* reconstructed surface */
3778     obj_surface = encode_state->reconstructed_object;
3779     i965_check_alloc_surface_bo(ctx, obj_surface, 1, VA_FOURCC('N','V','1','2'), SUBSAMPLE_YUV420);
3780     if (pic_param->loop_filter_level[0] == 0) {
3781         mfc_context->pre_deblocking_output.bo = obj_surface->bo;
3782         dri_bo_reference(mfc_context->pre_deblocking_output.bo);
3783     } else {
3784         mfc_context->post_deblocking_output.bo = obj_surface->bo;
3785         dri_bo_reference(mfc_context->post_deblocking_output.bo);
3786     }
3787
3788     mfc_context->surface_state.width = obj_surface->orig_width;
3789     mfc_context->surface_state.height = obj_surface->orig_height;
3790     mfc_context->surface_state.w_pitch = obj_surface->width;
3791     mfc_context->surface_state.h_pitch = obj_surface->height;
3792
3793     /* set vp8 reference frames */
3794     for (i = 0; i < ARRAY_ELEMS(mfc_context->reference_surfaces); i++) {
3795         obj_surface = encode_state->reference_objects[i];
3796
3797         if (obj_surface && obj_surface->bo) {
3798             mfc_context->reference_surfaces[i].bo = obj_surface->bo;
3799             dri_bo_reference(mfc_context->reference_surfaces[i].bo);
3800         } else {
3801             mfc_context->reference_surfaces[i].bo = NULL;
3802         }
3803     }
3804
3805     /* input YUV surface */
3806     obj_surface = encode_state->input_yuv_object;
3807     mfc_context->uncompressed_picture_source.bo = obj_surface->bo;
3808     dri_bo_reference(mfc_context->uncompressed_picture_source.bo);
3809
3810     /* coded buffer */
3811     obj_buffer = encode_state->coded_buf_object;
3812     bo = obj_buffer->buffer_store->bo;
3813     mfc_context->mfc_indirect_pak_bse_object.bo = bo;
3814     mfc_context->mfc_indirect_pak_bse_object.offset = I965_CODEDBUFFER_HEADER_SIZE;
3815     mfc_context->mfc_indirect_pak_bse_object.end_offset = ALIGN(obj_buffer->size_element - 0x1000, 0x1000);
3816     dri_bo_reference(mfc_context->mfc_indirect_pak_bse_object.bo);
3817
3818     dri_bo_unreference(mfc_context->vp8_state.final_frame_bo);
3819     mfc_context->vp8_state.final_frame_bo = mfc_context->mfc_indirect_pak_bse_object.bo;
3820     mfc_context->vp8_state.final_frame_byte_offset = I965_CODEDBUFFER_HEADER_SIZE;
3821     dri_bo_reference(mfc_context->vp8_state.final_frame_bo);
3822
3823     /* set the internal flag to 0 to indicate the coded size is unknown */
3824     dri_bo_map(bo, 1);
3825     coded_buffer_segment = (struct i965_coded_buffer_segment *)bo->virtual;
3826     coded_buffer_segment->mapped = 0;
3827     coded_buffer_segment->codec = encoder_context->codec;
3828     dri_bo_unmap(bo);
3829
3830     return vaStatus;
3831 }
3832
/*
 * Emit the MFX_VP8_ENCODER_CFG command (30 dwords): rate-control flags,
 * per-MB bit-count limits, frame dimensions, and the layout of the
 * software-built frame header (total bit count plus the qindex /
 * loop-filter / token / mv update positions recorded by the header
 * binarization) so the hardware can patch those fields in place.
 */
static void
gen8_mfc_vp8_encoder_cfg(VADriverContextP ctx, 
                         struct encode_state *encode_state,
                         struct intel_encoder_context *encoder_context)
{
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
    VAEncSequenceParameterBufferVP8 *seq_param = (VAEncSequenceParameterBufferVP8 *)encode_state->seq_param_ext->buffer;
    VAEncPictureParameterBufferVP8 *pic_param = (VAEncPictureParameterBufferVP8 *)encode_state->pic_param_ext->buffer;

    BEGIN_BCS_BATCH(batch, 30);
    OUT_BCS_BATCH(batch, MFX_VP8_ENCODER_CFG | (30 - 2)); /* SKL should be 31-2 ? */

    OUT_BCS_BATCH(batch,
                  0 << 9 | /* compressed bitstream output disable */
                  1 << 7 | /* disable per-segment delta qindex and loop filter in RC */
                  1 << 6 | /* RC initial pass */
                  0 << 4 | /* update segment feature data flag */
                  1 << 3 | /* bitstream statistics output enable */
                  1 << 2 | /* token statistics output enable */
                  0 << 1 | /* final bitstream output disable */
                  0 << 0); /*DW1*/
    
    OUT_BCS_BATCH(batch, 0); /*DW2*/

    OUT_BCS_BATCH(batch, 
                  0xfff << 16 | /* max intra mb bit count limit */
                  0xfff << 0  /* max inter mb bit count limit */
                  ); /*DW3*/

    /* DW4-DW21: rate-control thresholds/deltas, left at hardware defaults */
    OUT_BCS_BATCH(batch, 0); /*DW4*/
    OUT_BCS_BATCH(batch, 0); /*DW5*/
    OUT_BCS_BATCH(batch, 0); /*DW6*/
    OUT_BCS_BATCH(batch, 0); /*DW7*/
    OUT_BCS_BATCH(batch, 0); /*DW8*/
    OUT_BCS_BATCH(batch, 0); /*DW9*/
    OUT_BCS_BATCH(batch, 0); /*DW10*/
    OUT_BCS_BATCH(batch, 0); /*DW11*/
    OUT_BCS_BATCH(batch, 0); /*DW12*/
    OUT_BCS_BATCH(batch, 0); /*DW13*/
    OUT_BCS_BATCH(batch, 0); /*DW14*/
    OUT_BCS_BATCH(batch, 0); /*DW15*/
    OUT_BCS_BATCH(batch, 0); /*DW16*/
    OUT_BCS_BATCH(batch, 0); /*DW17*/
    OUT_BCS_BATCH(batch, 0); /*DW18*/
    OUT_BCS_BATCH(batch, 0); /*DW19*/
    OUT_BCS_BATCH(batch, 0); /*DW20*/
    OUT_BCS_BATCH(batch, 0); /*DW21*/

    OUT_BCS_BATCH(batch, 
                 pic_param->pic_flags.bits.show_frame << 23 |
                 pic_param->pic_flags.bits.version << 20
                 ); /*DW22*/

    /* DW23: scaled height in bits 31:16, scaled width in bits 15:0 */
    OUT_BCS_BATCH(batch,
                 (seq_param->frame_height_scale << 14 | seq_param->frame_height) << 16 |
                 (seq_param->frame_width_scale << 14 | seq_param->frame_width) << 0
                 );

    /*DW24*/
    OUT_BCS_BATCH(batch, mfc_context->vp8_state.frame_header_bit_count); /* frame header bit count */

    /*DW25*/
    OUT_BCS_BATCH(batch, mfc_context->vp8_state.frame_header_qindex_update_pos); /* frame header bin buffer qindex update pointer */

    /*DW26*/
    OUT_BCS_BATCH(batch, mfc_context->vp8_state.frame_header_lf_update_pos); /* frame header bin buffer loop filter update pointer*/

    /*DW27*/
    OUT_BCS_BATCH(batch, mfc_context->vp8_state.frame_header_token_update_pos); /* frame header bin buffer token update pointer */

    /*DW28*/
    OUT_BCS_BATCH(batch, mfc_context->vp8_state.frame_header_bin_mv_upate_pos); /*frame header bin buffer mv update pointer */

    /*DW29*/
    OUT_BCS_BATCH(batch, 0);

    ADVANCE_BCS_BATCH(batch);
}
3912
/*
 * Emit the MFX_VP8_PIC_STATE command (38 dwords): frame dimensions in
 * macroblocks, frame-level flags, loop-filter levels, quantizer indices
 * and deltas, and the probability tables initialized by
 * vp8_enc_state_init()/vp8_enc_state_update().
 */
static void
gen8_mfc_vp8_pic_state(VADriverContextP ctx,
                       struct encode_state *encode_state,
                       struct intel_encoder_context *encoder_context)
{
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
    VAEncSequenceParameterBufferVP8 *seq_param = (VAEncSequenceParameterBufferVP8 *)encode_state->seq_param_ext->buffer;
    VAEncPictureParameterBufferVP8 *pic_param = (VAEncPictureParameterBufferVP8 *)encode_state->pic_param_ext->buffer;
    VAQMatrixBufferVP8 *q_matrix = (VAQMatrixBufferVP8 *)encode_state->q_matrix->buffer;
    int i, j, log2num;

    /* num_token_partitions is already log2 of the partition count here —
     * NOTE(review): confirm against the binarization code's usage. */
    log2num = pic_param->pic_flags.bits.num_token_partitions;

    /*update mode and token probs*/
    vp8_enc_state_update(mfc_context, q_matrix);

    BEGIN_BCS_BATCH(batch, 38);
    OUT_BCS_BATCH(batch, MFX_VP8_PIC_STATE | (38 - 2));
    /* DW1: frame size in MBs minus one (height | width) */
    OUT_BCS_BATCH(batch,
                  (ALIGN(seq_param->frame_height, 16) / 16 - 1) << 16 |
                  (ALIGN(seq_param->frame_width, 16) / 16 - 1) << 0);
 
    /* DW2: frame-level control flags */
    OUT_BCS_BATCH(batch,
                  log2num << 24 |
                  pic_param->sharpness_level << 16 |
                  pic_param->pic_flags.bits.sign_bias_alternate << 13 |
                  pic_param->pic_flags.bits.sign_bias_golden << 12 |
                  pic_param->pic_flags.bits.loop_filter_adj_enable << 11 |
                  pic_param->pic_flags.bits.mb_no_coeff_skip << 10 |
                  pic_param->pic_flags.bits.update_mb_segmentation_map << 9 |
                  pic_param->pic_flags.bits.segmentation_enabled << 8 |
                  !pic_param->pic_flags.bits.frame_type << 5 | /* 0 indicate an intra frame in VP8 stream/spec($9.1)*/
                  (pic_param->pic_flags.bits.version / 2) << 4 |
                  (pic_param->pic_flags.bits.version == 3) << 1 | /* full pixel mode for version 3 */
                  !!pic_param->pic_flags.bits.version << 0); /* version 0: 6 tap */
 
    /* DW3: per-segment loop filter levels */
    OUT_BCS_BATCH(batch,
                  pic_param->loop_filter_level[3] << 24 |
                  pic_param->loop_filter_level[2] << 16 |
                  pic_param->loop_filter_level[1] <<  8 |
                  pic_param->loop_filter_level[0] <<  0);

    /* DW4: per-segment quantizer indices */
    OUT_BCS_BATCH(batch,
                  q_matrix->quantization_index[3] << 24 |
                  q_matrix->quantization_index[2] << 16 |
                  q_matrix->quantization_index[1] <<  8 |
                  q_matrix->quantization_index[0] << 0);

    /* DW5-DW6: qindex deltas packed as sign|magnitude; the sign bit is
     * taken from bit 15 after casting the delta to unsigned short. */
    OUT_BCS_BATCH(batch,
                 ((unsigned short)(q_matrix->quantization_index_delta[4]) >> 15) << 28 | 
                 abs(q_matrix->quantization_index_delta[4]) << 24 |
                 ((unsigned short)(q_matrix->quantization_index_delta[3]) >> 15) << 20 | 
                 abs(q_matrix->quantization_index_delta[3]) << 16 |
                 ((unsigned short)(q_matrix->quantization_index_delta[2]) >> 15) << 12 | 
                 abs(q_matrix->quantization_index_delta[2]) << 8 |
                 ((unsigned short)(q_matrix->quantization_index_delta[1]) >> 15) << 4 | 
                 abs(q_matrix->quantization_index_delta[1]) << 0);

    OUT_BCS_BATCH(batch,
                 ((unsigned short)(q_matrix->quantization_index_delta[0]) >> 15) << 4 |
                 abs(q_matrix->quantization_index_delta[0]) << 0);
    
    /* DW7: quantizer clamping range */
    OUT_BCS_BATCH(batch,
                 pic_param->clamp_qindex_high << 8 |
                 pic_param->clamp_qindex_low << 0);

    /* DW8-DW18: programmed to all ones */
    for (i = 8; i < 19; i++) {
         OUT_BCS_BATCH(batch, 0xffffffff);
    }

    /* DW19: MB segment tree probabilities */
    OUT_BCS_BATCH(batch,
                  mfc_context->vp8_state.mb_segment_tree_probs[2] << 16 |
                  mfc_context->vp8_state.mb_segment_tree_probs[1] <<  8 |
                  mfc_context->vp8_state.mb_segment_tree_probs[0] <<  0);

    /* DW20: frame-level probabilities */
    OUT_BCS_BATCH(batch,
                  mfc_context->vp8_state.prob_skip_false << 24 |
                  mfc_context->vp8_state.prob_intra      << 16 |
                  mfc_context->vp8_state.prob_last       <<  8 |
                  mfc_context->vp8_state.prob_gf         <<  0);

    /* DW21-DW22: luma / chroma mode probabilities */
    OUT_BCS_BATCH(batch,
                  mfc_context->vp8_state.y_mode_probs[3] << 24 |
                  mfc_context->vp8_state.y_mode_probs[2] << 16 |
                  mfc_context->vp8_state.y_mode_probs[1] <<  8 |
                  mfc_context->vp8_state.y_mode_probs[0] <<  0);

    OUT_BCS_BATCH(batch,
                  mfc_context->vp8_state.uv_mode_probs[2] << 16 |
                  mfc_context->vp8_state.uv_mode_probs[1] <<  8 |
                  mfc_context->vp8_state.uv_mode_probs[0] <<  0);
    
    /* MV update value, DW23-DW32 */
    for (i = 0; i < 2; i++) {
        for (j = 0; j < 20; j += 4) {
            OUT_BCS_BATCH(batch,
                          (j + 3 == 19 ? 0 : mfc_context->vp8_state.mv_probs[i][j + 3]) << 24 |
                          mfc_context->vp8_state.mv_probs[i][j + 2] << 16 |
                          mfc_context->vp8_state.mv_probs[i][j + 1] <<  8 |
                          mfc_context->vp8_state.mv_probs[i][j + 0] <<  0);
        }
    }

    /* DW33-DW34: reference / mode loop filter deltas (7-bit fields) */
    OUT_BCS_BATCH(batch,
                  (pic_param->ref_lf_delta[3] & 0x7f) << 24 |
                  (pic_param->ref_lf_delta[2] & 0x7f) << 16 |
                  (pic_param->ref_lf_delta[1] & 0x7f) <<  8 |
                  (pic_param->ref_lf_delta[0] & 0x7f) <<  0);

    OUT_BCS_BATCH(batch,
                  (pic_param->mode_lf_delta[3] & 0x7f) << 24 |
                  (pic_param->mode_lf_delta[2] & 0x7f) << 16 |
                  (pic_param->mode_lf_delta[1] & 0x7f) <<  8 |
                  (pic_param->mode_lf_delta[0] & 0x7f) <<  0);

    /* DW35-DW37: reserved/zero */
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);

    ADVANCE_BCS_BATCH(batch);
}
4035
/*
 * Emit a 3-dword buffer entry for MFX_VP8_BSP_BUF_BASE_ADDR_STATE:
 * the buffer address (a relocation when bo is non-NULL, a literal zero
 * otherwise) followed by two zero dwords.  Wrapped in do { } while (0)
 * so the multi-statement body expands safely inside unbraced if/else —
 * the original expansion ended with a bare if/else plus trailing
 * statements, a classic dangling-else / partial-expansion hazard.
 */
#define OUT_VP8_BUFFER(bo, offset)                                      \
    do {                                                                \
        if (bo)                                                         \
            OUT_BCS_RELOC(batch,                                        \
                          bo,                                           \
                          I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, \
                          offset);                                      \
        else                                                            \
            OUT_BCS_BATCH(batch, 0);                                    \
        OUT_BCS_BATCH(batch, 0);                                        \
        OUT_BCS_BATCH(batch, 0);                                        \
    } while (0)
4046
/*
 * Emit the MFX_VP8_BSP_BUF_BASE_ADDR_STATE command (32 dwords): base
 * addresses of the frame-header buffer, the intermediate (token
 * partition) buffer with its eight partition offsets and size limit,
 * the final frame output, and the stream-out / coefficient-probability
 * / token-statistics / MPC row-store buffers.  Each OUT_VP8_BUFFER
 * expands to three dwords.
 */
static void 
gen8_mfc_vp8_bsp_buf_base_addr_state(VADriverContextP ctx, 
                                     struct encode_state *encode_state,
                                     struct intel_encoder_context *encoder_context)
{
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;

    BEGIN_BCS_BATCH(batch, 32);
    OUT_BCS_BATCH(batch, MFX_VP8_BSP_BUF_BASE_ADDR_STATE | (32 - 2));

    OUT_VP8_BUFFER(mfc_context->vp8_state.frame_header_bo, 0);

    OUT_VP8_BUFFER(mfc_context->vp8_state.intermediate_bo, 0);
    OUT_BCS_BATCH(batch, mfc_context->vp8_state.intermediate_partition_offset[0]);
    OUT_BCS_BATCH(batch, mfc_context->vp8_state.intermediate_partition_offset[1]);
    OUT_BCS_BATCH(batch, mfc_context->vp8_state.intermediate_partition_offset[2]);
    OUT_BCS_BATCH(batch, mfc_context->vp8_state.intermediate_partition_offset[3]);
    OUT_BCS_BATCH(batch, mfc_context->vp8_state.intermediate_partition_offset[4]);
    OUT_BCS_BATCH(batch, mfc_context->vp8_state.intermediate_partition_offset[5]);
    OUT_BCS_BATCH(batch, mfc_context->vp8_state.intermediate_partition_offset[6]);
    OUT_BCS_BATCH(batch, mfc_context->vp8_state.intermediate_partition_offset[7]);
    OUT_BCS_BATCH(batch, mfc_context->vp8_state.intermediate_buffer_max_size);

    /* Final bitstream lands after the coded-buffer header. */
    OUT_VP8_BUFFER(mfc_context->vp8_state.final_frame_bo, I965_CODEDBUFFER_HEADER_SIZE);
    OUT_BCS_BATCH(batch, 0);

    OUT_VP8_BUFFER(mfc_context->vp8_state.stream_out_bo, 0);
    OUT_VP8_BUFFER(mfc_context->vp8_state.coeff_probs_stream_in_bo, 0);
    OUT_VP8_BUFFER(mfc_context->vp8_state.token_statistics_bo, 0);
    OUT_VP8_BUFFER(mfc_context->vp8_state.mpc_row_store_bo, 0);

    ADVANCE_BCS_BATCH(batch);
}
4081
/*
 * Program the picture-level MFX state for a VP8 PAK pass, issuing the
 * commands in the fixed sequence the rest of this file relies on:
 * pipe mode select, surface state, indirect object base addresses,
 * pipe/BSP buffer addresses, the VP8-specific BSP buffer addresses,
 * then VP8 picture state and encoder config.
 */
static void
gen8_mfc_vp8_pipeline_picture_programing(VADriverContextP ctx,
                                           struct encode_state *encode_state,
                                           struct intel_encoder_context *encoder_context)
{
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;

    mfc_context->pipe_mode_select(ctx, MFX_FORMAT_VP8, encoder_context);
    mfc_context->set_surface_state(ctx, encoder_context);
    mfc_context->ind_obj_base_addr_state(ctx, encoder_context);
    gen8_mfc_pipe_buf_addr_state(ctx, encoder_context);
    gen8_mfc_bsp_buf_base_addr_state(ctx, encoder_context);
    gen8_mfc_vp8_bsp_buf_base_addr_state(ctx, encode_state, encoder_context);
    gen8_mfc_vp8_pic_state(ctx, encode_state,encoder_context);
    gen8_mfc_vp8_encoder_cfg(ctx, encode_state, encoder_context);
}
4098
/* Map a VME 16x16 intra prediction mode index (also used for chroma) to
 * the PAK prediction-mode encoding consumed by MFX_VP8_PAK_OBJECT. */
static const unsigned char
vp8_intra_mb_mode_map[VME_MB_INTRA_MODE_COUNT] = {
    PAK_V_PRED,
    PAK_H_PRED,
    PAK_DC_PRED,
    PAK_TM_PRED
};
4106
/* Map a VME 4x4 intra sub-block prediction mode index to the PAK
 * block-mode encoding consumed by MFX_VP8_PAK_OBJECT. */
static const unsigned char
vp8_intra_block_mode_map[VME_B_INTRA_MODE_COUNT] = {
    PAK_B_VE_PRED,
    PAK_B_HE_PRED,
    PAK_B_DC_PRED,
    PAK_B_LD_PRED,
    PAK_B_RD_PRED,
    PAK_B_VR_PRED,
    PAK_B_HD_PRED,
    PAK_B_VL_PRED,
    PAK_B_HU_PRED
};
4119
4120 static int inline gen8_mfc_vp8_intra_mb_mode_map(unsigned int vme_pred_mode, int is_luma_4x4)
4121 {
4122     unsigned int i, pak_pred_mode = 0;
4123     unsigned int vme_sub_blocks_pred_mode[8], pak_sub_blocks_pred_mode[8]; /* 8 blocks's intra mode */
4124
4125     if (!is_luma_4x4) {
4126         pak_pred_mode = vp8_intra_mb_mode_map[vme_pred_mode & 0x3];
4127     } else {
4128         for (i = 0; i < 8; i++) { 
4129             vme_sub_blocks_pred_mode[i] = ((vme_pred_mode >> (4 * i)) & 0xf);
4130             assert(vme_sub_blocks_pred_mode[i] < VME_B_INTRA_MODE_COUNT);
4131             pak_sub_blocks_pred_mode[i] = vp8_intra_block_mode_map[vme_sub_blocks_pred_mode[i]];
4132             pak_pred_mode |= (pak_sub_blocks_pred_mode[i] << (4 * i));
4133         }
4134     }
4135
4136     return pak_pred_mode;
4137 }
/*
 * Emit one MFX_VP8_PAK_OBJECT command for an intra macroblock.
 *
 * msg points at the VME output record for this MB:
 *   msg[0] bits 5:4 - VME intra mb mode
 *   msg[1]/msg[2]   - packed luma prediction modes (remapped by
 *                     gen8_mfc_vp8_intra_mb_mode_map)
 *   msg[3] bits 1:0 - chroma prediction mode
 * (x, y) is the MB position in macroblock units.  A NULL batch falls
 * back to the encoder's base batchbuffer.
 */
static void
gen8_mfc_vp8_pak_object_intra(VADriverContextP ctx,
                              struct intel_encoder_context *encoder_context,
                              unsigned int *msg,
                              int x, int y,
                              struct intel_batchbuffer *batch)
{
    unsigned int vme_intra_mb_mode, vme_chroma_pred_mode;
    unsigned int pak_intra_mb_mode, pak_chroma_pred_mode;
    unsigned int vme_luma_pred_mode[2], pak_luma_pred_mode[2];

    if (batch == NULL)
        batch = encoder_context->base.batch;

    vme_intra_mb_mode = ((msg[0] & 0x30) >> 4);
    assert((vme_intra_mb_mode == 0) || (vme_intra_mb_mode == 2)); // VP8 only supports intra_16x16 and intra_4x4
    /* Collapse {0, 2} to {0, 1}: 0 = intra_16x16, 1 = intra_4x4. */
    pak_intra_mb_mode = (vme_intra_mb_mode >> 1);

    vme_luma_pred_mode[0] = msg[1];
    vme_luma_pred_mode[1] = msg[2];
    vme_chroma_pred_mode = msg[3] & 0x3;

    /* Luma uses the 16x16 or packed-4x4 interpretation depending on mb
     * mode; chroma is always a single mb-level mode. */
    pak_luma_pred_mode[0] = gen8_mfc_vp8_intra_mb_mode_map(vme_luma_pred_mode[0], pak_intra_mb_mode);
    pak_luma_pred_mode[1] = gen8_mfc_vp8_intra_mb_mode_map(vme_luma_pred_mode[1], pak_intra_mb_mode);
    pak_chroma_pred_mode = gen8_mfc_vp8_intra_mb_mode_map(vme_chroma_pred_mode, 0);

    BEGIN_BCS_BATCH(batch, 7);

    OUT_BCS_BATCH(batch, MFX_VP8_PAK_OBJECT | (7 - 2));
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch,
                  (0 << 20) |                    /* mv format: intra mb */
                  (0 << 18) |                    /* Segment ID */
                  (0 << 17) |                    /* disable coeff clamp */
                  (1 << 13) |                    /* intra mb flag */
                  (0 << 11) |                    /* refer picture select: last frame */
                  (pak_intra_mb_mode << 8) |     /* mb type */
                  (pak_chroma_pred_mode << 4) |  /* mb uv mode */
                  (0 << 2) |                     /* skip mb flag: disable */
                  0);

    OUT_BCS_BATCH(batch, (y << 16) | x);
    OUT_BCS_BATCH(batch, pak_luma_pred_mode[0]);
    OUT_BCS_BATCH(batch, pak_luma_pred_mode[1]);

    ADVANCE_BCS_BATCH(batch);
}
4186
/*
 * Emit one MFX_VP8_PAK_OBJECT command for an inter macroblock.
 *
 * msg points at the VME output record for this MB and is rewritten in
 * place (see below); offset is the byte offset of that record inside
 * the VME output buffer — presumably consumed by the hardware as the
 * indirect MV data address (NOTE(review): confirm against the PRM).
 * (x, y) is the MB position in macroblock units.
 */
static void
gen8_mfc_vp8_pak_object_inter(VADriverContextP ctx,
                              struct intel_encoder_context *encoder_context,
                              unsigned int *msg,
                              int offset,
                              int x, int y,
                              struct intel_batchbuffer *batch)
{
    int i;

    if (batch == NULL)
        batch = encoder_context->base.batch;

    /* only inter_16x16 is supported now */
    assert((msg[AVC_INTER_MSG_OFFSET] & INTER_MODE_MASK) == INTER_16X16);
    /* for inter_16x16, all 16 MVs should be the same,
     * and move the mv to the vme mb start address to make sure the offset is 64 bytes aligned;
     * per the vp8 spec, all vp8 luma motion vectors are stored doubled (hence the << 1)
     */
    msg[0] = (((msg[AVC_INTER_MV_OFFSET/4] & 0xffff0000) << 1) | ((msg[AVC_INTER_MV_OFFSET/4] << 1) & 0xffff));

    /* Replicate the doubled MV into all 16 luma MV slots. */
    for (i = 1; i < 16; i++) {
        msg[i] = msg[0];
    }

    BEGIN_BCS_BATCH(batch, 7);

    OUT_BCS_BATCH(batch, MFX_VP8_PAK_OBJECT | (7 - 2));
    OUT_BCS_BATCH(batch,
                  (0 << 29) |           /* enable inline mv data: disable */
                  64);
    OUT_BCS_BATCH(batch,
                  offset);
    OUT_BCS_BATCH(batch,
                  (4 << 20) |           /* mv format: inter */
                  (0 << 18) |           /* Segment ID */
                  (0 << 17) |           /* coeff clamp: disable */
                  (0 << 13) |           /* intra mb flag: inter mb */
                  (0 << 11) |           /* refer picture select: last frame */
                  (0 << 8) |            /* mb type: 16x16 */
                  (0 << 4) |            /* mb uv mode: dc_pred */
                  (0 << 2) |            /* skip mb flag: disable */
                  0);

    OUT_BCS_BATCH(batch, (y << 16) | x);

    /*new mv*/
    OUT_BCS_BATCH(batch, 0x8);
    OUT_BCS_BATCH(batch, 0x8);

    ADVANCE_BCS_BATCH(batch);
}
4239
/*
 * Walk the VME output buffer and emit one MFX_VP8_PAK_OBJECT per
 * macroblock into slice_batch.  On intra (key) frames every MB is
 * coded as intra; on inter frames the per-MB intra/inter decision
 * follows the VME RDO costs (lower cost wins).
 */
static void
gen8_mfc_vp8_pak_pipeline(VADriverContextP ctx,
                          struct encode_state *encode_state,
                          struct intel_encoder_context *encoder_context,
                          struct intel_batchbuffer *slice_batch)
{
    struct gen6_vme_context *vme_context = encoder_context->vme_context;
    VAEncSequenceParameterBufferVP8 *seq_param = (VAEncSequenceParameterBufferVP8 *)encode_state->seq_param_ext->buffer;
    VAEncPictureParameterBufferVP8 *pic_param = (VAEncPictureParameterBufferVP8 *)encode_state->pic_param_ext->buffer;
    int width_in_mbs = ALIGN(seq_param->frame_width, 16) / 16;
    int height_in_mbs = ALIGN(seq_param->frame_height, 16) / 16;
    unsigned int *msg = NULL;
    unsigned char *msg_ptr = NULL;
    unsigned int i, offset, is_intra_frame;

    /* frame_type == 0 denotes a VP8 key (intra) frame. */
    is_intra_frame = !pic_param->pic_flags.bits.frame_type;

    dri_bo_map(vme_context->vme_output.bo , 1);
    msg_ptr = (unsigned char *)vme_context->vme_output.bo->virtual;

    for( i = 0; i < width_in_mbs * height_in_mbs; i++) {
        int h_pos = i % width_in_mbs;
        int v_pos = i / width_in_mbs;
        /* Each MB owns one fixed-size record in the VME output buffer. */
        msg = (unsigned int *) (msg_ptr + i * vme_context->vme_output.size_block);

        if (is_intra_frame) {
            gen8_mfc_vp8_pak_object_intra(ctx,
                    encoder_context,
                    msg,
                    h_pos, v_pos,
                    slice_batch);
        } else {
            int inter_rdo, intra_rdo;
            inter_rdo = msg[AVC_INTER_RDO_OFFSET] & AVC_RDO_MASK;
            intra_rdo = msg[AVC_INTRA_RDO_OFFSET] & AVC_RDO_MASK;

            if (intra_rdo < inter_rdo) {
                gen8_mfc_vp8_pak_object_intra(ctx,
                        encoder_context,
                        msg,
                        h_pos, v_pos,
                        slice_batch);
            } else {
                offset = i * vme_context->vme_output.size_block;
                gen8_mfc_vp8_pak_object_inter(ctx,
                        encoder_context,
                        msg,
                        offset,
                        h_pos, v_pos,
                        slice_batch);
            }
        }
    }

    dri_bo_unmap(vme_context->vme_output.bo);
}
4296
4297 /*
4298  * A batch buffer for vp8 pak object commands
4299  */
4300 static dri_bo *
4301 gen8_mfc_vp8_software_batchbuffer(VADriverContextP ctx,
4302                                           struct encode_state *encode_state,
4303                                           struct intel_encoder_context *encoder_context)
4304 {
4305     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
4306     struct intel_batchbuffer *batch;
4307     dri_bo *batch_bo;
4308
4309     batch = mfc_context->aux_batchbuffer;
4310     batch_bo = batch->buffer;
4311
4312     gen8_mfc_vp8_pak_pipeline(ctx, encode_state, encoder_context, batch);
4313
4314     intel_batchbuffer_align(batch, 8);
4315
4316     BEGIN_BCS_BATCH(batch, 2);
4317     OUT_BCS_BATCH(batch, 0);
4318     OUT_BCS_BATCH(batch, MI_BATCH_BUFFER_END);
4319     ADVANCE_BCS_BATCH(batch);
4320
4321     dri_bo_reference(batch_bo);
4322     intel_batchbuffer_free(batch);
4323     mfc_context->aux_batchbuffer = NULL;
4324
4325     return batch_bo;
4326 }
4327
/*
 * Program the full BCS command sequence for one VP8 frame: emit the
 * picture-level state in the main batch, then chain to the
 * CPU-constructed second-level batch containing the per-MB PAK object
 * commands.
 */
static void
gen8_mfc_vp8_pipeline_programing(VADriverContextP ctx,
                                   struct encode_state *encode_state,
                                   struct intel_encoder_context *encoder_context)
{
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    dri_bo *slice_batch_bo;

    /* Returns a referenced BO; unreferenced at the end of this function. */
    slice_batch_bo = gen8_mfc_vp8_software_batchbuffer(ctx, encode_state, encoder_context);

    // begin programing
    intel_batchbuffer_start_atomic_bcs(batch, 0x4000);
    intel_batchbuffer_emit_mi_flush(batch);

    // picture level programing
    gen8_mfc_vp8_pipeline_picture_programing(ctx, encode_state, encoder_context);

    /* Chain to the second-level batch.  NOTE(review): (1 << 8) appears
     * to select second-level batch mode and (1 << 0) the address space
     * flag — confirm bit meanings against the gen8 PRM. */
    BEGIN_BCS_BATCH(batch, 4);
    OUT_BCS_BATCH(batch, MI_BATCH_BUFFER_START | (1 << 8) | (1 << 0));
    OUT_BCS_RELOC(batch,
                  slice_batch_bo,
                  I915_GEM_DOMAIN_COMMAND, 0,
                  0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    ADVANCE_BCS_BATCH(batch);

    // end programing
    intel_batchbuffer_end_atomic(batch);

    dri_bo_unreference(slice_batch_bo);
}
4360
/*
 * Read back the per-partition sizes (in bits) from the token statistics
 * BO, derive the total coded frame size in bytes, and record it in the
 * coded buffer segment header of the final frame BO.
 *
 * Returns the coded size in bytes.
 */
static int gen8_mfc_calc_vp8_coded_buffer_size(VADriverContextP ctx,
                          struct encode_state *encode_state,
                          struct intel_encoder_context *encoder_context)
{
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
    VAEncPictureParameterBufferVP8 *pic_param = (VAEncPictureParameterBufferVP8 *)encode_state->pic_param_ext->buffer;
    unsigned char is_intra_frame = !pic_param->pic_flags.bits.frame_type;
    unsigned int *vp8_encoding_status, i, first_partition_bytes, token_partition_bytes, vp8_coded_bytes;

    /* VP8 uses 2^num_token_partitions token partitions (1, 2, 4 or 8). */
    int partition_num = 1 << pic_param->pic_flags.bits.num_token_partitions;

    first_partition_bytes = token_partition_bytes = vp8_coded_bytes = 0;

    dri_bo_map(mfc_context->vp8_state.token_statistics_bo, 0);

    vp8_encoding_status = (unsigned int *)mfc_context->vp8_state.token_statistics_bo->virtual;
    /* Entry 0 is the first (control) partition size in bits; round up to bytes. */
    first_partition_bytes = (vp8_encoding_status[0] + 7) / 8;

    /* Entries 1..partition_num hold the token partition sizes in bits. */
    for (i = 1; i <= partition_num; i++)
        token_partition_bytes += (vp8_encoding_status[i] + 7) / 8;

    /*coded_bytes includes P0~P8 partitions bytes + uncompressed data bytes + partition_size bytes in bitstream + 3 extra bytes */
    /*it seems the last partition size in the vp8 status buffer is smaller than reality, so add 3 extra bytes */
    /* Uncompressed data chunk: 3-byte frame tag, plus 7 more bytes (start
     * code and dimensions) on key frames; each extra partition costs a
     * 3-byte size field in the bitstream. */
    vp8_coded_bytes = first_partition_bytes + token_partition_bytes + (3 + 7 * !!is_intra_frame) + (partition_num - 1) * 3 + 3;

    dri_bo_unmap(mfc_context->vp8_state.token_statistics_bo);

    /* Publish the coded size in the coded buffer segment header. */
    dri_bo_map(mfc_context->vp8_state.final_frame_bo, 0);
    struct i965_coded_buffer_segment *coded_buffer_segment = (struct i965_coded_buffer_segment *)(mfc_context->vp8_state.final_frame_bo->virtual);
    coded_buffer_segment->base.size = vp8_coded_bytes;
    dri_bo_unmap(mfc_context->vp8_state.final_frame_bo);

    return vp8_coded_bytes;
}
4395
/*
 * Top-level VP8 PAK entry point: allocate/prepare state, program the
 * BCS pipeline, run the encode, then compute the coded size and — in
 * CBR mode — run BRC post-processing and update the HRD model.
 *
 * Always returns VA_STATUS_SUCCESS; an unrepairable HRD violation is
 * only reported once on stderr.
 */
static VAStatus
gen8_mfc_vp8_encode_picture(VADriverContextP ctx,
                              struct encode_state *encode_state,
                              struct intel_encoder_context *encoder_context)
{
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
    unsigned int rate_control_mode = encoder_context->rate_control_mode;
    int current_frame_bits_size;
    int sts;

    gen8_mfc_vp8_init(ctx, encode_state, encoder_context);
    intel_mfc_vp8_prepare(ctx, encode_state, encoder_context);
    /*Programing bcs pipeline*/
    gen8_mfc_vp8_pipeline_programing(ctx, encode_state, encoder_context);
    gen8_mfc_run(ctx, encode_state, encoder_context);
    /* BRC works in bits; the size calculation returns bytes. */
    current_frame_bits_size = 8 * gen8_mfc_calc_vp8_coded_buffer_size(ctx, encode_state, encoder_context);

    if (rate_control_mode == VA_RC_CBR /*|| rate_control_mode == VA_RC_VBR*/) {
        sts = gen8_mfc_vp8_brc_postpack(encode_state, mfc_context, current_frame_bits_size);
        if (sts == BRC_NO_HRD_VIOLATION) {
            gen8_mfc_vp8_hrd_context_update(encode_state, mfc_context);
        }
        else if (sts == BRC_OVERFLOW_WITH_MIN_QP || sts == BRC_UNDERFLOW_WITH_MAX_QP) {
            if (!mfc_context->hrd.violation_noted) {
                fprintf(stderr, "Unrepairable %s!\n", (sts == BRC_OVERFLOW_WITH_MIN_QP)? "overflow": "underflow");
                mfc_context->hrd.violation_noted = 1;
            }
            return VA_STATUS_SUCCESS;
        }
    }

    return VA_STATUS_SUCCESS;
}
4429
4430 static void
4431 gen8_mfc_context_destroy(void *context)
4432 {
4433     struct gen6_mfc_context *mfc_context = context;
4434     int i;
4435
4436     dri_bo_unreference(mfc_context->post_deblocking_output.bo);
4437     mfc_context->post_deblocking_output.bo = NULL;
4438
4439     dri_bo_unreference(mfc_context->pre_deblocking_output.bo);
4440     mfc_context->pre_deblocking_output.bo = NULL;
4441
4442     dri_bo_unreference(mfc_context->uncompressed_picture_source.bo);
4443     mfc_context->uncompressed_picture_source.bo = NULL;
4444
4445     dri_bo_unreference(mfc_context->mfc_indirect_pak_bse_object.bo); 
4446     mfc_context->mfc_indirect_pak_bse_object.bo = NULL;
4447
4448     for (i = 0; i < NUM_MFC_DMV_BUFFERS; i++){
4449         dri_bo_unreference(mfc_context->direct_mv_buffers[i].bo);
4450         mfc_context->direct_mv_buffers[i].bo = NULL;
4451     }
4452
4453     dri_bo_unreference(mfc_context->intra_row_store_scratch_buffer.bo);
4454     mfc_context->intra_row_store_scratch_buffer.bo = NULL;
4455
4456     dri_bo_unreference(mfc_context->macroblock_status_buffer.bo);
4457     mfc_context->macroblock_status_buffer.bo = NULL;
4458
4459     dri_bo_unreference(mfc_context->deblocking_filter_row_store_scratch_buffer.bo);
4460     mfc_context->deblocking_filter_row_store_scratch_buffer.bo = NULL;
4461
4462     dri_bo_unreference(mfc_context->bsd_mpc_row_store_scratch_buffer.bo);
4463     mfc_context->bsd_mpc_row_store_scratch_buffer.bo = NULL;
4464
4465
4466     for (i = 0; i < MAX_MFC_REFERENCE_SURFACES; i++){
4467         dri_bo_unreference(mfc_context->reference_surfaces[i].bo);
4468         mfc_context->reference_surfaces[i].bo = NULL;  
4469     }
4470
4471     gen8_gpe_context_destroy(&mfc_context->gpe_context);
4472
4473     dri_bo_unreference(mfc_context->mfc_batchbuffer_surface.bo);
4474     mfc_context->mfc_batchbuffer_surface.bo = NULL;
4475
4476     dri_bo_unreference(mfc_context->aux_batchbuffer_surface.bo);
4477     mfc_context->aux_batchbuffer_surface.bo = NULL;
4478
4479     if (mfc_context->aux_batchbuffer)
4480         intel_batchbuffer_free(mfc_context->aux_batchbuffer);
4481
4482     mfc_context->aux_batchbuffer = NULL;
4483
4484     dri_bo_unreference(mfc_context->vp8_state.coeff_probs_stream_in_bo);
4485     mfc_context->vp8_state.coeff_probs_stream_in_bo = NULL;
4486
4487     dri_bo_unreference(mfc_context->vp8_state.final_frame_bo);
4488     mfc_context->vp8_state.final_frame_bo = NULL;
4489
4490     dri_bo_unreference(mfc_context->vp8_state.frame_header_bo);
4491     mfc_context->vp8_state.frame_header_bo = NULL;
4492
4493     dri_bo_unreference(mfc_context->vp8_state.intermediate_bo);
4494     mfc_context->vp8_state.intermediate_bo = NULL;
4495
4496     dri_bo_unreference(mfc_context->vp8_state.mpc_row_store_bo);
4497     mfc_context->vp8_state.mpc_row_store_bo = NULL;
4498
4499     dri_bo_unreference(mfc_context->vp8_state.stream_out_bo);
4500     mfc_context->vp8_state.stream_out_bo = NULL;
4501
4502     dri_bo_unreference(mfc_context->vp8_state.token_statistics_bo);
4503     mfc_context->vp8_state.token_statistics_bo = NULL;
4504
4505     free(mfc_context);
4506 }
4507
4508 static VAStatus gen8_mfc_pipeline(VADriverContextP ctx,
4509                                   VAProfile profile,
4510                                   struct encode_state *encode_state,
4511                                   struct intel_encoder_context *encoder_context)
4512 {
4513     VAStatus vaStatus;
4514
4515     switch (profile) {
4516     case VAProfileH264ConstrainedBaseline:
4517     case VAProfileH264Main:
4518     case VAProfileH264High:
4519     case VAProfileH264MultiviewHigh:
4520     case VAProfileH264StereoHigh:
4521         vaStatus = gen8_mfc_avc_encode_picture(ctx, encode_state, encoder_context);
4522         break;
4523
4524         /* FIXME: add for other profile */
4525     case VAProfileMPEG2Simple:
4526     case VAProfileMPEG2Main:
4527         vaStatus = gen8_mfc_mpeg2_encode_picture(ctx, encode_state, encoder_context);
4528         break;
4529
4530     case VAProfileJPEGBaseline:
4531         jpeg_init_default_qmatrix(ctx, encoder_context);
4532         vaStatus = gen8_mfc_jpeg_encode_picture(ctx, encode_state, encoder_context);
4533         break;
4534  
4535     case VAProfileVP8Version0_3:
4536         vaStatus = gen8_mfc_vp8_encode_picture(ctx, encode_state, encoder_context);
4537         break;
4538  
4539     default:
4540         vaStatus = VA_STATUS_ERROR_UNSUPPORTED_PROFILE;
4541         break;
4542     }
4543
4544     return vaStatus;
4545 }
4546
4547 Bool gen8_mfc_context_init(VADriverContextP ctx, struct intel_encoder_context *encoder_context)
4548 {
4549     struct i965_driver_data *i965 = i965_driver_data(ctx);
4550     struct gen6_mfc_context *mfc_context = calloc(1, sizeof(struct gen6_mfc_context));
4551
4552     assert(mfc_context);
4553     mfc_context->gpe_context.surface_state_binding_table.length = (SURFACE_STATE_PADDED_SIZE + sizeof(unsigned int)) * MAX_MEDIA_SURFACES_GEN6;
4554
4555     mfc_context->gpe_context.idrt_size = sizeof(struct gen8_interface_descriptor_data) * MAX_INTERFACE_DESC_GEN6;
4556     mfc_context->gpe_context.curbe_size = 32 * 4;
4557     mfc_context->gpe_context.sampler_size = 0;
4558
4559     mfc_context->gpe_context.vfe_state.max_num_threads = 60 - 1;
4560     mfc_context->gpe_context.vfe_state.num_urb_entries = 16;
4561     mfc_context->gpe_context.vfe_state.gpgpu_mode = 0;
4562     mfc_context->gpe_context.vfe_state.urb_entry_size = 59 - 1;
4563     mfc_context->gpe_context.vfe_state.curbe_allocation_size = 37 - 1;
4564
4565     if (IS_GEN9(i965->intel.device_info)) {
4566         gen8_gpe_load_kernels(ctx,
4567                           &mfc_context->gpe_context,
4568                           gen9_mfc_kernels,
4569                           1);
4570     } else {
4571         gen8_gpe_load_kernels(ctx,
4572                           &mfc_context->gpe_context,
4573                           gen8_mfc_kernels,
4574                           1);
4575     }
4576
4577     mfc_context->pipe_mode_select = gen8_mfc_pipe_mode_select;
4578     mfc_context->set_surface_state = gen8_mfc_surface_state;
4579     mfc_context->ind_obj_base_addr_state = gen8_mfc_ind_obj_base_addr_state;
4580     mfc_context->avc_img_state = gen8_mfc_avc_img_state;
4581     mfc_context->avc_qm_state = gen8_mfc_avc_qm_state;
4582     mfc_context->avc_fqm_state = gen8_mfc_avc_fqm_state;
4583     mfc_context->insert_object = gen8_mfc_avc_insert_object;
4584     mfc_context->buffer_suface_setup = gen8_gpe_buffer_suface_setup;
4585
4586     encoder_context->mfc_context = mfc_context;
4587     encoder_context->mfc_context_destroy = gen8_mfc_context_destroy;
4588     encoder_context->mfc_pipeline = gen8_mfc_pipeline;
4589
4590     if (encoder_context->codec == CODEC_VP8)
4591         encoder_context->mfc_brc_prepare = gen8_mfc_vp8_brc_prepare;
4592     else
4593         encoder_context->mfc_brc_prepare = intel_mfc_brc_prepare;
4594
4595     return True;
4596 }