OSDN Git Service

Fix wrong y_cb/cr_offset when the imported surface's fourcc is VA_FOURCC_411P
[android-x86/hardware-intel-common-vaapi.git] / src / gen6_mfc_common.c
index a6b418c..0d21a11 100644 (file)
 #include "i965_encoder_utils.h"
 #include "gen6_mfc.h"
 #include "gen6_vme.h"
+#include "gen9_mfc.h"
 #include "intel_media.h"
 
-#define BRC_CLIP(x, min, max)                                   \
-    {                                                           \
-        x = ((x > (max)) ? (max) : ((x < (min)) ? (min) : x));  \
-    }
-
-#define BRC_P_B_QP_DIFF 4
-#define BRC_I_P_QP_DIFF 2
-#define BRC_I_B_QP_DIFF (BRC_I_P_QP_DIFF + BRC_P_B_QP_DIFF)
-
-#define BRC_PWEIGHT 0.6  /* weight if P slice with comparison to I slice */
-#define BRC_BWEIGHT 0.25 /* weight if B slice with comparison to I slice */
-
-#define BRC_QP_MAX_CHANGE 5 /* maximum qp modification */
-#define BRC_CY 0.1 /* weight for */
-#define BRC_CX_UNDERFLOW 5.
-#define BRC_CX_OVERFLOW -4.
-
-#define BRC_PI_0_5 1.5707963267948966192313216916398
-
 #ifndef HAVE_LOG2F
 #define log2f(x) (logf(x)/(float)M_LN2)
 #endif
@@ -86,25 +68,12 @@ int intel_avc_enc_slice_type_fixup(int slice_type)
 
 static void
 intel_mfc_bit_rate_control_context_init(struct encode_state *encode_state, 
-                                        struct gen6_mfc_context *mfc_context)
+                                        struct intel_encoder_context *encoder_context)
 {
-    VAEncSequenceParameterBufferH264 *pSequenceParameter = (VAEncSequenceParameterBufferH264 *)encode_state->seq_param_ext->buffer;
-    int width_in_mbs = pSequenceParameter->picture_width_in_mbs;
-    int height_in_mbs = pSequenceParameter->picture_height_in_mbs;
-    float fps =  pSequenceParameter->time_scale * 0.5 / pSequenceParameter->num_units_in_tick ;
-    int inter_mb_size = pSequenceParameter->bits_per_second * 1.0 / (fps+4.0) / width_in_mbs / height_in_mbs;
-    int intra_mb_size = inter_mb_size * 5.0;
+    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
     int i;
 
-    mfc_context->bit_rate_control_context[SLICE_TYPE_I].target_mb_size = intra_mb_size;
-    mfc_context->bit_rate_control_context[SLICE_TYPE_I].target_frame_size = intra_mb_size * width_in_mbs * height_in_mbs;
-    mfc_context->bit_rate_control_context[SLICE_TYPE_P].target_mb_size = inter_mb_size;
-    mfc_context->bit_rate_control_context[SLICE_TYPE_P].target_frame_size = inter_mb_size * width_in_mbs * height_in_mbs;
-    mfc_context->bit_rate_control_context[SLICE_TYPE_B].target_mb_size = inter_mb_size;
-    mfc_context->bit_rate_control_context[SLICE_TYPE_B].target_frame_size = inter_mb_size * width_in_mbs * height_in_mbs;
-
     for(i = 0 ; i < 3; i++) {
-        mfc_context->bit_rate_control_context[i].QpPrimeY = 26;
         mfc_context->bit_rate_control_context[i].MaxQpNegModifier = 6;
         mfc_context->bit_rate_control_context[i].MaxQpPosModifier = 6;
         mfc_context->bit_rate_control_context[i].GrowInit = 6;
@@ -119,113 +88,155 @@ intel_mfc_bit_rate_control_context_init(struct encode_state *encode_state,
         mfc_context->bit_rate_control_context[i].Correct[4] = 4;
         mfc_context->bit_rate_control_context[i].Correct[5] = 8;
     }
-    
-    mfc_context->bit_rate_control_context[SLICE_TYPE_I].TargetSizeInWord = (intra_mb_size + 16)/ 16;
-    mfc_context->bit_rate_control_context[SLICE_TYPE_P].TargetSizeInWord = (inter_mb_size + 16)/ 16;
-    mfc_context->bit_rate_control_context[SLICE_TYPE_B].TargetSizeInWord = (inter_mb_size + 16)/ 16;
-
-    mfc_context->bit_rate_control_context[SLICE_TYPE_I].MaxSizeInWord = mfc_context->bit_rate_control_context[SLICE_TYPE_I].TargetSizeInWord * 1.5;
-    mfc_context->bit_rate_control_context[SLICE_TYPE_P].MaxSizeInWord = mfc_context->bit_rate_control_context[SLICE_TYPE_P].TargetSizeInWord * 1.5;
-    mfc_context->bit_rate_control_context[SLICE_TYPE_B].MaxSizeInWord = mfc_context->bit_rate_control_context[SLICE_TYPE_B].TargetSizeInWord * 1.5;
 }
 
 static void intel_mfc_brc_init(struct encode_state *encode_state,
                                struct intel_encoder_context* encoder_context)
 {
     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
-    VAEncSequenceParameterBufferH264 *pSequenceParameter = (VAEncSequenceParameterBufferH264 *)encode_state->seq_param_ext->buffer;
-    VAEncMiscParameterBuffer* pMiscParamHRD = (VAEncMiscParameterBuffer*)encode_state->misc_param[VAEncMiscParameterTypeHRD]->buffer;
-    VAEncMiscParameterHRD* pParameterHRD = (VAEncMiscParameterHRD*)pMiscParamHRD->data;
-    double bitrate = pSequenceParameter->bits_per_second;
-    double framerate = (double)pSequenceParameter->time_scale /(2 * (double)pSequenceParameter->num_units_in_tick);
-    int inum = 1, pnum = 0, bnum = 0; /* Gop structure: number of I, P, B frames in the Gop. */
-    int intra_period = pSequenceParameter->intra_period;
-    int ip_period = pSequenceParameter->ip_period;
-    double qp1_size = 0.1 * 8 * 3 * (pSequenceParameter->picture_width_in_mbs<<4) * (pSequenceParameter->picture_height_in_mbs<<4)/2;
-    double qp51_size = 0.001 * 8 * 3 * (pSequenceParameter->picture_width_in_mbs<<4) * (pSequenceParameter->picture_height_in_mbs<<4)/2;
-    double bpf;
-
-    if (pSequenceParameter->ip_period) {
-        pnum = (intra_period + ip_period - 1)/ip_period - 1;
-        bnum = intra_period - inum - pnum;
-    }
+    double bitrate, framerate;
+    double frame_per_bits = 8 * 3 * encoder_context->frame_width_in_pixel * encoder_context->frame_height_in_pixel / 2;
+    double qp1_size = 0.1 * frame_per_bits;
+    double qp51_size = 0.001 * frame_per_bits;
+    int min_qp = MAX(1, encoder_context->brc.min_qp);
+    double bpf, factor, hrd_factor;
+    int inum = encoder_context->brc.num_iframes_in_gop,
+        pnum = encoder_context->brc.num_pframes_in_gop,
+        bnum = encoder_context->brc.num_bframes_in_gop; /* Gop structure: number of I, P, B frames in the Gop. */
+    int intra_period = encoder_context->brc.gop_size;
+    int i;
+
+    if (encoder_context->layer.num_layers > 1)
+        qp1_size = 0.15 * frame_per_bits;
 
     mfc_context->brc.mode = encoder_context->rate_control_mode;
 
-    mfc_context->brc.target_frame_size[SLICE_TYPE_I] = (int)((double)((bitrate * intra_period)/framerate) /
-                                                             (double)(inum + BRC_PWEIGHT * pnum + BRC_BWEIGHT * bnum));
-    mfc_context->brc.target_frame_size[SLICE_TYPE_P] = BRC_PWEIGHT * mfc_context->brc.target_frame_size[SLICE_TYPE_I];
-    mfc_context->brc.target_frame_size[SLICE_TYPE_B] = BRC_BWEIGHT * mfc_context->brc.target_frame_size[SLICE_TYPE_I];
+    mfc_context->hrd.violation_noted = 0;
 
-    mfc_context->brc.gop_nums[SLICE_TYPE_I] = inum;
-    mfc_context->brc.gop_nums[SLICE_TYPE_P] = pnum;
-    mfc_context->brc.gop_nums[SLICE_TYPE_B] = bnum;
+    for (i = 0; i < encoder_context->layer.num_layers; i++) {
+        mfc_context->brc.qp_prime_y[i][SLICE_TYPE_I] = 26;
+        mfc_context->brc.qp_prime_y[i][SLICE_TYPE_P] = 26;
+        mfc_context->brc.qp_prime_y[i][SLICE_TYPE_B] = 26;
 
-    bpf = mfc_context->brc.bits_per_frame = bitrate/framerate;
+        if (i == 0) {
+            bitrate = encoder_context->brc.bits_per_second[0];
+            framerate = (double)encoder_context->brc.framerate[0].num / (double)encoder_context->brc.framerate[0].den;
+        } else {
+            bitrate = (encoder_context->brc.bits_per_second[i] - encoder_context->brc.bits_per_second[i - 1]);
+            framerate = ((double)encoder_context->brc.framerate[i].num / (double)encoder_context->brc.framerate[i].den) -
+                ((double)encoder_context->brc.framerate[i - 1].num / (double)encoder_context->brc.framerate[i - 1].den);
+        }
 
-    mfc_context->hrd.buffer_size = (double)pParameterHRD->buffer_size;
-    mfc_context->hrd.current_buffer_fullness =
-        (double)(pParameterHRD->initial_buffer_fullness < mfc_context->hrd.buffer_size)?
-        pParameterHRD->initial_buffer_fullness: mfc_context->hrd.buffer_size/2.;
-    mfc_context->hrd.target_buffer_fullness = (double)mfc_context->hrd.buffer_size/2.;
-    mfc_context->hrd.buffer_capacity = (double)mfc_context->hrd.buffer_size/qp1_size;
-    mfc_context->hrd.violation_noted = 0;
+        if (mfc_context->brc.mode == VA_RC_VBR && encoder_context->brc.target_percentage[i])
+            bitrate = bitrate * encoder_context->brc.target_percentage[i] / 100;
 
-    if ((bpf > qp51_size) && (bpf < qp1_size)) {
-        mfc_context->bit_rate_control_context[SLICE_TYPE_P].QpPrimeY = 51 - 50*(bpf - qp51_size)/(qp1_size - qp51_size);
-    }
-    else if (bpf >= qp1_size)
-        mfc_context->bit_rate_control_context[SLICE_TYPE_P].QpPrimeY = 1;
-    else if (bpf <= qp51_size)
-        mfc_context->bit_rate_control_context[SLICE_TYPE_P].QpPrimeY = 51;
+        if (i == encoder_context->layer.num_layers - 1)
+            factor = 1.0;
+        else {
+            factor = ((double)encoder_context->brc.framerate[i].num / (double)encoder_context->brc.framerate[i].den) /
+                ((double)encoder_context->brc.framerate[i - 1].num / (double)encoder_context->brc.framerate[i - 1].den);
+        }
+
+        hrd_factor = (double)bitrate / encoder_context->brc.bits_per_second[encoder_context->layer.num_layers - 1];
+
+        mfc_context->hrd.buffer_size[i] = (unsigned int)(encoder_context->brc.hrd_buffer_size * hrd_factor);
+        mfc_context->hrd.current_buffer_fullness[i] =
+            (double)(encoder_context->brc.hrd_initial_buffer_fullness < encoder_context->brc.hrd_buffer_size) ?
+            encoder_context->brc.hrd_initial_buffer_fullness : encoder_context->brc.hrd_buffer_size / 2.;
+        mfc_context->hrd.current_buffer_fullness[i] *= hrd_factor;
+        mfc_context->hrd.target_buffer_fullness[i] = (double)encoder_context->brc.hrd_buffer_size * hrd_factor / 2.;
+        mfc_context->hrd.buffer_capacity[i] = (double)encoder_context->brc.hrd_buffer_size * hrd_factor / qp1_size;
+
+        if (encoder_context->layer.num_layers > 1) {
+            if (i == 0) {
+                intra_period = (int)(encoder_context->brc.gop_size * factor);
+                inum = 1;
+                pnum = (int)(encoder_context->brc.num_pframes_in_gop * factor);
+                bnum = intra_period - inum - pnum;
+            } else {
+                intra_period = (int)(encoder_context->brc.gop_size * factor) - intra_period;
+                inum = 0;
+                pnum = (int)(encoder_context->brc.num_pframes_in_gop * factor) - pnum;
+                bnum = intra_period - inum - pnum;
+            }
+        }
+
+        mfc_context->brc.gop_nums[i][SLICE_TYPE_I] = inum;
+        mfc_context->brc.gop_nums[i][SLICE_TYPE_P] = pnum;
+        mfc_context->brc.gop_nums[i][SLICE_TYPE_B] = bnum;
+
+        mfc_context->brc.target_frame_size[i][SLICE_TYPE_I] = (int)((double)((bitrate * intra_period)/framerate) /
+                                                                    (double)(inum + BRC_PWEIGHT * pnum + BRC_BWEIGHT * bnum));
+        mfc_context->brc.target_frame_size[i][SLICE_TYPE_P] = BRC_PWEIGHT * mfc_context->brc.target_frame_size[i][SLICE_TYPE_I];
+        mfc_context->brc.target_frame_size[i][SLICE_TYPE_B] = BRC_BWEIGHT * mfc_context->brc.target_frame_size[i][SLICE_TYPE_I];
+
+        bpf = mfc_context->brc.bits_per_frame[i] = bitrate/framerate;
 
-    mfc_context->bit_rate_control_context[SLICE_TYPE_I].QpPrimeY = mfc_context->bit_rate_control_context[SLICE_TYPE_P].QpPrimeY;
-    mfc_context->bit_rate_control_context[SLICE_TYPE_B].QpPrimeY = mfc_context->bit_rate_control_context[SLICE_TYPE_I].QpPrimeY;
+        if (encoder_context->brc.initial_qp) {
+            mfc_context->brc.qp_prime_y[i][SLICE_TYPE_I] = encoder_context->brc.initial_qp;
+            mfc_context->brc.qp_prime_y[i][SLICE_TYPE_P] = encoder_context->brc.initial_qp;
+            mfc_context->brc.qp_prime_y[i][SLICE_TYPE_B] = encoder_context->brc.initial_qp;
+        } else {
+            if ((bpf > qp51_size) && (bpf < qp1_size)) {
+                mfc_context->brc.qp_prime_y[i][SLICE_TYPE_P] = 51 - 50*(bpf - qp51_size)/(qp1_size - qp51_size);
+            }
+            else if (bpf >= qp1_size)
+                mfc_context->brc.qp_prime_y[i][SLICE_TYPE_P] = 1;
+            else if (bpf <= qp51_size)
+                mfc_context->brc.qp_prime_y[i][SLICE_TYPE_P] = 51;
 
-    BRC_CLIP(mfc_context->bit_rate_control_context[SLICE_TYPE_I].QpPrimeY, 1, 51);
-    BRC_CLIP(mfc_context->bit_rate_control_context[SLICE_TYPE_P].QpPrimeY, 1, 51);
-    BRC_CLIP(mfc_context->bit_rate_control_context[SLICE_TYPE_B].QpPrimeY, 1, 51);
+            mfc_context->brc.qp_prime_y[i][SLICE_TYPE_I] = mfc_context->brc.qp_prime_y[i][SLICE_TYPE_P];
+            mfc_context->brc.qp_prime_y[i][SLICE_TYPE_B] = mfc_context->brc.qp_prime_y[i][SLICE_TYPE_I];
+        }
+
+        BRC_CLIP(mfc_context->brc.qp_prime_y[i][SLICE_TYPE_I], min_qp, 51);
+        BRC_CLIP(mfc_context->brc.qp_prime_y[i][SLICE_TYPE_P], min_qp, 51);
+        BRC_CLIP(mfc_context->brc.qp_prime_y[i][SLICE_TYPE_B], min_qp, 51);
+    }
 }
 
 int intel_mfc_update_hrd(struct encode_state *encode_state,
-                         struct gen6_mfc_context *mfc_context,
+                         struct intel_encoder_context *encoder_context,
                          int frame_bits)
 {
-    double prev_bf = mfc_context->hrd.current_buffer_fullness;
+    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
+    int layer_id = encoder_context->layer.curr_frame_layer_id;
+    double prev_bf = mfc_context->hrd.current_buffer_fullness[layer_id];
 
-    mfc_context->hrd.current_buffer_fullness -= frame_bits;
+    mfc_context->hrd.current_buffer_fullness[layer_id] -= frame_bits;
 
-    if (mfc_context->hrd.buffer_size > 0 && mfc_context->hrd.current_buffer_fullness <= 0.) {
-        mfc_context->hrd.current_buffer_fullness = prev_bf;
+    if (mfc_context->hrd.buffer_size[layer_id] > 0 && mfc_context->hrd.current_buffer_fullness[layer_id] <= 0.) {
+        mfc_context->hrd.current_buffer_fullness[layer_id] = prev_bf;
         return BRC_UNDERFLOW;
     }
     
-    mfc_context->hrd.current_buffer_fullness += mfc_context->brc.bits_per_frame;
-    if (mfc_context->hrd.buffer_size > 0 && mfc_context->hrd.current_buffer_fullness > mfc_context->hrd.buffer_size) {
+    mfc_context->hrd.current_buffer_fullness[layer_id] += mfc_context->brc.bits_per_frame[layer_id];
+    if (mfc_context->hrd.buffer_size[layer_id] > 0 && mfc_context->hrd.current_buffer_fullness[layer_id] > mfc_context->hrd.buffer_size[layer_id]) {
         if (mfc_context->brc.mode == VA_RC_VBR)
-            mfc_context->hrd.current_buffer_fullness = mfc_context->hrd.buffer_size;
+            mfc_context->hrd.current_buffer_fullness[layer_id] = mfc_context->hrd.buffer_size[layer_id];
         else {
-            mfc_context->hrd.current_buffer_fullness = prev_bf;
+            mfc_context->hrd.current_buffer_fullness[layer_id] = prev_bf;
             return BRC_OVERFLOW;
         }
     }
     return BRC_NO_HRD_VIOLATION;
 }
 
-int intel_mfc_brc_postpack(struct encode_state *encode_state,
-                           struct gen6_mfc_context *mfc_context,
-                           int frame_bits)
+static int intel_mfc_brc_postpack_cbr(struct encode_state *encode_state,
+                                      struct intel_encoder_context *encoder_context,
+                                      int frame_bits)
 {
+    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
     gen6_brc_status sts = BRC_NO_HRD_VIOLATION;
     VAEncSliceParameterBufferH264 *pSliceParameter = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[0]->buffer; 
     int slicetype = intel_avc_enc_slice_type_fixup(pSliceParameter->slice_type);
-    int qpi = mfc_context->bit_rate_control_context[SLICE_TYPE_I].QpPrimeY;
-    int qpp = mfc_context->bit_rate_control_context[SLICE_TYPE_P].QpPrimeY;
-    int qpb = mfc_context->bit_rate_control_context[SLICE_TYPE_B].QpPrimeY;
+    int curr_frame_layer_id, next_frame_layer_id;
+    int qpi, qpp, qpb;
     int qp; // quantizer of previously encoded slice of current type
     int qpn; // predicted quantizer for next frame of current type in integer format
     double qpf; // predicted quantizer for next frame of current type in float format
     double delta_qp; // QP correction
+    int min_qp = MAX(1, encoder_context->brc.min_qp);
     int target_frame_size, frame_size_next;
     /* Notes:
      *  x - how far we are from HRD buffer borders
@@ -234,13 +245,44 @@ int intel_mfc_brc_postpack(struct encode_state *encode_state,
     double x, y;
     double frame_size_alpha;
 
-    qp = mfc_context->bit_rate_control_context[slicetype].QpPrimeY;
+    if (encoder_context->layer.num_layers < 2 || encoder_context->layer.size_frame_layer_ids == 0) {
+        curr_frame_layer_id = 0;
+        next_frame_layer_id = 0;
+    } else {
+        curr_frame_layer_id = encoder_context->layer.curr_frame_layer_id;
+        next_frame_layer_id = encoder_context->layer.frame_layer_ids[encoder_context->num_frames_in_sequence % encoder_context->layer.size_frame_layer_ids];
+    }
+
+    /* check HRD compliance first */
+    sts = intel_mfc_update_hrd(encode_state, encoder_context, frame_bits);
+
+    if (sts == BRC_NO_HRD_VIOLATION) { // no HRD violation
+        /* nothing */
+    } else {
+        next_frame_layer_id = curr_frame_layer_id;
+    }
+
+    mfc_context->brc.bits_prev_frame[curr_frame_layer_id] = frame_bits;
+    frame_bits = mfc_context->brc.bits_prev_frame[next_frame_layer_id];
+
+    mfc_context->brc.prev_slice_type[curr_frame_layer_id] = slicetype;
+    slicetype = mfc_context->brc.prev_slice_type[next_frame_layer_id];
+
+    /* 0 means the next frame is the first frame of next layer */
+    if (frame_bits == 0)
+        return sts;
+
+    qpi = mfc_context->brc.qp_prime_y[next_frame_layer_id][SLICE_TYPE_I];
+    qpp = mfc_context->brc.qp_prime_y[next_frame_layer_id][SLICE_TYPE_P];
+    qpb = mfc_context->brc.qp_prime_y[next_frame_layer_id][SLICE_TYPE_B];
+
+    qp = mfc_context->brc.qp_prime_y[next_frame_layer_id][slicetype];
 
-    target_frame_size = mfc_context->brc.target_frame_size[slicetype];
-    if (mfc_context->hrd.buffer_capacity < 5)
+    target_frame_size = mfc_context->brc.target_frame_size[next_frame_layer_id][slicetype];
+    if (mfc_context->hrd.buffer_capacity[next_frame_layer_id] < 5)
         frame_size_alpha = 0;
     else
-        frame_size_alpha = (double)mfc_context->brc.gop_nums[slicetype];
+        frame_size_alpha = (double)mfc_context->brc.gop_nums[next_frame_layer_id][slicetype];
     if (frame_size_alpha > 30) frame_size_alpha = 30;
     frame_size_next = target_frame_size + (double)(target_frame_size - frame_bits) /
         (double)(frame_size_alpha + 1.);
@@ -254,13 +296,13 @@ int intel_mfc_brc_postpack(struct encode_state *encode_state,
 
     if (qpn == qp) {
         /* rounding qpf to obtain qpn loses precision: accumulate the rounding error to compensate for it */
-        mfc_context->brc.qpf_rounding_accumulator += qpf - qpn;
-        if (mfc_context->brc.qpf_rounding_accumulator > 1.0) {
+        mfc_context->brc.qpf_rounding_accumulator[next_frame_layer_id] += qpf - qpn;
+        if (mfc_context->brc.qpf_rounding_accumulator[next_frame_layer_id] > 1.0) {
             qpn++;
-            mfc_context->brc.qpf_rounding_accumulator = 0.;
-        } else if (mfc_context->brc.qpf_rounding_accumulator < -1.0) {
+            mfc_context->brc.qpf_rounding_accumulator[next_frame_layer_id] = 0.;
+        } else if (mfc_context->brc.qpf_rounding_accumulator[next_frame_layer_id] < -1.0) {
             qpn--;
-            mfc_context->brc.qpf_rounding_accumulator = 0.;
+            mfc_context->brc.qpf_rounding_accumulator[next_frame_layer_id] = 0.;
         }
     }
     /* making sure that QP is not changing too fast */
@@ -269,18 +311,15 @@ int intel_mfc_brc_postpack(struct encode_state *encode_state,
     /* making sure that the predicted QP does not leave the valid QP range */
     BRC_CLIP(qpn, 1, 51);
 
-    /* checking wthether HRD compliance is still met */
-    sts = intel_mfc_update_hrd(encode_state, mfc_context, frame_bits);
-
     /* calculating QP delta as some function*/
-    x = mfc_context->hrd.target_buffer_fullness - mfc_context->hrd.current_buffer_fullness;
+    x = mfc_context->hrd.target_buffer_fullness[next_frame_layer_id] - mfc_context->hrd.current_buffer_fullness[next_frame_layer_id];
     if (x > 0) {
-        x /= mfc_context->hrd.target_buffer_fullness;
-        y = mfc_context->hrd.current_buffer_fullness;
+        x /= mfc_context->hrd.target_buffer_fullness[next_frame_layer_id];
+        y = mfc_context->hrd.current_buffer_fullness[next_frame_layer_id];
     }
     else {
-        x /= (mfc_context->hrd.buffer_size - mfc_context->hrd.target_buffer_fullness);
-        y = mfc_context->hrd.buffer_size - mfc_context->hrd.current_buffer_fullness;
+        x /= (mfc_context->hrd.buffer_size[next_frame_layer_id] - mfc_context->hrd.target_buffer_fullness[next_frame_layer_id]);
+        y = mfc_context->hrd.buffer_size[next_frame_layer_id] - mfc_context->hrd.current_buffer_fullness[next_frame_layer_id];
     }
     if (y < 0.01) y = 0.01;
     if (x > 1) x = 1;
@@ -290,29 +329,29 @@ int intel_mfc_brc_postpack(struct encode_state *encode_state,
     qpn = (int)(qpn + delta_qp + 0.5);
 
     /* making sure that the predicted QP does not leave the valid QP range */
-    BRC_CLIP(qpn, 1, 51);
+    BRC_CLIP(qpn, min_qp, 51);
 
     if (sts == BRC_NO_HRD_VIOLATION) { // no HRD violation
         /* correcting QPs of slices of other types */
         if (slicetype == SLICE_TYPE_P) {
             if (abs(qpn + BRC_P_B_QP_DIFF - qpb) > 2)
-                mfc_context->bit_rate_control_context[SLICE_TYPE_B].QpPrimeY += (qpn + BRC_P_B_QP_DIFF - qpb) >> 1;
+                mfc_context->brc.qp_prime_y[next_frame_layer_id][SLICE_TYPE_B] += (qpn + BRC_P_B_QP_DIFF - qpb) >> 1;
             if (abs(qpn - BRC_I_P_QP_DIFF - qpi) > 2)
-                mfc_context->bit_rate_control_context[SLICE_TYPE_I].QpPrimeY += (qpn - BRC_I_P_QP_DIFF - qpi) >> 1;
+                mfc_context->brc.qp_prime_y[next_frame_layer_id][SLICE_TYPE_I] += (qpn - BRC_I_P_QP_DIFF - qpi) >> 1;
         } else if (slicetype == SLICE_TYPE_I) {
             if (abs(qpn + BRC_I_B_QP_DIFF - qpb) > 4)
-                mfc_context->bit_rate_control_context[SLICE_TYPE_B].QpPrimeY += (qpn + BRC_I_B_QP_DIFF - qpb) >> 2;
+                mfc_context->brc.qp_prime_y[next_frame_layer_id][SLICE_TYPE_B] += (qpn + BRC_I_B_QP_DIFF - qpb) >> 2;
             if (abs(qpn + BRC_I_P_QP_DIFF - qpp) > 2)
-                mfc_context->bit_rate_control_context[SLICE_TYPE_P].QpPrimeY += (qpn + BRC_I_P_QP_DIFF - qpp) >> 2;
+                mfc_context->brc.qp_prime_y[next_frame_layer_id][SLICE_TYPE_P] += (qpn + BRC_I_P_QP_DIFF - qpp) >> 2;
         } else { // SLICE_TYPE_B
             if (abs(qpn - BRC_P_B_QP_DIFF - qpp) > 2)
-                mfc_context->bit_rate_control_context[SLICE_TYPE_P].QpPrimeY += (qpn - BRC_P_B_QP_DIFF - qpp) >> 1;
+                mfc_context->brc.qp_prime_y[next_frame_layer_id][SLICE_TYPE_P] += (qpn - BRC_P_B_QP_DIFF - qpp) >> 1;
             if (abs(qpn - BRC_I_B_QP_DIFF - qpi) > 4)
-                mfc_context->bit_rate_control_context[SLICE_TYPE_I].QpPrimeY += (qpn - BRC_I_B_QP_DIFF - qpi) >> 2;
+                mfc_context->brc.qp_prime_y[next_frame_layer_id][SLICE_TYPE_I] += (qpn - BRC_I_B_QP_DIFF - qpi) >> 2;
         }
-        BRC_CLIP(mfc_context->bit_rate_control_context[SLICE_TYPE_I].QpPrimeY, 1, 51);
-        BRC_CLIP(mfc_context->bit_rate_control_context[SLICE_TYPE_P].QpPrimeY, 1, 51);
-        BRC_CLIP(mfc_context->bit_rate_control_context[SLICE_TYPE_B].QpPrimeY, 1, 51);
+        BRC_CLIP(mfc_context->brc.qp_prime_y[next_frame_layer_id][SLICE_TYPE_I], min_qp, 51);
+        BRC_CLIP(mfc_context->brc.qp_prime_y[next_frame_layer_id][SLICE_TYPE_P], min_qp, 51);
+        BRC_CLIP(mfc_context->brc.qp_prime_y[next_frame_layer_id][SLICE_TYPE_B], min_qp, 51);
     } else if (sts == BRC_UNDERFLOW) { // underflow
         if (qpn <= qp) qpn = qp + 1;
         if (qpn > 51) {
@@ -321,30 +360,143 @@ int intel_mfc_brc_postpack(struct encode_state *encode_state,
         }
     } else if (sts == BRC_OVERFLOW) {
         if (qpn >= qp) qpn = qp - 1;
-        if (qpn < 1) { // < 0 (?) overflow with minQP
-            qpn = 1;
+        if (qpn < min_qp) { // overflow with minQP
+            qpn = min_qp;
             sts = BRC_OVERFLOW_WITH_MIN_QP; // bit stuffing to be done
         }
     }
 
-    mfc_context->bit_rate_control_context[slicetype].QpPrimeY = qpn;
+    mfc_context->brc.qp_prime_y[next_frame_layer_id][slicetype] = qpn;
 
     return sts;
 }
 
+static int intel_mfc_brc_postpack_vbr(struct encode_state *encode_state,
+                                      struct intel_encoder_context *encoder_context,
+                                      int frame_bits)
+{
+    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
+    gen6_brc_status sts;
+    VAEncSliceParameterBufferH264 *pSliceParameter = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[0]->buffer;
+    int slice_type = intel_avc_enc_slice_type_fixup(pSliceParameter->slice_type);
+    int *qp = mfc_context->brc.qp_prime_y[0];
+    int min_qp = MAX(1, encoder_context->brc.min_qp);
+    int qp_delta, large_frame_adjustment;
+
+    // This implements a simple reactive VBR rate control mode for single-layer H.264.  The primary
+    // aim here is to avoid the problematic behaviour that the CBR rate controller displays on
+    // scene changes, where the QP can get pushed up by a large amount in a short period and
+    // compromise the quality of following frames to a very visible degree.
+    // The main idea, then, is to try to keep the HRD buffering above the target level most of the
+    // time, so that when a large frame is generated (on a scene change or when the stream
+    // complexity increases) we have plenty of slack to be able to encode the more difficult region
+    // without compromising quality immediately on the following frames.   It is optimistic about
+    // the complexity of future frames, so even after generating one or more large frames on a
+    // significant change it will try to keep the QP at its current level until the HRD buffer
+    // bounds force a change to maintain the intended rate.
+
+    sts = intel_mfc_update_hrd(encode_state, encoder_context, frame_bits);
+
+    // This adjustment is applied to increase the QP by more than we normally would if a very
+    // large frame is encountered and we are in danger of running out of slack.
+    large_frame_adjustment = rint(2.0 * log(frame_bits / mfc_context->brc.target_frame_size[0][slice_type]));
+
+    if (sts == BRC_UNDERFLOW) {
+        // The frame is far too big and we don't have the bits available to send it, so it will
+        // have to be re-encoded at a higher QP.
+        qp_delta = +2;
+        if (frame_bits > mfc_context->brc.target_frame_size[0][slice_type])
+            qp_delta += large_frame_adjustment;
+    } else if (sts == BRC_OVERFLOW) {
+        // The frame is very small and we are now overflowing the HRD buffer.  Currently this case
+        // does not occur because we ignore overflow in VBR mode.
+        assert(0 && "Overflow in VBR mode");
+    } else if (frame_bits <= mfc_context->brc.target_frame_size[0][slice_type]) {
+        // The frame is smaller than the average size expected for this frame type.
+        if (mfc_context->hrd.current_buffer_fullness[0] >
+            (mfc_context->hrd.target_buffer_fullness[0] + mfc_context->hrd.buffer_size[0]) / 2.0) {
+            // We currently have lots of bits available, so decrease the QP slightly for the next
+            // frame.
+            qp_delta = -1;
+        } else {
+            // The HRD buffer fullness is increasing, so do nothing.  (We may be under the target
+            // level here, but are moving in the right direction.)
+            qp_delta = 0;
+        }
+    } else {
+        // The frame is larger than the average size expected for this frame type.
+        if (mfc_context->hrd.current_buffer_fullness[0] > mfc_context->hrd.target_buffer_fullness[0]) {
+            // We are currently over the target level, so do nothing.
+            qp_delta = 0;
+        } else if (mfc_context->hrd.current_buffer_fullness[0] > mfc_context->hrd.target_buffer_fullness[0] / 2.0) {
+            // We are under the target level, but not critically.  Increase the QP by one step if
+            // continuing like this would underflow soon (currently within one second).
+            if (mfc_context->hrd.current_buffer_fullness[0] /
+                (double)(frame_bits - mfc_context->brc.target_frame_size[0][slice_type] + 1) <
+                ((double)encoder_context->brc.framerate[0].num / (double)encoder_context->brc.framerate[0].den))
+                qp_delta = +1;
+            else
+                qp_delta = 0;
+        } else {
+            // We are a long way under the target level.  Always increase the QP, possibly by a
+            // larger amount dependent on how big the frame we just made actually was.
+            qp_delta = +1 + large_frame_adjustment;
+        }
+    }
+
+    switch (slice_type) {
+    case SLICE_TYPE_I:
+        qp[SLICE_TYPE_I] += qp_delta;
+        qp[SLICE_TYPE_P]  = qp[SLICE_TYPE_I] + BRC_I_P_QP_DIFF;
+        qp[SLICE_TYPE_B]  = qp[SLICE_TYPE_I] + BRC_I_B_QP_DIFF;
+        break;
+    case SLICE_TYPE_P:
+        qp[SLICE_TYPE_P] += qp_delta;
+        qp[SLICE_TYPE_I]  = qp[SLICE_TYPE_P] - BRC_I_P_QP_DIFF;
+        qp[SLICE_TYPE_B]  = qp[SLICE_TYPE_P] + BRC_P_B_QP_DIFF;
+        break;
+    case SLICE_TYPE_B:
+        qp[SLICE_TYPE_B] += qp_delta;
+        qp[SLICE_TYPE_I]  = qp[SLICE_TYPE_B] - BRC_I_B_QP_DIFF;
+        qp[SLICE_TYPE_P]  = qp[SLICE_TYPE_B] - BRC_P_B_QP_DIFF;
+        break;
+    }
+    BRC_CLIP(mfc_context->brc.qp_prime_y[0][SLICE_TYPE_I], min_qp, 51);
+    BRC_CLIP(mfc_context->brc.qp_prime_y[0][SLICE_TYPE_P], min_qp, 51);
+    BRC_CLIP(mfc_context->brc.qp_prime_y[0][SLICE_TYPE_B], min_qp, 51);
+
+    if (sts == BRC_UNDERFLOW && qp[slice_type] == 51)
+        sts = BRC_UNDERFLOW_WITH_MAX_QP;
+    if (sts == BRC_OVERFLOW && qp[slice_type] == min_qp)
+        sts = BRC_OVERFLOW_WITH_MIN_QP;
+
+    return sts;
+}
+
+int intel_mfc_brc_postpack(struct encode_state *encode_state,
+                           struct intel_encoder_context *encoder_context,
+                           int frame_bits)
+{
+    switch (encoder_context->rate_control_mode) {
+    case VA_RC_CBR:
+        return intel_mfc_brc_postpack_cbr(encode_state, encoder_context, frame_bits);
+    case VA_RC_VBR:
+        return intel_mfc_brc_postpack_vbr(encode_state, encoder_context, frame_bits);
+    }
+    assert(0 && "Invalid RC mode");
+}
+
 static void intel_mfc_hrd_context_init(struct encode_state *encode_state,
                                        struct intel_encoder_context *encoder_context)
 {
     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
-    VAEncSequenceParameterBufferH264 *pSequenceParameter = (VAEncSequenceParameterBufferH264 *)encode_state->seq_param_ext->buffer;
     unsigned int rate_control_mode = encoder_context->rate_control_mode;
-    int target_bit_rate = pSequenceParameter->bits_per_second;
+    int target_bit_rate = encoder_context->brc.bits_per_second[encoder_context->layer.num_layers - 1];
     
     // currently we only support CBR mode.
     if (rate_control_mode == VA_RC_CBR) {
         mfc_context->vui_hrd.i_bit_rate_value = target_bit_rate >> 10;
-        mfc_context->vui_hrd.i_cpb_size_value = (target_bit_rate * 8) >> 10;
-        mfc_context->vui_hrd.i_initial_cpb_removal_delay = mfc_context->vui_hrd.i_cpb_size_value * 0.5 * 1024 / target_bit_rate * 90000;
+        mfc_context->vui_hrd.i_initial_cpb_removal_delay = ((target_bit_rate * 8) >> 10) * 0.5 * 1024 / target_bit_rate * 90000;
         mfc_context->vui_hrd.i_cpb_removal_delay = 2;
         mfc_context->vui_hrd.i_frame_number = 0;
 
@@ -384,127 +536,28 @@ int intel_mfc_interlace_check(VADriverContextP ctx,
     return 1;
 }
 
-/*
- * Check whether the parameters related with CBR are updated and decide whether
- * it needs to reinitialize the configuration related with CBR.
- * Currently it will check the following parameters:
- *      bits_per_second
- *      frame_rate
- *      gop_configuration(intra_period, ip_period, intra_idr_period)
- */
-static bool intel_mfc_brc_updated_check(struct encode_state *encode_state,
-                           struct intel_encoder_context *encoder_context)
-{
-    unsigned int rate_control_mode = encoder_context->rate_control_mode;
-    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
-    double cur_fps, cur_bitrate;
-    VAEncSequenceParameterBufferH264 *pSequenceParameter;
-
-
-    if (rate_control_mode != VA_RC_CBR) {
-        return false;
-    }
-
-    pSequenceParameter = (VAEncSequenceParameterBufferH264 *)encode_state->seq_param_ext->buffer;
-
-    cur_bitrate = pSequenceParameter->bits_per_second;
-    cur_fps = (double)pSequenceParameter->time_scale /
-                (2 * (double)pSequenceParameter->num_units_in_tick);
-
-    if ((cur_bitrate == mfc_context->brc.saved_bps) &&
-        (cur_fps == mfc_context->brc.saved_fps) &&
-        (pSequenceParameter->intra_period == mfc_context->brc.saved_intra_period) &&
-        (pSequenceParameter->intra_idr_period == mfc_context->brc.saved_idr_period) &&
-        (pSequenceParameter->intra_period == mfc_context->brc.saved_intra_period)) {
-        /* the parameters related with CBR are not updaetd */
-        return false;
-    }
-
-    mfc_context->brc.saved_ip_period = pSequenceParameter->ip_period;
-    mfc_context->brc.saved_intra_period = pSequenceParameter->intra_period;
-    mfc_context->brc.saved_idr_period = pSequenceParameter->intra_idr_period;
-    mfc_context->brc.saved_fps = cur_fps;
-    mfc_context->brc.saved_bps = cur_bitrate;
-    return true;
-}
-
 void intel_mfc_brc_prepare(struct encode_state *encode_state,
                            struct intel_encoder_context *encoder_context)
 {
     unsigned int rate_control_mode = encoder_context->rate_control_mode;
-    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
-
-    if (rate_control_mode == VA_RC_CBR) {
-        bool brc_updated;
-        assert(encoder_context->codec != CODEC_MPEG2);
 
-        brc_updated = intel_mfc_brc_updated_check(encode_state, encoder_context);
+    if (encoder_context->codec != CODEC_H264 &&
+        encoder_context->codec != CODEC_H264_MVC)
+        return;
 
+    if (rate_control_mode != VA_RC_CQP) {
         /*Programing bit rate control */
-        if ((mfc_context->bit_rate_control_context[SLICE_TYPE_I].MaxSizeInWord == 0) ||
-             brc_updated) {
-            intel_mfc_bit_rate_control_context_init(encode_state, mfc_context);
+        if (encoder_context->brc.need_reset) {
+            intel_mfc_bit_rate_control_context_init(encode_state, encoder_context);
             intel_mfc_brc_init(encode_state, encoder_context);
         }
 
         /*Programing HRD control */
-        if ((mfc_context->vui_hrd.i_cpb_size_value == 0) || brc_updated )
+        if (encoder_context->brc.need_reset)
             intel_mfc_hrd_context_init(encode_state, encoder_context);    
     }
 }
 
-static int intel_avc_find_skipemulcnt(unsigned char *buf, int bits_length)
-{
-    int i, found;
-    int leading_zero_cnt, byte_length, zero_byte;
-    int nal_unit_type;
-    int skip_cnt = 0;
-
-#define NAL_UNIT_TYPE_MASK 0x1f
-#define HW_MAX_SKIP_LENGTH 15
-
-    byte_length = ALIGN(bits_length, 32) >> 3;
-
-
-    leading_zero_cnt = 0;
-    found = 0;
-    for(i = 0; i < byte_length - 4; i++) {
-        if (((buf[i] == 0) && (buf[i + 1] == 0) && (buf[i + 2] == 1)) ||
-            ((buf[i] == 0) && (buf[i + 1] == 0) && (buf[i + 2] == 0) && (buf[i + 3] == 1))) {
-                found = 1;
-                break;
-            }
-        leading_zero_cnt++;
-    }
-    if (!found) {
-        /* warning message is complained. But anyway it will be inserted. */
-        WARN_ONCE("Invalid packed header data. "
-                   "Can't find the 000001 start_prefix code\n");
-        return 0;
-    }
-    i = leading_zero_cnt;
-
-    zero_byte = 0;
-    if (!((buf[i] == 0) && (buf[i + 1] == 0) && (buf[i + 2] == 1)))
-        zero_byte = 1;
-
-    skip_cnt = leading_zero_cnt + zero_byte + 3;
-
-    /* the unit header byte is accounted */
-    nal_unit_type = (buf[skip_cnt]) & NAL_UNIT_TYPE_MASK;
-    skip_cnt += 1;
-
-    if (nal_unit_type == 14 || nal_unit_type == 20 || nal_unit_type == 21) {
-        /* more unit header bytes are accounted for MVC/SVC */
-        skip_cnt += 3;
-    }
-    if (skip_cnt > HW_MAX_SKIP_LENGTH) {
-        WARN_ONCE("Too many leading zeros are padded for packed data. "
-                   "It is beyond the HW range.!!!\n");
-    }
-    return skip_cnt;
-}
-
 void intel_mfc_avc_pipeline_header_programing(VADriverContextP ctx,
                                               struct encode_state *encode_state,
                                               struct intel_encoder_context *encoder_context,
@@ -663,6 +716,7 @@ VAStatus intel_mfc_avc_prepare(VADriverContextP ctx,
 
     if ( obj_surface->private_data == NULL) {
         gen6_avc_surface = calloc(sizeof(GenAvcSurface), 1);
+        assert(gen6_avc_surface);
         gen6_avc_surface->dmv_top = 
             dri_bo_alloc(i965->intel.bufmgr,
                          "Buffer",
@@ -709,6 +763,7 @@ VAStatus intel_mfc_avc_prepare(VADriverContextP ctx,
             if ( obj_surface->private_data == NULL) {
                 
                 gen6_avc_surface = calloc(sizeof(GenAvcSurface), 1);
+                assert(gen6_avc_surface);
                 gen6_avc_surface->dmv_top = 
                     dri_bo_alloc(i965->intel.bufmgr,
                                  "Buffer",
@@ -812,6 +867,7 @@ int intel_format_lutvalue(int value, int max)
 
 
 #define                QP_MAX                  52
+#define                VP8_QP_MAX              128
 
 
 static float intel_lambda_qp(int qp)
@@ -825,32 +881,21 @@ static float intel_lambda_qp(int qp)
     return lambdaf;
 }
 
-
-void intel_vme_update_mbmv_cost(VADriverContextP ctx,
-                                struct encode_state *encode_state,
-                                struct intel_encoder_context *encoder_context)
+static
+void intel_h264_calc_mbmvcost_qp(int qp,
+                                 int slice_type,
+                                 uint8_t *vme_state_message)
 {
-    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
-    struct gen6_vme_context *vme_context = encoder_context->vme_context;
-    VAEncPictureParameterBufferH264 *pic_param = (VAEncPictureParameterBufferH264 *)encode_state->pic_param_ext->buffer;
-    VAEncSliceParameterBufferH264 *slice_param = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[0]->buffer;
-    int qp, m_cost, j, mv_count;
-    uint8_t *vme_state_message = (uint8_t *)(vme_context->vme_state_message);
+    int m_cost, j, mv_count;
     float   lambda, m_costf;
 
-    int slice_type = intel_avc_enc_slice_type_fixup(slice_param->slice_type);
-
-    
-    if (encoder_context->rate_control_mode == VA_RC_CQP)
-        qp = pic_param->pic_init_qp + slice_param->slice_qp_delta;
-    else
-        qp = mfc_context->bit_rate_control_context[slice_type].QpPrimeY;
-
-    if (vme_state_message == NULL)
-        return;
-
     assert(qp <= QP_MAX); 
     lambda = intel_lambda_qp(qp);
+
+    m_cost = lambda;
+    vme_state_message[MODE_CHROMA_INTRA] = 0;
+    vme_state_message[MODE_REFID_COST] = intel_format_lutvalue(m_cost, 0x8f);
+
     if (slice_type == SLICE_TYPE_I) {
         vme_state_message[MODE_INTRA_16X16] = 0;
         m_cost = lambda * 4;
@@ -936,6 +981,31 @@ void intel_vme_update_mbmv_cost(VADriverContextP ctx,
             vme_state_message[MODE_INTER_BWD] = intel_format_lutvalue(m_cost, 0x6f);
         }
     }
+    return;
+}
+
+/*
+ * Refresh the VME mode/MV cost values in vme_context->vme_state_message
+ * for the QP and slice type of the first slice of the current picture.
+ * Nothing is written when no state message buffer is present.
+ */
+void intel_vme_update_mbmv_cost(VADriverContextP ctx,
+                                struct encode_state *encode_state,
+                                struct intel_encoder_context *encoder_context)
+{
+    struct gen6_vme_context *vme = encoder_context->vme_context;
+    struct gen6_mfc_context *mfc = encoder_context->mfc_context;
+    VAEncPictureParameterBufferH264 *picture = (VAEncPictureParameterBufferH264 *)encode_state->pic_param_ext->buffer;
+    VAEncSliceParameterBufferH264 *slice = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[0]->buffer;
+    uint8_t *cost_lut = (uint8_t *)(vme->vme_state_message);
+    int slice_type = intel_avc_enc_slice_type_fixup(slice->slice_type);
+    int qp;
+
+    /* Outside CQP the QP comes from the rate control state of the current
+     * layer; in CQP it is taken from the picture/slice parameters. */
+    if (encoder_context->rate_control_mode != VA_RC_CQP)
+        qp = mfc->brc.qp_prime_y[encoder_context->layer.curr_frame_layer_id][slice_type];
+    else
+        qp = picture->pic_init_qp + slice->slice_qp_delta;
+
+    if (cost_lut != NULL)
+        intel_h264_calc_mbmvcost_qp(qp, slice_type, cost_lut);
+}
 
 void intel_vme_vp8_update_mbmv_cost(VADriverContextP ctx,
@@ -951,15 +1021,27 @@ void intel_vme_vp8_update_mbmv_cost(VADriverContextP ctx,
     float   lambda, m_costf;
 
     int is_key_frame = !pic_param->pic_flags.bits.frame_type;
+    int slice_type = (is_key_frame ? SLICE_TYPE_I : SLICE_TYPE_P);
   
     if (vme_state_message == NULL)
        return;
  
-    lambda = intel_lambda_qp(q_matrix->quantization_index[0] >> 1);
+    if (encoder_context->rate_control_mode == VA_RC_CQP)
+        qp = q_matrix->quantization_index[0];
+    else
+        qp = mfc_context->brc.qp_prime_y[encoder_context->layer.curr_frame_layer_id][slice_type];
+
+    lambda = intel_lambda_qp(qp * QP_MAX / VP8_QP_MAX);
+
+    m_cost = lambda;
+    vme_state_message[MODE_CHROMA_INTRA] = intel_format_lutvalue(m_cost, 0x8f);
+
     if (is_key_frame) {
        vme_state_message[MODE_INTRA_16X16] = 0;
        m_cost = lambda * 16; 
        vme_state_message[MODE_INTRA_4X4] = intel_format_lutvalue(m_cost, 0x8f);
+       m_cost = lambda * 3;
+       vme_state_message[MODE_INTRA_NONPRED] = intel_format_lutvalue(m_cost, 0x6f);
     } else {
        m_cost = 0;
        vme_state_message[MODE_INTER_MV0] = intel_format_lutvalue(m_cost, 0x6f);
@@ -976,7 +1058,7 @@ void intel_vme_vp8_update_mbmv_cost(VADriverContextP ctx,
             mv_count++;
        }
 
-       if (q_matrix->quantization_index[0] < 32 ) {
+       if (qp < 92 ) {
             vme_state_message[MODE_INTRA_16X16] = 0x4a;
             vme_state_message[MODE_INTRA_4X4] = 0x4a;
             vme_state_message[MODE_INTRA_NONPRED] = 0x4a;
@@ -984,6 +1066,7 @@ void intel_vme_vp8_update_mbmv_cost(VADriverContextP ctx,
             vme_state_message[MODE_INTER_16X8] = 0x4a;
             vme_state_message[MODE_INTER_8X8] = 0x4a;
             vme_state_message[MODE_INTER_4X4] = 0x4a;
+            vme_state_message[MODE_INTER_BWD] = 0;
             return;
        }
        m_costf = lambda * 10;
@@ -991,6 +1074,10 @@ void intel_vme_vp8_update_mbmv_cost(VADriverContextP ctx,
        m_cost = lambda * 24; 
        vme_state_message[MODE_INTRA_4X4] = intel_format_lutvalue(m_cost, 0x8f);
             
+        m_costf = lambda * 3.5;
+        m_cost = m_costf;
+        vme_state_message[MODE_INTRA_NONPRED] = intel_format_lutvalue(m_cost, 0x6f);
+
         m_costf = lambda * 2.5;
         m_cost = m_costf;
         vme_state_message[MODE_INTER_16X16] = intel_format_lutvalue(m_cost, 0x8f);
@@ -1062,6 +1149,16 @@ gen7_vme_walker_fill_vme_batchbuffer(VADriverContextP ctx,
     int mb_row;
     int s;
     unsigned int *command_ptr;
+    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
+    VAEncPictureParameterBufferH264 *pic_param = (VAEncPictureParameterBufferH264 *)encode_state->pic_param_ext->buffer;
+    VAEncSliceParameterBufferH264 *slice_param = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[0]->buffer;
+    int qp,qp_mb,qp_index;
+    int slice_type = intel_avc_enc_slice_type_fixup(slice_param->slice_type);
+
+    if (encoder_context->rate_control_mode == VA_RC_CQP)
+        qp = pic_param->pic_init_qp + slice_param->slice_qp_delta;
+    else
+        qp = mfc_context->brc.qp_prime_y[encoder_context->layer.curr_frame_layer_id][slice_type];
 
 #define                USE_SCOREBOARD          (1 << 21)
  
@@ -1101,7 +1198,7 @@ gen7_vme_walker_fill_vme_batchbuffer(VADriverContextP ctx,
                     }
                 }
 
-                *command_ptr++ = (CMD_MEDIA_OBJECT | (8 - 2));
+                *command_ptr++ = (CMD_MEDIA_OBJECT | (9 - 2));
                 *command_ptr++ = kernel;
                 *command_ptr++ = USE_SCOREBOARD;
                 /* Indirect data */
@@ -1112,6 +1209,13 @@ gen7_vme_walker_fill_vme_batchbuffer(VADriverContextP ctx,
                 /*inline data */
                 *command_ptr++ = (mb_width << 16 | y_inner << 8 | x_inner);
                 *command_ptr++ = ((1 << 18) | (1 << 16) | transform_8x8_mode_flag | (mb_intra_ub << 8));
+                /* QP occupies one byte */
+                if (vme_context->roi_enabled) {
+                    qp_index = y_inner * mb_width + x_inner;
+                    qp_mb = *(vme_context->qp_per_mb + qp_index);
+                } else
+                    qp_mb = qp;
+                *command_ptr++ = qp_mb;
                 x_inner -= 2;
                 y_inner += 1;
             }
@@ -1145,7 +1249,7 @@ gen7_vme_walker_fill_vme_batchbuffer(VADriverContextP ctx,
                     }
                 }
 
-                *command_ptr++ = (CMD_MEDIA_OBJECT | (8 - 2));
+                *command_ptr++ = (CMD_MEDIA_OBJECT | (9 - 2));
                 *command_ptr++ = kernel;
                 *command_ptr++ = USE_SCOREBOARD;
                 /* Indirect data */
@@ -1156,6 +1260,13 @@ gen7_vme_walker_fill_vme_batchbuffer(VADriverContextP ctx,
                 /*inline data */
                 *command_ptr++ = (mb_width << 16 | y_inner << 8 | x_inner);
                 *command_ptr++ = ((1 << 18) | (1 << 16) | transform_8x8_mode_flag | (mb_intra_ub << 8));
+                /* qp occupies one byte */
+                if (vme_context->roi_enabled) {
+                    qp_index = y_inner * mb_width + x_inner;
+                    qp_mb = *(vme_context->qp_per_mb + qp_index);
+                } else
+                    qp_mb = qp;
+                *command_ptr++ = qp_mb;
 
                 x_inner -= 2;
                 y_inner += 1;
@@ -1688,6 +1799,350 @@ void intel_avc_slice_insert_packed_data(VADriverContextP ctx,
     return;
 }
 
+/*
+ * Build, once per slice type, a buffer object holding the mode/MV cost
+ * entries for every QP in [0, QP_MAX).  The slice type is taken from the
+ * first slice parameter buffer; if the table for that type already exists
+ * the function returns without doing anything.
+ */
+void
+intel_h264_initialize_mbmv_cost(VADriverContextP ctx,
+                                struct encode_state *encode_state,
+                                struct intel_encoder_context *encoder_context)
+{
+    struct i965_driver_data *i965 = i965_driver_data(ctx);
+    struct gen6_vme_context *vme_context = encoder_context->vme_context;
+    VAEncSliceParameterBufferH264 *slice_param = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[0]->buffer;
+    int slice_type = intel_avc_enc_slice_type_fixup(slice_param->slice_type);
+    dri_bo **table_slot;
+    dri_bo *table_bo;
+    uint8_t *table_base;
+    int qp;
+
+    /* Pick the table that belongs to this slice type (B is the fallback). */
+    if (slice_type == SLICE_TYPE_I)
+        table_slot = &vme_context->i_qp_cost_table;
+    else if (slice_type == SLICE_TYPE_P)
+        table_slot = &vme_context->p_qp_cost_table;
+    else
+        table_slot = &vme_context->b_qp_cost_table;
+
+    /* Already built for this slice type. */
+    if (*table_slot)
+        return;
+
+    /* It is enough to allocate 32 bytes for each qp. */
+    table_bo = dri_bo_alloc(i965->intel.bufmgr,
+                            "cost_table ",
+                            QP_MAX * 32,
+                            64);
+
+    dri_bo_map(table_bo, 1);
+    assert(table_bo->virtual);
+    table_base = (uint8_t *)(table_bo->virtual);
+
+    /* One 32-byte entry per QP, filled in ascending QP order. */
+    for (qp = 0; qp < QP_MAX; qp++)
+        intel_h264_calc_mbmvcost_qp(qp, slice_type, table_base + qp * 32);
+
+    dri_bo_unmap(table_bo);
+
+    *table_slot = table_bo;
+    vme_context->cost_table_size = QP_MAX * 32;
+}
+
+/*
+ * Bind the per-QP cost table of the current slice type (taken from the
+ * first slice parameter buffer) as a buffer surface, using the VME
+ * context's buffer surface setup hook with the given binding table and
+ * surface state offsets.
+ */
+extern void
+intel_h264_setup_cost_surface(VADriverContextP ctx,
+                              struct encode_state *encode_state,
+                              struct intel_encoder_context *encoder_context,
+                              unsigned long binding_table_offset,
+                              unsigned long surface_state_offset)
+{
+    struct gen6_vme_context *vme_context = encoder_context->vme_context;
+    VAEncSliceParameterBufferH264 *slice_param = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[0]->buffer;
+    int slice_type = intel_avc_enc_slice_type_fixup(slice_param->slice_type);
+    struct i965_buffer_surface cost_table;
+
+    /* Any slice type other than I or P uses the B table. */
+    if (slice_type == SLICE_TYPE_P)
+        cost_table.bo = vme_context->p_qp_cost_table;
+    else if (slice_type == SLICE_TYPE_I)
+        cost_table.bo = vme_context->i_qp_cost_table;
+    else
+        cost_table.bo = vme_context->b_qp_cost_table;
+
+    /* QP_MAX blocks of 32 bytes each. */
+    cost_table.size_block = 32;
+    cost_table.pitch = 16;
+    cost_table.num_blocks = QP_MAX;
+
+    vme_context->vme_buffer_suface_setup(ctx,
+                                         &vme_context->gpe_context,
+                                         &cost_table,
+                                         binding_table_offset,
+                                         surface_state_offset);
+}
+
+/*
+ * Convert an H.264 QP into a linearized quantizer step (qstep).
+ *
+ * The conversion follows the scaling process of the transform coefficients
+ * for the luma component in the H.264 spec:
+ *   2^(Qpy / 6 - 6)
+ * To avoid a too small qstep the result is multiplied by 16, so the value
+ * returned here is 2^(qp / 6 - 2), with the division done in floating point.
+ */
+static float intel_h264_qp_qstep(int qp)
+{
+    float value, qstep;
+    value = qp; /* switch to float so the division below is not truncated */
+    value = value / 6 - 2; /* exponent: -6 from the formula, +4 for the x16 */
+    qstep = powf(2, value);
+    return qstep;
+}
+/*
+ * Convert a linearized quantizer step back into a QP: returns
+ * floor(12 + 6 * log2(qstep)), i.e. the inverse of qstep = 2^(qp / 6 - 2)
+ * rounded down to an integer.
+ *
+ * NOTE(review): for qstep <= 0 or a non-finite qstep, log2f() produces
+ * -inf/NaN and the conversion of the result to int is not defined;
+ * callers presumably pass a positive finite value - confirm.
+ */
+static int intel_h264_qstep_qp(float qstep)
+{
+    float qp;
+
+    qp = 12.0f + 6.0f * log2f(qstep);
+
+    return floorf(qp); /* implicit float -> int conversion of the floor */
+}
+
+/*
+ * The ROI QP assignment for CBR is currently based on the following
+ * assumption:
+ * SUM(roi_area * 1 / roi_qstep) + nonroi_area * 1 / nonroi_qstep =
+ *                                total_area * 1 / baseqp_qstep
+ *
+ * where qstep is the linearized form of the H.264 quantizer.
+ *
+ * ROIRegionParam describes one ROI rectangle in macroblock units together
+ * with the QP selected for it.
+ */
+typedef struct {
+    int row_start_in_mb; /* first macroblock row of the region */
+    int row_end_in_mb; /* one past the last macroblock row */
+    int col_start_in_mb; /* first macroblock column of the region */
+    int col_end_in_mb; /* one past the last macroblock column */
+
+    int width_mbs; /* col_end_in_mb - col_start_in_mb */
+    int height_mbs; /* row_end_in_mb - row_start_in_mb */
+
+    int roi_qp; /* QP applied to every macroblock of the region */
+} ROIRegionParam;
+
+/*
+ * Fill vme_context->qp_per_mb for a CBR-encoded picture with ROI regions.
+ *
+ * Every ROI gets base_qp plus its user supplied delta, clipped to
+ * [min_qp, 51].  The QP of the remaining (non-ROI) macroblocks is derived
+ * from the bit budget assumption
+ *   SUM(roi_area / roi_qstep) + nonroi_area / nonroi_qstep =
+ *                                total_area / base_qstep
+ * and is clipped to the same range.  When base_qp <= 12 the whole picture
+ * simply uses base_qp.
+ *
+ * ROI rectangles are clamped to the picture and the number of regions is
+ * limited to I965_MAX_NUM_ROI_REGIONS so that neither the local region
+ * array nor the qp_per_mb buffer (one byte per macroblock) is overrun.
+ *
+ * ctx is unused.  base_qp is the picture level QP chosen by rate control.
+ * Returns VA_STATUS_SUCCESS, or VA_STATUS_ERROR_INVALID_PARAMETER (through
+ * ASSERT_RET) when the ROI values are not QP deltas.
+ */
+static VAStatus
+intel_h264_enc_roi_cbr(VADriverContextP ctx,
+                       int base_qp,
+                       struct encode_state *encode_state,
+                       struct intel_encoder_context *encoder_context)
+{
+    int nonroi_qp;
+    int min_qp = MAX(1, encoder_context->brc.min_qp);
+    bool quickfill = 0;
+
+    ROIRegionParam param_regions[I965_MAX_NUM_ROI_REGIONS];
+    int num_roi = 0;
+    int i, j;
+
+    float temp;
+    float qstep_nonroi, qstep_base;
+    float roi_area, total_area, nonroi_area;
+    float sum_roi;
+
+    VAEncSequenceParameterBufferH264 *pSequenceParameter = (VAEncSequenceParameterBufferH264 *)encode_state->seq_param_ext->buffer;
+    int width_in_mbs = pSequenceParameter->picture_width_in_mbs;
+    int height_in_mbs = pSequenceParameter->picture_height_in_mbs;
+    int mbs_in_picture = width_in_mbs * height_in_mbs;
+
+    struct gen6_vme_context *vme_context = encoder_context->vme_context;
+    VAStatus vaStatus = VA_STATUS_SUCCESS;
+
+    /* currently roi_value_is_qp_delta is the only supported mode of priority.
+     *
+     * qp_delta set by the user is added to base_qp and the sum is then
+     * clipped to [min_qp, 51].
+     */
+    ASSERT_RET(encoder_context->brc.roi_value_is_qp_delta, VA_STATUS_ERROR_INVALID_PARAMETER);
+
+    num_roi = encoder_context->brc.num_roi;
+    /* never index past the local region array */
+    if (num_roi > I965_MAX_NUM_ROI_REGIONS)
+        num_roi = I965_MAX_NUM_ROI_REGIONS;
+
+    /* when the base_qp is lower than 12, the quality is quite good based
+     * on the H264 test experience.
+     * In such case it is unnecessary to adjust the quality for ROI region.
+     */
+    if (base_qp <= 12) {
+        nonroi_qp = base_qp;
+        quickfill = 1;
+        goto qp_fill;
+    }
+
+    sum_roi = 0.0f;
+    roi_area = 0;
+    for (i = 0; i < num_roi; i++) {
+        int row_start, row_end, col_start, col_end;
+        int roi_width_mbs, roi_height_mbs;
+        int mbs_in_roi;
+        int roi_qp;
+        float qstep_roi;
+
+        /* pixel rectangle -> macroblock rectangle (end is exclusive) */
+        col_start = encoder_context->brc.roi[i].left / 16;
+        col_end = (encoder_context->brc.roi[i].right + 15) / 16;
+        row_start = encoder_context->brc.roi[i].top / 16;
+        row_end = (encoder_context->brc.roi[i].bottom + 15) / 16;
+
+        /* keep the rectangle inside the picture and non-inverted, otherwise
+         * the fill below would write outside qp_per_mb */
+        BRC_CLIP(col_start, 0, width_in_mbs);
+        BRC_CLIP(col_end, col_start, width_in_mbs);
+        BRC_CLIP(row_start, 0, height_in_mbs);
+        BRC_CLIP(row_end, row_start, height_in_mbs);
+
+        roi_width_mbs = col_end - col_start;
+        roi_height_mbs = row_end - row_start;
+        mbs_in_roi = roi_width_mbs * roi_height_mbs;
+
+        param_regions[i].row_start_in_mb = row_start;
+        param_regions[i].row_end_in_mb = row_end;
+        param_regions[i].col_start_in_mb = col_start;
+        param_regions[i].col_end_in_mb = col_end;
+        param_regions[i].width_mbs = roi_width_mbs;
+        param_regions[i].height_mbs = roi_height_mbs;
+
+        roi_qp = base_qp + encoder_context->brc.roi[i].value;
+        BRC_CLIP(roi_qp, min_qp, 51);
+
+        param_regions[i].roi_qp = roi_qp;
+        qstep_roi = intel_h264_qp_qstep(roi_qp);
+
+        roi_area += mbs_in_roi;
+        sum_roi += mbs_in_roi / qstep_roi;
+    }
+
+    total_area = mbs_in_picture;
+    nonroi_area = total_area - roi_area;
+
+    qstep_base = intel_h264_qp_qstep(base_qp);
+    temp = (total_area / qstep_base - sum_roi);
+
+    if (temp <= 0) {
+        /* the ROIs already use up the whole budget: coarsest QP.  Testing
+         * <= (not <) also avoids the division by zero below. */
+        nonroi_qp = 51;
+    } else if (nonroi_area <= 0) {
+        /* no non-ROI area left (ROIs cover the picture or overlap): the
+         * formula degenerates to qstep <= 0, whose limit is the lowest QP.
+         * Avoid feeding a non-positive qstep to log2f(). */
+        nonroi_qp = min_qp;
+    } else {
+        qstep_nonroi = nonroi_area / temp;
+        nonroi_qp = intel_h264_qstep_qp(qstep_nonroi);
+    }
+
+    BRC_CLIP(nonroi_qp, min_qp, 51);
+
+qp_fill:
+    /* background first, then every ROI on top in array order */
+    memset(vme_context->qp_per_mb, nonroi_qp, mbs_in_picture);
+    if (!quickfill) {
+        char *qp_ptr;
+
+        for (i = 0; i < num_roi; i++) {
+            for (j = param_regions[i].row_start_in_mb; j < param_regions[i].row_end_in_mb; j++) {
+                qp_ptr = vme_context->qp_per_mb + (j * width_in_mbs) + param_regions[i].col_start_in_mb;
+                memset(qp_ptr, param_regions[i].roi_qp, param_regions[i].width_mbs);
+            }
+        }
+    }
+    return vaStatus;
+}
+
+/*
+ * Decide whether per-macroblock ROI QP is used for the current picture and,
+ * if so, fill vme_context->qp_per_mb (one byte per macroblock).
+ *
+ * ROI is disabled when the context has no ROI support, when the picture has
+ * more than one slice, when no ROI region is configured, for rate control
+ * modes other than CBR and CQP, and when the CBR QP computation fails.
+ *   - CBR: the map is computed by intel_h264_enc_roi_cbr() from the QP
+ *     selected by rate control for the current layer and slice type.
+ *   - CQP: the map is set to the slice QP, then every ROI rectangle is
+ *     overwritten with slice QP + ROI delta, clipped to [min_qp, 51].
+ * On GEN7 an enabled ROI forces the software batch path.
+ */
+extern void
+intel_h264_enc_roi_config(VADriverContextP ctx,
+                          struct encode_state *encode_state,
+                          struct intel_encoder_context *encoder_context)
+{
+    char *qp_ptr;
+    int i, j;
+    struct i965_driver_data *i965 = i965_driver_data(ctx);
+    struct gen6_vme_context *vme_context = encoder_context->vme_context;
+    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
+    VAEncSequenceParameterBufferH264 *pSequenceParameter = (VAEncSequenceParameterBufferH264 *)encode_state->seq_param_ext->buffer;
+    int width_in_mbs = pSequenceParameter->picture_width_in_mbs;
+    int height_in_mbs = pSequenceParameter->picture_height_in_mbs;
+
+    int row_start, row_end, col_start, col_end;
+    int num_roi = 0;
+
+    vme_context->roi_enabled = 0;
+    /* Restriction: Disable ROI when multi-slice is enabled */
+    if (!encoder_context->context_roi || (encode_state->num_slice_params_ext > 1))
+        return;
+
+    vme_context->roi_enabled = !!encoder_context->brc.num_roi;
+
+    if (!vme_context->roi_enabled)
+        return;
+
+    /* the CQP loop below needs the real region count */
+    num_roi = encoder_context->brc.num_roi;
+
+    /* (re)allocate the QP map whenever the picture size in MBs changes */
+    if ((vme_context->saved_width_mbs !=  width_in_mbs) ||
+        (vme_context->saved_height_mbs != height_in_mbs)) {
+        free(vme_context->qp_per_mb);
+        vme_context->qp_per_mb = calloc(1, width_in_mbs * height_in_mbs);
+
+        vme_context->saved_width_mbs = width_in_mbs;
+        vme_context->saved_height_mbs = height_in_mbs;
+        assert(vme_context->qp_per_mb);
+    }
+    if (encoder_context->rate_control_mode == VA_RC_CBR) {
+        /*
+         * TODO: More complex Qp adjust needs to be added.
+         * Currently it is initialized to slice_qp.
+         */
+        VAEncSliceParameterBufferH264 *slice_param = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[0]->buffer;
+        int qp;
+        int slice_type = intel_avc_enc_slice_type_fixup(slice_param->slice_type);
+
+        qp = mfc_context->brc.qp_prime_y[encoder_context->layer.curr_frame_layer_id][slice_type];
+        /* without a valid QP map the ROI path must not be used */
+        if (intel_h264_enc_roi_cbr(ctx, qp, encode_state, encoder_context) != VA_STATUS_SUCCESS)
+            vme_context->roi_enabled = 0;
+
+    } else if (encoder_context->rate_control_mode == VA_RC_CQP){
+        VAEncPictureParameterBufferH264 *pic_param = (VAEncPictureParameterBufferH264 *)encode_state->pic_param_ext->buffer;
+        VAEncSliceParameterBufferH264 *slice_param = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[0]->buffer;
+        int qp;
+        int min_qp = MAX(1, encoder_context->brc.min_qp);
+
+        qp = pic_param->pic_init_qp + slice_param->slice_qp_delta;
+        memset(vme_context->qp_per_mb, qp, width_in_mbs * height_in_mbs);
+
+        /*
+         * Walk the regions from the last to the first one, so that a region
+         * with a lower index is written later and wins where regions
+         * overlap (presumably the intended priority - confirm against the
+         * VA-API ROI documentation).
+         */
+        for (i = num_roi - 1; i >= 0; i--) {
+            int qp_delta, qp_clip;
+
+            /* pixel rectangle -> macroblock rectangle (end is exclusive) */
+            col_start = encoder_context->brc.roi[i].left / 16;
+            col_end = (encoder_context->brc.roi[i].right + 15) / 16;
+            row_start = encoder_context->brc.roi[i].top / 16;
+            row_end = (encoder_context->brc.roi[i].bottom + 15) / 16;
+
+            /* keep the rectangle inside the picture and non-inverted so the
+             * fill never writes outside qp_per_mb */
+            BRC_CLIP(col_start, 0, width_in_mbs);
+            BRC_CLIP(col_end, col_start, width_in_mbs);
+            BRC_CLIP(row_start, 0, height_in_mbs);
+            BRC_CLIP(row_end, row_start, height_in_mbs);
+
+            qp_delta = encoder_context->brc.roi[i].value;
+            qp_clip = qp + qp_delta;
+
+            BRC_CLIP(qp_clip, min_qp, 51);
+
+            for (j = row_start; j < row_end; j++) {
+                qp_ptr = vme_context->qp_per_mb + (j * width_in_mbs) + col_start;
+                memset(qp_ptr, qp_clip, (col_end - col_start));
+            }
+        }
+    } else {
+        /*
+         * TODO: Disable it for non CBR-CQP.
+         */
+        vme_context->roi_enabled = 0;
+    }
+
+    if (vme_context->roi_enabled && IS_GEN7(i965->intel.device_info))
+        encoder_context->soft_batch_force = 1;
+
+    return;
+}
+
 /* HEVC */
 static int
 hevc_temporal_find_surface(VAPictureHEVC *curr_pic,
@@ -1733,12 +2188,19 @@ intel_hevc_vme_reference_state(VADriverContextP ctx,
     struct object_surface *obj_surface = NULL;
     struct i965_driver_data *i965 = i965_driver_data(ctx);
     VASurfaceID ref_surface_id;
+    VAEncSequenceParameterBufferHEVC *pSequenceParameter = (VAEncSequenceParameterBufferHEVC *)encode_state->seq_param_ext->buffer;
     VAEncPictureParameterBufferHEVC *pic_param = (VAEncPictureParameterBufferHEVC *)encode_state->pic_param_ext->buffer;
     VAEncSliceParameterBufferHEVC *slice_param = (VAEncSliceParameterBufferHEVC *)encode_state->slice_params_ext[0]->buffer;
     int max_num_references;
     VAPictureHEVC *curr_pic;
     VAPictureHEVC *ref_list;
     int ref_idx;
+    unsigned int is_hevc10 = 0;
+    GenHevcSurface *hevc_encoder_surface = NULL;
+
+    if((pSequenceParameter->seq_fields.bits.bit_depth_luma_minus8 > 0)
+        || (pSequenceParameter->seq_fields.bits.bit_depth_chroma_minus8 > 0))
+        is_hevc10 = 1;
 
     if (list_index == 0) {
         max_num_references = pic_param->num_ref_idx_l0_default_active_minus1 + 1;
@@ -1785,6 +2247,12 @@ intel_hevc_vme_reference_state(VADriverContextP ctx,
         obj_surface->bo) {
         assert(ref_idx >= 0);
         vme_context->used_reference_objects[list_index] = obj_surface;
+
+        if(is_hevc10){
+            hevc_encoder_surface = (GenHevcSurface *) obj_surface->private_data;
+            assert(hevc_encoder_surface);
+            obj_surface = hevc_encoder_surface->nv12_surface_obj;
+        }
         vme_source_surface_state(ctx, surface_index, obj_surface, encoder_context);
         vme_context->ref_index_in_mb[list_index] = (ref_idx << 24 |
                 ref_idx << 16 |
@@ -1801,10 +2269,11 @@ void intel_vme_hevc_update_mbmv_cost(VADriverContextP ctx,
                                      struct encode_state *encode_state,
                                      struct intel_encoder_context *encoder_context)
 {
-    //struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
+    struct gen9_hcpe_context *mfc_context = encoder_context->mfc_context;
     struct gen6_vme_context *vme_context = encoder_context->vme_context;
     VAEncPictureParameterBufferHEVC *pic_param = (VAEncPictureParameterBufferHEVC *)encode_state->pic_param_ext->buffer;
     VAEncSliceParameterBufferHEVC *slice_param = (VAEncSliceParameterBufferHEVC *)encode_state->slice_params_ext[0]->buffer;
+    VAEncSequenceParameterBufferHEVC *pSequenceParameter = (VAEncSequenceParameterBufferHEVC *)encode_state->seq_param_ext->buffer;
     int qp, m_cost, j, mv_count;
     uint8_t *vme_state_message = (uint8_t *)(vme_context->vme_state_message);
     float   lambda, m_costf;
@@ -1813,18 +2282,31 @@ void intel_vme_hevc_update_mbmv_cost(VADriverContextP ctx,
     int slice_type = slice_param->slice_type;
 
 
-    /* to do for CBR*/
-    //if (encoder_context->rate_control_mode == VA_RC_CQP)
     qp = pic_param->pic_init_qp + slice_param->slice_qp_delta;
-    //else
-    //qp = mfc_context->bit_rate_control_context[slice_type].QpPrimeY;
+
+    if(encoder_context->rate_control_mode == VA_RC_CBR)
+    {
+        qp = mfc_context->bit_rate_control_context[slice_type].QpPrimeY;
+        if(slice_type == HEVC_SLICE_B) {
+            if(pSequenceParameter->ip_period == 1)
+            {
+                slice_type = HEVC_SLICE_P;
+                qp = mfc_context->bit_rate_control_context[HEVC_SLICE_P].QpPrimeY;
+
+            }else if(mfc_context->vui_hrd.i_frame_number % pSequenceParameter->ip_period == 1){
+                slice_type = HEVC_SLICE_P;
+                qp = mfc_context->bit_rate_control_context[HEVC_SLICE_P].QpPrimeY;
+            }
+        }
+
+    }
 
     if (vme_state_message == NULL)
         return;
 
     assert(qp <= QP_MAX);
     lambda = intel_lambda_qp(qp);
-    if (slice_type == SLICE_TYPE_I) {
+    if (slice_type == HEVC_SLICE_I) {
         vme_state_message[MODE_INTRA_16X16] = 0;
         m_cost = lambda * 4;
         vme_state_message[MODE_INTRA_8X8] = intel_format_lutvalue(m_cost, 0x8f);
@@ -1870,7 +2352,7 @@ void intel_vme_hevc_update_mbmv_cost(VADriverContextP ctx,
         m_costf = lambda * 3.5;
         m_cost = m_costf;
         vme_state_message[MODE_INTRA_NONPRED] = intel_format_lutvalue(m_cost, 0x6f);
-        if (slice_type == SLICE_TYPE_P) {
+        if (slice_type == HEVC_SLICE_P) {
             m_costf = lambda * 2.5;
             m_cost = m_costf;
             vme_state_message[MODE_INTER_16X16] = intel_format_lutvalue(m_cost, 0x8f);