OSDN Git Service

i965_drv_video: add support for H264 on Clarkdale/Arrandale
[android-x86/hardware-intel-common-libva.git] / i965_drv_video / shaders / h264 / ildb / AVC_ILDB_Luma_Core.asm
1 /*\r
2  * Copyright © <2010>, Intel Corporation.\r
3  *\r
4  * This program is licensed under the terms and conditions of the\r
5  * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at\r
6  * http://www.opensource.org/licenses/eclipse-1.0.php.\r
7  *\r
8  */\r
9 #if !defined(__AVC_ILDB_LUMA_CORE__)    // Make sure this file is only included once\r
10 #define __AVC_ILDB_LUMA_CORE__\r
11 \r
12 ////////// AVC ILDB Luma Core /////////////////////////////////////////////////////////////////////////////////\r
13 //\r
14 //      This core performs AVC LUMA ILDB filtering on one horizontal edge (16 pixels) of a MB.  \r
15 //      If data is transposed, it can also de-block a vertical edge.\r
16 //\r
17 //      Before calling this subroutine, the caller needs to set the following parameters.\r
18 //\r
19 //      - EdgeCntlMap1                          //      Edge control map A\r
20 //      - EdgeCntlMap2                          //      Edge control map B\r
21 //      - P_AddrReg                                     //      Src and dest address register for P pixels\r
22 //      - Q_AddrReg                                     //      Src and dest address register for Q pixels      \r
23 //      - alpha                                         //  alpha corresponding to the edge to be filtered\r
24 //      - beta                                          //  beta corresponding to the edge to be filtered\r
25 //      - tc0                                           //      tc0  corresponding to the edge to be filtered\r
26 //\r
27 //\r
28 //      +----+----+----+----+----+----+----+----+\r
29 //      | p3 | p2 | p1 | p0 | q0 | q1 | q2 | q3 |\r
30 //      +----+----+----+----+----+----+----+----+\r
31 //\r
32 //      p3 = r[P_AddrReg, 0]<16;16,1>  \r
33 //      p2 = r[P_AddrReg, 16]<16;16,1> \r
34 //      p1 = r[P_AddrReg, 32]<16;16,1> \r
35 //      p0 = r[P_AddrReg, 48]<16;16,1> \r
36 //      q0 = r[Q_AddrReg, 0]<16;16,1>  \r
37 //      q1 = r[Q_AddrReg, 16]<16;16,1> \r
38 //      q2 = r[Q_AddrReg, 32]<16;16,1> \r
39 //      q3 = r[Q_AddrReg, 48]<16;16,1> \r
40 //\r
41 /////////////////////////////////////////////////////////////////////////////////////////////////////////////////\r
42 \r
43 // Source views of the pixel rows: 16 unsigned bytes each, addressed register-indirect through P_AddrReg / Q_AddrReg. The same storage is also the destination.\r
44 // P0-P3 and Q0-Q3 should only be used while the row they name still holds its original (unmodified) values\r
45 #undef  P3\r
46 #undef  P2\r
47 #undef  P1\r
48 #undef  P0\r
49 #undef  Q0\r
50 #undef  Q1\r
51 #undef  Q2\r
52 #undef  Q3\r
53   \r
54 #define P3              r[P_AddrReg,  0]<16;16,1>:ub\r
55 #define P2              r[P_AddrReg, 16]<16;16,1>:ub\r
56 #define P1              r[P_AddrReg, 32]<16;16,1>:ub\r
57 #define P0              r[P_AddrReg, 48]<16;16,1>:ub\r
58 #define Q0              r[Q_AddrReg,  0]<16;16,1>:ub\r
59 #define Q1              r[Q_AddrReg, 16]<16;16,1>:ub\r
60 #define Q2              r[Q_AddrReg, 32]<16;16,1>:ub\r
61 #define Q3              r[Q_AddrReg, 48]<16;16,1>:ub\r
62 \r
63 // Destination views of the same rows: stride-1 unsigned bytes at the same offsets as P2..P0 and Q0..Q2 (p3 and q3 have no destination view)\r
64 #undef  NewP2\r
65 #undef  NewP1\r
66 #undef  NewP0\r
67 #undef  NewQ0\r
68 #undef  NewQ1\r
69 #undef  NewQ2\r
70 \r
71 #define NewP2   r[P_AddrReg, 16]<1>:ub\r
72 #define NewP1   r[P_AddrReg, 32]<1>:ub\r
73 #define NewP0   r[P_AddrReg, 48]<1>:ub\r
74 #define NewQ0   r[Q_AddrReg,  0]<1>:ub\r
75 #define NewQ1   r[Q_AddrReg, 16]<1>:ub\r
76 #define NewQ2   r[Q_AddrReg, 32]<1>:ub\r
77 \r
78 // FILTER_Y: filter one luma edge in place; the P and Q rows are read and rewritten through P_AddrReg / Q_AddrReg, then control leaves via RETURN\r
79 FILTER_Y:\r
80 \r
81 #if defined(_DEBUG) \r
82         mov             (1)             EntrySignatureC:w                       0x1111:w\r
83 #endif\r
84         //---------- Derive filterSampleflag in AVC spec, equation (8-469) ----------\r
85         // bS is in MaskA\r
86 \r
87         // Src copy of the p3, p2, p1, p0, q0, q1, q2, q3\r
88 //      mov (16) p0123_W(0)<1>          r[P_AddrReg]<16;16,1>:uw\r
89 //      mov (16) p0123_W(1)<1>          r[P_AddrReg, 32]<16;16,1>:uw\r
90 //      mov (16) q0123_W(0)<1>          r[Q_AddrReg]<16;16,1>:uw\r
91 //      mov (16) q0123_W(1)<1>          r[Q_AddrReg, 32]<16;16,1>:uw\r
92 \r
93         mov (2) f0.0<1>:uw              MaskA<2;2,1>:uw\r
94 \r
95         add (16) q0_p0(0)<1>            Q0              -P0                             // q0-p0\r
96         add (16) TempRow0(0)<1>         P1              -P0                             // p1-p0\r
97         add (16) TempRow1(0)<1>         Q1              -Q0                             // q1-q0\r
98 \r
99         // Build FilterSampleFlag\r
100         // abs(q0-p0) < alpha\r
101         (f0.0) cmp.l.f0.0 (16) null:w           (abs)q0_p0(0)                   alpha:w\r
102         // abs(p1-p0) < Beta\r
103         (f0.0) cmp.l.f0.0 (16) null:w           (abs)TempRow0(0)                beta:w\r
104         // abs(q1-q0) < Beta\r
105         (f0.0) cmp.l.f0.0 (16) null:w           (abs)TempRow1(0)                beta:w\r
106 \r
107         //-----------------------------------------------------------------------------------------\r
108 \r
109     (f0.0)      if      (16)            Y_ENDIF1\r
110                 // For channels whose edge control map1 = 1 ---> perform de-blocking\r
111 \r
112 //              mov (1)         f0.1:uw         MaskB:uw        {NoMask}                // Now check for which algorithm to apply\r
113 \r
114                 // (abs)ap = |p2-p0|\r
115                 add (16) ap(0)<1>               P2              -P0             // ap = p2-p0\r
116                 // (abs)aq = |q2-q0|\r
117                 add (16) aq(0)<1>               Q2              -Q0             // aq = q2-q0\r
118 \r
119                 // Make a copy of unmodified p1 and p0 (the 32 bytes starting at P_AddrReg + 32) for use in q0' and q1' calculation\r
120                 mov (16) p0123_W(1)<1>          r[P_AddrReg, 32]<16;16,1>:uw            {NoMask}\r
121 \r
122                 (f0.1)  if      (16)            Y_ELSE2\r
123 \r
124                         // For channels whose edge control map2 = 1 ---> bS = 4 algorithm\r
125 \r
126                         // Compute q0', q1' and q2'\r
127                         //-----------------------------------------------------------------------------\r
128                         // bS = 4 Algorithm :                   \r
129                         //\r
130                         // gama = |p0-q0| < ((alpha >> 2) + 2) \r
131                         // deltap = (ap<beta) && gama;                  // deep filter flag\r
132                         //      if (deltap) {\r
133                         //              p0' = (        p2 +2*p1 +2*p0 +2*q0 + q1 + 4) >> 3; \r
134                         //              p1' = (        p2 +  p1 +  p0 +  q0      + 2) >> 2;\r
135                         //              p2' = (2*p3 +3*p2 +  p1 +  p0 +  q0      + 4) >> 3;\r
136                         //      } else {  \r
137                         //              p0' = (            2*p1 +  p0 +  q1      + 2) >> 2;\r
138                         //      }\r
139                         //-----------------------------------------------------------------------------\r
140 \r
141                         // gama = |p0-q0| < ((alpha >> 2) + 2) = |p0-q0| < alpha2  \r
142                         cmp.l.f0.1 (16) null:w  (abs)q0_p0(0)   alpha2:w\r
143 \r
144                         // Common P01 = p0 + p1\r
145                         add (16)        P0_plus_P1(0)<1>        P0                      P1      \r
146 \r
147                         // Common Q01 = q0 + q1\r
148                         add (16)        Q0_plus_Q1(0)<1>        Q0                      Q1\r
149 \r
150 //                      mov (1) CTemp1_W:w              f0.1:uw                                         {NoMask}\r
151                         mov (1) f0.0:uw                 f0.1:uw                                         {NoMask}\r
152         \r
153                         // deltap = ((abs)ap < beta) && gama\r
154                         (f0.1) cmp.l.f0.1 (16) null:w   (abs)ap(0)              beta<0;1,0>:w                                                   // (abs)ap < beta ?\r
155 \r
156                         // deltaq = ((abs)aq < beta) && gama\r
157                         (f0.0) cmp.l.f0.0 (16) null:w   (abs)aq(0)              beta<0;1,0>:w                                                   // (abs)aq < beta ?\r
158 \r
159 \r
160 //                      mov (1) CTemp1_W:w              f0.0:uw                                         {NoMask}                                        // gama = |p0-q0| < ((alpha >> 2) + 2) for each channel \r
161 //                      and (1)         f0.1:w          f0.1:uw         CTemp1_W:w              {NoMask}                                        // deltap = (ap<beta) && gama\r
162 \r
163 \r
164                         (f0.1)  if      (16)            Y_ELSE3                 // for channels its deltap = true\r
165 \r
166                         add (16)        P2_plus_P3(0)<1>        P2              P3\r
167                         \r
168                         // A =  (p1 + p0) + q0 = P01 + q0\r
169                         add (16)        A(0)<1>                 P0_plus_P1(0)           Q0                                                      // A =  P01 + q0\r
170 \r
171                         // Now acc0 = A\r
172 \r
173                         // B =  p2 + (p1 + p0 + q0) + 4 = p2 + A + 4\r
174 //                      add (16)        acc0.0<1>:w             P2                              4:w                                                             // p2 + 4 \r
175 //                      add (16)        B(0)<1>                 acc0.0<16;16,1>:w               A(0)                                    // B = p2 + A + 4\r
176                         add (16)        acc0.0<1>:w             acc0.0<16;16,1>:w               4:w                                                             // A + 4\r
177                         add (16)        B(0)<1>                 acc0.0<16;16,1>:w               P2                                      // B = p2 + A + 4\r
178                         \r
179                         // Now acc0 = B\r
180 \r
181                         // p2' = (2*p3 +3*p2 + A + 4) >> 3 = (2*(p3+p2) + B) >> 3\r
182 //                      mov     (16)    acc0.0<1>:w             B(0)\r
183                         mac (16)        acc0.0<1>:w             P2_plus_P3(0)           2:w             \r
184                         shr.sat (16) TempRow3B(0)<2>    acc0.0<16;16,1>:w               3:w\r
185                         \r
186                         // p1' = (p2 + A + 2) >> 2 = (B - 2) >> 2\r
187                         add (16)        acc0.0<1>:w             B(0)                    -2:w\r
188                         shr.sat (16) TempRow1B(0)<2>    acc0.0<16;16,1>:w               2:w\r
189         \r
190                         // p0' = (p2 +2*A + q1 + 4) >> 3 = (B + A + q1) >> 3\r
191                         add (16)        acc0.0<1>:w             Q1                              A(0)                                                    // q1 + A\r
192                         add (16)        acc0.0<1>:w             acc0.0<16;16,1>:w               B(0)                                                    // B + A + q1\r
193                         shr.sat (16) TempRow0B(0)<2>    acc0.0<16;16,1>:w               3:w                                                             // (B + A + q1) >> 3\r
194 \r
195                         // p2' = (2*p3 +3*p2 + A + 4) >> 3 = (2*(p3+p2) + B) >> 3\r
196 //                      mov     (16)    acc0.0<1>:w             B(0)\r
197 //                      mac (16)        acc0.0<1>:w             P2_plus_P3(0)           2:w             \r
198 //                      shr.sat (16) TempRow3B(0)<2>    acc0.0<16;16,1>:w               3:w\r
199 \r
200                         mov (16)        NewP2           TempRow3B(0)                                            // p2'\r
201                         mov (16)        NewP1           TempRow1B(0)                                            // p1'                  \r
202                         mov (16)        NewP0           TempRow0B(0)                                            // p0'\r
203 \r
204 Y_ELSE3:\r
205                         else (16)               Y_ENDIF3                // for channels its deltap = false\r
206 \r
207                         // p0' = (2*p1 + p0 + q1 + 2) >> 2 =  (p1 + P01 + q1 + 2) >> 2\r
208                         add (16)        acc0.0<1>:w             P1                      P0_plus_P1(0)                   // p1 + P01 (TempRow1(0) = P01)\r
209                         add (16)        acc0.0<1>:w             acc0.0<16;16,1>:w       Q1                              \r
210                         add (16)        acc0.0<1>:w             acc0.0<16;16,1>:w       2:w                     // p1 + P01 + q1 + 2\r
211 \r
212                         shr.sat (16) TempRow0B(0)<2>    acc0.0<16;16,1>:w               2:w     // >> 2\r
213                         mov (16)        NewP0           TempRow0B(0)                                            // p0'\r
214 \r
215                         endif\r
216 Y_ENDIF3:\r
217                         // Compute q0', q1' and q2'\r
218                         //-----------------------------------------------------------------------------\r
219                         // bS = 4 Algorithm (cont):                     \r
220                         //\r
221                         //      deltaq = (aq<beta) && gama;             // deep filter flag\r
222                         //      if (deltaq) {\r
223                         //              q0' = (        q2 +2*q1 +2*q0 +2*p0 + p1 + 4) >> 3; \r
224                         //              q1' = (        q2 +  q1 +  q0 +  p0      + 2) >> 2;\r
225                         //              q2' = (2*q3 +3*q2 +  q1 +  q0 +  p0      + 4) >> 3;\r
226                         //      } else {\r
227                         //              q0' = (            2*q1 +  q0 +  p1      + 2) >> 2;\r
228                         //      }\r
229                         \r
230                         // deltaq = ((abs)aq < beta) && gama\r
231 //                      cmp.l.f0.1 (16) null:w  (abs)aq(0)              beta<0;1,0>:w                                                   // (abs)aq < beta ?\r
232 \r
233                         // Common Q01 = q0 + q1\r
234 //                      add (16)        Q0_plus_Q1(0)<1>        Q0                      Q1\r
235                         \r
236 //                      and (1)         f0.1:w          f0.1:uw         CTemp1_W:w              {NoMask}                                // deltaq = ((abs)ap < beta) && gama\r
237 \r
238                         (f0.0)  if      (16)            Y_ELSE4                 // for channels its deltaq = true\r
239                         \r
240                         add (16)        Q2_plus_Q3(0)<1>        Q2                      Q3\r
241 \r
242                         // A =  (q1 + q0) + p0 = Q01 + p0\r
243                         add (16)        A(0)<1>                 Q0_plus_Q1(0)           p0(0)                                                   // A =  q1+q0 + p0\r
244 \r
245                         // Acc0 = A\r
246 \r
247                         // B =  q2 + q1 + q0 + p0 + 4 = q2 + A + 4\r
248                         add (16)        acc0.0<1>:w             acc0.0<16;16,1>:w               4:w                                                     // A + 4\r
249                         add (16)        B(0)<1>                 acc0.0<16;16,1>:w               Q2                                                              // B = q2 + A + 4\r
250 \r
251                         // Acc0 = B\r
252                         \r
253                         // q2' = (2*q3 +3*q2 + A + 4) >> 3 = (2*(q3+q2) + B) >> 3\r
254 //                      mov (16)        acc0.0<1>:w             B(0)    \r
255                         mac (16)        acc0.0<1>:w             Q2_plus_Q3(0)   2:w\r
256                         shr.sat (16) TempRow3B(0)<2>    acc0.0<16;16,1>:w               3:w\r
257 \r
258                         // q1' = (q2 + A + 2) >> 2 = (B - 2) >> 2\r
259                         add (16)        acc0.0<1>:w             B(0)                    -2:w\r
260                         shr.sat (16) TempRow1B(0)<2>    acc0.0<16;16,1>:w       2:w\r
261                         \r
262                         // q0' = (q2 +2*A + p1 + 4) >> 3 = (B + A + p1) >> 3\r
263                         add (16)        acc0.0<1>:w             p1(0)                                   A(0)\r
264                         add (16)        acc0.0<1>:w             acc0.0<16;16,1>:w               B(0)\r
265                         shr.sat (16) TempRow0B(0)<2>    acc0.0<16;16,1>:w       3:w\r
266                         \r
267                         mov (16)        NewQ2           TempRow3B(0)                                            // q2'\r
268                         mov (16)        NewQ1           TempRow1B(0)                                            // q1'\r
269                         mov (16)        NewQ0           TempRow0B(0)                                            // q0'\r
270 \r
271 Y_ELSE4:\r
272                         else (16)               Y_ENDIF4                // for channels its deltaq = false\r
273 \r
274                         // q0' = (2*q1 + q0 + p1 + 2) >> 2 =  (q1 + Q01 + p1 + 2) >> 2\r
275                         // Use original p1 values in p1(0)\r
276                         add (16)        acc0.0<1>:w             p1(0)                   Q0_plus_Q1(0)                   // p1 + Q01\r
277                         add (16)        acc0.0<1>:w             acc0.0<16;16,1>:w       Q1                              \r
278                         add (16)        acc0.0<1>:w             acc0.0<16;16,1>:w       2:w                     // p1 + Q01 + q1 + 2\r
279 \r
280                         shr.sat (16)    TempRow0B(0)<2>         acc0.0<16;16,1>:w               2:w                                                             // >> 2\r
281                         mov (16)        NewQ0           TempRow0B(0)                                            // q0'\r
282 \r
283                         endif\r
284 Y_ENDIF4:\r
285 \r
286                         \r
287                         // Done with bS = 4 algorithm\r
288                         \r
289 Y_ELSE2: \r
290                 else    (16)            Y_ENDIF2\r
291                         // For channels whose edge control map2 = 0 ---> bS < 4 algorithm\r
292 \r
293                         //-----------------------------------------------------------------------------\r
294                         // bS < 4 Algorithm :\r
295                         // tc = tc0 + (|p2-p0|<Beta ? 1 : 0) + (|q2-q0|<Beta ? 1 : 0)\r
296                         // delta = Clip3(-tc, tc, ((((q0-p0)<<2) + (p1-q1) + 4) >> 3))\r
297                         // p0' = Clip1(p0 + delta) = Clip3(0, 0xFF, p0 + delta)\r
298                         // q0' = Clip1(q0 - delta) = Clip3(0, 0xFF, q0 - delta)\r
299                         // if (|p2-p0|<Beta)\r
300                         //              p1' = p1 + Clip3(-tc0, tc0, (p2 + ((p0+q0+1)>>1) - (p1<<1)) >> 1 )\r
301                         // if (|q2-q0|<Beta)\r
302                         //              q1' = q1 + Clip3(-tc0, tc0, (q2 + ((p0+q0+1)>>1) - (q1<<1)) >> 1 )\r
303                         //-----------------------------------------------------------------------------\r
304                         \r
305                         // Expand tc0\r
306                         mov (16)        tc_exp(0)<1>    tc0<1;4,0>:ub   {NoMask}\r
307                         mov (16)        tc0_exp(0)<1>   tc0<1;4,0>:ub   {NoMask}                                        // tc0_exp = tc0, each tc0 is duplicated 4 times for 4 adjacent pixels\r
308                                                 \r
309                         // tc_exp = tc0_exp + (|p2-p0|<Beta ? 1 : 0) + (|q2-q0|<Beta ? 1 : 0)                   \r
310 //                      mov (16)        tc_exp(0)<1>            tc0_exp(0)                                                                      // tc = tc0_exp first\r
311                         \r
312 \r
313                         cmp.l.f0.0 (16) null:w          (abs)ap(0)                      beta:w                                          // |p2-p0|< Beta ? ---> (abs)ap < Beta ?\r
314                         cmp.l.f0.1 (16) null:w          (abs)aq(0)                      beta:w                                          // |q2-q0|< Beta ? ---> (abs)aq < Beta ?\r
315                         \r
316                         //--- Use free cycles here ---\r
317                         // delta = Clip3(-tc, tc, ((((q0-p0)<<2) + (p1-q1) + 4) >> 3))\r
318                         // 4 * (q0-p0) + p1 - q1 + 4\r
319                         add (16) acc0<1>:w              P1                      4:w                                                     // p1 + 4\r
320                         mac (16) acc0<1>:w              q0_p0(0)        4:w                                                     // 4 * (q0-p0) + p1 + 4\r
321                         add (16) acc0<1>:w              acc0<16;16,1>:w         -Q1                                     // 4 * (q0-p0) + p1 - q1 + 4\r
322                         shr (16) TempRow0(0)<1> acc0<16;16,1>:w         3:w\r
323                                                 \r
324                         // Continue on getting tc_exp\r
325                         (f0.0) add (16) tc_exp(0)<1>    tc_exp(0)       1:w                                                     // tc0_exp + (|p2-p0|<Beta ? 1 : 0)\r
326                         mov (2) CTemp1_W<1>:w           f0.0<2;2,1>:w                   {NoMask}                                        // Save |p2-p0|<Beta flag                       \r
327                         (f0.1) add (16) tc_exp(0)<1>    tc_exp(0)       1:w                                                     // tc_exp = tc0_exp + (|p2-p0|<Beta ? 1 : 0) + (|q2-q0|<Beta ? 1 : 0)\r
328                         \r
329 \r
330                         // Continue on clipping tc to get delta\r
331                         cmp.g.f0.0      (16) null:w             TempRow0(0)             tc_exp(0)                                       // Clip if delta' > tc\r
332                         cmp.l.f0.1      (16) null:w             TempRow0(0)             -tc_exp(0)                                      // Clip if delta' < -tc\r
333 \r
334                         //--- Use free cycles here ---\r
335                         // common = (p0+q0+1) >> 1        --->  TempRow2(0)\r
336                         // Same as avg of p0 and q0\r
337                         avg (16) TempRow2(0)<1>         P0                      Q0\r
338 \r
339                         // Continue on clipping tc to get delta\r
340                         (f0.0) mov (16) TempRow0(0)<1>                          tc_exp(0)\r
341                         (f0.1) mov (16) TempRow0(0)<1>                          -tc_exp(0)\r
342 \r
343                         //--- Use free cycles here ---\r
344                         mov (2) f0.0<1>:w               CTemp1_W<2;2,1>:w       {NoMask}                        // CTemp1_W = (|p2-p0|<Beta)\r
345                                                                                                                                                         // CTemp2_W = (|q2-q0|<Beta)            \r
346                         //-----------------------------------------------------------------------\r
347 \r
348                         // p0' = Clip1(p0 + delta) = Clip3(0, 0xFF, p0 + delta)\r
349                         // q0' = Clip1(q0 - delta) = Clip3(0, 0xFF, q0 - delta)\r
350                         add.sat (16) TempRow1B(0)<2>            P0                      TempRow0(0)                                     // p0+delta\r
351                         add.sat (16) TempRow0B(0)<2>            Q0                      -TempRow0(0)                            // q0-delta\r
352                         mov (16) NewP0          TempRow1B(0)                                    // p0'\r
353                         mov (16) NewQ0          TempRow0B(0)                                    // q0'\r
354                         //-----------------------------------------------------------------------\r
355 \r
356                         // Now compute p1' and q1'\r
357 \r
358                         // if (|p2-p0|<Beta)\r
359 //                      mov (1) f0.0:w          CTemp1_W:w                              {NoMask}                        // CTemp1_W = (|p2-p0|<Beta)\r
360                         (f0.0)  if      (16)            Y_ENDIF6\r
361                 \r
362                         // p1' = p1 + Clip3(-tc0, tc0, adj)\r
363                         // adj = (p2 + common - (p1<<1)) >> 1 = (p2 + common - (p1*2)) >> 1\r
364                         add (16) acc0<1>:w      P2              TempRow2(0)                                                     // TempRow2(0) = common = (p0+q0+1) >> 1\r
365                         mac (16) acc0<1>:w      P1              -2:w\r
366                         shr (16) TempRow1(0)<1>         acc0<16;16,1>:w         1:w\r
367 \r
368                         // tc clip to get tc_adj\r
369                         cmp.g.f0.0      (16) null:w             TempRow1(0)             tc0_exp(0)                                      // Clip if adj > tc0\r
370                         cmp.l.f0.1      (16) null:w             TempRow1(0)             -tc0_exp(0)                                     // Clip if adj < -tc0\r
371                         \r
372                         (f0.0) mov (16) TempRow1(0)<1>                          tc0_exp(0)\r
373                         (f0.1) mov (16) TempRow1(0)<1>                          -tc0_exp(0)\r
374 \r
375                         //--- Use free cycles here ---\r
376                         mov (1) f0.1:w          CTemp2_W:w                              {NoMask}                        // CTemp2_W = (|q2-q0|<Beta)\r
377 \r
378                         // p1' = p1 + tc_adj\r
379                         add.sat (16) TempRow1B(0)<2>            P1                      TempRow1(0)                                     // p1+tc_adj\r
380                         mov (16) NewP1                  TempRow1B(0)                            // p1'\r
381 \r
382                         //------------------------------------------------------------------------\r
383 Y_ENDIF6:\r
384                         endif\r
385                         \r
386                         // if (|q2-q0|<Beta)\r
387 //                      mov (1) f0.1:w          CTemp2_W:w                              {NoMask}                        // CTemp2_W = (|q2-q0|<Beta)\r
388                         (f0.1)  if      (16)            Y_ENDIF7\r
389                                         \r
390                         // q1' = q1 + Clip3(-tc0, tc0, adj)\r
391                         // adj = (q2 + common - (q1<<1)) >> 1 \r
392                         // same as q2 + common - (q1 * 2)\r
393                         add (16) acc0<1>:w      Q2              TempRow2(0)\r
394                         mac (16) acc0<1>:w      Q1              -2:w\r
395                         shr (16) TempRow1(0)<1>         acc0<16;16,1>:w         1:w     \r
396 \r
397                         // tc clip to get tc_adj\r
398                         cmp.g.f0.0      (16) null:w             TempRow1(0)             tc0_exp(0)                                      // Clip if adj > tc0\r
399                         cmp.l.f0.1      (16) null:w             TempRow1(0)             -tc0_exp(0)                                     // Clip if adj < -tc0\r
400 \r
401                         (f0.0) mov (16) TempRow1(0)<1>                          tc0_exp(0)\r
402                         (f0.1) mov (16) TempRow1(0)<1>                          -tc0_exp(0)\r
403 \r
404                         // q1' = q1 + tc_adj\r
405                         add.sat (16) TempRow1B(0)<2>            Q1                      TempRow1(0)                                     // q1+tc_adj\r
406                         mov (16) NewQ1                  TempRow1B(0)                            // q1'\r
407 \r
408                         //------------------------------------------------------------------------                      \r
409 Y_ENDIF7:\r
410                         endif\r
411 \r
412                 endif\r
413 Y_ENDIF2:\r
414 Y_ENDIF1:\r
415         endif\r
416 \r
417 RETURN\r
418 \r
419 #endif  // !defined(__AVC_ILDB_LUMA_CORE__)\r