-SUBDIRS = gen5_6
+# Recurse into both the gen5_6 and the gen7 subdirectories.
+SUBDIRS = gen5_6 gen7
--- /dev/null
+// 22 // Total instruction count
+// 1 // Total kernel count
+
+
+// Module name: common.inc
+//
+// Common header file for all Video-Processing kernels
+//
+
+// Assembler defaults: an instruction that omits its execution size uses 16,
+// and an operand that omits its type is treated as unsigned byte (:ub).
+.default_execution_size (16)
+.default_register_type :ub
+
+// Register counts declared for this kernel: 128 in total, 7 of them payload.
+.reg_count_total 128
+.reg_count_payload 7
+
+//========== Common constants ==========
+
+
+//========== Macros ==========
+
+
+//Fast Jump, For more details see "Set_Layer_N.asm"
+
+
+//========== Defines ====================
+
+//========== Static Parameters (Common To All) ==========
+//r1
+
+
+//r2
+
+ // e.g. byte0 byte1 byte2
+ // YUYV 0 1 3
+ // YVYU 0 3 1
+
+//Color Pipe (IECP) parameters
+
+
+//r4
+
+ // e.g. byte0 byte1 byte2
+ // YUYV 0 1 3
+ // YVYU 0 3 1
+
+
+//========== Inline parameters (Common To All) ===========
+
+
+//============== Binding Index Table===========
+//Common between DNDI and DNUV
+
+
+//================= Common Message Descriptor =====
+// Message descriptor for thread spawning
+// Message Descriptors
+// = 000 0001 (min message len 1 ) 0,0000 (resp len 0 -add later)
+// 0000,0000,0000
+// 0001(Spawn a root thread),0001 (Root thread spawn thread)
+// = 0x02000011
+// Thread Spawner Message Descriptor
+
+
+// Message descriptor for atomic operation add
+// Message Descriptors
+// = 000 0110 (min message len 6 ) 0,0000 (resp len 0 -add later)
+// 1(header present)001,10(typed atomic operation)0(return enabled)0(slot group, low 8 bits),0111 (AOP_Add)
+// 0000,0000 (Binding table index, added later)
+// = 0x0C098700
+
+// Atomic Operation Add Message Descriptor
+
+
+// Message descriptor for dataport media write
+ // Message Descriptors
+ // = 000 0001 (min message len 1 - add later) 00000 (resp len 0)
+ // 1 (header present 1) 0 1010 (media block write) 000000
+ // 00000000 (binding table index - set later)
+ // = 0x020A8000
+
+
+// Message Length defines
+
+
+// Response Length defines
+
+
+// Block Width and Height Size defines
+
+
+// Extended Message Descriptors
+
+
+// Common message descriptors:
+
+
+//===================== Math Function Control ===================================
+
+
+//============ Message Registers ===============
+ // buf4 starts from r28
+
+
+//#define mMSGHDR_EOT r43 // Dummy Message Register for EOT
+
+
+// Four typed views (ub / uw / ud / f) of the same storage, all based at r30.
+.declare mubMSGPAYLOAD Base=r30 ElementSize=1 SrcRegion=<16;16,1> Type=ub
+.declare muwMSGPAYLOAD Base=r30 ElementSize=2 SrcRegion=<16;16,1> Type=uw
+.declare mudMSGPAYLOAD Base=r30 ElementSize=4 SrcRegion=<8;8,1> Type=ud
+.declare mfMSGPAYLOAD Base=r30 ElementSize=4 SrcRegion=<8;8,1> Type=f
+
+//=================== End of thread instruction ===========================
+
+
+//=====================Pointers Used=====================================
+
+
+//=======================================================================
+
+
+//r9-r17
+// Define temp space for any usages
+
+
+// Common Buffers
+
+
+// End of common.inc
+
+// FileName: DI.asm
+// Author: Vivek Kumar
+// Description: Tasks for DI only case (16x4 block)
+
+
+// End of common.inc
+
+
+// FileName: DNDI.inc
+// Author: Vivek Kumar
+// Description: Include file for DN, DI and DNDI
+// Inputs: DI_ENABLE, DN_ENABLE, DN_PLANAR, DN_PACKED
+
+
+// End of common.inc
+
+
+//Interface:
+//Static Parameters:
+//r1
+
+
+//====================== Binding table (Explicit To DNDI)=========================================
+
+
+// DNDI request header: ud / d / w views based at r18.
+.declare mudMSGHDR_DNDI Base=r18 ElementSize=4 Type=ud
+.declare mdMSGHDR_DNDI Base=r18 ElementSize=4 Type=d
+.declare mwMSGHDR_DNDI Base=r18 ElementSize=2 Type=w
+
+
+// STMM write header, based at r20.
+.declare mudMSGHDR_STMM Base=r20 ElementSize=4 Type=ud
+
+
+// HIST header, based at r22.
+.declare mudMSGHDR_HIST Base=r22 ElementSize=4 Type=ud
+
+
+// Encoder-statistics header: ud / uw / ub views based at r24.
+.declare mudMSGHDR_ENC_STATS Base=r24 ElementSize=4 Type=ud
+.declare muwMSGHDR_ENC_STATS Base=r24 ElementSize=2 Type=uw
+.declare mubMSGHDR_ENC_STATS Base=r24 ElementSize=1 Type=ub
+
+
+// DN output header: ud / d / ub views based at r31.
+.declare mudMSGHDR_DN_OUT Base=r31.0 ElementSize=4 Type=ud
+.declare mdMSGHDR_DN_OUT Base=r31.0 ElementSize=4 Type=d
+.declare mubMSGHDR_DN_OUT Base=r31.0 ElementSize=1 Type=ub
+
+
+// UV copy headers: UVCOPY and UCOPY share base r36, VCOPY is based at r38.
+.declare mudMSGHDR_UVCOPY Base=r36 ElementSize=4 Type=ud
+.declare mdMSGHDR_UVCOPY Base=r36 ElementSize=4 Type=d
+.declare mudMSGHDR_UCOPY Base=r36 ElementSize=4 Type=ud
+.declare mudMSGHDR_VCOPY Base=r38 ElementSize=4 Type=ud
+
+
+// First DI output header, based at r18 -- the same base as mudMSGHDR_DNDI above.
+.declare mudMSGHDR_DI_OUT1 Base=r18.0 ElementSize=4 Type=ud
+.declare mubMSGHDR_DI_OUT1 Base=r18.0 ElementSize=1 Type=ub
+
+
+// Second DI output header, based at r23; registers following it overlap the
+// ENC_STATS range that starts at r24.
+.declare mudMSGHDR_DI_OUT2 Base=r23.0 ElementSize=4 Type=ud
+.declare mubMSGHDR_DI_OUT2 Base=r23.0 ElementSize=1 Type=ub
+
+//r45
+//Use r45 as message header, so no need to "mov" the data.
+
+.declare mudDN_Y_OUT Base=r45.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+
+// Message response (Denoised & DI-ed pixels & statistics); Use buffer 5
+// ud / uw / ub views of the response, all based at r46.
+.declare udDNDI_RESP Base=r46.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare uwDNDI_RESP Base=r46.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare ubDNDI_RESP Base=r46.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+
+// Message response (UV Copy); Use buffer 5
+// ud / ub views based at r58.
+.declare udDNDI_UV_RESP Base=r58.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare ubDNDI_UV_RESP Base=r58.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+
+//Temp GRFs: For 42X to 422 Conversion
+// uw / ub views based at r10.
+.declare uwDNDI_UVCOPY_TEMP Base=r10.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw //8 GRFs
+.declare ubDNDI_UVCOPY_TEMP Base=r10.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub //8 GRFs
+//---------------------------------------------------------------------------
+// Message descriptors
+//---------------------------------------------------------------------------
+// Extended message descriptor
+ // Message descriptor for sampler read
+ // = 000 0010 (message len 2) 00000 (resp len - set later, 12 or 5 or 11)
+ // 1 (header present 1) 0 11 (SIMD32/64 mode)
+ // 1000 (message type) 0000 (DI state index)
+ // 00000000 (binding table index - set later)
+ // = 0x040b8000
+
+
+// Attention: The Message Length is The Number of GRFs with Data Only, without the Header
+
+
+//---------------------------------------------------------------------------
+// VDI Return Data format
+//---------------------------------------------------------------------------
+// Defines for DI enabled
+
+
+// Defines for DI disabled
+
+
+// FileName: DNDI_Command.asm
+// Author: Vivek Kumar
+// Description: Sends a message to the VDI to process one DN (16x8) or DNDI (16x4) block
+
+// Prepare the DNDI send command
+// The request occupies r18 (header, copied from r0) and r19 (origin words).
+mov (8) mudMSGHDR_DNDI(0)<1> r0.0<8;8,1>:ud // message header
+mov (1) mwMSGHDR_DNDI(1,4)<1> r7.0<0;1,0>:w { NoDDClr } // horizontal origin // Do we need to add offset here? -vK
+mov (1) mwMSGHDR_DNDI(1,12)<1> r7.1<0;1,0>:w { NoDDChk } // vertical origin // Can these 2 be combined? - vK
+
+// Send starting at r18 with extended descriptor 0x2; the response is written
+// starting at udDNDI_RESP (r46). Decoding 0x4AE8003 with the field layout
+// documented earlier in this file gives message length 2 and response length 10.
+send (8) udDNDI_RESP(0)<1> r18 0x2 0x4AE8003:ud
+
+// On Gen6, with VDI walker, use the XY pair returned rather than programmed above
+// VDI_RETURNED_XY is ordered XY in case of walker enables and the same as programmed in case of walker disabled
+// Overwrites r7.0 / r7.1 with words 14 and 15 of response register 9.
+mov (2) r7.0<1>:w uwDNDI_RESP(9,14)<2;2,1> // horizontal/vertical origin in W.14 and W.15
+
+
+// FileName: DI_STMM_Save.asm
+// Author: Vivek Kumar
+// Description: Saves DI STMM Data to statistics surface in case of DI enabled (for 16x4 block)
+
+// Write STMM to memory
+// r20 is the message header (copied from r0, then patched below); r21 carries
+// the payload taken from response register 8.
+mov (8) mudMSGHDR_STMM(0)<1> r0.0<8;8,1>:ud // message header
+mov (8) mudMSGHDR_STMM(1)<1> udDNDI_RESP(8,0) // Move STMM to MRF
+
+// Header dwords 0..2: X origin (halved), Y origin, block size.
+shr (1) mudMSGHDR_STMM(0,0)<1> r7.0<0;1,0>:w 1:w { NoDDClr } // X origin / 2
+mov (1) mudMSGHDR_STMM(0,1)<1> r7.1<0;1,0>:w { NoDDClr, NoDDChk } // Y origin
+mov (1) mudMSGHDR_STMM(0,2)<1> 0x30007:ud { NoDDChk } // block width and height (8x4)
+
+// Send starting at r20 with extended descriptor 0x5; no response is requested
+// (null destination).
+send (8) null<1>:d r20 0x5 0x40A8021:ud
+
+
+// FileName: DNDI_Enc_Stats_Save.asm
+// Author: Vivek Kumar
+// Description: Saves Encoder Statistics data to statistics surface in case of DI enabled (for 16x4 block)
+
+// Write encoder statistics to memory
+//Currently enable this only on Gen6 validation
+// r24 is the message header, r25 the payload (zeroed first, then partly filled).
+mov (8) mudMSGHDR_ENC_STATS(1)<1> 0x0:ud // Init payload MRF
+mov (8) mudMSGHDR_ENC_STATS(0)<1> r0.0<8;8,1>:ud // message header
+
+// Header dwords 0..2: X origin / 2, Y origin * 3 / 4 (computed through acc0.1), block size.
+shr (1) mudMSGHDR_ENC_STATS(0,0)<1> r7.0<0;1,0>:w 1:w { NoDDClr } // X origin / 2
+mul (1) acc0.1<1>:ud r7.1<0;1,0>:w 3:w // Y origin * 3
+shr (1) mudMSGHDR_ENC_STATS(0,1)<1> acc0.1<0;1,0>:ud 2:w { NoDDClr, NoDDChk } // Y origin * 3/4
+mov (1) mudMSGHDR_ENC_STATS(0,2)<1> 0x20007:ud { NoDDChk } // block width and height (8x3)
+// Adds the two words at r1.12 / r1.13 to header dwords 0 and 1.
+add (2) mudMSGHDR_ENC_STATS(0,0)<1> mudMSGHDR_ENC_STATS(0,0)<2;2,1> r1.12<2;2,1>:uw // Add pitch to X,Y origin
+
+
+ //Data block for Encoder Statistics
+ //----------------------------------------------------
+ //| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | Bytes
+ //----------------------------------------------------
+ //| BNE | MCNT | FCNT | TCNT | X | X | X | X |
+ //----------------------------------------------------
+ //| DcTpT | SVCM | DcBpT | DcTpB |
+ //----------------------------------------------------
+ //| SHCM | STAD | DcTcB | DcBpB |
+ //----------------------------------------------------
+ // Payload dword 0 <- response(9,1); dwords 3 and 5 <- response(9,3..4);
+ // dwords 2 and 4 <- response(9,5..6). Payload dword 1 keeps the zero written above.
+ mov (1) mudMSGHDR_ENC_STATS(1,0)<1> udDNDI_RESP(9,1)<0;1,0> { NoDDClr } // Move encoder statistics to MRF
+ mov (2) mudMSGHDR_ENC_STATS(1,3)<2> udDNDI_RESP(9,3)<2;2,1> { NoDDClr, NoDDChk } // Move encoder statistics to MRF
+ mov (2) mudMSGHDR_ENC_STATS(1,2)<2> udDNDI_RESP(9,5)<2;2,1> { NoDDChk } // Move encoder statistics to MRF
+
+
+// Send starting at r24 with the same descriptor value as the STMM write; no
+// response is requested (null destination).
+send (8) null<1>:d r24 0x5 0x40A8021:ud
+
+
--- /dev/null
+// 20 // Total instruction count
+// 1 // Total kernel count
+
+
+// Module name: common.inc
+//
+// Common header file for all Video-Processing kernels
+//
+
+// Assembler defaults: an instruction that omits its execution size uses 16,
+// and an operand that omits its type is treated as unsigned byte (:ub).
+.default_execution_size (16)
+.default_register_type :ub
+
+// Register counts declared for this kernel: 128 in total, 7 of them payload.
+.reg_count_total 128
+.reg_count_payload 7
+
+//========== Common constants ==========
+
+
+//========== Macros ==========
+
+
+//Fast Jump, For more details see "Set_Layer_N.asm"
+
+
+//========== Defines ====================
+
+//========== Static Parameters (Common To All) ==========
+//r1
+
+
+//r2
+
+ // e.g. byte0 byte1 byte2
+ // YUYV 0 1 3
+ // YVYU 0 3 1
+
+//Color Pipe (IECP) parameters
+
+
+//r4
+
+ // e.g. byte0 byte1 byte2
+ // YUYV 0 1 3
+ // YVYU 0 3 1
+
+
+//========== Inline parameters (Common To All) ===========
+
+
+//============== Binding Index Table===========
+//Common between DNDI and DNUV
+
+
+//================= Common Message Descriptor =====
+// Message descriptor for thread spawning
+// Message Descriptors
+// = 000 0001 (min message len 1 ) 0,0000 (resp len 0 -add later)
+// 0000,0000,0000
+// 0001(Spawn a root thread),0001 (Root thread spawn thread)
+// = 0x02000011
+// Thread Spawner Message Descriptor
+
+
+// Message descriptor for atomic operation add
+// Message Descriptors
+// = 000 0110 (min message len 6 ) 0,0000 (resp len 0 -add later)
+// 1(header present)001,10(typed atomic operation)0(return enabled)0(slot group, low 8 bits),0111 (AOP_Add)
+// 0000,0000 (Binding table index, added later)
+// = 0x0C098700
+
+// Atomic Operation Add Message Descriptor
+
+
+// Message descriptor for dataport media write
+ // Message Descriptors
+ // = 000 0001 (min message len 1 - add later) 00000 (resp len 0)
+ // 1 (header present 1) 0 1010 (media block write) 000000
+ // 00000000 (binding table index - set later)
+ // = 0x020A8000
+
+
+// Message Length defines
+
+
+// Response Length defines
+
+
+// Block Width and Height Size defines
+
+
+// Extended Message Descriptors
+
+
+// Common message descriptors:
+
+
+//===================== Math Function Control ===================================
+
+
+//============ Message Registers ===============
+ // buf4 starts from r28
+
+
+//#define mMSGHDR_EOT r43 // Dummy Message Register for EOT
+
+
+// Four typed views (ub / uw / ud / f) of the same storage, all based at r30.
+.declare mubMSGPAYLOAD Base=r30 ElementSize=1 SrcRegion=<16;16,1> Type=ub
+.declare muwMSGPAYLOAD Base=r30 ElementSize=2 SrcRegion=<16;16,1> Type=uw
+.declare mudMSGPAYLOAD Base=r30 ElementSize=4 SrcRegion=<8;8,1> Type=ud
+.declare mfMSGPAYLOAD Base=r30 ElementSize=4 SrcRegion=<8;8,1> Type=f
+
+//=================== End of thread instruction ===========================
+
+
+//=====================Pointers Used=====================================
+
+
+//=======================================================================
+
+
+//r9-r17
+// Define temp space for any usages
+
+
+// Common Buffers
+
+
+// End of common.inc
+
+// FileName: DI_Save_NV12_16x4.asm
+// Author: Vivek Kumar
+// Description: Save two 16x4 blocks of DI output in NV12 format
+
+
+// FileName: DNDI.inc
+// Author: Vivek Kumar
+// Description: Include file for DN, DI and DNDI
+// Inputs: DI_ENABLE, DN_ENABLE, DN_PLANAR, DN_PACKED
+
+
+// End of common.inc
+
+
+//Interface:
+//Static Parameters:
+//r1
+
+
+//====================== Binding table (Explicit To DNDI)=========================================
+
+
+// DNDI request header: ud / d / w views based at r18.
+.declare mudMSGHDR_DNDI Base=r18 ElementSize=4 Type=ud
+.declare mdMSGHDR_DNDI Base=r18 ElementSize=4 Type=d
+.declare mwMSGHDR_DNDI Base=r18 ElementSize=2 Type=w
+
+
+// STMM write header, based at r20.
+.declare mudMSGHDR_STMM Base=r20 ElementSize=4 Type=ud
+
+
+// HIST header, based at r22.
+.declare mudMSGHDR_HIST Base=r22 ElementSize=4 Type=ud
+
+
+// Encoder-statistics header: ud / uw / ub views based at r24.
+.declare mudMSGHDR_ENC_STATS Base=r24 ElementSize=4 Type=ud
+.declare muwMSGHDR_ENC_STATS Base=r24 ElementSize=2 Type=uw
+.declare mubMSGHDR_ENC_STATS Base=r24 ElementSize=1 Type=ub
+
+
+// DN output header: ud / d / ub views based at r31.
+.declare mudMSGHDR_DN_OUT Base=r31.0 ElementSize=4 Type=ud
+.declare mdMSGHDR_DN_OUT Base=r31.0 ElementSize=4 Type=d
+.declare mubMSGHDR_DN_OUT Base=r31.0 ElementSize=1 Type=ub
+
+
+// UV copy headers: UVCOPY and UCOPY share base r36, VCOPY is based at r38.
+.declare mudMSGHDR_UVCOPY Base=r36 ElementSize=4 Type=ud
+.declare mdMSGHDR_UVCOPY Base=r36 ElementSize=4 Type=d
+.declare mudMSGHDR_UCOPY Base=r36 ElementSize=4 Type=ud
+.declare mudMSGHDR_VCOPY Base=r38 ElementSize=4 Type=ud
+
+
+// First DI output header, based at r18 -- the same base as mudMSGHDR_DNDI above.
+.declare mudMSGHDR_DI_OUT1 Base=r18.0 ElementSize=4 Type=ud
+.declare mubMSGHDR_DI_OUT1 Base=r18.0 ElementSize=1 Type=ub
+
+
+// Second DI output header, based at r23; registers following it overlap the
+// ENC_STATS range that starts at r24.
+.declare mudMSGHDR_DI_OUT2 Base=r23.0 ElementSize=4 Type=ud
+.declare mubMSGHDR_DI_OUT2 Base=r23.0 ElementSize=1 Type=ub
+
+//r45
+//Use r45 as message header, so no need to "mov" the data.
+
+.declare mudDN_Y_OUT Base=r45.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+
+// Message response (Denoised & DI-ed pixels & statistics); Use buffer 5
+// ud / uw / ub views of the response, all based at r46.
+.declare udDNDI_RESP Base=r46.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare uwDNDI_RESP Base=r46.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare ubDNDI_RESP Base=r46.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+
+// Message response (UV Copy); Use buffer 5
+// ud / ub views based at r58.
+.declare udDNDI_UV_RESP Base=r58.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare ubDNDI_UV_RESP Base=r58.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+
+//Temp GRFs: For 42X to 422 Conversion
+// uw / ub views based at r10.
+.declare uwDNDI_UVCOPY_TEMP Base=r10.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw //8 GRFs
+.declare ubDNDI_UVCOPY_TEMP Base=r10.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub //8 GRFs
+//---------------------------------------------------------------------------
+// Message descriptors
+//---------------------------------------------------------------------------
+// Extended message descriptor
+ // Message descriptor for sampler read
+ // = 000 0010 (message len 2) 00000 (resp len - set later, 12 or 5 or 11)
+ // 1 (header present 1) 0 11 (SIMD32/64 mode)
+ // 1000 (message type) 0000 (DI state index)
+ // 00000000 (binding table index - set later)
+ // = 0x040b8000
+
+
+// Attention: The Message Length is The Number of GRFs with Data Only, without the Header
+
+
+//---------------------------------------------------------------------------
+// VDI Return Data format
+//---------------------------------------------------------------------------
+// Defines for DI enabled
+
+
+// Defines for DI disabled
+
+
+// Save two 16x4 DI output blocks as NV12: four media-block writes, each made of
+// a header register followed by its payload register(s):
+//   r18 + r19..r20 : first Y block      r21 + r22 : first U/V block
+//   r23 + r24..r25 : second Y block     r26 + r27 : second U/V block
+// r27 is used as the header template until both U/V headers have been copied,
+// after which it is reused as the payload of the last message.
+// NOTE(review): r27 no longer holds the header template after this code --
+// confirm nothing that runs afterwards expects it.
+
+// Template dwords 0..1 = block origin from r7.0 / r7.1, dword 2 = block size.
+mov (2) r27.0<1>:d r7.0<2;2,1>:w { NoDDClr }
+mov (1) r27.2<1>:ud 0x3000F:ud { NoDDChk } // Block width and height (16x4)
+
+//Bottom field Y
+ mov (8) mudMSGHDR_DI_OUT1(1)<1> udDNDI_RESP(0,0)
+ mov (8) mudMSGHDR_DI_OUT1(2)<1> udDNDI_RESP(0,8)
+// Top field Y
+ mov (8) mudMSGHDR_DI_OUT2(1)<1> udDNDI_RESP(4,0)
+ mov (8) mudMSGHDR_DI_OUT2(2)<1> udDNDI_RESP(4,8)
+
+//copy message descriptor to the Y message headers
+mov (8) r18.0<1>:ud r27<8;8,1>:ud
+mov (8) r23.0<1>:ud r27<8;8,1>:ud
+
+//Change origin to U/V block
+asr (1) r27.1<1>:d r27.1<0;1,0>:d 1:w { NoDDClr } // U/V block origin should be half of Y's
+mov (1) r27.2<1>:ud 0x1000F:ud { NoDDChk } // Block width and height (16x2)
+
+//copy message descriptor to the U/V message headers
+// This must happen before the U/V data is packed: the headers live in r21 / r26
+// and the data in the registers right after them (r22 / r27). Packing the data
+// into r21 / r26 and copying the header afterwards would overwrite the data and
+// send an unwritten payload register.
+mov (8) r21<1>:ud r27<8;8,1>:ud
+mov (8) r26<1>:ud r27<8;8,1>:ud
+
+// Bottom field U/V: interleave the bytes into the payload register r22
+mov (16) r22.0<2>:ub ubDNDI_RESP(2, 1)<32;8,2> { NoDDClr }
+mov (16) r22.1<2>:ub ubDNDI_RESP(2, 0)<32;8,2> { NoDDChk }
+
+// Top field U/V: interleave the bytes into the payload register r27
+mov (16) r27.0<2>:ub ubDNDI_RESP(6, 1)<32;8,2> { NoDDClr }
+mov (16) r27.1<2>:ub ubDNDI_RESP(6, 0)<32;8,2> { NoDDChk }
+
+//Send out Y component on previous frame to surface
+send (8) null<1>:d r18.0 0x5 0x60A801B:ud
+//Send out Y component on current frame to surface
+send (8) null<1>:d r23.0 0x5 0x60A801E:ud
+//Send out U/V component on previous frame to surface
+send (8) null<1>:d r21 0x5 0x40A801C:ud
+//Send out U/V component on current frame to surface
+send (8) null<1>:d r26 0x5 0x40A801F:ud
--- /dev/null
+// 33 // Total instruction count
+// 1 // Total kernel count
+
+
+// Module name: common.inc
+//
+// Common header file for all Video-Processing kernels
+//
+
+// Assembler defaults: an instruction that omits its execution size uses 16,
+// and an operand that omits its type is treated as unsigned byte (:ub).
+.default_execution_size (16)
+.default_register_type :ub
+
+// Register counts declared for this kernel: 128 in total, 7 of them payload.
+.reg_count_total 128
+.reg_count_payload 7
+
+//========== Common constants ==========
+
+
+//========== Macros ==========
+
+
+//Fast Jump, For more details see "Set_Layer_N.asm"
+
+
+//========== Defines ====================
+
+//========== Static Parameters (Common To All) ==========
+//r1
+
+
+//r2
+
+ // e.g. byte0 byte1 byte2
+ // YUYV 0 1 3
+ // YVYU 0 3 1
+
+//Color Pipe (IECP) parameters
+
+
+//r4
+
+ // e.g. byte0 byte1 byte2
+ // YUYV 0 1 3
+ // YVYU 0 3 1
+
+
+//========== Inline parameters (Common To All) ===========
+
+
+//============== Binding Index Table===========
+//Common between DNDI and DNUV
+
+
+//================= Common Message Descriptor =====
+// Message descriptor for thread spawning
+// Message Descriptors
+// = 000 0001 (min message len 1 ) 0,0000 (resp len 0 -add later)
+// 0000,0000,0000
+// 0001(Spawn a root thread),0001 (Root thread spawn thread)
+// = 0x02000011
+// Thread Spawner Message Descriptor
+
+
+// Message descriptor for atomic operation add
+// Message Descriptors
+// = 000 0110 (min message len 6 ) 0,0000 (resp len 0 -add later)
+// 1(header present)001,10(typed atomic operation)0(return enabled)0(slot group, low 8 bits),0111 (AOP_Add)
+// 0000,0000 (Binding table index, added later)
+// = 0x0C098700
+
+// Atomic Operation Add Message Descriptor
+
+
+// Message descriptor for dataport media write
+ // Message Descriptors
+ // = 000 0001 (min message len 1 - add later) 00000 (resp len 0)
+ // 1 (header present 1) 0 1010 (media block write) 000000
+ // 00000000 (binding table index - set later)
+ // = 0x020A8000
+
+
+// Message Length defines
+
+
+// Response Length defines
+
+
+// Block Width and Height Size defines
+
+
+// Extended Message Descriptors
+
+
+// Common message descriptors:
+
+
+//===================== Math Function Control ===================================
+
+
+//============ Message Registers ===============
+ // buf4 starts from r28
+
+
+//#define mMSGHDR_EOT r43 // Dummy Message Register for EOT
+
+
+// Four typed views (ub / uw / ud / f) of the same storage, all based at r30.
+.declare mubMSGPAYLOAD Base=r30 ElementSize=1 SrcRegion=<16;16,1> Type=ub
+.declare muwMSGPAYLOAD Base=r30 ElementSize=2 SrcRegion=<16;16,1> Type=uw
+.declare mudMSGPAYLOAD Base=r30 ElementSize=4 SrcRegion=<8;8,1> Type=ud
+.declare mfMSGPAYLOAD Base=r30 ElementSize=4 SrcRegion=<8;8,1> Type=f
+
+//=================== End of thread instruction ===========================
+
+
+//=====================Pointers Used=====================================
+
+
+//=======================================================================
+
+
+//r9-r17
+// Define temp space for any usages
+
+
+// Common Buffers
+
+
+// End of common.inc
+
+// FileName: DI_Save_PA_16x4.asm
+// Author: Vivek Kumar
+// Description: Save two 16x4 blocks of DI output in Packed format
+
+
+// FileName: DNDI.inc
+// Author: Vivek Kumar
+// Description: Include file for DN, DI and DNDI
+// Inputs: DI_ENABLE, DN_ENABLE, DN_PLANAR, DN_PACKED
+
+
+// End of common.inc
+
+
+//Interface:
+//Static Parameters:
+//r1
+
+
+//====================== Binding table (Explicit To DNDI)=========================================
+
+
+// DNDI request header: ud / d / w views based at r18.
+.declare mudMSGHDR_DNDI Base=r18 ElementSize=4 Type=ud
+.declare mdMSGHDR_DNDI Base=r18 ElementSize=4 Type=d
+.declare mwMSGHDR_DNDI Base=r18 ElementSize=2 Type=w
+
+
+// STMM write header, based at r20.
+.declare mudMSGHDR_STMM Base=r20 ElementSize=4 Type=ud
+
+
+// HIST header, based at r22.
+.declare mudMSGHDR_HIST Base=r22 ElementSize=4 Type=ud
+
+
+// Encoder-statistics header: ud / uw / ub views based at r24.
+.declare mudMSGHDR_ENC_STATS Base=r24 ElementSize=4 Type=ud
+.declare muwMSGHDR_ENC_STATS Base=r24 ElementSize=2 Type=uw
+.declare mubMSGHDR_ENC_STATS Base=r24 ElementSize=1 Type=ub
+
+
+// DN output header: ud / d / ub views based at r31.
+.declare mudMSGHDR_DN_OUT Base=r31.0 ElementSize=4 Type=ud
+.declare mdMSGHDR_DN_OUT Base=r31.0 ElementSize=4 Type=d
+.declare mubMSGHDR_DN_OUT Base=r31.0 ElementSize=1 Type=ub
+
+
+// UV copy headers: UVCOPY and UCOPY share base r36, VCOPY is based at r38.
+.declare mudMSGHDR_UVCOPY Base=r36 ElementSize=4 Type=ud
+.declare mdMSGHDR_UVCOPY Base=r36 ElementSize=4 Type=d
+.declare mudMSGHDR_UCOPY Base=r36 ElementSize=4 Type=ud
+.declare mudMSGHDR_VCOPY Base=r38 ElementSize=4 Type=ud
+
+
+// First DI output header, based at r18 -- the same base as mudMSGHDR_DNDI above.
+.declare mudMSGHDR_DI_OUT1 Base=r18.0 ElementSize=4 Type=ud
+.declare mubMSGHDR_DI_OUT1 Base=r18.0 ElementSize=1 Type=ub
+
+
+// Second DI output header, based at r23; registers following it overlap the
+// ENC_STATS range that starts at r24.
+.declare mudMSGHDR_DI_OUT2 Base=r23.0 ElementSize=4 Type=ud
+.declare mubMSGHDR_DI_OUT2 Base=r23.0 ElementSize=1 Type=ub
+
+//r45
+//Use r45 as message header, so no need to "mov" the data.
+
+.declare mudDN_Y_OUT Base=r45.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+
+// Message response (Denoised & DI-ed pixels & statistics); Use buffer 5
+// ud / uw / ub views of the response, all based at r46.
+.declare udDNDI_RESP Base=r46.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare uwDNDI_RESP Base=r46.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare ubDNDI_RESP Base=r46.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+
+// Message response (UV Copy); Use buffer 5
+// ud / ub views based at r58.
+.declare udDNDI_UV_RESP Base=r58.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare ubDNDI_UV_RESP Base=r58.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+
+//Temp GRFs: For 42X to 422 Conversion
+// uw / ub views based at r10.
+.declare uwDNDI_UVCOPY_TEMP Base=r10.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw //8 GRFs
+.declare ubDNDI_UVCOPY_TEMP Base=r10.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub //8 GRFs
+//---------------------------------------------------------------------------
+// Message descriptors
+//---------------------------------------------------------------------------
+// Extended message descriptor
+ // Message descriptor for sampler read
+ // = 000 0010 (message len 2) 00000 (resp len - set later, 12 or 5 or 11)
+ // 1 (header present 1) 0 11 (SIMD32/64 mode)
+ // 1000 (message type) 0000 (DI state index)
+ // 00000000 (binding table index - set later)
+ // = 0x040b8000
+
+
+// Attention: The Message Length is The Number of GRFs with Data Only, without the Header
+
+
+//---------------------------------------------------------------------------
+// VDI Return Data format
+//---------------------------------------------------------------------------
+// Defines for DI enabled
+
+
+// Defines for DI disabled
+
+
+// Save two DI output blocks in packed format. Each message is a header register
+// (r18 / r23) followed by four payload registers that are filled through the
+// address registers a0.4 (Y), a0.5 (U) and a0.6 (V).
+
+// a0.4..a0.7 = bytes r2.28..r2.31 + 608. 608 = 19 * 32, i.e. the byte offset of
+// r19 assuming 32-byte registers -- the register right after the r18 header.
+add (4) a0.4<1>:uw r2.28<4;4,1>:ub 608:w // Initial Y,U,V offset in YUV422 block; base is r19 (first payload register)
+
+// Build the header template in r27: start from r0, then patch dwords 0..2.
+mov (8) r27.0<1>:ud r0.0<8;8,1>:ud
+shl (1) r27.0<1>:d r7.0<0;1,0>:w 1:w { NoDDClr } // H. block origin need to be doubled
+mov (1) r27.1<1>:d r7.1<0;1,0>:w { NoDDClr, NoDDChk } // Block origin
+mov (1) r27.2<1>:ud 0x3001F:ud { NoDDChk } // Block width and height (32x4)
+
+//prepare the message headers
+mov (8) r18.0<1>:ud r27<8;8,1>:ud
+mov (8) r23.0<1>:ud r27<8;8,1>:ud
+
+// Byte offsets 0/32/64/96 select the four payload registers after the r18 header.
+// Pack 2nd field Y
+ mov (16) r[a0.4, 0]<2> ubDNDI_RESP(0,0) { NoDDClr }
+ mov (16) r[a0.4, 32]<2> ubDNDI_RESP(0,16) { NoDDClr }
+ mov (16) r[a0.4, 64]<2> ubDNDI_RESP(0,32) { NoDDClr }
+ mov (16) r[a0.4, 96]<2> ubDNDI_RESP(0,48) { NoDDClr }
+// Pack 2nd field U
+ mov (8) r[a0.5, 0]<4> ubDNDI_RESP(2,1)<16;8,2> { NoDDClr, NoDDChk } //U pixels
+ mov (8) r[a0.5, 32]<4> ubDNDI_RESP(2,17)<16;8,2> { NoDDClr, NoDDChk } //U pixels
+ mov (8) r[a0.5, 64]<4> ubDNDI_RESP(2,33)<16;8,2> { NoDDClr, NoDDChk } //U pixels
+ mov (8) r[a0.5, 96]<4> ubDNDI_RESP(2,49)<16;8,2> { NoDDClr, NoDDChk } //U pixels
+// Pack 2nd field V
+ mov (8) r[a0.6, 0]<4> ubDNDI_RESP(2,0)<16;8,2> { NoDDChk } //Vpixels
+ mov (8) r[a0.6, 32]<4> ubDNDI_RESP(2,16)<16;8,2> { NoDDChk } //Vpixels
+ mov (8) r[a0.6, 64]<4> ubDNDI_RESP(2,32)<16;8,2> { NoDDChk } //Vpixels
+ mov (8) r[a0.6, 96]<4> ubDNDI_RESP(2,48)<16;8,2> { NoDDChk } //Vpixels
+
+// Byte offsets 160/192/224/256 select the four payload registers after the r23
+// header; the last of them is r27, which is free again once both headers have
+// been copied out of it above.
+// Pack 1st field Y
+ mov (16) r[a0.4, 160]<2> ubDNDI_RESP(4,0) { NoDDClr }
+ mov (16) r[a0.4, 192]<2> ubDNDI_RESP(4,16) { NoDDClr }
+ mov (16) r[a0.4, 224]<2> ubDNDI_RESP(4,32) { NoDDClr }
+ mov (16) r[a0.4, 256]<2> ubDNDI_RESP(4,48) { NoDDClr }
+// Pack 1st field U
+ mov (8) r[a0.5, 160]<4> ubDNDI_RESP(6,1)<16;8,2> { NoDDClr, NoDDChk } //U pixels
+ mov (8) r[a0.5, 192]<4> ubDNDI_RESP(6,17)<16;8,2> { NoDDClr, NoDDChk } //U pixels
+ mov (8) r[a0.5, 224]<4> ubDNDI_RESP(6,33)<16;8,2> { NoDDClr, NoDDChk } //U pixels
+ mov (8) r[a0.5, 256]<4> ubDNDI_RESP(6,49)<16;8,2> { NoDDClr, NoDDChk } //U pixels
+// Pack 1st field V
+ mov (8) r[a0.6, 160]<4> ubDNDI_RESP(6,0)<16;8,2> { NoDDChk } //Vpixels
+ mov (8) r[a0.6, 192]<4> ubDNDI_RESP(6,16)<16;8,2> { NoDDChk } //Vpixels
+ mov (8) r[a0.6, 224]<4> ubDNDI_RESP(6,32)<16;8,2> { NoDDChk } //Vpixels
+ mov (8) r[a0.6, 256]<4> ubDNDI_RESP(6,48)<16;8,2> { NoDDChk } //Vpixels
+
+//save the previous frame
+send (8) null<1>:d r18.0 0x5 0xA0A801B:ud
+
+//save the current frame
+send (8) null<1>:d r23.0 0x5 0xA0A801E:ud
--- /dev/null
+// 2 // Total instruction count
+// 1 // Total kernel count
+
+
+// Module name: common.inc
+//
+// Common header file for all Video-Processing kernels
+//
+
+// Assembler defaults: an instruction that omits its execution size uses 16,
+// and an operand that omits its type is treated as unsigned byte (:ub).
+.default_execution_size (16)
+.default_register_type :ub
+
+// Register counts declared for this kernel: 128 in total, 7 of them payload.
+.reg_count_total 128
+.reg_count_payload 7
+
+//========== Common constants ==========
+
+
+//========== Macros ==========
+
+
+//Fast Jump, For more details see "Set_Layer_N.asm"
+
+
+//========== Defines ====================
+
+//========== Static Parameters (Common To All) ==========
+//r1
+
+
+//r2
+
+ // e.g. byte0 byte1 byte2
+ // YUYV 0 1 3
+ // YVYU 0 3 1
+
+//Color Pipe (IECP) parameters
+
+
+//r4
+
+ // e.g. byte0 byte1 byte2
+ // YUYV 0 1 3
+ // YVYU 0 3 1
+
+
+//========== Inline parameters (Common To All) ===========
+
+
+//============== Binding Index Table===========
+//Common between DNDI and DNUV
+
+
+//================= Common Message Descriptor =====
+// Message descriptor for thread spawning
+// Message Descriptors
+// = 000 0001 (min message len 1 ) 0,0000 (resp len 0 -add later)
+// 0000,0000,0000
+// 0001(Spawn a root thread),0001 (Root thread spawn thread)
+// = 0x02000011
+// Thread Spawner Message Descriptor
+
+
+// Message descriptor for atomic operation add
+// Message Descriptors
+// = 000 0110 (min message len 6 ) 0,0000 (resp len 0 -add later)
+// 1(header present)001,10(typed atomic operation)0(return enabled)0(slot group, low 8 bits),0111 (AOP_Add)
+// 0000,0000 (Binding table index, added later)
+// = 0x0C098700
+
+// Atomic Operation Add Message Descriptor
+
+
+// Message descriptor for dataport media write
+ // Message Descriptors
+ // = 000 0001 (min message len 1 - add later) 00000 (resp len 0)
+ // 1 (header present 1) 0 1010 (media block write) 000000
+ // 00000000 (binding table index - set later)
+ // = 0x020A8000
+
+
+// Message Length defines
+
+
+// Response Length defines
+
+
+// Block Width and Height Size defines
+
+
+// Extended Message Descriptors
+
+
+// Common message descriptors:
+
+
+//===================== Math Function Control ===================================
+
+
+//============ Message Registers ===============
+ // buf4 starts from r28
+
+
+//#define mMSGHDR_EOT r43 // Dummy Message Register for EOT
+
+
+// Four typed views (ub / uw / ud / f) of the same storage, all based at r30.
+.declare mubMSGPAYLOAD Base=r30 ElementSize=1 SrcRegion=<16;16,1> Type=ub
+.declare muwMSGPAYLOAD Base=r30 ElementSize=2 SrcRegion=<16;16,1> Type=uw
+.declare mudMSGPAYLOAD Base=r30 ElementSize=4 SrcRegion=<8;8,1> Type=ud
+.declare mfMSGPAYLOAD Base=r30 ElementSize=4 SrcRegion=<8;8,1> Type=f
+
+//=================== End of thread instruction ===========================
+
+
+//=====================Pointers Used=====================================
+
+
+//=======================================================================
+
+
+//r9-r17
+// Define temp space for any usages
+
+
+// Common Buffers
+
+
+// End of common.inc
+
+//End of Thread message
+// Copies r0 into r127 and sends that single register with extended descriptor
+// 0x27 and descriptor 0x02000010; no response is requested (null destination).
+// NOTE(review): 0x27 presumably selects the thread spawner with the
+// end-of-thread bit set -- confirm against the hardware documentation.
+
+mov (8) r127<1>:ud r0.0<8;8,1>:ud
+ send (1) null<1>:d r127 0x27 0x02000010
--- /dev/null
+# Kernel binaries produced in this directory.
+INTEL_PP_G7B = \
+ dndi.g7b \
+ avs.g7b
+
+# Pre-processed kernel fragments that every .g7b target depends on.
+INTEL_PP_G4A = \
+ DI_Core.g4a \
+ DI_Save_NV12_16x4.g4a \
+ DI_Save_PA_16x4.g4a \
+ EOT.g4a \
+ VP_Setup.g4a \
+ Set_Layer_0.g4a \
+ PA_AVS_Buf_0.g4a \
+ PA_AVS_Buf_1.g4a \
+ PA_AVS_Buf_2.g4a \
+ PA_AVS_Buf_3.g4a \
+ PL2_AVS_Buf_0.g4a \
+ PL2_AVS_Buf_1.g4a \
+ PL2_AVS_Buf_2.g4a \
+ PL2_AVS_Buf_3.g4a \
+ PL3_AVS_Buf_0.g4a \
+ PL3_AVS_Buf_1.g4a \
+ PL3_AVS_Buf_2.g4a \
+ PL3_AVS_Buf_3.g4a \
+ Set_AVS_Buf_0123_VYUA.g4a \
+ Save_AVS_RGB.g4a \
+ Save_AVS_PA.g4a \
+ Save_AVS_PL3.g4a \
+ Save_AVS_NV12.g4a \
+ Set_AVS_Buf_0123_BGRA.g4a \
+ Set_AVS_Buf_0123_PL2.g4a \
+ Set_AVS_Buf_0123_PL3.g4a \
+ Set_AVS_Buf_0123_VUYA.g4a
+
+# Ship the prebuilt binaries and the fragments they are assembled from, so a
+# release tarball can rebuild the kernels when intel-gen4asm is available.
+EXTRA_DIST = \
+ $(INTEL_PP_G7B) \
+ $(INTEL_PP_G4A)
+
+if HAVE_GEN4ASM
+
+# Build one .g7b from the .asm of the same name: cpp -> gpp.py -> intel-gen4asm.
+#  - The source name is derived with sed rather than the bash-only
+#    ${var/pat/repl} expansion, so the recipe also works when make runs a
+#    POSIX /bin/sh.
+#  - Temporary files carry the source name, so the two targets do not overwrite
+#    each other's intermediates under "make -j".
+#  - The pipeline stops at the first failing step and its exit status is
+#    returned after the temporaries have been removed.
+$(INTEL_PP_G7B): $(INTEL_PP_G4A)
+	@_PP_SRC=`echo $@ | sed 's/\.g7b$$/.asm/'`; \
+	cpp $$_PP_SRC > _pp0_$$_PP_SRC && \
+	../../gpp.py _pp0_$$_PP_SRC _pp1_$$_PP_SRC && \
+	intel-gen4asm -a -o $@ -g 7 _pp1_$$_PP_SRC; \
+	_PP_STATUS=$$?; \
+	rm -f _pp0_$$_PP_SRC _pp1_$$_PP_SRC; \
+	exit $$_PP_STATUS
+
+BUILT_SOURCES= $(INTEL_PP_G7B)
+
+clean-local:
+	-rm -f $(INTEL_PP_G7B)
+
+endif
--- /dev/null
+// 9 // Total instruction count
+// 1 // Total kernel count
+
+
+// Module name: common.inc
+//
+// Common header file for all Video-Processing kernels
+//
+
+// Assembler defaults: an instruction that omits its execution size uses 16,
+// and an operand that omits its type is treated as unsigned byte (:ub).
+.default_execution_size (16)
+.default_register_type :ub
+
+// Register counts declared for this kernel: 128 in total, 7 of them payload.
+.reg_count_total 128
+.reg_count_payload 7
+
+//========== Common constants ==========
+
+
+//========== Macros ==========
+
+
+//Fast Jump, For more details see "Set_Layer_N.asm"
+
+
+//========== Defines ====================
+
+//========== Static Parameters (Common To All) ==========
+//r1
+
+
+//r2
+
+ // e.g. byte0 byte1 byte2
+ // YUYV 0 1 3
+ // YVYU 0 3 1
+
+//Color Pipe (IECP) parameters
+
+
+//r4
+
+ // e.g. byte0 byte1 byte2
+ // YUYV 0 1 3
+ // YVYU 0 3 1
+
+
+//========== Inline parameters (Common To All) ===========
+
+
+//============== Binding Index Table===========
+//Common between DNDI and DNUV
+
+
+//================= Common Message Descriptor =====
+// Message descriptor for thread spawning
+// Message Descriptors
+// = 000 0001 (min message len 1 ) 0,0000 (resp len 0 -add later)
+// 0000,0000,0000
+// 0001(Spawn a root thread),0001 (Root thread spawn thread)
+// = 0x02000011
+// Thread Spawner Message Descriptor
+
+
+// Message descriptor for atomic operation add
+// Message Descriptors
+// = 000 0110 (min message len 6 ) 0,0000 (resp len 0 -add later)
+// 1(header present)001,10(typed atomic operation)0(return enabled)0(slot group, low 8 bits),0111 (AOP_Add)
+// 0000,0000 (Binding table index, added later)
+// = 0x0C098700
+
+// Atomic Operation Add Message Descriptor
+
+
+// Message descriptor for dataport media write
+ // Message Descriptors
+ // = 000 0001 (min message len 1 - add later) 00000 (resp len 0)
+ // 1 (header present 1) 0 1010 (media block write) 000000
+ // 00000000 (binding table index - set later)
+ // = 0x020A8000
+
+
+// Message Length defines
+
+
+// Response Length defines
+
+
+// Block Width and Height Size defines
+
+
+// Extended Message Descriptors
+
+
+// Common message descriptors:
+
+
+//===================== Math Function Control ===================================
+
+
+//============ Message Registers ===============
+ // buf4 starts from r28
+
+
+//#define mMSGHDR_EOT r43 // Dummy Message Register for EOT
+
+// Four typed views (ub, uw, ud, f) of the same message payload area, all based at r30.
+.declare mubMSGPAYLOAD Base=r30 ElementSize=1 SrcRegion=<16;16,1> Type=ub
+.declare muwMSGPAYLOAD Base=r30 ElementSize=2 SrcRegion=<16;16,1> Type=uw
+.declare mudMSGPAYLOAD Base=r30 ElementSize=4 SrcRegion=<8;8,1> Type=ud
+.declare mfMSGPAYLOAD Base=r30 ElementSize=4 SrcRegion=<8;8,1> Type=f
+
+//=================== End of thread instruction ===========================
+
+
+//=====================Pointers Used=====================================
+
+
+//=======================================================================
+
+
+//r9-r17
+// Define temp space for any usages
+
+
+// Common Buffers
+
+
+// End of common.inc
+
+// FileName: PA_AVS_Buf_0.asm
+// Author: Vivek Kumar
+// Description: Loads 8x8 AVS/IEF Packed data into Buffer 0
+
+
+// FileName : PA_AVS_Buf.asm
+// Author : Tatiya, Rupesh
+// Description : Loads 8x8 AVS/IEF Packed data into Buffer N
+
+//On IVB, for AVS module - set buffer pointers offset according to AVS Layout.
+//Change it to Sample Unorm layout in Shuffle modules.
+
+
+// Module name: Scaling.inc
+
+
+// Description: Includes all definitions explicit to Fast Composite.
+
+
+// End of common.inc
+
+
+//========== GRF partition ==========
+ // r0 header : r0 (1 GRF)
+ // Static parameters : r1 - r6 (6 GRFS)
+ // Inline parameters : r7 - r8 (2 GRFs)
+ // MSGSRC : r27 (1 GRF)
+//===================================
+
+//Interface:
+//========== Static Parameters (Explicit To Fast Composite) ==========
+//r1
+//CSC Set 0
+
+// Unsigned dword view starting at r1.0 (r1 is the first static-parameter GRF per the GRF partition above).
+.declare udCSC_CURBE Base=r1.0 ElementSize=4 Type=ud
+
+//Constant alpha
+
+
+//r2
+
+
+// WiDi Definitions
+
+
+//Colorfill
+
+// Byte view based at r2.20; the <0;1,0> source region reads one element and replicates it to every channel.
+.declare ubCOLOR_PIXEL_VAL Base=r2.20 ElementSize=1 SrcRegion=<0;1,0> DstRegion=<1> Type=ub
+
+//r3
+//Normalised Ratio of Horizontal step size with main video for all layers
+
+
+ //Normalised Ratio of Horizontal step size with main video for all layers becomes
+ //Normalised Horizontal step size for all layers in VP_Setup.asm
+
+
+//r4
+//Normalised Vertical step size for all layers
+
+
+//r5
+//Normalised Vertical Frame Origin for all layers
+
+
+//r6
+//Normalised Horizontal Frame Origin for all layers
+
+
+//========== Inline Parameters (Explicit To Fast Composite) ==========
+
+
+//Main video Step X
+
+
+//====================== Binding table (Explicit To Fast Composite)=========================================
+
+
+//Used by Interlaced Scaling Kernels
+
+
+//========== Sampler State Table Index (Explicit To Fast Composite)==========
+//Sampler Index for AVS/IEF messages
+
+
+//Sampler Index for SIMD16 sampler messages
+
+
+//=============================================================================
+// Six buffers, each declared as five typed views. Bases: BUFFER_0..3 = r64/r80/r96/r112, BUFFER_4 = r28, BUFFER_5 = r46. Float views:
+.declare fBUFFER_0 Base=r64.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_1 Base=r80.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_2 Base=r96.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_3 Base=r112.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_4 Base=r28.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_5 Base=r46.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+// Unsigned dword views of the same registers.
+.declare udBUFFER_0 Base=r64.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_1 Base=r80.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_2 Base=r96.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_3 Base=r112.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_4 Base=r28.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_5 Base=r46.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+// Unsigned word views of the same registers.
+.declare uwBUFFER_0 Base=r64.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_1 Base=r80.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_2 Base=r96.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_3 Base=r112.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_4 Base=r28.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_5 Base=r46.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+// Unsigned byte views with unit stride.
+.declare ubBUFFER_0 Base=r64.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_1 Base=r80.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_2 Base=r96.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_3 Base=r112.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_4 Base=r28.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_5 Base=r46.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+// Strided byte views: source region <32;8,4> and destination stride <4> address every fourth byte.
+.declare ub4BUFFER_0 Base=r64.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_1 Base=r80.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_2 Base=r96.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_3 Base=r112.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_4 Base=r28.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_5 Base=r46.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+
+//Pointer to mask reg
+
+
+//r18
+
+
+//Always keep Channel Pointers and Offsets in same GRF, so that we can use
+// NODDCLR, NODDCHK flags. -rT
+
+// Unsigned dword view of r18.
+.declare udCSC_COEFF_0 Base=r18.0 ElementSize=4 Type=ud // 1 GRF
+
+//r19
+
+// Unsigned dword view of r19.
+.declare udCSC_COEFF_1 Base=r19.0 ElementSize=4 Type=ud // 1 GRF
+
+
+//r20
+// Unsigned word view of r20.
+.declare uwALPHA_MASK_REG_TEMP Base=r20.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw // 1 GRF
+
+//r21
+// Unsigned word view of r21.
+.declare uwALPHA_MASK_REG Base=r21.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw // 1 GRF
+
+//r22
+
+
+//Always keep Channel Pointers and Offsets in same GRF, so that we can use
+// NODDCLR, NODDCHK flags. -rT
+
+
+//Keep fORIGIN_X_NLAS, fY_OFFSET_2ND_BLOCK, fSTEP_X_NLAS, pMSGDSC_COPY, ubCONST_ALPHA_COPY as
+//sub registers of same GRF to enable using NODDCLR NODDCHK. -rT
+
+//r23
+
+
+//Lumakey
+
+
+//r24
+
+
+//r25
+
+
+//r26
+
+
+//defines to generate LABELS during compile time.
+
+
+ // Message Header
+ // m0.7 31:0 Debug
+ // m0.6 31:0 Debug
+ // m0.5 31:0 Ignored
+ // m0.4 31:0 Ignored
+ // m0.3 31:0 Ignored
+ // m0.2 31:16 Ignored
+ // 15 Alpha Write Channel Mask enable=0, disable=1
+ // 14 Blue Write Channel Mask (U)
+ // 13 Green Write Channel Mask (Y)
+ // 12 Red Write Channel Mask (V)
+ // 11:0 Ignored
+ // m0.1 Ignored
+ // m0.0 Ignored
+
+
+ // AVS payload
+ // m1.7 Group ID Number
+ // m1.6 U 2nd Derivative ---> NLAS dx
+ // m1.5 Delta V ---> Step Y
+ // m1.4 Delta U ---> Step X
+ // m1.3 Pixel 0 V Address ---> ORIY (Y0)
+ // m1.2 Pixel 0 U Address ---> ORIX (X0)
+ // m1.1 Vertical Block Number
+ // m1.0 Reserved
+
+ // Sampler Message Descriptor
+ // 31:29 Reserved 000
+ // 28:25 Message length 0010
+ // 24:20 Response length xxxxx ---> 4GRFs for each enabled channel (AVS), 2GRFs for each enabled channel (sample unorm)
+ // 19 Header Present 1
+ // 18:17 SIMD Mode 11 ---> SIMD32/64
+ // 16:12 Message Type xxxxx ---> 01011 sample_8x8, 01100 (sample_unorm), 01010 (sample_unorm+killpix)
+ // 11:8 Sampler Index xxxx
+ // 7:0 Binding Table Index xxxxxxxx
+
+
+ // Msg Header M0.2
+ // 15:15 Alpha Write Channel Mask, 0: written back, 1: not written back
+ // 14:14 Blue Write Channel Mask
+ // 13:13 Green Write Channel Mask
+ // 12:12 Red Write Channel Mask
+
+
+//By design, Buffer 0,1,2,3 always have Layer 0 and Buffer 4,5 always have L1-L7
+
+
+//used to generate LABELS at compile time.
+
+
+ // 18:17 SIMD Mode 10 ---> SIMD16
+ // 16:12 Message Type xxxxx ---> 00000 (SIMD16)
+
+
+//r10-17 - 8 GRFs to load SIMD16 data (up to 4 channels)
+//r18-19 - 2 GRFs to store sampler ramp.
+// NOTE(review): the declarations below base the payload views at r14 and the temp/ramp views at r9, which differs from the ranges quoted above - confirm which is current.
+ .declare mfSCALING_0X_34X_PAYLOAD Base=r14.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+ .declare muwSCALING_0X_34X_PAYLOAD Base=r14.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+ .declare mudCALING_0X_34X_PAYLOAD Base=r14.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+ .declare mubCALING_0X_34X_PAYLOAD Base=r14.0 ElementSize=1 SrcRegion=<32;32,1> DstRegion=<1> Type=ub
+ // NOTE(review): 'mudCALING'/'mubCALING' lack the 'S' used by the f/uw siblings; spelling kept because other modules may reference these names.
+
+ .declare fSCALING_0X_34X_TEMP Base=r9.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+ .declare udSCALING_0X_34X_TEMP Base=r9.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+ .declare ub4SCALING_0X_34X_TEMP Base=r9.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<1> Type=ub
+ .declare uwSCALING_0X_34X_TEMP Base=r9.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+ // fSAMPLER_RAMP below shares its base (r9.0) with the *SCALING_0X_34X_TEMP views above.
+ // Sampler ramp is used for Scaling 0X_0.34X
+ .declare fSAMPLER_RAMP Base=r9.0 ElementSize=4 SrcRegion=<8;8,1> Type=f // 1 GRF, 8 elements
+
+
+ //#define rMSGDSC_UV r23.0
+
+
+//End of _SCALING_
+
+
+ //NOTE: We need offsets for the second half of LAYER 0 - even if we do not load it.
+ //Update the channel offset in the buffers for the lower 8x4 data for BUFFER_0.
+ mov (1) r22.4<1>:ud 0x400040:ud // single dword write: two 16-bit fields of 0x0040 each
+
+
+ //Check if layer is to be skipped
+
+ // Inverted predicate: the jump is taken (load skipped) when f0.1 is clear.
+ // f0.1 pre-computed in Set_Layer_0
+ (-f0.1) jmpi (1) SKIP_AVS_LOAD_L0_0_
+ // a0.0 = r23.5 + 0x050EB400. Per the sampler descriptor layout above, the constant encodes
+ // message length 2, response length 16, header present, SIMD32/64, sample_8x8, sampler index 4.
+ //AVS_PAYLOAD already has all the data loaded at this point
+ add (1) a0.0:ud r23.5<0;1,0>:ud 0x50EB400:ud //msg desc
+ // r16 is the first message register (header); dword 2 holds the channel write masks, 0 = written back.
+ mov (1) r16.2:ud 0x00000000:ud // Enable ARGB channels
+
+ // r25 is staged as the AVS payload (m1): dword 7 = group ID, dword 1 = vertical block number.
+ //OPT: rAVS_PAYLOAD.1 and .7 --> use NODDCLR, NODDCHK -rT
+ mov (1) r25.7<1>:ud r7.7:ud { NoDDClr }
+ mov (1) r25.1<1>:ud r7.12:uw { NoDDChk }
+
+
+ // set the vertical block number
+ // (for buffer 0 this is the r25.1 move above; no offset is added to r7.12:uw)
+ // Copy the staged payload into r17 (second message register), then send the two-register message r16-r17.
+ mov (8) r17.0:ud r25.0<8;8,1>:ud
+ send (1) uwBUFFER_0(0)<1> r16 0x2 a0.0:ud
+ // Returns packed data in 16 GRFs in scrambled order (uwBUFFER_0 is based at r64)
+
+SKIP_AVS_LOAD_L0_0_:
+ nop
+
+
--- /dev/null
+// 7 // Total instruction count
+// 1 // Total kernel count
+
+
+// Module name: common.inc
+//
+// Common header file for all Video-Processing kernels
+//
+
+.default_execution_size (16) // execution size used when an instruction gives none
+.default_register_type :ub // operand type used when an operand gives none
+// Register budget declared to the assembler (payload presumably r0 plus static parameters r1-r6 - confirm):
+.reg_count_total 128
+.reg_count_payload 7
+
+//========== Common constants ==========
+
+
+//========== Macros ==========
+
+
+//Fast Jump, For more details see "Set_Layer_N.asm"
+
+
+//========== Defines ====================
+
+//========== Static Parameters (Common To All) ==========
+//r1
+
+
+//r2
+
+ // e.g. byte0 byte1 byte2
+ // YUYV 0 1 3
+ // YVYU 0 3 1
+
+//Color Pipe (IECP) parameters
+
+
+//r4
+
+ // e.g. byte0 byte1 byte2
+ // YUYV 0 1 3
+ // YVYU 0 3 1
+
+
+//========== Inline parameters (Common To All) ===========
+
+
+//============== Binding Index Table===========
+//Common between DNDI and DNUV
+
+
+//================= Common Message Descriptor =====
+// Message descriptor for thread spawning
+// Message Descriptors
+// = 000 0001 (min message len 1 ) 0,0000 (resp len 0 -add later)
+// 0000,0000,0000
+// 0001(Spawn a root thread),0001 (Root thread spawn thread)
+// = 0x02000011
+// Thread Spawner Message Descriptor
+
+
+// Message descriptor for atomic operation add
+// Message Descriptors
+// = 000 0110 (min message len 6 ) 0,0000 (resp len 0 -add later)
+// 1(header present)001,10(typed atomic operation)0(return enabled)0(slot group, low 8 bits),0111 (AOP_Add)
+// 0000,0000 (Binding table index, added later)
+// = 0x0C098700
+
+// Atomic Operation Add Message Descriptor
+
+
+// Message descriptor for dataport media write
+ // Message Descriptors
+ // = 000 0001 (min message len 1 - add later) 00000 (resp len 0)
+ // 1 (header present 1) 0 1010 (media block write) 000000
+ // 00000000 (binding table index - set later)
+ // = 0x020A8000
+
+
+// Message Length defines
+
+
+// Response Length defines
+
+
+// Block Width and Height Size defines
+
+
+// Extended Message Descriptors
+
+
+// Common message descriptors:
+
+
+//===================== Math Function Control ===================================
+
+
+//============ Message Registers ===============
+ // buf4 starts from r28
+
+
+//#define mMSGHDR_EOT r43 // Dummy Message Register for EOT
+
+// Four typed views (ub, uw, ud, f) of the same message payload area, all based at r30.
+.declare mubMSGPAYLOAD Base=r30 ElementSize=1 SrcRegion=<16;16,1> Type=ub
+.declare muwMSGPAYLOAD Base=r30 ElementSize=2 SrcRegion=<16;16,1> Type=uw
+.declare mudMSGPAYLOAD Base=r30 ElementSize=4 SrcRegion=<8;8,1> Type=ud
+.declare mfMSGPAYLOAD Base=r30 ElementSize=4 SrcRegion=<8;8,1> Type=f
+
+//=================== End of thread instruction ===========================
+
+
+//=====================Pointers Used=====================================
+
+
+//=======================================================================
+
+
+//r9-r17
+// Define temp space for any usages
+
+
+// Common Buffers
+
+
+// End of common.inc
+
+// FileName: PA_AVS_Buf_1.asm
+// Author: Vivek Kumar
+// Description: Loads 8x8 AVS/IEF Packed data into Buffer 1
+
+
+// FileName : PA_AVS_Buf.asm
+// Author : Tatiya, Rupesh
+// Description : Loads 8x8 AVS/IEF Packed data into Buffer N
+
+//On IVB, for AVS module - set buffer pointers offset according to AVS Layout.
+//Change it to Sample Unorm layout in Shuffle modules.
+
+
+// Module name: Scaling.inc
+
+
+// Description: Includes all definitions explicit to Fast Composite.
+
+
+// End of common.inc
+
+
+//========== GRF partition ==========
+ // r0 header : r0 (1 GRF)
+ // Static parameters : r1 - r6 (6 GRFS)
+ // Inline parameters : r7 - r8 (2 GRFs)
+ // MSGSRC : r27 (1 GRF)
+//===================================
+
+//Interface:
+//========== Static Parameters (Explicit To Fast Composite) ==========
+//r1
+//CSC Set 0
+
+// Unsigned dword view starting at r1.0 (r1 is the first static-parameter GRF per the GRF partition above).
+.declare udCSC_CURBE Base=r1.0 ElementSize=4 Type=ud
+
+//Constant alpha
+
+
+//r2
+
+
+// WiDi Definitions
+
+
+//Colorfill
+
+// Byte view based at r2.20; the <0;1,0> source region reads one element and replicates it to every channel.
+.declare ubCOLOR_PIXEL_VAL Base=r2.20 ElementSize=1 SrcRegion=<0;1,0> DstRegion=<1> Type=ub
+
+//r3
+//Normalised Ratio of Horizontal step size with main video for all layers
+
+
+ //Normalised Ratio of Horizontal step size with main video for all layers becomes
+ //Normalised Horizontal step size for all layers in VP_Setup.asm
+
+
+//r4
+//Normalised Vertical step size for all layers
+
+
+//r5
+//Normalised Vertical Frame Origin for all layers
+
+
+//r6
+//Normalised Horizontal Frame Origin for all layers
+
+
+//========== Inline Parameters (Explicit To Fast Composite) ==========
+
+
+//Main video Step X
+
+
+//====================== Binding table (Explicit To Fast Composite)=========================================
+
+
+//Used by Interlaced Scaling Kernels
+
+
+//========== Sampler State Table Index (Explicit To Fast Composite)==========
+//Sampler Index for AVS/IEF messages
+
+
+//Sampler Index for SIMD16 sampler messages
+
+
+//=============================================================================
+// Six buffers, each declared as five typed views. Bases: BUFFER_0..3 = r64/r80/r96/r112, BUFFER_4 = r28, BUFFER_5 = r46. Float views:
+.declare fBUFFER_0 Base=r64.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_1 Base=r80.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_2 Base=r96.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_3 Base=r112.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_4 Base=r28.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_5 Base=r46.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+// Unsigned dword views of the same registers.
+.declare udBUFFER_0 Base=r64.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_1 Base=r80.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_2 Base=r96.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_3 Base=r112.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_4 Base=r28.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_5 Base=r46.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+// Unsigned word views of the same registers.
+.declare uwBUFFER_0 Base=r64.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_1 Base=r80.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_2 Base=r96.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_3 Base=r112.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_4 Base=r28.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_5 Base=r46.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+// Unsigned byte views with unit stride.
+.declare ubBUFFER_0 Base=r64.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_1 Base=r80.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_2 Base=r96.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_3 Base=r112.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_4 Base=r28.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_5 Base=r46.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+// Strided byte views: source region <32;8,4> and destination stride <4> address every fourth byte.
+.declare ub4BUFFER_0 Base=r64.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_1 Base=r80.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_2 Base=r96.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_3 Base=r112.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_4 Base=r28.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_5 Base=r46.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+
+//Pointer to mask reg
+
+
+//r18
+
+
+//Always keep Channel Pointers and Offsets in same GRF, so that we can use
+// NODDCLR, NODDCHK flags. -rT
+
+// Unsigned dword view of r18.
+.declare udCSC_COEFF_0 Base=r18.0 ElementSize=4 Type=ud // 1 GRF
+
+//r19
+
+// Unsigned dword view of r19.
+.declare udCSC_COEFF_1 Base=r19.0 ElementSize=4 Type=ud // 1 GRF
+
+
+//r20
+// Unsigned word view of r20.
+.declare uwALPHA_MASK_REG_TEMP Base=r20.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw // 1 GRF
+
+//r21
+// Unsigned word view of r21.
+.declare uwALPHA_MASK_REG Base=r21.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw // 1 GRF
+
+//r22
+
+
+//Always keep Channel Pointers and Offsets in same GRF, so that we can use
+// NODDCLR, NODDCHK flags. -rT
+
+
+//Keep fORIGIN_X_NLAS, fY_OFFSET_2ND_BLOCK, fSTEP_X_NLAS, pMSGDSC_COPY, ubCONST_ALPHA_COPY as
+//sub registers of same GRF to enable using NODDCLR NODDCHK. -rT
+
+//r23
+
+
+//Lumakey
+
+
+//r24
+
+
+//r25
+
+
+//r26
+
+
+//defines to generate LABELS during compile time.
+
+
+ // Message Header
+ // m0.7 31:0 Debug
+ // m0.6 31:0 Debug
+ // m0.5 31:0 Ignored
+ // m0.4 31:0 Ignored
+ // m0.3 31:0 Ignored
+ // m0.2 31:16 Ignored
+ // 15 Alpha Write Channel Mask enable=0, disable=1
+ // 14 Blue Write Channel Mask (U)
+ // 13 Green Write Channel Mask (Y)
+ // 12 Red Write Channel Mask (V)
+ // 11:0 Ignored
+ // m0.1 Ignored
+ // m0.0 Ignored
+
+
+ // AVS payload
+ // m1.7 Group ID Number
+ // m1.6 U 2nd Derivative ---> NLAS dx
+ // m1.5 Delta V ---> Step Y
+ // m1.4 Delta U ---> Step X
+ // m1.3 Pixel 0 V Address ---> ORIY (Y0)
+ // m1.2 Pixel 0 U Address ---> ORIX (X0)
+ // m1.1 Vertical Block Number
+ // m1.0 Reserved
+
+ // Sampler Message Descriptor
+ // 31:29 Reserved 000
+ // 28:25 Message length 0010
+ // 24:20 Response length xxxxx ---> 4GRFs for each enabled channel (AVS), 2GRFs for each enabled channel (sample unorm)
+ // 19 Header Present 1
+ // 18:17 SIMD Mode 11 ---> SIMD32/64
+ // 16:12 Message Type xxxxx ---> 01011 sample_8x8, 01100 (sample_unorm), 01010 (sample_unorm+killpix)
+ // 11:8 Sampler Index xxxx
+ // 7:0 Binding Table Index xxxxxxxx
+
+
+ // Msg Header M0.2
+ // 15:15 Alpha Write Channel Mask, 0: written back, 1: not written back
+ // 14:14 Blue Write Channel Mask
+ // 13:13 Green Write Channel Mask
+ // 12:12 Red Write Channel Mask
+
+
+//By design, Buffer 0,1,2,3 always have Layer 0 and Buffer 4,5 always have L1-L7
+
+
+//used to generate LABELS at compile time.
+
+
+ // 18:17 SIMD Mode 10 ---> SIMD16
+ // 16:12 Message Type xxxxx ---> 00000 (SIMD16)
+
+
+//r10-17 - 8 GRFs to load SIMD16 data (up to 4 channels)
+//r18-19 - 2 GRFs to store sampler ramp.
+// NOTE(review): the declarations below base the payload views at r14 and the temp/ramp views at r9, which differs from the ranges quoted above - confirm which is current.
+ .declare mfSCALING_0X_34X_PAYLOAD Base=r14.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+ .declare muwSCALING_0X_34X_PAYLOAD Base=r14.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+ .declare mudCALING_0X_34X_PAYLOAD Base=r14.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+ .declare mubCALING_0X_34X_PAYLOAD Base=r14.0 ElementSize=1 SrcRegion=<32;32,1> DstRegion=<1> Type=ub
+ // NOTE(review): 'mudCALING'/'mubCALING' lack the 'S' used by the f/uw siblings; spelling kept because other modules may reference these names.
+
+ .declare fSCALING_0X_34X_TEMP Base=r9.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+ .declare udSCALING_0X_34X_TEMP Base=r9.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+ .declare ub4SCALING_0X_34X_TEMP Base=r9.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<1> Type=ub
+ .declare uwSCALING_0X_34X_TEMP Base=r9.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+ // fSAMPLER_RAMP below shares its base (r9.0) with the *SCALING_0X_34X_TEMP views above.
+ // Sampler ramp is used for Scaling 0X_0.34X
+ .declare fSAMPLER_RAMP Base=r9.0 ElementSize=4 SrcRegion=<8;8,1> Type=f // 1 GRF, 8 elements
+
+
+ //#define rMSGDSC_UV r23.0
+
+
+//End of _SCALING_
+
+
+ //Check if layer is to be skipped
+
+ // Inverted predicate: the jump is taken (load skipped) when f0.1 is clear.
+ // f0.1 pre-computed in Set_Layer_0
+ (-f0.1) jmpi (1) SKIP_AVS_LOAD_L0_1_
+ // a0.0 = r23.5 + 0x050EB400. Per the sampler descriptor layout above, the constant encodes
+ // message length 2, response length 16, header present, SIMD32/64, sample_8x8, sampler index 4.
+ //AVS_PAYLOAD already has all the data loaded at this point
+ add (1) a0.0:ud r23.5<0;1,0>:ud 0x50EB400:ud //msg desc
+ // r16 is the first message register (header); dword 2 holds the channel write masks, 0 = written back.
+ mov (1) r16.2:ud 0x00000000:ud // Enable ARGB channels
+
+ // Only dword 1 of r25 is written in this module; per the comment above, the rest of the payload is already in place.
+ // set the vertical block number
+ // r25.1 = r7.12:uw + 1 (m1.1 is the vertical block number in the AVS payload layout above)
+ add (1) r25.1<1>:ud r7.12:uw 1:ud
+
+ // Copy the staged payload into r17 (second message register), then send the two-register message r16-r17.
+ mov (8) r17.0:ud r25.0<8;8,1>:ud
+ send (1) uwBUFFER_1(0)<1> r16 0x2 a0.0:ud
+ // Returns packed data in 16 GRFs in scrambled order (uwBUFFER_1 is based at r80)
+
+SKIP_AVS_LOAD_L0_1_:
+ nop
+
+
--- /dev/null
+// 7 // Total instruction count
+// 1 // Total kernel count
+
+
+// Module name: common.inc
+//
+// Common header file for all Video-Processing kernels
+//
+
+.default_execution_size (16) // execution size used when an instruction gives none
+.default_register_type :ub // operand type used when an operand gives none
+// Register budget declared to the assembler (payload presumably r0 plus static parameters r1-r6 - confirm):
+.reg_count_total 128
+.reg_count_payload 7
+
+//========== Common constants ==========
+
+
+//========== Macros ==========
+
+
+//Fast Jump, For more details see "Set_Layer_N.asm"
+
+
+//========== Defines ====================
+
+//========== Static Parameters (Common To All) ==========
+//r1
+
+
+//r2
+
+ // e.g. byte0 byte1 byte2
+ // YUYV 0 1 3
+ // YVYU 0 3 1
+
+//Color Pipe (IECP) parameters
+
+
+//r4
+
+ // e.g. byte0 byte1 byte2
+ // YUYV 0 1 3
+ // YVYU 0 3 1
+
+
+//========== Inline parameters (Common To All) ===========
+
+
+//============== Binding Index Table===========
+//Common between DNDI and DNUV
+
+
+//================= Common Message Descriptor =====
+// Message descriptor for thread spawning
+// Message Descriptors
+// = 000 0001 (min message len 1 ) 0,0000 (resp len 0 -add later)
+// 0000,0000,0000
+// 0001(Spawn a root thread),0001 (Root thread spawn thread)
+// = 0x02000011
+// Thread Spawner Message Descriptor
+
+
+// Message descriptor for atomic operation add
+// Message Descriptors
+// = 000 0110 (min message len 6 ) 0,0000 (resp len 0 -add later)
+// 1(header present)001,10(typed atomic operation)0(return enabled)0(slot group, low 8 bits),0111 (AOP_Add)
+// 0000,0000 (Binding table index, added later)
+// = 0x0C098700
+
+// Atomic Operation Add Message Descriptor
+
+
+// Message descriptor for dataport media write
+ // Message Descriptors
+ // = 000 0001 (min message len 1 - add later) 00000 (resp len 0)
+ // 1 (header present 1) 0 1010 (media block write) 000000
+ // 00000000 (binding table index - set later)
+ // = 0x020A8000
+
+
+// Message Length defines
+
+
+// Response Length defines
+
+
+// Block Width and Height Size defines
+
+
+// Extended Message Descriptors
+
+
+// Common message descriptors:
+
+
+//===================== Math Function Control ===================================
+
+
+//============ Message Registers ===============
+ // buf4 starts from r28
+
+
+//#define mMSGHDR_EOT r43 // Dummy Message Register for EOT
+
+// Four typed views (ub, uw, ud, f) of the same message payload area, all based at r30.
+.declare mubMSGPAYLOAD Base=r30 ElementSize=1 SrcRegion=<16;16,1> Type=ub
+.declare muwMSGPAYLOAD Base=r30 ElementSize=2 SrcRegion=<16;16,1> Type=uw
+.declare mudMSGPAYLOAD Base=r30 ElementSize=4 SrcRegion=<8;8,1> Type=ud
+.declare mfMSGPAYLOAD Base=r30 ElementSize=4 SrcRegion=<8;8,1> Type=f
+
+//=================== End of thread instruction ===========================
+
+
+//=====================Pointers Used=====================================
+
+
+//=======================================================================
+
+
+//r9-r17
+// Define temp space for any usages
+
+
+// Common Buffers
+
+
+// End of common.inc
+
+// FileName: PA_AVS_Buf_2.asm
+// Author: Vivek Kumar
+// Description: Loads 8x8 AVS/IEF Packed data into Buffer 2
+
+
+// FileName : PA_AVS_Buf.asm
+// Author : Tatiya, Rupesh
+// Description : Loads 8x8 AVS/IEF Packed data into Buffer N
+
+//On IVB, for AVS module - set buffer pointers offset according to AVS Layout.
+//Change it to Sample Unorm layout in Shuffle modules.
+
+
+// Module name: Scaling.inc
+
+
+// Description: Includes all definitions explicit to Fast Composite.
+
+
+// End of common.inc
+
+
+//========== GRF partition ==========
+ // r0 header : r0 (1 GRF)
+ // Static parameters : r1 - r6 (6 GRFS)
+ // Inline parameters : r7 - r8 (2 GRFs)
+ // MSGSRC : r27 (1 GRF)
+//===================================
+
+//Interface:
+//========== Static Parameters (Explicit To Fast Composite) ==========
+//r1
+//CSC Set 0
+
+// Unsigned dword view starting at r1.0 (r1 is the first static-parameter GRF per the GRF partition above).
+.declare udCSC_CURBE Base=r1.0 ElementSize=4 Type=ud
+
+//Constant alpha
+
+
+//r2
+
+
+// WiDi Definitions
+
+
+//Colorfill
+
+// Byte view based at r2.20; the <0;1,0> source region reads one element and replicates it to every channel.
+.declare ubCOLOR_PIXEL_VAL Base=r2.20 ElementSize=1 SrcRegion=<0;1,0> DstRegion=<1> Type=ub
+
+//r3
+//Normalised Ratio of Horizontal step size with main video for all layers
+
+
+ //Normalised Ratio of Horizontal step size with main video for all layers becomes
+ //Normalised Horizontal step size for all layers in VP_Setup.asm
+
+
+//r4
+//Normalised Vertical step size for all layers
+
+
+//r5
+//Normalised Vertical Frame Origin for all layers
+
+
+//r6
+//Normalised Horizontal Frame Origin for all layers
+
+
+//========== Inline Parameters (Explicit To Fast Composite) ==========
+
+
+//Main video Step X
+
+
+//====================== Binding table (Explicit To Fast Composite)=========================================
+
+
+//Used by Interlaced Scaling Kernels
+
+
+//========== Sampler State Table Index (Explicit To Fast Composite)==========
+//Sampler Index for AVS/IEF messages
+
+
+//Sampler Index for SIMD16 sampler messages
+
+
+//=============================================================================
+// Six buffers, each declared as five typed views. Bases: BUFFER_0..3 = r64/r80/r96/r112, BUFFER_4 = r28, BUFFER_5 = r46. Float views:
+.declare fBUFFER_0 Base=r64.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_1 Base=r80.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_2 Base=r96.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_3 Base=r112.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_4 Base=r28.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_5 Base=r46.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+// Unsigned dword views of the same registers.
+.declare udBUFFER_0 Base=r64.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_1 Base=r80.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_2 Base=r96.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_3 Base=r112.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_4 Base=r28.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_5 Base=r46.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+// Unsigned word views of the same registers.
+.declare uwBUFFER_0 Base=r64.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_1 Base=r80.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_2 Base=r96.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_3 Base=r112.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_4 Base=r28.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_5 Base=r46.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+// Unsigned byte views with unit stride.
+.declare ubBUFFER_0 Base=r64.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_1 Base=r80.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_2 Base=r96.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_3 Base=r112.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_4 Base=r28.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_5 Base=r46.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+// Strided byte views: source region <32;8,4> and destination stride <4> address every fourth byte.
+.declare ub4BUFFER_0 Base=r64.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_1 Base=r80.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_2 Base=r96.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_3 Base=r112.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_4 Base=r28.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_5 Base=r46.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+
+//Pointer to mask reg
+
+
+//r18
+
+
+//Always keep Channel Pointers and Offsets in same GRF, so that we can use
+// NODDCLR, NODDCHK flags. -rT
+
+// Unsigned dword view of r18.
+.declare udCSC_COEFF_0 Base=r18.0 ElementSize=4 Type=ud // 1 GRF
+
+//r19
+
+// Unsigned dword view of r19.
+.declare udCSC_COEFF_1 Base=r19.0 ElementSize=4 Type=ud // 1 GRF
+
+
+//r20
+// Unsigned word view of r20.
+.declare uwALPHA_MASK_REG_TEMP Base=r20.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw // 1 GRF
+
+//r21
+// Unsigned word view of r21.
+.declare uwALPHA_MASK_REG Base=r21.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw // 1 GRF
+
+//r22
+
+
+//Always keep Channel Pointers and Offsets in same GRF, so that we can use
+// NODDCLR, NODDCHK flags. -rT
+
+
+//Keep fORIGIN_X_NLAS, fY_OFFSET_2ND_BLOCK, fSTEP_X_NLAS, pMSGDSC_COPY, ubCONST_ALPHA_COPY as
+//sub registers of same GRF to enable using NODDCLR NODDCHK. -rT
+
+//r23
+
+
+//Lumakey
+
+
+//r24
+
+
+//r25
+
+
+//r26
+
+
+//defines to generate LABELS during compile time.
+
+
+ // Message Header
+ // m0.7 31:0 Debug
+ // m0.6 31:0 Debug
+ // m0.5 31:0 Ignored
+ // m0.4 31:0 Ignored
+ // m0.3 31:0 Ignored
+ // m0.2 31:16 Ignored
+ // 15 Alpha Write Channel Mask enable=0, disable=1
+ // 14 Blue Write Channel Mask (U)
+ // 13 Green Write Channel Mask (Y)
+ // 12 Red Write Channel Mask (V)
+ // 11:0 Ignored
+ // m0.1 Ignored
+ // m0.0 Ignored
+
+
+ // AVS payload
+ // m1.7 Group ID Number
+ // m1.6 U 2nd Derivative ---> NLAS dx
+ // m1.5 Delta V ---> Step Y
+ // m1.4 Delta U ---> Step X
+ // m1.3 Pixel 0 V Address ---> ORIY (Y0)
+ // m1.2 Pixel 0 U Address ---> ORIX (X0)
+ // m1.1 Vertical Block Number
+ // m1.0 Reserved
+
+ // Sampler Message Descriptor
+ // 31:29 Reserved 000
+ // 28:25 Message length 0010
+ // 24:20 Response length xxxxx ---> 4GRFs for each enabled channel (AVS), 2GRFs for each enabled channel (sample unorm)
+ // 19 Header Present 1
+ // 18:17 SIMD Mode 11 ---> SIMD32/64
+ // 16:12 Message Type xxxxx ---> 01011 sample_8x8, 01100 (sample_unorm), 01010 (sample_unorm+killpix)
+ // 11:8 Sampler Index xxxx
+ // 7:0 Binding Table Index xxxxxxxx
+
+
+ // Msg Header M0.2
+ // 15:15 Alpha Write Channel Mask, 0: written back, 1: not written back
+ // 14:14 Blue Write Channel Mask
+ // 13:13 Green Write Channel Mask
+ // 12:12 Red Write Channel Mask
+
+
+//By design, Buffer 0,1,2,3 always have Layer 0 and Buffer 4,5 always have L1-L7
+
+
+//used to generate LABELS at compile time.
+
+
+ // 18:17 SIMD Mode 10 ---> SIMD16
+ // 16:12 Message Type xxxxx ---> 00000 (SIMD16)
+
+
+//r10-17 - 8 GRFs to load SIMD16 data (up to 4 channels)
+//r18-19 - 2 GRFs to store sampler ramp.
+// NOTE(review): the declarations below base the payload views at r14 and the temp/ramp views at r9, which differs from the ranges quoted above - confirm which is current.
+ .declare mfSCALING_0X_34X_PAYLOAD Base=r14.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+ .declare muwSCALING_0X_34X_PAYLOAD Base=r14.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+ .declare mudCALING_0X_34X_PAYLOAD Base=r14.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+ .declare mubCALING_0X_34X_PAYLOAD Base=r14.0 ElementSize=1 SrcRegion=<32;32,1> DstRegion=<1> Type=ub
+ // NOTE(review): 'mudCALING'/'mubCALING' lack the 'S' used by the f/uw siblings; spelling kept because other modules may reference these names.
+
+ .declare fSCALING_0X_34X_TEMP Base=r9.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+ .declare udSCALING_0X_34X_TEMP Base=r9.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+ .declare ub4SCALING_0X_34X_TEMP Base=r9.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<1> Type=ub
+ .declare uwSCALING_0X_34X_TEMP Base=r9.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+ // fSAMPLER_RAMP below shares its base (r9.0) with the *SCALING_0X_34X_TEMP views above.
+ // Sampler ramp is used for Scaling 0X_0.34X
+ .declare fSAMPLER_RAMP Base=r9.0 ElementSize=4 SrcRegion=<8;8,1> Type=f // 1 GRF, 8 elements
+
+
+ //#define rMSGDSC_UV r23.0
+
+
+//End of _SCALING_
+
+
+ //Check if layer is to be skipped: BUFFER_2 is loaded only when predicate f0.1 is set
+ // (the jump below is taken on the inverse of f0.1 and bypasses the whole load).
+
+ // f0.1 pre-computed in Set_Layer_0
+ (-f0.1) jmpi (1) SKIP_AVS_LOAD_L0_2_
+ // Send descriptor = r23.5 + 0x050EB400. Per the descriptor layout documented above the constant is:
+ // message length 2, response length 16, header present, SIMD mode 11, message type 01011 (sample_8x8), sampler index 4.
+ //AVS_PAYLOAD already has all the data loaded at this point
+ add (1) a0.0:ud r23.5<0;1,0>:ud 0x50EB400:ud //msg desc
+
+ mov (1) r16.2:ud 0x00000000:ud // m0.2 channel masks all clear: A, R, G and B are all written back
+
+
+ // set the vertical block number (m1.1 in the payload table above): inline parameter r7.12:uw plus 2
+
+
+ add (1) r25.1<1>:ud r7.12:uw 2:ud
+
+ // r16 is the first register of the 2-register message, so r17 carries the payload copied from its mirror in r25.
+ mov (8) r17.0:ud r25.0<8;8,1>:ud
+ send (1) uwBUFFER_2(0)<1> r16 0x2 a0.0:ud
+ // Returns packed data in 16 GRFs in scrambled order (0x2 is presumably the sampler function ID -- confirm)
+
+SKIP_AVS_LOAD_L0_2_:
+ nop
+
+
--- /dev/null
+// 7 // Total instruction count
+// 1 // Total kernel count
+
+
+// Module name: common.inc
+//
+// Common header file for all Video-Processing kernels
+//
+
+.default_execution_size (16) // execution size (channel count) used when an instruction does not spell one out
+.default_register_type :ub // operand type used when none is given: unsigned byte
+// Register budget for this kernel (presumably counted in GRFs -- confirm against the assembler documentation).
+.reg_count_total 128
+.reg_count_payload 7 // NOTE(review): equals r0 header + r1-r6 static parameters in the GRF partition comment below -- confirm
+
+//========== Common constants ==========
+
+
+//========== Macros ==========
+
+
+//Fast Jump, For more details see "Set_Layer_N.asm"
+
+
+//========== Defines ====================
+
+//========== Static Parameters (Common To All) ==========
+//r1
+
+
+//r2
+
+ // e.g. byte0 byte1 byte2
+ // YUYV 0 1 3
+ // YVYU 0 3 1
+
+//Color Pipe (IECP) parameters
+
+
+//r4
+
+ // e.g. byte0 byte1 byte2
+ // YUYV 0 1 3
+ // YVYU 0 3 1
+
+
+//========== Inline parameters (Common To All) ===========
+
+
+//============== Binding Index Table===========
+//Common between DNDI and DNUV
+
+
+//================= Common Message Descriptor =====
+// Message descriptor for thread spawning
+// Message Descriptors
+// = 000 0001 (min message len 1 ) 0,0000 (resp len 0 -add later)
+// 0000,0000,0000
+// 0001(Spawn a root thread),0001 (Root thread spawn thread)
+// = 0x02000011
+// Thread Spawner Message Descriptor
+
+
+// Message descriptor for atomic operation add
+// Message Descriptors
+// = 000 0110 (min message len 6 ) 0,0000 (resp len 0 -add later)
+// 1(header present)001,10(typed atomic operation)0(return enabled)0(slot group, low 8 bits),0111 (AOP_Add)
+// 0000,0000 (Binding table index, added later)
+// = 0x0C098700
+
+// Atomic Operation Add Message Descriptor
+
+
+// Message descriptor for dataport media write
+ // Message Descriptors
+ // = 000 0001 (min message len 1 - add later) 00000 (resp len 0)
+ // 1 (header present 1) 0 1010 (media block write) 000000
+ // 00000000 (binding table index - set later)
+ // = 0x020A8000
+
+
+// Message Length defines
+
+
+// Response Length defines
+
+
+// Block Width and Height Size defines
+
+
+// Extended Message Descriptors
+
+
+// Common message descriptors:
+
+
+//===================== Math Function Control ===================================
+
+
+//============ Message Registers ===============
+ // buf4 starts from r28
+
+
+//#define mMSGHDR_EOT r43 // Dummy Message Register for EOT
+
+
+.declare mubMSGPAYLOAD Base=r30 ElementSize=1 SrcRegion=<16;16,1> Type=ub // byte view of the message payload area starting at r30
+.declare muwMSGPAYLOAD Base=r30 ElementSize=2 SrcRegion=<16;16,1> Type=uw // word view of the same registers
+.declare mudMSGPAYLOAD Base=r30 ElementSize=4 SrcRegion=<8;8,1> Type=ud // dword view of the same registers
+.declare mfMSGPAYLOAD Base=r30 ElementSize=4 SrcRegion=<8;8,1> Type=f // float view of the same registers
+
+//=================== End of thread instruction ===========================
+
+
+//=====================Pointers Used=====================================
+
+
+//=======================================================================
+
+
+//r9-r17
+// Define temp space for any usages
+
+
+// Common Buffers
+
+
+// End of common.inc
+
+// FileName: PA_AVS_Buf_3.asm
+// Author: Vivek Kumar
+// Description: Loads 8x8 AVS/IEF Packed data into Buffer 3
+
+
+// FileName : PA_AVS_Buf.asm
+// Author : Tatiya, Rupesh
+// Description : Loads 8x8 AVS/IEF Packed data into Buffer N
+
+//On IVB, for AVS module - set buffer pointers offset according to AVS Layout.
+//Change it to Sample Unorm layout in Shuffle modules.
+
+
+// Module name: Scaling.inc
+
+
+// Description: Includes all definitions explicit to Fast Composite.
+
+
+// End of common.inc
+
+
+//========== GRF partition ==========
+ // r0 header : r0 (1 GRF)
+ // Static parameters : r1 - r6 (6 GRFS)
+ // Inline parameters : r7 - r8 (2 GRFs)
+ // MSGSRC : r27 (1 GRF)
+//===================================
+
+//Interface:
+//========== Static Parameters (Explicit To Fast Composite) ==========
+//r1
+//CSC Set 0
+
+
+.declare udCSC_CURBE Base=r1.0 ElementSize=4 Type=ud // dword view of the static parameters starting at r1 (the "CSC Set 0" section above)
+
+//Constant alpha
+
+
+//r2
+
+
+// WiDi Definitions
+
+
+//Colorfill
+
+
+.declare ubCOLOR_PIXEL_VAL Base=r2.20 ElementSize=1 SrcRegion=<0;1,0> DstRegion=<1> Type=ub // byte view starting at r2.20; the <0;1,0> source region repeats one element instead of stepping through consecutive bytes
+
+//r3
+//Normalised Ratio of Horizontal step size with main video for all layers
+
+
+ //Normalised Ratio of Horizontal step size with main video for all layers becomes
+ //Normalised Horizontal step size for all layers in VP_Setup.asm
+
+
+//r4
+//Normalised Vertical step size for all layers
+
+
+//r5
+//Normalised Vertical Frame Origin for all layers
+
+
+//r6
+//Normalised Horizontal Frame Origin for all layers
+
+
+//========== Inline Parameters (Explicit To Fast Composite) ==========
+
+
+//Main video Step X
+
+
+//====================== Binding table (Explicit To Fast Composite)=========================================
+
+
+//Used by Interlaced Scaling Kernels
+
+
+//========== Sampler State Table Index (Explicit To Fast Composite)==========
+//Sampler Index for AVS/IEF messages
+
+
+//Sampler Index for SIMD16 sampler messages
+
+
+//=============================================================================
+// Float views of the six data buffers: BUFFER_0..3 start at r64, r80, r96 and r112 (16 registers apart); BUFFER_4 and BUFFER_5 start at r28 and r46.
+.declare fBUFFER_0 Base=r64.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_1 Base=r80.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_2 Base=r96.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_3 Base=r112.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_4 Base=r28.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_5 Base=r46.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+// Unsigned-dword views of the same base registers.
+.declare udBUFFER_0 Base=r64.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_1 Base=r80.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_2 Base=r96.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_3 Base=r112.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_4 Base=r28.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_5 Base=r46.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+// Unsigned-word views of the same base registers; uwBUFFER_3 is the destination of the sampler send at the end of this file.
+.declare uwBUFFER_0 Base=r64.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_1 Base=r80.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_2 Base=r96.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_3 Base=r112.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_4 Base=r28.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_5 Base=r46.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+// Unsigned-byte views of the same base registers.
+.declare ubBUFFER_0 Base=r64.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_1 Base=r80.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_2 Base=r96.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_3 Base=r112.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_4 Base=r28.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_5 Base=r46.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+// Byte views with a stride of 4 (<32;8,4> source, <4> destination): 8 elements per row, one byte out of every four.
+.declare ub4BUFFER_0 Base=r64.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_1 Base=r80.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_2 Base=r96.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_3 Base=r112.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_4 Base=r28.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_5 Base=r46.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+
+//Pointer to mask reg
+
+
+//r18
+
+
+//Always keep Channel Pointers and Offsets in same GRF, so that we can use
+// NODDCLR, NODDCHK flags. -rT
+
+
+.declare udCSC_COEFF_0 Base=r18.0 ElementSize=4 Type=ud // 1 GRF: dword view of r18
+
+//r19
+
+
+.declare udCSC_COEFF_1 Base=r19.0 ElementSize=4 Type=ud // 1 GRF: dword view of r19
+
+
+//r20
+
+.declare uwALPHA_MASK_REG_TEMP Base=r20.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw // 1 GRF: word view of r20 (16 elements)
+
+//r21
+
+.declare uwALPHA_MASK_REG Base=r21.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw // 1 GRF: word view of r21 (16 elements)
+
+//r22
+
+
+//Always keep Channel Pointers and Offsets in same GRF, so that we can use
+// NODDCLR, NODDCHK flags. -rT
+
+
+//Keep fORIGIN_X_NLAS, fY_OFFSET_2ND_BLOCK, fSTEP_X_NLAS, pMSGDSC_COPY, ubCONST_ALPHA_COPY as
+//sub registers of same GRF to enable using NODDCLR NODDCHK. -rT
+
+//r23
+
+
+//Lumakey
+
+
+//r24
+
+
+//r25
+
+
+//r26
+
+
+//defines to generate LABELS during compile time.
+
+
+ // Message Header
+ // m0.7 31:0 Debug
+ // m0.6 31:0 Debug
+ // m0.5 31:0 Ignored
+ // m0.4 31:0 Ignored
+ // m0.3 31:0 Ignored
+ // m0.2 31:16 Ignored
+ // 15 Alpha Write Channel Mask enable=0, disable=1
+ // 14 Blue Write Channel Mask (U)
+ // 13 Green Write Channel Mask (Y)
+ // 12 Red Write Channel Mask (V)
+ // 11:0 Ignored
+ // m0.1 Ignored
+ // m0.0 Ignored
+
+
+ // AVS payload
+ // m1.7 Group ID Number
+ // m1.6 U 2nd Derivative ---> NLAS dx
+ // m1.5 Delta V ---> Step Y
+ // m1.4 Delta U ---> Step X
+ // m1.3 Pixel 0 V Address ---> ORIY (Y0)
+ // m1.2 Pixel 0 U Address ---> ORIX (X0)
+ // m1.1 Vertical Block Number
+ // m1.0 Reserved
+
+ // Sampler Message Descriptor
+ // 31:29 Reserved 000
+ // 28:25 Message length 0010
+ // 24:20 Response length xxxxx ---> 4GRFs for each enabled channel (AVS), 2GRFs for each enabled channel (sample unorm)
+ // 19 Header Present 1
+ // 18:17 SIMD Mode 11 ---> SIMD32/64
+ // 16:12 Message Type xxxxx ---> 01011 sample_8x8, 01100 (sample_unorm), 01010 (sample_unorm+killpix)
+ // 11:8 Sampler Index xxxx
+ // 7:0 Binding Table Index xxxxxxxx
+
+
+ // Msg Header M0.2
+ // 15:15 Alpha Write Channel Mask, 0: written back, 1: not written back
+ // 14:14 Blue Write Channel Mask
+ // 13:13 Green Write Channel Mask
+ // 12:12 Red Write Channel Mask
+
+
+//By design, Buffer 0,1,2,3 always have Layer 0 and Buffer 4,5 always have L1-L7
+
+
+//used to generate LABELS at compile time.
+
+
+ // 18:17 SIMD Mode 10 ---> SIMD16
+ // 16:12 Message Type xxxxx ---> 00000 (SIMD16)
+
+
+//r10-17 - 8 GRFs to load SIMD16 data (up to 4 channels)
+//r18-19 - 2 GRFs to store sampler ramp.
+ // Typed views of the scaling message payload starting at r14 (f, uw, ud, ub).
+ .declare mfSCALING_0X_34X_PAYLOAD Base=r14.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+ .declare muwSCALING_0X_34X_PAYLOAD Base=r14.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+ .declare mudCALING_0X_34X_PAYLOAD Base=r14.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud // NOTE(review): name lacks the 'S' of its mf/muw siblings; kept as-is because users of the symbol are outside this view
+ .declare mubCALING_0X_34X_PAYLOAD Base=r14.0 ElementSize=1 SrcRegion=<32;32,1> DstRegion=<1> Type=ub // NOTE(review): same spelling caveat as mudCALING above
+
+ // Typed views of the scratch area starting at r9 (f, ud, strided ub, uw).
+ .declare fSCALING_0X_34X_TEMP Base=r9.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+ .declare udSCALING_0X_34X_TEMP Base=r9.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+ .declare ub4SCALING_0X_34X_TEMP Base=r9.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<1> Type=ub // stride-4 byte source region (<32;8,4>) with a unit-stride destination (<1>)
+ .declare uwSCALING_0X_34X_TEMP Base=r9.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+
+ // Sampler ramp is used for Scaling 0X_0.34X
+ .declare fSAMPLER_RAMP Base=r9.0 ElementSize=4 SrcRegion=<8;8,1> Type=f // 1 GRF, 8 elements. NOTE(review): declared at r9 (shared with the *_TEMP views), not r18-19 as the range comment above says -- confirm
+
+
+ //#define rMSGDSC_UV r23.0
+
+
+//End of _SCALING_
+
+
+ //Check if layer is to be skipped: BUFFER_3 is loaded only when predicate f0.1 is set
+ // (the jump below is taken on the inverse of f0.1 and bypasses the whole load).
+
+ // f0.1 pre-computed in Set_Layer_0
+ (-f0.1) jmpi (1) SKIP_AVS_LOAD_L0_3_
+ // Send descriptor = r23.5 + 0x050EB400. Per the descriptor layout documented above the constant is:
+ // message length 2, response length 16, header present, SIMD mode 11, message type 01011 (sample_8x8), sampler index 4.
+ //AVS_PAYLOAD already has all the data loaded at this point
+ add (1) a0.0:ud r23.5<0;1,0>:ud 0x50EB400:ud //msg desc
+
+ mov (1) r16.2:ud 0x00000000:ud // m0.2 channel masks all clear: A, R, G and B are all written back
+
+
+ // set the vertical block number (m1.1 in the payload table above): inline parameter r7.12:uw plus 3
+
+
+ add (1) r25.1<1>:ud r7.12:uw 3:ud
+
+ // r16 is the first register of the 2-register message, so r17 carries the payload copied from its mirror in r25.
+ mov (8) r17.0:ud r25.0<8;8,1>:ud
+ send (1) uwBUFFER_3(0)<1> r16 0x2 a0.0:ud
+ // Returns packed data in 16 GRFs in scrambled order (0x2 is presumably the sampler function ID -- confirm)
+
+SKIP_AVS_LOAD_L0_3_:
+ nop
+
+
--- /dev/null
+// 12 // Total instruction count
+// 1 // Total kernel count
+
+
+// Module name: common.inc
+//
+// Common header file for all Video-Processing kernels
+//
+
+.default_execution_size (16) // execution size (channel count) used when an instruction does not spell one out
+.default_register_type :ub // operand type used when none is given: unsigned byte
+// Register budget for this kernel (presumably counted in GRFs -- confirm against the assembler documentation).
+.reg_count_total 128
+.reg_count_payload 7 // NOTE(review): equals r0 header + r1-r6 static parameters in the GRF partition comment below -- confirm
+
+//========== Common constants ==========
+
+
+//========== Macros ==========
+
+
+//Fast Jump, For more details see "Set_Layer_N.asm"
+
+
+//========== Defines ====================
+
+//========== Static Parameters (Common To All) ==========
+//r1
+
+
+//r2
+
+ // e.g. byte0 byte1 byte2
+ // YUYV 0 1 3
+ // YVYU 0 3 1
+
+//Color Pipe (IECP) parameters
+
+
+//r4
+
+ // e.g. byte0 byte1 byte2
+ // YUYV 0 1 3
+ // YVYU 0 3 1
+
+
+//========== Inline parameters (Common To All) ===========
+
+
+//============== Binding Index Table===========
+//Common between DNDI and DNUV
+
+
+//================= Common Message Descriptor =====
+// Message descriptor for thread spawning
+// Message Descriptors
+// = 000 0001 (min message len 1 ) 0,0000 (resp len 0 -add later)
+// 0000,0000,0000
+// 0001(Spawn a root thread),0001 (Root thread spawn thread)
+// = 0x02000011
+// Thread Spawner Message Descriptor
+
+
+// Message descriptor for atomic operation add
+// Message Descriptors
+// = 000 0110 (min message len 6 ) 0,0000 (resp len 0 -add later)
+// 1(header present)001,10(typed atomic operation)0(return enabled)0(slot group, low 8 bits),0111 (AOP_Add)
+// 0000,0000 (Binding table index, added later)
+// = 0x0C098700
+
+// Atomic Operation Add Message Descriptor
+
+
+// Message descriptor for dataport media write
+ // Message Descriptors
+ // = 000 0001 (min message len 1 - add later) 00000 (resp len 0)
+ // 1 (header present 1) 0 1010 (media block write) 000000
+ // 00000000 (binding table index - set later)
+ // = 0x020A8000
+
+
+// Message Length defines
+
+
+// Response Length defines
+
+
+// Block Width and Height Size defines
+
+
+// Extended Message Descriptors
+
+
+// Common message descriptors:
+
+
+//===================== Math Function Control ===================================
+
+
+//============ Message Registers ===============
+ // buf4 starts from r28
+
+
+//#define mMSGHDR_EOT r43 // Dummy Message Register for EOT
+
+
+.declare mubMSGPAYLOAD Base=r30 ElementSize=1 SrcRegion=<16;16,1> Type=ub // byte view of the message payload area starting at r30
+.declare muwMSGPAYLOAD Base=r30 ElementSize=2 SrcRegion=<16;16,1> Type=uw // word view of the same registers
+.declare mudMSGPAYLOAD Base=r30 ElementSize=4 SrcRegion=<8;8,1> Type=ud // dword view of the same registers
+.declare mfMSGPAYLOAD Base=r30 ElementSize=4 SrcRegion=<8;8,1> Type=f // float view of the same registers
+
+//=================== End of thread instruction ===========================
+
+
+//=====================Pointers Used=====================================
+
+
+//=======================================================================
+
+
+//r9-r17
+// Define temp space for any usages
+
+
+// Common Buffers
+
+
+// End of common.inc
+
+// FileName: PL2_AVS_Buf_0.asm
+// Author: Tatiya, Rupesh
+// Description: Loads 8x8 AVS/IEF PL2 data into Buffer 0
+
+
+// FileName : PL2_AVS_Buf.asm
+// Author : Tatiya, Rupesh
+// Description : Loads 8x8 AVS/IEF PL2 data into Buffer N
+
+
+// Module name: Scaling.inc
+
+
+// Description: Includes all definitions explicit to Fast Composite.
+
+
+// End of common.inc
+
+
+//========== GRF partition ==========
+ // r0 header : r0 (1 GRF)
+ // Static parameters : r1 - r6 (6 GRFS)
+ // Inline parameters : r7 - r8 (2 GRFs)
+ // MSGSRC : r27 (1 GRF)
+//===================================
+
+//Interface:
+//========== Static Parameters (Explicit To Fast Composite) ==========
+//r1
+//CSC Set 0
+
+
+.declare udCSC_CURBE Base=r1.0 ElementSize=4 Type=ud // dword view of the static parameters starting at r1 (the "CSC Set 0" section above)
+
+//Constant alpha
+
+
+//r2
+
+
+// WiDi Definitions
+
+
+//Colorfill
+
+
+.declare ubCOLOR_PIXEL_VAL Base=r2.20 ElementSize=1 SrcRegion=<0;1,0> DstRegion=<1> Type=ub // byte view starting at r2.20; the <0;1,0> source region repeats one element instead of stepping through consecutive bytes
+
+//r3
+//Normalised Ratio of Horizontal step size with main video for all layers
+
+
+ //Normalised Ratio of Horizontal step size with main video for all layers becomes
+ //Normalised Horizontal step size for all layers in VP_Setup.asm
+
+
+//r4
+//Normalised Vertical step size for all layers
+
+
+//r5
+//Normalised Vertical Frame Origin for all layers
+
+
+//r6
+//Normalised Horizontal Frame Origin for all layers
+
+
+//========== Inline Parameters (Explicit To Fast Composite) ==========
+
+
+//Main video Step X
+
+
+//====================== Binding table (Explicit To Fast Composite)=========================================
+
+
+//Used by Interlaced Scaling Kernels
+
+
+//========== Sampler State Table Index (Explicit To Fast Composite)==========
+//Sampler Index for AVS/IEF messages
+
+
+//Sampler Index for SIMD16 sampler messages
+
+
+//=============================================================================
+// Float views of the six data buffers: BUFFER_0..3 start at r64, r80, r96 and r112 (16 registers apart); BUFFER_4 and BUFFER_5 start at r28 and r46.
+.declare fBUFFER_0 Base=r64.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_1 Base=r80.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_2 Base=r96.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_3 Base=r112.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_4 Base=r28.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_5 Base=r46.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+// Unsigned-dword views of the same base registers.
+.declare udBUFFER_0 Base=r64.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_1 Base=r80.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_2 Base=r96.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_3 Base=r112.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_4 Base=r28.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_5 Base=r46.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+// Unsigned-word views of the same base registers; uwBUFFER_0 is the destination of both sampler sends at the end of this file.
+.declare uwBUFFER_0 Base=r64.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_1 Base=r80.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_2 Base=r96.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_3 Base=r112.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_4 Base=r28.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_5 Base=r46.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+// Unsigned-byte views of the same base registers.
+.declare ubBUFFER_0 Base=r64.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_1 Base=r80.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_2 Base=r96.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_3 Base=r112.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_4 Base=r28.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_5 Base=r46.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+// Byte views with a stride of 4 (<32;8,4> source, <4> destination): 8 elements per row, one byte out of every four.
+.declare ub4BUFFER_0 Base=r64.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_1 Base=r80.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_2 Base=r96.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_3 Base=r112.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_4 Base=r28.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_5 Base=r46.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+
+//Pointer to mask reg
+
+
+//r18
+
+
+//Always keep Channel Pointers and Offsets in same GRF, so that we can use
+// NODDCLR, NODDCHK flags. -rT
+
+
+.declare udCSC_COEFF_0 Base=r18.0 ElementSize=4 Type=ud // 1 GRF: dword view of r18
+
+//r19
+
+
+.declare udCSC_COEFF_1 Base=r19.0 ElementSize=4 Type=ud // 1 GRF: dword view of r19
+
+
+//r20
+
+.declare uwALPHA_MASK_REG_TEMP Base=r20.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw // 1 GRF: word view of r20 (16 elements)
+
+//r21
+
+.declare uwALPHA_MASK_REG Base=r21.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw // 1 GRF: word view of r21 (16 elements)
+
+//r22
+
+
+//Always keep Channel Pointers and Offsets in same GRF, so that we can use
+// NODDCLR, NODDCHK flags. -rT
+
+
+//Keep fORIGIN_X_NLAS, fY_OFFSET_2ND_BLOCK, fSTEP_X_NLAS, pMSGDSC_COPY, ubCONST_ALPHA_COPY as
+//sub registers of same GRF to enable using NODDCLR NODDCHK. -rT
+
+//r23
+
+
+//Lumakey
+
+
+//r24
+
+
+//r25
+
+
+//r26
+
+
+//defines to generate LABELS during compile time.
+
+
+ // Message Header
+ // m0.7 31:0 Debug
+ // m0.6 31:0 Debug
+ // m0.5 31:0 Ignored
+ // m0.4 31:0 Ignored
+ // m0.3 31:0 Ignored
+ // m0.2 31:16 Ignored
+ // 15 Alpha Write Channel Mask enable=0, disable=1
+ // 14 Blue Write Channel Mask (U)
+ // 13 Green Write Channel Mask (Y)
+ // 12 Red Write Channel Mask (V)
+ // 11:0 Ignored
+ // m0.1 Ignored
+ // m0.0 Ignored
+
+
+ // AVS payload
+ // m1.7 Group ID Number
+ // m1.6 U 2nd Derivative ---> NLAS dx
+ // m1.5 Delta V ---> Step Y
+ // m1.4 Delta U ---> Step X
+ // m1.3 Pixel 0 V Address ---> ORIY (Y0)
+ // m1.2 Pixel 0 U Address ---> ORIX (X0)
+ // m1.1 Vertical Block Number
+ // m1.0 Reserved
+
+ // Sampler Message Descriptor
+ // 31:29 Reserved 000
+ // 28:25 Message length 0010
+ // 24:20 Response length xxxxx ---> 4GRFs for each enabled channel (AVS), 2GRFs for each enabled channel (sample unorm)
+ // 19 Header Present 1
+ // 18:17 SIMD Mode 11 ---> SIMD32/64
+ // 16:12 Message Type xxxxx ---> 01011 sample_8x8, 01100 (sample_unorm), 01010 (sample_unorm+killpix)
+ // 11:8 Sampler Index xxxx
+ // 7:0 Binding Table Index xxxxxxxx
+
+
+ // Msg Header M0.2
+ // 15:15 Alpha Write Channel Mask, 0: written back, 1: not written back
+ // 14:14 Blue Write Channel Mask
+ // 13:13 Green Write Channel Mask
+ // 12:12 Red Write Channel Mask
+
+
+//By design, Buffer 0,1,2,3 always have Layer 0 and Buffer 4,5 always have L1-L7
+
+
+//used to generate LABELS at compile time.
+
+
+ // 18:17 SIMD Mode 10 ---> SIMD16
+ // 16:12 Message Type xxxxx ---> 00000 (SIMD16)
+
+
+//r10-17 - 8 GRFs to load SIMD16 data (up to 4 channels)
+//r18-19 - 2 GRFs to store sampler ramp.
+ // Typed views of the scaling message payload starting at r14 (f, uw, ud, ub).
+ .declare mfSCALING_0X_34X_PAYLOAD Base=r14.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+ .declare muwSCALING_0X_34X_PAYLOAD Base=r14.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+ .declare mudCALING_0X_34X_PAYLOAD Base=r14.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud // NOTE(review): name lacks the 'S' of its mf/muw siblings; kept as-is because users of the symbol are outside this view
+ .declare mubCALING_0X_34X_PAYLOAD Base=r14.0 ElementSize=1 SrcRegion=<32;32,1> DstRegion=<1> Type=ub // NOTE(review): same spelling caveat as mudCALING above
+
+ // Typed views of the scratch area starting at r9 (f, ud, strided ub, uw).
+ .declare fSCALING_0X_34X_TEMP Base=r9.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+ .declare udSCALING_0X_34X_TEMP Base=r9.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+ .declare ub4SCALING_0X_34X_TEMP Base=r9.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<1> Type=ub // stride-4 byte source region (<32;8,4>) with a unit-stride destination (<1>)
+ .declare uwSCALING_0X_34X_TEMP Base=r9.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+
+ // Sampler ramp is used for Scaling 0X_0.34X
+ .declare fSAMPLER_RAMP Base=r9.0 ElementSize=4 SrcRegion=<8;8,1> Type=f // 1 GRF, 8 elements. NOTE(review): declared at r9 (shared with the *_TEMP views), not r18-19 as the range comment above says -- confirm
+
+
+ //#define rMSGDSC_UV r23.0
+
+
+//End of _SCALING_
+
+
+ //NOTE: We need offsets for the second half of LAYER 0 - even if we do not load it.
+ //Update the channel offset in the buffers for the lower 8x4 data for BUFFER_0.
+ mov (1) r22.4<1>:ud 0x400040:ud // executed before the skip test below, so it runs even when the load is bypassed
+
+
+ //Check if layer is to be skipped: the two loads below run only when predicate f0.1 is set
+ // (the jump is taken on the inverse of f0.1 and bypasses both sends).
+
+ // f0.1 pre-computed in Set_Layer_0
+ (-f0.1) jmpi (1) SKIP_AVS_LOAD_L0_0_
+ // First send descriptor = r23.5 + 0x044EB400. Per the descriptor layout documented above the constant is:
+ // message length 2, response length 4, header present, SIMD mode 11, message type 01011 (sample_8x8), sampler index 4.
+ //AVS_PAYLOAD already has all the data loaded at this point
+ add (1) a0.0:ud r23.5<0;1,0>:ud 0x44EB400:ud //msg desc
+
+ mov (1) r16.2:ud 0x0000D000:ud // m0.2 mask bits 15, 14, 12 set (Alpha, Blue, Red not written back): only Green, i.e. Y per the header table above, is returned
+
+ // Two writes to different dwords of r25 paired with NoDDClr/NoDDChk; r25.7 is m1.7 (Group ID Number) in the payload table above.
+ mov (1) r25.7<1>:ud r7.7:ud { NoDDClr }
+ mov (1) r25.1<1>:ud r7.12:uw { NoDDChk }
+
+
+ // set the vertical block number (m1.1): written by the mov to r25.1 above, straight from inline parameter r7.12:uw
+
+
+ mov (8) r17.0:ud r25.0<8;8,1>:ud // Copy the payload mirror in r25 into r17, the second register of the message sent from r16
+
+ send (1) uwBUFFER_0(0)<1> r16 0x2 a0.0:ud
+ // Returns Y data in 4 GRFs in scrambled order (0x2 is presumably the sampler function ID -- confirm)
+
+ add (1) a0.0:ud r23.5<0;1,0>:ud 0x48EB801:ud // msg desc; 1 is added to change BI to UV. Response length is 8 and the sampler index field is 8 here (4 for the Y load)
+ mov (1) r16.2:ud 0x0000A000:ud // m0.2 mask bits 15, 13 set (Alpha, Green not written back): Red and Blue, i.e. V and U per the header table above, are returned
+
+ send (1) uwBUFFER_0(4)<1> r16 0x2 a0.0:ud
+ // Returns UV data in 8 GRFs in scrambled order, placed 4 registers after the Y data
+
+SKIP_AVS_LOAD_L0_0_:
+ nop
+
+
--- /dev/null
+// 10 // Total instruction count
+// 1 // Total kernel count
+
+
+// Module name: common.inc
+//
+// Common header file for all Video-Processing kernels
+//
+
+.default_execution_size (16) // execution size (channel count) used when an instruction does not spell one out
+.default_register_type :ub // operand type used when none is given: unsigned byte
+// Register budget for this kernel (presumably counted in GRFs -- confirm against the assembler documentation).
+.reg_count_total 128
+.reg_count_payload 7 // NOTE(review): equals r0 header + r1-r6 static parameters in the GRF partition comment below -- confirm
+
+//========== Common constants ==========
+
+
+//========== Macros ==========
+
+
+//Fast Jump, For more details see "Set_Layer_N.asm"
+
+
+//========== Defines ====================
+
+//========== Static Parameters (Common To All) ==========
+//r1
+
+
+//r2
+
+ // e.g. byte0 byte1 byte2
+ // YUYV 0 1 3
+ // YVYU 0 3 1
+
+//Color Pipe (IECP) parameters
+
+
+//r4
+
+ // e.g. byte0 byte1 byte2
+ // YUYV 0 1 3
+ // YVYU 0 3 1
+
+
+//========== Inline parameters (Common To All) ===========
+
+
+//============== Binding Index Table===========
+//Common between DNDI and DNUV
+
+
+//================= Common Message Descriptor =====
+// Message descriptor for thread spawning
+// Message Descriptors
+// = 000 0001 (min message len 1 ) 0,0000 (resp len 0 -add later)
+// 0000,0000,0000
+// 0001(Spawn a root thread),0001 (Root thread spawn thread)
+// = 0x02000011
+// Thread Spawner Message Descriptor
+
+
+// Message descriptor for atomic operation add
+// Message Descriptors
+// = 000 0110 (min message len 6 ) 0,0000 (resp len 0 -add later)
+// 1(header present)001,10(typed atomic operation)0(return enabled)0(slot group, low 8 bits),0111 (AOP_Add)
+// 0000,0000 (Binding table index, added later)
+// = 0x0C098700
+
+// Atomic Operation Add Message Descriptor
+
+
+// Message descriptor for dataport media write
+ // Message Descriptors
+ // = 000 0001 (min message len 1 - add later) 00000 (resp len 0)
+ // 1 (header present 1) 0 1010 (media block write) 000000
+ // 00000000 (binding table index - set later)
+ // = 0x020A8000
+
+
+// Message Length defines
+
+
+// Response Length defines
+
+
+// Block Width and Height Size defines
+
+
+// Extended Message Descriptors
+
+
+// Common message descriptors:
+
+
+//===================== Math Function Control ===================================
+
+
+//============ Message Registers ===============
+ // buf4 starts from r28
+
+
+//#define mMSGHDR_EOT r43 // Dummy Message Register for EOT
+
+
+.declare mubMSGPAYLOAD Base=r30 ElementSize=1 SrcRegion=<16;16,1> Type=ub // byte view of the message payload area starting at r30
+.declare muwMSGPAYLOAD Base=r30 ElementSize=2 SrcRegion=<16;16,1> Type=uw // word view of the same registers
+.declare mudMSGPAYLOAD Base=r30 ElementSize=4 SrcRegion=<8;8,1> Type=ud // dword view of the same registers
+.declare mfMSGPAYLOAD Base=r30 ElementSize=4 SrcRegion=<8;8,1> Type=f // float view of the same registers
+
+//=================== End of thread instruction ===========================
+
+
+//=====================Pointers Used=====================================
+
+
+//=======================================================================
+
+
+//r9-r17
+// Define temp space for any usages
+
+
+// Common Buffers
+
+
+// End of common.inc
+
+// FileName: PL2_AVS_Buf_1.asm
+// Author: Tatiya, Rupesh
+// Description: Loads 8x8 AVS/IEF PL2 data into Buffer 1
+
+
+// FileName : PL2_AVS_Buf.asm
+// Author : Tatiya, Rupesh
+// Description : Loads 8x8 AVS/IEF PL2 data into Buffer N
+
+
+// Module name: Scaling.inc
+
+
+// Description: Includes all definitions explicit to Fast Composite.
+
+
+// End of common.inc
+
+
+//========== GRF partition ==========
+ // r0 header : r0 (1 GRF)
+ // Static parameters : r1 - r6 (6 GRFS)
+ // Inline parameters : r7 - r8 (2 GRFs)
+ // MSGSRC : r27 (1 GRF)
+//===================================
+
+//Interface:
+//========== Static Parameters (Explicit To Fast Composite) ==========
+//r1
+//CSC Set 0
+
+
+.declare udCSC_CURBE Base=r1.0 ElementSize=4 Type=ud // dword view of the static parameters starting at r1 (the "CSC Set 0" section above)
+
+//Constant alpha
+
+
+//r2
+
+
+// WiDi Definitions
+
+
+//Colorfill
+
+
+.declare ubCOLOR_PIXEL_VAL Base=r2.20 ElementSize=1 SrcRegion=<0;1,0> DstRegion=<1> Type=ub // byte view starting at r2.20; the <0;1,0> source region repeats one element instead of stepping through consecutive bytes
+
+//r3
+//Normalised Ratio of Horizontal step size with main video for all layers
+
+
+ //Normalised Ratio of Horizontal step size with main video for all layers becomes
+ //Normalised Horizontal step size for all layers in VP_Setup.asm
+
+
+//r4
+//Normalised Vertical step size for all layers
+
+
+//r5
+//Normalised Vertical Frame Origin for all layers
+
+
+//r6
+//Normalised Horizontal Frame Origin for all layers
+
+
+//========== Inline Parameters (Explicit To Fast Composite) ==========
+
+
+//Main video Step X
+
+
+//====================== Binding table (Explicit To Fast Composite)=========================================
+
+
+//Used by Interlaced Scaling Kernels
+
+
+//========== Sampler State Table Index (Explicit To Fast Composite)==========
+//Sampler Index for AVS/IEF messages
+
+
+//Sampler Index for SIMD16 sampler messages
+
+
+//=============================================================================
+// Float views of the six data buffers: BUFFER_0..3 start at r64, r80, r96 and r112 (16 registers apart); BUFFER_4 and BUFFER_5 start at r28 and r46.
+.declare fBUFFER_0 Base=r64.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_1 Base=r80.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_2 Base=r96.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_3 Base=r112.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_4 Base=r28.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_5 Base=r46.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+// Unsigned-dword views of the same base registers.
+.declare udBUFFER_0 Base=r64.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_1 Base=r80.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_2 Base=r96.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_3 Base=r112.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_4 Base=r28.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_5 Base=r46.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+// Unsigned-word views of the same base registers.
+.declare uwBUFFER_0 Base=r64.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_1 Base=r80.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_2 Base=r96.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_3 Base=r112.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_4 Base=r28.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_5 Base=r46.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+// Unsigned-byte views of the same base registers.
+.declare ubBUFFER_0 Base=r64.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_1 Base=r80.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_2 Base=r96.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_3 Base=r112.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_4 Base=r28.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_5 Base=r46.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+// Byte views with a stride of 4 (<32;8,4> source, <4> destination): 8 elements per row, one byte out of every four.
+.declare ub4BUFFER_0 Base=r64.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_1 Base=r80.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_2 Base=r96.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_3 Base=r112.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_4 Base=r28.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_5 Base=r46.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+
+//Pointer to mask reg
+
+
+//r18
+
+
+//Always keep Channel Pointers and Offsets in same GRF, so that we can use
+// NODDCLR, NODDCHK flags. -rT
+
+
+// Dword view of r18, named as the first CSC coefficient register.
+.declare udCSC_COEFF_0 Base=r18.0 ElementSize=4 Type=ud // 1 GRF
+
+//r19
+
+
+// Dword view of r19, named as the second CSC coefficient register.
+.declare udCSC_COEFF_1 Base=r19.0 ElementSize=4 Type=ud // 1 GRF
+
+
+//r20
+
+// Word view of r20: temporary copy of the alpha mask (16 contiguous words).
+.declare uwALPHA_MASK_REG_TEMP Base=r20.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw // 1 GRF
+
+//r21
+
+// Word view of r21: the alpha mask register itself (16 contiguous words).
+.declare uwALPHA_MASK_REG Base=r21.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw // 1 GRF
+
+//r22
+
+
+//Always keep Channel Pointers and Offsets in same GRF, so that we can use
+// NODDCLR, NODDCHK flags. -rT
+
+
+//Keep fORIGIN_X_NLAS, fY_OFFSET_2ND_BLOCK, fSTEP_X_NLAS, pMSGDSC_COPY, ubCONST_ALPHA_COPY as
+//sub registers of same GRF to enable using NODDCLR NODDCHK. -rT
+
+//r23
+
+
+//Lumakey
+
+
+//r24
+
+
+//r25
+
+
+//r26
+
+
+//defines to generate LABELS during compile time.
+
+
+ // Message Header
+ // m0.7 31:0 Debug
+ // m0.6 31:0 Debug
+ // m0.5 31:0 Ignored
+ // m0.4 31:0 Ignored
+ // m0.3 31:0 Ignored
+ // m0.2 31:16 Ignored
+ // 15 Alpha Write Channel Mask enable=0, disable=1
+ // 14 Blue Write Channel Mask (U)
+ // 13 Green Write Channel Mask (Y)
+ // 12 Red Write Channel Mask (V)
+ // 11:0 Ignored
+ // m0.1 Ignored
+ // m0.0 Ignored
+
+
+ // AVS payload
+ // m1.7 Group ID Number
+ // m1.6 U 2nd Derivative ---> NLAS dx
+ // m1.5 Delta V ---> Step Y
+ // m1.4 Delta U ---> Step X
+ // m1.3 Pixel 0 V Address ---> ORIY (Y0)
+ // m1.2 Pixel 0 U Address ---> ORIX (X0)
+ // m1.1 Vertical Block Number
+ // m1.0 Reserved
+
+ // Sampler Message Descriptor
+ // 31:29 Reserved 000
+ // 28:25 Message length 0010
+ // 24:20 Response length xxxxx ---> 4GRFs for each enabled channel (AVS), 2GRFs for each enabled channel (sample unorm)
+ // 19 Header Present 1
+ // 18:17 SIMD Mode 11 ---> SIMD32/64
+ // 16:12 Message Type xxxxx ---> 01011 sample_8x8, 01100 (sample_unorm), 01010 (sample_unorm+killpix)
+ // 11:8 Sampler Index xxxx
+ // 7:0 Binding Table Index xxxxxxxx
+
+
+ // Msg Header M0.2
+ // 15:15 Alpha Write Channel Mask, 0: written back, 1: not written back
+ // 14:14 Blue Write Channel Mask
+ // 13:13 Green Write Channel Mask
+ // 12:12 Red Write Channel Mask
+
+
+//By design, Buffer 0,1,2,3 always have Layer 0 and Buffer 4,5 always have L1-L7
+
+
+//used to generate LABELS at compile time.
+
+
+ // 18:17 SIMD Mode 10 ---> SIMD16
+ // 16:12 Message Type xxxxx ---> 00000 (SIMD16)
+
+
+//r10-17 - 8 GRFs to load SIMD16 data (up to 4 channels)
+//r18-19 - 2 GRFs to store sampler ramp.
+// NOTE(review): the two lines above do not match the declarations below, which base the
+// payload at r14 and the temp/sampler ramp at r9 (r18/r19 are declared as udCSC_COEFF_0/1
+// earlier in this file) - confirm which register map is current.
+
+ // Float / word / dword / byte views of the scaling message payload, all based at r14.
+ // NOTE(review): the ud and ub names read "CALING" where the f and uw names read
+ // "SCALING" - presumably a dropped 'S'; check for users before renaming.
+ .declare mfSCALING_0X_34X_PAYLOAD Base=r14.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+ .declare muwSCALING_0X_34X_PAYLOAD Base=r14.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+ .declare mudCALING_0X_34X_PAYLOAD Base=r14.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+ .declare mubCALING_0X_34X_PAYLOAD Base=r14.0 ElementSize=1 SrcRegion=<32;32,1> DstRegion=<1> Type=ub
+
+
+ // Float / dword / strided-byte / word views of a temporary area based at r9.
+ // NOTE(review): ub4SCALING_0X_34X_TEMP pairs a stride-4 source region with DstRegion=<1>,
+ // whereas the ub4BUFFER_* views use DstRegion=<4> - confirm this is intended.
+ .declare fSCALING_0X_34X_TEMP Base=r9.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+ .declare udSCALING_0X_34X_TEMP Base=r9.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+ .declare ub4SCALING_0X_34X_TEMP Base=r9.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<1> Type=ub
+ .declare uwSCALING_0X_34X_TEMP Base=r9.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+
+ // Sampler ramp is used for Scaling 0X_0.34X
+ // It shares base register r9 with the *_SCALING_0X_34X_TEMP views above.
+ .declare fSAMPLER_RAMP Base=r9.0 ElementSize=4 SrcRegion=<8;8,1> Type=f // 1 GRFs, 8 elements
+
+
+ //#define rMSGDSC_UV r23.0
+
+
+//End of _SCALING_
+
+
+ //Check if layer is to be skipped
+
+
+ // Kernel body: issues two sampler messages that load Y data and then UV data for
+ // vertical block 1 into Buffer 1, unless the layer is flagged to be skipped.
+ // f0.1 pre-computed in Set_Layer_0
+ // Inverted predicate: jump over both loads when f0.1 is clear.
+ (-f0.1) jmpi (1) SKIP_AVS_LOAD_L0_1_
+
+
+ //AVS_PAYLOAD already has all the data loaded at this point
+ // 0x44EB400 decodes (per the Sampler Message Descriptor layout documented above) as:
+ // message length 2, response length 4, header present, SIMD mode 11b,
+ // message type 01011b (sample_8x8), sampler index 4, binding table index 0.
+ // NOTE(review): r23.5 presumably carries the per-layer descriptor bits - confirm.
+ add (1) a0.0:ud r23.5<0;1,0>:ud 0x44EB400:ud //msg desc
+
+ mov (1) r16.2:ud 0x0000D000:ud // Channel mask 1101b (1 = not written): only Green (Y) is written back
+
+
+ // set the vertical block number
+
+ add (1) r25.1<1>:ud r7.12:uw 1:ud
+
+
+ mov (8) r17.0:ud r25.0<8;8,1>:ud // Copy msg payload mirrors to MRFs
+
+ // Header is r16, payload r17; the response lands at the start of uwBUFFER_1.
+ send (1) uwBUFFER_1(0)<1> r16 0x2 a0.0:ud
+ // Returns Y data in 4 GRFs in scrambled order
+
+ // 0x48EB801: same message type, response length 8, sampler index 8, binding table index 1.
+ add (1) a0.0:ud r23.5<0;1,0>:ud 0x48EB801:ud // msg desc; 1 is added to change BI to UV
+ mov (1) r16.2:ud 0x0000A000:ud // Channel mask 1010b (1 = not written): Red (V) and Blue (U) are written back
+
+ // NOTE(review): (4) is presumably a register offset placing UV after the 4 Y GRFs - confirm.
+ send (1) uwBUFFER_1(4)<1> r16 0x2 a0.0:ud
+ // Returns UV data in 8 GRFs in scrambled order
+
+SKIP_AVS_LOAD_L0_1_:
+ nop
+
+
--- /dev/null
+// 10 // Total instruction count
+// 1 // Total kernel count
+
+
+// Module name: common.inc
+//
+// Common header file for all Video-Processing kernels
+//
+
+// Assembler-wide defaults for this kernel.
+// NOTE(review): presumably instructions that omit a width or type fall back to
+// 16 channels and unsigned byte - confirm against the assembler documentation.
+.default_execution_size (16)
+.default_register_type :ub
+
+// Register budget: 128 registers in total, 7 of them counted as payload.
+// NOTE(review): 7 presumably covers r0 plus the static parameters r1-r6 listed in the
+// "GRF partition" comment later in this file - confirm.
+.reg_count_total 128
+.reg_count_payload 7
+
+//========== Common constants ==========
+
+
+//========== Macros ==========
+
+
+//Fast Jump, For more details see "Set_Layer_N.asm"
+
+
+//========== Defines ====================
+
+//========== Static Parameters (Common To All) ==========
+//r1
+
+
+//r2
+
+ // e.g. byte0 byte1 byte2
+ // YUYV 0 1 3
+ // YVYU 0 3 1
+
+//Color Pipe (IECP) parameters
+
+
+//r4
+
+ // e.g. byte0 byte1 byte2
+ // YUYV 0 1 3
+ // YVYU 0 3 1
+
+
+//========== Inline parameters (Common To All) ===========
+
+
+//============== Binding Index Table===========
+//Common between DNDI and DNUV
+
+
+//================= Common Message Descriptor =====
+// Message descriptor for thread spawning
+// Message Descriptors
+// = 000 0001 (min message len 1 ) 0,0000 (resp len 0 -add later)
+// 0000,0000,0000
+// 0001(Spawn a root thread),0001 (Root thread spawn thread)
+// = 0x02000011
+// Thread Spawner Message Descriptor
+
+
+// Message descriptor for atomic operation add
+// Message Descriptors
+// = 000 0110 (min message len 6 ) 0,0000 (resp len 0 -add later)
+// 1(header present)001,10(typed atomic operation)0(return enabled)0(slot group, low 8 bits),0111 (AOP_Add)
+// 0000,0000 (Binding table index, added later)
+// = 0x0C098700
+
+// Atomic Operation Add Message Descriptor
+
+
+// Message descriptor for dataport media write
+ // Message Descriptors
+ // = 000 0001 (min message len 1 - add later) 00000 (resp len 0)
+ // 1 (header present 1) 0 1010 (media block write) 000000
+ // 00000000 (binding table index - set later)
+ // = 0x020A8000
+
+
+// Message Length defines
+
+
+// Response Length defines
+
+
+// Block Width and Height Size defines
+
+
+// Extended Message Descriptors
+
+
+// Common message descriptors:
+
+
+//===================== Math Function Control ===================================
+
+
+//============ Message Registers ===============
+ // buf4 starts from r28
+
+
+//#define mMSGHDR_EOT r43 // Dummy Message Register for EOT
+
+
+// Byte / word / dword / float views of the message payload area, all based at r30.
+// NOTE(review): r30 lies between the BUFFER_4 base (r28) and the BUFFER_5 base (r46)
+// declared later in this file, so this area appears to alias buffer 4 - confirm intended.
+.declare mubMSGPAYLOAD Base=r30 ElementSize=1 SrcRegion=<16;16,1> Type=ub
+.declare muwMSGPAYLOAD Base=r30 ElementSize=2 SrcRegion=<16;16,1> Type=uw
+.declare mudMSGPAYLOAD Base=r30 ElementSize=4 SrcRegion=<8;8,1> Type=ud
+.declare mfMSGPAYLOAD Base=r30 ElementSize=4 SrcRegion=<8;8,1> Type=f
+
+//=================== End of thread instruction ===========================
+
+
+//=====================Pointers Used=====================================
+
+
+//=======================================================================
+
+
+//r9-r17
+// Define temp space for any usages
+
+
+// Common Buffers
+
+
+// End of common.inc
+
+// FileName: PL2_AVS_Buf_2.asm
+// Author: Tatiya, Rupesh
+// Description: Loads 8x8 AVS/IEF PL2 data into Buffer 2
+
+
+// FileName : PL2_AVS_Buf.asm
+// Author : Tatiya, Rupesh
+// Description : Loads 8x8 AVS/IEF PL2 data into Buffer N
+
+
+// Module name: Scaling.inc
+
+
+// Description: Includes all definitions explicit to Fast Composite.
+
+
+// End of common.inc
+
+
+//========== GRF partition ==========
+ // r0 header : r0 (1 GRF)
+ // Static parameters : r1 - r6 (6 GRFS)
+ // Inline parameters : r7 - r8 (2 GRFs)
+ // MSGSRC : r27 (1 GRF)
+//===================================
+
+//Interface:
+//========== Static Parameters (Explicit To Fast Composite) ==========
+//r1
+//CSC Set 0
+
+
+// Dword view of static-parameter register r1 (listed under "CSC Set 0" above).
+.declare udCSC_CURBE Base=r1.0 ElementSize=4 Type=ud
+
+//Constant alpha
+
+
+//r2
+
+
+// WiDi Definitions
+
+
+//Colorfill
+
+
+// Byte view starting at r2.20. The <0;1,0> source region has zero stride, so a read
+// replicates that single byte across all channels.
+.declare ubCOLOR_PIXEL_VAL Base=r2.20 ElementSize=1 SrcRegion=<0;1,0> DstRegion=<1> Type=ub
+
+//r3
+//Normalised Ratio of Horizontal step size with main video for all layers
+
+
+ //Normalised Ratio of Horizontal step size with main video for all layers becomes
+ //Normalised Horizontal step size for all layers in VP_Setup.asm
+
+
+//r4
+//Normalised Vertical step size for all layers
+
+
+//r5
+//Normalised Vertical Frame Origin for all layers
+
+
+//r6
+//Normalised Horizontal Frame Origin for all layers
+
+
+//========== Inline Parameters (Explicit To Fast Composite) ==========
+
+
+//Main video Step X
+
+
+//====================== Binding table (Explicit To Fast Composite)=========================================
+
+
+//Used by Interlaced Scaling Kernels
+
+
+//========== Sampler State Table Index (Explicit To Fast Composite)==========
+//Sampler Index for AVS/IEF messages
+
+
+//Sampler Index for SIMD16 sampler messages
+
+
+//=============================================================================
+
+// Typed views of the six working buffers. Every view of buffer N uses the same base register:
+//   BUFFER_0=r64, BUFFER_1=r80, BUFFER_2=r96, BUFFER_3=r112, BUFFER_4=r28, BUFFER_5=r46.
+// Only the element type/size and the default regions differ between the families below.
+
+// Float (f) views.
+.declare fBUFFER_0 Base=r64.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_1 Base=r80.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_2 Base=r96.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_3 Base=r112.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_4 Base=r28.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_5 Base=r46.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+
+// Unsigned dword (ud) views.
+.declare udBUFFER_0 Base=r64.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_1 Base=r80.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_2 Base=r96.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_3 Base=r112.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_4 Base=r28.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_5 Base=r46.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+
+// Unsigned word (uw) views.
+.declare uwBUFFER_0 Base=r64.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_1 Base=r80.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_2 Base=r96.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_3 Base=r112.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_4 Base=r28.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_5 Base=r46.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+
+// Unsigned byte (ub) views, contiguous bytes (stride 1).
+.declare ubBUFFER_0 Base=r64.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_1 Base=r80.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_2 Base=r96.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_3 Base=r112.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_4 Base=r28.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_5 Base=r46.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+
+// Strided byte (ub4) views: source region <32;8,4> and destination stride <4> address
+// one byte out of every four, i.e. a single byte lane of each dword.
+.declare ub4BUFFER_0 Base=r64.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_1 Base=r80.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_2 Base=r96.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_3 Base=r112.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_4 Base=r28.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_5 Base=r46.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+
+//Pointer to mask reg
+
+
+//r18
+
+
+//Always keep Channel Pointers and Offsets in same GRF, so that we can use
+// NODDCLR, NODDCHK flags. -rT
+
+
+// Dword view of r18, named as the first CSC coefficient register.
+.declare udCSC_COEFF_0 Base=r18.0 ElementSize=4 Type=ud // 1 GRF
+
+//r19
+
+
+// Dword view of r19, named as the second CSC coefficient register.
+.declare udCSC_COEFF_1 Base=r19.0 ElementSize=4 Type=ud // 1 GRF
+
+
+//r20
+
+// Word view of r20: temporary copy of the alpha mask (16 contiguous words).
+.declare uwALPHA_MASK_REG_TEMP Base=r20.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw // 1 GRF
+
+//r21
+
+// Word view of r21: the alpha mask register itself (16 contiguous words).
+.declare uwALPHA_MASK_REG Base=r21.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw // 1 GRF
+
+//r22
+
+
+//Always keep Channel Pointers and Offsets in same GRF, so that we can use
+// NODDCLR, NODDCHK flags. -rT
+
+
+//Keep fORIGIN_X_NLAS, fY_OFFSET_2ND_BLOCK, fSTEP_X_NLAS, pMSGDSC_COPY, ubCONST_ALPHA_COPY as
+//sub registers of same GRF to enable using NODDCLR NODDCHK. -rT
+
+//r23
+
+
+//Lumakey
+
+
+//r24
+
+
+//r25
+
+
+//r26
+
+
+//defines to generate LABELS during compile time.
+
+
+ // Message Header
+ // m0.7 31:0 Debug
+ // m0.6 31:0 Debug
+ // m0.5 31:0 Ignored
+ // m0.4 31:0 Ignored
+ // m0.3 31:0 Ignored
+ // m0.2 31:16 Ignored
+ // 15 Alpha Write Channel Mask enable=0, disable=1
+ // 14 Blue Write Channel Mask (U)
+ // 13 Green Write Channel Mask (Y)
+ // 12 Red Write Channel Mask (V)
+ // 11:0 Ignored
+ // m0.1 Ignored
+ // m0.0 Ignored
+
+
+ // AVS payload
+ // m1.7 Group ID Number
+ // m1.6 U 2nd Derivative ---> NLAS dx
+ // m1.5 Delta V ---> Step Y
+ // m1.4 Delta U ---> Step X
+ // m1.3 Pixel 0 V Address ---> ORIY (Y0)
+ // m1.2 Pixel 0 U Address ---> ORIX (X0)
+ // m1.1 Vertical Block Number
+ // m1.0 Reserved
+
+ // Sampler Message Descriptor
+ // 31:29 Reserved 000
+ // 28:25 Message length 0010
+ // 24:20 Response length xxxxx ---> 4GRFs for each enabled channel (AVS), 2GRFs for each enabled channel (sample unorm)
+ // 19 Header Present 1
+ // 18:17 SIMD Mode 11 ---> SIMD32/64
+ // 16:12 Message Type xxxxx ---> 01011 sample_8x8, 01100 (sample_unorm), 01010 (sample_unorm+killpix)
+ // 11:8 Sampler Index xxxx
+ // 7:0 Binding Table Index xxxxxxxx
+
+
+ // Msg Header M0.2
+ // 15:15 Alpha Write Channel Mask, 0: written back, 1: not written back
+ // 14:14 Blue Write Channel Mask
+ // 13:13 Green Write Channel Mask
+ // 12:12 Red Write Channel Mask
+
+
+//By design, Buffer 0,1,2,3 always have Layer 0 and Buffer 4,5 always have L1-L7
+
+
+//used to generate LABELS at compile time.
+
+
+ // 18:17 SIMD Mode 10 ---> SIMD16
+ // 16:12 Message Type xxxxx ---> 00000 (SIMD16)
+
+
+//r10-17 - 8 GRFs to load SIMD16 data (up to 4 channels)
+//r18-19 - 2 GRFs to store sampler ramp.
+// NOTE(review): the two lines above do not match the declarations below, which base the
+// payload at r14 and the temp/sampler ramp at r9 (r18/r19 are declared as udCSC_COEFF_0/1
+// earlier in this file) - confirm which register map is current.
+
+ // Float / word / dword / byte views of the scaling message payload, all based at r14.
+ // NOTE(review): the ud and ub names read "CALING" where the f and uw names read
+ // "SCALING" - presumably a dropped 'S'; check for users before renaming.
+ .declare mfSCALING_0X_34X_PAYLOAD Base=r14.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+ .declare muwSCALING_0X_34X_PAYLOAD Base=r14.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+ .declare mudCALING_0X_34X_PAYLOAD Base=r14.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+ .declare mubCALING_0X_34X_PAYLOAD Base=r14.0 ElementSize=1 SrcRegion=<32;32,1> DstRegion=<1> Type=ub
+
+
+ // Float / dword / strided-byte / word views of a temporary area based at r9.
+ // NOTE(review): ub4SCALING_0X_34X_TEMP pairs a stride-4 source region with DstRegion=<1>,
+ // whereas the ub4BUFFER_* views use DstRegion=<4> - confirm this is intended.
+ .declare fSCALING_0X_34X_TEMP Base=r9.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+ .declare udSCALING_0X_34X_TEMP Base=r9.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+ .declare ub4SCALING_0X_34X_TEMP Base=r9.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<1> Type=ub
+ .declare uwSCALING_0X_34X_TEMP Base=r9.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+
+ // Sampler ramp is used for Scaling 0X_0.34X
+ // It shares base register r9 with the *_SCALING_0X_34X_TEMP views above.
+ .declare fSAMPLER_RAMP Base=r9.0 ElementSize=4 SrcRegion=<8;8,1> Type=f // 1 GRFs, 8 elements
+
+
+ //#define rMSGDSC_UV r23.0
+
+
+//End of _SCALING_
+
+
+ //Check if layer is to be skipped
+
+
+ // Kernel body: issues two sampler messages that load PL2 Y data and then UV data for
+ // vertical block 2 into Buffer 2, unless the layer is flagged to be skipped.
+ // f0.1 pre-computed in Set_Layer_0
+ // Inverted predicate: jump over both loads when f0.1 is clear.
+ (-f0.1) jmpi (1) SKIP_AVS_LOAD_L0_2_
+
+
+ //AVS_PAYLOAD already has all the data loaded at this point
+ // 0x44EB400 decodes (per the Sampler Message Descriptor layout documented above) as:
+ // message length 2, response length 4, header present, SIMD mode 11b,
+ // message type 01011b (sample_8x8), sampler index 4, binding table index 0.
+ // NOTE(review): r23.5 presumably carries the per-layer descriptor bits - confirm.
+ add (1) a0.0:ud r23.5<0;1,0>:ud 0x44EB400:ud //msg desc
+
+ mov (1) r16.2:ud 0x0000D000:ud // Channel mask 1101b (1 = not written): only Green (Y) is written back
+
+
+ // set the vertical block number
+
+
+ add (1) r25.1<1>:ud r7.12:uw 2:ud
+
+
+ mov (8) r17.0:ud r25.0<8;8,1>:ud // Copy msg payload mirrors to MRFs
+
+ // Header is r16, payload r17; the response lands at the start of uwBUFFER_2.
+ send (1) uwBUFFER_2(0)<1> r16 0x2 a0.0:ud
+ // Returns Y data in 4 GRFs in scrambled order
+
+ // 0x48EB801: same message type, response length 8, sampler index 8, binding table index 1.
+ add (1) a0.0:ud r23.5<0;1,0>:ud 0x48EB801:ud // msg desc; 1 is added to change BI to UV
+ mov (1) r16.2:ud 0x0000A000:ud // Channel mask 1010b (1 = not written): Red (V) and Blue (U) are written back
+
+ // NOTE(review): (4) is presumably a register offset placing UV after the 4 Y GRFs - confirm.
+ send (1) uwBUFFER_2(4)<1> r16 0x2 a0.0:ud
+ // Returns UV data in 8 GRFs in scrambled order
+
+SKIP_AVS_LOAD_L0_2_:
+ nop
+
+
--- /dev/null
+// 10 // Total instruction count
+// 1 // Total kernel count
+
+
+// Module name: common.inc
+//
+// Common header file for all Video-Processing kernels
+//
+
+// Assembler-wide defaults for this kernel.
+// NOTE(review): presumably instructions that omit a width or type fall back to
+// 16 channels and unsigned byte - confirm against the assembler documentation.
+.default_execution_size (16)
+.default_register_type :ub
+
+// Register budget: 128 registers in total, 7 of them counted as payload.
+// NOTE(review): 7 presumably covers r0 plus the static parameters r1-r6 listed in the
+// "GRF partition" comment later in this file - confirm.
+.reg_count_total 128
+.reg_count_payload 7
+
+//========== Common constants ==========
+
+
+//========== Macros ==========
+
+
+//Fast Jump, For more details see "Set_Layer_N.asm"
+
+
+//========== Defines ====================
+
+//========== Static Parameters (Common To All) ==========
+//r1
+
+
+//r2
+
+ // e.g. byte0 byte1 byte2
+ // YUYV 0 1 3
+ // YVYU 0 3 1
+
+//Color Pipe (IECP) parameters
+
+
+//r4
+
+ // e.g. byte0 byte1 byte2
+ // YUYV 0 1 3
+ // YVYU 0 3 1
+
+
+//========== Inline parameters (Common To All) ===========
+
+
+//============== Binding Index Table===========
+//Common between DNDI and DNUV
+
+
+//================= Common Message Descriptor =====
+// Message descriptor for thread spawning
+// Message Descriptors
+// = 000 0001 (min message len 1 ) 0,0000 (resp len 0 -add later)
+// 0000,0000,0000
+// 0001(Spawn a root thread),0001 (Root thread spawn thread)
+// = 0x02000011
+// Thread Spawner Message Descriptor
+
+
+// Message descriptor for atomic operation add
+// Message Descriptors
+// = 000 0110 (min message len 6 ) 0,0000 (resp len 0 -add later)
+// 1(header present)001,10(typed atomic operation)0(return enabled)0(slot group, low 8 bits),0111 (AOP_Add)
+// 0000,0000 (Binding table index, added later)
+// = 0x0C098700
+
+// Atomic Operation Add Message Descriptor
+
+
+// Message descriptor for dataport media write
+ // Message Descriptors
+ // = 000 0001 (min message len 1 - add later) 00000 (resp len 0)
+ // 1 (header present 1) 0 1010 (media block write) 000000
+ // 00000000 (binding table index - set later)
+ // = 0x020A8000
+
+
+// Message Length defines
+
+
+// Response Length defines
+
+
+// Block Width and Height Size defines
+
+
+// Extended Message Descriptors
+
+
+// Common message descriptors:
+
+
+//===================== Math Function Control ===================================
+
+
+//============ Message Registers ===============
+ // buf4 starts from r28
+
+
+//#define mMSGHDR_EOT r43 // Dummy Message Register for EOT
+
+
+// Byte / word / dword / float views of the message payload area, all based at r30.
+// NOTE(review): r30 lies between the BUFFER_4 base (r28) and the BUFFER_5 base (r46)
+// declared later in this file, so this area appears to alias buffer 4 - confirm intended.
+.declare mubMSGPAYLOAD Base=r30 ElementSize=1 SrcRegion=<16;16,1> Type=ub
+.declare muwMSGPAYLOAD Base=r30 ElementSize=2 SrcRegion=<16;16,1> Type=uw
+.declare mudMSGPAYLOAD Base=r30 ElementSize=4 SrcRegion=<8;8,1> Type=ud
+.declare mfMSGPAYLOAD Base=r30 ElementSize=4 SrcRegion=<8;8,1> Type=f
+
+//=================== End of thread instruction ===========================
+
+
+//=====================Pointers Used=====================================
+
+
+//=======================================================================
+
+
+//r9-r17
+// Define temp space for any usages
+
+
+// Common Buffers
+
+
+// End of common.inc
+
+// FileName: PL2_AVS_Buf_3.asm
+// Author: Tatiya, Rupesh
+// Description: Loads 8x8 AVS/IEF PL2 data into Buffer 3
+
+
+// FileName : PL2_AVS_Buf.asm
+// Author : Tatiya, Rupesh
+// Description : Loads 8x8 AVS/IEF PL2 data into Buffer N
+
+
+// Module name: Scaling.inc
+
+
+// Description: Includes all definitions explicit to Fast Composite.
+
+
+// End of common.inc
+
+
+//========== GRF partition ==========
+ // r0 header : r0 (1 GRF)
+ // Static parameters : r1 - r6 (6 GRFS)
+ // Inline parameters : r7 - r8 (2 GRFs)
+ // MSGSRC : r27 (1 GRF)
+//===================================
+
+//Interface:
+//========== Static Parameters (Explicit To Fast Composite) ==========
+//r1
+//CSC Set 0
+
+
+// Dword view of static-parameter register r1 (listed under "CSC Set 0" above).
+.declare udCSC_CURBE Base=r1.0 ElementSize=4 Type=ud
+
+//Constant alpha
+
+
+//r2
+
+
+// WiDi Definitions
+
+
+//Colorfill
+
+
+// Byte view starting at r2.20. The <0;1,0> source region has zero stride, so a read
+// replicates that single byte across all channels.
+.declare ubCOLOR_PIXEL_VAL Base=r2.20 ElementSize=1 SrcRegion=<0;1,0> DstRegion=<1> Type=ub
+
+//r3
+//Normalised Ratio of Horizontal step size with main video for all layers
+
+
+ //Normalised Ratio of Horizontal step size with main video for all layers becomes
+ //Normalised Horizontal step size for all layers in VP_Setup.asm
+
+
+//r4
+//Normalised Vertical step size for all layers
+
+
+//r5
+//Normalised Vertical Frame Origin for all layers
+
+
+//r6
+//Normalised Horizontal Frame Origin for all layers
+
+
+//========== Inline Parameters (Explicit To Fast Composite) ==========
+
+
+//Main video Step X
+
+
+//====================== Binding table (Explicit To Fast Composite)=========================================
+
+
+//Used by Interlaced Scaling Kernels
+
+
+//========== Sampler State Table Index (Explicit To Fast Composite)==========
+//Sampler Index for AVS/IEF messages
+
+
+//Sampler Index for SIMD16 sampler messages
+
+
+//=============================================================================
+
+// Typed views of the six working buffers. Every view of buffer N uses the same base register:
+//   BUFFER_0=r64, BUFFER_1=r80, BUFFER_2=r96, BUFFER_3=r112, BUFFER_4=r28, BUFFER_5=r46.
+// Only the element type/size and the default regions differ between the families below.
+
+// Float (f) views.
+.declare fBUFFER_0 Base=r64.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_1 Base=r80.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_2 Base=r96.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_3 Base=r112.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_4 Base=r28.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_5 Base=r46.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+
+// Unsigned dword (ud) views.
+.declare udBUFFER_0 Base=r64.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_1 Base=r80.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_2 Base=r96.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_3 Base=r112.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_4 Base=r28.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_5 Base=r46.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+
+// Unsigned word (uw) views.
+.declare uwBUFFER_0 Base=r64.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_1 Base=r80.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_2 Base=r96.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_3 Base=r112.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_4 Base=r28.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_5 Base=r46.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+
+// Unsigned byte (ub) views, contiguous bytes (stride 1).
+.declare ubBUFFER_0 Base=r64.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_1 Base=r80.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_2 Base=r96.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_3 Base=r112.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_4 Base=r28.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_5 Base=r46.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+
+// Strided byte (ub4) views: source region <32;8,4> and destination stride <4> address
+// one byte out of every four, i.e. a single byte lane of each dword.
+.declare ub4BUFFER_0 Base=r64.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_1 Base=r80.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_2 Base=r96.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_3 Base=r112.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_4 Base=r28.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_5 Base=r46.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+
+//Pointer to mask reg
+
+
+//r18
+
+
+//Always keep Channel Pointers and Offsets in same GRF, so that we can use
+// NODDCLR, NODDCHK flags. -rT
+
+
+// Dword view of r18, named as the first CSC coefficient register.
+.declare udCSC_COEFF_0 Base=r18.0 ElementSize=4 Type=ud // 1 GRF
+
+//r19
+
+
+// Dword view of r19, named as the second CSC coefficient register.
+.declare udCSC_COEFF_1 Base=r19.0 ElementSize=4 Type=ud // 1 GRF
+
+
+//r20
+
+// Word view of r20: temporary copy of the alpha mask (16 contiguous words).
+.declare uwALPHA_MASK_REG_TEMP Base=r20.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw // 1 GRF
+
+//r21
+
+// Word view of r21: the alpha mask register itself (16 contiguous words).
+.declare uwALPHA_MASK_REG Base=r21.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw // 1 GRF
+
+//r22
+
+
+//Always keep Channel Pointers and Offsets in same GRF, so that we can use
+// NODDCLR, NODDCHK flags. -rT
+
+
+//Keep fORIGIN_X_NLAS, fY_OFFSET_2ND_BLOCK, fSTEP_X_NLAS, pMSGDSC_COPY, ubCONST_ALPHA_COPY as
+//sub registers of same GRF to enable using NODDCLR NODDCHK. -rT
+
+//r23
+
+
+//Lumakey
+
+
+//r24
+
+
+//r25
+
+
+//r26
+
+
+//defines to generate LABELS during compile time.
+
+
+ // Message Header
+ // m0.7 31:0 Debug
+ // m0.6 31:0 Debug
+ // m0.5 31:0 Ignored
+ // m0.4 31:0 Ignored
+ // m0.3 31:0 Ignored
+ // m0.2 31:16 Ignored
+ // 15 Alpha Write Channel Mask enable=0, disable=1
+ // 14 Blue Write Channel Mask (U)
+ // 13 Green Write Channel Mask (Y)
+ // 12 Red Write Channel Mask (V)
+ // 11:0 Ignored
+ // m0.1 Ignored
+ // m0.0 Ignored
+
+
+ // AVS payload
+ // m1.7 Group ID Number
+ // m1.6 U 2nd Derivative ---> NLAS dx
+ // m1.5 Delta V ---> Step Y
+ // m1.4 Delta U ---> Step X
+ // m1.3 Pixel 0 V Address ---> ORIY (Y0)
+ // m1.2 Pixel 0 U Address ---> ORIX (X0)
+ // m1.1 Vertical Block Number
+ // m1.0 Reserved
+
+ // Sampler Message Descriptor
+ // 31:29 Reserved 000
+ // 28:25 Message length 0010
+ // 24:20 Response length xxxxx ---> 4GRFs for each enabled channel (AVS), 2GRFs for each enabled channel (sample unorm)
+ // 19 Header Present 1
+ // 18:17 SIMD Mode 11 ---> SIMD32/64
+ // 16:12 Message Type xxxxx ---> 01011 sample_8x8, 01100 (sample_unorm), 01010 (sample_unorm+killpix)
+ // 11:8 Sampler Index xxxx
+ // 7:0 Binding Table Index xxxxxxxx
+
+
+ // Msg Header M0.2
+ // 15:15 Alpha Write Channel Mask, 0: written back, 1: not written back
+ // 14:14 Blue Write Channel Mask
+ // 13:13 Green Write Channel Mask
+ // 12:12 Red Write Channel Mask
+
+
+//By design, Buffer 0,1,2,3 always have Layer 0 and Buffer 4,5 always have L1-L7
+
+
+//used to generate LABELS at compile time.
+
+
+ // 18:17 SIMD Mode 10 ---> SIMD16
+ // 16:12 Message Type xxxxx ---> 00000 (SIMD16)
+
+
+//r10-17 - 8 GRFs to load SIMD16 data (up to 4 channels)
+//r18-19 - 2 GRFs to store sampler ramp.
+// NOTE(review): the two lines above do not match the declarations below, which base the
+// payload at r14 and the temp/sampler ramp at r9 (r18/r19 are declared as udCSC_COEFF_0/1
+// earlier in this file) - confirm which register map is current.
+
+ // Float / word / dword / byte views of the scaling message payload, all based at r14.
+ // NOTE(review): the ud and ub names read "CALING" where the f and uw names read
+ // "SCALING" - presumably a dropped 'S'; check for users before renaming.
+ .declare mfSCALING_0X_34X_PAYLOAD Base=r14.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+ .declare muwSCALING_0X_34X_PAYLOAD Base=r14.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+ .declare mudCALING_0X_34X_PAYLOAD Base=r14.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+ .declare mubCALING_0X_34X_PAYLOAD Base=r14.0 ElementSize=1 SrcRegion=<32;32,1> DstRegion=<1> Type=ub
+
+
+ // Float / dword / strided-byte / word views of a temporary area based at r9.
+ // NOTE(review): ub4SCALING_0X_34X_TEMP pairs a stride-4 source region with DstRegion=<1>,
+ // whereas the ub4BUFFER_* views use DstRegion=<4> - confirm this is intended.
+ .declare fSCALING_0X_34X_TEMP Base=r9.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+ .declare udSCALING_0X_34X_TEMP Base=r9.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+ .declare ub4SCALING_0X_34X_TEMP Base=r9.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<1> Type=ub
+ .declare uwSCALING_0X_34X_TEMP Base=r9.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+
+ // Sampler ramp is used for Scaling 0X_0.34X
+ // It shares base register r9 with the *_SCALING_0X_34X_TEMP views above.
+ .declare fSAMPLER_RAMP Base=r9.0 ElementSize=4 SrcRegion=<8;8,1> Type=f // 1 GRFs, 8 elements
+
+
+ //#define rMSGDSC_UV r23.0
+
+
+//End of _SCALING_
+
+
+ //Check if layer is to be skipped
+
+
+ // Kernel body: issues two sampler messages that load PL2 Y data and then UV data for
+ // vertical block 3 into Buffer 3, unless the layer is flagged to be skipped.
+ // f0.1 pre-computed in Set_Layer_0
+ // Inverted predicate: jump over both loads when f0.1 is clear.
+ (-f0.1) jmpi (1) SKIP_AVS_LOAD_L0_3_
+
+
+ //AVS_PAYLOAD already has all the data loaded at this point
+ // 0x44EB400 decodes (per the Sampler Message Descriptor layout documented above) as:
+ // message length 2, response length 4, header present, SIMD mode 11b,
+ // message type 01011b (sample_8x8), sampler index 4, binding table index 0.
+ // NOTE(review): r23.5 presumably carries the per-layer descriptor bits - confirm.
+ add (1) a0.0:ud r23.5<0;1,0>:ud 0x44EB400:ud //msg desc
+
+ mov (1) r16.2:ud 0x0000D000:ud // Channel mask 1101b (1 = not written): only Green (Y) is written back
+
+
+ // set the vertical block number
+
+
+ add (1) r25.1<1>:ud r7.12:uw 3:ud
+
+
+ mov (8) r17.0:ud r25.0<8;8,1>:ud // Copy msg payload mirrors to MRFs
+
+ // Header is r16, payload r17; the response lands at the start of uwBUFFER_3.
+ send (1) uwBUFFER_3(0)<1> r16 0x2 a0.0:ud
+ // Returns Y data in 4 GRFs in scrambled order
+
+ // 0x48EB801: same message type, response length 8, sampler index 8, binding table index 1.
+ add (1) a0.0:ud r23.5<0;1,0>:ud 0x48EB801:ud // msg desc; 1 is added to change BI to UV
+ mov (1) r16.2:ud 0x0000A000:ud // Channel mask 1010b (1 = not written): Red (V) and Blue (U) are written back
+
+ // NOTE(review): (4) is presumably a register offset placing UV after the 4 Y GRFs - confirm.
+ send (1) uwBUFFER_3(4)<1> r16 0x2 a0.0:ud
+ // Returns UV data in 8 GRFs in scrambled order
+
+SKIP_AVS_LOAD_L0_3_:
+ nop
+
+
--- /dev/null
+// 15 // Total instruction count
+// 1 // Total kernel count
+
+
+// Module name: common.inc
+//
+// Common header file for all Video-Processing kernels
+//
+
+// Assembler-wide defaults for this kernel.
+// NOTE(review): presumably instructions that omit a width or type fall back to
+// 16 channels and unsigned byte - confirm against the assembler documentation.
+.default_execution_size (16)
+.default_register_type :ub
+
+// Register budget: 128 registers in total, 7 of them counted as payload.
+// NOTE(review): 7 presumably covers r0 plus the static parameters r1-r6 listed in the
+// "GRF partition" comment later in this file - confirm.
+.reg_count_total 128
+.reg_count_payload 7
+
+//========== Common constants ==========
+
+
+//========== Macros ==========
+
+
+//Fast Jump, For more details see "Set_Layer_N.asm"
+
+
+//========== Defines ====================
+
+//========== Static Parameters (Common To All) ==========
+//r1
+
+
+//r2
+
+ // e.g. byte0 byte1 byte2
+ // YUYV 0 1 3
+ // YVYU 0 3 1
+
+//Color Pipe (IECP) parameters
+
+
+//r4
+
+ // e.g. byte0 byte1 byte2
+ // YUYV 0 1 3
+ // YVYU 0 3 1
+
+
+//========== Inline parameters (Common To All) ===========
+
+
+//============== Binding Index Table===========
+//Common between DNDI and DNUV
+
+
+//================= Common Message Descriptor =====
+// Message descriptor for thread spawning
+// Message Descriptors
+// = 000 0001 (min message len 1 ) 0,0000 (resp len 0 -add later)
+// 0000,0000,0000
+// 0001(Spawn a root thread),0001 (Root thread spawn thread)
+// = 0x02000011
+// Thread Spawner Message Descriptor
+
+
+// Message descriptor for atomic operation add
+// Message Descriptors
+// = 000 0110 (min message len 6 ) 0,0000 (resp len 0 -add later)
+// 1(header present)001,10(typed atomic operation)0(return enabled)0(slot group, low 8 bits),0111 (AOP_Add)
+// 0000,0000 (Binding table index, added later)
+// = 0x0C098700
+
+// Atomic Operation Add Message Descriptor
+
+
+// Message descriptor for dataport media write
+ // Message Descriptors
+ // = 000 0001 (min message len 1 - add later) 00000 (resp len 0)
+ // 1 (header present 1) 0 1010 (media block write) 000000
+ // 00000000 (binding table index - set later)
+ // = 0x020A8000
+
+
+// Message Length defines
+
+
+// Response Length defines
+
+
+// Block Width and Height Size defines
+
+
+// Extended Message Descriptors
+
+
+// Common message descriptors:
+
+
+//===================== Math Function Control ===================================
+
+
+//============ Message Registers ===============
+ // buf4 starts from r28
+
+
+//#define mMSGHDR_EOT r43 // Dummy Message Register for EOT
+
+
+// Byte / word / dword / float views of the message payload area, all based at r30.
+// NOTE(review): r30 lies between the BUFFER_4 base (r28) and the BUFFER_5 base (r46)
+// declared later in this file, so this area appears to alias buffer 4 - confirm intended.
+.declare mubMSGPAYLOAD Base=r30 ElementSize=1 SrcRegion=<16;16,1> Type=ub
+.declare muwMSGPAYLOAD Base=r30 ElementSize=2 SrcRegion=<16;16,1> Type=uw
+.declare mudMSGPAYLOAD Base=r30 ElementSize=4 SrcRegion=<8;8,1> Type=ud
+.declare mfMSGPAYLOAD Base=r30 ElementSize=4 SrcRegion=<8;8,1> Type=f
+
+//=================== End of thread instruction ===========================
+
+
+//=====================Pointers Used=====================================
+
+
+//=======================================================================
+
+
+//r9-r17
+// Define temp space for any usages
+
+
+// Common Buffers
+
+
+// End of common.inc
+
+// FileName: PL3_AVS_Buf_0.asm
+// Author: Tatiya, Rupesh
+// Description: Loads 8x8 AVS/IEF PL3 data into Buffer 0
+
+
+// FileName : PL3_AVS_Buf.asm
+// Author : Tatiya, Rupesh
+// Description : Loads 8x8 AVS/IEF PL3 data into Buffer N
+
+
+// Module name: Scaling.inc
+
+
+// Description: Includes all definitions explicit to Fast Composite.
+
+
+// End of common.inc
+
+
+//========== GRF partition ==========
+ // r0 header : r0 (1 GRF)
+ // Static parameters : r1 - r6 (6 GRFS)
+ // Inline parameters : r7 - r8 (2 GRFs)
+ // MSGSRC : r27 (1 GRF)
+//===================================
+
+//Interface:
+//========== Static Parameters (Explicit To Fast Composite) ==========
+//r1
+//CSC Set 0
+
+
+// Dword view of static-parameter register r1 (listed under "CSC Set 0" above).
+.declare udCSC_CURBE Base=r1.0 ElementSize=4 Type=ud
+
+//Constant alpha
+
+
+//r2
+
+
+// WiDi Definitions
+
+
+//Colorfill
+
+
+// Byte view starting at r2.20. The <0;1,0> source region has zero stride, so a read
+// replicates that single byte across all channels.
+.declare ubCOLOR_PIXEL_VAL Base=r2.20 ElementSize=1 SrcRegion=<0;1,0> DstRegion=<1> Type=ub
+
+//r3
+//Normalised Ratio of Horizontal step size with main video for all layers
+
+
+ //Normalised Ratio of Horizontal step size with main video for all layers becomes
+ //Normalised Horizontal step size for all layers in VP_Setup.asm
+
+
+//r4
+//Normalised Vertical step size for all layers
+
+
+//r5
+//Normalised Vertical Frame Origin for all layers
+
+
+//r6
+//Normalised Horizontal Frame Origin for all layers
+
+
+//========== Inline Parameters (Explicit To Fast Composite) ==========
+
+
+//Main video Step X
+
+
+//====================== Binding table (Explicit To Fast Composite)=========================================
+
+
+//Used by Interlaced Scaling Kernels
+
+
+//========== Sampler State Table Index (Explicit To Fast Composite)==========
+//Sampler Index for AVS/IEF messages
+
+
+//Sampler Index for SIMD16 sampler messages
+
+
+//=============================================================================
+
+// Typed aliases for the six working buffers. Every group below declares the same six
+// base registers with a different element type: BUFFER_0..BUFFER_3 start at r64, r80,
+// r96 and r112; BUFFER_4 and BUFFER_5 start at r28 and r46.
+// f: 4-byte float elements, source region <8;8,1>.
+.declare fBUFFER_0 Base=r64.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_1 Base=r80.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_2 Base=r96.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_3 Base=r112.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_4 Base=r28.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_5 Base=r46.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+
+// ud: 4-byte unsigned elements, source region <8;8,1>.
+.declare udBUFFER_0 Base=r64.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_1 Base=r80.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_2 Base=r96.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_3 Base=r112.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_4 Base=r28.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_5 Base=r46.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+
+// uw: 2-byte unsigned elements, source region <16;16,1>.
+.declare uwBUFFER_0 Base=r64.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_1 Base=r80.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_2 Base=r96.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_3 Base=r112.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_4 Base=r28.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_5 Base=r46.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+
+// ub: 1-byte unsigned elements, source region <16;16,1>.
+.declare ubBUFFER_0 Base=r64.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_1 Base=r80.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_2 Base=r96.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_3 Base=r112.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_4 Base=r28.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_5 Base=r46.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+
+// ub4: 1-byte unsigned elements accessed with a stride of 4
+// (SrcRegion=<32;8,4>, DstRegion=<4>).
+.declare ub4BUFFER_0 Base=r64.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_1 Base=r80.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_2 Base=r96.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_3 Base=r112.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_4 Base=r28.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_5 Base=r46.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+
+//Pointer to mask reg
+
+
+//r18
+
+
+//Always keep Channel Pointers and Offsets in the same GRF, so that we can use
+// NODDCLR, NODDCHK flags. -rT
+
+
+// Working registers r18-r21, one GRF each: two unsigned-dword CSC coefficient
+// registers (r18, r19) followed by two unsigned-word alpha-mask registers (r20, r21).
+.declare udCSC_COEFF_0 Base=r18.0 ElementSize=4 Type=ud // 1 GRF
+
+//r19
+
+
+.declare udCSC_COEFF_1 Base=r19.0 ElementSize=4 Type=ud // 1 GRF
+
+
+//r20
+
+// Unsigned-word view of r20, source region <16;16,1>.
+.declare uwALPHA_MASK_REG_TEMP Base=r20.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw // 1 GRF
+
+//r21
+
+// Unsigned-word view of r21, source region <16;16,1>.
+.declare uwALPHA_MASK_REG Base=r21.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw // 1 GRF
+
+//r22
+
+
+//Always keep Channel Pointers and Offsets in the same GRF, so that we can use
+// NODDCLR, NODDCHK flags. -rT
+
+
+//Keep fORIGIN_X_NLAS, fY_OFFSET_2ND_BLOCK, fSTEP_X_NLAS, pMSGDSC_COPY, ubCONST_ALPHA_COPY as
+//sub registers of same GRF to enable using NODDCLR NODDCHK. -rT
+
+//r23
+
+
+//Lumakey
+
+
+//r24
+
+
+//r25
+
+
+//r26
+
+
+//defines to generate LABELS during compile time.
+
+
+ // Message Header
+ // m0.7 31:0 Debug
+ // m0.6 31:0 Debug
+ // m0.5 31:0 Ignored
+ // m0.4 31:0 Ignored
+ // m0.3 31:0 Ignored
+ // m0.2 31:16 Ignored
+ // 15 Alpha Write Channel Mask enable=0, disable=1
+ // 14 Blue Write Channel Mask (U)
+ // 13 Green Write Channel Mask (Y)
+ // 12 Red Write Channel Mask (V)
+ // 11:0 Ignored
+ // m0.1 Ignored
+ // m0.0 Ignored
+
+
+ // AVS payload
+ // m1.7 Group ID Number
+ // m1.6 U 2nd Derivative ---> NLAS dx
+ // m1.5 Delta V ---> Step Y
+ // m1.4 Delta U ---> Step X
+ // m1.3 Pixel 0 V Address ---> ORIY (Y0)
+ // m1.2 Pixel 0 U Address ---> ORIX (X0)
+ // m1.1 Vertical Block Number
+ // m1.0 Reserved
+
+ // Sampler Message Descriptor
+ // 31:29 Reserved 000
+ // 28:25 Message length 0010
+ // 24:20 Response length xxxxx ---> 4GRFs for each enabled channel (AVS), 2GRFs for each enabled channel (sample unorm)
+ // 19 Header Present 1
+ // 18:17 SIMD Mode 11 ---> SIMD32/64
+ // 16:12 Message Type xxxxx ---> 01011 sample_8x8, 01100 (sample_unorm), 01010 (sample_unorm+killpix)
+ // 11:8 Sampler Index xxxx
+ // 7:0 Binding Table Index xxxxxxxx
+
+
+ // Msg Header M0.2
+ // 15:15 Alpha Write Channel Mask, 0: written back, 1: not written back
+ // 14:14 Blue Write Channel Mask
+ // 13:13 Green Write Channel Mask
+ // 12:12 Red Write Channel Mask
+
+
+//By design, Buffer 0,1,2,3 always have Layer 0 and Buffer 4,5 always have L1-L7
+
+
+//used to generate LABELS at compile time.
+
+
+ // 18:17 SIMD Mode 10 ---> SIMD16
+ // 16:12 Message Type xxxxx ---> 00000 (SIMD16)
+
+
+//r10-17 - 8 GRFs to load SIMD16 data (up to 4 channels)
+//r18-19 - 2 GRFs to store sampler ramp.
+// NOTE(review): fSAMPLER_RAMP below is declared at r9, and r18/r19 are declared as
+// udCSC_COEFF_0/1 in this file, so the "r18-19" line above looks stale - confirm.
+
+ // Four typed views (f, uw, ud, ub) of the scaling message payload; all start at r14.0.
+ // NOTE(review): "mudCALING"/"mubCALING" look like typos for "mudSCALING"/"mubSCALING".
+ // The names are left untouched because references may exist outside this file.
+ .declare mfSCALING_0X_34X_PAYLOAD Base=r14.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+ .declare muwSCALING_0X_34X_PAYLOAD Base=r14.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+ .declare mudCALING_0X_34X_PAYLOAD Base=r14.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+ .declare mubCALING_0X_34X_PAYLOAD Base=r14.0 ElementSize=1 SrcRegion=<32;32,1> DstRegion=<1> Type=ub
+
+
+ // Four typed views (f, ud, ub4, uw) of the scaling temp area; all start at r9.0.
+ // NOTE(review): ub4SCALING_0X_34X_TEMP uses DstRegion=<1> while the ub4BUFFER_N
+ // aliases use DstRegion=<4> - confirm this difference is intended.
+ .declare fSCALING_0X_34X_TEMP Base=r9.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+ .declare udSCALING_0X_34X_TEMP Base=r9.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+ .declare ub4SCALING_0X_34X_TEMP Base=r9.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<1> Type=ub
+ .declare uwSCALING_0X_34X_TEMP Base=r9.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+
+ // Sampler ramp is used for Scaling 0X_0.34X
+ // It shares its base register (r9.0) with the *_TEMP aliases above.
+ .declare fSAMPLER_RAMP Base=r9.0 ElementSize=4 SrcRegion=<8;8,1> Type=f // 1 GRF, 8 elements
+
+
+ //#define rMSGDSC_UV r23.0
+
+
+//End of _SCALING_
+
+
+ // PL3_AVS_Buf_0 body: issues three sampler messages (Y, then U, then V) whose
+ // responses are written to uwBUFFER_0 at register offsets 0, 4 and 8.
+ // All three sends are skipped when f0.1 is not set.
+ //
+ //NOTE: We need offsets for second half of LAYER 0 - even if we do not load it.
+ //Update the channel offset in the buffers for the lower 8x4 data for BUFFER_0.
+ mov (1) r22.4<1>:ud 0x400040:ud
+
+
+ //Check if layer is to be skipped
+
+
+ // f0.1 pre-computed in Set_Layer_0
+ // The predicate is negated: jump over the loads when f0.1 is clear.
+ (-f0.1) jmpi (1) SKIP_AVS_LOAD_L0_0_
+
+
+ //AVS_PAYLOAD already has all the data loaded at this point
+ // a0.0 = r23.5 + immediate descriptor. Decoded with the "Sampler Message Descriptor"
+ // layout documented earlier in this file, 0x44EB400 is: message length 2, response
+ // length 4, header present, SIMD mode 11, message type 01011 (sample_8x8),
+ // sampler index 4, binding table index 0.
+ add (1) a0.0:ud r23.5<0;1,0>:ud 0x44EB400:ud //msg desc
+
+ mov (1) r16.2:ud 0x0000D000:ud // Bits 15, 14 and 12 set (masked off), bit 13 clear: only the Green (Y) channel is written back
+
+
+ // r25 is copied into r17 (second message register) below. Per the "AVS payload"
+ // layout, dword 7 is the Group ID Number and dword 1 the Vertical Block Number.
+ // The two writes target different dwords of r25 and are paired NoDDClr / NoDDChk.
+ mov (1) r25.7<1>:ud r7.7:ud { NoDDClr }
+ mov (1) r25.1<1>:ud r7.12:uw { NoDDChk }
+
+
+ // set the vertical block number
+ // (done by the r25.1 write above; buffer 0 uses r7.12:uw with no offset added)
+
+
+ mov (8) r17.0:ud r25.0<8;8,1>:ud // Copy msg payload mirrors to MRFs
+
+ // Message starts at r16 (header in r16, payload in r17), descriptor taken from a0.0.
+ // NOTE(review): 0x2 is presumably the sampler shared-function ID - confirm against the PRM.
+ send (1) uwBUFFER_0(0)<1> r16 0x2 a0.0:ud
+ // Returns Y data in 4 GRFs in scrambled order
+
+ add (1) a0.0:ud r23.5<0;1,0>:ud 0x44EB801:ud // msg desc; same fields as the Y descriptor except sampler index 8 and binding table index 1 (U)
+ mov (1) r16.2:ud 0x0000E000:ud // Bits 15:13 set (masked off), bit 12 clear: only the Red channel is written back
+
+ send (1) uwBUFFER_0(4)<1> r16 0x2 a0.0:ud
+ // Returns U data in 4 GRFs in scrambled order
+
+ add (1) a0.0:ud r23.5<0;1,0>:ud 0x44EBC02:ud // msg desc; same fields as the Y descriptor except sampler index 12 and binding table index 2 (V)
+ mov (1) r16.2:ud 0x0000E000:ud // Bits 15:13 set (masked off), bit 12 clear: only the Red channel is written back
+
+ send (1) uwBUFFER_0(8)<1> r16 0x2 a0.0:ud
+ // Returns V data in 4 GRFs in scrambled order
+
+// Target of the skip branch above.
+SKIP_AVS_LOAD_L0_0_:
+ nop
+
+
--- /dev/null
+// 13 // Total instruction count
+// 1 // Total kernel count
+
+
+// Module name: common.inc
+//
+// Common header file for all Video-Processing kernels
+//
+
+.default_execution_size (16)
+.default_register_type :ub
+
+.reg_count_total 128
+.reg_count_payload 7
+
+//========== Common constants ==========
+
+
+//========== Macros ==========
+
+
+//Fast Jump, For more details see "Set_Layer_N.asm"
+
+
+//========== Defines ====================
+
+//========== Static Parameters (Common To All) ==========
+//r1
+
+
+//r2
+
+ // e.g. byte0 byte1 byte2
+ // YUYV 0 1 3
+ // YVYU 0 3 1
+
+//Color Pipe (IECP) parameters
+
+
+//r4
+
+ // e.g. byte0 byte1 byte2
+ // YUYV 0 1 3
+ // YVYU 0 3 1
+
+
+//========== Inline parameters (Common To All) ===========
+
+
+//============== Binding Index Table===========
+//Common between DNDI and DNUV
+
+
+//================= Common Message Descriptor =====
+// Message descriptor for thread spawning
+// Message Descriptors
+// = 000 0001 (min message len 1 ) 0,0000 (resp len 0 -add later)
+// 0000,0000,0000
+// 0001(Spawn a root thread),0001 (Root thread spawn thread)
+// = 0x02000011
+// Thread Spawner Message Descriptor
+
+
+// Message descriptor for atomic operation add
+// Message Descriptors
+// = 000 0110 (min message len 6 ) 0,0000 (resp len 0 -add later)
+// 1(header present)001,10(typed atomic operation)0(return enabled)0(slot group, low 8 bits),0111 (AOP_Add)
+// 0000,0000 (Binding table index, added later)
+// = 0x0C098700
+
+// Atomic Operation Add Message Descriptor
+
+
+// Message descriptor for dataport media write
+ // Message Descriptors
+ // = 000 0001 (min message len 1 - add later) 00000 (resp len 0)
+ // 1 (header present 1) 0 1010 (media block write) 000000
+ // 00000000 (binding table index - set later)
+ // = 0x020A8000
+
+
+// Message Length defines
+
+
+// Response Length defines
+
+
+// Block Width and Height Size defines
+
+
+// Extended Message Descriptors
+
+
+// Common message descriptors:
+
+
+//===================== Math Function Control ===================================
+
+
+//============ Message Registers ===============
+ // buf4 starts from r28
+
+
+//#define mMSGHDR_EOT r43 // Dummy Message Register for EOT
+
+
+// Four typed views (ub, uw, ud, f) of the message payload area; all start at r30.
+.declare mubMSGPAYLOAD Base=r30 ElementSize=1 SrcRegion=<16;16,1> Type=ub
+.declare muwMSGPAYLOAD Base=r30 ElementSize=2 SrcRegion=<16;16,1> Type=uw
+.declare mudMSGPAYLOAD Base=r30 ElementSize=4 SrcRegion=<8;8,1> Type=ud
+.declare mfMSGPAYLOAD Base=r30 ElementSize=4 SrcRegion=<8;8,1> Type=f
+
+//=================== End of thread instruction ===========================
+
+
+//=====================Pointers Used=====================================
+
+
+//=======================================================================
+
+
+//r9-r17
+// Define temp space for any usages
+
+
+// Common Buffers
+
+
+// End of common.inc
+
+// FileName: PL3_AVS_Buf_1.asm
+// Author: Tatiya, Rupesh
+// Description: Loads 8x8 AVS/IEF PL3 data into Buffer 1
+
+
+// FileName : PL3_AVS_Buf.asm
+// Author : Tatiya, Rupesh
+// Description : Loads 8x8 AVS/IEF PL3 data into Buffer N
+
+
+// Module name: Scaling.inc
+
+
+// Description: Includes all definitions explicit to Fast Composite.
+
+
+// End of common.inc
+
+
+//========== GRF partition ==========
+ // r0 header : r0 (1 GRF)
+ // Static parameters : r1 - r6 (6 GRFS)
+ // Inline parameters : r7 - r8 (2 GRFs)
+ // MSGSRC : r27 (1 GRF)
+//===================================
+
+//Interface:
+//========== Static Parameters (Explicit To Fast Composite) ==========
+//r1
+//CSC Set 0
+
+
+// udCSC_CURBE: unsigned-dword view starting at r1.0 (r1 is the first static-parameter
+// register according to the GRF partition comment above).
+.declare udCSC_CURBE Base=r1.0 ElementSize=4 Type=ud
+
+//Constant alpha
+
+
+//r2
+
+
+// WiDi Definitions
+
+
+//Colorfill
+
+
+// ubCOLOR_PIXEL_VAL: unsigned-byte view at r2.20. Both source strides are zero
+// (SrcRegion=<0;1,0>), so a source read repeats that single byte.
+.declare ubCOLOR_PIXEL_VAL Base=r2.20 ElementSize=1 SrcRegion=<0;1,0> DstRegion=<1> Type=ub
+
+//r3
+//Normalised Ratio of Horizontal step size with main video for all layers
+
+
+ //Normalised Ratio of Horizontal step size with main video for all layers becomes
+ //Normalised Horizontal step size for all layers in VP_Setup.asm
+
+
+//r4
+//Normalised Vertical step size for all layers
+
+
+//r5
+//Normalised Vertical Frame Origin for all layers
+
+
+//r6
+//Normalised Horizontal Frame Origin for all layers
+
+
+//========== Inline Parameters (Explicit To Fast Composite) ==========
+
+
+//Main video Step X
+
+
+//====================== Binding table (Explicit To Fast Composite)=========================================
+
+
+//Used by Interlaced Scaling Kernels
+
+
+//========== Sampler State Table Index (Explicit To Fast Composite)==========
+//Sampler Index for AVS/IEF messages
+
+
+//Sampler Index for SIMD16 sampler messages
+
+
+//=============================================================================
+
+// Typed aliases for the six working buffers. Every group below declares the same six
+// base registers with a different element type: BUFFER_0..BUFFER_3 start at r64, r80,
+// r96 and r112; BUFFER_4 and BUFFER_5 start at r28 and r46.
+// f: 4-byte float elements, source region <8;8,1>.
+.declare fBUFFER_0 Base=r64.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_1 Base=r80.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_2 Base=r96.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_3 Base=r112.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_4 Base=r28.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_5 Base=r46.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+
+// ud: 4-byte unsigned elements, source region <8;8,1>.
+.declare udBUFFER_0 Base=r64.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_1 Base=r80.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_2 Base=r96.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_3 Base=r112.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_4 Base=r28.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_5 Base=r46.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+
+// uw: 2-byte unsigned elements, source region <16;16,1>.
+.declare uwBUFFER_0 Base=r64.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_1 Base=r80.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_2 Base=r96.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_3 Base=r112.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_4 Base=r28.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_5 Base=r46.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+
+// ub: 1-byte unsigned elements, source region <16;16,1>.
+.declare ubBUFFER_0 Base=r64.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_1 Base=r80.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_2 Base=r96.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_3 Base=r112.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_4 Base=r28.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_5 Base=r46.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+
+// ub4: 1-byte unsigned elements accessed with a stride of 4
+// (SrcRegion=<32;8,4>, DstRegion=<4>).
+.declare ub4BUFFER_0 Base=r64.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_1 Base=r80.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_2 Base=r96.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_3 Base=r112.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_4 Base=r28.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_5 Base=r46.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+
+//Pointer to mask reg
+
+
+//r18
+
+
+//Always keep Channel Pointers and Offsets in the same GRF, so that we can use
+// NODDCLR, NODDCHK flags. -rT
+
+
+// Working registers r18-r21, one GRF each: two unsigned-dword CSC coefficient
+// registers (r18, r19) followed by two unsigned-word alpha-mask registers (r20, r21).
+.declare udCSC_COEFF_0 Base=r18.0 ElementSize=4 Type=ud // 1 GRF
+
+//r19
+
+
+.declare udCSC_COEFF_1 Base=r19.0 ElementSize=4 Type=ud // 1 GRF
+
+
+//r20
+
+// Unsigned-word view of r20, source region <16;16,1>.
+.declare uwALPHA_MASK_REG_TEMP Base=r20.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw // 1 GRF
+
+//r21
+
+// Unsigned-word view of r21, source region <16;16,1>.
+.declare uwALPHA_MASK_REG Base=r21.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw // 1 GRF
+
+//r22
+
+
+//Always keep Channel Pointers and Offsets in the same GRF, so that we can use
+// NODDCLR, NODDCHK flags. -rT
+
+
+//Keep fORIGIN_X_NLAS, fY_OFFSET_2ND_BLOCK, fSTEP_X_NLAS, pMSGDSC_COPY, ubCONST_ALPHA_COPY as
+//sub registers of same GRF to enable using NODDCLR NODDCHK. -rT
+
+//r23
+
+
+//Lumakey
+
+
+//r24
+
+
+//r25
+
+
+//r26
+
+
+//defines to generate LABELS during compile time.
+
+
+ // Message Header
+ // m0.7 31:0 Debug
+ // m0.6 31:0 Debug
+ // m0.5 31:0 Ignored
+ // m0.4 31:0 Ignored
+ // m0.3 31:0 Ignored
+ // m0.2 31:16 Ignored
+ // 15 Alpha Write Channel Mask enable=0, disable=1
+ // 14 Blue Write Channel Mask (U)
+ // 13 Green Write Channel Mask (Y)
+ // 12 Red Write Channel Mask (V)
+ // 11:0 Ignored
+ // m0.1 Ignored
+ // m0.0 Ignored
+
+
+ // AVS payload
+ // m1.7 Group ID Number
+ // m1.6 U 2nd Derivative ---> NLAS dx
+ // m1.5 Delta V ---> Step Y
+ // m1.4 Delta U ---> Step X
+ // m1.3 Pixel 0 V Address ---> ORIY (Y0)
+ // m1.2 Pixel 0 U Address ---> ORIX (X0)
+ // m1.1 Vertical Block Number
+ // m1.0 Reserved
+
+ // Sampler Message Descriptor
+ // 31:29 Reserved 000
+ // 28:25 Message length 0010
+ // 24:20 Response length xxxxx ---> 4GRFs for each enabled channel (AVS), 2GRFs for each enabled channel (sample unorm)
+ // 19 Header Present 1
+ // 18:17 SIMD Mode 11 ---> SIMD32/64
+ // 16:12 Message Type xxxxx ---> 01011 sample_8x8, 01100 (sample_unorm), 01010 (sample_unorm+killpix)
+ // 11:8 Sampler Index xxxx
+ // 7:0 Binding Table Index xxxxxxxx
+
+
+ // Msg Header M0.2
+ // 15:15 Alpha Write Channel Mask, 0: written back, 1: not written back
+ // 14:14 Blue Write Channel Mask
+ // 13:13 Green Write Channel Mask
+ // 12:12 Red Write Channel Mask
+
+
+//By design, Buffer 0,1,2,3 always have Layer 0 and Buffer 4,5 always have L1-L7
+
+
+//used to generate LABELS at compile time.
+
+
+ // 18:17 SIMD Mode 10 ---> SIMD16
+ // 16:12 Message Type xxxxx ---> 00000 (SIMD16)
+
+
+//r10-17 - 8 GRFs to load SIMD16 data (up to 4 channels)
+//r18-19 - 2 GRFs to store sampler ramp.
+// NOTE(review): fSAMPLER_RAMP below is declared at r9, and r18/r19 are declared as
+// udCSC_COEFF_0/1 in this file, so the "r18-19" line above looks stale - confirm.
+
+ // Four typed views (f, uw, ud, ub) of the scaling message payload; all start at r14.0.
+ // NOTE(review): "mudCALING"/"mubCALING" look like typos for "mudSCALING"/"mubSCALING".
+ // The names are left untouched because references may exist outside this file.
+ .declare mfSCALING_0X_34X_PAYLOAD Base=r14.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+ .declare muwSCALING_0X_34X_PAYLOAD Base=r14.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+ .declare mudCALING_0X_34X_PAYLOAD Base=r14.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+ .declare mubCALING_0X_34X_PAYLOAD Base=r14.0 ElementSize=1 SrcRegion=<32;32,1> DstRegion=<1> Type=ub
+
+
+ // Four typed views (f, ud, ub4, uw) of the scaling temp area; all start at r9.0.
+ // NOTE(review): ub4SCALING_0X_34X_TEMP uses DstRegion=<1> while the ub4BUFFER_N
+ // aliases use DstRegion=<4> - confirm this difference is intended.
+ .declare fSCALING_0X_34X_TEMP Base=r9.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+ .declare udSCALING_0X_34X_TEMP Base=r9.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+ .declare ub4SCALING_0X_34X_TEMP Base=r9.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<1> Type=ub
+ .declare uwSCALING_0X_34X_TEMP Base=r9.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+
+ // Sampler ramp is used for Scaling 0X_0.34X
+ // It shares its base register (r9.0) with the *_TEMP aliases above.
+ .declare fSAMPLER_RAMP Base=r9.0 ElementSize=4 SrcRegion=<8;8,1> Type=f // 1 GRF, 8 elements
+
+
+ //#define rMSGDSC_UV r23.0
+
+
+//End of _SCALING_
+
+
+ // PL3_AVS_Buf_1 body: issues three sampler messages (Y, then U, then V) whose
+ // responses are written to uwBUFFER_1 at register offsets 0, 4 and 8.
+ // All three sends are skipped when f0.1 is not set.
+ //
+ //Check if layer is to be skipped
+
+
+ // f0.1 pre-computed in Set_Layer_0
+ // The predicate is negated: jump over the loads when f0.1 is clear.
+ (-f0.1) jmpi (1) SKIP_AVS_LOAD_L0_1_
+
+
+ //AVS_PAYLOAD already has all the data loaded at this point
+ // a0.0 = r23.5 + immediate descriptor. Decoded with the "Sampler Message Descriptor"
+ // layout documented earlier in this file, 0x44EB400 is: message length 2, response
+ // length 4, header present, SIMD mode 11, message type 01011 (sample_8x8),
+ // sampler index 4, binding table index 0.
+ add (1) a0.0:ud r23.5<0;1,0>:ud 0x44EB400:ud //msg desc
+
+ mov (1) r16.2:ud 0x0000D000:ud // Bits 15, 14 and 12 set (masked off), bit 13 clear: only the Green (Y) channel is written back
+
+
+ // set the vertical block number
+ // Buffer 1 uses r7.12:uw + 1; the result goes to dword 1 of r25, which is copied
+ // into r17 (second message register) below.
+
+ add (1) r25.1<1>:ud r7.12:uw 1:ud
+
+
+ mov (8) r17.0:ud r25.0<8;8,1>:ud // Copy msg payload mirrors to MRFs
+
+ // Message starts at r16 (header in r16, payload in r17), descriptor taken from a0.0.
+ // NOTE(review): 0x2 is presumably the sampler shared-function ID - confirm against the PRM.
+ send (1) uwBUFFER_1(0)<1> r16 0x2 a0.0:ud
+ // Returns Y data in 4 GRFs in scrambled order
+
+ add (1) a0.0:ud r23.5<0;1,0>:ud 0x44EB801:ud // msg desc; same fields as the Y descriptor except sampler index 8 and binding table index 1 (U)
+ mov (1) r16.2:ud 0x0000E000:ud // Bits 15:13 set (masked off), bit 12 clear: only the Red channel is written back
+
+ send (1) uwBUFFER_1(4)<1> r16 0x2 a0.0:ud
+ // Returns U data in 4 GRFs in scrambled order
+
+ add (1) a0.0:ud r23.5<0;1,0>:ud 0x44EBC02:ud // msg desc; same fields as the Y descriptor except sampler index 12 and binding table index 2 (V)
+ mov (1) r16.2:ud 0x0000E000:ud // Bits 15:13 set (masked off), bit 12 clear: only the Red channel is written back
+
+ send (1) uwBUFFER_1(8)<1> r16 0x2 a0.0:ud
+ // Returns V data in 4 GRFs in scrambled order
+
+// Target of the skip branch above.
+SKIP_AVS_LOAD_L0_1_:
+ nop
+
+
--- /dev/null
+// 13 // Total instruction count
+// 1 // Total kernel count
+
+
+// Module name: common.inc
+//
+// Common header file for all Video-Processing kernels
+//
+
+.default_execution_size (16)
+.default_register_type :ub
+
+.reg_count_total 128
+.reg_count_payload 7
+
+//========== Common constants ==========
+
+
+//========== Macros ==========
+
+
+//Fast Jump, For more details see "Set_Layer_N.asm"
+
+
+//========== Defines ====================
+
+//========== Static Parameters (Common To All) ==========
+//r1
+
+
+//r2
+
+ // e.g. byte0 byte1 byte2
+ // YUYV 0 1 3
+ // YVYU 0 3 1
+
+//Color Pipe (IECP) parameters
+
+
+//r4
+
+ // e.g. byte0 byte1 byte2
+ // YUYV 0 1 3
+ // YVYU 0 3 1
+
+
+//========== Inline parameters (Common To All) ===========
+
+
+//============== Binding Index Table===========
+//Common between DNDI and DNUV
+
+
+//================= Common Message Descriptor =====
+// Message descriptor for thread spawning
+// Message Descriptors
+// = 000 0001 (min message len 1 ) 0,0000 (resp len 0 -add later)
+// 0000,0000,0000
+// 0001(Spawn a root thread),0001 (Root thread spawn thread)
+// = 0x02000011
+// Thread Spawner Message Descriptor
+
+
+// Message descriptor for atomic operation add
+// Message Descriptors
+// = 000 0110 (min message len 6 ) 0,0000 (resp len 0 -add later)
+// 1(header present)001,10(typed atomic operation)0(return enabled)0(slot group, low 8 bits),0111 (AOP_Add)
+// 0000,0000 (Binding table index, added later)
+// = 0x0C098700
+
+// Atomic Operation Add Message Descriptor
+
+
+// Message descriptor for dataport media write
+ // Message Descriptors
+ // = 000 0001 (min message len 1 - add later) 00000 (resp len 0)
+ // 1 (header present 1) 0 1010 (media block write) 000000
+ // 00000000 (binding table index - set later)
+ // = 0x020A8000
+
+
+// Message Length defines
+
+
+// Response Length defines
+
+
+// Block Width and Height Size defines
+
+
+// Extended Message Descriptors
+
+
+// Common message descriptors:
+
+
+//===================== Math Function Control ===================================
+
+
+//============ Message Registers ===============
+ // buf4 starts from r28
+
+
+//#define mMSGHDR_EOT r43 // Dummy Message Register for EOT
+
+
+// Four typed views (ub, uw, ud, f) of the message payload area; all start at r30.
+.declare mubMSGPAYLOAD Base=r30 ElementSize=1 SrcRegion=<16;16,1> Type=ub
+.declare muwMSGPAYLOAD Base=r30 ElementSize=2 SrcRegion=<16;16,1> Type=uw
+.declare mudMSGPAYLOAD Base=r30 ElementSize=4 SrcRegion=<8;8,1> Type=ud
+.declare mfMSGPAYLOAD Base=r30 ElementSize=4 SrcRegion=<8;8,1> Type=f
+
+//=================== End of thread instruction ===========================
+
+
+//=====================Pointers Used=====================================
+
+
+//=======================================================================
+
+
+//r9-r17
+// Define temp space for any usages
+
+
+// Common Buffers
+
+
+// End of common.inc
+
+// FileName: PL3_AVS_Buf_2.asm
+// Author: Tatiya, Rupesh
+// Description: Loads 8x8 AVS/IEF PL3 data into Buffer 2
+
+
+// FileName : PL3_AVS_Buf.asm
+// Author : Tatiya, Rupesh
+// Description : Loads 8x8 AVS/IEF PL3 data into Buffer N
+
+
+// Module name: Scaling.inc
+
+
+// Description: Includes all definitions explicit to Fast Composite.
+
+
+// End of common.inc
+
+
+//========== GRF partition ==========
+ // r0 header : r0 (1 GRF)
+ // Static parameters : r1 - r6 (6 GRFS)
+ // Inline parameters : r7 - r8 (2 GRFs)
+ // MSGSRC : r27 (1 GRF)
+//===================================
+
+//Interface:
+//========== Static Parameters (Explicit To Fast Composite) ==========
+//r1
+//CSC Set 0
+
+
+// udCSC_CURBE: unsigned-dword view starting at r1.0 (r1 is the first static-parameter
+// register according to the GRF partition comment above).
+.declare udCSC_CURBE Base=r1.0 ElementSize=4 Type=ud
+
+//Constant alpha
+
+
+//r2
+
+
+// WiDi Definitions
+
+
+//Colorfill
+
+
+// ubCOLOR_PIXEL_VAL: unsigned-byte view at r2.20. Both source strides are zero
+// (SrcRegion=<0;1,0>), so a source read repeats that single byte.
+.declare ubCOLOR_PIXEL_VAL Base=r2.20 ElementSize=1 SrcRegion=<0;1,0> DstRegion=<1> Type=ub
+
+//r3
+//Normalised Ratio of Horizontal step size with main video for all layers
+
+
+ //Normalised Ratio of Horizontal step size with main video for all layers becomes
+ //Normalised Horizontal step size for all layers in VP_Setup.asm
+
+
+//r4
+//Normalised Vertical step size for all layers
+
+
+//r5
+//Normalised Vertical Frame Origin for all layers
+
+
+//r6
+//Normalised Horizontal Frame Origin for all layers
+
+
+//========== Inline Parameters (Explicit To Fast Composite) ==========
+
+
+//Main video Step X
+
+
+//====================== Binding table (Explicit To Fast Composite)=========================================
+
+
+//Used by Interlaced Scaling Kernels
+
+
+//========== Sampler State Table Index (Explicit To Fast Composite)==========
+//Sampler Index for AVS/IEF messages
+
+
+//Sampler Index for SIMD16 sampler messages
+
+
+//=============================================================================
+
+// Typed aliases for the six working buffers. Every group below declares the same six
+// base registers with a different element type: BUFFER_0..BUFFER_3 start at r64, r80,
+// r96 and r112; BUFFER_4 and BUFFER_5 start at r28 and r46.
+// f: 4-byte float elements, source region <8;8,1>.
+.declare fBUFFER_0 Base=r64.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_1 Base=r80.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_2 Base=r96.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_3 Base=r112.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_4 Base=r28.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_5 Base=r46.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+
+// ud: 4-byte unsigned elements, source region <8;8,1>.
+.declare udBUFFER_0 Base=r64.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_1 Base=r80.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_2 Base=r96.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_3 Base=r112.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_4 Base=r28.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_5 Base=r46.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+
+// uw: 2-byte unsigned elements, source region <16;16,1>.
+.declare uwBUFFER_0 Base=r64.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_1 Base=r80.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_2 Base=r96.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_3 Base=r112.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_4 Base=r28.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_5 Base=r46.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+
+// ub: 1-byte unsigned elements, source region <16;16,1>.
+.declare ubBUFFER_0 Base=r64.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_1 Base=r80.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_2 Base=r96.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_3 Base=r112.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_4 Base=r28.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_5 Base=r46.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+
+// ub4: 1-byte unsigned elements accessed with a stride of 4
+// (SrcRegion=<32;8,4>, DstRegion=<4>).
+.declare ub4BUFFER_0 Base=r64.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_1 Base=r80.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_2 Base=r96.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_3 Base=r112.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_4 Base=r28.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_5 Base=r46.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+
+//Pointer to mask reg
+
+
+//r18
+
+
+//Always keep Channel Pointers and Offsets in the same GRF, so that we can use
+// NODDCLR, NODDCHK flags. -rT
+
+
+// Working registers r18-r21, one GRF each: two unsigned-dword CSC coefficient
+// registers (r18, r19) followed by two unsigned-word alpha-mask registers (r20, r21).
+.declare udCSC_COEFF_0 Base=r18.0 ElementSize=4 Type=ud // 1 GRF
+
+//r19
+
+
+.declare udCSC_COEFF_1 Base=r19.0 ElementSize=4 Type=ud // 1 GRF
+
+
+//r20
+
+// Unsigned-word view of r20, source region <16;16,1>.
+.declare uwALPHA_MASK_REG_TEMP Base=r20.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw // 1 GRF
+
+//r21
+
+// Unsigned-word view of r21, source region <16;16,1>.
+.declare uwALPHA_MASK_REG Base=r21.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw // 1 GRF
+
+//r22
+
+
+//Always keep Channel Pointers and Offsets in the same GRF, so that we can use
+// NODDCLR, NODDCHK flags. -rT
+
+
+//Keep fORIGIN_X_NLAS, fY_OFFSET_2ND_BLOCK, fSTEP_X_NLAS, pMSGDSC_COPY, ubCONST_ALPHA_COPY as
+//sub registers of same GRF to enable using NODDCLR NODDCHK. -rT
+
+//r23
+
+
+//Lumakey
+
+
+//r24
+
+
+//r25
+
+
+//r26
+
+
+//defines to generate LABELS during compile time.
+
+
+ // Message Header
+ // m0.7 31:0 Debug
+ // m0.6 31:0 Debug
+ // m0.5 31:0 Ignored
+ // m0.4 31:0 Ignored
+ // m0.3 31:0 Ignored
+ // m0.2 31:16 Ignored
+ // 15 Alpha Write Channel Mask enable=0, disable=1
+ // 14 Blue Write Channel Mask (U)
+ // 13 Green Write Channel Mask (Y)
+ // 12 Red Write Channel Mask (V)
+ // 11:0 Ignored
+ // m0.1 Ignored
+ // m0.0 Ignored
+
+
+ // AVS payload
+ // m1.7 Group ID Number
+ // m1.6 U 2nd Derivative ---> NLAS dx
+ // m1.5 Delta V ---> Step Y
+ // m1.4 Delta U ---> Step X
+ // m1.3 Pixel 0 V Address ---> ORIY (Y0)
+ // m1.2 Pixel 0 U Address ---> ORIX (X0)
+ // m1.1 Vertical Block Number
+ // m1.0 Reserved
+
+ // Sampler Message Descriptor
+ // 31:29 Reserved 000
+ // 28:25 Message length 0010
+ // 24:20 Response length xxxxx ---> 4GRFs for each enabled channel (AVS), 2GRFs for each enabled channel (sample unorm)
+ // 19 Header Present 1
+ // 18:17 SIMD Mode 11 ---> SIMD32/64
+ // 16:12 Message Type xxxxx ---> 01011 sample_8x8, 01100 (sample_unorm), 01010 (sample_unorm+killpix)
+ // 11:8 Sampler Index xxxx
+ // 7:0 Binding Table Index xxxxxxxx
+
+
+ // Msg Header M0.2
+ // 15:15 Alpha Write Channel Mask, 0: written back, 1: not written back
+ // 14:14 Blue Write Channel Mask
+ // 13:13 Green Write Channel Mask
+ // 12:12 Red Write Channel Mask
+
+
+//By design, Buffer 0,1,2,3 always have Layer 0 and Buffer 4,5 always have L1-L7
+
+
+//used to generate LABELS at compile time.
+
+
+ // 18:17 SIMD Mode 10 ---> SIMD16
+ // 16:12 Message Type xxxxx ---> 00000 (SIMD16)
+
+
+//r10-17 - 8 GRFs to load SIMD16 data (up to 4 channels)
+//r18-19 - 2 GRFs to store sampler ramp.
+// NOTE(review): fSAMPLER_RAMP below is declared at r9, and r18/r19 are declared as
+// udCSC_COEFF_0/1 in this file, so the "r18-19" line above looks stale - confirm.
+
+ // Four typed views (f, uw, ud, ub) of the scaling message payload; all start at r14.0.
+ // NOTE(review): "mudCALING"/"mubCALING" look like typos for "mudSCALING"/"mubSCALING".
+ // The names are left untouched because references may exist outside this file.
+ .declare mfSCALING_0X_34X_PAYLOAD Base=r14.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+ .declare muwSCALING_0X_34X_PAYLOAD Base=r14.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+ .declare mudCALING_0X_34X_PAYLOAD Base=r14.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+ .declare mubCALING_0X_34X_PAYLOAD Base=r14.0 ElementSize=1 SrcRegion=<32;32,1> DstRegion=<1> Type=ub
+
+
+ // Four typed views (f, ud, ub4, uw) of the scaling temp area; all start at r9.0.
+ // NOTE(review): ub4SCALING_0X_34X_TEMP uses DstRegion=<1> while the ub4BUFFER_N
+ // aliases use DstRegion=<4> - confirm this difference is intended.
+ .declare fSCALING_0X_34X_TEMP Base=r9.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+ .declare udSCALING_0X_34X_TEMP Base=r9.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+ .declare ub4SCALING_0X_34X_TEMP Base=r9.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<1> Type=ub
+ .declare uwSCALING_0X_34X_TEMP Base=r9.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+
+ // Sampler ramp is used for Scaling 0X_0.34X
+ // It shares its base register (r9.0) with the *_TEMP aliases above.
+ .declare fSAMPLER_RAMP Base=r9.0 ElementSize=4 SrcRegion=<8;8,1> Type=f // 1 GRF, 8 elements
+
+
+ //#define rMSGDSC_UV r23.0
+
+
+//End of _SCALING_
+
+
+ // PL3_AVS_Buf_2 body: issues three sampler messages (Y, then U, then V) whose
+ // responses are written to uwBUFFER_2 at register offsets 0, 4 and 8.
+ // All three sends are skipped when f0.1 is not set.
+ //
+ //Check if layer is to be skipped
+
+
+ // f0.1 pre-computed in Set_Layer_0
+ // The predicate is negated: jump over the loads when f0.1 is clear.
+ (-f0.1) jmpi (1) SKIP_AVS_LOAD_L0_2_
+
+
+ //AVS_PAYLOAD already has all the data loaded at this point
+ // a0.0 = r23.5 + immediate descriptor. Decoded with the "Sampler Message Descriptor"
+ // layout documented earlier in this file, 0x44EB400 is: message length 2, response
+ // length 4, header present, SIMD mode 11, message type 01011 (sample_8x8),
+ // sampler index 4, binding table index 0.
+ add (1) a0.0:ud r23.5<0;1,0>:ud 0x44EB400:ud //msg desc
+
+ mov (1) r16.2:ud 0x0000D000:ud // Bits 15, 14 and 12 set (masked off), bit 13 clear: only the Green (Y) channel is written back
+
+
+ // set the vertical block number
+ // Buffer 2 uses r7.12:uw + 2; the result goes to dword 1 of r25, which is copied
+ // into r17 (second message register) below.
+
+
+ add (1) r25.1<1>:ud r7.12:uw 2:ud
+
+
+ mov (8) r17.0:ud r25.0<8;8,1>:ud // Copy msg payload mirrors to MRFs
+
+ // Message starts at r16 (header in r16, payload in r17), descriptor taken from a0.0.
+ // NOTE(review): 0x2 is presumably the sampler shared-function ID - confirm against the PRM.
+ send (1) uwBUFFER_2(0)<1> r16 0x2 a0.0:ud
+ // Returns Y data in 4 GRFs in scrambled order
+
+ add (1) a0.0:ud r23.5<0;1,0>:ud 0x44EB801:ud // msg desc; same fields as the Y descriptor except sampler index 8 and binding table index 1 (U)
+ mov (1) r16.2:ud 0x0000E000:ud // Bits 15:13 set (masked off), bit 12 clear: only the Red channel is written back
+
+ send (1) uwBUFFER_2(4)<1> r16 0x2 a0.0:ud
+ // Returns U data in 4 GRFs in scrambled order
+
+ add (1) a0.0:ud r23.5<0;1,0>:ud 0x44EBC02:ud // msg desc; same fields as the Y descriptor except sampler index 12 and binding table index 2 (V)
+ mov (1) r16.2:ud 0x0000E000:ud // Bits 15:13 set (masked off), bit 12 clear: only the Red channel is written back
+
+ send (1) uwBUFFER_2(8)<1> r16 0x2 a0.0:ud
+ // Returns V data in 4 GRFs in scrambled order
+
+// Target of the skip branch above.
+SKIP_AVS_LOAD_L0_2_:
+ nop
+
+
--- /dev/null
+// 13 // Total instruction count
+// 1 // Total kernel count
+
+
+// Module name: common.inc
+//
+// Common header file for all Video-Processing kernels
+//
+
+.default_execution_size (16)
+.default_register_type :ub
+// Assembler defaults above: execution size 16 and operand type :ub. Below: register budget (128 total, 7 counted as payload).
+.reg_count_total 128
+.reg_count_payload 7
+
+//========== Common constants ==========
+
+
+//========== Macros ==========
+
+
+//Fast Jump, For more details see "Set_Layer_N.asm"
+
+
+//========== Defines ====================
+
+//========== Static Parameters (Common To All) ==========
+//r1
+
+
+//r2
+
+ // e.g. byte0 byte1 byte2
+ // YUYV 0 1 3
+ // YVYU 0 3 1
+
+//Color Pipe (IECP) parameters
+
+
+//r4
+
+ // e.g. byte0 byte1 byte2
+ // YUYV 0 1 3
+ // YVYU 0 3 1
+
+
+//========== Inline parameters (Common To All) ===========
+
+
+//============== Binding Index Table===========
+//Common between DNDI and DNUV
+
+
+//================= Common Message Descriptor =====
+// Message descriptor for thread spawning
+// Message Descriptors
+// = 000 0001 (min message len 1 ) 0,0000 (resp len 0 -add later)
+// 0000,0000,0000
+// 0001(Spawn a root thread),0001 (Root thread spawn thread)
+// = 0x02000011
+// Thread Spawner Message Descriptor
+
+
+// Message descriptor for atomic operation add
+// Message Descriptors
+// = 000 0110 (min message len 6 ) 0,0000 (resp len 0 -add later)
+// 1(header present)001,10(typed atomic operation)0(return enabled)0(slot group, low 8 bits),0111 (AOP_Add)
+// 0000,0000 (Binding table index, added later)
+// = 0x0C098700
+
+// Atomic Operation Add Message Descriptor
+
+
+// Message descriptor for dataport media write
+ // Message Descriptors
+ // = 000 0001 (min message len 1 - add later) 00000 (resp len 0)
+ // 1 (header present 1) 0 1010 (media block write) 000000
+ // 00000000 (binding table index - set later)
+ // = 0x020A8000
+
+
+// Message Length defines
+
+
+// Response Length defines
+
+
+// Block Width and Height Size defines
+
+
+// Extended Message Descriptors
+
+
+// Common message descriptors:
+
+
+//===================== Math Function Control ===================================
+
+
+//============ Message Registers ===============
+ // buf4 starts from r28
+
+
+//#define mMSGHDR_EOT r43 // Dummy Message Register for EOT
+
+// Four typed views (ub, uw, ud, f) of the message payload area; all of them share Base=r30.
+.declare mubMSGPAYLOAD Base=r30 ElementSize=1 SrcRegion=<16;16,1> Type=ub
+.declare muwMSGPAYLOAD Base=r30 ElementSize=2 SrcRegion=<16;16,1> Type=uw
+.declare mudMSGPAYLOAD Base=r30 ElementSize=4 SrcRegion=<8;8,1> Type=ud
+.declare mfMSGPAYLOAD Base=r30 ElementSize=4 SrcRegion=<8;8,1> Type=f
+
+//=================== End of thread instruction ===========================
+
+
+//=====================Pointers Used=====================================
+
+
+//=======================================================================
+
+
+//r9-r17
+// Define temp space for any usages
+
+
+// Common Buffers
+
+
+// End of common.inc
+
+// FileName: PL3_AVS_Buf_3.asm
+// Author: Tatiya, Rupesh
+// Description: Loads 8x8 AVS/IEF PL3 data into Buffer 3
+
+
+// FileName : PL3_AVS_Buf.asm
+// Author : Tatiya, Rupesh
+// Description : Loads 8x8 AVS/IEF PL3 data into Buffer N
+
+
+// Module name: Scaling.inc
+
+
+// Description: Includes all definitions explicit to Fast Composite.
+
+
+// End of common.inc
+
+
+//========== GRF partition ==========
+ // r0 header : r0 (1 GRF)
+ // Static parameters : r1 - r6 (6 GRFS)
+ // Inline parameters : r7 - r8 (2 GRFs)
+ // MSGSRC : r27 (1 GRF)
+//===================================
+
+//Interface:
+//========== Static Parameters (Explicit To Fast Composite) ==========
+//r1
+//CSC Set 0
+
+// Dword (ud) view starting at r1.0, the first of the static-parameter registers (r1 - r6).
+.declare udCSC_CURBE Base=r1.0 ElementSize=4 Type=ud
+
+//Constant alpha
+
+
+//r2
+
+
+// WiDi Definitions
+
+
+//Colorfill
+
+// Byte view at r2.20; the <0;1,0> source region makes every channel read that same single element.
+.declare ubCOLOR_PIXEL_VAL Base=r2.20 ElementSize=1 SrcRegion=<0;1,0> DstRegion=<1> Type=ub
+
+//r3
+//Normalised Ratio of Horizontal step size with main video for all layers
+
+
+ //Normalised Ratio of Horizontal step size with main video for all layers becomes
+ //Normalised Horizontal step size for all layers in VP_Setup.asm
+
+
+//r4
+//Normalised Vertical step size for all layers
+
+
+//r5
+//Normalised Vertical Frame Origin for all layers
+
+
+//r6
+//Normalised Horizontal Frame Origin for all layers
+
+
+//========== Inline Parameters (Explicit To Fast Composite) ==========
+
+
+//Main video Step X
+
+
+//====================== Binding table (Explicit To Fast Composite)=========================================
+
+
+//Used by Interlaced Scaling Kernels
+
+
+//========== Sampler State Table Index (Explicit To Fast Composite)==========
+//Sampler Index for AVS/IEF messages
+
+
+//Sampler Index for SIMD16 sampler messages
+
+
+//=============================================================================
+// BUFFER_0..BUFFER_5 are declared below as five typed views (f, ud, uw, ub, ub4) over the same bases: r64, r80, r96, r112, r28 and r46.
+.declare fBUFFER_0 Base=r64.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_1 Base=r80.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_2 Base=r96.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_3 Base=r112.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_4 Base=r28.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_5 Base=r46.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+// Unsigned dword (ud) views of the same six bases.
+.declare udBUFFER_0 Base=r64.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_1 Base=r80.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_2 Base=r96.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_3 Base=r112.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_4 Base=r28.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_5 Base=r46.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+// Unsigned word (uw) views, 16 elements per row.
+.declare uwBUFFER_0 Base=r64.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_1 Base=r80.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_2 Base=r96.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_3 Base=r112.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_4 Base=r28.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_5 Base=r46.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+// Unsigned byte (ub) views with contiguous elements.
+.declare ubBUFFER_0 Base=r64.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_1 Base=r80.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_2 Base=r96.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_3 Base=r112.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_4 Base=r28.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_5 Base=r46.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+// Unsigned byte views that step 4 bytes per element on both source (<32;8,4>) and destination (<4>).
+.declare ub4BUFFER_0 Base=r64.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_1 Base=r80.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_2 Base=r96.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_3 Base=r112.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_4 Base=r28.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_5 Base=r46.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+
+//Pointer to mask reg
+
+
+//r18
+
+
+//Always keep Channel Pointers and Offsets in same GRF, so that we can use
+// NODDCLR, NODDCHK flags. -rT
+
+// Dword view starting at r18.0.
+.declare udCSC_COEFF_0 Base=r18.0 ElementSize=4 Type=ud // 1 GRF
+
+//r19
+
+// Dword view starting at r19.0.
+.declare udCSC_COEFF_1 Base=r19.0 ElementSize=4 Type=ud // 1 GRF
+
+
+//r20
+// Word view of r20, 16 elements per row.
+.declare uwALPHA_MASK_REG_TEMP Base=r20.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw // 1 GRF
+
+//r21
+// Word view of r21, 16 elements per row.
+.declare uwALPHA_MASK_REG Base=r21.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw // 1 GRF
+
+//r22
+
+
+//Always keep Channel Pointers and Offsets in same GRF, so that we can use
+// NODDCLR, NODDCHK flags. -rT
+
+
+//Keep fORIGIN_X_NLAS, fY_OFFSET_2ND_BLOCK, fSTEP_X_NLAS, pMSGDSC_COPY, ubCONST_ALPHA_COPY as
+//sub registers of same GRF to enable using NODDCLR NODDCHK. -rT
+
+//r23
+
+
+//Lumakey
+
+
+//r24
+
+
+//r25
+
+
+//r26
+
+
+//defines to generate LABELS during compile time.
+
+
+ // Message Header
+ // m0.7 31:0 Debug
+ // m0.6 31:0 Debug
+ // m0.5 31:0 Ignored
+ // m0.4 31:0 Ignored
+ // m0.3 31:0 Ignored
+ // m0.2 31:16 Ignored
+ // 15 Alpha Write Channel Mask enable=0, disable=1
+ // 14 Blue Write Channel Mask (U)
+ // 13 Green Write Channel Mask (Y)
+ // 12 Red Write Channel Mask (V)
+ // 11:0 Ignored
+ // m0.1 Ignored
+ // m0.0 Ignored
+
+
+ // AVS payload
+ // m1.7 Group ID Number
+ // m1.6 U 2nd Derivative ---> NLAS dx
+ // m1.5 Delta V ---> Step Y
+ // m1.4 Delta U ---> Step X
+ // m1.3 Pixel 0 V Address ---> ORIY (Y0)
+ // m1.2 Pixel 0 U Address ---> ORIX (X0)
+ // m1.1 Vertical Block Number
+ // m1.0 Reserved
+
+ // Sampler Message Descriptor
+ // 31:29 Reserved 000
+ // 28:25 Message length 0010
+ // 24:20 Response length xxxxx ---> 4GRFs for each enabled channel (AVS), 2GRFs for each enabled channel (sample unorm)
+ // 19 Header Present 1
+ // 18:17 SIMD Mode 11 ---> SIMD32/64
+ // 16:12 Message Type xxxxx ---> 01011 sample_8x8, 01100 (sample_unorm), 01010 (sample_unorm+killpix)
+ // 11:8 Sampler Index xxxx
+ // 7:0 Binding Table Index xxxxxxxx
+
+
+ // Msg Header M0.2
+ // 15:15 Alpha Write Channel Mask, 0: written back, 1: not written back
+ // 14:14 Blue Write Channel Mask
+ // 13:13 Green Write Channel Mask
+ // 12:12 Red Write Channel Mask
+
+
+//By design, Buffer 0,1,2,3 always have Layer 0 and Buffer 4,5 always have L1-L7
+
+
+//used to generate LABELS at compile time.
+
+
+ // 18:17 SIMD Mode 10 ---> SIMD16
+ // 16:12 Message Type xxxxx ---> 00000 (SIMD16)
+
+
+//r10-17 - 8 GRFs to load SIMD16 data (up to 4 channels)
+//r18-19 - 2 GRFs to store sampler ramp.
+// NOTE(review): the declarations below put the payload views at r14 and fSAMPLER_RAMP at r9 (1 GRF), and r18/r19 are declared as udCSC_COEFF_0/1 - the two lines above may be stale; confirm.
+ .declare mfSCALING_0X_34X_PAYLOAD Base=r14.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+ .declare muwSCALING_0X_34X_PAYLOAD Base=r14.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+ .declare mudCALING_0X_34X_PAYLOAD Base=r14.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+ .declare mubCALING_0X_34X_PAYLOAD Base=r14.0 ElementSize=1 SrcRegion=<32;32,1> DstRegion=<1> Type=ub
+ // NOTE(review): the ud and ub payload views are spelled "CALING" (missing "S"); left unchanged because renaming would change the symbol.
+
+ .declare fSCALING_0X_34X_TEMP Base=r9.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+ .declare udSCALING_0X_34X_TEMP Base=r9.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+ .declare ub4SCALING_0X_34X_TEMP Base=r9.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<1> Type=ub
+ .declare uwSCALING_0X_34X_TEMP Base=r9.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+ // The four *_TEMP names above and fSAMPLER_RAMP below all use Base=r9.0, so they alias the same register.
+ // Sampler ramp is used for Scaling 0X_0.34X
+ .declare fSAMPLER_RAMP Base=r9.0 ElementSize=4 SrcRegion=<8;8,1> Type=f // 1 GRF, 8 float elements
+
+
+ //#define rMSGDSC_UV r23.0
+
+
+//End of _SCALING_
+
+
+ //Check if layer is to be skipped
+ // Loads one PL3 AVS block of layer 0 into BUFFER_3: three sampler sends write
+ // Y, U and V to GRF offsets 0, 4 and 8 of the buffer (4 GRFs each).
+ // f0.1 pre-computed in Set_Layer_0
+ (-f0.1) jmpi (1) SKIP_AVS_LOAD_L0_3_
+ // (the negated predicate jumps past the whole load when f0.1 is clear)
+
+ //AVS_PAYLOAD already has all the data loaded at this point
+ add (1) a0.0:ud r23.5<0;1,0>:ud 0x44EB400:ud //msg desc = r23.5 + 0x44EB400 (msg len 2, resp len 4, sampler index 4, binding index +0)
+ // r16 is used as the message header (m0) and r17 as the AVS payload (m1).
+ mov (1) r16.2:ud 0x0000D000:ud // Channel mask in m0.2: only bit 13 is clear, so the Green channel (Y) is enabled
+
+
+ // set the vertical block number
+ // r25.1 = r7.12 + 3; the immediate 3 matches the buffer number of this file
+
+ add (1) r25.1<1>:ud r7.12:uw 3:ud
+
+
+ mov (8) r17.0:ud r25.0<8;8,1>:ud // Copy msg payload mirrors to MRFs
+
+ send (1) uwBUFFER_3(0)<1> r16 0x2 a0.0:ud
+ // Returns Y data in 4 GRFs in scrambled order
+
+ add (1) a0.0:ud r23.5<0;1,0>:ud 0x44EB801:ud // msg desc; sampler index 8, and 1 is added to the binding index to select the U surface
+ mov (1) r16.2:ud 0x0000E000:ud // Enable Red channel
+ // (0xE000 clears only bit 12, the Red write-channel mask)
+ send (1) uwBUFFER_3(4)<1> r16 0x2 a0.0:ud
+ // Returns U data in 4 GRFs in scrambled order
+
+ add (1) a0.0:ud r23.5<0;1,0>:ud 0x44EBC02:ud // msg desc; sampler index 12, and 2 is added to the binding index to select the V surface
+ mov (1) r16.2:ud 0x0000E000:ud // Enable Red channel
+
+ send (1) uwBUFFER_3(8)<1> r16 0x2 a0.0:ud
+ // Returns V data in 4 GRFs in scrambled order
+
+SKIP_AVS_LOAD_L0_3_:
+ nop
+
+
--- /dev/null
+// 93 // Total instruction count
+// 1 // Total kernel count
+
+
+// Module name: common.inc
+//
+// Common header file for all Video-Processing kernels
+//
+
+.default_execution_size (16)
+.default_register_type :ub
+// Assembler defaults above: execution size 16 and operand type :ub. Below: register budget (128 total, 7 counted as payload).
+.reg_count_total 128
+.reg_count_payload 7
+
+//========== Common constants ==========
+
+
+//========== Macros ==========
+
+
+//Fast Jump, For more details see "Set_Layer_N.asm"
+
+
+//========== Defines ====================
+
+//========== Static Parameters (Common To All) ==========
+//r1
+
+
+//r2
+
+ // e.g. byte0 byte1 byte2
+ // YUYV 0 1 3
+ // YVYU 0 3 1
+
+//Color Pipe (IECP) parameters
+
+
+//r4
+
+ // e.g. byte0 byte1 byte2
+ // YUYV 0 1 3
+ // YVYU 0 3 1
+
+
+//========== Inline parameters (Common To All) ===========
+
+
+//============== Binding Index Table===========
+//Common between DNDI and DNUV
+
+
+//================= Common Message Descriptor =====
+// Message descriptor for thread spawning
+// Message Descriptors
+// = 000 0001 (min message len 1 ) 0,0000 (resp len 0 -add later)
+// 0000,0000,0000
+// 0001(Spawn a root thread),0001 (Root thread spawn thread)
+// = 0x02000011
+// Thread Spawner Message Descriptor
+
+
+// Message descriptor for atomic operation add
+// Message Descriptors
+// = 000 0110 (min message len 6 ) 0,0000 (resp len 0 -add later)
+// 1(header present)001,10(typed atomic operation)0(return enabled)0(slot group, low 8 bits),0111 (AOP_Add)
+// 0000,0000 (Binding table index, added later)
+// = 0x0C098700
+
+// Atomic Operation Add Message Descriptor
+
+
+// Message descriptor for dataport media write
+ // Message Descriptors
+ // = 000 0001 (min message len 1 - add later) 00000 (resp len 0)
+ // 1 (header present 1) 0 1010 (media block write) 000000
+ // 00000000 (binding table index - set later)
+ // = 0x020A8000
+
+
+// Message Length defines
+
+
+// Response Length defines
+
+
+// Block Width and Height Size defines
+
+
+// Extended Message Descriptors
+
+
+// Common message descriptors:
+
+
+//===================== Math Function Control ===================================
+
+
+//============ Message Registers ===============
+ // buf4 starts from r28
+
+
+//#define mMSGHDR_EOT r43 // Dummy Message Register for EOT
+
+// Four typed views (ub, uw, ud, f) of the message payload area; all of them share Base=r30.
+.declare mubMSGPAYLOAD Base=r30 ElementSize=1 SrcRegion=<16;16,1> Type=ub
+.declare muwMSGPAYLOAD Base=r30 ElementSize=2 SrcRegion=<16;16,1> Type=uw
+.declare mudMSGPAYLOAD Base=r30 ElementSize=4 SrcRegion=<8;8,1> Type=ud
+.declare mfMSGPAYLOAD Base=r30 ElementSize=4 SrcRegion=<8;8,1> Type=f
+
+//=================== End of thread instruction ===========================
+
+
+//=====================Pointers Used=====================================
+
+
+//=======================================================================
+
+
+//r9-r17
+// Define temp space for any usages
+
+
+// Common Buffers
+
+
+// End of common.inc
+
+// Module name: Save_AVS_NV12.asm
+//
+// Save NV12 420 frame data block of size 16x16
+//
+// To save a 16x16 block (16x16 bytes of Y and 16x8 bytes of interleaved UV), we need 2 send instructions, of size 16x16 and 16x8 respectively.
+// ---------------
+// | 16x16 |
+// | Y |
+// ---------------
+// | 16x8 UV |
+// ---------------
+
+//-----------------------------------------------------------------
+//The layout of data is as follows:
+//mMSGHDR0 : Y data header (16x16)
+//mubMSGPAYLOAD0 : Y data payload (8 GRFs)
+//mMSGHDR1 : UV data header (16x8)
+//mubMSGPAYLOAD1 : UV data payload (4 GRFs)
+//------------------------------------------------------------------
+
+
+// Module name: Save.inc
+
+
+// Description: Includes all definitions explicit to Fast Composite.
+
+
+// End of common.inc
+
+
+//========== GRF partition ==========
+ // r0 header : r0 (1 GRF)
+ // Static parameters : r1 - r6 (6 GRFS)
+ // Inline parameters : r7 - r8 (2 GRFs)
+ // MSGSRC : r27 (1 GRF)
+//===================================
+
+//Interface:
+//========== Static Parameters (Explicit To Fast Composite) ==========
+//r1
+//CSC Set 0
+
+// Dword (ud) view starting at r1.0, the first of the static-parameter registers (r1 - r6).
+.declare udCSC_CURBE Base=r1.0 ElementSize=4 Type=ud
+
+//Constant alpha
+
+
+//r2
+
+
+// WiDi Definitions
+
+
+//Colorfill
+
+// Byte view at r2.20; the <0;1,0> source region makes every channel read that same single element.
+.declare ubCOLOR_PIXEL_VAL Base=r2.20 ElementSize=1 SrcRegion=<0;1,0> DstRegion=<1> Type=ub
+
+//r3
+//Normalised Ratio of Horizontal step size with main video for all layers
+
+
+ //Normalised Ratio of Horizontal step size with main video for all layers becomes
+ //Normalised Horizontal step size for all layers in VP_Setup.asm
+
+
+//r4
+//Normalised Vertical step size for all layers
+
+
+//r5
+//Normalised Vertical Frame Origin for all layers
+
+
+//r6
+//Normalised Horizontal Frame Origin for all layers
+
+
+//========== Inline Parameters (Explicit To Fast Composite) ==========
+
+
+//Main video Step X
+
+
+//====================== Binding table (Explicit To Fast Composite)=========================================
+
+
+//Used by Interlaced Scaling Kernels
+
+
+//========== Sampler State Table Index (Explicit To Fast Composite)==========
+//Sampler Index for AVS/IEF messages
+
+
+//Sampler Index for SIMD16 sampler messages
+
+
+//=============================================================================
+// BUFFER_0..BUFFER_5 are declared below as five typed views (f, ud, uw, ub, ub4) over the same bases: r64, r80, r96, r112, r28 and r46.
+.declare fBUFFER_0 Base=r64.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_1 Base=r80.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_2 Base=r96.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_3 Base=r112.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_4 Base=r28.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_5 Base=r46.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+// Unsigned dword (ud) views of the same six bases.
+.declare udBUFFER_0 Base=r64.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_1 Base=r80.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_2 Base=r96.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_3 Base=r112.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_4 Base=r28.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_5 Base=r46.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+// Unsigned word (uw) views, 16 elements per row.
+.declare uwBUFFER_0 Base=r64.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_1 Base=r80.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_2 Base=r96.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_3 Base=r112.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_4 Base=r28.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_5 Base=r46.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+// Unsigned byte (ub) views with contiguous elements.
+.declare ubBUFFER_0 Base=r64.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_1 Base=r80.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_2 Base=r96.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_3 Base=r112.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_4 Base=r28.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_5 Base=r46.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+// Unsigned byte views that step 4 bytes per element on both source (<32;8,4>) and destination (<4>).
+.declare ub4BUFFER_0 Base=r64.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_1 Base=r80.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_2 Base=r96.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_3 Base=r112.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_4 Base=r28.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_5 Base=r46.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+
+//Pointer to mask reg
+
+
+//r18
+
+
+//Always keep Channel Pointers and Offsets in same GRF, so that we can use
+// NODDCLR, NODDCHK flags. -rT
+
+// Dword view starting at r18.0.
+.declare udCSC_COEFF_0 Base=r18.0 ElementSize=4 Type=ud // 1 GRF
+
+//r19
+
+// Dword view starting at r19.0.
+.declare udCSC_COEFF_1 Base=r19.0 ElementSize=4 Type=ud // 1 GRF
+
+
+//r20
+// Word view of r20, 16 elements per row.
+.declare uwALPHA_MASK_REG_TEMP Base=r20.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw // 1 GRF
+
+//r21
+// Word view of r21, 16 elements per row.
+.declare uwALPHA_MASK_REG Base=r21.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw // 1 GRF
+
+//r22
+
+
+//Always keep Channel Pointers and Offsets in same GRF, so that we can use
+// NODDCLR, NODDCHK flags. -rT
+
+
+//Keep fORIGIN_X_NLAS, fY_OFFSET_2ND_BLOCK, fSTEP_X_NLAS, pMSGDSC_COPY, ubCONST_ALPHA_COPY as
+//sub registers of same GRF to enable using NODDCLR NODDCHK. -rT
+
+//r23
+
+
+//Lumakey
+
+
+//r24
+
+
+//r25
+
+
+//r26
+
+
+//defines to generate LABELS during compile time.
+
+
+//Msg payload buffers; up to 4 full-size messages can be written
+// Bases are 9 registers apart (r29, r38, r47, r56). The save code in this file writes its headers to r28 and r37,
+// i.e. the register directly in front of payload 0 and payload 1.
+.declare muwMSGPAYLOAD0 Base=r29.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw
+.declare muwMSGPAYLOAD1 Base=r38.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw
+.declare muwMSGPAYLOAD2 Base=r47.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw
+.declare muwMSGPAYLOAD3 Base=r56.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw
+// Byte (ub) views of the same four payload areas.
+.declare mubMSGPAYLOAD0 Base=r29.0 ElementSize=1 SrcRegion=<16;16,1> Type=ub
+.declare mubMSGPAYLOAD1 Base=r38.0 ElementSize=1 SrcRegion=<16;16,1> Type=ub
+.declare mubMSGPAYLOAD2 Base=r47.0 ElementSize=1 SrcRegion=<16;16,1> Type=ub
+.declare mubMSGPAYLOAD3 Base=r56.0 ElementSize=1 SrcRegion=<16;16,1> Type=ub
+
+
+//_SAVE_INC_
+
+
+ // At the save module we have all 8 address sub-registers available.
+ // So we will use PING-PONG type of scheme to save the data using
+ // pointers pBUF_CHNL_TOP_8x4 and pBUF_CHNL_BOT_8x4. This will help
+ // reduce dependency. - rT
+ // a0.0-a0.3 are loaded from r22.0-r22.3 unchanged; a0.4-a0.7 get the same values + 512 bytes (16 GRFs, the distance between BUFFER_0 and BUFFER_1).
+ //wBUFF_CHNL_PTR points to either buffer 0 or buffer 4.
+ //Add appropriate offsets to get pointers for all buffers (1,2,3 or 5).
+ //Offsets are zero for buffer 0 and buffer 4.
+ add (4) a0.0:uw r22.0<4;4,1>:w 0:uw
+ add (4) a0.4:uw r22.0<4;4,1>:w 512:uw
+
+ //Set up headers for the Y message (r28) and the interleaved UV message (r37); both start as a copy of r27
+ mov (8) r28<1>:ud r27<8;8,1>:ud
+ mov (8) r37<1>:ud r27<8;8,1>:ud
+ // Luma header takes both origin words from r7.0/r7.1; chroma header takes the same X and Y shifted right by 1.
+ mov (2) r28.0<1>:d r7.0<2;2,1>:w { NoDDClr } //ORI Y (LUMA) = ORI
+ mov (1) r37.0<1>:d r7.0<0;1,0>:w { NoDDClr } //H ORI (CHROMA) = H ORI
+ shr (1) r37.1<1>:d r7.1<0;1,0>:w 1:w { NoDDClr, NoDDChk } //V ORI (CHROMA) = V ORI/2
+
+ mov (1) r28.2<1>:ud 0xF000F:ud { NoDDChk } // Y Block width and height (16x16)
+ mov (1) r37.2<1>:ud 0x7000F:ud { NoDDChk } // UV Block width and height(16x8)
+
+// Unscramble, and pack data directly to MRFs
+
+// Data 16x16 block is divided as -
+// ---------
+// | 0 |
+// ---------
+// | 1 |
+// ---------
+// | 2 |
+// ---------
+// | 3 |
+// ---------
+// All sub-blocks are of size 16x4
+// 0: ubBUFFER_0
+// 1: ubBUFFER_1, ubBUFFER_0+16
+// 2: ubBUFFER_2
+// 3: ubBUFFER_3, ubBUFFER_2+16
+// The moves further down read byte 1 of every word (offsets 1, 33, 65, 97 with an element stride of 2 or 4), i.e. the high byte,
+// so adding 0x0080 to each word first rounds to the nearest 8-bit value instead of truncating.
+//Add rounding operations to all the buffers.
+ add.sat (16) uwBUFFER_0(0)<1> uwBUFFER_0(0)<16;16,1> 0x0080:uw
+ add.sat (16) uwBUFFER_0(1)<1> uwBUFFER_0(1)<16;16,1> 0x0080:uw
+ add.sat (16) uwBUFFER_0(4)<1> uwBUFFER_0(4)<16;16,1> 0x0080:uw
+ add.sat (16) uwBUFFER_0(5)<1> uwBUFFER_0(5)<16;16,1> 0x0080:uw
+ add.sat (16) uwBUFFER_0(6)<1> uwBUFFER_0(6)<16;16,1> 0x0080:uw
+ add.sat (16) uwBUFFER_0(7)<1> uwBUFFER_0(7)<16;16,1> 0x0080:uw
+ add.sat (16) uwBUFFER_0(8)<1> uwBUFFER_0(8)<16;16,1> 0x0080:uw
+ add.sat (16) uwBUFFER_0(9)<1> uwBUFFER_0(9)<16;16,1> 0x0080:uw
+ add.sat (16) uwBUFFER_0(12)<1> uwBUFFER_0(12)<16;16,1> 0x0080:uw
+ add.sat (16) uwBUFFER_0(13)<1> uwBUFFER_0(13)<16;16,1> 0x0080:uw
+ add.sat (16) uwBUFFER_0(14)<1> uwBUFFER_0(14)<16;16,1> 0x0080:uw
+ add.sat (16) uwBUFFER_0(15)<1> uwBUFFER_0(15)<16;16,1> 0x0080:uw
+ add.sat (16) uwBUFFER_0(16)<1> uwBUFFER_0(16)<16;16,1> 0x0080:uw
+ add.sat (16) uwBUFFER_0(17)<1> uwBUFFER_0(17)<16;16,1> 0x0080:uw
+ add.sat (16) uwBUFFER_0(20)<1> uwBUFFER_0(20)<16;16,1> 0x0080:uw
+ add.sat (16) uwBUFFER_0(21)<1> uwBUFFER_0(21)<16;16,1> 0x0080:uw
+ add.sat (16) uwBUFFER_0(22)<1> uwBUFFER_0(22)<16;16,1> 0x0080:uw
+ add.sat (16) uwBUFFER_0(23)<1> uwBUFFER_0(23)<16;16,1> 0x0080:uw
+ add.sat (16) uwBUFFER_0(24)<1> uwBUFFER_0(24)<16;16,1> 0x0080:uw
+ add.sat (16) uwBUFFER_0(25)<1> uwBUFFER_0(25)<16;16,1> 0x0080:uw
+ add.sat (16) uwBUFFER_0(28)<1> uwBUFFER_0(28)<16;16,1> 0x0080:uw
+ add.sat (16) uwBUFFER_0(29)<1> uwBUFFER_0(29)<16;16,1> 0x0080:uw
+ add.sat (16) uwBUFFER_0(30)<1> uwBUFFER_0(30)<16;16,1> 0x0080:uw
+ add.sat (16) uwBUFFER_0(31)<1> uwBUFFER_0(31)<16;16,1> 0x0080:uw
+ add.sat (16) uwBUFFER_0(32)<1> uwBUFFER_0(32)<16;16,1> 0x0080:uw
+ add.sat (16) uwBUFFER_0(33)<1> uwBUFFER_0(33)<16;16,1> 0x0080:uw
+ add.sat (16) uwBUFFER_0(36)<1> uwBUFFER_0(36)<16;16,1> 0x0080:uw
+ add.sat (16) uwBUFFER_0(37)<1> uwBUFFER_0(37)<16;16,1> 0x0080:uw
+ add.sat (16) uwBUFFER_0(38)<1> uwBUFFER_0(38)<16;16,1> 0x0080:uw
+ add.sat (16) uwBUFFER_0(39)<1> uwBUFFER_0(39)<16;16,1> 0x0080:uw
+ add.sat (16) uwBUFFER_0(40)<1> uwBUFFER_0(40)<16;16,1> 0x0080:uw
+ add.sat (16) uwBUFFER_0(41)<1> uwBUFFER_0(41)<16;16,1> 0x0080:uw
+ add.sat (16) uwBUFFER_0(44)<1> uwBUFFER_0(44)<16;16,1> 0x0080:uw
+ add.sat (16) uwBUFFER_0(45)<1> uwBUFFER_0(45)<16;16,1> 0x0080:uw
+ add.sat (16) uwBUFFER_0(46)<1> uwBUFFER_0(46)<16;16,1> 0x0080:uw
+ add.sat (16) uwBUFFER_0(47)<1> uwBUFFER_0(47)<16;16,1> 0x0080:uw
+ add.sat (16) uwBUFFER_0(48)<1> uwBUFFER_0(48)<16;16,1> 0x0080:uw
+ add.sat (16) uwBUFFER_0(49)<1> uwBUFFER_0(49)<16;16,1> 0x0080:uw
+ add.sat (16) uwBUFFER_0(52)<1> uwBUFFER_0(52)<16;16,1> 0x0080:uw
+ add.sat (16) uwBUFFER_0(53)<1> uwBUFFER_0(53)<16;16,1> 0x0080:uw
+ add.sat (16) uwBUFFER_0(54)<1> uwBUFFER_0(54)<16;16,1> 0x0080:uw
+ add.sat (16) uwBUFFER_0(55)<1> uwBUFFER_0(55)<16;16,1> 0x0080:uw
+ add.sat (16) uwBUFFER_0(56)<1> uwBUFFER_0(56)<16;16,1> 0x0080:uw
+ add.sat (16) uwBUFFER_0(57)<1> uwBUFFER_0(57)<16;16,1> 0x0080:uw
+ add.sat (16) uwBUFFER_0(60)<1> uwBUFFER_0(60)<16;16,1> 0x0080:uw
+ add.sat (16) uwBUFFER_0(61)<1> uwBUFFER_0(61)<16;16,1> 0x0080:uw
+ add.sat (16) uwBUFFER_0(62)<1> uwBUFFER_0(62)<16;16,1> 0x0080:uw
+ add.sat (16) uwBUFFER_0(63)<1> uwBUFFER_0(63)<16;16,1> 0x0080:uw
+// Rows 2-3 of every group of 8 (2, 3, 10, 11, ...) get no rounding add.
+// NOTE(review): presumably those rows hold a channel that is not saved - confirm.
+//Buffer 0
+//Move Y to msg payload
+ mov (16) mubMSGPAYLOAD0(0,0)<1> r[a0.1, 1]<32;16,2>:ub { NoDDClr }
+ mov (16) mubMSGPAYLOAD0(0,16)<1> r[a0.1, 33]<32;16,2>:ub { NoDDChk }
+ mov (16) mubMSGPAYLOAD0(1,0)<1> r[a0.1, 65]<32;16,2>:ub { NoDDClr }
+ mov (16) mubMSGPAYLOAD0(1,16)<1> r[a0.1, 97]<32;16,2>:ub { NoDDChk }
+
+//Move U to msg payload
+ mov (8) mubMSGPAYLOAD1(0,0)<2> r[a0.2, 1]<32;8,4>:ub { NoDDClr }
+ mov (8) mubMSGPAYLOAD1(0,16)<2> r[a0.2, 65]<32;8,4>:ub { NoDDClr, NoDDChk }
+
+//Move V to msg payload
+ mov (8) mubMSGPAYLOAD1(0,1)<2> r[a0.0, 1]<32;8,4>:ub { NoDDClr, NoDDChk }
+ mov (8) mubMSGPAYLOAD1(0,17)<2> r[a0.0, 65]<32;8,4>:ub { NoDDChk }
+ // U goes to even and V to odd destination bytes (<2> stride), giving interleaved UV.
+ add (4) a0.0:uw r22.0<4;4,1>:w 1024:uw //Update Buffer 2 pointers
+
+//Buffer 1
+ mov (16) mubMSGPAYLOAD0(2,0)<1> r[a0.5, 1]<32;16,2>:ub { NoDDClr }
+ mov (16) mubMSGPAYLOAD0(2,16)<1> r[a0.5, 33]<32;16,2>:ub { NoDDChk }
+ mov (16) mubMSGPAYLOAD0(3,0)<1> r[a0.5, 65]<32;16,2>:ub { NoDDClr }
+ mov (16) mubMSGPAYLOAD0(3,16)<1> r[a0.5, 97]<32;16,2>:ub { NoDDChk }
+
+ mov (8) mubMSGPAYLOAD1(1,0)<2> r[a0.6, 1]<32;8,4>:ub { NoDDClr }
+ mov (8) mubMSGPAYLOAD1(1,16)<2> r[a0.6, 65]<32;8,4>:ub { NoDDClr, NoDDChk }
+
+ mov (8) mubMSGPAYLOAD1(1,1)<2> r[a0.4, 1]<32;8,4>:ub { NoDDClr, NoDDChk }
+ mov (8) mubMSGPAYLOAD1(1,17)<2> r[a0.4, 65]<32;8,4>:ub { NoDDChk }
+
+ add (4) a0.4:uw r22.0<4;4,1>:w 1536:uw //Update Buffer 3 pointers
+
+//Buffer 2
+ mov (16) mubMSGPAYLOAD0(4,0)<1> r[a0.1, 1]<32;16,2>:ub { NoDDClr }
+ mov (16) mubMSGPAYLOAD0(4,16)<1> r[a0.1, 33]<32;16,2>:ub { NoDDChk }
+ mov (16) mubMSGPAYLOAD0(5,0)<1> r[a0.1, 65]<32;16,2>:ub { NoDDClr }
+ mov (16) mubMSGPAYLOAD0(5,16)<1> r[a0.1, 97]<32;16,2>:ub { NoDDChk }
+
+ mov (8) mubMSGPAYLOAD1(2,0)<2> r[a0.2, 1]<32;8,4>:ub { NoDDClr }
+ mov (8) mubMSGPAYLOAD1(2,16)<2> r[a0.2, 65]<32;8,4>:ub { NoDDClr, NoDDChk }
+
+ mov (8) mubMSGPAYLOAD1(2,1)<2> r[a0.0, 1]<32;8,4>:ub { NoDDClr, NoDDChk }
+ mov (8) mubMSGPAYLOAD1(2,17)<2> r[a0.0, 65]<32;8,4>:ub { NoDDChk }
+
+//Buffer 3
+ mov (16) mubMSGPAYLOAD0(6,0)<1> r[a0.5, 1]<32;16,2>:ub { NoDDClr }
+ mov (16) mubMSGPAYLOAD0(6,16)<1> r[a0.5, 33]<32;16,2>:ub { NoDDChk }
+ mov (16) mubMSGPAYLOAD0(7,0)<1> r[a0.5, 65]<32;16,2>:ub { NoDDClr }
+ mov (16) mubMSGPAYLOAD0(7,16)<1> r[a0.5, 97]<32;16,2>:ub { NoDDChk }
+
+ mov (8) mubMSGPAYLOAD1(3,0)<2> r[a0.6, 1]<32;8,4>:ub { NoDDClr }
+ mov (8) mubMSGPAYLOAD1(3,16)<2> r[a0.6, 65]<32;8,4>:ub { NoDDClr, NoDDChk }
+
+ mov (8) mubMSGPAYLOAD1(3,1)<2> r[a0.4, 1]<32;8,4>:ub { NoDDClr, NoDDChk }
+ mov (8) mubMSGPAYLOAD1(3,17)<2> r[a0.4, 65]<32;8,4>:ub { NoDDChk }
+//===========================================================================
+// Media block writes: r28 = header + 8 GRFs of Y (msg len 9, binding index 0x18); r37 = header + 4 GRFs of UV (msg len 5, binding index 0x19).
+send (1) null<1>:d r28 0x5 0x120A8018:ud
+send (1) null<1>:d r37 0x5 0xA0A8019:ud
--- /dev/null
+// 110 // Total instruction count
+// 1 // Total kernel count
+
+
+// Module name: common.inc
+//
+// Common header file for all Video-Processing kernels
+//
+
+.default_execution_size (16)
+.default_register_type :ub
+// Assembler defaults above: execution size 16 and operand type :ub. Below: register budget (128 total, 7 counted as payload).
+.reg_count_total 128
+.reg_count_payload 7
+
+//========== Common constants ==========
+
+
+//========== Macros ==========
+
+
+//Fast Jump, For more details see "Set_Layer_N.asm"
+
+
+//========== Defines ====================
+
+//========== Static Parameters (Common To All) ==========
+//r1
+
+
+//r2
+
+ // e.g. byte0 byte1 byte2
+ // YUYV 0 1 3
+ // YVYU 0 3 1
+
+//Color Pipe (IECP) parameters
+
+
+//r4
+
+ // e.g. byte0 byte1 byte2
+ // YUYV 0 1 3
+ // YVYU 0 3 1
+
+
+//========== Inline parameters (Common To All) ===========
+
+
+//============== Binding Index Table===========
+//Common between DNDI and DNUV
+
+
+//================= Common Message Descriptor =====
+// Message descriptor for thread spawning
+// Message Descriptors
+// = 000 0001 (min message len 1 ) 0,0000 (resp len 0 -add later)
+// 0000,0000,0000
+// 0001(Spawn a root thread),0001 (Root thread spawn thread)
+// = 0x02000011
+// Thread Spawner Message Descriptor
+
+
+// Message descriptor for atomic operation add
+// Message Descriptors
+// = 000 0110 (min message len 6 ) 0,0000 (resp len 0 -add later)
+// 1(header present)001,10(typed atomic operation)0(return enabled)0(slot group, low 8 bits),0111 (AOP_Add)
+// 0000,0000 (Binding table index, added later)
+// = 0x0C098700
+
+// Atomic Operation Add Message Descriptor
+
+
+// Message descriptor for dataport media write
+ // Message Descriptors
+ // = 000 0001 (min message len 1 - add later) 00000 (resp len 0)
+ // 1 (header present 1) 0 1010 (media block write) 000000
+ // 00000000 (binding table index - set later)
+ // = 0x020A8000
+
+
+// Message Length defines
+
+
+// Response Length defines
+
+
+// Block Width and Height Size defines
+
+
+// Extended Message Descriptors
+
+
+// Common message descriptors:
+
+
+//===================== Math Function Control ===================================
+
+
+//============ Message Registers ===============
+ // buf4 starts from r28
+
+
+//#define mMSGHDR_EOT r43 // Dummy Message Register for EOT
+
+// Four typed views (ub, uw, ud, f) of the message payload area; all of them share Base=r30.
+.declare mubMSGPAYLOAD Base=r30 ElementSize=1 SrcRegion=<16;16,1> Type=ub
+.declare muwMSGPAYLOAD Base=r30 ElementSize=2 SrcRegion=<16;16,1> Type=uw
+.declare mudMSGPAYLOAD Base=r30 ElementSize=4 SrcRegion=<8;8,1> Type=ud
+.declare mfMSGPAYLOAD Base=r30 ElementSize=4 SrcRegion=<8;8,1> Type=f
+
+//=================== End of thread instruction ===========================
+
+
+//=====================Pointers Used=====================================
+
+
+//=======================================================================
+
+
+//r9-r17
+// Define temp space for any usages
+
+
+// Common Buffers
+
+
+// End of common.inc
+
+// Module name: Save_PA.asm
+//
+// Save PA 422 frame data block of size 16x16
+//
+// To save a 16x16 block (32x16 bytes of YUYV) we need 2 send instructions, each of size 16x16.
+// -------------------------------
+// | 16x16 | 16x16 |
+// | YUYV | YUYV |
+// -------------------------------
+
+
+// Module name: Save.inc
+
+
+// Description: Includes all definitions explicit to Fast Composite.
+
+
+// End of common.inc
+
+
+//========== GRF partition ==========
+ // r0 header : r0 (1 GRF)
+ // Static parameters : r1 - r6 (6 GRFS)
+ // Inline parameters : r7 - r8 (2 GRFs)
+ // MSGSRC : r27 (1 GRF)
+//===================================
+
+//Interface:
+//========== Static Parameters (Explicit To Fast Composite) ==========
+//r1
+//CSC Set 0
+
+
+.declare udCSC_CURBE Base=r1.0 ElementSize=4 Type=ud // dword view of the static parameters starting at r1.0
+
+//Constant alpha
+
+
+//r2
+
+
+// WiDi Definitions
+
+
+//Colorfill
+
+// Byte view at r2.20; the <0;1,0> source region replicates that single element across all channels
+.declare ubCOLOR_PIXEL_VAL Base=r2.20 ElementSize=1 SrcRegion=<0;1,0> DstRegion=<1> Type=ub
+
+//r3
+//Normalised Ratio of Horizontal step size with main video for all layers
+
+
+ //Normalised Ratio of Horizontal step size with main video for all layers becomes
+ //Normalised Horizontal step size for all layers in VP_Setup.asm
+
+
+//r4
+//Normalised Vertical step size for all layers
+
+
+//r5
+//Normalised Vertical Frame Origin for all layers
+
+
+//r6
+//Normalised Horizontal Frame Origin for all layers
+
+
+//========== Inline Parameters (Explicit To Fast Composite) ==========
+
+
+//Main video Step X
+
+
+//====================== Binding table (Explicit To Fast Composite)=========================================
+
+
+//Used by Interlaced Scaling Kernels
+
+
+//========== Sampler State Table Index (Explicit To Fast Composite)==========
+//Sampler Index for AVS/IEF messages
+
+
+//Sampler Index for SIMD16 sampler messages
+
+
+//=============================================================================
+// Typed views of six working buffers: BUFFER_0..3 at r64/r80/r96/r112 (16 GRFs apart), BUFFER_4 at r28, BUFFER_5 at r46. Float views:
+.declare fBUFFER_0 Base=r64.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_1 Base=r80.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_2 Base=r96.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_3 Base=r112.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_4 Base=r28.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_5 Base=r46.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+// Same six bases viewed as unsigned dwords
+.declare udBUFFER_0 Base=r64.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_1 Base=r80.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_2 Base=r96.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_3 Base=r112.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_4 Base=r28.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_5 Base=r46.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+// Same six bases viewed as unsigned words
+.declare uwBUFFER_0 Base=r64.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_1 Base=r80.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_2 Base=r96.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_3 Base=r112.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_4 Base=r28.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_5 Base=r46.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+// Same six bases viewed as contiguous unsigned bytes
+.declare ubBUFFER_0 Base=r64.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_1 Base=r80.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_2 Base=r96.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_3 Base=r112.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_4 Base=r28.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_5 Base=r46.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+// Byte views that step 4 bytes between elements (SrcRegion <32;8,4>, DstRegion <4>)
+.declare ub4BUFFER_0 Base=r64.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_1 Base=r80.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_2 Base=r96.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_3 Base=r112.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_4 Base=r28.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_5 Base=r46.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+
+//Pointer to mask reg
+
+
+//r18
+
+
+//Always keep Channel Pointers and Offsets in the same GRF, so that we can use
+// NODDCLR, NODDCHK flags. -rT
+
+
+.declare udCSC_COEFF_0 Base=r18.0 ElementSize=4 Type=ud // 1 GRF: dword view of r18
+
+//r19
+
+
+.declare udCSC_COEFF_1 Base=r19.0 ElementSize=4 Type=ud // 1 GRF: dword view of r19
+
+
+//r20
+
+.declare uwALPHA_MASK_REG_TEMP Base=r20.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw // 1 GRF: 16 words of r20
+
+//r21
+
+.declare uwALPHA_MASK_REG Base=r21.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw // 1 GRF: 16 words of r21
+
+//r22
+
+
+//Always keep Channel Pointers and Offsets in the same GRF, so that we can use
+// NODDCLR, NODDCHK flags. -rT
+
+
+//Keep fORIGIN_X_NLAS, fY_OFFSET_2ND_BLOCK, fSTEP_X_NLAS, pMSGDSC_COPY, ubCONST_ALPHA_COPY as
+//sub registers of same GRF to enable using NODDCLR NODDCHK. -rT
+
+//r23
+
+
+//Lumakey
+
+
+//r24
+
+
+//r25
+
+
+//r26
+
+
+//defines to generate LABELS during compile time.
+
+
+//Msg payload buffers; up to 4 full-size messages can be written
+// Bases are 9 GRFs apart (r29, r38, r47, r56), leaving one register in front of each payload;
+// word views come first, then byte views of the same registers.
+.declare muwMSGPAYLOAD0 Base=r29.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw
+.declare muwMSGPAYLOAD1 Base=r38.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw
+.declare muwMSGPAYLOAD2 Base=r47.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw
+.declare muwMSGPAYLOAD3 Base=r56.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw
+
+.declare mubMSGPAYLOAD0 Base=r29.0 ElementSize=1 SrcRegion=<16;16,1> Type=ub
+.declare mubMSGPAYLOAD1 Base=r38.0 ElementSize=1 SrcRegion=<16;16,1> Type=ub
+.declare mubMSGPAYLOAD2 Base=r47.0 ElementSize=1 SrcRegion=<16;16,1> Type=ub
+.declare mubMSGPAYLOAD3 Base=r56.0 ElementSize=1 SrcRegion=<16;16,1> Type=ub
+
+
+//_SAVE_INC_
+
+
+ //wBUFF_CHNL_PTR points to buffer 0: a0.0-a0.3 are loaded from the four words at r22.0.
+ //Add appropriate offsets to get pointers for all buffers (1,2,3); the later sections reload them with +512, +1024 and +1536.
+ //Offset is zero for buffer 0.
+ add (4) a0.0:uw r22.0<4;4,1>:w 0:uw
+
+ //Set DEST pointers (a0.4-a0.7) according to output packing i.e. YUYV, YVYU, UYVY, VYUY: bytes r2.28-r2.31 plus 928 (= 29*32, the byte address of r29 where the first message payload is declared)
+ add (4) a0.4<1>:w r2.28<4;4,1>:ub 928:uw
+ // Build the message header fields in r27; r27 is copied to r28 and r37 further down
+ shl (1) r27.0<1>:d r7.0<0;1,0>:w 1:w { NoDDClr } // H. block origin needs to be doubled: the save header describes 32 bytes per 16-pixel row
+ mov (1) r27.1<1>:d r7.1<0;1,0>:w { NoDDClr, NoDDChk } // Block origin (1st quadrant), taken from r7.1 unchanged
+ mov (1) r27.2<1>:ud 0xF000F:ud { NoDDChk } // Block width and height (16x16); both 16-bit fields hold 0xF, i.e. size minus one
+
+ // 1st 16x4 block response: rows 0-3 of both messages, read through the source pointers at offset 0 (buffer 0)
+ // V 1st quarter left: 4 samples per row (source and destination byte stride 4), one mov per row; destination rows are 16 bytes apart, source rows 32 bytes apart
+ mov (4) r[a0.6, 0]<4>:ub r[a0.0, 1]<16;4,4>:ub { NoDDClr }
+ mov (4) r[a0.6, 16]<4>:ub r[a0.0,33]<16;4,4>:ub { NoDDClr, NoDDChk }
+ mov (4) r[a0.6, 32]<4>:ub r[a0.0,65]<16;4,4>:ub { NoDDClr }
+ mov (4) r[a0.6, 48]<4>:ub r[a0.0,97]<16;4,4>:ub { NoDDClr, NoDDChk }
+
+ // Y 1st quarter left: 8 samples per row (source and destination byte stride 2), source pointer a0.1, destination pointer a0.4
+ mov (8) r[a0.4, 0]<2>:ub r[a0.1, 1]<16;8,2>:ub { NoDDClr, NoDDChk }
+ mov (8) r[a0.4, 16]<2>:ub r[a0.1,33]<16;8,2>:ub { NoDDClr, NoDDChk }
+ mov (8) r[a0.4, 32]<2>:ub r[a0.1,65]<16;8,2>:ub { NoDDClr, NoDDChk }
+ mov (8) r[a0.4, 48]<2>:ub r[a0.1,97]<16;8,2>:ub { NoDDClr, NoDDChk }
+
+ // U 1st quarter left: same access pattern as V, through source pointer a0.2 and destination pointer a0.5
+ mov (4) r[a0.5, 0]<4>:ub r[a0.2, 1]<16;4,4>:ub { NoDDClr, NoDDChk }
+ mov (4) r[a0.5, 16]<4>:ub r[a0.2,33]<16;4,4>:ub { NoDDChk }
+ mov (4) r[a0.5, 32]<4>:ub r[a0.2,65]<16;4,4>:ub { NoDDClr, NoDDChk }
+ mov (4) r[a0.5, 48]<4>:ub r[a0.2,97]<16;4,4>:ub { NoDDChk }
+
+ // V 1st quarter right: source offsets are 16 bytes further into each row (17/49/81/113); destination +288 bytes = 9 GRFs, i.e. the payload of the second message
+ mov (4) r[a0.6,288]<4>:ub r[a0.0, 17]<16;4,4>:ub { NoDDClr }
+ mov (4) r[a0.6,304]<4>:ub r[a0.0,49]<16;4,4>:ub { NoDDClr, NoDDChk }
+ mov (4) r[a0.6,320]<4>:ub r[a0.0,81]<16;4,4>:ub { NoDDClr }
+ mov (4) r[a0.6,336]<4>:ub r[a0.0,113]<16;4,4>:ub { NoDDClr, NoDDChk }
+
+ // Y 1st quarter right
+ mov (8) r[a0.4,288]<2>:ub r[a0.1, 17]<16;8,2>:ub { NoDDClr, NoDDChk }
+ mov (8) r[a0.4,304]<2>:ub r[a0.1,49]<16;8,2>:ub { NoDDClr, NoDDChk }
+ mov (8) r[a0.4,320]<2>:ub r[a0.1,81]<16;8,2>:ub { NoDDClr, NoDDChk }
+ mov (8) r[a0.4,336]<2>:ub r[a0.1,113]<16;8,2>:ub { NoDDClr, NoDDChk }
+
+ // U 1st quarter right
+ mov (4) r[a0.5,288]<4>:ub r[a0.2, 17]<16;4,4>:ub { NoDDClr, NoDDChk }
+ mov (4) r[a0.5,304]<4>:ub r[a0.2,49]<16;4,4>:ub { NoDDChk }
+ mov (4) r[a0.5,320]<4>:ub r[a0.2,81]<16;4,4>:ub { NoDDClr, NoDDChk }
+ mov (4) r[a0.5,336]<4>:ub r[a0.2,113]<16;4,4>:ub { NoDDChk }
+
+ // 2nd 16x4 block response: reload the source pointers 512 bytes (16 GRFs) past the base in r22; destinations continue at rows 4-7 (offsets 64-112 left, 352-400 right)
+ add (4) a0.0:uw r22.0<4;4,1>:w 512:uw
+
+//------- Copy the header built in r27 into the two message headers, r28 and r37
+mov (8) r28<1>:ud r27<8;8,1>:ud
+mov (8) r37<1>:ud r27<8;8,1>:ud
+//-------
+
+ // V 2nd quarter left: source pointer a0.0, destination pointer a0.6, 4 samples per row
+ mov (4) r[a0.6, 64]<4>:ub r[a0.0, 1]<16;4,4>:ub { NoDDClr }
+ mov (4) r[a0.6, 80]<4>:ub r[a0.0,33]<16;4,4>:ub { NoDDClr, NoDDChk }
+ mov (4) r[a0.6, 96]<4>:ub r[a0.0,65]<16;4,4>:ub { NoDDClr }
+ mov (4) r[a0.6,112]<4>:ub r[a0.0,97]<16;4,4>:ub { NoDDClr, NoDDChk }
+
+ // Y 2nd quarter left: source pointer a0.1, destination pointer a0.4, 8 samples per row
+ mov (8) r[a0.4, 64]<2>:ub r[a0.1, 1]<16;8,2>:ub { NoDDClr, NoDDChk }
+ mov (8) r[a0.4, 80]<2>:ub r[a0.1,33]<16;8,2>:ub { NoDDClr, NoDDChk }
+ mov (8) r[a0.4, 96]<2>:ub r[a0.1,65]<16;8,2>:ub { NoDDClr, NoDDChk }
+ mov (8) r[a0.4,112]<2>:ub r[a0.1,97]<16;8,2>:ub { NoDDClr, NoDDChk }
+
+ // U 2nd quarter left: source pointer a0.2, destination pointer a0.5, 4 samples per row
+ mov (4) r[a0.5, 64]<4>:ub r[a0.2, 1]<16;4,4>:ub { NoDDClr, NoDDChk }
+ mov (4) r[a0.5, 80]<4>:ub r[a0.2,33]<16;4,4>:ub { NoDDChk }
+ mov (4) r[a0.5, 96]<4>:ub r[a0.2,65]<16;4,4>:ub { NoDDClr, NoDDChk }
+ mov (4) r[a0.5,112]<4>:ub r[a0.2,97]<16;4,4>:ub { NoDDChk }
+
+ // V 2nd quarter right: right-half source offsets (17/49/81/113) into the second message payload (+288 bytes)
+ mov (4) r[a0.6,352]<4>:ub r[a0.0, 17]<16;4,4>:ub { NoDDClr }
+ mov (4) r[a0.6,368]<4>:ub r[a0.0,49]<16;4,4>:ub { NoDDClr, NoDDChk }
+ mov (4) r[a0.6,384]<4>:ub r[a0.0,81]<16;4,4>:ub { NoDDClr }
+ mov (4) r[a0.6,400]<4>:ub r[a0.0,113]<16;4,4>:ub { NoDDClr, NoDDChk }
+
+ // Y 2nd quarter right
+ mov (8) r[a0.4,352]<2>:ub r[a0.1, 17]<16;8,2>:ub { NoDDClr, NoDDChk }
+ mov (8) r[a0.4,368]<2>:ub r[a0.1,49]<16;8,2>:ub { NoDDClr, NoDDChk }
+ mov (8) r[a0.4,384]<2>:ub r[a0.1,81]<16;8,2>:ub { NoDDClr, NoDDChk }
+ mov (8) r[a0.4,400]<2>:ub r[a0.1,113]<16;8,2>:ub { NoDDClr, NoDDChk }
+
+ // U 2nd quarter right
+ mov (4) r[a0.5,352]<4>:ub r[a0.2, 17]<16;4,4>:ub { NoDDClr, NoDDChk }
+ mov (4) r[a0.5,368]<4>:ub r[a0.2,49]<16;4,4>:ub { NoDDChk }
+ mov (4) r[a0.5,384]<4>:ub r[a0.2,81]<16;4,4>:ub { NoDDClr, NoDDChk }
+ mov (4) r[a0.5,400]<4>:ub r[a0.2,113]<16;4,4>:ub { NoDDChk }
+
+ // 3rd 16x4 block response: reload the source pointers 1024 bytes (32 GRFs) past the base in r22
+ add (4) a0.0:uw r22.0<4;4,1>:w 1024:uw
+
+//----------
+//Set DEST pointers according to output packing i.e. YUYV, YVYU, UYVY, VYUY: rebased to 1056 (= 33*32, r33), 4 GRFs past the first payload register, so offsets 0-48 now land on rows 8-11
+add (4) a0.4<1>:w r2.28<4;4,1>:ub 1056:uw
+add (1) r37.0<1>:d r27.0<0;1,0>:d 16:d // Point to 2nd part: X origin of the second message = X origin in r27 + 16 bytes
+//----------
+
+ // V 3rd quarter left: source pointer a0.0, destination pointer a0.6, 4 samples per row
+ mov (4) r[a0.6, 0]<4>:ub r[a0.0, 1]<16;4,4>:ub { NoDDClr }
+ mov (4) r[a0.6, 16]<4>:ub r[a0.0,33]<16;4,4>:ub { NoDDClr, NoDDChk }
+ mov (4) r[a0.6, 32]<4>:ub r[a0.0,65]<16;4,4>:ub { NoDDClr }
+ mov (4) r[a0.6, 48]<4>:ub r[a0.0,97]<16;4,4>:ub { NoDDClr, NoDDChk }
+
+ // Y 3rd quarter left: source pointer a0.1, destination pointer a0.4, 8 samples per row
+ mov (8) r[a0.4, 0]<2>:ub r[a0.1, 1]<16;8,2>:ub { NoDDClr, NoDDChk }
+ mov (8) r[a0.4, 16]<2>:ub r[a0.1,33]<16;8,2>:ub { NoDDClr, NoDDChk }
+ mov (8) r[a0.4, 32]<2>:ub r[a0.1,65]<16;8,2>:ub { NoDDClr, NoDDChk }
+ mov (8) r[a0.4, 48]<2>:ub r[a0.1,97]<16;8,2>:ub { NoDDClr, NoDDChk }
+
+ // U 3rd quarter left: source pointer a0.2, destination pointer a0.5, 4 samples per row
+ mov (4) r[a0.5, 0]<4>:ub r[a0.2, 1]<16;4,4>:ub { NoDDClr, NoDDChk }
+ mov (4) r[a0.5, 16]<4>:ub r[a0.2,33]<16;4,4>:ub { NoDDChk }
+ mov (4) r[a0.5, 32]<4>:ub r[a0.2,65]<16;4,4>:ub { NoDDClr, NoDDChk }
+ mov (4) r[a0.5, 48]<4>:ub r[a0.2,97]<16;4,4>:ub { NoDDChk }
+
+ // V 3rd quarter right: right-half source offsets (17/49/81/113) into the second message payload (+288 bytes)
+ mov (4) r[a0.6,288]<4>:ub r[a0.0, 17]<16;4,4>:ub { NoDDClr }
+ mov (4) r[a0.6,304]<4>:ub r[a0.0,49]<16;4,4>:ub { NoDDClr, NoDDChk }
+ mov (4) r[a0.6,320]<4>:ub r[a0.0,81]<16;4,4>:ub { NoDDClr }
+ mov (4) r[a0.6,336]<4>:ub r[a0.0,113]<16;4,4>:ub { NoDDClr, NoDDChk }
+
+ // Y 3rd quarter right
+ mov (8) r[a0.4,288]<2>:ub r[a0.1, 17]<16;8,2>:ub { NoDDClr, NoDDChk }
+ mov (8) r[a0.4,304]<2>:ub r[a0.1,49]<16;8,2>:ub { NoDDClr, NoDDChk }
+ mov (8) r[a0.4,320]<2>:ub r[a0.1,81]<16;8,2>:ub { NoDDClr, NoDDChk }
+ mov (8) r[a0.4,336]<2>:ub r[a0.1,113]<16;8,2>:ub { NoDDClr, NoDDChk }
+
+ // U 3rd quarter right
+ mov (4) r[a0.5,288]<4>:ub r[a0.2, 17]<16;4,4>:ub { NoDDClr, NoDDChk }
+ mov (4) r[a0.5,304]<4>:ub r[a0.2,49]<16;4,4>:ub { NoDDChk }
+ mov (4) r[a0.5,320]<4>:ub r[a0.2,81]<16;4,4>:ub { NoDDClr, NoDDChk }
+ mov (4) r[a0.5,336]<4>:ub r[a0.2,113]<16;4,4>:ub { NoDDChk }
+
+ // 4th 16x4 block response: reload the source pointers 1536 bytes (48 GRFs) past the base in r22; destination offsets 64-112 and 352-400 are the last four rows of each message
+ add (4) a0.0:uw r22.0<4;4,1>:w 1536:uw
+
+ // V 4th quarter left: source pointer a0.0, destination pointer a0.6, 4 samples per row
+ mov (4) r[a0.6, 64]<4>:ub r[a0.0, 1]<16;4,4>:ub { NoDDClr }
+ mov (4) r[a0.6, 80]<4>:ub r[a0.0,33]<16;4,4>:ub { NoDDClr, NoDDChk }
+ mov (4) r[a0.6, 96]<4>:ub r[a0.0,65]<16;4,4>:ub { NoDDClr }
+ mov (4) r[a0.6,112]<4>:ub r[a0.0,97]<16;4,4>:ub { NoDDClr, NoDDChk }
+
+ // Y 4th quarter left: source pointer a0.1, destination pointer a0.4, 8 samples per row
+ mov (8) r[a0.4, 64]<2>:ub r[a0.1, 1]<16;8,2>:ub { NoDDClr, NoDDChk }
+ mov (8) r[a0.4, 80]<2>:ub r[a0.1,33]<16;8,2>:ub { NoDDClr, NoDDChk }
+ mov (8) r[a0.4, 96]<2>:ub r[a0.1,65]<16;8,2>:ub { NoDDClr, NoDDChk }
+ mov (8) r[a0.4,112]<2>:ub r[a0.1,97]<16;8,2>:ub { NoDDClr, NoDDChk }
+
+ // U 4th quarter left: source pointer a0.2, destination pointer a0.5, 4 samples per row
+ mov (4) r[a0.5, 64]<4>:ub r[a0.2, 1]<16;4,4>:ub { NoDDClr, NoDDChk }
+ mov (4) r[a0.5, 80]<4>:ub r[a0.2,33]<16;4,4>:ub { NoDDChk }
+ mov (4) r[a0.5, 96]<4>:ub r[a0.2,65]<16;4,4>:ub { NoDDClr, NoDDChk }
+ mov (4) r[a0.5,112]<4>:ub r[a0.2,97]<16;4,4>:ub { NoDDChk }
+
+ // V 4th quarter right: right-half source offsets (17/49/81/113) into the second message payload (+288 bytes)
+ mov (4) r[a0.6,352]<4>:ub r[a0.0, 17]<16;4,4>:ub { NoDDClr }
+ mov (4) r[a0.6,368]<4>:ub r[a0.0,49]<16;4,4>:ub { NoDDClr, NoDDChk }
+ mov (4) r[a0.6,384]<4>:ub r[a0.0,81]<16;4,4>:ub { NoDDClr }
+ mov (4) r[a0.6,400]<4>:ub r[a0.0,113]<16;4,4>:ub { NoDDClr, NoDDChk }
+
+ // Y 4th quarter right
+ mov (8) r[a0.4,352]<2>:ub r[a0.1, 17]<16;8,2>:ub { NoDDClr, NoDDChk }
+ mov (8) r[a0.4,368]<2>:ub r[a0.1,49]<16;8,2>:ub { NoDDClr, NoDDChk }
+ mov (8) r[a0.4,384]<2>:ub r[a0.1,81]<16;8,2>:ub { NoDDClr, NoDDChk }
+ mov (8) r[a0.4,400]<2>:ub r[a0.1,113]<16;8,2>:ub { NoDDClr, NoDDChk }
+
+ // U 4th quarter right
+ mov (4) r[a0.5,352]<4>:ub r[a0.2, 17]<16;4,4>:ub { NoDDClr, NoDDChk }
+ mov (4) r[a0.5,368]<4>:ub r[a0.2,49]<16;4,4>:ub { NoDDChk }
+ mov (4) r[a0.5,384]<4>:ub r[a0.2,81]<16;4,4>:ub { NoDDClr, NoDDChk }
+ mov (4) r[a0.5,400]<4>:ub r[a0.2,113]<16;4,4>:ub { NoDDChk }
+ // Issue the two block writes from r28 and r37; descriptor 0x120A8018 carries message length 9 (header + 8 payload GRFs) and binding table index 0x18 in its low byte
+ send (1) null<1>:d r28 0x5 0x120A8018:ud
+ send (1) null<1>:d r37 0x5 0x120A8018:ud
--- /dev/null
+// 48 // Total instruction count
+// 1 // Total kernel count
+
+
+// Module name: common.inc
+//
+// Common header file for all Video-Processing kernels
+//
+
+.default_execution_size (16)
+.default_register_type :ub
+
+.reg_count_total 128
+.reg_count_payload 7
+
+//========== Common constants ==========
+
+
+//========== Macros ==========
+
+
+//Fast Jump, For more details see "Set_Layer_N.asm"
+
+
+//========== Defines ====================
+
+//========== Static Parameters (Common To All) ==========
+//r1
+
+
+//r2
+
+ // e.g. byte0 byte1 byte2
+ // YUYV 0 1 3
+ // YVYU 0 3 1
+
+//Color Pipe (IECP) parameters
+
+
+//r4
+
+ // e.g. byte0 byte1 byte2
+ // YUYV 0 1 3
+ // YVYU 0 3 1
+
+
+//========== Inline parameters (Common To All) ===========
+
+
+//============== Binding Index Table===========
+//Common between DNDI and DNUV
+
+
+//================= Common Message Descriptor =====
+// Message descriptor for thread spawning
+// Message Descriptors
+// = 000 0001 (min message len 1 ) 0,0000 (resp len 0 -add later)
+// 0000,0000,0000
+// 0001(Spawn a root thread),0001 (Root thread spawn thread)
+// = 0x02000011
+// Thread Spawner Message Descriptor
+
+
+// Message descriptor for atomic operation add
+// Message Descriptors
+// = 000 0110 (min message len 6 ) 0,0000 (resp len 0 -add later)
+// 1(header present)001,10(typed atomic operation)0(return enabled)0(slot group, low 8 bits),0111 (AOP_Add)
+// 0000,0000 (Binding table index, added later)
+// = 0x0C098700 (value of the bit fields listed above; 0x02000011 is the thread-spawner descriptor)
+
+// Atomic Operation Add Message Descriptor
+
+
+// Message descriptor for dataport media write
+ // Message Descriptors
+ // = 000 0001 (min message len 1 - add later) 00000 (resp len 0)
+ // 1 (header present 1) 0 1010 (media block write) 000000
+ // 00000000 (binding table index - set later)
+ // = 0x020A8000
+
+
+// Message Length defines
+
+
+// Response Length defines
+
+
+// Block Width and Height Size defines
+
+
+// Extended Message Descriptors
+
+
+// Common message descriptors:
+
+
+//===================== Math Function Control ===================================
+
+
+//============ Message Registers ===============
+ // buf4 starts from r28
+
+
+//#define mMSGHDR_EOT r43 // Dummy Message Register for EOT
+
+// Four typed views (ub, uw, ud, f) of the same message payload area, all based at r30
+.declare mubMSGPAYLOAD Base=r30 ElementSize=1 SrcRegion=<16;16,1> Type=ub
+.declare muwMSGPAYLOAD Base=r30 ElementSize=2 SrcRegion=<16;16,1> Type=uw
+.declare mudMSGPAYLOAD Base=r30 ElementSize=4 SrcRegion=<8;8,1> Type=ud
+.declare mfMSGPAYLOAD Base=r30 ElementSize=4 SrcRegion=<8;8,1> Type=f
+
+//=================== End of thread instruction ===========================
+
+
+//=====================Pointers Used=====================================
+
+
+//=======================================================================
+
+
+//r9-r17
+// Define temp space for any usages
+
+
+// Common Buffers
+
+
+// End of common.inc
+
+// Module name: Save_AVS_PL3.asm
+//
+// Save PL3 420 frame data block of size 16x16
+//
+// To save 16x16 block (16x16 byte of Y and 8x8 byte of U and V each) we need 3 send instructions with one of size 16x16 and two of size 8x8.
+// -----------------
+// | 16x16 Y |
+// | |
+// -----------------
+// | 8x8 U |
+// ---------
+// | 8x8 V |
+// ---------
+
+//-----------------------------------------------------------------
+//The layout of data is as follows:
+//mMSGHDR0 : Y data header (16x16)
+//mubMSGPAYLOAD0 : Y data payload (8 GRFs)
+//mMSGHDR1 : U data header (8x8)
+//mubMSGPAYLOAD1 : U data payload (2 GRFs)
+//mMSGHDR2 : V data header (8x8)
+//mubMSGPAYLOAD2 : V data payload (2 GRFs)
+//------------------------------------------------------------------
+
+
+// Module name: Save.inc
+
+
+// Description: Includes all definitions explicit to Fast Composite.
+
+
+// End of common.inc
+
+
+//========== GRF partition ==========
+ // r0 header : r0 (1 GRF)
+ // Static parameters : r1 - r6 (6 GRFS)
+ // Inline parameters : r7 - r8 (2 GRFs)
+ // MSGSRC : r27 (1 GRF)
+//===================================
+
+//Interface:
+//========== Static Parameters (Explicit To Fast Composite) ==========
+//r1
+//CSC Set 0
+
+
+.declare udCSC_CURBE Base=r1.0 ElementSize=4 Type=ud // dword view of the static parameters starting at r1.0
+
+//Constant alpha
+
+
+//r2
+
+
+// WiDi Definitions
+
+
+//Colorfill
+
+// Byte view at r2.20; the <0;1,0> source region replicates that single element across all channels
+.declare ubCOLOR_PIXEL_VAL Base=r2.20 ElementSize=1 SrcRegion=<0;1,0> DstRegion=<1> Type=ub
+
+//r3
+//Normalised Ratio of Horizontal step size with main video for all layers
+
+
+ //Normalised Ratio of Horizontal step size with main video for all layers becomes
+ //Normalised Horizontal step size for all layers in VP_Setup.asm
+
+
+//r4
+//Normalised Vertical step size for all layers
+
+
+//r5
+//Normalised Vertical Frame Origin for all layers
+
+
+//r6
+//Normalised Horizontal Frame Origin for all layers
+
+
+//========== Inline Parameters (Explicit To Fast Composite) ==========
+
+
+//Main video Step X
+
+
+//====================== Binding table (Explicit To Fast Composite)=========================================
+
+
+//Used by Interlaced Scaling Kernels
+
+
+//========== Sampler State Table Index (Explicit To Fast Composite)==========
+//Sampler Index for AVS/IEF messages
+
+
+//Sampler Index for SIMD16 sampler messages
+
+
+//=============================================================================
+// Typed views of six working buffers: BUFFER_0..3 at r64/r80/r96/r112 (16 GRFs apart), BUFFER_4 at r28, BUFFER_5 at r46. Float views:
+.declare fBUFFER_0 Base=r64.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_1 Base=r80.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_2 Base=r96.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_3 Base=r112.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_4 Base=r28.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_5 Base=r46.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+// Same six bases viewed as unsigned dwords
+.declare udBUFFER_0 Base=r64.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_1 Base=r80.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_2 Base=r96.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_3 Base=r112.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_4 Base=r28.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_5 Base=r46.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+// Same six bases viewed as unsigned words
+.declare uwBUFFER_0 Base=r64.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_1 Base=r80.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_2 Base=r96.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_3 Base=r112.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_4 Base=r28.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_5 Base=r46.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+// Same six bases viewed as contiguous unsigned bytes
+.declare ubBUFFER_0 Base=r64.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_1 Base=r80.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_2 Base=r96.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_3 Base=r112.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_4 Base=r28.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_5 Base=r46.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+// Byte views that step 4 bytes between elements (SrcRegion <32;8,4>, DstRegion <4>)
+.declare ub4BUFFER_0 Base=r64.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_1 Base=r80.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_2 Base=r96.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_3 Base=r112.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_4 Base=r28.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_5 Base=r46.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+
+//Pointer to mask reg
+
+
+//r18
+
+
+//Always keep Channel Pointers and Offsets in the same GRF, so that we can use
+// NODDCLR, NODDCHK flags. -rT
+
+
+.declare udCSC_COEFF_0 Base=r18.0 ElementSize=4 Type=ud // 1 GRF: dword view of r18
+
+//r19
+
+
+.declare udCSC_COEFF_1 Base=r19.0 ElementSize=4 Type=ud // 1 GRF: dword view of r19
+
+
+//r20
+
+.declare uwALPHA_MASK_REG_TEMP Base=r20.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw // 1 GRF: 16 words of r20
+
+//r21
+
+.declare uwALPHA_MASK_REG Base=r21.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw // 1 GRF: 16 words of r21
+
+//r22
+
+
+//Always keep Channel Pointers and Offsets in the same GRF, so that we can use
+// NODDCLR, NODDCHK flags. -rT
+
+
+//Keep fORIGIN_X_NLAS, fY_OFFSET_2ND_BLOCK, fSTEP_X_NLAS, pMSGDSC_COPY, ubCONST_ALPHA_COPY as
+//sub registers of same GRF to enable using NODDCLR NODDCHK. -rT
+
+//r23
+
+
+//Lumakey
+
+
+//r24
+
+
+//r25
+
+
+//r26
+
+
+//defines to generate LABELS during compile time.
+
+
+//Msg payload buffers; up to 4 full-size messages can be written
+// Bases are 9 GRFs apart (r29, r38, r47, r56), leaving one register in front of each payload;
+// word views come first, then byte views of the same registers.
+.declare muwMSGPAYLOAD0 Base=r29.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw
+.declare muwMSGPAYLOAD1 Base=r38.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw
+.declare muwMSGPAYLOAD2 Base=r47.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw
+.declare muwMSGPAYLOAD3 Base=r56.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw
+
+.declare mubMSGPAYLOAD0 Base=r29.0 ElementSize=1 SrcRegion=<16;16,1> Type=ub
+.declare mubMSGPAYLOAD1 Base=r38.0 ElementSize=1 SrcRegion=<16;16,1> Type=ub
+.declare mubMSGPAYLOAD2 Base=r47.0 ElementSize=1 SrcRegion=<16;16,1> Type=ub
+.declare mubMSGPAYLOAD3 Base=r56.0 ElementSize=1 SrcRegion=<16;16,1> Type=ub
+
+
+//_SAVE_INC_
+
+
+ // At the save module we have all 8 address sub-registers available.
+ // So we will use PING-PONG type of scheme to save the data using
+ // pointers pBUF_CHNL_TOP_8x4 and pBUF_CHNL_BOT_8x4. This will help
+ // reduce dependency. - rT
+ // Below, a0.0-a0.3 hold the first pointer set (base + 0) and a0.4-a0.7 the second (base + 512).
+ //wBUFF_CHNL_PTR points to either buffer 0 or buffer 4.
+ //Add appropriate offsets to get pointers for all buffers (1,2,3 or 5).
+ //Offsets are zero for buffer 0 and buffer 4.
+ add (4) a0.0:uw r22.0<4;4,1>:w 0:uw
+ add (4) a0.4:uw r22.0<4;4,1>:w 512:uw
+
+ //Set up header for Y,U and V data: r28, r37 and r46 each start as a copy of r27
+ mov (8) r28<1>:ud r27<8;8,1>:ud
+ mov (8) r37<1>:ud r27<8;8,1>:ud
+ mov (8) r46<1>:ud r27<8;8,1>:ud
+ // Origins (dwords 0 and 1 of each header) come from the two words at r7.0
+ mov (2) r28.0<1>:d r7.0<2;2,1>:w { NoDDClr } //ORI Y (LUMA) = ORI
+ shr (2) r37.0<1>:d r7.0<2;2,1>:w 1:w { NoDDClr } //H/V ORI U = H/V ORI/2
+ shr (2) r46.0<1>:d r7.0<2;2,1>:w 1:w { NoDDClr } //H/V ORI V = H/V ORI/2
+ // Block sizes (dword 2 of each header); both 16-bit fields hold size minus one
+ mov (1) r28.2<1>:ud 0xF000F:ud { NoDDChk } // Y Block width and height (16x16)
+ mov (1)r37.2<1>:ud 0x70007:ud { NoDDChk } // U Block width and height (8x8)
+ mov (1)r46.2<1>:ud 0x70007:ud { NoDDChk } // V Block width and height (8x8)
+
+// Unscramble, and pack data directly into the message payload registers (mubMSGPAYLOAD0/1/2, declared at r29/r38/r47)
+
+// Data 16x16 block is divided as -
+// ---------
+// | 0 |
+// ---------
+// | 1 |
+// ---------
+// | 2 |
+// ---------
+// | 3 |
+// ---------
+// All sub-blocks are of size 16x4
+// 0: ubBUFFER_0
+// 1: ubBUFFER_1, ubBUFFER_0+16
+// 2: ubBUFFER_2
+// 3: ubBUFFER_3, ubBUFFER_2+16
+
+//Buffer 0 (pointers a0.0-a0.2, base + 0)
+//Move Y to msg payload: one mov per row, 16 samples each (source byte stride 2); 4 rows fill payload GRFs 0-1
+ mov (16) mubMSGPAYLOAD0(0,0)<1> r[a0.1, 1]<32;16,2>:ub { NoDDClr }
+ mov (16) mubMSGPAYLOAD0(0,16)<1> r[a0.1, 33]<32;16,2>:ub { NoDDChk }
+ mov (16) mubMSGPAYLOAD0(1,0)<1> r[a0.1, 65]<32;16,2>:ub { NoDDClr }
+ mov (16) mubMSGPAYLOAD0(1,16)<1> r[a0.1, 97]<32;16,2>:ub { NoDDChk }
+
+//Move U to msg payload: 8 samples per mov (source byte stride 4), taken only from the source rows at offsets 1 and 65, i.e. every other row
+ mov (8) mubMSGPAYLOAD1(0,0)<1> r[a0.2, 1]<32;8,4>:ub { NoDDClr }
+ mov (8) mubMSGPAYLOAD1(0,8)<1> r[a0.2, 65]<32;8,4>:ub { NoDDClr, NoDDChk }
+
+//Move V to msg payload: same pattern as U, through pointer a0.0
+ mov (8) mubMSGPAYLOAD2(0,0)<1> r[a0.0, 1]<32;8,4>:ub { NoDDClr }
+ mov (8) mubMSGPAYLOAD2(0,8)<1> r[a0.0, 65]<32;8,4>:ub { NoDDClr, NoDDChk }
+
+ add (4) a0.0:uw r22.0<4;4,1>:w 1024:uw //Update Buffer 2 pointers (a0.0-a0.3 are free now; buffer 1 below reads through a0.4-a0.6)
+
+//Buffer 1 (pointers a0.4-a0.6, base + 512): Y rows go to payload GRFs 2-3
+ mov (16) mubMSGPAYLOAD0(2,0)<1> r[a0.5, 1]<32;16,2>:ub { NoDDClr }
+ mov (16) mubMSGPAYLOAD0(2,16)<1> r[a0.5, 33]<32;16,2>:ub { NoDDChk }
+ mov (16) mubMSGPAYLOAD0(3,0)<1> r[a0.5, 65]<32;16,2>:ub { NoDDClr }
+ mov (16) mubMSGPAYLOAD0(3,16)<1> r[a0.5, 97]<32;16,2>:ub { NoDDChk }
+ // U (pointer a0.6) to bytes 16-31 of the first U payload GRF
+ mov (8) mubMSGPAYLOAD1(0,16)<1> r[a0.6, 1]<32;8,4>:ub { NoDDClr, NoDDChk }
+ mov (8) mubMSGPAYLOAD1(0,24)<1> r[a0.6, 65]<32;8,4>:ub { NoDDChk }
+ // V (pointer a0.4) to bytes 16-31 of the first V payload GRF
+ mov (8) mubMSGPAYLOAD2(0,16)<1> r[a0.4, 1]<32;8,4>:ub { NoDDClr, NoDDChk }
+ mov (8) mubMSGPAYLOAD2(0,24)<1> r[a0.4, 65]<32;8,4>:ub { NoDDChk }
+
+ add (4) a0.4:uw r22.0<4;4,1>:w 1536:uw //Update Buffer 3 pointers
+
+//Buffer 2 (pointers a0.0-a0.2, reloaded earlier with base + 1024): Y rows go to payload GRFs 4-5
+ mov (16) mubMSGPAYLOAD0(4,0)<1> r[a0.1, 1]<32;16,2>:ub { NoDDClr }
+ mov (16) mubMSGPAYLOAD0(4,16)<1> r[a0.1, 33]<32;16,2>:ub { NoDDChk }
+ mov (16) mubMSGPAYLOAD0(5,0)<1> r[a0.1, 65]<32;16,2>:ub { NoDDClr }
+ mov (16) mubMSGPAYLOAD0(5,16)<1> r[a0.1, 97]<32;16,2>:ub { NoDDChk }
+ // U (pointer a0.2) to bytes 0-15 of the second U payload GRF
+ mov (8) mubMSGPAYLOAD1(1,0)<1> r[a0.2, 1]<32;8,4>:ub { NoDDClr }
+ mov (8) mubMSGPAYLOAD1(1,8)<1> r[a0.2, 65]<32;8,4>:ub { NoDDClr, NoDDChk }
+ // V (pointer a0.0) to bytes 0-15 of the second V payload GRF
+ mov (8) mubMSGPAYLOAD2(1,0)<1> r[a0.0, 1]<32;8,4>:ub { NoDDClr }
+ mov (8) mubMSGPAYLOAD2(1,8)<1> r[a0.0, 65]<32;8,4>:ub { NoDDClr, NoDDChk }
+
+//Buffer 3 (pointers a0.4-a0.6, base + 1536): Y rows go to payload GRFs 6-7
+ mov (16) mubMSGPAYLOAD0(6,0)<1> r[a0.5, 1]<32;16,2>:ub { NoDDClr }
+ mov (16) mubMSGPAYLOAD0(6,16)<1> r[a0.5, 33]<32;16,2>:ub { NoDDChk }
+ mov (16) mubMSGPAYLOAD0(7,0)<1> r[a0.5, 65]<32;16,2>:ub { NoDDClr }
+ mov (16) mubMSGPAYLOAD0(7,16)<1> r[a0.5, 97]<32;16,2>:ub { NoDDChk }
+ // U (pointer a0.6) to bytes 16-31 of the second U payload GRF
+ mov (8) mubMSGPAYLOAD1(1,16)<1> r[a0.6, 1]<32;8,4>:ub { NoDDClr, NoDDChk }
+ mov (8) mubMSGPAYLOAD1(1,24)<1> r[a0.6, 65]<32;8,4>:ub { NoDDChk }
+ // V (pointer a0.4) to bytes 16-31 of the second V payload GRF
+ mov (8) mubMSGPAYLOAD2(1,16)<1> r[a0.4, 1]<32;8,4>:ub { NoDDClr, NoDDChk }
+ mov (8) mubMSGPAYLOAD2(1,24)<1> r[a0.4, 65]<32;8,4>:ub { NoDDChk }
+
+//===========================================================================
+// Three block writes: Y from r28 (message length 9, binding table index 0x18), U from r37 and V from r46 (message length 3, indices 0x19 and 0x1A)
+send (1) null<1>:d r28 0x5 0x120A8018:ud
+send (1) null<1>:d r37 0x5 0x60A8019:ud
+send (1) null<1>:d r46 0x5 0x60A801A:ud
--- /dev/null
+// 278 // Total instruction count
+// 1 // Total kernel count
+
+
+// Module name: common.inc
+//
+// Common header file for all Video-Processing kernels
+//
+
+.default_execution_size (16)
+.default_register_type :ub
+
+.reg_count_total 128
+.reg_count_payload 7
+
+//========== Common constants ==========
+
+
+//========== Macros ==========
+
+
+//Fast Jump, For more details see "Set_Layer_N.asm"
+
+
+//========== Defines ====================
+
+//========== Static Parameters (Common To All) ==========
+//r1
+
+
+//r2
+
+ // e.g. byte0 byte1 byte2
+ // YUYV 0 1 3
+ // YVYU 0 3 1
+
+//Color Pipe (IECP) parameters
+
+
+//r4
+
+ // e.g. byte0 byte1 byte2
+ // YUYV 0 1 3
+ // YVYU 0 3 1
+
+
+//========== Inline parameters (Common To All) ===========
+
+
+//============== Binding Index Table===========
+//Common between DNDI and DNUV
+
+
+//================= Common Message Descriptor =====
+// Message descriptor for thread spawning
+// Message Descriptors
+// = 000 0001 (min message len 1 ) 0,0000 (resp len 0 -add later)
+// 0000,0000,0000
+// 0001(Spawn a root thread),0001 (Root thread spawn thread)
+// = 0x02000011
+// Thread Spawner Message Descriptor
+
+
+// Message descriptor for atomic operation add
+// Message Descriptors
+// = 000 0110 (min message len 6 ) 0,0000 (resp len 0 -add later)
+// 1(header present)001,10(typed atomic operation)0(return enabled)0(slot group, low 8 bits),0111 (AOP_Add)
+// 0000,0000 (Binding table index, added later)
+// = 0x0C098700 (value of the bit fields listed above; 0x02000011 is the thread-spawner descriptor)
+
+// Atomic Operation Add Message Descriptor
+
+
+// Message descriptor for dataport media write
+ // Message Descriptors
+ // = 000 0001 (min message len 1 - add later) 00000 (resp len 0)
+ // 1 (header present 1) 0 1010 (media block write) 000000
+ // 00000000 (binding table index - set later)
+ // = 0x020A8000
+
+
+// Message Length defines
+
+
+// Response Length defines
+
+
+// Block Width and Height Size defines
+
+
+// Extended Message Descriptors
+
+
+// Common message descriptors:
+
+
+//===================== Math Function Control ===================================
+
+
+//============ Message Registers ===============
+ // buf4 starts from r28
+
+
+//#define mMSGHDR_EOT r43 // Dummy Message Register for EOT
+
+// Four typed views (ub, uw, ud, f) of the same message payload area, all based at r30
+.declare mubMSGPAYLOAD Base=r30 ElementSize=1 SrcRegion=<16;16,1> Type=ub
+.declare muwMSGPAYLOAD Base=r30 ElementSize=2 SrcRegion=<16;16,1> Type=uw
+.declare mudMSGPAYLOAD Base=r30 ElementSize=4 SrcRegion=<8;8,1> Type=ud
+.declare mfMSGPAYLOAD Base=r30 ElementSize=4 SrcRegion=<8;8,1> Type=f
+
+//=================== End of thread instruction ===========================
+
+
+//=====================Pointers Used=====================================
+
+
+//=======================================================================
+
+
+//r9-r17
+// Define temp space for any usages
+
+
+// Common Buffers
+
+
+// End of common.inc
+
+// Module name: Save_AVS_RGB.asm
+//
+// Save packed ARGB 444 frame data block of size 16x16
+//
+// To save 16x16 block (64x16 byte layout for ARGB8888) we need 4 send instructions with 16x16 in each
+// -----------------
+// | 0 | 1 | 2 | 3 |
+// -----------------
+
+
+// Module name: Save.inc
+
+
+// Description: Includes all definitions explicit to Fast Composite.
+
+
+// End of common.inc
+
+
+//========== GRF partition ==========
+ // r0 header : r0 (1 GRF)
+ // Static parameters : r1 - r6 (6 GRFS)
+ // Inline parameters : r7 - r8 (2 GRFs)
+ // MSGSRC : r27 (1 GRF)
+//===================================
+
+//Interface:
+//========== Static Parameters (Explicit To Fast Composite) ==========
+//r1
+//CSC Set 0
+
+
+.declare udCSC_CURBE Base=r1.0 ElementSize=4 Type=ud // dword view of the static parameters starting at r1.0
+
+//Constant alpha
+
+
+//r2
+
+
+// WiDi Definitions
+
+
+//Colorfill
+
+// Byte view at r2.20; the <0;1,0> source region replicates that single element across all channels
+.declare ubCOLOR_PIXEL_VAL Base=r2.20 ElementSize=1 SrcRegion=<0;1,0> DstRegion=<1> Type=ub
+
+//r3
+//Normalised Ratio of Horizontal step size with main video for all layers
+
+
+ //Normalised Ratio of Horizontal step size with main video for all layers becomes
+ //Normalised Horizontal step size for all layers in VP_Setup.asm
+
+
+//r4
+//Normalised Vertical step size for all layers
+
+
+//r5
+//Normalised Vertical Frame Origin for all layers
+
+
+//r6
+//Normalised Horizontal Frame Origin for all layers
+
+
+//========== Inline Parameters (Explicit To Fast Composite) ==========
+
+
+//Main video Step X
+
+
+//====================== Binding table (Explicit To Fast Composite)=========================================
+
+
+//Used by Interlaced Scaling Kernels
+
+
+//========== Sampler State Table Index (Explicit To Fast Composite)==========
+//Sampler Index for AVS/IEF messages
+
+
+//Sampler Index for SIMD16 sampler messages
+
+
+//=============================================================================
+
+// Typed aliases for six register buffers. BUFFER_0..3 start at r64, r80, r96 and r112
+// (16 registers apart); BUFFER_4 and BUFFER_5 start at r28 and r46. Every group below
+// names the same six base registers and differs only in element type and default regions.
+
+// Float view: 4-byte elements, 8 contiguous elements per row.
+.declare fBUFFER_0 Base=r64.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_1 Base=r80.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_2 Base=r96.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_3 Base=r112.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_4 Base=r28.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_5 Base=r46.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+
+// Unsigned-dword view: 4-byte elements, 8 contiguous elements per row.
+.declare udBUFFER_0 Base=r64.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_1 Base=r80.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_2 Base=r96.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_3 Base=r112.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_4 Base=r28.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_5 Base=r46.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+
+// Unsigned-word view: 2-byte elements, 16 contiguous elements per row.
+.declare uwBUFFER_0 Base=r64.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_1 Base=r80.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_2 Base=r96.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_3 Base=r112.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_4 Base=r28.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_5 Base=r46.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+
+// Unsigned-byte view: 1-byte elements, 16 contiguous elements per row.
+.declare ubBUFFER_0 Base=r64.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_1 Base=r80.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_2 Base=r96.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_3 Base=r112.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_4 Base=r28.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_5 Base=r46.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+
+// Strided byte view: every 4th byte (source <32;8,4>, destination stride 4).
+.declare ub4BUFFER_0 Base=r64.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_1 Base=r80.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_2 Base=r96.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_3 Base=r112.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_4 Base=r28.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_5 Base=r46.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+
+//Pointer to mask reg
+
+
+//r18
+
+
+//Always keep Channel Pointers and Offsets in same GRF, so that we can use
+// NODDCLR, NODDCHK flags. -rT
+
+
+// Unsigned-dword view of r18 (one full register).
+.declare udCSC_COEFF_0 Base=r18.0 ElementSize=4 Type=ud // 1 GRF
+
+//r19
+
+
+// Unsigned-dword view of r19 (one full register).
+.declare udCSC_COEFF_1 Base=r19.0 ElementSize=4 Type=ud // 1 GRF
+
+
+//r20
+
+// Unsigned-word view of r20, 16 contiguous words per row.
+.declare uwALPHA_MASK_REG_TEMP Base=r20.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw // 1 GRF
+
+//r21
+
+// Unsigned-word view of r21, 16 contiguous words per row.
+.declare uwALPHA_MASK_REG Base=r21.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw // 1 GRF
+
+//r22
+
+
+//Always keep Channel Pointers and Offsets in same GRF, so that we can use
+// NODDCLR, NODDCHK flags. -rT
+
+
+//Keep fORIGIN_X_NLAS, fY_OFFSET_2ND_BLOCK, fSTEP_X_NLAS, pMSGDSC_COPY, ubCONST_ALPHA_COPY as
+//sub registers of same GRF to enable using NODDCLR NODDCHK. -rT
+
+//r23
+
+
+//Lumakey
+
+
+//r24
+
+
+//r25
+
+
+//r26
+
+
+//defines to generate LABELS during compile time.
+
+
+//Msg payload buffers; up to 4 full-size messages can be written
+
+
+// Four message payload areas based at r29, r38, r47 and r56 (9 registers apart).
+// The register just before each one (r28, r37, r46, r55) is the register the send
+// instructions in this kernel start each message from.
+
+// Unsigned-word views of the four payload areas.
+.declare muwMSGPAYLOAD0 Base=r29.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw
+.declare muwMSGPAYLOAD1 Base=r38.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw
+.declare muwMSGPAYLOAD2 Base=r47.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw
+.declare muwMSGPAYLOAD3 Base=r56.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw
+
+// Unsigned-byte views of the same four payload areas.
+.declare mubMSGPAYLOAD0 Base=r29.0 ElementSize=1 SrcRegion=<16;16,1> Type=ub
+.declare mubMSGPAYLOAD1 Base=r38.0 ElementSize=1 SrcRegion=<16;16,1> Type=ub
+.declare mubMSGPAYLOAD2 Base=r47.0 ElementSize=1 SrcRegion=<16;16,1> Type=ub
+.declare mubMSGPAYLOAD3 Base=r56.0 ElementSize=1 SrcRegion=<16;16,1> Type=ub
+
+
+//_SAVE_INC_
+
+
+// At the save module we have all 8 address sub-registers available.
+// So we will use PING-PONG type of scheme to save the data using
+// pointers pBUF_CHNL_TOP_8x4 and pBUF_CHNL_BOT_8x4. This will help
+// reduce dependency. - rT
+
+//wBUFF_CHNL_PTR points to either buffer 0 or buffer 4.
+//Add appropriate offsets to get pointers for all buffers (1,2,3 or 5).
+//Offsets are zero for buffer 0 and buffer 4.
+ // Load a0.0-a0.3 with the four words r22.0-r22.3; the added offset is 0 for this first pass.
+ add (4) a0.0:uw r22.0<4;4,1>:w 0:uw
+
+ // Build the message header template in r27 (dwords 0, 1 and 2). The three writes target
+ // the same register, hence the NoDDClr / NoDDChk hints.
+ shl (1) r27.0<1>:d r7.0<0;1,0>:w 2:w { NoDDClr } // H. block origin need to be quadrupled
+ mov (1) r27.1<1>:d r7.1<0;1,0>:w { NoDDClr, NoDDChk } // Block origin (1st quadrant)
+ mov (1) r27.2<1>:ud 0xF000F:ud { NoDDChk } // Block width and height (16x16)
+
+ // a0.4-a0.7 = a0.0-a0.3 plus the words r22.8 / r22.9 (the <0;2,1> region repeats that
+ // pair). This second pointer set is used below for the odd payload registers.
+ add (4) a0.4:uw a0.0<4;4,1>:w r22.8<0;2,1>:w
+
+ // Copy the header template into the first register of each of the four messages.
+ mov (8) r28<1>:ud r27<8;8,1>:ud
+ mov (8) r37<1>:ud r27<8;8,1>:ud
+ mov (8) r46<1>:ud r27<8;8,1>:ud
+ mov (8) r55<1>:ud r27<8;8,1>:ud
+
+ // Offset dword 0 (the quadrupled horizontal origin) of messages 2-4 by 16, 32 and 48.
+ add (1) r37.0<1>:d r27.0<0;1,0>:d 16:d // Point to 2nd part
+ add (1) r46.0<1>:d r27.0<0;1,0>:d 32:d // Point to 3rd part
+ add (1) r55.0<1>:d r27.0<0;1,0>:d 48:d // Point to 4th part
+
+ // write Buf_0 to 1st quarter of four horizontal output blocks
+
+// Please note the scattered order of the NODDCLR / NODDCHK flags: the sub-registers of each
+// destination register are not all updated in one place, so the flags are scattered too. -rT
+
+// Each mov moves 4 bytes. The untyped source uses the file's default :ub type, so the region
+// <8;4,2> at an odd byte offset reads bytes +1,+3,+5,+7 from the address in the a0 sub-register;
+// the destination stride <4> places them 4 bytes apart, filling one byte lane of four 4-byte
+// output elements. Source offsets 1/9/17/25 feed payloads 0/1/2/3; the +32 variants
+// (33/41/49/57) read 32 bytes further on and fill the second 16-byte half of the same payload
+// register. a0.0-a0.3 feed payload register 0 and a0.4-a0.7 feed payload register 1.
+// Byte lane 0 <- data addressed by a0.2 / a0.6.
+ mov (4) mubMSGPAYLOAD0(0, 0)<4> r[a0.2, 1]<8;4,2> { NoDDClr }
+ mov (4) mubMSGPAYLOAD0(0,16)<4> r[a0.2,33]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD0(1, 0)<4> r[a0.6, 1]<8;4,2> { NoDDClr }
+ mov (4) mubMSGPAYLOAD0(1,16)<4> r[a0.6,33]<8;4,2> { NoDDClr, NoDDChk }
+
+ mov (4) mubMSGPAYLOAD1(0, 0)<4> r[a0.2, 9]<8;4,2> { NoDDClr }
+ mov (4) mubMSGPAYLOAD1(0,16)<4> r[a0.2,41]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD1(1, 0)<4> r[a0.6, 9]<8;4,2> { NoDDClr }
+ mov (4) mubMSGPAYLOAD1(1,16)<4> r[a0.6,41]<8;4,2> { NoDDClr, NoDDChk }
+
+ mov (4) mubMSGPAYLOAD2(0, 0)<4> r[a0.2, 17]<8;4,2> { NoDDClr }
+ mov (4) mubMSGPAYLOAD2(0,16)<4> r[a0.2,49]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD2(1, 0)<4> r[a0.6, 17]<8;4,2> { NoDDClr }
+ mov (4) mubMSGPAYLOAD2(1,16)<4> r[a0.6,49]<8;4,2> { NoDDClr, NoDDChk }
+
+ mov (4) mubMSGPAYLOAD3(0, 0)<4> r[a0.2, 25]<8;4,2> { NoDDClr }
+ mov (4) mubMSGPAYLOAD3(0,16)<4> r[a0.2,57]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD3(1, 0)<4> r[a0.6, 25]<8;4,2> { NoDDClr }
+ mov (4) mubMSGPAYLOAD3(1,16)<4> r[a0.6,57]<8;4,2> { NoDDClr, NoDDChk }
+
+
+// Byte lane 1 <- data addressed by a0.1 / a0.5.
+ mov (4) mubMSGPAYLOAD0(0, 1)<4> r[a0.1, 1]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD0(0,17)<4> r[a0.1,33]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD0(1, 1)<4> r[a0.5, 1]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD0(1,17)<4> r[a0.5,33]<8;4,2> { NoDDClr, NoDDChk }
+
+ mov (4) mubMSGPAYLOAD1(0, 1)<4> r[a0.1, 9]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD1(0,17)<4> r[a0.1,41]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD1(1, 1)<4> r[a0.5, 9]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD1(1,17)<4> r[a0.5,41]<8;4,2> { NoDDClr, NoDDChk }
+
+ mov (4) mubMSGPAYLOAD2(0, 1)<4> r[a0.1, 17]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD2(0,17)<4> r[a0.1,49]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD2(1, 1)<4> r[a0.5, 17]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD2(1,17)<4> r[a0.5,49]<8;4,2> { NoDDClr, NoDDChk }
+
+ mov (4) mubMSGPAYLOAD3(0, 1)<4> r[a0.1, 25]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD3(0,17)<4> r[a0.1,57]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD3(1, 1)<4> r[a0.5, 25]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD3(1,17)<4> r[a0.5,57]<8;4,2> { NoDDClr, NoDDChk }
+
+
+// Byte lane 2 <- data addressed by a0.0 / a0.4.
+ mov (4) mubMSGPAYLOAD0(0, 2)<4> r[a0.0, 1]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD0(0,18)<4> r[a0.0,33]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD0(1, 2)<4> r[a0.4, 1]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD0(1,18)<4> r[a0.4,33]<8;4,2> { NoDDClr, NoDDChk }
+
+ mov (4) mubMSGPAYLOAD1(0, 2)<4> r[a0.0, 9]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD1(0,18)<4> r[a0.0,41]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD1(1, 2)<4> r[a0.4, 9]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD1(1,18)<4> r[a0.4,41]<8;4,2> { NoDDClr, NoDDChk }
+
+ mov (4) mubMSGPAYLOAD2(0, 2)<4> r[a0.0, 17]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD2(0,18)<4> r[a0.0,49]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD2(1, 2)<4> r[a0.4, 17]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD2(1,18)<4> r[a0.4,49]<8;4,2> { NoDDClr, NoDDChk }
+
+ mov (4) mubMSGPAYLOAD3(0, 2)<4> r[a0.0, 25]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD3(0,18)<4> r[a0.0,57]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD3(1, 2)<4> r[a0.4, 25]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD3(1,18)<4> r[a0.4,57]<8;4,2> { NoDDClr, NoDDChk }
+
+
+// Byte lane 3 <- the single byte r2.31. The second write to each payload register is the
+// last one to that register in this pass and therefore drops NoDDClr.
+ mov (4) mubMSGPAYLOAD0(0, 3)<4> r2.31:ub { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD0(0,19)<4> r2.31:ub { NoDDChk }
+ mov (4) mubMSGPAYLOAD0(1, 3)<4> r2.31:ub { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD0(1,19)<4> r2.31:ub { NoDDChk }
+
+ mov (4) mubMSGPAYLOAD1(0, 3)<4> r2.31:ub { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD1(0,19)<4> r2.31:ub { NoDDChk }
+ mov (4) mubMSGPAYLOAD1(1, 3)<4> r2.31:ub { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD1(1,19)<4> r2.31:ub { NoDDChk }
+
+ mov (4) mubMSGPAYLOAD2(0, 3)<4> r2.31:ub { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD2(0,19)<4> r2.31:ub { NoDDChk }
+ mov (4) mubMSGPAYLOAD2(1, 3)<4> r2.31:ub { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD2(1,19)<4> r2.31:ub { NoDDChk }
+
+ mov (4) mubMSGPAYLOAD3(0, 3)<4> r2.31:ub { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD3(0,19)<4> r2.31:ub { NoDDChk }
+ mov (4) mubMSGPAYLOAD3(1, 3)<4> r2.31:ub { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD3(1,19)<4> r2.31:ub { NoDDChk }
+
+ // write Buf_1 to 2nd quarter of four horizontal output blocks
+ // Re-point a0.0-a0.3 at r22.0-r22.3 plus 512 bytes (the r64 -> r80 spacing of the BUFFER_n
+ // declarations), then rebuild a0.4-a0.7 by adding the r22.8 / r22.9 offsets.
+ add (4) a0.0:uw r22.0<4;4,1>:w 512:uw
+ add (4) a0.4:uw a0.0<4;4,1>:w r22.8<0;2,1>:w
+
+// Same byte-lane interleave as the other passes, written to payload registers 2 (via
+// a0.0-a0.3) and 3 (via a0.4-a0.7) of each message.
+// Byte lane 0 <- data addressed by a0.2 / a0.6.
+ mov (4) mubMSGPAYLOAD0(2, 0)<4> r[a0.2, 1]<8;4,2> { NoDDClr }
+ mov (4) mubMSGPAYLOAD0(2,16)<4> r[a0.2,33]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD0(3, 0)<4> r[a0.6, 1]<8;4,2> { NoDDClr }
+ mov (4) mubMSGPAYLOAD0(3,16)<4> r[a0.6,33]<8;4,2> { NoDDClr, NoDDChk }
+
+ mov (4) mubMSGPAYLOAD1(2, 0)<4> r[a0.2, 9]<8;4,2> { NoDDClr }
+ mov (4) mubMSGPAYLOAD1(2,16)<4> r[a0.2,41]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD1(3, 0)<4> r[a0.6, 9]<8;4,2> { NoDDClr }
+ mov (4) mubMSGPAYLOAD1(3,16)<4> r[a0.6,41]<8;4,2> { NoDDClr, NoDDChk }
+
+ mov (4) mubMSGPAYLOAD2(2, 0)<4> r[a0.2, 17]<8;4,2> { NoDDClr }
+ mov (4) mubMSGPAYLOAD2(2,16)<4> r[a0.2,49]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD2(3, 0)<4> r[a0.6, 17]<8;4,2> { NoDDClr }
+ mov (4) mubMSGPAYLOAD2(3,16)<4> r[a0.6,49]<8;4,2> { NoDDClr, NoDDChk }
+
+ mov (4) mubMSGPAYLOAD3(2, 0)<4> r[a0.2, 25]<8;4,2> { NoDDClr }
+ mov (4) mubMSGPAYLOAD3(2,16)<4> r[a0.2,57]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD3(3, 0)<4> r[a0.6, 25]<8;4,2> { NoDDClr }
+ mov (4) mubMSGPAYLOAD3(3,16)<4> r[a0.6,57]<8;4,2> { NoDDClr, NoDDChk }
+
+
+// Byte lane 1 <- data addressed by a0.1 / a0.5.
+ mov (4) mubMSGPAYLOAD0(2, 1)<4> r[a0.1, 1]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD0(2,17)<4> r[a0.1,33]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD0(3, 1)<4> r[a0.5, 1]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD0(3,17)<4> r[a0.5,33]<8;4,2> { NoDDClr, NoDDChk }
+
+ mov (4) mubMSGPAYLOAD1(2, 1)<4> r[a0.1, 9]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD1(2,17)<4> r[a0.1,41]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD1(3, 1)<4> r[a0.5, 9]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD1(3,17)<4> r[a0.5,41]<8;4,2> { NoDDClr, NoDDChk }
+
+ mov (4) mubMSGPAYLOAD2(2, 1)<4> r[a0.1, 17]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD2(2,17)<4> r[a0.1,49]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD2(3, 1)<4> r[a0.5, 17]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD2(3,17)<4> r[a0.5,49]<8;4,2> { NoDDClr, NoDDChk }
+
+ mov (4) mubMSGPAYLOAD3(2, 1)<4> r[a0.1, 25]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD3(2,17)<4> r[a0.1,57]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD3(3, 1)<4> r[a0.5, 25]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD3(3,17)<4> r[a0.5,57]<8;4,2> { NoDDClr, NoDDChk }
+
+
+// Byte lane 2 <- data addressed by a0.0 / a0.4.
+ mov (4) mubMSGPAYLOAD0(2, 2)<4> r[a0.0, 1]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD0(2,18)<4> r[a0.0,33]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD0(3, 2)<4> r[a0.4, 1]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD0(3,18)<4> r[a0.4,33]<8;4,2> { NoDDClr, NoDDChk }
+
+ mov (4) mubMSGPAYLOAD1(2, 2)<4> r[a0.0, 9]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD1(2,18)<4> r[a0.0,41]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD1(3, 2)<4> r[a0.4, 9]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD1(3,18)<4> r[a0.4,41]<8;4,2> { NoDDClr, NoDDChk }
+
+ mov (4) mubMSGPAYLOAD2(2, 2)<4> r[a0.0, 17]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD2(2,18)<4> r[a0.0,49]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD2(3, 2)<4> r[a0.4, 17]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD2(3,18)<4> r[a0.4,49]<8;4,2> { NoDDClr, NoDDChk }
+
+ mov (4) mubMSGPAYLOAD3(2, 2)<4> r[a0.0, 25]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD3(2,18)<4> r[a0.0,57]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD3(3, 2)<4> r[a0.4, 25]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD3(3,18)<4> r[a0.4,57]<8;4,2> { NoDDClr, NoDDChk }
+
+
+// Byte lane 3 <- the single byte r2.31; the final write to each register drops NoDDClr.
+ mov (4) mubMSGPAYLOAD0(2, 3)<4> r2.31:ub { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD0(2,19)<4> r2.31:ub { NoDDChk }
+ mov (4) mubMSGPAYLOAD0(3, 3)<4> r2.31:ub { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD0(3,19)<4> r2.31:ub { NoDDChk }
+
+ mov (4) mubMSGPAYLOAD1(2, 3)<4> r2.31:ub { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD1(2,19)<4> r2.31:ub { NoDDChk }
+ mov (4) mubMSGPAYLOAD1(3, 3)<4> r2.31:ub { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD1(3,19)<4> r2.31:ub { NoDDChk }
+
+ mov (4) mubMSGPAYLOAD2(2, 3)<4> r2.31:ub { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD2(2,19)<4> r2.31:ub { NoDDChk }
+ mov (4) mubMSGPAYLOAD2(3, 3)<4> r2.31:ub { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD2(3,19)<4> r2.31:ub { NoDDChk }
+
+ mov (4) mubMSGPAYLOAD3(2, 3)<4> r2.31:ub { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD3(2,19)<4> r2.31:ub { NoDDChk }
+ mov (4) mubMSGPAYLOAD3(3, 3)<4> r2.31:ub { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD3(3,19)<4> r2.31:ub { NoDDChk }
+
+
+ // write Buf_2 to 3rd quarter of four horizontal output blocks
+ // Re-point a0.0-a0.3 at r22.0-r22.3 plus 1024 bytes (twice the r64 -> r80 spacing of the
+ // BUFFER_n declarations), then rebuild a0.4-a0.7 by adding the r22.8 / r22.9 offsets.
+ add (4) a0.0:uw r22.0<4;4,1>:w 1024:uw
+ add (4) a0.4:uw a0.0<4;4,1>:w r22.8<0;2,1>:w
+
+// Same byte-lane interleave as the other passes, written to payload registers 4 (via
+// a0.0-a0.3) and 5 (via a0.4-a0.7) of each message.
+// Byte lane 0 <- data addressed by a0.2 / a0.6.
+ mov (4) mubMSGPAYLOAD0(4, 0)<4> r[a0.2, 1]<8;4,2> { NoDDClr }
+ mov (4) mubMSGPAYLOAD0(4,16)<4> r[a0.2,33]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD0(5, 0)<4> r[a0.6, 1]<8;4,2> { NoDDClr }
+ mov (4) mubMSGPAYLOAD0(5,16)<4> r[a0.6,33]<8;4,2> { NoDDClr, NoDDChk }
+
+ mov (4) mubMSGPAYLOAD1(4, 0)<4> r[a0.2, 9]<8;4,2> { NoDDClr }
+ mov (4) mubMSGPAYLOAD1(4,16)<4> r[a0.2,41]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD1(5, 0)<4> r[a0.6, 9]<8;4,2> { NoDDClr }
+ mov (4) mubMSGPAYLOAD1(5,16)<4> r[a0.6,41]<8;4,2> { NoDDClr, NoDDChk }
+
+ mov (4) mubMSGPAYLOAD2(4, 0)<4> r[a0.2, 17]<8;4,2> { NoDDClr }
+ mov (4) mubMSGPAYLOAD2(4,16)<4> r[a0.2,49]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD2(5, 0)<4> r[a0.6, 17]<8;4,2> { NoDDClr }
+ mov (4) mubMSGPAYLOAD2(5,16)<4> r[a0.6,49]<8;4,2> { NoDDClr, NoDDChk }
+
+ mov (4) mubMSGPAYLOAD3(4, 0)<4> r[a0.2, 25]<8;4,2> { NoDDClr }
+ mov (4) mubMSGPAYLOAD3(4,16)<4> r[a0.2,57]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD3(5, 0)<4> r[a0.6, 25]<8;4,2> { NoDDClr }
+ mov (4) mubMSGPAYLOAD3(5,16)<4> r[a0.6,57]<8;4,2> { NoDDClr, NoDDChk }
+
+
+// Byte lane 1 <- data addressed by a0.1 / a0.5.
+ mov (4) mubMSGPAYLOAD0(4, 1)<4> r[a0.1, 1]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD0(4,17)<4> r[a0.1,33]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD0(5, 1)<4> r[a0.5, 1]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD0(5,17)<4> r[a0.5,33]<8;4,2> { NoDDClr, NoDDChk }
+
+ mov (4) mubMSGPAYLOAD1(4, 1)<4> r[a0.1, 9]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD1(4,17)<4> r[a0.1,41]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD1(5, 1)<4> r[a0.5, 9]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD1(5,17)<4> r[a0.5,41]<8;4,2> { NoDDClr, NoDDChk }
+
+ mov (4) mubMSGPAYLOAD2(4, 1)<4> r[a0.1, 17]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD2(4,17)<4> r[a0.1,49]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD2(5, 1)<4> r[a0.5, 17]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD2(5,17)<4> r[a0.5,49]<8;4,2> { NoDDClr, NoDDChk }
+
+ mov (4) mubMSGPAYLOAD3(4, 1)<4> r[a0.1, 25]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD3(4,17)<4> r[a0.1,57]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD3(5, 1)<4> r[a0.5, 25]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD3(5,17)<4> r[a0.5,57]<8;4,2> { NoDDClr, NoDDChk }
+
+
+// Byte lane 2 <- data addressed by a0.0 / a0.4.
+ mov (4) mubMSGPAYLOAD0(4, 2)<4> r[a0.0, 1]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD0(4,18)<4> r[a0.0,33]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD0(5, 2)<4> r[a0.4, 1]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD0(5,18)<4> r[a0.4,33]<8;4,2> { NoDDClr, NoDDChk }
+
+ mov (4) mubMSGPAYLOAD1(4, 2)<4> r[a0.0, 9]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD1(4,18)<4> r[a0.0,41]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD1(5, 2)<4> r[a0.4, 9]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD1(5,18)<4> r[a0.4,41]<8;4,2> { NoDDClr, NoDDChk }
+
+ mov (4) mubMSGPAYLOAD2(4, 2)<4> r[a0.0, 17]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD2(4,18)<4> r[a0.0,49]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD2(5, 2)<4> r[a0.4, 17]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD2(5,18)<4> r[a0.4,49]<8;4,2> { NoDDClr, NoDDChk }
+
+ mov (4) mubMSGPAYLOAD3(4, 2)<4> r[a0.0, 25]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD3(4,18)<4> r[a0.0,57]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD3(5, 2)<4> r[a0.4, 25]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD3(5,18)<4> r[a0.4,57]<8;4,2> { NoDDClr, NoDDChk }
+
+
+// Byte lane 3 <- the single byte r2.31; the final write to each register drops NoDDClr.
+ mov (4) mubMSGPAYLOAD0(4, 3)<4> r2.31:ub { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD0(4,19)<4> r2.31:ub { NoDDChk }
+ mov (4) mubMSGPAYLOAD0(5, 3)<4> r2.31:ub { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD0(5,19)<4> r2.31:ub { NoDDChk }
+
+ mov (4) mubMSGPAYLOAD1(4, 3)<4> r2.31:ub { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD1(4,19)<4> r2.31:ub { NoDDChk }
+ mov (4) mubMSGPAYLOAD1(5, 3)<4> r2.31:ub { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD1(5,19)<4> r2.31:ub { NoDDChk }
+
+ mov (4) mubMSGPAYLOAD2(4, 3)<4> r2.31:ub { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD2(4,19)<4> r2.31:ub { NoDDChk }
+ mov (4) mubMSGPAYLOAD2(5, 3)<4> r2.31:ub { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD2(5,19)<4> r2.31:ub { NoDDChk }
+
+ mov (4) mubMSGPAYLOAD3(4, 3)<4> r2.31:ub { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD3(4,19)<4> r2.31:ub { NoDDChk }
+ mov (4) mubMSGPAYLOAD3(5, 3)<4> r2.31:ub { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD3(5,19)<4> r2.31:ub { NoDDChk }
+
+ // write Buf_3 to 4th quarter of four horizontal output blocks
+ // Re-point a0.0-a0.3 at r22.0-r22.3 plus 1536 bytes (three times the r64 -> r80 spacing of
+ // the BUFFER_n declarations), then rebuild a0.4-a0.7 by adding the r22.8 / r22.9 offsets.
+ add (4) a0.0:uw r22.0<4;4,1>:w 1536:uw
+ add (4) a0.4:uw a0.0<4;4,1>:w r22.8<0;2,1>:w
+
+// Same byte-lane interleave as the other passes, written to payload registers 6 (via
+// a0.0-a0.3) and 7 (via a0.4-a0.7) of each message.
+// Byte lane 0 <- data addressed by a0.2 / a0.6.
+ mov (4) mubMSGPAYLOAD0(6, 0)<4> r[a0.2, 1]<8;4,2> { NoDDClr }
+ mov (4) mubMSGPAYLOAD0(6,16)<4> r[a0.2,33]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD0(7, 0)<4> r[a0.6, 1]<8;4,2> { NoDDClr }
+ mov (4) mubMSGPAYLOAD0(7,16)<4> r[a0.6,33]<8;4,2> { NoDDClr, NoDDChk }
+
+ mov (4) mubMSGPAYLOAD1(6, 0)<4> r[a0.2, 9]<8;4,2> { NoDDClr }
+ mov (4) mubMSGPAYLOAD1(6,16)<4> r[a0.2,41]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD1(7, 0)<4> r[a0.6, 9]<8;4,2> { NoDDClr }
+ mov (4) mubMSGPAYLOAD1(7,16)<4> r[a0.6,41]<8;4,2> { NoDDClr, NoDDChk }
+
+ mov (4) mubMSGPAYLOAD2(6, 0)<4> r[a0.2, 17]<8;4,2> { NoDDClr }
+ mov (4) mubMSGPAYLOAD2(6,16)<4> r[a0.2,49]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD2(7, 0)<4> r[a0.6, 17]<8;4,2> { NoDDClr }
+ mov (4) mubMSGPAYLOAD2(7,16)<4> r[a0.6,49]<8;4,2> { NoDDClr, NoDDChk }
+
+ mov (4) mubMSGPAYLOAD3(6, 0)<4> r[a0.2, 25]<8;4,2> { NoDDClr }
+ mov (4) mubMSGPAYLOAD3(6,16)<4> r[a0.2,57]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD3(7, 0)<4> r[a0.6, 25]<8;4,2> { NoDDClr }
+ mov (4) mubMSGPAYLOAD3(7,16)<4> r[a0.6,57]<8;4,2> { NoDDClr, NoDDChk }
+
+
+// Byte lane 1 <- data addressed by a0.1 / a0.5.
+ mov (4) mubMSGPAYLOAD0(6, 1)<4> r[a0.1, 1]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD0(6,17)<4> r[a0.1,33]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD0(7, 1)<4> r[a0.5, 1]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD0(7,17)<4> r[a0.5,33]<8;4,2> { NoDDClr, NoDDChk }
+
+ mov (4) mubMSGPAYLOAD1(6, 1)<4> r[a0.1, 9]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD1(6,17)<4> r[a0.1,41]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD1(7, 1)<4> r[a0.5, 9]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD1(7,17)<4> r[a0.5,41]<8;4,2> { NoDDClr, NoDDChk }
+
+ mov (4) mubMSGPAYLOAD2(6, 1)<4> r[a0.1, 17]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD2(6,17)<4> r[a0.1,49]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD2(7, 1)<4> r[a0.5, 17]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD2(7,17)<4> r[a0.5,49]<8;4,2> { NoDDClr, NoDDChk }
+
+ mov (4) mubMSGPAYLOAD3(6, 1)<4> r[a0.1, 25]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD3(6,17)<4> r[a0.1,57]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD3(7, 1)<4> r[a0.5, 25]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD3(7,17)<4> r[a0.5,57]<8;4,2> { NoDDClr, NoDDChk }
+
+
+// Byte lane 2 <- data addressed by a0.0 / a0.4.
+ mov (4) mubMSGPAYLOAD0(6, 2)<4> r[a0.0, 1]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD0(6,18)<4> r[a0.0,33]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD0(7, 2)<4> r[a0.4, 1]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD0(7,18)<4> r[a0.4,33]<8;4,2> { NoDDClr, NoDDChk }
+
+ mov (4) mubMSGPAYLOAD1(6, 2)<4> r[a0.0, 9]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD1(6,18)<4> r[a0.0,41]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD1(7, 2)<4> r[a0.4, 9]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD1(7,18)<4> r[a0.4,41]<8;4,2> { NoDDClr, NoDDChk }
+
+ mov (4) mubMSGPAYLOAD2(6, 2)<4> r[a0.0, 17]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD2(6,18)<4> r[a0.0,49]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD2(7, 2)<4> r[a0.4, 17]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD2(7,18)<4> r[a0.4,49]<8;4,2> { NoDDClr, NoDDChk }
+
+ mov (4) mubMSGPAYLOAD3(6, 2)<4> r[a0.0, 25]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD3(6,18)<4> r[a0.0,57]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD3(7, 2)<4> r[a0.4, 25]<8;4,2> { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD3(7,18)<4> r[a0.4,57]<8;4,2> { NoDDClr, NoDDChk }
+
+
+// Byte lane 3 <- the single byte r2.31; the final write to each register drops NoDDClr.
+ mov (4) mubMSGPAYLOAD0(6, 3)<4> r2.31:ub { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD0(6,19)<4> r2.31:ub { NoDDChk }
+ mov (4) mubMSGPAYLOAD0(7, 3)<4> r2.31:ub { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD0(7,19)<4> r2.31:ub { NoDDChk }
+
+ mov (4) mubMSGPAYLOAD1(6, 3)<4> r2.31:ub { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD1(6,19)<4> r2.31:ub { NoDDChk }
+ mov (4) mubMSGPAYLOAD1(7, 3)<4> r2.31:ub { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD1(7,19)<4> r2.31:ub { NoDDChk }
+
+ mov (4) mubMSGPAYLOAD2(6, 3)<4> r2.31:ub { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD2(6,19)<4> r2.31:ub { NoDDChk }
+ mov (4) mubMSGPAYLOAD2(7, 3)<4> r2.31:ub { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD2(7,19)<4> r2.31:ub { NoDDChk }
+
+ mov (4) mubMSGPAYLOAD3(6, 3)<4> r2.31:ub { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD3(6,19)<4> r2.31:ub { NoDDChk }
+ mov (4) mubMSGPAYLOAD3(7, 3)<4> r2.31:ub { NoDDClr, NoDDChk }
+ mov (4) mubMSGPAYLOAD3(7,19)<4> r2.31:ub { NoDDChk }
+
+
+ // Issue the four messages that start at r28, r37, r46 and r55. Using the descriptor layout
+ // documented in common.inc, 0x120A8018 is the media-block-write base 0x020A8000 with
+ // message length 9 (1 header + 8 payload registers), response length 0 and binding table
+ // index 0x18; the destination is null.
+ // NOTE(review): 0x5 is presumably the extended descriptor selecting the target unit -- confirm.
+ send (8) null<1>:d r28 0x5 0x120A8018:ud
+ send (8) null<1>:d r37 0x5 0x120A8018:ud
+ send (8) null<1>:d r46 0x5 0x120A8018:ud
+ send (8) null<1>:d r55 0x5 0x120A8018:ud
--- /dev/null
+// 7 // Total instruction count
+// 1 // Total kernel count
+
+
+// Module name: common.inc
+//
+// Common header file for all Video-Processing kernels
+//
+
+// Assembler defaults: execution size 16 when an instruction gives none, and unsigned byte
+// (:ub) for operands written without a type suffix.
+.default_execution_size (16)
+.default_register_type :ub
+
+// Register counts declared for this kernel: 128 in total, 7 for the payload.
+.reg_count_total 128
+.reg_count_payload 7
+
+//========== Common constants ==========
+
+
+//========== Macros ==========
+
+
+//Fast Jump, For more details see "Set_Layer_N.asm"
+
+
+//========== Defines ====================
+
+//========== Static Parameters (Common To All) ==========
+//r1
+
+
+//r2
+
+ // e.g. byte0 byte1 byte2
+ // YUYV 0 1 3
+ // YVYU 0 3 1
+
+//Color Pipe (IECP) parameters
+
+
+//r4
+
+ // e.g. byte0 byte1 byte2
+ // YUYV 0 1 3
+ // YVYU 0 3 1
+
+
+//========== Inline parameters (Common To All) ===========
+
+
+//============== Binding Index Table===========
+//Common between DNDI and DNUV
+
+
+//================= Common Message Descriptor =====
+// Message descriptor for thread spawning
+// Message Descriptors
+// = 000 0001 (min message len 1 ) 0,0000 (resp len 0 -add later)
+// 0000,0000,0000
+// 0001(Spawn a root thread),0001 (Root thread spawn thread)
+// = 0x02000011
+// Thread Spawner Message Descriptor
+
+
+// Message descriptor for atomic operation add
+// Message Descriptors
+// = 000 0110 (min message len 6 ) 0,0000 (resp len 0 -add later)
+// 1(header present)001,10(typed atomic operation)0(return enabled)0(slot group, low 8 bits),0111 (AOP_Add)
+// 0000,0000 (Binding table index, added later)
+// = 0x0C098700
+
+// Atomic Operation Add Message Descriptor
+
+
+// Message descriptor for dataport media write
+ // Message Descriptors
+ // = 000 0001 (min message len 1 - add later) 00000 (resp len 0)
+ // 1 (header present 1) 0 1010 (media block write) 000000
+ // 00000000 (binding table index - set later)
+ // = 0x020A8000
+
+
+// Message Length defines
+
+
+// Response Length defines
+
+
+// Block Width and Height Size defines
+
+
+// Extended Message Descriptors
+
+
+// Common message descriptors:
+
+
+//===================== Math Function Control ===================================
+
+
+//============ Message Registers ===============
+ // buf4 starts from r28
+
+
+//#define mMSGHDR_EOT r43 // Dummy Message Register for EOT
+
+
+// Four typed views (ub, uw, ud, f) of the message payload area based at r30.
+.declare mubMSGPAYLOAD Base=r30 ElementSize=1 SrcRegion=<16;16,1> Type=ub
+.declare muwMSGPAYLOAD Base=r30 ElementSize=2 SrcRegion=<16;16,1> Type=uw
+.declare mudMSGPAYLOAD Base=r30 ElementSize=4 SrcRegion=<8;8,1> Type=ud
+.declare mfMSGPAYLOAD Base=r30 ElementSize=4 SrcRegion=<8;8,1> Type=f
+
+//=================== End of thread instruction ===========================
+
+
+//=====================Pointers Used=====================================
+
+
+//=======================================================================
+
+
+//r9-r17
+// Define temp space for any usages
+
+
+// Common Buffers
+
+
+// End of common.inc
+
+//Module Name: Set_AVS_Buf_0123_BGRA.asm
+
+
+//Module Name: Set_Buf_0123_BGRA
+
+
+// Description: Includes all definitions explicit to Fast Composite.
+
+
+// End of common.inc
+
+
+//========== GRF partition ==========
+ // r0 header : r0 (1 GRF)
+ // Static parameters : r1 - r6 (6 GRFS)
+ // Inline parameters : r7 - r8 (2 GRFs)
+ // MSGSRC : r27 (1 GRF)
+//===================================
+
+//Interface:
+//========== Static Parameters (Explicit To Fast Composite) ==========
+//r1
+//CSC Set 0
+
+
+// Unsigned-dword view starting at r1.0 (4-byte elements); listed here under "CSC Set 0".
+.declare udCSC_CURBE Base=r1.0 ElementSize=4 Type=ud
+
+//Constant alpha
+
+
+//r2
+
+
+// WiDi Definitions
+
+
+//Colorfill
+
+
+// Byte view starting at r2.20. The <0;1,0> source region reads a single element and
+// repeats it for every channel; the destination stride is 1 byte.
+.declare ubCOLOR_PIXEL_VAL Base=r2.20 ElementSize=1 SrcRegion=<0;1,0> DstRegion=<1> Type=ub
+
+//r3
+//Normalised Ratio of Horizontal step size with main video for all layers
+
+
+ //Normalised Ratio of Horizontal step size with main video for all layers becomes
+ //Normalised Horizontal step size for all layers in VP_Setup.asm
+
+
+//r4
+//Normalised Vertical step size for all layers
+
+
+//r5
+//Normalised Vertical Frame Origin for all layers
+
+
+//r6
+//Normalised Horizontal Frame Origin for all layers
+
+
+//========== Inline Parameters (Explicit To Fast Composite) ==========
+
+
+//Main video Step X
+
+
+//====================== Binding table (Explicit To Fast Composite)=========================================
+
+
+//Used by Interlaced Scaling Kernels
+
+
+//========== Sampler State Table Index (Explicit To Fast Composite)==========
+//Sampler Index for AVS/IEF messages
+
+
+//Sampler Index for SIMD16 sampler messages
+
+
+//=============================================================================
+
+// Typed aliases for six register buffers. BUFFER_0..3 start at r64, r80, r96 and r112
+// (16 registers apart); BUFFER_4 and BUFFER_5 start at r28 and r46. Every group below
+// names the same six base registers and differs only in element type and default regions.
+
+// Float view: 4-byte elements, 8 contiguous elements per row.
+.declare fBUFFER_0 Base=r64.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_1 Base=r80.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_2 Base=r96.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_3 Base=r112.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_4 Base=r28.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_5 Base=r46.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+
+// Unsigned-dword view: 4-byte elements, 8 contiguous elements per row.
+.declare udBUFFER_0 Base=r64.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_1 Base=r80.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_2 Base=r96.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_3 Base=r112.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_4 Base=r28.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_5 Base=r46.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+
+// Unsigned-word view: 2-byte elements, 16 contiguous elements per row.
+.declare uwBUFFER_0 Base=r64.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_1 Base=r80.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_2 Base=r96.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_3 Base=r112.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_4 Base=r28.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_5 Base=r46.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+
+// Unsigned-byte view: 1-byte elements, 16 contiguous elements per row.
+.declare ubBUFFER_0 Base=r64.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_1 Base=r80.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_2 Base=r96.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_3 Base=r112.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_4 Base=r28.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_5 Base=r46.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+
+// Strided byte view: every 4th byte (source <32;8,4>, destination stride 4).
+.declare ub4BUFFER_0 Base=r64.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_1 Base=r80.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_2 Base=r96.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_3 Base=r112.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_4 Base=r28.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_5 Base=r46.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+
+//Pointer to mask reg
+
+
+//r18
+
+
+//Always keep Channel Pointers and Offsets in same GRF, so that we can use
+// NODDCLR, NODDCHK flags. -rT
+
+
+// Unsigned-dword view of r18 (one full register).
+.declare udCSC_COEFF_0 Base=r18.0 ElementSize=4 Type=ud // 1 GRF
+
+//r19
+
+
+// Unsigned-dword view of r19 (one full register).
+.declare udCSC_COEFF_1 Base=r19.0 ElementSize=4 Type=ud // 1 GRF
+
+
+//r20
+
+// Unsigned-word view of r20, 16 contiguous words per row.
+.declare uwALPHA_MASK_REG_TEMP Base=r20.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw // 1 GRF
+
+//r21
+
+// Unsigned-word view of r21, 16 contiguous words per row.
+.declare uwALPHA_MASK_REG Base=r21.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw // 1 GRF
+
+//r22
+
+
+//Always keep Channel Pointers and Offsets in same GRF, so that we can use
+// NODDCLR, NODDCHK flags. -rT
+
+
+//Keep fORIGIN_X_NLAS, fY_OFFSET_2ND_BLOCK, fSTEP_X_NLAS, pMSGDSC_COPY, ubCONST_ALPHA_COPY as
+//sub registers of same GRF to enable using NODDCLR NODDCHK. -rT
+
+//r23
+
+
+//Lumakey
+
+
+//r24
+
+
+//r25
+
+
+//r26
+
+
+//defines to generate LABELS during compile time.
+
+
+ //AVS LAYOUT:(UUYYVVAA)
+ //Assign buffer channel order for Buffer 0123 in the order AUYV a0.3>A, a0.2>U, a0.1>Y, a0.0>V
+ // V = 8, Y= 4, U = 0, A = 12 (GRF offsets from r64, where BUFFER_0 is declared).
+ mov (4) acc0.0<1>:w 0x6AE2:v //Packed 4-bit immediates, low nibble first: 2,-2,-6,6 = (8,4,0,12) minus a bias of 6, so that 12 fits in a signed nibble
+ add (4) acc0.0<1>:w acc0<4;4,1>:w 70:uw //Add 70 = 64 (base GRF) + 6 (undo the bias): GRF numbers 72,68,64,76 for V,Y,U,A
+ shl (4) r22.0<1>:w acc0<4;4,1>:w 5:uw //Multiply by 32 to convert GRF numbers to BYTE addresses; result in r22.0..r22.3:w
+
+ //OPT: wAVS_SU_SHUFFLE_PTR_0 and udAVS_SU_SHUFFLE_OFF_0 are sub-regs of same GRF (presumably r18.0:w and r18.4:ud written below - confirm). -rT
+
+ //SU LAYOUT:(UYVAUYVA) - label corrected to match the offsets on the next line and the AVS layout above
+ //V = 4, Y = 2, U = 0, A = 6
+ mov (4) acc0.0<1>:w 0x6024:v //Nibbles, low first: 4,2,0,6 = V,Y,U,A; no bias needed here
+ add (4) acc0.0<1>:w acc0<4;4,1>:w 64:uw //Add base GRF 64: GRF numbers 68,66,64,70
+ shl (4) r18.0<1>:w acc0<4;4,1>:w 5:uw { NoDDClr } //Convert to BYTE address. NoDDClr pairs with the NoDDChk write below (same GRF, different bytes).
+
+ //OFFSET: r18.4:ud (bytes 16-19 of r18) = 0x01000100, i.e. 0x0100 in both words.
+ mov (1) r18.4<1>:ud 0x1000100:ud { NoDDChk }
+
+
--- /dev/null
+// 7 // Total instruction count
+// 1 // Total kernel count
+
+
+// Module name: common.inc
+//
+// Common header file for all Video-Processing kernels
+//
+
+.default_execution_size (16) // Assembler default execution size; the instructions in this file all give an explicit width
+.default_register_type :ub // Assembler default register type (unsigned byte); the declares and operands in this file are explicitly typed
+// Register budget declared to the assembler for this kernel.
+.reg_count_total 128 // 128 registers in total
+.reg_count_payload 7 // 7 payload registers. NOTE(review): presumably r0 header + r1-r6 static parameters - confirm
+
+//========== Common constants ==========
+
+
+//========== Macros ==========
+
+
+//Fast Jump, For more details see "Set_Layer_N.asm"
+
+
+//========== Defines ====================
+
+//========== Static Parameters (Common To All) ==========
+//r1
+
+
+//r2
+
+ // e.g. byte0 byte1 byte2
+ // YUYV 0 1 3
+ // YVYU 0 3 1
+
+//Color Pipe (IECP) parameters
+
+
+//r4
+
+ // e.g. byte0 byte1 byte2
+ // YUYV 0 1 3
+ // YVYU 0 3 1
+
+
+//========== Inline parameters (Common To All) ===========
+
+
+//============== Binding Index Table===========
+//Common between DNDI and DNUV
+
+
+//================= Common Message Descriptor =====
+// Message descriptor for thread spawning
+// Message Descriptors
+// = 000 0001 (min message len 1 ) 0,0000 (resp len 0 -add later)
+// 0000,0000,0000
+// 0001(Spawn a root thread),0001 (Root thread spawn thread)
+// = 0x02000011
+// Thread Spawner Message Descriptor
+
+
+// Message descriptor for atomic operation add
+// Message Descriptors
+// = 000 0110 (min message len 6 ) 0,0000 (resp len 0 -add later)
+// 1(header present)001,10(typed atomic operation)0(return enabled)0(slot group, low 8 bits),0111 (AOP_Add)
+// 0000,0000 (Binding table index, added later)
+// = 0x02000011
+
+// Atomic Operation Add Message Descriptor
+
+
+// Message descriptor for dataport media write
+ // Message Descriptors
+ // = 000 0001 (min message len 1 - add later) 00000 (resp len 0)
+ // 1 (header present 1) 0 1010 (media block write) 000000
+ // 00000000 (binding table index - set later)
+ // = 0x020A8000
+
+
+// Message Length defines
+
+
+// Response Length defines
+
+
+// Block Width and Height Size defines
+
+
+// Extended Message Descriptors
+
+
+// Common message descriptors:
+
+
+//===================== Math Function Control ===================================
+
+
+//============ Message Registers ===============
+ // buf4 starts from r28
+
+
+//#define mMSGHDR_EOT r43 // Dummy Message Register for EOT
+
+
+.declare mubMSGPAYLOAD Base=r30 ElementSize=1 SrcRegion=<16;16,1> Type=ub // Unsigned-byte view of the message payload starting at r30
+.declare muwMSGPAYLOAD Base=r30 ElementSize=2 SrcRegion=<16;16,1> Type=uw // Unsigned-word view of the same registers
+.declare mudMSGPAYLOAD Base=r30 ElementSize=4 SrcRegion=<8;8,1> Type=ud // Unsigned-dword view of the same registers
+.declare mfMSGPAYLOAD Base=r30 ElementSize=4 SrcRegion=<8;8,1> Type=f // Float view of the same registers
+
+//=================== End of thread instruction ===========================
+
+
+//=====================Pointers Used=====================================
+
+
+//=======================================================================
+
+
+//r9-r17
+// Define temp space for any usages
+
+
+// Common Buffers
+
+
+// End of common.inc
+
+//Module Name: Set_AVS_Buf_0123_PL2.asm
+
+
+//Module Name: Set_Buf_0123_PL2
+
+
+// Description: Includes all definitions explicit to Fast Composite.
+
+
+// End of common.inc
+
+
+//========== GRF partition ==========
+ // r0 header : r0 (1 GRF)
+ // Static parameters : r1 - r6 (6 GRFS)
+ // Inline parameters : r7 - r8 (2 GRFs)
+ // MSGSRC : r27 (1 GRF)
+//===================================
+
+//Interface:
+//========== Static Parameters (Explicit To Fast Composite) ==========
+//r1
+//CSC Set 0
+
+
+.declare udCSC_CURBE Base=r1.0 ElementSize=4 Type=ud // Dword view starting at r1.0 (first static-parameter GRF; section header above: CSC Set 0)
+
+//Constant alpha
+
+
+//r2
+
+
+// WiDi Definitions
+
+
+//Colorfill
+
+
+.declare ubCOLOR_PIXEL_VAL Base=r2.20 ElementSize=1 SrcRegion=<0;1,0> DstRegion=<1> Type=ub // Byte view starting at byte 20 of r2; source region <0;1,0> replicates one byte across all channels (Colorfill section)
+
+//r3
+//Normalised Ratio of Horizontal step size with main video for all layers
+
+
+ //Normalised Ratio of Horizontal step size with main video for all layers becomes
+ //Normalised Horizontal step size for all layers in VP_Setup.asm
+
+
+//r4
+//Normalised Vertical step size for all layers
+
+
+//r5
+//Normalised Vertical Frame Origin for all layers
+
+
+//r6
+//Normalised Horizontal Frame Origin for all layers
+
+
+//========== Inline Parameters (Explicit To Fast Composite) ==========
+
+
+//Main video Step X
+
+
+//====================== Binding table (Explicit To Fast Composite)=========================================
+
+
+//Used by Interlaced Scaling Kernels
+
+
+//========== Sampler State Table Index (Explicit To Fast Composite)==========
+//Sampler Index for AVS/IEF messages
+
+
+//Sampler Index for SIMD16 sampler messages
+
+
+//=============================================================================
+// Typed views of six buffers: BUFFER_0..3 start at r64, r80, r96, r112 (16 GRFs apart), BUFFER_4 at r28, BUFFER_5 at r46. Each group below aliases the same registers. Float (f) views, 8 elements per row:
+.declare fBUFFER_0 Base=r64.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_1 Base=r80.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_2 Base=r96.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_3 Base=r112.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_4 Base=r28.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_5 Base=r46.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+// Unsigned dword (ud) views, 8 elements per row:
+.declare udBUFFER_0 Base=r64.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_1 Base=r80.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_2 Base=r96.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_3 Base=r112.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_4 Base=r28.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_5 Base=r46.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+// Unsigned word (uw) views, 16 elements per row:
+.declare uwBUFFER_0 Base=r64.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_1 Base=r80.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_2 Base=r96.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_3 Base=r112.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_4 Base=r28.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_5 Base=r46.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+// Unsigned byte (ub) views, 16 contiguous bytes per row:
+.declare ubBUFFER_0 Base=r64.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_1 Base=r80.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_2 Base=r96.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_3 Base=r112.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_4 Base=r28.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_5 Base=r46.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+// Strided byte (ub4) views: source region <32;8,4> and destination stride 4 address every 4th byte:
+.declare ub4BUFFER_0 Base=r64.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_1 Base=r80.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_2 Base=r96.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_3 Base=r112.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_4 Base=r28.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_5 Base=r46.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+
+//Pointer to mask reg
+
+
+//r18
+
+
+//Always keep Channel Pointers and Offsets in same GRF, so that we can use
+// NODDCLR, NODDCHK flags. -rT
+
+
+.declare udCSC_COEFF_0 Base=r18.0 ElementSize=4 Type=ud // 1 GRF: dword view of r18. NOTE(review): the pointer setup at the end of this file also writes r18.0..r18.3:w and r18.4:ud - presumably the register is reused at a different stage; confirm
+
+//r19
+
+
+.declare udCSC_COEFF_1 Base=r19.0 ElementSize=4 Type=ud // 1 GRF: dword view of r19
+
+
+//r20
+
+.declare uwALPHA_MASK_REG_TEMP Base=r20.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw // 1 GRF: 16 contiguous words of r20
+
+//r21
+
+.declare uwALPHA_MASK_REG Base=r21.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw // 1 GRF: 16 contiguous words of r21
+
+//r22
+
+
+//Always keep Channel Pointers and Offsets in same GRF, so that we can use
+// NODDCLR, NODDCHK flags. -rT
+
+
+//Keep fORIGIN_X_NLAS, fY_OFFSET_2ND_BLOCK, fSTEP_X_NLAS, pMSGDSC_COPY, ubCONST_ALPHA_COPY as
+//sub registers of same GRF to enable using NODDCLR NODDCHK. -rT
+
+//r23
+
+
+//Lumakey
+
+
+//r24
+
+
+//r25
+
+
+//r26
+
+
+//defines to generate LABELS during compile time.
+
+
+ //AVS LAYOUT: (YYUUVVAA)
+ //Assign buffer channel order for Buffer 0123 in the order AUYV a0.3>A, a0.2>U, a0.1>Y, a0.0>V
+ //For PL2-AVS: V = 8, Y= 0, U = 4, A = 12 (GRF offsets from r64, where BUFFER_0 is declared).
+ mov (4) acc0.0<1>:w 0x6EA2:v //Packed 4-bit immediates, low nibble first: 2,-6,-2,6 = (8,0,4,12) minus a bias of 6, so that 12 fits in a signed nibble
+ add (4) acc0.0<1>:w acc0<4;4,1>:w 70:uw //Add 70 = 64 (base GRF) + 6 (undo the bias): GRF numbers 72,64,68,76 for V,Y,U,A
+ shl (4) r22.0<1>:w acc0<4;4,1>:w 5:uw //Multiply by 32 to convert GRF numbers to BYTE addresses; result in r22.0..r22.3:w
+
+ //OPT: wAVS_SU_SHUFFLE_PTR_0 and udAVS_SU_SHUFFLE_OFF_0 are sub-regs of same GRF (presumably r18.0:w and r18.4:ud written below - confirm). -rT
+ //SU LAYOUT:(YUVAYUVA)
+ //V = 4, Y = 0, U = 2, A = 6
+ mov (4) acc0.0<1>:w 0x6204:v //Nibbles, low first: 4,0,2,6 = V,Y,U,A; no bias needed here
+ add (4) acc0.0<1>:w acc0<4;4,1>:w 64:uw //Add base GRF 64: GRF numbers 68,64,66,70
+ shl (4) r18.0<1>:w acc0<4;4,1>:w 5:uw { NoDDClr } //Convert to BYTE address. NoDDClr pairs with the NoDDChk write below (same GRF, different bytes).
+
+ //OFFSET: r18.4:ud (bytes 16-19 of r18) = 0x01000100, i.e. 0x0100 in both words.
+ mov (1) r18.4<1>:ud 0x1000100:ud { NoDDChk }
+
+
--- /dev/null
+// 7 // Total instruction count
+// 1 // Total kernel count
+
+
+// Module name: common.inc
+//
+// Common header file for all Video-Processing kernels
+//
+
+.default_execution_size (16) // Assembler default execution size; the instructions in this file all give an explicit width
+.default_register_type :ub // Assembler default register type (unsigned byte); the declares and operands in this file are explicitly typed
+// Register budget declared to the assembler for this kernel.
+.reg_count_total 128 // 128 registers in total
+.reg_count_payload 7 // 7 payload registers. NOTE(review): presumably r0 header + r1-r6 static parameters - confirm
+
+//========== Common constants ==========
+
+
+//========== Macros ==========
+
+
+//Fast Jump, For more details see "Set_Layer_N.asm"
+
+
+//========== Defines ====================
+
+//========== Static Parameters (Common To All) ==========
+//r1
+
+
+//r2
+
+ // e.g. byte0 byte1 byte2
+ // YUYV 0 1 3
+ // YVYU 0 3 1
+
+//Color Pipe (IECP) parameters
+
+
+//r4
+
+ // e.g. byte0 byte1 byte2
+ // YUYV 0 1 3
+ // YVYU 0 3 1
+
+
+//========== Inline parameters (Common To All) ===========
+
+
+//============== Binding Index Table===========
+//Common between DNDI and DNUV
+
+
+//================= Common Message Descriptor =====
+// Message descriptor for thread spawning
+// Message Descriptors
+// = 000 0001 (min message len 1 ) 0,0000 (resp len 0 -add later)
+// 0000,0000,0000
+// 0001(Spawn a root thread),0001 (Root thread spawn thread)
+// = 0x02000011
+// Thread Spawner Message Descriptor
+
+
+// Message descriptor for atomic operation add
+// Message Descriptors
+// = 000 0110 (min message len 6 ) 0,0000 (resp len 0 -add later)
+// 1(header present)001,10(typed atomic operation)0(return enabled)0(slot group, low 8 bits),0111 (AOP_Add)
+// 0000,0000 (Binding table index, added later)
+// = 0x02000011
+
+// Atomic Operation Add Message Descriptor
+
+
+// Message descriptor for dataport media write
+ // Message Descriptors
+ // = 000 0001 (min message len 1 - add later) 00000 (resp len 0)
+ // 1 (header present 1) 0 1010 (media block write) 000000
+ // 00000000 (binding table index - set later)
+ // = 0x020A8000
+
+
+// Message Length defines
+
+
+// Response Length defines
+
+
+// Block Width and Height Size defines
+
+
+// Extended Message Descriptors
+
+
+// Common message descriptors:
+
+
+//===================== Math Function Control ===================================
+
+
+//============ Message Registers ===============
+ // buf4 starts from r28
+
+
+//#define mMSGHDR_EOT r43 // Dummy Message Register for EOT
+
+
+.declare mubMSGPAYLOAD Base=r30 ElementSize=1 SrcRegion=<16;16,1> Type=ub // Unsigned-byte view of the message payload starting at r30
+.declare muwMSGPAYLOAD Base=r30 ElementSize=2 SrcRegion=<16;16,1> Type=uw // Unsigned-word view of the same registers
+.declare mudMSGPAYLOAD Base=r30 ElementSize=4 SrcRegion=<8;8,1> Type=ud // Unsigned-dword view of the same registers
+.declare mfMSGPAYLOAD Base=r30 ElementSize=4 SrcRegion=<8;8,1> Type=f // Float view of the same registers
+
+//=================== End of thread instruction ===========================
+
+
+//=====================Pointers Used=====================================
+
+
+//=======================================================================
+
+
+//r9-r17
+// Define temp space for any usages
+
+
+// Common Buffers
+
+
+// End of common.inc
+
+//Module Name: Set_AVS_Buf_0123_PL3.asm
+
+
+//Module Name: Set_Buf_0123_PL3
+
+
+// Description: Includes all definitions explicit to Fast Composite.
+
+
+// End of common.inc
+
+
+//========== GRF partition ==========
+ // r0 header : r0 (1 GRF)
+ // Static parameters : r1 - r6 (6 GRFS)
+ // Inline parameters : r7 - r8 (2 GRFs)
+ // MSGSRC : r27 (1 GRF)
+//===================================
+
+//Interface:
+//========== Static Parameters (Explicit To Fast Composite) ==========
+//r1
+//CSC Set 0
+
+
+.declare udCSC_CURBE Base=r1.0 ElementSize=4 Type=ud // Dword view starting at r1.0 (first static-parameter GRF; section header above: CSC Set 0)
+
+//Constant alpha
+
+
+//r2
+
+
+// WiDi Definitions
+
+
+//Colorfill
+
+
+.declare ubCOLOR_PIXEL_VAL Base=r2.20 ElementSize=1 SrcRegion=<0;1,0> DstRegion=<1> Type=ub // Byte view starting at byte 20 of r2; source region <0;1,0> replicates one byte across all channels (Colorfill section)
+
+//r3
+//Normalised Ratio of Horizontal step size with main video for all layers
+
+
+ //Normalised Ratio of Horizontal step size with main video for all layers becomes
+ //Normalised Horizontal step size for all layers in VP_Setup.asm
+
+
+//r4
+//Normalised Vertical step size for all layers
+
+
+//r5
+//Normalised Vertical Frame Origin for all layers
+
+
+//r6
+//Normalised Horizontal Frame Origin for all layers
+
+
+//========== Inline Parameters (Explicit To Fast Composite) ==========
+
+
+//Main video Step X
+
+
+//====================== Binding table (Explicit To Fast Composite)=========================================
+
+
+//Used by Interlaced Scaling Kernels
+
+
+//========== Sampler State Table Index (Explicit To Fast Composite)==========
+//Sampler Index for AVS/IEF messages
+
+
+//Sampler Index for SIMD16 sampler messages
+
+
+//=============================================================================
+// Typed views of six buffers: BUFFER_0..3 start at r64, r80, r96, r112 (16 GRFs apart), BUFFER_4 at r28, BUFFER_5 at r46. Each group below aliases the same registers. Float (f) views, 8 elements per row:
+.declare fBUFFER_0 Base=r64.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_1 Base=r80.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_2 Base=r96.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_3 Base=r112.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_4 Base=r28.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_5 Base=r46.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+// Unsigned dword (ud) views, 8 elements per row:
+.declare udBUFFER_0 Base=r64.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_1 Base=r80.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_2 Base=r96.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_3 Base=r112.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_4 Base=r28.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_5 Base=r46.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+// Unsigned word (uw) views, 16 elements per row:
+.declare uwBUFFER_0 Base=r64.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_1 Base=r80.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_2 Base=r96.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_3 Base=r112.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_4 Base=r28.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_5 Base=r46.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+// Unsigned byte (ub) views, 16 contiguous bytes per row:
+.declare ubBUFFER_0 Base=r64.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_1 Base=r80.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_2 Base=r96.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_3 Base=r112.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_4 Base=r28.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_5 Base=r46.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+// Strided byte (ub4) views: source region <32;8,4> and destination stride 4 address every 4th byte:
+.declare ub4BUFFER_0 Base=r64.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_1 Base=r80.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_2 Base=r96.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_3 Base=r112.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_4 Base=r28.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_5 Base=r46.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+
+//Pointer to mask reg
+
+
+//r18
+
+
+//Always keep Channel Pointers and Offsets in same GRF, so that we can use
+// NODDCLR, NODDCHK flags. -rT
+
+
+.declare udCSC_COEFF_0 Base=r18.0 ElementSize=4 Type=ud // 1 GRF: dword view of r18. NOTE(review): the pointer setup at the end of this file also writes r18.0..r18.3:w and r18.4:ud - presumably the register is reused at a different stage; confirm
+
+//r19
+
+
+.declare udCSC_COEFF_1 Base=r19.0 ElementSize=4 Type=ud // 1 GRF: dword view of r19
+
+
+//r20
+
+.declare uwALPHA_MASK_REG_TEMP Base=r20.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw // 1 GRF: 16 contiguous words of r20
+
+//r21
+
+.declare uwALPHA_MASK_REG Base=r21.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw // 1 GRF: 16 contiguous words of r21
+
+//r22
+
+
+//Always keep Channel Pointers and Offsets in same GRF, so that we can use
+// NODDCLR, NODDCHK flags. -rT
+
+
+//Keep fORIGIN_X_NLAS, fY_OFFSET_2ND_BLOCK, fSTEP_X_NLAS, pMSGDSC_COPY, ubCONST_ALPHA_COPY as
+//sub registers of same GRF to enable using NODDCLR NODDCHK. -rT
+
+//r23
+
+
+//Lumakey
+
+
+//r24
+
+
+//r25
+
+
+//r26
+
+
+//defines to generate LABELS during compile time.
+
+
+ //AVS LAYOUT: (YYUUVVAA)
+ //Assign buffer channel order for Buffer 0123 in the order AUYV a0.3>A, a0.2>U, a0.1>Y, a0.0>V
+ //For PL3-AVS: V = 8, Y= 0, U = 4, A = 12 (GRF offsets from r64, where BUFFER_0 is declared).
+ mov (4) acc0.0<1>:w 0x6EA2:v //Packed 4-bit immediates, low nibble first: 2,-6,-2,6 = (8,0,4,12) minus a bias of 6, so that 12 fits in a signed nibble
+ add (4) acc0.0<1>:w acc0<4;4,1>:w 70:uw //Add 70 = 64 (base GRF) + 6 (undo the bias): GRF numbers 72,64,68,76 for V,Y,U,A
+ shl (4) r22.0<1>:w acc0<4;4,1>:w 5:uw //Multiply by 32 to convert GRF numbers to BYTE addresses; result in r22.0..r22.3:w
+
+ //OPT: wAVS_SU_SHUFFLE_PTR_0 and udAVS_SU_SHUFFLE_OFF_0 are sub-regs of same GRF (presumably r18.0:w and r18.4:ud written below - confirm). -rT
+ //SU LAYOUT:(YUVAYUVA)
+ //V = 4, Y = 0, U = 2, A = 6
+ mov (4) acc0.0<1>:w 0x6204:v //Nibbles, low first: 4,0,2,6 = V,Y,U,A; no bias needed here
+ add (4) acc0.0<1>:w acc0<4;4,1>:w 64:uw //Add base GRF 64: GRF numbers 68,64,66,70
+ shl (4) r18.0<1>:w acc0<4;4,1>:w 5:uw { NoDDClr } //Convert to BYTE address. NoDDClr pairs with the NoDDChk write below (same GRF, different bytes).
+
+ //OFFSET: r18.4:ud (bytes 16-19 of r18) = 0x01000100, i.e. 0x0100 in both words.
+ mov (1) r18.4<1>:ud 0x1000100:ud { NoDDChk }
+
+
--- /dev/null
+// 7 // Total instruction count
+// 1 // Total kernel count
+
+
+// Module name: common.inc
+//
+// Common header file for all Video-Processing kernels
+//
+
+.default_execution_size (16) // Assembler default execution size; the instructions in this file all give an explicit width
+.default_register_type :ub // Assembler default register type (unsigned byte); the declares and operands in this file are explicitly typed
+// Register budget declared to the assembler for this kernel.
+.reg_count_total 128 // 128 registers in total
+.reg_count_payload 7 // 7 payload registers. NOTE(review): presumably r0 header + r1-r6 static parameters - confirm
+
+//========== Common constants ==========
+
+
+//========== Macros ==========
+
+
+//Fast Jump, For more details see "Set_Layer_N.asm"
+
+
+//========== Defines ====================
+
+//========== Static Parameters (Common To All) ==========
+//r1
+
+
+//r2
+
+ // e.g. byte0 byte1 byte2
+ // YUYV 0 1 3
+ // YVYU 0 3 1
+
+//Color Pipe (IECP) parameters
+
+
+//r4
+
+ // e.g. byte0 byte1 byte2
+ // YUYV 0 1 3
+ // YVYU 0 3 1
+
+
+//========== Inline parameters (Common To All) ===========
+
+
+//============== Binding Index Table===========
+//Common between DNDI and DNUV
+
+
+//================= Common Message Descriptor =====
+// Message descriptor for thread spawning
+// Message Descriptors
+// = 000 0001 (min message len 1 ) 0,0000 (resp len 0 -add later)
+// 0000,0000,0000
+// 0001(Spawn a root thread),0001 (Root thread spawn thread)
+// = 0x02000011
+// Thread Spawner Message Descriptor
+
+
+// Message descriptor for atomic operation add
+// Message Descriptors
+// = 000 0110 (min message len 6 ) 0,0000 (resp len 0 -add later)
+// 1(header present)001,10(typed atomic operation)0(return enabled)0(slot group, low 8 bits),0111 (AOP_Add)
+// 0000,0000 (Binding table index, added later)
+// = 0x02000011
+
+// Atomic Operation Add Message Descriptor
+
+
+// Message descriptor for dataport media write
+ // Message Descriptors
+ // = 000 0001 (min message len 1 - add later) 00000 (resp len 0)
+ // 1 (header present 1) 0 1010 (media block write) 000000
+ // 00000000 (binding table index - set later)
+ // = 0x020A8000
+
+
+// Message Length defines
+
+
+// Response Length defines
+
+
+// Block Width and Height Size defines
+
+
+// Extended Message Descriptors
+
+
+// Common message descriptors:
+
+
+//===================== Math Function Control ===================================
+
+
+//============ Message Registers ===============
+ // buf4 starts from r28
+
+
+//#define mMSGHDR_EOT r43 // Dummy Message Register for EOT
+
+
+.declare mubMSGPAYLOAD Base=r30 ElementSize=1 SrcRegion=<16;16,1> Type=ub // Unsigned-byte view of the message payload starting at r30
+.declare muwMSGPAYLOAD Base=r30 ElementSize=2 SrcRegion=<16;16,1> Type=uw // Unsigned-word view of the same registers
+.declare mudMSGPAYLOAD Base=r30 ElementSize=4 SrcRegion=<8;8,1> Type=ud // Unsigned-dword view of the same registers
+.declare mfMSGPAYLOAD Base=r30 ElementSize=4 SrcRegion=<8;8,1> Type=f // Float view of the same registers
+
+//=================== End of thread instruction ===========================
+
+
+//=====================Pointers Used=====================================
+
+
+//=======================================================================
+
+
+//r9-r17
+// Define temp space for any usages
+
+
+// Common Buffers
+
+
+// End of common.inc
+
+//Module Name: Set_AVS_Buf_0123_YUVA.asm
+
+
+// Module Name : Set_Buf_0123_VUYA
+
+
+// Description: Includes all definitions explicit to Fast Composite.
+
+
+// End of common.inc
+
+
+//========== GRF partition ==========
+ // r0 header : r0 (1 GRF)
+ // Static parameters : r1 - r6 (6 GRFS)
+ // Inline parameters : r7 - r8 (2 GRFs)
+ // MSGSRC : r27 (1 GRF)
+//===================================
+
+//Interface:
+//========== Static Parameters (Explicit To Fast Composite) ==========
+//r1
+//CSC Set 0
+
+
+.declare udCSC_CURBE Base=r1.0 ElementSize=4 Type=ud // Dword view starting at r1.0 (first static-parameter GRF; section header above: CSC Set 0)
+
+//Constant alpha
+
+
+//r2
+
+
+// WiDi Definitions
+
+
+//Colorfill
+
+
+.declare ubCOLOR_PIXEL_VAL Base=r2.20 ElementSize=1 SrcRegion=<0;1,0> DstRegion=<1> Type=ub // Byte view starting at byte 20 of r2; source region <0;1,0> replicates one byte across all channels (Colorfill section)
+
+//r3
+//Normalised Ratio of Horizontal step size with main video for all layers
+
+
+ //Normalised Ratio of Horizontal step size with main video for all layers becomes
+ //Normalised Horizontal step size for all layers in VP_Setup.asm
+
+
+//r4
+//Normalised Vertical step size for all layers
+
+
+//r5
+//Normalised Vertical Frame Origin for all layers
+
+
+//r6
+//Normalised Horizontal Frame Origin for all layers
+
+
+//========== Inline Parameters (Explicit To Fast Composite) ==========
+
+
+//Main video Step X
+
+
+//====================== Binding table (Explicit To Fast Composite)=========================================
+
+
+//Used by Interlaced Scaling Kernels
+
+
+//========== Sampler State Table Index (Explicit To Fast Composite)==========
+//Sampler Index for AVS/IEF messages
+
+
+//Sampler Index for SIMD16 sampler messages
+
+
+//=============================================================================
+// Typed views of six buffers: BUFFER_0..3 start at r64, r80, r96, r112 (16 GRFs apart), BUFFER_4 at r28, BUFFER_5 at r46. Each group below aliases the same registers. Float (f) views, 8 elements per row:
+.declare fBUFFER_0 Base=r64.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_1 Base=r80.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_2 Base=r96.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_3 Base=r112.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_4 Base=r28.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_5 Base=r46.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+// Unsigned dword (ud) views, 8 elements per row:
+.declare udBUFFER_0 Base=r64.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_1 Base=r80.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_2 Base=r96.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_3 Base=r112.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_4 Base=r28.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_5 Base=r46.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+// Unsigned word (uw) views, 16 elements per row:
+.declare uwBUFFER_0 Base=r64.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_1 Base=r80.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_2 Base=r96.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_3 Base=r112.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_4 Base=r28.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_5 Base=r46.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+// Unsigned byte (ub) views, 16 contiguous bytes per row:
+.declare ubBUFFER_0 Base=r64.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_1 Base=r80.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_2 Base=r96.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_3 Base=r112.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_4 Base=r28.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_5 Base=r46.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+// Strided byte (ub4) views: source region <32;8,4> and destination stride 4 address every 4th byte:
+.declare ub4BUFFER_0 Base=r64.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_1 Base=r80.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_2 Base=r96.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_3 Base=r112.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_4 Base=r28.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_5 Base=r46.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+
+//Pointer to mask reg
+
+
+//r18
+
+
+//Always keep Channel Pointers and Offsets in same GRF, so that we can use
+// NODDCLR, NODDCHK flags. -rT
+
+
+.declare udCSC_COEFF_0 Base=r18.0 ElementSize=4 Type=ud // 1 GRF: dword view of r18. NOTE(review): the pointer setup at the end of this file also writes r18.0..r18.3:w and r18.4:ud - presumably the register is reused at a different stage; confirm
+
+//r19
+
+
+.declare udCSC_COEFF_1 Base=r19.0 ElementSize=4 Type=ud // 1 GRF: dword view of r19
+
+
+//r20
+
+.declare uwALPHA_MASK_REG_TEMP Base=r20.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw // 1 GRF: 16 contiguous words of r20
+
+//r21
+
+.declare uwALPHA_MASK_REG Base=r21.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw // 1 GRF: 16 contiguous words of r21
+
+//r22
+
+
+//Always keep Channel Pointers and Offsets in same GRF, so that we can use
+// NODDCLR, NODDCHK flags. -rT
+
+
+//Keep fORIGIN_X_NLAS, fY_OFFSET_2ND_BLOCK, fSTEP_X_NLAS, pMSGDSC_COPY, ubCONST_ALPHA_COPY as
+//sub registers of same GRF to enable using NODDCLR NODDCHK. -rT
+
+//r23
+
+
+//Lumakey
+
+
+//r24
+
+
+//r25
+
+
+//r26
+
+
+//defines to generate LABELS during compile time.
+
+
+ //For AVS: We use surface state as R8G8B8A8_UNORM and hence set pointers to VUYA.
+ //AVS LAYOUT:(VVUUYYAA)
+ //Assign buffer channel order for Buffer 0123 in the order AUYV a0.3>A, a0.2>U, a0.1>Y, a0.0>V
+ //V = 0, Y= 8, U = 4, A = 12 (GRF offsets from r64, where BUFFER_0 is declared).
+ mov (4) acc0.0<1>:w 0x6E2A:v //Packed 4-bit immediates, low nibble first: -6,2,-2,6 = (0,8,4,12) minus a bias of 6, so that 12 fits in a signed nibble
+ add (4) acc0.0<1>:w acc0<4;4,1>:w 70:uw //Add 70 = 64 (base GRF) + 6 (undo the bias): GRF numbers 64,72,68,76 for V,Y,U,A
+ shl (4) r22.0<1>:w acc0<4;4,1>:w 5:uw //Multiply by 32 to convert GRF numbers to BYTE addresses; result in r22.0..r22.3:w
+
+ //Used by Shuffle.
+ //SU LAYOUT:(VUYAVUYA)
+ //V = 0, Y = 4, U = 2, A = 6
+ mov (4) acc0.0<1>:w 0x6240:v //Nibbles, low first: 0,4,2,6 = V,Y,U,A; no bias needed here
+ add (4) acc0.0<1>:w acc0<4;4,1>:w 64:uw //Add base GRF 64: GRF numbers 64,68,66,70
+ shl (4) r18.0<1>:w acc0<4;4,1>:w 5:uw { NoDDClr } //Convert to BYTE address. NoDDClr pairs with the NoDDChk write below (same GRF, different bytes).
+
+ //OFFSET: r18.4:ud (bytes 16-19 of r18) = 0x01000100, i.e. 0x0100 in both words.
+ mov (1) r18.4<1>:ud 0x1000100:ud { NoDDChk }
+
+
--- /dev/null
+// 7 // Total instruction count
+// 1 // Total kernel count
+
+
+// Module name: common.inc
+//
+// Common header file for all Video-Processing kernels
+//
+
+.default_execution_size (16) // Assembler default execution size
+.default_register_type :ub // Assembler default register type (unsigned byte)
+// Register budget declared to the assembler for this kernel.
+.reg_count_total 128 // 128 registers in total
+.reg_count_payload 7 // 7 payload registers. NOTE(review): presumably r0 header + r1-r6 static parameters - confirm
+
+//========== Common constants ==========
+
+
+//========== Macros ==========
+
+
+//Fast Jump, For more details see "Set_Layer_N.asm"
+
+
+//========== Defines ====================
+
+//========== Static Parameters (Common To All) ==========
+//r1
+
+
+//r2
+
+ // e.g. byte0 byte1 byte2
+ // YUYV 0 1 3
+ // YVYU 0 3 1
+
+//Color Pipe (IECP) parameters
+
+
+//r4
+
+ // e.g. byte0 byte1 byte2
+ // YUYV 0 1 3
+ // YVYU 0 3 1
+
+
+//========== Inline parameters (Common To All) ===========
+
+
+//============== Binding Index Table===========
+//Common between DNDI and DNUV
+
+
+//================= Common Message Descriptor =====
+// Message descriptor for thread spawning
+// Message Descriptors
+// = 000 0001 (min message len 1 ) 0,0000 (resp len 0 -add later)
+// 0000,0000,0000
+// 0001(Spawn a root thread),0001 (Root thread spawn thread)
+// = 0x02000011
+// Thread Spawner Message Descriptor
+
+
+// Message descriptor for atomic operation add
+// Message Descriptors
+// = 000 0110 (min message len 6 ) 0,0000 (resp len 0 -add later)
+// 1(header present)001,10(typed atomic operation)0(return enabled)0(slot group, low 8 bits),0111 (AOP_Add)
+// 0000,0000 (Binding table index, added later)
+// = 0x02000011
+
+// Atomic Operation Add Message Descriptor
+
+
+// Message descriptor for dataport media write
+ // Message Descriptors
+ // = 000 0001 (min message len 1 - add later) 00000 (resp len 0)
+ // 1 (header present 1) 0 1010 (media block write) 000000
+ // 00000000 (binding table index - set later)
+ // = 0x020A8000
+
+
+// Message Length defines
+
+
+// Response Length defines
+
+
+// Block Width and Height Size defines
+
+
+// Extended Message Descriptors
+
+
+// Common message descriptors:
+
+
+//===================== Math Function Control ===================================
+
+
+//============ Message Registers ===============
+ // buf4 starts from r28
+
+
+//#define mMSGHDR_EOT r43 // Dummy Message Register for EOT
+
+
+.declare mubMSGPAYLOAD Base=r30 ElementSize=1 SrcRegion=<16;16,1> Type=ub // Unsigned-byte view of the message payload starting at r30
+.declare muwMSGPAYLOAD Base=r30 ElementSize=2 SrcRegion=<16;16,1> Type=uw // Unsigned-word view of the same registers
+.declare mudMSGPAYLOAD Base=r30 ElementSize=4 SrcRegion=<8;8,1> Type=ud // Unsigned-dword view of the same registers
+.declare mfMSGPAYLOAD Base=r30 ElementSize=4 SrcRegion=<8;8,1> Type=f // Float view of the same registers
+
+//=================== End of thread instruction ===========================
+
+
+//=====================Pointers Used=====================================
+
+
+//=======================================================================
+
+
+//r9-r17
+// Define temp space for any usages
+
+
+// Common Buffers
+
+
+// End of common.inc
+
+//Module Name: Set_AVS_Buf_0123_VYUA.asm
+
+
+//Module Name: Set_Buf_0123_VYUA
+
+
+// Description: Includes all definitions explicit to Fast Composite.
+
+
+// End of common.inc
+
+
+//========== GRF partition ==========
+ // r0 header : r0 (1 GRF)
+ // Static parameters : r1 - r6 (6 GRFS)
+ // Inline parameters : r7 - r8 (2 GRFs)
+ // MSGSRC : r27 (1 GRF)
+//===================================
+
+//Interface:
+//========== Static Parameters (Explicit To Fast Composite) ==========
+//r1
+//CSC Set 0
+
+
+.declare udCSC_CURBE Base=r1.0 ElementSize=4 Type=ud // Dword view starting at r1.0 (first static-parameter GRF; section header above: CSC Set 0)
+
+//Constant alpha
+
+
+//r2
+
+
+// WiDi Definitions
+
+
+//Colorfill
+
+
+.declare ubCOLOR_PIXEL_VAL Base=r2.20 ElementSize=1 SrcRegion=<0;1,0> DstRegion=<1> Type=ub // Byte view starting at byte 20 of r2; source region <0;1,0> replicates one byte across all channels (Colorfill section)
+
+//r3
+//Normalised Ratio of Horizontal step size with main video for all layers
+
+
+ //Normalised Ratio of Horizontal step size with main video for all layers becomes
+ //Normalised Horizontal step size for all layers in VP_Setup.asm
+
+
+//r4
+//Normalised Vertical step size for all layers
+
+
+//r5
+//Normalised Vertical Frame Origin for all layers
+
+
+//r6
+//Normalised Horizontal Frame Origin for all layers
+
+
+//========== Inline Parameters (Explicit To Fast Composite) ==========
+
+
+//Main video Step X
+
+
+//====================== Binding table (Explicit To Fast Composite)=========================================
+
+
+//Used by Interlaced Scaling Kernels
+
+
+//========== Sampler State Table Index (Explicit To Fast Composite)==========
+//Sampler Index for AVS/IEF messages
+
+
+//Sampler Index for SIMD16 sampler messages
+
+
+//=============================================================================
+
+// Six working buffers, each exposed through five typed views of the same registers:
+//   BUFFER_0 = r64, BUFFER_1 = r80, BUFFER_2 = r96, BUFFER_3 = r112 (bases 16 GRFs apart),
+//   BUFFER_4 = r28, BUFFER_5 = r46 (bases 18 GRFs apart).
+// The name prefix selects the element type: f (float), ud (dword), uw (word),
+// ub (byte) and ub4 (byte with a stride of 4).
+
+// Float views: 4-byte elements, 8 per row.
+.declare fBUFFER_0 Base=r64.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_1 Base=r80.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_2 Base=r96.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_3 Base=r112.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_4 Base=r28.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_5 Base=r46.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+
+// Unsigned-dword views: same bases and regions as the float views.
+.declare udBUFFER_0 Base=r64.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_1 Base=r80.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_2 Base=r96.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_3 Base=r112.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_4 Base=r28.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_5 Base=r46.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+
+// Unsigned-word views: 2-byte elements, 16 per row.
+.declare uwBUFFER_0 Base=r64.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_1 Base=r80.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_2 Base=r96.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_3 Base=r112.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_4 Base=r28.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_5 Base=r46.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+
+// Unsigned-byte views: contiguous bytes, 16 per row.
+.declare ubBUFFER_0 Base=r64.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_1 Base=r80.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_2 Base=r96.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_3 Base=r112.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_4 Base=r28.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_5 Base=r46.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+
+// Strided byte views: horizontal stride 4 on both source and destination,
+// i.e. one byte out of every four (8 elements per 32-byte row).
+.declare ub4BUFFER_0 Base=r64.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_1 Base=r80.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_2 Base=r96.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_3 Base=r112.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_4 Base=r28.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_5 Base=r46.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+
+//Pointer to mask reg
+
+
+//r18
+
+
+//Always keep Channel Pointers and Offsets in same GRF, so that we can use
+// NODDCLR, NODDCHK flags. -rT
+
+
+// Dword view of r18 (one GRF) named for CSC coefficients.
+// NOTE: the SU-layout shuffle pointers and offset at the end of this module are
+// also written to r18 (r18.0 words and r18.4 dword).
+.declare udCSC_COEFF_0 Base=r18.0 ElementSize=4 Type=ud // 1 GRF
+
+//r19
+
+
+// Dword view of r19 (one GRF), second CSC coefficient register.
+.declare udCSC_COEFF_1 Base=r19.0 ElementSize=4 Type=ud // 1 GRF
+
+
+//r20
+
+// Word view of r20 (16 words), named as the temporary copy of the alpha mask.
+.declare uwALPHA_MASK_REG_TEMP Base=r20.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw // 1 GRF
+
+//r21
+
+// Word view of r21 (16 words), the alpha mask register.
+.declare uwALPHA_MASK_REG Base=r21.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw // 1 GRF
+
+//r22
+
+
+//Always keep Channel Pointers and Offsets in same GRF, so that we can use
+// NODDCLR, NODDCHK flags. -rT
+
+
+//Keep fORIGIN_X_NLAS, fY_OFFSET_2ND_BLOCK, fSTEP_X_NLAS, pMSGDSC_COPY, ubCONST_ALPHA_COPY as
+//sub registers of same GRF to enable using NODDCLR NODDCHK. -rT
+
+//r23
+
+
+//Lumakey
+
+
+//r24
+
+
+//r25
+
+
+//r26
+
+
+//defines to generate LABELS during compile time.
+
+
+ //AVS LAYOUT:(VVYYUUAA)
+ //Assign buffer channel order for Buffer 0123 in the order AUYV a0.3>A, a0.2>U, a0.1>Y, a0.0>V
+ // V = 0, Y= 4, U = 8, A = 12.
+ // Builds four word-sized channel pointers in r22.0-r22.3. The :v immediate packs
+ // signed 4-bit elements, so 8 and 12 cannot be encoded directly; the offsets are
+ // therefore stored biased by -6 (0x62EA -> -6,-2,2,6) and the bias is removed by the add.
+ mov (4) acc0.0<1>:w 0x62EA:v //Biased offsets: (0,4,8,12) - 6
+ add (4) acc0.0<1>:w acc0<4;4,1>:w 70:uw //+70 = +6 (undo bias) +64 (r64, base of BUFFER_0) -> 64,68,72,76
+ shl (4) r22.0<1>:w acc0<4;4,1>:w 5:uw //Register number * 32 = BYTE address.
+
+ //OPT: wAVS_SU_SHUFFLE_PTR_0 and udAVS_SU_SHUFFLE_OFF_0 are sub-regs of same GRF. -rT
+
+ //SU LAYOUT:(VYUAVYUA)
+ //V = 0, Y = 2, U = 4, A = 6
+ // These offsets fit in a signed 4-bit element, so no bias is needed: 0x6420 -> 0,2,4,6.
+ mov (4) acc0.0<1>:w 0x6420:v
+ add (4) acc0.0<1>:w acc0<4;4,1>:w 64:uw //Add base register number 64 -> 64,66,68,70
+ shl (4) r18.0<1>:w acc0<4;4,1>:w 5:uw { NoDDClr } //Convert to BYTE address.
+
+ //OFFSET:
+ // Writes the dword at r18.4 (two words of 0x0100 each). It targets the same GRF as the
+ // pointers above, which is why the pair is marked NoDDClr / NoDDChk.
+ mov (1) r18.4<1>:ud 0x1000100:ud { NoDDChk }
+
+
--- /dev/null
+// 17 // Total instruction count
+// 1 // Total kernel count
+
+
+// Module name: common.inc
+//
+// Common header file for all Video-Processing kernels
+//
+
+// Assembler defaults for this module: 16-channel execution and unsigned-byte
+// operands unless an instruction states otherwise.
+.default_execution_size (16)
+.default_register_type :ub
+
+// Register budget: 128 registers in total, 7 of them counted as payload.
+// NOTE(review): 7 presumably corresponds to r0 plus the static parameters r1-r6
+// listed in the GRF partition table - confirm.
+.reg_count_total 128
+.reg_count_payload 7
+
+//========== Common constants ==========
+
+
+//========== Macros ==========
+
+
+//Fast Jump, For more details see "Set_Layer_N.asm"
+
+
+//========== Defines ====================
+
+//========== Static Parameters (Common To All) ==========
+//r1
+
+
+//r2
+
+ // e.g. byte0 byte1 byte2
+ // YUYV 0 1 3
+ // YVYU 0 3 1
+
+//Color Pipe (IECP) parameters
+
+
+//r4
+
+ // e.g. byte0 byte1 byte2
+ // YUYV 0 1 3
+ // YVYU 0 3 1
+
+
+//========== Inline parameters (Common To All) ===========
+
+
+//============== Binding Index Table===========
+//Common between DNDI and DNUV
+
+
+//================= Common Message Descriptor =====
+// Message descriptor for thread spawning
+// Message Descriptors
+// = 000 0001 (min message len 1 ) 0,0000 (resp len 0 -add later)
+// 0000,0000,0000
+// 0001(Spawn a root thread),0001 (Root thread spawn thread)
+// = 0x02000011
+// Thread Spawner Message Descriptor
+
+
+// Message descriptor for atomic operation add
+// Message Descriptors
+// = 000 0110 (min message len 6 ) 0,0000 (resp len 0 -add later)
+// 1(header present)001,10(typed atomic operation)0(return enabled)0(slot group, low 8 bits),0111 (AOP_Add)
+// 0000,0000 (Binding table index, added later)
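+// = 0x0C098700 (value implied by the bit fields above; this line previously repeated the thread-spawner value 0x02000011)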
+
+// Atomic Operation Add Message Descriptor
+
+
+// Message descriptor for dataport media write
+ // Message Descriptors
+ // = 000 0001 (min message len 1 - add later) 00000 (resp len 0)
+ // 1 (header present 1) 0 1010 (media block write) 000000
+ // 00000000 (binding table index - set later)
+ // = 0x020A8000
+
+
+// Message Length defines
+
+
+// Response Length defines
+
+
+// Block Width and Height Size defines
+
+
+// Extended Message Descriptors
+
+
+// Common message descriptors:
+
+
+//===================== Math Function Control ===================================
+
+
+//============ Message Registers ===============
+ // buf4 starts from r28
+
+
+//#define mMSGHDR_EOT r43 // Dummy Message Register for EOT
+
+
+// Four typed views (byte, word, dword, float) of the message payload starting at r30.
+// NOTE(review): r30 lies two GRFs past r28, where the note above says buf4 starts;
+// the payload appears to share registers with BUFFER_4 - confirm this is intended.
+.declare mubMSGPAYLOAD Base=r30 ElementSize=1 SrcRegion=<16;16,1> Type=ub
+.declare muwMSGPAYLOAD Base=r30 ElementSize=2 SrcRegion=<16;16,1> Type=uw
+.declare mudMSGPAYLOAD Base=r30 ElementSize=4 SrcRegion=<8;8,1> Type=ud
+.declare mfMSGPAYLOAD Base=r30 ElementSize=4 SrcRegion=<8;8,1> Type=f
+
+//=================== End of thread instruction ===========================
+
+
+//=====================Pointers Used=====================================
+
+
+//=======================================================================
+
+
+//r9-r17
+// Define temp space for any usages
+
+
+// Common Buffers
+
+
+// End of common.inc
+
+
+//Module name: Set_Layer_N.inc
+
+
+// Description: Includes all definitions explicit to Fast Composite.
+
+
+// End of common.inc
+
+
+//========== GRF partition ==========
+ // r0 header : r0 (1 GRF)
+ // Static parameters : r1 - r6 (6 GRFS)
+ // Inline parameters : r7 - r8 (2 GRFs)
+ // MSGSRC : r27 (1 GRF)
+//===================================
+
+//Interface:
+//========== Static Parameters (Explicit To Fast Composite) ==========
+//r1
+//CSC Set 0
+
+
+// Unsigned-dword (4-byte element) view of the static parameters starting at r1.0;
+// declared under the "CSC Set 0" heading of the r1 layout.
+.declare udCSC_CURBE Base=r1.0 ElementSize=4 Type=ud
+
+//Constant alpha
+
+
+//r2
+
+
+// WiDi Definitions
+
+
+//Colorfill
+
+
+// Byte view starting at r2.20 (Colorfill section). The <0;1,0> source region
+// is a scalar region: a read through this name replicates one byte to every channel.
+.declare ubCOLOR_PIXEL_VAL Base=r2.20 ElementSize=1 SrcRegion=<0;1,0> DstRegion=<1> Type=ub
+
+//r3
+//Normalised Ratio of Horizontal step size with main video for all layers
+
+
+ //Normalised Ratio of Horizontal step size with main video for all layers becomes
+ //Normalised Horizontal step size for all layers in VP_Setup.asm
+
+
+//r4
+//Normalised Vertical step size for all layers
+
+
+//r5
+//Normalised Vertical Frame Origin for all layers
+
+
+//r6
+//Normalised Horizontal Frame Origin for all layers
+
+
+//========== Inline Parameters (Explicit To Fast Composite) ==========
+
+
+//Main video Step X
+
+
+//====================== Binding table (Explicit To Fast Composite)=========================================
+
+
+//Used by Interlaced Scaling Kernels
+
+
+//========== Sampler State Table Index (Explicit To Fast Composite)==========
+//Sampler Index for AVS/IEF messages
+
+
+//Sampler Index for SIMD16 sampler messages
+
+
+//=============================================================================
+
+// Six working buffers, each exposed through five typed views of the same registers:
+//   BUFFER_0 = r64, BUFFER_1 = r80, BUFFER_2 = r96, BUFFER_3 = r112 (bases 16 GRFs apart),
+//   BUFFER_4 = r28, BUFFER_5 = r46 (bases 18 GRFs apart).
+// The name prefix selects the element type: f (float), ud (dword), uw (word),
+// ub (byte) and ub4 (byte with a stride of 4).
+
+// Float views: 4-byte elements, 8 per row.
+.declare fBUFFER_0 Base=r64.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_1 Base=r80.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_2 Base=r96.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_3 Base=r112.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_4 Base=r28.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_5 Base=r46.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+
+// Unsigned-dword views: same bases and regions as the float views.
+.declare udBUFFER_0 Base=r64.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_1 Base=r80.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_2 Base=r96.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_3 Base=r112.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_4 Base=r28.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_5 Base=r46.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+
+// Unsigned-word views: 2-byte elements, 16 per row.
+.declare uwBUFFER_0 Base=r64.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_1 Base=r80.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_2 Base=r96.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_3 Base=r112.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_4 Base=r28.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_5 Base=r46.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+
+// Unsigned-byte views: contiguous bytes, 16 per row.
+.declare ubBUFFER_0 Base=r64.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_1 Base=r80.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_2 Base=r96.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_3 Base=r112.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_4 Base=r28.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_5 Base=r46.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+
+// Strided byte views: horizontal stride 4 on both source and destination,
+// i.e. one byte out of every four (8 elements per 32-byte row).
+.declare ub4BUFFER_0 Base=r64.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_1 Base=r80.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_2 Base=r96.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_3 Base=r112.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_4 Base=r28.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_5 Base=r46.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+
+//Pointer to mask reg
+
+
+//r18
+
+
+//Always keep Channel Pointers and Offsets in same GRF, so that we can use
+// NODDCLR, NODDCHK flags. -rT
+
+
+// Dword view of r18 (one GRF) named for CSC coefficients.
+.declare udCSC_COEFF_0 Base=r18.0 ElementSize=4 Type=ud // 1 GRF
+
+//r19
+
+
+// Dword view of r19 (one GRF), second CSC coefficient register.
+.declare udCSC_COEFF_1 Base=r19.0 ElementSize=4 Type=ud // 1 GRF
+
+
+//r20
+
+// Word view of r20 (16 words), named as the temporary copy of the alpha mask.
+.declare uwALPHA_MASK_REG_TEMP Base=r20.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw // 1 GRF
+
+//r21
+
+// Word view of r21 (16 words), the alpha mask register; this module initializes it to 0xFFFF.
+.declare uwALPHA_MASK_REG Base=r21.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw // 1 GRF
+
+//r22
+
+
+//Always keep Channel Pointers and Offsets in same GRF, so that we can use
+// NODDCLR, NODDCHK flags. -rT
+
+
+//Keep fORIGIN_X_NLAS, fY_OFFSET_2ND_BLOCK, fSTEP_X_NLAS, pMSGDSC_COPY, ubCONST_ALPHA_COPY as
+//sub registers of same GRF to enable using NODDCLR NODDCHK. -rT
+
+//r23
+
+
+//Lumakey
+
+
+//r24
+
+
+//r25
+
+
+//r26
+
+
+//defines to generate LABELS during compile time.
+
+
+//Used to generate LABELS at compile time.
+
+
+//definitions for Expand Mask
+// Scratch registers for mask expansion. Each temporary is one GRF seen through
+// several element types: Temp1 = r17, Temp2 = r16, Temp3 = r15 (all inside the
+// r9-r17 range that common.inc reserves as temp space).
+.declare uwMask_Temp1 Base=r17.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw // 1 GRF
+.declare ubMask_Temp1 Base=r17.0 ElementSize=1 SrcRegion=<16;16,1> Type=ub // 1 GRF
+.declare udMask_Temp1 Base=r17.0 ElementSize=4 Type=ud // 1 GRF
+.declare uwMask_Temp2 Base=r16.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw // 1 GRF
+.declare ubMask_Temp2 Base=r16.0 ElementSize=1 SrcRegion=<16;16,1> Type=ub // 1 GRF
+.declare udMask_Temp2 Base=r16.0 ElementSize=4 Type=ud // 1 GRF
+
+// Temp3 has word and byte views only (no dword view is declared).
+.declare uwMask_Temp3 Base=r15.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw // 1 GRF
+.declare ubMask_Temp3 Base=r15.0 ElementSize=1 SrcRegion=<16;16,1> Type=ub // 1 GRF
+
+// Dword aliases of the word-typed alpha mask registers (r21 and r20).
+.declare udALPHA_MASK_REG Base=r21.0 ElementSize=4 Type=ud // 1 GRF
+.declare udALPHA_MASK_REG_TEMP Base=r20.0 ElementSize=4 Type=ud // 1 GRF
+
+
+//Initialize mask reg to FFFF
+// Writes 0xFFFF to all 16 words of uwALPHA_MASK_REG (r21), i.e. every mask bit set.
+
+ mov (16) uwALPHA_MASK_REG(0)<1> 0xFFFF:uw
+
+
+//Fast jump for -
+//LAYER0: we determine whether layer 0 is to be loaded and processed or not based
+// on block mask in module "Set_Layer_0" and store result in f0.1.
+// This flag is then used directly while loading buf0-3 and colorfill.
+// (So flag f0.1 should not be changed from Set_Layer_0 till Colorfill)
+//
+//LAYER1-7: For all other layers, we compute whether layer is to be loaded and processed
+// based on block mask in module "Set_Layer_1-7" and store result in SKIP_LAYER
+// variable.
+// While Loading buf 4 and 5, we move SKIP_LAYER to f0.0 every time and use it
+// for Loading.
+// For processing though, we move SKIP_LAYER only once to f0.1 in module
+// "Set_Buf0_Buf4" and use f0.1 for deciding whether layer 1-7 (all 4 sub blocks)
+// is to be processed or not.
+// (So flag f0.1 should not be modified from module "Set_Buf0_Buf4" till the module
+// that processes sub-block 3.)
+//
+//None of the above fast jumps, apply to CSC modules. We always perform CSC irrespective of mask.
+//
+//Example: (Without going into finer details)
+// Typical Combined kernel:
+//
+// (let var = decision whether to load/process that layer)
+//
+// Set_Layer_0 //f0.1 <- var
+// ..
+// Set_Layer_1 //f0.1 <- var, SKIP_LAYER <- var
+// ..
+// Load buf 0 //use f0.1
+// Load buf 4 //f0.0 <- SKIP_LAYER
+// Load buf 1 //use f0.1
+// Load buf 5 //f0.0 <- SKIP_LAYER
+// Load buf 2 //use f0.1
+// Load buf 3 //use f0.1
+// ..
+// ..
+// Colorfill
+// ..
+// Set_Buf0_Buf4 //f0.1 <- SKIP_LAYER
+// process0-4 //Use f0.1
+// Load buf 4
+// Set_Buf1_Buf5
+// process1-5
+// Load buf 5
+// ..
+// Set_Layer_2 //f0.1 <-var, SKIP_LAYER <- var
+// ..
+// Set_Buf2_Buf4
+// process2-4
+// Load buf 4
+// Set_Buf3_Buf5
+// process3-5
+// Load buf 5
+// ..
+
+
+ //For layer 0, use f0.1 directly
+ // f0.1 ends up set only when both r7.2:uw and r7.3:uw are non-zero: the second
+ // compare is predicated on the first, so it runs only if the first one set the flag.
+ // If either word is zero the whole setup below is skipped.
+ // NOTE(review): r7.2/r7.3 are presumably the block masks mentioned in the fast-jump notes - confirm.
+ cmp.ne.f0.1 (1) null<1>:d r7.2:uw 0:uw
+ (f0.1)cmp.ne.f0.1 (1) null<1>:d r7.3:uw 0:uw
+ (-f0.1) jmpi (1) SKIP_LAYER_L0
+
+
+ //Copy all AVS Payload data
+ // Setup Message Payload Header for 1st block of Media Sampler 8x8 (16x4 for IVB+)
+ // The five moves fill different dwords of r25 (r25.2 - r25.6). Because they all
+ // target the same GRF they form one dependency-control chain: first write NoDDClr only,
+ // middle writes both flags, last write NoDDChk only.
+ // NOTE(review): r7.5 is the value multiplied by 28 and by 8 in the X(8)/dx(8)
+ // calculations below, i.e. the ddx term of the formulas; "NLAS dx" here likely
+ // means the per-pixel change of dx - confirm.
+ mov (1) r25.6:f r7.5:f { NoDDClr } //NLAS dx
+ mov (1) r25.4:f r3.0:f { NoDDClr, NoDDChk } //Step X
+ mov (1) r25.5:f r4.0:f { NoDDClr, NoDDChk } //Step Y
+ mov (1) r25.2:f r6.0<0;1,0>:f { NoDDClr, NoDDChk } //Orig X
+ mov (1) r25.3:f r5.0<0;1,0>:f { NoDDChk } //Orig Y
+
+ //NLAS calculations for 2nd half of blocks of Media Sampler 8x8:
+ // X(i) = X0 + dx*i + ddx*i*(i-1)/2 ==> X(8) = X0 + dx*8 +ddx*28
+ // dx(i)= dx(0) + ddx*i ==> dx(8)= dx + ddx*8
+
+ //OPTIMIZATION: fORIGIN_X_NLAS, fY_OFFSET_2ND_BLOCK, fSTEP_X_NLAS, pMSGDSC_COPY, ubCONST_ALPHA_COPY - are sub registers of same GRF. Use NODDCLR NODDCHK. -rT
+ // The four results below all land in r23 (r23.1, r23.2, r23.4, r23.5), forming a
+ // second NoDDClr ... NoDDChk chain.
+
+ // Calculating X(8)
+ // Accumulator is seeded with X0 (r6.0), then dx*8 (r3.0 * 8.0) and ddx*28
+ // (r7.5 * 28.0) are multiply-accumulated; the last mac stores the sum in r23.2.
+ mov (1) acc0.2:f r6.0:f
+ mac (1) acc0.2:f r3.0:f 8.0:f
+ mac (1) r23.2:f r7.5:f 28.0:f { NoDDClr }
+
+ // Calculating Y(4)
+ // Only the offset dY*4 is stored (r23.1); the Y origin itself is not added here.
+ mul (1) r23.1<1>:f r4.0:f 4.0:f { NoDDClr, NoDDChk } //dY*4
+
+ // Calculating dx(8)
+ // Accumulator is seeded with dx (r3.0); the mac adds ddx*8 and stores the result in r23.4.
+ mov (1) acc0.4:f r3.0:f
+ mac (1) r23.4:f r7.5:f 8.0:f { NoDDClr, NoDDChk }
+
+ // Binding Index
+ // Layer 0 uses binding index 0; this is the last write of the r23 chain.
+ mov (1) r23.5:ud 0:ud { NoDDChk }
+
+
+// Jump target when layer 0 is skipped; the nop gives the label an instruction to bind to.
+SKIP_LAYER_L0:
+ nop
+
+
--- /dev/null
+// 10 // Total instruction count
+// 1 // Total kernel count
+
+
+// Module name: common.inc
+//
+// Common header file for all Video-Processing kernels
+//
+
+// Assembler defaults for this module: 16-channel execution and unsigned-byte
+// operands unless an instruction states otherwise.
+.default_execution_size (16)
+.default_register_type :ub
+
+// Register budget: 128 registers in total, 7 of them counted as payload.
+// NOTE(review): 7 presumably corresponds to r0 plus the static parameters r1-r6
+// listed in the GRF partition table - confirm.
+.reg_count_total 128
+.reg_count_payload 7
+
+//========== Common constants ==========
+
+
+//========== Macros ==========
+
+
+//Fast Jump, For more details see "Set_Layer_N.asm"
+
+
+//========== Defines ====================
+
+//========== Static Parameters (Common To All) ==========
+//r1
+
+
+//r2
+
+ // e.g. byte0 byte1 byte2
+ // YUYV 0 1 3
+ // YVYU 0 3 1
+
+//Color Pipe (IECP) parameters
+
+
+//r4
+
+ // e.g. byte0 byte1 byte2
+ // YUYV 0 1 3
+ // YVYU 0 3 1
+
+
+//========== Inline parameters (Common To All) ===========
+
+
+//============== Binding Index Table===========
+//Common between DNDI and DNUV
+
+
+//================= Common Message Descriptor =====
+// Message descriptor for thread spawning
+// Message Descriptors
+// = 000 0001 (min message len 1 ) 0,0000 (resp len 0 -add later)
+// 0000,0000,0000
+// 0001(Spawn a root thread),0001 (Root thread spawn thread)
+// = 0x02000011
+// Thread Spawner Message Descriptor
+
+
+// Message descriptor for atomic operation add
+// Message Descriptors
+// = 000 0110 (min message len 6 ) 0,0000 (resp len 0 -add later)
+// 1(header present)001,10(typed atomic operation)0(return enabled)0(slot group, low 8 bits),0111 (AOP_Add)
+// 0000,0000 (Binding table index, added later)
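+// = 0x0C098700 (value implied by the bit fields above; this line previously repeated the thread-spawner value 0x02000011)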
+
+// Atomic Operation Add Message Descriptor
+
+
+// Message descriptor for dataport media write
+ // Message Descriptors
+ // = 000 0001 (min message len 1 - add later) 00000 (resp len 0)
+ // 1 (header present 1) 0 1010 (media block write) 000000
+ // 00000000 (binding table index - set later)
+ // = 0x020A8000
+
+
+// Message Length defines
+
+
+// Response Length defines
+
+
+// Block Width and Height Size defines
+
+
+// Extended Message Descriptors
+
+
+// Common message descriptors:
+
+
+//===================== Math Function Control ===================================
+
+
+//============ Message Registers ===============
+ // buf4 starts from r28
+
+
+//#define mMSGHDR_EOT r43 // Dummy Message Register for EOT
+
+
+// Four typed views (byte, word, dword, float) of the message payload starting at r30.
+// NOTE(review): r30 lies two GRFs past r28, where the note above says buf4 starts;
+// the payload appears to share registers with BUFFER_4 - confirm this is intended.
+.declare mubMSGPAYLOAD Base=r30 ElementSize=1 SrcRegion=<16;16,1> Type=ub
+.declare muwMSGPAYLOAD Base=r30 ElementSize=2 SrcRegion=<16;16,1> Type=uw
+.declare mudMSGPAYLOAD Base=r30 ElementSize=4 SrcRegion=<8;8,1> Type=ud
+.declare mfMSGPAYLOAD Base=r30 ElementSize=4 SrcRegion=<8;8,1> Type=f
+
+//=================== End of thread instruction ===========================
+
+
+//=====================Pointers Used=====================================
+
+
+//=======================================================================
+
+
+//r9-r17
+// Define temp space for any usages
+
+
+// Common Buffers
+
+
+// End of common.inc
+
+// FileName: VP_Setup.asm
+// Author: Vivek Kumar
+// Description: Sets up all parameters for the Video Processing Kernel
+
+
+// Description: Includes all definitions explicit to Fast Composite.
+
+
+// End of common.inc
+
+
+//========== GRF partition ==========
+ // r0 header : r0 (1 GRF)
+ // Static parameters : r1 - r6 (6 GRFS)
+ // Inline parameters : r7 - r8 (2 GRFs)
+ // MSGSRC : r27 (1 GRF)
+//===================================
+
+//Interface:
+//========== Static Parameters (Explicit To Fast Composite) ==========
+//r1
+//CSC Set 0
+
+
+// Unsigned-dword (4-byte element) view of the static parameters starting at r1.0;
+// declared under the "CSC Set 0" heading of the r1 layout.
+.declare udCSC_CURBE Base=r1.0 ElementSize=4 Type=ud
+
+//Constant alpha
+
+
+//r2
+
+
+// WiDi Definitions
+
+
+//Colorfill
+
+
+// Byte view starting at r2.20 (Colorfill section). The <0;1,0> source region
+// is a scalar region: a read through this name replicates one byte to every channel.
+.declare ubCOLOR_PIXEL_VAL Base=r2.20 ElementSize=1 SrcRegion=<0;1,0> DstRegion=<1> Type=ub
+
+//r3
+//Normalised Ratio of Horizontal step size with main video for all layers
+
+
+ //Normalised Ratio of Horizontal step size with main video for all layers becomes
+ //Normalised Horizontal step size for all layers in VP_Setup.asm
+
+
+//r4
+//Normalised Vertical step size for all layers
+
+
+//r5
+//Normalised Vertical Frame Origin for all layers
+
+
+//r6
+//Normalised Horizontal Frame Origin for all layers
+
+
+//========== Inline Parameters (Explicit To Fast Composite) ==========
+
+
+//Main video Step X
+
+
+//====================== Binding table (Explicit To Fast Composite)=========================================
+
+
+//Used by Interlaced Scaling Kernels
+
+
+//========== Sampler State Table Index (Explicit To Fast Composite)==========
+//Sampler Index for AVS/IEF messages
+
+
+//Sampler Index for SIMD16 sampler messages
+
+
+//=============================================================================
+
+// Six working buffers, each exposed through five typed views of the same registers:
+//   BUFFER_0 = r64, BUFFER_1 = r80, BUFFER_2 = r96, BUFFER_3 = r112 (bases 16 GRFs apart),
+//   BUFFER_4 = r28, BUFFER_5 = r46 (bases 18 GRFs apart).
+// The name prefix selects the element type: f (float), ud (dword), uw (word),
+// ub (byte) and ub4 (byte with a stride of 4).
+
+// Float views: 4-byte elements, 8 per row.
+.declare fBUFFER_0 Base=r64.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_1 Base=r80.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_2 Base=r96.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_3 Base=r112.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_4 Base=r28.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+.declare fBUFFER_5 Base=r46.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=f
+
+// Unsigned-dword views: same bases and regions as the float views.
+.declare udBUFFER_0 Base=r64.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_1 Base=r80.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_2 Base=r96.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_3 Base=r112.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_4 Base=r28.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+.declare udBUFFER_5 Base=r46.0 ElementSize=4 SrcRegion=<8;8,1> DstRegion=<1> Type=ud
+
+// Unsigned-word views: 2-byte elements, 16 per row.
+.declare uwBUFFER_0 Base=r64.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_1 Base=r80.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_2 Base=r96.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_3 Base=r112.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_4 Base=r28.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+.declare uwBUFFER_5 Base=r46.0 ElementSize=2 SrcRegion=<16;16,1> DstRegion=<1> Type=uw
+
+// Unsigned-byte views: contiguous bytes, 16 per row.
+.declare ubBUFFER_0 Base=r64.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_1 Base=r80.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_2 Base=r96.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_3 Base=r112.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_4 Base=r28.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+.declare ubBUFFER_5 Base=r46.0 ElementSize=1 SrcRegion=<16;16,1> DstRegion=<1> Type=ub
+
+// Strided byte views: horizontal stride 4 on both source and destination,
+// i.e. one byte out of every four (8 elements per 32-byte row).
+.declare ub4BUFFER_0 Base=r64.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_1 Base=r80.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_2 Base=r96.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_3 Base=r112.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_4 Base=r28.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+.declare ub4BUFFER_5 Base=r46.0 ElementSize=1 SrcRegion=<32;8,4> DstRegion=<4> Type=ub
+
+//Pointer to mask reg
+
+
+//r18
+
+
+//Always keep Channel Pointers and Offsets in same GRF, so that we can use
+// NODDCLR, NODDCHK flags. -rT
+
+
+// Dword view of r18 (one GRF) named for CSC coefficients.
+.declare udCSC_COEFF_0 Base=r18.0 ElementSize=4 Type=ud // 1 GRF
+
+//r19
+
+
+// Dword view of r19 (one GRF), second CSC coefficient register.
+.declare udCSC_COEFF_1 Base=r19.0 ElementSize=4 Type=ud // 1 GRF
+
+
+//r20
+
+// Word view of r20 (16 words), named as the temporary copy of the alpha mask.
+.declare uwALPHA_MASK_REG_TEMP Base=r20.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw // 1 GRF
+
+//r21
+
+// Word view of r21 (16 words), the alpha mask register.
+.declare uwALPHA_MASK_REG Base=r21.0 ElementSize=2 SrcRegion=<16;16,1> Type=uw // 1 GRF
+
+//r22
+
+
+//Always keep Channel Pointers and Offsets in same GRF, so that we can use
+// NODDCLR, NODDCHK flags. -rT
+
+
+//Keep fORIGIN_X_NLAS, fY_OFFSET_2ND_BLOCK, fSTEP_X_NLAS, pMSGDSC_COPY, ubCONST_ALPHA_COPY as
+//sub registers of same GRF to enable using NODDCLR NODDCHK. -rT
+
+//r23
+
+
+//Lumakey
+
+
+//r24
+
+
+//r25
+
+
+//r26
+
+
+//defines to generate LABELS during compile time.
+
+
+//Setup pointer to the inline parameter
+
+// Copy MSG HDR
+// r27 is the MSGSRC register of the GRF partition; it starts as a copy of r0.
+ mov (8) r27.0<1>:ud r0.0<8;8,1>:ud // Initialize message payload header with R0
+
+
+//temp; remove it once unread msg warnings are resolved -vK
+// r25 and r26 are likewise preloaded with r0 (all 8 dwords each).
+mov (8) r25:ud r0.0<8;8,1>:ud
+mov (8) r26:ud r0.0<8;8,1>:ud
+
+// Calculate StepX for all layers and overwrite it on the ratio
+// r3 holds one ratio per layer (8 floats); each is multiplied in place by the scalar
+// r7.4, broadcast through the <0;1,0> region. The trailing comment gives the definition
+// of the ratio, so the product is StepX itself.
+// NOTE(review): r7.4 is presumably the "Main video Step X" inline parameter - confirm.
+ mul (8) r3.0<1>:f r3.0<8;8,1>:f r7.4<0;1,0>:f //StepX_ratio = StepX / VideoStepX
+
+ //Normalised Ratio of Horizontal step size with main video for all layers now becomes
+ //Normalised Horizontal step size for all layers
+
+// Calculate block origin for all layers and overwrite it on the frame origin
+// The two words at r7.0/r7.1 are converted to floats in r8.5/r8.6; below, r8.5
+// scales the horizontal step and r8.6 the vertical step.
+ mov (2) r8.5<1>:f r7.0<2;2,1>:w //Convert origin from word to float
+
+// f0.0 is set when byte r2.26 equals 1; the horizontal update is predicated on
+// the inverted flag, so it is skipped in that case.
+// NOTE(review): the meaning of r2.26 == 1 is not visible in this module - confirm.
+ cmp.e.f0.0 (8) null<1>:d r2.26:ub 1:uw
+
+// Horizontal: r6 = r6 + r3 * r8.5 (frame origin X + StepX * block X), all 8 layers.
+ (-f0.0)mov (8) acc0:f r6.0<8;8,1>:f
+ (-f0.0)mac (8) r6.0<1>:f r3.0<8;8,1>:f r8.5<0;1,0>:f
+
+// Vertical: r5 = r5 + r4 * r8.6 (frame origin Y + StepY * block Y), unconditional.
+ mov (8) acc0:f r5.0<8;8,1>:f
+ mac (8) r5.0<1>:f r4.0<8;8,1>:f r8.6<0;1,0>:f
+
+// Calculate X(8) and Y(4), Y(8), Y(12) -vK
--- /dev/null
+// Module name: AVS
+// Kernel definition: the kernel body is assembled from the pre-processed modules
+// below, included in the order listed (setup, layer 0 parameters, buffer pointer
+// setup, the four PL2 AVS buffer modules, the NV12 save, and end-of-thread).
+.kernel AVS
+.code
+
+#include "VP_Setup.g4a"
+#include "Set_Layer_0.g4a"
+#include "Set_AVS_Buf_0123_PL2.g4a"
+#include "PL2_AVS_Buf_0.g4a"
+#include "PL2_AVS_Buf_1.g4a"
+#include "PL2_AVS_Buf_2.g4a"
+#include "PL2_AVS_Buf_3.g4a"
+#include "Save_AVS_NV12.g4a"
+#include "EOT.g4a"
+
+.end_code
+
+.end_kernel
+
+// end of AVS
--- /dev/null
+ { 0x00600001, 0x23600021, 0x008d0000, 0x00000000 },
+ { 0x00600001, 0x23200021, 0x008d0000, 0x00000000 },
+ { 0x00600001, 0x23400021, 0x008d0000, 0x00000000 },
+ { 0x00600041, 0x206077bd, 0x008d0060, 0x000000f0 },
+ { 0x00200001, 0x211401bd, 0x004500e0, 0x00000000 },
+ { 0x01600010, 0x20002e24, 0x0000005a, 0x00010001 },
+ { 0x00710001, 0x240003bc, 0x008d00c0, 0x00000000 },
+ { 0x00710048, 0x20c077bd, 0x008d0060, 0x00000114 },
+ { 0x00600001, 0x240003bc, 0x008d00a0, 0x00000000 },
+ { 0x00600048, 0x20a077bd, 0x008d0080, 0x00000118 },
+ { 0x00800001, 0x22a00169, 0x00000000, 0xffffffff },
+ { 0x02000010, 0x20002d24, 0x020000e4, 0x00000000 },
+ { 0x02010010, 0x20002d24, 0x020000e6, 0x00000000 },
+ { 0x00110220, 0x34001c00, 0x02001400, 0x00000018 },
+ { 0x00000401, 0x233803bd, 0x000000f4, 0x00000000 },
+ { 0x00000c01, 0x233003bd, 0x00000060, 0x00000000 },
+ { 0x00000c01, 0x233403bd, 0x00000080, 0x00000000 },
+ { 0x00000c01, 0x232803bd, 0x000000c0, 0x00000000 },
+ { 0x00000801, 0x232c03bd, 0x000000a0, 0x00000000 },
+ { 0x00000001, 0x240803bc, 0x000000c0, 0x00000000 },
+ { 0x00000048, 0x24087fbc, 0x00000060, 0x41000000 },
+ { 0x00000448, 0x22e87fbd, 0x000000f4, 0x41e00000 },
+ { 0x00000c41, 0x22e47fbd, 0x00000080, 0x40800000 },
+ { 0x00000001, 0x241003bc, 0x00000060, 0x00000000 },
+ { 0x00000c48, 0x22f07fbd, 0x000000f4, 0x41000000 },
+ { 0x00000801, 0x22f40061, 0x00000000, 0x00000000 },
+ { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+ { 0x00400001, 0x2400036c, 0x00000000, 0x00006ea2 },
+ { 0x00400040, 0x24002d8c, 0x00690400, 0x00460046 },
+ { 0x00400009, 0x22c02d8d, 0x00690400, 0x00050005 },
+ { 0x00400001, 0x2400036c, 0x00000000, 0x00006204 },
+ { 0x00400040, 0x24002d8c, 0x00690400, 0x00400040 },
+ { 0x00400409, 0x22402d8d, 0x00690400, 0x00050005 },
+ { 0x00000801, 0x22500061, 0x00000000, 0x01000100 },
+ { 0x00000001, 0x22d00061, 0x00000000, 0x00400040 },
+ { 0x00110220, 0x34001c00, 0x02001400, 0x00000012 },
+ { 0x00000040, 0x22000c20, 0x000002f4, 0x044eb400 },
+ { 0x00000001, 0x22080061, 0x00000000, 0x0000d000 },
+ { 0x00000401, 0x233c0021, 0x000000fc, 0x00000000 },
+ { 0x00000801, 0x23240121, 0x000000f8, 0x00000000 },
+ { 0x00600001, 0x22200021, 0x008d0320, 0x00000000 },
+ { 0x02000031, 0x28000229, 0x00000200, 0x00000200 },
+ { 0x00000040, 0x22000c20, 0x000002f4, 0x048eb801 },
+ { 0x00000001, 0x22080061, 0x00000000, 0x0000a000 },
+ { 0x02000031, 0x28800229, 0x00000200, 0x00000200 },
+ { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+ { 0x00110220, 0x34001c00, 0x02001400, 0x00000010 },
+ { 0x00000040, 0x22000c20, 0x000002f4, 0x044eb400 },
+ { 0x00000001, 0x22080061, 0x00000000, 0x0000d000 },
+ { 0x00000040, 0x23240d21, 0x000000f8, 0x00000001 },
+ { 0x00600001, 0x22200021, 0x008d0320, 0x00000000 },
+ { 0x02000031, 0x2a000229, 0x00000200, 0x00000200 },
+ { 0x00000040, 0x22000c20, 0x000002f4, 0x048eb801 },
+ { 0x00000001, 0x22080061, 0x00000000, 0x0000a000 },
+ { 0x02000031, 0x2a800229, 0x00000200, 0x00000200 },
+ { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+ { 0x00110220, 0x34001c00, 0x02001400, 0x00000010 },
+ { 0x00000040, 0x22000c20, 0x000002f4, 0x044eb400 },
+ { 0x00000001, 0x22080061, 0x00000000, 0x0000d000 },
+ { 0x00000040, 0x23240d21, 0x000000f8, 0x00000002 },
+ { 0x00600001, 0x22200021, 0x008d0320, 0x00000000 },
+ { 0x02000031, 0x2c000229, 0x00000200, 0x00000200 },
+ { 0x00000040, 0x22000c20, 0x000002f4, 0x048eb801 },
+ { 0x00000001, 0x22080061, 0x00000000, 0x0000a000 },
+ { 0x02000031, 0x2c800229, 0x00000200, 0x00000200 },
+ { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+ { 0x00110220, 0x34001c00, 0x02001400, 0x00000010 },
+ { 0x00000040, 0x22000c20, 0x000002f4, 0x044eb400 },
+ { 0x00000001, 0x22080061, 0x00000000, 0x0000d000 },
+ { 0x00000040, 0x23240d21, 0x000000f8, 0x00000003 },
+ { 0x00600001, 0x22200021, 0x008d0320, 0x00000000 },
+ { 0x02000031, 0x2e000229, 0x00000200, 0x00000200 },
+ { 0x00000040, 0x22000c20, 0x000002f4, 0x048eb801 },
+ { 0x00000001, 0x22080061, 0x00000000, 0x0000a000 },
+ { 0x02000031, 0x2e800229, 0x00000200, 0x00000200 },
+ { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+ { 0x00400040, 0x22002da8, 0x006902c0, 0x00000000 },
+ { 0x00400040, 0x22082da8, 0x006902c0, 0x02000200 },
+ { 0x00600001, 0x23800021, 0x008d0360, 0x00000000 },
+ { 0x00600001, 0x24a00021, 0x008d0360, 0x00000000 },
+ { 0x00200401, 0x238001a5, 0x004500e0, 0x00000000 },
+ { 0x00000401, 0x24a001a5, 0x000000e0, 0x00000000 },
+ { 0x00000c08, 0x24a43da5, 0x000000e2, 0x00010001 },
+ { 0x00000801, 0x23880061, 0x00000000, 0x000f000f },
+ { 0x00000801, 0x24a80061, 0x00000000, 0x0007000f },
+ { 0x80800040, 0x28002d29, 0x00b10800, 0x00800080 },
+ { 0x80800040, 0x28202d29, 0x00b10820, 0x00800080 },
+ { 0x80800040, 0x28802d29, 0x00b10880, 0x00800080 },
+ { 0x80800040, 0x28a02d29, 0x00b108a0, 0x00800080 },
+ { 0x80800040, 0x28c02d29, 0x00b108c0, 0x00800080 },
+ { 0x80800040, 0x28e02d29, 0x00b108e0, 0x00800080 },
+ { 0x80800040, 0x29002d29, 0x00b10900, 0x00800080 },
+ { 0x80800040, 0x29202d29, 0x00b10920, 0x00800080 },
+ { 0x80800040, 0x29802d29, 0x00b10980, 0x00800080 },
+ { 0x80800040, 0x29a02d29, 0x00b109a0, 0x00800080 },
+ { 0x80800040, 0x29c02d29, 0x00b109c0, 0x00800080 },
+ { 0x80800040, 0x29e02d29, 0x00b109e0, 0x00800080 },
+ { 0x80800040, 0x2a002d29, 0x00b10a00, 0x00800080 },
+ { 0x80800040, 0x2a202d29, 0x00b10a20, 0x00800080 },
+ { 0x80800040, 0x2a802d29, 0x00b10a80, 0x00800080 },
+ { 0x80800040, 0x2aa02d29, 0x00b10aa0, 0x00800080 },
+ { 0x80800040, 0x2ac02d29, 0x00b10ac0, 0x00800080 },
+ { 0x80800040, 0x2ae02d29, 0x00b10ae0, 0x00800080 },
+ { 0x80800040, 0x2b002d29, 0x00b10b00, 0x00800080 },
+ { 0x80800040, 0x2b202d29, 0x00b10b20, 0x00800080 },
+ { 0x80800040, 0x2b802d29, 0x00b10b80, 0x00800080 },
+ { 0x80800040, 0x2ba02d29, 0x00b10ba0, 0x00800080 },
+ { 0x80800040, 0x2bc02d29, 0x00b10bc0, 0x00800080 },
+ { 0x80800040, 0x2be02d29, 0x00b10be0, 0x00800080 },
+ { 0x80800040, 0x2c002d29, 0x00b10c00, 0x00800080 },
+ { 0x80800040, 0x2c202d29, 0x00b10c20, 0x00800080 },
+ { 0x80800040, 0x2c802d29, 0x00b10c80, 0x00800080 },
+ { 0x80800040, 0x2ca02d29, 0x00b10ca0, 0x00800080 },
+ { 0x80800040, 0x2cc02d29, 0x00b10cc0, 0x00800080 },
+ { 0x80800040, 0x2ce02d29, 0x00b10ce0, 0x00800080 },
+ { 0x80800040, 0x2d002d29, 0x00b10d00, 0x00800080 },
+ { 0x80800040, 0x2d202d29, 0x00b10d20, 0x00800080 },
+ { 0x80800040, 0x2d802d29, 0x00b10d80, 0x00800080 },
+ { 0x80800040, 0x2da02d29, 0x00b10da0, 0x00800080 },
+ { 0x80800040, 0x2dc02d29, 0x00b10dc0, 0x00800080 },
+ { 0x80800040, 0x2de02d29, 0x00b10de0, 0x00800080 },
+ { 0x80800040, 0x2e002d29, 0x00b10e00, 0x00800080 },
+ { 0x80800040, 0x2e202d29, 0x00b10e20, 0x00800080 },
+ { 0x80800040, 0x2e802d29, 0x00b10e80, 0x00800080 },
+ { 0x80800040, 0x2ea02d29, 0x00b10ea0, 0x00800080 },
+ { 0x80800040, 0x2ec02d29, 0x00b10ec0, 0x00800080 },
+ { 0x80800040, 0x2ee02d29, 0x00b10ee0, 0x00800080 },
+ { 0x80800040, 0x2f002d29, 0x00b10f00, 0x00800080 },
+ { 0x80800040, 0x2f202d29, 0x00b10f20, 0x00800080 },
+ { 0x80800040, 0x2f802d29, 0x00b10f80, 0x00800080 },
+ { 0x80800040, 0x2fa02d29, 0x00b10fa0, 0x00800080 },
+ { 0x80800040, 0x2fc02d29, 0x00b10fc0, 0x00800080 },
+ { 0x80800040, 0x2fe02d29, 0x00b10fe0, 0x00800080 },
+ { 0x00800401, 0x23a00231, 0x00d28401, 0x00000000 },
+ { 0x00800801, 0x23b00231, 0x00d28421, 0x00000000 },
+ { 0x00800401, 0x23c00231, 0x00d28441, 0x00000000 },
+ { 0x00800801, 0x23d00231, 0x00d28461, 0x00000000 },
+ { 0x00600401, 0x44c00231, 0x00cf8801, 0x00000000 },
+ { 0x00600c01, 0x44d00231, 0x00cf8841, 0x00000000 },
+ { 0x00600c01, 0x44c10231, 0x00cf8001, 0x00000000 },
+ { 0x00600801, 0x44d10231, 0x00cf8041, 0x00000000 },
+ { 0x00400040, 0x22002da8, 0x006902c0, 0x04000400 },
+ { 0x00800401, 0x23e00231, 0x00d29401, 0x00000000 },
+ { 0x00800801, 0x23f00231, 0x00d29421, 0x00000000 },
+ { 0x00800401, 0x24000231, 0x00d29441, 0x00000000 },
+ { 0x00800801, 0x24100231, 0x00d29461, 0x00000000 },
+ { 0x00600401, 0x44e00231, 0x00cf9801, 0x00000000 },
+ { 0x00600c01, 0x44f00231, 0x00cf9841, 0x00000000 },
+ { 0x00600c01, 0x44e10231, 0x00cf9001, 0x00000000 },
+ { 0x00600801, 0x44f10231, 0x00cf9041, 0x00000000 },
+ { 0x00400040, 0x22082da8, 0x006902c0, 0x06000600 },
+ { 0x00800401, 0x24200231, 0x00d28401, 0x00000000 },
+ { 0x00800801, 0x24300231, 0x00d28421, 0x00000000 },
+ { 0x00800401, 0x24400231, 0x00d28441, 0x00000000 },
+ { 0x00800801, 0x24500231, 0x00d28461, 0x00000000 },
+ { 0x00600401, 0x45000231, 0x00cf8801, 0x00000000 },
+ { 0x00600c01, 0x45100231, 0x00cf8841, 0x00000000 },
+ { 0x00600c01, 0x45010231, 0x00cf8001, 0x00000000 },
+ { 0x00600801, 0x45110231, 0x00cf8041, 0x00000000 },
+ { 0x00800401, 0x24600231, 0x00d29401, 0x00000000 },
+ { 0x00800801, 0x24700231, 0x00d29421, 0x00000000 },
+ { 0x00800401, 0x24800231, 0x00d29441, 0x00000000 },
+ { 0x00800801, 0x24900231, 0x00d29461, 0x00000000 },
+ { 0x00600401, 0x45200231, 0x00cf9801, 0x00000000 },
+ { 0x00600c01, 0x45300231, 0x00cf9841, 0x00000000 },
+ { 0x00600c01, 0x45210231, 0x00cf9001, 0x00000000 },
+ { 0x00600801, 0x45310231, 0x00cf9041, 0x00000000 },
+ { 0x05000031, 0x20000e24, 0x00000380, 0x120a8018 },
+ { 0x05000031, 0x20000e24, 0x000004a0, 0x0a0a8019 },
+ { 0x00600001, 0x2fe00021, 0x008d0000, 0x00000000 },
+ { 0x07000031, 0x20001e24, 0x00000fe0, 0x82000010 },
--- /dev/null
+// Module name: DNDI
+//
+// Top-level definition of the DNDI kernel. No instructions are written
+// inline here: the code section is composed entirely of the three
+// include files below, pulled in in the order listed.
+// NOTE(review): going by the file names only, these appear to be a core
+// processing stage, an NV12 16x4 save stage, and an end-of-thread
+// stage -- confirm against the included sources.
+.kernel DNDI
+.code
+
+// Keep this order: the includes are expanded in place, so their order
+// is the order of the code in the assembled kernel.
+#include "DI_Core.g4a"
+#include "DI_Save_NV12_16x4.g4a"
+#include "EOT.g4a"
+
+.end_code
+
+.end_kernel
+
+// end of DNDI
--- /dev/null
+ { 0x00600001, 0x22400021, 0x008d0000, 0x00000000 },
+ { 0x00000401, 0x226801ad, 0x000000e0, 0x00000000 },
+ { 0x00000801, 0x227801ad, 0x000000e2, 0x00000000 },
+ { 0x02600031, 0x25c00e21, 0x00000240, 0x04ae8003 },
+ { 0x00200001, 0x20e0012d, 0x004506fc, 0x00000000 },
+ { 0x00600001, 0x22800021, 0x008d0000, 0x00000000 },
+ { 0x00600001, 0x22a00021, 0x008d06c0, 0x00000000 },
+ { 0x00000408, 0x22803da1, 0x000000e0, 0x00010001 },
+ { 0x00000c01, 0x228401a1, 0x000000e2, 0x00000000 },
+ { 0x00000801, 0x22880061, 0x00000000, 0x00030007 },
+ { 0x05600031, 0x20000e24, 0x00000280, 0x040a8021 },
+ { 0x00600001, 0x23200061, 0x00000000, 0x00000000 },
+ { 0x00600001, 0x23000021, 0x008d0000, 0x00000000 },
+ { 0x00000408, 0x23003da1, 0x000000e0, 0x00010001 },
+ { 0x00000041, 0x24043da0, 0x000000e2, 0x00030003 },
+ { 0x00000c08, 0x23043c01, 0x00000404, 0x00020002 },
+ { 0x00000801, 0x23080061, 0x00000000, 0x00020007 },
+ { 0x00200040, 0x23002421, 0x00450300, 0x00450038 },
+ { 0x00000401, 0x23200021, 0x000006e4, 0x00000000 },
+ { 0x00200c01, 0x432c0021, 0x004506ec, 0x00000000 },
+ { 0x00200801, 0x43280021, 0x004506f4, 0x00000000 },
+ { 0x05600031, 0x20000e24, 0x00000300, 0x040a8021 },
+ { 0x00200401, 0x236001a5, 0x004500e0, 0x00000000 },
+ { 0x00000801, 0x23680061, 0x00000000, 0x0003000f },
+ { 0x00600001, 0x22600021, 0x008d05c0, 0x00000000 },
+ { 0x00600001, 0x22800021, 0x008d05e0, 0x00000000 },
+ { 0x00600001, 0x23000021, 0x008d0640, 0x00000000 },
+ { 0x00600001, 0x23200021, 0x008d0660, 0x00000000 },
+ { 0x00600001, 0x22400021, 0x008d0360, 0x00000000 },
+ { 0x00600001, 0x22e00021, 0x008d0360, 0x00000000 },
+ { 0x0000040c, 0x23643ca5, 0x00000364, 0x00010001 },
+ { 0x00000801, 0x23680061, 0x00000000, 0x0001000f },
+ { 0x00800401, 0x42a00231, 0x00ce0601, 0x00000000 },
+ { 0x00800801, 0x42a10231, 0x00ce0600, 0x00000000 },
+ { 0x00800401, 0x43400231, 0x00ce0681, 0x00000000 },
+ { 0x00800801, 0x43410231, 0x00ce0680, 0x00000000 },
+ { 0x00600001, 0x22a00021, 0x008d0360, 0x00000000 },
+ { 0x00600001, 0x23400021, 0x008d0360, 0x00000000 },
+ { 0x05600031, 0x20000e24, 0x00000240, 0x060a801b },
+ { 0x05600031, 0x20000e24, 0x000002e0, 0x060a801e },
+ { 0x05600031, 0x20000e24, 0x000002a0, 0x040a801c },
+ { 0x05600031, 0x20000e24, 0x00000340, 0x040a801f },
+ { 0x00600001, 0x2fe00021, 0x008d0000, 0x00000000 },
+ { 0x07000031, 0x20001e24, 0x00000fe0, 0x82000010 },