original

[gb-231r1-is01/GB_2.3_IS01.git] / system / core / libpixelflinger / t32cb16blend.S
diff --git a/system/core/libpixelflinger/t32cb16blend.S b/system/core/libpixelflinger/t32cb16blend.S

new file mode 100644 (file)

index 0000000..caf9eb7
--- /dev/null
+++ b/system/core/libpixelflinger/t32cb16blend.S
@@ -0,0 +1,202 @@
+/* libs/pixelflinger/t32cb16blend.S
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+
+       .text
+       .align
+       
+       .global scanline_t32cb16blend_arm
+
+
+/*
+ * .macro pixel
+ *
+ * \DREG is a 32-bit register containing *two* original destination RGB565 
+ *       pixels, with the even one in the low-16 bits, and the odd one in the
+ *       high 16 bits.
+ *
+ * \SRC is a 32-bit 0xAABBGGRR pixel value, with pre-multiplied colors.
+ *
+ * \FB is a target register that will contain the blended pixel values.
+ *
+ * \ODD is either 0 or 1 and indicates if we're blending the lower or 
+ *      upper 16-bit pixels in DREG into FB
+ *
+ *
+ * clobbered: r6, r7, lr
+ *
+ */
+
+.macro pixel,   DREG, SRC, FB, ODD
+
+    // SRC = 0xAABBGGRR
+    mov     r7, \SRC, lsr #24           // sA
+    add     r7, r7, r7, lsr #7          // sA + (sA >> 7)
+    rsb     r7, r7, #0x100              // sA = 0x100 - (sA+(sA>>7))
+
+1:
+
+.if \ODD
+
+    // red
+    mov     lr, \DREG, lsr #(16 + 11)
+    smulbb  lr, r7, lr
+    mov     r6, \SRC, lsr #3
+    and     r6, r6, #0x1F
+    add     lr, r6, lr, lsr #8
+    cmp     lr, #0x1F
+    orrhs   \FB, \FB, #(0x1F<<(16 + 11))
+    orrlo   \FB, \FB, lr, lsl #(16 + 11)
+
+        // green
+        and     r6, \DREG, #(0x3F<<(16 + 5))
+        smulbt  r6, r7, r6
+        mov     lr, \SRC, lsr #(8+2)
+        and     lr, lr, #0x3F
+        add     r6, lr, r6, lsr #(5+8)
+        cmp     r6, #0x3F
+        orrhs   \FB, \FB, #(0x3F<<(16 + 5))
+        orrlo   \FB, \FB, r6, lsl #(16 + 5)
+
+            // blue
+            and     lr, \DREG, #(0x1F << 16)
+            smulbt  lr, r7, lr
+            mov     r6, \SRC, lsr #(8+8+3)
+            and     r6, r6, #0x1F
+            add     lr, r6, lr, lsr #8
+            cmp     lr, #0x1F
+            orrhs   \FB, \FB, #(0x1F << 16)
+            orrlo   \FB, \FB, lr, lsl #16
+
+.else
+
+    // red
+    mov     lr, \DREG, lsr #11
+    and     lr, lr, #0x1F
+    smulbb  lr, r7, lr
+    mov     r6, \SRC, lsr #3
+    and     r6, r6, #0x1F
+    add     lr, r6, lr, lsr #8
+    cmp     lr, #0x1F
+    movhs   \FB, #(0x1F<<11)
+    movlo   \FB, lr, lsl #11
+
+
+        // green
+        and     r6, \DREG, #(0x3F<<5)
+        smulbb  r6, r7, r6
+        mov     lr, \SRC, lsr #(8+2)
+        and     lr, lr, #0x3F
+        add     r6, lr, r6, lsr #(5+8)
+        cmp     r6, #0x3F
+        orrhs   \FB, \FB, #(0x3F<<5)
+        orrlo   \FB, \FB, r6, lsl #5
+
+            // blue
+            and     lr, \DREG, #0x1F
+            smulbb  lr, r7, lr
+            mov     r6, \SRC, lsr #(8+8+3)
+            and     r6, r6, #0x1F
+            add     lr, r6, lr, lsr #8
+            cmp     lr, #0x1F
+            orrhs   \FB, \FB, #0x1F
+            orrlo   \FB, \FB, lr
+
+.endif
+
+    .endm
+    
+
+// r0:  dst ptr
+// r1:  src ptr
+// r2:  count
+// r3:  d
+// r4:  s0
+// r5:  s1
+// r6:  pixel
+// r7:  pixel
+// r8:  free
+// r9:  free
+// r10: free
+// r11: free
+// r12: scratch
+// r14: pixel
+
+scanline_t32cb16blend_arm:
+    stmfd      sp!, {r4-r7, lr}
+
+    pld     [r0]
+    pld     [r1]
+
+    // align DST to 32 bits
+    tst     r0, #0x3
+    beq     aligned
+    subs    r2, r2, #1
+    ldmlofd    sp!, {r4-r7, lr}        // return
+    bxlo    lr
+
+last:
+    ldr     r4, [r1], #4
+    ldrh    r3, [r0]
+    pixel   r3, r4, r12, 0
+    strh    r12, [r0], #2
+
+aligned:
+    subs    r2, r2, #2
+    blo     9f
+
+    // The main loop is unrolled twice and processes 4 pixels
+8:  ldmia   r1!, {r4, r5}
+    // stream the source
+    pld     [r1, #32]
+    add     r0, r0, #4
+    // it's all zero, skip this pixel
+    orrs    r3, r4, r5
+    beq     7f
+    
+    // load the destination
+    ldr     r3, [r0, #-4]
+    // stream the destination
+    pld     [r0, #32]
+    pixel   r3, r4, r12, 0
+    pixel   r3, r5, r12, 1
+    // effectively, we're getting write-combining by virtue of the
+    // cpu's write-back cache.
+    str     r12, [r0, #-4]
+
+    // 2nd iterration of the loop, don't stream anything
+    subs    r2, r2, #2
+    movlt   r4, r5
+    blt     9f
+    ldmia   r1!, {r4, r5}
+    add     r0, r0, #4
+    orrs    r3, r4, r5
+    beq     7f
+    ldr     r3, [r0, #-4]
+    pixel   r3, r4, r12, 0
+    pixel   r3, r5, r12, 16
+    str     r12, [r0, #-4]
+
+    
+7:  subs    r2, r2, #2
+    bhs     8b
+    mov     r4, r5
+
+9:  adds    r2, r2, #1
+    ldmlofd sp!, {r4-r7, lr}        // return
+    bxlo    lr
+    b       last