From 2f6eec65acc953faed3951fb8f1eabed830fdd2f Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Wed, 5 Feb 2014 07:59:45 +0000
Subject: [PATCH] vp8: fix PPC assembly to work if src_stride != dst_stride

Signed-off-by: Anton Khirnov <anton@khirnov.net>
Signed-off-by: Janne Grunau <janne-libav@jannau.net>
---
 libavcodec/ppc/vp8dsp_altivec.c | 49 +++++++++++++++++++++++++++++++++++------
 1 file changed, 42 insertions(+), 7 deletions(-)

diff --git a/libavcodec/ppc/vp8dsp_altivec.c b/libavcodec/ppc/vp8dsp_altivec.c
index 504139da97..2e6d0faeb7 100644
--- a/libavcodec/ppc/vp8dsp_altivec.c
+++ b/libavcodec/ppc/vp8dsp_altivec.c
@@ -241,15 +241,15 @@ void put_vp8_epel ## WIDTH ## _v ## TAPS ## _altivec(uint8_t *dst, ptrdiff_t dst
 }
 
 #define EPEL_HV(WIDTH, HTAPS, VTAPS) \
-static void put_vp8_epel ## WIDTH ## _h ## HTAPS ## v ## VTAPS ## _altivec(uint8_t *dst, ptrdiff_t stride, uint8_t *src, ptrdiff_t s, int h, int mx, int my) \
+static void put_vp8_epel ## WIDTH ## _h ## HTAPS ## v ## VTAPS ## _altivec(uint8_t *dst, ptrdiff_t dstride, uint8_t *src, ptrdiff_t sstride, int h, int mx, int my) \
 { \
     DECLARE_ALIGNED(16, uint8_t, tmp)[(2*WIDTH+5)*16]; \
     if (VTAPS == 6) { \
-        put_vp8_epel ## WIDTH ## _h ## HTAPS ## _altivec(tmp, 16,     src-2*stride, stride, h+5, mx, my); \
-        put_vp8_epel ## WIDTH ## _v ## VTAPS ## _altivec(dst, stride, tmp+2*16,     16,     h,   mx, my); \
+        put_vp8_epel ## WIDTH ## _h ## HTAPS ## _altivec(tmp, 16,      src-2*sstride, sstride, h+5, mx, my); \
+        put_vp8_epel ## WIDTH ## _v ## VTAPS ## _altivec(dst, dstride, tmp+2*16,      16,      h,   mx, my); \
     } else { \
-        put_vp8_epel ## WIDTH ## _h ## HTAPS ## _altivec(tmp, 16,     src-stride, stride, h+4, mx, my); \
-        put_vp8_epel ## WIDTH ## _v ## VTAPS ## _altivec(dst, stride, tmp+16,     16,     h,   mx, my); \
+        put_vp8_epel ## WIDTH ## _h ## HTAPS ## _altivec(tmp, 16,      src-sstride, sstride, h+4, mx, my); \
+        put_vp8_epel ## WIDTH ## _v ## VTAPS ## _altivec(dst, dstride, tmp+16,      16,      h,   mx, my); \
     } \
 }
 
@@ -269,9 +269,44 @@ EPEL_HV(4,  4,6)
 EPEL_HV(4,  6,4)
 EPEL_HV(4,  4,4)
 
-static void put_vp8_pixels16_altivec(uint8_t *dst, ptrdiff_t stride, uint8_t *src, ptrdiff_t s, int h, int mx, int my)
+static void put_vp8_pixels16_altivec(uint8_t *dst, ptrdiff_t dstride, uint8_t *src, ptrdiff_t sstride, int h, int mx, int my)
 {
-    ff_put_pixels16_altivec(dst, src, stride, h);
+    register vector unsigned char pixelsv1, pixelsv2;
+    register vector unsigned char pixelsv1B, pixelsv2B;
+    register vector unsigned char pixelsv1C, pixelsv2C;
+    register vector unsigned char pixelsv1D, pixelsv2D;
+
+    register vector unsigned char perm = vec_lvsl(0, src);
+    int i;
+    register ptrdiff_t dstride2 = dstride << 1, sstride2 = sstride << 1;
+    register ptrdiff_t dstride3 = dstride2 + dstride, sstride3 = sstride + sstride2;
+    register ptrdiff_t dstride4 = dstride << 2, sstride4 = sstride << 2;
+
+// hand-unrolling the loop by 4 gains about 15%
+// mininum execution time goes from 74 to 60 cycles
+// it's faster than -funroll-loops, but using
+// -funroll-loops w/ this is bad - 74 cycles again.
+// all this is on a 7450, tuning for the 7450
+    for (i = 0; i < h; i += 4) {
+        pixelsv1  = vec_ld( 0, src);
+        pixelsv2  = vec_ld(15, src);
+        pixelsv1B = vec_ld(sstride, src);
+        pixelsv2B = vec_ld(15 + sstride, src);
+        pixelsv1C = vec_ld(sstride2, src);
+        pixelsv2C = vec_ld(15 + sstride2, src);
+        pixelsv1D = vec_ld(sstride3, src);
+        pixelsv2D = vec_ld(15 + sstride3, src);
+        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
+               0, (unsigned char*)dst);
+        vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
+               dstride, (unsigned char*)dst);
+        vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
+               dstride2, (unsigned char*)dst);
+        vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
+               dstride3, (unsigned char*)dst);
+        src += sstride4;
+        dst += dstride4;
+    }
 }
 
 #endif /* HAVE_ALTIVEC */
-- 
2.11.0