OSDN Git Service

vc4: Declare the cpu pointers as being modified in NEON asm.
author: Carsten Haitzler (Rasterman) <raster@rasterman.com>
Tue, 8 Jan 2019 16:28:30 +0000 (16:28 +0000)
committer: Emil Velikov <emil.l.velikov@gmail.com>
Wed, 30 Jan 2019 17:33:23 +0000 (17:33 +0000)
Otherwise, the compiler is free to reuse the register containing the input
for another call and assume that the value hasn't been modified.  Fixes
crashes on texture upload/download with current gcc.

We now have to have a temporary for the cpu2 value, since outputs must be
lvalues.

(commit message by anholt)

Fixes: 4d30024238ef ("vc4: Use NEON to speed up utile loads on Pi2.")
(cherry picked from commit 300d3ae8b1445b5060f92c77c0f577f4b7b2c7d6)
[Emil: apply the patch to vc4_tiling_lt.c instead of v3d_cpu_tiling.h]
Signed-off-by: Emil Velikov <emil.velikov@collabora.com>
Conflicts:
src/broadcom/common/v3d_cpu_tiling.h

Squashed with commit:

vc4: Declare the last cpu pointer as being modified in NEON asm.

Earlier commit addressed 7 of the 8 instances available.

v2: Rebase patch back to master (by anholt)

Cc: Carsten Haitzler (Rasterman) <raster@rasterman.com>
Cc: Eric Anholt <eric@anholt.net>
Fixes: 300d3ae8b14 ("vc4: Declare the cpu pointers as being modified in NEON asm.")
Signed-off-by: Emil Velikov <emil.velikov@collabora.com>
(cherry picked from commit 385843ac3ce1b868d9e24fcb2dbc0c8d5f5a7c99)

Conflicts:
src/broadcom/common/v3d_cpu_tiling.h

src/gallium/drivers/vc4/vc4_tiling_lt.c

index df6236b..324a633 100644 (file)
@@ -85,13 +85,13 @@ vc4_load_utile(void *cpu, void *gpu, uint32_t cpu_stride, uint32_t cpp)
                         "vst1.8 d5, [%[cpu]], %[cpu_stride]\n"
                         "vst1.8 d6, [%[cpu]], %[cpu_stride]\n"
                         "vst1.8 d7, [%[cpu]]\n"
-                        :
+                        : [cpu]         "+r"(cpu)
                         : [gpu]         "r"(gpu),
-                          [cpu]         "r"(cpu),
                           [cpu_stride]  "r"(cpu_stride)
                         : "q0", "q1", "q2", "q3");
         } else {
                 assert(gpu_stride == 16);
+                void *cpu2 = cpu + 8;
                 __asm__ volatile (
                         /* Load from the GPU in one shot, no interleave, to
                          * d0-d7.
@@ -109,10 +109,9 @@ vc4_load_utile(void *cpu, void *gpu, uint32_t cpu_stride, uint32_t cpp)
                         "vst1.8 d5, [%[cpu2]],%[cpu_stride]\n"
                         "vst1.8 d6, [%[cpu]]\n"
                         "vst1.8 d7, [%[cpu2]]\n"
-                        :
+                        : [cpu]         "+r"(cpu),
+                          [cpu2]        "+r"(cpu2)
                         : [gpu]         "r"(gpu),
-                          [cpu]         "r"(cpu),
-                          [cpu2]        "r"(cpu + 8),
                           [cpu_stride]  "r"(cpu_stride)
                         : "q0", "q1", "q2", "q3");
         }
@@ -134,13 +133,13 @@ vc4_load_utile(void *cpu, void *gpu, uint32_t cpu_stride, uint32_t cpp)
                         "st1 {v2.D}[1], [%[cpu]], %[cpu_stride]\n"
                         "st1 {v3.D}[0], [%[cpu]], %[cpu_stride]\n"
                         "st1 {v3.D}[1], [%[cpu]]\n"
-                       :
+                        : [cpu]         "+r"(cpu)
                         : [gpu]         "r"(gpu),
-                          [cpu]         "r"(cpu),
                           [cpu_stride]  "r"(cpu_stride)
                         : "v0", "v1", "v2", "v3");
         } else {
                 assert(gpu_stride == 16);
+                void *cpu2 = cpu + 8;
                 __asm__ volatile (
                         /* Load from the GPU in one shot, no interleave, to
                          * d0-d7.
@@ -158,10 +157,9 @@ vc4_load_utile(void *cpu, void *gpu, uint32_t cpu_stride, uint32_t cpp)
                         "st1 {v2.D}[1], [%[cpu2]],%[cpu_stride]\n"
                         "st1 {v3.D}[0], [%[cpu]]\n"
                         "st1 {v3.D}[1], [%[cpu2]]\n"
-                        :
+                        : [cpu]         "+r"(cpu),
+                          [cpu2]        "+r"(cpu2)
                         : [gpu]         "r"(gpu),
-                          [cpu]         "r"(cpu),
-                          [cpu2]        "r"(cpu + 8),
                           [cpu_stride]  "r"(cpu_stride)
                         : "v0", "v1", "v2", "v3");
         }
@@ -196,13 +194,13 @@ vc4_store_utile(void *gpu, void *cpu, uint32_t cpu_stride, uint32_t cpp)
                          * d0-d7.
                          */
                         "vstm %[gpu], {q0, q1, q2, q3}\n"
-                        :
+                        : [cpu]         "+r"(cpu)
                         : [gpu]         "r"(gpu),
-                          [cpu]         "r"(cpu),
                           [cpu_stride]  "r"(cpu_stride)
                         : "q0", "q1", "q2", "q3");
         } else {
                 assert(gpu_stride == 16);
+                void *cpu2 = cpu + 8;
                 __asm__ volatile (
                         /* Load each 16-byte line in 2 parts from the cpu-side
                          * destination.  (vld1 can only store one d-register
@@ -218,10 +216,9 @@ vc4_store_utile(void *gpu, void *cpu, uint32_t cpu_stride, uint32_t cpp)
                         "vld1.8 d7, [%[cpu2]]\n"
                         /* Store to the GPU in one shot, no interleave. */
                         "vstm %[gpu], {q0, q1, q2, q3}\n"
-                        :
+                        : [cpu]         "+r"(cpu),
+                          [cpu2]        "+r"(cpu2)
                         : [gpu]         "r"(gpu),
-                          [cpu]         "r"(cpu),
-                          [cpu2]        "r"(cpu + 8),
                           [cpu_stride]  "r"(cpu_stride)
                         : "q0", "q1", "q2", "q3");
         }
@@ -241,13 +238,13 @@ vc4_store_utile(void *gpu, void *cpu, uint32_t cpu_stride, uint32_t cpp)
                         "ld1 {v3.D}[1], [%[cpu]]\n"
                         /* Store to the GPU in one shot, no interleave. */
                         "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
-                        :
+                        : [cpu]         "+r"(cpu)
                         : [gpu]         "r"(gpu),
-                          [cpu]         "r"(cpu),
                           [cpu_stride]  "r"(cpu_stride)
                         : "v0", "v1", "v2", "v3");
         } else {
                 assert(gpu_stride == 16);
+                void *cpu2 = cpu + 8;
                 __asm__ volatile (
                         /* Load each 16-byte line in 2 parts from the cpu-side
                          * destination.  (vld1 can only store one d-register
@@ -263,10 +260,9 @@ vc4_store_utile(void *gpu, void *cpu, uint32_t cpu_stride, uint32_t cpp)
                         "ld1 {v3.D}[1], [%[cpu2]]\n"
                         /* Store to the GPU in one shot, no interleave. */
                         "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
-                        :
+                        : [cpu]         "+r"(cpu),
+                          [cpu2]        "+r"(cpu2)
                         : [gpu]         "r"(gpu),
-                          [cpu]         "r"(cpu),
-                          [cpu2]        "r"(cpu + 8),
                           [cpu_stride]  "r"(cpu_stride)
                         : "v0", "v1", "v2", "v3");
         }