The `#if 0` part below is never compiled; it is kept only to document what the optimized code variant computes.
s = 0;
for (i = 0; i < 16; i++) {
/*
 * Walk one 16-pixel row in two groups of 8. For every pixel of the group,
 * its value is used as an index into sq[] and the looked-up entry is added
 * to s (sq[] is presumably a table of squares -- confirm at its definition).
 *
 * The disabled "#if 0" branch is the plain per-pixel form; the enabled
 * branch must add exactly the same eight sq[] entries, only fetched with
 * wider loads. Because all bytes of each loaded word are summed, the
 * result does not depend on byte order.
 *
 * NOTE(review): the pointer casts assume pix is suitably aligned for the
 * wide loads -- confirm against the callers.
 */
for (j = 0; j < 16; j += 8) {
+#if 0
+ s += sq[pix[0]];
+ s += sq[pix[1]];
+ s += sq[pix[2]];
+ s += sq[pix[3]];
+ s += sq[pix[4]];
+ s += sq[pix[5]];
+ s += sq[pix[6]];
+ s += sq[pix[7]];
+#else
#if LONG_MAX > 2147483647
/* long is wider than 32 bits: one 8-byte load, then peel off each byte. */
register uint64_t x=*(uint64_t*)pix;
s += sq[x&0xff];
s += sq[(x>>8)&0xff];
s += sq[(x>>16)&0xff];
s += sq[(x>>24)&0xff];
s += sq[(x>>32)&0xff];
s += sq[(x>>40)&0xff];
s += sq[(x>>48)&0xff];
s += sq[(x>>56)&0xff];
#else
/* long is 32 bits: same eight terms using two 4-byte loads. */
register uint32_t x=*(uint32_t*)pix;
s += sq[x&0xff];
s += sq[(x>>8)&0xff];
s += sq[(x>>16)&0xff];
s += sq[(x>>24)&0xff];
x=*(uint32_t*)(pix+4);
s += sq[x&0xff];
s += sq[(x>>8)&0xff];
s += sq[(x>>16)&0xff];
s += sq[(x>>24)&0xff];
#endif
+#endif
/* Step to the next group of 8 pixels in this row. */
pix += 8;
}
pix += line_size - 16;