From d7b20edf76ea6419b6ff3b7ea50d818ab305cd82 Mon Sep 17 00:00:00 2001 From: "K.Ohta" Date: Mon, 13 Mar 2023 16:55:15 +0900 Subject: [PATCH] [VM][FMTOWNS][SIMD] Available to build, see TOWNS_CRTD::render_256() of vm/fmtowns/crtc.cpp . --- source/src/types/simd.h | 524 ++++++++++++++++++++++++++++++++++------- source/src/vm/fmtowns/crtc.cpp | 85 ++++--- 2 files changed, 476 insertions(+), 133 deletions(-) diff --git a/source/src/types/simd.h b/source/src/types/simd.h index 6451c0c90..80dc102cb 100644 --- a/source/src/types/simd.h +++ b/source/src/types/simd.h @@ -23,14 +23,14 @@ template { __DECL_ALIGNED(__M__MINIMUM_ALIGN_LENGTH) T m_data[8]; public: - csp_vector8(const csp_vector8 __a) + constexpr csp_vector8(const csp_vector8& __a) { __DECL_VECTORIZED_LOOP for(size_t i = 0; i < 8; i++) { m_data[i] = __a.at(i); } } - csp_vector8(const T* p) + constexpr csp_vector8(const T* p) { if(p != nullptr) { __DECL_VECTORIZED_LOOP @@ -44,7 +44,7 @@ public: } } } - csp_vector8(const T n = (const T)0) + constexpr csp_vector8(const T n = (const T)0) { __DECL_VECTORIZED_LOOP for(size_t i = 0; i < 8; i++) { @@ -53,25 +53,190 @@ public: } ~csp_vector8() {} - constexpr T at(size_t n) + constexpr T at(const size_t& n) { return m_data[n]; } // Pointer may be unaligned, or aligned. - inline void load(T* p) + constexpr void load(T* p) { for(size_t i = 0; i < 8; i++) { m_data[i] = p[i]; } } + constexpr void load_limited(T* p, const size_t _limit) + { + + for(size_t i = 0; (i < 8) && (i < _limit); i++) { + m_data[i] = p[i]; + } + } + template + constexpr void load(T2* p) + { + for(size_t i = 0; i < 8; i++) { + m_data[i] = (T)(p[i]); + } + } + template + constexpr void load_limited(T2* p, const size_t _limit) + { + + for(size_t i = 0; (i < 8) && (i < _limit); i++) { + m_data[i] = (T2)(p[i]); + } + } // Pointer may be unaligned, or aligned. - inline void store(T* p) + constexpr void store(T* p) { for(size_t i = 0; i < 8; i++) { p[i] = m_data[i]; } } + constexpr void store_limited(T* p, size_t _limit) + { + for(size_t i = 0; (i < 8) && (i < _limit); i++) { + p[i] = m_data[i]; + } + } + template + constexpr void store(T2* p) + { + for(size_t i = 0; i < 8; i++) { + p[i] = (T2)(m_data[i]); + } + } + template + constexpr void store_limited(T2* p, size_t _limit) + { + for(size_t i = 0; (i < 8) && (i < _limit); i++) { + p[i] = (T2)(m_data[i]); + } + } + constexpr void store2(T* p) + { + for(size_t i = 0, j = 0; i < 8; i++, j += 2) { + p[j] = m_data[i]; + p[j + 1] = m_data[i]; + } + } + constexpr void store2_limited(T* p, const size_t _limit) + { + for(size_t i = 0, j = 0; (i < 8) && (i < _limit); i++, j += 2) { + p[j] = m_data[i]; + p[j + 1] = m_data[i]; + } + } + template + constexpr void store2(T2* p) + { + for(size_t i = 0, j = 0; i < 8; i++, j += 2) { + p[j] = (T2)(m_data[i]); + p[j + 1] = (T2)(m_data[i]); + } + } + template + constexpr void store2_limited(T2* p, const size_t _limit) + { + for(size_t i = 0, j = 0; (i < 8) && (i < _limit); i++, j += 2) { + p[j] = (T2)(m_data[i]); + p[j + 1] = (T2)(m_data[i]); + } + } + constexpr void store4(T* p) + { + for(size_t i = 0, j = 0; i < 8; i++, j += 4) { + p[j] = m_data[i]; + p[j + 1] = m_data[i]; + p[j + 2] = m_data[i]; + p[j + 3] = m_data[i]; + } + } + constexpr void store4_limited(T* p, const size_t _limit) + { + for(size_t i = 0, j = 0; (i < 8) && (i < _limit); i++, j += 4) { + p[j] = m_data[i]; + p[j + 1] = m_data[i]; + p[j + 2] = m_data[i]; + p[j + 3] = m_data[i]; + } + } + template + constexpr void store4(T2* p) + { + for(size_t i = 0, j = 0; i < 8; i++, j += 4) { + p[j] = (T2)(m_data[i]); + p[j + 1] = (T2)(m_data[i]); + p[j + 2] = (T2)(m_data[i]); + p[j + 3] = (T2)(m_data[i]); + } + } + template + constexpr void store4_limited(T2* p, size_t _limit) + { + for(size_t i = 0, j = 0; (i < 8) && (i < _limit); i++, j += 4) { + p[j] = (T2)(m_data[i]); + p[j + 1] = (T2)(m_data[i]); + p[j + 2] = (T2)(m_data[i]); + p[j + 3] = (T2)(m_data[i]); + } + } + constexpr void store_n(const T* p, const size_t _mag) + { + + __DECL_ALIGNED(__M__MINIMUM_ALIGN_LENGTH) T _tmp[8]; + __DECL_VECTORIZED_LOOP + for(size_t i = 0; i < 8; i++) { + _tmp[i] = m_data[i]; + } + for(size_t i = 0; i < 8; i++) { + for(size_t j = 0; j < _mag; j++) { + *p++ = _tmp[i]; + } + } + } + constexpr void store_n_limited(const T* p, const size_t _mag, const size_t _limit) + { + __DECL_ALIGNED(__M__MINIMUM_ALIGN_LENGTH) T _tmp[8]; + __DECL_VECTORIZED_LOOP + for(size_t i = 0; i < 8; i++) { + _tmp[i] = m_data[i]; + } + for(size_t i = 0; (i < 8) && (i < _limit); i++) { + for(size_t j = 0; j < _mag; j++) { + *p++ = _tmp[i]; + } + } + } + template + constexpr void store_n(T2* p, const size_t _mag) + { + __DECL_ALIGNED(__M__MINIMUM_ALIGN_LENGTH) T2 _tmp[8]; + __DECL_VECTORIZED_LOOP + for(size_t i = 0; i < 8; i++) { + _tmp[i] = (T2)(m_data[i]); + } + for(size_t i = 0; i < 8; i++) { + for(size_t j = 0; j < _mag; j++) { + *p++ = _tmp[i]; + } + } + } + template + constexpr void store_n_limited(T2* p, const size_t _mag, const size_t _limit) + { + __DECL_ALIGNED(__M__MINIMUM_ALIGN_LENGTH) T2 _tmp[8]; + __DECL_VECTORIZED_LOOP + for(size_t i = 0; i < 8; i++) { + _tmp[i] = (T2)(m_data[i]); + } + for(size_t i = 0; (i < 8) && (i < _limit); i++) { + for(size_t j = 0; j < _mag; j++) { + *p++ = _tmp[i]; + } + } + } // Pointer must be aligned minimum of 16 bytes. constexpr void load_aligned(T* p) { @@ -81,6 +246,15 @@ public: m_data[i] = q[i]; } } + template + constexpr void load_aligned(T2* p) + { + T2* q = ___assume_aligned(p, __M__MINIMUM_ALIGN_LENGTH); + __DECL_VECTORIZED_LOOP + for(size_t i = 0; i < 8; i++) { + m_data[i] = (T)(q[i]); + } + } // Pointer must be aligned minimum of 16 bytes. constexpr void store_aligned(T* p) @@ -91,6 +265,15 @@ public: q[i] = m_data[i]; } } + template + constexpr void store_aligned(T2* p) + { + T2* q = ___assume_aligned(p, __M__MINIMUM_ALIGN_LENGTH); + __DECL_VECTORIZED_LOOP + for(size_t i = 0; i < 8; i++) { + q[i] = (T2)(m_data[i]); + } + } inline void copy(const csp_vector8 __b) { __DECL_VECTORIZED_LOOP @@ -98,6 +281,15 @@ public: m_data[i] = __b.at(i); } } + template + constexpr void get(csp_vector8 __b) + { + __DECL_VECTORIZED_LOOP + for(size_t i = 0; i < 8; i++) { + m_data[i] = (T)(__b.at(i)); + } + } + inline void clear() { __DECL_VECTORIZED_LOOP @@ -105,17 +297,89 @@ public: m_data[i] = (T)0; } } - constexpr T set(size_t __n, T __val) + inline void set(size_t __n, T __val) { m_data[__n] = __val; } - constexpr T reset(size_t __n) + inline void reset(size_t __n) { m_data[__n] = (T)0; } + template + constexpr csp_vector8& lookup(csp_vector8& __list, T* __table, const size_t count = 8) + { + __DECL_ALIGNED(__M__MINIMUM_ALIGN_LENGTH) T2 rlist[8] = {0}; + size_t _count = ((count > 8) || (count == 0)) ? 8 : count; + constexpr bool _is_signed = std::is_signed().value; + + for(size_t i = 0; i < _count; i++) { + rlist[i] = __list.at(i); + } + if(_is_signed) { + __DECL_VECTORIZED_LOOP + for(size_t i = 0; i < 8; i++) { + rlist[i] = (rlist[i] < 0) ? 0 : rlist[i]; + } + } + + for(size_t i = 0; i < _count; i++) { + m_data[i] = __table[rlist[i]]; + } + return *this; + } + template + constexpr csp_vector8& lookup(csp_vector8& __list, T2 _limit, T* __table, const size_t count = 8) + { + __DECL_ALIGNED(__M__MINIMUM_ALIGN_LENGTH) T2 rlist[8] = {0}; + constexpr bool _is_signed = std::is_signed().value; + size_t _count = ((count > 8) || (count == 0)) ? 8 : count; + + for(size_t i = 0; i < _count; i++) { + rlist[i] = __list.at(i); + } + if(_is_signed) { + __DECL_VECTORIZED_LOOP + for(size_t i = 0; i < 8; i++) { + rlist[i] = (rlist[i] < 0) ? 0 : rlist[i]; + } + } + __DECL_VECTORIZED_LOOP + for(size_t i = 0; i < 8; i++) { + rlist[i] = (rlist[i] > _limit) ? _limit : rlist[i]; + } + + for(size_t i = 0; i < _count; i++) { + m_data[i] = __table[rlist[i]]; + } + return *this; + } + template + constexpr csp_vector8& lookup(csp_vector8& __list, T2 _min, T2 _max, T* __table, const size_t count = 8) + { + __DECL_ALIGNED(__M__MINIMUM_ALIGN_LENGTH) T2 rlist[8] = {0}; + size_t _count = ((count > 8) || (count == 0)) ? 8 : count; + if(_min > _max) std::swap(_min, _max); + for(size_t i = 0; i < _count; i++) { + rlist[i] = __list.at(i); + } + __DECL_VECTORIZED_LOOP + for(size_t i = 0; i < 8; i++) { + rlist[i] = (rlist[i] < _min) ? _min : rlist[i]; + } + __DECL_VECTORIZED_LOOP + for(size_t i = 0; i < 8; i++) { + rlist[i] = (rlist[i] > _max) ? _max : rlist[i]; + } + + + for(size_t i = 0; i < _count; i++) { + m_data[i] = __table[rlist[i]]; + } + return *this; + } // Pointer must be aligned minimum of 16 bytes. - constexpr csp_vector8& set_cond(const csp_vector8& __flags, const T __true_val, const T __false_val) + constexpr csp_vector8& set_cond(csp_vector8& __flags, const T __true_val, const T __false_val) { __DECL_VECTORIZED_LOOP for(size_t i = 0; i < 8; i++) { @@ -123,33 +387,46 @@ public: } return *this; } - constexpr void set_cond(const csp_vector8& __flags, const T __true_val, const T __false_val) + constexpr csp_vector8& set_if_true(csp_vector8& __flags, const T __val) { __DECL_VECTORIZED_LOOP for(size_t i = 0; i < 8; i++) { - m_data[i] = (__flags.at(i)) ? __true_val : __false_val; + if(__flags.at(i)) { + m_data[i] = __val; + } } + return *this; } - constexpr void set_if_true(const csp_vector8& __flags, const T __val) + constexpr csp_vector8& set_if_false(csp_vector8& __flags, const T __val) { __DECL_VECTORIZED_LOOP for(size_t i = 0; i < 8; i++) { - if(__flags.at(i)) { + if(!(__flags.at(i))) { m_data[i] = __val; } } + return *this; } - inline void shuffle(const csp_vector8& __positions) + template + inline void shuffle(csp_vector8& __positions) { __DECL_ALIGNED(__M__MINIMUM_ALIGN_LENGTH) T __d[8]; __DECL_ALIGNED(__M__MINIMUM_ALIGN_LENGTH) uint8_t __p[8]; __DECL_ALIGNED(__M__MINIMUM_ALIGN_LENGTH) const uint8_t __m[8] = {7, 7, 7, 7, 7, 7, 7, 7}; - - __DECL_VECTORIZED_LOOP - for(size_t i = 0; i < 8; i++) { - __p[i] = __positions.at(i); + constexpr bool __is_signed = std::is_signed().value; + if(__is_signed) { + __DECL_VECTORIZED_LOOP + for(size_t i = 0; i < 8; i++) { + T2 _tmp = __positions.at(i); + __p[i] = (_tmp < 0) ? (uint8_t)(_tmp) : (uint8_t)(-_tmp); + } + } else { + __DECL_VECTORIZED_LOOP + for(size_t i = 0; i < 8; i++) { + __p[i] = (uint8_t)(__positions.at(i)); + } } __DECL_VECTORIZED_LOOP for(size_t i = 0; i < 8; i++) { @@ -158,15 +435,26 @@ public: __DECL_VECTORIZED_LOOP for(size_t i = 0; i < 8; i++) { - __d[i] = m_data[__n[i]]; + __d[i] = m_data[__p[i]]; } __DECL_VECTORIZED_LOOP for(size_t i = 0; i < 8; i++) { m_data[i] = __d[i]; } } + template + inline void shuffle_force_unsigned(csp_vector8& __positions) + { + typedef typename std::make_unsigned::type T3U; + __DECL_ALIGNED(__M__MINIMUM_ALIGN_LENGTH) csp_vector8 _p; + __DECL_VECTORIZED_LOOP + for(size_t i = 0; i < 8; i++) { + _p.set(i, (T3U)(__positions.at(i))); + } + shuffle(_p); + } - constexpr T operator[](size_type __n) + constexpr T operator[](const size_t& __n) { return m_data[__n]; } @@ -240,7 +528,7 @@ public: for(size_t i = 0; i < 8; i++) { m_data[i] <<= __n; } - retrurn *this; + return *this; } constexpr csp_vector8& operator>>=(const size_t __n) { @@ -248,72 +536,34 @@ public: for(size_t i = 0; i < 8; i++) { m_data[i] >>= __n; } - retrurn *this; + return *this; } - constexpr csp_vector8& operator~() + constexpr csp_vector8& operator>>=(csp_vector8& __n) { __DECL_VECTORIZED_LOOP for(size_t i = 0; i < 8; i++) { - m_data[i] = ~(m_data[i]); + m_data[i] >>= __n.at(i); } return *this; } - - constexpr csp_vector8& operator+(const csp_vector8& __a, const csp_vector8& __b) + constexpr csp_vector8& operator<<=(csp_vector8& __n) { - copy(__a); - *this += __b; - return *this; - } - constexpr csp_vector8& operator-(const csp_vector8& __a, const csp_vector8& __b) - { - copy(__a); - *this -= __b; - return *this; - } - constexpr csp_vector8& operator*(const csp_vector8& __a, const csp_vector8& __b) - { - copy(__a); - *this *= __b; - return *this; - } - constexpr csp_vector8& operator/(const csp_vector8& __a, const csp_vector8& __b) - { - copy(__a); - *this /= __b; - return *this; - } - constexpr csp_vector8& operator&(const csp_vector8& __a, const csp_vector8& __b) - { - copy(__a); - *this &= __b; - return *this; - } - constexpr csp_vector8& operator|(const csp_vector8& __a, const csp_vector8& __b) - { - copy(__a); - *this |= __b; - return *this; - } - constexpr csp_vector8& operator^(const csp_vector8& __a, const csp_vector8& __b) - { - copy(__a); - *this ^= __b; - return *this; - } - constexpr csp_vector8& operator>>(const csp_vector8& __a, size_t __n) - { - copy(__a); - *this >>= __n; + __DECL_VECTORIZED_LOOP + for(size_t i = 0; i < 8; i++) { + m_data[i] <<= __n.at(i); + } return *this; } - constexpr csp_vector8& operator<<(const csp_vector8& __a, size_t __n) + constexpr csp_vector8& operator~() { - copy(__a); - *this <<= __n; + __DECL_VECTORIZED_LOOP + for(size_t i = 0; i < 8; i++) { + m_data[i] = ~(m_data[i]); + } return *this; } - constexpr bool& operator==(const csp_vector8& __a) + + constexpr bool operator==(const csp_vector8& __a) { bool __f = true; __DECL_VECTORIZED_LOOP @@ -322,7 +572,7 @@ public: } return __f; } - constexpr bool& operator==(const T __a) + constexpr bool operator==(const T __a) { bool __f = true; __DECL_VECTORIZED_LOOP @@ -331,7 +581,7 @@ public: } return __f; } - constexpr bool& operator!=(const csp_vector8& __a) + constexpr bool operator!=(const csp_vector8& __a) { bool __f = true; __DECL_VECTORIZED_LOOP @@ -340,7 +590,7 @@ public: } return __f; } - constexpr bool& operator!=(const T __a) + constexpr bool operator!=(const T __a) { bool __f = true; __DECL_VECTORIZED_LOOP @@ -350,44 +600,140 @@ public: return __f; } - constexpr csp_vector8& operator==(const csp_vector8& __a) + constexpr void equals(csp_vector8& __ret, const csp_vector8& __a) { - csp_vector8 __ret(false); __DECL_VECTORIZED_LOOP for(size_t i = 0; i < 8; i++) { __ret.set(i, (__a.at(i) == m_data[i]) ? true : false); } - return __ret; } - constexpr csp_vector8& operator==(const T __a) + constexpr void equals(csp_vector8& __ret, const T __a) { - csp_vector8 __ret(false); __DECL_VECTORIZED_LOOP for(size_t i = 0; i < 8; i++) { __ret.set(i, (m_data[i] == __a) ? true : false); } - return __ret; } - constexpr csp_vector8& operator!=(const csp_vector8& __a) + constexpr void not_equals(csp_vector8& __ret, const csp_vector8& __a) { - csp_vector8 __ret(false); __DECL_VECTORIZED_LOOP for(size_t i = 0; i < 8; i++) { __ret.set(i, (__a.at(i) != m_data[i]) ? true : false); } - return __ret; } - constexpr csp_vector8& operator!=(const T __a) + constexpr void not_equals(csp_vector8& __ret, const T __a) { - csp_vector8 __ret(false); __DECL_VECTORIZED_LOOP for(size_t i = 0; i < 8; i++) { __ret.set(i, (m_data[i] != __a) ? true : false); } - return __ret; + } + constexpr void check_bits(csp_vector8& __ret, const T& _bitmask) + { + __DECL_VECTORIZED_LOOP + for(size_t i = 0; i < 8; i++) { + __ret.set(i, ((m_data[i] & _bitmask) == _bitmask) ? true : false); + } + } + // Maybe faster than check_bits(). + constexpr void check_any_bits(csp_vector8& __ret, const T& _bitmask) + { + __DECL_VECTORIZED_LOOP + for(size_t i = 0; i < 8; i++) { + __ret.set(i, ((m_data[i] & _bitmask) != 0) ? true : false); + } } }; + +template + constexpr csp_vector8& operator+(const csp_vector8& __a, const csp_vector8& __b) +{ + __DECL_ALIGNED(__M__MINIMUM_ALIGN_LENGTH) csp_vector8 __ret(__a); + __ret += __b; + return __ret; +} + +// Primitive operators must define outside of class :-( +template + constexpr csp_vector8& operator-(const csp_vector8& __a, const csp_vector8& __b) +{ + __DECL_ALIGNED(__M__MINIMUM_ALIGN_LENGTH) csp_vector8 __ret(__a); + __ret -= __b; + return __ret; +} + +template + constexpr csp_vector8& operator*(const csp_vector8& __a, const csp_vector8& __b) +{ + __DECL_ALIGNED(__M__MINIMUM_ALIGN_LENGTH) csp_vector8 __ret(__a); + __ret *= __b; + return __ret; +} + +template + constexpr csp_vector8& operator/(const csp_vector8& __a, const csp_vector8& __b) +{ + __DECL_ALIGNED(__M__MINIMUM_ALIGN_LENGTH) csp_vector8 __ret(__a); + __ret /= __b; + return __ret; +} + +template + constexpr csp_vector8& operator&(const csp_vector8& __a, const csp_vector8& __b) +{ + __DECL_ALIGNED(__M__MINIMUM_ALIGN_LENGTH) csp_vector8 __ret(__a); + __ret &= __b; + return __ret; +} + +template + constexpr csp_vector8& operator|(const csp_vector8& __a, const csp_vector8& __b) +{ + __DECL_ALIGNED(__M__MINIMUM_ALIGN_LENGTH) csp_vector8 __ret(__a); + __ret |= __b; + return __ret; +} +template + constexpr csp_vector8& operator^(const csp_vector8& __a, const csp_vector8& __b) +{ + __DECL_ALIGNED(__M__MINIMUM_ALIGN_LENGTH) csp_vector8 __ret(__a); + __ret ^= __b; + return __ret; +} + +template + constexpr csp_vector8& operator<<(const csp_vector8& __a, const size_t& __shift) +{ + __DECL_ALIGNED(__M__MINIMUM_ALIGN_LENGTH) csp_vector8 __ret(__a); + __ret <<= __shift; + return __ret; +} + +template + constexpr csp_vector8& operator>>(const csp_vector8& __a, const size_t& __shift) +{ + __DECL_ALIGNED(__M__MINIMUM_ALIGN_LENGTH) csp_vector8 __ret(__a); + __ret >>= __shift; + return __ret; +} + +template + constexpr csp_vector8& operator<<(const csp_vector8& __a, csp_vector8& __shift) +{ + __DECL_ALIGNED(__M__MINIMUM_ALIGN_LENGTH) csp_vector8 __ret(__a); + __ret <<= __shift; + return __ret; +} + +template + constexpr csp_vector8& operator>>(const csp_vector8& __a, csp_vector8& __shift) +{ + __DECL_ALIGNED(__M__MINIMUM_ALIGN_LENGTH) csp_vector8 __ret(__a); + __ret >>= __shift; + return __ret; +} + // Please include type specified (and MPU specified) templates. #undef __M__MINIMUM_ALIGN_LENGTH diff --git a/source/src/vm/fmtowns/crtc.cpp b/source/src/vm/fmtowns/crtc.cpp index e59f54365..b73b63cf9 100644 --- a/source/src/vm/fmtowns/crtc.cpp +++ b/source/src/vm/fmtowns/crtc.cpp @@ -9,6 +9,7 @@ */ #include "../vm.h" #include "../../common.h" +#include "../../types/simd.h" #include "crtc.h" #include "vram.h" @@ -1164,7 +1165,20 @@ bool TOWNS_CRTC::render_256(scrntype_t* dst, int y) __UNLIKELY_IF(pwidth < 1) pwidth = 1; int xx = 0; int k = 0; + csp_vector8 __pbuf[2]; + csp_vector8 __sbuf[2]; for(int x = 0; x < (pwidth >> 4); x++) { + #if 1 +__DECL_VECTORIZED_LOOP + for(int ii = 0; ii < 2; ii++) { + __pbuf[ii].load(&p[ii << 3]); + } +__DECL_VECTORIZED_LOOP + for(int ii = 0; ii < 2; ii++) { + __sbuf[ii].lookup(__pbuf[ii], apal256); + } + p += 16; + #else __DECL_VECTORIZED_LOOP for(int i = 0; i < 16; i++) { pbuf[i] = p[i]; @@ -1174,47 +1188,44 @@ __DECL_VECTORIZED_LOOP for(int i = 0; i < 16; i++) { sbuf[i] = apal256[pbuf[i]]; } + #endif int kbak = k; if(((magx << 4) + k) <= width) { switch(magx) { case 1: __DECL_VECTORIZED_LOOP - for(int i = 0; i < 16; i++) { - q[i] = sbuf[i]; + for(int ii = 0; ii < 2; ii++) { + __sbuf[ii].store(&(q[ii << 3])); } k += 16; q += 16; break; case 2: -__DECL_VECTORIZED_LOOP - for(int i = 0; i < 32; i++) { - q[i] = sbuf[i >> 1]; + for(int ii = 0; ii < 2; ii++) { + __sbuf[ii].store2(&(q[ii << 4])); } k += 32; q += 32; break; case 4: -__DECL_VECTORIZED_LOOP - for(int i = 0, j = 0; i < 16; i++, j += 4) { - q[j + 0] = sbuf[i]; - q[j + 1] = sbuf[i]; - q[j + 2] = sbuf[i]; - q[j + 3] = sbuf[i]; + for(int ii = 0; ii < 2; ii++) { + __sbuf[ii].store4(&(q[ii << 5])); } k += 64; q += 64; break; default: - for(int i = 0; i < 16; i++) { - for(int j = 0; j < magx; j++) { - q[j] = sbuf[i]; - } - q += magx; - k += magx; + for(int ii = 0; ii < 2; ii++) { + __sbuf[ii].store_n(q, magx); + q += (magx * 8); + k += (magx * 8); } break; } } else { + for(int ii = 0; ii < 2; ii++) { + __sbuf[ii].store_aligned(&(sbuf[ii << 3])); + } for(int i = 0; i < 16; i++) { for(int j = 0; j < magx; j++) { q[j] = sbuf[i]; @@ -1227,52 +1238,38 @@ __DECL_VECTORIZED_LOOP } } __LIKELY_IF(k >= width) return true; - int w = pwidth & 0x0f; + size_t w = pwidth & 0x0f; __UNLIKELY_IF(w != 0) { - for(int i = 0; i < w; i++) { - pbuf[i] = p[i]; - } - for(int i = 0; i < w; i++) { - sbuf[i] = apal256[pbuf[i]]; - } + __pbuf[0].clear(); + __sbuf[0].clear(); + __pbuf[0].load_limited(p, w); + __sbuf[0].lookup(__pbuf[0], apal256, w); + if(((magx * w) + k) <= width) { switch(magx) { case 1: - for(int i = 0; i < w; i++) { - q[i] = sbuf[i]; - } + __sbuf[0].store_limited(q, w); k += w; q += w; break; case 2: - for(int i = 0, j = 0; i < w; i++, j += 2) { - q[j + 0] = sbuf[i]; - q[j + 1] = sbuf[i]; - } + __sbuf[0].store2_limited(q, w); k += (w << 1); q += (w << 1); break; case 4: - for(int i = 0, j = 0; i < w; i++, j += 4) { - q[j + 0] = sbuf[i]; - q[j + 1] = sbuf[i]; - q[j + 2] = sbuf[i]; - q[j + 3] = sbuf[i]; - } + __sbuf[0].store4_limited(q, w); k += (w << 2); q += (w << 2); break; default: - for(int i = 0; i < w; i++) { - for(int j = 0; j < magx; j++) { - q[j] = sbuf[i]; - } - q += magx; - k += magx; - } + __sbuf[0].store_n_limited(q, magx, w); + q += (magx * w); + k += (magx * w); break; } } else { + __sbuf[0].store_aligned(sbuf); for(int i = 0; i < w; i++) { for(int j = 0; j < magx; j++) { q[j] = sbuf[i]; -- 2.11.0