+typedef __DECL_ALIGNED(16) union { // Union overlaying a vector member `v` with eight uint8_t lanes `w`.
+ __v4hi v; // NOTE(review): __v4hi is presumably 4 x 16-bit, so lane width differs from `w` — confirm intent.
+ uint8_t w[8]; // Per-element view of the same storage as `v`.
+} uint8_vec8_t;
+
+typedef __DECL_ALIGNED(16) union { // Union overlaying a vector member `v` with eight uint16_t lanes `w`.
+ __v8hi v; // Whole-vector view; used for vector loads, OR and shift in the Convert* helpers.
+ uint16_t w[8]; // Per-lane view of the same storage as `v`.
+} uint16_vec8_t;
+
+typedef __DECL_ALIGNED(16) union { // Union overlaying a vector member `v` with eight uint32_t lanes `w`.
+ __v16hi v; // NOTE(review): __v16hi is presumably 16 x 16-bit, so vector ops on `v` would not be per-uint32 — confirm.
+ uint32_t w[8]; // Per-element view of the same storage as `v`.
+} uint32_vec8_t;
+
+typedef __DECL_ALIGNED(16) struct { // Lookup table holding one uint16_vec8_t entry for each of 256 index values.
+ uint16_vec8_t plane_table[256]; // Indexed with a uint8_t value by the inline Convert* helpers in this file.
+} _bit_trans_table_t;
+
+typedef __DECL_ALIGNED(sizeof(scrntype_vec8_t)) struct { // Lookup table of 256 scrntype_vec8_t entries; alignment request is sizeof(scrntype_vec8_t).
+ scrntype_vec8_t plane_table[256]; // Indexed with a uint8_t value by the *_PixelTbl helpers in this file.
+} _bit_trans_table_scrn_t;
+
+typedef struct { // Parameter bundle taken (as `src`) by the Render*_Line and Convert*ToByte_Line functions declared in this file.
+ scrntype_t* palette; // Must have 2^planes entries. If NULL, RGB is assumed.
+ _bit_trans_table_t* bit_trans_table[16]; // At least `planes` entries must exist. Each table must be aligned to sizeof(uint16_vec8_t).
+ int xzoom; // presumably the horizontal zoom factor; range 1 - 4? (unconfirmed in the original note)
+ bool is_render[16]; // presumably one enable flag per plane — TODO confirm against the renderers
+ int shift; // NOTE(review): meaning not visible in this chunk — confirm against the renderers
+ uint8_t* data[16]; // presumably one source buffer per plane — TODO confirm
+ uint32_t baseaddress[16]; // presumably per-plane base address into data[] — TODO confirm
+ uint32_t voffset[16]; // presumably per-plane offset added to the base address — TODO confirm
+ uint32_t addrmask; // For global increment.
+ uint32_t addrmask2; // For local increment.
+ uint32_t begin_pos; // presumably the starting position of the rendered span — TODO confirm
+ uint32_t render_width; // presumably the width to render; units not visible here — TODO confirm
+} _render_command_data_t;
+
+
+// Expands the table entry selected by `src` into eight screen-type values.
+// For each of the eight lanes of tbl->plane_table[src], a zero lane produces
+// off_val and a non-zero lane produces on_val.
+// tbl is treated as aligned to sizeof(uint16_vec8_t).
+inline scrntype_vec8_t ConvertByteToMonochromePackedPixel(uint8_t src, _bit_trans_table_t *tbl,scrntype_t on_val, scrntype_t off_val)
+{
+    _bit_trans_table_t* table = (_bit_trans_table_t*)__builtin_assume_aligned(tbl, sizeof(uint16_vec8_t));
+    __DECL_ALIGNED(16) uint16_vec8_t lanes;
+    __DECL_ALIGNED(32) scrntype_vec8_t pixels;
+
+    lanes.v = table->plane_table[src].v;
+__DECL_VECTORIZED_LOOP
+    for(int n = 0; n < 8; n++) {
+        pixels.w[n] = (lanes.w[n] != 0) ? on_val : off_val;
+    }
+    return pixels;
+}
+
+// Note: please read the notes in common.cpp -- 20181105 K.Ohta.
+// Tables passed to the functions below must be aligned to 16 bytes (_bit_trans_table_t) or 32 bytes (_bit_trans_table_scrn_t).
+void DLL_PREFIX ConvertByteToPackedPixelByColorTable(uint8_t *src, scrntype_t* dst, int bytes, _bit_trans_table_t *tbl, scrntype_t *on_color_table, scrntype_t* off_color_table); // Declaration only; definition is outside this chunk.
+void DLL_PREFIX ConvertByteToPackedPixelByColorTable2(uint8_t *src, scrntype_t* dst, int bytes, _bit_trans_table_scrn_t *tbl, scrntype_t *on_color_table, scrntype_t* off_color_table); // Same parameters, but takes the scrntype table variant.
+void DLL_PREFIX ConvertByteToSparceUint16(uint8_t *src, uint16_t* dst, int bytes, _bit_trans_table_t *tbl, uint16_t mask); // Declaration only; definition is outside this chunk.
+void DLL_PREFIX ConvertByteToSparceUint8(uint8_t *src, uint16_t* dst, int bytes, _bit_trans_table_t *tbl, uint16_t mask); // NOTE(review): "Uint8" variant still takes uint16_t* dst and uint16_t mask — confirm against the definition.
+
+// Returns the pre-built vector stored in tbl->plane_table[src].
+// The table must be laid out as (ON_VAL_COLOR : OFF_VAL_COLOR)[256].
+inline scrntype_vec8_t ConvertByteToPackedPixel_PixelTbl(uint8_t src, _bit_trans_table_scrn_t *tbl)
+{
+    // The alignment hint given to the compiler is sizeof(uint16_vec8_t).
+    _bit_trans_table_scrn_t* table = (_bit_trans_table_scrn_t*)__builtin_assume_aligned(tbl, sizeof(uint16_vec8_t));
+    __DECL_ALIGNED(32) scrntype_vec8_t pixels;
+
+    pixels.v = table->plane_table[src].v;
+    return pixels;
+}
+
+// Fetches tbl->plane_table[src] and returns it with every lane duplicated:
+// output lanes 2k and 2k+1 both hold input lane k (k = 0..7).
+// The table must be laid out as (ON_VAL_COLOR : OFF_VAL_COLOR)[256].
+inline scrntype_vec16_t ConvertByteToDoublePackedPixel_PixelTbl(uint8_t src, _bit_trans_table_scrn_t *tbl)
+{
+    // The alignment hint given to the compiler is sizeof(uint16_vec8_t).
+    _bit_trans_table_scrn_t* table = (_bit_trans_table_scrn_t*)__builtin_assume_aligned(tbl, sizeof(uint16_vec8_t));
+    __DECL_ALIGNED(32) scrntype_vec8_t narrow;
+    __DECL_ALIGNED(32) scrntype_vec16_t wide;
+
+    narrow.v = table->plane_table[src].v;
+__DECL_VECTORIZED_LOOP
+    for(int k = 0; k < 8; k++) {
+        wide.w[2 * k] = narrow.w[k];
+        wide.w[2 * k + 1] = narrow.w[k];
+    }
+    return wide;
+}
+
+// Writes 16 bytes to dst: each of the eight lanes of tbl->plane_table[src],
+// truncated to uint8_t, is stored twice in a row.
+// The table must have been initialized with ON_COLOR : OFF_COLOR values.
+inline void ConvertByteToDoubleMonochromeUint8(uint8_t src, uint8_t* dst, _bit_trans_table_t* tbl)
+{
+    uint16_vec8_t* entries = (uint16_vec8_t*)__builtin_assume_aligned(&(tbl->plane_table[0]), sizeof(uint16_vec8_t));
+    __DECL_ALIGNED(16) uint16_vec8_t lanes;
+    __DECL_ALIGNED(16) uint8_t doubled[16];
+
+    lanes = entries[src];
+    // Build the doubled row in an aligned scratch buffer first.
+__DECL_VECTORIZED_LOOP
+    for(int k = 0; k < 8; k++) {
+        doubled[2 * k] = (uint8_t)(lanes.w[k]);
+        doubled[2 * k + 1] = (uint8_t)(lanes.w[k]);
+    }
+    // Then copy the scratch buffer out to the caller's destination.
+__DECL_VECTORIZED_LOOP
+    for(int k = 0; k < 16; k++) {
+        dst[k] = doubled[k];
+    }
+}
+
+// Writes 8 bytes to dst: the eight lanes of tbl->plane_table[src], each
+// truncated to uint8_t.
+inline void ConvertByteToMonochromeUint8(uint8_t src, uint8_t* dst, _bit_trans_table_t* tbl)
+{
+    uint16_vec8_t* entries = (uint16_vec8_t*)__builtin_assume_aligned(&(tbl->plane_table[0]), sizeof(uint16_vec8_t));
+    __DECL_ALIGNED(16) uint16_vec8_t lanes;
+
+    lanes = entries[src];
+__DECL_VECTORIZED_LOOP
+    for(int k = 0; k < 8; k++) {
+        dst[k] = (uint8_t)(lanes.w[k]);
+    }
+}
+
+// Combines three table entries into eight bytes.
+// The entries rtbl[r], gtbl[g] and btbl[b] are OR-ed lane by lane, the result
+// is shifted right by `shift`, and each lane is truncated to uint8_t and
+// written to dst[0..7].
+inline void ConvertRGBTo8ColorsUint8(uint8_t r, uint8_t g, uint8_t b, uint8_t* dst, _bit_trans_table_t* rtbl, _bit_trans_table_t* gtbl, _bit_trans_table_t* btbl, int shift)
+{
+    uint16_vec8_t* red = (uint16_vec8_t*)__builtin_assume_aligned(&(rtbl->plane_table[0]), sizeof(uint16_vec8_t));
+    uint16_vec8_t* green = (uint16_vec8_t*)__builtin_assume_aligned(&(gtbl->plane_table[0]), sizeof(uint16_vec8_t));
+    uint16_vec8_t* blue = (uint16_vec8_t*)__builtin_assume_aligned(&(btbl->plane_table[0]), sizeof(uint16_vec8_t));
+    __DECL_ALIGNED(16) uint16_vec8_t mixed;
+
+    mixed.v = red[r].v;
+    mixed.v |= green[g].v;
+    mixed.v |= blue[b].v;
+    mixed.v >>= shift;
+__DECL_VECTORIZED_LOOP
+    for(int k = 0; k < 8; k++) {
+        dst[k] = (uint8_t)(mixed.w[k]);
+    }
+}
+
+// Combines three table entries and writes lanes 0..3, each doubled, to dst[0..7].
+// The entries rtbl[r], gtbl[g] and btbl[b] are OR-ed lane by lane and shifted
+// right by `shift`; each selected lane is truncated to uint8_t and stored twice.
+inline void ConvertRGBTo8ColorsUint8_Zoom2Left(uint8_t r, uint8_t g, uint8_t b, uint8_t* dst, _bit_trans_table_t* rtbl, _bit_trans_table_t* gtbl, _bit_trans_table_t* btbl, int shift)
+{
+    uint16_vec8_t* red = (uint16_vec8_t*)__builtin_assume_aligned(&(rtbl->plane_table[0]), sizeof(uint16_vec8_t));
+    uint16_vec8_t* green = (uint16_vec8_t*)__builtin_assume_aligned(&(gtbl->plane_table[0]), sizeof(uint16_vec8_t));
+    uint16_vec8_t* blue = (uint16_vec8_t*)__builtin_assume_aligned(&(btbl->plane_table[0]), sizeof(uint16_vec8_t));
+    __DECL_ALIGNED(16) uint16_vec8_t mixed;
+
+    mixed.v = red[r].v;
+    mixed.v |= green[g].v;
+    mixed.v |= blue[b].v;
+    mixed.v >>= shift;
+__DECL_VECTORIZED_LOOP
+    for(int k = 0; k < 4; k++) {
+        dst[2 * k] = (uint8_t)(mixed.w[k]);
+        dst[2 * k + 1] = (uint8_t)(mixed.w[k]);
+    }
+}
+
+// Combines three table entries and writes lanes 4..7, each doubled, to dst[0..7].
+// The entries rtbl[r], gtbl[g] and btbl[b] are OR-ed lane by lane and shifted
+// right by `shift`; each selected lane is truncated to uint8_t and stored twice.
+inline void ConvertRGBTo8ColorsUint8_Zoom2Right(uint8_t r, uint8_t g, uint8_t b, uint8_t* dst, _bit_trans_table_t* rtbl, _bit_trans_table_t* gtbl, _bit_trans_table_t* btbl, int shift)
+{
+    uint16_vec8_t* red = (uint16_vec8_t*)__builtin_assume_aligned(&(rtbl->plane_table[0]), sizeof(uint16_vec8_t));
+    uint16_vec8_t* green = (uint16_vec8_t*)__builtin_assume_aligned(&(gtbl->plane_table[0]), sizeof(uint16_vec8_t));
+    uint16_vec8_t* blue = (uint16_vec8_t*)__builtin_assume_aligned(&(btbl->plane_table[0]), sizeof(uint16_vec8_t));
+    __DECL_ALIGNED(16) uint16_vec8_t mixed;
+
+    mixed.v = red[r].v;
+    mixed.v |= green[g].v;
+    mixed.v |= blue[b].v;
+    mixed.v >>= shift;
+__DECL_VECTORIZED_LOOP
+    for(int k = 0; k < 4; k++) {
+        // Source lanes start at the upper half of the vector.
+        dst[2 * k] = (uint8_t)(mixed.w[k + 4]);
+        dst[2 * k + 1] = (uint8_t)(mixed.w[k + 4]);
+    }
+}
+
+// Combines three table entries and writes all eight lanes, each doubled, to dst[0..15].
+// The entries rtbl[r], gtbl[g] and btbl[b] are OR-ed lane by lane and shifted
+// right by `shift`; each lane is truncated to uint8_t and stored twice.
+// dst must therefore have room for 16 bytes.
+inline void ConvertRGBTo8ColorsUint8_Zoom2Double(uint8_t r, uint8_t g, uint8_t b, uint8_t* dst, _bit_trans_table_t* rtbl, _bit_trans_table_t* gtbl, _bit_trans_table_t* btbl, int shift)
+{
+    uint16_vec8_t* red = (uint16_vec8_t*)__builtin_assume_aligned(&(rtbl->plane_table[0]), sizeof(uint16_vec8_t));
+    uint16_vec8_t* green = (uint16_vec8_t*)__builtin_assume_aligned(&(gtbl->plane_table[0]), sizeof(uint16_vec8_t));
+    uint16_vec8_t* blue = (uint16_vec8_t*)__builtin_assume_aligned(&(btbl->plane_table[0]), sizeof(uint16_vec8_t));
+    __DECL_ALIGNED(16) uint16_vec8_t mixed;
+
+    mixed.v = red[r].v;
+    mixed.v |= green[g].v;
+    mixed.v |= blue[b].v;
+    mixed.v >>= shift;
+__DECL_VECTORIZED_LOOP
+    for(int k = 0; k < 8; k++) {
+        dst[2 * k] = (uint8_t)(mixed.w[k]);
+        dst[2 * k + 1] = (uint8_t)(mixed.w[k]);
+    }
+}
+
+// Writes 16 bytes to dst: for each of the eight lanes of tbl->plane_table[src],
+// off_color is stored twice when the lane is zero, otherwise on_color is
+// stored twice.
+inline void ConvertByteToMonochromeUint8Cond_Zoom2(uint8_t src, uint8_t* dst, _bit_trans_table_t* tbl, uint8_t on_color, uint8_t off_color)
+{
+    uint16_vec8_t* entries = (uint16_vec8_t*)__builtin_assume_aligned(&(tbl->plane_table[0]), sizeof(uint16_vec8_t));
+    __DECL_ALIGNED(16) uint16_vec8_t lanes;
+    __DECL_ALIGNED(16) uint8_t doubled[16];
+
+    lanes = entries[src];
+    // Build the doubled row in an aligned scratch buffer first.
+__DECL_VECTORIZED_LOOP
+    for(int k = 0; k < 8; k++) {
+        doubled[2 * k] = (lanes.w[k] != 0) ? on_color : off_color;
+        doubled[2 * k + 1] = (lanes.w[k] != 0) ? on_color : off_color;
+    }
+    // Then copy the scratch buffer out to the caller's destination.
+__DECL_VECTORIZED_LOOP
+    for(int k = 0; k < 16; k++) {
+        dst[k] = doubled[k];
+    }
+}
+
+// Writes 8 bytes to dst: for each of the eight lanes of tbl->plane_table[src],
+// off_color when the lane is zero, otherwise on_color.
+inline void ConvertByteToMonochromeUint8Cond(uint8_t src, uint8_t* dst, _bit_trans_table_t* tbl, uint8_t on_color, uint8_t off_color)
+{
+    uint16_vec8_t* entries = (uint16_vec8_t*)__builtin_assume_aligned(&(tbl->plane_table[0]), sizeof(uint16_vec8_t));
+    __DECL_ALIGNED(16) uint16_vec8_t lanes;
+
+    lanes = entries[src];
+__DECL_VECTORIZED_LOOP
+    for(int k = 0; k < 8; k++) {
+        dst[k] = (lanes.w[k] != 0) ? on_color : off_color;
+    }
+}
+
+void DLL_PREFIX PrepareBitTransTableUint16(_bit_trans_table_t *tbl, uint16_t on_val, uint16_t off_val); // Declaration only; presumably fills tbl with on_val/off_val lanes — TODO confirm in the definition.
+void DLL_PREFIX PrepareBitTransTableScrnType(_bit_trans_table_scrn_t *tbl, scrntype_t on_val, scrntype_t off_val); // Same parameters for the scrntype table variant.
+void DLL_PREFIX PrepareReverseBitTransTableUint16(_bit_trans_table_t *tbl, uint16_t on_val, uint16_t off_val); // presumably the same with reversed lane order — TODO confirm in the definition.
+void DLL_PREFIX PrepareReverseBitTransTableScrnType(_bit_trans_table_scrn_t *tbl, scrntype_t on_val, scrntype_t off_val); // Same parameters for the scrntype table variant.
+
+void DLL_PREFIX Render8Colors_Line(_render_command_data_t *src, scrntype_t *dst, scrntype_t *dst2, bool scan_line); // Declaration only; role of dst2/scan_line is not visible here — confirm in the definition.
+
+void DLL_PREFIX Render16Colors_Line(_render_command_data_t *src, scrntype_t *dst, scrntype_t *dst2, bool scan_line); // Declaration only; same parameter list as Render8Colors_Line.
+void DLL_PREFIX Render2NColors_Line(_render_command_data_t *src, scrntype_t *dst, scrntype_t* dst2, bool scan_line, int planes); // Takes an explicit plane count in addition.
+
+void DLL_PREFIX Convert8ColorsToByte_Line(_render_command_data_t *src, uint8_t *dst); // Declaration only; output goes to a uint8_t buffer instead of scrntype_t.
+void DLL_PREFIX Convert2NColorsToByte_Line(_render_command_data_t *src, uint8_t *dst, int planes); // Takes an explicit plane count in addition.
+void DLL_PREFIX Convert2NColorsToByte_LineZoom2(_render_command_data_t *src, uint8_t *dst, int planes); // presumably the 2x-zoom variant — TODO confirm in the definition.
+
+// Returns `value` with its eight bytes exchanged end-for-end.
+// The pair64_t byte fields are swapped pairwise: l<->h7, h<->h6, h2<->h5, h3<->h4.
+// Local and parameter names deliberately avoid the reserved `__` prefix;
+// `__in` in particular is a SAL annotation macro in Windows SDK headers.
+inline uint64_t ExchangeEndianU64(uint64_t value)
+{
+    pair64_t from, to;
+    from.q = value;
+    to.b.h7 = from.b.l;
+    to.b.h6 = from.b.h;
+    to.b.h5 = from.b.h2;
+    to.b.h4 = from.b.h3;
+    to.b.h3 = from.b.h4;
+    to.b.h2 = from.b.h5;
+    to.b.h = from.b.h6;
+    to.b.l = from.b.h7;
+    return to.q;
+}
+
+// Returns `value` with its eight bytes exchanged end-for-end, read back
+// through the signed member `sq` of pair64_t.
+// The pair64_t byte fields are swapped pairwise: l<->h7, h<->h6, h2<->h5, h3<->h4.
+// Local and parameter names deliberately avoid the reserved `__` prefix;
+// `__in` in particular is a SAL annotation macro in Windows SDK headers.
+inline int64_t ExchangeEndianS64(uint64_t value)
+{
+    pair64_t from, to;
+    from.q = value;
+    to.b.h7 = from.b.l;
+    to.b.h6 = from.b.h;
+    to.b.h5 = from.b.h2;
+    to.b.h4 = from.b.h3;
+    to.b.h3 = from.b.h4;
+    to.b.h2 = from.b.h5;
+    to.b.h = from.b.h6;
+    to.b.l = from.b.h7;
+    return to.sq;
+}
+// Returns `value` with its four bytes exchanged end-for-end.
+// The pair32_t byte fields are swapped pairwise: l<->h3, h<->h2.
+// Local and parameter names deliberately avoid the reserved `__` prefix;
+// `__in` in particular is a SAL annotation macro in Windows SDK headers.
+inline uint32_t ExchangeEndianU32(uint32_t value)
+{
+    pair32_t from, to;
+    from.d = value;
+    to.b.h3 = from.b.l;
+    to.b.h2 = from.b.h;
+    to.b.h = from.b.h2;
+    to.b.l = from.b.h3;
+    return to.d;
+}
+
+// Returns `value` with its four bytes exchanged end-for-end, read back
+// through the signed member `sd` of pair32_t.
+// The pair32_t byte fields are swapped pairwise: l<->h3, h<->h2.
+// Local and parameter names deliberately avoid the reserved `__` prefix;
+// `__in` in particular is a SAL annotation macro in Windows SDK headers.
+inline int32_t ExchangeEndianS32(uint32_t value)
+{
+    pair32_t from, to;
+    from.d = value;
+    to.b.h3 = from.b.l;
+    to.b.h2 = from.b.h;
+    to.b.h = from.b.h2;
+    to.b.l = from.b.h3;
+    return to.sd;
+}
+
+// Returns `value` with its two bytes exchanged (pair16_t fields l<->h).
+// Local and parameter names deliberately avoid the reserved `__` prefix;
+// `__in` in particular is a SAL annotation macro in Windows SDK headers.
+inline uint16_t ExchangeEndianU16(uint16_t value)
+{
+    pair16_t from, to;
+    from.u16 = value;
+    to.b.h = from.b.l;
+    to.b.l = from.b.h;
+    return to.u16;
+}
+
+// Returns `value` with its two bytes exchanged (pair16_t fields l<->h),
+// read back through the signed member `s16`.
+// Local and parameter names deliberately avoid the reserved `__` prefix;
+// `__in` in particular is a SAL annotation macro in Windows SDK headers.
+inline int16_t ExchangeEndianS16(uint16_t value)
+{
+    pair16_t from, to;
+    from.u16 = value;
+    to.b.h = from.b.l;
+    to.b.l = from.b.h;
+    return to.s16;
+}
+