+#include "fileio.h"
+#include <cerrno>
+
+#if defined(__MINGW32__) || defined(__MINGW64__) // MinGW toolchains only.
+ extern DWORD GetLongPathName(LPCTSTR lpszShortPath, LPTSTR lpszLongPath, DWORD cchBuffer); // Local prototype of the Win32 API; presumably the MinGW headers in use do not declare it -- TODO confirm.
+#endif
+#if defined(_USE_QT) // The globals below exist only in Qt builds.
+ std::string DLL_PREFIX cpp_homedir; // Prefix under which get_application_path() creates "CommonSourceCodeProject"; it is concatenated without a separator, so presumably it ends with a path delimiter -- TODO confirm.
+ std::string DLL_PREFIX my_procname; // Name of the per-program sub-directory appended by get_application_path().
+ std::string DLL_PREFIX sRssDir; // Not referenced in the visible part of this file; presumably a resource directory -- TODO confirm.
+#endif
+
+// Start-up hook of the common layer.
+// Its only job is to query the initial current path once, so that the path is
+// obtained when the software starts.
+void DLL_PREFIX common_initialize()
+{
+    // The returned pointer is not needed here; it is deliberately discarded.
+    (void)get_initial_current_path();
+}
+
+// Returns x unchanged when __LITTLE_ENDIAN__ is defined; otherwise returns x
+// with its four bytes in reversed order (byte 0 <-> byte 3, byte 1 <-> byte 2).
+uint32_t DLL_PREFIX EndianToLittle_DWORD(uint32_t x)
+{
+#if !defined(__LITTLE_ENDIAN__)
+    const uint32_t lowest_to_top   = (x & 0x000000ff) << 24;
+    const uint32_t second_to_third = (x & 0x0000ff00) << 8;
+    const uint32_t third_to_second = (x & 0x00ff0000) >> 8;
+    const uint32_t top_to_lowest   = (x & 0xff000000) >> 24;
+    return lowest_to_top | second_to_third | third_to_second | top_to_lowest;
+#else
+    return x;
+#endif
+}
+
+// Returns x unchanged when __LITTLE_ENDIAN__ is defined; otherwise returns x
+// with its two bytes exchanged.
+uint16_t DLL_PREFIX EndianToLittle_WORD(uint16_t x)
+{
+#if !defined(__LITTLE_ENDIAN__)
+    const uint16_t low_byte  = x & 0x00ff;
+    const uint16_t high_byte = (x & 0xff00) >> 8;
+    return (uint16_t)((low_byte << 8) | high_byte);
+#else
+    return x;
+#endif
+}
+
+// Counterpart of EndianToLittle_DWORD(): identity when __LITTLE_ENDIAN__ is
+// defined, otherwise a full 32-bit byte reversal.
+uint32_t DLL_PREFIX EndianFromLittle_DWORD(uint32_t x)
+{
+#if !defined(__LITTLE_ENDIAN__)
+    const uint32_t lowest_to_top   = (x & 0x000000ff) << 24;
+    const uint32_t second_to_third = (x & 0x0000ff00) << 8;
+    const uint32_t third_to_second = (x & 0x00ff0000) >> 8;
+    const uint32_t top_to_lowest   = (x & 0xff000000) >> 24;
+    return lowest_to_top | second_to_third | third_to_second | top_to_lowest;
+#else
+    return x;
+#endif
+}
+
+// Counterpart of EndianToLittle_WORD(): identity when __LITTLE_ENDIAN__ is
+// defined, otherwise the two bytes of x are exchanged.
+uint16_t DLL_PREFIX EndianFromLittle_WORD(uint16_t x)
+{
+#if !defined(__LITTLE_ENDIAN__)
+    const uint16_t low_byte  = x & 0x00ff;
+    const uint16_t high_byte = (x & 0xff00) >> 8;
+    return (uint16_t)((low_byte << 8) | high_byte);
+#else
+    return x;
+#endif
+}
+
+
+// Returns x unchanged when __BIG_ENDIAN__ is defined; otherwise returns x with
+// its four bytes in reversed order.
+uint32_t DLL_PREFIX EndianToBig_DWORD(uint32_t x)
+{
+#if !defined(__BIG_ENDIAN__)
+    const uint32_t lowest_to_top   = (x & 0x000000ff) << 24;
+    const uint32_t second_to_third = (x & 0x0000ff00) << 8;
+    const uint32_t third_to_second = (x & 0x00ff0000) >> 8;
+    const uint32_t top_to_lowest   = (x & 0xff000000) >> 24;
+    return lowest_to_top | second_to_third | third_to_second | top_to_lowest;
+#else
+    return x;
+#endif
+}
+
+// Returns x unchanged when __BIG_ENDIAN__ is defined; otherwise returns x with
+// its two bytes exchanged.
+uint16_t DLL_PREFIX EndianToBig_WORD(uint16_t x)
+{
+#if !defined(__BIG_ENDIAN__)
+    const uint16_t low_byte  = x & 0x00ff;
+    const uint16_t high_byte = (x & 0xff00) >> 8;
+    return (uint16_t)((low_byte << 8) | high_byte);
+#else
+    return x;
+#endif
+}
+
+// Counterpart of EndianToBig_DWORD(): identity when __BIG_ENDIAN__ is defined,
+// otherwise a full 32-bit byte reversal.
+uint32_t DLL_PREFIX EndianFromBig_DWORD(uint32_t x)
+{
+#if !defined(__BIG_ENDIAN__)
+    const uint32_t lowest_to_top   = (x & 0x000000ff) << 24;
+    const uint32_t second_to_third = (x & 0x0000ff00) << 8;
+    const uint32_t third_to_second = (x & 0x00ff0000) >> 8;
+    const uint32_t top_to_lowest   = (x & 0xff000000) >> 24;
+    return lowest_to_top | second_to_third | third_to_second | top_to_lowest;
+#else
+    return x;
+#endif
+}
+
+// Counterpart of EndianToBig_WORD(): identity when __BIG_ENDIAN__ is defined,
+// otherwise the two bytes of x are exchanged.
+uint16_t DLL_PREFIX EndianFromBig_WORD(uint16_t x)
+{
+#if !defined(__BIG_ENDIAN__)
+    const uint16_t low_byte  = x & 0x00ff;
+    const uint16_t high_byte = (x & 0xff00) >> 8;
+    return (uint16_t)((low_byte << 8) | high_byte);
+#else
+    return x;
+#endif
+}
+
+
+#ifndef _MSC_VER
+// Larger of two signed ints; b is returned when both are equal.
+int DLL_PREFIX max(int a, int b)
+{
+    return (a > b) ? a : b;
+}
+
+
+// Larger of an unsigned and a signed int, returned as unsigned.
+// A negative b can never be the larger value, so a is returned in that case.
+unsigned DLL_PREFIX int max(unsigned int a, int b)
+{
+    // Treating a negative b as 0 yields the same answer as returning a
+    // directly: every unsigned a is >= 0.
+    const unsigned int b_clamped = (b < 0) ? 0u : (unsigned int)b;
+    return (a > b_clamped) ? a : b_clamped;
+}
+
+// Larger of a signed and an unsigned int, returned as unsigned.
+// A negative a can never be the larger value, so b is returned in that case.
+unsigned DLL_PREFIX int max(int a, unsigned int b)
+{
+    // Treating a negative a as 0 yields the same answer as returning b
+    // directly: every unsigned b is >= 0.
+    const unsigned int a_clamped = (a < 0) ? 0u : (unsigned int)a;
+    return (a_clamped > b) ? a_clamped : b;
+}
+
+// Larger of two unsigned ints.
+unsigned int DLL_PREFIX max(unsigned int a, unsigned int b)
+{
+    return (a > b) ? a : b;
+}
+
+// Smaller of two signed ints; b is returned when both are equal.
+int DLL_PREFIX min(int a, int b)
+{
+    return (a < b) ? a : b;
+}
+
+// Smaller of an unsigned and a signed int, returned as int.
+// b is the answer when it is negative, when a does not fit into an int, or
+// when a (as int) is not below b.
+int DLL_PREFIX min(unsigned int a, int b)
+{
+    // Short-circuit order matters: a is only cast to int once it is known to
+    // be <= INT_MAX.
+    const bool b_is_smaller = (b < 0) || (a > (unsigned int)INT_MAX) || !((int)a < b);
+    return b_is_smaller ? b : (int)a;
+}
+
+// Smaller of a signed and an unsigned int, returned as int.
+// a is the answer when it is negative, when b does not fit into an int, or
+// when a is below b (as int).
+int DLL_PREFIX min(int a, unsigned int b)
+{
+    // Short-circuit order matters: b is only cast to int once it is known to
+    // be <= INT_MAX.
+    if((a < 0) || (b > (unsigned int)INT_MAX) || (a < (int)b)) {
+        return a;
+    }
+    return (int)b;
+}
+
+// Smaller of two unsigned ints.
+unsigned int DLL_PREFIX min(unsigned int a, unsigned int b)
+{
+    return (a < b) ? a : b;
+}
+#endif
+
+#ifndef SUPPORT_SECURE_FUNCTIONS
+//errno_t my_tfopen_s(FILE** pFile, const _TCHAR *filename, const _TCHAR *mode)
+//{
+// if((*pFile = _tfopen(filename, mode)) != NULL) {
+// return 0;
+// } else {
+// return errno;
+// }
+//}
+
+// Fallback for _tcscat_s() on toolchains without the secure CRT.
+// Appends strSource to the NUL-terminated string held in strDestination and
+// never writes past numberOfElements elements (terminator included).
+//   strDestination   : buffer that already holds a NUL-terminated string
+//   numberOfElements : capacity of strDestination, counted in _TCHAR elements
+//   strSource        : NUL-terminated string to append
+// Returns 0 on success, EINVAL when an argument is NULL, the capacity is 0 or
+// the destination has no terminator inside the buffer, and ERANGE when the
+// result did not fit. On ERANGE the destination keeps as much text as fits
+// (the previous implementation ignored the capacity and overflowed instead).
+errno_t DLL_PREFIX my_tcscat_s(_TCHAR *strDestination, size_t numberOfElements, const _TCHAR *strSource)
+{
+    if(strDestination == NULL || strSource == NULL || numberOfElements == 0) {
+        return EINVAL;
+    }
+    // Locate the current end of the destination string, staying inside the buffer.
+    size_t pos = 0;
+    while(pos < numberOfElements && strDestination[pos] != '\0') {
+        pos++;
+    }
+    if(pos >= numberOfElements) {
+        return EINVAL; // no terminator found within the buffer
+    }
+    // Append while one element remains free for the terminator.
+    size_t i = 0;
+    while(strSource[i] != '\0' && (pos + 1) < numberOfElements) {
+        strDestination[pos++] = strSource[i++];
+    }
+    strDestination[pos] = '\0';
+    return (strSource[i] == '\0') ? 0 : ERANGE;
+}
+
+// Fallback for strcpy_s() on toolchains without the secure CRT.
+// Copies strSource into strDestination and never writes past
+// numberOfElements bytes (terminator included).
+//   strDestination   : output buffer
+//   numberOfElements : capacity of strDestination in bytes
+//   strSource        : NUL-terminated string to copy
+// Returns 0 on success, EINVAL for a NULL destination, zero capacity or NULL
+// source (the destination is emptied when possible), and ERANGE when the
+// source did not fit. On ERANGE the destination holds the truncated,
+// NUL-terminated prefix (the previous implementation overflowed instead).
+errno_t DLL_PREFIX my_strcpy_s(char *strDestination, size_t numberOfElements, const char *strSource)
+{
+    if(strDestination == NULL || numberOfElements == 0) {
+        return EINVAL;
+    }
+    if(strSource == NULL) {
+        strDestination[0] = '\0';
+        return EINVAL;
+    }
+    size_t i = 0;
+    // Keep one byte free for the terminator.
+    while(strSource[i] != '\0' && (i + 1) < numberOfElements) {
+        strDestination[i] = strSource[i];
+        i++;
+    }
+    strDestination[i] = '\0';
+    return (strSource[i] == '\0') ? 0 : ERANGE;
+}
+
+// Fallback for _tcscpy_s() on toolchains without the secure CRT.
+// Copies strSource into strDestination and never writes past
+// numberOfElements elements (terminator included).
+//   strDestination   : output buffer
+//   numberOfElements : capacity of strDestination, counted in _TCHAR elements
+//   strSource        : NUL-terminated string to copy
+// Returns 0 on success, EINVAL for a NULL destination, zero capacity or NULL
+// source (the destination is emptied when possible), and ERANGE when the
+// source did not fit. On ERANGE the destination holds the truncated,
+// NUL-terminated prefix (the previous implementation overflowed instead).
+errno_t DLL_PREFIX my_tcscpy_s(_TCHAR *strDestination, size_t numberOfElements, const _TCHAR *strSource)
+{
+    if(strDestination == NULL || numberOfElements == 0) {
+        return EINVAL;
+    }
+    if(strSource == NULL) {
+        strDestination[0] = '\0';
+        return EINVAL;
+    }
+    size_t i = 0;
+    // Keep one element free for the terminator.
+    while(strSource[i] != '\0' && (i + 1) < numberOfElements) {
+        strDestination[i] = strSource[i];
+        i++;
+    }
+    strDestination[i] = '\0';
+    return (strSource[i] == '\0') ? 0 : ERANGE;
+}
+
+// Fallback for strncpy_s() on toolchains without the secure CRT.
+// Copies at most count characters of strSource into strDestination, always
+// NUL-terminates the result and never writes past numberOfElements bytes.
+//   strDestination   : output buffer
+//   numberOfElements : capacity of strDestination in bytes
+//   strSource        : string to copy from
+//   count            : maximum number of characters to copy
+// Returns 0 on success, EINVAL for a NULL destination, zero capacity or NULL
+// source, and ERANGE when the buffer (not count) cut the copy short.
+// The previous implementation forwarded to strncpy(), which ignores the
+// capacity, leaves the result unterminated when the source has count or more
+// characters, and zero-fills up to count even beyond the buffer.
+errno_t DLL_PREFIX my_strncpy_s(char *strDestination, size_t numberOfElements, const char *strSource, size_t count)
+{
+    if(strDestination == NULL || numberOfElements == 0) {
+        return EINVAL;
+    }
+    if(strSource == NULL) {
+        strDestination[0] = '\0';
+        return EINVAL;
+    }
+    size_t i = 0;
+    while(i < count && strSource[i] != '\0' && (i + 1) < numberOfElements) {
+        strDestination[i] = strSource[i];
+        i++;
+    }
+    strDestination[i] = '\0';
+    // strSource[i] is only inspected when i < count, i.e. when the loop was
+    // stopped by the end of the source or by the buffer limit.
+    return (i < count && strSource[i] != '\0') ? ERANGE : 0;
+}
+
+// Fallback for _tcsncpy_s() on toolchains without the secure CRT.
+// Copies at most count characters of strSource into strDestination, always
+// NUL-terminates the result and never writes past numberOfElements elements.
+//   strDestination   : output buffer
+//   numberOfElements : capacity of strDestination, counted in _TCHAR elements
+//   strSource        : string to copy from
+//   count            : maximum number of characters to copy
+// Returns 0 on success, EINVAL for a NULL destination, zero capacity or NULL
+// source, and ERANGE when the buffer (not count) cut the copy short.
+// The previous implementation forwarded to _tcsncpy(), ignoring the capacity
+// and not guaranteeing termination.
+errno_t DLL_PREFIX my_tcsncpy_s(_TCHAR *strDestination, size_t numberOfElements, const _TCHAR *strSource, size_t count)
+{
+    if(strDestination == NULL || numberOfElements == 0) {
+        return EINVAL;
+    }
+    if(strSource == NULL) {
+        strDestination[0] = '\0';
+        return EINVAL;
+    }
+    size_t i = 0;
+    while(i < count && strSource[i] != '\0' && (i + 1) < numberOfElements) {
+        strDestination[i] = strSource[i];
+        i++;
+    }
+    strDestination[i] = '\0';
+    // strSource[i] is only inspected when i < count.
+    return (i < count && strSource[i] != '\0') ? ERANGE : 0;
+}
+
+// Fallback for strtok_s() on toolchains without the secure CRT.
+// Splits a string into tokens separated by any character of strDelimit,
+// keeping the scan position in *context so that several tokenizations can be
+// in progress at once.
+//   strToken   : string to tokenize on the first call, NULL to continue
+//   strDelimit : set of delimiter characters
+//   context    : caller-owned storage for the scan position
+// Returns the next token (the input is modified in place) or NULL when no
+// token is left. The previous implementation ignored context and relied on
+// strtok()'s hidden global state; callers that pass context == NULL still get
+// exactly that legacy behaviour.
+char *DLL_PREFIX my_strtok_s(char *strToken, const char *strDelimit, char **context)
+{
+    if(context == NULL) {
+        return strtok(strToken, strDelimit);
+    }
+    char *cursor = (strToken != NULL) ? strToken : *context;
+    if(cursor == NULL || strDelimit == NULL) {
+        return NULL;
+    }
+    // Skip delimiters in front of the token.
+    cursor += strspn(cursor, strDelimit);
+    if(*cursor == '\0') {
+        *context = cursor;
+        return NULL;
+    }
+    char *token = cursor;
+    // Find the end of the token and cut the string there.
+    cursor += strcspn(cursor, strDelimit);
+    if(*cursor != '\0') {
+        *cursor = '\0';
+        cursor++;
+    }
+    *context = cursor;
+    return token;
+}
+
+// Helper for my_tcstok_s(): true when c is one of the characters in delimiters.
+static bool my_tcstok_s_is_delimiter(_TCHAR c, const char *delimiters)
+{
+    for(const char *p = delimiters; *p != '\0'; p++) {
+        if(c == (_TCHAR)(*p)) {
+            return true;
+        }
+    }
+    return false;
+}
+
+// Fallback for _tcstok_s() on toolchains without the secure CRT.
+// Splits a string into tokens separated by any character of strDelimit,
+// keeping the scan position in *context.
+//   strToken   : string to tokenize on the first call, NULL to continue
+//   strDelimit : set of delimiter characters
+//   context    : caller-owned storage for the scan position
+// Returns the next token (the input is modified in place) or NULL when no
+// token is left. The previous implementation ignored context and relied on
+// _tcstok()'s hidden global state; callers that pass context == NULL still get
+// exactly that legacy behaviour.
+_TCHAR *DLL_PREFIX my_tcstok_s(_TCHAR *strToken, const char *strDelimit, _TCHAR **context)
+{
+    if(context == NULL) {
+        return _tcstok(strToken, strDelimit);
+    }
+    _TCHAR *cursor = (strToken != NULL) ? strToken : *context;
+    if(cursor == NULL || strDelimit == NULL) {
+        return NULL;
+    }
+    // Skip delimiters in front of the token.
+    while(*cursor != '\0' && my_tcstok_s_is_delimiter(*cursor, strDelimit)) {
+        cursor++;
+    }
+    if(*cursor == '\0') {
+        *context = cursor;
+        return NULL;
+    }
+    _TCHAR *token = cursor;
+    // Find the end of the token and cut the string there.
+    while(*cursor != '\0' && !my_tcstok_s_is_delimiter(*cursor, strDelimit)) {
+        cursor++;
+    }
+    if(*cursor != '\0') {
+        *cursor = '\0';
+        cursor++;
+    }
+    *context = cursor;
+    return token;
+}
+
+// Fallback for sprintf_s(): formats into buffer through vsnprintf(), limited
+// to sizeOfBuffer bytes, and hands back vsnprintf()'s return value unchanged.
+int DLL_PREFIX my_sprintf_s(char *buffer, size_t sizeOfBuffer, const char *format, ...)
+{
+    va_list args;
+    va_start(args, format);
+    const int written = vsnprintf(buffer, sizeOfBuffer, format, args);
+    va_end(args);
+    return written;
+}
+
+// Fallback for swprintf_s(): formats wide text into buffer through
+// vswprintf(), limited to sizeOfBuffer elements, and hands back vswprintf()'s
+// return value unchanged.
+int DLL_PREFIX my_swprintf_s(wchar_t *buffer, size_t sizeOfBuffer, const wchar_t *format, ...)
+{
+    va_list args;
+    va_start(args, format);
+    const int written = vswprintf(buffer, sizeOfBuffer, format, args);
+    va_end(args);
+    return written;
+}
+
+// Fallback for _stprintf_s(): formats into buffer through vsnprintf(), limited
+// to sizeOfBuffer, and hands back vsnprintf()'s return value unchanged.
+// Because vsnprintf() is used, this only builds where _TCHAR is char.
+int DLL_PREFIX my_stprintf_s(_TCHAR *buffer, size_t sizeOfBuffer, const _TCHAR *format, ...)
+{
+    va_list args;
+    va_start(args, format);
+    const int written = vsnprintf(buffer, sizeOfBuffer, format, args);
+    va_end(args);
+    return written;
+}
+
+// Fallback for vsprintf_s(): forwards to vsnprintf() with the capacity
+// expressed in bytes and returns its result.
+int DLL_PREFIX my_vsprintf_s(char *buffer, size_t numberOfElements, const char *format, va_list argptr)
+{
+    const size_t capacity_in_bytes = numberOfElements * sizeof(char);
+    return vsnprintf(buffer, capacity_in_bytes, format, argptr);
+}
+
+// Fallback for _vstprintf_s(): forwards to vsnprintf() with the capacity
+// computed as numberOfElements * sizeof(_TCHAR) and returns its result.
+int DLL_PREFIX my_vstprintf_s(_TCHAR *buffer, size_t numberOfElements, const _TCHAR *format, va_list argptr)
+{
+    const size_t capacity_in_bytes = numberOfElements * sizeof(_TCHAR);
+    return vsnprintf(buffer, capacity_in_bytes, format, argptr);
+}
+#endif
+
+//#ifdef USE_FAST_MEMCPY
+
+// Thin wrapper around memcpy(): copies len bytes from src to dst and returns
+// what memcpy() returns.
+void DLL_PREFIX *my_memcpy(void *dst, void *src, size_t len)
+{
+    void *const copied_to = memcpy(dst, src, len);
+    return copied_to;
+}
+//#endif
+
+
+#ifndef _WIN32
+// Non-Windows replacement for WritePrivateProfileString(): stores
+// "lpKeyName=lpString" in section "[lpAppName]" of the INI file lpFileName.
+//
+// Existing file: it is copied line by line into "<lpFileName>.$$$"; the key is
+// replaced where found, added at the end of its section when the section
+// exists without the key, or appended together with a new section header.
+// The original is then removed and the temporary file renamed over it.
+// Missing/unreadable file: a new file holding just the section and the key is
+// written.
+//
+// Returns TRUE when the value was written (and, for an existing file, the
+// remove + rename succeeded), FALSE otherwise.
+// Fix: the "new file" path used to return FALSE even after a successful write.
+BOOL DLL_PREFIX MyWritePrivateProfileString(LPCTSTR lpAppName, LPCTSTR lpKeyName, LPCTSTR lpString, LPCTSTR lpFileName)
+{
+    BOOL result = FALSE;
+    FILEIO* fio_i = new FILEIO();
+    if(fio_i->Fopen(lpFileName, FILEIO_READ_ASCII)) {
+        char tmp_path[_MAX_PATH];
+        my_sprintf_s(tmp_path, _MAX_PATH, "%s.$$$", lpFileName);
+        FILEIO* fio_o = new FILEIO();
+        if(fio_o->Fopen(tmp_path, FILEIO_WRITE_ASCII)) {
+            bool in_section = false;
+            char section[1024], line[1024], *equal;
+            my_sprintf_s(section, 1024, "[%s]", lpAppName);
+            while(fio_i->Fgets(line, 1024) != NULL && strlen(line) > 0) {
+                // Drop the trailing newline; it is re-added when the line is written.
+                if(line[strlen(line) - 1] == '\n') {
+                    line[strlen(line) - 1] = '\0';
+                }
+                if(!result) {
+                    if(line[0] == '[') {
+                        if(in_section) {
+                            // Next section begins: the key was absent, so add it
+                            // at the end of the wanted section.
+                            fio_o->Fprintf("%s=%s\n", lpKeyName, lpString);
+                            result = TRUE;
+                        } else if(strcmp(line, section) == 0) {
+                            in_section = true;
+                        }
+                    } else if(in_section && (equal = strstr(line, "=")) != NULL) {
+                        // Temporarily cut the line at '=' to compare the key name.
+                        *equal = '\0';
+                        if(strcmp(line, lpKeyName) == 0) {
+                            fio_o->Fprintf("%s=%s\n", lpKeyName, lpString);
+                            result = TRUE;
+                            continue; // the old "key=value" line is not copied
+                        }
+                        *equal = '=';
+                    }
+                }
+                fio_o->Fprintf("%s\n", line);
+            }
+            if(!result) {
+                // End of file reached without writing the key.
+                if(!in_section) {
+                    fio_o->Fprintf("[%s]\n", lpAppName);
+                }
+                fio_o->Fprintf("%s=%s\n", lpKeyName, lpString);
+                result = TRUE;
+            }
+            fio_o->Fclose();
+        }
+        delete fio_o;
+        fio_i->Fclose();
+        if(result) {
+            // Replace the original by the rewritten copy.
+            if(!(FILEIO::RemoveFile(lpFileName) && FILEIO::RenameFile(tmp_path, lpFileName))) {
+                result = FALSE;
+            }
+        }
+    } else {
+        // No readable file yet: create one containing only this entry.
+        FILEIO* fio_o = new FILEIO();
+        if(fio_o->Fopen(lpFileName, FILEIO_WRITE_ASCII)) {
+            fio_o->Fprintf("[%s]\n", lpAppName);
+            fio_o->Fprintf("%s=%s\n", lpKeyName, lpString);
+            fio_o->Fclose();
+            result = TRUE; // the value has been stored
+        }
+        delete fio_o;
+    }
+    delete fio_i;
+    return result;
+}
+
+
+// Non-Windows replacement for GetPrivateProfileString(): looks up lpKeyName in
+// section "[lpAppName]" of the INI file lpFileName.
+//   lpDefault        : value used when the key is not found (NULL = empty)
+//   lpReturnedString : output buffer, always NUL-terminated on return
+//   nSize            : capacity of lpReturnedString, terminator included
+// Returns the length of the string left in lpReturnedString, or 0 when the
+// output buffer is unusable or the file does not exist (the default has been
+// copied in the latter case).
+// NOTE(review): Win32 returns the default's length for a missing file; the 0
+// is kept for backward compatibility -- confirm whether callers rely on it.
+// Fixes: the FILEIO object is no longer leaked when the file is missing,
+// copies are limited to nSize (long values are truncated), and a NULL or
+// zero-sized output buffer is rejected.
+DWORD DLL_PREFIX MyGetPrivateProfileString(LPCTSTR lpAppName, LPCTSTR lpKeyName, LPCTSTR lpDefault, LPTSTR lpReturnedString, DWORD nSize, LPCTSTR lpFileName)
+{
+    _TCHAR *lpp = (_TCHAR *)lpReturnedString;
+    if(lpp == NULL || nSize == 0) {
+        return 0; // nowhere to store even the terminator
+    }
+    // Start from the default; it stays in place unless the key is found.
+    if(lpDefault != NULL) {
+        strncpy(lpp, lpDefault, nSize - 1);
+        lpp[nSize - 1] = '\0';
+    } else {
+        lpp[0] = '\0';
+    }
+    FILEIO* fio = new FILEIO();
+    if(!(fio->IsFileExisting(lpFileName))) {
+        delete fio;
+        return 0;
+    }
+    if(fio->Fopen(lpFileName, FILEIO_READ_ASCII)) {
+        bool in_section = false;
+        char section[1024], line[1024], *equal;
+        my_sprintf_s(section, 1024, "[%s]", lpAppName);
+        while(fio->Fgets(line, 1024) != NULL && strlen(line) > 0) {
+            if(line[strlen(line) - 1] == '\n') {
+                line[strlen(line) - 1] = '\0';
+            }
+            if(line[0] == '[') {
+                if(in_section) {
+                    break; // left the wanted section without finding the key
+                } else if(strcmp(line, section) == 0) {
+                    in_section = true;
+                }
+            } else if(in_section && (equal = strstr(line, "=")) != NULL) {
+                // Cut the line at '=': left part is the key, right part the value.
+                *equal = '\0';
+                if(strcmp(line, lpKeyName) == 0) {
+                    strncpy(lpp, equal + 1, nSize - 1);
+                    lpp[nSize - 1] = '\0';
+                    break;
+                }
+            }
+        }
+        fio->Fclose();
+    }
+    delete fio;
+    return strlen(lpp);
+}
+
+// Non-Windows replacement for GetPrivateProfileInt(): reads the key as text
+// through MyGetPrivateProfileString() (passing nDefault rendered as decimal
+// text as the default) and converts the result with strtol(), base 10.
+// An empty result yields nDefault.
+UINT DLL_PREFIX MyGetPrivateProfileInt(LPCTSTR lpAppName, LPCTSTR lpKeyName, INT nDefault, LPCTSTR lpFileName)
+{
+    char default_text[128];
+    char value_text[128];
+    memset(value_text, 0x00, sizeof(value_text));
+    memset(default_text, 0x00, sizeof(default_text));
+    snprintf(default_text, 128, "%d", nDefault);
+    MyGetPrivateProfileString(lpAppName,lpKeyName, default_text, value_text, 128, lpFileName);
+
+    if(value_text[0] == '\0') {
+        return nDefault;
+    }
+    const int parsed = strtol(value_text, NULL, 10);
+    return parsed;
+}
+#endif
+
+#if defined(_RGB555)
+// RGB555 build: scales each 0..255 component to 0..31 and packs the result as
+// red in bits 10-14, green in bits 5-9 and blue in bits 0-4.
+scrntype_t DLL_PREFIX RGB_COLOR(uint32_t r, uint32_t g, uint32_t b)
+{
+    const scrntype_t red5   = ((scrntype_t)r * 0x1f) / 0xff;
+    const scrntype_t green5 = ((scrntype_t)g * 0x1f) / 0xff;
+    const scrntype_t blue5  = ((scrntype_t)b * 0x1f) / 0xff;
+    scrntype_t packed = red5 << 10;
+    packed |= green5 << 5;
+    packed |= blue5;
+    return packed;
+}
+
+// RGB555 build: the pixel format has no alpha channel, so a is ignored and the
+// colour is packed exactly like RGB_COLOR(r, g, b).
+// Fix: b was declared with the non-standard type name "uint"; it now uses
+// uint32_t like the other components and like the RGB565 variant.
+scrntype_t DLL_PREFIX RGBA_COLOR(uint32_t r, uint32_t g, uint32_t b, uint32_t a)
+{
+    (void)a; // no alpha bits in this format
+    return RGB_COLOR(r, g, b);
+}
+
+// RGB555 build: extracts the 5-bit red field (bits 10-14) and rescales it from
+// 0..31 to 0..255.
+uint8_t DLL_PREFIX R_OF_COLOR(scrntype_t c)
+{
+    const scrntype_t level5 = (c >> 10) & 0x1f;
+    const scrntype_t level8 = (level5 * 0xff) / 0x1f;
+    return (uint8_t)level8;
+}
+
+// RGB555 build: extracts the 5-bit green field (bits 5-9) and rescales it from
+// 0..31 to 0..255.
+uint8_t DLL_PREFIX G_OF_COLOR(scrntype_t c)
+{
+    const scrntype_t level5 = (c >> 5) & 0x1f;
+    const scrntype_t level8 = (level5 * 0xff) / 0x1f;
+    return (uint8_t)level8;
+}
+
+// RGB555 build: extracts the 5-bit blue field (bits 0-4) and rescales it from
+// 0..31 to 0..255.
+uint8_t DLL_PREFIX B_OF_COLOR(scrntype_t c)
+{
+    const scrntype_t level5 = (c >> 0) & 0x1f;
+    const scrntype_t level8 = (level5 * 0xff) / 0x1f;
+    return (uint8_t)level8;
+}
+
+// RGB555 build: the pixel carries no alpha, so every colour reports 255.
+uint8_t DLL_PREFIX A_OF_COLOR(scrntype_t c)
+{
+    (void)c; // the pixel value does not influence the result
+    const uint8_t fully_opaque = 0xff;
+    return fully_opaque;
+}
+#elif defined(_RGB565)
+// RGB565 build: scales red and blue from 0..255 to 0..31 and green to 0..63,
+// then packs red in bits 11-15, green in bits 5-10 and blue in bits 0-4.
+scrntype_t DLL_PREFIX RGB_COLOR(uint32_t r, uint32_t g, uint32_t b)
+{
+    const scrntype_t red5   = ((scrntype_t)r * 0x1f) / 0xff;
+    const scrntype_t green6 = ((scrntype_t)g * 0x3f) / 0xff;
+    const scrntype_t blue5  = ((scrntype_t)b * 0x1f) / 0xff;
+    scrntype_t packed = red5 << 11;
+    packed |= green6 << 5;
+    packed |= blue5;
+    return packed;
+}
+
+// RGB565 build: the pixel format has no alpha channel, so a is ignored and the
+// colour is packed exactly like RGB_COLOR(r, g, b).
+scrntype_t DLL_PREFIX RGBA_COLOR(uint32_t r, uint32_t g, uint32_t b, uint32_t a)
+{
+    (void)a; // no alpha bits in this format
+    const scrntype_t opaque_pixel = RGB_COLOR(r, g, b);
+    return opaque_pixel;
+}
+
+// RGB565 build: extracts the 5-bit red field (bits 11-15) and rescales it from
+// 0..31 to 0..255.
+uint8_t DLL_PREFIX R_OF_COLOR(scrntype_t c)
+{
+    const scrntype_t level5 = (c >> 11) & 0x1f;
+    const scrntype_t level8 = (level5 * 0xff) / 0x1f;
+    return (uint8_t)level8;
+}
+
+// RGB565 build: extracts the 6-bit green field (bits 5-10) and rescales it
+// from 0..63 to 0..255.
+uint8_t DLL_PREFIX G_OF_COLOR(scrntype_t c)
+{
+    const scrntype_t level6 = (c >> 5) & 0x3f;
+    const scrntype_t level8 = (level6 * 0xff) / 0x3f;
+    return (uint8_t)level8;
+}
+
+// RGB565 build: extracts the 5-bit blue field (bits 0-4) and rescales it from
+// 0..31 to 0..255.
+uint8_t DLL_PREFIX B_OF_COLOR(scrntype_t c)
+{
+    const scrntype_t level5 = (c >> 0) & 0x1f;
+    const scrntype_t level8 = (level5 * 0xff) / 0x1f;
+    return (uint8_t)level8;
+}
+
+// RGB565 build: the pixel carries no alpha, so every colour reports 255.
+uint8_t DLL_PREFIX A_OF_COLOR(scrntype_t c)
+{
+    (void)c; // the pixel value does not influence the result
+    const uint8_t fully_opaque = 0xff; // Alpha = 255
+    return fully_opaque;
+}
+#endif
+
+// Builds the 256-entry byte -> 8 x uint16_t expansion table.
+// For byte value v, element j of plane_table[v] becomes on_val when bit (7 - j)
+// of v is set and off_val otherwise, i.e. the most significant bit comes first.
+// A NULL tbl is ignored.
+// Note: the table is strongly recommended to be aligned to
+// sizeof(uint16_vec8_t) (8 x uint16_t = 16 bytes / 128 bits); some compilers
+// may require that alignment when using SIMD128. -- 20181105 K.O
+void DLL_PREFIX PrepareBitTransTableUint16(_bit_trans_table_t *tbl, uint16_t on_val, uint16_t off_val)
+{
+    if(tbl == NULL) return;
+__DECL_VECTORIZED_LOOP
+    for(int value = 0; value < 256; value++) {
+__DECL_VECTORIZED_LOOP
+        for(int slot = 0; slot < 8; slot++) {
+            // Shifting left by slot moves bit (7 - slot) into the 0x80 position.
+            const bool bit_is_set = ((value << slot) & 0x80) != 0;
+            tbl->plane_table[value].w[slot] = bit_is_set ? on_val : off_val;
+        }
+    }
+}
+
+// Builds the 256-entry byte -> 8 x scrntype_t expansion table.
+// For byte value v, element j of plane_table[v] becomes on_val when bit (7 - j)
+// of v is set and off_val otherwise, i.e. the most significant bit comes first.
+// A NULL tbl is ignored.
+// Note: the table is strongly recommended to be aligned to
+// sizeof(scrntype_vec8_t) (8 x scrntype_t); some compilers may require 32-byte
+// (256-bit) or 16-byte (128-bit) alignment when using SIMD256 or SIMD128.
+// -- 20181105 K.O
+void DLL_PREFIX PrepareBitTransTableScrnType(_bit_trans_table_scrn_t *tbl, scrntype_t on_val, scrntype_t off_val)
+{
+    if(tbl == NULL) return;
+__DECL_VECTORIZED_LOOP
+    for(int value = 0; value < 256; value++) {
+__DECL_VECTORIZED_LOOP
+        for(int slot = 0; slot < 8; slot++) {
+            // Shifting left by slot moves bit (7 - slot) into the 0x80 position.
+            const bool bit_is_set = ((value << slot) & 0x80) != 0;
+            tbl->plane_table[value].w[slot] = bit_is_set ? on_val : off_val;
+        }
+    }
+}
+
+// Builds the bit-reversed variant of the byte -> 8 x uint16_t expansion table.
+// For byte value v, element j of plane_table[v] becomes on_val when bit j of v
+// is set and off_val otherwise, i.e. the least significant bit comes first.
+// A NULL tbl is ignored.
+void DLL_PREFIX PrepareReverseBitTransTableUint16(_bit_trans_table_t *tbl, uint16_t on_val, uint16_t off_val)
+{
+    if(tbl == NULL) return;
+__DECL_VECTORIZED_LOOP
+    for(int value = 0; value < 256; value++) {
+__DECL_VECTORIZED_LOOP
+        for(int slot = 0; slot < 8; slot++) {
+            // Shifting right by slot moves bit slot into the 0x01 position.
+            const bool bit_is_set = ((value >> slot) & 0x01) != 0;
+            tbl->plane_table[value].w[slot] = bit_is_set ? on_val : off_val;
+        }
+    }
+}
+
+// Builds the bit-reversed variant of the byte -> 8 x scrntype_t expansion
+// table. For byte value v, element j of plane_table[v] becomes on_val when
+// bit j of v is set and off_val otherwise (least significant bit first).
+// A NULL tbl is ignored.
+void DLL_PREFIX PrepareReverseBitTransTableScrnType(_bit_trans_table_scrn_t *tbl, scrntype_t on_val, scrntype_t off_val)
+{
+    if(tbl == NULL) return;
+__DECL_VECTORIZED_LOOP
+    for(int value = 0; value < 256; value++) {
+__DECL_VECTORIZED_LOOP
+        for(int slot = 0; slot < 8; slot++) {
+            // Shifting right by slot moves bit slot into the 0x01 position.
+            const bool bit_is_set = ((value >> slot) & 0x01) != 0;
+            tbl->plane_table[value].w[slot] = bit_is_set ? on_val : off_val;
+        }
+    }
+}
+
+// Expands `bytes` source bytes to 8 pixels each, using a _bit_trans_table_scrn_t entry as per-pixel bit mask: (entry & on_color) | (~entry & off_color).
+void DLL_PREFIX ConvertByteToPackedPixelByColorTable2(uint8_t *src, scrntype_t* dst, int bytes, _bit_trans_table_scrn_t *tbl, scrntype_t *on_color_table, scrntype_t* off_color_table)
+{
+
+ scrntype_vec8_t tmpd; // table entry of the current byte, later masked with the "on" colours
+ scrntype_vec8_t tmpdd; // bitwise complement of the table entry, later masked with the "off" colours
+ scrntype_vec8_t colors; // scratch copy of the 8 colours consumed in this step
+ scrntype_vec8_t* vt = (scrntype_vec8_t*)__builtin_assume_aligned(&(tbl->plane_table[0]), sizeof(scrntype_vec8_t)); // tbl is not checked for NULL; the table is assumed to be vector-aligned
+
+ uintptr_t disalign = (uintptr_t)dst;
+ disalign = disalign & (sizeof(scrntype_vec8_t) - 1); // Is dst aligned to one whole vector (128 or 256 bits)?
+ if(disalign == 0) {
+ // Yes.
+ scrntype_vec8_t *vdst = (scrntype_vec8_t*)__builtin_assume_aligned(dst, sizeof(scrntype_vec8_t));
+__DECL_VECTORIZED_LOOP
+ for(int i = 0; i < bytes; i++) {
+ tmpd.v = vt[src[i]].v;
+ tmpdd.v = ~tmpd.v;
+
+__DECL_VECTORIZED_LOOP
+ for(int j = 0; j < 8; j++) {
+ colors.w[j] = on_color_table[j];
+ }
+ tmpd.v = tmpd.v & colors.v;
+__DECL_VECTORIZED_LOOP
+ for(int j = 0; j < 8; j++) {
+ colors.w[j] = off_color_table[j];
+ }
+ tmpdd.v = tmpdd.v & colors.v;
+ vdst->v = (tmpd.v | tmpdd.v); // whole-vector store of 8 pixels
+ off_color_table += 8; // both colour tables advance by 8 entries per source byte
+ on_color_table += 8;
+ vdst++;
+ }
+ } else {
+ // Sorry, not aligned.
+__DECL_VECTORIZED_LOOP
+ for(int i = 0; i < bytes; i++) {
+ tmpd.v = vt[src[i]].v;
+ tmpdd.v = ~tmpd.v;
+
+__DECL_VECTORIZED_LOOP
+ for(int j = 0; j < 8; j++) {
+ colors.w[j] = on_color_table[j];
+ }
+ tmpd.v = tmpd.v & colors.v;
+__DECL_VECTORIZED_LOOP
+ for(int j = 0; j < 8; j++) {
+ colors.w[j] = off_color_table[j];
+ }
+ tmpdd.v = tmpdd.v & colors.v;
+ tmpdd.v = tmpdd.v | tmpd.v;
+__DECL_VECTORIZED_LOOP
+ for(int j = 0; j < 8; j++) {
+ dst[j] = tmpdd.w[j]; // element-wise store because dst is not vector-aligned
+ }
+ off_color_table += 8;
+ on_color_table += 8;
+ dst += 8;
+ }
+ }
+}
+
+
+// Converts `bytes` bytes of uint8_t VRAM data into uint16_t pixel patterns: 8 output values per source byte, each table value AND-ed with `mask`.
+// The table must have been filled with "ON_VALUE" : "OFF_VALUE" via PrepareBitTransTableUint16().
+// -- 20181105 K.O
+void DLL_PREFIX ConvertByteToSparceUint16(uint8_t *src, uint16_t* dst, int bytes, _bit_trans_table_t *tbl, uint16_t mask)
+{
+
+ uint16_vec8_t tmpd; // table entry of the current byte after masking
+ uint16_vec8_t* vt = (uint16_vec8_t*)__builtin_assume_aligned(&(tbl->plane_table[0]), sizeof(uint16_vec8_t)); // tbl is not checked for NULL; the table is assumed to be vector-aligned
+
+ uint16_vec8_t __masks; // `mask` replicated into all 8 lanes
+
+__DECL_VECTORIZED_LOOP
+ for(int i = 0; i < 8; i++) {
+ __masks.w[i] = mask;
+ }
+ uintptr_t disalign = (uintptr_t)dst;
+ disalign = disalign & 0x0f; // Is dst aligned to 128 bits (16 bytes)?
+ if(disalign == 0) {
+ // Yes.
+ uint16_vec8_t *vdst = (uint16_vec8_t*)__builtin_assume_aligned(dst, sizeof(uint16_vec8_t));
+__DECL_VECTORIZED_LOOP
+ for(int i = 0; i < bytes; i++) {
+ tmpd.v = vt[src[i]].v;
+ tmpd.v = tmpd.v & __masks.v;
+ vdst->v = tmpd.v; // whole-vector store of 8 values
+ vdst++;
+ }
+ } else {
+ // Sorry, not aligned.
+__DECL_VECTORIZED_LOOP
+ for(int i = 0; i < bytes; i++) {
+ tmpd.v = vt[src[i]].v;
+ tmpd.v = tmpd.v & __masks.v;
+__DECL_VECTORIZED_LOOP
+ for(int j = 0; j < 8; j++) {
+ dst[j] = tmpd.w[j]; // element-wise store because dst is not vector-aligned
+ }
+ dst += 8;
+ }
+ }
+}
+
+// Converts `bytes` bytes of uint8_t VRAM data into uint8_t pixel patterns:
+// 8 output BYTES per source byte, each being the low 8 bits of the table
+// value AND-ed with `mask`.
+// The table must have been filled with "ON_VALUE" : "OFF_VALUE" via
+// PrepareBitTransTableUint16().
+// -- 20181105 K.O
+//   src   : source bytes
+//   dst   : output; declared uint16_t* but written as a packed byte stream
+//   bytes : number of source bytes
+//   tbl   : expansion table (not checked for NULL)
+//   mask  : AND mask applied to every table value
+// Fix: the unaligned path used to store each value into a 16-bit element and
+// advance 16 bytes per source byte, while the aligned path packs 8 bytes; the
+// output layout therefore depended on the alignment of dst. Both paths now
+// produce the packed 8-bytes-per-source-byte layout.
+void DLL_PREFIX ConvertByteToSparceUint8(uint8_t *src, uint16_t* dst, int bytes, _bit_trans_table_t *tbl, uint16_t mask)
+{
+    uint16_vec8_t tmpd;
+    uint16_vec8_t* vt = (uint16_vec8_t*)__builtin_assume_aligned(&(tbl->plane_table[0]), sizeof(uint16_vec8_t));
+
+    uint16_vec8_t __masks;
+    uint8_vec8_t tmpdd;
+
+__DECL_VECTORIZED_LOOP
+    for(int i = 0; i < 8; i++) {
+        __masks.w[i] = mask;
+    }
+    uintptr_t disalign = (uintptr_t)dst;
+    disalign = disalign & 0x07; // Is dst aligned to 64 bits (8 bytes)?
+    if(disalign == 0) {
+        // Yes.
+        uint8_vec8_t *vdst = (uint8_vec8_t*)__builtin_assume_aligned(dst, sizeof(uint8_vec8_t));
+__DECL_VECTORIZED_LOOP
+        for(int i = 0; i < bytes; i++) {
+            tmpd.v = vt[src[i]].v;
+            tmpd.v = tmpd.v & __masks.v;
+__DECL_VECTORIZED_LOOP
+            for(int j = 0; j < 8; j++) {
+                tmpdd.w[j] = (uint8_t)(tmpd.w[j]);
+            }
+            vdst->v = tmpdd.v;
+            vdst++;
+        }
+    } else {
+        // Sorry, not aligned: store byte by byte, same layout as above.
+        uint8_t *bdst = (uint8_t *)dst;
+__DECL_VECTORIZED_LOOP
+        for(int i = 0; i < bytes; i++) {
+            tmpd.v = vt[src[i]].v;
+            tmpd.v = tmpd.v & __masks.v;
+__DECL_VECTORIZED_LOOP
+            for(int j = 0; j < 8; j++) {
+                bdst[j] = (uint8_t)(tmpd.w[j]);
+            }
+            bdst += 8;
+        }
+    }
+}
+
+
+// Expands `bytes` source bytes to 8 pixels each. For every pixel the uint16_t
+// table value selects the colour: 0 picks off_color_table[j], anything else
+// picks on_color_table[j]. Both colour tables advance by 8 entries per source
+// byte.
+//   src             : source bytes
+//   dst             : output pixels, 8 per source byte
+//   bytes           : number of source bytes
+//   tbl             : expansion table (not checked for NULL)
+//   on_color_table  : colours for set pixels, 8 * bytes entries are read
+//   off_color_table : colours for clear pixels, 8 * bytes entries are read
+// Fix: dst was tested for 16-byte alignment only, yet the aligned path tells
+// the compiler that dst is aligned to sizeof(scrntype_vec8_t), which can be
+// larger. The test now uses the vector size, as the
+// _bit_trans_table_scrn_t variant of this function does.
+void DLL_PREFIX ConvertByteToPackedPixelByColorTable(uint8_t *src, scrntype_t* dst, int bytes, _bit_trans_table_t *tbl, scrntype_t *on_color_table, scrntype_t* off_color_table)
+{
+    uint16_vec8_t tmpd;
+    scrntype_vec8_t tmpdd;
+    uint16_vec8_t* vt = (uint16_vec8_t*)__builtin_assume_aligned(&(tbl->plane_table[0]), sizeof(uint16_vec8_t));
+
+    uintptr_t disalign = (uintptr_t)dst;
+    disalign = disalign & (sizeof(scrntype_vec8_t) - 1); // Is dst aligned to one whole output vector?
+    if(disalign == 0) {
+        // Yes.
+        scrntype_vec8_t *vdst = (scrntype_vec8_t*)__builtin_assume_aligned(dst, sizeof(scrntype_vec8_t));
+__DECL_VECTORIZED_LOOP
+        for(int i = 0; i < bytes; i++) {
+            tmpd.v = vt[src[i]].v;
+__DECL_VECTORIZED_LOOP
+            for(int j = 0; j < 8; j++) {
+                tmpdd.w[j] = (tmpd.w[j] == 0) ? off_color_table[j] : on_color_table[j];
+            }
+            vdst->v = tmpdd.v;
+            off_color_table += 8;
+            on_color_table += 8;
+            vdst++;
+        }
+    } else {
+        // Sorry, not aligned.
+__DECL_VECTORIZED_LOOP
+        for(int i = 0; i < bytes; i++) {
+            tmpd.v = vt[src[i]].v;
+__DECL_VECTORIZED_LOOP
+            for(int j = 0; j < 8; j++) {
+                dst[j] = (tmpd.w[j] == 0) ? off_color_table[j] : on_color_table[j];
+            }
+            off_color_table += 8;
+            on_color_table += 8;
+            dst += 8;
+        }
+    }
+}
+
+
+// Renders one line of 3-plane bitmap data (planes 0/1/2, called b/r/g in the
+// code) into packed pixels.
+// For each of src->render_width positions one byte is fetched per plane at
+// ((voffset[p] + n) & addrmask) relative to data[p] + baseaddress[p]; a plane
+// whose is_render[p] is false contributes 0. The three table entries are
+// OR-ed, shifted right by src->shift, and the 8 resulting values index the
+// palette. n starts at begin_pos and advances as (n + 1) & addrmask2.
+//   src       : render command; nothing is done when NULL
+//   dst       : output line, 8 pixels per position; nothing is done when NULL.
+//               Assumed to be aligned to sizeof(scrntype_vec8_t).
+//   dst2      : optional second output line (NULL = not written), same
+//               alignment assumption
+//   scan_line : when true, dst2 receives a darkened copy (shift right, then
+//               mask) instead of an identical copy
+// When src->palette is NULL a built-in 8-entry fallback palette is used.
+// bit_trans_table[] and data[] entries are not validated.
+// Changes: removed the unused locals `beginaddr` and `x`; the 16-bit
+// darkening shift is now also selected for _RGB565 (the condition only named
+// _RGBA565, which looks like a typo; that name is still accepted).
+void DLL_PREFIX Render8Colors_Line(_render_command_data_t *src, scrntype_t *dst, scrntype_t* dst2, bool scan_line)
+{
+    if(src == NULL) return;
+    if(dst == NULL) return;
+
+    scrntype_t dummy_palette[8]; // fallback
+    scrntype_t *palette = src->palette;
+
+    uint16_vec8_t *vpb = (uint16_vec8_t*)__builtin_assume_aligned(src->bit_trans_table[0], sizeof(uint16_vec8_t));
+    uint16_vec8_t *vpr = (uint16_vec8_t*)__builtin_assume_aligned(src->bit_trans_table[1], sizeof(uint16_vec8_t));
+    uint16_vec8_t *vpg = (uint16_vec8_t*)__builtin_assume_aligned(src->bit_trans_table[2], sizeof(uint16_vec8_t));
+
+    __DECL_ALIGNED(16) uint32_t offset[4] = {0};
+    uint32_t mask = src->addrmask;
+    uint32_t offsetmask = src->addrmask2;
+__DECL_VECTORIZED_LOOP
+    for(int i = 0; i < 3; i++) {
+        offset[i] = src->voffset[i];
+    }
+    if(palette == NULL) {
+__DECL_VECTORIZED_LOOP
+        for(int i = 0; i < 8; i++) {
+            dummy_palette[i] = RGB_COLOR(((i & 2) << 5) | 0x1f,
+                                         ((i & 4) << 5) | 0x1f,
+                                         ((i & 1) << 5) | 0x1f);
+        }
+        palette = dummy_palette;
+    }
+    uint8_t *bp = &(src->data[0][src->baseaddress[0]]);
+    uint8_t *rp = &(src->data[1][src->baseaddress[1]]);
+    uint8_t *gp = &(src->data[2][src->baseaddress[2]]);
+
+    uint8_t r, g, b;
+    int shift = src->shift;
+    const bool is_render[3] = { src->is_render[0], src->is_render[1], src->is_render[2] };
+    uint16_vec8_t tmpd;
+    scrntype_vec8_t tmp_dd;
+    scrntype_vec8_t* vdp = (scrntype_vec8_t*)__builtin_assume_aligned(dst, sizeof(scrntype_vec8_t));
+
+    uint32_t n = src->begin_pos;
+    if(dst2 == NULL) {
+    __DECL_VECTORIZED_LOOP
+        for(uint32_t xx = 0; xx < src->render_width; xx++) {
+            b = (is_render[0]) ? bp[(offset[0] + n) & mask] : 0;
+            r = (is_render[1]) ? rp[(offset[1] + n) & mask] : 0;
+            g = (is_render[2]) ? gp[(offset[2] + n) & mask] : 0;
+            tmpd.v = vpb[b].v;
+            tmpd.v = tmpd.v | vpr[r].v;
+            tmpd.v = tmpd.v | vpg[g].v;
+            tmpd.v = tmpd.v >> shift;
+            n = (n + 1) & offsetmask;
+    __DECL_VECTORIZED_LOOP
+            for(int i = 0; i < 8; i++) {
+                tmp_dd.w[i] = palette[tmpd.w[i]];
+            }
+            vdp[xx].v = tmp_dd.v;
+        }
+    } else {
+#if defined(_RGB555) || defined(_RGB565) || defined(_RGBA565)
+        static const int shift_factor = 2;
+#else // 24bit
+        static const int shift_factor = 3;
+#endif
+        scrntype_vec8_t sline; // mask applied after the darkening shift
+        scrntype_vec8_t* vdp2 = (scrntype_vec8_t*)__builtin_assume_aligned(dst2, sizeof(scrntype_vec8_t));
+    __DECL_VECTORIZED_LOOP
+        for(int i = 0; i < 8; i++) {
+            sline.w[i] = (scrntype_t)RGBA_COLOR(31, 31, 31, 255);
+        }
+    __DECL_VECTORIZED_LOOP
+        for(uint32_t xx = 0; xx < src->render_width; xx++) {
+            b = (is_render[0]) ? bp[(offset[0] + n) & mask] : 0;
+            r = (is_render[1]) ? rp[(offset[1] + n) & mask] : 0;
+            g = (is_render[2]) ? gp[(offset[2] + n) & mask] : 0;
+            tmpd.v = vpb[b].v;
+            tmpd.v = tmpd.v | vpr[r].v;
+            tmpd.v = tmpd.v | vpg[g].v;
+            tmpd.v = tmpd.v >> shift;
+            n = (n + 1) & offsetmask;
+    __DECL_VECTORIZED_LOOP
+            for(int i = 0; i < 8; i++) {
+                tmp_dd.w[i] = palette[tmpd.w[i]];
+            }
+            vdp[xx].v = tmp_dd.v;
+            if(scan_line) {
+                tmp_dd.v = tmp_dd.v >> shift_factor;
+                tmp_dd.v = tmp_dd.v & sline.v;
+            }
+            vdp2[xx].v = tmp_dd.v;
+        }
+    }
+}
+
+// Renders one line of 4-plane bitmap data (planes 0/1/2/3, called b/r/g/n in
+// the code) into packed pixels.
+// For each of src->render_width positions one byte is fetched per plane at
+// ((voffset[p] + xn) & addrmask) relative to data[p] + baseaddress[p]; a plane
+// whose is_render[p] is false contributes 0. The four table entries are
+// OR-ed, shifted right by src->shift, and the 8 resulting values index the
+// palette. xn starts at begin_pos and advances as (xn + 1) & addrmask2.
+//   src       : render command; nothing is done when NULL
+//   dst       : output line, 8 pixels per position; nothing is done when NULL.
+//               Assumed to be aligned to sizeof(scrntype_vec8_t).
+//   dst2      : optional second output line (NULL = not written), same
+//               alignment assumption
+//   scan_line : when true, dst2 receives a darkened copy (shift right, then
+//               mask) instead of an identical copy
+// When src->palette is NULL a built-in 16-entry fallback palette is used.
+// bit_trans_table[] and data[] entries are not validated.
+// Changes: removed the unused locals `beginaddr` and `x`; the 16-bit
+// darkening shift is now also selected for _RGB565 (the condition only named
+// _RGBA565, which looks like a typo; that name is still accepted).
+void DLL_PREFIX Render16Colors_Line(_render_command_data_t *src, scrntype_t *dst, scrntype_t* dst2, bool scan_line)
+{
+    if(src == NULL) return;
+    if(dst == NULL) return;
+
+    scrntype_t dummy_palette[16]; // fallback
+    scrntype_t *palette = src->palette;
+
+    uint16_vec8_t *vpb = (uint16_vec8_t*)__builtin_assume_aligned(src->bit_trans_table[0], sizeof(uint16_vec8_t));
+    uint16_vec8_t *vpr = (uint16_vec8_t*)__builtin_assume_aligned(src->bit_trans_table[1], sizeof(uint16_vec8_t));
+    uint16_vec8_t *vpg = (uint16_vec8_t*)__builtin_assume_aligned(src->bit_trans_table[2], sizeof(uint16_vec8_t));
+    uint16_vec8_t *vpn = (uint16_vec8_t*)__builtin_assume_aligned(src->bit_trans_table[3], sizeof(uint16_vec8_t));
+
+    __DECL_ALIGNED(16) uint32_t offset[4];
+    uint32_t mask = src->addrmask;
+    uint32_t offsetmask = src->addrmask2;
+
+__DECL_VECTORIZED_LOOP
+    for(int i = 0; i < 4; i++) {
+        offset[i] = src->voffset[i];
+    }
+    if(palette == NULL) {
+__DECL_VECTORIZED_LOOP
+        for(int i = 0; i < 16; i++) {
+            dummy_palette[i] = RGB_COLOR((((i & 2) + (i & 8)) << 4) | 0x0f,
+                                         (((i & 4) + (i & 8)) << 4) | 0x0f,
+                                         (((i & 1) + (i & 8)) << 4) | 0x0f);
+        }
+        palette = dummy_palette;
+    }
+    uint8_t *bp = &(src->data[0][src->baseaddress[0]]);
+    uint8_t *rp = &(src->data[1][src->baseaddress[1]]);
+    uint8_t *gp = &(src->data[2][src->baseaddress[2]]);
+    uint8_t *np = &(src->data[3][src->baseaddress[3]]);
+
+    uint8_t r, g, b, n;
+    int shift = src->shift;
+    const bool is_render[4] = { src->is_render[0], src->is_render[1], src->is_render[2], src->is_render[3] };
+    uint16_vec8_t tmpd;
+    scrntype_vec8_t tmp_dd;
+    scrntype_vec8_t* vdp = (scrntype_vec8_t*)__builtin_assume_aligned(dst, sizeof(scrntype_vec8_t));
+
+    uint32_t xn = src->begin_pos;
+    if(dst2 == NULL) {
+    __DECL_VECTORIZED_LOOP
+        for(uint32_t xx = 0; xx < src->render_width; xx++) {
+            b = (is_render[0]) ? bp[(offset[0] + xn) & mask] : 0;
+            r = (is_render[1]) ? rp[(offset[1] + xn) & mask] : 0;
+            g = (is_render[2]) ? gp[(offset[2] + xn) & mask] : 0;
+            n = (is_render[3]) ? np[(offset[3] + xn) & mask] : 0;
+            tmpd.v = vpb[b].v;
+            tmpd.v = tmpd.v | vpr[r].v;
+            tmpd.v = tmpd.v | vpg[g].v;
+            tmpd.v = tmpd.v | vpn[n].v;
+            tmpd.v = tmpd.v >> shift;
+            xn = (xn + 1) & offsetmask;
+    __DECL_VECTORIZED_LOOP
+            for(int i = 0; i < 8; i++) {
+                tmp_dd.w[i] = palette[tmpd.w[i]];
+            }
+            vdp[xx].v = tmp_dd.v;
+        }
+    } else {
+#if defined(_RGB555) || defined(_RGB565) || defined(_RGBA565)
+        static const int shift_factor = 2;
+#else // 24bit
+        static const int shift_factor = 3;
+#endif
+        scrntype_vec8_t sline; // mask applied after the darkening shift
+        scrntype_vec8_t* vdp2 = (scrntype_vec8_t*)__builtin_assume_aligned(dst2, sizeof(scrntype_vec8_t));
+    __DECL_VECTORIZED_LOOP
+        for(int i = 0; i < 8; i++) {
+            sline.w[i] = (scrntype_t)RGBA_COLOR(31, 31, 31, 255);
+        }
+    __DECL_VECTORIZED_LOOP
+        for(uint32_t xx = 0; xx < src->render_width; xx++) {
+            b = (is_render[0]) ? bp[(offset[0] + xn) & mask] : 0;
+            r = (is_render[1]) ? rp[(offset[1] + xn) & mask] : 0;
+            g = (is_render[2]) ? gp[(offset[2] + xn) & mask] : 0;
+            n = (is_render[3]) ? np[(offset[3] + xn) & mask] : 0;
+            tmpd.v = vpb[b].v;
+            tmpd.v = tmpd.v | vpr[r].v;
+            tmpd.v = tmpd.v | vpg[g].v;
+            tmpd.v = tmpd.v | vpn[n].v;
+            tmpd.v = tmpd.v >> shift;
+            xn = (xn + 1) & offsetmask;
+    __DECL_VECTORIZED_LOOP
+            for(int i = 0; i < 8; i++) {
+                tmp_dd.w[i] = palette[tmpd.w[i]];
+            }
+            vdp[xx].v = tmp_dd.v;
+            if(scan_line) {
+                tmp_dd.v = tmp_dd.v >> shift_factor;
+                tmp_dd.v = tmp_dd.v & sline.v;
+            }
+            vdp2[xx].v = tmp_dd.v;
+        }
+    }
+}
+
+// Renders one line of `planes`-plane bitmap data into packed pixels.
+// src->palette must have 2^planes entries.
+// For each of src->render_width positions one byte is fetched per plane at
+// ((voffset[p] + n) & addrmask) relative to data[p] + baseaddress[p]; a plane
+// whose is_render[p] is false contributes 0. The table entries of all planes
+// are OR-ed, shifted right by src->shift, and the 8 resulting values index
+// the palette. n starts at begin_pos and advances as (n + 1) & addrmask2.
+//   src       : render command; nothing is done when NULL or palette is NULL
+//   dst       : output line, 8 pixels per position; nothing is done when NULL.
+//               Assumed to be aligned to sizeof(scrntype_vec8_t).
+//   dst2      : optional second output line (NULL = not written)
+//   scan_line : when true, dst2 receives a darkened copy instead of a copy
+//   planes    : number of planes; <= 0 does nothing, values above 16 are
+//               treated as 16
+// Fixes: the local is_render copy had only 4 entries but was indexed with
+// every plane number (up to 15), reading past the array; it now holds one
+// entry per plane. Removed the unused locals `beginaddr`, `x` and `d[]`. The
+// 16-bit darkening shift is now also selected for _RGB565 (the condition only
+// named _RGBA565, which looks like a typo; that name is still accepted).
+// NOTE(review): assumes src->is_render[] has at least `planes` entries, like
+// the other per-plane arrays indexed here -- confirm against the struct.
+void DLL_PREFIX Render2NColors_Line(_render_command_data_t *src, scrntype_t *dst, scrntype_t* dst2, bool scan_line, int planes)
+{
+    if(src == NULL) return;
+    if(dst == NULL) return;
+    if(src->palette == NULL) return;
+    if(planes <= 0) return;
+    if(planes >= 16) planes = 16;
+
+    scrntype_t *palette = src->palette;
+
+    uint16_vec8_t* vp[16];
+    for(int i = 0; i < planes; i++) {
+        vp[i] = (uint16_vec8_t*)__builtin_assume_aligned(src->bit_trans_table[i], sizeof(uint16_vec8_t));
+    }
+
+    __DECL_ALIGNED(16) uint32_t offset[16];
+    uint32_t mask = src->addrmask;
+    uint32_t offsetmask = src->addrmask2;
+__DECL_VECTORIZED_LOOP
+    for(int i = 0; i < planes; i++) {
+        offset[i] = src->voffset[i];
+    }
+    uint8_t *pp[16];
+    bool is_render[16] = { false };
+    for(int i = 0; i < planes; i++) {
+        pp[i] = &(src->data[i][src->baseaddress[i]]);
+        is_render[i] = src->is_render[i];
+    }
+
+    int shift = src->shift;
+    uint16_vec8_t tmpd;
+    scrntype_vec8_t tmp_dd;
+    scrntype_vec8_t* vdp = (scrntype_vec8_t*)__builtin_assume_aligned(dst, sizeof(scrntype_vec8_t));
+
+    uint32_t n = src->begin_pos;
+    if(dst2 == NULL) {
+    __DECL_VECTORIZED_LOOP
+        for(uint32_t xx = 0; xx < src->render_width; xx++) {
+            const uint8_t d0 = (is_render[0]) ? pp[0][(offset[0] + n) & mask] : 0;
+            tmpd.v = vp[0][d0].v;
+    __DECL_VECTORIZED_LOOP
+            for(int i = 1; i < planes; i++) {
+                const uint8_t di = (is_render[i]) ? pp[i][(offset[i] + n) & mask] : 0;
+                tmpd.v = tmpd.v | vp[i][di].v;
+            }
+            n = (n + 1) & offsetmask;
+            tmpd.v = tmpd.v >> shift;
+    __DECL_VECTORIZED_LOOP
+            for(int i = 0; i < 8; i++) {
+                tmp_dd.w[i] = palette[tmpd.w[i]];
+            }
+            vdp[xx].v = tmp_dd.v;
+        }
+    } else {
+#if defined(_RGB555) || defined(_RGB565) || defined(_RGBA565)
+        static const int shift_factor = 2;
+#else // 24bit
+        static const int shift_factor = 3;
+#endif
+        scrntype_vec8_t sline; // mask applied after the darkening shift
+        scrntype_vec8_t* vdp2 = (scrntype_vec8_t*)__builtin_assume_aligned(dst2, sizeof(scrntype_vec8_t));
+    __DECL_VECTORIZED_LOOP
+        for(int i = 0; i < 8; i++) {
+            sline.w[i] = (scrntype_t)RGBA_COLOR(31, 31, 31, 255);
+        }
+    __DECL_VECTORIZED_LOOP
+        for(uint32_t xx = 0; xx < src->render_width; xx++) {
+            const uint8_t d0 = (is_render[0]) ? pp[0][(offset[0] + n) & mask] : 0;
+            tmpd.v = vp[0][d0].v;
+    __DECL_VECTORIZED_LOOP
+            for(int i = 1; i < planes; i++) {
+                const uint8_t di = (is_render[i]) ? pp[i][(offset[i] + n) & mask] : 0;
+                tmpd.v = tmpd.v | vp[i][di].v;
+            }
+            n = (n + 1) & offsetmask;
+            tmpd.v = tmpd.v >> shift;
+    __DECL_VECTORIZED_LOOP
+            for(int i = 0; i < 8; i++) {
+                tmp_dd.w[i] = palette[tmpd.w[i]];
+            }
+            vdp[xx].v = tmp_dd.v;
+            if(scan_line) {
+                tmp_dd.v = tmp_dd.v >> shift_factor;
+                tmp_dd.v = tmp_dd.v & sline.v;
+            }
+            vdp2[xx].v = tmp_dd.v;
+        }
+    }
+}
+
+// Converts one line of `planes`-plane bitmap data into one byte per pixel.
+// For each of src->render_width positions one byte is fetched per plane at
+// ((noffset + voffset[p]) & addrmask) relative to data[p] + baseaddress[p];
+// the table entries of all planes are OR-ed, shifted right by src->shift, and
+// the low 8 bits of the 8 results are stored to dst (8 bytes per position).
+// noffset starts at begin_pos & addrmask2 and advances as
+// (noffset + 1) & addrmask2.
+//   src    : render command; nothing is done when NULL
+//   dst    : output, 8 * render_width bytes; nothing is done when NULL
+//   planes : number of planes; <= 0 does nothing, values above 8 are treated
+//            as 8
+// Unlike the Render*_Line functions, is_render[] is not consulted here.
+// Change: added the src/dst NULL guards that the Render*_Line functions
+// already have.
+void DLL_PREFIX Convert2NColorsToByte_Line(_render_command_data_t *src, uint8_t *dst, int planes)
+{
+    if(src == NULL) return;
+    if(dst == NULL) return;
+    if(planes >= 8) planes = 8;
+    if(planes <= 0) return;
+
+    uint8_t* srcp[8];
+    __DECL_ALIGNED(32) uint32_t offset[8] = {0};
+    uint16_vec8_t dat;
+    uint16_vec8_t* bp[8] ;
+
+__DECL_VECTORIZED_LOOP
+    for(int i = 0; i < planes; i++) {
+        bp[i] = (uint16_vec8_t*)__builtin_assume_aligned(&(src->bit_trans_table[i]->plane_table[0]), sizeof(uint16_vec8_t));
+        srcp[i] = &(src->data[i][src->baseaddress[i]]);
+    }
+    uint32_t addrmask = src->addrmask;
+    uint32_t offsetmask = src->addrmask2;
+    int shift = src->shift;
+
+__DECL_VECTORIZED_LOOP
+    for(int i = 0; i < planes; i++) {
+        offset[i] = src->voffset[i];
+    }
+
+    uint32_t noffset = src->begin_pos & offsetmask;
+    uint8_t td[16]; // one fetched byte per plane
+__DECL_VECTORIZED_LOOP
+    for(int x = 0; x < src->render_width; x++) {
+__DECL_VECTORIZED_LOOP
+        for(int i = 0; i < planes; i++) {
+            td[i] = srcp[i][(noffset + offset[i]) & addrmask];
+        }
+        noffset = (noffset + 1) & offsetmask;
+        dat.v = bp[0][td[0]].v;
+__DECL_VECTORIZED_LOOP
+        for(int i = 1; i < planes; i++) {
+            dat.v = dat.v | bp[i][td[i]].v;
+        }
+        dat.v = dat.v >> shift;
+__DECL_VECTORIZED_LOOP
+        for(int i = 0; i < 8; i++) {
+            dst[i] = (uint8_t)(dat.w[i]);
+        }
+        dst += 8;
+    }
+}
+
+// Same conversion as Convert2NColorsToByte_Line(), but every pixel is written
+// twice in a row, so each source position yields 16 output bytes.
+// For each of src->render_width positions one byte is fetched per plane at
+// ((noffset + voffset[p]) & addrmask) relative to data[p] + baseaddress[p];
+// the table entries of all planes are OR-ed and shifted right by src->shift.
+//   src    : render command; nothing is done when NULL
+//   dst    : output, 16 * render_width bytes; nothing is done when NULL
+//   planes : number of planes; <= 0 does nothing, values above 8 are treated
+//            as 8
+// Change: added the src/dst NULL guards that the Render*_Line functions
+// already have.
+void DLL_PREFIX Convert2NColorsToByte_LineZoom2(_render_command_data_t *src, uint8_t *dst, int planes)
+{
+    if(src == NULL) return;
+    if(dst == NULL) return;
+    if(planes >= 8) planes = 8;
+    if(planes <= 0) return;
+
+    uint8_t* srcp[8];
+    __DECL_ALIGNED(32) uint32_t offset[8] = {0};
+    uint16_vec8_t dat;
+    uint16_vec8_t* bp[8] ;
+
+__DECL_VECTORIZED_LOOP
+    for(int i = 0; i < planes; i++) {
+        bp[i] = (uint16_vec8_t*)__builtin_assume_aligned(&(src->bit_trans_table[i]->plane_table[0]), sizeof(uint16_vec8_t));
+        srcp[i] = &(src->data[i][src->baseaddress[i]]);
+    }
+    uint32_t addrmask = src->addrmask;
+    uint32_t offsetmask = src->addrmask2;
+    int shift = src->shift;
+
+__DECL_VECTORIZED_LOOP
+    for(int i = 0; i < planes; i++) {
+        offset[i] = src->voffset[i];
+    }
+
+    uint32_t noffset = src->begin_pos & offsetmask;
+    uint8_t td[16]; // one fetched byte per plane
+__DECL_VECTORIZED_LOOP
+    for(int x = 0; x < src->render_width; x++) {
+__DECL_VECTORIZED_LOOP
+        for(int i = 0; i < planes; i++) {
+            td[i] = srcp[i][(noffset + offset[i]) & addrmask];
+        }
+        noffset = (noffset + 1) & offsetmask;
+        dat.v = bp[0][td[0]].v;
+__DECL_VECTORIZED_LOOP
+        for(int i = 1; i < planes; i++) {
+            dat.v = dat.v | bp[i][td[i]].v;
+        }
+        dat.v = dat.v >> shift;
+__DECL_VECTORIZED_LOOP
+        for(int i = 0, j = 0; i < 16; i +=2, j++) {
+            // Horizontal doubling: pixel j goes to output slots 2j and 2j + 1.
+            dst[i] = (uint8_t)(dat.w[j]);
+            dst[i + 1] = (uint8_t)(dat.w[j]);
+        }
+        dst += 16;
+    }
+}
+
+// Converts one line of 3-plane bitmap data (planes 0/1/2, called b/r/g in the
+// code) into one byte per pixel.
+// For each of src->render_width positions one byte is fetched per plane at
+// ((noffset + voffset[p]) & addrmask) relative to data[p] + baseaddress[p];
+// the three table entries are OR-ed, shifted right by src->shift, and the low
+// 8 bits of the 8 results are stored to dst (8 bytes per position).
+//   src : render command; nothing is done when NULL
+//   dst : output, 8 * render_width bytes; nothing is done when NULL
+// Unlike the Render*_Line functions, is_render[] is not consulted here.
+// Change: added the src/dst NULL guards that the Render*_Line functions
+// already have.
+void DLL_PREFIX Convert8ColorsToByte_Line(_render_command_data_t *src, uint8_t *dst)
+{
+    if(src == NULL) return;
+    if(dst == NULL) return;
+
+    uint8_t *bp = &(src->data[0][src->baseaddress[0]]);
+    uint8_t *rp = &(src->data[1][src->baseaddress[1]]);
+    uint8_t *gp = &(src->data[2][src->baseaddress[2]]);
+    __DECL_ALIGNED(16) uint32_t offset[4] = {0};
+
+    uint16_vec8_t rdat;
+    uint16_vec8_t gdat;
+    uint16_vec8_t bdat;
+    uint16_vec8_t tmpd;
+
+    uint16_vec8_t* bpb = (uint16_vec8_t*)__builtin_assume_aligned(&(src->bit_trans_table[0]->plane_table[0]), sizeof(uint16_vec8_t));
+    uint16_vec8_t* bpr = (uint16_vec8_t*)__builtin_assume_aligned(&(src->bit_trans_table[1]->plane_table[0]), sizeof(uint16_vec8_t));
+    uint16_vec8_t* bpg = (uint16_vec8_t*)__builtin_assume_aligned(&(src->bit_trans_table[2]->plane_table[0]), sizeof(uint16_vec8_t));
+
+    uint32_t addrmask = src->addrmask;
+    uint32_t offsetmask = src->addrmask2;
+    int shift = src->shift;
+
+__DECL_VECTORIZED_LOOP
+    for(int i = 0; i < 3; i++) {
+        offset[i] = src->voffset[i];
+    }
+
+    uint32_t noffset = src->begin_pos & offsetmask;
+    uint8_t b, r, g;
+__DECL_VECTORIZED_LOOP
+    for(int x = 0; x < src->render_width; x++) {
+        b = bp[(noffset + offset[0]) & addrmask];
+        r = rp[(noffset + offset[1]) & addrmask];
+        g = gp[(noffset + offset[2]) & addrmask];
+
+        noffset = (noffset + 1) & offsetmask;
+
+        bdat.v = bpb[b].v;
+        rdat.v = bpr[r].v;
+        gdat.v = bpg[g].v;
+        tmpd.v = bdat.v;
+        tmpd.v = tmpd.v | rdat.v;
+        tmpd.v = tmpd.v | gdat.v;
+        tmpd.v = tmpd.v >> shift;
+
+__DECL_VECTORIZED_LOOP
+        for(int i = 0; i < 8; i++) {
+            dst[i] = (uint8_t)(tmpd.w[i]);
+        }
+        dst += 8;
+    }
+}
+
+
+#ifndef _MSC_VER
+// Function object that upper-cases one char via std::toupper().
+// Refer from documentation of libstdc++, GCC5.
+// Fix: std::toupper() is only defined for values representable as unsigned
+// char (or EOF), so the argument is converted first; passing a negative char
+// directly was undefined behaviour.
+struct to_upper {
+    char operator() (char c) const
+    {
+        return static_cast<char>(std::toupper(static_cast<unsigned char>(c)));
+    }
+};
+#endif
+
+#if defined(_USE_QT)
+static void _my_mkdir(std::string t_dir)
+{
+ struct stat st;
+//#if !defined(__WIN32) && !defined(__WIN64)
+// if(fstatat(AT_FDCWD, csppath.c_str(), &st, 0) != 0) {
+// mkdirat(AT_FDCWD, t_dir.c_str(), 0700); // Not found
+// }
+#if defined(_USE_QT)
+ if(stat(t_dir.c_str(), &st) != 0) {
+ QDir dir = QDir::current();
+ dir.mkdir(QString::fromStdString(t_dir));
+ //dir.mkpath(QString::fromUtf8(app_path));
+ }
+#else
+ if(stat(csppath.c_str(), &st) != 0) {
+ _mkdir(t_dir.c_str()); // Not found
+ }
+#endif
+}
+#endif
+
+// Returns the application directory path; it is computed on the first call
+// and kept in a function-local static buffer afterwards.
+// - Win32 without Qt: directory part of the module file name, or ".\\" when
+//   it cannot be resolved.
+// - otherwise: cpp_homedir + "CommonSourceCodeProject" + delimiter +
+//   my_procname + delimiter; both directory levels are created through
+//   _my_mkdir().
+const _TCHAR *DLL_PREFIX get_application_path()
+{
+    static _TCHAR app_path[_MAX_PATH];
+    static bool initialized = false;
+
+    if(initialized) {
+        return (const _TCHAR *)app_path;
+    }
+#if defined(_WIN32) && !defined(_USE_QT)
+    _TCHAR tmp_path[_MAX_PATH], *ptr = NULL;
+    // Evaluation order matters: ptr is only meaningful after GetFullPathName().
+    const bool resolved = (GetModuleFileName(NULL, tmp_path, _MAX_PATH) != 0)
+                       && (GetFullPathName(tmp_path, _MAX_PATH, app_path, &ptr) != 0)
+                       && (ptr != NULL);
+    if(resolved) {
+        *ptr = _T('\0'); // cut the file name, keep the directory
+    } else {
+        my_tcscpy_s(app_path, _MAX_PATH, _T(".\\"));
+    }
+#else
+#if defined(Q_OS_WIN)
+    const std::string delim = "\\";
+#else
+    const std::string delim = "/";
+#endif
+    const std::string csppath = cpp_homedir + "CommonSourceCodeProject" + delim ;
+    _my_mkdir(csppath);
+
+    const std::string cpath = csppath + my_procname + delim;
+    _my_mkdir(cpath);
+    strncpy(app_path, cpath.c_str(), _MAX_PATH - 1);
+#endif
+    initialized = true;
+    return (const _TCHAR *)app_path;
+}
+
+const _TCHAR *DLL_PREFIX get_initial_current_path()
+{
+ static _TCHAR current_path[_MAX_PATH];
+ static bool initialized = false;
+
+ if(!initialized) {
+#if defined(_WIN32) && !defined(_USE_QT)
+ GetCurrentDirectoryA(_MAX_PATH, current_path);