OSDN Git Service

[Agar][General] Move important file(s) from XM7/SDL, these are Ohta's original writte...
author K.Ohta <whatisthis.sowhat@gmail.com>
Tue, 30 Dec 2014 04:20:46 +0000 (13:20 +0900)
committer K.Ohta <whatisthis.sowhat@gmail.com>
Tue, 30 Dec 2014 04:20:46 +0000 (13:20 +0900)
56 files changed:
source/src/agar/common/agar_glcl.cpp [new file with mode: 0644]
source/src/agar/common/agar_glcl.h [new file with mode: 0644]
source/src/agar/common/agar_gldraw.h [new file with mode: 0644]
source/src/agar/common/agar_gldraw2.cpp [new file with mode: 0644]
source/src/agar/common/agar_glutil.cpp [new file with mode: 0644]
source/src/agar/common/agar_glutil.h [new file with mode: 0644]
source/src/agar/common/agar_logger.cpp [new file with mode: 0644]
source/src/agar/common/agar_logger.h [new file with mode: 0644]
source/src/agar/common/agar_sdlscaler.cpp [new file with mode: 0644]
source/src/agar/common/agar_sdlview.c [new file with mode: 0644]
source/src/agar/common/agar_sdlview.h [new file with mode: 0644]
source/src/agar/common/scaler/generic/CMakeLists.txt [new file with mode: 0644]
source/src/agar/common/scaler/generic/scaler_x05.c [new file with mode: 0644]
source/src/agar/common/scaler/generic/scaler_x1.c [new file with mode: 0644]
source/src/agar/common/scaler/generic/scaler_x125.c [new file with mode: 0644]
source/src/agar/common/scaler/generic/scaler_x15.c [new file with mode: 0644]
source/src/agar/common/scaler/generic/scaler_x2.c [new file with mode: 0644]
source/src/agar/common/scaler/generic/scaler_x225.c [new file with mode: 0644]
source/src/agar/common/scaler/generic/scaler_x25.c [new file with mode: 0644]
source/src/agar/common/scaler/generic/scaler_x3.c [new file with mode: 0644]
source/src/agar/common/scaler/generic/scaler_x4.c [new file with mode: 0644]
source/src/agar/common/scaler/generic/scaler_x45.c [new file with mode: 0644]
source/src/agar/common/scaler/generic/scaler_x5.c [new file with mode: 0644]
source/src/agar/common/scaler/generic/scaler_x6.c [new file with mode: 0644]
source/src/agar/common/scaler/sse2/CMakeLists.txt [new file with mode: 0644]
source/src/agar/common/scaler/sse2/scaler_x125_sse2.c [new file with mode: 0644]
source/src/agar/common/scaler/sse2/scaler_x15_sse2.c [new file with mode: 0644]
source/src/agar/common/scaler/sse2/scaler_x1_sse2.c [new file with mode: 0644]
source/src/agar/common/scaler/sse2/scaler_x225_sse2.c [new file with mode: 0644]
source/src/agar/common/scaler/sse2/scaler_x25_sse2.c [new file with mode: 0644]
source/src/agar/common/scaler/sse2/scaler_x2_sse2.c [new file with mode: 0644]
source/src/agar/common/scaler/sse2/scaler_x3_sse2.c [new file with mode: 0644]
source/src/agar/common/scaler/sse2/scaler_x45_sse2.c [new file with mode: 0644]
source/src/agar/common/scaler/sse2/scaler_x4_sse2.c [new file with mode: 0644]
source/src/agar/common/scaler/sse2/scaler_x5_sse2.c [new file with mode: 0644]
source/src/agar/common/scaler/sse2/scaler_x6_sse2.c [new file with mode: 0644]
source/src/agar/common/sdl_sound.cpp
source/src/agar/fm7/CMakeLists.txt [new file with mode: 0644]
source/src/agar/fm7/vram/generic/CMakeLists.txt [new file with mode: 0644]
source/src/agar/fm7/vram/generic/api_vram256k.c [new file with mode: 0644]
source/src/agar/fm7/vram/generic/api_vram4096.c [new file with mode: 0644]
source/src/agar/fm7/vram/generic/api_vram8.c [new file with mode: 0644]
source/src/agar/fm7/vram/generic/api_vramvec.c [new file with mode: 0644]
source/src/agar/fm7/vram/sse2/CMakeLists.txt [new file with mode: 0644]
source/src/agar/fm7/vram/sse2/api_vram256k.c [new file with mode: 0644]
source/src/agar/fm7/vram/sse2/api_vram4096.c [new file with mode: 0644]
source/src/agar/fm7/vram/sse2/api_vram8.c [new file with mode: 0644]
source/src/agar/fm7/vram/sse2/api_vramvec.c [new file with mode: 0644]
source/src/agar/tools/cl2cpp.awk [new file with mode: 0644]
source/src/common.h
source/src/config.cpp
source/src/emu.h
source/src/vm/CMakeLists.txt [new file with mode: 0644]
source/src/vm/fmgen/CMakeLists.txt [new file with mode: 0644]
source/src/vm/pc8801/CMakeLists.txt [new file with mode: 0644]
source/src/vm/x1/CMakeLists.txt [new file with mode: 0644]

diff --git a/source/src/agar/common/agar_glcl.cpp b/source/src/agar/common/agar_glcl.cpp
new file mode 100644 (file)
index 0000000..fd4af21
--- /dev/null
@@ -0,0 +1,921 @@
+/*
+ * Renderer using OPENCL/GL
+ * (C) 2012 K.Ohta <whatisthis.sowhat@gmail.com>
+ * History:
+ * Nov 01,2012: Initial
+ */
+
+
+#include <agar/core.h>
+#include <agar/core/types.h>
+#include <agar/gui.h>
+
+
+#include "api_draw.h"
+#include "api_kbd.h"
+
+#include "agar_xm7.h"
+#include "agar_draw.h"
+#include "agar_gldraw.h"
+#include "agar_glutil.h"
+#include "agar_logger.h"
+#include "xm7.h"
+#include "display.h"
+#include "subctrl.h"
+#include "device.h"
+#include "multipag.h"
+#include "ttlpalet.h"
+#include "apalet.h"
+
+#include "agar_glcl.h"
+
+#define LOGSIZE (1024*1024) /* Size in bytes of the build-log buffer; parenthesised so it expands safely inside larger expressions. */
+
+extern "C"{
+extern Uint8 *vram_pb;
+extern Uint8 *vram_pr;
+extern Uint8 *vram_pg;
+}
+
+extern PFNGLBINDBUFFERPROC glBindBuffer;
+extern float fBrightR;
+extern float fBrightG;
+extern float fBrightB;
+
+
+
+/*
+ * Construct an idle renderer: create the two Agar mutexes and clear the
+ * bookkeeping that InitContext()/BuildFromSource() fill in later.
+ */
+GLCLDraw::GLCLDraw()
+{
+   int dev;
+
+   AG_MutexInit(&mutex_buffer);
+   AG_MutexInit(&mutex_palette);
+   pixelBuffer = NULL;
+   TransferBuffer = NULL;
+   nkernels = 0;
+   using_device = 0;
+   // The per-device tables hold eight entries each.
+   for(dev = 0; dev < 8; dev++) {
+      device_type[dev] = 0;
+      local_memsize[dev] = 0;
+   }
+}
+
+GLCLDraw::~GLCLDraw()
+{
+   cl_int ret;
+   int i;
+   if(nkernels > 0) {
+     for(i = 0; i < nkernels; i++) if(kernels_array[i] != NULL) ret = clReleaseKernel(kernels_array[i]);
+   }
+   
+   if(program != NULL) ret |= clReleaseProgram(program);
+   if(command_queue != NULL) ret |= clReleaseCommandQueue(command_queue);
+   if(context != NULL) ret |= clReleaseContext(context);
+   if(properties != NULL) free(properties);
+   for(i = 0; i < 2; i++) {
+      if(inbuf[i] != NULL) ret |= clReleaseMemObject(inbuf[i]);
+      if(palette_buf[i] != NULL) ret |= clReleaseMemObject(palette_buf[i]);
+   }
+   if(outbuf != NULL) ret |= clReleaseMemObject(outbuf);
+   if(table != NULL) ret |= clReleaseMemObject(table);
+   if(pixelBuffer != NULL) free(pixelBuffer);
+   AG_MutexDestroy(&mutex_buffer);
+   AG_MutexDestroy(&mutex_palette);
+}
+
+/*
+ * Error-notification callback handed to clCreateContext(): forwards the
+ * runtime's message to the XM7 debug log at WARN level.  The binary
+ * payload (private_info/cb) and user_data are not used.
+ */
+static void cl_notify_log(const char *errinfo, const void *private_info, size_t cb, void *user_data)
+{
+   (void)private_info;
+   (void)cb;
+   (void)user_data;
+   // Passing NULL to "%s" is undefined, so substitute a placeholder.
+   XM7_DebugLog(XM7_LOG_WARN, "CL Notify: %s", (errinfo != NULL) ? errinfo : "(null)");
+}
+
+/* Report the CL/GL sharing flag: -1 when it is set, 0 when it is FALSE. */
+int GLCLDraw::GetGLEnabled(void)
+{
+  return (bCLEnableKhrGLShare == FALSE) ? 0 : -1;
+}
+
+Uint32 *GLCLDraw::GetPixelBuffer(void)
+{
+    Uint32 *p;
+    int ret = 0;
+    p = (Uint32 *) clEnqueueMapBuffer(command_queue, outbuf, CL_TRUE, CL_MAP_READ,
+                                      0, (size_t)(640 * 400 * sizeof(Uint32)),
+                                      1, &event_exec, &event_release, &ret);
+    if(ret < 0) return NULL;
+    clFlush(command_queue);
+    return p;
+}
+
+/*
+ * Unmap a pointer previously returned by GetPixelBuffer() and wait for the
+ * command queue to drain.  The unmap waits on event_release.
+ * p: mapped pointer; NULL is accepted and treated as a no-op.
+ * Returns 0 for a NULL pointer, otherwise the status of the unmap call.
+ */
+int GLCLDraw::ReleasePixelBuffer(Uint32 *p)
+{
+#if 0
+   return 0;
+#else
+  int ret;
+  if(p == NULL) return 0;
+//  clFlush(command_queue);
+  // Plain assignment: the former "|=" combined the status with an
+  // uninitialised value, so the result was unpredictable.
+  ret = clEnqueueUnmapMemObject(command_queue, outbuf,
+                                p, 1, &event_release, NULL);
+  clFinish(command_queue);
+  return ret;
+#endif
+}
+
+/* Index of the device selected by InitContext(). */
+int GLCLDraw::GetUsingDeviceNo(void)
+{
+  return this->using_device;
+}
+
+/* Number of devices recorded by InitContext() (it caps the count at 8). */
+int GLCLDraw::GetDevices(void)
+{
+  return this->ret_num_devices;
+}
+
+/* Number of platforms reported to InitContext() by clGetPlatformIDs(). */
+int GLCLDraw::GetPlatforms(void)
+{
+  return this->ret_num_platforms;
+}
+
+void GLCLDraw::GetDeviceType(char *str, int maxlen, int num)
+{
+  if((str == NULL) || (maxlen < 1)) return;
+  str[0] = '\0';
+  if((num < 0) || (num >= 8) || (num >= ret_num_devices)) return;
+
+  switch(device_type[num]) {
+  case CL_DEVICE_TYPE_CPU:
+       strncpy(str, "CPU", maxlen - 1);
+       break;
+  case CL_DEVICE_TYPE_GPU:
+       strncpy(str, "GPU", maxlen - 1);
+       break;
+  case CL_DEVICE_TYPE_ACCELERATOR:
+       strncpy(str, "ACCELERATOR", maxlen - 1);
+       break;
+  case CL_DEVICE_TYPE_DEFAULT:
+       strncpy(str, "DEFAULT", maxlen - 1);
+       break;
+  default:
+       strncpy(str, "Unknown", maxlen - 1);
+       break;
+     }
+}
+
+void GLCLDraw::GetDeviceName(char *str, int maxlen, int num)
+{
+  size_t llen;
+
+  if((str == NULL) || (maxlen < 1)) return;
+  str[0] = '\0';
+  if((num < 0) || (num > 8) || (num >= ret_num_devices)) return;
+  clGetDeviceInfo(device_id[num], CL_DEVICE_NAME,
+                    maxlen - 1, str, &llen);
+  str[llen - 1] = '\0';
+}
+
+cl_int GLCLDraw::InitContext(int platformnum, int processornum, int GLinterop)
+{
+   cl_int ret;
+   size_t len;
+   char extension_data[1024];
+   size_t llen;
+   size_t extension_len;
+   int i;
+   
+   properties = malloc(16 * sizeof(intptr_t));
+   ret = clGetPlatformIDs(8, platform_id, &ret_num_platforms);
+   if(ret != CL_SUCCESS) return ret;
+
+   if(ret_num_platforms <= 0) return CL_INVALID_PLATFORM;
+
+   platform_num = platformnum;
+   if(platform_num >= ret_num_platforms) platform_num = ret_num_platforms - 1;
+   if(platform_num <= 0) platform_num = 0;
+   ret = clGetDeviceIDs(platform_id[platform_num], CL_DEVICE_TYPE_ALL, 8, device_id,
+                            &ret_num_devices);
+   if(ret != CL_SUCCESS) return ret;
+   if(ret_num_devices <= 0) {
+     XM7_DebugLog(XM7_LOG_DEBUG, "CL : Has no useful device(s).");
+     return ret;
+   }
+   if(ret_num_devices > 8) ret_num_devices = 8;
+   if(ret_num_devices <= 0) return CL_INVALID_DEVICE_TYPE;
+   XM7_DebugLog(XM7_LOG_DEBUG, "CL : Found %d processors.", ret_num_devices);
+
+   using_device = processornum;
+   if(using_device >= ret_num_devices) using_device = ret_num_devices - 1;
+   if(using_device <= 0) using_device = 0;
+
+   bCLEnableKhrGLShare = 0;
+
+   for(i = 0; i < ret_num_devices; i++ ){
+
+     extension_data[0] = '\0';
+     GetDeviceName(extension_data, sizeof(extension_data), i);
+     XM7_DebugLog(XM7_LOG_DEBUG, "CL : Processor #%d : Name = %s ", i, extension_data);
+
+     extension_data[0] = '\0';
+     clGetDeviceInfo(device_id[i], CL_DEVICE_TYPE,
+                    sizeof(cl_ulong), &(device_type[i]), &llen);
+     clGetDeviceInfo(device_id[i], CL_DEVICE_LOCAL_MEM_SIZE,
+                    sizeof(cl_ulong), &(local_memsize[i]), &llen);
+     GetDeviceType(extension_data, sizeof(extension_data), i);
+     XM7_DebugLog(XM7_LOG_DEBUG, "CL : Processor #%d : TYPE = %s / Local memory size = %d bytes", i, extension_data, local_memsize[i]);
+
+     extension_data[0] = '\0';
+     clGetDeviceInfo(device_id[i], CL_DEVICE_EXTENSIONS,
+                  1024, extension_data, &extension_len);
+     XM7_DebugLog(XM7_LOG_DEBUG, "CL : Extension features(#%d):%s", i, extension_data);
+     if(i == using_device) {
+       if(strcasestr(extension_data, "cl_khr_gl_sharing") != NULL) {
+        if(GLinterop != 0) bCLEnableKhrGLShare = -1;
+       } else {
+        bCLEnableKhrGLShare = 0;
+       }
+     }
+   }
+   
+   XM7_DebugLog(XM7_LOG_DEBUG, "CL : Using device #%d", using_device);
+   if(bCLEnableKhrGLShare != 0) { // This is only under X11. Must fix.
+     XM7_DebugLog(XM7_LOG_DEBUG, "CL : GL Interoperability enabled.");
+     properties[0] = CL_GL_CONTEXT_KHR;
+     properties[1] = (cl_context_properties)glXGetCurrentContext();
+     XM7_DebugLog(XM7_LOG_DEBUG, "CL : GL Context = %08x", glXGetCurrentContext());
+     properties[2] = CL_GLX_DISPLAY_KHR;
+     properties[3] = (cl_context_properties)glXGetCurrentDisplay();
+     XM7_DebugLog(XM7_LOG_DEBUG, "CL : GL Display = %08x", glXGetCurrentDisplay());
+     properties[4] = CL_CONTEXT_PLATFORM;
+     properties[5] = (cl_context_properties)platform_id[platform_num];
+     properties[6] = 0;
+   } else {
+     XM7_DebugLog(XM7_LOG_DEBUG, "CL : GL Interoperability disabled.");
+     properties[0] = CL_CONTEXT_PLATFORM;
+     properties[1] = (cl_context_properties)platform_id[platform_num];
+     properties[2] = 0;
+   }
+//   if(device_id == NULL) return -1;
+   
+   context = clCreateContext(properties, 1, &device_id[using_device], cl_notify_log, NULL, &ret);
+   XM7_DebugLog(XM7_LOG_DEBUG, "CL : Created context : STS = %d", ret);
+   if(ret != CL_SUCCESS) return ret;
+       
+   command_queue = clCreateCommandQueue(context, device_id[using_device],
+                                         CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &ret);
+   XM7_DebugLog(XM7_LOG_DEBUG, "CL: Created command queue.");
+   return ret;
+}
+
+static void CL_LogProgramExecute(cl_program program, void *userdata)
+{
+  char *logBuf;
+  size_t length;
+  cl_int r;
+  cl_int n;
+  cl_int num;
+  cl_device_id *devid;
+  class GLCLDraw *t = (class GLCLDraw *)userdata;
+
+  logBuf = (char *)malloc(LOGSIZE * sizeof(char));
+  if((logBuf == NULL) || (t == NULL))return;
+  num = t->ret_num_devices;
+  devid = t->device_id;
+  //  printf("DBG: %08x %d\n", t, num);
+  for(n = 0; n < num; n++) {
+    logBuf[0] = '\0';
+    r = clGetProgramBuildInfo(program, devid[n],  CL_PROGRAM_BUILD_LOG, 
+                             LOGSIZE - 1, (void *)logBuf, &length);
+    if((length > 0) && (length <= LOGSIZE)){
+      logBuf[length] = '\0';
+      if(strlen(logBuf) > 0) XM7_DebugLog(XM7_LOG_INFO, "CL :Build Log of Device #%d:%s", n, logBuf);
+    }
+  }
+  free(logBuf);
+  return;
+}
+
+
+cl_int GLCLDraw::BuildFromSource(const char *p)
+{
+    cl_int ret;
+    size_t codeSize;
+    char *logBuf;
+    char compile_options[2048];
+    cl_bool endian_little;
+    compile_options[0] = '\0';
+   
+    codeSize = strlen(p);
+    program = clCreateProgramWithSource(context, 1, (const char **)&p,
+                                        (const size_t *)&codeSize, &ret);
+    XM7_DebugLog(XM7_LOG_INFO, "CL: Build Result=%d", ret);
+    if(ret < CL_SUCCESS) {
+      return ret;
+    }
+
+
+    // Compile from source
+    strncat(compile_options, "-cl-fast-relaxed-math ", sizeof(compile_options) - 1);
+    if(clGetDeviceInfo(device_id[using_device], CL_DEVICE_ENDIAN_LITTLE,
+                      sizeof(cl_bool), &endian_little, NULL) == CL_SUCCESS){
+      if(endian_little == CL_TRUE) {
+       strncat(compile_options, "-D_CL_KERNEL_LITTLE_ENDIAN=1 ", sizeof(compile_options) - 1);
+      } else {
+       strncat(compile_options, "-D_CL_KERNEL_LITTLE_ENDIAN=0 ", sizeof(compile_options) - 1); // Big endian
+      }
+    } else {
+      strncat(compile_options, "-D_CL_KERNEL_LITTLE_ENDIAN=1 ", sizeof(compile_options) - 1); // Assume little endian
+    }
+    build_callback = CL_LogProgramExecute;
+    ret = clBuildProgram(program, 1, &device_id[using_device], compile_options,
+                        build_callback, (void *)this);
+    XM7_DebugLog(XM7_LOG_INFO, "Compile Result=%d", ret);
+    if(ret != CL_SUCCESS) {  // Printout error log.
+      //      clReleaseProgram(program);
+      return ret;
+    }
+    ret = clCreateKernelsInProgram(program, sizeof(kernels_array) / sizeof(cl_kernel),
+                                  kernels_array, &nkernels);
+    if(ret < CL_SUCCESS) {
+      XM7_DebugLog(XM7_LOG_INFO, "Unable to build CL kernel. Status=%d", ret);
+    } else {
+      char funcname[128];
+      int i;
+      size_t size;
+      XM7_DebugLog(XM7_LOG_INFO, "Built %d CL kernel(s).", nkernels);
+      for(i = 0; i < nkernels; i++) {
+       funcname[0] = '\0';
+       if(clGetKernelInfo(kernels_array[i], CL_KERNEL_FUNCTION_NAME,
+                          sizeof(funcname) / sizeof(char) - 1, 
+                          funcname, size) == CL_SUCCESS){
+         if((strncmp(funcname, "getvram8", strlen("getvram8")) == 0) && (kernel_8colors == NULL)) kernel_8colors = &kernels_array[i];
+         if((strncmp(funcname, "getvram4096", strlen("getvram4096")) == 0) && (kernel_4096colors == NULL)) kernel_4096colors = &kernels_array[i];
+         if((strncmp(funcname, "getvram256k", strlen("getvram256k")) == 0) && (kernel_256kcolors == NULL)) kernel_256kcolors = &kernels_array[i];
+         if((strncmp(funcname, "CreateTable", strlen("CreateTable")) == 0) && (kernel_table == NULL)) kernel_table = &kernels_array[i];
+         if((strncmp(funcname, "CopyVram", strlen("CopyVram")) == 0) && (kernel_copyvram == NULL)) kernel_copyvram = &kernels_array[i];
+       }
+      }
+    }
+   return ret;
+}
+
+Uint8 *GLCLDraw::GetBufPtr(Uint32 timeout)
+{
+  Uint32 t = timeout / 10;
+  Uint32 i;
+  BOOL flag = FALSE;
+  if(timeout == 0) {
+    AG_MutexLock(&mutex_buffer);
+    return TransferBuffer;
+  } else {
+    for(i = 0; i < t; i++) {
+      if(AG_MutexTryLock(&mutex_buffer) == 0) {
+       flag = TRUE;
+       break;
+      }
+      AG_Delay(10);
+    }
+    if(flag == FALSE) {
+      t = timeout % 10;
+      AG_Delay(t);
+      if(AG_MutexTryLock(&mutex_buffer) == 0) flag = TRUE;
+    }
+    if(flag == FALSE) return NULL;
+    return TransferBuffer;
+  }
+}
+
+/* Release the lock taken by a successful GetBufPtr() call. */
+void GLCLDraw::ReleaseBufPtr(void)
+{
+  AG_MutexUnlock(&(this->mutex_buffer));
+}
+
+Uint8 *GLCLDraw::MapTransferBuffer(int bmode)
+{
+  Uint8 *p = NULL;
+  cl_int ret;
+  switch(bmode)
+  {
+  case SCR_200LINE:
+    p = clEnqueueMapBuffer(command_queue, inbuf[inbuf_bank], CL_FALSE, CL_MAP_WRITE_INVALIDATE_REGION,
+                        0, 0x4000 * 3,
+                        0, NULL, &event_uploadvram[0], &ret);
+    break;
+  case SCR_400LINE:
+    p = clEnqueueMapBuffer(command_queue, inbuf[inbuf_bank], CL_FALSE, CL_MAP_WRITE_INVALIDATE_REGION,
+                        0, 0x8000 * 3,
+                        0, NULL, &event_uploadvram[0], &ret);
+    break;
+  case SCR_4096:
+    p = clEnqueueMapBuffer(command_queue, inbuf[inbuf_bank], CL_FALSE, CL_MAP_WRITE_INVALIDATE_REGION,
+                        0, 0x2000 * 12,
+                        0, NULL, &event_uploadvram[0], &ret);
+    break;
+  case SCR_262144:
+    p = clEnqueueMapBuffer(command_queue, inbuf[inbuf_bank], CL_FALSE, CL_MAP_WRITE_INVALIDATE_REGION,
+                        0, 0x2000 * 18,
+                        0, NULL, &event_uploadvram[0], &ret);
+    break;
+  }
+  if(ret < CL_SUCCESS) p = NULL;
+  return p;
+}
+
+/*
+ * Unmap a pointer obtained from MapTransferBuffer() on the current input
+ * bank; the unmap event is stored in event_uploadvram[1].
+ * Returns CL_INVALID_MEM_OBJECT for a NULL pointer, otherwise the status of
+ * the unmap call.
+ */
+cl_int GLCLDraw::UnMapTransferBuffer(Uint8 *p)
+{
+  if(p != NULL) {
+    return clEnqueueUnmapMemObject(command_queue, inbuf[inbuf_bank],
+                                   p, 0, NULL, &event_uploadvram[1]);
+  }
+  return CL_INVALID_MEM_OBJECT;
+}
+
+void GLCLDraw::AddPalette(int line, Uint8 mpage, BOOL analog)
+{
+   struct palettebuf_t *p;
+   
+   if(palettebuf == NULL) return;
+   p = palettebuf;
+   AG_MutexLock(&mutex_palette);
+   if(analog) {
+      int i;
+      Uint32 lines;
+      if(line < 0) line = 0;
+      if(line > 199) line = 199;
+      lines = p->alines_h * 256 + p->alines_l;
+      if((lastline != line) || (lines == 1)) {
+        lastline = line;
+        lines++;
+        if(lines > 199) {
+           AG_MutexUnlock(&mutex_palette);
+           return;
+        }
+        
+      }
+      //printf("AddPalette %d\n", lines);
+      p->alines_h = lines / 256;
+      p->alines_l = lines % 256;
+      p->atbls[lines - 1].line_h = line / 256;  
+      p->atbls[lines - 1].line_l = line % 256;
+      p->atbls[lines - 1].mpage = mpage;
+      for(i = 0; i < 4096; i++) {
+        p->atbls[lines - 1].r_4096[i] = apalet_r[i];
+        p->atbls[lines - 1].g_4096[i] = apalet_g[i];
+        p->atbls[lines - 1].b_4096[i] = apalet_b[i];
+      }
+   } else {
+      int i;
+      Uint32 lines;
+      int h = 199;
+      
+      if(bMode == SCR_400LINE) h = 399;
+      if(line < 0) line = 0;
+      if(line > h) line = h;
+      lines = p->dlines_h * 256 + p->dlines_l;
+      if((lastline != line) || (lines == 1)) {
+        lines++;
+        lastline = line;
+        if(lines > h) {
+           AG_MutexUnlock(&mutex_palette);
+           return;
+        }
+        
+      }
+      p->dlines_h = lines / 256;
+      p->dlines_l = lines % 256;
+      p->dtbls[lines - 1].line_h = line / 256;  
+      p->dtbls[lines - 1].line_l = line % 256;
+      p->dtbls[lines - 1].mpage = mpage;
+      for(i = 0; i < 7; i++) {
+        p->dtbls[lines - 1].tbl[i] = ttl_palet[i];
+      }
+   }
+   AG_MutexUnlock(&mutex_palette);
+
+}
+
+void GLCLDraw::ResetPalette(void)
+{
+   struct palettebuf_t *pold, *pnew;
+   int newline;
+   int endline;
+   cl_int r;
+   cl_event ev_unmap, ev_map;
+   int i;
+   
+
+ //  CopyPalette();
+   AG_MutexLock(&mutex_palette);
+   //   pold = palettebuf;
+   pnew = palettebuf;
+   lastline = 0;
+   if(pnew != NULL) {
+       palettebuf = pnew;
+            {
+               pnew->alines_h = 0;
+               pnew->alines_l = 1;
+               pnew->atbls[0].line_h = 0;
+               pnew->atbls[0].line_l = 0;
+               pnew->atbls[0].mpage = multi_page;
+               for(i = 0; i < 4096; i++) {
+                  pnew->atbls[0].r_4096[i] = apalet_r[i];
+                  pnew->atbls[0].g_4096[i] = apalet_g[i];
+                  pnew->atbls[0].b_4096[i] = apalet_b[i];
+               }
+            }
+            {
+               pnew->dlines_h = 0;
+               pnew->dlines_l = 1;
+               pnew->dtbls[0].line_h = 0;
+               pnew->dtbls[0].line_l = 0;
+               pnew->dtbls[0].mpage = multi_page;
+               for(i = 0; i < 8; i++) pnew->dtbls[0].tbl[i] = ttl_palet[i];
+            }
+   }
+   AG_MutexUnlock(&mutex_palette);
+//   clFinish(command_queue);
+//   clFlush(command_queue);
+}
+
+void GLCLDraw::CopyPalette(void)
+{
+   struct palettebuf_t *pold, *pnew;
+   int newline;
+   int endline;
+   int alines;
+   int dlines;
+   cl_int r;
+   cl_event ev_unmap, ev_map;
+   int i;
+   
+   AG_MutexLock(&mutex_palette);
+
+   pold = palettebuf;
+   newline = palette_bank + 1;
+   if(newline >= 2) newline = 0;
+   
+   pnew = clEnqueueMapBuffer(command_queue, palette_buf[newline], CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
+                            0, (size_t)sizeof(struct palettebuf_t),
+                            0, NULL, &ev_map, &r);
+   if(r < CL_SUCCESS) {
+      AG_MutexUnlock(&mutex_palette);
+      return;
+   }
+   alines = pold->alines_h * 256 + pold->alines_l;
+   dlines = pold->dlines_h * 256 + pold->dlines_l;
+   if(alines < 0)   alines = 0;
+   if(alines > 199) alines = 199;
+   if(dlines < 0)   dlines = 0;
+   if(dlines > 399) dlines = 399;
+
+   if((pold != NULL) && (pnew != NULL)) {
+     memcpy(pnew, pold, sizeof(Uint8) * 4 + sizeof(struct apalettetbl_t) * alines); // Copy Lines + Analog Palette
+     memcpy(&(pnew->dtbls[0]), &(pold->dtbls[0]), sizeof(struct dpalettetbl_t) * dlines); // Copy Digital Palette
+     palettebuf = pnew;
+     
+     clEnqueueUnmapMemObject(command_queue, palette_buf[palette_bank], 
+                            pold, 1, &ev_map,
+                            &ev_unmap);
+     palette_bank_old = palette_bank;
+     palette_bank = newline;
+   }
+   
+   AG_MutexUnlock(&mutex_palette);
+   //clFinish(command_queue);
+   clFlush(command_queue);
+}
+
+          
+   
+
+
+cl_int GLCLDraw::GetVram(int bmode)
+{
+   cl_int ret = 0;
+   cl_int r;
+   cl_kernel *kernel = NULL;
+   int w = 0;
+   int h = 0;
+   Uint8 *pr,*pg,*pb;
+   size_t lws[] = {10}; // local jobs.
+   size_t gws[] = {nCLGlobalWorkThreads}; // Parallel jobs.
+   size_t *goff = NULL;
+   int mpage = multi_page;
+   int dummy = 0;
+   int vpage;
+   int crtflag = crt_flag;
+   int bank;
+   BOOL flag = FALSE;
+   int i;
+   cl_float4 bright;
+   cl_event copy_event;
+
+   bright.s[0] = fBrightR; // R
+   bright.s[1] = fBrightG; // G
+   bright.s[2] = fBrightB; // B
+   bright.s[3] = 1.0; // A
+   
+   //if(inbuf == NULL) return -1;
+   if(outbuf == NULL) return -1;
+   //if(TransferBuffer == NULL) return -1;
+   /*
+    * Swap Buffer
+    */
+   {
+     size_t transfer_size = 0;
+     bank = inbuf_bank;
+     Uint8 *p;
+     p = GetBufPtr(0); // Maybe okay?
+     for(i = 0; i < 400 ; i++) {
+       if(bDrawLine[i]) flag = TRUE;
+       bDrawLine[i] = FALSE;
+    }
+    if(flag) {
+       ret = UnMapTransferBuffer(p);
+       if(ret < CL_SUCCESS) {
+        ReleaseBufPtr();
+        return ret;
+       }
+     }
+     switch(bmode){
+     case SCR_200LINE:
+       transfer_size = 0x4000 * 3;
+       break;
+     case SCR_400LINE:
+       transfer_size = 0x8000 * 3;
+       break;
+     case SCR_4096:
+       transfer_size = 0x2000 * 12;
+       break;
+     case SCR_262144:
+       transfer_size = 0x2000 * 18;
+       break;
+     }
+     if((flag != FALSE) && (transfer_size > 0)){
+       inbuf_bank++;
+       if(inbuf_bank >= 2) inbuf_bank = 0;
+       if(kernel_copyvram != NULL) {
+        size_t lws_copy[] = {1};
+        size_t gws_copy[] = {gws[0]};
+             
+        cl_int size = transfer_size;
+        ret |= clSetKernelArg(*kernel_copyvram, 0, sizeof(cl_mem), (void *)&(inbuf[inbuf_bank]));
+        ret |= clSetKernelArg(*kernel_copyvram, 1, sizeof(cl_mem), (void *)&(inbuf[bank]));
+        ret |= clSetKernelArg(*kernel_copyvram, 2, sizeof(cl_int), &size);
+        ret |= clSetKernelArg(*kernel_copyvram, 3, sizeof(cl_int), &bCLSparse);
+        if(bCLSparse) {
+           ret = clEnqueueNDRangeKernel(command_queue, *kernel_copyvram, 1, 
+                                        goff, gws_copy, lws_copy, 
+                                        0, NULL,  &copy_event);
+        } else {
+           ret = clEnqueueTask(command_queue,
+                               *kernel_copyvram, 0, NULL, &copy_event);
+        }
+      } else {
+       ret = clEnqueueCopyBuffer(command_queue, inbuf[bank], inbuf[inbuf_bank], 0,
+                                0, transfer_size, 0, NULL,
+                              &copy_event);
+      }
+      clFinish(command_queue);
+      TransferBuffer = MapTransferBuffer(bmode);
+     }
+     ReleaseBufPtr();
+     if(TransferBuffer == NULL) return CL_MEM_OBJECT_ALLOCATION_FAILURE;
+   }
+   if((flag) || bPaletFlag || SDLDrawFlag.APaletteChanged || SDLDrawFlag.DPaletteChanged) {
+   kernel = NULL;
+   LockVram();
+   SDLDrawFlag.APaletteChanged = FALSE;
+   SDLDrawFlag.DPaletteChanged = FALSE;
+   SDLDrawFlag.Drawn = FALSE;
+   bPaletFlag = FALSE;
+   UnlockVram();
+   CopyPalette();
+   switch(bmode) {
+    case SCR_400LINE:
+    case SCR_200LINE:
+      w = 640;
+      h = 200;
+      if(bmode == SCR_400LINE) h = 400;
+      vpage = (~(multi_page >> 4)) & 0x07;
+      //gws[0] = h;
+      if(kernel_8colors != NULL) kernel = kernel_8colors;
+      if(kernel != NULL) {
+        ret |= clSetKernelArg(*kernel, 0, sizeof(cl_mem), (void *)&(inbuf[bank]));
+        ret |= clSetKernelArg(*kernel, 1, sizeof(int),    (void *)&w);
+        ret |= clSetKernelArg(*kernel, 2, sizeof(int), (void *)&h);
+        ret |= clSetKernelArg(*kernel, 3, sizeof(cl_mem), (void *)&outbuf);
+        ret |= clSetKernelArg(*kernel, 4, sizeof(cl_mem), (void *)&palette_buf[palette_bank_old]);
+        ret |= clSetKernelArg(*kernel, 5, sizeof(cl_mem), (void *)&table);
+        ret |= clSetKernelArg(*kernel, 6, sizeof(int), (void *)&bCLSparse);
+        ret |= clSetKernelArg(*kernel, 7, sizeof(int), (void *)&crtflag);
+        ret |= clSetKernelArg(*kernel, 8, sizeof(cl_float4), (void *)&bright);
+        clFlush(command_queue);
+      }
+      break;
+    case SCR_262144:// Windowはなし
+      w = 320;
+      h = 200;
+      //gws[0] = h;
+
+      //      if(kernel == NULL) kernel = clCreateKernel(program, "getvram256k", &ret);
+      if(kernel_256kcolors != NULL) kernel = kernel_256kcolors;
+      if(kernel != NULL) {
+      /*
+       * Below transfer is dummy.
+       */
+        ret |= clSetKernelArg(*kernel, 0, sizeof(cl_mem),  (void *)&(inbuf[bank]));
+        ret |= clSetKernelArg(*kernel, 1, sizeof(cl_int),  (void *)&w);
+        ret |= clSetKernelArg(*kernel, 2, sizeof(cl_int),  (void *)&h);
+        ret |= clSetKernelArg(*kernel, 3, sizeof(cl_mem),  (void *)&outbuf);
+        ret |= clSetKernelArg(*kernel, 4, sizeof(cl_mem),  (void *)&table);
+        ret |= clSetKernelArg(*kernel, 5, sizeof(cl_uint), (void *)&mpage);
+        ret |= clSetKernelArg(*kernel, 6, sizeof(cl_int),  (void *)&bCLSparse);
+        ret |= clSetKernelArg(*kernel, 7, sizeof(cl_int),  (void *)&crtflag);
+        ret |= clSetKernelArg(*kernel, 8, sizeof(cl_float4), (void *)&bright);
+        clFlush(command_queue);
+      }
+      break;
+    case SCR_4096:
+      w = 320;
+      h = 200;
+      //gws[0] = h;
+      //if(kernel == NULL) kernel = clCreateKernel(program, "getvram4096", &ret);
+      if(kernel_4096colors != NULL) kernel = kernel_4096colors;
+      if(kernel != NULL) {
+        
+        ret |= clSetKernelArg(*kernel, 0, sizeof(cl_mem),  (void *)&(inbuf[bank]));
+        ret |= clSetKernelArg(*kernel, 1, sizeof(cl_int),  (void *)&w);
+        ret |= clSetKernelArg(*kernel, 2, sizeof(cl_int),  (void *)&h);
+        ret |= clSetKernelArg(*kernel, 3, sizeof(cl_mem),  (void *)&outbuf);
+        ret |= clSetKernelArg(*kernel, 4, sizeof(cl_mem),  (void *)&(palette_buf[palette_bank_old]));
+        ret |= clSetKernelArg(*kernel, 5, sizeof(cl_mem),  (void *)&table);
+        ret |= clSetKernelArg(*kernel, 6, sizeof(cl_int),  (void *)&bCLSparse);
+        ret |= clSetKernelArg(*kernel, 7, sizeof(cl_int),  (void *)&crtflag);
+        ret |= clSetKernelArg(*kernel, 8, sizeof(cl_float4), (void *)&bright);
+      //clFinish(command_queue);
+        clFlush(command_queue);
+      }
+      break;
+   }
+   w2 = w;
+   h2 = h;
+   //CopyPalette();
+   if(bCLEnableKhrGLShare != 0) {
+     glFlush();
+     ret |= clEnqueueAcquireGLObjects (command_queue,
+                                 1, (cl_mem *)&outbuf,
+                                 0, NULL, &event_copytotexture);
+     if(kernel != NULL) {
+       if(bCLSparse) {
+        ret = clEnqueueNDRangeKernel(command_queue, *kernel, 1, 
+                                     goff, gws, lws, 
+                                     1, &event_copytotexture,  &event_exec);
+       } else {
+        ret = clEnqueueTask(command_queue,
+                            *kernel, 1, &event_copytotexture, &event_exec);
+       }
+     }
+      
+     ret |= clEnqueueReleaseGLObjects (command_queue,
+                                 1, (cl_mem *)&outbuf,
+                                 1, &event_exec, &event_release);
+     clFinish(command_queue);
+//     glFinish();
+   } else {
+     if(kernel != NULL) {
+       if(bCLSparse) {
+        ret = clEnqueueNDRangeKernel(command_queue, *kernel, 1, 
+                                     goff, gws, lws, 
+                                     1, &event_uploadvram[2],  &event_exec);
+       } else {
+        ret = clEnqueueTask(command_queue,
+                            *kernel, 1, &event_uploadvram[2], &event_exec);
+       }
+     }
+   }
+   }
+   
+   return ret;
+}
+
+
+cl_int GLCLDraw::SetupTable(void)
+{
+   cl_int r = CL_INVALID_KERNEL;
+   cl_uint pages;
+   cl_event tbl_ev;
+   pages = 6;
+
+   if(kernel_table != NULL) {
+     r = 0;
+      r |= clSetKernelArg(*kernel_table, 0, sizeof(cl_mem),     (void *)&table);
+      r |= clSetKernelArg(*kernel_table, 1, sizeof(cl_uint),    (void *)&pages);
+      r |= clEnqueueTask(command_queue,
+                            *kernel_table, 0, NULL, NULL);
+      clFinish(command_queue);
+    }
+   return r;
+}
+
+
+cl_int GLCLDraw::SetupBuffer(GLuint *texid)
+{
+   cl_int ret = 0;
+   cl_int r = 0;
+   cl_event ev;
+   unsigned int size = 640 * 400 * sizeof(cl_uchar4);
+   int i;
+
+   inbuf_bank = 0;
+   for(i = 0; i < 2; i++) {
+     inbuf[i] = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR | 0, // Reduce HOST-CPU usage.
+                            (size_t)(0x2000 * 18 * sizeof(Uint8)), NULL, &r);
+     ret |= r;
+     if(r == CL_SUCCESS){
+       cl_int r2;
+       cl_event cl_event_map;
+       cl_event cl_event_unmap;
+       Uint8 *p;
+
+       p = clEnqueueMapBuffer(command_queue, inbuf[i], CL_TRUE, CL_MAP_WRITE,
+                             0, (size_t)(0x2000 * 18 * sizeof(Uint8)),
+                        0, NULL, &cl_event_map, &r2);
+       if((r2 >= CL_SUCCESS) && (p != NULL)) {
+       memset(p, 0x00, (size_t)(0x2000 * 18 * sizeof(Uint8)));
+       clEnqueueUnmapMemObject(command_queue, inbuf[i], 
+                                p, 1, &cl_event_map,
+                                &cl_event_unmap);
+       clFinish(command_queue);
+       }
+     }
+     XM7_DebugLog(XM7_LOG_INFO, "CL: Alloc STS: inbuf[%d] : %d", i, r);
+     palette_buf[i] = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR | 0, // Reduce HOST-CPU usage.
+                            (size_t)sizeof(struct palettebuf_t), NULL, &r);
+     ret = r;
+     if(r == CL_SUCCESS){
+       cl_int r2;
+       cl_event cl_event_map;
+       cl_event cl_event_unmap;
+       Uint8 *p;
+       p = clEnqueueMapBuffer(command_queue, palette_buf[i], CL_TRUE, CL_MAP_WRITE,
+                             0, (size_t)sizeof(struct palettebuf_t),
+                             0, NULL, &cl_event_map, &r2);
+       if((r2 >= CL_SUCCESS) && (p != NULL)) {
+         memset(p, 0x00, (size_t)sizeof(struct palettebuf_t));
+         clEnqueueUnmapMemObject(command_queue, palette_buf[i], 
+                                 p, 1, &cl_event_map,
+                                 &cl_event_unmap);
+         clFinish(command_queue);
+       }
+     }
+     XM7_DebugLog(XM7_LOG_INFO, "CL: Alloc STS: palette_buf[%d] : %d", i, r);
+   }
+   TransferBuffer = MapTransferBuffer(SCR_262144);
+   palettebuf = clEnqueueMapBuffer(command_queue, palette_buf[0], CL_TRUE, CL_MAP_WRITE,
+                             0, (size_t)sizeof(struct palettebuf_t),
+                             0, NULL, &ev, &r);
+   ResetPalette();
+   table = clCreateBuffer(context, CL_MEM_READ_WRITE | 0,
+                 (size_t)(0x100 * 8 * 6 * sizeof(cl_uint)), NULL, &r);
+   ret |= r;
+   XM7_DebugLog(XM7_LOG_INFO, "CL: Alloc STS: table : %d", r);
+
+   // Texture直接からPBO使用に変更 20121102
+   if((bCLEnableKhrGLShare != 0) && (bGL_PIXEL_UNPACK_BUFFER_BINDING != FALSE)){
+       glGenBuffers(1, &pbo);
+       if(pbo <= 0) {
+        bCLEnableKhrGLShare = FALSE;
+        goto _fallback;
+       }
+
+       glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo);
+       //glBufferData(GL_PIXEL_UNPACK_BUFFER, size, NULL, GL_DYNAMIC_DRAW);
+       glBufferData(GL_PIXEL_UNPACK_BUFFER, size, NULL, GL_STREAM_DRAW);
+       //    XM7_DebugLog(XM7_LOG_DEBUG, "CL: PBO=%08x Size=%d context=%08x", pbo, size, context);
+       outbuf = clCreateFromGLBuffer(context, CL_MEM_WRITE_ONLY | 0, 
+                                    pbo, &r);
+       if(r != GL_NO_ERROR) {
+        glDeleteBuffers(1, &pbo);
+//      pbo = 0;
+        bCLEnableKhrGLShare = FALSE;
+        goto _fallback;
+       }
+       glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
+       ret |= r;
+       XM7_DebugLog(XM7_LOG_INFO, "CL: Alloc STS: outbuf (GLCL Interop): %d", r);
+       return ret;
+   }
+ _fallback:
+   outbuf = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR,
+                          (size_t)(640 * 400 * sizeof(Uint32)), NULL, &r);
+   ret |= r;
+   XM7_DebugLog(XM7_LOG_INFO, "CL: Alloc STS: outbuf (CL): %d", r);
+   return ret;
+}
+
+/* Name of the GL buffer object generated by SetupBuffer(). */
+GLuint GLCLDraw::GetPbo(void)
+{
+   return this->pbo;
+}
diff --git a/source/src/agar/common/agar_glcl.h b/source/src/agar/common/agar_glcl.h
new file mode 100644 (file)
index 0000000..dedb24d
--- /dev/null
@@ -0,0 +1,154 @@
+/*
+ * Header for CL with GL
+ * (C) 2012 K.Ohta
+ * Notes:
+ *   Not CL model: VramDraw->[Virtual Vram]->AGEventDraw2->drawUpdateTexture->[GL Texture]->Drawing
+ *       CL Model: AGEvenDraw2 -> GLCL_DrawEventSub -> [GL/CL Texture] ->Drawing
+ * History:
+ *   Nov 01,2012: Initial.
+ */
+#include <SDL/SDL.h>
+#include <agar/core.h>
+
+#ifdef _WINDOWS
+#include <GL/gl.h>
+#include <GL/glext.h>
+#else
+#include <GL/glx.h>
+#include <GL/glxext.h>
+#endif
+
+#ifdef _USE_OPENCL
+/*
+ * The CL_USE_DEPRECATED_OPENCL_*_APIS switches are evaluated by the OpenCL
+ * headers while they are being included, so they have to be defined before
+ * <CL/cl.h> is pulled in; defining them afterwards has no effect.
+ */
+#if 1
+#define CL_USE_DEPRECATED_OPENCL_1_0_APIS
+#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
+#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
+#define CL_USE_DEPRECATED_OPENCL_2_0_APIS
+#endif
+
+#include <CL/cl.h>
+#include <CL/cl_gl.h>
+#include <CL/cl_gl_ext.h>
+
+extern "C" {
+   #include "xm7_types.h"
+   extern BYTE     ttl_palet[8];
+   extern BYTE     apalet_b[4096];
+   extern BYTE     apalet_r[4096];
+   extern BYTE     apalet_g[4096];
+   extern BYTE     multi_page;
+};
+
+extern GLuint uVramTextureID;
+
+/*
+ * One packed (no padding) analog-palette record: three single-byte header
+ * fields followed by three 4096-entry byte tables.
+ * NOTE(review): line_h/line_l look like the high/low bytes of a scanline
+ * number and r/g/b_4096 like per-component tables of the 4096-colour
+ * palette -- confirm against GLCLDraw::AddPalette() and the CL kernel.
+ */
+struct apalettetbl_t {
+   Uint8 line_h;
+   Uint8 line_l;
+   Uint8 mpage;
+   Uint8 r_4096[4096];
+   Uint8 g_4096[4096];
+   Uint8 b_4096[4096];
+} __attribute__((packed));
+
+/*
+ * One packed (no padding) digital-palette record: the same three-byte header
+ * as apalettetbl_t followed by an 8-entry byte table.
+ * NOTE(review): tbl[8] presumably mirrors ttl_palet[8] (the 8-colour
+ * palette) -- confirm against GLCLDraw::AddPalette().
+ */
+struct dpalettetbl_t {
+   Uint8 line_h;
+   Uint8 line_l;
+   Uint8 mpage;
+   Uint8 tbl[8];
+}__attribute__((packed));
+
+/*
+ * Packed palette buffer: a four-byte header followed by 200 analog-palette
+ * records and 400 digital-palette records. GLCLDraw maps a CL buffer of
+ * sizeof(struct palettebuf_t) bytes and accesses it through this layout.
+ * NOTE(review): alines_h/l and dlines_h/l look like high/low bytes of the
+ * number of valid entries in atbls[] / dtbls[] -- confirm in agar_glcl.cpp.
+ */
+struct palettebuf_t {
+   Uint8 alines_h;
+   Uint8 alines_l;
+   Uint8 dlines_h;
+   Uint8 dlines_l;
+   struct apalettetbl_t atbls[200];
+   struct dpalettetbl_t dtbls[400];
+}__attribute__((packed));
+
+
+/*
+ * GLCLDraw: bundles the OpenCL context, command queue, program, kernels and
+ * memory objects used by the CL drawing path, together with the GL pixel
+ * buffer object that GetPbo() exposes.
+ *
+ * NOTE(review): data members use in-class default initializers, so this
+ * header requires a C++11 (or later) compiler.
+ */
+class GLCLDraw {
+ public:
+   GLCLDraw();
+   ~GLCLDraw();
+   // Called by the draw code with the current screen mode; the caller treats
+   // any result other than CL_SUCCESS as a failure.
+   cl_int GetVram(int bmode);
+   // Called with the kernel source text (cl_render) after InitContext().
+   cl_int BuildFromSource(const char *p);
+   cl_int SetupBuffer(GLuint *texid);
+   cl_int SetupTable(void);
+   // Called with the platform number, device number and GL-interop request.
+   cl_int InitContext(int platformnum, int processornum, int GLinterop);
+   int GetPlatforms(void);
+   int GetUsingDeviceNo(void);
+   int GetDevices(void);
+   void GetDeviceType(char *str, int maxlen, int num);
+   void GetDeviceName(char *str, int maxlen, int num);
+   Uint8 *MapTransferBuffer(int bmode);
+   cl_int UnMapTransferBuffer(Uint8 *p);
+   // Returns the pbo member.
+   GLuint GetPbo(void);
+   // Non-zero result is taken by InitContextCL() to mean CL/GL interop is active.
+   int GetGLEnabled(void);
+   // Pixel data for the non-interop path; pair each call with ReleasePixelBuffer().
+   Uint32 *GetPixelBuffer(void);
+   int ReleasePixelBuffer(Uint32 *p);
+   Uint8 *GetBufPtr(Uint32 timeout);
+   void ReleaseBufPtr(void);
+   void AddPalette(int line, Uint8 mpage, BOOL analog);
+   void ResetPalette(void);
+   void CopyPalette(void);
+   cl_context context = NULL;
+   cl_command_queue command_queue = NULL;
+
+   /* Program Object */
+   const char *source = NULL;
+   cl_program program = NULL;
+   cl_int ret_num_devices;
+   cl_int ret_num_platforms;
+   cl_int platform_num = 0;
+   cl_platform_id platform_id[8];
+   cl_device_id device_id[8];
+
+ private:
+   // NOTE(review): CL_CALLBACK is a calling-convention macro, not a type, so
+   // the two declarations below have no return type. Presumably
+   // "void CL_CALLBACK ..." was intended -- confirm against the definition
+   // in agar_glcl.cpp before changing.
+   CL_CALLBACK LogProgramExecute(cl_program program, void *userdata);
+   CL_CALLBACK (*build_callback)(cl_program, void *);
+   int w2 = 0;
+   int h2 = 0;
+   cl_event event_exec;
+   cl_event event_uploadvram[4];
+   cl_event event_copytotexture;
+   cl_event event_release;
+   cl_kernel kernels_array[16];
+   cl_kernel *kernel_8colors = NULL;
+   cl_kernel *kernel_4096colors = NULL;
+   cl_kernel *kernel_256kcolors = NULL;
+   cl_kernel *kernel_table = NULL; 
+   cl_kernel *kernel_copyvram = NULL;
+   cl_uint nkernels;
+
+   // inbuf[] and palette_buf[] each hold two memory objects.
+   // NOTE(review): the *_bank members presumably select the active one -- confirm.
+   int inbuf_bank = 0;
+   int palette_bank = 0;
+   int palette_bank_old = 0;
+   cl_mem inbuf[2] = {NULL, NULL};
+   cl_mem palette_buf[2] = {NULL, NULL};
+   cl_mem outbuf = NULL;
+   cl_mem internalpal = NULL;
+   cl_mem table = NULL;
+   cl_context_properties *properties = NULL;   
+   GLuint pbo = 0; // GL buffer object name returned by GetPbo()
+   int lastline;
+   int using_device = 0;
+   int bCLEnableKhrGLShare = 0;
+   Uint32 *pixelBuffer = NULL;
+   Uint8 *TransferBuffer = NULL;
+   struct palettebuf_t *palettebuf = NULL;
+   int bModeOld = -1;
+   cl_device_type device_type[8];
+   cl_ulong local_memsize[8];
+   AG_Mutex mutex_buffer;
+   AG_Mutex mutex_palette;
+};
+
+/*
+ * Identifiers for the per-colour-mode kernels, numbered from 0;
+ * CLKERNEL_END is one past the last valid value.
+ * NOTE(review): presumably used as indices into GLCLDraw::kernels_array --
+ * confirm in agar_glcl.cpp.
+ */
+enum {
+  CLKERNEL_8 = 0,
+  CLKERNEL_4096,
+  CLKERNEL_256K,
+  CLKERNEL_END
+};
+
+#endif /* _USE_OPENCL */
diff --git a/source/src/agar/common/agar_gldraw.h b/source/src/agar/common/agar_gldraw.h
new file mode 100644 (file)
index 0000000..3141c34
--- /dev/null
@@ -0,0 +1,50 @@
+/*\r
+ * agar_gldraw.h\r
+ *\r
+ *  Created on: 2011/01/21\r
+ *      Author: whatisthis\r
+ */\r
+\r
+#ifndef AGAR_GLDRAW_H_\r
+#define AGAR_GLDRAW_H_\r
+\r
+#include <agar/core/types.h>\r
+#include <agar/core.h>\r
+#include <agar/gui.h>\r
+#include "api_draw.h"\r
+#include "api_vram.h"\r
+\r
+#include "agar_vramutil.h"\r
+#include "agar_draw.h"\r
+#include "agar_glutil.h"\r
+\r
+extern BOOL EventSDL(AG_Driver *drv);\r
+extern BOOL EventGUI(AG_Driver *drv);\r
+\r
+#ifdef USE_OPENGL\r
+extern void DrawOSDGL(AG_GLView *w);\r
+\r
+extern void AGEventScaleGL(AG_Event *event);\r
+extern void AGEventDrawGL(AG_Event *event);\r
+\r
+extern void AGEventOverlayGL(AG_Event *event);\r
+extern void AGEventMouseMove_AG_GL(AG_Event *event);\r
+extern void AGEventKeyRelease_AG_GL(AG_Event *event);\r
+extern void AGEventKeyPress_AG_GL(AG_Event *event);\r
+\r
+extern void InitGL_AG_GL(int w, int h);\r
+extern void Detach_AG_GL();\r
+/*\r
+ * agar_gldraw2.cpp\r
+ */\r
+extern void InitGL_AG2(int w, int h);\r
+extern void DetachGL_AG2(void);\r
+\r
+extern void AGEventDrawGL2(AG_Event *event);\r
+extern void AGEventKeyUpGL(AG_Event *event);\r
+extern void AGEventKeyDownGL(AG_Event *event);\r
+\r
+extern  GLuint uVramTextureID;\r
+extern  GLuint uNullTextureID;\r
+#endif /* USE_OPENGL */\r
+#endif /* AGAR_GLDRAW_H_ */\r
diff --git a/source/src/agar/common/agar_gldraw2.cpp b/source/src/agar/common/agar_gldraw2.cpp
new file mode 100644 (file)
index 0000000..f6c382b
--- /dev/null
@@ -0,0 +1,427 @@
+/*
+ * agar_gldraw2.cpp
+ * Using Indexed palette @8Colors.
+ * (c) 2011 K.Ohta <whatisthis.sowhat@gmail.com>
+ */
+
+#include <agar/core.h>
+#include <agar/core/types.h>
+#include <agar/gui.h>
+
+#include <SDL/SDL.h>
+#ifdef _WINDOWS
+#include <GL/gl.h>
+#include <GL/glext.h>
+#else
+#include <GL/glx.h>
+#include <GL/glxext.h>
+#endif
+#ifdef _USE_OPENCL
+# include "agar_glcl.h"
+#endif
+
+#ifdef USE_OPENMP
+#include <omp.h>
+#endif //_OPENMP
+
+#include "api_draw.h"
+//#include "api_scaler.h"
+#include "api_kbd.h"
+
+#include "agar_xm7.h"
+#include "agar_draw.h"
+#include "agar_gldraw.h"
+#include "agar_cfg.h"
+#include "xm7.h"
+#include "display.h"
+#include "subctrl.h"
+#include "device.h"
+
+
+GLuint uVramTextureID;
+GLuint uNullTextureID;
+#ifdef _USE_OPENCL
+extern class GLCLDraw *cldraw;
+#endif
+// InitContextCL() is defined in agar_glutil.cpp whether or not _USE_OPENCL is
+// set, and AGEventDrawGL2() calls it unconditionally, so the declaration must
+// not be hidden behind the #ifdef.
+extern void InitContextCL(void);
+
+extern void InitGL_AG2(int w, int h);
+extern void DetachGL_AG2(void);
+
+// Grid line vertex tables. agar_glutil.cpp defines these as arrays, so they
+// must be declared as arrays here as well: declaring them as pointers makes
+// the compiler read the first bytes of the table as an address.
+extern GLfloat GridVertexs200l[];
+extern GLfloat GridVertexs400l[];
+
+// Brights
+float fBrightR;
+float fBrightG;
+float fBrightB;
+
+
+/*
+ * Store new per-channel brightness factors (read back by the draw handler)
+ * and flag the screen as needing a redraw.
+ */
+void SetBrightRGB_AG_GL2(float r, float g, float b)
+{
+   fBrightR = r; fBrightG = g; fBrightB = b;
+
+   // Request a repaint so the new factors are applied on the next draw.
+   SDLDrawFlag.Drawn = TRUE;
+}
+
+
+
+/*
+ * Event Functins
+ */
+
+/*
+ * Overlay event handler. Currently a stub: it only fetches the widget the
+ * event was delivered to and does nothing with it.
+ */
+void AGEventOverlayGL(AG_Event *event)
+{
+   AG_GLView *const view = (AG_GLView *)AG_SELF();
+   (void)view; // not used yet
+}
+
+
+/*
+ * Scale event handler: makes the GL viewport match the widget's view rectangle.
+ */
+void AGEventScaleGL(AG_Event *event)
+{
+   AG_GLView *const view = (AG_GLView *)AG_SELF();
+   const auto &area = view->wid.rView;
+
+   glViewport(area.x1, area.y1, area.w, area.h);
+    //glLoadIdentity();
+    //glOrtho(-1.0, 1.0,       1.0, -1.0, -1.0,  1.0);
+}
+
+
+/*
+ * Placeholder for grid drawing: converts the opaque argument to the GL view
+ * type and returns without drawing anything.
+ */
+static void drawGrids(void *pg,int w, int h)
+{
+    AG_GLView *const view = (AG_GLView *)pg;
+    (void)view; // nothing is drawn yet
+}
+
+
+/*
+ * Refresh the VRAM texture (uVramTextureID) for the current frame.
+ *
+ * p       : source pixels for the non-CL upload path; not used by the CL path.
+ * w, h    : size of the uploaded region in the CL path; h is also the number
+ *           of bDrawLine[] entries scanned for pending changes. The non-CL
+ *           path always uploads 640 pixels per row.
+ * crtflag : in the non-CL path the texture is only uploaded when this is not FALSE.
+ *
+ * Does nothing when uVramTextureID is 0. All work happens between
+ * LockVram() and UnlockVram().
+ */
+static void drawUpdateTexture(Uint32 *p, int w, int h, BOOL crtflag)
+{
+    if(uVramTextureID != 0){
+       // NOTE(review): pu, pq, xx, yy, ww, hh and ofset are never read below.
+       Uint32 *pu;
+       Uint32 *pq;
+       int xx;
+       int yy;
+       int ww;
+       int hh;
+       int ofset;
+       BOOL flag;
+       int i;
+       //       glPushAttrib(GL_TEXTURE_BIT);
+       ww = w >> 3;
+       hh = h >> 3;
+
+#ifdef _USE_OPENCL
+       if((cldraw != NULL) && (bCLEnabled)) {
+         // OpenCL path: the pixels are produced by cldraw->GetVram().
+         cl_int ret = CL_SUCCESS;
+         LockVram();
+         // Work is needed if any line is marked dirty or a redraw was forced.
+         flag = FALSE;
+         for(i = 0; i < h; i++) {
+           if(bDrawLine[i]) {
+             flag = TRUE;
+           }
+         }
+         if(SDLDrawFlag.Drawn) flag = TRUE;
+         if(flag) {
+               ret = cldraw->GetVram(bModeOld);
+               for(i = 0; i < h; i++)  bDrawLine[i] = FALSE;
+
+               if(ret != CL_SUCCESS) {
+                 // CL step failed: clear the pending flags and skip the upload.
+                 SDLDrawFlag.Drawn = FALSE;
+                 bPaletFlag = FALSE;
+                 glBindTexture(GL_TEXTURE_2D, 0);
+                 UnlockVram();
+                 return;
+               }
+           }
+           if(bCLGLInterop){
+               // With a buffer bound to GL_PIXEL_UNPACK_BUFFER the NULL data
+               // pointer means "offset 0 in that buffer".
+               glBindBuffer(GL_PIXEL_UNPACK_BUFFER, cldraw->GetPbo());
+               glBindTexture(GL_TEXTURE_2D, uVramTextureID);
+               // Copy pbo to texture 
+               glTexSubImage2D(GL_TEXTURE_2D, 
+                               0,
+                               0,
+                               0,
+                               w,
+                               h,
+                               GL_RGBA,
+                               GL_UNSIGNED_BYTE,
+                               NULL);
+               glFinish();
+               glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
+               glBindTexture(GL_TEXTURE_2D, 0);
+               glFinish();
+           } else { // Not interoperability with GL
+               Uint32 *pp;
+               pp = cldraw->GetPixelBuffer();
+               glBindTexture(GL_TEXTURE_2D, uVramTextureID);
+               if(pp != NULL) glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0,
+                                             w, h, GL_RGBA, GL_UNSIGNED_BYTE, pp);
+               glFinish();
+               // NOTE(review): ReleasePixelBuffer() is also reached when pp is NULL.
+               cldraw->ReleasePixelBuffer(pp);
+               glBindTexture(GL_TEXTURE_2D, 0);
+               glFinish();
+           }
+           SDLDrawFlag.Drawn = FALSE;
+           bPaletFlag = FALSE;
+         UnlockVram();
+       } else {
+#endif
+         // Non-CL path: upload the pixel data passed in p.
+         LockVram();
+         // Collect and clear the per-line dirty flags in a single pass.
+         flag = FALSE;
+         for(i = 0; i < h; i++) {
+           if(bDrawLine[i]) {
+             flag = TRUE;
+             bDrawLine[i] = FALSE;
+           }
+         }
+         flag |= SDLDrawFlag.Drawn;
+         if((p != NULL) && (flag)) {
+            if(crtflag != FALSE) {
+               glBindTexture(GL_TEXTURE_2D, uVramTextureID);
+               glTexSubImage2D(GL_TEXTURE_2D, 
+                         0,
+                         0,
+                         0,
+                         640,
+                         h,
+                         GL_RGBA,
+                         GL_UNSIGNED_BYTE,
+                         p);
+              glFinish();
+              glBindTexture(GL_TEXTURE_2D, 0); // 20111023 suppress flicker and similar artifacts
+            }
+            bPaletFlag = FALSE;
+            SDLDrawFlag.Drawn = FALSE;
+         }
+         UnlockVram();
+#ifdef _USE_OPENCL
+       }
+#endif       
+    }
+}
+
+   
+
+
+
+
+/*
+ * "Draw" event handler.
+ *
+ * Sequence: pick the source size and texture coordinates for the current
+ * screen mode (bModeOld), make sure the VRAM and null textures exist, refresh
+ * the VRAM texture, draw it as a full-view quad, apply the brightness factors
+ * with a blended quad (non-CL rendering only), draw scanline grid lines when
+ * the view is at least twice the source height and bFullScan is 0, and finally
+ * (when built with USE_OPENGL) draw the OSD.
+ */
+
+void AGEventDrawGL2(AG_Event *event)
+{
+   AG_GLView *glv = (AG_GLView *)AG_SELF();
+   // NOTE(review): i, width, pp, x and TexCoords2 are declared but never used.
+   int w;
+   int h;
+   int i;
+   float width;
+   float yf;
+   Uint32 *p;
+   Uint32 *pp;
+   int x;
+   int y;
+   GLfloat TexCoords[4][2];
+   GLfloat Vertexs[4][3];
+   GLfloat TexCoords2[4][2];
+   GLfloat *gridtid;
+   BOOL crtflag = crt_flag;
+   
+   p = pVram2;
+   // Without a frame buffer there is nothing to draw unless CL rendering is active.
+   if((p == NULL) && (bCLEnabled == FALSE)) return;
+     // Source size and texture coordinates per mode; the coordinates are
+     // written as fractions of 640 x 400.
+     switch(bModeOld) {
+        case SCR_400LINE:
+            w = 640;
+            h = 400;
+           TexCoords[0][0] = TexCoords[3][0] = 0.0f; // Xbegin
+            TexCoords[0][1] = TexCoords[1][1] = 0.0f; // Ybegin
+
+            TexCoords[2][0] = TexCoords[1][0] = 640.0f / 640.0f; // Xend
+            TexCoords[2][1] = TexCoords[3][1] = 399.0f / 400.0f; // Yend
+           gridtid = GridVertexs400l;
+            break;
+        case SCR_200LINE:
+            w = 640;
+            h = 200;
+            TexCoords[0][0] = TexCoords[3][0] = 0.0f; // Xbegin
+            TexCoords[0][1] = TexCoords[1][1] = 0.0f; // Ybegin
+
+            TexCoords[2][0] = TexCoords[1][0] = 640.0f / 640.0f; // Xend
+            TexCoords[2][1] = TexCoords[3][1] = 199.0f / 400.0f; // Yend
+           gridtid = GridVertexs200l;
+            break;
+        case SCR_262144:
+        case SCR_4096:
+        default:
+            w = 320;
+            h = 200;
+            TexCoords[0][0] = TexCoords[3][0] = 0.0f; // Xbegin
+            TexCoords[0][1] = TexCoords[1][1] = 0.0f; // Ybegin
+
+            TexCoords[2][0] = TexCoords[1][0] = 320.0f / 640.0f; // Xend
+            TexCoords[2][1] = TexCoords[3][1] = 199.0f / 400.0f; // Yend
+           gridtid = GridVertexs200l;
+            break;
+     }
+
+    // Quad covering the whole [-1, 1] x [-1, 1] square at z = -0.98.
+    Vertexs[0][2] = Vertexs[1][2] = Vertexs[2][2] = Vertexs[3][2] = -0.98f;
+    Vertexs[0][0] = Vertexs[3][0] = -1.0f; // Xbegin
+    Vertexs[0][1] = Vertexs[1][1] = 1.0f;  // Yend
+    Vertexs[2][0] = Vertexs[1][0] = 1.0f; // Xend
+    Vertexs[2][1] = Vertexs[3][1] = -1.0f; // Ybegin
+
+
+    if(uVramTextureID == 0) uVramTextureID = CreateNullTexture(640, 400); //  prevent dot ghosting
+    if(uNullTextureID == 0) uNullTextureID = CreateNullTexture(640, 400); //  prevent dot ghosting
+     /*
+     * 20110904 OOPS! Updating-Texture must be in Draw-Event-Handler(--;
+     */
+
+    glPushAttrib(GL_TEXTURE_BIT);
+    glPushAttrib(GL_TRANSFORM_BIT);
+    glPushAttrib(GL_ENABLE_BIT);
+    // NOTE(review): in this file InitContextCL() is declared only inside
+    // #ifdef _USE_OPENCL, while this call is unconditional -- confirm that a
+    // build without _USE_OPENCL still sees a declaration.
+    InitContextCL();   
+
+    glMatrixMode(GL_PROJECTION);
+    glLoadIdentity();
+    
+   
+    glEnable(GL_DEPTH_TEST);
+    glDisable(GL_BLEND);
+   
+    /*
+     * Show VRAM: a quad with the texture applied
+     */
+     //if(uVramTextureID != 0) {
+
+       // The VRAM texture is refreshed and used only while the mode is
+       // unchanged and crtflag is set; otherwise the null texture is shown.
+       if((bMode == bModeOld) && (crtflag)){
+       drawUpdateTexture(p, w, h, crtflag);
+       glEnable(GL_TEXTURE_2D);
+       glBindTexture(GL_TEXTURE_2D, uVramTextureID);
+       // NOTE(review): the colour is black here but white in the else branch --
+       // confirm which texture environment mode this relies on.
+       glColor4f(0.0f, 0.0f, 0.0f, 1.0f);
+     //} else {
+//     glDisable(GL_TEXTURE_2D);
+//     glColor4f(0.0f, 0.0f, 0.0f, 1.0f);
+//     }
+     } else {
+          glEnable(GL_TEXTURE_2D);
+         glBindTexture(GL_TEXTURE_2D, uNullTextureID);
+          glColor4f(1.0f, 1.0f, 1.0f, 1.0f);
+     }      
+       // Texture filtering: nearest unless smoothing is requested.
+       if(!bSmoosing) {
+        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+       } else {
+        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
+        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
+       }
+       // Draw the textured quad: vertex arrays when available, immediate mode otherwise.
+       if(bGL_EXT_VERTEX_ARRAY) {
+        glEnable(GL_TEXTURE_COORD_ARRAY_EXT);
+        glEnable(GL_VERTEX_ARRAY_EXT);
+             
+        glTexCoordPointerEXT(2, GL_FLOAT, 0, 4, TexCoords);
+        glVertexPointerEXT(3, GL_FLOAT, 0, 4, Vertexs);
+        glDrawArraysEXT(GL_POLYGON, 0, 4);
+        
+        glDisable(GL_VERTEX_ARRAY_EXT);
+        glDisable(GL_TEXTURE_COORD_ARRAY_EXT);
+       } else {
+        glBegin(GL_POLYGON);
+        glTexCoord2f(TexCoords[0][0], TexCoords[0][1]);
+        glVertex3f(Vertexs[0][0], Vertexs[0][1], Vertexs[0][2]);
+        
+        glTexCoord2f(TexCoords[1][0], TexCoords[1][1]);
+        glVertex3f(Vertexs[1][0], Vertexs[1][1], Vertexs[1][2]);
+        
+        glTexCoord2f(TexCoords[2][0], TexCoords[2][1]);
+        glVertex3f(Vertexs[2][0], Vertexs[2][1], Vertexs[2][2]);
+             
+        glTexCoord2f(TexCoords[3][0], TexCoords[3][1]);
+        glVertex3f(Vertexs[3][0], Vertexs[3][1], Vertexs[3][2]);
+        glEnd();
+       }
+    // }
+   
+     // 20120502 brightness adjustment
+    glBindTexture(GL_TEXTURE_2D, 0); // 20111023
+    glDisable(GL_TEXTURE_2D);
+    glDisable(GL_DEPTH_TEST);
+
+    // Brightness pass, skipped when CL rendering is active: with
+    // glBlendFunc(GL_ZERO, GL_SRC_COLOR) the untextured quad multiplies what
+    // is already drawn by (fBrightR, fBrightG, fBrightB).
+    if(bCLEnabled == FALSE){
+       glEnable(GL_BLEND);
+   
+       glColor3f(fBrightR , fBrightG, fBrightB);
+       glBlendFunc(GL_ZERO, GL_SRC_COLOR);
+    
+       //    glBlendFunc(GL_ZERO, GL_SRC_ALPHA);
+       if(bGL_EXT_VERTEX_ARRAY) {
+         glEnable(GL_VERTEX_ARRAY_EXT);
+         glVertexPointerEXT(3, GL_FLOAT, 0, 4, Vertexs);
+         glDrawArraysEXT(GL_POLYGON, 0, 4);
+         glDisable(GL_VERTEX_ARRAY_EXT);
+       } else {
+         glBegin(GL_POLYGON);
+         glVertex3f(Vertexs[0][0], Vertexs[0][1], Vertexs[0][2]);
+         glVertex3f(Vertexs[1][0], Vertexs[1][1], Vertexs[1][2]);
+         glVertex3f(Vertexs[2][0], Vertexs[2][1], Vertexs[2][2]);
+         glVertex3f(Vertexs[3][0], Vertexs[3][1], Vertexs[3][2]);
+         glEnd();
+       }
+       
+       glBlendFunc(GL_ONE, GL_ZERO);
+   
+       glDisable(GL_BLEND);
+    }
+       glDisable(GL_TEXTURE_2D);
+       glDisable(GL_DEPTH_TEST);
+       // Scanline grid: black horizontal lines, drawn only when the view is
+       // at least twice as tall as the source and bFullScan is 0.
+       if((glv->wid.rView.h >= (h * 2)) && (bFullScan == 0)) {
+         glLineWidth((float)(glv->wid.rView.h) / (float)(h * 2));
+         glColor4f(0.0f, 0.0f, 0.0f, 1.0f);
+         if(bGL_EXT_VERTEX_ARRAY) {
+            glEnable(GL_VERTEX_ARRAY_EXT);
+            glVertexPointerEXT(3, GL_FLOAT, 0, h + 1, gridtid);
+            // NOTE(review): GL_LINE is a polygon-mode value, not a primitive
+            // type; GL_LINES looks intended. The immediate-mode branch below
+            // emits 2 * h vertices whereas the count here is h + 1. Confirm,
+            // and check how GridVertexs200l/400l are declared and filled
+            // before changing this call.
+            glDrawArraysEXT(GL_LINE, 0, h + 1);
+            glDisable(GL_VERTEX_ARRAY_EXT);
+         } else {
+            glBegin(GL_LINES);
+            for(y = 0; y < h; y++) {
+               yf = -1.0f + (float) (y + 1) * 2.0f / (float)h;
+               glVertex3f(-1.0f, yf, 0.96f);  
+               glVertex3f(+1.0f, yf, 0.96f);  
+            }
+            glEnd();
+         }
+       
+       }
+   //}
+   glDisable(GL_BLEND);
+   glDisable(GL_TEXTURE_2D);
+   glDisable(GL_DEPTH_TEST);
+#ifdef USE_OPENGL
+    DrawOSDGL(glv);
+#endif
+    // One pop for each of the three glPushAttrib() calls above.
+    glPopAttrib();
+    glPopAttrib();
+    glPopAttrib();
+    glFlush();
+}
+
+/*
+ * Key-release event handler: forwards event arguments 1..3 (key, modifier,
+ * character code) to OnKeyReleaseAG().
+ */
+void AGEventKeyUpGL(AG_Event *event)
+{
+    const int keysym = AG_INT(1);
+    const int modifier = AG_INT(2);
+    const Uint32 unicode = AG_ULONG(3);
+
+    OnKeyReleaseAG(keysym, modifier, unicode);
+}
+
+/*
+ * Key-press event handler: forwards event arguments 1..3 (key, modifier,
+ * character code) to OnKeyPressAG().
+ */
+void AGEventKeyDownGL(AG_Event *event)
+{
+    const int keysym = AG_INT(1);
+    const int modifier = AG_INT(2);
+    const Uint32 unicode = AG_ULONG(3);
+
+    OnKeyPressAG(keysym, modifier, unicode);
+}
diff --git a/source/src/agar/common/agar_glutil.cpp b/source/src/agar/common/agar_glutil.cpp
new file mode 100644 (file)
index 0000000..1581aa6
--- /dev/null
@@ -0,0 +1,397 @@
+/*
+ * Agar: OpenGLUtils
+ * (C) 2011 K.Ohta <whatisthis.sowhat@gmail.com>
+ */
+
+
+#include "agar_glutil.h"
+#ifdef _USE_OPENCL
+#include "agar_glcl.h"
+#endif
+#include "agar_logger.h"
+
+#ifdef USE_OPENMP
+#include <omp.h>
+#endif //_OPENMP
+
+// Global drawing/OpenCL state with C linkage.
+extern "C" {
+    AG_GLView *GLDrawArea; // InitContextCL() and DiscardTextures() do nothing while this is NULL
+    BOOL bInitCL = FALSE; // Set to TRUE at the end of InitContextCL(); stops it from running twice
+    BOOL bCLEnabled = FALSE; // Result of InitContextCL(): TRUE once the CL renderer was set up
+    BOOL bCLGLInterop = FALSE; // Result of InitContextCL(): TRUE when cldraw->GetGLEnabled() is non-zero
+    int nCLGlobalWorkThreads = 10;
+    BOOL bCLSparse = FALSE; // TRUE=Multi threaded CL,FALSE = Single Thread.
+    int nCLPlatformNum; // Passed to GLCLDraw::InitContext() as the platform number
+    int nCLDeviceNum; // Passed to GLCLDraw::InitContext() as the processor (device) number
+    BOOL bCLInteropGL; // Passed to GLCLDraw::InitContext() as the GL-interop request (not the same variable as bCLGLInterop)
+    extern BOOL bUseOpenCL; // Defined elsewhere; InitContextCL() only creates cldraw when this is set
+}
+
+GLfloat GridVertexs200l[202 * 6];
+GLfloat GridVertexs400l[402 * 6];
+
+// Brights
+extern float fBrightR;
+extern float fBrightG;
+extern float fBrightB;
+extern const char *cl_render;
+extern GLuint uVramTextureID;
+
+#ifdef _USE_OPENCL
+class GLCLDraw *cldraw = NULL;
+#endif
+
+
+
+GLuint CreateNullTexture(int w, int h)
+{
+    GLuint ttid;
+    Uint32 *p;
+
+    p =(Uint32 *)malloc((w + 2)*  (h  + 2) * sizeof(Uint32));
+    if(p == NULL) return 0;
+
+    //    memset(p, 0x00, (w + 2) * (h + 2) * sizeof(Uint32));
+    memset(p, 0x00, (w + 2) * (h + 2) * sizeof(Uint32));
+    glGenTextures(1, &ttid);
+    glBindTexture(GL_TEXTURE_2D, ttid);
+    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAX_LEVEL, 1); // Limit mipmap level , reduce resources.
+    glTexImage2D(GL_TEXTURE_2D,
+                 0,
+                 GL_RGBA,
+                 w, h + 2,
+                 0,
+                 GL_RGBA,
+                 GL_UNSIGNED_BYTE,
+                 p);
+    free(p);
+    return ttid;
+}
+
+GLuint CreateNullTextureCL(int w, int h)
+{
+    GLuint ttid;
+    Uint32 *p;
+
+    p =(Uint32 *)malloc((w + 2)*  (h  + 2) * sizeof(Uint32));
+    if(p == NULL) return 0;
+    memset(p, 0x00, (w + 2) * (h + 2) * sizeof(Uint32));
+    glGenTextures(1, &ttid);
+    glBindTexture(GL_TEXTURE_2D, ttid);
+    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAX_LEVEL, 1); // Limit mipmap level , reduce resources.
+    glTexImage2D(GL_TEXTURE_2D,
+                 0,
+                 GL_RGBA8UI,
+                 w, h + 2,
+                 0,
+                 GL_RGBA_INTEGER,
+                 GL_UNSIGNED_BYTE,
+                 p);
+    glBindTexture(GL_TEXTURE_2D, 0);
+    free(p);
+    return ttid;
+}
+
+
+/*
+ * Flip hook. Currently a stub: apart from the InitVideo check it does nothing.
+ */
+void Flip_AG_GL(void)
+{
+       if(!InitVideo) {
+           return;
+       }
+}
+
+/*
+ * Delete n textures whose names are stored at id. Nothing is deleted while
+ * the GL draw area or the Agar driver is not available.
+ */
+void DiscardTextures(int n, GLuint *id)
+{
+       if((GLDrawArea != NULL) && (agDriverOps != NULL)) {
+           glDeleteTextures(n, id);
+       }
+}
+
+/*
+ * Single-texture convenience wrapper around DiscardTextures().
+ */
+void DiscardTexture(GLuint tid)
+{
+       GLuint names[1] = { tid };
+       DiscardTextures(1, names);
+}
+
+
+void InitContextCL(void)
+{
+  if(GLDrawArea == NULL) return; // Context not created yet.
+  if(bInitCL == TRUE) return; // CL already initialized.
+
+#ifdef _USE_OPENCL
+     bCLEnabled = FALSE;
+     bCLGLInterop = FALSE;
+     if(bUseOpenCL && (cldraw == NULL) && 
+       bGL_PIXEL_UNPACK_BUFFER_BINDING) {
+           cl_int r;
+           cldraw = new GLCLDraw;
+           if(cldraw != NULL) {
+             r = cldraw->InitContext(nCLPlatformNum, nCLDeviceNum, bCLInteropGL);
+              if(r == CL_SUCCESS){
+                r = cldraw->BuildFromSource(cl_render);
+                XM7_DebugLog(XM7_LOG_DEBUG, "CL: Build KERNEL: STS = %d", r);
+                if(r == CL_SUCCESS) {
+                   r = cldraw->SetupBuffer(&uVramTextureID);
+                   r |= cldraw->SetupTable();
+                   if(r != CL_SUCCESS){
+                      delete cldraw;
+                      cldraw = NULL;
+                   } else if(cldraw->GetGLEnabled() != 0) {
+                     bCLGLInterop = TRUE;
+                     bCLEnabled = TRUE;
+                   } else {
+                     /*
+                      *
+                      */
+                     bCLGLInterop = FALSE;
+                     bCLEnabled = TRUE;
+                   }
+                } else {
+                   delete cldraw;
+                   cldraw = NULL;
+                }
+              } else {
+                 delete cldraw;
+                 cldraw = NULL;
+              }
+           }
+     }
+#else
+     bCLEnabled = FALSE;
+     bCLGLInterop = FALSE;
+#endif // _USE_OPENCL   
+     bInitCL = TRUE;
+}
+
+
+static void InitGridVertexsSub(GLfloat *p, int h)
+{
+   int y;
+   int yp;
+   float yf;
+   yp = 0;
+   for(y = 0; y < (h + 1); y++) {
+      yf = -1.0f + (float) (y + 1) * 2.0f / (float)h;
+      p[yp + 0] = -1.0f;
+      p[yp + 1] = yf;
+      p[yp + 2] = 0.96f;
+      p[yp + 3] = -1.0f;
+      p[yp + 4] = yf;
+      p[yp + 5] = 0.96f;
+      yp += 6;
+   }
+   return;
+}
+
+
+/*
+ * Build the grid vertex tables for the 200-line and the 400-line screen heights.
+ */
+void InitGridVertexs(void)
+{
+   const int lines200 = 200;
+   const int lines400 = 400;
+
+   InitGridVertexsSub(GridVertexs200l, lines200);
+   InitGridVertexsSub(GridVertexs400l, lines400);
+}
+
+
+void InitGL_AG2(int w, int h)
+{
+       Uint32 flags;
+       int bpp = 32;
+       int rgb_size[3];
+       char *ext;
+
+       if(InitVideo) return;
+    InitVideo = TRUE;
+
+    vram_pb = NULL;
+    vram_pg = NULL;
+    vram_pr = NULL;
+#ifdef _USE_OPENCL
+   cldraw = NULL;
+#endif
+       flags = SDL_OPENGL | SDL_RESIZABLE;
+    switch (bpp) {
+         case 8:
+             rgb_size[0] = 3;
+             rgb_size[1] = 3;
+             rgb_size[2] = 2;
+             break;
+         case 15:
+         case 16:
+             rgb_size[0] = 5;
+             rgb_size[1] = 5;
+             rgb_size[2] = 5;
+             break;
+         default:
+             rgb_size[0] = 8;
+             rgb_size[1] = 8;
+             rgb_size[2] = 8;
+             break;
+     }
+    /*
+     * GL 拡張の取得 20110907-
+     */
+       InitVramSemaphore();
+       uVramTextureID = 0;
+       uNullTextureID = 0;
+       pVram2 = NULL;
+#ifdef _USE_OPENCL
+        bInitCL = FALSE;
+        nCLGlobalWorkThreads = 10;
+        bCLSparse = FALSE; // TRUE=Multi threaded CL,FALSE = Single Thread.
+       nCLPlatformNum = 0;
+       nCLDeviceNum = 0;
+       bCLInteropGL = FALSE;
+        //bCLDirectMapping = FALSE;
+#endif
+       InitVirtualVram();
+        //if(AG_UsingSDL(NULL)) {
+          InitFBO(); // 拡張の有無を調べてからFBOを初期化する。
+          // FBOの有無を受けて、拡張の有無変数を変更する(念のために)
+          InitGLExtensionVars();
+          InitGridVertexs(); // Grid初期化
+       //}
+   
+    fBrightR = 1.0; // 輝度の初期化
+    fBrightG = 1.0;
+    fBrightB = 1.0;
+
+    return;
+}
+
+
+extern "C" {
+// OpenGL capability flags
+BOOL bGL_ARB_IMAGING; // Are imaging operations available?
+BOOL bGL_ARB_COPY_BUFFER;  // Buffer-to-buffer copy support (for speed)
+BOOL bGL_EXT_INDEX_TEXTURE; // Related to palette mode
+BOOL bGL_EXT_COPY_TEXTURE; // Copy between textures
+BOOL bGL_SGI_COLOR_TABLE; // Palette mode (SGI extension)
+BOOL bGL_SGIS_PIXEL_TEXTURE; // For texture updates
+BOOL bGL_EXT_PACKED_PIXEL; // Use packed pixels to speed up updates?
+BOOL bGL_EXT_VERTEX_ARRAY; // Draw from vertex arrays for speed
+BOOL bGL_EXT_PALETTED_TEXTURE; // Palette mode (yet another extension)
+BOOL bGL_PIXEL_UNPACK_BUFFER_BINDING; // Are pixel buffers available?
+
+   
+// FBO API
+// Entry points resolved at run time by InitFBO().
+PFNGLVERTEXPOINTEREXTPROC glVertexPointerEXT;
+PFNGLDRAWARRAYSEXTPROC glDrawArraysEXT;
+PFNGLTEXCOORDPOINTEREXTPROC glTexCoordPointerEXT;
+//#ifndef _WINDOWS
+PFNGLBINDBUFFERPROC glBindBuffer;
+PFNGLBUFFERDATAPROC glBufferData;
+PFNGLGENBUFFERSPROC glGenBuffers;
+PFNGLDELETEBUFFERSPROC glDeleteBuffers;
+//#endif
+
+BOOL QueryGLExtensions(const char *str)
+{
+    char *ext;
+    char *p;
+    int i;
+    int j;
+    int k;
+    int l;
+    int ll;
+//#ifndef _WINDOWS
+
+    if(str == NULL) return FALSE;
+    ll = strlen(str);
+    if(ll <= 0) return FALSE;
+
+    ext =(char *)glGetString(GL_EXTENSIONS);
+    if(ext == NULL) return FALSE;
+    l = strlen(ext);
+    if(l <= 0) return FALSE;
+    p = ext;
+    for(i = 0; i < l ; ){
+        int j = strcspn(p, " ");
+        if((ll == j) && (strncmp(str, p, j) == 0)) {
+            return TRUE;
+        }
+        p += (j + 1);
+        i += (j + 1);
+    }
+//#endif
+    return FALSE;
+}
+
+/*
+ * Set every capability flag from the GL extension list, force the
+ * pixel-unpack-buffer flag to TRUE and clear the OpenCL result flags.
+ */
+void InitGLExtensionVars(void)
+{
+    // Flag / extension-name pairs, probed in this order.
+    static const struct {
+        BOOL *flag;
+        const char *name;
+    } probes[] = {
+        { &bGL_ARB_IMAGING,          "GL_ARB_imaging" },
+        { &bGL_ARB_COPY_BUFFER,      "GL_ARB_copy_buffer" },
+        { &bGL_EXT_INDEX_TEXTURE,    "GL_EXT_index_texture" },
+        { &bGL_EXT_COPY_TEXTURE,     "GL_EXT_copy_texture" },
+        { &bGL_SGI_COLOR_TABLE,      "GL_SGI_color_table" },
+        { &bGL_SGIS_PIXEL_TEXTURE,   "GL_SGIS_pixel_texture" },
+        { &bGL_EXT_PACKED_PIXEL,     "GL_EXT_packed_pixel" },
+        { &bGL_EXT_PALETTED_TEXTURE, "GL_EXT_paletted_texture" },
+        { &bGL_EXT_VERTEX_ARRAY,     "GL_EXT_vertex_array" },
+    };
+    unsigned int n;
+
+    for(n = 0; n < sizeof(probes) / sizeof(probes[0]); n++) {
+        *(probes[n].flag) = QueryGLExtensions(probes[n].name);
+    }
+//    bGL_PIXEL_UNPACK_BUFFER_BINDING = QueryGLExtensions("GL_pixel_unpack_buffer_binding");
+    bGL_PIXEL_UNPACK_BUFFER_BINDING = TRUE;
+    bCLEnabled = FALSE;
+    bCLGLInterop = FALSE;
+}
+
+   
+#ifdef _WINDOWS
+#include <windef.h>
+extern PROC WINAPI wglGetProcAddress(LPCSTR lpszProc);
+//#else 
+//extern void *glXGetProcAddress(const GLubyte *);
+#endif
+   
+void InitFBO(void)
+{
+//#ifndef _WINDOWS // glx is for X11.
+// Use SDL for wrapper. 20130128
+    if(AG_UsingSDL(NULL)) {
+       glVertexPointerEXT = (PFNGLVERTEXPOINTEREXTPROC)SDL_GL_GetProcAddress("glVertexPointerEXT");
+       if(glVertexPointerEXT == NULL) bGL_EXT_VERTEX_ARRAY = FALSE;
+       glDrawArraysEXT = (PFNGLDRAWARRAYSEXTPROC)SDL_GL_GetProcAddress("glDrawArraysEXT");
+       if(glDrawArraysEXT == NULL) bGL_EXT_VERTEX_ARRAY = FALSE;
+       glTexCoordPointerEXT = (PFNGLTEXCOORDPOINTEREXTPROC)SDL_GL_GetProcAddress("glTexCoordPointerEXT");
+       if(glTexCoordPointerEXT == NULL) bGL_EXT_VERTEX_ARRAY = FALSE;
+       glBindBuffer = (PFNGLBINDBUFFERPROC)SDL_GL_GetProcAddress("glBindBuffer");
+       if(glBindBuffer == NULL) bGL_PIXEL_UNPACK_BUFFER_BINDING = FALSE;
+       glBufferData = (PFNGLBUFFERDATAPROC)SDL_GL_GetProcAddress("glBufferData");
+       if(glBufferData == NULL) bGL_PIXEL_UNPACK_BUFFER_BINDING = FALSE;
+       glGenBuffers = (PFNGLGENBUFFERSPROC)SDL_GL_GetProcAddress("glGenBuffers");
+       if(glGenBuffers == NULL) bGL_PIXEL_UNPACK_BUFFER_BINDING = FALSE;
+       glDeleteBuffers = (PFNGLDELETEBUFFERSPROC)SDL_GL_GetProcAddress("glDeleteBuffers");
+       if(glDeleteBuffers == NULL) bGL_PIXEL_UNPACK_BUFFER_BINDING = FALSE;
+    } else { // glx, wgl
+#ifndef _WINDOWS
+       glVertexPointerEXT = (PFNGLVERTEXPOINTEREXTPROC)glXGetProcAddress((const GLubyte *)"glVertexPointerEXT");
+       if(glVertexPointerEXT == NULL) bGL_EXT_VERTEX_ARRAY = FALSE;
+       glDrawArraysEXT = (PFNGLDRAWARRAYSEXTPROC)glXGetProcAddress((const GLubyte *)"glDrawArraysEXT");
+       if(glDrawArraysEXT == NULL) bGL_EXT_VERTEX_ARRAY = FALSE;
+       glTexCoordPointerEXT = (PFNGLTEXCOORDPOINTEREXTPROC)glXGetProcAddress((const GLubyte *)"glTexCoordPointerEXT");
+       if(glTexCoordPointerEXT == NULL) bGL_EXT_VERTEX_ARRAY = FALSE;
+       glBindBuffer = (PFNGLBINDBUFFERPROC)glXGetProcAddress((const GLubyte *)"glBindBuffer");
+       if(glBindBuffer == NULL) bGL_PIXEL_UNPACK_BUFFER_BINDING = FALSE;
+       glBufferData = (PFNGLBUFFERDATAPROC)glXGetProcAddress((const GLubyte *)"glBufferData");
+       if(glBufferData == NULL) bGL_PIXEL_UNPACK_BUFFER_BINDING = FALSE;
+       glGenBuffers = (PFNGLGENBUFFERSPROC)glXGetProcAddress((const GLubyte *)"glGenBuffers");
+       if(glGenBuffers == NULL) bGL_PIXEL_UNPACK_BUFFER_BINDING = FALSE;
+       glDeleteBuffers = (PFNGLDELETEBUFFERSPROC)glXGetProcAddress((const GLubyte *)"glDeleteBuffers");
+       if(glDeleteBuffers == NULL) bGL_PIXEL_UNPACK_BUFFER_BINDING = FALSE;
+#else
+       glVertexPointerEXT = (PFNGLVERTEXPOINTEREXTPROC)wglGetProcAddress("glVertexPointerEXT");
+       if(glVertexPointerEXT == NULL) bGL_EXT_VERTEX_ARRAY = FALSE;
+       glDrawArraysEXT = (PFNGLDRAWARRAYSEXTPROC)wglGetProcAddress("glDrawArraysEXT");
+       if(glDrawArraysEXT == NULL) bGL_EXT_VERTEX_ARRAY = FALSE;
+       glTexCoordPointerEXT = (PFNGLTEXCOORDPOINTEREXTPROC)wglGetProcAddress("glTexCoordPointerEXT");
+       if(glTexCoordPointerEXT == NULL) bGL_EXT_VERTEX_ARRAY = FALSE;
+       glBindBuffer = (PFNGLBINDBUFFERPROC)wglGetProcAddress("glBindBuffer");
+       if(glBindBuffer == NULL) bGL_PIXEL_UNPACK_BUFFER_BINDING = FALSE;
+       glBufferData = (PFNGLBUFFERDATAPROC)wglGetProcAddress("glBufferData");
+       if(glBufferData == NULL) bGL_PIXEL_UNPACK_BUFFER_BINDING = FALSE;
+       glGenBuffers = (PFNGLGENBUFFERSPROC)wglGetProcAddress("glGenBuffers");
+       if(glGenBuffers == NULL) bGL_PIXEL_UNPACK_BUFFER_BINDING = FALSE;
+       glDeleteBuffers = (PFNGLDELETEBUFFERSPROC)wglGetProcAddress("glDeleteBuffers");
+       if(glDeleteBuffers == NULL) bGL_PIXEL_UNPACK_BUFFER_BINDING = FALSE;
+#endif // _WINDOWS    
+    }
+   
+}
+
+}
diff --git a/source/src/agar/common/agar_glutil.h b/source/src/agar/common/agar_glutil.h
new file mode 100644 (file)
index 0000000..bb4ad51
--- /dev/null
@@ -0,0 +1,78 @@
+#ifndef AGAR_GLUTIL_H_INCLUDED\r
+#define AGAR_GLUTIL_H_INCLUDED\r
+\r
+\r
+#include <agar/core.h>\r
+#include <agar/core/types.h>\r
+#include <agar/gui.h>\r
+#include <agar/gui/glview.h>\r
+\r
+#include <SDL/SDL.h>\r
+#ifdef _WINDOWS\r
+#include <GL/gl.h>\r
+#include <GL/glext.h>\r
+#include <SDL/SDL_opengl.h>\r
+#else\r
+#include <GL/glx.h>\r
+#include <GL/glxext.h>\r
+#include <SDL/SDL_opengl.h>\r
+#endif\r
+\r
+#include "api_draw.h"\r
+#include "agar_xm7.h"\r
+#include "agar_vramutil.h"\r
+#include "agar_draw.h"\r
+#include "agar_gldraw.h"\r
+\r
+#ifdef __cplusplus\r
+extern "C" {\r
+#endif\r
+extern  AG_GLView *GLDrawArea;\r
+extern BOOL bGL_ARB_IMAGING; // イメージ操作可能か?\r
+extern BOOL bGL_ARB_COPY_BUFFER;  // バッファ内コピー(高速化!)サポート\r
+extern BOOL bGL_EXT_INDEX_TEXTURE; // パレットモードに係わる\r
+extern BOOL bGL_EXT_COPY_TEXTURE; // テクスチャ間のコピー\r
+extern BOOL bGL_SGI_COLOR_TABLE; // パレットモード(SGI拡張)\r
+extern BOOL bGL_SGIS_PIXEL_TEXTURE; // テクスチャアップデート用\r
+extern BOOL bGL_EXT_PACKED_PIXEL; // PackedPixelを使ってアップデートを高速化?\r
+extern BOOL bGL_EXT_VERTEX_ARRAY; // 頂点を配列化して描画を高速化\r
+extern BOOL bGL_EXT_PALETTED_TEXTURE; // パレットモード(更に別拡張)\r
+extern BOOL bGL_PIXEL_UNPACK_BUFFER_BINDING; // Pixel buffer\r
+extern BOOL bCLEnabled;\r
+extern BOOL bCLGLInterop;\r
+\r
+// FBO API\r
+extern PFNGLVERTEXPOINTEREXTPROC glVertexPointerEXT;\r
+extern PFNGLDRAWARRAYSEXTPROC glDrawArraysEXT;\r
+extern PFNGLTEXCOORDPOINTEREXTPROC glTexCoordPointerEXT;\r
+#ifndef _WINDOWS\r
+extern PFNGLBINDBUFFERPROC glBindBuffer;\r
+extern PFNGLBUFFERDATAPROC glBufferData;\r
+extern PFNGLGENBUFFERSPROC glGenBuffers;\r
+extern PFNGLDELETEBUFFERSPROC glDeleteBuffers;\r
+#endif\r
+   \r
+extern void InitFBO(void);\r
+   \r
+extern void InitGLExtensionVars(void);\r
+extern BOOL QueryGLExtensions(const char *str);\r
+\r
+#ifdef _USE_OPENCL\r
+extern int nCLGlobalWorkThreads;\r
+extern BOOL bCLSparse; // TRUE=Multi threaded CL,FALSE = Single Thread.\r
+extern BOOL bInitCL;\r
+   //extern BOOL bCLDirectMapping;\r
+#endif // _USE_OPENCL\r
+#ifdef __cplusplus\r
+}\r
+#endif\r
+#ifdef __cplusplus\r
+extern GLuint CreateNullTexture(int w, int h);\r
+extern GLuint CreateNullTextureCL(int w, int h);\r
+\r
+extern void Flip_AG_GL(void);\r
+extern void DiscardTextures(int n, GLuint *id);\r
+extern void DiscardTexture(GLuint id);\r
+#endif\r
+\r
+#endif // AGAR_GLUTIL_H_INCLUDED\r
diff --git a/source/src/agar/common/agar_logger.cpp b/source/src/agar/common/agar_logger.cpp
new file mode 100644 (file)
index 0000000..b3bf980
--- /dev/null
@@ -0,0 +1,123 @@
+/*
+ * Log functions
+ * (C) 2014-06-30 K.Ohta
+ * History:
+ *  Dec 30, 2014 Move from XM7/SDL, this was Ohta's original code.
+ * Licence : GPLv2
+ */
+
+#include "agar_logger.h"
+
+static int syslog_flag = 0; /* nonzero: AGAR_DebugLog() also sends each message to syslog() */
+static int log_cons = 0; /* nonzero: AGAR_DebugLog() also prints each message to stdout */
+static int log_onoff = 0; /* master switch; while 0, AGAR_DebugLog() returns immediately */
+static int log_opened = FALSE; /* set by AGAR_OpenLog(), cleared by AGAR_CloseLog() */
+extern "C" 
+{
+   
+void AGAR_OpenLog(int syslog, int cons)
+     {
+       int flags = 0;
+       
+       log_onoff = 1;
+       if(syslog != 0) {
+          syslog_flag = -1;
+#if defined(_SYS_SYSLOG_H) || defined(_SYSLOG_H)
+          if(cons != 0) { 
+             flags = LOG_CONS;
+          }
+          openlog("XM7", flags | LOG_PID | LOG_NOWAIT, LOG_USER);
+#endif
+       } else {
+          syslog_flag = 0;
+       }
+       log_cons = cons;
+       log_opened = TRUE;
+     }
+   
+   
+void AGAR_DebugLog(int level, const char *fmt, ...)
+     {
+       va_list ap;
+       struct tm *timedat;
+       time_t nowtime;
+       char strbuf[4096];
+       char strbuf2[256];
+       char strbuf3[24];
+       struct timeval tv;
+       int level_flag = LOG_USER;
+       
+       if(log_onoff == 0) return;
+       
+       if(level == AGAR_LOG_DEBUG) {
+          level_flag |= LOG_DEBUG;
+       } else if(level == AGAR_LOG_INFO) { 
+          level_flag |= LOG_INFO;
+       } else if(level == AGAR_LOG_WARN) {
+          level_flag |= LOG_WARNING;
+       } else {
+          level_flag |= LOG_DEBUG;
+       }
+       
+       
+       va_start(ap, fmt);      
+       vsnprintf(strbuf, 4095, fmt, ap);
+       nowtime = time(NULL);
+       gettimeofday(&tv, NULL);
+       if(log_cons != 0) { // Print only
+          timedat = localtime(&nowtime);
+          strftime(strbuf2, 255, "XM7: %Y-%m-%d %H:%M:%S", timedat);
+          snprintf(strbuf3, 23, ".%06d", tv.tv_usec);
+          fprintf(stdout, "%s%s %s\n", strbuf2, strbuf3, strbuf);
+       } 
+       if(syslog_flag != 0) { // SYSLOG
+          syslog(level_flag, "uS=%06d %s", tv.tv_usec, strbuf);
+       }
+       va_end(ap);
+     }
+
+/* Master switch: 0 mutes AGAR_DebugLog(), any other value unmutes it. */
+void AGAR_SetLogStatus(int sw)
+{
+   log_onoff = (sw != 0) ? 1 : 0;
+}
+   
+/* Enable (nonzero) or disable (0) the stdout sink. */
+void AGAR_SetLogStdOut(int sw)
+{
+   log_cons = (sw != 0) ? 1 : 0;
+}
+
+/* Enable (nonzero) or disable (0) the syslog sink. */
+void AGAR_SetLogSysLog(int sw)
+{
+   syslog_flag = (sw != 0) ? 1 : 0;
+}
+
+/* Report whether the log is open: set by AGAR_OpenLog(), cleared by AGAR_CloseLog(). */
+BOOL AGAR_LogGetStatus(void)
+{
+   const BOOL opened = (BOOL) log_opened;
+
+   return opened;
+}
+   
+       
+/* Close syslog if that sink was active, then switch every logger flag off. */
+void AGAR_CloseLog(void)
+{
+   const int syslog_was_active = (syslog_flag != 0);
+
+   log_opened = 0;
+   log_onoff = 0;
+   log_cons = 0;
+   syslog_flag = 0;
+   if(syslog_was_active) {
+      closelog();
+   }
+}
+}
+
diff --git a/source/src/agar/common/agar_logger.h b/source/src/agar/common/agar_logger.h
new file mode 100644 (file)
index 0000000..86e2282
--- /dev/null
@@ -0,0 +1,48 @@
+/*
+ * Log functions
+ * (C) 2014-06-30 K.Ohta
+ * 
+ * History:
+ *  Dec 30, 2014 Move from XM7/SDL, this was Ohta's original code.
+ * Licence : GPLv2
+ */
+
+#include <stdarg.h>
+#include <stdio.h>
+
+#include <syslog.h>
+
+#include <time.h>
+#include <sys/time.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+   extern void AGAR_OpenLog(int syslog, int cons); /* Switch logging on; nonzero syslog selects the syslog sink, cons is stored as the stdout-sink flag */
+   extern void AGAR_DebugLog(int level, const char *fmt, ...); /* printf-style message; level is AGAR_LOG_DEBUG, AGAR_LOG_INFO or AGAR_LOG_WARN */
+   extern void AGAR_CloseLog(void); /* Close syslog if it was in use and switch every sink off */
+   extern void AGAR_SetLogStatus(int sw); /* 0 mutes AGAR_DebugLog(), nonzero unmutes it */
+   extern void AGAR_SetLogSysLog(int sw); /* 0 disables, nonzero enables the syslog sink */
+   extern void AGAR_SetLogStdOut(int sw); /* 0 disables, nonzero enables the stdout sink */
+   extern BOOL AGAR_LogGetStatus(void); /* Value of the "opened" flag set by AGAR_OpenLog() and cleared by AGAR_CloseLog() */
+
+#define AGAR_LOG_ON 1
+#define AGAR_LOG_OFF 0
+   
+#define AGAR_LOG_DEBUG 0
+#define AGAR_LOG_INFO 1
+#define AGAR_LOG_WARN 2
+
+   
+#ifndef FALSE
+#define FALSE                   0
+#endif
+#ifndef TRUE
+#define TRUE                    (!FALSE)
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+   
\ No newline at end of file
diff --git a/source/src/agar/common/agar_sdlscaler.cpp b/source/src/agar/common/agar_sdlscaler.cpp
new file mode 100644 (file)
index 0000000..2ea311b
--- /dev/null
@@ -0,0 +1,475 @@
+/*
+* FM-7 Emulator "XM7" -> CommonSourceProject
+* Virtual Vram Display(Agar widget version)
+* (C) 2012 K.Ohta <whatisthis.sowhat@gmail.com>
+* History:
+* Jan 18,2012 From demos/customwidget/mywidget.[c|h]
+* Jan 20,2012 Separate subroutines.
+* Dec 30,2014 Move from XM7/SDL, 100% my original file.
+*/
+
+#include "agar_sdlview.h"
+#include "agar_cfg.h"
+#include "api_vram.h"
+#include "api_draw.h"
+//#include "api_scaler.h"
+#include "api_kbd.h"
+#include "sdl_cpuid.h"
+#include "cache_wrapper.h"
+
+extern "C" {
+extern struct AGAR_CPUID *pCpuID;
+extern BOOL bUseSIMD;
+}
+
+extern "C" { // Define Headers
+   // scaler/generic
+   extern void pVram2RGB_x05_Line(Uint32 *src, Uint8 *dst, int x, int xend, int y, int yrep); // scaler_x05.c , raster render
+   extern void pVram2RGB_x1_Line(Uint32 *src,  Uint8 *dst, int x, int xend, int y, int yrep); // scaler_x1.c , raster render
+   extern void pVram2RGB_x125_Line(Uint32 *src,  Uint8 *dst, int x, int xend, int y, int yrep); // scaler_x125.c , raster render
+   extern void pVram2RGB_x15_Line(Uint32 *src,  Uint8 *dst, int xbegin, int xend, int y, int yrep); // scaler_x15.c , raster render.
+   extern void pVram2RGB_x2_Line(Uint32 *src,  Uint8 *dst, int xbegin, int xend, int y, int yrep); // scaler_x2.c , raster render.
+   extern void pVram2RGB_x225_Line(Uint32 *src,  Uint8 *dst, int xbegin, int xend, int y, int yrep); // scaler_x225.c , raster render.
+   extern void pVram2RGB_x25_Line(Uint32 *src,  Uint8 *dst, int xbegin, int xend, int y, int yrep); // scaler_x25.c , raster render.
+   extern void pVram2RGB_x3_Line(Uint32 *src,  Uint8 *dst, int xbegin, int xend, int y, int yrep); // scaler_x3.c , raster render.
+   extern void pVram2RGB_x4_Line(Uint32 *src,  Uint8 *dst, int xbegin, int xend, int y, int yrep); // scaler_x4.c , raster render.
+   extern void pVram2RGB_x45_Line(Uint32 *src, Uint8 *dst, int xbegin, int xend, int y, int yrep); // scaler_x45.c , raster render.
+   extern void pVram2RGB_x5_Line(Uint32 *src, Uint8 *dst, int xbegin, int xend, int y, int yrep); // scaler_x5.c , raster render.
+   extern void pVram2RGB_x6_Line(Uint32 *src, Uint8 *dst, int xbegin, int xend, int y, int yrep); // scaler_x6.c , raster render.
+#if defined(USE_SSE2) // scaler/sse2/
+   extern void pVram2RGB_x1_Line_SSE2(Uint32 *src, Uint8 *dst, int x, int xend, int y, int yrep); // scaler_x1_sse2.c , raster render
+   extern void pVram2RGB_x125_Line_SSE2(Uint32 *src, Uint8 *dst, int x, int xend, int y, int yrep); // scaler_x125_sse2.c , raster render
+   extern void pVram2RGB_x15_Line_SSE2(Uint32 *src, Uint8 *dst, int x, int xend, int y, int yrep); // scaler_x15_sse2.c , raster render
+   extern void pVram2RGB_x2_Line_SSE2(Uint32 *src, Uint8 *dst, int xbegin, int xend, int y, int yrep); // scaler_x2_sse2.c , raster render.
+   extern void pVram2RGB_x225_Line_SSE2(Uint32 *src, Uint8 *dst, int xbegin, int xend, int y, int yrep); // scaler_x225_sse2.c , raster render.
+   extern void pVram2RGB_x25_Line_SSE2(Uint32 *src, Uint8 *dst, int xbegin, int xend, int y, int yrep); // scaler_x25_sse2.c , raster render.
+   extern void pVram2RGB_x3_Line_SSE2(Uint32 *src, Uint8 *dst, int xbegin, int xend, int y, int yrep); // scaler_x3_sse2.c , raster render.
+   extern void pVram2RGB_x4_Line_SSE2(Uint32 *src, Uint8 *dst, int xbegin, int xend, int y, int yrep); // scaler_x4_sse2.c , raster render.
+   extern void pVram2RGB_x45_Line_SSE2(Uint32 *src, Uint8 *dst, int xbegin, int xend, int y, int yrep); // scaler_x45_sse2.c , raster render.
+   extern void pVram2RGB_x5_Line_SSE2(Uint32 *src, Uint8 *dst, int xbegin, int xend, int y, int yrep); // scaler_x5_sse2.c , raster render.
+   extern void pVram2RGB_x6_Line_SSE2(Uint32 *src, Uint8 *dst, int xbegin, int xend, int y, int yrep); // scaler_x6_sse2.c , raster render.
+#endif
+}
+
+static int iScaleFactor = 1; /* w1 / w0 from the last scaler selection (part of the cache key) */
+static void *pDrawFn = NULL; /* not referenced in the visible part of this file */
+static void *pDrawFn2 = NULL; /* scaler returned by the last selection; reused while the cache key matches */
+static int iOldW = 0; /* w1 passed to the last scaler selection */
+static int iOldH = 0; /* h1 passed to the last scaler selection */
+
+
+static inline Uint32 pVram_XtoHalf(Uint32 d1, Uint32 d2)
+{
+   Uint32 d0;
+   Uint16 r,g,b,a;
+#if AG_BIG_ENDIAN
+   r = (d1 & 0x000000ff) + (d2 & 0x000000ff);
+   g = ((d1 & 0x0000ff00) >> 8) + ((d2 & 0x0000ff00) >> 8);
+   b = ((d1 & 0x00ff0000) >> 16) + ((d2 & 0x00ff0000) >> 16);
+   d0 = 0xff000000 | (r >> 1) | ((b << 15) & 0x00ff0000) | ((g << 7) & 0x0000ff00);
+#else
+   r = ((d1 & 0xff000000) >> 24) + ((d2 & 0xff000000) >> 24);
+   g = ((d1 & 0x00ff0000) >> 16) + ((d2 & 0x00ff0000) >> 16);
+   b = ((d1 & 0x0000ff00) >> 8) + ((d2 & 0x0000ff00) >> 8);
+   d0 = 0x000000ff | ((r << 23) & 0xff000000) | ((g << 15) & 0x00ff0000) | ((b << 7) & 0x0000ff00);
+#endif
+   return d0;
+}
+
+
+#if defined(USE_SSE2)
+// w0, h0 = Console
+// w1, h1 = DrawMode
+static void *AGAR_SDLViewSelectScaler_Line_SSE2(int w0 ,int h0, int w1, int h1)
+{
+    int wx0 = w0 >> 1; // w1/4
+    int hy0 = h0 >> 1;
+    int xfactor;
+    int yfactor;
+    int xth;
+    void (*DrawFn)(Uint32 *src, Uint8 *dst, int xbegin, int xend, int y, int yrep);
+    DrawFn = NULL;
+   
+    xfactor = w1 % wx0;
+    yfactor = h1 % hy0;
+    xth = wx0 >> 1;
+    if(__builtin_expect((iScaleFactor == (w1 / w0) && (pDrawFn2 != NULL)
+      && (w1 == iOldW) && (h1 == iOldH)), 1))  return (void *)pDrawFn2;
+    iScaleFactor = w1 / w0;
+    iOldW = w1;
+    iOldH = h1;
+    switch(iScaleFactor){
+     case 0:
+            if(w0 > 480){
+               if((w1 < 480) || (h1 < 150)){
+                  DrawFn = pVram2RGB_x05_Line;
+               } else {
+                  DrawFn = pVram2RGB_x1_Line_SSE2;
+               }
+            } else {
+                DrawFn = pVram2RGB_x1_Line_SSE2;
+            }
+            break;
+
+     case 1:
+              if(w1 > 900) {
+                 DrawFn = pVram2RGB_x15_Line_SSE2; // 1.5?
+              } else if(w1 > 720) {
+                 DrawFn = pVram2RGB_x125_Line_SSE2; // 1.25
+              } else {
+                 DrawFn = pVram2RGB_x1_Line_SSE2; // 1.0
+              }
+            break;
+     case 2:
+//            if(xfactor < xth){
+             if((w1 > 720) && (w0 <= 480)) {
+                DrawFn = pVram2RGB_x25_Line_SSE2;  // x2.5
+             } else if((w1 > 1360) && (w1 <= 1520)){
+                DrawFn = pVram2RGB_x225_Line_SSE2; // x2.25
+             } else if(w1 > 1700){
+                DrawFn = pVram2RGB_x3_Line_SSE2; // x3
+             } else if(w1 > 1520){
+                DrawFn = pVram2RGB_x25_Line_SSE2; // x2.5@1600
+             } else {
+                DrawFn = pVram2RGB_x2_Line_SSE2; // x2
+             }
+            break;
+     case 3:
+            DrawFn = pVram2RGB_x3_Line_SSE2; // x3
+            break;
+     case 4:
+       if((w1 > 1360) && (w1 < 1760) && (w0 <= 480)) { // 4.5
+            DrawFn = pVram2RGB_x45_Line_SSE2; // 4.5
+       } else {
+            DrawFn = pVram2RGB_x4_Line_SSE2; // 4.0
+       }
+       break;
+     case 5:
+            DrawFn = pVram2RGB_x5_Line_SSE2;
+            break;
+     case 6:
+     case 7:
+     case 8:
+            DrawFn = pVram2RGB_x6_Line_SSE2;
+            break;
+     default:
+             DrawFn = pVram2RGB_x1_Line_SSE2;
+            break;
+        }
+        pDrawFn2 = (void *)DrawFn;
+        return (void *)DrawFn;
+}
+#endif // USE_SSE2
+
+
+// w0, h0 = Console
+// w1, h1 = DrawMode
+static void *AGAR_SDLViewSelectScaler_Line(int w0 ,int h0, int w1, int h1)
+{
+    int wx0 = w0 >> 1; // w1/4
+    int hy0 = h0 >> 1;
+    int xfactor;
+    int yfactor;
+    int xth;
+    void (*DrawFn)(Uint32 *src, Uint8 *dst, int xbegin, int xend, int y, int yrep);
+    DrawFn = NULL;
+
+#if defined(USE_SSE2)
+   if(pCpuID != NULL){
+      if(pCpuID->use_sse2) {
+        return AGAR_SDLViewSelectScaler_Line_SSE2(w0, h0, w1, h1);
+      }
+   }
+#endif
+   
+    xfactor = w1 % wx0;
+    yfactor = h1 % hy0;
+    xth = wx0 >> 1;
+    if(__builtin_expect((iScaleFactor == (w1 / w0) && (pDrawFn2 != NULL)
+      && (w1 == iOldW) && (h1 == iOldH)), 1))  return (void *)pDrawFn2;
+    iScaleFactor = w1 / w0;
+    iOldW = w1;
+    iOldH = h1;
+    switch(iScaleFactor){
+     case 0:
+            if(w0 > 480){
+               if((w1 < 480) || (h1 < 150)){
+                  DrawFn = pVram2RGB_x05_Line;
+               } else {
+                  DrawFn = pVram2RGB_x1_Line;
+               }
+            } else {
+                DrawFn = pVram2RGB_x1_Line;
+            }
+            break;
+
+     case 1:
+       if(w1 > 900) {
+         DrawFn = pVram2RGB_x15_Line; // 1.5?
+       } else if(w1 > 720) {
+         DrawFn = pVram2RGB_x125_Line; // 1.25
+       } else {
+         DrawFn = pVram2RGB_x1_Line; // 1.0
+       }
+       break;
+     case 2:
+//            if(xfactor < xth){
+             if((w1 > 720) && (w0 <= 480)) {
+                DrawFn = pVram2RGB_x25_Line;  // x2.5
+             } else if((w1 > 1360) && (w1 <= 1520)){
+                DrawFn = pVram2RGB_x225_Line; // x2.25
+             }else if(w1 > 1700){
+                DrawFn = pVram2RGB_x3_Line; // x3
+             }else if(w1 > 1520){
+                DrawFn = pVram2RGB_x25_Line; // x2.5
+             } else {
+                DrawFn = pVram2RGB_x2_Line;
+             }
+            break;
+     case 3:
+           DrawFn = pVram2RGB_x3_Line; // x3
+           break;
+     case 4:
+       if((w1 > 1360) && (w1 < 1760) && (w0 <= 480)) { // 4.5
+            DrawFn = pVram2RGB_x45_Line; // 4.5
+       } else {
+            DrawFn = pVram2RGB_x4_Line; // 4.0
+       }
+       break;
+     case 5:
+            DrawFn = pVram2RGB_x5_Line;
+            break;
+     case 6:
+     case 7:
+     case 8:
+            DrawFn = pVram2RGB_x6_Line;
+            break;
+     default:
+             DrawFn = pVram2RGB_x1_Line;
+            break;
+        }
+        pDrawFn2 = (void *)DrawFn;
+        return (void *)DrawFn;
+}
+
+
+
+void AGAR_SDLViewUpdateSrc(AG_Event *event)
+{
+   AGAR_SDLView *my = (AGAR_SDLView *)AG_SELF();
+   void *Fn = NULL;
+   void (*DrawFn2)(Uint32 *, Uint8 *, int , int , int, int);
+   AG_Surface *Surface;
+   
+   Uint8 *pb;
+   Uint32 *disp;
+   Uint32 *src;
+   Uint8 *dst;
+   int yrep2;
+   int y2, y3;
+   int w;
+   int h;
+   int ww;
+   int hh;
+   int xx;
+   int yy;
+   int pitch;
+   int bpp;
+   int of;
+   int yrep;
+   int ymod;
+   int yfact;
+   int lcount;
+   int xcache;
+   BOOL flag = FALSE;
+
+   Fn = AG_PTR(1);
+   if(my == NULL) return;
+   Surface = AGAR_SDLViewGetSrcSurface(my);
+   
+   if(Surface == NULL) return;
+   DrawSurface = Surface;
+   w = Surface->w;
+   h = Surface->h;
+   pb = (Uint8 *)(Surface->pixels);
+   pitch = Surface->pitch;
+   bpp = Surface->format->BytesPerPixel;
+   
+
+   if(pVram2 == NULL) return;
+   if(__builtin_expect((crt_flag == FALSE), 0)) {
+      AG_Rect rr;
+      AG_Color cc;
+      
+      cc.r = 0x00;
+      cc.g = 0x00;
+      cc.b = 0x00;
+      cc.a = 0xff;
+      
+      LockVram();
+      //AG_ObjectLock(AGOBJECT(my));
+      AG_SurfaceLock(Surface);
+      AG_FillRect(Surface, NULL, cc);
+      //AG_ObjectUnlock(AGOBJECT(my));
+      AGAR_SDLViewSetDirty(my);
+      UnlockVram();
+      return;
+   }
+   
+   switch(bMode){
+    case SCR_200LINE:
+        ww = 640;
+        hh = 200;
+        break;
+    case SCR_400LINE:
+        ww = 640;
+        hh = 400;
+        break;
+    default:
+        ww = 320;
+        hh = 200;
+        break;
+   }
+   Fn = XM7_SDLViewSelectScaler_Line(ww , hh, w, h);
+   if(__builtin_expect((Fn != NULL), 1)) {
+      DrawFn2 = (void (*)(Uint32 *, Uint8 *, int , int , int, int))Fn;
+   } else {
+     return;
+   }
+   
+
+   
+   if(h > hh) {
+      ymod = h % hh;
+      yrep = h / hh;
+   } else {
+      ymod = h % hh;
+      yrep = 1;
+   }
+   
+   if(Fn == NULL) return; 
+    src = pVram2;
+    LockVram();
+    AG_ObjectLock(AGOBJECT(my));
+
+   if(nRenderMethod == RENDERING_RASTER) {
+      if(my->forceredraw != 0){
+         for(yy = 0; yy < hh; yy++) {
+            bDrawLine[yy] = TRUE;
+         }
+         my->forceredraw = 0;
+       }
+       Surface = GetDrawSurface();
+       if(Surface == NULL)       goto _end1;
+       AG_SurfaceLock(Surface);
+       dst = (Uint8 *)(Surface->pixels);
+#ifdef _OPENMP
+#pragma omp parallel for shared(hh, bDrawLine, yrep, ww, src, Surface, flag) private(dst, y2, y3)
+#endif
+      for(yy = 0 ; yy < hh; yy++) {
+/*
+*  Virtual VRAM -> Real Surface:
+*/
+        if(__builtin_expect((bDrawLine[yy] == TRUE), 0)) {
+//         _prefetch_data_read_l2(&src[yy * 80], ww * sizeof(Uint32));
+           y2 = (h * yy ) / hh;
+           y3 = (h * (yy + 1)) / hh;
+           dst = (Uint8 *)(Surface->pixels + Surface->pitch * y2);
+           yrep2 = y3 - y2;
+           if(__builtin_expect((yrep2 < 1), 0)) yrep2 = 1;
+           DrawFn2(src, dst, 0, ww, yy, yrep2);
+           bDrawLine[yy] = FALSE;
+           flag = TRUE;
+        }
+        dst = dst + (yrep2 * Surface->pitch);
+      }
+      AG_SurfaceUnlock(Surface);
+      // BREAK.
+      goto _end1;
+   } else { // Block
+      if(my->forceredraw != 0){
+        for(yy = 0; yy < (hh >> 3); yy++) {
+            for(xx = 0; xx < (ww >> 3); xx++ ){
+              SDLDrawFlag.write[xx][yy] = TRUE;
+            }
+        }
+      }
+   }
+   
+/*
+ * Below is BLOCK or FULL.
+ * Not use from line-rendering.
+ */
+
+   Surface = GetDrawSurface();
+   if(Surface == NULL) goto _end1;
+   AG_SurfaceLock(Surface);
+
+#ifdef _OPENMP
+# pragma omp parallel for shared(pb, SDLDrawFlag, ww, hh, src, flag) private(disp, of, xx, lcount, xcache, y2, y3, dst)
+#endif
+    for(yy = 0 ; yy < hh; yy += 8) {
+       lcount = 0;
+       xcache = 0;
+//       dst = (Uint8 *)(Surface->pixels + Surface->pitch * y2);
+       for(xx = 0; xx < ww; xx += 8) {
+/*
+*  Virtual VRAM -> Real Surface:
+*                disp = (Uint32 *)(pb + xx  * bpp + yy * pitch);
+*                of = (xx % 8) + (xx / 8) * (8 * 8)
+*                    + (yy % 8) * 8 + (yy / 8) * 640 * 8;
+*                *disp = src[of];
+** // xx,yy = 1scale(not 8)
+*/
+//            if(xx >= w) continue;
+          if(__builtin_expect((SDLDrawFlag.write[xx >> 3][yy >> 3] != FALSE), 1)) {
+             lcount += 8;
+             SDLDrawFlag.write[xx >> 3][yy >> 3] = FALSE;
+          } else {
+             if(__builtin_expect((lcount != 0), 1)) {
+                int yy2;
+                //           disp = (Uint32 *)pb;
+                //           of = (xx *8) + yy * ww;
+                //           DrawFn(&src[of], disp, xx, yy, yrep);
+                for(yy2 = 0; yy2 < 8; yy2++) {
+                   y2 = (h * (yy + yy2)) / hh;
+                   y3 = (h * (yy + yy2 + 1)) / hh;
+                   dst = (Uint8 *)(Surface->pixels + Surface->pitch * y2);
+                   yrep2 = y3 - y2;
+                   if(__builtin_expect((yrep2 < 1), 0)) yrep2 = 1;
+                   DrawFn2(src, dst, xcache, xcache + lcount, yy + yy2 , yrep2);
+                   flag = TRUE;
+                }
+             }
+             
+             xcache = xx + 8;
+             lcount = 0;
+          }
+       }
+       
+       
+       if(__builtin_expect((lcount != 0), 1)) {
+         int yy2;
+         //          disp = (Uint32 *)pb;
+         //          of = (xx *8) + yy * ww;
+         //          DrawFn(&src[of], disp, xx, yy, yrep);
+         for(yy2 = 0; yy2 < 8; yy2++) {
+            y2 = (h * (yy + yy2)) / hh;
+            y3 = (h * (yy + yy2 + 1)) / hh;
+            dst = (Uint8 *)(Surface->pixels + Surface->pitch * y2);
+            yrep2 = y3 - y2;
+            if(__builtin_expect((yrep2 < 1), 0)) yrep2 = 1;
+            DrawFn2(src, dst, xcache, xcache + lcount, yy + yy2 , yrep2);
+            flag = TRUE;
+         }
+       }
+//                     if(yy >= h) continue;
+    }
+   AG_SurfaceUnlock(Surface);
+      
+_end1:   
+   AG_ObjectUnlock(AGOBJECT(my));
+   if(flag != FALSE) XM7_SDLViewSetDirty(my);
+   UnlockVram();
+   return;
+}
diff --git a/source/src/agar/common/agar_sdlview.c b/source/src/agar/common/agar_sdlview.c
new file mode 100644 (file)
index 0000000..04bb370
--- /dev/null
@@ -0,0 +1,440 @@
+/*
+* FM-7 Emulator "XM7"
+* Virtual Vram Display(Agar widget version)
+* (C) 2012 K.Ohta <whatisthis.sowhat@gmail.com>
+* License: GPL2
+* History:
+* Jan 18,2012 From demos/customwidget/mywidget.[c|h]
+* Dec 30,2014 Moved from XM7/SDL; this code is completely (?) original.
+*             Created from libAgar's template.
+* 
+*/
+/*
+ * Implementation of a typical Agar widget which uses surface mappings to
+ * efficiently draw surfaces, regardless of the underlying graphics system.
+ *
+ * If you are not familiar with the way the Agar object system handles
+ * inheritance, see demos/objsystem.
+ */
+
+#ifdef _WINDOWS
+//#define _OFF_T_
+#endif
+
+#include "agar_sdlview.h"
+#include "cache_wrapper.h"
+#include <SDL/SDL.h>
+#ifdef _OPENMP
+#include <omp.h>
+#endif // _OPENMP
+
+/*
+ * This is a generic constructor function. It is completely optional, but
+ * customary of FooNew() functions to allocate, initialize and attach an
+ * instance of the class.
+ */
+static void ForceRedrawFn(AG_Event *event) /* Event handler: maps the current surface again and marks the view dirty. */
+{
+   AGAR_SDLView *my = (AGAR_SDLView *)AG_SELF();
+   AG_WidgetMapSurfaceNODUP(my, AGWIDGET_SURFACE(my, my->mySurface)); /* NOTE(review): the returned handle is discarded and mySurface == -1 is not checked - confirm intent */
+   /* Request a re-blit on the next Draw(). */
+   AGAR_SDLViewSetDirty(my);
+}
+   
+
+/*
+ * Allocate, initialize and attach a new AGAR_SDLView.
+ *   parent : object to attach the widget to.
+ *   src    : surface to display, or NULL to start without one.
+ *   param  : stored as-is in the widget (the pointer is not copied).
+ * Returns the new widget, or NULL when the allocation fails.
+ */
+AGAR_SDLView *AGAR_SDLViewNew(void *parent, AG_Surface *src, const char *param)
+{
+   AGAR_SDLView *my;
+
+   /* Create a new instance of the AGAR_SDLView class */
+   my = malloc(sizeof(AGAR_SDLView));
+   if(my == NULL) return NULL;
+   AG_ObjectInit(my, &AGAR_SDLViewClass);
+
+   /* Set some constructor arguments */
+   my->param = param;
+   my->draw_ev = NULL;
+   my->forceredraw = 1;
+   my->dirty = 1; /* Draw() reads this flag; it was left uninitialized before. */
+
+   /* Attach the object to the parent (no-op if parent is NULL) */
+   AG_ObjectAttach(parent, my);
+   AG_ObjectLock(my);
+   if(__builtin_expect((src != NULL), 1)) {
+      my->mySurface = AGAR_SDLViewLinkSurface(my, src);
+   } else {
+      my->mySurface = -1;
+   }
+   AG_ObjectUnlock(my);
+   return (my);
+}
+
+
+
+/*
+ * Map src onto the widget without duplicating it, remember the resulting
+ * handle as the current surface and return that handle.
+ */
+int AGAR_SDLViewLinkSurface(void *p, AG_Surface *src)
+{
+   AGAR_SDLView *view = p;
+   const int handle = AG_WidgetMapSurfaceNODUP(view, src);
+
+   view->mySurface = handle;
+   return handle;
+}
+
+/*
+ * Create a w x h 32-bit packed surface and link it to the widget as its
+ * current surface.
+ * Returns the new surface handle, or -1 when p is NULL or the surface could
+ * not be created (the widget is left untouched in that case).
+ */
+int AGAR_SDLViewSurfaceNew(void *p, int w, int h)
+{
+   AGAR_SDLView *my = p;
+   AG_Surface *src;
+   AG_PixelFormat fmt = { 0 }; /* Zero every field that is not set explicitly below. */
+
+   if(my == NULL) return -1;
+
+   fmt.BitsPerPixel = 32;
+   fmt.BytesPerPixel = 4;
+#ifdef AG_BIG_ENDIAN
+   fmt.Rmask = 0x000000ff; // R
+   fmt.Gmask = 0x0000ff00; // G
+   fmt.Bmask = 0x00ff0000; // B
+   fmt.Amask = 0xff000000; // A
+#else
+   /* NOTE(review): these masks do not match the shift values set below -
+    * confirm which of the two describes the intended layout. */
+   fmt.Rmask = 0x00ff0000; // R
+   fmt.Gmask = 0x0000ff00; // G
+   fmt.Bmask = 0xff000000; // B
+   fmt.Amask = 0x000000ff; // A
+#endif
+   fmt.Rshift = 0;
+   fmt.Gshift = 8;
+   fmt.Bshift = 16;
+   fmt.Ashift = 24;
+   fmt.Rloss = 0;
+   fmt.Gloss = 0;
+   fmt.Bloss = 0;
+   fmt.Aloss = 0;
+   fmt.palette = NULL;
+   fmt.alpha = 255;
+
+   src = AG_SurfaceNew(AG_SURFACE_PACKED  , w, h, &fmt, 0);
+   if(src == NULL) return -1; /* Never map a missing surface. */
+
+   AG_ObjectLock(my);
+   AGAR_SDLViewLinkSurface(my, src);
+   my->forceredraw = 1;
+   AG_ObjectUnlock(my);
+   return my->mySurface;
+}
+
+
+/*
+ * Unmap the widget's current surface, if any, and mark the widget as having
+ * none. A trace line is printed to stdout afterwards.
+ */
+void AGAR_SDLViewSurfaceDetach(void *p)
+{
+   AGAR_SDLView *view = p;
+
+   AG_ObjectLock(view);
+   if(!(view->mySurface < 0)) {
+      AG_WidgetUnmapSurface(view, view->mySurface);
+   }
+   view->mySurface = -1;
+   AG_ObjectUnlock(view);
+
+   printf("AGAR_SDLViewSurfaceDetach()\n");
+}
+
+/*
+ * Return the surface mapped at handle num, or NULL when p is NULL or num is
+ * outside 0 .. nsurfaces - 1.
+ */
+AG_Surface *AGAR_SDLViewGetSurface(void *p, int num)
+{
+   AGAR_SDLView *my = p;
+
+   if(my == NULL) return NULL;
+   if(num < 0) return NULL;
+   /* Valid handles end at nsurfaces - 1; "<=" read one slot past the array. */
+   if(num < my->_inherit.nsurfaces) return my->_inherit.surfaces[num];
+   return NULL;
+}
+
+/* Return the widget's current surface, or NULL when p is NULL or no surface is available. */
+AG_Surface *AGAR_SDLViewGetSrcSurface(void *p)
+{
+   AGAR_SDLView *view = p;
+
+   return (view != NULL) ? AGAR_SDLViewGetSurface(view, view->mySurface) : NULL;
+}
+
+/*
+ * Select handle num as the current surface. Values below -1 are ignored;
+ * other values are accepted only when they compare below the widget's
+ * surface count.
+ */
+void AGAR_SDLViewSetSurfaceNum(void *p, int num)
+{
+   AGAR_SDLView *view = p;
+
+   if(num >= -1) {
+      AG_ObjectLock(AGOBJECT(view));
+      if(num < view->_inherit.nsurfaces) {
+         view->mySurface = num;
+      }
+      AG_ObjectUnlock(AGOBJECT(view));
+   }
+}
+
+
+
+
+/* Register the draw function.
+ * Creates an unnamed event bound to fn, stores it in my->draw_ev (Draw()
+ * calls its handler) and hands fmt to AG_EVENT_GET_ARGS.
+ * NOTE(review): presumably fmt and the variadic arguments become the event's
+ * arguments - confirm against the Agar AG_Event documentation.
+ */
+void AGAR_SDLViewDrawFn(void *p, AG_EventFn fn, const char *fmt, ...)
+{
+    /*
+    * fn must have the signature void foo(AG_Event *).
+    */
+    AGAR_SDLView *my = p;
+
+    AG_ObjectLock(my);
+    my->draw_ev = AG_SetEvent(my, NULL, fn , NULL);
+    AG_EVENT_GET_ARGS(my->draw_ev, fmt);
+    AG_ObjectUnlock(my);
+
+}
+
+void AGAR_SDLViewSetDirty(void *p) /* Flag the view as needing a re-blit; Draw() clears the flag after blitting. */
+{
+  AGAR_SDLView *my = p;
+  AG_ObjectLock(my);
+  my->dirty = 1;
+  AG_ObjectUnlock(my);
+}
+
+/*
+ * This function requests a minimal geometry for displaying the widget.
+ * It is expected to return the width and height in pixels into r.
+ *
+ * Note: Some widgets will provide FooSizeHint() functions to allow the
+ * programmer to request an initial size in pixels or some other metric
+ * FooSizeHint() typically sets some structure variable, which are then
+ * used here.
+ */
+/*
+ * Size requisition callback: report the preferred size in r.
+ * Without a usable surface the widget's own width and height are reported
+ * (and a surface of that size is created when none is linked yet). With a
+ * surface, its width rounded down to a multiple of 8 and its height are
+ * reported, and the surface is resized to match.
+ */
+static void SizeRequest(void *p, AG_SizeReq *r)
+{
+   AGAR_SDLView *my = p;
+   AG_Surface *su = NULL;
+
+   AG_ObjectLock(my);
+   /* Look the surface up only for a valid handle, and test it before use. */
+   if(my->mySurface >= 0) {
+      su = AGWIDGET_SURFACE(my, my->mySurface);
+   }
+   if(su == NULL) {
+      r->w = my->_inherit.w;
+      r->h = my->_inherit.h; /* Was _inherit.w: the height took the width. */
+      if(my->mySurface == -1) {
+         AGAR_SDLViewSurfaceNew(my, r->w, r->h);
+      }
+   } else {
+      r->w = (su->w / 8) * 8; // Set boundary as 32(bytes) = 8(dwords) : 256bit.
+      r->h = su->h;
+      AG_SurfaceResize(su, r->w, r->h);
+   }
+   AG_ObjectUnlock(my);
+}
+
+/*
+ * This function is called by the parent widget after it decided how much
+ * space to allocate to this widget. It is mostly useful to container
+ * widgets, but other widgets generally use it to check if the allocated
+ * geometry can be handled by Draw().
+ */
+static void Draw(void *p);
+
+/*
+ * Size allocation callback: make the backing surface match the allocated
+ * area a, clear it to opaque black and request a full redraw.
+ * Returns 0 on success. Returns -1 (Draw() will not be called) when p is
+ * NULL, the area is smaller than 5x5, or the surface cannot be created or
+ * resized.
+ */
+static int SizeAllocate(void *p, const AG_SizeAlloc *a)
+{
+   AGAR_SDLView *my = p;
+   AG_Surface *su = NULL;
+   AG_Rect r;
+   AG_Color c;
+
+   if(my == NULL) return -1;
+   /* If we return -1, Draw() will not be called. */
+   if (a->w < 5 || a->h < 5)
+     return (-1);
+
+   AG_ObjectLock(my);
+   /* A handle of -1 means "no surface": never use it as an array index. */
+   if(my->mySurface >= 0) {
+      su = AGWIDGET_SURFACE(my, my->mySurface);
+   }
+   if(su == NULL) {
+      my->mySurface = AGAR_SDLViewSurfaceNew(my, a->w, a->h);
+      if(my->mySurface >= 0) {
+         su = AGWIDGET_SURFACE(my, my->mySurface);
+      }
+      if(su == NULL) {
+         AG_ObjectUnlock(my);
+         return (-1);
+      }
+   }
+
+   if((su->w != a->w) || (su->h != a->h)) {
+      if(AG_SurfaceResize(su, a->w, a->h) < 0) {
+         AG_ObjectUnlock(my);
+         return (-1);
+      }
+   }
+
+   my->forceredraw = 1;
+   my->dirty = 1;
+   // Clear
+   r.x = 0;
+   r.y = 0;
+   r.w = a->w;
+   r.h = a->h;
+   c.a = 255;
+   c.r = 0;
+   c.g = 0;
+   c.b = 0;
+   AG_FillRect(su, &r, c);
+   AG_WidgetSetPosition(AGWIDGET(my), a->x, a->y);
+   AG_ObjectUnlock(my);
+   return (0);
+}
+
+/*
+ * Draw function. Invoked from GUI rendering context to draw the widget
+ * at its current location.
+ * Runs the registered draw handler (if any), then blits the current surface
+ * at [0,0] when the view is flagged dirty or a forced redraw is pending.
+ */
+static void Draw(void *p)
+{
+   AGAR_SDLView *my = p;
+   /*
+    * Hold the object lock for the whole draw, including the user handler.
+    */
+   AG_ObjectLock(my);
+
+   if(my->draw_ev != NULL){
+      my->draw_ev->handler(my->draw_ev);
+   }
+
+   /*
+    * dirty is cleared after the blit; forceredraw is not cleared here.
+    */
+
+   /* Blit the mapped surface at [0,0]. */
+   //   _prefetch_data_read_l2(my->Surface->pixels, sizeof(my->Surface->pixels));
+   if((my->dirty != 0) || (my->forceredraw != 0)){
+     if(my->mySurface >= 0) {
+       if(AG_UsingGL(NULL) != 0) {
+        AG_WidgetMapSurfaceNODUP(my, AGWIDGET_SURFACE(my, my->mySurface)); // NOTE(review): maps the surface again on every dirty frame and drops the handle - confirm this does not accumulate mappings
+        //AG_WidgetUpdateSurface(my, my->mySurface);
+        AG_WidgetBlitSurface(my, my->mySurface, 0, 0);
+       } else {
+        //AG_WidgetMapSurface(my, AGWIDGET_SURFACE(my, my->mySurface));
+        AG_WidgetBlitSurface(my, my->mySurface, 0, 0);
+       }
+     }
+     my->dirty = 0;
+   }
+   AG_ObjectUnlock(my);
+}
+
+   
+
+/* Mouse motion event handler (placeholder: reads the widget and event
+ * arguments 1 and 2 but does nothing with them). */
+static void MouseMotion(AG_Event *event)
+{
+       AGAR_SDLView *my = AG_SELF();
+       int x = AG_INT(1);
+       int y = AG_INT(2);
+       /* Nothing to do yet. */
+}
+
+/*
+ * Mouse button press handler: a left click prints its coordinates and
+ * gives the widget keyboard focus; other buttons are ignored.
+ */
+static void MouseButtonDown(AG_Event *event)
+{
+       AGAR_SDLView *view = AG_SELF();
+       int pressed = AG_INT(1);
+       int px = AG_INT(2);
+       int py = AG_INT(3);
+
+       if (pressed == AG_MOUSE_LEFT) {
+               printf("Click at %d,%d\n", px, py);
+               AG_WidgetFocus(view);
+       }
+}
+
+/* Mouse button release handler (placeholder: reads the widget and event
+ * arguments 1 to 3 but does nothing with them). */
+static void MouseButtonUp(AG_Event *event)
+{
+       AGAR_SDLView *my = AG_SELF();
+       int button = AG_INT(1);
+       int x = AG_INT(2);
+       int y = AG_INT(3);
+
+       /* Nothing to do yet. */
+}
+
+/* Key press handler (placeholder: reads the widget and event argument 1
+ * but does nothing with them). */
+static void KeyDown(AG_Event *event)
+{
+       AGAR_SDLView *my = AG_SELF();
+       int keysym = AG_INT(1);
+
+//     printf("Keystroke: 0x%x\n", keysym);
+}
+
+/* Key release handler (placeholder: reads the widget and event argument 1
+ * but does nothing with them). */
+static void KeyUp(AG_Event *event)
+{
+       AGAR_SDLView *my = AG_SELF();
+       int keysym = AG_INT(1);
+
+       /* Nothing to do yet. */
+}
+
+/*
+ * Initialization routine. Note that the object system will automatically
+ * invoke the initialization routines of the parent classes first.
+ */
+static void Init(void *obj)
+{
+   AGAR_SDLView *my = obj;
+
+   /* Allow this widget to grab focus. */
+   AGWIDGET(my)->flags |= AG_WIDGET_FOCUSABLE;
+
+   /*
+    * Initialize every instance variable, so that no field is read
+    * uninitialized by Draw() whichever way the object was created.
+    */
+   my->param = "";
+   my->draw_ev = NULL;
+   my->forceredraw = 1;
+   my->dirty = 1;
+
+   /*
+    * We'll eventually need to create and map a surface, but we cannot
+    * do this from Init(), because it involves texture operations in
+    * GL mode which are thread-unsafe. -1 means "no surface yet".
+    */
+   my->mySurface = -1;
+
+   /*
+    * Map our event handlers. For a list of all meaningful events
+    * we can handle, see AG_Object(3), AG_Widget(3) and AG_Window(3).
+    */
+   AG_SetEvent(my, "mouse-button-up", MouseButtonUp, NULL);
+   AG_SetEvent(my, "mouse-button-down", MouseButtonDown, NULL);
+   AG_SetEvent(my, "mouse-motion", MouseMotion, NULL);
+   AG_SetEvent(my, "key-up", KeyUp, NULL);
+   AG_SetEvent(my, "key-down", KeyDown, NULL);
+}
+
+/*
+ * Unmap every surface of the widget, highest handle first, and mark the
+ * widget as having no current surface. Does nothing for a NULL object.
+ */
+static void Detach(void *obj)
+{
+   AGAR_SDLView *view = obj;
+   int handle;
+
+   if(view == NULL) return;
+
+   AG_ObjectLock(view);
+   handle = (int)view->_inherit.nsurfaces;
+   while(--handle >= 0) {
+      AG_WidgetUnmapSurface(view, handle);
+   }
+   view->mySurface = -1;
+   AG_ObjectUnlock(view);
+}
+/*
+ * This structure describes our widget class. It inherits from AG_ObjectClass.
+ * Any of the function members may be NULL. See AG_Widget(3) for details.
+ * Slots left NULL here: destroy, load, save and edit.
+ */
+AG_WidgetClass AGAR_SDLViewClass = {
+       {
+               "AG_Widget:AGAR_SDLView",       /* Name of class */
+               sizeof(AGAR_SDLView),   /* Size of structure */
+               { 0,0 },                /* Version for load/save */
+               Init,                   /* Initialize dataset */
+               Detach,                 /* Free dataset: unmaps every surface */
+               NULL,                   /* Destroy widget */
+               NULL,                   /* Load widget (for GUI builder) */
+               NULL,                   /* Save widget (for GUI builder) */
+               NULL                    /* Edit (for GUI builder) */
+       },
+       Draw,                           /* Render widget */
+       SizeRequest,                    /* Default size requisition */
+       SizeAllocate                    /* Size allocation callback */
+};
diff --git a/source/src/agar/common/agar_sdlview.h b/source/src/agar/common/agar_sdlview.h
new file mode 100644 (file)
index 0000000..9a00c51
--- /dev/null
@@ -0,0 +1,72 @@
+/*
+* FM-7 Emulator "XM7"
+* Virtual Vram Display(Agar widget version)
+* (C) 2012 K.Ohta <whatisthis.sowhat@gmail.com>
+* License: CC-BY-SA
+* History:
+* Jan 18,2012 From demos/customwidget/mywidget.[c|h]
+*
+*/
+#ifndef __AGAR_SDL_VIEW
+#define __AGAR_SDL_VIEW
+
+# ifdef __cplusplus
+extern "C" {
+#endif
+
+//#include <sys/types.h>
+//#include <agar/core/string_compat.h>
+#include <agar/core.h>
+//#include <agar/core/types.h>
+#include <agar/gui.h>
+
+/*
+* Compatibility aliases: map the unprefixed string helper names onto
+* their AG_-prefixed Agar equivalents.
+*/
+#define Strlcat AG_Strlcat
+#define Strlcpy AG_Strlcpy
+#define Strsep AG_Strsep
+#define Strdup AG_Strdup
+#define TryStrdup AG_TryStrdup
+#define Strcasecmp AG_Strcasecmp
+#define Strncasecmp AG_Strncasecmp
+#define Strcasestr AG_Strcasestr
+#define StrlcatUCS4 AG_StrlcatUCS4
+#define StrlcpyUCS4 AG_StrlcpyUCS4
+#define StrsepUCS4 AG_StrsepUCS4
+#define StrdupUCS4 AG_StrdupUCS4
+#define TryStrdupUCS4 AG_TryStrdupUCS4
+#define StrReverse AG_StrReverse
+#define StrlcpyInt AG_StrlcpyInt
+#define StrlcatInt AG_StrlcatInt
+#define StrlcpyUint AG_StrlcpyUint
+#define StrlcatUint AG_StrlcatUint
+
+
+/* Instance data of one AGAR_SDLView widget. */
+typedef struct  AGAR_SDLView {
+       struct ag_widget _inherit;      /* Base AG_Widget; must stay the first member */
+       int mySurface;                  /* Handle of the current surface; -1 when none */
+       AG_Event *draw_ev;     // draw handler event
+       int forceredraw;       /* NOTE(review): presumably forces a full redraw when non-zero - confirm */
+        int dirty;            /* NOTE(review): presumably set via AGAR_SDLViewSetDirty() - confirm */
+       const char *param;              /* Some parameter */
+} AGAR_SDLView;
+
+extern AG_WidgetClass AGAR_SDLViewClass;
+extern AGAR_SDLView *AGAR_SDLViewNew(void *, AG_Surface *, const char *);
+
+extern int AGAR_SDLViewLinkSurface(void *p, AG_Surface *src);
+extern int AGAR_SDLViewSurfaceNew(void *p, int w, int h);
+extern void AGAR_SDLViewSurfaceDetach(void *p);
+extern AG_Surface *AGAR_SDLViewGetSurface(void *p, int index);
+extern AG_Surface *AGAR_SDLViewGetSrcSurface(void *p);
+extern void AGAR_SDLViewSetSurfaceNum(void *p, int num);
+
+extern void AGAR_SDLViewDrawFn(void *p, AG_EventFn fn, const char *fmt, ...);
+extern void AGAR_SDLViewSetDirty(void *p);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* __AGAR_SDL_VIEW */
diff --git a/source/src/agar/common/scaler/generic/CMakeLists.txt b/source/src/agar/common/scaler/generic/CMakeLists.txt
new file mode 100644 (file)
index 0000000..e54ad1d
--- /dev/null
@@ -0,0 +1,17 @@
+message("* ui-agar/scaler/generic")
+
+#set(CMAKE_BUILD_SETTING_C_FLAGS "${CMAKE_C_FLAGS} -msse2 -msse -mmmx")
+# Generic C scalers: one source file per zoom factor.
+add_library(xm7_scaler-generic
+  scaler_x05.c
+  scaler_x1.c
+  scaler_x125.c
+  scaler_x15.c
+  scaler_x2.c
+  scaler_x225.c
+  scaler_x25.c
+  scaler_x3.c
+  scaler_x4.c
+  scaler_x45.c
+  scaler_x5.c
+  scaler_x6.c
+)
diff --git a/source/src/agar/common/scaler/generic/scaler_x05.c b/source/src/agar/common/scaler/generic/scaler_x05.c
new file mode 100644 (file)
index 0000000..c49f161
--- /dev/null
@@ -0,0 +1,168 @@
+/*
+ * Zoom x0.5
+ * (C) 2014 K.Ohta
+ * 
+ * History:
+ *  2013-01-26 Move from agar_sdlscaler.cpp
+ */
+
+#include "agar_sdlview.h"
+#include "api_vram.h"
+#include "api_draw.h"
+#include "sdl_cpuid.h"
+
+extern struct XM7_CPUID *pCpuID;
+
+
+void pVram2RGB_x05_Line(Uint32 *src, Uint8 *dst, int xbegin, int xend, int y, int yrep)
+{
+   v8hi_t *b;
+   Uint32 *d1;
+   Uint32 *d2;
+   Uint32 *p;
+   int w;
+   int h;
+   int yy;
+   int yy2;
+   int xx;
+   int hh;
+   int ww;
+   int i;
+   int pitch;
+   int yrep2 = yrep;
+   v8hi_t rmask1, gmask1, bmask1, amask1;
+   v4hi rmask2, gmask2, bmask2, amask2;
+   Uint32 black;
+   AG_Surface *Surface = GetDrawSurface();
+    
+   if(Surface == NULL) return;
+   w = Surface->w;
+   h = Surface->h;
+   pitch = Surface->pitch / sizeof(Uint32);
+   if(yrep2 <= 0) yrep2 = 1; // Okay?
+   
+#if AG_BIG_ENDIAN != 1
+   rmask1.i[0] = rmask1.i[1] = rmask1.i[2] = rmask1.i[3] =
+   rmask1.i[4] = rmask1.i[5] = rmask1.i[6] = rmask1.i[7] = 0x000000ff;
+
+   gmask1.i[0] = gmask1.i[1] = gmask1.i[2] = gmask1.i[3] =
+   gmask1.i[4] = gmask1.i[5] = gmask1.i[6] = gmask1.i[7] = 0x0000ff00;
+
+   bmask1.i[0] = bmask1.i[1] = bmask1.i[2] = bmask1.i[3] =
+   bmask1.i[4] = bmask1.i[5] = bmask1.i[6] = bmask1.i[7] = 0x00ff0000;
+
+   amask1.i[0] = amask1.i[1] = amask1.i[2] = amask1.i[3] =
+   amask1.i[4] = amask1.i[5] = amask1.i[6] = amask1.i[7] = 0xff000000;
+
+   amask2.i[0] = amask2.i[1] = amask2.i[2] = amask2.i[3] = 0xff000000;
+   bmask2.i[0] = bmask2.i[1] = bmask2.i[2] = bmask2.i[3] = 0x00ff0000;
+   gmask2.i[0] = gmask2.i[1] = gmask2.i[2] = gmask2.i[3] = 0x0000ff00;
+   rmask2.i[0] = rmask2.i[1] = rmask2.i[2] = rmask2.i[3] = 0x000000ff;
+
+#else
+   rmask1.i[0] = rmask1.i[1] = rmask1.i[2] = rmask1.i[3] =
+   rmask1.i[4] = rmask1.i[5] = rmask1.i[6] = rmask1.i[7] = 0xff000000;
+
+   gmask1.i[0] = gmask1.i[1] = gmask1.i[2] = gmask1.i[3] =
+   gmask1.i[4] = gmask1.i[5] = gmask1.i[6] = gmask1.i[7] = 0x00ff0000;
+
+   bmask1.i[0] = bmask1.i[1] = bmask1.i[2] = bmask1.i[3] =
+   bmask1.i[4] = bmask1.i[5] = bmask1.i[6] = bmask1.i[7] = 0x0000ff00;
+
+   amask1.i[0] = amask1.i[1] = amask1.i[2] = amask1.i[3] =
+   amask1.i[4] = amask1.i[5] = amask1.i[6] = amask1.i[7] = 0x000000ff;
+
+   rmask2.i[0] = rmask2.i[1] = rmask2.i[2] = rmask2.i[3] = 0xff000000;
+   gmask2.i[0] = gmask2.i[1] = gmask2.i[2] = gmask2.i[3] = 0x00ff0000;
+   bmask2.i[0] = bmask2.i[1] = bmask2.i[2] = bmask2.i[3] = 0x0000ff00;
+   amask2.i[0] = amask2.i[1] = amask2.i[2] = amask2.i[3] = 0x000000ff;
+#endif
+   d1 = (Uint32 *)(dst + (xbegin >> 1) * Surface->format->BytesPerPixel);
+   p = &src[xbegin + y * 640];
+   if(((xbegin >>1) + 4) >= w) {
+       Uint32 amask, rmask, gmask, bmask;
+        Uint32 bd1, bd2;
+        Uint32 r, g, b, a;
+        int j;
+
+#if AG_BIG_ENDIAN != 1
+      amask = 0xff000000;
+      bmask = 0x00ff0000;
+      gmask = 0x0000ff00;
+      rmask = 0x000000ff;
+#else
+      rmask = 0xff000000;
+      gmask = 0x00ff0000;
+      bmask = 0x0000ff00;
+      amask = 0x000000ff;
+#endif
+      ww = (xend - xbegin) / 2;
+      if(ww > w) ww = w;
+            
+      for(xx = 0; xx < ww; xx++) {
+        bd1 = p[0];
+        bd2 = p[1];
+        r = (((bd1 & rmask) >> 1) + ((bd2 & rmask) >> 1)) & rmask;
+        g = (((bd1 & gmask) >> 1) + ((bd2 & gmask) >> 1)) & gmask;
+        b = (((bd1 & bmask) >> 1) + ((bd2 & bmask) >> 1)) & bmask;
+        d2 = &d1[xx];
+        for(j = 0; j < yrep2; j++) {
+           *d2 = r | g  | b | amask;
+           d2 += pitch;
+        }
+        p += 2;
+      }
+      return;
+   }
+   
+
+     {
+      v4hi *pd;
+      v4hi cr, cg, cb, cd;
+      v8hi_t *b;
+      v8hi_t br,bg, bb;
+      Uint32 *d0;
+       
+      ww = (xend - xbegin) / 2;
+      if(ww > w) ww = w;
+      d0 = d1;
+      for(xx = 0; xx < ww; xx++) {
+        d1 = d0;
+        b = (v8hi_t *)p;
+        br.v = b->v & rmask1.v;
+        bg.v = b->v & gmask1.v;
+        bb.v = b->v & bmask1.v;
+        cr.i[0] = (br.i[0] >> 1) + (br.i[1] >> 1);
+        cr.i[1] = (br.i[2] >> 1) + (br.i[3] >> 1);
+        cr.i[2] = (br.i[4] >> 1) + (br.i[5] >> 1);
+        cr.i[3] = (br.i[6] >> 1) + (br.i[7] >> 1);
+
+        cb.i[0] = (bb.i[0] + bb.i[1]) >> 1;
+        cb.i[1] = (bb.i[2] + bb.i[3]) >> 1;
+        cb.i[2] = (bb.i[4] + bb.i[5]) >> 1;
+        cb.i[3] = (bb.i[6] + bb.i[7]) >> 1;
+
+        cg.i[0] = (bg.i[0] + bg.i[1]) >> 1;
+        cg.i[1] = (bg.i[2] + bg.i[3]) >> 1;
+        cg.i[2] = (bg.i[4] + bg.i[5]) >> 1;
+        cg.i[3] = (bg.i[6] + bg.i[7]) >> 1;
+        cr.v = cr.v & rmask2.v;
+        cg.v = cg.v & gmask2.v;
+        cb.v = cb.v & bmask2.v;
+        cd.v = cr.v | cg.v | cb.v | amask2.v;
+        for(i = 0; i < yrep2; i++) {
+           pd = (v4hi *)d1;
+           *pd = cd;
+           d1 += pitch;
+        }
+        d0 += 4;
+        p += 8;
+      }
+   }
+}
+
+
+
+
+
+
diff --git a/source/src/agar/common/scaler/generic/scaler_x1.c b/source/src/agar/common/scaler/generic/scaler_x1.c
new file mode 100644 (file)
index 0000000..6fe5bfb
--- /dev/null
@@ -0,0 +1,99 @@
+/*
+ * Zoom x1x1
+ * (C) 2014 K.Ohta
+ * 
+ * History:
+ *  2013-01-26 Move from agar_sdlscaler.cpp
+ */
+
+#include "agar_sdlview.h"
+#include "api_vram.h"
+#include "api_draw.h"
+#include "sdl_cpuid.h"
+
+extern struct XM7_CPUID *pCpuID;
+
+void pVram2RGB_x1_Line(Uint32 *src, Uint8 *dst, int xbegin, int xend, int y, int yrep)
+{
+   register v4hi *b;
+   AG_Surface *Surface = GetDrawSurface();
+   Uint32 *d1;
+   Uint32 *d2;
+   int w;
+   int h;
+   int yy;
+   int xx;
+   int hh;
+   int ww;
+   int i;
+   int x = xbegin;
+   int yrep2 = yrep;
+   unsigned  pitch;
+   Uint32 black;
+   if(Surface == NULL) return;
+   w = Surface->w;
+   h = Surface->h;
+   
+   ww = xend - xbegin;
+   if(ww <= 0) return;
+   
+#if AG_BIG_ENDIAN != 1
+   black = 0xff000000;
+#else
+   black = 0x000000ff;
+#endif
+   d1 = (Uint32 *)(dst + xbegin * Surface->format->BytesPerPixel);
+   d2 = &src[xbegin + y * 640];
+
+   pitch = Surface->pitch / sizeof(Uint32);
+   { // Not thinking align ;-(
+       
+    int j;
+    v4hi b2;
+    v4hi b3;
+    register v4hi bb;
+    v4hi *b2p;
+    Uint32 *d0;
+      
+    b = (v4hi *)d2;
+    bb.i[0] = bb.i[1] = bb.i[2] = bb.i[3] = black;
+    if(yrep2 <= 0) yrep2 = 1;
+       switch(yrep2) {
+       case 1:
+//     case 2:
+         for(xx = 0; xx < ww; xx += 8) {
+            b2p = (v4hi *)d1;
+            b2p[0] = b[0];
+            b2p[1] = b[1];
+            d1 += 8;
+            b += 2;
+         }
+         break;
+       default:
+         d0 = d1;
+         for(xx = 0; xx < ww; xx += 8){
+            d1 = d0;
+            b2 = b[0];
+            b3 = b[1];
+
+            for(j = 0; j < yrep2; j++) {
+               b2p = (v4hi *)d1;
+               if(!bFullScan && (j >= (yrep2 >> 1))) {
+                  b2p[0] = 
+                  b2p[1] = bb;
+                } else {
+                  b2p[0] = b2;
+                  b2p[1] = b3;
+               }
+               d1 += pitch;
+            }
+            d0 += 8;
+            b += 2;
+         }
+
+         break;
+       }
+
+   }
+}
+
diff --git a/source/src/agar/common/scaler/generic/scaler_x125.c b/source/src/agar/common/scaler/generic/scaler_x125.c
new file mode 100644 (file)
index 0000000..c8fd122
--- /dev/null
@@ -0,0 +1,135 @@
+/*
+ * Zoom x1.25x2 i.e. 800x480.
+ * (C) 2014 K.Ohta
+ * 
+ * History:
+ *  2013-04-02 Move from scaler_x2.c
+ */
+#include "agar_sdlview.h"
+#include "api_vram.h"
+#include "api_draw.h"
+#include "sdl_cpuid.h"
+#include "cache_wrapper.h"
+
+extern struct XM7_CPUID *pCpuID;
+
+
+/*
+ * Expand groups of 8 source pixels into 10 destination pixels (x1.25).
+ * Source pixels 0 and 4 of each group are doubled: 01234567 -> 0012344567.
+ *
+ * dst    : first destination pixel of the top row.
+ * src    : first source pixel.
+ * ww     : source width in pixels, consumed 8 at a time.
+ * repeat : destination rows per group; nothing is drawn when <= 0.
+ * pitch  : destination row stride in bytes.
+ *
+ * When bFullScan is clear and repeat >= 2, the last row is filled with
+ * opaque black instead of picture data.
+ */
+static void Scaler_DrawLine(Uint32 *dst, Uint32 *src, int ww, int repeat, int pitch)
+{
+#if AG_BIG_ENDIAN != 1
+   const Uint32 blank = 0xff000000;
+#else
+   const Uint32 blank = 0x000000ff;
+#endif
+   v4hi *in = (v4hi *)src;
+   v4hi lo;
+   v4hi hi;
+   Uint32 px[10];
+   Uint32 *out;
+   int stride;
+   int picture_rows;
+   int blank_row;
+   int col;
+   int row;
+   int k;
+
+   if(repeat <= 0) return;
+   stride = pitch / sizeof(Uint32);
+   blank_row = !((bFullScan) || (repeat < 2));
+   picture_rows = blank_row ? (repeat - 1) : repeat;
+
+   for(col = 0; col < ww; col += 8) {
+      lo = *in++;
+      hi = *in++;
+      px[0] = px[1] = lo.i[0];
+      px[2] = lo.i[1];
+      px[3] = lo.i[2];
+      px[4] = lo.i[3];
+      px[5] = px[6] = hi.i[0];
+      px[7] = hi.i[1];
+      px[8] = hi.i[2];
+      px[9] = hi.i[3];
+
+      out = dst;
+      for(row = 0; row < picture_rows; row++) {
+         for(k = 0; k < 10; k++) out[k] = px[k];
+         out += stride;
+      }
+      if(blank_row) {
+         for(k = 0; k < 10; k++) out[k] = blank;
+      }
+      dst += 10;
+   }
+}
+
+
+
+/*
+ * Draw one VRAM line scaled x1.25 horizontally.
+ *
+ * src    : VRAM image; lines are 640 pixels apart (hard-coded).
+ * dst    : destination base address; the column offset is added here.
+ * xbegin : first source column; xend: one past the last source column.
+ * y      : source line number.
+ * yrep   : row count handed to Scaler_DrawLine().
+ *
+ * Only whole groups of 8 source pixels are drawn; a trailing partial
+ * group is dropped.
+ */
+void pVram2RGB_x125_Line(Uint32 *src, Uint8 *dst, int xbegin, int xend, int y, int yrep)
+{
+   AG_Surface *Surface = GetDrawSurface();
+   Uint8 *out;
+   int span;
+
+   if(Surface == NULL) return;
+
+   span = xend - xbegin;
+   span -= span % 8;
+   if(span <= 0) return;
+
+   /* 8 source pixels become 10 destination pixels. */
+   out = dst + ((xbegin * 10) / 8) * Surface->format->BytesPerPixel;
+   Scaler_DrawLine((Uint32 *)out, &src[xbegin + y * 640], span, yrep, Surface->pitch);
+}
+
+
diff --git a/source/src/agar/common/scaler/generic/scaler_x15.c b/source/src/agar/common/scaler/generic/scaler_x15.c
new file mode 100644 (file)
index 0000000..5ad57fe
--- /dev/null
@@ -0,0 +1,130 @@
+/*
+ * Zoom x1.5
+ * (C) 2014 K.Ohta
+ * 
+ * History:
+ *  2013-04-02 Move from scaler_x2.c
+ */
+#include "agar_sdlview.h"
+#include "api_vram.h"
+#include "api_draw.h"
+#include "sdl_cpuid.h"
+#include "cache_wrapper.h"
+
+extern struct XM7_CPUID *pCpuID;
+
+
+/*
+ * Expand groups of 8 source pixels into 12 destination pixels (x1.5).
+ * Every second source pixel is doubled: 01234567 -> 001223445667.
+ *
+ * dst    : first destination vector of the top row.
+ * src    : first source pixel.
+ * ww     : source width in pixels, consumed 8 at a time.
+ * repeat : destination rows per group; nothing is drawn when <= 0.
+ * pitch  : destination row stride in bytes (divided by sizeof(v4hi) here).
+ *
+ * When bFullScan is clear and repeat >= 2, the last row is filled with
+ * opaque black instead of picture data.
+ */
+static void Scaler_DrawLine(v4hi *dst, Uint32 *src, int ww, int repeat, int pitch)
+{
+#if AG_BIG_ENDIAN != 1
+   const v4ui blank = {0xff000000, 0xff000000, 0xff000000, 0xff000000};
+#else
+   const v4ui blank = {0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff};
+#endif
+   v4hi *in = (v4hi *)src;
+   v4hi *out;
+   v4hi lo;
+   v4hi hi;
+   v4hi q[3];
+   int stride;
+   int picture_rows;
+   int blank_row;
+   int col;
+   int row;
+
+   if(repeat <= 0) return;
+   stride = pitch / sizeof(v4hi);
+   blank_row = !((bFullScan) || (repeat < 2));
+   picture_rows = blank_row ? (repeat - 1) : repeat;
+
+   for(col = 0; col < ww; col += 8) {
+      lo = *in++;
+      hi = *in++;
+      q[0].uv = (v4ui){lo.i[0], lo.i[0], lo.i[1], lo.i[2]};
+      q[1].uv = (v4ui){lo.i[2], lo.i[3], hi.i[0], hi.i[0]};
+      q[2].uv = (v4ui){hi.i[1], hi.i[2], hi.i[2], hi.i[3]};
+
+      out = dst;
+      for(row = 0; row < picture_rows; row++) {
+         out[0] = q[0];
+         out[1] = q[1];
+         out[2] = q[2];
+         out += stride;
+      }
+      if(blank_row) {
+         out[0].uv = blank;
+         out[1].uv = blank;
+         out[2].uv = blank;
+      }
+      dst += 3;
+   }
+}
+
+
+
+/*
+ * Draw one VRAM line scaled x1.5 horizontally.
+ *
+ * src    : VRAM image; lines are 640 pixels apart (hard-coded).
+ * dst    : destination base address; the column offset is added here.
+ * xbegin : first source column; xend: one past the last source column.
+ * y      : source line number.
+ * yrep   : row count handed to Scaler_DrawLine().
+ *
+ * Only whole groups of 8 source pixels are drawn; a trailing partial
+ * group is dropped.
+ */
+void pVram2RGB_x15_Line(Uint32 *src, Uint8 *dst, int xbegin, int xend, int y, int yrep)
+{
+   AG_Surface *Surface = GetDrawSurface();
+   Uint8 *out;
+   int span;
+
+   if(Surface == NULL) return;
+
+   span = xend - xbegin;
+   span -= span % 8;
+   if(span <= 0) return;
+
+   /* 4 source pixels become 6 destination pixels. */
+   out = dst + ((xbegin * 6) / 4) * Surface->format->BytesPerPixel;
+   Scaler_DrawLine((v4hi *)out, &src[xbegin + y * 640], span, yrep, Surface->pitch);
+}
+
diff --git a/source/src/agar/common/scaler/generic/scaler_x2.c b/source/src/agar/common/scaler/generic/scaler_x2.c
new file mode 100644 (file)
index 0000000..6af2707
--- /dev/null
@@ -0,0 +1,148 @@
+/*
+ * Zoom x2x2
+ * (C) 2013 K.Ohta
+ * 
+ * History:
+ *  2013-01-26 Move from agar_sdlscaler.cpp
+ */
+
+#include "agar_sdlview.h"
+#include "api_vram.h"
+#include "api_draw.h"
+#include "sdl_cpuid.h"
+
+extern struct XM7_CPUID *pCpuID;
+
+void pVram2RGB_x2_Line(Uint32 *src, Uint8 *dst, int xbegin, int xend, int y, int yrep)
+{
+   register v4hi *b;
+   AG_Surface *Surface = GetDrawSurface();
+   Uint32 *d1;
+   Uint32 *d2;
+   Uint32 *p;
+   int w;
+   int h;
+   int yy;
+   int xx;
+   int hh;
+   int ww;
+   int wodd;
+   int i;
+   int x = xbegin;
+   int yrep2;
+   unsigned  pitch;
+   Uint32 black;
+   if(Surface == NULL) return;
+   w = Surface->w;
+   h = Surface->h;
+   
+   ww = xend - xbegin;
+   if((ww * 2) > w) ww = w / 2;
+   if(ww <= 0) return;
+   wodd = ww % 8;
+#if AG_BIG_ENDIAN != 1
+   black = 0xff000000;
+#else
+   black = 0x000000ff;
+#endif
+   yrep2 = yrep;
+   d1 = (Uint32 *)(dst + xbegin * 2 * Surface->format->BytesPerPixel);
+   d2 = &src[xbegin + y * 640];
+
+   pitch = Surface->pitch / sizeof(Uint32);
+   { // Not thinking align ;-(
+       
+    int j;
+    v4hi b2;
+    v4hi b3;
+    v4hi b4;
+    v4hi b5;
+    register v4hi bb;
+    v4hi *b2p;
+    Uint32 *d0;
+      
+    b = (v4hi *)d2;
+    bb.i[0] = bb.i[1] = bb.i[2] = bb.i[3] = black;
+       switch(yrep2) {
+       case 0:
+       case 1:
+//     case 2:
+         d0 = d1;
+         for(xx = 0; xx < (ww - 1); xx += 8) {
+            d1 = d0;
+            b2p = (v4hi *)d1;
+            b2.i[0] = b2.i[1] = b[0].i[0];
+            b2.i[2] = b2.i[3] = b[0].i[1];
+            b3.i[0] = b3.i[1] = b[0].i[2];
+            b3.i[2] = b3.i[3] = b[0].i[3];
+
+            b4.i[0] = b4.i[1] = b[1].i[0];
+            b4.i[2] = b4.i[3] = b[1].i[1];
+            b5.i[0] = b5.i[1] = b[1].i[2];
+            b5.i[2] = b5.i[3] = b[1].i[3];
+            b2p[0] = b2;
+            b2p[1] = b3;
+            b2p[2] = b4;
+            b2p[3] = b5;
+            d0 += 16;
+            b += 2;
+         }
+         if(wodd != 0) {
+            Uint32 *bp = (Uint32 *)b;
+            for(i = 0; i < wodd; i++) {
+               *d0 = *bp;
+               d0[1] = *bp;
+               d0++;
+               bp++;
+            }
+         }
+         break;
+       default:
+         d0 = d1;
+         for(xx = 0; xx < (ww - 1); xx += 8){
+            d1 = d0;
+            b2.i[0] = b2.i[1] = b[0].i[0];
+            b2.i[2] = b2.i[3] = b[0].i[1];
+            b3.i[0] = b3.i[1] = b[0].i[2];
+            b3.i[2] = b3.i[3] = b[0].i[3];
+            
+            b4.i[0] = b4.i[1] = b[1].i[0];
+            b4.i[2] = b4.i[3] = b[1].i[1];
+            b5.i[0] = b5.i[1] = b[1].i[2];
+            b5.i[2] = b5.i[3] = b[1].i[3];
+
+            for(j = 0; j < yrep2; j++) {
+               b2p = (v4hi *)d1;
+               if(!bFullScan && (j >= (yrep2 >> 1))) {
+                  b2p[0] = 
+                  b2p[1] = 
+                  b2p[2] = 
+                  b2p[3] = bb;
+                } else {
+                  b2p[0] = b2;
+                  b2p[1] = b3;
+                  b2p[2] = b4;
+                  b2p[3] = b5;
+               }
+               d1 += pitch;
+            }
+            d0 += 16;
+            b += 2;
+         }
+         if(wodd != 0) {
+            Uint32 *bp = (Uint32 *)b;
+            for(i = 0; i < wodd; i++) {
+               *d0 = *bp;
+               d0[1] = *bp;
+               d0[pitch] = *bp;
+               d0[pitch + 1] = *bp;
+               d0++;
+               bp++;
+            }
+         }
+         break;
+       }
+
+   }
+}
+
diff --git a/source/src/agar/common/scaler/generic/scaler_x225.c b/source/src/agar/common/scaler/generic/scaler_x225.c
new file mode 100644 (file)
index 0000000..3d542a8
--- /dev/null
@@ -0,0 +1,135 @@
+/*
+ * Zoom x2.25x2 i.e. 1440x900.
+ * (C) 2013 K.Ohta
+ * 
+ * History:
+ *  2013-04-02 Move from scaler_x2.c
+ */
+#include "agar_sdlview.h"
+#include "api_vram.h"
+#include "api_draw.h"
+#include "sdl_cpuid.h"
+#include "cache_wrapper.h"
+
+extern struct XM7_CPUID *pCpuID;
+
+/*
+ * Expand groups of 8 source pixels into 18 destination pixels (x2.25).
+ * Source pixels 0 and 4 of each group are tripled, the others doubled:
+ * 01234567 -> 000112233444556677.
+ *
+ * dst    : first destination pixel of the top row.
+ * src    : first source pixel.
+ * ww     : source width in pixels, consumed 8 at a time.
+ * repeat : destination rows per group; nothing is drawn when <= 0.
+ * pitch  : destination row stride in bytes.
+ *
+ * When bFullScan is clear and repeat >= 2, the last row is filled with
+ * opaque black instead of picture data.
+ */
+static void Scaler_DrawLine(Uint32 *dst, Uint32 *src, int ww, int repeat, int pitch)
+{
+#if AG_BIG_ENDIAN != 1
+   const Uint32 blank = 0xff000000;
+#else
+   const Uint32 blank = 0x000000ff;
+#endif
+   v4hi *in = (v4hi *)src;
+   v4hi lo;
+   v4hi hi;
+   Uint32 px[18];
+   Uint32 *out;
+   int stride;
+   int picture_rows;
+   int blank_row;
+   int col;
+   int row;
+   int k;
+
+   if(repeat <= 0) return;
+   stride = pitch / sizeof(Uint32);
+   blank_row = !((bFullScan) || (repeat < 2));
+   picture_rows = blank_row ? (repeat - 1) : repeat;
+
+   for(col = 0; col < ww; col += 8) {
+      lo = *in++;
+      hi = *in++;
+      px[0] = px[1] = px[2] = lo.i[0];
+      px[3] = px[4] = lo.i[1];
+      px[5] = px[6] = lo.i[2];
+      px[7] = px[8] = lo.i[3];
+      px[9] = px[10] = px[11] = hi.i[0];
+      px[12] = px[13] = hi.i[1];
+      px[14] = px[15] = hi.i[2];
+      px[16] = px[17] = hi.i[3];
+
+      out = dst;
+      for(row = 0; row < picture_rows; row++) {
+         for(k = 0; k < 18; k++) out[k] = px[k];
+         out += stride;
+      }
+      if(blank_row) {
+         for(k = 0; k < 18; k++) out[k] = blank;
+      }
+      dst += 18;
+   }
+}
+
+
+
+/*
+ * Draw one VRAM line scaled x2.25 horizontally.
+ *
+ * src    : VRAM image; lines are 640 pixels apart (hard-coded).
+ * dst    : destination base address; the column offset is added here.
+ * xbegin : first source column; xend: one past the last source column.
+ * y      : source line number.
+ * yrep   : row count handed to Scaler_DrawLine().
+ *
+ * Only whole groups of 8 source pixels are drawn; a trailing partial
+ * group is dropped.
+ *
+ * Fix: the width clamp used the x2 factor (w / 2). Since 8 source pixels
+ * produce 18 destination pixels, that limit still allowed more output
+ * than the surface is wide; the limit is now (w * 8) / 18.
+ * NOTE(review): assumes Surface->w is the width of the buffer behind dst.
+ */
+void pVram2RGB_x225_Line(Uint32 *src, Uint8 *dst, int xbegin, int xend, int y, int yrep)
+{
+   AG_Surface *Surface = GetDrawSurface();
+   Uint8 *out;
+   int span;
+   int max_span;
+
+   if(Surface == NULL) return;
+
+   span = xend - xbegin;
+   max_span = (Surface->w * 8) / 18;   /* widest source span that fits */
+   if(span > max_span) span = max_span;
+   span = (span / 8) * 8;
+   if(span <= 0) return;
+
+   /* 8 source pixels become 18 destination pixels. */
+   out = dst + ((xbegin * 18) / 8) * Surface->format->BytesPerPixel;
+   Scaler_DrawLine((Uint32 *)out, &src[xbegin + y * 640], span, yrep, Surface->pitch);
+}
+
+
diff --git a/source/src/agar/common/scaler/generic/scaler_x25.c b/source/src/agar/common/scaler/generic/scaler_x25.c
new file mode 100644 (file)
index 0000000..1e82e5f
--- /dev/null
@@ -0,0 +1,142 @@
+/*
+ * Zoom x2.5
+ * (C) 2013 K.Ohta
+ * 
+ * History:
+ *  2013-04-02 Move from scaler_x2.c
+ */
+#include "agar_sdlview.h"
+#include "api_vram.h"
+#include "api_draw.h"
+#include "sdl_cpuid.h"
+#include "cache_wrapper.h"
+
+extern struct XM7_CPUID *pCpuID;
+
+extern void pVram2RGB_x25(Uint32 *src, Uint32 *dst, int x, int y, int yrep);
+
+/*
+ * Expand groups of 8 source pixels into 20 destination pixels (x2.5).
+ * Even source pixels are tripled, odd ones doubled:
+ * 01234567 -> 00011222334445566677.
+ *
+ * dst    : first destination vector of the top row.
+ * src    : first source pixel.
+ * ww     : source width in pixels, consumed 8 at a time.
+ * repeat : destination rows per group; nothing is drawn when <= 0.
+ * pitch  : destination row stride in bytes (divided by sizeof(v4hi) here).
+ *
+ * When bFullScan is clear and repeat >= 2, the last row is filled with
+ * opaque black instead of picture data.
+ */
+static void Scaler_DrawLine(v4hi *dst, Uint32 *src, int ww, int repeat, int pitch)
+{
+#if AG_BIG_ENDIAN != 1
+   const v4ui blank = {0xff000000, 0xff000000, 0xff000000, 0xff000000};
+#else
+   const v4ui blank = {0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff};
+#endif
+   /* NOTE(review): volatile is kept from the original; presumably a
+    * compiler workaround - confirm before removing. */
+   volatile v4hi q[5];
+   v4hi *in = (v4hi *)src;
+   v4hi *out;
+   v4hi lo;
+   v4hi hi;
+   int stride;
+   int picture_rows;
+   int blank_row;
+   int col;
+   int row;
+
+   if(repeat <= 0) return;
+   stride = pitch / sizeof(v4hi);
+   blank_row = !((bFullScan) || (repeat < 2));
+   picture_rows = blank_row ? (repeat - 1) : repeat;
+
+   for(col = 0; col < ww; col += 8) {
+      lo = *in++;
+      hi = *in++;
+      q[0].uv = (v4ui){lo.i[0], lo.i[0], lo.i[0], lo.i[1]};
+      q[1].uv = (v4ui){lo.i[1], lo.i[2], lo.i[2], lo.i[2]};
+      q[2].uv = (v4ui){lo.i[3], lo.i[3], hi.i[0], hi.i[0]};
+      q[3].uv = (v4ui){hi.i[0], hi.i[1], hi.i[1], hi.i[2]};
+      q[4].uv = (v4ui){hi.i[2], hi.i[2], hi.i[3], hi.i[3]};
+
+      out = dst;
+      for(row = 0; row < picture_rows; row++) {
+         out[0] = q[0];
+         out[1] = q[1];
+         out[2] = q[2];
+         out[3] = q[3];
+         out[4] = q[4];
+         out += stride;
+      }
+      if(blank_row) {
+         out[0].uv = blank;
+         out[1].uv = blank;
+         out[2].uv = blank;
+         out[3].uv = blank;
+         out[4].uv = blank;
+      }
+      dst += 5;
+   }
+}
+
+
+
+/*
+ * Draw one VRAM line scaled x2.5 horizontally.
+ *
+ * src    : VRAM image; lines are 640 pixels apart (hard-coded).
+ * dst    : destination base address; the column offset is added here.
+ * xbegin : first source column; xend: one past the last source column.
+ * y      : source line number.
+ * yrep   : row count handed to Scaler_DrawLine().
+ *
+ * Only whole groups of 8 source pixels are drawn; a trailing partial
+ * group is dropped.
+ *
+ * Fixes:
+ *  - the destination column was computed as (x * 20) / 16, i.e. x1.25,
+ *    although Scaler_DrawLine() emits 20 pixels per 8 source pixels. Any
+ *    non-zero xbegin therefore landed at half the intended offset. It is
+ *    now (x * 20) / 8.
+ *  - the width clamp used the x2 factor (w / 2); it is now (w * 8) / 20.
+ * NOTE(review): assumes Surface->w is the width of the buffer behind dst.
+ */
+void pVram2RGB_x25_Line(Uint32 *src, Uint8 *dst, int xbegin, int xend, int y, int yrep)
+{
+   AG_Surface *Surface = GetDrawSurface();
+   Uint8 *out;
+   int span;
+   int max_span;
+
+   if(Surface == NULL) return;
+
+   span = xend - xbegin;
+   max_span = (Surface->w * 8) / 20;   /* widest source span that fits */
+   if(span > max_span) span = max_span;
+   span = (span / 8) * 8;
+   if(span <= 0) return;
+
+   /* 8 source pixels become 20 destination pixels. */
+   out = dst + ((xbegin * 20) / 8) * Surface->format->BytesPerPixel;
+   Scaler_DrawLine((v4hi *)out, &src[xbegin + y * 640], span, yrep, Surface->pitch);
+}
diff --git a/source/src/agar/common/scaler/generic/scaler_x3.c b/source/src/agar/common/scaler/generic/scaler_x3.c
new file mode 100644 (file)
index 0000000..e8fa3ad
--- /dev/null
@@ -0,0 +1,145 @@
+/*
+ * Zoom x3
+ * (C) 2014 K.Ohta
+ * 
+ * History:
+ *  2013-04-02 Move from scaler_x2.c
+ */
+#include "agar_sdlview.h"
+#include "api_vram.h"
+#include "api_draw.h"
+#include "sdl_cpuid.h"
+#include "cache_wrapper.h"
+
+/*
+ * Expand groups of 8 source pixels into 24 destination pixels (x3).
+ * Every source pixel is tripled: 01234567 -> 000111222333444555666777.
+ *
+ * dst    : first destination vector of the top row.
+ * src    : first source pixel.
+ * ww     : source width in pixels, consumed 8 at a time.
+ * repeat : destination rows per group; nothing is drawn when <= 0.
+ * pitch  : destination row stride in bytes (divided by sizeof(v4hi) here).
+ *
+ * When bFullScan is clear and repeat >= 2, the last row is filled with
+ * opaque black instead of picture data.
+ */
+static void Scaler_DrawLine(v4hi *dst, Uint32 *src, int ww, int repeat, int pitch)
+{
+#if AG_BIG_ENDIAN != 1
+   const v4ui blank = {0xff000000, 0xff000000, 0xff000000, 0xff000000};
+#else
+   const v4ui blank = {0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff};
+#endif
+   v4hi *in = (v4hi *)src;
+   v4hi *out;
+   v4hi lo;
+   v4hi hi;
+   v4hi q[6];
+   int stride;
+   int picture_rows;
+   int blank_row;
+   int col;
+   int row;
+   int k;
+
+   if(repeat <= 0) return;
+   stride = pitch / sizeof(v4hi);
+   blank_row = !((bFullScan) || (repeat < 2));
+   picture_rows = blank_row ? (repeat - 1) : repeat;
+
+   for(col = 0; col < ww; col += 8) {
+      lo = *in++;
+      hi = *in++;
+      q[0].uv = (v4ui){lo.i[0], lo.i[0], lo.i[0], lo.i[1]};
+      q[1].uv = (v4ui){lo.i[1], lo.i[1], lo.i[2], lo.i[2]};
+      q[2].uv = (v4ui){lo.i[2], lo.i[3], lo.i[3], lo.i[3]};
+
+      q[3].uv = (v4ui){hi.i[0], hi.i[0], hi.i[0], hi.i[1]};
+      q[4].uv = (v4ui){hi.i[1], hi.i[1], hi.i[2], hi.i[2]};
+      q[5].uv = (v4ui){hi.i[2], hi.i[3], hi.i[3], hi.i[3]};
+
+      out = dst;
+      for(row = 0; row < picture_rows; row++) {
+         for(k = 0; k < 6; k++) out[k] = q[k];
+         out += stride;
+      }
+      if(blank_row) {
+         for(k = 0; k < 6; k++) out[k].uv = blank;
+      }
+      dst += 6;
+   }
+}
+
+
+
+/*
+ * Draw one VRAM line scaled x3 horizontally.
+ *
+ * src    : VRAM image; lines are 640 pixels apart (hard-coded).
+ * dst    : destination base address; the column offset is added here.
+ * xbegin : first source column; xend: one past the last source column.
+ * y      : source line number.
+ * yrep   : row count handed to Scaler_DrawLine().
+ *
+ * Only whole groups of 8 source pixels are drawn; a trailing partial
+ * group is dropped.
+ */
+void pVram2RGB_x3_Line(Uint32 *src, Uint8 *dst, int xbegin, int xend, int y, int yrep)
+{
+   AG_Surface *Surface = GetDrawSurface();
+   Uint8 *out;
+   int span;
+
+   if(Surface == NULL) return;
+
+   span = xend - xbegin;
+   span -= span % 8;
+   if(span <= 0) return;
+
+   /* Each source pixel becomes 3 destination pixels. */
+   out = dst + xbegin * 3 * Surface->format->BytesPerPixel;
+   Scaler_DrawLine((v4hi *)out, &src[xbegin + y * 640], span, yrep, Surface->pitch);
+}
+
+
diff --git a/source/src/agar/common/scaler/generic/scaler_x4.c b/source/src/agar/common/scaler/generic/scaler_x4.c
new file mode 100644 (file)
index 0000000..0d98405
--- /dev/null
@@ -0,0 +1,178 @@
+/*
+ * Zoom x4x4
+ * (C) 2013 K.Ohta
+ * 
+ * History:
+ *  2013-01-26 Move from agar_sdlscaler.cpp
+ *  2013-09-17 Move from ui-agar/
+ */
+
+#include "agar_sdlview.h"
+#include "api_vram.h"
+#include "api_draw.h"
+#include "sdl_cpuid.h"
+
+
+void pVram2RGB_x4_Line(Uint32 *src, Uint8 *dst, int xbegin, int xend, int y, int yrep)
+{
+   register v4hi *b;
+   AG_Surface *Surface = GetDrawSurface();
+   Uint32 *d1;
+   Uint32 *d2;
+   Uint32 *p;
+   int w;
+   int h;
+   int yy;
+   int xx;
+   int hh;
+   int ww;
+   int i;
+   int x = xbegin;
+   unsigned  pitch;
+   int yrep2;
+   Uint32 black;
+   if(Surface == NULL) return;
+   w = Surface->w;
+   h = Surface->h;
+   
+   ww = xend - xbegin;
+   if((ww * 4) >= w) ww = w / 4;
+   ww = ww - 7;
+   if(ww <= 0) return;
+   
+#if AG_BIG_ENDIAN != 1
+   black = 0xff000000;
+#else
+   black = 0x000000ff;
+#endif
+   yrep2 = yrep;
+   d1 = (Uint32 *)(dst + x * 4 * Surface->format->BytesPerPixel);
+   d2 = &src[x + y * 640];
+
+
+   pitch = Surface->pitch / sizeof(Uint32);
+   { // Not thinking align ;-(
+       
+    int j;
+    v4hi b2;
+    v4hi b3;
+    v4hi b4;
+    v4hi b5;
+    v4hi b6;
+    v4hi b7;
+    v4hi b8;
+    v4hi b9;
+    register v4hi bb;
+    v4hi *b2p;
+    Uint32 *d0;
+    Uint32 dd;
+      
+    b = (v4hi *)d2;
+    bb.i[0] = bb.i[1] = bb.i[2] = bb.i[3] = black;
+    if((((y * yrep2) % 16) == 0) && ((yrep2 % 16) != 0)) yrep2 += 16;
+    yrep2 >>= 4;
+       switch(yrep2) {
+       case 0:
+       case 1:
+//     case 2:
+         for(xx = 0; xx < ww; xx += 8) {
+            b2p = (v4hi *)d1;
+            b2.i[0] = b2.i[1] = b2.i[2] = b2.i[3] = b[0].i[0];
+            b3.i[0] = b3.i[1] = b3.i[2] = b3.i[3] = b[0].i[1];
+            b4.i[0] = b4.i[1] = b4.i[2] = b4.i[3] = b[0].i[2];
+            b5.i[0] = b5.i[1] = b5.i[2] = b5.i[3] = b[0].i[3];
+
+            b6.i[0] = b6.i[1] = b6.i[2] = b6.i[3] = b[1].i[0];
+            b7.i[0] = b7.i[1] = b7.i[2] = b7.i[3] = b[1].i[1];
+            b8.i[0] = b8.i[1] = b8.i[2] = b8.i[3] = b[1].i[2];
+            b9.i[0] = b9.i[1] = b9.i[2] = b9.i[3] = b[1].i[3];
+
+            b2p[0] = b2;
+            b2p[1] = b3;
+            b2p[2] = b4;
+            b2p[3] = b5;
+            b2p[4] = b6;
+            b2p[5] = b7;
+            b2p[6] = b8;
+            b2p[7] = b9;
+            d1 += 32;
+            b += 2;
+         }
+         if((ww % 8) != 0){
+            j = 0;
+            d0 = (Uint32 *)b;
+            b2p = (v4hi *)d1;
+            for(j = 0;j < (ww % 8); j++) {
+               b2.i[0] = b2.i[1] = b2.i[3] = b2.i[4] = *d0;
+               *b2p = b2;
+               d0++;
+               b2p++;
+            }
+         }
+         break;
+       default:
+         d0 = d1;
+         for(xx = 0; xx < ww; xx += 8){
+            d1 = d0;
+            b2.i[0] = b2.i[1] = b2.i[2] = b2.i[3] = b[0].i[0];
+            b3.i[0] = b3.i[1] = b3.i[2] = b3.i[3] = b[0].i[1];
+            b4.i[0] = b4.i[1] = b4.i[2] = b4.i[3] = b[0].i[2];
+            b5.i[0] = b5.i[1] = b5.i[2] = b5.i[3] = b[0].i[3];
+
+            b6.i[0] = b6.i[1] = b6.i[2] = b6.i[3] = b[1].i[0];
+            b7.i[0] = b7.i[1] = b7.i[2] = b7.i[3] = b[1].i[1];
+            b8.i[0] = b8.i[1] = b8.i[2] = b8.i[3] = b[1].i[2];
+            b9.i[0] = b9.i[1] = b9.i[2] = b9.i[3] = b[1].i[3];
+
+
+            for(j = 0; j < yrep2; j++) {
+               b2p = (v4hi *)d1;
+               if(!bFullScan && (j > (yrep2 >> 1))) {
+                  b2p[0] = 
+                  b2p[1] = 
+                  b2p[2] = 
+                  b2p[3] = 
+                  b2p[4] = 
+                  b2p[5] = 
+                  b2p[6] = 
+                  b2p[7] = bb;
+                } else {
+                   b2p[0] = b2;
+                   b2p[1] = b3;
+                   b2p[2] = b4;
+                   b2p[3] = b5;
+                   b2p[4] = b6;
+                   b2p[5] = b7;
+                   b2p[6] = b8;
+                   b2p[7] = b9;
+               }
+               d1 += pitch;
+            }
+            d0 += 32;
+            b += 2;
+         }
+         if((ww % 8) != 0){
+            d2 = (Uint32 *)b;
+            d0 = d1;
+            for(j = 0;j < (ww % 8); j++) {
+               d1 = d0;
+               b2.i[0] = b2.i[1] = b2.i[3] = b2.i[4] = *d2;
+               for(i = 0; i < (yrep2 >> 1); i++) {
+                  b2p = (v4hi *)d1;
+                  if(!bFullScan && (j > (yrep2 >> 2))) {
+                     *b2p = bb;
+                  } else {
+                     *b2p = b2;
+                  }
+                  d1 += pitch;
+               }
+               d0 += 4;
+               d2++;
+            }
+         }
+         break;
+       }
+
+   }
+}
+
diff --git a/source/src/agar/common/scaler/generic/scaler_x45.c b/source/src/agar/common/scaler/generic/scaler_x45.c
new file mode 100644 (file)
index 0000000..f7f9a5c
--- /dev/null
@@ -0,0 +1,164 @@
+/*
+ * Zoom x4.5
+ * (C) 2014 K.Ohta
+ * 
+ * History:
+ *  2013-04-02 Move from scaler_x2.c
+ */
+#include "agar_sdlview.h"
+#include "api_vram.h"
+#include "api_draw.h"
+#include "sdl_cpuid.h"
+#include "cache_wrapper.h"
+
+extern struct XM7_CPUID *pCpuID;
+
+
+/*
+ * Expand ww source pixels (processed eight at a time) to 4.5x width:
+ * every group of 8 source pixels becomes 9 vectors (36 pixels).
+ * The group is written on `repeat` rows, pitch bytes apart.  Unless
+ * bFullScan is set or repeat < 2, the last of those rows is filled with bb.
+ */
+static void Scaler_DrawLine(v4hi *dst, Uint32 *src, int ww, int repeat, int pitch)
+{
+#if AG_BIG_ENDIAN != 1
+   const v4ui bb = {0xff000000, 0xff000000, 0xff000000, 0xff000000};
+#else
+   const v4ui bb = {0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff};
+#endif
+   v4hi *in = (v4hi *)src;
+   v4hi *row;
+   v4hi lo, hi;
+   v4hi px[9];
+   int stride;
+   int blackline;
+   int rows;
+   int col;
+   int n;
+   int k;
+
+   if(repeat <= 0) return;
+   stride = pitch / sizeof(v4hi);
+   blackline = !(bFullScan || (repeat < 2));
+   rows = blackline ? (repeat - 1) : repeat;
+
+   for(col = 0; col < ww; col += 8) {
+      lo = *in++;
+      hi = *in++;
+      // 76543210 -> 777766666555544444333322222111100000
+      px[0].uv = (v4ui){lo.i[0], lo.i[0], lo.i[0], lo.i[0]};
+      px[1].uv = (v4ui){lo.i[0], lo.i[1], lo.i[1], lo.i[1]};
+      px[2].uv = (v4ui){lo.i[1], lo.i[2], lo.i[2], lo.i[2]};
+      px[3].uv = (v4ui){lo.i[2], lo.i[2], lo.i[3], lo.i[3]};
+      px[4].uv = (v4ui){lo.i[3], lo.i[3], hi.i[0], hi.i[0]};
+      px[5].uv = (v4ui){hi.i[0], hi.i[0], hi.i[0], hi.i[1]};
+      px[6].uv = (v4ui){hi.i[1], hi.i[1], hi.i[1], hi.i[2]};
+      px[7].uv = (v4ui){hi.i[2], hi.i[2], hi.i[2], hi.i[2]};
+      px[8].uv = (v4ui){hi.i[3], hi.i[3], hi.i[3], hi.i[3]};
+
+      row = dst;
+      for(n = 0; n < rows; n++) {
+         for(k = 0; k < 9; k++) row[k] = px[k];
+         row += stride;
+      }
+      if(blackline) {
+         for(k = 0; k < 9; k++) row[k].uv = bb;
+      }
+      dst += 9;
+   }
+}
+
+
+
+/*
+ * Draw one source line horizontally magnified by 4.5.
+ *  src    : source pixel buffer, 640 pixels per line
+ *  dst    : start of the destination row inside the draw surface
+ *  xbegin : first source column, xend : one past the last source column
+ *  y      : source line number
+ *  yrep   : number of destination rows, passed through to Scaler_DrawLine()
+ */
+void pVram2RGB_x45_Line(Uint32 *src, Uint8 *dst, int xbegin, int xend, int y, int yrep)
+{
+   AG_Surface *Surface = GetDrawSurface();
+   Uint8 *out;
+   int span;
+
+   if(Surface == NULL) return;
+
+   /* Only whole groups of eight source pixels are drawn. */
+   span = ((xend - xbegin) / 8) * 8;
+   if(span <= 0) return;
+
+   /* Destination column is xbegin * 4.5, computed as (xbegin * 18) / 4. */
+   out = dst + ((xbegin * 18) / 4) * Surface->format->BytesPerPixel;
+   Scaler_DrawLine((v4hi *)out, &src[xbegin + y * 640], span, yrep, Surface->pitch);
+}
+
diff --git a/source/src/agar/common/scaler/generic/scaler_x5.c b/source/src/agar/common/scaler/generic/scaler_x5.c
new file mode 100644 (file)
index 0000000..d0b0dc2
--- /dev/null
@@ -0,0 +1,167 @@
+/*
+ * Zoom x5
+ * (C) 2013 K.Ohta
+ * 
+ * History:
+ *  2013-04-02 Move from scaler_x2.c
+ */
+#include "agar_sdlview.h"
+#include "api_vram.h"
+#include "api_draw.h"
+#include "sdl_cpuid.h"
+#include "cache_wrapper.h"
+
+extern struct XM7_CPUID *pCpuID;
+
+
+/*
+ * Expand ww source pixels (processed eight at a time) to 5x width:
+ * every group of 8 source pixels becomes 10 vectors (40 pixels).
+ * The group is written on `repeat` rows, pitch bytes apart.  Unless
+ * bFullScan is set or repeat < 2, the last of those rows is filled with bb.
+ */
+static void Scaler_DrawLine(v4hi *dst, Uint32 *src, int ww, int repeat, int pitch)
+{
+#if AG_BIG_ENDIAN != 1
+   const v4ui bb = {0xff000000, 0xff000000, 0xff000000, 0xff000000};
+#else
+   const v4ui bb = {0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff};
+#endif
+   v4hi *in = (v4hi *)src;
+   v4hi *row;
+   v4hi lo, hi;
+   v4hi px[10];
+   int stride;
+   int blackline;
+   int rows;
+   int col;
+   int n;
+   int k;
+
+   if(repeat <= 0) return;
+   stride = pitch / sizeof(v4hi);
+   blackline = !(bFullScan || (repeat < 2));
+   rows = blackline ? (repeat - 1) : repeat;
+
+   for(col = 0; col < ww; col += 8) {
+      lo = *in++;
+      hi = *in++;
+      /* Five copies of each source pixel, packed four to a vector. */
+      px[0].uv = (v4ui){lo.i[0], lo.i[0], lo.i[0], lo.i[0]};
+      px[1].uv = (v4ui){lo.i[0], lo.i[1], lo.i[1], lo.i[1]};
+      px[2].uv = (v4ui){lo.i[1], lo.i[1], lo.i[2], lo.i[2]};
+      px[3].uv = (v4ui){lo.i[2], lo.i[2], lo.i[2], lo.i[3]};
+      px[4].uv = (v4ui){lo.i[3], lo.i[3], lo.i[3], lo.i[3]};
+      px[5].uv = (v4ui){hi.i[0], hi.i[0], hi.i[0], hi.i[0]};
+      px[6].uv = (v4ui){hi.i[0], hi.i[1], hi.i[1], hi.i[1]};
+      px[7].uv = (v4ui){hi.i[1], hi.i[1], hi.i[2], hi.i[2]};
+      px[8].uv = (v4ui){hi.i[2], hi.i[2], hi.i[2], hi.i[3]};
+      px[9].uv = (v4ui){hi.i[3], hi.i[3], hi.i[3], hi.i[3]};
+
+      row = dst;
+      for(n = 0; n < rows; n++) {
+         for(k = 0; k < 10; k++) row[k] = px[k];
+         row += stride;
+      }
+      if(blackline) {
+         for(k = 0; k < 10; k++) row[k].uv = bb;
+      }
+      dst += 10;
+   }
+}
+
+
+
+/*
+ * Draw one source line horizontally magnified by 5.
+ *  src    : source pixel buffer, 640 pixels per line
+ *  dst    : start of the destination row inside the draw surface
+ *  xbegin : first source column, xend : one past the last source column
+ *  y      : source line number
+ *  yrep   : number of destination rows, passed through to Scaler_DrawLine()
+ */
+void pVram2RGB_x5_Line(Uint32 *src, Uint8 *dst, int xbegin, int xend, int y, int yrep)
+{
+   AG_Surface *Surface = GetDrawSurface();
+   Uint8 *out;
+   int span;
+
+   if(Surface == NULL) return;
+
+   /* Only whole groups of eight source pixels are drawn. */
+   span = ((xend - xbegin) / 8) * 8;
+   if(span <= 0) return;
+
+   /* Five destination pixels per source pixel. */
+   out = dst + xbegin * 5 * Surface->format->BytesPerPixel;
+   Scaler_DrawLine((v4hi *)out, &src[xbegin + y * 640], span, yrep, Surface->pitch);
+}
diff --git a/source/src/agar/common/scaler/generic/scaler_x6.c b/source/src/agar/common/scaler/generic/scaler_x6.c
new file mode 100644 (file)
index 0000000..05b68ca
--- /dev/null
@@ -0,0 +1,178 @@
+/*
+ * Zoom x6
+ * (C) 2013 K.Ohta
+ * 
+ * History:
+ *  2013-04-02 Move from scaler_x2.c
+ */
+#include "agar_sdlview.h"
+#include "api_vram.h"
+#include "api_draw.h"
+#include "sdl_cpuid.h"
+#include "cache_wrapper.h"
+
+/*
+ * Expand ww source pixels (processed eight at a time) to 6x width:
+ * every group of 8 source pixels becomes 12 vectors (48 pixels).
+ * The group is written on `repeat` rows, pitch bytes apart.  Unless
+ * bFullScan is set or repeat < 2, the last of those rows is filled with bb.
+ *
+ * Fix: the full-scan path used to advance dst by 10 vectors after writing
+ * 12, so consecutive groups overlapped.  Both paths now share one loop
+ * that advances by 12.
+ */
+static void Scaler_DrawLine(v4hi *dst, Uint32 *src, int ww, int repeat, int pitch)
+{
+#if AG_BIG_ENDIAN != 1
+   const v4ui bb = {0xff000000, 0xff000000, 0xff000000, 0xff000000};
+#else
+   const v4ui bb = {0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff};
+#endif
+   v4hi *b = (v4hi *)src;
+   v4hi *b2p;
+   v4hi r1, r2;
+   v4hi px[12];
+   int pitch2;
+   int blackline;
+   int rows;
+   int xx;
+   int yy;
+   int k;
+
+   if(repeat <= 0) return;
+   pitch2 = pitch / sizeof(v4hi);
+   /* With scanline blanking, the last of the `repeat` rows is black. */
+   blackline = !(bFullScan || (repeat < 2));
+   rows = blackline ? (repeat - 1) : repeat;
+
+   for(xx = 0; xx < ww; xx += 8) {
+      r1 = *b++;
+      r2 = *b++;
+      /* Six copies of each source pixel, packed four to a vector. */
+      px[0].uv  = (v4ui){r1.i[0], r1.i[0], r1.i[0], r1.i[0]};
+      px[1].uv  = (v4ui){r1.i[0], r1.i[0], r1.i[1], r1.i[1]};
+      px[2].uv  = (v4ui){r1.i[1], r1.i[1], r1.i[1], r1.i[1]};
+      px[3].uv  = (v4ui){r1.i[2], r1.i[2], r1.i[2], r1.i[2]};
+      px[4].uv  = (v4ui){r1.i[2], r1.i[2], r1.i[3], r1.i[3]};
+      px[5].uv  = (v4ui){r1.i[3], r1.i[3], r1.i[3], r1.i[3]};
+      px[6].uv  = (v4ui){r2.i[0], r2.i[0], r2.i[0], r2.i[0]};
+      px[7].uv  = (v4ui){r2.i[0], r2.i[0], r2.i[1], r2.i[1]};
+      px[8].uv  = (v4ui){r2.i[1], r2.i[1], r2.i[1], r2.i[1]};
+      px[9].uv  = (v4ui){r2.i[2], r2.i[2], r2.i[2], r2.i[2]};
+      px[10].uv = (v4ui){r2.i[2], r2.i[2], r2.i[3], r2.i[3]};
+      px[11].uv = (v4ui){r2.i[3], r2.i[3], r2.i[3], r2.i[3]};
+
+      b2p = dst;
+      for(yy = 0; yy < rows; yy++) {
+         for(k = 0; k < 12; k++) b2p[k] = px[k];
+         b2p = b2p + pitch2;
+      }
+      if(blackline) {
+         for(k = 0; k < 12; k++) b2p[k].uv = bb;
+      }
+      dst += 12;
+   }
+}
+
+
+
+/*
+ * Draw one source line horizontally magnified by 6.
+ *  src    : source pixel buffer, 640 pixels per line
+ *  dst    : start of the destination row inside the draw surface
+ *  xbegin : first source column, xend : one past the last source column
+ *  y      : source line number
+ *  yrep   : number of destination rows, passed through to Scaler_DrawLine()
+ */
+void pVram2RGB_x6_Line(Uint32 *src, Uint8 *dst, int xbegin, int xend, int y, int yrep)
+{
+   AG_Surface *Surface = GetDrawSurface();
+   Uint8 *out;
+   int span;
+
+   if(Surface == NULL) return;
+
+   /* Only whole groups of eight source pixels are drawn. */
+   span = ((xend - xbegin) / 8) * 8;
+   if(span <= 0) return;
+
+   /* Six destination pixels per source pixel. */
+   out = dst + xbegin * 6 * Surface->format->BytesPerPixel;
+   Scaler_DrawLine((v4hi *)out, &src[xbegin + y * 640], span, yrep, Surface->pitch);
+}
+
+
diff --git a/source/src/agar/common/scaler/sse2/CMakeLists.txt b/source/src/agar/common/scaler/sse2/CMakeLists.txt
new file mode 100644 (file)
index 0000000..fd37895
--- /dev/null
@@ -0,0 +1,17 @@
+message("* ui-agar/scaler/sse2")
+# Library target bundling the SSE2 variants of the line scalers; the compile options below turn on SSE2/SSE/MMX code generation.
+#set(CMAKE_BUILD_SETTING_C_FLAGS "${CMAKE_C_FLAGS} -msse2 -msse -mmmx")
+add_compile_options(-msse2 -msse -mmmx) 
+add_library(xm7_scaler-sse2
+                              scaler_x1_sse2.c
+                              scaler_x125_sse2.c
+                              scaler_x15_sse2.c
+                              scaler_x2_sse2.c
+                              scaler_x225_sse2.c
+                              scaler_x25_sse2.c
+                              scaler_x3_sse2.c
+                              scaler_x4_sse2.c
+                              scaler_x45_sse2.c
+                              scaler_x5_sse2.c
+                              scaler_x6_sse2.c
+)
diff --git a/source/src/agar/common/scaler/sse2/scaler_x125_sse2.c b/source/src/agar/common/scaler/sse2/scaler_x125_sse2.c
new file mode 100644 (file)
index 0000000..da8cea4
--- /dev/null
@@ -0,0 +1,146 @@
+/*
+ * Zoom x1.25x2 i.e. 800x480.
+ * (C) 2014 K.Ohta
+ * 
+ * History:
+ *  2013-04-02 Move from scaler_x2.c
+ */
+#include "agar_sdlview.h"
+#include "api_vram.h"
+#include "api_draw.h"
+#include "sdl_cpuid.h"
+#include "cache_wrapper.h"
+
+extern struct XM7_CPUID *pCpuID;
+
+extern void pVram2RGB_x125(Uint32 *src, Uint32 *dst, int x, int y, int yrep);
+
+#if defined(__SSE2__)
+
+/*
+ * Expand ww source pixels (processed eight at a time) to 1.25x width:
+ * every group of 8 source pixels becomes 10 destination pixels, with the
+ * first pixel of each half of the group doubled.
+ * The group is written on `repeat` rows, pitch bytes apart.  Unless
+ * bFullScan is set or repeat < 2, the last of those rows is filled with bb.
+ */
+static void Scaler_DrawLine(Uint32 *dst, Uint32 *src, int ww, int repeat, int pitch)
+{
+#if AG_BIG_ENDIAN != 1
+   const Uint32 bb = 0xff000000;
+#else
+   const Uint32 bb = 0x000000ff;
+#endif
+   v4hi *in = (v4hi *)src;
+   v4hi lo, hi;
+   Uint32 *row;
+   int stride;
+   int blackline;
+   int rows;
+   int col;
+   int n;
+   int k;
+
+   if(repeat <= 0) return;
+   stride = pitch / sizeof(Uint32);
+   blackline = !(bFullScan || (repeat < 2));
+   rows = blackline ? (repeat - 1) : repeat;
+
+   for(col = 0; col < ww; col += 8) {
+      lo = *in++;
+      hi = *in++;
+      row = (Uint32 *)dst;
+      // 76543210 -> 7654432100
+      for(n = 0; n < rows; n++) {
+         row[1] = lo.i[0];
+         row[0] = row[1];
+         row[2] = lo.i[1];
+         row[3] = lo.i[2];
+         row[4] = lo.i[3];
+         row[6] = hi.i[0];
+         row[5] = row[6];
+         row[7] = hi.i[1];
+         row[8] = hi.i[2];
+         row[9] = hi.i[3];
+         row += stride;
+      }
+      if(blackline) {
+         for(k = 9; k >= 0; k--) row[k] = bb;
+      }
+      dst = dst + 10;
+   }
+}
+
+
+
+/*
+ * Draw one source line horizontally magnified by 1.25.
+ *  src    : source pixel buffer, 640 pixels per line
+ *  dst    : start of the destination row inside the draw surface
+ *  xbegin : first source column, xend : one past the last source column
+ *  y      : source line number
+ *  yrep   : number of destination rows, passed through to Scaler_DrawLine()
+ */
+void pVram2RGB_x125_Line_SSE2(Uint32 *src, Uint8 *dst, int xbegin, int xend, int y, int yrep)
+{
+   AG_Surface *Surface = GetDrawSurface();
+   Uint8 *out;
+   int span;
+
+   if(Surface == NULL) return;
+
+   /* Only whole groups of eight source pixels are drawn. */
+   span = ((xend - xbegin) / 8) * 8;
+   if(span <= 0) return;
+
+   /* Destination column is xbegin * 1.25, computed as (xbegin * 10) / 8. */
+   out = dst + ((xbegin * 10) / 8) * Surface->format->BytesPerPixel;
+   Scaler_DrawLine((Uint32 *)out, &src[xbegin + y * 640], span, yrep, Surface->pitch);
+}
+
+
+#else 
+
+/*
+ * Non-SSE2 build: forward to the generic x1.25 scaler.
+ * The previous fallback had no dst parameter and referred to undeclared
+ * names (dst, x); the signature now matches the SSE2 variant in this file.
+ * NOTE(review): the generic prototype is assumed to follow the six-argument
+ * form shared by the other *_Line scalers -- confirm against scaler_x125.c.
+ */
+extern void pVram2RGB_x125_Line(Uint32 *src, Uint8 *dst, int xbegin, int xend, int y, int yrep);
+
+void pVram2RGB_x125_Line_SSE2(Uint32 *src, Uint8 *dst, int xbegin, int xend, int y, int yrep)
+{
+   pVram2RGB_x125_Line(src, dst, xbegin, xend, y, yrep);
+}
+
+#endif // __SSE2__
\ No newline at end of file
diff --git a/source/src/agar/common/scaler/sse2/scaler_x15_sse2.c b/source/src/agar/common/scaler/sse2/scaler_x15_sse2.c
new file mode 100644 (file)
index 0000000..1647de1
--- /dev/null
@@ -0,0 +1,141 @@
+/*
+ * Zoom x1.5
+ * (C) 2014 K.Ohta
+ * 
+ * History:
+ *  2013-04-02 Move from scaler_x2.c
+ */
+#include "agar_sdlview.h"
+#include "api_vram.h"
+#include "api_draw.h"
+#include "sdl_cpuid.h"
+#include "cache_wrapper.h"
+
+extern struct XM7_CPUID *pCpuID;
+
+/* Generic (non-SSE2) x1.5 scaler.  NOTE(review): prototype aligned with the six-argument form used by the other *_Line scalers -- confirm against scaler_x15.c. */
+extern void pVram2RGB_x15_Line(Uint32 *src, Uint8 *dst, int xbegin, int xend, int y, int yrep);
+
+#if defined(__SSE2__)
+/*
+ * Expand ww source pixels (processed eight at a time) to 1.5x width:
+ * every group of 8 source pixels becomes 3 vectors (12 pixels).
+ * The group is written on `repeat` rows, pitch bytes apart.  Unless
+ * bFullScan is set or repeat < 2, the last of those rows is filled with bb.
+ */
+static void Scaler_DrawLine(v4hi *dst, Uint32 *src, int ww, int repeat, int pitch)
+{
+#if AG_BIG_ENDIAN != 1
+   const v4ui bb = {0xff000000, 0xff000000, 0xff000000, 0xff000000};
+#else
+   const v4ui bb = {0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff};
+#endif
+   v4hi *in = (v4hi *)src;
+   v4hi *row;
+   v4hi lo, hi;
+   v4hi px[3];
+   int stride;
+   int blackline;
+   int rows;
+   int col;
+   int n;
+   int k;
+
+   if(repeat <= 0) return;
+   stride = pitch / sizeof(v4hi);
+   blackline = !(bFullScan || (repeat < 2));
+   rows = blackline ? (repeat - 1) : repeat;
+
+   for(col = 0; col < ww; col += 8) {
+      lo = *in++;
+      hi = *in++;
+      // 76543210 -> 766544322100
+      px[0].uv = (v4ui){lo.i[0], lo.i[0], lo.i[1], lo.i[2]};
+      px[1].uv = (v4ui){lo.i[2], lo.i[3], hi.i[0], hi.i[0]};
+      px[2].uv = (v4ui){hi.i[1], hi.i[2], hi.i[2], hi.i[3]};
+
+      row = dst;
+      for(n = 0; n < rows; n++) {
+         for(k = 0; k < 3; k++) row[k] = px[k];
+         row += stride;
+      }
+      if(blackline) {
+         for(k = 0; k < 3; k++) row[k].uv = bb;
+      }
+      dst += 3;
+   }
+}
+
+
+
+/*
+ * Draw one source line horizontally magnified by 1.5.
+ *  src    : source pixel buffer, 640 pixels per line
+ *  dst    : start of the destination row inside the draw surface
+ *  xbegin : first source column, xend : one past the last source column
+ *  y      : source line number
+ *  yrep   : number of destination rows, passed through to Scaler_DrawLine()
+ */
+void pVram2RGB_x15_Line_SSE2(Uint32 *src, Uint8 *dst, int xbegin, int xend, int y, int yrep)
+{
+   AG_Surface *Surface = GetDrawSurface();
+   Uint8 *out;
+   int span;
+
+   if(Surface == NULL) return;
+
+   /* Only whole groups of eight source pixels are drawn. */
+   span = ((xend - xbegin) / 8) * 8;
+   if(span <= 0) return;
+
+   /* Destination column is xbegin * 1.5, computed as (xbegin * 6) / 4. */
+   out = dst + ((xbegin * 6) / 4) * Surface->format->BytesPerPixel;
+   Scaler_DrawLine((v4hi *)out, &src[xbegin + y * 640], span, yrep, Surface->pitch);
+}
+
+
+#else 
+
+/*
+ * Non-SSE2 build: forward to the generic x1.5 scaler.
+ * The previous fallback was a copy of the x3 one: it defined
+ * pVram2RGB_x3_Line_SSE2, called pVram2RGB_x3_Line, had no dst parameter
+ * and referred to undeclared names (dst, x).  It now defines the x1.5
+ * entry point with the same signature as the SSE2 variant in this file.
+ * NOTE(review): the file-level prototype of pVram2RGB_x15_Line must be the
+ * six-argument form used here -- confirm against scaler_x15.c.
+ */
+void pVram2RGB_x15_Line_SSE2(Uint32 *src, Uint8 *dst, int xbegin, int xend, int y, int yrep)
+{
+   pVram2RGB_x15_Line(src, dst, xbegin, xend, y, yrep);
+}
+
+#endif // __SSE2__
\ No newline at end of file
diff --git a/source/src/agar/common/scaler/sse2/scaler_x1_sse2.c b/source/src/agar/common/scaler/sse2/scaler_x1_sse2.c
new file mode 100644 (file)
index 0000000..a4747ce
--- /dev/null
@@ -0,0 +1,120 @@
+/*
+ * Zoom x1x1
+ * (C) 2014 K.Ohta
+ * 
+ * History:
+ *  2013-01-26 Move from agar_sdlscaler.cpp
+ */
+
+#include "agar_sdlview.h"
+#include "api_vram.h"
+#include "api_draw.h"
+#include "sdl_cpuid.h"
+#include "cache_wrapper.h"
+
+extern struct XM7_CPUID *pCpuID;
+
+
+/*
+ * Copy ww source pixels (eight at a time, as two vectors) at 1x width.
+ * Three cases, selected once per call:
+ *  - scanline blanking (bFullScan clear and repeat >= 2): the pixels go
+ *    on repeat - 1 rows, followed by one row filled with bb;
+ *  - full scan with repeat >= 2: the pixels go on `repeat` rows;
+ *  - repeat of 0 or 1: a single row is copied straight across.
+ * Rows are pitch bytes apart.  A negative repeat draws nothing.
+ */
+static void Scaler_DrawLine(v4hi *dst, Uint32 *src, int ww, int repeat, int pitch)
+{
+#if AG_BIG_ENDIAN != 1
+   const v4ui bb = {0xff000000, 0xff000000, 0xff000000, 0xff000000};
+#else
+   const v4ui bb = {0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff};
+#endif
+   register v4hi *in = (v4hi *)src;
+   register v4hi *row;
+   register v4hi lo, hi;
+   register v4hi blackv;
+   register int stride;
+   int col;
+   int n;
+
+   if(__builtin_expect((repeat < 0), 0)) return;
+   blackv.uv = bb;
+   stride = pitch / sizeof(v4hi);
+
+   if(!bFullScan && (repeat >= 2)) {
+      /* Last row of each group is blanked. */
+      for(col = 0; col < ww; col += 8) {
+         lo = in[0];
+         hi = in[1];
+         row = dst;
+         for(n = repeat - 1; n > 0; n--) {
+            row[0] = lo;
+            row[1] = hi;
+            row += stride;
+         }
+         row[0] = blackv;
+         row[1] = blackv;
+         dst += 2;
+         in += 2;
+      }
+      return;
+   }
+
+   if(__builtin_expect((repeat >= 2), 1)) {
+      for(col = 0; col < ww; col += 8) {
+         lo = in[0];
+         hi = in[1];
+         row = dst;
+         for(n = repeat; n > 0; n--) {
+            row[0] = lo;
+            row[1] = hi;
+            row += stride;
+         }
+         dst += 2;
+         in += 2;
+      }
+      return;
+   }
+
+   /* repeat is 0 or 1 here: one row, no temporaries. */
+   for(col = 0; col < ww; col += 8) {
+      dst[0] = in[0];
+      dst[1] = in[1];
+      dst += 2;
+      in += 2;
+   }
+}
+
+
+
+      
+/*
+ * Draw one source line at 1x width.
+ *  src    : source pixel buffer, 640 pixels per line
+ *  dst    : start of the destination row inside the draw surface
+ *  xbegin : first source column, xend : one past the last source column
+ *  y      : source line number
+ *  yrep   : number of destination rows, passed through to Scaler_DrawLine()
+ * The width is passed on as-is; Scaler_DrawLine() steps through it in
+ * groups of eight pixels.
+ */
+void pVram2RGB_x1_Line_SSE2(Uint32 *src, Uint8 *dst, int xbegin, int xend, int y, int yrep)
+{
+   AG_Surface *Surface = GetDrawSurface();
+   Uint8 *out;
+   int span;
+
+   if(Surface == NULL) return;
+
+   span = xend - xbegin;
+   if(span <= 0) return;
+
+   out = dst + xbegin * Surface->format->BytesPerPixel;
+   Scaler_DrawLine((v4hi *)out, &src[xbegin + y * 640], span, yrep, Surface->pitch);
+}
+
diff --git a/source/src/agar/common/scaler/sse2/scaler_x225_sse2.c b/source/src/agar/common/scaler/sse2/scaler_x225_sse2.c
new file mode 100644 (file)
index 0000000..1819a8c
--- /dev/null
@@ -0,0 +1,220 @@
+/*
+ * Zoom x2.25x2 i.e. 1440x900.
+ * (C) 2013 K.Ohta
+ * 
+ * History:
+ *  2013-04-02 Move from scaler_x2.c
+ */
+#include "agar_sdlview.h"
+#include "api_vram.h"
+#include "api_draw.h"
+#include "sdl_cpuid.h"
+#include "cache_wrapper.h"
+
+extern struct XM7_CPUID *pCpuID;
+
+extern void pVram2RGB_x225(Uint32 *src, Uint32 *dst, int x, int y, int yrep);
+
+#if defined(__SSE2__)
+
+static void Scaler_DrawLine(Uint32 *dst, Uint32 *src, int ww, int repeat, int pitch)
+{
+   int xx;
+   int yy;
+   int yrep2;
+   int yrep3;
+   int blank;
+   v2hi *b2p;
+   v4hi *b4p, *s4p;
+   register v2hi r1, r2, r3, r4;
+   v2hi r5v[(640 * 9) / 8 + 1];
+   v4hi *d0;
+   register v2hi *b;
+   int pitch2;
+   int ip = 0;
+#if AG_BIG_ENDIAN != 1
+   const v2ui bb = (v2ui){0xff000000, 0xff000000};
+#else
+   const v2ui bb = (v2ui){0x000000ff, 0x000000ff};
+#endif
+     
+   if(repeat <= 0) return;
+   b = (v2hi *)src;
+   pitch2 = pitch / sizeof(v2hi);
+
+   _prefetch_data_write_l1(r5v, sizeof(r5v));
+   if((bFullScan) || (repeat < 2)) {
+      yrep2 = repeat;
+      yrep3 = 0;
+   } else {
+      // 76543210 -> 776655444332211000
+      yrep2 = repeat - 1;
+      if(yrep2 < 1) {
+        yrep2 = 1;
+        yrep3 = 0;
+      } else {
+        yrep3 = 1;
+      }
+   }
+#ifndef __x86_64__ /* ia32 etc */
+        // 76543210 -> 776655444332211000
+   _prefetch_data_write_l1(r5v, sizeof(r5v));
+   for(xx = 0; xx < ww; xx += 8) {
+      r1 = b[0];
+      r2 = b[1];
+      r3 = b[2];
+      r4 = b[3];
+      r5v[ip + 0].uv = (v2ui){r1.i[0], r1.i[0]}; //00
+      r5v[ip + 1].uv = (v2ui){r1.i[0], r1.i[1]}; //01
+      
+      r5v[ip + 2].uv = (v2ui){r1.i[1], r2.i[0]}; //12
+      r5v[ip + 3].uv = (v2ui){r2.i[0], r2.i[1]}; //23
+        
+      r5v[ip + 4].uv = (v2ui){r2.i[1], r3.i[0]}; //34
+      r5v[ip + 5].uv = (v2ui){r3.i[0], r3.i[0]}; //44
+      r5v[ip + 6].uv = (v2ui){r3.i[1], r3.i[1]}; //55
+
+      r5v[ip + 7].uv = (v2ui){r4.i[0], r4.i[0]}; //66
+      r5v[ip + 8].uv = (v2ui){r4.i[1], r4.i[1]}; //77   
+      ip += 9;
+      b += 4;
+   }
+   b2p = (v2hi *)dst;
+   //_prefetch_data_read_l1(r5v, sizeof(r5v));
+   for(yy = 0; yy < yrep2; yy++) {
+        memcpy((void *)b2p, (void *)r5v, ww * sizeof(Uint32));
+        b2p = b2p + pitch2;
+   }
+   for(yy = 0; yy < yrep3; yy++) {
+      for(xx = 0; xx < ip; xx++) b2p[xx].uv = bb;
+   }
+#else /* defined(__x86_64__) */
+   /* x86_64 : Using assembly. */
+   Uint32 *p;
+   p = dst;
+#if 0
+   for(yy = 0; yy < repeat; yy++) {
+      _prefetch_data_write_l1(p, ww * sizeof(Uint32));
+      p += (pitch / sizeof(Uint32));
+   }
+#endif   
+      asm volatile (
+                  "movl %[ww], %%edx\n\t"
+                  "shr  $3, %%edx\n\t"
+                  "movq %[src], %%rsi\n\t"
+                  "movq %[dst], %%rdi\n\t"
+                  "movl %[pitch], %%r11d\n\t"
+                  "movl %[rep], %%r9d\n\t"
+                  "movl %[rep2], %%r10d\n\t"
+                  "_l3:\n\t"    
+                  "movq %%rdi, %%r8\n\t"
+                  "movdqu 0(%%rsi), %%xmm0\n\t"
+                  "movdqu 16(%%rsi), %%xmm5\n\t"
+                  "pshufd $0b11111010 ,%%xmm0, %%xmm1\n\t"
+                  "pshufd $0b01010000 ,%%xmm0, %%xmm2\n\t"
+                  "movd %%xmm0, %%eax\n\t"
+
+                  "pshufd $0b11111010, %%xmm5, %%xmm3\n\t"
+                  "pshufd $0b01010000, %%xmm5, %%xmm4\n\t"
+                  "movd %%xmm5, %%ebx\n\t"
+                  
+                  "movl %%r9d, %%ecx\n\t"
+                  "movq %%r8, %%rdi\n"
+                  "_l4:\n\t"
+                  "movl   %%eax,  0(%%rdi)\n\t"
+                  "movdqu %%xmm2, 4(%%rdi)\n\t"
+                  "movdqu %%xmm1, 20(%%rdi)\n\t"
+                  "movl   %%ebx,  36(%%rdi)\n\t"
+                  "movdqu %%xmm4, 40(%%rdi)\n\t"
+                  "movdqu %%xmm3, 56(%%rdi)\n\t"
+                  "addq   %%r11, %%rdi\n\t"
+                  "dec %%ecx\n\t"
+                  "jnz _l4\n\t"
+                  "movl %%r10d, %%ecx\n\t"
+                  "cmpl $0x00000000, %%ecx\n\t"
+                  "jz _l5\n\t"
+                  "_l6:\n\t"
+                  "movq $0xff000000ff000000, %%r12\n\t"
+                  "movq %%r12, %%xmm6\n\t"
+                  "movdqu %%xmm6, 0(%%rdi)\n\t"
+                  "movdqu %%xmm6, 16(%%rdi)\n\t"
+                  "movdqu %%xmm6, 32(%%rdi)\n\t"
+                  "movdqu %%xmm6, 48(%%rdi)\n\t"
+                  "movq   %%r12,  64(%%rdi)\n\t"
+                  "dec %%ecx\n\t"
+                  "jnz _l6\n\t"
+                  "_l5:\n\t"
+                  "addq $32, %%rsi\n\t"
+                  "movq %%r8, %%rdi\n\t"
+                  "addq $72, %%rdi\n\t"
+                  "dec %%edx\n\t"
+                  "jnz _l3\n\t"
+                  :
+                  : [pitch] "rm"(pitch), [ww]"rm" (ww),
+                    [rep] "rm"(yrep2),[rep2] "rm"(yrep3),
+                    [src] "rm" (src), [dst] "rm" (dst)
+                  : "eax","ebx","rdi", "rsi",  "ecx", "edx", 
+                    "r8", "r9", "r10", "r11", "r12",
+                   "xmm0","xmm1","xmm2",
+                   "xmm3","xmm4", "xmm5", "xmm6");
+      
+#endif      
+}
+
+
+
+
+/*
+ * Draw source columns [xbegin, xend) of source line y at x2.25 horizontal
+ * scale.
+ *
+ *   src    - source pixel buffer, addressed as src[x + y * 640]
+ *   dst    - destination bytes; the column offset (xbegin * 18 / 8 pixels)
+ *            is added here
+ *   xbegin - first source column
+ *   xend   - one past the last source column
+ *   y      - source line number
+ *   yrep   - number of destination rows, passed through to Scaler_DrawLine()
+ *
+ * The span is limited to half the surface width and rounded down to a
+ * multiple of 8 pixels; nothing is drawn if that leaves an empty span or
+ * there is no draw surface.
+ * NOTE(review): the w / 2 limit matches a x2 scaler; for x2.25 the output is
+ * wider than 2 * ww - confirm the surface is always wide enough.
+ */
+void pVram2RGB_x225_Line_SSE2(Uint32 *src, Uint8 *dst, int xbegin, int xend, int y, int yrep)
+{
+   AG_Surface *Surface = GetDrawSurface();
+   Uint32 *d1;
+   Uint32 *d2;
+   int w;
+   int ww;
+
+   if(Surface == NULL) return;
+   w = Surface->w;
+
+   ww = xend - xbegin;
+   if(ww > (w / 2)) ww = w / 2;
+   ww = (ww / 8) * 8;
+   if(ww <= 0) return;
+
+   d1 = (Uint32 *)(dst + ((xbegin * 18) / 8) * Surface->format->BytesPerPixel);
+   d2 = &src[xbegin + y * 640];
+   Scaler_DrawLine(d1, d2, ww, yrep, Surface->pitch);
+}
+
+
+#else 
+
+/*
+ * Build without SSE2: provide the same entry point as the SSE2 variant of
+ * this file and forward to the generic scaler.
+ *
+ * The parameter list matches the SSE2 variant so callers link either way.
+ * NOTE(review): no prototype for the generic x2.25 line routine is visible in
+ * this file.  The call keeps the five-argument shape used before (xend is not
+ * forwarded) - confirm the callee name and signature against the generic
+ * scaler.
+ */
+void pVram2RGB_x225_Line_SSE2(Uint32 *src, Uint8 *dst, int xbegin, int xend, int y, int yrep)
+{
+   (void)xend;
+   pVram2RGB_x225_Line(src, dst, xbegin, y, yrep);
+}
+
+#endif // __SSE2__
\ No newline at end of file
diff --git a/source/src/agar/common/scaler/sse2/scaler_x25_sse2.c b/source/src/agar/common/scaler/sse2/scaler_x25_sse2.c
new file mode 100644 (file)
index 0000000..983c2e2
--- /dev/null
@@ -0,0 +1,151 @@
+/*
+ * Zoom x2.5 (8 source pixels -> 20 destination pixels)
+ * (C) 2013 K.Ohta
+ * 
+ * History:
+ *  2013-04-02 Move from scaler_x2.c
+ */
+#include "agar_sdlview.h"
+#include "api_vram.h"
+#include "api_draw.h"
+#include "sdl_cpuid.h"
+#include "cache_wrapper.h"
+
+extern struct XM7_CPUID *pCpuID;
+
+extern void pVram2RGB_x25(Uint32 *src, Uint32 *dst, int x, int y, int yrep);
+
+#if defined(__SSE2__)
+/*
+ * Scale one source scanline horizontally by 2.5 and replicate it vertically.
+ *
+ * Each group of 8 source pixels is expanded to 20 pixels
+ * (0,0,0,1 | 1,2,2,2 | 3,3,4,4 | 4,5,5,6 | 6,6,7,7) into a staging row,
+ * which is then copied to every output row.
+ *
+ *   dst    - first destination vector of the first output row
+ *   src    - first source pixel; consumed 8 pixels at a time
+ *   ww     - number of source pixels, handled in steps of 8; the staging
+ *            row has room for 80 groups
+ *   repeat - number of destination rows; <= 0 draws nothing
+ *   pitch  - destination row stride in bytes
+ *
+ * With bFullScan set or repeat < 2 all rows carry image data; otherwise the
+ * last of the repeat rows is filled with the blank colour instead.
+ */
+static void Scaler_DrawLine(v4hi *dst, Uint32 *src, int ww, int repeat, int pitch)
+{
+   v4hi staged[5 * 80];
+   const v4ui pick34 = (v4ui){3, 3, 4, 4};
+#if AG_BIG_ENDIAN != 1
+   const v4ui blank = {0xff000000, 0xff000000, 0xff000000, 0xff000000};
+#else
+   const v4ui blank = {0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff};
+#endif
+   v4hi *in = (v4hi *)src;
+   v4hi *row = dst;
+   int stride;
+   int image_rows;
+   int all_image;
+   int used = 0;   /* vectors of staged[] filled so far */
+   int n;
+
+   if(repeat <= 0) return;
+   stride = pitch / sizeof(v4hi);
+   _prefetch_data_write_l1((void *)staged, sizeof(staged));
+
+   all_image = ((bFullScan) || (repeat < 2));
+   image_rows = all_image ? repeat : repeat - 1;
+
+   /* Build the scaled row once. */
+   for(n = 0; n < ww; n += 8, in += 2, used += 5) {
+      v4hi lo = in[0];   /* source pixels 0..3 */
+      v4hi hi = in[1];   /* source pixels 4..7 */
+      v4hi *o = &staged[used];
+
+      o[0].uv = __builtin_ia32_pshufd(lo.uv, 0x40);            /* 0 0 0 1 */
+      o[1].uv = __builtin_ia32_pshufd(lo.uv, 0xa9);            /* 1 2 2 2 */
+      o[2] = (v4hi)__builtin_shuffle(lo.uv, hi.uv, pick34);    /* 3 3 4 4 */
+      o[3].uv = __builtin_ia32_pshufd(hi.uv, 0x94);            /* 4 5 5 6 */
+      o[4].uv = __builtin_ia32_pshufd(hi.uv, 0xfa);            /* 6 6 7 7 */
+   }
+
+   /* Replicate it down the image rows. */
+   for(n = 0; n < image_rows; n++, row += stride) {
+      memcpy((void *)row, (void *)staged, sizeof(v4hi) * used);
+   }
+
+   /* Scanline mode: one blank row below the image rows. */
+   if(!all_image) {
+      for(n = 0; n < used; n++) row[n].uv = blank;
+   }
+}
+
+
+
+/*
+ * Draw source columns [xbegin, xend) of source line y at x2.5 horizontal
+ * scale.
+ *
+ *   src    - source pixel buffer, addressed as src[x + y * 640]
+ *   dst    - destination bytes; the column offset is added here
+ *   xbegin - first source column
+ *   xend   - one past the last source column
+ *   y      - source line number
+ *   yrep   - number of destination rows, passed through to Scaler_DrawLine()
+ *
+ * The span is limited to half the surface width and rounded down to a
+ * multiple of 8 pixels; nothing is drawn if that leaves an empty span or
+ * there is no draw surface.
+ * NOTE(review): the w / 2 limit matches a x2 scaler; for x2.5 the output is
+ * wider than 2 * ww - confirm the surface is always wide enough.
+ */
+void pVram2RGB_x25_Line_SSE2(Uint32 *src, Uint8 *dst, int xbegin, int xend, int y, int yrep)
+{
+   AG_Surface *Surface = GetDrawSurface();
+   Uint32 *d1;
+   Uint32 *d2;
+   int w;
+   int ww;
+
+   if(Surface == NULL) return;
+   w = Surface->w;
+
+   ww = xend - xbegin;
+   if(ww > (w / 2)) ww = w / 2;
+   ww = (ww / 8) * 8;
+   if(ww <= 0) return;
+
+   /*
+    * Scaler_DrawLine() turns 8 source pixels into 20 destination pixels, so
+    * source column x starts at destination pixel x * 20 / 8.
+    */
+   d1 = (Uint32 *)(dst + ((xbegin * 20) / 8) * Surface->format->BytesPerPixel);
+   d2 = &src[xbegin + y * 640];
+   Scaler_DrawLine((v4hi *)d1, d2, ww, yrep, Surface->pitch);
+}
+
+
+#else 
+
+/*
+ * Build without SSE2: provide the same entry point as the SSE2 variant and
+ * forward to the generic scaler.
+ *
+ * The parameter list matches the SSE2 variant so callers link either way.
+ * NOTE(review): no prototype for pVram2RGB_x25_Line() is visible in this
+ * file.  The call keeps the five-argument shape used before (xend is not
+ * forwarded) - confirm against the generic scaler.
+ */
+void pVram2RGB_x25_Line_SSE2(Uint32 *src, Uint8 *dst, int xbegin, int xend, int y, int yrep)
+{
+   (void)xend;
+   pVram2RGB_x25_Line(src, dst, xbegin, y, yrep);
+}
+
+#endif // __SSE2__
\ No newline at end of file
diff --git a/source/src/agar/common/scaler/sse2/scaler_x2_sse2.c b/source/src/agar/common/scaler/sse2/scaler_x2_sse2.c
new file mode 100644 (file)
index 0000000..9114af4
--- /dev/null
@@ -0,0 +1,268 @@
+/*
+ * Zoom x2x2
+ * (C) 2013 K.Ohta
+ * 
+ * History:
+ *  2013-04-02 Move from scaler_x2.c
+ */
+#include "agar_sdlview.h"
+#include "api_vram.h"
+#include "api_draw.h"
+#include "sdl_cpuid.h"
+#include "cache_wrapper.h"
+
+extern struct XM7_CPUID *pCpuID;
+
+extern void pVram2RGB_x2(Uint32 *src, Uint32 *dst, int x, int y, int yrep);
+
+#if defined(__SSE2__)
+//          b2p = d0;
+//          b2.vv = __builtin_ia32_pshufd(b[0].v, 0x50);
+//          b3.vv = __builtin_ia32_pshufd(b[0].v, 0xfa);
+
+//          b4.vv = __builtin_ia32_pshufd(b[1].v, 0x50);
+//          b5.vv = __builtin_ia32_pshufd(b[1].v, 0xfa);
+
+/*
+ * Scale one source scanline horizontally by 2 and replicate it vertically.
+ *
+ * Each group of 8 source pixels becomes 16 destination pixels
+ * (0,0,1,1 | 2,2,3,3 | 4,4,5,5 | 6,6,7,7).
+ *
+ *   dst    - first destination pixel of the first output row
+ *   src    - first source pixel; consumed 8 pixels at a time
+ *   ww     - number of source pixels, handled in steps of 8
+ *   repeat - number of destination rows; <= 0 draws nothing
+ *   pitch  - destination row stride in bytes
+ *
+ * With bFullScan set or repeat < 2 all rows carry image data; otherwise the
+ * last of the repeat rows is filled with the blank colour instead.
+ *
+ * One code path serves every architecture.  Pixels are loaded and stored
+ * with __builtin_memcpy so that neither src nor dst needs 16-byte
+ * alignment, and no inline assembly touches the stack pointer or
+ * undeclared registers.
+ */
+static void Scaler_DrawLine(v4hi *dst, Uint32 *src, int ww, int repeat, int pitch)
+{
+#if AG_BIG_ENDIAN != 1
+   const v4ui bb = {0xff000000, 0xff000000, 0xff000000, 0xff000000};
+#else
+   const v4ui bb = {0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff};
+#endif
+   Uint8 *col = (Uint8 *)dst;   /* top of the current 16-pixel column */
+   v4hi in[2];                  /* 8 source pixels */
+   v4hi out[4];                 /* 16 scaled pixels */
+   v4hi black[4];               /* 16 blank pixels */
+   int rows;                    /* rows that receive image data */
+   int blank;                   /* non-zero: one blank row follows them */
+   int xx;
+   int yy;
+
+   if(repeat <= 0) return;
+   if((bFullScan) || (repeat < 2)) {
+      rows = repeat;
+      blank = 0;
+   } else {
+      rows = repeat - 1;
+      blank = 1;
+   }
+   black[0].uv = black[1].uv = black[2].uv = black[3].uv = bb;
+
+   for(xx = 0; xx < ww; xx += 8) {
+      Uint8 *row = col;
+
+      __builtin_memcpy(in, src, sizeof(in));
+      out[0].vv = __builtin_ia32_pshufd(in[0].vv, 0x50);   /* 0 0 1 1 */
+      out[1].vv = __builtin_ia32_pshufd(in[0].vv, 0xfa);   /* 2 2 3 3 */
+      out[2].vv = __builtin_ia32_pshufd(in[1].vv, 0x50);   /* 4 4 5 5 */
+      out[3].vv = __builtin_ia32_pshufd(in[1].vv, 0xfa);   /* 6 6 7 7 */
+
+      for(yy = 0; yy < rows; yy++) {
+         __builtin_memcpy(row, out, sizeof(out));
+         row += pitch;
+      }
+      if(blank) __builtin_memcpy(row, black, sizeof(black));
+
+      src += 8;
+      col += sizeof(out);
+   }
+}
+
+
+
+/*
+ * Draw source columns [xbegin, xend) of source line y at x2 horizontal scale.
+ *
+ *   src    - source pixel buffer, addressed as src[x + y * 640]
+ *   dst    - destination bytes; the column offset (xbegin * 2 pixels) is
+ *            added here
+ *   xbegin - first source column
+ *   xend   - one past the last source column
+ *   y      - source line number
+ *   yrep   - number of destination rows, passed through to Scaler_DrawLine()
+ *
+ * The span is limited to half the surface width and rounded down to a
+ * multiple of 8 pixels; nothing is drawn if that leaves an empty span or
+ * there is no draw surface.
+ */
+void pVram2RGB_x2_Line_SSE2(Uint32 *src, Uint8 *dst, int xbegin, int xend, int y, int yrep)
+{
+   AG_Surface *Surface = GetDrawSurface();
+   Uint32 *d1;
+   Uint32 *d2;
+   int w;
+   int ww;
+
+   if(Surface == NULL) return;
+   w = Surface->w;
+
+   ww = xend - xbegin;
+   if(ww > (w / 2)) ww = w / 2;
+   ww = (ww / 8) * 8;
+   if(ww <= 0) return;
+
+   d1 = (Uint32 *)(dst + xbegin * 2 * Surface->format->BytesPerPixel);
+   d2 = &src[xbegin + y * 640];
+   Scaler_DrawLine((v4hi *)d1, d2, ww, yrep, Surface->pitch);
+}
+
+
+#else 
+
+/*
+ * Build without SSE2: provide the same entry point as the SSE2 variant and
+ * forward to the generic scaler.
+ *
+ * The parameter list matches the SSE2 variant so callers link either way.
+ * NOTE(review): no prototype for pVram2RGB_x2_Line() is visible in this
+ * file.  The call keeps the five-argument shape used before (xend is not
+ * forwarded) - confirm against the generic scaler.
+ */
+void pVram2RGB_x2_Line_SSE2(Uint32 *src, Uint8 *dst, int xbegin, int xend, int y, int yrep)
+{
+   (void)xend;
+   pVram2RGB_x2_Line(src, dst, xbegin, y, yrep);
+}
+
+#endif // __SSE2__
\ No newline at end of file
diff --git a/source/src/agar/common/scaler/sse2/scaler_x3_sse2.c b/source/src/agar/common/scaler/sse2/scaler_x3_sse2.c
new file mode 100644 (file)
index 0000000..5c44405
--- /dev/null
@@ -0,0 +1,145 @@
+/*
+ * Zoom x3
+ * (C) 2014 K.Ohta
+ * 
+ * History:
+ *  2013-04-02 Move from scaler_x2.c
+ */
+#include "agar_sdlview.h"
+#include "api_vram.h"
+#include "api_draw.h"
+#include "sdl_cpuid.h"
+#include "cache_wrapper.h"
+
+extern struct XM7_CPUID *pCpuID;
+
+extern void pVram2RGB_x3_Line(Uint32 *src, Uint32 *dst, int x, int y, int yrep);
+
+#if defined(__SSE2__)
+/*
+ * Scale one source scanline horizontally by 3 and replicate it vertically.
+ *
+ * Each group of 8 source pixels becomes 24 destination pixels (every pixel
+ * three times), written as six vectors per row.
+ *
+ *   dst    - first destination vector of the first output row
+ *   src    - first source pixel; consumed 8 pixels at a time
+ *   ww     - number of source pixels, handled in steps of 8
+ *   repeat - number of destination rows; <= 0 draws nothing
+ *   pitch  - destination row stride in bytes
+ *
+ * With bFullScan set or repeat < 2 all rows carry image data; otherwise the
+ * last of the repeat rows is filled with the blank colour instead.
+ */
+static inline void  Scaler_DrawLine(v4hi *dst, Uint32 *src, int ww, int repeat, int pitch)
+{
+#if AG_BIG_ENDIAN != 1
+   const v4ui blank = {0xff000000, 0xff000000, 0xff000000, 0xff000000};
+#else
+   const v4ui blank = {0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff};
+#endif
+   v4hi *in = (v4hi *)src;
+   v4hi out[6];
+   int stride;
+   int image_rows;
+   int blank_rows;
+   int col;
+   int row;
+   int k;
+
+   if(repeat <= 0) return;
+
+   blank_rows = ((bFullScan) || (repeat < 2)) ? 0 : 1;
+   image_rows = repeat - blank_rows;
+   stride = pitch / sizeof(v4hi);
+
+   for(col = 0; col < ww; col += 8, dst += 6) {
+      v4hi *line = dst;
+      v4hi lo = *in++;   /* source pixels 0..3 */
+      v4hi hi = *in++;   /* source pixels 4..7 */
+
+      // 76543210 -> 777666555444333222111000
+      out[0].uv = (v4ui){lo.i[0], lo.i[0], lo.i[0], lo.i[1]};
+      out[1].uv = (v4ui){lo.i[1], lo.i[1], lo.i[2], lo.i[2]};
+      out[2].uv = (v4ui){lo.i[2], lo.i[3], lo.i[3], lo.i[3]};
+      out[3].uv = (v4ui){hi.i[0], hi.i[0], hi.i[0], hi.i[1]};
+      out[4].uv = (v4ui){hi.i[1], hi.i[1], hi.i[2], hi.i[2]};
+      out[5].uv = (v4ui){hi.i[2], hi.i[3], hi.i[3], hi.i[3]};
+
+      for(row = 0; row < image_rows; row++, line += stride) {
+         for(k = 0; k < 6; k++) line[k] = out[k];
+      }
+      for(row = 0; row < blank_rows; row++, line += stride) {
+         for(k = 0; k < 6; k++) line[k].uv = blank;
+      }
+   }
+}
+
+
+
+
+/*
+ * Draw source columns [xbegin, xend) of source line y at x3 horizontal scale.
+ *
+ *   src    - source pixel buffer, addressed as src[x + y * 640]
+ *   dst    - destination bytes; the column offset (xbegin * 3 pixels) is
+ *            added here
+ *   xbegin - first source column
+ *   xend   - one past the last source column
+ *   y      - source line number
+ *   yrep   - number of destination rows, passed through to Scaler_DrawLine()
+ *
+ * The span is rounded down to a multiple of 8 pixels; nothing is drawn if
+ * that leaves an empty span or there is no draw surface.  The span is not
+ * clipped against the surface width here.
+ */
+void pVram2RGB_x3_Line_SSE2(Uint32 *src, Uint8 *dst, int xbegin, int xend, int y, int yrep)
+{
+   AG_Surface *Surface = GetDrawSurface();
+   Uint32 *d1;
+   Uint32 *d2;
+   int ww;
+
+   if(Surface == NULL) return;
+
+   ww = ((xend - xbegin) / 8) * 8;
+   if(ww <= 0) return;
+
+   d1 = (Uint32 *)(dst + xbegin * 3 * Surface->format->BytesPerPixel);
+   d2 = &src[xbegin + y * 640];
+   Scaler_DrawLine((v4hi *)d1, d2, ww, yrep, Surface->pitch);
+}
+
+
+#else 
+
+/*
+ * Build without SSE2: provide the same entry point as the SSE2 variant and
+ * forward to the generic scaler.
+ *
+ * The parameter list matches the SSE2 variant so callers link either way.
+ * The call follows the pVram2RGB_x3_Line() prototype declared at the top of
+ * this file (Uint32 *dst, a single x, no xend).
+ * NOTE(review): confirm that prototype against the generic scaler.
+ */
+void pVram2RGB_x3_Line_SSE2(Uint32 *src, Uint8 *dst, int xbegin, int xend, int y, int yrep)
+{
+   (void)xend;
+   pVram2RGB_x3_Line(src, (Uint32 *)dst, xbegin, y, yrep);
+}
+
+#endif // __SSE2__
\ No newline at end of file
diff --git a/source/src/agar/common/scaler/sse2/scaler_x45_sse2.c b/source/src/agar/common/scaler/sse2/scaler_x45_sse2.c
new file mode 100644 (file)
index 0000000..914b83d
--- /dev/null
@@ -0,0 +1,295 @@
+/*
+ * Zoom x4.5
+ * (C) 2014 K.Ohta
+ * 
+ * History:
+ *  2013-04-02 Move from scaler_x2.c
+ */
+#include "agar_sdlview.h"
+#include "api_vram.h"
+#include "api_draw.h"
+#include "sdl_cpuid.h"
+#include "cache_wrapper.h"
+
+extern struct XM7_CPUID *pCpuID;
+
+extern void pVram2RGB_x45_Line(Uint32 *src, Uint32 *dst, int x, int y, int yrep);
+
+#if defined(__SSE2__)
+/*
+ * Scale one source scanline horizontally by 4.5 and replicate it vertically.
+ *
+ * Each group of 8 source pixels becomes 36 destination pixels: even source
+ * pixels are written 5 times and odd ones 4 times
+ * (76543210 -> 7777 66666 5555 44444 3333 22222 1111 00000).
+ *
+ *   dst    - first destination pixel of the first output row
+ *   src    - first source pixel; consumed 8 pixels at a time
+ *   ww     - number of source pixels, handled in steps of 8
+ *   repeat - number of destination rows; <= 0 draws nothing
+ *   pitch  - destination row stride in bytes
+ *
+ * With bFullScan set or repeat < 2 all rows carry image data; otherwise the
+ * last of the repeat rows is filled with the blank colour instead.
+ *
+ * One code path serves every architecture, so all builds produce the same
+ * pixel pattern and put the blank row below the image rows.  Pixels are
+ * loaded and stored with __builtin_memcpy so that neither src nor dst needs
+ * 16-byte alignment.
+ */
+static void Scaler_DrawLine(v4hi *dst, Uint32 *src, int ww, int repeat, int pitch)
+{
+#if AG_BIG_ENDIAN != 1
+   const v4ui bb = {0xff000000, 0xff000000, 0xff000000, 0xff000000};
+#else
+   const v4ui bb = {0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff};
+#endif
+   Uint8 *col = (Uint8 *)dst;   /* top of the current 36-pixel column */
+   v4hi in[2];                  /* 8 source pixels */
+   v4hi out[9];                 /* 36 scaled pixels */
+   v4hi black[9];               /* 36 blank pixels */
+   int rows;                    /* rows that receive image data */
+   int blank;                   /* non-zero: one blank row follows them */
+   int xx;
+   int yy;
+
+   if(repeat <= 0) return;
+   if((bFullScan) || (repeat < 2)) {
+      rows = repeat;
+      blank = 0;
+   } else {
+      rows = repeat - 1;
+      blank = 1;
+   }
+   for(yy = 0; yy < 9; yy++) black[yy].uv = bb;
+
+   for(xx = 0; xx < ww; xx += 8) {
+      Uint8 *row = col;
+
+      __builtin_memcpy(in, src, sizeof(in));
+      out[0].uv = (v4ui){in[0].i[0], in[0].i[0], in[0].i[0], in[0].i[0]};
+      out[1].uv = (v4ui){in[0].i[0], in[0].i[1], in[0].i[1], in[0].i[1]};
+      out[2].uv = (v4ui){in[0].i[1], in[0].i[2], in[0].i[2], in[0].i[2]};
+      out[3].uv = (v4ui){in[0].i[2], in[0].i[2], in[0].i[3], in[0].i[3]};
+      out[4].uv = (v4ui){in[0].i[3], in[0].i[3], in[1].i[0], in[1].i[0]};
+
+      out[5].uv = (v4ui){in[1].i[0], in[1].i[0], in[1].i[0], in[1].i[1]};
+      out[6].uv = (v4ui){in[1].i[1], in[1].i[1], in[1].i[1], in[1].i[2]};
+      out[7].uv = (v4ui){in[1].i[2], in[1].i[2], in[1].i[2], in[1].i[2]};
+      out[8].uv = (v4ui){in[1].i[3], in[1].i[3], in[1].i[3], in[1].i[3]};
+
+      for(yy = 0; yy < rows; yy++) {
+         __builtin_memcpy(row, out, sizeof(out));
+         row += pitch;
+      }
+      if(blank) __builtin_memcpy(row, black, sizeof(black));
+
+      src += 8;
+      col += sizeof(out);
+   }
+}
+
+
+
+/*
+ * Draw source columns [xbegin, xend) of source line y at x4.5 horizontal
+ * scale.
+ *
+ *   src    - source pixel buffer, addressed as src[x + y * 640]
+ *   dst    - destination bytes; the column offset (xbegin * 18 / 4 pixels)
+ *            is added here
+ *   xbegin - first source column
+ *   xend   - one past the last source column
+ *   y      - source line number
+ *   yrep   - number of destination rows, passed through to Scaler_DrawLine()
+ *
+ * The span is rounded down to a multiple of 8 pixels; nothing is drawn if
+ * that leaves an empty span or there is no draw surface.  The span is not
+ * clipped against the surface width here.
+ */
+void pVram2RGB_x45_Line_SSE2(Uint32 *src, Uint8 *dst, int xbegin, int xend, int y, int yrep)
+{
+   AG_Surface *Surface = GetDrawSurface();
+   Uint32 *d1;
+   Uint32 *d2;
+   int ww;
+
+   if(Surface == NULL) return;
+
+   ww = ((xend - xbegin) / 8) * 8;
+   if(ww <= 0) return;
+
+   d1 = (Uint32 *)(dst + ((xbegin * 18) / 4) * Surface->format->BytesPerPixel);
+   d2 = &src[xbegin + y * 640];
+   Scaler_DrawLine((v4hi *)d1, d2, ww, yrep, Surface->pitch);
+}
+
+
+#else 
+
+/*
+ * Build without SSE2: provide the same entry point as the SSE2 variant and
+ * forward to the generic scaler.
+ *
+ * The parameter list matches the SSE2 variant so callers link either way.
+ * The call follows the pVram2RGB_x45_Line() prototype declared at the top of
+ * this file (Uint32 *dst, a single x, no xend).
+ * NOTE(review): confirm that prototype against the generic scaler.
+ */
+void pVram2RGB_x45_Line_SSE2(Uint32 *src, Uint8 *dst, int xbegin, int xend, int y, int yrep)
+{
+   (void)xend;
+   pVram2RGB_x45_Line(src, (Uint32 *)dst, xbegin, y, yrep);
+}
+
+#endif // __SSE2__
\ No newline at end of file
diff --git a/source/src/agar/common/scaler/sse2/scaler_x4_sse2.c b/source/src/agar/common/scaler/sse2/scaler_x4_sse2.c
new file mode 100644 (file)
index 0000000..e99819e
--- /dev/null
@@ -0,0 +1,163 @@
+/*
+ * Zoom x4x4
+ * (C) 2013 K.Ohta
+ * 
+ * History:
+ *  2013-01-26 Move from agar_sdlscaler.cpp
+ *  2013-09-17 Move from scaler/generic/scaler_x4.c
+ */
+
+#include "agar_sdlview.h"
+#include "api_vram.h"
+#include "api_draw.h"
+#include "sdl_cpuid.h"
+#include "cache_wrapper.h"
+
+extern struct XM7_CPUID *pCpuID;
+
+#if defined(__SSE2__)
+/*
+ * Render one emulated scanline zoomed x4 horizontally (SSE2 build).
+ *
+ * The line is read from src[xbegin + y * 640] and written starting
+ * xbegin * 4 pixels into dst.  Source pixels are handled in groups of 8;
+ * each group becomes 8 vectors (32 destination pixels).  For yrep >= 2 the
+ * group is written to yrep consecutive destination rows, and unless
+ * bFullScan is set the last of those rows is filled with `black` instead.
+ *
+ * Returns without drawing when no draw surface is available or when fewer
+ * than 8 source pixels remain after clamping to a quarter of the surface
+ * width.
+ */
+void pVram2RGB_x4_Line_SSE2(Uint32 *src, Uint8 *dst, int xbegin, int xend, int y, int yrep)
+{
+   register v4hi *b;
+   AG_Surface *Surface = GetDrawSurface();
+   Uint32 *d1;
+   Uint32 *d2;
+   Uint32 *p;
+   int w;
+   int h;
+   int yy;
+   int xx;
+   int hh;
+   int ww;
+   int i;
+   int x = xbegin;
+   int yrep2;
+   unsigned  pitch;
+   Uint32 black;
+   if(Surface == NULL) return;
+//   AG_SurfaceLock(Surface);
+   w = Surface->w;
+   h = Surface->h;
+   
+   ww = xend - xbegin;
+   /* Never read more source pixels than fit in the surface at x4. */
+   if(ww > (w / 4)) ww = w / 4;
+   /* With the xx += 8 loops below, subtracting 7 means only complete
+    * 8-pixel groups are processed. */
+   ww = ww - 7;
+   if(ww <= 0) return;
+//   if(yrep < 2) {
+//      if(y >= h) return;
+//   } else {
+//      if(y >= (h / (yrep >> 1))) return;/
+//   }
+   
+#if AG_BIG_ENDIAN != 1
+   black = 0xff000000;
+#else
+   black = 0x000000ff;
+#endif
+   yrep2 = yrep;
+   d1 = (Uint32 *)(dst+ x * 4 * Surface->format->BytesPerPixel);
+   d2 = &src[x + y * 640];
+   
+   /* Row stride of the surface, expressed in Uint32 units. */
+   pitch = Surface->pitch / sizeof(Uint32);
+   { // Not thinking align ;-(
+       
+    int j;
+    register v4hi b2, b3, b4, b5, b6, b7, b8, b9;
+    register v4hi bb;
+    register v4hi bx0, bx1;
+    v4hi *b2p;
+    Uint32 *d0;
+      
+    b = (v4hi *)d2;
+    /* Vector of four `black` pixels used for the blanked row. */
+    bb.i[0] = bb.i[1] = bb.i[2] = bb.i[3] = black;
+    //if((((y * yrep2) % 16) == 0) && ((yrep2 % 16) != 0)) yrep2 += 16;
+    //yrep2 >>= 4;
+       switch(yrep2) {
+       case 0:
+       case 1:
+//     case 2:
+         /* Single output row: no vertical repeat and no blanked row. */
+         _prefetch_data_write_l2(d1, sizeof(v4hi) * 8 * ww);
+         for(xx = 0; xx < ww; xx += 8) {
+            b2p = (v4hi *)d1;
+            bx0 = b[0];
+            bx1 = b[1];
+            /* pshufd 0x00/0x55/0xaa/0xff broadcasts element 0/1/2/3, so
+             * each source pixel fills one whole output vector. */
+            b2.vv = __builtin_ia32_pshufd(bx0.vv, 0x00);
+            b3.vv = __builtin_ia32_pshufd(bx0.vv, 0x55);
+            b4.vv = __builtin_ia32_pshufd(bx0.vv, 0xaa);
+            b5.vv = __builtin_ia32_pshufd(bx0.vv, 0xff);
+
+            b6.vv = __builtin_ia32_pshufd(bx1.vv, 0x00);
+            b7.vv = __builtin_ia32_pshufd(bx1.vv, 0x55);
+            b8.vv = __builtin_ia32_pshufd(bx1.vv, 0xaa);
+            b9.vv = __builtin_ia32_pshufd(bx1.vv, 0xff);
+            
+            *b2p++ = b2;
+            *b2p++ = b3;
+            *b2p++ = b4;
+            *b2p++ = b5;
+            *b2p++ = b6;
+            *b2p++ = b7;
+            *b2p++ = b8;
+            *b2p++ = b9;
+            /* 8 source pixels produced 32 destination pixels. */
+            d1 += 32;
+            b += 2;
+         }
+         break;
+       default:
+         /* yrep2 output rows per source line; d0 tracks the column
+          * position in the first row, d1 walks down the rows. */
+         d0 = d1;
+         _prefetch_data_write_l2(d1, sizeof(v4hi) * 8 * ww);
+         for(xx = 0; xx < ww; xx += 8){
+            d1 = d0;
+            b2p = (v4hi *)d1;
+            bx0 = b[0];
+            bx1 = b[1];
+            b2.vv = __builtin_ia32_pshufd(bx0.vv, 0x00);
+            b3.vv = __builtin_ia32_pshufd(bx0.vv, 0x55);
+            b4.vv = __builtin_ia32_pshufd(bx0.vv, 0xaa);
+            b5.vv = __builtin_ia32_pshufd(bx0.vv, 0xff);
+
+            b6.vv = __builtin_ia32_pshufd(bx1.vv, 0x00);
+            b7.vv = __builtin_ia32_pshufd(bx1.vv, 0x55);
+            b8.vv = __builtin_ia32_pshufd(bx1.vv, 0xaa);
+            b9.vv = __builtin_ia32_pshufd(bx1.vv, 0xff);
+            
+            for(j = 0; j < yrep2; j++) {
+               b2p = (v4hi *)d1;
+               _prefetch_data_write_l2(d1, sizeof(v4hi) * 8);
+               /* Last repeated row is blanked unless full-scan is on. */
+               if(!bFullScan && (j >= (yrep2 - 1))) {
+                  b2p[0] = 
+                  b2p[1] = 
+                  b2p[2] = 
+                  b2p[3] = 
+                  b2p[4] = 
+                  b2p[5] = 
+                  b2p[6] = 
+                  b2p[7] = bb;
+                } else {
+                   b2p[0] = b2;
+                   b2p[1] = b3;
+                   b2p[2] = b4;
+                   b2p[3] = b5;
+                   b2p[4] = b6;
+                   b2p[5] = b7;
+                   b2p[6] = b8;
+                   b2p[7] = b9;
+               }
+               /* Step down one destination row. */
+               d1 += pitch;
+            }
+            d0 += 32;
+            b += 2;
+         }
+         break;
+       }
+   }
+//   AG_SurfaceUnlock(Surface);
+}
+
+
+#else // NON-SSE2
+/*
+ * Non-SSE2 build: forward to the generic x4 scaler.
+ * NOTE(review): this fallback is named pVram2RGB_x4_SSE2_Line and takes a
+ * float yrep, whereas the SSE2 build defines
+ * pVram2RGB_x4_Line_SSE2(src, dst, xbegin, xend, y, int yrep) -- confirm
+ * which symbol and prototype callers expect.
+ */
+void pVram2RGB_x4_SSE2_Line(Uint32 *src, int xbegin, int xend, int y, float yrep)
+{
+   pVram2RGB_x4_Line(src, xbegin, xend, y, yrep);
+}
+#endif
\ No newline at end of file
diff --git a/source/src/agar/common/scaler/sse2/scaler_x5_sse2.c b/source/src/agar/common/scaler/sse2/scaler_x5_sse2.c
new file mode 100644 (file)
index 0000000..f1105d0
--- /dev/null
@@ -0,0 +1,152 @@
+/*
+ * Zoom x5
+ * (C) 2013 K.Ohta
+ * 
+ * History:
+ *  2013-04-02 Move from scaler_x2.c
+ */
+#include "agar_sdlview.h"
+#include "api_vram.h"
+#include "api_draw.h"
+#include "sdl_cpuid.h"
+#include "cache_wrapper.h"
+
+extern struct XM7_CPUID *pCpuID;
+
+extern void pVram2RGB_x5_Line(Uint32 *src, Uint32 *dst, int x, int y, int yrep);
+
+#if defined(__SSE2__)
+/*
+ * Expand `ww` source pixels to five times their width and write the result
+ * to `repeat` consecutive destination rows.
+ *
+ * dst    : first destination row.
+ * src    : source pixels, read 8 at a time (two vectors).
+ * ww     : number of source pixels; clamped to the 640 that the staging
+ *          buffer can hold.
+ * repeat : number of destination rows to fill; nothing is drawn if <= 0.
+ * pitch  : destination row stride in bytes.
+ *
+ * Unless bFullScan is set or repeat < 2, the last of the `repeat` rows is
+ * filled with the opaque-black constant instead of image data.
+ */
+static void Scaler_DrawLine(v4hi *dst, Uint32 *src, int ww, int repeat, int pitch)
+{
+   int xx;
+   int yy;
+   int rows;       /* rows that receive image data */
+   int scanline;   /* non-zero: append one black row after the image rows */
+   v4hi *b2p;
+   register v4hi r1, r2;
+   register v4hi *b;
+   v4hi r3v[10 * 80];
+   int ip;
+   int pitch2;
+#if AG_BIG_ENDIAN != 1
+   v4ui bb2 = {0xff000000, 0xff000000, 0xff000000, 0xff000000};
+#else
+   v4ui bb2 = {0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff};
+#endif
+     
+   if(repeat <= 0) return;
+   /* r3v holds 80 groups of 10 vectors; a wider request would overrun it. */
+   if(ww > 8 * 80) ww = 8 * 80;
+   b = (v4hi *)src;
+   b2p = dst;
+   pitch2 = pitch / sizeof(v4hi);
+   scanline = !((bFullScan) || (repeat < 2));
+   rows = scanline ? (repeat - 1) : repeat;
+
+   /* Build the whole expanded line once in r3v. */
+   _prefetch_data_write_l1(r3v, sizeof(r3v));
+   ip = 0;
+   for(xx = 0; xx < ww; xx += 8) {
+      r1 = b[0];
+      r2 = b[1];
+      /* Five shuffles spread 4 source pixels over 20 output pixels. */
+      r3v[ip + 0].uv  = __builtin_ia32_pshufd(r1.uv, 0b00000000); // 0000
+      r3v[ip + 1].uv  = __builtin_ia32_pshufd(r1.uv, 0b01010100); // 0111
+      r3v[ip + 2].uv  = __builtin_ia32_pshufd(r1.uv, 0b10100101); // 1122
+      r3v[ip + 3].uv  = __builtin_ia32_pshufd(r1.uv, 0b11101010); // 2223
+      r3v[ip + 4].uv  = __builtin_ia32_pshufd(r1.uv, 0b11111111); // 3333
+
+      r3v[ip + 5].uv  = __builtin_ia32_pshufd(r2.uv, 0b00000000); // 0000
+      r3v[ip + 6].uv  = __builtin_ia32_pshufd(r2.uv, 0b01010100); // 0111
+      r3v[ip + 7].uv  = __builtin_ia32_pshufd(r2.uv, 0b10100101); // 1122
+      r3v[ip + 8].uv  = __builtin_ia32_pshufd(r2.uv, 0b11101010); // 2223
+      r3v[ip + 9].uv  = __builtin_ia32_pshufd(r2.uv, 0b11111111); // 3333
+
+      ip += 10;
+      b += 2;
+   }
+
+   /* Copy the staged line to every image row. */
+   _prefetch_data_read_l1(r3v, sizeof(r3v));
+   for(yy = 0; yy < rows; yy++) {
+      memcpy(b2p, r3v, ip * sizeof(v4hi));
+      b2p = b2p + pitch2;
+   }
+   if(scanline) {
+      for(xx = 0; xx < ip; xx++) b2p[xx].uv = bb2;
+   }
+}
+
+
+
+/*
+ * Render one emulated scanline zoomed x5 horizontally (SSE2 build).
+ *
+ * src    : base of the source pixel buffer; the line is read starting at
+ *          src[xbegin + y * 640].
+ * dst    : start of the destination row; output begins xbegin * 5 pixels
+ *          into it.
+ * xbegin : first source column.
+ * xend   : one past the last source column.
+ * y      : source line number.
+ * yrep   : number of destination rows, handed to Scaler_DrawLine().
+ *
+ * Returns without drawing when no draw surface is available or when fewer
+ * than 8 source pixels were requested.
+ */
+void pVram2RGB_x5_Line_SSE2(Uint32 *src, Uint8 *dst, int xbegin, int xend, int y, int yrep)
+{
+   AG_Surface *Surface = GetDrawSurface();
+   Uint32 *d1;
+   Uint32 *d2;
+   int ww;
+
+   if(Surface == NULL) return;
+
+   /* Scaler_DrawLine() consumes 8 source pixels per step: round down. */
+   ww = ((xend - xbegin) / 8) * 8;
+   if(ww <= 0) return;
+
+   d1 = (Uint32 *)((Uint8 *)dst + xbegin * 5 * Surface->format->BytesPerPixel);
+   d2 = &src[xbegin + y * 640];
+   Scaler_DrawLine((v4hi *)d1, (Uint32 *)d2, ww, yrep, Surface->pitch);
+}
+
+
+#else 
+
+/*
+ * Non-SSE2 build: expose the same entry point as the SSE2 build and forward
+ * to the generic scaler declared at the top of this file.  xend is not used
+ * by the generic routine.
+ */
+void pVram2RGB_x5_Line_SSE2(Uint32 *src, Uint8 *dst, int xbegin, int xend, int y, int yrep)
+{
+   (void)xend;
+   pVram2RGB_x5_Line(src, (Uint32 *)dst, xbegin, y, yrep);
+}
+
+#endif // __SSE2__
\ No newline at end of file
diff --git a/source/src/agar/common/scaler/sse2/scaler_x6_sse2.c b/source/src/agar/common/scaler/sse2/scaler_x6_sse2.c
new file mode 100644 (file)
index 0000000..3952762
--- /dev/null
@@ -0,0 +1,191 @@
+/*
+ * Zoom x6
+ * (C) 2013 K.Ohta
+ * 
+ * History:
+ *  2013-04-02 Move from scaler_x2.c
+ */
+#include "agar_sdlview.h"
+#include "api_vram.h"
+#include "api_draw.h"
+#include "sdl_cpuid.h"
+#include "cache_wrapper.h"
+
+extern struct XM7_CPUID *pCpuID;
+
+extern void pVram2RGB_x6_Line(Uint32 *src, Uint32 *dst, int x, int y, int yrep);
+
+#if defined(__SSE2__)
+/*
+ * Expand `ww` source pixels to six times their width and write the result
+ * to `repeat` consecutive destination rows.
+ *
+ * dst    : first destination row.
+ * src    : source pixels, read 8 at a time (two vectors).
+ * ww     : number of source pixels.
+ * repeat : number of destination rows to fill; nothing is drawn if <= 0.
+ * pitch  : destination row stride in bytes.
+ *
+ * Each group of 8 source pixels yields 12 vectors (48 pixels) per row.
+ * Unless bFullScan is set or repeat < 2, the last of the `repeat` rows is
+ * filled with the opaque-black constant instead of image data.
+ */
+static void Scaler_DrawLine(v4hi *dst, Uint32 *src, int ww, int repeat, int pitch)
+{
+   int xx;
+   int yy;
+   int k;
+   int rows;       /* rows that receive image data */
+   int scanline;   /* non-zero: append one black row after the image rows */
+   v4hi *b2p;
+   v4hi r1, r2;
+   v4hi px[12];
+   v4hi *b;
+   int pitch2;
+#if AG_BIG_ENDIAN != 1
+   const v4ui bb = {0xff000000, 0xff000000, 0xff000000, 0xff000000};
+#else
+   const v4ui bb = {0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff};
+#endif
+     
+   if(repeat <= 0) return;
+   b = (v4hi *)src;
+   pitch2 = pitch / sizeof(v4hi);
+   scanline = !((bFullScan) || (repeat < 2));
+   rows = scanline ? (repeat - 1) : repeat;
+
+   for(xx = 0; xx < ww; xx += 8) {
+      b2p = dst;
+      r1 = *b++;
+      r2 = *b++;
+
+      /* Every source pixel is emitted six times: 4 + 2, then 2 + 4. */
+      px[0].uv   = (v4ui){r1.i[0], r1.i[0], r1.i[0], r1.i[0]};
+      px[1].uv   = (v4ui){r1.i[0], r1.i[0], r1.i[1], r1.i[1]};
+      px[2].uv   = (v4ui){r1.i[1], r1.i[1], r1.i[1], r1.i[1]};
+      px[3].uv   = (v4ui){r1.i[2], r1.i[2], r1.i[2], r1.i[2]};
+      px[4].uv   = (v4ui){r1.i[2], r1.i[2], r1.i[3], r1.i[3]};
+      px[5].uv   = (v4ui){r1.i[3], r1.i[3], r1.i[3], r1.i[3]};
+
+      px[6].uv   = (v4ui){r2.i[0], r2.i[0], r2.i[0], r2.i[0]};
+      px[7].uv   = (v4ui){r2.i[0], r2.i[0], r2.i[1], r2.i[1]};
+      px[8].uv   = (v4ui){r2.i[1], r2.i[1], r2.i[1], r2.i[1]};
+      px[9].uv   = (v4ui){r2.i[2], r2.i[2], r2.i[2], r2.i[2]};
+      px[10].uv  = (v4ui){r2.i[2], r2.i[2], r2.i[3], r2.i[3]};
+      px[11].uv  = (v4ui){r2.i[3], r2.i[3], r2.i[3], r2.i[3]};
+
+      for(yy = 0; yy < rows; yy++) {
+         for(k = 0; k < 12; k++) b2p[k] = px[k];
+         b2p = b2p + pitch2;
+      }
+      if(scanline) {
+         for(k = 0; k < 12; k++) b2p[k].uv = bb;
+      }
+      /* 12 vectors were written per row, so advance by 12 in both modes
+       * (the full-scan path used to advance by 10 and overlapped groups). */
+      dst += 12;
+   }
+}
+
+
+
+/*
+ * Render one emulated scanline zoomed x6 horizontally (SSE2 build).
+ *
+ * src    : base of the source pixel buffer; the line is read starting at
+ *          src[xbegin + y * 640].
+ * dst    : start of the destination row; output begins xbegin * 6 pixels
+ *          into it.
+ * xbegin : first source column.
+ * xend   : one past the last source column.
+ * y      : source line number.
+ * yrep   : number of destination rows, handed to Scaler_DrawLine().
+ *
+ * Returns without drawing when no draw surface is available or when fewer
+ * than 8 source pixels were requested.
+ */
+void pVram2RGB_x6_Line_SSE2(Uint32 *src, Uint8 *dst, int xbegin, int xend, int y, int yrep)
+{
+   AG_Surface *Surface = GetDrawSurface();
+   Uint32 *d1;
+   Uint32 *d2;
+   int ww;
+
+   if(Surface == NULL) return;
+
+   /* Scaler_DrawLine() consumes 8 source pixels per step: round down. */
+   ww = ((xend - xbegin) / 8) * 8;
+   if(ww <= 0) return;
+
+   d1 = (Uint32 *)((Uint8 *)dst + xbegin * 6 * Surface->format->BytesPerPixel);
+   d2 = &src[xbegin + y * 640];
+   Scaler_DrawLine((v4hi *)d1, (Uint32 *)d2, ww, yrep, Surface->pitch);
+}
+
+
+#else 
+
+/*
+ * Non-SSE2 build: expose the same entry point as the SSE2 build and forward
+ * to the generic scaler declared at the top of this file.  xend is not used
+ * by the generic routine.
+ */
+void pVram2RGB_x6_Line_SSE2(Uint32 *src, Uint8 *dst, int xbegin, int xend, int y, int yrep)
+{
+   (void)xend;
+   pVram2RGB_x6_Line(src, (Uint32 *)dst, xbegin, y, yrep);
+}
+
+#endif // __SSE2__
\ No newline at end of file
index 7996fa8..e983b0b 100644 (file)
@@ -25,6 +25,7 @@ typedef struct {
        DWORD dwDataLength;
 } wavheader_t;
 
+
 void EMU::AudioCallbackSDL(void *udata, Uint8 *stream, int len)
 {
    int pos;
@@ -100,7 +101,7 @@ void EMU::initialize_sound()
        
        // secondary buffer
        uBufSize = (100 * SndSpecPresented.freq * SndSpecPresented.channels * 2) / 1000;
-        pSoundBuf = malloc(uBufSize * sizeof(sint16_t)); 
+        pSoundBuf = malloc(uBufSize * sizeof(Sint16)); 
         if(pSoundBuf == NULL) {
           SDL_CloseAudio();
           return;
@@ -112,6 +113,7 @@ void EMU::initialize_sound()
           return;
        }
    
+        ZeroMemory(pSoundBuf, uBufSize * sizeof(Sint16));
         sound_ok = first_half = true;
 }
 
@@ -142,7 +144,7 @@ void EMU::update_sound(int* extra_frames)
        
        if(sound_ok) {
                DWORD play_c, offset, size1, size2;
-               sint16_t *ptr1, *ptr2;
+               Sint16 *ptr1, *ptr2;
                
                // start play
                if(!sound_started) {
@@ -152,7 +154,7 @@ void EMU::update_sound(int* extra_frames)
                }
                SDL_LockAudio();
                // check current position
-               play_c = nSndWritePos * sizeof(sint16_t);
+               play_c = nSndWritePos * sizeof(Sint16);
                if(first_half) {
                        if(play_c < (uBufSize / 2)) {
                                SDL_UnlockAudio();
@@ -169,7 +171,7 @@ void EMU::update_sound(int* extra_frames)
                SDL_UnlockAudio();
                
                // sound buffer must be updated
-               uint16* sound_buffer = vm->create_sound(extra_frames);
+               uint16_t* sound_buffer = (uint16_t)vm->create_sound(extra_frames);
                if(now_rec_sound) {
                        // record sound
                        if(sound_samples > rec_buffer_ptr) {
@@ -211,26 +213,30 @@ void EMU::update_sound(int* extra_frames)
                        int pos;
                        int pos2;
                        SDL_LockAudio();
-                       ssize = sound_samples * SndSpecPresented.channels;
-                       pos = nSndDataPos;
-                       pos2 = pos + ssize;
-                       ptr1 = &pSoundBuf[pos];
-                       if(pos2 >= uBufSize) {
-                          size1 = uBufSize - pos;
-                          size2 = pos2 - uBufSize;
-                          ptr2 = &pSoundBuf[0];
-                       } else {
-                          size1 = ssize;
-                          size2 = 0;
-                          ptr2 = NULL;
-                       }
-                       if(ptr1) {
-                               CopyMemory(ptr1, sound_buffer, size1);
-                       }
-                       if(ptr2) {
-                               CopyMemory(ptr2, sound_buffer + size1, size2);
+                       if(pSndApplySem) {
+                               SDL_SemWait(pSndApplySem);
+                               ssize = sound_samples * SndSpecPresented.channels;
+                               pos = nSndDataPos;
+                               pos2 = pos + ssize;
+                               ptr1 = &pSoundBuf[pos];
+                               if(pos2 >= uBufSize) {
+                                       size1 = uBufSize - pos;
+                                       size2 = pos2 - uBufSize;
+                                       ptr2 = &pSoundBuf[0];
+                               } else {
+                                       size1 = ssize;
+                                       size2 = 0;
+                                       ptr2 = NULL;
+                               }
+                               if(ptr1) {
+                                       CopyMemory(ptr1, sound_buffer, size1 * sizeof(Sint16));
+                               }
+                               if(ptr2) {
+                                       CopyMemory(ptr2, sound_buffer + size1, size2 * sizeof(Sint16));
+                               }
+                               nSndDataPos = (nSndDataPos + ssize) % uBufSize;
+                               SDL_SemPost(pSndApplySem);
                        }
-                       nSndDataPos = (nSndDataPos + ssize) % uBufSize;
                        SDL_UnlockAudio();
                }
                SDL_PauseAudio(0);
@@ -243,16 +249,40 @@ void EMU::mute_sound()
        if(!now_mute && sound_ok) {
                // check current position
                DWORD size1, size2;
+               
                WORD *ptr1, *ptr2;
                // WIP
-               
-               if(ptr1) {
-                       ZeroMemory(ptr1, size1);
-               }
-               if(ptr2) {
-                       ZeroMemory(ptr2, size2);
+               int ssize;
+               int pos;
+               int pos2;
+               if(pSndApplySem) { 
+                       SDL_SemWait(pSndApplySem);
+                       SDL_LockAudio();
+                       ssize = sound_samples * SndSpecPresented.channels;
+                       pos = nSndDataPos;
+                       pos2 = pos + ssize;
+                       ptr1 = &pSoundBuf[pos];
+                       if(pos2 >= uBufSize) {
+                               size1 = uBufSize - pos;
+                               size2 = pos2 - uBufSize;
+                               ptr2 = &pSoundBuf[0];
+                       } else {
+                               size1 = ssize;
+                               size2 = 0;
+                               ptr2 = NULL;
+                       }
+
+                       if(ptr1) {
+                               ZeroMemory(ptr1, size1 * sizeof(Sint16));
+                       }
+                       if(ptr2) {
+                               ZeroMemory(ptr2, size2 * sizeof(Sint16));
+                       }
+                       nSndDataPos = (nSndDataPos + ssize) % uBufSize;
+                       SDL_UnlockAudio();
+                       SDL_SemPost(pSndApplySem);
                }
-               lpdsb->Unlock(ptr1, size1, ptr2, size2);
+               SDL_PauseAudio(0);
        }
        now_mute = true;
 }
@@ -292,20 +322,20 @@ void EMU::stop_rec_sound()
                } else {
                        // update wave header
                        struct wavheader_t header;
-                       header.dwRIFF = 0x46464952;
-                       header.dwFileSize = rec_bytes + sizeof(wavheader_t) - 8;
-                       header.dwWAVE = 0x45564157;
-                       header.dwfmt_ = 0x20746d66;
-                       header.dwFormatSize = 16;
-                       header.wFormatTag = 1;
-                       header.wChannels = 2;
-                       header.wBitsPerSample = 16;
-                       header.dwSamplesPerSec = sound_rate;
-                       header.wBlockAlign = header.wChannels * header.wBitsPerSample / 8;
-                       header.dwAvgBytesPerSec = header.dwSamplesPerSec * header.wBlockAlign;
-                       header.dwdata = 0x61746164;
-                       header.dwDataLength = rec_bytes;
-                       
+
+                       header.dwRIFF = EndianToLittle_DWORD(0x46464952);
+                       header.dwFileSize = EndianToLittle_DWORD(rec_bytes + sizeof(wavheader_t) - 8);
+                       header.dwWAVE = EndianToLittle_DWORD(0x45564157);
+                       header.dwfmt_ = EndianToLittle_DWORD(0x20746d66);
+                       header.dwFormatSize = EndianToLittle_DWORD(16);
+                       header.wFormatTag = EndianToLittle_WORD(1);
+                       header.wChannels =  EndianToLittle_WORD(2);
+                       header.wBitsPerSample = EndianToLittle_WORD(16);
+                       header.dwSamplesPerSec = EndianToLittle_DWORD(sound_rate);
+                       header.wBlockAlign = EndianToLittle_WORD(header.wChannels * header.wBitsPerSample / 8);
+                       header.dwAvgBytesPerSec = EndianToLittle_DWORD(header.dwSamplesPerSec * header.wBlockAlign);
+                       header.dwdata = EndianToLittle_DWORD(0x61746164);
+                       header.dwDataLength = EndianToLittle_DWORD(rec_bytes);
                        rec->Fseek(0, FILEIO_SEEK_SET);
                        rec->Fwrite(&header, sizeof(wavheader_t), 1);
                        rec->Fclose();
diff --git a/source/src/agar/fm7/CMakeLists.txt b/source/src/agar/fm7/CMakeLists.txt
new file mode 100644 (file)
index 0000000..cd5aee5
--- /dev/null
@@ -0,0 +1,39 @@
+# Build script for the xm7 executable.
+cmake_minimum_required (VERSION 2.6)
+
+message("* sdl")
+
+
+# Sources compiled into the xm7 binary.
+add_executable(xm7
+                 api_draw.cpp
+                 api_snd2.cpp api_wavwriter.cpp
+                snd_buffer.cpp
+                SndDrvTmpl.cpp SndDrvOpn.cpp SndDrvWav.cpp SndDrvBeep.cpp SndDrvCMT.cpp
+                api_kbd.cpp api_js.cpp api_mouse.cpp
+                SDLKbdInterface.cpp
+                SDLJoyInterface.cpp
+                KbdInterface.cpp 
+                draw_thread.cpp
+                sdl_cpuid.c
+                sdl_inifile.c
+                sdl_file.c
+                windows_main.cpp
+                )
+
+# Libraries come from variables that must be set before this file is read.
+target_link_libraries(xm7 ${LOCAL_LIBS}
+                          ${AGAR_LIBS}
+                          ${OPENGL_LIBRARY}
+                          ${OPENCL_LIBRARY}
+                          ${GETTEXT_LIBRARY}
+                          ${OPENMP_LIBRARY}
+                          ${SDL_LIBRARY}
+                          ${THREADS_LIBRARY}
+                          fontconfig
+                          freetype
+                          ${AGAR_DEPLIBS}
+)
+
+# Link librt when LIB_RT_HAS_NANOSLEEP is set.  CMake has no
+# add_target_library() command; target_link_libraries() is the one that
+# adds a library to an existing target.
+if(LIB_RT_HAS_NANOSLEEP)
+  target_link_libraries(xm7 rt)
+endif(LIB_RT_HAS_NANOSLEEP)
+
+install(TARGETS xm7 DESTINATION bin)
\ No newline at end of file
diff --git a/source/src/agar/fm7/vram/generic/CMakeLists.txt b/source/src/agar/fm7/vram/generic/CMakeLists.txt
new file mode 100644 (file)
index 0000000..a318c3f
--- /dev/null
@@ -0,0 +1,7 @@
+# Library holding the portable C versions of the VRAM converters.
+message("* sdl/vram/generic")
+
+add_library(xm7_vram-generic api_vram256k.c
+                   api_vram4096.c
+                   api_vram8.c
+                   api_vramvec.c
+)
\ No newline at end of file
diff --git a/source/src/agar/fm7/vram/generic/api_vram256k.c b/source/src/agar/fm7/vram/generic/api_vram256k.c
new file mode 100644 (file)
index 0000000..a8e5cf5
--- /dev/null
@@ -0,0 +1,252 @@
+/*\r
+ * api_vram256k.c\r
+ * Convert VRAM -> VirtualVram\r
+ * (C) 2011 K.Ohta <whatisthis.sowhat@gmail.com>\r
+ */\r
+\r
+\r
+#include "api_draw.h"\r
+//#include "api_scaler.h"\r
+#include "api_vram.h"\r
+\r
+\r
+/* Copy eight consecutive pixels from cx to disp, lowest index first. */\r
+static void putword(Uint32 *disp, Uint32 *cx)\r
+{\r
+    int n;\r
+\r
+    for(n = 0; n < 8; n++) {\r
+        disp[n] = cx[n];\r
+    }\r
+}\r
+\r
+\r
+\r
+static v8hi_t gpixel2cbuf(Uint32 addr, Uint32 mpage)\r
+{\r
+   Uint8 ret = 0;\r
+   v8hi_t v;\r
+   v8hi_t v1;\r
+   Uint8 *vram_p = vram_pb;\r
+   \r
+    v.i[0] = v.i[1] = v.i[2] = v.i[3] = 0;\r
+    if(!(mpage & 0x40)){\r
+        v.b[5] = vram_p[addr + 0x10000]; \r
+        v.b[4] = vram_p[addr + 0x12000]; \r
+        v.b[3] = vram_p[addr + 0x14000]; \r
+        v.b[2] = vram_p[addr + 0x16000]; \r
+        v.b[1] = vram_p[addr + 0x28000]; \r
+        v.b[0] = vram_p[addr + 0x2a000]; \r
+        v1 = lshift_6bit8v(&v);\r
+        return v1;\r
+    \r
+    } else {\r
+       v8hi_t r;\r
+       r.v = (v8si){0, 0, 0, 0, 0, 0, 0, 0};\r
+       return r;\r
+   }\r
+   \r
+\r
+}\r
+\r
+static v8hi_t rpixel2cbuf(Uint32 addr, Uint32 mpage)\r
+{\r
+   Uint8 ret = 0;\r
+   v8hi_t v;\r
+   v8hi_t v1;\r
+   Uint8 *vram_p = vram_pb;\r
+   \r
+    v.i[0] = v.i[1] = v.i[2] = v.i[3] = 0;\r
+    if(!(mpage & 0x20)){\r
+        v.b[5] = vram_p[addr + 0x08000]; \r
+        v.b[4] = vram_p[addr + 0x0a000]; \r
+        v.b[3] = vram_p[addr + 0x0c000]; \r
+        v.b[2] = vram_p[addr + 0x0e000]; \r
+        v.b[1] = vram_p[addr + 0x20000]; \r
+        v.b[0] = vram_p[addr + 0x22000]; \r
+        v1 = lshift_6bit8v(&v);\r
+        return v1;\r
+   } else {\r
+       v8hi_t r;\r
+       r.v = (v8si){0, 0, 0, 0, 0, 0, 0, 0};\r
+       return r;\r
+   }\r
+}\r
+\r
+static v8hi_t bpixel2cbuf(Uint32 addr, Uint32 mpage)\r
+{\r
+   Uint8 ret = 0;\r
+   v8hi_t v;\r
+   v8hi_t v1;\r
+   Uint8 *vram_p = vram_pb;\r
+   \r
+    v.i[0] = v.i[1] = v.i[2] = v.i[3] = 0;\r
+    if(!(mpage & 0x10)){\r
+        v.b[5] = vram_p[addr + 0x00000]; \r
+        v.b[4] = vram_p[addr + 0x02000]; \r
+        v.b[3] = vram_p[addr + 0x04000]; \r
+        v.b[2] = vram_p[addr + 0x06000]; \r
+        v.b[1] = vram_p[addr + 0x18000]; \r
+        v.b[0] = vram_p[addr + 0x1a000]; \r
+        \r
+        v1 = lshift_6bit8v(&v);\r
+//        v1.v <<= 16;\r
+        return v1;\r
+   } else {\r
+       v8hi_t r;\r
+       r.v = (v8si){0, 0, 0, 0, 0, 0, 0, 0};\r
+       return r;\r
+   }\r
+}\r
+\r
+\r
+\r
+\r
+static void getvram_256k(Uint32 addr, Uint32 mpage, Uint32 *cbuf)\r
+{\r
+   v8hi_t r, g, b;\r
+   /*\r
+     * R,G,Bについて8bit単位で描画する。\r
+     * 高速化…キャッシュヒット率の向上を考慮して、\r
+     * インライン展開と細かいループの廃止を同時に行う\r
+     */\r
+   \r
+   b = bpixel2cbuf(addr, mpage);\r
+   r = rpixel2cbuf(addr, mpage);\r
+   g = gpixel2cbuf(addr, mpage);\r
+#ifdef AG_LITTLE_ENDIAN   \r
+   cbuf[0] = (b.i[0] << 16) | (g.i[0] << 8) | r.i[0] | 0xff000000;\r
+   cbuf[1] = (b.i[1] << 16) | (g.i[1] << 8) | r.i[1] | 0xff000000;\r
+   cbuf[2] = (b.i[2] << 16) | (g.i[2] << 8) | r.i[2] | 0xff000000;\r
+   cbuf[3] = (b.i[3] << 16) | (g.i[3] << 8) | r.i[3] | 0xff000000;\r
+   cbuf[4] = (b.i[4] << 16) | (g.i[4] << 8) | r.i[4] | 0xff000000;\r
+   cbuf[5] = (b.i[5] << 16) | (g.i[5] << 8) | r.i[5] | 0xff000000;\r
+   cbuf[6] = (b.i[6] << 16) | (g.i[6] << 8) | r.i[6] | 0xff000000;\r
+   cbuf[7] = (b.i[7] << 16) | (g.i[7] << 8) | r.i[7] | 0xff000000;\r
+#else   \r
+#endif\r
+   return ;\r
+}\r
+\r
+\r
+\r
+/*\r
+ * Build one 8x8 piece from VRAM. Note: this does not take the VRAM lock.\r
+ */\r
+void CreateVirtualVram256k_1Pcs(Uint32 *p, int x, int y, int pitch, int mpage)\r
+{\r
+    Uint32 c[8];\r
+    Uint32 *disp = p;\r
+    Uint32 addr;\r
+   \r
+    addr = y * 40 + x;\r
+    // Loop廃止(高速化)\r
+\r
+    getvram_256k(addr, mpage, (Uint32 *)&c);\r
+    putword((Uint32 *)disp, (Uint32 *)&c);\r
+    addr += 40;\r
+    disp += pitch;\r
+\r
+    getvram_256k(addr, mpage, (Uint32 *)&c);\r
+    putword((Uint32 *)disp, (Uint32 *)&c);\r
+    addr += 40;\r
+    disp += pitch;\r
+\r
+    getvram_256k(addr, mpage, (Uint32 *)&c);\r
+    putword((Uint32 *)disp, (Uint32 *)&c);\r
+    addr += 40;\r
+    disp += pitch;\r
+\r
+    getvram_256k(addr, mpage, (Uint32 *)&c);\r
+    putword((Uint32 *)disp, (Uint32 *)&c);\r
+    addr += 40;\r
+    disp += pitch;\r
+\r
+    getvram_256k(addr, mpage, (Uint32 *)&c);\r
+    putword((Uint32 *)disp, (Uint32 *)&c);\r
+    addr += 40;\r
+    disp += pitch;\r
+\r
+    getvram_256k(addr, mpage, (Uint32 *)&c);\r
+    putword((Uint32 *)disp, (Uint32 *)&c);\r
+    addr += 40;\r
+    disp += pitch;\r
+\r
+    getvram_256k(addr, mpage, (Uint32 *)&c);\r
+    putword((Uint32 *)disp, (Uint32 *)&c);\r
+    addr += 40;\r
+    disp += pitch;\r
+\r
+    getvram_256k(addr, mpage, (Uint32 *)&c);\r
+    putword((Uint32 *)disp, (Uint32 *)&c);\r
+\r
+}\r
+\r
+void CreateVirtualVram256k_WindowedLine(Uint32 *p, int ybegin, int yend, int xbegin, int xend, int mpage)\r
+{\r
+    Uint32 c[8];\r
+    Uint8 *disp;\r
+    Uint32 addr;\r
+    int pitch = sizeof(Uint32) * 8;\r
+    int xx;\r
+    int yy;\r
+   \r
+    for(yy = ybegin ; yy < yend; yy++) {\r
+       addr = yy * 40 + xbegin;\r
+       disp = (Uint8 *)p + (pitch * addr);\r
+       for(xx = xbegin; xx < xend; xx++) {\r
+\r
+         getvram_256k(addr, mpage, (Uint32 *)&c);\r
+         putword((Uint32 *)disp, (Uint32 *)&c);\r
+         addr++;\r
+         disp += pitch;\r
+         \r
+         getvram_256k(addr, mpage, (Uint32 *)&c);\r
+         putword((Uint32 *)disp, (Uint32 *)&c);\r
+         addr++;\r
+         disp += pitch;\r
+         \r
+         getvram_256k(addr, mpage, (Uint32 *)&c);\r
+         putword((Uint32 *)disp, (Uint32 *)&c);\r
+         addr++;\r
+         disp += pitch;\r
+         \r
+         getvram_256k(addr, mpage, (Uint32 *)&c);\r
+         putword((Uint32 *)disp, (Uint32 *)&c);\r
+         addr++;\r
+         disp += pitch;\r
+         \r
+         getvram_256k(addr, mpage, (Uint32 *)&c);\r
+         putword((Uint32 *)disp, (Uint32 *)&c);\r
+         addr++;\r
+         disp += pitch;\r
+         \r
+         getvram_256k(addr, mpage, (Uint32 *)&c);\r
+         putword((Uint32 *)disp, (Uint32 *)&c);\r
+         addr++;\r
+         disp += pitch;\r
+         \r
+         getvram_256k(addr, mpage, (Uint32 *)&c);\r
+         putword((Uint32 *)disp, (Uint32 *)&c);\r
+         addr++;\r
+         disp += pitch;\r
+         \r
+         getvram_256k(addr, mpage, (Uint32 *)&c);\r
+         putword((Uint32 *)disp, (Uint32 *)&c);\r
+         \r
+       }\r
+    }\r
+}\r
+\r
+/* Convert lines [ybegin, yend) across all 40 byte columns. */\r
+void CreateVirtualVram256k_Line(Uint32 *p, int ybegin, int yend, int mpage)\r
+{\r
+   const int first_column = 0;\r
+   const int end_column = 40;\r
+\r
+   CreateVirtualVram256k_WindowedLine(p, ybegin, yend, first_column, end_column, mpage);\r
+}\r
+\r
+/*\r
+ * Function table exporting the three converters defined in this file.\r
+ * NOTE(review): the initializer is positional; the entry order must match\r
+ * the Api_Vram_FuncList declaration, which is not visible here -- confirm.\r
+ */\r
+Api_Vram_FuncList api_vram256k_generic = {\r
+   CreateVirtualVram256k_1Pcs,\r
+   CreateVirtualVram256k_Line,\r
+   CreateVirtualVram256k_WindowedLine\r
+};\r
diff --git a/source/src/agar/fm7/vram/generic/api_vram4096.c b/source/src/agar/fm7/vram/generic/api_vram4096.c
new file mode 100644 (file)
index 0000000..2d56689
--- /dev/null
@@ -0,0 +1,319 @@
+/*\r
+ * api_vram4096.c\r
+ * Convert VRAM -> VirtualVram\r
+ * (C) 2011 K.Ohta <whatisthis.sowhat@gmail.com>\r
+ */\r
+\r
+\r
+#include "api_draw.h"\r
+#include "api_vram.h"\r
+#include "sdl_cpuid.h"\r
+#include "cache_wrapper.h"\r
+\r
+/*\r
+ * Base pointers of the three VRAM plane groups read by the converters\r
+ * (presumably blue / red / green, judging by the names and by their use with\r
+ * the B*, R* and G* tables).  Assigned by SetVram_200l() / SetVram_400l().\r
+ */\r
+Uint8 *vram_pb;\r
+Uint8 *vram_pr;\r
+Uint8 *vram_pg;\r
+\r
+extern struct XM7_CPUID *pCpuID;\r
+\r
+void CalcPalette_4096Colors(Uint32 index, Uint8 r, Uint8 g, Uint8 b, Uint8 a)\r
+{\r
+    Uint32 ds;\r
+    Uint32 *pal = rgbAnalogGDI;\r
+    r = r & 0xf0;\r
+    g = g & 0xf0;\r
+    b = b & 0xf0;\r
+   \r
+//    if((index > 4095) || (index < 0)) return;\r
+    index &= 0x0fff;\r
+#ifdef SDL_LIL_ENDIAN\r
+       ds =r | (g << 8) | (b << 16) | (a<<24);\r
+#else\r
+       ds = r<<24 + g<<16 + b<<8 + 255<<0;\r
+#endif\r
+   // Prefetch to cache when writing, not temporally.\r
+    _prefetch_data_write_permanent(&pal[index], sizeof(Uint32));\r
+    pal[index] = ds;\r
+}\r
+\r
+static inline void putword2_vec(Uint32 *disp, volatile v8hi_t cbuf)\r
+{\r
+   v8hi_t *dst = (v8hi_t *)disp;\r
+   v8hi_t r1;\r
+   \r
+   r1.i[0] = rgbAnalogGDI[cbuf.i[0]];\r
+   r1.i[1] = rgbAnalogGDI[cbuf.i[1]];\r
+   r1.i[2] = rgbAnalogGDI[cbuf.i[2]];\r
+   r1.i[3] = rgbAnalogGDI[cbuf.i[3]];\r
+   r1.i[4] = rgbAnalogGDI[cbuf.i[4]];\r
+   r1.i[5] = rgbAnalogGDI[cbuf.i[5]];\r
+   r1.i[6] = rgbAnalogGDI[cbuf.i[6]];\r
+   r1.i[7] = rgbAnalogGDI[cbuf.i[7]];\r
+   dst->v = r1.v;\r
+}\r
+\r
+static inline void getvram_4096_vec(Uint32 addr, v8hi_t *cbuf)\r
+{\r
+\r
+    uint8_t r0, r1, r2, r3;\r
+    uint8_t g0, g1, g2, g3;\r
+    uint8_t b0, b1, b2, b3;\r
+        /*\r
+         * R,G,Bについて8bit単位で描画する。\r
+         * 高速化…キャッシュヒット率の向上とVector演算(MMXetc)の速度効果を考慮して、\r
+         * ループの廃止を同時に行う\r
+         */\r
+    g3 = vram_pg[addr + 0x00000];\r
+    g2 = vram_pg[addr + 0x02000];\r
+    g1 = vram_pg[addr + 0x04000];\r
+    g0 = vram_pg[addr + 0x06000];\r
+    cbuf->v = \r
+        aPlanes[G0 + g0] |\r
+        aPlanes[G1 + g1] |\r
+        aPlanes[G2 + g2] |\r
+        aPlanes[G3 + g3] ;\r
+\r
+   \r
+    r3 = vram_pr[addr + 0x00000];\r
+    r2 = vram_pr[addr + 0x02000];\r
+    r1 = vram_pr[addr + 0x04000];\r
+    r0 = vram_pr[addr + 0x06000];\r
+    cbuf->v = cbuf->v |\r
+        aPlanes[R0 + r0] |\r
+        aPlanes[R1 + r1] |\r
+        aPlanes[R2 + r2] |\r
+        aPlanes[R3 + r3] ;\r
+\r
+    b3 = vram_pb[addr + 0x00000];\r
+    b2 = vram_pb[addr + 0x02000];\r
+    b1 = vram_pb[addr + 0x04000];\r
+    b0 = vram_pb[addr + 0x06000];\r
+    cbuf->v = cbuf->v |\r
+        aPlanes[B0 + b0] |\r
+        aPlanes[B1 + b1] |\r
+        aPlanes[B2 + b2] |\r
+        aPlanes[B3 + b3] ;\r
+   return;\r
+}\r
+\r
+/*\r
+ * 8x8のピースをVRAMから作成する:VramLockしない事に注意\r
+ */\r
+void CreateVirtualVram4096_1Pcs(Uint32 *p, int x, int y, int pitch, int mode)\r
+{\r
+//    Uint32 c[8];\r
+    v8hi_t c;\r
+    Uint32 *disp = p;\r
+    Uint32 addr;\r
+\r
+    addr = y * 40 + x;\r
+    // Loop廃止(高速化)\r
+    if(aPlanes == NULL) {\r
+       c.v = (v8si){0,0,0,0,0,0,0,0};\r
+       putword2_vec((Uint32 *)disp,  c);\r
+       disp += pitch;\r
+       putword2_vec((Uint32 *)disp,  c);\r
+       disp += pitch;\r
+       putword2_vec((Uint32 *)disp,  c);\r
+       disp += pitch;\r
+       putword2_vec((Uint32 *)disp,  c);\r
+       disp += pitch;\r
+       putword2_vec((Uint32 *)disp,  c);\r
+       disp += pitch;\r
+       putword2_vec((Uint32 *)disp,  c);\r
+       disp += pitch;\r
+       putword2_vec((Uint32 *)disp,  c);\r
+       disp += pitch;\r
+       putword2_vec((Uint32 *)disp,  c);\r
+//       disp += pitch;\r
+    } else {\r
+       getvram_4096_vec(addr, &c);\r
+       putword2_vec((Uint32 *)disp,  c);\r
+       addr += 40;\r
+       disp += pitch;\r
+\r
+       getvram_4096_vec(addr, &c);\r
+       putword2_vec((Uint32 *)disp,  c);\r
+       addr += 40;\r
+       disp += pitch;\r
+\r
+       getvram_4096_vec(addr, &c);\r
+       putword2_vec((Uint32 *)disp,  c);\r
+       addr += 40;\r
+       disp += pitch;\r
+\r
+       getvram_4096_vec(addr, &c);\r
+       putword2_vec((Uint32 *)disp,  c);\r
+       addr += 40;\r
+       disp += pitch;\r
+       \r
+       getvram_4096_vec(addr, &c);\r
+       putword2_vec((Uint32 *)disp,  c);\r
+       addr += 40;\r
+       disp += pitch;\r
+       \r
+       getvram_4096_vec(addr, &c);\r
+       putword2_vec((Uint32 *)disp,  c);\r
+       addr += 40;\r
+       disp += pitch;\r
+       \r
+       getvram_4096_vec(addr, &c);\r
+       putword2_vec((Uint32 *)disp,  c);\r
+       addr += 40;\r
+       disp += pitch;\r
+       \r
+       getvram_4096_vec(addr, &c);\r
+       putword2_vec((Uint32 *)disp,  c);\r
+    }\r
+   \r
+}\r
+\r
+/*\r
+ * 1LineのピースをVRAMから作成する:VramLockしない事に注意\r
+ */\r
+void CreateVirtualVram4096_Line(Uint32 *p, int ybegin, int yend, int mode)\r
+{\r
+//    Uint32 c[8];\r
+    v8hi_t c;\r
+    Uint8 *disp;\r
+    Uint32 addr;\r
+    int yy;\r
+    int xx;\r
+    int pitch;\r
+\r
+    pitch = sizeof(Uint32) * 8;\r
+    // Loop廃止(高速化)\r
+    if(aPlanes == NULL) {\r
+       c.v = (v8si){0,0,0,0,0,0,0,0};\r
+       for(yy = ybegin; yy < yend; yy++) {\r
+         addr = yy * 40;\r
+         disp = (Uint8 *)p + (pitch * addr);\r
+         for(xx = 0; xx < 5; xx++) {\r
+            putword2_vec((Uint32 *)disp,  c);\r
+            disp +=  pitch;\r
+            putword2_vec((Uint32 *)disp,  c);\r
+            disp +=  pitch;\r
+            putword2_vec((Uint32 *)disp,  c);\r
+            disp +=  pitch;\r
+            putword2_vec((Uint32 *)disp,  c);\r
+            disp +=  pitch;\r
+            putword2_vec((Uint32 *)disp,  c);\r
+            disp +=  pitch;\r
+            putword2_vec((Uint32 *)disp,  c);\r
+            disp +=  pitch;\r
+            putword2_vec((Uint32 *)disp,  c);\r
+            disp +=  pitch;\r
+            putword2_vec((Uint32 *)disp,  c);\r
+            disp +=  pitch;\r
+         }\r
+       }\r
+    } else {\r
+       for(yy = ybegin; yy < yend; yy++) {\r
+         addr = yy * 40;\r
+         disp = (Uint8 *)p + (pitch * addr);\r
+         for(xx = 0; xx < 5; xx++) {\r
+            getvram_4096_vec(addr, &c);\r
+            putword2_vec((Uint32 *)disp,  c);\r
+            disp +=  pitch;\r
+            addr++;\r
+            getvram_4096_vec(addr, &c);\r
+            putword2_vec((Uint32 *)disp,  c);\r
+            disp +=  pitch;\r
+            addr++;\r
+            getvram_4096_vec(addr, &c);\r
+            putword2_vec((Uint32 *)disp,  c);\r
+            disp +=  pitch;\r
+            addr++;\r
+            getvram_4096_vec(addr, &c);\r
+            putword2_vec((Uint32 *)disp,  c);\r
+            disp +=  pitch;\r
+            addr++;\r
+            getvram_4096_vec(addr, &c);\r
+            putword2_vec((Uint32 *)disp,  c);\r
+            disp +=  pitch;\r
+            addr++;\r
+            getvram_4096_vec(addr, &c);\r
+            putword2_vec((Uint32 *)disp,  c);\r
+            disp +=  pitch;\r
+            addr++;\r
+            getvram_4096_vec(addr, &c);\r
+            putword2_vec((Uint32 *)disp,  c);\r
+            disp +=  pitch;\r
+            addr++;\r
+            getvram_4096_vec(addr, &c);\r
+            putword2_vec((Uint32 *)disp,  c);\r
+            disp +=  pitch;\r
+            addr++;\r
+         }\r
+       }\r
+    } \r
+}\r
+\r
+/*\r
+ * ybegin - yendの行を変換する\r
+ */\r
+void CreateVirtualVram4096_WindowedLine(Uint32 *p, int ybegin, int yend, int xbegin, int xend, int mode)\r
+{\r
+#if (__GNUC__ >= 4)   \r
+    v8hi_t c;\r
+    Uint8 *disp =(Uint8 *) p;\r
+    Uint32 addr;\r
+    int pitch;\r
+    int xx;\r
+    int yy;\r
+   \r
+    if(p == NULL) return;\r
+    pitch = sizeof(Uint32) * 8;\r
+\r
+    // Loop廃止(高速化)\r
+    if(aPlanes == NULL) {\r
+       c.v = (v8si){0,0,0,0,0,0,0,0};\r
+       for(yy = ybegin; yy < yend; yy++) { \r
+           addr = yy * 40 + xbegin;\r
+          disp = (Uint8 *)(&p[yy * 320 + xbegin]);\r
+          for(xx = xbegin; xx < xend; xx ++) { \r
+             putword2_vec((Uint32 *)disp, c);\r
+             disp += pitch;\r
+          }\r
+       }\r
+       return;\r
+     } else {\r
+       for(yy = ybegin; yy < yend; yy++) { \r
+           addr = yy * 40 + xbegin;\r
+          disp = (Uint8 *)(&p[yy * 320 + xbegin]);\r
+          for(xx = xbegin; xx < xend; xx++) { \r
+             getvram_4096_vec(addr, &c);\r
+             putword2_vec((Uint32 *)disp, c);\r
+             addr++;\r
+             disp += pitch;\r
+          }\r
+       }\r
+       return;\r
+     }\r
+ #else \r
+    Uint32 c[8];\r
+    Uint32 *pal = (Uint32 *)rgbTTLGDI;\r
+    Uint8 *disp =(Uint8 *) p;\r
+    int xx;\r
+    int yy;\r
+\r
+    if((p == NULL) || (pal == NULL)) return;\r
+    pitch = sizeof(Uint32) * 8;\r
+    for(yy = ybegin; yy < yend; yy++) {  \r
+      addr = y * 40 + xbegin;\r
+      disp = (Uint8 *)(&p[yy * 320 + xbegin]);\r
+      for(xx = xbegin; xx < xend; xx++) {\r
+        getvram_4096(addr, c);\r
+        putword2((Uint32 *)disp, c, pal);\r
+        addr++;\r
+        disp += pitch;\r
+      }\r
+   }\r
+#endif   \r
+}\r
+\r
+/*\r
+ * Function table of the generic 4096-colour converters, listed in the order\r
+ * 1Pcs, Line, WindowedLine.\r
+ */\r
+Api_Vram_FuncList api_vram4096_generic = {\r
+   CreateVirtualVram4096_1Pcs,\r
+   CreateVirtualVram4096_Line,\r
+   CreateVirtualVram4096_WindowedLine\r
+};\r
diff --git a/source/src/agar/fm7/vram/generic/api_vram8.c b/source/src/agar/fm7/vram/generic/api_vram8.c
new file mode 100644 (file)
index 0000000..83a4e03
--- /dev/null
@@ -0,0 +1,431 @@
+/*\r
+ * api_vram8.c\r
+ * Convert VRAM -> VirtualVram\r
+ * (C) 2011 K.Ohta <whatisthis.sowhat@gmail.com>\r
+ */\r
+\r
+\r
+#include "api_draw.h"\r
+#include "api_vram.h"\r
+#include "sdl_cpuid.h"\r
+#include "cache_wrapper.h"\r
+\r
+extern struct XM7_CPUID *pCpuID;\r
+\r
+\r
+/*\r
+ * Point the three plane base pointers into the VRAM image p:\r
+ * vram_pb at offset 0, vram_pr at 0x8000 and vram_pg at 0x10000.\r
+ */\r
+void SetVram_200l(Uint8 *p)\r
+{\r
+    Uint8 *const base = p;\r
+\r
+    vram_pb = &base[0x00000];\r
+    vram_pr = &base[0x08000];\r
+    vram_pg = &base[0x10000];\r
+}\r
+\r
+/*\r
+ * Point the three plane base pointers into the VRAM image p:\r
+ * vram_pb at offset 0, vram_pr at 0x8000 and vram_pg at 0x10000\r
+ * (the same offsets as SetVram_200l).\r
+ */\r
+void SetVram_400l(Uint8 *p)\r
+{\r
+    Uint8 *const base = p;\r
+\r
+    vram_pb = &base[0x00000];\r
+    vram_pr = &base[0x08000];\r
+    vram_pg = &base[0x10000];\r
+}\r
+\r
+\r
+void CalcPalette_8colors(Uint32 index, Uint8 r, Uint8 g, Uint8 b, Uint8 a)\r
+{\r
+     Uint32 ds;\r
+\r
+#ifdef AG_LITTLE_ENDIAN\r
+       ds = r | (g << 8) | (b << 16) | 0xff000000;\r
+#else\r
+       ds = r<<24 + g<<16 + b<<8 + 255<<0;\r
+#endif\r
+    _prefetch_data_write_permanent(rgbTTLGDI, sizeof(Uint32) * 8);\r
+    rgbTTLGDI[index] = ds;\r
+}\r
+\r
+#if (__GNUC__ >= 4)\r
+static void getvram_8_vec(Uint32 addr, v8hi_t *cbuf)\r
+{\r
+    uint8_t r, g, b;\r
+//    volatile v4hi cbuf __attribute__((aligned(32)));\r
+        /*\r
+         * R,G,Bについて8bit単位で描画する。\r
+         * 高速化…キャッシュヒット率の向上とVector演算(MMXetc)の速度効果を考慮して、\r
+         * ループの廃止を同時に行う\r
+         */\r
+\r
+    g = vram_pg[addr];\r
+    r = vram_pr[addr];\r
+    b = vram_pb[addr];\r
+\r
+    cbuf->v = aPlanes[B0 + b] |\r
+              aPlanes[B1 + r] |\r
+              aPlanes[B2 + g];\r
+   return;\r
+}\r
+\r
+static inline void  putword8_vec(Uint32 *disp, volatile v8hi_t c, Uint32 *pal)\r
+{\r
+\r
+   v8hi_t *dst = (v8hi_t *)disp;\r
+   v8hi_t r1;\r
+   \r
+//   if(disp == NULL) return;\r
+   //c.v = c.v & (v8si){7, 7, 7, 7, 7, 7, 7, 7};\r
+   r1.i[0] = pal[c.i[0] & 7]; // ?!\r
+   r1.i[1] = pal[c.i[1] & 7];\r
+   r1.i[2] = pal[c.i[2] & 7];\r
+   r1.i[3] = pal[c.i[3] & 7];\r
+   r1.i[4] = pal[c.i[4] & 7];\r
+   r1.i[5] = pal[c.i[5] & 7];\r
+   r1.i[6] = pal[c.i[6] & 7];\r
+   r1.i[7] = pal[c.i[7] & 7];\r
+   dst->v = r1.v;\r
+}\r
+\r
+#else\r
+static inline void planeto8(Uint32 *c, uint8_t r, unit8_t g, uint8_t b)\r
+{\r
+   Uint8 mask;\r
+   \r
+   mask = 0x80;\r
+   c[0] = ((r & mask) >> 6) | ((g & mask) >> 5) || ((b & mask) >> 7);\r
+   mask >>= 1;\r
+   c[1] = ((r & mask) >> 5) | ((g & mask) >> 4) || ((b & mask) >> 6);\r
+   mask >>= 1;\r
+   c[2] = ((r & mask) >> 4) | ((g & mask) >> 3) || ((b & mask) >> 5);\r
+   mask >>= 1;\r
+   c[3] = ((r & mask) >> 3) | ((g & mask) >> 2) || ((b & mask) >> 4);\r
+   mask >>= 1;\r
+   c[4] = ((r & mask) >> 2) | ((g & mask) >> 1) || ((b & mask) >> 3);\r
+   mask >>= 1;\r
+   c[5] = ((r & mask) >> 1) | (g & mask) || ((b & mask) >> 2);\r
+   mask >>= 1;\r
+   c[6] = (r & mask) | ((g & mask) << 1) || ((b & mask) >> 1);\r
+   mask >>= 1;\r
+   c[7] = ((r & mask) << 1) | ((g & mask) << 2) || (b & mask);\r
+   mask >>= 1;\r
+}\r
+\r
+/*\r
+ * Read one byte from each of the R, G and B planes at addr and expand them\r
+ * into the eight colour indices of cbuf via planeto8().\r
+ */\r
+static void getvram_8(Uint32 addr, Uint32 *cbuf)\r
+{\r
+   planeto8(cbuf, vram_pr[addr], vram_pg[addr], vram_pb[addr]);\r
+}\r
+\r
+static inline void  putword8(Uint32 *disp, Uint32 *c, Uint32 *pal)\r
+{\r
+\r
+   Uint32 *r1 = disp;\r
+\r
+   r1[0] = pal[c[0] & 7]; // ?!\r
+   r1[1] = pal[c[1] & 7];\r
+   r1[2] = pal[c[2] & 7];\r
+   r1[3] = pal[c[3] & 7];\r
+   r1[4] = pal[c[4] & 7];\r
+   r1[5] = pal[c[5] & 7];\r
+   r1[6] = pal[c[6] & 7];\r
+   r1[7] = pal[c[7] & 7];\r
+}\r
+\r
+#endif // __GNUC__ >= 4\r
+\r
+\r
+\r
+/*\r
+ * 8x8のピースをVRAMから作成する:VramLockしない事に注意\r
+ */\r
+void CreateVirtualVram8_1Pcs(Uint32 *p, int x, int y, int pitch, int mode)\r
+{\r
+#if (__GNUC__ >= 4)   \r
+    v8hi_t c;\r
+    Uint32 *pal = (Uint32 *)rgbTTLGDI;\r
+    Uint32 *disp = p;\r
+    Uint32 addr;\r
+\r
+    if((p == NULL) || (pal == NULL)) return;\r
+    addr = y * 80 + x;\r
+\r
+    // Loop廃止(高速化)\r
+    if(aPlanes == NULL) {\r
+       c.v = (v8si){0,0,0,0,0,0,0,0};\r
+       putword8_vec((Uint32 *)disp,  c, pal);\r
+       disp += pitch;\r
+       putword8_vec((Uint32 *)disp,  c, pal);\r
+       disp += pitch;\r
+       putword8_vec((Uint32 *)disp,  c, pal);\r
+       disp += pitch;\r
+       putword8_vec((Uint32 *)disp,  c, pal);\r
+       disp += pitch;\r
+       putword8_vec((Uint32 *)disp,  c, pal);\r
+       disp += pitch;\r
+       putword8_vec((Uint32 *)disp,  c, pal);\r
+       disp += pitch;\r
+       putword8_vec((Uint32 *)disp,  c, pal);\r
+       disp += pitch;\r
+       putword8_vec((Uint32 *)disp,  c, pal);\r
+//       disp += pitch;\r
+       return;\r
+     } else {\r
+       getvram_8_vec(addr, &c);\r
+       putword8_vec((Uint32 *)disp, c, pal);\r
+       addr += 80;\r
+       disp += pitch;\r
+\r
+       getvram_8_vec(addr , &c);\r
+       putword8_vec((Uint32 *)disp, c, pal);\r
+       addr += 80;\r
+       disp += pitch;\r
+\r
+       getvram_8_vec(addr, &c);\r
+       putword8_vec((Uint32 *)disp, c, pal);\r
+       addr += 80;\r
+       disp += pitch;\r
+\r
+       getvram_8_vec(addr , &c);\r
+       putword8_vec((Uint32 *)disp,  c, pal);\r
+       addr += 80;\r
+       disp += pitch;\r
+\r
+       getvram_8_vec(addr, &c);\r
+       putword8_vec((Uint32 *)disp,  c, pal);\r
+       addr += 80;\r
+       disp += pitch;\r
+\r
+       getvram_8_vec(addr, &c);\r
+       putword8_vec((Uint32 *)disp,  c, pal);\r
+       addr += 80;\r
+       disp += pitch;\r
+\r
+       getvram_8_vec(addr, &c);\r
+       putword8_vec((Uint32 *)disp,  c, pal);\r
+       addr += 80;\r
+       disp += pitch;\r
+\r
+       getvram_8_vec(addr, &c);\r
+       putword8_vec((Uint32 *)disp,  c, pal);\r
+//    addr += 80;\r
+//    disp += pitch;\r
+     }\r
+#else \r
+    Uint32 c[8];\r
+    Uint32 *pal = (Uint32 *)rgbTTLGDI;\r
+    Uint8 *disp =(Uint8 *) p;\r
+\r
+    if((p == NULL) || (pal == NULL)) return;\r
+    addr = y * 80 + x;\r
+\r
+    // Loop廃止(高速化)\r
+   getvram_8(addr, c);\r
+   putword8((Uint32 *)disp, c, pal);\r
+   addr += 80;\r
+   disp += pitch;\r
+   \r
+   getvram_8(addr , c);\r
+   putword8((Uint32 *)disp,  c, pal);\r
+   addr += 80;\r
+   disp += pitch;\r
+   \r
+   getvram_8(addr, c);\r
+   putword8((Uint32 *)disp,  c, pal);\r
+   addr += 80;\r
+   disp += pitch;\r
+   \r
+   getvram_8(addr , c);\r
+   putword8((Uint32 *)disp,  c, pal);\r
+   addr += 80;\r
+   disp += pitch;\r
+   \r
+   getvram_8(addr, c);\r
+   putword8((Uint32 *)disp,  c, pal);\r
+   addr += 80;\r
+   disp += pitch;\r
+   \r
+   getvram_8(addr, c);\r
+   putword8((Uint32 *)disp,  c, pal);\r
+   addr += 80;\r
+   disp += pitch;\r
+   \r
+   getvram_8(addr, c);\r
+   putword8((Uint32 *)disp,  c, pal);\r
+   addr += 80;\r
+   disp += pitch;\r
+   \r
+   getvram_8(addr, c);\r
+   putword8((Uint32 *)disp,  c, pal);\r
+   //    addr += 80;\r
+   //    disp += pitch;\r
+     \r
+#endif   \r
+}\r
+\r
+/*\r
+ * ybegin - yendの行を変換する\r
+ */\r
+void CreateVirtualVram8_Line(Uint32 *p, int ybegin, int yend, int mode)\r
+{\r
+    v8hi_t c;\r
+    Uint32 *pal = (Uint32 *)rgbTTLGDI;\r
+    Uint8 *disp =(Uint8 *) p;\r
+    Uint32 addr;\r
+    int pitch;\r
+    int xx;\r
+    int yy = ybegin;\r
+   \r
+    if((p == NULL) || (pal == NULL)) return;\r
+    pitch = sizeof(Uint32) * 8;\r
+\r
+    // Loop廃止(高速化)\r
+    if(aPlanes == NULL) {\r
+       c.v = (v8si){0,0,0,0,0,0,0,0};\r
+//       for(yy = ybegin; yy < yend; yy++) { \r
+           addr = yy * 80;\r
+//        disp = (Uint8 *)(&p[yy * 640]);\r
+          for(xx = 0; xx < (80 / 8); xx ++) { \r
+             putword8_vec((Uint32 *)disp,  c, pal);\r
+             disp += pitch;\r
+             putword8_vec((Uint32 *)disp,  c, pal);\r
+             disp += pitch;\r
+             putword8_vec((Uint32 *)disp,  c, pal);\r
+             disp += pitch;\r
+             putword8_vec((Uint32 *)disp,  c, pal);\r
+             disp += pitch;\r
+             putword8_vec((Uint32 *)disp,  c, pal);\r
+             disp += pitch;\r
+             putword8_vec((Uint32 *)disp,  c, pal);\r
+             disp += pitch;\r
+             putword8_vec((Uint32 *)disp,  c, pal);\r
+             disp += pitch;\r
+             putword8_vec((Uint32 *)disp,  c, pal);\r
+             disp += pitch;\r
+          }\r
+//       }\r
+       return;\r
+     } else {\r
+//       for(yy = ybegin; yy < yend; yy++) { \r
+           addr = yy * 80;\r
+//        disp = (Uint8 *)(&p[yy * 640]);\r
+          for(xx = 0; xx < (80 / 8); xx++) { \r
+             getvram_8_vec(addr, &c);\r
+             putword8_vec((Uint32 *)disp, c, pal);\r
+             addr++;\r
+             disp += pitch;\r
+             \r
+             getvram_8_vec(addr , &c);\r
+             putword8_vec((Uint32 *)disp, c, pal);\r
+             addr++;\r
+             disp += pitch;\r
+\r
+             getvram_8_vec(addr, &c);\r
+             putword8_vec((Uint32 *)disp, c, pal);\r
+             addr++;\r
+             disp += pitch;\r
+             \r
+             getvram_8_vec(addr , &c);\r
+             putword8_vec((Uint32 *)disp,  c, pal);\r
+             addr++;\r
+             disp += pitch;\r
+             \r
+             getvram_8_vec(addr, &c);\r
+             putword8_vec((Uint32 *)disp,  c, pal);\r
+             addr++;\r
+             disp += pitch;\r
+             \r
+             getvram_8_vec(addr, &c);\r
+             putword8_vec((Uint32 *)disp,  c, pal);\r
+             addr++;\r
+             disp += pitch;\r
+             \r
+             getvram_8_vec(addr, &c);\r
+             putword8_vec((Uint32 *)disp,  c, pal);\r
+             addr++;\r
+             disp += pitch;\r
+             \r
+             getvram_8_vec(addr, &c);\r
+             putword8_vec((Uint32 *)disp,  c, pal);\r
+             addr++;\r
+             disp += pitch;\r
+          }\r
+         \r
+//       }\r
+       return;\r
+     }\r
+}\r
+\r
+/*\r
+ * ybegin - yendの行を変換する\r
+ */\r
+void CreateVirtualVram8_WindowedLine(Uint32 *p, int ybegin, int yend, int xbegin, int xend, int mode)\r
+{\r
+#if (__GNUC__ >= 4)   \r
+    v8hi_t c;\r
+    Uint32 *pal = (Uint32 *)rgbTTLGDI;\r
+    Uint8 *disp =(Uint8 *) p;\r
+    Uint32 addr;\r
+    int pitch;\r
+    int xx;\r
+    int yy = ybegin;\r
+    \r
+    if((p == NULL) || (pal == NULL)) return;\r
+    pitch = sizeof(Uint32) * 8;\r
+    xbegin = xbegin % 80;\r
+    xend = xend % 80;\r
+    ybegin = ybegin % 400;\r
+   \r
+    // Loop廃止(高速化)\r
+    if(aPlanes == NULL) {\r
+       c.v = (v8si){0,0,0,0,0,0,0,0};\r
+       addr = yy * 80 + xbegin;\r
+       disp = (Uint8 *)(&p[xbegin * 8]);\r
+       for(xx = xbegin; xx < xend; xx ++) { \r
+         putword8_vec((Uint32 *)disp,  c, pal);\r
+         disp += pitch;\r
+       }\r
+       return;\r
+     } else {\r
+       addr = yy * 80 + xbegin;\r
+       disp = (Uint8 *)(&p[xbegin * 8]);\r
+       for(xx = xbegin; xx < xend; xx++) { \r
+          getvram_8_vec(addr, &c);\r
+          putword8_vec((Uint32 *)disp, c, pal);\r
+          addr++;\r
+          disp += pitch;\r
+       }\r
+       return;\r
+     }\r
+ #else \r
+    Uint32 c[8];\r
+    Uint32 *pal = (Uint32 *)rgbTTLGDI;\r
+    Uint8 *disp =(Uint8 *) p;\r
+    int xx;\r
+    int yy;\r
+\r
+    if((p == NULL) || (pal == NULL)) return;\r
+    pitch = sizeof(Uint32) * 8;\r
+    for(yy = ybegin; yy < yend; yy++) {  \r
+      addr = y * 80 + xbegin;\r
+      disp = (Uint8 *)(&p[yy * 640 + xbegin]);\r
+      for(xx = xbegin; xx < xend; xx++) {\r
+        getvram_8(addr, c);\r
+        putword8((Uint32 *)disp, c, pal);\r
+        addr++;\r
+        disp += pitch;\r
+      }\r
+   }\r
+#endif   \r
+}\r
+\r
+/*\r
+ * Function table of the generic 8-colour converters, listed in the order\r
+ * 1Pcs, Line, WindowedLine.\r
+ */\r
+Api_Vram_FuncList api_vram8_generic = {\r
+   CreateVirtualVram8_1Pcs,\r
+   CreateVirtualVram8_Line,\r
+   CreateVirtualVram8_WindowedLine\r
+};\r
diff --git a/source/src/agar/fm7/vram/generic/api_vramvec.c b/source/src/agar/fm7/vram/generic/api_vramvec.c
new file mode 100644 (file)
index 0000000..040eb3d
--- /dev/null
@@ -0,0 +1,147 @@
+/*\r
+ * api_vramvec.c\r
+ * Convert VRAM -> VirtualVram(Vector Version)\r
+ * (C) 2012 K.Ohta <whatisthis.sowhat@gmail.com>\r
+ */\r
+\r
+\r
+#include "xm7_types.h"\r
+#include "api_draw.h"\r
+#include "api_vram.h"\r
+#include "agar_logger.h"\r
+#include "cache_wrapper.h"\r
+\r
+/*\r
+* Definition of Conversion Tables.\r
+*/\r
+// Reduce Tables 20120131\r
+\r
+/*\r
+ * Bit-plane expansion tables: 12 * 256 vectors allocated and filled by\r
+ * initvramtbl_4096_vec(), released (and reset to NULL) by\r
+ * detachvramtbl_4096_vec().  NULL while no table is allocated.\r
+ */\r
+v8si *aPlanes;\r
+static void initvramtblsub_vec(volatile unsigned char x, volatile v8hi_t *p)\r
+{\r
+//    p->v = (v8si){x & 0x80, x & 0x40, x & 0x20, x & 0x10, x & 0x08, x & 0x04, x & 0x02, x & 0x01};\r
+    \r
+    p->i[0] = (x & 0x80) >> 7;\r
+    p->i[1] = (x & 0x40) >> 6;\r
+    p->i[2] = (x & 0x20) >> 5;\r
+    p->i[3] = (x & 0x10) >> 4;\r
+    p->i[4] = (x & 0x08) >> 3;\r
+    p->i[5] = (x & 0x04) >> 2;\r
+    p->i[6] = (x & 0x02) >> 1;\r
+    p->i[7] = x & 0x01;\r
+    // 8 Colors\r
+}\r
+\r
+/* Currently a no-op: the function body is empty. */\r
+void initvramtbl_8_vec(void)\r
+{\r
+}\r
+\r
+static v8si *initvramtblsub(int size)\r
+{\r
+   v8si *p;\r
+#ifndef _WINDOWS\r
+   if(posix_memalign((void **)&p, 16 * sizeof(Uint32), sizeof(v8si) * size) != 0) return NULL;\r
+#else\r
+   p = (v8si *)__mingw_aligned_malloc(sizeof(v8si) * size, 16 * sizeof(Uint32));\r
+   if(p == NULL) return NULL;\r
+#endif\r
+   return p;\r
+}\r
+\r
+\r
+/*\r
+ * Allocate aPlanes (12 * 256 vectors) and fill the twelve expansion tables.\r
+ * For every byte value i, the B0 entry holds the eight bits of i spread over\r
+ * the eight lanes (see initvramtblsub_vec); each following table, in the order\r
+ * B1, B2, B3, R0..R3, G0..G3, holds the previous entry shifted left by one\r
+ * more bit.  Returns without touching anything else if the allocation fails.\r
+ * NOTE(review): a second call overwrites aPlanes without freeing the previous\r
+ * table -- confirm callers pair this with detachvramtbl_4096_vec().\r
+ */\r
+void initvramtbl_4096_vec(void)\r
+{\r
+    int i;\r
+    volatile v8hi_t r;\r
+    aPlanes = initvramtblsub(12 * 256);\r
+    if(aPlanes == NULL) return;\r
+    // Logged right after the allocation succeeded, before the table is filled.\r
+    XM7_DebugLog(XM7_LOG_DEBUG, "Vram Table OK");\r
+    // Init Mask Table\r
+   for(i = 0; i <= 255; i++){\r
+        // r starts as the 0/1 bit pattern of i; every store below is followed\r
+        // by a one-bit left shift, so the statement order defines the tables.\r
+        initvramtblsub_vec(i & 255, &r);\r
+\r
+        aPlanes[B0 + i] = r.v;\r
+        r.v <<= 1;\r
+        aPlanes[B1 + i] = r.v;\r
+        r.v <<= 1;\r
+        aPlanes[B2 + i] = r.v;\r
+        r.v <<= 1;\r
+        aPlanes[B3 + i] = r.v;\r
+        r.v <<= 1;\r
+\r
+\r
+        aPlanes[R0 + i] = r.v;\r
+        r.v <<= 1;\r
+        aPlanes[R1 + i] = r.v;\r
+        r.v <<= 1;\r
+        aPlanes[R2 + i] = r.v;\r
+        r.v <<= 1;\r
+        aPlanes[R3 + i] = r.v;\r
+        r.v <<= 1;\r
+      \r
+        aPlanes[G0 + i] = r.v;\r
+        r.v <<= 1;\r
+        aPlanes[G1 + i] = r.v;\r
+        r.v <<= 1;\r
+        aPlanes[G2 + i] = r.v;\r
+        r.v <<= 1;\r
+        aPlanes[G3 + i] = r.v;\r
+//        r.v <<= 1;\r
+    }\r
+    _prefetch_data_read_permanent(aPlanes, sizeof(Uint32) * 256 * 8 * 12); // 98KB (!), priority = 1.\r
+}\r
+\r
+/* Currently a no-op: the function body is empty. */\r
+void detachvramtbl_8_vec(void)\r
+{\r
+   \r
+}\r
+\r
+/*\r
+ * Release the aPlanes table with the deallocator matching its allocator and\r
+ * reset the pointer to NULL.  Does nothing when no table is allocated.\r
+ */\r
+void detachvramtbl_4096_vec(void)\r
+{\r
+   if(aPlanes == NULL) return;\r
+\r
+#ifndef _WINDOWS\r
+   free(aPlanes);\r
+#else\r
+   __mingw_aligned_free(aPlanes);\r
+#endif\r
+   aPlanes = NULL;\r
+}\r
+\r
+\r
+\r
+v8hi_t lshift_6bit8v(v8hi_t *v)\r
+{\r
+   v8hi_t r;\r
+   v8hi_t cbuf;\r
+   v8hi_t mask;\r
+   mask.v = (v8si){0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8};\r
+   cbuf.v =\r
+        aPlanes[B2 + v->b[0]] |\r
+        aPlanes[B3 + v->b[1]] |\r
+        aPlanes[R0 + v->b[2]] |\r
+        aPlanes[R1 + v->b[3]] |\r
+        aPlanes[R2 + v->b[4]] |\r
+        aPlanes[R3 + v->b[5]];\r
+   \r
+   mask.v = mask.v & cbuf.v;\r
+#if ((__GNUC__ == 4) && (__GCC_MINOR__ >= 7)) || (__GNUC__ > 4) //GCC 4.7 or later.\r
+   r.v = mask.v != (v8si){0, 0, 0, 0, 0, 0, 0, 0};\r
+   r.v = r.v & (v8si) {0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03};\r
+   cbuf.v = cbuf.v |  r.v;\r
+#else\r
+   if(mask.i[0] != 0) cbuf.s[0] |= 0x03;\r
+   if(mask.i[1] != 0) cbuf.s[1] |= 0x03;\r
+   if(mask.i[2] != 0) cbuf.s[2] |= 0x03;\r
+   if(mask.i[3] != 0) cbuf.s[3] |= 0x03;\r
+   if(mask.i[4] != 0) cbuf.s[4] |= 0x03;\r
+   if(mask.i[5] != 0) cbuf.s[5] |= 0x03;\r
+   if(mask.i[6] != 0) cbuf.s[6] |= 0x03;\r
+   if(mask.i[7] != 0) cbuf.s[7] |= 0x03;\r
+#endif \r
+  return cbuf;\r
+}\r
+\r
+\r
+\r
+\r
diff --git a/source/src/agar/fm7/vram/sse2/CMakeLists.txt b/source/src/agar/fm7/vram/sse2/CMakeLists.txt
new file mode 100644 (file)
index 0000000..08f3bc1
--- /dev/null
@@ -0,0 +1,9 @@
+# Progress message printed while CMake configures this directory.
+message("* sdl/vram/sse2")
+
+# Enable the MMX/SSE/SSE2 instruction sets for the targets defined in this
+# directory after this call.
+#set(CMAKE_BUILD_SETTING_C_FLAGS "${CMAKE_C_FLAGS} -msse2 -msse -mmmx")
+add_compile_options(-msse2 -msse -mmmx)
+# Library holding the SSE2 variants of the VRAM converters.
+add_library(xm7_vram-sse2 api_vram256k.c
+                   api_vram4096.c
+                   api_vram8.c
+                   api_vramvec.c
+)
diff --git a/source/src/agar/fm7/vram/sse2/api_vram256k.c b/source/src/agar/fm7/vram/sse2/api_vram256k.c
new file mode 100644 (file)
index 0000000..0724a27
--- /dev/null
@@ -0,0 +1,261 @@
+/*\r
+ * api_vram256k.c\r
+ * Convert VRAM -> VirtualVram\r
+ * (C) 2011 K.Ohta <whatisthis.sowhat@gmail.com>\r
+ */\r
+\r
+#include "api_draw.h"\r
+//#include "api_scaler.h"\r
+#include "api_vram.h"\r
+#include "cache_wrapper.h"\r
+\r
+extern v8hi_t lshift_6bit8v_SSE2(v8hi_t v);\r
+\r
+/*\r
+ * Store the eight pixels held in cx at disp, after issuing a write prefetch\r
+ * for the destination.\r
+ */\r
+static inline void putword(Uint32 *disp, v8hi_t cx)\r
+{\r
+    _prefetch_data_write_l1(disp, sizeof(Uint32) * 8);\r
+    *(v8hi_t *)disp = cx;\r
+}\r
+\r
+\r
+\r
+static v8hi_t gpixel2cbuf(Uint32 addr, Uint32 mpage)\r
+{\r
+   Uint8 ret = 0;\r
+   register v8hi_t v;\r
+   register v8hi_t v1;\r
+   Uint8 *vram_p = vram_pb;\r
+   \r
+    v.i[0] = v.i[1] = v.i[2] = v.i[3] = 0;\r
+    if(!(mpage & 0x40)){\r
+        v.b[5] = vram_p[addr + 0x10000]; \r
+        v.b[4] = vram_p[addr + 0x12000]; \r
+        v.b[3] = vram_p[addr + 0x14000]; \r
+        v.b[2] = vram_p[addr + 0x16000]; \r
+        v.b[1] = vram_p[addr + 0x28000]; \r
+        v.b[0] = vram_p[addr + 0x2a000]; \r
+        v1 = lshift_6bit8v_SSE2(v);\r
+        return v1;\r
+    \r
+    } else {\r
+       register v8hi_t r;\r
+       r.v = (v8si){0, 0, 0, 0, 0, 0, 0, 0};\r
+       return r;\r
+   }\r
+}\r
+\r
+static v8hi_t rpixel2cbuf(Uint32 addr, Uint32 mpage)\r
+{\r
+   Uint8 ret = 0;\r
+   register v8hi_t v;\r
+   register v8hi_t v1;\r
+   Uint8 *vram_p = vram_pb;\r
+   \r
+    v.i[0] = v.i[1] = v.i[2] = v.i[3] = 0;\r
+    if(!(mpage & 0x20)){\r
+        v.b[5] = vram_p[addr + 0x08000]; \r
+        v.b[4] = vram_p[addr + 0x0a000]; \r
+        v.b[3] = vram_p[addr + 0x0c000]; \r
+        v.b[2] = vram_p[addr + 0x0e000]; \r
+        v.b[1] = vram_p[addr + 0x20000]; \r
+        v.b[0] = vram_p[addr + 0x22000]; \r
+        v1 = lshift_6bit8v_SSE2(v);\r
+        return v1;\r
+   } else {\r
+       register v8hi_t r;\r
+       r.v = (v8si){0, 0, 0, 0, 0, 0, 0, 0};\r
+       return r;\r
+   }\r
+}\r
+\r
+static v8hi_t bpixel2cbuf(Uint32 addr, Uint32 mpage)\r
+{\r
+   Uint8 ret = 0;\r
+   register v8hi_t v;\r
+   register v8hi_t v1;\r
+   Uint8 *vram_p = vram_pb;\r
+   \r
+    v.i[0] = v.i[1] = v.i[2] = v.i[3] = 0;\r
+    if(!(mpage & 0x10)){\r
+        v.b[5] = vram_p[addr + 0x00000]; \r
+        v.b[4] = vram_p[addr + 0x02000]; \r
+        v.b[3] = vram_p[addr + 0x04000]; \r
+        v.b[2] = vram_p[addr + 0x06000]; \r
+        v.b[1] = vram_p[addr + 0x18000]; \r
+        v.b[0] = vram_p[addr + 0x1a000]; \r
+\r
+        v1 = lshift_6bit8v_SSE2(v);\r
+//        v1.v <<= 16;\r
+        return v1;\r
+   } else {\r
+       register v8hi_t r;\r
+       r.vv = (v8ii){0, 0, 0, 0, 0, 0, 0, 0};\r
+       return r;\r
+   }\r
+}\r
+\r
+\r
+\r
+\r
+static v8hi_t getvram_256k(Uint32 addr, Uint32 mpage)\r
+{\r
+   register v8hi_t r, g, b;\r
+   v8hi_t a;\r
+   register v8hi_t dst;\r
+   /*\r
+     * R,G,Bについて8bit単位で描画する。\r
+     * 高速化…キャッシュヒット率の向上を考慮して、\r
+     * インライン展開と細かいループの廃止を同時に行う\r
+     */\r
+   \r
+   b = bpixel2cbuf(addr, mpage);\r
+   r = rpixel2cbuf(addr, mpage);\r
+   g = gpixel2cbuf(addr, mpage);\r
+#ifdef AG_LITTLE_ENDIAN\r
+   a.vv = (v8ii){0xff000000, 0xff000000, 0xff000000, 0xff000000, 0xff000000, 0xff000000, 0xff000000, 0xff000000};\r
+   dst.vv = (b.vv << 16 ) | (g.vv << 8) | r.vv | a.vv;\r
+#else   \r
+#endif\r
+   return dst;\r
+}\r
+\r
+\r
+\r
+/*\r
+ * 8x8のピースをVRAMから作成する:VramLockしない事に注意\r
+ */\r
+void CreateVirtualVram256k_1Pcs_SSE2(Uint32 *p, int x, int y, int pitch, int mpage)\r
+{\r
+    register v8hi_t c;\r
+    register Uint32 *disp = p;\r
+    register Uint32 addr;\r
+   \r
+    addr = y * 40 + x;\r
+    // Loop廃止(高速化)\r
+    \r
+    c = getvram_256k(addr, mpage);\r
+    putword((Uint32 *)disp, c);\r
+    addr += 40;\r
+    disp += pitch;\r
+\r
+    c = getvram_256k(addr, mpage);\r
+    putword((Uint32 *)disp, c);\r
+    addr += 40;\r
+    disp += pitch;\r
+\r
+    c = getvram_256k(addr, mpage);\r
+    putword((Uint32 *)disp, c);\r
+    addr += 40;\r
+    disp += pitch;\r
+\r
+    c = getvram_256k(addr, mpage);\r
+    putword((Uint32 *)disp, c);\r
+    addr += 40;\r
+    disp += pitch;\r
+\r
+    c = getvram_256k(addr, mpage);\r
+    putword((Uint32 *)disp, c);\r
+    addr += 40;\r
+    disp += pitch;\r
+\r
+    c = getvram_256k(addr, mpage);\r
+    putword((Uint32 *)disp, c);\r
+    addr += 40;\r
+    disp += pitch;\r
+\r
+    c = getvram_256k(addr, mpage);\r
+    putword((Uint32 *)disp, c);\r
+    addr += 40;\r
+    disp += pitch;\r
+\r
+    c = getvram_256k(addr, mpage);\r
+    putword((Uint32 *)disp, c);\r
+\r
+}\r
+\r
+void CreateVirtualVram256k_Line_SSE2(Uint32 *p, int ybegin, int yend, int mpage)\r
+{\r
+    register v8hi_t c;\r
+    register v8hi_t *disp;\r
+    register Uint32 addr;\r
+    int yy;\r
+    int xx;\r
+    const int pitch = sizeof(Uint32) * 8;\r
+    // Loop廃止(高速化)\r
+    if(aPlanes == NULL) {\r
+       c.v = (v8si){0,0,0,0,0,0,0,0};\r
+       for(yy = ybegin; yy < yend; yy++) {\r
+         addr = yy * 40;\r
+         disp = (v8hi_t *)((Uint8 *)p + (pitch * addr));\r
+         for(xx = 0; xx < (40 / 8); xx++) {\r
+            putword((Uint32 *)disp,  c);\r
+            disp++;\r
+            putword((Uint32 *)disp,  c);\r
+            disp++;\r
+            putword((Uint32 *)disp,  c);\r
+            disp++;\r
+            putword((Uint32 *)disp,  c);\r
+            disp++;\r
+            putword((Uint32 *)disp,  c);\r
+            disp++;\r
+            putword((Uint32 *)disp,  c);\r
+            disp++;\r
+            putword((Uint32 *)disp,  c);\r
+            disp++;\r
+            putword((Uint32 *)disp,  c);\r
+            disp++;\r
+         }\r
+       }\r
+    } else {\r
+       for(yy = ybegin; yy < yend; yy++) {\r
+         addr = yy * 40;\r
+         disp = (v8hi_t *)((Uint8 *)p + (pitch * addr));\r
+         for(xx = 0; xx < (40 / 8); xx++) {\r
+            c = getvram_256k(addr, mpage);\r
+            putword((Uint32 *)disp,  c);\r
+            disp++;\r
+            addr++;\r
+            c = getvram_256k(addr, mpage);\r
+            putword((Uint32 *)disp,  c);\r
+            disp++;\r
+            addr++;\r
+            c = getvram_256k(addr, mpage);\r
+            putword((Uint32 *)disp,  c);\r
+            disp++;\r
+            addr++;\r
+            c = getvram_256k(addr, mpage);\r
+            putword((Uint32 *)disp,  c);\r
+            disp++;\r
+            addr++;\r
+            c = getvram_256k(addr, mpage);\r
+            putword((Uint32 *)disp,  c);\r
+            disp++;\r
+            addr++;\r
+            c = getvram_256k(addr, mpage);\r
+            putword((Uint32 *)disp,  c);\r
+            disp++;\r
+            addr++;\r
+            c = getvram_256k(addr, mpage);\r
+            putword((Uint32 *)disp,  c);\r
+            disp++;\r
+            addr++;\r
+            c = getvram_256k(addr, mpage);\r
+            putword((Uint32 *)disp,  c);\r
+            disp++;\r
+            addr++;\r
+         }\r
+       }\r
+    } \r
+}\r
+\r
+/*\r
+ * Windowed entry point of the SSE2 256k converter.  The column window\r
+ * (xbegin, xend) is ignored: the complete lines [ybegin, yend) are converted\r
+ * by CreateVirtualVram256k_Line_SSE2().\r
+ */\r
+void CreateVirtualVram256k_WindowedLine_SSE2(Uint32 *p, int ybegin, int yend, int xbegin, int xend, int mpage)\r
+{\r
+   (void)xbegin;\r
+   (void)xend;\r
+   CreateVirtualVram256k_Line_SSE2(p, ybegin, yend, mpage);\r
+}\r
+\r
+/*\r
+ * Function table of the SSE2 256k converters, listed in the order\r
+ * 1Pcs, Line, WindowedLine.\r
+ */\r
+Api_Vram_FuncList api_vram256k_sse2 = {\r
+   CreateVirtualVram256k_1Pcs_SSE2,\r
+   CreateVirtualVram256k_Line_SSE2,\r
+   CreateVirtualVram256k_WindowedLine_SSE2\r
+};\r
diff --git a/source/src/agar/fm7/vram/sse2/api_vram4096.c b/source/src/agar/fm7/vram/sse2/api_vram4096.c
new file mode 100644 (file)
index 0000000..e7d8ca7
--- /dev/null
@@ -0,0 +1,337 @@
+/*\r
+ * api_vram4096.c\r
+ * Convert VRAM -> VirtualVram\r
+ * (C) 2011 K.Ohta <whatisthis.sowhat@gmail.com>\r
+ */\r
+\r
+\r
+#include "api_draw.h"\r
+#include "api_vram.h"\r
+#include "cache_wrapper.h"\r
+\r
+//Uint8 *vram_pb;\r
+//Uint8 *vram_pr;\r
+//Uint8 *vram_pg;\r
+\r
+\r
+\r
+/*\r
+ * Translate the eight colour indices held in cbuf through rgbAnalogGDI and\r
+ * store the eight resulting words at disp.\r
+ */\r
+static inline void putword2_vec(Uint32 *disp, v8hi_t cbuf)\r
+{\r
+    v8hi_t *out = (v8hi_t *)disp;\r
+    int lane = 0;\r
+\r
+    /* Hint the destination (8 * sizeof(Uint32) bytes) before writing it. */\r
+    _prefetch_data_write_l1(disp, sizeof(Uint32) * 8);\r
+    while(lane < 8) {\r
+        out->i[lane] = rgbAnalogGDI[cbuf.i[lane]];\r
+        lane++;\r
+    }\r
+}\r
+\r
+static inline v8hi_t getvram_4096_vec(Uint32 addr)\r
+{\r
+    v8hi_t cbuf;\r
+    uint8_t r0, r1, r2, r3;\r
+    uint8_t g0, g1, g2, g3;\r
+    uint8_t b0, b1, b2, b3;\r
+        /*\r
+         * R,G,Bについて8bit単位で描画する。\r
+         * 高速化…キャッシュヒット率の向上とVector演算(MMXetc)の速度効果を考慮して、\r
+         * ループの廃止を同時に行う\r
+         */\r
+    g3 = vram_pg[addr + 0x00000];\r
+    g2 = vram_pg[addr + 0x02000];\r
+    g1 = vram_pg[addr + 0x04000];\r
+    g0 = vram_pg[addr + 0x06000];\r
+    cbuf.v = \r
+        aPlanes[G0 + g0] |\r
+        aPlanes[G1 + g1] |\r
+        aPlanes[G2 + g2] |\r
+        aPlanes[G3 + g3] ;\r
+\r
+   \r
+    r3 = vram_pr[addr + 0x00000];\r
+    r2 = vram_pr[addr + 0x02000];\r
+    r1 = vram_pr[addr + 0x04000];\r
+    r0 = vram_pr[addr + 0x06000];\r
+    cbuf.v = cbuf.v |\r
+        aPlanes[R0 + r0] |\r
+        aPlanes[R1 + r1] |\r
+        aPlanes[R2 + r2] |\r
+        aPlanes[R3 + r3] ;\r
+\r
+    b3 = vram_pb[addr + 0x00000];\r
+    b2 = vram_pb[addr + 0x02000];\r
+    b1 = vram_pb[addr + 0x04000];\r
+    b0 = vram_pb[addr + 0x06000];\r
+    cbuf.v = cbuf.v |\r
+        aPlanes[B0 + b0] |\r
+        aPlanes[B1 + b1] |\r
+        aPlanes[B2 + b2] |\r
+        aPlanes[B3 + b3] ;\r
+   return cbuf;\r
+}\r
+\r
+/*\r
+ * 8x8のピースをVRAMから作成する:VramLockしない事に注意\r
+ */\r
+void CreateVirtualVram4096_1Pcs_SSE2(Uint32 *p, int x, int y, int pitch, int mode)\r
+{\r
+//    Uint32 c[8];\r
+    register v8hi_t c;\r
+    Uint32 *disp = p;\r
+    Uint32 addr;\r
+    register int i;\r
+\r
+//   for(i = 0; i < 4096; i++) __builtin_prefetch(&rgbAnalogGDI[i], 0, 0);\r
+    addr = y * 40 + x;\r
+    // Loop廃止(高速化)\r
+    if(aPlanes == NULL) {\r
+       c.v = (v8si){0,0,0,0,0,0,0,0};\r
+       putword2_vec((Uint32 *)disp,  c);\r
+       disp += pitch;\r
+       putword2_vec((Uint32 *)disp,  c);\r
+       disp += pitch;\r
+       putword2_vec((Uint32 *)disp,  c);\r
+       disp += pitch;\r
+       putword2_vec((Uint32 *)disp,  c);\r
+       disp += pitch;\r
+       putword2_vec((Uint32 *)disp,  c);\r
+       disp += pitch;\r
+       putword2_vec((Uint32 *)disp,  c);\r
+       disp += pitch;\r
+       putword2_vec((Uint32 *)disp,  c);\r
+       disp += pitch;\r
+       putword2_vec((Uint32 *)disp,  c);\r
+//       disp += pitch;\r
+    } else {\r
+       c = getvram_4096_vec(addr);\r
+       putword2_vec((Uint32 *)disp,  c);\r
+       addr += 40;\r
+       disp += pitch;\r
+\r
+       c = getvram_4096_vec(addr);\r
+       putword2_vec((Uint32 *)disp,  c);\r
+       addr += 40;\r
+       disp += pitch;\r
+\r
+       c = getvram_4096_vec(addr);\r
+       putword2_vec((Uint32 *)disp,  c);\r
+       addr += 40;\r
+       disp += pitch;\r
+\r
+       c = getvram_4096_vec(addr);\r
+       putword2_vec((Uint32 *)disp,  c);\r
+       addr += 40;\r
+       disp += pitch;\r
+       \r
+       c = getvram_4096_vec(addr);\r
+       putword2_vec((Uint32 *)disp,  c);\r
+       addr += 40;\r
+       disp += pitch;\r
+       \r
+       c = getvram_4096_vec(addr);\r
+       putword2_vec((Uint32 *)disp,  c);\r
+       addr += 40;\r
+       disp += pitch;\r
+       \r
+       c = getvram_4096_vec(addr);\r
+       putword2_vec((Uint32 *)disp,  c);\r
+       addr += 40;\r
+       disp += pitch;\r
+       \r
+       c = getvram_4096_vec(addr);\r
+       putword2_vec((Uint32 *)disp,  c);\r
+    }\r
+   \r
+}\r
+   \r
+\r
+/*\r
+ * 1LineのピースをVRAMから作成する:VramLockしない事に注意\r
+ */\r
+void CreateVirtualVram4096_Line_SSE2(Uint32 *p, int ybegin, int yend, int mode)\r
+{\r
+//    Uint32 c[8];\r
+    register v8hi_t c;\r
+    Uint8 *disp;\r
+    Uint32 addr;\r
+    int yy;\r
+    int xx;\r
+    const int pitch = sizeof(Uint32) * 8;\r
+    int i;\r
+\r
+//    for(i = 0; i < 4096; i++) __builtin_prefetch(&rgbAnalogGDI[i], 0, 0);\r
+    // Loop廃止(高速化)\r
+    if(aPlanes == NULL) {\r
+       c.v = (v8si){0,0,0,0,0,0,0,0};\r
+       for(yy = ybegin; yy < yend; yy++) {\r
+         addr = yy * 40;\r
+         disp = (Uint8 *)p + (pitch * addr);\r
+         for(xx = 0; xx < (40 / 8); xx++) {\r
+            putword2_vec((Uint32 *)disp,  c);\r
+            disp +=  pitch;\r
+            putword2_vec((Uint32 *)disp,  c);\r
+            disp +=  pitch;\r
+            putword2_vec((Uint32 *)disp,  c);\r
+            disp +=  pitch;\r
+            putword2_vec((Uint32 *)disp,  c);\r
+            disp +=  pitch;\r
+            putword2_vec((Uint32 *)disp,  c);\r
+            disp +=  pitch;\r
+            putword2_vec((Uint32 *)disp,  c);\r
+            disp +=  pitch;\r
+            putword2_vec((Uint32 *)disp,  c);\r
+            disp +=  pitch;\r
+            putword2_vec((Uint32 *)disp,  c);\r
+            disp +=  pitch;\r
+         }\r
+       }\r
+    } else {\r
+       for(yy = ybegin; yy < yend; yy++) {\r
+         addr = yy * 40;\r
+         disp = (Uint8 *)p + (pitch * addr);\r
+         for(xx = 0; xx < (40 / 8); xx++) {\r
+            c = getvram_4096_vec(addr);\r
+            putword2_vec((Uint32 *)disp,  c);\r
+            disp +=  pitch;\r
+            addr++;\r
+            c = getvram_4096_vec(addr);\r
+            putword2_vec((Uint32 *)disp,  c);\r
+            disp +=  pitch;\r
+            addr++;\r
+            c = getvram_4096_vec(addr);\r
+            putword2_vec((Uint32 *)disp,  c);\r
+            disp +=  pitch;\r
+            addr++;\r
+            c = getvram_4096_vec(addr);\r
+            putword2_vec((Uint32 *)disp,  c);\r
+            disp +=  pitch;\r
+            addr++;\r
+            c = getvram_4096_vec(addr);\r
+            putword2_vec((Uint32 *)disp,  c);\r
+            disp +=  pitch;\r
+            addr++;\r
+            c = getvram_4096_vec(addr);\r
+            putword2_vec((Uint32 *)disp,  c);\r
+            disp +=  pitch;\r
+            addr++;\r
+            c = getvram_4096_vec(addr);\r
+            putword2_vec((Uint32 *)disp,  c);\r
+            disp +=  pitch;\r
+            addr++;\r
+            c = getvram_4096_vec(addr);\r
+            putword2_vec((Uint32 *)disp,  c);\r
+            disp +=  pitch;\r
+            addr++;\r
+         }\r
+       }\r
+    } \r
+}\r
+\r
+/*\r
+ * ybegin - yendの行を変換する\r
+ */\r
+void CreateVirtualVram4096_WindowedLine_SSE2(Uint32 *p, int ybegin, int yend, int xbegin, int xend, int mode)\r
+{\r
+#if (__GNUC__ >= 4)   \r
+    register v8hi_t c;\r
+    Uint8 *disp =(Uint8 *) p;\r
+    Uint32 addr;\r
+    int pitch;\r
+    int xx;\r
+    int yy;\r
+   \r
+    if(p == NULL) return;\r
+    pitch = sizeof(Uint32) * 8;\r
+\r
+    // Loop廃止(高速化)\r
+    if(aPlanes == NULL) {\r
+       c.v = (v8si){0,0,0,0,0,0,0,0};\r
+       for(yy = ybegin; yy < yend; yy++) { \r
+           addr = yy * 40 + xbegin;\r
+          disp = (Uint8 *)(&p[yy * 320 + xbegin]);\r
+          for(xx = xbegin; xx < xend; xx ++) { \r
+             putword2_vec((Uint32 *)disp, c);\r
+             disp += pitch;\r
+          }\r
+       }\r
+       return;\r
+     } else {\r
+       int xs =  (xend - xbegin) / 8;\r
+       int xs2 = (xend - xbegin) % 8;\r
+       int xx2;\r
+       for(yy = ybegin; yy < yend; yy++) { \r
+           addr = yy * 40 + xbegin;\r
+          disp = (Uint8 *)(&p[yy * 320 + xbegin]);\r
+          xx = xbegin;\r
+          for(xx2 = 0; xx2 < xs; xx2++) {\r
+             c = getvram_4096_vec(addr);\r
+             putword2_vec((Uint32 *)disp, c);\r
+             addr++;\r
+             disp += pitch;\r
+             c = getvram_4096_vec(addr);\r
+             putword2_vec((Uint32 *)disp, c);\r
+             addr++;\r
+             disp += pitch;\r
+             c = getvram_4096_vec(addr);\r
+             putword2_vec((Uint32 *)disp, c);\r
+             addr++;\r
+             disp += pitch;\r
+             c = getvram_4096_vec(addr);\r
+             putword2_vec((Uint32 *)disp, c);\r
+             addr++;\r
+             disp += pitch;\r
+             c = getvram_4096_vec(addr);\r
+             putword2_vec((Uint32 *)disp, c);\r
+             addr++;\r
+             disp += pitch;\r
+             c = getvram_4096_vec(addr);\r
+             putword2_vec((Uint32 *)disp, c);\r
+             addr++;\r
+             disp += pitch;\r
+             c = getvram_4096_vec(addr);\r
+             putword2_vec((Uint32 *)disp, c);\r
+             addr++;\r
+             disp += pitch;\r
+             c = getvram_4096_vec(addr);\r
+             putword2_vec((Uint32 *)disp, c);\r
+             addr++;\r
+             disp += pitch;\r
+             \r
+             xx += 8;\r
+          }\r
+          if(xs2 <= 0) continue;\r
+          \r
+          for(;xx < xend; xx++) { \r
+             c = getvram_4096_vec(addr);\r
+             putword2_vec((Uint32 *)disp, c);\r
+             addr++;\r
+             disp += pitch;\r
+          }\r
+       }\r
+       return;\r
+     }\r
+ #else \r
+    Uint32 c[8];\r
+    Uint32 *pal = (Uint32 *)rgbTTLGDI;\r
+    Uint8 *disp =(Uint8 *) p;\r
+    int xx;\r
+    int yy;\r
+\r
+    if((p == NULL) || (pal == NULL)) return;\r
+    pitch = sizeof(Uint32) * 8;\r
+    for(yy = ybegin; yy < yend; yy++) {  \r
+      addr = y * 40 + xbegin;\r
+      disp = (Uint8 *)(&p[yy * 320 + xbegin]);\r
+      for(xx = xbegin; xx < xend; xx++) {\r
+        getvram_8(addr, c);\r
+        putword8((Uint32 *)disp, c, pal);\r
+        addr++;\r
+        disp += pitch;\r
+      }\r
+   }\r
+#endif   \r
+}\r
+\r
+/*\r
+ * Function table for the SSE2 4096-colour converters, filled positionally\r
+ * with the 1Pcs, Line and WindowedLine entry points (in that order).\r
+ */\r
+Api_Vram_FuncList api_vram4096_sse2 = {\r
+   CreateVirtualVram4096_1Pcs_SSE2,\r
+   CreateVirtualVram4096_Line_SSE2,\r
+   CreateVirtualVram4096_WindowedLine_SSE2\r
+};\r
diff --git a/source/src/agar/fm7/vram/sse2/api_vram8.c b/source/src/agar/fm7/vram/sse2/api_vram8.c
new file mode 100644 (file)
index 0000000..9804e90
--- /dev/null
@@ -0,0 +1,779 @@
+/*\r
+ * api_vram8.c\r
+ * Convert VRAM -> VirtualVram\r
+ * (C) 2011 K.Ohta <whatisthis.sowhat@gmail.com>\r
+ */\r
+\r
+\r
+#include "api_draw.h"\r
+#include "api_vram.h"\r
+#include "sdl_cpuid.h"\r
+#include "cache_wrapper.h"\r
+\r
+extern void CreateVirtualVram8_WindowedLine(Uint32 *p, int ybegin, int yend, int xbegin, int xend, int mode);\r
+\r
+\r
+#if (__GNUC__ >= 4)\r
+\r
+static inline v8hi_t getvram_8_vec(Uint32 addr)\r
+{\r
+    register uint8_t r, g, b;\r
+    v8hi_t ret;\r
+//    volatile v4hi cbuf __attribute__((aligned(32)));\r
+        /*\r
+         * R,G,Bについて8bit単位で描画する。\r
+         * 高速化…キャッシュヒット率の向上とVector演算(MMXetc)の速度効果を考慮して、\r
+         * ループの廃止を同時に行う\r
+         */\r
+    g = vram_pg[addr];\r
+    r = vram_pr[addr];\r
+    b = vram_pb[addr];\r
+\r
+   ret.v   = aPlanes[B0 + b] |\r
+              aPlanes[B1 + r] |\r
+              aPlanes[B2 + g];\r
+   return ret;\r
+}\r
+\r
+/*\r
+ * Map the eight colour indices in c through the palette pal and store the\r
+ * eight resulting words at disp. Each index is masked to 0..7 first.\r
+ * Returns without writing when pal or disp is NULL.\r
+ *\r
+ * x86_64 uses hand-written SSE2; other targets use the scalar loop below.\r
+ * Change from the previous revision: the asm statement now lists "memory"\r
+ * as clobbered, because it reads c and pal and writes disp through pointer\r
+ * inputs the compiler cannot otherwise see being accessed.\r
+ */\r
+static void  putword8_vec(Uint32 *disp, v8hi_t c, Uint32 *pal)\r
+{\r
+   // recommend -finline-loop\r
+#ifdef __x86_64__\r
+   if((pal == NULL) || (disp == NULL))return;\r
+   /* NOTE(review): movdqa requires &c to be 16-byte aligned - presumably\r
+    * guaranteed by the v8hi_t type; confirm. */\r
+   asm ("movq %[c], %%r8\n\t"\r
+       "movdqa  0(%%r8), %%xmm0\n\t"\r
+       "movdqa 16(%%r8), %%xmm1\n\t"\r
+       "movq %[pal], %%r8\n\t"\r
+       "movq %[disp], %%rdi\n\t"\r
+       /* broadcast 7 and mask every index to 0..7 */\r
+       "movl $7, %%r9d\n\t"\r
+       "movd %%r9d, %%xmm2\n\t"\r
+       "pshufd $0b00000000, %%xmm2, %%xmm2\n\t"\r
+       "pand %%xmm2, %%xmm0\n\t"\r
+       "pand %%xmm2, %%xmm1\n\t"\r
+       /* reverse the four lanes: the shift-and-OR sequence below then\r
+        * rebuilds them in the original order */\r
+       "pshufd $0b00011011, %%xmm0, %%xmm0\n\t"\r
+       "pshufd $0b00011011, %%xmm1, %%xmm1\n\t"\r
+       \r
+       /* indices 0..3 -> xmm3 -> disp[0..3] */\r
+       "movd %%xmm0, %%r9d\n\t"\r
+       "movd 0(%%r8, %%r9, 4), %%xmm2\n\t"\r
+       "psrldq $4, %%xmm0\n\t"\r
+       "movdqa %%xmm2, %%xmm3\n\t"\r
+       "pslldq $4, %%xmm3\n\t"\r
+       \r
+       "movd %%xmm0, %%r9d\n\t"\r
+       "movd 0(%%r8, %%r9, 4), %%xmm2\n\t"\r
+       "psrldq $4, %%xmm0\n\t"\r
+       "por    %%xmm2, %%xmm3\n\t"\r
+       "pslldq $4, %%xmm3\n\t"\r
+\r
+       "movd %%xmm0, %%r9d\n\t"\r
+       "movd 0(%%r8, %%r9, 4), %%xmm2\n\t"\r
+       "psrldq $4, %%xmm0\n\t"\r
+       "por    %%xmm2, %%xmm3\n\t"\r
+       "pslldq $4, %%xmm3\n\t"\r
+\r
+       "movd %%xmm0, %%r9d\n\t"\r
+       "movd 0(%%r8, %%r9, 4), %%xmm2\n\t"\r
+       "/* psrldq $4, %%xmm0 */\n\t"\r
+       "por    %%xmm2, %%xmm3\n\t"\r
+       "/* pslldq $4, %%xmm3 */\n\t"\r
+       "movdqu %%xmm3, 0(%%rdi)\n\t"\r
+       \r
+       /* indices 4..7 -> xmm5 -> disp[4..7] */\r
+       "movd %%xmm1, %%r10d\n\t"\r
+       "movd 0(%%r8, %%r10, 4), %%xmm4\n\t"\r
+       "psrldq $4, %%xmm1\n\t"\r
+       "movdqa %%xmm4, %%xmm5\n\t"\r
+       "pslldq $4, %%xmm5\n\t"\r
+       \r
+       "movd %%xmm1, %%r10d\n\t"\r
+       "movd 0(%%r8, %%r10, 4), %%xmm4\n\t"\r
+       "psrldq $4, %%xmm1\n\t"\r
+       "por    %%xmm4, %%xmm5\n\t"\r
+       "pslldq $4, %%xmm5\n\t"\r
+\r
+       "movd %%xmm1, %%r10d\n\t"\r
+       "movd 0(%%r8, %%r10, 4), %%xmm4\n\t"\r
+       "psrldq $4, %%xmm1\n\t"\r
+       "por    %%xmm4, %%xmm5\n\t"\r
+       "pslldq $4, %%xmm5\n\t"\r
+\r
+       "movd %%xmm1, %%r10d\n\t"\r
+       "movd 0(%%r8, %%r10, 4), %%xmm4\n\t"\r
+       "/* psrldq $4, %%xmm1 */\n\t"\r
+       "por    %%xmm4, %%xmm5\n\t"\r
+       "/* pslldq $4, %%xmm5 */\n\t"\r
+       "movdqu %%xmm5, 16(%%rdi)\n\t"\r
+       :\r
+       : [c] "rm" (&c), [disp] "rm" (disp), [pal] "rm" (pal)\r
+       : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",\r
+         "r8", "r9", "r10", "rdi", "memory");\r
+#else\r
+   v8hi_t *p = (v8hi_t *)disp;\r
+   v8hi_t tmp;\r
+   int j;\r
+   if((pal == NULL) || (disp == NULL))return;\r
+   c.vv &= (v8ii){7, 7, 7, 7, 7, 7, 7, 7,};\r
+   for(j = 0; j < 8; j++) {\r
+      tmp.i[j] = pal[c.i[j]];\r
+   }\r
+   *p = tmp;\r
+#endif   \r
+}\r
+\r
+\r
+/*\r
+ * Fused fetch + palette store for one VRAM byte cell: reads the G, R and B\r
+ * bytes at addr, ORs the three matching aPlanes entries, masks every index\r
+ * to 0..7, maps it through pal and stores eight words at disp.\r
+ * Returns without writing when pal or disp is NULL.\r
+ *\r
+ * Changes from the previous revision (x86_64 asm only):\r
+ *  - "rax" is now listed as clobbered: the asm overwrites %eax;\r
+ *  - "memory" is now listed as clobbered: the asm reads VRAM, aPlanes and\r
+ *    pal and writes disp through pointer inputs.\r
+ */\r
+static  void getputvram_8_vec(Uint32 addr, Uint32 *disp, Uint32 *pal)\r
+{\r
+#ifdef __x86_64__\r
+   if((pal == NULL) || (disp == NULL)) return;\r
+   asm (\r
+       "movq %[vram_pg], %%r9\n\t"\r
+       "movq %[vram_pr], %%r10\n\t"\r
+       "movq %[vram_pb], %%r11\n\t"\r
+                \r
+       /* r13 = blue byte, r14 = red byte, r15 = green byte */\r
+       "movb 0(%%r11), %%r13b\n\t"\r
+       "movb 0(%%r10), %%r14b\n\t"\r
+       "movb 0(%%r9), %%r15b\n\t"\r
+       "andq $0xff, %%r13\n\t"\r
+       "andq $0xff, %%r14\n\t"\r
+       "andq $0xff, %%r15\n\t"\r
+       /* scale to byte offsets of 32-byte table entries; the added\r
+        * constants select entries 256 and 512 - presumably B1 and B2 as\r
+        * used by the C fallback below; verify against their definitions */\r
+       "shlq $5, %%r13\n\t"\r
+       "shlq $5, %%r14\n\t"\r
+       "shlq $5, %%r15\n\t"\r
+       "addq $0x2000, %%r14 /* 256 * 32 */\n\t"\r
+       "addq $0x4000, %%r15 /* 512 * 32 */\n\t"\r
+       \r
+       "movq %[pal], %%r8\n\t"\r
+       "movq %[disp], %%rdi\n\t"\r
+       "movq %[aPlanes], %%r12\n\t"\r
+       \r
+       /* xmm0 = low half, xmm1 = high half of the OR-ed table entries */\r
+       "movdqa 0(%%r12, %%r13), %%xmm0\n\t"\r
+       "movdqa 0(%%r12, %%r14), %%xmm1\n\t"\r
+       "movdqa 0(%%r12, %%r15), %%xmm2\n\t"\r
+       "por %%xmm1, %%xmm0\n\t"\r
+       "por %%xmm2, %%xmm0\n\t"\r
+\r
+       "movdqa 16(%%r12, %%r13), %%xmm1\n\t"\r
+       "movdqa 16(%%r12, %%r14), %%xmm4\n\t"\r
+       "movdqa 16(%%r12, %%r15), %%xmm5\n\t"\r
+       "por %%xmm4, %%xmm1\n\t"\r
+       "por %%xmm5, %%xmm1\n\t"\r
+       \r
+       /* mask every index to 0..7, then reverse the lanes so the\r
+        * shift-and-OR sequence below rebuilds them in order */\r
+       "movl $0x07, %%eax\n\t"\r
+       "movd %%eax, %%xmm2\n\t"\r
+       "pshufd $0b00000000, %%xmm2, %%xmm2\n\t"\r
+       "pand %%xmm2, %%xmm0\n\t"\r
+       "pand %%xmm2, %%xmm1\n\t"\r
+       "pshufd $0b00011011, %%xmm0, %%xmm0\n\t"\r
+       "pshufd $0b00011011, %%xmm1, %%xmm1\n\t"\r
+       "pxor %%xmm2, %%xmm2\n\t"\r
+       "pxor %%xmm4, %%xmm4\n\t"\r
+\r
+       /* indices 0..3 -> xmm3 -> disp[0..3] */\r
+       "movd %%xmm0, %%r9d\n\t"\r
+       "movd 0(%%r8, %%r9, 4), %%xmm2\n\t"\r
+       "psrldq $4, %%xmm0\n\t"\r
+       "movdqa %%xmm2, %%xmm3\n\t"\r
+       "pslldq $4, %%xmm3\n\t"\r
+       \r
+       "movd %%xmm0, %%r9d\n\t"\r
+       "movd 0(%%r8, %%r9, 4), %%xmm2\n\t"\r
+       "psrldq $4, %%xmm0\n\t"\r
+       "por    %%xmm2, %%xmm3\n\t"\r
+       "pslldq $4, %%xmm3\n\t"\r
+\r
+       "movd %%xmm0, %%r9d\n\t"\r
+       "movd 0(%%r8, %%r9, 4), %%xmm2\n\t"\r
+       "psrldq $4, %%xmm0\n\t"\r
+       "por    %%xmm2, %%xmm3\n\t"\r
+       "pslldq $4, %%xmm3\n\t"\r
+\r
+       "movd %%xmm0, %%r9d\n\t"\r
+       "movd 0(%%r8, %%r9, 4), %%xmm2\n\t"\r
+       "por    %%xmm2, %%xmm3\n\t"\r
+       "movdqu %%xmm3, 0(%%rdi)\n\t"\r
+       \r
+       /* indices 4..7 -> xmm5 -> disp[4..7] */\r
+       "movd %%xmm1, %%r10d\n\t"\r
+       "movd 0(%%r8, %%r10, 4), %%xmm4\n\t"\r
+       "psrldq $4, %%xmm1\n\t"\r
+       "movdqa %%xmm4, %%xmm5\n\t"\r
+       "pslldq $4, %%xmm5\n\t"\r
+       \r
+       "movd %%xmm1, %%r10d\n\t"\r
+       "movd 0(%%r8, %%r10, 4), %%xmm4\n\t"\r
+       "psrldq $4, %%xmm1\n\t"\r
+       "por    %%xmm4, %%xmm5\n\t"\r
+       "pslldq $4, %%xmm5\n\t"\r
+\r
+       "movd %%xmm1, %%r10d\n\t"\r
+       "movd 0(%%r8, %%r10, 4), %%xmm4\n\t"\r
+       "psrldq $4, %%xmm1\n\t"\r
+       "por    %%xmm4, %%xmm5\n\t"\r
+       "pslldq $4, %%xmm5\n\t"\r
+\r
+       "movd %%xmm1, %%r10d\n\t"\r
+       "movd 0(%%r8, %%r10, 4), %%xmm4\n\t"\r
+       "por    %%xmm4, %%xmm5\n\t"\r
+       "movdqu %%xmm5, 16(%%rdi)\n\t"\r
+       :\r
+       : [aPlanes] "rm" (aPlanes),\r
+         [disp] "rm" (disp),     [pal] "rm" (pal), \r
+         [vram_pg] "rm" (&vram_pg[addr]), [vram_pr] "rm" (&vram_pr[addr]), [vram_pb] "rm" (&vram_pb[addr])\r
+       : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",\r
+         "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",\r
+         "rdi", "rax", "memory");\r
+#else\r
+   v8hi_t c;\r
+   register uint8_t g, r, b;\r
+   int j;\r
+   v8hi_t *p = (v8hi_t *)disp;\r
+   v8hi_t tmp;\r
+   \r
+   if((pal == NULL) || (p == NULL))return;\r
+   \r
+   g = vram_pg[addr];\r
+   r = vram_pr[addr];\r
+   b = vram_pb[addr];\r
+\r
+   c.v   = aPlanes[B0 + b] |\r
+           aPlanes[B1 + r] |\r
+           aPlanes[B2 + g];\r
+   c.vv &= (v8ii){7, 7, 7, 7, 7, 7, 7, 7,};\r
+   for(j = 0; j < 8; j++) {\r
+      tmp.i[j] = pal[c.i[j]];\r
+   }\r
+   *p = tmp;\r
+#endif\r
+}\r
+\r
+\r
+\r
+\r
+#else\r
+static inline void planeto8(Uint32 *c, uint8_t r, unit8_t g, uint8_t b)\r
+{\r
+   Uint8 mask;\r
+   \r
+   mask = 0x80;\r
+   c[0] = ((r & mask) >> 6) | ((g & mask) >> 5) || ((b & mask) >> 7);\r
+   mask >>= 1;\r
+   c[1] = ((r & mask) >> 5) | ((g & mask) >> 4) || ((b & mask) >> 6);\r
+   mask >>= 1;\r
+   c[2] = ((r & mask) >> 4) | ((g & mask) >> 3) || ((b & mask) >> 5);\r
+   mask >>= 1;\r
+   c[3] = ((r & mask) >> 3) | ((g & mask) >> 2) || ((b & mask) >> 4);\r
+   mask >>= 1;\r
+   c[4] = ((r & mask) >> 2) | ((g & mask) >> 1) || ((b & mask) >> 3);\r
+   mask >>= 1;\r
+   c[5] = ((r & mask) >> 1) | (g & mask) || ((b & mask) >> 2);\r
+   mask >>= 1;\r
+   c[6] = (r & mask) | ((g & mask) << 1) || ((b & mask) >> 1);\r
+   mask >>= 1;\r
+   c[7] = ((r & mask) << 1) | ((g & mask) << 2) || (b & mask);\r
+   mask >>= 1;\r
+}\r
+\r
+/*\r
+ * Read the R, G and B bytes at addr and expand them into the eight colour\r
+ * indices of cbuf via planeto8(). A whole byte (8 pixels) is handled per\r
+ * call instead of looping per pixel.\r
+ */\r
+static void getvram_8(Uint32 addr, Uint32 *cbuf)\r
+{\r
+   const uint8_t green = vram_pg[addr];\r
+   const uint8_t red   = vram_pr[addr];\r
+   const uint8_t blue  = vram_pb[addr];\r
+\r
+   planeto8(cbuf, red, green, blue);\r
+}\r
+\r
+static inline void  putword8(Uint32 *disp, Uint32 *c, Uint32 *pal)\r
+{\r
+\r
+   Uint32 *r1 = disp;\r
+\r
+   r1[0] = pal[c[0] & 7]; // ?!\r
+   r1[1] = pal[c[1] & 7];\r
+   r1[2] = pal[c[2] & 7];\r
+   r1[3] = pal[c[3] & 7];\r
+   r1[4] = pal[c[4] & 7];\r
+   r1[5] = pal[c[5] & 7];\r
+   r1[6] = pal[c[6] & 7];\r
+   r1[7] = pal[c[7] & 7];\r
+}\r
+\r
+#endif // __GNUC__ >= 4\r
+\r
+\r
+\r
+/*\r
+ * 8x8のピースをVRAMから作成する:VramLockしない事に注意\r
+ */\r
+void CreateVirtualVram8_1Pcs_SSE2(Uint32 *p, int x, int y, int pitch, int mode)\r
+{\r
+#if (__GNUC__ >= 4)   \r
+    register v8hi_t c;\r
+    Uint32 *pal = (Uint32 *)rgbTTLGDI;\r
+    register v8hi_t *disp =(v8hi_t *) p;\r
+    register Uint32 addr;\r
+    register int i;\r
+    pitch = pitch / (sizeof(v8hi_t) / sizeof(Uint32));\r
+\r
+    if((p == NULL) || (pal == NULL)) return;\r
+//    for(i = 0; i < 8; i++) __builtin_prefetch(&pal[i], 0, 0); // パレットテーブルをキャッシュに読み込ませておく\r
+    addr = y * 80 + x;\r
+    // Loop廃止(高速化)\r
+    if(aPlanes == NULL) {\r
+       c.v = (v8si){0,0,0,0,0,0,0,0};\r
+       putword8_vec((Uint32 *)disp,  c, pal);\r
+       disp += pitch;\r
+\r
+       putword8_vec((Uint32 *)disp,  c, pal);\r
+       disp += pitch;\r
+\r
+       putword8_vec((Uint32 *)disp,  c, pal);\r
+       disp += pitch;\r
+\r
+       putword8_vec((Uint32 *)disp,  c, pal);\r
+       disp += pitch;\r
+\r
+       putword8_vec((Uint32 *)disp,  c, pal);\r
+       disp += pitch;\r
+\r
+       putword8_vec((Uint32 *)disp,  c, pal);\r
+       disp += pitch;\r
+\r
+       putword8_vec((Uint32 *)disp,  c, pal);\r
+       disp += pitch;\r
+       putword8_vec((Uint32 *)disp,  c, pal);\r
+//       disp++;\r
+       return;\r
+     } else {\r
+#if 0\r
+       c = getvram_8_vec(addr);\r
+       putword8_vec((Uint32 *)disp, c, pal);\r
+       addr += 80;\r
+       disp += pitch;\r
+\r
+       c = getvram_8_vec(addr);\r
+       putword8_vec((Uint32 *)disp,  c, pal);\r
+       addr += 80;\r
+       disp += pitch;\r
+\r
+       c = getvram_8_vec(addr);\r
+       putword8_vec((Uint32 *)disp,  c, pal);\r
+       addr += 80;\r
+       disp += pitch;\r
+\r
+       c = getvram_8_vec(addr);\r
+       putword8_vec((Uint32 *)disp,  c, pal);\r
+       addr += 80;\r
+       disp += pitch;\r
+\r
+       c = getvram_8_vec(addr);\r
+       putword8_vec((Uint32 *)disp,  c, pal);\r
+       addr += 80;\r
+       disp += pitch;\r
+\r
+       c = getvram_8_vec(addr);\r
+       putword8_vec((Uint32 *)disp,  c, pal);\r
+       addr += 80;\r
+       disp += pitch;\r
+\r
+       c = getvram_8_vec(addr);\r
+       putword8_vec((Uint32 *)disp,  c, pal);\r
+       addr += 80;\r
+       disp += pitch;\r
+\r
+       c = getvram_8_vec(addr);\r
+       putword8_vec((Uint32 *)disp,  c, pal);\r
+#else\r
+       getputvram_8_vec(addr, disp, pal);\r
+       addr += 80;\r
+       disp += pitch;\r
+\r
+       getputvram_8_vec(addr, disp, pal);\r
+       addr += 80;\r
+       disp += pitch;\r
+       \r
+       getputvram_8_vec(addr, disp, pal);\r
+       addr += 80;\r
+       disp += pitch;\r
+       \r
+       getputvram_8_vec(addr, disp, pal);\r
+       addr += 80;\r
+       disp += pitch;\r
+       \r
+       getputvram_8_vec(addr, disp, pal);\r
+       addr += 80;\r
+       disp += pitch;\r
+       \r
+       getputvram_8_vec(addr, disp, pal);\r
+       addr += 80;\r
+       disp += pitch;\r
+       \r
+       getputvram_8_vec(addr, disp, pal);\r
+       addr += 80;\r
+       disp += pitch;\r
+       \r
+       getputvram_8_vec(addr, disp, pal);\r
+//     addr += 80;\r
+//     disp += pitch;\r
+#endif\r
+     }\r
+#else \r
+    Uint32 c[8];\r
+    Uint32 *pal = (Uint32 *)rgbTTLGDI;\r
+    v8hi_t *disp =(V8hi_t *) p;\r
+\r
+    if((p == NULL) || (pal == NULL)) return;\r
+    pitch = sizeof(Uint32) * 8;\r
+    addr = y * 80 + x;\r
+\r
+    // Loop廃止(高速化)\r
+   getvram_8(addr, c);\r
+   putword8((Uint32 *)disp, c, pal);\r
+   addr += 80;\r
+   disp += pitch;\r
+   \r
+   getvram_8(addr , c);\r
+   putword8((Uint32 *)disp,  c, pal);\r
+   addr += 80;\r
+   disp += pitch;\r
+   \r
+   getvram_8(addr, c);\r
+   putword8((Uint32 *)disp,  c, pal);\r
+   addr += 80;\r
+   disp += pitch;\r
+   \r
+   getvram_8(addr , c);\r
+   putword8((Uint32 *)disp,  c, pal);\r
+   addr += 80;\r
+   disp += pitch;\r
+   \r
+   getvram_8(addr, c);\r
+   putword8((Uint32 *)disp,  c, pal);\r
+   addr += 80;\r
+   disp += pitch;\r
+   \r
+   getvram_8(addr, c);\r
+   putword8((Uint32 *)disp,  c, pal);\r
+   addr += 80;\r
+   disp += pitch;\r
+   \r
+   getvram_8(addr, c);\r
+   putword8((Uint32 *)disp,  c, pal);\r
+   addr += 80;\r
+   disp += pitch;\r
+   \r
+   getvram_8(addr, c);\r
+   putword8((Uint32 *)disp,  c, pal);\r
+   //    addr += 80;\r
+   //    disp++;\r
+     \r
+#endif   \r
+}\r
+\r
+/*\r
+ * ybegin - yendの行を変換する\r
+ */\r
+void CreateVirtualVram8_Line_SSE2(Uint32 *p, int ybegin, int yend, int mode)\r
+{\r
+#if (__GNUC__ >= 4)   \r
+    register v8hi_t c;\r
+    Uint32 *pal = (Uint32 *)rgbTTLGDI;\r
+    register v8hi_t *disp =(v8hi_t *) p;\r
+    register Uint32 addr;\r
+    const int pitch = sizeof(Uint32) * 8;\r
+    int xx;\r
+    int yy;\r
+    register int i;\r
+   \r
+    if((p == NULL) || (pal == NULL)) return;\r
+\r
+//    for(i = 0; i < 8; i++) __builtin_prefetch(&pal[i], 0, 0); // パレットテーブルをキャッシュに読み込ませておく\r
+    // Loop廃止(高速化)\r
+    if(aPlanes == NULL) {\r
+       c.v = (v8si){0,0,0,0,0,0,0,0};\r
+       for(yy = ybegin; yy < yend; yy++) { \r
+           addr = ybegin * 80;\r
+          for(xx = 0; xx < (80 / 8); xx ++) { \r
+             putword8_vec((Uint32 *)disp,  c, pal);\r
+             disp++;\r
+             putword8_vec((Uint32 *)disp,  c, pal);\r
+             disp++;\r
+             putword8_vec((Uint32 *)disp,  c, pal);\r
+             disp++;\r
+             putword8_vec((Uint32 *)disp,  c, pal);\r
+             disp++;\r
+             putword8_vec((Uint32 *)disp,  c, pal);\r
+             disp++;\r
+             putword8_vec((Uint32 *)disp,  c, pal);\r
+             disp++;\r
+             putword8_vec((Uint32 *)disp,  c, pal);\r
+             disp++;\r
+             putword8_vec((Uint32 *)disp,  c, pal);\r
+             disp++;\r
+          }\r
+       }\r
+       return;\r
+     } else {\r
+       for(yy = ybegin; yy < yend; yy++) { \r
+           addr = yy * 80;\r
+          for(xx = 0; xx < (80 / 8); xx++) { \r
+#if 1\r
+             getputvram_8_vec(addr, (Uint32 *)disp, pal);\r
+             addr++;\r
+             disp++;\r
+\r
+             getputvram_8_vec(addr, (Uint32 *)disp, pal);\r
+             addr++;\r
+             disp++;\r
+\r
+             getputvram_8_vec(addr, (Uint32 *)disp, pal);\r
+             addr++;\r
+             disp++;\r
+\r
+             getputvram_8_vec(addr, (Uint32 *)disp, pal);\r
+             addr++;\r
+             disp++;\r
+\r
+             getputvram_8_vec(addr, (Uint32 *)disp, pal);\r
+             addr++;\r
+             disp++;\r
+             \r
+             getputvram_8_vec(addr, (Uint32 *)disp, pal);\r
+             addr++;\r
+             disp++;\r
+             \r
+             getputvram_8_vec(addr, (Uint32 *)disp, pal);\r
+             addr++;\r
+             disp++;\r
+\r
+             getputvram_8_vec(addr, (Uint32 *)disp, pal);\r
+             addr++;\r
+             disp++;\r
+#else\r
+             c = getvram_8_vec(addr);\r
+             putword8_vec((Uint32 *)disp, c, pal);\r
+             addr++;\r
+             disp++;\r
+             \r
+             c = getvram_8_vec(addr);\r
+             putword8_vec((Uint32 *)disp, c, pal);\r
+             addr++;\r
+             disp++;\r
+\r
+             c = getvram_8_vec(addr);\r
+             putword8_vec((Uint32 *)disp, c, pal);\r
+             addr++;\r
+             disp++;\r
+             \r
+             c = getvram_8_vec(addr);\r
+             putword8_vec((Uint32 *)disp,  c, pal);\r
+             addr++;\r
+             disp++;\r
+             \r
+             c = getvram_8_vec(addr);\r
+             putword8_vec((Uint32 *)disp,  c, pal);\r
+             addr++;\r
+             disp++;\r
+             \r
+             c = getvram_8_vec(addr);\r
+             putword8_vec((Uint32 *)disp,  c, pal);\r
+             addr++;\r
+             disp++;\r
+             \r
+             c = getvram_8_vec(addr);\r
+             putword8_vec((Uint32 *)disp,  c, pal);\r
+             addr++;\r
+             disp++;\r
+             \r
+             c = getvram_8_vec(addr);\r
+             putword8_vec((Uint32 *)disp,  c, pal);\r
+             addr++;\r
+             disp++;\r
+#endif\r
+          }\r
+         \r
+       }\r
+       return;\r
+     }\r
+ #else \r
+    Uint32 c[8];\r
+    Uint32 *pal = (Uint32 *)rgbTTLGDI;\r
+    Uint8 *disp =(Uint8 *) p;\r
+    int xx;\r
+    int yy;\r
+\r
+    if((p == NULL) || (pal == NULL)) return;\r
+    pitch = sizeof(Uint32) * 8;\r
+   for(yy = ybegin; yy < yend; yy++) {  \r
+      addr = y * 80;\r
+      for(xx = 0; xx < (80 / 8) ; xx++) {\r
+          \r
+        // Loop廃止(高速化)\r
+        getvram_8(addr, c);\r
+        putword8((Uint32 *)disp, c, pal);\r
+        addr++;\r
+        disp++;\r
+   \r
+        getvram_8(addr , c);\r
+        putword8((Uint32 *)disp,  c, pal);\r
+        addr++;\r
+        disp++;\r
+        \r
+        getvram_8(addr, c);\r
+        putword8((Uint32 *)disp,  c, pal);\r
+        addr += 1;\r
+        disp++;\r
+   \r
+        getvram_8(addr , c);\r
+        putword8((Uint32 *)disp,  c, pal);\r
+        addr += 1;\r
+        disp++;\r
+   \r
+        getvram_8(addr, c);\r
+        putword8((Uint32 *)disp,  c, pal);\r
+        addr += 1;\r
+        disp++;\r
+   \r
+        getvram_8(addr, c);\r
+        putword8((Uint32 *)disp,  c, pal);\r
+        addr += 1;\r
+        disp++;\r
+   \r
+        getvram_8(addr, c);\r
+        putword8((Uint32 *)disp,  c, pal);\r
+        addr += 1;\r
+        disp++;\r
+   \r
+        getvram_8(addr, c);\r
+        putword8((Uint32 *)disp,  c, pal);\r
+        addr += 1;\r
+        disp++;\r
+      }\r
+   }\r
+   \r
+     \r
+#endif   \r
+}\r
+/*\r
+ * ybegin - yendの行を変換する\r
+ */\r
+void CreateVirtualVram8_WindowedLine_SSE2(Uint32 *p, int ybegin, int yend, int xbegin, int xend, int mode)\r
+{\r
+#if (__GNUC__ >= 4)   \r
+    register v8hi_t c;\r
+    Uint32 *pal = (Uint32 *)rgbTTLGDI;\r
+    Uint8 *disp =(Uint8 *) p;\r
+    Uint32 addr;\r
+    int pitch;\r
+    int xx;\r
+    int yy;\r
+   \r
+    if((p == NULL) || (pal == NULL)) return;\r
+    pitch = sizeof(Uint32) * 8;\r
+\r
+    // Loop廃止(高速化)\r
+    if(aPlanes == NULL) {\r
+       c.vv = (v8ii){0,0,0,0,0,0,0,0};\r
+       for(yy = ybegin; yy < yend; yy++) { \r
+           addr = yy * 80 + xbegin;\r
+          disp = (Uint8 *)(&p[yy * 640 + xbegin]);\r
+          for(xx = xbegin; xx < xend; xx ++) { \r
+             putword8_vec((Uint32 *)disp,  c, pal);\r
+             disp++;\r
+          }\r
+       }\r
+       return;\r
+     } else {\r
+       int xs =  (xend - xbegin) / 8;\r
+       int xs2 = (xend - xbegin) % 8;\r
+       int xx2;\r
+       for(yy = ybegin; yy < yend; yy++) { \r
+           addr = yy * 80 + xbegin;\r
+          disp = (Uint8 *)(&p[yy * 640 + xbegin]);\r
+          xx = xbegin;\r
+          for(xx2 = 0; xx2 < xs; xx2++) {\r
+             c = getvram_8_vec(addr);\r
+             putword8_vec((Uint32 *)disp, c, pal);\r
+             addr++;\r
+             disp++;\r
+             \r
+             c = getvram_8_vec(addr);\r
+             putword8_vec((Uint32 *)disp, c, pal);\r
+             addr++;\r
+             disp++;\r
+             \r
+             c = getvram_8_vec(addr);\r
+             putword8_vec((Uint32 *)disp, c, pal);\r
+             addr++;\r
+             disp++;\r
+             \r
+             c = getvram_8_vec(addr);\r
+             putword8_vec((Uint32 *)disp, c, pal);\r
+             addr++;\r
+             disp++;\r
+             \r
+             c = getvram_8_vec(addr);\r
+             putword8_vec((Uint32 *)disp, c, pal);\r
+             addr++;\r
+             disp++;\r
+             \r
+             c = getvram_8_vec(addr);\r
+             putword8_vec((Uint32 *)disp, c, pal);\r
+             addr++;\r
+             disp++;\r
+             \r
+             c = getvram_8_vec(addr);\r
+             putword8_vec((Uint32 *)disp, c, pal);\r
+             addr++;\r
+             disp++;\r
+             \r
+             c = getvram_8_vec(addr);\r
+             putword8_vec((Uint32 *)disp, c, pal);\r
+             addr++;\r
+             disp++;\r
+             xx += 8;\r
+          }\r
+          if(xs2 <= 0) continue;\r
+          for(; xx < xend; xx++) { \r
+             c = getvram_8_vec(addr);\r
+             putword8_vec((Uint32 *)disp, c, pal);\r
+             addr++;\r
+             disp++;\r
+          }\r
+       }\r
+       return;\r
+     }\r
+ #else \r
+    Uint32 c[8];\r
+    Uint32 *pal = (Uint32 *)rgbTTLGDI;\r
+    Uint8 *disp =(Uint8 *) p;\r
+    int xx;\r
+    int yy;\r
+\r
+    if((p == NULL) || (pal == NULL)) return;\r
+    pitch = sizeof(Uint32) * 8;\r
+    for(yy = ybegin; yy < yend; yy++) {  \r
+      addr = y * 80 + xbegin;\r
+      disp = (Uint8 *)(&p[yy * 640 + xbegin]);\r
+      for(xx = xbegin; xx < xend; xx++) {\r
+        getvram_8(addr, c);\r
+        putword8((Uint32 *)disp, c, pal);\r
+        addr++;\r
+        disp++;\r
+      }\r
+   }\r
+#endif   \r
+}\r
+\r
+/*\r
+ * Function table for the SSE2 8-colour converters, filled positionally with\r
+ * the 1Pcs, Line and WindowedLine entry points (in that order).\r
+ * NOTE(review): the third slot is the extern CreateVirtualVram8_WindowedLine\r
+ * declared at the top of this file, not CreateVirtualVram8_WindowedLine_SSE2\r
+ * defined above - presumably deliberate; confirm.\r
+ */\r
+Api_Vram_FuncList api_vram8_sse2 = {\r
+   CreateVirtualVram8_1Pcs_SSE2,\r
+   CreateVirtualVram8_Line_SSE2,\r
+   CreateVirtualVram8_WindowedLine\r
+};\r
diff --git a/source/src/agar/fm7/vram/sse2/api_vramvec.c b/source/src/agar/fm7/vram/sse2/api_vramvec.c
new file mode 100644 (file)
index 0000000..8bc0985
--- /dev/null
@@ -0,0 +1,57 @@
+/*\r
+ * api_vramvec.c\r
+ * Convert VRAM -> VirtualVram(Vector Version)\r
+ * (C) 2012 K.Ohta <whatisthis.sowhat@gmail.com>\r
+ */\r
+\r
+\r
+#include "xm7_types.h"\r
+#include "api_draw.h"\r
+#include "api_vram.h"\r
+\r
+/*\r
+* Definition of Conversion Tables.\r
+*/\r
+// Reduce Tables 20120131\r
+\r
+extern v8si *aPlanes;\r
+\r
+\r
+v8hi_t lshift_6bit8v_SSE2(v8hi_t v)\r
+{\r
+   v8hi_t r;\r
+   register v8hi_t cbuf;\r
+   register v8hi_t mask;\r
+   v8hi_t ret;\r
+   mask.v = (v8si){0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8};\r
\r
+   cbuf.v =\r
+        aPlanes[B2 + v.b[0]] |\r
+        aPlanes[B3 + v.b[1]] |\r
+        aPlanes[R0 + v.b[2]] |\r
+        aPlanes[R1 + v.b[3]] |\r
+        aPlanes[R2 + v.b[4]] |\r
+        aPlanes[R3 + v.b[5]];\r
+   \r
+   mask.v = mask.v & cbuf.v;\r
+#if ((__GNUC__ == 4) && (__GCC_MINOR__ >= 7)) || (__GNUC__ > 4) //GCC 4.7 or later.\r
+   r.v = mask.v != (v8si){0, 0, 0, 0, 0, 0, 0, 0};\r
+   r.v = r.v & (v8si) {0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03};\r
+   cbuf.v = cbuf.v |  r.v;\r
+#else\r
+   if(mask.s[0] != 0) cbuf.s[0] |= 0x03;\r
+   if(mask.s[1] != 0) cbuf.s[1] |= 0x03;\r
+   if(mask.s[2] != 0) cbuf.s[2] |= 0x03;\r
+   if(mask.s[3] != 0) cbuf.s[3] |= 0x03;\r
+   if(mask.s[4] != 0) cbuf.s[4] |= 0x03;\r
+   if(mask.s[5] != 0) cbuf.s[5] |= 0x03;\r
+   if(mask.s[6] != 0) cbuf.s[6] |= 0x03;\r
+   if(mask.s[7] != 0) cbuf.s[7] |= 0x03;\r
+#endif \r
+//   ret = cbuf;\r
+   return cbuf;\r
+}\r
+\r
+\r
+\r
+\r
diff --git a/source/src/agar/tools/cl2cpp.awk b/source/src/agar/tools/cl2cpp.awk
new file mode 100644 (file)
index 0000000..01a7e10
--- /dev/null
@@ -0,0 +1,12 @@
+# cl2cpp.awk - wrap a text file in a C/C++ string definition of the form
+#     const char* VARNAME = "...";
+# VARNAME must be supplied by the caller (e.g. awk -v VARNAME=name -f cl2cpp.awk).
+# Each input line is emitted followed by a literal \n escape sequence.
+
+# Return s with every backslash and double quote preceded by a backslash so
+# the text survives unchanged inside a C string literal.  A character loop is
+# used because backslash handling in gsub() replacements differs between awks.
+function c_escape(s,    out, i, n, ch) {
+    out = "";
+    n = length(s);
+    for (i = 1; i <= n; i++) {
+        ch = substr(s, i, 1);
+        if (ch == "\\" || ch == "\"")
+            out = out "\\";
+        out = out ch;
+    }
+    return out;
+}
+
+BEGIN {
+    printf "const char* %s = \"", VARNAME;
+}
+
+# Applies to every input line.
+{
+    printf "%s\\n", c_escape($0);
+}
+
+END {
+    printf "\";\n";
+}
\ No newline at end of file
index 83e3e95..7f0563d 100644 (file)
 #ifndef _COMMON_H_\r
 #define _COMMON_H_\r
 \r
+#if defined(_USE_AGAR) || defined(_USE_SDL)\r
+#include <SDL.h>\r
+\r
+// Short integer aliases (uint8 .. int64) and Win32-style scalar names\r
+// (BOOL, BYTE, WORD, DWORD, QWORD) for the SDL/Agar build, all mapped onto\r
+// the <stdint.h> fixed-width types except BOOL, which is a plain int.\r
+// NOTE(review): #ifndef only tests for macros, so these guards do not detect\r
+// an earlier typedef of the same name - confirm no other header declares them.\r
+# ifndef uint8\r
+   typedef uint8_t uint8;\r
+# endif\r
+# ifndef int8\r
+   typedef int8_t int8;\r
+# endif\r
+# ifndef uint16\r
+   typedef uint16_t uint16;\r
+# endif\r
+# ifndef int16\r
+   typedef int16_t int16;\r
+# endif\r
+# ifndef uint32\r
+   typedef uint32_t uint32;\r
+# endif\r
+# ifndef int32\r
+   typedef int32_t int32;\r
+# endif\r
+# ifndef uint64\r
+   typedef uint64_t uint64;\r
+# endif\r
+# ifndef int64\r
+   typedef int64_t int64;\r
+# endif\r
+# ifndef BOOL\r
+   typedef int BOOL;\r
+# endif\r
+# ifndef BYTE\r
+   typedef uint8_t BYTE;\r
+# endif\r
+# ifndef WORD\r
+   typedef uint16_t WORD;\r
+# endif\r
+# ifndef DWORD\r
+   typedef uint32_t DWORD;\r
+# endif\r
+# ifndef QWORD\r
+   typedef uint64_t QWORD;\r
+# endif\r
+\r
+\r
+\r
+// Stand-ins for the <tchar.h> text macros and character type.\r
+// __T(x) pastes an L prefix onto the literal when _UNICODE is defined and\r
+// leaves it untouched otherwise; _T() and _TEXT() go through __T() so that\r
+// a macro argument is expanded before the prefix is pasted.\r
+#  ifdef  _UNICODE\r
+#    define __T(x)      L ## x\r
+#  else\r
+#    define __T(x)      x\r
+#  endif\r
\r
+#  define _T(x)       __T(x)\r
+#  define _TEXT(x)    __T(x)\r
+\r
+// _TCHAR is wchar_t in _UNICODE builds and char otherwise.\r
+#  ifdef _UNICODE\r
+    typedef wchar_t _TCHAR;\r
+#  else\r
+    typedef char    _TCHAR;\r
+#  endif\r
+\r
+#  ifndef LPCTSTR\r
+    // Pointer to a constant _TCHAR string, matching the Win32 meaning of\r
+    // LPCTSTR (it is a string pointer type, not a single character).\r
+    typedef const _TCHAR *LPCTSTR;\r
+#  endif\r
+\r
+// _N(x): expands to gettext(x) when _USE_GETTEXT is defined (pulling in\r
+// <libintl.h>), and to _T(x) otherwise.\r
+#  ifdef _USE_GETTEXT\r
+#  include <libintl.h>\r
+#  define _N(x) gettext(x)\r
+# else\r
+#  define _N(x) _T(x)\r
+# endif\r
+\r
+#if (SDL_BYTEORDER == SDL_LIL_ENDIAN)\r
+static inline DWORD EndianToLittle_DWORD(DWORD x)\r
+{\r
+   return x;\r
+}\r
+\r
+static inline WORD EndianToLittle_WORD(WORD x)\r
+{\r
+   return x;\r
+}\r
+#else // BIG_ENDIAN\r
+static inline DWORD EndianToLittle_DWORD(DWORD x)\r
+{\r
+   DWORD y;\r
+   y = ((x & 0x000000ff) << 24) | ((x & 0x0000ff00) << 8) |\r
+       ((x & 0x00ff0000) >> 8)  | ((x & 0xff000000) >> 24);\r
+   return y;\r
+}\r
+\r
+static inline WORD EndianToLittle_WORD(WORD x)\r
+{\r
+   WORD y;\r
+   y = ((x & 0x00ff) << 8) | ((x & 0xff00) >> 8);\r
+   return y;\r
+}\r
+#endif\r
+// Win32 memory helper names mapped onto the C library:\r
+//   ZeroMemory(p,s)   fills s bytes at p with zero.\r
+//   CopyMemory(t,f,s) copies s bytes from f to t (memcpy, so the two\r
+//                     ranges must not overlap).\r
+#define ZeroMemory(p,s) memset(p,0x00,s)\r
+#define CopyMemory(t,f,s) memcpy(t,f,s)\r
+\r
+// Win32-named timing functions with C linkage; only declared here.\r
+// NOTE(review): presumably implemented elsewhere for the SDL/Agar build -\r
+// confirm the unit of `tick` and of the returned time against that code.\r
+extern "C" \r
+{\r
+extern void Sleep(int tick);\r
+extern uint32_t timeGetTime(void);\r
+}\r
+\r
+\r
+#else\r
 #include <tchar.h>\r
 \r
 // variable scope of 'for' loop for microsoft visual c++ 6.0 and embedded visual c++ 4.0\r
 #pragma warning( disable : 4995 )\r
 #pragma warning( disable : 4996 )\r
 #endif\r
+#endif\r
 \r
 // type definition\r
 #ifndef uint8\r
@@ -57,8 +167,21 @@ typedef signed __int64 int64;
 #else\r
 typedef signed long long int64;\r
 #endif\r
+\r
+static inline DWORD EndianToLittle_DWORD(DWORD x)\r
+{\r
+   return x;\r
+}\r
+\r
+static inline WORD EndianToLittle_WORD(WORD x)\r
+{\r
+   return x;\r
+}\r
+\r
+\r
 #endif\r
 \r
+\r
 typedef union {\r
 #ifdef _BIG_ENDIAN\r
        struct {\r
index 585cdc4..fd32b85 100644 (file)
@@ -6,15 +6,19 @@
 \r
        [ config ]\r
 */\r
-\r
+#if defined(_USE_AGAR) || defined(_USE_SDL)\r
+#include <SDL.h>\r
+#include <agar/core.h>\r
+#else\r
 #include <windows.h>\r
+#endif\r
+\r
 #include <stdlib.h>\r
 #include <stdio.h>\r
 #include "config.h"\r
 #include "fileio.h"\r
 \r
 config_t config;\r
-\r
 BOOL WritePrivateProfileInt(LPCTSTR lpAppName, LPCTSTR lpKeyName, int Value, LPCTSTR lpFileName)\r
 {\r
        _TCHAR String[32];\r
@@ -34,6 +38,7 @@ bool GetPrivateProfileBool(LPCTSTR lpAppName, LPCTSTR lpKeyName, bool bDefault,
        return (GetPrivateProfileInt(lpAppName, lpKeyName, bDefault ? 1 : 0, lpFileName) != 0);\r
 }\r
 \r
+\r
 void init_config()\r
 {\r
        // initial settings\r
index 417dfc3..af0cc65 100644 (file)
@@ -22,6 +22,7 @@
        // output i/o debug log\r
 //     #define _IO_DEBUG_LOG\r
 #endif\r
+\r
 #if defined(_USE_AGAR) || defined(_USE_SDL)\r
 # include <SDL.h>\r
 # include <agar/core.h>\r
 \r
 // Wrapper of WIN32->*nix\r
 \r
-// tchar.h\r
-#  ifdef  _UNICODE\r
-#    define __T(x)      L ## x\r
-#  else\r
-#    define __T(x)      x\r
-#  endif\r
\r
-#  define _T(x)       __T(x)\r
-#  define _TEXT(x)    __T(x)\r
-\r
-#  ifdef _UNICODE\r
-    typedef wchar_t _TCHAR;\r
-#  else\r
-    typedef char    _TCHAR;\r
-#  endif\r
-  typedef int bool;\r
-  typedef bool BOOL;\r
-\r
-# ifdef _USE_GETTEXT\r
-#  include <libintl.h>\r
-#  define _N(x) gettext(x)\r
-# else\r
-#  define _N(x) _T(x)\r
-# endif\r
+\r
 \r
 #else // _USE_WIN32\r
 #include <windows.h>\r
 #include <windowsx.h>\r
 #include <mmsystem.h>\r
 #include <process.h>\r
+\r
 #endif // _USE_WIN32\r
 \r
 #include <stdio.h>\r
diff --git a/source/src/vm/CMakeLists.txt b/source/src/vm/CMakeLists.txt
new file mode 100644 (file)
index 0000000..dfcccc6
--- /dev/null
@@ -0,0 +1,68 @@
+# Builds the vm_vm library from the sources listed below.
+cmake_minimum_required (VERSION 2.6)
+
+message("* vm")
+
+# Source list, kept in the original order.
+set(VM_VM_SOURCES
+  315-5124.cpp
+  and.cpp
+  beep.cpp
+  datarec.cpp
+  disk.cpp
+  event.cpp
+  hd146818p.cpp
+  hd46505.cpp
+  hd63484.cpp
+  huc6280.cpp
+  i286.cpp
+  i386.cpp
+  i8080.cpp
+  i8155.cpp
+  i8237.cpp
+  i8251.cpp
+  i8253.cpp
+  i8255.cpp
+  i8259.cpp
+  i86.cpp
+  io.cpp
+  ld700.cpp
+  ls244.cpp
+  ls393.cpp
+  m6502.cpp
+  mb8877.cpp
+  mc6800.cpp
+  mc6809.cpp
+  mc6820.cpp
+  mc6840.cpp
+  mc6847.cpp
+  mcs48.cpp
+  memory.cpp
+  msm58321.cpp
+  nand.cpp
+  nor.cpp
+  not.cpp
+  or.cpp
+  pc6031.cpp
+  pc80s31k.cpp
+  pcm1bit.cpp
+  rp5c01.cpp
+  sn76489an.cpp
+  tf20.cpp
+  tms9918a.cpp
+  tms9995.cpp
+  upd1990a.cpp
+  upd4991a.cpp
+  upd71071.cpp
+  upd7220.cpp
+  upd765a.cpp
+  upd7752.cpp
+  upd7801.cpp
+  w3100a.cpp
+  ym2151.cpp
+  ym2203.cpp
+  ym2413.cpp
+  z80.cpp
+  z80ctc.cpp
+  z80dma.cpp
+  z80pio.cpp
+  z80sio.cpp
+)
+
+add_library(vm_vm ${VM_VM_SOURCES})
\ No newline at end of file
diff --git a/source/src/vm/fmgen/CMakeLists.txt b/source/src/vm/fmgen/CMakeLists.txt
new file mode 100644 (file)
index 0000000..92c2aff
--- /dev/null
@@ -0,0 +1,12 @@
+# Builds the vm_fmgen library from the sources listed below.
+cmake_minimum_required (VERSION 2.6)
+
+message("* vm/fmgen")
+
+# Source list, kept in the original order.
+set(VM_FMGEN_SOURCES
+  file.cpp
+  fmgen.cpp
+  fmtimer.cpp
+  opm.cpp
+  opna.cpp
+  psg.cpp
+)
+
+add_library(vm_fmgen ${VM_FMGEN_SOURCES})
\ No newline at end of file
diff --git a/source/src/vm/pc8801/CMakeLists.txt b/source/src/vm/pc8801/CMakeLists.txt
new file mode 100644 (file)
index 0000000..ec9bd23
--- /dev/null
@@ -0,0 +1,8 @@
+# Builds the vm_pc8801 library from the PC-8801 sources in this directory.
+cmake_minimum_required (VERSION 2.6)
+
+# Progress message names this directory (it previously printed "* vm/x1").
+message("* vm/pc8801")
+
+add_library(vm_pc8801
+       pc88.cpp
+       pc8801.cpp
+)
\ No newline at end of file
diff --git a/source/src/vm/x1/CMakeLists.txt b/source/src/vm/x1/CMakeLists.txt
new file mode 100644 (file)
index 0000000..5333dfb
--- /dev/null
@@ -0,0 +1,18 @@
+# Builds the vm_x1 library from the sources listed below.
+cmake_minimum_required (VERSION 2.6)
+
+message("* vm/x1")
+
+# Source list, kept in the original order.
+set(VM_X1_SOURCES
+  wdisplay.cpp
+  emm.cpp
+  floppy.cpp
+  io.cpp
+  joystick.cpp
+  keyboard.cpp
+  memory.cpp
+  mouse.cpp
+  printer.cpp
+  psub.cpp
+  sub.cpp
+  x1.cpp
+)
+
+add_library(vm_x1 ${VM_X1_SOURCES})
\ No newline at end of file