PATCH: [ 826250 ] New UniMemFile class for memory-mapping

author Perry Rapp <elsapo@users.sourceforge.net>

Thu, 23 Oct 2003 00:04:34 +0000 (00:04 +0000)

committer Perry Rapp <elsapo@users.sourceforge.net>

Thu, 23 Oct 2003 00:04:34 +0000 (00:04 +0000)
author Perry Rapp <elsapo@users.sourceforge.net>
Thu, 23 Oct 2003 00:04:34 +0000 (00:04 +0000)
committer Perry Rapp <elsapo@users.sourceforge.net>
Thu, 23 Oct 2003 00:04:34 +0000 (00:04 +0000)
diff --git a/Src/Common/UniFile.cpp b/Src/Common/UniFile.cpp

new file mode 100644 (file)

index 0000000..22cbe9c
--- /dev/null
+++ b/Src/Common/UniFile.cpp
@@ -0,0 +1,481 @@
+/** 
+ *  @file   UniFile.cpp
+ *  @author Perry Rapp, Creator, 2003
+ *  @date   Created: 2003-10
+ *  @date   Edited:  2003-10-21 (Perry)
+ *
+ *  @brief Implementation of Memory-Mapped Unicode enabled file class
+ */
+
+/* The MIT License
+Copyright (c) 2003 Perry Rapp
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+#include "stdafx.h"
+#include "UniFile.h"
+#include "unicoder.h"
+
+#ifdef _DEBUG
+#define new DEBUG_NEW
+#undef THIS_FILE
+static char THIS_FILE[] = __FILE__;
+#endif
+
+
+UniMemFile::UniMemFile(LPCTSTR filename) // only records the name; no file is opened here
+: m_statusFetched(0) // 0 = status not fetched yet
+, m_filesize(0)
+, m_handle(INVALID_HANDLE_VALUE) // no file handle until DoOpen creates one
+, m_hMapping(INVALID_HANDLE_VALUE)
+, m_base(NULL)
+, m_data(NULL)
+, m_current(NULL)
+, m_lineno(-1) // -1 while nothing is mapped; DoOpen sets it to 0
+, m_readbom(false)
+, m_unicoding(ucr::NONE)
+, m_charsize(0) // assigned by ReadBom
+, m_codepage(0) // 0 until SetCodepage is called
+{
+       m_filename = filename;
+       m_filepath = m_filename; // GetFileStatus later replaces this with the full path
+}
+
+void UniMemFile::Close() // Releases view, mapping and file handle and resets the read state; also clears the last error
+{
+       m_lastError.ClearError();
+       // Tear down in reverse order of creation: view first, then mapping, then file handle
+       if (m_base)
+       {
+               UnmapViewOfFile(m_base);
+               m_base = 0;
+       }
+       m_data = NULL;
+       m_current = NULL;
+       m_lineno = -1;
+       if (m_hMapping != INVALID_HANDLE_VALUE)
+       {
+               if (m_hMapping) CloseHandle(m_hMapping); // NULL is what a failed CreateFileMapping leaves behind; never pass it to CloseHandle
+               m_hMapping = INVALID_HANDLE_VALUE;
+       }
+       if (m_handle != INVALID_HANDLE_VALUE)
+       {
+               FlushFileBuffers(m_handle);
+               CloseHandle(m_handle);
+               m_handle = INVALID_HANDLE_VALUE;
+       }
+       m_statusFetched = 0;
+       m_readbom = false;
+}
+
+/** @brief Get file status into member variables (m_filestatus, m_filepath, m_filesize); returns false and records the error on failure */
+bool UniMemFile::GetFileStatus()
+{
+       m_lastError.ClearError();
+       m_statusFetched = -1; // pessimistic: stays -1 unless every step below succeeds
+
+       if (!CFile::GetStatus(m_filepath, m_filestatus))
+       {
+               LastError(_T("CFile::GetStatus"), 0);
+               return false;
+       }
+       m_filepath = m_filestatus.m_szFullName;
+
+       if (m_handle == INVALID_HANDLE_VALUE)
+       {
+               LastErrorCustom(_T("UniMemFile::GetFileStatus called without open file handle"));
+               return false;
+       }
+       DWORD sizehi=0;
+       DWORD sizelo = GetFileSize(m_handle, &sizehi);
+       int errnum = (sizelo == 0xFFFFFFFF) ? GetLastError() : NO_ERROR; // 0xFFFFFFFF is INVALID_FILE_SIZE; otherwise the last-error value may be stale
+       if (errnum != NO_ERROR)
+       {
+               LastError(_T("GetFileSize"), errnum);
+               return false;
+       }
+       m_filesize = sizelo + ((__int64)sizehi << 32); // widen before shifting: shifting a 32-bit DWORD by 32 is undefined
+       m_statusFetched = 1;
+       return true;
+}
+
+/** @brief Open the file read-only (shared for reading) and map it for reading */
+bool UniMemFile::OpenReadOnly()
+{
+       return Open(
+               GENERIC_READ,     // dwOpenAccess
+               FILE_SHARE_READ,  // dwOpenShareMode
+               OPEN_EXISTING,    // dwOpenCreationDispostion
+               PAGE_READONLY,    // dwMappingProtect
+               FILE_MAP_READ);   // dwMapViewAccess
+}
+
+/** @brief Open file for generic read-write access (exclusive: share mode 0) */
+bool UniMemFile::Open()
+{
+       DWORD dwOpenAccess = GENERIC_READ | GENERIC_WRITE; // a PAGE_READWRITE mapping requires read as well as write access on the file handle
+       DWORD dwOpenShareMode = 0;
+       DWORD dwOpenCreationDispostion = OPEN_EXISTING;
+       DWORD dwMappingProtect = PAGE_READWRITE;
+       DWORD dwMapViewAccess = FILE_MAP_WRITE;
+       return Open(dwOpenAccess, dwOpenShareMode, dwOpenCreationDispostion, dwMappingProtect, dwMapViewAccess);
+}
+
+/** @brief Open and map the file with caller-supplied CreateFile / mapping / view flags */
+bool UniMemFile::Open(DWORD dwOpenAccess, DWORD dwOpenShareMode, DWORD dwOpenCreationDispostion, DWORD dwMappingProtect, DWORD dwMapViewAccess)
+{
+       // DoOpen does the real work; funnelling through it gives one place to release everything on failure
+       const bool opened = DoOpen(dwOpenAccess, dwOpenShareMode, dwOpenCreationDispostion, dwMappingProtect, dwMapViewAccess);
+       if (!opened)
+       {
+               Close();
+       }
+       return opened;
+}
+
+/** @brief Internal implementation of Open: creates the file handle, the file mapping and a view of the file; returns false with the error recorded at the first failing step and leaves cleanup to the caller */
+bool UniMemFile::DoOpen(DWORD dwOpenAccess, DWORD dwOpenShareMode, DWORD dwOpenCreationDispostion, DWORD dwMappingProtect, DWORD dwMapViewAccess)
+{
+       if (m_lastError.hasError()) return false; // refuse to proceed while an earlier error is still recorded
+
+       Close(); // drop anything left over from a previous open
+
+       m_handle = CreateFile(m_filename, dwOpenAccess, dwOpenShareMode, NULL, dwOpenCreationDispostion, 0, 0);
+       if (m_handle == INVALID_HANDLE_VALUE)
+       {
+               LastError(_T("CreateFile"), GetLastError());
+               return false;
+       }
+       if (!GetFileStatus()) // fills m_filesize, which is needed below
+               return false;
+
+       DWORD sizehi = (DWORD)(m_filesize >> 32);
+       DWORD sizelo = (DWORD)(m_filesize & 0xFFFFFFFF);
+
+       if (sizehi)
+       {
+               LastErrorCustom(_T("UniMemFile cannot handle files over 4 gigabytes"));
+               return false;
+       }
+
+       LPSECURITY_ATTRIBUTES lpAttributes = NULL; // default security
+       LPCTSTR lpName = NULL; // nameless mapping
+       m_hMapping = CreateFileMapping(m_handle, lpAttributes, dwMappingProtect, sizehi, sizelo, lpName);
+       if (!m_hMapping) // failure is tested as NULL here, not INVALID_HANDLE_VALUE
+       {
+               LastError(_T("CreateFileMapping"), GetLastError());
+               return false;
+       }
+
+       // Map a single view starting at offset 0 and covering sizelo bytes
+       DWORD dwFileOffsetHigh = 0;
+       DWORD dwFileOffsetLow = 0;
+       SIZE_T dwNumberOfBytesToMap = sizelo;
+       m_base = (LPBYTE)MapViewOfFile(m_hMapping, dwMapViewAccess, dwFileOffsetHigh, dwFileOffsetLow, dwNumberOfBytesToMap);
+       if (!m_base)
+       {
+               LastError(_T("MapViewOfFile"), GetLastError());
+               return false;
+       }
+       m_data = m_base; // ReadBom moves m_data past a BOM if one is present
+       m_current = m_base;
+       m_lineno = 0;
+
+       return true;
+}
+
+/** @brief Record an API call failure: @a apiname is the name of the failing call, @a syserrnum its system error code */
+void UniMemFile::LastError(LPCTSTR apiname, int syserrnum)
+{
+       m_lastError.ClearError();
+       // Any previously recorded error (including a custom description) has been wiped above
+       m_lastError.apiname = apiname;
+       m_lastError.syserrnum = syserrnum;
+}
+
+/** @brief Record a custom error: @a desc is a free-text description used when no API call is to blame */
+void UniMemFile::LastErrorCustom(LPCTSTR desc)
+{
+       m_lastError.ClearError();
+       // apiname stays empty after the clear, so only desc carries the error
+       m_lastError.desc = desc;
+}
+
+/**
+ * @brief Check for Unicode BOM (byte order mark) at start of file
+ * Sets m_charsize and positions m_data/m_current just past any BOM; m_unicoding is only assigned when a BOM is found. @return true if a BOM was found.
+ * @note This code only checks for UCS-2LE, UCS-2BE, and UTF-8 BOMs (no UCS-4).
+ */
+bool UniMemFile::ReadBom()
+{
+       byte * lpByte = (byte *)m_base;
+       m_current = m_data = m_base;
+       m_charsize = 1; // default for everything except UCS-2
+       if (m_filesize >= 2) // a UCS-2 BOM is two bytes long
+       {
+               if (lpByte[0] == 0xFF && lpByte[1] == 0xFE)
+               {
+                       m_unicoding = ucr::UCS2LE;
+                       m_charsize = 2;
+                       m_data = lpByte+2;
+               }
+               else if (lpByte[0] == 0xFE && lpByte[1] == 0xFF)
+               {
+                       m_unicoding = ucr::UCS2BE;
+                       m_charsize = 2;
+                       m_data = lpByte+2;
+               }
+       }
+       if (m_filesize >=3) // the UTF-8 BOM is three bytes long
+       {
+               if (lpByte[0] == 0xEF && lpByte[1] == 0xBB && lpByte[2] == 0xBF)
+               {
+                       m_unicoding = ucr::UTF8;
+                       m_data = lpByte+3;
+               }
+       }
+       m_readbom = true;
+       m_current = m_data; // reading starts after the BOM
+       return (m_data != m_base); // m_data only moved if a BOM was recognised
+}
+
+/**
+ * @brief Read one line (DOS, UNIX or Mac terminated); the eol chars are read but not returned.
+ */
+BOOL UniMemFile::ReadString(CString & line)
+{
+       // Delegate to the two-argument overload and throw the terminator away
+       CString discardedEol;
+       return ReadString(line, discardedEol);
+}
+
+/**
+ * @brief Read one (DOS or UNIX or Mac) line into @a line and its terminator ("\r", "\n", "\r\n", or empty at end of file or at a zero char) into @a eol. Returns FALSE only when no characters are left. ReadBom() is expected to have been called first, since it sets m_charsize.
+ */
+BOOL UniMemFile::ReadString(CString & line, CString & eol)
+{
+       line = _T("");
+       eol = _T("");
+       // shortcut methods in case file is in the same encoding as our CStrings
+#ifdef _UNICODE
+       if (m_unicoding == ucr::UCS2LE)
+       {
+               // If there aren't any wchars left in the file, return FALSE to indicate EOF
+               if (m_current - m_base + 1 >= m_filesize)
+                       return FALSE;
+               // Loop through wchars, watching for eol chars or zero
+               while (m_current - m_base + 1 < m_filesize)
+               {
+                       wchar_t wch = *(wchar_t *)m_current;
+                       m_current += 2;
+                       if (wch == '\n' || wch == '\r')
+                       {
+                               eol += wch;
+                               if (wch == '\r')
+                               {
+                                       if (m_current - m_base + 1 < m_filesize && *(wchar_t *)m_current == '\n')
+                                       {
+                                               eol += '\n';
+                                               m_current += 2;
+                                               ++m_txtstats.ncrlfs;
+                                       }
+                                       else
+                                       {
+                                               ++m_txtstats.ncrs;
+                                       }
+                               }
+                               else
+                               {
+                                       ++m_txtstats.nlfs;
+                               }
+                               ++m_lineno;
+                               return TRUE;
+                       }
+                       if (!wch)
+                       {
+                               ++m_txtstats.nzeros;
+                               return TRUE;
+                       }
+                       line += wch;
+               }
+               return TRUE;
+       }
+#else
+       if (m_unicoding == ucr::NONE && !m_codepage)
+       {
+               // If there aren't any bytes left in the file, return FALSE to indicate EOF
+               if (m_current - m_base >= m_filesize)
+                       return FALSE;
+               // Loop through chars, watching for eol chars or zero
+               while (m_current - m_base < m_filesize)
+               {
+                       char ch = *m_current;
+                       ++m_current;
+                       if (ch == '\n' || ch == '\r')
+                       {
+                               eol += ch;
+                               if (ch == '\r')
+                               {
+                                       if (m_current - m_base < m_filesize && *m_current == '\n')
+                                       {
+                                               eol += '\n';
+                                               ++m_current;
+                                               ++m_txtstats.ncrlfs;
+                                       }
+                                       else
+                                       {
+                                               ++m_txtstats.ncrs;
+                                       }
+                               }
+                               else
+                               {
+                                       ++m_txtstats.nlfs;
+                               }
+                               ++m_lineno;
+                               return TRUE;
+                       }
+                       if (!ch)
+                       {
+                               ++m_txtstats.nzeros;
+                               return TRUE;
+                       }
+                       line += ch;
+               }
+               return TRUE;
+       }
+#endif
+
+       if (m_current - m_base + (m_charsize-1) >= m_filesize)
+               return FALSE;
+
+       // Handle 8-bit strings in line chunks because of multibyte codings (eg, 936)
+       if (m_unicoding == ucr::NONE && m_codepage)
+       {
+               LPBYTE eolptr = m_current; // declared before the for statement because it is used after the loop
+               for ( ; (eolptr - m_base + (m_charsize-1) < m_filesize); ++eolptr)
+               {
+                       if (*eolptr == '\n' || *eolptr == '\r')
+                               break;
+               }
+               bool eof = !(eolptr - m_base + (m_charsize-1) < m_filesize); // true when the scan ran off the end without meeting an eol char
+               line = ucr::maketstring(m_current, eolptr-m_current, m_codepage);
+               if (!eof)
+               {
+                       eol += (TCHAR)*eolptr;
+                       if (*eolptr == '\r' && (eolptr + 1 - m_base < m_filesize) && eolptr[1] == '\n') // only peek at eolptr[1] if it lies inside the file
+                               eol += '\n';
+                       int & eolcount = (eol.GetLength() == 2) ? m_txtstats.ncrlfs : ((*eolptr == '\r') ? m_txtstats.ncrs : m_txtstats.nlfs);
+                       ++eolcount; // keep the eol statistics in step with the other code paths
+                       ++m_lineno;
+               }
+               m_current = eolptr + eol.GetLength();
+               return TRUE; // data was read (the check above ruled out EOF), so a final unterminated line must be reported too
+       }
+
+       while (m_current - m_base + (m_charsize-1) < m_filesize)
+       {
+               UINT ch=0;
+               UINT utf8len=0;
+               bool doneline=false;
+
+               if (m_unicoding == ucr::UTF8)
+               {
+                       // check for end in middle of UTF-8 character
+                       utf8len = ucr::Utf8len_fromLeadByte(*m_current);
+                       if (m_current - m_base + utf8len > m_filesize)
+                       {
+                               ch = '?';
+                               // consume only the bytes that remain, so the advance below stops exactly at end of file and the '?' is appended
+                               utf8len = (UINT)(m_base + m_filesize - m_current);
+                       }
+                       // Handle bad UTF-8 or UTF-8 outside of UCS-2
+                       // (Convert bad bytes individually to '?')
+                       else if (utf8len < 1 || utf8len > 4)
+                       {
+                               ch = '?';
+                               utf8len=1;
+                       }
+                       else
+                       {
+                               ch = ucr::GetUtf8Char(m_current);
+                       }
+               }
+               else
+               {
+                       ch = ucr::get_unicode_char(m_current, (ucr::UNICODESET)m_unicoding, m_codepage);
+                       if (!ch)
+                               doneline = true;
+               }
+               // convert from Unicode codepoint to TCHAR string
+               // could be multicharacter if decomposition took place, for example
+               bool lossy = false; // try to avoid lossy conversion
+               CString sch = ucr::maketchar(ch, lossy);
+               if (lossy)
+                       ++m_txtstats.nlosses;
+               if (sch.GetLength() >= 1)
+                       ch = sch[0];
+               else
+                       ch = 0;
+
+
+               if (ch == '\r')
+               {
+                       eol = _T("\r");
+                       doneline = true;
+                       bool crlf = false;
+                       // check for crlf pair
+                       if (m_current - m_base + 2 * m_charsize - 1 < m_filesize)
+                       {
+                               // For UTF-8, this ch will be wrong if character is non-ASCII
+                               // but we only check it against \n here, so it doesn't matter
+                               UINT ch = ucr::get_unicode_char(m_current+m_charsize, (ucr::UNICODESET)m_unicoding);
+                               if (ch == '\n')
+                               {
+                                       crlf = true;
+                               }
+                       }
+                       if (crlf)
+                       {
+                               eol = _T("\r\n");
+                               ++m_txtstats.ncrlfs;
+                               // advance an extra character to skip the following lf
+                               m_current += m_charsize;
+                       }
+                       else
+                       {
+                               ++m_txtstats.ncrs;
+                       }
+               }
+               else if (ch == '\n')
+               {
+                       eol = _T("\n");
+                       doneline = true;
+                       ++m_txtstats.nlfs;
+               }
+               else if (!ch)
+               {
+                       doneline = true;
+                       ++m_txtstats.nzeros;
+               }
+               // always advance to next character
+               if (m_unicoding == ucr::UTF8)
+               {
+                       m_current += utf8len;
+               }
+               else
+               {
+                       m_current += m_charsize;
+               }
+               if (doneline)
+               {
+                       if (!eol.IsEmpty())
+                               ++m_lineno;
+                       return TRUE;
+               }
+               line += sch;
+       }
+       return TRUE;
+}
+
diff --git a/Src/Common/UniFile.h b/Src/Common/UniFile.h

new file mode 100644 (file)

index 0000000..d0728e1
--- /dev/null
+++ b/Src/Common/UniFile.h
@@ -0,0 +1,118 @@
+/** 
+ *  @file   UniFile.h
+ *  @author Perry Rapp, Creator, 2003
+ *  @date   Created: 2003-10
+ *  @date   Edited:  2003-10-21 (Perry)
+ *
+ *  @brief  Declaration of Memory-Mapped Unicode enabled file class
+ */
+
+#ifndef UniFile_h_included
+#define UniFile_h_included
+
+/**
+ * @brief Interface to file classes in this module (open, BOM detection, line-by-line reading, eol statistics)
+ */
+class UniFile
+{
+public:
+       struct UniError // public: it is the return type of GetLastUniError and a member type in derived classes
+       {
+               CString apiname;
+               int syserrnum; // valid if apiname nonempty
+               CString desc; // valid if apiname empty
+               bool hasError() const { return !apiname.IsEmpty() || !desc.IsEmpty(); }
+               void ClearError() { apiname = _T(""); syserrnum = ERROR_SUCCESS; desc = _T(""); }
+               UniError() { ClearError(); }
+       };
+       virtual ~UniFile() { } // polymorphic base: allow destruction through a UniFile pointer
+       virtual bool OpenReadOnly() = 0;
+
+       virtual void Close() = 0;
+
+       virtual CString GetFullyQualifiedPath() const = 0;
+
+       virtual UniError GetLastUniError() const = 0;
+
+       virtual bool ReadBom() = 0;
+       virtual int GetUnicoding() = 0;
+       virtual void SetCodepage(int codepage) = 0;
+
+       virtual BOOL ReadString(CString & line) = 0;
+       virtual BOOL ReadString(CString & line, CString & eol) = 0;
+       virtual int GetLineNumber() const = 0;
+
+       struct txtstats
+       {
+               int ncrs;
+               int nlfs;
+               int ncrlfs;
+               int nzeros;
+               int nlosses;
+               txtstats() { clear(); }
+               void clear() { ncrs = nlfs = ncrlfs = nzeros = nlosses = 0; } // nlosses included: it is incremented and tested by callers
+       };
+       virtual const txtstats & GetTxtStats() const = 0;
+};
+
+
+/**
+ * @brief Memory-Mapped disk file: UniFile implementation that maps the file into memory and reads lines from the mapping
+ */
+class UniMemFile : public UniFile
+{
+public:
+       UniMemFile(LPCTSTR filename);
+       virtual ~UniMemFile() { Close(); } // releases view, mapping and file handle
+
+       virtual bool GetFileStatus(); // refreshes status and size; needs an open handle
+
+       virtual bool OpenReadOnly();
+       virtual bool Open();
+       virtual bool Open(DWORD dwOpenAccess, DWORD dwOpenShareMode, DWORD dwOpenCreationDispostion, DWORD dwMappingProtect, DWORD dwMapViewAccess);
+
+       void Close();
+
+       virtual CString GetFullyQualifiedPath() const { return m_filepath; }
+       const CFileStatus & GetFileStatus() const { return m_filestatus; } // status as last fetched by the non-const overload
+
+       virtual UniError GetLastUniError() const { return m_lastError; }
+
+       virtual bool ReadBom();
+       virtual int GetUnicoding() { return m_unicoding; }
+       virtual void SetCodepage(int codepage) { m_codepage = codepage; }
+
+       virtual BOOL ReadString(CString & line);
+       virtual BOOL ReadString(CString & line, CString & eol);
+       virtual int GetLineNumber() const { return m_lineno; }
+
+       // Counters accumulated by ReadString (eol kinds, zero chars, lossy conversions)
+       virtual const txtstats & GetTxtStats() const { return m_txtstats; }
+
+// Implementation methods
+protected:
+       virtual bool DoOpen(DWORD dwOpenAccess, DWORD dwOpenShareMode, DWORD dwOpenCreationDispostion, DWORD dwMappingProtect, DWORD dwMapViewAccess);
+       virtual void LastError(LPCTSTR apiname, int syserrnum);
+       virtual void LastErrorCustom(LPCTSTR desc);
+
+// Implementation data
+private:
+       int m_statusFetched; // 0 not fetched, -1 error, +1 success
+       CFileStatus m_filestatus;
+       __int64 m_filesize;
+       CString m_filepath;
+       CString m_filename;
+       HANDLE m_handle;
+       HANDLE m_hMapping;
+       LPBYTE m_base; // points to base of mapping
+       LPBYTE m_data; // similar to m_base, but after BOM if any
+       LPBYTE m_current; // current location in file
+       int m_lineno; // current 0-based line of m_current
+       UniError m_lastError;
+       bool m_readbom; // whether have tested for BOM
+       int m_unicoding; // enum UNICODESET in unicoder.h
+       int m_charsize; // 2 for UCS-2, else 1
+       int m_codepage; // only valid if m_unicoding==ucr::NONE;
+       txtstats m_txtstats;
+};
+
+#endif // UniFile_h_included
diff --git a/Src/Common/unicoder.cpp b/Src/Common/unicoder.cpp

index 2228d5e..1b8eef2 100644 (file)
--- a/Src/Common/unicoder.cpp
+++ b/Src/Common/unicoder.cpp
@@ -1,11 +1,18 @@
  /**
- *  @file unicoder.cpp
+ *  @file   unicoder.cpp
+ *  @author Perry Rapp, Creator, 2003
+ *  @date   Created: 2003-10
+ *  @date   Edited:  2003-10-21 (Perry)
   *
- *  @brief Implementation of utility unicode conversion routines
- *
- */ 
-// RCS ID line follows -- this is updated by CVS
-// $Id$
+ *  @brief  Implementation of utility unicode conversion routines
+ */
+
+/* The MIT License
+Copyright (c) 2003 Perry Rapp
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
  
  #include "StdAfx.h"
  #include "unicoder.h"
@@ -479,7 +486,7 @@ convertToBuffer(const CString & src, LPVOID dest, UNICODESET codeset)
   * @brief Extract character from pointer, handling UCS-2 codesets (doesn't handle UTF-8)
   */
  UINT
-get_unicode_char(unsigned char * ptr, UNICODESET codeset)
+get_unicode_char(unsigned char * ptr, UNICODESET codeset, int codepage)
  {
         UINT ch;
         switch (codeset)
@@ -491,9 +498,85 @@ get_unicode_char(unsigned char * ptr, UNICODESET codeset)
                 ch = (ptr[0] << 8) + ptr[1];
                 break;
         default:
-               ch = byteToUnicode(*ptr);
+               ch = (codepage ? byteToUnicode(*ptr, codepage) : byteToUnicode(*ptr));
         }
         return ch;
  }
  
+/**
+ * @brief Convert series of bytes (8-bit chars) to TCHARs, using specified codepage
+ * @a lpd points at @a len bytes (not NUL-terminated); a @a codepage of 0 selects the default ANSI codepage. Returns "?" if a conversion call fails.
+ * TODO: This doesn't inform the caller whether translation was lossy
+ *  In fact, this doesn't even know. Probably going to have to make
+ *  two passes, the first with MB_ERR_INVALID_CHARS. Ugh. :(
+ */
+CString maketstring(unsigned char * lpd, UINT len, int codepage)
+{
+       static bool vercheck=false;
+       static int defcodepage = CP_ACP;
+       if (!vercheck)
+       {
+               if (!f_osvi_fetched) fetch_verinfo();
+               // Need 2000 or better for CP_THREAD_ACP
+               if (f_osvi.dwMajorVersion>=5)
+                       defcodepage = CP_THREAD_ACP;
+               vercheck = true;
+       }
+       if (!len) return _T("");
+       if (!codepage)
+               codepage = defcodepage;
+
+       if (codepage == defcodepage)
+       {
+               // trivial case, they want the bytes in the file interpreted in our current codepage
+#ifndef UNICODE
+               return CString((LPCSTR)lpd, len); // lpd is not NUL-terminated: copy exactly len bytes
+#else
+               CString s;
+               for (UINT i=0; i<len; ++i)
+                       s += (wchar_t)(*lpd++);
+               return s;
+#endif
+       }
+
+       // Convert input to Unicode, using specified codepage
+       DWORD flags = 0;
+       int wlen = len*2+6;
+       wchar_t * wbuff = new wchar_t[wlen];
+       int n = MultiByteToWideChar(codepage, flags, (LPCSTR)lpd, len, wbuff, wlen-1);
+       if (!n)
+       {
+               delete [] wbuff;
+               return _T("?");
+       }
+       wbuff[n] = 0; // zero-terminate string
+
+#ifdef UNICODE
+       // wchar_t is TCHAR, so we're done
+       CString str = wbuff;
+       delete [] wbuff;
+       return str;
+#else
+       // Now convert to TCHAR (which means defcodepage)
+       flags = WC_NO_BEST_FIT_CHARS; // TODO: Think about this
+       CString str;
+       wlen = n;
+       int clen = wlen * 2 + 6;
+       LPSTR cbuff = str.GetBuffer(clen);
+       BOOL defaulted=FALSE;
+       n = WideCharToMultiByte(defcodepage, flags, wbuff, n, cbuff, clen-1, NULL, &defaulted);
+       // Terminate and release the buffer on both outcomes: the CString must not
+       // be assigned to while a GetBuffer() is still outstanding.
+       // On failure n is 0, which simply leaves an empty string before the "?" fallback.
+       cbuff[n] = 0; // zero-terminate string
+       str.ReleaseBuffer();
+       if (!n)
+       {
+               str = _T("?");
+       }
+       delete [] wbuff;
+       return str;
+#endif
+}
+
  } // namespace ucr
diff --git a/Src/Common/unicoder.h b/Src/Common/unicoder.h

index 91f98c1..8c3dc9e 100644 (file)
--- a/Src/Common/unicoder.h
+++ b/Src/Common/unicoder.h
@@ -1,18 +1,18 @@
  /**
- *  @file unicoder.h
- *
- *  @brief Declaration of utility unicode conversion routines
+ *  @file   unicoder.h
+ *  @author Perry Rapp, Creator, 2003
+ *  @date   Created: 2003-10
+ *  @date   Edited:  2003-10-21 (Perry)
   *
+ *  @brief  Declaration of utility unicode conversion routines
   */ 
-// RCS ID line follows -- this is updated by CVS
-// $Id$
  
  #ifndef unicoder_h_included
  #define unicoder_h_included
  
  namespace ucr {
  
-typedef enum { NONE=1, UCS2LE, UCS2BE, UTF8 } UNICODESET;
+typedef enum { NONE=0, UCS2LE, UCS2BE, UTF8 } UNICODESET;
  
  int Ucs4_to_Utf8(UINT unich, unsigned char * utf8);
  int Utf8len_fromLeadByte(unsigned char ch);
@@ -22,7 +22,8 @@ UINT GetUtf8Char(unsigned char * str);
  int to_utf8_advance(UINT u, unsigned char * &lpd);
  CString maketchar(UINT ch, bool & lossy);
  void convertToBuffer(const CString & src, LPVOID dest, UNICODESET codeset);
-UINT get_unicode_char(unsigned char * ptr, UNICODESET codeset);
+UINT get_unicode_char(unsigned char * ptr, UNICODESET codeset, int codepage=0);
+CString maketstring(unsigned char * lpd, UINT len, int codepage=0);
  CString maketchar(UINT unich, bool & lossy);
  CString maketchar(UINT unich, bool & lossy, UINT codepage);
  UINT byteToUnicode(unsigned char ch);
diff --git a/Src/Merge.dsp b/Src/Merge.dsp

index b03cc7a..dd73a88 100644 (file)
--- a/Src/Merge.dsp
+++ b/Src/Merge.dsp
@@ -1322,6 +1322,10 @@ SOURCE=.\UnicodeUtf8.cpp
  # End Source File
  # Begin Source File
  
+SOURCE=..\common\UniFile.cpp
+# End Source File
+# Begin Source File
+
  SOURCE=.\UTIL.C
  
  !IF  "$(CFG)" == "Merge - Win32 Release"
@@ -1706,6 +1710,10 @@ SOURCE=.\UnicodeUtf8.h
  # End Source File
  # Begin Source File
  
+SOURCE=..\common\UniFile.h
+# End Source File
+# Begin Source File
+
  SOURCE=..\common\version.h
  # End Source File
  # Begin Source File
diff --git a/Src/MergeDoc.cpp b/Src/MergeDoc.cpp

index efcab09..3496d0e 100644 (file)
--- a/Src/MergeDoc.cpp
+++ b/Src/MergeDoc.cpp
@@ -47,6 +47,7 @@
  #include "WaitStatusCursor.h"
  #include "FileTransform.h"
  #include "unicoder.h"
+#include "UniFile.h"
  
  #ifdef _DEBUG
  #define new DEBUG_NEW
@@ -1040,11 +1041,11 @@ void CMergeDoc::CDiffTextBuffer::SetTempPath(CString path)
  /**
   * @brief Examine statistics in textFileStats and return a crystaltextbuffer enum value for line style
   */
-int GetTextFileStyle(const ParsedTextFile & parsedTextFile)
+int GetTextFileStyle(const UniMemFile::txtstats & stats)
  {
-       if (parsedTextFile.crlfs >= parsedTextFile.lfs)
+       if (stats.ncrlfs >= stats.nlfs)
         {
-               if (parsedTextFile.crlfs >= parsedTextFile.crs)
+               if (stats.ncrlfs >= stats.ncrs)
                 {
                         return CRLF_STYLE_DOS;
                 }
@@ -1055,7 +1056,7 @@ int GetTextFileStyle(const ParsedTextFile & parsedTextFile)
         }
         else
         {
-               if (parsedTextFile.lfs >= parsedTextFile.crs)
+               if (stats.nlfs >= stats.ncrs)
                 {
                         return CRLF_STYLE_UNIX;
                 }
@@ -1091,7 +1092,7 @@ int CMergeDoc::CDiffTextBuffer::LoadFromFile(LPCTSTR pszFileNameInit, PackingInf
  //     We call FreeAll just before reading m_aLines
  //     ASSERT(!m_bInit);
  //     ASSERT(m_aLines.GetSize() == 0);
-       MAPPEDFILEDATA fileData = {0};
+
         CString sExt;
         BOOL bSuccess = FALSE;
         int nRetVal = FRESULT_OK;
@@ -1103,54 +1104,44 @@ int CMergeDoc::CDiffTextBuffer::LoadFromFile(LPCTSTR pszFileNameInit, PackingInf
         if (def && def->encoding != -1)
                 m_nSourceEncoding = def->encoding;
         
-       // Init filedata struct and open file as memory mapped 
-       _tcsncpy(fileData.fileName, pszFileName, countof(fileData.fileName));
-       fileData.bWritable = FALSE;
-       fileData.dwOpenFlags = OPEN_EXISTING;
-       bSuccess = files_openFileMapped(&fileData);
-
-       // Inefficiency here
-       // We load up the line array in files_loadLines
-       // only to recopy them all into m_aLines
-       // This wouldn't be bad if it were a CString copy (as CStrings are reference counted)
-       // but I think that AppendLine leads to a new allocation
-       // so we are copying the whole file into textFileStats line buffers
-       // and then recopying it into crystal line buffers
-       // so... perhaps this could be improved.
-
-       ParsedTextFile parsedTextFile;
-       if (bSuccess)
-               nRetVal = files_loadLines(&fileData, &parsedTextFile);
-       else
-               nRetVal = FRESULT_ERROR;
-       
-       if (nRetVal == FRESULT_OK)
+       UniMemFile ufile(pszFileName);
+       UniFile * pufile = &ufile;
+
+       // Now we only use the UniFile interface
+       // which is something we could implement for HTTP and/or FTP files
+
+       if (pufile->OpenReadOnly())
         {
-               // FreeAll() is needed before loading (this is complicated)
-               FreeAll();
+               pufile->ReadBom();
+               UINT lineno = 0;
+               CString line, eol;
  
-               m_aLines.SetSize(parsedTextFile.lines.GetSize(), 4096);
+               // Manually grow line array exponentially
+               int arraysize = 500;
+               m_aLines.SetSize(arraysize);
                 
-               DWORD dwBytesRead = 0;
-               TCHAR *lpChar = (TCHAR *)fileData.pMapBase;
-               TCHAR *lpLineBegin = lpChar;
-               int eolChars = 0;
-               UINT lineno = 0;
-               for ( ; lineno < parsedTextFile.lines.GetSize(); ++lineno)
+               while (pufile->ReadString(line, eol))
                 {
-                       textline & lp = parsedTextFile.lines.GetAt(lineno);
-
-                       // Skipping iconvert call here
-                       // because I don't know if it even works
-                       // Perry 2003-09-15
+                       // Manually grow line array exponentially
+                       if (lineno == arraysize)
+                       {
+                               arraysize *= 2;
+                               m_aLines.SetSize(arraysize);
+                               
+                       }
  
-                       AppendLine(lineno, lp.sline, lp.sline.GetLength());
+                       line += eol;
+                       AppendLine(lineno, line, line.GetLength());
+                       ++lineno;
                 }
+               // fix array size (due to our manual exponential growth)
+               m_aLines.SetSize(lineno);
+       
                 
                 //Try to determine current CRLF mode (most frequent)
                 if (nCrlfStyle == CRLF_STYLE_AUTOMATIC)
                 {
-                       nCrlfStyle = GetTextFileStyle(parsedTextFile);
+                       nCrlfStyle = GetTextFileStyle(pufile->GetTxtStats());
                 }
                 ASSERT(nCrlfStyle >= 0 && nCrlfStyle <= 2);
                 SetCRLFMode(nCrlfStyle);
@@ -1178,7 +1169,7 @@ int CMergeDoc::CDiffTextBuffer::LoadFromFile(LPCTSTR pszFileNameInit, PackingInf
                 nRetVal = FRESULT_OK;
  
                 // stash original encoding away
-               switch (parsedTextFile.codeset)
+               switch (pufile->GetUnicoding())
                 {
                 case ucr::UCS2LE:
                         m_nSourceEncoding = -20;
@@ -1190,11 +1181,10 @@ int CMergeDoc::CDiffTextBuffer::LoadFromFile(LPCTSTR pszFileNameInit, PackingInf
                         m_nSourceEncoding = -22;
                         break;
                 }
-               if (parsedTextFile.lossy)
+               if (pufile->GetTxtStats().nlosses)
                         readOnly = TRUE;
         }
         
-       files_closeFileMapped(&fileData, 0xFFFFFFFF, FALSE);
  
         // delete the file that unpacking may have created
         if (_tcscmp(pszFileNameInit, pszFileName) != 0)
diff --git a/Src/readme.txt b/Src/readme.txt

index ceccdd6..2a2be0c 100644 (file)
--- a/Src/readme.txt
+++ b/Src/readme.txt
@@ -1,3 +1,8 @@
+2003-10-23 Perry
+ PATCH: [ 826250 ] New UniMemFile class for memory-mapping
+  WinMerge: Merge.dsp MergeDoc.cpp
+  common: unicoder.cpp unicoder.h UniFile.cpp UniFile.h
+
  2003-10-22 Perry
   Fix compile for MakeResDll
    common: coretools.cpp
author	Perry Rapp <elsapo@users.sourceforge.net>
	Thu, 23 Oct 2003 00:04:34 +0000 (00:04 +0000)
committer	Perry Rapp <elsapo@users.sourceforge.net>
	Thu, 23 Oct 2003 00:04:34 +0000 (00:04 +0000)
Src/Common/UniFile.cpp	[new file with mode: 0644]	patch \| blob
Src/Common/UniFile.h	[new file with mode: 0644]	patch \| blob
Src/Common/unicoder.cpp		patch \| blob \| history
Src/Common/unicoder.h		patch \| blob \| history
Src/Merge.dsp		patch \| blob \| history
Src/MergeDoc.cpp		patch \| blob \| history
Src/readme.txt		patch \| blob \| history