* fhandler.h (class dev_console): Constify charset parameter of str_to_con.

author corinna <corinna>

Mon, 28 Sep 2009 12:10:32 +0000 (12:10 +0000)

committer corinna <corinna>

Mon, 28 Sep 2009 12:10:32 +0000 (12:10 +0000)
author corinna <corinna>
Mon, 28 Sep 2009 12:10:32 +0000 (12:10 +0000)
committer corinna <corinna>
Mon, 28 Sep 2009 12:10:32 +0000 (12:10 +0000)
diff --git a/winsup/cygwin/ChangeLog b/winsup/cygwin/ChangeLog

index b47845a..d978eee 100644 (file)
--- a/winsup/cygwin/ChangeLog
+++ b/winsup/cygwin/ChangeLog
@@ -1,5 +1,31 @@
  2009-09-28  Corinna Vinschen  <corinna@vinschen.de>
  
+       * fhandler.h (class dev_console): Constify charset parameter of
+       str_to_con.
+       * fhandler_console.cc (dev_console::con_to_str): Simplify.  Always
+       default to the current internal locale.
+       (dev_console::get_console_cp): Always use codepage 437 for alternate
+       charset.
+       (dev_console::str_to_con): Constify charset parameter.
+       (fhandler_console::write_normal): Always use codepage 437 for alternate
+       charset.  Otherwise always default to the current internal locale.
+       Replace ASCII SO with ASCII CAN.
+       * strfuncs.cc: Tweak comments according to below changes.
+       (sys_cp_wcstombs): Constify charset parameter.  Convert all wchar_t
+       values in the Unicode private use area U+F0xx to the singlebyte
+       counterpart.  Drop special handling creating ASCII SO sequence from
+       U+DCxx value.  Rearrange for performance.  Replace ASCII SO with
+       ASCII CAN.
+       (sys_cp_mbstowcs): Constify charset parameter.  Replace ASCII SO with
+       ASCII CAN.  Drop special case for U+DCxx ASCII SO sequences.  Always
+       create a replacement from the Unicode private use area U+F0xx for
+       invalid byte values in a multibyte sequence.  Do the same for wchar_t
+       values from the U+F0xx range to make them roundtrip safe.
+       * wchar.h (sys_cp_wcstombs): Constify charset parameter.
+       (sys_cp_mbstowcs): Ditto.
+
+2009-09-28  Corinna Vinschen  <corinna@vinschen.de>
+
         * cygheap.cc (cygheap_init): Default locale.charset to "UTF-8".
         * dcrt0.cc (dll_crt0_1): Call setlocale (LC_CTYPE, "") here, just
         before resetting application locale to "C".
diff --git a/winsup/cygwin/fhandler.h b/winsup/cygwin/fhandler.h

index dd9b591..dac0ce2 100644 (file)
--- a/winsup/cygwin/fhandler.h
+++ b/winsup/cygwin/fhandler.h
@@ -934,7 +934,7 @@ class dev_console
  
    inline UINT get_console_cp ();
    DWORD con_to_str (char *d, int dlen, WCHAR w);
-  DWORD str_to_con (mbtowc_p, char *, PWCHAR d, const char *s, DWORD sz);
+  DWORD str_to_con (mbtowc_p, const char *, PWCHAR d, const char *s, DWORD sz);
    void set_color (HANDLE);
    bool fillin_info (HANDLE);
    void set_default_attr ();
diff --git a/winsup/cygwin/fhandler_console.cc b/winsup/cygwin/fhandler_console.cc

index 53da269..251a827 100644 (file)
--- a/winsup/cygwin/fhandler_console.cc
+++ b/winsup/cygwin/fhandler_console.cc
@@ -127,19 +127,19 @@ tty_list::get_tty (int n)
  inline DWORD
  dev_console::con_to_str (char *d, int dlen, WCHAR w)
  {
-  return sys_cp_wcstombs (*cygheap->locale.charset == 'A'
-                         ? __ascii_wctomb : cygheap->locale.wctomb,
-                         cygheap->locale.charset, d, dlen, &w, 1);
+  return sys_cp_wcstombs (cygheap->locale.wctomb, cygheap->locale.charset,
+                         d, dlen, &w, 1);
  }
  
  inline UINT
  dev_console::get_console_cp ()
  {
-  return alternate_charset_active ? GetConsoleOutputCP () : 0;
+  /* The alternate charset is always 437, just as in the Linux console. */
+  return alternate_charset_active ? 437 : 0;
  }
  
  inline DWORD
-dev_console::str_to_con (mbtowc_p f_mbtowc, char *charset,
+dev_console::str_to_con (mbtowc_p f_mbtowc, const char *charset,
                          PWCHAR d, const char *s, DWORD sz)
  {
    return sys_cp_mbstowcs (f_mbtowc, charset, d, CONVERT_LIMIT, s, sz);
@@ -1454,16 +1454,19 @@ fhandler_console::write_normal (const unsigned char *src,
    size_t ret;
    mbstate_t ps;
    UINT cp = dev_state->get_console_cp ();
-  char charsetbuf[ENCODING_LEN + 1];
-  char *charset;
+  const char *charset;
    mbtowc_p f_mbtowc;
  
    if (cp)
-    f_mbtowc = __set_charset_from_codepage (cp, charset = charsetbuf);
+    {
+      /* The alternate charset is always 437, just as in the Linux console. */
+      f_mbtowc = __cp_mbtowc;
+      charset = "CP437";
+    }
    else
      {
+      f_mbtowc = cygheap->locale.mbtowc;
        charset = cygheap->locale.charset;
-      f_mbtowc = (*charset == 'A') ? __ascii_mbtowc : cygheap->locale.mbtowc;
      }
  
    /* First check if we have cached lead bytes of a former try to write
@@ -1606,10 +1609,10 @@ fhandler_console::write_normal (const unsigned char *src,
           cursor_set (false, 0, y);
           break;
         case ERR:
-         /* Don't print chars marked as ERR chars, except for a SO sequence
-            which is printed as singlebyte chars from the UTF Basic Latin
-            and Latin 1 Supplement plains. */
-         if (*found == 0x0e)
+         /* Don't print chars marked as ERR chars, except for an ASCII CAN
+            sequence which is printed as singlebyte chars from the UTF
+            Basic Latin and Latin 1 Supplement blocks. */
+         if (*found == 0x18)
             {
               write_replacement_char ();
               if (found + 1 < end)
diff --git a/winsup/cygwin/strfuncs.cc b/winsup/cygwin/strfuncs.cc

index e273f76..009af17 100644 (file)
--- a/winsup/cygwin/strfuncs.cc
+++ b/winsup/cygwin/strfuncs.cc
@@ -310,8 +310,7 @@ __big5_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n,
     Called from newlib's setlocale() with codepage set to 0, if the
     charset isn't given explicitely in the POSIX compatible locale specifier.
     The function also returns a pointer to the corresponding _mbtowc_r
-   function.  Also called from fhandler_console::write_normal() if the
-   "Alternate Charset" has been switched on by an escape sequence. */
+   function. */
  extern "C" mbtowc_p
  __set_charset_from_codepage (UINT cp, char *charset)
  {
@@ -400,17 +399,17 @@ __set_charset_from_codepage (UINT cp, char *charset)
       multibyte charset, then usually you wouldn't be able to access the
       file.  To fix this problem, sys_wcstombs creates a replacement multibyte
       sequences for the non-representable wide-char.  The sequence starts with
-     an ASCII SO (0x0e, Ctrl-N), followed by the UTF-8 representation of the
-     character.  The sys_(cp_)mbstowcs function detects ASCII SO characters
+     an ASCII CAN (0x18, Ctrl-X), followed by the UTF-8 representation of the
+     character.  The sys_(cp_)mbstowcs function detects ASCII CAN characters
       in the input multibyte string and converts the following multibyte
       sequence in by treating it as an UTF-8 char.  If that fails, the ASCII
-     SO was probably standalone and it gets just copied over as ASCII SO.
+     CAN was probably standalone and it gets just copied over as ASCII CAN.
  
     - The functions always create 0-terminated results, no matter what.
       If the result is truncated due to buffer size, it's a bug in Cygwin
       and the buffer in the calling function should be raised. */
  size_t __stdcall
-sys_cp_wcstombs (wctomb_p f_wctomb, char *charset, char *dst, size_t len,
+sys_cp_wcstombs (wctomb_p f_wctomb, const char *charset, char *dst, size_t len,
                  const wchar_t *src, size_t nwc)
  {
    char buf[10];
@@ -426,46 +425,47 @@ sys_cp_wcstombs (wctomb_p f_wctomb, char *charset, char *dst, size_t len,
    while (n < len && nwc-- > 0)
      {
        wchar_t pw = *pwcs;
-      /* Convert UNICODE private use area.  Reverse functionality (only for
-        path names) is transform_chars in path.cc. */
-      if ((pw & 0xff00) == 0xf000)
-       pw &= 0xff;
-      int bytes = f_wctomb (_REENT, buf, pw, charset, &ps);
-      if (bytes == -1 && (pw & 0xff00) == 0xdc00)
+      int bytes;
+
+      /* Convert UNICODE private use area.  Reverse functionality for the
+         ASCII area <= 0x7f (only for path names) is transform_chars in
+        path.cc.  Reverse functionality for invalid bytes in a multibyte
+        sequence is in sys_cp_mbstowcs. */
+      if ((pw & 0xff00) == 0xf000 && ((pw & 0xff) <= 0x7f || MB_CUR_MAX > 1))
         {
-         /* Reverse functionality of the single invalid second half of a
-            surrogate pair in the 0xDCxx range specifying an invalid byte
-            value when converting from MB to WC.
-            The comment in sys_cp_mbstowcs below explains it. */
-         buf[0] = 0x0e; /* ASCII SO */
-         buf[1] = 0xff;
-         buf[2] = (char) (pw & 0xff);
-         bytes = 3;
-       }
-      else if (bytes == -1 && *charset != 'U'/*TF-8*/)
+         buf[0] = pw & 0xff;
+         bytes = 1;
+       }
+      else
         {
-         /* Convert chars invalid in the current codepage to a sequence
-            ASCII SO; UTF-8 representation of invalid char. */
-         buf[0] = 0x0e; /* ASCII SO */
-         bytes = __utf8_wctomb (_REENT, buf + 1, pw, charset, &ps);
-         if (bytes == -1)
+         bytes = f_wctomb (_REENT, buf, pw, charset, &ps);
+         if (bytes == -1 && *charset != 'U'/*TF-8*/)
             {
-             ++pwcs;
-             ps.__count = 0;
-             continue;
-           }
-         ++bytes; /* Add the ASCII SO to the byte count. */
-         if (ps.__count == -4 && nwc > 0) /* First half of a surrogate pair. */
-           {
-             ++pwcs;
-             if ((*pwcs & 0xfc00) != 0xdc00) /* Invalid second half. */
+             /* Convert chars invalid in the current codepage to a sequence
+                ASCII CAN; UTF-8 representation of invalid char. */
+             buf[0] = 0x18; /* ASCII CAN */
+             bytes = __utf8_wctomb (_REENT, buf + 1, pw, charset, &ps);
+             if (bytes == -1)
                 {
                   ++pwcs;
                   ps.__count = 0;
                   continue;
                 }
-             bytes += __utf8_wctomb (_REENT, buf + bytes, *pwcs, charset, &ps);
-             nwc--;
+             ++bytes; /* Add the ASCII CAN to the byte count. */
+             if (ps.__count == -4 && nwc > 0)
+               {
+                 /* First half of a surrogate pair. */
+                 ++pwcs;
+                 if ((*pwcs & 0xfc00) != 0xdc00) /* Invalid second half. */
+                   {
+                     ++pwcs;
+                     ps.__count = 0;
+                     continue;
+                   }
+                 bytes += __utf8_wctomb (_REENT, buf + bytes, *pwcs, charset,
+                                         &ps);
+                 nwc--;
+               }
             }
         }
        if (n + bytes <= len)
@@ -535,8 +535,8 @@ sys_wcstombs_alloc (char **dst_p, int type, const wchar_t *src, size_t nwc)
     charset, which is the charset returned by GetConsoleCP ().  Most of the
     time this is used for box and line drawing characters. */
  size_t __stdcall
-sys_cp_mbstowcs (mbtowc_p f_mbtowc, char *charset, wchar_t *dst, size_t dlen,
-                const char *src, size_t nms)
+sys_cp_mbstowcs (mbtowc_p f_mbtowc, const char *charset, wchar_t *dst,
+                size_t dlen, const char *src, size_t nms)
  {
    wchar_t *ptr = dst;
    unsigned const char *pmbs = (unsigned const char *) src;
@@ -551,10 +551,10 @@ sys_cp_mbstowcs (mbtowc_p f_mbtowc, char *charset, wchar_t *dst, size_t dlen,
      len = (size_t)-1;
    while (len > 0 && nms > 0)
      {
-      /* ASCII SO handling. */
-      if (*pmbs == 0x0e)
+      /* ASCII CAN handling. */
+      if (*pmbs == 0x18)
         {
-         /* Sanity check: If this is a lead SO byte for a following UTF-8
+         /* Sanity check: If this is a lead CAN byte for a following UTF-8
              sequence, there must be at least two more bytes left, and the
              next byte must be a valid UTF-8 start byte.  If the charset
              isn't UTF-8 anyway, try to convert the following bytes as UTF-8
@@ -565,16 +565,16 @@ sys_cp_mbstowcs (mbtowc_p f_mbtowc, char *charset, wchar_t *dst, size_t dlen,
                                      nms - 1, charset, &ps);
               if (bytes < 0)
                 {
-                 /* Invalid UTF-8 sequence?  Treat the ASCII SO character as
-                    stand-alone ASCII SO char. */
+                 /* Invalid UTF-8 sequence?  Treat the ASCII CAN character as
+                    stand-alone ASCII CAN char. */
                   bytes = 1;
                   if (dst)
-                   *ptr = 0x0e;
+                   *ptr = 0x18;
                   memset (&ps, 0, sizeof ps);
                 }
               else
                 {
-                 ++bytes; /* Count SO byte */
+                 ++bytes; /* Count CAN byte */
                   if (bytes > 1 && ps.__count == 4)
                     {
                       /* First half of a surrogate. */
@@ -594,40 +594,28 @@ sys_cp_mbstowcs (mbtowc_p f_mbtowc, char *charset, wchar_t *dst, size_t dlen,
                     }
                 }
             }
-         /* Sequence for an invalid byte originally created in the next outer
-            else branch below.  This must be converted back to a 0xDCxx value
-            as well. */
-         else if (nms > 2 && pmbs[1] == 0xff)
-           {
-             bytes = 3;
-             if (dst)
-               *ptr = L'\xdc80' | pmbs[2];
-           }
-         /* Otherwise it's just a simple ASCII SO. */
+         /* Otherwise it's just a simple ASCII CAN. */
           else
             {
               bytes = 1;
               if (dst)
-               *ptr = 0x0e;
+               *ptr = 0x18;
             }
         }
        else if ((bytes = f_mbtowc (_REENT, ptr, (const char *) pmbs, nms,
                                   charset, &ps)) < 0
-              && *pmbs >= 0x80)
+              || (bytes == 3 && pmbs[0] == 0xef && (pmbs[1] & 0xf4) == 0x80))
         {
-         /* This should probably be handled in f_mbtowc which can operate
-            on sequences rather than individual characters.
-            The technique is based on a discussion here:
-
+         /* The technique is based on a discussion here:
              http://www.mail-archive.com/linux-utf8@nl.linux.org/msg00080.html
  
-            This is hardly perfect.  Windows doesn't do anything sensical with
-            characters converted to this format.  It does allow processing of
-            src to continue, however, which, since there is no way to signal
-            decoding errors, seems like the best we can do. */
+            Invalid bytes in a multibyte sequence are converted to
+            the private use area which is already used to store ASCII
+            chars invalid in Windows filenames.  This technique allows
+            storing them in a symmetric way. */
           bytes = 1;
           if (dst)
-           *ptr = L'\xdc80' | *pmbs;
+           *ptr = L'\xf000' | *pmbs;
           memset (&ps, 0, sizeof ps);
         }
  
diff --git a/winsup/cygwin/wchar.h b/winsup/cygwin/wchar.h

index f989b7d..1398238 100644 (file)
--- a/winsup/cygwin/wchar.h
+++ b/winsup/cygwin/wchar.h
@@ -51,7 +51,7 @@ extern mbtowc_p __set_charset_from_codepage (unsigned int cp, char *charset);
  #endif
  
  #ifdef __INSIDE_CYGWIN__
-size_t __stdcall sys_cp_wcstombs (wctomb_p, char *, char *, size_t,
+size_t __stdcall sys_cp_wcstombs (wctomb_p, const char *, char *, size_t,
                                   const wchar_t *, size_t = (size_t) -1)
         __attribute__ ((regparm(3)));
  size_t __stdcall sys_wcstombs (char *dst, size_t len, const wchar_t * src,
@@ -61,7 +61,7 @@ size_t __stdcall sys_wcstombs_alloc (char **, int, const wchar_t *,
                                      size_t = (size_t) -1)
         __attribute__ ((regparm(3)));
  
-size_t __stdcall sys_cp_mbstowcs (mbtowc_p, char *, wchar_t *, size_t,
+size_t __stdcall sys_cp_mbstowcs (mbtowc_p, const char *, wchar_t *, size_t,
                                   const char *, size_t = (size_t) -1)
         __attribute__ ((regparm(3)));
  size_t __stdcall sys_mbstowcs (wchar_t * dst, size_t dlen, const char *src,
author	corinna <corinna>
	Mon, 28 Sep 2009 12:10:32 +0000 (12:10 +0000)
committer	corinna <corinna>
	Mon, 28 Sep 2009 12:10:32 +0000 (12:10 +0000)
winsup/cygwin/ChangeLog		patch \| blob \| history
winsup/cygwin/fhandler.h		patch \| blob \| history
winsup/cygwin/fhandler_console.cc		patch \| blob \| history
winsup/cygwin/strfuncs.cc		patch \| blob \| history
winsup/cygwin/wchar.h		patch \| blob \| history