Upgrade to mksh R56b.

[android-x86/external-mksh.git] / src / shf.c
diff --git a/src/shf.c b/src/shf.c

index 09cc7c3..7e53352 100644 (file)
--- a/src/shf.c
+++ b/src/shf.c
@@ -4,6 +4,8 @@
   * Copyright (c) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2011,
   *              2012, 2013, 2015, 2016, 2017
   *     mirabilos <m@mirbsd.org>
+ * Copyright (c) 2015
+ *     Daniel Richard G. <skunk@iSKUNK.ORG>
   *
   * Provided that these terms and disclaimer and all copyright notices
   * are retained or reproduced in an accompanying document, permission
@@ -25,7 +27,7 @@
  
  #include "sh.h"
  
-__RCSID("$MirOS: src/bin/mksh/shf.c,v 1.79 2017/04/12 17:08:49 tg Exp $");
+__RCSID("$MirOS: src/bin/mksh/shf.c,v 1.95 2017/05/05 22:45:58 tg Exp $");
  
  /* flags to shf_emptybuf() */
  #define EB_READSW      0x01    /* about to switch to reading */
@@ -874,11 +876,11 @@ shf_vfprintf(struct shf *shf, const char *fmt, va_list args)
                                 flags |= FL_SIZET;
                                 continue;
                         }
-                       if (ksh_isdigit(c)) {
+                       if (ctype(c, C_DIGIT)) {
                                 bool overflowed = false;
  
                                 tmp = ksh_numdig(c);
-                               while (c = *fmt++, ksh_isdigit(c))
+                               while (ctype((c = *fmt++), C_DIGIT))
                                         if (notok2mul(2147483647, tmp, 10))
                                                 overflowed = true;
                                         else
@@ -899,7 +901,7 @@ shf_vfprintf(struct shf *shf, const char *fmt, va_list args)
                         /* nasty format */
                         break;
  
-               if (ksh_isupper(c)) {
+               if (ctype(c, C_UPPER)) {
                         flags |= FL_UPPER;
                         c = ksh_tolower(c);
                 }
@@ -1029,8 +1031,7 @@ shf_vfprintf(struct shf *shf, const char *fmt, va_list args)
                         if (!(flags & FL_RIGHT)) {
                                 /* skip past sign or 0x when padding with 0 */
                                 if ((flags & FL_ZERO) && (flags & FL_NUMBER)) {
-                                       if (*s == '+' || *s == '-' ||
-                                           *s == ' ') {
+                                       if (ctype(*s, C_SPC | C_PLUS | C_MINUS)) {
                                                 shf_putc(*s, shf);
                                                 s++;
                                                 precision--;
@@ -1158,3 +1159,163 @@ cstrerror(int errnum)
         }
  }
  #endif
+
+/* fast character classes: template of class bits for the 128 ASCII code points; set_ifs() loads it into ksh_ctypes */
+const uint32_t tpl_ctypes[128] = {
+       /* 0x00: NUL, then control characters (TAB, NL, CR have their own classes) */
+       CiNUL,          CiCNTRL,        CiCNTRL,        CiCNTRL,
+       CiCNTRL,        CiCNTRL,        CiCNTRL,        CiCNTRL,
+       CiCNTRL,        CiTAB,          CiNL,           CiSPX,
+       CiSPX,          CiCR,           CiCNTRL,        CiCNTRL,
+       /* 0x10: control characters */
+       CiCNTRL,        CiCNTRL,        CiCNTRL,        CiCNTRL,
+       CiCNTRL,        CiCNTRL,        CiCNTRL,        CiCNTRL,
+       CiCNTRL,        CiCNTRL,        CiCNTRL,        CiCNTRL,
+       CiCNTRL,        CiCNTRL,        CiCNTRL,        CiCNTRL,
+       /* 0x20: space, then punctuation up to the slash */
+       CiSP,           CiALIAS | CiVAR1,       CiQC,   CiHASH,
+       CiSS,           CiPERCT,        CiQCL,          CiQC,
+       CiQCL,          CiQCL,          CiQCX | CiVAR1, CiPLUS,
+       CiALIAS,        CiMINUS,        CiALIAS,        CiQCM,
+       /* 0x30: digits 0-7 (octal), 8-9, then colon through question mark */
+       CiOCTAL,        CiOCTAL,        CiOCTAL,        CiOCTAL,
+       CiOCTAL,        CiOCTAL,        CiOCTAL,        CiOCTAL,
+       CiDIGIT,        CiDIGIT,        CiCOLON,        CiQCL,
+       CiANGLE,        CiEQUAL,        CiANGLE,        CiQUEST,
+       /* 0x40: at sign, then A-O (A-F additionally carry the hex-letter bit) */
+       CiALIAS | CiVAR1,       CiUPPER | CiHEXLT,
+       CiUPPER | CiHEXLT,      CiUPPER | CiHEXLT,
+       CiUPPER | CiHEXLT,      CiUPPER | CiHEXLT,
+       CiUPPER | CiHEXLT,      CiUPPER,
+       CiUPPER,        CiUPPER,        CiUPPER,        CiUPPER,
+       CiUPPER,        CiUPPER,        CiUPPER,        CiUPPER,
+       /* 0x50: P-Z, then brackets, backslash, caret and underscore */
+       CiUPPER,        CiUPPER,        CiUPPER,        CiUPPER,
+       CiUPPER,        CiUPPER,        CiUPPER,        CiUPPER,
+       CiUPPER,        CiUPPER,        CiUPPER,        CiQCX | CiBRACK,
+       CiQCX,          CiBRACK,        CiQCM,          CiUNDER,
+       /* 0x60: grave accent, then a-o (a-f additionally carry the hex-letter bit) */
+       CiGRAVE,                CiLOWER | CiHEXLT,
+       CiLOWER | CiHEXLT,      CiLOWER | CiHEXLT,
+       CiLOWER | CiHEXLT,      CiLOWER | CiHEXLT,
+       CiLOWER | CiHEXLT,      CiLOWER,
+       CiLOWER,        CiLOWER,        CiLOWER,        CiLOWER,
+       CiLOWER,        CiLOWER,        CiLOWER,        CiLOWER,
+       /* 0x70: p-z, then braces, bar, tilde and DEL (classed as control) */
+       CiLOWER,        CiLOWER,        CiLOWER,        CiLOWER,
+       CiLOWER,        CiLOWER,        CiLOWER,        CiLOWER,
+       CiLOWER,        CiLOWER,        CiLOWER,        CiCURLY,
+       CiQCL,          CiCURLY,        CiQCM,          CiCNTRL
+};
+
+void
+set_ifs(const char *s) /* reset ksh_ctypes from tpl_ctypes, then OR CiIFS into the entry of every character in s */
+{
+#if defined(MKSH_EBCDIC) || defined(MKSH_FAUX_EBCDIC)
+       int i = 256;
+
+       memset(ksh_ctypes, 0, sizeof(ksh_ctypes));      /* octets without an ASCII equivalent keep no class bits */
+       while (i--)
+               if (ebcdic_map[i] < 0x80U)      /* only values below 0x80 are valid tpl_ctypes indices */
+                       ksh_ctypes[i] = tpl_ctypes[ebcdic_map[i]];
+#else
+       memcpy(ksh_ctypes, tpl_ctypes, sizeof(tpl_ctypes));     /* native ASCII: the template applies as-is */
+       memset((char *)ksh_ctypes + sizeof(tpl_ctypes), '\0',   /* clear whatever part of ksh_ctypes lies beyond the template */
+           sizeof(ksh_ctypes) - sizeof(tpl_ctypes));
+#endif
+       ifs0 = *s;      /* remember the first character of s (NUL when s is empty) */
+       while (*s)
+               ksh_ctypes[ord(*s++)] |= CiIFS; /* ord() yields a non-negative index even for high-bit chars */
+}
+
+#if defined(MKSH_EBCDIC) || defined(MKSH_FAUX_EBCDIC)
+#include <locale.h>
+
+/*
+ * Many headaches with EBCDIC:
+ * 1. There are numerous EBCDIC variants, and it is not feasible for us
+ *    to support them all. But we can support the EBCDIC code pages that
+ *    contain all (most?) of the characters in ASCII, and these
+ *    usually tend to agree on the code points assigned to the ASCII
+ *    subset. If you need a representative example, look at EBCDIC 1047,
+ *    which is first among equals in the IBM MVS development
+ *    environment: https://en.wikipedia.org/wiki/EBCDIC_1047
+ *    Unfortunately, the square brackets are not consistently mapped,
+ *    and for certain reasons, we need an unambiguous bijective
+ *    mapping between EBCDIC and "extended ASCII".
+ * 2. Character ranges that are contiguous in ASCII, like the letters
+ *    in [A-Z], are broken up into segments (i.e. [A-IJ-RS-Z]), so we
+ *    can't implement e.g. islower() as { return c >= 'a' && c <= 'z'; }
+ *    because it will also return true for a handful of extraneous
+ *    characters (like the plus-minus sign at 0x8F in EBCDIC 1047, a
+ *    little after 'i'). But at least '_' is not one of these.
+ * 3. The normal [0-9A-Za-z] characters are at codepoints beyond 0x80.
+ *    Not only do they require all 8 bits instead of 7, if chars are
+ *    signed, they will have negative integer values! Something like
+ *    (c - 'A') could actually become (c + 63)! Use the ord() macro to
+ *    ensure you're getting a value in [0, 255].
+ * 4. '\n' is actually NL (0x15, U+0085) instead of LF (0x25, U+000A).
+ *    EBCDIC has a proper newline character instead of "emulating" one
+ *    with line feeds, although this is mapped to LF for our purposes.
+ * 5. Note that it is possible to compile programs in ASCII mode on IBM
+ *    mainframe systems, using the -qascii option to the XL C compiler.
+ *    We can determine the build mode by looking at __CHARSET_LIB:
+ *    0 == EBCDIC, 1 == ASCII
+ */
+
+void
+ebcdic_init(void)      /* fill ebcdic_rtt_toascii, ebcdic_rtt_fromascii and ebcdic_map; exits with status 255 if the mapping is unusable */
+{
+       int i = 256;
+       unsigned char t;
+       bool mapcache[256];     /* mapcache[t] is set once some octet has mapped to t */
+
+       while (i--)
+               ebcdic_rtt_toascii[i] = i;      /* start from the identity mapping */
+       memset(ebcdic_rtt_fromascii, 0xFF, sizeof(ebcdic_rtt_fromascii));
+       setlocale(LC_ALL, "");
+#ifdef MKSH_EBCDIC
+       if (__etoa_l(ebcdic_rtt_toascii, 256) != 256) { /* NOTE(review): presumably converts the 256 octets in place and returns the count - confirm */
+               write(2, "mksh: could not map EBCDIC to ASCII\n", 36);
+               exit(255);
+       }
+#endif
+
+       memset(mapcache, 0, sizeof(mapcache));
+       i = 256;
+       while (i--) {
+               t = ebcdic_rtt_toascii[i];
+               /* ensure unique round-trip capable mapping */
+               if (mapcache[t]) {
+                       write(2, "mksh: duplicate EBCDIC to ASCII mapping\n", 40);
+                       exit(255);
+               }
+               /*
+                * since there are 256 input octets, this also ensures
+                * the other mapping direction is completely filled
+                */
+               mapcache[t] = true;
+               /* fill the complete round-trip map */
+               ebcdic_rtt_fromascii[t] = i;
+               /*
+                * Only use the converted value if it's in the range
+                * [0x00; 0x7F], which I checked; the "extended ASCII"
+                * characters can be any encoding, not just Latin1,
+                * and the C1 control characters other than NEL are
+                * hopeless, but we map EBCDIC NEL to ASCII LF so we
+                * cannot even use C1 NEL.
+                * If ever we map to Unicode, bump the table width to
+                * an unsigned int, and OR the raw unconverted EBCDIC
+                * values with 0x01000000 instead.
+                */
+               if (t < 0x80U)
+                       ebcdic_map[i] = (unsigned short)ord(t);
+               else
+                       ebcdic_map[i] = (unsigned short)(0x100U | ord(i));      /* bit 0x100 flags a raw, unconverted octet */
+       }
+       if (ebcdic_rtt_toascii[0] || ebcdic_rtt_fromascii[0] || ebcdic_map[0]) {        /* NUL must map to itself in all three tables */
+               write(2, "mksh: NUL not at position 0\n", 28);
+               exit(255);
+       }
+}
+#endif