* Copyright (c) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2011,
* 2012, 2013, 2015, 2016, 2017
* mirabilos <m@mirbsd.org>
+ * Copyright (c) 2015
+ * Daniel Richard G. <skunk@iSKUNK.ORG>
*
* Provided that these terms and disclaimer and all copyright notices
* are retained or reproduced in an accompanying document, permission
#include "sh.h"
-__RCSID("$MirOS: src/bin/mksh/shf.c,v 1.79 2017/04/12 17:08:49 tg Exp $");
+__RCSID("$MirOS: src/bin/mksh/shf.c,v 1.95 2017/05/05 22:45:58 tg Exp $");
/* flags to shf_emptybuf() */
#define EB_READSW 0x01 /* about to switch to reading */
flags |= FL_SIZET;
continue;
}
- if (ksh_isdigit(c)) {
+ if (ctype(c, C_DIGIT)) {
bool overflowed = false;
tmp = ksh_numdig(c);
- while (c = *fmt++, ksh_isdigit(c))
+ while (ctype((c = *fmt++), C_DIGIT))
if (notok2mul(2147483647, tmp, 10))
overflowed = true;
else
/* nasty format */
break;
- if (ksh_isupper(c)) {
+ if (ctype(c, C_UPPER)) {
flags |= FL_UPPER;
c = ksh_tolower(c);
}
if (!(flags & FL_RIGHT)) {
/* skip past sign or 0x when padding with 0 */
if ((flags & FL_ZERO) && (flags & FL_NUMBER)) {
- if (*s == '+' || *s == '-' ||
- *s == ' ') {
+ if (ctype(*s, C_SPC | C_PLUS | C_MINUS)) {
shf_putc(*s, shf);
s++;
precision--;
}
}
#endif
+
+/* fast character classes */
+const uint32_t tpl_ctypes[128] = {
+ /* 0x00 */
+ CiNUL, CiCNTRL, CiCNTRL, CiCNTRL,
+ CiCNTRL, CiCNTRL, CiCNTRL, CiCNTRL,
+ CiCNTRL, CiTAB, CiNL, CiSPX,
+ CiSPX, CiCR, CiCNTRL, CiCNTRL,
+ /* 0x10 */
+ CiCNTRL, CiCNTRL, CiCNTRL, CiCNTRL,
+ CiCNTRL, CiCNTRL, CiCNTRL, CiCNTRL,
+ CiCNTRL, CiCNTRL, CiCNTRL, CiCNTRL,
+ CiCNTRL, CiCNTRL, CiCNTRL, CiCNTRL,
+ /* 0x20 */
+ CiSP, CiALIAS | CiVAR1, CiQC, CiHASH,
+ CiSS, CiPERCT, CiQCL, CiQC,
+ CiQCL, CiQCL, CiQCX | CiVAR1, CiPLUS,
+ CiALIAS, CiMINUS, CiALIAS, CiQCM,
+ /* 0x30 */
+ CiOCTAL, CiOCTAL, CiOCTAL, CiOCTAL,
+ CiOCTAL, CiOCTAL, CiOCTAL, CiOCTAL,
+ CiDIGIT, CiDIGIT, CiCOLON, CiQCL,
+ CiANGLE, CiEQUAL, CiANGLE, CiQUEST,
+ /* 0x40 */
+ CiALIAS | CiVAR1, CiUPPER | CiHEXLT,
+ CiUPPER | CiHEXLT, CiUPPER | CiHEXLT,
+ CiUPPER | CiHEXLT, CiUPPER | CiHEXLT,
+ CiUPPER | CiHEXLT, CiUPPER,
+ CiUPPER, CiUPPER, CiUPPER, CiUPPER,
+ CiUPPER, CiUPPER, CiUPPER, CiUPPER,
+ /* 0x50 */
+ CiUPPER, CiUPPER, CiUPPER, CiUPPER,
+ CiUPPER, CiUPPER, CiUPPER, CiUPPER,
+ CiUPPER, CiUPPER, CiUPPER, CiQCX | CiBRACK,
+ CiQCX, CiBRACK, CiQCM, CiUNDER,
+ /* 0x60 */
+ CiGRAVE, CiLOWER | CiHEXLT,
+ CiLOWER | CiHEXLT, CiLOWER | CiHEXLT,
+ CiLOWER | CiHEXLT, CiLOWER | CiHEXLT,
+ CiLOWER | CiHEXLT, CiLOWER,
+ CiLOWER, CiLOWER, CiLOWER, CiLOWER,
+ CiLOWER, CiLOWER, CiLOWER, CiLOWER,
+ /* 0x70 */
+ CiLOWER, CiLOWER, CiLOWER, CiLOWER,
+ CiLOWER, CiLOWER, CiLOWER, CiLOWER,
+ CiLOWER, CiLOWER, CiLOWER, CiCURLY,
+ CiQCL, CiCURLY, CiQCM, CiCNTRL
+};
+
+void
+set_ifs(const char *s)
+{
+#if defined(MKSH_EBCDIC) || defined(MKSH_FAUX_EBCDIC)
+ int i = 256;
+
+ memset(ksh_ctypes, 0, sizeof(ksh_ctypes));
+ while (i--)
+ if (ebcdic_map[i] < 0x80U)
+ ksh_ctypes[i] = tpl_ctypes[ebcdic_map[i]];
+#else
+ memcpy(ksh_ctypes, tpl_ctypes, sizeof(tpl_ctypes));
+ memset((char *)ksh_ctypes + sizeof(tpl_ctypes), '\0',
+ sizeof(ksh_ctypes) - sizeof(tpl_ctypes));
+#endif
+ ifs0 = *s;
+ while (*s)
+ ksh_ctypes[ord(*s++)] |= CiIFS;
+}
+
+#if defined(MKSH_EBCDIC) || defined(MKSH_FAUX_EBCDIC)
+#include <locale.h>
+
+/*
+ * Many headaches with EBCDIC:
+ * 1. There are numerous EBCDIC variants, and it is not feasible for us
+ * to support them all. But we can support the EBCDIC code pages that
+ * contain all (most?) of the characters in ASCII, and these
+ * usually tend to agree on the code points assigned to the ASCII
+ * subset. If you need a representative example, look at EBCDIC 1047,
+ * which is first among equals in the IBM MVS development
+ * environment: https://en.wikipedia.org/wiki/EBCDIC_1047
+ * Unfortunately, the square brackets are not consistently mapped,
+ * and for certain reasons, we need an unambiguous bijective
+ * mapping between EBCDIC and "extended ASCII".
+ * 2. Character ranges that are contiguous in ASCII, like the letters
+ * in [A-Z], are broken up into segments (i.e. [A-IJ-RS-Z]), so we
+ * can't implement e.g. islower() as { return c >= 'a' && c <= 'z'; }
+ * because it will also return true for a handful of extraneous
+ * characters (like the plus-minus sign at 0x8F in EBCDIC 1047, a
+ * little after 'i'). But at least '_' is not one of these.
+ * 3. The normal [0-9A-Za-z] characters are at codepoints beyond 0x80.
+ * Not only do they require all 8 bits instead of 7, if chars are
+ * signed, they will have negative integer values! Something like
+ * (c - 'A') could actually become (c + 63)! Use the ord() macro to
+ * ensure you're getting a value in [0, 255].
+ * 4. '\n' is actually NL (0x15, U+0085) instead of LF (0x25, U+000A).
+ * EBCDIC has a proper newline character instead of "emulating" one
+ * with line feeds, although this is mapped to LF for our purposes.
+ * 5. Note that it is possible to compile programs in ASCII mode on IBM
+ * mainframe systems, using the -qascii option to the XL C compiler.
+ * We can determine the build mode by looking at __CHARSET_LIB:
+ * 0 == EBCDIC, 1 == ASCII
+ */
+
+void
+ebcdic_init(void)
+{
+ int i = 256;
+ unsigned char t;
+ bool mapcache[256];
+
+ while (i--)
+ ebcdic_rtt_toascii[i] = i;
+ memset(ebcdic_rtt_fromascii, 0xFF, sizeof(ebcdic_rtt_fromascii));
+ setlocale(LC_ALL, "");
+#ifdef MKSH_EBCDIC
+ if (__etoa_l(ebcdic_rtt_toascii, 256) != 256) {
+ write(2, "mksh: could not map EBCDIC to ASCII\n", 36);
+ exit(255);
+ }
+#endif
+
+ memset(mapcache, 0, sizeof(mapcache));
+ i = 256;
+ while (i--) {
+ t = ebcdic_rtt_toascii[i];
+ /* ensure unique round-trip capable mapping */
+ if (mapcache[t]) {
+ write(2, "mksh: duplicate EBCDIC to ASCII mapping\n", 40);
+ exit(255);
+ }
+ /*
+ * since there are 256 input octets, this also ensures
+ * the other mapping direction is completely filled
+ */
+ mapcache[t] = true;
+ /* fill the complete round-trip map */
+ ebcdic_rtt_fromascii[t] = i;
+ /*
+ * Only use the converted value if it's in the range
+ * [0x00; 0x7F], which I checked; the "extended ASCII"
+ * characters can be any encoding, not just Latin1,
+ * and the C1 control characters other than NEL are
+ * hopeless, but we map EBCDIC NEL to ASCII LF so we
+ * cannot even use C1 NEL.
+ * If ever we map to Unicode, bump the table width to
+ * an unsigned int, and or the raw unconverted EBCDIC
+ * values with 0x01000000 instead.
+ */
+ if (t < 0x80U)
+ ebcdic_map[i] = (unsigned short)ord(t);
+ else
+ ebcdic_map[i] = (unsigned short)(0x100U | ord(i));
+ }
+ if (ebcdic_rtt_toascii[0] || ebcdic_rtt_fromascii[0] || ebcdic_map[0]) {
+ write(2, "mksh: NUL not at position 0\n", 28);
+ exit(255);
+ }
+}
+#endif