punycode.c

   1 /*\r
   2 punycode.c from RFC 3492\r
   3 http://www.nicemice.net/idn/\r
   4 Adam M. Costello\r
   5 http://www.nicemice.net/amc/\r
   6 \r
   7 This is ANSI C code (C89) implementing Punycode (RFC 3492).\r
   8 \r
   9 */\r
  10 \r
  11 \r
  12 /************************************************************/\r
  13 /* Public interface (would normally go in its own .h file): */\r
  14 \r
  15 #include <limits.h>\r
  16 \r
  17 enum punycode_status {\r
  18   punycode_success,\r
  19   punycode_bad_input,   /* Input is invalid.                       */\r
  20   punycode_big_output,  /* Output would exceed the space provided. */\r
  21   punycode_overflow     /* Input needs wider integers to process.  */\r
  22 };\r
  23 \r
  24 #if UINT_MAX >= (1 << 26) - 1\r
  25 typedef unsigned int punycode_uint;\r
  26 #else\r
  27 typedef unsigned long punycode_uint;\r
  28 #endif\r
  29 \r
  30 enum punycode_status punycode_encode(\r
  31   punycode_uint input_length,\r
  32   const punycode_uint input[],\r
  33   const unsigned char case_flags[],\r
  34   punycode_uint *output_length,\r
  35   char output[] );\r
  36 \r
  37     /* punycode_encode() converts Unicode to Punycode.  The input     */\r
  38     /* is represented as an array of Unicode code points (not code    */\r
  39     /* units; surrogate pairs are not allowed), and the output        */\r
  40     /* will be represented as an array of ASCII code points.  The     */\r
  41     /* output string is *not* null-terminated; it will contain        */\r
  42     /* zeros if and only if the input contains zeros.  (Of course     */\r
  43     /* the caller can leave room for a terminator and add one if      */\r
  44     /* needed.)  The input_length is the number of code points in     */\r
  45     /* the input.  The output_length is an in/out argument: the       */\r
  46     /* caller passes in the maximum number of code points that it     */\r
  47     /* can receive, and on successful return it will contain the      */\r
  48     /* number of code points actually output.  The case_flags array   */\r
  49     /* holds input_length boolean values, where nonzero suggests that */\r
  50     /* the corresponding Unicode character be forced to uppercase     */\r
  51     /* after being decoded (if possible), and zero suggests that      */\r
  52     /* it be forced to lowercase (if possible).  ASCII code points    */\r
  53     /* are encoded literally, except that ASCII letters are forced    */\r
  54     /* to uppercase or lowercase according to the corresponding       */\r
  55     /* uppercase flags.  If case_flags is a null pointer then ASCII   */\r
  56     /* letters are left as they are, and other code points are        */\r
  57     /* treated as if their uppercase flags were zero.  The return     */\r
  58     /* value can be any of the punycode_status values defined above   */\r
  59     /* except punycode_bad_input; if not punycode_success, then       */\r
  60     /* output_size and output might contain garbage.                  */\r
  61 \r
  62 enum punycode_status punycode_decode(\r
  63   punycode_uint input_length,\r
  64   const char input[],\r
  65   punycode_uint *output_length,\r
  66   punycode_uint output[],\r
  67   unsigned char case_flags[] );\r
  68 \r
  69     /* punycode_decode() converts Punycode to Unicode.  The input is  */\r
  70     /* represented as an array of ASCII code points, and the output   */\r
  71     /* will be represented as an array of Unicode code points.  The   */\r
  72     /* input_length is the number of code points in the input.  The   */\r
  73     /* output_length is an in/out argument: the caller passes in      */\r
  74     /* the maximum number of code points that it can receive, and     */\r
  75     /* on successful return it will contain the actual number of      */\r
  76     /* code points output.  The case_flags array needs room for at    */\r
  77     /* least output_length values, or it can be a null pointer if the */\r
  78     /* case information is not needed.  A nonzero flag suggests that  */\r
  79     /* the corresponding Unicode character be forced to uppercase     */\r
  80     /* by the caller (if possible), while zero suggests that it be    */\r
  81     /* forced to lowercase (if possible).  ASCII code points are      */\r
  82     /* output already in the proper case, but their flags will be set */\r
  83     /* appropriately so that applying the flags would be harmless.    */\r
  84     /* The return value can be any of the punycode_status values      */\r
  85     /* defined above; if not punycode_success, then output_length,    */\r
  86     /* output, and case_flags might contain garbage.  On success, the */\r
  87     /* decoder will never need to write an output_length greater than */\r
  88     /* input_length, because of how the encoding is defined.          */\r
  89 \r
  90 /**********************************************************/\r
  91 /* Implementation (would normally go in its own .c file): */\r
  92 \r
  93 #include <string.h>\r
  94 \r
  95 /*** Bootstring parameters for Punycode ***/\r
  96 \r
  97 enum { base = 36, tmin = 1, tmax = 26, skew = 38, damp = 700,\r
  98        initial_bias = 72, initial_n = 0x80, delimiter = 0x2D };\r
  99 \r
 100 /* basic(cp) tests whether cp is a basic code point: */\r
 101 #define basic(cp) ((punycode_uint)(cp) < 0x80)\r
 102 \r
 103 /* delim(cp) tests whether cp is a delimiter: */\r
 104 #define delim(cp) ((cp) == delimiter)\r
 105 \r
 106 /* decode_digit(cp) returns the numeric value of a basic code */\r
 107 /* point (for use in representing integers) in the range 0 to */\r
 108 /* base-1, or base if cp is does not represent a value.       */\r
 109 \r
 110 static punycode_uint decode_digit(punycode_uint cp)\r
 111 {\r
 112   return  cp - 48 < 10 ? cp - 22 :  cp - 65 < 26 ? cp - 65 :\r
 113           cp - 97 < 26 ? cp - 97 :  base;\r
 114 }\r
 115 \r
 116 /* encode_digit(d,flag) returns the basic code point whose value      */\r
 117 /* (when used for representing integers) is d, which needs to be in   */\r
 118 /* the range 0 to base-1.  The lowercase form is used unless flag is  */\r
 119 /* nonzero, in which case the uppercase form is used.  The behavior   */\r
 120 /* is undefined if flag is nonzero and digit d has no uppercase form. */\r
 121 \r
 122 static char encode_digit(punycode_uint d, int flag)\r
 123 {\r
 124   return d + 22 + 75 * (d < 26) - ((flag != 0) << 5);\r
 125   /*  0..25 map to ASCII a..z or A..Z */\r
 126   /* 26..35 map to ASCII 0..9         */\r
 127 }\r
 128 \r
 129 /* flagged(bcp) tests whether a basic code point is flagged */\r
 130 /* (uppercase).  The behavior is undefined if bcp is not a  */\r
 131 /* basic code point.                                        */\r
 132 \r
 133 #define flagged(bcp) ((punycode_uint)(bcp) - 65 < 26)\r
 134 \r
 135 /* encode_basic(bcp,flag) forces a basic code point to lowercase */\r
 136 /* if flag is zero, uppercase if flag is nonzero, and returns    */\r
 137 /* the resulting code point.  The code point is unchanged if it  */\r
 138 /* is caseless.  The behavior is undefined if bcp is not a basic */\r
 139 /* code point.                                                   */\r
 140 \r
 141 static char encode_basic(punycode_uint bcp, int flag)\r
 142 {\r
 143   bcp -= (bcp - 97 < 26) << 5;\r
 144   return bcp + ((!flag && (bcp - 65 < 26)) << 5);\r
 145 }\r
 146 \r
 147 /*** Platform-specific constants ***/\r
 148 \r
 149 /* maxint is the maximum value of a punycode_uint variable: */\r
 150 static const punycode_uint maxint = -1;\r
 151 /* Because maxint is unsigned, -1 becomes the maximum value. */\r
 152 \r
 153 /*** Bias adaptation function ***/\r
 154 \r
 155 static punycode_uint adapt(\r
 156   punycode_uint delta, punycode_uint numpoints, int firsttime )\r
 157 {\r
 158   punycode_uint k;\r
 159 \r
 160   delta = firsttime ? delta / damp : delta >> 1;\r
 161   /* delta >> 1 is a faster way of doing delta / 2 */\r
 162   delta += delta / numpoints;\r
 163 \r
 164   for (k = 0;  delta > ((base - tmin) * tmax) / 2;  k += base) {\r
 165     delta /= base - tmin;\r
 166   }\r
 167 \r
 168   return k + (base - tmin + 1) * delta / (delta + skew);\r
 169 }\r
 170 \r
 171 /*** Main encode function ***/\r
 172 \r
 173 enum punycode_status punycode_encode(\r
 174   punycode_uint input_length,\r
 175   const punycode_uint input[],\r
 176   const unsigned char case_flags[],\r
 177   punycode_uint *output_length,\r
 178   char output[] )\r
 179 {\r
 180   punycode_uint n, delta, h, b, out, max_out, bias, j, m, q, k, t;\r
 181 \r
 182   /* Initialize the state: */\r
 183 \r
 184   n = initial_n;\r
 185   delta = out = 0;\r
 186   max_out = *output_length;\r
 187   bias = initial_bias;\r
 188 \r
 189   /* Handle the basic code points: */\r
 190 \r
 191   for (j = 0;  j < input_length;  ++j) {\r
 192     if (basic(input[j])) {\r
 193       if (max_out - out < 2) return punycode_big_output;\r
 194       output[out++] =\r
 195         case_flags ?  encode_basic(input[j], case_flags[j]) : input[j];\r
 196     }\r
 197     /* else if (input[j] < n) return punycode_bad_input; */\r
 198     /* (not needed for Punycode with unsigned code points) */\r
 199   }\r
 200 \r
 201   h = b = out;\r
 202 \r
 203   /* h is the number of code points that have been handled, b is the  */\r
 204   /* number of basic code points, and out is the number of characters */\r
 205   /* that have been output.                                           */\r
 206 \r
 207   if (b > 0) output[out++] = delimiter;\r
 208 \r
 209   /* Main encoding loop: */\r
 210 \r
 211   while (h < input_length) {\r
 212     /* All non-basic code points < n have been     */\r
 213     /* handled already.  Find the next larger one: */\r
 214 \r
 215     for (m = maxint, j = 0;  j < input_length;  ++j) {\r
 216       /* if (basic(input[j])) continue; */\r
 217       /* (not needed for Punycode) */\r
 218       if (input[j] >= n && input[j] < m) m = input[j];\r
 219     }\r
 220 \r
 221     /* Increase delta enough to advance the decoder's    */\r
 222     /* <n,i> state to <m,0>, but guard against overflow: */\r
 223 \r
 224     if (m - n > (maxint - delta) / (h + 1)) return punycode_overflow;\r
 225     delta += (m - n) * (h + 1);\r
 226     n = m;\r
 227 \r
 228     for (j = 0;  j < input_length;  ++j) {\r
 229       /* Punycode does not need to check whether input[j] is basic: */\r
 230       if (input[j] < n /* || basic(input[j]) */ ) {\r
 231         if (++delta == 0) return punycode_overflow;\r
 232       }\r
 233 \r
 234       if (input[j] == n) {\r
 235         /* Represent delta as a generalized variable-length integer: */\r
 236 \r
 237         for (q = delta, k = base;  ;  k += base) {\r
 238           if (out >= max_out) return punycode_big_output;\r
 239           t = k <= bias /* + tmin */ ? tmin :     /* +tmin not needed */\r
 240               k >= bias + tmax ? tmax : k - bias;\r
 241           if (q < t) break;\r
 242           output[out++] = encode_digit(t + (q - t) % (base - t), 0);\r
 243           q = (q - t) / (base - t);\r
 244         }\r
 245 \r
 246         output[out++] = encode_digit(q, case_flags && case_flags[j]);\r
 247         bias = adapt(delta, h + 1, h == b);\r
 248         delta = 0;\r
 249         ++h;\r
 250       }\r
 251     }\r
 252 \r
 253     ++delta, ++n;\r
 254   }\r
 255 \r
 256   *output_length = out;\r
 257   return punycode_success;\r
 258 }\r
 259 \r
 260 /*** Main decode function ***/\r
 261 \r
 262 enum punycode_status punycode_decode(\r
 263   punycode_uint input_length,\r
 264   const char input[],\r
 265   punycode_uint *output_length,\r
 266   punycode_uint output[],\r
 267   unsigned char case_flags[] )\r
 268 {\r
 269   punycode_uint n, out, i, max_out, bias,\r
 270                  b, j, in, oldi, w, k, digit, t;\r
 271 \r
 272   /* Initialize the state: */\r
 273 \r
 274   n = initial_n;\r
 275   out = i = 0;\r
 276   max_out = *output_length;\r
 277   bias = initial_bias;\r
 278 \r
 279   /* Handle the basic code points:  Let b be the number of input code */\r
 280   /* points before the last delimiter, or 0 if there is none, then    */\r
 281   /* copy the first b code points to the output.                      */\r
 282 \r
 283   for (b = j = 0;  j < input_length;  ++j) if (delim(input[j])) b = j;\r
 284   if (b > max_out) return punycode_big_output;\r
 285 \r
 286   for (j = 0;  j < b;  ++j) {\r
 287     if (case_flags) case_flags[out] = flagged(input[j]);\r
 288     if (!basic(input[j])) return punycode_bad_input;\r
 289     output[out++] = input[j];\r
 290   }\r
 291 \r
 292   /* Main decoding loop:  Start just after the last delimiter if any  */\r
 293   /* basic code points were copied; start at the beginning otherwise. */\r
 294 \r
 295   for (in = b > 0 ? b + 1 : 0;  in < input_length;  ++out) {\r
 296 \r
 297     /* in is the index of the next character to be consumed, and */\r
 298     /* out is the number of code points in the output array.     */\r
 299 \r
 300     /* Decode a generalized variable-length integer into delta,  */\r
 301     /* which gets added to i.  The overflow checking is easier   */\r
 302     /* if we increase i as we go, then subtract off its starting */\r
 303     /* value at the end to obtain delta.                         */\r
 304 \r
 305     for (oldi = i, w = 1, k = base;  ;  k += base) {\r
 306       if (in >= input_length) return punycode_bad_input;\r
 307       digit = decode_digit(input[in++]);\r
 308       if (digit >= base) return punycode_bad_input;\r
 309       if (digit > (maxint - i) / w) return punycode_overflow;\r
 310       i += digit * w;\r
 311       t = k <= bias /* + tmin */ ? tmin :     /* +tmin not needed */\r
 312           k >= bias + tmax ? tmax : k - bias;\r
 313       if (digit < t) break;\r
 314       if (w > maxint / (base - t)) return punycode_overflow;\r
 315       w *= (base - t);\r
 316     }\r
 317 \r
 318     bias = adapt(i - oldi, out + 1, oldi == 0);\r
 319 \r
 320     /* i was supposed to wrap around from out+1 to 0,   */\r
 321     /* incrementing n each time, so we'll fix that now: */\r
 322 \r
 323     if (i / (out + 1) > maxint - n) return punycode_overflow;\r
 324     n += i / (out + 1);\r
 325     i %= (out + 1);\r
 326 \r
 327     /* Insert n at position i of the output: */\r
 328 \r
 329     /* not needed for Punycode: */\r
 330     /* if (decode_digit(n) <= base) return punycode_invalid_input; */\r
 331     if (out >= max_out) return punycode_big_output;\r
 332 \r
 333     if (case_flags) {\r
 334       memmove(case_flags + i + 1, case_flags + i, out - i);\r
 335       /* Case of last character determines uppercase flag: */\r
 336       case_flags[i] = flagged(input[in - 1]);\r
 337     }\r
 338 \r
 339     memmove(output + i + 1, output + i, (out - i) * sizeof *output);\r
 340     output[i++] = n;\r
 341   }\r
 342 \r
 343   *output_length = out;\r
 344   return punycode_success;\r
 345 }\r
 346 \r