lib/util_string.cc

   1 #include "util_string.h"
   2 #include "util_const.h"
   3 #include "util_random.h"
   4 #include "ustring.h"
   5 #include "utf8.h"
   6 #include "utf16.h"
   7 #include <boost/regex.hpp>
   8 #include <iconv.h>
   9 #include <vector>
  10 #include <algorithm>
  11 #include <stdlib.h>
  12 #include <unistd.h>
  13 #include <string.h>
  14 #include <float.h>
  15 #include <ctype.h>
  16
  17 ustring  c3 (const ustring& str) {
  18     bool  qsign = false;
  19     static uregex  re ("^[0-9]+");
  20     uiterator  b, e;
  21     umatch  m;
  22
  23     b = str.begin ();
  24     e = str.end ();
  25     if (str[0] == '-' || str[0] == '+') {
  26         qsign = true;
  27         b = b + 1;
  28     }
  29     if (usearch (b, e, m, re)) {
  30         int  n = m[0].second - m[0].first;
  31         int  l = str.size () + n / 3;
  32         ustring  ans;
  33
  34         ans.reserve (l);
  35         if (qsign) {
  36             ans.append (1, str[0]);
  37         }
  38         for (; b != m[0].second; b ++) {
  39             ans.append (1, *b);
  40             if (n > 1 && n % 3 == 1) {
  41                 ans.append (CharConst (","));
  42             }
  43             n --;
  44         }
  45         for (; b != e; b ++) {
  46             ans.append (1, *b);
  47         }
  48         return ans;
  49     } else {
  50         return str;
  51     }
  52 }
  53
  54 static int  hex (char c) {
  55     if ('0' <= c && c <= '9') {
  56         return (c - '0');
  57     } else if ('a' <= c && c <= 'f') {
  58         return (c -  'a' + 10);
  59     } else if ('A' <= c && c <= 'F') {
  60         return (c - 'A' + 10);
  61     } else {
  62         return 0;
  63     }
  64 }
  65
  66 static int  hex (char c1, char c2) {
  67     return (hex (c1) * 16 + hex (c2));
  68 }
  69
  70 static char  hexchar (int c) {
  71     if (0 <= c && c <= 9)
  72         return '0' + c;
  73     else if (10 <= c <= 15)
  74         return 'a' - 10 + c;
  75     else
  76         return '0';
  77 }
  78
  79 static ustring  percentHex (int c) {
  80     ustring  ans (3, '%');
  81
  82     ans[1] = hexchar ((c >> 4) & 0x0f);
  83     ans[2] = hexchar (c & 0x0f);
  84     return ans;
  85 }
  86
  87 ustring  urldecode_nonul (const ustring& str) {
  88     ustring  ans;
  89     static uregex  re ("(\\+)|%([0-9a-fA-F][0-9a-fA-F])|\\x00");
  90     umatch  m;
  91     uiterator  b, e;
  92
  93     ans.reserve (str.size ());
  94     b = str.begin ();
  95     e = str.end ();
  96     while (usearch (b, e, m, re)) {
  97         if (b != m[0].first) {
  98             ans.append (b, m[0].first);
  99         }
 100         if (m[1].matched) {
 101             ans.append (1, ' ');
 102         } else if (m[2].matched) {
 103             int  v = hex (*(m[2].first), *(m[2].first + 1));
 104             if (v != 0)
 105                 ans.append (1, v);
 106         } else {
 107         }
 108         b = m[0].second;
 109     }
 110     if (b != e) {
 111         ans.append (b, e);
 112     }
 113
 114     return ans;
 115 }
 116
 117 static ustring  omitPattern (const ustring& text, uregex& re) {
 118     Splitter  sp (text, re);
 119
 120     if (sp.next ()) {
 121         if (sp.match (0)) {
 122             ustring  ans;
 123             ans.reserve (text.length ());
 124             if (sp.begin () != sp.end ())
 125                 ans.append (sp.begin (), sp.end ());
 126             while (sp.next ()) {
 127                 if (sp.begin () != sp.end ())
 128                     ans.append (sp.begin (), sp.end ());
 129             }
 130             return ans;
 131         } else {
 132             return text;
 133         }
 134     } else {
 135         return text;
 136     }
 137 }
 138
 139 ustring  omitCtrl (const ustring& str) {
 140     static uregex  re ("[\\x00-\\x1f\\x7f]+");
 141     return omitPattern (str, re);
 142 }
 143
 144 ustring  omitNL (const ustring& str) {
 145     return omitPattern (str, re_nl);
 146 }
 147
 148 ustring  omitNonAscii (const ustring& str) {
 149     static uregex  re ("[^ -\\x7e]+");
 150     return omitPattern (str, re);
 151 }
 152
 153 ustring  omitNonAsciiWord (const ustring& str) {
 154     static uregex  re ("[^\\x21-\\x7e]+");
 155     return omitPattern (str, re);
 156 }
 157
 158 bool  to_bool (const ustring& v) {
 159     if (v.length () == 0 || (v.length () == 1 && v[0] == '0')) {
 160         return false;
 161     } else {
 162         return true;
 163     }
 164 }
 165
 166 static ustring  percentEncode (const ustring& text, uregex& re) {
 167     /* $1 -> _
 168        $2 -> %HEX
 169     */
 170     umatch  m;
 171     uiterator  b, e;
 172     ustring  ans;
 173
 174     b = text.begin ();
 175     e = text.end ();
 176     if (b != e && usearch (b, e, m, re)) {
 177         if (b != m[0].first) {
 178             ans.append (ustring (b, m[0].first));
 179         }
 180         if (m[1].matched) {
 181             ans.append (uUScore);
 182         } else if (m[2].matched) {
 183             ans.append (percentHex (*m[2].first));
 184         } else {
 185             assert (0);
 186         }
 187         b = m[0].second;
 188         while (b != e && usearch (b, e, m, re)) {
 189             if (b != m[0].first) {
 190                 ans.append (ustring (b, m[0].first));
 191             }
 192             if (m[1].matched) {
 193                 ans.append (uUScore);
 194             } else if (m[2].matched) {
 195                 ans.append (percentHex (*m[2].first));
 196             } else {
 197                 assert (0);
 198             }
 199             b = m[0].second;
 200         }
 201         if (b != e) {
 202             ans.append (ustring (b, e));
 203         }
 204         return ans;
 205     } else {
 206         return text;
 207     }
 208 }
 209
 210 ustring  urlencode (const ustring& url) {
 211     static uregex  re ("(\\x00)|([^a-zA-Z0-9_.,/-])");
 212
 213     return percentEncode (url, re);
 214 }
 215
 216 ustring  cookieencode (const ustring& text) {
 217     static uregex  re ("([\\x00-\\x1f\\x7f])|([ ,;%\\x80-\\xff])");
 218
 219     return percentEncode (text, re);
 220 }
 221
 222 ustring  cookiedecode (const ustring& text) {
 223     umatch  m;
 224     uiterator  b, e;
 225     ustring  ans;
 226     int  a;
 227     static uregex  re ("%([0-9a-fA-F])([0-9a-fA-F])");
 228
 229     b = text.begin ();
 230     e = text.end ();
 231     while (usearch (b, e, m, re)) {
 232         if (b != m[0].first)
 233             ans.append (ustring (b, m[0].first));
 234         a = hex (*m[1].first, *m[2].first);
 235         ans.append (1, a);
 236         b = m[0].second;
 237     }
 238     if (b != e)
 239         ans.append (ustring (b, e));
 240
 241     return ans;
 242 }
 243
 244 ustring  clipColon (const ustring& text) {
 245     int  i;
 246     ustring  ans (text);
 247
 248     for (i = 0; i < ans.size (); i ++) {
 249         if (ans[i] == ':')
 250             ans[i] = '_';
 251     }
 252     return ans;
 253 }
 254
 255 ustring  dirPart (char* path) {
 256     char*  e = rindex (path, '/');
 257
 258     if (e && e != path) {
 259         return ustring (path, e - path);
 260     } else {
 261         return uSlash;
 262     }
 263 }
 264
 265 ustring  dirPart (const ustring& path) {
 266     ustring::size_type  s = path.rfind ('/', path.size ());
 267
 268     if (s == ustring::npos) {
 269         return uSlash;
 270     } else {
 271         return ustring (path.begin (), path.begin () + s);
 272     }
 273 }
 274
 275 ustring  filePart_osSafe (const ustring& path) {
 276     umatch  m;
 277     static uregex  re ("[^\\\\/]+$");
 278
 279     if (usearch (path, m, re)) {
 280         return ustring (m[0].first, m[0].second);
 281     } else {
 282         return uEmpty;
 283     }
 284 }
 285
 286 void  split (uiterator b, uiterator e, uregex& re, std::vector<ustring>& ans) {
 287     Splitter  sp (b, e, re);
 288
 289     while (sp.next ()) {
 290         ans.push_back (sp.cur ());
 291     }
 292 }
 293
 294 bool  splitChar (uiterator b, uiterator e, uiterator::value_type ch, uiterator& m1) {
 295     for (; b < e; b ++) {
 296         if (*b == ch) {
 297             m1 = b;
 298             return true;
 299         }
 300     }
 301     m1 = e;
 302     return false;
 303 }
 304
 305 static char  Base64Char[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
 306 ustring  base64Encode (uiterator b, uiterator e) {
 307     ustring  ans;
 308     size_t  size;
 309     int  c0, c1, c2;
 310
 311     while (b != e) {
 312         size = e - b;
 313         if (size >= 3) {
 314             c0 = *b ++;
 315             c1 = *b ++;
 316             c2 = *b ++;
 317             ans.append (1, Base64Char[(c0 >> 2) & 0x3f]);
 318             ans.append (1, Base64Char[((c0 & 0x03) << 4) | ((c1 >> 4) & 0x0f)]);
 319             ans.append (1, Base64Char[((c1 & 0x0f) << 2) | ((c2 >> 6) & 0x03)]);
 320             ans.append (1, Base64Char[c2 & 0x3f]);
 321         } else if (size == 2) {
 322             c0 = *b ++;
 323             c1 = *b ++;
 324             ans.append (1, Base64Char[(c0 >> 2) & 0x3f]);
 325             ans.append (1, Base64Char[((c0 & 0x03) << 4) | ((c1 >> 4) & 0x0f)]);
 326             ans.append (1, Base64Char[((c1 & 0x0f) << 2)]);
 327             ans.append (1, '=');
 328         } else if (size == 1) {
 329             c0 = *b ++;
 330             ans.append (1, Base64Char[(c0 >> 2) & 0x3f]);
 331             ans.append (1, Base64Char[((c0 & 0x03) << 4)]);
 332             ans.append (1, '=');
 333             ans.append (1, '=');
 334         } else {
 335             break;
 336         }
 337     }
 338     return ans;
 339 }
 340
 341 ustring  escape_re (const ustring& text) {
 342     ustring::const_iterator  b, e;
 343     umatch  m;
 344     ustring  ans;
 345     int  c;
 346     char  buf[4];
 347     static uregex  re ("[^\\x01- !\"#%',/0-9:;<=>@A-Z_`a-z~\\x7f-\\xff-]");
 348
 349     buf[0] = '\\';
 350     buf[1] = 'x';
 351     ans.reserve (text.size () + 16);
 352     b = text.begin ();
 353     e = text.end ();
 354     while (b != e && usearch (b, e, m, re)) {
 355         if (b != m[0].first)
 356             ans.append (b, m[0].first);
 357         c = *m[0].first;
 358         buf[2] = hexchar ((c >> 4) & 0x0f);
 359         buf[3] = hexchar (c & 0x0f);
 360         ans.append (buf, 4);
 361         b = m[0].second;
 362     }
 363     if (b != e)
 364         ans.append (b, e);
 365     return ans;
 366 }
 367
 368 ustring  slashEncode (const ustring& text) {
 369     ustring::const_iterator  b, e;
 370     umatch  m;
 371     ustring  ans;
 372     int  c;
 373     char  buf[4];
 374     static uregex  re ("([\\x00-\\x1f\\x7f])|(\\\\)|(\")");
 375
 376     buf[0] = '\\';
 377     buf[1] = 'x';
 378     b = text.begin ();
 379     e = text.end ();
 380     while (b != e && usearch (b, e, m, re)) {
 381         if (b != m[0].first)
 382             ans.append (b, m[0].first);
 383         if (m[1].matched) {
 384             c = *m[0].first;
 385             switch (c) {
 386             case '\t':
 387                 ans.append (CharConst ("\\t"));
 388                 break;
 389             case '\r':
 390                 ans.append (CharConst ("\\r"));
 391                 break;
 392             case '\n':
 393                 ans.append (CharConst ("\\n"));
 394                 break;
 395             default:
 396                 buf[2] = hexchar ((c >> 4) & 0x0f);
 397                 buf[3] = hexchar (c & 0x0f);
 398                 ans.append (buf, 4);
 399             }
 400         } else if (m[2].matched) {
 401             ans.append (CharConst ("\\\\"));
 402         } else if (m[3].matched) {
 403             ans.append (CharConst ("\\\""));
 404         } else {
 405             assert (0);
 406         }
 407         b = m[0].second;
 408     }
 409     if (b != e)
 410         ans.append (b, e);
 411     return ans;
 412 }
 413
 414 ustring  slashDecode (const ustring& text) {
 415     ustring::const_iterator  b, e;
 416     umatch  m;
 417     ustring  ans;
 418     int  c;
 419     static uregex  re ("\\\\([0-7][0-7][0-7]|[\\x00-\\x7f])");
 420
 421     b = text.begin ();
 422     e = text.end ();
 423     while (b != e && usearch (b, e, m, re)) {
 424         if (b != m[0].first)
 425             ans.append (b, m[0].first);
 426         b = m[0].first + 1;
 427         c = *b;
 428         switch (c) {
 429         case 't':
 430             ans.append (CharConst ("\t"));
 431             break;
 432         case 'r':
 433             ans.append (CharConst ("\r"));
 434             break;
 435         case 'n':
 436             ans.append (CharConst ("\n"));
 437             break;
 438         default:
 439             if (m[0].second - m[0].first == 4) {
 440                 c = (c - '0') * 64;
 441                 b ++;
 442                 c += (*b - '0') * 8;
 443                 b ++;
 444                 c += *b - '0';
 445                 if (0 < c && c < 0x20)
 446                     ans.append (1, c);
 447             } else {
 448                 ans.append (1, c);
 449             }
 450         }
 451         b = m[0].second;
 452     }
 453     if (b != e)
 454         ans.append (b, e);
 455     return ans;
 456 }
 457
 458 unsigned long  strtoul (const ustring& str) {
 459     return strtoul (str.c_str (), NULL, 10);
 460 }
 461
 462 unsigned long  strtoul (const uiterator& b) {
 463     return strtoul (&*b, NULL, 10);
 464 }
 465
 466 long  strtol (const ustring& str) {
 467     return strtol (str.c_str (), NULL, 10);
 468 }
 469
 470 double  strtod (const ustring& str) {
 471     return strtod (str.c_str (), NULL);
 472 }
 473
 474 bool  passMatch (const ustring& pass, const ustring& cpass) {
 475     if (pass.length () == 0 || cpass.length () == 0)
 476         return false;
 477     return (strcmp (crypt (pass.c_str (), cpass.c_str ()), cpass.c_str ()) == 0);
 478 }
 479
 480 ustring  passCrypt (const ustring& pass) {
 481     ustring  salt = makeSalt ();
 482     return ustring (crypt (pass.c_str (), salt.c_str ()));
 483 }
 484
 485 size_t  strLength (const ustring& src) {
 486     uiterator  b, e;
 487     size_t  n = 0;
 488     b = src.begin ();
 489     e = src.end ();
 490     while (b < e) {
 491         n ++;
 492         nextChar (b, e);
 493     }
 494     return n;
 495 }
 496
 497 void  substring (const ustring& src, size_t idx, size_t len, int flen, ustring& ans) {
 498     uiterator  b, e, t;
 499     size_t  i;
 500
 501     b = src.begin ();
 502     e = src.end ();
 503     for (i = 0; i < idx && b < e; i ++)
 504         nextChar (b, e);
 505     if (flen) {
 506         t = b;
 507         for (i = 0; i < len && t < e; i ++)
 508             nextChar (t, e);
 509         ans.assign (b, t);
 510     } else {
 511         ans.assign (b, e);
 512     }
 513 }
 514
 515 ustring  utf16Encode (const ustring& str) {
 516     int  i;
 517     ustring  u, ans;
 518     int  c;
 519     char  b[8];
 520
 521     u = utf8to16 (str);
 522     ans.reserve (u.size () * 3);
 523     b[0] = '\\';
 524     b[1] = 'u';
 525     for (i = 0; i < u.size (); i += 2) {
 526         c = u[i];
 527         b[2] = hexchar ((c >> 4) & 0x0f);
 528         b[3] = hexchar (c & 0x0f);
 529         c = u[i + 1];
 530         b[4] = hexchar ((c >> 4) & 0x0f);
 531         b[5] = hexchar (c & 0x0f);
 532         ans.append (b, 6);
 533     }
 534     return ans;
 535 }
 536
 537 ustring  filenameEncode (const ustring& text) {
 538     static uregex  re ("([\\x00-\\x1f\\x7f])|([^a-zA-Z0-9._-])|(^\\.+)");
 539     Splitter  sp (text, re);
 540     ustring  ans;
 541     int  c;
 542
 543     if (text.length () == 0) {
 544         throw (ustring (text).append (uErrorBadName));
 545     }
 546     ans.reserve (text.length () + 16);
 547     while (sp.next ()) {
 548         if (sp.begin () < sp.end ())
 549             ans.append (sp.begin (), sp.end ());
 550         if (sp.match (1)) {
 551         } else if (sp.match (2)) {
 552             c = *sp.matchBegin (2);
 553             ans.append (1, ':');
 554             ans.append (1, hexchar ((c >> 4) & 0x0f));
 555             ans.append (1, hexchar (c & 0x0f));
 556         } else if (sp.match (3)) {
 557             for (c = sp.matchEnd (3) - sp.matchBegin (3); c > 0; c --) {
 558                 ans.append (CharConst (":2e"));
 559             }
 560         }
 561     }
 562     if (ans.length () > 250)
 563         ans.resize (250);
 564     return ans;
 565 }
 566
 567 bool  matchSkip (uiterator& b, uiterator e, const char* t, size_t s) {
 568     if (e - b >= s && memcmp (t, &b[0], s) == 0) {
 569         b += s;
 570         return true;
 571     } else {
 572         return false;
 573     }
 574 }
 575
 576 bool  matchHead (uiterator& b, uiterator e, const char* t, size_t s) {
 577     if (e - b >= s && memcmp (t, &b[0], s) == 0) {
 578         return true;
 579     } else {
 580         return false;
 581     }
 582 }
 583
 584 bool  matchHead (const ustring& str, const char* t, size_t s) {
 585     if (str.length () >= s && memcmp (t, &*str.begin (), s) == 0) {
 586         return true;
 587     } else {
 588         return false;
 589     }
 590 }
 591
 592 bool  matchHead (const ustring& str, const ustring& head) {
 593     if (str.length () >= head.length () && memcmp (&*str.begin (), &*head.begin (), head.length ()) == 0) {
 594         return true;
 595     } else {
 596         return false;
 597     }
 598 }
 599
 600 bool  match (uiterator b, uiterator e, const char* t, size_t s) {
 601     if (e - b == s && memcmp (t, &b[0], s) == 0) {
 602         return true;
 603     } else {
 604         return false;
 605     }
 606 }
 607
 608 bool  match (const ustring& str, const char* t, size_t s) {
 609     if (str.length () == s && memcmp (t, str.data (), s) == 0) {
 610         return true;
 611     } else {
 612         return false;
 613     }
 614 }
 615
 616 bool  match (uiterator b, uiterator e, const ustring& str) {
 617     if (e - b == str.length () && memcmp (str.data (), &b[0], str.length ()) == 0) {
 618         return true;
 619     } else {
 620         return false;
 621     }
 622 }
 623
 624 bool  match (const ustring& str, const char* t, size_t s, const char* t2, size_t s2) {
 625     if (match (str, t, s) || match (str, t2, s2)) {
 626         return true;
 627     } else {
 628         return false;
 629     }
 630 }
 631
 632 ustring  clipWhite (uiterator b, uiterator e) {
 633     while (b < e)
 634         if (isblank (*b)) {
 635             b ++;
 636         } else {
 637             break;
 638         }
 639     while (b < e)
 640         if (isblank (*(e - 1))) {
 641             e --;
 642         } else {
 643             break;
 644         }
 645     return ustring (b, e);
 646 }
 647 ustring  clipWhite (const ustring& str) {
 648     return clipWhite (str.begin (), str.end ());
 649 }
 650
 651 ustring  getenvString (const char* key) {
 652     char*  e = getenv (key);
 653     if (e) {
 654         return ustring (e);
 655     } else {
 656         return uEmpty;
 657     }
 658 }
 659
 660 ustring  zeroPad (int n, const ustring& src) {
 661     int  m;
 662
 663     n = std::min (32, n);
 664     m = n - src.length ();
 665     if (m > 0) {
 666         ustring  ans;
 667         ans.reserve (m);
 668         ans.append (m, '0');
 669         ans.append (src);
 670         return ans;
 671     } else {
 672         return src;
 673     }
 674 }
 675
 676 bool  wsearch (const ustring& text, boost::wsmatch& m, const ustring& reg, boost::wregex::flag_type reg_flags, boost::match_flag_type search_flags) {
 677     std::wstring  wtext = utow (text);
 678     std::wstring  wreg = utow (reg);
 679     boost::wregex  re (wreg, reg_flags);
 680     return regex_search (wtext, m, re, search_flags);
 681 }
 682
 683 ustring  uiconv (const ustring& src, const char* tocode, const char* fromcode) {
 684     iconv_t  cd;
 685     char  buf[4096];
 686     const char*  ibuf;
 687     char*  obuf;
 688     size_t  isize, osize, rsize;
 689     ustring  ans;
 690
 691     cd = iconv_open (tocode, fromcode);
 692     if (cd == (iconv_t)(-1))
 693         throw (ustring ("bad encoding name."));
 694     ibuf = &src.at (0);
 695     isize = src.size ();
 696     while (isize > 0) {
 697         obuf = buf;
 698         osize = 4096;
 699         rsize = iconv (cd, &ibuf, &isize, &obuf, &osize);
 700 //      if (rsize < 0)
 701         if (obuf - buf <= 0)
 702             break;
 703         ans.append (buf, obuf - buf);
 704     }
 705     iconv_close (cd);
 706     return ans;
 707 }
 708
 709 ustring  padEmpty (const ustring& name) {
 710     if (name.empty ())
 711         return ustring (CharConst ("(null)"));
 712     else
 713         return name;
 714 }
 715
 716 ustring  dtoustring (double val) {
 717     char  b[32];
 718     return ustring (b, snprintf (b, 32, "%.*g", DBL_DIG, val));
 719 }
 720
 721 uint32_t  hextoul (uiterator b, uiterator e) {
 722     uint32_t  ans = 0;
 723     int  n;
 724
 725     for (n = 0; n < 8 && b != e; n ++, b ++) {
 726         ans = (ans << 4) + hex (*b);
 727     }
 728     return ans;
 729 }
 730
 731 ustring  toCRLF (const ustring& str) {
 732     uiterator  b = str.begin ();
 733     uiterator  e = str.end ();
 734     umatch  m;
 735     ustring  ans;
 736
 737     while (usearch (b, e, m, re_lf)) {
 738         ans.append (b, m[0].first).append (uCRLF);
 739         b = m[0].second;
 740     }
 741     ans.append (b, e);
 742     return ans;
 743 }
 744
 745 void  skipSpace (uiterator& b, uiterator e) {
 746     while (b < e && *b == ' ') {
 747         b ++;
 748     }
 749 }
 750
 751 static ustring::value_type  toLower_ustring_value (ustring::value_type v) {
 752     if ('A' <= v && v <= 'Z') {
 753         return v - 'A' + 'a';
 754     } else {
 755         return v;
 756     }
 757 }
 758
 759 #if 0
 760 void  toLower (ustring::iterator* b, ustring::iterator* e) {
 761     transform (*b, *e, *b, toLower_ustring_value);
 762 }
 763 #endif
 764
 765 ustring  toLower (uiterator b, uiterator e) {
 766     ustring::iterator  i;
 767     ustring  ans;
 768     ans.resize (e - b);
 769     i = ans.begin ();
 770     for (; b < e; b ++, i++) {
 771         *i = toLower_ustring_value (*b);
 772     }
 773     return ans;
 774 }