src/locale/utf-8.cpp

   1 #include "locale/utf-8.h"
   2
   3 /*!
   4  * @brief 文字列の最初の文字のUTF-8エンコーディングにおけるバイト長を返す
   5  *
   6  * UTF-8エンコーディングの文字列が渡されるのを想定し、
   7  * その文字列の最初の文字のバイト長を返す。
   8  * UTF-8エンコーディングとして適合しなければ0を返す。
   9  * また文字列終端文字('\\0')の場合も0を返す。
  10  *
  11  * @note UTF-8エンコーディングの厳密なバリデーションにはなっていない。
  12  *       2バイト目以降は0x80-0xBF固定ではなく、バイト長・何バイト目かなど
  13  *       によって若干変化するが、ここでは簡便のため0x80-0xBFの範囲のみ
  14  *       チェックする
  15  *
  16  * @param str 判定する文字列へのポインタ
  17  *
  18  * @return 最初の文字のバイト長を返す。
  19  *         終端文字もしくはUTF-8エンコーディングに適合しない場合は0を返す。
  20  */
  21 int utf8_next_char_byte_length(concptr str)
  22 {
  23     const unsigned char *p = (const unsigned char *)str;
  24     int length = 0;
  25
  26     // バイト長の判定
  27     if (0x00 < *p && *p <= 0x7f) {
  28         length = 1;
  29     } else if ((*p & 0xe0) == 0xc0) {
  30         length = 2;
  31     } else if ((*p & 0xf0) == 0xe0) {
  32         length = 3;
  33     } else if ((*p & 0xf8) == 0xf0) {
  34         length = 4;
  35     } else {
  36         return 0;
  37     }
  38
  39     // trailing bytesが0x80-0xBFである事のチェック
  40     while ((++p) < (const unsigned char *)str + length) {
  41         if ((*p & 0xc0) != 0x80) {
  42             return 0;
  43         }
  44     }
  45
  46     return length;
  47 }
  48
  49 /*!
  50  * @brief 文字列がUTF-8の文字列として適合かどうかを判定する
  51  *
  52  * @param str 判定する文字列へのポインタ
  53  *
  54  * @return 文字列がUTF-8として適合ならTRUE、そうでなければFALSE
  55  */
  56 bool is_utf8_str(concptr str)
  57 {
  58     while (*str) {
  59         const int byte_length = utf8_next_char_byte_length(str);
  60
  61         if (byte_length == 0) {
  62             return false;
  63         }
  64
  65         str += byte_length;
  66     }
  67
  68     return true;
  69 }