utf8.cpp

   1 #include <vector>
   2 #include <iostream>
   3 #include <algorithm>
   4 #include <string>
   5 #include <sstream>
   6 #include <assert.h>
   7
   8 #include "InputStream.h"
   9 #include "utf8.h"
  10 #include "smart_ptr.h"
  11
  12 using namespace std;;
  13 using namespace utakata::utf8;
  14
  15 UTF8InputStream::UTF8InputStream() : EOF_(0xff), strm_(), pos_(0)
  16 {
  17 }
  18
  19 UTF8InputStream::UTF8InputStream(smart_ptr<std::istream> strm) : EOF_(0xff), strm_(strm),
  20                                                                         pos_(0)
  21 {
  22 }
  23
  24 bool UTF8InputStream::open(smart_ptr<std::istream> strm)
  25 {
  26     //現在保持しているストリームと切り替える。
  27     // 基本的にただスワップするだけで問題ない。
  28
  29     // NULLポインタではなく、問題無く開かれている場合には、次のようにして開始する。
  30     if (strm.isNull() != false && !strm->good()) {
  31         strm_ = strm;
  32     } else {
  33         return false;
  34     }
  35     return true;
  36 }
  37
  38 std::vector<unsigned char> UTF8InputStream::read()
  39 {
  40     // UTF-8の一文字を読みだして返す。
  41     // UTF-8に該当しない場合、空のvectorを返す。
  42     if (!strm_->eof() && !strm_->good()) {
  43         throw StreamException("not ready input stream");
  44     }
  45
  46     // 最初に一文字だけ読みだして、チェックをかける。
  47     int c = 0;
  48     c = strm_->peek();
  49     if (c != std::istream::traits_type::eof())
  50     {
  51         // 末尾でない場合のみ、以降のチェックに入る。
  52         size_t size = 0;
  53         // 先頭1バイトが正常でなかった場合はそのまま抜ける
  54         if (is_utf8_first_byte(static_cast<unsigned char>(c), size))
  55         {
  56             if (size > 0)
  57             {
  58                 // sizeが0より大きい場合には、この時複数バイトで文字が構成
  59                 // されていると考えられるため、明示的に複数文字を取得する。
  60                 std::vector<char> tmp(size, 0);
  61                 strm_->read(&tmp[0], size);
  62
  63                 if (strm_->bad())
  64                 {
  65                     // 読み取りきれなかった場合には、ストリームに一応読出せた
  66                     // 分を元に戻す。
  67                     std::for_each(tmp.rbegin(), tmp.rend(), PutBack(strm_));
  68                     return std::vector<unsigned char>(1, 0);
  69                 }
  70                 std::vector<unsigned char> rtn;
  71                 rtn.insert(rtn.begin(), tmp.begin(), tmp.end());
  72
  73                 // 読みこめたので、++する。
  74                 ++pos_;
  75                 return rtn;
  76             }
  77         }
  78     }
  79
  80     return std::vector<unsigned char>(1, EOF_);
  81 }
  82
  83 std::vector<unsigned char> UTF8InputStream::read(int num)
  84 {
  85     // 指定された文字分だけ読みだしてくる。
  86     // 途中で終了した場合、その文字の分だけunsigned charが減少すること
  87     // になっている。
  88     // numが0の場合、必ず空のvectorが返される。
  89
  90     if (num == 0)
  91     {
  92         return std::vector<unsigned char>();
  93     }
  94
  95     // eofの場合なら、この時点でeofが返るので、それで問題はない。
  96     std::vector<unsigned char> rtn = this->read();
  97     for (int i = 1; i < num && !strm_->eof(); ++i)
  98     {
  99         // 個数に到達するか、もしくはeofとなるまでは追加しつづける。
 100         std::vector<unsigned char> tmp = this->read();
 101         rtn.insert(rtn.end(), tmp.begin(), tmp.end());
 102     }
 103
 104     pos_ += num;
 105
 106     return rtn;
 107 }
 108
 109 std::vector<unsigned char> UTF8InputStream::peek()
 110 {
 111     // 一文字分だけ先読みする。先読みした場合、文字は戻す。
 112     std::vector<unsigned char> tmp = this->read();
 113     // 一応戻すサイズが存在する場合だけ、これを実行させることにする。
 114     if (tmp.size() > 0)
 115     {
 116         // 複雑な繰り返しを表現する場合には、積極的にalgorithmを利用するようにする。
 117
 118         std::for_each(tmp.rbegin(), tmp.rend(),
 119                       utakata::utf8::PutBack(strm_));
 120         --pos_;
 121     }
 122     return tmp;
 123 }
 124
 125 void UTF8InputStream::unget(const std::vector<unsigned char>& ch)
 126 {
 127     // 渡されたバイト列をストリームに差し戻す。
 128     size_t t = 0;
 129     if (is_utf8_one(ch, t))
 130     {
 131         std::for_each(ch.rbegin(), ch.rend(), PutBack(strm_));
 132     }
 133
 134     if (pos_ > 0)
 135     {
 136         --pos_;
 137     }
 138 }
 139
 140 bool utakata::utf8::UTF8InputStream::isEOF() const
 141 {
 142     bool ret = false;
 143     if (strm_->good())
 144     {
 145         ret = strm_->eof() ? true : false;
 146     }
 147     else
 148     {
 149         ret = true;
 150     }
 151
 152     return ret;
 153 }
 154
 155 //================================================================================
 156
 157 long utakata::utf8::generateUTF8Code(const std::vector<unsigned char>& bytes)
 158 {
 159     // 1文字分のUTF8のバイト列を受け取って、コードに変換して返す。
 160     // 先頭の値によって、次のように値を決定することができる。
 161     // x = utf8の先頭バイト
 162     // y1〜yN = utf8の先頭バイト以降のバイト
 163     // N = utf8の先頭バイトを含むバイト数
 164     // code = (y1 & ((1 << 7) - 1)) << (6 * n-1) + (y2 & ((1 << 7) -1)) << (6 * (n - 1))...+ x & ((1 << N) -1) << (6 * N-1)
 165     // 先頭バイト以外は、全て先頭に10とうビットが設定されている。このビットを除いた6ビットをする。
 166     // つまり、末尾のバイトから順次やっていけばよい。
 167
 168     std::vector<unsigned char> tmp(bytes);
 169     const unsigned char max_c = (1 << (sizeof(unsigned char) * 8 - 1)) - 1;
 170     long code = 0;
 171
 172     if (tmp.empty())
 173     {
 174         return 0;
 175     }
 176
 177     if (tmp.size() == 1)
 178     {
 179         // asciiコードは7bitなのでそこだけ切り取って返す。
 180         code = tmp[0] & max_c;
 181     }
 182     else
 183     {
 184         // 一時的に利用されるクラス。
 185         struct Lambda
 186         {
 187             unsigned char operator()(unsigned char c, int s) {
 188                 return c << (6 * s);
 189             }
 190         };
 191
 192         // サイズが1以外の場合、ここからがちと違う。
 193         std::vector<unsigned char>::reverse_iterator beg = tmp.rbegin(),
 194             end = tmp.rend() - 1;
 195         const unsigned char char_bit = (1 << 6) - 1;
 196         for (int i = 0; beg != end; ++i,++beg)
 197         {
 198             code += Lambda()((*beg & char_bit), i);
 199         }
 200
 201         // 最後だけ、別の計算が必要になる。
 202         const unsigned char first_byte = (1 << ((sizeof(unsigned char) + 1) - tmp.size())) - 1;
 203         code += Lambda()(first_byte,tmp.size() - 1);
 204
 205     }
 206
 207     return code;
 208 }
 209
 210
 211 long utakata::utf8::generateUTF8Code(const std::string& bytes)
 212 {
 213     // UTF8である一文字のstringを受け取って、先頭1文字の値を返す。
 214
 215     std::string str = bytes;
 216     std::vector<unsigned char> tmp;
 217     tmp.insert(tmp.end(), str.begin(), str.end());
 218
 219     // vectorにしなおしたら後は元々の関数に任せる。
 220     return generateUTF8Code(tmp);
 221 }
 222
 223 bool utakata::utf8::is_utf8_one(const std::vector<unsigned char>& bytes, size_t& size)
 224 {
 225     //渡したバイト列がUTF8の一文字に該当するかどうかを返す。
 226     size_t back = size;
 227
 228     if (bytes.size() == 0)
 229     {
 230         size = back;
 231         return false;
 232     }
 233
 234     size_t num = 0;
 235
 236     if (!is_utf8_first_byte(bytes[0], num))
 237     {
 238         size = back;
 239         return false;
 240     }
 241
 242     // そもそもbytesのサイズが足りない場合にも失敗とする。
 243     if (num > bytes.size())
 244     {
 245         size = back;
 246         return false;
 247     }
 248
 249     // 先頭要素以外が正しければそれで問題ないとする。
 250     if (num > 1)
 251     {
 252
 253         const CheckUTF8Byte& checker = for_each(bytes.begin() + 1, bytes.begin() + num,
 254                                                  CheckUTF8Byte());
 255
 256         if (checker.good)
 257         {
 258             size = num;
 259         } else {
 260             size = back;
 261             return false;
 262         }
 263     } else {
 264         // sizeが0の場合には、この時点で1を設定するようにする。
 265         size = num;
 266     }
 267
 268     // 最後まで到達した時点で成功とする。
 269     return true;
 270 }
 271
 272 bool utakata::utf8::is_utf8_all(const std::vector<unsigned char>& bytes)
 273 {
 274     // 与えられたバイト列全てがUTF-8であるかどうかを返す。
 275     size_t size = 0;
 276     std::vector<unsigned char>::const_iterator it = bytes.begin();
 277     while (is_utf8_one(std::vector<unsigned char>(it, bytes.end()), size)) {
 278         it += size;
 279         size = 0;
 280     }
 281
 282     if (it == bytes.end()) {
 283         return true;
 284     }
 285
 286     return false;
 287 }
 288
 289 bool utakata::utf8::is_utf8_first_byte(unsigned char c, size_t& size)
 290 {
 291     // UTf-8の先頭バイトであるかどうかを返す。
 292     // 先頭バイトである場合には、その先頭バイトを含む、一文字のサイズを返す。
 293     // 先頭バイトではない場合には、
 294     const unsigned char max_c = 1 << (sizeof(unsigned char) * 8 - 1);
 295
 296     size_t back = size;
 297
 298     // 最上位ビットが0である場合、これはasciiコードを指す。
 299     if (!(c & max_c))
 300     {
 301         size = 1;
 302         return true;
 303     }
 304
 305     unsigned char first = c << 1;
 306     size_t num = 1;
 307     while (first & max_c) {
 308         first <<= 1;
 309         num += 1;
 310     }
 311
 312     // ここまできたとき、最上位ビットは0であるはず。
 313     // numが5未満である場合、とりあえず正常としておくこととする。
 314     const unsigned char max_utf8_sequence = 5;
 315     if (num < max_utf8_sequence) {
 316         size = num;
 317         return true;
 318     }
 319     else if (num == 1)
 320     {
 321         // numが1の場合、何らかの理由で先頭が欠落したと見られる。
 322         // この場合、スキップするべきバイト数を返す。
 323         size = 1;
 324         return false;
 325
 326     } else {
 327         size = back;
 328         return false;
 329     }
 330 }
 331
 332
 333 bool utakata::utf8::is_utf8_ascii(const std::vector<unsigned char>& bytes)
 334 {
 335     // 一文字かつ、0x00〜0x7fの範囲であるデータであることが条件となる。
 336     size_t s = 0;
 337     bool b = is_utf8_one(bytes, s);
 338
 339     if (b && s == 1)
 340     {
 341         return true;
 342     }
 343     return false;
 344 }
 345
 346 bool utakata::utf8::is_utf8_numeric(const std::vector<unsigned char>& bytes)
 347 {
 348     // 一文字分だけが渡されていると判断する。
 349     if (!is_utf8_ascii(bytes))
 350     {
 351         return false;
 352     }
 353
 354     if (bytes[0] >= '0' && bytes[0] <= '9')
 355     {
 356         return true;
 357     }
 358
 359     return false;
 360 }
 361
 362 bool utakata::utf8::is_utf8_alpha(const std::vector<unsigned char>& bytes)
 363 {
 364     // 同じく一文字であると判別する。
 365     // asciiのサブセットなので、先にasciiであると判別しておく。
 366     if (!is_utf8_ascii(bytes))
 367     {
 368         return false;
 369     }
 370
 371     if ((bytes[0] >= 'a' && bytes[0] <= 'z') ||
 372         (bytes[0] >= 'A' && bytes[0] <= 'Z'))
 373
 374     {
 375         return true;
 376     }
 377
 378     return false;
 379 }