utf8.cpp

   1 #include <vector>
   2 #include <iostream>
   3 #include <algorithm>
   4 #include <string>
   5 #include <sstream>
   6 #include <assert.h>
   7
   8 #include "InputStream.h"
   9 #include "utf8.h"
  10 #include "smart_ptr.h"
  11
  12 using namespace std;;
  13 using namespace utakata::utf8;
  14
  15 CUTF8InputStream::CUTF8InputStream() : EOF_(0xff), strm_()
  16 {
  17 }
  18
  19 CUTF8InputStream::CUTF8InputStream(const smart_ptr<std::istream>& strm) : EOF_(0xff), strm_(strm)
  20 {
  21 }
  22
  23 bool CUTF8InputStream::open(const smart_ptr<std::istream>& strm)
  24 {
  25     //現在保持しているストリームと切り替える。
  26     // 基本的にただスワップするだけで問題ない。
  27
  28     // NULLポインタではなく、問題無く開かれている場合には、次のようにして開始する。
  29     if (strm.isNull() != false && !strm->good()) {
  30         strm_ = strm;
  31     } else {
  32         return false;
  33     }
  34     return true;
  35 }
  36
  37 std::vector<unsigned char> CUTF8InputStream::read()
  38 {
  39     // UTF-8の一文字を読みだして返す。
  40     // UTF-8に該当しない場合、空のvectorを返す。
  41     if (!strm_->good()) {
  42         throw CStreamException("not ready input stream");
  43     }
  44
  45     // 最初に一文字だけ読みだして、チェックをかける。
  46     int c = 0;
  47     c = strm_->peek();
  48     if (c != std::istream::traits_type::eof())
  49     {
  50         // 末尾でない場合のみ、以降のチェックに入る。
  51         size_t size = 0;
  52         // 先頭1バイトが正常でなかった場合はそのまま抜ける
  53         if (is_utf8_first_byte(static_cast<unsigned char>(c), size))
  54         {
  55             if (size > 0)
  56             {
  57                 // sizeが0より大きい場合には、この時複数バイトで文字が構成
  58                 // されていると考えられるため、明示的に複数文字を取得する。
  59                 std::vector<char> tmp(size, 0);
  60                 strm_->read(&tmp[0], size);
  61
  62                 if (strm_->bad())
  63                 {
  64                     // 読み取りきれなかった場合には、ストリームに一応読出せた
  65                     // 分を元に戻す。
  66                     std::for_each(tmp.rbegin(), tmp.rend(), PutBack(strm_));
  67                     return std::vector<unsigned char>(0);
  68                 }
  69                 std::vector<unsigned char> rtn;
  70                 rtn.insert(rtn.begin(), tmp.begin(), tmp.end());
  71
  72                 return rtn;
  73             }
  74         }
  75     }
  76
  77     return std::vector<unsigned char>(EOF_);
  78 }
  79
  80 std::vector<unsigned char> CUTF8InputStream::read(int num)
  81 {
  82     // 指定された文字分だけ読みだしてくる。
  83     // 途中で終了した場合、その文字の分だけunsigned charが減少すること
  84     // になっている。
  85     // numが0の場合、必ず空のvectorが返される。
  86
  87     if (num == 0)
  88     {
  89         return std::vector<unsigned char>();
  90     }
  91
  92     // eofの場合なら、この時点でeofが返るので、それで問題はない。
  93     std::vector<unsigned char> rtn = this->read();
  94     for (int i = 1; i < num && !strm_->eof(); ++i)
  95     {
  96         // 個数に到達するか、もしくはeofとなるまでは追加しつづける。
  97         std::vector<unsigned char> tmp = this->read();
  98         rtn.insert(rtn.end(), tmp.begin(), tmp.end());
  99     }
 100
 101     return rtn;
 102 }
 103
 104 std::vector<unsigned char> CUTF8InputStream::peek()
 105 {
 106     // 一文字分だけ先読みする。先読みした場合、文字は戻す。
 107     std::vector<unsigned char> tmp = this->read();
 108     // 一応戻すサイズが存在する場合だけ、これを実行させることにする。
 109     if (tmp.size() > 0)
 110     {
 111         // 複雑な繰り返しを表現する場合には、積極的にalgorithmを利用するようにする。
 112
 113         std::for_each(tmp.rbegin(), tmp.rend(),
 114                       utakata::utf8::PutBack(strm_));
 115     }
 116     return tmp;
 117 }
 118
 119 void CUTF8InputStream::unget(const std::vector<unsigned char>& ch)
 120 {
 121     // 渡されたバイト列をストリームに差し戻す。
 122     size_t t = 0;
 123     if (is_utf8_one(ch, t))
 124     {
 125         std::for_each(ch.rbegin(), ch.rend(), PutBack(strm_));
 126     }
 127 }
 128
 129 bool utakata::utf8::CUTF8InputStream::isEOF() const
 130 {
 131     if (strm_->good())
 132     {
 133         return strm_->eof() ? true : false;
 134     }
 135     else
 136     {
 137         return false;
 138     }
 139 }
 140
 141 //================================================================================
 142
 143 long utakata::utf8::generateUTF8Code(const std::vector<unsigned char>& bytes)
 144 {
 145     // 1文字分のUTF8のバイト列を受け取って、コードに変換して返す。
 146     // 先頭の値によって、次のように値を決定することができる。
 147     // x = utf8の先頭バイト
 148     // y1〜yN = utf8の先頭バイト以降のバイト
 149     // N = utf8の先頭バイトを含むバイト数
 150     // code = (y1 & ((1 << 7) - 1)) << (6 * n-1) + (y2 & ((1 << 7) -1)) << (6 * (n - 1))...+ x & ((1 << N) -1) << (6 * N-1)
 151     // 先頭バイト以外は、全て先頭に10とうビットが設定されている。このビットを除いた6ビットをする。
 152     // つまり、末尾のバイトから順次やっていけばよい。
 153
 154     std::vector<unsigned char> tmp(bytes);
 155     const unsigned char max_c = (1 << (sizeof(unsigned char) * 8 - 1)) - 1;
 156     long code = 0;
 157
 158     if (tmp.empty())
 159     {
 160         return 0;
 161     }
 162
 163     if (tmp.size() == 1)
 164     {
 165         // asciiコードは7bitなのでそこだけ切り取って返す。
 166         code = tmp[0] & max_c;
 167     }
 168     else
 169     {
 170         // 一時的に利用されるクラス。
 171         struct Lambda
 172         {
 173             unsigned char operator()(unsigned char c, int s) {
 174                 return c << (6 * s);
 175             }
 176         };
 177
 178         // サイズが1以外の場合、ここからがちと違う。
 179         std::vector<unsigned char>::reverse_iterator beg = tmp.rbegin(),
 180             end = tmp.rend() - 1;
 181         const unsigned char char_bit = (1 << 6) - 1;
 182         for (int i = 0; beg != end; ++i,++beg)
 183         {
 184             code += Lambda()((*beg & char_bit), i);
 185         }
 186
 187         // 最後だけ、別の計算が必要になる。
 188         const unsigned char first_byte = (1 << ((sizeof(unsigned char) + 1) - tmp.size())) - 1;
 189         code += Lambda()(first_byte,tmp.size() - 1);
 190
 191     }
 192
 193     return code;
 194 }
 195
 196
 197 long utakata::utf8::generateUTF8Code(const std::string& bytes)
 198 {
 199     // UTF8である一文字のstringを受け取って、先頭1文字の値を返す。
 200
 201     std::string str = bytes;
 202     std::vector<unsigned char> tmp;
 203     tmp.insert(tmp.end(), str.begin(), str.end());
 204
 205     // vectorにしなおしたら後は元々の関数に任せる。
 206     return generateUTF8Code(tmp);
 207 }
 208
 209 bool utakata::utf8::is_utf8_one(const std::vector<unsigned char>& bytes, size_t& size)
 210 {
 211     //渡したバイト列がUTF8の一文字に該当するかどうかを返す。
 212     size_t back = size;
 213
 214     if (bytes.size() == 0)
 215     {
 216         size = back;
 217         return false;
 218     }
 219
 220     size_t num = 0;
 221
 222     if (!is_utf8_first_byte(bytes[0], num))
 223     {
 224         size = back;
 225         return false;
 226     }
 227
 228     // そもそもbytesのサイズが足りない場合にも失敗とする。
 229     if (num > bytes.size())
 230     {
 231         size = back;
 232         return false;
 233     }
 234
 235     // 先頭要素以外が正しければそれで問題ないとする。
 236     if (num > 1)
 237     {
 238
 239         const CheckUTF8Byte& checker = for_each(bytes.begin() + 1, bytes.begin() + num,
 240                                                  CheckUTF8Byte());
 241
 242         if (checker.good)
 243         {
 244             size = num;
 245         } else {
 246             size = back;
 247             return false;
 248         }
 249     } else {
 250         // sizeが0の場合には、この時点で1を設定するようにする。
 251         size = num;
 252     }
 253
 254     // 最後まで到達した時点で成功とする。
 255     return true;
 256 }
 257
 258 bool utakata::utf8::is_utf8_all(const std::vector<unsigned char>& bytes)
 259 {
 260     // 与えられたバイト列全てがUTF-8であるかどうかを返す。
 261     size_t size = 0;
 262     std::vector<unsigned char>::const_iterator it = bytes.begin();
 263     while (is_utf8_one(std::vector<unsigned char>(it, bytes.end()), size)) {
 264         it += size;
 265         size = 0;
 266     }
 267
 268     if (it == bytes.end()) {
 269         return true;
 270     }
 271
 272     return false;
 273 }
 274
 275 bool utakata::utf8::is_utf8_first_byte(unsigned char c, size_t& size)
 276 {
 277     // UTf-8の先頭バイトであるかどうかを返す。
 278     // 先頭バイトである場合には、その先頭バイトを含む、一文字のサイズを返す。
 279     // 先頭バイトではない場合には、
 280     const unsigned char max_c = 1 << (sizeof(unsigned char) * 8 - 1);
 281
 282     size_t back = size;
 283
 284     // 最上位ビットが0である場合、これはasciiコードを指す。
 285     if (!(c & max_c))
 286     {
 287         size = 1;
 288         return true;
 289     }
 290
 291     unsigned char first = c << 1;
 292     size_t num = 1;
 293     while (first & max_c) {
 294         first <<= 1;
 295         num += 1;
 296     }
 297
 298     // ここまできたとき、最上位ビットは0であるはず。
 299     // numが5未満である場合、とりあえず正常としておくこととする。
 300     const unsigned char max_utf8_sequence = 5;
 301     if (num < max_utf8_sequence) {
 302         size = num;
 303         return true;
 304     }
 305     else if (num == 1)
 306     {
 307         // numが1の場合、何らかの理由で先頭が欠落したと見られる。
 308         // この場合、スキップするべきバイト数を返す。
 309         size = 1;
 310         return false;
 311
 312     } else {
 313         size = back;
 314         return false;
 315     }
 316 }