+/*
+ * Copyright (C) 2009 by Aiwota Programmer
+ * aiwotaprog@tetteke.tk
+ *
+ * This file is part of Dialektos.
+ *
+ * Dialektos is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Dialektos is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Dialektos. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "cp932.hxx"
+
+#include <boost/foreach.hpp>
+#include <boost/format.hpp>
+#include <string>
+#include <sstream>
+
+
+namespace dialektos {
+
+namespace convert {
+
+
+// Creates a converter whose output buffer is empty and which has no
+// DBCS lead byte pending (lead_ == 0).
+CP932toUTF8::CP932toUTF8()
+    : CP932Table(),
+      buffer_(),
+      lead_(0) {}
+
+// Appends the UTF-8 encoding of the code unit `bmp` to buffer_.
+//
+// One, two or three bytes are written depending on the magnitude of `bmp`.
+// The value is encoded as-is; no range (e.g. surrogate) filtering is done
+// here.
+void CP932toUTF8::bmp_to_buffer(unsigned short bmp) {
+  // U+0000..U+007F: a single byte identical to ASCII.
+  if (bmp <= 0x007f) {
+    buffer_ += static_cast<char>(bmp);
+    return;
+  }
+
+  // Every multi-byte form ends with 10zzzzzz holding the low six bits.
+  const char last = static_cast<char>(0x80 | (bmp & 0x3f));
+
+  // U+0080..U+07FF: 00000xxxxxyyyyyy -> 110xxxxx 10yyyyyy
+  if (bmp <= 0x07ff) {
+    buffer_ += static_cast<char>(0xc0 | (bmp >> 6));
+    buffer_ += last;
+    return;
+  }
+
+  // Everything else: xxxxyyyyyyzzzzzz -> 1110xxxx 10yyyyyy 10zzzzzz
+  buffer_ += static_cast<char>(0xe0 | (bmp >> 12));
+  buffer_ += static_cast<char>(0x80 | ((bmp >> 6) & 0x3f));
+  buffer_ += last;
+}
+
+// Converts a single-byte CP932 character and appends the result to buffer_.
+//
+// A to_bmp() result of 0 is treated as "no mapping"; the byte is then
+// written through unknown_to_buffer() instead of being encoded.
+void CP932toUTF8::cp932_to_buffer(unsigned char ch) {
+  const unsigned short code_unit = to_bmp(ch);
+  if (code_unit != 0) {
+    bmp_to_buffer(code_unit);
+    return;
+  }
+  unknown_to_buffer(ch);
+}
+
+// Converts a double-byte CP932 character (lead byte ch1, trail byte ch2)
+// and appends the result to buffer_.
+//
+// A to_bmp() result of 0 is treated as "no mapping"; both bytes are then
+// written through unknown_to_buffer() instead of being encoded.
+void CP932toUTF8::cp932_to_buffer(unsigned char ch1, unsigned char ch2) {
+  const unsigned short code_unit = to_bmp(ch1, ch2);
+  if (code_unit != 0) {
+    bmp_to_buffer(code_unit);
+    return;
+  }
+  unknown_to_buffer(ch1, ch2);
+}
+
+// Appends a printable escape for an unconvertible byte to buffer_: a
+// backslash, 'x', and the byte value as two lowercase hex digits.
+void CP932toUTF8::unknown_to_buffer(unsigned char ch) {
+  // The byte is widened to int so it is formatted as a number, not a char.
+  buffer_ += boost::str(boost::format("\\x%02x") % static_cast<int>(ch));
+}
+
+// Appends printable escapes for an unconvertible byte pair to buffer_:
+// each byte becomes a backslash, 'x', and two lowercase hex digits, in the
+// order ch1 then ch2.
+void CP932toUTF8::unknown_to_buffer(unsigned char ch1, unsigned char ch2) {
+  // The bytes are widened to int so they are formatted as numbers.
+  buffer_ += boost::str(boost::format("\\x%02x\\x%02x")
+                        % static_cast<int>(ch1)
+                        % static_cast<int>(ch2));
+}
+
+// Feeds `input` (CP932 bytes) through the converter and returns buffer_.
+//
+// Bytes are classified as follows:
+//   0x00-0x7f              ASCII, copied unchanged
+//   0x81-0x9f, 0xe0-0xfc   DBCS lead byte, remembered in lead_
+//   0xa1-0xdf              half-width katakana, converted on its own
+//   0x80, 0xa0, 0xfd-0xff  undefined, written as an escape
+// When a lead byte is pending, the next byte is always taken as its trail
+// byte, whatever its value.
+//
+// State persists across calls: buffer_ is appended to and never cleared
+// here, so the return value holds everything converted so far, and a lead
+// byte at the very end of `input` stays pending in lead_.
+std::string CP932toUTF8::operator()(const std::string& input) {
+  typedef std::string::const_iterator ByteIter;
+
+  for (ByteIter it = input.begin(), last = input.end(); it != last; ++it) {
+    const unsigned char byte = static_cast<unsigned char>(*it);
+
+    // Complete a double-byte character started by the previous byte.
+    if (lead_ > 0) {
+      cp932_to_buffer(lead_, byte);
+      lead_ = 0;
+      continue;
+    }
+
+    // DBCS lead byte: hold it until the trail byte arrives.
+    const bool starts_pair =
+        (byte >= 0x81 && byte <= 0x9f) || (byte >= 0xe0 && byte <= 0xfc);
+    if (starts_pair) {
+      lead_ = byte;
+      continue;
+    }
+
+    // Every remaining class is a stand-alone byte.
+    lead_ = 0;
+    if (byte <= 0x7f) {
+      // ASCII
+      buffer_ += byte;
+    } else if (byte >= 0xa1 && byte <= 0xdf) {
+      // half-width katakana
+      cp932_to_buffer(byte);
+    } else {
+      // undefined: 0x80, 0xa0, 0xfd-0xff
+      unknown_to_buffer(byte);
+    }
+  }
+
+  return buffer_;
+}
+
+// Convenience wrapper: converts `input` with a fresh CP932toUTF8 instance
+// and returns what that converter's operator() returns.
+std::string cp932(const std::string& input) {
+  return CP932toUTF8()(input);
+}
+
+
+} // namespace convert
+
+} // namespace dialektos