From: Myun2
Date: Tue, 3 Aug 2010 13:06:14 +0000 (+0900)
Subject: utf8_lexical.hpp 途中
X-Git-Tag: 20111130_shapeup_prev~277
X-Git-Url: http://git.osdn.net/view?a=commitdiff_plain;h=0b7672ea35e45163691a72df81b046d9c4cf60d2;p=roast%2Froast.git

utf8_lexical.hpp 途中
---

diff --git a/roast/include/roast/str/multibyte/utf8_lexical.hpp b/roast/include/roast/str/multibyte/utf8_lexical.hpp
index c14c1c37..68d98e1a 100644
--- a/roast/include/roast/str/multibyte/utf8_lexical.hpp
+++ b/roast/include/roast/str/multibyte/utf8_lexical.hpp
@@ -6,13 +6,76 @@
 #ifndef __SFJP_ROAST__str__multi_byte__utf8_lexical_HPP__
 #define __SFJP_ROAST__str__multi_byte__utf8_lexical_HPP__
 
+#include "roast/lexical/string_structure.hpp"
+
 namespace roast
 {
     namespace multibyte
     {
         namespace utf8
         {
-            typedef lexical_rule;
+            using namespace ::roast::lexical::structure;
+
+            /*
+                UTF-8 byte layout (original 31-bit scheme, one to six bytes per character):
+
+                U+0000    .. U+007F      0xxxxxxx                                               (00-7f)                                     07bit
+                U+0080    .. U+07FF      110yyyyx 10xxxxxx                                      (c0-df)(80-bf)                              11bit
+                U+0800    .. U+FFFF      1110yyyy 10yxxxxx 10xxxxxx                             (e0-ef)(80-bf)(80-bf)                       16bit
+                U+10000   .. U+1FFFFF    11110yyy 10yyxxxx 10xxxxxx 10xxxxxx                    (f0-f7)(80-bf)(80-bf)(80-bf)                21bit
+                U+200000  .. U+3FFFFFF   111110yy 10yyyxxx 10xxxxxx 10xxxxxx 10xxxxxx           (f8-fb)(80-bf)(80-bf)(80-bf)(80-bf)         26bit
+                U+4000000 .. U+7FFFFFFF  1111110y 10yyyyxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx  (fc-fd)(80-bf)(80-bf)(80-bf)(80-bf)(80-bf)  31bit
+            */
+
+            // Rule matching one encoded character: exactly one of the six forms above.
+            // NOTE(review): assumes seq<> accepts up to six elements, as or<> does
+            // here - confirm against roast/lexical/string_structure.hpp.
+            // NOTE(review): `or` is an alternative token for `||` in standard C++, so
+            // this name only works on compilers that treat it as an identifier.
+            typedef or<
+                // 1 byte  (U+0000 - U+007F) -> (00-7f)
+                int_range< 0x00, 0x7f >,
+
+                // 2 bytes (U+0080 - U+07FF) -> (c0-df)(80-bf)
+                seq<
+                    int_range< 0xc0, 0xdf >,
+                    int_range< 0x80, 0xbf >
+                >,
+
+                // 3 bytes (U+0800 - U+FFFF) -> (e0-ef)(80-bf)(80-bf)
+                seq<
+                    int_range< 0xe0, 0xef >,
+                    int_range< 0x80, 0xbf >,
+                    int_range< 0x80, 0xbf >
+                >,
+
+                // 4 bytes (U+10000 - U+1FFFFF) -> (f0-f7)(80-bf)(80-bf)(80-bf)
+                seq<
+                    int_range< 0xf0, 0xf7 >,
+                    int_range< 0x80, 0xbf >,
+                    int_range< 0x80, 0xbf >,
+                    int_range< 0x80, 0xbf >
+                >,
+
+                // 5 bytes (U+200000 - U+3FFFFFF) -> (f8-fb)(80-bf)(80-bf)(80-bf)(80-bf)
+                seq<
+                    int_range< 0xf8, 0xfb >,
+                    int_range< 0x80, 0xbf >,
+                    int_range< 0x80, 0xbf >,
+                    int_range< 0x80, 0xbf >,
+                    int_range< 0x80, 0xbf >
+                >,
+
+                // 6 bytes (U+4000000 - U+7FFFFFFF) -> (fc-fd)(80-bf)(80-bf)(80-bf)(80-bf)(80-bf)
+                seq<
+                    int_range< 0xfc, 0xfd >,
+                    int_range< 0x80, 0xbf >,
+                    int_range< 0x80, 0xbf >,
+                    int_range< 0x80, 0xbf >,
+                    int_range< 0x80, 0xbf >,
+                    int_range< 0x80, 0xbf >
+                >
+            > lexical_rule;
         }
     }
 }
diff --git a/roast/test/lexical_test/mbstring_test.cpp b/roast/test/lexical_test/mbstring_test.cpp
new file mode 100644
index 00000000..362371d3
--- /dev/null
+++ b/roast/test/lexical_test/mbstring_test.cpp
@@ -0,0 +1,88 @@
+//#include "roast/xml/roast_dom_driver.hpp"
+#include "roast/xml/roast_xml/roast_xml_dom_parser.hpp"
+#include "roast/xml/roast_xml/roast_xml_sax_parser.hpp"
+#include <stdio.h>
+#include <time.h>
+
+using namespace roast;
+using namespace roast::lexical;
+
+// Read-buffer size in bytes (200 MiB). Parenthesized so that the macro
+// expands safely inside larger expressions such as BUFF_SIZE - 1.
+#define BUFF_SIZE (200*1024*1024)
+
+// Event-callback class that only counts the attributes, text nodes and
+// elements it is notified about.
+class test
+{
+private:
+    int attr_count;
+    int text_count;
+    int element_count;
+public:
+    // Start every counter at zero so the increments below are well defined.
+    test() : attr_count(0), text_count(0), element_count(0) {}
+
+    void attribute( const sized_ccharbuf &attr_name, const sized_ccharbuf &attr_value )
+    {
+        attr_count ++;
+    }
+
+    void comment( const sized_ccharbuf &s )
+    {
+    }
+
+    void text( const sized_ccharbuf &s )
+    {
+        text_count ++;
+    }
+
+    void start_element( const sized_ccharbuf &s )
+    {
+        element_count ++;
+    }
+
+    void end_element()
+    {
+    }
+};
+
+// Reads temp.xml into memory, parses it with the DOM parser and prints the
+// clock() value before and after parsing. Returns 1 if the file cannot be
+// opened, 0 otherwise.
+int main()
+{
+    // Alternative input used during development: a MIDI device manager XML file.
+    FILE* fp = fopen("temp.xml","r");
+    if (fp == NULL)
+    {
+        perror("temp.xml");
+        return 1;
+    }
+
+    // One byte stays spare for the terminator: the parser is handed only a
+    // pointer, so presumably it relies on NUL termination - confirm.
+    char *work = new char [BUFF_SIZE];
+    const size_t length = fread(work,1,BUFF_SIZE - 1,fp);
+    work[length] = '\0';
+    fclose(fp);
+
+    // Inner scope: the parser objects are destroyed before the buffer is freed.
+    {
+        // clock_t is implementation-defined, so convert it before printing.
+        printf("%ld\n", static_cast<long>(clock()));
+
+        roast_xml::dom_parser xml(work);
+
+        /*roast_xml::sax_parser<
+            roast_xml::sax_callback_sample> xml(work);
+            //test> xml(work);*/
+        roast_xml::document doc = xml.analyze();
+
+        roast_xml::element *e = xml.get_root_element();
+        printf("%ld\n", static_cast<long>(clock()));
+    }
+
+    delete [] work;
+    return 0;
+}