sublexer_impl.cpp

   1 #include <iostream>
   2 #include <assert.h>
   3 #include <sstream>
   4
   5
   6 #include "utf8_string.h"
   7
   8 #include "sublexer_impl.h"
   9 #include "lexeme_impl.h"
  10 #include "delimiter.h"
  11 #include "number_lexer.h"
  12
  13
  14 using namespace utakata;
  15 using namespace utakata::utf8_string;
  16
  17 smart_ptr<lexeme::ILexeme> sublexer::FirstLexer::lex(smart_ptr<utf8::UTF8InputStream> stream,
  18                                                      smart_ptr<sublexer::ISubLexer>& next)
  19 {
  20     // chにはlexerから渡された、今回読みだした文字が渡されている。
  21
  22     UTF8String str;
  23
  24     // 最初に実行されるので、最初の空白を読み飛ばす。
  25     UTF8Char ch(stream->read());
  26     {
  27         lexer_delimiter::Whitespace sps;
  28         while (sps(ch)) {
  29             ch = stream->read();
  30         }
  31     }
  32     // ここまで来た段階で、chには調べる対象となる文字が入っている。
  33
  34     // ()[]`'が最初だった場合、それぞれを返してやる。
  35     if (ch.toUTF16Code() == '(' || ch.toUTF16Code() == '[')
  36     {
  37         return lexeme::makeOpenParen();
  38     }
  39     else if (ch.toUTF16Code() == ')' || ch.toUTF16Code() == ']')
  40     {
  41         return lexeme::makeCloseParen();
  42     }
  43     else if (ch.toUTF16Code() == '`')
  44     {
  45         return lexeme::makeBackQuote();
  46     }
  47     else if (ch.toUTF16Code() == '\'')
  48     {
  49         return lexeme::makeQuote();
  50     }
  51     else if (ch.toUTF16Code() == '.')
  52     {
  53         // 次の文字がデリミタで終了していなければならない。
  54         if (lexer_delimiter::Normal()(UTF8Char(stream->peek())))
  55         {
  56             return lexeme::makeDot();
  57         }
  58     }
  59     else if (utf8_string::is_numeric(ch))
  60     {
  61         // 先頭が数値だった場合、これは10進数だと判断して次に進む。
  62         next.add(new sublexer::NumberLexer(UTF8String(ch.getBytes())));
  63         return smart_ptr<lexeme::ILexeme>();
  64     }
  65     else if (utf8_string::is_alpha(ch) ||
  66              ch.toUTF16Code() == '!' || ch.toUTF16Code() == '$' ||
  67              ch.toUTF16Code() == '%' || ch.toUTF16Code() == '&' ||
  68              ch.toUTF16Code() == '*' || ch.toUTF16Code() == '/' ||
  69              ch.toUTF16Code() == ':' || ch.toUTF16Code() == '<' ||
  70              ch.toUTF16Code() == '>' || ch.toUTF16Code() == '?' ||
  71              ch.toUTF16Code() == '^' || ch.toUTF16Code() == '_' ||
  72              ch.toUTF16Code() == '~' || ch.toUTF16Code() == '+' ||
  73              ch.toUTF16Code() == '.')
  74     {
  75         // やたらと多いが、上記のどれかである場合には、identityとして解析を
  76         // 開始させる。
  77         next.add(new sublexer::IdentifierLexer(UTF8String(ch.getBytes())));
  78         return smart_ptr<lexeme::ILexeme>();
  79     }
  80     else if (ch.toUTF16Code() == ';')
  81     {
  82         // 一行コメントになるため、そちらに任せる。
  83         next.add(new sublexer::OneLineCommentLexer());
  84     }
  85     else if (str[0].toUTF16Code() == '"')
  86     {
  87         // 先頭が"の場合、stringと判断される。
  88         next.add(new sublexer::StringLexer());
  89     }
  90
  91
  92     // ここ以降になると、決定するまで読んでから決定させる。
  93     str += ch;
  94     smart_ptr<lexeme::ILexeme> l = lex_(str, stream, next);
  95     if (!l.isNull())
  96     {
  97         return l;
  98     }
  99
 100     // ここに来る場合、次の解析結果が存在する場合のみである。
 101     if (next.isNull())
 102     {
 103         throw sublexer::LexException(stream->pos(), "構文エラー");
 104     }
 105     return smart_ptr<lexeme::ILexeme>();
 106 }
 107
 108 smart_ptr<lexeme::ILexeme> sublexer::FirstLexer::lex_(const utakata::utf8_string::UTF8String& str,
 109                                                       smart_ptr<utakata::utf8::UTF8InputStream> stream,
 110                                                       smart_ptr<ISubLexer>& next)
 111 {
 112     if (str[0].toUTF16Code() == ',')
 113     {
 114         UTF8Char ch = stream->peek();
 115         // 次の一文字で決定できる。
 116         if (ch.toUTF16Code() == '@')
 117         {
 118             stream->read();
 119             return lexeme::makeUnquoteSplicing(str + ch);
 120         }
 121         else
 122         {
 123             // 次の文字が@では無い場合、とりあえず,で返す。
 124             return lexeme::makeUnquote();
 125         }
 126     }
 127     else if (str[0].toUTF16Code() == '#')
 128     {
 129         // コメントなどに繋がる時もあるため、面倒。
 130         UTF8Char ch = stream->peek();
 131         if (ch.toUTF16Code() == '\'')
 132         {
 133             return lexeme::makeSyntax();
 134         }
 135         else if ('`')
 136         {
 137             return lexeme::makeQuasiSyntax();
 138         }
 139         else if (ch.toUTF16Code() == ',')
 140         {
 141             // 更に次の文字を読む。
 142             stream->read();
 143             ch = stream->peek();
 144             if (ch.toUTF16Code() == '@')
 145             {
 146                 stream->read();
 147                 return lexeme::makeUnsyntaxSplicing();
 148             }
 149             else
 150             {
 151                 return lexeme::makeUnsyntax();
 152             }
 153         }
 154         else if (ch.toUTF16Code() == '|')
 155         {
 156             // ネストコメントの開始という判断がなされ、ネストコメント
 157             // 用の字句解析に移る。
 158             stream->read();
 159             next.add(new sublexer::NestedCommentLexer(str + ch));
 160         }
 161         else if (ch.toUTF16Code() == 'e' || ch.toUTF16Code() == 'E' ||
 162                  ch.toUTF16Code() == 'i' || ch.toUTF16Code() == 'I' ||
 163                  ch.toUTF16Code() == 'b' || ch.toUTF16Code() == 'B' ||
 164                  ch.toUTF16Code() == 'o' || ch.toUTF16Code() == 'O' ||
 165                  ch.toUTF16Code() == 'd' || ch.toUTF16Code() == 'D' ||
 166                  ch.toUTF16Code() == 'x' || ch.toUTF16Code() == 'X')
 167         {
 168             // 数値のプレフィックス、もしくはexactnessになるため、これは
 169             // 数値として判定させる。
 170             stream->read();
 171             next.add(new sublexer::NumberLexer(str + ch));
 172         }
 173         else if (ch.toUTF16Code() == '\\')
 174         {
 175             // #\の場合、次には文字が出てくるはずなので、そのまま進む。
 176             stream->read();
 177             next.add(new sublexer::CharactorLexer());
 178         }
 179     }
 180     else if (str[0].toUTF16Code() == '-')
 181     {
 182         // 先頭が-の場合、次の文字を見てから決める。
 183         // 基本的には数値だが、次の文字によってはidentifierになりうる。
 184         UTF8Char ch(stream->peek());
 185         lexer_delimiter::Normal nor;
 186         if (ch.toUTF16Code() == '>' || nor(ch))
 187         {
 188             next.add(new sublexer::IdentifierLexer(str));
 189         }
 190         else
 191         {
 192             next.add(new sublexer::NumberLexer(str));
 193         }
 194     }
 195
 196     return smart_ptr<lexeme::ILexeme>();
 197 }
 198
 199
 200 //================================================================================
 201
 202 smart_ptr<lexeme::ILexeme> sublexer::StringLexer::lex(smart_ptr<utf8::UTF8InputStream> stream,
 203                                                       smart_ptr<sublexer::ISubLexer>& next)
 204 {
 205     // 文字列を解析する。
 206
 207     lexer_delimiter::Normal nor;
 208     lexer_delimiter::String st;
 209     lexer_delimiter::Whitespace w;
 210     utf8_string::UTF8String str;
 211
 212     while (!stream->isEOF()) {
 213         UTF8Char tmp(stream->peek());
 214         if (nor(tmp) || st(tmp)) {
 215             break;
 216         }
 217
 218         if (tmp.toUTF16Code() == '\\')
 219         {
 220             // 空白及び改行を飛ばす。
 221             while (!stream->isEOF() &&
 222                    (w(tmp) || nor(tmp))) {}
 223         } else {
 224             str += stream->read();
 225         }
 226     }
 227
 228     return smart_ptr<lexeme::ILexeme>();
 229 }
 230
 231 //================================================================================
 232
 233 smart_ptr<lexeme::ILexeme> sublexer::NestedCommentLexer::lex(smart_ptr<utf8::UTF8InputStream> stream,
 234                                                              smart_ptr<sublexer::ISubLexer>& next)
 235 {
 236     // ネストしたコメントを解釈する。基本的には改行も関係無く処理する。
 237     int count = 1;
 238     while (!stream->isEOF() && count > 0) {
 239         utf8_string::UTF8Char tmp(stream->read());
 240
 241         if (tmp.toUTF16Code() == '#')
 242         {
 243             // 次の文字を調べる。
 244             utf8_string::UTF8Char t2(stream->read());
 245             if (t2.toUTF16Code() == '|')
 246             {
 247                 ++count;
 248             }
 249         }
 250         else if (tmp.toUTF16Code() == '|')
 251         {
 252             // 次の文字を調べる。
 253             utf8_string::UTF8Char t2(stream->read());
 254             if (t2.toUTF16Code() == '#')
 255             {
 256                 --count;
 257             }
 258         }
 259     }
 260
 261     return smart_ptr<lexeme::ILexeme>();
 262 }
 263
 264 //================================================================================
 265
 266 sublexer::NumberLexer::NumberLexer(const utf8_string::UTF8String& str) :
 267     BINARY(2), OCTET(8), DECIMAL(10), HEX(16),
 268     str_(new utf8_string::UTF8String(str)), exact_(false), prefix_(0)
 269 {
 270 }
 271
 272 smart_ptr<lexeme::ILexeme> sublexer::NumberLexer::innerLex_(smart_ptr<utf8::UTF8InputStream> stream,
 273                                                             smart_ptr<sublexer::ISubLexer>& next,
 274                                                             const utf8_string::UTF8String& str)
 275 {
 276     // prefixをチェックして、返すべきものを決定する。
 277     if (prefix_ == BINARY)
 278     {
 279         return number::Number<2>().lex(stream, next, exact_);
 280     }
 281     else if (prefix_ == OCTET)
 282     {
 283         return number::Number<8>().lex(stream, next, exact_);
 284     }
 285     else if (prefix_ == DECIMAL)
 286     {
 287         return number::Number<10>().lex(stream, next, exact_);
 288     }
 289     else if (prefix_ == HEX)
 290     {
 291         return number::Number<16>().lex(stream, next, exact_);
 292     }
 293     else
 294     {
 295         return smart_ptr<lexeme::ILexeme>();
 296     }
 297 }
 298
 299 void sublexer::NumberLexer::checkExactness_(const utf8_string::UTF8String& str)
 300 {
 301     // 正確性をチェックする。2文字目だけを調べればそれですむ。
 302     if (str[1].toUTF16Code() == 'e' || str[1].toUTF16Code() == 'E')
 303     {
 304         exact_ = true;
 305     }
 306     else if (str[1].toUTF16Code() == 'i' || str [1].toUTF16Code() == 'I')
 307     {
 308         exact_ = false;
 309     }
 310 }
 311
 312 unsigned char sublexer::NumberLexer::getPrefix_(const utf8_string::UTF8String& str)
 313 {
 314     // 2文字目の値を見て判断を返す。
 315     if (str[1].toUTF16Code() == 'b' ||
 316         str[1].toUTF16Code() == 'B')
 317     {
 318         return BINARY;
 319     }
 320     else if (str[1].toUTF16Code() == 'o' ||
 321              str[1].toUTF16Code() == 'O')
 322     {
 323         return OCTET;
 324     }
 325     else if (str[1].toUTF16Code() == 'd' ||
 326              str[1].toUTF16Code() == 'D')
 327     {
 328         return DECIMAL;
 329     }
 330     else if (str[1].toUTF16Code() == 'x' ||
 331              str[1].toUTF16Code() == 'X')
 332     {
 333         return HEX;
 334     }
 335
 336     return DECIMAL;
 337 }
 338
 339 smart_ptr<lexeme::ILexeme> sublexer::NumberLexer::lex(smart_ptr<utf8::UTF8InputStream> stream,
 340                                                       smart_ptr<sublexer::ISubLexer>& next)
 341 {
 342     // 数値を解釈する。
 343     lexer_delimiter::Exactness e;
 344     lexer_delimiter::Prefix p;
 345
 346
 347     // 先頭はすでに読みとばされている。次に続く可能性があるため、継続して調査をしてみる。
 348
 349     if (p(*str_))
 350     {
 351         prefix_ = getPrefix_(*str_);
 352         // 次にexactnessが続いているかどうかを調べる。
 353         utf8_string::UTF8String s = utf8_string::substring(*str_, 2);
 354         if (e(s))
 355         {
 356             // Exactnessである場合、実際にそうであるかどうかをチェックする。
 357             checkExactness_(s);
 358         }
 359         s = utf8_string::substring(s, 2);
 360         return innerLex_(stream, next, s);
 361     }
 362     else if (e(*str_))
 363     {
 364         // Exactnessであるため、次にprefixの必要がある。
 365         checkExactness_(*str_);
 366
 367         utf8_string::UTF8String s = utf8_string::substring(*str_, 2);
 368         if (p(s))
 369         {
 370             prefix_ = getPrefix_(s);
 371         }
 372         else
 373         {
 374             prefix_ = DECIMAL;
 375         }
 376         return innerLex_(stream, next, s);
 377     }
 378     else
 379     {
 380         // どっちでもない場合、基本的に10進数として処理する。
 381         prefix_ = DECIMAL;
 382         return innerLex_(stream, next, *str_);
 383     }
 384 }
 385
 386 //================================================================================
 387
 388 sublexer::IdentifierLexer::IdentifierLexer(const utf8_string::UTF8String& str) :
 389     str_(new utf8_string::UTF8String(str))
 390 {}
 391
 392 smart_ptr<lexeme::ILexeme> sublexer::IdentifierLexer::lex(smart_ptr<utf8::UTF8InputStream> stream,
 393                                                           smart_ptr<sublexer::ISubLexer>& next)
 394 {
 395     // Identityを解析する。解釈自体は結構簡単。
 396
 397     // 一文字目はすでに判定されているため、二文字目以降で判定する。
 398     // 二文字名以降でなければ判定できないものもあるため、一度まとめてみる。
 399     lexer_delimiter::Normal nor;
 400     utf8_string::UTF8String& str = *str_;
 401     if (str[0].toUTF16Code() == '+')
 402     {
 403         return lexeme::makeIdentifier(str);
 404     }
 405     else if (str[0].toUTF16Code() == '-')
 406     {
 407         // 次の文字がデリミタの場合には、そのまま返す。
 408         UTF8Char next(stream->peek());
 409         if (nor(next))
 410         {
 411             return lexeme::makeIdentifier(str);
 412         }
 413         else if (next.toUTF16Code() == '>')
 414         {
 415             // この場合、そのまま続けてもよし。
 416         }
 417     }
 418
 419     while (!stream->isEOF()) {
 420         UTF8Char tmp(stream->peek());
 421         if (nor(tmp)) {
 422             break;
 423         }
 424         str += stream->read();
 425     }
 426
 427     return lexeme::makeIdentifier(str);
 428 }
 429
 430 //================================================================================
 431
 432 smart_ptr<lexeme::ILexeme> sublexer::CharactorLexer::lex(smart_ptr<utf8::UTF8InputStream> stream,
 433                                                          smart_ptr<sublexer::ISubLexer>& next)
 434 {
 435     // #\に続く文字名、あるいは#\xに続く16進数のチェックを行なう。
 436     // ここに来た時点で、#\までは読まれている。
 437     UTF8Char ch(stream->read());
 438     bool flag = ch.toUTF16Code() == 'x' ? true : false;
 439
 440     // Hexを解釈する。解釈部分を書くと面倒になるため、単純な別関数にしておく。
 441     UTF8String str(ch.getBytes());
 442     lexer_delimiter::Normal nor;
 443     lexer_delimiter::HexValue hex;
 444     while (!stream->isEOF()) {
 445         UTF8Char tmp(stream->peek());
 446         if (nor(tmp))
 447         {
 448             break;
 449         }
 450         else if (flag && !hex(tmp))
 451         {
 452             // hex valueではなかった場合には、これはエラーであると
 453             // して返す。
 454             std::stringstream ss;
 455             ss << tmp.toStr() << "は16進数中の文字名として利用できません";
 456             throw sublexer::LexException(stream->pos(), ss.str());
 457         }
 458         str += stream->read();
 459     }
 460
 461     return lexeme::makeCharactor(str);
 462 }
 463
 464 //================================================================================
 465
 466
 467 smart_ptr<lexeme::ILexeme> sublexer::OneLineCommentLexer::lex(
 468     smart_ptr<utf8::UTF8InputStream> stream,
 469     smart_ptr<sublexer::ISubLexer>& next)
 470 {
 471
 472
 473     // 一行コメントであるかどうかを返す。
 474     // なおコメントは原則として全て読み飛ばされるため、解釈が完了した
 475     // 後、FirstLexerが再び返される。
 476     lexer_delimiter::LineEnding end;
 477     while (!stream->isEOF()) {
 478         // 改行が出力されるまで全て読み飛ばす。
 479         UTF8Char ch = stream->read();
 480         if (end(ch, stream))
 481         {
 482             // この時点で、読み飛ばしまで行われている。
 483             next.add(new sublexer::FirstLexer());
 484             break;
 485         }
 486     }
 487     return smart_ptr<lexeme::ILexeme>();
 488 }
 489
 490
 491 //================================================================================
 492
 493 sublexer::LexException::LexException(size_t pos, std::string str) : pos_(pos),
 494                                                                     str_()
 495 {
 496     // エラーメッセージを定義する。
 497     std::stringstream ss;
 498     ss << "lex error ! -- pos : [" << pos << "] message : [" << str << "]" << std::endl;
 499     str_ = ss.str();
 500 }
 501
 502 const char* sublexer::LexException::what() const throw()
 503 {
 504     return str_.c_str();
 505 }