1 /***************************************************************************
2 * Copyright (C) 2009 by Kita Developers *
3 * ikemo@users.sourceforge.jp *
5 * This program is free software; you can redistribute it and/or modify *
6 * it under the terms of the GNU General Public License as published by *
7 * the Free Software Foundation; either version 2 of the License, or *
8 * (at your option) any later version. *
9 ***************************************************************************/
12 #include <QtCore/QString>
15 #include "globalconfig.h"
16 #include "kita-utf16.h"
17 #include "kita-utf8.h"
// Bring the Kita::Parser names into scope for the definitions below.
20 using namespace Kita::Parser;
// Loop bound for the res-anchor scanner in this file, whose digit loop runs
// for at most KITA_RESDIGIT + 1 iterations.
22 static const int KITA_RESDIGIT = 4;
24 /* if cdat == str, return str.length() */
// Compares the characters at cdat with str position by position and returns
// 0 at the first mismatch.
// NOTE(review): the declaration of i, the advance of i and cdat, and the
// final return are not visible in this excerpt; per the comment above, a full
// match presumably returns the length of str - confirm.
25 static int isEqual(const QChar *cdat, const QString& str)
28 const int size = str.size();
// Walk str until its end, or until a NUL character stored inside it.
29 while (i < size && str.at(i) != '\0') {
30 if (*cdat != str.at(i)) return 0;
// NOTE(review): in this excerpt the literals passed to isEqual() appear as
// already-decoded characters, and the double-quote case is not a valid string
// literal as shown. They were presumably HTML entity names in the real file -
// confirm against the repository copy before editing this table.
36 /* parsing function for special char (such as ♥ */
38 /* For example, if cdat = "&", then
40 pos (= length of cdat) = 5,
42 static QString parseSpecialChar(
// Try each known token in turn. The value returned by isEqual() is stored in
// pos; a non-zero value selects the branch and retstr receives the
// replacement text.
52 if ((pos = isEqual(cdat , ">"))) retstr = '>';
53 else if ((pos = isEqual(cdat , "<"))) retstr = '<';
54 else if ((pos = isEqual(cdat , " "))) retstr = ' ';
55 else if ((pos = isEqual(cdat , "&"))) retstr = '&';
56 else if ((pos = isEqual(cdat , """))) retstr = '"';
// The card-suit cases build their replacement from the KITAUTF8_HEART,
// KITAUTF8_DIA, KITAUTF8_CLUB and KITAUTF8_SPADE constants via
// QString::fromUtf8().
58 else if ((pos = isEqual(cdat , "♥")))
59 retstr = QString::fromUtf8(KITAUTF8_HEART);
61 else if ((pos = isEqual(cdat , "♦")))
62 retstr = QString::fromUtf8(KITAUTF8_DIA);
64 else if ((pos = isEqual(cdat , "♣")))
65 retstr = QString::fromUtf8(KITAUTF8_CLUB);
67 else if ((pos = isEqual(cdat , "♠")))
68 retstr = QString::fromUtf8(KITAUTF8_SPADE);
// Converts raw dat text to plain text, appending the result to text.
// Cases visible in this excerpt: the <br> tag becomes a newline (dropping one
// space on either side), other tags are skipped up to the closing >, and
// sequences recognised by parseSpecialChar() are replaced by its result.
// Whatever follows the last handled position is appended unchanged at the end.
73 /* get plain text from raw data */
75 This function replaces "<br>" to "\n", removes HTML tags and
76 replaces special chars.
78 void Parser::datToText(
81 const QString &rawData,
89 unsigned int startPos, pos;
90 const QChar *chpt = rawData.unicode();
91 unsigned int length = rawData.length();
// startPos marks the beginning of the run not yet copied to text; i is the
// scan cursor.
93 for (unsigned int i = startPos = 0 ; i < length ; i++) {
95 switch (chpt[ i ].unicode()) {
// NOTE(review): chpt[i + 1] to chpt[i + 3] are read without comparing against
// length; this relies on the buffer being terminated - confirm.
100 if (chpt[ i + 1 ] == 'b' && chpt[ i + 2 ] == 'r' && chpt[ i + 3 ] == '>') {
// i2 is the number of pending characters to copy in front of the tag.
102 unsigned int i2 = i - startPos;
103 if (i > 0 && chpt[ i - 1 ] == ' ') i2--; /* remove space before <br> */
104 text += rawData.mid(startPos, i2) + '\n';
106 if (chpt[ startPos ] == ' ') startPos++; /* remove space after <br> */
110 /*----------------------------------------*/
112 /* remove HTML tags <[^>]*> */
// Flush any pending text, then advance i to the closing > of the tag.
115 if (i - startPos) text += rawData.mid(startPos, i - startPos);
// NOTE(review): chpt[i] is read before the i < length test; swapping the two
// operands would avoid reading past the end - confirm.
116 while (chpt[ i ] != '>' && i < length) i++;
122 /*----------------------------------*/
// A non-empty result means a special char was recognised at i; the pending
// text is flushed first, followed by the replacement.
129 tmpstr = parseSpecialChar(chpt + i, pos);
131 if (!tmpstr.isEmpty()) {
132 text += rawData.mid(startPos, i - startPos) + tmpstr;
// Append the tail that no case above consumed.
142 text += rawData.mid(startPos);
// Parses a res anchor found at cdat. Returns false at once when length is 0.
// Outputs visible in this excerpt: linkstr collects the accepted characters,
// refNum[hyphen] accumulates a decimal number (callers read refNum[0] and
// refNum[1]), and pos is the read offset into cdat.
// NOTE(review): the remaining return statements are outside this excerpt.
145 /* parsing function for anchor (>>digits) */
147 /* This function parses res anchor.
149 For example, if cdat = ">12-20", then
154 pos (= length of cdat) = 9,
158 static bool parseResAnchor(
161 const QChar *cdat, const unsigned int length,
164 QString& linkstr, int* refNum, unsigned int& pos)
// Local helper: tests whether c is one of the code points treated as a
// hyphen. Only part of the test is visible here (U+2010..U+2015 and U+FF0D).
168 static bool isHYPHEN(unsigned short c)
173 || (c >= 0x2010 && c <= 0x2015)
175 || (c == 0xFF0D) /* UTF8: 0xEFBC8D */
186 if (length == 0) return false;
193 /* check '>' twice */
// Each pass accepts either the UTF16_BRACKET code unit or the four characters
// of the &gt; entity.
// NOTE(review): cdat[pos + 1] to cdat[pos + 3] are read without a visible
// length check.
194 for (int i = 0; i < 2; i++) {
196 if (cdat[ pos ].unicode() == UTF16_BRACKET) {
197 linkstr += cdat[ pos ];
199 } else if (cdat[ pos ] == '&' && cdat[ pos + 1 ] == 'g' /* > */
200 && cdat[ pos + 2 ] == 't' && cdat[ pos + 3 ] == ';') {
// A comma or an equals sign (ASCII, or the UTF16_COMMA / UTF16_EQ code unit)
// at pos is tested next; the bodies of these branches are not visible here.
209 if (cdat[ pos ] == ',' || cdat[ pos ].unicode() == UTF16_COMMA) {
217 if (cdat[ pos ] == '=' || cdat[ pos ].unicode() == UTF16_EQ) {
// Scan at most KITA_RESDIGIT + 1 characters without running past length.
226 for (int i = 0 ; i < KITA_RESDIGIT + 1 && pos < length ; i++, pos++) {
228 unsigned short c = cdat[ pos ].unicode();
// The visible condition is true for a character that is neither a digit
// (UTF16_0..UTF16_9 or ASCII 0..9) nor an acceptable hyphen; a hyphen is not
// acceptable in first position or once one has already been seen.
// NOTE(review): the guarded statement is not visible - presumably it stops
// the scan; confirm.
230 if ((c < UTF16_0 || c > UTF16_9)
231 && (c < '0' || c > '9')
232 && (!LocalFunc::isHYPHEN(c)
233 || (i == 0 && LocalFunc::isHYPHEN(c))
234 || (hyphen && LocalFunc::isHYPHEN(c)))
237 linkstr += cdat[ pos ];
239 if (LocalFunc::isHYPHEN(c)) {
// Map a code unit at or above UTF16_0 onto the matching ASCII digit, then
// append that digit to the current number in base 10.
243 if (c >= UTF16_0) c = '0' + cdat[ pos ].unicode() - UTF16_0;
244 refNum[ hyphen ] *= 10;
245 refNum[ hyphen ] += c - '0';
// Fills the name fields of resdat from rawStr. resdat.name receives the plain
// text produced by datToText() and resdat.nameHTML is rebuilt from it: res
// anchors parsed from offset i become <a> links and are recorded in
// resdat.anclist, and the text from i onward is wrapped in a span of class
// name_noaddr.
// NOTE(review): the updates to i and the condition guarding the span are not
// visible in this excerpt.
262 static void parseName(const QString& rawStr, RESDAT& resdat)
264 unsigned int i = 0, pos;
266 QString linkurl, linkstr;
268 datToText(rawStr, resdat.name);
// From here on the scan works on the decoded name, not on rawStr.
270 const QChar * chpt = resdat.name.unicode();
271 unsigned int length = resdat.name.length();
272 resdat.nameHTML.clear();
// Consume anchors one after another starting at offset i.
275 while (parseResAnchor(chpt + i, length - i, linkstr, refNum, pos)) {
// The link target is #from, or #from-to when a second number was parsed.
277 linkurl = QString("#%1").arg(refNum[ 0 ]);
278 if (refNum[ 1 ]) linkurl += QString("-%1").arg(refNum[ 1 ]);
280 resdat.nameHTML += "<a href=\"" + linkurl + "\">";
281 resdat.nameHTML += linkstr;
282 resdat.nameHTML += "</a>";
// Raise the upper bound to the lower one if needed, so the recorded range is
// never inverted.
285 if (refNum[ 1 ] < refNum[ 0 ]) refNum[ 1 ] = refNum[ 0 ];
286 anctmp.from = refNum[ 0 ];
287 anctmp.to = refNum[ 1 ];
288 resdat.anclist += anctmp;
293 /* non-digits strings */
296 resdat.nameHTML += "<span class=\"name_noaddr\">";
297 resdat.nameHTML += resdat.name.mid(i);
298 resdat.nameHTML += "</span>";
// Splits rawStr into the date, ID, BE and host parts of resdat.
// resdat.date first receives the whole string and is then cut with
// rawStr.left(pos) at the position reached by the marker search. Each part
// that follows is recognised by a two-letter prefix (ID, BE, HO) and copied
// with rawStr.mid().
// NOTE(review): the statements that advance pos and set startpos are not
// visible in this excerpt.
303 /* parse date, ID, host */
313 static void parseDateId(const QString& rawStr, RESDAT& resdat)
315 resdat.date = rawStr;
319 resdat.bepointmark.clear();
321 const QChar *chpt = rawStr.unicode();
322 unsigned int pos = 0, startpos = 0;
323 unsigned int length = rawStr.length();
// Search for the first ID or BE marker, stopping at a NUL character.
// NOTE(review): this relies on chpt being NUL-terminated, and chpt[pos + 1]
// is read without a length check - confirm.
325 while (chpt[ pos ] != '\0' &&
326 !(chpt[ pos ] == 'I' && chpt[ pos + 1 ] == 'D') &&
327 !(chpt[ pos ] == 'B' && chpt[ pos + 1 ] == 'E')) {
330 resdat.date = rawStr.left(pos);
333 if (chpt[ pos ] == 'I' && chpt[ pos + 1 ] == 'D') {
// The ID runs up to the next space.
// NOTE(review): chpt[pos] is tested before pos is compared with length, and
// pos++ still increments when the comparison fails, so pos can end one past
// length.
336 while (chpt[ pos ] != ' ' && pos++ < length) {};
337 resdat.id = rawStr.mid(startpos, pos - startpos);
341 // qDebug("date %s, ID %s", (const char*)resdat.date.local8Bit(), resdat.id.ascii());
343 if (pos >= length) return ;
346 if (chpt[ pos ] == 'B' && chpt[ pos + 1 ] == 'E') {
// The BE value runs up to the next hyphen.
349 while (chpt[ pos ] != '-' && pos++ < length) {};
350 resdat.be = rawStr.mid(startpos, pos - startpos);
// A run of # characters found at pos is stored as bepointmark.
352 if (pos < length && chpt[ pos ] == '#') {
354 while (chpt[ pos ] == '#' && pos++ < length) {};
355 resdat.bepointmark = rawStr.mid(startpos, pos - startpos);
359 if (pos >= length) return ;
362 if (chpt[ pos ] == 'H' && chpt[ pos + 1 ] == 'O') {
// The host runs up to the next space.
365 while (chpt[ pos ] != ' ' && pos++ < length) {};
366 resdat.host = rawStr.mid(startpos, pos - startpos);
368 // qDebug("host %s", resdat.host.ascii());
// Recognises a URL at cdat whose scheme may have lost leading letters: http,
// ttp and tp are tested, along with the https equivalents.
// On the visible path, the URL remainder is run through datToText(); linkstr
// then becomes prefix + that text and linkurl becomes scheme + that text.
// Returns false when the scan position ends up beyond length.
// NOTE(review): the assignments to prefix and scheme, and the other return
// statements, are not visible in this excerpt.
372 /* parsing function for link */
376 cdat = "ttp://foo.com",
380 linkstr = "ttp://foo.com",
381 linkurl = "http://foo.com",
382 pos (= length of cdat) = 13,
386 static bool parseLink(
389 const QChar *cdat, const unsigned int length,
392 QString& linkstr, QString& linkurl, unsigned int& pos
396 /*-----------------------------*/
// Identify which spelling of the scheme starts at cdat.
405 if (isEqual(cdat , "http://")) {
408 } else if (isEqual(cdat , "ttp://")) {
411 } else if (isEqual(cdat , "tp://")) {
414 } else if (isEqual(cdat , "https://")) {
417 } else if (isEqual(cdat , "ttps://")) {
420 } else if (isEqual(cdat , "tps://")) {
// Continue scanning right after the matched prefix.
427 pos = prefix.length();
// Collect characters in the range ! through ~, excluding < and > (the rest
// of the loop condition is not visible here).
428 while (cdat[ pos ] >= '!' && cdat[ pos ] <= '~' &&
429 cdat[ pos ] != ' ' && cdat[ pos ] != '<' && cdat[ pos ] != '>'
431 retlinkstr += cdat[ pos++ ];
// Reject the link if the scan went past the supplied length.
433 if (pos > length) return false;
// Decode the collected text before building the two output strings.
435 if (!retlinkstr.isEmpty()) datToText(retlinkstr, linkstr);
437 linkurl = scheme + linkstr;
438 linkstr = prefix + linkstr;
443 /* create res anchor */
444 /* This function is called from parseBody internally.
445 See also parseBody. */
// Tries to parse a res anchor at position i of rawStr. On the visible success
// path, the pending text between startPos and i is flushed to resdat.bodyHTML,
// followed by an <a> link to #from or #from-to, and the range is appended to
// resdat.anclist.
// NOTE(review): the failure branch, the updates to i and startPos, and the
// return values are outside this excerpt.
446 static bool createResAnchor(const QString &rawStr, RESDAT& resdat,
447 const QChar *chpt, unsigned int &i, unsigned int &startPos)
449 QString linkstr, linkurl;
452 unsigned int length = rawStr.length();
455 if (!parseResAnchor(chpt + i, length - i, linkstr, refNum, pos)) {
// Emit the text that precedes the anchor, then the anchor element itself.
462 resdat.bodyHTML += rawStr.mid(startPos, i - startPos);
463 linkurl = QString("#%1").arg(refNum[ 0 ]);
464 if (refNum[ 1 ]) linkurl += QString("-%1").arg(refNum[ 1 ]);
466 resdat.bodyHTML += "<a href=\"" + linkurl + "\">";
467 resdat.bodyHTML += linkstr;
468 resdat.bodyHTML += "</a>";
470 /* add anchor to ancList */
// Raise the upper bound to the lower one if needed, so the recorded range is
// never inverted.
472 if (refNum[ 1 ] < refNum[ 0 ]) refNum[ 1 ] = refNum[ 0 ];
473 anctmp.from = refNum[ 0 ];
474 anctmp.to = refNum[ 1 ];
475 resdat.anclist += anctmp;
// Builds resdat.bodyHTML from the raw body text in rawStr.
// Cases visible in this excerpt: <br> is emitted as <br> with one adjacent
// space trimmed on each side, other tags are dropped, URLs found by
// parseLink() and anchors found by createResAnchor() become <a> links, and
// the unprocessed tail is appended verbatim.
490 static void parseBody(const QString &rawStr, RESDAT& resdat)
492 resdat.bodyHTML.clear();
494 unsigned int startPos;
495 QString linkstr, linkurl;
496 const QChar *chpt = rawStr.unicode();
497 unsigned int length = rawStr.length();
499 bool ancChain = false;
501 /* ancChain is chain for anchor. For example, if anchor ">2"
502 appeared, ancChain is set to true. Moreover, if next strings
503 are "=5", anchor for 5 is also set. Thus, we can obtain anchors
504 for strings ">2=5" as follows:
506 <a href="#2">>2</a><a href="#5">=5</a>
510 if (chpt[ 0 ] == ' ') offset = 1; /* remove one space after <> */
// startPos marks the start of the text not yet copied to bodyHTML; i is the
// scan cursor.
511 for (unsigned int i = startPos = offset ; i < length ; i++) {
513 switch (chpt[ i ].unicode()) {
// NOTE(review): chpt[i + 1] to chpt[i + 3] are read without comparing against
// length; this relies on the buffer being terminated - confirm.
518 if (chpt[ i + 1 ] == 'b' && chpt[ i + 2 ] == 'r' && chpt[ i + 3 ] == '>') {
520 /* reset anchor chain */
523 unsigned int i2 = i - startPos;
524 if (i > 0 && chpt[ i - 1 ] == ' ') i2--; /* remove space before <br> */
525 resdat.bodyHTML += rawStr.mid(startPos, i2);
527 resdat.bodyHTML += "<br>";
530 if (chpt[ startPos ] == ' ') startPos++; /* remove space after <br> */
534 /*----------------------------------------*/
536 /* remove HTML tags <[^>]*> */
539 if (i - startPos) resdat.bodyHTML += rawStr.mid(startPos, i - startPos);
// NOTE(review): chpt[i] is read before the i < length test; swapping the two
// operands would avoid reading past the end - confirm.
540 while (chpt[ i ] != '>' && i < length) i++;
546 /*----------------------------------------*/
548 case 'h': /* "http://" or "ttp://" or "tp:" */
551 unsigned int pos = 0;
// On a successful link parse, flush the pending text and emit the anchor
// element.
552 if (parseLink(chpt + i, length - i, linkstr, linkurl, pos)) {
553 resdat.bodyHTML += rawStr.mid(startPos, i - startPos);
554 resdat.bodyHTML += "<a href=\"" + linkurl + "\">";
555 resdat.bodyHTML += linkstr;
556 resdat.bodyHTML += "</a>";
565 /*----------------------------------*/
// The three characters after i spell gt; so together with the current
// character this is presumably the &gt; entity, which may begin a res anchor.
570 if (chpt[ i + 1 ] == 'g' && chpt[ i + 2 ] == 't' && chpt[ i + 3 ] == ';')
571 ancChain = createResAnchor(rawStr, resdat, chpt, i, startPos);
575 /*----------------------------------------*/
580 ancChain = createResAnchor(rawStr, resdat, chpt, i, startPos);
583 /*----------------------------------*/
// Attempted only while an anchor chain is active (see ancChain above).
587 if (ancChain) ancChain = createResAnchor(rawStr, resdat, chpt, i, startPos);
// Append the tail that no case above consumed.
591 resdat.bodyHTML += rawStr.mid(startPos);
597 struct RESDAT is defined in datinfo.h.
598 This function is called from DatToHtml() and DatInfo::parseDat()
602 resdat.num ... number
603 resdat.linestr ... raw line strings
// Parses resdat.linestr and fills resdat; returns true immediately when
// resdat.parsed is already set, so the work is done only once per resdat.
// The line is split on the <> separator into name, address, date/ID and body
// sections, which are handed to parseName(), datToText(), parseDateId() and
// parseBody(); the text after the last recorded separator is stored in
// subject.
// NOTE(review): the conditions that set resdat.broken, the increment of
// section, and the final return are outside this excerpt.
610 bool Parser::parseResDat(RESDAT& resdat, QString& subject)
612 if (resdat.parsed) return true;
614 resdat.parsed = true;
615 resdat.broken = false;
616 resdat.anclist.clear();
618 /* search the starting positions of each section to split raw data. */
619 const QChar *chpt = resdat.linestr.unicode();
620 unsigned int length = resdat.linestr.length();
621 unsigned int section = 0;
// sectionPos[section] records the offset just past a separator (i + 2).
622 unsigned int sectionPos[ 5 ];
623 for (unsigned int i = 0 ; i < length ; i++) {
625 /* sections are split by "<>" */
// NOTE(review): chpt[i + 1] is read when i is the last index; this relies on
// the buffer being terminated - confirm.
626 if (chpt[ i ] == '<' && chpt[ i + 1 ] == '>') {
630 resdat.broken = true;
634 sectionPos[ section ] = i + 2;
641 resdat.broken = true;
645 // qDebug("[%d] %d %d %d %d",section, sectionPos[1],sectionPos[2],sectionPos[3],sectionPos[4]);
// Each section length excludes the two-character separator that ends it.
648 length = sectionPos[ 1 ] - 2 ;
649 parseName(resdat.linestr.mid(0, length), resdat);
652 length = sectionPos[ 2 ] - 2 - sectionPos[ 1 ];
653 datToText(resdat.linestr.mid(sectionPos[ 1 ], length), resdat.address);
656 length = sectionPos[ 3 ] - 2 - sectionPos[ 2 ];
657 parseDateId(resdat.linestr.mid(sectionPos[ 2 ], length), resdat);
660 length = sectionPos[ 4 ] - 2 - sectionPos[ 3 ];
661 parseBody(resdat.linestr.mid(sectionPos[ 3 ], length), resdat);
// Everything after the fourth separator is the subject.
664 subject = resdat.linestr.mid(sectionPos[ 4 ]);