4 * License : The MIT License
5 * Copyright(c) 2009 olyutorskii
8 package jp.sourceforge.jindolf.parser;
10 import java.util.regex.Pattern;
11 import jp.sourceforge.jindolf.corelib.PeriodType;
12 import jp.sourceforge.jindolf.corelib.VillageState;
15 * 人狼BBS各種XHTML文字列のパースを行いハンドラに通知する。
17 public class HtmlParser extends AbstractParser{
// A single space, used as a separator when regex fragments are concatenated.
19 private static final String SP = "\u0020";
// Handler that receives the page-level notifications issued by the parse methods.
22 private BasicHandler basicHandler;
// Sub-parsers; each is constructed with this parser as its argument.
23 private final TalkParser talkParser = new TalkParser(this);
24 private final SysEventParser sysEventParser = new SysEventParser(this);
// Range objects shared by the parse methods, which alias them under local names.
26 private final SeqRange rangepool_1 = new SeqRange();
27 private final SeqRange rangepool_2 = new SeqRange();
38 * {@link BasicHandler}ハンドラを登録する。
39 * @param basicHandler ハンドラ
41 public void setBasicHandler(BasicHandler basicHandler){
// Keep the handler in a field; the parse methods invoke it later
// (pageTitle, villageName, commitTime, periodLink, villageRecord, ...).
42 this.basicHandler = basicHandler;
47 * {@link TalkHandler}ハンドラを登録する。
48 * @param talkHandler ハンドラ
50 public void setTalkHandler(TalkHandler talkHandler){
// Not stored here: the handler is forwarded to the talk sub-parser.
51 this.talkParser.setTalkHandler(talkHandler);
56 * {@link SysEventHandler}ハンドラを登録する。
59 public void setSysEventHandler(SysEventHandler handler){
// Not stored here: the handler is forwarded to the system-event sub-parser.
60 this.sysEventParser.setSysEventHandler(handler);
// Landmarks of the page head, declared in the order parseHead() applies them:
// XML declaration, opening html tag, title element, opening body tag and the
// opening tag of the div whose class attribute is "main".
// Start of the XML declaration: "<?xml" followed by one space.
64 private static final Pattern XMLDECL_PATTERN =
65 compile("<\\?xml\u0020");
// Opening html tag followed by a space, i.e. a tag that carries attributes.
66 private static final Pattern O_HTML_PATTERN =
67 compile("<html\u0020");
// Whole title element; capture group 1 is its text, which may be empty.
68 private static final Pattern TITLE_PATTERN =
69 compile("<title>([^<]*)</title>");
70 private static final Pattern O_BODY_PATTERN =
72 private static final Pattern O_DIVMAIN_PATTERN =
73 compile("<div\u0020class=\"main\">");
77 * @throws HtmlParseException パースエラー
79 private void parseHead() throws HtmlParseException{
// Parses the page head up to the opening of the main div and reports the page title.
// NOTE(review): the *Affirm helpers presumably raise HtmlParseException on a
// failed match, using the context message set below -- confirm in AbstractParser.
80 setContextErrorMessage("lost head part");
// Alias a pooled range object; it will carry the position of the title text.
82 SeqRange titleRange = this.rangepool_1;
// The XML declaration is checked with the lookingAt variant; the remaining
// landmarks are located with the find variant, in document order.
84 lookingAtAffirm(XMLDECL_PATTERN);
87 findAffirm(O_HTML_PATTERN);
90 findAffirm(TITLE_PATTERN);
// Capture group 1 of TITLE_PATTERN is the text between <title> and </title>.
91 titleRange.setLastMatchedGroupRange(getMatcher(), 1);
// The title is handed over as (content, range) rather than as an extracted String.
94 this.basicHandler.pageTitle(getContent(), titleRange);
96 findAffirm(O_BODY_PATTERN);
99 findAffirm(O_DIVMAIN_PATTERN);
// Patterns used by parseLoginForm(). The visible fragments of LOGINFORM_PATTERN
// require the attributes action="index.rb", method="post" and class="login_form",
// each preceded by a single space; class="login_form" appears in two fragments.
// NOTE(review): the two fragments presumably belong to two alternative form
// layouts (parseLoginForm() distinguishes a "Land E" form) -- confirm against the full pattern.
105 private static final Pattern LOGINFORM_PATTERN =
109 +SP + "action=\"index\\.rb\""
110 +SP + "method=\"post\""
111 +SP + "class=\"login_form\""
115 +SP + "class=\"login_form\""
119 private static final Pattern C_EDIV_PATTERN =
// An anchor element with an arbitrary href and link text; nothing is captured.
122 + "<a\u0020href=\"[^\"]*\">[^<]*</a>"
126 private static final Pattern USERID_PATTERN =
// Capture group 1 is the content of the value attribute (may be empty).
130 + "value=\"([^\"]*)\""
132 private static final Pattern C_FORM_PATTERN =
137 * ログイン名までの認識を確認したのはF国のみ。
138 * @throws HtmlParseException パースエラー
140 private void parseLoginForm() throws HtmlParseException{
// Parses the login form and reports the account name when one is present.
141 setContextErrorMessage("lost login form");
// Alias a pooled range object; it will carry the position of the account name.
143 SeqRange accountRange = this.rangepool_1;
// Which of the two form layouts was found is decided by capture group 1.
145 boolean isLand_E_Form;
146 findAffirm(LOGINFORM_PATTERN);
147 if(isGroupMatched(1)){
148 isLand_E_Form = false;
149 }else{ // Land E login form detected
150 isLand_E_Form = true;
155 lookingAtAffirm(C_EDIV_PATTERN);
159 findAffirm(USERID_PATTERN);
// Capture group 1 of USERID_PATTERN is the content of the value attribute.
160 accountRange.setLastMatchedGroupRange(getMatcher(), 1);
// An empty value is not reported to the handler.
163 if(accountRange.length() > 0){
165 .loginName(getContent(), accountRange);
168 findAffirm(C_FORM_PATTERN);
// Pattern used by parseVillageInfo(), which reads group 1 as the village name,
// groups 2 and 3 as month and day, group 5 as the PM marker and groups 6 and 7
// as hour and minute.
175 private static final Pattern VILLAGEINFO_PATTERN =
177 "([^<]+?)" +SP_I // reluctant (shortest-match) quantifier
184 +"(?:(?:(午前)|(午後))\u0020)?" // optional AM/PM marker followed by a space
195 * @throws HtmlParseException パースエラー
197 private void parseVillageInfo() throws HtmlParseException{
// Parses the village name and the commit time and reports both to the handler.
// NOTE(review): this message says "lose" while the sibling methods say "lost ...";
// possibly a typo -- confirm nothing matches on the text before changing it.
198 setContextErrorMessage("lose village information");
// Alias a pooled range object; it will carry the position of the village name.
200 SeqRange villageRange = this.rangepool_1;
204 lookingAtAffirm(VILLAGEINFO_PATTERN);
205 villageRange.setLastMatchedGroupRange(getMatcher(), 1);
// Numeric fields are read from capture groups 2, 3, 6 and 7.
207 int month = parseGroupedInt(2);
208 int day = parseGroupedInt(3);
209 int hour = parseGroupedInt(6);
210 int minute = parseGroupedInt(7);
211 if(isGroupMatched(5)){ // PM specified
// Shift into 24-hour form; note that an input hour of 12 becomes 0 here.
// NOTE(review): confirm the pages never show 12 together with the PM marker.
212 hour = (hour + 12) % 24;
216 this.basicHandler.villageName(getContent(), villageRange);
217 this.basicHandler.commitTime(month, day, hour, minute);
// Patterns used by parsePeriodLink(); O_PARAG_PATTERN is also probed by parseAutomatic().
222 private static final Pattern O_PARAG_PATTERN = compile("<p>"); // opening paragraph tag
223 private static final Pattern PERIODLINK_PATTERN =
// Visible alternatives: a span with class "time", or an anchor whose href is captured.
226 + "<span\u0020class=\"time\">"
228 + "<a\u0020href=\"([^\"]*)\">"
233 private static final Pattern PERIOD_PATTERN =
243 private static final Pattern C_SPAN_PATTERN = compile("</span>"); // close tag chosen when group 1 of PERIODLINK_PATTERN matched
244 private static final Pattern C_ANCHOR_PATTERN = compile("</a>"); // close tag chosen when group 2 of PERIODLINK_PATTERN matched
248 * @throws HtmlParseException パースエラー
250 private void parsePeriodLink() throws HtmlParseException{
// Parses the period links that follow the opening paragraph tag and reports
// them to the handler via periodLink().
251 setContextErrorMessage("lost period link");
// Alias a pooled range object; it will carry the position of a link target.
253 SeqRange anchorRange = this.rangepool_1;
255 findAffirm(O_PARAG_PATTERN);
// The closing tag to expect depends on which opening form was matched.
259 Pattern closePattern;
// Start from an invalid range; it is only filled in for the anchor form below.
260 anchorRange.setInvalid();
263 lookingAtAffirm(PERIODLINK_PATTERN);
264 if(isGroupMatched(1)){
265 closePattern = C_SPAN_PATTERN;
266 }else if(isGroupMatched(2)){
267 closePattern = C_ANCHOR_PATTERN;
// Group 2 is the captured part of the anchor form.
268 anchorRange.setLastMatchedGroupRange(getMatcher(), 2);
269 }else if(isGroupMatched(3)){
274 throw buildParseException();
// Classify the entry by which capture group of PERIOD_PATTERN matched.
279 PeriodType periodType = null;
280 lookingAtAffirm(PERIOD_PATTERN);
281 if(isGroupMatched(1)){
282 periodType = PeriodType.PROLOGUE;
283 }else if(isGroupMatched(2)){
284 periodType = PeriodType.EPILOGUE;
285 }else if(isGroupMatched(3)){
287 }else if(isGroupMatched(4)){
288 periodType = PeriodType.PROGRESS;
// For a progress entry the day number is read from the same group 4.
289 day = parseGroupedInt(4);
292 throw buildParseException();
// Consume the closing tag selected above.
296 lookingAtAffirm(closePattern);
299 this.basicHandler.periodLink(getContent(),
// Patterns used by parseMessage() and dispatchFamily().
// Opening tag of a message div; the class may carry an optional " ch<digits>" suffix.
307 private static final Pattern O_MESSAGE_PATTERN =
308 compile("<div\u0020class=\"message(?:\u0020ch[0-9]+)?\">");
// Opening tag of the div whose id is "reload".
309 private static final Pattern O_RELOAD_PATTERN =
310 compile("<div\u0020id=\"reload\">");
311 private static final Pattern O_MSGKIND_PATTERN =
// One capture group per system-event div class: announce, order, extra.
314 +"<div\u0020class=\"(?:(announce)|(order)|(extra))\">"
// Optional anchor with a name attribute; nothing is captured from it.
317 +"(?:<a name=\"[^\"]*\">)?"
319 +"<span\u0020class=\"mes_no\">"
// Anchor whose name attribute is captured; class="ch_name" is optional.
325 +"<a\u0020name=\"([^\"]*)\"(?:\u0020class=\"ch_name\")?>"
328 private static final Pattern C_DIV_PATTERN = compile("</div>"); // closing div tag
332 * @throws HtmlParseException パースエラー
334 private void parseMessage() throws HtmlParseException{
// Walks over the message divs of the page; O_RELOAD_PATTERN is probed as well.
335 setContextErrorMessage("lost message");
// True until the first message has been located, so the first lookup can use
// the find variant instead of the lookingAt variant.
337 boolean skipGarbage = true;
345 matched = findProbe(O_MESSAGE_PATTERN); // first time only
347 matched = lookingAtProbe(O_MESSAGE_PATTERN);
350 matched = lookingAtProbe(O_RELOAD_PATTERN);
353 findAffirm(C_DIV_PATTERN);
363 lookingAtAffirm(C_DIV_PATTERN);
371 * イベント種別によって処理を振り分ける。
372 * @throws HtmlParseException パースエラー
374 private void dispatchFamily() throws HtmlParseException{
// Routes the current message to the system-event parser or the talk parser,
// depending on which capture group of O_MSGKIND_PATTERN matched.
// Alias a pooled range object; it is passed on to the talk parser.
377 SeqRange nameRange = this.rangepool_1;
379 lookingAtAffirm(O_MSGKIND_PATTERN);
// Groups 1 to 3 are the div classes announce, order and extra.
380 if(isGroupMatched(1)){
382 this.sysEventParser.parseAnnounce();
383 }else if(isGroupMatched(2)){
385 this.sysEventParser.parseOrder();
386 }else if(isGroupMatched(3)){
388 this.sysEventParser.parseExtra();
// Group 5 marks a talk message; its range is handed to the talk parser.
389 }else if(isGroupMatched(5)){
390 nameRange.setLastMatchedGroupRange(getMatcher(), 5);
// Group 4 is optional; when present it supplies the talk number.
392 if(isGroupMatched(4)){
393 talkNo = parseGroupedInt(4);
396 this.talkParser.parseTalk(talkNo, nameRange);
399 throw buildParseException();
// Patterns used by parseTopList().
// Opening tag of the table whose class attribute is "list".
405 private static final Pattern O_LISTTABLE_PATTERN =
406 compile("<table\u0020class=\"list\">");
407 private static final Pattern ACTIVEVILLAGE =
// Anchor of one village row: the href and the link text are both captured.
413 +"<a\u0020href=\"([^\"]*)\">([^<]*)</a>"
// A space, an opening strong tag and a full-width opening parenthesis.
414 +"\u0020<strong>\uff08"
415 +"(?:(?:(午前)|(午後))\u0020)?" // optional AM/PM marker followed by a space
434 * @throws HtmlParseException パースエラー
436 private void parseTopList() throws HtmlParseException{
// Parses the village table of the top page and reports each row via villageRecord().
437 setContextErrorMessage("lost village list");
// Two pooled range objects: one for the link target, one for the village name.
439 SeqRange anchorRange = this.rangepool_1;
440 SeqRange villageRange = this.rangepool_2;
// No list table on the page: nothing to report, and not treated as an error.
442 if( ! findProbe(O_LISTTABLE_PATTERN) ) return;
446 lookingAtAffirm(ACTIVEVILLAGE);
// A match of group 1 ends the row loop.
447 if(isGroupMatched(1)) break;
448 anchorRange .setLastMatchedGroupRange(getMatcher(), 2);
449 villageRange.setLastMatchedGroupRange(getMatcher(), 3);
450 int hour = parseGroupedInt(6);
// Group 5 shifts the hour into 24-hour form; an input hour of 12 becomes 0.
451 if(isGroupMatched(5)){
452 hour = (hour + 12) % 24;
454 int minute = parseGroupedInt(7);
// Map the matched status group to a village state. Groups 8 and 9 both map to PROLOGUE.
// NOTE(review): presumably two distinct pre-start status texts -- confirm against the pattern.
457 if(isGroupMatched(8)){
458 state = VillageState.PROLOGUE;
459 }else if(isGroupMatched(9)){
460 state = VillageState.PROLOGUE;
461 }else if(isGroupMatched(10)){
462 state = VillageState.PROGRESS;
463 }else if(isGroupMatched(11)){
464 state = VillageState.EPILOGUE;
465 }else if(isGroupMatched(12)){
466 state = VillageState.GAMEOVER;
469 throw buildParseException();
476 this.basicHandler.villageRecord(getContent(),
// Pattern used by parseLogList(). Capture group 1 is an href that starts with
// "index" and ends with "ready_0" or "000_ready".
486 private static final Pattern O_LISTLOG_PATTERN =
488 "<a\u0020href=\"(index[^\"]*(?:ready_0|000_ready))\">"
495 * @throws HtmlParseException パースエラー
497 private void parseLogList() throws HtmlParseException{
// Parses the list of village logs and reports each entry via villageRecord(),
// always with the state GAMEOVER.
498 setContextErrorMessage("lost village list");
// Two pooled range objects: one for the link target, one for the village name.
500 SeqRange anchorRange = this.rangepool_1;
501 SeqRange villageRange = this.rangepool_2;
// The first entry is searched for with the find variant, later ones with lookingAt.
503 boolean is1st = true;
507 matched = findProbe(O_LISTLOG_PATTERN);
510 matched = lookingAtProbe(O_LISTLOG_PATTERN);
// The first failed probe ends the loop; it is not treated as an error.
512 if( ! matched ) break;
514 anchorRange .setLastMatchedGroupRange(getMatcher(), 1);
515 villageRange.setLastMatchedGroupRange(getMatcher(), 2);
519 this.basicHandler.villageRecord(getContent(),
523 VillageState.GAMEOVER );
// Patterns used by parseTail(): the closing body tag and the closing html tag.
// C_HTML_PATTERN wraps "</html>" in SP_I on both sides.
// NOTE(review): SP_I is defined elsewhere, presumably optional whitespace -- confirm.
529 private static final Pattern C_BODY_PATTERN =
531 private static final Pattern C_HTML_PATTERN =
532 compile(SP_I+ "</html>" +SP_I);
536 * @throws HtmlParseException パースエラー
538 private void parseTail() throws HtmlParseException{
// Parses the end of the page: the closing body tag, then the closing html tag.
539 setContextErrorMessage("lost last part");
541 findAffirm(C_BODY_PATTERN);
// The matches variant is used here, unlike the find/lookingAt calls elsewhere.
// NOTE(review): presumably requires the whole remaining input to match -- confirm in AbstractParser.
544 matchesAffirm(C_HTML_PATTERN);
// Probed by parseAutomatic() immediately before it reports VILLAGELIST_PAGE.
550 private static final Pattern LISTTITLE_PATTERN =
554 * 人狼BBSのページ種別を自動認識しつつパースする。
555 * @param content パース対象の文字列
556 * @throws HtmlParseException パースエラー
558 public void parseAutomatic(DecodedContent content)
559 throws HtmlParseException{
// Public entry point: brackets the parse with startParse()/endParse() on the
// handler and reports the page type in between.
562 this.basicHandler.startParse(getContent());
// The page type is one of VILLAGELIST_PAGE, TOP_PAGE or PERIOD_PAGE; probes of
// LISTTITLE_PATTERN and O_PARAG_PATTERN take part in the decision.
568 if(lookingAtProbe(LISTTITLE_PATTERN)){
570 this.basicHandler.pageType(PageType.VILLAGELIST_PAGE);
575 if(lookingAtProbe(O_PARAG_PATTERN)){
577 this.basicHandler.pageType(PageType.TOP_PAGE);
580 this.basicHandler.pageType(PageType.PERIOD_PAGE);
589 this.basicHandler.endParse();