OSDN Git Service

改行コード指定
[jindolf/JinParser.git] / src / main / java / jp / sourceforge / jindolf / parser / HtmlParser.java
1 /*
2  * XHTML parser
3  *
4  * License : The MIT License
5  * Copyright(c) 2009 olyutorskii
6  */
7
8 package jp.sourceforge.jindolf.parser;
9
10 import java.util.regex.Pattern;
11 import jp.sourceforge.jindolf.corelib.PeriodType;
12 import jp.sourceforge.jindolf.corelib.VillageState;
13
14 /**
15  * 人狼BBS各種XHTML文字列のパースを行いハンドラに通知する。
16  */
17 public class HtmlParser extends AbstractParser{
18
19     private static final String SP = "\u0020";
20
21
22     private BasicHandler basicHandler;
23     private final TalkParser     talkParser     = new TalkParser(this);
24     private final SysEventParser sysEventParser = new SysEventParser(this);
25
26     private final SeqRange rangepool_1 = new SeqRange();
27     private final SeqRange rangepool_2 = new SeqRange();
28
29     /**
30      * コンストラクタ。
31      */
32     public HtmlParser(){
33         super();
34         return;
35     }
36
37     /**
38      * {@link BasicHandler}ハンドラを登録する。
39      * @param basicHandler ハンドラ
40      */
41     public void setBasicHandler(BasicHandler basicHandler){
42         this.basicHandler = basicHandler;
43         return;
44     }
45
46     /**
47      * {@link TalkHandler}ハンドラを登録する。
48      * @param talkHandler ハンドラ
49      */
50     public void setTalkHandler(TalkHandler talkHandler){
51         this.talkParser.setTalkHandler(talkHandler);
52         return;
53     }
54
55     /**
56      * {@link SysEventHandler}ハンドラを登録する。
57      * @param handler ハンドラ
58      */
59     public void setSysEventHandler(SysEventHandler handler){
60         this.sysEventParser.setSysEventHandler(handler);
61         return;
62     }
63
64     private static final Pattern XMLDECL_PATTERN =
65             compile("<\\?xml\u0020");
66     private static final Pattern O_HTML_PATTERN =
67             compile("<html\u0020");
68     private static final Pattern TITLE_PATTERN =
69             compile("<title>([^<]*)</title>");
70     private static final Pattern O_BODY_PATTERN =
71             compile("<body>");
72     private static final Pattern O_DIVMAIN_PATTERN =
73             compile("<div\u0020class=\"main\">");
74
75     /**
76      * XHTML先頭部分のパース。
77      * @throws HtmlParseException パースエラー
78      */
79     private void parseHead() throws HtmlParseException{
80         setContextErrorMessage("lost head part");
81
82         SeqRange titleRange = this.rangepool_1;
83
84         lookingAtAffirm(XMLDECL_PATTERN);
85         shrinkRegion();
86
87         findAffirm(O_HTML_PATTERN);
88         shrinkRegion();
89
90         findAffirm(TITLE_PATTERN);
91         titleRange.setLastMatchedGroupRange(getMatcher(), 1);
92         shrinkRegion();
93
94         this.basicHandler.pageTitle(getContent(), titleRange);
95
96         findAffirm(O_BODY_PATTERN);
97         shrinkRegion();
98
99         findAffirm(O_DIVMAIN_PATTERN);
100         shrinkRegion();
101
102         return;
103     }
104
105     private static final Pattern LOGINFORM_PATTERN =
106             compile(
107                   "("
108                     +"<form"
109                     +SP + "action=\"index\\.rb\""
110                     +SP + "method=\"post\""
111                     +SP + "class=\"login_form\""
112                     +">"
113                 + ")|("
114                     +"<div"
115                     +SP + "class=\"login_form\""
116                     +">"
117                 + ")"
118             );
119     private static final Pattern C_EDIV_PATTERN =
120             compile(
121                   SP_I
122                 + "<a\u0020href=\"[^\"]*\">[^<]*</a>"
123                 + SP_I
124                 + "</div>"
125             );
126     private static final Pattern USERID_PATTERN =
127             compile(
128                   "name=\"user_id\""
129                 + SP
130                 + "value=\"([^\"]*)\""
131             );
132     private static final Pattern C_FORM_PATTERN =
133             compile("</form>");
134
135     /**
136      * ログインフォームのパース。
137      * ログイン名までの認識を確認したのはF国のみ。
138      * @throws HtmlParseException パースエラー
139      */
140     private void parseLoginForm() throws HtmlParseException{
141         setContextErrorMessage("lost login form");
142
143         SeqRange accountRange = this.rangepool_1;
144
145         boolean isLand_E_Form;
146         findAffirm(LOGINFORM_PATTERN);
147         if(isGroupMatched(1)){
148             isLand_E_Form = false;
149         }else{                         // E国ログインフォーム検出
150             isLand_E_Form = true;
151         }
152         shrinkRegion();
153
154         if(isLand_E_Form){
155             lookingAtAffirm(C_EDIV_PATTERN);
156             shrinkRegion();
157             return;
158         }else{
159             findAffirm(USERID_PATTERN);
160             accountRange.setLastMatchedGroupRange(getMatcher(), 1);
161             shrinkRegion();
162
163             if(accountRange.length() > 0){
164                 this.basicHandler
165                     .loginName(getContent(), accountRange);
166             }
167
168             findAffirm(C_FORM_PATTERN);
169             shrinkRegion();
170         }
171
172         return;
173     }
174
175     private static final Pattern VILLAGEINFO_PATTERN =
176             compile(
177                  "([^<]+?)" +SP_I          // 最短一致数量子
178                 +"<strong>"
179                     +"\uff08"
180                     +"([0-9]+)"                       // 月
181                     +"/"
182                     +"([0-9]+)"                       // 日
183                     +SP
184                     +"(?:(?:(午前)|(午後))\u0020)?"  // AMPM
185                     +"([0-9]+)"                       // 時
186                     +"(?:時\u0020|\\:)"
187                     +"([0-9]+)"                       // 分
188                     +"分?\u0020に更新"
189                     +"\uff09"
190                 +"</strong>"
191             );
192
193     /**
194      * 村に関する各種情報をパース。
195      * @throws HtmlParseException パースエラー
196      */
197     private void parseVillageInfo() throws HtmlParseException{
198         setContextErrorMessage("lose village information");
199
200         SeqRange villageRange = this.rangepool_1;
201
202         sweepSpace();
203
204         lookingAtAffirm(VILLAGEINFO_PATTERN);
205         villageRange.setLastMatchedGroupRange(getMatcher(), 1);
206
207         int month  = parseGroupedInt(2);
208         int day    = parseGroupedInt(3);
209         int hour   = parseGroupedInt(6);
210         int minute = parseGroupedInt(7);
211         if(isGroupMatched(5)){  // 午後指定
212             hour = (hour + 12) % 24;
213         }
214         shrinkRegion();
215
216         this.basicHandler.villageName(getContent(), villageRange);
217         this.basicHandler.commitTime(month, day, hour, minute);
218
219         return;
220     }
221
222     private static final Pattern O_PARAG_PATTERN = compile("<p>");
223     private static final Pattern PERIODLINK_PATTERN =
224             compile(
225             "("
226                 + "<span\u0020class=\"time\">"
227             +")|(?:"
228                 + "<a\u0020href=\"([^\"]*)\">"
229             +")|("
230                 + "</p>"
231             +")"
232             );
233     private static final Pattern PERIOD_PATTERN =
234             compile(
235                 "(プロローグ)" +
236             "|"+
237                 "(エピローグ)" +
238             "|"+
239                 "(終了)" +
240             "|"+
241                 "([0-9]+)日目"
242             );
243     private static final Pattern C_SPAN_PATTERN   = compile("</span>");
244     private static final Pattern C_ANCHOR_PATTERN = compile("</a>");
245
246     /**
247      * Period間リンクをパース。
248      * @throws HtmlParseException パースエラー
249      */
250     private void parsePeriodLink() throws HtmlParseException{
251         setContextErrorMessage("lost period link");
252
253         SeqRange anchorRange = this.rangepool_1;
254
255         findAffirm(O_PARAG_PATTERN);
256         shrinkRegion();
257
258         for(;;){
259             Pattern closePattern;
260             anchorRange.setInvalid();
261
262             sweepSpace();
263             lookingAtAffirm(PERIODLINK_PATTERN);
264             if(isGroupMatched(1)){
265                 closePattern = C_SPAN_PATTERN;
266             }else if(isGroupMatched(2)){
267                 closePattern = C_ANCHOR_PATTERN;
268                 anchorRange.setLastMatchedGroupRange(getMatcher(), 2);
269             }else if(isGroupMatched(3)){
270                 shrinkRegion();
271                 break;
272             }else{
273                 assert false;
274                 throw buildParseException();
275             }
276             shrinkRegion();
277
278             int day = -1;
279             PeriodType periodType = null;
280             lookingAtAffirm(PERIOD_PATTERN);
281             if(isGroupMatched(1)){
282                 periodType = PeriodType.PROLOGUE;
283             }else if(isGroupMatched(2)){
284                 periodType = PeriodType.EPILOGUE;
285             }else if(isGroupMatched(3)){
286                 periodType = null;
287             }else if(isGroupMatched(4)){
288                 periodType = PeriodType.PROGRESS;
289                 day = parseGroupedInt(4);
290             }else{
291                 assert false;
292                 throw buildParseException();
293             }
294             shrinkRegion();
295
296             lookingAtAffirm(closePattern);
297             shrinkRegion();
298
299             this.basicHandler.periodLink(getContent(),
300                                          anchorRange,
301                                          periodType, day );
302         }
303
304         return;
305     }
306
307     private static final Pattern O_MESSAGE_PATTERN =
308             compile("<div\u0020class=\"message(?:\u0020ch[0-9]+)?\">");
309     private static final Pattern O_RELOAD_PATTERN =
310             compile("<div\u0020id=\"reload\">");
311     private static final Pattern O_MSGKIND_PATTERN =
312             compile(
313              "(?:"
314                 +"<div\u0020class=\"(?:(announce)|(order)|(extra))\">"
315             +")|(?:"
316                 +"(?:"
317                 +"(?:<a name=\"[^\"]*\">)?"
318                 +SP_I
319                 +"<span\u0020class=\"mes_no\">"
320                     +"([0-9]+)\\."
321                 +"</span>)?"
322                 +SP_I
323                 +"(?:</a>)?"
324                 +SP_I
325                 +"<a\u0020name=\"([^\"]*)\"(?:\u0020class=\"ch_name\")?>"
326             +")"
327             );
328     private static final Pattern C_DIV_PATTERN = compile("</div>");
329
330     /**
331      * 各種メッセージをパース。
332      * @throws HtmlParseException パースエラー
333      */
334     private void parseMessage() throws HtmlParseException{
335         setContextErrorMessage("lost message");
336
337         boolean skipGarbage = true;
338
339         for(;;){
340             sweepSpace();
341
342             boolean matched;
343             if(skipGarbage){
344                 skipGarbage = false;
345                 matched = findProbe(O_MESSAGE_PATTERN); // 最初の1回のみ
346             }else{
347                 matched = lookingAtProbe(O_MESSAGE_PATTERN);
348             }
349             if( ! matched ){
350                 matched = lookingAtProbe(O_RELOAD_PATTERN);
351                 if(matched){
352                     shrinkRegion();
353                     findAffirm(C_DIV_PATTERN);
354                     shrinkRegion();
355                     continue;
356                 }
357                 break;
358             }
359             shrinkRegion();
360
361             dispatchFamily();
362
363             lookingAtAffirm(C_DIV_PATTERN);
364             shrinkRegion();
365         }
366
367         return;
368     }
369
370     /**
371      * イベント種別によって処理を振り分ける。
372      * @throws HtmlParseException パースエラー
373      */
374     private void dispatchFamily() throws HtmlParseException{
375         sweepSpace();
376
377         SeqRange nameRange = this.rangepool_1;
378
379         lookingAtAffirm(O_MSGKIND_PATTERN);
380         if(isGroupMatched(1)){
381             shrinkRegion();
382             this.sysEventParser.parseAnnounce();
383         }else if(isGroupMatched(2)){
384             shrinkRegion();
385             this.sysEventParser.parseOrder();
386         }else if(isGroupMatched(3)){
387             shrinkRegion();
388             this.sysEventParser.parseExtra();
389         }else if(isGroupMatched(5)){
390             nameRange.setLastMatchedGroupRange(getMatcher(), 5);
391             int talkNo = -1;
392             if(isGroupMatched(4)){
393                 talkNo = parseGroupedInt(4);
394             }
395             shrinkRegion();
396             this.talkParser.parseTalk(talkNo, nameRange);
397         }else{
398             assert false;
399             throw buildParseException();
400         }
401
402         return;
403     }
404
405     private static final Pattern O_LISTTABLE_PATTERN =
406             compile("<table\u0020class=\"list\">");
407     private static final Pattern ACTIVEVILLAGE =
408             compile(
409              "("
410                 +"</table>"
411             +")|(?:"
412                 +"<tr><td>"
413                 +"<a\u0020href=\"([^\"]*)\">([^<]*)</a>"
414                 +"\u0020<strong>\uff08"
415                     +"(?:(?:(午前)|(午後))\u0020)?"  // AMPM
416                     +"([0-9]+)"                       // 時
417                     +"(?:時\u0020|\\:)"
418                     +"([0-9]+)"                       // 分
419                     +"分?\u0020更新"
420                 +"\uff09</strong>"
421                 +"</td><td>(?:"
422                 +"[^<]*"
423                     + "(参加者募集中です。)"
424                     +"|(開始待ちです。)"
425                     +"|(進行中です。)"
426                     +"|(勝敗が決定しました。)"
427                     +"|(終了・ログ公開中。)"
428                 +")</td></tr>"
429             +")"
430             );
431
432     /**
433      * トップページの村一覧表のパース。
434      * @throws HtmlParseException パースエラー
435      */
436     private void parseTopList() throws HtmlParseException{
437         setContextErrorMessage("lost village list");
438
439         SeqRange anchorRange  = this.rangepool_1;
440         SeqRange villageRange = this.rangepool_2;
441
442         if( ! findProbe(O_LISTTABLE_PATTERN) ) return;
443         shrinkRegion();
444
445         for(;;){
446             lookingAtAffirm(ACTIVEVILLAGE);
447             if(isGroupMatched(1)) break;
448             anchorRange .setLastMatchedGroupRange(getMatcher(), 2);
449             villageRange.setLastMatchedGroupRange(getMatcher(), 3);
450             int hour = parseGroupedInt(6);
451             if(isGroupMatched(5)){
452                 hour = (hour + 12) % 24;
453             }
454             int minute = parseGroupedInt(7);
455
456             VillageState state;
457             if(isGroupMatched(8)){
458                 state = VillageState.PROLOGUE;
459             }else if(isGroupMatched(9)){
460                 state = VillageState.PROLOGUE;
461             }else if(isGroupMatched(10)){
462                 state = VillageState.PROGRESS;
463             }else if(isGroupMatched(11)){
464                 state = VillageState.EPILOGUE;
465             }else if(isGroupMatched(12)){
466                 state = VillageState.GAMEOVER;
467             }else{
468                 assert false;
469                 throw buildParseException();
470             }
471
472             shrinkRegion();
473
474             sweepSpace();
475
476             this.basicHandler.villageRecord(getContent(),
477                                             anchorRange,
478                                             villageRange,
479                                             hour, minute,
480                                             state );
481         }
482
483         return;
484     }
485
486     private static final Pattern O_LISTLOG_PATTERN =
487             compile(
488             "<a\u0020href=\"(index[^\"]*(?:ready_0|000_ready))\">"
489             +"([^<]*)"
490             +"</a><br\u0020/>"
491             );
492
493     /**
494      * 村一覧ページのパース。
495      * @throws HtmlParseException パースエラー
496      */
497     private void parseLogList() throws HtmlParseException{
498         setContextErrorMessage("lost village list");
499
500         SeqRange anchorRange  = this.rangepool_1;
501         SeqRange villageRange = this.rangepool_2;
502
503         boolean is1st = true;
504         for(;;){
505             boolean matched;
506             if(is1st){
507                 matched = findProbe(O_LISTLOG_PATTERN);
508                 is1st = false;
509             }else{
510                 matched = lookingAtProbe(O_LISTLOG_PATTERN);
511             }
512             if( ! matched ) break;
513
514             anchorRange .setLastMatchedGroupRange(getMatcher(), 1);
515             villageRange.setLastMatchedGroupRange(getMatcher(), 2);
516
517             shrinkRegion();
518
519             this.basicHandler.villageRecord(getContent(),
520                                             anchorRange,
521                                             villageRange,
522                                             -1, -1,
523                                             VillageState.GAMEOVER );
524         }
525
526         return;
527     }
528
529     private static final Pattern C_BODY_PATTERN =
530             compile("</body>");
531     private static final Pattern C_HTML_PATTERN =
532             compile(SP_I+ "</html>" +SP_I);
533
534     /**
535      * XHTML末尾のパース。
536      * @throws HtmlParseException パースエラー
537      */
538     private void parseTail() throws HtmlParseException{
539         setContextErrorMessage("lost last part");
540
541         findAffirm(C_BODY_PATTERN);
542         shrinkRegion();
543
544         matchesAffirm(C_HTML_PATTERN);
545         shrinkRegion();
546
547         return;
548     }
549
550     private static final Pattern LISTTITLE_PATTERN =
551             compile("終了した村の記録");
552
553     /**
554      * 人狼BBSのページ種別を自動認識しつつパースする。
555      * @param content パース対象の文字列
556      * @throws HtmlParseException パースエラー
557      */
558     public void parseAutomatic(DecodedContent content)
559             throws HtmlParseException{
560         setContent(content);
561
562         this.basicHandler.startParse(getContent());
563
564         parseHead();
565
566         sweepSpace();
567
568         if(lookingAtProbe(LISTTITLE_PATTERN)){
569             shrinkRegion();
570             this.basicHandler.pageType(PageType.VILLAGELIST_PAGE);
571             parseLogList();
572         }else{
573             parseLoginForm();
574             sweepSpace();
575             if(lookingAtProbe(O_PARAG_PATTERN)){
576                 shrinkRegion();
577                 this.basicHandler.pageType(PageType.TOP_PAGE);
578                 parseTopList();
579             }else{
580                 this.basicHandler.pageType(PageType.PERIOD_PAGE);
581                 parseVillageInfo();
582                 parsePeriodLink();
583                 parseMessage();
584             }
585         }
586
587         parseTail();
588
589         this.basicHandler.endParse();
590
591         reset();
592
593         return;
594     }
595
596 }