OSDN Git Service

ea806971b511e51802a15b210218c97c911a5bc9
[jindolf/JinParser.git] / src / main / java / jp / sourceforge / jindolf / parser / HtmlParser.java
1 /*\r
2  * XHTML parser\r
3  *\r
4  * Copyright(c) 2009 olyutorskii\r
5  * $Id: HtmlParser.java 1021 2010-03-24 16:03:21Z olyutorskii $\r
6  */\r
7 \r
8 package jp.sourceforge.jindolf.parser;\r
9 \r
10 import java.util.regex.Pattern;\r
11 import jp.sourceforge.jindolf.corelib.PeriodType;\r
12 import jp.sourceforge.jindolf.corelib.VillageState;\r
13 \r
14 /**\r
15  * 人狼BBS各種XHTML文字列のパースを行いハンドラに通知する。\r
16  */\r
17 public class HtmlParser extends AbstractParser{\r
18 \r
19     private BasicHandler basicHandler;\r
20     private final TalkParser     talkParser     = new TalkParser(this);\r
21     private final SysEventParser sysEventParser = new SysEventParser(this);\r
22 \r
23     private final SeqRange rangepool_1 = new SeqRange();\r
24     private final SeqRange rangepool_2 = new SeqRange();\r
25 \r
26     /**\r
27      * コンストラクタ。\r
28      */\r
29     public HtmlParser(){\r
30         super();\r
31         return;\r
32     }\r
33 \r
34     /**\r
35      * {@link BasicHandler}ハンドラを登録する。\r
36      * @param basicHandler ハンドラ\r
37      */\r
38     public void setBasicHandler(BasicHandler basicHandler){\r
39         this.basicHandler = basicHandler;\r
40         return;\r
41     }\r
42 \r
43     /**\r
44      * {@link TalkHandler}ハンドラを登録する。\r
45      * @param talkHandler ハンドラ\r
46      */\r
47     public void setTalkHandler(TalkHandler talkHandler){\r
48         this.talkParser.setTalkHandler(talkHandler);\r
49         return;\r
50     }\r
51 \r
52     /**\r
53      * {@link SysEventHandler}ハンドラを登録する。\r
54      * @param handler ハンドラ\r
55      */\r
56     public void setSysEventHandler(SysEventHandler handler){\r
57         this.sysEventParser.setSysEventHandler(handler);\r
58         return;\r
59     }\r
60 \r
61     private static final Pattern XMLDECL_PATTERN =\r
62             compile("<\\?xml\u0020");\r
63     private static final Pattern O_HTML_PATTERN =\r
64             compile("<html\u0020");\r
65     private static final Pattern TITLE_PATTERN =\r
66             compile("<title>([^<]*)</title>");\r
67     private static final Pattern O_BODY_PATTERN =\r
68             compile("<body>");\r
69     private static final Pattern O_DIVMAIN_PATTERN =\r
70             compile("<div\u0020class=\"main\">");\r
71 \r
72     /**\r
73      * XHTML先頭部分のパース。\r
74      * @throws HtmlParseException パースエラー\r
75      */\r
76     private void parseHead() throws HtmlParseException{\r
77         setContextErrorMessage("lost head part");\r
78 \r
79         SeqRange titleRange = this.rangepool_1;\r
80 \r
81         lookingAtAffirm(XMLDECL_PATTERN);\r
82         shrinkRegion();\r
83 \r
84         findAffirm(O_HTML_PATTERN);\r
85         shrinkRegion();\r
86 \r
87         findAffirm(TITLE_PATTERN);\r
88         titleRange.setLastMatchedGroupRange(getMatcher(), 1);\r
89         shrinkRegion();\r
90 \r
91         this.basicHandler.pageTitle(getContent(), titleRange);\r
92 \r
93         findAffirm(O_BODY_PATTERN);\r
94         shrinkRegion();\r
95 \r
96         findAffirm(O_DIVMAIN_PATTERN);\r
97         shrinkRegion();\r
98 \r
99         return;\r
100     }\r
101 \r
102     private static final Pattern LOGINFORM_PATTERN =\r
103             compile(\r
104                   "("\r
105                     +"<form"\r
106                     +"\u0020" + "action=\"index\\.rb\""\r
107                     +"\u0020" + "method=\"post\""\r
108                     +"\u0020" + "class=\"login_form\""\r
109                     +">"\r
110                 + ")|("\r
111                     +"<div"\r
112                     +"\u0020" + "class=\"login_form\""\r
113                     +">"\r
114                 + ")"\r
115             );\r
116     private static final Pattern C_EDIV_PATTERN =\r
117             compile(\r
118                   SP_I\r
119                 + "<a\u0020href=\"[^\"]*\">[^<]*</a>"\r
120                 + SP_I\r
121                 + "</div>"\r
122             );\r
123     private static final Pattern USERID_PATTERN =\r
124             compile(\r
125                   "name=\"user_id\""\r
126                 + "\u0020"\r
127                 + "value=\"([^\"]*)\""\r
128             );\r
129     private static final Pattern C_FORM_PATTERN =\r
130             compile("</form>");\r
131 \r
132     /**\r
133      * ログインフォームのパース。\r
134      * ログイン名までの認識を確認したのはF国のみ。\r
135      * @throws HtmlParseException パースエラー\r
136      */\r
137     private void parseLoginForm() throws HtmlParseException{\r
138         setContextErrorMessage("lost login form");\r
139 \r
140         SeqRange accountRange = this.rangepool_1;\r
141 \r
142         boolean isLand_E_Form;\r
143         findAffirm(LOGINFORM_PATTERN);\r
144         if(isGroupMatched(1)){\r
145             isLand_E_Form = false;\r
146         }else{                         // E国ログインフォーム検出\r
147             isLand_E_Form = true;\r
148         }\r
149         shrinkRegion();\r
150 \r
151         if(isLand_E_Form){\r
152             lookingAtAffirm(C_EDIV_PATTERN);\r
153             shrinkRegion();\r
154             return;\r
155         }else{\r
156             findAffirm(USERID_PATTERN);\r
157             accountRange.setLastMatchedGroupRange(getMatcher(), 1);\r
158             shrinkRegion();\r
159 \r
160             if(accountRange.length() > 0){\r
161                 this.basicHandler\r
162                     .loginName(getContent(), accountRange);\r
163             }\r
164 \r
165             findAffirm(C_FORM_PATTERN);\r
166             shrinkRegion();\r
167         }\r
168 \r
169         return;\r
170     }\r
171 \r
172     private static final Pattern VILLAGEINFO_PATTERN =\r
173             compile(\r
174                  "([^<]+?)" +SP_I          // 最短一致数量子\r
175                 +"<strong>"\r
176                     +"\uff08"\r
177                     +"([0-9]+)"                       // 月\r
178                     +"/"\r
179                     +"([0-9]+)"                       // 日\r
180                     +"\u0020"\r
181                     +"(?:(?:(午前)|(午後))\u0020)?"  // AMPM\r
182                     +"([0-9]+)"                       // 時\r
183                     +"(?:時\u0020|\\:)"\r
184                     +"([0-9]+)"                       // 分\r
185                     +"分?\u0020に更新"\r
186                     +"\uff09"\r
187                 +"</strong>"\r
188             );\r
189 \r
190     /**\r
191      * 村に関する各種情報をパース。\r
192      * @throws HtmlParseException パースエラー\r
193      */\r
194     private void parseVillageInfo() throws HtmlParseException{\r
195         setContextErrorMessage("lose village information");\r
196 \r
197         SeqRange villageRange = this.rangepool_1;\r
198 \r
199         sweepSpace();\r
200 \r
201         lookingAtAffirm(VILLAGEINFO_PATTERN);\r
202         villageRange.setLastMatchedGroupRange(getMatcher(), 1);\r
203 \r
204         int month  = parseGroupedInt(2);\r
205         int day    = parseGroupedInt(3);\r
206         int hour   = parseGroupedInt(6);\r
207         int minute = parseGroupedInt(7);\r
208         if(isGroupMatched(5)){  // 午後指定\r
209             hour = (hour + 12) % 24;\r
210         }\r
211         shrinkRegion();\r
212 \r
213         this.basicHandler.villageName(getContent(), villageRange);\r
214         this.basicHandler.commitTime(month, day, hour, minute);\r
215 \r
216         return;\r
217     }\r
218 \r
219     private static final Pattern O_PARAG_PATTERN = compile("<p>");\r
220     private static final Pattern PERIODLINK_PATTERN =\r
221             compile(\r
222             "("\r
223                 + "<span\u0020class=\"time\">"\r
224             +")|(?:"\r
225                 + "<a\u0020href=\"([^\"]*)\">"\r
226             +")|("\r
227                 + "</p>"\r
228             +")"\r
229             );\r
230     private static final Pattern PERIOD_PATTERN =\r
231             compile(\r
232                 "(プロローグ)" +\r
233             "|"+\r
234                 "(エピローグ)" +\r
235             "|"+\r
236                 "(終了)" +\r
237             "|"+\r
238                 "([0-9]+)日目"\r
239             );\r
240     private static final Pattern C_SPAN_PATTERN   = compile("</span>");\r
241     private static final Pattern C_ANCHOR_PATTERN = compile("</a>");\r
242 \r
243     /**\r
244      * Period間リンクをパース。\r
245      * @throws HtmlParseException パースエラー\r
246      */\r
247     private void parsePeriodLink() throws HtmlParseException{\r
248         setContextErrorMessage("lost period link");\r
249 \r
250         SeqRange anchorRange = this.rangepool_1;\r
251 \r
252         findAffirm(O_PARAG_PATTERN);\r
253         shrinkRegion();\r
254 \r
255         for(;;){\r
256             Pattern closePattern;\r
257             anchorRange.setInvalid();\r
258 \r
259             sweepSpace();\r
260             lookingAtAffirm(PERIODLINK_PATTERN);\r
261             if(isGroupMatched(1)){\r
262                 closePattern = C_SPAN_PATTERN;\r
263             }else if(isGroupMatched(2)){\r
264                 closePattern = C_ANCHOR_PATTERN;\r
265                 anchorRange.setLastMatchedGroupRange(getMatcher(), 2);\r
266             }else if(isGroupMatched(3)){\r
267                 shrinkRegion();\r
268                 break;\r
269             }else{\r
270                 assert false;\r
271                 throw buildParseException();\r
272             }\r
273             shrinkRegion();\r
274 \r
275             int day = -1;\r
276             PeriodType periodType = null;\r
277             lookingAtAffirm(PERIOD_PATTERN);\r
278             if(isGroupMatched(1)){\r
279                 periodType = PeriodType.PROLOGUE;\r
280             }else if(isGroupMatched(2)){\r
281                 periodType = PeriodType.EPILOGUE;\r
282             }else if(isGroupMatched(3)){\r
283                 periodType = null;\r
284             }else if(isGroupMatched(4)){\r
285                 periodType = PeriodType.PROGRESS;\r
286                 day = parseGroupedInt(4);\r
287             }else{\r
288                 assert false;\r
289                 throw buildParseException();\r
290             }\r
291             shrinkRegion();\r
292 \r
293             lookingAtAffirm(closePattern);\r
294             shrinkRegion();\r
295 \r
296             this.basicHandler.periodLink(getContent(),\r
297                                          anchorRange,\r
298                                          periodType, day );\r
299         }\r
300 \r
301         return;\r
302     }\r
303 \r
304     private static final Pattern O_MESSAGE_PATTERN =\r
305             compile("<div\u0020class=\"message(?:\u0020ch[0-9]+)?\">");\r
306     private static final Pattern O_RELOAD_PATTERN =\r
307             compile("<div\u0020id=\"reload\">");\r
308     private static final Pattern O_MSGKIND_PATTERN =\r
309             compile(\r
310              "(?:"\r
311                 +"<div\u0020class=\"(?:(announce)|(order)|(extra))\">"\r
312             +")|(?:"\r
313                 +"(?:"\r
314                 +"(?:<a name=\"[^\"]*\">)?"\r
315                 +SP_I\r
316                 +"<span\u0020class=\"mes_no\">"\r
317                     +"([0-9]+)\\."\r
318                 +"</span>)?"\r
319                 +SP_I\r
320                 +"(?:</a>)?"\r
321                 +SP_I\r
322                 +"<a\u0020name=\"([^\"]*)\"(?:\u0020class=\"ch_name\")?>"\r
323             +")"\r
324             );\r
325     private static final Pattern C_DIV_PATTERN = compile("</div>");\r
326 \r
327     /**\r
328      * 各種メッセージをパース。\r
329      * @throws HtmlParseException パースエラー\r
330      */\r
331     private void parseMessage() throws HtmlParseException{\r
332         setContextErrorMessage("lost message");\r
333 \r
334         SeqRange nameRange = this.rangepool_1;\r
335 \r
336         boolean skipGarbage = true;\r
337 \r
338         for(;;){\r
339             sweepSpace();\r
340 \r
341             boolean matched;\r
342             if(skipGarbage){\r
343                 skipGarbage = false;\r
344                 matched = findProbe(O_MESSAGE_PATTERN); // 最初の1回のみ\r
345             }else{\r
346                 matched = lookingAtProbe(O_MESSAGE_PATTERN);\r
347             }\r
348             if( ! matched ){\r
349                 matched = lookingAtProbe(O_RELOAD_PATTERN);\r
350                 if(matched){\r
351                     shrinkRegion();\r
352                     findAffirm(C_DIV_PATTERN);\r
353                     shrinkRegion();\r
354                     continue;\r
355                 }\r
356                 break;\r
357             }\r
358             shrinkRegion();\r
359 \r
360             sweepSpace();\r
361 \r
362             lookingAtAffirm(O_MSGKIND_PATTERN);\r
363             if(isGroupMatched(1)){\r
364                 shrinkRegion();\r
365                 this.sysEventParser.parseAnnounce();\r
366             }else if(isGroupMatched(2)){\r
367                 shrinkRegion();\r
368                 this.sysEventParser.parseOrder();\r
369             }else if(isGroupMatched(3)){\r
370                 shrinkRegion();\r
371                 this.sysEventParser.parseExtra();\r
372             }else if(isGroupMatched(5)){\r
373                 nameRange.setLastMatchedGroupRange(getMatcher(), 5);\r
374                 int talkNo = -1;\r
375                 if(isGroupMatched(4)){\r
376                     talkNo = parseGroupedInt(4);\r
377                 }\r
378                 shrinkRegion();\r
379                 this.talkParser.parseTalk(talkNo, nameRange);\r
380             }else{\r
381                 assert false;\r
382                 throw buildParseException();\r
383             }\r
384 \r
385             lookingAtAffirm(C_DIV_PATTERN);\r
386             shrinkRegion();\r
387         }\r
388 \r
389         return;\r
390     }\r
391 \r
392     private static final Pattern O_LISTTABLE_PATTERN =\r
393             compile("<table\u0020class=\"list\">");\r
394     private static final Pattern ACTIVEVILLAGE =\r
395             compile(\r
396              "("\r
397                 +"</table>"\r
398             +")|(?:"\r
399                 +"<tr><td>"\r
400                 +"<a\u0020href=\"([^\"]*)\">([^<]*)</a>"\r
401                 +"\u0020<strong>\uff08"\r
402                     +"(?:(?:(午前)|(午後))\u0020)?"  // AMPM\r
403                     +"([0-9]+)"                       // 時\r
404                     +"(?:時\u0020|\\:)"\r
405                     +"([0-9]+)"                       // 分\r
406                     +"分?\u0020更新"\r
407                 +"\uff09</strong>"\r
408                 +"</td><td>(?:"\r
409                 +"[^<]*"\r
410                     + "(参加者募集中です。)"\r
411                     +"|(開始待ちです。)"\r
412                     +"|(進行中です。)"\r
413                     +"|(勝敗が決定しました。)"\r
414                     +"|(終了・ログ公開中。)"\r
415                 +")</td></tr>"\r
416             +")"\r
417             );\r
418 \r
419     /**\r
420      * トップページの村一覧表のパース。\r
421      * @throws HtmlParseException パースエラー\r
422      */\r
423     private void parseTopList() throws HtmlParseException{\r
424         setContextErrorMessage("lost village list");\r
425 \r
426         SeqRange anchorRange  = this.rangepool_1;\r
427         SeqRange villageRange = this.rangepool_2;\r
428 \r
429         if( ! findProbe(O_LISTTABLE_PATTERN) ) return;\r
430         shrinkRegion();\r
431 \r
432         for(;;){\r
433             lookingAtAffirm(ACTIVEVILLAGE);\r
434             if(isGroupMatched(1)) break;\r
435             anchorRange .setLastMatchedGroupRange(getMatcher(), 2);\r
436             villageRange.setLastMatchedGroupRange(getMatcher(), 3);\r
437             int hour = parseGroupedInt(6);\r
438             if(isGroupMatched(5)){\r
439                 hour = (hour + 12) % 24;\r
440             }\r
441             int minute = parseGroupedInt(7);\r
442 \r
443             VillageState state;\r
444             if(isGroupMatched(8)){\r
445                 state = VillageState.PROLOGUE;\r
446             }else if(isGroupMatched(9)){\r
447                 state = VillageState.PROLOGUE;\r
448             }else if(isGroupMatched(10)){\r
449                 state = VillageState.PROGRESS;\r
450             }else if(isGroupMatched(11)){\r
451                 state = VillageState.EPILOGUE;\r
452             }else if(isGroupMatched(12)){\r
453                 state = VillageState.GAMEOVER;\r
454             }else{\r
455                 assert false;\r
456                 throw buildParseException();\r
457             }\r
458 \r
459             shrinkRegion();\r
460 \r
461             sweepSpace();\r
462 \r
463             this.basicHandler.villageRecord(getContent(),\r
464                                             anchorRange,\r
465                                             villageRange,\r
466                                             hour, minute,\r
467                                             state );\r
468         }\r
469 \r
470         return;\r
471     }\r
472 \r
473     private static final Pattern O_LISTLOG_PATTERN =\r
474             compile(\r
475             "<a\u0020href=\"(index[^\"]*(?:ready_0|000_ready))\">"\r
476             +"([^<]*)"\r
477             +"</a><br\u0020/>"\r
478             );\r
479 \r
480     /**\r
481      * 村一覧ページのパース。\r
482      * @throws HtmlParseException パースエラー\r
483      */\r
484     private void parseLogList() throws HtmlParseException{\r
485         setContextErrorMessage("lost village list");\r
486 \r
487         SeqRange anchorRange  = this.rangepool_1;\r
488         SeqRange villageRange = this.rangepool_2;\r
489 \r
490         boolean is1st = true;\r
491         for(;;){\r
492             boolean matched;\r
493             if(is1st){\r
494                 matched = findProbe(O_LISTLOG_PATTERN);\r
495                 is1st = false;\r
496             }else{\r
497                 matched = lookingAtProbe(O_LISTLOG_PATTERN);\r
498             }\r
499             if( ! matched ) break;\r
500 \r
501             anchorRange .setLastMatchedGroupRange(getMatcher(), 1);\r
502             villageRange.setLastMatchedGroupRange(getMatcher(), 2);\r
503 \r
504             shrinkRegion();\r
505 \r
506             this.basicHandler.villageRecord(getContent(),\r
507                                             anchorRange,\r
508                                             villageRange,\r
509                                             -1, -1,\r
510                                             VillageState.GAMEOVER );\r
511         }\r
512 \r
513         return;\r
514     }\r
515 \r
516     private static final Pattern C_BODY_PATTERN =\r
517             compile("</body>");\r
518     private static final Pattern C_HTML_PATTERN =\r
519             compile(SP_I+ "</html>" +SP_I);\r
520 \r
521     /**\r
522      * XHTML末尾のパース。\r
523      * @throws HtmlParseException パースエラー\r
524      */\r
525     private void parseTail() throws HtmlParseException{\r
526         setContextErrorMessage("lost last part");\r
527 \r
528         findAffirm(C_BODY_PATTERN);\r
529         shrinkRegion();\r
530 \r
531         matchesAffirm(C_HTML_PATTERN);\r
532         shrinkRegion();\r
533 \r
534         return;\r
535     }\r
536 \r
537     private static final Pattern LISTTITLE_PATTERN =\r
538             compile("終了した村の記録");\r
539 \r
540     /**\r
541      * 人狼BBSのページ種別を自動認識しつつパースする。\r
542      * @param content パース対象の文字列\r
543      * @throws HtmlParseException パースエラー\r
544      */\r
545     public void parseAutomatic(DecodedContent content)\r
546             throws HtmlParseException{\r
547         setContent(content);\r
548 \r
549         this.basicHandler.startParse(getContent());\r
550 \r
551         parseHead();\r
552 \r
553         sweepSpace();\r
554 \r
555         if(lookingAtProbe(LISTTITLE_PATTERN)){\r
556             shrinkRegion();\r
557             this.basicHandler.pageType(PageType.VILLAGELIST_PAGE);\r
558             parseLogList();\r
559         }else{\r
560             parseLoginForm();\r
561             sweepSpace();\r
562             if(lookingAtProbe(O_PARAG_PATTERN)){\r
563                 shrinkRegion();\r
564                 this.basicHandler.pageType(PageType.TOP_PAGE);\r
565                 parseTopList();\r
566             }else{\r
567                 this.basicHandler.pageType(PageType.PERIOD_PAGE);\r
568                 parseVillageInfo();\r
569                 parsePeriodLink();\r
570                 parseMessage();\r
571             }\r
572         }\r
573 \r
574         parseTail();\r
575 \r
576         this.basicHandler.endParse();\r
577 \r
578         reset();\r
579 \r
580         return;\r
581     }\r
582 \r
583 }\r