src/main/java/jp/sourceforge/jindolf/parser/EntityConverter.java

   1 /*
   2  * entity converter
   3  *
   4  * License : The MIT License
   5  * Copyright(c) 2009 olyutorskii
   6  */
   7
   8 package jp.sourceforge.jindolf.parser;
   9
  10 import java.util.regex.Matcher;
  11 import java.util.regex.Pattern;
  12
  13 /**
  14  * 人狼BBSで用いられる4種類のXHTML文字実体参照の
  15  * 解決を伴う{@link DecodedContent}の切り出しを行う。
  16  *
  17  * <p>文字実体参照は{@code &gt; &lt; &quot; &amp;}が対象。
  18  *
  19  * <p>U+005C(バックスラッシュ)をU+00A5(円通貨)に直す処理も行われる。
  20  * ※ 人狼BBSはShift_JIS(⊃JISX0201)で運営されているので、
  21  * バックスラッシュは登場しないはず。
  22  * ※ が、バックスラッシュを生成するShift_JISデコーダは存在する。
  23  *
  24  * <p>指示によりサロゲートペア上位下位の並びを
  25  * 単一疑問符?に直す処理も可能。
  26  * {@link java.lang.Character#MIN_SUPPLEMENTARY_CODE_POINT}
  27  * {@link java.lang.Character#MAX_CODE_POINT}
  28  *
  29  * <p>マルチスレッドには非対応。
  30  */
  31 public class EntityConverter{
  32
  33     private static final char   DQ_CH = '"';
  34     private static final String DQ_STR = Character.toString(DQ_CH);
  35     private static final String YEN_STR = "\u00a5";
  36
  37     private static final char   BS_CH = '\u005c\u005c';
  38     private static final String BS_STR = Character.toString(BS_CH);
  39     private static final String BS_PATTERN = BS_STR + BS_STR;
  40
  41     private static final String UCS4_PATTERN = "[\\x{10000}-\\x{10ffff}]";
  42
  43     private static final RegexRep[] VALUES_CACHE = RegexRep.values();
  44
  45
  46     private final Matcher matcher = RegexRep.buildMatcher();
  47     private final boolean replaceSmp;
  48
  49
  50     /**
  51      * コンストラクタ。
  52      * SMP面文字の代替処理は行われない。
  53      */
  54     public EntityConverter(){
  55         this(false);
  56         return;
  57     }
  58
  59     /**
  60      * コンストラクタ。
  61      * @param replaceSmp SMP面文字を代替処理するならtrue
  62      */
  63     public EntityConverter(boolean replaceSmp){
  64         super();
  65         this.replaceSmp = replaceSmp;
  66         return;
  67     }
  68
  69
  70     /**
  71      * 実体参照の変換を行う。
  72      * @param content 変換元文書
  73      * @return 切り出された変換済み文書
  74      */
  75     public DecodedContent convert(DecodedContent content){
  76         int startPos = 0;
  77         int endPos   = content.length();
  78         return append(null, content, startPos, endPos);
  79     }
  80
  81     /**
  82      * 実体参照の変換を行う。
  83      * @param content 変換元文書
  84      * @param range 範囲指定
  85      * @return 切り出された変換済み文書
  86      * @throws IndexOutOfBoundsException 位置指定に不正があった
  87      */
  88     public DecodedContent convert(DecodedContent content, SeqRange range)
  89             throws IndexOutOfBoundsException{
  90         int startPos = range.getStartPos();
  91         int endPos   = range.getEndPos();
  92         return append(null, content, startPos, endPos);
  93     }
  94
  95     /**
  96      * 実体参照の変換を行う。
  97      * @param content 変換元文書
  98      * @param startPos 開始位置
  99      * @param endPos 終了位置
 100      * @return 切り出された変換済み文書
 101      * @throws IndexOutOfBoundsException 位置指定に不正があった
 102      */
 103     public DecodedContent convert(DecodedContent content,
 104                                    int startPos, int endPos)
 105             throws IndexOutOfBoundsException{
 106         return append(null, content, startPos, endPos);
 107     }
 108
 109     /**
 110      * 実体参照の変換を行い既存のDecodedContentに追加を行う。
 111      * @param target 追加先文書。nullなら新たな文書が用意される。
 112      * @param content 変換元文書
 113      * @return targetもしくは新規に用意された文書
 114      * @throws IndexOutOfBoundsException 位置指定に不正があった
 115      */
 116     public DecodedContent  append(DecodedContent target,
 117                                    DecodedContent content)
 118             throws IndexOutOfBoundsException{
 119         int startPos = 0;
 120         int endPos   = content.length();
 121         return append(target, content, startPos, endPos);
 122     }
 123
 124     /**
 125      * 実体参照の変換を行い既存のDecodedContentに追加を行う。
 126      * @param target 追加先文書。nullなら新たな文書が用意される。
 127      * @param content 変換元文書
 128      * @param range 範囲指定
 129      * @return targetもしくは新規に用意された文書
 130      * @throws IndexOutOfBoundsException 位置指定に不正があった
 131      */
 132     public DecodedContent  append(DecodedContent target,
 133                                    DecodedContent content,
 134                                    SeqRange range )
 135             throws IndexOutOfBoundsException{
 136         int startPos = range.getStartPos();
 137         int endPos   = range.getEndPos();
 138         return append(target, content, startPos, endPos);
 139     }
 140
 141     /**
 142      * 実体参照の変換を行い既存のDecodedContentに追加を行う。
 143      * @param target 追加先文書。nullなら新たな文書が用意される。
 144      * @param content 変換元文書
 145      * @param startPos 開始位置
 146      * @param endPos 終了位置
 147      * @return targetもしくは新規に用意された文書
 148      * @throws IndexOutOfBoundsException 位置指定に不正があった
 149      */
 150     public DecodedContent append(DecodedContent target,
 151                                   DecodedContent content,
 152                                   int startPos, int endPos)
 153             throws IndexOutOfBoundsException{
 154         if(    startPos > endPos
 155             || startPos < 0
 156             || content.length() < endPos){
 157             throw new IndexOutOfBoundsException();
 158         }
 159
 160         DecodedContent result;
 161         if(target == null){
 162             int length = endPos - startPos;
 163             result = new DecodedContent(length);
 164         }else{
 165             result = target;
 166         }
 167
 168         this.matcher.reset(content.getRawContent());
 169         this.matcher.region(startPos, endPos);
 170
 171         int copiedPos = startPos;
 172         while(this.matcher.find()){
 173             int group = -1;
 174             int matchStart = -1;
 175             String altTxt = "";
 176             for(RegexRep rr : VALUES_CACHE){
 177                 group = rr.getGroupNo();
 178                 matchStart = this.matcher.start(group);
 179                 if(matchStart >= 0){
 180                     if(rr == RegexRep.UCS4 &&  ! this.replaceSmp){
 181                         altTxt = this.matcher.group(group);
 182                     }else{
 183                         altTxt = rr.getAltTxt();
 184                     }
 185                     break;
 186                 }
 187             }
 188             assert group >= 1;
 189             int matchEnd = this.matcher.end(group);
 190
 191             result.append(content, copiedPos, matchStart);
 192             result.append(altTxt);
 193
 194             copiedPos = matchEnd;
 195         }
 196         result.append(content, copiedPos, endPos);
 197
 198         this.matcher.reset("");
 199
 200         return result;
 201     }
 202
 203
 204     /**
 205      * 文字列置換リスト。
 206      */
 207     private static enum RegexRep{
 208
 209         GT   ("&gt;",       ">"),
 210         LT   ("&lt;",       "<"),
 211         AMP  ("&amp;",      "&"),
 212         QUAT ("&quot;",     DQ_STR),
 213         BS   (BS_PATTERN,   YEN_STR),
 214         UCS4 (UCS4_PATTERN, "?"),
 215         ;
 216
 217
 218         private final String regex;
 219         private final String altTxt;
 220
 221
 222         /**
 223          * コンストラクタ。
 224          * @param regex 置換元パターン正規表現
 225          * @param altTxt 置換文字列。
 226          */
 227         private RegexRep(String regex, String altTxt){
 228             this.regex = regex;
 229             this.altTxt = altTxt;
 230             return;
 231         }
 232
 233
 234         /**
 235          * 全正規表現をOR連結したパターンを生成する。
 236          * @return パターン
 237          */
 238         private static Pattern buildPattern(){
 239             StringBuilder orRegex = new StringBuilder();
 240
 241             for(RegexRep rr : values()){
 242                 if(orRegex.length() > 0) orRegex.append('|');
 243                 orRegex.append('(');
 244                 orRegex.append(rr.regex);
 245                 orRegex.append(')');
 246             }
 247
 248             Pattern result = Pattern.compile(orRegex.toString());
 249             return result;
 250         }
 251
 252         /**
 253          * マッチャを生成する。
 254          * @return マッチャ
 255          */
 256         private static Matcher buildMatcher(){
 257             Pattern pattern = buildPattern();
 258             Matcher result = pattern.matcher("");
 259             return result;
 260         }
 261
 262
 263         /**
 264          * 置換文字列を返す。
 265          * @return 置換文字列
 266          */
 267         private String getAltTxt(){
 268             return this.altTxt;
 269         }
 270
 271         /**
 272          * パターン内において占めるグループ番号を返す。
 273          * @return グループ番号
 274          */
 275         private int getGroupNo(){
 276             int group = ordinal() + 1;
 277             return group;
 278         }
 279
 280     }
 281
 282 }