2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the "License");
7 * you may not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
10 * http://www.apache.org/licenses/LICENSE-2.0
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
19 * $Id: CharInfo.java 468654 2006-10-28 07:09:23Z minchau $
21 package org.apache.xml.serializer;
23 import java.io.BufferedReader;
24 import java.io.InputStream;
25 import java.io.InputStreamReader;
26 import java.io.UnsupportedEncodingException;
28 import java.util.Enumeration;
29 import java.util.HashMap;
30 import java.util.Hashtable;
31 import java.util.PropertyResourceBundle;
32 import java.util.ResourceBundle;
33 import java.security.AccessController;
34 import java.security.PrivilegedAction;
36 import javax.xml.transform.TransformerException;
38 import org.apache.xml.serializer.utils.MsgKey;
39 import org.apache.xml.serializer.utils.SystemIDResolver;
40 import org.apache.xml.serializer.utils.Utils;
41 import org.apache.xml.serializer.utils.WrappedRuntimeException;
44 * This class provides services that tell if a character should have
45 * special treatment, such as entity reference substitution or normalization
46 * of a newline character. It also provides character to entity reference
49 * DEVELOPERS: See Known Issue in the constructor.
/** Given a character, lookup a String to output (e.g. a decorated entity reference). */
private HashMap m_charToString;

/**
 * The name of the HTML entities file.
 * If specified, the file will be resource loaded with the default class loader.
 */
public static final String HTML_ENTITIES_RESOURCE =
    SerializerBase.PKG_NAME + ".HTMLEntities";

/**
 * The name of the XML entities file.
 * If specified, the file will be resource loaded with the default class loader.
 */
public static final String XML_ENTITIES_RESOURCE =
    SerializerBase.PKG_NAME + ".XMLEntities";

/** The horizontal tab character, which the parser should always normalize. */
static final char S_HORIZONAL_TAB = 0x09;

/** The linefeed character, which the parser should always normalize. */
static final char S_LINEFEED = 0x0A;

/** The carriage return character, which the parser should always normalize. */
static final char S_CARRIAGERETURN = 0x0D;

/** The space character (0x20). */
static final char S_SPACE = 0x20;

/** The quotation mark character (0x22). */
static final char S_QUOTE = 0x22;

/** The less-than character (0x3C). */
static final char S_LT = 0x3C;

/** The greater-than character (0x3E). */
static final char S_GT = 0x3E;

/** The character 0x85. */
static final char S_NEL = 0x85;

/** The character 0x2028. */
static final char S_LINE_SEPARATOR = 0x2028;

/**
 * This flag is an optimization for HTML entities. It is false if entities
 * other than quot (34), amp (38), lt (60) and gt (62) are defined
 * in the range 0 to 127.
 */
boolean onlyQuotAmpLtGt;

/** Size of the ASCII fast-path arrays: they cover the values 0, 1 ... ASCII_MAX - 1. */
static final int ASCII_MAX = 128;

/**
 * Array of values is faster access than a set of bits
 * to quickly check ASCII characters in attribute values,
 * the value is true if the character in an attribute value
 * should be mapped to a String.
 */
private final boolean[] shouldMapAttrChar_ASCII;

/**
 * Array of values is faster access than a set of bits
 * to quickly check ASCII characters in text nodes,
 * the value is true if the character in a text node
 * should be mapped to a String.
 */
private final boolean[] shouldMapTextChar_ASCII;

/**
 * An array of bits to record if the character is in the set.
 * The shouldMapAttrChar_ASCII and shouldMapTextChar_ASCII arrays
 * are consulted first for values below ASCII_MAX because access to
 * their values is common and faster.
 */
private final int array_of_bits[];

/**
 * This constant is used to shift an integer to quickly
 * calculate which element its bit is stored in.
 * 5 for 32 bit words (int), 6 for 64 bit words (long)
 */
private static final int SHIFT_PER_WORD = 5;

/**
 * A mask to get the low order bits which are used to
 * calculate the value of the bit within a given word,
 * that will represent the presence of the integer in the set.
 *
 * 0x1F for 32 bit words (int),
 * or 0x3F for 64 bit words (long)
 */
private static final int LOW_ORDER_BITMASK = 0x1f;

/**
 * This is used for optimizing the lookup of bits representing
 * the integers in the set. It is the index of the first element
 * in the array array_of_bits[] that is not used.
 */
private int firstWordNotUsed;
/**
 * A base constructor that explicitly creates every field except
 * m_charToString, which is handled by whoever delegates base
 * construction to this one.
 *
 * m_charToString is not created here only for performance reasons,
 * to avoid creating a HashMap that will be replaced when
 * making a mutable copy, {@link #mutableCopyOf(CharInfo)}.
 */
private CharInfo()
{
    this.array_of_bits = createEmptySetOfIntegers(65535);
    this.firstWordNotUsed = 0;
    this.shouldMapAttrChar_ASCII = new boolean[ASCII_MAX];
    this.shouldMapTextChar_ASCII = new boolean[ASCII_MAX];
    this.m_charKey = new CharKey();

    // m_charToString is not set here, but by the code that uses this constructor.

    this.onlyQuotAmpLtGt = true;
}
173 private CharInfo(String entitiesResource, String method, boolean internal)
175 // call the default constructor to create the fields
177 m_charToString = new HashMap();
179 ResourceBundle entities = null;
180 boolean noExtraEntities = true;
182 // Make various attempts to interpret the parameter as a properties
183 // file or resource file, as follows:
185 // 1) attempt to load .properties file using ResourceBundle
186 // 2) try using the class loader to find the specified file as a resource
188 // 3) try treating the resource as a URI
192 // Load entity property files by using PropertyResourceBundle,
193 // because of a security issue for applets
194 entities = PropertyResourceBundle.getBundle(entitiesResource);
195 } catch (Exception e) {}
198 if (entities != null) {
199 Enumeration keys = entities.getKeys();
200 while (keys.hasMoreElements()){
201 String name = (String) keys.nextElement();
202 String value = entities.getString(name);
203 int code = Integer.parseInt(value);
204 boolean extra = defineEntity(name, (char) code);
206 noExtraEntities = false;
209 InputStream is = null;
211 // Load user specified resource file by using URL loading, it
212 // requires a valid URI as parameter
215 is = CharInfo.class.getResourceAsStream(entitiesResource);
217 ClassLoader cl = ObjectFactory.findClassLoader();
219 is = ClassLoader.getSystemResourceAsStream(entitiesResource);
221 is = cl.getResourceAsStream(entitiesResource);
226 URL url = new URL(entitiesResource);
227 is = url.openStream();
228 } catch (Exception e) {}
233 throw new RuntimeException(
234 Utils.messages.createMessage(
235 MsgKey.ER_RESOURCE_COULD_NOT_FIND,
236 new Object[] {entitiesResource, entitiesResource}));
239 // Fix Bugzilla#4000: force reading in UTF-8
240 // This creates the de facto standard that Xalan's resource
241 // files must be encoded in UTF-8. This should work in all
244 // %REVIEW% KNOWN ISSUE: IT FAILS IN MICROSOFT VJ++, which
245 // didn't implement the UTF-8 encoding. Theoretically, we should
246 // simply let it fail in that case, since the JVM is obviously
247 // broken if it doesn't support such a basic standard. But
248 // since there are still some users attempting to use VJ++ for
249 // development, we have dropped in a fallback which makes a
250 // second attempt using the platform's default encoding. In VJ++
251 // this is apparently ASCII, which is subset of UTF-8... and
252 // since the strings we'll be reading here are also primarily
253 // limited to the 7-bit ASCII range (at least, in English
254 // versions of Xalan), this should work well enough to keep us
255 // on the air until we're ready to officially decommit from
258 BufferedReader reader;
260 reader = new BufferedReader(new InputStreamReader(is, "UTF-8"));
261 } catch (UnsupportedEncodingException e) {
262 reader = new BufferedReader(new InputStreamReader(is));
265 String line = reader.readLine();
267 while (line != null) {
268 if (line.length() == 0 || line.charAt(0) == '#') {
269 line = reader.readLine();
274 int index = line.indexOf(' ');
277 String name = line.substring(0, index);
281 if (index < line.length()) {
282 String value = line.substring(index);
283 index = value.indexOf(' ');
286 value = value.substring(0, index);
289 int code = Integer.parseInt(value);
291 boolean extra = defineEntity(name, (char) code);
293 noExtraEntities = false;
297 line = reader.readLine();
301 } catch (Exception e) {
302 throw new RuntimeException(
303 Utils.messages.createMessage(
304 MsgKey.ER_RESOURCE_COULD_NOT_LOAD,
305 new Object[] { entitiesResource,
313 } catch (Exception except) {}
318 onlyQuotAmpLtGt = noExtraEntities;
320 /* Now that we've used get(ch) just above to initialize the
321 * two arrays we will change by adding a tab to the set of
322 * special chars for XML (but not HTML!).
323 * We do this because a tab is always a
324 * special character in an XML attribute,
325 * but only a special character in XML text
326 * if it has an entity defined for it.
327 * This is the reason for this delay.
329 if (Method.XML.equals(method))
331 // We choose not to escape the quotation mark as &quot; in text nodes
332 shouldMapTextChar_ASCII[S_QUOTE] = false;
335 if (Method.HTML.equals(method)) {
336 // The XSLT 1.0 recommendation says
337 // "The html output method should not escape < characters occurring in attribute values."
338 // So we don't escape '<' in an attribute for HTML
339 shouldMapAttrChar_ASCII['<'] = false;
341 // We choose not to escape the quotation mark as &quot; in text nodes.
342 shouldMapTextChar_ASCII[S_QUOTE] = false;
347 * Defines a new character reference. The reference's name and value are
348 * supplied. Nothing happens if the character reference is already defined.
349 * <p>Unlike internal entities, character references are a string to single
350 * character mapping. They are used to map non-ASCII characters both on
351 * parsing and printing, primarily for HTML documents. '&lt;' is an
352 * example of a character reference.</p>
354 * @param name The entity's name
355 * @param value The entity's value
356 * @return true if the mapping is not one of:
360 * <li> '&' to "&"
361 * <li> '"' to """
364 private boolean defineEntity(String name, char value)
366 StringBuffer sb = new StringBuffer("&");
369 String entityString = sb.toString();
371 boolean extra = defineChar2StringMapping(entityString, value);
376 * A utility object, just used to map characters to output Strings,
377 * needed because a HashMap needs to map an object as a key, not a
378 * Java primitive type, like a char, so this object gets around that
379 * and it is reusable.
381 private final CharKey m_charKey;
384 * Map a character to a String. For example given
385 * the character '>' this method would return the fully decorated
386 * entity name "<".
387 * Strings for entity references are loaded from a properties file,
388 * but additional mappings defined through calls to defineChar2String()
389 * are possible. Such entity reference mappings could be over-ridden.
391 * This is reusing a stored key object, in an effort to avoid
392 * heap activity. Unfortunately, that introduces a threading risk.
393 * Simplest fix for now is to make it a synchronized method, or to give
394 * up the reuse; I see very little performance difference between them.
395 * Long-term solution would be to replace the hashtable with a sparse array
396 * keyed directly from the character's integer value; see DTM's
397 * string pool for a related solution.
399 * @param value The character that should be resolved to
400 * a String, e.g. resolve '>' to "<".
402 * @return The String that the character is mapped to, or null if not found.
403 * @xsl.usage internal
405 String getOutputStringForChar(char value)
407 // CharKey m_charKey = new CharKey(); //Alternative to synchronized
408 m_charKey.setChar(value);
409 return (String) m_charToString.get(m_charKey);
413 * Tell if the character argument that is from
414 * an attribute value has a mapping to a String.
416 * @param value the value of a character that is in an attribute value
417 * @return true if the character should have any special treatment,
418 * such as when writing out entity references.
419 * @xsl.usage internal
421 final boolean shouldMapAttrChar(int value)
423 // for performance try the values in the boolean array first,
424 // this is faster access than the BitSet for common ASCII values
426 if (value < ASCII_MAX)
427 return shouldMapAttrChar_ASCII[value];
429 // rather than java.util.BitSet, our private
430 // implementation is faster (and less general).
435 * Tell if the character argument that is from a
436 * text node has a mapping to a String, for example
437 * to map '<' to "<".
439 * @param value the value of a character that is in a text node
440 * @return true if the character has a mapping to a String,
441 * such as when writing out entity references.
442 * @xsl.usage internal
444 final boolean shouldMapTextChar(int value)
446 // for performance try the values in the boolean array first,
447 // this is faster access than the BitSet for common ASCII values
449 if (value < ASCII_MAX)
450 return shouldMapTextChar_ASCII[value];
452 // rather than java.util.BitSet, our private
453 // implementation is faster (and less general).
459 private static CharInfo getCharInfoBasedOnPrivilege(
460 final String entitiesFileName, final String method,
461 final boolean internal){
462 return (CharInfo) AccessController.doPrivileged(
463 new PrivilegedAction() {
464 public Object run() {
465 return new CharInfo(entitiesFileName,
471 * Factory that reads in a resource file that describes the mapping of
472 * characters to entity references.
474 * Resource files must be encoded in UTF-8 and have a format like:
476 * # First char # is a comment
477 * Entity numericValue
481 * (Note: Why don't we just switch to .properties files? Oct-01 -sc)
483 * @param entitiesResource Name of entities resource file that should
484 * be loaded, which describes that mapping of characters to entity references.
485 * @param method the output method type, which should be one of "xml", "html", "text"...
487 * @xsl.usage internal
489 static CharInfo getCharInfo(String entitiesFileName, String method)
491 CharInfo charInfo = (CharInfo) m_getCharInfoCache.get(entitiesFileName);
492 if (charInfo != null) {
493 return mutableCopyOf(charInfo);
496 // try to load it internally - cache
498 charInfo = getCharInfoBasedOnPrivilege(entitiesFileName,
500 // Put the common copy of charInfo in the cache, but return
502 m_getCharInfoCache.put(entitiesFileName, charInfo);
503 return mutableCopyOf(charInfo);
504 } catch (Exception e) {}
506 // try to load it externally - do not cache
508 return getCharInfoBasedOnPrivilege(entitiesFileName,
510 } catch (Exception e) {}
512 String absoluteEntitiesFileName;
514 if (entitiesFileName.indexOf(':') < 0) {
515 absoluteEntitiesFileName =
516 SystemIDResolver.getAbsoluteURIFromRelative(entitiesFileName);
519 absoluteEntitiesFileName =
520 SystemIDResolver.getAbsoluteURI(entitiesFileName, null);
521 } catch (TransformerException te) {
522 throw new WrappedRuntimeException(te);
526 return getCharInfoBasedOnPrivilege(entitiesFileName,
531 * Create a mutable copy of the cached one.
532 * @param charInfo The cached one.
535 private static CharInfo mutableCopyOf(CharInfo charInfo) {
536 CharInfo copy = new CharInfo();
538 int max = charInfo.array_of_bits.length;
539 System.arraycopy(charInfo.array_of_bits,0,copy.array_of_bits,0,max);
541 copy.firstWordNotUsed = charInfo.firstWordNotUsed;
543 max = charInfo.shouldMapAttrChar_ASCII.length;
544 System.arraycopy(charInfo.shouldMapAttrChar_ASCII,0,copy.shouldMapAttrChar_ASCII,0,max);
546 max = charInfo.shouldMapTextChar_ASCII.length;
547 System.arraycopy(charInfo.shouldMapTextChar_ASCII,0,copy.shouldMapTextChar_ASCII,0,max);
549 // utility field copy.m_charKey is already created in the default constructor
551 copy.m_charToString = (HashMap) charInfo.m_charToString.clone();
553 copy.onlyQuotAmpLtGt = charInfo.onlyQuotAmpLtGt;
559 * Table of user-specified char infos.
560 * The table maps entify file names (the name of the
561 * property file without the .properties extension)
562 * to CharInfo objects populated with entities defined in
563 * corresponding property file.
565 private static Hashtable m_getCharInfoCache = new Hashtable();
568 * Returns the array element holding the bit value for the
570 * @param i the integer that might be in the set of integers
573 private static int arrayIndex(int i) {
574 return (i >> SHIFT_PER_WORD);
578 * For a given integer in the set it returns the single bit
579 * value used within a given word that represents whether
580 * the integer is in the set or not.
582 private static int bit(int i) {
583 int ret = (1 << (i & LOW_ORDER_BITMASK));
588 * Creates a new empty set of integers (characters)
589 * @param max the maximum integer to be in the set.
591 private int[] createEmptySetOfIntegers(int max) {
592 firstWordNotUsed = 0; // an optimization
594 int[] arr = new int[arrayIndex(max - 1) + 1];
600 * Adds the integer (character) to the set of integers.
601 * @param i the integer to add to the set, valid values are
602 * 0, 1, 2 ... up to the maximum that was specified at
603 * the creation of the set.
605 private final void set(int i) {
606 setASCIItextDirty(i);
607 setASCIIattrDirty(i);
609 int j = (i >> SHIFT_PER_WORD); // this word is used
612 if(firstWordNotUsed < k) // for optimization purposes.
613 firstWordNotUsed = k;
615 array_of_bits[j] |= (1 << (i & LOW_ORDER_BITMASK));
620 * Return true if the integer (character)is in the set of integers.
622 * This implementation uses an array of integers with 32 bits per
623 * integer. If a bit is set to 1 the corresponding integer is
624 * in the set of integers.
626 * @param i an integer that is tested to see if it is the
627 * set of integers, or not.
629 private final boolean get(int i) {
631 boolean in_the_set = false;
632 int j = (i >> SHIFT_PER_WORD); // wordIndex(i)
633 // an optimization here, ... a quick test to see
634 // if this integer is beyond any of the words in use
635 if(j < firstWordNotUsed)
636 in_the_set = (array_of_bits[j] &
637 (1 << (i & LOW_ORDER_BITMASK))
638 ) != 0; // 0L for 64 bit words
643 * This method returns true if there are some non-standard mappings to
644 * entities other than quot, amp, lt, gt, and its only purpose is for
646 * @param charToMap The value of the character that is mapped to a String
647 * @param outputString The String to which the character is mapped, usually
648 * an entity reference such as "<".
649 * @return true if the mapping is not one of:
653 * <li> '&' to "&"
654 * <li> '"' to """
657 private boolean extraEntity(String outputString, int charToMap)
659 boolean extra = false;
660 if (charToMap < ASCII_MAX)
665 if (!outputString.equals("""))
669 if (!outputString.equals("&"))
673 if (!outputString.equals("<"))
677 if (!outputString.equals(">"))
680 default : // other entity in range 0 to 127
688 * If the character is in the ASCII range then
689 * mark it as needing replacement with
690 * a String on output if it occurs in a text node.
693 private void setASCIItextDirty(int j)
695 if (0 <= j && j < ASCII_MAX)
697 shouldMapTextChar_ASCII[j] = true;
702 * If the character is in the ASCII range then
703 * mark it as needing replacement with
704 * a String on output if it occurs in a attribute value.
707 private void setASCIIattrDirty(int j)
709 if (0 <= j && j < ASCII_MAX)
711 shouldMapAttrChar_ASCII[j] = true;
717 * Call this method to register a char to String mapping, for example
718 * to map '<' to "<".
719 * @param outputString The String to map to.
720 * @param inputChar The char to map from.
721 * @return true if the mapping is not one of:
725 * <li> '&' to "&"
726 * <li> '"' to """
729 boolean defineChar2StringMapping(String outputString, char inputChar)
731 CharKey character = new CharKey(inputChar);
732 m_charToString.put(character, outputString);
733 set(inputChar); // mark the character has having a mapping to a String
735 boolean extraMapping = extraEntity(outputString, inputChar);
741 * Simple class for fast lookup of char values, when used with
742 * hashtables. You can set the char, then use it as a key.
744 * @xsl.usage internal
746 private static class CharKey extends Object
753 * Constructor CharKey
755 * @param key char value of this object.
757 public CharKey(char key)
763 * Default constructor for a CharKey.
765 * @param key char value of this object.
772 * Get the hash value of the character.
774 * @return hash value of the character.
776 public final void setChar(char c)
784 * Get the hash value of the character.
786 * @return hash value of the character.
788 public final int hashCode()
794 * Override of equals() for this object
796 * @param obj to compare to
798 * @return True if this object equals this string value
800 public final boolean equals(Object obj)
802 return ((CharKey)obj).m_char == m_char;