--- /dev/null
+<?php
+
+class ENTITY
+{
+ /**
+  * ENTITY::hen
+  * htmlentities() wrapper that avoids encoding already-encoded entities twice.
+  *
+  * The string is first run through html_entity_decode() and the decoded
+  * result is then encoded with htmlentities(). Both calls receive the same
+  * quotation mode and the charset returned by i18n::get_current_charset().
+  *
+  * @static
+  * @access public
+  * @param string $string target string
+  * @param integer $quotation quotation mode (ENT_* constant), see the PHP built-in htmlentities()
+  * @return string escaped string
+  */
+ public static function hen($string, $quotation=ENT_QUOTES)
+ {
+     /*
+      * Decoding before encoding stands in for the 'double_encode' flag,
+      * which can replace this once PHP 5.2.2 and lower are no longer supported.
+      */
+     return (string) htmlentities(
+         html_entity_decode($string, $quotation, i18n::get_current_charset()),
+         $quotation,
+         i18n::get_current_charset()
+     );
+ }
+
+ /**
+  * ENTITY::hsc
+  * htmlspecialchars() wrapper that avoids encoding already-encoded entities twice.
+  *
+  * The string is first run through htmlspecialchars_decode() and the decoded
+  * result is then encoded with htmlspecialchars() using the charset returned
+  * by i18n::get_current_charset().
+  *
+  * NOTE: htmlspecialchars_decode() is an ASCII-to-ASCII conversion that only
+  * touches a handful of entities, so it is called without a charset.
+  *
+  * @static
+  * @access public
+  * @param string $string target string
+  * @param integer $quotation quotation mode (ENT_* constant), see the PHP built-in htmlspecialchars()
+  * @return string escaped string
+  */
+ public static function hsc($string, $quotation=ENT_QUOTES)
+ {
+     /*
+      * Decoding before encoding stands in for the 'double_encode' flag,
+      * which can replace this once PHP 5.2.2 and lower are no longer supported.
+      */
+     return (string) htmlspecialchars(
+         htmlspecialchars_decode($string, $quotation),
+         $quotation,
+         i18n::get_current_charset()
+     );
+ }
+
+ /**
+  * ENTITY::strip_tags()
+  * Remove HTML markup from a string.
+  *
+  * Goes further than the PHP built-in strip_tags(): <del>, <script> and
+  * <style> elements are removed together with their (non-empty) content,
+  * every tag boundary is padded with a space so neighbouring words do not
+  * run together, and the remaining whitespace is collapsed and trimmed.
+  *
+  * @static
+  * @param String $string target string
+  * @return String string with stripped tags
+  */
+ public static function strip_tags($string)
+ {
+     /* elements whose content must disappear along with the tags */
+     $discarded_elements = array(
+         '#<del[^>]*>.+<\/del[^>]*>#isU',
+         '#<script[^>]*>.+<\/script[^>]*>#isU',
+         '#<style[^>]*>.+<\/style[^>]*>#isU'
+     );
+     foreach ( $discarded_elements as $pattern )
+     {
+         $string = preg_replace($pattern, '', $string);
+     }
+
+     /* pad the angle brackets, '>' first and '<' second */
+     $string = preg_replace(array('#>#', '#<#'), array('> ', ' <'), $string);
+
+     /* unqualified call: this is the PHP built-in, not this method */
+     $plain = strip_tags($string);
+
+     return trim(preg_replace('#\s+#', ' ', $plain));
+ }
+
+ /**
+  * ENTITY::shorten()
+  * Shorten a text string to $maxlength characters.
+  * $suffix is appended when the text had to be cut (end length is <= $maxlength).
+  *
+  * The purpose is to limit the width of the string as rendered in a web browser,
+  * so the result depends on style sheet, browser's rendering scheme and client's
+  * system font.
+  *
+  * NOTE: In general, non-Latin glyphs such as Japanese, Chinese or Cyrillic are
+  * about twice as wide as Latin glyphs, but this is not always true, for example
+  * when rendered with a proportional font.
+  *
+  * Entities are decoded before measuring so that each one counts as a single
+  * character, and are put back afterwards. Lengths are measured with
+  * i18n::strlen() / i18n::substr().
+  *
+  * NOTE: restoring works per character, so a literal character that equals
+  * the decoded form of an entity found in the input is returned in its entity
+  * form as well.
+  *
+  * @static
+  * @param string $string target string, may contain HTML entities
+  * @param integer $maxlength maximum length of return string which includes suffix
+  * @param string $suffix appended as-is to a shortened string
+  * @return string
+  */
+ static public function shorten($string, $maxlength, $suffix)
+ {
+     $charset = i18n::get_current_charset();
+
+     /* 1. remember every entity of the input, keyed by its decoded form */
+     $restore = array();
+     $found = array();
+     if ( preg_match_all('#&[^&]+?;#', $string, $found) )
+     {
+         foreach ( array_unique($found[0]) as $encoded_entity )
+         {
+             $decoded_entity = html_entity_decode($encoded_entity, ENT_QUOTES, $charset);
+
+             /* skip what is not a real entity, and keep the first spelling seen */
+             if ( $decoded_entity === '' || $decoded_entity === $encoded_entity || isset($restore[$decoded_entity]) )
+             {
+                 continue;
+             }
+             $restore[$decoded_entity] = $encoded_entity;
+         }
+     }
+
+     /* 2. decode string */
+     $string = html_entity_decode($string, ENT_QUOTES, $charset);
+
+     /* 3. shorten string only when it does not fit, leaving room for the suffix */
+     $shortened = FALSE;
+     if ( i18n::strlen($string) > $maxlength )
+     {
+         $string = i18n::substr($string, 0, max(0, $maxlength - i18n::strlen($suffix)));
+         $shortened = TRUE;
+     }
+
+     /*
+      * 4. recover entities
+      * strtr() replaces in a single pass, so the '&' of an entity that was just
+      * restored is never encoded a second time.
+      */
+     if ( $restore )
+     {
+         $string = strtr($string, $restore);
+     }
+
+     /* 5. the suffix is added last so that it is not rewritten by step 4 */
+     if ( $shortened )
+     {
+         $string .= $suffix;
+     }
+
+     return $string;
+ }
+
+ /**
+  * ENTITY::highlight()
+  * highlights a specific query in a given HTML text (not within HTML tags)
+  *
+  * The text is split into tags and the text between them; only the text parts
+  * are searched. Each expression is matched case-insensitively. Empty
+  * expressions are ignored, and an expression that fails to compile leaves
+  * the text untouched instead of erasing it. With several expressions, they
+  * are applied one after another, so a later expression can also match text
+  * inserted by $highlight.
+  *
+  * @static
+  * @param string $text text to be highlighted
+  * @param string $expression regular expression to be matched (can be an array of expressions as well)
+  * @param string $highlight highlight to be used (use \\0 to indicate the matched expression)
+  * @return string
+  */
+ static public function highlight($text, $expression, $highlight)
+ {
+     if ( !$highlight || !$expression )
+     {
+         return $text;
+     }
+
+     /* build the PCRE patterns once, skipping empty expressions */
+     $patterns = array();
+     foreach ( (array) $expression as $regex )
+     {
+         if ( (string) $regex === '' )
+         {
+             continue;
+         }
+         /* escape every '#' that is not already escaped, because '#' is the delimiter */
+         $patterns[] = '#' . preg_replace('/(?<!\\\\)((?:\\\\\\\\)*)#/', '$1\\#', $regex) . '#i';
+     }
+     if ( !$patterns )
+     {
+         return $text;
+     }
+
+     /*
+      * odd indexes hold the tags, even indexes hold the text around them;
+      * nothing is lost when the text contains a stray '<' or '>'
+      */
+     $segments = preg_split('#(<[^>]+>)#', $text, -1, PREG_SPLIT_DELIM_CAPTURE);
+     if ( $segments === FALSE )
+     {
+         return $text;
+     }
+
+     $result = '';
+     foreach ( $segments as $index => $segment )
+     {
+         if ( $index % 2 === 0 )
+         {
+             foreach ( $patterns as $pattern )
+             {
+                 $replaced = preg_replace($pattern, $highlight, $segment);
+                 /* NULL means the pattern was rejected by PCRE: keep the segment */
+                 if ( $replaced !== NULL )
+                 {
+                     $segment = $replaced;
+                 }
+             }
+         }
+         $result .= $segment;
+     }
+     return $result;
+ }
+
+ /**
+  * ENTITY::anchor_footnoting()
+  * turn anchor elements into numbered footnotes
+  *
+  * Every <a href="...">label</a> whose label contains no markup is replaced
+  * by "label [n] ", and a list of "[n] url" lines is appended after a blank
+  * line. The remaining tags are then removed with the PHP built-in
+  * strip_tags(). A string without such anchors is returned unchanged.
+  *
+  * @static
+  * @param String $string strings which includes html elements
+  * @return String string with footnotes
+  */
+ static public function anchor_footnoting($string)
+ {
+     /* 1. detect anchor elements, one entry per anchor: [0] element, [1] href, [2] label */
+     $anchors = array();
+     if ( !preg_match_all('#<a\s[^>]*href=["\']([^"\']*)["\'][^>]*>([^<]*)</a>#i', $string, $anchors, PREG_SET_ORDER) )
+     {
+         return $string;
+     }
+
+     /* 2. add footnotes */
+     $footnotes = '';
+     $count = 1;
+     foreach ( $anchors as $anchor )
+     {
+         /*
+          * Anchors are listed in document order and the replacement text holds
+          * no '<', so the first occurrence left is always the current anchor.
+          */
+         $position = strpos($string, $anchor[0]);
+         if ( $position !== FALSE )
+         {
+             $string = substr_replace($string, "{$anchor[2]} [{$count}] ", $position, strlen($anchor[0]));
+         }
+         $footnotes .= "[{$count}] {$anchor[1]}\n";
+         $count++;
+     }
+
+     /* unqualified call: this is the PHP built-in strip_tags() */
+     return strip_tags($string . "\n\n" . $footnotes);
+ }
+
+ /*
+ * NOTE: Obsolete functions
+ */
+
+ /**
+  * ENTITY::named_to_numeric()
+  * convert named character references into the references listed in
+  * self::$entities['named_to_numeric']
+  *
+  * Every "&name" run, with its optional ";" and/or "=", is handed to
+  * ENTITY::_named(), which performs the lookup.
+  *
+  * NOTE: the former implementation relied on the PCRE "e" modifier, which
+  * PHP 7 removed; a callback is used instead.
+  *
+  * @static
+  * @deprecated
+  * @param String $string
+  * @return String
+  */
+ static public function named_to_numeric ($string)
+ {
+     return preg_replace_callback(
+         '/(&[0-9A-Za-z]+)(;?\=?|([^A-Za-z0-9\;\:\.\-\_]))/',
+         array(__CLASS__, '_cb_named'),
+         $string
+     );
+ }
+
+ /**
+  * ENTITY::normalize_numeric()
+  * rewrite decimal and hexadecimal numeric character references to the form
+  * "&#xHEX;" (upper-case digits, no leading zeros, terminated by ";") and then
+  * translate the result with self::$entities['Windows-1252']
+  *
+  * @static
+  * @deprecated
+  * @param String $string
+  * @return String
+  */
+ static public function normalize_numeric ($string)
+ {
+     $string = self::_canonical_hex_references($string);
+     return strtr($string, self::$entities['Windows-1252']);
+ }
+
+ /**
+  * ENTITY::numeric_to_utf8()
+  * replace decimal and hexadecimal numeric character references by the
+  * bytes returned by ENTITY::_hex_to_utf8()
+  *
+  * @static
+  * @deprecated
+  * @param String $string
+  * @return String
+  */
+ static public function numeric_to_utf8 ($string)
+ {
+     $string = self::_canonical_hex_references($string);
+     return preg_replace_callback('/&#x([0-9A-Fa-f]+);/', array(__CLASS__, '_cb_hex_to_utf8'), $string);
+ }
+
+ /**
+  * ENTITY::numeric_to_named()
+  * convert decimal and hexadecimal numeric character references into named character references
+  *
+  * Hexadecimal references are rewritten as decimal ones first; the flipped
+  * self::$entities['named_to_numeric'] table then maps them to names.
+  *
+  * @static
+  * @deprecated
+  * @param String $string
+  * @return String
+  */
+ static public function numeric_to_named ($string)
+ {
+     $string = preg_replace_callback('/&#[Xx]([0-9A-Fa-f]+)/', array(__CLASS__, '_cb_hex_to_decimal'), $string);
+     return strtr($string, array_flip(self::$entities['named_to_numeric']));
+ }
+
+ /**
+  * ENTITY::_canonical_hex_references()
+  * shared first stage of normalize_numeric() and numeric_to_utf8():
+  * decimal references become hexadecimal, then every hexadecimal reference
+  * gets upper-case digits, no leading zeros and a closing ";"
+  *
+  * @static
+  * @param String $string
+  * @return String
+  */
+ static private function _canonical_hex_references($string)
+ {
+     $string = preg_replace_callback('/&#([0-9]+)(;)?/', array(__CLASS__, '_cb_decimal_to_hex'), $string);
+     return preg_replace_callback(
+         '/&#[Xx](0)*([0-9A-Fa-f]+)(;?|([^A-Za-z0-9\;\:\.\-\_]))/',
+         array(__CLASS__, '_cb_canonical_hex'),
+         $string
+     );
+ }
+
+ /**
+  * callback of named_to_numeric(): $match[1] is "&name", $match[2] what follows it
+  *
+  * @static
+  * @param Array $match
+  * @return String
+  */
+ static private function _cb_named($match)
+ {
+     /* groups that did not take part in the match are missing from $match */
+     $extra = isset($match[2]) ? $match[2] : '';
+     $rest = isset($match[3]) ? $match[3] : '';
+     return self::_named($match[1], $extra) . $rest;
+ }
+
+ /**
+  * callback: "&#123" or "&#123;" to "&#x7b;"
+  *
+  * @static
+  * @param Array $match
+  * @return String
+  */
+ static private function _cb_decimal_to_hex($match)
+ {
+     return '&#x' . dechex((int) $match[1]) . ';';
+ }
+
+ /**
+  * callback: upper-case the hexadecimal digits captured in $match[2] and close with ";"
+  *
+  * @static
+  * @param Array $match
+  * @return String
+  */
+ static private function _cb_canonical_hex($match)
+ {
+     $rest = isset($match[4]) ? $match[4] : '';
+     return '&#x' . strtoupper($match[2]) . ';' . $rest;
+ }
+
+ /**
+  * callback: pass the hexadecimal digits of "&#xHEX;" to ENTITY::_hex_to_utf8()
+  *
+  * @static
+  * @param Array $match
+  * @return String
+  */
+ static private function _cb_hex_to_utf8($match)
+ {
+     return self::_hex_to_utf8($match[1]);
+ }
+
+ /**
+  * callback: "&#xHEX" to "&#DECIMAL"
+  *
+  * @static
+  * @param Array $match
+  * @return String
+  */
+ static private function _cb_hex_to_decimal($match)
+ {
+     return '&#' . hexdec($match[1]);
+ }
+
+ /**
+  * ENTITY::specialchars()
+  * escape the characters ", &, <, > and ' while leaving existing character
+  * references untouched
+  *
+  * References of the form "&name;", "&#123;" or "&#x7B;" are shielded with
+  * a placeholder, the five special characters are replaced, and the
+  * shielded references are put back.
+  *
+  * @static
+  * @deprecated
+  * @param String $string
+  * @param String $type 'xml' (default) or anything else; only changes how ' is escaped
+  * @return String
+  */
+ static public function specialchars ($string, $type = 'xml')
+ {
+     $specialchars = array(
+         '"' => '&quot;',
+         '&' => '&amp;',
+         '<' => '&lt;',
+         '>' => '&gt;'
+     );
+
+     /*
+      * &apos; is a named reference of XML/XHTML but not of HTML 4, hence the
+      * numeric form for every other type.
+      * NOTE(review): which literal belongs to which branch was reconstructed
+      * from XHTML 1.0 section C.16; confirm against the upstream file.
+      */
+     if ( $type != 'xml' )
+     {
+         $specialchars["'"] = '&#39;';
+     }
+     else
+     {
+         $specialchars["'"] = '&apos;';
+     }
+
+     /* shield existing references so that their '&' is not escaped again */
+     $string = preg_replace('/&(#?[Xx]?[0-9A-Za-z]+);/', "[[[ENTITY:\\1]]]", $string);
+     $string = strtr($string, $specialchars);
+     $string = preg_replace('/\[\[\[ENTITY\:([^\]]+)\]\]\]/', "&\\1;", $string);
+     return $string;
+ }
+
+ /**
+  * ENTITY::_hex_to_utf8()
+  * convert a code point given as hexadecimal digits into its UTF-8 byte sequence
+  *
+  * One to four bytes are produced depending on the size of the code point.
+  * A value above U+10FFFF, the last Unicode code point, gives an empty string.
+  *
+  * @static
+  * @deprecated
+  * @param String $s hexadecimal digits, without the "&#x" prefix and the ";"
+  * @return String UTF-8 bytes, or '' when the value is out of range
+  */
+ static public function _hex_to_utf8($s)
+ {
+     $c = hexdec($s);
+
+     if ( $c < 0x80 )
+     {
+         /* 0xxxxxxx */
+         return chr($c);
+     }
+
+     if ( $c < 0x800 )
+     {
+         /* 110xxxxx 10xxxxxx */
+         return chr(0xC0 | ($c >> 6))
+              . chr(0x80 | ($c & 0x3F));
+     }
+
+     if ( $c < 0x10000 )
+     {
+         /* 1110xxxx 10xxxxxx 10xxxxxx */
+         return chr(0xE0 | ($c >> 12))
+              . chr(0x80 | (($c >> 6) & 0x3F))
+              . chr(0x80 | ($c & 0x3F));
+     }
+
+     if ( $c <= 0x10FFFF )
+     {
+         /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
+         return chr(0xF0 | ($c >> 18))
+              . chr(0x80 | (($c >> 12) & 0x3F))
+              . chr(0x80 | (($c >> 6) & 0x3F))
+              . chr(0x80 | ($c & 0x3F));
+     }
+
+     /* not a Unicode code point: nothing to emit */
+     return '';
+ }
+
+ /**
+  * ENTITY::_named()
+  * look up one "&name" run in self::$entities['named_to_numeric']
+  *
+  * The longest leading part of $entity that is a key of the table is
+  * replaced by the table value followed by ";"; the unmatched rest of
+  * $entity is appended. When nothing matches, $entity and $extra are
+  * returned as they were. A run directly followed by "=" is never touched
+  * (presumably to keep URL query strings such as "&copy=1" intact).
+  *
+  * @static
+  * @deprecated
+  * @param String $entity "&" followed by letters and digits, without the closing ";"
+  * @param String $extra what followed $entity in the text: '', ';', '=' or ';='
+  * @return String
+  */
+ static public function _named($entity, $extra)
+ {
+     if ( $extra == '=' )
+     {
+         return $entity . '=';
+     }
+
+     /* a '=' captured after the ';' is not part of the reference: keep it */
+     $tail = ( substr($extra, -1) === '=' ) ? '=' : '';
+
+     /* try the whole run first, then ever shorter leading parts */
+     for ( $length = i18n::strlen($entity); $length > 0; $length-- )
+     {
+         $check = i18n::substr($entity, 0, $length);
+         if ( array_key_exists($check, self::$entities['named_to_numeric']) )
+         {
+             return self::$entities['named_to_numeric'][$check] . ';' . i18n::substr($entity, $length) . $tail;
+         }
+     }
+
+     /* unknown name: hand back exactly what was read */
+     return $entity . $extra;
+ }
+
+ /**
+ * ENTITY::$entities
+ *
+ * HTML 4.01 Specification
+ * @link http://www.w3.org/TR/html4/sgml/entities.html
+ * @see 24 Character entity references in HTML 4
+ *
+ * XHTML™ 1.0 The Extensible HyperText Markup Language (Second Edition)
+ * A Reformulation of HTML 4 in XML 1.0
+ * @link http://www.w3.org/TR/xhtml1/
+ * @see 4.12. Entity references as hex values
+ * @see C.16. The Named Character Reference &apos;
+ *
+ * @static
+ * @deprecated
+ */
+ static private $entities = array (
+ 'named_to_numeric' => array (
+ ' ' => ' ',
+ '¡' => '¡',
+ '¢' => '¢',
+ '£' => '£',
+ '¤' => '¤',
+ '¥' => '¥',
+ '¦' => '¦',
+ '§' => '§',
+ '¨' => '¨',
+ '©' => '©',
+ 'ª' => 'ª',
+ '«' => '«',
+ '¬' => '¬',
+ '­' => '­',
+ '®' => '®',
+ '¯' => '¯',
+ '°' => '°',
+ '±' => '±',
+ '²' => '²',
+ '³' => '³',
+ '´' => '´',
+ 'µ' => 'µ',
+ '¶' => '¶',
+ '·' => '·',
+ '¸' => '¸',
+ '¹' => '¹',
+ 'º' => 'º',
+ '»' => '»',
+ '¼' => '¼',
+ '½' => '½',
+ '¾' => '¾',
+ '¿' => '¿',
+ 'À' => 'À',
+ 'Á' => 'Á',
+ 'Â' => 'Â',
+ 'Ã' => 'Ã',
+ 'Ä' => 'Ä',
+ 'Å' => 'Å',
+ 'Æ' => 'Æ',
+ 'Ç' => 'Ç',
+ 'È' => 'È',
+ 'É' => 'É',
+ 'Ê' => 'Ê',
+ 'Ë' => 'Ë',
+ 'Ì' => 'Ì',
+ 'Í' => 'Í',
+ 'Î' => 'Î',
+ 'Ï' => 'Ï',
+ 'Ð' => 'Ð',
+ 'Ñ' => 'Ñ',
+ 'Ò' => 'Ò',
+ 'Ó' => 'Ó',
+ 'Ô' => 'Ô',
+ 'Õ' => 'Õ',
+ 'Ö' => 'Ö',
+ '×' => '×',
+ 'Ø' => 'Ø',
+ 'Ù' => 'Ù',
+ 'Ú' => 'Ú',
+ 'Û' => 'Û',
+ 'Ü' => 'Ü',
+ 'Ý' => 'Ý',
+ 'Þ' => 'Þ',
+ 'ß' => 'ß',
+ 'à' => 'à',
+ 'á' => 'á',
+ 'â' => 'â',
+ 'ã' => 'ã',
+ 'ä' => 'ä',
+ 'å' => 'å',
+ 'æ' => 'æ',
+ 'ç' => 'ç',
+ 'è' => 'è',
+ 'é' => 'é',
+ 'ê' => 'ê',
+ 'ë' => 'ë',
+ 'ì' => 'ì',
+ 'í' => 'í',
+ 'î' => 'î',
+ 'ï' => 'ï',
+ 'ð' => 'ð',
+ 'ñ' => 'ñ',
+ 'ò' => 'ò',
+ 'ó' => 'ó',
+ 'ô' => 'ô',
+ 'õ' => 'õ',
+ 'ö' => 'ö',
+ '÷' => '÷',
+ 'ø' => 'ø',
+ 'ù' => 'ù',
+ 'ú' => 'ú',
+ 'û' => 'û',
+ 'ü' => 'ü',
+ 'ý' => 'ý',
+ 'þ' => 'þ',
+ 'ÿ' => 'ÿ',
+ '&OElig' => 'Œ',
+ '&oelig' => 'å',
+ '&Scaron' => 'Š',
+ '&scaron' => 'š',
+ '&Yuml' => 'Ÿ',
+ '&circ' => 'ˆ',
+ '&tilde' => '˜',
+ '&esnp' => ' ',
+ '&emsp' => ' ',
+ '&thinsp' => ' ',
+ '&zwnj' => '‌',
+ '&zwj' => '‍',
+ '&lrm' => '‎',
+ '&rlm' => '‏',
+ '&ndash' => '–',
+ '&mdash' => '—',
+ '&lsquo' => '‘',
+ '&rsquo' => '’',
+ '&sbquo' => '‚',
+ '&ldquo' => '“',
+ '&rdquo' => '”',
+ '&bdquo' => '„',
+ '&dagger' => '†',
+ '&Dagger' => '‡',
+ '&permil' => '‰',
+ '&lsaquo' => '‹',
+ '&rsaquo' => '›',
+ '&euro' => '€',
+ '&fnof' => 'ƒ',
+ '&Alpha' => 'Α',
+ '&Beta' => 'Β',
+ '&Gamma' => 'Γ',
+ '&Delta' => 'Δ',
+ '&Epsilon' => 'Ε',
+ '&Zeta' => 'Ζ',
+ '&Eta' => 'Η',
+ '&Theta' => 'Θ',
+ '&Iota' => 'Ι',
+ '&Kappa' => 'Κ',
+ '&Lambda' => 'Λ',
+ '&Mu' => 'Μ',
+ '&Nu' => 'Ν',
+ '&Xi' => 'Ξ',
+ '&Omicron' => 'Ο',
+ '&Pi' => 'Π',
+ '&Rho' => 'Ρ',
+ '&Sigma' => 'Σ',
+ '&Tau' => 'Τ',
+ '&Upsilon' => 'Υ',
+ '&Phi' => 'Φ',
+ '&Chi' => 'Χ',
+ '&Psi' => 'Ψ',
+ '&Omega' => 'Ω',
+ '&alpha' => 'α',
+ '&beta' => 'β',
+ '&gamma' => 'γ',
+ '&delta' => 'δ',
+ '&epsilon' => 'ε',
+ '&zeta' => 'ζ',
+ '&eta' => 'η',
+ '&theta' => 'θ',
+ '&iota' => 'ι',
+ '&kappa' => 'κ',
+ '&lambda' => 'λ',
+ '&mu' => 'μ',
+ '&nu' => 'ν',
+ '&xi' => 'ξ',
+ '&omicron' => 'ο',
+ '&pi' => 'π',
+ '&rho' => 'ρ',
+ '&sigmaf' => 'ς',
+ '&sigma' => 'σ',
+ '&tau' => 'τ',
+ '&upsilon' => 'υ',
+ '&phi' => 'φ',
+ '&chi' => 'χ',
+ '&psi' => 'ψ',
+ '&omega' => 'ω',
+ '&thetasym' => 'ϑ',
+ '&upsih' => 'ϒ',
+ '&piv' => 'ϖ',
+ '&bull' => '•',
+ '&hellip' => '…',
+ '&prime' => '′',
+ '&Prime' => '″',
+ '&oline' => '‾',
+ '&frasl' => '⁄',
+ '&weierp' => '℘',
+ '&image' => 'ℑ',
+ '&real' => 'ℜ',
+ '&trade' => 'ℒ',
+ '&alefsym' => 'ℵ',
+ '&larr' => '←',
+ '&uarr' => '↑',
+ '&rarr' => '→',
+ '&darr' => '↓',
+ '&harr' => '↔',
+ '&crarr' => '↵',
+ '&lArr' => '⇐',
+ '&uArr' => '⇑',
+ '&rArr' => '⇒',
+ '&dArr' => '⇓',
+ '&hArr' => '⇔',
+ '&forall' => '∀',
+ '&part' => '∂',
+ '&exist' => '∃',
+ '&empty' => '∅',
+ '&nabla' => '∇',
+ '&isin' => '∈',
+ '¬in' => '∉',
+ '&ni' => '∋',
+ '&prod' => '∏',
+ '&sum' => '∑',
+ '&minus' => '−',
+ '&lowast' => '∗',
+ '&radic' => '√',
+ '&prop' => '∝',
+ '&infin' => '∞',
+ '&ang' => '∠',
+ '&and' => '∧',
+ '&or' => '∨',
+ '&cap' => '∩',
+ '&cup' => '∪',
+ '&int' => '∫',
+ '&there4' => '∴',
+ '&sim' => '∼',
+ '&cong' => '≅',
+ '&asymp' => '≈',
+ '&ne' => '≠',
+ '&equiv' => '≡',
+ '&le' => '≤',
+ '&ge' => '≥',
+ '&sub' => '⊂',
+ '&sup' => '⊃',
+ '&nsub' => '⊄',
+ '&sube' => '⊆',
+ '&supe' => '⊇',
+ '&oplus' => '⊕',
+ '&otimes' => '⊖',
+ '&perp' => '⊥',
+ '&sdot' => '⋅',
+ '&lceil' => '⍨',
+ '&rceil' => '⌉',
+ '&lfloor' => '⌊',
+ '&rfloor' => '⌋',
+ '&lang' => '〈',
+ '&rang' => '⌰',
+ '&loz' => '◊',
+ '&spades' => '♠',
+ '&clubs' => '♣',
+ '&hearts' => '♥',
+ '&diams' => '♦'
+ ),
+ 'Windows-1252' => array(
+ '€' => '€',
+ '‚' => '‚',
+ 'ƒ' => 'ƒ',
+ '„' => '„',
+ '…' => '…',
+ '†' => '†',
+ '‡' => '‡',
+ 'ˆ' => 'ˆ',
+ '‰' => '‰',
+ 'Š' => 'Š',
+ '‹' => '‹',
+ 'Œ' => 'Œ',
+ 'Ž' => 'Ž',
+ '‘' => '‘',
+ '’' => '’',
+ '“' => '“',
+ '”' => '”',
+ '•' => '•',
+ '–' => '–',
+ '—' => '—',
+ '˜' => '˜',
+ '™' => '™',
+ 'š' => 'š',
+ '›' => '›',
+ 'œ' => 'œ',
+ 'ž' => 'ž',
+ 'Ÿ' => 'Ÿ',
+ )
+ );
+}