11 * @param string $string target string
12 * @param string $quotation quotation mode. please refer to the argument of PHP built-in htmlentities
13 * @return string escaped string
static public function hen($string, $quotation=ENT_QUOTES)
{
	/*
	 * The input is decoded before it is encoded, so entities that are
	 * already present are not escaped a second time.
	 * The 'double_encode' flag of htmlentities() can replace this step
	 * once support for PHP 5.2.2 or lower is dropped.
	 */
	return (string) htmlentities(
		html_entity_decode($string, $quotation, i18n::get_current_charset()),
		$quotation,
		i18n::get_current_charset()
	);
}
/**
 * Entity::hsc()
 * htmlspecialchars wrapper
 *
 * NOTE: htmlspecialchars_decode() is an ASCII-to-ASCII conversion
 * and its target strings consist of only several letters,
 * so decoding without a charset argument causes no problems.
 *
 * @static
 * @param	string	$string	target string
 * @param	integer	$quotation	quotation mode. please refer to the argument of PHP built-in htmlspecialchars
 * @return	string	escaped string
 */
static public function hsc($string, $quotation=ENT_QUOTES)
{
	/*
	 * Decode first, then encode, so already-escaped input is not escaped twice.
	 * The 'double_encode' flag of htmlspecialchars() can replace this step
	 * once support for PHP 5.2.2 or lower is dropped.
	 */
	return (string) htmlspecialchars(
		htmlspecialchars_decode($string, $quotation),
		$quotation,
		i18n::get_current_charset()
	);
}
/**
 * Entity::strip_tags()
 * Strip HTML tags from a string
 *
 * Goes further than a plain call to PHP's strip_tags(): the contents of
 * <del>, <script> and <style> elements are removed together with their
 * tags, and runs of whitespace are collapsed into single spaces.
 *
 * @static
 * @param	String	$string	target string
 * @return	String	string with stripped tags
 */
static public function strip_tags($string)
{
	/* elements that are removed together with their contents */
	$dropped_elements = array(
		"#<del[^>]*>.+<\/del[^>]*>#isU",
		"#<script[^>]*>.+<\/script[^>]*>#isU",
		"#<style[^>]*>.+<\/style[^>]*>#isU"
	);
	$string = preg_replace($dropped_elements, '', $string);

	/* pad every bracket with a space so words split by a tag do not merge */
	$string = str_replace(array('>', '<'), array('> ', ' <'), $string);

	/* the unqualified call resolves to PHP's built-in strip_tags() */
	$string = strip_tags($string);

	/* collapse whitespace produced above and trim both ends */
	return trim(preg_replace("#\s+#", " ", $string));
}
/**
 * Entity::shorten()
 * shortens a text string to $maxlength.
 * $suffix is appended when the string had to be cut (end length is <= $maxlength
 * as long as $suffix itself is not longer than $maxlength).
 *
 * The purpose is to limit the width of the string rendered in a web browser,
 * so the visible result depends on style sheet, rendering scheme and fonts.
 *
 * NOTE: In general, non-Latin glyphs such as Japanese, Chinese or Cyrillic are
 * about twice as wide as Latin glyphs, but this does not always hold,
 * for example with proportional fonts.
 *
 * HTML entities are counted as one character each: the string is decoded,
 * cut, and the entities that were present in the input are written back.
 *
 * @static
 * @param	string	$string	target string (may contain HTML entities)
 * @param	integer	$maxlength	maximum length of return string which includes suffix
 * @param	string	$suffix	added in the end of shortened-string
 * @return	string	shortened string
 */
static public function shorten($string, $maxlength, $suffix)
{
	$charset = i18n::get_current_charset();

	/* 1. remember every entity of the input as "decoded character => entity" */
	$recover = array();
	$found = array();
	if ( preg_match_all('#&[^&]+?;#', $string, $found) )
	{
		foreach ( array_unique($found[0]) as $encoded_entity )
		{
			$decoded_entity = html_entity_decode($encoded_entity, ENT_QUOTES, $charset);

			/* skip text that merely looks like an entity and did not decode */
			if ( $decoded_entity === '' || $decoded_entity === $encoded_entity )
			{
				continue;
			}

			/* first spelling wins when several entities decode to the same character */
			if ( !array_key_exists($decoded_entity, $recover) )
			{
				$recover[$decoded_entity] = $encoded_entity;
			}
		}
	}

	/* 2. decode string */
	$string = html_entity_decode($string, ENT_QUOTES, $charset);

	/* 3. shorten string if it is longer than the room left beside the suffix */
	$limit = max(0, $maxlength - i18n::strlen($suffix));
	$shortened = ( i18n::strlen($string) > $limit );
	if ( $shortened )
	{
		$string = i18n::substr($string, 0, $limit);
	}

	/*
	 * 4. recover entities
	 * strtr() replaces in a single pass and never rescans its own output,
	 * so a restored "&lt;" cannot be turned into "&amp;lt;" afterwards.
	 */
	if ( $recover )
	{
		$string = strtr($string, $recover);
	}

	/* the suffix is appended last so it is emitted exactly as given */
	if ( $shortened )
	{
		$string .= $suffix;
	}

	return $string;
}
/**
 * Entity::highlight()
 * highlights a specific query in a given HTML text (not within HTML tags)
 *
 * The text is split into "tag + text that follows it" pairs; only the text
 * parts are run through the expressions, tags are copied verbatim.
 *
 * @static
 * @param	string	$text	text to be highlighted
 * @param	string	$expression	regular expression to be matched (can be an array of expressions as well)
 * @param	string	$highlight	highlight to be used (use \\0 to indicate the matched expression)
 * @return	string	text with the matches wrapped by $highlight
 */
static public function highlight($text, $expression, $highlight)
{
	if ( !$highlight || !$expression )
	{
		return $text;
	}

	/* one code path for both a single expression and a list of them */
	$patterns = array();
	foreach ( (array) $expression as $regex )
	{
		/* an empty expression would match between every character */
		if ( (string) $regex !== '' )
		{
			$patterns[] = "#{$regex}#i";
		}
	}
	if ( !$patterns )
	{
		return $text;
	}

	/* dummy leading tag so text before the first real tag is captured too */
	$text = "<!--h-->{$text}";

	/* text part is [^<]* so a literal '>' inside text is kept */
	preg_match_all('#(<[^>]+>)([^<]*)#', $text, $matches);

	$result = '';
	$count = count($matches[2]);

	for ( $i = 0; $i < $count; $i++ )
	{
		/* the first tag is the dummy added above and is not emitted */
		if ( $i != 0 )
		{
			$result .= $matches[1][$i];
		}

		$segment = $matches[2][$i];
		foreach ( $patterns as $pattern )
		{
			$replaced = preg_replace($pattern, $highlight, $segment);

			/* NULL means the expression failed; keep the text unhighlighted */
			if ( $replaced !== NULL )
			{
				$segment = $replaced;
			}
		}
		$result .= $segment;
	}

	return $result;
}
/**
 * Entity::anchor_footnoting()
 * change strings with footnoting generated from anchor elements
 *
 * Every <a href="...">label</a> is replaced by "label [n] " and a list of
 * "[n] url" lines is appended after a blank line. Remaining tags are then
 * removed with PHP's built-in strip_tags(). A string without any anchor
 * element is returned unchanged.
 *
 * @static
 * @param	String	$string	strings which includes html elements
 * @return	String	string with footnotes
 */
static public function anchor_footnoting($string)
{
	/* 1. detect anchor elements; SET_ORDER yields one array(html, url, label) per anchor */
	$anchors = array();
	if ( !preg_match_all("#<a[^>]*href=[\"\']([^\"']*)[\"\'][^>]*>([^<]*)<\/a>#i", $string, $anchors, PREG_SET_ORDER) )
	{
		return $string;
	}

	/* 2. add footnotes */
	$footnotes = '';
	$count = 1;
	foreach ( $anchors as $anchor )
	{
		/*
		 * Plain string replacement of the first remaining occurrence:
		 * the anchor markup is literal text, not a regular expression.
		 */
		$position = strpos($string, $anchor[0]);
		if ( $position !== FALSE )
		{
			$string = substr_replace($string, "{$anchor[2]} [{$count}] ", $position, strlen($anchor[0]));
		}

		$footnotes .= "[{$count}] {$anchor[1]}\n";
		$count++;
	}

	/* the unqualified call resolves to PHP's built-in strip_tags() */
	return strip_tags($string . "\n\n" . $footnotes);
}
214 * NOTE: Obsoleted functions
/**
 * Entity::named_to_numeric()
 * Passes every "&name" candidate found in the string to Entity::_named()
 * and substitutes whatever that method returns.
 *
 * The former implementation relied on the PCRE "e" (eval) modifier, which
 * no longer exists as of PHP 7; a callback performs the same substitution.
 *
 * @param	String	$string
 * @return	String
 */
function named_to_numeric ($string)
{
	$string = preg_replace_callback(
		'/(&[0-9A-Za-z]+)(;?\=?|([^A-Za-z0-9\;\:\.\-\_]))/',
		function ($match)
		{
			/* group 3 is absent from the match array when it did not take part */
			$tail = isset($match[3]) ? $match[3] : '';
			return Entity::_named($match[1], $match[2]) . $tail;
		},
		$string
	);

	return $string;
}
/**
 * Entity::normalize_numeric()
 * Rewrites numeric character references into the form "&#xHEX;"
 * (upper-case digits, no leading zeros, terminating semicolon) and then
 * applies the 'Windows-1252' translation table.
 *
 * The former implementation relied on the PCRE "e" (eval) modifier, which
 * no longer exists as of PHP 7; callbacks perform the same substitutions.
 *
 * @param	String	$string
 * @return	String
 */
function normalize_numeric ($string) {
	/* decimal reference -> hexadecimal reference */
	$string = preg_replace_callback(
		'/&#([0-9]+)(;)?/',
		function ($match)
		{
			return '&#x' . dechex((int) $match[1]) . ';';
		},
		$string
	);

	/* hexadecimal reference -> upper case, leading zeros dropped, ';' enforced */
	$string = preg_replace_callback(
		'/&#[Xx](0)*([0-9A-Fa-f]+)(;?|([^A-Za-z0-9\;\:\.\-\_]))/',
		function ($match)
		{
			/* group 4 is absent from the match array when it did not take part */
			$tail = isset($match[4]) ? $match[4] : '';
			return '&#x' . strtoupper($match[2]) . ';' . $tail;
		},
		$string
	);

	$string = strtr($string, self::$entities['Windows-1252']);

	return $string;
}
/**
 * Entity::numeric_to_utf8()
 * Replaces decimal and hexadecimal numeric character references with the
 * string returned by Entity::_hex_to_utf8() for their code point.
 *
 * The former implementation relied on the PCRE "e" (eval) modifier, which
 * no longer exists as of PHP 7; callbacks perform the same substitutions.
 *
 * @param	String	$string
 * @return	String
 */
function numeric_to_utf8 ($string) {
	/* decimal reference -> hexadecimal reference */
	$string = preg_replace_callback(
		'/&#([0-9]+)(;)?/',
		function ($match)
		{
			return '&#x' . dechex((int) $match[1]) . ';';
		},
		$string
	);

	/* hexadecimal reference -> upper case, leading zeros dropped, ';' enforced */
	$string = preg_replace_callback(
		'/&#[Xx](0)*([0-9A-Fa-f]+)(;?|([^A-Za-z0-9\;\:\.\-\_]))/',
		function ($match)
		{
			/* group 4 is absent from the match array when it did not take part */
			$tail = isset($match[4]) ? $match[4] : '';
			return '&#x' . strtoupper($match[2]) . ';' . $tail;
		},
		$string
	);

	/* normalized reference -> character */
	$string = preg_replace_callback(
		'/&#x([0-9A-Fa-f]+);/',
		function ($match)
		{
			return Entity::_hex_to_utf8($match[1]);
		},
		$string
	);

	return $string;
}
/**
 * Entity::numeric_to_named()
 * convert decimal and hexadecimal numeric character references into named character references
 *
 * Hexadecimal references are first rewritten as decimal ones, then the
 * flipped 'named_to_numeric' table is applied to the whole string.
 *
 * The former implementation relied on the PCRE "e" (eval) modifier, which
 * no longer exists as of PHP 7; a callback performs the same substitution.
 *
 * @param	String	$string
 * @return	String
 */
function numeric_to_named ($string)
{
	/* "&#x41" -> "&#65" (a trailing ';' is not part of the match and stays in place) */
	$string = preg_replace_callback(
		'/&#[Xx]([0-9A-Fa-f]+)/',
		function ($match)
		{
			return '&#' . hexdec($match[1]);
		},
		$string
	);

	$string = strtr($string, array_flip(self::$entities['named_to_numeric']));

	return $string;
}
/**
 * Entity::specialchars()
 * Escapes the markup characters & < > " ' while leaving character
 * references that are already present in the string untouched.
 *
 * The apostrophe becomes &apos; when $type is 'xml' and &#39; otherwise
 * (&apos; is not defined in HTML 4, see XHTML 1.0 C.16).
 *
 * @param	String	$string
 * @param	String	$type	'xml' (default) or any other value for HTML
 * @return	String
 */
function specialchars ($string, $type = 'xml')
{
	$specialchars = array(
		'&'	=> '&amp;',
		'<'	=> '&lt;',
		'>'	=> '&gt;',
		'"'	=> '&quot;'
	);

	if ( $type != 'xml' )
	{
		$specialchars["'"] = '&#39;';
	}
	else
	{
		$specialchars["'"] = '&apos;';
	}

	/* shelter existing references behind a placeholder so their '&' is not escaped */
	$string = preg_replace('/&(#?[Xx]?[0-9A-Za-z]+);/', "[[[ENTITY:\\1]]]", $string);
	$string = strtr($string, $specialchars);
	/* turn the placeholders back into references */
	$string = preg_replace('/\[\[\[ENTITY\:([^\]]+)\]\]\]/', "&\\1;", $string);

	return $string;
}
/**
 * Entity::_hex_to_utf8()
 * convert a code point given as hexadecimal digits into its UTF-8 byte sequence
 *
 * Declared static because it is invoked as Entity::_hex_to_utf8().
 * Values that are not Unicode scalar values (surrogates U+D800..U+DFFF and
 * anything above U+10FFFF) yield an empty string instead of invalid UTF-8.
 *
 * @static
 * @param	String	$s	hexadecimal digits, e.g. '20AC'
 * @return	String	UTF-8 encoded character, or '' for an invalid code point
 */
static public function _hex_to_utf8($s)
{
	$c = hexdec($s);

	/* range check comes first: hexdec() returns a float for very large input */
	if ( $c > 0x10FFFF || ($c >= 0xD800 && $c <= 0xDFFF) )
	{
		return '';
	}
	$c = (int) $c;

	if ( $c < 0x80 )
	{
		/* 1 byte: 0xxxxxxx */
		return chr($c);
	}

	if ( $c < 0x800 )
	{
		/* 2 bytes: 110xxxxx 10xxxxxx */
		return chr(0xC0 | ($c >> 6))
		     . chr(0x80 | ($c & 0x3F));
	}

	if ( $c < 0x10000 )
	{
		/* 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx */
		return chr(0xE0 | ($c >> 12))
		     . chr(0x80 | (($c >> 6) & 0x3F))
		     . chr(0x80 | ($c & 0x3F));
	}

	/* 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
	return chr(0xF0 | ($c >> 18))
	     . chr(0x80 | (($c >> 12) & 0x3F))
	     . chr(0x80 | (($c >> 6) & 0x3F))
	     . chr(0x80 | ($c & 0x3F));
}
/**
 * Entity::_named()
 * Looks up the longest prefix of $entity in the 'named_to_numeric' table
 * and returns the table value followed by ';' and the unmatched remainder.
 *
 * Declared static because it is invoked as Entity::_named().
 *
 * @static
 * @param	String	$entity	candidate reference including the leading '&'
 * @param	String	$extra	text matched directly after the candidate
 * @return	String
 */
static public function _named($entity, $extra)
{
	/*
	 * NOTE(review): guard reconstructed from its visible return value;
	 * presumably "&name=" is a query-string parameter and must stay as it is.
	 * Confirm the exact condition against the original file.
	 */
	if ( $extra == '=' )
	{
		return $entity . '=';
	}

	/* try the whole name first, then ever shorter prefixes */
	$length = i18n::strlen($entity);

	while ( $length > 0 )
	{
		$check = i18n::substr($entity, 0, $length);
		if ( array_key_exists($check, self::$entities['named_to_numeric']) )
		{
			return self::$entities['named_to_numeric'][$check] . ';' . i18n::substr($entity, $length);
		}
		$length--;
	}

	/*
	 * NOTE(review): fall-through reconstructed; an unknown name is assumed
	 * to be returned together with the text that followed it.
	 */
	return $entity . $extra;
}
370 * HTML 4.01 Specification
371 * @link http://www.w3.org/TR/html4/sgml/entities.html
372 * @see 24 Character entity references in HTML 4
374 * XHTML™ 1.0 The Extensible HyperText Markup Language (Second Edition)
375 * A Reformulation of HTML 4 in XML 1.0
376 * @link http://www.w3.org/TR/xhtml1/
377 * @see 4.12. Entity references as hex values
378 * @see C.16. The Named Character Reference '
383 static private $entities = array (
384 'named_to_numeric' => array (
385 ' ' => ' ',
386 '¡' => '¡',
387 '¢' => '¢',
388 '£' => '£',
389 '¤' => '¤',
391 '¦' => '¦',
392 '§' => '§',
394 '©' => '©',
395 'ª' => 'ª',
396 '«' => '«',
400 '¯' => '¯',
402 '±' => '±',
403 '²' => '²',
404 '³' => '³',
405 '´' => '´',
406 'µ' => 'µ',
407 '¶' => '¶',
408 '·' => '·',
409 '¸' => '¸',
410 '¹' => '¹',
411 'º' => 'º',
412 '»' => '»',
413 '¼' => '¼',
414 '½' => '½',
415 '¾' => '¾',
416 '¿' => '¿',
417 'À' => 'À',
418 'Á' => 'Á',
419 'Â' => 'Â',
420 'Ã' => 'Ã',
421 'Ä' => 'Ä',
422 'Å' => 'Å',
423 'Æ' => 'Æ',
424 'Ç' => 'Ç',
425 'È' => 'È',
426 'É' => 'É',
427 'Ê' => 'Ê',
428 'Ë' => 'Ë',
429 'Ì' => 'Ì',
430 'Í' => 'Í',
431 'Î' => 'Î',
432 'Ï' => 'Ï',
434 'Ñ' => 'Ñ',
435 'Ò' => 'Ò',
436 'Ó' => 'Ó',
437 'Ô' => 'Ô',
438 'Õ' => 'Õ',
439 'Ö' => 'Ö',
440 '×' => '×',
441 'Ø' => 'Ø',
442 'Ù' => 'Ù',
443 'Ú' => 'Ú',
444 'Û' => 'Û',
445 'Ü' => 'Ü',
446 'Ý' => 'Ý',
447 'Þ' => 'Þ',
448 'ß' => 'ß',
449 'à' => 'à',
450 'á' => 'á',
451 'â' => 'â',
452 'ã' => 'ã',
453 'ä' => 'ä',
454 'å' => 'å',
455 'æ' => 'æ',
456 'ç' => 'ç',
457 'è' => 'è',
458 'é' => 'é',
459 'ê' => 'ê',
460 'ë' => 'ë',
461 'ì' => 'ì',
462 'í' => 'í',
463 'î' => 'î',
464 'ï' => 'ï',
466 'ñ' => 'ñ',
467 'ò' => 'ò',
468 'ó' => 'ó',
469 'ô' => 'ô',
470 'õ' => 'õ',
471 'ö' => 'ö',
472 '÷' => '÷',
473 'ø' => 'ø',
474 'ù' => 'ù',
475 'ú' => 'ú',
476 'û' => 'û',
477 'ü' => 'ü',
478 'ý' => 'ý',
479 'þ' => 'þ',
480 'ÿ' => 'ÿ',
481 '&OElig' => 'Œ',
482 '&oelig' => 'å',
483 '&Scaron' => 'Š',
484 '&scaron' => 'š',
485 '&Yuml' => 'Ÿ',
486 '&circ' => 'ˆ',
487 '&tilde' => '˜',
488 '&esnp' => ' ',
489 '&emsp' => ' ',
490 '&thinsp' => ' ',
491 '&zwnj' => '‌',
495 '&ndash' => '–',
496 '&mdash' => '—',
497 '&lsquo' => '‘',
498 '&rsquo' => '’',
499 '&sbquo' => '‚',
500 '&ldquo' => '“',
501 '&rdquo' => '”',
502 '&bdquo' => '„',
503 '&dagger' => '†',
504 '&Dagger' => '‡',
505 '&permil' => '‰',
506 '&lsaquo' => '‹',
507 '&rsaquo' => '›',
508 '&euro' => '€',
509 '&fnof' => 'ƒ',
510 '&Alpha' => 'Α',
511 '&Beta' => 'Β',
512 '&Gamma' => 'Γ',
513 '&Delta' => 'Δ',
514 '&Epsilon' => 'Ε',
515 '&Zeta' => 'Ζ',
517 '&Theta' => 'Θ',
518 '&Iota' => 'Ι',
519 '&Kappa' => 'Κ',
520 '&Lambda' => 'Λ',
524 '&Omicron' => 'Ο',
527 '&Sigma' => 'Σ',
529 '&Upsilon' => 'Υ',
533 '&Omega' => 'Ω',
534 '&alpha' => 'α',
535 '&beta' => 'β',
536 '&gamma' => 'γ',
537 '&delta' => 'δ',
538 '&epsilon' => 'ε',
539 '&zeta' => 'ζ',
541 '&theta' => 'θ',
542 '&iota' => 'ι',
543 '&kappa' => 'κ',
544 '&lambda' => 'λ',
548 '&omicron' => 'ο',
551 '&sigmaf' => 'ς',
552 '&sigma' => 'σ',
554 '&upsilon' => 'υ',
558 '&omega' => 'ω',
559 '&thetasym' => 'ϑ',
560 '&upsih' => 'ϒ',
562 '&bull' => '•',
563 '&hellip' => '…',
564 '&prime' => '′',
565 '&Prime' => '″',
566 '&oline' => '‾',
567 '&frasl' => '⁄',
568 '&weierp' => '℘',
569 '&image' => 'ℑ',
570 '&real' => 'ℜ',
571 '&trade' => 'ℒ',
572 '&alefsym' => 'ℵ',
573 '&larr' => '←',
574 '&uarr' => '↑',
575 '&rarr' => '→',
576 '&darr' => '↓',
577 '&harr' => '↔',
578 '&crarr' => '↵',
579 '&lArr' => '⇐',
580 '&uArr' => '⇑',
581 '&rArr' => '⇒',
582 '&dArr' => '⇓',
583 '&hArr' => '⇔',
584 '&forall' => '∀',
585 '&part' => '∂',
586 '&exist' => '∃',
587 '&empty' => '∅',
588 '&nabla' => '∇',
589 '&isin' => '∈',
590 '¬in' => '∉',
592 '&prod' => '∏',
594 '&minus' => '−',
595 '&lowast' => '∗',
596 '&radic' => '√',
597 '&prop' => '∝',
598 '&infin' => '∞',
605 '&there4' => '∴',
607 '&cong' => '≅',
608 '&asymp' => '≈',
610 '&equiv' => '≡',
615 '&nsub' => '⊄',
616 '&sube' => '⊆',
617 '&supe' => '⊇',
618 '&oplus' => '⊕',
619 '&otimes' => '⊖',
620 '&perp' => '⊥',
621 '&sdot' => '⋅',
622 '&lceil' => '⍨',
623 '&rceil' => '⌉',
624 '&lfloor' => '⌊',
625 '&rfloor' => '⌋',
626 '&lang' => '〈',
627 '&rang' => '⌰',
629 '&spades' => '♠',
630 '&clubs' => '♣',
631 '&hearts' => '♥',
632 '&diams' => '♦'
634 'Windows-1252' => array(
635 '€' => '€',
636 '‚' => '‚',
637 'ƒ' => 'ƒ',
638 '„' => '„',
639 '…' => '…',
640 '†' => '†',
641 '‡' => '‡',
642 'ˆ' => 'ˆ',
643 '‰' => '‰',
644 'Š' => 'Š',
645 '‹' => '‹',
646 'Œ' => 'Œ',
647 'Ž' => 'Ž',
648 '‘' => '‘',
649 '’' => '’',
650 '“' => '“',
651 '”' => '”',
652 '•' => '•',
653 '–' => '–',
654 '—' => '—',
655 '˜' => '˜',
656 '™' => '™',
657 'š' => 'š',
658 '›' => '›',
659 'œ' => 'œ',
660 'ž' => 'ž',
661 'Ÿ' => 'Ÿ',