-// ---------------------
-// Area pickup
-
// Pickup all of markup areas
//
// $string : Text to scan for markup pairs
// $method : Array whose keys select what to collect
//   'area_anchor' : Count <a href ...>...</a> pairs
//   'uri_anchor'  : Collect array(start offset, end offset) of each anchor pair
//   'area_bbcode' : Count [url]...[/url] and [link]...[/link] pairs
//   'uri_bbcode'  : Collect array(start offset, end offset) of each BBCode pair
//   'asap'        : With 'area_*', stop at the first pair (the count becomes 1)
// Returns an array that holds only the requested keys that matched something.
// The start offset points at the first character of the opening tag, and
// the end offset points at the last character ('>' or ']') of the closing tag.
function area_pickup($string = '', $method = array())
{
    $area = array();
    if (empty($method)) return $area;

    // Same guard as the other pickup/normalize functions:
    // preg_*() cannot scan arrays or objects
    if (! is_string($string)) return $area;

    // Markup name => array(regex, index of the group capturing the area end)
    $markups = array(
        // Anchor tag pair by preg_match and preg_match_all()
        // [OK] <a href></a>
        // [OK] <a href=  >Good site!</a>
        // [OK] <a href= "#" >test</a>
        // [OK] <a href="http://nasty.example.com">visit http://nasty.example.com/</a>
        // [OK] <a href=\'http://nasty.example.com/\' >discount foobar</a>
        // [NG] <a href="http://ng.example.com">visit http://ng.example.com _not_ended_
        'anchor' => array('#<a\b[^>]*\bhref\b[^>]*>.*?</a\b[^>]*(>)#is', 1),

        // phpBB's "BBCode" pair by preg_match and preg_match_all()
        // [OK] [url][/url]
        // [OK] [url]http://nasty.example.com/[/url]
        // [OK] [link]http://nasty.example.com/[/link]
        // [OK] [url=http://nasty.example.com]visit http://nasty.example.com/[/url]
        // [OK] [link http://nasty.example.com/]buy something[/link]
        'bbcode' => array('#\[(url|link)\b[^\]]*\].*?\[/\1\b[^\]]*(\])#is', 2),
    );

    foreach ($markups as $name => $markup) {
        $regex     = $markup[0];
        $end_group = $markup[1];

        // Number of areas ('asap' needs to know only "one or none")
        $key = 'area_' . $name;
        if (isset($method[$key])) {
            $matches = array();
            $count = isset($method['asap']) ?
                preg_match($regex, $string) :
                preg_match_all($regex, $string, $matches);
            if (! empty($count)) $area[$key] = $count;
        }

        // Offsets of each area: array(start, end)
        $key = 'uri_' . $name;
        if (isset($method[$key])) {
            $matches = array();
            $ranges  = array();
            // FALSE (PCRE error) and 0 (no match) both leave $ranges empty
            if (preg_match_all($regex, $string, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE)) {
                foreach ($matches as $match) {
                    $ranges[] = array(
                        $match[0][1],          // Area start (<a href> or [url])
                        $match[$end_group][1], // Area end   (</a> or [/url])
                    );
                }
            }
            if (! empty($ranges)) $area[$key] = $ranges;
        }
    }

    // Various Wiki syntax (not implemented)
    // [text_or_uri>text_or_uri]
    // [text_or_uri:text_or_uri]
    // [text_or_uri|text_or_uri]
    // [text_or_uri->text_or_uri]
    // [text_or_uri text_or_uri] // MediaWiki
    // MediaWiki: [http://nasty.example.com/ visit http://nasty.example.com/]

    return $area;
}
-
// If in doubt, it's a little doubtful
// if (Area => inside <= Area) $belief is added to the counter
//
// $areas  : List of array(start offset, end offset), as built by area_pickup()
// $array  : List of entries, modified in place. Only entries that already
//           have $a_key are touched
// $belief : Amount added to $array[...][$a_key] per area that contains the entry
// $a_key  : Key of the counter inside each entry
// $o_key  : Key of the entry's offset inside each entry (0 if missing)
// Returns nothing.
function area_measure($areas, & $array, $belief = -1, $a_key = 'area', $o_key = 'offset')
{
    if (! is_array($areas) || ! is_array($array)) return;

    foreach (array_keys($array) as $u_index) {
        // No counter, nothing to adjust (checked once per entry, not per area)
        if (! isset($array[$u_index][$a_key])) continue;

        $offset = isset($array[$u_index][$o_key]) ?
            intval($array[$u_index][$o_key]) : 0;

        foreach ($areas as $range) {
            // A missing boundary counts as 0, without raising a notice
            $offset_s = isset($range[0]) ? intval($range[0]) : 0;
            $offset_e = isset($range[1]) ? intval($range[1]) : 0;

            // [Area => inside <= Area] (Boundaries themselves are outside)
            if ($offset_s < $offset && $offset < $offset_e) {
                $array[$u_index][$a_key] += $belief;
            }
        }
    }
}
-
-// ---------------------
-// Spam-uri pickup
-
// Domain exposure callback (See spam_uri_pickup_preprocess())
// http://victim.example.org/?foo+site:nasty.example.com+bar
// => http://nasty.example.com/?refer=victim.example.org
//
// $matches : Capture groups given by preg_replace_callback()
//   [1] scheme, [2] victim host, [3] the rest before 'site:',
//   [4] exposed (nasty) host, [5] if present, keep the victim URI too
// Returns the replacement string ('' when group 4 and 5 are both missing).
// NOTE: 'refer=' is not so good (at this time).
//       Consider using the IP address of the victim, and try to avoid that.
function _preg_replace_callback_domain_exposure($matches = array())
{
    // Preserve the victim URI as a complicity or ...
    $victim_uri = isset($matches[5]) ?
        $matches[1] . '://' . $matches[2] . '/' . $matches[3] :
        '';

    if (! isset($matches[4])) return $victim_uri;

    // Flipped URI: scheme://nasty/?refer=victim, then a space, then the preserved one
    $flipped_uri = $matches[1] . '://' . $matches[4] .
        '/?refer=' . strtolower($matches[2]);

    return $flipped_uri . ' ' . $victim_uri;
}
-
// Preprocess: rawurldecode() and adding space(s) and something
// to detect/count some URIs _if possible_
//
// $string : Text to preprocess
// Returns the rewritten text, or '' if $string is not a string.
// NOTE: It may be dangerous to var_dump(result). [e.g. 'javascript:']
// [OK] http://victim.example.org/?site:nasty.example.org
// [OK] http://victim.example.org/nasty.example.org
// [OK] http://victim.example.org/go?http%3A%2F%2Fnasty.example.org
// [OK] http://victim.example.org/http://nasty.example.org
function spam_uri_pickup_preprocess($string = '')
{
    if (! is_string($string)) return '';

    $string = rawurldecode($string);

    // Domain exposure (simple)
    // http://victim.example.org/nasty.example.org/path#frag
    // => http://nasty.example.org/?refer=victim.example.org and original
    $string = preg_replace(
        '#h?ttp://' .
        '(' .
            'ime\.nu' . '|' .   // 2ch.net
            'ime\.st' . '|' .   // 2ch.net
            'link\.toolbot\.com' . '|' .
            'urlx\.org' .
        ')' .
        '/([a-z0-9.%_-]+\.[a-z0-9.%_-]+)#i',   // nasty.example.org
        'http://$2/?refer=$1 $0',              // Preserve $0 or remove?
        $string
    );

    // Domain exposure (gate-big5)
    // http://victim.example.org/gate/big5/nasty.example.org/path
    // => http://nasty.example.org/?refer=victim.example.org and original
    // NOTE: No '|' after the last host. A trailing '|' adds an empty
    //       alternative, and 'http:///gate/big5/...' (no host) would match.
    $string = preg_replace(
        '#h?ttp://' .
        '(' .
            'big5\.51job\.com' . '|' .
            'big5\.china\.com' . '|' .
            'big5\.xinhuanet\.com' .
        ')' .
        '/gate/big5' .
        '/([a-z0-9.%_-]+\.[a-z0-9.%_-]+)' .    // nasty.example.org
        '#i',
        'http://$2/?refer=$1 $0',              // Preserve $0 or remove?
        $string
    );

    // Domain exposure (See _preg_replace_callback_domain_exposure())
    $string = preg_replace_callback(
        array(
            '#(http)://' .
            '(' .
                // Something Google: http://www.google.com/supported_domains
                '(?:[a-z0-9.]+\.)?google\.[a-z]{2,3}(?:\.[a-z]{2})?' .
                '|' .
                // AltaVista
                '(?:[a-z0-9.]+\.)?altavista\.com' .
            ')' .
            '/' .
            '([a-z0-9?=&.%_/\'\\\+-]+)' .               // path/?query=foo+bar+
            '\bsite:([a-z0-9.%_-]+\.[a-z0-9.%_-]+)' .   // site:nasty.example.com
            //'()' .    // Preserve or remove?
            '#i',
        ),
        '_preg_replace_callback_domain_exposure',
        $string
    );

    // URI exposure (uriuri => uri uri)
    // Put a space before every scheme that does not follow a space yet
    $string = preg_replace(
        array(
            '#(?<! )(?:https?|ftp):/#i',
        ),
        ' $0',
        $string
    );

    return $string;
}
-
// Main function of spam-uri pickup,
// A wrapper function of uri_pickup()
//
// $string : Text to scan
// $method : Array of method names; replaced by check_uri_spam_method()
//           when it is not an array or is empty
// Returns the array built by uri_pickup() from the preprocessed text.
// When 'uri_anchor' / 'uri_bbcode' are requested and area_pickup() finds
// such areas, each entry's ['area'] gets a counter per requested key,
// starting at 0 and raised by area_measure().
// ['area']['offset'] is always removed before returning.
function spam_uri_pickup($string = '', $method = array())
{
    if (! is_array($method) || empty($method)) $method = check_uri_spam_method();

    $string = spam_uri_pickup_preprocess($string);
    $array  = uri_pickup($string);

    // Area elevation of URIs, for '(especially external)link' intension
    if (! empty($array)) {
        // Only the offset-collecting methods are interesting here
        $_method = array();
        foreach (array('uri_anchor', 'uri_bbcode') as $name) {
            if (isset($method[$name])) $_method[$name] = & $method[$name];
        }
        $names = array_keys($_method);

        $areas = area_pickup($string, $_method, TRUE);
        if (! empty($areas)) {
            // $area_shadow[N] aliases $array[N]['area'],
            // so area_measure() updates $array itself
            $area_shadow = array();
            foreach (array_keys($array) as $index) {
                $area_shadow[$index] = & $array[$index]['area'];
                foreach ($names as $name) {
                    $area_shadow[$index][$name] = 0;
                }
            }
            foreach ($names as $name) {
                if (! isset($areas[$name])) continue;
                area_measure($areas[$name], $area_shadow, 1, $name);
            }
        }
    }

    // Remove 'offset's for area_measure()
    foreach (array_keys($array) as $index) {
        unset($array[$index]['area']['offset']);
    }

    return $array;
}
-
-
-// ---------------------
-// Normalization
-
// Scheme normalization: Renaming the schemes
// snntp://example.org => nntps://example.org
//
// $scheme           : Scheme name (case-insensitive)
// $abbrevs_harmfull : TRUE  = expand abbreviations ('ttp' => 'http')
//                     FALSE = abbreviations become ''
// Returns the lower-cased, normalized scheme, or '' if $scheme is not a string.
// NOTE: Keep the static lists simple. See also port_normalize().
function scheme_normalize($scheme = '', $abbrevs_harmfull = TRUE)
{
    // Abbreviations they have no intention of link
    static $abbreviations = array(
        'ttp'  => 'http',
        'ttps' => 'https',
    );

    // Aliases => normalized ones
    static $alias_of = array(
        'pop'   => 'pop3',
        'news'  => 'nntp',
        'imap4' => 'imap',
        'snntp' => 'nntps',
        'snews' => 'nntps',
        'spop3' => 'pop3s',
        'pops'  => 'pop3s',
    );

    if (! is_string($scheme)) return '';

    $name = strtolower($scheme);

    if (isset($abbreviations[$name])) {
        // A harmless abbreviation is not a scheme at all
        if (! $abbrevs_harmfull) return '';
        $name = $abbreviations[$name];
    }

    return isset($alias_of[$name]) ? $alias_of[$name] : $name;
}
-
// Hostname normalization (Destructive)
// www.foo     => www.foo   ('foo' seems TLD)
// www.foo.bar => foo.bar
// www.10.20   => www.10.20 (Invalid hostname)
//
// $host : Hostname (case-insensitive)
// Returns the lower-cased hostname without the leading 'www.' label,
// or '' if $host is not a string.
// NOTE:
//   'www' is mostly used as traditional hostname of WWW server.
//   'www.foo.bar' may be identical with 'foo.bar'.
function host_normalize($host = '')
{
    if (! is_string($host)) return '';

    $lowered = strtolower($host);

    // Strip 'www.' only if a dotted name ending with an alphabetic label remains
    $found = array();
    $has_www = preg_match('/^www\.(.+\.[a-z]+)$/', $lowered, $found);

    return $has_www ? $found[1] : $lowered;
}
-
// Port normalization: Suppress the (redundant) default port
// HTTP://example.org:80/   => http://example.org/
// HTTP://example.org:8080/ => http://example.org:8080/
// HTTPS://example.org:443/ => https://example.org/
//
// $port             : Port number (integer or string of digits)
// $scheme           : Scheme name, looked up as-is in the table below
// $scheme_normalize : TRUE = pass $scheme through scheme_normalize() first
// Returns '' for an invalid port or the default port of the scheme,
// otherwise the port as an integer.
function port_normalize($port, $scheme, $scheme_normalize = FALSE)
{
    // Schemes that users _maybe_ want to add protocol-handlers
    // to their web browsers. (and attackers _maybe_ want to use ...)
    // Reference: http://www.iana.org/assignments/port-numbers
    static $default_ports = array(
        // scheme => default port
        'ftp'     =>   21,
        'ssh'     =>   22,
        'telnet'  =>   23,
        'smtp'    =>   25,
        'tftp'    =>   69,
        'gopher'  =>   70,
        'finger'  =>   79,
        'http'    =>   80,
        'pop3'    =>  110,
        'sftp'    =>  115,
        'nntp'    =>  119,
        'imap'    =>  143,
        'irc'     =>  194,
        'wais'    =>  210,
        'https'   =>  443,
        'nntps'   =>  563,
        'rsync'   =>  873,
        'ftps'    =>  990,
        'telnets' =>  992,
        'imaps'   =>  993,
        'ircs'    =>  994,
        'pop3s'   =>  995,
        'mysql'   => 3306,
    );

    // intval() converts '0-1' to '0', so preg_match() rejects these invalid ones
    $acceptable = is_numeric($port) && ! ($port < 0) && ! preg_match('/[^0-9]/i', $port);
    if (! $acceptable) return '';

    $number = intval($port);
    $name   = $scheme_normalize ? scheme_normalize($scheme) : $scheme;

    // Ignore the defaults
    if (isset($default_ports[$name]) && $number == $default_ports[$name]) return '';

    return $number;
}
-
-// Path normalization
-// http://example.org => http://example.org/
-// http://example.org#hoge => http://example.org/#hoge
-// http://example.org/path/a/b/./c////./d => http://example.org/path/a/b/c/d
-// http://example.org/path/../../a/../back => http://example.org/back
-function path_normalize($path = '', $divider = '/', $add_root = TRUE)