-// Spam-uri pickup
-
-// Domain exposure callback (See spam_uri_pickup_preprocess())
-// Flips a victim URI that names another site with 'site:' into a URI of
-// that site, so the named domain gets exposed:
-//   http://victim.example.org/?foo+site:nasty.example.com+bar
-//   => http://nasty.example.com/?refer=victim.example.org
-// $matches (from preg_replace_callback()):
-//   [1] scheme, [2] victim host, [3] the rest before 'site:',
-//   [4] host named by 'site:', [5] optional; when it is set, the victim
-//   URI is preserved and appended after the flipped one.
-// Returns the replacement text ('' when neither [4] nor [5] is set).
-// NOTE: The 'refer=' marker is not so good (at this time).
-// Using the IP address of the victim was considered; try to avoid that.
-function _preg_replace_callback_domain_exposure($matches = array())
-{
-    // Preserve the victim URI (as a complicity or ...) only on request
-    $victim_uri = isset($matches[5])
-        ? $matches[1] . '://' . $matches[2] . '/' . $matches[3]
-        : '';
-
-    // Nothing to flip
-    if (! isset($matches[4])) return $victim_uri;
-
-    // Flipped URI: scheme://nasty.example.com/?refer=victim.example.org
-    $flipped_uri = $matches[1] . '://' . $matches[4] .
-        '/?refer=' . strtolower($matches[2]);
-
-    return $flipped_uri . ' ' . $victim_uri;
-}
-
-// Preprocess: rawurldecode() and adding space(s) and something
-// to detect/count some URIs _if possible_
-//   [OK] http://victim.example.org/go?http%3A%2F%2Fnasty.example.org
-//   [OK] http://victim.example.org/http://nasty.example.org
-// $string: the text to preprocess.
-// Returns the preprocessed text, or '' when $string is not a string.
-// NOTE: It may be dangerous to var_dump() the result. [e.g. 'javascript:']
-// TODO: link.toolbot.com, urlx.org
-function spam_uri_pickup_preprocess($string = '')
-{
-    if (! is_string($string)) return '';
-
-    $string = rawurldecode($string);
-
-    // Domain exposure (See _preg_replace_callback_domain_exposure())
-    $exposed = preg_replace_callback(
-        array(
-            '#(http)://' .
-            '(' .
-                // Something Google: http://www.google.com/supported_domains
-                '(?:[a-z0-9.]+\.)?google\.[a-z]{2,3}(?:\.[a-z]{2})?' .
-                '|' .
-                // AltaVista (the dot is escaped: 'altavistaXcom' must not match)
-                '(?:[a-z0-9.]+\.)?altavista\.com' .
-            ')' .
-            '/' .
-            '([a-z0-9?=&.%_/\'\\\+-]+)' .                // path/?query=foo+bar+
-            '\bsite:([a-z0-9.%_-]+\.[a-z0-9.%_-]+)' .    // site:nasty.example.com
-            //'()' .    // Preserve or remove? (adds $matches[5] for the callback)
-            '#i',
-        ),
-        '_preg_replace_callback_domain_exposure',
-        $string
-    );
-    // preg_*() return NULL on a PCRE error. Keep the text as it was then,
-    // rather than dropping every URI it contains
-    if (is_string($exposed)) $string = $exposed;
-
-    // URI exposure (uriuri => uri uri)
-    $separated = preg_replace(
-        array(
-            '#(?<! )(?:https?|ftp):/#i',
-            // Alternative, for any scheme: '#[a-z][a-z0-9.+-]{1,8}://#i'
-        ),
-        ' $0',
-        $string
-    );
-    if (is_string($separated)) $string = $separated;
-
-    return $string;
-}
-
-// Main function of spam-uri pickup
-// Preprocesses $string with spam_uri_pickup_preprocess(), picks up its URIs
-// with uri_pickup(), then lets area_measure() work on each URI's 'area'
-// entry for the 'uri_anchor' / 'uri_bbcode' areas found by area_pickup().
-// $method: the check methods; when it is not a non-empty array, the one
-//          from check_uri_spam_method() is used.
-// Returns the array from uri_pickup(), with every ['area']['offset'] removed.
-function spam_uri_pickup($string = '', $method = array())
-{
-    if (empty($method) || ! is_array($method)) {
-        $method = check_uri_spam_method();
-    }
-
-    $string = spam_uri_pickup_preprocess($string);
-    $array  = uri_pickup($string);
-
-    // Area elevation of URIs, for '(especially external) link' intention
-    if (! empty($array)) {
-        // Only the area-related methods are handed over (by reference)
-        $_method = array();
-        foreach (array('uri_anchor', 'uri_bbcode') as $name) {
-            if (isset($method[$name])) $_method[$name] = & $method[$name];
-        }
-
-        $areas = area_pickup($string, $_method, TRUE);
-        if (! empty($areas)) {
-            $names = array_keys($_method);
-
-            // $area_shadow[$key] is an alias of $array[$key]['area'], so
-            // writes through it (here, and presumably in area_measure())
-            // land in $array
-            $area_shadow = array();
-            foreach (array_keys($array) as $key) {
-                $area_shadow[$key] = & $array[$key]['area'];
-                foreach ($names as $name) {
-                    $area_shadow[$key][$name] = 0;
-                }
-            }
-
-            foreach ($names as $name) {
-                if (isset($areas[$name])) {
-                    area_measure($areas[$name], $area_shadow, 1, $name);
-                }
-            }
-        }
-    }
-
-    // Remove 'offset's for area_measure()
-    foreach (array_keys($array) as $key) {
-        unset($array[$key]['area']['offset']);
-    }
-
-    return $array;
-}
-
-
-// ---------------------
-// Normalization
-
-// Scheme normalization: Trimming, lower-casing and renaming the schemes
-//   snntp://example.org => nntps://example.org
-// $scheme: the scheme, without '://'.
-// $considerd_harmfull: what to do with the abbreviations 'ttp' / 'ttps':
-//   TRUE restores them to 'http' / 'https', FALSE turns them into ''.
-// Returns the normalized scheme. A scheme in neither list is returned
-// trimmed and lower-cased.
-// NOTE: Keep the static lists simple. See also port_normalize().
-function scheme_normalize($scheme = '', $considerd_harmfull = TRUE)
-{
-    // Abbreviations that can be considered to have no link intention
-    static $abbrevs = array('ttp' => 'http', 'ttps' => 'https');
-
-    // Alias => normalized
-    static $aliases = array(
-        'pop'   => 'pop3',
-        'news'  => 'nntp',
-        'imap4' => 'imap',
-        'snntp' => 'nntps',
-        'snews' => 'nntps',
-        'spop3' => 'pop3s',
-        'pops'  => 'pop3s',
-    );
-
-    $name = strtolower(trim($scheme));
-
-    if (isset($abbrevs[$name])) {
-        $name = $considerd_harmfull ? $abbrevs[$name] : '';
-    }
-
-    return isset($aliases[$name]) ? $aliases[$name] : $name;
-}
-
-// Hostname normalization: Lower-casing, and cutting the leading 'www.'
-//   www.foo     => www.foo     ('foo' seems to be a TLD)
-//   www.foo.bar => foo.bar
-//   www.10.20   => www.10.20   (Invalid hostname)
-// NOTE:
-//   'www' is mostly used as the traditional hostname of a WWW server.
-//   'www.foo.bar' may be identical with 'foo.bar'.
-function host_normalize($host = '')
-{
-    $lowered = strtolower($host);
-
-    // 'www.' is cut only when something, a dot, and an alphabetic last
-    // label follow it
-    $parts = array();
-    $found = preg_match('/^www\.(.+\.[a-z]+)$/', $lowered, $parts);
-
-    return $found ? $parts[1] : $lowered;
-}
-
-// Port normalization: Suppress the (redundant) default port
-//   HTTP://example.org:80/   => http://example.org/
-//   HTTP://example.org:8080/ => http://example.org:8080/
-//   HTTPS://example.org:443/ => https://example.org/
-// $port: the port, as written in the URI.
-// $scheme: the scheme the port belongs to.
-// $scheme_normalize: TRUE to pass $scheme through scheme_normalize() first;
-//   with FALSE, $scheme is looked up in the list as it is.
-// Returns '' when the (trimmed) port is empty or is the default port of the
-// scheme, otherwise the trimmed port.
-function port_normalize($port, $scheme, $scheme_normalize = TRUE)
-{
-    // Schemes that users _maybe_ want to add protocol-handlers
-    // to their web browsers. (and attackers _maybe_ want to use ...)
-    // Reference: http://www.iana.org/assignments/port-numbers
-    static $array = array(
-        // scheme => default port
-        'ftp'     => 21,
-        'ssh'     => 22,
-        'telnet'  => 23,
-        'smtp'    => 25,
-        'tftp'    => 69,
-        'gopher'  => 70,
-        'finger'  => 79,
-        'http'    => 80,
-        'pop3'    => 110,
-        'sftp'    => 115,
-        'nntp'    => 119,
-        'imap'    => 143,
-        'irc'     => 194,
-        'wais'    => 210,
-        'https'   => 443,
-        'nntps'   => 563,
-        'rsync'   => 873,
-        'ftps'    => 990,
-        'telnets' => 992,
-        'imaps'   => 993,
-        'ircs'    => 994,
-        'pop3s'   => 995,
-        'mysql'   => 3306,
-    );
-
-    $port = trim($port);
-    if ($port === '') return $port;
-
-    // Only a plain decimal number can be a default port. A loose '=='
-    // would also take numeric strings like '8e1', '80.0' or '+80' for 80
-    if (! preg_match('/^[0-9]+$/D', $port)) return $port;
-
-    if ($scheme_normalize) $scheme = scheme_normalize($scheme);
-    if (isset($array[$scheme]) && (int)$port === $array[$scheme])
-        $port = ''; // Ignore the defaults
-
-    return $port;
-}
-
-// Path normalization: Remove paddings ('', '.') and back-track on '..'
-//   http://example.org                     => http://example.org/
-//   http://example.org#hoge                => http://example.org/#hoge
-//   http://example.org/path/a/b/./c////./d => http://example.org/path/a/b/c/d
-//   http://example.org/path/../../a/../back => http://example.org/back
-// $path: the path part only. It is trimmed before use.
-// $divider: the path separator.
-// $addroot: TRUE to start the result with $divider.
-// Returns the normalized path. An empty (or non-string) path gives $divider
-// with $addroot, otherwise ''. A trailing $divider is kept when at least
-// one segment remains.
-function path_normalize($path = '', $divider = '/', $addroot = TRUE)
-{
-    $root = $addroot ? $divider : '';
-
-    // Trim first: a whitespace-only path is an empty path too, and must
-    // not reach the string-offset check below
-    if (! is_string($path) || ($path = trim($path)) === '') return $root;
-
-    // Remember the trailing divider (of any length) to put it back later
-    $last = (substr($path, -strlen($divider)) === $divider) ? $divider : '';
-
-    $stack = array();
-    foreach (explode($divider, $path) as $segment) {
-        // Remove paddings
-        if ($segment === '' || $segment === '.') continue;
-
-        if ($segment === '..') {
-            array_pop($stack);    // Back-track (no-op at the root)
-        } else {
-            $stack[] = $segment;
-        }
-    }
-
-    if (empty($stack)) return $root;
-
-    return $root . implode($divider, $stack) . $last;
-}
-
-// DirectoryIndex normalize (Destructive and rough)
-// $string: a file name, like 'index.html' or 'index.html.en'.
-// Returns '' when $string looks like a DirectoryIndex (with or without a
-// language suffix), otherwise $string as it is.
-function file_normalize($string = 'index.html.en')
-{
-    // Lower-cased names of the DirectoryIndex candidates.
-    // 'index.pl', 'index.py' and 'index.rb' are not listed, but the filter
-    // below already cuts their 2-letter suffix and leaves 'index'
-    static $indexes = array(
-        'index'        => TRUE,    // Some system can omit the suffix
-        'index.htm'    => TRUE,
-        'index.html'   => TRUE,
-        'index.shtml'  => TRUE,
-        'index.jsp'    => TRUE,
-        'index.php'    => TRUE,
-        'index.php3'   => TRUE,
-        'index.php4'   => TRUE,
-        'index.cgi'    => TRUE,
-        'default.htm'  => TRUE,
-        'default.html' => TRUE,
-        'default.asp'  => TRUE,
-        'default.aspx' => TRUE,
-    );
-
-    // Content-negotiation filter:
-    // Roughly removing ISO 639 -like
-    // 2-letter suffixes (See RFC3066)
-    $basename = $string;
-    $matches  = array();
-    if (preg_match('/(.+)\.[a-z][a-z](?:-[a-z][a-z])?$/i', $string, $matches)) {
-        $basename = $matches[1];
-    }
-
-    return isset($indexes[strtolower($basename)]) ? '' : $string;
-}
-
-// Sort query-strings if possible (Destructive and rough)
-//   [OK] &&&&f=d&b&d&c&a=0dd => a=0dd&b&c&d&f=d
-//   [OK] nothing==&eg=dummy&eg=padding&eg=foobar => eg=foobar (with $equal)
-// $string: the query-string, without the leading '?'.
-// $equal: TRUE to consider 'key=value' parts: '=' paddings at the head of
-//   a value are removed, and a later key overrides the same earlier key.
-// $equal_cutempty: (with $equal) TRUE to drop the parts with an empty value.
-// Returns the parts in natural order (natsort()), joined with '&'.
-function query_normalize($string = '', $equal = FALSE, $equal_cutempty = TRUE)
-{
-    // Split, without the '&' paddings
-    $array = array();
-    foreach (explode('&', $string) as $index => $part) {
-        if ($part != '') $array[$index] = $part;
-    }
-
-    // Consider '='-separated input and paddings
-    if ($equal) {
-        $plain = array();    // Parts without '='
-        $pairs = array();    // key => value
-        foreach ($array as $part) {
-            if (strpos($part, '=') === FALSE) {
-                $plain[] = $part;
-                continue;
-            }
-
-            list($key, $value) = explode('=', $part, 2);
-            $value = ltrim($value, '=');
-            if ($equal_cutempty && $value == '') continue;
-
-            $pairs[$key] = $value;
-        }
-
-        // Parts without '=' first, then the rebuilt 'key=value's
-        $array = $plain;
-        foreach ($pairs as $key => $value) {
-            $array[] = $key . '=' . $value;
-        }
-    }
-
-    natsort($array);
-
-    return implode('&', $array);
-}
-
-// ---------------------
-// Part One : Checker