X-Git-Url: http://git.osdn.net/view?a=blobdiff_plain;ds=sidebyside;f=spam%2Fspam.php;h=62916e670a21888e3be502454fad27b597c3dba8;hb=42b79c7ccf313cddf7c173967638cc3e7ab83cf9;hp=acef763965569f2ce561f68a387dd69008eca9f1;hpb=5332233cf27ea76bea4ce2240020e356cd401d0d;p=pukiwiki%2Fpukiwiki_sandbox.git diff --git a/spam/spam.php b/spam/spam.php index acef763..62916e6 100644 --- a/spam/spam.php +++ b/spam/spam.php @@ -1,8 +1,10 @@ = 4.3.0): preg_match_all(PREG_OFFSET_CAPTURE): $method['uri_XXX'] related feature if (! defined('SPAM_INI_FILE')) define('SPAM_INI_FILE', 'spam.ini.php'); @@ -58,7 +60,7 @@ function uri_pickup($string = '', $normalize = TRUE, '(' . // 3: Host '\[[0-9a-f:.]+\]' . '|' . // IPv6([colon-hex and dot]): RFC2732 - '(?:[0-9]{1-3}\.){3}[0-9]{1-3}' . '|' . // IPv4(dot-decimal): 001.22.3.44 + '(?:[0-9]{1,3}\.){3}[0-9]{1,3}' . '|' . // IPv4(dot-decimal): 001.22.3.44 '[a-z0-9.-]+' . // hostname(FQDN) : foo.example.org ')' . '(?::([0-9]*))?' . // 4: Port @@ -92,7 +94,7 @@ function uri_pickup($string = '', $normalize = TRUE, unset($array[$uri]); continue; } - $_uri['host'] = strtolower($_uri['host']); + $_uri['host'] = host_normalize($_uri['host']); $_uri['port'] = port_normalize($_uri['port'], $_uri['scheme'], FALSE); $_uri['path'] = path_normalize($_uri['path']); if ($preserve_rawuri) $_uri['rawuri'] = & $_uri[0]; @@ -337,6 +339,7 @@ function _preg_replace_callback_domain_exposure($matches = array()) // NOTE: It's maybe danger to var_dump(result). [e.g. 'javascript:'] // [OK] http://victim.example.org/go?http%3A%2F%2Fnasty.example.org // [OK] http://victim.example.org/http://nasty.example.org +// TODO: link.toolbot.com, urlx.org function spam_uri_pickup_preprocess($string = '') { if (! is_string($string)) return ''; @@ -364,8 +367,6 @@ function spam_uri_pickup_preprocess($string = '') '_preg_replace_callback_domain_exposure', $string ); - - // URI exposure (uriuri => uri uri) $string = preg_replace( @@ -460,6 +461,25 @@ function scheme_normalize($scheme = '', $considerd_harmfull = TRUE) return $scheme; } +// Hostname normlization +// www.foo => www.foo ('foo' seems TLD) +// www.foo.bar => foo.bar +// www.10.20 => www.10.20 (Invalid hostname) +// NOTE: +// 'www' is mostly used as traditional hostname of WWW server. +// 'www.foo.bar' may be identical with 'foo.bar'. +function host_normalize($host = '') +{ + $host = strtolower($host); + + $matches = array(); + if (preg_match('/^www\.(.+\.[a-z]+)$/', $host, $matches)) { + return $matches[1]; + } else { + return $host; + } +} + // Port normalization: Suppress the (redundant) default port // HTTP://example.org:80/ => http://example.org/ // HTTP://example.org:8080/ => http://example.org:8080/ @@ -568,7 +588,7 @@ function file_normalize($string = 'index.html.en') // Roughly removing ISO 639 -like // 2-letter suffixes (See RFC3066) $matches = array(); - if (preg_match('/(.*)\.[a-z][a-z](?:-[a-z][a-z])?$/i', $string, $matches)) { + if (preg_match('/(.+)\.[a-z][a-z](?:-[a-z][a-z])?$/i', $string, $matches)) { $_string = $matches[1]; } else { $_string = & $string; @@ -624,6 +644,10 @@ function query_normalize($string = '', $equal = FALSE, $equal_cutempty = TRUE) // --------------------- // Part One : Checker +// Rough implementation of globbing +// +// USAGE: $regex = '/^' . generate_glob_regex('*.txt', '/') . '$/i'; +// function generate_glob_regex($string = '', $divider = '/') { static $from = array( @@ -645,21 +669,57 @@ function generate_glob_regex($string = '', $divider = '/') // 23 => ']', ); - if (is_array($string)) { - // Recurse - return '(?:' . - implode('|', // OR - array_map('generate_glob_regex', - $string, - array_pad(array(), count($string), $divider) - ) - ) . - ')'; + $string = str_replace($from, $mid, $string); // Hide + $string = preg_quote($string, $divider); + $string = str_replace($mid, $to, $string); // Unhide + + return $string; +} + +// Rough hostname checker +// [OK] 192.168. +// TODO: Strict digit, 0x, CIDR, IPv6 +function is_ip($string = '') +{ + if (preg_match('/^' . + '(?:[0-9]{1,3}\.){3}[0-9]{1,3}' . '|' . + '(?:[0-9]{1,3}\.){1,3}' . '$/', + $string)) { + return 4; // Seems IPv4(dot-decimal) } else { - $string = str_replace($from, $mid, $string); // Hide - $string = preg_quote($string, $divider); - $string = str_replace($mid, $to, $string); // Unhide - return $string; + return 0; // Seems not IP + } +} + +// Generate host (FQDN, IPv4, ...) regex +// 'localhost' : Matches with 'localhost' only +// 'example.org' : Matches with 'example.org' only (See host_normalize() about 'www') +// '.example.org' : Matches with ALL FQDN ended with '.example.org' +// '*.example.org' : Almost the same of '.example.org' except 'www.example.org' +// '10.20.30.40' : Matches with IPv4 address '10.20.30.40' only +// [TODO] '192.' : Matches with all IPv4 hosts started with '192.' +// TODO: IPv4, CIDR?, IPv6 +function generate_host_regex($string = '', $divider = '/') +{ + if (mb_strpos($string, '.') === FALSE) + return generate_glob_regex($string, $divider); + + $result = ''; + if (is_ip($string)) { + // IPv4 + return generate_glob_regex($string, $divider); + } else { + // FQDN or something + $part = explode('.', $string, 2); + if ($part[0] == '') { + $part[0] = '(?:.*\.)?'; // And all related FQDN + } else if ($part[0] == '*') { + $part[0] = '.*\.'; // All subdomains/hosts only + } else { + return generate_glob_regex($string, $divider); + } + $part[1] = generate_glob_regex($part[1], $divider); + return implode('', $part); } } @@ -679,11 +739,13 @@ function get_blocklist($list = '') foreach(array('goodhost', 'badhost') as $_list) { if (! isset($blocklist[$list])) continue; foreach ($blocklist[$_list] as $key => $value) { - if (is_string($key)) { - $regexs[$_list][$key] = $value; + if (is_array($value)) { + $regexs[$_list][$key] = array(); + foreach($value as $_key => $_value) { + get_blocklist_add($regexs[$_list][$key], $_key, $_value); + } } else { - $regexs[$_list][$value] = - '/^(?:www\.)?' . generate_glob_regex($value, '/') . '$/i'; + get_blocklist_add($regexs[$_list], $key, $value); } } } @@ -699,6 +761,16 @@ function get_blocklist($list = '') } } +// Subroutine of get_blocklist() +function get_blocklist_add(& $array, $key = 0, $value = '*.example.org') +{ + if (is_string($key)) { + $array[$key] = & $value; // Treat $value as a regex + } else { + $array[$value] = '/^' . generate_host_regex($value, '/') . '$/i'; + } +} + function is_badhost($hosts = array(), $asap = TRUE, & $remains) { $result = array(); @@ -715,12 +787,14 @@ function is_badhost($hosts = array(), $asap = TRUE, & $remains) $tmp = array(); foreach (get_blocklist('badhost') as $label => $regex) { - $result[$label] = preg_grep($regex, $hosts); - if (empty($result[$label])) { - unset($result[$label]); + if (is_array($regex)) { + $result[$label] = array(); + foreach($regex as $_label => $_regex) { + if (is_badhost_avail($_label, $_regex, $hosts, $result[$label]) && $asap) break; + } + if (empty($result[$label])) unset($result[$label]); } else { - $hosts = array_diff($hosts, $result[$label]); - if ($asap) break; + if (is_badhost_avail($label, $regex, $hosts, $result) && $asap) break; } } @@ -729,6 +803,19 @@ function is_badhost($hosts = array(), $asap = TRUE, & $remains) return $result; } +// Subroutine for is_badhost() +function is_badhost_avail($label = '*.example.org', $regex = '/^.*\.example\.org$/', & $hosts, & $result) +{ + $group = preg_grep($regex, $hosts); + if ($group) { + $result[$label] = & $group; + $hosts = array_diff($hosts, $result[$label]); + return TRUE; + } else { + return FALSE; + } +} + // Default (enabled) methods and thresholds (for content insertion) function check_uri_spam_method($times = 1, $t_area = 0, $rule = TRUE) { @@ -787,10 +874,12 @@ function check_uri_spam($target = '', $method = array()) 'is_spam' => array(), 'method' => & $method, 'remains' => array(), + 'error' => array(), ); $sum = & $progress['sum']; $is_spam = & $progress['is_spam']; $remains = & $progress['remains']; + $error = & $progress['error']; $asap = isset($method['asap']); // Recurse @@ -801,6 +890,7 @@ function check_uri_spam($target = '', $method = array()) $_sum = & $_progress['sum']; $_is_spam = & $_progress['is_spam']; $_remains = & $_progress['remains']; + $_error = & $_progress['error']; foreach (array_keys($_sum) as $key) { $sum[$key] += $_sum[$key]; } @@ -827,6 +917,7 @@ function check_uri_spam($target = '', $method = array()) } } } + if (! empty($_error)) $error += $_error; if ($asap && $is_spam) break; } return $progress; @@ -955,10 +1046,8 @@ function check_uri_spam($target = '', $method = array()) // URI: Bad host if ((! $asap || ! $is_spam) && isset($method['badhost'])) { $__remains = array(); - if ($asap) { - $badhost = is_badhost($hosts, $asap, $__remains); - } else { - $badhost = is_badhost($hosts, $asap, $__remains); + $badhost = is_badhost($hosts, $asap, $__remains); + if (! $asap) { if ($__remains) { $remains['badhost'] = array(); foreach ($__remains as $value) { @@ -968,6 +1057,7 @@ function check_uri_spam($target = '', $method = array()) } unset($__remains); if (! empty($badhost)) { + //var_dump($badhost); // BADHOST detail $sum['badhost'] += array_count_leaves($badhost); foreach(array_keys($badhost) as $keys) { $is_spam['badhost'][$keys] =