X-Git-Url: http://git.osdn.net/view?a=blobdiff_plain;f=spam.php;h=c59d798976097ac08ea805180e684dcba90acc9e;hb=25e665f54c6af5a91543c76c63784a8b95ffee44;hp=0c2cec56e6538df388d4a7bf3830f3c1027c61c8;hpb=93185099d0bf4093308816ff9889db86c795db04;p=pukiwiki%2Fpukiwiki_sandbox.git diff --git a/spam.php b/spam.php index 0c2cec5..c59d798 100644 --- a/spam.php +++ b/spam.php @@ -1,9 +1,10 @@ = 4.3.0): preg_match_all(PREG_OFFSET_CAPTURE): $method['uri_XXX'] related feature if (! defined('SPAM_INI_FILE')) define('SPAM_INI_FILE', 'spam.ini.php'); @@ -14,7 +15,7 @@ if (! defined('SPAM_INI_FILE')) define('SPAM_INI_FILE', 'spam.ini.php'); // (PHP 4 >= 4.2.0): var_export(): mail-reporting and dump related if (! function_exists('var_export')) { function var_export() { - return 'var_export() is not found' . "\n"; + return 'var_export() is not found on this server' . "\n"; } } @@ -52,14 +53,14 @@ function uri_pickup($string = '', $normalize = TRUE, preg_match_all( // scheme://userinfo@host:port/path/or/pathinfo/maybefile.and?query=string#fragment // Refer RFC3986 (Regex below is not strict) - '#(\b[a-z][a-z0-9.+-]{1,8})://' . // 1: Scheme + '#(\b[a-z][a-z0-9.+-]{1,8}):/+' . // 1: Scheme '(?:' . '([^\s<>"\'\[\]/\#?@]*)' . // 2: Userinfo (Username) '@)?' . '(' . // 3: Host '\[[0-9a-f:.]+\]' . '|' . // IPv6([colon-hex and dot]): RFC2732 - '(?:[0-9]{1-3}\.){3}[0-9]{1-3}' . '|' . // IPv4(dot-decimal): 001.22.3.44 + '(?:[0-9]{1,3}\.){3}[0-9]{1,3}' . '|' . // IPv4(dot-decimal): 001.22.3.44 '[a-z0-9.-]+' . // hostname(FQDN) : foo.example.org ')' . '(?::([0-9]*))?' . // 4: Port @@ -93,7 +94,7 @@ function uri_pickup($string = '', $normalize = TRUE, unset($array[$uri]); continue; } - $_uri['host'] = strtolower($_uri['host']); + $_uri['host'] = host_normalize($_uri['host']); $_uri['port'] = port_normalize($_uri['port'], $_uri['scheme'], FALSE); $_uri['path'] = path_normalize($_uri['path']); if ($preserve_rawuri) $_uri['rawuri'] = & $_uri[0]; @@ -338,6 +339,7 @@ function _preg_replace_callback_domain_exposure($matches = array()) // NOTE: It's maybe danger to var_dump(result). [e.g. 'javascript:'] // [OK] http://victim.example.org/go?http%3A%2F%2Fnasty.example.org // [OK] http://victim.example.org/http://nasty.example.org +// TODO: link.toolbot.com, urlx.org function spam_uri_pickup_preprocess($string = '') { if (! is_string($string)) return ''; @@ -347,10 +349,18 @@ function spam_uri_pickup_preprocess($string = '') // Domain exposure (See _preg_replace_callback_domain_exposure()) $string = preg_replace_callback( array( - // Something Google: http://www.google.com/supported_domains - '#(http)://([a-z0-9.]+\.google\.[a-z]{2,3}(?:\.[a-z]{2})?)/' . - '([a-z0-9?=&.%_+-]+)' . // ?query=foo+ - '\bsite:([a-z0-9.%_-]+)' . // site:nasty.example.com + '#(http)://' . + '(' . + // Something Google: http://www.google.com/supported_domains + '(?:[a-z0-9.]+\.)?google\.[a-z]{2,3}(?:\.[a-z]{2})?' . + '|' . + // AltaVista + '(?:[a-z0-9.]+\.)?altavista.com' . + + ')' . + '/' . + '([a-z0-9?=&.%_/\'\\\+-]+)' . // path/?query=foo+bar+ + '\bsite:([a-z0-9.%_-]+\.[a-z0-9.%_-]+)' . // site:nasty.example.com //'()' . // Preserve or remove? '#i', ), @@ -451,6 +461,25 @@ function scheme_normalize($scheme = '', $considerd_harmfull = TRUE) return $scheme; } +// Hostname normlization +// www.foo => www.foo ('foo' seems TLD) +// www.foo.bar => foo.bar +// www.10.20 => www.10.20 (Invalid hostname) +// NOTE: +// 'www' is mostly used as traditional hostname of WWW server. +// 'www.foo.bar' may be identical with 'foo.bar'. +function host_normalize($host = '') +{ + $host = strtolower($host); + + $matches = array(); + if (preg_match('/^www\.(.+\.[a-z]+)$/', $host, $matches)) { + return $matches[1]; + } else { + return $host; + } +} + // Port normalization: Suppress the (redundant) default port // HTTP://example.org:80/ => http://example.org/ // HTTP://example.org:8080/ => http://example.org:8080/ @@ -615,6 +644,10 @@ function query_normalize($string = '', $equal = FALSE, $equal_cutempty = TRUE) // --------------------- // Part One : Checker +// Rough implementation of globbing +// +// USAGE: $regex = '/^' . generate_glob_regex('*.txt', '/') . '$/i'; +// function generate_glob_regex($string = '', $divider = '/') { static $from = array( @@ -636,63 +669,103 @@ function generate_glob_regex($string = '', $divider = '/') // 23 => ']', ); - if (is_array($string)) { - // Recurse - return '(?:' . - implode('|', // OR - array_map('generate_glob_regex', - $string, - array_pad(array(), count($string), $divider) - ) - ) . - ')'; + $string = str_replace($from, $mid, $string); // Hide + $string = preg_quote($string, $divider); + $string = str_replace($mid, $to, $string); // Unhide + + return $string; +} + +// Rough hostname checker +// [OK] 192.168. +// TODO: Strict digit, 0x, CIDR, IPv6 +function is_ip($string = '') +{ + if (preg_match('/^' . + '(?:[0-9]{1,3}\.){3}[0-9]{1,3}' . '|' . + '(?:[0-9]{1,3}\.){1,3}' . '$/', + $string)) { + return 4; // Seems IPv4(dot-decimal) } else { - $string = str_replace($from, $mid, $string); // Hide - $string = preg_quote($string, $divider); - $string = str_replace($mid, $to, $string); // Unhide - return $string; + return 0; // Seems not IP } } -function get_blocklist($list = '') +// Generate host (FQDN, IPv4, ...) regex +// 'localhost' : Matches with 'localhost' only +// 'example.org' : Matches with 'example.org', and 'www.example.org' +// '.example.org' : Matches with ALL FQDN ended with '.example.org' +// '*.example.org' : Almost the same of '.example.org' except 'www.example.org' +// '10.20.30.40' : Matches with IPv4 address '10.20.30.40' only +// '192168.' : Matches with all IPv4 hosts started with '192.' +// TODO: IPv4, CIDR?, IPv6 +function generate_host_regex($string = '', $divider = '/') { - static $regex; - - if (! isset($regex)) { - $regex = array(); - - // Sample - if (FALSE) { - $blocklist['badhost'] = array( - //'*', // Deny all uri - //'10.20.*.*', // 10.20.example.com also matches - //'*.blogspot.com', // Blog services subdomains - //array('blogspot.com', '*.blogspot.com') - ); - foreach ($blocklist['badhost'] as $part) { - $_part = is_array($part) ? implode('/', $part) : $part; - $regex['badhost'][$_part] = '/^' . generate_glob_regex($part) . '$/i'; - } + if (mb_strpos($string, '.') === FALSE) + return generate_glob_regex($string, $divider); + + $result = ''; + if (is_ip($string)) { + // IPv4 + return generate_glob_regex($string, $divider); + } else { + // FQDN or something + $part = explode('.', $string, 2); + if ($part[0] == '') { + $part[0] = '(?:.*\.)?'; // And all related FQDN + } else if ($part[0] == '*') { + $part[0] = '.*\.'; // All subdomains/hosts only + } else { + return generate_glob_regex($string, $divider); } + $part[1] = generate_glob_regex($part[1], $divider); + return implode('', $part); + } +} - // Load +function get_blocklist($list = '') +{ + static $regexs; + + if (! isset($regexs)) { + $regexs = array(); if (file_exists(SPAM_INI_FILE)) { $blocklist = array(); - require(SPAM_INI_FILE); - foreach(array('goodhost', 'badhost') as $key) { - if (! isset($blocklist[$key])) continue; - foreach ($blocklist[$key] as $part) { - $_part = is_array($part) ? implode('/', $part) : $part; - $regex[$key][$_part] = '/^' . generate_glob_regex($part) . '$/i'; + include(SPAM_INI_FILE); + // $blocklist['badhost'] = array( + // '*.blogspot.com', // Blog services's subdomains (only) + // 'IANA-examples' => '#^(?:.*\.)?example\.(?:com|net|org)$#', + // ); + foreach(array('goodhost', 'badhost') as $_list) { + if (! isset($blocklist[$list])) continue; + foreach ($blocklist[$_list] as $key => $value) { + if (is_array($value)) { + $regexs[$_list][$key] = array(); + foreach($value as $_key => $_value) { + if (is_string($_key)) { + $regexs[$_list][$key][$_key] = $_value; // A regex + } else { + $regexs[$_list][$key][$_value] = + '/^' . generate_host_regex($_value, '/') . '$/i'; + } + } + } else { + if (is_string($key)) { + $regexs[$_list][$key] = $value; // A regex + } else { + $regexs[$_list][$value] = + '/^' . generate_host_regex($value, '/') . '$/i'; + } + } } } } } if ($list == '') { - return $regex; - } else if (isset($regex[$list])) { - return $regex[$list]; + return $regexs; // ALL + } else if (isset($regexs[$list])) { + return $regexs[$list]; } else { return array(); } @@ -707,18 +780,31 @@ function is_badhost($hosts = array(), $asap = TRUE, & $remains) } if (empty($hosts)) return $result; - foreach (get_blocklist('goodhost') as $_regex) { - $hosts = preg_grep_invert($_regex, $hosts); + foreach (get_blocklist('goodhost') as $regex) { + $hosts = preg_grep_invert($regex, $hosts); } if (empty($hosts)) return $result; $tmp = array(); - foreach (get_blocklist('badhost') as $part => $_regex) { - $result[$part] = preg_grep($_regex, $hosts); - if (empty($result[$part])) { - unset($result[$part]); + foreach (get_blocklist('badhost') as $label => $regex) { + if (is_array($regex)) { + $result[$label] = array(); + foreach($regex as $_label => $_regex) { + $_group = preg_grep($_regex, $hosts); + if ($_group) { + $result[$label][$_label] = $_group; + $hosts = array_diff($hosts, $_group); + if ($asap) break; + } + } + if (empty($result[$label])) unset($result[$label]); } else { - if ($asap) break; + $_group = preg_grep($regex, $hosts); + if ($_group) { + $result[$label] = $_group; + $hosts = array_diff($hosts, $result[$label]); + if ($asap) break; + } } } @@ -737,7 +823,7 @@ function check_uri_spam_method($times = 1, $t_area = 0, $rule = TRUE) // Thresholds 'quantity' => 8 * $times, // Allow N URIs 'non_uniqhost' => 3 * $times, // Allow N duped (and normalized) Hosts - 'non_uniquri' => 0 * $times, // Allow N duped (and normalized) URIs + //'non_uniquri'=> 3 * $times, // Allow N duped (and normalized) URIs // Areas 'area_anchor' => $t_area, // Using HTML tag @@ -785,13 +871,15 @@ function check_uri_spam($target = '', $method = array()) 'is_spam' => array(), 'method' => & $method, 'remains' => array(), + 'error' => array(), ); $sum = & $progress['sum']; $is_spam = & $progress['is_spam']; $remains = & $progress['remains']; + $error = & $progress['error']; $asap = isset($method['asap']); - // Return if ... + // Recurse if (is_array($target)) { foreach($target as $str) { // Recurse @@ -799,6 +887,7 @@ function check_uri_spam($target = '', $method = array()) $_sum = & $_progress['sum']; $_is_spam = & $_progress['is_spam']; $_remains = & $_progress['remains']; + $_error = & $_progress['error']; foreach (array_keys($_sum) as $key) { $sum[$key] += $_sum[$key]; } @@ -818,9 +907,14 @@ function check_uri_spam($target = '', $method = array()) } foreach ($_remains as $key=>$value) { foreach ($value as $_key=>$_value) { - $remains[$key][$_key] = $_value; + if (is_int($_key)) { + $remains[$key][] = $_value; + } else { + $remains[$key][$_key] = $_value; + } } } + if (! empty($_error)) $error += $_error; if ($asap && $is_spam) break; } return $progress; @@ -853,14 +947,14 @@ function check_uri_spam($target = '', $method = array()) } // Return if ... - if ($asap && $is_spam) { - return $progress; - } - // URI Init + if ($asap && $is_spam) return $progress; + + // URI: Pickup $pickups = spam_uri_pickup($target, $method); - if (empty($pickups)) { - return $progress; - } + //$remains['uri_pickup'] = & $pickups; + + // Return if ... + if (empty($pickups)) return $progress; // URI: Check quantity $sum['quantity'] += count($pickups); @@ -928,14 +1022,13 @@ function check_uri_spam($target = '', $method = array()) } // Return if ... - if ($asap && $is_spam) { - return $progress; - } + if ($asap && $is_spam) return $progress; // Host: Uniqueness (uniq / non-uniq) $hosts = array(); foreach ($pickups as $pickup) $hosts[] = & $pickup['host']; $hosts = array_unique($hosts); + //$remains['uniqhost'] = & $hosts; $sum['uniqhost'] += count($hosts); if ((! $asap || ! $is_spam) && isset($method['non_uniqhost'])) { $sum['non_uniqhost'] = $sum['quantity'] - $sum['uniqhost']; @@ -945,25 +1038,21 @@ function check_uri_spam($target = '', $method = array()) } // Return if ... - if ($asap && $is_spam) { - return $progress; - } + if ($asap && $is_spam) return $progress; // URI: Bad host if ((! $asap || ! $is_spam) && isset($method['badhost'])) { - if ($asap) { - $badhost = is_badhost($hosts, $asap); - } else { - $__remains = array(); - $badhost = is_badhost($hosts, $asap, $__remains); + $__remains = array(); + $badhost = is_badhost($hosts, $asap, $__remains); + if (! $asap) { if ($__remains) { - $progress['remains']['badhost'] = array(); + $remains['badhost'] = array(); foreach ($__remains as $value) { - $progress['remains']['badhost'][$value] = TRUE; + $remains['badhost'][$value] = TRUE; } - unset($__remains); } } + unset($__remains); if (! empty($badhost)) { $sum['badhost'] += array_count_leaves($badhost); foreach(array_keys($badhost) as $keys) { @@ -1077,12 +1166,22 @@ function pkwk_spamnotify($action, $page, $target = array('title' => ''), $progre } $summary['DETAIL_BADHOST'] = implode(', ', $badhost); } + if (! $asap && $progress['remains']['badhost']) { + $count = count($progress['remains']['badhost']); + $summary['DETAIL_NEUTRAL_HOST'] = $count . + ' (' . + preg_replace( + '/[^, a-z0-9.-]/i', '', + implode(', ', array_keys($progress['remains']['badhost'])) + ) . + ')'; + } $summary['COMMENT'] = $action; $summary['PAGE'] = '[blocked] ' . (is_pagename($page) ? $page : ''); $summary['URI'] = get_script_uri() . '?' . rawurlencode($page); $summary['USER_AGENT'] = TRUE; $summary['REMOTE_ADDR'] = TRUE; - pkwk_mail_notify($notify_subject, var_export($target, TRUE), $summary); + pkwk_mail_notify($notify_subject, var_export($target, TRUE), $summary, TRUE); } ?>