X-Git-Url: http://git.osdn.net/view?p=pukiwiki%2Fpukiwiki_sandbox.git;a=blobdiff_plain;f=spam%2Fspam.php;h=264c03cc6879683a16f400365f3ef62233037369;hp=e85ca0ec63ec117623e967451c80d8a5082157ed;hb=9b03a9b54499eef80458f77ea3465599624a97fe;hpb=0f227d6b56ebdd264d831852137d8acba59f247c diff --git a/spam/spam.php b/spam/spam.php index e85ca0e..264c03c 100644 --- a/spam/spam.php +++ b/spam/spam.php @@ -1,234 +1,22 @@ = 4.3.0): preg_match_all(PREG_OFFSET_CAPTURE): $method['uri_XXX'] related feature -if (! defined('SPAM_INI_FILE')) define('SPAM_INI_FILE', 'spam.ini.php'); -if (! defined('DOMAIN_INI_FILE')) define('DOMAIN_INI_FILE', 'domain.ini.php'); - -// --------------------- -// Compat etc - -// (PHP 4 >= 4.2.0): var_export(): mail-reporting and dump related -if (! function_exists('var_export')) { - function var_export() { - return 'var_export() is not found on this server' . "\n"; - } -} - -// (PHP 4 >= 4.2.0): preg_grep() enables invert option -function preg_grep_invert($pattern = '//', $input = array()) -{ - static $invert; - if (! isset($invert)) $invert = defined('PREG_GREP_INVERT'); - - if ($invert) { - return preg_grep($pattern, $input, PREG_GREP_INVERT); - } else { - $result = preg_grep($pattern, $input); - if ($result) { - return array_diff($input, preg_grep($pattern, $input)); - } else { - return $input; - } - } -} - - -// --------------------- -// Utilities - -// Very roughly, shrink the lines of var_export() -// NOTE: If the same data exists, it must be corrupted. -function var_export_shrink($expression, $return = FALSE, $ignore_numeric_keys = FALSE) -{ - $result = var_export($expression, TRUE); - - $result = preg_replace( - // Remove a newline and spaces - '# => \n *array \(#', ' => array (', - $result - ); - - if ($ignore_numeric_keys) { - $result =preg_replace( - // Remove numeric keys - '#^( *)[0-9]+ => #m', '$1', - $result - ); - } - - if ($return) { - return $result; - } else { - echo $result; - return NULL; - } -} - -// Reverse $string with specified delimiter -function delimiter_reverse($string = 'foo.bar.example.com', $from_delim = '.', $to_delim = '.') -{ - if (! is_string($string) || ! is_string($from_delim) || ! is_string($to_delim)) - return $string; - - // com.example.bar.foo - return implode($to_delim, array_reverse(explode($from_delim, $string))); -} - -// ksort() by domain -function ksort_by_domain(& $array) -{ - $sort = array(); - foreach(array_keys($array) as $key) { - $sort[delimiter_reverse($key)] = $key; - } - ksort($sort, SORT_STRING); - $result = array(); - foreach($sort as $key) { - $result[$key] = & $array[$key]; - } - $array = $result; -} - -// Roughly strings(1) using PCRE -// This function is useful to: -// * Reduce the size of data, from removing unprintable binary data -// * Detect _bare_strings_ from binary data -// References: -// http://www.freebsd.org/cgi/man.cgi?query=strings (Man-page of GNU strings) -// http://www.pcre.org/pcre.txt -// Note: mb_ereg_replace() is one of mbstring extension's functions -// and need to init its encoding. -function strings($binary = '', $min_len = 4, $ignore_space = FALSE, $multibyte = FALSE) -{ - // String only - $binary = (is_array($binary) || $binary === TRUE) ? '' : strval($binary); - - $regex = $ignore_space ? - '[^[:graph:] \t\n]+' : // Remove "\0" etc, and readable spaces - '[^[:graph:][:space:]]+'; // Preserve readable spaces if possible - - $binary = $multibyte ? - mb_ereg_replace($regex, "\n", $binary) : - preg_replace('/' . $regex . '/s', "\n", $binary); - - if ($ignore_space) { - $binary = preg_replace( - array( - '/[ \t]{2,}/', - '/^[ \t]/m', - '/[ \t]$/m', - ), - array( - ' ', - '', - '' - ), - $binary); - } - - if ($min_len > 1) { - // The last character seems "\n" or not - $br = (! empty($binary) && $binary[strlen($binary) - 1] == "\n") ? "\n" : ''; - - $min_len = min(1024, intval($min_len)); - $regex = '/^.{' . $min_len . ',}/S'; - $binary = implode("\n", preg_grep($regex, explode("\n", $binary))) . $br; - } - - return $binary; -} - - -// --------------------- -// Utilities: Arrays - -// Count leaves (A leaf = value that is not an array, or an empty array) -function array_count_leaves($array = array(), $count_empty = FALSE) -{ - if (! is_array($array) || (empty($array) && $count_empty)) return 1; - - // Recurse - $count = 0; - foreach ($array as $part) { - $count += array_count_leaves($part, $count_empty); - } - return $count; -} -// An array-leaves to a flat array -function array_flat_leaves($array, $unique = TRUE) -{ - if (! is_array($array)) return $array; - - $tmp = array(); - foreach(array_keys($array) as $key) { - if (is_array($array[$key])) { - // Recurse - foreach(array_flat_leaves($array[$key]) as $_value) { - $tmp[] = $_value; - } - } else { - $tmp[] = & $array[$key]; - } - } +if (! defined('LIB_DIR')) define('LIB_DIR', './'); +require(LIB_DIR . 'spam_pickup.php'); +require(LIB_DIR . 'spam_util.php'); - return $unique ? array_values(array_unique($tmp)) : $tmp; -} - -// $array['something'] => $array['wanted'] -function array_rename_keys(& $array, $keys = array('from' => 'to'), $force = FALSE, $default = '') -{ - if (! is_array($array) || ! is_array($keys)) return FALSE; - - // Nondestructive test - if (! $force) - foreach(array_keys($keys) as $from) - if (! isset($array[$from])) - return FALSE; - - foreach($keys as $from => $to) { - if ($from === $to) continue; - if (! $force || isset($array[$from])) { - $array[$to] = & $array[$from]; - unset($array[$from]); - } else { - $array[$to] = $default; - } - } - - return TRUE; -} - -// Remove redundant values from array() -function array_unique_recursive($array = array()) -{ - if (! is_array($array)) return $array; - - $tmp = array(); - foreach($array as $key => $value){ - if (is_array($value)) { - $array[$key] = array_unique_recursive($value); - } else { - if (isset($tmp[$value])) { - unset($array[$key]); - } else { - $tmp[$value] = TRUE; - } - } - } - - return $array; -} +if (! defined('SPAM_INI_FILE')) define('SPAM_INI_FILE', 'spam.ini.php'); // --------------------- -// Part One : Checker +// Regex // Rough implementation of globbing // @@ -276,43 +64,34 @@ function generate_host_regex($string = '', $divider = '/') { if (! is_string($string)) return ''; - if (mb_strpos($string, '.') === FALSE) + if (mb_strpos($string, '.') === FALSE || is_ip($string)) { + // "localhost", IPv4, etc return generate_glob_regex($string, $divider); - - $result = ''; - if (is_ip($string)) { - // IPv4 - return generate_glob_regex($string, $divider); - } else { - // FQDN or something - $part = explode('.', $string, 2); - if ($part[0] == '') { - $part[0] = '(?:.*\.)?'; // And all related FQDN - } else if ($part[0] == '*') { - $part[0] = '.*\.'; // All subdomains/hosts only - } else { - return generate_glob_regex($string, $divider); - } - $part[1] = generate_glob_regex($part[1], $divider); - return implode('', $part); } -} -// Rough hostname checker -// [OK] 192.168. -// TODO: Strict digit, 0x, CIDR, IPv6 -function is_ip($string = '') -{ - if (preg_match('/^' . - '(?:[0-9]{1,3}\.){3}[0-9]{1,3}' . '|' . - '(?:[0-9]{1,3}\.){1,3}' . '$/', - $string)) { - return 4; // Seems IPv4(dot-decimal) + // FQDN or something + $part = explode('.', $string, 2); + if ($part[0] == '') { + // ".example.org" + $part[0] = '(?:.*\.)?'; + } else if ($part[0] == '*') { + // "*.example.org" + $part[0] = '.*\.'; } else { - return 0; // Seems not IP + // example.org, etc + return generate_glob_regex($string, $divider); } + + $part[1] = generate_glob_regex($part[1], $divider); + + return implode('', $part); } + +// --------------------- +// Load + +// Load SPAM_INI_FILE and return parsed one function get_blocklist($list = '') { static $regexes; @@ -326,53 +105,65 @@ function get_blocklist($list = '') $regexes = array(); if (file_exists(SPAM_INI_FILE)) { $blocklist = array(); + include(SPAM_INI_FILE); + // $blocklist['list'] = array( + // //'goodhost' => FALSE; + // 'badhost' => TRUE; + // ); // $blocklist['badhost'] = array( // '*.blogspot.com', // Blog services's subdomains (only) // 'IANA-examples' => '#^(?:.*\.)?example\.(?:com|net|org)$#', // ); - if (isset($blocklist['list'])) { - $regexes['list'] = & $blocklist['list']; - } else { - // Default - $blocklist['list'] = array( - 'goodhost' => FALSE, - 'badhost' => TRUE, - ); - } - foreach(array_keys($blocklist['list']) as $_list) { - if (! isset($blocklist[$_list])) continue; - foreach ($blocklist[$_list] as $key => $value) { - if (is_array($value)) { - $regexes[$_list][$key] = array(); - foreach($value as $_key => $_value) { - get_blocklist_add($regexes[$_list][$key], $_key, $_value); + + foreach(array( + 'pre', + 'list', + ) as $special) { + + if (! isset($blocklist[$special])) continue; + + $regexes[$special] = $blocklist[$special]; + + foreach(array_keys($blocklist[$special]) as $_list) { + if (! isset($blocklist[$_list])) continue; + + foreach ($blocklist[$_list] as $key => $value) { + if (is_array($value)) { + $regexes[$_list][$key] = array(); + foreach($value as $_key => $_value) { + get_blocklist_add($regexes[$_list][$key], $_key, $_value); + } + } else { + get_blocklist_add($regexes[$_list], $key, $value); } - } else { - get_blocklist_add($regexes[$_list], $key, $value); } + + unset($blocklist[$_list]); } - unset($blocklist[$_list]); } } } if ($list === '') { - return $regexes; // ALL + return $regexes; // ALL of } else if (isset($regexes[$list])) { - return $regexes[$list]; + return $regexes[$list]; // A part of } else { - return array(); + return array(); // Found nothing } } -// Subroutine of get_blocklist() -function get_blocklist_add(& $array, $key = 0, $value = '*.example.org') +// Subroutine of get_blocklist(): Add new regex to the $array +function get_blocklist_add(& $array, $key = 0, $value = '*.example.org/path/to/file.html') { if (is_string($key)) { - $array[$key] = & $value; // Treat $value as a regex + $array[$key] = & $value; // Treat $value as a regex for FQDN(host)s } else { - $array[$value] = '/^' . generate_host_regex($value, '/') . '$/i'; + $regex = generate_host_regex($value, '#'); + if (! empty($regex)) { + $array[$value] = '#^' . $regex . '$#i'; + } } } @@ -487,7 +278,9 @@ function check_uri_spam($target = '', $method = array()) ), ); + // ---------------------------------------- // Aliases + $sum = & $progress['sum']; $is_spam = & $progress['is_spam']; $progress['method'] = & $method; // Argument @@ -495,7 +288,9 @@ function check_uri_spam($target = '', $method = array()) $hosts = & $progress['hosts']; $asap = isset($method['asap']); + // ---------------------------------------- // Init + if (! is_array($method) || empty($method)) { $method = check_uri_spam_method(); } @@ -504,6 +299,9 @@ function check_uri_spam($target = '', $method = array()) } if (! isset($sum['quantity'])) $sum['quantity'] = 0; + // ---------------------------------------- + // Recurse + if (is_array($target)) { foreach($target as $str) { if (! is_string($str)) continue; @@ -529,8 +327,8 @@ function check_uri_spam($target = '', $method = array()) if ($asap && $is_spam) break; // Merge only - $blocked = array_merge_recursive($blocked, $_progress['blocked']); - $hosts = array_merge_recursive($hosts, $_progress['hosts']); + $blocked = array_merge_leaves($blocked, $_progress['blocked'], FALSE); + $hosts = array_merge_leaves($hosts, $_progress['hosts'], FALSE); } // Unique values @@ -543,42 +341,91 @@ function check_uri_spam($target = '', $method = array()) return $progress; } - // Area: There's HTML anchor tag - if ((! $asap || ! $is_spam) && isset($method['area_anchor'])) { - $key = 'area_anchor'; - $_asap = isset($method['asap']) ? array('asap' => TRUE) : array(); - $result = area_pickup($target, array($key => TRUE) + $_asap); - if ($result) { - $sum[$key] = $result[$key]; - if (isset($method[$key]) && $sum[$key] > $method[$key]) { - $is_spam[$key] = TRUE; - } + // ---------------------------------------- + // Area measure + + if (! $asap || ! $is_spam) { + + // Method pickup + $_method = array(); + foreach(array( + 'area_anchor', // There's HTML anchor tag + 'area_bbcode', // There's 'BBCode' linking tag + ) as $key) { + if (isset($method[$key])) $_method[$key] = TRUE; } - } - // Area: There's 'BBCode' linking tag - if ((! $asap || ! $is_spam) && isset($method['area_bbcode'])) { - $key = 'area_bbcode'; - $_asap = isset($method['asap']) ? array('asap' => TRUE) : array(); - $result = area_pickup($target, array($key => TRUE) + $_asap); - if ($result) { - $sum[$key] = $result[$key]; - if (isset($method[$key]) && $sum[$key] > $method[$key]) { - $is_spam[$key] = TRUE; + if ($_method) { + $_asap = isset($method['asap']) ? array('asap' => TRUE) : array(); + $_result = area_pickup($target, $_method + $_asap); + $_asap = NULL; + } else { + $_result = FALSE; + } + + if ($_result) { + foreach(array_keys($_method) as $key) { + if (isset($_result[$key])) { + $sum[$key] = $_result[$key]; + if (isset($method[$key]) && $sum[$key] > $method[$key]) { + $is_spam[$key] = TRUE; + } + } } } + + unset($_asap, $_method, $_result); } // Return if ... if ($asap && $is_spam) return $progress; + // ---------------------------------------- // URI: Pickup - $pickups = uri_pickup_normalize(spam_uri_pickup($target, $method)); + + $pickups = spam_uri_pickup($target, $method); + // Return if ... if (empty($pickups)) return $progress; + // Normalize all + $pickups = uri_pickup_normalize($pickups); + + // ---------------------------------------- + // Pickup some part of URI + + $hosts = array(); + foreach ($pickups as $key => $pickup) { + $hosts[$key] = & $pickup['host']; + } + + // ---------------------------------------- + // URI: Bad host (Separate good/bad hosts from $hosts) + + if ((! $asap || ! $is_spam) && isset($method['badhost'])) { + $list = get_blocklist('pre'); + $blocked = blocklist_distiller($hosts, array_keys($list), $asap); + foreach($list as $key => $type){ + if (! $type) unset($blocked[$key]); // Ignore goodhost etc + } + unset($list); + if (! empty($blocked)) $is_spam['badhost'] = TRUE; + } + + // Return if ... + if ($asap && $is_spam) return $progress; + + // Remove blocked from $pickups + foreach(array_keys($pickups) as $key) { + if (! isset($hosts[$key])) { + unset($pickups[$key]); + } + } + + // ---------------------------------------- // URI: Check quantity + $sum['quantity'] += count($pickups); // URI quantity if ((! $asap || ! $is_spam) && isset($method['quantity']) && @@ -586,7 +433,9 @@ function check_uri_spam($target = '', $method = array()) $is_spam['quantity'] = TRUE; } + // ---------------------------------------- // URI: used inside HTML anchor tag pair + if ((! $asap || ! $is_spam) && isset($method['uri_anchor'])) { $key = 'uri_anchor'; foreach($pickups as $pickup) { @@ -602,7 +451,9 @@ function check_uri_spam($target = '', $method = array()) } } + // ---------------------------------------- // URI: used inside 'BBCode' pair + if ((! $asap || ! $is_spam) && isset($method['uri_bbcode'])) { $key = 'uri_bbcode'; foreach($pickups as $pickup) { @@ -618,7 +469,9 @@ function check_uri_spam($target = '', $method = array()) } } + // ---------------------------------------- // URI: Uniqueness (and removing non-uniques) + if ((! $asap || ! $is_spam) && isset($method['non_uniquri'])) { $uris = array(); @@ -643,10 +496,12 @@ function check_uri_spam($target = '', $method = array()) // Return if ... if ($asap && $is_spam) return $progress; + // ---------------------------------------- // Host: Uniqueness (uniq / non-uniq) - foreach ($pickups as $pickup) $hosts[] = & $pickup['host']; + $hosts = array_unique($hosts); - $sum['uniqhost'] += count($hosts); + + if (isset($sum['uniqhost'])) $sum['uniqhost'] += count($hosts); if ((! $asap || ! $is_spam) && isset($method['non_uniqhost'])) { $sum['non_uniqhost'] = $sum['quantity'] - $sum['uniqhost']; if ($sum['non_uniqhost'] > $method['non_uniqhost']) { @@ -657,20 +512,29 @@ function check_uri_spam($target = '', $method = array()) // Return if ... if ($asap && $is_spam) return $progress; + // ---------------------------------------- // URI: Bad host (Separate good/bad hosts from $hosts) - if ((! $asap || ! $is_spam) && isset($method['badhost'])) { - // is_badhost() - $list = get_blocklist('list'); - $blocked = blocklist_distiller($hosts, array_keys($list), $asap); + if ((! $asap || ! $is_spam) && isset($method['badhost'])) { + $list = get_blocklist('list'); + $blocked = array_merge_leaves( + $blocked, + blocklist_distiller($hosts, array_keys($list), $asap), + FALSE + ); foreach($list as $key=>$type){ if (! $type) unset($blocked[$key]); // Ignore goodhost etc } unset($list); - if (! empty($blocked)) $is_spam['badhost'] = TRUE; } + // Return if ... + //if ($asap && $is_spam) return $progress; + + // ---------------------------------------- + // End + return $progress; } @@ -746,7 +610,7 @@ function summarize_detail_newtral($progress = array()) } else { $rest = rtrim(substr($value, 0, - strlen($resp)), '.'); // 'A.foo.bar' } - $trie = array_merge_recursive($trie, array($resp => array($rest => NULL))); + $trie = array_merge_leaves($trie, array($resp => array($rest => NULL)), FALSE); } // Format: var_export_shrink() -like output @@ -762,9 +626,9 @@ function summarize_detail_newtral($progress = array()) $subs = array(); foreach(array_keys($trie[$key]) as $sub) { if ($sub == '') { - $subs[] = $key; + $subs[] = $key; // 'example.com' } else { - $subs[] = $sub . '.' . $key; + $subs[] = $sub . '. '; // 'A.foo.bar. ' } } $result[] = ' \'' . $key . '\' => \'' . implode(', ', $subs) . '\','; @@ -778,59 +642,6 @@ function summarize_detail_newtral($progress = array()) } -// Check responsibility-root of the FQDN -// 'foo.bar.example.com' => 'example.com' (.com has the last whois for it) -// 'foo.bar.example.au' => 'example.au' (.au has the last whois for it) -// 'foo.bar.example.edu.au' => 'example.edu.au' (.edu.au has the last whois for it) -// 'foo.bar.example.act.edu.au' => 'example.act.edu.au' (.act.edu.au has the last whois for it) -function whois_responsibility($fqdn = 'foo.bar.example.com', $parent = FALSE, $implicit = TRUE) -{ - static $domain; - - if ($fqdn === NULL) { - $domain = NULL; // Unset - return ''; - } - if (! is_string($fqdn)) return ''; - - if (is_ip($fqdn)) return $fqdn; - - if (! isset($domain)) { - $domain = array(); - if (file_exists(DOMAIN_INI_FILE)) { - include(DOMAIN_INI_FILE); // Set - } - } - - $result = array(); - $dcursor = & $domain; - $array = array_reverse(explode('.', $fqdn)); - $i = 0; - while(TRUE) { - if (! isset($array[$i])) break; - $acursor = $array[$i]; - if (is_array($dcursor) && isset($dcursor[$acursor])) { - $result[] = & $array[$i]; - $dcursor = & $dcursor[$acursor]; - } else { - if (! $parent && isset($acursor)) { - $result[] = & $array[$i]; // Whois servers must know this subdomain - } - break; - } - ++$i; - } - - // Implicit responsibility: Top-Level-Domains must not be yours - // 'bar.foo.something' => 'foo.something' - if ($implicit && count($result) == 1 && count($array) > 1) { - $result[] = & $array[1]; - } - - return $result ? implode('.', array_reverse($result)) : ''; -} - - // --------------------- // Exit @@ -853,7 +664,7 @@ function spam_exit($mode = '', $data = array()) break; case 'dump': echo('
' . "\n");
-			echo htmlspecialchars(var_export($data, TRUE));
+			echo htmlsc(var_export($data, TRUE));
 			echo('
' . "\n"); break; };