<?php
-// $Id: spam.php,v 1.196 2007/07/02 14:51:40 henoheno Exp $
-// Copyright (C) 2006-2007 PukiWiki Developers Team
+// $Id: spam.php,v 1.221 2011/01/24 14:51:50 henoheno Exp $
+// Copyright (C) 2006-2009, 2011 PukiWiki Developers Team
// License: GPL v2 or (at your option) any later version
//
// Functions for Concept-work of spam-uri metrics
//
// (PHP 4 >= 4.3.0): preg_match_all(PREG_OFFSET_CAPTURE): $method['uri_XXX'] related feature
-require_once('spam_pickup.php');
-if (! defined('SPAM_INI_FILE')) define('SPAM_INI_FILE', 'spam.ini.php');
-if (! defined('DOMAIN_INI_FILE')) define('DOMAIN_INI_FILE', 'domain.ini.php');
-
-// ---------------------
-// Compat etc
-
-// (PHP 4 >= 4.2.0): var_export(): mail-reporting and dump related
-if (! function_exists('var_export')) {
- function var_export() {
- return 'var_export() is not found on this server' . "\n";
- }
-}
-
-// (PHP 4 >= 4.2.0): preg_grep() enables invert option
-function preg_grep_invert($pattern = '//', $input = array())
-{
- static $invert;
- if (! isset($invert)) $invert = defined('PREG_GREP_INVERT');
-
- if ($invert) {
- return preg_grep($pattern, $input, PREG_GREP_INVERT);
- } else {
- $result = preg_grep($pattern, $input);
- if ($result) {
- return array_diff($input, preg_grep($pattern, $input));
- } else {
- return $input;
- }
- }
-}
-
-
-// ---------------------
-// Utilities
-
-// Very roughly, shrink the lines of var_export()
-// NOTE: If the same data exists, it must be corrupted.
-function var_export_shrink($expression, $return = FALSE, $ignore_numeric_keys = FALSE)
-{
- $result = var_export($expression, TRUE);
-
- $result = preg_replace(
- // Remove a newline and spaces
- '# => \n *array \(#', ' => array (',
- $result
- );
-
- if ($ignore_numeric_keys) {
- $result =preg_replace(
- // Remove numeric keys
- '#^( *)[0-9]+ => #m', '$1',
- $result
- );
- }
-
- if ($return) {
- return $result;
- } else {
- echo $result;
- return NULL;
- }
-}
-
-// Reverse $string with specified delimiter
-function delimiter_reverse($string = 'foo.bar.example.com', $from_delim = '.', $to_delim = '.')
-{
- if (! is_string($string) || ! is_string($from_delim) || ! is_string($to_delim))
- return $string;
-
- // com.example.bar.foo
- return implode($to_delim, array_reverse(explode($from_delim, $string)));
-}
-
-// ksort() by domain
-function ksort_by_domain(& $array)
-{
- $sort = array();
- foreach(array_keys($array) as $key) {
- $sort[delimiter_reverse($key)] = $key;
- }
- ksort($sort, SORT_STRING);
- $result = array();
- foreach($sort as $key) {
- $result[$key] = & $array[$key];
- }
- $array = $result;
-}
-
-// Roughly strings(1) using PCRE
-// This function is useful to:
-// * Reduce the size of data, from removing unprintable binary data
-// * Detect _bare_strings_ from binary data
-// References:
-// http://www.freebsd.org/cgi/man.cgi?query=strings (Man-page of GNU strings)
-// http://www.pcre.org/pcre.txt
-// Note: mb_ereg_replace() is one of mbstring extension's functions
-// and need to init its encoding.
-function strings($binary = '', $min_len = 4, $ignore_space = FALSE, $multibyte = FALSE)
-{
- // String only
- $binary = (is_array($binary) || $binary === TRUE) ? '' : strval($binary);
-
- $regex = $ignore_space ?
- '[^[:graph:] \t\n]+' : // Remove "\0" etc, and readable spaces
- '[^[:graph:][:space:]]+'; // Preserve readable spaces if possible
-
- $binary = $multibyte ?
- mb_ereg_replace($regex, "\n", $binary) :
- preg_replace('/' . $regex . '/s', "\n", $binary);
-
- if ($ignore_space) {
- $binary = preg_replace(
- array(
- '/[ \t]{2,}/',
- '/^[ \t]/m',
- '/[ \t]$/m',
- ),
- array(
- ' ',
- '',
- ''
- ),
- $binary);
- }
-
- if ($min_len > 1) {
- // The last character seems "\n" or not
- $br = (! empty($binary) && $binary[strlen($binary) - 1] == "\n") ? "\n" : '';
-
- $min_len = min(1024, intval($min_len));
- $regex = '/^.{' . $min_len . ',}/S';
- $binary = implode("\n", preg_grep($regex, explode("\n", $binary))) . $br;
- }
-
- return $binary;
-}
-
-
-// ---------------------
-// Utilities: Arrays
-
-// Count leaves (A leaf = value that is not an array, or an empty array)
-function array_count_leaves($array = array(), $count_empty = FALSE)
-{
- if (! is_array($array) || (empty($array) && $count_empty)) return 1;
-
- // Recurse
- $count = 0;
- foreach ($array as $part) {
- $count += array_count_leaves($part, $count_empty);
- }
- return $count;
-}
+// Directory holding the sibling library files; callers may pre-define LIB_DIR
+if (! defined('LIB_DIR')) define('LIB_DIR', './');
+// These files declare functions: load each of them only once, so that a second
+// inclusion from elsewhere does not stop with a "cannot redeclare" fatal error
+require_once(LIB_DIR . 'spam_pickup.php');
+require_once(LIB_DIR . 'spam_util.php');
-// An array-leaves to a flat array
-function array_flat_leaves($array, $unique = TRUE)
-{
- if (! is_array($array)) return $array;
-
- $tmp = array();
- foreach(array_keys($array) as $key) {
- if (is_array($array[$key])) {
- // Recurse
- foreach(array_flat_leaves($array[$key]) as $_value) {
- $tmp[] = $_value;
- }
- } else {
- $tmp[] = & $array[$key];
- }
- }
-
- return $unique ? array_values(array_unique($tmp)) : $tmp;
-}
-
-// $array['something'] => $array['wanted']
-function array_rename_keys(& $array, $keys = array('from' => 'to'), $force = FALSE, $default = '')
-{
- if (! is_array($array) || ! is_array($keys)) return FALSE;
-
- // Nondestructive test
- if (! $force)
- foreach(array_keys($keys) as $from)
- if (! isset($array[$from]))
- return FALSE;
-
- foreach($keys as $from => $to) {
- if ($from === $to) continue;
- if (! $force || isset($array[$from])) {
- $array[$to] = & $array[$from];
- unset($array[$from]);
- } else {
- $array[$to] = $default;
- }
- }
-
- return TRUE;
-}
-
-// Remove redundant values from array()
-function array_unique_recursive($array = array())
-{
- if (! is_array($array)) return $array;
-
- $tmp = array();
- foreach($array as $key => $value){
- if (is_array($value)) {
- $array[$key] = array_unique_recursive($value);
- } else {
- if (isset($tmp[$value])) {
- unset($array[$key]);
- } else {
- $tmp[$value] = TRUE;
- }
- }
- }
-
- return $array;
-}
+if (! defined('SPAM_INI_FILE')) define('SPAM_INI_FILE', 'spam.ini.php');
// ---------------------
-// Part One : Checker
+// Regex
// Rough implementation of globbing
//
{
if (! is_string($string)) return '';
- if (mb_strpos($string, '.') === FALSE)
- return generate_glob_regex($string, $divider);
-
- $result = '';
- if (is_ip($string)) {
- // IPv4
+ if (mb_strpos($string, '.') === FALSE || is_ip($string)) {
+ // "localhost", IPv4, etc
return generate_glob_regex($string, $divider);
- } else {
- // FQDN or something
- $part = explode('.', $string, 2);
- if ($part[0] == '') {
- $part[0] = '(?:.*\.)?'; // And all related FQDN
- } else if ($part[0] == '*') {
- $part[0] = '.*\.'; // All subdomains/hosts only
- } else {
- return generate_glob_regex($string, $divider);
- }
- $part[1] = generate_glob_regex($part[1], $divider);
- return implode('', $part);
}
-}
-// Rough hostname checker
-// [OK] 192.168.
-// TODO: Strict digit, 0x, CIDR, IPv6
-function is_ip($string = '')
-{
- if (preg_match('/^' .
- '(?:[0-9]{1,3}\.){3}[0-9]{1,3}' . '|' .
- '(?:[0-9]{1,3}\.){1,3}' . '$/',
- $string)) {
- return 4; // Seems IPv4(dot-decimal)
+ // FQDN or something
+ $part = explode('.', $string, 2);
+ if ($part[0] == '') {
+ // ".example.org"
+ $part[0] = '(?:.*\.)?';
+ } else if ($part[0] == '*') {
+ // "*.example.org"
+ $part[0] = '.*\.';
} else {
- return 0; // Seems not IP
+ // example.org, etc
+ return generate_glob_regex($string, $divider);
}
+
+ $part[1] = generate_glob_regex($part[1], $divider);
+
+ return implode('', $part);
}
+
+// ---------------------
+// Load
+
+// Load SPAM_INI_FILE and return parsed one
function get_blocklist($list = '')
{
static $regexes;
$regexes = array();
if (file_exists(SPAM_INI_FILE)) {
$blocklist = array();
+
include(SPAM_INI_FILE);
+ // $blocklist['list'] = array(
+ // //'goodhost' => FALSE,
+ // 'badhost' => TRUE,
+ // );
// $blocklist['badhost'] = array(
// '*.blogspot.com', // Blog services's subdomains (only)
// 'IANA-examples' => '#^(?:.*\.)?example\.(?:com|net|org)$#',
// );
- if (isset($blocklist['list'])) {
- $regexes['list'] = & $blocklist['list'];
- } else {
- // Default
- $blocklist['list'] = array(
- 'goodhost' => FALSE,
- 'badhost' => TRUE,
- );
- }
- foreach(array_keys($blocklist['list']) as $_list) {
- if (! isset($blocklist[$_list])) continue;
- foreach ($blocklist[$_list] as $key => $value) {
- if (is_array($value)) {
- $regexes[$_list][$key] = array();
- foreach($value as $_key => $_value) {
- get_blocklist_add($regexes[$_list][$key], $_key, $_value);
+
+ foreach(array(
+ 'pre',
+ 'list',
+ ) as $special) {
+
+ if (! isset($blocklist[$special])) continue;
+
+ $regexes[$special] = $blocklist[$special];
+
+ foreach(array_keys($blocklist[$special]) as $_list) {
+ if (! isset($blocklist[$_list])) continue;
+
+ foreach ($blocklist[$_list] as $key => $value) {
+ if (is_array($value)) {
+ $regexes[$_list][$key] = array();
+ foreach($value as $_key => $_value) {
+ get_blocklist_add($regexes[$_list][$key], $_key, $_value);
+ }
+ } else {
+ get_blocklist_add($regexes[$_list], $key, $value);
}
- } else {
- get_blocklist_add($regexes[$_list], $key, $value);
}
+
+ unset($blocklist[$_list]);
}
- unset($blocklist[$_list]);
}
}
}
if ($list === '') {
- return $regexes; // ALL
+ return $regexes; // ALL of
} else if (isset($regexes[$list])) {
- return $regexes[$list];
+ return $regexes[$list]; // A part of
} else {
- return array();
+ return array(); // Found nothing
}
}
-// Subroutine of get_blocklist()
-function get_blocklist_add(& $array, $key = 0, $value = '*.example.org')
+// Subroutine of get_blocklist(): Add one new regex to the $array
+//   $array : (by reference) The regex list to add the entry to
+//   $key   : A string => the entry is named $key, and $value is stored as-is
+//            (it is treated as a ready-made regex).
+//            Otherwise => $value is a glob-like pattern: it is converted by
+//            generate_host_regex(), and stored under the key $value itself
+//   $value : A regex, or a pattern like '*.example.org'
+// A pattern that gives no regex (an empty result) is skipped silently.
+function get_blocklist_add(& $array, $key = 0, $value = '*.example.org/path/to/file.html')
{
if (is_string($key)) {
- $array[$key] = & $value; // Treat $value as a regex
+ $array[$key] = $value; // Treat $value as a regex for FQDN(host)s (A plain copy: $value is a by-value argument)
} else {
- $array[$value] = '/^' . generate_host_regex($value, '/') . '$/i';
+ $regex = generate_host_regex($value, '#');
+ // Not empty() here: empty('0') is TRUE, and a valid pattern '0' would be dropped
+ if (is_string($regex) && $regex !== '') {
+ $array[$value] = '#^' . $regex . '$#i'; // Anchored, case-insensitive
+ }
}
}
),
);
+ // ----------------------------------------
// Aliases
+
$sum = & $progress['sum'];
$is_spam = & $progress['is_spam'];
$progress['method'] = & $method; // Argument
$hosts = & $progress['hosts'];
$asap = isset($method['asap']);
+ // ----------------------------------------
// Init
+
if (! is_array($method) || empty($method)) {
$method = check_uri_spam_method();
}
}
if (! isset($sum['quantity'])) $sum['quantity'] = 0;
+ // ----------------------------------------
+ // Recurse
+
if (is_array($target)) {
foreach($target as $str) {
if (! is_string($str)) continue;
if ($asap && $is_spam) break;
// Merge only
- $blocked = array_merge_recursive($blocked, $_progress['blocked']);
- $hosts = array_merge_recursive($hosts, $_progress['hosts']);
+ $blocked = array_merge_leaves($blocked, $_progress['blocked'], FALSE);
+ $hosts = array_merge_leaves($hosts, $_progress['hosts'], FALSE);
}
// Unique values
return $progress;
}
- // Area: There's HTML anchor tag
- if ((! $asap || ! $is_spam) && isset($method['area_anchor'])) {
- $key = 'area_anchor';
- $_asap = isset($method['asap']) ? array('asap' => TRUE) : array();
- $result = area_pickup($target, array($key => TRUE) + $_asap);
- if ($result) {
- $sum[$key] = $result[$key];
- if (isset($method[$key]) && $sum[$key] > $method[$key]) {
- $is_spam[$key] = TRUE;
- }
+ // ----------------------------------------
+ // Area measure
+
+ if (! $asap || ! $is_spam) {
+
+ // Method pickup
+ $_method = array();
+ foreach(array(
+ 'area_anchor', // There's HTML anchor tag
+ 'area_bbcode', // There's 'BBCode' linking tag
+ ) as $key) {
+ if (isset($method[$key])) $_method[$key] = TRUE;
}
- }
- // Area: There's 'BBCode' linking tag
- if ((! $asap || ! $is_spam) && isset($method['area_bbcode'])) {
- $key = 'area_bbcode';
- $_asap = isset($method['asap']) ? array('asap' => TRUE) : array();
- $result = area_pickup($target, array($key => TRUE) + $_asap);
- if ($result) {
- $sum[$key] = $result[$key];
- if (isset($method[$key]) && $sum[$key] > $method[$key]) {
- $is_spam[$key] = TRUE;
+ if ($_method) {
+ $_asap = isset($method['asap']) ? array('asap' => TRUE) : array();
+ $_result = area_pickup($target, $_method + $_asap);
+ $_asap = NULL;
+ } else {
+ $_result = FALSE;
+ }
+
+ if ($_result) {
+ foreach(array_keys($_method) as $key) {
+ if (isset($_result[$key])) {
+ $sum[$key] = $_result[$key];
+ if (isset($method[$key]) && $sum[$key] > $method[$key]) {
+ $is_spam[$key] = TRUE;
+ }
+ }
}
}
+
+ unset($_asap, $_method, $_result);
}
// Return if ...
if ($asap && $is_spam) return $progress;
+ // ----------------------------------------
// URI: Pickup
- $pickups = uri_pickup_normalize(spam_uri_pickup($target, $method));
+
+ $pickups = spam_uri_pickup($target, $method);
+
// Return if ...
if (empty($pickups)) return $progress;
+ // Normalize all
+ $pickups = uri_pickup_normalize($pickups);
+
+ // ----------------------------------------
+ // Pickup some part of URI
+
+ $hosts = array();
+ foreach ($pickups as $key => $pickup) {
+ $hosts[$key] = & $pickup['host'];
+ }
+
+ // ----------------------------------------
+ // URI: Bad host <pre-filter> (Separate good/bad hosts from $hosts)
+
+ if ((! $asap || ! $is_spam) && isset($method['badhost'])) {
+ $list = get_blocklist('pre');
+ $blocked = blocklist_distiller($hosts, array_keys($list), $asap);
+ foreach($list as $key => $type){
+ if (! $type) unset($blocked[$key]); // Ignore goodhost etc
+ }
+ unset($list);
+ if (! empty($blocked)) $is_spam['badhost'] = TRUE;
+ }
+
+ // Return if ...
+ if ($asap && $is_spam) return $progress;
+
+ // Remove blocked from $pickups
+ foreach(array_keys($pickups) as $key) {
+ if (! isset($hosts[$key])) {
+ unset($pickups[$key]);
+ }
+ }
+
+ // ----------------------------------------
// URI: Check quantity
+
$sum['quantity'] += count($pickups);
// URI quantity
if ((! $asap || ! $is_spam) && isset($method['quantity']) &&
$is_spam['quantity'] = TRUE;
}
+ // ----------------------------------------
// URI: used inside HTML anchor tag pair
+
if ((! $asap || ! $is_spam) && isset($method['uri_anchor'])) {
$key = 'uri_anchor';
foreach($pickups as $pickup) {
}
}
+ // ----------------------------------------
// URI: used inside 'BBCode' pair
+
if ((! $asap || ! $is_spam) && isset($method['uri_bbcode'])) {
$key = 'uri_bbcode';
foreach($pickups as $pickup) {
}
}
+ // ----------------------------------------
// URI: Uniqueness (and removing non-uniques)
+
if ((! $asap || ! $is_spam) && isset($method['non_uniquri'])) {
$uris = array();
// Return if ...
if ($asap && $is_spam) return $progress;
+ // ----------------------------------------
// Host: Uniqueness (uniq / non-uniq)
- foreach ($pickups as $pickup) $hosts[] = & $pickup['host'];
+
$hosts = array_unique($hosts);
- $sum['uniqhost'] += count($hosts);
+
+ if (isset($sum['uniqhost'])) $sum['uniqhost'] += count($hosts);
if ((! $asap || ! $is_spam) && isset($method['non_uniqhost'])) {
$sum['non_uniqhost'] = $sum['quantity'] - $sum['uniqhost'];
if ($sum['non_uniqhost'] > $method['non_uniqhost']) {
// Return if ...
if ($asap && $is_spam) return $progress;
+ // ----------------------------------------
// URI: Bad host (Separate good/bad hosts from $hosts)
- if ((! $asap || ! $is_spam) && isset($method['badhost'])) {
- // is_badhost()
- $list = get_blocklist('list');
- $blocked = blocklist_distiller($hosts, array_keys($list), $asap);
+ if ((! $asap || ! $is_spam) && isset($method['badhost'])) {
+ $list = get_blocklist('list');
+ $blocked = array_merge_leaves(
+ $blocked,
+ blocklist_distiller($hosts, array_keys($list), $asap),
+ FALSE
+ );
foreach($list as $key=>$type){
if (! $type) unset($blocked[$key]); // Ignore goodhost etc
}
unset($list);
-
if (! empty($blocked)) $is_spam['badhost'] = TRUE;
}
+ // Return if ...
+ //if ($asap && $is_spam) return $progress;
+
+ // ----------------------------------------
+ // End
+
return $progress;
}
} else {
$rest = rtrim(substr($value, 0, - strlen($resp)), '.'); // 'A.foo.bar'
}
- $trie = array_merge_recursive($trie, array($resp => array($rest => NULL)));
+ $trie = array_merge_leaves($trie, array($resp => array($rest => NULL)), FALSE);
}
// Format: var_export_shrink() -like output
$subs = array();
foreach(array_keys($trie[$key]) as $sub) {
if ($sub == '') {
- $subs[] = $key;
+ $subs[] = $key; // 'example.com'
} else {
- $subs[] = $sub . '.' . $key;
+ $subs[] = $sub . '. '; // 'A.foo.bar. '
}
}
$result[] = ' \'' . $key . '\' => \'' . implode(', ', $subs) . '\',';
}
-// Check responsibility-root of the FQDN
-// 'foo.bar.example.com' => 'example.com' (.com has the last whois for it)
-// 'foo.bar.example.au' => 'example.au' (.au has the last whois for it)
-// 'foo.bar.example.edu.au' => 'example.edu.au' (.edu.au has the last whois for it)
-// 'foo.bar.example.act.edu.au' => 'example.act.edu.au' (.act.edu.au has the last whois for it)
-function whois_responsibility($fqdn = 'foo.bar.example.com', $parent = FALSE, $implicit = TRUE)
-{
- static $domain;
-
- if ($fqdn === NULL) {
- $domain = NULL; // Unset
- return '';
- }
- if (! is_string($fqdn)) return '';
-
- if (is_ip($fqdn)) return $fqdn;
-
- if (! isset($domain)) {
- $domain = array();
- if (file_exists(DOMAIN_INI_FILE)) {
- include(DOMAIN_INI_FILE); // Set
- }
- }
-
- $result = array();
- $dcursor = & $domain;
- $array = array_reverse(explode('.', $fqdn));
- $i = 0;
- while(TRUE) {
- if (! isset($array[$i])) break;
- $acursor = $array[$i];
- if (is_array($dcursor) && isset($dcursor[$acursor])) {
- $result[] = & $array[$i];
- $dcursor = & $dcursor[$acursor];
- } else {
- if (! $parent && isset($acursor)) {
- $result[] = & $array[$i]; // Whois servers must know this subdomain
- }
- break;
- }
- ++$i;
- }
-
- // Implicit responsibility: Top-Level-Domains must not be yours
- // 'bar.foo.something' => 'foo.something'
- if ($implicit && count($result) == 1 && count($array) > 1) {
- $result[] = & $array[1];
- }
-
- return $result ? implode('.', array_reverse($result)) : '';
-}
-
-
// ---------------------
// Exit
break;
case 'dump':
echo('<pre>' . "\n");
- echo htmlspecialchars(var_export($data, TRUE));
+ echo htmlsc(var_export($data, TRUE));
echo('</pre>' . "\n");
break;
};