<?php
-// $Id: spam.php,v 1.164 2007/05/18 14:12:40 henoheno Exp $
+// $Id: spam.php,v 1.195 2007/06/29 15:35:53 henoheno Exp $
// Copyright (C) 2006-2007 PukiWiki Developers Team
// License: GPL v2 or (at your option) any later version
//
//
// (PHP 4 >= 4.3.0): preg_match_all(PREG_OFFSET_CAPTURE): $method['uri_XXX'] related feature
-if (! defined('SPAM_INI_FILE')) define('SPAM_INI_FILE', 'spam.ini.php');
+if (! defined('SPAM_INI_FILE')) define('SPAM_INI_FILE', 'spam.ini.php');
+if (! defined('DOMAIN_INI_FILE')) define('DOMAIN_INI_FILE', 'domain.ini.php');
// ---------------------
// Compat etc
// NOTE: If the same data exists, it must be corrupted.
function var_export_shrink($expression, $return = FALSE, $ignore_numeric_keys = FALSE)
{
- $result =preg_replace(
+ $result = var_export($expression, TRUE);
+
+ $result = preg_replace(
// Remove a newline and spaces
'# => \n *array \(#', ' => array (',
- var_export($expression, TRUE)
+ $result
);
if ($ignore_numeric_keys) {
// References:
// http://www.freebsd.org/cgi/man.cgi?query=strings (Man-page of GNU strings)
// http://www.pcre.org/pcre.txt
-function strings($binary = '', $min_len = 4, $ignore_space = FALSE)
+// Note: mb_ereg_replace() is one of the mbstring extension's functions
+// and needs its encoding to be initialized before use.
+function strings($binary = '', $min_len = 4, $ignore_space = FALSE, $multibyte = FALSE)
{
+ // String only
+ $binary = (is_array($binary) || $binary === TRUE) ? '' : strval($binary);
+
+ $regex = $ignore_space ?
+ '[^[:graph:] \t\n]+' : // Remove "\0" etc, and readable spaces
+ '[^[:graph:][:space:]]+'; // Preserve readable spaces if possible
+
+ $binary = $multibyte ?
+ mb_ereg_replace($regex, "\n", $binary) :
+ preg_replace('/' . $regex . '/s', "\n", $binary);
+
if ($ignore_space) {
$binary = preg_replace(
array(
- '/(?:[^[:graph:] \t\n]|[\r])+/s',
'/[ \t]{2,}/',
'/^[ \t]/m',
'/[ \t]$/m',
),
array(
- "\n",
' ',
'',
''
),
$binary);
- } else {
- $binary = preg_replace('/(?:[^[:graph:][:space:]]|[\r])+/s', "\n", $binary);
}
if ($min_len > 1) {
+ // The last character seems "\n" or not
+ $br = (! empty($binary) && $binary[strlen($binary) - 1] == "\n") ? "\n" : '';
+
$min_len = min(1024, intval($min_len));
- $binary =
- implode("\n",
- preg_grep('/^.{' . $min_len . ',}/S',
- explode("\n", $binary)
- )
- );
+ $regex = '/^.{' . $min_len . ',}/S';
+ $binary = implode("\n", preg_grep($regex, explode("\n", $binary))) . $br;
}
return $binary;
// 3: Host
'\[[0-9a-f:.]+\]' . '|' . // IPv6([colon-hex and dot]): RFC2732
'(?:[0-9]{1,3}\.){3}[0-9]{1,3}' . '|' . // IPv4(dot-decimal): 001.22.3.44
- '[a-z0-9][a-z0-9.-]+[a-z0-9]' . // hostname(FQDN) : foo.example.org
+ '[a-z0-9_-][a-z0-9_.-]+[a-z0-9_-]' . // hostname(FQDN) : foo.example.org
')' .
'(?::([0-9]*))?' . // 4: Port
'((?:/+[^\s<>"\'\[\]/\#]+)*/+)?' . // 5: Directory path or path-info
// ---------------------
// Spam-uri pickup
-// Domain exposure callback (See spam_uri_pickup_preprocess())
+// Preprocess: Remove uninteresting parts of $binary before URI detection (too short lines, redundant blanks, plain words)
+function spam_uri_removing_hocus_pocus($binary = '', $method = array())
+{
+ $length = 4 ; // Minimum line length to keep: 'http'(1) and '://'(2) and 'fqdn'(1)
+ if (is_array($method)) {
+ // '<a'(2) or 'href='(5) or '>'(1) or '</a>'(4)
+ // '[uri'(4) or ']'(1) or '[/uri]'(6)
+ if (isset($method['area_anchor']) || isset($method['uri_anchor']) ||
+ isset($method['area_bbcode']) || isset($method['uri_bbcode']))
+ $length = 1; // Seems not effective, so keep every line (strings() filters by length only when > 1)
+ }
+
+ // Removing sequential spaces and too short lines
+ $binary = strings($binary, $length, TRUE, FALSE); // Multibyte NOT needed
+
+ // Replace runs of word characters, '.', ',', '(', ')' and blanks (nothing with '<>[]:') enclosed by blanks with one space
+ $binary = preg_replace('/[ \t][\w.,()\ \t]+[ \t]/', ' ', $binary);
+
+ return $binary;
+}
+
+// Preprocess: Domain exposure callback (See spam_uri_pickup_preprocess())
// http://victim.example.org/?foo+site:nasty.example.com+bar
// => http://nasty.example.com/?refer=victim.example.org
// NOTE: 'refer=' is not so good for (at this time).
// [OK] http://victim.example.org/nasty.example.org
// [OK] http://victim.example.org/go?http%3A%2F%2Fnasty.example.org
// [OK] http://victim.example.org/http://nasty.example.org
-function spam_uri_pickup_preprocess($string = '')
+function spam_uri_pickup_preprocess($string = '', $method = array())
{
if (! is_string($string)) return '';
- $string = rawurldecode($string);
+ $string = spam_uri_removing_hocus_pocus(rawurldecode($string), $method);
+ //var_dump(htmlspecialchars($string));
// Domain exposure (simple)
// http://victim.example.org/nasty.example.org/path#frag
$string
);
- // Domain exposure (See _preg_replace_callback_domain_exposure())
+ // Domain exposure (site:) See _preg_replace_callback_domain_exposure()
$string = preg_replace_callback(
array(
- '#(http)://' .
+ '#(h?ttp)://' . // 1:Scheme
+ // 2:Host
'(' .
+ '(?:[a-z0-9_.-]+\.)?[a-z0-9_-]+\.[a-z0-9_-]+' .
// Something Google: http://www.google.com/supported_domains
- '(?:[a-z0-9.]+\.)?google\.[a-z]{2,3}(?:\.[a-z]{2})?' .
- '|' .
- // AltaVista
- '(?:[a-z0-9.]+\.)?altavista.com' .
-
+ // AltaVista: http://es.altavista.com/web/results?q=site%3Anasty.example.org+foobar
+ // Live Search: search.live.com
+ // MySpace: http://sads.myspace.com/Modules/Search/Pages/Search.aspx?_snip_&searchString=site:nasty.example.org
+ // (also searchresults.myspace.com)
+ // alltheweb.com
+ // search.bbc.co.uk
+ // search.orange.co.uk
+ // ...
')' .
'/' .
- '([a-z0-9?=&.%_/\'\\\+-]+)' . // path/?query=foo+bar+
- '\bsite:([a-z0-9.%_-]+\.[a-z0-9.%_-]+)' . // site:nasty.example.com
- //'()' . // Preserve or remove?
+ '([a-z0-9?=&.%_/\'\\\+-]+)' . // 3:path/?query=foo+bar+
+ '\bsite:([a-z0-9.%_-]+\.[a-z0-9.%_-]+)' . // 4:site:nasty.example.com
+ '()' . // 5:Preserve or remove?
'#i',
),
'_preg_replace_callback_domain_exposure',
$method = check_uri_spam_method();
}
- $string = spam_uri_pickup_preprocess($string);
+ $string = spam_uri_pickup_preprocess($string, $method);
$array = uri_pickup($string);
}
}
-function get_blocklist($list = '', $dispose = FALSE)
+function get_blocklist($list = '')
{
- static $f_dispose = FALSE, $regexes;
+ static $regexes;
- if ($dispose) {
- $f_dispose = TRUE;
- $regexes = NULL; // Unset
+ if ($list === NULL) {
+ $regexes = NULL; // Unset
return array();
}
if (! isset($regexes)) {
- if ($f_dispose) die(__FUNCTION__ . '(): Memory already disposed');
-
$regexes = array();
if (file_exists(SPAM_INI_FILE)) {
$blocklist = array();
foreach(array_keys($method) as $key) {
if (! isset($sum[$key])) $sum[$key] = 0;
}
+ if (! isset($sum['quantity'])) $sum['quantity'] = 0;
if (is_array($target)) {
foreach($target as $str) {
}
// An array() to an array leaf
-function array_leaf($array = array('A', 'B', 'C.D'), $stem = FALSE, $edge = array())
+function array_leaf($array = array('A', 'B', 'C.D'), $stem = FALSE, $edge = TRUE)
{
+ if (! is_array($array)) return $array;
+
$leaf = array();
$tmp = & $leaf;
foreach($array as $arg) {
! is_array($progress['hosts']) ||
empty($progress['hosts'])) return '';
- $result = '';
- if (FALSE) {
- // Sort by domain
- $tmp = array();
- foreach($progress['hosts'] as $value) {
- $tmp[delimiter_reverse($value)] = $value;
- }
- ksort($tmp, SORT_STRING);
- $result = count($tmp) . ' (' .implode(', ', $tmp) . ')';
- } else {
- $tmp = array();
- foreach($progress['hosts'] as $value) {
- $tmp = array_merge_recursive(
- $tmp,
- array_leaf(explode('.', delimiter_reverse($value) . '.'), TRUE, $value)
- );
+ // Generate a responsible $trie
+ $trie = array();
+ foreach($progress['hosts'] as $value) {
+ // 'A.foo.bar.example.com'
+ $resp = whois_responsibility($value); // 'example.com'
+ if (empty($resp)) {
+ // One or more test, or do nothing here
+ $resp = strval($value);
+ $rest = '';
+ } else {
+ $rest = rtrim(substr($value, 0, - strlen($resp)), '.'); // 'A.foo.bar'
}
- ksort($tmp, SORT_STRING);
-
- separate_and_joinkey_leaves($tmp, '.', TRUE, TRUE);
- separate_and_joinkey_leaves($tmp, '.', TRUE, FALSE);
- separate_and_joinkey_leaves($tmp, '.', TRUE, FALSE);
- //separate_and_joinkey_leaves($tmp, '.', TRUE, FALSE);
+ $trie = array_merge_recursive($trie, array($resp => array($rest => NULL)));
+ }
- foreach($tmp as $key => $value) {
- if (is_array($value)) {
- ksort($tmp[$key]);
- // $tmp[$key] = implode(', ', array_flat_leaves($value));
+ // Format: var_export_shrink() -like output
+ $result = array();
+ ksort_by_domain($trie);
+ foreach(array_keys($trie) as $key) {
+ ksort_by_domain($trie[$key]);
+ if (count($trie[$key]) == 1 && key($trie[$key]) == '') {
+ // Just one 'responsibility.example.com'
+ $result[] = ' \'' . $key . '\',';
+ } else {
+ // One subdomain-or-host, or several ones
+ $subs = array();
+ foreach(array_keys($trie[$key]) as $sub) {
+ if ($sub == '') {
+ $subs[] = $key;
+ } else {
+ $subs[] = $sub . '.' . $key;
+ }
}
+ $result[] = ' \'' . $key . '\' => \'' . implode(', ', $subs) . '\',';
}
-
- $result = var_export_shrink($tmp, TRUE, TRUE);
+ unset($trie[$key]);
}
+ return
+ 'array (' . "\n" .
+ implode("\n", $result) . "\n" .
+ ')';
+}
- return $result;
+// ksort() by domain: rebuild $array (passed by reference) so its keys follow the SORT_STRING order of delimiter_reverse($key); values are carried over by reference. NOTE(review): delimiter_reverse() is defined elsewhere, presumably 'a.b' => 'b.a' -- confirm
+function ksort_by_domain(& $array)
+{
+ $sort = array();
+ foreach(array_keys($array) as $key) {
+ $sort[delimiter_reverse($key)] = $key;
+ }
+ ksort($sort, SORT_STRING);
+ $result = array();
+ foreach($sort as $key) {
+ $result[$key] = & $array[$key];
+ }
+ $array = $result;
}
-function separate_and_joinkey_leaves(
- & $array, // array('A' => array('B' => 'C.D')),
- $delim = '.', $reversejoin = FALSE, $allowmulti = FALSE)
+// Check responsibility-root of the FQDN: walk its labels from the TLD down through the $domain tree, and return the matched suffix plus (unless $parent) the first unmatched label; returns '' for non-strings, and passing NULL only frees the cached tree
+// 'foo.bar.example.com' => 'example.com' (.com has the last whois for it)
+// 'foo.bar.example.au' => 'example.au' (.au has the last whois for it)
+// 'foo.bar.example.edu.au' => 'example.edu.au' (.edu.au has the last whois for it)
+// 'foo.bar.example.act.edu.au' => 'example.act.edu.au' (.act.edu.au has the last whois for it)
+function whois_responsibility($fqdn = 'foo.bar.example.com', $parent = FALSE, $implicit = TRUE)
{
- if (! is_array($array)) return $array;
+ static $domain;
- $result = array();
- foreach(array_keys($array) as $key) {
- if (! is_array($array[$key]) || (! $allowmulti && count($array[$key]) > 1)) {
- $result[$key] = & $array[$key]; // Do nothing
+ if ($fqdn === NULL) {
+ $domain = NULL; // Unset
+ return '';
+ }
+ if (! is_string($fqdn)) return '';
+
+ if (is_ip($fqdn)) return $fqdn;
+
+ if (! isset($domain)) {
+ $domain = array();
+ if (file_exists(DOMAIN_INI_FILE)) {
+ include(DOMAIN_INI_FILE); // Set (the included file is expected to define $domain -- TODO confirm)
+ }
+ }
+
+ $result = array();
+ $dcursor = & $domain;
+ $array = array_reverse(explode('.', $fqdn));
+ $i = 0;
+ while(TRUE) {
+ if (! isset($array[$i])) break;
+ $acursor = $array[$i];
+ if (is_array($dcursor) && isset($dcursor[$acursor])) {
+ $result[] = & $array[$i];
+ $dcursor = & $dcursor[$acursor];
} else {
- foreach(array_keys($array[$key]) as $_key) {
- $joinkey = $reversejoin ?
- $_key . $delim . $key :
- $key . $delim . $_key;
- $result[$joinkey] = & $array[$key][$_key];
+ if (! $parent && isset($acursor)) {
+ $result[] = & $array[$i]; // Whois servers must know this subdomain
}
+ break;
}
+ ++$i;
}
- $array = & $result;
+ // Implicit responsibility: Top-Level-Domains must not be yours, so when only one label was collected, add the next one too
+ // 'bar.foo.something' => 'foo.something'
+ if ($implicit && count($result) == 1 && count($array) > 1) {
+ $result[] = & $array[1];
+ }
- return $result; // array('A.B' => 'C.D')
+ return $result ? implode('.', array_reverse($result)) : '';
}
// ---------------------
// Exit
+// Freeing memories: pass NULL to get_blocklist() and whois_responsibility() so that each one unsets its static cache
+function spam_dispose()
+{
+ get_blocklist(NULL);
+ whois_responsibility(NULL);
+}
+
// Common bahavior for blocking
// NOTE: Call this function from various blocking feature, to disgueise the reason 'why blocked'
function spam_exit($mode = '', $data = array())
{
- // Dispose
- get_blocklist(NULL, TRUE);
-
$exit = TRUE;
+
switch ($mode) {
case '':
echo("\n");
{
$progress = check_uri_spam($target, $method);
- if (! empty($progress['is_spam'])) {
- // Mail to administrator(s)
- pkwk_spamnotify($action, $page, $target, $progress, $method);
+ if (empty($progress['is_spam'])) {
+ spam_dispose();
+ } else {
+
+// TODO: detect encoding from $target for mbstring functions
+// $tmp = array();
+// foreach(array_keys($target) as $key) {
+// $tmp[strings($key, 0, FALSE, TRUE)] = strings($target[$key], 0, FALSE, TRUE); // Removing "\0" etc
+// }
+// $target = & $tmp;
- // Exit
+ pkwk_spamnotify($action, $page, $target, $progress, $method);
spam_exit($exitmode, $progress);
}
}