(Separate good/bad hosts from $hosts)
+
+ if ((! $asap || ! $is_spam) && isset($method['badhost'])) {
+ $list = get_blocklist('pre');
+ $blocked = blocklist_distiller($hosts, array_keys($list), $asap);
+ foreach($list as $key => $type){
+ if (! $type) unset($blocked[$key]); // Ignore goodhost etc
+ }
+ unset($list);
+ if (! empty($blocked)) $is_spam['badhost'] = TRUE;
}
- if (isset($uri['host']) && $uri['host'] !== '') {
- $tmp[] = & $uri['host'];
+
+ // Return if ...
+ if ($asap && $is_spam) return $progress;
+
+ // Remove blocked from $pickups
+ foreach(array_keys($pickups) as $key) {
+ if (! isset($hosts[$key])) {
+ unset($pickups[$key]);
+ }
}
- if (isset($uri['port']) && $uri['port'] !== '') {
- $tmp[] = ':';
- $tmp[] = & $uri['port'];
+
+ // ----------------------------------------
+ // URI: Check quantity
+
+ $sum['quantity'] += count($pickups);
+ // URI quantity
+ if ((! $asap || ! $is_spam) && isset($method['quantity']) &&
+ $sum['quantity'] > $method['quantity']) {
+ $is_spam['quantity'] = TRUE;
}
- if (isset($uri['path']) && $uri['path'] !== '') {
- $tmp[] = & $uri['path'];
+
+ // ----------------------------------------
+ // URI: used inside HTML anchor tag pair
+
+ if ((! $asap || ! $is_spam) && isset($method['uri_anchor'])) {
+ $key = 'uri_anchor';
+ foreach($pickups as $pickup) {
+ if (isset($pickup['area'][$key])) {
+ $sum[$key] += $pickup['area'][$key];
+ if(isset($method[$key]) &&
+ $sum[$key] > $method[$key]) {
+ $is_spam[$key] = TRUE;
+ if ($asap && $is_spam) break;
+ }
+ if ($asap && $is_spam) break;
+ }
+ }
}
- if (isset($uri['file']) && $uri['file'] !== '') {
- $tmp[] = & $uri['file'];
+
+ // ----------------------------------------
+ // URI: used inside 'BBCode' pair
+
+ if ((! $asap || ! $is_spam) && isset($method['uri_bbcode'])) {
+ $key = 'uri_bbcode';
+ foreach($pickups as $pickup) {
+ if (isset($pickup['area'][$key])) {
+ $sum[$key] += $pickup['area'][$key];
+ if(isset($method[$key]) &&
+ $sum[$key] > $method[$key]) {
+ $is_spam[$key] = TRUE;
+ if ($asap && $is_spam) break;
+ }
+ if ($asap && $is_spam) break;
+ }
+ }
}
- if (isset($uri['fragment']) && $uri['fragment'] !== '') {
- $tmp[] = '#';
- $tmp[] = & $uri['fragment'];
+
+ // ----------------------------------------
+ // URI: Uniqueness (and removing non-uniques)
+
+ if ((! $asap || ! $is_spam) && isset($method['non_uniquri'])) {
+
+ $uris = array();
+ foreach (array_keys($pickups) as $key) {
+ $uris[$key] = uri_pickup_implode($pickups[$key]);
+ }
+ $count = count($uris);
+ $uris = array_unique($uris);
+ $sum['non_uniquri'] += $count - count($uris);
+ if ($sum['non_uniquri'] > $method['non_uniquri']) {
+ $is_spam['non_uniquri'] = TRUE;
+ }
+ if (! $asap || ! $is_spam) {
+ foreach (array_diff(array_keys($pickups),
+ array_keys($uris)) as $remove) {
+ unset($pickups[$remove]);
+ }
+ }
+ unset($uris);
}
- return implode('', $tmp);
-}
+ // Return if ...
+ if ($asap && $is_spam) return $progress;
-// ---------------------
-// Part One : Checker
+ // ----------------------------------------
+ // Host: Uniqueness (uniq / non-uniq)
-function generate_glob_regex($string = '', $divider = '/')
-{
- static $from = array(
- 0 => '*',
- 1 => '?',
- 2 => '\[',
- 3 => '\]',
- 4 => '[',
- 5 => ']',
- );
- static $mid = array(
- 0 => '_AST_',
- 1 => '_QUE_',
- 2 => '_eRBR_',
- 3 => '_eLBR_',
- 4 => '_RBR_',
- 5 => '_LBR_',
- );
- static $to = array(
- 0 => '.*',
- 1 => '.',
- 2 => '\[',
- 3 => '\]',
- 4 => '[',
- 5 => ']',
- );
+ $hosts = array_unique($hosts);
- $string = str_replace($from, $mid, $string); // Hide
- $string = preg_quote($string, $divider);
- $string = str_replace($mid, $to, $string); // Unhide
+ if (isset($sum['uniqhost'])) $sum['uniqhost'] += count($hosts);
+ if ((! $asap || ! $is_spam) && isset($method['non_uniqhost'])) {
+ $sum['non_uniqhost'] = $sum['quantity'] - $sum['uniqhost'];
+ if ($sum['non_uniqhost'] > $method['non_uniqhost']) {
+ $is_spam['non_uniqhost'] = TRUE;
+ }
+ }
- return $string;
-}
+ // Return if ...
+ if ($asap && $is_spam) return $progress;
-// TODO: Ignore list
-// TODO: require_or_include_once(another file)
-function is_badhost($hosts = '', $asap = TRUE)
-{
- static $blocklist_regex;
-
- if (! isset($blocklist_regex)) {
- $blocklist_regex = array();
- $blocklist = array(
- // Deny all uri
- //'*',
-
- // IP address or ...
- //'10.20.*.*', // 10.20.example.com also matches
- //'\[1\]',
-
- // Too much malicious sub-domains
- '*.blogspot.com',
-
- // 2006-11 dev
- 'wwwtahoo.com',
-
- // 2006-11 dev
- '*.infogami.com',
-
- // 2006/11/19 17:50 dev
- '*.google0site.org',
- '*.bigpricesearch.org',
- '*.osfind.org',
- '*.bablomira.biz',
+ // ----------------------------------------
+ // URI: Bad host (Separate good/bad hosts from $hosts)
+
+ if ((! $asap || ! $is_spam) && isset($method['badhost'])) {
+ $list = get_blocklist('list');
+ $blocked = array_merge_leaves(
+ $blocked,
+ blocklist_distiller($hosts, array_keys($list), $asap),
+ FALSE
);
- foreach ($blocklist as $part) {
- $blocklist_regex[] = '#^' . generate_glob_regex($part, '#') . '$#i';
+ foreach($list as $key=>$type){
+ if (! $type) unset($blocked[$key]); // Ignore goodhost etc
}
+ unset($list);
+ if (! empty($blocked)) $is_spam['badhost'] = TRUE;
}
- $result = 0;
- if (! is_array($hosts)) $hosts = array($hosts);
- foreach($hosts as $host) {
- if (! is_string($host)) $host = '';
- foreach ($blocklist_regex as $regex) {
- if (preg_match($regex, $host)) {
- ++$result;
- if ($asap) {
- return $result;
- } else {
- break; // Check next host
+ // Return if ...
+ //if ($asap && $is_spam) return $progress;
+
+ // ----------------------------------------
+ // End
+
+ return $progress;
+}
+
+// ---------------------
+// Reporting
+
+// Summarize $progress as a single comma-separated line.
+// $blockedonly = TRUE : the keys of $progress['is_spam'] (names of the checks that fired)
+// $blockedonly = FALSE : 'name(value)' for every truthy $progress['sum'] entry
+// whose name is also set in $progress['method']
+// Returns '' when there is nothing to report.
+function summarize_spam_progress($progress = array(), $blockedonly = FALSE)
+{
+ if ($blockedonly) {
+ // The default $progress (an empty array) has no 'is_spam' entry:
+ // report nothing then, instead of handing an undefined value to array_keys()
+ $tmp = (isset($progress['is_spam']) && is_array($progress['is_spam']))
+ ? array_keys($progress['is_spam']) : array();
+ } else {
+ $tmp = array();
+ // Taken by reference, so a missing 'method' entry raises no notice here
+ $method = & $progress['method'];
+ if (isset($progress['sum'])) {
+ foreach ($progress['sum'] as $key => $value) {
+ // Only metrics that were actually requested, and that counted something
+ if (isset($method[$key]) && $value) {
+ $tmp[] = $key . '(' . $value . ')';
}
}
}
}
- return $result;
+ return implode(', ', $tmp);
}
-// TODO return TRUE or FALSE!
-// Simple/fast spam check
-function check_uri_spam($target = '', $method = array(), $asap = TRUE)
+// Build a printable summary of $progress['blocked'].
+// Returns '' when $progress['blocked'] is missing or empty; otherwise the
+// string produced by var_export_shrink() for the condensed array built below.
+// $progress['blocked'] is walked as: list name => group name => value.
+// NOTE(review): array_flat_leaves() and var_export_shrink() are defined
+// elsewhere; their exact output format is not visible from here.
+function summarize_detail_badhost($progress = array())
{
- $is_spam = FALSE;
- $progress = array(
- 'quantity' => 0,
- 'area' => 0,
- 'non_uniq' => 0,
- 'uniqhost' => 0,
- 'badhost' => 0,
- );
-
- if (! is_array($method) || empty($method)) {
- // Default
- $method = array(
- 'quantity' => 8, // Allow N URIs
- 'area' => TRUE,
- 'non_uniq' => 3, // Allow N times dupe
- 'badhost' => TRUE,
- );
+ if (! isset($progress['blocked']) || empty($progress['blocked'])) return '';
+
+ // Flat per group: join every leaf of a group into one ', '-separated string
+ $blocked = array();
+ foreach($progress['blocked'] as $list => $lvalue) {
+ foreach($lvalue as $group => $gvalue) {
+ $flat = implode(', ', array_flat_leaves($gvalue));
+ if ($flat === $group) {
+ // The joined text repeats the group name: append it without a key
+ $blocked[$list][] = $flat;
+ } else {
+ $blocked[$list][$group] = $flat;
+ }
+ }
}
- if (is_array($target)) {
- foreach($target as $str) {
- // Recurse
- list($is_spam, $_progress) = check_uri_spam($str, $method);
- $progress['quantity'] += $_progress['quantity'];
- $progress['non_uniq'] += $_progress['non_uniq'];
- if ($asap || $is_spam) break;
+ // Shrink per list: a list holding exactly one numerically-keyed entry
+ // is replaced by that entry itself
+ // From: 'A-1' => array('ie.to')
+ // To: 'A-1' => 'ie.to'
+ foreach($blocked as $list => $lvalue) {
+ if (is_array($lvalue) &&
+ count($lvalue) == 1 &&
+ is_numeric(key($lvalue))) {
+ $blocked[$list] = current($lvalue);
}
- } else {
- $pickups = spam_uri_pickup($target);
- $progress['quantity'] += count($pickups);
-
- if (! empty($pickups)) {
+ }
- // URI quantity
- if ((! $is_spam || ! $asap) && isset($method['quantity']) &&
- $progress['quantity'] > $method['quantity']) {
- $is_spam = TRUE;
- }
- //var_dump($method['quantity'], $is_spam);
-
- // Using invalid area
- if ((! $is_spam || ! $asap) && isset($method['area'])) {
- foreach($pickups as $pickup) {
- if ($pickup['area'] < 0) {
- ++$progress['area'];
- $is_spam = TRUE;
- if ($asap) break;
- }
- }
- }
- //var_dump($method['area'], $is_spam);
+ return var_export_shrink($blocked, TRUE, TRUE);
+}
- // URI uniqueness (and removing non-uniques)
- if ((! $is_spam || ! $asap) && isset($method['non_uniq'])) {
- $uris = array();
- foreach ($pickups as $key => $pickup) {
- $uris[$key] = uri_array_implode($pickup);
- }
- $count = count($uris);
- $uris = array_unique($uris);
- $progress['non_uniq'] += $count - count($uris);
- if ($progress['non_uniq'] > $method['non_uniq']) {
- $is_spam = TRUE;
- }
- if (! $asap || ! $is_spam) {
- foreach (array_diff(array_keys($pickups),
- array_keys($uris)) as $remove) {
- unset($pickups[$remove]);
- }
- }
- unset($uris);
- //var_dump($uris, $pickups);
- }
- //var_dump($method['non_uniq'], $is_spam);
+// Build a printable, var_export()-like listing of $progress['hosts'],
+// grouped by the domain whois_responsibility() returns for each host.
+// Returns '' when $progress['hosts'] is missing, not an array, or empty.
+// Output lines look like:
+// 'example.com', (only the responsible domain itself was seen)
+// 'example.com' => 'A.foo.bar. , B. ', (one or more sub-labels were seen)
+// NOTE(review): whois_responsibility(), array_merge_leaves() and
+// ksort_by_domain() are defined elsewhere; their exact contracts are assumed.
+function summarize_detail_newtral($progress = array())
+{
+ if (! isset($progress['hosts']) ||
+ ! is_array($progress['hosts']) ||
+ empty($progress['hosts'])) return '';
+
+ // Generate a responsible $trie
+ $trie = array();
+ foreach($progress['hosts'] as $value) {
+ // 'A.foo.bar.example.com'
+ $resp = whois_responsibility($value); // 'example.com'
+ if (empty($resp)) {
+ // One or more test, or do nothing here
+ $resp = strval($value);
+ $rest = '';
+ } else {
+ $rest = rtrim(substr($value, 0, - strlen($resp)), '.'); // 'A.foo.bar'
+ }
+ $trie = array_merge_leaves($trie, array($resp => array($rest => NULL)), FALSE);
+ }
- // Bad host
- if ((! $is_spam || ! $asap) && isset($method['badhost'])) {
- $hosts = array();
- foreach ($pickups as $pickup) {
- $hosts[] = & $pickup['host'];
+ // Format: var_export_shrink() -like output
+ $result = array();
+ ksort_by_domain($trie);
+ foreach(array_keys($trie) as $key) {
+ ksort_by_domain($trie[$key]);
+ // Strict comparison: a sub-label '0' is stored as the integer key 0,
+ // and (0 == '') is TRUE before PHP 8, which would hide that host
+ if (count($trie[$key]) == 1 && key($trie[$key]) === '') {
+ // Just one 'responsibility.example.com'
+ $result[] = ' \'' . $key . '\',';
+ } else {
+ // One subdomain-or-host, or several ones
+ $subs = array();
+ foreach(array_keys($trie[$key]) as $sub) {
+ // Strict for the same reason as above
+ if ($sub === '') {
+ $subs[] = $key; // 'example.com'
+ } else {
+ $subs[] = $sub . '. '; // 'A.foo.bar. '
}
- $hosts = array_unique($hosts);
- $progress['uniqhost'] += count($hosts);
- $count = is_badhost($hosts, $asap);
- $progress['badhost'] += $count;
- if ($count !== 0) $is_spam = TRUE;
}
- //var_dump($method['badhost'], $is_spam);
+ $result[] = ' \'' . $key . '\' => \'' . implode(', ', $subs) . '\',';
}
+ unset($trie[$key]);
}
-
- return array($is_spam, $progress);
+ return
+ 'array (' . "\n" .
+ implode("\n", $result) . "\n" .
+ ')';
}
+
// ---------------------
+// Exit
-// Check User-Agent (not testing yet)
-function is_invalid_useragent($ua_name = '' /*, $ua_vars = ''*/ )
+// Free memory held by the spam-checking helpers.
+// Calls get_blocklist() and whois_responsibility() with NULL.
+// NOTE(review): presumably NULL tells each helper to drop its cached data;
+// confirm against their definitions, which are not visible here.
+function spam_dispose()
{
- return $ua_name === '';
+ get_blocklist(NULL);
+ whois_responsibility(NULL);
}
+// Common behavior for blocking: print a response chosen by $mode, then stop the script.
+// NOTE: Call this function from the various blocking features, to disguise the reason 'why blocked'
+// $mode = '' : print a single newline only
+// $mode = 'dump' : print var_export($data) passed through htmlsc()
+// any other $mode : print nothing
+// The script is terminated in every case ($exit is always TRUE).
+// NOTE(review): in 'dump' mode the first echo prints an empty string and the
+// last one a literal containing a raw line break; this looks like markup
+// (possibly a <pre>...</pre> pair) lost in extraction -- confirm against the
+// upstream file.
+function spam_exit($mode = '', $data = array())
+{
+ $exit = TRUE;
+
+ switch ($mode) {
+ case '':
+ echo("\n");
+ break;
+ case 'dump':
+ echo('' . "\n");
+ echo htmlsc(var_export($data, TRUE));
+ echo('
' . "\n");
+ break;
+ };
+
+ if ($exit) exit; // Force exit
+}
+
+
// ---------------------
+// Simple filtering
-// TODO: Separate check-part(s) and mail part
-// TODO: Mail to administrator with more measurement data?
+// TODO: Record them
// Simple/fast spam filter ($target: 'a string' or an array())
-function pkwk_spamfilter($action, $page, $target = array('title' => ''), $method = array())
+// Run check_uri_spam() on $target and act on its result:
+// - $progress['is_spam'] empty : call spam_dispose() and return normally
+// - otherwise : call pkwk_spamnotify(), then spam_exit($exitmode, $progress),
+// which terminates the script
+// $action and $page are only passed through to pkwk_spamnotify().
+function pkwk_spamfilter($action, $page, $target = array('title' => ''), $method = array(), $exitmode = '')
{
- $is_spam = FALSE;
+ $progress = check_uri_spam($target, $method);
- //$is_spam = is_invalid_useragent('NOTYET');
- if ($is_spam) {
- $action .= ' (Invalid User-Agent)';
+ if (empty($progress['is_spam'])) {
+ spam_dispose();
} else {
- list($is_spam) = check_uri_spam($target, $method);
- }
- if ($is_spam) {
- // Mail to administrator(s)
- global $notify, $notify_subject;
- if ($notify) {
- $footer['ACTION'] = $action;
- $footer['PAGE'] = '[blocked] ' . $page;
- $footer['URI'] = get_script_uri() . '?' . rawurlencode($page);
- $footer['USER_AGENT'] = TRUE;
- $footer['REMOTE_ADDR'] = TRUE;
- pkwk_mail_notify($notify_subject, var_export($target, TRUE), $footer);
- unset($footer);
- }
- }
+// TODO: detect encoding from $target for mbstring functions
+// $tmp = array();
+// foreach(array_keys($target) as $key) {
+// $tmp[strings($key, 0, FALSE, TRUE)] = strings($target[$key], 0, FALSE, TRUE); // Removing "\0" etc
+// }
+// $target = & $tmp;
- if ($is_spam) spam_exit();
+ // Notify first: spam_exit() does not return
+ pkwk_spamnotify($action, $page, $target, $progress, $method);
+ spam_exit($exitmode, $progress);
+ }
}
// ---------------------
+// PukiWiki original
-// Common bahavior for blocking
-// NOTE: Call this function from various blocking feature, to disgueise the reason 'why blocked'
-function spam_exit()
+// Mail to administrator(s)
+// Does nothing unless the global $notify is truthy. Otherwise builds a
+// $summary array from $progress and passes it, with a var_export() dump of
+// $target as the second argument, to pkwk_mail_notify().
+// When $method['asap'] is set, the METRICS and DETAIL_NEUTRAL_HOST entries are left out.
+function pkwk_spamnotify($action, $page, $target = array('title' => ''), $progress = array(), $method = array())
{
- die("\n");
+ global $notify, $notify_subject;
+
+ if (! $notify) return;
+
+ // Only the presence of the 'asap' key matters, not its value
+ $asap = isset($method['asap']);
+
+ $summary['ACTION'] = 'Blocked by: ' . summarize_spam_progress($progress, TRUE);
+ if (! $asap) {
+ $summary['METRICS'] = summarize_spam_progress($progress);
+ }
+
+ $tmp = summarize_detail_badhost($progress);
+ if ($tmp != '') $summary['DETAIL_BADHOST'] = $tmp;
+
+ // NOTE(review): this summary is computed even when $asap is set, although
+ // it is then discarded by the condition below
+ $tmp = summarize_detail_newtral($progress);
+ if (! $asap && $tmp != '') $summary['DETAIL_NEUTRAL_HOST'] = $tmp;
+
+ $summary['COMMENT'] = $action;
+ // The page name is shown only if is_pagename() accepts it
+ $summary['PAGE'] = '[blocked] ' . (is_pagename($page) ? $page : '');
+ $summary['URI'] = get_script_uri() . '?' . rawurlencode($page);
+ // NOTE(review): TRUE presumably asks pkwk_mail_notify() to fill in the real value -- confirm
+ $summary['USER_AGENT'] = TRUE;
+ $summary['REMOTE_ADDR'] = TRUE;
+ pkwk_mail_notify($notify_subject, var_export($target, TRUE), $summary, TRUE);
}
?>