<?php
-// $Id: spam.php,v 1.71 2006/12/16 02:01:23 henoheno Exp $
+// $Id: spam.php,v 1.78 2006/12/17 03:40:01 henoheno Exp $
// Copyright (C) 2006 PukiWiki Developers Team
// License: GPL v2 or (at your option) any later version
// Functions for Concept-work of spam-uri metrics
+// (PHP 4 >= 4.3.0): preg_match_all()
if (! defined('SPAM_INI_FILE')) define('SPAM_INI_FILE', 'spam.ini.php');
'#i',
$string, $array, PREG_SET_ORDER | PREG_OFFSET_CAPTURE
);
- //var_dump(recursive_map('htmlspecialchars', $array));
// Shrink $array
static $parts = array(
function area_pickup($string = '', $method = array())
{
$area = array();
+ if (empty($method)) return $area;
- // Anchor tag pair by preg_match_all()
+ // Anchor tag pair by preg_match and preg_match_all()
// [OK] <a href></a>
// [OK] <a href= >Good site!</a>
// [OK] <a href= "#" >test</a>
// [OK] <a href="http://nasty.example.com">visit http://nasty.example.com/</a>
// [OK] <a href=\'http://nasty.example.com/\' >discount foobar</a>
// [NG] <a href="http://ng.example.com">visit http://ng.example.com _not_ended_
+ $regex = '#<a\b[^>]*\bhref\b[^>]*>.*?</a\b[^>]*(>)#i';
if (isset($method['area_anchor'])) {
$areas = array();
- preg_match_all('#<a\b[^>]*\bhref\b[^>]*>.*?</a\b[^>]*(>)#i',
- $string, $areas, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);
+ $count = isset($method['asap']) ?
+ preg_match($regex, $string) :
+ preg_match_all($regex, $string, $areas);
+ if (! empty($count)) $area['area_anchor'] = $count;
+ }
+ if (isset($method['uri_anchor'])) {
+ $areas = array();
+ preg_match_all($regex, $string, $areas, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);
foreach(array_keys($areas) as $_area) {
$areas[$_area] = array(
$areas[$_area][0][1], // Area start (<a href>)
$areas[$_area][1][1], // Area end (</a>)
);
}
- if (! empty($areas)) $area['area_anchor'] = $areas;
+ if (! empty($areas)) $area['uri_anchor'] = $areas;
}
- // phpBB's "BBCode" pair by preg_match_all()
+ // phpBB's "BBCode" pair by preg_match and preg_match_all()
// [OK] [url][/url]
// [OK] [url]http://nasty.example.com/[/url]
// [OK] [link]http://nasty.example.com/[/link]
// [OK] [url=http://nasty.example.com]visit http://nasty.example.com/[/url]
// [OK] [link http://nasty.example.com/]buy something[/link]
+ $regex = '#\[(url|link)\b[^\]]*\].*?\[/\1\b[^\]]*(\])#i';
if (isset($method['area_bbcode'])) {
$areas = array();
- preg_match_all('#\[(url|link)\b[^\]]*\].*?\[/\1\b[^\]]*(\])#i',
- $string, $areas, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);
+ $count = isset($method['asap']) ?
+ preg_match($regex, $string) :
+ preg_match_all($regex, $string, $areas, PREG_SET_ORDER);
+ if (! empty($count)) $area['area_bbcode'] = $count;
+ }
+ if (isset($method['uri_bbcode'])) {
+ $areas = array();
+ preg_match_all($regex, $string, $areas, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);
foreach(array_keys($areas) as $_area) {
$areas[$_area] = array(
$areas[$_area][0][1], // Area start ([url])
$areas[$_area][2][1], // Area end ([/url])
);
}
- if (! empty($areas)) $area['area_bbcode'] = $areas;
+ if (! empty($areas)) $area['uri_bbcode'] = $areas;
}
// Various Wiki syntax
$array = uri_pickup($string);
- // Area elevation for '(especially external)link' intension
+ // Area elevation of URIs, for '(especially external)link' intention
if (! empty($array)) {
- $areas = area_pickup($string, $method);
+ $_method = array();
+ if (isset($method['uri_anchor'])) $_method['uri_anchor'] = & $method['uri_anchor'];
+ if (isset($method['uri_bbcode'])) $_method['uri_bbcode'] = & $method['uri_bbcode'];
+ $areas = area_pickup($string, $_method, TRUE);
if (! empty($areas)) {
$area_shadow = array();
- foreach(array_keys($array) as $key){
+ foreach (array_keys($array) as $key) {
$area_shadow[$key] = & $array[$key]['area'];
- $area_shadow[$key]['area_anchor'] = 0;
- $area_shadow[$key]['area_bbcode'] = 0;
- }
- if (isset($areas['area_anchor'])) {
- area_measure($areas['area_anchor'], $area_shadow, 1, 'area_anchor');
+ foreach (array_keys($_method) as $_key) {
+ $area_shadow[$key][$_key] = 0;
+ }
}
- if (isset($areas['area_bbcode'])) {
- area_measure($areas['area_bbcode'], $area_shadow, 1, 'area_bbcode');
+ foreach (array_keys($_method) as $_key) {
+ if (isset($areas[$_key])) {
+ area_measure($areas[$_key], $area_shadow, 1, $_key);
+ }
}
}
}
//'10.20.*.*', // 10.20.example.com also matches
//'*.blogspot.com', // Blog services subdomains
//array('blogspot.com', '*.blogspot.com')
+
+ // Viral/Buzz marketers' site, trying to turn people
+ // into commercial Wiki spammers
+ // http://pukiwiki.sourceforge.jp/image/2006-12-16_wikiviral_pressblog.gif
+ array('pressblog.jp', '*.pressblog.jp'),
);
foreach ($blocklist['badhost'] as $part) {
- if (is_array($part)) $part = implode(', ', $part);
- $regex['badhost'][$part] = '/^' . generate_glob_regex($part) . '$/i';
+ $_part = is_array($part) ? implode(', ', $part) : $part;
+ $regex['badhost'][$_part] = '/^' . generate_glob_regex($part) . '$/i';
}
}
$blocklist = array();
require(SPAM_INI_FILE);
foreach ($blocklist['badhost'] as $part) {
- if (is_array($part)) $part = implode(', ', $part);
- $regex['badhost'][$part] = '/^' . generate_glob_regex($part) . '$/i';
+ $_part = is_array($part) ? implode(', ', $part) : $part;
+ $regex['badhost'][$_part] = '/^' . generate_glob_regex($part) . '$/i';
}
}
}
// Thresholds
'quantity' => 8 * $times, // Allow N URIs
'non_uniq' => 3 * $times, // Allow N duped (and normalized) URIs
+
// Areas
- 'area_anchor' => $t_area, // Inside <a href> HTML tag
- 'area_bbcode' => $t_area, // Inside [url] or [link] BBCode
+ 'area_anchor' => $t_area, // Using <a href> HTML tag
+ 'area_bbcode' => $t_area, // Using [url] or [link] BBCode
+ //'uri_anchor' => $t_area, // URI inside <a href> HTML tag
+ //'uri_bbcode' => $t_area, // URI inside [url] or [link] BBCode
);
if ($rule) {
$bool = array(
// Rules
- 'asap' => FALSE, // Quit As Soon As Possible
+ //'asap' => TRUE, // Quit or return As Soon As Possible
'uniqhost' => TRUE, // Show uniq host (at block notification mail)
'badhost' => TRUE, // Check badhost
);
'badhost' => 0,
'area_anchor' => 0,
'area_bbcode' => 0,
+ 'uri_anchor' => 0,
+ 'uri_bbcode' => 0,
),
'is_spam' => array(),
'method' => & $method,
);
$sum = & $progress['sum'];
$is_spam = & $progress['is_spam'];
- $asap = isset($method['asap']) ? $method['asap'] : TRUE;
+ $asap = isset($method['asap']);
// Return if ...
if (is_array($target)) {
}
return $progress;
}
+
+ // Area: There's an HTML anchor tag
+ if ((! $asap || ! $is_spam) && isset($method['area_anchor'])) {
+ $key = 'area_anchor';
+ $_asap = isset($method['asap']) ? array('asap' => TRUE) : array();
+ $result = area_pickup($target, array($key => TRUE) + $_asap);
+ if ($result) {
+ $sum[$key] += $result[$key];
+ $is_spam[$key] = TRUE;
+ }
+ }
+
+ // Area: There's a 'BBCode' linking tag
+ if ((! $asap || ! $is_spam) && isset($method['area_bbcode'])) {
+ $key = 'area_bbcode';
+ $_asap = isset($method['asap']) ? array('asap' => TRUE) : array();
+ $result = area_pickup($target, array($key => TRUE) + $_asap);
+ if ($result) {
+ $sum[$key] += $result[$key];
+ $is_spam[$key] = TRUE;
+ }
+ }
+
+ // Return if ...
+ if ($asap && $is_spam) {
+ return $progress;
+ }
+ // URI Init
$pickups = spam_uri_pickup($target, $method);
if (empty($pickups)) {
return $progress;
}
- // Check quantity
+ // URI: Check quantity
$sum['quantity'] += count($pickups);
// URI quantity
if ((! $asap || ! $is_spam) && isset($method['quantity']) &&
$is_spam['quantity'] = TRUE;
}
- // Using invalid area: anchor
- if ((! $asap || ! $is_spam) && isset($method['area_anchor'])) {
- $key = 'area_anchor';
+ // URI: used inside HTML anchor tag pair
+ if ((! $asap || ! $is_spam) && isset($method['uri_anchor'])) {
+ $key = 'uri_anchor';
foreach($pickups as $pickup) {
- $sum[$key] += $pickup['area'][$key];
- if(isset($method[$key]) &&
- $sum[$key] > $method[$key]) {
- $is_spam[$key] = TRUE;
+ if (isset($pickup['area'][$key])) {
+ $sum[$key] += $pickup['area'][$key];
+ if(isset($method[$key]) &&
+ $sum[$key] > $method[$key]) {
+ $is_spam[$key] = TRUE;
+ if ($asap && $is_spam) break;
+ }
if ($asap && $is_spam) break;
}
- if ($asap && $is_spam) break;
}
}
- // Using invalid area: bbcode
- if ((! $asap || ! $is_spam) && isset($method['area_bbcode'])) {
- $key = 'area_bbcode';
+ // URI: used inside 'BBCode' pair
+ if ((! $asap || ! $is_spam) && isset($method['uri_bbcode'])) {
+ $key = 'uri_bbcode';
foreach($pickups as $pickup) {
- $sum[$key] += $pickup['area'][$key];
- if(isset($method[$key]) &&
- $sum[$key] > $method[$key]) {
- $is_spam[$key] = TRUE;
+ if (isset($pickup['area'][$key])) {
+ $sum[$key] += $pickup['area'][$key];
+ if(isset($method[$key]) &&
+ $sum[$key] > $method[$key]) {
+ $is_spam[$key] = TRUE;
+ if ($asap && $is_spam) break;
+ }
if ($asap && $is_spam) break;
}
- if ($asap && $is_spam) break;
}
}
- // URI uniqueness (and removing non-uniques)
+ // URI: Uniqueness (and removing non-uniques)
if ((! $asap || ! $is_spam) && isset($method['non_uniq'])) {
// Destructive normalize of URIs
unset($uris);
}
- // Unique host
+ // Return if ...
+ if ($asap && $is_spam) {
+ return $progress;
+ }
+
+ // URI: Unique host
$hosts = array();
foreach ($pickups as $pickup) $hosts[] = & $pickup['host'];
$hosts = array_unique($hosts);
$sum['uniqhost'] += count($hosts);
- // Bad host
+ // URI: Bad host
if ((! $asap || ! $is_spam) && isset($method['badhost'])) {
$count = array_count_leaves(is_badhost($hosts, $asap));
$sum['badhost'] += $count;
{
global $notify_subject;
- $asap = isset($method['asap']) ? $method['asap'] : TRUE;
+ $asap = isset($method['asap']);
$footer['ACTION'] = 'Blocked by: ' . summarize_spam_progress($progress, TRUE);