2 // $Id: spam.php,v 1.28 2006/11/23 02:05:03 henoheno Exp $
3 // Copyright (C) 2006 PukiWiki Developers Team
4 // License: GPL v2 or (at your option) any later version
6 // Functions for Concept-work of spam-uri metrics
8 // Return an array of URIs in the $string
9 // [OK] http://nasty.example.org#nasty_string
10 // [OK] http://nasty.example.org:80/foo/xxx#nasty_string/bar
11 // [OK] ftp://nasty.example.org:80/dfsdfs
12 // [OK] ftp://cnn.example.com&story=breaking_news@10.0.0.1/top_story.htm (from RFC3986)
//
// Each match becomes one entry whose capture groups are renamed to
// scheme, userinfo, host, port, path, file and fragment. Every entry also gets
// an offset (position of the scheme inside $string) and an area score that starts at 0.
// $normalize       : presumably gates the scheme/host/port/path normalization below
//                    (the guarding line is not shown in this listing -- confirm)
// $preserve_rawuri : when TRUE, the matched text is additionally kept under the rawuri key
// $preserve_chunk  : when FALSE, the per-part keys listed near the end are handed to a call
//                    that is not shown in this listing (presumably unset()) -- confirm
// NOTE(review): the embedded line numbers skip values, so braces and some statements
// of this function are missing from this listing.
13 function uri_pickup($string = '', $normalize = TRUE,
14 $preserve_rawuri = FALSE, $preserve_chunk = TRUE)
16 // Not available for: IDN(ignored)
19 // scheme://userinfo@host:port/path/or/pathinfo/maybefile.and?query=string#fragment
20 // Refer RFC3986 (Regex below is not strict)
21 '#(\b[a-z][a-z0-9.+-]{1,8})://' . // 1: Scheme
23 '([^\s<>"\'\[\]/\#?@]*)' . // 2: Userinfo (Username)
27 '\[[0-9a-f:.]+\]' . '|' . // IPv6([colon-hex and dot]): RFC2732
// NOTE(review): {1-3} is not a PCRE quantifier ({1,3} is). PCRE takes it as the literal
// text "{1-3}", so this IPv4 alternative cannot match a dot-decimal address; such hosts
// are only picked up by the FQDN alternative below. TODO: confirm and change to {1,3}.
28 '(?:[0-9]{1-3}\.){3}[0-9]{1-3}' . '|' . // IPv4(dot-decimal): 001.22.3.44
29 '[^\s<>"\'\[\]:/\#?]+' . // FQDN: foo.example.org
31 '(?::([0-9]*))?' . // 4: Port
32 '((?:/+[^\s<>"\'\[\]/\#]+)*/+)?' . // 5: Directory path or path-info
33 '([^\s<>"\'\[\]\#]+)?' . // 6: File and query string
34 '(?:\#([a-z0-9._~%!$&\'()*+,;=:@-]*))?' . // 7: Fragment
// PREG_OFFSET_CAPTURE: every captured group arrives as array(text, offset)
36 $string, $array, PREG_SET_ORDER | PREG_OFFSET_CAPTURE
38 //var_dump(recursive_map('htmlspecialchars', $array));
// Capture-group number => key name used in each result entry
41 static $parts = array(
42 1 => 'scheme', 2 => 'userinfo', 3 => 'host', 4 => 'port',
43 5 => 'path', 6 => 'file', 7 => 'fragment'
46 foreach(array_keys($array) as $uri) {
// $force = TRUE: groups that did not take part in the match are filled with $default
47 array_rename_keys($array[$uri], $parts, TRUE, $default);
// Remember the offset now; the loop below discards the offsets
48 $offset = $array[$uri]['scheme'][1]; // Scheme's offset
50 foreach(array_keys($array[$uri]) as $part) {
51 // Remove offsets for each part
52 $array[$uri][$part] = & $array[$uri][$part][0];
56 $array[$uri]['scheme'] = scheme_normalize($array[$uri]['scheme']);
57 $array[$uri]['host'] = strtolower($array[$uri]['host']);
// FALSE: the scheme was normalized two lines above, so port_normalize() need not redo it
58 $array[$uri]['port'] = port_normalize($array[$uri]['port'], $array[$uri]['scheme'], FALSE);
59 $array[$uri]['path'] = path_normalize($array[$uri]['path']);
61 //$array[$uri]['uri'] = uri_array_implode($array[$uri]);
62 if ($preserve_rawuri) $array[$uri]['rawuri'] = & $array[$uri][0];
64 $array[$uri]['uri'] = & $array[$uri][0]; // Raw
66 unset($array[$uri][0]); // Matched string itself
67 if (! $preserve_chunk) {
// The call that receives this list is on a line not shown here (presumably unset()) -- confirm
69 $array[$uri]['scheme'],
70 $array[$uri]['userinfo'],
75 $array[$uri]['fragment']
79 $array[$uri]['offset'] = $offset;
// Neutral starting score; area_measure() adjusts it later
80 $array[$uri]['area'] = 0;
86 // Domain exposure callback (See spam_uri_pickup_preprocess())
87 // http://victim.example.org/?foo+site:nasty.example.com+bar
88 // => http://nasty.example.com/?refer=victim.example.org
89 // NOTE: 'refer=' is not so good for (at this time).
90 // Consider about using IP address of the victim, try to avoid that.
//
// $matches: capture groups handed over by preg_replace_callback(). Going by the inline
//   comments below: [1] scheme, [2] victim host, [3] the text after the victim host
//   (before site:), [4] the host named after site:.
// NOTE(review): both concatenations below start on lines that are not shown in this
//   listing, so what they are assigned to or returned as cannot be confirmed from here.
91 function _preg_replace_callback_domain_exposure($matches = array())
95 // Preserve the victim URI as a complicity or ...
96 if (isset($matches[5])) {
98 $matches[1] . '://' . // scheme
99 $matches[2] . '/' . // victim.example.org
100 $matches[3]; // The rest of all (before victim)
// The exposed URI: the host from site: with the lower-cased victim host as the refer= value
105 $matches[1] . '://' . // scheme
106 $matches[4] . // nasty.example.com
107 '/?refer=' . strtolower($matches[2]) . // victim.example.org
113 // Preprocess: rawurldecode() and adding space(s) to detect/count some URIs _if possible_
114 // NOTE: It may be dangerous to var_dump() the result. [e.g. 'javascript:']
115 // [OK] http://victim.example.org/go?http%3A%2F%2Fnasty.example.org
116 // [OK] http://victim.example.org/http://nasty.example.org
//
// Steps visible here: (1) rawurldecode() the input, (2) rewrite Google-style site: queries
// through _preg_replace_callback_domain_exposure(), (3) run a preg_replace() over scheme
// patterns (the replacement strings are on lines not shown in this listing).
// A non-string $string yields ''.
117 function spam_uri_pickup_preprocess($string = '')
119 if (! is_string($string)) return '';
// Decode first so that percent-encoded URIs (see the first [OK] sample above) become visible
121 $string = rawurldecode($string);
123 // Domain exposure (See _preg_replace_callback_domain_exposure())
124 $string = preg_replace_callback(
126 // Something Google: http://www.google.com/supported_domains
// Groups: 1 = scheme, 2 = Google host, 3 = query text before site:, 4 = host after site:
127 '#(http)://([a-z0-9.]+\.google\.[a-z]{2,3}(?:\.[a-z]{2})?)/' .
128 '([a-z0-9?=&.%_+-]+)' . // ?query=foo+
129 '\bsite:([a-z0-9.%_-]+)' . // site:nasty.example.com
133 '_preg_replace_callback_domain_exposure',
137 // Scheme exposure (schemescheme => scheme scheme)
138 $string = preg_replace(
140 '#(?:https?|ftp):/#',
141 '#\b[a-z][a-z0-9.+-]{1,8}://#i',
142 '#[a-z][a-z0-9.+-]{1,8}://#i'
151 // TODO: Area selection (Check BBCode only, check anchor only, check ...)
152 // Main function of spam-uri pickup
//
// Preprocesses $string, picks up its URIs with uri_pickup(), then, if any were found,
// locates anchor-tag and BBCode areas in the preprocessed string and passes them to
// area_measure() so that URIs lying inside such an area get their area score adjusted.
153 function spam_uri_pickup($string = '')
155 $string = spam_uri_pickup_preprocess($string);
157 $array = uri_pickup($string);
159 // Area elevation for '(especially external)link' intention
160 if (! empty($array)) {
161 // Anchor tags by preg_match_all()
162 // [OK] <a href="http://nasty.example.com">visit http://nasty.example.com/</a>
163 // [OK] <a href=\'http://nasty.example.com/\' >discount foobar</a>
164 // [NG] <a href="http://ng.example.com">visit http://ng.example.com _not_ended_
165 // [NG] <a href= >Good site!</a> <a href= "#" >test</a>
// Group 1 captures only the final > of the closing tag; its offset marks the area end
167 preg_match_all('#<a\b[^>]*href[^>]*>.*?</a\b[^>]*(>)#i',
168 $string, $areas, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);
169 //var_dump(recursive_map('htmlspecialchars', $areas));
// Reduce every match to array(start offset, end offset), the shape area_measure() reads
170 foreach(array_keys($areas) as $area) {
171 $areas[$area] = array(
172 $areas[$area][0][1], // Area start (<a href>)
173 $areas[$area][1][1], // Area end (</a>)
176 area_measure($areas, $array);
178 // phpBB's "BBCode" by preg_match_all()
179 // [url]http://nasty.example.com/[/url]
180 // [link]http://nasty.example.com/[/link]
181 // [url=http://nasty.example.com]visit http://nasty.example.com/[/url]
182 // [link http://nasty.example.com/]buy something[/link]
// Group 1 is the tag name (reused via \1 for the closing tag); group 2 is the final ]
185 preg_match_all('#\[(url|link)\b[^\]]*\].*?\[/\1\b[^\]]*(\])#i',
186 $string, $areas, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);
187 //var_dump(recursive_map('htmlspecialchars', $areas));
188 foreach(array_keys($areas) as $area) {
189 $areas[$area] = array(
190 $areas[$area][0][1], // Area start ([url])
191 $areas[$area][2][1], // Area end ([/url])
194 area_measure($areas, $array);
196 // Various Wiki syntax
197 // [text_or_uri>text_or_uri]
198 // [text_or_uri:text_or_uri]
199 // [text_or_uri|text_or_uri]
200 // [text_or_uri->text_or_uri]
201 // [text_or_uri text_or_uri] // MediaWiki
202 // MediaWiki: [http://nasty.example.com/ visit http://nasty.example.com/]
204 // Remove 'offset's for area_measure()
205 //foreach(array_keys($array) as $key)
206 // unset($array[$key]['offset']);
212 // $array['something'] => $array['wanted']
//
// Renames keys of $array in place (it is taken by reference).
// $keys    : map of old key => new key
// $force   : when TRUE, a missing old key does not block the rename; the new key is
//            then set to $default (the branch keyword between the two assignments
//            is on a line not shown in this listing -- presumably else)
// $default : value given to a new key whose old key is absent
// NOTE(review): the results of the two guard checks below are on lines not shown here.
213 function array_rename_keys(& $array, $keys = array('from' => 'to'), $force = FALSE, $default = '')
215 if (! is_array($array) || ! is_array($keys))
218 // Nondestructive test
// Checks that every old key exists before anything is modified
220 foreach(array_keys($keys) as $from)
221 if (! isset($array[$from]))
224 foreach($keys as $from => $to) {
// Renaming a key to itself would unset it below, so skip it
225 if ($from === $to) continue;
226 if (! $force || isset($array[$from])) {
// Bind the new key to the same value, then drop the old key
227 $array[$to] = & $array[$from];
228 unset($array[$from]);
230 $array[$to] = $default;
237 // If in doubt, it's a little doubtful
//
// Adds $belief to $array[...][$a_key] once for every area that strictly contains the
// entry's offset.
// $areas  : list of array(start offset, end offset)
// $array  : entries to score, modified in place (taken by reference)
// $belief : amount added per containing area (default -1)
// $a_key  : key of the score inside each entry; entries without this key are left alone
// $o_key  : key of the offset inside each entry; a missing offset is treated as 0
238 function area_measure($areas, & $array, $belief = -1, $a_key = 'area', $o_key = 'offset')
240 if (! is_array($areas) || ! is_array($array)) return;
242 $areas_keys = array_keys($areas);
243 foreach(array_keys($array) as $u_index) {
244 $offset = isset($array[$u_index][$o_key]) ?
245 intval($array[$u_index][$o_key]) : 0;
246 foreach($areas_keys as $a_index) {
247 if (isset($array[$u_index][$a_key])) {
248 $offset_s = intval($areas[$a_index][0]);
249 $offset_e = intval($areas[$a_index][1]);
250 // [Area => inside <= Area]
// Strict on both sides: an offset equal to either boundary does not count as inside
251 if ($offset_s < $offset && $offset < $offset_e) {
252 $array[$u_index][$a_key] += $belief;
260 // ---------------------
263 // Scheme normalization: Rename the schemes
264 // snntp://example.org => nntps://example.org
265 // NOTE: Keep the static list simple. See also port_normalize().
//
// Trims and lower-cases $scheme, then replaces it when it is a key of the alias table.
// NOTE(review): the alias entries and the return statement are on lines not shown in
// this listing.
266 function scheme_normalize($scheme = '')
268 static $aliases = array(
269 // alias => normalized
// Lower-case before the lookup so that the lookup is case-insensitive
279 $scheme = strtolower(trim($scheme));
280 if (isset($aliases[$scheme])) $scheme = $aliases[$scheme];
285 // Port normalization: Suppress the (redundant) default port
286 // HTTP://example.org:80/ => http://example.org/
287 // HTTP://example.org:8080/ => http://example.org:8080/
288 // HTTPS://example.org:443/ => https://example.org/
//
// $port             : port part of a URI; an empty string is returned unchanged
// $scheme           : scheme used to look up the default port
// $scheme_normalize : when TRUE, $scheme goes through scheme_normalize() first; a caller
//                     that has already normalized the scheme can pass FALSE
// NOTE(review): the table entries and the final return are on lines not shown here.
289 function port_normalize($port, $scheme, $scheme_normalize = TRUE)
291 // Schemes that users _maybe_ want to add protocol-handlers
292 // to their web browsers. (and attackers _maybe_ want to use ...)
293 // Reference: http://www.iana.org/assignments/port-numbers
294 static $array = array(
295 // scheme => default port
322 if ($port === '') return $port;
324 if ($scheme_normalize) $scheme = scheme_normalize($scheme);
// Loose comparison (==): presumably lets a string port match a numeric table entry -- confirm
325 if (isset($array[$scheme]) && $port == $array[$scheme])
326 $port = ''; // Ignore the defaults
331 // Path normalization
332 // http://example.org => http://example.org/
333 // http://example.org#hoge => http://example.org/#hoge
334 // http://example.org/path/a/b/./c////./d => http://example.org/path/a/b/c/d
335 // http://example.org/path/../../a/../back => http://example.org/back
//
// $path    : path to normalize; a non-string or empty value becomes $divider when
//            $addroot is TRUE, otherwise ''
// $divider : segment separator
// $addroot : when TRUE, the result starts with $divider
// NOTE(review): the statements that drop empty and single-dot segments, that handle a
// double-dot segment, and the return are on lines not shown in this listing.
336 function path_normalize($path = '', $divider = '/', $addroot = TRUE)
338 if (! is_string($path) || $path == '') {
339 $path = $addroot ? $divider : '';
// Remember whether the path ended with the divider so it can be re-appended after the rebuild
342 $last = ($path[strlen($path) - 1] == $divider) ? $divider : '';
343 $array = explode($divider, $path);
// Empty segments come from repeated dividers
346 foreach(array_keys($array) as $key) {
347 if ($array[$key] == '' || $array[$key] == '.')
352 foreach($array as $value) {
353 if ($value == '..') {
356 array_push($tmp, $value);
361 $path = $addroot ? $divider : '';
362 if (! empty($array)) $path .= implode($divider, $array) . $last;
368 // An URI array => An URI (See uri_pickup())
//
// Concatenates the non-empty parts of $uri in the order scheme, userinfo, host, port,
// path, file, fragment. Returns NULL when $uri is empty or not an array.
// NOTE(review): the lines between the checks below (presumably the ones that push the
// separators between parts) are not shown in this listing -- confirm against the full file.
369 function uri_array_implode($uri = array())
371 if (empty($uri) || ! is_array($uri)) return NULL;
// Each part is appended only when it is present and not an empty string
374 if (isset($uri['scheme']) && $uri['scheme'] !== '') {
375 $tmp[] = & $uri['scheme'];
378 if (isset($uri['userinfo']) && $uri['userinfo'] !== '') {
379 $tmp[] = & $uri['userinfo'];
382 if (isset($uri['host']) && $uri['host'] !== '') {
383 $tmp[] = & $uri['host'];
385 if (isset($uri['port']) && $uri['port'] !== '') {
387 $tmp[] = & $uri['port'];
389 if (isset($uri['path']) && $uri['path'] !== '') {
390 $tmp[] = & $uri['path'];
392 if (isset($uri['file']) && $uri['file'] !== '') {
393 $tmp[] = & $uri['file'];
395 if (isset($uri['fragment']) && $uri['fragment'] !== '') {
397 $tmp[] = & $uri['fragment'];
400 return implode('', $tmp);
403 // ---------------------
404 // Part One : Checker
407 // TODO: globbing for IP address or something
409 // TODO: require_or_include_once(another file)
//
// Tests $host against a block list that is compiled into regexes on the first call
// and kept in a static variable for later calls.
// NOTE(review): most block-list entries and every return statement are on lines not
// shown in this listing; presumably a match yields TRUE -- confirm.
410 function is_badhost($host = '')
412 static $blocklist_regex;
// Build the regex list only once per request
414 if (! isset($blocklist_regex)) {
422 // 2006/11/19 17:50 dev
424 '.bigpricesearch.org',
429 $blocklist_regex = array();
430 foreach ($blocklist as $part) {
431 if ($part[0] === '.') {
// Entry starting with a dot: unanchored at the start, so it matches any host ending with it
432 $blocklist_regex[] = '#' . preg_quote($part, '#') . '$#';
// Other entry: matches the host itself or any of its subdomains
434 $blocklist_regex[] = '#^(.*\.)?' . preg_quote($part, '#') . '$#';
439 foreach ($blocklist_regex as $regex) {
440 if (preg_match($regex, $host)) {
448 // TODO return TRUE or FALSE!
449 // Simple/fast spam check
//
// $target: a string, or an array whose elements are each checked by a recursive call.
// Returns array($is_spam, $urinum), a pair rather than the plain boolean the TODO above asks for.
// Checks visible for a string target: the number of picked-up URIs, a negative area
// score on any URI, and is_badhost() on each URI host.
// NOTE(review): the initial values, the URI-count threshold and the statements inside
// the checks are on lines not shown in this listing.
450 function is_uri_spam($target = '')
455 if (is_array($target)) {
456 foreach($target as $str) {
458 list($is_spam, $_urinum) = is_uri_spam($str);
463 $pickups = spam_uri_pickup($target);
464 $urinum += count($pickups);
465 if (! empty($pickups)) {
466 // Some users want to post some URLs, but ...
468 $is_spam = TRUE; // Too many!
470 foreach($pickups as $pickup) {
// A negative score means area_measure() placed this URI inside a link area
471 if ($pickup['area'] < 0) {
478 foreach ($pickups as $pickup) {
479 if (is_badhost($pickup['host'])) {
487 return array($is_spam, $urinum);
490 // ---------------------
492 // Check User-Agent (not testing yet)
// Returns TRUE only when $ua_name is exactly the empty string.
// The second parameter ($ua_vars) is commented out in the signature and is not accepted yet.
493 function is_invalid_useragent($ua_name = '' /*, $ua_vars = ''*/ )
495 return $ua_name === '';
498 // ---------------------
500 // TODO: Separate check-part(s) and mail part
501 // TODO: Multi-metrics (uri, host, user-agent, ...)
502 // TODO: Mail to administrator with more measurement data?
503 // Simple/fast spam filter ($target: 'a string' or an array())
//
// $action : label of the action being filtered; stored in the mail footer as ACTION
// $page   : page name; used for the PAGE and URI footer fields
// $target : content passed to is_uri_spam()
// Calls spam_exit() when the content was judged to be spam.
// NOTE(review): the conditions that guard the User-Agent branch and the mail
// notification are on lines not shown in this listing.
504 function pkwk_spamfilter($action, $page, $target = array('title' => ''))
508 //$is_spam = is_invalid_useragent('NOTYET');
510 $action .= ' (Invalid User-Agent)';
// Only the spam flag is used here; the URI count is_uri_spam() also returns is ignored
512 list($is_spam) = is_uri_spam($target);
516 // Mail to administrator(s)
517 global $notify, $notify_subject;
519 $footer['ACTION'] = $action;
520 $footer['PAGE'] = '[blocked] ' . $page;
521 $footer['URI'] = get_script_uri() . '?' . rawurlencode($page);
522 $footer['USER_AGENT'] = TRUE;
523 $footer['REMOTE_ADDR'] = TRUE;
// The mail body is a var_export() dump of the submitted content
524 pkwk_mail_notify($notify_subject, var_export($target, TRUE), $footer);
529 if ($is_spam) spam_exit();
532 // ---------------------
534 // Common behavior for blocking
535 // NOTE: Call this function from various blocking features, to disguise the reason 'why blocked'