2 // $Id: spam.php,v 1.26 2006/11/23 01:16:13 henoheno Exp $
3 // Copyright (C) 2006 PukiWiki Developers Team
4 // License: GPL v2 or (at your option) any later version
6 // Functions for Concept-work of spam-uri metrics
8 // Return an array of URIs in the $string
9 // [OK] http://nasty.example.org#nasty_string
10 // [OK] http://nasty.example.org:80/foo/xxx#nasty_string/bar
11 // [OK] ftp://nasty.example.org:80/dfsdfs
12 // [OK] ftp://cnn.example.com&story=breaking_news@10.0.0.1/top_story.htm (from RFC3986)
// Scan $string for URIs and describe every match as an array of named parts.
// NOTE(review): this listing skips some original lines (see the gaps in the
// embedded line numbers), so the braces, the regex call itself and the
// return statement are not visible; the notes below cover visible lines only.
//   $string          : Text to scan
//   $normalize       : Presumably enables the scheme/host/port/path
//                      normalization below (the guarding 'if' is not visible)
//   $preserve_rawuri : TRUE to also keep the matched text as 'rawuri'
//   $preserve_chunk  : FALSE to drop the per-part entries (scheme, userinfo,
//                      ..., fragment) -- presumably via unset(); confirm
13 function uri_pickup($string = '', $normalize = TRUE,
14 $preserve_rawuri = FALSE, $preserve_chunk = TRUE)
16 // Not available for: IDN(ignored)
19 // scheme://userinfo@host:port/path/or/pathinfo/maybefile.and?query=string#fragment
20 // Refer RFC3986 (Regex below is not strict)
21 '#(\b[a-z][a-z0-9.+-]{1,8})://' . // 1: Scheme
23 '([^\s<>"\'\[\]/\#?@]*)' . // 2: Userinfo (Username)
27 '\[[0-9a-f:.]+\]' . '|' . // IPv6([colon-hex and dot]): RFC2732
// NOTE(review): '{1-3}' is not a PCRE quantifier (only {n}, {n,} and {n,m}
// are), so it is matched as literal text and this IPv4 alternative cannot
// match a dotted-decimal address; such hosts fall through to the FQDN
// alternative on the next line. '{1,3}' was probably intended -- but check
// inputs such as '1.2.3.4.example.com' before changing it, because an
// earlier alternative that matches wins.
28 '(?:[0-9]{1-3}\.){3}[0-9]{1-3}' . '|' . // IPv4(dot-decimal): 001.22.3.44
29 '[^\s<>"\'\[\]:/\#?]+' . // FQDN: foo.example.org
31 '(?::([0-9]*))?' . // 4: Port
32 '((?:/+[^\s<>"\'\[\]/\#]+)*/+)?' . // 5: Directory path or path-info
33 '([^\s<>"\'\[\]\#]+)?' . // 6: File and query string
34 '(?:\#([a-z0-9._~%!$&\'()*+,;=:@-]*))?' . // 7: Fragment
// Remaining arguments of the regex call (the call line itself is not visible,
// presumably preg_match_all): PREG_SET_ORDER groups the result per match and
// PREG_OFFSET_CAPTURE turns every capture into array(text, offset)
36 $string, $array, PREG_SET_ORDER | PREG_OFFSET_CAPTURE
38 //var_dump(recursive_map('htmlspecialchars', $array));
// Capture-group number => name of the URI part
41 static $parts = array(
42 1 => 'scheme', 2 => 'userinfo', 3 => 'host', 4 => 'port',
43 5 => 'path', 6 => 'file', 7 => 'fragment'
// For each match ($uri is the index of the match, not a URI string)
46 foreach(array_keys($array) as $uri) {
// Rename the numeric group keys to part names.
// NOTE(review): $default is not defined in the visible lines -- confirm
47 array_rename_keys($array[$uri], $parts, TRUE, $default);
// Remember where the URI starts before the offsets are stripped below
48 $offset = $array[$uri]['scheme'][1]; // Scheme's offset
50 foreach(array_keys($array[$uri]) as $part) {
51 // Remove offsets for each part
// i.e. replace each array(text, offset) pair by its text
52 $array[$uri][$part] = & $array[$uri][$part][0];
56 $array[$uri]['scheme'] = scheme_normalize($array[$uri]['scheme']);
57 $array[$uri]['host'] = strtolower($array[$uri]['host']);
// FALSE: the scheme was already normalized two lines above
58 $array[$uri]['port'] = port_normalize($array[$uri]['port'], $array[$uri]['scheme'], FALSE);
59 $array[$uri]['path'] = path_normalize($array[$uri]['path']);
61 //$array[$uri]['uri'] = uri_array_implode($array[$uri]);
// Key 0 still holds the whole matched string at this point
62 if ($preserve_rawuri) $array[$uri]['rawuri'] = & $array[$uri][0];
64 $array[$uri]['uri'] = & $array[$uri][0]; // Raw
66 unset($array[$uri][0]); // Matched string itself
67 if (! $preserve_chunk) {
69 $array[$uri]['scheme'],
70 $array[$uri]['userinfo'],
75 $array[$uri]['fragment']
// Byte offset of the URI in $string, read by area_measure()
79 $array[$uri]['offset'] = $offset;
// Initial score; area_measure() adds its $belief to this value
80 $array[$uri]['area'] = 0;
86 // Domain exposure callback (See spam_uri_pickup_preprocess())
87 // http://victim.example.org/?foo+site:nasty.example.com+bar
88 // => http://nasty.example.com/?refer=victim.example.org
// Callback for preg_replace_callback(): builds the replacement text from the
// capture groups in $matches.
// NOTE(review): this listing skips some original lines; the assignments that
// receive the two expressions below and the return statement are not visible.
89 function _preg_replace_callback_domain_exposure($matches = array())
93 // Preserve the victim URI as a complicity or ...
// NOTE(review): the visible part of the calling pattern defines groups 1-4
// only; group 5 is presumably defined on a line missing here -- confirm
94 if (isset($matches[5])) {
96 $matches[1] . '://' . // scheme
97 $matches[2] . '/' . // victim.example.org
98 $matches[3]; // The rest of all (before victim)
103 $matches[1] . '://' . // scheme
104 $matches[4] . // nasty.example.com
// NOTE(review): the example in the header comment shows '/?refer=' but the
// literal below is '/refer=' -- confirm which one is intended
105 '/refer=' . strtolower($matches[2]) . '/' . // victim.example.org
111 // Preprocess: rawurldecode() and adding space(s) to detect/count some URIs _if possible_
112 // NOTE: It may be dangerous to var_dump() the result. [e.g. 'javascript:']
113 // [OK] http://victim.example.org/go?http%3A%2F%2Fnasty.example.org
114 // [OK] http://victim.example.org/http://nasty.example.org
// Prepare $string for URI pickup: decode percent-escapes, then rewrite some
// patterns so that URIs embedded in other text can be matched.
// Returns '' when $string is not a string.
// NOTE(review): this listing skips some original lines; the pattern
// modifiers of the first call, the replacement of the second call and the
// final return are not visible.
115 function spam_uri_pickup_preprocess($string = '')
117 if (! is_string($string)) return '';
// Decode %XX sequences, e.g. 'http%3A%2F%2F' becomes 'http://'
119 $string = rawurldecode($string);
121 // Domain exposure (See _preg_replace_callback_domain_exposure())
122 $string = preg_replace_callback(
124 // Something Google: http://www.google.com/supported_domains
// Groups visible here: 1 = scheme, 2 = Google host, 3 = query text before
// 'site:', 4 = the domain named after 'site:'
125 '#(http)://([a-z0-9.]+\.google\.[a-z]{2,3}(?:\.[a-z]{2})?)/' .
126 '([a-z0-9?=&.%_+-]+)' . // ?query=foo+
127 '\bsite:([a-z0-9.%_-]+)' . // site:nasty.example.com
131 '_preg_replace_callback_domain_exposure',
135 // Scheme exposure (schemescheme => scheme scheme)
136 $string = preg_replace(
// NOTE(review): unlike the two patterns that follow, this one has no 'i'
// modifier, so it is case-sensitive -- confirm that this is intended
138 '#(?:https?|ftp):/#',
139 '#\b[a-z][a-z0-9.+-]{1,8}://#i',
140 '#[a-z][a-z0-9.+-]{1,8}://#i'
149 // TODO: Area selection (Check BBCode only, check anchor only, check ...)
150 // Main function of spam-uri pickup
// Pick up the URIs in $string (after preprocessing) and adjust the 'area'
// value of those located inside an HTML anchor or a BBCode [url]/[link] pair.
// NOTE(review): this listing skips some original lines; closing braces and
// the return statement are not visible.
151 function spam_uri_pickup($string = '')
153 $string = spam_uri_pickup_preprocess($string);
155 $array = uri_pickup($string);
157 // Area elevation for '(especially external)link' intention
158 if (! empty($array)) {
159 // Anchor tags by preg_match_all()
160 // [OK] <a href="http://nasty.example.com">visit http://nasty.example.com/</a>
161 // [OK] <a href=\'http://nasty.example.com/\' >discount foobar</a>
162 // [NG] <a href="http://ng.example.com">visit http://ng.example.com _not_ended_
163 // [NG] <a href= >Good site!</a> <a href= "#" >test</a>
// The only capture group is the final '>' of the closing tag, so that its
// offset can serve as the end of the area.
// NOTE(review): without the 's' modifier '.' does not match a newline, so an
// anchor whose content spans several lines is not detected -- confirm
165 preg_match_all('#<a\b[^>]*href[^>]*>.*?</a\b[^>]*(>)#i',
166 $string, $areas, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);
167 //var_dump(recursive_map('htmlspecialchars', $areas));
// Reduce each match to array(start offset, end offset)
168 foreach(array_keys($areas) as $area) {
169 $areas[$area] = array(
170 $areas[$area][0][1], // Area start (<a href>)
171 $areas[$area][1][1], // Area end (</a>)
174 area_measure($areas, $array);
176 // phpBB's "BBCode" by preg_match_all()
177 // [url]http://nasty.example.com/[/url]
178 // [link]http://nasty.example.com/[/link]
179 // [url=http://nasty.example.com]visit http://nasty.example.com/[/url]
180 // [link http://nasty.example.com/]buy something[/link]
// Group 1 is the tag name (reused as \1 for the closing tag); group 2 is the
// final ']' whose offset serves as the end of the area
183 preg_match_all('#\[(url|link)\b[^\]]*\].*?\[/\1\b[^\]]*(\])#i',
184 $string, $areas, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);
185 //var_dump(recursive_map('htmlspecialchars', $areas));
186 foreach(array_keys($areas) as $area) {
187 $areas[$area] = array(
188 $areas[$area][0][1], // Area start ([url])
189 $areas[$area][2][1], // Area end ([/url])
192 area_measure($areas, $array);
// The syntaxes listed below are only described; no visible code handles them
194 // Various Wiki syntax
195 // [text_or_uri>text_or_uri]
196 // [text_or_uri:text_or_uri]
197 // [text_or_uri|text_or_uri]
198 // [text_or_uri->text_or_uri]
199 // [text_or_uri text_or_uri] // MediaWiki
200 // MediaWiki: [http://nasty.example.com/ visit http://nasty.example.com/]
202 // Remove 'offset's for area_measure()
203 //foreach(array_keys($array) as $key)
204 // unset($array[$key]['offset']);
210 // $array['something'] => $array['wanted']
// Rename keys of $array in place, following the 'from' => 'to' map in $keys.
//   $array   : Array to modify (passed by reference)
//   $keys    : Map of existing key => new key
//   $force   : TRUE to fill a missing 'from' key with $default under its
//              'to' name
//   $default : Value stored for such missing keys
// NOTE(review): this listing skips some original lines; the return
// statements, the condition around the nondestructive test and the 'else'
// before the $default assignment are not visible.
211 function array_rename_keys(& $array, $keys = array('from' => 'to'), $force = FALSE, $default = '')
213 if (! is_array($array) || ! is_array($keys))
216 // Nondestructive test
// Checks every 'from' key before anything is modified (what happens when a
// key is missing is on a line not visible here)
218 foreach(array_keys($keys) as $from)
219 if (! isset($array[$from]))
222 foreach($keys as $from => $to) {
// Nothing to do when the name does not change
223 if ($from === $to) continue;
224 if (! $force || isset($array[$from])) {
// Bind the new key to the same value by reference, then drop the old key
225 $array[$to] = & $array[$from];
226 unset($array[$from]);
228 $array[$to] = $default;
235 // If in doubt, it's a little doubtful
// Add $belief to the $a_key value of every entry of $array whose $o_key
// offset lies strictly inside one of the $areas.
//   $areas  : List of array(start offset, end offset)
//   $array  : Entries to update (passed by reference)
//   $belief : Amount added for each enclosing area (default -1)
//   $a_key  : Key of the value to adjust (default 'area')
//   $o_key  : Key holding the entry's offset (default 'offset')
// NOTE(review): this listing skips some original lines (closing braces).
236 function area_measure($areas, & $array, $belief = -1, $a_key = 'area', $o_key = 'offset')
238 if (! is_array($areas) || ! is_array($array)) return;
240 $areas_keys = array_keys($areas);
241 foreach(array_keys($array) as $u_index) {
// An entry without an offset is treated as being at offset 0
242 $offset = isset($array[$u_index][$o_key]) ?
243 intval($array[$u_index][$o_key]) : 0;
244 foreach($areas_keys as $a_index) {
// Entries that have no $a_key value are left untouched
245 if (isset($array[$u_index][$a_key])) {
246 $offset_s = intval($areas[$a_index][0]);
247 $offset_e = intval($areas[$a_index][1]);
248 // [Area => inside <= Area]
// Both bounds are exclusive; an entry enclosed by several areas receives
// $belief once per area
249 if ($offset_s < $offset && $offset < $offset_e) {
250 $array[$u_index][$a_key] += $belief;
258 // ---------------------
261 // Scheme normalization: Rename the schemes
262 // snntp://example.org => nntps://example.org
263 // NOTE: Keep the static list simple. See also port_normalize().
// Lower-case and trim $scheme, then replace it when it is a known alias.
// NOTE(review): this listing skips some original lines; the entries of the
// alias table and the return statement are not visible.
264 function scheme_normalize($scheme = '')
// Built once and kept across calls (static)
266 static $aliases = array(
267 // alias => normalized
277 $scheme = strtolower(trim($scheme));
278 if (isset($aliases[$scheme])) $scheme = $aliases[$scheme];
283 // Port normalization: Suppress the (redundant) default port
284 // HTTP://example.org:80/ => http://example.org/
285 // HTTP://example.org:8080/ => http://example.org:8080/
286 // HTTPS://example.org:443/ => https://example.org/
// Blank out $port when it equals the default port listed for $scheme.
//   $port             : Port as captured from the URI ('' when absent)
//   $scheme           : URI scheme used to look up the default port
//   $scheme_normalize : TRUE to pass $scheme through scheme_normalize() first
// NOTE(review): this listing skips some original lines; the entries of the
// default-port table and the final return are not visible.
287 function port_normalize($port, $scheme, $scheme_normalize = TRUE)
289 // Schemes that users _maybe_ want to add protocol-handlers
290 // to their web browsers. (and attackers _maybe_ want to use ...)
291 // Reference: http://www.iana.org/assignments/port-numbers
292 static $array = array(
293 // scheme => default port
// Nothing to suppress when no port was given
320 if ($port === '') return $port;
322 if ($scheme_normalize) $scheme = scheme_normalize($scheme);
// Loose '==' comparison: a numeric string such as '80' equals the number 80
323 if (isset($array[$scheme]) && $port == $array[$scheme])
324 $port = ''; // Ignore the defaults
329 // Path normalization
330 // http://example.org => http://example.org/
331 // http://example.org#hoge => http://example.org/#hoge
332 // http://example.org/path/a/b/./c////./d => http://example.org/path/a/b/c/d
333 // http://example.org/path/../../a/../back => http://example.org/back
// Normalize a path: collapse empty and '.' segments and process '..'.
//   $path    : Path to normalize
//   $divider : Segment separator (default '/')
//   $addroot : TRUE to start the result with $divider
// NOTE(review): this listing skips some original lines; the statements that
// drop a segment, handle '..', fill $tmp back into $array and return the
// result are not visible, nor is the 'else' that presumably separates the
// empty-path case from the rest.
334 function path_normalize($path = '', $divider = '/', $addroot = TRUE)
// A non-string or empty path becomes the root (or '' when $addroot is FALSE)
336 if (! is_string($path) || $path == '') {
337 $path = $addroot ? $divider : '';
// Remember whether the path ended with a divider so that it can be re-added
340 $last = ($path[strlen($path) - 1] == $divider) ? $divider : '';
341 $array = explode($divider, $path);
344 foreach(array_keys($array) as $key) {
// Empty segments (from repeated dividers) and '.' carry no information
345 if ($array[$key] == '' || $array[$key] == '.')
350 foreach($array as $value) {
// '..' presumably removes the previously pushed segment -- confirm
351 if ($value == '..') {
354 array_push($tmp, $value);
359 $path = $addroot ? $divider : '';
360 if (! empty($array)) $path .= implode($divider, $array) . $last;
366 // A URI array => a URI (See uri_pickup())
// Join the non-empty parts of a URI array into one string, in the order
// scheme, userinfo, host, port, path, file, fragment.
// Returns NULL when $uri is empty or not an array.
// NOTE(review): this listing skips some original lines; the separators
// between the parts (presumably '://', '@', ':' and '#') and the
// initialization of $tmp are not visible.
367 function uri_array_implode($uri = array())
369 if (empty($uri) || ! is_array($uri)) return NULL;
// Each part is appended only when it is set and not the empty string
372 if (isset($uri['scheme']) && $uri['scheme'] !== '') {
373 $tmp[] = & $uri['scheme'];
376 if (isset($uri['userinfo']) && $uri['userinfo'] !== '') {
377 $tmp[] = & $uri['userinfo'];
380 if (isset($uri['host']) && $uri['host'] !== '') {
381 $tmp[] = & $uri['host'];
383 if (isset($uri['port']) && $uri['port'] !== '') {
385 $tmp[] = & $uri['port'];
387 if (isset($uri['path']) && $uri['path'] !== '') {
388 $tmp[] = & $uri['path'];
390 if (isset($uri['file']) && $uri['file'] !== '') {
391 $tmp[] = & $uri['file'];
393 if (isset($uri['fragment']) && $uri['fragment'] !== '') {
395 $tmp[] = & $uri['fragment'];
// The collected pieces are concatenated without any extra glue
398 return implode('', $tmp);
401 // ---------------------
402 // Part One : Checker
405 // TODO: globbing for IP address or something
407 // TODO: require_or_include_once(another file)
// Test $host against a list of blocked host-name suffixes.
// NOTE(review): this listing skips some original lines; the definition of
// $blocklist and the return statements are not visible.
408 function is_badhost($host = '')
// The regex list is built on the first call and reused afterwards (static)
410 static $blocklist_regex;
412 if (! isset($blocklist_regex)) {
416 foreach ($blocklist as $part) {
// Anchored at the end of the host, with a word boundary in front.
// NOTE(review): '-' and '.' are non-word characters, so an entry
// 'example.com' would also match 'bad-example.com' -- confirm intent
417 $blocklist_regex[] = '#\b' . preg_quote($part, '#') . '$#';
421 foreach ($blocklist_regex as $regex) {
422 if (preg_match($regex, $host)) {
430 // TODO return TRUE or FALSE!
431 // Simple/fast spam check
// Check $target (a string, or an array that is checked recursively) for
// spam-like URI usage.
// Returns array($is_spam, $urinum): the verdict and the number of URIs found.
// NOTE(review): this listing skips some original lines; the blocklist
// entries, the initialization of $is_spam and $urinum, the 'too many'
// threshold and the bodies of several conditions are not visible.
432 function is_uri_spam($target = '')
434 static $blocklist = array(
437 static $blocklist_regex;
// NOTE(review): $blocklist_regex is built here but is not referenced in the
// visible lines below (host checks go through is_badhost()) -- confirm
439 if (! isset($blocklist_regex)) {
440 foreach ($blocklist as $part) {
441 $blocklist_regex[] = '#\b' . preg_quote($part, '#') . '$#';
448 if (is_array($target)) {
449 foreach($target as $str) {
// Recurse into each element; it returns the same array($is_spam, $urinum)
451 list($is_spam, $_urinum) = is_uri_spam($str);
456 $pickups = spam_uri_pickup($target);
457 $urinum += count($pickups);
458 if (! empty($pickups)) {
459 // Some users want to post some URLs, but ...
461 $is_spam = TRUE; // Too many!
463 foreach($pickups as $pickup) {
// A negative 'area' means area_measure() found the URI inside an anchor
// or a BBCode pair (its default $belief is -1)
464 if ($pickup['area'] < 0) {
471 foreach ($pickups as $pickup) {
472 if (is_badhost($pickup['host'])) {
480 return array($is_spam, $urinum);
483 // ---------------------
485 // Check User-Agent (not testing yet)
// Return TRUE only when $ua_name is exactly the empty string.
// The second parameter ($ua_vars) is commented out and therefore unused.
486 function is_invalid_useragent($ua_name = '' /*, $ua_vars = ''*/ )
488 return $ua_name === '';
491 // ---------------------
493 // TODO: Separate check-part(s) and mail part
494 // TODO: Multi-metrics (uri, host, user-agent, ...)
495 // TODO: Mail to administrator with more measurement data?
496 // Simple/fast spam filter ($target: 'a string' or an array())
// Run the spam check on $target and, when it is judged as spam, notify the
// administrator(s) by mail and call spam_exit().
//   $action : Label of the action being checked, stored in the mail footer
//   $page   : Page name, stored in the mail footer
//   $target : A string or an array, passed to is_uri_spam()
// NOTE(review): this listing skips some original lines; the conditions that
// guard the User-Agent branch and the mail block are not visible.
497 function pkwk_spamfilter($action, $page, $target = array('title' => ''))
501 //$is_spam = is_invalid_useragent('NOTYET');
503 $action .= ' (Invalid User-Agent)';
// Only the verdict is used; the URI count returned as well is ignored
505 list($is_spam) = is_uri_spam($target);
509 // Mail to administrator(s)
510 global $notify, $notify_subject;
512 $footer['ACTION'] = $action;
513 $footer['PAGE'] = '[blocked] ' . $page;
514 $footer['URI'] = get_script_uri() . '?' . rawurlencode($page);
515 $footer['USER_AGENT'] = TRUE;
516 $footer['REMOTE_ADDR'] = TRUE;
// The mail body is a var_export() dump of $target
517 pkwk_mail_notify($notify_subject, var_export($target, TRUE), $footer);
522 if ($is_spam) spam_exit();
525 // ---------------------
527 // Common behavior for blocking
528 // NOTE: Call this function from various blocking features, to disguise the reason 'why blocked'