2 // $Id: spam.php,v 1.32 2006/11/25 02:37:21 henoheno Exp $
3 // Copyright (C) 2006 PukiWiki Developers Team
4 // License: GPL v2 or (at your option) any later version
6 // Functions for Concept-work of spam-uri metrics
8 // Return an array of URIs in the $string
9 // [OK] http://nasty.example.org#nasty_string
10 // [OK] http://nasty.example.org:80/foo/xxx#nasty_string/bar
11 // [OK] ftp://nasty.example.org:80/dfsdfs
12 // [OK] ftp://cnn.example.com&story=breaking_news@10.0.0.1/top_story.htm (from RFC3986)
13 function uri_pickup($string = '', $normalize = TRUE,
14 $preserve_rawuri = FALSE, $preserve_chunk = TRUE)
16 // Not available for: IDN(ignored)
19 // scheme://userinfo@host:port/path/or/pathinfo/maybefile.and?query=string#fragment
20 // Refer RFC3986 (Regex below is not strict)
21 '#(\b[a-z][a-z0-9.+-]{1,8})://' . // 1: Scheme
23 '([^\s<>"\'\[\]/\#?@]*)' . // 2: Userinfo (Username)
27 '\[[0-9a-f:.]+\]' . '|' . // IPv6([colon-hex and dot]): RFC2732
28 '(?:[0-9]{1,3}\.){3}[0-9]{1,3}(?![^\s<>"\'\[\]:/\#?])' . '|' . // IPv4(dot-decimal): 001.22.3.44 -- quantifier is {1,3} ('{1-3}' is taken literally by PCRE); the lookahead requires the address to end the host, so names like 1.2.3.4.example.org still fall through to the FQDN branch
29 '[^\s<>"\'\[\]:/\#?]+' . // FQDN: foo.example.org
31 '(?::([0-9]*))?' . // 4: Port
32 '((?:/+[^\s<>"\'\[\]/\#]+)*/+)?' . // 5: Directory path or path-info
33 '([^\s<>"\'\[\]\#]+)?' . // 6: File and query string
34 '(?:\#([a-z0-9._~%!$&\'()*+,;=:@-]*))?' . // 7: Fragment
36 $string, $array, PREG_SET_ORDER | PREG_OFFSET_CAPTURE
38 //var_dump(recursive_map('htmlspecialchars', $array));
41 static $parts = array(
42 1 => 'scheme', 2 => 'userinfo', 3 => 'host', 4 => 'port',
43 5 => 'path', 6 => 'file', 7 => 'fragment'
46 foreach(array_keys($array) as $uri) {
47 array_rename_keys($array[$uri], $parts, TRUE, $default);
48 $offset = $array[$uri]['scheme'][1]; // Scheme's offset
50 foreach(array_keys($array[$uri]) as $part) {
51 // Remove offsets for each part
52 $array[$uri][$part] = & $array[$uri][$part][0];
56 $array[$uri]['scheme'] = scheme_normalize($array[$uri]['scheme']);
57 //if ($array[$uri]['scheme'] === '') {
59 // unset ($array[$uri]);
63 $array[$uri]['host'] = strtolower($array[$uri]['host']);
64 $array[$uri]['port'] = port_normalize($array[$uri]['port'], $array[$uri]['scheme'], FALSE);
65 $array[$uri]['path'] = path_normalize($array[$uri]['path']);
67 //$array[$uri]['uri'] = uri_array_implode($array[$uri]);
68 if ($preserve_rawuri) $array[$uri]['rawuri'] = & $array[$uri][0];
70 $array[$uri]['uri'] = & $array[$uri][0]; // Raw
72 unset($array[$uri][0]); // Matched string itself
73 if (! $preserve_chunk) {
75 $array[$uri]['scheme'],
76 $array[$uri]['userinfo'],
81 $array[$uri]['fragment']
85 $array[$uri]['offset'] = $offset;
86 $array[$uri]['area'] = 0;
92 // Domain exposure callback (See spam_uri_pickup_preprocess())
93 // http://victim.example.org/?foo+site:nasty.example.com+bar
94 // => http://nasty.example.com/?refer=victim.example.org
95 // NOTE: 'refer=' is not so good for (at this time).
96 // Consider about using IP address of the victim, try to avoid that.
97 function _preg_replace_callback_domain_exposure($matches = array())
101 // Preserve the victim URI as a complicity or ...
102 if (isset($matches[5])) {
104 $matches[1] . '://' . // scheme
105 $matches[2] . '/' . // victim.example.org
106 $matches[3]; // The rest of all (before victim)
111 $matches[1] . '://' . // scheme
112 $matches[4] . // nasty.example.com
113 '/?refer=' . strtolower($matches[2]) . // victim.example.org
119 // Preprocess: rawurldecode() and adding space(s) to detect/count some URIs _if possible_
120 // NOTE: It may be dangerous to var_dump() the result. [e.g. 'javascript:']
121 // [OK] http://victim.example.org/go?http%3A%2F%2Fnasty.example.org
122 // [OK] http://victim.example.org/http://nasty.example.org
123 function spam_uri_pickup_preprocess($string = '')
125 if (! is_string($string)) return '';
127 $string = rawurldecode($string);
129 // Domain exposure (See _preg_replace_callback_domain_exposure())
130 $string = preg_replace_callback(
132 // Something Google: http://www.google.com/supported_domains
133 '#(http)://([a-z0-9.]+\.google\.[a-z]{2,3}(?:\.[a-z]{2})?)/' .
134 '([a-z0-9?=&.%_+-]+)' . // ?query=foo+
135 '\bsite:([a-z0-9.%_-]+)' . // site:nasty.example.com
139 '_preg_replace_callback_domain_exposure',
143 // URI exposure (uriuri => uri uri)
144 $string = preg_replace(
146 '#(?<! )(?:https?|ftp):/#',
147 // '#[a-z][a-z0-9.+-]{1,8}://#i',
148 // '#[a-z][a-z0-9.+-]{1,8}://#i'
157 // TODO: Area selection (Check BBCode only, check anchor only, check ...)
158 // Main function of spam-uri pickup
159 function spam_uri_pickup($string = '')
161 $string = spam_uri_pickup_preprocess($string);
163 $array = uri_pickup($string);
165 // Area elevation for '(especially external)link' intention
166 if (! empty($array)) {
167 // Anchor tags by preg_match_all()
168 // [OK] <a href="http://nasty.example.com">visit http://nasty.example.com/</a>
169 // [OK] <a href=\'http://nasty.example.com/\' >discount foobar</a>
170 // [NG] <a href="http://ng.example.com">visit http://ng.example.com _not_ended_
171 // [NG] <a href= >Good site!</a> <a href= "#" >test</a>
173 preg_match_all('#<a\b[^>]*href[^>]*>.*?</a\b[^>]*(>)#i',
174 $string, $areas, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);
175 //var_dump(recursive_map('htmlspecialchars', $areas));
176 foreach(array_keys($areas) as $area) {
177 $areas[$area] = array(
178 $areas[$area][0][1], // Area start (<a href>)
179 $areas[$area][1][1], // Area end (</a>)
182 area_measure($areas, $array);
184 // phpBB's "BBCode" by preg_match_all()
185 // [url]http://nasty.example.com/[/url]
186 // [link]http://nasty.example.com/[/link]
187 // [url=http://nasty.example.com]visit http://nasty.example.com/[/url]
188 // [link http://nasty.example.com/]buy something[/link]
191 preg_match_all('#\[(url|link)\b[^\]]*\].*?\[/\1\b[^\]]*(\])#i',
192 $string, $areas, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);
193 //var_dump(recursive_map('htmlspecialchars', $areas));
194 foreach(array_keys($areas) as $area) {
195 $areas[$area] = array(
196 $areas[$area][0][1], // Area start ([url])
197 $areas[$area][2][1], // Area end ([/url])
200 area_measure($areas, $array);
202 // Various Wiki syntax
203 // [text_or_uri>text_or_uri]
204 // [text_or_uri:text_or_uri]
205 // [text_or_uri|text_or_uri]
206 // [text_or_uri->text_or_uri]
207 // [text_or_uri text_or_uri] // MediaWiki
208 // MediaWiki: [http://nasty.example.com/ visit http://nasty.example.com/]
210 // Remove 'offset's for area_measure()
211 //foreach(array_keys($array) as $key)
212 // unset($array[$key]['offset']);
218 // $array['something'] => $array['wanted']
219 function array_rename_keys(& $array, $keys = array('from' => 'to'), $force = FALSE, $default = '')
221 if (! is_array($array) || ! is_array($keys))
224 // Nondestructive test
226 foreach(array_keys($keys) as $from)
227 if (! isset($array[$from]))
230 foreach($keys as $from => $to) {
231 if ($from === $to) continue;
232 if (! $force || isset($array[$from])) {
233 $array[$to] = & $array[$from];
234 unset($array[$from]);
236 $array[$to] = $default;
243 // If in doubt, it's a little doubtful
244 function area_measure($areas, & $array, $belief = -1, $a_key = 'area', $o_key = 'offset')
246 if (! is_array($areas) || ! is_array($array)) return;
248 $areas_keys = array_keys($areas);
249 foreach(array_keys($array) as $u_index) {
250 $offset = isset($array[$u_index][$o_key]) ?
251 intval($array[$u_index][$o_key]) : 0;
252 foreach($areas_keys as $a_index) {
253 if (isset($array[$u_index][$a_key])) {
254 $offset_s = intval($areas[$a_index][0]);
255 $offset_e = intval($areas[$a_index][1]);
256 // [Area => inside <= Area]
257 if ($offset_s < $offset && $offset < $offset_e) {
258 $array[$u_index][$a_key] += $belief;
266 // ---------------------
269 // Scheme normalization: Renaming the schemes
270 // snntp://example.org => nntps://example.org
271 // NOTE: Keep the static lists simple. See also port_normalize().
272 function scheme_normalize($scheme = '', $considerd_harmfull = TRUE)
274 // Abbreviations that can be considered to have no link intention
275 static $abbrevs = array(
280 // Alias => normalized
281 static $aliases = array(
291 $scheme = strtolower(trim($scheme));
292 if (isset($abbrevs[$scheme])) {
293 if ($considerd_harmfull) {
294 $scheme = $abbrevs[$scheme];
299 if (isset($aliases[$scheme])) $scheme = $aliases[$scheme];
304 // Port normalization: Suppress the (redundant) default port
305 // HTTP://example.org:80/ => http://example.org/
306 // HTTP://example.org:8080/ => http://example.org:8080/
307 // HTTPS://example.org:443/ => https://example.org/
308 function port_normalize($port, $scheme, $scheme_normalize = TRUE)
310 // Schemes that users _maybe_ want to add protocol-handlers
311 // to their web browsers. (and attackers _maybe_ want to use ...)
312 // Reference: http://www.iana.org/assignments/port-numbers
313 static $array = array(
314 // scheme => default port
341 if ($port === '') return $port;
343 if ($scheme_normalize) $scheme = scheme_normalize($scheme);
344 if (isset($array[$scheme]) && $port == $array[$scheme])
345 $port = ''; // Ignore the defaults
350 // Path normalization
351 // http://example.org => http://example.org/
352 // http://example.org#hoge => http://example.org/#hoge
353 // http://example.org/path/a/b/./c////./d => http://example.org/path/a/b/c/d
354 // http://example.org/path/../../a/../back => http://example.org/back
355 function path_normalize($path = '', $divider = '/', $addroot = TRUE)
357 if (! is_string($path) || $path == '') {
358 $path = $addroot ? $divider : '';
361 $last = ($path[strlen($path) - 1] == $divider) ? $divider : '';
362 $array = explode($divider, $path);
365 foreach(array_keys($array) as $key) {
366 if ($array[$key] == '' || $array[$key] == '.')
371 foreach($array as $value) {
372 if ($value == '..') {
375 array_push($tmp, $value);
380 $path = $addroot ? $divider : '';
381 if (! empty($array)) $path .= implode($divider, $array) . $last;
387 // A URI array => A URI (See uri_pickup())
388 function uri_array_implode($uri = array())
390 if (empty($uri) || ! is_array($uri)) return NULL;
393 if (isset($uri['scheme']) && $uri['scheme'] !== '') {
394 $tmp[] = & $uri['scheme'];
397 if (isset($uri['userinfo']) && $uri['userinfo'] !== '') {
398 $tmp[] = & $uri['userinfo'];
401 if (isset($uri['host']) && $uri['host'] !== '') {
402 $tmp[] = & $uri['host'];
404 if (isset($uri['port']) && $uri['port'] !== '') {
406 $tmp[] = & $uri['port'];
408 if (isset($uri['path']) && $uri['path'] !== '') {
409 $tmp[] = & $uri['path'];
411 if (isset($uri['file']) && $uri['file'] !== '') {
412 $tmp[] = & $uri['file'];
414 if (isset($uri['fragment']) && $uri['fragment'] !== '') {
416 $tmp[] = & $uri['fragment'];
419 return implode('', $tmp);
422 // ---------------------
423 // Part One : Checker
425 function generate_glob_regex($string = '', $divider = '/')
427 static $from = array(
452 $string = str_replace($from, $mid, $string); // Hide
453 $string = preg_quote($string, $divider);
454 $string = str_replace($mid, $to, $string); // Unhide
460 // TODO: require_or_include_once(another file)
461 function is_badhost($host = '')
463 static $blocklist_regex;
465 if (! isset($blocklist_regex)) {
466 $blocklist_regex = array();
472 //'10.20.*.*', // 10.20.example.com also matches
475 // Too much malicious sub-domains
484 // 2006/11/19 17:50 dev
486 '*.bigpricesearch.org',
490 foreach ($blocklist as $part) {
491 $blocklist_regex[] = '#^' . generate_glob_regex($part, '#') . '$#';
495 $host = strtolower($host);
497 foreach ($blocklist_regex as $regex) {
498 if (preg_match($regex, $host)) {
507 // TODO return TRUE or FALSE!
508 // Simple/fast spam check
509 function is_uri_spam($target = '')
514 if (is_array($target)) {
515 foreach($target as $str) {
517 list($is_spam, $_urinum) = is_uri_spam($str);
522 $pickups = spam_uri_pickup($target);
523 $urinum += count($pickups);
524 if (! empty($pickups)) {
525 // Some users want to post some URLs, but ...
527 $is_spam = TRUE; // Too many!
529 foreach($pickups as $pickup) {
530 if ($pickup['area'] < 0) {
537 foreach ($pickups as $pickup) {
538 if (is_badhost($pickup['host'])) {
546 return array($is_spam, $urinum);
549 // ---------------------
551 // Check User-Agent (not testing yet)
552 function is_invalid_useragent($ua_name = '' /*, $ua_vars = ''*/ )
554 return $ua_name === '';
557 // ---------------------
559 // TODO: Separate check-part(s) and mail part
560 // TODO: Multi-metrics (uri, host, user-agent, ...)
561 // TODO: Mail to administrator with more measurement data?
562 // Simple/fast spam filter ($target: 'a string' or an array())
563 function pkwk_spamfilter($action, $page, $target = array('title' => ''))
567 //$is_spam = is_invalid_useragent('NOTYET');
569 $action .= ' (Invalid User-Agent)';
571 list($is_spam) = is_uri_spam($target);
575 // Mail to administrator(s)
576 global $notify, $notify_subject;
578 $footer['ACTION'] = $action;
579 $footer['PAGE'] = '[blocked] ' . $page;
580 $footer['URI'] = get_script_uri() . '?' . rawurlencode($page);
581 $footer['USER_AGENT'] = TRUE;
582 $footer['REMOTE_ADDR'] = TRUE;
583 pkwk_mail_notify($notify_subject, var_export($target, TRUE), $footer);
588 if ($is_spam) spam_exit();
591 // ---------------------
593 // Common behavior for blocking
594 // NOTE: Call this function from various blocking features, to disguise the reason 'why blocked'