2 // $Id: spam.php,v 1.35 2006/11/25 12:58:20 henoheno Exp $
3 // Copyright (C) 2006 PukiWiki Developers Team
4 // License: GPL v2 or (at your option) any later version
6 // Functions for Concept-work of spam-uri metrics
8 // Return an array of URIs in the $string
9 // [OK] http://nasty.example.org#nasty_string
10 // [OK] http://nasty.example.org:80/foo/xxx#nasty_string/bar
11 // [OK] ftp://nasty.example.org:80/dfsdfs
12 // [OK] ftp://cnn.example.com&story=breaking_news@10.0.0.1/top_story.htm (from RFC3986)
//
// NOTE(review): this listing skips some original lines (see the gaps in the
// embedded line numbers), so the opening of the preg_match_all() call, some
// regex fragments, the branch conditions and the return are not visible here.
//
// Each matched URI becomes one entry keyed by part name: scheme, userinfo,
// host, port, path, file and fragment (regex groups 1-7), plus:
//   uri    - reference to the whole matched string (set on a branch whose
//            condition is not shown)
//   rawuri - the whole matched string again, only when $preserve_rawuri is set
//   offset - byte offset of the scheme inside $string
//   area   - initialised to 0 (area_measure() adjusts it later)
// $normalize presumably gates the scheme/host/port/path normalization below,
// and a false $preserve_chunk presumably drops the per-part entries; the
// guarding lines are not shown -- TODO confirm against the full file.
13 function uri_pickup($string = '', $normalize = TRUE,
14 $preserve_rawuri = FALSE, $preserve_chunk = TRUE)
16 // Not available for: IDN(ignored)
19 // scheme://userinfo@host:port/path/or/pathinfo/maybefile.and?query=string#fragment
20 // Refer to RFC3986 (Regex below is not strict)
21 '#(\b[a-z][a-z0-9.+-]{1,8})://' . // 1: Scheme
23 '([^\s<>"\'\[\]/\#?@]*)' . // 2: Userinfo (Username)
27 '\[[0-9a-f:.]+\]' . '|' . // IPv6([colon-hex and dot]): RFC2732
// NOTE(review): {1-3} is not a PCRE quantifier (the range form is {1,3}), so
// it is matched as literal text and this alternative cannot match a dotted
// IPv4 address; such hosts presumably fall through to the FQDN alternative
// below -- confirm and fix.
28 '(?:[0-9]{1-3}\.){3}[0-9]{1-3}' . '|' . // IPv4(dot-decimal): 001.22.3.44
29 '[^\s<>"\'\[\]:/\#?]+' . // FQDN: foo.example.org
31 '(?::([0-9]*))?' . // 4: Port
32 '((?:/+[^\s<>"\'\[\]/\#]+)*/+)?' . // 5: Directory path or path-info
33 '([^\s<>"\'\[\]\#]+)?' . // 6: File and query string
34 '(?:\#([a-z0-9._~%!$&\'()*+,;=:@-]*))?' . // 7: Fragment
// PREG_SET_ORDER: one entry per matched URI.
// PREG_OFFSET_CAPTURE: every group is a (string, byte offset) pair.
36 $string, $array, PREG_SET_ORDER | PREG_OFFSET_CAPTURE
38 //var_dump(recursive_map('htmlspecialchars', $array));
// Maps the numeric regex group index to the part name used in the result.
41 static $parts = array(
42 1 => 'scheme', 2 => 'userinfo', 3 => 'host', 4 => 'port',
43 5 => 'path', 6 => 'file', 7 => 'fragment'
46 foreach(array_keys($array) as $uri) {
// Rename numeric group keys to part names; $force is TRUE, so groups that did
// not match are filled with $default.
// NOTE(review): the definition of $default is not visible in this listing.
47 array_rename_keys($array[$uri], $parts, TRUE, $default);
48 $offset = $array[$uri]['scheme'][1]; // Scheme's offset
50 foreach(array_keys($array[$uri]) as $part) {
51 // Remove offsets for each part
// (each part is replaced by a reference to its own string element)
52 $array[$uri][$part] = & $array[$uri][$part][0];
56 $array[$uri]['scheme'] = scheme_normalize($array[$uri]['scheme']);
57 //if ($array[$uri]['scheme'] === '') {
59 // unset ($array[$uri]);
63 $array[$uri]['host'] = strtolower($array[$uri]['host']);
// The scheme was normalized just above, so port_normalize() is told not to
// normalize it again (third argument FALSE).
64 $array[$uri]['port'] = port_normalize($array[$uri]['port'], $array[$uri]['scheme'], FALSE);
65 $array[$uri]['path'] = path_normalize($array[$uri]['path']);
66 //$array[$uri]['uri'] = uri_array_implode($array[$uri]);
67 if ($preserve_rawuri) $array[$uri]['rawuri'] = & $array[$uri][0];
69 $array[$uri]['uri'] = & $array[$uri][0]; // Raw
71 unset($array[$uri][0]); // Matched string itself
72 if (! $preserve_chunk) {
74 $array[$uri]['scheme'],
75 $array[$uri]['userinfo'],
80 $array[$uri]['fragment']
84 $array[$uri]['offset'] = $offset;
85 $array[$uri]['area'] = 0;
91 // Domain exposure callback (See spam_uri_pickup_preprocess())
92 // http://victim.example.org/?foo+site:nasty.example.com+bar
93 // => http://nasty.example.com/?refer=victim.example.org
94 // NOTE: 'refer=' is not so good for (at this time).
95 // Consider about using IP address of the victim, try to avoid that.
//
// $matches comes from preg_replace_callback(): [1] scheme, [2] victim host,
// [3] the text between the host and "site:", [4] the host named after "site:".
// Group 5 is defined on a pattern line not shown in this listing -- TODO confirm.
// Visible behaviour: builds "<scheme>://<group 4>/?refer=<lower-cased group 2>",
// and, when group 5 is set, also rebuilds the victim URI from groups 1-3.
// How the two strings are combined and returned is on lines not shown here.
96 function _preg_replace_callback_domain_exposure($matches = array())
100 // Preserve the victim URI as a complicity or ...
101 if (isset($matches[5])) {
103 $matches[1] . '://' . // scheme
104 $matches[2] . '/' . // victim.example.org
105 $matches[3]; // The rest of all (before victim)
110 $matches[1] . '://' . // scheme
111 $matches[4] . // nasty.example.com
112 '/?refer=' . strtolower($matches[2]) . // victim.example.org
118 // Preprocess: rawurldecode() and adding space(s) to detect/count some URIs _if possible_
119 // NOTE: It may be dangerous to var_dump() the result. [e.g. 'javascript:']
120 // [OK] http://victim.example.org/go?http%3A%2F%2Fnasty.example.org
121 // [OK] http://victim.example.org/http://nasty.example.org
//
// Returns '' for non-string input. Otherwise the string is URL-decoded, then
// rewritten by two regex passes (domain exposure, URI exposure).
// NOTE(review): the replacement arguments and the return statement are on
// lines not shown in this listing.
122 function spam_uri_pickup_preprocess($string = '')
124 if (! is_string($string)) return '';
126 $string = rawurldecode($string);
128 // Domain exposure (See _preg_replace_callback_domain_exposure())
129 $string = preg_replace_callback(
131 // Something Google: http://www.google.com/supported_domains
// Groups: 1 scheme, 2 google host, 3 query text before "site:", 4 host after "site:"
132 '#(http)://([a-z0-9.]+\.google\.[a-z]{2,3}(?:\.[a-z]{2})?)/' .
133 '([a-z0-9?=&.%_+-]+)' . // ?query=foo+
134 '\bsite:([a-z0-9.%_-]+)' . // site:nasty.example.com
138 '_preg_replace_callback_domain_exposure',
142 // URI exposure (uriuri => uri uri)
143 $string = preg_replace(
// Matches http:/, https:/ or ftp:/ when not directly preceded by a space
145 '#(?<! )(?:https?|ftp):/#',
146 // '#[a-z][a-z0-9.+-]{1,8}://#i',
147 // '#[a-z][a-z0-9.+-]{1,8}://#i'
156 // TODO: Area selection (Check BBCode only, check anchor only, check ...)
157 // Main function of spam-uri pickup
//
// Preprocesses $string, extracts URIs with uri_pickup(), then (when any URI
// was found) locates anchor-tag and BBCode areas and hands their start/end
// offsets to area_measure(), which adjusts the 'area' value of the URIs.
// NOTE(review): the return statement is on a line not shown in this listing;
// presumably $array is returned -- TODO confirm.
158 function spam_uri_pickup($string = '')
160 $string = spam_uri_pickup_preprocess($string);
162 $array = uri_pickup($string);
164 // Area elevation for '(especially external)link' intention
165 if (! empty($array)) {
166 // Anchor tags by preg_match_all()
167 // [OK] <a href="http://nasty.example.com">visit http://nasty.example.com/</a>
168 // [OK] <a href=\'http://nasty.example.com/\' >discount foobar</a>
169 // [NG] <a href="http://ng.example.com">visit http://ng.example.com _not_ended_
170 // [NG] <a href= >Good site!</a> <a href= "#" >test</a>
// Group 1 captures the final ">" of the closing tag so that its offset can be
// used as the end of the area.
172 preg_match_all('#<a\b[^>]*href[^>]*>.*?</a\b[^>]*(>)#i',
173 $string, $areas, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);
174 //var_dump(recursive_map('htmlspecialchars', $areas));
// Reduce every match to a (start offset, end offset) pair
175 foreach(array_keys($areas) as $area) {
176 $areas[$area] = array(
177 $areas[$area][0][1], // Area start (<a href>)
178 $areas[$area][1][1], // Area end (</a>)
181 area_measure($areas, $array);
183 // phpBB's "BBCode" by preg_match_all()
184 // [url]http://nasty.example.com/[/url]
185 // [link]http://nasty.example.com/[/link]
186 // [url=http://nasty.example.com]visit http://nasty.example.com/[/url]
187 // [link http://nasty.example.com/]buy something[/link]
// Group 1 is the tag name (back-referenced by \1); group 2 captures the final
// "]" of the closing tag, whose offset marks the end of the area.
190 preg_match_all('#\[(url|link)\b[^\]]*\].*?\[/\1\b[^\]]*(\])#i',
191 $string, $areas, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);
192 //var_dump(recursive_map('htmlspecialchars', $areas));
193 foreach(array_keys($areas) as $area) {
194 $areas[$area] = array(
195 $areas[$area][0][1], // Area start ([url])
196 $areas[$area][2][1], // Area end ([/url])
199 area_measure($areas, $array);
201 // Various Wiki syntax
202 // [text_or_uri>text_or_uri]
203 // [text_or_uri:text_or_uri]
204 // [text_or_uri|text_or_uri]
205 // [text_or_uri->text_or_uri]
206 // [text_or_uri text_or_uri] // MediaWiki
207 // MediaWiki: [http://nasty.example.com/ visit http://nasty.example.com/]
209 // Remove 'offset's for area_measure()
210 //foreach(array_keys($array) as $key)
211 // unset($array[$key]['offset']);
217 // $array['something'] => $array['wanted']
//
// Renames keys of $array in place according to $keys (from => to).
//   $force   - when false, the rename is attempted for every pair after a
//              preliminary check that the source keys are set; when true, a
//              missing source key results in $array[$to] = $default
//              (presumably via an else branch whose line is not shown)
//   $default - value stored for missing source keys when $force is true
// NOTE(review): the return values and the exact condition guarding the
// "nondestructive test" are on lines not shown in this listing.
218 function array_rename_keys(& $array, $keys = array('from' => 'to'), $force = FALSE, $default = '')
220 if (! is_array($array) || ! is_array($keys))
223 // Nondestructive test
225 foreach(array_keys($keys) as $from)
226 if (! isset($array[$from]))
229 foreach($keys as $from => $to) {
230 if ($from === $to) continue;
231 if (! $force || isset($array[$from])) {
// Move by reference: the new key aliases the old value, then the old key is dropped
232 $array[$to] = & $array[$from];
233 unset($array[$from]);
235 $array[$to] = $default;
242 // If in doubt, it's a little doubtful
//
// For every entry of $array whose offset lies strictly inside one of $areas,
// add $belief to that entry's $a_key value (in place).
//   $areas  - list of (start offset, end offset) pairs
//   $array  - entries to update, modified by reference
//   $belief - amount added per enclosing area (default -1)
//   $a_key  - key holding the area score (default 'area'); entries without
//             this key are left untouched
//   $o_key  - key holding the entry offset (default 'offset'); a missing
//             offset is treated as 0
// Returns nothing when either argument is not an array.
243 function area_measure($areas, & $array, $belief = -1, $a_key = 'area', $o_key = 'offset')
245 if (! is_array($areas) || ! is_array($array)) return;
247 $areas_keys = array_keys($areas);
248 foreach(array_keys($array) as $u_index) {
249 $offset = isset($array[$u_index][$o_key]) ?
250 intval($array[$u_index][$o_key]) : 0;
251 foreach($areas_keys as $a_index) {
252 if (isset($array[$u_index][$a_key])) {
253 $offset_s = intval($areas[$a_index][0]);
254 $offset_e = intval($areas[$a_index][1]);
255 // [Area => inside <= Area]
// Both comparisons are strict, so an offset equal to a boundary is not inside
256 if ($offset_s < $offset && $offset < $offset_e) {
257 $array[$u_index][$a_key] += $belief;
265 // ---------------------
268 // Scheme normalization: Renaming the schemes
269 // snntp://example.org => nntps://example.org
270 // NOTE: Keep the static lists simple. See also port_normalize().
//
// The scheme is trimmed and lower-cased, then looked up in two static tables:
// $abbrevs (applied only when $considerd_harmfull is true) and $aliases.
// NOTE(review): the table contents, the branch taken when $considerd_harmfull
// is false, and the return statement are on lines not shown in this listing.
271 function scheme_normalize($scheme = '', $considerd_harmfull = TRUE)
273 // Abbreviations that are considered not to have link intention
274 static $abbrevs = array(
279 // Alias => normalized
280 static $aliases = array(
290 $scheme = strtolower(trim($scheme));
291 if (isset($abbrevs[$scheme])) {
292 if ($considerd_harmfull) {
293 $scheme = $abbrevs[$scheme];
298 if (isset($aliases[$scheme])) $scheme = $aliases[$scheme];
303 // Port normalization: Suppress the (redundant) default port
304 // HTTP://example.org:80/ => http://example.org/
305 // HTTP://example.org:8080/ => http://example.org:8080/
306 // HTTPS://example.org:443/ => https://example.org/
//
//   $port             - port as found in the URI; '' is returned unchanged
//   $scheme           - scheme used to look up the default port
//   $scheme_normalize - when true, $scheme is first passed through
//                       scheme_normalize(); pass FALSE if already normalized
// NOTE(review): the scheme => port table and the final return are on lines
// not shown in this listing.
307 function port_normalize($port, $scheme, $scheme_normalize = TRUE)
309 // Schemes that users _maybe_ want to add protocol-handlers
310 // to their web browsers. (and attackers _maybe_ want to use ...)
311 // Reference: http://www.iana.org/assignments/port-numbers
312 static $array = array(
313 // scheme => default port
340 if ($port === '') return $port;
342 if ($scheme_normalize) $scheme = scheme_normalize($scheme);
// Loose comparison (==), so a numeric string port can equal an integer default
343 if (isset($array[$scheme]) && $port == $array[$scheme])
344 $port = ''; // Ignore the defaults
349 // Path normalization
350 // http://example.org => http://example.org/
351 // http://example.org#hoge => http://example.org/#hoge
352 // http://example.org/path/a/b/./c////./d => http://example.org/path/a/b/c/d
353 // http://example.org/path/../../a/../back => http://example.org/back
//
//   $path    - path to normalize; a non-string or empty value becomes $divider
//              (or '' when $addroot is false)
//   $divider - path separator (default '/')
//   $addroot - when true, the result starts with $divider
// A trailing $divider on the input is remembered in $last and re-appended
// when the rebuilt path is not empty.
// NOTE(review): the bodies that drop empty and "." segments and that resolve
// ".." (presumably by popping from $tmp), plus the return, are on lines not
// shown in this listing.
354 function path_normalize($path = '', $divider = '/', $addroot = TRUE)
356 if (! is_string($path) || $path == '') {
357 $path = $addroot ? $divider : '';
360 $last = ($path[strlen($path) - 1] == $divider) ? $divider : '';
361 $array = explode($divider, $path);
364 foreach(array_keys($array) as $key) {
365 if ($array[$key] == '' || $array[$key] == '.')
370 foreach($array as $value) {
371 if ($value == '..') {
374 array_push($tmp, $value);
379 $path = $addroot ? $divider : '';
380 if (! empty($array)) $path .= implode($divider, $array) . $last;
386 // A URI array => a URI (See uri_pickup())
//
// Returns NULL when $uri is empty or not an array. Otherwise the parts that
// are set and not '' are collected in the order scheme, userinfo, host, port,
// path, file, fragment and joined without a separator.
// NOTE(review): the initialisation of $tmp and any delimiter pieces pushed
// between the parts are on lines not shown in this listing.
387 function uri_array_implode($uri = array())
389 if (empty($uri) || ! is_array($uri)) return NULL;
392 if (isset($uri['scheme']) && $uri['scheme'] !== '') {
393 $tmp[] = & $uri['scheme'];
396 if (isset($uri['userinfo']) && $uri['userinfo'] !== '') {
397 $tmp[] = & $uri['userinfo'];
400 if (isset($uri['host']) && $uri['host'] !== '') {
401 $tmp[] = & $uri['host'];
403 if (isset($uri['port']) && $uri['port'] !== '') {
405 $tmp[] = & $uri['port'];
407 if (isset($uri['path']) && $uri['path'] !== '') {
408 $tmp[] = & $uri['path'];
410 if (isset($uri['file']) && $uri['file'] !== '') {
411 $tmp[] = & $uri['file'];
413 if (isset($uri['fragment']) && $uri['fragment'] !== '') {
415 $tmp[] = & $uri['fragment'];
418 return implode('', $tmp);
421 // ---------------------
422 // Part One : Checker
// Converts a glob-like $string into a regex fragment in three steps: the
// tokens listed in $from are swapped for the placeholders in $mid, the rest
// is escaped with preg_quote() (using $divider as the regex delimiter), and
// the placeholders are then swapped for the entries of $to.
// NOTE(review): the contents of $from, $mid and $to and the return statement
// are on lines not shown in this listing.
424 function generate_glob_regex($string = '', $divider = '/')
426 static $from = array(
451 $string = str_replace($from, $mid, $string); // Hide
452 $string = preg_quote($string, $divider);
453 $string = str_replace($mid, $to, $string); // Unhide
459 // TODO: require_or_include_once(another file)
//
// Tests one host (string) or several hosts (array) against a block list of
// glob patterns. The patterns are compiled once into anchored,
// case-insensitive regexes and cached in a static variable.
// Non-string hosts are tested as ''.
// NOTE(review): most of the block list and the return statements are on
// lines not shown in this listing.
460 function is_badhost($hosts = '')
462 static $blocklist_regex;
464 if (! isset($blocklist_regex)) {
465 $blocklist_regex = array();
471 //'10.20.*.*', // 10.20.example.com also matches
474 // Too many malicious sub-domains
483 // 2006/11/19 17:50 dev
485 '*.bigpricesearch.org',
489 foreach ($blocklist as $part) {
// "#" is both the regex delimiter here and the delimiter passed for escaping
490 $blocklist_regex[] = '#^' . generate_glob_regex($part, '#') . '$#i';
494 if (! is_array($hosts)) $hosts = array($hosts);
495 foreach($hosts as $host) {
496 if (! is_string($host)) $host = '';
497 foreach ($blocklist_regex as $regex) {
498 if (preg_match($regex, $host)) {
507 // TODO return TRUE or FALSE!
508 // Simple/fast spam check
//
//   $target - a string, or an array of targets that are checked recursively
//             with their counters summed
//   $method - map of enabled checks; when empty or not an array, defaults are
//             used (visible defaults: 'quantity' => 8, 'non_uniq' => 3)
// Returns array($is_spam, $quantity, $non_uniq).
// Checks, each skipped once $is_spam is already set:
//   quantity - number of picked-up URIs exceeds $method['quantity']
//   area     - some URI has a negative 'area' value
//   non_uniq - number of duplicate URIs exceeds $method['non_uniq'];
//              duplicates are also removed from $pickups
//   badhost  - is_badhost() on the unique hosts
// NOTE(review): the initialisation of the counters and the lines that set
// $is_spam inside the checks are not shown in this listing.
509 function check_uri_spam($target = '', $method = array())
515 if (! is_array($method) || empty($method)) {
518 'quantity' => 8, // Allow N URIs
520 'non_uniq' => 3, // Allow N times dupe
525 if (is_array($target)) {
526 foreach($target as $str) {
528 list($is_spam, $_quantity, $_non_uniq) = check_uri_spam($str, $method);
529 $quantity += $_quantity;
530 $non_uniq += $_non_uniq;
534 $pickups = spam_uri_pickup($target);
535 $quantity += count($pickups);
537 if (! empty($pickups)) {
540 if (! $is_spam && isset($method['quantity']) &&
541 $quantity > $method['quantity']) {
544 //var_dump($method['quantity'], $is_spam);
546 // Using invalid area
547 if (! $is_spam && isset($method['area'])) {
548 foreach($pickups as $pickup) {
549 if ($pickup['area'] < 0) {
555 //var_dump($method['area'], $is_spam);
557 // URI uniqueness (and removing non-uniques)
558 if (! $is_spam && isset($method['non_uniq'])) {
560 foreach ($pickups as $key => $pickup) {
561 $uris[$key] = uri_array_implode($pickup);
563 $count = count($uris);
564 $uris = array_unique($uris);
// Number of entries removed by array_unique() = number of duplicates
565 $non_uniq += $count - count($uris);
566 if ($non_uniq > $method['non_uniq']) {
// Keys present in $pickups but dropped from $uris are the duplicates
569 foreach (array_diff(array_keys($pickups),
570 array_keys($uris)) as $remove) {
571 unset($pickups[$remove]);
575 //var_dump($uris, $pickups);
577 //var_dump($method['non_uniq'], $is_spam);
580 if (! $is_spam && isset($method['badhost'])) {
582 foreach ($pickups as $pickup) {
583 $hosts[] = & $pickup['host'];
585 $is_spam = is_badhost(array_unique($hosts));
587 //var_dump($method['badhost'], $is_spam);
591 return array($is_spam, $quantity, $non_uniq);
594 // ---------------------
596 // Check User-Agent (not testing yet)
// Returns TRUE only when $ua_name is exactly the empty string.
597 function is_invalid_useragent($ua_name = '' /*, $ua_vars = ''*/ )
599 return $ua_name === '';
602 // ---------------------
604 // TODO: Separate check-part(s) and mail part
605 // TODO: Mail to administrator with more measurement data?
606 // Simple/fast spam filter ($target: 'a string' or an array())
//
//   $action - label of the action being filtered; used in the mail footer
//   $page   - page name; used in the mail footer and the reported URI
//   $target - content to check, passed to check_uri_spam()
//   $method - check settings, passed to check_uri_spam()
// Calls spam_exit() when the content is judged to be spam.
// NOTE(review): the conditions around the User-Agent branch and the mail
// notification (presumably $is_spam and $notify) are on lines not shown in
// this listing -- TODO confirm.
607 function pkwk_spamfilter($action, $page, $target = array('title' => ''), $method = array())
611 //$is_spam = is_invalid_useragent('NOTYET');
613 $action .= ' (Invalid User-Agent)';
// Only the first element (the spam flag) of the returned array is used
615 list($is_spam) = check_uri_spam($target, $method);
619 // Mail to administrator(s)
620 global $notify, $notify_subject;
622 $footer['ACTION'] = $action;
623 $footer['PAGE'] = '[blocked] ' . $page;
624 $footer['URI'] = get_script_uri() . '?' . rawurlencode($page);
625 $footer['USER_AGENT'] = TRUE;
626 $footer['REMOTE_ADDR'] = TRUE;
627 pkwk_mail_notify($notify_subject, var_export($target, TRUE), $footer);
632 if ($is_spam) spam_exit();
635 // ---------------------
637 // Common behavior for blocking
638 // NOTE: Call this function from various blocking features, to disguise the reason 'why blocked'