2 // $Id: spam.php,v 1.21 2006/11/18 11:57:28 henoheno Exp $
3 // Copyright (C) 2006 PukiWiki Developers Team
4 // License: GPL v2 or (at your option) any later version
6 // Functions for Concept-work of spam-uri metrics
8 // Return an array of URIs in the $string
9 // [OK] http://nasty.example.org#nasty_string
10 // [OK] http://nasty.example.org:80/foo/xxx#nasty_string/bar
11 // [OK] ftp://nasty.example.org:80/dfsdfs
12 // [OK] ftp://cnn.example.com&story=breaking_news@10.0.0.1/top_story.htm (from RFC3986)
// Pick up every URI found in $string.
//
// Returns a list with one element per URI, in order of appearance. Each element is
//   array(
//     'scheme', 'user', 'pass', 'host', 'port', 'path', 'file' => captured text ('' when absent),
//     'offset' => byte offset of the scheme inside $string,
//     'area'   => 0 (a score that area_measure() may change later),
//   )
//
// $normalize : TRUE => scheme, host, port and path are normalized by
//              scheme_normalize(), strtolower(), port_normalize() and path_normalize()
//
// Not available for: IDN(ignored), Fragment(ignored)
//
// NOTE(review): the non-capturing groups around userinfo/host, the '#i' terminator,
// $default, the $normalize guard and the final return were missing from the text this
// block was restored from; they are rebuilt from the capture numbering (1..7) and from
// the way $default / $normalize are used below. Confirm against the upstream file.
function uri_pickup($string = '', $normalize = TRUE)
{
	$array = array();
	preg_match_all(
		// scheme://userinfo@host:port/path/or/pathinfo/maybefile.and?query=string#fragment
		'#(\b[a-z][a-z0-9.+-]{1,8})://' .	// 1: Scheme
		'(?:' .
			'([^\s<>"\'\[\]/\#?@]*)' .		// 2: Userinfo (Username)
			'(?:' .
				':([^\s<>"\'\[\]:/\#?@]*)' .	// 3: Userinfo (Password)
			')?' .
		'@)?' .
		'(' .
			// 4: Host
			'\[[0-9a-f:.]+\]' . '|' .		// IPv6([colon-hex and dot]): RFC2732
			// IPv4(dot-decimal): 001.22.3.44
			// The quantifier must be written {1,3}; '{1-3}' is literal text for PCRE
			// and made this alternative dead. The lookahead requires the host to end
			// right after the address, so 'http://1.2.3.4.example.org/' still falls
			// through to the FQDN alternative and is captured as one whole host.
			'(?:[0-9]{1,3}\.){3}[0-9]{1,3}(?![^\s<>"\'\[\]:/\#?])' . '|' .
			'[^\s<>"\'\[\]:/\#?]+' .		// FQDN: foo.example.org
		')' .
		'(?::([a-z0-9]{2,}))?' .			// 5: Port
		'((?:/+[^\s<>"\'\[\]/\#]+)*/+)?' .	// 6: Directory path or path-info
		'([^\s<>"\'\[\]\#]+)?' .			// 7: File and query string
							// #: Fragment(ignored)
		'#i',
		$string, $array, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);
	//var_dump(recursive_map('htmlspecialchars', $array));

	// Shrink $array: numbered captures => named parts
	$parts = array(1 => 'scheme', 2 => 'user', 3 => 'pass',
		4 => 'host', 5 => 'port', 6 => 'path', 7 => 'file');
	// Trailing groups that did not take part in the match are not reported at all,
	// so give them the same array(text [, offset]) shape as a real capture
	$default = array('');
	foreach(array_keys($array) as $uri) {
		unset($array[$uri][0]); // Matched string itself
		array_rename_keys($array[$uri], $parts, TRUE, $default);
		$offset = $array[$uri]['scheme'][1]; // Scheme's offset

		foreach(array_keys($array[$uri]) as $part) {
			// Remove offsets for each part (keep the captured text only)
			$array[$uri][$part] = $array[$uri][$part][0];
		}

		if ($normalize) {
			$array[$uri]['scheme'] = scheme_normalize($array[$uri]['scheme']);
			$array[$uri]['host']   = strtolower($array[$uri]['host']);
			// The scheme is already normalized above, so do not normalize it twice
			$array[$uri]['port']   = port_normalize($array[$uri]['scheme'], $array[$uri]['port'], FALSE);
			$array[$uri]['path']   = path_normalize($array[$uri]['path']);
		}

		$array[$uri]['offset'] = $offset;
		$array[$uri]['area']   = 0;
	}

	return $array;
}
66 // Preprocess: rawurldecode() and adding space(s) to detect/count some URIs _if possible_
67 // NOTE: It may be dangerous to var_dump() the result. [e.g. 'javascript:']
68 // [OK] http://victim.example.org/go?http%3A%2F%2Fnasty.example.org
69 // [OK] http://victim.example.org/http://nasty.example.org
// Prepare $string so that URIs hidden inside other URIs can be picked up separately.
// NOTE(review): the lines that consume the two patterns below (the replacement strings,
// the decoding step mentioned in the header comment, and what is returned for
// non-string input) are not visible in this view -- confirm before relying on details.
70 function spam_uri_pickup_preprocess($string = '')
// Only string input is processed
72 if (is_string($string)) {
// Two case-insensitive patterns that both match 'scheme://':
// the first only when the scheme starts at a word boundary,
// the second wherever it occurs (including glued to preceding word characters)
76 '#\b[a-z][a-z0-9.+-]{1,8}://#i',
77 '#[a-z][a-z0-9.+-]{1,8}://#i'
87 // TODO: Area selection (Check BBCode only, check anchor only, check ...)
88 // Main function of spam-uri pickup
// Pick up URIs from $string and score each one by the link-like markup around it.
// NOTE(review): some lines of this function (initialisation of $areas, closing
// brackets, the final return) are not visible in this view.
89 function spam_uri_pickup($string = '')
// Preprocess first, so every offset below refers to the preprocessed string
91 $string = spam_uri_pickup_preprocess($string);
93 $array = uri_pickup($string);
95 // Area elevation for '(especially external)link' intention
// Nothing to measure when no URI was found
96 if (! empty($array)) {
97 // Anchor tags by preg_match_all()
98 // [OK] <a href="http://nasty.example.com">visit http://nasty.example.com/</a>
99 // [OK] <a href=\'http://nasty.example.com/\' >discount foobar</a>
100 // [NG] <a href="http://ng.example.com">visit http://ng.example.com _not_ended_
101 // [NG] <a href= >Good site!</a> <a href= "#" >test</a>
// Capture group 1 is the final '>' of the closing tag; only its offset is used
103 preg_match_all('#<a\b[^>]*href[^>]*>.*?</a\b[^>]*(>)#i',
104 $string, $areas, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);
105 //var_dump(recursive_map('htmlspecialchars', $areas));
// Reduce every match to a pair of offsets: array(start, end)
106 foreach(array_keys($areas) as $area) {
107 $areas[$area] = array(
108 $areas[$area][0][1], // Area start (<a href>)
109 $areas[$area][1][1], // Area end (</a>)
// Adjust the 'area' score of every URI located inside one of the anchor areas
112 area_measure($areas, $array);
114 // phpBB's "BBCode" by preg_match_all()
115 // [url]http://nasty.example.com/[/url]
116 // [link]http://nasty.example.com/[/link]
117 // [url=http://nasty.example.com]visit http://nasty.example.com/[/url]
118 // [link http://nasty.example.com/]buy something[/link]
// Group 1 is the tag name (back-referenced by \1 so opening and closing tags agree),
// group 2 is the final ']' of the closing tag
121 preg_match_all('#\[(url|link)\b[^\]]*\].*?\[/\1\b[^\]]*(\])#i',
122 $string, $areas, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);
123 //var_dump(recursive_map('htmlspecialchars', $areas));
// Same reduction as above; the end offset comes from group 2 here
124 foreach(array_keys($areas) as $area) {
125 $areas[$area] = array(
126 $areas[$area][0][1], // Area start ([url])
127 $areas[$area][2][1], // Area end ([/url])
130 area_measure($areas, $array);
// The Wiki syntaxes listed below are only notes; no visible code handles them
132 // Various Wiki syntax
133 // [text_or_uri>text_or_uri]
134 // [text_or_uri:text_or_uri]
135 // [text_or_uri|text_or_uri]
136 // [text_or_uri->text_or_uri]
137 // [text_or_uri text_or_uri] // MediaWiki
138 // MediaWiki: [http://nasty.example.com/ visit http://nasty.example.com/]
// Disabled clean-up: 'offset' is currently left in every element
140 // Remove 'offset's for area_measure()
141 //foreach(array_keys($array) as $key)
142 // unset($array[$key]['offset']);
148 // $array['something'] => $array['wanted']
// Rename keys of $array in place: $array['something'] => $array['wanted']
//
// $array   : array to modify (by reference)
// $keys    : map of array('from' => 'to')
// $force   : FALSE => all or nothing; when any 'from' key is missing, $array is left
//                     untouched and FALSE is returned
//            TRUE  => a missing 'from' key produces $array['to'] = $default
// $default : value stored for missing keys when $force is TRUE
// Returns TRUE when the renaming was done, FALSE otherwise.
//
// NOTE(review): the FALSE/TRUE return values and the '! $force' guard of the
// nondestructive test were missing from the text this block was restored from; the
// guard is required by the $force semantics of the second loop -- confirm upstream.
function array_rename_keys(& $array, $keys = array('from' => 'to'), $force = FALSE, $default = '')
{
	if (! is_array($array) || ! is_array($keys))
		return FALSE;

	// Nondestructive test
	if (! $force) {
		foreach(array_keys($keys) as $from) {
			// array_key_exists(), not isset(): a key holding NULL does exist
			if (! array_key_exists($from, $array))
				return FALSE;
		}
	}

	foreach($keys as $from => $to) {
		// Compare as strings: array(1 => '1') names the same key twice, and
		// "renaming" it would alias the element to itself and then unset it
		if ((string)$from === (string)$to) continue;

		if (array_key_exists($from, $array)) {
			// Move by reference, so an element that is itself a reference stays one
			$array[$to] = & $array[$from];
			unset($array[$from]);
		} else {
			// Only reachable when $force is TRUE (the test above guarantees presence)
			$array[$to] = $default;
		}
	}

	return TRUE;
}
173 // If in doubt, it's a little doubtful
// Add $belief to the score of every URI that lies inside an area.
//
// $areas  : list of array(start offset, end offset)
// $array  : list of URI records (by reference); a record takes part only when it
//           already has an $a_key element; a missing $o_key counts as offset 0
// $belief : amount added to the score once for EACH area containing the URI
// $a_key  : key of the score inside a URI record
// $o_key  : key of the offset inside a URI record
// Returns nothing; when either argument is not an array, nothing is changed.
function area_measure($areas, & $array, $belief = -1, $a_key = 'area', $o_key = 'offset')
{
	if (! is_array($areas) || ! is_array($array)) return;

	// Convert the bounds once, instead of once per URI.
	// A missing bound counts as 0, which is what intval() made of it before
	$ranges = array();
	foreach($areas as $area) {
		$ranges[] = array(
			intval(isset($area[0]) ? $area[0] : 0),
			intval(isset($area[1]) ? $area[1] : 0)
		);
	}

	foreach(array_keys($array) as $u_index) {
		// Records without a score are never touched
		if (! isset($array[$u_index][$a_key])) continue;

		$offset = isset($array[$u_index][$o_key]) ?
			intval($array[$u_index][$o_key]) : 0;

		foreach($ranges as $range) {
			// [Area => inside <= Area] : both bounds are exclusive
			if ($range[0] < $offset && $offset < $range[1]) {
				$array[$u_index][$a_key] += $belief;
			}
		}
	}
}
196 // ---------------------
199 // Scheme normalization (Before port normalization)
200 // snntp://example.org => nntps://example.org
201 // NOTE: These aliases are needed only for anti URI spamming now. See port_normalize().
// Lower-case $scheme and replace it with its canonical name when it is a known alias.
// NOTE(review): the declaration and the contents of $aliases are not visible in this
// view, and neither is the return statement -- presumably $aliases is a static table
// and the normalized $scheme is returned; confirm against the full file.
202 function scheme_normalize($scheme = '')
// Build the alias table only while it is still unset
206 if (! isset($aliases)) {
// Lookup is done on the lower-cased scheme
218 $scheme = strtolower($scheme);
// Unknown schemes pass through unchanged (apart from the lower-casing)
219 if (isset($aliases[$scheme])) $scheme = $aliases[$scheme];
224 // Port normalization
225 // http://example.org:80/ => http://example.org/
226 // http://example.org:8080/ => http://example.org:8080/
227 // https://example.org:443/ => https://example.org/
228 // NOTE: These aliases are needed only for anti URI spamming now
// Drop the port when it is the well-known default port of the scheme.
//
// $scheme           : URI scheme (compared case-sensitively with the table below)
// $port             : port as a string of digits or as an integer
// $scheme_normalize : TRUE => pass $scheme through scheme_normalize() first
// Returns '' when $port is the default port of $scheme, otherwise $port unchanged.
//
// The comparison is deliberately strict. The former 'switch ($port)' / '==' form
// depended on the PHP version: before PHP 8, '80abc' == 80 and 0 == 'http' are both
// TRUE, so a bogus port (the URI pattern in uri_pickup() allows letters in the port)
// could be normalized away.
function port_normalize($scheme, $port = '', $scheme_normalize = TRUE)
{
	// Refer: http://www.iana.org/assignments/port-numbers
	static $defaults = array(
		'ftp'     => 21,
		'ssh'     => 22,
		'telnet'  => 23,
		'smtp'    => 25,
		'tftp'    => 69,
		'gopher'  => 70,
		'finger'  => 79,
		'http'    => 80,
		'pop3'    => 110,
		'sftp'    => 115,
		'nntp'    => 119,
		'imap'    => 143,
		'irc'     => 194,
		'wais'    => 210,
		'https'   => 443,
		'nntps'   => 563,
		'rsync'   => 873,
		'ftps'    => 990,
		'telnets' => 992,
		'imaps'   => 993,
		'ircs'    => 994,
		'pop3s'   => 995,
		'mysql'   => 3306,
	);

	if ($port === '') return $port;

	if ($scheme_normalize) $scheme = scheme_normalize($scheme);

	// Only a plain decimal number can be a default port ('080' still means 80)
	if (is_int($port)) {
		$number = $port;
	} else if (is_string($port) && preg_match('/^[0-9]+$/D', $port)) {
		$number = intval($port, 10);
	} else {
		return $port;
	}

	if (is_string($scheme) && isset($defaults[$scheme]) && $defaults[$scheme] === $number) {
		$port = '';
	}

	return $port;
}
264 // Path normalization
267 // /path/a/b/./c////./d => /path/a/b/c/d
268 // /path/../../a/../back => /back
// Normalize $path: drop empty and '.' segments and resolve '..' segments.
// $divider is the segment separator; $addroot decides whether the result starts
// with $divider.
// NOTE(review): several lines of this function (else-branch opener, the statements
// inside the two loops' branches, the creation of $tmp, whatever reassigns $array
// after back-tracking, and the return) are not visible in this view.
269 function path_normalize($path = '', $divider = '/', $addroot = TRUE)
// Non-string or empty input: the result is just the root (or '' without $addroot)
271 if (! is_string($path) || $path == '') {
272 $path = $addroot ? $divider : '';
// Remember whether the path ended with the divider, so it can be re-appended below
275 $last = ($path[strlen($path) - 1] == $divider) ? $divider : '';
276 $array = explode($divider, $path);
// First pass: visit every segment by key; empty and '.' segments are selected here
// (the action taken on them is not visible -- presumably they are removed)
279 foreach(array_keys($array) as $key) {
280 if ($array[$key] == '' || $array[$key] == '.')
// Second pass: '..' segments are treated separately from ordinary segments,
// which are pushed onto $tmp
285 foreach($array as $value) {
286 if ($value == '..') {
289 array_push($tmp, $value);
// Rebuild: optional root, then the remaining segments plus the remembered tail
294 $path = $addroot ? $divider : '';
295 if (! empty($array)) $path .= implode($divider, $array) . $last;
302 // Output: array('' => array('a' => array('b' => NULL)))
// Turn a delimited string into a single-branch tree of nested arrays.
//
// $string    : e.g. '/a/b'
// $delimiter : separator between the node names
// $reverse   : FALSE => the first name is the outermost key
//              TRUE  => the last name is the outermost key
// Returns the nested array; the innermost value is NULL.
//   array_tree('/a/b') => array('' => array('a' => array('b' => NULL)))
// An empty $delimiter cannot split anything: NULL is returned instead of letting
// explode() fail (it throws a ValueError on PHP 8).
function array_tree($string, $delimiter = '/', $reverse = FALSE)
{
	if ((string)$delimiter === '') return NULL;

	// Create a branch, growing it from the leaf outwards
	$tree = NULL;
	$tmps = explode($delimiter, $string);
	// The last name wrapped becomes the outermost key, so start from the end
	if (! $reverse) $tmps = array_reverse($tmps);
	foreach ($tmps as $tmp) {
		$tree = array($tmp => $tree);
	}

	return $tree;
}
316 // ---------------------
317 // Part One : Checker
319 // Simple/fast spam check
// Decide whether $target (a string, or an array that is checked element by element)
// looks like URI spam. Returns array($is_spam, $urinum).
// NOTE(review): the initial values, the thresholds and most branch bodies are not
// visible in this view; only the statements shown below are documented.
320 function is_uri_spam($target = '')
// Array input: recurse into every element
325 if (is_array($target)) {
326 foreach($target as $str) {
// Each recursive call yields its own verdict and URI count
328 list($is_spam, $_urinum) = is_uri_spam($str);
// Non-array input: pick up the URIs and add their number to the running count
333 $pickups = spam_uri_pickup($target);
334 $urinum += count($pickups);
335 if (! empty($pickups)) {
336 // Some users want to post some URLs, but ...
338 $is_spam = TRUE; // Too many!
// Inspect the per-URI 'area' score; a negative score means the URI was found
// inside at least one measured area (see area_measure(), default belief -1)
340 foreach($pickups as $pickup) {
341 if ($pickup['area'] < 0) {
350 return array($is_spam, $urinum);
353 // ---------------------
355 // Check User-Agent (not testing yet)
// Check User-Agent (not testing yet)
//
// $ua_name : User-Agent name to check
// Returns TRUE only when $ua_name is exactly the empty string ''; any other value
// (including NULL) is reported as not invalid.
function is_invalid_useragent($ua_name = '' /*, $ua_vars = ''*/ )
{
	return $ua_name === '';
}
361 // ---------------------
363 // TODO: Multi-metrics (uri, host, user-agent, ...)
364 // TODO: Mail to administrator with more measurement data?
365 // Simple/fast spam filter ($target: 'a string' or an array())
// Run the spam checks for $action on $page against $target (a string or an array),
// notify the administrator(s) and call spam_exit() when spam is detected.
// NOTE(review): the conditions guarding the User-Agent suffix and the mail
// notification are not visible in this view.
366 function pkwk_spamfilter($action, $page, $target = array('title' => ''))
// User-Agent check is not enabled yet
370 //$is_spam = is_invalid_useragent('NOTYET');
372 $action .= ' (Invalid User-Agent)';
// Only the verdict is used here; the URI count is ignored
374 list($is_spam) = is_uri_spam($target);
378 // Mail to administrator(s)
379 global $notify, $notify_subject;
// Footer fields for the notification; the mail body is a dump of $target
381 $footer['ACTION'] = $action;
382 $footer['PAGE'] = '[blocked] ' . $page;
383 $footer['URI'] = get_script_uri() . '?' . rawurlencode($page);
384 $footer['USER_AGENT'] = TRUE;
385 $footer['REMOTE_ADDR'] = TRUE;
386 pkwk_mail_notify($notify_subject, var_export($target, TRUE), $footer);
// Blocking goes through the common exit routine
391 if ($is_spam) spam_exit();
394 // ---------------------
396 // Common behavior for blocking
397 // NOTE: Call this function from various blocking features, to disguise the reason 'why blocked'