2 // $Id: spam.php,v 1.131 2007/04/22 08:04:19 henoheno Exp $
3 // Copyright (C) 2006-2007 PukiWiki Developers Team
4 // License: GPL v2 or (at your option) any later version
6 // Functions for Concept-work of spam-uri metrics
8 // (PHP 4 >= 4.3.0): preg_match_all(PREG_OFFSET_CAPTURE): $method['uri_XXX'] related feature
// Path of the blocklist definition file that get_blocklist() includes.
// Only defined here when the caller has not defined it already.
10 if (! defined('SPAM_INI_FILE')) define('SPAM_INI_FILE', 'spam.ini.php');
12 // ---------------------
15 // (PHP 4 >= 4.2.0): var_export(): mail-reporting and dump related
// Fallback stub: when var_export() does not exist, define a replacement that
// takes no parameters and returns a fixed notice string.
16 if (! function_exists('var_export')) {
17 function var_export() {
18 return 'var_export() is not found on this server' . "\n";
22 // (PHP 4 >= 4.2.0): preg_grep() enables invert option
// Return the elements of $input that do NOT match $pattern.
//   $pattern : PCRE pattern
//   $input   : array of strings to filter
// NOTE(review): some short lines of this function (braces, branches) are not
// visible in this listing; the comments cover the visible statements only.
23 function preg_grep_invert($pattern = '//', $input = array())
// Detect (only if not decided yet) whether PREG_GREP_INVERT is available
26 if (! isset($invert)) $invert = defined('PREG_GREP_INVERT');
// Native inverted grep
29 return preg_grep($pattern, $input, PREG_GREP_INVERT);
// Emulation: subtract the matching elements from $input
31 $result = preg_grep($pattern, $input);
33 return array_diff($input, preg_grep($pattern, $input));
40 // ---------------------
43 // Return an array of URIs in the $string
44 // [OK] http://nasty.example.org#nasty_string
45 // [OK] http://nasty.example.org:80/foo/xxx#nasty_string/bar
46 // [OK] ftp://nasty.example.org:80/dfsdfs
47 // [OK] ftp://cnn.example.com&story=breaking_news@10.0.0.1/top_story.htm (from RFC3986)
// Find URIs in $string and split each one into named parts.
// Returns array() when $string is not a string.
// Each surviving entry is keyed by the names listed in $parts and also gets
// ['area']['offset'] for later use by area_measure().
// NOTE(review): several short lines (braces, initializers, the opening of the
// matching call, the final return) are not visible in this listing.
48 function uri_pickup($string = '')
50 if (! is_string($string)) return array();
52 // Not available for: IDN(ignored)
55 // scheme://userinfo@host:port/path/or/pathinfo/maybefile.and?query=string#fragment
56 // Refer RFC3986 (Regex below is not strict)
57 '#(\b[a-z][a-z0-9.+-]{1,8}):/+' . // 1: Scheme
59 '([^\s<>"\'\[\]/\#?@]*)' . // 2: Userinfo (Username)
63 '\[[0-9a-f:.]+\]' . '|' . // IPv6([colon-hex and dot]): RFC2732
64 '(?:[0-9]{1,3}\.){3}[0-9]{1,3}' . '|' . // IPv4(dot-decimal): 001.22.3.44
65 '[a-z0-9.-]+' . // hostname(FQDN) : foo.example.org
67 '(?::([0-9]*))?' . // 4: Port
68 '((?:/+[^\s<>"\'\[\]/\#]+)*/+)?' . // 5: Directory path or path-info
69 '([^\s<>"\'\[\]\#?]+)?' . // 6: File?
70 '(?:\?([^\s<>"\'\[\]\#]+))?' . // 7: Query string
71 '(?:\#([a-z0-9._~%!$&\'()*+,;=:@-]*))?' . // 8: Fragment
// Matches are collected into $array, one set per URI, each capture with its offset
73 $string, $array, PREG_SET_ORDER | PREG_OFFSET_CAPTURE
// Capture-group number => part name
77 static $parts = array(
78 1 => 'scheme', 2 => 'userinfo', 3 => 'host', 4 => 'port',
79 5 => 'path', 6 => 'file', 7 => 'query', 8 => 'fragment'
// Rename the numeric capture keys to part names, then drop the offsets
82 foreach(array_keys($array) as $uri) {
83 $_uri = & $array[$uri];
84 array_rename_keys($_uri, $parts, TRUE, $default);
85 $offset = $_uri['scheme'][1]; // Scheme's offset = URI's offset
86 foreach(array_keys($_uri) as $part) {
87 $_uri[$part] = & $_uri[$part][0]; // Remove offsets
// Discard scheme-less matches and attach the area offset
// NOTE(review): $offset is assigned in the loop above but consumed here; if
// the two loops are sequential as they appear, every URI receives the offset
// of the LAST matched URI -- confirm against the complete source.
91 foreach(array_keys($array) as $uri) {
92 $_uri = & $array[$uri];
93 if ($_uri['scheme'] === '') {
94 unset($array[$uri]); // Considered harmless
97 unset($_uri[0]); // Matched string itself
98 $_uri['area']['offset'] = $offset; // Area offset for area_measure()
104 // Normalize an array of URI arrays
105 // NOTE: Give me the uri_pickup() results
// Normalize the parts of every URI array in $pickups, in place (by reference).
// A non-array $pickups is returned untouched.
// Two normalization loops are visible:
//   - the first folds the host with host_normalize(), lower-cases path and
//     fragment, and normalizes file and query as well;
//   - the second only lower-cases the host and normalizes scheme, port and path.
// NOTE(review): presumably $destructive selects between the two loops -- the
// branching lines and the final return are not visible in this listing.
106 function uri_pickup_normalize(& $pickups, $destructive = TRUE)
108 if (! is_array($pickups)) return $pickups;
111 foreach (array_keys($pickups) as $key) {
112 $_key = & $pickups[$key];
113 $_key['scheme'] = isset($_key['scheme']) ? scheme_normalize($_key['scheme']) : '';
114 $_key['host'] = isset($_key['host']) ? host_normalize($_key['host']) : '';
// The scheme was normalized just above, so port_normalize() is told not to redo it
115 $_key['port'] = isset($_key['port']) ? port_normalize($_key['port'], $_key['scheme'], FALSE) : '';
116 $_key['path'] = isset($_key['path']) ? strtolower(path_normalize($_key['path'])) : '';
117 $_key['file'] = isset($_key['file']) ? file_normalize($_key['file']) : '';
118 $_key['query'] = isset($_key['query']) ? query_normalize($_key['query']) : '';
119 $_key['fragment'] = isset($_key['fragment']) ? strtolower($_key['fragment']) : '';
122 foreach (array_keys($pickups) as $key) {
123 $_key = & $pickups[$key];
124 $_key['scheme'] = isset($_key['scheme']) ? scheme_normalize($_key['scheme']) : '';
125 $_key['host'] = isset($_key['host']) ? strtolower($_key['host']) : '';
126 $_key['port'] = isset($_key['port']) ? port_normalize($_key['port'], $_key['scheme'], FALSE) : '';
127 $_key['path'] = isset($_key['path']) ? path_normalize($_key['path']) : '';
134 // An URI array => An URI (See uri_pickup())
136 // $pickups = uri_pickup('a string include some URIs');
138 // foreach (array_keys($pickups) as $key) {
139 // $uris[$key] = uri_pickup_implode($pickups[$key]);
// Rebuild one URI string from a single uri_pickup() entry.
// Returns NULL when $uri is empty or not an array; otherwise the non-empty
// parts are collected in order (scheme, userinfo, host, port, path, file,
// query, fragment) and concatenated with implode('').
// NOTE(review): the short lines between the visible ones (presumably the
// separators pushed next to each part) are not shown in this listing.
141 function uri_pickup_implode($uri = array())
143 if (empty($uri) || ! is_array($uri)) return NULL;
146 if (isset($uri['scheme']) && $uri['scheme'] !== '') {
147 $tmp[] = & $uri['scheme'];
150 if (isset($uri['userinfo']) && $uri['userinfo'] !== '') {
151 $tmp[] = & $uri['userinfo'];
154 if (isset($uri['host']) && $uri['host'] !== '') {
155 $tmp[] = & $uri['host'];
157 if (isset($uri['port']) && $uri['port'] !== '') {
159 $tmp[] = & $uri['port'];
161 if (isset($uri['path']) && $uri['path'] !== '') {
162 $tmp[] = & $uri['path'];
164 if (isset($uri['file']) && $uri['file'] !== '') {
165 $tmp[] = & $uri['file'];
167 if (isset($uri['query']) && $uri['query'] !== '') {
169 $tmp[] = & $uri['query'];
171 if (isset($uri['fragment']) && $uri['fragment'] !== '') {
173 $tmp[] = & $uri['fragment'];
176 return implode('', $tmp);
179 // $array['something'] => $array['wanted']
// Rename keys of $array in place, following the $keys map (from => to).
//   $array   : array to modify (by reference)
//   $keys    : map of old key => new key; identical pairs are skipped
//   $force   : see the rename loop below
//   $default : value stored under the new key when the old one is missing
// Returns FALSE when $array or $keys is not an array.
// NOTE(review): some short lines (braces, else, returns) are not visible here.
180 function array_rename_keys(& $array, $keys = array('from' => 'to'), $force = FALSE, $default = '')
182 if (! is_array($array) || ! is_array($keys)) return FALSE;
184 // Nondestructive test
// Looks for source keys missing from $array before anything is changed
186 foreach(array_keys($keys) as $from)
187 if (! isset($array[$from]))
190 foreach($keys as $from => $to) {
191 if ($from === $to) continue;
// Move the value to its new key (kept as a reference), then drop the old key
192 if (! $force || isset($array[$from])) {
193 $array[$to] = & $array[$from];
194 unset($array[$from]);
// Presumably the else branch: forced rename of a missing key gets $default
196 $array[$to] = $default;
203 // ---------------------
206 // Pickup all of markup areas
// Detect markup "areas" (HTML anchors and BBCode link tags) in $string.
// $method keys select what is collected into $area:
//   'area_anchor' / 'area_bbcode' : number of matches (with 'asap' set,
//                                   preg_match() is used, so at most 1)
//   'uri_anchor'  / 'uri_bbcode'  : list of array(start offset, end offset)
// With an empty $method, $area is returned as it is.
// NOTE(review): short lines (initializers, braces, the final return) are not
// visible in this listing.
207 function area_pickup($string = '', $method = array())
210 if (empty($method)) return $area;
212 // Anchor tag pair by preg_match and preg_match_all()
214 // [OK] <a href= >Good site!</a>
215 // [OK] <a href= "#" >test</a>
216 // [OK] <a href="http://nasty.example.com">visit http://nasty.example.com/</a>
217 // [OK] <a href=\'http://nasty.example.com/\' >discount foobar</a>
218 // [NG] <a href="http://ng.example.com">visit http://ng.example.com _not_ended_
// Group 1 captures the final '>' of '</a>'; its offset is used as the area end
219 $regex = '#<a\b[^>]*\bhref\b[^>]*>.*?</a\b[^>]*(>)#i';
220 if (isset($method['area_anchor'])) {
222 $count = isset($method['asap']) ?
223 preg_match($regex, $string) :
224 preg_match_all($regex, $string, $areas);
225 if (! empty($count)) $area['area_anchor'] = $count;
227 if (isset($method['uri_anchor'])) {
229 preg_match_all($regex, $string, $areas, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);
// Reduce every match to its (start, end) offset pair
230 foreach(array_keys($areas) as $_area) {
231 $areas[$_area] = array(
232 $areas[$_area][0][1], // Area start (<a href>)
233 $areas[$_area][1][1], // Area end (</a>)
236 if (! empty($areas)) $area['uri_anchor'] = $areas;
239 // phpBB's "BBCode" pair by preg_match and preg_match_all()
241 // [OK] [url]http://nasty.example.com/[/url]
242 // [OK] [link]http://nasty.example.com/[/link]
243 // [OK] [url=http://nasty.example.com]visit http://nasty.example.com/[/url]
244 // [OK] [link http://nasty.example.com/]buy something[/link]
// Group 1 is the tag name (back-referenced by \1), group 2 the closing ']'
245 $regex = '#\[(url|link)\b[^\]]*\].*?\[/\1\b[^\]]*(\])#i';
246 if (isset($method['area_bbcode'])) {
248 $count = isset($method['asap']) ?
249 preg_match($regex, $string) :
250 preg_match_all($regex, $string, $areas, PREG_SET_ORDER);
251 if (! empty($count)) $area['area_bbcode'] = $count;
253 if (isset($method['uri_bbcode'])) {
255 preg_match_all($regex, $string, $areas, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);
256 foreach(array_keys($areas) as $_area) {
257 $areas[$_area] = array(
258 $areas[$_area][0][1], // Area start ([url])
259 $areas[$_area][2][1], // Area end ([/url])
262 if (! empty($areas)) $area['uri_bbcode'] = $areas;
265 // Various Wiki syntax
266 // [text_or_uri>text_or_uri]
267 // [text_or_uri:text_or_uri]
268 // [text_or_uri|text_or_uri]
269 // [text_or_uri->text_or_uri]
270 // [text_or_uri text_or_uri] // MediaWiki
271 // MediaWiki: [http://nasty.example.com/ visit http://nasty.example.com/]
276 // If in doubt, it's a little doubtful
277 // if (Area => inside <= Area) the entry's counter += $belief (default: -1)
// Adjust the area counter of every entry located strictly inside an area.
//   $areas  : list of array(start offset, end offset) pairs
//   $array  : entries to update (by reference)
//   $belief : amount added to an entry's [$a_key] for each enclosing area
//   $a_key  : key of the counter to update in each entry
//   $o_key  : key holding the entry's own offset (0 is used when it is not set)
// Returns nothing; does nothing unless both $areas and $array are arrays.
278 function area_measure($areas, & $array, $belief = -1, $a_key = 'area', $o_key = 'offset')
280 if (! is_array($areas) || ! is_array($array)) return;
282 $areas_keys = array_keys($areas);
283 foreach(array_keys($array) as $u_index) {
284 $offset = isset($array[$u_index][$o_key]) ?
285 intval($array[$u_index][$o_key]) : 0;
286 foreach($areas_keys as $a_index) {
// Entries without the counter key are left alone
287 if (isset($array[$u_index][$a_key])) {
288 $offset_s = intval($areas[$a_index][0]);
289 $offset_e = intval($areas[$a_index][1]);
290 // [Area => inside <= Area]
// Strict comparison: an offset equal to either boundary does not count
291 if ($offset_s < $offset && $offset < $offset_e) {
292 $array[$u_index][$a_key] += $belief;
299 // ---------------------
302 // Domain exposure callback (See spam_uri_pickup_preprocess())
303 // http://victim.example.org/?foo+site:nasty.example.com+bar
304 // => http://nasty.example.com/?refer=victim.example.org
305 // NOTE: 'refer=' is not so good for (at this time).
306 // Consider about using IP address of the victim, try to avoid that.
// preg_replace_callback() handler, passed by name in spam_uri_pickup_preprocess().
// Visible behavior, based on the match groups:
//   $matches[1] scheme, $matches[2] victim host, $matches[3] rest of the URI,
//   $matches[4] exposed ("nasty") host, $matches[5] optional extra group
//   - when group 5 is set, the victim URI is rebuilt as scheme://victim/rest
//   - when group 4 is set, a URI pointing at the exposed host is built, with
//     the lower-cased victim host appended as '?refer='
// NOTE(review): the lines that store and return these strings are not visible
// in this listing.
307 function _preg_replace_callback_domain_exposure($matches = array())
311 // Preserve the victim URI as a complicity or ...
312 if (isset($matches[5])) {
314 $matches[1] . '://' . // scheme
315 $matches[2] . '/' . // victim.example.org
316 $matches[3]; // The rest of all (before victim)
320 if (isset($matches[4])) {
322 $matches[1] . '://' . // scheme
323 $matches[4] . // nasty.example.com
324 '/?refer=' . strtolower($matches[2]) . // victim.example.org
331 // Preprocess: rawurldecode() and adding space(s) and something
332 // to detect/count some URIs _if possible_
333 // NOTE: It's maybe danger to var_dump(result). [e.g. 'javascript:']
334 // [OK] http://victim.example.org/go?http%3A%2F%2Fnasty.example.org
335 // [OK] http://victim.example.org/http://nasty.example.org
336 // TODO: link.toolbot.com, urlx.org
// Prepare $string so that embedded or hidden URIs can be picked up later.
// Returns '' when $string is not a string. Visible steps:
//   1. rawurldecode() the whole string
//   2. "Domain exposure": rewrite search-engine 'site:' URIs through
//      _preg_replace_callback_domain_exposure()
//   3. "URI exposure": a preg_replace() on http/https/ftp scheme starts that
//      are not preceded by a space
// NOTE(review): parts of both regex definitions, the replacement arguments and
// the final return are not visible in this listing.
337 function spam_uri_pickup_preprocess($string = '')
339 if (! is_string($string)) return '';
341 $string = rawurldecode($string);
343 // Domain exposure (See _preg_replace_callback_domain_exposure())
344 $string = preg_replace_callback(
348 // Something Google: http://www.google.com/supported_domains
349 '(?:[a-z0-9.]+\.)?google\.[a-z]{2,3}(?:\.[a-z]{2})?' .
// NOTE(review): the dot in 'altavista.com' is not escaped, so it matches any
// single character there -- probably meant 'altavista\.com'
352 '(?:[a-z0-9.]+\.)?altavista.com' .
356 '([a-z0-9?=&.%_/\'\\\+-]+)' . // path/?query=foo+bar+
357 '\bsite:([a-z0-9.%_-]+\.[a-z0-9.%_-]+)' . // site:nasty.example.com
358 //'()' . // Preserve or remove?
361 '_preg_replace_callback_domain_exposure',
365 // URI exposure (uriuri => uri uri)
366 $string = preg_replace(
368 '#(?<! )(?:https?|ftp):/#i',
369 // '#[a-z][a-z0-9.+-]{1,8}://#i',
370 // '#[a-z][a-z0-9.+-]{1,8}://#i'
379 // Main function of spam-uri pickup,
380 // A wrapper function of uri_pickup()
// Pick up URIs from $string after preprocessing, and measure link areas.
//   $string : text to inspect
//   $method : enabled methods; replaced by check_uri_spam_method() when it is
//             empty or not an array
// For every URI found, ['area'][<method>] counters are initialized to 0 and
// then raised by area_measure() for each enclosing anchor/BBCode area; the
// temporary ['area']['offset'] values are removed at the end.
// NOTE(review): short lines (initializers, braces, the final return) are not
// visible in this listing.
381 function spam_uri_pickup($string = '', $method = array())
383 if (! is_array($method) || empty($method)) {
384 $method = check_uri_spam_method();
387 $string = spam_uri_pickup_preprocess($string);
389 $array = uri_pickup($string);
391 // Area elevation of URIs, for '(especially external)link' intention
392 if (! empty($array)) {
// Only the URI-in-area methods are forwarded to area_pickup()
394 if (isset($method['uri_anchor'])) $_method['uri_anchor'] = & $method['uri_anchor'];
395 if (isset($method['uri_bbcode'])) $_method['uri_bbcode'] = & $method['uri_bbcode'];
// NOTE(review): area_pickup() declares two parameters; the third argument
// passed here is not part of its signature
396 $areas = area_pickup($string, $_method, TRUE);
397 if (! empty($areas)) {
// $area_shadow aliases each URI's 'area' sub-array, so area_measure() below
// updates $array through these references
398 $area_shadow = array();
399 foreach (array_keys($array) as $key) {
400 $area_shadow[$key] = & $array[$key]['area'];
401 foreach (array_keys($_method) as $_key) {
402 $area_shadow[$key][$_key] = 0;
405 foreach (array_keys($_method) as $_key) {
406 if (isset($areas[$_key])) {
407 area_measure($areas[$_key], $area_shadow, 1, $_key);
413 // Remove 'offset's for area_measure()
414 foreach(array_keys($array) as $key)
415 unset($array[$key]['area']['offset']);
421 // ---------------------
424 // Scheme normalization: Renaming the schemes
425 // snntp://example.org => nntps://example.org
426 // NOTE: Keep the static lists simple. See also port_normalize().
// Normalize a URI scheme name.
//   $scheme           : scheme to normalize; non-strings yield ''
//   $abbrevs_harmfull : when TRUE, a scheme listed in $abbrevs is replaced by
//                       its mapped value; when FALSE it becomes ''
// The scheme is lower-cased first; after the abbreviation step, a scheme
// listed in $aliases is replaced by its normalized name.
// NOTE(review): the contents of both static tables and the final return are
// not visible in this listing.
427 function scheme_normalize($scheme = '', $abbrevs_harmfull = TRUE)
429 // Abbreviated schemes that are not intended as links
430 static $abbrevs = array(
435 // Aliases => normalized ones
436 static $aliases = array(
446 if (! is_string($scheme)) return '';
448 $scheme = strtolower($scheme);
449 if (isset($abbrevs[$scheme])) {
450 $scheme = $abbrevs_harmfull ? $abbrevs[$scheme] : '';
452 if (isset($aliases[$scheme])) {
453 $scheme = $aliases[$scheme];
459 // Hostname normalization (Destructive)
460 // www.foo => www.foo ('foo' seems TLD)
461 // www.foo.bar => foo.bar
462 // www.10.20 => www.10.20 (Invalid hostname)
464 // 'www' is mostly used as traditional hostname of WWW server.
465 // 'www.foo.bar' may be identical with 'foo.bar'.
// Lower-case $host and recognize a leading 'www.' label.
// Returns '' for a non-string $host.
// The pattern requires at least one more dot after 'www.' and a last label
// made of letters only, so 'www.foo' and 'www.10.20' do not match.
// NOTE(review): the lines that use $matches and return the result are not
// visible in this listing.
466 function host_normalize($host = '')
468 if (! is_string($host)) return '';
470 $host = strtolower($host);
472 if (preg_match('/^www\.(.+\.[a-z]+)$/', $host, $matches)) {
479 // Port normalization: Suppress the (redundant) default port
480 // HTTP://example.org:80/ => http://example.org/
481 // HTTP://example.org:8080/ => http://example.org:8080/
482 // HTTPS://example.org:443/ => https://example.org/
// Normalize a port number and suppress it when it is the scheme's default.
//   $port             : port as found in the URI
//   $scheme           : scheme used to look up the default port
//   $scheme_normalize : when TRUE, $scheme goes through scheme_normalize()
//                       before the lookup
// A valid port is converted with intval(); it becomes '' when it equals the
// default registered for $scheme in the static table.
// NOTE(review): the table contents, the result for an invalid port and the
// final return are not visible in this listing.
483 function port_normalize($port, $scheme, $scheme_normalize = FALSE)
485 // Schemes that users _maybe_ want to add protocol-handlers
486 // to their web browsers. (and attackers _maybe_ want to use ...)
487 // Reference: http://www.iana.org/assignments/port-numbers
488 static $array = array(
489 // scheme => default port
515 // intval() converts '0-1' to '0', so preg_match() rejects these invalid ones
516 if (! is_numeric($port) || $port < 0 || preg_match('/[^0-9]/i', $port))
519 $port = intval($port);
520 if ($scheme_normalize) $scheme = scheme_normalize($scheme);
521 if (isset($array[$scheme]) && $port == $array[$scheme])
522 $port = ''; // Ignore the defaults
527 // Path normalization
528 // http://example.org => http://example.org/
529 // http://example.org#hoge => http://example.org/#hoge
530 // http://example.org/path/a/b/./c////./d => http://example.org/path/a/b/c/d
531 // http://example.org/path/../../a/../back => http://example.org/back
// Normalize a path: collapse empty and '.' segments and resolve '..'.
//   $path     : path to normalize
//   $divider  : segment separator; when it is not a string, $path is returned
//               as it is (or '' if $path is not a string either)
//   $add_root : not referenced in the visible lines -- presumably controls the
//               leading divider; confirm against the complete source
// An empty or non-string $path yields $first_div alone. Otherwise the result
// is $first_div + the remaining segments joined by $divider + $last_div.
// NOTE(review): short lines (braces, else branches, unset/pop statements) are
// not visible in this listing.
532 function path_normalize($path = '', $divider = '/', $add_root = TRUE)
534 if (! is_string($divider)) return is_string($path) ? $path : '';
537 $first_div = & $divider;
541 if (! is_string($path) || $path == '') return $first_div;
// Checks whether $path ends with $divider (search starts at the last
// strlen($divider) bytes); this decides the trailing divider
543 if (strpos($path, $divider, strlen($path) - strlen($divider)) === FALSE) {
546 $last_div = & $divider;
549 $array = explode($divider, $path);
551 // Remove paddings ('//' and '/./')
552 foreach(array_keys($array) as $key) {
553 if ($array[$key] == '' || $array[$key] == '.') {
558 // Remove back-tracks ('/../')
560 foreach($array as $value) {
561 if ($value == '..') {
564 array_push($tmp, $value);
572 return $first_div . implode($divider, $array) . $last_div;
576 // DirectoryIndex normalize (Destructive and rough)
577 // TODO: sample.en.ja.html.gz => sample.html
// Reduce a file name to a rough canonical form (DirectoryIndex normalization).
//   $file : file part of a URI; non-strings yield ''
// Visible behavior:
//   - the name is lower-cased; names listed in $simple_defaults yield ''
//   - the name is split on '.'; the first piece is the body
//   - a trailing suffix listed in $encoding_suffix is removed
//   - suffixes listed in $language_suffix or $charset_suffix are removed
//   - with no suffix left, the body alone is returned
//   - body 'index' with exactly one remaining suffix listed in
//     $content_suffix yields ''
// NOTE(review): the contents of $content_suffix, $language_suffix and
// $encoding_suffix, and the final return, are not visible in this listing.
578 function file_normalize($file = 'index.html.en')
// File names treated as directory defaults as they are
580 static $simple_defaults = array(
581 'default.htm' => TRUE,
582 'default.html' => TRUE,
583 'default.asp' => TRUE,
584 'default.aspx' => TRUE,
585 'index' => TRUE, // Some system can omit the suffix
588 static $content_suffix = array(
589 // index.xxx, sample.xxx
604 static $language_suffix = array(
605 // Reference: Apache 2.0.59 'AddLanguage' default
634 // Reference: Apache 2.0.59 default 'index.html' variants
640 static $charset_suffix = array(
641 // Reference: Apache 2.0.59 'AddCharset' default
642 'iso8859-1' => TRUE, // ISO-8859-1
643 'latin1' => TRUE, // ISO-8859-1
644 'iso8859-2' => TRUE, // ISO-8859-2
645 'latin2' => TRUE, // ISO-8859-2
646 'cen' => TRUE, // ISO-8859-2
647 'iso8859-3' => TRUE, // ISO-8859-3
648 'latin3' => TRUE, // ISO-8859-3
649 'iso8859-4' => TRUE, // ISO-8859-4
650 'latin4' => TRUE, // ISO-8859-4
651 'iso8859-5' => TRUE, // ISO-8859-5
652 'latin5' => TRUE, // ISO-8859-5
653 'cyr' => TRUE, // ISO-8859-5
654 'iso-ru' => TRUE, // ISO-8859-5
655 'iso8859-6' => TRUE, // ISO-8859-6
656 'latin6' => TRUE, // ISO-8859-6
657 'arb' => TRUE, // ISO-8859-6
658 'iso8859-7' => TRUE, // ISO-8859-7
659 'latin7' => TRUE, // ISO-8859-7
660 'grk' => TRUE, // ISO-8859-7
661 'iso8859-8' => TRUE, // ISO-8859-8
662 'latin8' => TRUE, // ISO-8859-8
663 'heb' => TRUE, // ISO-8859-8
664 'iso8859-9' => TRUE, // ISO-8859-9
665 'latin9' => TRUE, // ISO-8859-9
666 'trk' => TRUE, // ISO-8859-9
667 'iso2022-jp'=> TRUE, // ISO-2022-JP
668 'jis' => TRUE, // ISO-2022-JP
669 'iso2022-kr'=> TRUE, // ISO-2022-KR
670 'kis' => TRUE, // ISO-2022-KR
671 'iso2022-cn'=> TRUE, // ISO-2022-CN
672 'cis' => TRUE, // ISO-2022-CN
674 'cp-1251' => TRUE, // ru, WINDOWS-1251
675 'win-1251' => TRUE, // ru, WINDOWS-1251
676 'cp866' => TRUE, // ru
677 'koi8-r' => TRUE, // ru, KOI8-r
678 'koi8-ru' => TRUE, // ru, KOI8-r
679 'koi8-uk' => TRUE, // ru, KOI8-ru
680 'ua' => TRUE, // ru, KOI8-ru
681 'ucs2' => TRUE, // ru, ISO-10646-UCS-2
682 'ucs4' => TRUE, // ru, ISO-10646-UCS-4
685 // Reference: Apache 2.0.59 default 'index.html' variants
690 // May uncompress by web browsers on the fly
691 // Must be at the last of the filename
692 // Reference: Apache 2.0.59 'AddEncoding'
693 static $encoding_suffix = array(
698 if (! is_string($file)) return '';
699 $_file = strtolower($file);
700 if (isset($simple_defaults[$_file])) return '';
703 // Roughly removing language/character-set/encoding suffixes
705 // * Apache 2 document about 'Content-negotiation', 'mod_mime' and 'mod_negotiation'
706 // http://httpd.apache.org/docs/2.0/content-negotiation.html
707 // http://httpd.apache.org/docs/2.0/mod/mod_mime.html
708 // http://httpd.apache.org/docs/2.0/mod/mod_negotiation.html
709 // * http://www.iana.org/assignments/character-sets
710 // * RFC3066: Tags for the Identification of Languages
711 // http://www.ietf.org/rfc/rfc3066.txt
712 // * ISO 639: codes of 'language names'
713 $suffixes = explode('.', $_file);
714 $body = array_shift($suffixes);
716 // Remove the last .gz/.z
// NOTE(review): end() takes its argument by reference, so passing the result
// of array_keys() raises "Only variables should be passed by reference".
// Also, for a name without any '.', $suffixes is empty here and $last_key is
// FALSE, so the lookup below reads an undefined offset -- consider guarding.
717 $last_key = end(array_keys($suffixes));
718 if (isset($encoding_suffix[$suffixes[$last_key]])) {
719 unset($suffixes[$last_key]);
722 // Cut language and charset suffixes
723 foreach($suffixes as $key => $value){
724 if (isset($language_suffix[$value]) || isset($charset_suffix[$value])) {
725 unset($suffixes[$key]);
728 if (empty($suffixes)) return $body;
731 $count = count($suffixes);
733 $current = current($suffixes);
734 if ($body == 'index' && $count == 1 && isset($content_suffix[$current])) return '';
739 // Sort query-strings if possible (Destructive and rough)
740 // [OK] &&&&f=d&b&d&c&a=0dd => a=0dd&b&c&d&f=d
741 // [OK] nothing==&eg=dummy&eg=padding&eg=foobar => eg=foobar
// Normalize a query string: drop '&' padding and rebuild it from its parts.
//   $string         : query string; non-strings yield ''
//   $equal          : not referenced in the visible lines -- presumably
//                     enables the 'key=value' handling; confirm
//   $equal_cutempty : when TRUE, 'key=' pairs with an empty value are dropped
//   $stortolower    : when TRUE, the whole string is lower-cased first
// Parts without '=' are kept as they are. For 'key=value' parts, extra
// leading '=' characters of the value are stripped and a later pair with the
// same key overwrites an earlier one. The result is joined with '&'.
// NOTE(review): short lines (braces, branches, unset/sort statements) are not
// visible in this listing.
742 function query_normalize($string = '', $equal = TRUE, $equal_cutempty = TRUE, $stortolower = TRUE)
744 if (! is_string($string)) return '';
745 if ($stortolower) $string = strtolower($string);
747 $array = explode('&', $string);
749 // Remove '&' paddings
750 foreach(array_keys($array) as $key) {
751 if ($array[$key] == '') {
756 // Consider '='-separated input and paddings
758 $equals = $not_equals = array();
759 foreach ($array as $part) {
760 if (strpos($part, '=') === FALSE) {
761 $not_equals[] = $part;
// Split on the first '=' only, so the value may itself contain '='
763 list($key, $value) = explode('=', $part, 2);
764 $value = ltrim($value, '=');
765 if (! $equal_cutempty || $value != '') {
766 $equals[$key] = $value;
// The 'key=value' pairs are appended after the parts without '='
771 $array = & $not_equals;
772 foreach ($equals as $key => $value) {
773 $array[] = $key . '=' . $value;
779 return implode('&', $array);
782 // ---------------------
783 // Part One : Checker
785 // Rough implementation of globbing
787 // USAGE: $regex = '/^' . generate_glob_regex('*.txt', '/') . '$/i';
// Convert a glob-like $string into a regex fragment (without delimiters).
//   $string  : glob pattern; non-strings yield ''
//   $divider : regex delimiter the caller will use; passed to preg_quote()
// Three steps: glob tokens ($from) are replaced by placeholders ($mid), the
// rest is escaped with preg_quote(), then the placeholders are replaced by
// their regex equivalents ($to).
// NOTE(review): the contents of $from, $mid and $to and the final return are
// not visible in this listing.
789 function generate_glob_regex($string = '', $divider = '/')
791 static $from = array(
794 // 22 => '[', // Maybe cause regex compilation error (e.g. '[]')
810 if (! is_string($string)) return '';
812 $string = str_replace($from, $mid, $string); // Hide
813 $string = preg_quote($string, $divider);
814 $string = str_replace($mid, $to, $string); // Unhide
819 // Rough hostname checker
821 // TODO: Strict digit, 0x, CIDR, IPv6
// Tell whether $string looks like an IPv4 address in dot-decimal notation.
//
// Accepted forms:
//   - a complete address of four groups:            '10.20.30.40'
//   - a prefix of one to three dot-terminated groups: '192.', '192.168.'
// Each group is 1-3 digits; the 0-255 range is NOT validated.
//
//   $string : hostname candidate
//   Returns : 4 when it seems IPv4, 0 otherwise (including non-strings)
function is_ip($string = '')
{
	if (! is_string($string)) return 0;

	// Both alternatives are wrapped in one group so that '^' and '$' anchor
	// each of them. Without the group, '^' applied to the first alternative
	// only and '$' to the second only, so hostnames like
	// '10.20.30.40.example.org' or 'foo.192.' were reported as IPv4.
	if (preg_match('/^(?:' .
		'(?:[0-9]{1,3}\.){3}[0-9]{1,3}' . '|' .
		'(?:[0-9]{1,3}\.){1,3}' . ')$/',
		$string)) {
		return 4;	// Seems IPv4(dot-decimal)
	} else {
		return 0;	// Seems not IP
	}
}
834 // Generate host (FQDN, IPv4, ...) regex
835 // 'localhost' : Matches with 'localhost' only
836 // 'example.org' : Matches with 'example.org' only (See host_normalize() about 'www')
837 // '.example.org' : Matches with ALL FQDN ended with '.example.org'
838 // '*.example.org' : Almost the same of '.example.org' except 'www.example.org'
839 // '10.20.30.40' : Matches with IPv4 address '10.20.30.40' only
840 // [TODO] '192.' : Matches with all IPv4 hosts started with '192.'
841 // TODO: IPv4, CIDR?, IPv6
// Build a regex fragment (without delimiters) matching the host spec $string.
//   $string  : host spec; non-strings yield ''
//   $divider : regex delimiter the caller will use
// Visible behavior:
//   - no '.' in the spec, or is_ip() accepts it: plain generate_glob_regex()
//   - otherwise the spec is split at the first '.':
//       empty first label ('.example.org') => prefix '(?:.*\.)?'
//       first label '*' ('*.example.org')  => prefix '.*\.'
//       any other first label              => plain generate_glob_regex()
//     and the remainder is converted with generate_glob_regex()
// NOTE(review): mb_strpos() needs the mbstring extension; short lines
// (braces, else) are not visible in this listing.
842 function generate_host_regex($string = '', $divider = '/')
844 if (! is_string($string)) return '';
846 if (mb_strpos($string, '.') === FALSE)
847 return generate_glob_regex($string, $divider);
850 if (is_ip($string)) {
852 return generate_glob_regex($string, $divider);
855 $part = explode('.', $string, 2);
856 if ($part[0] == '') {
857 $part[0] = '(?:.*\.)?'; // And all related FQDN
858 } else if ($part[0] == '*') {
859 $part[0] = '.*\.'; // All subdomains/hosts only
861 return generate_glob_regex($string, $divider);
863 $part[1] = generate_glob_regex($part[1], $divider);
864 return implode('', $part);
// Load the blocklist (once) and return its compiled regexes.
//   $list : name of one list to return
// On the first call ($regexs not set yet) SPAM_INI_FILE is included if it
// exists; it is expected to fill $blocklist. Each list named in
// $blocklist['list'] is converted through get_blocklist_add(); one level of
// nested arrays (groups) is supported.
// Visible returns: all of $regexs, or $regexs[$list] when that list exists.
// NOTE(review): the condition selecting "ALL", the fallback return and the
// default contents of $blocklist['list'] are not visible in this listing.
868 function get_blocklist($list = '')
872 if (! isset($regexs)) {
874 if (file_exists(SPAM_INI_FILE)) {
875 $blocklist = array();
876 include(SPAM_INI_FILE);
877 // $blocklist['badhost'] = array(
878 // '*.blogspot.com', // Blog services's subdomains (only)
879 // 'IANA-examples' => '#^(?:.*\.)?example\.(?:com|net|org)$#',
881 if (isset($blocklist['list'])) {
882 $regexs['list'] = & $blocklist['list'];
885 $blocklist['list'] = array(
// Compile every entry of every declared list
890 foreach(array_keys($blocklist['list']) as $_list) {
891 if (! isset($blocklist[$_list])) continue;
892 foreach ($blocklist[$_list] as $key => $value) {
893 if (is_array($value)) {
894 $regexs[$_list][$key] = array();
895 foreach($value as $_key => $_value) {
896 get_blocklist_add($regexs[$_list][$key], $_key, $_value);
899 get_blocklist_add($regexs[$_list], $key, $value);
902 unset($blocklist[$_list]);
908 return $regexs; // ALL
909 } else if (isset($regexs[$list])) {
910 return $regexs[$list];
916 // Subroutine of get_blocklist()
// Add one blocklist entry to $array (by reference).
//   - string $key: $value is stored under $key as a ready-made regex
//   - other keys (presumably the else branch): $value is a host spec; it is
//     stored under itself as an anchored, case-insensitive regex built by
//     generate_host_regex()
917 function get_blocklist_add(& $array, $key = 0, $value = '*.example.org')
919 if (is_string($key)) {
920 $array[$key] = & $value; // Treat $value as a regex
922 $array[$value] = '/^' . generate_host_regex($value, '/') . '$/i';
// Check $hosts against the lists returned by get_blocklist().
//   $hosts   : host name or array of host names; a non-array is wrapped
//   $asap    : when TRUE, matching may stop at the first hit
//   $remains : by-reference output -- presumably receives the hosts left
//              unmatched; its assignment is not visible here
// $result collects the matched hosts per blocklist label (one extra level for
// grouped labels); empty groups are removed again.
// One visible loop instead removes hosts matching a list from $hosts with
// preg_grep_invert() -- presumably a different kind of list; the branching
// lines are not visible.
// NOTE(review): a required parameter follows optional ones in the signature,
// which PHP 8 reports as deprecated.
926 function is_badhost($hosts = array(), $asap = TRUE, & $remains)
929 if (! is_array($hosts)) $hosts = array($hosts);
// Looks for non-string entries (the action taken is not visible)
930 foreach(array_keys($hosts) as $key) {
931 if (! is_string($hosts[$key])) {
935 if (empty($hosts)) return $result;
937 foreach(get_blocklist('list') as $key=>$value){
939 foreach (get_blocklist($key) as $label => $regex) {
940 if (is_array($regex)) {
941 $result[$label] = array();
942 foreach($regex as $_label => $_regex) {
943 if (is_badhost_avail($_label, $_regex, $hosts, $result[$label]) && $asap) {
947 if (empty($result[$label])) unset($result[$label]);
949 if (is_badhost_avail($label, $regex, $hosts, $result) && $asap) {
955 foreach (get_blocklist($key) as $regex) {
956 $hosts = preg_grep_invert($regex, $hosts);
958 if (empty($hosts)) return $result;
966 // Subroutine for is_badhost()
// Match $hosts against one blocklist regex.
//   $label  : key under which matches are stored in $result
//   $regex  : PCRE pattern to test
//   $hosts  : remaining hosts (by reference); matched hosts are removed
//   $result : output (by reference); $result[$label] gets the matched hosts
// NOTE(review): the condition around the last two statements and the return
// values are not visible in this listing.
967 function is_badhost_avail($label = '*.example.org', $regex = '/^.*\.example\.org$/', & $hosts, & $result)
969 $group = preg_grep($regex, $hosts);
972 // DEBUG var_dump($group); // badhost detail
974 $result[$label] = & $group;
975 $hosts = array_diff($hosts, $result[$label]);
982 // Default (enabled) methods and thresholds (for content insertion)
// Build the default method/threshold table for check_uri_spam().
//   $times  : multiplier applied to the quantity-type thresholds
//   $t_area : threshold used for the area-type methods
//   $rule   : not referenced in the visible lines
// Thresholds with a negative value are removed from $positive (disabling that
// method); the result is $positive merged with the boolean options in $bool
// ($positive wins on duplicate keys, as with any array '+').
// NOTE(review): the array-opening lines and some entries are not visible in
// this listing.
983 function check_uri_spam_method($times = 1, $t_area = 0, $rule = TRUE)
985 $times = intval($times);
986 $t_area = intval($t_area);
990 'quantity' => 8 * $times, // Allow N URIs
991 'non_uniqhost' => 3 * $times, // Allow N duped (and normalized) Hosts
992 //'non_uniquri'=> 3 * $times, // Allow N duped (and normalized) URIs
995 'area_anchor' => $t_area, // Using <a href> HTML tag
996 'area_bbcode' => $t_area, // Using [url] or [link] BBCode
997 //'uri_anchor' => $t_area, // URI inside <a href> HTML tag
998 //'uri_bbcode' => $t_area, // URI inside [url] or [link] BBCode
1003 //'asap' => TRUE, // Quit or return As Soon As Possible
1004 'uniqhost' => TRUE, // Show uniq host (at block notification mail)
1005 'badhost' => TRUE, // Check badhost
1011 // Remove non-$positive values
1012 foreach (array_keys($positive) as $key) {
1013 if ($positive[$key] < 0) unset($positive[$key]);
1016 return $positive + $bool;
1019 // Simple/fast spam check
// Run the enabled spam checks on $target and report the measurements.
//   $target : a string, or an array of strings/arrays (checked recursively
//             and merged)
//   $method : enabled methods and thresholds; replaced by
//             check_uri_spam_method() when empty or not an array
// Returns $progress, whose visible keys are:
//   'sum'     : counters per method
//   'is_spam' : methods whose threshold was exceeded (TRUE, or per-label
//               counts for 'badhost')
//   'method'  : reference to the $method in use
//   'remains' : leftover details (e.g. hosts that matched no badhost entry)
//   'error'   : accumulated errors
// With $method['asap'] set, checking stops as soon as something is flagged.
// NOTE(review): short lines (initializers, braces, some returns) are not
// visible in this listing.
1020 function check_uri_spam($target = '', $method = array())
1022 if (! is_array($method) || empty($method)) {
1023 $method = check_uri_spam_method();
1037 'is_spam' => array(),
1038 'method' => & $method,
1039 'remains' => array(),
// Local aliases into $progress, so the updates below land in the return value
1042 $sum = & $progress['sum'];
1043 $is_spam = & $progress['is_spam'];
1044 $remains = & $progress['remains'];
1045 $error = & $progress['error'];
1046 $asap = isset($method['asap']);
// Array target: check every element recursively and merge the results
1049 if (is_array($target)) {
1050 foreach($target as $str) {
1052 $_progress = check_uri_spam($str, $method);
1053 $_sum = & $_progress['sum'];
1054 $_is_spam = & $_progress['is_spam'];
1055 $_remains = & $_progress['remains'];
1056 $_error = & $_progress['error'];
1057 foreach (array_keys($_sum) as $key) {
1058 $sum[$key] += $_sum[$key];
1060 foreach (array_keys($_is_spam) as $key) {
1061 if (is_array($_is_spam[$key])) {
1062 // Merge keys (badhost)
1063 foreach(array_keys($_is_spam[$key]) as $_key) {
1064 if (! isset($is_spam[$key][$_key])) {
1065 $is_spam[$key][$_key] = $_is_spam[$key][$_key];
1067 $is_spam[$key][$_key] += $_is_spam[$key][$_key];
1071 $is_spam[$key] = TRUE;
// Integer-keyed remains are appended, string-keyed ones overwrite
1074 foreach ($_remains as $key=>$value) {
1075 foreach ($value as $_key=>$_value) {
1076 if (is_int($_key)) {
1077 $remains[$key][] = $_value;
1079 $remains[$key][$_key] = $_value;
1083 if (! empty($_error)) $error += $_error;
1084 if ($asap && $is_spam) break;
1089 // Area: There's HTML anchor tag
1090 if ((! $asap || ! $is_spam) && isset($method['area_anchor'])) {
1091 $key = 'area_anchor';
1092 $_asap = isset($method['asap']) ? array('asap' => TRUE) : array();
1093 $result = area_pickup($target, array($key => TRUE) + $_asap);
1095 $sum[$key] = $result[$key];
1096 if (isset($method[$key]) && $sum[$key] > $method[$key]) {
1097 $is_spam[$key] = TRUE;
1102 // Area: There's 'BBCode' linking tag
1103 if ((! $asap || ! $is_spam) && isset($method['area_bbcode'])) {
1104 $key = 'area_bbcode';
1105 $_asap = isset($method['asap']) ? array('asap' => TRUE) : array();
1106 $result = area_pickup($target, array($key => TRUE) + $_asap);
1108 $sum[$key] = $result[$key];
1109 if (isset($method[$key]) && $sum[$key] > $method[$key]) {
1110 $is_spam[$key] = TRUE;
1116 if ($asap && $is_spam) return $progress;
// NOTE(review): uri_pickup_normalize() takes its first parameter by
// reference; passing a call result raises "Only variables should be passed
// by reference" -- consider assigning to a variable first
1119 $pickups = uri_pickup_normalize(spam_uri_pickup($target, $method));
1120 //$remains['uri_pickup'] = & $pickups;
1123 if (empty($pickups)) return $progress;
1125 // URI: Check quantity
1126 $sum['quantity'] += count($pickups);
1128 if ((! $asap || ! $is_spam) && isset($method['quantity']) &&
1129 $sum['quantity'] > $method['quantity']) {
1130 $is_spam['quantity'] = TRUE;
1133 // URI: used inside HTML anchor tag pair
1134 if ((! $asap || ! $is_spam) && isset($method['uri_anchor'])) {
1135 $key = 'uri_anchor';
1136 foreach($pickups as $pickup) {
1137 if (isset($pickup['area'][$key])) {
1138 $sum[$key] += $pickup['area'][$key];
1139 if(isset($method[$key]) &&
1140 $sum[$key] > $method[$key]) {
1141 $is_spam[$key] = TRUE;
1142 if ($asap && $is_spam) break;
1144 if ($asap && $is_spam) break;
1149 // URI: used inside 'BBCode' pair
1150 if ((! $asap || ! $is_spam) && isset($method['uri_bbcode'])) {
1151 $key = 'uri_bbcode';
1152 foreach($pickups as $pickup) {
1153 if (isset($pickup['area'][$key])) {
1154 $sum[$key] += $pickup['area'][$key];
1155 if(isset($method[$key]) &&
1156 $sum[$key] > $method[$key]) {
1157 $is_spam[$key] = TRUE;
1158 if ($asap && $is_spam) break;
1160 if ($asap && $is_spam) break;
1165 // URI: Uniqueness (and removing non-uniques)
1166 if ((! $asap || ! $is_spam) && isset($method['non_uniquri'])) {
1169 foreach (array_keys($pickups) as $key) {
1170 $uris[$key] = uri_pickup_implode($pickups[$key]);
1172 $count = count($uris);
1173 $uris = array_unique($uris);
1174 $sum['non_uniquri'] += $count - count($uris);
1175 if ($sum['non_uniquri'] > $method['non_uniquri']) {
1176 $is_spam['non_uniquri'] = TRUE;
// Drop the duplicated URIs so that later checks see each URI once
1178 if (! $asap || ! $is_spam) {
1179 foreach (array_diff(array_keys($pickups),
1180 array_keys($uris)) as $remove) {
1181 unset($pickups[$remove]);
1188 if ($asap && $is_spam) return $progress;
1190 // Host: Uniqueness (uniq / non-uniq)
1192 foreach ($pickups as $pickup) $hosts[] = & $pickup['host'];
1193 $hosts = array_unique($hosts);
1194 //$remains['uniqhost'] = & $hosts;
1195 $sum['uniqhost'] += count($hosts);
1196 if ((! $asap || ! $is_spam) && isset($method['non_uniqhost'])) {
1197 $sum['non_uniqhost'] = $sum['quantity'] - $sum['uniqhost'];
1198 if ($sum['non_uniqhost'] > $method['non_uniqhost']) {
1199 $is_spam['non_uniqhost'] = TRUE;
1204 if ($asap && $is_spam) return $progress;
// Host: blocklist check; unmatched hosts come back through $__remains
1207 if ((! $asap || ! $is_spam) && isset($method['badhost'])) {
1208 $__remains = array();
1209 $badhost = is_badhost($hosts, $asap, $__remains);
1212 $remains['badhost'] = array();
1213 foreach ($__remains as $value) {
1214 $remains['badhost'][$value] = TRUE;
1219 if (! empty($badhost)) {
1220 //var_dump($badhost); // BADHOST detail
1221 $sum['badhost'] += array_count_leaves($badhost);
1222 foreach(array_keys($badhost) as $keys) {
1223 $is_spam['badhost'][$keys] =
1224 array_count_leaves($badhost[$keys]);
// Count the leaves of a (possibly nested) array, recursively.
//   $array             : value to inspect
//   $count_empty_array : passed down the recursion; combined with an empty
//                        array in the first condition below
// Every element is visited and the per-element counts are summed in $result.
// NOTE(review): the statement following the first condition, the
// initialization of $result and the final return are not visible here.
1234 function array_count_leaves($array = array(), $count_empty_array = FALSE)
1236 if (! is_array($array) || (empty($array) && $count_empty_array))
1241 foreach ($array as $part) {
1242 $result += array_count_leaves($part, $count_empty_array);
1247 // ---------------------
1250 // TODO: Don't show unused $method!
1251 // Summarize $progress (blocked only)
// Turn a $progress array from check_uri_spam() into a one-line summary.
//   $progress    : result of check_uri_spam()
//   $blockedonly : presumably selects the short form listing only the keys of
//                  $progress['is_spam']; the branching lines are not visible
// In the other visible form, every counter in $progress['sum'] whose key is
// also in $progress['method'] is rendered as 'key(value)'.
// The pieces are joined with ', '.
1252 function summarize_spam_progress($progress = array(), $blockedonly = FALSE)
1255 $tmp = array_keys($progress['is_spam']);
1258 $method = & $progress['method'];
1259 if (isset($progress['sum'])) {
1260 foreach ($progress['sum'] as $key => $value) {
1261 if (isset($method[$key])) {
1262 $tmp[] = $key . '(' . $value . ')';
1268 return implode(', ', $tmp);
1271 // ---------------------
1274 // Common behavior for blocking
1275 // NOTE: Call this function from the various blocking features, to disguise the reason 'why blocked'
// Emit the response for a blocked request, depending on $mode.
//   $mode : '' prints a single newline; one other visible mode prints $data
//           via var_export(), HTML-escaped, inside a <pre> block
//   $data : details shown by that dump mode
// NOTE(review): the switch header, the other case labels and whatever ends
// the request are not visible in this listing.
1276 function spam_exit($mode = '', $data = array())
1279 case '': echo("\n"); break;
1281 echo('<pre>' . "\n");
1282 echo htmlspecialchars(var_export($data, TRUE));
1283 echo('</pre>' . "\n");
1292 // ---------------------
1295 // TODO: Record them
1296 // Simple/fast spam filter ($target: 'a string' or an array())
// Entry point: check $target and block the request when it looks like spam.
//   $action   : label of the action being filtered (passed on to the mail)
//   $page     : page name concerned
//   $target   : string or array of strings to inspect
//   $method   : methods/thresholds for check_uri_spam()
//   $exitmode : mode passed to spam_exit()
// When check_uri_spam() flags anything, the administrators are notified via
// pkwk_spamnotify() and spam_exit() is called with the progress data.
1297 function pkwk_spamfilter($action, $page, $target = array('title' => ''), $method = array(), $exitmode = '')
1299 $progress = check_uri_spam($target, $method);
1301 if (! empty($progress['is_spam'])) {
1302 // Mail to administrator(s)
1303 pkwk_spamnotify($action, $page, $target, $progress, $method);
1306 spam_exit($exitmode, $progress);
1310 // ---------------------
1311 // PukiWiki original
1313 // Mail to administrator(s)
// Mail a block report to the administrators.
//   $action   : label of the blocked action (reported as COMMENT)
//   $page     : page name concerned
//   $target   : the inspected content; sent as the mail body via var_export()
//   $progress : result of check_uri_spam()
//   $method   : methods in use; only 'asap' is read here
// Does nothing unless the global $notify is set. Relies on is_pagename(),
// get_script_uri() and pkwk_mail_notify(), which are defined outside this file.
// NOTE(review): short lines (initializers, braces, part of the
// DETAIL_NEUTRAL_HOST expression) are not visible in this listing.
1314 function pkwk_spamnotify($action, $page, $target = array('title' => ''), $progress = array(), $method = array())
1316 global $notify, $notify_subject;
1318 if (! $notify) return;
1320 $asap = isset($method['asap']);
1322 $summary['ACTION'] = 'Blocked by: ' . summarize_spam_progress($progress, TRUE);
1324 $summary['METRICS'] = summarize_spam_progress($progress);
1326 if (isset($progress['is_spam']['badhost'])) {
1328 foreach($progress['is_spam']['badhost'] as $glob=>$number) {
1329 $badhost[] = $glob . '(' . $number . ')';
1331 $summary['DETAIL_BADHOST'] = implode(', ', $badhost);
// Hosts that matched no blocklist entry are listed too (not in asap mode)
1333 if (! $asap && $progress['remains']['badhost']) {
1334 $count = count($progress['remains']['badhost']);
1335 $summary['DETAIL_NEUTRAL_HOST'] = $count .
// Only letters, digits, '.', '-', ',' and spaces survive in the host list
1338 '/[^, a-z0-9.-]/i', '',
1339 implode(', ', array_keys($progress['remains']['badhost']))
1343 $summary['COMMENT'] = $action;
1344 $summary['PAGE'] = '[blocked] ' . (is_pagename($page) ? $page : '');
1345 $summary['URI'] = get_script_uri() . '?' . rawurlencode($page);
1346 $summary['USER_AGENT'] = TRUE;
1347 $summary['REMOTE_ADDR'] = TRUE;
1348 pkwk_mail_notify($notify_subject, var_export($target, TRUE), $summary, TRUE);