OSDN Git Service

generate_glob_regex() enables globbing
[pukiwiki/pukiwiki_sandbox.git] / spam / spam.php
index 616d887..daa5355 100644 (file)
@@ -1,5 +1,5 @@
 <?php
-// $Id: spam.php,v 1.23 2006/11/21 13:42:40 henoheno Exp $
+// $Id: spam.php,v 1.32 2006/11/25 02:37:21 henoheno Exp $
 // Copyright (C) 2006 PukiWiki Developers Team
 // License: GPL v2 or (at your option) any later version
 
@@ -33,13 +33,15 @@ function uri_pickup($string = '', $normalize = TRUE,
                '([^\s<>"\'\[\]\#]+)?' .                        // 6: File and query string
                '(?:\#([a-z0-9._~%!$&\'()*+,;=:@-]*))?' .       // 7: Fragment
                '#i',
-                $string, $array, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);
+                $string, $array, PREG_SET_ORDER | PREG_OFFSET_CAPTURE
+       );
        //var_dump(recursive_map('htmlspecialchars', $array));
 
        // Shrink $array
        static $parts = array(
-               1 => 'scheme', 2 => 'userinfo',
-               3 => 'host', 4 => 'port', 5 => 'path', 6 => 'file', 7 => 'fragment');
+               1 => 'scheme', 2 => 'userinfo', 3 => 'host', 4 => 'port',
+               5 => 'path', 6 => 'file', 7 => 'fragment'
+       );
        $default = array('');
        foreach(array_keys($array) as $uri) {
                array_rename_keys($array[$uri], $parts, TRUE, $default);
@@ -52,6 +54,12 @@ function uri_pickup($string = '', $normalize = TRUE,
 
                if ($normalize) {
                        $array[$uri]['scheme'] = scheme_normalize($array[$uri]['scheme']);
+                       //if ($array[$uri]['scheme'] === '') {
+                       //      // Ignore
+                       //      unset ($array[$uri]);
+                       //      continue;
+                       //}
+                       
                        $array[$uri]['host']   = strtolower($array[$uri]['host']);
                        $array[$uri]['port']   = port_normalize($array[$uri]['port'], $array[$uri]['scheme'], FALSE);
                        $array[$uri]['path']   = path_normalize($array[$uri]['path']);
@@ -71,7 +79,7 @@ function uri_pickup($string = '', $normalize = TRUE,
                                $array[$uri]['path'],
                                $array[$uri]['file'],
                                $array[$uri]['fragment']
-                               );
+                       );
                }
 
                $array[$uri]['offset'] = $offset;
@@ -81,25 +89,69 @@ function uri_pickup($string = '', $normalize = TRUE,
        return $array;
 }
 
+// Domain exposure callback for preg_replace_callback() (See spam_uri_pickup_preprocess())
+// http://victim.example.org/?foo+site:nasty.example.com+bar
+// => http://nasty.example.com/?refer=victim.example.org (then a space and the kept victim URI)
+// NOTE: The 'refer=' parameter name is not ideal (at this time).
+// Consider using the IP address of the victim; try to avoid that.
+function _preg_replace_callback_domain_exposure($matches = array())
+{
+       $result = '';
+
+       // Keep the victim URI as well (a possible accomplice); it is appended after the flipped URI
+       if (isset($matches[5])) {
+               $result =
+                       $matches[1] . '://' .   // scheme
+                       $matches[2] . '/' .             // victim.example.org
+                       $matches[3];                    // The rest captured before 'site:' (see the caller's pattern)
+       }
+
+       // Flipped URI: the exposed domain becomes the host, the victim host becomes a query value
+       $result = 
+               $matches[1] . '://' .   // scheme
+               $matches[4] .                   // nasty.example.com
+               '/?refer=' . strtolower($matches[2]) .  // victim.example.org (lowercased)
+               ' ' . $result;
+
+       return $result;
+}
+
 // Preprocess: rawurldecode() and adding space(s) to detect/count some URIs _if possible_
 // NOTE: It's maybe danger to var_dump(result). [e.g. 'javascript:']
 // [OK] http://victim.example.org/go?http%3A%2F%2Fnasty.example.org
 // [OK] http://victim.example.org/http://nasty.example.org
 function spam_uri_pickup_preprocess($string = '')
 {
-       if (is_string($string)) {
-               return preg_replace(
-                       array(
-                               '#(?:https?|ftp):/#',
-                               '#\b[a-z][a-z0-9.+-]{1,8}://#i',
-                               '#[a-z][a-z0-9.+-]{1,8}://#i'
-                       ),
-                       ' $0',
-                       rawurldecode($string)
-                       );
-       } else {
-               return '';
-       }
+       if (! is_string($string)) return ''; // Non-string input yields an empty string
+
+       $string = rawurldecode($string); // Decode %XX escapes so that embedded URIs become visible
+
+       // Domain exposure (See _preg_replace_callback_domain_exposure())
+       $string = preg_replace_callback(
+               array(
+                       // Something Google: http://www.google.com/supported_domains
+                       '#(http)://([a-z0-9.]+\.google\.[a-z]{2,3}(?:\.[a-z]{2})?)/' . // 1: scheme, 2: victim host
+                       '([a-z0-9?=&.%_+-]+)' .         // 3: The part before 'site:' (e.g. ?query=foo+)
+                       '\bsite:([a-z0-9.%_-]+)' .      // 4: Exposed domain (site:nasty.example.com)
+                       '()' .  // 5: Always-empty group; the callback keeps the victim URI while it is set
+                       '#i',
+               ),
+               '_preg_replace_callback_domain_exposure',
+               $string
+       );
+
+       // URI exposure (uriuri => uri uri): put a space before each http(s)/ftp scheme lacking one
+       $string = preg_replace(
+               array(
+                       '#(?<! )(?:https?|ftp):/#',
+               //      '#[a-z][a-z0-9.+-]{1,8}://#i',
+               //      '#[a-z][a-z0-9.+-]{1,8}://#i'
+               ),
+               ' $0',
+               $string
+       );
+
+       return $string;
 }
 
 // TODO: Area selection (Check BBCode only, check anchor only, check ...)
@@ -108,7 +160,7 @@ function spam_uri_pickup($string = '')
 {
        $string = spam_uri_pickup_preprocess($string);
 
-       $array  = uri_pickup($string, FALSE, TRUE, FALSE);
+       $array  = uri_pickup($string);
 
        // Area elevation for '(especially external)link' intension
        if (! empty($array)) {
@@ -214,13 +266,19 @@ function area_measure($areas, & $array, $belief = -1, $a_key = 'area', $o_key =
 // ---------------------
 // Part Two
 
-// Scheme normalization: Rename the schemes
+// Scheme normalization: Renaming the schemes
 // snntp://example.org =>  nntps://example.org
-// NOTE: Keep the static list simple. See also port_normalize().
-function scheme_normalize($scheme = '')
+// NOTE: Keep the static lists simple. See also port_normalize().
+function scheme_normalize($scheme = '', $considerd_harmfull = TRUE)
 {
+       // Abbreviations that may have been written without link intention
+       static $abbrevs = array(
+               'ttp'   => 'http',
+               'ttps'  => 'https',
+       );
+
+       // Alias => normalized
        static $aliases = array(
-               // alias => normalized
                'pop'   => 'pop3',
                'news'  => 'nntp',
                'imap4' => 'imap',
@@ -231,6 +289,13 @@ function scheme_normalize($scheme = '')
        );
 
        $scheme = strtolower(trim($scheme));
+       if (isset($abbrevs[$scheme])) {
+               if ($considerd_harmfull) {
+                       $scheme = $abbrevs[$scheme];
+               } else {
+                       $scheme = '';
+               }
+       }
        if (isset($aliases[$scheme])) $scheme = $aliases[$scheme];
 
        return $scheme;
@@ -357,6 +422,89 @@ function uri_array_implode($uri = array())
 // ---------------------
 // Part One : Checker
 
+function generate_glob_regex($string = '', $divider = '/') // Glob pattern => regex fragment (no delimiters, no anchors)
+{
+       static $from = array( // Glob tokens to keep: wildcards, escaped brackets, bare brackets
+                       0 => '*',
+                       1 => '?',
+                       2 => '\[',
+                       3 => '\]',
+                       4 => '[',
+                       5 => ']',
+               );
+       static $mid = array( // Placeholders of letters and '_' only, so preg_quote() normally leaves them alone
+                       0 => '_AST_',
+                       1 => '_QUE_',
+                       2 => '_eRBR_',
+                       3 => '_eLBR_',
+                       4 => '_RBR_',
+                       5 => '_LBR_',
+               );
+       static $to = array( // Regex equivalents, in the same order as $from
+                       0 => '.*',
+                       1 => '.',
+                       2 => '\[',
+                       3 => '\]',
+                       4 => '[',
+                       5 => ']',
+               );
+
+       $string = str_replace($from, $mid, $string); // Hide the glob tokens behind placeholders
+       $string = preg_quote($string, $divider); // Escape the rest. NOTE(review): '-' is escaped too on PHP 5.3+, so [a-z] would lose its range meaning -- confirm
+       $string = str_replace($mid, $to, $string);   // Unhide: placeholders become their regex equivalents
+
+       return $string;
+}
+
+// TODO: Ignore list
+// TODO: require_or_include_once(another file)
+function is_badhost($host = '') // TRUE if $host matches any glob in the built-in blocklist, else FALSE
+{
+       static $blocklist_regex; // Compiled on the first call, reused afterwards
+
+       if (! isset($blocklist_regex)) {
+               $blocklist_regex = array();
+               $blocklist = array(
+                       // Deny all uri
+                       //'*',
+                       
+                       // IP address or ...
+                       //'10.20.*.*',  // 10.20.example.com also matches ('*' becomes '.*')
+                       //'\[1\]',
+                       
+                       // Too much malicious sub-domains
+                       '*.blogspot.com',
+
+                       // 2006-11 dev
+                       'wwwtahoo.com',
+
+                       // 2006-11 dev
+                       '*.infogami.com',
+
+                       // 2006/11/19 17:50 dev
+                       '*.google0site.org',
+                       '*.bigpricesearch.org',
+                       '*.osfind.org',
+                       '*.bablomira.biz',
+               );
+               foreach ($blocklist as $part) {
+                       $blocklist_regex[] = '#^' . generate_glob_regex($part, '#') . '$#'; // Anchored: the whole host must match
+               }
+       }
+
+       $host = strtolower($host); // The blocklist entries are lowercase
+       $result = FALSE;
+       foreach ($blocklist_regex as $regex) {
+               if (preg_match($regex, $host)) {
+                       $result = TRUE;
+                       break; // First hit is enough
+               }
+       }
+
+       return $result;
+}
+
+// TODO return TRUE or FALSE!
 // Simple/fast spam check
 function is_uri_spam($target = '')
 {
@@ -385,6 +533,13 @@ function is_uri_spam($target = '')
                                        }
                                }
                        }
+
+                       foreach ($pickups as $pickup) {
+                               if (is_badhost($pickup['host'])) {
+                                       $is_spam = TRUE;
+                                       break;
+                               }
+                       }
                }
        }
 
@@ -401,6 +556,7 @@ function is_invalid_useragent($ua_name = '' /*, $ua_vars = ''*/ )
 
 // ---------------------
 
+// TODO: Separate check-part(s) and mail part
 // TODO: Multi-metrics (uri, host, user-agent, ...)
 // TODO: Mail to administrator with more measurement data?
 // Simple/fast spam filter ($target: 'a string' or an array())