OSDN Git Service

Stop $asap, or do them all?
[pukiwiki/pukiwiki_sandbox.git] / spam.php
index bf220d0..b3e7128 100644 (file)
--- a/spam.php
+++ b/spam.php
@@ -1,5 +1,5 @@
 <?php
-// $Id: spam.php,v 1.28 2006/11/23 02:05:03 henoheno Exp $
+// $Id: spam.php,v 1.36 2006/11/25 13:55:34 henoheno Exp $
 // Copyright (C) 2006 PukiWiki Developers Team
 // License: GPL v2 or (at your option) any later version
 
@@ -54,10 +54,15 @@ function uri_pickup($string = '', $normalize = TRUE,
 
                if ($normalize) {
                        $array[$uri]['scheme'] = scheme_normalize($array[$uri]['scheme']);
+                       //if ($array[$uri]['scheme'] === '') {
+                       //      // Ignore
+                       //      unset ($array[$uri]);
+                       //      continue;
+                       //}
+                       
                        $array[$uri]['host']   = strtolower($array[$uri]['host']);
                        $array[$uri]['port']   = port_normalize($array[$uri]['port'], $array[$uri]['scheme'], FALSE);
                        $array[$uri]['path']   = path_normalize($array[$uri]['path']);
-
                        //$array[$uri]['uri']    = uri_array_implode($array[$uri]);
                        if ($preserve_rawuri) $array[$uri]['rawuri'] = & $array[$uri][0];
                } else {
@@ -134,12 +139,12 @@ function spam_uri_pickup_preprocess($string = '')
                $string
        );
 
-       // Scheme exposure (schemescheme => scheme scheme)
+       // URI exposure (uriuri => uri uri)
        $string = preg_replace(
                array(
-                       '#(?:https?|ftp):/#',
-                       '#\b[a-z][a-z0-9.+-]{1,8}://#i',
-                       '#[a-z][a-z0-9.+-]{1,8}://#i'
+                       '#(?<! )(?:https?|ftp):/#',
+               //      '#[a-z][a-z0-9.+-]{1,8}://#i',
+               //      '#[a-z][a-z0-9.+-]{1,8}://#i'
                ),
                ' $0',
                $string
@@ -260,13 +265,19 @@ function area_measure($areas, & $array, $belief = -1, $a_key = 'area', $o_key =
 // ---------------------
 // Part Two
 
-// Scheme normalization: Rename the schemes
+// Scheme normalization: Renaming the schemes
 // snntp://example.org =>  nntps://example.org
-// NOTE: Keep the static list simple. See also port_normalize().
-function scheme_normalize($scheme = '')
+// NOTE: Keep the static lists simple. See also port_normalize().
+function scheme_normalize($scheme = '', $considerd_harmfull = TRUE)
 {
+       // Abbreviations that can be considered as having no link intention
+       static $abbrevs = array(
+               'ttp'   => 'http',
+               'ttps'  => 'https',
+       );
+
+       // Alias => normalized
        static $aliases = array(
-               // alias => normalized
                'pop'   => 'pop3',
                'news'  => 'nntp',
                'imap4' => 'imap',
@@ -277,6 +288,13 @@ function scheme_normalize($scheme = '')
        );
 
        $scheme = strtolower(trim($scheme));
+       if (isset($abbrevs[$scheme])) {
+               if ($considerd_harmfull) {
+                       $scheme = $abbrevs[$scheme];
+               } else {
+                       $scheme = '';
+               }
+       }
        if (isset($aliases[$scheme])) $scheme = $aliases[$scheme];
 
        return $scheme;
@@ -403,88 +421,188 @@ function uri_array_implode($uri = array())
 // ---------------------
 // Part One : Checker
 
+// Convert a glob pattern into a regex fragment (no delimiters, no anchors)
+//   '*' => '.*'   '?' => '.'   '[' and ']' => character-class brackets
+//   '\[' and '\]' => kept escaped, so they match literal brackets
+// All other text is escaped by preg_quote(), together with $divider
+// $string  : Glob pattern, like '*.example.com'
+// $divider : Delimiter that will enclose the finished regex
+function generate_glob_regex($string = '', $divider = '/')
+{
+       // Glob token => regex fragment
+       static $tokens = array(
+               '*'  => '.*',
+               '?'  => '.',
+               '\[' => '\[',
+               '\]' => '\]',
+               '['  => '[',
+               ']'  => ']',
+       );
+
+       // Split at the tokens and keep them (they land on the odd indexes).
+       // Quoting piece by piece needs no placeholder text, so a $string that
+       // merely contains something like '_AST_' is left as it is.
+       $pieces = preg_split('/(\\\\[\[\]]|[*?\[\]])/', $string, -1,
+               PREG_SPLIT_DELIM_CAPTURE);
+       $regex = '';
+       foreach ($pieces as $index => $piece) {
+               if ($index % 2) {
+                       $regex .= $tokens[$piece];              // Glob token
+               } else {
+                       $regex .= preg_quote($piece, $divider); // Literal text
+               }
+       }
+       return $regex;
+}
 
-// TODO: globbing for IP address or something
 // TODO: Ignore list
 // TODO: require_or_include_once(another file)
-function is_badhost($host = '')
+// Count the hosts that match the block list
+// $hosts : A host name, or an array of host names (non-strings count as '')
+// $asap  : TRUE = return at the first match, FALSE = examine every host
+// Returns the number of matching hosts (0 = none; each host counts once)
+function is_badhost($hosts = '', $asap = TRUE)
 {
        static $blocklist_regex;
+       // Built on the first call only, then reused by later calls
 
        if (! isset($blocklist_regex)) {
+               $blocklist_regex = array();
                $blocklist = array(
-                       // Well-known
-                       '.blogspot.com',
+                       // Deny all URIs
+                       //'*',
+
+                       // IP address or ...
+                       //'10.20.*.*',  // 10.20.example.com also matches
+                       //'\[1\]',
+                       
+                       // Too many malicious sub-domains
+                       '*.blogspot.com',
 
                        // 2006-11 dev
                        'wwwtahoo.com',
 
+                       // 2006-11 dev
+                       '*.infogami.com',
+
                        // 2006/11/19 17:50 dev
-                       '.google0site.org',
-                       '.bigpricesearch.org',
-                       '.osfind.org',
-                       '.bablomira.biz',
+                       '*.google0site.org',
+                       '*.bigpricesearch.org',
+                       '*.osfind.org',
+                       '*.bablomira.biz',
                );
-
-               $blocklist_regex = array();
                foreach ($blocklist as $part) {
-                       if ($part[0] === '.') {
-                               $blocklist_regex[] = '#' . preg_quote($part, '#') . '$#';
-                       } else {
-                               $blocklist_regex[] = '#^(.*\.)?' . preg_quote($part, '#') . '$#';
-                       }
+                       // '^...$' : the glob must cover the whole host, 'i' : ignore case
+                       $blocklist_regex[] = '#^' . generate_glob_regex($part, '#') . '$#i';
                }
        }
 
-       foreach ($blocklist_regex as $regex) {
-               if (preg_match($regex, $host)) {
-                       return TRUE;
+       $result = 0;
+       if (! is_array($hosts)) $hosts = array($hosts);
+       foreach($hosts as $host) {
+               if (! is_string($host)) $host = '';
+               foreach ($blocklist_regex as $regex) {
+                       if (preg_match($regex, $host)) {
+                               ++$result;
+                               if ($asap) {
+                                       return $result;
+                               } else {
+                                       break; // Check next host
+                               }
+                       }
                }
        }
 
-       return FALSE;
+       return $result;
 }
 
 // TODO return TRUE or FALSE!
 // Simple/fast spam check
-function is_uri_spam($target = '')
+function check_uri_spam($target = '', $method = array(), $asap = TRUE)
 {
-       $is_spam = FALSE;
-       $urinum = 0;
+       $is_spam  = FALSE;
+       $progress = array(
+               'quantity' => 0,
+               'area'     => 0,
+               'non_uniq' => 0,
+               'badhost'  => 0,
+               );
+
+       if (! is_array($method) || empty($method)) {
+               // Default
+               $method = array(
+                       'quantity' => 8,                // Allow N URIs
+                       'area'     => TRUE,
+                       'non_uniq' => 3,                // Allow N times dupe
+                       'badhost'  => TRUE,
+               );
+       }
 
        if (is_array($target)) {
                foreach($target as $str) {
                        // Recurse
-                       list($is_spam, $_urinum) = is_uri_spam($str);
-                       $urinum += $_urinum;
-                       if ($is_spam) break;
+                       list($is_spam, $_progress) = check_uri_spam($str, $method);
+                       $progress['quantity'] += $_progress['quantity'];
+                       $progress['non_uniq'] += $_progress['non_uniq'];
+                       if ($asap || $is_spam) break;
                }
        } else {
                $pickups = spam_uri_pickup($target);
-               $urinum += count($pickups);
+               $progress['quantity'] += count($pickups);
+
                if (! empty($pickups)) {
-                       // Some users want to post some URLs, but ...
-                       if ($urinum > 8) {
-                               $is_spam = TRUE;        // Too many!
-                       } else {
+
+                       // URI quantity
+                       if ((! $is_spam || ! $asap) && isset($method['quantity']) &&
+                               $progress['quantity'] > $method['quantity']) {
+                               $is_spam = TRUE;
+                       }
+                       //var_dump($method['quantity'], $is_spam);
+
+                       // Using invalid area
+                       if ((! $is_spam || ! $asap) && isset($method['area'])) {
                                foreach($pickups as $pickup) {
                                        if ($pickup['area'] < 0) {
+                                               ++$progress['area'];
                                                $is_spam = TRUE;
-                                               break;
+                                               if ($asap) break;
                                        }
                                }
                        }
+                       //var_dump($method['area'], $is_spam);
 
-                       foreach ($pickups as $pickup) {
-                               if (is_badhost($pickup['host'])) {
+                       // URI uniqueness (and removing non-uniques)
+                       if ((! $is_spam || ! $asap) && isset($method['non_uniq'])) {
+                               $uris = array();
+                               foreach ($pickups as $key => $pickup) {
+                                       $uris[$key] = uri_array_implode($pickup);
+                               }
+                               $count = count($uris);
+                               $uris = array_unique($uris);
+                               $progress['non_uniq'] += $count - count($uris);
+                               if ($progress['non_uniq'] > $method['non_uniq']) {
                                        $is_spam = TRUE;
-                                       break;
                                }
+                               if (! $asap || ! $is_spam) {
+                                       foreach (array_diff(array_keys($pickups),
+                                               array_keys($uris)) as $remove) {
+                                               unset($pickups[$remove]);
+                                       }
+                               }
+                               unset($uris);
+                               //var_dump($uris, $pickups);
+                       }
+                       //var_dump($method['non_uniq'], $is_spam);
+
+                       // Bad host
+                       if ((! $is_spam || ! $asap) && isset($method['badhost'])) {
+                               $hosts = array();
+                               foreach ($pickups as $pickup) {
+                                       $hosts[] = & $pickup['host'];
+                               }
+                               $count = is_badhost(array_unique($hosts), $asap);
+                               $progress['badhost'] += $count;
+                               if ($count !== 0) $is_spam = TRUE;
                        }
+                       //var_dump($method['badhost'], $is_spam);
                }
        }
 
-       return array($is_spam, $urinum);
+       return array($is_spam, $progress);
 }
 
 // ---------------------
@@ -498,10 +616,9 @@ function is_invalid_useragent($ua_name = '' /*, $ua_vars = ''*/ )
 // ---------------------
 
 // TODO: Separate check-part(s) and mail part
-// TODO: Multi-metrics (uri, host, user-agent, ...)
 // TODO: Mail to administrator with more measurement data?
 // Simple/fast spam filter ($target: 'a string' or an array())
-function pkwk_spamfilter($action, $page, $target = array('title' => ''))
+function pkwk_spamfilter($action, $page, $target = array('title' => ''), $method = array())
 {
        $is_spam = FALSE;
 
@@ -509,7 +626,7 @@ function pkwk_spamfilter($action, $page, $target = array('title' => ''))
        if ($is_spam) {
                $action .= ' (Invalid User-Agent)';
        } else {
-               list($is_spam) = is_uri_spam($target);
+               list($is_spam) = check_uri_spam($target, $method);
        }
 
        if ($is_spam) {