OSDN Git Service

Roughly reconstructed: spam_pickup() => spam_uri_pickup()
[pukiwiki/pukiwiki_sandbox.git] / spam / spam.php
index 7fdb644..51ccb9c 100644 (file)
@@ -1,27 +1,17 @@
 <?php
-// $Id: spam.php,v 1.9 2006/11/03 07:47:57 henoheno Exp $
+// $Id: spam.php,v 1.15 2006/11/12 10:59:56 henoheno Exp $
 // Copyright (C) 2006 PukiWiki Developers Team
 // License: GPL v2 or (at your option) any later version
 
 // Functions for Concept-work of spam-uri metrics
 
-// Return an array of normalized/parsed URIs in the $string
+// Return an array of URIs in the $string
 // [OK] http://nasty.example.org#nasty_string
 // [OK] http://nasty.example.org/foo/xxx#nasty_string/bar
 // [OK] ftp://dfshodfs:80/dfsdfs
-// [OK] http://victim.example.org/go?http%3A%2F%2Fnasty.example.org
-// [OK] http://victim.example.org/gphttp://nasty.example.org
-function spam_pickup($string = '')
+function uri_pickup($string = '')
 {
-       // Preprocess: urldecode() and adding space(s)
-       $string = preg_replace(
-               array(
-                       '#(?:https?|ftp):/#',
-                       '#\b[a-z][a-z0-9.+-]{1,8}://#i',
-                       '#[a-z][a-z0-9.+-]{1,8}://#i'
-               ), ' $0', urldecode($string));
-
-       // URI pickup: Not available for user@password, IDN, Fragment(=ignored)
+       // Not available for: user@password, IDN, Fragment(=ignored)
        $array = array();
        preg_match_all(
                // Refer RFC3986
@@ -39,6 +29,7 @@ function spam_pickup($string = '')
                '#i',
                 $string, $array, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);
        //var_dump(recursive_map('htmlspecialchars', $array));
+
        // Shrink $array
        $parts = array(1 => 'scheme', 2 => 'host', 3 => 'port',
                4 => 'path', 5 => 'file');
@@ -48,20 +39,51 @@ function spam_pickup($string = '')
                array_rename_keys($array[$uri], $parts, TRUE, $default);
                $offset = $array[$uri]['scheme'][1]; // Scheme's offset
 
-               // Remove offsets (with normalization)
+               // Remove offsets for each part
                foreach(array_keys($array[$uri]) as $part) {
-                       $array[$uri][$part] =
-                                       strtolower($array[$uri][$part][0]);
+                       $array[$uri][$part] = & $array[$uri][$part][0];
                }
-               $array[$uri]['path']   = path_normalize($array[$uri]['path']);
+
                $array[$uri]['offset'] = $offset;
                $array[$uri]['area']   = 0;
        }
 
+       return $array;
+}
+
+// Preprocess: rawurldecode() and adding space(s) to detect/count some URIs if possible
+// NOTE: It may be dangerous to var_dump() these results.
+// [OK] http://victim.example.org/go?http%3A%2F%2Fnasty.example.org
+// [OK] http://victim.example.org/http://nasty.example.org
+function spam_uri_pickup_preprocess($string = '')
+{
+       if (is_string($string)) {
+               return preg_replace(
+                       array(
+                               '#(?:https?|ftp):/#',
+                               '#\b[a-z][a-z0-9.+-]{1,8}://#i',
+                               '#[a-z][a-z0-9.+-]{1,8}://#i'
+                       ),
+                       ' $0',
+                       rawurldecode($string)
+                       );
+       } else {
+               return '';
+       }
+}
+
+// Main function of spam-uri pickup
+function spam_uri_pickup($string = '')
+{
+       $string = spam_uri_pickup_preprocess($string);
+
+       $array  = uri_pickup($string);
+
        // Area elevation for '(especially external)link' intension
        if (! empty($array)) {
                // Anchor tags by preg_match_all()
                // [OK] <a href="http://nasty.example.com">visit http://nasty.example.com/</a>
+               // [OK] <a href=\'http://nasty.example.com/\' >discount foobar</a> 
                // [NG] <a href="http://ng.example.com">visit http://ng.example.com _not_ended_
                // [NG] <a href=  >Good site!</a> <a href= "#" >test</a>
                $areas = array();
@@ -76,14 +98,6 @@ function spam_pickup($string = '')
                }
                area_measure($areas, $array);
 
-               // Various Wiki syntax
-               // [text_or_uri>text_or_uri]
-               // [text_or_uri:text_or_uri]
-               // [text_or_uri|text_or_uri]
-               // [text_or_uri->text_or_uri]
-               // [text_or_uri text_or_uri] // MediaWiki
-               // MediaWiki: [http://nasty.example.com/ visit http://nasty.example.com/]
-
                // phpBB's "BBCode" by preg_match_all()
                // [url]http://nasty.example.com/[/url]
                // [link]http://nasty.example.com/[/link]
@@ -102,40 +116,72 @@ function spam_pickup($string = '')
                }
                area_measure($areas, $array);
 
+               // Various Wiki syntax
+               // [text_or_uri>text_or_uri]
+               // [text_or_uri:text_or_uri]
+               // [text_or_uri|text_or_uri]
+               // [text_or_uri->text_or_uri]
+               // [text_or_uri text_or_uri] // MediaWiki
+               // MediaWiki: [http://nasty.example.com/ visit http://nasty.example.com/]
+
                // Remove 'offset's for area_measure()
-               foreach(array_keys($array) as $key)
-                       unset($array[$key]['offset']);
+               //foreach(array_keys($array) as $key)
+               //      unset($array[$key]['offset']);
        }
 
        return $array;
 }
 
-// $array[0] => $array['name']
-function array_rename_keys(& $array, $rename = array(), $force = FALSE, $default = '')
+// $array['something'] => $array['wanted']
+function array_rename_keys(& $array, $keys = array('from' => 'to'), $force = FALSE, $default = '')
 {
-    if ($force) {
-               foreach($rename as $from => $to) {
-                       if (isset($array[$from])) {
-                               $array[$to] = & $array[$from];
-                               unset($array[$from]);
-                       } else  {
-                               $array[$to] = $default;
-                       }
-               }
-       } else {
-               foreach(array_keys($rename) as $from) {
-                       if (! isset($array[$from])) {
+       if (! is_array($array) || ! is_array($keys))
+               return FALSE;
+
+       // Nondestructive test
+       if (! $force)
+               foreach(array_keys($keys) as $from)
+                       if (! isset($array[$from]))
                                return FALSE;
-                       }
-               }
-               foreach($rename as $from => $to) {
+
+       foreach($keys as $from => $to) {
+               if ($from === $to) continue;
+               if (! $force || isset($array[$from])) {
                        $array[$to] = & $array[$from];
                        unset($array[$from]);
+               } else  {
+                       $array[$to] = $default;
                }
        }
+
        return TRUE;
 }
 
+// If in doubt, it's a little doubtful
+function area_measure($areas, & $array, $belief = -1, $a_key = 'area', $o_key = 'offset')
+{
+       if (! is_array($areas) || ! is_array($array)) return;
+
+       $areas_keys = array_keys($areas);
+       foreach(array_keys($array) as $u_index) {
+               $offset = isset($array[$u_index][$o_key]) ?
+                       intval($array[$u_index][$o_key]) : 0;
+               foreach($areas_keys as $a_index) {
+                       if (isset($array[$u_index][$a_key])) {
+                               $offset_s = intval($areas[$a_index][0]);
+                               $offset_e = intval($areas[$a_index][1]);
+                               // [Area => inside <= Area]
+                               if ($offset_s < $offset && $offset < $offset_e) {
+                                       $array[$u_index][$a_key] += $belief;
+                               }
+                       }
+               }
+       }
+}
+
+
+// ---------------------
+// Part Two
 
 // Path normalization
 // example.org => example.org/
@@ -174,28 +220,25 @@ function path_normalize($path = '', $divider = '/', $addroot = TRUE)
        return $path;
 }
 
-// If in doubt, it's a little doubtful
-function area_measure($areas, &$array, $belief = -1, $a_key = 'area', $o_key = 'offset')
+// Input: '/a/b'
+// Output: array('' => array('a' => array('b' => NULL)))
+function array_tree($string, $delimiter = '/', $reverse = FALSE)
 {
-       if (! is_array($areas) || ! is_array($array)) return;
-
-       $areas_keys = array_keys($areas);
-       foreach(array_keys($array) as $u_index) {
-               $offset = isset($array[$u_index][$o_key]) ?
-                       intval($array[$u_index][$o_key]) : 0;
-               foreach($areas_keys as $a_index) {
-                       if (isset($array[$u_index][$a_key])) {
-                               $offset_s = intval($areas[$a_index][0]);
-                               $offset_e = intval($areas[$a_index][1]);
-                               // [Area => inside <= Area]
-                               if ($offset_s < $offset && $offset < $offset_e) {
-                                       $array[$u_index][$a_key] += $belief;
-                               }
-                       }
-               }
+       // Create a branch
+       $tree = NULL;
+       $tmps = explode($delimiter, $string);
+       if (! $reverse) $tmps = array_reverse($tmps);
+       foreach ($tmps as $tmp) {
+               $tree = array($tmp => $tree);
        }
+       return $tree;
 }
 
+
+// ---------------------
+// Part One : Checker
+
+// Simple/fast spam check
 function is_uri_spam($target = '')
 {
        $is_spam = FALSE;
@@ -203,16 +246,17 @@ function is_uri_spam($target = '')
 
        if (is_array($target)) {
                foreach($target as $str) {
+                       // Recurse
                        list($is_spam, $_urinum) = is_uri_spam($str);
                        $urinum += $_urinum;
                        if ($is_spam) break;
                }
        } else {
-               $pickups = spam_pickup($target);
+               $pickups = spam_uri_pickup($target);
                $urinum += count($pickups);
                if (! empty($pickups)) {
-                       // Some users want to post one or two URL, but ...
-                       if ($urinum > 2) {
+                       // Some users want to post some URLs, but ...
+                       if ($urinum > 8) {
                                $is_spam = TRUE;        // Too many!
                        } else {
                                foreach($pickups as $pickup) {
@@ -228,33 +272,39 @@ function is_uri_spam($target = '')
        return array($is_spam, $urinum);
 }
 
+// ---------------------
+
+// Check User-Agent
+function is_invalid_useragent($ua_name = '' /*, $ua_vars = ''*/ )
+{
+       return $ua_name === '';
+}
+
+// ---------------------
+
 // Mail to administrator with more measurement data?
 // Simple/fast spam filter (for one text field)
-function pkwk_spamfilter($action, $page, $target = array())
+function pkwk_spamfilter($action, $page, $target = array('title' => ''))
 {
        $is_spam = FALSE;
-       list($is_spam) = is_uri_spam($target);
+
+       //$is_spam =  is_invalid_useragent('NOTYET');
+       if ($is_spam) {
+               $action .= ' (Invalid User-Agent)';
+       } else {
+               list($is_spam) = is_uri_spam($target);
+       }
 
        if ($is_spam) {
+               // Mail to administrator(s)
                global $notify, $notify_subject;
                if ($notify) {
                        $footer['ACTION'] = $action;
-                       $footer['PAGE']   = '[BLOCKED]: ' . $page;
+                       $footer['PAGE']   = '[blocked] ' . $page;
                        $footer['URI']    = get_script_uri() . '?' . rawurlencode($page);
                        $footer['USER_AGENT']  = TRUE;
                        $footer['REMOTE_ADDR'] = TRUE;
-
-                       // Fields
-                       if (is_array($target)) {
-                               $tmp = array();
-                               foreach($target as $key => $value){
-                                       $tmp[] = $key . ' = ' . $value . "\n";
-                               }
-                               $target = implode("\n", $tmp);
-                               unset($tmp);
-                       }
-
-                       pkwk_mail_notify($notify_subject, $target, $footer);
+                       pkwk_mail_notify($notify_subject,  var_export($target, TRUE), $footer);
                        unset($footer);
                }
        }
@@ -262,6 +312,10 @@ function pkwk_spamfilter($action, $page, $target = array())
        if ($is_spam) spam_exit();
 }
 
+// ---------------------
+
+// Common behavior for blocking
+// NOTE: Call this function from the various blocking features, to disguise the reason 'why blocked'
 function spam_exit()
 {
        die("\n");