OSDN Git Service

Roughly reconstruncted: spam_pickup() => spam_uri_pickup()
authorhenoheno <henoheno>
Sun, 12 Nov 2006 10:59:56 +0000 (19:59 +0900)
committerhenoheno <henoheno>
Sun, 12 Nov 2006 10:59:56 +0000 (19:59 +0900)
spam.php
spam/spam.php
spam/spam_pickup.php
spam_pickup.php

index 79e0552..51ccb9c 100644 (file)
--- a/spam.php
+++ b/spam.php
@@ -1,27 +1,17 @@
 <?php
-// $Id: spam.php,v 1.14 2006/11/12 04:27:11 henoheno Exp $
+// $Id: spam.php,v 1.15 2006/11/12 10:59:56 henoheno Exp $
 // Copyright (C) 2006 PukiWiki Developers Team
 // License: GPL v2 or (at your option) any later version
 
 // Functions for Concept-work of spam-uri metrics
 
-// Return an array of normalized/parsed URIs in the $string
+// Return an array of URIs in the $string
 // [OK] http://nasty.example.org#nasty_string
 // [OK] http://nasty.example.org/foo/xxx#nasty_string/bar
 // [OK] ftp://dfshodfs:80/dfsdfs
-// [OK] http://victim.example.org/go?http%3A%2F%2Fnasty.example.org
-// [OK] http://victim.example.org/gphttp://nasty.example.org
-function spam_pickup($string = '')
+function uri_pickup($string = '')
 {
-       // Preprocess: urldecode() and adding space(s)
-       $string = preg_replace(
-               array(
-                       '#(?:https?|ftp):/#',
-                       '#\b[a-z][a-z0-9.+-]{1,8}://#i',
-                       '#[a-z][a-z0-9.+-]{1,8}://#i'
-               ), ' $0', rawurldecode($string));
-
-       // URI pickup: Not available for user@password, IDN, Fragment(=ignored)
+       // Not available for: user@password, IDN, Fragment(=ignored)
        $array = array();
        preg_match_all(
                // Refer RFC3986
@@ -39,6 +29,7 @@ function spam_pickup($string = '')
                '#i',
                 $string, $array, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);
        //var_dump(recursive_map('htmlspecialchars', $array));
+
        // Shrink $array
        $parts = array(1 => 'scheme', 2 => 'host', 3 => 'port',
                4 => 'path', 5 => 'file');
@@ -48,16 +39,46 @@ function spam_pickup($string = '')
                array_rename_keys($array[$uri], $parts, TRUE, $default);
                $offset = $array[$uri]['scheme'][1]; // Scheme's offset
 
-               // Remove offsets (with normalization)
+               // Remove offsets for each part
                foreach(array_keys($array[$uri]) as $part) {
-                       $array[$uri][$part] =
-                                       strtolower($array[$uri][$part][0]);
+                       $array[$uri][$part] = & $array[$uri][$part][0];
                }
-               //$array[$uri]['path']   = path_normalize($array[$uri]['path']);
+
                $array[$uri]['offset'] = $offset;
                $array[$uri]['area']   = 0;
        }
 
+       return $array;
+}
+
+// Preprocess: rawurldecode() and adding space(s) to detect/count some URIs if possible
+// NOTE: It's maybe danger to var_dump() these results.
+// [OK] http://victim.example.org/go?http%3A%2F%2Fnasty.example.org
+// [OK] http://victim.example.org/http://nasty.example.org
+function spam_uri_pickup_preprocess($string = '')
+{
+       if (is_string($string)) {
+               return preg_replace(
+                       array(
+                               '#(?:https?|ftp):/#',
+                               '#\b[a-z][a-z0-9.+-]{1,8}://#i',
+                               '#[a-z][a-z0-9.+-]{1,8}://#i'
+                       ),
+                       ' $0',
+                       rawurldecode($string)
+                       );
+       } else {
+               return '';
+       }
+}
+
+// Main function of spam-uri pickup
+function spam_uri_pickup($string = '')
+{
+       $string = spam_uri_pickup_preprocess($string);
+
+       $array  = uri_pickup($string);
+
        // Area elevation for '(especially external)link' intension
        if (! empty($array)) {
                // Anchor tags by preg_match_all()
@@ -77,14 +98,6 @@ function spam_pickup($string = '')
                }
                area_measure($areas, $array);
 
-               // Various Wiki syntax
-               // [text_or_uri>text_or_uri]
-               // [text_or_uri:text_or_uri]
-               // [text_or_uri|text_or_uri]
-               // [text_or_uri->text_or_uri]
-               // [text_or_uri text_or_uri] // MediaWiki
-               // MediaWiki: [http://nasty.example.com/ visit http://nasty.example.com/]
-
                // phpBB's "BBCode" by preg_match_all()
                // [url]http://nasty.example.com/[/url]
                // [link]http://nasty.example.com/[/link]
@@ -103,9 +116,17 @@ function spam_pickup($string = '')
                }
                area_measure($areas, $array);
 
+               // Various Wiki syntax
+               // [text_or_uri>text_or_uri]
+               // [text_or_uri:text_or_uri]
+               // [text_or_uri|text_or_uri]
+               // [text_or_uri->text_or_uri]
+               // [text_or_uri text_or_uri] // MediaWiki
+               // MediaWiki: [http://nasty.example.com/ visit http://nasty.example.com/]
+
                // Remove 'offset's for area_measure()
-               foreach(array_keys($array) as $key)
-                       unset($array[$key]['offset']);
+               //foreach(array_keys($array) as $key)
+               //      unset($array[$key]['offset']);
        }
 
        return $array;
@@ -231,7 +252,7 @@ function is_uri_spam($target = '')
                        if ($is_spam) break;
                }
        } else {
-               $pickups = spam_pickup($target);
+               $pickups = spam_uri_pickup($target);
                $urinum += count($pickups);
                if (! empty($pickups)) {
                        // Some users want to post some URLs, but ...
index 79e0552..51ccb9c 100644 (file)
@@ -1,27 +1,17 @@
 <?php
-// $Id: spam.php,v 1.14 2006/11/12 04:27:11 henoheno Exp $
+// $Id: spam.php,v 1.15 2006/11/12 10:59:56 henoheno Exp $
 // Copyright (C) 2006 PukiWiki Developers Team
 // License: GPL v2 or (at your option) any later version
 
 // Functions for Concept-work of spam-uri metrics
 
-// Return an array of normalized/parsed URIs in the $string
+// Return an array of URIs in the $string
 // [OK] http://nasty.example.org#nasty_string
 // [OK] http://nasty.example.org/foo/xxx#nasty_string/bar
 // [OK] ftp://dfshodfs:80/dfsdfs
-// [OK] http://victim.example.org/go?http%3A%2F%2Fnasty.example.org
-// [OK] http://victim.example.org/gphttp://nasty.example.org
-function spam_pickup($string = '')
+function uri_pickup($string = '')
 {
-       // Preprocess: urldecode() and adding space(s)
-       $string = preg_replace(
-               array(
-                       '#(?:https?|ftp):/#',
-                       '#\b[a-z][a-z0-9.+-]{1,8}://#i',
-                       '#[a-z][a-z0-9.+-]{1,8}://#i'
-               ), ' $0', rawurldecode($string));
-
-       // URI pickup: Not available for user@password, IDN, Fragment(=ignored)
+       // Not available for: user@password, IDN, Fragment(=ignored)
        $array = array();
        preg_match_all(
                // Refer RFC3986
@@ -39,6 +29,7 @@ function spam_pickup($string = '')
                '#i',
                 $string, $array, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);
        //var_dump(recursive_map('htmlspecialchars', $array));
+
        // Shrink $array
        $parts = array(1 => 'scheme', 2 => 'host', 3 => 'port',
                4 => 'path', 5 => 'file');
@@ -48,16 +39,46 @@ function spam_pickup($string = '')
                array_rename_keys($array[$uri], $parts, TRUE, $default);
                $offset = $array[$uri]['scheme'][1]; // Scheme's offset
 
-               // Remove offsets (with normalization)
+               // Remove offsets for each part
                foreach(array_keys($array[$uri]) as $part) {
-                       $array[$uri][$part] =
-                                       strtolower($array[$uri][$part][0]);
+                       $array[$uri][$part] = & $array[$uri][$part][0];
                }
-               //$array[$uri]['path']   = path_normalize($array[$uri]['path']);
+
                $array[$uri]['offset'] = $offset;
                $array[$uri]['area']   = 0;
        }
 
+       return $array;
+}
+
+// Preprocess: rawurldecode() and adding space(s) to detect/count some URIs if possible
+// NOTE: It's maybe danger to var_dump() these results.
+// [OK] http://victim.example.org/go?http%3A%2F%2Fnasty.example.org
+// [OK] http://victim.example.org/http://nasty.example.org
+function spam_uri_pickup_preprocess($string = '')
+{
+       if (is_string($string)) {
+               return preg_replace(
+                       array(
+                               '#(?:https?|ftp):/#',
+                               '#\b[a-z][a-z0-9.+-]{1,8}://#i',
+                               '#[a-z][a-z0-9.+-]{1,8}://#i'
+                       ),
+                       ' $0',
+                       rawurldecode($string)
+                       );
+       } else {
+               return '';
+       }
+}
+
+// Main function of spam-uri pickup
+function spam_uri_pickup($string = '')
+{
+       $string = spam_uri_pickup_preprocess($string);
+
+       $array  = uri_pickup($string);
+
        // Area elevation for '(especially external)link' intension
        if (! empty($array)) {
                // Anchor tags by preg_match_all()
@@ -77,14 +98,6 @@ function spam_pickup($string = '')
                }
                area_measure($areas, $array);
 
-               // Various Wiki syntax
-               // [text_or_uri>text_or_uri]
-               // [text_or_uri:text_or_uri]
-               // [text_or_uri|text_or_uri]
-               // [text_or_uri->text_or_uri]
-               // [text_or_uri text_or_uri] // MediaWiki
-               // MediaWiki: [http://nasty.example.com/ visit http://nasty.example.com/]
-
                // phpBB's "BBCode" by preg_match_all()
                // [url]http://nasty.example.com/[/url]
                // [link]http://nasty.example.com/[/link]
@@ -103,9 +116,17 @@ function spam_pickup($string = '')
                }
                area_measure($areas, $array);
 
+               // Various Wiki syntax
+               // [text_or_uri>text_or_uri]
+               // [text_or_uri:text_or_uri]
+               // [text_or_uri|text_or_uri]
+               // [text_or_uri->text_or_uri]
+               // [text_or_uri text_or_uri] // MediaWiki
+               // MediaWiki: [http://nasty.example.com/ visit http://nasty.example.com/]
+
                // Remove 'offset's for area_measure()
-               foreach(array_keys($array) as $key)
-                       unset($array[$key]['offset']);
+               //foreach(array_keys($array) as $key)
+               //      unset($array[$key]['offset']);
        }
 
        return $array;
@@ -231,7 +252,7 @@ function is_uri_spam($target = '')
                        if ($is_spam) break;
                }
        } else {
-               $pickups = spam_pickup($target);
+               $pickups = spam_uri_pickup($target);
                $urinum += count($pickups);
                if (! empty($pickups)) {
                        // Some users want to post some URLs, but ...
index b96d3e5..04a6d13 100644 (file)
@@ -1,5 +1,5 @@
 <?php
-// $Id: spam_pickup.php,v 1.10 2006/11/03 07:47:57 henoheno Exp $
+// $Id: spam_pickup.php,v 1.11 2006/11/12 10:59:56 henoheno Exp $
 // Concept-work of spam-uri metrics
 // Copyright (C) 2006 PukiWiki Developers Team
 // License: GPL v2 or (at your option) any later version
@@ -57,24 +57,35 @@ $msg = isset($_POST['msg']) ? $_POST['msg'] : '';
 show_form($msg);
 echo '<pre>';
 
-$results = spam_pickup($msg);
+$pickup = TRUE;
+if ($pickup) {
+       $results = spam_uri_pickup($msg);
 
-// Measure
-$count = count($results);
-$area = 0;
-foreach($results as $result)
-       if (isset($result['area']))
-               $area += $result['area'];
-$average = $count ? ($area / $count) : 'NULL';
+       // Measure
+       $count = count($results);
+       $area = 0;
+       foreach($results as $result)
+               if (isset($result['area']))
+                       $area += $result['area'];
+       $average = $count ? ($area / $count) : 'NULL';
+
+       echo "TOTAL = $count URIs, AREA_TOTAL = $area, AREA_AVERAGE = " . $average . "</br >" . "</br >";
+
+       $a = array();
+       var_dump(array_tree('/a/b/c/d/e', '/', false));
+       var_dump($a);
+       echo "<br/>";
+}
 
-echo "TOTAL = $count URIs, AREA_TOTAL = $area, AREA_AVERAGE = " . $average . "</br >" . "</br >";
 var_dump('is_uri_spam($msg)', is_uri_spam($msg));
 
 //$notify = TRUE;
 //var_dump('pkwk_spamfilter($msg)', pkwk_spamfilter('A', 'PAGE', array('msg' => $msg)));
 //echo "\n";
 
-var_dump('$results', $results);
+if ($pickup) {
+       var_dump('$results', $results);
+}
 echo '</pre>';
 
 ?>
index b96d3e5..04a6d13 100644 (file)
@@ -1,5 +1,5 @@
 <?php
-// $Id: spam_pickup.php,v 1.10 2006/11/03 07:47:57 henoheno Exp $
+// $Id: spam_pickup.php,v 1.11 2006/11/12 10:59:56 henoheno Exp $
 // Concept-work of spam-uri metrics
 // Copyright (C) 2006 PukiWiki Developers Team
 // License: GPL v2 or (at your option) any later version
@@ -57,24 +57,35 @@ $msg = isset($_POST['msg']) ? $_POST['msg'] : '';
 show_form($msg);
 echo '<pre>';
 
-$results = spam_pickup($msg);
+$pickup = TRUE;
+if ($pickup) {
+       $results = spam_uri_pickup($msg);
 
-// Measure
-$count = count($results);
-$area = 0;
-foreach($results as $result)
-       if (isset($result['area']))
-               $area += $result['area'];
-$average = $count ? ($area / $count) : 'NULL';
+       // Measure
+       $count = count($results);
+       $area = 0;
+       foreach($results as $result)
+               if (isset($result['area']))
+                       $area += $result['area'];
+       $average = $count ? ($area / $count) : 'NULL';
+
+       echo "TOTAL = $count URIs, AREA_TOTAL = $area, AREA_AVERAGE = " . $average . "</br >" . "</br >";
+
+       $a = array();
+       var_dump(array_tree('/a/b/c/d/e', '/', false));
+       var_dump($a);
+       echo "<br/>";
+}
 
-echo "TOTAL = $count URIs, AREA_TOTAL = $area, AREA_AVERAGE = " . $average . "</br >" . "</br >";
 var_dump('is_uri_spam($msg)', is_uri_spam($msg));
 
 //$notify = TRUE;
 //var_dump('pkwk_spamfilter($msg)', pkwk_spamfilter('A', 'PAGE', array('msg' => $msg)));
 //echo "\n";
 
-var_dump('$results', $results);
+if ($pickup) {
+       var_dump('$results', $results);
+}
 echo '</pre>';
 
 ?>