OSDN Git Service

spam_uri_pickup_preprocess(): do not rawurldecode() the whole string; decode only '%3A', '%2F' and '%5C' so that '%20' is not turned into a space
author henoheno <henoheno>
Mon, 20 Aug 2007 14:37:23 +0000 (23:37 +0900)
committer henoheno <henoheno>
Mon, 20 Aug 2007 14:37:23 +0000 (23:37 +0900)
spam/SpamPickupTest.php
spam/spam_pickup.php

index f61f725..77890c4 100644 (file)
@@ -1,5 +1,5 @@
 <?php
-// $Id: SpamPickupTest.php,v 1.2 2007/07/02 15:27:20 henoheno Exp $
+// $Id: SpamPickupTest.php,v 1.3 2007/08/20 14:37:23 henoheno Exp $
 // Copyright (C) 2007 heno
 //
 // Design test case for spam.php (called from runner.php)
@@ -307,6 +307,11 @@ EOF;
                $results = uri_pickup_normalize(uri_pickup($test_string));
                $this->assertEquals('backslash.org',  $results[0]['host']);
 
+               // Divider: percent-encoded
+               //$test_string = ' http%3A%2F%5Cpercent-encoded.org%5Cfobar.html ';
+               //$results = uri_pickup_normalize(uri_pickup($test_string));
+               //$this->assertEquals('percent-encoded.org',  $results[0]['host']);
+
                // Host: Underscore
                $test_string = ' http://under_score.org/fobar.html ';
                $results = uri_pickup_normalize(uri_pickup($test_string));
@@ -348,6 +353,14 @@ EOF;
                $this->assertEquals('foo.html',       $results[0]['file']);
        }
 
+       function testFunc_spam_uri_pickup()
+       {
+               // Divider: percent-encoded
+               $test_string = ' http://victim.example.org/http%3A%2F%5Cnasty.example.org ';
+               $results = spam_uri_pickup($test_string);
+               $this->assertEquals('victim.example.org', $results[0]['host']);
+               $this->assertEquals('nasty.example.org',  $results[1]['host']);
+       }
 }
 
 ?>
\ No newline at end of file
index 467611a..fd68576 100644 (file)
@@ -1,5 +1,5 @@
 <?php
-// $Id: spam_pickup.php,v 1.54 2007/08/19 03:12:35 henoheno Exp $
+// $Id: spam_pickup.php,v 1.55 2007/08/20 14:37:23 henoheno Exp $
 // Copyright (C) 2006-2007 PukiWiki Developers Team
 // License: GPL v2 or (at your option) any later version
 //
@@ -23,7 +23,7 @@ function uri_pickup($string = '')
        preg_match_all(
                // scheme://userinfo@host:port/path/or/pathinfo/maybefile.and?query=string#fragment
                // Refer RFC3986 (Regex below is not strict)
-               '#(\b[a-z][a-z0-9.+-]{1,8}):[/\\\]+' .  // 1: Scheme
+               '#(\b[a-z][a-z0-9.+-]{1,8}):[/\\\]+' .          // 1: Scheme
                '(?:' .
                        '([^\s<>"\'\[\]/\#?@]*)' .              // 2: Userinfo (Username)
                '@)?' .
@@ -31,7 +31,7 @@ function uri_pickup($string = '')
                        // 3: Host
                        '\[[0-9a-f:.]+\]' . '|' .                               // IPv6([colon-hex and dot]): RFC2732
                        '(?:[0-9]{1,3}\.){3}[0-9]{1,3}' . '|' . // IPv4(dot-decimal): 001.22.3.44
-                       '[a-z0-9_-][a-z0-9_.-]+[a-z0-9_-]' .            // hostname(FQDN) : foo.example.org
+                       '[a-z0-9_-][a-z0-9_.-]+[a-z0-9_-]' .    // hostname(FQDN) : foo.example.org
                ')' .
                '(?::([0-9]*))?' .                                      // 4: Port
                '((?:/+[^\s<>"\'\[\]/\#]+)*/+)?' .      // 5: Directory path or path-info
@@ -656,7 +656,7 @@ function _preg_replace_callback_domain_exposure($matches = array())
        return $result;
 }
 
-// Preprocess: rawurldecode() and adding space(s) and something
+// Preprocess: minor-rawurldecode() and adding space(s) and something
 // to detect/count some URIs _if possible_
 // NOTE: It's maybe danger to var_dump(result). [e.g. 'javascript:']
 // [OK] http://victim.example.org/?site:nasty.example.org
@@ -667,7 +667,20 @@ function spam_uri_pickup_preprocess($string = '', $method = array())
 {
        if (! is_string($string)) return '';
 
-       $string = spam_uri_removing_hocus_pocus(rawurldecode($string), $method);
+       // rawurldecode(), just to catch encoded 'http://path/to/file', not to change '%20' to ' '
+       $string = strtr(
+               $string,
+               array(
+                       '%3A' => ':',
+                       '%3a' => ':',
+                       '%2F' => '/',
+                       '%2f' => '/',
+                       '%5C' => '\\',
+                       '%5c' => '\\',
+               )
+       );
+
+       $string = spam_uri_removing_hocus_pocus($string, $method);
        //var_dump(htmlspecialchars($string));
 
        // Domain exposure (simple)