From b6de709a0b68dc4b4059718db55624fc4bb80d91 Mon Sep 17 00:00:00 2001 From: henoheno Date: Mon, 20 Aug 2007 23:37:23 +0900 Subject: [PATCH] spam_uri_pickup_preprocess(): not to rawurldecode(), not to decode '%20' --- spam/SpamPickupTest.php | 15 ++++++++++++++- spam/spam_pickup.php | 23 ++++++++++++++++++----- 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/spam/SpamPickupTest.php b/spam/SpamPickupTest.php index f61f725..77890c4 100644 --- a/spam/SpamPickupTest.php +++ b/spam/SpamPickupTest.php @@ -1,5 +1,5 @@ assertEquals('backslash.org', $results[0]['host']); + // Divider: percent-encoded + //$test_string = ' http%3A%2F%5Cpercent-encoded.org%5Cfobar.html '; + //$results = uri_pickup_normalize(uri_pickup($test_string)); + //$this->assertEquals('percent-encoded.org', $results[0]['host']); + // Host: Underscore $test_string = ' http://under_score.org/fobar.html '; $results = uri_pickup_normalize(uri_pickup($test_string)); @@ -348,6 +353,14 @@ EOF; $this->assertEquals('foo.html', $results[0]['file']); } + function testFunc_spam_uri_pickup() + { + // Divider: percent-encoded + $test_string = ' http://victim.example.org/http%3A%2F%5Cnasty.example.org '; + $results = spam_uri_pickup($test_string); + $this->assertEquals('victim.example.org', $results[0]['host']); + $this->assertEquals('nasty.example.org', $results[1]['host']); + } } ?> \ No newline at end of file diff --git a/spam/spam_pickup.php b/spam/spam_pickup.php index 467611a..fd68576 100644 --- a/spam/spam_pickup.php +++ b/spam/spam_pickup.php @@ -1,5 +1,5 @@ "\'\[\]/\#?@]*)' . // 2: Userinfo (Username) '@)?' . @@ -31,7 +31,7 @@ function uri_pickup($string = '') // 3: Host '\[[0-9a-f:.]+\]' . '|' . // IPv6([colon-hex and dot]): RFC2732 '(?:[0-9]{1,3}\.){3}[0-9]{1,3}' . '|' . // IPv4(dot-decimal): 001.22.3.44 - '[a-z0-9_-][a-z0-9_.-]+[a-z0-9_-]' . // hostname(FQDN) : foo.example.org + '[a-z0-9_-][a-z0-9_.-]+[a-z0-9_-]' . // hostname(FQDN) : foo.example.org ')' . '(?::([0-9]*))?' . // 4: Port '((?:/+[^\s<>"\'\[\]/\#]+)*/+)?' . // 5: Directory path or path-info @@ -656,7 +656,7 @@ function _preg_replace_callback_domain_exposure($matches = array()) return $result; } -// Preprocess: rawurldecode() and adding space(s) and something +// Preprocess: minor-rawurldecode() and adding space(s) and something // to detect/count some URIs _if possible_ // NOTE: It's maybe danger to var_dump(result). [e.g. 'javascript:'] // [OK] http://victim.example.org/?site:nasty.example.org @@ -667,7 +667,20 @@ function spam_uri_pickup_preprocess($string = '', $method = array()) { if (! is_string($string)) return ''; - $string = spam_uri_removing_hocus_pocus(rawurldecode($string), $method); + // rawurldecode(), just to catch encoded 'http://path/to/file', not to change '%20' to ' ' + $string = strtr( + $string, + array( + '%3A' => ':', + '%3a' => ':', + '%2F' => '/', + '%2f' => '/', + '%5C' => '\\', + '%5c' => '\\', + ) + ); + + $string = spam_uri_removing_hocus_pocus($string, $method); //var_dump(htmlspecialchars($string)); // Domain exposure (simple) -- 2.11.0