<?php
-// $Id: spam_pickup.php,v 1.54 2007/08/19 03:12:35 henoheno Exp $
+// $Id: spam_pickup.php,v 1.55 2007/08/20 14:37:23 henoheno Exp $
// Copyright (C) 2006-2007 PukiWiki Developers Team
// License: GPL v2 or (at your option) any later version
//
preg_match_all(
// scheme://userinfo@host:port/path/or/pathinfo/maybefile.and?query=string#fragment
// Refer RFC3986 (Regex below is not strict)
- '#(\b[a-z][a-z0-9.+-]{1,8}):[/\\\]+' . // 1: Scheme
+ '#(\b[a-z][a-z0-9.+-]{1,8}):[/\\\]+' . // 1: Scheme
'(?:' .
'([^\s<>"\'\[\]/\#?@]*)' . // 2: Userinfo (Username)
'@)?' .
// 3: Host
'\[[0-9a-f:.]+\]' . '|' . // IPv6([colon-hex and dot]): RFC2732
'(?:[0-9]{1,3}\.){3}[0-9]{1,3}' . '|' . // IPv4(dot-decimal): 001.22.3.44
- '[a-z0-9_-][a-z0-9_.-]+[a-z0-9_-]' . // hostname(FQDN) : foo.example.org
+ '[a-z0-9_-][a-z0-9_.-]+[a-z0-9_-]' . // hostname(FQDN) : foo.example.org
')' .
'(?::([0-9]*))?' . // 4: Port
'((?:/+[^\s<>"\'\[\]/\#]+)*/+)?' . // 5: Directory path or path-info
return $result;
}
-// Preprocess: rawurldecode() and adding space(s) and something
+// Preprocess: partial rawurldecode() (only ':', '/' and '\') plus adding space(s) and so on,
// to detect/count some URIs _if possible_
// NOTE: It's maybe danger to var_dump(result). [e.g. 'javascript:']
// [OK] http://victim.example.org/?site:nasty.example.org
{
if (! is_string($string)) return '';
- $string = spam_uri_removing_hocus_pocus(rawurldecode($string), $method);
+ // Partial rawurldecode(): decode only ':', '/' and '\' so that an encoded 'http://path/to/file' is caught, without turning other escapes such as '%20' into ' '
+ $string = strtr(
+ $string,
+ array(
+ '%3A' => ':',
+ '%3a' => ':',
+ '%2F' => '/',
+ '%2f' => '/',
+ '%5C' => '\\',
+ '%5c' => '\\',
+ )
+ );
+
+ $string = spam_uri_removing_hocus_pocus($string, $method);
//var_dump(htmlspecialchars($string));
// Domain exposure (simple)