<?php
-// $Id: spam_pickup.php,v 1.64 2008/12/30 11:13:49 henoheno Exp $
-// Copyright (C) 2006-2007 PukiWiki Developers Team
+// $Id: spam_pickup.php,v 1.71 2009/01/04 08:56:07 henoheno Exp $
+// Copyright (C) 2006-2009 PukiWiki Developers Team
// License: GPL v2 or (at your option) any later version
//
// Functions for Concept-work of spam-uri metrics
//
+// (PHP 4 >= 4.3.0): preg_match_all(PREG_OFFSET_CAPTURE): $method['uri_XXX'] related feature
+//
+
+// File included by whois_responsibility() to populate its domain table.
+// The guard keeps a value already defined elsewhere; otherwise the default name is used
+if (! defined('DOMAIN_INI_FILE')) define('DOMAIN_INI_FILE', 'domain.ini.php');
// ---------------------
// URI pickup
// [OK] http://nasty.example.org:80/foo/xxx#nasty_string/bar
// [OK] ftp://nasty.example.org:80/dfsdfs
// [OK] ftp://cnn.example.com&story=breaking_news@10.0.0.1/top_story.htm (from RFC3986)
+// Not available for: IDN (internationalized domain names are ignored)
function uri_pickup($string = '')
{
if (! is_string($string)) return array();
- // Not available for: IDN(ignored)
$array = array();
preg_match_all(
// scheme://userinfo@host:port/path/or/pathinfo/maybefile.and?query=string#fragment
// Refer RFC3986 (Regex below is not strict)
'#(\b[a-z][a-z0-9.+-]{1,8}):[/\\\]+' . // 1: Scheme
'(?:' .
- '([^\s<>"\'\[\]/\#?@]*)' . // 2: Userinfo (Username)
+ '([^\s<>"\'\[\]/\#?@]*)' . // 2: Userinfo (Username and/or password)
'@)?' .
'(' .
// 3: Host
'[a-z0-9_-][a-z0-9_.-]+[a-z0-9_-]' . // hostname(FQDN) : foo.example.org
')' .
'(?::([0-9]*))?' . // 4: Port
- '((?:/+[^\s<>"\'\[\]/\#]+)*/+)?' . // 5: Directory path or path-info
+ '((?:/+[^\s<>"\'\[\]/\#?]+)*/+)?' . // 5: Directory path
'([^\s<>"\'\[\]\#?]+)?' . // 6: File?
'(?:\?([^\s<>"\'\[\]\#]+))?' . // 7: Query string
'(?:\#([a-z0-9._~%!$&\'()*+,;=:@-]*))?' . // 8: Fragment
$tmp[] = & $uri['scheme'];
$tmp[] = '://';
}
+
if (isset($uri['userinfo']) && $uri['userinfo'] !== '') {
$tmp[] = & $uri['userinfo'];
$tmp[] = '@';
+ } else if (isset($uri['user']) || isset($uri['pass'])) {
+ if (isset($uri['user']) && $uri['user'] !== '') {
+ $tmp[] = & $uri['user'];
+ }
+ $tmp[] = ':';
+ if (isset($uri['pass']) && $uri['pass'] !== '') {
+ $tmp[] = & $uri['pass'];
+ }
+ $tmp[] = '@';
}
+
if (isset($uri['host']) && $uri['host'] !== '') {
$tmp[] = & $uri['host'];
}
+
if (isset($uri['port']) && $uri['port'] !== '') {
$tmp[] = ':';
$tmp[] = & $uri['port'];
}
+
if (isset($uri['path']) && $uri['path'] !== '') {
$tmp[] = & $uri['path'];
}
+
if (isset($uri['file']) && $uri['file'] !== '') {
$tmp[] = & $uri['file'];
}
+
if (isset($uri['query']) && $uri['query'] !== '') {
$tmp[] = '?';
$tmp[] = & $uri['query'];
}
+
if (isset($uri['fragment']) && $uri['fragment'] !== '') {
$tmp[] = '#';
$tmp[] = & $uri['fragment'];
return implode('', $tmp);
}
+
// ---------------------
// URI normalization
// Normalize an array of URI arrays
// NOTE: Give me the uri_pickup() results
-function uri_pickup_normalize(& $pickups, $destructive = TRUE)
+function uri_pickup_normalize(& $pickups, $destructive = TRUE, $pathfile = FALSE)
{
if (! is_array($pickups)) return $pickups;
}
}
+ if ($pathfile) {
+ return uri_pickup_normalize_pathfile($pickups);
+ } else {
+ return $pickups;
+ }
+}
+
+// Normalize: fold 'file' into 'path' for every URI array in $pickups
+// (the result resembles what PHP's "parse_url()" calls 'path')
+// NOTE: In some cases, 'file' DOES NOT mean a _filename_.
+// [EXAMPLE] http://example.com/path/to/directory-accidentally-not-ended-with-slash
+// $pickups is taken by reference and modified in place; it is also returned.
+// A non-array argument is returned untouched
+function uri_pickup_normalize_pathfile(& $pickups)
+{
+    if (is_array($pickups)) {
+        foreach (array_keys($pickups) as $index) {
+            // Entries lacking either part are left as they are
+            if (! isset($pickups[$index]['path'], $pickups[$index]['file'])) continue;
+
+            $pickups[$index]['path'] = $pickups[$index]['path'] . $pickups[$index]['file'];
+            unset($pickups[$index]['file']);
+        }
+    }
+
return $pickups;
}
return $array;
}
+// Rough hostname checker: does the string look like an IP address?
+// Returns 6     when the string contains ':' (seems IPv6)
+//         4     when the WHOLE string looks like dot-decimal IPv4:
+//               four 1-3 digit groups, or 1-3 groups each followed by a dot
+//         FALSE otherwise (including non-string input)
+// TODO: Strict digit, 0x, CIDR, '999.999.999.999', ':', '::G'
+function is_ip($string = '')
+{
+    if (! is_string($string)) return FALSE;
+
+    if (strpos($string, ':') !== FALSE) {
+        return 6; // Seems IPv6
+    }
+
+    // The alternation is wrapped in a group so that BOTH '^' and '$' apply to
+    // each branch. Ungrouped ('^A|B$'), hostnames such as
+    // '1.2.3.4.example.com' or 'example.1.' were mistaken for IPv4
+    if (preg_match('/^(?:' .
+        '(?:[0-9]{1,3}\.){3}[0-9]{1,3}' . '|' .
+        '(?:[0-9]{1,3}\.){1,3}' . ')$/',
+        $string)) {
+        return 4; // Seems IPv4(dot-decimal)
+    }
+
+    return FALSE; // Seems not IP
+}
+
+// Find the responsibility-root of an FQDN
+// 'foo.bar.example.com'        => 'example.com'        (.com has the last whois for it)
+// 'foo.bar.example.au'         => 'example.au'         (.au has the last whois for it)
+// 'foo.bar.example.edu.au'     => 'example.edu.au'     (.edu.au has the last whois for it)
+// 'foo.bar.example.act.edu.au' => 'example.act.edu.au' (.act.edu.au has the last whois for it)
+//
+// $fqdn     : Hostname to examine. NULL discards the cached domain table and returns ''
+// $parent   : TRUE to stop at the deepest label found in the domain table
+//             (do not add the first label below it)
+// $implicit : TRUE to add the second label when only one label was collected
+// Returns ''    for non-string input, or when no label was collected
+//         $fqdn itself when is_ip() says it looks like an IP address
+function whois_responsibility($fqdn = 'foo.bar.example.com', $parent = FALSE, $implicit = TRUE)
+{
+    // Domain table, loaded once per request from DOMAIN_INI_FILE
+    static $domain;
+
+    if ($fqdn === NULL) {
+        $domain = NULL; // Unset
+        return '';
+    }
+    if (! is_string($fqdn)) return '';
+    if (is_ip($fqdn)) return $fqdn;
+
+    if (! isset($domain)) {
+        $domain = array();
+        if (file_exists(DOMAIN_INI_FILE)) {
+            include(DOMAIN_INI_FILE); // Set
+        }
+    }
+
+    // Walk the labels from the top-level domain downwards
+    $labels = array_reverse(explode('.', $fqdn));
+    $total  = count($labels);
+    $picked = array();
+    $node   = $domain;
+    for ($i = 0; $i < $total; ++$i) {
+        $label = $labels[$i];
+        if (is_array($node) && isset($node[$label])) {
+            // Known to the table: keep the label and descend
+            $picked[] = $label;
+            $node     = $node[$label];
+            continue;
+        }
+        if (! $parent) {
+            $picked[] = $label; // Whois servers must know this subdomain
+        }
+        break;
+    }
+
+    // Implicit responsibility: Top-Level-Domains must not be yours
+    // 'bar.foo.something' => 'foo.something'
+    if ($implicit && count($picked) == 1 && $total > 1) {
+        $picked[] = $labels[1];
+    }
+
+    if (! $picked) return '';
+
+    return implode('.', array_reverse($picked));
+}
+
?>