+// Preprocess: Domain exposure callback (See spam_uri_pickup_preprocess())
+//   http://victim.example.org/?foo+site:nasty.example.com+bar
+//   => http://nasty.example.com/?refer=victim.example.org
+// $matches (as given by preg_replace_callback()):
+//   1:scheme, 2:victim host, 3:the rest before 'site:', 4:nasty host,
+//   5:marker group; when set, the victim URI is appended after a space
+// Returns the flipped URI (plus ' ' and the victim URI part), or '' if
+// there is nothing to build from.
+// NOTE: 'refer=' is not so good (at this time).
+// NOTE: Consider the case of an IP address as the victim; try to avoid that.
+function _preg_replace_callback_domain_exposure($matches = array())
+{
+	// Victim URI, preserved only when the 5th group took part in the match
+	$victim = '';
+	if (isset($matches[5])) {
+		$victim = $matches[1] . '://' . $matches[2] . '/' . $matches[3];
+	}
+
+	// No nasty host captured: nothing to flip
+	if (! isset($matches[4])) return $victim;
+
+	// Flipped URI first, then the (possibly empty) victim URI
+	$flipped =
+		$matches[1] . '://' . $matches[4] .		// scheme://nasty.example.com
+		'/?refer=' . strtolower($matches[2]);	// victim.example.org
+
+	return $flipped . ' ' . $victim;
+}
+
+// Preprocess: minor-rawurldecode(), domain exposure, and adding space(s),
+// to detect/count some URIs _if possible_
+// NOTE: It may be dangerous to var_dump() the result. [e.g. 'javascript:']
+// [OK] http://victim.example.org/?site:nasty.example.org
+// [OK] http://victim.example.org/nasty.example.org
+// [OK] http://victim.example.org/go?http%3A%2F%2Fnasty.example.org
+// [OK] http://victim.example.org/http://nasty.example.org
+// $string: Text to preprocess (anything but a string gives '')
+// $method: Passed through to spam_uri_removing_hocus_pocus()
+// Returns the preprocessed string. If a PCRE step fails (NULL result),
+// that step is skipped and the text from the previous step is kept.
+function spam_uri_pickup_preprocess($string = '', $method = array())
+{
+	if (! is_string($string)) return '';
+
+	// rawurldecode(), just to catch encoded 'http://path/to/file',
+	// not to change '%20' to ' '
+	$string = strtr(
+		$string,
+		array(
+			'%3A' => ':',
+			'%3a' => ':',
+			'%2F' => '/',
+			'%2f' => '/',
+			'%5C' => '\\',
+			'%5c' => '\\',
+		)
+	);
+
+	$string = spam_uri_removing_hocus_pocus($string, $method);
+
+	// Domain exposure (simple)
+	// http://victim.example.org/nasty.example.org/path#frag
+	// => http://nasty.example.org/?refer=victim.example.org/ and original
+	// Known redirector/gateway prefixes. Every '.' of a host name is escaped
+	// so that it matches a literal dot only, not any character.
+	$gateways = array(
+		'a9\.com/',
+		'aboutus\.org/',
+		'alexa\.com/data/details\?url=',
+		'ime\.(?:nu|st)/',	// 2ch.net
+		'link\.toolbot\.com/',
+		'urlx\.org/',
+		'big5\.51job\.com/gate/big5/',
+		'big5\.china\.com/gate/big5/',
+		'big5\.shippingchina\.com:8080/',
+		'big5\.xinhuanet\.com/gate/big5/',
+		'bhomiyo\.com/en\.xliterate/',
+		'google\.com/translate_c\?u=(?:http://)?',
+		'web\.archive\.org/web/2[^/]*/(?:http://)?',
+		'technorati\.com/blogs/',
+	);
+	$exposed = preg_replace(
+		'#h?ttp://' .
+		'(' . implode('|', $gateways) . ')' .	// 1:victim (gateway) prefix
+		'([a-z0-9.%_-]+\.[a-z0-9.%_-]+)' .	// 2:nasty.example.org
+		'#i',
+		'http://$2/?refer=$1 $0',	// Preserve $0 or remove?
+		$string
+	);
+	if (is_string($exposed)) $string = $exposed;	// NULL = PCRE error: keep the input
+
+	// Domain exposure (site:) See _preg_replace_callback_domain_exposure()
+	$exposed = preg_replace_callback(
+		array(
+			'#(h?ttp)://' .	// 1:Scheme
+			// 2:Host
+			'(' .
+				'(?:[a-z0-9_.-]+\.)?[a-z0-9_-]+\.[a-z0-9_-]+' .
+				// Something Google: http://www.google.com/supported_domains
+				// AltaVista: http://es.altavista.com/web/results?q=site%3Anasty.example.org+foobar
+				// Live Search: search.live.com
+				// MySpace: http://sads.myspace.com/Modules/Search/Pages/Search.aspx?_snip_&searchString=site:nasty.example.org
+				//          (also searchresults.myspace.com)
+				// alltheweb.com
+				// search.bbc.co.uk
+				// search.orange.co.uk
+				// ...
+			')' .
+			'/' .
+			//TODO: Specify URL-enable characters
+			'([a-z0-9?=&.,%_/\'\\\+-]+)' .	// 3:path/?query=foo+bar+
+			'(?:\b|%20)site:([a-z0-9.%_-]+\.[a-z0-9.%_-]+)' .	// 4:site:nasty.example.com
+			'()' .	// 5:Preserve or remove?
+			'#i',
+		),
+		'_preg_replace_callback_domain_exposure',
+		$string
+	);
+	if (is_string($exposed)) $string = $exposed;	// NULL = PCRE error: keep the input
+
+	// URI exposure (uriuri => uri uri)
+	// Candidate for any scheme, not enabled: '#[a-z][a-z0-9.+-]{1,8}://#i'
+	$exposed = preg_replace(
+		array(
+			'#(?<! )(?:https?|ftp):/#i',
+		),
+		' $0',
+		$string
+	);
+	if (is_string($exposed)) $string = $exposed;	// NULL = PCRE error: keep the input
+
+	return $string;
+}
+
+// Main function of spam-uri pickup,
+// A wrapper function of uri_pickup()
+// $string: Text to inspect; it goes through spam_uri_pickup_preprocess() first
+// $method: Method array; replaced by check_uri_spam_method() when it is
+//          empty or not an array
+// Returns the result of uri_pickup(), with each ['area']['offset'] removed
+function spam_uri_pickup($string = '', $method = array())
+{
+	if (empty($method) || ! is_array($method)) {
+		$method = check_uri_spam_method();
+	}
+
+	$string = spam_uri_pickup_preprocess($string, $method);
+	$array  = uri_pickup($string);
+
+	// Area elevation of URIs, for '(especially external) link' intention
+	if (! empty($array)) {
+		// Hand over only the area-related methods that are actually set
+		$_method = array();
+		foreach (array('uri_anchor', 'uri_bbcode') as $type) {
+			if (isset($method[$type])) $_method[$type] = & $method[$type];
+		}
+
+		$areas = area_pickup($string, $_method, TRUE);
+		if (! empty($areas)) {
+			$types = array_keys($_method);
+
+			// Each shadow entry aliases $array[...]['area'],
+			// so writing through the shadow writes into $array
+			$area_shadow = array();
+			foreach (array_keys($array) as $key) {
+				$area_shadow[$key] = & $array[$key]['area'];
+				foreach ($types as $type) {
+					$area_shadow[$key][$type] = 0;	// Start each counter at zero
+				}
+			}
+
+			// Measure only the area types area_pickup() reported
+			foreach ($types as $type) {
+				if (isset($areas[$type])) {
+					area_measure($areas[$type], $area_shadow, 1, $type);
+				}
+			}
+		}
+	}
+
+	// Remove 'offset's for area_measure()
+	foreach (array_keys($array) as $key) {
+		unset($array[$key]['area']['offset']);
+	}
+
+	return $array;
+}
+
+// Rough hostname checker: does the string look like an IP address?
+// Returns 6     if it contains ':' (seems IPv6),
+//         4     if the WHOLE string is dot-decimal ('192.168.0.1')
+//               or a dotted prefix of one to three parts ('192.168.'),
+//         FALSE otherwise (including non-string input)
+// TODO: Strict digit, 0x, CIDR, '999.999.999.999', ':', '::G'
+function is_ip($string = '')
+{
+	if (! is_string($string)) return FALSE;
+
+	if (strpos($string, ':') !== FALSE) {
+		return 6;	// Seems IPv6
+	}
+
+	// The alternation is grouped so that '^' and '$' anchor BOTH forms.
+	// Ungrouped, '^full|prefix$' accepted host names such as
+	// '1.2.3.4.example.com' (starts like an IP) and 'host1.' (ends like a prefix).
+	if (preg_match('/^(?:' .
+		'(?:[0-9]{1,3}\.){3}[0-9]{1,3}' . '|' .	// Full dot-decimal
+		'(?:[0-9]{1,3}\.){1,3}' .			// Prefix with a trailing dot
+		')$/',
+		$string)) {
+		return 4;	// Seems IPv4(dot-decimal)
+	}
+
+	return FALSE;	// Seems not IP
+}
+
+// Check responsibility-root of the FQDN
+// 'foo.bar.example.com' => 'example.com' (.com has the last whois for it)
+// 'foo.bar.example.au' => 'example.au' (.au has the last whois for it)
+// 'foo.bar.example.edu.au' => 'example.edu.au' (.edu.au has the last whois for it)
+// 'foo.bar.example.act.edu.au' => 'example.act.edu.au' (.act.edu.au has the last whois for it)
+function whois_responsibility($fqdn = 'foo.bar.example.com', $parent = FALSE, $implicit = TRUE)
+{
+ static $domain;
+
+ if ($fqdn === NULL) {
+ $domain = NULL; // Unset
+ return '';
+ }
+ if (! is_string($fqdn)) return '';
+
+ if (is_ip($fqdn)) return $fqdn;
+
+ if (! isset($domain)) {
+ $domain = array();
+ if (file_exists(DOMAIN_INI_FILE)) {
+ include(DOMAIN_INI_FILE); // Set
+ }
+ }
+
+ $result = array();
+ $dcursor = & $domain;
+ $array = array_reverse(explode('.', $fqdn));
+ $i = 0;
+ while(TRUE) {
+ if (! isset($array[$i])) break;
+ $acursor = $array[$i];
+ if (is_array($dcursor) && isset($dcursor[$acursor])) {
+ $result[] = & $array[$i];
+ $dcursor = & $dcursor[$acursor];
+ } else {
+ if (! $parent && isset($acursor)) {
+ $result[] = & $array[$i]; // Whois servers must know this subdomain
+ }
+ break;
+ }
+ ++$i;
+ }
+
+ // Implicit responsibility: Top-Level-Domains must not be yours
+ // 'bar.foo.something' => 'foo.something'
+ if ($implicit && count($result) == 1 && count($array) > 1) {
+ $result[] = & $array[1];
+ }
+
+ return $result ? implode('.', array_reverse($result)) : '';