From: henoheno Date: Sun, 12 Nov 2006 10:59:56 +0000 (+0900) Subject: Roughly reconstruncted: spam_pickup() => spam_uri_pickup() X-Git-Url: http://git.osdn.net/view?a=commitdiff_plain;h=9084427e4370c0741fb14773a7a53c9e2b4052b9;hp=ed97039718fe05d44d005478e52078dc355c024f;p=pukiwiki%2Fpukiwiki_sandbox.git Roughly reconstruncted: spam_pickup() => spam_uri_pickup() --- diff --git a/spam.php b/spam.php index 79e0552..51ccb9c 100644 --- a/spam.php +++ b/spam.php @@ -1,27 +1,17 @@ 'scheme', 2 => 'host', 3 => 'port', 4 => 'path', 5 => 'file'); @@ -48,16 +39,46 @@ function spam_pickup($string = '') array_rename_keys($array[$uri], $parts, TRUE, $default); $offset = $array[$uri]['scheme'][1]; // Scheme's offset - // Remove offsets (with normalization) + // Remove offsets for each part foreach(array_keys($array[$uri]) as $part) { - $array[$uri][$part] = - strtolower($array[$uri][$part][0]); + $array[$uri][$part] = & $array[$uri][$part][0]; } - //$array[$uri]['path'] = path_normalize($array[$uri]['path']); + $array[$uri]['offset'] = $offset; $array[$uri]['area'] = 0; } + return $array; +} + +// Preprocess: rawurldecode() and adding space(s) to detect/count some URIs if possible +// NOTE: It's maybe danger to var_dump() these results. +// [OK] http://victim.example.org/go?http%3A%2F%2Fnasty.example.org +// [OK] http://victim.example.org/http://nasty.example.org +function spam_uri_pickup_preprocess($string = '') +{ + if (is_string($string)) { + return preg_replace( + array( + '#(?:https?|ftp):/#', + '#\b[a-z][a-z0-9.+-]{1,8}://#i', + '#[a-z][a-z0-9.+-]{1,8}://#i' + ), + ' $0', + rawurldecode($string) + ); + } else { + return ''; + } +} + +// Main function of spam-uri pickup +function spam_uri_pickup($string = '') +{ + $string = spam_uri_pickup_preprocess($string); + + $array = uri_pickup($string); + // Area elevation for '(especially external)link' intension if (! empty($array)) { // Anchor tags by preg_match_all() @@ -77,14 +98,6 @@ function spam_pickup($string = '') } area_measure($areas, $array); - // Various Wiki syntax - // [text_or_uri>text_or_uri] - // [text_or_uri:text_or_uri] - // [text_or_uri|text_or_uri] - // [text_or_uri->text_or_uri] - // [text_or_uri text_or_uri] // MediaWiki - // MediaWiki: [http://nasty.example.com/ visit http://nasty.example.com/] - // phpBB's "BBCode" by preg_match_all() // [url]http://nasty.example.com/[/url] // [link]http://nasty.example.com/[/link] @@ -103,9 +116,17 @@ function spam_pickup($string = '') } area_measure($areas, $array); + // Various Wiki syntax + // [text_or_uri>text_or_uri] + // [text_or_uri:text_or_uri] + // [text_or_uri|text_or_uri] + // [text_or_uri->text_or_uri] + // [text_or_uri text_or_uri] // MediaWiki + // MediaWiki: [http://nasty.example.com/ visit http://nasty.example.com/] + // Remove 'offset's for area_measure() - foreach(array_keys($array) as $key) - unset($array[$key]['offset']); + //foreach(array_keys($array) as $key) + // unset($array[$key]['offset']); } return $array; @@ -231,7 +252,7 @@ function is_uri_spam($target = '') if ($is_spam) break; } } else { - $pickups = spam_pickup($target); + $pickups = spam_uri_pickup($target); $urinum += count($pickups); if (! empty($pickups)) { // Some users want to post some URLs, but ... diff --git a/spam/spam.php b/spam/spam.php index 79e0552..51ccb9c 100644 --- a/spam/spam.php +++ b/spam/spam.php @@ -1,27 +1,17 @@ 'scheme', 2 => 'host', 3 => 'port', 4 => 'path', 5 => 'file'); @@ -48,16 +39,46 @@ function spam_pickup($string = '') array_rename_keys($array[$uri], $parts, TRUE, $default); $offset = $array[$uri]['scheme'][1]; // Scheme's offset - // Remove offsets (with normalization) + // Remove offsets for each part foreach(array_keys($array[$uri]) as $part) { - $array[$uri][$part] = - strtolower($array[$uri][$part][0]); + $array[$uri][$part] = & $array[$uri][$part][0]; } - //$array[$uri]['path'] = path_normalize($array[$uri]['path']); + $array[$uri]['offset'] = $offset; $array[$uri]['area'] = 0; } + return $array; +} + +// Preprocess: rawurldecode() and adding space(s) to detect/count some URIs if possible +// NOTE: It's maybe danger to var_dump() these results. +// [OK] http://victim.example.org/go?http%3A%2F%2Fnasty.example.org +// [OK] http://victim.example.org/http://nasty.example.org +function spam_uri_pickup_preprocess($string = '') +{ + if (is_string($string)) { + return preg_replace( + array( + '#(?:https?|ftp):/#', + '#\b[a-z][a-z0-9.+-]{1,8}://#i', + '#[a-z][a-z0-9.+-]{1,8}://#i' + ), + ' $0', + rawurldecode($string) + ); + } else { + return ''; + } +} + +// Main function of spam-uri pickup +function spam_uri_pickup($string = '') +{ + $string = spam_uri_pickup_preprocess($string); + + $array = uri_pickup($string); + // Area elevation for '(especially external)link' intension if (! empty($array)) { // Anchor tags by preg_match_all() @@ -77,14 +98,6 @@ function spam_pickup($string = '') } area_measure($areas, $array); - // Various Wiki syntax - // [text_or_uri>text_or_uri] - // [text_or_uri:text_or_uri] - // [text_or_uri|text_or_uri] - // [text_or_uri->text_or_uri] - // [text_or_uri text_or_uri] // MediaWiki - // MediaWiki: [http://nasty.example.com/ visit http://nasty.example.com/] - // phpBB's "BBCode" by preg_match_all() // [url]http://nasty.example.com/[/url] // [link]http://nasty.example.com/[/link] @@ -103,9 +116,17 @@ function spam_pickup($string = '') } area_measure($areas, $array); + // Various Wiki syntax + // [text_or_uri>text_or_uri] + // [text_or_uri:text_or_uri] + // [text_or_uri|text_or_uri] + // [text_or_uri->text_or_uri] + // [text_or_uri text_or_uri] // MediaWiki + // MediaWiki: [http://nasty.example.com/ visit http://nasty.example.com/] + // Remove 'offset's for area_measure() - foreach(array_keys($array) as $key) - unset($array[$key]['offset']); + //foreach(array_keys($array) as $key) + // unset($array[$key]['offset']); } return $array; @@ -231,7 +252,7 @@ function is_uri_spam($target = '') if ($is_spam) break; } } else { - $pickups = spam_pickup($target); + $pickups = spam_uri_pickup($target); $urinum += count($pickups); if (! empty($pickups)) { // Some users want to post some URLs, but ... diff --git a/spam/spam_pickup.php b/spam/spam_pickup.php index b96d3e5..04a6d13 100644 --- a/spam/spam_pickup.php +++ b/spam/spam_pickup.php @@ -1,5 +1,5 @@ '; -$results = spam_pickup($msg); +$pickup = TRUE; +if ($pickup) { + $results = spam_uri_pickup($msg); -// Measure -$count = count($results); -$area = 0; -foreach($results as $result) - if (isset($result['area'])) - $area += $result['area']; -$average = $count ? ($area / $count) : 'NULL'; + // Measure + $count = count($results); + $area = 0; + foreach($results as $result) + if (isset($result['area'])) + $area += $result['area']; + $average = $count ? ($area / $count) : 'NULL'; + + echo "TOTAL = $count URIs, AREA_TOTAL = $area, AREA_AVERAGE = " . $average . "
" . "
"; + + $a = array(); + var_dump(array_tree('/a/b/c/d/e', '/', false)); + var_dump($a); + echo "
"; +} -echo "TOTAL = $count URIs, AREA_TOTAL = $area, AREA_AVERAGE = " . $average . "
" . "
"; var_dump('is_uri_spam($msg)', is_uri_spam($msg)); //$notify = TRUE; //var_dump('pkwk_spamfilter($msg)', pkwk_spamfilter('A', 'PAGE', array('msg' => $msg))); //echo "\n"; -var_dump('$results', $results); +if ($pickup) { + var_dump('$results', $results); +} echo ''; ?> diff --git a/spam_pickup.php b/spam_pickup.php index b96d3e5..04a6d13 100644 --- a/spam_pickup.php +++ b/spam_pickup.php @@ -1,5 +1,5 @@ '; -$results = spam_pickup($msg); +$pickup = TRUE; +if ($pickup) { + $results = spam_uri_pickup($msg); -// Measure -$count = count($results); -$area = 0; -foreach($results as $result) - if (isset($result['area'])) - $area += $result['area']; -$average = $count ? ($area / $count) : 'NULL'; + // Measure + $count = count($results); + $area = 0; + foreach($results as $result) + if (isset($result['area'])) + $area += $result['area']; + $average = $count ? ($area / $count) : 'NULL'; + + echo "TOTAL = $count URIs, AREA_TOTAL = $area, AREA_AVERAGE = " . $average . "
" . "
"; + + $a = array(); + var_dump(array_tree('/a/b/c/d/e', '/', false)); + var_dump($a); + echo "
"; +} -echo "TOTAL = $count URIs, AREA_TOTAL = $area, AREA_AVERAGE = " . $average . "
" . "
"; var_dump('is_uri_spam($msg)', is_uri_spam($msg)); //$notify = TRUE; //var_dump('pkwk_spamfilter($msg)', pkwk_spamfilter('A', 'PAGE', array('msg' => $msg))); //echo "\n"; -var_dump('$results', $results); +if ($pickup) { + var_dump('$results', $results); +} echo ''; ?>