2 // $Id: spam.php,v 1.11 2006/11/03 15:58:00 henoheno Exp $
3 // Copyright (C) 2006 PukiWiki Developers Team
4 // License: GPL v2 or (at your option) any later version
6 // Functions for Concept-work of spam-uri metrics
8 // Return an array of normalized/parsed URIs in the $string
9 // [OK] http://nasty.example.org#nasty_string
10 // [OK] http://nasty.example.org/foo/xxx#nasty_string/bar
11 // [OK] ftp://dfshodfs:80/dfsdfs
12 // [OK] http://victim.example.org/go?http%3A%2F%2Fnasty.example.org
13 // [OK] http://victim.example.org/gphttp://nasty.example.org
// Pick up URIs from $string and describe each one by scheme, host, port,
// path and file, plus an area score that is lowered for URIs located inside
// anchor tags or BBCode url/link tags.
// NOTE(review): the numbers embedded at the start of each line are not
// contiguous (14 -> 16, 22 -> 24, ...), so braces and some statements of
// this function are not visible here. Comments only cover visible lines.
14 function spam_pickup($string = '')
16 // Preprocess: urldecode() and adding space(s)
// Every scheme:// occurrence in the decoded text gets a leading space, so a
// URI glued to preceding text can be matched on its own afterwards.
17 $string = preg_replace(
20 '#\b[a-z][a-z0-9.+-]{1,8}://#i',
21 '#[a-z][a-z0-9.+-]{1,8}://#i'
22 ), ' $0', urldecode($string));
24 // URI pickup: Not available for user@password, IDN, Fragment(=ignored)
28 '#(\b[a-z][a-z0-9.+-]{1,8})://' . // 1: Scheme
31 '\[[0-9a-f:.]+\]' . '|' . // IPv6([colon-hex and dot]): RFC2732
// NOTE(review): {1-3} is not a PCRE quantifier and is matched literally;
// {1,3} was presumably intended. Dotted-decimal hosts still match through
// the FQDN alternative on the next line, which accepts digits and dots.
32 '(?:[0-9]{1-3}\.){3}[0-9]{1-3}' . '|' . // IPv4(dot-decimal): 001.22.3.44
33 '[^\s<>"\'\[\]:/\#?]+' . // FQDN: foo.example.org
// The port group accepts letters as well as digits and needs 2+ characters
35 '(?::([a-z0-9]{2,}))?' . // 3: Port
36 '((?:/+[^\s<>"\'\[\]/\#]+)*/+)?' . // 4: Directory path or path-info
37 '([^\s<>"\'\[\]\#]+)?' . // 5: File and query string
38 // #: Fragment(ignored)
// PREG_SET_ORDER gives one entry per matched URI; PREG_OFFSET_CAPTURE makes
// every captured group a pair of (text, byte offset).
40 $string, $array, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);
41 //var_dump(recursive_map('htmlspecialchars', $array));
// Map of capture group numbers to the names used from here on
43 $parts = array(1 => 'scheme', 2 => 'host', 3 => 'port',
44 4 => 'path', 5 => 'file');
46 foreach(array_keys($array) as $uri) {
47 unset($array[$uri][0]); // Matched string itself
// NOTE(review): $default is not defined in the visible lines (embedded line
// 45 is missing); it is the value used for groups that did not match.
48 array_rename_keys($array[$uri], $parts, TRUE, $default);
49 $offset = $array[$uri]['scheme'][1]; // Scheme's offset
51 // Remove offsets (with normalization)
52 foreach(array_keys($array[$uri]) as $part) {
// NOTE(review): the assignment target (embedded line 53) is not visible;
// presumably the lowercased text replaces the (text, offset) pair.
54 strtolower($array[$uri][$part][0]);
56 $array[$uri]['path'] = path_normalize($array[$uri]['path']);
// The offset is kept temporarily because area_measure() needs it
57 $array[$uri]['offset'] = $offset;
58 $array[$uri]['area'] = 0;
61 // Area elevation for '(especially external)link' intention
62 if (! empty($array)) {
63 // Anchor tags by preg_match_all()
64 // [OK] <a href="http://nasty.example.com">visit http://nasty.example.com/</a>
65 // [NG] <a href="http://ng.example.com">visit http://ng.example.com _not_ended_
66 // [NG] <a href= >Good site!</a> <a href= "#" >test</a>
// Group 1 captures the final > of the closing tag, so its offset marks the
// end of the area.
68 preg_match_all('#<a\b[^>]*href[^>]*>.*?</a\b[^>]*(>)#i',
69 $string, $areas, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);
70 //var_dump(recursive_map('htmlspecialchars', $areas));
// Reduce every match to a (start offset, end offset) pair
71 foreach(array_keys($areas) as $area) {
72 $areas[$area] = array(
73 $areas[$area][0][1], // [0][1] = Area start (<a href>)
74 $areas[$area][1][1], // [1][1] = Area end (</a>)
// With the default belief of -1, each URI inside an area loses one point
77 area_measure($areas, $array);
79 // Various Wiki syntax
80 // [text_or_uri>text_or_uri]
81 // [text_or_uri:text_or_uri]
82 // [text_or_uri|text_or_uri]
83 // [text_or_uri->text_or_uri]
84 // [text_or_uri text_or_uri] // MediaWiki
85 // MediaWiki: [http://nasty.example.com/ visit http://nasty.example.com/]
87 // phpBB's "BBCode" by preg_match_all()
88 // [url]http://nasty.example.com/[/url]
89 // [link]http://nasty.example.com/[/link]
90 // [url=http://nasty.example.com]visit http://nasty.example.com/[/url]
91 // [link http://nasty.example.com/]buy something[/link]
// Group 1 is the tag name (reused by the backreference), group 2 captures
// the final bracket of the closing tag.
94 preg_match_all('#\[(url|link)\b[^\]]*\].*?\[/\1\b[^\]]*(\])#i',
95 $string, $areas, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);
96 //var_dump(recursive_map('htmlspecialchars', $areas));
97 foreach(array_keys($areas) as $area) {
98 $areas[$area] = array(
99 $areas[$area][0][1], // [0][1] = Area start ([url])
100 $areas[$area][2][1], // [2][1] = Area end ([/url])
103 area_measure($areas, $array);
105 // Remove 'offset's for area_measure()
106 foreach(array_keys($array) as $key)
107 unset($array[$key]['offset']);
113 // $array[0] => $array['name']
// Rename keys of $array in place: each key of $rename is an existing key of
// $array, each value is the name that entry should get.
// NOTE(review): lines between the visible ones are missing from this listing
// (embedded numbers jump 114 -> 117, 122 -> 126, 127 -> 131), so the test on
// $force and the return statements are not visible. Presumably the first
// loop is the $force branch and the two later loops the non-forced branch.
114 function array_rename_keys(& $array, $rename = array(), $force = FALSE, $default = '')
117 foreach($rename as $from => $to) {
118 if (isset($array[$from])) {
// Bind the new key to the same value, then drop only the old name
119 $array[$to] = & $array[$from];
120 unset($array[$from]);
// Presumably the else branch: the old key is not set, so use $default
122 $array[$to] = $default;
// Check every old key with isset() before anything is renamed
126 foreach(array_keys($rename) as $from) {
127 if (! isset($array[$from])) {
// NOTE(review): the reaction to a missing key (embedded lines 128-130) is
// not visible here.
131 foreach($rename as $from => $to) {
132 $array[$to] = & $array[$from];
133 unset($array[$from]);
140 // Path normalization
141 // example.org => example.org/
142 // example.org#hoge -> example.org/#hoge
143 // example.org/path/a/b/./c////./d -> example.org/path/a/b/c/d
144 // example.org/path/../../a/../back
// Normalize a path made of segments separated by $divider.
//   $path    : path to normalize
//   $divider : segment separator
//   $addroot : when TRUE the rebuilt path starts with $divider
// NOTE(review): several lines are missing from this listing (embedded numbers
// jump 148 -> 151, 152 -> 155, 156 -> 161, 162 -> 165, 165 -> 170), so the
// definition of $tmp, the actions taken for skipped segments and the return
// statement are not visible.
145 function path_normalize($path = '', $divider = '/', $addroot = TRUE)
// A non-string or empty path becomes the bare root, or an empty string
147 if (! is_string($path) || $path == '') {
148 $path = $addroot ? $divider : '';
// Remember whether the path ended with a divider, to restore it at the end
151 $last = ($path[strlen($path) - 1] == $divider) ? $divider : '';
152 $array = explode($divider, $path);
// Empty segments (repeated dividers) and single-dot segments are singled out
// here; presumably they are removed on the missing line 157.
155 foreach(array_keys($array) as $key) {
156 if ($array[$key] == '' || $array[$key] == '.')
// Double-dot segments get separate treatment (not visible, presumably the
// previous segment is dropped); every other segment is appended to $tmp.
161 foreach($array as $value) {
162 if ($value == '..') {
165 array_push($tmp, $value);
// Rebuild: optional leading divider, joined segments, then the remembered
// trailing divider, which is added only when segments remain.
// NOTE(review): this joins $array; whether $array was replaced by $tmp on
// the missing lines 166-169 cannot be seen here.
170 $path = $addroot ? $divider : '';
171 if (! empty($array)) $path .= implode($divider, $array) . $last;
177 // If in doubt, it's a little doubtful
/**
 * Adjust the score of every URI record whose offset lies strictly inside
 * one of the given areas.
 *
 * For each area that encloses a record, $belief is added to the record's
 * $a_key entry, so a record nested in two areas is adjusted twice.
 *
 * @param array  $areas  List of areas, each one array(start_offset, end_offset).
 *                       A missing bound is treated as 0.
 * @param array  $array  List of URI records, modified in place. A record is
 *                       only touched when it already has an $a_key entry.
 * @param int    $belief Value added to the score per enclosing area.
 * @param string $a_key  Key of the score inside a record.
 * @param string $o_key  Key of the offset inside a record; a missing offset
 *                       counts as 0.
 * @return void
 */
function area_measure($areas, &$array, $belief = -1, $a_key = 'area', $o_key = 'offset')
{
	if (! is_array($areas) || ! is_array($array)) return;

	// Normalize the area bounds once, instead of once per URI record
	$bounds = array();
	foreach ($areas as $area) {
		$bounds[] = array(
			isset($area[0]) ? intval($area[0]) : 0,
			isset($area[1]) ? intval($area[1]) : 0
		);
	}

	foreach (array_keys($array) as $u_index) {
		// Records without a score entry are left untouched
		if (! isset($array[$u_index][$a_key])) continue;

		$offset = isset($array[$u_index][$o_key]) ?
			intval($array[$u_index][$o_key]) : 0;

		foreach ($bounds as $bound) {
			// [Area => inside <= Area], both ends exclusive
			if ($bound[0] < $offset && $offset < $bound[1]) {
				$array[$u_index][$a_key] += $belief;
			}
		}
	}
}
// Judge whether $target looks like URI spam.
// $target is either a string or an array of such targets. The visible
// return statement yields array($is_spam, $urinum): the verdict and the
// number of URIs counted.
// NOTE(review): many lines are missing from this listing (embedded numbers
// jump 199 -> 204, 206 -> 211, 214 -> 216, 219 -> 228), so the initial
// values, the threshold test and the body of the area check are not visible.
199 function is_uri_spam($target = '')
// Arrays are handled by calling this function again for every element
204 if (is_array($target)) {
205 foreach($target as $str) {
206 list($is_spam, $_urinum) = is_uri_spam($str);
// Otherwise the target goes through spam_pickup() and its URIs are counted
211 $pickups = spam_pickup($target);
212 $urinum += count($pickups);
213 if (! empty($pickups)) {
214 // Some users want to post one or two URL, but ...
// NOTE(review): the condition guarding this assignment (embedded line 215)
// is not visible.
216 $is_spam = TRUE; // Too many!
// A negative area means area_measure() found the URI inside link markup
218 foreach($pickups as $pickup) {
219 if ($pickup['area'] < 0) {
228 return array($is_spam, $urinum);
232 // TODO: tracker: since it can vary, it would be better to simply check everything in $post and $vars.
233 // That way nothing slips through.
234 // Then the notification mail could contain only the fields that were caught.
235 // As a countermeasure for edit, fields to be ignored could be provided.
237 // Mail to administrator with more measurement data?
238 // Simple/fast spam filter (for one text field)
// Run the URI spam check on $target and stop the request on a positive
// verdict.
//   $action : stored as ACTION in the notification footer
//   $page   : page name, used in the PAGE and URI footer entries
//   $target : value handed to is_uri_spam() and dumped into the mail body
// NOTE(review): lines are missing from this listing (embedded numbers jump
// 239 -> 242, 242 -> 245, 252 -> 257), so the conditions around the
// notification part are not visible; presumably mail is only sent when spam
// was detected and notification is enabled.
239 function pkwk_spamfilter($action, $page, $target = array('title' => ''))
// Only the verdict is used; the URI count returned alongside is dropped
242 list($is_spam) = is_uri_spam($target);
245 global $notify, $notify_subject;
// Footer entries passed to pkwk_mail_notify()
247 $footer['ACTION'] = $action;
248 $footer['PAGE'] = '[blocked] ' . $page;
249 $footer['URI'] = get_script_uri() . '?' . rawurlencode($page);
250 $footer['USER_AGENT'] = TRUE;
251 $footer['REMOTE_ADDR'] = TRUE;
// The mail body is a var_export() dump of the checked target
252 pkwk_mail_notify($notify_subject, var_export($target, TRUE), $footer);
257 if ($is_spam) spam_exit();