OSDN Git Service

* whois_responsibility() remove array_shrinkbranch_leaves()
[pukiwiki/pukiwiki_sandbox.git] / spam / spam.php
1 <?php
2 // $Id: spam.php,v 1.177 2007/06/15 14:46:15 henoheno Exp $
3 // Copyright (C) 2006-2007 PukiWiki Developers Team
4 // License: GPL v2 or (at your option) any later version
5 //
6 // Functions for Concept-work of spam-uri metrics
7 //
8 // (PHP 4 >= 4.3.0): preg_match_all(PREG_OFFSET_CAPTURE): $method['uri_XXX'] related feature
9
10 if (! defined('SPAM_INI_FILE')) define('SPAM_INI_FILE', 'spam.ini.php');
11
12 // ---------------------
13 // Compat etc
14
15 // (PHP 4 >= 4.2.0): var_export(): mail-reporting and dump related
// Fallback stub: only declared when the running PHP has no var_export().
// It ignores its arguments and just returns a notice line.
if (function_exists('var_export') === FALSE) {
	function var_export()
	{
		$notice = 'var_export() is not found on this server';
		return $notice . "\n";
	}
}
21
22 // (PHP 4 >= 4.2.0): preg_grep() enables invert option
function preg_grep_invert($pattern = '//', $input = array())
{
	// Probe for the native invert flag only once
	static $native = NULL;
	if ($native === NULL) {
		$native = defined('PREG_GREP_INVERT');
	}

	if ($native) {
		return preg_grep($pattern, $input, PREG_GREP_INVERT);
	}

	// Emulation: drop everything that matched from the input
	$matched = preg_grep($pattern, $input);
	if (! $matched) {
		return $input;
	}

	return array_diff($input, $matched);
}
39
40 // ----
41
42 // Very roughly, shrink the lines of var_export()
43 // NOTE: If the same data exists, it must be corrupted.
function var_export_shrink($expression, $return = FALSE, $ignore_numeric_keys = FALSE)
{
	// Pull a nested 'array (' up onto the line of its key
	$dump = preg_replace(
		'# => \n *array \(#',
		' => array (',
		var_export($expression, TRUE)
	);

	// Optionally strip the leading 'N => ' of numeric keys
	if ($ignore_numeric_keys) {
		$dump = preg_replace('#^( *)[0-9]+ => #m', '$1', $dump);
	}

	// Same contract as var_export(): return the text, or print it and return NULL
	if (! $return) {
		echo $dump;
		return NULL;
	}

	return $dump;
}
69
70 // Remove redundant values from array()
function array_unique_recursive($array = array())
{
	if (! is_array($array)) return $array;

	// Values already seen at this nesting level (value => TRUE)
	$seen = array();
	foreach (array_keys($array) as $key) {
		$value = $array[$key];

		// Sub-arrays are de-duplicated on their own, never compared with each other
		if (is_array($value)) {
			$array[$key] = array_unique_recursive($value);
			continue;
		}

		if (isset($seen[$value])) {
			unset($array[$key]);	// Keep the first occurrence only
			continue;
		}
		$seen[$value] = TRUE;
	}

	return $array;
}
90
91 // Renumber all numeric keys from 0
// Renumber every numeric key of $array (and of nested arrays) as 0, 1, 2, ...
// in their order of appearance. Non-numeric keys are kept as they are.
//   $array : Array to renumber, modified in place (non-arrays are left untouched)
//   return : The renumbered array (or the given value itself if not an array)
// NOTE: The array is rebuilt instead of renaming keys one by one. In-place
//   renaming lost elements whenever a new number was already used as a key
//   (e.g. array(1 => 'x', 0 => 'y')). Elements keep their relative order.
function array_renumber_numeric_keys(& $array)
{
	if (! is_array($array)) return $array;

	$count  = -1;
	$result = array();
	foreach (array_keys($array) as $key) {
		if (is_array($array[$key])) {
			array_renumber_numeric_keys($array[$key]);	// Recurse
		}
		if (is_numeric($key)) {
			$result[++$count] = $array[$key];
		} else {
			$result[$key] = $array[$key];
		}
	}
	$array = $result;

	return $array;
}
106
107 // Roughly strings(1) using PCRE
108 // This function is useful to:
109 //   * Reduce the size of data, from removing unprintable binary data
110 //   * Detect _bare_strings_ from binary data
111 // References:
112 //   http://www.freebsd.org/cgi/man.cgi?query=strings (Man-page of GNU strings)
113 //   http://www.pcre.org/pcre.txt
function strings($binary = '', $min_len = 4, $ignore_space = FALSE)
{
	if ($ignore_space) {
		// Unprintable runs become newlines, then blanks are squeezed and trimmed per line
		$patterns = array(
			'/(?:[^[:graph:] \t\n]|[\r])+/s',
			'/[ \t]{2,}/',
			'/^[ \t]/m',
			'/[ \t]$/m',
		);
		$replacements = array("\n", ' ', '', '');
		$binary = preg_replace($patterns, $replacements, $binary);
	} else {
		// Remove "\0" etc. Preserve readable spaces if possible.
		$binary = preg_replace('/(?:[^[:graph:][:space:]]|[\r])+/s', "\n", $binary);
	}

	// No length filter requested
	if (! ($min_len > 1)) return $binary;

	// Keep only the lines that have $min_len (1024 at most) characters or more
	$min_len = min(1024, intval($min_len));
	$regex   = '/^.{' . $min_len . ',}/S';

	if (! is_array($binary)) {
		$lines = explode("\n", $binary);
		return implode("\n", preg_grep($regex, $lines));
	}

	foreach (array_keys($binary) as $key) {
		$lines = explode("\n", $binary[$key]);
		$binary[$key] = implode("\n", preg_grep($regex, $lines));
	}

	return $binary;
}
150
151 // Reverse $string with specified delimiter
function delimiter_reverse($string = 'foo.bar.example.com', $from_delim = '.', $to_delim = '.')
{
	// Anything other than three strings: hand the input back untouched
	$all_strings = is_string($string) && is_string($from_delim) && is_string($to_delim);
	if (! $all_strings) return $string;

	// foo.bar.example.com => com.example.bar.foo
	$pieces = explode($from_delim, $string);
	return implode($to_delim, array_reverse($pieces));
}
160
161
162 // ---------------------
163 // URI pickup
164
165 // Return an array of URIs in the $string
166 // [OK] http://nasty.example.org#nasty_string
167 // [OK] http://nasty.example.org:80/foo/xxx#nasty_string/bar
168 // [OK] ftp://nasty.example.org:80/dfsdfs
169 // [OK] ftp://cnn.example.com&story=breaking_news@10.0.0.1/top_story.htm (from RFC3986)
// Pick up every URI-like string in $string.
//   $string : Text to scan (non-strings give an empty array)
//   return  : array(match index => array(
//               'scheme', 'userinfo', 'host', 'port', 'path', 'file',
//               'query', 'fragment' => matched string ('' if not present),
//               'area' => array('offset' => byte offset of the URI in $string)
//             ))
// NOTE: Each URI records its OWN offset. Formerly every URI received the
//   offset of the last match, which broke area_measure().
function uri_pickup($string = '')
{
	if (! is_string($string)) return array();

	// Not available for: IDN(ignored)
	$matches = array();
	preg_match_all(
		// scheme://userinfo@host:port/path/or/pathinfo/maybefile.and?query=string#fragment
		// Refer RFC3986 (Regex below is not strict)
		'#(\b[a-z][a-z0-9.+-]{1,8}):[/\\\]+' .	// 1: Scheme
		'(?:' .
			'([^\s<>"\'\[\]/\#?@]*)' .		// 2: Userinfo (Username)
		'@)?' .
		'(' .
			// 3: Host
			'\[[0-9a-f:.]+\]' . '|' .				// IPv6([colon-hex and dot]): RFC2732
			'(?:[0-9]{1,3}\.){3}[0-9]{1,3}' . '|' .	// IPv4(dot-decimal): 001.22.3.44
			'[a-z0-9][a-z0-9.-]+[a-z0-9]' .			// hostname(FQDN) : foo.example.org
		')' .
		'(?::([0-9]*))?' .					// 4: Port
		'((?:/+[^\s<>"\'\[\]/\#]+)*/+)?' .	// 5: Directory path or path-info
		'([^\s<>"\'\[\]\#?]+)?' .			// 6: File?
		'(?:\?([^\s<>"\'\[\]\#]+))?' .		// 7: Query string
		'(?:\#([a-z0-9._~%!$&\'()*+,;=:@-]*))?' .	// 8: Fragment
		'#i',
		 $string, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE
	);

	// Capture group number => name of the URI part
	static $parts = array(
		1 => 'scheme', 2 => 'userinfo', 3 => 'host', 4 => 'port',
		5 => 'path', 6 => 'file', 7 => 'query', 8 => 'fragment'
	);

	$array = array();
	foreach ($matches as $index => $match) {
		// With PREG_OFFSET_CAPTURE each group is array(string, offset).
		// Trailing groups that did not take part in the match are absent.
		$scheme = isset($match[1][0]) ? $match[1][0] : '';
		if ($scheme === '') continue;	// Considered harmless

		$uri = array();
		foreach ($parts as $group => $name) {
			$uri[$name] = isset($match[$group][0]) ? $match[$group][0] : '';
		}

		// Scheme's offset = URI's offset: Area offset for area_measure()
		$uri['area']['offset'] = $match[1][1];

		$array[$index] = $uri;
	}

	return $array;
}
225
226 // Normalize an array of URI arrays
227 // NOTE: Give me the uri_pickup() results
function uri_pickup_normalize(& $pickups, $destructive = TRUE)
{
	if (! is_array($pickups)) return $pickups;

	foreach (array_keys($pickups) as $key) {
		$uri = & $pickups[$key];

		// Scheme first: the port normalization below depends on it
		$uri['scheme'] = isset($uri['scheme']) ? scheme_normalize($uri['scheme']) : '';

		if ($destructive) {
			$uri['host'] = isset($uri['host']) ? host_normalize($uri['host']) : '';
		} else {
			$uri['host'] = isset($uri['host']) ? strtolower($uri['host']) : '';
		}

		$uri['port'] = isset($uri['port']) ? port_normalize($uri['port'], $uri['scheme'], FALSE) : '';

		if ($destructive) {
			$uri['path'] = isset($uri['path']) ? strtolower(path_normalize($uri['path'])) : '';
		} else {
			$uri['path'] = isset($uri['path']) ? path_normalize($uri['path']) : '';
		}

		// File, query and fragment are touched by the destructive mode only
		if ($destructive) {
			$uri['file']     = isset($uri['file'])     ? file_normalize($uri['file'])     : '';
			$uri['query']    = isset($uri['query'])    ? query_normalize($uri['query'])   : '';
			$uri['fragment'] = isset($uri['fragment']) ? strtolower($uri['fragment'])     : '';
		}
	}

	return $pickups;
}
255
256 // An URI array => An URI (See uri_pickup())
257 // USAGE:
258 //      $pickups = uri_pickup('a string include some URIs');
259 //      $uris = array();
260 //      foreach (array_keys($pickups) as $key) {
261 //              $uris[$key] = uri_pickup_implode($pickups[$key]);
262 //      }
function uri_pickup_implode($uri = array())
{
	if (empty($uri) || ! is_array($uri)) return NULL;

	// URI part => array(text put before the part, text put after the part),
	// listed in the order the parts appear in an URI
	static $layout = array(
		'scheme'   => array('',  '://'),
		'userinfo' => array('',  '@'),
		'host'     => array('',  ''),
		'port'     => array(':', ''),
		'path'     => array('',  ''),
		'file'     => array('',  ''),
		'query'    => array('?', ''),
		'fragment' => array('#', ''),
	);

	$result = '';
	foreach ($layout as $part => $wrap) {
		// Missing or empty parts are left out together with their punctuation
		if (! isset($uri[$part]) || $uri[$part] === '') continue;
		$result .= $wrap[0] . $uri[$part] . $wrap[1];
	}

	return $result;
}
300
301 // $array['something'] => $array['wanted']
function array_rename_keys(& $array, $keys = array('from' => 'to'), $force = FALSE, $default = '')
{
	if (! is_array($array) || ! is_array($keys)) return FALSE;

	// Without $force, every source key must exist or nothing is touched
	if (! $force) {
		foreach ($keys as $from => $to) {
			if (! isset($array[$from])) return FALSE;
		}
	}

	foreach ($keys as $from => $to) {
		if ($from === $to) continue;	// Nothing to rename

		if ($force && ! isset($array[$from])) {
			// Forced mode: a missing source key yields the default value
			$array[$to] = $default;
			continue;
		}

		$array[$to] = & $array[$from];
		unset($array[$from]);
	}

	return TRUE;
}
324
325 // ---------------------
326 // Area pickup
327
328 // Pickup all of markup areas
function area_pickup($string = '', $method = array())
{
	$area = array();
	if (empty($method)) return $area;

	// One row per markup:
	//   array(counting method, offset method, regex, group that captures the last character)
	static $markups = array(
		// Anchor tag pair
		// [OK] <a href></a>
		// [OK] <a href=  >Good site!</a>
		// [OK] <a href= "#" >test</a>
		// [OK] <a href="http://nasty.example.com">visit http://nasty.example.com/</a>
		// [OK] <a href=\'http://nasty.example.com/\' >discount foobar</a>
		// [NG] <a href="http://ng.example.com">visit http://ng.example.com _not_ended_
		array('area_anchor', 'uri_anchor', '#<a\b[^>]*\bhref\b[^>]*>.*?</a\b[^>]*(>)#is', 1),

		// phpBB's "BBCode" pair
		// [OK] [url][/url]
		// [OK] [url]http://nasty.example.com/[/url]
		// [OK] [link]http://nasty.example.com/[/link]
		// [OK] [url=http://nasty.example.com]visit http://nasty.example.com/[/url]
		// [OK] [link http://nasty.example.com/]buy something[/link]
		array('area_bbcode', 'uri_bbcode', '#\[(url|link)\b[^\]]*\].*?\[/\1\b[^\]]*(\])#is', 2),
	);

	$asap = isset($method['asap']);

	foreach ($markups as $markup) {
		list($count_key, $offset_key, $regex, $end_group) = $markup;

		// How many pairs (just "is there any" when 'asap' is requested)
		if (isset($method[$count_key])) {
			$found = array();
			if ($asap) {
				$count = preg_match($regex, $string);
			} else {
				$count = preg_match_all($regex, $string, $found);
			}
			if (! empty($count)) $area[$count_key] = $count;
		}

		// Where each pair starts and ends
		if (isset($method[$offset_key])) {
			$found = array();
			preg_match_all($regex, $string, $found, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);
			$pairs = array();
			foreach ($found as $index => $match) {
				$pairs[$index] = array(
					$match[0][1],			// Area start (beginning of the opening tag)
					$match[$end_group][1],	// Area end   (last character of the closing tag)
				);
			}
			if (! empty($pairs)) $area[$offset_key] = $pairs;
		}
	}

	// Various Wiki syntax (not implemented)
	// [text_or_uri>text_or_uri]
	// [text_or_uri:text_or_uri]
	// [text_or_uri|text_or_uri]
	// [text_or_uri->text_or_uri]
	// [text_or_uri text_or_uri] // MediaWiki
	// MediaWiki: [http://nasty.example.com/ visit http://nasty.example.com/]

	return $area;
}
397
398 // If in doubt, it's a little doubtful
399 // if (Area => inside <= Area) the counter at $a_key += $belief (-1 by default)
function area_measure($areas, & $array, $belief = -1, $a_key = 'area', $o_key = 'offset')
{
	if (! is_array($areas) || ! is_array($array)) return;

	foreach (array_keys($array) as $u_index) {
		// Entries without a counter at $a_key are never touched
		if (! isset($array[$u_index][$a_key])) continue;

		$offset = 0;
		if (isset($array[$u_index][$o_key])) {
			$offset = intval($array[$u_index][$o_key]);
		}

		foreach ($areas as $range) {
			$begin = intval($range[0]);
			$end   = intval($range[1]);

			// [Area => inside <= Area] (both bounds are exclusive)
			if ($begin < $offset && $offset < $end) {
				$array[$u_index][$a_key] += $belief;
			}
		}
	}
}
420
421 // ---------------------
422 // Spam-uri pickup
423
424 // Domain exposure callback (See spam_uri_pickup_preprocess())
425 // http://victim.example.org/?foo+site:nasty.example.com+bar
426 // => http://nasty.example.com/?refer=victim.example.org
427 // NOTE: 'refer=' is not so good for (at this time).
428 // Consider about using IP address of the victim, try to avoid that.
function _preg_replace_callback_domain_exposure($matches = array())
{
	// Preserve the victim URI as a complicity or ...
	// (only when the pattern carries a 5th capture group)
	$victim = '';
	if (isset($matches[5])) {
		// scheme :// victim.example.org / the rest of all (before victim)
		$victim = $matches[1] . '://' . $matches[2] . '/' . $matches[3];
	}

	if (! isset($matches[4])) return $victim;

	// Flipped URI: scheme :// nasty.example.com /?refer= victim.example.org
	$flipped = $matches[1] . '://' . $matches[4] . '/?refer=' . strtolower($matches[2]);

	return $flipped . ' ' . $victim;
}
452
453 // Preprocess: rawurldecode() and adding space(s) and something
454 // to detect/count some URIs _if possible_
455 // NOTE: It's maybe danger to var_dump(result). [e.g. 'javascript:']
456 // [OK] http://victim.example.org/?site:nasty.example.org
457 // [OK] http://victim.example.org/nasty.example.org
458 // [OK] http://victim.example.org/go?http%3A%2F%2Fnasty.example.org
459 // [OK] http://victim.example.org/http://nasty.example.org
// Decode $string and rewrite it so that URIs hidden inside other URIs
// (redirectors, gateways, 'site:' searches, URIs glued together) become
// separate, space-delimited URIs that uri_pickup() is able to find.
//   $string : Text to preprocess (non-strings give '')
//   return  : Preprocessed text. The original URIs are preserved.
function spam_uri_pickup_preprocess($string = '')
{
	if (! is_string($string)) return '';

	$string = rawurldecode($string);

	// Domain exposure (simple)
	// http://victim.example.org/nasty.example.org/path#frag
	// => http://nasty.example.org/?refer=victim.example.org and original
	$string = preg_replace(
		'#h?ttp://' .
		'(' .
			'ime\.nu' . '|' .	// 2ch.net
			'ime\.st' . '|' .	// 2ch.net
			'link\.toolbot\.com' . '|' .
			'urlx\.org' .
		')' .
		'/([a-z0-9.%_-]+\.[a-z0-9.%_-]+)#i',	// nasty.example.org
		'http://$2/?refer=$1 $0',				// Preserve $0 or remove?
		$string
	);

	// Domain exposure (gate-big5)
	// http://victim.example.org/gate/big5/nasty.example.org/path
	// => http://nasty.example.org/?refer=victim.example.org and original
	// NOTE: No '|' after the last host: it would add an empty alternative
	//   and so match 'http:///gate/big5/...' with no host at all.
	//   Dots are escaped so that only the listed hostnames match.
	$string = preg_replace(
		'#h?ttp://' .
		'(' .
			'big5\.51job\.com'     . '|' .
			'big5\.china\.com'     . '|' .
			'big5\.xinhuanet\.com' .
		')' .
		'/gate/big5' .
		'/([a-z0-9.%_-]+\.[a-z0-9.%_-]+)' .
		 '#i',	// nasty.example.org
		'http://$2/?refer=$1 $0',				// Preserve $0 or remove?
		$string
	);

	// Domain exposure (See _preg_replace_callback_domain_exposure())
	$string = preg_replace_callback(
		array(
			'#(http)://' .
			'(' .
				// Something Google: http://www.google.com/supported_domains
				'(?:[a-z0-9.]+\.)?google\.[a-z]{2,3}(?:\.[a-z]{2})?' .
				'|' .
				// AltaVista
				'(?:[a-z0-9.]+\.)?altavista\.com' .
			')' .
			'/' .
			'([a-z0-9?=&.%_/\'\\\+-]+)' .				// path/?query=foo+bar+
			'\bsite:([a-z0-9.%_-]+\.[a-z0-9.%_-]+)' .	// site:nasty.example.com
			//'()' .	// Preserve or remove?
			'#i',
		),
		'_preg_replace_callback_domain_exposure',
		$string
	);

	// URI exposure (uriuri => uri uri)
	$string = preg_replace(
		array(
			'#(?<! )(?:https?|ftp):/#i',
		//	'#[a-z][a-z0-9.+-]{1,8}://#i',
		//	'#[a-z][a-z0-9.+-]{1,8}://#i'
		),
		' $0',
		$string
	);

	return $string;
}
534
535 // Main function of spam-uri pickup,
536 // A wrapper function of uri_pickup()
function spam_uri_pickup($string = '', $method = array())
{
	// Fall back to the default method set
	if (! is_array($method) || empty($method)) {
		$method = check_uri_spam_method();
	}

	$string = spam_uri_pickup_preprocess($string);
	$array  = uri_pickup($string);

	// Area elevation of URIs, for '(especially external)link' intension
	if (! empty($array)) {
		// Only the offset-based area methods are relevant here
		$_method = array();
		foreach (array('uri_anchor', 'uri_bbcode') as $name) {
			if (isset($method[$name])) $_method[$name] = $method[$name];
		}

		$areas = area_pickup($string, $_method, TRUE);
		if (! empty($areas)) {
			$names = array_keys($_method);

			// References into each URI's 'area', with a zeroed counter per method
			$area_shadow = array();
			foreach (array_keys($array) as $key) {
				$area_shadow[$key] = & $array[$key]['area'];
				foreach ($names as $name) {
					$area_shadow[$key][$name] = 0;
				}
			}

			// +1 for every area of that kind the URI sits in
			foreach ($names as $name) {
				if (! isset($areas[$name])) continue;
				area_measure($areas[$name], $area_shadow, 1, $name);
			}
		}
	}

	// Remove 'offset's for area_measure()
	foreach (array_keys($array) as $key) {
		unset($array[$key]['area']['offset']);
	}

	return $array;
}
575
576
577 // ---------------------
578 // Normalization
579
580 // Scheme normalization: Renaming the schemes
581 // snntp://example.org =>  nntps://example.org
582 // NOTE: Keep the static lists simple. See also port_normalize().
function scheme_normalize($scheme = '', $abbrevs_harmfull = TRUE)
{
	// Abbreviations they have no intention of link
	static $abbrevs = array(
		'ttp'   => 'http',
		'ttps'  => 'https',
	);

	// Aliases => normalized ones
	static $aliases = array(
		'pop'   => 'pop3',
		'news'  => 'nntp',
		'imap4' => 'imap',
		'snntp' => 'nntps',
		'snews' => 'nntps',
		'spop3' => 'pop3s',
		'pops'  => 'pop3s',
	);

	if (! is_string($scheme)) return '';

	$name = strtolower($scheme);

	// An abbreviation is either expanded (treated as harmful) or dropped
	if (isset($abbrevs[$name])) {
		if ($abbrevs_harmfull) {
			$name = $abbrevs[$name];
		} else {
			$name = '';
		}
	}

	return isset($aliases[$name]) ? $aliases[$name] : $name;
}
614
615 // Hostname normalization (Destructive)
616 // www.foo     => www.foo   ('foo' seems TLD)
617 // www.foo.bar => foo.bar
618 // www.10.20   => www.10.20 (Invalid hostname)
619 // NOTE:
620 //   'www' is  mostly used as traditional hostname of WWW server.
621 //   'www.foo.bar' may be identical with 'foo.bar'.
function host_normalize($host = '')
{
	if (! is_string($host)) return '';

	$lowered = strtolower($host);

	// Strip a leading 'www.' only when what remains still ends with
	// '.' + letters, that is, still looks like 'domain.tld'
	$found = array();
	$stripped = preg_match('/^www\.(.+\.[a-z]+)$/', $lowered, $found);

	return $stripped ? $found[1] : $lowered;
}
634
635 // Port normalization: Suppress the (redundant) default port
636 // HTTP://example.org:80/ => http://example.org/
637 // HTTP://example.org:8080/ => http://example.org:8080/
638 // HTTPS://example.org:443/ => https://example.org/
function port_normalize($port, $scheme, $scheme_normalize = FALSE)
{
	// Schemes that users _maybe_ want to add protocol-handlers
	// to their web browsers. (and attackers _maybe_ want to use ...)
	// Reference: http://www.iana.org/assignments/port-numbers
	static $array = array(
		// scheme => default port
		'ftp'     =>    21,
		'ssh'     =>    22,
		'telnet'  =>    23,
		'smtp'    =>    25,
		'tftp'    =>    69,
		'gopher'  =>    70,
		'finger'  =>    79,
		'http'    =>    80,
		'pop3'    =>   110,
		'sftp'    =>   115,
		'nntp'    =>   119,
		'imap'    =>   143,
		'irc'     =>   194,
		'wais'    =>   210,
		'https'   =>   443,
		'nntps'   =>   563,
		'rsync'   =>   873,
		'ftps'    =>   990,
		'telnets' =>   992,
		'imaps'   =>   993,
		'ircs'    =>   994,
		'pop3s'   =>   995,
		'mysql'   =>  3306,
	);

	// intval() converts '0-1' to '0', so preg_match() rejects these invalid ones
	$valid = is_numeric($port) && ! ($port < 0) && ! preg_match('/[^0-9]/i', $port);
	if (! $valid) return '';

	$number = intval($port);
	if ($scheme_normalize) {
		$scheme = scheme_normalize($scheme);
	}

	// Ignore the defaults
	$is_default = isset($array[$scheme]) && $number == $array[$scheme];

	return $is_default ? '' : $number;
}
682
683 // Path normalization
684 // http://example.org => http://example.org/
685 // http://example.org#hoge => http://example.org/#hoge
686 // http://example.org/path/a/b/./c////./d => http://example.org/path/a/b/c/d
687 // http://example.org/path/../../a/../back => http://example.org/back
// Normalize a path: squeeze '//' and '/./', resolve '/../'.
//   $path     : Path to normalize (non-string or '' gives the root only)
//   $divider  : Path separator. A non-string or empty divider cannot split
//               anything, so $path is returned as it is ('' for a non-string $path)
//   $add_root : Prepend $divider to the result
//   return    : Normalized path. A trailing divider is kept if $path had one
//               and at least one path element remains.
function path_normalize($path = '', $divider = '/', $add_root = TRUE)
{
	// An empty divider would make explode() fail
	if (! is_string($divider) || $divider === '') {
		return is_string($path) ? $path : '';
	}

	$first_div = $add_root ? $divider : '';
	if (! is_string($path) || $path === '') return $first_div;

	// Does $path end with the divider?
	// NOTE: substr() is safe even when $path is shorter than $divider,
	//   where strpos() with a computed offset got an out-of-range value.
	$div_len  = strlen($divider);
	$last_div = '';
	if (strlen($path) >= $div_len && substr($path, -$div_len) === $divider) {
		$last_div = $divider;
	}

	$stack = array();
	foreach (explode($divider, $path) as $element) {
		// Remove paddings ('//' and '/./')
		if ($element === '' || $element === '.') continue;

		// Remove back-tracks ('/../'); going above the root is ignored
		if ($element === '..') {
			array_pop($stack);
		} else {
			$stack[] = $element;
		}
	}

	if (empty($stack)) return $first_div;

	return $first_div . implode($divider, $stack) . $last_div;
}
731
// DirectoryIndex normalize (Destructive and rough)
//
// $file : One file name (the last segment of a path), any letter case
// Returns:
//   ''    : $file is not a string, or it looks like a directory-index
//           document ('index', 'default.asp', 'index.html.en.gz', ...)
//   body  : The part before the first dot, lowercased, when every suffix
//           was a language/charset/encoding one ('sample.ja' => 'sample')
//   $file : Unchanged, in any other case
// TODO: sample.en.ja.html.gz => sample.html
function file_normalize($file = 'index.html.en')
{
        // Whole names that mean 'directory index' by themselves
        static $simple_defaults = array(
                'default.htm'   => TRUE,
                'default.html'  => TRUE,
                'default.asp'   => TRUE,
                'default.aspx'  => TRUE,
                'index'         => TRUE,        // Some system can omit the suffix
        );

        static $content_suffix = array(
                // index.xxx, sample.xxx
                'htm'   => TRUE,
                'html'  => TRUE,
                'shtml' => TRUE,
                'jsp'   => TRUE,
                'php'   => TRUE,
                'php3'  => TRUE,
                'php4'  => TRUE,
                'pl'    => TRUE,
                'py'    => TRUE,
                'rb'    => TRUE,
                'cgi'   => TRUE,
                'xml'   => TRUE,
        );

        static $language_suffix = array(
                // Reference: Apache 2.0.59 'AddLanguage' default
                'ca'    => TRUE,
                'cs'    => TRUE,        // cs
                'cz'    => TRUE,        // cs
                'de'    => TRUE,
                'dk'    => TRUE,        // da
                'el'    => TRUE,
                'en'    => TRUE,
                'eo'    => TRUE,
                'es'    => TRUE,
                'et'    => TRUE,
                'fr'    => TRUE,
                'he'    => TRUE,
                'hr'    => TRUE,
                'it'    => TRUE,
                'ja'    => TRUE,
                'ko'    => TRUE,
                'ltz'   => TRUE,
                'nl'    => TRUE,
                'nn'    => TRUE,
                'no'    => TRUE,
                'po'    => TRUE,
                'pt'    => TRUE,
                'pt-br' => TRUE,
                'ru'    => TRUE,
                'sv'    => TRUE,
                'zh-cn' => TRUE,
                'zh-tw' => TRUE,

                // Reference: Apache 2.0.59 default 'index.html' variants
                'ee'    => TRUE,
                'lb'    => TRUE,
                'var'   => TRUE,
        );

        static $charset_suffix = array(
                // Reference: Apache 2.0.59 'AddCharset' default
                'iso8859-1'     => TRUE, // ISO-8859-1
                'latin1'        => TRUE, // ISO-8859-1
                'iso8859-2'     => TRUE, // ISO-8859-2
                'latin2'        => TRUE, // ISO-8859-2
                'cen'           => TRUE, // ISO-8859-2
                'iso8859-3'     => TRUE, // ISO-8859-3
                'latin3'        => TRUE, // ISO-8859-3
                'iso8859-4'     => TRUE, // ISO-8859-4
                'latin4'        => TRUE, // ISO-8859-4
                'iso8859-5'     => TRUE, // ISO-8859-5
                'latin5'        => TRUE, // ISO-8859-5
                'cyr'           => TRUE, // ISO-8859-5
                'iso-ru'        => TRUE, // ISO-8859-5
                'iso8859-6'     => TRUE, // ISO-8859-6
                'latin6'        => TRUE, // ISO-8859-6
                'arb'           => TRUE, // ISO-8859-6
                'iso8859-7'     => TRUE, // ISO-8859-7
                'latin7'        => TRUE, // ISO-8859-7
                'grk'           => TRUE, // ISO-8859-7
                'iso8859-8'     => TRUE, // ISO-8859-8
                'latin8'        => TRUE, // ISO-8859-8
                'heb'           => TRUE, // ISO-8859-8
                'iso8859-9'     => TRUE, // ISO-8859-9
                'latin9'        => TRUE, // ISO-8859-9
                'trk'           => TRUE, // ISO-8859-9
                'iso2022-jp'    => TRUE, // ISO-2022-JP
                'jis'           => TRUE, // ISO-2022-JP
                'iso2022-kr'    => TRUE, // ISO-2022-KR
                'kis'           => TRUE, // ISO-2022-KR
                'iso2022-cn'    => TRUE, // ISO-2022-CN
                'cis'           => TRUE, // ISO-2022-CN
                'big5'          => TRUE,
                'cp-1251'       => TRUE, // ru, WINDOWS-1251
                'win-1251'      => TRUE, // ru, WINDOWS-1251
                'cp866'         => TRUE, // ru
                'koi8-r'        => TRUE, // ru, KOI8-r
                'koi8-ru'       => TRUE, // ru, KOI8-r
                'koi8-uk'       => TRUE, // ru, KOI8-ru
                'ua'            => TRUE, // ru, KOI8-ru
                'ucs2'          => TRUE, // ru, ISO-10646-UCS-2
                'ucs4'          => TRUE, // ru, ISO-10646-UCS-4
                'utf8'          => TRUE,

                // Reference: Apache 2.0.59 default 'index.html' variants
                'euc-kr'        => TRUE,
                'gb2312'        => TRUE,
        );

        // May uncompress by web browsers on the fly
        // Must be at the last of the filename
        // Reference: Apache 2.0.59 'AddEncoding'
        static $encoding_suffix = array(
                'z'     => TRUE,
                'gz'    => TRUE,
        );

        if (! is_string($file)) return '';
        $_file = strtolower($file);
        if (isset($simple_defaults[$_file])) return '';

        // Roughly removing language/character-set/encoding suffixes
        // References:
        //  * Apache 2 document about 'Content-negotiation', 'mod_mime' and 'mod_negotiation'
        //    http://httpd.apache.org/docs/2.0/content-negotiation.html
        //    http://httpd.apache.org/docs/2.0/mod/mod_mime.html
        //    http://httpd.apache.org/docs/2.0/mod/mod_negotiation.html
        //  * http://www.iana.org/assignments/character-sets
        //  * RFC3066: Tags for the Identification of Languages
        //    http://www.ietf.org/rfc/rfc3066.txt
        //  * ISO 639: codes of 'language names'
        $suffixes = explode('.', $_file);
        $body = array_shift($suffixes);
        if ($suffixes) {
                // Remove the last .gz/.z
                // explode() + array_shift() leave the keys as 0 .. n-1, so the
                // last key is simply count - 1. (end() takes its argument by
                // reference and must not be fed the result of array_keys().)
                $last_key = count($suffixes) - 1;
                if (isset($encoding_suffix[$suffixes[$last_key]])) {
                        unset($suffixes[$last_key]);
                }
        }
        // Cut language and charset suffixes
        foreach ($suffixes as $key => $value) {
                if (isset($language_suffix[$value]) || isset($charset_suffix[$value])) {
                        unset($suffixes[$key]);
                }
        }
        if (empty($suffixes)) return $body;

        // Index.xxx: 'index' plus exactly one remaining content suffix
        $count = count($suffixes);
        reset($suffixes);
        $current = current($suffixes);
        if ($body == 'index' && $count == 1 && isset($content_suffix[$current])) return '';

        // Anything else is returned as it was given (letter case included)
        return $file;
}
894
// Sort query-strings if possible (Destructive and rough)
// [OK] &&&&f=d&b&d&c&a=0dd  =>  a=0dd&b&c&d&f=d
// [OK] nothing==&eg=dummy&eg=padding&eg=foobar  =>  eg=foobar
//
// $string         : The query string (without the leading '?')
// $equal          : Treat 'key=value' chunks specially: a later chunk with
//                   the same key overrides an earlier one
// $equal_cutempty : With $equal, drop 'key=' chunks that have an empty value
// $stortolower    : Lowercase the whole string first
// Returns the chunks in natural order joined by '&', or '' for a non-string.
function query_normalize($string = '', $equal = TRUE, $equal_cutempty = TRUE, $stortolower = TRUE)
{
        if (! is_string($string)) return '';

        $query = $stortolower ? strtolower($string) : $string;

        // Split, throwing away the empty chunks made by '&' paddings
        $chunks = array();
        foreach (explode('&', $query) as $chunk) {
                if ($chunk != '') $chunks[] = $chunk;
        }

        // Consider '='-separated input and paddings
        if ($equal) {
                $bare  = array();       // Chunks without '='
                $pairs = array();       // key => value (the last one wins)
                foreach ($chunks as $chunk) {
                        if (strpos($chunk, '=') === FALSE) {
                                $bare[] = $chunk;
                                continue;
                        }
                        $pair  = explode('=', $chunk, 2);
                        $name  = $pair[0];
                        $data  = ltrim($pair[1], '=');  // 'key===value' => 'value'
                        if ($equal_cutempty && $data == '') continue;
                        $pairs[$name] = $data;
                }

                // Bare chunks first, then the surviving pairs
                foreach ($pairs as $name => $data) {
                        $bare[] = $name . '=' . $data;
                }
                $chunks = $bare;
        }

        natsort($chunks);

        return implode('&', $chunks);
}
937
938 // ---------------------
939 // Part One : Checker
940
// Rough implementation of globbing
//
// USAGE: $regex = '/^' . generate_glob_regex('*.txt', '/') . '$/i';
//
// '*' becomes '.*' and '?' becomes '.'; every other character is escaped
// by preg_quote() with $divider as the regex delimiter.
// Returns '' for a non-string $string.
function generate_glob_regex($string = '', $divider = '/')
{
        // Wildcard => array(placeholder that survives preg_quote(), regex)
        // NOTE: '[' and ']' are intentionally not supported:
        //       Maybe cause regex compilation error (e.g. '[]')
        static $wildcards = array(
                '*' => array('_AST_', '.*'),
                '?' => array('_QUE_', '.'),
        );

        if (! is_string($string)) return '';

        $regex = $string;

        // Hide the wildcards behind their placeholders
        foreach ($wildcards as $glob => $pair) {
                $regex = str_replace($glob, $pair[0], $regex);
        }

        $regex = preg_quote($regex, $divider);

        // Unhide: placeholders to regex fragments
        foreach ($wildcards as $pair) {
                $regex = str_replace($pair[0], $pair[1], $regex);
        }

        return $regex;
}
974
// Rough hostname checker
// [OK] 192.168.
// TODO: Strict digit, 0x, CIDR, IPv6
//
// Returns 4 when the WHOLE of $string looks like a dot-decimal IPv4
// address ('10.20.30.40') or a leading part of one that ends with a dot
// ('192.', '192.168.', '192.168.0.'). Returns 0 otherwise, including for
// non-string input. The digits themselves are not range-checked.
function is_ip($string = '')
{
        if (! is_string($string)) return 0;

        // The alternation must be grouped: without the group, '^' applies
        // to the first alternative only and '$' to the second one only, so
        // '1.2.3.4.example.com' or 'example1.' would be taken for an IP.
        $pattern = '/^(?:' .
                '(?:[0-9]{1,3}\.){3}[0-9]{1,3}' . '|' .        // Full address
                '(?:[0-9]{1,3}\.){1,3}' .                       // Leading part
                ')$/';

        if (preg_match($pattern, $string)) {
                return 4;       // Seems IPv4(dot-decimal)
        } else {
                return 0;       // Seems not IP
        }
}
989
// Generate host (FQDN, IPv4, ...) regex
// 'localhost'     : Matches with 'localhost' only
// 'example.org'   : Matches with 'example.org' only (See host_normalize() about 'www')
// '.example.org'  : Matches with ALL FQDN ended with '.example.org'
// '*.example.org' : Almost the same of '.example.org' except 'www.example.org'
// '10.20.30.40'   : Matches with IPv4 address '10.20.30.40' only
// [TODO] '192.'   : Matches with all IPv4 hosts started with '192.'
// TODO: IPv4, CIDR?, IPv6
//
// The result is a regex body only: the caller adds delimiters and anchors.
// Returns '' for a non-string $string.
function generate_host_regex($string = '', $divider = '/')
{
        if (! is_string($string)) return '';

        // A dot-less name or an IPv4-like string: plain globbing
        if (mb_strpos($string, '.') === FALSE || is_ip($string)) {
                return generate_glob_regex($string, $divider);
        }

        // FQDN or something: look at the first label only
        list($head, $tail) = explode('.', $string, 2);
        if ($head == '') {
                $prefix = '(?:.*\.)?';  // And all related FQDN
        } else if ($head == '*') {
                $prefix = '.*\.';       // All subdomains/hosts only
        } else {
                return generate_glob_regex($string, $divider);
        }

        return $prefix . generate_glob_regex($tail, $divider);
}
1023
// Load (once) and return the blocklist regexes defined by SPAM_INI_FILE
//
// $list === NULL : Forget the cache (the next call reads the file again)
//                  and return array()
// $list === ''   : Return everything: array('list' => ..., listname => ...)
// $list === name : Return that entry only, or array() if there is none
//
// The ini file is include()d and is expected to fill $blocklist:
//   $blocklist['list']   : array(listname => bool). TRUE seems to mean
//                          'hosts on this list are blocked' (see the
//                          default below); the default is used when the
//                          file does not define it
//   $blocklist[listname] : Entries (or one level of named groups of
//                          entries) handed to get_blocklist_add()
function get_blocklist($list = '')
{
        static $regexes;

        if ($list === NULL) {
                $regexes = NULL;        // Unset
                return array();
        }

        if (! isset($regexes)) {
                $regexes = array();
                if (file_exists(SPAM_INI_FILE)) {
                        $blocklist = array();
                        include(SPAM_INI_FILE);
                        //      $blocklist['badhost'] = array(
                        //              '*.blogspot.com',       // Blog services's subdomains (only)
                        //              'IANA-examples' => '#^(?:.*\.)?example\.(?:com|net|org)$#',
                        //      );
                        if (! isset($blocklist['list']) || ! is_array($blocklist['list'])) {
                                // Default
                                $blocklist['list'] = array(
                                        'goodhost' => FALSE,
                                        'badhost'  => TRUE,
                                );
                        }
                        // Publish the table in BOTH cases. If the default were
                        // not stored here, get_blocklist('list') would return
                        // array() and callers would have no list to walk.
                        $regexes['list'] = $blocklist['list'];

                        foreach (array_keys($blocklist['list']) as $_list) {
                                if (! isset($blocklist[$_list])) continue;
                                foreach ($blocklist[$_list] as $key => $value) {
                                        if (is_array($value)) {
                                                // A named group of entries
                                                $regexes[$_list][$key] = array();
                                                foreach ($value as $_key => $_value) {
                                                        get_blocklist_add($regexes[$_list][$key], $_key, $_value);
                                                }
                                        } else {
                                                get_blocklist_add($regexes[$_list], $key, $value);
                                        }
                                }
                                unset($blocklist[$_list]);
                        }
                }
        }

        if ($list === '') {
                return $regexes;        // ALL
        } else if (isset($regexes[$list])) {
                return $regexes[$list];
        } else {
                return array();
        }
}
1076
// Subroutine of get_blocklist()
//
// Adds one entry to $array:
//   string $key   : $value is stored as is, under $key (Treat $value as a regex)
//   any other key : $value is a host pattern; it is stored under itself,
//                   converted by generate_host_regex() into an anchored,
//                   case-insensitive regex
function get_blocklist_add(& $array, $key = 0, $value = '*.example.org')
{
        if (! is_string($key)) {
                $array[$value] = '/^' . generate_host_regex($value, '/') . '$/i';
                return;
        }

        $array[$key] = $value;
}
1086
// Blocklist metrics: Separate $host, to $blocked and not blocked
//
// $hosts : (In/Out) Host names. Every host matched by a regex of the
//          lists named in $keys is REMOVED from it
// $keys  : List name(s) to apply, in this order
// $asap  : Stop walking a list at its first match, if the 'list' table of
//          get_blocklist() marks that list with a true value
// Returns array(listname => array(label => matched hosts)), or, for a
// grouped list, array(listname => array(group => array(label => hosts))).
function blocklist_distiller(& $hosts, $keys = array('goodhost', 'badhost'), $asap = FALSE)
{
        if (! is_array($hosts)) $hosts = array($hosts);
        if (! is_array($keys))  $keys  = array($keys);

        $list = get_blocklist('list');
        $blocked = array();

        foreach ($keys as $key) {
                // $key may be missing from the 'list' table (a caller-supplied
                // name, or no table at all): treat that as 'do not stop'
                // instead of reading an undefined index
                $stop = $asap && ! empty($list[$key]);

                foreach (get_blocklist($key) as $label => $regex) {
                        if (is_array($regex)) {
                                // A named group of regexes
                                foreach ($regex as $_label => $_regex) {
                                        $group = preg_grep($_regex, $hosts);
                                        if ($group) {
                                                $hosts = array_diff($hosts, $group);
                                                $blocked[$key][$label][$_label] = $group;
                                                // NOTE(review): this leaves the group only,
                                                // the next $label is still examined --
                                                // confirm that is intended
                                                if ($stop) break;
                                        }
                                }
                        } else {
                                $group = preg_grep($regex, $hosts);
                                if ($group) {
                                        $hosts = array_diff($hosts, $group);
                                        $blocked[$key][$label] = $group;
                                        if ($stop) break;
                                }
                        }
                }
        }

        return $blocked;
}
1120
// Default (enabled) methods and thresholds (for content insertion)
//
// $times  : Multiplier of the quantity-like thresholds
// $t_area : Threshold of the 'area' checks
// $rule   : Add the boolean rules too
// A threshold that ends up negative is left out of the result.
function check_uri_spam_method($times = 1, $t_area = 0, $rule = TRUE)
{
        $times  = intval($times);
        $t_area = intval($t_area);

        $candidates = array(
                // Thresholds
                'quantity'     =>  8 * $times,  // Allow N URIs
                'non_uniqhost' =>  3 * $times,  // Allow N duped (and normalized) Hosts
                //'non_uniquri'=>  3 * $times,  // Allow N duped (and normalized) URIs

                // Areas
                'area_anchor'  => $t_area,      // Using <a href> HTML tag
                'area_bbcode'  => $t_area,      // Using [url] or [link] BBCode
                //'uri_anchor' => $t_area,      // URI inside <a href> HTML tag
                //'uri_bbcode' => $t_area,      // URI inside [url] or [link] BBCode
        );

        // Keep the non-negative thresholds only
        $method = array();
        foreach ($candidates as $name => $threshold) {
                if ($threshold >= 0) $method[$name] = $threshold;
        }

        if ($rule) {
                // Rules
                //$method['asap']   = TRUE;     // Quit or return As Soon As Possible
                $method['uniqhost'] = TRUE;     // Show uniq host (at block notification mail)
                $method['badhost']  = TRUE;     // Check badhost
        }

        return $method;
}
1157
// Simple/fast spam check
//
// $target : A string to examine, or an array of strings (each one is
//           examined by a recursive call and the reports are merged;
//           non-string elements are skipped)
// $method : array(theme => threshold or flag) of the checks to do.
//           Not an array, or empty: check_uri_spam_method() is used.
//           If 'asap' is set, the check stops at the first positive.
// Returns the report:
//   'method'  => The $method actually used
//   'sum'     => array(theme => volume found (int))
//   'is_spam' => array(theme => TRUE). Not empty means 'spam'
//   'blocked' => Hosts blocked (see blocklist_distiller())
//   'hosts'   => Hosts not blocked
function check_uri_spam($target = '', $method = array())
{
        // Return value
        $progress = array(
                'method'  => array(
                        // Theme to do  => Dummy, optional value, or optional array()
                        //'quantity'    => 8,
                        //'uniqhost'    => TRUE,
                        //'non_uniqhost'=> 3,
                        //'non_uniquri' => 3,
                        //'badhost'     => TRUE,
                        //'area_anchor' => 0,
                        //'area_bbcode' => 0,
                        //'uri_anchor'  => 0,
                        //'uri_bbcode'  => 0,
                ),
                'sum' => array(
                        // Theme        => Volume found (int)
                ),
                'is_spam' => array(
                        // Flag. If something defined here,
                        // one or more spam will be included
                        // in this report
                ),
                'blocked' => array(
                        // Hosts blocked
                        //'category' => array(
                        //      'host',
                        //)
                ),
                'hosts' => array(
                        // Hosts not blocked
                ),
        );

        // Aliases
        $sum     = & $progress['sum'];
        $is_spam = & $progress['is_spam'];
        $progress['method'] = & $method;        // Argument
        $blocked = & $progress['blocked'];
        $hosts   = & $progress['hosts'];
        $asap    = isset($method['asap']);

        // Init
        if (! is_array($method) || empty($method)) {
                $method = check_uri_spam_method();
        }
        foreach (array_keys($method) as $key) {
                if (! isset($sum[$key])) $sum[$key] = 0;
        }

        if (is_array($target)) {
                foreach ($target as $str) {
                        if (! is_string($str)) continue;

                        $_progress = check_uri_spam($str, $method);     // Recurse

                        // Merge $sum
                        foreach ($_progress['sum'] as $key => $value) {
                                if (! isset($sum[$key])) {
                                        $sum[$key] = $value;
                                } else {
                                        $sum[$key] += $value;
                                }
                        }

                        // Merge $is_spam
                        foreach (array_keys($_progress['is_spam']) as $key) {
                                $is_spam[$key] = TRUE;
                                if ($asap) break;
                        }
                        if ($asap && $is_spam) break;

                        // Merge only
                        $blocked = array_merge_recursive($blocked, $_progress['blocked']);
                        $hosts   = array_merge_recursive($hosts,   $_progress['hosts']);
                }

                // Unique values
                $blocked = array_unique_recursive($blocked);
                $hosts   = array_unique_recursive($hosts);

                // Recount $sum['badhost']
                $sum['badhost'] = array_count_leaves($blocked);

                return $progress;
        }

        // Area: There's HTML anchor tag / 'BBCode' linking tag
        $_asap = $asap ? array('asap' => TRUE) : array();
        foreach (array('area_anchor', 'area_bbcode') as $key) {
                if ($asap && $is_spam) break;
                if (! isset($method[$key])) continue;

                $result = area_pickup($target, array($key => TRUE) + $_asap);
                if ($result) {
                        $sum[$key] = $result[$key];
                        if ($sum[$key] > $method[$key]) {
                                $is_spam[$key] = TRUE;
                        }
                }
        }

        // Return if ...
        if ($asap && $is_spam) return $progress;

        // URI: Pickup
        $pickups = uri_pickup_normalize(spam_uri_pickup($target, $method));

        // Return if ...
        if (empty($pickups)) return $progress;

        // URI: Check quantity
        // ('quantity' may be absent from $method, so it may not be initialized yet)
        if (! isset($sum['quantity'])) $sum['quantity'] = 0;
        $sum['quantity'] += count($pickups);
        if ((! $asap || ! $is_spam) && isset($method['quantity']) &&
                $sum['quantity'] > $method['quantity']) {
                $is_spam['quantity'] = TRUE;
        }

        // URI: used inside HTML anchor tag pair / 'BBCode' pair
        foreach (array('uri_anchor', 'uri_bbcode') as $key) {
                if ($asap && $is_spam) break;
                if (! isset($method[$key])) continue;

                foreach ($pickups as $pickup) {
                        if (! isset($pickup['area'][$key])) continue;

                        $sum[$key] += $pickup['area'][$key];
                        if ($sum[$key] > $method[$key]) {
                                $is_spam[$key] = TRUE;
                        }
                        if ($asap && $is_spam) break;
                }
        }

        // URI: Uniqueness (and removing non-uniques)
        if ((! $asap || ! $is_spam) && isset($method['non_uniquri'])) {

                $uris = array();
                foreach (array_keys($pickups) as $key) {
                        $uris[$key] = uri_pickup_implode($pickups[$key]);
                }
                $count = count($uris);
                $uris  = array_unique($uris);
                $sum['non_uniquri'] += $count - count($uris);
                if ($sum['non_uniquri'] > $method['non_uniquri']) {
                        $is_spam['non_uniquri'] = TRUE;
                }
                if (! $asap || ! $is_spam) {
                        // Keep one pickup per distinct URI
                        foreach (array_diff(array_keys($pickups),
                                array_keys($uris)) as $remove) {
                                unset($pickups[$remove]);
                        }
                }
                unset($uris);
        }

        // Return if ...
        if ($asap && $is_spam) return $progress;

        // Host: Uniqueness (uniq / non-uniq)
        foreach ($pickups as $pickup) $hosts[] = $pickup['host'];
        $hosts = array_unique($hosts);
        // ('uniqhost' may be absent from $method, so it may not be initialized yet)
        if (! isset($sum['uniqhost'])) $sum['uniqhost'] = 0;
        $sum['uniqhost'] += count($hosts);
        if ((! $asap || ! $is_spam) && isset($method['non_uniqhost'])) {
                $sum['non_uniqhost'] = $sum['quantity'] - $sum['uniqhost'];
                if ($sum['non_uniqhost'] > $method['non_uniqhost']) {
                        $is_spam['non_uniqhost'] = TRUE;
                }
        }

        // Return if ...
        if ($asap && $is_spam) return $progress;

        // URI: Bad host (Separate good/bad hosts from $hosts)
        if ((! $asap || ! $is_spam) && isset($method['badhost'])) {

                // is_badhost()
                $list = get_blocklist('list');
                $blocked = blocklist_distiller($hosts, array_keys($list), $asap);
                foreach ($list as $key => $type) {
                        if (! $type) unset($blocked[$key]); // Ignore goodhost etc
                }
                unset($list);

                if (! empty($blocked)) $is_spam['badhost'] = TRUE;
        }

        return $progress;
}
1379
// Count leaves (A leaf = value that is not an array, or an empty array)
//
// $count_empty : Count an empty array as one leaf too (By default it
//                counts for nothing)
function array_count_leaves($array = array(), $count_empty = FALSE)
{
        // A scalar is a leaf by itself
        if (! is_array($array)) return 1;

        // So is an empty array, but on demand only
        if ($count_empty && empty($array)) return 1;

        // Recurse
        $leaves = 0;
        foreach (array_keys($array) as $key) {
                $leaves += array_count_leaves($array[$key], $count_empty);
        }

        return $leaves;
}
1392
// An array-leaves to a flat array
//
// $array  : A (nested) array. Anything else is returned as is
// $unique : Remove the duplicated leaves from the result
// Returns every non-array value found, in depth-first order, as a list
// (the keys are thrown away).
function array_flat_leaves($array, $unique = TRUE)
{
        if (! is_array($array)) return $array;

        $flat = array();
        foreach ($array as $value) {
                if (is_array($value)) {
                        // Recurse, always keeping the duplicates here: they are
                        // removed once, at the end, and only if $unique says so
                        foreach (array_flat_leaves($value, FALSE) as $leaf) {
                                $flat[] = $leaf;
                        }
                } else {
                        $flat[] = $value;
                }
        }

        return $unique ? array_values(array_unique($flat)) : $flat;
}
1412
// An array() to an array leaf
//
// Turns a list of keys into one nested branch:
//   array_leaf(array('A', 'B', 'C.D'))         => array('A' => array('B' => 'C.D'))
//   array_leaf(array('A', 'B', 'C.D'), TRUE)   => array('A' => array('B' => array('C.D' => TRUE)))
//
// $array : The keys, outermost first. Elements that are neither a string
//          nor an int are ignored. A non-array is returned as is
// $stem  : FALSE: the last key becomes the leaf value
//          TRUE:  the last key stays a key, and $edge becomes its value
// $edge  : The leaf value for $stem
// Returns array() when there is no usable key at all.
function array_leaf($array = array('A', 'B', 'C.D'), $stem = FALSE, $edge = TRUE)
{
        if (! is_array($array)) return $array;

        $keys = array();
        foreach ($array as $arg) {
                if (is_string($arg) || is_int($arg)) $keys[] = $arg;
        }
        if (empty($keys)) return array();       // Nothing to build a branch from

        // The innermost node
        $last = array_pop($keys);
        if ($stem) {
                $node = array($last => $edge);
        } else {
                // Pass the value through an array key, so that it is
                // normalized the way a key is (e.g. '1' => 1)
                $probe = array($last => TRUE);
                $node  = key($probe);
        }

        // Wrap it, from the inside to the outside
        foreach (array_reverse($keys) as $key) {
                $node = array($key => $node);
        }

        return $node;   // array('A' => array('B' => 'C.D'))
}
1434
1435
1436 // ---------------------
1437 // Reporting
1438
// Summarize $progress (blocked only)
//
// $progress    : A report shaped like the return value of check_uri_spam()
// $blockedonly : TRUE:  list the themes flagged in 'is_spam'
//                FALSE: list 'theme(volume)' for every theme of 'sum' that
//                       is part of 'method' and has a non-zero volume
// Returns the items joined by ', ', or '' when there is nothing to tell
// (missing parts of $progress are treated as empty).
function summarize_spam_progress($progress = array(), $blockedonly = FALSE)
{
        $tmp = array();

        if ($blockedonly) {
                if (isset($progress['is_spam']) && is_array($progress['is_spam'])) {
                        $tmp = array_keys($progress['is_spam']);
                }
        } else if (isset($progress['sum']) && is_array($progress['sum'])) {
                $method = isset($progress['method']) ? $progress['method'] : array();
                foreach ($progress['sum'] as $key => $value) {
                        if (isset($method[$key]) && $value) {
                                $tmp[] = $key . '(' . $value . ')';
                        }
                }
        }

        return implode(', ', $tmp);
}
1458
// Summarize the blocked hosts of $progress, as a shrunk var_export() text
//
// $progress : A report having 'blocked' => array(list => array(group => hosts))
// Returns '' when nothing is blocked.
function summarize_detail_badhost($progress = array())
{
        if (empty($progress['blocked'])) return '';

        // Flat per group
        $summary = array();
        foreach ($progress['blocked'] as $list => $groups) {
                foreach ($groups as $group => $members) {
                        $flat = implode(', ', array_flat_leaves($members));
                        if ($flat === $group) {
                                // The label tells nothing more than its only host
                                $summary[$list][]       = $flat;
                        } else {
                                $summary[$list][$group] = $flat;
                        }
                }
        }

        // Shrink per list
        // From: 'A-1' => array('ie.to')
        // To:   'A-1' => 'ie.to'
        foreach (array_keys($summary) as $list) {
                $entry = $summary[$list];
                if (! is_array($entry)) continue;
                if (count($entry) == 1 && is_numeric(key($entry))) {
                        $summary[$list] = current($entry);
                }
        }

        return var_export_shrink($summary, TRUE, TRUE);
}
1489
// Summarize the not-blocked hosts of $progress, grouped by the domain
// that whois_responsibility() gives, as a shrunk var_export() text
//
// $progress : A report having 'hosts' => array of host names
// Returns '' when 'hosts' is missing, not an array, or empty.
function summarize_detail_newtral($progress = array())
{
        if (empty($progress['hosts']) || ! is_array($progress['hosts'])) return '';

        // Generate a $trie
        $trie = array();
        foreach ($progress['hosts'] as $host) {

                // Try to shorten (pre) -- array('example.com', 'bar', 'foo')
                $resp = whois_responsibility($host);    // 'example.com'
                $rest = rtrim(substr($host, 0, - strlen($resp)), '.');  // 'foo.bar'
                if ($rest) {
                        $parts = array_merge(
                                array($resp),
                                explode('.', delimiter_reverse('.' . $rest))
                        );
                } else {
                        $parts = array($resp, $rest);
                }

                $branch = array_leaf($parts, TRUE, $host);
                $trie   = array_merge_recursive($trie, $branch);
        }

        // Try to shorten (post, non-recursive) -- 'foo.bar.example.com'
        array_joinbranch_leaf($trie, '.', 0, TRUE);

        // Sort and flatten -- 'A.foo.bar.example.com, B.foo.bar.example.com'
        foreach (array_keys($trie) as $domain) {
                if (! is_array($trie[$domain])) continue;
                ksort_by_domain($trie[$domain]);
                $trie[$domain] = implode(', ', array_flat_leaves($trie[$domain]));
        }

        // TODO: ltrim('.') from $trie

        ksort_by_domain($trie);

        // TODO: from array('foobar' => 'foobar') to 'foobar'

        return var_export_shrink($trie, TRUE, TRUE);
}
1537
// Sort an array by its keys, comparing the keys as strings
// after each one has been passed through delimiter_reverse().
// The array is rebuilt in place; the values are kept, only the order changes.
function ksort_by_domain(& $array)
{
	// reversed key => original key
	$order = array();
	$names = array_keys($array);
	foreach($names as $name) {
		$order[delimiter_reverse($name)] = $name;
	}

	ksort($order, SORT_STRING);

	// Rebuild in sorted order, binding by reference to the original elements
	$sorted = array();
	foreach(array_values($order) as $name) {
		$sorted[$name] = & $array[$name];
	}

	$array = $sorted;
}
1552
// Join a chain of single-element arrays into one key, in place:
//   array('F' => array('B' => array('C' => array('d' => array('' => 'foobar')))))
// becomes
//   array('F.B.C.d.' => 'foobar')
//
// $array   : the array to modify (by reference); only its top-level keys are visited
// $delim   : glue between the joined keys
// $limit   : 0 for no limit, otherwise stop descending once this many
//            keys have been joined below the top-level one
// $reverse : TRUE to join the keys deepest-first
// Returns  : array(joined key => number of keys joined) for each relinked
//            entry; an empty array when $array is not an array
function array_joinbranch_leaf(& $array, $delim = '.', $limit = 0, $reverse = FALSE)
{
	$joined = array();
	if (! is_array($array)) return $joined;	// Nothing to do

	$limit = max(0, intval($limit));

	foreach(array_keys($array) as $top) {
		$path  = array();
		$depth = -1;

		// Wrap the entry so the top-level key is walked like any other level
		$wrapper = array($top => & $array[$top]);
		$node    = & $wrapper;
		while(is_array($node) && count($node) == 1) {
			$step   = key($node);
			$path[] = $step;
			++$depth;
			$node   = & $node[$step];
			if ($limit != 0 && $depth == $limit) break;
		}

		// Only the top-level key was collected: nothing to join
		if ($depth == 0) continue;

		// Relink the value found at the end of the chain under the joined key
		$joinkey = implode($delim, $reverse ? array_reverse($path) : $path);

		unset($array[$top]);
		$array[$joinkey]  = & $node;
		$joined[$joinkey] = $depth + 1;	// joined length
	}

	return $joined;
}
1590
1591
// Check responsibility-root of the FQDN
// 'foo.bar.example.com'        => 'example.com'        (.com        has the last whois for it)
// 'foo.bar.example.au'         => 'example.au'         (.au         has the last whois for it)
// 'foo.bar.example.edu.au'     => 'example.edu.au'     (.edu.au     has the last whois for it)
// 'foo.bar.example.act.edu.au' => 'example.act.edu.au' (.act.edu.au has the last whois for it)
//
// $fqdn     : host name to examine; its labels are matched from the right-most one
// $parent   : TRUE to stop at the last label found in the table below
//             ('foo.example.co.jp' => 'co.jp'), without the label registered under it
// $implicit : TRUE to append the 2nd label when only one label was collected
//             and the name has more than one ('bar.foo.something' => 'foo.something')
// Returns   : the responsibility-root, or '' if $fqdn is not a string
//             or no label was collected
function whois_responsibility($fqdn = 'foo.bar.example.com', $parent = FALSE, $implicit = TRUE)
{
	// Domains who have 2nd and/or 3rd level domains
	static $domain = array(

		// ccTLD: Australia
		// http://www.auda.org.au/
		// NIC  : http://www.aunic.net/
		// Whois: http://www.ausregistry.com.au/
		'au' => array(
			// .au Second Level Domains
			// http://www.auda.org.au/domains/
			'asn'   => TRUE,
			'com'   => TRUE,
			'conf'  => TRUE,
			'csiro' => TRUE,
			'edu'   => array(	// http://www.domainname.edu.au/
				// Geographic
				'act' => TRUE,
				'nt'  => TRUE,
				'nsw' => TRUE,
				'qld' => TRUE,
				'sa'  => TRUE,
				'tas' => TRUE,
				'vic' => TRUE,
				'wa'  => TRUE,
			),
			'gov'   => array(
				// Geographic
				'act' => TRUE,	// Australian Capital Territory
				'nt'  => TRUE,	// Northern Territory
				'nsw' => TRUE,	// New South Wales
				'qld' => TRUE,	// Queensland
				'sa'  => TRUE,	// South Australia
				'tas' => TRUE,	// Tasmania
				'vic' => TRUE,	// Victoria
				'wa'  => TRUE,	// Western Australia
			),
			'id'    => TRUE,
			'net'   => TRUE,
			'org'   => TRUE,
			'info'  => TRUE,
		),

		// ccTLD: China
		// NIC  : http://www.cnnic.net.cn/en/index/
		// Whois: http://ewhois.cnnic.cn/
		'cn' => array(
			// Provisional Administrative Rules for Registration of Domain Names in China
			// http://www.cnnic.net.cn/html/Dir/2003/11/27/1520.htm

			// Organizational
			'ac'  => TRUE,
			'com' => TRUE,
			'edu' => TRUE,
			'gov' => TRUE,
			'net' => TRUE,
			'org' => TRUE,

			// Geographic
			'ah' => TRUE,
			'bj' => TRUE,
			'cq' => TRUE,
			'fj' => TRUE,
			'gd' => TRUE,
			'gs' => TRUE,
			'gx' => TRUE,
			'gz' => TRUE,
			'ha' => TRUE,
			'hb' => TRUE,
			'he' => TRUE,
			'hi' => TRUE,
			'hk' => TRUE,
			'hl' => TRUE,
			'hn' => TRUE,
			'jl' => TRUE,
			'js' => TRUE,
			'jx' => TRUE,
			'ln' => TRUE,
			'mo' => TRUE,
			'nm' => TRUE,
			'nx' => TRUE,
			'qh' => TRUE,
			'sc' => TRUE,
			'sd' => TRUE,
			'sh' => TRUE,
			'sn' => TRUE,
			'sx' => TRUE,
			'tj' => TRUE,
			'tw' => TRUE,
			'xj' => TRUE,
			'xz' => TRUE,
			'yn' => TRUE,
			'zj' => TRUE,
		),

		// ccTLD: South Korea
		// NIC  : http://www.nic.or.kr/english/
		// Whois: http://whois.nida.or.kr/english/
		'kr' => array(
			// .kr domain policy [appendix 1] : Qualifications for Second Level Domains
			// http://domain.nida.or.kr/eng/policy.jsp

			// Organizational
			// NOTE: Keys must not have surrounding spaces, or they never match a label
			'co'  => TRUE,
			'ne'  => TRUE,
			'or'  => TRUE,
			're'  => TRUE,
			'pe'  => TRUE,
			'go'  => TRUE,
			'mil' => TRUE,
			'ac'  => TRUE,
			'hs'  => TRUE,
			'ms'  => TRUE,
			'es'  => TRUE,
			'sc'  => TRUE,
			'kg'  => TRUE,

			// Geographic
			'seoul'     => TRUE,
			'busan'     => TRUE,
			'daegu'     => TRUE,
			'incheon'   => TRUE,
			'gwangju'   => TRUE,
			'daejeon'   => TRUE,
			'ulsan'     => TRUE,
			'gyeonggi'  => TRUE,
			'gangwon'   => TRUE,
			'chungbuk'  => TRUE,
			'chungnam'  => TRUE,
			'jeonbuk'   => TRUE,
			'jeonnam'   => TRUE,
			'gyeongbuk' => TRUE,
			'gyeongnam' => TRUE,
			'jeju'      => TRUE,
		),

		// ccTLD: Japan
		// NIC  : http://jprs.co.jp/en/
		// Whois: http://whois.jprs.jp/en/
		'jp' => array(
			// Guide to JP Domain Name
			// http://jprs.co.jp/en/jpdomain.html

			// Organizational
			'ac' => TRUE,
			'ad' => TRUE,
			'co' => TRUE,
			'go' => TRUE,
			'gr' => TRUE,
			'lg' => TRUE,
			'ne' => TRUE,
			'or' => TRUE,

			// Geographic
			//
			// Examples for 3rd level domains
			//'kumamoto'  => array(
			//	// http://www.pref.kumamoto.jp/link/list.asp#4
			//	'amakusa'   => TRUE,
			//	'hitoyoshi' => TRUE,
			//	'jonan'     => TRUE,
			//	'kumamoto'  => TRUE,
			//	...
			//),
			'aichi'     => TRUE,
			'akita'     => TRUE,
			'aomori'    => TRUE,
			'chiba'     => TRUE,
			'ehime'     => TRUE,
			'fukui'     => TRUE,
			'fukuoka'   => TRUE,
			'fukushima' => TRUE,
			'gifu'      => TRUE,
			'gunma'     => TRUE,
			'hiroshima' => TRUE,
			'hokkaido'  => TRUE,
			'hyogo'     => TRUE,
			'ibaraki'   => TRUE,
			'ishikawa'  => TRUE,
			'iwate'     => TRUE,
			'kagawa'    => TRUE,
			'kagoshima' => TRUE,
			'kanagawa'  => TRUE,
			'kawasaki'  => TRUE,
			'kitakyushu'=> TRUE,
			'kobe'      => TRUE,
			'kochi'     => TRUE,
			'kumamoto'  => TRUE,
			'kyoto'     => TRUE,
			'mie'       => TRUE,
			'miyagi'    => TRUE,
			'miyazaki'  => TRUE,
			'nagano'    => TRUE,
			'nagasaki'  => TRUE,
			'nagoya'    => TRUE,
			'nara'      => TRUE,
			'niigata'   => TRUE,
			'oita'      => TRUE,
			'okayama'   => TRUE,
			'okinawa'   => TRUE,
			'osaka'     => TRUE,
			'saga'      => TRUE,
			'saitama'   => TRUE,
			'sapporo'   => TRUE,
			'sendai'    => TRUE,
			'shiga'     => TRUE,
			'shimane'   => TRUE,
			'shizuoka'  => TRUE,
			'tochigi'   => TRUE,
			'tokushima' => TRUE,
			'tokyo'     => TRUE,
			'tottori'   => TRUE,
			'toyama'    => TRUE,
			'wakayama'  => TRUE,
			'yamagata'  => TRUE,
			'yamaguchi' => TRUE,
			'yamanashi' => TRUE,
			'yokohama'  => TRUE,
		),

		// ccTLD: Ukraine
		// NIC  : http://www.nic.net.ua/
		// Whois: http://whois.com.ua/
		'ua' => array(
			// policy for alternative 2nd level domain names (a2ld)
			// http://www.nic.net.ua/doc/a2ld
			// http://whois.com.ua/
			'cherkassy'  => TRUE,
			'chernigov'  => TRUE,
			'chernovtsy' => TRUE,
			'ck'         => TRUE,
			'cn'         => TRUE,
			'com'        => TRUE,
			'crimea'     => TRUE,
			'cv'         => TRUE,
			'dn'         => TRUE,
			'dnepropetrovsk' => TRUE,
			'donetsk'    => TRUE,
			'dp'         => TRUE,
			'edu'        => TRUE,
			'gov'        => TRUE,
			'if'         => TRUE,
			'ivano-frankivsk' => TRUE,
			'kh'         => TRUE,
			'kharkov'    => TRUE,
			'kherson'    => TRUE,
			'kiev'       => TRUE,
			'kirovograd' => TRUE,
			'km'         => TRUE,
			'kr'         => TRUE,
			'ks'         => TRUE,
			'lg'         => TRUE,
			'lugansk'    => TRUE,
			'lutsk'      => TRUE,
			'lviv'       => TRUE,
			'mk'         => TRUE,
			'net'        => TRUE,
			'nikolaev'   => TRUE,
			'od'         => TRUE,
			'odessa'     => TRUE,
			'org'        => TRUE,
			'pl'         => TRUE,
			'poltava'    => TRUE,
			'rovno'      => TRUE,
			'rv'         => TRUE,
			'sebastopol' => TRUE,
			'sumy'       => TRUE,
			'te'         => TRUE,
			'ternopil'   => TRUE,
			'uz'         => TRUE,
			'uzhgorod'   => TRUE,
			'vinnica'    => TRUE,
			'vn'         => TRUE,
			'zaporizhzhe' => TRUE,
			'zhitomir'   => TRUE,
			'zp'         => TRUE,
			'zt'         => TRUE,
		),

		// ccTLD: United Kingdom
		// NIC  : http://www.nic.uk/
		'uk' => array(
			// Second Level Domains
			// http://www.nic.uk/registrants/aboutdomainnames/sld/
			'co'     => TRUE,
			'ltd'    => TRUE,
			'me'     => TRUE,
			'net'    => TRUE,
			'nic'    => TRUE,
			'org'    => TRUE,
			'plc'    => TRUE,
			'sch'    => TRUE,

			// Delegated Second Level Domains
			// http://www.nic.uk/registrants/aboutdomainnames/sld/delegated/
			'ac'     => TRUE,
			'gov'    => TRUE,
			'mil'    => TRUE,
			'mod'    => TRUE,
			'nhs'    => TRUE,
			'police' => TRUE,
		),

		// ccTLD: United States of America
		// NIC  : http://nic.us/
		// Whois: http://whois.us/
		'us' => array(
			// See RFC1480

			// Organizational
			// NOTE: These must be keys (label => TRUE) to be found by isset()
			'dni'  => TRUE,
			'fed'  => TRUE,
			'isa'  => TRUE,
			'kids' => TRUE,
			'nsn'  => TRUE,

			// Geographical
			// United States Postal Service: State abbreviations (for postal codes)
			// http://www.usps.com/ncsc/lookups/abbreviations.html
			'ak' => TRUE, // Alaska
			'al' => TRUE, // Alabama
			'ar' => TRUE, // Arkansas
			'as' => TRUE, // American samoa
			'az' => TRUE, // Arizona
			'ca' => TRUE, // California
			'co' => TRUE, // Colorado
			'ct' => TRUE, // Connecticut
			'dc' => TRUE, // District of Columbia
			'de' => TRUE, // Delaware
			'fl' => TRUE, // Florida
			'fm' => TRUE, // Federated states of Micronesia
			'ga' => TRUE, // Georgia
			'gu' => TRUE, // Guam
			'hi' => TRUE, // Hawaii
			'ia' => TRUE, // Iowa
			'id' => TRUE, // Idaho
			'il' => TRUE, // Illinois
			'in' => TRUE, // Indiana
			'ks' => TRUE, // Kansas
			'ky' => TRUE, // Kentucky
			'la' => TRUE, // Louisiana
			'ma' => TRUE, // Massachusetts
			'md' => TRUE, // Maryland
			'me' => TRUE, // Maine
			'mh' => TRUE, // Marshall Islands
			'mi' => TRUE, // Michigan
			'mn' => TRUE, // Minnesota
			'mo' => TRUE, // Missouri
			'mp' => TRUE, // Northern mariana islands
			'ms' => TRUE, // Mississippi
			'mt' => TRUE, // Montana
			'nc' => TRUE, // North Carolina
			'nd' => TRUE, // North Dakota
			'ne' => TRUE, // Nebraska
			'nh' => TRUE, // New Hampshire
			'nj' => TRUE, // New Jersey
			'nm' => TRUE, // New Mexico
			'nv' => TRUE, // Nevada
			'ny' => TRUE, // New York
			'oh' => TRUE, // Ohio
			'ok' => TRUE, // Oklahoma
			'or' => TRUE, // Oregon
			'pa' => TRUE, // Pennsylvania
			'pr' => TRUE, // Puerto Rico
			'pw' => TRUE, // Palau
			'ri' => TRUE, // Rhode Island
			'sc' => TRUE, // South Carolina
			'sd' => TRUE, // South Dakota
			'tn' => TRUE, // Tennessee
			'tx' => TRUE, // Texas
			'ut' => TRUE, // Utah
			'va' => TRUE, // Virginia
			'vi' => TRUE, // Virgin Islands
			'vt' => TRUE, // Vermont
			'wa' => TRUE, // Washington
			'wi' => TRUE, // Wisconsin
			'wv' => TRUE, // West Virginia
			'wy' => TRUE, // Wyoming
		),
	);

	if (! is_string($fqdn)) return '';

	$result  = array();
	$dcursor = & $domain;	// Rebound level by level; the table itself is never modified
	$array   = array_reverse(explode('.', $fqdn));	// array('com', 'example', 'bar', 'foo')
	$count   = count($array);

	// Bounded loop: never reads past the last label,
	// even if every label was found in the table ('co.jp')
	for ($i = 0; $i < $count; ++$i) {
		$acursor = $array[$i];
		if (is_array($dcursor) && isset($dcursor[$acursor])) {
			$result[] = $acursor;
			$dcursor  = & $dcursor[$acursor];
		} else {
			if (! $parent) {
				$result[] = $acursor;	// Whois servers must know this subdomain
			}
			break;
		}
	}

	// Implicit responsibility: Top-Level-Domains must not be yours
	// 'bar.foo.something' => 'foo.something'
	if ($implicit && count($result) == 1 && $count > 1) {
		$result[] = $array[1];
	}

	return $result ? implode('.', array_reverse($result)) : '';
}
2007
2008
2009 // ---------------------
2010 // Exit
2011
// Free memory: hand NULL to get_blocklist()
// NOTE(review): presumably NULL makes get_blocklist() drop its loaded list -- confirm there
function spam_dispose()
{
	get_blocklist(NULL);
}
2017
// Common behavior for blocking
// NOTE: Call this function from various blocking features, to disguise the reason 'why blocked'
//
// $mode : ''     -- print a newline only
//         'dump' -- print an HTML-escaped var_export() of $data inside <pre>
//         other  -- print nothing
// $data : value to dump (used by 'dump' mode only)
// The script always terminates here; this function never returns.
function spam_exit($mode = '', $data = array())
{
	$exit = TRUE;

	// Loose comparison, same as a switch statement
	if ($mode == '') {
		echo("\n");
	} else if ($mode == 'dump') {
		echo('<pre>' . "\n");
		echo htmlspecialchars(var_export($data, TRUE));
		echo('</pre>' . "\n");
	}

	if ($exit) exit;	// Force exit
}
2037
2038
2039 // ---------------------
2040 // Simple filtering
2041
// TODO: Record them
// Simple/fast spam filter ($target: 'a string' or an array())
//
// Runs check_uri_spam() on $target. When the result is not flagged as spam,
// only spam_dispose() is called. Otherwise the administrators are notified
// via pkwk_spamnotify(), then spam_exit($exitmode) ends the request.
function pkwk_spamfilter($action, $page, $target = array('title' => ''), $method = array(), $exitmode = '')
{
	$progress = check_uri_spam($target, $method);

	// Clean: release resources and let the caller continue
	if (empty($progress['is_spam'])) {
		spam_dispose();
		return;
	}

	// Spam: report, then stop
	$target = string($target, 0);	// Removing "\0" etc
	pkwk_spamnotify($action, $page, $target, $progress, $method);
	spam_exit($exitmode, $progress);
}
2056
2057 // ---------------------
2058 // PukiWiki original
2059
// Mail to administrator(s)
//
// Does nothing unless the global $notify is set.
// Builds a summary of the blocked request from $progress and sends it,
// with a var_export() of $target as the body, through pkwk_mail_notify().
// When $method['asap'] is set, the METRICS and DETAIL_NEUTRAL_HOST
// entries are left out of the summary.
function pkwk_spamnotify($action, $page, $target = array('title' => ''), $progress = array(), $method = array())
{
	global $notify, $notify_subject;

	if (! $notify) return;

	$verbose = ! isset($method['asap']);

	$summary = array();
	$summary['ACTION'] = 'Blocked by: ' . summarize_spam_progress($progress, TRUE);
	if ($verbose) {
		$summary['METRICS'] = summarize_spam_progress($progress);
	}

	$badhost = summarize_detail_badhost($progress);
	if ($badhost != '') {
		$summary['DETAIL_BADHOST'] = $badhost;
	}

	$neutral = summarize_detail_newtral($progress);
	if ($verbose && $neutral != '') {
		$summary['DETAIL_NEUTRAL_HOST'] = $neutral;
	}

	$pagename = is_pagename($page) ? $page : '';

	$summary['COMMENT']     = $action;
	$summary['PAGE']        = '[blocked] ' . $pagename;
	$summary['URI']         = get_script_uri() . '?' . rawurlencode($page);
	$summary['USER_AGENT']  = TRUE;
	$summary['REMOTE_ADDR'] = TRUE;

	$body = var_export($target, TRUE);
	pkwk_mail_notify($notify_subject, $body, $summary, TRUE);
}
2087
2088 ?>