OSDN Git Service

Separate some utility functions to spam_util.php
[pukiwiki/pukiwiki_sandbox.git] / spam / spam.php
1 <?php
2 // $Id: spam.php,v 1.216 2009/01/02 10:44:53 henoheno Exp $
3 // Copyright (C) 2006-2007 PukiWiki Developers Team
4 // License: GPL v2 or (at your option) any later version
5 //
6 // Functions for Concept-work of spam-uri metrics
7 //
8 // (PHP 4 >= 4.3.0): preg_match_all(PREG_OFFSET_CAPTURE): $method['uri_XXX'] related feature
9
// Path of the blocklist definition file. A value defined by the includer wins
if (defined('SPAM_INI_FILE') === FALSE) {
	define('SPAM_INI_FILE', 'spam.ini.php');
}
11
12
13 // ---------------------
14 // Part One : Checker
15
// Rough implementation of globbing: translate a glob pattern into a regex
// fragment (no delimiters, no anchors)
//
//   '*' => '.*' (any run of characters, including an empty one)
//   '?' => '.'  (exactly one character)
//   '[' and ']' are NOT supported. They are quoted as literals, because an
//   unbalanced class (e.g. '[]') may cause a regex compilation error
//
// $string  : Glob pattern. A non-string value yields ''
// $divider : The regex delimiter the caller will wrap the result with.
//            It is handed to preg_quote() so that it gets escaped as well
//
// USAGE: $regex = '/^' . generate_glob_regex('*.txt', '/') . '$/i';
//
function generate_glob_regex($string = '', $divider = '/')
{
	static $wildcards = array(
		'*' => '.*',
		'?' => '.',
	);

	if (! is_string($string)) return '';

	// Split on the wildcards themselves instead of hiding them behind
	// placeholder words ('_AST_', '_QUE_'): such a word appearing literally
	// in the input would wrongly come out as a wildcard
	$tokens = preg_split('/([*?])/', $string, -1, PREG_SPLIT_DELIM_CAPTURE);

	$regex = '';
	foreach ($tokens as $token) {
		if (isset($wildcards[$token])) {
			$regex .= $wildcards[$token];			// A wildcard
		} else {
			$regex .= preg_quote($token, $divider);	// Literal text
		}
	}

	return $regex;
}
49
// Generate a regex fragment (no delimiters, no anchors) for a host
// (FQDN, IPv4, ...). Once anchored with '^' and '$' by the caller:
// 'localhost'     : Matches with 'localhost' only
// 'example.org'   : Matches with 'example.org' only (See host_normalize() about 'www')
// '.example.org'  : Matches with 'example.org' and everything ended with '.example.org'
// '*.example.org' : Matches with everything ended with '.example.org',
//                   but not with 'example.org' itself
// '10.20.30.40'   : Matches with IPv4 address '10.20.30.40' only
// [TODO] '192.'   : Matches with all IPv4 hosts started with '192.'
// TODO: IPv4, CIDR?, IPv6
//
// The rest of the string is globbed by generate_glob_regex(), so '*' and '?'
// work anywhere else in it. A non-string $string yields ''
function generate_host_regex($string = '', $divider = '/')
{
	// Special leading label => The regex that takes its place
	static $prefixes = array(
		''  => '(?:.*\.)?',	// ".example.org"
		'*' => '.*\.',		// "*.example.org"
	);

	if (! is_string($string)) return '';

	$dotted = (mb_strpos($string, '.') !== FALSE);
	if ($dotted && ! is_ip($string)) {
		// FQDN or something: Look at the first label only
		list($head, $tail) = explode('.', $string, 2);
		if (isset($prefixes[$head])) {
			return $prefixes[$head] . generate_glob_regex($tail, $divider);
		}
	}

	// "localhost", IPv4, example.org, etc
	return generate_glob_regex($string, $divider);
}
84
// Load SPAM_INI_FILE (once per cache lifetime) and return the parsed one
//
// $list === ''   : Returns ALL of the parsed data
// $list === NULL : Discards the cache, and returns array()
// otherwise      : Returns the part named $list, or array() if not found
//
// The ini file is expected to fill $blocklist like this:
//   $blocklist['list'] = array(
//       //'goodhost' => FALSE,
//       'badhost' => TRUE,
//   );
//   $blocklist['badhost'] = array(
//       '*.blogspot.com',       // Blog services's subdomains (only)
//       'IANA-examples' => '#^(?:.*\.)?example\.(?:com|net|org)$#',
//   );
// 'pre' and 'list' are the indexes: only the lists named by their keys
// are converted (see get_blocklist_add()) and kept
function get_blocklist($list = '')
{
	static $regexes;

	if ($list === NULL) {
		$regexes = NULL;	// Unset
		return array();
	}

	if (! isset($regexes)) {
		$regexes = array();

		if (file_exists(SPAM_INI_FILE)) {
			// NOTE: The ini file runs in this scope, and fills $blocklist
			$blocklist = array();
			include(SPAM_INI_FILE);

			foreach (array('pre', 'list') as $index_name) {
				if (! isset($blocklist[$index_name])) continue;

				$regexes[$index_name] = $blocklist[$index_name];

				foreach (array_keys($blocklist[$index_name]) as $name) {
					if (! isset($blocklist[$name])) continue;

					foreach ($blocklist[$name] as $label => $entry) {
						if (! is_array($entry)) {
							get_blocklist_add($regexes[$name], $label, $entry);
							continue;
						}

						// A labeled group of entries
						$regexes[$name][$label] = array();
						foreach ($entry as $sub_label => $sub_entry) {
							get_blocklist_add($regexes[$name][$label], $sub_label, $sub_entry);
						}
					}

					unset($blocklist[$name]);	// Done with the raw one
				}
			}
		}
	}

	if ($list === '') return $regexes;	// ALL of

	return isset($regexes[$list])
		? $regexes[$list]	// A part of
		: array();			// Found nothing
}
147
// Subroutine of get_blocklist(): Add a new regex to the $array
//
// String $key     : $value is already a regex for FQDN(host)s. Stored as is,
//                   with $key as its label
// Non-string $key : $value is a host pattern for generate_host_regex().
//                   Stored as an anchored, case-insensitive regex, with the
//                   pattern itself as its label. Skipped if nothing generated
function get_blocklist_add(& $array, $key = 0, $value = '*.example.org/path/to/file.html')
{
	if (is_string($key)) {
		$array[$key] = & $value;
		return;
	}

	$host_regex = generate_host_regex($value, '#');
	if (empty($host_regex)) return;

	$array[$value] = '#^' . $host_regex . '$#i';
}
160
// Blocklist metrics: Separate $hosts, to blocked and not blocked
//
// $hosts : (in/out) Host(s) to check. A non-array is wrapped into an array.
//          Every host matched by a regex is REMOVED from it
// $keys  : Name(s) of the blocklists to apply, in this order
// $asap  : If TRUE, and the blocklist is flagged TRUE in the 'list' index,
//          stop at the first hit (see the NOTEs below for how far it stops)
//
// Returns the matched hosts as
//   array(blocklist => array(label => hosts)), or
//   array(blocklist => array(label => array(sub-label => hosts)))
// for a grouped entry
function blocklist_distiller(& $hosts, $keys = array('goodhost', 'badhost'), $asap = FALSE)
{
	if (! is_array($hosts)) $hosts = array($hosts);
	if (! is_array($keys))  $keys  = array($keys);

	$list    = get_blocklist('list');
	$blocked = array();

	foreach ($keys as $key) {
		// $key may not exist in the 'list' index (e.g. a blocklist
		// registered in the 'pre' index only), so never read $list[$key]
		// without checking: it raised an undefined-index error
		$quit_early = $asap && ! empty($list[$key]);

		foreach (get_blocklist($key) as $label => $regex) {
			if (is_array($regex)) {
				// A grouped entry
				foreach ($regex as $_label => $_regex) {
					$group = preg_grep($_regex, $hosts);
					if ($group) {
						$hosts = array_diff($hosts, $group);
						$blocked[$key][$label][$_label] = $group;
						// NOTE: Leaves this group only. The next $label goes on
						if ($quit_early) break;
					}
				}
			} else {
				$group = preg_grep($regex, $hosts);
				if ($group) {
					$hosts = array_diff($hosts, $group);
					$blocked[$key][$label] = $group;
					// NOTE: Leaves this blocklist. The next $key goes on
					if ($quit_early) break;
				}
			}
		}
	}

	return $blocked;
}
194
195
196 // ---------------------
197
198
// Default (enabled) methods and thresholds (for content insertion)
//
// $times  : Multiplier for the quantity-like thresholds
// $t_area : Threshold for the area_* themes
// $rule   : If TRUE, the boolean rules are enabled too
//
// A theme whose threshold comes out negative is disabled (not returned)
function check_uri_spam_method($times = 1, $t_area = 0, $rule = TRUE)
{
	$multiplier = intval($times);
	$area_limit = intval($t_area);

	$candidates = array(
		// Thresholds
		'quantity'     => 8 * $multiplier,	// Allow N URIs
		'non_uniqhost' => 3 * $multiplier,	// Allow N duped (and normalized) Hosts
		//'non_uniquri'=> 3 * $multiplier,	// Allow N duped (and normalized) URIs

		// Areas
		'area_anchor'  => $area_limit,	// Using <a href> HTML tag
		'area_bbcode'  => $area_limit,	// Using [url] or [link] BBCode
		//'uri_anchor' => $area_limit,	// URI inside <a href> HTML tag
		//'uri_bbcode' => $area_limit,	// URI inside [url] or [link] BBCode
	);

	// Pick up non-negative thresholds only
	$method = array();
	foreach ($candidates as $theme => $threshold) {
		if ($threshold >= 0) $method[$theme] = $threshold;
	}

	if ($rule) {
		// Rules
		//$method['asap']   = TRUE;	// Quit or return As Soon As Possible
		$method['uniqhost'] = TRUE;	// Show uniq host (at block notification mail)
		$method['badhost']  = TRUE;	// Check badhost
	}

	return $method;
}
235
// Simple/fast spam check
//
// $target : A string to check, or an array of strings. Each string of an
//           array is checked separately (non-string elements are skipped)
//           and the results are merged
// $method : array(theme => threshold or flag). If it is empty or not an
//           array, check_uri_spam_method() is used instead.
//           If 'asap' is set, quit or return As Soon As Possible
//           (at the first theme that judged the $target as spam)
//
// Returns $progress = array(
//   'method'  => The $method actually used
//       //'quantity'    => 8,
//       //'uniqhost'    => TRUE,
//       //'non_uniqhost'=> 3,
//       //'non_uniquri' => 3,
//       //'badhost'     => TRUE,
//       //'area_anchor' => 0,
//       //'area_bbcode' => 0,
//       //'uri_anchor'  => 0,
//       //'uri_bbcode'  => 0,
//   'sum'     => array(theme => Volume found (int))
//   'is_spam' => array(theme => TRUE). If something is defined here,
//                one or more spam is included in this report
//   'blocked' => Hosts blocked, per blocklist
//   'hosts'   => Hosts not blocked
// )
function check_uri_spam($target = '', $method = array())
{
	// Return value
	$progress = array(
		'method'  => array(),
		'sum'     => array(),
		'is_spam' => array(),
		'blocked' => array(),
		'hosts'   => array(),
	);

	// ----------------------------------------
	// Aliases

	$sum     = & $progress['sum'];
	$is_spam = & $progress['is_spam'];
	$progress['method'] = & $method;	// Argument
	$blocked = & $progress['blocked'];
	$hosts   = & $progress['hosts'];
	$asap    = isset($method['asap']);

	// ----------------------------------------
	// Init

	if (! is_array($method) || empty($method)) {
		$method = check_uri_spam_method();
	}
	foreach (array_keys($method) as $key) {
		if (! isset($sum[$key])) $sum[$key] = 0;
	}
	if (! isset($sum['quantity'])) $sum['quantity'] = 0;

	// ----------------------------------------
	// Recurse

	if (is_array($target)) {
		foreach ($target as $str) {
			if (! is_string($str)) continue;

			$_progress = check_uri_spam($str, $method);	// Recurse

			// Merge $sum
			foreach ($_progress['sum'] as $key => $value) {
				if (! isset($sum[$key])) {
					$sum[$key] = $value;
				} else {
					$sum[$key] += $value;
				}
			}

			// Merge $is_spam
			foreach (array_keys($_progress['is_spam']) as $key) {
				$is_spam[$key] = TRUE;
				if ($asap) break;
			}
			if ($asap && $is_spam) break;

			// Merge only
			$blocked = array_merge_leaves($blocked, $_progress['blocked'], FALSE);
			$hosts   = array_merge_leaves($hosts,   $_progress['hosts'],   FALSE);
		}

		// Unique values
		$blocked = array_unique_recursive($blocked);
		$hosts   = array_unique_recursive($hosts);

		// Recount $sum['badhost']
		$sum['badhost'] = array_count_leaves($blocked);

		return $progress;
	}

	// ----------------------------------------
	// Area measure

	if (! ($asap && $is_spam)) {

		// Method pickup
		$_method = array();
		foreach (array(
				'area_anchor',	// There's HTML anchor tag
				'area_bbcode',	// There's 'BBCode' linking tag
			) as $key) {
			if (isset($method[$key])) $_method[$key] = TRUE;
		}

		$_result = FALSE;
		if ($_method) {
			$_asap   = isset($method['asap']) ? array('asap' => TRUE) : array();
			$_result = area_pickup($target, $_method + $_asap);
		}

		if ($_result) {
			foreach (array_keys($_method) as $key) {
				if (! isset($_result[$key])) continue;

				$sum[$key] = $_result[$key];
				if (isset($method[$key]) && $sum[$key] > $method[$key]) {
					$is_spam[$key] = TRUE;
				}
			}
		}

		unset($_asap, $_method, $_result);
	}

	// Return if ...
	if ($asap && $is_spam) return $progress;

	// ----------------------------------------
	// URI: Pickup

	$pickups = spam_uri_pickup($target, $method);

	// Return if ...
	if (empty($pickups)) return $progress;

	// Normalize all
	$pickups = uri_pickup_normalize($pickups);

	// ----------------------------------------
	// Pickup some part of URI

	$hosts = array();
	foreach ($pickups as $key => $pickup) {
		// Plain copy (a reference to the loop variable gains nothing).
		// A missing host stays NULL, and will be dropped with its pickup
		$hosts[$key] = isset($pickup['host']) ? $pickup['host'] : NULL;
	}

	// ----------------------------------------
	// URI: Bad host <pre-filter> (Separate good/bad hosts from $hosts)

	if (! ($asap && $is_spam) && isset($method['badhost'])) {
		$list    = get_blocklist('pre');
		$blocked = blocklist_distiller($hosts, array_keys($list), $asap);
		foreach ($list as $key => $type) {
			if (! $type) unset($blocked[$key]);	// Ignore goodhost etc
		}
		unset($list);
		if (! empty($blocked)) $is_spam['badhost'] = TRUE;
	}

	// Return if ...
	if ($asap && $is_spam) return $progress;

	// Remove blocked from $pickups
	foreach (array_keys($pickups) as $key) {
		if (! isset($hosts[$key])) {
			unset($pickups[$key]);
		}
	}

	// ----------------------------------------
	// URI: Check quantity

	$sum['quantity'] += count($pickups);
	if (! ($asap && $is_spam) && isset($method['quantity']) &&
		$sum['quantity'] > $method['quantity']) {
		$is_spam['quantity'] = TRUE;
	}

	// ----------------------------------------
	// URI: used inside HTML anchor tag pair, then inside 'BBCode' pair

	foreach (array('uri_anchor', 'uri_bbcode') as $key) {
		if ($asap && $is_spam) break;
		if (! isset($method[$key])) continue;

		foreach ($pickups as $pickup) {
			if (! isset($pickup['area'][$key])) continue;

			$sum[$key] += $pickup['area'][$key];
			if ($sum[$key] > $method[$key]) {
				$is_spam[$key] = TRUE;
				if ($asap) break;
			}
		}
	}

	// ----------------------------------------
	// URI: Uniqueness (and removing non-uniques)

	if (! ($asap && $is_spam) && isset($method['non_uniquri'])) {

		$uris = array();
		foreach (array_keys($pickups) as $key) {
			$uris[$key] = uri_pickup_implode($pickups[$key]);
		}
		$count = count($uris);
		$uris  = array_unique($uris);
		$sum['non_uniquri'] += $count - count($uris);
		if ($sum['non_uniquri'] > $method['non_uniquri']) {
			$is_spam['non_uniquri'] = TRUE;
		}
		if (! ($asap && $is_spam)) {
			foreach (array_diff(array_keys($pickups),
				array_keys($uris)) as $remove) {
				unset($pickups[$remove]);
			}
		}
		unset($uris);
	}

	// Return if ...
	if ($asap && $is_spam) return $progress;

	// ----------------------------------------
	// Host: Uniqueness (uniq / non-uniq)

	$hosts      = array_unique($hosts);
	$uniq_hosts = count($hosts);

	if (isset($sum['uniqhost'])) $sum['uniqhost'] += $uniq_hosts;
	if (! ($asap && $is_spam) && isset($method['non_uniqhost'])) {
		// Use the count itself, not $sum['uniqhost']: that one does not
		// exist when 'uniqhost' is not enabled in $method, and then ALL of
		// the URIs were counted as duplicates. With 'uniqhost' enabled,
		// both values are the same here
		$sum['non_uniqhost'] = $sum['quantity'] - $uniq_hosts;
		if ($sum['non_uniqhost'] > $method['non_uniqhost']) {
			$is_spam['non_uniqhost'] = TRUE;
		}
	}

	// Return if ...
	if ($asap && $is_spam) return $progress;

	// ----------------------------------------
	// URI: Bad host (Separate good/bad hosts from $hosts)

	if (! ($asap && $is_spam) && isset($method['badhost'])) {
		$list    = get_blocklist('list');
		$blocked = array_merge_leaves(
			$blocked,
			blocklist_distiller($hosts, array_keys($list), $asap),
			FALSE
		);
		foreach ($list as $key => $type) {
			if (! $type) unset($blocked[$key]);	// Ignore goodhost etc
		}
		unset($list);
		if (! empty($blocked)) $is_spam['badhost'] = TRUE;
	}

	// ----------------------------------------
	// End

	return $progress;
}
531
532 // ---------------------
533 // Reporting
534
// Summarize $progress as a line of text
//
// $blockedonly = TRUE  : The themes that judged it as spam, that is the keys
//                        of $progress['is_spam']. e.g. 'quantity, badhost'
// $blockedonly = FALSE : Every theme enabled in $progress['method'] with a
//                        non-zero volume in $progress['sum']. e.g. 'quantity(9)'
//
// A missing (or non-array) part of $progress is treated as empty, so an
// incomplete $progress yields '' in both of the modes
function summarize_spam_progress($progress = array(), $blockedonly = FALSE)
{
	if ($blockedonly) {
		if (! isset($progress['is_spam']) || ! is_array($progress['is_spam'])) {
			return '';
		}
		return implode(', ', array_keys($progress['is_spam']));
	}

	$method = isset($progress['method']) ? $progress['method'] : array();
	$sum    = (isset($progress['sum']) && is_array($progress['sum']))
		? $progress['sum']
		: array();

	$tmp = array();
	foreach ($sum as $key => $value) {
		if (isset($method[$key]) && $value) {
			$tmp[] = $key . '(' . $value . ')';
		}
	}

	return implode(', ', $tmp);
}
554
// Report the hosts blocked ($progress['blocked']) as a text, formatted by
// var_export_shrink(). Returns '' if nothing was blocked
function summarize_detail_badhost($progress = array())
{
	if (empty($progress['blocked'])) return '';

	// Flat per group: Every group becomes one comma-separated string
	$summary = array();
	foreach ($progress['blocked'] as $list_name => $groups) {
		foreach ($groups as $label => $leaves) {
			$joined = implode(', ', array_flat_leaves($leaves));
			if ($joined !== $label) {
				$summary[$list_name][$label] = $joined;
			} else {
				// The label tells nothing more than the content
				$summary[$list_name][] = $joined;
			}
		}
	}

	// Shrink per list
	// From: 'A-1' => array('ie.to')
	// To:   'A-1' => 'ie.to'
	foreach (array_keys($summary) as $list_name) {
		$entries = $summary[$list_name];
		if (is_array($entries) &&
		    count($entries) == 1 &&
		    is_numeric(key($entries))) {
			$summary[$list_name] = current($entries);
		}
	}

	return var_export_shrink($summary, TRUE, TRUE);
}
585
// Report the hosts not blocked ($progress['hosts']), grouped by the result
// of whois_responsibility(), as a var_export_shrink() -like text.
// Returns '' if there is no host to report
function summarize_detail_newtral($progress = array())
{
	$reportable = isset($progress['hosts']) &&
		is_array($progress['hosts']) &&
		! empty($progress['hosts']);
	if (! $reportable) return '';

	// Generate a responsible $trie: array(domain => array(the rest => NULL))
	$trie = array();
	foreach ($progress['hosts'] as $host) {
		// 'A.foo.bar.example.com'
		$domain = whois_responsibility($host);	// 'example.com'
		if (! empty($domain)) {
			$prefix = rtrim(substr($host, 0, - strlen($domain)), '.');	// 'A.foo.bar'
		} else {
			// One or more test, or do nothing here
			$domain = strval($host);
			$prefix = '';
		}
		$trie = array_merge_leaves($trie, array($domain => array($prefix => NULL)), FALSE);
	}

	// Format: var_export_shrink() -like output
	$lines = array();
	ksort_by_domain($trie);
	foreach (array_keys($trie) as $domain) {
		ksort_by_domain($trie[$domain]);

		$alone = (count($trie[$domain]) == 1 && key($trie[$domain]) == '');
		if ($alone) {
			// Just one 'responsibility.example.com'
			$lines[] = '  \'' . $domain . '\',';
		} else {
			// One subdomain-or-host, or several ones
			$subs = array();
			foreach (array_keys($trie[$domain]) as $sub) {
				$subs[] = ($sub == '')
					? $domain			// 'example.com'
					: $sub . '. ';		// 'A.foo.bar. '
			}
			$lines[] = '  \'' . $domain . '\' => \'' . implode(', ', $subs) . '\',';
		}

		unset($trie[$domain]);	// Done
	}

	return 'array (' . "\n" . implode("\n", $lines) . "\n" . ')';
}
634
635
636 // ---------------------
637 // Exit
638
// Freeing memory: Ask the caching functions to drop what they keep.
// get_blocklist(NULL) discards the parsed blocklist. whois_responsibility()
// is called the same way (presumably it resets its own cache too -- it is
// defined elsewhere, please confirm there)
function spam_dispose()
{
	get_blocklist(NULL);
	whois_responsibility(NULL);
}
645
// Common behavior for blocking
// NOTE: Call this function from various blocking features, to disguise the reason 'why blocked'
//
// $mode : ''     => Print an empty line only
//         'dump' => Print $data (HTML-escaped var_export() output) inside <pre>
//         others => Print nothing
// $data : Something to show with the 'dump' mode
//
// This function never returns: the script always ends here
function spam_exit($mode = '', $data = array())
{
	if ($mode == '') {
		echo("\n");
	} else if ($mode == 'dump') {
		echo('<pre>' . "\n");
		echo htmlspecialchars(var_export($data, TRUE));
		echo('</pre>' . "\n");
	}

	exit;	// Force exit
}
665
666
667 // ---------------------
668 // Simple filtering
669
670 // TODO: Record them
// Simple/fast spam filter ($target: 'a string' or an array())
//
// Checks $target with check_uri_spam(). If it is judged as spam, notifies
// it with pkwk_spamnotify() and ends the script with spam_exit($exitmode).
// If not, just frees the caches and returns
function pkwk_spamfilter($action, $page, $target = array('title' => ''), $method = array(), $exitmode = '')
{
	$progress = check_uri_spam($target, $method);

	if (! empty($progress['is_spam'])) {
		// TODO: detect encoding from $target for mbstring functions
		// (and remove "\0" etc from the keys and the values of $target)

		pkwk_spamnotify($action, $page, $target, $progress, $method);
		spam_exit($exitmode, $progress);
		return;
	}

	spam_dispose();
}
691
692 // ---------------------
693 // PukiWiki original
694
// Mail to administrator(s) about what was blocked
//
// Does nothing unless the global $notify is on. With $method['asap'], the
// metrics and the neutral hosts are left out of the summary
function pkwk_spamnotify($action, $page, $target = array('title' => ''), $progress = array(), $method = array())
{
	global $notify, $notify_subject;

	if (! $notify) return;

	$verbose = ! isset($method['asap']);

	$summary = array(
		'ACTION' => 'Blocked by: ' . summarize_spam_progress($progress, TRUE),
	);
	if ($verbose) {
		$summary['METRICS'] = summarize_spam_progress($progress);
	}

	$badhost = summarize_detail_badhost($progress);
	if ($badhost != '') $summary['DETAIL_BADHOST'] = $badhost;

	$neutral = summarize_detail_newtral($progress);
	if ($verbose && $neutral != '') $summary['DETAIL_NEUTRAL_HOST'] = $neutral;

	// The rest. None of these keys exists yet, so '+=' just appends them
	$summary += array(
		'COMMENT'     => $action,
		'PAGE'        => '[blocked] ' . (is_pagename($page) ? $page : ''),
		'URI'         => get_script_uri() . '?' . rawurlencode($page),
		'USER_AGENT'  => TRUE,
		'REMOTE_ADDR' => TRUE,
	);

	pkwk_mail_notify($notify_subject, var_export($target, TRUE), $summary, TRUE);
}
722
723 ?>