OSDN Git Service

420492ee4b66cd39e5daa0ab43dff8f8b8599567
[pukiwiki/pukiwiki_sandbox.git] / spam / spam.php
1 <?php
2 // $Id: spam.php,v 1.217 2009/01/02 11:55:45 henoheno Exp $
3 // Copyright (C) 2006-2007 PukiWiki Developers Team
4 // License: GPL v2 or (at your option) any later version
5 //
6 // Functions for Concept-work of spam-uri metrics
7 //
8 // (PHP 4 >= 4.3.0): preg_match_all(PREG_OFFSET_CAPTURE): $method['uri_XXX'] related feature
9
10
11 if (! defined('LIB_DIR'))   define('LIB_DIR', './');
12 require(LIB_DIR . 'spam_pickup.php');
13 require(LIB_DIR . 'spam_util.php');
14
15 if (! defined('SPAM_INI_FILE'))   define('SPAM_INI_FILE',   'spam.ini.php');
16
17
18 // ---------------------
19 // Part One : Checker
20
// Rough implementation of globbing: turn a glob pattern into a regex fragment
//
// Supported wildcards:
//   '*' => '.*' (any run of characters, including none)
//   '?' => '.'  (exactly one character)
// '[' and ']' are NOT wildcards here (they are quoted literally),
// because things like '[]' may cause a regex compilation error.
//
// $string  : Glob pattern. A non-string value yields ''
// $divider : The regex delimiter the caller will wrap the result with
//            (handed to preg_quote() so it gets escaped too)
// Returns  : Regex fragment, without delimiters and without anchors
//
// USAGE: $regex = '/^' . generate_glob_regex('*.txt', '/') . '$/i';
//
function generate_glob_regex($string = '', $divider = '/')
{
	// Wildcards as they look AFTER preg_quote() => their regex equivalents
	static $wildcards = array(
		'\\*' => '.*',
		'\\?' => '.',
	);

	if (! is_string($string)) return '';

	// Quote first, then translate the quoted wildcards.
	// After preg_quote() every '*' and '?' is preceded by its own escaping
	// backslash, and every literal backslash is doubled, so the two-character
	// sequences above can only be real wildcards. This avoids placeholder
	// strings, which would be mistaken for wildcards if the input happened to
	// contain them. strtr() works in a single pass, so replaced text is never
	// scanned again.
	return strtr(preg_quote($string, $divider), $wildcards);
}
54
// Generate a regex fragment for a host (FQDN, IPv4, ...) pattern
//
// 'localhost'     : Matches with 'localhost' only
// 'example.org'   : Matches with 'example.org' only (See host_normalize() about 'www')
// '.example.org'  : Matches with ALL FQDN ended with '.example.org',
//                   and with 'example.org' itself (the prefix is optional)
// '*.example.org' : Almost the same as '.example.org', but a subdomain part
//                   is required, so 'example.org' itself does not match
// '10.20.30.40'   : Matches with IPv4 address '10.20.30.40' only
// [TODO] '192.'   : Matches with all IPv4 hosts started with '192.'
// TODO: IPv4, CIDR?, IPv6
//
// $string  : Host pattern. A non-string value yields ''
// $divider : Regex delimiter, handed to generate_glob_regex()
// Returns  : Regex fragment, without delimiters and without anchors
function generate_host_regex($string = '', $divider = '/')
{
	if (! is_string($string)) return '';

	// "localhost", IPv4, etc: nothing host-specific to do
	$is_plain = (mb_strpos($string, '.') === FALSE) || is_ip($string);

	if (! $is_plain) {
		// FQDN or something: look at the left-most label only
		list($head, $tail) = explode('.', $string, 2);

		if ($head === '') {
			$prefix = '(?:.*\.)?';	// ".example.org"
		} else if ($head === '*') {
			$prefix = '.*\.';	// "*.example.org"
		} else {
			$prefix = NULL;		// example.org, etc
		}

		if ($prefix !== NULL) {
			return $prefix . generate_glob_regex($tail, $divider);
		}
	}

	return generate_glob_regex($string, $divider);
}
89
// Load SPAM_INI_FILE (once per request) and return the parsed result
//
// $list === NULL : Forget the cached result, and return array()
// $list === ''   : Return every parsed list
// otherwise      : Return the list named $list, or array() if there is none
function get_blocklist($list = '')
{
	static $regexes;

	if ($list === NULL) {
		$regexes = NULL;	// Unset the cache
		return array();
	}

	if (! isset($regexes)) {
		$regexes = array();

		if (file_exists(SPAM_INI_FILE)) {
			$blocklist = array();

			// The ini file runs in this function's scope and is expected
			// to fill $blocklist, like:
			//
			//	$blocklist['list'] = array(
			//		//'goodhost' => FALSE,
			//		'badhost' => TRUE,
			//	);
			//	$blocklist['badhost'] = array(
			//		'*.blogspot.com',	// Blog services's subdomains (only)
			//		'IANA-examples' => '#^(?:.*\.)?example\.(?:com|net|org)$#',
			//	);
			include(SPAM_INI_FILE);

			// 'pre' and 'list' are the two indexes: their keys name the
			// lists that should be compiled
			foreach (array('pre', 'list') as $special) {
				if (isset($blocklist[$special])) {
					$regexes[$special] = $blocklist[$special];

					foreach (array_keys($blocklist[$special]) as $_list) {
						if (isset($blocklist[$_list])) {
							foreach ($blocklist[$_list] as $key => $value) {
								if (! is_array($value)) {
									// A single entry
									get_blocklist_add($regexes[$_list], $key, $value);
								} else {
									// A labeled group of entries
									$regexes[$_list][$key] = array();
									foreach ($value as $_key => $_value) {
										get_blocklist_add($regexes[$_list][$key], $_key, $_value);
									}
								}
							}

							// Compile each list only once,
							// even if both indexes name it
							unset($blocklist[$_list]);
						}
					}
				}
			}
		}
	}

	if ($list === '') return $regexes;	// ALL of

	return isset($regexes[$list])
		? $regexes[$list]	// A part of
		: array();		// Found nothing
}
152
// Subroutine of get_blocklist(): Add one blocklist entry to the $array
//
// $array : (by reference) The list being built
// $key   : A string  => $value is taken as a ready-made regex for FQDN(host)s,
//                       and stored as is under the label $key
//          Otherwise => $value is a host pattern (See generate_host_regex()).
//                       It is compiled to an anchored, case-insensitive
//                       regex, and stored under the pattern itself
// $value : A regex, or a host pattern like '*.example.org'
function get_blocklist_add(& $array, $key = 0, $value = '*.example.org/path/to/file.html')
{
	if (is_string($key)) {
		// Plain copy: $value is a by-value parameter, so binding the
		// array element to it by reference would gain nothing
		$array[$key] = $value;
		return;
	}

	$regex = generate_host_regex($value, '#');

	// generate_host_regex() returns '' when it can't handle $value.
	// Compare with '' explicitly: empty() would also reject the pattern '0'
	if ($regex !== '') {
		$array[$value] = '#^' . $regex . '$#i';
	}
}
165
// Blocklist metrics: Separate $hosts, to blocked and not blocked
//
// $hosts : (by reference) A host or an array of hosts. Always an array on
//          return, with every matched host removed (keys are preserved)
// $keys  : Name(s) of the blocklist(s) to apply, in this order
// $asap  : If TRUE, stop scanning a blocklist at its first hit, but only
//          for blocklists flagged TRUE in the 'list' index
// Returns: array(
//            blocklist => array(
//              label => matched hosts, or
//              label => array(sub-label => matched hosts)
//            )
//          )
function blocklist_distiller(& $hosts, $keys = array('goodhost', 'badhost'), $asap = FALSE)
{
	if (! is_array($hosts)) $hosts = array($hosts);
	if (! is_array($keys))  $keys  = array($keys);

	$list    = get_blocklist('list');
	$blocked = array();

	foreach ($keys as $key) {
		// $key may be missing from the 'list' index (e.g. a blocklist that
		// is named by the 'pre' index only), so don't read $list[$key] blindly
		$quit_on_hit = $asap && ! empty($list[$key]);

		foreach (get_blocklist($key) as $label => $regex) {
			if (is_array($regex)) {
				// A labeled group of regexes
				foreach ($regex as $_label => $_regex) {
					$group = preg_grep($_regex, $hosts);
					if ($group) {
						$hosts = array_diff($hosts, $group);
						$blocked[$key][$label][$_label] = $group;
						// Leave this blocklist, not just this group,
						// so both branches quit in the same way
						if ($quit_on_hit) break 2;
					}
				}
			} else {
				// A single regex
				$group = preg_grep($regex, $hosts);
				if ($group) {
					$hosts = array_diff($hosts, $group);
					$blocked[$key][$label] = $group;
					if ($quit_on_hit) break;
				}
			}
		}
	}

	return $blocked;
}
199
200
201 // ---------------------
202
203
// Default (enabled) methods and thresholds (for content insertion)
//
// $times  : Multiplier for the quantity-type thresholds
// $t_area : Threshold for the area-type checks
// $rule   : If TRUE, the boolean rules are appended too
// Returns : array(method => threshold-or-flag), ready for check_uri_spam().
//           A threshold that ends up negative disables (removes) its method
function check_uri_spam_method($times = 1, $t_area = 0, $rule = TRUE)
{
	$times  = intval($times);
	$t_area = intval($t_area);

	$thresholds = array(
		// Quantities
		'quantity'     => 8 * $times,	// Allow N URIs
		'non_uniqhost' => 3 * $times,	// Allow N duped (and normalized) Hosts
		//'non_uniquri'=> 3 * $times,	// Allow N duped (and normalized) URIs

		// Areas
		'area_anchor'  => $t_area,	// Using <a href> HTML tag
		'area_bbcode'  => $t_area,	// Using [url] or [link] BBCode
		//'uri_anchor' => $t_area,	// URI inside <a href> HTML tag
		//'uri_bbcode' => $t_area,	// URI inside [url] or [link] BBCode
	);

	// Keep the zero-or-more ones only
	$enabled = array();
	foreach ($thresholds as $name => $limit) {
		if ($limit >= 0) $enabled[$name] = $limit;
	}

	$rules = $rule
		? array(
			//'asap'   => TRUE,	// Quit or return As Soon As Possible
			'uniqhost' => TRUE,	// Show uniq host (at block notification mail)
			'badhost'  => TRUE,	// Check badhost
		)
		: array();

	return $enabled + $rules;
}
240
// Simple/fast spam check
//
// $target : A string to inspect, or an array of strings. With an array,
//           each string member is checked on its own (non-string members
//           are skipped), and the reports are merged into one
// $method : array(theme => threshold-or-flag). If it is empty or not an
//           array, check_uri_spam_method() supplies the defaults.
//           Themes read here:
//             'quantity'     => N	// Allow N URIs
//             'uniqhost'     => TRUE	// Count unique hosts
//             'non_uniqhost' => N	// Allow N duped hosts
//             'non_uniquri'  => N	// Allow N duped URIs
//             'badhost'      => TRUE	// Apply the blocklists
//             'area_anchor'  => N	// Handed to area_pickup()
//             'area_bbcode'  => N	// Handed to area_pickup()
//             'uri_anchor'   => N	// Sum of $pickup['area']['uri_anchor']
//             'uri_bbcode'   => N	// Sum of $pickup['area']['uri_bbcode']
//             'asap'         => any	// Return As Soon As Possible
// Returns : array(
//             'method'  => The $method actually used,
//             'sum'     => array(theme => volume found (int)),
//             'is_spam' => array(theme => TRUE). If something is defined
//                          here, one or more spam is included in this report,
//             'blocked' => Hosts blocked: array('category' => array('host')),
//             'hosts'   => Hosts not blocked,
//           )
function check_uri_spam($target = '', $method = array())
{
	// Return value
	$progress = array(
		'method'  => array(),
		'sum'     => array(),
		'is_spam' => array(),
		'blocked' => array(),
		'hosts'   => array(),
	);

	// ----------------------------------------
	// Aliases

	$sum     = & $progress['sum'];
	$is_spam = & $progress['is_spam'];
	$progress['method'] = & $method;	// Argument
	$blocked = & $progress['blocked'];
	$hosts   = & $progress['hosts'];
	$asap    = isset($method['asap']);

	// ----------------------------------------
	// Init

	if (! is_array($method) || empty($method)) {
		$method = check_uri_spam_method();
	}
	foreach (array_keys($method) as $key) {
		if (! isset($sum[$key])) $sum[$key] = 0;
	}
	if (! isset($sum['quantity'])) $sum['quantity'] = 0;

	// ----------------------------------------
	// Recurse

	if (is_array($target)) {
		foreach ($target as $str) {
			if (! is_string($str)) continue;

			$_progress = check_uri_spam($str, $method);	// Recurse

			// Merge $sum
			foreach ($_progress['sum'] as $key => $value) {
				if (isset($sum[$key])) {
					$sum[$key] += $value;
				} else {
					$sum[$key] = $value;
				}
			}

			// Merge $is_spam
			foreach (array_keys($_progress['is_spam']) as $key) {
				$is_spam[$key] = TRUE;
				if ($asap) break;
			}

			// Merge hosts BEFORE the early exit below. Otherwise the
			// string that triggered the exit would be reported as spam
			// with no blocked host to show for it
			$blocked = array_merge_leaves($blocked, $_progress['blocked'], FALSE);
			$hosts   = array_merge_leaves($hosts,   $_progress['hosts'],   FALSE);

			if ($asap && $is_spam) break;
		}

		// Unique values
		$blocked = array_unique_recursive($blocked);
		$hosts   = array_unique_recursive($hosts);

		// Recount $sum['badhost']
		$sum['badhost'] = array_count_leaves($blocked);

		return $progress;
	}

	// ----------------------------------------
	// Area measure

	if (! $asap || ! $is_spam) {

		// Method pickup
		$_method = array();
		foreach (array(
				'area_anchor',	// There's HTML anchor tag
				'area_bbcode',	// There's 'BBCode' linking tag
			) as $key) {
			if (isset($method[$key])) $_method[$key] = TRUE;
		}

		if ($_method) {
			$_asap   = isset($method['asap']) ? array('asap' => TRUE) : array();
			$_result = area_pickup($target, $_method + $_asap);
		} else {
			$_result = FALSE;
		}

		if ($_result) {
			foreach (array_keys($_method) as $key) {
				if (isset($_result[$key])) {
					$sum[$key] = $_result[$key];
					if (isset($method[$key]) && $sum[$key] > $method[$key]) {
						$is_spam[$key] = TRUE;
					}
				}
			}
		}

		unset($_asap, $_method, $_result);
	}

	// Return if ...
	if ($asap && $is_spam) return $progress;

	// ----------------------------------------
	// URI: Pickup

	$pickups = spam_uri_pickup($target, $method);

	// Return if ...
	if (empty($pickups)) return $progress;

	// Normalize all
	$pickups = uri_pickup_normalize($pickups);

	// ----------------------------------------
	// Pickup some part of URI

	$hosts = array();
	foreach ($pickups as $key => $pickup) {
		// Plain copy (a pickup without 'host' gives NULL)
		$hosts[$key] = isset($pickup['host']) ? $pickup['host'] : NULL;
	}

	// ----------------------------------------
	// URI: Bad host <pre-filter> (Separate good/bad hosts from $hosts)

	if ((! $asap || ! $is_spam) && isset($method['badhost'])) {
		$list    = get_blocklist('pre');
		$blocked = blocklist_distiller($hosts, array_keys($list), $asap);
		foreach ($list as $key => $type) {
			if (! $type) unset($blocked[$key]);	// Ignore goodhost etc
		}
		unset($list);
		if (! empty($blocked)) $is_spam['badhost'] = TRUE;
	}

	// Return if ...
	if ($asap && $is_spam) return $progress;

	// Remove blocked from $pickups
	foreach (array_keys($pickups) as $key) {
		if (! isset($hosts[$key])) {
			unset($pickups[$key]);
		}
	}

	// ----------------------------------------
	// URI: Check quantity

	$sum['quantity'] += count($pickups);
	if ((! $asap || ! $is_spam) && isset($method['quantity']) &&
		$sum['quantity'] > $method['quantity']) {
		$is_spam['quantity'] = TRUE;
	}

	// ----------------------------------------
	// URI: used inside HTML anchor tag pair, or inside 'BBCode' pair
	// (The same procedure for both, in this order)

	foreach (array('uri_anchor', 'uri_bbcode') as $key) {
		if (($asap && $is_spam) || ! isset($method[$key])) continue;

		foreach ($pickups as $pickup) {
			if (! isset($pickup['area'][$key])) continue;

			$sum[$key] += $pickup['area'][$key];
			if ($sum[$key] > $method[$key]) {
				$is_spam[$key] = TRUE;
				if ($asap) break;
			}
		}
	}

	// ----------------------------------------
	// URI: Uniqueness (and removing non-uniques)

	if ((! $asap || ! $is_spam) && isset($method['non_uniquri'])) {

		$uris = array();
		foreach (array_keys($pickups) as $key) {
			$uris[$key] = uri_pickup_implode($pickups[$key]);
		}
		$count = count($uris);
		$uris  = array_unique($uris);
		$sum['non_uniquri'] += $count - count($uris);
		if ($sum['non_uniquri'] > $method['non_uniquri']) {
			$is_spam['non_uniquri'] = TRUE;
		}
		if (! $asap || ! $is_spam) {
			foreach (array_diff(array_keys($pickups),
				array_keys($uris)) as $remove) {
				unset($pickups[$remove]);
			}
		}
		unset($uris);
	}

	// Return if ...
	if ($asap && $is_spam) return $progress;

	// ----------------------------------------
	// Host: Uniqueness (uniq / non-uniq)

	$hosts     = array_unique($hosts);
	$uniqhosts = count($hosts);

	if (isset($sum['uniqhost'])) $sum['uniqhost'] += $uniqhosts;
	if ((! $asap || ! $is_spam) && isset($method['non_uniqhost'])) {
		// Use the local count: $sum['uniqhost'] exists only when the
		// 'uniqhost' theme is enabled too. Without it, every URI
		// would be counted as a duplicate
		$sum['non_uniqhost'] = $sum['quantity'] - $uniqhosts;
		if ($sum['non_uniqhost'] > $method['non_uniqhost']) {
			$is_spam['non_uniqhost'] = TRUE;
		}
	}

	// Return if ...
	if ($asap && $is_spam) return $progress;

	// ----------------------------------------
	// URI: Bad host (Separate good/bad hosts from $hosts)

	if ((! $asap || ! $is_spam) && isset($method['badhost'])) {
		$list    = get_blocklist('list');
		$blocked = array_merge_leaves(
			$blocked,
			blocklist_distiller($hosts, array_keys($list), $asap),
			FALSE
		);
		foreach ($list as $key => $type) {
			if (! $type) unset($blocked[$key]);	// Ignore goodhost etc
		}
		unset($list);
		if (! empty($blocked)) $is_spam['badhost'] = TRUE;
	}

	// ----------------------------------------
	// End

	return $progress;
}
536
537 // ---------------------
538 // Reporting
539
// Summarize $progress into one line
//
// $progress    : A report from check_uri_spam()
// $blockedonly : TRUE  => The themes that flagged spam: 'quantity, badhost'
//                FALSE => Every theme that is enabled in $progress['method']
//                         and has a non-zero sum: 'quantity(9), badhost(1)'
function summarize_spam_progress($progress = array(), $blockedonly = FALSE)
{
	if ($blockedonly) {
		return implode(', ', array_keys($progress['is_spam']));
	}

	$metrics = array();
	if (isset($progress['sum'])) {
		foreach ($progress['sum'] as $theme => $volume) {
			if ($volume && isset($progress['method'][$theme])) {
				$metrics[] = $theme . '(' . $volume . ')';
			}
		}
	}

	return implode(', ', $metrics);
}
559
// Summarize the blocked hosts of $progress, as a var_export_shrink() text
//
// $progress : A report from check_uri_spam(). Only 'blocked' is read
// Returns   : '' if nothing was blocked
function summarize_detail_badhost($progress = array())
{
	if (empty($progress['blocked'])) return '';

	// Flat per group: every group becomes one 'host, host, ...' string.
	// A group whose label equals that string is stored without the label
	$summary = array();
	foreach ($progress['blocked'] as $list => $groups) {
		foreach ($groups as $group => $members) {
			$flat = implode(', ', array_flat_leaves($members));
			if ($flat === $group) {
				$summary[$list][]       = $flat;
			} else {
				$summary[$list][$group] = $flat;
			}
		}
	}

	// Shrink per list
	// From: 'A-1' => array('ie.to')
	// To:   'A-1' => 'ie.to'
	foreach (array_keys($summary) as $list) {
		$entries = $summary[$list];
		if (is_array($entries) &&
		    count($entries) == 1 &&
		    is_numeric(key($entries))) {
			$summary[$list] = current($entries);
		}
	}

	return var_export_shrink($summary, TRUE, TRUE);
}
590
// Summarize the not-blocked hosts of $progress, grouped by the domain
// whois_responsibility() reports for each of them
//
// $progress : A report from check_uri_spam(). Only 'hosts' is read
// Returns   : '' if there is no host. Otherwise a var_export_shrink() -like text:
//               array (
//                 'example.com',                        // The domain itself only
//                 'example.org' => 'example.org, www. ',// With subdomain(s)
//               )
function summarize_detail_newtral($progress = array())
{
	if (! isset($progress['hosts'])    ||
	    ! is_array($progress['hosts']) ||
	    empty($progress['hosts'])) return '';

	// Generate a responsible $trie:
	// array('example.com' => array('A.foo.bar' => NULL, '' => NULL))
	$trie = array();
	foreach ($progress['hosts'] as $value) {
		// 'A.foo.bar.example.com'
		$resp = whois_responsibility($value);	// 'example.com'
		if (empty($resp)) {
			// One or more test, or do nothing here
			$resp = strval($value);
			$rest = '';
		} else {
			// 'A.foo.bar' (substr() may return FALSE with PHP < 8, so cast it)
			$rest = rtrim((string) substr($value, 0, - strlen($resp)), '.');
		}
		$trie = array_merge_leaves($trie, array($resp => array($rest => NULL)), FALSE);
	}

	// Format: var_export_shrink() -like output
	//
	// NOTE: Compare the keys with '' by '===' only. A numeric subdomain
	// like '0' becomes the integer key 0, and (0 == '') is TRUE with
	// PHP < 8, so it would be taken for "the domain itself"
	$result = array();
	ksort_by_domain($trie);
	foreach (array_keys($trie) as $key) {
		ksort_by_domain($trie[$key]);
		if (count($trie[$key]) == 1 && key($trie[$key]) === '') {
			// Just one 'responsibility.example.com'
			$result[] = '  \'' . $key . '\',';
		} else {
			// One subdomain-or-host, or several ones
			$subs = array();
			foreach (array_keys($trie[$key]) as $sub) {
				if ($sub === '') {
					$subs[] = $key;		// 'example.com'
				} else {
					$subs[] = $sub . '. ';	// 'A.foo.bar. '
				}
			}
			$result[] = '  \'' . $key . '\' => \'' . implode(', ', $subs) . '\',';
		}
		unset($trie[$key]);
	}

	return
		'array (' . "\n" .
			implode("\n", $result) . "\n" .
		')';
}
639
640
641 // ---------------------
642 // Exit
643
// Free the memory this module keeps between calls
//
// get_blocklist(NULL) drops the parsed blocklist cache.
// whois_responsibility(NULL) is presumably the same kind of request for
// that function's own cache (it is defined in another file -- TODO confirm)
function spam_dispose()
{
	get_blocklist(NULL);
	whois_responsibility(NULL);
}
650
// Common behavior for blocking
// NOTE: Call this function from the various blocking features, to disguise the reason 'why blocked'
//
// $mode : ''     => Output a newline only
//         'dump' => Output $data, var_export()ed and HTML-escaped, inside <pre>
//         others => Output nothing
// $data : The thing to dump (used with 'dump' only)
//
// This function never returns: the script is always terminated
function spam_exit($mode = '', $data = array())
{
	// Loose comparisons on purpose: the same matching as a switch statement
	if ($mode == '') {
		echo("\n");
	} else if ($mode == 'dump') {
		echo('<pre>' . "\n");
		echo htmlspecialchars(var_export($data, TRUE));
		echo('</pre>' . "\n");
	}

	exit;	// Force exit
}
670
671
672 // ---------------------
673 // Simple filtering
674
// Simple/fast spam filter ($target: 'a string' or an array())
// TODO: Record them
//
// Runs check_uri_spam() on $target with $method.
// Not spam : Frees the caches with spam_dispose(), and returns
// Spam     : Notifies with pkwk_spamnotify(), then ends the script with
//            spam_exit($exitmode), so this function does not return
//
// $action, $page : Handed to pkwk_spamnotify() as they are
function pkwk_spamfilter($action, $page, $target = array('title' => ''), $method = array(), $exitmode = '')
{
	$progress = check_uri_spam($target, $method);

	if (! empty($progress['is_spam'])) {

		// TODO: detect encoding from $target for mbstring functions,
		// and remove "\0" etc from the keys and the values of $target,
		// before notifying

		pkwk_spamnotify($action, $page, $target, $progress, $method);
		spam_exit($exitmode, $progress);

	} else {
		spam_dispose();
	}
}
696
697 // ---------------------
698 // PukiWiki original
699
// Mail to administrator(s)
//
// Does nothing unless the global $notify is on.
//
// $action   : Reported as 'COMMENT'
// $page     : Reported as 'PAGE' (if is_pagename() accepts it) and inside 'URI'
// $target   : The inspected content. Its var_export() is the mail body
// $progress : A report from check_uri_spam()
// $method   : The method used. With 'asap', the metrics and the neutral
//             hosts are left out of the summary
function pkwk_spamnotify($action, $page, $target = array('title' => ''), $progress = array(), $method = array())
{
	global $notify, $notify_subject;

	if (! $notify) return;

	$asap = isset($method['asap']);

	// The order of the entries is the order of the report
	$summary = array(
		'ACTION' => 'Blocked by: ' . summarize_spam_progress($progress, TRUE),
	);
	if (! $asap) {
		$summary['METRICS'] = summarize_spam_progress($progress);
	}

	$detail = summarize_detail_badhost($progress);
	if ($detail != '') $summary['DETAIL_BADHOST'] = $detail;

	$detail = summarize_detail_newtral($progress);
	if ($detail != '' && ! $asap) $summary['DETAIL_NEUTRAL_HOST'] = $detail;

	$summary += array(
		'COMMENT'     => $action,
		'PAGE'        => '[blocked] ' . (is_pagename($page) ? $page : ''),
		'URI'         => get_script_uri() . '?' . rawurlencode($page),
		'USER_AGENT'  => TRUE,
		'REMOTE_ADDR' => TRUE,
	);

	pkwk_mail_notify($notify_subject,  var_export($target, TRUE), $summary, TRUE);
}
727
728 ?>