4 * Modified by hsur ( http://blog.cles.jp/np_cles )
5 * $Id: spambayes.php,v 1.5 2007-06-25 11:47:30 hsur Exp $
7 ***** BEGIN LICENSE BLOCK *****
8 This file is part of PHP Naive Bayesian Filter.
9 The Initial Developer of the Original Code is
10 Loic d'Anterroches [loic_at_xhtml.net].
11 Portions created by the Initial Developer are Copyright (C) 2003
12 the Initial Developer. All Rights Reserved.
14 PHP Naive Bayesian Filter is free software; you can redistribute it
15 and/or modify it under the terms of the GNU General Public License as
16 published by the Free Software Foundation; either version 2 of
17 the License, or (at your option) any later version.
19 PHP Naive Bayesian Filter is distributed in the hope that it will
20 be useful, but WITHOUT ANY WARRANTY; without even the implied
21 warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22 See the GNU General Public License for more details.
24 You should have received a copy of the GNU General Public License
25 along with Foobar; if not, write to the Free Software
26 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
28 Alternatively, the contents of this file may be used under the terms of
29 the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
30 in which case the provisions of the LGPL are applicable instead
32 ***** END LICENSE BLOCK ******/
// Optional external tokenizer command. When this constant is defined (and no
// API application id is configured) _getTokens() pipes the text through it.
//define('NP_SPAMBAYES_TOKENIZER', '/usr/local/bin/mecab -F "%h\t%m\t%f[6]\n" -E ""');

// URL that _getTokens() POSTs the text to when an application id is configured.
// Guarded so that loading this file twice does not raise a
// "constant already defined" notice.
if (!defined('NP_SPAMBAYES_APIURL')) {
    define('NP_SPAMBAYES_APIURL', 'http://api.jlp.yahoo.co.jp/MAService/V1/parse');
}
/** Tokens shorter than this many characters are dropped by _getTokens(). */
var $min_token_length = 2;

/** Tokens longer than this many characters are dropped by _getTokens(). */
var $max_token_length = 40;

/** Tokens to skip; filled on demand from getIgnoreList(). @see getIgnoreList() */
var $ignore_list = array();
/**
 * Constructor: attaches the filter to its owning plugin and creates the
 * storage backend.
 *
 * @param object $parent owning plugin; this class only calls getOption() on it
 */
function __construct(&$parent) {
    // The variable is passed plainly: call-time pass-by-reference
    // (`new Foo(&$x)`) is a fatal error since PHP 5.4, and the storage
    // constructor already declares its parameter by reference.
    $this->nbs = new NaiveBayesianStorage($parent);
    $this->parent = &$parent;

    // Application id for the remote tokenizer API; when empty, _getTokens()
    // falls back to the other tokenizers.
    $this->appid = $this->parent->getOption('appid');
}

/**
 * PHP 4 style constructor, kept so existing explicit calls keep working.
 *
 * @param object $parent owning plugin
 */
function NaiveBayesian(&$parent) {
    $this->__construct($parent);
}
/**
 * Categorize a document.
 *
 * Scores the document against every category returned by the storage and
 * returns the rescaled score of each category.
 *
 * @param string $document text to classify
 * @return array keys = category ids, values = scores (as returned by _rescale())
 */
function categorize($document) {
    $scores = array();
    $categories = $this->nbs->getCategories();
    $tokens = $this->_getTokens($document);

    // Total learnt word count and number of usable categories.
    $total_words = 0;
    $ncat = 0;
    foreach ($categories as $data) {
        if (!is_array($data)) {
            // getCategories() can return a plain message string instead of
            // category data when its query fails; skip such entries.
            continue;
        }
        $total_words += $data['wordcount'];
        $ncat++;
    }
    if (0 == $ncat) {
        return $this->_rescale($scores);
    }

    // wordExists() does not depend on the category, so query it once per
    // token instead of once per token and category.
    $known = array();
    foreach ($tokens as $token => $count) {
        if ($this->nbs->wordExists($token)) {
            $known[$token] = $count;
        }
    }

    // Factor applied once per token occurrence; presumably it keeps the
    // running product from underflowing -- NOTE(review): confirm intent.
    $scale = $total_words / $ncat;

    foreach ($categories as $category => $data) {
        if (!is_array($data)) {
            continue;
        }
        $scores[$category] = $data['probability'];
        if ($data['wordcount'] <= 0) {
            // Nothing learnt for this category: the divisions below would be
            // by zero, so the stored probability is left as the score.
            continue;
        }
        // small probability for a word not in the category
        $small_proba = 1.0 / ($data['wordcount'] * 2);

        foreach ($known as $token => $count) {
            $word = $this->nbs->getWord($token, $category);
            if ($word['wordcount']) {
                $proba = $word['wordcount'] / $data['wordcount'];
            } else {
                $proba = $small_proba;
            }
            $newval = $scores[$category] * pow($proba, $count) * pow($scale, $count);
            // Ignore a token that would push the score to INF/NAN.
            if (is_finite($newval)) {
                $scores[$category] = $newval;
            }
        }
    }

    return $this->_rescale($scores);
} // function categorize
/**
 * Print an HTML table explaining how a text is scored.
 *
 * For every token of $content that the storage already knows, one row shows
 * its stored count in the 'ham' and 'spam' categories; a final row shows the
 * rescaled score of both categories. Output is written with echo.
 *
 * NOTE(review): the opening/closing <table> and <tr> echo statements were
 * reconstructed -- confirm against the original markup.
 *
 * @param string $content text to explain
 */
function explain($content) {
    $categories = $this->nbs->getCategories(); // ham, spam
    $scores = array();
    $result = array();
    $tokens = $this->_getTokens($content);

    // Total learnt word count and number of usable categories.
    $total_words = 0;
    $ncat = 0;
    foreach ($categories as $data) {
        if (!is_array($data)) {
            continue; // message entry returned when the category query failed
        }
        $total_words += $data['wordcount'];
        $ncat++;
    }

    if ($ncat > 0) {
        // wordExists() is category independent: ask once per token.
        $known = array();
        foreach ($tokens as $token => $count) {
            if ($this->nbs->wordExists($token)) {
                $known[$token] = $count;
            }
        }
        $scale = $total_words / $ncat;

        foreach ($categories as $category => $data) {
            if (!is_array($data)) {
                continue;
            }
            $scores[$category] = $data['probability'];
            // An empty category cannot contribute word probabilities
            // (division by zero); counts are still collected for the table.
            $usable = ($data['wordcount'] > 0);
            $small_proba = $usable ? 1.0 / ($data['wordcount'] * 2) : 0.0;

            foreach ($known as $token => $count) {
                $word = $this->nbs->getWord($token, $category);
                $result[$word['word']][$category] = $word['wordcount'];
                if (!$usable) {
                    continue;
                }
                if ($word['wordcount']) {
                    $proba = $word['wordcount'] / $data['wordcount'];
                } else {
                    $proba = $small_proba;
                }
                $newval = $scores[$category] * pow($proba, $count) * pow($scale, $count);
                if (is_finite($newval)) {
                    $scores[$category] = $newval;
                }
            }
        }
    }

    $scores = $this->_rescale($scores);
    array_multisort($result, SORT_DESC);

    // Words come from stored data; escape them before printing.
    $charset = defined('_CHARSET') ? _CHARSET : 'UTF-8';

    echo '<table>';
    echo '<tr><th>word</th><th>Ham</th><th>Spam</th></tr>';
    foreach ($result as $key => $value) {
        $ham = isset($value['ham']) ? $value['ham'] : '';
        $spam = isset($value['spam']) ? $value['spam'] : '';
        echo '<tr>';
        echo '<td>'.htmlspecialchars($key, ENT_QUOTES, $charset).'</td>';
        echo '<td>'.$ham.'</td>';
        echo '<td>'.$spam.'</td>';
        echo '</tr>';
    }
    $hamScore = isset($scores['ham']) ? $scores['ham'] : '';
    $spamScore = isset($scores['spam']) ? $scores['spam'] : '';
    echo '<tr><td>Rescaled probability:</td><th>'.$hamScore.'</th><th>'.$spamScore.'</th></tr>';
    echo '</table>';
}
/**
 * Train the filter with a document.
 *
 * Adds the token counts of $content to the given category and saves the
 * document as a reference. After a set of trainings updateProbabilities()
 * must be run.
 *
 * @see updateProbabilities()
 *
 * @param string $doc_id      document id, must be unique
 * @param string $category_id category the document belongs to
 * @param string $content     content of the document
 * @return bool true -- NOTE(review): return statement reconstructed, confirm
 */
function train($doc_id, $category_id, $content) {
    $tokens = $this->_getTokens($content);
    // foreach replaces each(), which was removed in PHP 8.
    foreach ($tokens as $token => $count) {
        $this->nbs->updateWord($token, $count, $category_id);
    }
    $this->nbs->saveReference($doc_id, $category_id, $content);
    return true;
} // function train
/**
 * Train with a document only when no reference with this id is stored yet.
 *
 * NOTE(review): the guard condition is reconstructed ("no stored reference");
 * confirm it matches the original behavior.
 *
 * @param string $doc_id      document id
 * @param string $category_id category the document belongs to
 * @param string $content     content of the document
 */
function trainnew($doc_id, $category_id, $content) {
    $reference = $this->nbs->getReference($doc_id);
    // getReference() yields an empty array or false when nothing is stored.
    if (empty($reference)) {
        $this->train($doc_id, $category_id, $content);
    }
}
/**
 * Untrain a document.
 *
 * Removes the token counts of one stored reference from its category and
 * deletes the reference. updateProbabilities() must be run afterwards.
 *
 * @see updateProbabilities()
 *
 * @param string $doc_id document id, must be unique
 * @return bool true -- NOTE(review): return statement reconstructed, confirm
 */
function untrain($doc_id) {
    $ref = $this->nbs->getReference($doc_id);
    if (empty($ref)) {
        // Unknown document: there is nothing to remove. Returning here avoids
        // indexing a non-array and a needless tokenizer call on empty text.
        return true;
    }

    $tokens = $this->_getTokens($ref['content']);
    foreach ($tokens as $token => $count) {
        $this->nbs->removeWord($token, $count, $ref['catcode']);
    }
    $this->nbs->removeReference($doc_id);
    return true;
} // function untrain
/**
 * Rescale the scores.
 *
 * Each score is replaced by exp(score - max), where max is the largest score
 * (never taken below 0), and the result is divided by its Euclidean norm.
 *
 * @author Ken Williams, ken@mathforum.org
 *
 * @param array $scores keys => category, values => scores
 * @return array normalized scores (keys => category, values => scores)
 */
function _rescale($scores) {
    // Scale everything back to a reasonable area in
    // logspace (near zero), un-loggify, and normalize
    $max = 0.0;
    foreach ($scores as $score) {
        if ($score >= $max) {
            $max = $score;
        }
    }

    $total = 0.0;
    foreach ($scores as $cat => $score) {
        $scores[$cat] = (float) exp($score - $max);
        $total += (float) pow($scores[$cat], 2);
    }
    $total = (float) sqrt($total);

    // Guard the division; a zero norm leaves the values untouched.
    if ($total > 0) {
        foreach ($scores as $cat => $score) {
            $scores[$cat] = (float) ($score / $total);
        }
    }
    return $scores;
} // function _rescale
/**
 * Refresh the per-category probabilities and word counts.
 *
 * Must be called after a batch of train()/untrain() calls. The work is pure
 * database manipulation, so it is delegated to the storage object.
 *
 * @return mixed whatever the storage's updateProbabilities() returns
 */
function updateProbabilities() {
    $outcome = $this->nbs->updateProbabilities();
    return $outcome;
} // function updateProbabilities
/**
 * Get the list of tokens to ignore.
 *
 * Reads the plugin option 'ignorelist' and splits it on commas and spaces.
 * Empty entries (from ", " sequences or an empty option) are dropped, so an
 * unset option yields an empty array rather than array('').
 *
 * @return array ignore list
 */
function getIgnoreList() {
    $ignore = (string) $this->parent->getOption('ignorelist');
    $arr = preg_split('/[ ,]+/', $ignore, -1, PREG_SPLIT_NO_EMPTY);
    return is_array($arr) ? $arr : array();
}
/**
 * Get the tokens from a string.
 *
 * One of three tokenizers is used, in this order:
 *  1. the web API at NP_SPAMBAYES_APIURL, when an application id is set;
 *  2. the external command NP_SPAMBAYES_TOKENIZER, when defined and
 *     proc_open() is available;
 *  3. a plain split on non-word characters after _cleanString().
 * The raw tokens are then filtered: empty, too short/long, purely numeric,
 * punctuation-bearing and ignore-listed tokens are dropped.
 *
 * NOTE(review): several structural lines of this function (branch
 * conditions, closing braces, the counting statement) were reconstructed;
 * confirm against the original.
 *
 * @author James Seng. [http://james.seng.cc/] (based on his perl version)
 *
 * @param string $string the string to get the tokens from
 * @return array keys = tokens, values = number of occurrences
 */
function _getTokens($string) {
    $rawtokens = array();
    $tokens = array();

    // Load the ignore list on first use. The former test,
    // count(0 >= $this->ignore_list), counted a boolean.
    if (!is_array($this->ignore_list) || 0 >= count($this->ignore_list)) {
        $this->ignore_list = $this->getIgnoreList();
    }

    $string = strip_tags($string);

    if (defined('NP_SPAMBAYES_APIURL') && $this->appid) {
        // --- remote API tokenizer; the request is sent as UTF-8 ---
        if (_CHARSET != 'UTF-8') {
            $string = mb_convert_encoding($string, 'UTF-8', _CHARSET);
        }

        $postData = array();
        $postData['appid'] = $this->appid;
        $postData['results'] = 'ma';
        $postData['filter'] = '1|2|3|4|5|7|8|9|10';
        $postData['response'] = 'baseform';
        $postData['sentence'] = $string;

        $data = $this->_http(NP_SPAMBAYES_APIURL, 'POST', '', $postData);
        if ($data) {
            $p = new NP_SpamBayes_XMLParser();
            $rawtokens = $p->parse($data);

            // Convert the words back to the site charset.
            if (_CHARSET != 'UTF-8' && is_array($rawtokens)) {
                foreach ($rawtokens as $index => $word) {
                    $rawtokens[$index] = mb_convert_encoding($word, _CHARSET, 'UTF-8');
                }
            }

            // On an error response the first collected text is logged as the
            // error message and no tokens are used.
            if (!empty($p->isError)) {
                ACTIONLOG :: add(WARNING, 'NP_SpamBayes: Y!API Error( '. (isset($rawtokens[0]) ? $rawtokens[0] : 'Unknown Error') . ' )');
                $rawtokens = array();
            }
        }
    } else if (defined('NP_SPAMBAYES_TOKENIZER') && function_exists('proc_open')) {
        // --- external command tokenizer ---
        // The text is sent as a single line with HTML entities decoded.
        $string = preg_replace('/\r|\n/', '', $string);
        $string = strtr($string, array_flip(get_html_translation_table(HTML_SPECIALCHARS)));
        $string = strip_tags($string);

        $dspec = array(
            0 => array("pipe", "r"),
            1 => array("pipe", "w"),
            2 => array("file", "/dev/null", "w")
        );
        $process = proc_open(NP_SPAMBAYES_TOKENIZER, $dspec, $pipes);
        if (is_resource($process)) {
            stream_set_blocking($pipes[0], FALSE);
            stream_set_blocking($pipes[1], FALSE);
            fwrite($pipes[0], $string . "\n");
            fclose($pipes[0]);

            while (!feof($pipes[1])) {
                $line = fgets($pipes[1], 32768);
                if ($line === false) {
                    continue; // nothing readable yet on the non-blocking pipe
                }
                // Expected fields: id, original form, regular form. trim()
                // strips an empty trailing field, so pad back to three.
                $fields = array_pad(explode("\t", trim($line), 3), 3, '');
                $id = $fields[0];
                $origStr = $fields[1];
                $regStr = $fields[2];
                if ((31 <= $id && $id <= 67) || (10 <= $id && $id <= 12)) {
                    $rawtokens[] = trim($regStr ? $regStr : $origStr);
                }
            }
            fclose($pipes[1]);
            proc_close($process);
        }
    } else {
        // --- plain split on non-word characters ---
        $string = $this->_cleanString($string);
        $rawtokens = preg_split('/[\W]+/', $string);
    }

    // remove some tokens
    if (is_array($rawtokens)) {
        foreach ($rawtokens as $token) {
            if ('' == $token) {
                continue;
            }
            $length = mb_strlen($token);
            if ($length < $this->min_token_length || $length > $this->max_token_length) {
                continue;
            }
            if (preg_match('/^[0-9]+$/', $token)) {
                continue;
            }
            if (preg_match('/['.preg_quote('"\':;/\_[](){}!#%&$=+*|~?<>,.-','/').']+/', $token)) {
                continue;
            }
            if (in_array($token, $this->ignore_list, true)) {
                continue;
            }
            // Count occurrences without touching an undefined key.
            if (isset($tokens[$token])) {
                $tokens[$token]++;
            } else {
                $tokens[$token] = 1;
            }
        }
    }

    return $tokens;
} // function _getTokens
/**
 * Perform a minimal HTTP/1.0 request and return the response body.
 *
 * @param string $url     absolute URL; user/pass in the URL are sent as Basic auth
 * @param string $method  request method; "POST" sends $post url-encoded
 * @param string $headers extra raw header lines, each terminated by "\r\n"
 * @param array  $post    name => value pairs for a POST body
 * @return string|null response body, or null when the connection failed
 *                     (the failure is written to the action log)
 */
function _http($url, $method = "GET", $headers = "", $post = array ("")) {
    $URL = parse_url($url);

    $query = isset($URL['query']) ? "?".$URL['query'] : "";
    $path = isset($URL['path']) ? $URL['path'] : "/";
    if (!isset($URL['port'])) {
        $URL['port'] = 80;
    }

    $request = $method." ".$path.$query." HTTP/1.0\r\n";
    $request .= "Host: ".$URL['host']."\r\n";
    $request .= "User-Agent: NP_SpamBayes\r\n";

    if (isset($URL['user']) && isset($URL['pass'])) {
        $request .= "Authorization: Basic ".base64_encode($URL['user'].":".$URL['pass'])."\r\n";
    }

    $request .= $headers;

    if (strtoupper($method) == "POST") {
        $pairs = array();
        foreach ($post as $name => $value) {
            $pairs[] = $name."=".urlencode($value);
        }
        $postdata = implode("&", $pairs);
        $request .= "Content-Type: application/x-www-form-urlencoded\r\n";
        $request .= "Content-Length: ".strlen($postdata)."\r\n";
        $request .= "\r\n";
        $request .= $postdata;
    } else {
        $request .= "\r\n";
    }

    // The former debug dump of the request to /tmp/postdata.dat is removed:
    // it would expose the application id and posted text on disk.

    $fp = fsockopen($URL['host'], $URL['port'], $errno, $errstr, 20);
    if (!$fp) {
        $host = $URL['host'];
        $port = $URL['port'];
        ACTIONLOG :: add(WARNING, 'NP_SpamBayes: HTTP Error: '."[$errno]($host:$port) $errstr");
        return null;
    }

    stream_set_timeout($fp, 20);
    fputs($fp, $request);

    $response = "";
    while (!feof($fp)) {
        $chunk = fgets($fp, 4096);
        if ($chunk === false) {
            // Stop on a read timeout instead of spinning until EOF.
            $meta = stream_get_meta_data($fp);
            if (!empty($meta['timed_out'])) {
                break;
            }
            continue;
        }
        $response .= $chunk;
    }
    fclose($fp);

    // Split headers from body; explode() replaces split(), removed in PHP 7.
    $DATA = explode("\r\n\r\n", $response, 2);
    return isset($DATA[1]) ? $DATA[1] : '';
}
/**
 * Strip diacritics from a string and lower-case it.
 *
 * Each listed single-byte accented character is replaced by its unaccented
 * ASCII letter, then the whole string is lower-cased.
 *
 * @author Antoine Bajolet [phpdig_at_toiletoine.net]
 * @author SPIP [http://uzine.net/spip/]
 *
 * @param string $string string with accents
 * @return string clean string
 */
function _cleanString($string) {
    // Byte values of the accented characters, in the same order as the
    // replacement letters below.
    $codes = array(
        192, 193, 194, 195, 196, 197, // A
        224, 225, 226, 227, 228, 229, // a
        210, 211, 212, 213, 214, 216, // O
        242, 243, 244, 245, 246, 248, // o
        200, 201, 202, 203,           // E
        232, 233, 234, 235,           // e
        199, 231,                     // Cc
        204, 205, 206, 207,           // I
        236, 237, 238, 239,           // i
        217, 218, 219, 220,           // U
        249, 250, 251, 252,           // u
        255, 209, 241                 // yNn
    );
    $accented = '';
    foreach ($codes as $code) {
        $accented .= chr($code);
    }

    $plain = strtr($string, $accented, 'AAAAAAaaaaaaOOOOOOooooooEEEEeeeeCcIIIIiiiiUUUUuuuuyNn');
    return strtolower($plain);
}
432 } // class NaiveBaysian
/**
 * Minimal XML reader for the tokenizer API response.
 *
 * Collects the text of every BASEFORM and MESSAGE element (element names are
 * upper-cased by the parser's default case folding) and flags the response as
 * an error when an ERROR element is seen or the XML is not well formed.
 *
 * NOTE(review): the switch in _open() and the tail of parse() were
 * reconstructed from a damaged listing; confirm against the original.
 */
class NP_SpamBayes_XMLParser {
    /** underlying XML parser handle */
    var $parser;
    /** collected element texts, in document order */
    var $words = array();
    /** kept for backward compatibility; not used by this class */
    var $target = null;
    /** name of the element currently being captured, or false */
    var $inTarget = false;
    /** true once an ERROR element or an XML syntax error was seen */
    var $isError = false;
    /** text accumulated for the element currently being captured */
    var $buffer = '';

    function __construct() {
        $this->parser = xml_parser_create();
        // Array callables bind the handlers to this object directly.
        xml_set_element_handler($this->parser, array($this, "_open"), array($this, "_close"));
        xml_set_character_data_handler($this->parser, array($this, "_cdata"));

        $this->target = null;
        $this->inTarget = false;
    }

    /** PHP 4 style constructor, kept for backward compatibility. */
    function NP_SpamBayes_XMLParser() {
        $this->__construct();
    }

    /**
     * Parse a complete XML document and return the collected texts.
     *
     * The parser is freed afterwards, so an instance can parse only once.
     *
     * @param string $data XML document
     * @return array collected texts; on a syntax error a single error message
     */
    function parse($data) {
        $this->words = array();
        $this->buffer = '';

        if (!xml_parse($this->parser, $data, true)) {
            // Malformed XML: report it the same way as an API error, with the
            // message as the first entry.
            $this->isError = true;
            $this->words = array(
                'XML Parse Error: '.xml_error_string(xml_get_error_code($this->parser))
            );
        }

        xml_parser_free($this->parser);
        return $this->words;
    }

    /** Start-element handler: begin capturing, or flag an error element. */
    function _open($parser, $name, $attribute) {
        switch ($name) {
            case 'BASEFORM':
                $this->inTarget = 'BASEFORM';
                $this->buffer = '';
                break;
            case 'MESSAGE':
                $this->inTarget = 'MESSAGE';
                $this->buffer = '';
                break;
            case 'ERROR':
                $this->isError = true;
                break;
        }
    }

    /**
     * End-element handler: store the captured text and stop capturing.
     *
     * The comparison is against the element being captured; comparing with
     * $this->target (always null) never matched, so capture never stopped.
     */
    function _close($parser, $name) {
        if ($this->inTarget && $name == $this->inTarget) {
            $text = trim($this->buffer);
            if ('' != $text) {
                $this->words[] = $text;
            }
            $this->buffer = '';
            $this->inTarget = false;
        }
    }

    /**
     * Character-data handler. The parser may deliver one element's text in
     * several chunks, so chunks are buffered until the element closes.
     */
    function _cdata($parser, $data) {
        if ($this->inTarget) {
            $this->buffer .= $data;
        }
    }
}
/** Access to the storage of the data for the filter.

To avoid dependency with respect to any database, this class handles all the
access to the data storage. You can provide your own class as long as
all the methods are available. The current one relies on a MySQL database.

Methods used by the filter include:
- array getCategories()
- bool wordExists(string $word)
- array getWord(string $word, string $categoryid)

NOTE(review): closing braces, else branches and several return statements
were reconstructed from a damaged listing; confirm against the original.
NOTE(review): the mysql_* functions used here were removed in PHP 7; they are
kept because no replacement wrapper is visible in this file.
*/
class NaiveBayesianStorage {
    /** table of categories */
    var $table_cat;
    /** table of word frequencies */
    var $table_wf;
    /** table of references (trained documents) */
    var $table_ref;
    /** table of log events */
    var $table_log;
    /** owning plugin; only getOption() is called on it */
    var $plugin;

    function __construct(&$plugin) {
        $this->table_cat = sql_table('plug_sb_cat'); // categories
        $this->table_wf  = sql_table('plug_sb_wf');  // word frequencies
        $this->table_ref = sql_table('plug_sb_ref'); // references
        $this->table_log = sql_table('plug_sb_log'); // logging
        $this->plugin = &$plugin;
    }

    /** PHP 4 style constructor, kept for backward compatibility. */
    function NaiveBayesianStorage(&$plugin) {
        $this->__construct($plugin);
    }

    /** get the list of categories with basic data.
        @return array key = category ids, values = array(keys = 'probability', 'wordcount');
                when the query fails, array(0 => 'No categories found')
    */
    function getCategories() {
        $categories = array();
        $rs = sql_query('SELECT * FROM '.$this->table_cat);
        if ($rs) {
            while ($row = mysql_fetch_array($rs)) {
                $categories[$row['catcode']] = array('probability' => $row['probability'], 'wordcount' => $row['wordcount'] );
            }
        } else {
            $categories[0] = 'No categories found';
        }
        return $categories;
    }

    /** see if the word is an already learnt word.
        @return bool
        @param string word
    */
    function wordExists($word) {
        $rs = sql_query("SELECT count(*) as amount FROM ".$this->table_wf." WHERE word='". mysql_real_escape_string($word)."'");
        $obj = $rs ? mysql_fetch_object($rs) : false;
        return ($obj && $obj->amount != 0);
    }

    /** get details of a word in a category.
        @return array ('wordcount' => count, 'catcode' => category, 'word' => word);
                wordcount is 0 when the word is not stored for the category
        @param string word
        @param string category id
    */
    function getWord($word, $catcode){
        $details = array();
        $rs = sql_query("SELECT * FROM ".$this->table_wf." WHERE word='".mysql_real_escape_string($word)."' AND catcode='".mysql_real_escape_string($catcode)."'");
        $obj = $rs ? mysql_fetch_object($rs) : false;
        if ($obj) {
            $details['wordcount'] = $obj->wordcount;
            $details['catcode']   = $obj->catcode;
            $details['word']      = $obj->word;
        } else {
            $details['wordcount'] = 0;
            $details['catcode']   = $catcode;
            $details['word']      = $word;
        }
        return $details;
    }

    /** update a word in a category.
        If the word is new in this category it is added, else only the count is updated.
        @return mixed result of sql_query()
        @param string word
        @param int count to add
        @param string category id
    */
    function updateWord($word, $wordcount, $catcode) {
        $wordcount = (int) $wordcount;
        $oldword = $this->getWord($word, $catcode);
        if (0 == $oldword['wordcount']) {
            return sql_query("INSERT INTO ".$this->table_wf." (word, catcode, wordcount) VALUES ('".mysql_real_escape_string($word)."','".mysql_real_escape_string($catcode)."','".$wordcount."')");
        }
        return sql_query("UPDATE ".$this->table_wf." SET wordcount = wordcount +".$wordcount." WHERE catcode = '".mysql_real_escape_string($catcode)."' AND word = '".mysql_real_escape_string($word)."'");
    } // function updateWord

    /** remove a word from a category.
        The row is deleted when the count would drop to zero or below,
        otherwise the count is decreased.
        @return mixed result of sql_query()
        @param string word
        @param int count to subtract
        @param string category id
    */
    function removeWord($word, $wordcount, $catcode) {
        $wordcount = (int) $wordcount;
        $oldword = $this->getWord($word, $catcode);
        if (0 != $oldword['wordcount'] && 0 >= ($oldword['wordcount'] - $wordcount)) {
            return sql_query("DELETE FROM ".$this->table_wf." WHERE word='".mysql_real_escape_string($word)."' AND catcode ='".mysql_real_escape_string($catcode)."'");
        }
        return sql_query("UPDATE ".$this->table_wf." SET wordcount = wordcount - ".$wordcount." WHERE catcode = '".mysql_real_escape_string($catcode)."' AND word = '".mysql_real_escape_string($word)."'");
    } // function removeWord

    /** update the probabilities of the categories and word count.
        This function must be run after a set of training.
        @return bool true -- NOTE(review): return value reconstructed, confirm
    */
    function updateProbabilities() {
        // Word count per category, read once and reused for both steps.
        $totals = array();
        $total_words = 0;
        $rs = sql_query("SELECT catcode, SUM(wordcount) AS total FROM ".$this->table_wf." WHERE 1 GROUP BY catcode");
        while ($rs && ($obj = mysql_fetch_object($rs))) {
            $totals[$obj->catcode] = $obj->total;
            $total_words += $obj->total;
        }

        if ($total_words == 0) {
            sql_query("UPDATE ".$this->table_cat." SET wordcount = 0, probability = 0 WHERE 1");
        } else {
            foreach ($totals as $catcode => $total) {
                // %F is locale independent, so the decimal separator is
                // always a dot inside the SQL statement.
                $proba = sprintf('%.10F', $total / $total_words);
                sql_query("UPDATE ".$this->table_cat." SET wordcount=".(int)$total.", probability=".$proba." WHERE catcode = '".mysql_real_escape_string($catcode)."'");
            }
        }
        return true;
    } // updateProbabilities

    /** save a reference in the database.
        @return mixed result of sql_query()
        @param string reference id, must be unique
        @param string category id
        @param string content of the reference
    */
    function saveReference($ref, $catcode, $content) {
        return sql_query("INSERT INTO ".$this->table_ref." (ref, catcode, content) VALUES (".intval($ref).", '".mysql_real_escape_string($catcode)."','".mysql_real_escape_string($content)."')");
    } // function saveReference

    /** get a reference from the database.
        @return array reference( catcode => ...., content => ....); an empty
                array or false when nothing is found
        @param string reference id
    */
    function getReference($ref) {
        $reference = array();
        $rs = sql_query("SELECT * FROM ".$this->table_ref." WHERE ref=".intval($ref));
        if ($rs) {
            $reference = mysql_fetch_array($rs);
        }
        return $reference;
    }

    /** remove a reference from the database
        @return mixed result of sql_query()
        @param string reference id
    */
    function removeReference($ref) {
        return sql_query("DELETE FROM ".$this->table_ref." WHERE ref=".intval($ref));
    }

    /** next free document id in the range starting at 500000000.
        NOTE(review): the value returned when no such reference exists was
        reconstructed as 500000000; confirm.
        @return int
    */
    function nextdocid() {
        $res = sql_query ("select ref from ".$this->table_ref." where ref >= 500000000 order by ref desc limit 0,1");
        $obj = $res ? mysql_fetch_object($res) : false;
        if ($obj) {
            return $obj->ref + 1;
        }
        return 500000000;
    }

    /** write a log entry when the plugin option 'enableLogging' is 'yes'.
        @param string log text
        @param string content the entry refers to
        @param string category id
    */
    function logevent($log,$content,$catcode) {
        if ($this->plugin->getOption('enableLogging') == 'yes') {
            if (isset($log) && isset($content)) {
                sql_query("insert into ".$this->table_log." (log,content,catcode) values ('".mysql_real_escape_string($log)."','".mysql_real_escape_string($content)."','".mysql_real_escape_string($catcode)."')");
            }
        }
    }

    /** build the " where ..." clause shared by the log queries.
        @return string clause with a leading space, or '' when nothing is filtered
        @param string category filter, 'all' for none
        @param string log type prefix filter, 'all' for none
        @param string keyword searched in the content, '' for none
    */
    function _logFilterSql($filter, $filtertype, $keyword) {
        $conditions = array();
        if ($filter != 'all') {
            $conditions[] = "catcode = '".mysql_real_escape_string($filter)."'";
        }
        if ($filtertype != 'all') {
            $conditions[] = "log like '".mysql_real_escape_string($filtertype)."%'";
        }
        if ($keyword != '') {
            $conditions[] = "content like '%".mysql_real_escape_string($keyword)."%'";
        }
        if (0 == count($conditions)) {
            return '';
        }
        return ' where '.implode(' and ', $conditions);
    }

    /** delete log entries matching the filters.
        When the request parameter 'amount' is 'cp' only the newest $ipp
        matching entries (the current page) are deleted.
        @return mixed result of sql_query()
    */
    function clearlog($filter = 'all', $filtertype = 'all', $keyword = '', $ipp = 10) {
        $query = 'delete from '.$this->table_log.$this->_logFilterSql($filter, $filtertype, $keyword);
        if (isset($_REQUEST['amount']) && $_REQUEST['amount'] == 'cp') { //only current page?
            $query .= ' order by logtime desc limit '.intval($ipp);
        }
        return sql_query($query);
    } // function clearlog

    /** fetch one page of log entries, newest first.
        @return mixed result of sql_query()
        @param int offset of the first row
        @param string category filter, 'all' for none
        @param string log type prefix filter, 'all' for none
        @param string keyword searched in the content
        @param int rows per page
    */
    function getlogtable($startpos, $filter = 'all', $filtertype = 'all', $keyword = '', $ipp = 10) {
        $query = 'select * from '.$this->table_log.$this->_logFilterSql($filter, $filtertype, $keyword);
        $query .= ' order by logtime desc limit '.intval($startpos).','.intval($ipp);
        return sql_query($query);
    } // function getlogtable

    /** count the log entries matching the filters.
        @return mixed the count as returned by the database
    */
    function countlogtable($filter = 'all', $filtertype = 'all', $keyword = '') {
        $query = 'select count(*) as total from '.$this->table_log.$this->_logFilterSql($filter, $filtertype, $keyword);
        $res = sql_query($query);
        $arr = mysql_fetch_array($res);
        return $arr['total'];
    }

    /** list the distinct log types (first two space-separated words of each log text).
        @return array
    */
    function getlogtypes() {
        $query = "select distinct(substring_index(log,' ', 2)) as logtype from ".$this->table_log;
        $logtypes = array();
        $res = sql_query($query);
        while ($res && ($arr = mysql_fetch_array($res))) {
            $logtypes[] = $arr['logtype'];
        }
        return $logtypes;
    }

    /** fetch ten references with ref >= 1000000, highest ref first.
        @return mixed result of sql_query()
        @param int offset of the first row
    */
    function getreftable($startpos) {
        $query = 'select * from '.$this->table_ref.' where ref >= 1000000 order by ref desc limit '.intval($startpos).',10';
        return sql_query($query);
    }

    /** fetch one log entry.
        @return mixed row array, or false when not found
        @param int log entry id
    */
    function getLogevent($id) {
        // The id is forced to an integer before it reaches the SQL text.
        $query = 'select * from '.$this->table_log.' where id = '.intval($id);
        $res = sql_query($query);
        return mysql_fetch_array($res);
    }

    /** delete one log entry.
        @return mixed result of sql_query()
        @param int log entry id
    */
    function removeLogevent($id) {
        $query = ' delete from '.$this->table_log.' where id = '.intval($id);
        $res = sql_query($query);
        return $res;
    }

    /** count the references with ref >= 1000000.
        @return mixed the count as returned by the database
    */
    function countreftable() {
        $query = 'select count(*) as total from '.$this->table_ref.' where ref >= 1000000';
        $res = sql_query($query);
        $arr = mysql_fetch_array($res);
        return $arr['total'];
    }

} // class NaiveBayesianStorage