4 * Modified by hsur ( http://blog.cles.jp/np_cles )
5 * $Id: spambayes.php,v 1.6 2008-05-03 22:38:17 hsur Exp $
7 ***** BEGIN LICENSE BLOCK *****
8 This file is part of PHP Naive Bayesian Filter.
9 The Initial Developer of the Original Code is
10 Loic d'Anterroches [loic_at_xhtml.net].
11 Portions created by the Initial Developer are Copyright (C) 2003
12 the Initial Developer. All Rights Reserved.
14 PHP Naive Bayesian Filter is free software; you can redistribute it
15 and/or modify it under the terms of the GNU General Public License as
16 published by the Free Software Foundation; either version 2 of
17 the License, or (at your option) any later version.
19 PHP Naive Bayesian Filter is distributed in the hope that it will
20 be useful, but WITHOUT ANY WARRANTY; without even the implied
21 warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22 See the GNU General Public License for more details.
24 You should have received a copy of the GNU General Public License
25 along with Foobar; if not, write to the Free Software
26 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
28 Alternatively, the contents of this file may be used under the terms of
29 the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
30 in which case the provisions of the LGPL are applicable instead
32 ***** END LICENSE BLOCK ******/
34 //define('NP_SPAMBAYES_TOKENIZER', '/usr/local/bin/mecab -F "%h\t%m\t%f[6]\n" -E ""');
35 define('NP_SPAMBAYES_APIURL', 'http://api.jlp.yahoo.co.jp/MAService/V1/parse');
37 require_once(dirname(__FILE__).'/../sharedlibs/sharedlibs.php');
38 require_once('cles/AsyncHTTP.php');
41 /** Minimum length (as measured by mb_strlen in _getTokens()) for a token to be kept. */
42 var $min_token_length = 2;
43 /** Maximum length (as measured by mb_strlen in _getTokens()) for a token to be kept. */
44 var $max_token_length = 40;
45 /** Tokens to ignore; starts empty and is filled from getIgnoreList() by _getTokens(). @see getIgnoreList() */
46 var $ignore_list = array();
/**
 * Build the filter for the given plugin instance.
 *
 * Creates the storage backend, keeps a reference to the owning plugin and
 * reads the plugin's 'appid' option (used by _getTokens() for the web API).
 *
 * @param object $parent owning plugin; must provide getOption()
 */
function __construct(&$parent) {
    // The storage constructor already declares its parameter by reference,
    // so no call-time "&" is needed (call-time pass-by-reference is a fatal
    // error since PHP 5.4).
    $this->nbs = new NaiveBayesianStorage($parent);
    $this->parent = &$parent;

    $this->appid = $this->parent->getOption('appid');
}

/**
 * PHP 4 style constructor, kept so that explicit calls by the old name
 * keep working. PHP 8 no longer treats it as a constructor, hence
 * __construct() above.
 *
 * @param object $parent owning plugin
 */
function NaiveBayesian(&$parent) {
    $this->__construct($parent);
}
/**
 * Categorize a document.
 *
 * Gets the list of categories in which the document can be categorized,
 * with a score for each category.
 *
 * @param string $document text to classify
 * @return array keys = category ids, values = scores as returned by _rescale();
 *               empty when the storage reports no usable category
 */
function categorize($document) {
    $scores = array();
    $categories = $this->nbs->getCategories();
    $tokens = $this->_getTokens($document);

    // Total trained words and number of categories. Entries that are not
    // arrays (the storage returns a plain string when its query fails) are
    // ignored instead of being indexed like a category row.
    $total_words = 0;
    $ncat = 0;
    foreach ($categories as $category => $data) {
        if (!is_array($data)) {
            continue;
        }
        $total_words += $data['wordcount'];
        $ncat++;
    }
    if ($ncat == 0) {
        return $this->_rescale($scores);
    }

    // wordExists() does not depend on the category: ask once per token.
    $known = array();

    // calculate the score in each category
    foreach ($categories as $category => $data) {
        if (!is_array($data)) {
            continue;
        }
        $scores[$category] = $data['probability'];
        $has_words = ($data['wordcount'] > 0);
        // small probability for a word not in the category
        // (0.0 for a category without any word, instead of dividing by zero)
        $small_proba = $has_words ? 1.0 / ($data['wordcount'] * 2) : 0.0;

        foreach ($tokens as $token => $count) {
            if (!isset($known[$token])) {
                $known[$token] = (bool) $this->nbs->wordExists($token);
            }
            if (!$known[$token]) {
                continue;
            }
            $word = $this->nbs->getWord($token, $category);
            if ($word['wordcount'] && $has_words) {
                $proba = $word['wordcount'] / $data['wordcount'];
            } else {
                $proba = $small_proba;
            }
            $newval = $scores[$category] * pow($proba, $count) * pow($total_words / $ncat, $count);
            // keep the previous score when the product overflows
            if (is_finite($newval)) {
                $scores[$category] = $newval;
            }
        }
    }
    return $this->_rescale($scores);
} // function categorize
/**
 * Echo an HTML explanation of how a text is scored.
 *
 * Tokenizes $content, runs the same per-category scoring arithmetic as
 * categorize(), records for every token known to the storage its stored
 * word count per category, and echoes one row per token followed by a row
 * with the rescaled 'ham' and 'spam' scores.
 *
 * NOTE(review): this loop style relies on each(), which no longer exists
 * in PHP 8 - confirm the PHP version this file has to run on.
 * NOTE(review): token text is echoed without HTML escaping; _getTokens()
 * appears to reject tokens containing markup characters - verify.
 *
 * @param string $content text to explain
 */
104 function explain($content) {
105 $categories = $this->nbs->getCategories(); // ham, spam
107 $tokens = $this->_getTokens($content);
108 // calculate the score in each category
111 while (list($category, $data) = each($categories)) {
112 $total_words += $data['wordcount'];
117 while (list($category, $data) = each($categories)) {
// start from the category's stored probability
118 $scores[$category] = $data['probability'];
119 //debug: echo $category.'<br />';
// fallback probability for a word that has no count in this category;
// NOTE(review): divides by zero when the category's wordcount is 0
120 $small_proba = 1.0 / ($data['wordcount'] * 2);
123 while (list($token, $count) = each($tokens)) {
125 //echo "<br/>$token; $count ";
126 if ($this->nbs->wordExists($token)) {
127 $word = $this->nbs->getWord($token, $category);
// remember the stored count so it can be shown in the table below
128 $result[$word['word']][$category] = $word['wordcount'];
131 if ($word['wordcount']) $proba = $word['wordcount']/$data['wordcount'];
132 else $proba = $small_proba;
133 $newval = $scores[$category] * pow($proba, $count)*pow($total_words/$ncat, $count);
// only accept the new score while it is still a finite number
134 if (is_finite($newval)) {
135 $scores[$category] = $newval;
140 $scores = $this->_rescale($scores);
141 array_multisort($result, SORT_DESC);
144 echo '<tr><th>word</th><th>Ham</th><th>Spam</th></tr>';
145 foreach($result as $key => $value) {
147 echo '<td>'.$key.'</td>';
148 echo '<td>'.$value['ham'].'</td>';
149 echo '<td>'.$value['spam'].'</td>';
// last row: the rescaled scores (the Japanese label reads "adjusted score")
152 echo '<tr><td>調整後のスコア:</td><th>'.$scores['ham'].'</th><th>'.$scores['spam'].'</th></tr>';
154 //debug: print_r ($scores);
/**
 * Train the filter against a document.
 *
 * Adds the document's token counts to the given category and saves the
 * document as a reference. After a set of trainings is done,
 * updateProbabilities() must be run.
 *
 * @see updateProbabilities()
 *
 * @param string $doc_id      document id, must be unique
 * @param string $category_id id of the category the document belongs to
 * @param string $content     content of the document
 * @return bool always true
 */
function train($doc_id, $category_id, $content) {
    $tokens = $this->_getTokens($content);
    // foreach instead of each(): each() was removed in PHP 8
    foreach ($tokens as $token => $count) {
        $this->nbs->updateWord($token, $count, $category_id);
    }
    $this->nbs->saveReference($doc_id, $category_id, $content);
    return true;
}
/**
 * Train against a document only when no reference is stored for its id yet.
 *
 * @param string $doc_id      document id
 * @param string $category_id id of the category the document belongs to
 * @param string $content     content of the document
 */
function trainnew($doc_id, $category_id, $content) {
    $existing = $this->nbs->getReference($doc_id);
    if ($existing) {
        // already trained: nothing to do
        return;
    }
    $this->train($doc_id, $category_id, $content);
}
/**
 * Untrain a document.
 *
 * Removes one document from the references and subtracts its token counts
 * from the category it was trained in. updateProbabilities() must be run
 * afterwards.
 *
 * @see updateProbabilities()
 *
 * @param string $doc_id document id, must be unique
 * @return bool always true
 */
function untrain($doc_id) {
    $ref = $this->nbs->getReference($doc_id);
    if (!$ref) {
        // No stored reference: there are no counts to subtract, and the
        // tokenizer must not be run on a missing content.
        return true;
    }
    $tokens = $this->_getTokens($ref['content']);
    // foreach instead of each(): each() was removed in PHP 8
    foreach ($tokens as $token => $count) {
        $this->nbs->removeWord($token, $count, $ref['catcode']);
    }
    $this->nbs->removeReference($doc_id);
    return true;
} // function untrain
/**
 * Rescale the results between 0 and 1.
 *
 * Every score is shifted by the largest score (never less than 0),
 * exponentiated, and the resulting vector is divided by its Euclidean norm.
 *
 * @author Ken Williams, ken@mathforum.org
 *
 * @param array $scores keys => category, values => scores
 * @return array normalized scores (keys => category, values => scores)
 */
function _rescale($scores) {
    // Scale everything back to a reasonable area in
    // logspace (near zero), un-loggify, and normalize
    $max = 0.0;
    foreach ($scores as $score) {
        if ($score >= $max) {
            $max = $score;
        }
    }

    $total = 0.0;
    foreach ($scores as $cat => $score) {
        $scores[$cat] = (float) exp($score - $max);
        $total += (float) pow($scores[$cat], 2);
    }
    $total = (float) sqrt($total);

    // $total is 0 only when every exponential underflowed; leave the
    // (all zero) values alone rather than dividing by zero.
    if ($total > 0) {
        foreach (array_keys($scores) as $cat) {
            $scores[$cat] = (float) $scores[$cat] / $total;
        }
    }
    return $scores;
} // function _rescale
/**
 * Refresh the per-category probabilities and word counts.
 *
 * Has to be called once a batch of train()/untrain() calls is finished.
 * The work is pure data-store manipulation, so it is delegated to the
 * NaiveBayesianStorage instance.
 *
 * @return mixed whatever the storage's updateProbabilities() returns
 */
function updateProbabilities() {
    $outcome = $this->nbs->updateProbabilities();
    return $outcome;
} // function updateProbabilities
/**
 * Read the tokens to ignore from the plugin's 'ignorelist' option.
 *
 * Commas and single spaces both act as separators; empty entries are kept.
 *
 * @return array ignore list
 */
function getIgnoreList() {
    $option = $this->parent->getOption('ignorelist');
    // turning every comma into a space first lets a single split do the job
    $spaced = str_replace(',', ' ', $option);
    return explode(' ', $spaced);
}
/**
 * Get the tokens from a string.
 *
 * Tokenizer selection, in this order:
 *  - the web API at NP_SPAMBAYES_APIURL when the constant is defined and an
 *    'appid' is configured;
 *  - the external command NP_SPAMBAYES_TOKENIZER when the constant is
 *    defined and proc_open() is available;
 *  - otherwise _cleanString() followed by a split on non-word characters.
 * The raw tokens are then filtered (length, digits only, punctuation,
 * ignore list) and counted.
 *
 * @author James Seng. [http://james.seng.cc/] (based on his perl version)
 *
 * @param string $string the string to get the tokens from
 * @return array keys = token, values = number of occurrences
 */
function _getTokens($string) {
    $rawtokens = array();
    $tokens = array();

    // Load the ignore list on first use. The previous test,
    // count(0 >= $this->ignore_list), counted the result of a comparison:
    // always true before PHP 8 and a TypeError since.
    if (0 >= count($this->ignore_list))
        $this->ignore_list = $this->getIgnoreList();

    $string = strip_tags($string);

    if( defined('NP_SPAMBAYES_APIURL') && $this->appid ){
        // the API is called with UTF-8 text
        if( _CHARSET != 'UTF-8' )
            $string = mb_convert_encoding($string, 'UTF-8', _CHARSET);

        $postData = array();
        $postData['appid'] = $this->appid;
        $postData['results'] = 'ma';
        $postData['filter'] = '1|2|3|4|5|7|8|9|10';
        $postData['response'] = 'baseform';
        $postData['sentence'] = $string;

        $ahttp = new cles_AsyncHTTP();
        $ahttp->asyncMode = false;
        $ahttp->userAgent = 'NP_SpamBayesJP';
        $ahttp->setRequest(NP_SPAMBAYES_APIURL, 'POST', '', $postData);
        list($data) = $ahttp->getResponses();

        if( $data ){
            $p = new NP_SpamBayes_XMLParser();
            $rawtokens = $p->parse($data);

            if( !$p->isError ){
                // convert the words back to the site charset
                if( _CHARSET != 'UTF-8' ){
                    if( is_array($rawtokens) ) foreach( $rawtokens as $index => $word ){
                        $rawtokens[$index] = mb_convert_encoding($word, _CHARSET, 'UTF-8');
                    }
                }
            } else {
                // on error the parser result carries the message, not tokens
                ACTIONLOG :: add(WARNING, 'NP_SpamBayes: Y!API Error( '. (isset($rawtokens[0]) ? $rawtokens[0] : 'Unknown Error') . ' )');
                $rawtokens = array();
            }
        } else {
            ACTIONLOG :: add(WARNING, 'NP_SpamBayes: AsyncHTTP Error['.$ahttp->getErrorNo(0).']'.$ahttp->getError(0));
        }
    } else if( defined('NP_SPAMBAYES_TOKENIZER') && function_exists('proc_open') ) {
        // 'proc_open' must be a string: the bare word was an undefined
        // constant (an Error in PHP 8).
        $string = preg_replace('/\r|\n/', '', $string);
        $string = strtr($string, array_flip(get_html_translation_table(HTML_SPECIALCHARS)));
        $string = strip_tags($string);

        $dspec = array(
            0 => array("pipe", "r"),
            1 => array("pipe", "w"),
            2 => array("file", "/dev/null", "w")
        );
        $process = proc_open(NP_SPAMBAYES_TOKENIZER, $dspec, $pipes);
        if(is_resource($process)) {
            stream_set_blocking($pipes[0], FALSE);
            stream_set_blocking($pipes[1], FALSE);
            fwrite($pipes[0], $string . "\n");
            fclose($pipes[0]);
            while(!feof($pipes[1])) {
                // pad so that short or empty reads do not leave list() without values
                list($id, $origStr, $regStr) = array_pad(explode("\t", trim(fgets($pipes[1], 32768)), 3), 3, '');
                if( ( 31 <= $id && $id <= 67 ) || ( 10 <= $id && $id <= 12 ) )
                    $rawtokens[] = trim($regStr ? $regStr : $origStr);
            }
            fclose($pipes[1]);
            proc_close($process);
        }
    } else {
        $string = $this->_cleanString($string);
        $rawtokens = preg_split('/[\W]+/', $string);
    }

    // remove some tokens
    if( is_array($rawtokens) ) foreach($rawtokens as $token) {
        if (!(('' == $token) ||
              (mb_strlen($token) < $this->min_token_length) ||
              (mb_strlen($token) > $this->max_token_length) ||
              (preg_match('/^[0-9]+$/', $token)) ||
              (preg_match('/['.preg_quote('"\':;/\_[](){}!#%&$=+*|~?<>,.-','/').']+/', $token)) ||
              (in_array($token, $this->ignore_list))
           )) {
            // count without touching an undefined key
            if (isset($tokens[$token])) {
                $tokens[$token]++;
            } else {
                $tokens[$token] = 1;
            }
        }
    }
    return $tokens;
} // function _getTokens
/**
 * Strip the diacritics from a string and lower-case it.
 *
 * Works byte by byte: each listed byte value is replaced by the unaccented
 * letter at the same position in the replacement string.
 *
 * @author Antoine Bajolet [phpdig_at_toiletoine.net]
 * @author SPIP [http://uzine.net/spip/]
 *
 * @param string $string string with accents
 * @return string clean string
 */
function _cleanString($string) {
    $codes = array(
        192, 193, 194, 195, 196, 197,   // A
        224, 225, 226, 227, 228, 229,   // a
        210, 211, 212, 213, 214, 216,   // O
        242, 243, 244, 245, 246, 248,   // o
        200, 201, 202, 203,             // E
        232, 233, 234, 235,             // e
        199, 231,                       // C c
        204, 205, 206, 207,             // I
        236, 237, 238, 239,             // i
        217, 218, 219, 220,             // U
        249, 250, 251, 252,             // u
        255, 209, 241,                  // y N n
    );
    $accented = implode('', array_map('chr', $codes));
    $plain = 'AAAAAAaaaaaaOOOOOOooooooEEEEeeeeCcIIIIiiiiUUUUuuuuyNn';
    $unaccented = strtr($string, $accented, $plain);
    return strtolower($unaccented);
}
381 } // class NaiveBaysian
383 class NP_SpamBayes_XMLParser {
/**
 * Create the UTF-8 XML parser and register this object's _open(),
 * _close() and _cdata() methods as its handlers.
 */
function __construct(){
    $this->parser = xml_parser_create('UTF-8');
    xml_set_object($this->parser, $this);
    xml_set_element_handler($this->parser, "_open", "_close");
    xml_set_character_data_handler($this->parser, "_cdata");

    $this->isError = false;
    $this->inTarget = false;
}

/**
 * PHP 4 style constructor, kept so that explicit calls by the old name
 * keep working. PHP 8 no longer treats it as a constructor, hence
 * __construct() above.
 */
function NP_SpamBayes_XMLParser(){
    $this->__construct();
}
/**
 * Parse an XML document and collect the character data of the elements
 * selected by _open() into $this->words.
 *
 * When the XML is not well formed, isError is set and $this->words is
 * replaced by a single entry holding the parser's error description.
 *
 * NOTE(review): the return statement is not visible in this extract; the
 * caller in _getTokens() uses the result as the word list - confirm that
 * $this->words is returned.
 *
 * @param string $data XML text
 */
394 function parse($data){
395 $this->words = array();
396 xml_parse($this->parser, $data);
397 $errcode = xml_get_error_code($this->parser);
398 if ( $errcode != XML_ERROR_NONE ) {
399 $this->isError = true;
// drop anything collected before the failure; report only the error text
400 $this->words = array();
401 $this->words[] = 'XML Parse Error: ' . xml_error_string($errcode) . ' in '. xml_get_current_line_number($this->parser);
// NOTE(review): lines are missing before this call, so it is unclear whether
// the parser is released here or in a separate method - verify in the full file
407 xml_parser_free($this->parser);
/**
 * Element-start handler registered in the constructor.
 *
 * Sets $this->inTarget to 'BASEFORM' or 'MESSAGE', or sets isError.
 * NOTE(review): the conditions selecting each assignment are not visible
 * in this extract; presumably they switch on $name - verify.
 *
 * @param resource $parser    the XML parser
 * @param string   $name      element name
 * @param array    $attribute element attributes (not used in the visible lines)
 */
411 function _open($parser, $name, $attribute){
414 $this->inTarget = 'BASEFORM';
417 $this->inTarget = 'MESSAGE';
420 $this->isError = true;
/**
 * Element-end handler: stop collecting character data once the element
 * currently being collected is closed.
 *
 * @param resource $parser the XML parser
 * @param string   $name   name of the element being closed
 */
function _close($parser, $name){
    $closesTarget = ($name == $this->inTarget);
    if( $closesTarget ){
        $this->inTarget = null;
    }
}
/**
 * Character-data handler: while inside a target element, append the
 * trimmed text to $this->words.
 *
 * @param resource $parser the XML parser
 * @param string   $data   character data
 */
function _cdata($parser, $data){
    if( !$this->inTarget ){
        return;
    }
    $text = trim($data);
    $this->words[] = $text;
}
436 /** Access to the storage of the data for the filter.
438 To avoid dependency with respect to any database, this class handle all the
439 access to the data storage. You can provide your own class as long as
440 all the methods are available. The current one rely on a MySQL database.
443 - array getCategories()
444 - bool wordExists(string $word)
445 - array getWord(string $word, string $categoryid)
448 class NaiveBayesianStorage {
/**
 * Resolve the table names used by the storage and keep a reference to the
 * owning plugin.
 *
 * @param object $plugin owning plugin (its getOption() is used by logevent())
 */
function __construct(&$plugin) {
    $this->table_cat = sql_table('plug_sb_cat'); // categories
    $this->table_wf = sql_table('plug_sb_wf'); // word frequencies
    $this->table_ref = sql_table('plug_sb_ref'); // references
    $this->table_log = sql_table('plug_sb_log'); // logging
    $this->plugin = &$plugin;
}

/**
 * PHP 4 style constructor, kept so that explicit calls by the old name
 * keep working. PHP 8 no longer treats it as a constructor, hence
 * __construct() above.
 *
 * @param object $plugin owning plugin
 */
function NaiveBayesianStorage(&$plugin) {
    $this->__construct($plugin);
}
/**
 * Fetch every category together with its basic data.
 *
 * @return array key = category code, value = array('probability' => ..., 'wordcount' => ...);
 *               array(0 => 'No categories found') when the query fails
 */
function getCategories() {
    $rs = sql_query('SELECT * FROM '.$this->table_cat);
    if (!$rs) {
        return array(0 => 'No categories found');
    }

    $categories = array();
    while ($row = mysql_fetch_array($rs)) {
        $code = $row['catcode'];
        $categories[$code] = array(
            'probability' => $row['probability'],
            'wordcount' => $row['wordcount'],
        );
    }
    return $categories;
}
/**
 * Tell whether a word has already been learnt, in any category.
 *
 * @param string $word word to look up
 * @return bool true when at least one row exists for the word
 */
function wordExists($word) {
    $sql = "SELECT count(*) as amount FROM ".$this->table_wf." WHERE word='". mysql_real_escape_string($word)."'";
    $obj = mysql_fetch_object(sql_query($sql));
    return !($obj->amount == 0);
}
/**
 * Look up a word inside one category.
 *
 * @param string $word    word to look up
 * @param string $catcode category id
 * @return array ('wordcount' => ..., 'catcode' => ..., 'word' => ...);
 *               wordcount is 0 when the word is unknown in that category
 */
function getWord($word, $catcode){
    $sql = "SELECT * FROM ".$this->table_wf." WHERE word='".mysql_real_escape_string($word)."' AND catcode='".mysql_real_escape_string($catcode)."'";
    $obj = mysql_fetch_object(sql_query($sql));

    if (!$obj) {
        // no row: echo the request back with a zero count
        return array(
            'wordcount' => 0,
            'catcode' => $catcode,
            'word' => $word,
        );
    }
    return array(
        'wordcount' => $obj->wordcount,
        'catcode' => $obj->catcode,
        'word' => $obj->word,
    );
}
/**
 * Add occurrences of a word to a category.
 *
 * A word without a count in the category gets a new row; otherwise the
 * existing count is increased.
 *
 * @param string $word      word
 * @param int    $wordcount number of occurrences to add
 * @param string $catcode   category id
 * @return mixed result of sql_query()
 */
function updateWord($word, $wordcount, $catcode) {
    $current = $this->getWord($word, $catcode);
    $isNew = (0 == $current['wordcount']);

    if ($isNew) {
        $sql = "INSERT INTO ".$this->table_wf." (word, catcode, wordcount) VALUES ('".mysql_real_escape_string($word)."','".mysql_real_escape_string($catcode)."','".mysql_real_escape_string((int)$wordcount)."')";
    } else {
        $sql = "UPDATE ".$this->table_wf." SET wordcount = wordcount +".(int)$wordcount." WHERE catcode = '".mysql_real_escape_string($catcode)."' AND word = '".mysql_real_escape_string($word)."'";
    }
    return sql_query($sql);
} // function updateWord
/**
 * Subtract occurrences of a word from a category.
 *
 * The row is deleted when the stored count would drop to zero or below;
 * otherwise the count is decreased.
 *
 * @param string $word      word
 * @param int    $wordcount number of occurrences to subtract
 * @param string $catcode   category id
 * @return mixed result of sql_query()
 */
function removeWord($word, $wordcount, $catcode) {
    $current = $this->getWord($word, $catcode);
    $stored = $current['wordcount'];
    $exhausted = (0 != $stored) && (0 >= ($stored - $wordcount));

    if ($exhausted) {
        $sql = "DELETE FROM ".$this->table_wf." WHERE word='".mysql_real_escape_string($word)."' AND catcode ='".mysql_real_escape_string($catcode)."'";
    } else {
        $sql = "UPDATE ".$this->table_wf." SET wordcount = wordcount - ".(int)$wordcount." WHERE catcode = '".mysql_real_escape_string($catcode)."' AND word = '".mysql_real_escape_string($word)."'";
    }
    return sql_query($sql);
} // function removeWord
/**
 * Update the probabilities of the categories and their word counts.
 *
 * This function must be run after a set of training. Each category that
 * has words gets wordcount = its number of words and
 * probability = its share of all words; when there are no words at all,
 * every category is reset to 0.
 *
 * @return bool always true
 */
function updateProbabilities() {
    // Read the per-category totals once and reuse them for both the grand
    // total and the per-category updates.
    $totals = array();
    $total_words = 0;
    $rs = sql_query("SELECT catcode, SUM(wordcount) AS total FROM ".$this->table_wf." WHERE 1 GROUP BY catcode");
    if ($rs) {
        while ($obj = mysql_fetch_object($rs)) {
            $totals[$obj->catcode] = $obj->total;
            $total_words += $obj->total;
        }
    }

    if ($total_words == 0) {
        sql_query("UPDATE ".$this->table_cat." SET wordcount = 0, probability = 0 WHERE 1");
    } else {
        foreach ($totals as $catcode => $total) {
            // %F is not locale aware, so the SQL always gets a decimal point
            $proba = sprintf('%.14F', $total / $total_words);
            sql_query("UPDATE ".$this->table_cat." SET wordcount=".(int)$total.", probability=".$proba." WHERE catcode = '".mysql_real_escape_string($catcode)."'");
        }
    }
    return true;
} // updateProbabilities
/**
 * Store a trained document as a reference.
 *
 * @param string $ref     reference id, must be unique (stored as integer)
 * @param string $catcode category id
 * @param string $content content of the reference
 * @return mixed result of sql_query()
 */
function saveReference($ref, $catcode, $content) {
    $sql = sprintf(
        "INSERT INTO %s (ref, catcode, content) VALUES (%s, '%s','%s')",
        $this->table_ref,
        intval($ref),
        mysql_real_escape_string($catcode),
        mysql_real_escape_string($content)
    );
    return sql_query($sql);
} // function saveReference
/**
 * Load a stored reference.
 *
 * @param string $ref reference id
 * @return mixed the row as returned by mysql_fetch_array() (catcode, content, ...),
 *               or an empty array when the query fails
 */
function getReference($ref) {
    $rs = sql_query("SELECT * FROM ".$this->table_ref." WHERE ref=".intval($ref));
    if (!$rs) {
        return array();
    }
    return mysql_fetch_array($rs);
}
/**
 * Delete a stored reference.
 *
 * @param string $ref reference id
 * @return mixed result of sql_query()
 */
function removeReference($ref) {
    $sql = sprintf("DELETE FROM %s WHERE ref=%s", $this->table_ref, intval($ref));
    return sql_query($sql);
}
/**
 * Compute the next free reference id in the range starting at 500000000.
 *
 * Reads the highest stored ref that is >= 500000000 and returns it plus one.
 * NOTE(review): the branch taken when no such row exists is not visible
 * in this extract - verify the fallback value in the full file.
 */
595 function nextdocid() {
596 $res = sql_query ("select ref from ".$this->table_ref." where ref >= 500000000 order by ref desc limit 0,1");
// errors from the fetch are silenced with @
597 $obj = @ mysql_fetch_object($res);
599 return $obj->ref + 1;
/**
 * Write an entry to the log table, when the plugin option 'enableLogging'
 * is 'yes' and both $log and $content are set.
 *
 * @param string $log     log type / message
 * @param string $content logged content
 * @param string $catcode category id
 */
function logevent($log,$content,$catcode) {
    if ($this->plugin->getOption('enableLogging') != 'yes') {
        return;
    }
    if (!isset($log) || !isset($content)) {
        return;
    }
    sql_query("insert into ".$this->table_log." (log,content,catcode) values ('".mysql_real_escape_string($log)."','".mysql_real_escape_string($content)."','".mysql_real_escape_string($catcode)."')");
}
/**
 * Delete log entries, optionally restricted by category, log type prefix
 * and content keyword.
 *
 * When the request parameter 'amount' is 'cp' (current page only), only the
 * $ipp most recent matching entries are deleted.
 *
 * @param string $filter     category code, or 'all'
 * @param string $filtertype log type prefix, or 'all'
 * @param string $keyword    text the content must contain, or ''
 * @param int    $ipp        items per page
 * @return mixed result of sql_query()
 */
function clearlog($filter = 'all', $filtertype = 'all', $keyword = '', $ipp = 10) {
    $query = 'delete from '.$this->table_log;
    if ($filter != 'all' || $filtertype != 'all') {
        $query .= ' where ';
        if ($filter != 'all') {
            $query .= " catcode = '".mysql_real_escape_string($filter)."'";
        }
        if ($filter != 'all' && $filtertype != 'all') {
            $query .= ' and ';
        }
        if ($filtertype != 'all') {
            $query .= " log like '".mysql_real_escape_string($filtertype)."%'";
        }
        if ($keyword != '') {
            $query .= " and content like '%".mysql_real_escape_string($keyword)."%'";
        }
    } elseif ($keyword != '') {
        $query .= " where content like '%".mysql_real_escape_string($keyword)."%'";
    }
    // isset(): the request parameter is optional
    if (isset($_REQUEST['amount']) && $_REQUEST['amount'] == 'cp') { //only current page?
        // intval(): $ipp goes into the SQL text unquoted
        $query .= ' order by logtime desc limit '.intval($ipp);
    }
    return sql_query($query);
} // function clearlog
/**
 * Fetch one page of log entries, newest first, optionally restricted by
 * category, log type prefix and content keyword.
 *
 * @param int    $startpos   offset of the first entry
 * @param string $filter     category code, or 'all'
 * @param string $filtertype log type prefix, or 'all'
 * @param string $keyword    text the content must contain, or '' (the
 *                           default was added because a required parameter
 *                           after optional ones is deprecated in PHP 8)
 * @param int    $ipp        items per page
 * @return mixed result of sql_query()
 */
function getlogtable($startpos, $filter = 'all',$filtertype = 'all', $keyword = '', $ipp = 10) {
    $query = 'select * from '.$this->table_log;
    if ($filter != 'all' || $filtertype != 'all') {
        $query .= ' where ';
        if ($filter != 'all') {
            $query .= " catcode = '".mysql_real_escape_string($filter)."'";
        }
        if ($filter != 'all' && $filtertype != 'all') {
            $query .= ' and ';
        }
        if ($filtertype != 'all') {
            $query .= " log like '".mysql_real_escape_string($filtertype)."%'";
        }
        if ($keyword != '') {
            $query .= " and content like '%".mysql_real_escape_string($keyword)."%'";
        }
    } elseif ($keyword != '') {
        $query .= " where content like '%".mysql_real_escape_string($keyword)."%'";
    }
    // Both values go into the SQL text unquoted: force them to
    // non-negative integers.
    $query .= ' order by logtime desc limit '.max(0, intval($startpos)).','.max(0, intval($ipp));
    return sql_query($query);
} // function getlogtable
/**
 * Count the log entries matching the optional category, log type prefix
 * and content keyword filters.
 *
 * @param string $filter     category code, or 'all'
 * @param string $filtertype log type prefix, or 'all'
 * @param string $keyword    text the content must contain, or ''
 * @return mixed the 'total' column of the count query
 */
function countlogtable($filter = 'all', $filtertype = 'all', $keyword = '') {
    $byCategory = ($filter != 'all');
    $byType = ($filtertype != 'all');
    $byKeyword = ($keyword != '');

    $query = 'select count(*) as total from '.$this->table_log;
    if ($byCategory || $byType) {
        $query .= ' where ';
        if ($byCategory) {
            $query .= " catcode = '".mysql_real_escape_string($filter)."'";
        }
        if ($byCategory && $byType) {
            $query .= ' and ';
        }
        if ($byType) {
            $query .= " log like '".mysql_real_escape_string($filtertype)."%'";
        }
        if ($byKeyword) {
            $query .= " and content like '%".mysql_real_escape_string($keyword)."%'";
        }
    } elseif ($byKeyword) {
        $query .= " where content like '%".mysql_real_escape_string($keyword)."%'";
    }

    $row = mysql_fetch_array(sql_query($query));
    return $row['total'];
}
/**
 * List the distinct log types, a log type being the first two
 * space-separated words of the 'log' column.
 *
 * @return array list of log types
 */
function getlogtypes() {
    $res = sql_query("select distinct(substring_index(log,' ', 2)) as logtype from ".$this->table_log);

    $logtypes = array();
    for ($row = mysql_fetch_array($res); $row; $row = mysql_fetch_array($res)) {
        $logtypes[] = $row['logtype'];
    }
    return $logtypes;
}
/**
 * Fetch ten references with ref >= 1000000, highest ref first.
 *
 * @param int $startpos offset of the first reference
 * @return mixed result of sql_query()
 */
function getreftable($startpos) {
    // $startpos goes into the SQL text unquoted: force a non-negative integer
    $query = 'select * from '.$this->table_ref.' where ref >= 1000000 order by ref desc limit '.max(0, intval($startpos)).',10';
    return sql_query($query);
}
/**
 * Load one log entry.
 *
 * @param int $id id of the log entry
 * @return mixed the row as returned by mysql_fetch_array()
 */
function getLogevent($id) {
    // intval(): $id goes into the SQL text unquoted
    $query = 'select * from '.$this->table_log.' where id = '.intval($id);
    $res = sql_query($query);
    return mysql_fetch_array($res);
}
/**
 * Delete one log entry.
 *
 * @param int $id id of the log entry
 * @return mixed result of sql_query()
 */
function removeLogevent($id) {
    // intval(): $id goes into the SQL text unquoted
    $query = ' delete from '.$this->table_log.' where id = '.intval($id);
    $res = sql_query($query);
    return $res;
}
/**
 * Count the references with ref >= 1000000.
 *
 * @return mixed the 'total' column of the count query
 */
function countreftable() {
    $res = sql_query('select count(*) as total from '.$this->table_ref.' where ref >= 1000000');
    $row = mysql_fetch_array($res);
    return $row['total'];
}
718 } // class NaiveBayesianStorage