2 /****** BEGIN LICENSE BLOCK *****
3 This file is part of PHP Naive Bayesian Filter.
4 The Initial Developer of the Original Code is
5 Loic d'Anterroches [loic_at_xhtml.net].
6 Portions created by the Initial Developer are Copyright (C) 2003
7 the Initial Developer. All Rights Reserved.
9 PHP Naive Bayesian Filter is free software; you can redistribute it
10 and/or modify it under the terms of the GNU General Public License as
11 published by the Free Software Foundation; either version 2 of
12 the License, or (at your option) any later version.
14 PHP Naive Bayesian Filter is distributed in the hope that it will
15 be useful, but WITHOUT ANY WARRANTY; without even the implied
16 warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
17 See the GNU General Public License for more details.
19 You should have received a copy of the GNU General Public License
20 along with Foobar; if not, write to the Free Software
21 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 Alternatively, the contents of this file may be used under the terms of
24 the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
25 in which case the provisions of the LGPL are applicable instead
27 ***** END LICENSE BLOCK ******/
// Optional local tokenizer: when NP_SPAMBAYES_TOKENIZER is defined (and
// proc_open() is available) _getTokens() pipes the text through this command
// instead of calling the remote API.
//define('NP_SPAMBAYES_TOKENIZER', '/usr/local/bin/mecab -F "%h\t%m\t%f[6]\n" -E ""');

// Endpoint that _getTokens() posts the text to. Guarded so that a value
// defined earlier, or a second include of this file, does not raise a
// "constant already defined" notice.
if (!defined('NP_SPAMBAYES_APIURL')) {
    define('NP_SPAMBAYES_APIURL', 'http://api.jlp.yahoo.co.jp/MAService/V1/parse');
}
33 /** min token length for it to be taken into consideration */
34 var $min_token_length = 2;
35 /** max token length for it to be taken into consideration */
36 var $max_token_length = 40;
37 /** list of token to ignore @see getIgnoreList() */
38 var $ignore_list = array();
/**
 * Build the filter around its owning plugin.
 *
 * Creates the storage object and keeps a reference to the plugin, whose
 * getOption() is read later for the ignore list and the API id.
 *
 * @param object $parent owning plugin object
 */
function __construct(&$parent) {
    // The storage constructor declares its parameter by reference, so the
    // variable is passed plainly here: call-time pass-by-reference
    // (writing &$parent at the call site) is a fatal error since PHP 5.4.
    $this->nbs = new NaiveBayesianStorage($parent);
    $this->parent = &$parent;
}

/**
 * PHP 4 style constructor name, kept so explicit calls keep working.
 *
 * @param object $parent owning plugin object
 */
function NaiveBayesian(&$parent) {
    $this->__construct($parent);
}
/**
 * Categorize a document.
 *
 * Computes, for every category returned by the storage, a score for the
 * document and returns the scores after rescaling.
 *
 * @param string $document text to categorize
 * @return array keys = category ids, values = scores
 */
function categorize($document) {
    $scores = array();
    $categories = $this->nbs->getCategories();

    $tokens = $this->_getTokens($document);
    if (!is_array($tokens)) {
        $tokens = array();
    }

    // wordExists() does not depend on the category, so the known tokens are
    // resolved once instead of once per category.
    $known = array();
    foreach ($tokens as $token => $count) {
        if ($this->nbs->wordExists($token)) {
            $known[$token] = $count;
        }
    }

    // getCategories() may return a plain string entry when the query failed;
    // only array entries carry 'probability' and 'wordcount'.
    $total_words = 0;
    $ncat = 0;
    foreach ($categories as $data) {
        if (is_array($data)) {
            $total_words += $data['wordcount'];
            $ncat++;
        }
    }

    // calculate the score in each category
    foreach ($categories as $category => $data) {
        if (!is_array($data)) {
            continue;
        }
        $scores[$category] = $data['probability'];
        if ($data['wordcount'] <= 0) {
            // A category without any learnt word would divide by zero
            // below; it keeps its prior probability as score.
            continue;
        }
        // small probability for a word not in the category
        // maybe putting 1.0 as a 'no effect' word can also be good
        $small_proba = 1.0 / ($data['wordcount'] * 2);

        foreach ($known as $token => $count) {
            $word = $this->nbs->getWord($token, $category);
            if ($word['wordcount']) {
                $proba = $word['wordcount'] / $data['wordcount'];
            } else {
                $proba = $small_proba;
            }
            // The second pow() scales by the average category size.
            $newval = $scores[$category] * pow($proba, $count) * pow($total_words / $ncat, $count);
            // Ignore a factor that would overflow the score to INF/NAN.
            if (is_finite($newval)) {
                $scores[$category] = $newval;
            }
        }
    }

    return $this->_rescale($scores);
} // function categorize
/**
 * Print an HTML table explaining how a text is scored.
 *
 * Each known token is listed with its word count in the 'ham' and 'spam'
 * categories, followed by a row with the rescaled scores.
 *
 * @param string $content text to explain
 */
function explain($content) {
    $categories = $this->nbs->getCategories(); // ham, spam

    $tokens = $this->_getTokens($content);
    if (!is_array($tokens)) {
        $tokens = array();
    }

    // Resolve the known tokens once; the lookup is category-independent.
    $known = array();
    foreach ($tokens as $token => $count) {
        if ($this->nbs->wordExists($token)) {
            $known[$token] = $count;
        }
    }

    $scores = array();
    $result = array(); // stays an array even when no token is known
    $total_words = 0;
    $ncat = 0;
    foreach ($categories as $data) {
        if (is_array($data)) {
            $total_words += $data['wordcount'];
            $ncat++;
        }
    }

    // calculate the score in each category
    foreach ($categories as $category => $data) {
        if (!is_array($data)) {
            continue;
        }
        $scores[$category] = $data['probability'];
        // An empty category cannot contribute a probability (it would
        // divide by zero); its counts are still listed in the table.
        $usable = ($data['wordcount'] > 0);
        $small_proba = $usable ? 1.0 / ($data['wordcount'] * 2) : 0.0;

        foreach ($known as $token => $count) {
            $word = $this->nbs->getWord($token, $category);
            $result[$word['word']][$category] = $word['wordcount'];
            if (!$usable) {
                continue;
            }
            if ($word['wordcount']) {
                $proba = $word['wordcount'] / $data['wordcount'];
            } else {
                $proba = $small_proba;
            }
            $newval = $scores[$category] * pow($proba, $count) * pow($total_words / $ncat, $count);
            if (is_finite($newval)) {
                $scores[$category] = $newval;
            }
        }
    }

    $scores = $this->_rescale($scores);
    array_multisort($result, SORT_DESC);

    $charset = defined('_CHARSET') ? _CHARSET : 'UTF-8';
    echo '<table>';
    echo '<tr><th>word</th><th>Ham</th><th>Spam</th></tr>';
    foreach ($result as $key => $value) {
        echo '<tr>';
        // Tokens come from the analysed text, so escape them for HTML.
        echo '<td>'.htmlspecialchars((string) $key, ENT_QUOTES, $charset).'</td>';
        echo '<td>'.(isset($value['ham']) ? $value['ham'] : 0).'</td>';
        echo '<td>'.(isset($value['spam']) ? $value['spam'] : 0).'</td>';
        echo '</tr>';
    }
    echo '<tr><td>Rescaled probability:</td><th>'.(isset($scores['ham']) ? $scores['ham'] : '').'</th><th>'.(isset($scores['spam']) ? $scores['spam'] : '').'</th></tr>';
    echo '</table>';
}
/**
 * Train the filter against a document.
 *
 * Adds the document's token counts to the given category and saves the
 * document as a reference. After a set of trainings is done,
 * updateProbabilities() must be run.
 *
 * @see updateProbabilities()
 * @see untrain()
 * @param string $doc_id      document id, must be unique
 * @param string $category_id category the document belongs to
 * @param string $content     content of the document
 * @return bool true
 */
function train($doc_id, $category_id, $content) {
    $tokens = $this->_getTokens($content);
    if (is_array($tokens)) {
        foreach ($tokens as $token => $count) {
            $this->nbs->updateWord($token, $count, $category_id);
        }
    }
    $this->nbs->saveReference($doc_id, $category_id, $content);
    return true;
}
/**
 * Train against a document only when no reference with this id is stored.
 *
 * @param string $doc_id      document id
 * @param string $category_id category the document belongs to
 * @param string $content     content of the document
 */
function trainnew($doc_id, $category_id, $content) {
    $stored = $this->nbs->getReference($doc_id);
    if ($stored) {
        // already a reference: nothing to learn again
        return;
    }
    $this->train($doc_id, $category_id, $content);
}
/**
 * Untrain a document.
 *
 * Removes one document from the references and subtracts its token counts
 * from the category it was stored under. updateProbabilities() must be run
 * afterwards.
 *
 * @see updateProbabilities()
 * @see train()
 * @param string $doc_id document id, must be unique
 * @return bool true
 */
function untrain($doc_id) {
    $ref = $this->nbs->getReference($doc_id);
    // getReference() yields no row for an unknown id; there is then nothing
    // to tokenize or subtract.
    if (is_array($ref) && isset($ref['content'])) {
        $tokens = $this->_getTokens($ref['content']);
        if (is_array($tokens)) {
            foreach ($tokens as $token => $count) {
                $this->nbs->removeWord($token, $count, $ref['catcode']);
            }
        }
    }
    $this->nbs->removeReference($doc_id);
    return true;
} // function untrain
/**
 * Rescale the results between 0 and 1.
 *
 * Each score is shifted by the largest score (never less than 0),
 * exponentiated, and the resulting vector is divided by its Euclidean norm.
 *
 * @author Ken Williams, ken@mathforum.org
 * @param array $scores keys => category, values => scores
 * @return array normalized scores (keys => category, values => scores)
 */
function _rescale($scores) {
    // Scale everything back to a reasonable area in
    // logspace (near zero), un-loggify, and normalize
    $total = 0.0;
    $max = 0.0;

    foreach ($scores as $score) {
        if ($score >= $max) {
            $max = $score;
        }
    }

    foreach ($scores as $cat => $score) {
        $scores[$cat] = (float) exp($score - $max);
        $total += (float) pow($scores[$cat], 2);
    }
    $total = (float) sqrt($total);

    // Empty input (or complete underflow) leaves nothing to normalize and
    // would otherwise divide by zero.
    if ($total == 0.0) {
        return $scores;
    }

    foreach ($scores as $cat => $score) {
        $scores[$cat] = (float) $score / $total;
    }
    return $scores;
} // function _rescale
/**
 * Refresh the per-category probabilities and word counts.
 *
 * Has to be run after a set of trainings. The work is pure database
 * manipulation, so it is delegated to the storage object.
 *
 * @return mixed whatever the storage's updateProbabilities() returns
 */
function updateProbabilities() {
    $storage = $this->nbs;
    $outcome = $storage->updateProbabilities();
    return $outcome;
} // function updateProbabilities
/**
 * Get the list of tokens to ignore.
 *
 * Reads the plugin option 'ignorelist' and splits it on commas and spaces.
 *
 * @return array ignore list
 */
function getIgnoreList() {
    $option = $this->parent->getOption('ignorelist');
    // commas and blanks both act as separators
    $spaced = str_replace(',', ' ', $option);
    return explode(' ', $spaced);
}
/**
 * Get the tokens from a string.
 *
 * The text is stripped of tags and split into words, either by the command
 * in NP_SPAMBAYES_TOKENIZER (when defined and proc_open() exists) or by the
 * service at NP_SPAMBAYES_APIURL. Words that are empty, too short or too
 * long, purely numeric, contain punctuation, or are on the ignore list are
 * dropped.
 *
 * @author James Seng. [http://james.seng.cc/] (based on his perl version)
 * @param string $string the string to get the tokens from
 * @return array keys = tokens, values = number of occurrences
 */
function _getTokens($string) {
    $tokens = array();

    //$string = $this->_cleanString($string);
    // Load the ignore list only while it is still empty.
    if (0 >= count($this->ignore_list)) {
        $this->ignore_list = $this->getIgnoreList();
    }

    $string = strip_tags($string);

    // function_exists() takes the function name as a string.
    if (defined('NP_SPAMBAYES_TOKENIZER') && function_exists('proc_open')) {
        $rawtokens = $this->_tokenizeWithCommand($string);
    } else {
        $rawtokens = $this->_tokenizeWithApi($string);
    }

    // remove some tokens
    $punctuation = '/['.preg_quote('"\':;/\_[](){}!#%&$=+*|~?<>,.-','/').']+/';
    foreach ($rawtokens as $token) {
        if ('' == $token) {
            continue;
        }
        $length = mb_strlen($token);
        if ($length < $this->min_token_length || $length > $this->max_token_length) {
            continue;
        }
        if (preg_match('/^[0-9]+$/', $token) || preg_match($punctuation, $token)) {
            continue;
        }
        if (in_array($token, $this->ignore_list, true)) {
            continue;
        }
        if (isset($tokens[$token])) {
            $tokens[$token]++;
        } else {
            $tokens[$token] = 1;
        }
    }
    return $tokens;
} // function _getTokens

/**
 * Tokenize through the external command in NP_SPAMBAYES_TOKENIZER.
 *
 * The command is expected to print one "id<TAB>surface<TAB>base form" line
 * per word; only ids 10-12 and 31-67 are kept.
 *
 * @param string $string tag-free text
 * @return array list of words (base form when present, else surface form)
 */
function _tokenizeWithCommand($string) {
    $words = array();

    $string = preg_replace('/\r|\n/', '', $string);
    $string = strtr($string, array_flip(get_html_translation_table(HTML_SPECIALCHARS)));
    $string = strip_tags($string);

    $dspec = array(
        0 => array("pipe", "r"),
        1 => array("pipe", "w"),
        2 => array("file", "/dev/null", "w")
    );
    $process = proc_open(NP_SPAMBAYES_TOKENIZER, $dspec, $pipes);
    if (!is_resource($process)) {
        return $words;
    }

    stream_set_blocking($pipes[0], FALSE);
    stream_set_blocking($pipes[1], FALSE);
    fwrite($pipes[0], $string . "\n");
    // Closing stdin signals end of input so the command can finish.
    fclose($pipes[0]);

    while (!feof($pipes[1])) {
        $line = fgets($pipes[1], 32768);
        if ($line === false) {
            // non-blocking read with nothing available yet
            continue;
        }
        // Pad short lines so all three fields are always defined.
        $fields = array_pad(explode("\t", trim($line), 3), 3, '');
        $id = $fields[0];
        $origStr = $fields[1];
        $regStr = $fields[2];
        if ( ( 31 <= $id && $id <= 67 ) || ( 10 <= $id && $id <= 12 ) ) {
            $words[] = trim($regStr ? $regStr : $origStr);
        }
    }
    fclose($pipes[1]);
    proc_close($process);

    return $words;
}

/**
 * Tokenize through the HTTP service at NP_SPAMBAYES_APIURL.
 *
 * The text is sent as UTF-8; the words found in the response are converted
 * back to _CHARSET when the site does not use UTF-8.
 *
 * @param string $string tag-free text in the site's charset
 * @return array list of base-form words, empty when the request failed
 */
function _tokenizeWithApi($string) {
    if ( _CHARSET != 'UTF-8' ) {
        $string = mb_convert_encoding($string, 'UTF-8', _CHARSET);
    }

    $postData = array();
    $postData['appid'] = $this->parent->getOption('appid');
    $postData['results'] = 'ma';
    $postData['filter'] = '1|2|3|4|5|7|8|9|10';
    $postData['response'] = 'baseform';
    $postData['sentence'] = $string;

    $data = $this->_http(NP_SPAMBAYES_APIURL, 'POST', '', $postData);
    // _http() returns null when the connection failed.
    if ($data === null || $data === '' || $data === false) {
        return array();
    }

    $p = new NP_SpamBayes_XMLParser();
    $words = $p->parse($data);
    if (method_exists($p, 'free')) {
        $p->free();
    }
    if (!is_array($words)) {
        return array();
    }

    if ( _CHARSET != 'UTF-8' ) {
        foreach ($words as $index => $word) {
            $words[$index] = mb_convert_encoding($word, _CHARSET, 'UTF-8');
        }
    }
    return $words;
}
/**
 * Send a HTTP/1.0 request over a plain socket and return the response body.
 *
 * @param string $url     target URL; port defaults to 80
 * @param string $method  request method, "POST" sends $post url-encoded
 * @param string $headers extra header lines, each terminated by "\r\n"
 * @param array  $post    name => value pairs for a POST body
 * @return string|null response text after the header block, or null when
 *                     the connection failed or no header block was found
 */
function _http($url, $method = "GET", $headers = "", $post = array ("")) {
    $URL = parse_url($url);

    $URL['query'] = isset($URL['query']) ? "?".$URL['query'] : "";
    if (!isset($URL['port'])) {
        $URL['port'] = 80;
    }
    if (!isset($URL['path'])) {
        // a bare "http://host" has no path component
        $URL['path'] = "/";
    }

    $request = $method." ".$URL['path'].$URL['query']." HTTP/1.0\r\n";

    $request .= "Host: ".$URL['host']."\r\n";
    $request .= "User-Agent: NP_SpamBayes\r\n";

    if (isset($URL['user']) && isset($URL['pass'])) {
        $request .= "Authorization: Basic ".base64_encode($URL['user'].":".$URL['pass'])."\r\n";
    }

    $request .= $headers;

    if (strtoupper($method) == "POST") {
        $pairs = array();
        foreach ($post as $name => $value) {
            $pairs[] = $name."=".urlencode($value);
        }
        $postdata = implode("&", $pairs);
        $request .= "Content-Type: application/x-www-form-urlencoded\r\n";
        $request .= "Content-Length: ".strlen($postdata)."\r\n";
        $request .= "\r\n";
        $request .= $postdata;
    } else {
        $request .= "\r\n";
    }

    $fp = fsockopen($URL['host'], $URL['port'], $errno, $errstr, 20);

    if (!$fp) {
        $host = $URL['host'];
        $port = $URL['port'];
        ACTIONLOG :: add(WARNING, 'NP_SpamBayes' .':'."[$errno]($host:$port) $errstr");
        return null;
    }

    stream_set_timeout($fp, 20);
    fputs($fp, $request);
    $response = "";
    while (!feof($fp)) {
        $chunk = fgets($fp, 4096);
        if ($chunk === false) {
            // read error or timeout: stop instead of spinning
            break;
        }
        $response .= $chunk;
    }
    fclose($fp);

    // Headers and body are separated by the first empty line.
    $DATA = explode("\r\n\r\n", $response, 2);
    return isset($DATA[1]) ? $DATA[1] : null;
}
/**
 * Strip the diacritics from a string and lowercase it.
 *
 * Works byte-wise on single-byte accented characters (codes 192-255 as
 * listed below), mapping each one to its unaccented ASCII letter.
 *
 * @author Antoine Bajolet [phpdig_at_toiletoine.net]
 * @author SPIP [http://uzine.net/spip/]
 * @param string $string string with accents
 * @return string clean string
 */
function _cleanString($string) {
    $codes = array(
        192, 193, 194, 195, 196, 197, // A
        224, 225, 226, 227, 228, 229, // a
        210, 211, 212, 213, 214, 216, // O
        242, 243, 244, 245, 246, 248, // o
        200, 201, 202, 203,           // E
        232, 233, 234, 235,           // e
        199, 231,                     // C c
        204, 205, 206, 207,           // I
        236, 237, 238, 239,           // i
        217, 218, 219, 220,           // U
        249, 250, 251, 252,           // u
        255, 209, 241                 // y N n
    );
    $accented = implode('', array_map('chr', $codes));
    $plain = 'AAAAAAaaaaaaOOOOOOooooooEEEEeeeeCcIIIIiiiiUUUUuuuuyNn';

    $unaccented = strtr($string, $accented, $plain);
    return strtolower($unaccented);
}
415 } // class NaiveBaysian
/**
 * Collects the text of every BASEFORM element of an XML document.
 */
class NP_SpamBayes_XMLParser {
    /** expat parser handle */
    var $parser;
    /** upper-cased name of the element whose text is collected */
    var $target;
    /** true while the parser is inside a target element */
    var $inTarget;
    /** words collected by the current parse() call */
    var $words;
    /** text gathered so far for the target element being read */
    var $buffer;

    function __construct(){
        $this->parser = xml_parser_create();
        // Object callables bind the handlers to this instance.
        xml_set_element_handler($this->parser, array($this, '_open'), array($this, '_close'));
        xml_set_character_data_handler($this->parser, array($this, '_cdata'));

        // Element names are reported upper-cased (case folding is the
        // parser's default).
        $this->target = 'BASEFORM';
        $this->inTarget = false;
        $this->words = array();
        $this->buffer = '';
    }

    /** PHP 4 style constructor name, kept so explicit calls keep working. */
    function NP_SpamBayes_XMLParser(){
        $this->__construct();
    }

    /**
     * Parse a complete XML document.
     *
     * @param string $data XML text
     * @return array trimmed, non-empty text of each target element, in
     *               document order
     */
    function parse($data){
        $this->words = array();
        $this->buffer = '';
        $this->inTarget = false;
        // $data is the whole document, so mark it as the final chunk.
        xml_parse($this->parser, $data, true);
        return $this->words;
    }

    /** Release the parser. */
    function free(){
        // Only the resource-based handle of older PHP versions needs an
        // explicit free.
        if (is_resource($this->parser)) {
            xml_parser_free($this->parser);
        }
        $this->parser = null;
    }

    function _open($parser, $name, $attribute){
        if( $name == $this->target ){
            $this->inTarget = true;
            $this->buffer = '';
        }
    }

    function _close($parser, $name){
        if( $name == $this->target ){
            // The element's text may have arrived in several pieces, so it
            // is stored as one word only once the element is complete.
            $word = trim($this->buffer);
            if( $this->inTarget && $word !== '' ){
                $this->words[] = $word;
            }
            $this->inTarget = false;
            $this->buffer = '';
        }
    }

    function _cdata($parser, $data){
        if( $this->inTarget ){
            $this->buffer .= $data;
        }
    }
}
454 /** Access to the storage of the data for the filter.
456 To avoid dependency with respect to any database, this class handle all the
457 access to the data storage. You can provide your own class as long as
458 all the methods are available. The current one rely on a MySQL database.
461 - array getCategories()
462 - bool wordExists(string $word)
463 - array getWord(string $word, string $categoryid)
466 class NaiveBayesianStorage {
/**
 * Resolve the table names and remember the owning plugin.
 *
 * @param object $plugin plugin object; its getOption() is read by logevent()
 */
function __construct(&$plugin) {
    $this->table_cat = sql_table('plug_sb_cat'); // categories
    $this->table_wf = sql_table('plug_sb_wf'); // word frequencies
    $this->table_ref = sql_table('plug_sb_ref'); // references
    $this->table_log = sql_table('plug_sb_log'); // logging
    $this->plugin = &$plugin;
}

/**
 * PHP 4 style constructor name, kept so explicit calls keep working.
 *
 * @param object $plugin plugin object
 */
function NaiveBayesianStorage(&$plugin) {
    $this->__construct($plugin);
}
/**
 * Get the list of categories with basic data.
 *
 * @return array key = category code, value = array('probability' => ...,
 *               'wordcount' => ...); when the query fails the array holds
 *               the single entry 0 => 'No categories found'
 */
function getCategories() {
    $rs = sql_query('SELECT * FROM '.$this->table_cat);
    if (!$rs) {
        return array(0 => 'No categories found');
    }

    $found = array();
    while ($row = mysql_fetch_array($rs)) {
        $code = $row['catcode'];
        $found[$code] = array(
            'probability' => $row['probability'],
            'wordcount' => $row['wordcount']
        );
    }
    return $found;
}
/**
 * See if the word is an already learnt word (in any category).
 *
 * @param string $word word to look up
 * @return bool true when at least one row exists for the word
 */
function wordExists($word) {
    $sql = "SELECT count(*) as amount FROM ".$this->table_wf." WHERE word='". mysql_real_escape_string($word)."'";
    $row = mysql_fetch_object(sql_query($sql));
    return !($row->amount == 0);
}
/**
 * Get the details of a word in a category.
 *
 * @param string $word    word to look up
 * @param string $catcode category id
 * @return array ('wordcount' => ..., 'catcode' => ..., 'word' => ...);
 *               wordcount is 0 when the word is unknown in the category
 */
function getWord($word, $catcode){
    $sql = "SELECT * FROM ".$this->table_wf." WHERE word='".mysql_real_escape_string($word)."' AND catcode='".mysql_real_escape_string($catcode)."'";
    $row = mysql_fetch_object(sql_query($sql));

    if (!$row) {
        // unknown pair: echo the request back with a zero count
        return array('wordcount' => 0, 'catcode' => $catcode, 'word' => $word);
    }
    return array(
        'wordcount' => $row->wordcount,
        'catcode' => $row->catcode,
        'word' => $row->word
    );
}
/**
 * Update a word in a category.
 *
 * A word that is new in the category is inserted; otherwise its count is
 * increased.
 *
 * @param string $word      word
 * @param int    $wordcount number of occurrences to add
 * @param string $catcode   category id
 * @return mixed result of sql_query()
 */
function updateWord($word, $wordcount, $catcode) {
    $current = $this->getWord($word, $catcode);
    $known = (0 != $current['wordcount']);

    if ($known) {
        $sql = "UPDATE ".$this->table_wf." SET wordcount = wordcount +".(int)$wordcount." WHERE catcode = '".mysql_real_escape_string($catcode)."' AND word = '".mysql_real_escape_string($word)."'";
    } else {
        $sql = "INSERT INTO ".$this->table_wf." (word, catcode, wordcount) VALUES ('".mysql_real_escape_string($word)."','".mysql_real_escape_string($catcode)."','".mysql_real_escape_string((int)$wordcount)."')";
    }
    return sql_query($sql);
} // function updateWord
/**
 * Remove occurrences of a word from a category.
 *
 * The row is deleted when the stored count would drop to zero or below;
 * otherwise the count is decreased.
 *
 * @param string $word      word
 * @param int    $wordcount number of occurrences to remove
 * @param string $catcode   category id
 * @return mixed result of sql_query()
 */
function removeWord($word, $wordcount, $catcode) {
    $current = $this->getWord($word, $catcode);
    $stored = $current['wordcount'];
    $usedUp = (0 != $stored) && (($stored - $wordcount) <= 0);

    if ($usedUp) {
        return sql_query("DELETE FROM ".$this->table_wf." WHERE word='".mysql_real_escape_string($word)."' AND catcode ='".mysql_real_escape_string($catcode)."'");
    }
    return sql_query("UPDATE ".$this->table_wf." SET wordcount = wordcount - ".(int)$wordcount." WHERE catcode = '".mysql_real_escape_string($catcode)."' AND word = '".mysql_real_escape_string($word)."'");
} // function removeWord
/**
 * Update the probabilities of the categories and their word counts.
 *
 * Must be run after a set of trainings. Every category gets the sum of its
 * word counts and that sum's share of all counted words; categories
 * without any word are set to 0.
 *
 * @return bool true
 */
function updateProbabilities() {
    // first collect the word count of each category
    $rs = sql_query("SELECT catcode, SUM(wordcount) AS total FROM ".$this->table_wf." WHERE 1 GROUP BY catcode");
    $totals = array();
    $total_words = 0;
    while ($obj = mysql_fetch_object($rs)) {
        $totals[] = array($obj->catcode, (int)$obj->total);
        $total_words += $obj->total;
    }

    if ($total_words == 0) {
        sql_query("UPDATE ".$this->table_cat." SET wordcount = 0, probability = 0 WHERE 1");
        return true;
    }

    $seen = array();
    foreach ($totals as $entry) {
        $catcode = "'".mysql_real_escape_string($entry[0])."'";
        // %F is locale independent: the decimal separator is always '.'
        $proba = sprintf('%.14F', $entry[1] / $total_words);
        sql_query("UPDATE ".$this->table_cat." SET wordcount=".$entry[1].", probability=".$proba." WHERE catcode = ".$catcode);
        $seen[] = $catcode;
    }

    // A category that no longer owns any word does not appear in the sums
    // above and would otherwise keep its previous figures.
    sql_query("UPDATE ".$this->table_cat." SET wordcount = 0, probability = 0 WHERE catcode NOT IN (".implode(',', $seen).")");

    return true;
} // updateProbabilities
/**
 * Save a reference in the database.
 *
 * @param string $ref     reference id, must be unique (stored as integer)
 * @param string $catcode category id
 * @param string $content content of the reference
 * @return mixed result of sql_query()
 */
function saveReference($ref, $catcode, $content) {
    $sql = "INSERT INTO ".$this->table_ref." (ref, catcode, content) VALUES (".intval($ref).", '".mysql_real_escape_string($catcode)."','".mysql_real_escape_string($content)."')";
    return sql_query($sql);
} // function saveReference
/**
 * Get a reference from the database.
 *
 * @param string $ref reference id
 * @return array|false the row ( catcode => ...., content => ....), false
 *                     when no row matches, empty array when the query failed
 */
function getReference($ref) {
    $rs = sql_query("SELECT * FROM ".$this->table_ref." WHERE ref=".intval($ref));
    if (!$rs) {
        return array();
    }
    return mysql_fetch_array($rs);
}
/**
 * Remove a reference from the database.
 *
 * @param string $ref reference id
 * @return mixed result of sql_query()
 */
function removeReference($ref) {
    $sql = "DELETE FROM ".$this->table_ref." WHERE ref=".intval($ref);
    return sql_query($sql);
}
/**
 * Next free reference id in the range starting at 500000000.
 *
 * @return int highest stored ref of that range plus one, or 500000000 when
 *             the range is still empty
 */
function nextdocid() {
    $res = sql_query ("select ref from ".$this->table_ref." where ref >= 500000000 order by ref desc limit 0,1");
    $latest = @ mysql_fetch_object($res);
    return $latest ? $latest->ref + 1 : 500000000;
}
/**
 * Write an entry to the log table when the plugin option 'enableLogging'
 * is 'yes'.
 *
 * @param string $log     log line
 * @param string $content content the event relates to
 * @param string $catcode category id
 */
function logevent($log,$content,$catcode) {
    if ($this->plugin->getOption('enableLogging') != 'yes') {
        return;
    }
    if (!isset($log) || !isset($content)) {
        return;
    }
    sql_query("insert into ".$this->table_log." (log,content,catcode) values ('".mysql_real_escape_string($log)."','".mysql_real_escape_string($content)."','".mysql_real_escape_string($catcode)."')");
}
/**
 * Build the WHERE clause shared by clearlog(), getlogtable() and
 * countlogtable().
 *
 * @param string $filter     category code, or 'all' for no restriction
 * @param string $filtertype prefix the log column must start with, or 'all'
 * @param string $keyword    text the content column must contain, or ''
 * @return string ' where ...' with a leading blank, or '' without filters
 */
function _logFilterClause($filter, $filtertype, $keyword) {
    $conditions = array();
    if ($filter != 'all') {
        $conditions[] = "catcode = '".mysql_real_escape_string($filter)."'";
    }
    if ($filtertype != 'all') {
        $conditions[] = "log like '".mysql_real_escape_string($filtertype)."%'";
    }
    if ($keyword != '') {
        $conditions[] = "content like '%".mysql_real_escape_string($keyword)."%'";
    }
    if (count($conditions) == 0) {
        return '';
    }
    return ' where '.implode(' and ', $conditions);
}

/**
 * Delete log entries matching the filters.
 *
 * When the request parameter 'amount' is 'cp' only the newest $ipp matching
 * entries (the current page) are deleted.
 *
 * @param string $filter     category code, or 'all'
 * @param string $filtertype log prefix, or 'all'
 * @param string $keyword    content keyword, or ''
 * @param int    $ipp        items per page
 * @return mixed result of sql_query()
 */
function clearlog($filter = 'all', $filtertype = 'all', $keyword = '', $ipp = 10) {
    $query = 'delete from '.$this->table_log.$this->_logFilterClause($filter, $filtertype, $keyword);
    if (isset($_REQUEST['amount']) && $_REQUEST['amount'] == 'cp') { //only current page?
        $query .= ' order by logtime desc limit '.intval($ipp);
    }
    return sql_query($query);
} // function clearlog

/**
 * Fetch one page of log entries, newest first.
 *
 * @param int    $startpos   offset of the first row
 * @param string $filter     category code, or 'all'
 * @param string $filtertype log prefix, or 'all'
 * @param string $keyword    content keyword, or ''
 * @param int    $ipp        items per page
 * @return mixed result of sql_query()
 */
function getlogtable($startpos, $filter = 'all', $filtertype = 'all', $keyword = '', $ipp = 10) {
    $query = 'select * from '.$this->table_log.$this->_logFilterClause($filter, $filtertype, $keyword);
    // Offset and page size are forced to integers before they reach the SQL.
    $query .= ' order by logtime desc limit '.max(0, intval($startpos)).','.intval($ipp);
    return sql_query($query);
} // function getlogtable

/**
 * Count the log entries matching the filters.
 *
 * @param string $filter     category code, or 'all'
 * @param string $filtertype log prefix, or 'all'
 * @param string $keyword    content keyword, or ''
 * @return mixed the count as returned by the database
 */
function countlogtable($filter = 'all', $filtertype = 'all', $keyword = '') {
    $query = 'select count(*) as total from '.$this->table_log.$this->_logFilterClause($filter, $filtertype, $keyword);
    $res = sql_query($query);
    $arr = mysql_fetch_array($res);
    return $arr['total'];
}
/**
 * List the distinct log types, a type being the first two space-separated
 * words of the log column.
 *
 * @return array list of log types
 */
function getlogtypes() {
    $logtypes = array();
    $res = sql_query("select distinct(substring_index(log,' ', 2)) as logtype from ".$this->table_log);
    while ($row = mysql_fetch_array($res)) {
        array_push($logtypes, $row['logtype']);
    }
    return $logtypes;
}
/**
 * Fetch a page of ten references with ref >= 1000000, highest ref first.
 *
 * @param int $startpos offset of the first row
 * @return mixed result of sql_query()
 */
function getreftable($startpos) {
    // The offset is forced to a non-negative integer before it reaches SQL.
    $offset = max(0, intval($startpos));
    $query = 'select * from '.$this->table_ref.' where ref >= 1000000 order by ref desc limit '.$offset.',10';
    return sql_query($query);
}
/**
 * Fetch one log entry.
 *
 * @param int $id id of the log entry
 * @return array|false the row, or false when no row matches
 */
function getLogevent($id) {
    // The id is forced to an integer before it reaches the SQL.
    $query = 'select * from '.$this->table_log.' where id = '.intval($id);
    $res = sql_query($query);
    return mysql_fetch_array($res);
}
/**
 * Delete one log entry.
 *
 * @param int $id id of the log entry
 * @return mixed result of sql_query()
 */
function removeLogevent($id) {
    // The id is forced to an integer before it reaches the SQL.
    $query = ' delete from '.$this->table_log.' where id = '.intval($id);
    return sql_query($query);
}
/**
 * Count the references with ref >= 1000000.
 *
 * @return mixed the count as returned by the database
 */
function countreftable() {
    $res = sql_query('select count(*) as total from '.$this->table_ref.' where ref >= 1000000');
    $row = mysql_fetch_array($res);
    return $row['total'];
}
736 } // class NaiveBayesianStorage