OSDN Git Service

・スレタイの特定ワードを非表示にする機能に「©bbspink.com」も追加
[gikonavigoeson/gikonavi.git] / GikoBayesian.pas
1 unit GikoBayesian;
2
3 {!
4 \file           GikoBayesian.pas
5 \brief  \83x\83C\83W\83A\83\93\83t\83B\83\8b\83^
6
7
8 $Id: GikoBayesian.pas,v 1.22 2009/01/31 15:47:15 h677 Exp $
9 }
10
11 //! \95½\89¼\96¼\82ð\8e«\8f\91\82É\8aÜ\82ß\82È\82¢
12 {$DEFINE GIKO_BAYESIAN_NO_HIRAGANA_DIC}
13
14 interface
15
16 //==================================================
17 uses
18 //==================================================
19         Classes;
20
21 //==================================================
22 type
23 //==================================================
24
25         {!***********************************************************
26         \brief \92P\8cê\83v\83\8d\83p\83e\83B
27         ************************************************************}
28         TWordInfo       = class( TObject )
29         private
30                 FNormalWord                     :       Integer;        //!< \92Ê\8fí\82Ì\92P\8cê\82Æ\82µ\82Ä\93o\8fê\82µ\82½\89ñ\90\94
31                 FImportantWord  : Integer;      //!< \92\8d\96Ú\92P\8cê\82Æ\82µ\82Ä\93o\8fê\82µ\82½\89ñ\90\94
32                 FNormalText                     : Integer;      //!< \92Ê\8fí\82Ì\92P\8cê\82Æ\82µ\82Ä\8aÜ\82Ü\82ê\82Ä\82¢\82½\95\8fÍ\82Ì\90\94
33                 FImportantText  : Integer;      //!< \92\8d\96Ú\92P\8cê\82Æ\82µ\82Ä\8aÜ\82Ü\82ê\82Ä\82¢\82½\95\8fÍ\82Ì\90\94
34
35         public
36                 property NormalWord                     : Integer       read FNormalWord write FNormalWord;
37                 property ImportantWord  : Integer       read FImportantWord write FImportantWord;
38                 property NormalText                     : Integer       read FNormalText write FNormalText;
39                 property ImportantText  : Integer       read FImportantText write FImportantText;
40         end;
41
42         {!***********************************************************
43         \brief \89ð\90Í\8dÏ\82Ý\92P\8cê\83v\83\8d\83p\83e\83B
44         ************************************************************}
45         TWordCountInfo  = class( TObject )
46         private
47                 FWordCount      :       Integer;        //!< \92P\8cê\90\94
48
49         public
50                 property WordCount      : Integer       read FWordCount write FWordCount;
51         end;
52
53         {!***********************************************************
54         \brief \89ð\90Í\8dÏ\82Ý\92P\8cê\83\8a\83X\83g
55         ************************************************************}
56 //      TWordCount      = class( THashedStringList )    // \8c\83\92x
57         TWordCount      = class( TStringList )
58         public
59                 constructor Create;
60                 destructor Destroy; override;
61         end;
62
63         {!***********************************************************
64         \brief \83t\83B\83\8b\83^\83A\83\8b\83S\83\8a\83Y\83\80
65         ************************************************************}
66         TGikoBayesianAlgorithm =
67                 (gbaPaulGraham, gbaGaryRobinson, gbaGaryRobinsonFisher);
68
69         {!***********************************************************
70         \brief \83x\83C\83W\83A\83\93\83t\83B\83\8b\83^
71         ************************************************************}
72 //      TGikoBayesian = class( THashedStringList )      // \8c\83\92x
73         TGikoBayesian = class( TStringList )
74         private
75                 FFilePath       : string;       //!< \93Ç\82Ý\8d\9e\82ñ\82¾\83t\83@\83C\83\8b\83p\83X
76                 function GetObject( const name : string ) : TWordInfo;
77                 procedure SetObject( const name : string; value : TWordInfo );
78
79         public
80                 constructor Create;
81                 destructor Destroy; override;
82
83                 //! \83t\83@\83C\83\8b\82©\82ç\8aw\8fK\97\9a\97ð\82ð\93Ç\82Ý\8fo\82µ\82Ü\82·
84                 procedure LoadFromFile( const filePath : string );
85
86                 //! \83t\83@\83C\83\8b\82É\8aw\8fK\97\9a\97ð\82ð\95Û\91\82µ\82Ü\82·
87                 procedure SaveToFile( const filePath : string );
88
89                 //! \83t\83@\83C\83\8b\82É\8aw\8fK\97\9a\97ð\82ð\95Û\91\82µ\82Ü\82·
90                 procedure Save;
91
92                 //! \92P\8cê\82É\91Î\82·\82é\8fî\95ñ\82ð\8eæ\93¾\82µ\82Ü\82·
93                 property Objects[ const name : string ] : TWordInfo
94                         read GetObject write SetObject; default;
95
96                 //! \95\8fÍ\82É\8aÜ\82Ü\82ê\82é\92P\8cê\82ð\83J\83E\83\93\83g\82µ\82Ü\82·
97                 procedure CountWord(
98                         const text      : string;
99                         wordCount               : TWordCount );
100
101                 {!
102                 \brief  Paul Graham \96@\82É\8aî\82Ã\82¢\82Ä\95\8fÍ\82Ì\92\8d\96Ú\93x\82ð\8c\88\92è\82µ\82Ü\82·
103                 \return \95\8fÍ\82Ì\92\8d\96Ú\93x (\92\8d\96Ú\82É\92l\82µ\82È\82¢ 0.0\81`1.0 \92\8d\96Ú\82·\82×\82«)
104                 }
105                 function CalcPaulGraham( wordCount : TWordCount ) : Extended;
106
107                 {!
108                 \brief  GaryRobinson \96@\82É\8aî\82Ã\82¢\82Ä\95\8fÍ\82Ì\92\8d\96Ú\93x\82ð\8c\88\92è\82µ\82Ü\82·
109                 \return \95\8fÍ\82Ì\92\8d\96Ú\93x (\92\8d\96Ú\82É\92l\82µ\82È\82¢ 0.0\81`1.0 \92\8d\96Ú\82·\82×\82«)
110                 }
111                 function CalcGaryRobinson( wordCount : TWordCount ) : Extended;
112
113                 {!
114                 \brief  GaryRobinson-Fisher \96@\82É\8aî\82Ã\82¢\82Ä\95\8fÍ\82Ì\92\8d\96Ú\93x\82ð\8c\88\92è\82µ\82Ü\82·
115                 \return \95\8fÍ\82Ì\92\8d\96Ú\93x (\92\8d\96Ú\82É\92l\82µ\82È\82¢ 0.0\81`1.0 \92\8d\96Ú\82·\82×\82«)
116                 }
117                 function CalcGaryRobinsonFisher( wordCount : TWordCount ) : Extended;
118
119                 {!
120                 \brief  \95\8fÍ\82ð\89ð\90Í
121                 \param  text                                    \89ð\90Í\82·\82é\95\8fÍ
122                 \param  wordCount                       \89ð\90Í\82³\82ê\82½\92P\8cê\83\8a\83X\83g\82ª\95Ô\82é
123                 \param  algorithm                       \92\8d\96Ú\93x\82Ì\8c\88\92è\82É\97p\82¢\82é\83A\83\8b\83S\83\8a\83Y\83\80\82ð\8ew\92è\82µ\82Ü\82·
124                 \return \95\8fÍ\82Ì\92\8d\96Ú\93x (\92\8d\96Ú\82É\92l\82µ\82È\82¢ 0.0\81`1.0 \92\8d\96Ú\82·\82×\82«)
125
126                 CountWord \82Æ Calcxxxxx \82ð\82Ü\82Æ\82ß\82Ä\8eÀ\8ds\82·\82é\82¾\82¯\82Å\82·\81B
127                 }
128                 function Parse(
129                         const text                              : string;
130                         wordCount                                       : TWordCount;
131                         algorithm                                       : TGikoBayesianAlgorithm = gbaGaryRobinsonFisher
132                 ) : Extended;
133
134                 {!
135                 \brief  \8aw\8fK\82·\82é
136                 \param  wordCount               Parse \82Å\89ð\90Í\82³\82ê\82½\92P\8cê\83\8a\83X\83g
137                 \param  isImportant \92\8d\96Ú\82·\82×\82«\95\8fÍ\82Æ\82µ\82Ä\8ao\82¦\82é\82È\82ç True
138                 }
139                 procedure Learn(
140                         wordCount                : TWordCount;
141                         isImportant      : Boolean );
142
143                 {!
144                 \brief          \8aw\8fK\8c\8b\89Ê\82ð\96Y\82ê\82é
145                 \param          wordCount               Parse \82Å\89ð\90Í\82³\82ê\82½\92P\8cê\83\8a\83X\83g
146                 \param          isImportant     \92\8d\96Ú\82·\82×\82«\95\8fÍ\82Æ\82µ\82Ä\8ao\82¦\82ç\82ê\82Ä\82¢\82½\82È\82ç True
147                 \warning        \8aw\8fK\8dÏ\82Ý\82Ì\95\8fÍ\82©\82Ç\82¤\82©\82Í\8am\94F\8fo\97\88\82Ü\82¹\82ñ\81B<br>
148                                                         Learn \82µ\82Ä\82¢\82È\82¢\95\8fÍ\82â isImportant \82ª\8aÔ\88á\82Á\82Ä\82¢\82é\95\8fÍ\82ð
149                                                         Forget \82·\82é\82Æ\83f\81[\83^\83x\81[\83X\82ª\94j\91¹\82µ\82Ü\82·\81B<br>
150                                                         \8aw\8fK\8dÏ\82Ý\82©\82Ç\82¤\82©\82Í\93Æ\8e©\82É\8aÇ\97\9d\82µ\82Ä\82­\82¾\82³\82¢\81B
151
152                 \91S\82Ä\82Ì\8aw\8fK\8c\8b\89Ê\82ð\83N\83\8a\83A\82·\82é\82í\82¯\82Å\82Í\82 \82è\82Ü\82¹\82ñ\81B<br>
153                 wordCount \82ð\93¾\82½\95\8fÍ (Parse \82Ì text \88ø\90\94\82Ì\8aw\8fK\8c\8b\89Ê\82Ì\82Ý\83N\83\8a\83A\82µ\82Ü\82·\81B<br><br>
154
155                 \8eå\82É\92\8d\96Ú\95\8fÍ\82Æ\94ñ\92\8d\96Ú\95\8fÍ\82ð\90Ø\82è\91Ö\82¦\82é\82½\82ß\82É Forget -> Learn \82Ì\8f\87\82Å\8eg\97p\82µ\82Ü\82·\81B
156                 }
157                 procedure       Forget(
158                         wordCount               : TWordCount;
159                         isImportant     : Boolean );
160         end;
161
162 //==================================================
163 implementation
164 //==================================================
165
166 uses
167         SysUtils, Math, Windows,
168         MojuUtils;
169
170 const
171         GIKO_BAYESIAN_FILE_VERSION      = '1.0';
172 {
173         Modes                           = (ModeWhite, ModeGraph, ModeAlpha, ModeHanKana, ModeNum,
174                                                                 ModeWGraph, ModeWAlpha, ModeWNum,
175                                                                 ModeWHira, ModeWKata, ModeWKanji);
176 }
177         CharMode1 : array [ 0..255 ] of Byte =
178         (
179                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
180                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
181                 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
182                 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
183                 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
184                 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1,
185                 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
186                 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 0,
187
188                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
189                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
190                 0, 1, 1, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
191                 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
192                 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
193                 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
194                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
195                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
196         );
197
198 //************************************************************
199 // misc
200 //************************************************************
201
202 //==============================
203 // RemoveToken
204 //==============================
205 function RemoveToken(var s: string;const delimiter: string): string;
206 var
207         p: Integer;
208 begin
209         p := AnsiPos(delimiter, s);
210         if p = 0 then
211                 Result := s
212         else
213                 Result := Copy(s, 1, p - 1);
214         s := Copy(s, Length(Result) + Length(delimiter) + 1, Length(s));
215 end;
216
217 //==============================
218 // AbsSort
219 //==============================
220 function AbsSort( p1, p2 : Pointer ) : Integer;
221 var
222         v1, v2 : Single;
223 begin
224
225         v1 := Abs( Single( p1 ) - 0.5 );
226         v2 := Abs( Single( p2 ) - 0.5 );
227         if v1 > v2 then
228                 Result := -1
229         else if v1 = v2 then
230                 Result := 0
231         else
232                 Result := 1;
233
234 end;
235
236 //************************************************************
237 // TWordCount class
238 //************************************************************
239 constructor TWordCount.Create;
240 begin
241
242                 Duplicates              := dupIgnore;
243                 CaseSensitive   := True;
244                 Sorted                          := True;
245
246 end;
247
248 destructor TWordCount.Destroy;
249 var
250         i : Integer;
251 begin
252
253         for i := Count - 1 downto 0 do
254                 if Objects[ i ] <> nil then
255                         Objects[ i ].Free;
256
257         inherited;
258
259 end;
260
261 //************************************************************
262 // TGikoBayesian class
263 //************************************************************
264
265 //==============================
266 // Create
267 //==============================
268 constructor TGikoBayesian.Create;
269 begin
270
271         Duplicates              := dupIgnore;
272         CaseSensitive   := True;
273         Sorted                          := True;
274
275 end;
276
277 //==============================
278 // Destroy
279 //==============================
280 destructor TGikoBayesian.Destroy;
281 var
282         i : Integer;
283 begin
284
285         for i := Count - 1 downto 0 do
286                 if inherited Objects[ i ] <> nil then
287                         inherited Objects[ i ].Free;
288
289         inherited;
290
291 end;
292
293 procedure TGikoBayesian.LoadFromFile( const filePath : string );
294 var
295         i                       : Integer;
296         sl              : TStringList;
297         s                       : string;
298         name    : string;
299         info    : TWordInfo;
300 begin
301
302         FFilePath := filePath;
303
304         if not FileExists( filePath ) then
305                 Exit;
306
307         sl := TStringList.Create;
308         try
309                 sl.LoadFromFile( filePath );
310
311                 for i := 1 to sl.Count - 1 do begin
312                         s := sl[ i ];
313                         name := GikoBayesian.RemoveToken( s, #1 );
314                         info := TWordInfo.Create;
315                         info.NormalWord                 := StrToIntDef( '$' + GikoBayesian.RemoveToken( s, #1 ), 0 );
316                         info.ImportantWord      := StrToIntDef( '$' + GikoBayesian.RemoveToken( s, #1 ), 0 );
317                         info.NormalText                 := StrToIntDef( '$' + GikoBayesian.RemoveToken( s, #1 ), 0 );
318                         info.ImportantText      := StrToIntDef( '$' + GikoBayesian.RemoveToken( s, #1 ), 0 );
319
320                         AddObject( name, info );
321                 end;
322         finally
323                 sl.Free;
324         end;
325
326 end;
327
328 procedure TGikoBayesian.SaveToFile( const filePath : string );
329 var
330         i                       : Integer;
331         sl              : TStringList;
332         s                       : string;
333         info    : TWordInfo;
334 begin
335
336         FFilePath := filePath;
337
338         sl := TStringList.Create;
339         try
340                 sl.BeginUpdate;
341                 sl.Add( GIKO_BAYESIAN_FILE_VERSION );
342
343                 for i := 0 to Count - 1 do begin
344                         info := TWordInfo( inherited Objects[ i ] );
345                         s := Strings[ i ] + #1
346                                  + Format('%x', [info.NormalWord]) + #1
347                                  + Format('%x', [info.ImportantWord]) + #1
348                                  + Format('%x', [info.NormalText]) + #1
349                                  + Format('%x', [info.ImportantText]);
350
351                         sl.Add(s);
352                 end;
353                 sl.EndUpdate;
354                 sl.SaveToFile( filePath );
355         finally
356                 sl.Free;
357         end;
358
359 end;
360
361 procedure TGikoBayesian.Save;
362 begin
363
364         if FFilePath <> '' then
365                 SaveToFile( FFilePath );
366
367 end;
368
369 //==============================
370 // GetObject
371 //==============================
372 function TGikoBayesian.GetObject( const name : string ) : TWordInfo;
373 var
374         idx : Integer;
375 begin
376
377         if Find( name, idx ) then
378                 Result := TWordInfo( inherited Objects[ idx ] )
379         else
380                 Result := nil;
381
382 end;
383
384 //==============================
385 // SetObject
386 //==============================
387 procedure TGikoBayesian.SetObject( const name : string; value : TWordInfo );
388 var
389         idx : Integer;
390 begin
391
392         if Find( name, idx ) then
393                 inherited Objects[ idx ] := value
394         else
395                 AddObject( name, value );
396
397 end;
398
399
400 //==============================
401 // CountWord
402 //==============================
403 procedure TGikoBayesian.CountWord(
404         const text      : string;
405         wordCount               : TWordCount );
406 type
407         Modes                           = (ModeWhite, ModeGraph, ModeAlpha, ModeNum, ModeHanKana,
408                                                                 ModeWGraph, ModeWAlpha, ModeWNum,
409                                                                 ModeWHira, ModeWKata, ModeWKanji);
410 var
411         p, tail, last                   : PChar;
412         mode, newMode                   : Modes;
413         ch                                                              : Longword;
414         chSize                                          : Integer;
415         wHiraDelimiter          : TStringList;
416         wHiraFinalDelimiter     : TStringList;
417         wKanjiDelimiter         : TStringList;
418         words                                                   : TStringList;
419         aWord                                                   : string;
420 //      countInfo                                       : TWordCountInfo;
421
422         function cutBoth( _aWord : string; _delim : TStringList ) : string;
423         var
424                 _i                      : Integer;
425         begin
426                 for _i := 0 to _delim.Count - 1 do begin
427                         _aWord := CustomStringReplace(
428                                 _aWord,
429                                 _delim[ _i ],
430                                 #10 + _delim[ _i ] + #10, False );
431                 end;
432                 Result := _aWord;
433         end;
434
435         function cutFirst( _aWord : string; _delim : TStringList ) : string;
436         var
437                 _i                      : Integer;
438         begin
439                 for _i := 0 to _delim.Count - 1 do begin
440                         _aWord := CustomStringReplace(
441                                 _aWord,
442                                 _delim[ _i ],
443                                 #10 + _delim[ _i ], False );
444                 end;
445                 Result := _aWord;
446         end;
447
448         function cutFinal( _aWord : string; _delim : TStringList ) : string;
449         var
450                 _i                      : Integer;
451         begin
452                 for _i := 0 to _delim.Count - 1 do begin
453                         _aWord := CustomStringReplace(
454                                 _aWord,
455                                 _delim[ _i ],
456                                 _delim[ _i ] + #10, False );
457                 end;
458                 Result := _aWord;
459         end;
460
461         procedure addWord( _dst : TWordCount; _words : TStringList );
462         var
463                 _aWord                  : string;
464                 _i, _idx                : Integer;
465                 _countInfo      : TWordCountInfo;
466         begin
467                 for _i := 0 to _words.Count - 1 do begin
468                         _aWord := _words[ _i ];
469                         if Length( _aWord ) > 0 then begin
470                                 if _dst.Find( _aWord, _idx ) then begin
471                                         _countInfo := TWordCountInfo( _dst.Objects[ _idx ] );
472                                 end else begin
473                                         _countInfo := TWordCountInfo.Create;
474                                         _dst.AddObject( _aWord, _countInfo );
475                                 end;
476                                 _countInfo.WordCount := _countInfo.WordCount + 1;
477                         end;
478                 end;
479         end;
480
481         function changeMode( _aWord : string; _mode : Modes ) : string;
482         var
483                 _i                                                                      : Integer;
484                 _aWord2                                                 : string;
485                 _pWord, _pWord2                 : PChar;
486                 _pWordTail, _pFound     : PChar;
487         const
488                 _delim : string = #10;
489         begin
490 {$IFDEF GIKO_BAYESIAN_NO_HIRAGANA_DIC}
491                 if mode = ModeWHira then begin
492                         Result := '';
493                         Exit;
494                 end;
495 {$ENDIF}
496                 if Ord( _mode ) >= Ord( ModeWGraph ) then begin
497                         // \93ú\96{\8cê
498                         // \83X\83y\81[\83X\82ð\8bl\82ß\82é
499                         _aWord := CustomStringReplace( _aWord, ' ', '', False );
500                         _aWord := CustomStringReplace( _aWord, '\81@', '', False );
501
502                         // \83f\83\8a\83~\83^\82Å\92P\8cê\95ª\82¯
503                         case mode of
504                         ModeWHira:
505                                 begin
506                                         _aWord := cutFinal( _aWord, wHiraFinalDelimiter );
507                                         Result := cutBoth( _aWord, wHiraDelimiter );
508                                 end;
509
510                         ModeWKanji:
511                                 begin
512                                         // \83f\83\8a\83~\83^\82Å\92P\8cê\95ª\82¯
513                                         _aWord := cutBoth( _aWord, wKanjiDelimiter );
514                                         // 4 byte (2 \8e\9a\82¸\82Â\82Å\92P\8cê\95ª\82¯
515                                         _pWord := PChar( _aWord );
516                                         _i := Length( _aWord );
517                                         _pWordTail := _pWord + _i;
518                                         SetLength( _aWord2, _i + (_i shr 2) );
519                                         _pWord2 := PChar( _aWord2 );
520
521                                         while _pWord < _pWordTail do begin
522                                                 _pFound := AnsiStrPos( _pWord, PChar( _delim ) );
523                                                 if _pFound = nil then
524                                                         _pFound := _pWordTail;
525                                                 _pFound := _pFound - 3;
526
527                                                 while _pWord <= _pFound do begin
528                                                         CopyMemory( _pWord2, _pWord, 4 ); _pWord2[ 4 ] := #10;
529                                                         _pWord2 := _pWord2 + 5; _pWord := _pWord + 4;
530                                                 end;
531                                                 _i := _pFound + 4 - _pWord; // 4 = 3 + #10
532                                                 CopyMemory( _pWord2, _pWord, _i );
533                                                 _pWord2 := _pWord2 + _i; _pWord := _pWord + _i;
534                                         end;
535                                         if _pWord < _pWordTail then begin
536                                                 _i := _pWordTail - _pWord;
537                                                 CopyMemory( _pWord2, _pWord, _i );
538                                                 _pWord2 := _pWord2 + _i;
539                                         end;
540                                         SetLength( _aWord2, _pWord2 - PChar( _aWord2 ) );
541
542                                         Result := _aWord2;
543                                 end;
544
545                         else
546                                 Result := _aWord;
547                         end;
548                 end else begin
549                         Result := _aWord;
550                 end;
551         end;
552 const
553         WHIRA_DELIMITER = '\82ð' + #10 + '\82É' + #10 + '\82ª' + #10 + '\82Æ' + #10 + '\82©\82ç'
554                 + #10 + '\82Ö' + #10 + '\82æ\82è' + #10 + '\82Ü\82Å'+ #10 + '\82Å'
555                 + #10 + '\82±\82±' + #10 + '\82»\82±' + #10 + '\82Ç\82±'
556                 + #10 + '\82±\82ê' + #10 + '\82»\82ê' + #10 + '\82 \82ê' + #10 + '\82Ç\82ê'
557                 + #10 + '\82±\82Ì' + #10 + '\82»\82Ì' + #10 + '\82 \82Ì' + #10 + '\82Ç\82Ì'
558                 + #10 + '\82±\82¤' + #10 + '\82»\82¤' + #10 + '\82 \82 ' + #10 + '\82Ç\82¤'
559                 + #10 + '\82±\82ñ\82È' + #10 + '\82»\82ñ\82È' + #10 + '\82 \82ñ\82È' + #10 + '\82Ç\82ñ\82È'
560                 + #10 + '\82ê\82½' + #10 + '\82ê\82Ä' + #10 + '\82ê\82ê' + #10 + '\82ê\82ë'
561                 + #10 + '\82ê\82é' + #10 + '\82ç\82ê\82é'
562                 + #10 + '\82Å\82·' + #10 + '\82Ü\82·' + #10 + '\82Ü\82¹\82ñ'
563                 + #10 + '\82Å\82µ\82½' + #10 + '\82Ü\82µ\82½'
564                 + #10 + '\82·\82é' + #10 + '\82µ\82È\82¢' + #10 + '\82³\82ê\82é' + #10 + '\82³\82ê\82È\82¢'
565                 ;
566         WKANJI_DELIMITER = '\93I' + #10 + '\90«' + #10 + '\8e®' + #10 + '\89»' + #10 + '\96@'
567                 + #10 + '\95s' + #10 + '\96³' + #10 + '\94ñ' + #10 + '\94½'
568                 ;
569         WHIRA_FINAL_DELIMITER = '\82Á\82½' + #10 + '\82Á\82Ä'
570                 ;{
571                 + #10 + '\82æ\82Á\82Ä' + #10 + '\82µ\82½\82ª\82Á\82Ä' + #10 + '\82È\82Ì\82Å'
572                 + #10 + '\82¾\82©\82ç' + #10 + '\82Å\82·\82©\82ç'
573                 + #10 + '\82Ü\82½'
574                 + #10 + '\82µ\82©\82µ' + #10 + '\82¾\82ª' + #10 + '\82¯\82Ç' + #10 + '\82¯\82ê\82Ç'
575                 + #10 + '\82â\82Í\82è' + #10 + '\82â\82Á\82Ï\82è'
576                 + #10 + '\82Å\82µ' + #10 + '\82¾\82ë'
577                 + #10 + '\82·\82é' + #10 + '\82µ\82È\82¢' + #10 + '\82µ\82½' + #10 + '\82µ\82È\82¢'
578                 ;}
579         // '\81[' \82ð '\82\9f\82¡\82£\82¥\82§' \82É\81B
580         HA_LINE = '\82 \82©\82³\82½\82È\82Í\82Ü\82â\82ç\82í\82ª\82´\82¾\82Î\82Ï\82\9f\82ì';
581         HI_LINE = '\82¢\82«\82µ\82¿\82É\82Ð\82Ý\82è\82î\82¬\82\82Ñ\82Ò\82¡';
582         HU_LINE = '\82¤\82­\82·\82Â\82Ê\82Ó\82Þ\82ä\82é\82®\82Ô\82Õ\82£';
583         HE_LINE = '\82¦\82¯\82¹\82Ä\82Ë\82Ö\82ß\82ê\82ï\82°\82×\82Ø\82¥';
584         HO_LINE = '\82¨\82±\82»\82Æ\82Ì\82Ù\82à\82æ\82ë\82ð\82²\82Ú\82Û\82§';
585         KA_LINE = '\83A\83J\83T\83^\83i\83n\83}\83\84\83\89\83\8f\83K\83U\83_\83o\83p\83@\83\95\83\8e';
586         KI_LINE = '\83C\83L\83V\83`\83j\83q\83~\83\8a\83\90\83M\83W\83r\83s\83B';
587         KU_LINE = '\83E\83N\83X\83c\83k\83t\83\80\83\86\83\8b\83O\83u\83v\83D\83\94';
588         KE_LINE = '\83G\83P\83Z\83e\83l\83w\83\81\83\8c\83\91\83Q\83x\83y\83F\83\96';
589         KO_LINE = '\83I\83R\83\\83g\83m\83z\83\82\83\88\83\8d\83\92\83S\83{\83|\83H';
590         kKanji = [$80..$A0, $E0..$ff];
591 begin
592
593         wHiraDelimiter  := TStringList.Create;
594         wHiraFinalDelimiter := TStringList.Create;
595         wKanjiDelimiter := TStringList.Create;
596         words := TStringList.Create;
597         try
598                 mode := ModeWhite;
599 {$IFNDEF GIKO_BAYESIAN_NO_HIRAGANA_DIC}
600                 wHiraDelimiter.Text := WHIRA_DELIMITER;
601                 wHiraFinalDelimiter.Text := WHIRA_FINAL_DELIMITER;
602 {$ENDIF}
603                 wKanjiDelimiter.Text := WKANJI_DELIMITER;
604                 p                       := PChar( text );
605                 tail    := p + Length( text );
606                 last    := p;
607
608                 while p < tail do begin
609                         // \95\8e\9a\82Ì\83^\83C\83v\82ð\94»\95Ê
610                         // \81¦\8bå\93Ç\93_\82Í ModeGraph \82É\82È\82é\82Ì\82Å\8cÂ\95Ê\82É\91Î\89\9e\82µ\82È\82­\82Ä\82à\82¢\82¢
611 //                      if Byte(Byte( p^ ) - $a1) < $5e then begin
612                         if Byte( p^ ) in kKanji then begin
613                                 if p + 1 < tail then begin
614                                         ch := (PByte( p )^ shl 8) or PByte( p + 1 )^;
615                                         case ch of
616                                         // \83X\83y\81[\83X\82Å\92P\8cê\95ª\82¯\82¹\82¸\82É\8bl\82ß\82é
617                                         //$8140:                                                        newMode := ModeWhite;
618                                         $8141..$824e:                           newMode := ModeWGraph;
619                                         $824f..$8258:                           newMode := ModeWNum;
620                                         $8260..$829a:                           newMode := ModeWAlpha;
621                                         $829f..$82f1:                           newMode := ModeWHira;
622                                         $8340..$8396:                           newMode := ModeWKata;
623                                         else                                                            newMode := ModeWKanji;
624                                         end;
625                                         // '\81J\81K\81[' \82Í\95½\89¼\96¼\81A\82Ü\82½\82Í\83J\83^\83J\83i\82É\8aÜ\82Ü\82ê\82é
626                                         if (mode = ModeWHira) or (mode = ModeWKata) then
627                                                 if (ch = $814a) or (ch = $814b) or (ch = $815b) then
628                                                         newMode := mode;
629                                 end else begin
630                                         newMode := ModeWhite;
631                                 end;
632
633                                 chSize := 2;
634                         end else begin
635                                 newMode := Modes( CharMode1[ Byte( p^ ) ] );
636                                 if (p^ = ' ') and (Ord( mode ) >= Ord( ModeWGraph )) then begin
637                                         // \8d¡\82Ü\82Å\93ú\96{\8cê\82Å\8d¡\83X\83y\81[\83X
638                                         // \92P\8cê\82ð\8cq\82°\82Ä\8cã\82Å\83X\83y\81[\83X\82ð\8bl\82ß\82é
639                                         // \81¦\94¼\8ap\83J\83i\82Í\92Ê\8fí\83X\83y\81[\83X\82Å\8bæ\90Ø\82é\82¾\82ë\82¤\82©\82ç\8bl\82ß\82È\82¢
640                                         newMode := mode;
641                                 end;
642
643                                 chSize := 1;
644                         end;
645
646                         if mode <> newMode then begin
647
648                                 // \95\8e\9a\82Ì\83^\83C\83v\82ª\95Ï\8dX\82³\82ê\82½
649                                 if mode <> ModeWhite then begin
650                                         SetLength( aWord, p - last );
651                                         CopyMemory( PChar( aWord ), last, p - last );
652
653                                         words.Text := changeMode( aWord, mode );
654
655                                         // \92P\8cê\93o\98^
656                                         addWord( wordCount, words );
657                                 end;
658
659                                 last := p;
660                                 mode := newMode;
661
662                         end;
663
664                         p := p + chSize;
665                 end;    // while
666
667                 if mode <> ModeWhite then begin
668                         SetLength( aWord, p - last );
669                         CopyMemory( PChar( aWord ), last, p - last );
670
671                         words.Text := changeMode( aWord, mode );
672
673                         // \92P\8cê\93o\98^
674                         addWord( wordCount, words );
675                 end;
676         finally
677                 words.Free;
678                 wKanjiDelimiter.Free;
679                 wHiraFinalDelimiter.Free;
680                 wHiraDelimiter.Free;
681         end;
682
683 end;
684
685 //==============================
686 // CalcPaulGraham
687 //==============================
688 function TGikoBayesian.CalcPaulGraham( wordCount : TWordCount ) : Extended;
689
690         function p( const aWord : string ) : Single;
691         var
692                 info : TWordInfo;
693         begin
694                 info := Objects[ aWord ];
695                 if info = nil then
696                         Result := 0.415
697                 else if info.NormalWord = 0 then
698                         Result := 0.99
699                 else if info.ImportantWord = 0 then
700                         Result := 0.01
701                 else if info.ImportantWord + info.NormalWord * 2 < 5 then
702                         Result := 0.5
703                 else begin
704                         try
705                                 Result := ( info.ImportantWord / info.ImportantText ) /
706                                         ((info.NormalWord * 2 / info.NormalText ) +
707                                          (info.ImportantWord / info.ImportantText));
708                         except
709                 on EZeroDivide do Result := 0.99;
710                         end;
711                 end;
712         end;
713
714 var
715         s, q                            : Extended;
716         i                                               : Integer;
717         narray                  : TList;
718 const
719         SAMPLE_COUNT    = 15;
720 begin
721
722         Result := 1;
723         if wordCount.Count = 0 then
724                 Exit;
725
726         narray := TList.Create;
727         try
728                 for i := 0 to wordCount.Count - 1 do begin
729                         narray.Add( Pointer( p( wordCount[ i ] ) ) );
730                 end;
731
732                 narray.Sort( AbsSort );
733
734                 s := 1;
735                 q := 1;
736                 i := min( SAMPLE_COUNT, narray.Count );
737                 while i > 0 do begin
738                         Dec( i );
739
740                         s := s * Single( narray[ i ] );
741                         q := q * (1 - Single( narray[ i ] ));
742                 end;
743                 try
744                         Result := s / (s + q);
745                 except
746             Result := 0.5;
747                 end;
748         finally
749                 narray.Free;
750         end;
751
752 end;
753
754 //==============================
755 // CalcGaryRobinson
756 //==============================
757 function TGikoBayesian.CalcGaryRobinson( wordCount : TWordCount ) : Extended;
758
759         function p( const aWord : string ) : Single;
760         var
761                 info : TWordInfo;
762         begin
763                 info := Objects[ aWord ];
764                 if info = nil then
765                         Result := 0.415
766                 else if info.ImportantWord = 0 then
767                         Result := 0.01
768                 else if info.NormalWord = 0 then
769                         Result := 0.99
770                 else
771                 {
772                         Result := ( info.ImportantWord / info.ImportantText ) /
773                                 ((info.NormalWord / info.NormalText ) +
774                                  (info.ImportantWord / info.ImportantText));
775                 }
776                         try
777                                 Result := (info.ImportantWord * info.NormalText) /
778                                         (info.NormalWord * info.ImportantText +
779                                         info.ImportantWord * info.NormalText);
780                         except
781                                 Result := 0.5;
782                         end;
783         end;
784
785         function f( cnt : Integer; n, mean : Single ) : Extended;
786         const
787                 k = 0.001;
788         begin
789                 Result := ( (k * mean) + (cnt * n) ) / (k + cnt);
790         end;
791
792 var
793         n                                               : Extended;
794         narray                  : array of Single;
795         mean                            : Extended;
796         countInfo               : TWordCountInfo;
797         i                                               : Integer;
798         P1, Q1{, R1}    : Extended;
799         cnt                                     : Extended;
800 begin
801
802         if wordCount.Count = 0 then begin
803                 Result := 1;
804                 Exit;
805         end;
806
807         SetLength( narray, wordCount.Count );
808         mean := 0;
809         for i := 0 to wordCount.Count - 1 do begin
810                 n                                               := p( wordCount[ i ] );
811                 narray[ i ]     := n;
812                 mean                            := mean + n;
813         end;
814         mean := mean / wordCount.Count;
815
816         P1 := 1;
817         Q1 := 1;
818         for i := 0 to wordCount.Count - 1 do begin
819                 countInfo       := TWordCountInfo( wordCount.Objects[ i ] );
820                 n                                               := f( countInfo.WordCount, narray[ i ], mean );
821                 P1 := P1 * ( 1 - n );
822                 Q1 := Q1 * n;
823         end;
824         cnt := wordCount.Count;
825         if cnt = 0 then
826                 cnt := 1;
827         try
828                 P1 := 1 - Power( P1, 1 / cnt );
829         except
830         end;
831         try
832                 Q1 := 1 - Power( Q1, 1 / cnt );
833         except
834         end;
835
836         if P1 + Q1 = 0 then begin
837                 Result := 0.5
838         end else begin
839                 n := (P1 - Q1) / (P1 + Q1);
840                 Result := (1 + n) / 2;
841         end;
842
843 end;
844
845 //==============================
846 // CalcGaryRobinsonFisher
847 //==============================
848 function TGikoBayesian.CalcGaryRobinsonFisher(
849         wordCount : TWordCount
850 ) : Extended;
851
852         function p( const aWord : string ) : Single;
853         var
854                 info                            : TWordInfo;
855         begin
856                 info := Objects[ aWord ];
857                 if info = nil then
858                         Result := 0.415
859                 else if info.ImportantWord = 0 then
860                         Result := 0.01
861                 else if info.NormalWord = 0 then
862                         Result := 0.99
863                 else
864                 {
865                         Result := ( info.ImportantWord / info.ImportantText ) /
866                                 ((info.NormalWord / info.NormalText ) +
867                                  (info.ImportantWord / info.ImportantText));
868                 }
869                         Result := (info.ImportantWord * info.NormalText) /
870                                 (info.NormalWord * info.ImportantText +
871                                 info.ImportantWord * info.NormalText);
872         end;
873
874         function f( cnt : Integer; n, mean : Single ) : Extended;
875         const
876                 k = 0.001;
877         begin
878                 Result := ( (k * mean) + (cnt * n) ) / (k + cnt);
879         end;
880
881         function prbx( x2, degree : Extended ) : Extended;
882         begin
883
884                 Result := 0.5;
885
886         end;
887
888 var
889         n                                               : Extended;
890         narray                  : array of Single;
891         mean                            : Extended;
892         countInfo               : TWordCountInfo;
893         i                                               : Integer;
894 //      normal                  : Extended;
895 //      important               : Extended;
896         P1, Q1                  : Extended;
897         cnt                                     : Extended;
898 begin
899
900         if wordCount.Count = 0 then begin
901                 Result := 1;
902                 Exit;
903         end;
904
905         SetLength( narray, wordCount.Count );
906         mean := 0;
907         for i := 0 to wordCount.Count - 1 do begin
908                 n                                               := p( wordCount[ i ] );
909                 narray[ i ]     := n;
910                 mean                            := mean + n;
911         end;
912         mean := mean / wordCount.Count;
913
914         P1 := 1;
915         Q1 := 1;
916         for i := 0 to wordCount.Count - 1 do begin
917                 countInfo       := TWordCountInfo( wordCount.Objects[ i ] );
918                 n                                               := f( countInfo.WordCount, narray[ i ], mean );
919                 P1 := P1 * ( 1 - n );
920                 Q1 := Q1 * n;
921         end;
922         cnt := wordCount.Count;
923         if cnt = 0 then
924                 cnt := 1;
925         try
926                 P1 := Power( P1, 1 / cnt );
927         except
928         end;
929         try
930                 Q1 := Power( Q1, 1 / cnt );
931         except
932         end;
933
934         P1 := 1 - prbx( -2 * Ln( P1 ), 2 * cnt );
935         Q1 := 1 - prbx( -2 * Ln( Q1 ), 2 * cnt );
936
937         Result := (1 + P1 - Q1) / 2;
938
939 end;
940
941 //==============================
942 // Parse
943 //==============================
944 function TGikoBayesian.Parse(
945         const text                              : string;
946         wordCount                                       : TWordCount;
947         algorithm                                       : TGikoBayesianAlgorithm
948 ) : Extended;
949 begin
950
951         CountWord( text, wordCount );
952         case algorithm of
953         gbaPaulGraham:          Result := CalcPaulGraham( wordCount );
954         gbaGaryRobinson:        Result := CalcGaryRobinson( wordCount );
955         gbaGaryRobinsonFisher:
956                                                                                 Result := CalcGaryRobinsonFisher( wordCount );
957         else                                                    Result := 0;
958         end;
959
960 end;
961
962 //==============================
963 // Learn
964 //==============================
965 procedure TGikoBayesian.Learn(
966         wordCount                : TWordCount;
967         isImportant      : Boolean );
968 var
969         aWord                   : string;
970         wordinfo        : TWordInfo;
971         countinfo       : TWordCountInfo;
972         i                                       : Integer;
973 begin
974
975         for i := 0 to wordCount.Count - 1 do begin
976                 aWord := wordCount[ i ];
977                 wordinfo := Objects[ aWord ];
978                 countinfo := TWordCountInfo( wordCount.Objects[ i ] );
979                 if wordinfo = nil then begin
980                         wordinfo := TWordInfo.Create;
981                         Objects[ aWord ] := wordinfo;
982                 end;
983
984                 if isImportant then begin
985                         wordinfo.ImportantWord := wordinfo.ImportantWord + countinfo.WordCount;
986                         wordinfo.ImportantText := wordinfo.ImportantText + 1;
987                 end else begin
988                         wordinfo.NormalWord := wordinfo.NormalWord + countinfo.WordCount;
989                         wordinfo.NormalText := wordinfo.NormalText + 1;
990                 end;
991         end;
992
993 end;
994
995 //==============================
996 // Forget
997 //==============================
998 procedure       TGikoBayesian.Forget(
999         wordCount               : TWordCount;
1000         isImportant     : Boolean );
1001 var
1002         aWord                   : string;
1003         wordinfo        : TWordInfo;
1004         countinfo       : TWordCountInfo;
1005         i                       : Integer;
1006 begin
1007
1008         for i := 0 to wordCount.Count - 1 do begin
1009                 aWord := wordCount[ i ];
1010                 wordinfo := Objects[ aWord ];
1011                 if wordinfo = nil then
1012                         Continue;
1013
1014                 countinfo := TWordCountInfo( wordCount.Objects[ i ] );
1015                 if isImportant then begin
1016                         if wordInfo.ImportantText > 0 then begin
1017                                 wordinfo.ImportantText := wordinfo.ImportantText - 1;
1018                                 wordinfo.ImportantWord := wordinfo.ImportantWord - countinfo.WordCount;
1019                         end;
1020                 end else begin
1021                         if wordinfo.NormalText > 0 then begin
1022                                 wordinfo.NormalText := wordinfo.NormalText - 1;
1023                                 wordinfo.NormalWord := wordinfo.NormalWord - countinfo.WordCount;
1024                         end;
1025                 end;
1026         end;
1027
1028 end;
1029
1030 end.