OSDN Git Service

前回のコミットで ModeWhite のチェックが抜けてしまっていたので修正。
[gikonavigoeson/gikonavi.git] / GikoBayesian.pas
1 unit GikoBayesian;
2
3 {!
4 \file           GikoBayesian.pas
5 \brief  \83x\83C\83W\83A\83\93\83t\83B\83\8b\83^
6
7 $Id: GikoBayesian.pas,v 1.13 2004/11/01 05:18:21 yoffy Exp $
8 }
9
10 interface
11
12 //==================================================
13 uses
14 //==================================================
15         Classes;
16
17 //==================================================
18 type
19 //==================================================
20
21         {!***********************************************************
22         \brief \92P\8cê\83v\83\8d\83p\83e\83B
23         ************************************************************}
24         TWordInfo       = class( TObject )
25         private
26                 FNormalWord                     :       Integer;        //!< \92Ê\8fí\82Ì\92P\8cê\82Æ\82µ\82Ä\93o\8fê\82µ\82½\89ñ\90\94
27                 FImportantWord  : Integer;      //!< \92\8d\96Ú\92P\8cê\82Æ\82µ\82Ä\93o\8fê\82µ\82½\89ñ\90\94
28                 FNormalText                     : Integer;      //!< \92Ê\8fí\82Ì\92P\8cê\82Æ\82µ\82Ä\8aÜ\82Ü\82ê\82Ä\82¢\82½\95\8fÍ\82Ì\90\94
29                 FImportantText  : Integer;      //!< \92\8d\96Ú\92P\8cê\82Æ\82µ\82Ä\8aÜ\82Ü\82ê\82Ä\82¢\82½\95\8fÍ\82Ì\90\94
30
31         public
32                 property NormalWord                     : Integer       read FNormalWord write FNormalWord;
33                 property ImportantWord  : Integer       read FImportantWord write FImportantWord;
34                 property NormalText                     : Integer       read FNormalText write FNormalText;
35                 property ImportantText  : Integer       read FImportantText write FImportantText;
36         end;
37
38         {!***********************************************************
39         \brief \89ð\90Í\8dÏ\82Ý\92P\8cê\83v\83\8d\83p\83e\83B
40         ************************************************************}
41         TWordCountInfo  = class( TObject )
42         private
43                 FWordCount      :       Integer;        //!< \92P\8cê\90\94
44
45         public
46                 property WordCount      : Integer       read FWordCount write FWordCount;
47         end;
48
49         {!***********************************************************
50         \brief \89ð\90Í\8dÏ\82Ý\92P\8cê\83\8a\83X\83g
51         ************************************************************}
52 //      TWordCount      = class( THashedStringList )    // \8c\83\92x
53         TWordCount      = class( TStringList )
54         public
55                 constructor Create;
56                 destructor Destroy; override;
57         end;
58
59         {!***********************************************************
60         \brief \83t\83B\83\8b\83^\83A\83\8b\83S\83\8a\83Y\83\80
61         ************************************************************}
62         TGikoBayesianAlgorithm =
63                 (gbaPaulGraham, gbaGaryRobinson, gbaGaryRobinsonFisher);
64
65         {!***********************************************************
66         \brief \83x\83C\83W\83A\83\93\83t\83B\83\8b\83^
67         ************************************************************}
68 //      TGikoBayesian = class( THashedStringList )      // \8c\83\92x
69         TGikoBayesian = class( TStringList )
70         private
71                 FFilePath       : string;       //!< \93Ç\82Ý\8d\9e\82ñ\82¾\83t\83@\83C\83\8b\83p\83X
72                 function GetObject( const name : string ) : TWordInfo;
73                 procedure SetObject( const name : string; value : TWordInfo );
74
75         public
76                 constructor Create;
77                 destructor Destroy; override;
78
79                 //! \83t\83@\83C\83\8b\82©\82ç\8aw\8fK\97\9a\97ð\82ð\93Ç\82Ý\8fo\82µ\82Ü\82·
80                 procedure LoadFromFile( const filePath : string );
81
82                 //! \83t\83@\83C\83\8b\82É\8aw\8fK\97\9a\97ð\82ð\95Û\91\82µ\82Ü\82·
83                 procedure SaveToFile( const filePath : string );
84
85                 //! \83t\83@\83C\83\8b\82É\8aw\8fK\97\9a\97ð\82ð\95Û\91\82µ\82Ü\82·
86                 procedure Save;
87
88                 //! \92P\8cê\82É\91Î\82·\82é\8fî\95ñ\82ð\8eæ\93¾\82µ\82Ü\82·
89                 property Objects[ const name : string ] : TWordInfo
90                         read GetObject write SetObject; default;
91
92                 //! \95\8fÍ\82É\8aÜ\82Ü\82ê\82é\92P\8cê\82ð\83J\83E\83\93\83g\82µ\82Ü\82·
93                 procedure CountWord(
94                         const text      : string;
95                         wordCount               : TWordCount );
96
97                 {!
98                 \brief  Paul Graham \96@\82É\8aî\82Ã\82¢\82Ä\95\8fÍ\82Ì\92\8d\96Ú\93x\82ð\8c\88\92è\82µ\82Ü\82·
99                 \return \95\8fÍ\82Ì\92\8d\96Ú\93x (\92\8d\96Ú\82É\92l\82µ\82È\82¢ 0.0\81`1.0 \92\8d\96Ú\82·\82×\82«)
100                 }
101                 function CalcPaulGraham( wordCount : TWordCount ) : Extended;
102
103                 {!
104                 \brief  GaryRobinson \96@\82É\8aî\82Ã\82¢\82Ä\95\8fÍ\82Ì\92\8d\96Ú\93x\82ð\8c\88\92è\82µ\82Ü\82·
105                 \return \95\8fÍ\82Ì\92\8d\96Ú\93x (\92\8d\96Ú\82É\92l\82µ\82È\82¢ 0.0\81`1.0 \92\8d\96Ú\82·\82×\82«)
106                 }
107                 function CalcGaryRobinson( wordCount : TWordCount ) : Extended;
108
109                 {!
110                 \brief  GaryRobinson-Fisher \96@\82É\8aî\82Ã\82¢\82Ä\95\8fÍ\82Ì\92\8d\96Ú\93x\82ð\8c\88\92è\82µ\82Ü\82·
111                 \return \95\8fÍ\82Ì\92\8d\96Ú\93x (\92\8d\96Ú\82É\92l\82µ\82È\82¢ 0.0\81`1.0 \92\8d\96Ú\82·\82×\82«)
112                 }
113                 function CalcGaryRobinsonFisher( wordCount : TWordCount ) : Extended;
114
115                 {!
116                 \brief  \95\8fÍ\82ð\89ð\90Í
117                 \param  text                                    \89ð\90Í\82·\82é\95\8fÍ
118                 \param  wordCount                       \89ð\90Í\82³\82ê\82½\92P\8cê\83\8a\83X\83g\82ª\95Ô\82é
119                 \param  algorithm                       \92\8d\96Ú\93x\82Ì\8c\88\92è\82É\97p\82¢\82é\83A\83\8b\83S\83\8a\83Y\83\80\82ð\8ew\92è\82µ\82Ü\82·
120                 \return \95\8fÍ\82Ì\92\8d\96Ú\93x (\92\8d\96Ú\82É\92l\82µ\82È\82¢ 0.0\81`1.0 \92\8d\96Ú\82·\82×\82«)
121
122                 CountWord \82Æ Calcxxxxx \82ð\82Ü\82Æ\82ß\82Ä\8eÀ\8ds\82·\82é\82¾\82¯\82Å\82·\81B
123                 }
124                 function Parse(
125                         const text                              : string;
126                         wordCount                                       : TWordCount;
127                         algorithm                                       : TGikoBayesianAlgorithm = gbaGaryRobinsonFisher
128                 ) : Extended;
129
130                 {!
131                 \brief  \8aw\8fK\82·\82é
132                 \param  wordCount               Parse \82Å\89ð\90Í\82³\82ê\82½\92P\8cê\83\8a\83X\83g
133                 \param  isImportant \92\8d\96Ú\82·\82×\82«\95\8fÍ\82Æ\82µ\82Ä\8ao\82¦\82é\82È\82ç True
134                 }
135                 procedure Learn(
136                         wordCount                : TWordCount;
137                         isImportant      : Boolean );
138
139                 {!
140                 \brief          \8aw\8fK\8c\8b\89Ê\82ð\96Y\82ê\82é
141                 \param          wordCount               Parse \82Å\89ð\90Í\82³\82ê\82½\92P\8cê\83\8a\83X\83g
142                 \param          isImportant     \92\8d\96Ú\82·\82×\82«\95\8fÍ\82Æ\82µ\82Ä\8ao\82¦\82ç\82ê\82Ä\82¢\82½\82È\82ç True
143                 \warning        \8aw\8fK\8dÏ\82Ý\82Ì\95\8fÍ\82©\82Ç\82¤\82©\82Í\8am\94F\8fo\97\88\82Ü\82¹\82ñ\81B<br>
144                                                         Learn \82µ\82Ä\82¢\82È\82¢\95\8fÍ\82â isImportant \82ª\8aÔ\88á\82Á\82Ä\82¢\82é\95\8fÍ\82ð
145                                                         Forget \82·\82é\82Æ\83f\81[\83^\83x\81[\83X\82ª\94j\91¹\82µ\82Ü\82·\81B<br>
146                                                         \8aw\8fK\8dÏ\82Ý\82©\82Ç\82¤\82©\82Í\93Æ\8e©\82É\8aÇ\97\9d\82µ\82Ä\82­\82¾\82³\82¢\81B
147
148                 \91S\82Ä\82Ì\8aw\8fK\8c\8b\89Ê\82ð\83N\83\8a\83A\82·\82é\82í\82¯\82Å\82Í\82 \82è\82Ü\82¹\82ñ\81B<br>
149                 wordCount \82ð\93¾\82½\95\8fÍ (Parse \82Ì text \88ø\90\94\82Ì\8aw\8fK\8c\8b\89Ê\82Ì\82Ý\83N\83\8a\83A\82µ\82Ü\82·\81B<br><br>
150
151                 \8eå\82É\92\8d\96Ú\95\8fÍ\82Æ\94ñ\92\8d\96Ú\95\8fÍ\82ð\90Ø\82è\91Ö\82¦\82é\82½\82ß\82É Forget -> Learn \82Ì\8f\87\82Å\8eg\97p\82µ\82Ü\82·\81B
152                 }
153                 procedure       Forget(
154                         wordCount               : TWordCount;
155                         isImportant     : Boolean );
156         end;
157
158 //==================================================
159 implementation
160 //==================================================
161
162 uses
163         SysUtils, Math, Windows,
164         MojuUtils;
165
166 const
167         GIKO_BAYESIAN_FILE_VERSION      = '1.0';
168 {
169         Modes                           = (ModeWhite, ModeGraph, ModeAlpha, ModeHanKana, ModeNum,
170                                                                 ModeWGraph, ModeWAlpha, ModeWNum,
171                                                                 ModeWHira, ModeWKata, ModeWKanji);
172 }
173         CharMode1 : array [ 0..255 ] of Byte =
174         (
175                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
176                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
177                 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
178                 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
179                 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
180                 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1,
181                 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
182                 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 0,
183
184                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
185                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
186                 0, 1, 1, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
187                 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
188                 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
189                 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
190                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
191                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
192         );
193
194 //************************************************************
195 // misc
196 //************************************************************
197
198 //==============================
199 // RemoveToken
200 //==============================
201 function RemoveToken(var s: string;const delimiter: string): string;
202 var
203         p: Integer;
204 begin
205         p := AnsiPos(delimiter, s);
206         if p = 0 then
207                 Result := s
208         else
209                 Result := Copy(s, 1, p - 1);
210         s := Copy(s, Length(Result) + Length(delimiter) + 1, Length(s));
211 end;
212
213 //==============================
214 // AbsSort
215 //==============================
216 function AbsSort( p1, p2 : Pointer ) : Integer;
217 var
218         v1, v2 : Single;
219 begin
220
221         v1 := Abs( Single( p1 ) - 0.5 );
222         v2 := Abs( Single( p2 ) - 0.5 );
223         if v1 > v2 then
224                 Result := -1
225         else if v1 = v2 then
226                 Result := 0
227         else
228                 Result := 1;
229
230 end;
231
232 //************************************************************
233 // TWordCount class
234 //************************************************************
235 constructor TWordCount.Create;
236 begin
237
238                 Duplicates              := dupIgnore;
239                 CaseSensitive   := True;
240                 Sorted                          := True;
241
242 end;
243
244 destructor TWordCount.Destroy;
245 var
246         i : Integer;
247 begin
248
249         for i := Count - 1 downto 0 do
250                 if Objects[ i ] <> nil then
251                         Objects[ i ].Free;
252
253         inherited;
254
255 end;
256
257 //************************************************************
258 // TGikoBayesian class
259 //************************************************************
260
261 //==============================
262 // Create
263 //==============================
264 constructor TGikoBayesian.Create;
265 begin
266
267         Duplicates              := dupIgnore;
268         CaseSensitive   := True;
269         Sorted                          := True;
270
271 end;
272
273 //==============================
274 // Destroy
275 //==============================
276 destructor TGikoBayesian.Destroy;
277 var
278         i : Integer;
279 begin
280
281         for i := Count - 1 downto 0 do
282                 if inherited Objects[ i ] <> nil then
283                         inherited Objects[ i ].Free;
284
285         inherited;
286
287 end;
288
289 procedure TGikoBayesian.LoadFromFile( const filePath : string );
290 var
291         i                       : Integer;
292         sl              : TStringList;
293         s                       : string;
294         name    : string;
295         info    : TWordInfo;
296 begin
297
298         FFilePath := filePath;
299
300         if not FileExists( filePath ) then
301                 Exit;
302
303         sl := TStringList.Create;
304         try
305                 sl.LoadFromFile( filePath );
306
307                 for i := 1 to sl.Count - 1 do begin
308                         s := sl[ i ];
309                         name := RemoveToken( s, #1 );
310                         info := TWordInfo.Create;
311                         info.NormalWord                 := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
312                         info.ImportantWord      := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
313                         info.NormalText                 := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
314                         info.ImportantText      := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
315
316                         AddObject( name, info );
317                 end;
318         finally
319                 sl.Free;
320         end;
321
322 end;
323
324 procedure TGikoBayesian.SaveToFile( const filePath : string );
325 var
326         i                       : Integer;
327         sl              : TStringList;
328         s                       : string;
329         info    : TWordInfo;
330 begin
331
332         FFilePath := filePath;
333
334         sl := TStringList.Create;
335         try
336                 sl.BeginUpdate;
337                 sl.Add( GIKO_BAYESIAN_FILE_VERSION );
338
339                 for i := 0 to Count - 1 do begin
340                         info := TWordInfo( inherited Objects[ i ] );
341                         s := Strings[ i ] + #1
342                                  + Format('%x', [info.NormalWord]) + #1
343                                  + Format('%x', [info.ImportantWord]) + #1
344                                  + Format('%x', [info.NormalText]) + #1
345                                  + Format('%x', [info.ImportantText]);
346
347                         sl.Add(s);
348                 end;
349                 sl.EndUpdate;
350                 sl.SaveToFile( filePath );
351         finally
352                 sl.Free;
353         end;
354
355 end;
356
357 procedure TGikoBayesian.Save;
358 begin
359
360         if FFilePath <> '' then
361                 SaveToFile( FFilePath );
362
363 end;
364
365 //==============================
366 // GetObject
367 //==============================
368 function TGikoBayesian.GetObject( const name : string ) : TWordInfo;
369 var
370         idx : Integer;
371 begin
372
373         if Find( name, idx ) then
374                 Result := TWordInfo( inherited Objects[ idx ] )
375         else
376                 Result := nil;
377
378 end;
379
380 //==============================
381 // SetObject
382 //==============================
383 procedure TGikoBayesian.SetObject( const name : string; value : TWordInfo );
384 var
385         idx : Integer;
386 begin
387
388         if Find( name, idx ) then
389                 inherited Objects[ idx ] := value
390         else
391                 AddObject( name, value );
392
393 end;
394
395
396 //==============================
397 // CountWord
398 //==============================
399 procedure TGikoBayesian.CountWord(
400         const text      : string;
401         wordCount               : TWordCount );
402 type
403         Modes                           = (ModeWhite, ModeGraph, ModeAlpha, ModeNum, ModeHanKana,
404                                                                 ModeWGraph, ModeWAlpha, ModeWNum,
405                                                                 ModeWHira, ModeWKata, ModeWKanji);
406 var
407         p, tail, last                   : PChar;
408         mode, newMode                   : Modes;
409         ch                                                              : Longword;
410         chSize                                          : Integer;
411         wHiraDelimiter          : TStringList;
412         wHiraFinalDelimiter     : TStringList;
413         wKanjiDelimiter         : TStringList;
414         words                                                   : TStringList;
415         aWord                                                   : string;
416         countInfo                                       : TWordCountInfo;
417
418         function cutBoth( _aWord : string; _delim : TStringList ) : string;
419         var
420                 _i                      : Integer;
421         begin
422                 for _i := 0 to _delim.Count - 1 do begin
423                         _aWord := CustomStringReplace(
424                                 _aWord,
425                                 _delim[ _i ],
426                                 #10 + _delim[ _i ] + #10, False );
427                 end;
428                 Result := _aWord;
429         end;
430
431         function cutFirst( _aWord : string; _delim : TStringList ) : string;
432         var
433                 _i                      : Integer;
434         begin
435                 for _i := 0 to _delim.Count - 1 do begin
436                         _aWord := CustomStringReplace(
437                                 _aWord,
438                                 _delim[ _i ],
439                                 #10 + _delim[ _i ], False );
440                 end;
441                 Result := _aWord;
442         end;
443
444         function cutFinal( _aWord : string; _delim : TStringList ) : string;
445         var
446                 _i                      : Integer;
447         begin
448                 for _i := 0 to _delim.Count - 1 do begin
449                         _aWord := CustomStringReplace(
450                                 _aWord,
451                                 _delim[ _i ],
452                                 _delim[ _i ] + #10, False );
453                 end;
454                 Result := _aWord;
455         end;
456
457         procedure addWord( _dst : TWordCount; _words : TStringList );
458         var
459                 _aWord                  : string;
460                 _i, _idx                : Integer;
461                 _countInfo      : TWordCountInfo;
462         begin
463                 for _i := 0 to _words.Count - 1 do begin
464                         _aWord := _words[ _i ];
465                         if Length( _aWord ) > 0 then begin
466                                 if _dst.Find( _aWord, _idx ) then begin
467                                         _countInfo := TWordCountInfo( _dst.Objects[ _idx ] );
468                                 end else begin
469                                         _countInfo := TWordCountInfo.Create;
470                                         _dst.AddObject( _aWord, _countInfo );
471                                 end;
472                                 _countInfo.WordCount := _countInfo.WordCount + 1;
473                         end;
474                 end;
475         end;
476
477         function changeMode( _aWord : string; _mode : Modes ) : string;
478         var
479                 _i                                                                      : Integer;
480                 _aWord2                                                 : string;
481                 _pWord, _pWord2                 : PChar;
482                 _pWordTail, _pFound     : PChar;
483         const
484                 _delim : string = #10;
485         begin
486                 if Ord( _mode ) >= Ord( ModeWGraph ) then begin
487                         // \93ú\96{\8cê
488                         // \83X\83y\81[\83X\82ð\8bl\82ß\82é
489                         _aWord := CustomStringReplace( _aWord, ' ', '', False );
490                         _aWord := CustomStringReplace( _aWord, '\81@', '', False );
491
492                         // \83f\83\8a\83~\83^\82Å\92P\8cê\95ª\82¯
493                         case mode of
494                         ModeWHira:
495                                 begin
496                                         _aWord := cutFinal( _aWord, wHiraFinalDelimiter );
497                                         Result := cutBoth( _aWord, wHiraDelimiter );
498                                 end;
499
500                         ModeWKanji:
501                                 begin
502                                         // \83f\83\8a\83~\83^\82Å\92P\8cê\95ª\82¯
503                                         _aWord := cutBoth( _aWord, wKanjiDelimiter );
504                                         // 4 byte (2 \8e\9a\82¸\82Â\82Å\92P\8cê\95ª\82¯
505                                         _pWord := PChar( _aWord );
506                                         _i := Length( _aWord );
507                                         _pWordTail := _pWord + _i;
508                                         SetLength( _aWord2, _i + (_i shr 2) );
509                                         _pWord2 := PChar( _aWord2 );
510
511                                         while _pWord < _pWordTail do begin
512                                                 _pFound := AnsiStrPos( _pWord, PChar( _delim ) );
513                                                 if _pFound = nil then
514                                                         _pFound := _pWordTail;
515                                                 _pFound := _pFound - 3;
516
517                                                 while _pWord <= _pFound do begin
518                                                         CopyMemory( _pWord2, _pWord, 4 ); _pWord2[ 4 ] := #10;
519                                                         _pWord2 := _pWord2 + 5; _pWord := _pWord + 4;
520                                                 end;
521                                                 _i := _pFound + 4 - _pWord; // 4 = 3 + #10
522                                                 CopyMemory( _pWord2, _pWord, _i );
523                                                 _pWord2 := _pWord2 + _i; _pWord := _pWord + _i;
524                                         end;
525                                         if _pWord < _pWordTail then begin
526                                                 _i := _pWordTail - _pWord;
527                                                 CopyMemory( _pWord2, _pWord, _i );
528                                                 _pWord2 := _pWord2 + _i;
529                                         end;
530                                         SetLength( _aWord2, _pWord2 - PChar( _aWord2 ) );
531
532                                         Result := _aWord2;
533                                 end;
534
535                         else
536                                 Result := _aWord;
537                         end;
538                 end else begin
539                         Result := _aWord;
540                 end;
541         end;
542 const
543         WHIRA_DELIMITER = '\82ð' + #10 + '\82É' + #10 + '\82ª' + #10 + '\82Æ' + #10 + '\82©\82ç'
544                 + #10 + '\82Ö' + #10 + '\82æ\82è' + #10 + '\82Ü\82Å'+ #10 + '\82Å'
545                 + #10 + '\82±\82±' + #10 + '\82»\82±' + #10 + '\82Ç\82±'
546                 + #10 + '\82±\82ê' + #10 + '\82»\82ê' + #10 + '\82 \82ê' + #10 + '\82Ç\82ê'
547                 + #10 + '\82±\82Ì' + #10 + '\82»\82Ì' + #10 + '\82 \82Ì' + #10 + '\82Ç\82Ì'
548                 + #10 + '\82±\82¤' + #10 + '\82»\82¤' + #10 + '\82 \82 ' + #10 + '\82Ç\82¤'
549                 + #10 + '\82±\82ñ\82È' + #10 + '\82»\82ñ\82È' + #10 + '\82 \82ñ\82È' + #10 + '\82Ç\82ñ\82È'
550                 + #10 + '\82ê\82½' + #10 + '\82ê\82Ä' + #10 + '\82ê\82ê' + #10 + '\82ê\82ë'
551                 + #10 + '\82ê\82é' + #10 + '\82ç\82ê\82é'
552                 + #10 + '\82Å\82·' + #10 + '\82Ü\82·' + #10 + '\82Ü\82¹\82ñ'
553                 + #10 + '\82Å\82µ\82½' + #10 + '\82Ü\82µ\82½'
554                 + #10 + '\82·\82é' + #10 + '\82µ\82È\82¢' + #10 + '\82³\82ê\82é' + #10 + '\82³\82ê\82È\82¢'
555                 ;
556         WKANJI_DELIMITER = '\93I' + #10 + '\90«' + #10 + '\8e®' + #10 + '\89»' + #10 + '\96@'
557                 + #10 + '\95s' + #10 + '\96³' + #10 + '\94ñ' + #10 + '\94½'
558                 ;
559         WHIRA_FINAL_DELIMITER = '\82Á\82½' + #10 + '\82Á\82Ä'
560                 ;{
561                 + #10 + '\82æ\82Á\82Ä' + #10 + '\82µ\82½\82ª\82Á\82Ä' + #10 + '\82È\82Ì\82Å'
562                 + #10 + '\82¾\82©\82ç' + #10 + '\82Å\82·\82©\82ç'
563                 + #10 + '\82Ü\82½'
564                 + #10 + '\82µ\82©\82µ' + #10 + '\82¾\82ª' + #10 + '\82¯\82Ç' + #10 + '\82¯\82ê\82Ç'
565                 + #10 + '\82â\82Í\82è' + #10 + '\82â\82Á\82Ï\82è'
566                 + #10 + '\82Å\82µ' + #10 + '\82¾\82ë'
567                 + #10 + '\82·\82é' + #10 + '\82µ\82È\82¢' + #10 + '\82µ\82½' + #10 + '\82µ\82È\82¢'
568                 ;}
569         // '\81[' \82ð '\82\9f\82¡\82£\82¥\82§' \82É\81B
570         HA_LINE = '\82 \82©\82³\82½\82È\82Í\82Ü\82â\82ç\82í\82ª\82´\82¾\82Î\82Ï\82\9f\82ì';
571         HI_LINE = '\82¢\82«\82µ\82¿\82É\82Ð\82Ý\82è\82î\82¬\82\82Ñ\82Ò\82¡';
572         HU_LINE = '\82¤\82­\82·\82Â\82Ê\82Ó\82Þ\82ä\82é\82®\82Ô\82Õ\82£';
573         HE_LINE = '\82¦\82¯\82¹\82Ä\82Ë\82Ö\82ß\82ê\82ï\82°\82×\82Ø\82¥';
574         HO_LINE = '\82¨\82±\82»\82Æ\82Ì\82Ù\82à\82æ\82ë\82ð\82²\82Ú\82Û\82§';
575         KA_LINE = '\83A\83J\83T\83^\83i\83n\83}\83\84\83\89\83\8f\83K\83U\83_\83o\83p\83@\83\95\83\8e';
576         KI_LINE = '\83C\83L\83V\83`\83j\83q\83~\83\8a\83\90\83M\83W\83r\83s\83B';
577         KU_LINE = '\83E\83N\83X\83c\83k\83t\83\80\83\86\83\8b\83O\83u\83v\83D\83\94';
578         KE_LINE = '\83G\83P\83Z\83e\83l\83w\83\81\83\8c\83\91\83Q\83x\83y\83F\83\96';
579         KO_LINE = '\83I\83R\83\\83g\83m\83z\83\82\83\88\83\8d\83\92\83S\83{\83|\83H';
580         kKanji = [$80..$A0, $E0..$ff];
581 begin
582
583         wHiraDelimiter  := TStringList.Create;
584         wHiraFinalDelimiter := TStringList.Create;
585         wKanjiDelimiter := TStringList.Create;
586         words := TStringList.Create;
587         try
588                 mode := ModeWhite;
589                 wHiraDelimiter.Text := WHIRA_DELIMITER;
590                 wHiraFinalDelimiter.Text := WHIRA_FINAL_DELIMITER;
591                 wKanjiDelimiter.Text := WKANJI_DELIMITER;
592                 p                       := PChar( text );
593                 tail    := p + Length( text );
594                 last    := p;
595
596                 while p < tail do begin
597                         // \95\8e\9a\82Ì\83^\83C\83v\82ð\94»\95Ê
598                         // \81¦\8bå\93Ç\93_\82Í ModeGraph \82É\82È\82é\82Ì\82Å\8cÂ\95Ê\82É\91Î\89\9e\82µ\82È\82­\82Ä\82à\82¢\82¢
599 //                      if Byte(Byte( p^ ) - $a1) < $5e then begin
600                         if Byte( p^ ) in kKanji then begin
601                                 if p + 1 < tail then begin
602                                         ch := (PByte( p )^ shl 8) or PByte( p + 1 )^;
603                                         case ch of
604                                         // \83X\83y\81[\83X\82Å\92P\8cê\95ª\82¯\82¹\82¸\82É\8bl\82ß\82é
605                                         //$8140:                                                        newMode := ModeWhite;
606                                         $8141..$824e:                           newMode := ModeWGraph;
607                                         $824f..$8258:                           newMode := ModeWNum;
608                                         $8260..$829a:                           newMode := ModeWAlpha;
609                                         $829f..$82f1:                           newMode := ModeWHira;
610                                         $8340..$8396:                           newMode := ModeWKata;
611                                         else                                                            newMode := ModeWKanji;
612                                         end;
613                                         // '\81J\81K\81[' \82Í\95½\89¼\96¼\81A\82Ü\82½\82Í\83J\83^\83J\83i\82É\8aÜ\82Ü\82ê\82é
614                                         if (mode = ModeWHira) or (mode = ModeWKata) then
615                                                 if (ch = $814a) or (ch = $814b) or (ch = $815b) then
616                                                         newMode := mode;
617                                 end else begin
618                                         newMode := ModeWhite;
619                                 end;
620
621                                 chSize := 2;
622                         end else begin
623                                 newMode := Modes( CharMode1[ Byte( p^ ) ] );
624                                 if (p^ = ' ') and (Ord( mode ) >= Ord( ModeWGraph )) then begin
625                                         // \8d¡\82Ü\82Å\93ú\96{\8cê\82Å\8d¡\83X\83y\81[\83X
626                                         // \92P\8cê\82ð\8cq\82°\82Ä\8cã\82Å\83X\83y\81[\83X\82ð\8bl\82ß\82é
627                                         // \81¦\94¼\8ap\83J\83i\82Í\92Ê\8fí\83X\83y\81[\83X\82Å\8bæ\90Ø\82é\82¾\82ë\82¤\82©\82ç\8bl\82ß\82È\82¢
628                                         newMode := mode;
629                                 end;
630
631                                 chSize := 1;
632                         end;
633
634                         if mode <> newMode then begin
635
636                                 // \95\8e\9a\82Ì\83^\83C\83v\82ª\95Ï\8dX\82³\82ê\82½
637                                 if mode <> ModeWhite then begin
638                                         SetLength( aWord, p - last );
639                                         CopyMemory( PChar( aWord ), last, p - last );
640
641                                         words.Text := changeMode( aWord, mode );
642
643                                         // \92P\8cê\93o\98^
644                                         addWord( wordCount, words );
645                                 end;
646
647                                 last := p;
648                                 mode := newMode;
649
650                         end;
651
652                         p := p + chSize;
653                 end;    // while
654
655                 if mode <> ModeWhite then begin
656                         aWord := Copy( last, 0, p - last );
657                         words.Text := changeMode( aWord, mode );
658
659                         // \92P\8cê\93o\98^
660                         addWord( wordCount, words );
661                 end;
662         finally
663                 words.Free;
664                 wKanjiDelimiter.Free;
665                 wHiraFinalDelimiter.Free;
666                 wHiraDelimiter.Free;
667         end;
668
669 end;
670
671 //==============================
672 // CalcPaulGraham
673 //==============================
674 function TGikoBayesian.CalcPaulGraham( wordCount : TWordCount ) : Extended;
675
676         function p( const aWord : string ) : Single;
677         var
678                 info : TWordInfo;
679         begin
680                 info := Objects[ aWord ];
681                 if info = nil then
682                         Result := 0.415
683                 else if info.NormalWord = 0 then
684                         Result := 0.99
685                 else if info.ImportantWord = 0 then
686                         Result := 0.01
687                 else if info.ImportantWord + info.NormalWord * 2 < 5 then
688                         Result := 0.5
689                 else
690                         Result := ( info.ImportantWord / info.ImportantText ) /
691                                 ((info.NormalWord * 2 / info.NormalText ) +
692                                  (info.ImportantWord / info.ImportantText));
693         end;
694
695 var
696         s, q                            : Extended;
697         i                                               : Integer;
698         narray                  : TList;
699 const
700         SAMPLE_COUNT    = 15;
701 begin
702
703         Result := 1;
704         if wordCount.Count = 0 then
705                 Exit;
706
707         narray := TList.Create;
708         try
709                 for i := 0 to wordCount.Count - 1 do begin
710                         narray.Add( Pointer( p( wordCount[ i ] ) ) );
711                 end;
712
713                 narray.Sort( AbsSort );
714
715                 s := 1;
716                 q := 1;
717                 i := min( SAMPLE_COUNT, narray.Count );
718                 while i > 0 do begin
719                         Dec( i );
720
721                         s := s * Single( narray[ i ] );
722                         q := q * (1 - Single( narray[ i ] ));
723                 end;
724
725                 Result := s / (s + q);
726         finally
727                 narray.Free;
728         end;
729
730 end;
731
732 //==============================
733 // CalcGaryRobinson
734 //==============================
735 function TGikoBayesian.CalcGaryRobinson( wordCount : TWordCount ) : Extended;
736
737         function p( const aWord : string ) : Single;
738         var
739                 info : TWordInfo;
740         begin
741                 info := Objects[ aWord ];
742                 if info = nil then
743                         Result := 0.415
744                 else if info.ImportantWord = 0 then
745                         Result := 0.01
746                 else if info.NormalWord = 0 then
747                         Result := 0.99
748                 else
749                         Result := ( info.ImportantWord / info.ImportantText ) /
750                                 ((info.NormalWord / info.NormalText ) +
751                                  (info.ImportantWord / info.ImportantText));
752         end;
753
754         function f( cnt : Integer; n, mean : Single ) : Extended;
755         const
756                 k = 0.00001;
757         begin
758                 Result := ( (k * mean) + (cnt * n) ) / (k + cnt);
759         end;
760
761 var
762         n                                               : Extended;
763         narray                  : array of Single;
764         mean                            : Extended;
765         countInfo               : TWordCountInfo;
766         i                                               : Integer;
767         P1, Q1, R1      : Extended;
768         cnt                                     : Extended;
769 begin
770
771         if wordCount.Count = 0 then begin
772                 Result := 1;
773                 Exit;
774         end;
775
776         SetLength( narray, wordCount.Count );
777         mean := 0;
778         for i := 0 to wordCount.Count - 1 do begin
779                 n                                               := p( wordCount[ i ] );
780                 narray[ i ]     := n;
781                 mean                            := mean + n;
782         end;
783         mean := mean / wordCount.Count;
784
785         P1 := 1;
786         Q1 := 1;
787         for i := 0 to wordCount.Count - 1 do begin
788                 countInfo       := TWordCountInfo( wordCount.Objects[ i ] );
789                 n                                               := f( countInfo.WordCount, narray[ i ], mean );
790                 P1 := P1 * ( 1 - n );
791                 Q1 := Q1 * n;
792         end;
793         cnt := wordCount.Count;
794         if cnt = 0 then
795                 cnt := 1
796         else
797         P1 := 1 - Power( P1, 1 / cnt );
798         Q1 := 1 - Power( Q1, 1 / cnt );
799
800         if P1 + Q1 = 0 then begin
801                 Result := 0.5
802         end else begin
803                 n := (P1 - Q1) / (P1 + Q1);
804                 Result := (1 + n) / 2;
805         end;
806
807 end;
808
809 //==============================
810 // CalcGaryRobinsonFisher
811 //==============================
812 function TGikoBayesian.CalcGaryRobinsonFisher(
813         wordCount : TWordCount
814 ) : Extended;
815
816         function p( const aWord : string ) : Single;
817         var
818                 info                            : TWordInfo;
819         begin
820                 info := Objects[ aWord ];
821                 if info = nil then
822                         Result := 0.415
823                 else if info.ImportantWord = 0 then
824                         Result := 0.01
825                 else if info.NormalWord = 0 then
826                         Result := 0.99
827                 else
828                         Result := info.ImportantWord /
829                                 (info.ImportantWord + info.NormalWord *
830                                  info.ImportantText / info.NormalText);
831         end;
832
833         function f( cnt : Integer; n, mean : Single ) : Extended;
834         const
835                 k = 0.00001;
836         begin
837                 Result := ( (k * mean) + (cnt * n) ) / (k + cnt);
838         end;
839
840         function prbx( x2, degree : Extended ) : Extended;
841         var
842                 m : Extended;
843                 sum : Extended;
844                 term : Extended;
845                 i : extended;
846         begin
847
848                 m := x2 / 2;
849                 sum := exp( -m );
850                 term := -m;
851
852                 i := 1;
853                 while i < (degree / 2 - 1) do begin
854                         term := term + ln( m / i );
855                         sum := sum + exp( term );
856                         i := i + 1;
857                 end;
858
859                 if sum < 1 then
860                         Result := sum
861                 else
862                         Result := 1.0;
863
864         end;
865
866 var
867         n                                               : Extended;
868         narray                  : array of Single;
869         mean                            : Extended;
870         countInfo               : TWordCountInfo;
871         i                                               : Integer;
872         normal                  : Extended;
873         important               : Extended;
874         P1, Q1                  : Extended;
875         cnt                                     : Extended;
876 begin
877
878         if wordCount.Count = 0 then begin
879                 Result := 1;
880                 Exit;
881         end;
882
883         SetLength( narray, wordCount.Count );
884         mean := 0;
885         for i := 0 to wordCount.Count - 1 do begin
886                 n                                               := p( wordCount[ i ] );
887                 narray[ i ]     := n;
888                 mean                            := mean + n;
889         end;
890         mean := mean / wordCount.Count;
891
892         cnt := 0;
893 (*
894         P1 := 1;
895         Q1 := 1;
896 (*)
897         P1 := 0;
898         Q1 := 0;
899 //*
900         for i := 0 to wordCount.Count - 1 do begin
901                 countInfo       := TWordCountInfo( wordCount.Objects[ i ] );
902                 n                                               := f( countInfo.WordCount, narray[ i ], mean );
903                 if countInfo <> nil then
904                         cnt := cnt + countInfo.WordCount;
905 (*
906                 P1 := P1 + Ln( 1 - n ) * countInfo.WordCount;
907                 Q1 := Q1 + Ln( n ) * countInfo.WordCount;
908 (*)
909                 P1 := P1 + Ln( 1 - n );
910                 Q1 := Q1 + Ln( n );
911 //*)
912         end;
913         if cnt = 0 then
914                 cnt := 1;
915 //(*
916         P1 := prbx( -2 * P1, 2 * cnt );
917         Q1 := prbx( -2 * Q1, 2 * cnt );
918 (*)
919         P1 := prbx( -2 * Ln( P1 ), 2 * cnt );
920         Q1 := prbx( -2 * Ln( Q1 ), 2 * cnt );
921 //*)
922         if P1 + Q1 = 0 then begin
923                 Result := 0.5
924         end else begin
925                 Result := (1 + Q1 + P1) / 2;
926         end;
927
928 end;
929
930 //==============================
931 // Parse
932 //==============================
933 function TGikoBayesian.Parse(
934         const text                              : string;
935         wordCount                                       : TWordCount;
936         algorithm                                       : TGikoBayesianAlgorithm
937 ) : Extended;
938 begin
939
940         CountWord( text, wordCount );
941         case algorithm of
942         gbaPaulGraham:          Result := CalcPaulGraham( wordCount );
943         gbaGaryRobinson:        Result := CalcGaryRobinson( wordCount );
944         gbaGaryRobinsonFisher:
945                                                                                 Result := CalcGaryRobinsonFisher( wordCount );
946         else                                                    Result := 0;
947         end;
948
949 end;
950
951 //==============================
952 // Learn
953 //==============================
954 procedure TGikoBayesian.Learn(
955         wordCount                : TWordCount;
956         isImportant      : Boolean );
957 var
958         aWord                   : string;
959         wordinfo        : TWordInfo;
960         countinfo       : TWordCountInfo;
961         i                                       : Integer;
962 begin
963
964         for i := 0 to wordCount.Count - 1 do begin
965                 aWord := wordCount[ i ];
966                 wordinfo := Objects[ aWord ];
967                 countinfo := TWordCountInfo( wordCount.Objects[ i ] );
968                 if wordinfo = nil then begin
969                         wordinfo := TWordInfo.Create;
970                         Objects[ aWord ] := wordinfo;
971                 end;
972
973                 if isImportant then begin
974                         wordinfo.ImportantWord := wordinfo.ImportantWord + countinfo.WordCount;
975                         wordinfo.ImportantText := wordinfo.ImportantText + 1;
976                 end else begin
977                         wordinfo.NormalWord := wordinfo.NormalWord + countinfo.WordCount;
978                         wordinfo.NormalText := wordinfo.NormalText + 1;
979                 end;
980         end;
981
982 end;
983
984 //==============================
985 // Forget
986 //==============================
987 procedure       TGikoBayesian.Forget(
988         wordCount               : TWordCount;
989         isImportant     : Boolean );
990 var
991         aWord                   : string;
992         wordinfo        : TWordInfo;
993         countinfo       : TWordCountInfo;
994         i                       : Integer;
995 begin
996
997         for i := 0 to wordCount.Count - 1 do begin
998                 aWord := wordCount[ i ];
999                 wordinfo := Objects[ aWord ];
1000                 if wordinfo = nil then
1001                         Continue;
1002
1003                 countinfo := TWordCountInfo( wordCount.Objects[ i ] );
1004                 if isImportant then begin
1005                         if wordInfo.ImportantText > 0 then begin
1006                                 wordinfo.ImportantText := wordinfo.ImportantText - 1;
1007                                 wordinfo.ImportantWord := wordinfo.ImportantWord - countinfo.WordCount;
1008                         end;
1009                 end else begin
1010                         if wordinfo.NormalText > 0 then begin
1011                                 wordinfo.NormalText := wordinfo.NormalText - 1;
1012                                 wordinfo.NormalWord := wordinfo.NormalWord - countinfo.WordCount;
1013                         end;
1014                 end;
1015         end;
1016
1017 end;
1018
1019 end.