OSDN Git Service

Gary Robinson-Fisher をバグバグのまま放置しておくのは危険なので
[gikonavigoeson/gikonavi.git] / GikoBayesian.pas
1 unit GikoBayesian;
2
3 {!
4 \file           GikoBayesian.pas
5 \brief  \83x\83C\83W\83A\83\93\83t\83B\83\8b\83^
6
7 $Id: GikoBayesian.pas,v 1.15 2004/11/01 10:28:24 yoffy Exp $
8 }
9
10 //! \95½\89¼\96¼\82ð\8e«\8f\91\82É\8aÜ\82ß\82È\82¢
11 {$DEFINE GIKO_BAYESIAN_NO_HIRAGANA_DIC}
12
13 interface
14
15 //==================================================
16 uses
17 //==================================================
18         Classes;
19
20 //==================================================
21 type
22 //==================================================
23
24         {!***********************************************************
25         \brief \92P\8cê\83v\83\8d\83p\83e\83B
26         ************************************************************}
27         TWordInfo       = class( TObject )
28         private
29                 FNormalWord                     :       Integer;        //!< \92Ê\8fí\82Ì\92P\8cê\82Æ\82µ\82Ä\93o\8fê\82µ\82½\89ñ\90\94
30                 FImportantWord  : Integer;      //!< \92\8d\96Ú\92P\8cê\82Æ\82µ\82Ä\93o\8fê\82µ\82½\89ñ\90\94
31                 FNormalText                     : Integer;      //!< \92Ê\8fí\82Ì\92P\8cê\82Æ\82µ\82Ä\8aÜ\82Ü\82ê\82Ä\82¢\82½\95\8fÍ\82Ì\90\94
32                 FImportantText  : Integer;      //!< \92\8d\96Ú\92P\8cê\82Æ\82µ\82Ä\8aÜ\82Ü\82ê\82Ä\82¢\82½\95\8fÍ\82Ì\90\94
33
34         public
35                 property NormalWord                     : Integer       read FNormalWord write FNormalWord;
36                 property ImportantWord  : Integer       read FImportantWord write FImportantWord;
37                 property NormalText                     : Integer       read FNormalText write FNormalText;
38                 property ImportantText  : Integer       read FImportantText write FImportantText;
39         end;
40
41         {!***********************************************************
42         \brief \89ð\90Í\8dÏ\82Ý\92P\8cê\83v\83\8d\83p\83e\83B
43         ************************************************************}
44         TWordCountInfo  = class( TObject )
45         private
46                 FWordCount      :       Integer;        //!< \92P\8cê\90\94
47
48         public
49                 property WordCount      : Integer       read FWordCount write FWordCount;
50         end;
51
52         {!***********************************************************
53         \brief \89ð\90Í\8dÏ\82Ý\92P\8cê\83\8a\83X\83g
54         ************************************************************}
55 //      TWordCount      = class( THashedStringList )    // \8c\83\92x
56         TWordCount      = class( TStringList )
57         public
58                 constructor Create;
59                 destructor Destroy; override;
60         end;
61
62         {!***********************************************************
63         \brief \83t\83B\83\8b\83^\83A\83\8b\83S\83\8a\83Y\83\80
64         ************************************************************}
65         TGikoBayesianAlgorithm =
66                 (gbaPaulGraham, gbaGaryRobinson, gbaGaryRobinsonFisher);
67
68         {!***********************************************************
69         \brief \83x\83C\83W\83A\83\93\83t\83B\83\8b\83^
70         ************************************************************}
71 //      TGikoBayesian = class( THashedStringList )      // \8c\83\92x
72         TGikoBayesian = class( TStringList )
73         private
74                 FFilePath       : string;       //!< \93Ç\82Ý\8d\9e\82ñ\82¾\83t\83@\83C\83\8b\83p\83X
75                 function GetObject( const name : string ) : TWordInfo;
76                 procedure SetObject( const name : string; value : TWordInfo );
77
78         public
79                 constructor Create;
80                 destructor Destroy; override;
81
82                 //! \83t\83@\83C\83\8b\82©\82ç\8aw\8fK\97\9a\97ð\82ð\93Ç\82Ý\8fo\82µ\82Ü\82·
83                 procedure LoadFromFile( const filePath : string );
84
85                 //! \83t\83@\83C\83\8b\82É\8aw\8fK\97\9a\97ð\82ð\95Û\91\82µ\82Ü\82·
86                 procedure SaveToFile( const filePath : string );
87
88                 //! \83t\83@\83C\83\8b\82É\8aw\8fK\97\9a\97ð\82ð\95Û\91\82µ\82Ü\82·
89                 procedure Save;
90
91                 //! \92P\8cê\82É\91Î\82·\82é\8fî\95ñ\82ð\8eæ\93¾\82µ\82Ü\82·
92                 property Objects[ const name : string ] : TWordInfo
93                         read GetObject write SetObject; default;
94
95                 //! \95\8fÍ\82É\8aÜ\82Ü\82ê\82é\92P\8cê\82ð\83J\83E\83\93\83g\82µ\82Ü\82·
96                 procedure CountWord(
97                         const text      : string;
98                         wordCount               : TWordCount );
99
100                 {!
101                 \brief  Paul Graham \96@\82É\8aî\82Ã\82¢\82Ä\95\8fÍ\82Ì\92\8d\96Ú\93x\82ð\8c\88\92è\82µ\82Ü\82·
102                 \return \95\8fÍ\82Ì\92\8d\96Ú\93x (\92\8d\96Ú\82É\92l\82µ\82È\82¢ 0.0\81`1.0 \92\8d\96Ú\82·\82×\82«)
103                 }
104                 function CalcPaulGraham( wordCount : TWordCount ) : Extended;
105
106                 {!
107                 \brief  GaryRobinson \96@\82É\8aî\82Ã\82¢\82Ä\95\8fÍ\82Ì\92\8d\96Ú\93x\82ð\8c\88\92è\82µ\82Ü\82·
108                 \return \95\8fÍ\82Ì\92\8d\96Ú\93x (\92\8d\96Ú\82É\92l\82µ\82È\82¢ 0.0\81`1.0 \92\8d\96Ú\82·\82×\82«)
109                 }
110                 function CalcGaryRobinson( wordCount : TWordCount ) : Extended;
111
112                 {!
113                 \brief  GaryRobinson-Fisher \96@\82É\8aî\82Ã\82¢\82Ä\95\8fÍ\82Ì\92\8d\96Ú\93x\82ð\8c\88\92è\82µ\82Ü\82·
114                 \return \95\8fÍ\82Ì\92\8d\96Ú\93x (\92\8d\96Ú\82É\92l\82µ\82È\82¢ 0.0\81`1.0 \92\8d\96Ú\82·\82×\82«)
115                 }
116                 function CalcGaryRobinsonFisher( wordCount : TWordCount ) : Extended;
117
118                 {!
119                 \brief  \95\8fÍ\82ð\89ð\90Í
120                 \param  text                                    \89ð\90Í\82·\82é\95\8fÍ
121                 \param  wordCount                       \89ð\90Í\82³\82ê\82½\92P\8cê\83\8a\83X\83g\82ª\95Ô\82é
122                 \param  algorithm                       \92\8d\96Ú\93x\82Ì\8c\88\92è\82É\97p\82¢\82é\83A\83\8b\83S\83\8a\83Y\83\80\82ð\8ew\92è\82µ\82Ü\82·
123                 \return \95\8fÍ\82Ì\92\8d\96Ú\93x (\92\8d\96Ú\82É\92l\82µ\82È\82¢ 0.0\81`1.0 \92\8d\96Ú\82·\82×\82«)
124
125                 CountWord \82Æ Calcxxxxx \82ð\82Ü\82Æ\82ß\82Ä\8eÀ\8ds\82·\82é\82¾\82¯\82Å\82·\81B
126                 }
127                 function Parse(
128                         const text                              : string;
129                         wordCount                                       : TWordCount;
130                         algorithm                                       : TGikoBayesianAlgorithm = gbaGaryRobinsonFisher
131                 ) : Extended;
132
133                 {!
134                 \brief  \8aw\8fK\82·\82é
135                 \param  wordCount               Parse \82Å\89ð\90Í\82³\82ê\82½\92P\8cê\83\8a\83X\83g
136                 \param  isImportant \92\8d\96Ú\82·\82×\82«\95\8fÍ\82Æ\82µ\82Ä\8ao\82¦\82é\82È\82ç True
137                 }
138                 procedure Learn(
139                         wordCount                : TWordCount;
140                         isImportant      : Boolean );
141
142                 {!
143                 \brief          \8aw\8fK\8c\8b\89Ê\82ð\96Y\82ê\82é
144                 \param          wordCount               Parse \82Å\89ð\90Í\82³\82ê\82½\92P\8cê\83\8a\83X\83g
145                 \param          isImportant     \92\8d\96Ú\82·\82×\82«\95\8fÍ\82Æ\82µ\82Ä\8ao\82¦\82ç\82ê\82Ä\82¢\82½\82È\82ç True
146                 \warning        \8aw\8fK\8dÏ\82Ý\82Ì\95\8fÍ\82©\82Ç\82¤\82©\82Í\8am\94F\8fo\97\88\82Ü\82¹\82ñ\81B<br>
147                                                         Learn \82µ\82Ä\82¢\82È\82¢\95\8fÍ\82â isImportant \82ª\8aÔ\88á\82Á\82Ä\82¢\82é\95\8fÍ\82ð
148                                                         Forget \82·\82é\82Æ\83f\81[\83^\83x\81[\83X\82ª\94j\91¹\82µ\82Ü\82·\81B<br>
149                                                         \8aw\8fK\8dÏ\82Ý\82©\82Ç\82¤\82©\82Í\93Æ\8e©\82É\8aÇ\97\9d\82µ\82Ä\82­\82¾\82³\82¢\81B
150
151                 \91S\82Ä\82Ì\8aw\8fK\8c\8b\89Ê\82ð\83N\83\8a\83A\82·\82é\82í\82¯\82Å\82Í\82 \82è\82Ü\82¹\82ñ\81B<br>
152                 wordCount \82ð\93¾\82½\95\8fÍ (Parse \82Ì text \88ø\90\94\82Ì\8aw\8fK\8c\8b\89Ê\82Ì\82Ý\83N\83\8a\83A\82µ\82Ü\82·\81B<br><br>
153
154                 \8eå\82É\92\8d\96Ú\95\8fÍ\82Æ\94ñ\92\8d\96Ú\95\8fÍ\82ð\90Ø\82è\91Ö\82¦\82é\82½\82ß\82É Forget -> Learn \82Ì\8f\87\82Å\8eg\97p\82µ\82Ü\82·\81B
155                 }
156                 procedure       Forget(
157                         wordCount               : TWordCount;
158                         isImportant     : Boolean );
159         end;
160
161 //==================================================
162 implementation
163 //==================================================
164
165 uses
166         SysUtils, Math, Windows,
167         MojuUtils;
168
169 const
170         GIKO_BAYESIAN_FILE_VERSION      = '1.0';
171 {
172         Modes                           = (ModeWhite, ModeGraph, ModeAlpha, ModeHanKana, ModeNum,
173                                                                 ModeWGraph, ModeWAlpha, ModeWNum,
174                                                                 ModeWHira, ModeWKata, ModeWKanji);
175 }
176         CharMode1 : array [ 0..255 ] of Byte =
177         (
178                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
179                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
180                 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
181                 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
182                 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
183                 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1,
184                 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
185                 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 0,
186
187                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
188                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
189                 0, 1, 1, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
190                 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
191                 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
192                 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
193                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
194                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
195         );
196
197 //************************************************************
198 // misc
199 //************************************************************
200
201 //==============================
202 // RemoveToken
203 //==============================
204 function RemoveToken(var s: string;const delimiter: string): string;
205 var
206         p: Integer;
207 begin
208         p := AnsiPos(delimiter, s);
209         if p = 0 then
210                 Result := s
211         else
212                 Result := Copy(s, 1, p - 1);
213         s := Copy(s, Length(Result) + Length(delimiter) + 1, Length(s));
214 end;
215
216 //==============================
217 // AbsSort
218 //==============================
219 function AbsSort( p1, p2 : Pointer ) : Integer;
220 var
221         v1, v2 : Single;
222 begin
223
224         v1 := Abs( Single( p1 ) - 0.5 );
225         v2 := Abs( Single( p2 ) - 0.5 );
226         if v1 > v2 then
227                 Result := -1
228         else if v1 = v2 then
229                 Result := 0
230         else
231                 Result := 1;
232
233 end;
234
235 //************************************************************
236 // TWordCount class
237 //************************************************************
238 constructor TWordCount.Create;
239 begin
240
241                 Duplicates              := dupIgnore;
242                 CaseSensitive   := True;
243                 Sorted                          := True;
244
245 end;
246
247 destructor TWordCount.Destroy;
248 var
249         i : Integer;
250 begin
251
252         for i := Count - 1 downto 0 do
253                 if Objects[ i ] <> nil then
254                         Objects[ i ].Free;
255
256         inherited;
257
258 end;
259
260 //************************************************************
261 // TGikoBayesian class
262 //************************************************************
263
264 //==============================
265 // Create
266 //==============================
267 constructor TGikoBayesian.Create;
268 begin
269
270         Duplicates              := dupIgnore;
271         CaseSensitive   := True;
272         Sorted                          := True;
273
274 end;
275
276 //==============================
277 // Destroy
278 //==============================
279 destructor TGikoBayesian.Destroy;
280 var
281         i : Integer;
282 begin
283
284         for i := Count - 1 downto 0 do
285                 if inherited Objects[ i ] <> nil then
286                         inherited Objects[ i ].Free;
287
288         inherited;
289
290 end;
291
292 procedure TGikoBayesian.LoadFromFile( const filePath : string );
293 var
294         i                       : Integer;
295         sl              : TStringList;
296         s                       : string;
297         name    : string;
298         info    : TWordInfo;
299 begin
300
301         FFilePath := filePath;
302
303         if not FileExists( filePath ) then
304                 Exit;
305
306         sl := TStringList.Create;
307         try
308                 sl.LoadFromFile( filePath );
309
310                 for i := 1 to sl.Count - 1 do begin
311                         s := sl[ i ];
312                         name := RemoveToken( s, #1 );
313                         info := TWordInfo.Create;
314                         info.NormalWord                 := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
315                         info.ImportantWord      := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
316                         info.NormalText                 := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
317                         info.ImportantText      := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
318
319                         AddObject( name, info );
320                 end;
321         finally
322                 sl.Free;
323         end;
324
325 end;
326
327 procedure TGikoBayesian.SaveToFile( const filePath : string );
328 var
329         i                       : Integer;
330         sl              : TStringList;
331         s                       : string;
332         info    : TWordInfo;
333 begin
334
335         FFilePath := filePath;
336
337         sl := TStringList.Create;
338         try
339                 sl.BeginUpdate;
340                 sl.Add( GIKO_BAYESIAN_FILE_VERSION );
341
342                 for i := 0 to Count - 1 do begin
343                         info := TWordInfo( inherited Objects[ i ] );
344                         s := Strings[ i ] + #1
345                                  + Format('%x', [info.NormalWord]) + #1
346                                  + Format('%x', [info.ImportantWord]) + #1
347                                  + Format('%x', [info.NormalText]) + #1
348                                  + Format('%x', [info.ImportantText]);
349
350                         sl.Add(s);
351                 end;
352                 sl.EndUpdate;
353                 sl.SaveToFile( filePath );
354         finally
355                 sl.Free;
356         end;
357
358 end;
359
360 procedure TGikoBayesian.Save;
361 begin
362
363         if FFilePath <> '' then
364                 SaveToFile( FFilePath );
365
366 end;
367
368 //==============================
369 // GetObject
370 //==============================
371 function TGikoBayesian.GetObject( const name : string ) : TWordInfo;
372 var
373         idx : Integer;
374 begin
375
376         if Find( name, idx ) then
377                 Result := TWordInfo( inherited Objects[ idx ] )
378         else
379                 Result := nil;
380
381 end;
382
383 //==============================
384 // SetObject
385 //==============================
386 procedure TGikoBayesian.SetObject( const name : string; value : TWordInfo );
387 var
388         idx : Integer;
389 begin
390
391         if Find( name, idx ) then
392                 inherited Objects[ idx ] := value
393         else
394                 AddObject( name, value );
395
396 end;
397
398
399 //==============================
400 // CountWord
401 //==============================
402 procedure TGikoBayesian.CountWord(
403         const text      : string;
404         wordCount               : TWordCount );
405 type
406         Modes                           = (ModeWhite, ModeGraph, ModeAlpha, ModeNum, ModeHanKana,
407                                                                 ModeWGraph, ModeWAlpha, ModeWNum,
408                                                                 ModeWHira, ModeWKata, ModeWKanji);
409 var
410         p, tail, last                   : PChar;
411         mode, newMode                   : Modes;
412         ch                                                              : Longword;
413         chSize                                          : Integer;
414         wHiraDelimiter          : TStringList;
415         wHiraFinalDelimiter     : TStringList;
416         wKanjiDelimiter         : TStringList;
417         words                                                   : TStringList;
418         aWord                                                   : string;
419         countInfo                                       : TWordCountInfo;
420
421         function cutBoth( _aWord : string; _delim : TStringList ) : string;
422         var
423                 _i                      : Integer;
424         begin
425                 for _i := 0 to _delim.Count - 1 do begin
426                         _aWord := CustomStringReplace(
427                                 _aWord,
428                                 _delim[ _i ],
429                                 #10 + _delim[ _i ] + #10, False );
430                 end;
431                 Result := _aWord;
432         end;
433
434         function cutFirst( _aWord : string; _delim : TStringList ) : string;
435         var
436                 _i                      : Integer;
437         begin
438                 for _i := 0 to _delim.Count - 1 do begin
439                         _aWord := CustomStringReplace(
440                                 _aWord,
441                                 _delim[ _i ],
442                                 #10 + _delim[ _i ], False );
443                 end;
444                 Result := _aWord;
445         end;
446
447         function cutFinal( _aWord : string; _delim : TStringList ) : string;
448         var
449                 _i                      : Integer;
450         begin
451                 for _i := 0 to _delim.Count - 1 do begin
452                         _aWord := CustomStringReplace(
453                                 _aWord,
454                                 _delim[ _i ],
455                                 _delim[ _i ] + #10, False );
456                 end;
457                 Result := _aWord;
458         end;
459
460         procedure addWord( _dst : TWordCount; _words : TStringList );
461         var
462                 _aWord                  : string;
463                 _i, _idx                : Integer;
464                 _countInfo      : TWordCountInfo;
465         begin
466                 for _i := 0 to _words.Count - 1 do begin
467                         _aWord := _words[ _i ];
468                         if Length( _aWord ) > 0 then begin
469                                 if _dst.Find( _aWord, _idx ) then begin
470                                         _countInfo := TWordCountInfo( _dst.Objects[ _idx ] );
471                                 end else begin
472                                         _countInfo := TWordCountInfo.Create;
473                                         _dst.AddObject( _aWord, _countInfo );
474                                 end;
475                                 _countInfo.WordCount := _countInfo.WordCount + 1;
476                         end;
477                 end;
478         end;
479
480         function changeMode( _aWord : string; _mode : Modes ) : string;
481         var
482                 _i                                                                      : Integer;
483                 _aWord2                                                 : string;
484                 _pWord, _pWord2                 : PChar;
485                 _pWordTail, _pFound     : PChar;
486         const
487                 _delim : string = #10;
488         begin
489 {$IFDEF GIKO_BAYESIAN_NO_HIRAGANA_DIC}
490                 if mode = ModeWHira then begin
491                         Result := '';
492                         Exit;
493                 end;
494 {$ENDIF}
495                 if Ord( _mode ) >= Ord( ModeWGraph ) then begin
496                         // \93ú\96{\8cê
497                         // \83X\83y\81[\83X\82ð\8bl\82ß\82é
498                         _aWord := CustomStringReplace( _aWord, ' ', '', False );
499                         _aWord := CustomStringReplace( _aWord, '\81@', '', False );
500
501                         // \83f\83\8a\83~\83^\82Å\92P\8cê\95ª\82¯
502                         case mode of
503                         ModeWHira:
504                                 begin
505                                         _aWord := cutFinal( _aWord, wHiraFinalDelimiter );
506                                         Result := cutBoth( _aWord, wHiraDelimiter );
507                                 end;
508
509                         ModeWKanji:
510                                 begin
511                                         // \83f\83\8a\83~\83^\82Å\92P\8cê\95ª\82¯
512                                         _aWord := cutBoth( _aWord, wKanjiDelimiter );
513                                         // 4 byte (2 \8e\9a\82¸\82Â\82Å\92P\8cê\95ª\82¯
514                                         _pWord := PChar( _aWord );
515                                         _i := Length( _aWord );
516                                         _pWordTail := _pWord + _i;
517                                         SetLength( _aWord2, _i + (_i shr 2) );
518                                         _pWord2 := PChar( _aWord2 );
519
520                                         while _pWord < _pWordTail do begin
521                                                 _pFound := AnsiStrPos( _pWord, PChar( _delim ) );
522                                                 if _pFound = nil then
523                                                         _pFound := _pWordTail;
524                                                 _pFound := _pFound - 3;
525
526                                                 while _pWord <= _pFound do begin
527                                                         CopyMemory( _pWord2, _pWord, 4 ); _pWord2[ 4 ] := #10;
528                                                         _pWord2 := _pWord2 + 5; _pWord := _pWord + 4;
529                                                 end;
530                                                 _i := _pFound + 4 - _pWord; // 4 = 3 + #10
531                                                 CopyMemory( _pWord2, _pWord, _i );
532                                                 _pWord2 := _pWord2 + _i; _pWord := _pWord + _i;
533                                         end;
534                                         if _pWord < _pWordTail then begin
535                                                 _i := _pWordTail - _pWord;
536                                                 CopyMemory( _pWord2, _pWord, _i );
537                                                 _pWord2 := _pWord2 + _i;
538                                         end;
539                                         SetLength( _aWord2, _pWord2 - PChar( _aWord2 ) );
540
541                                         Result := _aWord2;
542                                 end;
543
544                         else
545                                 Result := _aWord;
546                         end;
547                 end else begin
548                         Result := _aWord;
549                 end;
550         end;
551 const
552         WHIRA_DELIMITER = '\82ð' + #10 + '\82É' + #10 + '\82ª' + #10 + '\82Æ' + #10 + '\82©\82ç'
553                 + #10 + '\82Ö' + #10 + '\82æ\82è' + #10 + '\82Ü\82Å'+ #10 + '\82Å'
554                 + #10 + '\82±\82±' + #10 + '\82»\82±' + #10 + '\82Ç\82±'
555                 + #10 + '\82±\82ê' + #10 + '\82»\82ê' + #10 + '\82 \82ê' + #10 + '\82Ç\82ê'
556                 + #10 + '\82±\82Ì' + #10 + '\82»\82Ì' + #10 + '\82 \82Ì' + #10 + '\82Ç\82Ì'
557                 + #10 + '\82±\82¤' + #10 + '\82»\82¤' + #10 + '\82 \82 ' + #10 + '\82Ç\82¤'
558                 + #10 + '\82±\82ñ\82È' + #10 + '\82»\82ñ\82È' + #10 + '\82 \82ñ\82È' + #10 + '\82Ç\82ñ\82È'
559                 + #10 + '\82ê\82½' + #10 + '\82ê\82Ä' + #10 + '\82ê\82ê' + #10 + '\82ê\82ë'
560                 + #10 + '\82ê\82é' + #10 + '\82ç\82ê\82é'
561                 + #10 + '\82Å\82·' + #10 + '\82Ü\82·' + #10 + '\82Ü\82¹\82ñ'
562                 + #10 + '\82Å\82µ\82½' + #10 + '\82Ü\82µ\82½'
563                 + #10 + '\82·\82é' + #10 + '\82µ\82È\82¢' + #10 + '\82³\82ê\82é' + #10 + '\82³\82ê\82È\82¢'
564                 ;
565         WKANJI_DELIMITER = '\93I' + #10 + '\90«' + #10 + '\8e®' + #10 + '\89»' + #10 + '\96@'
566                 + #10 + '\95s' + #10 + '\96³' + #10 + '\94ñ' + #10 + '\94½'
567                 ;
568         WHIRA_FINAL_DELIMITER = '\82Á\82½' + #10 + '\82Á\82Ä'
569                 ;{
570                 + #10 + '\82æ\82Á\82Ä' + #10 + '\82µ\82½\82ª\82Á\82Ä' + #10 + '\82È\82Ì\82Å'
571                 + #10 + '\82¾\82©\82ç' + #10 + '\82Å\82·\82©\82ç'
572                 + #10 + '\82Ü\82½'
573                 + #10 + '\82µ\82©\82µ' + #10 + '\82¾\82ª' + #10 + '\82¯\82Ç' + #10 + '\82¯\82ê\82Ç'
574                 + #10 + '\82â\82Í\82è' + #10 + '\82â\82Á\82Ï\82è'
575                 + #10 + '\82Å\82µ' + #10 + '\82¾\82ë'
576                 + #10 + '\82·\82é' + #10 + '\82µ\82È\82¢' + #10 + '\82µ\82½' + #10 + '\82µ\82È\82¢'
577                 ;}
578         // '\81[' \82ð '\82\9f\82¡\82£\82¥\82§' \82É\81B
579         HA_LINE = '\82 \82©\82³\82½\82È\82Í\82Ü\82â\82ç\82í\82ª\82´\82¾\82Î\82Ï\82\9f\82ì';
580         HI_LINE = '\82¢\82«\82µ\82¿\82É\82Ð\82Ý\82è\82î\82¬\82\82Ñ\82Ò\82¡';
581         HU_LINE = '\82¤\82­\82·\82Â\82Ê\82Ó\82Þ\82ä\82é\82®\82Ô\82Õ\82£';
582         HE_LINE = '\82¦\82¯\82¹\82Ä\82Ë\82Ö\82ß\82ê\82ï\82°\82×\82Ø\82¥';
583         HO_LINE = '\82¨\82±\82»\82Æ\82Ì\82Ù\82à\82æ\82ë\82ð\82²\82Ú\82Û\82§';
584         KA_LINE = '\83A\83J\83T\83^\83i\83n\83}\83\84\83\89\83\8f\83K\83U\83_\83o\83p\83@\83\95\83\8e';
585         KI_LINE = '\83C\83L\83V\83`\83j\83q\83~\83\8a\83\90\83M\83W\83r\83s\83B';
586         KU_LINE = '\83E\83N\83X\83c\83k\83t\83\80\83\86\83\8b\83O\83u\83v\83D\83\94';
587         KE_LINE = '\83G\83P\83Z\83e\83l\83w\83\81\83\8c\83\91\83Q\83x\83y\83F\83\96';
588         KO_LINE = '\83I\83R\83\\83g\83m\83z\83\82\83\88\83\8d\83\92\83S\83{\83|\83H';
589         kKanji = [$80..$A0, $E0..$ff];
590 begin
591
592         wHiraDelimiter  := TStringList.Create;
593         wHiraFinalDelimiter := TStringList.Create;
594         wKanjiDelimiter := TStringList.Create;
595         words := TStringList.Create;
596         try
597                 mode := ModeWhite;
598 {$IFNDEF GIKO_BAYESIAN_NO_HIRAGANA_DIC}
599                 wHiraDelimiter.Text := WHIRA_DELIMITER;
600                 wHiraFinalDelimiter.Text := WHIRA_FINAL_DELIMITER;
601 {$ENDIF}
602                 wKanjiDelimiter.Text := WKANJI_DELIMITER;
603                 p                       := PChar( text );
604                 tail    := p + Length( text );
605                 last    := p;
606
607                 while p < tail do begin
608                         // \95\8e\9a\82Ì\83^\83C\83v\82ð\94»\95Ê
609                         // \81¦\8bå\93Ç\93_\82Í ModeGraph \82É\82È\82é\82Ì\82Å\8cÂ\95Ê\82É\91Î\89\9e\82µ\82È\82­\82Ä\82à\82¢\82¢
610 //                      if Byte(Byte( p^ ) - $a1) < $5e then begin
611                         if Byte( p^ ) in kKanji then begin
612                                 if p + 1 < tail then begin
613                                         ch := (PByte( p )^ shl 8) or PByte( p + 1 )^;
614                                         case ch of
615                                         // \83X\83y\81[\83X\82Å\92P\8cê\95ª\82¯\82¹\82¸\82É\8bl\82ß\82é
616                                         //$8140:                                                        newMode := ModeWhite;
617                                         $8141..$824e:                           newMode := ModeWGraph;
618                                         $824f..$8258:                           newMode := ModeWNum;
619                                         $8260..$829a:                           newMode := ModeWAlpha;
620                                         $829f..$82f1:                           newMode := ModeWHira;
621                                         $8340..$8396:                           newMode := ModeWKata;
622                                         else                                                            newMode := ModeWKanji;
623                                         end;
624                                         // '\81J\81K\81[' \82Í\95½\89¼\96¼\81A\82Ü\82½\82Í\83J\83^\83J\83i\82É\8aÜ\82Ü\82ê\82é
625                                         if (mode = ModeWHira) or (mode = ModeWKata) then
626                                                 if (ch = $814a) or (ch = $814b) or (ch = $815b) then
627                                                         newMode := mode;
628                                 end else begin
629                                         newMode := ModeWhite;
630                                 end;
631
632                                 chSize := 2;
633                         end else begin
634                                 newMode := Modes( CharMode1[ Byte( p^ ) ] );
635                                 if (p^ = ' ') and (Ord( mode ) >= Ord( ModeWGraph )) then begin
636                                         // \8d¡\82Ü\82Å\93ú\96{\8cê\82Å\8d¡\83X\83y\81[\83X
637                                         // \92P\8cê\82ð\8cq\82°\82Ä\8cã\82Å\83X\83y\81[\83X\82ð\8bl\82ß\82é
638                                         // \81¦\94¼\8ap\83J\83i\82Í\92Ê\8fí\83X\83y\81[\83X\82Å\8bæ\90Ø\82é\82¾\82ë\82¤\82©\82ç\8bl\82ß\82È\82¢
639                                         newMode := mode;
640                                 end;
641
642                                 chSize := 1;
643                         end;
644
645                         if mode <> newMode then begin
646
647                                 // \95\8e\9a\82Ì\83^\83C\83v\82ª\95Ï\8dX\82³\82ê\82½
648                                 if mode <> ModeWhite then begin
649                                         SetLength( aWord, p - last );
650                                         CopyMemory( PChar( aWord ), last, p - last );
651
652                                         words.Text := changeMode( aWord, mode );
653
654                                         // \92P\8cê\93o\98^
655                                         addWord( wordCount, words );
656                                 end;
657
658                                 last := p;
659                                 mode := newMode;
660
661                         end;
662
663                         p := p + chSize;
664                 end;    // while
665
666                 if mode <> ModeWhite then begin
667                         SetLength( aWord, p - last );
668                         CopyMemory( PChar( aWord ), last, p - last );
669
670                         words.Text := changeMode( aWord, mode );
671
672                         // \92P\8cê\93o\98^
673                         addWord( wordCount, words );
674                 end;
675         finally
676                 words.Free;
677                 wKanjiDelimiter.Free;
678                 wHiraFinalDelimiter.Free;
679                 wHiraDelimiter.Free;
680         end;
681
682 end;
683
684 //==============================
685 // CalcPaulGraham
686 //==============================
687 function TGikoBayesian.CalcPaulGraham( wordCount : TWordCount ) : Extended;
688
689         function p( const aWord : string ) : Single;
690         var
691                 info : TWordInfo;
692         begin
693                 info := Objects[ aWord ];
694                 if info = nil then
695                         Result := 0.415
696                 else if info.NormalWord = 0 then
697                         Result := 0.99
698                 else if info.ImportantWord = 0 then
699                         Result := 0.01
700                 else if info.ImportantWord + info.NormalWord * 2 < 5 then
701                         Result := 0.5
702                 else
703                         Result := ( info.ImportantWord / info.ImportantText ) /
704                                 ((info.NormalWord * 2 / info.NormalText ) +
705                                  (info.ImportantWord / info.ImportantText));
706         end;
707
708 var
709         s, q                            : Extended;
710         i                                               : Integer;
711         narray                  : TList;
712 const
713         SAMPLE_COUNT    = 15;
714 begin
715
716         Result := 1;
717         if wordCount.Count = 0 then
718                 Exit;
719
720         narray := TList.Create;
721         try
722                 for i := 0 to wordCount.Count - 1 do begin
723                         narray.Add( Pointer( p( wordCount[ i ] ) ) );
724                 end;
725
726                 narray.Sort( AbsSort );
727
728                 s := 1;
729                 q := 1;
730                 i := min( SAMPLE_COUNT, narray.Count );
731                 while i > 0 do begin
732                         Dec( i );
733
734                         s := s * Single( narray[ i ] );
735                         q := q * (1 - Single( narray[ i ] ));
736                 end;
737
738                 Result := s / (s + q);
739         finally
740                 narray.Free;
741         end;
742
743 end;
744
745 //==============================
746 // CalcGaryRobinson
747 //==============================
748 function TGikoBayesian.CalcGaryRobinson( wordCount : TWordCount ) : Extended;
749
750         function p( const aWord : string ) : Single;
751         var
752                 info : TWordInfo;
753         begin
754                 info := Objects[ aWord ];
755                 if info = nil then
756                         Result := 0.415
757                 else if info.ImportantWord = 0 then
758                         Result := 0.01
759                 else if info.NormalWord = 0 then
760                         Result := 0.99
761                 else
762                 {
763                         Result := ( info.ImportantWord / info.ImportantText ) /
764                                 ((info.NormalWord / info.NormalText ) +
765                                  (info.ImportantWord / info.ImportantText));
766                 }
767                         Result := (info.ImportantWord * info.NormalText) /
768                                 (info.NormalWord * info.ImportantText +
769                                 info.ImportantWord * info.NormalText);
770         end;
771
772         function f( cnt : Integer; n, mean : Single ) : Extended;
773         const
774                 k = 0.001;
775         begin
776                 Result := ( (k * mean) + (cnt * n) ) / (k + cnt);
777         end;
778
779 var
780         n                                               : Extended;
781         narray                  : array of Single;
782         mean                            : Extended;
783         countInfo               : TWordCountInfo;
784         i                                               : Integer;
785         P1, Q1, R1      : Extended;
786         cnt                                     : Extended;
787 begin
788
789         if wordCount.Count = 0 then begin
790                 Result := 1;
791                 Exit;
792         end;
793
794         SetLength( narray, wordCount.Count );
795         mean := 0;
796         for i := 0 to wordCount.Count - 1 do begin
797                 n                                               := p( wordCount[ i ] );
798                 narray[ i ]     := n;
799                 mean                            := mean + n;
800         end;
801         mean := mean / wordCount.Count;
802
803         P1 := 1;
804         Q1 := 1;
805         for i := 0 to wordCount.Count - 1 do begin
806                 countInfo       := TWordCountInfo( wordCount.Objects[ i ] );
807                 n                                               := f( countInfo.WordCount, narray[ i ], mean );
808                 P1 := P1 * ( 1 - n );
809                 Q1 := Q1 * n;
810         end;
811         cnt := wordCount.Count;
812         if cnt = 0 then
813                 cnt := 1
814         else
815         P1 := 1 - Power( P1, 1 / cnt );
816         Q1 := 1 - Power( Q1, 1 / cnt );
817
818         if P1 + Q1 = 0 then begin
819                 Result := 0.5
820         end else begin
821                 n := (P1 - Q1) / (P1 + Q1);
822                 Result := (1 + n) / 2;
823         end;
824
825 end;
826
827 //==============================
828 // CalcGaryRobinsonFisher
829 //==============================
830 function TGikoBayesian.CalcGaryRobinsonFisher(
831         wordCount : TWordCount
832 ) : Extended;
833
834         function p( const aWord : string ) : Single;
835         var
836                 info                            : TWordInfo;
837         begin
838                 info := Objects[ aWord ];
839                 if info = nil then
840                         Result := 0.415
841                 else if info.ImportantWord = 0 then
842                         Result := 0.01
843                 else if info.NormalWord = 0 then
844                         Result := 0.99
845                 else
846                 {
847                         Result := ( info.ImportantWord / info.ImportantText ) /
848                                 ((info.NormalWord / info.NormalText ) +
849                                  (info.ImportantWord / info.ImportantText));
850                 }
851                         Result := (info.ImportantWord * info.NormalText) /
852                                 (info.NormalWord * info.ImportantText +
853                                 info.ImportantWord * info.NormalText);
854         end;
855
856         function f( cnt : Integer; n, mean : Single ) : Extended;
857         const
858                 k = 0.001;
859         begin
860                 Result := ( (k * mean) + (cnt * n) ) / (k + cnt);
861         end;
862
863         function prbx( x2, degree : Extended ) : Extended;
864         begin
865
866                 Result := 0.5;
867
868         end;
869
870 var
871         n                                               : Extended;
872         narray                  : array of Single;
873         mean                            : Extended;
874         countInfo               : TWordCountInfo;
875         i                                               : Integer;
876         normal                  : Extended;
877         important               : Extended;
878         P1, Q1                  : Extended;
879         cnt                                     : Extended;
880 begin
881
882         if wordCount.Count = 0 then begin
883                 Result := 1;
884                 Exit;
885         end;
886
887         SetLength( narray, wordCount.Count );
888         mean := 0;
889         for i := 0 to wordCount.Count - 1 do begin
890                 n                                               := p( wordCount[ i ] );
891                 narray[ i ]     := n;
892                 mean                            := mean + n;
893         end;
894         mean := mean / wordCount.Count;
895
896         P1 := 1;
897         Q1 := 1;
898         for i := 0 to wordCount.Count - 1 do begin
899                 countInfo       := TWordCountInfo( wordCount.Objects[ i ] );
900                 n                                               := f( countInfo.WordCount, narray[ i ], mean );
901                 P1 := P1 * ( 1 - n );
902                 Q1 := Q1 * n;
903         end;
904         cnt := wordCount.Count;
905         if cnt = 0 then
906                 cnt := 1
907         else
908         P1 := Power( P1, 1 / cnt );
909         Q1 := Power( Q1, 1 / cnt );
910
911         P1 := 1 - prbx( -2 * Ln( P1 ), 2 * cnt );
912         Q1 := 1 - prbx( -2 * Ln( Q1 ), 2 * cnt );
913
914         Result := (1 + P1 - Q1) / 2;
915
916 end;
917
918 //==============================
919 // Parse
920 //==============================
921 function TGikoBayesian.Parse(
922         const text                              : string;
923         wordCount                                       : TWordCount;
924         algorithm                                       : TGikoBayesianAlgorithm
925 ) : Extended;
926 begin
927
928         CountWord( text, wordCount );
929         case algorithm of
930         gbaPaulGraham:          Result := CalcPaulGraham( wordCount );
931         gbaGaryRobinson:        Result := CalcGaryRobinson( wordCount );
932         gbaGaryRobinsonFisher:
933                                                                                 Result := CalcGaryRobinsonFisher( wordCount );
934         else                                                    Result := 0;
935         end;
936
937 end;
938
939 //==============================
940 // Learn
941 //==============================
942 procedure TGikoBayesian.Learn(
943         wordCount                : TWordCount;
944         isImportant      : Boolean );
945 var
946         aWord                   : string;
947         wordinfo        : TWordInfo;
948         countinfo       : TWordCountInfo;
949         i                                       : Integer;
950 begin
951
952         for i := 0 to wordCount.Count - 1 do begin
953                 aWord := wordCount[ i ];
954                 wordinfo := Objects[ aWord ];
955                 countinfo := TWordCountInfo( wordCount.Objects[ i ] );
956                 if wordinfo = nil then begin
957                         wordinfo := TWordInfo.Create;
958                         Objects[ aWord ] := wordinfo;
959                 end;
960
961                 if isImportant then begin
962                         wordinfo.ImportantWord := wordinfo.ImportantWord + countinfo.WordCount;
963                         wordinfo.ImportantText := wordinfo.ImportantText + 1;
964                 end else begin
965                         wordinfo.NormalWord := wordinfo.NormalWord + countinfo.WordCount;
966                         wordinfo.NormalText := wordinfo.NormalText + 1;
967                 end;
968         end;
969
970 end;
971
972 //==============================
973 // Forget
974 //==============================
975 procedure       TGikoBayesian.Forget(
976         wordCount               : TWordCount;
977         isImportant     : Boolean );
978 var
979         aWord                   : string;
980         wordinfo        : TWordInfo;
981         countinfo       : TWordCountInfo;
982         i                       : Integer;
983 begin
984
985         for i := 0 to wordCount.Count - 1 do begin
986                 aWord := wordCount[ i ];
987                 wordinfo := Objects[ aWord ];
988                 if wordinfo = nil then
989                         Continue;
990
991                 countinfo := TWordCountInfo( wordCount.Objects[ i ] );
992                 if isImportant then begin
993                         if wordInfo.ImportantText > 0 then begin
994                                 wordinfo.ImportantText := wordinfo.ImportantText - 1;
995                                 wordinfo.ImportantWord := wordinfo.ImportantWord - countinfo.WordCount;
996                         end;
997                 end else begin
998                         if wordinfo.NormalText > 0 then begin
999                                 wordinfo.NormalText := wordinfo.NormalText - 1;
1000                                 wordinfo.NormalWord := wordinfo.NormalWord - countinfo.WordCount;
1001                         end;
1002                 end;
1003         end;
1004
1005 end;
1006
1007 end.