OSDN Git Service

f72a08b11247cbffc7b90a61def4bd4059b76128
[gikonavigoeson/gikonavi.git] / GikoBayesian.pas
1 unit GikoBayesian;
2
3 {!
4 \file           GikoBayesian.pas
5 \brief  \83x\83C\83W\83A\83\93\83t\83B\83\8b\83^
6
7 $Id: GikoBayesian.pas,v 1.17 2004/11/05 14:24:26 h677 Exp $
8 }
9
10 //! \95½\89¼\96¼\82ð\8e«\8f\91\82É\8aÜ\82ß\82È\82¢
11 {$DEFINE GIKO_BAYESIAN_NO_HIRAGANA_DIC}
12
13 interface
14
15 //==================================================
16 uses
17 //==================================================
18         Classes;
19
20 //==================================================
21 type
22 //==================================================
23
24         {!***********************************************************
25         \brief \92P\8cê\83v\83\8d\83p\83e\83B
26         ************************************************************}
27         TWordInfo       = class( TObject )
28         private
29                 FNormalWord                     :       Integer;        //!< \92Ê\8fí\82Ì\92P\8cê\82Æ\82µ\82Ä\93o\8fê\82µ\82½\89ñ\90\94
30                 FImportantWord  : Integer;      //!< \92\8d\96Ú\92P\8cê\82Æ\82µ\82Ä\93o\8fê\82µ\82½\89ñ\90\94
31                 FNormalText                     : Integer;      //!< \92Ê\8fí\82Ì\92P\8cê\82Æ\82µ\82Ä\8aÜ\82Ü\82ê\82Ä\82¢\82½\95\8fÍ\82Ì\90\94
32                 FImportantText  : Integer;      //!< \92\8d\96Ú\92P\8cê\82Æ\82µ\82Ä\8aÜ\82Ü\82ê\82Ä\82¢\82½\95\8fÍ\82Ì\90\94
33
34         public
35                 property NormalWord                     : Integer       read FNormalWord write FNormalWord;
36                 property ImportantWord  : Integer       read FImportantWord write FImportantWord;
37                 property NormalText                     : Integer       read FNormalText write FNormalText;
38                 property ImportantText  : Integer       read FImportantText write FImportantText;
39         end;
40
41         {!***********************************************************
42         \brief \89ð\90Í\8dÏ\82Ý\92P\8cê\83v\83\8d\83p\83e\83B
43         ************************************************************}
44         TWordCountInfo  = class( TObject )
45         private
46                 FWordCount      :       Integer;        //!< \92P\8cê\90\94
47
48         public
49                 property WordCount      : Integer       read FWordCount write FWordCount;
50         end;
51
52         {!***********************************************************
53         \brief \89ð\90Í\8dÏ\82Ý\92P\8cê\83\8a\83X\83g
54         ************************************************************}
55 //      TWordCount      = class( THashedStringList )    // \8c\83\92x
56         TWordCount      = class( TStringList )
57         public
58                 constructor Create;
59                 destructor Destroy; override;
60         end;
61
62         {!***********************************************************
63         \brief \83t\83B\83\8b\83^\83A\83\8b\83S\83\8a\83Y\83\80
64         ************************************************************}
65         TGikoBayesianAlgorithm =
66                 (gbaPaulGraham, gbaGaryRobinson, gbaGaryRobinsonFisher);
67
68         {!***********************************************************
69         \brief \83x\83C\83W\83A\83\93\83t\83B\83\8b\83^
70         ************************************************************}
71 //      TGikoBayesian = class( THashedStringList )      // \8c\83\92x
72         TGikoBayesian = class( TStringList )
73         private
74                 FFilePath       : string;       //!< \93Ç\82Ý\8d\9e\82ñ\82¾\83t\83@\83C\83\8b\83p\83X
75                 function GetObject( const name : string ) : TWordInfo;
76                 procedure SetObject( const name : string; value : TWordInfo );
77
78         public
79                 constructor Create;
80                 destructor Destroy; override;
81
82                 //! \83t\83@\83C\83\8b\82©\82ç\8aw\8fK\97\9a\97ð\82ð\93Ç\82Ý\8fo\82µ\82Ü\82·
83                 procedure LoadFromFile( const filePath : string );
84
85                 //! \83t\83@\83C\83\8b\82É\8aw\8fK\97\9a\97ð\82ð\95Û\91\82µ\82Ü\82·
86                 procedure SaveToFile( const filePath : string );
87
88                 //! \83t\83@\83C\83\8b\82É\8aw\8fK\97\9a\97ð\82ð\95Û\91\82µ\82Ü\82·
89                 procedure Save;
90
91                 //! \92P\8cê\82É\91Î\82·\82é\8fî\95ñ\82ð\8eæ\93¾\82µ\82Ü\82·
92                 property Objects[ const name : string ] : TWordInfo
93                         read GetObject write SetObject; default;
94
95                 //! \95\8fÍ\82É\8aÜ\82Ü\82ê\82é\92P\8cê\82ð\83J\83E\83\93\83g\82µ\82Ü\82·
96                 procedure CountWord(
97                         const text      : string;
98                         wordCount               : TWordCount );
99
100                 {!
101                 \brief  Paul Graham \96@\82É\8aî\82Ã\82¢\82Ä\95\8fÍ\82Ì\92\8d\96Ú\93x\82ð\8c\88\92è\82µ\82Ü\82·
102                 \return \95\8fÍ\82Ì\92\8d\96Ú\93x (\92\8d\96Ú\82É\92l\82µ\82È\82¢ 0.0\81`1.0 \92\8d\96Ú\82·\82×\82«)
103                 }
104                 function CalcPaulGraham( wordCount : TWordCount ) : Extended;
105
106                 {!
107                 \brief  GaryRobinson \96@\82É\8aî\82Ã\82¢\82Ä\95\8fÍ\82Ì\92\8d\96Ú\93x\82ð\8c\88\92è\82µ\82Ü\82·
108                 \return \95\8fÍ\82Ì\92\8d\96Ú\93x (\92\8d\96Ú\82É\92l\82µ\82È\82¢ 0.0\81`1.0 \92\8d\96Ú\82·\82×\82«)
109                 }
110                 function CalcGaryRobinson( wordCount : TWordCount ) : Extended;
111
112                 {!
113                 \brief  GaryRobinson-Fisher \96@\82É\8aî\82Ã\82¢\82Ä\95\8fÍ\82Ì\92\8d\96Ú\93x\82ð\8c\88\92è\82µ\82Ü\82·
114                 \return \95\8fÍ\82Ì\92\8d\96Ú\93x (\92\8d\96Ú\82É\92l\82µ\82È\82¢ 0.0\81`1.0 \92\8d\96Ú\82·\82×\82«)
115                 }
116                 function CalcGaryRobinsonFisher( wordCount : TWordCount ) : Extended;
117
118                 {!
119                 \brief  \95\8fÍ\82ð\89ð\90Í
120                 \param  text                                    \89ð\90Í\82·\82é\95\8fÍ
121                 \param  wordCount                       \89ð\90Í\82³\82ê\82½\92P\8cê\83\8a\83X\83g\82ª\95Ô\82é
122                 \param  algorithm                       \92\8d\96Ú\93x\82Ì\8c\88\92è\82É\97p\82¢\82é\83A\83\8b\83S\83\8a\83Y\83\80\82ð\8ew\92è\82µ\82Ü\82·
123                 \return \95\8fÍ\82Ì\92\8d\96Ú\93x (\92\8d\96Ú\82É\92l\82µ\82È\82¢ 0.0\81`1.0 \92\8d\96Ú\82·\82×\82«)
124
125                 CountWord \82Æ Calcxxxxx \82ð\82Ü\82Æ\82ß\82Ä\8eÀ\8ds\82·\82é\82¾\82¯\82Å\82·\81B
126                 }
127                 function Parse(
128                         const text                              : string;
129                         wordCount                                       : TWordCount;
130                         algorithm                                       : TGikoBayesianAlgorithm = gbaGaryRobinsonFisher
131                 ) : Extended;
132
133                 {!
134                 \brief  \8aw\8fK\82·\82é
135                 \param  wordCount               Parse \82Å\89ð\90Í\82³\82ê\82½\92P\8cê\83\8a\83X\83g
136                 \param  isImportant \92\8d\96Ú\82·\82×\82«\95\8fÍ\82Æ\82µ\82Ä\8ao\82¦\82é\82È\82ç True
137                 }
138                 procedure Learn(
139                         wordCount                : TWordCount;
140                         isImportant      : Boolean );
141
142                 {!
143                 \brief          \8aw\8fK\8c\8b\89Ê\82ð\96Y\82ê\82é
144                 \param          wordCount               Parse \82Å\89ð\90Í\82³\82ê\82½\92P\8cê\83\8a\83X\83g
145                 \param          isImportant     \92\8d\96Ú\82·\82×\82«\95\8fÍ\82Æ\82µ\82Ä\8ao\82¦\82ç\82ê\82Ä\82¢\82½\82È\82ç True
146                 \warning        \8aw\8fK\8dÏ\82Ý\82Ì\95\8fÍ\82©\82Ç\82¤\82©\82Í\8am\94F\8fo\97\88\82Ü\82¹\82ñ\81B<br>
147                                                         Learn \82µ\82Ä\82¢\82È\82¢\95\8fÍ\82â isImportant \82ª\8aÔ\88á\82Á\82Ä\82¢\82é\95\8fÍ\82ð
148                                                         Forget \82·\82é\82Æ\83f\81[\83^\83x\81[\83X\82ª\94j\91¹\82µ\82Ü\82·\81B<br>
149                                                         \8aw\8fK\8dÏ\82Ý\82©\82Ç\82¤\82©\82Í\93Æ\8e©\82É\8aÇ\97\9d\82µ\82Ä\82­\82¾\82³\82¢\81B
150
151                 \91S\82Ä\82Ì\8aw\8fK\8c\8b\89Ê\82ð\83N\83\8a\83A\82·\82é\82í\82¯\82Å\82Í\82 \82è\82Ü\82¹\82ñ\81B<br>
152                 wordCount \82ð\93¾\82½\95\8fÍ (Parse \82Ì text \88ø\90\94\82Ì\8aw\8fK\8c\8b\89Ê\82Ì\82Ý\83N\83\8a\83A\82µ\82Ü\82·\81B<br><br>
153
154                 \8eå\82É\92\8d\96Ú\95\8fÍ\82Æ\94ñ\92\8d\96Ú\95\8fÍ\82ð\90Ø\82è\91Ö\82¦\82é\82½\82ß\82É Forget -> Learn \82Ì\8f\87\82Å\8eg\97p\82µ\82Ü\82·\81B
155                 }
156                 procedure       Forget(
157                         wordCount               : TWordCount;
158                         isImportant     : Boolean );
159         end;
160
161 //==================================================
162 implementation
163 //==================================================
164
165 uses
166         SysUtils, Math, Windows,
167         MojuUtils;
168
169 const
170         GIKO_BAYESIAN_FILE_VERSION      = '1.0';
171 {
172         Modes                           = (ModeWhite, ModeGraph, ModeAlpha, ModeHanKana, ModeNum,
173                                                                 ModeWGraph, ModeWAlpha, ModeWNum,
174                                                                 ModeWHira, ModeWKata, ModeWKanji);
175 }
176         CharMode1 : array [ 0..255 ] of Byte =
177         (
178                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
179                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
180                 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
181                 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
182                 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
183                 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1,
184                 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
185                 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 0,
186
187                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
188                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
189                 0, 1, 1, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
190                 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
191                 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
192                 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
193                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
194                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
195         );
196
197 //************************************************************
198 // misc
199 //************************************************************
200
201 //==============================
202 // RemoveToken
203 //==============================
204 function RemoveToken(var s: string;const delimiter: string): string;
205 var
206         p: Integer;
207 begin
208         p := AnsiPos(delimiter, s);
209         if p = 0 then
210                 Result := s
211         else
212                 Result := Copy(s, 1, p - 1);
213         s := Copy(s, Length(Result) + Length(delimiter) + 1, Length(s));
214 end;
215
216 //==============================
217 // AbsSort
218 //==============================
219 function AbsSort( p1, p2 : Pointer ) : Integer;
220 var
221         v1, v2 : Single;
222 begin
223
224         v1 := Abs( Single( p1 ) - 0.5 );
225         v2 := Abs( Single( p2 ) - 0.5 );
226         if v1 > v2 then
227                 Result := -1
228         else if v1 = v2 then
229                 Result := 0
230         else
231                 Result := 1;
232
233 end;
234
235 //************************************************************
236 // TWordCount class
237 //************************************************************
238 constructor TWordCount.Create;
239 begin
240
241                 Duplicates              := dupIgnore;
242                 CaseSensitive   := True;
243                 Sorted                          := True;
244
245 end;
246
247 destructor TWordCount.Destroy;
248 var
249         i : Integer;
250 begin
251
252         for i := Count - 1 downto 0 do
253                 if Objects[ i ] <> nil then
254                         Objects[ i ].Free;
255
256         inherited;
257
258 end;
259
260 //************************************************************
261 // TGikoBayesian class
262 //************************************************************
263
264 //==============================
265 // Create
266 //==============================
267 constructor TGikoBayesian.Create;
268 begin
269
270         Duplicates              := dupIgnore;
271         CaseSensitive   := True;
272         Sorted                          := True;
273
274 end;
275
276 //==============================
277 // Destroy
278 //==============================
279 destructor TGikoBayesian.Destroy;
280 var
281         i : Integer;
282 begin
283
284         for i := Count - 1 downto 0 do
285                 if inherited Objects[ i ] <> nil then
286                         inherited Objects[ i ].Free;
287
288         inherited;
289
290 end;
291
292 procedure TGikoBayesian.LoadFromFile( const filePath : string );
293 var
294         i                       : Integer;
295         sl              : TStringList;
296         s                       : string;
297         name    : string;
298         info    : TWordInfo;
299 begin
300
301         FFilePath := filePath;
302
303         if not FileExists( filePath ) then
304                 Exit;
305
306         sl := TStringList.Create;
307         try
308                 sl.LoadFromFile( filePath );
309
310                 for i := 1 to sl.Count - 1 do begin
311                         s := sl[ i ];
312                         name := RemoveToken( s, #1 );
313                         info := TWordInfo.Create;
314                         info.NormalWord                 := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
315                         info.ImportantWord      := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
316                         info.NormalText                 := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
317                         info.ImportantText      := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
318
319                         AddObject( name, info );
320                 end;
321         finally
322                 sl.Free;
323         end;
324
325 end;
326
327 procedure TGikoBayesian.SaveToFile( const filePath : string );
328 var
329         i                       : Integer;
330         sl              : TStringList;
331         s                       : string;
332         info    : TWordInfo;
333 begin
334
335         FFilePath := filePath;
336
337         sl := TStringList.Create;
338         try
339                 sl.BeginUpdate;
340                 sl.Add( GIKO_BAYESIAN_FILE_VERSION );
341
342                 for i := 0 to Count - 1 do begin
343                         info := TWordInfo( inherited Objects[ i ] );
344                         s := Strings[ i ] + #1
345                                  + Format('%x', [info.NormalWord]) + #1
346                                  + Format('%x', [info.ImportantWord]) + #1
347                                  + Format('%x', [info.NormalText]) + #1
348                                  + Format('%x', [info.ImportantText]);
349
350                         sl.Add(s);
351                 end;
352                 sl.EndUpdate;
353                 sl.SaveToFile( filePath );
354         finally
355                 sl.Free;
356         end;
357
358 end;
359
360 procedure TGikoBayesian.Save;
361 begin
362
363         if FFilePath <> '' then
364                 SaveToFile( FFilePath );
365
366 end;
367
368 //==============================
369 // GetObject
370 //==============================
371 function TGikoBayesian.GetObject( const name : string ) : TWordInfo;
372 var
373         idx : Integer;
374 begin
375
376         if Find( name, idx ) then
377                 Result := TWordInfo( inherited Objects[ idx ] )
378         else
379                 Result := nil;
380
381 end;
382
383 //==============================
384 // SetObject
385 //==============================
386 procedure TGikoBayesian.SetObject( const name : string; value : TWordInfo );
387 var
388         idx : Integer;
389 begin
390
391         if Find( name, idx ) then
392                 inherited Objects[ idx ] := value
393         else
394                 AddObject( name, value );
395
396 end;
397
398
399 //==============================
400 // CountWord
401 //==============================
402 procedure TGikoBayesian.CountWord(
403         const text      : string;
404         wordCount               : TWordCount );
405 type
406         Modes                           = (ModeWhite, ModeGraph, ModeAlpha, ModeNum, ModeHanKana,
407                                                                 ModeWGraph, ModeWAlpha, ModeWNum,
408                                                                 ModeWHira, ModeWKata, ModeWKanji);
409 var
410         p, tail, last                   : PChar;
411         mode, newMode                   : Modes;
412         ch                                                              : Longword;
413         chSize                                          : Integer;
414         wHiraDelimiter          : TStringList;
415         wHiraFinalDelimiter     : TStringList;
416         wKanjiDelimiter         : TStringList;
417         words                                                   : TStringList;
418         aWord                                                   : string;
419         countInfo                                       : TWordCountInfo;
420
421         function cutBoth( _aWord : string; _delim : TStringList ) : string;
422         var
423                 _i                      : Integer;
424         begin
425                 for _i := 0 to _delim.Count - 1 do begin
426                         _aWord := CustomStringReplace(
427                                 _aWord,
428                                 _delim[ _i ],
429                                 #10 + _delim[ _i ] + #10, False );
430                 end;
431                 Result := _aWord;
432         end;
433
434         function cutFirst( _aWord : string; _delim : TStringList ) : string;
435         var
436                 _i                      : Integer;
437         begin
438                 for _i := 0 to _delim.Count - 1 do begin
439                         _aWord := CustomStringReplace(
440                                 _aWord,
441                                 _delim[ _i ],
442                                 #10 + _delim[ _i ], False );
443                 end;
444                 Result := _aWord;
445         end;
446
447         function cutFinal( _aWord : string; _delim : TStringList ) : string;
448         var
449                 _i                      : Integer;
450         begin
451                 for _i := 0 to _delim.Count - 1 do begin
452                         _aWord := CustomStringReplace(
453                                 _aWord,
454                                 _delim[ _i ],
455                                 _delim[ _i ] + #10, False );
456                 end;
457                 Result := _aWord;
458         end;
459
460         procedure addWord( _dst : TWordCount; _words : TStringList );
461         var
462                 _aWord                  : string;
463                 _i, _idx                : Integer;
464                 _countInfo      : TWordCountInfo;
465         begin
466                 for _i := 0 to _words.Count - 1 do begin
467                         _aWord := _words[ _i ];
468                         if Length( _aWord ) > 0 then begin
469                                 if _dst.Find( _aWord, _idx ) then begin
470                                         _countInfo := TWordCountInfo( _dst.Objects[ _idx ] );
471                                 end else begin
472                                         _countInfo := TWordCountInfo.Create;
473                                         _dst.AddObject( _aWord, _countInfo );
474                                 end;
475                                 _countInfo.WordCount := _countInfo.WordCount + 1;
476                         end;
477                 end;
478         end;
479
480         function changeMode( _aWord : string; _mode : Modes ) : string;
481         var
482                 _i                                                                      : Integer;
483                 _aWord2                                                 : string;
484                 _pWord, _pWord2                 : PChar;
485                 _pWordTail, _pFound     : PChar;
486         const
487                 _delim : string = #10;
488         begin
489 {$IFDEF GIKO_BAYESIAN_NO_HIRAGANA_DIC}
490                 if mode = ModeWHira then begin
491                         Result := '';
492                         Exit;
493                 end;
494 {$ENDIF}
495                 if Ord( _mode ) >= Ord( ModeWGraph ) then begin
496                         // \93ú\96{\8cê
497                         // \83X\83y\81[\83X\82ð\8bl\82ß\82é
498                         _aWord := CustomStringReplace( _aWord, ' ', '', False );
499                         _aWord := CustomStringReplace( _aWord, '\81@', '', False );
500
501                         // \83f\83\8a\83~\83^\82Å\92P\8cê\95ª\82¯
502                         case mode of
503                         ModeWHira:
504                                 begin
505                                         _aWord := cutFinal( _aWord, wHiraFinalDelimiter );
506                                         Result := cutBoth( _aWord, wHiraDelimiter );
507                                 end;
508
509                         ModeWKanji:
510                                 begin
511                                         // \83f\83\8a\83~\83^\82Å\92P\8cê\95ª\82¯
512                                         _aWord := cutBoth( _aWord, wKanjiDelimiter );
513                                         // 4 byte (2 \8e\9a\82¸\82Â\82Å\92P\8cê\95ª\82¯
514                                         _pWord := PChar( _aWord );
515                                         _i := Length( _aWord );
516                                         _pWordTail := _pWord + _i;
517                                         SetLength( _aWord2, _i + (_i shr 2) );
518                                         _pWord2 := PChar( _aWord2 );
519
520                                         while _pWord < _pWordTail do begin
521                                                 _pFound := AnsiStrPos( _pWord, PChar( _delim ) );
522                                                 if _pFound = nil then
523                                                         _pFound := _pWordTail;
524                                                 _pFound := _pFound - 3;
525
526                                                 while _pWord <= _pFound do begin
527                                                         CopyMemory( _pWord2, _pWord, 4 ); _pWord2[ 4 ] := #10;
528                                                         _pWord2 := _pWord2 + 5; _pWord := _pWord + 4;
529                                                 end;
530                                                 _i := _pFound + 4 - _pWord; // 4 = 3 + #10
531                                                 CopyMemory( _pWord2, _pWord, _i );
532                                                 _pWord2 := _pWord2 + _i; _pWord := _pWord + _i;
533                                         end;
534                                         if _pWord < _pWordTail then begin
535                                                 _i := _pWordTail - _pWord;
536                                                 CopyMemory( _pWord2, _pWord, _i );
537                                                 _pWord2 := _pWord2 + _i;
538                                         end;
539                                         SetLength( _aWord2, _pWord2 - PChar( _aWord2 ) );
540
541                                         Result := _aWord2;
542                                 end;
543
544                         else
545                                 Result := _aWord;
546                         end;
547                 end else begin
548                         Result := _aWord;
549                 end;
550         end;
551 const
552         WHIRA_DELIMITER = '\82ð' + #10 + '\82É' + #10 + '\82ª' + #10 + '\82Æ' + #10 + '\82©\82ç'
553                 + #10 + '\82Ö' + #10 + '\82æ\82è' + #10 + '\82Ü\82Å'+ #10 + '\82Å'
554                 + #10 + '\82±\82±' + #10 + '\82»\82±' + #10 + '\82Ç\82±'
555                 + #10 + '\82±\82ê' + #10 + '\82»\82ê' + #10 + '\82 \82ê' + #10 + '\82Ç\82ê'
556                 + #10 + '\82±\82Ì' + #10 + '\82»\82Ì' + #10 + '\82 \82Ì' + #10 + '\82Ç\82Ì'
557                 + #10 + '\82±\82¤' + #10 + '\82»\82¤' + #10 + '\82 \82 ' + #10 + '\82Ç\82¤'
558                 + #10 + '\82±\82ñ\82È' + #10 + '\82»\82ñ\82È' + #10 + '\82 \82ñ\82È' + #10 + '\82Ç\82ñ\82È'
559                 + #10 + '\82ê\82½' + #10 + '\82ê\82Ä' + #10 + '\82ê\82ê' + #10 + '\82ê\82ë'
560                 + #10 + '\82ê\82é' + #10 + '\82ç\82ê\82é'
561                 + #10 + '\82Å\82·' + #10 + '\82Ü\82·' + #10 + '\82Ü\82¹\82ñ'
562                 + #10 + '\82Å\82µ\82½' + #10 + '\82Ü\82µ\82½'
563                 + #10 + '\82·\82é' + #10 + '\82µ\82È\82¢' + #10 + '\82³\82ê\82é' + #10 + '\82³\82ê\82È\82¢'
564                 ;
565         WKANJI_DELIMITER = '\93I' + #10 + '\90«' + #10 + '\8e®' + #10 + '\89»' + #10 + '\96@'
566                 + #10 + '\95s' + #10 + '\96³' + #10 + '\94ñ' + #10 + '\94½'
567                 ;
568         WHIRA_FINAL_DELIMITER = '\82Á\82½' + #10 + '\82Á\82Ä'
569                 ;{
570                 + #10 + '\82æ\82Á\82Ä' + #10 + '\82µ\82½\82ª\82Á\82Ä' + #10 + '\82È\82Ì\82Å'
571                 + #10 + '\82¾\82©\82ç' + #10 + '\82Å\82·\82©\82ç'
572                 + #10 + '\82Ü\82½'
573                 + #10 + '\82µ\82©\82µ' + #10 + '\82¾\82ª' + #10 + '\82¯\82Ç' + #10 + '\82¯\82ê\82Ç'
574                 + #10 + '\82â\82Í\82è' + #10 + '\82â\82Á\82Ï\82è'
575                 + #10 + '\82Å\82µ' + #10 + '\82¾\82ë'
576                 + #10 + '\82·\82é' + #10 + '\82µ\82È\82¢' + #10 + '\82µ\82½' + #10 + '\82µ\82È\82¢'
577                 ;}
578         // '\81[' \82ð '\82\9f\82¡\82£\82¥\82§' \82É\81B
579         HA_LINE = '\82 \82©\82³\82½\82È\82Í\82Ü\82â\82ç\82í\82ª\82´\82¾\82Î\82Ï\82\9f\82ì';
580         HI_LINE = '\82¢\82«\82µ\82¿\82É\82Ð\82Ý\82è\82î\82¬\82\82Ñ\82Ò\82¡';
581         HU_LINE = '\82¤\82­\82·\82Â\82Ê\82Ó\82Þ\82ä\82é\82®\82Ô\82Õ\82£';
582         HE_LINE = '\82¦\82¯\82¹\82Ä\82Ë\82Ö\82ß\82ê\82ï\82°\82×\82Ø\82¥';
583         HO_LINE = '\82¨\82±\82»\82Æ\82Ì\82Ù\82à\82æ\82ë\82ð\82²\82Ú\82Û\82§';
584         KA_LINE = '\83A\83J\83T\83^\83i\83n\83}\83\84\83\89\83\8f\83K\83U\83_\83o\83p\83@\83\95\83\8e';
585         KI_LINE = '\83C\83L\83V\83`\83j\83q\83~\83\8a\83\90\83M\83W\83r\83s\83B';
586         KU_LINE = '\83E\83N\83X\83c\83k\83t\83\80\83\86\83\8b\83O\83u\83v\83D\83\94';
587         KE_LINE = '\83G\83P\83Z\83e\83l\83w\83\81\83\8c\83\91\83Q\83x\83y\83F\83\96';
588         KO_LINE = '\83I\83R\83\\83g\83m\83z\83\82\83\88\83\8d\83\92\83S\83{\83|\83H';
589         kKanji = [$80..$A0, $E0..$ff];
590 begin
591
592         wHiraDelimiter  := TStringList.Create;
593         wHiraFinalDelimiter := TStringList.Create;
594         wKanjiDelimiter := TStringList.Create;
595         words := TStringList.Create;
596         try
597                 mode := ModeWhite;
598 {$IFNDEF GIKO_BAYESIAN_NO_HIRAGANA_DIC}
599                 wHiraDelimiter.Text := WHIRA_DELIMITER;
600                 wHiraFinalDelimiter.Text := WHIRA_FINAL_DELIMITER;
601 {$ENDIF}
602                 wKanjiDelimiter.Text := WKANJI_DELIMITER;
603                 p                       := PChar( text );
604                 tail    := p + Length( text );
605                 last    := p;
606
607                 while p < tail do begin
608                         // \95\8e\9a\82Ì\83^\83C\83v\82ð\94»\95Ê
609                         // \81¦\8bå\93Ç\93_\82Í ModeGraph \82É\82È\82é\82Ì\82Å\8cÂ\95Ê\82É\91Î\89\9e\82µ\82È\82­\82Ä\82à\82¢\82¢
610 //                      if Byte(Byte( p^ ) - $a1) < $5e then begin
611                         if Byte( p^ ) in kKanji then begin
612                                 if p + 1 < tail then begin
613                                         ch := (PByte( p )^ shl 8) or PByte( p + 1 )^;
614                                         case ch of
615                                         // \83X\83y\81[\83X\82Å\92P\8cê\95ª\82¯\82¹\82¸\82É\8bl\82ß\82é
616                                         //$8140:                                                        newMode := ModeWhite;
617                                         $8141..$824e:                           newMode := ModeWGraph;
618                                         $824f..$8258:                           newMode := ModeWNum;
619                                         $8260..$829a:                           newMode := ModeWAlpha;
620                                         $829f..$82f1:                           newMode := ModeWHira;
621                                         $8340..$8396:                           newMode := ModeWKata;
622                                         else                                                            newMode := ModeWKanji;
623                                         end;
624                                         // '\81J\81K\81[' \82Í\95½\89¼\96¼\81A\82Ü\82½\82Í\83J\83^\83J\83i\82É\8aÜ\82Ü\82ê\82é
625                                         if (mode = ModeWHira) or (mode = ModeWKata) then
626                                                 if (ch = $814a) or (ch = $814b) or (ch = $815b) then
627                                                         newMode := mode;
628                                 end else begin
629                                         newMode := ModeWhite;
630                                 end;
631
632                                 chSize := 2;
633                         end else begin
634                                 newMode := Modes( CharMode1[ Byte( p^ ) ] );
635                                 if (p^ = ' ') and (Ord( mode ) >= Ord( ModeWGraph )) then begin
636                                         // \8d¡\82Ü\82Å\93ú\96{\8cê\82Å\8d¡\83X\83y\81[\83X
637                                         // \92P\8cê\82ð\8cq\82°\82Ä\8cã\82Å\83X\83y\81[\83X\82ð\8bl\82ß\82é
638                                         // \81¦\94¼\8ap\83J\83i\82Í\92Ê\8fí\83X\83y\81[\83X\82Å\8bæ\90Ø\82é\82¾\82ë\82¤\82©\82ç\8bl\82ß\82È\82¢
639                                         newMode := mode;
640                                 end;
641
642                                 chSize := 1;
643                         end;
644
645                         if mode <> newMode then begin
646
647                                 // \95\8e\9a\82Ì\83^\83C\83v\82ª\95Ï\8dX\82³\82ê\82½
648                                 if mode <> ModeWhite then begin
649                                         SetLength( aWord, p - last );
650                                         CopyMemory( PChar( aWord ), last, p - last );
651
652                                         words.Text := changeMode( aWord, mode );
653
654                                         // \92P\8cê\93o\98^
655                                         addWord( wordCount, words );
656                                 end;
657
658                                 last := p;
659                                 mode := newMode;
660
661                         end;
662
663                         p := p + chSize;
664                 end;    // while
665
666                 if mode <> ModeWhite then begin
667                         SetLength( aWord, p - last );
668                         CopyMemory( PChar( aWord ), last, p - last );
669
670                         words.Text := changeMode( aWord, mode );
671
672                         // \92P\8cê\93o\98^
673                         addWord( wordCount, words );
674                 end;
675         finally
676                 words.Free;
677                 wKanjiDelimiter.Free;
678                 wHiraFinalDelimiter.Free;
679                 wHiraDelimiter.Free;
680         end;
681
682 end;
683
684 //==============================
685 // CalcPaulGraham
686 //==============================
687 function TGikoBayesian.CalcPaulGraham( wordCount : TWordCount ) : Extended;
688
689         function p( const aWord : string ) : Single;
690         var
691                 info : TWordInfo;
692         begin
693                 info := Objects[ aWord ];
694                 if info = nil then
695                         Result := 0.415
696                 else if info.NormalWord = 0 then
697                         Result := 0.99
698                 else if info.ImportantWord = 0 then
699                         Result := 0.01
700                 else if info.ImportantWord + info.NormalWord * 2 < 5 then
701                         Result := 0.5
702                 else begin
703                         try
704                                 Result := ( info.ImportantWord / info.ImportantText ) /
705                                         ((info.NormalWord * 2 / info.NormalText ) +
706                                          (info.ImportantWord / info.ImportantText));
707                         except
708                 on EZeroDivide do Result := 0.99;
709                         end;
710                 end;
711         end;
712
713 var
714         s, q                            : Extended;
715         i                                               : Integer;
716         narray                  : TList;
717 const
718         SAMPLE_COUNT    = 15;
719 begin
720
721         Result := 1;
722         if wordCount.Count = 0 then
723                 Exit;
724
725         narray := TList.Create;
726         try
727                 for i := 0 to wordCount.Count - 1 do begin
728                         narray.Add( Pointer( p( wordCount[ i ] ) ) );
729                 end;
730
731                 narray.Sort( AbsSort );
732
733                 s := 1;
734                 q := 1;
735                 i := min( SAMPLE_COUNT, narray.Count );
736                 while i > 0 do begin
737                         Dec( i );
738
739                         s := s * Single( narray[ i ] );
740                         q := q * (1 - Single( narray[ i ] ));
741                 end;
742
743                 Result := s / (s + q);
744         finally
745                 narray.Free;
746         end;
747
748 end;
749
750 //==============================
751 // CalcGaryRobinson
752 //==============================
753 function TGikoBayesian.CalcGaryRobinson( wordCount : TWordCount ) : Extended;
754
755         function p( const aWord : string ) : Single;
756         var
757                 info : TWordInfo;
758         begin
759                 info := Objects[ aWord ];
760                 if info = nil then
761                         Result := 0.415
762                 else if info.ImportantWord = 0 then
763                         Result := 0.01
764                 else if info.NormalWord = 0 then
765                         Result := 0.99
766                 else
767                 {
768                         Result := ( info.ImportantWord / info.ImportantText ) /
769                                 ((info.NormalWord / info.NormalText ) +
770                                  (info.ImportantWord / info.ImportantText));
771                 }
772                         Result := (info.ImportantWord * info.NormalText) /
773                                 (info.NormalWord * info.ImportantText +
774                                 info.ImportantWord * info.NormalText);
775         end;
776
777         function f( cnt : Integer; n, mean : Single ) : Extended;
778         const
779                 k = 0.001;
780         begin
781                 Result := ( (k * mean) + (cnt * n) ) / (k + cnt);
782         end;
783
784 var
785         n                                               : Extended;
786         narray                  : array of Single;
787         mean                            : Extended;
788         countInfo               : TWordCountInfo;
789         i                                               : Integer;
790         P1, Q1, R1      : Extended;
791         cnt                                     : Extended;
792 begin
793
794         if wordCount.Count = 0 then begin
795                 Result := 1;
796                 Exit;
797         end;
798
799         SetLength( narray, wordCount.Count );
800         mean := 0;
801         for i := 0 to wordCount.Count - 1 do begin
802                 n                                               := p( wordCount[ i ] );
803                 narray[ i ]     := n;
804                 mean                            := mean + n;
805         end;
806         mean := mean / wordCount.Count;
807
808         P1 := 1;
809         Q1 := 1;
810         for i := 0 to wordCount.Count - 1 do begin
811                 countInfo       := TWordCountInfo( wordCount.Objects[ i ] );
812                 n                                               := f( countInfo.WordCount, narray[ i ], mean );
813                 P1 := P1 * ( 1 - n );
814                 Q1 := Q1 * n;
815         end;
816         cnt := wordCount.Count;
817         if cnt = 0 then
818                 cnt := 1;
819         P1 := 1 - Power( P1, 1 / cnt );
820         Q1 := 1 - Power( Q1, 1 / cnt );
821
822         if P1 + Q1 = 0 then begin
823                 Result := 0.5
824         end else begin
825                 n := (P1 - Q1) / (P1 + Q1);
826                 Result := (1 + n) / 2;
827         end;
828
829 end;
830
831 //==============================
832 // CalcGaryRobinsonFisher
833 //==============================
834 function TGikoBayesian.CalcGaryRobinsonFisher(
835         wordCount : TWordCount
836 ) : Extended;
837
838         function p( const aWord : string ) : Single;
839         var
840                 info                            : TWordInfo;
841         begin
842                 info := Objects[ aWord ];
843                 if info = nil then
844                         Result := 0.415
845                 else if info.ImportantWord = 0 then
846                         Result := 0.01
847                 else if info.NormalWord = 0 then
848                         Result := 0.99
849                 else
850                 {
851                         Result := ( info.ImportantWord / info.ImportantText ) /
852                                 ((info.NormalWord / info.NormalText ) +
853                                  (info.ImportantWord / info.ImportantText));
854                 }
855                         Result := (info.ImportantWord * info.NormalText) /
856                                 (info.NormalWord * info.ImportantText +
857                                 info.ImportantWord * info.NormalText);
858         end;
859
860         function f( cnt : Integer; n, mean : Single ) : Extended;
861         const
862                 k = 0.001;
863         begin
864                 Result := ( (k * mean) + (cnt * n) ) / (k + cnt);
865         end;
866
867         function prbx( x2, degree : Extended ) : Extended;
868         begin
869
870                 Result := 0.5;
871
872         end;
873
874 var
875         n                                               : Extended;
876         narray                  : array of Single;
877         mean                            : Extended;
878         countInfo               : TWordCountInfo;
879         i                                               : Integer;
880         normal                  : Extended;
881         important               : Extended;
882         P1, Q1                  : Extended;
883         cnt                                     : Extended;
884 begin
885
886         if wordCount.Count = 0 then begin
887                 Result := 1;
888                 Exit;
889         end;
890
891         SetLength( narray, wordCount.Count );
892         mean := 0;
893         for i := 0 to wordCount.Count - 1 do begin
894                 n                                               := p( wordCount[ i ] );
895                 narray[ i ]     := n;
896                 mean                            := mean + n;
897         end;
898         mean := mean / wordCount.Count;
899
900         P1 := 1;
901         Q1 := 1;
902         for i := 0 to wordCount.Count - 1 do begin
903                 countInfo       := TWordCountInfo( wordCount.Objects[ i ] );
904                 n                                               := f( countInfo.WordCount, narray[ i ], mean );
905                 P1 := P1 * ( 1 - n );
906                 Q1 := Q1 * n;
907         end;
908         cnt := wordCount.Count;
909         if cnt = 0 then
910                 cnt := 1;
911         P1 := Power( P1, 1 / cnt );
912         Q1 := Power( Q1, 1 / cnt );
913
914         P1 := 1 - prbx( -2 * Ln( P1 ), 2 * cnt );
915         Q1 := 1 - prbx( -2 * Ln( Q1 ), 2 * cnt );
916
917         Result := (1 + P1 - Q1) / 2;
918
919 end;
920
921 //==============================
922 // Parse
923 //==============================
924 function TGikoBayesian.Parse(
925         const text                              : string;
926         wordCount                                       : TWordCount;
927         algorithm                                       : TGikoBayesianAlgorithm
928 ) : Extended;
929 begin
930
931         CountWord( text, wordCount );
932         case algorithm of
933         gbaPaulGraham:          Result := CalcPaulGraham( wordCount );
934         gbaGaryRobinson:        Result := CalcGaryRobinson( wordCount );
935         gbaGaryRobinsonFisher:
936                                                                                 Result := CalcGaryRobinsonFisher( wordCount );
937         else                                                    Result := 0;
938         end;
939
940 end;
941
942 //==============================
943 // Learn
944 //==============================
945 procedure TGikoBayesian.Learn(
946         wordCount                : TWordCount;
947         isImportant      : Boolean );
948 var
949         aWord                   : string;
950         wordinfo        : TWordInfo;
951         countinfo       : TWordCountInfo;
952         i                                       : Integer;
953 begin
954
955         for i := 0 to wordCount.Count - 1 do begin
956                 aWord := wordCount[ i ];
957                 wordinfo := Objects[ aWord ];
958                 countinfo := TWordCountInfo( wordCount.Objects[ i ] );
959                 if wordinfo = nil then begin
960                         wordinfo := TWordInfo.Create;
961                         Objects[ aWord ] := wordinfo;
962                 end;
963
964                 if isImportant then begin
965                         wordinfo.ImportantWord := wordinfo.ImportantWord + countinfo.WordCount;
966                         wordinfo.ImportantText := wordinfo.ImportantText + 1;
967                 end else begin
968                         wordinfo.NormalWord := wordinfo.NormalWord + countinfo.WordCount;
969                         wordinfo.NormalText := wordinfo.NormalText + 1;
970                 end;
971         end;
972
973 end;
974
975 //==============================
976 // Forget
977 //==============================
978 procedure       TGikoBayesian.Forget(
979         wordCount               : TWordCount;
980         isImportant     : Boolean );
981 var
982         aWord                   : string;
983         wordinfo        : TWordInfo;
984         countinfo       : TWordCountInfo;
985         i                       : Integer;
986 begin
987
988         for i := 0 to wordCount.Count - 1 do begin
989                 aWord := wordCount[ i ];
990                 wordinfo := Objects[ aWord ];
991                 if wordinfo = nil then
992                         Continue;
993
994                 countinfo := TWordCountInfo( wordCount.Objects[ i ] );
995                 if isImportant then begin
996                         if wordInfo.ImportantText > 0 then begin
997                                 wordinfo.ImportantText := wordinfo.ImportantText - 1;
998                                 wordinfo.ImportantWord := wordinfo.ImportantWord - countinfo.WordCount;
999                         end;
1000                 end else begin
1001                         if wordinfo.NormalText > 0 then begin
1002                                 wordinfo.NormalText := wordinfo.NormalText - 1;
1003                                 wordinfo.NormalWord := wordinfo.NormalWord - countinfo.WordCount;
1004                         end;
1005                 end;
1006         end;
1007
1008 end;
1009
1010 end.