OSDN Git Service

else が紛れ込んでたので修正。
[gikonavigoeson/gikonavi.git] / GikoBayesian.pas
1 unit GikoBayesian;
2
3 {!
4 \file           GikoBayesian.pas
5 \brief  \83x\83C\83W\83A\83\93\83t\83B\83\8b\83^
6
7 $Id: GikoBayesian.pas,v 1.16 2004/11/01 10:32:02 yoffy Exp $
8 }
9
10 //! \95½\89¼\96¼\82ð\8e«\8f\91\82É\8aÜ\82ß\82È\82¢
11 {$DEFINE GIKO_BAYESIAN_NO_HIRAGANA_DIC}
12
13 interface
14
15 //==================================================
16 uses
17 //==================================================
18         Classes;
19
20 //==================================================
21 type
22 //==================================================
23
24         {!***********************************************************
25         \brief \92P\8cê\83v\83\8d\83p\83e\83B
26         ************************************************************}
27         TWordInfo       = class( TObject )
28         private
29                 FNormalWord                     :       Integer;        //!< \92Ê\8fí\82Ì\92P\8cê\82Æ\82µ\82Ä\93o\8fê\82µ\82½\89ñ\90\94
30                 FImportantWord  : Integer;      //!< \92\8d\96Ú\92P\8cê\82Æ\82µ\82Ä\93o\8fê\82µ\82½\89ñ\90\94
31                 FNormalText                     : Integer;      //!< \92Ê\8fí\82Ì\92P\8cê\82Æ\82µ\82Ä\8aÜ\82Ü\82ê\82Ä\82¢\82½\95\8fÍ\82Ì\90\94
32                 FImportantText  : Integer;      //!< \92\8d\96Ú\92P\8cê\82Æ\82µ\82Ä\8aÜ\82Ü\82ê\82Ä\82¢\82½\95\8fÍ\82Ì\90\94
33
34         public
35                 property NormalWord                     : Integer       read FNormalWord write FNormalWord;
36                 property ImportantWord  : Integer       read FImportantWord write FImportantWord;
37                 property NormalText                     : Integer       read FNormalText write FNormalText;
38                 property ImportantText  : Integer       read FImportantText write FImportantText;
39         end;
40
41         {!***********************************************************
42         \brief \89ð\90Í\8dÏ\82Ý\92P\8cê\83v\83\8d\83p\83e\83B
43         ************************************************************}
44         TWordCountInfo  = class( TObject )
45         private
46                 FWordCount      :       Integer;        //!< \92P\8cê\90\94
47
48         public
49                 property WordCount      : Integer       read FWordCount write FWordCount;
50         end;
51
52         {!***********************************************************
53         \brief \89ð\90Í\8dÏ\82Ý\92P\8cê\83\8a\83X\83g
54         ************************************************************}
55 //      TWordCount      = class( THashedStringList )    // \8c\83\92x
56         TWordCount      = class( TStringList )
57         public
58                 constructor Create;
59                 destructor Destroy; override;
60         end;
61
62         {!***********************************************************
63         \brief \83t\83B\83\8b\83^\83A\83\8b\83S\83\8a\83Y\83\80
64         ************************************************************}
65         TGikoBayesianAlgorithm =
66                 (gbaPaulGraham, gbaGaryRobinson, gbaGaryRobinsonFisher);
67
68         {!***********************************************************
69         \brief \83x\83C\83W\83A\83\93\83t\83B\83\8b\83^
70         ************************************************************}
71 //      TGikoBayesian = class( THashedStringList )      // \8c\83\92x
72         TGikoBayesian = class( TStringList )
73         private
74                 FFilePath       : string;       //!< \93Ç\82Ý\8d\9e\82ñ\82¾\83t\83@\83C\83\8b\83p\83X
75                 function GetObject( const name : string ) : TWordInfo;
76                 procedure SetObject( const name : string; value : TWordInfo );
77
78         public
79                 constructor Create;
80                 destructor Destroy; override;
81
82                 //! \83t\83@\83C\83\8b\82©\82ç\8aw\8fK\97\9a\97ð\82ð\93Ç\82Ý\8fo\82µ\82Ü\82·
83                 procedure LoadFromFile( const filePath : string );
84
85                 //! \83t\83@\83C\83\8b\82É\8aw\8fK\97\9a\97ð\82ð\95Û\91\82µ\82Ü\82·
86                 procedure SaveToFile( const filePath : string );
87
88                 //! \83t\83@\83C\83\8b\82É\8aw\8fK\97\9a\97ð\82ð\95Û\91\82µ\82Ü\82·
89                 procedure Save;
90
91                 //! \92P\8cê\82É\91Î\82·\82é\8fî\95ñ\82ð\8eæ\93¾\82µ\82Ü\82·
92                 property Objects[ const name : string ] : TWordInfo
93                         read GetObject write SetObject; default;
94
95                 //! \95\8fÍ\82É\8aÜ\82Ü\82ê\82é\92P\8cê\82ð\83J\83E\83\93\83g\82µ\82Ü\82·
96                 procedure CountWord(
97                         const text      : string;
98                         wordCount               : TWordCount );
99
100                 {!
101                 \brief  Paul Graham \96@\82É\8aî\82Ã\82¢\82Ä\95\8fÍ\82Ì\92\8d\96Ú\93x\82ð\8c\88\92è\82µ\82Ü\82·
102                 \return \95\8fÍ\82Ì\92\8d\96Ú\93x (\92\8d\96Ú\82É\92l\82µ\82È\82¢ 0.0\81`1.0 \92\8d\96Ú\82·\82×\82«)
103                 }
104                 function CalcPaulGraham( wordCount : TWordCount ) : Extended;
105
106                 {!
107                 \brief  GaryRobinson \96@\82É\8aî\82Ã\82¢\82Ä\95\8fÍ\82Ì\92\8d\96Ú\93x\82ð\8c\88\92è\82µ\82Ü\82·
108                 \return \95\8fÍ\82Ì\92\8d\96Ú\93x (\92\8d\96Ú\82É\92l\82µ\82È\82¢ 0.0\81`1.0 \92\8d\96Ú\82·\82×\82«)
109                 }
110                 function CalcGaryRobinson( wordCount : TWordCount ) : Extended;
111
112                 {!
113                 \brief  GaryRobinson-Fisher \96@\82É\8aî\82Ã\82¢\82Ä\95\8fÍ\82Ì\92\8d\96Ú\93x\82ð\8c\88\92è\82µ\82Ü\82·
114                 \return \95\8fÍ\82Ì\92\8d\96Ú\93x (\92\8d\96Ú\82É\92l\82µ\82È\82¢ 0.0\81`1.0 \92\8d\96Ú\82·\82×\82«)
115                 }
116                 function CalcGaryRobinsonFisher( wordCount : TWordCount ) : Extended;
117
118                 {!
119                 \brief  \95\8fÍ\82ð\89ð\90Í
120                 \param  text                                    \89ð\90Í\82·\82é\95\8fÍ
121                 \param  wordCount                       \89ð\90Í\82³\82ê\82½\92P\8cê\83\8a\83X\83g\82ª\95Ô\82é
122                 \param  algorithm                       \92\8d\96Ú\93x\82Ì\8c\88\92è\82É\97p\82¢\82é\83A\83\8b\83S\83\8a\83Y\83\80\82ð\8ew\92è\82µ\82Ü\82·
123                 \return \95\8fÍ\82Ì\92\8d\96Ú\93x (\92\8d\96Ú\82É\92l\82µ\82È\82¢ 0.0\81`1.0 \92\8d\96Ú\82·\82×\82«)
124
125                 CountWord \82Æ Calcxxxxx \82ð\82Ü\82Æ\82ß\82Ä\8eÀ\8ds\82·\82é\82¾\82¯\82Å\82·\81B
126                 }
127                 function Parse(
128                         const text                              : string;
129                         wordCount                                       : TWordCount;
130                         algorithm                                       : TGikoBayesianAlgorithm = gbaGaryRobinsonFisher
131                 ) : Extended;
132
133                 {!
134                 \brief  \8aw\8fK\82·\82é
135                 \param  wordCount               Parse \82Å\89ð\90Í\82³\82ê\82½\92P\8cê\83\8a\83X\83g
136                 \param  isImportant \92\8d\96Ú\82·\82×\82«\95\8fÍ\82Æ\82µ\82Ä\8ao\82¦\82é\82È\82ç True
137                 }
138                 procedure Learn(
139                         wordCount                : TWordCount;
140                         isImportant      : Boolean );
141
142                 {!
143                 \brief          \8aw\8fK\8c\8b\89Ê\82ð\96Y\82ê\82é
144                 \param          wordCount               Parse \82Å\89ð\90Í\82³\82ê\82½\92P\8cê\83\8a\83X\83g
145                 \param          isImportant     \92\8d\96Ú\82·\82×\82«\95\8fÍ\82Æ\82µ\82Ä\8ao\82¦\82ç\82ê\82Ä\82¢\82½\82È\82ç True
146                 \warning        \8aw\8fK\8dÏ\82Ý\82Ì\95\8fÍ\82©\82Ç\82¤\82©\82Í\8am\94F\8fo\97\88\82Ü\82¹\82ñ\81B<br>
147                                                         Learn \82µ\82Ä\82¢\82È\82¢\95\8fÍ\82â isImportant \82ª\8aÔ\88á\82Á\82Ä\82¢\82é\95\8fÍ\82ð
148                                                         Forget \82·\82é\82Æ\83f\81[\83^\83x\81[\83X\82ª\94j\91¹\82µ\82Ü\82·\81B<br>
149                                                         \8aw\8fK\8dÏ\82Ý\82©\82Ç\82¤\82©\82Í\93Æ\8e©\82É\8aÇ\97\9d\82µ\82Ä\82­\82¾\82³\82¢\81B
150
151                 \91S\82Ä\82Ì\8aw\8fK\8c\8b\89Ê\82ð\83N\83\8a\83A\82·\82é\82í\82¯\82Å\82Í\82 \82è\82Ü\82¹\82ñ\81B<br>
152                 wordCount \82ð\93¾\82½\95\8fÍ (Parse \82Ì text \88ø\90\94\82Ì\8aw\8fK\8c\8b\89Ê\82Ì\82Ý\83N\83\8a\83A\82µ\82Ü\82·\81B<br><br>
153
154                 \8eå\82É\92\8d\96Ú\95\8fÍ\82Æ\94ñ\92\8d\96Ú\95\8fÍ\82ð\90Ø\82è\91Ö\82¦\82é\82½\82ß\82É Forget -> Learn \82Ì\8f\87\82Å\8eg\97p\82µ\82Ü\82·\81B
155                 }
156                 procedure       Forget(
157                         wordCount               : TWordCount;
158                         isImportant     : Boolean );
159         end;
160
161 //==================================================
162 implementation
163 //==================================================
164
165 uses
166         SysUtils, Math, Windows,
167         MojuUtils;
168
169 const
170         GIKO_BAYESIAN_FILE_VERSION      = '1.0';
171 {
172         Modes                           = (ModeWhite, ModeGraph, ModeAlpha, ModeHanKana, ModeNum,
173                                                                 ModeWGraph, ModeWAlpha, ModeWNum,
174                                                                 ModeWHira, ModeWKata, ModeWKanji);
175 }
176         CharMode1 : array [ 0..255 ] of Byte =
177         (
178                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
179                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
180                 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
181                 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
182                 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
183                 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1,
184                 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
185                 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 0,
186
187                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
188                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
189                 0, 1, 1, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
190                 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
191                 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
192                 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
193                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
194                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
195         );
196
197 //************************************************************
198 // misc
199 //************************************************************
200
201 //==============================
202 // RemoveToken
203 //==============================
204 function RemoveToken(var s: string;const delimiter: string): string;
205 var
206         p: Integer;
207 begin
208         p := AnsiPos(delimiter, s);
209         if p = 0 then
210                 Result := s
211         else
212                 Result := Copy(s, 1, p - 1);
213         s := Copy(s, Length(Result) + Length(delimiter) + 1, Length(s));
214 end;
215
216 //==============================
217 // AbsSort
218 //==============================
219 function AbsSort( p1, p2 : Pointer ) : Integer;
220 var
221         v1, v2 : Single;
222 begin
223
224         v1 := Abs( Single( p1 ) - 0.5 );
225         v2 := Abs( Single( p2 ) - 0.5 );
226         if v1 > v2 then
227                 Result := -1
228         else if v1 = v2 then
229                 Result := 0
230         else
231                 Result := 1;
232
233 end;
234
235 //************************************************************
236 // TWordCount class
237 //************************************************************
238 constructor TWordCount.Create;
239 begin
240
241                 Duplicates              := dupIgnore;
242                 CaseSensitive   := True;
243                 Sorted                          := True;
244
245 end;
246
247 destructor TWordCount.Destroy;
248 var
249         i : Integer;
250 begin
251
252         for i := Count - 1 downto 0 do
253                 if Objects[ i ] <> nil then
254                         Objects[ i ].Free;
255
256         inherited;
257
258 end;
259
260 //************************************************************
261 // TGikoBayesian class
262 //************************************************************
263
264 //==============================
265 // Create
266 //==============================
267 constructor TGikoBayesian.Create;
268 begin
269
270         Duplicates              := dupIgnore;
271         CaseSensitive   := True;
272         Sorted                          := True;
273
274 end;
275
276 //==============================
277 // Destroy
278 //==============================
279 destructor TGikoBayesian.Destroy;
280 var
281         i : Integer;
282 begin
283
284         for i := Count - 1 downto 0 do
285                 if inherited Objects[ i ] <> nil then
286                         inherited Objects[ i ].Free;
287
288         inherited;
289
290 end;
291
292 procedure TGikoBayesian.LoadFromFile( const filePath : string );
293 var
294         i                       : Integer;
295         sl              : TStringList;
296         s                       : string;
297         name    : string;
298         info    : TWordInfo;
299 begin
300
301         FFilePath := filePath;
302
303         if not FileExists( filePath ) then
304                 Exit;
305
306         sl := TStringList.Create;
307         try
308                 sl.LoadFromFile( filePath );
309
310                 for i := 1 to sl.Count - 1 do begin
311                         s := sl[ i ];
312                         name := RemoveToken( s, #1 );
313                         info := TWordInfo.Create;
314                         info.NormalWord                 := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
315                         info.ImportantWord      := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
316                         info.NormalText                 := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
317                         info.ImportantText      := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
318
319                         AddObject( name, info );
320                 end;
321         finally
322                 sl.Free;
323         end;
324
325 end;
326
327 procedure TGikoBayesian.SaveToFile( const filePath : string );
328 var
329         i                       : Integer;
330         sl              : TStringList;
331         s                       : string;
332         info    : TWordInfo;
333 begin
334
335         FFilePath := filePath;
336
337         sl := TStringList.Create;
338         try
339                 sl.BeginUpdate;
340                 sl.Add( GIKO_BAYESIAN_FILE_VERSION );
341
342                 for i := 0 to Count - 1 do begin
343                         info := TWordInfo( inherited Objects[ i ] );
344                         s := Strings[ i ] + #1
345                                  + Format('%x', [info.NormalWord]) + #1
346                                  + Format('%x', [info.ImportantWord]) + #1
347                                  + Format('%x', [info.NormalText]) + #1
348                                  + Format('%x', [info.ImportantText]);
349
350                         sl.Add(s);
351                 end;
352                 sl.EndUpdate;
353                 sl.SaveToFile( filePath );
354         finally
355                 sl.Free;
356         end;
357
358 end;
359
360 procedure TGikoBayesian.Save;
361 begin
362
363         if FFilePath <> '' then
364                 SaveToFile( FFilePath );
365
366 end;
367
368 //==============================
369 // GetObject
370 //==============================
371 function TGikoBayesian.GetObject( const name : string ) : TWordInfo;
372 var
373         idx : Integer;
374 begin
375
376         if Find( name, idx ) then
377                 Result := TWordInfo( inherited Objects[ idx ] )
378         else
379                 Result := nil;
380
381 end;
382
383 //==============================
384 // SetObject
385 //==============================
386 procedure TGikoBayesian.SetObject( const name : string; value : TWordInfo );
387 var
388         idx : Integer;
389 begin
390
391         if Find( name, idx ) then
392                 inherited Objects[ idx ] := value
393         else
394                 AddObject( name, value );
395
396 end;
397
398
399 //==============================
400 // CountWord
401 //==============================
402 procedure TGikoBayesian.CountWord(
403         const text      : string;
404         wordCount               : TWordCount );
405 type
406         Modes                           = (ModeWhite, ModeGraph, ModeAlpha, ModeNum, ModeHanKana,
407                                                                 ModeWGraph, ModeWAlpha, ModeWNum,
408                                                                 ModeWHira, ModeWKata, ModeWKanji);
409 var
410         p, tail, last                   : PChar;
411         mode, newMode                   : Modes;
412         ch                                                              : Longword;
413         chSize                                          : Integer;
414         wHiraDelimiter          : TStringList;
415         wHiraFinalDelimiter     : TStringList;
416         wKanjiDelimiter         : TStringList;
417         words                                                   : TStringList;
418         aWord                                                   : string;
419         countInfo                                       : TWordCountInfo;
420
421         function cutBoth( _aWord : string; _delim : TStringList ) : string;
422         var
423                 _i                      : Integer;
424         begin
425                 for _i := 0 to _delim.Count - 1 do begin
426                         _aWord := CustomStringReplace(
427                                 _aWord,
428                                 _delim[ _i ],
429                                 #10 + _delim[ _i ] + #10, False );
430                 end;
431                 Result := _aWord;
432         end;
433
434         function cutFirst( _aWord : string; _delim : TStringList ) : string;
435         var
436                 _i                      : Integer;
437         begin
438                 for _i := 0 to _delim.Count - 1 do begin
439                         _aWord := CustomStringReplace(
440                                 _aWord,
441                                 _delim[ _i ],
442                                 #10 + _delim[ _i ], False );
443                 end;
444                 Result := _aWord;
445         end;
446
447         function cutFinal( _aWord : string; _delim : TStringList ) : string;
448         var
449                 _i                      : Integer;
450         begin
451                 for _i := 0 to _delim.Count - 1 do begin
452                         _aWord := CustomStringReplace(
453                                 _aWord,
454                                 _delim[ _i ],
455                                 _delim[ _i ] + #10, False );
456                 end;
457                 Result := _aWord;
458         end;
459
460         procedure addWord( _dst : TWordCount; _words : TStringList );
461         var
462                 _aWord                  : string;
463                 _i, _idx                : Integer;
464                 _countInfo      : TWordCountInfo;
465         begin
466                 for _i := 0 to _words.Count - 1 do begin
467                         _aWord := _words[ _i ];
468                         if Length( _aWord ) > 0 then begin
469                                 if _dst.Find( _aWord, _idx ) then begin
470                                         _countInfo := TWordCountInfo( _dst.Objects[ _idx ] );
471                                 end else begin
472                                         _countInfo := TWordCountInfo.Create;
473                                         _dst.AddObject( _aWord, _countInfo );
474                                 end;
475                                 _countInfo.WordCount := _countInfo.WordCount + 1;
476                         end;
477                 end;
478         end;
479
480         function changeMode( _aWord : string; _mode : Modes ) : string;
481         var
482                 _i                                                                      : Integer;
483                 _aWord2                                                 : string;
484                 _pWord, _pWord2                 : PChar;
485                 _pWordTail, _pFound     : PChar;
486         const
487                 _delim : string = #10;
488         begin
489 {$IFDEF GIKO_BAYESIAN_NO_HIRAGANA_DIC}
490                 if mode = ModeWHira then begin
491                         Result := '';
492                         Exit;
493                 end;
494 {$ENDIF}
495                 if Ord( _mode ) >= Ord( ModeWGraph ) then begin
496                         // \93ú\96{\8cê
497                         // \83X\83y\81[\83X\82ð\8bl\82ß\82é
498                         _aWord := CustomStringReplace( _aWord, ' ', '', False );
499                         _aWord := CustomStringReplace( _aWord, '\81@', '', False );
500
501                         // \83f\83\8a\83~\83^\82Å\92P\8cê\95ª\82¯
502                         case mode of
503                         ModeWHira:
504                                 begin
505                                         _aWord := cutFinal( _aWord, wHiraFinalDelimiter );
506                                         Result := cutBoth( _aWord, wHiraDelimiter );
507                                 end;
508
509                         ModeWKanji:
510                                 begin
511                                         // \83f\83\8a\83~\83^\82Å\92P\8cê\95ª\82¯
512                                         _aWord := cutBoth( _aWord, wKanjiDelimiter );
513                                         // 4 byte (2 \8e\9a\82¸\82Â\82Å\92P\8cê\95ª\82¯
514                                         _pWord := PChar( _aWord );
515                                         _i := Length( _aWord );
516                                         _pWordTail := _pWord + _i;
517                                         SetLength( _aWord2, _i + (_i shr 2) );
518                                         _pWord2 := PChar( _aWord2 );
519
520                                         while _pWord < _pWordTail do begin
521                                                 _pFound := AnsiStrPos( _pWord, PChar( _delim ) );
522                                                 if _pFound = nil then
523                                                         _pFound := _pWordTail;
524                                                 _pFound := _pFound - 3;
525
526                                                 while _pWord <= _pFound do begin
527                                                         CopyMemory( _pWord2, _pWord, 4 ); _pWord2[ 4 ] := #10;
528                                                         _pWord2 := _pWord2 + 5; _pWord := _pWord + 4;
529                                                 end;
530                                                 _i := _pFound + 4 - _pWord; // 4 = 3 + #10
531                                                 CopyMemory( _pWord2, _pWord, _i );
532                                                 _pWord2 := _pWord2 + _i; _pWord := _pWord + _i;
533                                         end;
534                                         if _pWord < _pWordTail then begin
535                                                 _i := _pWordTail - _pWord;
536                                                 CopyMemory( _pWord2, _pWord, _i );
537                                                 _pWord2 := _pWord2 + _i;
538                                         end;
539                                         SetLength( _aWord2, _pWord2 - PChar( _aWord2 ) );
540
541                                         Result := _aWord2;
542                                 end;
543
544                         else
545                                 Result := _aWord;
546                         end;
547                 end else begin
548                         Result := _aWord;
549                 end;
550         end;
551 const
552         WHIRA_DELIMITER = '\82ð' + #10 + '\82É' + #10 + '\82ª' + #10 + '\82Æ' + #10 + '\82©\82ç'
553                 + #10 + '\82Ö' + #10 + '\82æ\82è' + #10 + '\82Ü\82Å'+ #10 + '\82Å'
554                 + #10 + '\82±\82±' + #10 + '\82»\82±' + #10 + '\82Ç\82±'
555                 + #10 + '\82±\82ê' + #10 + '\82»\82ê' + #10 + '\82 \82ê' + #10 + '\82Ç\82ê'
556                 + #10 + '\82±\82Ì' + #10 + '\82»\82Ì' + #10 + '\82 \82Ì' + #10 + '\82Ç\82Ì'
557                 + #10 + '\82±\82¤' + #10 + '\82»\82¤' + #10 + '\82 \82 ' + #10 + '\82Ç\82¤'
558                 + #10 + '\82±\82ñ\82È' + #10 + '\82»\82ñ\82È' + #10 + '\82 \82ñ\82È' + #10 + '\82Ç\82ñ\82È'
559                 + #10 + '\82ê\82½' + #10 + '\82ê\82Ä' + #10 + '\82ê\82ê' + #10 + '\82ê\82ë'
560                 + #10 + '\82ê\82é' + #10 + '\82ç\82ê\82é'
561                 + #10 + '\82Å\82·' + #10 + '\82Ü\82·' + #10 + '\82Ü\82¹\82ñ'
562                 + #10 + '\82Å\82µ\82½' + #10 + '\82Ü\82µ\82½'
563                 + #10 + '\82·\82é' + #10 + '\82µ\82È\82¢' + #10 + '\82³\82ê\82é' + #10 + '\82³\82ê\82È\82¢'
564                 ;
565         WKANJI_DELIMITER = '\93I' + #10 + '\90«' + #10 + '\8e®' + #10 + '\89»' + #10 + '\96@'
566                 + #10 + '\95s' + #10 + '\96³' + #10 + '\94ñ' + #10 + '\94½'
567                 ;
568         WHIRA_FINAL_DELIMITER = '\82Á\82½' + #10 + '\82Á\82Ä'
569                 ;{
570                 + #10 + '\82æ\82Á\82Ä' + #10 + '\82µ\82½\82ª\82Á\82Ä' + #10 + '\82È\82Ì\82Å'
571                 + #10 + '\82¾\82©\82ç' + #10 + '\82Å\82·\82©\82ç'
572                 + #10 + '\82Ü\82½'
573                 + #10 + '\82µ\82©\82µ' + #10 + '\82¾\82ª' + #10 + '\82¯\82Ç' + #10 + '\82¯\82ê\82Ç'
574                 + #10 + '\82â\82Í\82è' + #10 + '\82â\82Á\82Ï\82è'
575                 + #10 + '\82Å\82µ' + #10 + '\82¾\82ë'
576                 + #10 + '\82·\82é' + #10 + '\82µ\82È\82¢' + #10 + '\82µ\82½' + #10 + '\82µ\82È\82¢'
577                 ;}
578         // '\81[' \82ð '\82\9f\82¡\82£\82¥\82§' \82É\81B
579         HA_LINE = '\82 \82©\82³\82½\82È\82Í\82Ü\82â\82ç\82í\82ª\82´\82¾\82Î\82Ï\82\9f\82ì';
580         HI_LINE = '\82¢\82«\82µ\82¿\82É\82Ð\82Ý\82è\82î\82¬\82\82Ñ\82Ò\82¡';
581         HU_LINE = '\82¤\82­\82·\82Â\82Ê\82Ó\82Þ\82ä\82é\82®\82Ô\82Õ\82£';
582         HE_LINE = '\82¦\82¯\82¹\82Ä\82Ë\82Ö\82ß\82ê\82ï\82°\82×\82Ø\82¥';
583         HO_LINE = '\82¨\82±\82»\82Æ\82Ì\82Ù\82à\82æ\82ë\82ð\82²\82Ú\82Û\82§';
584         KA_LINE = '\83A\83J\83T\83^\83i\83n\83}\83\84\83\89\83\8f\83K\83U\83_\83o\83p\83@\83\95\83\8e';
585         KI_LINE = '\83C\83L\83V\83`\83j\83q\83~\83\8a\83\90\83M\83W\83r\83s\83B';
586         KU_LINE = '\83E\83N\83X\83c\83k\83t\83\80\83\86\83\8b\83O\83u\83v\83D\83\94';
587         KE_LINE = '\83G\83P\83Z\83e\83l\83w\83\81\83\8c\83\91\83Q\83x\83y\83F\83\96';
588         KO_LINE = '\83I\83R\83\\83g\83m\83z\83\82\83\88\83\8d\83\92\83S\83{\83|\83H';
589         kKanji = [$80..$A0, $E0..$ff];
590 begin
591
592         wHiraDelimiter  := TStringList.Create;
593         wHiraFinalDelimiter := TStringList.Create;
594         wKanjiDelimiter := TStringList.Create;
595         words := TStringList.Create;
596         try
597                 mode := ModeWhite;
598 {$IFNDEF GIKO_BAYESIAN_NO_HIRAGANA_DIC}
599                 wHiraDelimiter.Text := WHIRA_DELIMITER;
600                 wHiraFinalDelimiter.Text := WHIRA_FINAL_DELIMITER;
601 {$ENDIF}
602                 wKanjiDelimiter.Text := WKANJI_DELIMITER;
603                 p                       := PChar( text );
604                 tail    := p + Length( text );
605                 last    := p;
606
607                 while p < tail do begin
608                         // \95\8e\9a\82Ì\83^\83C\83v\82ð\94»\95Ê
609                         // \81¦\8bå\93Ç\93_\82Í ModeGraph \82É\82È\82é\82Ì\82Å\8cÂ\95Ê\82É\91Î\89\9e\82µ\82È\82­\82Ä\82à\82¢\82¢
610 //                      if Byte(Byte( p^ ) - $a1) < $5e then begin
611                         if Byte( p^ ) in kKanji then begin
612                                 if p + 1 < tail then begin
613                                         ch := (PByte( p )^ shl 8) or PByte( p + 1 )^;
614                                         case ch of
615                                         // \83X\83y\81[\83X\82Å\92P\8cê\95ª\82¯\82¹\82¸\82É\8bl\82ß\82é
616                                         //$8140:                                                        newMode := ModeWhite;
617                                         $8141..$824e:                           newMode := ModeWGraph;
618                                         $824f..$8258:                           newMode := ModeWNum;
619                                         $8260..$829a:                           newMode := ModeWAlpha;
620                                         $829f..$82f1:                           newMode := ModeWHira;
621                                         $8340..$8396:                           newMode := ModeWKata;
622                                         else                                                            newMode := ModeWKanji;
623                                         end;
624                                         // '\81J\81K\81[' \82Í\95½\89¼\96¼\81A\82Ü\82½\82Í\83J\83^\83J\83i\82É\8aÜ\82Ü\82ê\82é
625                                         if (mode = ModeWHira) or (mode = ModeWKata) then
626                                                 if (ch = $814a) or (ch = $814b) or (ch = $815b) then
627                                                         newMode := mode;
628                                 end else begin
629                                         newMode := ModeWhite;
630                                 end;
631
632                                 chSize := 2;
633                         end else begin
634                                 newMode := Modes( CharMode1[ Byte( p^ ) ] );
635                                 if (p^ = ' ') and (Ord( mode ) >= Ord( ModeWGraph )) then begin
636                                         // \8d¡\82Ü\82Å\93ú\96{\8cê\82Å\8d¡\83X\83y\81[\83X
637                                         // \92P\8cê\82ð\8cq\82°\82Ä\8cã\82Å\83X\83y\81[\83X\82ð\8bl\82ß\82é
638                                         // \81¦\94¼\8ap\83J\83i\82Í\92Ê\8fí\83X\83y\81[\83X\82Å\8bæ\90Ø\82é\82¾\82ë\82¤\82©\82ç\8bl\82ß\82È\82¢
639                                         newMode := mode;
640                                 end;
641
642                                 chSize := 1;
643                         end;
644
645                         if mode <> newMode then begin
646
647                                 // \95\8e\9a\82Ì\83^\83C\83v\82ª\95Ï\8dX\82³\82ê\82½
648                                 if mode <> ModeWhite then begin
649                                         SetLength( aWord, p - last );
650                                         CopyMemory( PChar( aWord ), last, p - last );
651
652                                         words.Text := changeMode( aWord, mode );
653
654                                         // \92P\8cê\93o\98^
655                                         addWord( wordCount, words );
656                                 end;
657
658                                 last := p;
659                                 mode := newMode;
660
661                         end;
662
663                         p := p + chSize;
664                 end;    // while
665
666                 if mode <> ModeWhite then begin
667                         SetLength( aWord, p - last );
668                         CopyMemory( PChar( aWord ), last, p - last );
669
670                         words.Text := changeMode( aWord, mode );
671
672                         // \92P\8cê\93o\98^
673                         addWord( wordCount, words );
674                 end;
675         finally
676                 words.Free;
677                 wKanjiDelimiter.Free;
678                 wHiraFinalDelimiter.Free;
679                 wHiraDelimiter.Free;
680         end;
681
682 end;
683
684 //==============================
685 // CalcPaulGraham
686 //==============================
687 function TGikoBayesian.CalcPaulGraham( wordCount : TWordCount ) : Extended;
688
689         function p( const aWord : string ) : Single;
690         var
691                 info : TWordInfo;
692         begin
693                 info := Objects[ aWord ];
694                 if info = nil then
695                         Result := 0.415
696                 else if info.NormalWord = 0 then
697                         Result := 0.99
698                 else if info.ImportantWord = 0 then
699                         Result := 0.01
700                 else if info.ImportantWord + info.NormalWord * 2 < 5 then
701                         Result := 0.5
702                 else
703                         Result := ( info.ImportantWord / info.ImportantText ) /
704                                 ((info.NormalWord * 2 / info.NormalText ) +
705                                  (info.ImportantWord / info.ImportantText));
706         end;
707
708 var
709         s, q                            : Extended;
710         i                                               : Integer;
711         narray                  : TList;
712 const
713         SAMPLE_COUNT    = 15;
714 begin
715
716         Result := 1;
717         if wordCount.Count = 0 then
718                 Exit;
719
720         narray := TList.Create;
721         try
722                 for i := 0 to wordCount.Count - 1 do begin
723                         narray.Add( Pointer( p( wordCount[ i ] ) ) );
724                 end;
725
726                 narray.Sort( AbsSort );
727
728                 s := 1;
729                 q := 1;
730                 i := min( SAMPLE_COUNT, narray.Count );
731                 while i > 0 do begin
732                         Dec( i );
733
734                         s := s * Single( narray[ i ] );
735                         q := q * (1 - Single( narray[ i ] ));
736                 end;
737
738                 Result := s / (s + q);
739         finally
740                 narray.Free;
741         end;
742
743 end;
744
745 //==============================
746 // CalcGaryRobinson
747 //==============================
748 function TGikoBayesian.CalcGaryRobinson( wordCount : TWordCount ) : Extended;
749
750         function p( const aWord : string ) : Single;
751         var
752                 info : TWordInfo;
753         begin
754                 info := Objects[ aWord ];
755                 if info = nil then
756                         Result := 0.415
757                 else if info.ImportantWord = 0 then
758                         Result := 0.01
759                 else if info.NormalWord = 0 then
760                         Result := 0.99
761                 else
762                 {
763                         Result := ( info.ImportantWord / info.ImportantText ) /
764                                 ((info.NormalWord / info.NormalText ) +
765                                  (info.ImportantWord / info.ImportantText));
766                 }
767                         Result := (info.ImportantWord * info.NormalText) /
768                                 (info.NormalWord * info.ImportantText +
769                                 info.ImportantWord * info.NormalText);
770         end;
771
772         function f( cnt : Integer; n, mean : Single ) : Extended;
773         const
774                 k = 0.001;
775         begin
776                 Result := ( (k * mean) + (cnt * n) ) / (k + cnt);
777         end;
778
779 var
780         n                                               : Extended;
781         narray                  : array of Single;
782         mean                            : Extended;
783         countInfo               : TWordCountInfo;
784         i                                               : Integer;
785         P1, Q1, R1      : Extended;
786         cnt                                     : Extended;
787 begin
788
789         if wordCount.Count = 0 then begin
790                 Result := 1;
791                 Exit;
792         end;
793
794         SetLength( narray, wordCount.Count );
795         mean := 0;
796         for i := 0 to wordCount.Count - 1 do begin
797                 n                                               := p( wordCount[ i ] );
798                 narray[ i ]     := n;
799                 mean                            := mean + n;
800         end;
801         mean := mean / wordCount.Count;
802
803         P1 := 1;
804         Q1 := 1;
805         for i := 0 to wordCount.Count - 1 do begin
806                 countInfo       := TWordCountInfo( wordCount.Objects[ i ] );
807                 n                                               := f( countInfo.WordCount, narray[ i ], mean );
808                 P1 := P1 * ( 1 - n );
809                 Q1 := Q1 * n;
810         end;
811         cnt := wordCount.Count;
812         if cnt = 0 then
813                 cnt := 1;
814         P1 := 1 - Power( P1, 1 / cnt );
815         Q1 := 1 - Power( Q1, 1 / cnt );
816
817         if P1 + Q1 = 0 then begin
818                 Result := 0.5
819         end else begin
820                 n := (P1 - Q1) / (P1 + Q1);
821                 Result := (1 + n) / 2;
822         end;
823
824 end;
825
826 //==============================
827 // CalcGaryRobinsonFisher
828 //==============================
829 function TGikoBayesian.CalcGaryRobinsonFisher(
830         wordCount : TWordCount
831 ) : Extended;
832
833         function p( const aWord : string ) : Single;
834         var
835                 info                            : TWordInfo;
836         begin
837                 info := Objects[ aWord ];
838                 if info = nil then
839                         Result := 0.415
840                 else if info.ImportantWord = 0 then
841                         Result := 0.01
842                 else if info.NormalWord = 0 then
843                         Result := 0.99
844                 else
845                 {
846                         Result := ( info.ImportantWord / info.ImportantText ) /
847                                 ((info.NormalWord / info.NormalText ) +
848                                  (info.ImportantWord / info.ImportantText));
849                 }
850                         Result := (info.ImportantWord * info.NormalText) /
851                                 (info.NormalWord * info.ImportantText +
852                                 info.ImportantWord * info.NormalText);
853         end;
854
855         function f( cnt : Integer; n, mean : Single ) : Extended;
856         const
857                 k = 0.001;
858         begin
859                 Result := ( (k * mean) + (cnt * n) ) / (k + cnt);
860         end;
861
862         function prbx( x2, degree : Extended ) : Extended;
863         begin
864
865                 Result := 0.5;
866
867         end;
868
869 var
870         n                                               : Extended;
871         narray                  : array of Single;
872         mean                            : Extended;
873         countInfo               : TWordCountInfo;
874         i                                               : Integer;
875         normal                  : Extended;
876         important               : Extended;
877         P1, Q1                  : Extended;
878         cnt                                     : Extended;
879 begin
880
881         if wordCount.Count = 0 then begin
882                 Result := 1;
883                 Exit;
884         end;
885
886         SetLength( narray, wordCount.Count );
887         mean := 0;
888         for i := 0 to wordCount.Count - 1 do begin
889                 n                                               := p( wordCount[ i ] );
890                 narray[ i ]     := n;
891                 mean                            := mean + n;
892         end;
893         mean := mean / wordCount.Count;
894
895         P1 := 1;
896         Q1 := 1;
897         for i := 0 to wordCount.Count - 1 do begin
898                 countInfo       := TWordCountInfo( wordCount.Objects[ i ] );
899                 n                                               := f( countInfo.WordCount, narray[ i ], mean );
900                 P1 := P1 * ( 1 - n );
901                 Q1 := Q1 * n;
902         end;
903         cnt := wordCount.Count;
904         if cnt = 0 then
905                 cnt := 1;
906         P1 := Power( P1, 1 / cnt );
907         Q1 := Power( Q1, 1 / cnt );
908
909         P1 := 1 - prbx( -2 * Ln( P1 ), 2 * cnt );
910         Q1 := 1 - prbx( -2 * Ln( Q1 ), 2 * cnt );
911
912         Result := (1 + P1 - Q1) / 2;
913
914 end;
915
916 //==============================
917 // Parse
918 //==============================
919 function TGikoBayesian.Parse(
920         const text                              : string;
921         wordCount                                       : TWordCount;
922         algorithm                                       : TGikoBayesianAlgorithm
923 ) : Extended;
924 begin
925
926         CountWord( text, wordCount );
927         case algorithm of
928         gbaPaulGraham:          Result := CalcPaulGraham( wordCount );
929         gbaGaryRobinson:        Result := CalcGaryRobinson( wordCount );
930         gbaGaryRobinsonFisher:
931                                                                                 Result := CalcGaryRobinsonFisher( wordCount );
932         else                                                    Result := 0;
933         end;
934
935 end;
936
937 //==============================
938 // Learn
939 //==============================
940 procedure TGikoBayesian.Learn(
941         wordCount                : TWordCount;
942         isImportant      : Boolean );
943 var
944         aWord                   : string;
945         wordinfo        : TWordInfo;
946         countinfo       : TWordCountInfo;
947         i                                       : Integer;
948 begin
949
950         for i := 0 to wordCount.Count - 1 do begin
951                 aWord := wordCount[ i ];
952                 wordinfo := Objects[ aWord ];
953                 countinfo := TWordCountInfo( wordCount.Objects[ i ] );
954                 if wordinfo = nil then begin
955                         wordinfo := TWordInfo.Create;
956                         Objects[ aWord ] := wordinfo;
957                 end;
958
959                 if isImportant then begin
960                         wordinfo.ImportantWord := wordinfo.ImportantWord + countinfo.WordCount;
961                         wordinfo.ImportantText := wordinfo.ImportantText + 1;
962                 end else begin
963                         wordinfo.NormalWord := wordinfo.NormalWord + countinfo.WordCount;
964                         wordinfo.NormalText := wordinfo.NormalText + 1;
965                 end;
966         end;
967
968 end;
969
970 //==============================
971 // Forget
972 //==============================
973 procedure       TGikoBayesian.Forget(
974         wordCount               : TWordCount;
975         isImportant     : Boolean );
976 var
977         aWord                   : string;
978         wordinfo        : TWordInfo;
979         countinfo       : TWordCountInfo;
980         i                       : Integer;
981 begin
982
983         for i := 0 to wordCount.Count - 1 do begin
984                 aWord := wordCount[ i ];
985                 wordinfo := Objects[ aWord ];
986                 if wordinfo = nil then
987                         Continue;
988
989                 countinfo := TWordCountInfo( wordCount.Objects[ i ] );
990                 if isImportant then begin
991                         if wordInfo.ImportantText > 0 then begin
992                                 wordinfo.ImportantText := wordinfo.ImportantText - 1;
993                                 wordinfo.ImportantWord := wordinfo.ImportantWord - countinfo.WordCount;
994                         end;
995                 end else begin
996                         if wordinfo.NormalText > 0 then begin
997                                 wordinfo.NormalText := wordinfo.NormalText - 1;
998                                 wordinfo.NormalWord := wordinfo.NormalWord - countinfo.WordCount;
999                         end;
1000                 end;
1001         end;
1002
1003 end;
1004
1005 end.