5 \brief Bayesian filter
7 $Id: GikoBayesian.pas,v 1.17 2004/11/05 14:24:26 h677 Exp $
10 //! Do not include hiragana in the dictionary
11 {$DEFINE GIKO_BAYESIAN_NO_HIRAGANA_DIC}
15 //==================================================
17 //==================================================
20 //==================================================
22 //==================================================
24 {!***********************************************************
25 \brief Word properties
26 ************************************************************}
27 TWordInfo = class( TObject )
29 FNormalWord : Integer; //!< Number of times the word appeared as a normal word
30 FImportantWord : Integer; //!< Number of times the word appeared as an important (noteworthy) word
31 FNormalText : Integer; //!< Number of texts that contained the word as a normal word
32 FImportantText : Integer; //!< Number of texts that contained the word as an important (noteworthy) word
35 property NormalWord : Integer read FNormalWord write FNormalWord;
36 property ImportantWord : Integer read FImportantWord write FImportantWord;
37 property NormalText : Integer read FNormalText write FNormalText;
38 property ImportantText : Integer read FImportantText write FImportantText;
41 {!***********************************************************
42 \brief Properties of an analyzed word
43 ************************************************************}
44 TWordCountInfo = class( TObject )
46 FWordCount : Integer; //!< Word count
49 property WordCount : Integer read FWordCount write FWordCount;
52 {!***********************************************************
53 \brief List of analyzed words
54 ************************************************************}
55 // TWordCount = class( THashedStringList ) // extremely slow
56 TWordCount = class( TStringList )
59 destructor Destroy; override;
62 {!***********************************************************
63 \brief Filter algorithm
64 ************************************************************}
65 TGikoBayesianAlgorithm =
66 (gbaPaulGraham, gbaGaryRobinson, gbaGaryRobinsonFisher);
68 {!***********************************************************
69 \brief Bayesian filter
70 ************************************************************}
71 // TGikoBayesian = class( THashedStringList ) // extremely slow
72 TGikoBayesian = class( TStringList )
74 FFilePath : string; //!< Path of the file that was loaded
75 function GetObject( const name : string ) : TWordInfo;
76 procedure SetObject( const name : string; value : TWordInfo );
80 destructor Destroy; override;
82 //! Reads the learning history from a file
83 procedure LoadFromFile( const filePath : string );
85 //! Saves the learning history to a file
86 procedure SaveToFile( const filePath : string );
88 //! Saves the learning history to a file
91 //! Gets the information stored for a word
92 property Objects[ const name : string ] : TWordInfo
93 read GetObject write SetObject; default;
95 //! Counts the words contained in a text
98 wordCount : TWordCount );
101 \brief Determines how noteworthy a text is, based on Paul Graham's method
102 \return Noteworthiness of the text (0.0 = not worth attention ... 1.0 = deserves attention)
104 function CalcPaulGraham( wordCount : TWordCount ) : Extended;
107 \brief Determines how noteworthy a text is, based on the GaryRobinson method
108 \return Noteworthiness of the text (0.0 = not worth attention ... 1.0 = deserves attention)
110 function CalcGaryRobinson( wordCount : TWordCount ) : Extended;
113 \brief Determines how noteworthy a text is, based on the GaryRobinson-Fisher method
114 \return Noteworthiness of the text (0.0 = not worth attention ... 1.0 = deserves attention)
116 function CalcGaryRobinsonFisher( wordCount : TWordCount ) : Extended;
119 \brief Analyzes a text
120 \param text Text to analyze
121 \param wordCount Receives the list of analyzed words
122 \param algorithm Specifies the algorithm used to determine the noteworthiness
123 \return Noteworthiness of the text (0.0 = not worth attention ... 1.0 = deserves attention)
125 This merely runs CountWord and Calcxxxxx together.
129 wordCount : TWordCount;
130 algorithm : TGikoBayesianAlgorithm = gbaGaryRobinsonFisher
134 \brief Learns a text
135 \param wordCount Word list analyzed by Parse
136 \param isImportant True to remember the text as one that deserves attention
139 wordCount : TWordCount;
140 isImportant : Boolean );
143 \brief Forgets a learning result
144 \param wordCount Word list analyzed by Parse
145 \param isImportant True if the text had been remembered as one that deserves attention
146 \warning It is not possible to check whether a text has already been learned.<br>
147 Calling Forget for a text that was never passed to Learn, or with the wrong isImportant,
148 corrupts the database.<br>
149 Keep track of which texts have been learned on your own.
151 This does not clear all learning results.<br>
152 Only the learning result of the text from which wordCount was obtained (the text argument of Parse) is cleared.<br><br>
154 Mainly used in the order Forget -> Learn to switch a text between noteworthy and not noteworthy.
157 wordCount : TWordCount;
158 isImportant : Boolean );
161 //==================================================
163 //==================================================
166 SysUtils, Math, Windows,
170 GIKO_BAYESIAN_FILE_VERSION = '1.0';
172 Modes = (ModeWhite, ModeGraph, ModeAlpha, ModeHanKana, ModeNum,
173 ModeWGraph, ModeWAlpha, ModeWNum,
174 ModeWHira, ModeWKata, ModeWKanji);
// Per-byte character class table. CountWord indexes it with one byte of the
// text and casts the entry to its Modes enumeration.
// Entries as laid out below:
//   0 = control characters, space, DEL, and the bytes $80..$A0 / $E0..$FF
//   1 = ASCII punctuation and the bytes $A1..$A4
//   2 = ASCII digits '0'..'9'
//   3 = ASCII letters 'A'..'Z', 'a'..'z'
//   4 = the bytes $A5..$DF
// Fix: the $30..$3F row used to hold nine 2s and seven 1s, which put '9' in
// the punctuation class while '0'..'8' were digits, so any number containing
// a 9 was split into several tokens. '9' now shares the digit class.
// NOTE(review): words learned before this fix were tokenized the old way;
// confirm whether an existing learning database needs to be rebuilt.
// NOTE(review): with the Modes type declared inside CountWord, 2 is named
// ModeAlpha and 3 is named ModeNum, i.e. the names look swapped relative to
// this table. The visible code only compares modes for equality, so this is
// left unchanged here - confirm before relying on those names.
176 CharMode1 : array [ 0..255 ] of Byte =
178 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
179 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
180 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
181 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1,
182 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
183 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1,
184 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
185 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 0,
187 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
188 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
189 0, 1, 1, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
190 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
191 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
192 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
193 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
194 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
197 //************************************************************
199 //************************************************************
201 //==============================
203 //==============================
// Cuts the leading token off s.
// p is the position of the first occurrence of delimiter in s (AnsiPos).
// The text in front of the delimiter becomes the result, and s is shortened
// to whatever follows the delimiter.
// NOTE(review): the statements between these lines (including what happens
// when the delimiter is not found) are not visible in this excerpt.
204 function RemoveToken(var s: string;const delimiter: string): string;
208 p := AnsiPos(delimiter, s);
212 Result := Copy(s, 1, p - 1);
213 s := Copy(s, Length(Result) + Length(delimiter) + 1, Length(s));
216 //==============================
218 //==============================
// Comparison callback; CalcPaulGraham passes it to TList.Sort.
// The two pointers are not dereferenced: their bits are reinterpreted as
// Single values (CalcPaulGraham stores them that way), and v1 / v2 are the
// distances of those values from 0.5.
// NOTE(review): the comparison that turns v1 / v2 into the result is not
// visible here, so the sort direction cannot be stated from this excerpt.
219 function AbsSort( p1, p2 : Pointer ) : Integer;
224 v1 := Abs( Single( p1 ) - 0.5 );
225 v2 := Abs( Single( p2 ) - 0.5 );
235 //************************************************************
237 //************************************************************
// Configures the list to ignore duplicate strings and to compare strings
// case-sensitively.
// NOTE(review): dupIgnore only has an effect on a sorted TStringList; the
// line that would set Sorted is not visible here - confirm.
238 constructor TWordCount.Create;
241 Duplicates := dupIgnore;
242 CaseSensitive := True;
// Walks the entries from the last to the first and tests each attached
// object for nil.
// NOTE(review): the statement guarded by that test is not visible here;
// presumably it frees the object - confirm.
247 destructor TWordCount.Destroy;
252 for i := Count - 1 downto 0 do
253 if Objects[ i ] <> nil then
260 //************************************************************
261 // TGikoBayesian class
262 //************************************************************
264 //==============================
266 //==============================
// Configures the list to ignore duplicate strings and to compare strings
// case-sensitively.
// NOTE(review): dupIgnore only has an effect on a sorted TStringList; the
// line that would set Sorted is not visible here - confirm.
267 constructor TGikoBayesian.Create;
270 Duplicates := dupIgnore;
271 CaseSensitive := True;
276 //==============================
278 //==============================
// Frees every object attached to the list, from the last entry to the first.
// "inherited Objects" is used because this class redeclares Objects as a
// property indexed by word (string); the inherited one is indexed by position.
279 destructor TGikoBayesian.Destroy;
284 for i := Count - 1 downto 0 do
285 if inherited Objects[ i ] <> nil then
286 inherited Objects[ i ].Free;
// Loads the learning history from filePath.
// The path is remembered in FFilePath before the existence check, so it is
// stored even when the file does not exist.
// The loop starts at index 1, skipping the first line of the file
// (SaveToFile writes GIKO_BAYESIAN_FILE_VERSION there).
// Each remaining line is split on #1 into the word followed by four counters.
// Prefixing '$' makes StrToIntDef read a counter as hexadecimal; a field that
// cannot be parsed becomes 0.
// NOTE(review): the assignment of s for each line, the handling of a missing
// file and the release of sl are not visible in this excerpt.
292 procedure TGikoBayesian.LoadFromFile( const filePath : string );
301 FFilePath := filePath;
303 if not FileExists( filePath ) then
306 sl := TStringList.Create;
308 sl.LoadFromFile( filePath );
310 for i := 1 to sl.Count - 1 do begin
312 name := RemoveToken( s, #1 );
313 info := TWordInfo.Create;
314 info.NormalWord := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
315 info.ImportantWord := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
316 info.NormalText := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
317 info.ImportantText := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
319 AddObject( name, info );
// Saves the learning history to filePath and remembers the path in FFilePath.
// The first line written is GIKO_BAYESIAN_FILE_VERSION. After it, each word
// produces one line: the word and its four counters in hexadecimal ('%x'),
// separated by #1, in the order NormalWord, ImportantWord, NormalText,
// ImportantText.
// NOTE(review): the line that adds s to sl and the release of sl are not
// visible in this excerpt.
327 procedure TGikoBayesian.SaveToFile( const filePath : string );
335 FFilePath := filePath;
337 sl := TStringList.Create;
340 sl.Add( GIKO_BAYESIAN_FILE_VERSION );
342 for i := 0 to Count - 1 do begin
343 info := TWordInfo( inherited Objects[ i ] );
344 s := Strings[ i ] + #1
345 + Format('%x', [info.NormalWord]) + #1
346 + Format('%x', [info.ImportantWord]) + #1
347 + Format('%x', [info.NormalText]) + #1
348 + Format('%x', [info.ImportantText]);
353 sl.SaveToFile( filePath );
// Saves to the path remembered by the last LoadFromFile / SaveToFile call.
// Does nothing while no path has been remembered (FFilePath is empty).
360 procedure TGikoBayesian.Save;
363 if FFilePath <> '' then
364 SaveToFile( FFilePath );
368 //==============================
370 //==============================
// Getter of the string-indexed Objects property: looks the word up with Find
// and returns the TWordInfo attached to it.
// NOTE(review): the branch for a word that is not found is not visible here;
// callers in this file compare the result with nil, so it presumably returns
// nil - confirm.
371 function TGikoBayesian.GetObject( const name : string ) : TWordInfo;
376 if Find( name, idx ) then
377 Result := TWordInfo( inherited Objects[ idx ] )
383 //==============================
385 //==============================
// Setter of the string-indexed Objects property.
// If the word is already in the list, the object slot of that entry is
// overwritten with value; the object stored there before is not freed by the
// visible code. Otherwise the word is added together with value.
386 procedure TGikoBayesian.SetObject( const name : string; value : TWordInfo );
391 if Find( name, idx ) then
392 inherited Objects[ idx ] := value
394 AddObject( name, value );
399 //==============================
401 //==============================
// Splits a text into words and accumulates per-word counts in wordCount.
// NOTE(review): the declaration of the text parameter is not visible in this
// excerpt.
// The Modes type declared here shadows the unit-level Modes inside this
// procedure. Its member order differs from the unit-level one: here ModeNum
// comes before ModeHanKana, there ModeHanKana comes before ModeNum.
// p / tail / last are byte pointers into the text; mode / newMode are the
// class of the current run and of the character being examined.
// The three TStringList locals hold the delimiter lists used by changeMode.
402 procedure TGikoBayesian.CountWord(
404 wordCount : TWordCount );
406 Modes = (ModeWhite, ModeGraph, ModeAlpha, ModeNum, ModeHanKana,
407 ModeWGraph, ModeWAlpha, ModeWNum,
408 ModeWHira, ModeWKata, ModeWKanji);
410 p, tail, last : PChar;
411 mode, newMode : Modes;
414 wHiraDelimiter : TStringList;
415 wHiraFinalDelimiter : TStringList;
416 wKanjiDelimiter : TStringList;
419 countInfo : TWordCountInfo;
// For every entry of _delim, runs CustomStringReplace on _aWord with the
// replacement text #10 + delimiter + #10, i.e. a line break on both sides.
// NOTE(review): the first arguments of the call and the assignment of the
// result are not visible here; presumably each occurrence of the delimiter is
// what gets replaced - confirm.
421 function cutBoth( _aWord : string; _delim : TStringList ) : string;
425 for _i := 0 to _delim.Count - 1 do begin
426 _aWord := CustomStringReplace(
429 #10 + _delim[ _i ] + #10, False );
// For every entry of _delim, runs CustomStringReplace on _aWord with the
// replacement text #10 + delimiter, i.e. a line break in front of it.
// NOTE(review): the first arguments of the call and the assignment of the
// result are not visible here; presumably each occurrence of the delimiter is
// what gets replaced - confirm.
434 function cutFirst( _aWord : string; _delim : TStringList ) : string;
438 for _i := 0 to _delim.Count - 1 do begin
439 _aWord := CustomStringReplace(
442 #10 + _delim[ _i ], False );
// For every entry of _delim, runs CustomStringReplace on _aWord with the
// replacement text delimiter + #10, i.e. a line break after it.
// NOTE(review): the first arguments of the call and the assignment of the
// result are not visible here; presumably each occurrence of the delimiter is
// what gets replaced - confirm.
447 function cutFinal( _aWord : string; _delim : TStringList ) : string;
451 for _i := 0 to _delim.Count - 1 do begin
452 _aWord := CustomStringReplace(
455 _delim[ _i ] + #10, False );
// Adds every non-empty string of _words to _dst and raises its count by one.
// A word that is already in _dst reuses its TWordCountInfo; otherwise a new
// TWordCountInfo is created and attached to the newly added word.
460 procedure addWord( _dst : TWordCount; _words : TStringList );
464 _countInfo : TWordCountInfo;
466 for _i := 0 to _words.Count - 1 do begin
467 _aWord := _words[ _i ];
468 if Length( _aWord ) > 0 then begin
469 if _dst.Find( _aWord, _idx ) then begin
470 _countInfo := TWordCountInfo( _dst.Objects[ _idx ] );
472 _countInfo := TWordCountInfo.Create;
473 _dst.AddObject( _aWord, _countInfo );
475 _countInfo.WordCount := _countInfo.WordCount + 1;
// Post-processes one run of same-class characters (_aWord, class _mode) into
// a string of words separated by #10.
// NOTE(review): the GIKO_BAYESIAN_NO_HIRAGANA_DIC test below reads the
// enclosing procedure's "mode", not the _mode parameter. Both visible call
// sites pass mode as _mode, so the two agree today - confirm this is intended.
// NOTE(review): several branch headers and the final assignment of the result
// are not visible in this excerpt; the comments describe visible lines only.
480 function changeMode( _aWord : string; _mode : Modes ) : string;
484 _pWord, _pWord2 : PChar;
485 _pWordTail, _pFound : PChar;
487 _delim : string = #10;
489 {$IFDEF GIKO_BAYESIAN_NO_HIRAGANA_DIC}
490 if mode = ModeWHira then begin
495 if Ord( _mode ) >= Ord( ModeWGraph ) then begin
497 // Squeeze out spaces (the ASCII space and the two-byte space literal below)
498 _aWord := CustomStringReplace( _aWord, ' ', '', False );
499 _aWord := CustomStringReplace( _aWord, '
\81@', '', False );
501 // Split into words at the delimiters
505 _aWord := cutFinal( _aWord, wHiraFinalDelimiter );
506 Result := cutBoth( _aWord, wHiraDelimiter );
511 // Split into words at the delimiters
512 _aWord := cutBoth( _aWord, wKanjiDelimiter );
513 // Split into words of 4 bytes (2 characters) each
514 _pWord := PChar( _aWord );
515 _i := Length( _aWord );
516 _pWordTail := _pWord + _i;
// Output buffer: the input length plus one extra byte per 4 input bytes,
// room for the #10 written after every 4-byte chunk.
517 SetLength( _aWord2, _i + (_i shr 2) );
518 _pWord2 := PChar( _aWord2 );
520 while _pWord < _pWordTail do begin
// _pFound marks the next #10 (or the end of the input), moved back by 3 so
// that "_pWord <= _pFound" means at least 4 bytes remain before that point.
521 _pFound := AnsiStrPos( _pWord, PChar( _delim ) );
522 if _pFound = nil then
523 _pFound := _pWordTail;
524 _pFound := _pFound - 3;
526 while _pWord <= _pFound do begin
527 CopyMemory( _pWord2, _pWord, 4 ); _pWord2[ 4 ] := #10;
528 _pWord2 := _pWord2 + 5; _pWord := _pWord + 4;
// Copy what is left of this segment (fewer than 4 bytes) together with the
// byte at the delimiter position.
530 _i := _pFound + 4 - _pWord; // 4 = 3 + #10
531 CopyMemory( _pWord2, _pWord, _i );
532 _pWord2 := _pWord2 + _i; _pWord := _pWord + _i;
534 if _pWord < _pWordTail then begin
535 _i := _pWordTail - _pWord;
536 CopyMemory( _pWord2, _pWord, _i );
537 _pWord2 := _pWord2 + _i;
// Trim the buffer to the number of bytes actually written.
539 SetLength( _aWord2, _pWord2 - PChar( _aWord2 ) );
552 WHIRA_DELIMITER = '
\82ð' + #10 + '
\82É' + #10 + '
\82ª' + #10 + '
\82Æ' + #10 + '
\82©
\82ç'
553 + #10 + '
\82Ö' + #10 + '
\82æ
\82è' + #10 + '
\82Ü
\82Å'+ #10 + '
\82Å'
554 + #10 + '
\82±
\82±' + #10 + '
\82»
\82±' + #10 + '
\82Ç
\82±'
555 + #10 + '
\82±
\82ê' + #10 + '
\82»
\82ê' + #10 + '
\82 \82ê' + #10 + '
\82Ç
\82ê'
556 + #10 + '
\82±
\82Ì' + #10 + '
\82»
\82Ì' + #10 + '
\82 \82Ì' + #10 + '
\82Ç
\82Ì'
557 + #10 + '
\82±
\82¤' + #10 + '
\82»
\82¤' + #10 + '
\82 \82 ' + #10 + '
\82Ç
\82¤'
558 + #10 + '
\82±
\82ñ
\82È' + #10 + '
\82»
\82ñ
\82È' + #10 + '
\82 \82ñ
\82È' + #10 + '
\82Ç
\82ñ
\82È'
559 + #10 + '
\82ê
\82½' + #10 + '
\82ê
\82Ä' + #10 + '
\82ê
\82ê' + #10 + '
\82ê
\82ë'
560 + #10 + '
\82ê
\82é' + #10 + '
\82ç
\82ê
\82é'
561 + #10 + '
\82Å
\82·' + #10 + '
\82Ü
\82·' + #10 + '
\82Ü
\82¹
\82ñ'
562 + #10 + '
\82Å
\82µ
\82½' + #10 + '
\82Ü
\82µ
\82½'
563 + #10 + '
\82·
\82é' + #10 + '
\82µ
\82È
\82¢' + #10 + '
\82³
\82ê
\82é' + #10 + '
\82³
\82ê
\82È
\82¢'
565 WKANJI_DELIMITER = '
\93I' + #10 + '
\90«' + #10 + '
\8e®' + #10 + '
\89»' + #10 + '
\96@'
566 + #10 + '
\95s' + #10 + '
\96³' + #10 + '
\94ñ' + #10 + '
\94½'
568 WHIRA_FINAL_DELIMITER = '
\82Á
\82½' + #10 + '
\82Á
\82Ä'
570 + #10 + '
\82æ
\82Á
\82Ä' + #10 + '
\82µ
\82½
\82ª
\82Á
\82Ä' + #10 + '
\82È
\82Ì
\82Å'
571 + #10 + '
\82¾
\82©
\82ç' + #10 + '
\82Å
\82·
\82©
\82ç'
573 + #10 + '
\82µ
\82©
\82µ' + #10 + '
\82¾
\82ª' + #10 + '
\82¯
\82Ç' + #10 + '
\82¯
\82ê
\82Ç'
574 + #10 + '
\82â
\82Í
\82è' + #10 + '
\82â
\82Á
\82Ï
\82è'
575 + #10 + '
\82Å
\82µ' + #10 + '
\82¾
\82ë'
576 + #10 + '
\82·
\82é' + #10 + '
\82µ
\82È
\82¢' + #10 + '
\82µ
\82½' + #10 + '
\82µ
\82È
\82¢'
578 // '
\81['
\82ð '
\82\9f\82¡
\82£
\82¥
\82§'
\82É
\81B
579 HA_LINE = '
\82 \82©
\82³
\82½
\82È
\82Í
\82Ü
\82â
\82ç
\82í
\82ª
\82´
\82¾
\82Î
\82Ï
\82\9f\82ì';
580 HI_LINE = '
\82¢
\82«
\82µ
\82¿
\82É
\82Ð
\82Ý
\82è
\82î
\82¬
\82¶
\82Ñ
\82Ò
\82¡';
581 HU_LINE = '
\82¤
\82
\82·
\82Â
\82Ê
\82Ó
\82Þ
\82ä
\82é
\82®
\82Ô
\82Õ
\82£';
582 HE_LINE = '
\82¦
\82¯
\82¹
\82Ä
\82Ë
\82Ö
\82ß
\82ê
\82ï
\82°
\82×
\82Ø
\82¥';
583 HO_LINE = '
\82¨
\82±
\82»
\82Æ
\82Ì
\82Ù
\82à
\82æ
\82ë
\82ð
\82²
\82Ú
\82Û
\82§';
584 KA_LINE = '
\83A
\83J
\83T
\83^
\83i
\83n
\83}
\83\84\83\89\83\8f\83K
\83U
\83_
\83o
\83p
\83@
\83\95\83\8e';
585 KI_LINE = '
\83C
\83L
\83V
\83`
\83j
\83q
\83~
\83\8a\83\90\83M
\83W
\83r
\83s
\83B';
586 KU_LINE = '
\83E
\83N
\83X
\83c
\83k
\83t
\83\80\83\86\83\8b\83O
\83u
\83v
\83D
\83\94';
587 KE_LINE = '
\83G
\83P
\83Z
\83e
\83l
\83w
\83\81\83\8c\83\91\83Q
\83x
\83y
\83F
\83\96';
588 KO_LINE = '
\83I
\83R
\83\
\83g
\83m
\83z
\83\82\83\88\83\8d\83\92\83S
\83{
\83|
\83H';
589 kKanji = [$80..$A0, $E0..$ff];
// Main part of CountWord: scans the text byte by byte, classifies each
// character, and whenever the class changes hands the finished run to
// changeMode and then to addWord.
// NOTE(review): several lines of this body (the initialisation of p / last /
// mode, the else branches, the pointer increments and part of the clean-up)
// are not visible in this excerpt.
592 wHiraDelimiter := TStringList.Create;
593 wHiraFinalDelimiter := TStringList.Create;
594 wKanjiDelimiter := TStringList.Create;
595 words := TStringList.Create;
598 {$IFNDEF GIKO_BAYESIAN_NO_HIRAGANA_DIC}
599 wHiraDelimiter.Text := WHIRA_DELIMITER;
600 wHiraFinalDelimiter.Text := WHIRA_FINAL_DELIMITER;
602 wKanjiDelimiter.Text := WKANJI_DELIMITER;
604 tail := p + Length( text );
607 while p < tail do begin
608 // Determine the type of the character
609 // Note: punctuation becomes ModeGraph, so it needs no separate handling
610 // if Byte(Byte( p^ ) - $a1) < $5e then begin
611 if Byte( p^ ) in kKanji then begin
612 if p + 1 < tail then begin
613 ch := (PByte( p )^ shl 8) or PByte( p + 1 )^;
615 // Do not split words at the (two-byte) space; it is squeezed out instead
616 //$8140: newMode := ModeWhite;
617 $8141..$824e: newMode := ModeWGraph;
618 $824f..$8258: newMode := ModeWNum;
619 $8260..$829a: newMode := ModeWAlpha;
620 $829f..$82f1: newMode := ModeWHira;
621 $8340..$8396: newMode := ModeWKata;
622 else newMode := ModeWKanji;
624 // The voiced mark, semi-voiced mark and long-vowel mark ($814a, $814b, $815b) count as hiragana or katakana
625 if (mode = ModeWHira) or (mode = ModeWKata) then
626 if (ch = $814a) or (ch = $814b) or (ch = $815b) then
629 newMode := ModeWhite;
634 newMode := Modes( CharMode1[ Byte( p^ ) ] );
635 if (p^ = ' ') and (Ord( mode ) >= Ord( ModeWGraph )) then begin
636 // The run so far is Japanese and the current character is a space
637 // Keep the words joined; the space is squeezed out later
638 // Note: half-width kana is normally separated by spaces, so it is not joined
645 if mode <> newMode then begin
647 // The type of the character has changed
648 if mode <> ModeWhite then begin
649 SetLength( aWord, p - last );
650 CopyMemory( PChar( aWord ), last, p - last );
652 words.Text := changeMode( aWord, mode );
655 addWord( wordCount, words );
// After the loop: flush the run that was still open when the text ended.
666 if mode <> ModeWhite then begin
667 SetLength( aWord, p - last );
668 CopyMemory( PChar( aWord ), last, p - last );
670 words.Text := changeMode( aWord, mode );
673 addWord( wordCount, words );
677 wKanjiDelimiter.Free;
678 wHiraFinalDelimiter.Free;
684 //==============================
686 //==============================
// Scores a counted text from the per-word values of the words in wordCount.
// NOTE(review): the results of the special-case branches in p, the value
// returned for an empty wordCount, the initial values of s and q and the
// loop around the two products are not visible in this excerpt.
687 function TGikoBayesian.CalcPaulGraham( wordCount : TWordCount ) : Extended;
// p: per-word value computed from the learned TWordInfo of aWord.
689 function p( const aWord : string ) : Single;
693 info := Objects[ aWord ];
696 else if info.NormalWord = 0 then
698 else if info.ImportantWord = 0 then
700 else if info.ImportantWord + info.NormalWord * 2 < 5 then
// General case: the important-side ratio divided by the sum of the doubled
// normal-side ratio and the important-side ratio.
704 Result := ( info.ImportantWord / info.ImportantText ) /
705 ((info.NormalWord * 2 / info.NormalText ) +
706 (info.ImportantWord / info.ImportantText));
// A division by zero in the expression above yields 0.99.
708 on EZeroDivide do Result := 0.99;
722 if wordCount.Count = 0 then
725 narray := TList.Create;
// Each Single returned by p is stored in the TList by reinterpreting its bits
// as a pointer; AbsSort and the products below convert it back the same way.
727 for i := 0 to wordCount.Count - 1 do begin
728 narray.Add( Pointer( p( wordCount[ i ] ) ) );
731 narray.Sort( AbsSort );
// At most SAMPLE_COUNT of the sorted values are used.
735 i := min( SAMPLE_COUNT, narray.Count );
// s accumulates the product of the values, q the product of (1 - value).
739 s := s * Single( narray[ i ] );
740 q := q * (1 - Single( narray[ i ] ));
743 Result := s / (s + q);
750 //==============================
752 //==============================
// Scores a counted text by combining smoothed per-word values.
// NOTE(review): the special-case results in p, the definition of k, the
// accumulation of mean and Q1, and the results of the two "begin" branches
// are not visible in this excerpt.
753 function TGikoBayesian.CalcGaryRobinson( wordCount : TWordCount ) : Extended;
// p: per-word value computed from the learned TWordInfo of aWord.
755 function p( const aWord : string ) : Single;
759 info := Objects[ aWord ];
762 else if info.ImportantWord = 0 then
764 else if info.NormalWord = 0 then
// Two forms of the same ratio appear below; the second is the first with
// numerator and denominator multiplied by NormalText * ImportantText.
// NOTE(review): which of the two is active cannot be told from this excerpt.
768 Result := ( info.ImportantWord / info.ImportantText ) /
769 ((info.NormalWord / info.NormalText ) +
770 (info.ImportantWord / info.ImportantText));
772 Result := (info.ImportantWord * info.NormalText) /
773 (info.NormalWord * info.ImportantText +
774 info.ImportantWord * info.NormalText);
// f: weighted average of mean (weight k) and n (weight cnt).
777 function f( cnt : Integer; n, mean : Single ) : Extended;
781 Result := ( (k * mean) + (cnt * n) ) / (k + cnt);
786 narray : array of Single;
788 countInfo : TWordCountInfo;
790 P1, Q1, R1 : Extended;
794 if wordCount.Count = 0 then begin
799 SetLength( narray, wordCount.Count );
// First pass: compute p for every word.
801 for i := 0 to wordCount.Count - 1 do begin
802 n := p( wordCount[ i ] );
806 mean := mean / wordCount.Count;
// Second pass: smooth each value with f, using the word's count in this text
// as the weight, and multiply (1 - value) into P1.
810 for i := 0 to wordCount.Count - 1 do begin
811 countInfo := TWordCountInfo( wordCount.Objects[ i ] );
812 n := f( countInfo.WordCount, narray[ i ], mean );
813 P1 := P1 * ( 1 - n );
816 cnt := wordCount.Count;
// One minus the cnt-th root of each product.
819 P1 := 1 - Power( P1, 1 / cnt );
820 Q1 := 1 - Power( Q1, 1 / cnt );
822 if P1 + Q1 = 0 then begin
// (P1 - Q1) / (P1 + Q1) lies in -1..1 for non-negative P1 and Q1; the last
// line maps it to 0..1.
825 n := (P1 - Q1) / (P1 + Q1);
826 Result := (1 + n) / 2;
831 //==============================
832 // CalcGaryRobinsonFisher
833 //==============================
// Scores a counted text by combining smoothed per-word values and passing
// the combined values through prbx.
// NOTE(review): the special-case results in p, the definition of k, the body
// of prbx, the accumulation of mean and Q1, and the result for an empty
// wordCount are not visible in this excerpt.
834 function TGikoBayesian.CalcGaryRobinsonFisher(
835 wordCount : TWordCount
// p: per-word value computed from the learned TWordInfo of aWord.
838 function p( const aWord : string ) : Single;
842 info := Objects[ aWord ];
845 else if info.ImportantWord = 0 then
847 else if info.NormalWord = 0 then
// Two forms of the same ratio appear below; the second is the first with
// numerator and denominator multiplied by NormalText * ImportantText.
// NOTE(review): which of the two is active cannot be told from this excerpt.
851 Result := ( info.ImportantWord / info.ImportantText ) /
852 ((info.NormalWord / info.NormalText ) +
853 (info.ImportantWord / info.ImportantText));
855 Result := (info.ImportantWord * info.NormalText) /
856 (info.NormalWord * info.ImportantText +
857 info.ImportantWord * info.NormalText);
// f: weighted average of mean (weight k) and n (weight cnt).
860 function f( cnt : Integer; n, mean : Single ) : Extended;
864 Result := ( (k * mean) + (cnt * n) ) / (k + cnt);
// prbx: takes a statistic x2 and a degree; presumably a chi-square
// probability - confirm against its body.
867 function prbx( x2, degree : Extended ) : Extended;
876 narray : array of Single;
878 countInfo : TWordCountInfo;
881 important : Extended;
886 if wordCount.Count = 0 then begin
891 SetLength( narray, wordCount.Count );
// First pass: compute p for every word.
893 for i := 0 to wordCount.Count - 1 do begin
894 n := p( wordCount[ i ] );
898 mean := mean / wordCount.Count;
// Second pass: smooth each value with f, using the word's count in this text
// as the weight, and multiply (1 - value) into P1.
902 for i := 0 to wordCount.Count - 1 do begin
903 countInfo := TWordCountInfo( wordCount.Objects[ i ] );
904 n := f( countInfo.WordCount, narray[ i ], mean );
905 P1 := P1 * ( 1 - n );
908 cnt := wordCount.Count;
// cnt-th root of each product.
911 P1 := Power( P1, 1 / cnt );
912 Q1 := Power( Q1, 1 / cnt );
// Each root is converted with -2 * Ln( value ) and given to prbx with 2 * cnt
// as the degree.
914 P1 := 1 - prbx( -2 * Ln( P1 ), 2 * cnt );
915 Q1 := 1 - prbx( -2 * Ln( Q1 ), 2 * cnt );
917 Result := (1 + P1 - Q1) / 2;
921 //==============================
923 //==============================
// Counts the words of text into wordCount, then scores wordCount with the
// Calc* function that corresponds to algorithm.
// NOTE(review): the declaration of the text parameter, the result type and
// any fallback branch of the case statement are not visible in this excerpt.
924 function TGikoBayesian.Parse(
926 wordCount : TWordCount;
927 algorithm : TGikoBayesianAlgorithm
931 CountWord( text, wordCount );
933 gbaPaulGraham: Result := CalcPaulGraham( wordCount );
934 gbaGaryRobinson: Result := CalcGaryRobinson( wordCount );
935 gbaGaryRobinsonFisher:
936 Result := CalcGaryRobinsonFisher( wordCount );
942 //==============================
944 //==============================
// Adds the word counts of one text to the learned data.
// For every word of wordCount the stored TWordInfo is fetched; a word with no
// entry yet gets a new TWordInfo stored under it.
// isImportant selects which counter pair is raised: the word counter grows by
// the word's count in this text, the text counter grows by one.
945 procedure TGikoBayesian.Learn(
946 wordCount : TWordCount;
947 isImportant : Boolean );
950 wordinfo : TWordInfo;
951 countinfo : TWordCountInfo;
955 for i := 0 to wordCount.Count - 1 do begin
956 aWord := wordCount[ i ];
957 wordinfo := Objects[ aWord ];
958 countinfo := TWordCountInfo( wordCount.Objects[ i ] );
959 if wordinfo = nil then begin
960 wordinfo := TWordInfo.Create;
961 Objects[ aWord ] := wordinfo;
964 if isImportant then begin
965 wordinfo.ImportantWord := wordinfo.ImportantWord + countinfo.WordCount;
966 wordinfo.ImportantText := wordinfo.ImportantText + 1;
968 wordinfo.NormalWord := wordinfo.NormalWord + countinfo.WordCount;
969 wordinfo.NormalText := wordinfo.NormalText + 1;
975 //==============================
977 //==============================
// Subtracts the word counts of one text from the learned data.
// For every word of wordCount the stored TWordInfo is fetched. isImportant
// selects which counter pair is lowered, and the pair is lowered only while
// its text counter is above zero: the text counter drops by one, the word
// counter by the word's count in this text.
// NOTE(review): the statement taken when a word has no entry is not visible
// in this excerpt.
// NOTE(review): only the text counter is checked; the visible code does not
// keep the word counter from going below zero.
978 procedure TGikoBayesian.Forget(
979 wordCount : TWordCount;
980 isImportant : Boolean );
983 wordinfo : TWordInfo;
984 countinfo : TWordCountInfo;
988 for i := 0 to wordCount.Count - 1 do begin
989 aWord := wordCount[ i ];
990 wordinfo := Objects[ aWord ];
991 if wordinfo = nil then
994 countinfo := TWordCountInfo( wordCount.Objects[ i ] );
995 if isImportant then begin
996 if wordInfo.ImportantText > 0 then begin
997 wordinfo.ImportantText := wordinfo.ImportantText - 1;
998 wordinfo.ImportantWord := wordinfo.ImportantWord - countinfo.WordCount;
1001 if wordinfo.NormalText > 0 then begin
1002 wordinfo.NormalText := wordinfo.NormalText - 1;
1003 wordinfo.NormalWord := wordinfo.NormalWord - countinfo.WordCount;