5 \brief Bayesian filter
7 $Id: GikoBayesian.pas,v 1.13 2004/11/01 05:18:21 yoffy Exp $
12 //==================================================
14 //==================================================
17 //==================================================
19 //==================================================
21 {!***********************************************************
22 \brief Word properties: the per-word learning counters
23 ************************************************************}
24 TWordInfo = class( TObject )
26 FNormalWord : Integer; //!< Number of times the word appeared as a normal word
27 FImportantWord : Integer; //!< Number of times the word appeared as an important (noteworthy) word
28 FNormalText : Integer; //!< Number of texts that contained the word as a normal word
29 FImportantText : Integer; //!< Number of texts that contained the word as an important word
32 property NormalWord : Integer read FNormalWord write FNormalWord;
33 property ImportantWord : Integer read FImportantWord write FImportantWord;
34 property NormalText : Integer read FNormalText write FNormalText;
35 property ImportantText : Integer read FImportantText write FImportantText;
38 {!***********************************************************
39 \brief Properties of an analyzed word
40 ************************************************************}
41 TWordCountInfo = class( TObject )
43 FWordCount : Integer; //!< Word count
46 property WordCount : Integer read FWordCount write FWordCount;
49 {!***********************************************************
50 \brief List of analyzed words
51 ************************************************************}
52 // TWordCount = class( THashedStringList ) // extremely slow
53 TWordCount = class( TStringList )
56 destructor Destroy; override;
59 {!***********************************************************
60 \brief Filter algorithm
61 ************************************************************}
62 TGikoBayesianAlgorithm =
63 (gbaPaulGraham, gbaGaryRobinson, gbaGaryRobinsonFisher);
65 {!***********************************************************
66 \brief Bayesian filter
67 ************************************************************}
68 // TGikoBayesian = class( THashedStringList ) // extremely slow
69 TGikoBayesian = class( TStringList )
71 FFilePath : string; //!< Path of the file that was loaded
72 function GetObject( const name : string ) : TWordInfo;
73 procedure SetObject( const name : string; value : TWordInfo );
77 destructor Destroy; override;
79 //! Reads the learning history from a file
80 procedure LoadFromFile( const filePath : string );
82 //! Saves the learning history to a file
83 procedure SaveToFile( const filePath : string );
85 //! Saves the learning history to a file
88 //! Gets the information stored for a word
89 property Objects[ const name : string ] : TWordInfo
90 read GetObject write SetObject; default;
92 //! Counts the words contained in a text
95 wordCount : TWordCount );
98 \brief Determines the importance of a text based on Paul Graham's method
99 \return Importance of the text (0.0 = not worth attention ... 1.0 = should be noted)
101 function CalcPaulGraham( wordCount : TWordCount ) : Extended;
104 \brief Determines the importance of a text based on Gary Robinson's method
105 \return Importance of the text (0.0 = not worth attention ... 1.0 = should be noted)
107 function CalcGaryRobinson( wordCount : TWordCount ) : Extended;
110 \brief Determines the importance of a text based on the Gary Robinson-Fisher method
111 \return Importance of the text (0.0 = not worth attention ... 1.0 = should be noted)
113 function CalcGaryRobinsonFisher( wordCount : TWordCount ) : Extended;
116 \brief Analyzes a text
117 \param text Text to analyze
118 \param wordCount Receives the list of words found by the analysis
119 \param algorithm Specifies the algorithm used to determine the importance
120 \return Importance of the text (0.0 = not worth attention ... 1.0 = should be noted)
122 Simply runs CountWord and Calcxxxxx together.
126 wordCount : TWordCount;
127 algorithm : TGikoBayesianAlgorithm = gbaGaryRobinsonFisher
131 \brief Learns a text
132 \param wordCount Word list produced by Parse
133 \param isImportant True to remember the text as one that should be noted
136 wordCount : TWordCount;
137 isImportant : Boolean );
140 \brief Forgets a learned result
141 \param wordCount Word list produced by Parse
142 \param isImportant True if the text had been remembered as one that should be noted
143 \warning It is not possible to verify whether a text has actually been learned.<br>
144 Calling Forget for a text that was never passed to Learn, or for a text whose isImportant is wrong,
145 corrupts the database.<br>
146 Keep track yourself of whether a text has been learned.
148 This does not clear every learned result.<br>
149 Only the learned result of the text from which wordCount was obtained (the text argument of Parse) is cleared.<br><br>
151 Mainly used in the order Forget -> Learn to switch a text between important and non-important.
154 wordCount : TWordCount;
155 isImportant : Boolean );
158 //==================================================
160 //==================================================
163 SysUtils, Math, Windows,
167 GIKO_BAYESIAN_FILE_VERSION = '1.0';
// Character classes used when splitting text into words: single-byte classes
// first, then the double-byte ("W") classes.
// NOTE(review): TGikoBayesian.CountWord declares a local type with the same name
// in which ModeNum and ModeHanKana appear in the opposite order; the local type
// is the one in scope inside CountWord. Confirm which order is intended.
169 Modes = (ModeWhite, ModeGraph, ModeAlpha, ModeHanKana, ModeNum,
170 ModeWGraph, ModeWAlpha, ModeWNum,
171 ModeWHira, ModeWKata, ModeWKanji);
// Character class of every single-byte value, indexed by the byte itself.
// Values as laid out below: 0 = control bytes, space and the bytes $80..$A0 /
// $E0..$FF; 1 = ASCII punctuation and the bytes $A1..$A4; 2 = ASCII digits;
// 3 = ASCII letters; 4 = bytes $A5..$DF.
// CountWord casts an entry to its Modes type to decide where a word ends.
// Fix: byte $39 ('9') was classed 1 (punctuation) while '0'..'8' were classed 2,
// so a number was broken apart at every '9'. It is now classed 2 like the
// other digits.
173 CharMode1 : array [ 0..255 ] of Byte =
175 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
176 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
177 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
178 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1,
179 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
180 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1,
181 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
182 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 0,
184 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
185 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
186 0, 1, 1, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
187 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
188 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
189 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
190 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
191 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
194 //************************************************************
196 //************************************************************
198 //==============================
200 //==============================
// Splits the leading token off s at the first occurrence of delimiter.
// Visible behaviour: p is the AnsiPos of delimiter in s; the Copy below yields
// the text before the delimiter, and s is then shortened to whatever follows
// Result plus one delimiter length.
// NOTE(review): the statements around these lines (in particular what happens
// when the delimiter is absent) are not visible in this excerpt - confirm
// against the full unit.
201 function RemoveToken(var s: string;const delimiter: string): string;
205 p := AnsiPos(delimiter, s);
209 Result := Copy(s, 1, p - 1);
210 s := Copy(s, Length(Result) + Length(delimiter) + 1, Length(s));
213 //==============================
215 //==============================
// List comparison callback; CalcPaulGraham passes it to narray.Sort.
// Each list item is a Single that was stored by casting it to Pointer, so it
// is cast back here; the values compared are the distances from 0.5.
// NOTE(review): Single( p ) presumably requires SizeOf( Pointer ) = SizeOf( Single )
// (4 bytes) - confirm the unit is only built for 32-bit targets.
// NOTE(review): the lines that turn v1 / v2 into the result are not visible here.
216 function AbsSort( p1, p2 : Pointer ) : Integer;
221 v1 := Abs( Single( p1 ) - 0.5 );
222 v2 := Abs( Single( p2 ) - 0.5 );
232 //************************************************************
234 //************************************************************
// Constructor: sets the list to ignore duplicate strings and to compare
// strings case-sensitively.
// NOTE(review): TStringList honours Duplicates only while the list is sorted;
// whether Sorted is set is not visible in this excerpt.
235 constructor TWordCount.Create;
238 Duplicates := dupIgnore;
239 CaseSensitive := True;
// Destructor: walks the entries from last to first and handles every entry
// that has an attached object.
// NOTE(review): the statement executed for those entries is not visible here
// (presumably it frees the object).
244 destructor TWordCount.Destroy;
249 for i := Count - 1 downto 0 do
250 if Objects[ i ] <> nil then
257 //************************************************************
258 // TGikoBayesian class
259 //************************************************************
261 //==============================
263 //==============================
// Constructor: sets the list to ignore duplicate strings and to compare
// strings case-sensitively.
// NOTE(review): TStringList honours Duplicates only while the list is sorted;
// whether Sorted is set is not visible in this excerpt.
264 constructor TGikoBayesian.Create;
267 Duplicates := dupIgnore;
268 CaseSensitive := True;
273 //==============================
275 //==============================
// Destructor: frees the object attached to every entry, last to first.
// "inherited Objects" is used because this class redeclares Objects as a
// property indexed by word; the index-based property is the inherited one.
276 destructor TGikoBayesian.Destroy;
281 for i := Count - 1 downto 0 do
282 if inherited Objects[ i ] <> nil then
283 inherited Objects[ i ].Free;
// Loads the learning history from filePath into this list.
// Layout as read here: line 0 is skipped (SaveToFile writes the format version
// there); every following line holds name, NormalWord, ImportantWord,
// NormalText and ImportantText separated by #1, the numbers in hexadecimal.
// NOTE(review): the visible code never compares the version line with
// GIKO_BAYESIAN_FILE_VERSION - confirm that this is intended.
// NOTE(review): the release of sl and of any existing entries is not visible
// in this excerpt.
289 procedure TGikoBayesian.LoadFromFile( const filePath : string );
// Remember the path so that Save can write back to the same file
298 FFilePath := filePath;
300 if not FileExists( filePath ) then
303 sl := TStringList.Create;
305 sl.LoadFromFile( filePath );
// Start at 1: index 0 is the version line
307 for i := 1 to sl.Count - 1 do begin
309 name := RemoveToken( s, #1 );
310 info := TWordInfo.Create;
// The '$' prefix makes StrToIntDef read the field as hexadecimal; a field
// that cannot be parsed becomes 0
311 info.NormalWord := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
312 info.ImportantWord := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
313 info.NormalText := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
314 info.ImportantText := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
316 AddObject( name, info );
// Writes the learning history to filePath.
// The first line is GIKO_BAYESIAN_FILE_VERSION; each entry then becomes one
// line: the word followed by its four counters in hexadecimal, separated by #1
// (the layout LoadFromFile reads back).
// NOTE(review): the line that appends s to sl and the release of sl are not
// visible in this excerpt.
324 procedure TGikoBayesian.SaveToFile( const filePath : string );
// Remember the path so that Save writes to the same file
332 FFilePath := filePath;
334 sl := TStringList.Create;
337 sl.Add( GIKO_BAYESIAN_FILE_VERSION );
339 for i := 0 to Count - 1 do begin
340 info := TWordInfo( inherited Objects[ i ] );
341 s := Strings[ i ] + #1
342 + Format('%x', [info.NormalWord]) + #1
343 + Format('%x', [info.ImportantWord]) + #1
344 + Format('%x', [info.NormalText]) + #1
345 + Format('%x', [info.ImportantText]);
350 sl.SaveToFile( filePath );
// Saves to the path remembered by the last LoadFromFile / SaveToFile call;
// does nothing while no path has been remembered.
357 procedure TGikoBayesian.Save;
360 if FFilePath <> '' then
361 SaveToFile( FFilePath );
365 //==============================
367 //==============================
// Getter of the word-indexed Objects property: looks the word up with Find and
// returns the TWordInfo attached to that entry.
// NOTE(review): the branch taken when the word is not found is not visible
// here; Learn compares the result with nil, so it presumably returns nil.
// NOTE(review): TStringList.Find expects a sorted list - confirm Sorted is set.
368 function TGikoBayesian.GetObject( const name : string ) : TWordInfo;
373 if Find( name, idx ) then
374 Result := TWordInfo( inherited Objects[ idx ] )
380 //==============================
382 //==============================
// Setter of the word-indexed Objects property: replaces the object of an
// existing entry, or adds the word together with the object.
// NOTE(review): when an entry is replaced the previous object is not freed in
// the visible code - confirm the callers never replace a live object.
383 procedure TGikoBayesian.SetObject( const name : string; value : TWordInfo );
388 if Find( name, idx ) then
389 inherited Objects[ idx ] := value
391 AddObject( name, value );
396 //==============================
398 //==============================
// Splits a text into words and records each word, with its number of
// occurrences, in wordCount.
// The text is scanned byte by byte; every byte (or double-byte character) is
// given a character class, and a word ends wherever the class changes.
399 procedure TGikoBayesian.CountWord(
401 wordCount : TWordCount );
// NOTE(review): this local type hides the unit-level Modes type and lists
// ModeNum / ModeHanKana in the opposite order. Values taken from CharMode1 are
// cast to this local type.
403 Modes = (ModeWhite, ModeGraph, ModeAlpha, ModeNum, ModeHanKana,
404 ModeWGraph, ModeWAlpha, ModeWNum,
405 ModeWHira, ModeWKata, ModeWKanji);
// p = scan position, tail = end of the text, last = start of the current run
407 p, tail, last : PChar;
408 mode, newMode : Modes;
// Delimiter lists, filled from the WHIRA_DELIMITER, WHIRA_FINAL_DELIMITER and
// WKANJI_DELIMITER constants
411 wHiraDelimiter : TStringList;
412 wHiraFinalDelimiter : TStringList;
413 wKanjiDelimiter : TStringList;
416 countInfo : TWordCountInfo;
// Nested helper: for every entry of _delim, passes _aWord through
// CustomStringReplace with the replacement #10 + delimiter + #10, i.e. a line
// break on both sides of the delimiter.
// NOTE(review): the leading arguments of the call are not visible in this excerpt.
418 function cutBoth( _aWord : string; _delim : TStringList ) : string;
422 for _i := 0 to _delim.Count - 1 do begin
423 _aWord := CustomStringReplace(
426 #10 + _delim[ _i ] + #10, False );
// Nested helper: for every entry of _delim, passes _aWord through
// CustomStringReplace with the replacement #10 + delimiter, i.e. a line break
// in front of the delimiter.
// NOTE(review): the leading arguments of the call are not visible in this excerpt.
431 function cutFirst( _aWord : string; _delim : TStringList ) : string;
435 for _i := 0 to _delim.Count - 1 do begin
436 _aWord := CustomStringReplace(
439 #10 + _delim[ _i ], False );
// Nested helper: for every entry of _delim, passes _aWord through
// CustomStringReplace with the replacement delimiter + #10, i.e. a line break
// after the delimiter.
// NOTE(review): the leading arguments of the call are not visible in this excerpt.
444 function cutFinal( _aWord : string; _delim : TStringList ) : string;
448 for _i := 0 to _delim.Count - 1 do begin
449 _aWord := CustomStringReplace(
452 _delim[ _i ] + #10, False );
// Nested helper: adds every non-empty string of _words to _dst and increases
// its occurrence counter by one.
457 procedure addWord( _dst : TWordCount; _words : TStringList );
461 _countInfo : TWordCountInfo;
463 for _i := 0 to _words.Count - 1 do begin
464 _aWord := _words[ _i ];
465 if Length( _aWord ) > 0 then begin
// Known word: reuse its counter object
466 if _dst.Find( _aWord, _idx ) then begin
467 _countInfo := TWordCountInfo( _dst.Objects[ _idx ] );
// New word: create a counter object and register it with the word
469 _countInfo := TWordCountInfo.Create;
470 _dst.AddObject( _aWord, _countInfo );
472 _countInfo.WordCount := _countInfo.WordCount + 1;
// Nested helper of CountWord: turns one run of same-class characters into a
// #10-separated list of words (the caller assigns the result to words.Text).
// NOTE(review): the conditions that select the hiragana and the kanji branch
// are not visible in this excerpt.
477 function changeMode( _aWord : string; _mode : Modes ) : string;
481 _pWord, _pWord2 : PChar;
482 _pWordTail, _pFound : PChar;
484 _delim : string = #10;
// Only the double-byte classes (ModeWGraph and above) are processed below
486 if Ord( _mode ) >= Ord( ModeWGraph ) then begin
488 // Squeeze out spaces
489 _aWord := CustomStringReplace( _aWord, ' ', '', False );
490 _aWord := CustomStringReplace( _aWord, '
\81@', '', False );
492 // Split into words at the delimiters
496 _aWord := cutFinal( _aWord, wHiraFinalDelimiter );
497 Result := cutBoth( _aWord, wHiraDelimiter );
502 // Split into words at the delimiters
503 _aWord := cutBoth( _aWord, wKanjiDelimiter );
504 // Split into words every 4 bytes (2 characters)
505 _pWord := PChar( _aWord );
506 _i := Length( _aWord );
507 _pWordTail := _pWord + _i;
// Output buffer: the input length plus room for one #10 per 4 input bytes
508 SetLength( _aWord2, _i + (_i shr 2) );
509 _pWord2 := PChar( _aWord2 );
511 while _pWord < _pWordTail do begin
// Find the next #10 already present (or take the end of the string)
512 _pFound := AnsiStrPos( _pWord, PChar( _delim ) );
513 if _pFound = nil then
514 _pFound := _pWordTail;
// Step back 3 bytes so the inner loop runs only while a full 4-byte chunk is left
515 _pFound := _pFound - 3;
517 while _pWord <= _pFound do begin
// Copy one 4-byte chunk and terminate it with #10
518 CopyMemory( _pWord2, _pWord, 4 ); _pWord2[ 4 ] := #10;
519 _pWord2 := _pWord2 + 5; _pWord := _pWord + 4;
// Copy what is left of this segment
521 _i := _pFound + 4 - _pWord; // 4 = 3 + #10
522 CopyMemory( _pWord2, _pWord, _i );
523 _pWord2 := _pWord2 + _i; _pWord := _pWord + _i;
525 if _pWord < _pWordTail then begin
526 _i := _pWordTail - _pWord;
527 CopyMemory( _pWord2, _pWord, _i );
528 _pWord2 := _pWord2 + _i;
// Trim the buffer to the number of bytes actually written
530 SetLength( _aWord2, _pWord2 - PChar( _aWord2 ) );
543 WHIRA_DELIMITER = '
\82ð' + #10 + '
\82É' + #10 + '
\82ª' + #10 + '
\82Æ' + #10 + '
\82©
\82ç'
544 + #10 + '
\82Ö' + #10 + '
\82æ
\82è' + #10 + '
\82Ü
\82Å'+ #10 + '
\82Å'
545 + #10 + '
\82±
\82±' + #10 + '
\82»
\82±' + #10 + '
\82Ç
\82±'
546 + #10 + '
\82±
\82ê' + #10 + '
\82»
\82ê' + #10 + '
\82 \82ê' + #10 + '
\82Ç
\82ê'
547 + #10 + '
\82±
\82Ì' + #10 + '
\82»
\82Ì' + #10 + '
\82 \82Ì' + #10 + '
\82Ç
\82Ì'
548 + #10 + '
\82±
\82¤' + #10 + '
\82»
\82¤' + #10 + '
\82 \82 ' + #10 + '
\82Ç
\82¤'
549 + #10 + '
\82±
\82ñ
\82È' + #10 + '
\82»
\82ñ
\82È' + #10 + '
\82 \82ñ
\82È' + #10 + '
\82Ç
\82ñ
\82È'
550 + #10 + '
\82ê
\82½' + #10 + '
\82ê
\82Ä' + #10 + '
\82ê
\82ê' + #10 + '
\82ê
\82ë'
551 + #10 + '
\82ê
\82é' + #10 + '
\82ç
\82ê
\82é'
552 + #10 + '
\82Å
\82·' + #10 + '
\82Ü
\82·' + #10 + '
\82Ü
\82¹
\82ñ'
553 + #10 + '
\82Å
\82µ
\82½' + #10 + '
\82Ü
\82µ
\82½'
554 + #10 + '
\82·
\82é' + #10 + '
\82µ
\82È
\82¢' + #10 + '
\82³
\82ê
\82é' + #10 + '
\82³
\82ê
\82È
\82¢'
556 WKANJI_DELIMITER = '
\93I' + #10 + '
\90«' + #10 + '
\8e®' + #10 + '
\89»' + #10 + '
\96@'
557 + #10 + '
\95s' + #10 + '
\96³' + #10 + '
\94ñ' + #10 + '
\94½'
559 WHIRA_FINAL_DELIMITER = '
\82Á
\82½' + #10 + '
\82Á
\82Ä'
561 + #10 + '
\82æ
\82Á
\82Ä' + #10 + '
\82µ
\82½
\82ª
\82Á
\82Ä' + #10 + '
\82È
\82Ì
\82Å'
562 + #10 + '
\82¾
\82©
\82ç' + #10 + '
\82Å
\82·
\82©
\82ç'
564 + #10 + '
\82µ
\82©
\82µ' + #10 + '
\82¾
\82ª' + #10 + '
\82¯
\82Ç' + #10 + '
\82¯
\82ê
\82Ç'
565 + #10 + '
\82â
\82Í
\82è' + #10 + '
\82â
\82Á
\82Ï
\82è'
566 + #10 + '
\82Å
\82µ' + #10 + '
\82¾
\82ë'
567 + #10 + '
\82·
\82é' + #10 + '
\82µ
\82È
\82¢' + #10 + '
\82µ
\82½' + #10 + '
\82µ
\82È
\82¢'
569 // Turn the long-vowel mark (U+30FC) into one of the small hiragana vowels (small a / i / u / e / o).
570 HA_LINE = '
\82 \82©
\82³
\82½
\82È
\82Í
\82Ü
\82â
\82ç
\82í
\82ª
\82´
\82¾
\82Î
\82Ï
\82\9f\82ì';
571 HI_LINE = '
\82¢
\82«
\82µ
\82¿
\82É
\82Ð
\82Ý
\82è
\82î
\82¬
\82¶
\82Ñ
\82Ò
\82¡';
572 HU_LINE = '
\82¤
\82
\82·
\82Â
\82Ê
\82Ó
\82Þ
\82ä
\82é
\82®
\82Ô
\82Õ
\82£';
573 HE_LINE = '
\82¦
\82¯
\82¹
\82Ä
\82Ë
\82Ö
\82ß
\82ê
\82ï
\82°
\82×
\82Ø
\82¥';
574 HO_LINE = '
\82¨
\82±
\82»
\82Æ
\82Ì
\82Ù
\82à
\82æ
\82ë
\82ð
\82²
\82Ú
\82Û
\82§';
575 KA_LINE = '
\83A
\83J
\83T
\83^
\83i
\83n
\83}
\83\84\83\89\83\8f\83K
\83U
\83_
\83o
\83p
\83@
\83\95\83\8e';
576 KI_LINE = '
\83C
\83L
\83V
\83`
\83j
\83q
\83~
\83\8a\83\90\83M
\83W
\83r
\83s
\83B';
577 KU_LINE = '
\83E
\83N
\83X
\83c
\83k
\83t
\83\80\83\86\83\8b\83O
\83u
\83v
\83D
\83\94';
578 KE_LINE = '
\83G
\83P
\83Z
\83e
\83l
\83w
\83\81\83\8c\83\91\83Q
\83x
\83y
\83F
\83\96';
579 KO_LINE = '
\83I
\83R
\83\
\83g
\83m
\83z
\83\82\83\88\83\8d\83\92\83S
\83{
\83|
\83H';
580 kKanji = [$80..$A0, $E0..$ff];
583 wHiraDelimiter := TStringList.Create;
584 wHiraFinalDelimiter := TStringList.Create;
585 wKanjiDelimiter := TStringList.Create;
586 words := TStringList.Create;
589 wHiraDelimiter.Text := WHIRA_DELIMITER;
590 wHiraFinalDelimiter.Text := WHIRA_FINAL_DELIMITER;
591 wKanjiDelimiter.Text := WKANJI_DELIMITER;
593 tail := p + Length( text );
596 while p < tail do begin
597 // Determine the type of the character
598 // Note: punctuation ends up in ModeGraph, so it needs no individual handling
599 // if Byte(Byte( p^ ) - $a1) < $5e then begin
600 if Byte( p^ ) in kKanji then begin
601 if p + 1 < tail then begin
602 ch := (PByte( p )^ shl 8) or PByte( p + 1 )^;
604 // Do not split words at a space; squeeze the space out instead
605 //$8140: newMode := ModeWhite;
606 $8141..$824e: newMode := ModeWGraph;
607 $824f..$8258: newMode := ModeWNum;
608 $8260..$829a: newMode := ModeWAlpha;
609 $829f..$82f1: newMode := ModeWHira;
610 $8340..$8396: newMode := ModeWKata;
611 else newMode := ModeWKanji;
613 // The voiced mark, the semi-voiced mark and the long-vowel mark ($814a, $814b, $815b) count as part of hiragana or katakana
614 if (mode = ModeWHira) or (mode = ModeWKata) then
615 if (ch = $814a) or (ch = $814b) or (ch = $815b) then
618 newMode := ModeWhite;
623 newMode := Modes( CharMode1[ Byte( p^ ) ] );
624 if (p^ = ' ') and (Ord( mode ) >= Ord( ModeWGraph )) then begin
625 // The text was Japanese up to here and the current character is a space:
626 // keep the words joined and squeeze the space out later
627 // Note: half-width kana are normally separated by spaces, so those are not squeezed
634 if mode <> newMode then begin
636 // The character type has changed
637 if mode <> ModeWhite then begin
638 SetLength( aWord, p - last );
639 CopyMemory( PChar( aWord ), last, p - last );
641 words.Text := changeMode( aWord, mode );
644 addWord( wordCount, words );
655 if mode <> ModeWhite then begin
656 aWord := Copy( last, 0, p - last );
657 words.Text := changeMode( aWord, mode );
660 addWord( wordCount, words );
664 wKanjiDelimiter.Free;
665 wHiraFinalDelimiter.Free;
671 //==============================
673 //==============================
// Scores a counted text with Paul Graham's method.
// NOTE(review): several statements (the results of the guard branches, the
// initial values of s and q, the loop header, the release of narray) are not
// visible in this excerpt.
674 function TGikoBayesian.CalcPaulGraham( wordCount : TWordCount ) : Extended;
// p: per-word probability computed from the learned counters of aWord
676 function p( const aWord : string ) : Single;
680 info := Objects[ aWord ];
683 else if info.NormalWord = 0 then
685 else if info.ImportantWord = 0 then
// Words with too little evidence (normal occurrences count double)
687 else if info.ImportantWord + info.NormalWord * 2 < 5 then
// Important frequency divided by (doubled normal frequency + important frequency)
690 Result := ( info.ImportantWord / info.ImportantText ) /
691 ((info.NormalWord * 2 / info.NormalText ) +
692 (info.ImportantWord / info.ImportantText));
704 if wordCount.Count = 0 then
707 narray := TList.Create;
709 for i := 0 to wordCount.Count - 1 do begin
// The Single result is stored in the list by casting it to Pointer
710 narray.Add( Pointer( p( wordCount[ i ] ) ) );
// Order by distance from 0.5 (see AbsSort)
713 narray.Sort( AbsSort );
// Use at most SAMPLE_COUNT entries
717 i := min( SAMPLE_COUNT, narray.Count );
// s accumulates the product of p, q the product of (1 - p)
721 s := s * Single( narray[ i ] );
722 q := q * (1 - Single( narray[ i ] ));
// NOTE(review): no guard against s + q = 0 is visible before this division
725 Result := s / (s + q);
732 //==============================
734 //==============================
// Scores a counted text with Gary Robinson's method.
// NOTE(review): several statements (the results of the guard branches, the
// constant k, the accumulation of mean and of Q1) are not visible in this excerpt.
735 function TGikoBayesian.CalcGaryRobinson( wordCount : TWordCount ) : Extended;
// p: per-word probability computed from the learned counters of aWord
737 function p( const aWord : string ) : Single;
741 info := Objects[ aWord ];
744 else if info.ImportantWord = 0 then
746 else if info.NormalWord = 0 then
// Important frequency divided by (normal frequency + important frequency)
749 Result := ( info.ImportantWord / info.ImportantText ) /
750 ((info.NormalWord / info.NormalText ) +
751 (info.ImportantWord / info.ImportantText));
// f: weighted average of the word probability n (weight cnt) and mean (weight k)
754 function f( cnt : Integer; n, mean : Single ) : Extended;
758 Result := ( (k * mean) + (cnt * n) ) / (k + cnt);
// narray holds one probability per word of wordCount
763 narray : array of Single;
765 countInfo : TWordCountInfo;
767 P1, Q1, R1 : Extended;
771 if wordCount.Count = 0 then begin
776 SetLength( narray, wordCount.Count );
778 for i := 0 to wordCount.Count - 1 do begin
779 n := p( wordCount[ i ] );
// mean becomes the average over all words of the text
783 mean := mean / wordCount.Count;
787 for i := 0 to wordCount.Count - 1 do begin
788 countInfo := TWordCountInfo( wordCount.Objects[ i ] );
789 n := f( countInfo.WordCount, narray[ i ], mean );
790 P1 := P1 * ( 1 - n );
793 cnt := wordCount.Count;
// One minus the cnt-th root of each product
797 P1 := 1 - Power( P1, 1 / cnt );
798 Q1 := 1 - Power( Q1, 1 / cnt );
// Guard for the division below
800 if P1 + Q1 = 0 then begin
// n lies in -1..1 and is mapped to 0..1
803 n := (P1 - Q1) / (P1 + Q1);
804 Result := (1 + n) / 2;
809 //==============================
810 // CalcGaryRobinsonFisher
811 //==============================
// Scores a counted text with the Gary Robinson-Fisher method.
// NOTE(review): many statements are missing from this excerpt (the results of
// the guard branches, the set-up of prbx, and the conditions that choose
// between the alternative P1 / Q1 statements below).
812 function TGikoBayesian.CalcGaryRobinsonFisher(
813 wordCount : TWordCount
// p: per-word probability computed from the learned counters of aWord
816 function p( const aWord : string ) : Single;
820 info := Objects[ aWord ];
823 else if info.ImportantWord = 0 then
825 else if info.NormalWord = 0 then
// The normal count is rescaled by ImportantText / NormalText before comparing
828 Result := info.ImportantWord /
829 (info.ImportantWord + info.NormalWord *
830 info.ImportantText / info.NormalText);
// f: weighted average of the word probability n (weight cnt) and mean (weight k)
833 function f( cnt : Integer; n, mean : Single ) : Extended;
837 Result := ( (k * mean) + (cnt * n) ) / (k + cnt);
// prbx: sums exp( term ) while term grows by ln( m / i ) on each step;
// presumably the chi-square tail probability used by Fisher's method - TODO confirm
840 function prbx( x2, degree : Extended ) : Extended;
853 while i < (degree / 2 - 1) do begin
854 term := term + ln( m / i );
855 sum := sum + exp( term );
// narray holds one probability per word of wordCount
868 narray : array of Single;
870 countInfo : TWordCountInfo;
873 important : Extended;
878 if wordCount.Count = 0 then begin
883 SetLength( narray, wordCount.Count );
885 for i := 0 to wordCount.Count - 1 do begin
886 n := p( wordCount[ i ] );
// mean becomes the average over all words of the text
890 mean := mean / wordCount.Count;
900 for i := 0 to wordCount.Count - 1 do begin
901 countInfo := TWordCountInfo( wordCount.Objects[ i ] );
// NOTE(review): countInfo is dereferenced on the next line, before the nil
// check that follows it - the check cannot protect this access
902 n := f( countInfo.WordCount, narray[ i ], mean );
903 if countInfo <> nil then
904 cnt := cnt + countInfo.WordCount;
// Sums of logarithms, weighted by the number of occurrences
906 P1 := P1 + Ln( 1 - n ) * countInfo.WordCount;
907 Q1 := Q1 + Ln( n ) * countInfo.WordCount;
909 P1 := P1 + Ln( 1 - n );
916 P1 := prbx( -2 * P1, 2 * cnt );
917 Q1 := prbx( -2 * Q1, 2 * cnt );
919 P1 := prbx( -2 * Ln( P1 ), 2 * cnt );
920 Q1 := prbx( -2 * Ln( Q1 ), 2 * cnt );
922 if P1 + Q1 = 0 then begin
// NOTE(review): if P1 and Q1 are both within 0..1 this sum form can exceed
// 1.0 - confirm the sign convention of P1 / Q1 in the full source
925 Result := (1 + Q1 + P1) / 2;
930 //==============================
932 //==============================
// Counts the words of text into wordCount, then returns the score computed by
// the Calc routine that matches algorithm.
933 function TGikoBayesian.Parse(
935 wordCount : TWordCount;
936 algorithm : TGikoBayesianAlgorithm
940 CountWord( text, wordCount );
942 gbaPaulGraham: Result := CalcPaulGraham( wordCount );
943 gbaGaryRobinson: Result := CalcGaryRobinson( wordCount );
944 gbaGaryRobinsonFisher:
945 Result := CalcGaryRobinsonFisher( wordCount );
951 //==============================
953 //==============================
// Adds the words of one counted text to the learned statistics.
// For each word the occurrence counter grows by the number of occurrences in
// the text and the text counter grows by one, on the important or the normal
// side depending on isImportant.
954 procedure TGikoBayesian.Learn(
955 wordCount : TWordCount;
956 isImportant : Boolean );
959 wordinfo : TWordInfo;
960 countinfo : TWordCountInfo;
964 for i := 0 to wordCount.Count - 1 do begin
965 aWord := wordCount[ i ];
966 wordinfo := Objects[ aWord ];
967 countinfo := TWordCountInfo( wordCount.Objects[ i ] );
// First time this word is seen: create its record and register it
968 if wordinfo = nil then begin
969 wordinfo := TWordInfo.Create;
970 Objects[ aWord ] := wordinfo;
973 if isImportant then begin
974 wordinfo.ImportantWord := wordinfo.ImportantWord + countinfo.WordCount;
975 wordinfo.ImportantText := wordinfo.ImportantText + 1;
977 wordinfo.NormalWord := wordinfo.NormalWord + countinfo.WordCount;
978 wordinfo.NormalText := wordinfo.NormalText + 1;
984 //==============================
986 //==============================
// Removes the words of one counted text from the learned statistics; the
// counterpart of Learn.
// NOTE(review): only the text counter is checked for > 0; the occurrence
// counter is reduced by countinfo.WordCount without a lower bound, so it can
// become negative if the text was not learned with the same isImportant.
// NOTE(review): the statement executed when the word is unknown is not
// visible in this excerpt.
987 procedure TGikoBayesian.Forget(
988 wordCount : TWordCount;
989 isImportant : Boolean );
992 wordinfo : TWordInfo;
993 countinfo : TWordCountInfo;
997 for i := 0 to wordCount.Count - 1 do begin
998 aWord := wordCount[ i ];
999 wordinfo := Objects[ aWord ];
1000 if wordinfo = nil then
1003 countinfo := TWordCountInfo( wordCount.Objects[ i ] );
1004 if isImportant then begin
1005 if wordInfo.ImportantText > 0 then begin
1006 wordinfo.ImportantText := wordinfo.ImportantText - 1;
1007 wordinfo.ImportantWord := wordinfo.ImportantWord - countinfo.WordCount;
1010 if wordinfo.NormalText > 0 then begin
1011 wordinfo.NormalText := wordinfo.NormalText - 1;
1012 wordinfo.NormalWord := wordinfo.NormalWord - countinfo.WordCount;