5 \brief
\83x
\83C
\83W
\83A
\83\93\83t
\83B
\83\8b\83^
8 $Id: GikoBayesian.pas,v 1.21 2006/06/26 14:57:15 h677 Exp $
11 //!
\95½
\89¼
\96¼
\82ð
\8e«
\8f\91\82É
\8aÜ
\82ß
\82È
\82¢
12 {$DEFINE GIKO_BAYESIAN_NO_HIRAGANA_DIC}
16 //==================================================
18 //==================================================
21 //==================================================
23 //==================================================
25 {!***********************************************************
26 \brief
\92P
\8cê
\83v
\83\8d\83p
\83e
\83B
27 ************************************************************}
28 TWordInfo = class( TObject )
30 FNormalWord : Integer; //!<
\92Ê
\8fí
\82Ì
\92P
\8cê
\82Æ
\82µ
\82Ä
\93o
\8fê
\82µ
\82½
\89ñ
\90\94
31 FImportantWord : Integer; //!<
\92\8d\96Ú
\92P
\8cê
\82Æ
\82µ
\82Ä
\93o
\8fê
\82µ
\82½
\89ñ
\90\94
32 FNormalText : Integer; //!<
\92Ê
\8fí
\82Ì
\92P
\8cê
\82Æ
\82µ
\82Ä
\8aÜ
\82Ü
\82ê
\82Ä
\82¢
\82½
\95¶
\8fÍ
\82Ì
\90\94
33 FImportantText : Integer; //!<
\92\8d\96Ú
\92P
\8cê
\82Æ
\82µ
\82Ä
\8aÜ
\82Ü
\82ê
\82Ä
\82¢
\82½
\95¶
\8fÍ
\82Ì
\90\94
36 property NormalWord : Integer read FNormalWord write FNormalWord;
37 property ImportantWord : Integer read FImportantWord write FImportantWord;
38 property NormalText : Integer read FNormalText write FNormalText;
39 property ImportantText : Integer read FImportantText write FImportantText;
42 {!***********************************************************
43 \brief
\89ð
\90Í
\8dÏ
\82Ý
\92P
\8cê
\83v
\83\8d\83p
\83e
\83B
44 ************************************************************}
45 TWordCountInfo = class( TObject )
47 FWordCount : Integer; //!<
\92P
\8cê
\90\94
50 property WordCount : Integer read FWordCount write FWordCount;
53 {!***********************************************************
54 \brief
\89ð
\90Í
\8dÏ
\82Ý
\92P
\8cê
\83\8a\83X
\83g
55 ************************************************************}
56 // TWordCount = class( THashedStringList ) //
\8c\83\92x
57 TWordCount = class( TStringList )
60 destructor Destroy; override;
63 {!***********************************************************
64 \brief
\83t
\83B
\83\8b\83^
\83A
\83\8b\83S
\83\8a\83Y
\83\80
65 ************************************************************}
66 TGikoBayesianAlgorithm =
67 (gbaPaulGraham, gbaGaryRobinson, gbaGaryRobinsonFisher);
69 {!***********************************************************
70 \brief
\83x
\83C
\83W
\83A
\83\93\83t
\83B
\83\8b\83^
71 ************************************************************}
72 // TGikoBayesian = class( THashedStringList ) //
\8c\83\92x
73 TGikoBayesian = class( TStringList )
75 FFilePath : string; //!<
\93Ç
\82Ý
\8d\9e\82ñ
\82¾
\83t
\83@
\83C
\83\8b\83p
\83X
76 function GetObject( const name : string ) : TWordInfo;
77 procedure SetObject( const name : string; value : TWordInfo );
81 destructor Destroy; override;
83 //!
\83t
\83@
\83C
\83\8b\82©
\82ç
\8aw
\8fK
\97\9a\97ð
\82ð
\93Ç
\82Ý
\8fo
\82µ
\82Ü
\82·
84 procedure LoadFromFile( const filePath : string );
86 //!
\83t
\83@
\83C
\83\8b\82É
\8aw
\8fK
\97\9a\97ð
\82ð
\95Û
\91¶
\82µ
\82Ü
\82·
87 procedure SaveToFile( const filePath : string );
89 //!
\83t
\83@
\83C
\83\8b\82É
\8aw
\8fK
\97\9a\97ð
\82ð
\95Û
\91¶
\82µ
\82Ü
\82·
92 //!
\92P
\8cê
\82É
\91Î
\82·
\82é
\8fî
\95ñ
\82ð
\8eæ
\93¾
\82µ
\82Ü
\82·
93 property Objects[ const name : string ] : TWordInfo
94 read GetObject write SetObject; default;
96 //!
\95¶
\8fÍ
\82É
\8aÜ
\82Ü
\82ê
\82é
\92P
\8cê
\82ð
\83J
\83E
\83\93\83g
\82µ
\82Ü
\82·
99 wordCount : TWordCount );
102 \brief Paul Graham
\96@
\82É
\8aî
\82Ã
\82¢
\82Ä
\95¶
\8fÍ
\82Ì
\92\8d\96Ú
\93x
\82ð
\8c\88\92è
\82µ
\82Ü
\82·
103 \return
\95¶
\8fÍ
\82Ì
\92\8d\96Ú
\93x (
\92\8d\96Ú
\82É
\92l
\82µ
\82È
\82¢ 0.0
\81`1.0
\92\8d\96Ú
\82·
\82×
\82«)
105 function CalcPaulGraham( wordCount : TWordCount ) : Extended;
108 \brief GaryRobinson
\96@
\82É
\8aî
\82Ã
\82¢
\82Ä
\95¶
\8fÍ
\82Ì
\92\8d\96Ú
\93x
\82ð
\8c\88\92è
\82µ
\82Ü
\82·
109 \return
\95¶
\8fÍ
\82Ì
\92\8d\96Ú
\93x (
\92\8d\96Ú
\82É
\92l
\82µ
\82È
\82¢ 0.0
\81`1.0
\92\8d\96Ú
\82·
\82×
\82«)
111 function CalcGaryRobinson( wordCount : TWordCount ) : Extended;
114 \brief GaryRobinson-Fisher
\96@
\82É
\8aî
\82Ã
\82¢
\82Ä
\95¶
\8fÍ
\82Ì
\92\8d\96Ú
\93x
\82ð
\8c\88\92è
\82µ
\82Ü
\82·
115 \return
\95¶
\8fÍ
\82Ì
\92\8d\96Ú
\93x (
\92\8d\96Ú
\82É
\92l
\82µ
\82È
\82¢ 0.0
\81`1.0
\92\8d\96Ú
\82·
\82×
\82«)
117 function CalcGaryRobinsonFisher( wordCount : TWordCount ) : Extended;
120 \brief
\95¶
\8fÍ
\82ð
\89ð
\90Í
121 \param text
\89ð
\90Í
\82·
\82é
\95¶
\8fÍ
122 \param wordCount
\89ð
\90Í
\82³
\82ê
\82½
\92P
\8cê
\83\8a\83X
\83g
\82ª
\95Ô
\82é
123 \param algorithm
\92\8d\96Ú
\93x
\82Ì
\8c\88\92è
\82É
\97p
\82¢
\82é
\83A
\83\8b\83S
\83\8a\83Y
\83\80\82ð
\8ew
\92è
\82µ
\82Ü
\82·
124 \return
\95¶
\8fÍ
\82Ì
\92\8d\96Ú
\93x (
\92\8d\96Ú
\82É
\92l
\82µ
\82È
\82¢ 0.0
\81`1.0
\92\8d\96Ú
\82·
\82×
\82«)
126 CountWord
\82Æ Calcxxxxx
\82ð
\82Ü
\82Æ
\82ß
\82Ä
\8eÀ
\8ds
\82·
\82é
\82¾
\82¯
\82Å
\82·
\81B
130 wordCount : TWordCount;
131 algorithm : TGikoBayesianAlgorithm = gbaGaryRobinsonFisher
135 \brief
\8aw
\8fK
\82·
\82é
136 \param wordCount Parse
\82Å
\89ð
\90Í
\82³
\82ê
\82½
\92P
\8cê
\83\8a\83X
\83g
137 \param isImportant
\92\8d\96Ú
\82·
\82×
\82«
\95¶
\8fÍ
\82Æ
\82µ
\82Ä
\8ao
\82¦
\82é
\82È
\82ç True
140 wordCount : TWordCount;
141 isImportant : Boolean );
144 \brief
\8aw
\8fK
\8c\8b\89Ê
\82ð
\96Y
\82ê
\82é
145 \param wordCount Parse
\82Å
\89ð
\90Í
\82³
\82ê
\82½
\92P
\8cê
\83\8a\83X
\83g
146 \param isImportant
\92\8d\96Ú
\82·
\82×
\82«
\95¶
\8fÍ
\82Æ
\82µ
\82Ä
\8ao
\82¦
\82ç
\82ê
\82Ä
\82¢
\82½
\82È
\82ç True
147 \warning
\8aw
\8fK
\8dÏ
\82Ý
\82Ì
\95¶
\8fÍ
\82©
\82Ç
\82¤
\82©
\82Í
\8am
\94F
\8fo
\97\88\82Ü
\82¹
\82ñ
\81B<br>
148 Learn
\82µ
\82Ä
\82¢
\82È
\82¢
\95¶
\8fÍ
\82â isImportant
\82ª
\8aÔ
\88á
\82Á
\82Ä
\82¢
\82é
\95¶
\8fÍ
\82ð
149 Forget
\82·
\82é
\82Æ
\83f
\81[
\83^
\83x
\81[
\83X
\82ª
\94j
\91¹
\82µ
\82Ü
\82·
\81B<br>
150 \8aw
\8fK
\8dÏ
\82Ý
\82©
\82Ç
\82¤
\82©
\82Í
\93Æ
\8e©
\82É
\8aÇ
\97\9d\82µ
\82Ä
\82
\82¾
\82³
\82¢
\81B
152 \91S
\82Ä
\82Ì
\8aw
\8fK
\8c\8b\89Ê
\82ð
\83N
\83\8a\83A
\82·
\82é
\82í
\82¯
\82Å
\82Í
\82 \82è
\82Ü
\82¹
\82ñ
\81B<br>
153 wordCount
\82ð
\93¾
\82½
\95¶
\8fÍ (Parse
\82Ì text
\88ø
\90\94)
\82Ì
\8aw
\8fK
\8c\8b\89Ê
\82Ì
\82Ý
\83N
\83\8a\83A
\82µ
\82Ü
\82·
\81B<br><br>
155 \8eå
\82É
\92\8d\96Ú
\95¶
\8fÍ
\82Æ
\94ñ
\92\8d\96Ú
\95¶
\8fÍ
\82ð
\90Ø
\82è
\91Ö
\82¦
\82é
\82½
\82ß
\82É Forget -> Learn
\82Ì
\8f\87\82Å
\8eg
\97p
\82µ
\82Ü
\82·
\81B
158 wordCount : TWordCount;
159 isImportant : Boolean );
162 //==================================================
164 //==================================================
167 SysUtils, Math, Windows,
171 GIKO_BAYESIAN_FILE_VERSION = '1.0';
173 Modes = (ModeWhite, ModeGraph, ModeAlpha, ModeHanKana, ModeNum,
174 ModeWGraph, ModeWAlpha, ModeWNum,
175 ModeWHira, ModeWKata, ModeWKanji);
177 CharMode1 : array [ 0..255 ] of Byte =
179 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
180 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
181 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
182 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
183 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
184 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1,
185 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
186 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 0,
188 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
189 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
190 0, 1, 1, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
191 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
192 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
193 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
194 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
195 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
198 //************************************************************
200 //************************************************************
202 //==============================
204 //==============================
205 function RemoveToken(var s: string;const delimiter: string): string;
209 p := AnsiPos(delimiter, s);
213 Result := Copy(s, 1, p - 1);
214 s := Copy(s, Length(Result) + Length(delimiter) + 1, Length(s));
217 //==============================
219 //==============================
220 function AbsSort( p1, p2 : Pointer ) : Integer;
225 v1 := Abs( Single( p1 ) - 0.5 );
226 v2 := Abs( Single( p2 ) - 0.5 );
236 //************************************************************
238 //************************************************************
239 constructor TWordCount.Create;
242 Duplicates := dupIgnore;
243 CaseSensitive := True;
248 destructor TWordCount.Destroy;
253 for i := Count - 1 downto 0 do
254 if Objects[ i ] <> nil then
261 //************************************************************
262 // TGikoBayesian class
263 //************************************************************
265 //==============================
267 //==============================
268 constructor TGikoBayesian.Create;
271 Duplicates := dupIgnore;
272 CaseSensitive := True;
277 //==============================
279 //==============================
280 destructor TGikoBayesian.Destroy;
285 for i := Count - 1 downto 0 do
286 if inherited Objects[ i ] <> nil then
287 inherited Objects[ i ].Free;
293 procedure TGikoBayesian.LoadFromFile( const filePath : string );
302 FFilePath := filePath;
304 if not FileExists( filePath ) then
307 sl := TStringList.Create;
309 sl.LoadFromFile( filePath );
311 for i := 1 to sl.Count - 1 do begin
313 name := RemoveToken( s, #1 );
314 info := TWordInfo.Create;
315 info.NormalWord := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
316 info.ImportantWord := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
317 info.NormalText := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
318 info.ImportantText := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
320 AddObject( name, info );
328 procedure TGikoBayesian.SaveToFile( const filePath : string );
336 FFilePath := filePath;
338 sl := TStringList.Create;
341 sl.Add( GIKO_BAYESIAN_FILE_VERSION );
343 for i := 0 to Count - 1 do begin
344 info := TWordInfo( inherited Objects[ i ] );
345 s := Strings[ i ] + #1
346 + Format('%x', [info.NormalWord]) + #1
347 + Format('%x', [info.ImportantWord]) + #1
348 + Format('%x', [info.NormalText]) + #1
349 + Format('%x', [info.ImportantText]);
354 sl.SaveToFile( filePath );
361 procedure TGikoBayesian.Save;
364 if FFilePath <> '' then
365 SaveToFile( FFilePath );
369 //==============================
371 //==============================
372 function TGikoBayesian.GetObject( const name : string ) : TWordInfo;
377 if Find( name, idx ) then
378 Result := TWordInfo( inherited Objects[ idx ] )
384 //==============================
386 //==============================
387 procedure TGikoBayesian.SetObject( const name : string; value : TWordInfo );
392 if Find( name, idx ) then
393 inherited Objects[ idx ] := value
395 AddObject( name, value );
400 //==============================
402 //==============================
403 procedure TGikoBayesian.CountWord(
405 wordCount : TWordCount );
407 Modes = (ModeWhite, ModeGraph, ModeAlpha, ModeNum, ModeHanKana,
408 ModeWGraph, ModeWAlpha, ModeWNum,
409 ModeWHira, ModeWKata, ModeWKanji);
411 p, tail, last : PChar;
412 mode, newMode : Modes;
415 wHiraDelimiter : TStringList;
416 wHiraFinalDelimiter : TStringList;
417 wKanjiDelimiter : TStringList;
420 // countInfo : TWordCountInfo;
422 function cutBoth( _aWord : string; _delim : TStringList ) : string;
426 for _i := 0 to _delim.Count - 1 do begin
427 _aWord := CustomStringReplace(
430 #10 + _delim[ _i ] + #10, False );
435 function cutFirst( _aWord : string; _delim : TStringList ) : string;
439 for _i := 0 to _delim.Count - 1 do begin
440 _aWord := CustomStringReplace(
443 #10 + _delim[ _i ], False );
448 function cutFinal( _aWord : string; _delim : TStringList ) : string;
452 for _i := 0 to _delim.Count - 1 do begin
453 _aWord := CustomStringReplace(
456 _delim[ _i ] + #10, False );
461 procedure addWord( _dst : TWordCount; _words : TStringList );
465 _countInfo : TWordCountInfo;
467 for _i := 0 to _words.Count - 1 do begin
468 _aWord := _words[ _i ];
469 if Length( _aWord ) > 0 then begin
470 if _dst.Find( _aWord, _idx ) then begin
471 _countInfo := TWordCountInfo( _dst.Objects[ _idx ] );
473 _countInfo := TWordCountInfo.Create;
474 _dst.AddObject( _aWord, _countInfo );
476 _countInfo.WordCount := _countInfo.WordCount + 1;
481 function changeMode( _aWord : string; _mode : Modes ) : string;
485 _pWord, _pWord2 : PChar;
486 _pWordTail, _pFound : PChar;
488 _delim : string = #10;
490 {$IFDEF GIKO_BAYESIAN_NO_HIRAGANA_DIC}
491 if mode = ModeWHira then begin
496 if Ord( _mode ) >= Ord( ModeWGraph ) then begin
498 //
\83X
\83y
\81[
\83X
\82ð
\8bl
\82ß
\82é
499 _aWord := CustomStringReplace( _aWord, ' ', '', False );
500 _aWord := CustomStringReplace( _aWord, '
\81@', '', False );
502 //
\83f
\83\8a\83~
\83^
\82Å
\92P
\8cê
\95ª
\82¯
506 _aWord := cutFinal( _aWord, wHiraFinalDelimiter );
507 Result := cutBoth( _aWord, wHiraDelimiter );
512 //
\83f
\83\8a\83~
\83^
\82Å
\92P
\8cê
\95ª
\82¯
513 _aWord := cutBoth( _aWord, wKanjiDelimiter );
514 // 4 byte (2
\8e\9a)
\82¸
\82Â
\82Å
\92P
\8cê
\95ª
\82¯
515 _pWord := PChar( _aWord );
516 _i := Length( _aWord );
517 _pWordTail := _pWord + _i;
518 SetLength( _aWord2, _i + (_i shr 2) );
519 _pWord2 := PChar( _aWord2 );
521 while _pWord < _pWordTail do begin
522 _pFound := AnsiStrPos( _pWord, PChar( _delim ) );
523 if _pFound = nil then
524 _pFound := _pWordTail;
525 _pFound := _pFound - 3;
527 while _pWord <= _pFound do begin
528 CopyMemory( _pWord2, _pWord, 4 ); _pWord2[ 4 ] := #10;
529 _pWord2 := _pWord2 + 5; _pWord := _pWord + 4;
531 _i := _pFound + 4 - _pWord; // 4 = 3 + #10
532 CopyMemory( _pWord2, _pWord, _i );
533 _pWord2 := _pWord2 + _i; _pWord := _pWord + _i;
535 if _pWord < _pWordTail then begin
536 _i := _pWordTail - _pWord;
537 CopyMemory( _pWord2, _pWord, _i );
538 _pWord2 := _pWord2 + _i;
540 SetLength( _aWord2, _pWord2 - PChar( _aWord2 ) );
553 WHIRA_DELIMITER = '
\82ð' + #10 + '
\82É' + #10 + '
\82ª' + #10 + '
\82Æ' + #10 + '
\82©
\82ç'
554 + #10 + '
\82Ö' + #10 + '
\82æ
\82è' + #10 + '
\82Ü
\82Å'+ #10 + '
\82Å'
555 + #10 + '
\82±
\82±' + #10 + '
\82»
\82±' + #10 + '
\82Ç
\82±'
556 + #10 + '
\82±
\82ê' + #10 + '
\82»
\82ê' + #10 + '
\82 \82ê' + #10 + '
\82Ç
\82ê'
557 + #10 + '
\82±
\82Ì' + #10 + '
\82»
\82Ì' + #10 + '
\82 \82Ì' + #10 + '
\82Ç
\82Ì'
558 + #10 + '
\82±
\82¤' + #10 + '
\82»
\82¤' + #10 + '
\82 \82 ' + #10 + '
\82Ç
\82¤'
559 + #10 + '
\82±
\82ñ
\82È' + #10 + '
\82»
\82ñ
\82È' + #10 + '
\82 \82ñ
\82È' + #10 + '
\82Ç
\82ñ
\82È'
560 + #10 + '
\82ê
\82½' + #10 + '
\82ê
\82Ä' + #10 + '
\82ê
\82ê' + #10 + '
\82ê
\82ë'
561 + #10 + '
\82ê
\82é' + #10 + '
\82ç
\82ê
\82é'
562 + #10 + '
\82Å
\82·' + #10 + '
\82Ü
\82·' + #10 + '
\82Ü
\82¹
\82ñ'
563 + #10 + '
\82Å
\82µ
\82½' + #10 + '
\82Ü
\82µ
\82½'
564 + #10 + '
\82·
\82é' + #10 + '
\82µ
\82È
\82¢' + #10 + '
\82³
\82ê
\82é' + #10 + '
\82³
\82ê
\82È
\82¢'
566 WKANJI_DELIMITER = '
\93I' + #10 + '
\90«' + #10 + '
\8e®' + #10 + '
\89»' + #10 + '
\96@'
567 + #10 + '
\95s' + #10 + '
\96³' + #10 + '
\94ñ' + #10 + '
\94½'
569 WHIRA_FINAL_DELIMITER = '
\82Á
\82½' + #10 + '
\82Á
\82Ä'
571 + #10 + '
\82æ
\82Á
\82Ä' + #10 + '
\82µ
\82½
\82ª
\82Á
\82Ä' + #10 + '
\82È
\82Ì
\82Å'
572 + #10 + '
\82¾
\82©
\82ç' + #10 + '
\82Å
\82·
\82©
\82ç'
574 + #10 + '
\82µ
\82©
\82µ' + #10 + '
\82¾
\82ª' + #10 + '
\82¯
\82Ç' + #10 + '
\82¯
\82ê
\82Ç'
575 + #10 + '
\82â
\82Í
\82è' + #10 + '
\82â
\82Á
\82Ï
\82è'
576 + #10 + '
\82Å
\82µ' + #10 + '
\82¾
\82ë'
577 + #10 + '
\82·
\82é' + #10 + '
\82µ
\82È
\82¢' + #10 + '
\82µ
\82½' + #10 + '
\82µ
\82È
\82¢'
579 // '
\81['
\82ð '
\82\9f\82¡
\82£
\82¥
\82§'
\82É
\81B
580 HA_LINE = '
\82 \82©
\82³
\82½
\82È
\82Í
\82Ü
\82â
\82ç
\82í
\82ª
\82´
\82¾
\82Î
\82Ï
\82\9f\82ì';
581 HI_LINE = '
\82¢
\82«
\82µ
\82¿
\82É
\82Ð
\82Ý
\82è
\82î
\82¬
\82¶
\82Ñ
\82Ò
\82¡';
582 HU_LINE = '
\82¤
\82
\82·
\82Â
\82Ê
\82Ó
\82Þ
\82ä
\82é
\82®
\82Ô
\82Õ
\82£';
583 HE_LINE = '
\82¦
\82¯
\82¹
\82Ä
\82Ë
\82Ö
\82ß
\82ê
\82ï
\82°
\82×
\82Ø
\82¥';
584 HO_LINE = '
\82¨
\82±
\82»
\82Æ
\82Ì
\82Ù
\82à
\82æ
\82ë
\82ð
\82²
\82Ú
\82Û
\82§';
585 KA_LINE = '
\83A
\83J
\83T
\83^
\83i
\83n
\83}
\83\84\83\89\83\8f\83K
\83U
\83_
\83o
\83p
\83@
\83\95\83\8e';
586 KI_LINE = '
\83C
\83L
\83V
\83`
\83j
\83q
\83~
\83\8a\83\90\83M
\83W
\83r
\83s
\83B';
587 KU_LINE = '
\83E
\83N
\83X
\83c
\83k
\83t
\83\80\83\86\83\8b\83O
\83u
\83v
\83D
\83\94';
588 KE_LINE = '
\83G
\83P
\83Z
\83e
\83l
\83w
\83\81\83\8c\83\91\83Q
\83x
\83y
\83F
\83\96';
589 KO_LINE = '
\83I
\83R
\83\
\83g
\83m
\83z
\83\82\83\88\83\8d\83\92\83S
\83{
\83|
\83H';
590 kKanji = [$80..$A0, $E0..$ff];
593 wHiraDelimiter := TStringList.Create;
594 wHiraFinalDelimiter := TStringList.Create;
595 wKanjiDelimiter := TStringList.Create;
596 words := TStringList.Create;
599 {$IFNDEF GIKO_BAYESIAN_NO_HIRAGANA_DIC}
600 wHiraDelimiter.Text := WHIRA_DELIMITER;
601 wHiraFinalDelimiter.Text := WHIRA_FINAL_DELIMITER;
603 wKanjiDelimiter.Text := WKANJI_DELIMITER;
605 tail := p + Length( text );
608 while p < tail do begin
609 //
\95¶
\8e\9a\82Ì
\83^
\83C
\83v
\82ð
\94»
\95Ê
610 //
\81¦
\8bå
\93Ç
\93_
\82Í ModeGraph
\82É
\82È
\82é
\82Ì
\82Å
\8cÂ
\95Ê
\82É
\91Î
\89\9e\82µ
\82È
\82
\82Ä
\82à
\82¢
\82¢
611 // if Byte(Byte( p^ ) - $a1) < $5e then begin
612 if Byte( p^ ) in kKanji then begin
613 if p + 1 < tail then begin
614 ch := (PByte( p )^ shl 8) or PByte( p + 1 )^;
616 //
\83X
\83y
\81[
\83X
\82Å
\92P
\8cê
\95ª
\82¯
\82¹
\82¸
\82É
\8bl
\82ß
\82é
617 //$8140: newMode := ModeWhite;
618 $8141..$824e: newMode := ModeWGraph;
619 $824f..$8258: newMode := ModeWNum;
620 $8260..$829a: newMode := ModeWAlpha;
621 $829f..$82f1: newMode := ModeWHira;
622 $8340..$8396: newMode := ModeWKata;
623 else newMode := ModeWKanji;
625 // '
\81J
\81K
\81['
\82Í
\95½
\89¼
\96¼
\81A
\82Ü
\82½
\82Í
\83J
\83^
\83J
\83i
\82É
\8aÜ
\82Ü
\82ê
\82é
626 if (mode = ModeWHira) or (mode = ModeWKata) then
627 if (ch = $814a) or (ch = $814b) or (ch = $815b) then
630 newMode := ModeWhite;
635 newMode := Modes( CharMode1[ Byte( p^ ) ] );
636 if (p^ = ' ') and (Ord( mode ) >= Ord( ModeWGraph )) then begin
637 //
\8d¡
\82Ü
\82Å
\93ú
\96{
\8cê
\82Å
\8d¡
\83X
\83y
\81[
\83X
638 //
\92P
\8cê
\82ð
\8cq
\82°
\82Ä
\8cã
\82Å
\83X
\83y
\81[
\83X
\82ð
\8bl
\82ß
\82é
639 //
\81¦
\94¼
\8ap
\83J
\83i
\82Í
\92Ê
\8fí
\83X
\83y
\81[
\83X
\82Å
\8bæ
\90Ø
\82é
\82¾
\82ë
\82¤
\82©
\82ç
\8bl
\82ß
\82È
\82¢
646 if mode <> newMode then begin
648 //
\95¶
\8e\9a\82Ì
\83^
\83C
\83v
\82ª
\95Ï
\8dX
\82³
\82ê
\82½
649 if mode <> ModeWhite then begin
650 SetLength( aWord, p - last );
651 CopyMemory( PChar( aWord ), last, p - last );
653 words.Text := changeMode( aWord, mode );
656 addWord( wordCount, words );
667 if mode <> ModeWhite then begin
668 SetLength( aWord, p - last );
669 CopyMemory( PChar( aWord ), last, p - last );
671 words.Text := changeMode( aWord, mode );
674 addWord( wordCount, words );
678 wKanjiDelimiter.Free;
679 wHiraFinalDelimiter.Free;
685 //==============================
687 //==============================
688 function TGikoBayesian.CalcPaulGraham( wordCount : TWordCount ) : Extended;
690 function p( const aWord : string ) : Single;
694 info := Objects[ aWord ];
697 else if info.NormalWord = 0 then
699 else if info.ImportantWord = 0 then
701 else if info.ImportantWord + info.NormalWord * 2 < 5 then
705 Result := ( info.ImportantWord / info.ImportantText ) /
706 ((info.NormalWord * 2 / info.NormalText ) +
707 (info.ImportantWord / info.ImportantText));
709 on EZeroDivide do Result := 0.99;
723 if wordCount.Count = 0 then
726 narray := TList.Create;
728 for i := 0 to wordCount.Count - 1 do begin
729 narray.Add( Pointer( p( wordCount[ i ] ) ) );
732 narray.Sort( AbsSort );
736 i := min( SAMPLE_COUNT, narray.Count );
740 s := s * Single( narray[ i ] );
741 q := q * (1 - Single( narray[ i ] ));
744 Result := s / (s + q);
754 //==============================
756 //==============================
757 function TGikoBayesian.CalcGaryRobinson( wordCount : TWordCount ) : Extended;
759 function p( const aWord : string ) : Single;
763 info := Objects[ aWord ];
766 else if info.ImportantWord = 0 then
768 else if info.NormalWord = 0 then
772 Result := ( info.ImportantWord / info.ImportantText ) /
773 ((info.NormalWord / info.NormalText ) +
774 (info.ImportantWord / info.ImportantText));
777 Result := (info.ImportantWord * info.NormalText) /
778 (info.NormalWord * info.ImportantText +
779 info.ImportantWord * info.NormalText);
785 function f( cnt : Integer; n, mean : Single ) : Extended;
789 Result := ( (k * mean) + (cnt * n) ) / (k + cnt);
794 narray : array of Single;
796 countInfo : TWordCountInfo;
798 P1, Q1{, R1} : Extended;
802 if wordCount.Count = 0 then begin
807 SetLength( narray, wordCount.Count );
809 for i := 0 to wordCount.Count - 1 do begin
810 n := p( wordCount[ i ] );
814 mean := mean / wordCount.Count;
818 for i := 0 to wordCount.Count - 1 do begin
819 countInfo := TWordCountInfo( wordCount.Objects[ i ] );
820 n := f( countInfo.WordCount, narray[ i ], mean );
821 P1 := P1 * ( 1 - n );
824 cnt := wordCount.Count;
828 P1 := 1 - Power( P1, 1 / cnt );
832 Q1 := 1 - Power( Q1, 1 / cnt );
836 if P1 + Q1 = 0 then begin
839 n := (P1 - Q1) / (P1 + Q1);
840 Result := (1 + n) / 2;
845 //==============================
846 // CalcGaryRobinsonFisher
847 //==============================
848 function TGikoBayesian.CalcGaryRobinsonFisher(
849 wordCount : TWordCount
852 function p( const aWord : string ) : Single;
856 info := Objects[ aWord ];
859 else if info.ImportantWord = 0 then
861 else if info.NormalWord = 0 then
865 Result := ( info.ImportantWord / info.ImportantText ) /
866 ((info.NormalWord / info.NormalText ) +
867 (info.ImportantWord / info.ImportantText));
869 Result := (info.ImportantWord * info.NormalText) /
870 (info.NormalWord * info.ImportantText +
871 info.ImportantWord * info.NormalText);
874 function f( cnt : Integer; n, mean : Single ) : Extended;
878 Result := ( (k * mean) + (cnt * n) ) / (k + cnt);
881 function prbx( x2, degree : Extended ) : Extended;
890 narray : array of Single;
892 countInfo : TWordCountInfo;
894 // normal : Extended;
895 // important : Extended;
900 if wordCount.Count = 0 then begin
905 SetLength( narray, wordCount.Count );
907 for i := 0 to wordCount.Count - 1 do begin
908 n := p( wordCount[ i ] );
912 mean := mean / wordCount.Count;
916 for i := 0 to wordCount.Count - 1 do begin
917 countInfo := TWordCountInfo( wordCount.Objects[ i ] );
918 n := f( countInfo.WordCount, narray[ i ], mean );
919 P1 := P1 * ( 1 - n );
922 cnt := wordCount.Count;
926 P1 := Power( P1, 1 / cnt );
930 Q1 := Power( Q1, 1 / cnt );
934 P1 := 1 - prbx( -2 * Ln( P1 ), 2 * cnt );
935 Q1 := 1 - prbx( -2 * Ln( Q1 ), 2 * cnt );
937 Result := (1 + P1 - Q1) / 2;
941 //==============================
943 //==============================
944 function TGikoBayesian.Parse(
946 wordCount : TWordCount;
947 algorithm : TGikoBayesianAlgorithm
951 CountWord( text, wordCount );
953 gbaPaulGraham: Result := CalcPaulGraham( wordCount );
954 gbaGaryRobinson: Result := CalcGaryRobinson( wordCount );
955 gbaGaryRobinsonFisher:
956 Result := CalcGaryRobinsonFisher( wordCount );
962 //==============================
964 //==============================
965 procedure TGikoBayesian.Learn(
966 wordCount : TWordCount;
967 isImportant : Boolean );
970 wordinfo : TWordInfo;
971 countinfo : TWordCountInfo;
975 for i := 0 to wordCount.Count - 1 do begin
976 aWord := wordCount[ i ];
977 wordinfo := Objects[ aWord ];
978 countinfo := TWordCountInfo( wordCount.Objects[ i ] );
979 if wordinfo = nil then begin
980 wordinfo := TWordInfo.Create;
981 Objects[ aWord ] := wordinfo;
984 if isImportant then begin
985 wordinfo.ImportantWord := wordinfo.ImportantWord + countinfo.WordCount;
986 wordinfo.ImportantText := wordinfo.ImportantText + 1;
988 wordinfo.NormalWord := wordinfo.NormalWord + countinfo.WordCount;
989 wordinfo.NormalText := wordinfo.NormalText + 1;
995 //==============================
997 //==============================
998 procedure TGikoBayesian.Forget(
999 wordCount : TWordCount;
1000 isImportant : Boolean );
1003 wordinfo : TWordInfo;
1004 countinfo : TWordCountInfo;
1008 for i := 0 to wordCount.Count - 1 do begin
1009 aWord := wordCount[ i ];
1010 wordinfo := Objects[ aWord ];
1011 if wordinfo = nil then
1014 countinfo := TWordCountInfo( wordCount.Objects[ i ] );
1015 if isImportant then begin
1016 if wordInfo.ImportantText > 0 then begin
1017 wordinfo.ImportantText := wordinfo.ImportantText - 1;
1018 wordinfo.ImportantWord := wordinfo.ImportantWord - countinfo.WordCount;
1021 if wordinfo.NormalText > 0 then begin
1022 wordinfo.NormalText := wordinfo.NormalText - 1;
1023 wordinfo.NormalWord := wordinfo.NormalWord - countinfo.WordCount;