5 \brief Bayesian filter
7 $Id: GikoBayesian.pas,v 1.15 2004/11/01 10:28:24 yoffy Exp $
10 //! Do not include hiragana in the dictionary
11 {$DEFINE GIKO_BAYESIAN_NO_HIRAGANA_DIC}
15 //==================================================
17 //==================================================
20 //==================================================
22 //==================================================
24 {!***********************************************************
25 \brief Word property
26 ************************************************************}
27 TWordInfo = class( TObject )
29 FNormalWord : Integer; //!< Number of times this appeared as a normal word
30 FImportantWord : Integer; //!< Number of times this appeared as a noteworthy word
31 FNormalText : Integer; //!< Number of texts that contained this as a normal word
32 FImportantText : Integer; //!< Number of texts that contained this as a noteworthy word
35 property NormalWord : Integer read FNormalWord write FNormalWord;
36 property ImportantWord : Integer read FImportantWord write FImportantWord;
37 property NormalText : Integer read FNormalText write FNormalText;
38 property ImportantText : Integer read FImportantText write FImportantText;
41 {!***********************************************************
42 \brief Parsed-word property
43 ************************************************************}
44 TWordCountInfo = class( TObject )
46 FWordCount : Integer; //!< Word count
49 property WordCount : Integer read FWordCount write FWordCount;
52 {!***********************************************************
53 \brief Parsed word list
55 // TWordCount = class( THashedStringList ) // extremely slow
56 TWordCount = class( TStringList )
59 destructor Destroy; override;
62 {!***********************************************************
63 \brief Filter algorithm
64 ************************************************************}
65 TGikoBayesianAlgorithm =
66 (gbaPaulGraham, gbaGaryRobinson, gbaGaryRobinsonFisher);
68 {!***********************************************************
69 \brief Bayesian filter
70 ************************************************************}
71 // TGikoBayesian = class( THashedStringList ) // extremely slow
72 TGikoBayesian = class( TStringList )
74 FFilePath : string; //!< Path of the loaded file
75 function GetObject( const name : string ) : TWordInfo;
76 procedure SetObject( const name : string; value : TWordInfo );
80 destructor Destroy; override;
82 //! Loads the learning history from a file
83 procedure LoadFromFile( const filePath : string );
85 //! Saves the learning history to a file
86 procedure SaveToFile( const filePath : string );
88 //! Saves the learning history to a file
91 //! Gets the information associated with a word
92 property Objects[ const name : string ] : TWordInfo
93 read GetObject write SetObject; default;
95 //! Counts the words contained in a text
98 wordCount : TWordCount );
101 \brief Determines the attention score of a text using the Paul Graham method
102 \return Attention score of the text (0.0 = not noteworthy ... 1.0 = noteworthy)
104 function CalcPaulGraham( wordCount : TWordCount ) : Extended;
107 \brief Determines the attention score of a text using the Gary Robinson method
108 \return Attention score of the text (0.0 = not noteworthy ... 1.0 = noteworthy)
110 function CalcGaryRobinson( wordCount : TWordCount ) : Extended;
113 \brief Determines the attention score of a text using the Gary Robinson-Fisher method
114 \return Attention score of the text (0.0 = not noteworthy ... 1.0 = noteworthy)
116 function CalcGaryRobinsonFisher( wordCount : TWordCount ) : Extended;
119 \brief Parses a text
120 \param text Text to parse
121 \param wordCount Receives the list of parsed words
122 \param algorithm Specifies the algorithm used to determine the attention score
123 \return Attention score of the text (0.0 = not noteworthy ... 1.0 = noteworthy)
125 Simply runs CountWord and Calcxxxxx together.
129 wordCount : TWordCount;
130 algorithm : TGikoBayesianAlgorithm = gbaGaryRobinsonFisher
134 \brief Learns a text
135 \param wordCount Word list produced by Parse
136 \param isImportant True to remember this as a noteworthy text
139 wordCount : TWordCount;
140 isImportant : Boolean );
143 \brief Forgets a learning result
144 \param wordCount Word list produced by Parse
145 \param isImportant True if the text had been remembered as a noteworthy text
146 \warning It is not possible to check whether a text has actually been learned.<br>
147 Forgetting a text that was never learned, or one whose isImportant value is wrong,
148 corrupts the database.<br>
149 Please track the learned state yourself.<br>
151 This does not clear all learning results;<br>
152 it only clears the result for the text that produced wordCount (the text argument of Parse).<br><br>
154 Mainly used in the order Forget -> Learn to switch a text between noteworthy and non-noteworthy.
157 wordCount : TWordCount;
158 isImportant : Boolean );
161 //==================================================
163 //==================================================
166 SysUtils, Math, Windows,
170 GIKO_BAYESIAN_FILE_VERSION = '1.0';
172 Modes = (ModeWhite, ModeGraph, ModeAlpha, ModeHanKana, ModeNum,
173 ModeWGraph, ModeWAlpha, ModeWNum,
174 ModeWHira, ModeWKata, ModeWKanji);
176 CharMode1 : array [ 0..255 ] of Byte =
178 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
179 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
180 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
181 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
182 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
183 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1,
184 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
185 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 0,
187 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
188 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
189 0, 1, 1, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
190 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
191 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
192 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
193 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
194 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
197 //************************************************************
199 //************************************************************
201 //==============================
203 //==============================
204 function RemoveToken(var s: string;const delimiter: string): string;
208 p := AnsiPos(delimiter, s);
212 Result := Copy(s, 1, p - 1);
213 s := Copy(s, Length(Result) + Length(delimiter) + 1, Length(s));
216 //==============================
218 //==============================
219 function AbsSort( p1, p2 : Pointer ) : Integer;
224 v1 := Abs( Single( p1 ) - 0.5 );
225 v2 := Abs( Single( p2 ) - 0.5 );
235 //************************************************************
237 //************************************************************
238 constructor TWordCount.Create;
241 Duplicates := dupIgnore;
242 CaseSensitive := True;
247 destructor TWordCount.Destroy;
252 for i := Count - 1 downto 0 do
253 if Objects[ i ] <> nil then
260 //************************************************************
261 // TGikoBayesian class
262 //************************************************************
264 //==============================
266 //==============================
267 constructor TGikoBayesian.Create;
270 Duplicates := dupIgnore;
271 CaseSensitive := True;
276 //==============================
278 //==============================
279 destructor TGikoBayesian.Destroy;
284 for i := Count - 1 downto 0 do
285 if inherited Objects[ i ] <> nil then
286 inherited Objects[ i ].Free;
292 procedure TGikoBayesian.LoadFromFile( const filePath : string );
301 FFilePath := filePath;
303 if not FileExists( filePath ) then
306 sl := TStringList.Create;
308 sl.LoadFromFile( filePath );
310 for i := 1 to sl.Count - 1 do begin
312 name := RemoveToken( s, #1 );
313 info := TWordInfo.Create;
314 info.NormalWord := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
315 info.ImportantWord := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
316 info.NormalText := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
317 info.ImportantText := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
319 AddObject( name, info );
327 procedure TGikoBayesian.SaveToFile( const filePath : string );
335 FFilePath := filePath;
337 sl := TStringList.Create;
340 sl.Add( GIKO_BAYESIAN_FILE_VERSION );
342 for i := 0 to Count - 1 do begin
343 info := TWordInfo( inherited Objects[ i ] );
344 s := Strings[ i ] + #1
345 + Format('%x', [info.NormalWord]) + #1
346 + Format('%x', [info.ImportantWord]) + #1
347 + Format('%x', [info.NormalText]) + #1
348 + Format('%x', [info.ImportantText]);
353 sl.SaveToFile( filePath );
360 procedure TGikoBayesian.Save;
363 if FFilePath <> '' then
364 SaveToFile( FFilePath );
368 //==============================
370 //==============================
371 function TGikoBayesian.GetObject( const name : string ) : TWordInfo;
376 if Find( name, idx ) then
377 Result := TWordInfo( inherited Objects[ idx ] )
383 //==============================
385 //==============================
386 procedure TGikoBayesian.SetObject( const name : string; value : TWordInfo );
391 if Find( name, idx ) then
392 inherited Objects[ idx ] := value
394 AddObject( name, value );
399 //==============================
401 //==============================
402 procedure TGikoBayesian.CountWord(
404 wordCount : TWordCount );
406 Modes = (ModeWhite, ModeGraph, ModeAlpha, ModeNum, ModeHanKana,
407 ModeWGraph, ModeWAlpha, ModeWNum,
408 ModeWHira, ModeWKata, ModeWKanji);
410 p, tail, last : PChar;
411 mode, newMode : Modes;
414 wHiraDelimiter : TStringList;
415 wHiraFinalDelimiter : TStringList;
416 wKanjiDelimiter : TStringList;
419 countInfo : TWordCountInfo;
421 function cutBoth( _aWord : string; _delim : TStringList ) : string;
425 for _i := 0 to _delim.Count - 1 do begin
426 _aWord := CustomStringReplace(
429 #10 + _delim[ _i ] + #10, False );
434 function cutFirst( _aWord : string; _delim : TStringList ) : string;
438 for _i := 0 to _delim.Count - 1 do begin
439 _aWord := CustomStringReplace(
442 #10 + _delim[ _i ], False );
447 function cutFinal( _aWord : string; _delim : TStringList ) : string;
451 for _i := 0 to _delim.Count - 1 do begin
452 _aWord := CustomStringReplace(
455 _delim[ _i ] + #10, False );
460 procedure addWord( _dst : TWordCount; _words : TStringList );
464 _countInfo : TWordCountInfo;
466 for _i := 0 to _words.Count - 1 do begin
467 _aWord := _words[ _i ];
468 if Length( _aWord ) > 0 then begin
469 if _dst.Find( _aWord, _idx ) then begin
470 _countInfo := TWordCountInfo( _dst.Objects[ _idx ] );
472 _countInfo := TWordCountInfo.Create;
473 _dst.AddObject( _aWord, _countInfo );
475 _countInfo.WordCount := _countInfo.WordCount + 1;
480 function changeMode( _aWord : string; _mode : Modes ) : string;
484 _pWord, _pWord2 : PChar;
485 _pWordTail, _pFound : PChar;
487 _delim : string = #10;
489 {$IFDEF GIKO_BAYESIAN_NO_HIRAGANA_DIC}
490 if mode = ModeWHira then begin
495 if Ord( _mode ) >= Ord( ModeWGraph ) then begin
497 // Remove spaces
498 _aWord := CustomStringReplace( _aWord, ' ', '', False );
499 _aWord := CustomStringReplace( _aWord, '
\81@', '', False );
501 // Split into words at delimiters
505 _aWord := cutFinal( _aWord, wHiraFinalDelimiter );
506 Result := cutBoth( _aWord, wHiraDelimiter );
511 // Split into words at delimiters
512 _aWord := cutBoth( _aWord, wKanjiDelimiter );
513 // Split into words every 4 bytes (2 characters)
514 _pWord := PChar( _aWord );
515 _i := Length( _aWord );
516 _pWordTail := _pWord + _i;
517 SetLength( _aWord2, _i + (_i shr 2) );
518 _pWord2 := PChar( _aWord2 );
520 while _pWord < _pWordTail do begin
521 _pFound := AnsiStrPos( _pWord, PChar( _delim ) );
522 if _pFound = nil then
523 _pFound := _pWordTail;
524 _pFound := _pFound - 3;
526 while _pWord <= _pFound do begin
527 CopyMemory( _pWord2, _pWord, 4 ); _pWord2[ 4 ] := #10;
528 _pWord2 := _pWord2 + 5; _pWord := _pWord + 4;
530 _i := _pFound + 4 - _pWord; // 4 = 3 + #10
531 CopyMemory( _pWord2, _pWord, _i );
532 _pWord2 := _pWord2 + _i; _pWord := _pWord + _i;
534 if _pWord < _pWordTail then begin
535 _i := _pWordTail - _pWord;
536 CopyMemory( _pWord2, _pWord, _i );
537 _pWord2 := _pWord2 + _i;
539 SetLength( _aWord2, _pWord2 - PChar( _aWord2 ) );
552 WHIRA_DELIMITER = '
\82ð' + #10 + '
\82É' + #10 + '
\82ª' + #10 + '
\82Æ' + #10 + '
\82©
\82ç'
553 + #10 + '
\82Ö' + #10 + '
\82æ
\82è' + #10 + '
\82Ü
\82Å'+ #10 + '
\82Å'
554 + #10 + '
\82±
\82±' + #10 + '
\82»
\82±' + #10 + '
\82Ç
\82±'
555 + #10 + '
\82±
\82ê' + #10 + '
\82»
\82ê' + #10 + '
\82 \82ê' + #10 + '
\82Ç
\82ê'
556 + #10 + '
\82±
\82Ì' + #10 + '
\82»
\82Ì' + #10 + '
\82 \82Ì' + #10 + '
\82Ç
\82Ì'
557 + #10 + '
\82±
\82¤' + #10 + '
\82»
\82¤' + #10 + '
\82 \82 ' + #10 + '
\82Ç
\82¤'
558 + #10 + '
\82±
\82ñ
\82È' + #10 + '
\82»
\82ñ
\82È' + #10 + '
\82 \82ñ
\82È' + #10 + '
\82Ç
\82ñ
\82È'
559 + #10 + '
\82ê
\82½' + #10 + '
\82ê
\82Ä' + #10 + '
\82ê
\82ê' + #10 + '
\82ê
\82ë'
560 + #10 + '
\82ê
\82é' + #10 + '
\82ç
\82ê
\82é'
561 + #10 + '
\82Å
\82·' + #10 + '
\82Ü
\82·' + #10 + '
\82Ü
\82¹
\82ñ'
562 + #10 + '
\82Å
\82µ
\82½' + #10 + '
\82Ü
\82µ
\82½'
563 + #10 + '
\82·
\82é' + #10 + '
\82µ
\82È
\82¢' + #10 + '
\82³
\82ê
\82é' + #10 + '
\82³
\82ê
\82È
\82¢'
565 WKANJI_DELIMITER = '
\93I' + #10 + '
\90«' + #10 + '
\8e®' + #10 + '
\89»' + #10 + '
\96@'
566 + #10 + '
\95s' + #10 + '
\96³' + #10 + '
\94ñ' + #10 + '
\94½'
568 WHIRA_FINAL_DELIMITER = '
\82Á
\82½' + #10 + '
\82Á
\82Ä'
570 + #10 + '
\82æ
\82Á
\82Ä' + #10 + '
\82µ
\82½
\82ª
\82Á
\82Ä' + #10 + '
\82È
\82Ì
\82Å'
571 + #10 + '
\82¾
\82©
\82ç' + #10 + '
\82Å
\82·
\82©
\82ç'
573 + #10 + '
\82µ
\82©
\82µ' + #10 + '
\82¾
\82ª' + #10 + '
\82¯
\82Ç' + #10 + '
\82¯
\82ê
\82Ç'
574 + #10 + '
\82â
\82Í
\82è' + #10 + '
\82â
\82Á
\82Ï
\82è'
575 + #10 + '
\82Å
\82µ' + #10 + '
\82¾
\82ë'
576 + #10 + '
\82·
\82é' + #10 + '
\82µ
\82È
\82¢' + #10 + '
\82µ
\82½' + #10 + '
\82µ
\82È
\82¢'
578 // Map the long-vowel mark to the corresponding small vowel kana.
579 HA_LINE = '
\82 \82©
\82³
\82½
\82È
\82Í
\82Ü
\82â
\82ç
\82í
\82ª
\82´
\82¾
\82Î
\82Ï
\82\9f\82ì';
580 HI_LINE = '
\82¢
\82«
\82µ
\82¿
\82É
\82Ð
\82Ý
\82è
\82î
\82¬
\82¶
\82Ñ
\82Ò
\82¡';
581 HU_LINE = '
\82¤
\82
\82·
\82Â
\82Ê
\82Ó
\82Þ
\82ä
\82é
\82®
\82Ô
\82Õ
\82£';
582 HE_LINE = '
\82¦
\82¯
\82¹
\82Ä
\82Ë
\82Ö
\82ß
\82ê
\82ï
\82°
\82×
\82Ø
\82¥';
583 HO_LINE = '
\82¨
\82±
\82»
\82Æ
\82Ì
\82Ù
\82à
\82æ
\82ë
\82ð
\82²
\82Ú
\82Û
\82§';
584 KA_LINE = '
\83A
\83J
\83T
\83^
\83i
\83n
\83}
\83\84\83\89\83\8f\83K
\83U
\83_
\83o
\83p
\83@
\83\95\83\8e';
585 KI_LINE = '
\83C
\83L
\83V
\83`
\83j
\83q
\83~
\83\8a\83\90\83M
\83W
\83r
\83s
\83B';
586 KU_LINE = '
\83E
\83N
\83X
\83c
\83k
\83t
\83\80\83\86\83\8b\83O
\83u
\83v
\83D
\83\94';
587 KE_LINE = '
\83G
\83P
\83Z
\83e
\83l
\83w
\83\81\83\8c\83\91\83Q
\83x
\83y
\83F
\83\96';
588 KO_LINE = '
\83I
\83R
\83\
\83g
\83m
\83z
\83\82\83\88\83\8d\83\92\83S
\83{
\83|
\83H';
589 kKanji = [$80..$A0, $E0..$ff];
592 wHiraDelimiter := TStringList.Create;
593 wHiraFinalDelimiter := TStringList.Create;
594 wKanjiDelimiter := TStringList.Create;
595 words := TStringList.Create;
598 {$IFNDEF GIKO_BAYESIAN_NO_HIRAGANA_DIC}
599 wHiraDelimiter.Text := WHIRA_DELIMITER;
600 wHiraFinalDelimiter.Text := WHIRA_FINAL_DELIMITER;
602 wKanjiDelimiter.Text := WKANJI_DELIMITER;
604 tail := p + Length( text );
607 while p < tail do begin
608 // Determine the character type
609 // Note: punctuation becomes ModeGraph, so it needs no special handling
610 // if Byte(Byte( p^ ) - $a1) < $5e then begin
611 if Byte( p^ ) in kKanji then begin
612 if p + 1 < tail then begin
613 ch := (PByte( p )^ shl 8) or PByte( p + 1 )^;
615 // Pack without splitting words at spaces
616 //$8140: newMode := ModeWhite;
617 $8141..$824e: newMode := ModeWGraph;
618 $824f..$8258: newMode := ModeWNum;
619 $8260..$829a: newMode := ModeWAlpha;
620 $829f..$82f1: newMode := ModeWHira;
621 $8340..$8396: newMode := ModeWKata;
622 else newMode := ModeWKanji;
624 // The voicing marks and long-vowel mark belong to hiragana or katakana
625 if (mode = ModeWHira) or (mode = ModeWKata) then
626 if (ch = $814a) or (ch = $814b) or (ch = $815b) then
629 newMode := ModeWhite;
634 newMode := Modes( CharMode1[ Byte( p^ ) ] );
635 if (p^ = ' ') and (Ord( mode ) >= Ord( ModeWGraph )) then begin
636 // Japanese text so far, now a space:
637 // join the words and remove the space afterwards
638 // Note: half-width kana are usually separated by normal spaces, so do not remove those
645 if mode <> newMode then begin
647 // The character type has changed
648 if mode <> ModeWhite then begin
649 SetLength( aWord, p - last );
650 CopyMemory( PChar( aWord ), last, p - last );
652 words.Text := changeMode( aWord, mode );
655 addWord( wordCount, words );
666 if mode <> ModeWhite then begin
667 SetLength( aWord, p - last );
668 CopyMemory( PChar( aWord ), last, p - last );
670 words.Text := changeMode( aWord, mode );
673 addWord( wordCount, words );
677 wKanjiDelimiter.Free;
678 wHiraFinalDelimiter.Free;
684 //==============================
686 //==============================
687 function TGikoBayesian.CalcPaulGraham( wordCount : TWordCount ) : Extended;
689 function p( const aWord : string ) : Single;
693 info := Objects[ aWord ];
696 else if info.NormalWord = 0 then
698 else if info.ImportantWord = 0 then
700 else if info.ImportantWord + info.NormalWord * 2 < 5 then
703 Result := ( info.ImportantWord / info.ImportantText ) /
704 ((info.NormalWord * 2 / info.NormalText ) +
705 (info.ImportantWord / info.ImportantText));
717 if wordCount.Count = 0 then
720 narray := TList.Create;
722 for i := 0 to wordCount.Count - 1 do begin
723 narray.Add( Pointer( p( wordCount[ i ] ) ) );
726 narray.Sort( AbsSort );
730 i := min( SAMPLE_COUNT, narray.Count );
734 s := s * Single( narray[ i ] );
735 q := q * (1 - Single( narray[ i ] ));
738 Result := s / (s + q);
745 //==============================
747 //==============================
748 function TGikoBayesian.CalcGaryRobinson( wordCount : TWordCount ) : Extended;
750 function p( const aWord : string ) : Single;
754 info := Objects[ aWord ];
757 else if info.ImportantWord = 0 then
759 else if info.NormalWord = 0 then
763 Result := ( info.ImportantWord / info.ImportantText ) /
764 ((info.NormalWord / info.NormalText ) +
765 (info.ImportantWord / info.ImportantText));
767 Result := (info.ImportantWord * info.NormalText) /
768 (info.NormalWord * info.ImportantText +
769 info.ImportantWord * info.NormalText);
772 function f( cnt : Integer; n, mean : Single ) : Extended;
776 Result := ( (k * mean) + (cnt * n) ) / (k + cnt);
781 narray : array of Single;
783 countInfo : TWordCountInfo;
785 P1, Q1, R1 : Extended;
789 if wordCount.Count = 0 then begin
794 SetLength( narray, wordCount.Count );
796 for i := 0 to wordCount.Count - 1 do begin
797 n := p( wordCount[ i ] );
801 mean := mean / wordCount.Count;
805 for i := 0 to wordCount.Count - 1 do begin
806 countInfo := TWordCountInfo( wordCount.Objects[ i ] );
807 n := f( countInfo.WordCount, narray[ i ], mean );
808 P1 := P1 * ( 1 - n );
811 cnt := wordCount.Count;
815 P1 := 1 - Power( P1, 1 / cnt );
816 Q1 := 1 - Power( Q1, 1 / cnt );
818 if P1 + Q1 = 0 then begin
821 n := (P1 - Q1) / (P1 + Q1);
822 Result := (1 + n) / 2;
827 //==============================
828 // CalcGaryRobinsonFisher
829 //==============================
830 function TGikoBayesian.CalcGaryRobinsonFisher(
831 wordCount : TWordCount
834 function p( const aWord : string ) : Single;
838 info := Objects[ aWord ];
841 else if info.ImportantWord = 0 then
843 else if info.NormalWord = 0 then
847 Result := ( info.ImportantWord / info.ImportantText ) /
848 ((info.NormalWord / info.NormalText ) +
849 (info.ImportantWord / info.ImportantText));
851 Result := (info.ImportantWord * info.NormalText) /
852 (info.NormalWord * info.ImportantText +
853 info.ImportantWord * info.NormalText);
856 function f( cnt : Integer; n, mean : Single ) : Extended;
860 Result := ( (k * mean) + (cnt * n) ) / (k + cnt);
863 function prbx( x2, degree : Extended ) : Extended;
872 narray : array of Single;
874 countInfo : TWordCountInfo;
877 important : Extended;
882 if wordCount.Count = 0 then begin
887 SetLength( narray, wordCount.Count );
889 for i := 0 to wordCount.Count - 1 do begin
890 n := p( wordCount[ i ] );
894 mean := mean / wordCount.Count;
898 for i := 0 to wordCount.Count - 1 do begin
899 countInfo := TWordCountInfo( wordCount.Objects[ i ] );
900 n := f( countInfo.WordCount, narray[ i ], mean );
901 P1 := P1 * ( 1 - n );
904 cnt := wordCount.Count;
908 P1 := Power( P1, 1 / cnt );
909 Q1 := Power( Q1, 1 / cnt );
911 P1 := 1 - prbx( -2 * Ln( P1 ), 2 * cnt );
912 Q1 := 1 - prbx( -2 * Ln( Q1 ), 2 * cnt );
914 Result := (1 + P1 - Q1) / 2;
918 //==============================
920 //==============================
921 function TGikoBayesian.Parse(
923 wordCount : TWordCount;
924 algorithm : TGikoBayesianAlgorithm
928 CountWord( text, wordCount );
930 gbaPaulGraham: Result := CalcPaulGraham( wordCount );
931 gbaGaryRobinson: Result := CalcGaryRobinson( wordCount );
932 gbaGaryRobinsonFisher:
933 Result := CalcGaryRobinsonFisher( wordCount );
939 //==============================
941 //==============================
942 procedure TGikoBayesian.Learn(
943 wordCount : TWordCount;
944 isImportant : Boolean );
947 wordinfo : TWordInfo;
948 countinfo : TWordCountInfo;
952 for i := 0 to wordCount.Count - 1 do begin
953 aWord := wordCount[ i ];
954 wordinfo := Objects[ aWord ];
955 countinfo := TWordCountInfo( wordCount.Objects[ i ] );
956 if wordinfo = nil then begin
957 wordinfo := TWordInfo.Create;
958 Objects[ aWord ] := wordinfo;
961 if isImportant then begin
962 wordinfo.ImportantWord := wordinfo.ImportantWord + countinfo.WordCount;
963 wordinfo.ImportantText := wordinfo.ImportantText + 1;
965 wordinfo.NormalWord := wordinfo.NormalWord + countinfo.WordCount;
966 wordinfo.NormalText := wordinfo.NormalText + 1;
972 //==============================
974 //==============================
975 procedure TGikoBayesian.Forget(
976 wordCount : TWordCount;
977 isImportant : Boolean );
980 wordinfo : TWordInfo;
981 countinfo : TWordCountInfo;
985 for i := 0 to wordCount.Count - 1 do begin
986 aWord := wordCount[ i ];
987 wordinfo := Objects[ aWord ];
988 if wordinfo = nil then
991 countinfo := TWordCountInfo( wordCount.Objects[ i ] );
992 if isImportant then begin
993 if wordInfo.ImportantText > 0 then begin
994 wordinfo.ImportantText := wordinfo.ImportantText - 1;
995 wordinfo.ImportantWord := wordinfo.ImportantWord - countinfo.WordCount;
998 if wordinfo.NormalText > 0 then begin
999 wordinfo.NormalText := wordinfo.NormalText - 1;
1000 wordinfo.NormalWord := wordinfo.NormalWord - countinfo.WordCount;