5 \brief Bayesian filter
7 $Id: GikoBayesian.pas,v 1.16 2004/11/01 10:32:02 yoffy Exp $
10 //! Do not include hiragana in the dictionary
11 {$DEFINE GIKO_BAYESIAN_NO_HIRAGANA_DIC}
15 //==================================================
17 //==================================================
20 //==================================================
22 //==================================================
24 {!***********************************************************
25 \brief Word properties
26 ************************************************************}
27 TWordInfo = class( TObject )
29 FNormalWord : Integer; //!<
Number of times the word appeared as a normal word
30 FImportantWord : Integer; //!<
Number of times the word appeared as an important (noteworthy) word
31 FNormalText : Integer; //!<
Number of texts that contained the word as a normal word
32 FImportantText : Integer; //!<
Number of texts that contained the word as an important (noteworthy) word
35 property NormalWord : Integer read FNormalWord write FNormalWord;
36 property ImportantWord : Integer read FImportantWord write FImportantWord;
37 property NormalText : Integer read FNormalText write FNormalText;
38 property ImportantText : Integer read FImportantText write FImportantText;
41 {!***********************************************************
42 \brief Properties of an analyzed word
43 ************************************************************}
44 TWordCountInfo = class( TObject )
46 FWordCount : Integer; //!<
Word count
49 property WordCount : Integer read FWordCount write FWordCount;
52 {!***********************************************************
53 \brief List of analyzed words
54 ************************************************************}
55 // TWordCount = class( THashedStringList ) //
extremely slow
56 TWordCount = class( TStringList )
59 destructor Destroy; override;
62 {!***********************************************************
63 \brief Filter algorithm
64 ************************************************************}
65 TGikoBayesianAlgorithm =
66 (gbaPaulGraham, gbaGaryRobinson, gbaGaryRobinsonFisher);
68 {!***********************************************************
69 \brief Bayesian filter
70 ************************************************************}
71 // TGikoBayesian = class( THashedStringList ) //
extremely slow
72 TGikoBayesian = class( TStringList )
74 FFilePath : string; //!<
Path of the file that was loaded
75 function GetObject( const name : string ) : TWordInfo;
76 procedure SetObject( const name : string; value : TWordInfo );
80 destructor Destroy; override;
82 //! Reads the learning history from a file
83 procedure LoadFromFile( const filePath : string );
85 //! Saves the learning history to a file
86 procedure SaveToFile( const filePath : string );
88 //! Saves the learning history to a file
91 //! Gets the information stored for a word
92 property Objects[ const name : string ] : TWordInfo
93 read GetObject write SetObject; default;
95 //! Counts the words contained in a text
98 wordCount : TWordCount );
101 \brief Determines the importance of a text based on Paul Graham's method
102 \return Importance of the text (0.0 = not worth attention ... 1.0 = should be noted)
104 function CalcPaulGraham( wordCount : TWordCount ) : Extended;
107 \brief Determines the importance of a text based on the GaryRobinson method
108 \return Importance of the text (0.0 = not worth attention ... 1.0 = should be noted)
110 function CalcGaryRobinson( wordCount : TWordCount ) : Extended;
113 \brief Determines the importance of a text based on the GaryRobinson-Fisher method
114 \return Importance of the text (0.0 = not worth attention ... 1.0 = should be noted)
116 function CalcGaryRobinsonFisher( wordCount : TWordCount ) : Extended;
119 \brief Analyzes a text
120 \param text The text to analyze
121 \param wordCount Receives the list of analyzed words
122 \param algorithm Specifies the algorithm used to determine the importance
123 \return Importance of the text (0.0 = not worth attention ... 1.0 = should be noted)
125 Simply runs CountWord and Calcxxxxx together.
129 wordCount : TWordCount;
130 algorithm : TGikoBayesianAlgorithm = gbaGaryRobinsonFisher
134 \brief Learns a text
135 \param wordCount The word list analyzed by Parse
136 \param isImportant True to remember the text as one that should be noted
139 wordCount : TWordCount;
140 isImportant : Boolean );
143 \brief Forgets a learning result
144 \param wordCount The word list analyzed by Parse
145 \param isImportant True if the text had been remembered as one that should be noted
146 \warning Whether a text has already been learned cannot be checked.<br>
147 Calling Forget on a text that was never passed to Learn, or with the wrong isImportant value,
148 corrupts the database.<br>
149 Keep track yourself of whether a text has already been learned.
151 This does not clear all learning results.<br>
152 Only the learning result of the text that produced wordCount (the text argument of Parse) is cleared.<br><br>
154 Mainly used in the order Forget -> Learn to switch a text between important and non-important.
157 wordCount : TWordCount;
158 isImportant : Boolean );
161 //==================================================
163 //==================================================
166 SysUtils, Math, Windows,
170 GIKO_BAYESIAN_FILE_VERSION = '1.0';
172 Modes = (ModeWhite, ModeGraph, ModeAlpha, ModeHanKana, ModeNum,
173 ModeWGraph, ModeWAlpha, ModeWNum,
174 ModeWHira, ModeWKata, ModeWKanji);
176 CharMode1 : array [ 0..255 ] of Byte =
178 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
179 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
180 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
181 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
182 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
183 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1,
184 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
185 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 0,
187 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
188 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
189 0, 1, 1, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
190 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
191 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
192 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
193 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
194 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
197 //************************************************************
199 //************************************************************
201 //==============================
203 //==============================
204 function RemoveToken(var s: string;const delimiter: string): string;
208 p := AnsiPos(delimiter, s);
212 Result := Copy(s, 1, p - 1);
213 s := Copy(s, Length(Result) + Length(delimiter) + 1, Length(s));
216 //==============================
218 //==============================
219 function AbsSort( p1, p2 : Pointer ) : Integer;
224 v1 := Abs( Single( p1 ) - 0.5 );
225 v2 := Abs( Single( p2 ) - 0.5 );
235 //************************************************************
237 //************************************************************
238 constructor TWordCount.Create;
241 Duplicates := dupIgnore;
242 CaseSensitive := True;
247 destructor TWordCount.Destroy;
252 for i := Count - 1 downto 0 do
253 if Objects[ i ] <> nil then
260 //************************************************************
261 // TGikoBayesian class
262 //************************************************************
264 //==============================
266 //==============================
267 constructor TGikoBayesian.Create;
270 Duplicates := dupIgnore;
271 CaseSensitive := True;
276 //==============================
278 //==============================
279 destructor TGikoBayesian.Destroy;
284 for i := Count - 1 downto 0 do
285 if inherited Objects[ i ] <> nil then
286 inherited Objects[ i ].Free;
292 procedure TGikoBayesian.LoadFromFile( const filePath : string );
301 FFilePath := filePath;
303 if not FileExists( filePath ) then
306 sl := TStringList.Create;
308 sl.LoadFromFile( filePath );
310 for i := 1 to sl.Count - 1 do begin
312 name := RemoveToken( s, #1 );
313 info := TWordInfo.Create;
314 info.NormalWord := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
315 info.ImportantWord := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
316 info.NormalText := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
317 info.ImportantText := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
319 AddObject( name, info );
327 procedure TGikoBayesian.SaveToFile( const filePath : string );
335 FFilePath := filePath;
337 sl := TStringList.Create;
340 sl.Add( GIKO_BAYESIAN_FILE_VERSION );
342 for i := 0 to Count - 1 do begin
343 info := TWordInfo( inherited Objects[ i ] );
344 s := Strings[ i ] + #1
345 + Format('%x', [info.NormalWord]) + #1
346 + Format('%x', [info.ImportantWord]) + #1
347 + Format('%x', [info.NormalText]) + #1
348 + Format('%x', [info.ImportantText]);
353 sl.SaveToFile( filePath );
360 procedure TGikoBayesian.Save;
363 if FFilePath <> '' then
364 SaveToFile( FFilePath );
368 //==============================
370 //==============================
371 function TGikoBayesian.GetObject( const name : string ) : TWordInfo;
376 if Find( name, idx ) then
377 Result := TWordInfo( inherited Objects[ idx ] )
383 //==============================
385 //==============================
386 procedure TGikoBayesian.SetObject( const name : string; value : TWordInfo );
391 if Find( name, idx ) then
392 inherited Objects[ idx ] := value
394 AddObject( name, value );
399 //==============================
401 //==============================
402 procedure TGikoBayesian.CountWord(
404 wordCount : TWordCount );
406 Modes = (ModeWhite, ModeGraph, ModeAlpha, ModeNum, ModeHanKana,
407 ModeWGraph, ModeWAlpha, ModeWNum,
408 ModeWHira, ModeWKata, ModeWKanji);
410 p, tail, last : PChar;
411 mode, newMode : Modes;
414 wHiraDelimiter : TStringList;
415 wHiraFinalDelimiter : TStringList;
416 wKanjiDelimiter : TStringList;
419 countInfo : TWordCountInfo;
421 function cutBoth( _aWord : string; _delim : TStringList ) : string;
425 for _i := 0 to _delim.Count - 1 do begin
426 _aWord := CustomStringReplace(
429 #10 + _delim[ _i ] + #10, False );
434 function cutFirst( _aWord : string; _delim : TStringList ) : string;
438 for _i := 0 to _delim.Count - 1 do begin
439 _aWord := CustomStringReplace(
442 #10 + _delim[ _i ], False );
447 function cutFinal( _aWord : string; _delim : TStringList ) : string;
451 for _i := 0 to _delim.Count - 1 do begin
452 _aWord := CustomStringReplace(
455 _delim[ _i ] + #10, False );
460 procedure addWord( _dst : TWordCount; _words : TStringList );
464 _countInfo : TWordCountInfo;
466 for _i := 0 to _words.Count - 1 do begin
467 _aWord := _words[ _i ];
468 if Length( _aWord ) > 0 then begin
469 if _dst.Find( _aWord, _idx ) then begin
470 _countInfo := TWordCountInfo( _dst.Objects[ _idx ] );
472 _countInfo := TWordCountInfo.Create;
473 _dst.AddObject( _aWord, _countInfo );
475 _countInfo.WordCount := _countInfo.WordCount + 1;
480 function changeMode( _aWord : string; _mode : Modes ) : string;
484 _pWord, _pWord2 : PChar;
485 _pWordTail, _pFound : PChar;
487 _delim : string = #10;
489 {$IFDEF GIKO_BAYESIAN_NO_HIRAGANA_DIC}
490 if mode = ModeWHira then begin
495 if Ord( _mode ) >= Ord( ModeWGraph ) then begin
497 // Squeeze out the spaces
498 _aWord := CustomStringReplace( _aWord, ' ', '', False );
499 _aWord := CustomStringReplace( _aWord, '
\81@', '', False );
501 // Split into words at the delimiters
505 _aWord := cutFinal( _aWord, wHiraFinalDelimiter );
506 Result := cutBoth( _aWord, wHiraDelimiter );
511 // Split into words at the delimiters
512 _aWord := cutBoth( _aWord, wKanjiDelimiter );
513 // Split into words every 4 bytes (2 characters)
514 _pWord := PChar( _aWord );
515 _i := Length( _aWord );
516 _pWordTail := _pWord + _i;
517 SetLength( _aWord2, _i + (_i shr 2) );
518 _pWord2 := PChar( _aWord2 );
520 while _pWord < _pWordTail do begin
521 _pFound := AnsiStrPos( _pWord, PChar( _delim ) );
522 if _pFound = nil then
523 _pFound := _pWordTail;
524 _pFound := _pFound - 3;
526 while _pWord <= _pFound do begin
527 CopyMemory( _pWord2, _pWord, 4 ); _pWord2[ 4 ] := #10;
528 _pWord2 := _pWord2 + 5; _pWord := _pWord + 4;
530 _i := _pFound + 4 - _pWord; // 4 = 3 + #10
531 CopyMemory( _pWord2, _pWord, _i );
532 _pWord2 := _pWord2 + _i; _pWord := _pWord + _i;
534 if _pWord < _pWordTail then begin
535 _i := _pWordTail - _pWord;
536 CopyMemory( _pWord2, _pWord, _i );
537 _pWord2 := _pWord2 + _i;
539 SetLength( _aWord2, _pWord2 - PChar( _aWord2 ) );
552 WHIRA_DELIMITER = '
\82ð' + #10 + '
\82É' + #10 + '
\82ª' + #10 + '
\82Æ' + #10 + '
\82©
\82ç'
553 + #10 + '
\82Ö' + #10 + '
\82æ
\82è' + #10 + '
\82Ü
\82Å'+ #10 + '
\82Å'
554 + #10 + '
\82±
\82±' + #10 + '
\82»
\82±' + #10 + '
\82Ç
\82±'
555 + #10 + '
\82±
\82ê' + #10 + '
\82»
\82ê' + #10 + '
\82 \82ê' + #10 + '
\82Ç
\82ê'
556 + #10 + '
\82±
\82Ì' + #10 + '
\82»
\82Ì' + #10 + '
\82 \82Ì' + #10 + '
\82Ç
\82Ì'
557 + #10 + '
\82±
\82¤' + #10 + '
\82»
\82¤' + #10 + '
\82 \82 ' + #10 + '
\82Ç
\82¤'
558 + #10 + '
\82±
\82ñ
\82È' + #10 + '
\82»
\82ñ
\82È' + #10 + '
\82 \82ñ
\82È' + #10 + '
\82Ç
\82ñ
\82È'
559 + #10 + '
\82ê
\82½' + #10 + '
\82ê
\82Ä' + #10 + '
\82ê
\82ê' + #10 + '
\82ê
\82ë'
560 + #10 + '
\82ê
\82é' + #10 + '
\82ç
\82ê
\82é'
561 + #10 + '
\82Å
\82·' + #10 + '
\82Ü
\82·' + #10 + '
\82Ü
\82¹
\82ñ'
562 + #10 + '
\82Å
\82µ
\82½' + #10 + '
\82Ü
\82µ
\82½'
563 + #10 + '
\82·
\82é' + #10 + '
\82µ
\82È
\82¢' + #10 + '
\82³
\82ê
\82é' + #10 + '
\82³
\82ê
\82È
\82¢'
565 WKANJI_DELIMITER = '
\93I' + #10 + '
\90«' + #10 + '
\8e®' + #10 + '
\89»' + #10 + '
\96@'
566 + #10 + '
\95s' + #10 + '
\96³' + #10 + '
\94ñ' + #10 + '
\94½'
568 WHIRA_FINAL_DELIMITER = '
\82Á
\82½' + #10 + '
\82Á
\82Ä'
570 + #10 + '
\82æ
\82Á
\82Ä' + #10 + '
\82µ
\82½
\82ª
\82Á
\82Ä' + #10 + '
\82È
\82Ì
\82Å'
571 + #10 + '
\82¾
\82©
\82ç' + #10 + '
\82Å
\82·
\82©
\82ç'
573 + #10 + '
\82µ
\82©
\82µ' + #10 + '
\82¾
\82ª' + #10 + '
\82¯
\82Ç' + #10 + '
\82¯
\82ê
\82Ç'
574 + #10 + '
\82â
\82Í
\82è' + #10 + '
\82â
\82Á
\82Ï
\82è'
575 + #10 + '
\82Å
\82µ' + #10 + '
\82¾
\82ë'
576 + #10 + '
\82·
\82é' + #10 + '
\82µ
\82È
\82¢' + #10 + '
\82µ
\82½' + #10 + '
\82µ
\82È
\82¢'
578 // Turn the prolonged sound mark into one of the small hiragana vowels (small a, i, u, e, o).
579 HA_LINE = '
\82 \82©
\82³
\82½
\82È
\82Í
\82Ü
\82â
\82ç
\82í
\82ª
\82´
\82¾
\82Î
\82Ï
\82\9f\82ì';
580 HI_LINE = '
\82¢
\82«
\82µ
\82¿
\82É
\82Ð
\82Ý
\82è
\82î
\82¬
\82¶
\82Ñ
\82Ò
\82¡';
581 HU_LINE = '
\82¤
\82
\82·
\82Â
\82Ê
\82Ó
\82Þ
\82ä
\82é
\82®
\82Ô
\82Õ
\82£';
582 HE_LINE = '
\82¦
\82¯
\82¹
\82Ä
\82Ë
\82Ö
\82ß
\82ê
\82ï
\82°
\82×
\82Ø
\82¥';
583 HO_LINE = '
\82¨
\82±
\82»
\82Æ
\82Ì
\82Ù
\82à
\82æ
\82ë
\82ð
\82²
\82Ú
\82Û
\82§';
584 KA_LINE = '
\83A
\83J
\83T
\83^
\83i
\83n
\83}
\83\84\83\89\83\8f\83K
\83U
\83_
\83o
\83p
\83@
\83\95\83\8e';
585 KI_LINE = '
\83C
\83L
\83V
\83`
\83j
\83q
\83~
\83\8a\83\90\83M
\83W
\83r
\83s
\83B';
586 KU_LINE = '
\83E
\83N
\83X
\83c
\83k
\83t
\83\80\83\86\83\8b\83O
\83u
\83v
\83D
\83\94';
587 KE_LINE = '
\83G
\83P
\83Z
\83e
\83l
\83w
\83\81\83\8c\83\91\83Q
\83x
\83y
\83F
\83\96';
588 KO_LINE = '
\83I
\83R
\83\
\83g
\83m
\83z
\83\82\83\88\83\8d\83\92\83S
\83{
\83|
\83H';
589 kKanji = [$80..$A0, $E0..$ff];
592 wHiraDelimiter := TStringList.Create;
593 wHiraFinalDelimiter := TStringList.Create;
594 wKanjiDelimiter := TStringList.Create;
595 words := TStringList.Create;
598 {$IFNDEF GIKO_BAYESIAN_NO_HIRAGANA_DIC}
599 wHiraDelimiter.Text := WHIRA_DELIMITER;
600 wHiraFinalDelimiter.Text := WHIRA_FINAL_DELIMITER;
602 wKanjiDelimiter.Text := WKANJI_DELIMITER;
604 tail := p + Length( text );
607 while p < tail do begin
608 // Determine the type of the character
609 // Note: punctuation ends up as ModeGraph, so it does not need to be handled separately
610 // if Byte(Byte( p^ ) - $a1) < $5e then begin
611 if Byte( p^ ) in kKanji then begin
612 if p + 1 < tail then begin
613 ch := (PByte( p )^ shl 8) or PByte( p + 1 )^;
615 // Squeeze spaces out instead of splitting words on them
616 //$8140: newMode := ModeWhite;
617 $8141..$824e: newMode := ModeWGraph;
618 $824f..$8258: newMode := ModeWNum;
619 $8260..$829a: newMode := ModeWAlpha;
620 $829f..$82f1: newMode := ModeWHira;
621 $8340..$8396: newMode := ModeWKata;
622 else newMode := ModeWKanji;
624 // The voiced mark, semi-voiced mark and prolonged sound mark ($814a, $814b, $815b) count as part of hiragana or katakana
625 if (mode = ModeWHira) or (mode = ModeWKata) then
626 if (ch = $814a) or (ch = $814b) or (ch = $815b) then
629 newMode := ModeWhite;
634 newMode := Modes( CharMode1[ Byte( p^ ) ] );
635 if (p^ = ' ') and (Ord( mode ) >= Ord( ModeWGraph )) then begin
636 // The run so far was Japanese and the current character is a space:
637 // join the words together and squeeze the spaces out later
638 // Note: half-width kana is normally separated with spaces, so it is not squeezed
645 if mode <> newMode then begin
647 // The character type has changed
648 if mode <> ModeWhite then begin
649 SetLength( aWord, p - last );
650 CopyMemory( PChar( aWord ), last, p - last );
652 words.Text := changeMode( aWord, mode );
655 addWord( wordCount, words );
666 if mode <> ModeWhite then begin
667 SetLength( aWord, p - last );
668 CopyMemory( PChar( aWord ), last, p - last );
670 words.Text := changeMode( aWord, mode );
673 addWord( wordCount, words );
677 wKanjiDelimiter.Free;
678 wHiraFinalDelimiter.Free;
684 //==============================
686 //==============================
687 function TGikoBayesian.CalcPaulGraham( wordCount : TWordCount ) : Extended;
689 function p( const aWord : string ) : Single;
693 info := Objects[ aWord ];
696 else if info.NormalWord = 0 then
698 else if info.ImportantWord = 0 then
700 else if info.ImportantWord + info.NormalWord * 2 < 5 then
703 Result := ( info.ImportantWord / info.ImportantText ) /
704 ((info.NormalWord * 2 / info.NormalText ) +
705 (info.ImportantWord / info.ImportantText));
717 if wordCount.Count = 0 then
720 narray := TList.Create;
722 for i := 0 to wordCount.Count - 1 do begin
723 narray.Add( Pointer( p( wordCount[ i ] ) ) );
726 narray.Sort( AbsSort );
730 i := min( SAMPLE_COUNT, narray.Count );
734 s := s * Single( narray[ i ] );
735 q := q * (1 - Single( narray[ i ] ));
738 Result := s / (s + q);
745 //==============================
747 //==============================
748 function TGikoBayesian.CalcGaryRobinson( wordCount : TWordCount ) : Extended;
750 function p( const aWord : string ) : Single;
754 info := Objects[ aWord ];
757 else if info.ImportantWord = 0 then
759 else if info.NormalWord = 0 then
763 Result := ( info.ImportantWord / info.ImportantText ) /
764 ((info.NormalWord / info.NormalText ) +
765 (info.ImportantWord / info.ImportantText));
767 Result := (info.ImportantWord * info.NormalText) /
768 (info.NormalWord * info.ImportantText +
769 info.ImportantWord * info.NormalText);
772 function f( cnt : Integer; n, mean : Single ) : Extended;
776 Result := ( (k * mean) + (cnt * n) ) / (k + cnt);
781 narray : array of Single;
783 countInfo : TWordCountInfo;
785 P1, Q1, R1 : Extended;
789 if wordCount.Count = 0 then begin
794 SetLength( narray, wordCount.Count );
796 for i := 0 to wordCount.Count - 1 do begin
797 n := p( wordCount[ i ] );
801 mean := mean / wordCount.Count;
805 for i := 0 to wordCount.Count - 1 do begin
806 countInfo := TWordCountInfo( wordCount.Objects[ i ] );
807 n := f( countInfo.WordCount, narray[ i ], mean );
808 P1 := P1 * ( 1 - n );
811 cnt := wordCount.Count;
814 P1 := 1 - Power( P1, 1 / cnt );
815 Q1 := 1 - Power( Q1, 1 / cnt );
817 if P1 + Q1 = 0 then begin
820 n := (P1 - Q1) / (P1 + Q1);
821 Result := (1 + n) / 2;
826 //==============================
827 // CalcGaryRobinsonFisher
828 //==============================
829 function TGikoBayesian.CalcGaryRobinsonFisher(
830 wordCount : TWordCount
833 function p( const aWord : string ) : Single;
837 info := Objects[ aWord ];
840 else if info.ImportantWord = 0 then
842 else if info.NormalWord = 0 then
846 Result := ( info.ImportantWord / info.ImportantText ) /
847 ((info.NormalWord / info.NormalText ) +
848 (info.ImportantWord / info.ImportantText));
850 Result := (info.ImportantWord * info.NormalText) /
851 (info.NormalWord * info.ImportantText +
852 info.ImportantWord * info.NormalText);
855 function f( cnt : Integer; n, mean : Single ) : Extended;
859 Result := ( (k * mean) + (cnt * n) ) / (k + cnt);
862 function prbx( x2, degree : Extended ) : Extended;
871 narray : array of Single;
873 countInfo : TWordCountInfo;
876 important : Extended;
881 if wordCount.Count = 0 then begin
886 SetLength( narray, wordCount.Count );
888 for i := 0 to wordCount.Count - 1 do begin
889 n := p( wordCount[ i ] );
893 mean := mean / wordCount.Count;
897 for i := 0 to wordCount.Count - 1 do begin
898 countInfo := TWordCountInfo( wordCount.Objects[ i ] );
899 n := f( countInfo.WordCount, narray[ i ], mean );
900 P1 := P1 * ( 1 - n );
903 cnt := wordCount.Count;
906 P1 := Power( P1, 1 / cnt );
907 Q1 := Power( Q1, 1 / cnt );
909 P1 := 1 - prbx( -2 * Ln( P1 ), 2 * cnt );
910 Q1 := 1 - prbx( -2 * Ln( Q1 ), 2 * cnt );
912 Result := (1 + P1 - Q1) / 2;
916 //==============================
918 //==============================
919 function TGikoBayesian.Parse(
921 wordCount : TWordCount;
922 algorithm : TGikoBayesianAlgorithm
926 CountWord( text, wordCount );
928 gbaPaulGraham: Result := CalcPaulGraham( wordCount );
929 gbaGaryRobinson: Result := CalcGaryRobinson( wordCount );
930 gbaGaryRobinsonFisher:
931 Result := CalcGaryRobinsonFisher( wordCount );
937 //==============================
939 //==============================
940 procedure TGikoBayesian.Learn(
941 wordCount : TWordCount;
942 isImportant : Boolean );
945 wordinfo : TWordInfo;
946 countinfo : TWordCountInfo;
950 for i := 0 to wordCount.Count - 1 do begin
951 aWord := wordCount[ i ];
952 wordinfo := Objects[ aWord ];
953 countinfo := TWordCountInfo( wordCount.Objects[ i ] );
954 if wordinfo = nil then begin
955 wordinfo := TWordInfo.Create;
956 Objects[ aWord ] := wordinfo;
959 if isImportant then begin
960 wordinfo.ImportantWord := wordinfo.ImportantWord + countinfo.WordCount;
961 wordinfo.ImportantText := wordinfo.ImportantText + 1;
963 wordinfo.NormalWord := wordinfo.NormalWord + countinfo.WordCount;
964 wordinfo.NormalText := wordinfo.NormalText + 1;
970 //==============================
972 //==============================
973 procedure TGikoBayesian.Forget(
974 wordCount : TWordCount;
975 isImportant : Boolean );
978 wordinfo : TWordInfo;
979 countinfo : TWordCountInfo;
983 for i := 0 to wordCount.Count - 1 do begin
984 aWord := wordCount[ i ];
985 wordinfo := Objects[ aWord ];
986 if wordinfo = nil then
989 countinfo := TWordCountInfo( wordCount.Objects[ i ] );
990 if isImportant then begin
991 if wordInfo.ImportantText > 0 then begin
992 wordinfo.ImportantText := wordinfo.ImportantText - 1;
993 wordinfo.ImportantWord := wordinfo.ImportantWord - countinfo.WordCount;
996 if wordinfo.NormalText > 0 then begin
997 wordinfo.NormalText := wordinfo.NormalText - 1;
998 wordinfo.NormalWord := wordinfo.NormalWord - countinfo.WordCount;