\file GikoBayesian.pas
\brief \83x\83C\83W\83A\83\93\83t\83B\83\8b\83^
-$Id: GikoBayesian.pas,v 1.1 2004/10/20 18:25:00 yoffy Exp $
+$Id: GikoBayesian.pas,v 1.9 2004/10/27 00:10:12 yoffy Exp $
}
interface
//==================================================
uses
//==================================================
- Classes, IniFiles;
+ Classes;
//==================================================
type
\brief \89ð\90Í\8dÏ\82Ý\92P\8cê\83\8a\83X\83g
************************************************************}
// TWordCount = class( THashedStringList ) // \8c\83\92x
- TWordCount = class( TStringList ) // \92x
+ TWordCount = class( TStringList )
public
+ constructor Create;
destructor Destroy; override;
end;
\brief \83t\83B\83\8b\83^\83A\83\8b\83S\83\8a\83Y\83\80
************************************************************}
TGikoBayesianAlgorithm =
- (gbaPaulGraham, gbaGaryRonbinson{, gbaGaryRonbinsonFisher});
+ (gbaPaulGraham, gbaGaryRobinson, gbaGaryRobinsonFisher);
{!***********************************************************
\brief \83x\83C\83W\83A\83\93\83t\83B\83\8b\83^
************************************************************}
- TGikoBayesian = class( THashedStringList )
+// TGikoBayesian = class( THashedStringList ) // \8c\83\92x
+ TGikoBayesian = class( TStringList )
private
FFilePath : string; //!< \93Ç\82Ý\8d\9e\82ñ\82¾\83t\83@\83C\83\8b\83p\83X
function GetObject( const name : string ) : TWordInfo;
}
function CalcGaryRobinson( wordCount : TWordCount ) : Extended;
-// function CalcGaryRobinsonFisher( wordCount : TWordCount ) : Extended;
+ {!
+ \brief GaryRobinson-Fisher \96@\82É\8aî\82Ã\82¢\82Ä\95¶\8fÍ\82Ì\92\8d\96Ú\93x\82ð\8c\88\92è\82µ\82Ü\82·
+ \return \95¶\8fÍ\82Ì\92\8d\96Ú\93x (\92\8d\96Ú\82É\92l\82µ\82È\82¢ 0.0\81`1.0 \92\8d\96Ú\82·\82×\82«)
+ }
+ function CalcGaryRobinsonFisher( wordCount : TWordCount ) : Extended;
{!
\brief \95¶\8fÍ\82ð\89ð\90Í
function Parse(
const text : string;
wordCount : TWordCount;
- algorithm : TGikoBayesianAlgorithm = gbaGaryRonbinson
+ algorithm : TGikoBayesianAlgorithm = gbaGaryRobinsonFisher
) : Extended;
{!
//==================================================
uses
- SysUtils, Math;
+ SysUtils, Math, Windows;
const
GIKO_BAYESIAN_FILE_VERSION = '1.0';
- kYofKanji : TSysCharSet = [#$80..#$A0, #$E0..#$ff];
+{
+ Modes = (ModeWhite, ModeGraph, ModeAlpha, ModeHanKana, ModeNum,
+ ModeWGraph, ModeWAlpha, ModeWNum,
+ ModeWHira, ModeWKata, ModeWKanji);
+}
+ CharMode1 : array [ 0..255 ] of Byte =
+ (
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
+ 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1,
+ 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 0,
+
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 1, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ );
//************************************************************
// misc
//************************************************************
// TWordCount class
//************************************************************
+constructor TWordCount.Create;
+begin
+
+ Duplicates := dupIgnore;
+ CaseSensitive := True;
+ Sorted := True;
+
+end;
+
destructor TWordCount.Destroy;
var
i : Integer;
constructor TGikoBayesian.Create;
begin
- Duplicates := dupIgnore;
- Sorted := True;
+ Duplicates := dupIgnore;
+ CaseSensitive := True;
+ Sorted := True;
end;
info : TWordInfo;
begin
+ FFilePath := filePath;
+
if not FileExists( filePath ) then
Exit;
-
+
sl := TStringList.Create;
try
sl.LoadFromFile( filePath );
info : TWordInfo;
begin
+ FFilePath := filePath;
+
sl := TStringList.Create;
try
sl.BeginUpdate;
idx : Integer;
begin
- idx := IndexOf( name );
+ idx := IndexOf( name ); // \8c\83\92x
if idx < 0 then
Result := nil
else
const text : string;
wordCount : TWordCount );
type
- Modes = (ModeWhite, ModeGraph, ModeAlpha, ModeNum, ModeHanKana,
+ Modes = (ModeWhite, ModeGraph, ModeAlpha, ModeHanKana, ModeNum,
ModeWGraph, ModeWAlpha, ModeWNum,
ModeWHira, ModeWKata, ModeWKanji);
var
const
KAKUJOSI = '\82ð' + #10 + '\82É' + #10 + '\82ª' + #10 + '\82Æ' + #10 + '\82©\82ç' +
#10 + '\82Å' + #10 + '\82Ö' + #10 + '\82æ\82è' + #10 + '\82Ü\82Å';
+ kKanji = [$80..$A0, $E0..$ff];
begin
delimiter := TStringList.Create;
try
- //*** \91¬\93x\83e\83X\83g\92\86
- wordCount.Duplicates := dupIgnore;
- wordCount.CaseSensitive := True;
- wordCount.Capacity := 1000;
- wordCount.Sorted := True;
- //***
-
mode := ModeWhite;
delimiter.Text := KAKUJOSI;
- SetLength( aWord, 256 );
p := PChar( text );
tail := p + Length( text );
last := p;
delimited := False;
// \95¶\8e\9a\82Ì\83^\83C\83v\82ð\94»\95Ê
// \81¦\8bå\93Ç\93_\82Í ModeGraph \82É\82È\82é\82Ì\82Å\8cÂ\95Ê\82É\91Î\89\9e\82µ\82È\82\82Ä\82à\82¢\82¢
- if p^ in kYofKanji then begin
+// if Byte(Byte( p^ ) - $a1) < $5e then begin
+ if Byte( p^ ) in kKanji then begin
if p + 1 < tail then begin
ch := (PByte( p )^ shl 8) or PByte( p + 1 )^;
case ch of
$8340..$8396: newMode := ModeWKata;
else newMode := ModeWKanji;
end;
+ // '\81J\81K\81[' \82Í\95½\89¼\96¼\81A\82Ü\82½\82Í\83J\83^\83J\83i\82É\8aÜ\82Ü\82ê\82é
+ if (mode = ModeWHira) or (mode = ModeWKata) then
+ if (ch = $814a) or (ch = $814b) or (ch = $815b) then
+ newMode := mode;
end else begin
newMode := ModeWhite;
end;
end;
end;
end else begin
- case p^ of
- #$0..#$20, #$7f: newMode := ModeWhite;
- '0'..'9': newMode := ModeNum;
- 'a'..'z', 'A'..'Z': newMode := ModeAlpha;
- #$A6..#$DD: newMode := ModeHanKana;
- else newMode := ModeGraph;
- end;
+ newMode := Modes( CharMode1[ Byte( p^ ) ] );
chSize := 1;
end;
// \95¶\8e\9a\82Ì\83^\83C\83v\82ª\95Ï\8dX\82³\82ê\82½
// \82à\82µ\82\82Í\8bæ\90Ø\82è\82É\82È\82é\95¶\8e\9a\82É\91\98\8bö\82µ\82½
if mode <> ModeWhite then begin
- aWord := Copy( last, 0, p - last ); // \8c\83\92x
-// SetLength( aWord, p - last );
-// CopyMemory( PChar( aWord ), last, p - last );
- idx := wordCount.IndexOf( aWord ); // \8c\83\92x
+ SetLength( aWord, p - last );
+ CopyMemory( PChar( aWord ), last, p - last );
+ //aWord := Copy( last, 0, p - last );
+ idx := wordCount.IndexOf( aWord ); // \92x
if idx < 0 then begin
countInfo := TWordCountInfo.Create;
wordCount.AddObject( aWord, countInfo );
begin
info := Objects[ aWord ];
if info = nil then
- Result := 0.4
+ Result := 0.415
else if info.NormalWord = 0 then
Result := 0.99
else if info.ImportantWord = 0 then
Result := 0.01
+ else if info.ImportantWord + info.NormalWord * 2 < 5 then
+ Result := 0.5
else
Result := ( info.ImportantWord / info.ImportantText ) /
((info.NormalWord * 2 / info.NormalText ) +
i := min( SAMPLE_COUNT, narray.Count );
while i > 0 do begin
Dec( i );
+
s := s * Single( narray[ i ] );
q := q * (1 - Single( narray[ i ] ));
end;
if info = nil then
Result := 0.415
else if info.ImportantWord = 0 then
- Result := 0.0001
+ Result := 0.01
else if info.NormalWord = 0 then
- Result := 0.9999
+ Result := 0.99
else
Result := ( info.ImportantWord / info.ImportantText ) /
((info.NormalWord / info.NormalText ) +
mean : Extended;
countInfo : TWordCountInfo;
i : Integer;
+ P1, Q1, R1 : Extended;
+ cnt : Extended;
+begin
+
+ if wordCount.Count = 0 then begin
+ Result := 1;
+ Exit;
+ end;
+
+ SetLength( narray, wordCount.Count );
+ mean := 0;
+ for i := 0 to wordCount.Count - 1 do begin
+ n := p( wordCount[ i ] );
+ narray[ i ] := n;
+ mean := mean + n;
+ end;
+ mean := mean / wordCount.Count;
+
+ cnt := 0;
+ P1 := 0;
+ Q1 := 0;
+ for i := 0 to wordCount.Count - 1 do begin
+ countInfo := TWordCountInfo( wordCount.Objects[ i ] );
+ n := f( countInfo.WordCount, narray[ i ], mean );
+ if countInfo <> nil then
+ cnt := cnt + countInfo.WordCount;
+ P1 := P1 + Ln( 1 - n ) * countInfo.WordCount;
+ Q1 := Q1 + Ln( n ) * countInfo.WordCount;
+ end;
+ if cnt = 0 then
+ cnt := 1;
+ P1 := 1 - Exp( P1 * (1 / cnt) );
+ Q1 := 1 - Exp( Q1 * (1 / cnt) );
+
+ if P1 + Q1 = 0 then begin
+ Result := 0.5
+ end else begin
+ n := (P1 - Q1) / (P1 + Q1);
+ Result := (1 + n) / 2;
+ end;
+
+end;
+
+//==============================
+// CalcGaryRobinsonFisher
+//==============================
+function TGikoBayesian.CalcGaryRobinsonFisher(
+ wordCount : TWordCount
+) : Extended;
+
+ function p( const aWord : string ) : Single;
+ var
+ info : TWordInfo;
+ begin
+ info := Objects[ aWord ];
+ if info = nil then
+ Result := 0.415
+ else if info.ImportantWord = 0 then
+ Result := 0.01
+ else if info.NormalWord = 0 then
+ Result := 0.99
+ else
+ Result := info.ImportantWord /
+ (info.ImportantWord + info.NormalWord *
+ info.ImportantText / info.NormalText);
+ end;
+
+ function f( cnt : Integer; n, mean : Single ) : Extended;
+ const
+ k = 0.00001;
+ begin
+ Result := ( (k * mean) + (cnt * n) ) / (k + cnt);
+ end;
+
+ function prbx( x2, degree : Extended ) : Extended;
+ var
+ m : Extended;
+ sum : Extended;
+ term : Extended;
+ i : extended;
+ begin
+
+ m := x2 / 2;
+ sum := exp( -m );
+ term := -m;
+
+ i := 1;
+ while i < (degree / 2 - 1) do begin
+ term := term + ln( m / i );
+ sum := sum + exp( term );
+ i := i + 1;
+ end;
+
+ if sum < 1 then
+ Result := sum
+ else
+ Result := 1.0;
+
+ end;
+
+var
+ n : Extended;
+ narray : array of Single;
+ mean : Extended;
+ countInfo : TWordCountInfo;
+ i : Integer;
normal : Extended;
important : Extended;
+ P1, Q1 : Extended;
cnt : Extended;
begin
end;
mean := mean / wordCount.Count;
- cnt := 0;
- normal := 1;
- important := 1;
+ cnt := 0;
+(*
+ P1 := 1;
+ Q1 := 1;
+(*)
+ P1 := 0;
+ Q1 := 0;
+//*
for i := 0 to wordCount.Count - 1 do begin
countInfo := TWordCountInfo( wordCount.Objects[ i ] );
n := f( countInfo.WordCount, narray[ i ], mean );
- normal := normal * n;
- important := important * (1 - n);
if countInfo <> nil then
- cnt := cnt + countInfo.WordCount;
+ cnt := cnt + countInfo.WordCount;
+(*
+ P1 := P1 + Ln( 1 - n ) * countInfo.WordCount;
+ Q1 := Q1 + Ln( n ) * countInfo.WordCount;
+(*)
+ P1 := P1 + Ln( 1 - n );
+ Q1 := Q1 + Ln( n );
+//*)
end;
if cnt = 0 then
cnt := 1;
- normal := 1 - Exp( Ln( normal ) * (1 / cnt) );
- important := 1 - Exp( Ln( important ) * (1 / cnt) );
-
- n := (important - normal+ 0.00001) / (important + normal + 0.00001);
- Result := (1 + n) / 2;
+//(*
+ P1 := prbx( -2 * P1, 2 * cnt );
+ Q1 := prbx( -2 * Q1, 2 * cnt );
+(*)
+ P1 := prbx( -2 * Ln( P1 ), 2 * cnt );
+ Q1 := prbx( -2 * Ln( Q1 ), 2 * cnt );
+//*)
+ if P1 + Q1 = 0 then begin
+ Result := 0.5
+ end else begin
+ Result := (1 + Q1 + P1) / 2;
+ end;
end;
function TGikoBayesian.Parse(
const text : string;
wordCount : TWordCount;
- algorithm : TGikoBayesianAlgorithm = gbaGaryRonbinson
+ algorithm : TGikoBayesianAlgorithm
) : Extended;
begin
CountWord( text, wordCount );
case algorithm of
gbaPaulGraham: Result := CalcPaulGraham( wordCount );
- gbaGaryRonbinson: Result := CalcGaryRobinson( wordCount );
+ gbaGaryRobinson: Result := CalcGaryRobinson( wordCount );
+ gbaGaryRobinsonFisher:
+ Result := CalcGaryRobinsonFisher( wordCount );
else Result := 0;
end;
aWord : string;
wordinfo : TWordInfo;
countinfo : TWordCountInfo;
- i : Integer;
+ i : Integer;
begin
for i := 0 to wordCount.Count - 1 do begin
aWord := wordCount[ i ];
wordinfo := Objects[ aWord ];
+ countinfo := TWordCountInfo( wordCount.Objects[ i ] );
if wordinfo = nil then begin
wordinfo := TWordInfo.Create;
Objects[ aWord ] := wordinfo;
end;
- countinfo := TWordCountInfo( wordCount.Objects[ i ] );
if isImportant then begin
wordinfo.ImportantWord := wordinfo.ImportantWord + countinfo.WordCount;
wordinfo.ImportantText := wordinfo.ImportantText + 1;
countinfo := TWordCountInfo( wordCount.Objects[ i ] );
if isImportant then begin
- wordinfo.ImportantWord := wordinfo.ImportantWord - countinfo.WordCount;
- wordinfo.ImportantText := wordinfo.ImportantText - 1;
+ if wordInfo.ImportantText > 0 then begin
+ wordinfo.ImportantText := wordinfo.ImportantText - 1;
+ wordinfo.ImportantWord := wordinfo.ImportantWord - countinfo.WordCount;
+ end;
end else begin
- wordinfo.NormalWord := wordinfo.NormalWord - countinfo.WordCount;
- wordinfo.NormalText := wordinfo.NormalText - 1;
+ if wordinfo.NormalText > 0 then begin
+ wordinfo.NormalText := wordinfo.NormalText - 1;
+ wordinfo.NormalWord := wordinfo.NormalWord - countinfo.WordCount;
+ end;
end;
end;