
Added <NONSPAMMINESS/> for skins.
[gikonavigoeson/gikonavi.git] / GikoBayesian.pas
index 96646b1..04c34a6 100644
@@ -4,7 +4,7 @@ unit GikoBayesian;
 \file          GikoBayesian.pas
 \brief Bayesian filter
 
-$Id: GikoBayesian.pas,v 1.8 2004/10/21 05:59:39 yoffy Exp $
+$Id: GikoBayesian.pas,v 1.9 2004/10/27 00:10:12 yoffy Exp $
 }
 
 interface
@@ -60,7 +60,7 @@ type
 	\brief Filter algorithm
        ************************************************************}
        TGikoBayesianAlgorithm =
-               (gbaPaulGraham, gbaGaryRonbinson{, gbaGaryRonbinsonFisher});
+               (gbaPaulGraham, gbaGaryRobinson, gbaGaryRobinsonFisher);
 
        {!***********************************************************
 	\brief Bayesian filter
@@ -106,7 +106,11 @@ type
                }
                function CalcGaryRobinson( wordCount : TWordCount ) : Extended;
 
-//             function CalcGaryRobinsonFisher( wordCount : TWordCount ) : Extended;
+               {!
+		\brief  Determines the attention level of a text based on the GaryRobinson-Fisher method
+		\return Attention level of the text (0.0 = not worth attention, 1.0 = worth attention)
+               }
+               function CalcGaryRobinsonFisher( wordCount : TWordCount ) : Extended;
 
                {!
 		\brief  Analyze the text
@@ -120,7 +124,7 @@ type
                function Parse(
                        const text                              : string;
                        wordCount                                       : TWordCount;
-                       algorithm                                       : TGikoBayesianAlgorithm = gbaGaryRonbinson
+                       algorithm                                       : TGikoBayesianAlgorithm = gbaGaryRobinsonFisher
                ) : Extended;
 
                {!
@@ -522,11 +526,13 @@ function TGikoBayesian.CalcPaulGraham( wordCount : TWordCount ) : Extended;
        begin
                info := Objects[ aWord ];
                if info = nil then
-                       Result := 0.4
+                       Result := 0.415
                else if info.NormalWord = 0 then
                        Result := 0.99
                else if info.ImportantWord = 0 then
                        Result := 0.01
+               else if info.ImportantWord + info.NormalWord * 2 < 5 then
+                       Result := 0.5
                else
                        Result := ( info.ImportantWord / info.ImportantText ) /
                                ((info.NormalWord * 2 / info.NormalText ) +
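The per-word probability above follows Paul Graham's weighting: occurrences in normal texts count double, unseen words get the neutral 0.415, and, with this change, words seen too rarely (ImportantWord + NormalWord * 2 < 5) fall back to 0.5 instead of skewing the score. A quick worked example with a hypothetical word seen 3 times across 10 important texts and once across 20 normal texts (counts chosen for illustration only):

	p = (3/10) / ((1*2/20) + (3/10)) = 0.3 / 0.4 = 0.75

so the word leans toward "important" without being treated as conclusive evidence.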
@@ -558,6 +564,7 @@ begin
                i := min( SAMPLE_COUNT, narray.Count );
                while i > 0 do begin
                        Dec( i );
+
                        s := s * Single( narray[ i ] );
                        q := q * (1 - Single( narray[ i ] ));
                end;
@@ -582,9 +589,9 @@ function TGikoBayesian.CalcGaryRobinson( wordCount : TWordCount ) : Extended;
                if info = nil then
                        Result := 0.415
                else if info.ImportantWord = 0 then
-                       Result := 0.0001
+                       Result := 0.01
                else if info.NormalWord = 0 then
-                       Result := 0.9999
+                       Result := 0.99
                else
                        Result := ( info.ImportantWord / info.ImportantText ) /
                                ((info.NormalWord / info.NormalText ) +
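The Robinson-style per-word probability, unlike the Graham one, does not double normal occurrences; it is simply

	p(w) = (iw/it) / ((nw/nt) + (iw/it))

where iw/nw are the word's counts in important/normal texts and it/nt the numbers of those texts (these abbreviations are shorthand for this note, not identifiers from the unit). The one-sided cases are now clamped to 0.01 and 0.99 instead of 0.0001 and 0.9999, which keeps a single never-misclassified word from dominating the logarithmic sums used below.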
@@ -604,8 +611,115 @@ var
        mean                            : Extended;
        countInfo               : TWordCountInfo;
        i                                               : Integer;
+	P1, Q1			: Extended;
+       cnt                                     : Extended;
+begin
+
+       if wordCount.Count = 0 then begin
+               Result := 1;
+               Exit;
+       end;
+
+       SetLength( narray, wordCount.Count );
+       mean := 0;
+       for i := 0 to wordCount.Count - 1 do begin
+               n                                               := p( wordCount[ i ] );
+               narray[ i ]     := n;
+               mean                            := mean + n;
+       end;
+       mean := mean / wordCount.Count;
+
+       cnt := 0;
+       P1 := 0;
+       Q1 := 0;
+	for i := 0 to wordCount.Count - 1 do begin
+		countInfo	:= TWordCountInfo( wordCount.Objects[ i ] );
+		if countInfo = nil then
+			Continue;	// skip entries without count info before dereferencing them
+		n	:= f( countInfo.WordCount, narray[ i ], mean );
+		cnt := cnt + countInfo.WordCount;
+		P1 := P1 + Ln( 1 - n ) * countInfo.WordCount;
+		Q1 := Q1 + Ln( n ) * countInfo.WordCount;
+	end;
+       if cnt = 0 then
+               cnt := 1;
+       P1 := 1 - Exp( P1 * (1 / cnt) );
+       Q1 := 1 - Exp( Q1 * (1 / cnt) );
+
+       if P1 + Q1 = 0 then begin
+               Result := 0.5
+       end else begin
+               n := (P1 - Q1) / (P1 + Q1);
+               Result := (1 + n) / 2;
+       end;
+
+end;
+
+//==============================
+// CalcGaryRobinsonFisher
+//==============================
+function TGikoBayesian.CalcGaryRobinsonFisher(
+       wordCount : TWordCount
+) : Extended;
+
+       function p( const aWord : string ) : Single;
+       var
+               info                            : TWordInfo;
+       begin
+               info := Objects[ aWord ];
+               if info = nil then
+                       Result := 0.415
+               else if info.ImportantWord = 0 then
+                       Result := 0.01
+               else if info.NormalWord = 0 then
+                       Result := 0.99
+               else
+                       Result := info.ImportantWord /
+                               (info.ImportantWord + info.NormalWord *
+                                info.ImportantText / info.NormalText);
+       end;
+
+       function f( cnt : Integer; n, mean : Single ) : Extended;
+       const
+               k = 0.00001;
+       begin
+               Result := ( (k * mean) + (cnt * n) ) / (k + cnt);
+       end;
+
+	// inverse chi-square upper-tail probability (chi2Q), the combining function of Fisher's method
+	function prbx( x2, degree : Extended ) : Extended;
+       var
+               m : Extended;
+               sum : Extended;
+               term : Extended;
+               i : extended;
+       begin
+
+               m := x2 / 2;
+               sum := exp( -m );
+               term := -m;
+
+               i := 1;
+		while i < (degree / 2) do begin	// terms i = 1 .. degree/2 - 1 of the chi-square series
+                       term := term + ln( m / i );
+                       sum := sum + exp( term );
+                       i := i + 1;
+               end;
+
+               if sum < 1 then
+                       Result := sum
+               else
+                       Result := 1.0;
+
+       end;
+
+var
+       n                                               : Extended;
+       narray                  : array of Single;
+       mean                            : Extended;
+       countInfo               : TWordCountInfo;
+       i                                               : Integer;
        normal                  : Extended;
        important               : Extended;
+       P1, Q1                  : Extended;
        cnt                                     : Extended;
 begin
 
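The body added for CalcGaryRobinson above (down to its end;) is Gary Robinson's geometric-mean combining. f() first blends each word's raw probability toward the mean probability of the text with a very small prior strength k, then the loop accumulates count-weighted log sums, so that

	P1 = 1 - (prod(1 - p_i))^(1/cnt)
	Q1 = 1 - (prod(p_i))^(1/cnt)
	attention = (1 + (P1 - Q1)/(P1 + Q1)) / 2

where the products run over every word occurrence (each distinct word weighted by its count) and cnt is the total number of occurrences. The Exp( ... * (1/cnt)) form is the cnt-th root of the product taken in log space; accumulating logarithms rather than multiplying the raw probabilities, as the removed normal/important code did, keeps the intermediate values well scaled on long texts.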
@@ -623,24 +737,41 @@ begin
        end;
        mean := mean / wordCount.Count;
 
-       cnt                             := 0;
-       normal          := 1;
-       important       := 1;
+       cnt := 0;
+(*
+       P1 := 1;
+       Q1 := 1;
+(*)
+       P1 := 0;
+       Q1 := 0;
+//*)
        for i := 0 to wordCount.Count - 1 do begin
                countInfo       := TWordCountInfo( wordCount.Objects[ i ] );
                n                                               := f( countInfo.WordCount, narray[ i ], mean );
-               normal                  := normal * n;
-               important               := important * (1 - n);
                if countInfo <> nil then
-                       cnt                                     := cnt + countInfo.WordCount;
+                       cnt := cnt + countInfo.WordCount;
+(*
+               P1 := P1 + Ln( 1 - n ) * countInfo.WordCount;
+               Q1 := Q1 + Ln( n ) * countInfo.WordCount;
+(*)
+               P1 := P1 + Ln( 1 - n );
+               Q1 := Q1 + Ln( n );
+//*)
        end;
        if cnt = 0 then
                cnt := 1;
-       normal := 1 - Exp( Ln( normal ) * (1 / cnt) );
-       important := 1 - Exp( Ln( important ) * (1 / cnt) );
-
-       n := (important - normal+ 0.00001) / (important + normal + 0.00001);
-       Result := (1 + n) / 2;
+//(*
+       P1 := prbx( -2 * P1, 2 * cnt );
+       Q1 := prbx( -2 * Q1, 2 * cnt );
+(*)
+       P1 := prbx( -2 * Ln( P1 ), 2 * cnt );
+       Q1 := prbx( -2 * Ln( Q1 ), 2 * cnt );
+//*)
+       if P1 + Q1 = 0 then begin
+               Result := 0.5
+       end else begin
+		Result := (1 + Q1 - P1) / 2;	// (1 + S - H) / 2 with S = 1 - P1, H = 1 - Q1
+       end;
 
 end;
 
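CalcGaryRobinsonFisher combines the same smoothed probabilities with Fisher's method instead of the geometric mean: prbx is the inverse chi-square upper-tail probability (often written chi2Q), and the active branch computes

	P1 = chi2Q( -2 * sum( ln(1 - p_i) ), 2*cnt )
	Q1 = chi2Q( -2 * sum( ln(p_i) ), 2*cnt )
	attention = (1 + Q1 - P1) / 2

with p_i the smoothed per-word probabilities (one term per distinct word) and cnt the total word count, i.e. the usual (1 + S - H) / 2 indicator with S = 1 - P1 and H = 1 - Q1. The (* ... (*) ... //*) blocks are Delphi comment toggles; the commented-out alternatives are earlier experiments (count-weighted sums and a product-based form), and flipping the first marker of a block switches variants.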
@@ -650,14 +781,16 @@ end;
 function TGikoBayesian.Parse(
        const text                              : string;
        wordCount                                       : TWordCount;
-       algorithm                                       : TGikoBayesianAlgorithm = gbaGaryRonbinson
+       algorithm                                       : TGikoBayesianAlgorithm
 ) : Extended;
 begin
 
        CountWord( text, wordCount );
        case algorithm of
        gbaPaulGraham:          Result := CalcPaulGraham( wordCount );
-       gbaGaryRonbinson:       Result := CalcGaryRobinson( wordCount );
+       gbaGaryRobinson:        Result := CalcGaryRobinson( wordCount );
+       gbaGaryRobinsonFisher:
+                                                                               Result := CalcGaryRobinsonFisher( wordCount );
        else                                                    Result := 0;
        end;
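For reference, a minimal caller sketch showing how the new dispatch is used; the ResIsNoteworthy function, the 0.9 threshold, and the assumption that TWordCount has a parameterless constructor are illustrative, not part of the repository:

	function ResIsNoteworthy( bayesian : TGikoBayesian; const resBody : string ) : Boolean;
	var
		wc	: TWordCount;
	begin
		wc := TWordCount.Create;	// assumes TWordCount has a parameterless constructor
		try
			// algorithm passed explicitly; gbaGaryRobinsonFisher is also the new default
			Result := bayesian.Parse( resBody, wc, gbaGaryRobinsonFisher ) > 0.9;	// illustrative threshold
		finally
			wc.Free;
		end;
	end;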