OSDN Git Service

- 平仮名の単語分けワードを増やしてより細分化。
authoryoffy <yoffy>
Mon, 1 Nov 2004 04:45:25 +0000 (04:45 +0000)
committeryoffy <yoffy>
Mon, 1 Nov 2004 04:45:25 +0000 (04:45 +0000)
- 漢字にも単語分けワードを用意。
- 漢字は最長 2 字で切るようになった。

GikoBayesian.pas

index 0e52a80..788799e 100644 (file)
@@ -4,7 +4,7 @@ unit GikoBayesian;
 \file          GikoBayesian.pas
 \brief ベイジアンフィルタ (Bayesian filter)
 
-$Id: GikoBayesian.pas,v 1.11 2004/10/31 16:48:44 yoffy Exp $
+$Id: GikoBayesian.pas,v 1.12 2004/11/01 04:45:25 yoffy Exp $
 }
 
 interface
@@ -160,7 +160,8 @@ implementation
 //==================================================
 
 uses
-       SysUtils, Math, Windows;
+       SysUtils, Math, Windows,
+       MojuUtils;
 
 const
        GIKO_BAYESIAN_FILE_VERSION      = '1.0';
@@ -399,48 +400,200 @@ procedure TGikoBayesian.CountWord(
        const text      : string;
        wordCount               : TWordCount );
 type
-       Modes                           = (ModeWhite, ModeGraph, ModeAlpha, ModeHanKana, ModeNum,
+       Modes                           = (ModeWhite, ModeGraph, ModeAlpha, ModeNum, ModeHanKana,
                                                                ModeWGraph, ModeWAlpha, ModeWNum,
                                                                ModeWHira, ModeWKata, ModeWKanji);
 var
-       p, tail, last   : PChar;
-       mode, newMode   : Modes;
-       aWord                                   : string;
-       ch                                              : Longword;
-       chSize                          : Integer;
-       delimiter                       : TStringList;
-       delimited                       : Boolean;
-       i, idx                          : Integer;
-       countInfo                       : TWordCountInfo;
+       p, tail, last                   : PChar;
+       mode, newMode                   : Modes;
+       ch                                                              : Longword;
+       chSize                                          : Integer;
+       wHiraDelimiter          : TStringList;
+       wHiraFinalDelimiter     : TStringList;
+       wKanjiDelimiter         : TStringList;
+       words                                                   : TStringList;
+       aWord                                                   : string;
+       countInfo                                       : TWordCountInfo;
+
+       // Surrounds every occurrence of each string in _delim with #10 on both
+       // sides and returns the rewritten text.  The entries of _delim are applied
+       // one after another, in list order, each to the output of the previous one.
+       function cutBoth( _aWord : string; _delim : TStringList ) : string;
+       var
+               _n                      : Integer;
+               _token                  : string;
+               _buf                    : string;
+       begin
+               _buf := _aWord;
+               for _n := 0 to _delim.Count - 1 do begin
+                       _token := _delim[ _n ];
+                       _buf := CustomStringReplace( _buf, _token, #10 + _token + #10, False );
+               end;
+               Result := _buf;
+       end;
+
+       // Inserts #10 in front of every occurrence of each string in _delim and
+       // returns the rewritten text.  The entries of _delim are applied one after
+       // another, in list order, each to the output of the previous one.
+       function cutFirst( _aWord : string; _delim : TStringList ) : string;
+       var
+               _n                      : Integer;
+               _token                  : string;
+               _buf                    : string;
+       begin
+               _buf := _aWord;
+               for _n := 0 to _delim.Count - 1 do begin
+                       _token := _delim[ _n ];
+                       _buf := CustomStringReplace( _buf, _token, #10 + _token, False );
+               end;
+               Result := _buf;
+       end;
+
+       // Appends #10 after every occurrence of each string in _delim and returns
+       // the rewritten text.  The entries of _delim are applied one after another,
+       // in list order, each to the output of the previous one.
+       function cutFinal( _aWord : string; _delim : TStringList ) : string;
+       var
+               _n                      : Integer;
+               _token                  : string;
+               _buf                    : string;
+       begin
+               _buf := _aWord;
+               for _n := 0 to _delim.Count - 1 do begin
+                       _token := _delim[ _n ];
+                       _buf := CustomStringReplace( _buf, _token, _token + #10, False );
+               end;
+               Result := _buf;
+       end;
+
+       // Counts every non-empty string of _words in _dst.
+       //   _dst   : word list; each entry's Objects[] slot holds a TWordCountInfo
+       //   _words : the words to register (empty strings are skipped)
+       // A word already present in _dst has its WordCount incremented; a new word
+       // is added together with a freshly created TWordCountInfo and then
+       // incremented as well.
+       procedure addWord( _dst : TWordCount; _words : TStringList );
+       var
+               _n, _pos                : Integer;
+               _entry                  : string;
+               _info                   : TWordCountInfo;
+       begin
+               for _n := 0 to _words.Count - 1 do begin
+                       _entry := _words[ _n ];
+                       if _entry = '' then
+                               Continue;
+
+                       if not _dst.Find( _entry, _pos ) then begin
+                               // first sighting: create the counter record
+                               _info := TWordCountInfo.Create;
+                               _dst.AddObject( _entry, _info );
+                       end else begin
+                               _info := TWordCountInfo( _dst.Objects[ _pos ] );
+                       end;
+                       _info.WordCount := _info.WordCount + 1;
+               end;
+       end;
+
+       // Turns one run of same-mode text into a #10-separated list of words
+       // (the caller assigns the result to a TStringList's Text property).
+       //   _aWord : the run of characters cut out of the input text
+       //   _mode  : the character mode that run was classified as
+       // Runs whose mode is below ModeWGraph are returned unchanged.  For the
+       // other modes the half-width and full-width spaces are removed first;
+       // ModeWHira runs are then split at the hiragana delimiter words, and
+       // ModeWKanji runs are split at the kanji delimiter words and additionally
+       // cut every 4 bytes.  All remaining modes return the space-stripped text.
+       function changeMode( _aWord : string; _mode : Modes ) : string;
+       var
+               _i                                                                      : Integer;
+               _aWord2                                                 : string;
+               _pWord, _pWord2                 : PChar;
+               _pWordTail, _pFound     : PChar;
+       const
+               _delim : string = #10;
+       begin
+               if Ord( _mode ) >= Ord( ModeWGraph ) then begin
+                       // Japanese text
+                       // Squeeze out the spaces (half-width, then full-width)
+                       _aWord := CustomStringReplace( _aWord, ' ', '', False );
+                       _aWord := CustomStringReplace( _aWord, '\81@', '', False );
+
+                       // Split into words at the delimiters.
+                       // Dispatch on the parameter, not on the enclosing procedure's
+                       // "mode" variable, so the result depends only on the arguments.
+                       case _mode of
+                       ModeWHira:
+                               begin
+                                       _aWord := cutFinal( _aWord, wHiraFinalDelimiter );
+                                       Result := cutBoth( _aWord, wHiraDelimiter );
+                               end;
+
+                       ModeWKanji:
+                               begin
+                                       // Split into words at the delimiters
+                                       _aWord := cutBoth( _aWord, wKanjiDelimiter );
+                                       // Then split every 4 bytes (2 characters)
+                                       _pWord := PChar( _aWord );
+                                       _i := Length( _aWord );
+                                       _pWordTail := _pWord + _i;
+                                       // Each 4-byte chunk copied gains one #10, so the output
+                                       // needs at most Length div 4 additional bytes.
+                                       SetLength( _aWord2, _i + (_i shr 2) );
+                                       _pWord2 := PChar( _aWord2 );
+
+                                       while _pWord < _pWordTail do begin
+                                               _pFound := AnsiStrPos( _pWord, PChar( _delim ) );
+                                               if _pFound = nil then begin
+                                                       // Last segment: stop at the end of the string and
+                                                       // do not copy the terminating #0.
+                                                       _pFound := _pWordTail;
+                                                       _i := 0;
+                                               end else begin
+                                                       // The #10 that ends this segment is copied as well.
+                                                       _i := 1;
+                                               end;
+
+                                               // Emit full 4-byte chunks, each followed by #10
+                                               while _pWord + 4 <= _pFound do begin
+                                                       CopyMemory( _pWord2, _pWord, 4 ); _pWord2[ 4 ] := #10;
+                                                       _pWord2 := _pWord2 + 5; _pWord := _pWord + 4;
+                                               end;
+                                               // Remainder of the segment (0..3 bytes) plus its #10, if any
+                                               _i := (_pFound - _pWord) + _i;
+                                               CopyMemory( _pWord2, _pWord, _i );
+                                               _pWord2 := _pWord2 + _i; _pWord := _pWord + _i;
+                                       end;
+                                       // Trim the buffer to the bytes actually written
+                                       SetLength( _aWord2, _pWord2 - PChar( _aWord2 ) );
+
+                                       Result := _aWord2;
+                               end;
+
+                       else
+                               Result := _aWord;
+                       end;
+               end else begin
+                       Result := _aWord;
+               end;
+       end;
 const
-       KAKUJOSI = '\82ð' + #10 + '\82É' + #10 + '\82ª' + #10 + '\82Æ' + #10 + '\82©\82ç'
-               + #10 + '\82Å' + #10 + '\82Ö' + #10 + '\82æ\82è' + #10 + '\82Ü\82Å'
-               ;{
+       WHIRA_DELIMITER = '\82ð' + #10 + '\82É' + #10 + '\82ª' + #10 + '\82Æ' + #10 + '\82©\82ç'
+               + #10 + '\82Ö' + #10 + '\82æ\82è' + #10 + '\82Ü\82Å'+ #10 + '\82Å'
+               + #10 + '\82±\82±' + #10 + '\82»\82±' + #10 + '\82Ç\82±'
                + #10 + '\82±\82ê' + #10 + '\82»\82ê' + #10 + '\82 \82ê' + #10 + '\82Ç\82ê'
                + #10 + '\82±\82Ì' + #10 + '\82»\82Ì' + #10 + '\82 \82Ì' + #10 + '\82Ç\82Ì'
                + #10 + '\82±\82¤' + #10 + '\82»\82¤' + #10 + '\82 \82 ' + #10 + '\82Ç\82¤'
                + #10 + '\82±\82ñ\82È' + #10 + '\82»\82ñ\82È' + #10 + '\82 \82ñ\82È' + #10 + '\82Ç\82ñ\82È'
-               + #10 + '\93I' + #10 + '\90«' + #10 + '\8e®' + #10 + '\89»' + #10 + '\96@'
-               + #10 + '\95s' + #10 + '\96³' + #10 + '\94ñ'
+               + #10 + '\82ê\82½' + #10 + '\82ê\82Ä' + #10 + '\82ê\82ê' + #10 + '\82ê\82ë'
+               + #10 + '\82ê\82é' + #10 + '\82ç\82ê\82é'
+               + #10 + '\82Å\82·' + #10 + '\82Ü\82·' + #10 + '\82Ü\82¹\82ñ'
+               + #10 + '\82Å\82µ\82½' + #10 + '\82Ü\82µ\82½'
+               + #10 + '\82·\82é' + #10 + '\82µ\82È\82¢' + #10 + '\82³\82ê\82é' + #10 + '\82³\82ê\82È\82¢'
+               ;
+       WKANJI_DELIMITER = '\93I' + #10 + '\90«' + #10 + '\8e®' + #10 + '\89»' + #10 + '\96@'
+               + #10 + '\95s' + #10 + '\96³' + #10 + '\94ñ' + #10 + '\94½'
+               ;
+       WHIRA_FINAL_DELIMITER = '\82Á\82½' + #10 + '\82Á\82Ä'
+               ;{
+               + #10 + '\82æ\82Á\82Ä' + #10 + '\82µ\82½\82ª\82Á\82Ä' + #10 + '\82È\82Ì\82Å'
+               + #10 + '\82¾\82©\82ç' + #10 + '\82Å\82·\82©\82ç'
                + #10 + '\82Ü\82½'
                + #10 + '\82µ\82©\82µ' + #10 + '\82¾\82ª' + #10 + '\82¯\82Ç' + #10 + '\82¯\82ê\82Ç'
                + #10 + '\82â\82Í\82è' + #10 + '\82â\82Á\82Ï\82è'
-               + #10 + '\82Å\82·' + #10 + '\82Ü\82·' + #10 + '\82Å\82µ' + #10 + '\82¾\82ë'
+               + #10 + '\82Å\82µ' + #10 + '\82¾\82ë'
                + #10 + '\82·\82é' + #10 + '\82µ\82È\82¢' + #10 + '\82µ\82½' + #10 + '\82µ\82È\82¢'
                ;}
+       // '\81[' \82ð '\82\9f\82¡\82£\82¥\82§' \82É\81B
+       HA_LINE = '\82 \82©\82³\82½\82È\82Í\82Ü\82â\82ç\82í\82ª\82´\82¾\82Î\82Ï\82\9f\82ì';
+       HI_LINE = '\82¢\82«\82µ\82¿\82É\82Ð\82Ý\82è\82î\82¬\82\82Ñ\82Ò\82¡';
+       HU_LINE = '\82¤\82­\82·\82Â\82Ê\82Ó\82Þ\82ä\82é\82®\82Ô\82Õ\82£';
+       HE_LINE = '\82¦\82¯\82¹\82Ä\82Ë\82Ö\82ß\82ê\82ï\82°\82×\82Ø\82¥';
+       HO_LINE = '\82¨\82±\82»\82Æ\82Ì\82Ù\82à\82æ\82ë\82ð\82²\82Ú\82Û\82§';
+       KA_LINE = '\83A\83J\83T\83^\83i\83n\83}\83\84\83\89\83\8f\83K\83U\83_\83o\83p\83@\83\95\83\8e';
+       KI_LINE = '\83C\83L\83V\83`\83j\83q\83~\83\8a\83\90\83M\83W\83r\83s\83B';
+       KU_LINE = '\83E\83N\83X\83c\83k\83t\83\80\83\86\83\8b\83O\83u\83v\83D\83\94';
+       KE_LINE = '\83G\83P\83Z\83e\83l\83w\83\81\83\8c\83\91\83Q\83x\83y\83F\83\96';
+       KO_LINE = '\83I\83R\83\\83g\83m\83z\83\82\83\88\83\8d\83\92\83S\83{\83|\83H';
        kKanji = [$80..$A0, $E0..$ff];
 begin
 
-       delimiter := TStringList.Create;
+       wHiraDelimiter  := TStringList.Create;
+       wHiraFinalDelimiter := TStringList.Create;
+       wKanjiDelimiter := TStringList.Create;
+       words := TStringList.Create;
        try
                mode := ModeWhite;
-               delimiter.Text := KAKUJOSI;
+               wHiraDelimiter.Text := WHIRA_DELIMITER;
+               wHiraFinalDelimiter.Text := WHIRA_FINAL_DELIMITER;
+               wKanjiDelimiter.Text := WKANJI_DELIMITER;
                p                       := PChar( text );
                tail    := p + Length( text );
                last    := p;
 
                while p < tail do begin
-                       delimited := False;
                        // \95\8e\9a\82Ì\83^\83C\83v\82ð\94»\95Ê
                        // \81¦\8bå\93Ç\93_\82Í ModeGraph \82É\82È\82é\82Ì\82Å\8cÂ\95Ê\82É\91Î\89\9e\82µ\82È\82­\82Ä\82à\82¢\82¢
 //                     if Byte(Byte( p^ ) - $a1) < $5e then begin
@@ -448,7 +601,8 @@ begin
                                if p + 1 < tail then begin
                                        ch := (PByte( p )^ shl 8) or PByte( p + 1 )^;
                                        case ch of
-                                       $8140:                                                  newMode := ModeWhite;
+                                       // \83X\83y\81[\83X\82Å\92P\8cê\95ª\82¯\82¹\82¸\82É\8bl\82ß\82é
+                                       //$8140:                                                        newMode := ModeWhite;
                                        $8141..$824e:                           newMode := ModeWGraph;
                                        $824f..$8258:                           newMode := ModeWNum;
                                        $8260..$829a:                           newMode := ModeWAlpha;
@@ -465,57 +619,30 @@ begin
                                end;
 
                                chSize := 2;
-
-                               // \8bæ\90Ø\82è\82É\82È\82é\95\8e\9a\82ª\82 \82é\82©\8c\9f\8d¸\82·\82é
-                               if p + 3 < tail then begin      // 3 = delimiter \82Ì\8dÅ\91å\8e\9a\90\94 - 1
-                                       for i := 0 to delimiter.Count - 1 do begin
-                                               if CompareMem(
-                                                       p, PChar( delimiter[ i ] ), Length( delimiter[ i ] ) ) then begin
-                                                       delimited := True;
-                                                       chSize := Length( delimiter[ i ] );
-                                                       Break;
-                                               end;
-                                       end;
-                               end;
                        end else begin
                                newMode := Modes( CharMode1[ Byte( p^ ) ] );
+                               if (p^ = ' ') and (Ord( mode ) >= Ord( ModeWGraph )) then begin
+                                       // \8d¡\82Ü\82Å\93ú\96{\8cê\82Å\8d¡\83X\83y\81[\83X
+                                       // \92P\8cê\82ð\8cq\82°\82Ä\8cã\82Å\83X\83y\81[\83X\82ð\8bl\82ß\82é
+                                       // \81¦\94¼\8ap\83J\83i\82Í\92Ê\8fí\83X\83y\81[\83X\82Å\8bæ\90Ø\82é\82¾\82ë\82¤\82©\82ç\8bl\82ß\82È\82¢
+                                       newMode := mode;
+                               end;
 
                                chSize := 1;
                        end;
 
-                       if (mode <> newMode) or delimited then begin
+                       if mode <> newMode then begin
 
                                // \95\8e\9a\82Ì\83^\83C\83v\82ª\95Ï\8dX\82³\82ê\82½
-                               if mode <> ModeWhite then begin
-                                       SetLength( aWord, p - last );
-                                       CopyMemory( PChar( aWord ), last, p - last );
-                                       //aWord := Copy( last, 0, p - last );
-                                       if wordCount.Find( aWord, idx ) then begin
-                                               countInfo := TWordCountInfo( wordCount.Objects[ idx ] );
-                                       end else begin
-                                               countInfo := TWordCountInfo.Create;
-                                               wordCount.AddObject( aWord, countInfo );
-                                       end;
-                                       countInfo.WordCount := countInfo.WordCount + 1;
-                               end;
+                               SetLength( aWord, p - last );
+                               CopyMemory( PChar( aWord ), last, p - last );
 
-                               last := p;
+                               words.Text := changeMode( aWord, mode );
 
-                               // \8bæ\90Ø\82è\82É\82È\82é\95\8e\9a\82É\91\98\8bö\82µ\82½
-                               if delimited then begin
-                                       SetLength( aWord, chSize );
-                                       CopyMemory( PChar( aWord ), last, chSize );
-                                       //aWord := Copy( last, 0, p - last );
-                                       if wordCount.Find( aWord, idx ) then begin
-                                               countInfo := TWordCountInfo( wordCount.Objects[ idx ] );
-                                       end else begin
-                                               countInfo := TWordCountInfo.Create;
-                                               wordCount.AddObject( aWord, countInfo );
-                                       end;
-                                       countInfo.WordCount := countInfo.WordCount + 1;
-                                       last := last + chSize;
-                               end;
+                               // \92P\8cê\93o\98^
+                               addWord( wordCount, words );
 
+                               last := p;
                                mode := newMode;
 
                        end;
@@ -525,16 +652,16 @@ begin
 
                if mode <> ModeWhite then begin
                        aWord := Copy( last, 0, p - last );
-                       if wordCount.Find( aWord, idx ) then begin
-                               countInfo := TWordCountInfo( wordCount.Objects[ idx ] );
-                       end else begin
-                               countInfo := TWordCountInfo.Create;
-                               wordCount.AddObject( aWord, countInfo );
-                       end;
-                       countInfo.WordCount := countInfo.WordCount + 1;
+                       words.Text := changeMode( aWord, mode );
+
+                       // \92P\8cê\93o\98^
+                       addWord( wordCount, words );
                end;
        finally
-               delimiter.Free;
+               words.Free;
+               wKanjiDelimiter.Free;
+               wHiraFinalDelimiter.Free;
+               wHiraDelimiter.Free;
        end;
 
 end;