OSDN Git Service

単語抽出の向上
[chnosproject/CHNOSProject.git] / CHNOSProject / AI003 / AI003 / main.c
index 1f3f6b4..05fbd0d 100755 (executable)
@@ -24,8 +24,9 @@ int main(int argc, const char * argv[])
     //エントリポイント\r
     int i, i_max;\r
     CHNLIB_String *input, *temp;\r
-    CHNLIB_UIPArray *separated;\r
+    CHNLIB_UIPArray *separated, *sorted;\r
     int passthink;\r
+    FILE *readfp;\r
     \r
     CHNLIB_Environment_SetCurrentWorkingDirectory(argv[0]);\r
     \r
@@ -34,10 +35,21 @@ int main(int argc, const char * argv[])
     AI_System_InitializeSystemWorkingSet();\r
     \r
     AI_System_LoadMemory(AI_CONFIG_FILE_NAME);\r
+    \r
+    readfp = NULL;\r
 \r
     for(;;){\r
         passthink = False;\r
-        input = CHNLIB_ReadLine(stdin);\r
+        if(readfp == NULL){\r
+            input = CHNLIB_ReadLine(stdin);\r
+        } else{\r
+            input = CHNLIB_ReadLine(readfp);\r
+            if(input == NULL){\r
+                readfp = NULL;\r
+            } else{\r
+                puts(CHNLIB_String_GetReferencePointerOfCString(input));\r
+            }\r
+        }\r
 \r
         if(CHNLIB_String_CompareStringWithCString(input, "::")){\r
             //システムコマンド解釈\r
@@ -52,28 +64,32 @@ int main(int argc, const char * argv[])
                 } else if(CHNLIB_String_CompareStringWithCString(temp, "exit")){\r
                     break;\r
                 } else if(CHNLIB_String_CompareStringWithCString(temp, "wordlist")){\r
-                    i_max = CHNLIB_UIPArray_GetNumberOfDatas(WorkingSet.RootWordList);\r
+                    sorted = CHNLIB_UIPArray_SortInDescendingOrderByData32(WorkingSet.RootWordList);\r
+                    i_max = CHNLIB_UIPArray_GetNumberOfDatas(sorted);\r
                     for(i = 0; i < i_max; i++){\r
-                        printf("word%3d:%s\n", i, CHNLIB_String_GetReferencePointerOfCString(CHNLIB_UIPArray_GetPointerByIndex(WorkingSet.RootWordList, i)));\r
+                        printf("word%3d:%3d:%s\n", i, CHNLIB_UIPArray_GetData32ByIndex(sorted, i), CHNLIB_String_GetReferencePointerOfCString(CHNLIB_UIPArray_GetPointerByIndex(sorted, i)));\r
+                    }\r
+                    CHNLIB_UIPArray_FreeOnlyArray(&sorted);\r
+                } else if(CHNLIB_String_CompareStringWithCString(temp, "readfile")){\r
+                    //::readfile:filename\r
+                    readfp = fopen(CHNLIB_String_GetReferencePointerOfCString(CHNLIB_UIPArray_GetPointerByIndex(separated, 4)), "rb");\r
+                    if(readfp == NULL){\r
+                        puts("File open error.");\r
                     }\r
                 } else{\r
                     passthink = False;\r
                 }\r
             }\r
             \r
+            //WorkingSet.SystemWordList0を使っているので、解放しないように注意\r
             CHNLIB_UIPArray_FreeSelectedAll(&separated);\r
         }\r
         \r
         if(!passthink){\r
             //AIへの入力\r
-            separated = CHNLIB_UIPArray_Initialize();\r
-            separated = AI_Think_SlideLookUpWordByHistory(input);\r
-            i_max = CHNLIB_UIPArray_GetNumberOfDatas(separated);\r
-            printf("Index(Decimal),CountOfContain(Decimal), String\n");\r
-            for(i = 0; i < i_max; i++){\r
-                printf("%d,%d,%s\n", i, CHNLIB_UIPArray_GetData32ByIndex(separated, i), CHNLIB_String_GetReferencePointerOfCString(CHNLIB_UIPArray_GetPointerByIndex(separated, i)));\r
-            }\r
-            CHNLIB_UIPArray_FreeAll(&separated);\r
+            //::readfile:AITestData_ja.txt\r
+            AI_Think_LearnWordFromInputString(input);\r
+            \r
             CHNLIB_UIPArray_AppendLast(&WorkingSet.InputHistory, CHNLIB_UIPArray_GetNumberOfDatas(WorkingSet.InputHistory), input);\r
         }\r
     }\r
@@ -81,6 +97,39 @@ int main(int argc, const char * argv[])
     return 0;\r
 }\r
 \r
+void AI_Think_LearnWordFromInputString(CHNLIB_String *input)\r
+{\r
+    //Extracts candidate words from input via AI_Think_SlideLookUpWordByHistory, prints each one, and registers it in WorkingSet.RootWordList. NOTE(review): the value from CHNLIB_UIPArray_Initialize() below is overwritten by the very next assignment without being released - confirm whether that leaks.\r
+    CHNLIB_UIPArray *candidateWordList;\r
+    int i, i_max;\r
+    int tagIndex, maxExistingWordIndex;\r
+    \r
+    candidateWordList = CHNLIB_UIPArray_Initialize();\r
+    candidateWordList = AI_Think_SlideLookUpWordByHistory(input);\r
+    i_max = CHNLIB_UIPArray_GetNumberOfDatas(candidateWordList);\r
+    maxExistingWordIndex = CHNLIB_UIPArray_GetNumberOfDatas(WorkingSet.RootWordList) - 1;\r
+    \r
+    printf("Index(Decimal),CountOfContain(Decimal), String\n");\r
+    for(i = 0; i < i_max; i++){\r
+        printf("%d,%d,%s\n", i, CHNLIB_UIPArray_GetData32ByIndex(candidateWordList, i), CHNLIB_String_GetReferencePointerOfCString(CHNLIB_UIPArray_GetPointerByIndex(candidateWordList, i)));\r
+        \r
+        //Register the candidate in the root word list (tagIndex is the value returned by AI_Memory_AddRootWordData), then add this candidate's data32 count to the data32 stored at tagIndex.\r
+        tagIndex = AI_Memory_AddRootWordData(CHNLIB_UIPArray_GetPointerByIndex(candidateWordList, i));\r
+        \r
+        CHNLIB_UIPArray_SetData32ByIndex(WorkingSet.RootWordList, tagIndex, CHNLIB_UIPArray_GetData32ByIndex(WorkingSet.RootWordList, tagIndex) + CHNLIB_UIPArray_GetData32ByIndex(candidateWordList, i));\r
+        \r
+        if(tagIndex <= maxExistingWordIndex){\r
+            //tagIndex lies within the entries that existed before this call, so the tag was already registered: set data32 to False so that this candidate gets freed.\r
+            CHNLIB_UIPArray_SetData32ByIndex(candidateWordList, i, False);\r
+        }\r
+        //For newly added tags, data32 should hold a value of 0 or greater, i.e. True, so they are not freed.\r
+    }\r
+    //Free only the tags with data32 == False, i.e. those that were the same as an existing tag.\r
+    CHNLIB_UIPArray_FreeSelectedAll(&candidateWordList);\r
+\r
+    return;\r
+}\r
+\r
 CHNLIB_UIPArray *AI_Think_SlideLookUpWordByHistory(CHNLIB_String *input)\r
 {\r
     //[UTF-8]\r
@@ -144,47 +193,84 @@ CHNLIB_UIPArray *AI_Think_SlideLookUpWordByHistory(CHNLIB_String *input)
         CHNLIB_UIPArray_SetData32ByIndex(candidatewordlist, k, j);\r
     }\r
     \r
+    //重複抽出フィルタリング\r
+    AI_Think_CandidateWordList_Filter01(&candidatewordlist, 1);\r
+    AI_Think_CandidateWordList_Filter00(&candidatewordlist);\r
+    \r
     return candidatewordlist;\r
 }\r
 \r
-int AI_Think_CandidateWordList_Filter00(CHNLIB_UIPArray *candidatewordlist)\r
+int AI_Think_CandidateWordList_Filter00(CHNLIB_UIPArray **candidatewordlist)\r
 {\r
     //[Not implemented]\r
     //[UTF-8]\r
     //AI_Think_SlideLookUpWordByHistoryで返されたArray内の候補単語に対して、フィルターをかける。\r
+    //00: remove a word that is contained in the preceding surviving word (described by the author as the longer word; list ordering is not visible here - confirm) and whose occurrence count equals that word's count. Always returns 0.\r
     int i, i_max;\r
     CHNLIB_String *nowstr, *basestr;\r
-    int nowstrlen, basestrlen;\r
+    int basestrCoC;     //CoC = CountOfContainInHistoryStrings: the data32 value stored for basestr\r
     \r
-    i_max = CHNLIB_UIPArray_GetNumberOfDatas(candidatewordlist);\r
-    basestr = CHNLIB_UIPArray_GetPointerByIndex(WorkingSet.InputHistory, 0);\r
-    //basestrlen =\r
+    i_max = CHNLIB_UIPArray_GetNumberOfDatas(*candidatewordlist);\r
+    basestr = CHNLIB_UIPArray_GetPointerByIndex(*candidatewordlist, 0);\r
+    basestrCoC = CHNLIB_UIPArray_GetData32ByIndex(*candidatewordlist, 0);\r
     for(i = 1; i < i_max; i++){\r
-        nowstr = CHNLIB_UIPArray_GetPointerByIndex(WorkingSet.InputHistory, i);\r
+        nowstr = CHNLIB_UIPArray_GetPointerByIndex(*candidatewordlist, i);\r
+        if(CHNLIB_UTF8_GetCountOfContain(CHNLIB_String_GetReferencePointerOfCString(basestr), CHNLIB_String_GetReferencePointerOfCString(nowstr)) > 0){\r
+            //nowstr is contained in basestr\r
+            if(basestrCoC == CHNLIB_UIPArray_GetData32ByIndex(*candidatewordlist, i)){\r
+                //...and its occurrence count is equal, so this word is redundant.\r
+                //Set the occurrence count to 0 here; the entry is removed in the second loop below.\r
+                CHNLIB_UIPArray_SetData32ByIndex(*candidatewordlist, i, 0);\r
+            }\r
+        }\r
         \r
+        if(CHNLIB_UIPArray_GetData32ByIndex(*candidatewordlist, i) != 0){\r
+            //The word was not marked for removal, i.e. it is a distinct word, so it becomes the new basestr.\r
+            basestr = nowstr;\r
+            basestrCoC = CHNLIB_UIPArray_GetData32ByIndex(*candidatewordlist, i);\r
+        }\r
     }\r
+    \r
+    for(i = 1; i < i_max; i++){\r
+        if(CHNLIB_UIPArray_GetData32ByIndex(*candidatewordlist, i) == 0){\r
+            //Marked for removal; i and i_max are stepped back so the following entry is not skipped (assumes RemoveByIndex shifts later entries down - confirm).\r
+            CHNLIB_UIPArray_RemoveByIndex(candidatewordlist, i);\r
+            i--;\r
+            i_max--;\r
+        }\r
+    }\r
+    \r
     return 0;\r
 }\r
 \r
-void AI_Memory_AddRootWordData(CHNLIB_String *tag)\r
+int AI_Think_CandidateWordList_Filter01(CHNLIB_UIPArray **candidatewordlist, int length)\r
 {\r
-    //WorkingSet.RootWordListに文字列を追加する。\r
-    //重複がある場合は警告を出し、追加しない。\r
+    //Filter 01 for candidate word lists: removes short entries from *candidatewordlist in place. Always returns 0.\r
+    //[UTF-8] Word length is taken from CHNLIB_UTF8_GetStringLengthByCharacter.\r
+    //Applies a filter to the candidate words in the array returned by AI_Think_SlideLookUpWordByHistory.\r
+    //01: remove words whose character count is length or less.\r
+    \r
     int i, i_max;\r
     \r
-    i_max = CHNLIB_UIPArray_GetNumberOfDatas(WorkingSet.RootWordList);\r
+    i_max = CHNLIB_UIPArray_GetNumberOfDatas(*candidatewordlist);\r
     for(i = 0; i < i_max; i++){\r
-        if(CHNLIB_String_CompareString_Strict(CHNLIB_UIPArray_GetPointerByIndex(WorkingSet.RootWordList, i), tag)){\r
-            CHNLIB_ReportError("Word[%s] has already existed.", CHNLIB_DEBUG_ARGUMENTS, CHNLIB_String_GetReferencePointerOfCString(tag));\r
-            return;\r
+        if(CHNLIB_UTF8_GetStringLengthByCharacter(CHNLIB_String_GetReferencePointerOfCString(CHNLIB_UIPArray_GetPointerByIndex(*candidatewordlist, i))) <= length){\r
+            CHNLIB_UIPArray_RemoveByIndex(candidatewordlist, i);\r
+            i--;\r
+            i_max--;\r
         }\r
     }\r
     \r
-    CHNLIB_UIPArray_AppendLast(&WorkingSet.RootWordList, i_max, tag);\r
-    \r
-    CHNLIB_UIPArray_AppendLast_ProtectFromDuplication(&WorkingSet.RootWordList, CHNLIB_UIPArray_GetNumberOfDatas(WorkingSet.RootWordList), tag, &AI_Memory_AddRootWordData_IsDuplicated);\r
+    return 0;\r
+}\r
+\r
+int AI_Memory_AddRootWordData(CHNLIB_String *tag)\r
+{\r
+    //Adds a string to WorkingSet.RootWordList.\r
+    //If a duplicate exists, a warning is issued and the string is not added (the warning is not visible in this function - confirm in CHNLIB_UIPArray_AppendLast_ProtectFromDuplication).\r
+    //Returns the result of CHNLIB_UIPArray_AppendLast_ProtectFromDuplication, described by the author as the index of the tag that was added or of the duplicate already present.\r
     \r
-    return;\r
+    return CHNLIB_UIPArray_AppendLast_ProtectFromDuplication(&WorkingSet.RootWordList, 0, tag, &AI_Memory_AddRootWordData_IsDuplicated);\r
 }\r
 \r
 int AI_Memory_AddRootWordData_IsDuplicated(const void *listtag, const void *newtag)\r