fix remove error.

[rec10/rec10-git.git] / rec10 / trunk / src / n_gram.py
diff --git a/rec10/trunk/src/n_gram.py b/rec10/trunk/src/n_gram.py

index 6564813..970f44c 100644 (file)
--- a/rec10/trunk/src/n_gram.py
+++ b/rec10/trunk/src/n_gram.py
@@ -1,7 +1,7 @@
  #!/usr/bin/python
  # coding: UTF-8
  # Rec10 TS Recording Tools
-# Copyright (C) 2009 Yukikaze
+# Copyright (C) 2009-2010 Yukikaze
  
  import zenhan
  def bigram(str1, str2):
@@ -30,13 +30,14 @@ def bigram(str1, str2):
          point = point + i
      return point
  def trigram(str1, str2):
+    """
+    trigramによる単語の近さを計算します。
+    単語文字列の長さも考慮にいれます。
+    """
      str1 = zenhan.toHankaku(str1)
      str2 = zenhan.toHankaku(str2)
      str1 = str1.replace(" ", "")
      str2 = str2.replace(" ", "")
-    """
-    trigramによる単語の近さを計算します。
-    """
      gram = []
      if len(str1) < 3:
          gram.append(str1)
@@ -71,4 +72,49 @@ def find_gram(gram, s):
          i = i + 1
          st = st[st.find(gram) + 1:]
      return i
+def get_noun_quad_gram(s):
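+    """Return {gram: count} for 4-character windows of s whose characters share one character type.
+    Type is zenhan.check_Character_Type (> 0); strings of 4 or fewer chars give {}; windows with a space, or skipped via skipnum after a type change, are not counted.
+    """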
+    str1 = zenhan.toHankaku(s)
+    gram={}
+    if len(str1) > 4:
+        tmp1 = str1[0]
+        tmp2 = str1[1]
+        tmp3 = str1[2]
+        skipnum=0
+        for x in str1[3:]:
+            if skipnum>0:
+                skipnum=skipnum-1
+            else:
+                tmps=tmp1 + tmp2 + tmp3 + x
+                if tmps.find(" ")<0:
+                    if zenhan.check_Character_Type(tmp1)>0:
+                        if (zenhan.check_Character_Type(tmp1)==zenhan.check_Character_Type(tmp2)):
+                            if (zenhan.check_Character_Type(tmp2)==zenhan.check_Character_Type(tmp3)):
+                                if (zenhan.check_Character_Type(tmp3)==zenhan.check_Character_Type(x)):
+                                    gram[tmps]=gram.get(tmps,0)+1
+                                    skipnum=0
+                                else:
+                                    skipnum=3
+                            else:
+                                if (zenhan.check_Character_Type(tmp3)==zenhan.check_Character_Type(x)):
+                                    skipnum=2
+                                else:
+                                    skipnum=3
+                        else:
+                            if (zenhan.check_Character_Type(tmp2)==zenhan.check_Character_Type(tmp3)):
+                                if (zenhan.check_Character_Type(tmp3)==zenhan.check_Character_Type(x)):
+                                    skipnum=0
+                                else:
+                                    skipnum=3
+                            else:
+                                if (zenhan.check_Character_Type(tmp3)==zenhan.check_Character_Type(x)):
+                                    skipnum=2
+                                else:
+                                    skipnum=3
+            tmp1 = tmp2
+            tmp2 = tmp3
+            tmp3 = x
+    return gram
  ""
 \ No newline at end of file