--- /dev/null
+#!/usr/bin/python
+# coding: UTF-8
+# Rec10 TS Recording Tools
+# Copyright (C) 2009-2011 Yukikaze
+
+import zenhan
def bigram(str1, str2):
    """
    Score how close two words are using character bigrams.

    Both arguments are passed through zenhan.toHankaku (presumably a
    full-width to half-width normalization -- confirm against zenhan) and
    have their ASCII spaces removed before comparison.

    str1 is cut into overlapping two-character grams; a one-character
    str1 is used as a single gram. Every gram found in str2 adds
    90 + 10 * n to the score, where n is the count returned by findGram.
    Grams that do not occur in str2 add nothing.

    Returns 0 when str1 is empty after normalization, because an empty
    string provides no gram to look for.
    """
    str1 = zenhan.toHankaku(str1).replace(" ", "")
    str2 = zenhan.toHankaku(str2).replace(" ", "")

    # Guard: searching for an empty gram is meaningless, so score it as
    # "no similarity" instead of handing "" to findGram.
    if not str1:
        return 0

    if len(str1) < 2:
        grams = [str1]
    else:
        grams = [str1[i:i + 2] for i in range(len(str1) - 1)]

    point = 0
    for gram in grams:
        hits = findGram(gram, str2)
        if hits > 0:
            point += 90 + 10 * hits
    return point
def trigram(str1, str2):
    """
    Score how close two words are using character trigrams.

    The lengths of both strings are taken into account: the raw score is
    divided by len(str1) and by len(str2) (each time scaled by 10).

    Both arguments are passed through zenhan.toHankaku (presumably a
    full-width to half-width normalization -- confirm against zenhan) and
    have their ASCII spaces removed before comparison.

    str1 is cut into overlapping three-character grams; a str1 shorter
    than three characters is used as a single gram. Every gram found in
    str2 adds 90 + 10 * n to the score, where n is the count returned by
    findGram. A bonus of 20 * k * k is then added, k being the number of
    distinct gram positions of str1 that matched at least once.

    Returns 0 when either string is empty after normalization: there is
    nothing to match, and the length normalization would otherwise
    divide by zero.
    """
    str1 = zenhan.toHankaku(str1).replace(" ", "")
    str2 = zenhan.toHankaku(str2).replace(" ", "")

    # Guard: an empty str1 has no gram to search for and an empty str2
    # would make the division by len(str2) below fail.
    if not str1 or not str2:
        return 0

    if len(str1) < 3:
        grams = [str1]
    else:
        grams = [str1[i:i + 3] for i in range(len(str1) - 2)]

    point = 0
    count = 0
    for gram in grams:
        hits = findGram(gram, str2)
        if hits > 0:
            point += 90 + 10 * hits
            count += 1

    # Reward words in which many grams matched (quadratic bonus).
    point = point + 20 * count * count

    # Length normalization. NOTE: with integer operands "/" floors on
    # Python 2 and is true division on Python 3; the operator is left as
    # written so results do not change on the interpreter in use.
    point = point / len(str1) * 10
    point = point / len(str2) * 10
    return point
def findGram(gram, s):
    """
    Count how many times gram occurs in s.

    Occurrences may overlap: after each hit the search resumes one
    character past the start of that hit, so findGram("aa", "aaaa")
    is 3.

    Returns 0 when gram is empty. An empty string "matches" at every
    position, including in an empty remainder, so without this guard the
    search loop would never finish.
    """
    if not gram:
        return 0

    count = 0
    pos = s.find(gram)
    while pos != -1:
        count += 1
        # Resume just after the start of the previous hit so that
        # overlapping occurrences are counted too.
        pos = s.find(gram, pos + 1)
    return count
def getNounQuadGram(s):
    """
    Count four-character windows of s whose characters share one type.

    s is passed through zenhan.toHankaku and scanned with a sliding
    window of four consecutive characters. A window is counted when

      * it contains no ASCII space,
      * zenhan.checkCharacterType of its first character is > 0, and
      * all four characters have the same character type.

    After a window that passes the first two tests has been examined,
    some of the following windows are skipped without being looked at:

      * 3rd and 4th characters differ in type      -> skip next 3 windows
      * otherwise 2nd and 3rd characters differ    -> skip next 2 windows
      * otherwise                                  -> skip nothing

    Returns a dict mapping each counted window to the number of times it
    was counted. Inputs of four characters or fewer (after toHankaku)
    give an empty dict.

    NOTE(review): a string of exactly four characters contains one
    window but is rejected by the length test -- confirm this is wanted.
    NOTE(review): each skip count is one larger than the number of
    following windows that still straddle the type change, so the first
    uniform window right after a change is never counted -- confirm this
    is wanted. Both behaviours are kept as found.
    """
    text = zenhan.toHankaku(s)
    gram = {}
    if len(text) <= 4:
        return gram

    # Assumes checkCharacterType is a pure classifier, so calling it once
    # per character of the window is equivalent to the repeated calls it
    # replaces -- TODO confirm against zenhan.
    char_type = zenhan.checkCharacterType

    skip = 0
    for end in range(3, len(text)):
        if skip > 0:
            skip -= 1
            continue

        window = text[end - 3:end + 1]
        if " " in window:
            continue

        first = char_type(window[0])
        if not first > 0:
            continue
        second = char_type(window[1])
        third = char_type(window[2])
        fourth = char_type(window[3])

        if third != fourth:
            skip = 3
        elif second != third:
            skip = 2
        elif first == second:
            # All four types are equal: record the window.
            gram[window] = gram.get(window, 0) + 1
    return gram
+""
\ No newline at end of file