#!/usr/bin/python
# coding: UTF-8
# Rec10 TS Recording Tools
-# Copyright (C) 2009 Yukikaze
+# Copyright (C) 2009-2011 Yukikaze
import zenhan
def bigram(str1, str2):
tmp = x
point = 0
for x in gram:
- i = find_gram(x, str2)
+ i = findGram(x, str2)
if i > 0:
i = 90 + 10 * i
else:
point = point + i
return point
def trigram(str1, str2):
+ """
+ Compute how close the words are, using trigrams.
+ The lengths of the word strings are also taken into account.
+ """
str1 = zenhan.toHankaku(str1)
str2 = zenhan.toHankaku(str2)
str1 = str1.replace(" ", "")
str2 = str2.replace(" ", "")
- """
- trigramによる単語の近さを計算します。
- """
gram = []
if len(str1) < 3:
gram.append(str1)
point = 0
count = 0
for x in gram:
- i = find_gram(x, str2)
+ i = findGram(x, str2)
if i > 0:
i = 90 + 10 * i
count=count + 1
point = point / len(str1) * 10
point = point / len(str2) * 10
return point
-def find_gram(gram, s):
+def findGram(gram, s):
"""
Count how many times gram appears in s.
"""
i = i + 1
# Drop everything up to and including the first character of the first
# match of gram; st is left unchanged when gram is absent (find() gives -1).
st = st[st.find(gram) + 1:]
return i
-def get_noun_quad_gram(s):
- str1 = zenhan.toHankaku(s)
+def getNounQuadGram(s):
"""
- trigramによる単語の近さを計算します。
+ Count 4-character windows of s whose characters share one character type.
+
+ s is first converted with zenhan.toHankaku(). A window is counted only
+ when it contains no space, zenhan.checkCharacterType() of its first
+ character is positive, and all four characters give the same
+ checkCharacterType() value. When the type comparison finds a mismatch,
+ skipnum is set to 0, 2 or 3 and that many following characters are
+ consumed without examining a window.
+
+ Returns a dict mapping each counted 4-character string to its count;
+ the dict is empty when the converted string has 4 or fewer characters.
"""
- gram = {}
+ str1 = zenhan.toHankaku(s)
+ gram={}
if len(str1) > 4:
tmp1 = str1[0]
tmp2 = str1[1]
tmp3 = str1[2]
+ skipnum=0
for x in str1[3:]:
- tmps=tmp1 + tmp2 + tmp3 + x
- if tmps.find(" ")==-1:
- if (zenhan.check_Character_Type(tmp1)==zenhan.check_Character_Type(tmp2))and(zenhan.check_Character_Type(tmp2)==zenhan.check_Character_Type(tmp3))and(zenhan.check_Character_Type(tmp3)==zenhan.check_Character_Type(x)):
- if gram.has_key(tmps):
- gram[tmps]=gram[tmps]+1
- else:
- gram[tmps]=1
- tmp1 = tmp2
- tmp2 = tmp3
- tmp3 = x
+ # While skipnum is positive, use up one character without building a window.
+ if skipnum>0:
+ skipnum=skipnum-1
+ else:
+ tmps=tmp1 + tmp2 + tmp3 + x
+ if tmps.find(" ")<0:
+ if zenhan.checkCharacterType(tmp1)>0:
+ if (zenhan.checkCharacterType(tmp1)==zenhan.checkCharacterType(tmp2)):
+ if (zenhan.checkCharacterType(tmp2)==zenhan.checkCharacterType(tmp3)):
+ if (zenhan.checkCharacterType(tmp3)==zenhan.checkCharacterType(x)):
+ gram[tmps]=gram.get(tmps,0)+1
+ skipnum=0
+ else:
+ skipnum=3
+ else:
+ if (zenhan.checkCharacterType(tmp3)==zenhan.checkCharacterType(x)):
+ skipnum=2
+ else:
+ skipnum=3
+ else:
+ if (zenhan.checkCharacterType(tmp2)==zenhan.checkCharacterType(tmp3)):
+ if (zenhan.checkCharacterType(tmp3)==zenhan.checkCharacterType(x)):
+ skipnum=0
+ else:
+ skipnum=3
+ else:
+ if (zenhan.checkCharacterType(tmp3)==zenhan.checkCharacterType(x)):
+ skipnum=2
+ else:
+ skipnum=3
+ tmp1 = tmp2
+ tmp2 = tmp3
+ tmp3 = x
return gram
""
\ No newline at end of file