#!/usr/bin/python
# coding: UTF-8
# Rec10 TS Recording Tools
# Copyright (C) 2009-2011 Yukikaze
#
# NOTE(review): this text was reconstructed from the "+" side of a
# whitespace-collapsed blobdiff of rec10/trunk/src/n_gram.py
# (index e653d03..f24a097, http://git.osdn.net/view?p=rec10/rec10-git.git);
# the indentation was inferred from the syntax.

import zenhan


def _normalize(text):
    """Pass *text* through zenhan.toHankaku and remove every " " character."""
    return zenhan.toHankaku(text).replace(" ", "")


def bigram(str1, str2):
    """Score how close *str1* is to *str2* using character bigrams.

    Both strings are normalized with zenhan.toHankaku and stripped of
    spaces.  Every bigram of *str1* (or *str1* itself when it is a single
    character) that occurs ``n > 0`` times in *str2* adds ``90 + 10 * n``
    to the score.

    Returns:
        The accumulated score; 0 when *str1* is empty after normalization.
    """
    str1 = _normalize(str1)
    str2 = _normalize(str2)
    if not str1:
        # An empty pattern has no n-grams to look up, so nothing can score.
        return 0
    if len(str1) < 2:
        grams = [str1]
    else:
        grams = [left + right for left, right in zip(str1, str1[1:])]
    point = 0
    for piece in grams:
        hits = findGram(piece, str2)
        if hits > 0:
            point += 90 + 10 * hits
    return point


def trigram(str1, str2):
    """Score how close *str1* is to *str2* using character trigrams.

    Both strings are normalized with zenhan.toHankaku and stripped of
    spaces.  Every trigram of *str1* (or *str1* itself when it is shorter
    than three characters) that occurs ``n > 0`` times in *str2* adds
    ``90 + 10 * n``.  A bonus of ``20 * matched ** 2`` is then added, where
    ``matched`` is the number of trigrams that hit, and the total is scaled
    by the lengths of both normalized strings.

    Returns:
        The length-scaled score; 0 when either string is empty after
        normalization (there is nothing to match, and the length scaling
        would otherwise divide by zero).
    """
    str1 = _normalize(str1)
    str2 = _normalize(str2)
    if not str1 or not str2:
        return 0
    if len(str1) < 3:
        grams = [str1]
    else:
        grams = [str1[i:i + 3] for i in range(len(str1) - 2)]
    point = 0
    matched = 0
    for piece in grams:
        hits = findGram(piece, str2)
        if hits > 0:
            point += 90 + 10 * hits
            matched += 1
    point = point + 20 * matched * matched
    # NOTE(review): with int operands "/" floors on Python 2 but yields a
    # float on Python 3 -- confirm which result type callers expect.
    point = point / len(str1) * 10
    point = point / len(str2) * 10
    return point
def findGram(gram, s):
    """Count the occurrences of *gram* in *s*, overlapping ones included.

    Args:
        gram: The substring to look for.
        s: The string to search in.

    Returns:
        The number of start positions in *s* at which *gram* occurs.
        An empty *gram* counts as 0 occurrences.
    """
    if not gram:
        # "".find("") is always 0, so searching for an empty pattern would
        # never reach the -1 that ends the loop below.
        return 0
    count = 0
    pos = s.find(gram)
    while pos != -1:
        count += 1
        # Resume one character after the last hit so overlaps are counted.
        pos = s.find(gram, pos + 1)
    return count


def getNounQuadGram(s):
    """Count 4-character windows of *s* whose characters share one type.

    *s* is first passed through zenhan.toHankaku.  A window is counted when
    it contains no " " character, zenhan.checkCharacterType of its first
    character is greater than 0, and all four characters report the same
    checkCharacterType value.

    When an examined window is not uniform, a number of the following
    windows are skipped without being examined: 3 if its last two characters
    differ in type, otherwise 2 if its middle two characters differ, and
    none if only the first character differs.

    Args:
        s: The text to scan.

    Returns:
        A dict mapping each counted 4-character window to the number of
        times it was counted.  Empty when the converted text has 4 or fewer
        characters.
    """
    str1 = zenhan.toHankaku(s)
    gram = {}
    if len(str1) <= 4:
        # NOTE(review): a text of exactly 4 characters holds one window but
        # is rejected here as well -- confirm that this is intended.
        return gram
    # NOTE(review): assumes checkCharacterType is a pure function, so each
    # character only needs to be classified once per window.
    char_type = zenhan.checkCharacterType
    skipnum = 0
    for start in range(len(str1) - 3):
        if skipnum > 0:
            skipnum -= 1
            continue
        tmps = str1[start:start + 4]
        if " " in tmps:
            continue
        type1 = char_type(tmps[0])
        if not type1 > 0:
            continue
        type2 = char_type(tmps[1])
        type3 = char_type(tmps[2])
        type4 = char_type(tmps[3])
        if type3 != type4:
            skipnum = 3
        elif type2 != type3:
            skipnum = 2
        elif type1 == type2:
            gram[tmps] = gram.get(tmps, 0) + 1
        # Otherwise only the first character differs: nothing is counted and
        # the very next window is examined.
    return gram