+++ /dev/null
-#!/usr/bin/python
-# coding: UTF-8
-# Rec10 TS Recording Tools
-# Copyright (C) 2009-2011 Yukikaze
-
-import zenhan
def bigram(str1, str2):
    """Score how close str2 is to str1 using character bigrams.

    Both arguments are normalized with ``zenhan.toHankaku`` (presumably a
    full-width to half-width conversion -- confirm against zenhan) and have
    ASCII spaces removed.  ``str1`` is then split into overlapping
    two-character grams; when it is shorter than two characters the whole
    string is used as the only gram.

    Every gram that occurs ``n > 0`` times in ``str2`` (as counted by
    ``findGram``) contributes ``90 + 10 * n`` to the score.

    Returns:
        The accumulated score, or 0 when ``str1`` is empty after
        normalization.
    """
    str1 = zenhan.toHankaku(str1).replace(" ", "")
    str2 = zenhan.toHankaku(str2).replace(" ", "")

    # An empty str1 would yield the empty gram, which matches "everywhere"
    # and makes the occurrence count meaningless; there is nothing to score.
    if not str1:
        return 0

    if len(str1) < 2:
        grams = [str1]
    else:
        # Pair each character with its successor: "abc" -> ["ab", "bc"].
        grams = [first + second for first, second in zip(str1, str1[1:])]

    point = 0
    for gram in grams:
        hits = findGram(gram, str2)
        if hits > 0:
            point += 90 + 10 * hits
    return point
def trigram(str1, str2):
    """Score how close str2 is to str1 using character trigrams.

    The lengths of both strings are taken into account: the raw score is
    divided by ``len(str1)`` and by ``len(str2)`` (each time scaled by 10),
    so longer strings need more matches to reach the same score.

    Both arguments are normalized with ``zenhan.toHankaku`` (presumably a
    full-width to half-width conversion -- confirm against zenhan) and have
    ASCII spaces removed.  ``str1`` is split into overlapping
    three-character grams; when it is shorter than three characters the
    whole string is used as the only gram.

    Every gram that occurs ``n > 0`` times in ``str2`` (as counted by
    ``findGram``) contributes ``90 + 10 * n``; in addition
    ``20 * matched ** 2`` is added, where ``matched`` is the number of grams
    that occurred at least once.

    Returns:
        The length-normalized score, or 0 when either string is empty after
        normalization.
    """
    str1 = zenhan.toHankaku(str1).replace(" ", "")
    str2 = zenhan.toHankaku(str2).replace(" ", "")

    # Either string being empty means no gram can match, and the length
    # normalization below would divide by zero.
    if not str1 or not str2:
        return 0

    if len(str1) < 3:
        grams = [str1]
    else:
        # Sliding window of width three: "abcd" -> ["abc", "bcd"].
        grams = [a + b + c for a, b, c in zip(str1, str1[1:], str1[2:])]

    point = 0
    matched = 0
    for gram in grams:
        hits = findGram(gram, str2)
        if hits > 0:
            point += 90 + 10 * hits
            matched += 1

    # Reward inputs where many distinct positions matched.
    point = point + 20 * matched * matched
    # NOTE(review): with int operands "/" truncates on Python 2 but is true
    # division on Python 3; the operator is left as written so scores do not
    # change on whichever interpreter this runs under.
    point = point / len(str1) * 10
    point = point / len(str2) * 10
    return point
def findGram(gram, s):
    """Count how many times ``gram`` occurs in ``s``.

    Occurrences may overlap: the search resumes one character after the
    start of the previous hit, so ``findGram("aa", "aaaa")`` is 3.

    Args:
        gram: The substring to look for.
        s: The string to search in.

    Returns:
        The number of (possibly overlapping) occurrences.  An empty
        ``gram`` yields 0.
    """
    # str.find("") always succeeds (even on an empty string), so an empty
    # gram would never reach the -1 terminator below.
    if not gram:
        return 0

    count = 0
    pos = s.find(gram)
    while pos != -1:
        count += 1
        # Restart just past the start of this hit instead of re-slicing s.
        pos = s.find(gram, pos + 1)
    return count
def getNounQuadGram(s):
    """Count four-character grams of ``s`` made of a single character type.

    ``s`` is normalized with ``zenhan.toHankaku`` and scanned with a sliding
    window of four characters.  A window is recorded when it contains no
    ASCII space, ``zenhan.checkCharacterType`` of its first character is
    greater than 0, and all four characters share the same character type.
    (Presumably this selects runs such as all-kanji or all-katakana words;
    confirm against zenhan.checkCharacterType.)

    When an examined window shows a type change among its last three
    characters, the following windows are skipped without being examined:
    3 windows if the last two characters differ in type, otherwise 2 windows
    if the second and third differ.  Windows containing a space, or whose
    first character has a type <= 0, never trigger a skip.

    Args:
        s: The text to analyze.

    Returns:
        A dict mapping each recorded four-character gram to the number of
        times it was recorded.  Empty when ``s`` has fewer than four
        characters after normalization.
    """
    str1 = zenhan.toHankaku(s)
    gram = {}
    # Four characters are enough for exactly one window.
    if len(str1) < 4:
        return gram

    tmp1, tmp2, tmp3 = str1[0], str1[1], str1[2]
    skipnum = 0
    for x in str1[3:]:
        if skipnum > 0:
            skipnum -= 1
        else:
            tmps = tmp1 + tmp2 + tmp3 + x
            if " " not in tmps:
                type1 = zenhan.checkCharacterType(tmp1)
                if type1 > 0:
                    type2 = zenhan.checkCharacterType(tmp2)
                    type3 = zenhan.checkCharacterType(tmp3)
                    type4 = zenhan.checkCharacterType(x)
                    if type3 != type4:
                        # Type boundary between the last two characters.
                        skipnum = 3
                    elif type2 != type3:
                        # Type boundary between the middle two characters.
                        skipnum = 2
                    elif type1 == type2:
                        # All four characters share one type: count it.
                        gram[tmps] = gram.get(tmps, 0) + 1
        # The window always advances by one character, skipped or not.
        tmp1, tmp2, tmp3 = tmp2, tmp3, x
    return gram
-""
\ No newline at end of file