OSDN Git Service

classify : fix DB id undefined.
[rec10/rec10-git.git] / rec10 / n_gram.py
1 #!/usr/bin/python
2 # coding: UTF-8
3 # Rec10 TS Recording Tools
4 # Copyright (C) 2009-2011 Yukikaze
5
6 import zenhan
def bigram(str1, str2):
    """Score how close str1 is to str2 using character bigrams.

    Both strings are normalized with zenhan.toHankaku (presumably a
    full-width to half-width conversion -- confirm against zenhan) and have
    ASCII spaces removed. Every consecutive two-character slice of str1
    (or str1 itself when it is shorter than two characters) is looked up
    in str2 with findGram; a slice found n > 0 times adds 90 + 10 * n
    points.

    Returns the summed score, or 0 when str1 is empty after normalization.
    """
    str1 = zenhan.toHankaku(str1)
    str2 = zenhan.toHankaku(str2)
    str1 = str1.replace(" ", "")
    str2 = str2.replace(" ", "")
    if not str1:
        # Nothing to compare; also avoids searching for an empty gram.
        return 0
    if len(str1) < 2:
        gram = [str1]
    else:
        gram = [str1[k:k + 2] for k in range(len(str1) - 1)]
    point = 0
    for x in gram:
        hits = findGram(x, str2)
        if hits > 0:
            point += 90 + 10 * hits
    return point
def trigram(str1, str2):
    """Score how close str1 is to str2 using character trigrams.

    Both strings are normalized with zenhan.toHankaku (presumably a
    full-width to half-width conversion -- confirm against zenhan) and have
    ASCII spaces removed. Every consecutive three-character slice of str1
    (or str1 itself when it is shorter than three characters) is looked up
    in str2 with findGram; a slice found n > 0 times adds 90 + 10 * n
    points. A bonus of 20 * (number of matching slices) ** 2 is added, and
    the total is then scaled by 10 / len(str1) and 10 / len(str2) so that
    string length is taken into account.

    Returns the scaled score, or 0 when either string is empty after
    normalization.
    """
    str1 = zenhan.toHankaku(str1)
    str2 = zenhan.toHankaku(str2)
    str1 = str1.replace(" ", "")
    str2 = str2.replace(" ", "")
    if not str1 or not str2:
        # An empty string cannot match anything, and the length
        # normalization below would otherwise divide by zero.
        return 0
    if len(str1) < 3:
        gram = [str1]
    else:
        gram = [str1[k:k + 3] for k in range(len(str1) - 2)]
    point = 0
    count = 0
    for x in gram:
        hits = findGram(x, str2)
        if hits > 0:
            point += 90 + 10 * hits
            count += 1
    point = point + 20 * count * count
    # NOTE(review): "/" floors for ints on Python 2 and is true division on
    # Python 3, so the exact score depends on the interpreter version.
    point = point / len(str1) * 10
    point = point / len(str2) * 10
    return point
def findGram(gram, s):
    """Count how many times gram occurs in s, overlapping matches included.

    The search resumes one character after the start of each match, so
    findGram("aa", "aaaa") is 3.

    Returns 0 when gram is empty: an empty string matches at every
    position, including in an empty s, so counting it would never finish.
    """
    if not gram:
        return 0
    count = 0
    pos = s.find(gram)
    while pos != -1:
        count += 1
        # Advance by one (not by len(gram)) so overlapping matches count.
        pos = s.find(gram, pos + 1)
    return count
def getNounQuadGram(s):
    """Count four-character windows of s made of a single character type.

    s is first normalized with zenhan.toHankaku. Strings of four characters
    or fewer yield an empty dict. Otherwise a four-character window slides
    over the text one character at a time. A window is examined only when
    it contains no ASCII space and zenhan.checkCharacterType of its first
    character is greater than 0 (presumably non-positive types mark
    characters of no interest -- confirm against zenhan):

    * last two characters differ in type: the next three windows are
      skipped;
    * otherwise, second and third characters differ in type: the next two
      windows are skipped;
    * otherwise the last three characters share a type, and the window is
      counted if the first character shares it as well.

    Returns a dict mapping each counted window to its number of
    occurrences.
    """
    text = zenhan.toHankaku(s)
    counts = {}
    if len(text) <= 4:
        return counts
    first, second, third = text[0], text[1], text[2]
    skip = 0
    for ch in text[3:]:
        if skip > 0:
            # Still inside a run of windows that straddle a type boundary.
            skip -= 1
        else:
            window = first + second + third + ch
            if " " not in window:
                kind1 = zenhan.checkCharacterType(first)
                if kind1 > 0:
                    kind2 = zenhan.checkCharacterType(second)
                    kind3 = zenhan.checkCharacterType(third)
                    kind4 = zenhan.checkCharacterType(ch)
                    head_same = kind1 == kind2
                    mid_same = kind2 == kind3
                    tail_same = kind3 == kind4
                    if not tail_same:
                        skip = 3
                    elif not mid_same:
                        skip = 2
                    elif head_same:
                        counts[window] = counts.get(window, 0) + 1
        # Slide the window forward by one character.
        first, second, third = second, third, ch
    return counts
120 ""