OSDN Git Service

sys.setdefaultencoding('UTF-8') required version.
[rec10/rec10-git.git] / rec10 / branches / 0.9.0 / src / n_gram.py
1 #!/usr/bin/python
2 # coding: UTF-8
3 # Rec10 TS Recording Tools
4 # Copyright (C) 2009 Yukikaze
5
6 import zenhan
def bigram(str1, str2):
    """
    Score how close str1 is to str2 using character bigrams.

    Both arguments are passed through zenhan.toHankaku (presumably a
    full-width to half-width normalisation -- confirm in zenhan) and have
    ASCII spaces removed before comparison.

    Every two-character window of str1 is looked up in str2 with find_gram.
    A window that occurs n > 0 times contributes 90 + 10 * n to the score;
    a window that does not occur contributes nothing.  When str1 is a single
    character, that character itself is used as the only gram.

    Returns the accumulated integer score, or 0 when str1 is empty after
    normalisation.
    """
    str1 = zenhan.toHankaku(str1).replace(" ", "")
    str2 = zenhan.toHankaku(str2).replace(" ", "")
    # An empty gram can never be a meaningful match, and searching for ""
    # matches at every position; bail out before calling find_gram with it.
    if not str1:
        return 0
    if len(str1) < 2:
        grams = [str1]
    else:
        grams = [str1[i:i + 2] for i in range(len(str1) - 1)]
    point = 0
    for piece in grams:
        hits = find_gram(piece, str2)
        if hits > 0:
            point = point + 90 + 10 * hits
    return point
def trigram(str1, str2):
    """
    Score how close str1 is to str2 using character trigrams.

    Both arguments are passed through zenhan.toHankaku (presumably a
    full-width to half-width normalisation -- confirm in zenhan) and have
    ASCII spaces removed before comparison.

    Every three-character window of str1 is looked up in str2 with
    find_gram.  A window that occurs n > 0 times contributes 90 + 10 * n.
    A bonus of 20 * (number of matching windows) ** 2 is then added, and the
    total is scaled by 10 / len(str1) and again by 10 / len(str2).  When
    str1 is shorter than three characters, str1 itself is the only gram.

    Returns the scaled score, or 0 when either string is empty after
    normalisation (which would otherwise divide by zero).
    """
    str1 = zenhan.toHankaku(str1).replace(" ", "")
    str2 = zenhan.toHankaku(str2).replace(" ", "")
    # Nothing can match an empty string, and the length scaling below would
    # divide by zero, so treat empty input as "no similarity".
    if not str1 or not str2:
        return 0
    if len(str1) < 3:
        grams = [str1]
    else:
        grams = [str1[i:i + 3] for i in range(len(str1) - 2)]
    point = 0
    count = 0
    for piece in grams:
        hits = find_gram(piece, str2)
        if hits > 0:
            point = point + 90 + 10 * hits
            count = count + 1
    point = point + 20 * count * count
    # NOTE(review): "/" is kept exactly as written; on Python 2 these are
    # truncating integer divisions, on Python 3 they would yield floats.
    point = point / len(str1) * 10
    point = point / len(str2) * 10
    return point
def find_gram(gram, s):
    """
    Count how many times gram occurs in s.

    Occurrences may overlap: after each hit the search resumes one character
    past the start of that hit, so find_gram("aa", "aaaa") is 3.

    An empty gram returns 0.  (str.find("") succeeds at every position,
    including in an empty string, so without this guard the scan would never
    terminate.)
    """
    if not gram:
        return 0
    count = 0
    # Search with a moving start offset instead of re-slicing s on every hit.
    pos = s.find(gram)
    while pos != -1:
        count = count + 1
        pos = s.find(gram, pos + 1)
    return count
def get_noun_quad_gram(s):
    """
    Count four-character windows of s whose characters share one type.

    s is first passed through zenhan.toHankaku.  Strings of four characters
    or fewer produce an empty result.  For longer strings a four-character
    window slides over the text; a window is counted when it contains no
    ASCII space, zenhan.check_Character_Type of its first character is
    greater than 0, and all four characters have the same character type.

    When a type change is found inside an examined window, the following
    windows are skipped: three windows if the last two characters differ in
    type, two if only the middle pair differs.

    Returns a dict mapping each counted four-character string to the number
    of times it was counted.
    """
    text = zenhan.toHankaku(s)
    gram = {}
    if len(text) <= 4:
        return gram
    kind_of = zenhan.check_Character_Type
    pending_skips = 0
    for end in range(3, len(text)):
        if pending_skips > 0:
            pending_skips -= 1
            continue
        window = text[end - 3:end + 1]
        if " " in window:
            continue
        first_kind = kind_of(window[0])
        if not first_kind > 0:
            continue
        second_kind = kind_of(window[1])
        third_kind = kind_of(window[2])
        last_kind = kind_of(window[3])
        if third_kind != last_kind:
            # Type boundary between the last two characters.
            pending_skips = 3
        elif second_kind != third_kind:
            # Type boundary in the middle of the window.
            pending_skips = 2
        elif first_kind == second_kind:
            # All four characters share one type: record the window.
            gram[window] = gram.get(window, 0) + 1
    return gram
119 ""