1 // MeCab -- Yet Another Part-of-Speech and Morphological Analyzer
3 // Copyright(C) 2001-2006 Taku Kudo <taku@chasen.org>
4 // Copyright(C) 2004-2006 Nippon Telegraph and Telephone Corporation
6 using System.Collections.Generic;
8 using System.Text.RegularExpressions;
12 public class Viterbi : IDisposable
16 private class ThreadData
18 public MeCabNode EosNode;
19 public MeCabNode BosNode;
20 public MeCabNode[] EndNodeList;
21 public MeCabNode[] BeginNodeList;
27 #region Field/Property
29 private readonly Tokenizer tokenizer = new Tokenizer();
30 private readonly Connector connector = new Connector();
32 private MeCabLatticeLevel level;
34 private int costFactor;
38 get { return this.theta * this.costFactor; }
39 set { this.theta = value / this.costFactor; }
42 public unsafe MeCabLatticeLevel LatticeLevel
51 this.connect = this.ConnectNomal;
52 this.analyze = this.DoViterbi;
53 if (value >= MeCabLatticeLevel.One)
54 this.connect = this.ConnectWithAllPath;
55 if (value >= MeCabLatticeLevel.Two)
56 this.analyze = this.ForwardBackward;
60 public bool Partial { get; set; }
66 return this.buildLattice == this.BuildAllLattice;
71 this.buildLattice = this.BuildAllLattice;
73 this.buildLattice = this.BuildBestLattice;
81 public void Open(MeCabParam param)
83 tokenizer.Open(param);
84 connector.Open(param);
86 this.costFactor = param.CostFactor;
87 this.Theta = param.Theta;
88 this.LatticeLevel = param.LatticeLevel;
89 this.Partial = param.Partial;
90 this.AllMorphs = param.AllMorphs;
96 this.tokenizer.Clear();
104 public unsafe MeCabNode Analyze(char* str, int len)
110 ThreadData work = new ThreadData()
112 EndNodeList = new MeCabNode[len + 4],
113 BeginNodeList = new MeCabNode[len + 4]
118 string newStr = this.InitConstraints(str, len, work);
119 fixed (char* pNewStr = newStr)
121 this.analyze(pNewStr, newStr.Length, work);
122 return this.buildLattice(work);
126 this.analyze(str, len, work);
127 return this.buildLattice(work);
134 private unsafe delegate void AnalyzeAction(char* str, int len, ThreadData work);
136 private AnalyzeAction analyze;
138 private unsafe void ForwardBackward(char* sentence, int len, ThreadData work)
140 this.DoViterbi(sentence, len, work);
142 work.EndNodeList[0].Alpha = 0f;
143 for (int pos = 0; pos <= len; pos++)
144 for (MeCabNode node = work.BeginNodeList[pos]; node != null; node = node.BNext)
145 this.CalcAlpha(node, this.theta);
147 work.BeginNodeList[len].Beta = 0f;
148 for (int pos = len; pos >= 0; pos--)
149 for (MeCabNode node = work.EndNodeList[pos]; node != null; node = node.ENext)
150 this.CalcBeta(node, this.theta);
152 work.Z = work.BeginNodeList[len].Alpha; // alpha of EOS
154 for (int pos = 0; pos <= len; pos++)
155 for (MeCabNode node = work.BeginNodeList[pos]; node != null; node = node.BNext)
156 node.Prob = (float)Math.Exp(node.Alpha + node.Beta - work.Z);
160 private void CalcAlpha(MeCabNode n, double beta)
163 for (MeCabPath path = n.LPath; path != null; path = path.LNext)
165 n.Alpha = (float)Utils.LogSumExp(n.Alpha,
166 -beta * path.Cost + path.LNode.Alpha,
171 private void CalcBeta(MeCabNode n, double beta)
174 for (MeCabPath path = n.RPath; path != null; path = path.RNext)
176 n.Beta = (float)Utils.LogSumExp(n.Beta,
177 -beta * path.Cost + path.RNode.Beta,
182 private unsafe void DoViterbi(char* sentence, int len, ThreadData work)
184 work.BosNode = this.tokenizer.GetBosNode();
185 work.BosNode.Length = len;
187 char* begin = sentence;
188 char* end = begin + len;
189 work.BosNode.Surface = new string(begin, 0, len);
190 work.EndNodeList[0] = work.BosNode;
192 for (int pos = 0; pos < len; pos++)
194 if (work.EndNodeList[pos] != null)
196 MeCabNode rNode = tokenizer.Lookup(begin + pos, end);
197 rNode = this.FilterNode(rNode, pos, work);
199 rNode.EPos = pos + rNode.RLength;
200 work.BeginNodeList[pos] = rNode;
201 this.connect(pos, rNode, work);
205 work.EosNode = tokenizer.GetEosNode();
206 work.EosNode.Surface = end->ToString();
207 work.BeginNodeList[len] = work.EosNode;
208 for (int pos = len; pos >= 0; pos--)
210 if (work.EndNodeList[pos] != null)
212 this.connect(pos, work.EosNode, work);
222 private delegate void ConnectAction(int pos, MeCabNode rNode, ThreadData work);
224 private ConnectAction connect;
226 private void ConnectWithAllPath(int pos, MeCabNode rNode, ThreadData work)
228 for (; rNode != null; rNode = rNode.BNext)
230 long bestCost = int.MaxValue; // 2147483647
232 MeCabNode bestNode = null;
234 for (MeCabNode lNode = work.EndNodeList[pos]; lNode != null; lNode = lNode.ENext)
236 int lCost = this.connector.Cost(lNode, rNode); // local cost
237 long cost = lNode.Cost + lCost;
245 MeCabPath path = new MeCabPath()
257 if (bestNode == null) throw new ArgumentException("too long sentence.");
259 rNode.Prev = bestNode;
261 rNode.Cost = bestCost;
262 int x = rNode.RLength + pos;
263 rNode.ENext = work.EndNodeList[x];
264 work.EndNodeList[x] = rNode;
268 private void ConnectNomal(int pos, MeCabNode rNode, ThreadData work)
270 for (; rNode != null; rNode = rNode.BNext)
272 long bestCost = int.MaxValue; // 2147483647
274 MeCabNode bestNode = null;
276 for (MeCabNode lNode = work.EndNodeList[pos]; lNode != null; lNode = lNode.ENext)
278 long cost = lNode.Cost + this.connector.Cost(lNode, rNode);
287 if (bestNode == null) throw new MeCabException("too long sentence.");
289 rNode.Prev = bestNode;
291 rNode.Cost = bestCost;
292 int x = rNode.RLength + pos;
293 rNode.ENext = work.EndNodeList[x];
294 work.EndNodeList[x] = rNode;
302 private delegate MeCabNode BuildLatticeFunc(ThreadData work);
304 private BuildLatticeFunc buildLattice;
306 private MeCabNode BuildAllLattice(ThreadData work)
308 if (this.BuildBestLattice(work) == null) return null;
310 MeCabNode prev = work.BosNode;
312 for (int pos = 0; pos < work.BeginNodeList.Length; pos++)
314 for (MeCabNode node = work.BeginNodeList[pos]; node != null; node = node.BNext)
319 for (MeCabPath path = node.LPath; path != null; path = path.LNext)
321 path.Prob = (float)(path.LNode.Alpha
322 - this.theta * path.Cost
323 + path.RNode.Beta - work.Z);
331 private MeCabNode BuildBestLattice(ThreadData work)
333 MeCabNode node = work.EosNode;
334 for (MeCabNode prevNode; node.Prev != null; )
337 prevNode = node.Prev;
338 prevNode.Next = node;
348 private unsafe string InitConstraints(char* sentence, int sentenceLen, ThreadData work)
350 string str = new string(sentence, 0, sentenceLen);
351 StringBuilder os = new StringBuilder();
355 foreach (string line in str.Split('\r', '\n'))
357 if (line == "") continue;
358 if (line == "EOS") break;
360 string[] column = line.Split('\t');
361 os.Append(column[0]).Append(' ');
362 int len = column[0].Length;
364 if (column.Length == 2)
366 if (column[1] == "\0") throw new ArgumentException("use \\t as separator");
367 MeCabNode c = this.tokenizer.GetNewNode();
368 c.Surface = column[0];
369 c.Feature = column[1];
374 work.BeginNodeList[pos] = c;
380 return os.ToString();
383 private MeCabNode FilterNode(MeCabNode node, int pos, ThreadData work)
385 if (!this.Partial) return node;
387 MeCabNode c = work.BeginNodeList[pos];
388 if (c == null) return node;
389 bool wild = (c.Feature == "*");
391 MeCabNode prev = null;
392 MeCabNode result = null;
394 for (MeCabNode n = node; n != null; n = n.BNext)
396 if (c.Surface == n.Surface
397 && (wild || this.PartialMatch(c.Feature, n.Feature)))
411 if (result == null) result = c;
412 if (prev != null) prev.BNext = null;
417 private bool PartialMatch(string f1, string f2)
419 string[] c1 = f1.Split(',');
420 string[] c2 = f2.Split(',');
422 int n = Math.Min(c1.Length, c2.Length);
424 for (int i = 0; i < n; i++)
425 if (c1[i] != "*" && c2[i] != "*" && c1[i] != c2[i]) return false;
434 private bool disposed;
439 public void Dispose()
442 GC.SuppressFinalize(this);
445 protected virtual void Dispose(bool disposing)
447 if (disposed) return;
451 this.tokenizer.Dispose(); //Nullチェック不要
452 this.connector.Dispose(); //Nullチェック不要
455 this.disposed = true;