# Matches a string that consists only of kanji numeral characters: the digits, the zero forms 〇/零, and the place markers 十/拾/百/千.\r
# NOTE(review): '二' appears twice in the character class; '弐' may have been intended after '壱' - confirm before changing.\r
RE_KANSUJI = re.compile('^[一二三四五六七八九〇零十拾百千壱二参]+$')\r
\r
# http://programminblog.blogspot.jp/2010/11/python.html\r
# Convert a string of kanji numerals into a string of Arabic digits.\r
#\r
# NOTE(review): this text is in unified-diff form ('-' = removed line, '+' = added line),\r
# its indentation has been lost, and part of the loop body is not shown here (the `elif`\r
# below has no visible `if`). Only what the visible lines establish is described.\r
#\r
# Parameters:\r
#   text     - the string to convert.\r
#   logwrite - optional callable; when given, it is called with trace strings.\r
#\r
# Return value of the '+' version is a (flag, value) tuple:\r
#   (0, None) - text does not match RE_KANSUJI.\r
#   (1, str)  - conversion succeeded; value is the result formatted with '%d'.\r
#   (2, None) - prevDigit was greater than digit during the scan; rewrite_number handles\r
#               this as an approximate number such as 「二十二三」.\r
# The '-' version returned None for a non-match and the digit string otherwise.\r
-def kansuji2arabic(text):\r
+def kansuji2arabic(text, logwrite=None):\r
if not RE_KANSUJI.match(text):\r
- return None\r
+ return (0, None) # not a kanji numeral\r
result = 0\r
+ prevDigit = 0\r
digit = 1\r
numgroup = 1\r
# kanindex starts one past the last character of text.\r
kanindex = len(text)\r
+ if logwrite: logwrite('kansuji2arabic: ' + text)\r
while kanindex > 0:\r
# c is the character just before kanindex; c1 is the character at kanindex\r
# (an empty string while kanindex == len(text)).\r
c = text[(kanindex - 1):kanindex]\r
c1 = text[kanindex:(kanindex + 1)]\r
# NOTE(review): the preceding branches of this if/elif chain are not visible in this excerpt.\r
elif c in '九':\r
result += 9 * digit * numgroup\r
digit *= 10\r
+ if logwrite: logwrite('kansuji2arabic c(%s) c1(%s) kanindex(%d) prevDigit(%d) digit(%d) result(%d) numgroup(%d)' % (c, c1, kanindex, prevDigit, digit, result, numgroup))\r
+ if prevDigit > digit:\r
+ return (2, None) # approximate number whose digits overlap\r
+ prevDigit = digit\r
# Presumably accounts for a leading 十/拾, 百 or 千 written without a digit in front of it\r
# (an implied 1) - TODO confirm against the loop body that is not shown.\r
if (digit == 10 and text[:1] in '十拾') or \\r
(digit == 100 and text[:1] in '百') or \\r
(digit == 1000 and text[:1] in '千'):\r
result += digit * numgroup\r
text = '%d' % result\r
- return text\r
+ return (1, text) # kanji numeral converted\r
\r
# Return a new list holding deep copies of the entries of li, setting `output` to Arabic\r
# digits on entries whose `hyouki` is a kanji numeral.\r
#\r
# NOTE(review): this text is in unified-diff form ('-' = removed line, '+' = added line)\r
# and its indentation has been lost.\r
#\r
# Parameters:\r
#   li       - iterable of objects with `hinshi2`, `hyouki` and `output` attributes.\r
#   logwrite - optional callable passed through to kansuji2arabic for tracing.\r
#\r
# Entries whose hinshi2 equals '固有名詞' (proper noun) are copied without conversion.\r
# The input objects are not modified; each one is deep-copied first.\r
-def rewrite_number(li):\r
+def rewrite_number(li, logwrite=None):\r
new_li = []\r
for mo in li:\r
m = copy.deepcopy(mo)\r
if m.hinshi2 != '固有名詞':\r
- ret = kansuji2arabic(m.hyouki)\r
- if ret:\r
- m.output = ret\r
+ flag, num = kansuji2arabic(m.hyouki, logwrite)\r
+ if flag == 1:\r
+ m.output = str(num)\r
+ elif flag == 2 and len(m.hyouki) >= 2:\r
+ # For input such as 「二十二三」, split it into 「二十二」 and 「三」\r
+ h1 = m.hyouki[:-1]\r
+ flag1, num1 = kansuji2arabic(h1, logwrite)\r
+ h2 = m.hyouki[-1:]\r
+ flag2, num2 = kansuji2arabic(h2, logwrite)\r
# Only when both parts convert is output set; otherwise the copied entry keeps its output.\r
# The two numbers are joined with '⠼' (U+283C) - presumably a braille number sign; confirm.\r
+ if flag1 == 1 and flag2 == 1:\r
+ m.output = str(num1) + '⠼' + str(num2)\r
new_li.append(m)\r
return new_li\r
\r
\r
li = replace_morphs(li, CONNECTED_MORPHS)\r
li = replace_digit_morphs(li)\r
- li = rewrite_number(li)\r
+ li = rewrite_number(li, logwrite)\r
\r
# before: う,う,助動詞,*,*,*,ウ,ウ,0/1,ウ,0\r
# after: う,う,助動詞,*,*,*,ウ,ウ,0/1,ー,0\r