jptools/jtalkRunner.py

   1 # jtalkRunner.py
   2 # -*- coding: utf-8 -*-
   3 # Japanese speech engine test module
   4 # by Takuya Nishimoto
   5 # http://ja.nishimotz.com/project:libopenjtalk
   6 # Usage:
   7 # > python jtalkRunner.py
   8 # requires pyaudio (PortAudio wrapper)
   9 # http://people.csail.mit.edu/hubert/pyaudio/
  10
  11 from __future__ import unicode_literals, print_function
  12 import os
  13 import sys
  14 import wave
  15 import time
  16 import pyaudio
  17 import cProfile
  18 import pstats
# Directory of the jtalk synthesizer driver. It is resolved relative to the
# current working directory (<cwd>/../source/synthDrivers/jtalk), so the
# script has to be started from a directory that is a sibling of 'source'.
# NOTE(review): os.getcwdu() exists only in Python 2.
JT_DIR = os.path.normpath(
        os.path.join(os.getcwdu(), '..', 'source', 'synthDrivers', 'jtalk')
        )
# Extend the module search path before the imports below
# (presumably jtalkCore and jtalkPrepare live in JT_DIR -- confirm).
sys.path.append(JT_DIR)
from jtalkCore import *
import jtalkPrepare

# Library path handed to libjt_initialize() in main().
JT_DLL = os.path.join(JT_DIR, 'libopenjtalk.dll')
  27
  28 voices = [
  29         {"id": "V1",
  30          "name": "m1",
  31          "lang":"ja",
  32          "samp_rate": 48000,
  33          "fperiod": 240,
  34          "lf0_base":5.0,
  35          "speaker_attenuation":1.0,
  36          "htsvoice": os.path.join(JT_DIR, 'm001', 'm001.htsvoice'),
  37          #"espeak_variant": "max",
  38          },
  39         {"id": "V2",
  40          "name": "mei",
  41          "lang":"ja",
  42          "samp_rate": 48000,
  43          "fperiod": 240,
  44          "lf0_base": 5.86,
  45          "pitch_bias": -10,
  46          "speaker_attenuation": 0.5,
  47          "htsvoice": os.path.join(JT_DIR, 'mei', 'mei_normal.htsvoice'),
  48          #"espeak_variant": "f1",
  49          },
  50         {"id": "V3",
  51          "name": "lite",
  52          "lang":"ja",
  53          "samp_rate": 16000,
  54          "fperiod": 80,
  55          "lf0_base": 5.0,
  56          "pitch_bias": 0,
  57          "speaker_attenuation": 1.0,
  58          "htsvoice": os.path.join(JT_DIR, 'lite', 'voice.htsvoice'),
  59          #"espeak_variant": "max",
  60          },
  61         ]
  62
  63 def pa_play(data, samp_rate = 16000):
  64         p = pyaudio.PyAudio()
  65         stream = p.open(format = p.get_format_from_width(2),
  66                 channels = 1, rate = samp_rate, output = True)
  67         size = len(data)
  68         pos = 0 # byte count
  69         while pos < size:
  70                 a = stream.get_write_available() * 2
  71                 o = data[pos:pos+a]
  72                 stream.write(o)
  73                 pos += a
  74         time.sleep(float(size) / 2 / samp_rate)
  75         stream.close()
  76         p.terminate()
  77
  78 def __print(s):
  79         print(s.encode('cp932', 'ignore'))
  80
  81 def print_code(msg):
  82         s = ''
  83         for c in msg:
  84                 s += '%04x ' % ord(c)
  85         print(s)
  86
  87 def do_synthesis(msg, voice_args, do_play, do_write, do_log, fperiod, pitch=50, inflection=50, vol=50):
  88         msg = jtalkPrepare.convert(msg)
  89         s = text2mecab(msg)
  90         __print("utf-8: (%s)" % s.decode('utf-8', 'ignore'))
  91         mf = MecabFeatures()
  92         Mecab_analysis(s, mf)
  93         Mecab_print(mf, __print)
  94         Mecab_correctFeatures(mf)
  95         ar = Mecab_splitFeatures(mf)
  96         __print('array size %d' % len(ar))
  97         max_level = int(326.67 * int(vol) + 100) # 100..32767
  98         level = int(max_level * voice_args['speaker_attenuation'])
  99         lf0_amp = 0.020 * inflection # 50 = original range
 100         ls = 0.015 * (pitch - 50.0 + voice_args['pitch_bias']) # 50 = no shift
 101         lf0_offset = ls + voice_args['lf0_base'] * (1 - lf0_amp)
 102         count = 0
 103         for a in ar:
 104                 count += 1
 105                 __print('feature size %d' % a.size)
 106                 Mecab_print(a, __print)
 107                 Mecab_utf8_to_cp932(a)
 108                 if do_write:
 109                         w = "_test%d.jt.wav" % count
 110                 else:
 111                         w = None
 112                 if do_log:
 113                         l = "_test%d.jtlog" % count
 114                 else:
 115                         l = None
 116                 data = libjt_synthesis(a.feature,
 117                                                            a.size,
 118                                                            begin_thres_=32,
 119                                                            end_thres_=32,
 120                                                            level_=level,
 121                                                            fperiod_ = fperiod,
 122                                                            lf0_offset_ = lf0_offset,
 123                                                            lf0_amp_ = lf0_amp,
 124                                                            logwrite_ = __print,
 125                                                            jtlogfile_ = l,
 126                                                            jtwavfile_ = w)
 127                 if data:
 128                         __print('data size %d' % len(data))
 129                         if do_play:
 130                                 pa_play(data, samp_rate = voice_args['samp_rate'])
 131                         if do_write:
 132                                 w = wave.Wave_write("_test%d.wav" % count)
 133                                 w.setparams( (1, 2, voice_args['samp_rate'], len(data)/2,
 134                                                           'NONE', 'not compressed') )
 135                                 w.writeframes(data)
 136                                 w.close()
 137                 libjt_refresh()
 138                 del a
 139         del mf
 140
 141 def main(do_play = False, do_write = True, do_log = False):
 142         njd = NJD()
 143         jpcommon = JPCommon()
 144         engine = HTS_Engine()
 145         libjt_initialize(JT_DLL)
 146         v = voices[1]
 147         libjt_load(v['htsvoice'])
 148         Mecab_initialize(__print, JT_DIR, os.path.join(JT_DIR, 'dic'))
 149
 150         msgs = [
 151                 '100.25ドル。ウェルカムトゥー nvda テンキーのinsertキーと、メインのinsertキーの両方が、nvdaキーとして動作します',
 152                 'マーク。まーく。',
 153                 ]
 154         s = msgs[0]
 155         fperiod = v['fperiod']
 156         do_synthesis(s, v, do_play, do_write, do_log, fperiod, pitch=50, inflection=50)
 157
# Script entry point: run the synthesis with file output and without playback.
if __name__ == '__main__':
        main(do_play=False, do_write=True)
        # Disabled profiling variant: runs main() with playback under cProfile,
        # stores the result in '_cprof.prof' and prints the statistics sorted
        # by time and number of calls.
        #prof = cProfile.run("main(do_play=True)", '_cprof.prof')
        #p = pstats.Stats('_cprof.prof')
        #p.strip_dirs()
        #p.sort_stats('time', 'calls')
        #p.print_stats()