1 #characterProcessing.py
\r
2 #A part of NonVisual Desktop Access (NVDA)
\r
3 #Copyright (C) 2010-2011 NV Access Inc, World Light Information Limited, Hong Kong Blind Union
\r
4 #This file is covered by the GNU General Public License.
\r
5 #See the file COPYING for more details.
\r
12 from logHandler import log
\r
class LocaleDataMap(object):
    """Allows access to locale-specific data objects, dynamically loading them if needed on request"""

    def __init__(self, localeDataFactory):
        """
        @param localeDataFactory: the factory to create data objects for the requested locale.
            It is called with the locale string and may raise C{LookupError}
            when it has no data for that locale.
        """
        self._localeDataFactory = localeDataFactory
        # Cache of already created data objects, keyed by locale string.
        self._dataMap = {}

    def fetchLocaleData(self, locale, fallback=True):
        """
        Fetches a data object for the given locale.
        This may mean that the data object is first created and stored if it does not yet exist in the map.
        The locale is also simplified (country is dropped) if the fallback argument is True and the full locale can not be used to create a data object.
        @param locale: the locale of the data object requested
        @type locale: string
        @param fallback: if true and there is no data for the locale, then the country (if it exists) is stripped and just the language is tried.
        @type fallback: boolean
        @return: the data object for the given locale
        @raise LookupError: if no data object could be found or created for any candidate locale.
        """
        # Candidates in order of preference: the full locale, then the bare language.
        localeList = [locale]
        if fallback and '_' in locale:
            localeList.append(locale.split('_')[0])
        for candidate in localeList:
            data = self._dataMap.get(candidate)
            if data:
                return data
            try:
                data = self._localeDataFactory(candidate)
            except LookupError:
                # No data for this candidate; move on to the next one.
                continue
            if not data:
                continue
            self._dataMap[candidate] = data
            return data
        raise LookupError(locale)

    def invalidateLocaleData(self, locale):
        """Invalidate the data object (if any) for the given locale.
        This will cause a new data object to be created when this locale is next requested.
        @param locale: The locale for which the data object should be invalidated.
        @type locale: string
        """
        # pop with a default tolerates locales that were never cached.
        self._dataMap.pop(locale, None)
\r
class CharacterDescriptions(object):
    """
    Represents a map of characters to one or more descriptions (examples) for that character.
    The data is loaded from a file from the requested locale.
    """

    def __init__(self, locale):
        """
        @param locale: The characterDescriptions.dic file will be found by using this locale.
        @type locale: string
        @raise LookupError: if there is no characterDescriptions.dic file for the locale.
        """
        self._entries = {}
        fileName = os.path.join('locale', locale, 'characterDescriptions.dic')
        if not os.path.isfile(fileName):
            raise LookupError(fileName)
        # The context manager closes the file even if parsing raises.
        with codecs.open(fileName, "r", "utf_8_sig", errors="replace") as f:
            for line in f:
                if line.isspace() or line.startswith('#'):
                    # Blank line or comment.
                    continue
                line = line.rstrip('\r\n')
                temp = line.split("\t")
                if len(temp) > 1:
                    # First field is the key; the remaining fields are its descriptions.
                    key = temp.pop(0)
                    self._entries[key] = temp
                else:
                    log.warning("can't parse line '%s'" % line)
        log.debug("Loaded %d entries." % len(self._entries))

    def getCharacterDescription(self, character):
        """
        Looks up the given character and returns a list containing all the description strings found.
        @param character: the character to look up.
        @return: the list of descriptions, or C{None} if the character has no entry.
        """
        return self._entries.get(character)
\r
# Caches one CharacterDescriptions object per locale.
_charDescLocaleDataMap=LocaleDataMap(CharacterDescriptions)

def getCharacterDescription(locale,character):
    """
    Finds a description or examples for the given character, which makes sense in the given locale.
    Falls back to English when the locale has no data or no entry for the character.
    @param locale: the locale (language[_COUNTRY]) the description should be for.
    @type locale: string
    @param character: the character whose description should be retrieved.
    @type character: string
    @return: the found description for the given character, or C{None} if there is none.
    @rtype: list of strings
    @raise LookupError: if no description data exists for the locale and the locale is already English.
    """
    isEnglish = locale.startswith('en')
    try:
        charDescs = _charDescLocaleDataMap.fetchLocaleData(locale)
    except LookupError:
        if not isEnglish:
            return getCharacterDescription('en',character)
        # English is the last resort; there is nothing left to fall back to.
        raise LookupError("en")
    desc = charDescs.getCharacterDescription(character)
    if not desc and not isEnglish:
        # The locale has data but no entry for this character; try English.
        desc = getCharacterDescription('en',character)
    return desc
\r
120 # Speech symbol levels
\r
126 SPEECH_SYMBOL_LEVEL_LABELS = {
\r
127 # Translators: The level at which the given symbol will be spoken.
\r
128 SYMLVL_NONE: pgettext("symbolLevel", "none"),
\r
129 # Translators: The level at which the given symbol will be spoken.
\r
130 SYMLVL_SOME: pgettext("symbolLevel", "some"),
\r
131 # Translators: The level at which the given symbol will be spoken.
\r
132 SYMLVL_MOST: pgettext("symbolLevel", "most"),
\r
133 # Translators: The level at which the given symbol will be spoken.
\r
134 SYMLVL_ALL: pgettext("symbolLevel", "all"),
\r
135 # Translators: The level at which the given symbol will be spoken.
\r
136 SYMLVL_CHAR: pgettext("symbolLevel", "character"),
\r
138 CONFIGURABLE_SPEECH_SYMBOL_LEVELS = (SYMLVL_NONE, SYMLVL_SOME, SYMLVL_MOST, SYMLVL_ALL)
\r
139 SPEECH_SYMBOL_LEVELS = CONFIGURABLE_SPEECH_SYMBOL_LEVELS + (SYMLVL_CHAR,)
\r
141 # Speech symbol preserve modes
\r
class SpeechSymbol(object):
    """Plain record holding the fields of one speech symbol.
    Every field other than the identifier defaults to C{None}.
    """
    __slots__ = ("identifier", "pattern", "replacement", "level", "preserve", "displayName")

    def __init__(self, identifier, pattern=None, replacement=None, level=None, preserve=None, displayName=None):
        self.identifier = identifier
        self.pattern = pattern
        self.replacement = replacement
        # level is a slot and a constructor argument, so it must be stored as well;
        # leaving it unset makes __repr__ raise AttributeError.
        self.level = level
        self.preserve = preserve
        self.displayName = displayName

    def __repr__(self):
        """Return C{SpeechSymbol(name=value, ...)} listing every slot in declaration order."""
        attrs = [
            "{name}={val!r}".format(name=attr, val=getattr(self, attr))
            for attr in self.__slots__
        ]
        return "SpeechSymbol(%s)" % ", ".join(attrs)
\r
164 class SpeechSymbols(object):
\r
166 Contains raw information about the pronunciation of symbols.
\r
167 It does not handle inheritance of data from other sources, processing of text, etc.
\r
168 This is all handled by L{SpeechSymbolProcessor}.
\r
171 def __init__(self):
\r
174 self.complexSymbols = collections.OrderedDict()
\r
175 self.symbols = collections.OrderedDict()
\r
176 self.fileName = None
\r
178 def load(self, fileName, allowComplexSymbols=True):
\r
179 """Load symbol information from a file.
\r
180 @param fileName: The name of the file from which to load symbol information.
\r
181 @type fileName: str
\r
182 @param allowComplexSymbols: Whether to allow complex symbols.
\r
183 @type allowComplexSymbols: bool
\r
184 @raise IOError: If the file cannot be read.
\r
186 self.fileName = fileName
\r
187 with codecs.open(fileName, "r", "utf_8_sig", errors="replace") as f:
\r
190 if line.isspace() or line.startswith("#"):
\r
191 # Whitespace or comment.
\r
193 line = line.rstrip("\r\n")
\r
195 if line == "complexSymbols:" and allowComplexSymbols:
\r
196 handler = self._loadComplexSymbol
\r
197 elif line == "symbols:":
\r
198 handler = self._loadSymbol
\r
200 # This is a line within a section, so handle it according to which section we're in.
\r
205 log.warning(u"Invalid line in file {file}: {line}".format(
\r
206 file=fileName, line=line))
\r
208 def _loadComplexSymbol(self, line):
\r
210 identifier, pattern = line.split("\t")
\r
213 self.complexSymbols[identifier] = pattern
\r
215 def _loadSymbolField(self, input, inputMap=None):
\r
222 return inputMap[input]
\r
226 IDENTIFIER_ESCAPES_INPUT = {
\r
236 IDENTIFIER_ESCAPES_OUTPUT = {v: k for k, v in IDENTIFIER_ESCAPES_INPUT.iteritems()}
\r
238 "none": SYMLVL_NONE,
\r
239 "some": SYMLVL_SOME,
\r
240 "most": SYMLVL_MOST,
\r
242 "char": SYMLVL_CHAR,
\r
244 LEVEL_OUTPUT = {v:k for k, v in LEVEL_INPUT.iteritems()}
\r
246 "never": SYMPRES_NEVER,
\r
247 "always": SYMPRES_ALWAYS,
\r
248 "norep": SYMPRES_NOREP,
\r
250 PRESERVE_OUTPUT = {v: k for k, v in PRESERVE_INPUT.iteritems()}
\r
252 def _loadSymbol(self, line):
\r
253 line = line.split("\t")
\r
254 identifier = replacement = level = preserve = displayName = None
\r
255 if line[-1].startswith("#"):
\r
256 # Regardless of how many fields there are,
\r
257 # if the last field is a comment, it is the display name.
\r
258 displayName = line[-1][1:].lstrip()
\r
262 identifier = next(line)
\r
264 # Empty identifier is not allowed.
\r
266 if identifier.startswith("\\") and len(identifier) >= 2:
\r
267 identifier = self.IDENTIFIER_ESCAPES_INPUT.get(identifier[1], identifier[1]) + identifier[2:]
\r
268 replacement = self._loadSymbolField(next(line))
\r
269 except StopIteration:
\r
270 # These fields are mandatory.
\r
273 level = self._loadSymbolField(next(line), self.LEVEL_INPUT)
\r
274 preserve = self._loadSymbolField(next(line), self.PRESERVE_INPUT)
\r
275 except StopIteration:
\r
276 # These fields are optional. Defaults will be used for unspecified fields.
\r
278 self.symbols[identifier] = SpeechSymbol(identifier, None, replacement, level, preserve, displayName)
\r
280 def save(self, fileName=None):
\r
281 """Save symbol information to a file.
\r
282 @param fileName: The name of the file to which to save symbol information,
\r
283 C{None} to use the file name last passed to L{load} or L{save}.
\r
284 @type fileName: str
\r
285 @raise IOError: If the file cannot be written.
\r
286 @raise ValueError: If C{fileName} is C{None}
\r
287 and L{load} or L{save} has not been called.
\r
290 self.fileName = fileName
\r
291 elif self.fileName:
\r
292 fileName = self.fileName
\r
294 raise ValueError("No file name")
\r
296 with codecs.open(fileName, "w", "utf_8_sig", errors="replace") as f:
\r
297 if self.complexSymbols:
\r
298 f.write(u"complexSymbols:\r\n")
\r
299 for identifier, pattern in self.complexSymbols.iteritems():
\r
300 f.write(u"%s\t%s\r\n" % (identifier, pattern))
\r
304 f.write(u"symbols:\r\n")
\r
305 for symbol in self.symbols.itervalues():
\r
306 f.write(u"%s\r\n" % self._saveSymbol(symbol))
\r
308 def _saveSymbolField(self, output, outputMap=None):
\r
314 return outputMap[output]
\r
318 def _saveSymbol(self, symbol):
\r
319 identifier = symbol.identifier
\r
321 identifier = u"\\%s%s" % (
\r
322 self.IDENTIFIER_ESCAPES_OUTPUT[identifier[0]], identifier[1:])
\r
325 fields = [identifier,
\r
326 self._saveSymbolField(symbol.replacement),
\r
327 self._saveSymbolField(symbol.level, self.LEVEL_OUTPUT),
\r
328 self._saveSymbolField(symbol.preserve, self.PRESERVE_OUTPUT)
\r
330 # Strip optional fields with default values.
\r
331 for field in reversed(fields[2:]):
\r
335 # This field specifies a value, so no more fields can be stripped.
\r
337 if symbol.displayName:
\r
338 fields.append("# %s" % symbol.displayName)
\r
339 return u"\t".join(fields)
\r
def _getSpeechSymbolsForLocale(locale):
    """Load the built-in and user symbol data for a locale.
    @param locale: The locale whose symbol files should be loaded.
    @type locale: str
    @return: A tuple of (builtin, user) L{SpeechSymbols} instances.
        The user instance is left empty if its file cannot be read.
    @raise LookupError: If the built-in symbols file for the locale cannot be read.
    """
    builtin = SpeechSymbols()
    try:
        builtin.load(os.path.join("locale", locale, "symbols.dic"))
    except IOError:
        raise LookupError("No symbol information for locale %s" % locale)
    user = SpeechSymbols()
    userFileName = os.path.join(globalVars.appArgs.configPath, "symbols-%s.dic" % locale)
    try:
        # Don't allow users to specify complex symbols
        # because an error will cause the whole processor to fail.
        user.load(userFileName, allowComplexSymbols=False)
    except IOError:
        # An empty user SpeechSymbols is okay.
        pass
    return builtin, user
\r
358 class SpeechSymbolProcessor(object):
\r
360 Handles processing of symbol pronunciation for a locale.
\r
361 Pronunciation information is taken from one or more L{SpeechSymbols} instances.
\r
364 #: Caches symbol data for locales.
\r
365 localeSymbols = LocaleDataMap(_getSpeechSymbolsForLocale)
\r
367 def __init__(self, locale):
\r
369 @param locale: The locale for which symbol pronunciation should be processed.
\r
372 self.locale = locale
\r
374 # We need to merge symbol data from several sources.
\r
375 sources = self.sources = []
\r
376 builtin, user = self.localeSymbols.fetchLocaleData(locale,fallback=False)
\r
377 self.userSymbols = user
\r
378 sources.append(user)
\r
379 sources.append(builtin)
\r
381 # Always use English as a base.
\r
383 # Only the builtin data.
\r
384 sources.append(self.localeSymbols.fetchLocaleData("en")[0])
\r
386 # The computed symbol information from all sources.
\r
387 symbols = self.computedSymbols = collections.OrderedDict()
\r
388 # An indexable list of complex symbols for use in building/executing the regexp.
\r
389 complexSymbolsList = self._computedComplexSymbolsList = []
\r
390 # A list of multi-character simple symbols for use in building the regexp.
\r
392 # A list of single character symbols for use in building the regexp.
\r
395 # Add all complex symbols first, as they take priority.
\r
396 for source in sources:
\r
397 for identifier, pattern in source.complexSymbols.iteritems():
\r
398 if identifier in symbols:
\r
401 symbol = SpeechSymbol(identifier, pattern)
\r
402 symbols[identifier] = symbol
\r
403 complexSymbolsList.append(symbol)
\r
405 # Supplement the data for complex symbols and add all simple symbols.
\r
406 for source in sources:
\r
407 for identifier, sourceSymbol in source.symbols.iteritems():
\r
409 symbol = symbols[identifier]
\r
410 # We're updating an already existing symbol.
\r
412 # This is a new simple symbol.
\r
413 # (All complex symbols have already been added.)
\r
414 symbol = symbols[identifier] = SpeechSymbol(identifier)
\r
415 if len(identifier) == 1:
\r
416 characters.append(identifier)
\r
418 multiChars.append(identifier)
\r
419 # If fields weren't explicitly specified, inherit the value from later sources.
\r
420 if symbol.replacement is None:
\r
421 symbol.replacement = sourceSymbol.replacement
\r
422 if symbol.level is None:
\r
423 symbol.level = sourceSymbol.level
\r
424 if symbol.preserve is None:
\r
425 symbol.preserve = sourceSymbol.preserve
\r
426 if symbol.displayName is None:
\r
427 symbol.displayName = sourceSymbol.displayName
\r
429 # Set defaults for any fields not explicitly set.
\r
430 for symbol in symbols.values():
\r
431 if symbol.replacement is None:
\r
432 # Symbols without a replacement specified are useless.
\r
433 log.warning(u"Replacement not defined in locale {locale} for symbol: {symbol}".format(
\r
434 symbol=symbol.identifier, locale=self.locale))
\r
435 del symbols[symbol.identifier]
\r
437 complexSymbolsList.remove(symbol)
\r
441 if symbol.level is None:
\r
442 symbol.level = SYMLVL_ALL
\r
443 if symbol.preserve is None:
\r
444 symbol.preserve = SYMPRES_NEVER
\r
445 if symbol.displayName is None:
\r
446 symbol.displayName = symbol.identifier
\r
448 # Make characters into a regexp character set.
\r
449 characters = "[%s]" % re.escape("".join(characters))
\r
450 # The simple symbols must be ordered longest first so that the longer symbols will match.
\r
451 multiChars.sort(key=lambda identifier: len(identifier), reverse=True)
\r
453 # Build the regexp.
\r
455 # Strip repeated spaces from the end of the line to stop them from being picked up by repeated.
\r
456 r"(?P<rstripSpace> +$)",
\r
457 # Repeated characters: more than 3 repeats.
\r
458 r"(?P<repeated>(?P<repTmp>%s)(?P=repTmp){3,})" % characters
\r
461 # Each complex symbol has its own named group so we know which symbol matched.
\r
463 u"(?P<c{index}>{pattern})".format(index=index, pattern=symbol.pattern)
\r
464 for index, symbol in enumerate(complexSymbolsList))
\r
466 # These are all handled in one named group.
\r
467 # Because the symbols are just text, we know which symbol matched just by looking at the matched text.
\r
468 patterns.append(ur"(?P<simple>{multiChars}|{singleChars})".format(
\r
469 multiChars="|".join(re.escape(identifier) for identifier in multiChars),
\r
470 singleChars=characters
\r
472 pattern = "|".join(patterns)
\r
474 self._regexp = re.compile(pattern, re.UNICODE)
\r
475 except re.error as e:
\r
476 log.error("Invalid complex symbol regular expression in locale %s: %s" % (locale, e))
\r
479 def _regexpRepl(self, m):
\r
480 group = m.lastgroup
\r
482 if group == "rstripSpace":
\r
485 elif group == "repeated":
\r
486 # Repeated character.
\r
488 symbol = self.computedSymbols[text[0]]
\r
489 if self._level >= symbol.level:
\r
490 return u" {count} {char} ".format(count=len(text), char=symbol.replacement)
\r
495 # One of the defined symbols.
\r
497 if group == "simple":
\r
499 symbol = self.computedSymbols[text]
\r
502 index = int(group[1:])
\r
503 symbol = self._computedComplexSymbolsList[index]
\r
504 if symbol.preserve == SYMPRES_ALWAYS or (symbol.preserve == SYMPRES_NOREP and self._level < symbol.level):
\r
508 if self._level >= symbol.level and symbol.replacement:
\r
509 return u" {repl}{suffix}".format(repl=symbol.replacement, suffix=suffix)
\r
513 def processText(self, text, level):
\r
514 self._level = level
\r
515 return self._regexp.sub(self._regexpRepl, text)
\r
517 def updateSymbol(self, newSymbol):
\r
518 """Update information for a symbol if it has changed.
\r
519 If there is a change, the changed information will be added to the user's symbol data.
\r
520 These changes do not take effect until the symbol processor is reinitialised.
\r
521 @param newSymbol: The symbol to update.
\r
522 @type newSymbol: L{SpeechSymbol}
\r
523 @return: Whether there was a change.
\r
526 identifier = newSymbol.identifier
\r
527 oldSymbol = self.computedSymbols[identifier]
\r
528 if oldSymbol is newSymbol:
\r
531 userSymbol = self.userSymbols.symbols[identifier]
\r
533 userSymbol = SpeechSymbol(identifier)
\r
536 if newSymbol.pattern != oldSymbol.pattern:
\r
537 userSymbol.pattern = newSymbol.pattern
\r
539 if newSymbol.replacement != oldSymbol.replacement:
\r
540 userSymbol.replacement = newSymbol.replacement
\r
542 if newSymbol.level != oldSymbol.level:
\r
543 userSymbol.level = newSymbol.level
\r
545 if newSymbol.preserve != oldSymbol.preserve:
\r
546 userSymbol.preserve = newSymbol.preserve
\r
548 if newSymbol.displayName != oldSymbol.displayName:
\r
549 userSymbol.displayName = newSymbol.displayName
\r
555 # Do this in case the symbol wasn't in userSymbols before.
\r
556 self.userSymbols.symbols[identifier] = userSymbol
\r
# Caches one SpeechSymbolProcessor per locale.
_localeSpeechSymbolProcessors = LocaleDataMap(SpeechSymbolProcessor)

def processSpeechSymbols(locale, text, level):
    """Process some text, converting symbols according to desired pronunciation.
    Falls back to English if there is no symbol processor for the locale.
    @param locale: The locale of the text.
    @type locale: str
    @param text: The text to process.
    @type text: str
    @param level: The symbol level to use; one of the SYMLVL_* constants.
    @return: The processed text.
    @raise LookupError: If no symbol processor is available for the locale or for English.
    """
    try:
        ss = _localeSpeechSymbolProcessors.fetchLocaleData(locale)
    except LookupError:
        # Fall back to English, but never from English itself.
        # The bare locale "en" does not start with "en_", so it needs its own check;
        # without it a failed lookup for "en" would recurse forever.
        if locale != "en" and not locale.startswith("en_"):
            return processSpeechSymbols("en", text, level)
        raise
    return ss.processText(text, level)
\r
577 def processSpeechSymbol(locale, symbol):
\r
578 """Process a single symbol according to desired pronunciation.
\r
579 @param locale: The locale of the symbol.
\r
581 @param symbol: The symbol.
\r
585 ss = _localeSpeechSymbolProcessors.fetchLocaleData(locale)
\r
586 except LookupError:
\r
587 if not locale.startswith("en_"):
\r
588 return processSpeechSymbol("en", symbol)
\r
591 return ss.computedSymbols[symbol].replacement
\r