symbols = self.computedSymbols = collections.OrderedDict()\r
# An indexable list of complex symbols for use in building/executing the regexp.\r
complexSymbolsList = self._computedComplexSymbolsList = []\r
- # A list of simple symbol identifiers for use in building the regexp.\r
- simpleSymbolIdentifiers = []\r
- # Single character symbols.\r
- characters = set()\r
+ # A list of multi-character simple symbols for use in building the regexp.\r
+ multiChars = []\r
+ # A list of single character symbols for use in building the regexp.\r
+ characters = []\r
\r
# Add all complex symbols first, as they take priority.\r
for source in sources:\r
# This is a new simple symbol.\r
# (All complex symbols have already been added.)\r
symbol = symbols[identifier] = SpeechSymbol(identifier)\r
- simpleSymbolIdentifiers.append(identifier)\r
if len(identifier) == 1:\r
- characters.add(identifier)\r
+ characters.append(identifier)\r
+ else:\r
+ multiChars.append(identifier)\r
# If fields weren't explicitly specified, inherit the value from later sources.\r
if symbol.replacement is None:\r
symbol.replacement = sourceSymbol.replacement\r
if symbol.displayName is None:\r
symbol.displayName = symbol.identifier\r
\r
- characters = "".join(characters)\r
+ # Make characters into a regexp character set.\r
+ characters = "[%s]" % re.escape("".join(characters))\r
# The simple symbols must be ordered longest first so that the longer symbols will match.\r
- simpleSymbolIdentifiers.sort(key=lambda identifier: len(identifier), reverse=True)\r
+ multiChars.sort(key=lambda identifier: len(identifier), reverse=True)\r
\r
# Build the regexp.\r
patterns = [\r
# Strip repeated spaces from the end of the line to stop them from being picked up by repeated.\r
r"(?P<rstripSpace> +$)",\r
# Repeated characters: more than 3 repeats.\r
- r"(?P<repeated>(?P<repTmp>[%s])(?P=repTmp){3,})" % re.escape("".join(characters))\r
+ r"(?P<repeated>(?P<repTmp>%s)(?P=repTmp){3,})" % characters\r
]\r
# Complex symbols.\r
# Each complex symbol has its own named group so we know which symbol matched.\r
# Simple symbols.\r
# These are all handled in one named group.\r
# Because the symbols are just text, we know which symbol matched just by looking at the matched text.\r
- patterns.append(ur"(?P<simple>{})".format(\r
- "|".join(re.escape(identifier) for identifier in simpleSymbolIdentifiers)\r
+ patterns.append(ur"(?P<simple>{multiChars}|{singleChars})".format(\r
+ multiChars="|".join(re.escape(identifier) for identifier in multiChars),\r
+ singleChars=characters\r
))\r
pattern = "|".join(patterns)\r
try:\r