SpeechSymbolProcessor: Optimise the regular expression by matching all single character symbols with one character set instead of as separate alternatives.

author James Teh <jamie@jantrid.net>

Wed, 19 Sep 2012 07:01:08 +0000 (17:01 +1000)

committer James Teh <jamie@jantrid.net>

Wed, 19 Sep 2012 07:01:08 +0000 (17:01 +1000)
author James Teh <jamie@jantrid.net>
Wed, 19 Sep 2012 07:01:08 +0000 (17:01 +1000)
committer James Teh <jamie@jantrid.net>
Wed, 19 Sep 2012 07:01:08 +0000 (17:01 +1000)
diff --git a/source/characterProcessing.py b/source/characterProcessing.py

index 6ae70f8..0adc6e3 100644 (file)
--- a/source/characterProcessing.py
+++ b/source/characterProcessing.py
@@ -379,10 +379,10 @@ class SpeechSymbolProcessor(object):
                 symbols = self.computedSymbols = collections.OrderedDict()\r
                 # An indexable list of complex symbols for use in building/executing the regexp.\r
                 complexSymbolsList = self._computedComplexSymbolsList = []\r
-               # A list of simple symbol identifiers for use in building the regexp.\r
-               simpleSymbolIdentifiers = []\r
-               # Single character symbols.\r
-               characters = set()\r
+               # A list of multi-character simple symbols for use in building the regexp.\r
+               multiChars = []\r
+               # A list of single character symbols for use in building the regexp.\r
+               characters = []\r
  \r
                 # Add all complex symbols first, as they take priority.\r
                 for source in sources:\r
@@ -404,9 +404,10 @@ class SpeechSymbolProcessor(object):
                                         # This is a new simple symbol.\r
                                         # (All complex symbols have already been added.)\r
                                         symbol = symbols[identifier] = SpeechSymbol(identifier)\r
-                                       simpleSymbolIdentifiers.append(identifier)\r
                                         if len(identifier) == 1:\r
-                                               characters.add(identifier)\r
+                                               characters.append(identifier)\r
+                                       else:\r
+                                               multiChars.append(identifier)\r
                                 # If fields weren't explicitly specified, inherit the value from later sources.\r
                                 if symbol.replacement is None:\r
                                         symbol.replacement = sourceSymbol.replacement\r
@@ -436,16 +437,17 @@ class SpeechSymbolProcessor(object):
                         if symbol.displayName is None:\r
                                 symbol.displayName = symbol.identifier\r
  \r
-               characters = "".join(characters)\r
+               # Make characters into a regexp character set.\r
+               characters = "[%s]" % re.escape("".join(characters))\r
                 # The simple symbols must be ordered longest first so that the longer symbols will match.\r
-               simpleSymbolIdentifiers.sort(key=lambda identifier: len(identifier), reverse=True)\r
+               multiChars.sort(key=lambda identifier: len(identifier), reverse=True)\r
  \r
                 # Build the regexp.\r
                 patterns = [\r
                         # Strip repeated spaces from the end of the line to stop them from being picked up by repeated.\r
                         r"(?P<rstripSpace>  +$)",\r
                         # Repeated characters: more than 3 repeats.\r
-                       r"(?P<repeated>(?P<repTmp>[%s])(?P=repTmp){3,})" % re.escape("".join(characters))\r
+                       r"(?P<repeated>(?P<repTmp>%s)(?P=repTmp){3,})" % characters\r
                 ]\r
                 # Complex symbols.\r
                 # Each complex symbol has its own named group so we know which symbol matched.\r
@@ -455,8 +457,9 @@ class SpeechSymbolProcessor(object):
                 # Simple symbols.\r
                 # These are all handled in one named group.\r
                 # Because the symbols are just text, we know which symbol matched just by looking at the matched text.\r
-               patterns.append(ur"(?P<simple>{})".format(\r
-                       "|".join(re.escape(identifier) for identifier in simpleSymbolIdentifiers)\r
+               patterns.append(ur"(?P<simple>{multiChars}|{singleChars})".format(\r
+                       multiChars="|".join(re.escape(identifier) for identifier in multiChars),\r
+                       singleChars=characters\r
                 ))\r
                 pattern = "|".join(patterns)\r
                 try:\r
author	James Teh <jamie@jantrid.net>
	Wed, 19 Sep 2012 07:01:08 +0000 (17:01 +1000)
committer	James Teh <jamie@jantrid.net>
	Wed, 19 Sep 2012 07:01:08 +0000 (17:01 +1000)