OSDN Git Service

SpeechSymbolProcessor: Optimise the regular expression by matching all single character symbols with a character set.
author: James Teh <jamie@jantrid.net>
Wed, 19 Sep 2012 07:01:08 +0000 (17:01 +1000)
committer: James Teh <jamie@jantrid.net>
Wed, 19 Sep 2012 07:01:08 +0000 (17:01 +1000)
This should also allow for more single character symbols.

source/characterProcessing.py

index 6ae70f8..0adc6e3 100644 (file)
@@ -379,10 +379,10 @@ class SpeechSymbolProcessor(object):
                symbols = self.computedSymbols = collections.OrderedDict()\r
                # An indexable list of complex symbols for use in building/executing the regexp.\r
                complexSymbolsList = self._computedComplexSymbolsList = []\r
-               # A list of simple symbol identifiers for use in building the regexp.\r
-               simpleSymbolIdentifiers = []\r
-               # Single character symbols.\r
-               characters = set()\r
+               # A list of multi-character simple symbols for use in building the regexp.\r
+               multiChars = []\r
+               # A list of single character symbols for use in building the regexp.\r
+               characters = []\r
 \r
                # Add all complex symbols first, as they take priority.\r
                for source in sources:\r
@@ -404,9 +404,10 @@ class SpeechSymbolProcessor(object):
                                        # This is a new simple symbol.\r
                                        # (All complex symbols have already been added.)\r
                                        symbol = symbols[identifier] = SpeechSymbol(identifier)\r
-                                       simpleSymbolIdentifiers.append(identifier)\r
                                        if len(identifier) == 1:\r
-                                               characters.add(identifier)\r
+                                               characters.append(identifier)\r
+                                       else:\r
+                                               multiChars.append(identifier)\r
                                # If fields weren't explicitly specified, inherit the value from later sources.\r
                                if symbol.replacement is None:\r
                                        symbol.replacement = sourceSymbol.replacement\r
@@ -436,16 +437,17 @@ class SpeechSymbolProcessor(object):
                        if symbol.displayName is None:\r
                                symbol.displayName = symbol.identifier\r
 \r
-               characters = "".join(characters)\r
+               # Make characters into a regexp character set.\r
+               characters = "[%s]" % re.escape("".join(characters))\r
                # The simple symbols must be ordered longest first so that the longer symbols will match.\r
-               simpleSymbolIdentifiers.sort(key=lambda identifier: len(identifier), reverse=True)\r
+               multiChars.sort(key=lambda identifier: len(identifier), reverse=True)\r
 \r
                # Build the regexp.\r
                patterns = [\r
                        # Strip repeated spaces from the end of the line to stop them from being picked up by repeated.\r
                        r"(?P<rstripSpace>  +$)",\r
                        # Repeated characters: more than 3 repeats.\r
-                       r"(?P<repeated>(?P<repTmp>[%s])(?P=repTmp){3,})" % re.escape("".join(characters))\r
+                       r"(?P<repeated>(?P<repTmp>%s)(?P=repTmp){3,})" % characters\r
                ]\r
                # Complex symbols.\r
                # Each complex symbol has its own named group so we know which symbol matched.\r
@@ -455,8 +457,9 @@ class SpeechSymbolProcessor(object):
                # Simple symbols.\r
                # These are all handled in one named group.\r
                # Because the symbols are just text, we know which symbol matched just by looking at the matched text.\r
-               patterns.append(ur"(?P<simple>{})".format(\r
-                       "|".join(re.escape(identifier) for identifier in simpleSymbolIdentifiers)\r
+               patterns.append(ur"(?P<simple>{multiChars}|{singleChars})".format(\r
+                       multiChars="|".join(re.escape(identifier) for identifier in multiChars),\r
+                       singleChars=characters\r
                ))\r
                pattern = "|".join(patterns)\r
                try:\r