1 from xml.parsers import expat
\r
3 from logHandler import log
\r
class XMLTextParser(object):
    """Turn an XML text description into a flat list of commands.

    The expat callbacks append to ``_commandList``:

    * character data (and ``<unich value="N"/>`` code points) as strings,
      with adjacent strings merged into one entry;
    * ``<control>`` start/end as ``textInfos.FieldCommand`` objects
      ("controlStart" carrying a ``textInfos.ControlField``, "controlEnd"
      carrying ``None``);
    * ``<text>`` start as a "formatChange" ``textInfos.FieldCommand``
      carrying a ``textInfos.FormatField``.

    NOTE(review): this class was rebuilt from a view of the file that had
    lost its indentation and several lines (the ``__init__`` header and the
    ``try``/``except``/``else``/``return``/``pass`` lines). The control flow
    below is inferred from the visible statements -- confirm against the
    canonical file. ``textInfos`` and ``log`` must be importable at module
    level; the ``textInfos`` import was not visible in the reviewed view.
    """

    # ``basestring``/``unichr`` exist only on Python 2; fall back to the
    # Python 3 equivalents so the class works on either interpreter.
    try:
        _textTypes = basestring
        _codePointToChar = staticmethod(unichr)
    except NameError:
        _textTypes = str
        _codePointToChar = staticmethod(chr)

    def __init__(self):
        """Create the expat parser and wire its callbacks to this instance."""
        self.parser = expat.ParserCreate('utf-8')
        self.parser.StartElementHandler = self._startElementHandler
        self.parser.EndElementHandler = self._EndElementHandler
        self.parser.CharacterDataHandler = self._CharacterDataHandler
        self._commandList = []

    def _startElementHandler(self, tagName, attrs):
        """Handle an opening tag.

        :param tagName: element name; one of 'unich', 'control' or 'text'.
        :param attrs: mapping of the element's attributes.
        :raises ValueError: if ``tagName`` is not a known element.
        """
        if tagName == 'unich':
            data = attrs.get('value', None)
            if data is not None:
                try:
                    data = self._codePointToChar(int(data))
                except (ValueError, OverflowError):
                    # Non-numeric or out-of-range code point: substitute the
                    # Unicode replacement character rather than letting the
                    # error escape from inside the expat callback.
                    data = u"\ufffd"
                self._CharacterDataHandler(data)
            # 'unich' creates no field, so there is nothing to normalise.
            return

        if tagName == 'control':
            newAttrs = textInfos.ControlField(attrs)
            self._commandList.append(textInfos.FieldCommand("controlStart", newAttrs))
        elif tagName == 'text':
            newAttrs = textInfos.FormatField(attrs)
            self._commandList.append(textInfos.FieldCommand("formatChange", newAttrs))
        else:
            raise ValueError("Unknown tag name: %s" % tagName)

        # Normalise attributes common to both field types: the XML carries
        # them as the string "1"; store them as booleans. An attribute that
        # is not present on the tag is left absent.
        for key in ("_startOfNode", "_endOfNode"):
            try:
                newAttrs[key] = newAttrs[key] == "1"
            except KeyError:
                pass

    def _EndElementHandler(self, tagName):
        """Handle a closing tag.

        Only 'control' produces a command ("controlEnd"); 'text' and 'unich'
        need no action on close.

        :raises ValueError: if ``tagName`` is not a known element.
        """
        if tagName == "control":
            self._commandList.append(textInfos.FieldCommand("controlEnd", None))
        elif tagName in ("text", "unich"):
            pass
        else:
            raise ValueError("unknown tag name: %s" % tagName)

    def _CharacterDataHandler(self, data):
        """Append text to the command list, merging with a trailing string.

        expat may deliver one run of text in several pieces; joining them
        keeps each run as a single list entry.
        """
        cmdList = self._commandList
        if cmdList and isinstance(cmdList[-1], self._textTypes):
            cmdList[-1] += data
        else:
            cmdList.append(data)

    def parse(self, XMLText):
        """Parse ``XMLText`` and return the accumulated command list.

        Parsing is best-effort: a failure is logged together with the
        offending XML and whatever was collected before the failure is
        still returned.

        :param XMLText: the XML document as a text string.
        :returns: the list of strings and field commands built so far.
        """
        try:
            self.parser.Parse(XMLText.encode('utf-8'))
        except Exception:
            # NOTE(review): the original except clause was not visible in the
            # reviewed view; ``Exception`` keeps the log-and-continue behaviour
            # without swallowing KeyboardInterrupt/SystemExit -- confirm.
            log.error("XML: %s" % XMLText, exc_info=True)
        return self._commandList
\r