2 # Copyright (c) 2004 Danilo Segan <danilo@kvota.net>.
\r
4 # This file is part of xml2po.
\r
6 # xml2po is free software; you can redistribute it and/or modify
\r
7 # it under the terms of the GNU General Public License as published by
\r
8 # the Free Software Foundation; either version 2 of the License, or
\r
9 # (at your option) any later version.
\r
11 # xml2po is distributed in the hope that it will be useful,
\r
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
\r
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
\r
14 # GNU General Public License for more details.
\r
16 # You should have received a copy of the GNU General Public License
\r
17 # along with xml2po; if not, write to the Free Software Foundation, Inc.,
\r
18 # 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
\r
21 # slightly modified to work on Windows for TortoiseSVN.
\r
23 # xml2po -- translate XML documents
\r
# Versioning system (I have used this for a long time, so let's explain it to
# those Linux-versioning-scheme addicts):
\r
28 # 1.0.* are unstable, development versions
\r
29 # 1.1 will be first stable release (release 1), and 1.1.* bugfix releases
\r
30 # 2.0.* will be unstable-feature-development stage (milestone 1)
\r
31 # 2.1.* unstable development betas (milestone 2)
\r
32 # 2.2 second stable release (release 2), and 2.2.* bugfix releases
\r
class MessageOutput:
    """Collects translatable messages and writes them out in PO format.

    Messages are stored already escaped (see outputMessage).  When created
    with with_translations set, the object also keeps a list of
    translations that outputAll() pairs, in order, with the messages.
    """

    def __init__(self, with_translations = 0):
        self.messages = []   # escaped msgids, in the order they were added
        self.comments = {}   # msgid -> comment emitted as "#. ..." lines
        self.linenos = {}    # msgid -> list of (filename, tag, lineno)
        self.nowrap = {}     # msgid -> 1 when the message is space-preserving
        if with_translations:
            self.translations = []
        self.do_translations = with_translations
        self.output_msgstr = 0 # this is msgid mode for outputMessage; 1 is for msgstr mode

    def translationsFollow(self):
        """Indicate that what follows are translations."""
        self.output_msgstr = 1

    def setFilename(self, filename):
        """Set the file name recorded in references of later messages."""
        self.filename = filename

    def outputMessage(self, text, lineno = 0, comment = None, spacepreserve = 0, tag = None):
        """Adds a string to the list of messages.

        text is normalized (whitespace is kept when spacepreserve is set)
        and PO-escaped first; text that is blank is ignored.  In msgstr
        mode (after translationsFollow()) the string is stored as a
        translation and nothing else is recorded.  Otherwise the
        (filename, tag, lineno) reference is always recorded, and the
        message itself is added unless it is already known -- except when
        translations are being collected, where every occurrence is kept
        so that messages and translations stay paired by position.
        """
        if text.strip() == '':
            return

        t = escapePoString(normalizeString(text, not spacepreserve))
        if self.output_msgstr:
            self.translations.append(t)
            return

        # Every stored message has an entry in self.linenos, so the dict
        # doubles as a fast "already seen" check.
        if self.do_translations or t not in self.linenos:
            self.messages.append(t)
            if spacepreserve:
                self.nowrap[t] = 1

        self.linenos.setdefault(t, []).append((self.filename, tag, lineno))

        # Only the first comment seen for a message is kept, and none at
        # all when translations are being collected.
        if (not self.do_translations) and comment and t not in self.comments:
            self.comments[t] = comment

    def outputHeader(self, out):
        """Write the PO header entry, stamped with the current UTC time, to out."""
        from time import gmtime, strftime
        tstamp = strftime("%Y-%m-%d %H:%M +0000", gmtime())
        tmp = """msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\\n"
"POT-Creation-Date: %s\\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
"Language-Team: LANGUAGE <LL@li.org>\\n"
"MIME-Version: 1.0\\n"
"Content-Type: text/plain; charset=UTF-8\\n"
"Content-Transfer-Encoding: 8bit\\n"

""" % (tstamp)

        out.write(tmp.encode('utf-8'))

    def outputAll(self, out):
        """Write the header followed by one PO entry per collected message.

        Each entry consists of the optional "#." comment, one "#:"
        reference (plus a "#.(tag)" line) per recorded location, an
        optional "#, no-wrap" flag, the msgid and the msgstr.  The msgstr
        is empty unless translations were collected, in which case they
        are consumed from the front of self.translations.
        """
        self.outputHeader(out)

        for k in self.messages:
            if k in self.comments:
                out.write("#. %s\n" % (self.comments[k].replace("\n","\n#. ")))
            references = "".join(["#: %s:%d\n#.(%s)\n" % (ref[0], ref[2], ref[1])
                                  for ref in self.linenos[k]])
            out.write("%s" % (references))
            if k in self.nowrap and self.nowrap[k]:
                out.write("#, no-wrap\n")
            out.write("msgid \"%s\"\n" % (k))
            translation = ""
            if self.do_translations:
                if len(self.translations)>0:
                    translation = self.translations.pop(0)
            out.write("msgstr \"%s\"\n\n" % (translation))
\r
def normalizeNode(node):
    """Collapse whitespace in the text nodes below node, in place.

    Nothing is done for a false node or for a space-preserving node (and
    its subtree).  A blank text node gets empty content; in any other text
    node every run of whitespace becomes a single space.  For an element
    the children are processed recursively.
    """
    if not node:
        return
    elif isSpacePreserveNode(node):
        return
    elif node.isText():
        if node.isBlankNode():
            node.setContent('')
        else:
            node.setContent(re.sub(r'\s+', ' ', node.content))

    elif node.children and node.type == 'element':
        child = node.children
        while child:
            normalizeNode(child)
            child = child.next
\r
def normalizeString(text, ignorewhitespace = 1):
    """Normalizes string to be used as key for gettext lookup.

    Removes all unnecessary whitespace.

    text is parsed as XML wrapped in a <norm> element, with the document's
    internal DTD subset prepended when it can be obtained so that entities
    are resolved.  Whitespace in the parsed tree is collapsed with
    normalizeNode(), the children of <norm> are serialized back to a
    string, and a single leading and a single trailing space are removed.

    text is returned unchanged when ignorewhitespace is false, and also
    when it cannot be parsed as XML (which is reported on stderr).
    """
    if not ignorewhitespace:
        return text

    try:
        # Lets add document DTD so entities are resolved
        dtd = doc.intSubset()
        tmp = dtd.serialize('utf-8')
        tmp = tmp + '<norm>%s</norm>' % text
    except Exception:
        # No document or no internal subset: parse without a DTD.
        tmp = '<norm>%s</norm>' % text

    try:
        ctxt = libxml2.createDocParserCtxt(tmp)
        if expand_entities:
            ctxt.replaceEntities(1)
        ctxt.parseDocument()
        tree = ctxt.doc()
        newnode = tree.getRootElement()
    except Exception:
        sys.stderr.write("""Error while normalizing string as XML:\n"%s"\n\n""" % (text))
        return text

    normalizeNode(newnode)

    result = ''
    child = newnode.children
    while child:
        result += child.serialize('utf-8')
        child = child.next

    result = re.sub('^ ','', result)
    result = re.sub(' $','', result)

    return result
\r
178 def stringForEntity(node):
\r
179 """Replaces entities in the node."""
\r
180 text = node.serialize('utf-8')
\r
182 # Lets add document DTD so entities are resolved
\r
183 dtd = node.doc.intSubset()
\r
184 tmp = dtd.serialize('utf-8') + '<norm>%s</norm>' % text
\r
187 tmp = '<norm>%s</norm>' % text
\r
190 ctxt = libxml2.createDocParserCtxt(tmp)
\r
191 if expand_entities:
\r
192 ctxt.replaceEntities(1)
\r
193 ctxt.parseDocument()
\r
196 newnode = tree.children.next
\r
198 newnode = tree.children
\r
201 child = newnode.children
\r
203 result += child.serialize('utf-8')
\r
def escapePoString(text):
    """Escape text for use inside a double-quoted PO string.

    Backslash, double quote, newline and tab are replaced by the two
    character sequences \\\\, \\", \\n and \\t.
    """
    # Backslashes must be doubled first, otherwise the backslashes added
    # by the later replacements would be doubled as well.
    escaped = text.replace('\\', '\\\\')
    escaped = escaped.replace('"', "\\\"")
    escaped = escaped.replace("\n", "\\n")
    escaped = escaped.replace("\t", "\\t")
    return escaped
\r
def unEscapePoString(text):
    """Undo escapePoString(): decode \\\\, \\", \\n and \\t sequences.

    The string is decoded in a single left-to-right pass so that an
    escaped backslash followed by a letter (e.g. the three characters
    backslash, backslash, n) is not mistaken for a newline escape.  A
    backslash that does not start one of the known sequences is kept
    as it is.
    """
    escapes = {'\\': '\\', '"': '"', 'n': '\n', 't': '\t'}
    pieces = []
    pos = 0
    size = len(text)
    while pos < size:
        ch = text[pos]
        if ch == '\\' and pos + 1 < size and text[pos + 1] in escapes:
            pieces.append(escapes[text[pos + 1]])
            pos += 2
        else:
            pieces.append(ch)
            pos += 1
    return ''.join(pieces)
\r
def getTranslation(text, spacepreserve = 0):
    """Returns a translation via gettext for specified snippet.

    text should be a string to look for, spacepreserve set to 1
    when spaces should be preserved.

    The snippet is normalized first; if nothing but whitespace remains it
    is returned without a lookup.  Otherwise the catalog is read from the
    MO file named by the global mofile and the snippet (decoded from
    UTF-8) is looked up in it.
    """
    text = normalizeString(text, not spacepreserve)
    if (text.strip() == ''):
        return text

    mo = open(mofile, "rb")
    try:
        # The catalog is parsed by the constructor, so the file can be
        # closed as soon as it returns.
        gt = gettext.GNUTranslations(mo)
    finally:
        mo.close()
    return gt.ugettext(text.decode('utf-8'))
\r
def startTagForNode(node):
    """Return the text of node's start tag without the angle brackets.

    This is the node name followed by the serialization of each property
    whose type is 'attribute'.  Returns 0 for a false node.
    """
    if not node:
        return 0

    result = node.name
    params = ''
    if node.properties:
        # FIXME: This part sucks
        params = ''.join([p.serialize('utf-8')
                          for p in node.properties
                          if p.type == 'attribute'])
    return result+params
\r
244 def endTagForNode(node):
\r
251 def isFinalNode(node):
\r
253 auto = autoNodeIsFinal(node)
\r
254 # Check if any of the parents is also autoNodeIsFinal,
\r
255 # and if it is, don't consider this node a final one
\r
256 parent = node.parent
\r
257 while parent and auto:
\r
258 auto = not autoNodeIsFinal(parent)
\r
259 parent = parent.parent
\r
261 #node.type =='text' or not node.children or
\r
262 if node.type == 'element' and node.name in ultimate_tags:
\r
264 elif node.children:
\r
266 child = node.children
\r
267 while child and final_children:
\r
268 if not isFinalNode(child):
\r
275 def ignoreNode(node):
\r
277 if node.type in ('dtd', 'comment'):
\r
282 if isFinalNode(node):
\r
284 if node.name in ignored_tags or node.type in ('dtd', 'comment'):
\r
def isSpacePreserveNode(node):
    """Returns 1 if whitespace in node must be preserved, otherwise 0.

    That is the case when the node's own space-preserve setting is 1, or
    when a mode is loaded and it lists the node's name among its
    space-preserving tags.
    """
    pres = node.getSpacePreserve()
    if pres == 1:
        return 1
    if CurrentXmlMode and (node.name in CurrentXmlMode.getSpacePreserveTags()):
        return 1
    return 0
\r
def getCommentForNode(node):
    """Walk through previous siblings until a comment is found, or other element.

    Only whitespace is allowed between comment and current node.

    Returns the stripped content of that comment, or None when the nearest
    previous sibling that is not whitespace-only text is not a comment (or
    when there is no such sibling).
    """
    prev = node.prev
    # Skip text nodes that contain nothing but whitespace.
    while prev and prev.type == 'text' and prev.content.strip() == '':
        prev = prev.prev
    if prev and prev.type == 'comment':
        return prev.content.strip()
    else:
        return None
\r
311 def replaceNodeContentsWithText(node,text):
\r
312 """Replaces all subnodes of a node with contents of text treated as XML."""
\r
314 starttag = node.name #startTagForNode(node)
\r
315 endtag = endTagForNode(node)
\r
317 # Lets add document DTD so entities are resolved
\r
318 dtd = doc.intSubset()
\r
320 if expand_entities: # FIXME: we get a "Segmentation fault" in libxml2.parseMemory() when we include DTD otherwise
\r
321 tmp = dtd.serialize('utf-8')
\r
322 tmp = tmp + '<%s>%s</%s>' % (starttag, text, endtag)
\r
324 tmp = '<%s>%s</%s>' % (starttag, text, endtag)
\r
327 ctxt = libxml2.createDocParserCtxt(tmp.encode('utf-8'))
\r
328 ctxt.replaceEntities(0)
\r
329 ctxt.parseDocument()
\r
330 newnode = ctxt.doc()
\r
332 print >> sys.stderr, """Error while parsing translation as XML:\n"%s"\n""" % (text.encode('utf-8'))
\r
335 newelem = newnode.getRootElement()
\r
336 if newelem and newelem.children:
\r
337 free = node.children
\r
343 node.addChildList(newelem.children)
\r
345 # In practice, this happens with tags such as "<para> </para>" (only whitespace in between)
\r
348 node.setContent(text)
\r
def autoNodeIsFinal(node):
    """Returns 1 if node is text node, contains non-whitespace text nodes or entities.

    As written, the check looks at the node itself (a text node with
    non-blank content) and at its direct children of type 'text' with
    non-blank content; otherwise 0 is returned.
    """
    if node.isText() and node.content.strip()!='':
        return 1
    child = node.children
    while child:
        if child.type in ['text'] and child.content.strip()!='':
            return 1
        child = child.next
    return 0
\r
365 def worthOutputting(node):
\r
366 """Returns 1 if node is "worth outputting", otherwise 0.
\r
368 Node is "worth outputting", if none of the parents
\r
369 isFinalNode, and it contains non-blank text and entities.
\r
372 parent = node.parent
\r
373 final = isFinalNode(node) and node.name not in ignored_tags
\r
374 while not final and parent:
\r
375 if isFinalNode(parent):
\r
376 final = 1 # reset if we've got to one final tag
\r
377 if final and (parent.name not in ignored_tags) and worthOutputting(parent):
\r
380 parent = parent.parent
\r
384 return autoNodeIsFinal(node)
\r
386 def processElementTag(node, replacements, restart = 0):
\r
387 """Process node with node.type == 'element'."""
\r
388 if node.type == 'element':
\r
393 myrepl = replacements
\r
397 child = node.children
\r
399 if (isFinalNode(child)) or (child.type == 'element' and worthOutputting(child)):
\r
400 myrepl.append(processElementTag(child, myrepl, 1))
\r
401 outtxt += '<placeholder-%d/>' % (len(myrepl))
\r
403 if child.type == 'element':
\r
404 (starttag, content, endtag, translation) = processElementTag(child, myrepl, 0)
\r
405 outtxt += '<%s>%s</%s>' % (starttag, content, endtag)
\r
407 outtxt += doSerialize(child)
\r
411 if mode == 'merge':
\r
412 translation = getTranslation(outtxt, isSpacePreserveNode(node))
\r
414 translation = outtxt
\r
415 starttag = startTagForNode(node)
\r
416 endtag = endTagForNode(node)
\r
418 if restart or worthOutputting(node):
\r
420 while i < len(myrepl):
\r
421 replacement = '<%s>%s</%s>' % (myrepl[i][0], myrepl[i][3], myrepl[i][2])
\r
423 translation = translation.replace('<placeholder-%d/>' % (i), replacement)
\r
425 if worthOutputting(node):
\r
426 if mode == 'merge':
\r
427 replaceNodeContentsWithText(node, translation)
\r
429 msg.outputMessage(outtxt, node.lineNo(), getCommentForNode(node), isSpacePreserveNode(node), tag = node.name)
\r
431 return (starttag, outtxt, endtag, translation)
\r
433 raise Exception("You must pass node with node.type=='element'.")
\r
436 def isExternalGeneralParsedEntity(node):
\r
437 if (node and node.type=='entity_ref'):
\r
439 # it would be nice if debugDumpNode could use StringIO, but it apparently cannot
\r
440 tmp = file(".xml2po-entitychecking","w+")
\r
441 node.debugDumpNode(tmp,0)
\r
443 tmpstr = tmp.read()
\r
445 os.remove(".xml2po-entitychecking")
\r
447 # We fail silently, and replace all entities if we cannot
\r
448 # write .xml2po-entitychecking
\r
449 # !!! This is not very nice thing to do, but I don't know if
\r
450 # raising an exception is any better
\r
452 if tmpstr.find('EXTERNAL_GENERAL_PARSED_ENTITY') != -1:
\r
459 def doSerialize(node):
\r
460 """Serializes a node and its children, emitting PO messages along the way.
\r
462 node is the node to serialize, first indicates whether surrounding
\r
463 tags should be emitted as well.
\r
466 if ignoreNode(node):
\r
468 elif not node.children:
\r
469 return node.serialize("utf-8")
\r
470 elif node.type == 'entity_ref':
\r
471 if isExternalGeneralParsedEntity(node):
\r
472 return node.serialize('utf-8')
\r
474 return stringForEntity(node) #content #content #serialize("utf-8")
\r
475 elif node.type == 'entity_decl':
\r
476 return node.serialize('utf-8') #'<%s>%s</%s>' % (startTagForNode(node), node.content, node.name)
\r
477 elif node.type == 'text':
\r
478 return node.serialize('utf-8')
\r
479 elif node.type == 'element':
\r
481 (starttag, content, endtag, translation) = processElementTag(node, repl, 1)
\r
482 return '<%s>%s</%s>' % (starttag, content, endtag)
\r
484 child = node.children
\r
487 outtxt += doSerialize(child)
\r
492 def read_finaltags(filelist):
\r
494 return CurrentXmlMode.getFinalTags()
\r
496 defaults = ['para', 'title', 'releaseinfo', 'revnumber',
\r
497 'date', 'itemizedlist', 'orderedlist',
\r
498 'variablelist', 'varlistentry', 'term' ]
\r
501 def read_ignoredtags(filelist):
\r
503 return CurrentXmlMode.getIgnoredTags()
\r
505 defaults = ['itemizedlist', 'orderedlist', 'variablelist',
\r
509 def tryToUpdate(allargs, lang):
\r
510 # Remove "-u" and "--update-translation"
\r
511 command = allargs[0]
\r
513 opts, args = getopt.getopt(args, 'avhmket:o:p:u:',
\r
514 ['automatic-tags','version', 'help', 'keep-entities', 'extract-all-entities', 'merge', 'translation=',
\r
515 'output=', 'po-file=', 'update-translation=' ])
\r
516 for opt, arg in opts:
\r
517 if opt in ('-a', '--automatic-tags'):
\r
519 elif opt in ('-k', '--keep-entities'):
\r
521 elif opt in ('-e', '--extract-all-entities'):
\r
523 elif opt in ('-m', '--mode'):
\r
524 command += " -m %s" % arg
\r
525 elif opt in ('-o', '--output'):
\r
526 sys.stderr.write("Error: Option '-o' is not yet supported when updating translations directly.\n")
\r
528 elif opt in ('-v', '--version'):
\r
531 elif opt in ('-h', '--help'):
\r
532 sys.stderr.write("Error: If you want help, please use `%s --help' without '-u' option.\n" % (allargs[0]))
\r
534 elif opt in ('-u', '--update-translation'):
\r
537 sys.stderr.write("Error: Option `%s' is not supported with option `-u'.\n" % (opt))
\r
541 command += " " + args.pop()
\r
545 sys.stderr.write("Merging translations for %s: " % (lang))
\r
546 result = os.system("%s | msgmerge -o .tmp.%s.po %s -" % (command, lang, file))
\r
550 result = os.system("mv .tmp.%s.po %s" % (lang, file))
\r
552 sys.stderr.write("Error: cannot rename file.\n")
\r
555 os.system("msgfmt -cv -o NUL %s" % (file))
\r
558 def load_mode(modename):
\r
560 #found = imp.find_module(modename, submodes_path)
\r
561 #module = imp.load_module(modename, found[0], found[1], found[2])
\r
563 sys.path.append(submodes_path)
\r
564 module = __import__(modename)
\r
565 modeModule = '%sXmlMode' % modename
\r
566 return getattr(module, modeModule)
\r
570 def xml_error_handler(arg, ctxt):
\r
573 libxml2.registerErrorHandler(xml_error_handler, None)
\r
576 # Main program start
\r
577 if __name__ != '__main__': raise NotImplementedError
\r
580 submodes_path = "xml2po-modes"
\r
581 default_mode = 'docbook'
\r
590 mode = 'pot' # 'pot' or 'merge'
\r
592 expand_entities = 1
\r
593 expand_all_entities = 0
\r
595 output = '-' # this means to stdout
\r
597 import getopt, fileinput
\r
599 def usage (with_help = False):
\r
600 print >> sys.stderr, "Usage: %s [OPTIONS] [XMLFILE]..." % (sys.argv[0])
\r
602 print >> sys.stderr, """
\r
603 OPTIONS may be some of:
\r
604 -a --automatic-tags Automatically decides if tags are to be considered
\r
606 -k --keep-entities Don't expand entities
\r
607 -e --expand-all-entities Expand ALL entities (including SYSTEM ones)
\r
608 -m --mode=TYPE Treat tags as type TYPE (default: docbook)
\r
609 -o --output=FILE Print resulting text (XML or POT) to FILE
\r
610 -p --po-file=FILE Specify PO file containing translation, and merge
\r
611 Overwrites temporary file .xml2po.mo.
\r
612 -r --reuse=FILE Specify translated XML file with the same structure
\r
613 -t --translation=FILE Specify MO file containing translation, and merge
\r
614 -u --update-translation=LANG.po Updates a PO file using msgmerge program
\r
615 -v --version Output version of the xml2po program
\r
617 -h --help Output this message
\r
620 To create a POTemplate book.pot from input files chapter1.xml and
\r
621 chapter2.xml, run the following:
\r
622 %s -o book.pot chapter1.xml chapter2.xml
\r
624 After translating book.pot into de.po, merge the translations back,
\r
625 using -p option for each XML file:
\r
626 %s -p de.po chapter1.xml > chapter1.de.xml
\r
627 %s -p de.po chapter2.xml > chapter2.de.xml
\r
628 """ % (sys.argv[0], sys.argv[0], sys.argv[0])
\r
631 if len(sys.argv) < 2: usage()
\r
633 args = sys.argv[1:]
\r
634 try: opts, args = getopt.getopt(args, 'avhkem:t:o:p:u:r:',
\r
635 ['automatic-tags','version', 'help', 'keep-entities', 'expand-all-entities', 'mode=', 'translation=',
\r
636 'output=', 'po-file=', 'update-translation=', 'reuse=' ])
\r
637 except getopt.GetoptError: usage(True)
\r
639 for opt, arg in opts:
\r
640 if opt in ('-m', '--mode'):
\r
642 if opt in ('-a', '--automatic-tags'):
\r
644 elif opt in ('-k', '--keep-entities'):
\r
645 expand_entities = 0
\r
646 elif opt in ('-e', '--expand-all-entities'):
\r
647 expand_all_entities = 1
\r
648 elif opt in ('-t', '--translation'):
\r
651 translationlanguage = os.path.splitext(mofile)[0]
\r
652 elif opt in ('-r', '--reuse'):
\r
654 elif opt in ('-u', '--update-translation'):
\r
655 tryToUpdate(sys.argv, arg)
\r
656 elif opt in ('-p', '--po-file'):
\r
657 mofile = ".xml2po.mo"
\r
659 translationlanguage = os.path.splitext(pofile)[0]
\r
660 os.system("msgfmt -o %s %s >NUL" % (mofile, pofile)) and sys.exit(7)
\r
662 elif opt in ('-o', '--output'):
\r
664 elif opt in ('-v', '--version'):
\r
667 elif opt in ('-h', '--help'):
\r
670 # Treat remaining arguments as XML files
\r
672 filenames.append(args.pop())
\r
674 if len(filenames) > 1 and mode=='merge':
\r
675 print >> sys.stderr, "Error: You can merge translations with only one XML file at a time."
\r
679 CurrentXmlMode = load_mode(default_mode)()
\r
681 CurrentXmlMode = None
\r
682 print >> sys.stderr, "Warning: cannot load module '%s', using automatic detection (-a)." % (default_mode)
\r
685 if mode=='merge' and mofile=='':
\r
686 print >> sys.stderr, "Error: You must specify MO file when merging translations."
\r
689 ultimate_tags = read_finaltags(ultimate)
\r
690 ignored_tags = read_ignoredtags(ignored)
\r
692 # I'm not particularly happy about making any of these global,
\r
693 # but I don't want to bother too much with it right now
\r
697 msg = MessageOutput()
\r
699 filenames.append(origxml)
\r
700 msg = MessageOutput(1)
\r
702 for filename in filenames:
\r
704 if filename == origxml:
\r
705 msg.translationsFollow()
\r
706 ctxt = libxml2.createFileParserCtxt(filename)
\r
707 ctxt.lineNumbers(1)
\r
708 if expand_all_entities:
\r
709 ctxt.replaceEntities(1)
\r
710 ctxt.parseDocument()
\r
713 print >> sys.stderr, "Error: cannot open file '%s'." % (filename)
\r
716 msg.setFilename(filename)
\r
717 if CurrentXmlMode and origxml=='':
\r
718 CurrentXmlMode.preProcessXml(doc,msg)
\r
725 out = file(output, 'w')
\r
727 print >> sys.stderr, "Error: cannot open file %s for writing." % (output)
\r
730 if mode != 'merge':
\r
732 tcmsg = CurrentXmlMode.getStringForTranslators()
\r
733 tccom = CurrentXmlMode.getCommentForTranslators()
\r
735 msg.outputMessage(tcmsg, 0, tccom)
\r
737 tcmsg = CurrentXmlMode.getStringForTranslation()
\r
738 tccom = CurrentXmlMode.getCommentForTranslation()
\r
740 msg.outputMessage(tcmsg, 0, tccom)
\r
745 tcmsg = CurrentXmlMode.getStringForTranslators()
\r
747 tnames = getTranslation(tcmsg)
\r
750 tcmsg = CurrentXmlMode.getStringForTranslation()
\r
752 tstring = getTranslation(tcmsg)
\r
756 CurrentXmlMode.postProcessXmlTranslation(doc, translationlanguage, tnames, tstring)
\r
757 out.write(doc.serialize('utf-8', 1))
\r