Package translate :: Package storage :: Module lisa
[hide private]
[frames | no frames]

Source Code for Module translate.storage.lisa

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  # 
  4  # Copyright 2006-2009 Zuza Software Foundation 
  5  # 
  6  # This file is part of the Translate Toolkit. 
  7  # 
  8  # This program is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  # 
 13  # This program is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with this program; if not, see <http://www.gnu.org/licenses/>. 
 20   
 21  """Parent class for LISA standards (TMX, TBX, XLIFF)""" 
 22   
 23  import re 
 24   
 25  from translate.storage import base 
 26  from translate.lang import data 
 27  try: 
 28      from lxml import etree 
 29      from translate.misc.xml_helpers import getText, getXMLlang, setXMLlang, \ 
 30                                             getXMLspace, setXMLspace, namespaced 
 31  except ImportError, e: 
 32      raise ImportError("lxml is not installed. It might be possible to continue without support for XML formats.") 
 33   
 34   
def _findAllMatches(text, re_obj):
    """Generate match objects for all L{re_obj} matches in L{text}.

    Matches are produced left to right; each search resumes where the
    previous match ended, so the yielded matches never overlap.

    @param text: the string to scan
    @param re_obj: a compiled regular expression object
    @return: a generator of match objects
    """
    position = 0
    text_length = len(text)
    while position < text_length:
        match = re_obj.search(text, position)
        if match is None:
            break
        yield match
        position = match.end()
        if match.start() == match.end():
            # A zero-width match does not consume anything; without this
            # step the next search would return the very same match and
            # the loop would never terminate.
            position += 1


# TODO: we can now do better with our proper placeables support
# Regular expressions (one capturing group each) for the inline placeholders
# that get wrapped in <ph> elements.
placeholders = [
    # a percent sign followed by one conversion character, e.g. %s or %d
    '(%[diouxXeEfFgGcrs])',
    # one or more backslashes plus at most one following character
    r'(\\+.?)',
    # NOTE(review): the "$" here is unescaped, so it is an end-of-string
    # anchor rather than a literal dollar sign - presumably "\$" was
    # intended; confirm before changing, since it alters what is matched.
    '(%[0-9]$lx)',
    # a percent sign, one digit, a literal "$" and one lowercase letter
    r'(%[0-9]\$[a-z])',
    # the shortest run enclosed in angle brackets, e.g. an inline tag
    '(<.+?>)',
]
re_placeholders = list(map(re.compile, placeholders))
def _getPhMatches(text):
    """Return a list of regexp match objects for all placeholders in
    L{text}.

    Every pattern in C{re_placeholders} is applied to the whole text and the
    collected matches are ordered by their start position.

    @param text: the string to scan for placeholders
    @return: a list of match objects, sorted by C{start()}
    """
    matches = []
    for re_ph in re_placeholders:
        matches.extend(_findAllMatches(text, re_ph))

    # sort them so they come sequentially; the sort is stable, so matches
    # starting at the same position keep the order of their patterns
    matches.sort(key=lambda match: match.start())
    return matches
60 61
class LISAunit(base.TranslationUnit):
    """
    A single unit in the file.  Provisional work is done to make several
    languages possible.
    """

    #The name of the root element of this unit type:(termEntry, tu, trans-unit)
    rootNode = ""
    # The name of the per language element of this unit type:(termEntry, tu,
    # trans-unit)
    languageNode = ""
    #The name of the innermost element of this unit type:(term, seg)
    textNode = ""

    namespace = None
    _default_xml_space = "preserve"
    """The default handling of spacing in the absence of an xml:space
    attribute.

    This is mostly for correcting XLIFF behaviour."""

    def __init__(self, source, empty=False, **kwargs):
        """Constructs a unit containing the given source string.

        @param source: the source text handed to the base class constructor
        @param empty: when true, no XML element is created and the base
            class constructor is not called; L{createfromxmlElement} uses
            this and attaches an existing element afterwards
        """
        self._rich_source = None
        self._rich_target = None
        if empty:
            self._state_n = 0
            return
        self.xmlelement = etree.Element(self.namespaced(self.rootNode))
        #add descrip, note, etc.
        super(LISAunit, self).__init__(source)

    def __eq__(self, other):
        """Compares two units.

        Two LISA units are equal when they have the same number of language
        nodes and the text of each pair of nodes is equal.  Comparison with
        any other type is delegated to the base class.
        """
        if not isinstance(other, LISAunit):
            return super(LISAunit, self).__eq__(other)
        languageNodes = self.getlanguageNodes()
        otherlanguageNodes = other.getlanguageNodes()
        if len(languageNodes) != len(otherlanguageNodes):
            return False
        # Each unit's text is read under its own xml:space setting; the
        # other unit must not be interpreted with this unit's element.
        my_space = getXMLspace(self.xmlelement, self._default_xml_space)
        other_space = getXMLspace(other.xmlelement, other._default_xml_space)
        for mynode, othernode in zip(languageNodes, otherlanguageNodes):
            mytext = self.getNodeText(mynode, my_space)
            othertext = other.getNodeText(othernode, other_space)
            if mytext != othertext:
                #TODO:^ maybe we want to take children and notes into account
                return False
        return True

    def namespaced(self, name):
        """Returns name in Clark notation.

        For example namespaced("source") in an XLIFF document might return::
            {urn:oasis:names:tc:xliff:document:1.1}source
        This is needed throughout lxml.
        """
        return namespaced(self.namespace, name)

    def set_source_dom(self, dom_node):
        """Makes L{dom_node} the first language node, replacing the current
        first language node if there is one and appending otherwise."""
        languageNodes = self.getlanguageNodes()
        if len(languageNodes) > 0:
            self.xmlelement.replace(languageNodes[0], dom_node)
        else:
            self.xmlelement.append(dom_node)

    def get_source_dom(self):
        """Returns the first language node, or None if there is none."""
        return self.getlanguageNode(lang=None, index=0)
    source_dom = property(get_source_dom, set_source_dom)

    def setsource(self, text, sourcelang='en'):
        """Sets the source string by installing a freshly created language
        node for L{sourcelang}; any cached rich source is discarded."""
        self._rich_source = None
        text = data.forceunicode(text)
        self.source_dom = self.createlanguageNode(sourcelang, text, "source")

    def getsource(self):
        """Returns the text of the first language node (None if absent)."""
        return self.getNodeText(self.source_dom,
                                getXMLspace(self.xmlelement,
                                            self._default_xml_space))
    source = property(getsource, setsource)

    def set_target_dom(self, dom_node, append=False):
        """Installs L{dom_node} as a target language node.

        @param dom_node: the node to add; None only removes the existing
            second language node (unless L{append} is set)
        @param append: add the node at the end and keep existing nodes,
            instead of inserting it at child position 1 and removing the
            previous second language node
        """
        languageNodes = self.getlanguageNodes()
        # A source node has to exist before a target can be set.
        assert len(languageNodes) > 0
        if dom_node is not None:
            if append or len(languageNodes) == 0:
                self.xmlelement.append(dom_node)
            else:
                self.xmlelement.insert(1, dom_node)
        # languageNodes was collected before the insertion above, so
        # languageNodes[1] is the previous target node.
        if not append and len(languageNodes) > 1:
            self.xmlelement.remove(languageNodes[1])

    def get_target_dom(self, lang=None):
        """Returns the language node for L{lang}, or the second language
        node when no language is given; None if there is no such node."""
        if lang:
            return self.getlanguageNode(lang=lang)
        else:
            return self.getlanguageNode(lang=None, index=1)
    target_dom = property(get_target_dom)

    def settarget(self, text, lang='xx', append=False):
        """Sets the "target" string (second language), or alternatively
        appends to the list.

        Nothing is changed when L{text} equals the current target.  Passing
        None removes the existing target node.
        """
        #XXX: we really need the language - can't really be optional, and we
        # need to propagate it
        self._rich_target = None
        text = data.forceunicode(text)
        # Firstly deal with reinitialising to None or setting to identical
        # string
        if self.gettarget() == text:
            return
        languageNode = self.get_target_dom(None)
        if text is None:
            self.set_target_dom(None, False)
            return
        if languageNode is None:
            languageNode = self.createlanguageNode(lang, text, "target")
            self.set_target_dom(languageNode, append)
            return
        if self.textNode:
            # Write into the first text node if the language node has one,
            # otherwise fall back to the language node itself.
            for term in languageNode.iter(self.namespaced(self.textNode)):
                languageNode = term
                break
        languageNode.text = text

    def gettarget(self, lang=None):
        """Retrieves the "target" text (second entry), or the entry in the
        specified language, if it exists."""
        return self.getNodeText(self.get_target_dom(lang),
                                getXMLspace(self.xmlelement,
                                            self._default_xml_space))
    target = property(gettarget, settarget)

    def createlanguageNode(self, lang, text, purpose=None):
        """Returns a xml Element setup with given parameters to represent a
        single language entry.  Has to be overridden."""
        return None

    def createPHnodes(self, parent, text):
        """Create the text node in parent containing all the ph tags.

        Text before the first placeholder becomes C{parent.text}; each
        placeholder becomes a numbered C{ph} child element, and the text
        following a placeholder becomes that element's tail.
        """
        matches = _getPhMatches(text)
        if not matches:
            parent.text = text
            return

        # Now we know there will definitely be some ph tags
        start = matches[0].start()
        pretext = text[:start]
        if pretext:
            parent.text = pretext
        lasttag = parent
        for i, m in enumerate(matches):
            #pretext
            pretext = text[start:m.start()]
            # this will never happen with the first ph tag
            if pretext:
                lasttag.tail = pretext
            #ph node
            phnode = etree.SubElement(parent, self.namespaced("ph"))
            phnode.set("id", str(i + 1))
            phnode.text = m.group()
            lasttag = phnode
            start = m.end()
        #post text
        posttext = text[start:]
        if posttext:
            lasttag.tail = posttext

    def getlanguageNodes(self):
        """Returns a list of all nodes that contain per language information.
        """
        return list(self.xmlelement.iterchildren(self.namespaced(self.languageNode)))

    def getlanguageNode(self, lang=None, index=None):
        """Retrieves a languageNode either by language or by index.

        @return: the matching node, or None if no node has the requested
            language or the index is out of range
        @raise KeyError: if neither L{lang} nor L{index} is given
        """
        if lang is None and index is None:
            raise KeyError("No criterea for languageNode given")
        languageNodes = self.getlanguageNodes()
        if lang:
            for node in languageNodes:
                if getXMLlang(node) == lang:
                    return node
        else:#have to use index
            if index >= len(languageNodes):
                return None
            else:
                return languageNodes[index]
        return None

    def getNodeText(self, languageNode, xml_space="preserve"):
        """Retrieves the term from the given languageNode.

        @return: the text of the first C{textNode} descendant (or of the
            language node itself when the unit type has no C{textNode});
            None if L{languageNode} is None or has no such descendant
        """
        if languageNode is None:
            return None
        if not self.textNode:
            return getText(languageNode, xml_space)
        for term in languageNode.iterdescendants(self.namespaced(self.textNode)):
            return getText(term, xml_space)
        # The node didn't have the structure we expected; report "no text"
        # instead of letting StopIteration escape to the caller.
        return None

    def __str__(self):
        """Returns the unit's XML, pretty printed and encoded as UTF-8."""
        return etree.tostring(self.xmlelement, pretty_print=True,
                              encoding='utf-8')

    def _set_property(self, name, value):
        """Sets the attribute L{name} on the unit's XML element."""
        self.xmlelement.attrib[name] = value

    xid = property(lambda self: self.xmlelement.attrib[self.namespaced('xid')],
                   lambda self, value: self._set_property(self.namespaced('xid'), value))

    rid = property(lambda self: self.xmlelement.attrib[self.namespaced('rid')],
                   lambda self, value: self._set_property(self.namespaced('rid'), value))

    @classmethod
    def createfromxmlElement(cls, element):
        """Alternate constructor: wraps the existing XML L{element} in a new
        unit without creating a new element."""
        term = cls(None, empty=True)
        term.xmlelement = element
        return term
286 287
class LISAfile(base.TranslationStore):
    """A class representing a file store for one of the LISA file formats."""
    UnitClass = LISAunit
    #The root node of the XML document:
    rootNode = ""
    #The root node of the content section:
    bodyNode = ""
    #The XML skeleton to use for empty construction:
    XMLskeleton = ""

    namespace = None

    def __init__(self, inputfile=None, sourcelanguage='en',
                 targetlanguage=None, unitclass=None):
        """Builds the store from L{inputfile}, or from the class's XML
        skeleton when no input is given."""
        super(LISAfile, self).__init__(unitclass=unitclass)
        if inputfile is None:
            # We strip out newlines to ensure that spaces in the skeleton
            # don't interfere with the pretty printing of lxml
            skeleton = self.XMLskeleton.replace("\n", "")
            self.parse(skeleton)
            self.setsourcelanguage(sourcelanguage)
            self.settargetlanguage(targetlanguage)
            self.addheader()
        else:
            self.parse(inputfile)
            assert self.document.getroot().tag == self.namespaced(self.rootNode)
        self._encoding = "UTF-8"

    def addheader(self):
        """Method to be overridden to initialise headers, etc."""
        pass

    def namespaced(self, name):
        """Returns name in Clark notation.

        For example namespaced("source") in an XLIFF document might return::
            {urn:oasis:names:tc:xliff:document:1.1}source
        This is needed throughout lxml.
        """
        return namespaced(self.namespace, name)

    def initbody(self):
        """Initialises self.body so it never needs to be retrieved from the
        XML again.  The store's namespace is taken from the default
        namespace of the document root."""
        root_nsmap = self.document.getroot().nsmap
        self.namespace = root_nsmap.get(None, None)
        body_path = '//%s' % self.namespaced(self.bodyNode)
        self.body = self.document.find(body_path)

    def addsourceunit(self, source):
        #TODO: maybe this should rather be called addsourcestring or
        # something?
        """Adds and returns a new unit with the given string as first entry."""
        unit = self.UnitClass(source)
        self.addunit(unit)
        return unit

    def addunit(self, unit, new=True):
        """Adds L{unit} to the store, giving it the store's namespace.

        The unit's XML element is appended to the body only when L{new} is
        true.
        """
        unit.namespace = self.namespace
        super(LISAfile, self).addunit(unit)
        if not new:
            return
        self.body.append(unit.xmlelement)

    def __str__(self):
        """Converts to a string containing the file's XML"""
        return etree.tostring(self.document, pretty_print=True,
                              xml_declaration=True, encoding='utf-8')

    def parse(self, xml):
        """Populates this object from the given xml string, or from an
        object with a C{read} method, which is rewound and read in full."""
        if not hasattr(self, 'filename'):
            self.filename = getattr(xml, 'name', '')
        if hasattr(xml, "read"):
            xml.seek(0)
            xml = xml.read()
        parser_options = {}
        if etree.LXML_VERSION >= (2, 1, 0):
            #Since version 2.1.0 we can pass the strip_cdata parameter to
            #indicate that we don't want cdata to be converted to raw XML
            parser_options['strip_cdata'] = False
        parser = etree.XMLParser(**parser_options)
        self.document = etree.fromstring(xml, parser).getroottree()
        self._encoding = self.document.docinfo.encoding
        # initbody() sets self.namespace, which namespaced() relies on below
        self.initbody()
        assert self.document.getroot().tag == self.namespaced(self.rootNode)
        unit_tag = self.namespaced(self.UnitClass.rootNode)
        for entry in self.document.getroot().iterdescendants(unit_tag):
            self.addunit(self.UnitClass.createfromxmlElement(entry), new=False)
373