
Source Code for Module translate.search.match

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright 2006-2009 Zuza Software Foundation
#
# This file is part of the Translate Toolkit.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>.

"""Class to perform translation memory matching from a store of translation units"""

import heapq
import re

from translate.search import lshtein
from translate.search import terminology
from translate.storage import base
from translate.storage import po
from translate.misc.multistring import multistring

def sourcelen(unit):
    """Returns the length of the source string"""
    return len(unit.source)


def _sort_matches(matches, match_info):
    def _matches_cmp(x, y):
        # This function will sort a list of matches according to the match's
        # starting position, putting the one with the longer source text
        # first, if two are the same.
        c = cmp(match_info[x.source]['pos'], match_info[y.source]['pos'])
        return c and c or cmp(len(y.source), len(x.source))
    matches.sort(_matches_cmp)

class matcher(object):
    """A class that will do matching and store configuration for the matching process"""

    sort_reverse = False

    def __init__(self, store, max_candidates=10, min_similarity=75, max_length=70, comparer=None, usefuzzy=False):
        """max_candidates is the maximum number of candidates that should be
        assembled, min_similarity is the minimum similarity that must be
        attained to be included in the result, comparer is an optional
        Comparer with similarity() function"""
        if comparer is None:
            comparer = lshtein.LevenshteinComparer(max_length)
        self.comparer = comparer
        self.setparameters(max_candidates, min_similarity, max_length)
        self.usefuzzy = usefuzzy
        self.inittm(store)
        self.addpercentage = True
    def usable(self, unit):
        """Returns whether this translation unit is usable for TM"""
        #TODO: We might want to consider more attributes, such as approved, reviewed, etc.
        source = unit.source
        target = unit.target
        if source and target and (self.usefuzzy or not unit.isfuzzy()):
            if len(source) < 2:
                return False
            if source in self.existingunits and self.existingunits[source] == target:
                return False
            else:
                self.existingunits[source] = target
                return True
        return False
    def inittm(self, stores, reverse=False):
        """Initialises the memory for later use. We use simple base units for
        speedup."""
        # reverse is deprecated - just use self.sort_reverse
        self.existingunits = {}
        self.candidates = base.TranslationStore()

        if isinstance(stores, base.TranslationStore):
            stores = [stores]
        for store in stores:
            self.extendtm(store.units, store=store, sort=False)
        self.candidates.units.sort(key=sourcelen, reverse=self.sort_reverse)
        # print "TM initialised with %d candidates (%d to %d characters long)" % \
        #         (len(self.candidates.units), len(self.candidates.units[0].source), len(self.candidates.units[-1].source))
    def extendtm(self, units, store=None, sort=True):
        """Extends the memory with extra unit(s).

        @param units: The units to add to the TM.
        @param store: Optional store from where some metadata can be retrieved
        and associated with each unit.
        @param sort: Optional parameter that can be set to False to suppress
        sorting of the candidates list. This should probably only be used in
        inittm().
        """
        if isinstance(units, base.TranslationUnit):
            units = [units]
        candidates = filter(self.usable, units)
        for candidate in candidates:
            simpleunit = base.TranslationUnit("")
            # We need to ensure that we don't pass multistrings further, since
            # some modules (like the native Levenshtein) can't use it.
            if isinstance(candidate.source, multistring):
                if len(candidate.source.strings) > 1:
                    simpleunit.orig_source = candidate.source
                    simpleunit.orig_target = candidate.target
                simpleunit.source = unicode(candidate.source)
                simpleunit.target = unicode(candidate.target)
            else:
                simpleunit.source = candidate.source
                simpleunit.target = candidate.target
            # If we now only get translator comments, we don't get programmer
            # comments in TM suggestions (in Pootle, for example). If we get
            # all notes, pot2po adds all previous comments as translator
            # comments in the new po file.
            simpleunit.addnote(candidate.getnotes(origin="translator"))
            simpleunit.fuzzy = candidate.isfuzzy()
            self.candidates.units.append(simpleunit)
        if sort:
            self.candidates.units.sort(key=sourcelen, reverse=self.sort_reverse)
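
    # For example (illustrative call, not part of the original module): a
    # caller can grow a live TM with tmmatcher.extendtm(newunit) or
    # tmmatcher.extendtm(store.units, store=store); a single unit is wrapped
    # in a list automatically by the isinstance() check above.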
    def setparameters(self, max_candidates=10, min_similarity=75, max_length=70):
        """Sets the parameters without reinitialising the tm. If a parameter
        is not specified, it is set to the default, not ignored"""
        self.MAX_CANDIDATES = max_candidates
        self.MIN_SIMILARITY = min_similarity
        self.MAX_LENGTH = max_length
    def getstoplength(self, min_similarity, text):
        """Calculates a length beyond which we are not interested.
        The extra fat is because we don't use plain character distance only."""
        return min(len(text) / (min_similarity/100.0), self.MAX_LENGTH)
    def getstartlength(self, min_similarity, text):
        """Calculates the minimum length we are interested in.
        The extra fat is because we don't use plain character distance only."""
        return max(len(text) * (min_similarity/100.0), 1)
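
    # A worked example (illustrative numbers, not part of the original
    # module): with the default min_similarity of 75 and a 12-character
    # search text, getstartlength() returns max(12 * 0.75, 1) = 9 and
    # getstoplength() returns min(12 / 0.75, 70) = 16, so only candidates
    # whose source is between 9 and 16 characters long are ever compared.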
    def matches(self, text):
        """Returns a list of possible matches for given source text.

        @type text: String
        @param text: The text that will be searched for in the translation memory
        @rtype: list
        @return: a list of units with the source and target strings from the
        translation memory. If self.addpercentage is true (default) the match
        quality is given as a percentage in the notes.
        """
        bestcandidates = [(0.0, None)]*self.MAX_CANDIDATES
        #We use self.MIN_SIMILARITY, but if we already know we have max_candidates
        #that are better, we can adjust min_similarity upwards for speedup
        min_similarity = self.MIN_SIMILARITY

        # We want to limit our search in self.candidates, so we want to ignore
        # all units with a source string that is too short or too long. We use
        # a binary search to find the shortest string, from where we start our
        # search in the candidates.

        # minimum source string length to be considered
        startlength = self.getstartlength(min_similarity, text)
        startindex = 0
        endindex = len(self.candidates.units)
        while startindex < endindex:
            mid = (startindex + endindex) // 2
            if sourcelen(self.candidates.units[mid]) < startlength:
                startindex = mid + 1
            else:
                endindex = mid

        # maximum source string length to be considered
        stoplength = self.getstoplength(min_similarity, text)
        lowestscore = 0

        for candidate in self.candidates.units[startindex:]:
            cmpstring = candidate.source
            if len(cmpstring) > stoplength:
                break
            similarity = self.comparer.similarity(text, cmpstring, min_similarity)
            if similarity < min_similarity:
                continue
            if similarity > lowestscore:
                heapq.heapreplace(bestcandidates, (similarity, candidate))
                lowestscore = bestcandidates[0][0]
                if lowestscore >= 100:
                    break
                if min_similarity < lowestscore:
                    min_similarity = lowestscore
                    stoplength = self.getstoplength(min_similarity, text)

        #Remove the empty ones:
        def notzero(item):
            score = item[0]
            return score != 0
        bestcandidates = filter(notzero, bestcandidates)
        #Sort for use as a general list, and reverse so the best one is at index 0
        bestcandidates.sort(reverse=True)
        return self.buildunits(bestcandidates)
    def buildunits(self, candidates):
        """Builds a list of units conforming to base API, with the score in the comment"""
        units = []
        for score, candidate in candidates:
            if hasattr(candidate, "orig_source"):
                candidate.source = candidate.orig_source
                candidate.target = candidate.orig_target
            newunit = po.pounit(candidate.source)
            newunit.target = candidate.target
            newunit.markfuzzy(candidate.fuzzy)
            candidatenotes = candidate.getnotes().strip()
            if candidatenotes:
                newunit.addnote(candidatenotes)
            if self.addpercentage:
                newunit.addnote("%d%%" % score)
            units.append(newunit)
        return units
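

# Illustrative usage sketch (not part of the original module):
# "memory.po" and _example_tm_lookup are hypothetical names; the store is
# loaded with translate.storage.factory.
def _example_tm_lookup():
    from translate.storage import factory
    tmstore = factory.getobject("memory.po")  # any bilingual store works
    tmmatcher = matcher(tmstore, max_candidates=5, min_similarity=80)
    for candidate in tmmatcher.matches("Open the file"):
        # each result is a po.pounit; with addpercentage enabled (the
        # default) the match quality percentage is stored in the notes
        print candidate.source, "->", candidate.target, candidate.getnotes()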

# We don't want to miss certain forms of words that only change a little
# at the end. Now we are tying this code to English, but it should serve
# us well. For example "category" should be found in "categories",
# "copy" should be found in "copied"
#
# The tuples define a regular expression to search for, and with what it
# should be replaced.
ignorepatterns = [
    ("y\s*$", "ie"),  #category/categories, identify/identifies, apply/applied
    ("[\s-]+", ""),   #down time / downtime, pre-order / preorder
    ("-", " "),       #pre-order / pre order
    (" ", "-"),       #pre order / pre-order
]
ignorepatterns_re = [(re.compile(a), b) for (a, b) in ignorepatterns]

context_re = re.compile("\s+\(.*\)\s*$")
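
# To see what these rewrites produce, a small illustrative loop (hypothetical
# helper, not part of the original module):
def _example_ignorepatterns():
    for term in ("category", "down time", "pre-order"):
        for pattern_re, replacement in ignorepatterns_re:
            newterm, count = pattern_re.subn(replacement, term)
            if count:
                print "%s -> %s" % (term, newterm)
    # prints, among others: "category -> categorie" (a prefix of
    # "categories"), "down time -> downtime" and "pre-order -> pre order"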

class terminologymatcher(matcher):
    """A matcher with settings specifically for terminology matching"""

    sort_reverse = True

    def __init__(self, store, max_candidates=10, min_similarity=75, max_length=500, comparer=None):
        if comparer is None:
            comparer = terminology.TerminologyComparer(max_length)
        matcher.__init__(self, store, max_candidates, min_similarity=10, max_length=max_length, comparer=comparer)
        self.addpercentage = False
        self.match_info = {}
    def inittm(self, store):
        """Normal initialisation, but convert all source strings to lower case"""
        matcher.inittm(self, store)
        extras = []
        for unit in self.candidates.units:
            source = unit.source = context_re.sub("", unit.source).lower()
            for ignorepattern_re, replacement in ignorepatterns_re:
                (newterm, occurrences) = ignorepattern_re.subn(replacement, source)
                if occurrences:
                    new_unit = type(unit).buildfromunit(unit)
                    new_unit.source = newterm
                    # We mark it fuzzy to indicate that it isn't pristine
                    unit.markfuzzy()
                    extras.append(new_unit)
        self.candidates.units.sort(key=sourcelen, reverse=self.sort_reverse)
        if extras:
            # We don't sort, so that the altered forms are at the back and
            # considered last.
            self.extendtm(extras, sort=False)
    def getstartlength(self, min_similarity, text):
        # Let's reduce false matches by not working with terms of two
        # characters or less
        return 3
    def getstoplength(self, min_similarity, text):
        # Let's ignore terms with more than 50 characters. Perhaps someone
        # gave a file with normal (long) translations
        return 50
    def usable(self, unit):
        """Returns whether this translation unit is usable for terminology."""
        if not unit.istranslated():
            return False
        l = len(context_re.sub("", unit.source))
        return l <= self.MAX_LENGTH and l >= self.getstartlength(None, None)
    def matches(self, text):
        """Normal matching after converting text to lower case. Then replace
        with the original unit to retain comments, etc."""
        text = text.lower()
        comparer = self.comparer
        comparer.match_info = {}
        match_info = {}
        matches = []
        known = set()
        for cand in self.candidates.units:
            source = cand.source
            if (source, cand.target) in known:
                continue
            if comparer.similarity(text, source, self.MIN_SIMILARITY):
                match_info[source] = {'pos': comparer.match_info[source]['pos']}
                matches.append(cand)
                known.add((source, cand.target))

        final_matches = []
        lastend = 0
        _sort_matches(matches, match_info)
        for match in matches:
            start_pos = match_info[match.source]['pos']
            if start_pos < lastend:
                continue
            end = start_pos + len(match.source)

            final_matches.append(match)

            # Get translations for the placeable
            for m in matches:
                if m is match:
                    continue
                m_info = match_info[m.source]
                m_end = m_info['pos']
                if m_end > start_pos:
                    # we're past valid possibilities in the list
                    break
                m_end += len(m.source)
                if start_pos == m_info['pos'] and end == m_end:
                    # another match for the same term
                    final_matches.append(m)

            lastend = end
        if final_matches:
            self.match_info = match_info
        return final_matches
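

# Illustrative terminology lookup (hypothetical names, not part of the
# original module): "terms.po" is a glossary store.
def _example_terminology_lookup():
    from translate.storage import factory
    glossary = factory.getobject("terms.po")
    termmatcher = terminologymatcher(glossary)
    for term in termmatcher.matches("Save the file to disk"):
        # match_info maps each matched source term to its starting offset
        # in the searched text
        pos = termmatcher.match_info[term.source]['pos']
        print "found '%s' at offset %d" % (term.source, pos)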


# utility functions used by virtaal and tmserver to convert matching units
# into easily marshallable dictionaries
def unit2dict(unit):
    """converts a pounit to a simple dict structure for use over the web"""
    return {"source": unit.source, "target": unit.target,
            "quality": _parse_quality(unit.getnotes()), "context": unit.getcontext()}

def _parse_quality(comment):
    """extracts match quality from po comments"""
    quality = re.search('([0-9]+)%', comment)
    if quality:
        return quality.group(1)
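

# A small round trip through the utility functions (illustrative values;
# _example_unit2dict is a hypothetical helper). Note that _parse_quality()
# returns the percentage as a string, or None when no note matches.
def _example_unit2dict():
    unit = po.pounit("file")
    unit.target = "leer"
    unit.addnote("75%")
    d = unit2dict(unit)
    assert d["quality"] == "75"
    assert _parse_quality("no quality here") is None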