Package translate :: Package search :: Package indexing :: Module XapianIndexer
[hide private]
[frames] | no frames]

Source Code for Module translate.search.indexing.XapianIndexer

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  # 
  4  # Copyright 2008-2009 Zuza Software Foundation 
  5  # 
  6  # This file is part of translate. 
  7  # 
  8  # translate is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  # 
 13  # translate is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with translate; if not, write to the Free Software 
 20  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 21  # 
 22   
 23  """ 
 24  Interface to the Xapian indexing engine for the Translate Toolkit 
 25   
 26  Xapian v1.0 or higher is supported. 
 27   
 28  If you are interested in writing an interface for Xapian 0.x, then 
 29  you should check out the following:: 
 30      svn export -r 7235 https://translate.svn.sourceforge.net/svnroot/translate/src/branches/translate-search-indexer-generic-merging/translate/search/indexer/ 
 31  It is not completely working, but it should give you a good start. 
 32  """ 
 33   
 34  __revision__ = "$Id: XapianIndexer.py 15330 2010-08-05 11:06:53Z alaaosh $" 
 35   
 36  # xapian module versions before 1.0.13 hang apache under mod_python 
 37  import sys 
 38  import re 
 39   
 40  # detect if running under apache 
 41  if 'apache' in sys.modules or '_apache' in sys.modules or 'mod_wsgi' in sys.modules: 
42 - def _str2version(version):
43 return [int(i) for i in version.split('.')]
44 45 import subprocess 46 # even checking xapian version leads to deadlock under apache, must figure version from command line 47 try: 48 command = subprocess.Popen(['xapian-check', '--version'], stdout=subprocess.PIPE) 49 stdout, stderr = command.communicate() 50 if _str2version(re.match('.*([0-9]+\.[0-9]+\.[0-9]+).*', stdout).groups()[0]) < [1, 0, 13]: 51 raise ImportError("Running under apache, can't load xapain") 52 except: 53 #FIXME: report is xapian-check command is missing? 54 raise ImportError("Running under apache, can't load xapian") 55 56 import CommonIndexer 57 import xapian 58 import os 59 import time 60 import logging 61
def is_available():
    """check whether a usable xapian module is loaded

    @return: True if the major version reported by xapian is above zero
    @rtype: bool
    """
    major = xapian.major_version()
    return major > 0


# In xapian there is a length restriction for term strings, see
# http://osdir.com/ml/search.xapian.general/2006-11/msg00210.html
# A maximum length of around 240 is described there - but we need less anyway.
# This limit is applied by _truncate_term_length() to untokenized terms.
_MAX_TERM_LENGTH = 128


class XapianDatabase(CommonIndexer.CommonDatabase):
    """interface to the xapian (http://xapian.org) indexer

    Reading goes through C{self.reader} (a C{xapian.Database}), writing goes
    through C{self.writer} (a C{xapian.WritableDatabase}).  The writer is
    opened on demand and dropped again by L{flush}, L{commit_transaction}
    and L{cancel_transaction}.
    """

    QUERY_TYPE = xapian.Query
    INDEX_DIRECTORY_NAME = "xapian"

    def __init__(self, basedir, analyzer=None, create_allowed=True):
        """initialize or open a xapian database

        @raise ValueError: the given location exists, but the database type
            is incompatible (e.g. created by a different indexing engine)
        @raise OSError: the database failed to initialize

        @param basedir: the parent directory of the database
        @type basedir: str
        @param analyzer: bitwise combination of possible analyzer flags
            to be used as the default analyzer for this database. Leave it
            empty to use the system default analyzer (self.ANALYZER_DEFAULT).
            see self.ANALYZER_TOKENIZE, self.ANALYZER_PARTIAL, ...
        @type analyzer: int
        @param create_allowed: create the database, if necessary;
            default: True
        @type create_allowed: bool
        """
        # call the __init__ function of our parent
        # (presumably this is where self.location gets defined)
        super(XapianDatabase, self).__init__(basedir, analyzer=analyzer,
                create_allowed=create_allowed)
        self.reader = None
        self.writer = None
        if os.path.exists(self.location):
            # try to open an existing database
            try:
                self.reader = xapian.Database(self.location)
            except xapian.DatabaseOpeningError as err_msg:
                raise ValueError("Indexer: failed to open xapian database "
                        "(%s) - maybe it is not a xapian database: %s"
                        % (self.location, str(err_msg)))
        else:
            # create a new database
            if not create_allowed:
                raise OSError("Indexer: skipping database creation")
            parent_path = os.path.dirname(self.location)
            try:
                # recursively create all directories up to parent_path
                if not os.path.isdir(parent_path):
                    os.makedirs(parent_path)
            except (IOError, OSError) as err_msg:
                # os.makedirs reports failures as OSError
                raise OSError("Indexer: failed to create the parent "
                        "directory (%s) of the indexing database: %s"
                        % (parent_path, str(err_msg)))
            try:
                self.writer = xapian.WritableDatabase(self.location,
                        xapian.DB_CREATE_OR_OPEN)
                # flushing closes the writer and opens the reader
                self.flush()
            except xapian.DatabaseOpeningError as err_msg:
                raise OSError("Indexer: failed to open or create a xapian "
                        "database (%s): %s" % (self.location, str(err_msg)))

    def __del__(self):
        """drop the reader and close a writer that is still open"""
        self.reader = None
        # __init__ may have failed before 'writer' was assigned
        if getattr(self, "writer", None) is not None:
            self._writer_close()

    def flush(self, optimize=False):
        """force to write the current changes to disk immediately

        @param optimize: ignored for xapian
        @type optimize: bool
        """
        # write changes to disk (only if database is read-write)
        if self._writer_is_open():
            self._writer_close()
        self._index_refresh()

    def make_query(self, *args, **kwargs):
        """create a query via the parent class

        If xapian reports that the database was modified in the meantime,
        the reader is refreshed and the query is built once more.

        @return: whatever the parent's make_query returns
        """
        try:
            return super(XapianDatabase, self).make_query(*args, **kwargs)
        except xapian.DatabaseModifiedError:
            self._index_refresh()
            return super(XapianDatabase, self).make_query(*args, **kwargs)

    def _create_query_for_query(self, query):
        """generate a query based on an existing query object

        basically this function should just create a copy of the original

        @param query: the original query object
        @type query: xapian.Query
        @return: the resulting query object
        @rtype: xapian.Query
        """
        # create a copy of the original query
        return xapian.Query(query)

    def _create_query_for_string(self, text, require_all=True,
            analyzer=None):
        """generate a query for a plain term of a string query

        basically this function parses the string and returns the resulting
        query

        @param text: the query string
        @type text: str
        @param require_all: boolean operator
            (True -> AND (default) / False -> OR)
        @type require_all: bool
        @param analyzer: Define query options (partial matching, exact
            matching, tokenizing, ...) as bitwise combinations of
            CommonIndexer.ANALYZER_???.
            This can override previously defined field analyzer settings.
            If analyzer is None (default), then the configured analyzer for
            the field is used.
        @type analyzer: int
        @return: resulting query object
        @rtype: xapian.Query
        """
        qp = xapian.QueryParser()
        qp.set_database(self.reader)
        if require_all:
            qp.set_default_op(xapian.Query.OP_AND)
        else:
            qp.set_default_op(xapian.Query.OP_OR)
        if analyzer is None:
            analyzer = self.analyzer
        if analyzer & self.ANALYZER_PARTIAL > 0:
            # partial matching
            return qp.parse_query(text, xapian.QueryParser.FLAG_PARTIAL)
        elif analyzer == self.ANALYZER_EXACT:
            # exact matching - the text is used as a single term
            return xapian.Query(text)
        else:
            # everything else (not partial and not exact)
            return qp.parse_query(text, 0)

    def _create_query_for_field(self, field, value, analyzer=None):
        """generate a field query

        this functions creates a field->value query

        @param field: the fieldname to be used
        @type field: str
        @param value: the wanted value of the field
        @type value: str
        @param analyzer: Define query options (partial matching, exact
            matching, tokenizing, ...) as bitwise combinations of
            CommonIndexer.ANALYZER_???.
            This can override previously defined field analyzer settings.
            If analyzer is None (default), then the configured analyzer for
            the field is used.
        @type analyzer: int
        @return: the resulting query object
        @rtype: xapian.Query
        """
        if analyzer is None:
            analyzer = self.analyzer
        # the upper-cased field name serves as the term prefix
        prefix = field.upper()
        if analyzer == self.ANALYZER_EXACT:
            # exact matching -> keep special characters
            return xapian.Query("%s%s" % (prefix, value))
        # other queries need a parser object
        qp = xapian.QueryParser()
        qp.set_database(self.reader)
        if analyzer & self.ANALYZER_PARTIAL > 0:
            # partial matching
            match_flags = xapian.QueryParser.FLAG_PARTIAL
        else:
            # everything else (not partial and not exact)
            match_flags = 0
        return qp.parse_query(value, match_flags, prefix)

    def _create_query_combined(self, queries, require_all=True):
        """generate a combined query

        @param queries: list of the original queries
        @type queries: list of xapian.Query
        @param require_all: boolean operator
            (True -> AND (default) / False -> OR)
        @type require_all: bool
        @return: the resulting combined query object
        @rtype: xapian.Query
        """
        if require_all:
            query_op = xapian.Query.OP_AND
        else:
            query_op = xapian.Query.OP_OR
        return xapian.Query(query_op, queries)

    def _create_empty_document(self):
        """create an empty document to be filled and added to the index later

        @return: the new document object
        @rtype: xapian.Document
        """
        return xapian.Document()

    def _add_plain_term(self, document, term, tokenize=True):
        """add a term to a document

        @param document: the document to be changed
        @type document: xapian.Document
        @param term: a single term to be added
        @type term: str
        @param tokenize: should the term be tokenized automatically
        @type tokenize: bool
        """
        if tokenize:
            term_gen = xapian.TermGenerator()
            term_gen.set_document(document)
            term_gen.index_text(term)
        else:
            # untokenized terms are stored as they are (length-limited)
            document.add_term(_truncate_term_length(term))

    def _add_field_term(self, document, field, term, tokenize=True):
        """add a field term to a document

        @param document: the document to be changed
        @type document: xapian.Document
        @param field: name of the field
        @type field: str
        @param term: term to be associated to the field
        @type term: str
        @param tokenize: should the term be tokenized automatically
        @type tokenize: bool
        """
        if tokenize:
            term_gen = xapian.TermGenerator()
            term_gen.set_document(document)
            term_gen.index_text(term, 1, field.upper())
        else:
            # the upper-cased field name is prepended to the raw term
            document.add_term(_truncate_term_length("%s%s" % \
                    (field.upper(), term)))

    def _add_document_to_index(self, document):
        """add a prepared document to the index database

        @param document: the document to be added
        @type document: xapian.Document
        """
        # open the database for writing
        self._writer_open()
        self.writer.add_document(document)

    def begin_transaction(self):
        """begin a transaction

        Xapian supports transactions to group multiple database
        modifications.  This avoids intermediate flushing and therefore
        increases performance.
        """
        self._writer_open()
        self.writer.begin_transaction()

    def cancel_transaction(self):
        """cancel an ongoing transaction

        no changes since the last execution of 'begin_transaction' are
        written; the writer is closed afterwards.  The writer must be open
        (see L{begin_transaction}).
        """
        self.writer.cancel_transaction()
        self._writer_close()

    def commit_transaction(self):
        """submit the changes of an ongoing transaction

        all changes since the last execution of 'begin_transaction' are
        written; the writer is closed afterwards.  The writer must be open
        (see L{begin_transaction}).
        """
        self.writer.commit_transaction()
        self._writer_close()

    def get_query_result(self, query):
        """return an object containing the results of a query

        @param query: a pre-compiled xapian query
        @type query: xapian.Query
        @return: an object that allows access to the results
        @rtype: XapianEnquire
        """
        enquire = xapian.Enquire(self.reader)
        enquire.set_query(query)
        return XapianEnquire(enquire)

    def delete_document_by_id(self, docid):
        """delete a specified document

        @param docid: the document ID to be deleted
        @type docid: int
        @return: True if the document was deleted, False if xapian did not
            find a document with this ID
        @rtype: bool
        """
        # open the database for writing
        self._writer_open()
        try:
            self.writer.delete_document(docid)
            return True
        except xapian.DocNotFoundError:
            return False

    def search(self, query, fieldnames):
        """return a list of the contents of specified fields for all matches
        of a query

        @param query: the query to be issued
        @type query: xapian.Query
        @param fieldnames: the name(s) of a field of the document content
        @type fieldnames: string | list of strings
        @return: a list of dicts containing the specified field(s)
        @rtype: list of dicts
        """
        if isinstance(fieldnames, basestring):
            fieldnames = [fieldnames]
        result = []
        try:
            self._walk_matches(query, _extract_fieldvalues,
                    (result, fieldnames))
        except xapian.DatabaseModifiedError:
            self._index_refresh()
            # start over with an empty list: the aborted first attempt may
            # already have collected some matches
            result = []
            self._walk_matches(query, _extract_fieldvalues,
                    (result, fieldnames))
        return result

    def _delete_stale_lock(self):
        """remove the lock file of the database if it is older than 15 minutes

        Nothing is done while this object holds an open writer.
        """
        if self._writer_is_open():
            return
        import errno
        lockfile = os.path.join(self.location, 'flintlock')
        try:
            age_in_minutes = (time.time() - os.path.getmtime(lockfile)) / 60
            if age_in_minutes > 15:
                logging.warning("stale lock found in %s, removing.",
                        self.location)
                os.remove(lockfile)
        except OSError as err:
            # no lock file (or it vanished in the meantime) -> nothing to do
            if err.errno != errno.ENOENT:
                raise

    def _writer_open(self):
        """open write access for the indexing database and acquire an
        exclusive lock

        @raise ValueError: xapian failed to open the database
        """
        if self._writer_is_open():
            return
        self._delete_stale_lock()
        try:
            self.writer = xapian.WritableDatabase(self.location,
                    xapian.DB_OPEN)
        except xapian.DatabaseOpeningError as err_msg:
            raise ValueError("Indexer: failed to open xapian database "
                    "(%s) - maybe it is not a xapian database: %s"
                    % (self.location, str(err_msg)))

    def _writer_close(self):
        """close indexing write access and remove database lock"""
        if self._writer_is_open():
            self.writer.flush()
            self.writer = None

    def _writer_is_open(self):
        """check if the indexing write access is currently open

        @return: True if a writer object is currently held
        @rtype: bool
        """
        return self.writer is not None

    def _index_refresh(self):
        """re-read the indexer database

        @raise ValueError: xapian failed to open the database
        """
        try:
            if self.reader is None:
                self.reader = xapian.Database(self.location)
            else:
                self.reader.reopen()
        except xapian.DatabaseOpeningError as err_msg:
            raise ValueError("Indexer: failed to open xapian database "
                    "(%s) - maybe it is not a xapian database: %s"
                    % (self.location, str(err_msg)))
427 428
class XapianEnquire(CommonIndexer.CommonEnquire):
    """interface to the xapian object for storing sets of matches
    """

    def get_matches(self, start, number):
        """return a specified number of qualified matches of a previous query

        @param start: index of the first match to return (starting from zero)
        @type start: int
        @param number: the number of matching entries to return
        @type number: int
        @return: a set of matching entries and some statistics
        @rtype: tuple of (returned number, available number, matches)
            "matches" is a list of dictionaries with the keys::
                ["rank", "percent", "document", "docid"]
        """
        matches = self.enquire.get_mset(start, number)
        result = []
        for match in matches:
            # use the attributes of the match items instead of the legacy
            # sequence API (match[xapian.MSET_RANK], ...)
            result.append({
                "rank": match.rank,
                "docid": match.docid,
                "percent": match.percent,
                "document": match.document,
            })
        return (matches.size(), matches.get_matches_estimated(), result)
455 456
def _truncate_term_length(term, taken=0):
    """shorten a term string that exceeds the maximum length allowed for
    xapian terms

    @param term: the value of the term, that should be truncated
    @type term: str
    @param taken: number of characters that are already used up (e.g. by
        the name of the term); it reduces the allowed length accordingly
    @type taken: int
    @return: the unchanged term if it fits, otherwise its leading part
    @rtype: str
    """
    allowed = _MAX_TERM_LENGTH - taken
    if len(term) <= allowed:
        return term
    # terms that are too long are cut to one character below the limit
    return term[:allowed - 1]
474
def _extract_fieldvalues(match, result_and_fieldnames):
    """add a dict of field values to a list

    usually this function should be used together with '_walk_matches'
    for traversing a list of matches

    A term belongs to a field if it starts with the upper-cased field name
    followed by a character that is not an upper case ASCII letter.  The
    field name C{None} selects the plain terms instead, i.e. the terms that
    do not start with an upper case ASCII letter.

    @param match: a single match; its "document" item must offer a
        termlist() method yielding objects with a "term" attribute
    @type match: dict
    @param result_and_fieldnames: the pair (result, fieldnames):
        "result" is the list that the resulting dict is appended to and
        "fieldnames" are the names of the fields to be added to the dict
    @type result_and_fieldnames: tuple of (list of dict, list of str)
    """
    result, fieldnames = result_and_fieldnames
    # prepare one matcher per field name - once, not once per term
    matchers = []
    for fname in fieldnames:
        if fname is None:
            # plain terms are taken completely
            matchers.append((fname, re.compile("[^A-Z]"), 0))
        else:
            prefix = str(fname).upper()
            # the field name is data, not a regular expression
            pattern = re.compile(re.escape(prefix) + "[^A-Z]")
            matchers.append((fname, pattern, len(prefix)))
    # fill the dict
    item_fields = {}
    for term in match["document"].termlist():
        for fname, pattern, prefix_length in matchers:
            if pattern.match(term.term):
                # we found a matching field/term
                item_fields.setdefault(fname, []).append(
                        term.term[prefix_length:])
    result.append(item_fields)
504