Package xappy :: Module fieldactions
[frames | no frames]

Source Code for Module xappy.fieldactions

  1  #!/usr/bin/env python 
  2  # 
  3  # Copyright (C) 2007 Lemur Consulting Ltd 
  4  # 
  5  # This program is free software; you can redistribute it and/or modify 
  6  # it under the terms of the GNU General Public License as published by 
  7  # the Free Software Foundation; either version 2 of the License, or 
  8  # (at your option) any later version. 
  9  # 
 10  # This program is distributed in the hope that it will be useful, 
 11  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 12  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 13  # GNU General Public License for more details. 
 14  #  
 15  # You should have received a copy of the GNU General Public License along 
 16  # with this program; if not, write to the Free Software Foundation, Inc., 
 17  # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
 18  r"""fieldactions.py: Definitions and implementations of field actions. 
 19   
 20  """ 
 21  __docformat__ = "restructuredtext en" 
 22   
 23  import _checkxapian 
 24  import errors 
 25  import marshall 
 26  from replaylog import log 
 27  import xapian 
 28  import parsedate 
 29   
def _act_store_content(fieldname, doc, value, context):
    """Perform the STORE_CONTENT action.

    Appends `value` to the list kept in ``doc.data[fieldname]``, creating
    that list first if the field has no stored data yet.  `context` is not
    used by this action.

    """
    try:
        stored = doc.data[fieldname]
    except KeyError:
        # First value seen for this field: start a new list and register it.
        stored = doc.data[fieldname] = []
    stored.append(value)
40
def _act_index_exact(fieldname, doc, value, context):
    """Perform the INDEX_EXACT action.

    The field value is passed to ``doc.add_term`` unmodified, so the whole
    value is indexed as one term for `fieldname`.  `context` is not used by
    this action.

    """
    term = value
    # NOTE(review): the trailing 0 is presumably the wdf increment for the
    # term - confirm against the add_term() implementation of `doc`.
    doc.add_term(fieldname, term, 0)
46
def _act_tag(fieldname, doc, value, context):
    """Perform the TAG action.

    The tag is lowercased before being passed to ``doc.add_term``, which is
    what makes tag matching case insensitive.  `context` is not used by this
    action.

    """
    lowered = value.lower()
    doc.add_term(fieldname, lowered, 0)
52
def _act_facet(fieldname, doc, value, context, type=None):
    """Perform the FACET action.

    - For a `type` of None or 'string', the value is lowercased, added to
      the document as a term, and appended to the string list already held
      in the document's 'facet' value for this field.
    - For any other `type`, the value is converted with the function that
      `SortableMarshaller` supplies for that type and stored directly as
      the document's 'facet' value for this field.

    `context` is not used by this action.

    """
    if not (type is None or type == 'string'):
        # Non-string facet: store the marshalled form only; no term is added.
        marshall_fn = SortableMarshaller().get_marshall_function(fieldname,
                                                                 type)
        doc.add_value(fieldname, marshall_fn(fieldname, value), 'facet')
        return

    term = value.lower()
    doc.add_term(fieldname, term, 0)

    # Extend the existing serialised list rather than replacing it, so that
    # repeated instances of the field accumulate in the same value.
    existing = doc.get_value(fieldname, 'facet')
    serialiser = log(xapian.StringListSerialiser, existing)
    serialiser.append(term)
    doc.add_value(fieldname, serialiser.get(), 'facet')
68
def _act_index_freetext(fieldname, doc, value, context, weight=1,
                        language=None, stop=None, spell=False,
                        nopos=False,
                        allow_field_specific=True,
                        search_by_default=True):
    """Perform the INDEX_FREETEXT action.

    A term generator is configured from the optional parameters and run over
    `value`, adding terms to ``doc._doc``:

    - `weight` is passed through to the term generator's indexing call.
    - `language`, if not None, selects the stemmer.
    - `stop`, if not None, is an iterable of words added to a stopper.
    - `spell`, if true, attaches ``context.index`` as the database and turns
      on the term generator's FLAG_SPELLING flag.
    - `nopos`, if true, indexes without positional information.
    - `search_by_default`, if true, indexes the text with an empty prefix.
    - `allow_field_specific`, if true, indexes the text again using the
      field's prefix (when that prefix is non-empty).

    Both copies start at ``context.current_position``; afterwards the
    position is advanced by a gap of 10 and written back to `context`.

    """
    generator = log(xapian.TermGenerator)

    if language is not None:
        generator.set_stemmer(log(xapian.Stem, language))

    if stop is not None:
        stopper = log(xapian.SimpleStopper)
        for stopword in stop:
            stopper.add(stopword)
        generator.set_stopper(stopper)

    if spell:
        generator.set_database(context.index)
        generator.set_flags(generator.FLAG_SPELLING)

    generator.set_document(doc._doc)

    def index_with_prefix(term_prefix):
        # Each copy of the text starts from the same position, so the
        # unprefixed and prefixed terms line up with each other.
        generator.set_termpos(context.current_position)
        if nopos:
            generator.index_text_without_positions(value, weight, term_prefix)
        else:
            generator.index_text(value, weight, term_prefix)

    if search_by_default:
        # Store a copy of the field without a prefix, for non-field-specific
        # searches.
        index_with_prefix('')

    if allow_field_specific:
        # Store a second copy of the term with a prefix, for field-specific
        # searches.
        field_prefix = doc._fieldmappings.get_prefix(fieldname)
        if len(field_prefix) != 0:
            index_with_prefix(field_prefix)

    # Add a gap between each field instance, so that phrase searches don't
    # match across instances.
    generator.increase_termpos(10)
    context.current_position = generator.get_termpos()
117
class SortableMarshaller(object):
    """Implementation of marshalling for sortable values.

    Each ``marshall_*`` method takes a field name and a value and returns
    the form of the value to store for sorting.  Invalid input is reported
    by raising the error class chosen at construction time.

    """

    def __init__(self, indexing=True):
        """Choose the error class raised for bad values.

        - `indexing`: if true (the default), problems are raised as
          `errors.IndexerError`; otherwise as `errors.SearchError`.

        """
        if indexing:
            self._err = errors.IndexerError
        else:
            self._err = errors.SearchError

    def marshall_string(self, fieldname, value):
        """Marshall a value for sorting in lexicographical order.

        This returns the input as the output, since strings already sort in
        lexicographical order.

        """
        return value

    def marshall_float(self, fieldname, value):
        """Marshall a value for sorting as a floating point value.

        Raises the configured error class if `value` cannot be converted by
        Python's float().

        """
        # float() raises ValueError for malformed strings but TypeError for
        # unsupported types such as None; report both the same way so that
        # callers only ever see the configured error class.
        try:
            number = float(value)
        except (TypeError, ValueError):
            raise self._err("Value supplied to field %r must be a "
                            "valid floating point number: was %r" %
                            (fieldname, value))
        return marshall.float_to_string(number)

    def marshall_date(self, fieldname, value):
        """Marshall a value for sorting as a date.

        Raises the configured error class if `parsedate.date_from_string`
        rejects `value` with a ValueError.

        """
        try:
            value = parsedate.date_from_string(value)
        except ValueError as e:
            raise self._err("Value supplied to field %r must be a "
                            "valid date: was %r: error is '%s'" %
                            (fieldname, value, str(e)))
        return marshall.date_to_string(value)

    def get_marshall_function(self, fieldname, sorttype):
        """Get a function used to marshall values of a given sorttype.

        `sorttype` may be None or 'string' (both give `marshall_string`),
        'float' or 'date'.  Any other value raises the configured error
        class; `fieldname` is only used in that error message.

        """
        marshallers = {
            None: self.marshall_string,
            'string': self.marshall_string,
            'float': self.marshall_float,
            'date': self.marshall_date,
        }
        try:
            return marshallers[sorttype]
        except (KeyError, TypeError):
            # TypeError covers unhashable sort types (eg, a list), which
            # cannot even be looked up in the table.
            raise self._err("Unknown sort type %r for field %r" %
                            (sorttype, fieldname))
176 177
def _act_sort_and_collapse(fieldname, doc, value, context, type=None):
    """Perform the SORT_AND_COLLAPSE action.

    The value is converted with the function `SortableMarshaller` supplies
    for `type` (None is handled like 'string') and stored as the document's
    'collsort' value for this field.  `context` is not used by this action.

    """
    to_sortable = SortableMarshaller().get_marshall_function(fieldname, type)
    marshalled = to_sortable(fieldname, value)
    doc.add_value(fieldname, marshalled, 'collsort')
186
class ActionContext(object):
    """The context in which an action is performed.

    A small mutable holder handed to every action, so that state such as
    the running word position can be shared between them.

    """

    def __init__(self, index):
        """Create a context bound to `index`.

        - `index` is kept as ``self.index``; the free text action passes it
          to the term generator as the database when spelling is enabled.

        """
        self.index = index
        # Next term position to use; the free text action reads this and
        # writes the advanced position back after indexing each field.
        self.current_position = 0
        # Starts out unset.  NOTE(review): not read by the actions in this
        # module - presumably maintained by callers; confirm.
        self.current_language = None
198
class FieldActions(object):
    """An object describing the actions to be performed on a field.

    The supported actions are:

    - `STORE_CONTENT`: store the unprocessed content of the field in the search
      engine database.  All fields which need to be displayed or used when
      displaying the search results need to be given this action.

    - `INDEX_EXACT`: index the exact content of the field as a single search
      term.  Fields whose contents need to be searchable as an "exact match"
      need to be given this action.

    - `INDEX_FREETEXT`: index the content of this field as text.  The content
      will be split into terms, allowing free text searching of the field.
      The following optional parameters may be supplied:

      - 'weight' is a multiplier to apply to the importance of the field.  This
        must be an integer, and the default value is 1.
      - 'language' is the language to use when processing the field.  This can
        be expressed as an ISO 2-letter language code.  The supported languages
        are those supported by the xapian core in use.
      - 'stop' is an iterable of stopwords to filter out of the generated
        terms.  Note that due to Xapian design, only non-positional terms are
        affected, so this is of limited use.
      - 'spell' is a boolean flag - if true, the contents of the field will be
        used for spelling correction.
      - 'nopos' is a boolean flag - if true, positional information is not
        stored.
      - 'allow_field_specific' is a boolean flag - if False, prevents terms
        with the field prefix being generated.  This means that searches
        specific to this field will not work, and thus should only be used
        when only non-field specific searches are desired.  Defaults to True.
      - 'search_by_default' is a boolean flag - if False, the field will not be
        searched by non-field specific searches.  If True, or omitted, the
        field will be included in searches for non field-specific searches.

    - `SORTABLE`: index the content of the field such that it can be used to
      sort result sets.  It also allows result sets to be restricted to those
      documents with field values in a given range.  One optional parameter
      may be supplied:

      - 'type' is a value indicating how to sort the field.  It has several
        possible values:

        - 'string' - sort in lexicographic (ie, alphabetical) order.
          This is the default, used if no type is set.
        - 'float' - treat the values as (decimal representations of) floating
          point numbers, and sort in numerical order.  The values in the field
          must be valid floating point numbers (according to Python's float()
          function).
        - 'date' - sort in date order.  The values must be valid dates (either
          Python datetime.date objects, or ISO 8601 format (ie, YYYYMMDD or
          YYYY-MM-DD)).

    - `COLLAPSE`: index the content of the field such that it can be used to
      "collapse" result sets, such that only the highest result with each value
      of the field will be returned.

    - `TAG`: the field contains tags; these are strings, which will be matched
      in a case insensitive way, but otherwise must be exact matches.  Tag
      fields can be searched for by making an explicit query (ie, using
      query_field(), but not with query_parse()).  A list of the most frequent
      tags in a result set can also be accessed easily.

    - `FACET`: the field represents a classification facet; these are strings
      which will be matched exactly, but a list of all the facets present in
      the result set can also be accessed easily - in addition, a suitable
      subset of the facets, and a selection of the facet values, present in the
      result set can be calculated.  One optional parameter may be supplied:

      - 'type' is a value indicating the type of facet contained in the field:

        - 'string' - the facet values are strings (they are lowercased
          before being indexed).  This is the default, used if no type is
          set.
        - 'float' - the facet values are floating point numbers.

    """

    # See the class docstring for the meanings of the following constants.
    STORE_CONTENT = 1
    INDEX_EXACT = 2
    INDEX_FREETEXT = 3
    SORTABLE = 4
    COLLAPSE = 5
    TAG = 6
    FACET = 7

    # Sorting and collapsing store the data in a value, but the format depends
    # on the sort type.  Easiest way to implement is to treat them as the same
    # action.
    SORT_AND_COLLAPSE = -1

    # Actions which cannot be used because the xapian in use lacks the
    # feature they rely on; filled in once, when the class is created.
    _unsupported_actions = []

    if 'tags' in _checkxapian.missing_features:
        _unsupported_actions.append(TAG)
    if 'facets' in _checkxapian.missing_features:
        _unsupported_actions.append(FACET)

    def __init__(self, fieldname):
        """Create an empty set of actions for the field named `fieldname`."""
        # Dictionary of actions, keyed by type.  Each value is a list of
        # parameter dictionaries, one for each time the action was added.
        self._actions = {}
        self._fieldname = fieldname

    def add(self, field_mappings, action, **kwargs):
        """Add an action to perform on a field.

        - `field_mappings` is the object on which any term prefix or value
          slot required by the action is registered (through its
          `add_prefix`, `get_slot` and `add_slot` methods).
        - `action` is one of the public action constants of this class.
        - `kwargs` holds the optional parameters of the action; see the
          class docstring for the parameters each action accepts.

        Raises `errors.IndexerError` if the action is unsupported or
        unknown, if a parameter name is not valid for the action, if the
        field would be indexed both as exact and as free text, or if it is
        already sortable with a different sort type.

        Adding an action again with identical parameters has no effect.

        """
        if action in self._unsupported_actions:
            raise errors.IndexerError("Action unsupported with this release of xapian")

        # SORT_AND_COLLAPSE is deliberately absent: it is an internal action
        # which callers reach through SORTABLE or COLLAPSE.
        if action not in (FieldActions.STORE_CONTENT,
                          FieldActions.INDEX_EXACT,
                          FieldActions.INDEX_FREETEXT,
                          FieldActions.SORTABLE,
                          FieldActions.COLLAPSE,
                          FieldActions.TAG,
                          FieldActions.FACET,
                         ):
            raise errors.IndexerError("Unknown field action: %r" % action)

        # Looked up before SORTABLE/COLLAPSE are remapped below, so that the
        # name used in error messages is the one the caller asked for.
        action_name, allowed_params, _, requirements = \
            self._action_info[action]

        # Check parameter names
        for key in kwargs:
            if key not in allowed_params:
                raise errors.IndexerError("Unknown parameter name for action %r: %r" % (action_name, key))

        # Fields cannot be indexed both with "EXACT" and "FREETEXT": whilst we
        # could implement this, the query parser wouldn't know what to do with
        # searches.
        if action == FieldActions.INDEX_EXACT:
            if FieldActions.INDEX_FREETEXT in self._actions:
                raise errors.IndexerError("Field %r is already marked for indexing "
                                          "as free text: cannot mark for indexing "
                                          "as exact text as well" % self._fieldname)
        elif action == FieldActions.INDEX_FREETEXT:
            if FieldActions.INDEX_EXACT in self._actions:
                raise errors.IndexerError("Field %r is already marked for indexing "
                                          "as exact text: cannot mark for indexing "
                                          "as free text as well" % self._fieldname)

        # Fields cannot be indexed as more than one type for "SORTABLE": to
        # implement this, we'd need to use a different prefix for each sortable
        # type, but even then the search end wouldn't know what to sort on when
        # searching.  Also, if they're indexed as "COLLAPSE", the value must be
        # stored in the right format for the type "SORTABLE".
        if action in (FieldActions.SORTABLE, FieldActions.COLLAPSE):
            if action == FieldActions.COLLAPSE:
                # COLLAPSE has no type of its own: None means "whatever the
                # sort type is".
                sorttype = None
            else:
                sorttype = kwargs.get('type', 'string')
            kwargs['type'] = sorttype
            action = FieldActions.SORT_AND_COLLAPSE

            oldsortactions = self._actions.get(action, ())
            if oldsortactions:
                # The type of the most recently recorded entry is the one
                # currently in force.
                oldsorttype = oldsortactions[-1]['type']

                if sorttype == oldsorttype or oldsorttype is None:
                    # Use new type
                    self._actions[action] = []
                elif sorttype is None:
                    # Use old type
                    return
                else:
                    raise errors.IndexerError("Field %r is already marked for "
                                              "sorting, with a different "
                                              "sort type" % self._fieldname)

        if 'prefix' in requirements:
            field_mappings.add_prefix(self._fieldname)
        if 'slot' in requirements:
            purposes = requirements['slot']
            # NOTE: `basestring` exists only in Python 2, which is what this
            # module targets.
            if isinstance(purposes, basestring):
                field_mappings.add_slot(self._fieldname, purposes)
            else:
                # Several purposes share one slot: reuse the slot already
                # allocated for any of them, if there is one.
                slotnum = None
                for purpose in purposes:
                    slotnum = field_mappings.get_slot(self._fieldname, purpose)
                    if slotnum is not None:
                        break
                for purpose in purposes:
                    field_mappings.add_slot(self._fieldname, purpose, slotnum=slotnum)

        # Make an entry for the action
        recorded = self._actions.setdefault(action, [])

        # Check for repetitions of actions
        if kwargs in recorded:
            return

        # Append the action to the list of actions
        recorded.append(kwargs)

    def perform(self, doc, value, context):
        """Perform the actions on the field.

        - `doc` is a ProcessedDocument to store the result of the actions in.
        - `value` is a string holding the value of the field.
        - `context` is an ActionContext object used to keep state in.

        """
        for action, actionlist in self._actions.items():
            handler = self._action_info[action][2]
            for kwargs in actionlist:
                handler(self._fieldname, doc, value, context, **kwargs)

    # Description of each action, keyed by action constant.  Each entry is a
    # tuple of: the action's name (used in error messages), the names of the
    # parameters it accepts, the function which performs it (None for the
    # actions which are remapped to SORT_AND_COLLAPSE by add()), and a
    # dictionary saying whether it needs a term prefix and/or a value slot.
    _action_info = {
        STORE_CONTENT: ('STORE_CONTENT', (), _act_store_content, {}, ),
        INDEX_EXACT: ('INDEX_EXACT', (), _act_index_exact, {'prefix': True}, ),
        INDEX_FREETEXT: ('INDEX_FREETEXT', ('weight', 'language', 'stop', 'spell', 'nopos', 'allow_field_specific', 'search_by_default', ),
            _act_index_freetext, {'prefix': True, }, ),
        SORTABLE: ('SORTABLE', ('type', ), None, {'slot': 'collsort',}, ),
        COLLAPSE: ('COLLAPSE', (), None, {'slot': 'collsort',}, ),
        TAG: ('TAG', (), _act_tag, {'prefix': True,}, ),
        FACET: ('FACET', ('type', ), _act_facet, {'prefix': True, 'slot': 'facet',}, ),

        SORT_AND_COLLAPSE: ('SORT_AND_COLLAPSE', ('type', ), _act_sort_and_collapse, {'slot': 'collsort',}, ),
    }
if __name__ == '__main__':
    # When run as a script, execute the doctests found in this module.
    import doctest
    import sys
    this_module = sys.modules[__name__]
    doctest.testmod(this_module)