##############################################################################
#
# Zope Public License (ZPL) Version 1.0
# -------------------------------------
#
# Copyright (c) Digital Creations.  All rights reserved.
#
# This license has been certified as Open Source(tm).
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# 1. Redistributions in source code must retain the above copyright
#    notice, this list of conditions, and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions, and the following disclaimer in
#    the documentation and/or other materials provided with the
#    distribution.
#
# 3. Digital Creations requests that attribution be given to Zope
#    in any manner possible. Zope includes a "Powered by Zope"
#    button that is installed by default. While it is not a license
#    violation to remove this button, it is requested that the
#    attribution remain. A significant investment has been put
#    into Zope, and this effort will continue if the Zope community
#    continues to grow. This is one way to assure that growth.
#
# 4. All advertising materials and documentation mentioning
#    features derived from or use of this software must display
#    the following acknowledgement:
#
#      "This product includes software developed by Digital Creations
#      for use in the Z Object Publishing Environment
#      (http://www.zope.org/)."
#
#    In the event that the product being advertised includes an
#    intact Zope distribution (with copyright and license included)
#    then this clause is waived.
#
# 5. Names associated with Zope or Digital Creations must not be used to
#    endorse or promote products derived from this software without
#    prior written permission from Digital Creations.
#
# 6. Modified redistributions of any form whatsoever must retain
#    the following acknowledgment:
#
#      "This product includes software developed by Digital Creations
#      for use in the Z Object Publishing Environment
#      (http://www.zope.org/)."
#
#    Intact (re-)distributions of any official Zope release do not
#    require an external acknowledgement.
#
# 7. Modifications are encouraged but must be packaged separately as
#    patches to official Zope releases.  Distributions that do not
#    clearly separate the patches from the original work must be clearly
#    labeled as unofficial distributions.  Modifications which do not
#    carry the name Zope may be packaged in any form, as long as they
#    conform to all of the clauses above.
#
#
# Disclaimer
#
#   THIS SOFTWARE IS PROVIDED BY DIGITAL CREATIONS ``AS IS'' AND ANY
#   EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
#   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
#   PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL DIGITAL CREATIONS OR ITS
#   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
#   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
#   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
#   USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
#   ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
#   OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
#   OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
#   SUCH DAMAGE.
#
#
# This software consists of contributions made by Digital Creations and
# many individuals on behalf of Digital Creations.
# Specific attributions are listed in the accompanying credits file.
#
##############################################################################

from Persistence import Persistent
import Acquisition
import BTree, OIBTree, IOBTree, IIBTree
IIBucket=IIBTree.Bucket
from intSet import intSet
from SearchIndex import UnIndex, UnTextIndex, UnKeywordIndex, Query
from SearchIndex.Lexicon import Lexicon
import regex, pdb
from MultiMapping import MultiMapping
from string import lower
import Record
from Missing import MV
from Lazy import LazyMap, LazyFilter, LazyCat


class NoBrainer:
    """ This is the default class that gets instantiated for records
    returned by a __getitem__ on the Catalog.  By default, no special
    methods or attributes are defined.
    """
    pass


class KWMultiMapping(MultiMapping):
    def has_key(self, name):
        try:
            r=self[name]
            return 1
        except KeyError:
            return 0


def orify(seq,
          query_map={
              type(regex.compile('')): Query.Regex,
              type(''): Query.String,
              }):
    subqueries=[]
    for q in seq:
        try: q=query_map[type(q)](q)
        except: q=Query.Cmp(q)
        subqueries.append(q)
    return apply(Query.Or, tuple(subqueries))
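
# Usage sketch for orify (the query values here are hypothetical):
# strings become Query.String subqueries, compiled regular expressions
# become Query.Regex, and anything else falls back to Query.Cmp; the
# subqueries are then combined into a single Query.Or.
#
#   q = orify(['Zope', regex.compile('catalo.*')])
#   # q matches records satisfying either subquery
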
class Catalog(Persistent, Acquisition.Implicit):
    """ An Object Catalog

    An Object Catalog maintains a table of object metadata, and a
    series of manageable indexes to quickly search for objects
    (references in the metadata) that satisfy a search query.

    This class is not Zope specific, and can be used in any Python
    program to build catalogs of objects.  Note that it does require
    the objects to be Persistent, and thus must be used with ZODB3.
    """

    _v_brains = NoBrainer
    _v_result_class = NoBrainer

    def __init__(self, brains=None):

        self.schema = {}    # mapping from attribute name to column number
        self.names = ()     # sequence of column names
        self.indexes = {}   # mapping from index name to index object

        # The catalog maintains a BTree of object meta_data for
        # convenient display on result pages.  meta_data attributes
        # are turned into brain objects and returned by
        # searchResults.  The indexing machinery indexes all records
        # by an integer id (rid).  self.data is a mapping from the
        # integer id to the meta_data, self.uids is a mapping of the
        # object unique identifier to the rid, and self.paths is a
        # mapping of the rid to the unique identifier.

        self.data = BTree.BTree()       # mapping of rid to meta_data
        self.uids = OIBTree.BTree()     # mapping of uid to rid
        self.paths = IOBTree.BTree()    # mapping of rid to uid

        # Indexes can share a lexicon or have a private copy.  Here,
        # we instantiate a lexicon to be shared by all text indexes.
        # This may change.
        self.lexicon = Lexicon()

        if brains is not None:
            self._v_brains = brains

        self.useBrains(self._v_brains)

    def __getitem__(self, index, ttype=type(())):
        """ Returns instances of self._v_brains, or whatever is
        passed into self.useBrains.
        """
        if type(index) is ttype:
            # then it contains a score...
            normalized_score, score, key = index
            r=self._v_result_class(self.data[key]).__of__(self.aq_parent)
            r.data_record_id_ = key
            r.data_record_score_ = score
            r.data_record_normalized_score_ = normalized_score
        else:
            # otherwise no score, set all scores to 1
            r=self._v_result_class(self.data[index]).__of__(self.aq_parent)
            r.data_record_id_ = index
            r.data_record_score_ = 1
            r.data_record_normalized_score_ = 1
        return r

    def __setstate__(self, state):
        """ Initialize your brains.  This method is called when the
        catalog is first activated (from the persistent storage).
        """
        Persistent.__setstate__(self, state)
        self.useBrains(self._v_brains)
        if not hasattr(self, 'lexicon'):
            self.lexicon = Lexicon()

    def useBrains(self, brains):
        """ Sets up the Catalog to return an object (ala ZTables) that
        is created on the fly from the tuple stored in the self.data
        BTree.
        """

        class mybrains(Record.Record, Acquisition.Implicit, brains):
            __doc__ = 'Data record'
            def has_key(self, key):
                return self.__record_schema__.has_key(key)

        scopy = self.schema.copy()

        # it is useful for our brains to know these things
        scopy['data_record_id_']=len(self.schema.keys())
        scopy['data_record_score_']=len(self.schema.keys())+1
        scopy['data_record_normalized_score_']=len(self.schema.keys())+2

        mybrains.__record_schema__ = scopy

        self._v_brains = brains
        self._v_result_class=mybrains

    def addColumn(self, name, default_value=None):
        """ adds a column to the meta data schema """

        schema = self.schema
        names = list(self.names)

        if schema.has_key(name):
            raise 'Column Exists', 'The column exists'

        if schema.values():
            schema[name] = max(schema.values())+1
        else:
            schema[name] = 0
        names.append(name)

        if default_value is None or default_value == '':
            default_value = MV

        for key in self.data.keys():
            rec = list(self.data[key])
            rec.append(default_value)
            self.data[key] = tuple(rec)

        self.names = tuple(names)
        self.schema = schema

        # new column?  update the brain
        self.useBrains(self._v_brains)

        self.__changed__(1)    # why?

    def delColumn(self, name):
        """ deletes a column from the meta data schema """
        if not self.schema.has_key(name):
            return

        names = list(self.names)
        _index = names.index(name)
        names.remove(name)

        # rebuild the schema
        i=0; schema = {}
        for name in names:
            schema[name] = i
            i = i + 1

        self.schema = schema
        self.names = tuple(names)

        # update the brain
        self.useBrains(self._v_brains)

        # remove the column value from each record; delete by position
        # so that duplicate values elsewhere in the record are safe
        for key in self.data.keys():
            rec = list(self.data[key])
            del rec[_index]
            self.data[key] = tuple(rec)

    def addIndex(self, name, type):
        """Create a new index, of one of the following types

        Types: 'FieldIndex', 'TextIndex', 'KeywordIndex'.
        """
        if self.indexes.has_key(name):
            raise 'Index Exists', 'The index specified already exists'

        # this is currently a succession of hacks.  Indexes should be
        # pluggable and manageable

        indexes = self.indexes
        if type == 'FieldIndex':
            indexes[name] = UnIndex.UnIndex(name)
        elif type == 'TextIndex':
            indexes[name] = UnTextIndex.UnTextIndex(name, None, None,
                                                    self.lexicon)
        elif type == 'KeywordIndex':
            indexes[name] = UnKeywordIndex.UnKeywordIndex(name)
        else:
            raise 'Unknown Index Type', (
                "%s invalid - must be one of %s"
                % (type, ['FieldIndex', 'TextIndex', 'KeywordIndex']))

        self.indexes = indexes

    def delIndex(self, name):
        """ deletes an index """

        if not self.indexes.has_key(name):
            raise 'No Index', 'The index specified does not exist'

        indexes = self.indexes
        del indexes[name]
        self.indexes = indexes
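
    # Schema/index setup sketch ('title' and 'meta_type' are
    # hypothetical attribute names):
    #
    #   cat = Catalog()
    #   cat.addColumn('title')                   # stored as brain metadata
    #   cat.addIndex('title', 'TextIndex')       # full-text searchable
    #   cat.addIndex('meta_type', 'FieldIndex')  # exact-match searchable
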
    # the cataloging API

    def catalogObject(self, object, uid, threshold=None):
        """ Adds an object to the Catalog by iteratively applying it
        to all indexes.

        'object' is the object to be cataloged

        'uid' is the unique Catalog identifier for this object
        """
        data = self.data

        if self.uids.has_key(uid):
            i = self.uids[uid]
        elif data:
            i = data.keys()[-1] + 1     # find the next available unique id
        else:
            i = 0

        self.uids[uid] = i
        self.paths[i] = uid

        # meta_data is stored as a tuple for efficiency
        data[i] = self.recordify(object)

        total = 0
        for x in self.indexes.values():
            if hasattr(x, 'index_object'):
                blah = x.index_object(i, object, threshold)
                total = total + blah

        self.data = data

        return total

    def uncatalogObject(self, uid):
        """ Uncatalog an object from the Catalog.  'uid' is the unique
        Catalog identifier for the object.

        Note, the uid must be the same as when the object was
        cataloged, otherwise it will not get removed from the catalog.
        """
        if not self.uids.has_key(uid):
            raise ValueError, "Uncatalog of absent id %s" % `uid`

        rid = self.uids[uid]

        for x in self.indexes.values():
            if hasattr(x, 'unindex_object'):
                try:
                    x.unindex_object(rid)
                except KeyError:
                    pass    # the object was not in this index; ignore

        del self.data[rid]
        del self.uids[uid]
        del self.paths[rid]

    def clear(self):
        """ clear catalog """
        self.data = BTree.BTree()
        self.uids = OIBTree.BTree()
        self.paths = IOBTree.BTree()
        for x in self.indexes.values():
            x.clear()

    def uniqueValuesFor(self, name):
        """ return unique values for FieldIndex name """
        return list(self.indexes[name].uniqueValues())

    def hasuid(self, uid):
        """ return the rid if the catalog contains an object with
        the given uid, otherwise None """
        if self.uids.has_key(uid):
            return self.uids[uid]
        else:
            return None

    def recordify(self, object):
        """ turns an object into a record tuple """

        record = []
        # a record holds the object's value (or MV) for each schema
        # column, in column order; callable attributes are called
        for x in self.names:
            try:
                attr = getattr(object, x)
                if callable(attr):
                    attr = attr()
            except:
                attr = MV
            record.append(attr)

        return tuple(record)

    def instantiate(self, record):
        r=self._v_result_class(record[1])
        r.data_record_id_ = record[0]
        return r.__of__(self)
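
    # Cataloging sketch ('doc' and its uid are hypothetical; the uid
    # passed to uncatalogObject must match the one used when cataloging):
    #
    #   cat.catalogObject(doc, '/docs/elephants')  # index doc under this uid
    #   cat.uncatalogObject('/docs/elephants')     # remove it again
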
    ## Searching engine.  You don't really have to worry about what
    ## goes on below here...

    def _indexedSearch(self, args, sort_index, append, used,
                       IIBType=type(IIBucket()), intSType=type(intSet())):
        """ Iterate through the indexes, applying the query to each one.

        Do some magic to join result sets.  Be intelligent about
        handling intSets and IIBuckets.
        """

        ## import pdb
        ## pdb.set_trace()
        ## I use this so much I'm just leaving it commented out -michel

        rs=None
        data=self.data

        if used is None: used={}
        for i in self.indexes.keys():
            try:
                index = self.indexes[i]
                if hasattr(index, '_apply_index'):
                    r=index._apply_index(args)
                    if r is not None:
                        r, u = r
                        for name in u:
                            used[name]=1
                        if rs is None:
                            rs = r
                        else:
                            # you can't intersect an IIBucket into an
                            # intSet, but you can go the other way
                            # around.  Make sure we're facing the
                            # right direction...
                            if type(rs) is intSType and type(r) is IIBType:
                                rs=r.intersection(rs)
                            else:
                                rs=rs.intersection(r)
            except:
                return used

        if rs is None:
            if sort_index is None:
                rs=data.items()
                append(LazyMap(self.instantiate, rs))
            else:
                for k, intset in sort_index._index.items():
                    append((k, LazyMap(self.__getitem__, intset)))
        elif rs:
            if type(rs) is IIBType:
                # then there is score information.  Build a new result
                # set, sort it by score, reverse it, compute the
                # normalized score, and Lazify it.
                rset = []
                for key, score in rs.items():
                    rset.append((score, key))
                rset.sort()
                rset.reverse()
                max_score = float(rset[0][0])
                rs = []
                for score, key in rset:
                    rs.append((int((score/max_score)*100), score, key))
                append(LazyMap(self.__getitem__, rs))
            elif sort_index is None and type(rs) is intSType:
                # no scores?  Just Lazify.
                append(LazyMap(self.__getitem__, rs))
            else:
                # sort.  If there are scores, then this block is not
                # reached, therefore 'sort-on' does not happen in the
                # context of a text index query.  This should probably
                # sort by relevance first, then the 'sort-on' attribute.
                for k, intset in sort_index._index.items():
                    intset=intset.intersection(rs)
                    if intset:
                        append((k, LazyMap(self.__getitem__, intset)))

        return used

    def searchResults(self, REQUEST=None, used=None,
                      query_map={
                          type(regex.compile('')): Query.Regex,
                          type([]): orify,
                          type(''): Query.String,
                          }, **kw):

        # Get search arguments:
        if REQUEST is None and not kw:
            try: REQUEST=self.REQUEST
            except: pass
        if kw:
            if REQUEST:
                m=KWMultiMapping()
                m.push(REQUEST)
                m.push(kw)
                kw=m
        elif REQUEST: kw=REQUEST

        # Make sure batch size is set
        if REQUEST and not REQUEST.has_key('batch_size'):
            try: batch_size=self.default_batch_size
            except: batch_size=20
            REQUEST['batch_size']=batch_size

        # Compute "sort_index", which is a sort index, or None:
        if kw.has_key('sort-on'):
            sort_index=kw['sort-on']
        elif hasattr(self, 'sort-on'):
            sort_index=getattr(self, 'sort-on')
        elif kw.has_key('sort_on'):
            sort_index=kw['sort_on']
        else:
            sort_index=None
        sort_order=''
        if sort_index is not None and sort_index in self.indexes.keys():
            sort_index=self.indexes[sort_index]

        # Perform searches with indexes and sort_index
        r=[]
        used=self._indexedSearch(kw, sort_index, r.append, used)
        if not r:
            return r

        # Sort/merge sub-results
        if len(r)==1:
            if sort_index is None:
                r=r[0]
            else:
                r=r[0][1]
        else:
            if sort_index is None:
                r=LazyCat(r)
            else:
                r.sort()
                if kw.has_key('sort-order'):
                    so=kw['sort-order']
                elif hasattr(self, 'sort-order'):
                    so=getattr(self, 'sort-order')
                elif kw.has_key('sort_order'):
                    so=kw['sort_order']
                else:
                    so=None
                if (type(so) is type('') and
                    lower(so) in ('reverse', 'descending')):
                    r.reverse()
                r=LazyCat(map(lambda i: i[1], r))

        return r

    __call__ = searchResults
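
# Search sketch (hypothetical index and column names).  Keyword
# arguments are matched against index names by _indexedSearch;
# 'sort_on'/'sort-on' and 'sort_order'/'sort-order' select and order
# a sort index:
#
#   results = catalog.searchResults(meta_type='Document',
#                                   sort_on='meta_type',
#                                   sort_order='descending')
#   for brain in results:
#       print brain.title, brain.data_record_id_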