1. 快速尝试

Jiahua Huang <[email protected]>
reply-to        [email protected],
to      "python. cn" <[email protected]>,
date    Nov 6, 2007 10:11 PM
subject [CPyUG:34610] 试了下 Durus 对象数据库
mailing list    <python-cn.googlegroups.com> Filter messages from this mailing list
mailed-by       googlegroups.com

晚饭前跟大妈聊了会 Durus。Durus 是 Quixote 团队的作品，一个类似 ZODB 的轻量级对象数据库。

顺便测试了下 Durus 处理大数据库的情况

1.1. 测试用例

   1 ##用的倒排索引全文搜索类
   2 class WordIndex:
   3    '''简单的倒排索引
   4    '''
   5    def __init__(self, wordDict={}, Dict=dict, commit=None):
   6        '''dWordsIndex 是索引字典
   7        '''
   8        self.dWordsIndex = wordDict
   9        self.Dict = Dict #用 btree 或 OOBTree
  10        self.commit = commit
  11    def addIndex(self, text, uid=None):
  12        '''添加索引
  13        '''
  14        dWordsIndex = self.dWordsIndex
  15        Dict = self.Dict
  16        uid = uid or _str2hash(text)
  17        words = getWordFs(text) #分词,带词频
  18        for word, f in words.iteritems():
  19            if not word in dWordsIndex:
  20                dWordsIndex[word]=Dict()
  21            #Uids = dWordsIndex[word]
  22            #Uids[uid] = f
  23            dWordsIndex[word][uid] = f
  24        if self.commit: self.commit()
  25        return uid
  26    def delIndex(self, text, uid=None):
  27        '''删除索引
  28        '''
  29        dWordsIndex = self.dWordsIndex
  30        uid = uid or _str2hash(text)
  31        words = getWords(text)
  32        if not words: return False
  33        for word in words:
  34            if word in dWordsIndex and uid in dWordsIndex[word]:
  35                del dWordsIndex[word][uid]
  36        if self.commit: self.commit()
  37        return uid
  38    def query(self, QueryString):
  39        '''
  40        @TODO:
  41        '''
  42        pass
  43    def searchIndex(self, text):
  44        '''搜索文章，返回 uid
  45        '''
  46        dWordsIndex = self.dWordsIndex
  47        words = getWords(text) #分词
  48        if not words: return []
  49        dicts = map(dWordsIndex.get, words)
  50        while None in dicts: dicts.remove(None) ## 怎么来的 None?
  51        if not dicts : return []
  52        dicts = map(lambda i:i[1], sorted(map(lambda i:(len(i),i),
  53 dicts))) ##按字典长度排序，先捅掉较短的
  54        #return reduce(lambda d1,d2: set(d1) & set(d2), dicts)
  55        return eval('&'.join(map(lambda i:'set(dicts[%s])'%i,
  56 xrange(0, len(dicts))))) ##更快?
  57    def _dumpIndex(self):
  58        print '{'
  59        for i,t in self.dWordsIndex.iteritems(): print "'%s' : %s,"%(i,t)
  60        print '}'
  61 
  62 
  63 class _TextIndex(WordIndex):
  64    '''演示用 WordIndex 索引
  65    '''
  66    def __init__(self, wordDict={}, textDict={}, Dict=dict, commit=None):
  67        self.dWordsIndex = wordDict
  68        self.dTextDict = textDict
  69        self.commit = commit
  70        self.Dict = dict
  71        WordIndex.__init__(self, wordDict, Dict, commit)
  72    def add(self, text, uid=None):
  73        dTextDict = self.dTextDict
  74        uid = uid or _str2hash(text)
  75        dTextDict[uid] = text
  76        self.addIndex(text, uid)
  77        return uid
  78    def delete(uid):
  79        dTextDict = self.dTextDict
  80        if not uid in dTextDict: return False
  81        text = dTextDict[uid]
  82        self.delIndex(text, uid)
  83        del dTextDict[uid]
  84    def search(self, text):
  85        rev = self.searchIndex(text)
  86        return rev
  87    def _search(self, text):
  88        dTextDict = self.dTextDict
  89        rev = self.searchIndex(text)
  90        if not rev: return False
  91        word1 = cutword.cutword(text)[0].decode('utf8', 'ignore')
  92        print rev
  93        for uid in rev:
  94            print '=== %s ==='%uid
  95            text = dTextDict[uid].decode('utf8', 'ignore')
  96            ord = text.rfind(word1)
  97            print text[max(0, ord-120):ord+120]
  98            print
  99    def _words4uid(self, uid):
 100        '对比的暴力搜索'
 101        dWordsIndex = self.dWordsIndex
 102        for word, uids in dWordsIndex.iteritems():
 103            if uid in uids: print word,
 104    def _uids4word(self, word):
 105        '对比的暴力搜索'
 106        dTextDict = self.dTextDict
 107        for uid, text in dTextDict.iteritems():
 108            if word in text: print uid,
 109 
 110 
 111 ##连接 Durus 数据库用
 112 def getdb_durus():
 113    #from durus.file_storage import FileStorage
 114    #from durus.connection import Connection
 115    #connection = Connection(FileStorage("testIndexWord.durus"))
 116    # 用 Durus 服务
 117    from durus.client_storage import ClientStorage
 118    from durus.connection import Connection
 119    connection = Connection(ClientStorage())
 120    ##
 121    root = connection.get_root()
 122    from durus import persistent, persistent_dict, persistent_list
 123    from durus import btree
 124    Tree = btree.BTree
 125    List = persistent_list.PersistentList
 126    def getdb(name):
 127        if not name in root:
 128            root[name] = Tree()
 129            connection.commit()
 130        return root[name]
 131    dTextDict = getdb('TextDict')
 132    dWordsIndex = getdb('WordsIndex')
 133    commit = connection.commit
 134    ti = _TextIndex(wordDict=dWordsIndex, textDict=dTextDict,
 135 Dict=Tree, commit=commit)
 136    return ti
 137 
 138 
 139 
 140 ##用下边的生成随机字符串来填充数据库
 141 import random
 142 def _randomword(n=1, m=7):
 143    return ''.join(map(lambda
 144 i:random.choice('abcdefghijklmnopqrstuvwxyz'),
 145 range(random.randrange(n, m))))
 146 
 147 def _randomtext(n=10, m=100):
 148    return ' '.join(map(lambda i:_randomword(1, 3),
 149 range(random.randrange(n, m))))
 150 
 151 def _randzhtext(n=10, m=100):
 152    return ''.join(map(lambda i:unichr(random.randrange(19968,
 153 21000)).encode('utf8'), range(random.randrange(n, m))))
 154 
 155 
 156 ##用这个测试运行时间
 157 import time
 158 def _timeit(_src):
 159    '''测试 src 运行时间
 160    '''
 161    exec('''
 162 _t0 = time.time()
 163 %s
 164 _t1 = time.time()
 165 _t3 = _t1 - _t0
 166 '''%_src)
 167    return _t3
 168 
 169 ## 用这个来随机填充英文
 170 _timeit("for i in range(300000): print i, ti.add(_randomtext(10,100),
 171 _randomword(2,11)) ")
 172 ## 用这个来随机填充中文
 173 _timeit("for i in range(300000): print i, ti.add(_randzhtext(10,100),
 174 _randzhtext(2, 10))  ")

1.2. 测试行动

插入 59W 条随机字符的文章后 SWAP 占用 500M, 然后客户端被 Linux 杀掉

重新连接,测试搜索

>>> _timeit(ti.search('东东'))
1.5020370483398438e-05
>>> _timeit(ti.search('东东'))
4.0531158447265625e-06

## 下边去掉了 print
>>> _timeit("ti._words4uid('东东')")
0.074419975280761719
>>> _timeit("ti._words4uid('东东')")
0.074203968048095703

>>> _timeit("ti._uids4word('东东')")
1.1149060726165771
>>> _timeit("ti._uids4word('东东')")
1.1026270389556885

这时的数据库文件大小为 1.6G

-rw-r--r-- 1 huahua huahua 1.6G 2007-11-06 20:37 testIndexWord.durus

1. 快速尝试

1.1. 测试用例

1.2. 测试行动

2. 反馈