::-- ZoomQuiet [2007-11-06 14:25:17]
1. 快速尝试
From: Jiahua Huang <[email protected]>
Reply-To: [email protected]
To: "python.cn" <[email protected]>
Date: Nov 6, 2007 10:11 PM
Subject: [CPyUG:34610] 试了下 Durus 对象数据库
Mailing list: <python-cn.googlegroups.com>
Mailed-by: googlegroups.com
晚饭前跟大妈聊了会 Durus。Durus 是 Quixote 团队的作品,是一个类似 ZODB 的轻量级对象数据库。
顺便测试了下 Durus 处理大数据库的情况
1.1. 测试用例
1 ##用的倒排索引全文搜索类
class WordIndex:
    """Simple inverted index: maps each word to a ``{uid: frequency}`` mapping.

    The tokenizers ``getWords`` / ``getWordFs`` and the hashing helper
    ``_str2hash`` are used below but are not defined in this listing; they
    must be provided by the surrounding module.
    """

    def __init__(self, wordDict=None, Dict=dict, commit=None):
        """Build an index on top of an existing (possibly persistent) mapping.

        wordDict -- mapping used as the index (word -> {uid: frequency});
                    a new ``dict`` is created when omitted.
        Dict     -- factory for the per-word posting mapping, e.g. ``dict``
                    or a BTree class.
        commit   -- optional zero-argument callable invoked after each
                    modification (e.g. a database connection's ``commit``).
        """
        # A ``{}`` default argument would be shared by every instance that
        # relies on it, so the fallback dict is created per instance.
        self.dWordsIndex = {} if wordDict is None else wordDict
        self.Dict = Dict
        self.commit = commit

    def addIndex(self, text, uid=None):
        """Index *text* under *uid* and return the uid that was used.

        A falsy *uid* is replaced by ``_str2hash(text)``.
        """
        dWordsIndex = self.dWordsIndex
        uid = uid or _str2hash(text)
        # Tokenize with word frequencies; assumed to return a mapping
        # word -> frequency (the original iterated it with ``iteritems``).
        words = getWordFs(text)
        for word in words:
            if word not in dWordsIndex:
                dWordsIndex[word] = self.Dict()
            dWordsIndex[word][uid] = words[word]
        # Commit once, after every word of the text has been indexed.
        if self.commit:
            self.commit()
        return uid

    def delIndex(self, text, uid=None):
        """Remove *uid* from the postings of every word of *text*.

        Returns False when *text* yields no words, otherwise the uid.
        A falsy *uid* is replaced by ``_str2hash(text)``.
        """
        dWordsIndex = self.dWordsIndex
        uid = uid or _str2hash(text)
        words = getWords(text)
        if not words:
            return False
        for word in words:
            if word in dWordsIndex and uid in dWordsIndex[word]:
                del dWordsIndex[word][uid]
        if self.commit:
            self.commit()
        return uid

    def query(self, QueryString):
        """Placeholder for a query language; not implemented (returns None).

        @TODO:
        """
        pass

    def _intersectPostings(self, postings):
        """Return the set of uids that occur in every mapping of *postings*."""
        if not postings:
            return set()
        # Shortest posting first keeps the candidate set as small as
        # possible.  ``key=len`` never compares the mappings themselves,
        # which would fail on equal lengths under Python 3.
        ordered = sorted(postings, key=len)
        found = set(ordered[0])
        for posting in ordered[1:]:
            if not found:
                break
            # Membership tests against the larger mapping avoid copying it
            # into a temporary set.
            found = set([uid for uid in found if uid in posting])
        return found

    def searchIndex(self, text):
        """Search for *text* and return the matching uids.

        Returns an empty list when *text* yields no words or none of its
        words is indexed; otherwise a ``set`` of the uids common to all
        query words that are present in the index.  Query words missing
        from the index are ignored rather than forcing an empty result.
        """
        words = getWords(text)  # tokenize
        if not words:
            return []
        # ``get`` yields None for words that were never indexed; drop those.
        postings = [p for p in map(self.dWordsIndex.get, words)
                    if p is not None]
        if not postings:
            return []
        return self._intersectPostings(postings)

    def _dumpIndex(self):
        """Print the whole index in a dict-like layout (debugging aid)."""
        dWordsIndex = self.dWordsIndex
        print('{')
        for word in dWordsIndex:
            print("'%s' : %s," % (word, dWordsIndex[word]))
        print('}')
61
62
class _TextIndex(WordIndex):
    """Demo store keeping the full texts next to a ``WordIndex`` over them.

    ``_str2hash`` and the ``cutword`` module are used below but are not
    defined or imported in this listing.
    """

    def __init__(self, wordDict=None, textDict=None, Dict=dict, commit=None):
        """Create the store.

        wordDict -- mapping used as the word index; new ``dict`` if omitted.
        textDict -- mapping uid -> text; new ``dict`` if omitted.
        Dict     -- factory for per-word posting mappings.
        commit   -- optional zero-argument callable run after modifications.
        """
        # Resolve the defaults here so every instance gets its own
        # containers instead of sharing one mutable default object.
        if wordDict is None:
            wordDict = {}
        if textDict is None:
            textDict = {}
        WordIndex.__init__(self, wordDict, Dict, commit)
        self.dTextDict = textDict

    def add(self, text, uid=None):
        """Store *text* under *uid*, index it, and return the uid.

        A falsy *uid* is replaced by ``_str2hash(text)``.
        """
        uid = uid or _str2hash(text)
        self.dTextDict[uid] = text
        self.addIndex(text, uid)
        return uid

    def delete(self, uid):
        """Remove the text stored under *uid* together with its index entries.

        Returns False when *uid* is unknown, otherwise the uid.
        """
        dTextDict = self.dTextDict
        if uid not in dTextDict:
            return False
        text = dTextDict[uid]
        self.delIndex(text, uid)
        del dTextDict[uid]
        # delIndex commits before the text itself is removed (and not at all
        # when the text has no words), so commit the removal explicitly.
        if self.commit:
            self.commit()
        return uid

    def search(self, text):
        """Return the uids matching *text* (see ``searchIndex``)."""
        return self.searchIndex(text)

    def _search(self, text):
        """Print every hit for *text* with some context around the match.

        Returns False when nothing matches, otherwise None.
        """
        dTextDict = self.dTextDict
        rev = self.searchIndex(text)
        if not rev:
            return False
        # First token of the query, as produced by the external ``cutword``
        # module; presumably a UTF-8 byte string -- TODO confirm.
        word1 = cutword.cutword(text)[0]
        if hasattr(word1, 'decode'):
            word1 = word1.decode('utf8', 'ignore')
        print(rev)
        for uid in rev:
            print('=== %s ===' % uid)
            body = dTextDict[uid]
            if hasattr(body, 'decode'):
                body = body.decode('utf8', 'ignore')
            # When the word is not found rfind gives -1 and the slice falls
            # back to the beginning of the text.
            pos = body.rfind(word1)
            print(body[max(0, pos - 120):pos + 120])
            print('')

    def _words4uid(self, uid):
        """Brute-force scan, for comparison: print every word indexed for *uid*."""
        import sys
        dWordsIndex = self.dWordsIndex
        for word in dWordsIndex:
            if uid in dWordsIndex[word]:
                sys.stdout.write('%s ' % (word,))

    def _uids4word(self, word):
        """Brute-force scan, for comparison: print every uid whose text contains *word*."""
        import sys
        dTextDict = self.dTextDict
        for uid in dTextDict:
            if word in dTextDict[uid]:
                sys.stdout.write('%s ' % (uid,))
109
110
111 ##连接 Durus 数据库用
def getdb_durus():
    """Connect to a Durus storage server and return a ``_TextIndex`` on it.

    The texts live in the root entry ``'TextDict'`` and the word index in
    ``'WordsIndex'``; both are BTrees that are created (and committed) on
    first use.  Per-word postings are BTrees as well, and the connection's
    ``commit`` is handed to the index so changes are persisted.
    """
    # Uses the Durus client/server storage with its default address.  To open
    # a database file directly, wrap
    # durus.file_storage.FileStorage("testIndexWord.durus") in Connection.
    from durus.client_storage import ClientStorage
    from durus.connection import Connection
    from durus.btree import BTree

    connection = Connection(ClientStorage())
    root = connection.get_root()

    def getdb(name):
        """Return the BTree stored under *name* in the root, creating it if missing."""
        if name not in root:
            root[name] = BTree()
            connection.commit()
        return root[name]

    dTextDict = getdb('TextDict')
    dWordsIndex = getdb('WordsIndex')
    return _TextIndex(wordDict=dWordsIndex, textDict=dTextDict,
                      Dict=BTree, commit=connection.commit)
137
138
139
140 ##用下边的生成随机字符串来填充数据库
141 import random
def _randomword(n=1, m=7, alphabet='abcdefghijklmnopqrstuvwxyz'):
    """Return a random word of ``n`` to ``m - 1`` characters.

    n, m     -- length bounds passed to ``random.randrange(n, m)``, so *m*
                is exclusive and ``m <= n`` raises ValueError.
    alphabet -- sequence the characters are drawn from; defaults to the
                lowercase ASCII letters.
    """
    length = random.randrange(n, m)
    return ''.join([random.choice(alphabet) for _ in range(length)])
146
def _randomtext(n=10, m=100):
    """Return a random text of ``n`` to ``m - 1`` space-separated words.

    Every word comes from ``_randomword(1, 3)``.
    """
    word_count = random.randrange(n, m)
    pieces = []
    for _ in range(word_count):
        pieces.append(_randomword(1, 3))
    return ' '.join(pieces)
150
def _randzhtext(n=10, m=100):
    """Return ``n`` to ``m - 1`` random CJK characters as a UTF-8 byte string.

    Code points are drawn from ``range(19968, 21000)`` (U+4E00 upwards).
    *m* is exclusive; ``m <= n`` raises ValueError from ``random.randrange``.
    """
    # ``unichr`` only exists on Python 2; on Python 3 ``chr`` does the same.
    try:
        to_char = unichr
    except NameError:
        to_char = chr
    length = random.randrange(n, m)
    chars = [to_char(random.randrange(19968, 21000)) for _ in range(length)]
    # Join as text and encode once, rather than encoding every character
    # and joining the encoded pieces.
    return ''.join(chars).encode('utf8')
154
155
156 ##用这个测试运行时间
157 import time
def _timeit(_src):
    """Run *_src* once and return the elapsed wall-clock time in seconds.

    _src -- either Python source text, executed against this module's
            globals with a throw-away local namespace (names it assigns do
            not leak into the module), or a zero-argument callable.

    Any other value is formatted with ``%s`` and executed as source, as the
    original did; that only times evaluating the value's textual form.  To
    time a call, pass its source as a string or wrap it in a callable.
    """
    if callable(_src):
        _t0 = time.time()
        _src()
        return time.time() - _t0
    # Compile before starting the clock so only execution is measured.
    code = compile('%s\n' % _src, '<_timeit>', 'exec')
    scope = {}
    # The timestamps are taken outside the executed code: locals assigned
    # inside exec() are not visible to the enclosing function on Python 3.
    _t0 = time.time()
    exec(code, globals(), scope)
    return time.time() - _t0
168
## Fill the database with random English text.
## NOTE: ``ti`` is not defined in this listing; presumably it is the index
## returned by getdb_durus() -- confirm before running.
_timeit("for i in range(300000): "
        "print('%s %s' % (i, ti.add(_randomtext(10,100), _randomword(2,11))))")
## Fill the database with random Chinese text.
_timeit("for i in range(300000): "
        "print('%s %s' % (i, ti.add(_randzhtext(10,100), _randzhtext(2, 10))))")
1.2. 测试行动
插入 59W 条随机字符的文章后 SWAP 占用 500M, 然后客户端被 Linux 杀掉
重新连接,测试搜索
>>> _timeit(ti.search('东东'))
1.5020370483398438e-05
>>> _timeit(ti.search('东东'))
4.0531158447265625e-06
## 下边去掉了 print
>>> _timeit("ti._words4uid('东东')")
0.074419975280761719
>>> _timeit("ti._words4uid('东东')")
0.074203968048095703
>>> _timeit("ti._uids4word('东东')")
1.1149060726165771
>>> _timeit("ti._uids4word('东东')")
1.1026270389556885
- 这时的数据库文件大小为 1.6G
-rw-r--r-- 1 huahua huahua 1.6G 2007-11-06 20:37 testIndexWord.durus