Python 常用模块体验 ::-- ZoomQuiet [2007-11-10 06:37:48]
Contents
1. Py常用模块汇编
Python 标准库 2.0 整理者
Python 江湖 QQ 群: 43680167 Feather (校对) gt: [email protected]
一些有用的Python函式庫列表 » 程式設計 遇上 小提琴
::-- ZoomQuiet [2007-11-10 07:39:01]
1.1. zshelve 对象持久模块
{{{Jiahua Huang <[email protected]> reply-to [email protected], to "python. cn" <[email protected]>, date Nov 8, 2007 5:41 PM subject [CPyUG:34726] 贴个 zlib 压缩的 zshelve 对象持久模块 }}} 这个模块给 Python 标准库的 shelve.py 添加了 zlib 压缩, 减小数据库文件体积, 以改善磁盘 IO 性能。
1.1.1. 发布
http://zshelve.googlecode.com/svn/trunk/
加了个命令行工具:
huahua@huahua:tmp$ zshelve commandline tool for zshelve databases Usage: zshelve FILE dump Dump the data tree zshelve FILE keys List of keys zshelve FILE get KEY Dump value for key zshelve FILE set KEY VALUE Set db[key] = value zshelve FILE has_key KEY True if database has the key zshelve FILE search_key KEY Search key zshelve FILE search_value VALUE Search value huahua@huahua:tmp$ zshelve set tes.db a 1 huahua@huahua:tmp$ zshelve dump tes.db |- a | |- 1 huahua@huahua:tmp$ zshelve set tes.db b "dict(a=1,b=2,c=3,d={'s':'4'})" huahua@huahua:tmp$ zshelve dump tes.db |- a | |- 1 |- b | |- a | | |- 1 | |- c | | |- 3 | |- b | | |- 2 | |- d | | |- s | | | |- 4
对比::
>>> import zshelve >>> import shelve >>> zdb = zshelve.open('/tmp/zshelve.db') >>> db = shelve.open('/tmp/shelve.db') >>> zdb['1'] = dict(a='0123456789'*10000000) >>> db['1'] = dict(a='0123456789'*10000000) >>> zdb.sync() >>> db.sync()
看看文件大小差异::
huahua@huahua:zshelve$ ll /tmp/*shelve.db -rw-r--r-- 1 huahua huahua 96M 2007-11-08 17:36 /tmp/shelve.db -rw-r--r-- 1 huahua huahua 204K 2007-11-08 17:36 /tmp/zshelve.db
1.1.2. 补丁::
--- shelve.py 2007-05-03 00:56:36.000000000 +0800 +++ zshelve.py 2007-11-08 17:25:59.000000000 +0800 @@ -70,6 +70,7 @@ except ImportError: import UserDict import warnings +import zlib ## use zlib to compress dbfile __all__ = ["Shelf","BsdDbShelf","DbfilenameShelf","open"] @@ -80,13 +81,14 @@ class Shelf(UserDict.DictMixin): See the module's __doc__ string for an overview of the interface. """ - def __init__(self, dict, protocol=None, writeback=False): + def __init__(self, dict, protocol=None, writeback=False, compresslevel=2): self.dict = dict if protocol is None: protocol = 0 self._protocol = protocol self.writeback = writeback self.cache = {} + self.compresslevel = compresslevel def keys(self): return self.dict.keys() @@ -109,7 +111,7 @@ class Shelf(UserDict.DictMixin): try: value = self.cache[key] except KeyError: - f = StringIO(self.dict[key]) + f = StringIO(zlib.decompress(self.dict[key])) value = Unpickler(f).load() if self.writeback: self.cache[key] = value @@ -121,7 +123,7 @@ class Shelf(UserDict.DictMixin): f = StringIO() p = Pickler(f, self._protocol) p.dump(value) - self.dict[key] = f.getvalue() + self.dict[key] = zlib.compress(f.getvalue(), self.compresslevel) def __delitem__(self, key): del self.dict[key] @@ -168,32 +170,32 @@ class BsdDbShelf(Shelf): See the module's __doc__ string for an overview of the interface. 
""" - def __init__(self, dict, protocol=None, writeback=False): - Shelf.__init__(self, dict, protocol, writeback) + def __init__(self, dict, protocol=None, writeback=False, compresslevel=2): + Shelf.__init__(self, dict, protocol, writeback, compresslevel) def set_location(self, key): (key, value) = self.dict.set_location(key) - f = StringIO(value) + f = StringIO(zlib.decompress(value)) return (key, Unpickler(f).load()) def next(self): (key, value) = self.dict.next() - f = StringIO(value) + f = StringIO(zlib.decompress(value)) return (key, Unpickler(f).load()) def previous(self): (key, value) = self.dict.previous() - f = StringIO(value) + f = StringIO(zlib.decompress(value)) return (key, Unpickler(f).load()) def first(self): (key, value) = self.dict.first() - f = StringIO(value) + f = StringIO(zlib.decompress(value)) return (key, Unpickler(f).load()) def last(self): (key, value) = self.dict.last() - f = StringIO(value) + f = StringIO(zlib.decompress(value)) return (key, Unpickler(f).load()) @@ -204,12 +206,12 @@ class DbfilenameShelf(Shelf): See the module's __doc__ string for an overview of the interface. """ - def __init__(self, filename, flag='c', protocol=None, writeback=False): + def __init__(self, filename, flag='c', protocol=None, writeback=False, compresslevel=2): import anydbm - Shelf.__init__(self, anydbm.open(filename, flag), protocol, writeback) + Shelf.__init__(self, anydbm.open(filename, flag), protocol, writeback, compresslevel) -def open(filename, flag='c', protocol=None, writeback=False): +def open(filename, flag='c', protocol=None, writeback=False, compresslevel=2): """Open a persistent dictionary for reading and writing. The filename parameter is the base filename for the underlying @@ -222,4 +224,4 @@ def open(filename, flag='c', protocol=No See the module's __doc__ string for an overview of the interface. """ - return DbfilenameShelf(filename, flag, protocol, writeback) + return DbfilenameShelf(filename, flag, protocol, writeback, compresslevel)
::-- ZoomQuiet [2007-11-10 07:34:49]
Contents
1.2. fast UserDict
{{{Jiahua Huang <[email protected]> reply-to [email protected], to "python. cn" <[email protected]>, date Nov 10, 2007 3:28 PM subject [CPyUG:34791] 一行代码让 UserDict.UserDict 的类加速 4 倍 }}} 发现 Python 标准库里好些字典类从 UserDict.UserDict 派生, 而不是从 dict 派生, 这是因为旧版 Python 的内建类型不能派生子类。
那么这会不会影响速度呢?
先给两个分别继承 UserDict.UserDict 和 dict 的类 URdict 和 Rdict:
>>> import UserDict >>> class URdict(UserDict.UserDict): ... '''dict can search key by value ... ''' ... def indexkey4value(self, value): ... '''search key by value ... >>> rd = Rdict(a='One', b='Other', c='What', d='Why', e='Other') ... >>> rd.indexkey4value('Other') ... 'b' ... ''' ... try: ... ind = self.values().index(value) ... return self.keys()[ind] ... except: ... return None ... def key4value(self, svalue): ... '''search key by value ... >>> rd = Rdict(a='One', b='Other', c='What', d='Why', e='Other') ... >>> rd.key4value('Other') ... 'b' ... ''' ... for key, value in self.iteritems(): ... if value == svalue: ... return key ... def keys4value(self, svalue): ... '''search keys by value ... >>> rd = Rdict(a='One', b='Other', c='What', d='Why', e='Other') ... >>> rd.keys4value('Other') ... ['b', 'e'] ... ''' ... keys=[] ... for key, value in self.iteritems(): ... if value == svalue: ... keys.append(key) ... return keys ... >>> >>> class Rdict(dict): ... '''dict can search key by value ... ''' ... def indexkey4value(self, value): ... '''search key by value ... >>> rd = Rdict(a='One', b='Other', c='What', d='Why', e='Other') ... >>> rd.indexkey4value('Other') ... 'b' ... ''' ... try: ... ind = self.values().index(value) ... return self.keys()[ind] ... except: ... return None ... def key4value(self, svalue): ... '''search key by value ... >>> rd = Rdict(a='One', b='Other', c='What', d='Why', e='Other') ... >>> rd.key4value('Other') ... 'b' ... ''' ... for key, value in self.iteritems(): ... if value == svalue: ... return key ... def keys4value(self, svalue): ... '''search keys by value ... >>> rd = Rdict(a='One', b='Other', c='What', d='Why', e='Other') ... >>> rd.keys4value('Other') ... ['b', 'e'] ... ''' ... keys=[] ... for key, value in self.iteritems(): ... if value == svalue: ... keys.append(key) ... return keys ... >>> >>> import time >>> def _timeit(_src): ... exec(''' ... _t0 = time.time() ... %s ... _t1 = time.time() ... _t3 = _t1 - _t0 ... 
'''%_src) ... return _t3 ... >>> ran = range(100000) 再弄俩实例, 外加一个原始 dict 作对照 >>> u = URdict() >>> r = Rdict() >>> d = {} 看看插入速度 >>> _timeit("for i in ran: u[i]=i") 0.1777961254119873 >>> _timeit("for i in ran: r[i]=i") 0.048948049545288086 看看原始 dict 的速度 >>> _timeit("for i in ran: d[i]=i") 0.041368961334228516
可以看到, UserDict.UserDict 确实严重影响速度。
Python 标准库里好多用到 UserDict 的地方都应该换成 dict, 以提高性能。
不过, 一个个修改 Python 标准库似乎又不合适。
再次使用一招鲜, 直接干掉 UserDict。
在使用/导入那些模块前先来一行:
>>> import UserDict; UserDict.UserDict = dict
完了再导入模块来试试
>>> u = URdict() >>> _timeit("for i in ran: u[i]=i") 0.042366981506347656
一行代码让速度提高约 4 倍。注意: UserDict.UserDict 把数据存放在 self.data 属性里, 而 dict 没有这个属性; 依赖 self.data 的子类在替换后会出错, 使用这招前请先确认。