爬书虫 ::-- ZoomQuiet [2007-08-06 08:33:05]
1. 起点爬虫
朱丹 <[email protected]> hide details 3:42 pm (9 minutes ago) reply-to [email protected] to [email protected] date Aug 6, 2007 3:42 PM subject [CPyUG:30059] 给爱看小说的朋友_起点小说爬虫 mailed-by googlegroups.com
最近闹书荒,不知道看什么书好。起点上推荐的、排行榜上靠前的烂书真不少,不能用那个来判定。而且下载的时候麻烦死了,要点好多下;我下的是 txt 格式(手机上用)的,经常下不下来。好不容易下下来了,还贼难看;下下来的书名还是一堆 id 数字,还要自己去修改书名,麻烦。干脆用 Python 写了一个程序,下的那叫一个爽,稀里哗啦下了 200M 的书,可以慢慢看了。这个爬虫是一口气写完的,没做什么修改,也懒得去改了,能用就行。下载的时候需要提供你要下的页面的 url,比如 http://www.cmfu.com ,程序会找到这个页面上所有的书来下载。
1 #@+leo-ver=4-thin-encoding=gb2312,.
2 #@+node:BIGZHU.20070731160918:@thin d:/bigzhu/python/python_project/get_cmfu.py
3 #@+at
4 #@nonl
5 # 起点小说爬虫
6 #@-at
7 #@@c
8 #@@language python
9 #@+others
10 #@+node:BIGZHU.20070731161308:import
11 import httplib,urllib2,urllib,cookielib,re,threading
12 import os
13 #@nonl
14 #@-node:BIGZHU.20070731161308:import
15 #@+node:BIGZHU.20070731160928:getCookie
def getCookie():
    """Build a URL opener that keeps cookies between requests.

    Returns:
        The opener produced by ``urllib2.build_opener`` with an
        ``HTTPCookieProcessor`` bound to a fresh ``cookielib.CookieJar``.
    """
    cookie_handler = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
    return urllib2.build_opener(cookie_handler)
20 #@-node:BIGZHU.20070731160928:getCookie
21 #@-others
22 #@<<getBookIdList>>
23 #@+node:BIGZHU.20070731160918.1:<<getBookIdList>>
def getBookIdList(opener, urlList):
    """Collect the distinct book ids linked from the given pages.

    Every page is fetched through ``opener`` and scanned for
    ``bl_id=<digits>``. The pattern is deliberately loose because the
    site's markup is too irregular for a stricter one.

    Args:
        opener: object whose ``open(request)`` returns a response that
            offers ``read()`` and ``close()``.
        urlList: iterable of page URLs. Blank entries are skipped.

    Returns:
        List of book-id strings in first-seen order, without duplicates.
    """
    BookIdList = []
    seen = set()  # O(1) duplicate check; BookIdList keeps the order
    for url in urlList:
        url = url.strip()
        if not url:
            # Splitting an empty URL.ini yields a single '' entry, which
            # urllib2.Request would reject.
            continue
        print(url)
        response = opener.open(urllib2.Request(url))
        try:
            page = response.read()
        finally:
            response.close()
        bookCount = len(BookIdList)
        for bookId in re.findall(r"bl_id=(\d+)", page):
            if bookId not in seen:
                seen.add(bookId)
                BookIdList.append(bookId)
        print("取得书本数目:%i" % (len(BookIdList) - bookCount))
    print("合计取得下载书本:%i" % len(BookIdList))
    return BookIdList
54
55 #@-node:BIGZHU.20070731160918.1:<<getBookIdList>>
56 #@nl
57 #@<<getBookName>>
58 #@+node:BIGZHU.20070731164705:<<getBookName>>
def getBookName(opener, bookId=''):
    """Look up a book's title on its cmfu.com read page.

    Args:
        opener: object whose ``open(request)`` returns a response that
            offers ``read()`` and ``close()``. It is left open, because
            the caller goes on to download the book with it.
        bookId: the site's numeric book id, as a string.

    Returns:
        The run of non-whitespace characters after the first
        ``bookname=`` on the page, with double quotes removed. Falls back
        to ``bookId`` when the page carries no usable title, so callers
        always get something to name the file with.
    """
    if bookId == '':
        print("传入BookIdList是空的")
    bookURL = 'http://www.cmfu.com/readbook.asp?bl_id=%s' % bookId
    response = opener.open(urllib2.Request(bookURL))
    try:
        bookPage = response.read()
    finally:
        response.close()

    found = re.search(r'bookname=\S+', bookPage)
    if found is None:
        return bookId
    title = found.group(0).replace("bookname=", '').replace('"', '')
    return title or bookId
72
73 #@-node:BIGZHU.20070731164705:<<getBookName>>
74 #@nl
75 #@<<getTextFile>>
76 #@+node: BIGZHU.20070731171721:<<getTextFile>>
def getTextFile(opener, bookId):
    """Download one book's txt edition into the 起点 directory.

    The file is named after the title returned by ``getBookName`` and is
    placed in ``<cwd>/起点``. Nothing is fetched when that file already
    exists. The download is attempted up to three times; when every
    attempt fails no file is created, so a later run can try again.

    Args:
        opener: urllib2-style opener used for both the title lookup and
            the download; it is closed once the download phase is over.
        bookId: the site's numeric book id, as a string.
    """
    bookName = getBookName(opener, bookId)
    target = os.path.join(os.getcwd(), "起点", "%s.txt" % bookName)
    if os.path.isfile(target):
        print("%s 已经存在" % bookName)
        return

    url = 'http://download.cmfu.com/pda/%s.txt' % bookId
    try:
        # One message per failed attempt, in order.
        for failureNote in ("2 %s", "last try %s", "end try %s"):
            try:
                bookData = opener.open(url).read()
                break
            except Exception:
                print(failureNote % bookName)
        else:
            # Every attempt failed: there is nothing to write.
            return
    finally:
        opener.close()

    f = open(target, "wb")
    try:
        f.write(bookData)
    finally:
        f.close()
    print('get book %s 完毕' % bookName)
103 #@-node:BIGZHU.20070731171721:<<getTextFile>>
104 #@nl
105 #@<<class runGetFile>>
106 #@+node:BIGZHU.20070801172939:<<class runGetFile>>
class runGetFile(threading.Thread):
    """Thread that downloads the single book identified by ``bookId``."""

    def __init__(self, bookId):
        super(runGetFile, self).__init__()
        self.bookId = bookId

    def run(self):
        """Fetch the book through an opener created for this thread."""
        getTextFile(getCookie(), self.bookId)
115 #@nonl
116 #@-node: BIGZHU.20070801172939:<<class runGetFile>>
117 #@nl
118 #@<<class ProcessURL>>
119 #@+node:BIGZHU.20070802171013:<<class ProcessURL>>
class ProcessURL:
    """Keep the list of pages to crawl in ``<cwd>/起点/URL.ini``.

    The file holds the URLs joined by ';'. ``getURLIni`` loads them at
    start-up and ``saveURL`` writes the whole list back.
    """

    def __init__(self):
        pass

    def _iniPath(self):
        """Return the path of URL.ini under the current working directory."""
        return os.path.join(os.getcwd(), "起点", "URL.ini")

    def saveURL(self, urlList=None):
        """Overwrite URL.ini with ``urlList``.

        Args:
            urlList: list of URL strings; omitted or None means an empty
                list. The previous file content is replaced, not appended to.

        If the file cannot be opened, a message is printed and nothing
        is written.
        """
        if urlList is None:
            urlList = []
        try:
            f = open(self._iniPath(), "w")
        except IOError:
            print("文件打开错误")
            return
        try:
            f.write(";".join(urlList))
        finally:
            f.close()

    def getURLIni(self):
        """Read the saved URLs, creating the directory and file if missing.

        Returns:
            List of the non-blank URLs stored in URL.ini; an empty list
            when the file is new or empty.
        """
        directory = os.path.join(os.getcwd(), "起点")
        if not os.path.exists(directory):
            print("创建目录 \\起点")
            os.mkdir(directory)

        iniPath = self._iniPath()
        iniData = ''
        if os.path.isfile(iniPath):
            f = open(iniPath, "r")
            try:
                iniData = f.read()
            finally:
                f.close()
        else:
            print("URL.ini不存在,创建之")
            open(iniPath, "w").close()
        # ''.split(';') is [''], so drop blank entries.
        return [url for url in iniData.split(";") if url.strip()]
165 #@-node:BIGZHU.20070802171013.2: <<getURLIni>>
166 #@nl
167
168
169
170
171 #@-node:BIGZHU.20070802171013:<<class ProcessURL>>
172 #@nl
173 #@<<main>>
174 #@+node:BIGZHU.20070731164705.1:<<main>>
if __name__ == '__main__':
    # Example pages to feed in: http://www.cmfu.com/index.asp ,
    # http://www.cmfu.com/index_wxxx.asp
    opener = getCookie()

    # Load the previously saved pages, then let the user add more.
    urlType = ProcessURL()
    urlList = urlType.getURLIni()
    saveIni = False  # becomes True once a new URL has been entered
    while True:
        url = raw_input("要截取的起点的某个页面: ")
        if not url:
            # An empty line ends the input phase.
            break
        if url not in urlList:
            urlList.append(url)
            print("%s 是新的,添加之" % url)
            saveIni = True
        else:
            print("%s 已有,忽视之" % url)

    # One download thread per book found on the listed pages.
    bookIdList = getBookIdList(opener, urlList)
    for bookId in bookIdList:
        runGetFile(bookId).start()

    # Persist the list only when it changed.
    if saveIni:
        urlType.saveURL(urlList)
204 #@-node:BIGZHU.20070731164705.1:<<main>>
205 #@nl
206 #@nonl
207 #@-node:BIGZHU.20070731160918:@thin d:/bigzhu/python/python_project/get_cmfu.py
208 #@-leo