网页抓取实作 ::-- ZoomQuiet [2007-08-02 09:07:08]
1. 献给买基金的朋友
1.1. Jun Tsai
From: Jun Tsai <[email protected]> · Reply-To: [email protected] · To: [email protected] · Date: Aug 2, 2007 3:53 PM · Subject: [python-chinese] 献给买基金的朋友
今天用 Python 写了一个脚本,用来自动抓取今日基金的收益情况(懒得去网站看)。刚开始学习 Python,有错误的地方请多指正。
1 # -*- coding: UTF-8 -*-
2 import httplib, urllib,re
3 import datetime
4
# Sample holding. NOTE(review): the calls at the bottom of the visible script
# pass their own literal values, so FUND_CODE, SALE_DATE and SALE_MONEY do not
# appear to be read anywhere in this listing.
FUND_CODE = "161706"
SALE_DATE = "2007-06-22"
SALE_MONEY = 5000.0
# Evaluated once, when the module is loaded.
TODAY_DATE = datetime.date.today()

# Host, browser-like User-Agent header value and path prefix of the page
# that is scraped; the fund code is appended to PATH.
PANEL = "biz.finance.sina.com.cn"
USERAGENT = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv: 1.8.0.1) Gecko/20060111 Firefox/1.5.0.1'
PATH = "/fundinfo/open/lsjz.php?fund_code="

# Automatically fetches a fund's net values and reports the resulting profit.
# (This used to be a bare string literal placed after the statements above,
# where it was a no-op expression rather than a module docstring.)
# author: jun tsai
# revision: $Revision: 3191 $
# since: 0.1
def get_found_value(fund_code,sale_date,sale_money):
    """Fetch a fund's net-value history and print one profit row.

    POSTs the date range ``sale_date`` .. ``TODAY_DATE`` to the history page
    of ``fund_code`` on ``PANEL``, scrapes the fund name and the listed net
    values out of the returned HTML, and prints a ``|``-separated table row:
    code, name, purchase net value, units bought, latest net value, profit.

    Args:
        fund_code: fund code string; appended to ``PATH`` and looked up in
            the page title.
        sale_date: purchase date string, sent as ``startdate1``.
        sale_money: amount of money invested.

    Raises:
        ValueError: if the page contains no matching title or no net-value
            links (previously this surfaced as a bare ``IndexError``).
    """
    params = urllib.urlencode({"startdate1": sale_date, "enddate1": TODAY_DATE})
    headers = {
        "Content-type": "application/x-www-form-urlencoded",
        "Accept": "text/plain",
        'Referer': 'http://' + PANEL + PATH + fund_code,
        'User-Agent': USERAGENT,
    }

    conn = httplib.HTTPConnection(PANEL)
    try:
        conn.request("POST", PATH + fund_code, params, headers)
        data = conn.getresponse().read().decode("gb2312")
    finally:
        # Release the socket even when the request or the read fails.
        conn.close()

    # The title is expected to look like "<title>NAME(CODE)...". The code is
    # escaped so that it is always matched literally.
    name_matches = re.findall(r'<title>(.+)\(' + re.escape(fund_code) + r'\)', data)
    if not name_matches:
        raise ValueError("fund %s: fund name not found in page title" % fund_code)
    fund_name = name_matches[0].encode("utf-8")

    # Each history entry is a link whose text is the net value for that date.
    value_matches = re.findall(
        r"<a href='./lsjz_dwjz.php\?jzrq=(.*)'[\s]+target=_blank>(.*)</a>", data)
    if not value_matches:
        raise ValueError("fund %s: no net-value entries found in page" % fund_code)

    # The first entry is treated as the latest value and the last entry as the
    # purchase-day value (presumably the page lists newest first -- confirm).
    today_value = float(value_matches[0][1])
    sale_value = float(value_matches[-1][1])
    sale_count = sale_money / sale_value
    value = (today_value - sale_value) * sale_count

    cells = [
        construct_block(10, fund_code),
        construct_head_block(20, fund_name),
        construct_block(10, str(sale_value)),
        construct_block(20, str(sale_count)),
        construct_block(10, str(today_value)),
        construct_block(20, str(value)),
    ]
    print("|" + "|".join(cells) + "|")
def construct_block(length,str):
    """Return ``str`` prefixed with one space and right-padded to ``length``.

    The result is never truncated: if the prefixed text is already
    ``length`` characters or longer it is returned without padding.

    Args:
        length: minimum length of the returned cell.
        str: cell text (the name shadows the built-in ``str``; it is kept
            unchanged so existing callers are unaffected).

    Returns:
        The padded cell text.
    """
    return (' ' + str).ljust(length)
54
def construct_head_block(length,str):
    """Return ``str`` prefixed with one space and padded to ``length`` display columns.

    Unlike ``len``-based padding, this measures on-screen width: East Asian
    wide/full-width characters count as two columns and every other character
    as one. The previous ``bytes - characters`` formula gave the same result
    for pure-CJK text but counted ASCII characters as zero columns, which
    misaligned names that mix CJK with digits or Latin letters.

    Args:
        length: target width of the cell in display columns.
        str: cell text; a UTF-8 encoded byte string under Python 2 (the name
            shadows the built-in ``str`` and is kept for compatibility).

    Returns:
        The text with a leading space and trailing spaces appended; it is
        never truncated.
    """
    import unicodedata

    # Python 2 byte strings must be decoded to count characters, not bytes.
    text = str.decode("utf-8") if hasattr(str, "decode") else str
    width = 1  # the leading space
    for ch in text:
        if unicodedata.east_asian_width(ch) in ("W", "F"):
            width += 2
        else:
            width += 1
    return ' ' + str + ' ' * max(length - width, 0)
61
# Print the report: a bordered header row, then one row per holding.
_RULE = "+-----------------------------------------------------------------------------------------------+"

# (column width, heading text) for each column, left to right.
_HEADINGS = [
    (10, "代码"),
    (20, "名称"),
    (10, "购买净值"),
    (20, "购买数"),
    (10, "今日净值"),
    (20, "利润"),
]

# (fund code, purchase date, amount invested) for each holding to report.
_HOLDINGS = [
    ("161706", "2007-06-22", 5000.0),
    ("260110", "2007-06-10", 5000.0),
    ("070011", "2007-06-23", 5000.0),
]

print(_RULE)
print("|" + "|".join([construct_head_block(_w, _t) for _w, _t in _HEADINGS]) + "|")
print(_RULE)
for _code, _bought_on, _amount in _HOLDINGS:
    get_found_value(_code, _bought_on, _amount)
print(_RULE)
1.2. shily escape
From: shily escape <[email protected]> · Reply-To: [email protected] · To: [email protected] · Date: Aug 2, 2007 4:01 PM · Subject: Re: [python-chinese] 献给买基金的朋友
呵呵,巧了,我这两天也写了一个,不过没有计算利润之类的功能。
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import re
5 import urllib
6 import time
7 from sqlalchemy import *
8
class Fund(object):
    """A fund identified by its code, carrying free-form attributes.

    Attributes are stored in the ``attributes`` dict and are read and written
    with item syntax, e.g. ``fund['name'] = ...``.
    """

    def __init__(self, code):
        """Create a fund with the given ``code`` and no attributes."""
        self.code = code
        self.attributes = {}

    def __setitem__(self, key, value):
        """Store ``value`` under attribute name ``key``."""
        self.attributes[key] = value

    def __getitem__(self, key):
        """Return the attribute stored under ``key``; raises KeyError if absent."""
        return self.attributes[key]

    def __repr__(self):
        """Return a debugging representation showing the code and attributes."""
        return "Fund(%r, %r)" % (self.code, self.attributes)
17
# Intended registry of funds. NOTE(review): nothing in the visible script
# adds entries to it.
fund_all = {}

fund_url = 'http://my.fund.163.com/stock/rankkfs.htm'

# Download the ranking page. The socket is closed as soon as the body has
# been read, even if the read fails (it used to leak when decoding raised).
u_sock = urllib.urlopen(fund_url)
try:
    raw_page = u_sock.read()
finally:
    u_sock.close()

# Decode the body as GB2312 and re-encode it as UTF-8 for all later steps.
fund_str = raw_page.decode('gb2312').encode('utf-8')

print(fund_str)
31
# One table row of the ranking page: group 1 is the row's background colour,
# group 2 the row's inner HTML (re.S lets "." span line breaks).
r_item_pattern = re.compile(
    r'<tr align="center" bgcolor="(#EFEFEF|#E7F3FE)" class="bzi">(.*?)</tr>',
    re.S)

# A cell whose content is a link; captures the link text.
r_anchor_pattern = re.compile(
    r'<td><a href=.*?>(.*?)</a></td>')

# A cell made of digits/dashes with an optional dotted fractional part.
r_normal_pattern = re.compile(
    r'<td>([-0-9]*\.*\d*)</td>')
37
# Every matching table row, as (background colour, row HTML) tuples.
fund_list = r_item_pattern.findall(fund_str)

# The raw row HTML is also archived in a file named after today's date.
file_name = time.strftime('%Y%m%d') + '.html'

f = open(file_name, 'w')
try:
    # SECURITY NOTE(review): the database credentials are hard-coded in this
    # URL; they should come from configuration or the environment instead.
    db = BoundMetaData("mysql://root:clhclh@localhost/testcase?charset=utf8", echo=True)

    funds = Table('funds', db, autoload=True)

    for item in fund_list:
        s = item[1]
        f.write(s)

        # Link cells, in page order: code, name, company.
        anchor_tuple = r_anchor_pattern.findall(s)
        fund = Fund(anchor_tuple[0])
        fund['name'] = anchor_tuple[1]
        fund['company'] = anchor_tuple[2]

        # Plain cells, in page order: date, util, total, rate.
        normal_tuple = r_normal_pattern.findall(s)
        fund['date'] = normal_tuple[0]
        fund['util'] = normal_tuple[1]
        fund['total'] = normal_tuple[2]
        fund['rate'] = normal_tuple[3]

        funds.insert().execute({'name':fund['name'],'code':fund.code,'date':fund['date'],'util':fund['util'],'total':fund['total'],'rate':fund['rate'],'company':fund['company']})
finally:
    # Close the archive file even if parsing or a database insert fails.
    f.close()
1.2.1. ruby版:
#!/usr/bin/env ruby
# Time-stamp: <2007-08-02[Thursday] 14:15:47 dongsheng>
#
# Downloads the fund ranking page and prints, for every matching table row,
# its code, name, company, date, util, total and rate.

require 'net/http'
require 'iconv'

url = URI.parse('http://my.fund.163.com/stock/rankkfs.htm')
req = Net::HTTP::Get.new(url.path)
res = Net::HTTP.start(url.host, url.port) do |http|
  http.request(req)
end
content = res.body

# Row, link-cell and plain-cell patterns (case-insensitive, "." spans newlines).
re_items  = /\<tr align="center" bgcolor="(#EFEFEF|#E7F3FE)" class="bzi">(.*?)\<\/tr\>/im
re_anchor = /\<td\>\<a href=.*?\>(.*?)\<\/a\>\<\/td\>/im
re_normal = /\<td\>([0-9-]*\.*\d*)\<\/td\>/im

# Converts one scraped cell from GB2312 to UTF-8.
to_utf8 = lambda { |cell| Iconv.iconv('utf-8', 'gb2312', cell.to_s) }

content.scan(re_items) do |_bgcolor, row|
  anchors = row.scan(re_anchor)
  puts "code: #{to_utf8.call(anchors[0])}"
  puts "name: #{to_utf8.call(anchors[1])}"
  puts "company: #{to_utf8.call(anchors[2])}"

  normal = row.scan(re_normal)
  puts "date: #{normal[0]}"
  puts "util: #{normal[1]}"
  puts "total: #{normal[2]}"
  puts "rate: #{normal[3]}"
end