网页抓取实作 ::-- ZoomQuiet [2007-08-02 09:07:08]

CPUG联盟::

CPUG::门户plone

BPUG

SPUG

ZPUG

SpreadPython Python宣传

1. 献给买基金的朋友

1.1. Jun Tsai

Jun Tsai <[email protected]>                 hide details    3:53 pm (30 minutes ago)  
        reply-to                [email protected]   
        to              [email protected]   
        date            Aug 2, 2007 3:53 PM      
        subject         [python-chinese] 献给买基金的朋友.       
今天用python写的一个脚本,来自动抓取今日基金的收益情况(懒得去网站看),刚学习python,错误的地方多指正.

   1 # -*- coding: UTF-8 -*-
   2 import httplib, urllib,re
   3 import datetime
   4 
   5 FUND_CODE = "161706"
   6 SALE_DATE="2007-06-22"
   7 SALE_MONEY = 5000.0
   8 TODAY_DATE=datetime.date.today()
   9 
  10 PANEL = "biz.finance.sina.com.cn"
  11 USERAGENT = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv: 1.8.0.1) Gecko/20060111 Firefox/1.5.0.1'
  12 PATH="/fundinfo/open/lsjz.php?fund_code="
  13 
  14 """
  15 用来自动抓取基金的值,获取对应的利润情况.
  16 author:jun tsai
  17 revision:$Revision: 3191 $
  18 since:0.1
  19 """
  20 def get_found_value(fund_code,sale_date,sale_money):
  21     """自动抓取基金净值的脚本程序,通过给定的基金代码,买基金的日期,以及投入使用的钱,
  22     来自动抓取基金的净值,以及利润
  23     """
  24 
  25     params = urllib.urlencode({"startdate1":sale_date,"enddate1":TODAY_DATE})
  26     headers = {"Content-type": "application/x-www-form-urlencoded", "Accept": "text/plain",
  27                'Referer' :'http://'+PANEL+PATH+fund_code, 'User-Agent':USERAGENT
  28                }
  29     conn = httplib.HTTPConnection(PANEL)
  30     conn.request("POST", PATH+fund_code, params, headers)
  31     response = conn.getresponse()
  32     data = response.read()
  33     data=data.decode ("gb2312")
  34     conn.close();
  35     pattern = '<title>(.+)\('+fund_code+'\)'
  36     all_matches = re.findall(pattern,data);
  37 
  38     fund_name =  all_matches[0].encode("utf-8")
  39 #    print all_matches[0]
  40 
  41     pattern='<a href=\'./lsjz_dwjz.php\?jzrq=(.*)\'[\s]+target=_blank>(.*)</a>'
  42     all_matches=re.findall(pattern,data);
  43 
  44     today_value =  float(all_matches[0][1])
  45     sale_value = float(all_matches[len(all_matches)-1][1])
  46     sale_count = sale_money/sale_value
  47     value=(today_value-sale_value)*sale_count
  48     print "|"+construct_block(10,fund_code)+"|"+construct_head_block(20,fund_name)+"|" +construct_block(10,sale_value.__str__())+"|" +construct_block(20,sale_count.__str__())+"|" +construct_block(10,today_value.__str__())+"|" +construct_block(20,value.__str__())+"|"
  49 def construct_block(length,str):
  50     r=' '+str
  51     while(length>len(r)):
  52         r+=' '
  53     return r
  54 
  55 def construct_head_block(length,str):
  56     r=' '+str
  57     head_str_len=len( str.decode("utf-8"))
  58     while(length>(len(r)-head_str_len)):
  59         r+=' '
  60     return r
  61 
  62 print "+-----------------------------------------------------------------------------------------------+"
  63 print "|"+construct_head_block(10,"代码")+"|"+construct_head_block(20,"名称")+"|"+construct_head_block(10,"购买净值")+"|"+construct_head_block(20,"购买数")+"|"+construct_head_block(10,"今日净值")+"|"+construct_head_block(20,"利润")+"|"
  64 print "+-----------------------------------------------------------------------------------------------+"
  65 get_found_value("161706","2007-06-22",5000.0)
  66 get_found_value("260110","2007-06-10", 5000.0)
  67 get_found_value("070011","2007-06-23",5000.0)
  68 print "+-----------------------------------------------------------------------------------------------+"

1.2. shily escape

shily escape <[email protected]>            hide details    4:01 pm (21 minutes ago)  
        reply-to                [email protected]   
        to              [email protected]   
        date            Aug 2, 2007 4:01 PM      
        subject         Re: [python-chinese] 献给买基金的朋友.   
呵呵, 巧了. 我这两天也写了一个. 不过没有算利润的这些东西

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 
   4 import re
   5 import urllib
   6 import time
   7 from sqlalchemy import *
   8 
   9 class Fund:
  10     def __init__(self, code):
  11         self.code = code
  12         self.attributes = {}
  13     def __setitem__(self, key, value):
  14         self.attributes[key] = value
  15     def __getitem__(self, key):
  16         return self.attributes[key]
  17 
  18 fund_all = {}
  19 
  20 fund_url = 'http://my.fund.163.com/stock/rankkfs.htm'
  21 
  22 u_sock = urllib.urlopen(fund_url)
  23 
  24 fund_str = u_sock.read().decode('gb2312')
  25 
  26 fund_str = fund_str.encode('utf-8')
  27 
  28 print fund_str
  29 
  30 u_sock.close()
  31 
  32 r_item_pattern = re.compile(r'<tr align="center" bgcolor="(#EFEFEF|#E7F3FE)" class="bzi">(.*?)</tr>', re.DOTALL)
  33 
  34 r_anchor_pattern = re.compile(r'<td><a href=.*?>(.*?)</a></td>')
  35 
  36 r_normal_pattern = re.compile(r'<td>([-0-9]*\.*\d*)</td>')
  37 
  38 fund_list = r_item_pattern.findall(fund_str)
  39 
  40 file_name = time.strftime('%Y%m%d') + '.html'
  41 
  42 f = open( file_name, 'w')
  43 
  44 db = BoundMetaData("mysql://root:clhclh@localhost/testcase?charset=utf8", echo=True)
  45 
  46 funds = Table('funds', db, autoload=True)
  47 
  48 for item in fund_list:
  49     i = funds.insert()
  50     s = item[1]
  51     f.write(s)
  52     anchor_tuple = r_anchor_pattern.findall(s)
  53     fund = Fund(anchor_tuple[0])
  54     fund['name'] = anchor_tuple[1]
  55     fund['company'] = anchor_tuple[2]
  56     normal_tuple = r_normal_pattern.findall(s)
  57     fund['date'] = normal_tuple[0]
  58     fund['util'] = normal_tuple[1]
  59     fund['total'] = normal_tuple[2]
  60     fund['rate'] = normal_tuple[3]
  61     funds.insert().execute({'name':fund['name'],'code':fund.code,'date':fund['date'],'util':fund['util'],'total':fund['total'],'rate':fund['rate'],'company':fund['company']})
  62 #    sql = "insert into funds(name, code, `date`, util, total, rate, company) values('%s', '%s', '%s', '%s', '%s', '%s', '%s')" \
  63 #          % (fund['name'], fund.code, fund['date'], fund['util'], fund['total'], fund['rate'], fund['company'])
  64  #   try:
  65   #      print sql
  66    #     cursor.execute(sql)
  67 #    except Exception, e:
  68 g #       print e
  69   #  fund_all[fund.code] = fund
  70    # f.write(s)
  71 #conn.commit()
  72 #cursor.close()
  73 #conn.close()
  74 f.close()

1.2.1. ruby版:

#!/usr/bin/env ruby
# Time-stamp: <2007-08-02[Thu] 14:15:47 dongsheng>
#
# Fetch the fund table from my.fund.163.com and print, for every table row,
# its code, name, company, date, util, total and rate.
require 'net/http'
require 'iconv'

url = URI.parse('http://my.fund.163.com/stock/rankkfs.htm')

# No space before the parenthesis: "Get.new (url.path)" is parsed as a
# parenthesised argument and triggers a warning.
req = Net::HTTP::Get.new(url.path)

res = Net::HTTP.start(url.host, url.port) do |http|
  http.request(req)
end

content = res.body

# One table row per fund; group 1 is the row colour, group 2 the row body.
re_items = /\<tr align="center" bgcolor="(#EFEFEF|#E7F3FE)" class="bzi">(.*?)\<\/tr\>/im

# Linked cells; printed below as code, name, company, in that order.
re_anchor = /\<td\>\<a href=.*?\>(.*?)\<\/a\>\<\/td\>/im

# Plain cells; printed below as date, util, total, rate, in that order.
re_normal = /\<td\>([0-9-]*\.*\d*)\<\/td\>/im

# Convert a GB2312 string from the page to UTF-8; nil becomes "".
# Iconv.conv returns a String, so the output does not depend on how an
# Array is converted to a string.
to_utf8 = lambda { |text| Iconv.conv('utf-8', 'gb2312', text.to_s) }

content.scan(re_items) do |_bgcolor, row|
  # scan with one capture group yields one-element arrays; flatten them so
  # each entry is the captured string itself.
  anchors = row.scan(re_anchor).flatten
  puts "code: #{to_utf8.call(anchors[0])}"
  puts "name: #{to_utf8.call(anchors[1])}"
  puts "company: #{to_utf8.call(anchors[2])}"
  normal = row.scan(re_normal).flatten
  puts "date: #{normal[0]}"
  puts "util: #{normal[1]}"
  puts "total: #{normal[2]}"
  puts "rate: #{normal[3]}"
end

1.3. 反馈

Name Password4deL ;) :( X-( B-)