# coding: utf-8
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
# Build the list of URLs to crawl; each page URL looks like
# https://www.cyzone.cn/event/list-0-1-0-0-0-0/0 (here for page 1)
def url_manager():
    print("Crawler starting: collecting the URLs to crawl")
    first = 'https://www.cyzone.cn/event/list-0-'
    second = '-0-0-0-0/0'
    url_list = []
    for num in range(1, 1490):  # pages 1 through 1489
        url_list.append(first + str(num) + second)
    print("Finished collecting URLs: " + str(len(url_list)) + " in total")
    return url_list
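# The helper below is a minimal sketch, not part of the original script: it
# filters a proxy list down to entries that still respond, so dead proxies can
# be dropped before the crawl. The test URL and the 5-second timeout are
# assumptions; the public proxies listed further down may well be stale.
def check_proxies(proxy_list, test_url='https://www.cyzone.cn/'):
    live = []
    for proxy in proxy_list:
        try:
            # Any response at all counts as "alive" for this rough filter
            requests.get(test_url, proxies={'https': proxy}, timeout=5)
            live.append(proxy)
        except requests.RequestException:
            pass
    return live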
# Parse every page in url_list and extract the fields we want
def html_parser(url_list):
    print("Starting to parse page data")
    name_list = []      # company name
    money_list = []     # funding amount
    turn_list = []      # funding round
    invest_list = []    # investors
    industry_list = []  # industry
    date_list = []      # funding date
    headers = {
        'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36",
        'Referer': "https://www.cyzone.cn/",
        'Origin': "https://www.cyzone.cn"
    }
    # The original passed these as a dict whose 'HTTPS' key repeated eleven
    # times, so every proxy except the last was silently discarded. Keep them
    # in a list instead (one duplicate removed) and pick one per request.
    proxy_list = [
        '14.20.235.106:9797',
        '221.7.211.246:60233',
        '121.69.37.6:9797',
        '222.92.112.66:3128',
        '125.123.140.12:9000',
        '14.118.130.212:8081',
        '1.192.243.134:9797',
        '101.132.122.230:3128',
        '59.62.167.21:808',
        '110.52.235.102:9999'
    ]
    count = 1
    num = 1
    for url in url_list:
        # Random delay between requests to avoid hammering the server
        time.sleep(random.uniform(0.2, 0.8))
        try:
            print("Fetching URL " + str(num) + ": " + url)
            res = requests.get(url, headers=headers,
                               proxies={'https': random.choice(proxy_list)},
                               timeout=10)
            soup = BeautifulSoup(res.text, 'lxml')
            num += 1
        except requests.RequestException:
            print("Request failed; skipping this page")
            continue
        # Grab each column's cells once per page instead of re-querying on
        # every row, and iterate over however many rows the page actually has
        # (the original hard-coded 20, which breaks on a short final page)
        names = soup.select('.tp2_tit')
        moneys = soup.select('.tp-mean')
        turns = soup.select('.table-plate3 > td:nth-of-type(4)')
        invests = soup.select('.tp3')
        industries = soup.select('.table-plate3 > td:nth-of-type(6)')
        dates = soup.select('.table-plate3 > td:nth-of-type(7)')
        for i in range(len(names)):
            name_list.append(names[i].text)
            money_list.append(moneys[i].text)
            turn_list.append(turns[i].text)
            if not invests[i].text.strip():  # cell holds only whitespace
                invest_list.append("investor not disclosed")
            else:
                invest_list.append(invests[i].text)
            industry_list.append(industries[i].text)
            date_list.append(dates[i].text)
            print("Scraped row " + str(count))
            count += 1
    cyzone_data = {
        'name': name_list,
        'money': money_list,
        'turn': turn_list,
        'invest': invest_list,
        'industry': industry_list,
        'date': date_list
    }
    print("Parsing finished: scraped " + str(count - 1) + " rows in total")
    return cyzone_data
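# A possible hardening step (an assumption, not part of the original, which
# simply skips a page on the first failure): retry a failed request a few
# times with exponential backoff before giving up on the page.
def fetch_with_retry(url, headers, proxies, retries=3):
    for attempt in range(retries):
        try:
            return requests.get(url, headers=headers, proxies=proxies,
                                timeout=10)
        except requests.RequestException:
            time.sleep(2 ** attempt)  # back off 1s, 2s, 4s between attempts
    return None  # caller decides what to do with a page that never loaded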
def save_csv(cyzone_data):
    print("Saving data to a CSV file")
    df = pd.DataFrame(cyzone_data)
    df.to_csv('../data/cyzonedata.csv', encoding='utf-8-sig')
    print("Data saved; crawler finished")
if __name__ == '__main__':
    url_list = url_manager()
    cyzone_data = html_parser(url_list)
    save_csv(cyzone_data)
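# Optional sanity check after a run (an assumption, not in the original):
# load the CSV back and confirm the row count looks plausible, e.g.
#   df = pd.read_csv('../data/cyzonedata.csv', index_col=0)
#   print(df.shape)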