Writing a News Crawler
# Crawl all news content from the Tencent News homepage
1. Crawl the news homepage
2. Extract each news link
3. Crawl each news link
4. Check whether the page contains a frame
5. If it does, fetch the content of the page the frame points to
6. If it does not, fetch the current page directly
import urllib.request

url = "https://news.qq.com/a/20171028/001835.htm"
# Header format: headers = ("User-Agent", the user-agent string)
headers = ("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36")
opener = urllib.request.build_opener()
opener.addheaders = [headers]
data = opener.open(url).read().decode("utf-8", "ignore")
# data = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
print(len(data))
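An alternative to building an opener is to attach the header to a single request with urllib.request.Request. A minimal sketch of the same fetch, with the User-Agent string shortened here for brevity:

import urllib.request

url = "https://news.qq.com/a/20171028/001835.htm"
# Attach the User-Agent to this one request instead of to a shared opener
req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
data = urllib.request.urlopen(req).read().decode("utf-8", "ignore")
print(len(data))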
import urllib.request
import re

url = "http://news.163.com/"
headers = ("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36")
opener = urllib.request.build_opener()
opener.addheaders = [headers]
# Install the opener globally so urlopen() and urlretrieve() carry the header
urllib.request.install_opener(opener)
data = urllib.request.urlopen(url).read().decode("UTF-8", "ignore")
# Extract every news link from the homepage
pat1 = '<a target="_blank" href="(.*?)"'
alllink = re.compile(pat1).findall(data)
for i in range(0, len(alllink)):
    thislink = alllink[i]
    thispage = urllib.request.urlopen(thislink).read().decode("gb2312", "ignore")
    # Check whether the page embeds its content in a frame
    pat2 = '<frame src="(.*?)"'
    isframe = re.compile(pat2).findall(thispage)
    if len(isframe) == 0:
        # No frame: crawl the current page directly
        print(i)
        urllib.request.urlretrieve(thislink, "E:/python/python爬虫/data/" + str(i) + ".html")
    else:
        # A frame exists: crawl the page the frame points to instead
        flink = isframe[0]
        urllib.request.urlretrieve(flink, "E:/python/python爬虫/data/" + str(i) + ".html")
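The href regex above can also capture relative or javascript: links, and a single failed request aborts the whole loop. A hardening sketch, assuming the same url and alllink variables from the block above, that resolves relative links with urllib.parse.urljoin and skips pages that fail to load:

import urllib.request
import urllib.parse
import urllib.error

for i, link in enumerate(alllink):
    # Resolve relative hrefs against the homepage URL and keep only http(s) links
    full = urllib.parse.urljoin(url, link)
    if not full.startswith("http"):
        continue
    try:
        urllib.request.urlretrieve(full, "E:/python/python爬虫/data/" + str(i) + ".html")
    except urllib.error.URLError as err:
        # Skip dead or blocked links instead of crashing the crawl
        print("skip", full, err)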
# CSDN blog post crawler
import urllib.request
import re

url = "http://blog.csdn.net/"
headers = ("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36")
opener = urllib.request.build_opener()
opener.addheaders = [headers]
# Install the opener globally
urllib.request.install_opener(opener)
data = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
# Match the article link inside each headline block on the front page
pat = '<h3 class="csdn-tracking-statistics" data-mod="popu_430" data-poputype="feed" data-feed-show="false" data-dsm="post"><a href="(.*?)"'
alllink = re.compile(pat).findall(data)
# print(alllink)
for i in range(0, len(alllink)):
    localpath = "E:\\python\\python爬虫\\rst\\" + str(i) + ".html"
    thislink = alllink[i]
    urllib.request.urlretrieve(thislink, filename=localpath)
    print("Article " + str(i) + " crawled successfully!")