1. Obtaining domain registration information
#-*- coding:utf-8 -*-
import whois
# Query the WHOIS record of a domain (uses the python-whois package)
print(whois.whois('baidu.com'))
The urllib2 library
import urllib2

# Download the content of the given URL
def download(url, user_agent='Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36', num_retries=3, proxy=None):
    print 'downloading!!!'
    content = None
    # Forge the request headers so the crawler looks like a browser
    headers = {'User-Agent': user_agent}
    request = urllib2.Request(url, headers=headers)
    try:
        # Enable a proxy IP if one was supplied, e.g. proxy={'host': '1.2.3.4', 'port': 8080}
        if proxy:
            handler = urllib2.ProxyHandler(proxies={'http': 'http://%(host)s:%(port)d' % proxy})
            opener = urllib2.build_opener(handler)
            urllib2.install_opener(opener)
        # Read the content of the given URL
        content = urllib2.urlopen(request).read()
    except Exception as e:
        print e
        # Retry only while the retry count is above zero
        if num_retries > 0:
            # ... and only when the error is a 5xx server error
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # Recursive call with one fewer retry left
                return download(url, user_agent, num_retries-1, proxy)
    return content
#***************** test **********************
# print download('https://www.baidu.com/')
# print download('https://www.taobao.com/')
# print download('https://www.vip.com/')
print download('http://www.xicidaili.com/')
Request headers serve several purposes; the one that matters here is disguising the crawler as a browser.
The importance of proxy IPs: an HTTP proxy IP matters because, when the current IP address is restricted, the crawler can switch to a new IP address and keep running.
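A minimal sketch of calling the download function above through a proxy; the host and port below are placeholders, not a real proxy:
# Placeholder proxy -- substitute a working HTTP proxy host and port
proxy = {'host': '127.0.0.1', 'port': 8080}
# download() installs a ProxyHandler so the request is routed through the proxy
print download('http://www.xicidaili.com/', proxy=proxy)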
The robots protocol
To view a site's robots protocol, append robots.txt to the domain ===> https://www.taobao.com/robots.txt
#-*- coding:utf-8 -*-
import robotparser
# Try the robots.txt of baidu or taobao
rp = robotparser.RobotFileParser()
rp.set_url('https://www.taobao.com/robots.txt')
rp.read()
print rp.can_fetch(useragent='Baiduspider',url='https://www.taobao.com/article')
Sitemaps
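A sitemap (usually listed in a site's robots.txt) enumerates the URLs the site wants crawled. A minimal sketch of crawling one, reusing the download function above; the sitemap URL here is an assumption for illustration:
import re
# Download the sitemap and pull every <loc> entry out of it
sitemap = download('http://example.webscraping.com/sitemap.xml')
if sitemap:
    links = re.findall('<loc>(.*?)</loc>', sitemap)
    for link in links:
        print link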
Using requests
import requests
import re
# Fetch the page content
text = requests.get('http://example.webscraping.com/places/default/view/Afghanistan-1').text
# Match the content that fits the pattern
result = re.findall('<label class="readonly" for="(.*?)" id="(.*?)">(.*?)</label>',text)
print result[3][2]
Extraction methods:
BeautifulSoup
from bs4 import BeautifulSoup
import requests
text = requests.get('http://example.webscraping.com/places/default/view/Afghanistan-1').text
bs = BeautifulSoup(text,'html.parser')
bs.prettify()
# tr = bs.find('tr',attrs={'id': 'places_phone__row'})
# Extract the matching content
# print tr.find('td',attrs={'class': 'w2p_fw'}).text
# Extract the matching image
tr = bs.find('tr',attrs={'id': 'places_national_flag__row'})
# print tr.find('td',attrs={'class': 'w2p_fw'})
s = tr.find('img').attrs['src']
print s
Extraction with lxml
import requests
import lxml.html
text = requests.get('http://example.webscraping.com/places/default/view/Afghanistan-1').text
doc = lxml.html.fromstring(text)
print type(doc)   #<class 'lxml.html.HtmlElement'>
text = lxml.html.tostring(doc, pretty_print=True)
print type(text)  #<type 'str'>
tree = lxml.html.fromstring(text)
# Select the value cell of the country row with a CSS selector
t = tree.cssselect('tr#places_country__row > td.w2p_fw')[0].text_content()
# t = doc.cssselect('tr#places_country__row > td.w2p_fw')[0].text_content()
print t, tree.get('class')
lxml -- xpath
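The snippets below read a local hello.html that is not shown in these notes; a minimal assumed version, written out from Python so the XPath queries have something to match, could look like this:
#-*- coding:utf-8 -*-
# Assumed contents of hello.html -- the original file is not included in these notes
sample = '''
<div>
    <ul>
        <li class="item-0"><a href="link1.html">first item</a></li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-inactive"><a href="link3.html">third item</a></li>
    </ul>
</div>
'''
with open('hello.html', 'w') as f:
    f.write(sample)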
#-*- coding:utf-8 -*-
from lxml import etree

with open('hello.html', 'r') as f:
    html = f.read()
# Parse the string into an element tree, serialise it pretty-printed, then parse it again
html = etree.HTML(html)
html = etree.tostring(html, pretty_print=True)
# print html
html = etree.HTML(html)
# html = etree.parse('hello.html')
# print type(html)
# Get all <li> tags
result = html.xpath('//li')
# print result
# print type(result)
# print type(result[0])
# print len(result)
---
# Get the class attribute of every li tag
# print html.xpath('//li/@class')
---
# Get the <a> tags under <li> whose href is "link1.html"
print html.xpath('//li//a[@href="link1.html"]')
# Get the href attribute of every <a> under <li>
print html.xpath('//li//a/@href')
# print html.xpath('//li//a/text()')
---