requests
What is it?
- A third-party library that, like urllib, simulates a browser sending HTTP requests. requests is built on top of urllib3 and wraps the low-level details, so the interface it exposes is much more user-friendly.
Documentation
http://docs.python-requests.org/zh_CN/latest/index.html
Installation
pip install requests
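Once installed, a quick import check confirms the setup (a minimal sanity test; __version__ is the package's standard version attribute):
import requests
print(requests.__version__)  # prints the installed requests version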
GET / GET with parameters
For a GET request, pass the query string as params=data,
where data is a dict of the query parameters:
r = requests.get(url=url, params=data)
The response object r:
r.text content as a string
r.content content as bytes
r.headers the response headers
r.url the request URL
r.status_code the HTTP status code
import requests
# The basic GET below is kept commented out; the live example (a Baidu search) follows it.
'''
url = 'http://www.baidu.com/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
}
r = requests.get(url=url, headers=headers)
'''
# r is the response object
# page content as a string
# print(r.text)
# content as bytes
# print(r.content)
# the final URL of the request
# print(r.url)
# the response headers
# print(r.headers)
# the HTTP status code
# print(r.status_code)
url = 'https://www.baidu.com/s?'
data = {
    'ie': 'utf8',
    'wd': '周杰伦'
}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
}
r = requests.get(url=url, params=data, headers=headers)
with open(r'tupian\zhou.html', 'wb') as fp:
    fp.write(r.content)
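Two quick checks on the response confirm the request went out as intended (reusing r from above):
print(r.url)          # the params dict is URL-encoded into the query string
print(r.status_code)  # 200 on success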
POST
Example: Bing Translator
For a POST request, data is a dict of the form fields:
r = requests.post(url=url, data=data)
import requests
post_url = 'https://cn.bing.com/ttranslationlookup?&IG=5C360E60322D4FA4865EEBCF710B93B6&IID=translator.5036.2'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
}
fromdata = {
    'from': 'zh-CHS',
    'to': 'en',
    'text': '皇上',
}
r = requests.post(url=post_url, data=fromdata, headers=headers)
print(r.text)
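If the endpoint responds with JSON, which is typical for this kind of lookup API (the exact response fields depend on Bing and are not shown here), r.json() decodes it in one step:
result = r.json()  # raises a ValueError if the body is not valid JSON
print(result)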
Sessions
Example: logging in to renren.com --- the Session object saves the login cookie and re-sends it automatically
s = requests.Session()
s.post()
s.get()
import requests
# To use sessions, first create a Session object;
# every request after that goes through s (s.post, s.get), so cookies are reused automatically.
s = requests.Session()
post_url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=2018621432232'
data = {
    'email': '17701256561',
    'icode': '',
    'origURL': 'http://www.renren.com/home',
    'domain': 'renren.com',
    'key_id': '1',
    'captcha_type': 'web_login',
    'password': 'bd20fe8cf1541a10558676a6eeccb4a1a786cfc09823ddd69d5bbaafc7060292',
    'rkey': '227f4ceb2f44827f9de8296ca1ef1c3f',
    'f': 'https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DaovDobnt13PO-vgvw1r-eSnSe_QNvNGtexiQFzyME-a%26wd%3D%26eqid%3Db5d58b1e000297f4000000025b4d88e3',
}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
}
r = s.post(url=post_url, headers=headers, data=data)
# print(r.text)
# The profile page is only reachable when logged in; the session sends the cookies for us.
url = 'http://www.renren.com/960481378/profile'
r = s.get(url, headers=headers)
with open('renren.html', 'wb') as fp:
    fp.write(r.content)
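Because s is a Session, the login cookies live in its cookie jar and were re-sent with the profile request automatically; they can be inspected directly:
print(s.cookies.get_dict())  # cookies captured from the login response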
Crawling bus routes
import requests
from lxml import etree
import re
import json
import time
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
}
def parse_first_page(url):
    r = requests.get(url=url, headers=headers)
    # build an element tree from the page
    tree = etree.HTML(r.text)
    # collect all the number and letter index links
    number_href_list = tree.xpath('//div[@class="bus_kt_r1"]/a/@href')
    char_href_list = tree.xpath('//div[@class="bus_kt_r2"]/a/@href')
    return number_href_list + char_href_list
def parse_second_page(url, all_href, fp):
    # strip the trailing / so urls can be joined cleanly
    url = url.rstrip('/')
    for href in all_href:
        href = url + href
        r = requests.get(href, headers=headers)
        tree = etree.HTML(r.text)
        # parse out the href and display name of every bus route on this index page
        bus_href_list = tree.xpath('//div[@id="con_site_1"]/a/@href')
        bus_name_list = tree.xpath('//div[@id="con_site_1"]/a/text()')
        # request each route url in turn and parse the detail pages
        parse_third_page(url, bus_href_list, bus_name_list, fp)
def parse_third_page(url, bus_href_list, bus_name_list, fp):
    # pair each route href with its display name
    for bus_href, title in zip(bus_href_list, bus_name_list):
        print('Crawling %s......' % title)
        # join the full url
        bus_href = url + bus_href
        # request the detail page of this route
        r = requests.get(url=bus_href, headers=headers)
        # parse_content extracts and saves the route details
        parse_content(r.text, fp)
        print('Finished crawling %s' % title)
        time.sleep(1)
def parse_content(content, fp):
    tree = etree.HTML(content)
    # route name
    name = tree.xpath('//div[@class="bus_i_t1"]/h1/text()')[0]
    # running time
    runtime = tree.xpath('//div[@class="bus_i_content"]/p[1]/text()')[0]
    # fare information
    price = tree.xpath('//div[@class="bus_i_content"]/p[2]/text()')[0]
    # bus company
    try:
        company = tree.xpath('//div[@class="bus_i_content"]/p[3]/a/text()')[0]
    except Exception:
        company = ''
    # last-updated time
    gxsj = tree.xpath('//div[@class="bus_i_content"]/p[last()]/text()')[0]
    # route length, pulled out of the label text with a regex
    try:
        length = tree.xpath('//div[@class="bus_label "]/p/text()')[0]
        pattern = re.compile(r'\d+\.\d+')
        length = pattern.search(length).group()
    except Exception:
        length = ''
    total_list = tree.xpath('//span[@class="bus_line_no"]/text()')
    # up-direction stop count, extracted with a regex
    pattern = re.compile(r'\d+')
    up_total = pattern.search(total_list[0]).group()
    # down-direction stop count (may be absent)
    try:
        down_total = pattern.search(total_list[1]).group()
    except Exception:
        down_total = ''
    # up-direction stop names
    up_site_name = tree.xpath('//div[@class="bus_line_site "][1]//a/text()')
    # down-direction stop names
    try:
        down_site_name = tree.xpath('//div[@class="bus_line_site "][2]//a/text()')
    except Exception:
        down_site_name = []
    # collect the details of this route into a dict
    item = {
        'route name': name,
        'running time': runtime,
        'fare': price,
        'bus company': company,
        'last updated': gxsj,
        'route length': length,
        'up-direction stop count': up_total,
        'up-direction stops': up_site_name,
        'down-direction stop count': down_total,
        'down-direction stops': down_site_name,
    }
    string = json.dumps(item, ensure_ascii=False)
    fp.write(string + '\n')
def main():
    # open the output file
    fp = open('beijing_bus_routes.txt', 'w', encoding='utf8')
    url = 'http://beijing.8684.cn/'
    # get all the number and letter index links
    all_href = parse_first_page(url)
    # request each index link and parse the second-level pages
    parse_second_page(url, all_href, fp)
    fp.close()
if __name__ == '__main__':
    main()
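Each line of the output file holds one JSON object, so reading the results back is straightforward; a minimal sketch, assuming the crawl above has produced the file:
import json
with open('beijing_bus_routes.txt', encoding='utf8') as fp:
    routes = [json.loads(line) for line in fp]
print('%d routes crawled' % len(routes))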
Login with a captcha
Ways to handle a captcha:
(1) download the captcha image locally and have the user type it in (the example below does this)
(2) use OCR software to recognize it; accuracy is low
(3) use a captcha-solving platform; the recognition rate is high
import requests
# create a session so the captcha request and the login share the same cookies
s = requests.Session()
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
}
# load the login page first
get_url = 'https://so.gushiwen.org/user/login.aspx?from=http://so.gushiwen.org/user/collect.aspx'
r = s.get(get_url, headers=headers)
# request the image src to save the captcha locally, then let the user read it
image_src = 'https://so.gushiwen.org/RandCode.ashx'
r = s.get(image_src, headers=headers)
with open('code.png', 'wb') as fp:
    fp.write(r.content)
code = input('Enter the captcha: ')
post_url = 'https://so.gushiwen.org/user/login.aspx?from=http%3a%2f%2fso.gushiwen.org%2fuser%2fcollect.aspx'
data = {
    '__VIEWSTATE': 'BvBAwAIKh29BShbC/yKMDsjiElxi+d4wdH3pR2dacgsifqK0rmUzL4Mc9YzHGDc6P6rqB4wMZ39uRj2MpaaSjQtarGnIo6qf1djLGa75XLo/S4b65Uhv2TETKt0=',
    '__VIEWSTATEGENERATOR': 'C93BE1AE',
    'from': 'http://so.gushiwen.org/user/collect.aspx',
    'email': '1090509990@qq.com',
    'pwd': '123456',
    'code': code,
    'denglu': '登录',
}
r = s.post(post_url, headers=headers, data=data)
print(r.text)
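The __VIEWSTATE / __VIEWSTATEGENERATOR values above were copied from one particular page load; ASP.NET regenerates them, so a robust version scrapes them from the login page's hidden inputs before posting. A minimal sketch with lxml (the input ids follow the standard ASP.NET convention; treat the exact selectors as an assumption):
from lxml import etree
login_page = s.get(get_url, headers=headers).text
tree = etree.HTML(login_page)
viewstate = tree.xpath('//input[@id="__VIEWSTATE"]/@value')[0]            # assumed id, per ASP.NET convention
generator = tree.xpath('//input[@id="__VIEWSTATEGENERATOR"]/@value')[0]   # assumed id, per ASP.NET convention
# use viewstate and generator in the data dict instead of the hard-coded strings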