1.正则表达式
元字符【单字符】
. [and] \d \D \s \S
修饰符
* + ? {m} {m,n} {m,}
边界符
^ $ \A \B
贪婪模式
.*
非贪婪模式
.*?
模式修正
re.S 单行
re.M 多行
re.I 忽略大小写
2.XPath语法
层级定位:根据标签的层级关系进行查找
属性定位:根据属性查找标签
4.爬取妹子图代码
from time import sleep
from urllib import request, parse
import re
# 业务函数,处理url
def handler_url(url, page, num):
    """Build the request object for one picture page of an album.

    Args:
        url: Base URL; the album number is appended directly to it.
        page: Album number.
        num: Index of the picture page inside the album. Index 1 maps to
            ``url + page``; any other index maps to ``url + page + '/' + num``.

    Returns:
        A ``urllib.request.Request`` for the computed URL, carrying a
        browser-like ``User-Agent`` and a ``Referer`` header.
    """
    page_url = url + str(page)
    if num != 1:
        # Only pages after the first one carry an explicit index segment.
        page_url += '/' + str(num)

    # Request headers are the same for every page, so they are built once.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
        'Referer': 'https://www.baidu.com/link?url=dORiYkjnb0AkMxSoE4UzQYAiVlhvcutBR6sSxgYQY-y&wd=&eqid=961cc7e80003f1a6000000065bd05902',
    }
    return request.Request(url=page_url, headers=headers)
# 业务函数,发起请求
def request_data(req):
    """Send the request and return the response body as text.

    Args:
        req: Anything accepted by ``urllib.request.urlopen`` (a URL string
            or a ``Request`` object).

    Returns:
        The response body decoded as UTF-8.
    """
    # The context manager closes the response even if read/decode raises.
    with request.urlopen(req) as res:
        return res.read().decode('utf-8')
# 业务函数,解析
def anylasis(html):
    """Yield the image URLs found in a page's article sections.

    For every ``<div class="article">`` in *html*, the ``src`` value of the
    first ``<img src="...">`` that follows it is produced.

    Args:
        html: Page source as a string.

    Yields:
        Each captured ``src`` value, in document order.
    """
    # re.S lets '.' span newlines, since the <img> tag is usually on a
    # different line than the opening <div>.
    pat = re.compile(r'<div class="article">.*?<img src="(.*?)"', re.S)
    yield from pat.findall(html)
# 主函数
def main():
    """Prompt for an album range and download the images of each album.

    Reads the first and last album number from standard input, then for each
    album requests picture pages 1 through 50, extracts the image URLs from
    every page and saves each image under ``./images/`` as
    ``<counter>.jpg``, with the counter starting at 9540.
    """
    import os  # local import: only needed to create the output directory

    url = "http://www.mmjpg.com/mm/"
    start = int(input('请输入起始页:'))
    end = int(input('请输入终止页:'))
    print('开始下载...')

    # Headers for the image requests; identical for every image, so they are
    # built once instead of once per download.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
        'Referer': 'https://www.baidu.com/link?url=dORiYkjnb0AkMxSoE4UzQYAiVlhvcutBR6sSxgYQY-y&wd=&eqid=961cc7e80003f1a6000000065bd05902',
    }

    # open() below fails if the target directory does not exist yet.
    os.makedirs('./images', exist_ok=True)

    img_name = 9540
    for page in range(start, end + 1):
        for num in range(1, 51):
            # Turn (page, num) into a request object and fetch the HTML.
            page_req = handler_url(url, page, num)
            html = request_data(page_req)

            # Download every image address extracted from the page.
            for img in anylasis(html):
                img_req = request.Request(img, headers=headers)
                # Close the response as soon as the body has been read.
                with request.urlopen(img_req) as resp:
                    data = resp.read()
                with open('./images/' + str(img_name) + '.jpg', 'wb') as fp:
                    fp.write(data)
                print("正在下载:" + img)
                img_name += 1

            # NOTE(review): the nesting level of this pause could not be
            # recovered from the source; it is applied once per page
            # request here -- confirm.
            sleep(0.1)
    print("下载结束!")
# Run the downloader only when this file is executed as a script.
if __name__ == '__main__':
    main()
网友评论