import requests
import lxml
from bs4 import BeautifulSoup
import re
import threading
import os
def header(referer):
    """Build the HTTP headers used when fetching an image.

    The image host checks the Referer header and rejects requests
    without a plausible one, so the caller passes it in per request.

    Args:
        referer: URL to send as the ``Referer`` header value.

    Returns:
        dict: header mapping suitable for ``requests.get(..., headers=...)``.
    """
    return {
        'Host': 'i.meizitu.net',
        'Pragma': 'no-cache',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/59.0.3071.115 Safari/537.36',
        'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
        # str() replaces the redundant '{}'.format(referer) — same result
        'Referer': str(referer),
    }
# Shared mutable state for the crawler threads (accessed without locks —
# NOTE(review): racy by design in the original; see notes at end of file).
URLPOOL = ["https://m.mzitu.com/159050"]  # seed gallery URL; grows as geturl() finds more
URLSET = set()  # URLs already parsed by geturl(), to avoid re-parsing
def geturl():
    """Crawl gallery pages and harvest links to further galleries.

    Iterates over URLPOOL (list iteration picks up items appended during
    the loop, so the crawl keeps going as links are discovered), parses
    each page not yet in URLSET, and appends newly found gallery URLs to
    URLPOOL for sort_theme() workers to consume.
    """
    global URLSET   # already parsed
    global URLPOOL  # waiting to be downloaded
    # Raw string, and "https?" instead of the original "http(.|s)" which
    # required exactly one character before "://" and so never matched
    # plain http:// links.  Compiled once, outside the loop.
    gallery_re = re.compile(r"https?://m\.mzitu\.com/(\d+)$")
    for url in URLPOOL:
        if url in URLSET:  # already parsed — skip this url
            continue
        resp = requests.get(url)
        soup = BeautifulSoup(resp.text, features="lxml")
        # Literal tag name "a": the original re.compile("a") matched ANY
        # tag whose name merely contains an "a" (table, span, ...).
        tag_set = soup.find_all("a", {"href": gallery_re})
        for tag in tag_set:
            href = tag["href"]
            # Also check URLPOOL so the same link appearing several times
            # on one page is only queued once (waste the original author
            # noted in a comment).
            if href not in URLSET and href not in URLPOOL:
                URLPOOL.append(href)
                print(href)
        URLSET.add(url)
    print("-----------geturl() finished----------------")
def sort_theme():
    """Consume gallery URLs from URLPOOL and download each gallery.

    For every gallery URL: fetch the index page for its title, probe the
    per-image sub-pages (url/1, url/2, ...), collect each page's image
    source, then hand the whole set to download().
    """
    print("down load -------------------------start")
    global URLPOOL
    page_limit = 50  # maximum image pages probed per gallery
    img_re = re.compile("^https://")
    # pop(0) instead of the original remove()-while-iterating, which
    # shrank the list under the iterator and skipped every other URL.
    while URLPOOL:
        try:
            url = URLPOOL.pop(0)
        except IndexError:  # another worker thread emptied the pool first
            break
        resp = requests.get(url)
        soup = BeautifulSoup(resp.content, "lxml")
        title = soup.head.title.string
        sametheme = []
        # range(1, limit+1): the original range(count+1) plus i = i + 1
        # accidentally probed 51 pages (1..51).
        for page in range(1, page_limit + 1):
            suburl = "{}/{}".format(url, page)
            # The original passed a stray positional "usert" argument,
            # which requests.get treated as a query string — dropped.
            resp = requests.get(suburl)
            subsoup = BeautifulSoup(resp.content, "lxml")
            try:
                img = subsoup.figure.a.find("img", {"src": img_re})
            except AttributeError:
                # page without the expected <figure><a><img> layout —
                # treat as end of the gallery
                break
            if img is None:
                break
            sametheme.append(img["src"])
        download(title, sametheme)
def download(title, srcset, root="d:\\pic"):
    """Save every image in *srcset* into a directory named after *title*.

    Args:
        title: gallery title, used as the sub-directory name.
            NOTE(review): may contain characters invalid in Windows
            directory names — not sanitized here, matching the original.
        srcset: iterable of image URLs to fetch.
        root: base download directory; the default keeps the original
            hard-coded ``d:\\pic`` location (new, backward-compatible
            parameter).

    Returns:
        0 if the target directory already exists (gallery skipped),
        otherwise None after all images are written.
    """
    dirpath = os.path.join(root, title)
    try:
        # makedirs also creates the root if missing; the original bare
        # "except:" silently swallowed *every* OS error, not just
        # "directory exists".
        os.makedirs(dirpath)
    except FileExistsError:
        print("已存在")
        return 0
    for count, src in enumerate(srcset, start=1):
        filepath = os.path.join(dirpath, "{}.jpg".format(count))
        # The image host validates the Referer header; send the image
        # URL itself via header().
        with open(filepath, "wb") as picture:
            picture.write(requests.get(src, headers=header(src)).content)
    print("-------------------------------->>>>>>>>>>>>>>>>>>>>>>>>>")
def main():
    """Start one crawler thread plus ten downloader worker threads."""
    crawler = threading.Thread(target=geturl)
    crawler.start()
    workers = [threading.Thread(target=sort_theme) for _ in range(10)]
    for worker in workers:
        worker.start()
# Script entry point: only run when executed directly, not on import.
if __name__ == "__main__":
    main()
总结:
基础:网络请求,html 文档解析,线程,文件操作,正则表达式
使用工具:bs4 、requests、lxml、threading
问题:
1. geturl() 运行过快,大量 url 来不及进入解析函数 sort_theme();
2. URLPOOL 设计不合理:用列表做共享队列,增长速度太快,且多线程无锁访问;
3. sort_theme() 设计不合理:1)重复请求页面,造成巨大浪费;2)url 解析不完全;
4. 整体结构混乱。
网友评论