For learning and exchange purposes only.
#coding=utf-8
import json
import requests
import re
import os
from multiprocessing import Pool
from urllib.parse import urlencode
from fake_useragent import UserAgent
from hashlib import md5
from bs4 import BeautifulSoup
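# Flow: query Toutiao's search API page by page for the keyword, collect
# the article URL of every hit, scrape each article's inline gallery JSON,
# and download all images it lists.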
ua = UserAgent()
keyword = "街拍"

# Fetch one page of search results and return the article URLs it contains.
def get_page(offset):
    params = {
        'offset': offset,   # pagination offset into the result list
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': 20,        # results per page
    }
    base = "https://www.toutiao.com/api/search/content/?"
    url = base + urlencode(params)
    content = get_content(url)
    if content is None:     # guard: json.loads(None) would raise TypeError
        return None
    data = json.loads(content)
    if data and "data" in data:
        article_list = data.get('data')
        return [item.get('article_url') for item in article_list]
    return None
# Append a result record to res.txt (one JSON object per line).
def write_to_file(content):
    with open("res.txt", "a", encoding="utf-8") as f:
        f.write(content)
# Parse an article page: extract the title and the gallery image URLs.
def parse_page_image(url):
    content = get_content(url)
    if content is None:
        return
    # Page title from the <title> tag.
    soup = BeautifulSoup(content, 'lxml')
    title = soup.select('title')[0].get_text()
    # The gallery data sits in an inline script as JSON.parse("...").
    pattern = re.compile(r'gallery: JSON\.parse\("(.*?)"\),', re.S)
    for item in pattern.findall(content):
        # Decode the JavaScript string escapes (\" \\ \uXXXX) before parsing
        # as JSON; this replaces the original eval() round-trip, which could
        # execute arbitrary page content.
        item = item.encode('utf-8').decode('unicode_escape')
        data = json.loads(item)
        if data and "sub_images" in data:
            img_urls = [img.get("url") for img in data.get('sub_images')]
            record = {
                'title': title,
                'imgList': img_urls,
                'url': url,
            }
            write_to_file(json.dumps(record, ensure_ascii=False) + "\n")
            for img_url in img_urls:
                get_img(img_url)
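# For reference, the regex in parse_page_image targets an inline script of
# roughly this shape (illustrative, abbreviated):
#
#   gallery: JSON.parse("{\"sub_images\":[{\"url\":\"http://...\"}]}"),
#
# The payload is a JavaScript string literal, so its quotes arrive escaped;
# that is why the captured text is unescaped before json.loads().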
# Save image bytes under ./img/, named by MD5 hash to deduplicate.
def save_img(content):
    img_dir = os.path.join(os.getcwd(), "img")
    os.makedirs(img_dir, exist_ok=True)  # the original assumed ./img existed
    path_file = "{0}/{1}.{2}".format(img_dir, md5(content).hexdigest(), "jpg")
    print(path_file)
    with open(path_file, "wb") as f:
        f.write(content)
# Download a remote image.
def get_img(url):
    try:
        headers = {'User-Agent': ua.chrome}
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            save_img(response.content)
    except requests.RequestException:
        # Skip images that fail to download.
        pass
# Fetch a URL and return its body as text (None on failure).
def get_content(url):
    try:
        headers = {'User-Agent': ua.chrome}
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except requests.RequestException:
        return None
def main(offset):
    items = get_page(offset)
    if items:
        for item in items:
            parse_page_image(item)

if __name__ == '__main__':
    pool = Pool()
    # Each page holds 20 results (count=20), so step the offsets by 20;
    # the original stepped by 10, which fetched overlapping pages.
    pool.map(main, [i * 20 for i in range(10)])
    pool.close()
    pool.join()
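Each line written to res.txt is a standalone JSON object, so the results can be read back one record at a time. A minimal sketch, assuming the record layout produced by parse_page_image above:

import json

with open("res.txt", encoding="utf-8") as f:
    for line in f:
        record = json.loads(line)
        print(record["title"], len(record["imgList"]), record["url"])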
Source: https://rumenz.com/rumenbiji/python-requests-multiprocessing.html