这几天一直被正则卡住:正则表达式里少写了一个逗号,结果什么都匹配不到。
下面这个是爬取今日头条的街拍图片URL和title,上图
有很多None,不是很清楚是什么原因

本文用到了 md5(对图片内容做指纹,用作去重的文件名)和进程池(multiprocessing.Pool,并发抓取多页)。
# -*- coding:utf-8 -*-
import requests
import json
import re
import os
from hashlib import md5
from bs4 import BeautifulSoup
from urllib.parse import urlencode
from requests.exceptions import RequestException
from multiprocessing import Pool
获取今日头条搜索页面,keyword 和 offset分别为搜索关键字和显示的条数
配置搜索url
def get_one_page(offset, keyword):
    """Fetch one page of Toutiao search results as raw JSON text.

    Args:
        offset: pagination offset (the site pages in steps of 20).
        keyword: search keyword, e.g. '街拍'.

    Returns:
        The response body (str) on HTTP 200, otherwise None.
        Also returns None (after printing 'error') on a request failure.
    """
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': '1',
        'from': 'search_tab',
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(params)
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('error')
        return None
获取搜索页面的街拍url
动态页面,json 数据格式,用生成器来逐条生成article_url
def parse_page_index(html):
    """Yield article URLs from a JSON search-results page.

    Args:
        html: raw JSON text from get_one_page; may be None if the
            request failed upstream.

    Yields:
        Article URL strings. Entries without an 'article_url' key are
        skipped (these were the mysterious Nones in the output).
    """
    if not html:
        return  # upstream fetch failed; nothing to parse
    data = json.loads(html)
    if data and 'data' in data:
        for item in data.get('data'):
            article_url = item.get('article_url')
            if article_url:
                yield article_url
获取页面html
def get_page_detail(url):
    """Download an article page and return its HTML, or None on failure."""
    ua = ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
          '(KHTML, like Gecko) Chrome/64.0.3278.0 Safari/537.36')
    try:
        response = requests.get(url, headers={'user-agent': ua})
    except RequestException:
        print('error')
        return None
    # Anything other than a plain 200 is treated as a miss.
    if response.status_code == 200:
        return response.text
    return None
传入 html,用正则取出其中 articleInfo: {content: ***} 里的内容;取出的内容很乱,不是一般的 html 页面,需要再用一次正则从中提取出图片 url。
def parse_page_detail(html, url):
    """Extract the title and image URLs from an article page.

    Downloads each found image via download_image as a side effect.

    Args:
        html: the article page HTML.
        url: the article's own URL, echoed back in the result.

    Returns:
        dict with 'title', 'url_list' and 'url' when the articleInfo
        blob was found, otherwise None.
    """
    soup = BeautifulSoup(html, 'lxml')
    title = soup.select('title')[0].text
    # The image data lives in an inline "articleInfo: {content: ...}" JS
    # blob, not in regular HTML — hence the two-stage regex extraction.
    info_pattern = re.compile(r'articleInfo: \{.*?,(.*?),', re.S)
    image_pattern = re.compile(r'(http://.*?)&')
    result = info_pattern.search(html)
    if result:
        url_list = image_pattern.findall(result.group(1).strip())
        # BUGFIX: the original reused the name `url` for the loop
        # variable, clobbering the article URL returned below.
        for image_url in url_list:
            if image_url:
                download_image(image_url)
        return {
            'title': title,
            'url_list': url_list,
            'url': url,
        }
    return None
传入获取的图片url,用save_image函数进行存储,
response.text 是str, 为文本编码,response.content 是bytes, 二进制
def download_image(url):
    """Fetch a single image and persist it via save_image.

    Args:
        url: direct image URL.

    Returns:
        None always; failures are reported by printing, not raising.
    """
    # BUGFIX: message typo 'downloing' -> 'downloading'.
    print('downloading:', url)
    try:
        response = requests.get(url)
        # response.content is bytes, which is what save_image expects;
        # response.text would be a decoded str and corrupt the image.
        if response.status_code == 200:
            save_image(response.content)
        return None
    except RequestException:
        print('download image error ', url)
        return None
def save_image(content):
    """Write image bytes to the current directory, named by content hash.

    Using md5(content) as the filename makes duplicate images collapse
    onto one file, so the existence check doubles as deduplication.

    Args:
        content: raw image bytes (response.content).
    """
    file_path = '{0}/{1}.{2}'.format(os.getcwd(), md5(content).hexdigest(), 'jpg')
    if not os.path.exists(file_path):
        # 'wb' because content is bytes; the with-block closes the file,
        # so the original's explicit f.close() was redundant and removed.
        with open(file_path, 'wb') as f:
            f.write(content)
传入offset
def main(offset):
    """Crawl one search-results page of '街拍' and download its images.

    Args:
        offset: pagination offset passed through to get_one_page.
    """
    html = get_one_page(offset, '街拍')
    for url in parse_page_index(html):
        if url:
            result = get_page_detail(url)
            # BUGFIX: get_page_detail can return None on failure; the
            # original fed that None straight into BeautifulSoup.
            if result:
                dictionary = parse_page_detail(result, url)
                print(dictionary)
if __name__ == "__main__":
    # Offsets 20, 40, ..., 380 — one search page of 20 results each.
    groups = [x * 20 for x in range(1, 20)]
    pool = Pool()
    pool.map(main, groups)
    # BUGFIX: close and join the pool so worker processes exit cleanly
    # instead of being abandoned when the parent terminates.
    pool.close()
    pool.join()
网友评论