有什么爬虫的问题都可以私信我哦,很乐意为你效劳,如果我有空的话!
该文仅供学习参考!!!
pyppeteer爬取唯品会数据
爬取内容:唯品会口红数据
采集站点:https://category.vip.com/suggest.php?keyword=%E5%8F%A3%E7%BA%A2
存储:csv表格
涉及知识点:pyppeteer自动化、pandas的使用、csv存储、异步(asyncio)、loguru日志、xpath解析、ip代理
代码:
# -*- coding: utf-8 -*-
# @Time : 2022/5/1 12:39
# @Author : Lonelyroots
# @Email : Lonelyroots@qq.com
# @File : pyppeteer爬取唯品会数据.py
# @Software : PyCharm
import asyncio
import requests
from pyppeteer import launch
from lxml import etree
from loguru import logger
import pandas as pd
class Wph():
def __init__(self, url, name, proxies=None, headers=None):
    """Store the crawl target and the HTTP settings used by ``request``.

    Args:
        url: Address of the page to fetch.
        name: Search keyword; ``request`` sends it as the ``keyword``
            query parameter.
        proxies: Optional ``requests``-style proxy mapping. When omitted,
            the single HTTP proxy that used to be hard-coded here is used.
        headers: Optional request headers. When omitted, a desktop Chrome
            ``user-agent`` header is used.
    """
    self.url = url
    self.name = name
    if headers is None:
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36',
        }
    self.headers = headers
    # Return the first element of a sequence (e.g. an xpath result list),
    # or an empty string when the sequence is empty.
    self.hadlnone = lambda x: x[0] if x else ''
    if proxies is None:
        proxies = {
            'http': '114.239.149.47:9999',
        }
    # The attribute keeps its original spelling because request() reads it.
    self.proxie = proxies
def request(self, url, timeout=10):
    """Send a GET request for ``url`` and return the response on success.

    The instance's keyword is sent as the ``keyword`` query parameter,
    together with the configured headers and proxies.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the crawl indefinitely.

    Returns:
        The ``requests`` response when its status is OK, otherwise ``None``.

    Raises:
        requests.RequestException: Propagated unchanged on network errors
            or when the timeout expires.
    """
    res = requests.get(
        url,
        params={'keyword': self.name},
        headers=self.headers,
        proxies=self.proxie,
        # NOTE(review): TLS certificate verification is disabled here;
        # confirm this is intended before using outside of experiments.
        verify=False,
        timeout=timeout,
    )
    if res.ok:
        return res
    # Make the failure visible instead of silently returning None.
    logger.warning(f'request to {url} failed with status {res.status_code}')
    return None
async def main(self, url):
    """Open ``url`` in a pyppeteer-launched browser and return its HTML.

    Args:
        url: Address of the page to load.

    Returns:
        The page content (HTML) as returned by ``page.content()``.
    """
    browser = await launch()
    try:
        page = await browser.newPage()
        await page.goto(url)
        # HTML of the loaded page
        return await page.content()
    finally:
        # Always shut the browser down, even if navigation fails, so the
        # browser process is not leaked.
        await browser.close()
def spider(self):
'''
Core scraping routine.

Creates an empty five-column DataFrame, fetches self.url through
self.request(), extracts product fields with xpath, logs them, appends
them to the frame and writes the frame to a CSV file.

NOTE(review): part of this method is elided ("......") in the listing, so
the code that defines item, pingpai, title and y_price -- and any loop
around the extraction -- is not visible here.
:return: nothing is returned by the visible code
'''
df = pd.DataFrame(columns=['品牌', '标题', '原价', '现价', '折扣'])
res = self.request(self.url)
# Locate the brand information and build the corresponding request URLs
......
# Sale price: first text node of the sale-price element, or '' when absent
x_price = self.hadlnone(item.xpath('.//div[contains(@class,"c-goods-item__sale-price")]/text()'))
# Discount: first text node of the discount element, or '' when absent
zk = self.hadlnone(item.xpath('.//div[contains(@class,"c-goods-item__discount")]/text()'))
logger.info(f'品牌{pingpai},标题{title},原价{y_price},现价{x_price},折扣{zk}')
# Build the row as a dict keyed by the DataFrame column names
pro = {
'品牌': pingpai,
'标题': title,
'原价': y_price,
'现价': x_price,
'折扣': zk
}
# NOTE(review): DataFrame.append was deprecated in pandas 1.4 and removed in
# pandas 2.0; pd.concat is the documented replacement.
df = df.append([pro])
# Write the collected rows to CSV without the index column
df.to_csv('唯品会数据.csv', index=False)
def __del__(self):
# Intentionally a no-op: the browser launched in main() is a local variable
# of that coroutine, so there is no handle on the instance to close here.
pass
if __name__ == '__main__':
    # Imported locally so the entry point brings its own dependency into scope.
    from urllib.parse import quote

    # Search keyword to crawl
    name = '口红'
    # Percent-encode the keyword so non-ASCII text and reserved characters
    # (&, #, spaces) always produce a valid query string.
    url = "https://category.vip.com/suggest.php?keyword=" + quote(name)
    c = Wph(url, name)
    c.spider()
需要完整代码可以私信我哦!
网友评论