Dianping (大众点评) car channel crawler: requests with multithreading

Author: sexy_cyber | Published 2018-06-18 09:17

When requesting the shop detail pages from each listing page, 15 child threads are spawned (each listing page contains 15 shops).

This first version does not call join():

There is a problem here: the main thread does not wait for the 15 detail-page child threads to finish (it never blocks), so it keeps paging through the listing pages and spawning more child threads to parse detail pages. On a slow machine, with a poor network connection or aggressive anti-crawling on the site, there can easily be over a thousand child threads running at the same time, which puts considerable load on the server. Another problem is that the downloaded data comes out jumbled: the first row saved might be the first shop of the first listing page, while the second row is the 15th shop of the 50th listing page. So this needs to be optimized.
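If you want to keep the main thread non-blocking but still cap how many detail threads can be alive at once, one option is to guard thread creation with a semaphore. This is only a minimal sketch of that idea, not the code used in this article; fetch_detail is a hypothetical stand-in for the real detail-page worker:

#coding:utf-8
import threading

MAX_WORKERS = 15                          # at most 15 detail threads alive at any moment
slots = threading.BoundedSemaphore(MAX_WORKERS)

def fetch_detail(url):
    # hypothetical worker: download and parse one shop detail page
    print('fetching', url)

def guarded_worker(url):
    try:
        fetch_detail(url)
    finally:
        slots.release()                   # free the slot even if the worker raises

def spawn_detail_thread(url):
    slots.acquire()                       # blocks only when MAX_WORKERS threads are running
    threading.Thread(target=guarded_worker, args=(url,)).start()

With this, the main thread only stalls when all 15 slots are in use, instead of waiting for a whole page's worth of threads to finish.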

Throughput is roughly 40.5 records per minute, i.e. 40.5 × 60 × 24 = 58,320 records per day (24 hours). That still feels a bit slow, considering it amounts to something like 75 child threads running at once.

The version below, with join() added, is even slower.

#coding:utf-8
import hashlib
import time
from fake_useragent import UserAgent
import requests
# from UA import data
import json
import scylla_test
import kuai_test
from lxml import etree
import re
import csv
from shantou_links import *
import random
from cookie_parse import GetComments
import threading
L = threading.Lock()


class Luoyang:
    def __init__(self):
        self.city_name = '洛阳'

    # Crawler entry point: start crawling
    def get_first_page(self):
        ua = UserAgent()
        print('全部爬虫开始工作,从后往前')
        data = ['id从后往前','店铺名称','所属城市','行政区','地址','二级分类','三级分类','电话','点评数量','最新点评时间']
        with open('luoyang_dianping_car.csv', 'a', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(data)
        field = '买车卖车'
        urls = self.get_target_urls()
        self.start(urls)
        print('完成')

    # Collect all the starting URLs
    def get_target_urls(self):
        # beau_urls = self.get_target_url(url='http://www.dianping.com/luoyang/ch65/g34072')
        # mend_urls = self.get_target_url(url='http://www.dianping.com/luoyang/ch65/g176')
        match_urls = self.get_target_url(url='http://www.dianping.com/luoyang/ch65/g177')
        # result = mend_urls + beau_urls + match_urls
        result = match_urls
        print(result)
        return result

    def get_target_url(self, url):
        field = '买车'
        while True:
            headers = {

            }
            content = self.url_start(url,headers,field)

            if content.status_code == 200 and field in content.text:
                tree = etree.HTML(content.text)
                urls = tree.xpath('//div[@id="region-nav"]/a/@href')
                return urls


    # Iterate over all starting URLs, make requests, and save the data
    def start(self,urls):
        for first_url in urls:
            field = '买车卖车'


            # Retry until we get a valid response
            while True:

                headers = {

                }
                response = self.url_start(first_url,headers,field)
                print(response.status_code)
                print(response.text)
                if response.status_code == 200 and field in response.text:
                    print('返回200,页面有需求的字段')
                    break
                else:
                    print('该请求失败,准备重试')
                    time.sleep(2)
            if 'g34084' in first_url:
                type = '4s'
            elif 'g34085' in first_url:
                type = '综合经销商'
            elif 'g34086' in first_url:
                type = '二手车'
            else:
                type = 0
            self.parse_firs_full_page(response, type)


    # Get the page count; save all shops on the first page, then page through and save each page; if there is no type, pass type=0
    def parse_firs_full_page(self,response,type=0):
        tree = etree.HTML(response.text)
        try:
            pages = tree.xpath('//a[@class="PageLink"]/text()')[-1]
            pages = int(pages)
        except:
            pages = 0
        # Save the first page's data
        # url_t = 'http://www.dianping.com/luoyang/ch65/g34085'
        # if url_t not in response.url:
        #     print('经销商页面进来了,首页不保存')
        self.one_full_page(response,type)
        # Turn the page and keep saving
        self.turn_page(response.url,pages,type)

    # Page through and save every shop on each page
    def turn_page(self, url, pages, type):
        if pages > 0:
            for i in range(2, pages + 1):
                time.sleep(5)
                start_url = url + 'p{}'
                action_url = start_url.format(i)
                if i == 2:
                    headers = {
                        'Referer': url,
                        'Host': 'www.dianping.com'
                    }
                else:
                    headers = {
                        'Referer': start_url.format(i - 1),
                        'Host': 'www.dianping.com'
                    }
                print(headers)
                field = '买车卖车'
                while True:
                    response = self.url_start(action_url, headers, field)
                    print(response.status_code)
                    # print(response.text)
                    if response.status_code == 200 and field in response.text:
                        print('该链接请求成功')
                        break
                    else:
                        print('请求失败')
                        time.sleep(2)
                self.one_full_page(response, type)

    # Make a new request and return the response
    def url_start(self,url,headers,field):
        while True:
            try:
                # Catch proxy timeout exceptions
                times = int(time.time())
                planText = "orderno=隐藏,secret=b5dd53126b3143fba00dda5fec6b9607,timestamp={}".format(times)
                md = hashlib.md5()
                md.update(planText.encode('utf-8'))
                content = md.hexdigest()
                ua = UserAgent()
                headers['User-Agent'] = ua.random
                headers['Proxy-Authorization'] = 'sign={}&orderno=ZF20186170227TPgMj4&timestamp={}'.format(content.upper(), times)

                proxies = {'http': 'forward.xdaili.cn:80'}
                response = requests.get(url, proxies=proxies, headers=headers)
                return response
            except:
                print ('代理超时,重试.....')



    # Download and save the details of every shop on a single listing page
    def one_full_page(self,response,type=0):
        tree = etree.HTML(response.text)
        business_li = tree.xpath('//div[@class="pic"]/a/@href')

        headers = {
            'Referer': response.url,
            'Host': 'www.dianping.com'
        }
        print(headers)
        if len(business_li) > 0:
            for business in business_li:
                id = re.findall(r'/shop/(\d+)', business)[0]
                t = threading.Thread(target=self.parse_detail,args=(business,id,headers,type))
                t.start()
        else:
            print('该页面没有店铺')

    # Parse a shop detail page and save the data; used by one_full_page
    def parse_detail(self,url,id,headers,type=0):
        field = '地址'
        while True:

            response = self.url_start(url,headers,field)
            print(headers)
            if response.status_code == 200 and len(response.text)>0 and field in response.text:
                print('请求成功200')
                break
            else:
                print('请求失败,重试')
                time.sleep(2)

        content = response.text
        # print(content)
        try:
            tree = etree.HTML(content)
            # print('详情页数据',content)
            name = tree.xpath('//h1[@class="shop-name"]/text()')[0]
            city = self.city_name

            district_list = tree.xpath('//div[@class="breadcrumb"]/a/text()')
            district = ''
            for i in district_list:
                if '区' in i:
                    district = i
            address = tree.xpath('//span[@itemprop="street-address"]/text()')[0].strip()
            second_type =tree.xpath('//div[@class="breadcrumb"]/a/text()')[1]
            if type == 0:
                third_type = ''
            else:
                third_type = type
            try:
                tel = tree.xpath('//p[@class="expand-info tel"]/span[@itemprop="tel"]/text()')[0]
            except:
                tel = ''
            comment_num = tree.xpath('//span[@id="reviewCount"]/text()')[0]
            latest_time = self.get_commets_time(id)
            info_list = [id,name,city,district,address,second_type,third_type,tel,comment_num,latest_time]
            # Save one shop's details
            L.acquire()
            with open('luoyang_dianping_car.csv', 'a', newline='', encoding='utf-8') as f:
                writer = csv.writer(f)
                writer.writerow(info_list)
            L.release()
            print('单个店铺详情保存成功')
        except:
            print('详情页没有数据')


    # Get the date of the latest comment; used by parse_detail
    def get_commets_time(self,id):
        getcomments = GetComments(id)
        lasttime = getcomments.get_lasttime()
        return lasttime


if __name__ == '__main__':
    luoyang = Luoyang()
    luoyang.get_first_page()

Adding join() to tidy up the multithreading:

This makes the downloaded data much more orderly and puts far less pressure on the server.
The links that are commented out in the code are categories that had already been crawled; this run only covered the remaining part.
Only this one block of the code was changed:

            threads = []
            for business in business_li:
                id = re.findall(r'/shop/(\d+)', business)[0]
                t = threading.Thread(target=self.parse_detail,args=(business,id,headers,type))
                threads.append(t)
                t.start()
            for thread in threads:
                thread.join()
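
The same "start a batch, then wait for the whole batch" pattern can also be expressed with concurrent.futures, which reuses a fixed pool of worker threads instead of creating 15 new ones per listing page. This is just a sketch, not the code the article actually ran; crawl_listing_page and the parse_detail parameter are hypothetical names:

from concurrent.futures import ThreadPoolExecutor
import re

def crawl_listing_page(parse_detail, business_li, headers, type=0):
    # parse_detail: the per-shop worker (e.g. a bound Luoyang.parse_detail method)
    # leaving the with-block waits for every submitted task, like start() + join()
    with ThreadPoolExecutor(max_workers=15) as pool:
        for business in business_li:
            id = re.findall(r'/shop/(\d+)', business)[0]
            pool.submit(parse_detail, business, id, headers, type)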

Here is the complete code:

#coding:utf-8
import hashlib
import time
from fake_useragent import UserAgent
import requests
# from UA import data
import json
import scylla_test
import kuai_test
from lxml import etree
import re
import csv
from shantou_links import *
import random
from cookie_parse import GetComments
import threading
L = threading.Lock()


class Luoyang:
    def __init__(self):
        self.city_name = '洛阳'

    # Crawler entry point: start crawling
    def get_first_page(self):
        ua = UserAgent()
        print('全部爬虫开始工作,从后往前')
        data = ['id从配件厂老城区开始','店铺名称','所属城市','行政区','地址','二级分类','三级分类','电话','点评数量','最新点评时间']
        with open('luoyang_dianping_car.csv', 'a', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(data)
        field = '买车卖车'
        urls = self.get_target_urls()
        self.start(urls)
        print('完成')

    # Collect all the starting URLs
    def get_target_urls(self):
        # beau_urls = self.get_target_url(url='http://www.dianping.com/luoyang/ch65/g34072')
        # mend_urls = self.get_target_url(url='http://www.dianping.com/luoyang/ch65/g176')
        match_urls = self.get_target_url(url='http://www.dianping.com/luoyang/ch65/g177')
        # result = mend_urls + beau_urls + match_urls
        result = match_urls
        print(result)
        return result

    def get_target_url(self, url):
        field = '买车'
        while True:
            headers = {

            }
            content = self.url_start(url,headers,field)

            if content.status_code == 200 and field in content.text:
                tree = etree.HTML(content.text)
                urls = tree.xpath('//div[@id="region-nav"]/a/@href')
                return urls


    # Iterate over all starting URLs, make requests, and save the data
    def start(self,urls):
        for first_url in urls[2:]:
            field = '买车卖车'
            # Retry until we get a valid response
            while True:

                headers = {

                }
                response = self.url_start(first_url,headers,field)
                print(response.status_code)
                print(response.text)
                if response.status_code == 200 and field in response.text:
                    print('返回200,页面有需求的字段')
                    break
                else:
                    print('该请求失败,准备重试')
                    time.sleep(2)
            if 'g34084' in first_url:
                type = '4s'
            elif 'g34085' in first_url:
                type = '综合经销商'
            elif 'g34086' in first_url:
                type = '二手车'
            else:
                type = 0
            self.parse_firs_full_page(response, type)

    # Get the page count; save all shops on the first page, then page through and save each page; if there is no type, pass type=0
    def parse_firs_full_page(self,response,type=0):
        tree = etree.HTML(response.text)
        try:
            pages = tree.xpath('//a[@class="PageLink"]/text()')[-1]
            pages = int(pages)
        except:
            pages = 0
        # Save the first page's data
        # url_t = 'http://www.dianping.com/luoyang/ch65/g34085'
        # if url_t not in response.url:
        #     print('经销商页面进来了,首页不保存')
        self.one_full_page(response,type)
        # Turn the page and keep saving
        self.turn_page(response.url,pages,type)

    # Page through and save every shop on each page
    def turn_page(self, url, pages, type):
        if pages > 0:
            for i in range(2, pages + 1):
                time.sleep(5)
                start_url = url + 'p{}'
                action_url = start_url.format(i)
                if i == 2:
                    headers = {
                        'Referer': url,
                        'Host': 'www.dianping.com'
                    }
                else:
                    headers = {
                        'Referer': start_url.format(i - 1),
                        'Host': 'www.dianping.com'
                    }
                print(headers)
                field = '买车卖车'
                while True:
                    response = self.url_start(action_url, headers, field)
                    print(response.status_code)
                    # print(response.text)
                    if response.status_code == 200 and field in response.text:
                        print('该链接请求成功')
                        break
                    else:
                        print('请求失败')
                        time.sleep(2)
                self.one_full_page(response, type)

    # Make a new request and return the response
    def url_start(self,url,headers,field):
        while True:
            try:
                # Catch proxy timeout exceptions
                times = int(time.time())
                planText = "orderno=ZF20186170227TPgMj4,secret=b5dd53126b3143fba00dda5fec6b9607,timestamp={}".format(times)
                md = hashlib.md5()
                md.update(planText.encode('utf-8'))
                content = md.hexdigest()
                ua = UserAgent()
                headers['User-Agent'] = ua.random
                headers['Proxy-Authorization'] = 'sign={}&orderno=ZF20186170227TPgMj4&timestamp={}'.format(content.upper(), times)

                proxies = {'http': 'forward.xdaili.cn:80'}
                response = requests.get(url, proxies=proxies, headers=headers)
                return response
            except:
                print ('代理超时,重试.....')

    # Download and save the details of every shop on a single listing page
    def one_full_page(self,response,type=0):
        tree = etree.HTML(response.text)
        business_li = tree.xpath('//div[@class="pic"]/a/@href')

        headers = {
            'Referer': response.url,
            'Host': 'www.dianping.com'
        }
        print(headers)
        if len(business_li) > 0:
            threads = []
            for business in business_li:
                id = re.findall(r'/shop/(\d+)', business)[0]
                t = threading.Thread(target=self.parse_detail,args=(business,id,headers,type))
                threads.append(t)
                t.start()
            for thread in threads:
                thread.join()
        else:
            print('该页面没有店铺')

    # Parse a shop detail page and save the data; used by one_full_page
    def parse_detail(self,url,id,headers,type=0):
        field = '地址'
        while True:

            response = self.url_start(url,headers,field)
            print(headers)
            if response.status_code == 200 and len(response.text)>0 and field in response.text:
                print('请求成功200')
                break
            else:
                print('请求失败,重试')
                time.sleep(2)
        content = response.text
        # print(content)
        try:
            tree = etree.HTML(content)
            # print('详情页数据',content)
            name = tree.xpath('//h1[@class="shop-name"]/text()')[0]
            city = self.city_name

            district_list = tree.xpath('//div[@class="breadcrumb"]/a/text()')
            district = ''
            for i in district_list:
                if '区' in i:
                    district = i
            address = tree.xpath('//span[@itemprop="street-address"]/text()')[0].strip()
            second_type =tree.xpath('//div[@class="breadcrumb"]/a/text()')[1]
            if type == 0:
                third_type = ''
            else:
                third_type = type
            try:
                tel = tree.xpath('//p[@class="expand-info tel"]/span[@itemprop="tel"]/text()')[0]
            except:
                tel = ''
            comment_num = tree.xpath('//span[@id="reviewCount"]/text()')[0]
            latest_time = self.get_commets_time(id)
            info_list = [id,name,city,district,address,second_type,third_type,tel,comment_num,latest_time]
            # Save one shop's details
            L.acquire()
            with open('luoyang_dianping_car.csv', 'a', newline='', encoding='utf-8') as f:
                writer = csv.writer(f)
                writer.writerow(info_list)
            L.release()
            print('单个店铺详情保存成功')
        except:
            print('详情页没有数据')

    # Get the date of the latest comment; used by parse_detail
    def get_commets_time(self,id):
        getcomments = GetComments(id)
        lasttime = getcomments.get_lasttime()
        return lasttime


if __name__ == '__main__':
    luoyang = Luoyang()
    luoyang.get_first_page()

Throughput drops to about 8.4 records per minute, i.e. 8.4 × 60 × 24 = 12,096 records per day (24 hours). That really is painfully slow, and that is with 15 child threads per listing page.

One thing to note:

Whenever child threads modify or assign to a shared global variable, you must take a lock. Reads, modifications and write-backs from different threads interleave: the first child thread may have just computed the new value of the global (but not yet assigned it back) when a second thread comes in and modifies it, and the final value of the global will then differ from what you expect.
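
A minimal sketch of that race, using a hypothetical shared counter: with the lock the final count is always 400000; remove the lock and the read-modify-write steps interleave, so the result usually comes out lower.

#coding:utf-8
import threading

counter = 0
lock = threading.Lock()

def bump(n):
    global counter
    for _ in range(n):
        with lock:                 # remove this lock and counter += 1 can race
            counter += 1

threads = [threading.Thread(target=bump, args=(100000,)) for _ in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()
print(counter)                     # 400000 with the lock; usually less without it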
