
Python: collecting used-car listings from 58同城 (58.com)

Author: 盛夏光年ing | Published 2019-04-23 09:40

    This post is a scraper for used-car listings on 58同城 (58.com): it collects each car's price and basic information and saves them to a CSV file.

    Created: 2019-04-22 10:20. It is quite simple, but worth sharing anyway.

    import re
    import math
    import requests
    from scrapy import Selector
    
    
    def start_request():
        """
        Start the crawler: fetch the listing index, work out the total page count, and parse every page.
        :return:
        """
        index_url = 'https://quanguo.58.com/ershouche/'
        index_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Referer': 'https://quanguo.58.com/ershouche/',
        }
        index_response = requests.get(url=index_url, headers=index_headers)
        if index_response.status_code == 200:
            selector_response = Selector(text=index_response.text)
            all_info = selector_response.xpath('//tr')
    
            all_car_info_total = selector_response.xpath('//p[@id="infocont"]/strong/text()').extract_first()
            all_page = math.ceil(int(all_car_info_total) / 50)
            print('%s pages in total' % all_page)
            get_car_info(all_info)  # parse the first page
            for page_ in range(2, all_page + 1):  # +1 so the last page is included
                print('Scraping page %s' % page_)
                page_url = 'https://quanguo.58.com/ershouche/pn%s/' % str(page_)
                page_response = requests.get(url=page_url, headers=index_headers)
                if page_response.status_code == 200:
                    selector_response = Selector(text=page_response.text)
                    all_info = selector_response.xpath('//tr')
                    get_car_info(all_info)  # parse this page
    
    
    def get_car_info(all_info):
        """
        Parse the extracted listing rows and append each car's fields to the CSV file.
        :param all_info:
        :return:
        """
        for each_info in all_info[1:]:
            car_info = each_info.xpath('td[2]/a//text()').extract()
            car_log = car_info[0] if len(car_info) >= 2 else ''  # car brand, e.g. Hyundai, Volkswagen, Nissan
            car_model = car_info[1] if len(car_info) >= 2 else ''  # car model, e.g. 索纳塔 2011款 2.0L 自动尊贵版 (Sonata 2011 2.0L automatic)
            base_car_info = each_info.xpath('td[2]/p//text()').extract()  # basic information about the car
            buy_year = re.findall(r'.*\t(\w+)\t', base_car_info[0])[0]  # year of purchase
            travelling_kilometers = re.findall(r'(.*)\t', base_car_info[2])[0]  # mileage driven (km)
            displacement = re.findall(r'(.*)\t', base_car_info[4])[0]  # engine displacement in litres
            car_type = re.findall(r'(.*)\t', base_car_info[6])[0]  # automatic or manual transmission
            car_price = each_info.xpath('td[3]/b/text()').extract_first()  # price of the car
            car_price = car_price + '万元' if car_price else ''  # append the unit 万元 (10,000 RMB)
            car_safety = each_info.xpath('td[4]//a/text()').extract_first()  # whether the vehicle licence is verified
            list_info = [car_log, car_model, buy_year, travelling_kilometers, displacement, car_type, car_price, car_safety]
            all_write = ','.join('%s' % each_ for each_ in list_info)
            with open('car_info.csv', 'a+', encoding='utf-8') as f:
                print('Writing row to car_info.csv ...')
                f.write(all_write + '\n')
    
    
    if __name__ == '__main__':
        start_request()
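
    One caveat: the fields above are joined with a plain ','.join, which produces a malformed row if any field itself contains a comma, and the file is reopened for every row. As a minimal alternative sketch (not from the original post), the standard csv module handles quoting automatically; the write_row helper and the English header names below are my own illustrative assumptions:

    import csv


    CSV_FILE = 'car_info.csv'  # same output file as in the script above
    HEADER = ['brand', 'model', 'buy_year', 'kilometers', 'displacement',
              'transmission', 'price', 'licence_verified']


    def write_row(row, write_header=False):
        """Append one row; csv.writer quotes any field that contains a comma."""
        with open(CSV_FILE, 'a+', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            if write_header:
                writer.writerow(HEADER)
            writer.writerow(row)

    Inside get_car_info, the ','.join and the manual f.write could then be replaced by a single write_row(list_info) call.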
    
    

    A very simple write-up. You are welcome to visit my CSDN account: https://blog.csdn.net/weixin_42812527
