chezhiwang_spider

Author: overad | Published 2020-07-05 19:03
    #! /usr/bin/python3
    # -*- coding: utf-8 -*-
    # @Time : 2020/7/5 15:10
    # @File : chezhiwangspider
    # @Software: PyCharm


    # Complaint list page URL pattern, e.g.:
    # http://www.12365auto.com/zlts/0-0-0-0-0-0_0-0-0-0-0-0-0-7.shtml
    # 10510: total number of list pages (used as the loop bound in main below)
    
    
    
    import random
    import time
    from datetime import datetime

    import pymysql
    import requests
    from bs4 import BeautifulSoup as bs
    from fake_useragent import UserAgent

    ua = UserAgent()

    # MySQL connection settings (the password is redacted in the original post)
    db_config = {
        'host': '127.0.0.1',
        'port': 3306,
        'user': 'root',
        'password': '*******',
        'charset': 'utf8',
        'db': 'chezhiwang'
    }

    conn = pymysql.connect(**db_config)
    cur = conn.cursor()

    # Insert statement for one complaint record
    sql = "insert into chezhiwang.complaint(complaint_id, car_brand, car_series, car_model, description, topical_prob, cp_tm, cp_status, crawler_tm) " \
          "values (%s,%s,%s,%s,%s,%s,%s,%s,%s)"
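
    # The insert statement above assumes the chezhiwang.complaint table already
    # exists. A minimal sketch of a compatible schema (the column types here are
    # an assumption, not taken from the original post) can be created up front:
    cur.execute("""
        CREATE TABLE IF NOT EXISTS chezhiwang.complaint (
            complaint_id VARCHAR(20),
            car_brand    VARCHAR(50),
            car_series   VARCHAR(100),
            car_model    VARCHAR(100),
            description  TEXT,
            topical_prob VARCHAR(100),
            cp_tm        VARCHAR(20),
            cp_status    VARCHAR(20),
            crawler_tm   VARCHAR(30)
        ) DEFAULT CHARSET=utf8
    """)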
    
    url = 'http://www.12365auto.com/zlts/0-0-0-0-0-0_0-0-0-0-0-0-0-7.shtml'
    
    headers = {
        'Host': 'www.12365auto.com',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'User-Agent': ua.random,
    }
    
    def get_html(url):
        """Fetch a complaint list page and return it parsed as a BeautifulSoup tree."""
        web_data = requests.get(url=url, headers=headers)
        soup = bs(web_data.text, 'lxml')
        return soup
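
    # Note: ua.random in the headers dict is evaluated only once at import time,
    # so every page request reuses the same User-Agent string. A hypothetical
    # variant (not in the original post) that picks a fresh User-Agent per
    # request and adds a timeout could look like this:
    def get_html_rotating_ua(url):
        request_headers = {**headers, 'User-Agent': ua.random}
        web_data = requests.get(url=url, headers=request_headers, timeout=10)
        return bs(web_data.text, 'lxml')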
    
    
    
    
    if __name__ == '__main__':

        flag = 0
        alist = []

        for i in range(1, 10510):
            url = "http://www.12365auto.com/zlts/0-0-0-0-0-0_0-0-0-0-0-0-0-{}.shtml".format(str(i))
            flag += 1

            soup = get_html(url)
            nodes = soup.select('div.tslb_b table tr')[1:]

            for node in nodes:
                # complaint id
                complaint_id = node.select('td')[0].text
                # brand complained about
                car_brand = node.select('td')[1].text
                # car series
                car_series = node.select('td')[2].text
                # car model
                car_model = node.select('td')[3].text
                # short description of the problem
                description = node.select('td')[4].text
                # typical problem tags
                topical_problem = node.select('td')[5].text
                # complaint time
                complain_tm = node.select('td')[6].text
                # complaint status
                complain_status = node.select('td')[7].text
                # crawl timestamp
                crawler_tm = str(datetime.now())

                alist.append([complaint_id, car_brand, car_series, car_model, description,
                              topical_problem, complain_tm, complain_status, crawler_tm])

            # Write the accumulated rows to the database once every ten pages
            # (alist is initialised before the loop so no pages are dropped)
            if flag % 10 == 0:
                for row in alist:
                    try:
                        cur.execute(sql, tuple(row))
                        conn.commit()
                    except Exception as e:
                        print(e)

                alist = []
                time.sleep(random.randint(1, 3))
            print(flag, datetime.now(), url)

        # Flush whatever is left over from the last incomplete batch of pages
        for row in alist:
            try:
                cur.execute(sql, tuple(row))
                conn.commit()
            except Exception as e:
                print(e)

        conn.close()
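
The loop above commits one row per execute call. As a possible refinement (not part of the original script), pymysql's cursor.executemany can send each ten-page batch in a single call and commit once, for example:

    # Hypothetical helper: insert a whole batch in one round trip, then commit once.
    def flush_batch(rows):
        if rows:
            cur.executemany(sql, [tuple(r) for r in rows])
            conn.commit()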
    
