#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2020/7/5 15:10
# @File : chezhiwangspider
# @Software: PyCharm
# Complaint list pages: http://www.12365auto.com/zlts/0-0-0-0-0-0_0-0-0-0-0-0-0-7.shtml
# Total number of list pages (loop bound below): 10510
import random
import time
from datetime import datetime

import pymysql
import requests
from bs4 import BeautifulSoup as bs
from fake_useragent import UserAgent

ua = UserAgent()

# MySQL connection settings
db_config = {
    'host': '127.0.0.1',
    'port': 3306,
    'user': 'root',
    'password': '*******',
    'charset': 'utf8',
    'db': 'chezhiwang',
}
conn = pymysql.connect(**db_config)
cur = conn.cursor()
sql = "insert into chezhiwang.complaint(complaint_id, car_brand, car_series, car_model, description, topical_prob, cp_tm, cp_status, crawler_tm) " \
"values (%s,%s,%s,%s,%s,%s,%s,%s,%s)"
headers = {
    'Host': 'www.12365auto.com',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    'User-Agent': ua.random,
}


def get_html(url):
    """Fetch a list page and return it as a parsed BeautifulSoup tree."""
    web_data = requests.get(url=url, headers=headers)
    soup = bs(web_data.text, 'lxml')
    return soup


if __name__ == '__main__':
    flag = 0
    # Row buffer; kept across pages so all 10 pages of a batch get written,
    # not just the last one
    alist = []
    for i in range(1, 10510):
        url = "http://www.12365auto.com/zlts/0-0-0-0-0-0_0-0-0-0-0-0-0-{}.shtml".format(i)
        flag += 1
        soup = get_html(url)
        # The first <tr> is the table header; skip it
        nodes = soup.select('div.tslb_b table tr')[1:]
        for node in nodes:
            tds = node.select('td')
            # Complaint ID
            complaint_id = tds[0].text
            # Complained brand
            car_brand = tds[1].text
            # Car series
            car_series = tds[2].text
            # Car model
            car_model = tds[3].text
            # Problem summary
            description = tds[4].text
            # Typical problem
            topical_problem = tds[5].text
            # Complaint time
            complain_tm = tds[6].text
            # Complaint status
            complain_status = tds[7].text
            crawler_tm = str(datetime.now())
            alist.append([complaint_id, car_brand, car_series, car_model, description,
                          topical_problem, complain_tm, complain_status, crawler_tm])
        # Flush the buffered rows to the database every 10 pages
        if flag % 10 == 0:
            for row in alist:
                try:
                    cur.execute(sql, tuple(row))
                    conn.commit()
                except Exception as e:
                    print(e)
            alist = []
        time.sleep(random.randint(1, 3))
        print(flag, datetime.now(), url)
    # Flush whatever is left from the final partial batch, then close
    for row in alist:
        try:
            cur.execute(sql, tuple(row))
            conn.commit()
        except Exception as e:
            print(e)
    conn.close()