(Image: 关键词参考.png — reference for the keyword spreadsheet)
(Image: 爬虫结果参考,comment为空的是表情.png — sample scraper output; rows with an empty comment field are emoji-only replies)
# -*- coding: utf-8 -*-
"""
Created on Thu Aug 2 09:17:43 2018
@author: Shirley
"""
import requests
from lxml import etree
import random
import time
import csv
import xlrd
header = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36',
          'Cookie': 'TY_SESSION_ID=75f9338b-39d0-4304-a15c-3875dc632470; bdshare_firstime=1456884995197; _ga=GA1.3.434288974.1481159012; QuickMsgQuickId=l739www.xcar.com.cn20; _Xdwuv=5063286921331; _Xdwnewuv=1; _PVXuv=5b5956ad1167f; _fwck_www=364742ea15686b6835b21dcb2fa56e6a; _appuv_www=7b82bccdd767f93fb076aa0baf96b891; _fwck_my=95825f0ced1d6ef8501a9cc2494aabf8; _appuv_my=11a48715a1413ae3146d6d9dd0f66135; touch_thread=%5B32904319%2C32760685%2C32106749%2C30612733%2C30723865%2C31020013%2C30652364%2C30441650%2C30339528%2C30315077%2C30327279%2C30323484%2C30306706%2C30290867%2C30325140%2C30323447%2C30323354%2C30316990%2C30242778%2C30166775%2C30177375%2C30159504%2C30286550%2C30126636%2C30171166%2C30174450%2C30125812%2C30178410%2C30131636%2C30136840%2C30182408%2C30193439%2C30187866%2C30275684%2C30245605%2C30237837%2C30236160%2C30243821%2C29458143%2C29892723%2C29038201%2C29591611%2C29594260%2C29591312%5D; touch_forum=%5B2010%2C712%2C738%2C545%2C589%2C1161%2C931%2C740%2C840%2C739%2C766%2C1753%5D; place_prid=25; place_crid=133; place_ip=114.216.206.139_1; UM_distinctid=164f814bb2e2a0-0c992e8f35a359-2711938-1fa400-164f814bb2f138; BIGipServerpool-c26-xcar-bbsweb-80=382275338.20480.0000; cidreset=1; _locationInfo_=%7Burl%3A%22http%3A%2F%2Fsuzhou.xcar.com.cn%2F%22%2Ccity_id%3A%22133%22%2Cprovince_id%3A%2225%22%2C%20city_name%3A%22%25E8%258B%258F%25E5%25B7%259E%22%7D; bbs_visitedfid=2010D963D1218D992D496D1566; ad__city=133; bbs_sid=4Iarcf; Hm_lvt_53eb54d089f7b5dd4ae2927686b183e0=1532581677,1533170482; fw_slc=1%3A1533170482%3B1%3A1533170497%3B1%3A1533170498%3B1%3A1533170501%3B1%3A1533170502; bbs_oldtopics=D33925176D; fw_pvc=1%3A1533170466%3B1%3A1533170481%3B1%3A1533170516%3B1%3A1533170533; Hm_lpvt_53eb54d089f7b5dd4ae2927686b183e0=1533170533; close_cz=1; fw_clc=1%3A1533170479%3B1%3A1533170515%3B1%3A1533170531%3B1%3A1533170556%3B1%3A1533170564; fw_exc=1%3A1533170481%3B1%3A1533170515%3B1%3A1533170516%3B1%3A1533173395'}
data = []
def getcomments(url):
    r = requests.get(url, headers=header)
    r.encoding = 'gbk'  # the forum serves GBK-encoded pages
    time.sleep(random.random())  # pause up to 1 s between requests
    comments = etree.HTML(r.text)  # r.text is already a decoded str, so no parser encoding is needed
    author = comments.xpath("//td[@class='t_user']/a[@class='bold']")
    authorid = comments.xpath("//td[@class='t_user']/a/@href")  # links like http://my.xcar.com.cn/space.php?uid=7011250; the prefix is stripped below
    datetime = comments.xpath("//table[@class='t_msg']/tr[1]/td/div/div[2]")  # the "发表于 " (posted on) prefix is stripped below
    floor = comments.xpath("//div[@class='right t_number']/a[@class='bold']")
    comment = comments.xpath("//td[@class='line']/div")  # 1. quoted replies are not handled; 2. text spread across nested tags must be read with string(.), otherwise it comes back empty (see the demo after this function)
    commentid = comments.xpath("//td[@class='t_user']/a/@name")
    for i in range(0, len(author)):
        author_ele = author[i].text
        authorid_ele = authorid[i].replace("http://my.xcar.com.cn/space.php?uid=", "")
        datetime_ele = "".join(datetime[i].text.split()).replace("发表于", "")
        floor_ele = floor[i].text
        comment_ele = "".join(comment[i].xpath('string(.)').split())
        commentid_ele = commentid[i]
        data.append([author_ele, authorid_ele, datetime_ele, floor_ele, comment_ele, commentid_ele, url])
    # dump everything scraped so far to CSV; mode 'w' rewrites the file on each call, so it always holds the full data list
    with open('D:/anaconda/shirleylearn/xcar/xcarcomments.csv', 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerow(['author', 'authorid', 'datetime', 'floor', 'comment', 'commentid', 'fromurl'])
        for k in data:
            writer.writerow(k)
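# A quick illustration of the string(.) point above (made-up markup, not taken
# from the site): .text stops at the first child element, while string(.)
# concatenates the text of every descendant node.
#
#   node = etree.HTML("<div>quoted <b>reply</b> tail</div>").xpath("//div")[0]
#   node.text                 # 'quoted '  -- the <b> child and its tail are lost
#   node.xpath('string(.)')   # 'quoted reply tail'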
def getAllcomments():
    path = 'D:/anaconda/shirleylearn/xcar/xcar_comments_key.xlsx'  # keyword spreadsheet: column 0 = thread URL, column 1 = page count
    excelfile = xlrd.open_workbook(path)
    keys = excelfile.sheet_by_name('Sheet1')
    n = keys.nrows
    for j in range(0, n):  # walk every row
        for page in range(1, int(keys.row(j)[1].value) + 1):  # page numbers start at 1
            if page == 1:
                # with page=1 in the URL the site redirects to the bare URL, and nothing can be scraped after the redirect, so the bare URL is requested directly
                url = keys.row(j)[0].value
            else:
                url = keys.row(j)[0].value + "&page=" + str(page)
            getcomments(url)
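# If the page=1 redirect ever needs to be confirmed, requests keeps a record of
# it (r.history lists the intermediate responses, r.url is the final address):
#   r = requests.get(url, headers=header)
#   if r.history:
#       print("redirected to", r.url)
# The page == 1 branch above simply avoids triggering the redirect at all.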
if __name__ == "__main__":
    getAllcomments()
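
A portability note for anyone rerunning this script today: xlrd 2.0 and later can only read legacy .xls files, so the .xlsx keyword file above will no longer open with xlrd.open_workbook. Below is a minimal sketch of the same reading step using openpyxl instead, assuming the same file layout as above (column 0 holds the thread URL, column 1 the page count); getAllcomments_openpyxl is a hypothetical name, not part of the original script.

from openpyxl import load_workbook

def getAllcomments_openpyxl():
    # hypothetical openpyxl version of getAllcomments; same file and sheet name as above
    path = 'D:/anaconda/shirleylearn/xcar/xcar_comments_key.xlsx'
    keys = load_workbook(path)['Sheet1']
    for row in keys.iter_rows(values_only=True):  # yields one tuple of cell values per row
        baseurl, pagecount = row[0], row[1]
        for page in range(1, int(pagecount) + 1):
            # same page=1 redirect workaround as in getAllcomments
            url = baseurl if page == 1 else baseurl + "&page=" + str(page)
            getcomments(url)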