美文网首页
爬取大众点评(深圳)美食

爬取大众点评(深圳)美食

作者: 呼吸的小蚂蚁 | 来源:发表于2018-09-06 10:08 被阅读0次

import os
import time

import pandas as pd
import requests
from lxml import etree
from pandas import DataFrame, Series

# Browser identification string sent as the User-Agent header.
_USER_AGENT = (
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
)

# HTTP headers passed to every requests.get() call in this script.
# NOTE(review): the Cookie value is masked with asterisks here; presumably a
# real session cookie has to be substituted before running -- confirm.
headers = {
    'Cookie': '************************************************',
    'User-Agent': _USER_AGENT,
    'Connection': 'keep-alive',
}

# Accumulator for scraped shops: get_href() appends one dict per shop and the
# __main__ section turns the list into a DataFrame.
info_list = []

def get_url(url, max_pages=30):
    """Crawl the result pages of every category linked from *url*.

    Downloads *url*, reads the ``href`` of each ``<a>`` directly under the
    element whose id is ``classfy``, prints it, and then calls ``get_href``
    for pages 1 through *max_pages* of that category. A page address is the
    category link with ``p<page number>`` appended.

    Args:
        url: Address of the listing page whose category links are followed.
        max_pages: How many result pages to request per category. Defaults
            to 30, the value that used to be hard-coded.
    """
    res = requests.get(url, headers=headers)
    selector = etree.HTML(res.text)
    hrefs = selector.xpath('//*[@id="classfy"]/a/@href')
    for href in hrefs:
        print(href)
        # Page numbers in the URL are 1-based.
        for page in range(1, max_pages + 1):
            get_href(href + 'p' + str(page))

def _first(nodes, default=" "):
    """Return the first XPath result in *nodes*, or *default* if there is none."""
    return nodes[0] if nodes else default


def get_href(new_href):
    """Scrape one result page and append a record per shop to ``info_list``.

    Downloads *new_href*, selects every ``<li>`` under the element whose id
    is ``shop-all-list`` and, for each one, builds a dict with the keys
    店名, 星级, 评论数, 均价, 类型, 商区, 地址, 口味, 环境 and 服务. A field whose
    XPath matches nothing is stored as a single space instead of raising
    ``IndexError``, so one incomplete listing no longer aborts the crawl.

    After the page has been processed the function sleeps for 3 seconds
    before returning.

    Args:
        new_href: Address of the result page to download.
    """
    html = requests.get(new_href, headers=headers)
    selector_2 = etree.HTML(html.text)
    shops = selector_2.xpath('//*[@id="shop-all-list"]/ul/li')

    # The XPaths below are relative to one shop <li>; e.g. the shop name sits
    # at //*[@id="shop-all-list"]/ul/li[N]/div[2]/div[1]/a[1]/h4.
    for shop in shops:
        info_list.append({
            '店名': _first(shop.xpath('div[2]/div[1]/a[1]/h4/text()')),
            '星级': _first(shop.xpath('div[2]/div[2]/span/@title')),
            '评论数': _first(shop.xpath('div[2]/div[2]/a[1]/b/text()')),
            '均价': _first(shop.xpath('div[2]/div[2]/a[2]/b/text()')),
            '类型': _first(shop.xpath('div[2]/div[3]/a[1]/span/text()')),
            '商区': _first(shop.xpath('div[2]/div[3]/a[2]/span/text()')),
            '地址': _first(shop.xpath('div[2]/div[3]/span/text()')),
            '口味': _first(shop.xpath('div[2]/span/span[1]/b/text()')),
            '环境': _first(shop.xpath('div[2]/span/span[2]/b/text()')),
            '服务': _first(shop.xpath('div[2]/span/span[3]/b/text()')),
        })

    # Pause between page requests to keep the request rate low.
    time.sleep(3)

if __name__ == '__main__':
    # Entry point: crawl the listing page, then dump everything that
    # get_href() collected in info_list to a CSV file.
    url = 'http://www.dianping.com/shenzhen/ch10'
    get_url(url)

    data = pd.DataFrame(
        info_list,
        columns=['店名', "星级", "评论数", "均价", "类型", "商区", "地址", "口味", "环境", "服务"],
    )
    print(data)

    out_path = r'C:\Users\Administrator\Desktop\大众点评.csv'
    # The file is opened in append mode, so the header row is written only
    # when the file is missing or empty; otherwise every run would insert
    # another header line in the middle of the data.
    write_header = not (os.path.isfile(out_path) and os.path.getsize(out_path) > 0)
    data.to_csv(out_path, header=write_header, index=False, mode='a+', encoding='gb18030')

相关文章

网友评论

      本文标题:爬取大众点评(深圳)美食

      本文链接:https://www.haomeiwen.com/subject/mecwwftx.html