
Python spider for 58.com

Author: chenchao981 | Published 2016-05-04 23:11
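The script below crawls second-hand tablet (平板电脑) listings on 58.com's Wuxi site: get_links_from() gathers detail-page links from a list page, get_view() reads a listing's view count from 58's counter API, and get_item_info() scrapes the title, price, posting date, area and seller type for each listing and prints the result as a dict. The `header` value in the code is only a placeholder User-Agent; the counter endpoint and CSS selectors match the 2016 layout of the site and may have changed since.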

    from bs4 import BeautifulSoup
    import time, requests, re

    # Placeholder request header; swap in your own User-Agent if 58.com rejects bare requests.
    header = {'User-Agent': 'Mozilla/5.0'}

    # Sample detail page of a tablet listing on 58.com Wuxi.
    url = 'http://wx.58.com/pingbandiannao/25892738648911x.shtml'
    wb_data = requests.get(url, headers=header)
    soup = BeautifulSoup(wb_data.text, 'lxml')

    def get_links_from(who_sells):
        # Collect the detail-page links from one list page of tablet ads.
        urls = []
        list_view = 'http://wx.58.com/pbdn/{}/pn2/'.format(str(who_sells))
        wb_data = requests.get(list_view)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        for link in soup.select('td.t > a.t'):
            url_one = link.get('href').split('?')[0]
            if 'zhuanzhuan' not in url_one:  # skip zhuanzhuan.com pages, which this parser cannot handle
                urls.append(url_one)
        return urls

    def get_view(url):
        # Read the view count from 58.com's counter API; the listing ID is pulled out of the URL with a regex.
        id = re.findall('http.*?nao/(.*?)x.shtml', url, re.S)
        # print(id[0])
        api = 'http://jst1.58.com/counter?infoid={}'.format(id[0])
        js = requests.get(api)
        views = js.text.split('=')[-1]
        # print(views)
        return views

    def get_item_info(who_sells=0):
        # Scrape title, price, date, area, seller type and view count for every listing found.
        urls = get_links_from(who_sells)
        for url in urls:
            wb_data = requests.get(url)
            soup = BeautifulSoup(wb_data.text, 'lxml')
            data = {
                'title': soup.title.text,
                'price': soup.select('div.su_con > span.c_f50')[0].text,
                'date': soup.select('li.time')[0].text,
                # Some listings have no area; fall back to None so indexing does not raise.
                'area': list(soup.select('span.c_25d')[0].stripped_strings) if soup.find_all('span', 'c_25d') else None,
                'url': url,
                'cate': '个人' if who_sells == 0 else '商家',
                'views': get_view(url),  # pass the current listing's URL so each item gets its own count
            }
            print(data)

    # get_item_info(url)
    # get_links_from()
    # get_view()

    get_item_info()
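As a usage sketch (the parameter meaning is inferred from the `cate` field above): who_sells=0 crawls listings from personal sellers and who_sells=1 from merchants, while the pn2 segment in the list URL pins the crawl to page 2 of the results.

    get_item_info(0)  # personal sellers (个人), the default
    get_item_info(1)  # merchants (商家)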
