
Week 1 Hands-On Assignment

Author: tianxia339 | Published 2017-03-25 18:28

    Results

    (Screenshot of the scraper's printed output, 2017-03-25, omitted.)
    from bs4 import BeautifulSoup
    import requests
    import time


    def get_links(who_sells):
        # List page for second-hand tablets (pbdn) on 58.com; pn2 = page 2
        url = 'http://bj.58.com/pbdn/{}/pn2'.format(who_sells)
        wb_data = requests.get(url)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        link_tags = soup.select('td.t a.t')
        urls_datas = []
        for link_tag in link_tags:
            if link_tag.get('href').find('jump') < 0:
                # Zhuanzhuan link: drop the query string
                urls_datas.append(link_tag.get('href').split('?')[0])
            else:
                # Promoted ("jingzhun") redirect link: rebuild the real
                # detail-page URL from the entinfo query parameter
                urls_datas.append('http://bj.58.com/pingbandiannao/'
                                  + link_tag.get('href').split('entinfo')[1].split('&')[0][1:-2]
                                  + 'x.shtml')
        get_infos(urls_datas, who_sells)
    
    def getViews(url):
        # Fetch the view count from 58's counter API (loaded by JS on the page).
        # The original note said the result was always 0 -- see the fix at the
        # request call below and the probe in the summary.
        # strip('x.shtml') would remove any of those *characters* from the
        # ends, not the literal suffix, so split on the suffix instead.
        info_id = url.split('/')[-1].split('x.shtml')[0]
        api = 'http://jst1.58.com/counter?infoid={}'.format(info_id)
        headers = {
            'Cookie': r'bj58_id58s="eG44SE0raFpjSmpwMjI4NQ=="; id58=c5/ns1jBVI5v3RDiA5T7Ag==; als=0; myfeet_tooltip=end; bangbigtip2=1; city=bj; ipcity=gltokyo%7C%u4E1C%u4EAC; sessionid=e3b672f8-f7ca-4c60-8eb3-eba3cfb9a905; 58tj_uuid=a1cc2a2a-8536-417e-8fdf-b86563c43986; new_session=0; new_uv=9; utm_source=; spm=; init_refer=; final_history={}; bj58_new_session=0; bj58_init_refer=""; bj58_new_uv=10'.format(str(info_id)),
            'User-Agent': r'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'jst1.58.com',
            'Referer': r'http://bj.58.com/pingbandiannao/{}x.shtml'.format(info_id)
        }
    
    
        # headers must go through the keyword: requests.get's second positional
        # parameter is params, so requests.get(api, headers) sent the dict as
        # query parameters and no custom headers at all -- a likely reason the
        # counter returned 0.
        r = requests.get(api, headers=headers)
        return r.text.split('total=')[1]
    
    def get_infos(urls, who_sells=0):
        for url in urls:
            wb_data = requests.get(url)
            soup = BeautifulSoup(wb_data.text, 'lxml')
            time.sleep(1)  # throttle between detail-page requests
    
            if "zhuanzhuan" not in url:
    
                data = {
                    'title': soup.title.text.strip(),
                    'price': soup.select('.price.c_f50')[0].text,
                    'date': soup.select('li.time')[0].text ,
                    'area': list(soup.select('.c_25d')[0].stripped_strings) if soup.find_all('span', 'c_25d') else None,
                    'cate': '个人' if who_sells == 0 else '商家',
                    'views': getViews(url)
                }
                print(data)
            else:
                # Zhuanzhuan detail page (different markup)
                data = {
                    'title': soup.title.text.strip(),
                    'price': soup.select('span.price_now i')[0].text,
                    'date': None,
                    # 'palce_li' is the class name exactly as it appears in the page source
                    'area': list(soup.select('div.palce_li i')[0].stripped_strings) if soup.find_all('div', 'palce_li') else None,
                    'cate': '个人',  # Zhuanzhuan listings are all individual sellers
                    'views': soup.select('span.look_time')[0].get_text().strip(u'次浏览')  # strip the '次浏览' ("views") suffix
                }
                print(data)
    
            # Save to a local file (left disabled in the original):
            # with open('/Users/lht/Downloads/imgs/text', 'a') as fs:
            #     fs.write(str(data))


    get_links(0)
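
    If you actually want to persist the results instead of only printing them, here is a minimal sketch; the JSON-lines format and the output path are my choices, not the original author's (the disabled code above simply appended str(data) to a text file):

    import json

    def save_record(data, path='pbdn_records.jsonl'):
        # Append one listing per line as JSON; ensure_ascii=False keeps the
        # Chinese field values human-readable in the file
        with open(path, 'a', encoding='utf-8') as fs:
            fs.write(json.dumps(data, ensure_ascii=False) + '\n')

    Calling save_record(data) next to each print(data) in get_infos would then accumulate one record per listing.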
    
    
    

    Personal summary

    - The view count is fetched through a JS request to a counter API; not fully solved yet.
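
    One thing worth checking, assuming the counter endpoint above still responds: pass the header dict through the headers= keyword (requests.get's second positional parameter is params, so requests.get(api, headers) sends the headers as query parameters and none as actual headers), then inspect the raw body before splitting on 'total='. A minimal probe, with a made-up placeholder id:

    import requests

    def check_counter(info_id):
        # info_id is the numeric id from the detail-page URL
        # (the part before 'x.shtml')
        api = 'http://jst1.58.com/counter?infoid={}'.format(info_id)
        headers = {
            'Referer': 'http://bj.58.com/pingbandiannao/{}x.shtml'.format(info_id),
            'User-Agent': 'Mozilla/5.0',
        }
        r = requests.get(api, headers=headers)  # headers=, not positional
        print(r.status_code, r.text)  # expect a body containing 'total=<n>'

    # check_counter('12345678901234')  # placeholder id, not a real listing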
