美文网首页
课时20-采集58列表

Lesson 20: Scraping 58.com Listing Pages

Author: 田边女斯基 | Published 2016-10-07 21:49

    Results

    b2c_or_c2c(sell=0,pages=2)

    The top-level driver. The ID defaults to 0 for individual sellers (merchants are 1); based on that ID it builds the listing-page URLs (2 pages by default) and feeds each one into get_ipads_links(link).

    get_ipads_links(link)

    Takes the batch of listing-page URLs from b2c_or_c2c, produces the corresponding detail URLs, and passes each URL plus the ID (merchant or individual, taken from the listing URL) into ipad_detail(ipad_detail_url, id).
    Points to note:
    1. The links may redirect.
    2. Merchant and individual listings use different page layouts, so the CSS selectors differ.

    ipad_detail(ipad_detail_url,id)

    Takes the batch of detail URLs and the ID (merchant or individual) from get_ipads_links(link), scrapes the item data, and stores each type separately.
    Points to note:
    1. The link may redirect; to read the view count you first need the final landing URL.
    2. Merchant and individual records are stored separately.
    3. The view count is loaded dynamically. How was that discovered? How do you know which request to look at, and why does it only work with a cookie??

    get_detail_views(url)

    A helper used inside ipad_detail(ipad_detail_url, id); it only applies to merchant listings.
    Points to note:
    1. How do you know which request to look at, and why does it only work with a cookie??

    Code

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-

    from bs4 import BeautifulSoup
    import requests  # note the trailing "s"
    import time

    path = './58_results_'  # prefix for the output files
    time1 = time.strftime("%H:%M:%S").replace(':', '-')  # run time, reused in the file names
    def get_ipads_links(url):
        ipads_list_url = url  # a tablet listing page
        ipads_list_code = requests.get(ipads_list_url).status_code
        time.sleep(1)
        if ipads_list_code == 200:
            print(url + u' connected successfully')
            id = url.split('/')[-3]  # seller type from the URL: '0' = individual, '1' = merchant
            print(id)
            ipads_list_date = requests.get(ipads_list_url).text
            soup = BeautifulSoup(ipads_list_date, 'lxml')
            if id == '0':  # note: id is a str here
                ipads_detail_links = soup.select('table.tbimg tbody tr.zzinfo td.t a.t')
            elif id == '1':
                ipads_detail_links = soup.select('td.t > a.t')
            i = 0
            link = []
            for ipads_detail_link in ipads_detail_links:
                i = i + 1
                ipads_detail_link = ipads_detail_link.get('href')
                link.append(ipads_detail_link)
                print(str(i) + ' ' + str(ipads_detail_link))
                ipad_detail(ipad_detail_url=str(ipads_detail_link), id=id)
    def ipad_detail(ipad_detail_url='http://zhuanzhuan.58.com/detail/783587851712380932z.shtml', id='0'):
        from urllib import request
        with request.urlopen(ipad_detail_url) as f:
            ipad_detail_url = f.geturl()  # follow any redirect and keep the final landing URL
            print(ipad_detail_url)
        ipad_detail_code = requests.get(ipad_detail_url).status_code
        ipad_detail_date = requests.get(ipad_detail_url).text
        time.sleep(1)
        print(ipad_detail_code)
        if ipad_detail_code == 200:
            soup = BeautifulSoup(ipad_detail_date,'lxml')
            if id == '0':
                id = '个人'  # "individual"; used in the output file name
                titles = soup.select('h1.info_titile')
                categorys = soup.select('#nav > div')
                prices = soup.select('div.info_massege.left > div.price_li > span > i')
                regions = soup.select('div.info_massege.left > div.palce_li > span > i')
                view = soup.select('span.look_time')[0].get_text()  # for individual listings the view count is right in the page
            else:
                id = '商户'  # "merchant"
                titles = soup.select('h1')
                categorys = soup.select('#header > div.breadCrumb.f12')
                prices = soup.select('div.su_con > span')
                regions = soup.select('span.c_25d')
                view = get_detail_views(ipad_detail_url)  # merchant pages load the count dynamically
            with open(path + id + str(time1) + '.txt', 'a+') as text:
                date = {  # no need to loop; just take the first match
                        'title': titles[0].get_text(),
                        'category': categorys[0].get_text().replace('\n', '>').replace('\r', '').replace('\t', '').replace(' ', '') if len(categorys) > 0 else 'none',  # guard against missing elements
                        'price': prices[0].get_text() if len(prices) > 0 else 'none',
                        'region': regions[0].get_text().replace('\n', '').replace('\r', '').replace('\t', '').replace(' ', '') if len(regions) > 0 else 'none',
                        'view': view
                    }
                print(date.values())
                text.write(str(date.values()))
                text.write('\n')
    def get_detail_views(url='http://bj.58.com/pingbandiannao/27609273161793x.shtml?psid=103856697193428913414588056&entinfo=27609273161793_0'):
        url = url.split('?')[0]  # drop the query string
        url = url.split('/')[-1]  # keep only the last path segment, e.g. "27609273161793x.shtml"
        infoid = url.strip('x.shtml')  # strip() removes the given characters from both ends (whitespace by default); split() cuts the string
        url = 'http://jst1.58.com/counter?infoid={}'.format(infoid)  # the counter endpoint that serves the view count
        headers = {
            'User-Agent': r'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36',
            'Cookie': r'id58=c5/ns1ct99sKkWWeFSQCAg==; city=bj; 58home=bj; ipcity=yiwu%7C%u4E49%u4E4C%7C0; als=0; myfeet_tooltip=end; bj58_id58s="NTZBZ1Mrd3JmSDdENzQ4NA=="; sessionid=021b1d13-b32e-407d-a76f-924ec040579e; bangbigtip2=1; 58tj_uuid=0ed4f4ba-f709-4c42-8972-77708fcfc553; new_session=0; new_uv=1; utm_source=; spm=; init_refer=; final_history={}; bj58_new_session=0; bj58_init_refer=""; bj58_new_uv=1'.format(
                str(infoid)),
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'jst1.58.com',
            'Referer': r'http://bj.58.com/pingbandiannao/{}x.shtml'.format(infoid)
        }
        web_date = requests.get(url, headers=headers).text
        soup = BeautifulSoup(web_date, 'lxml')
        views = soup.text.split('=')[-1]  # the counter response ends in "=<count>", so take the text after the last '='
        print(views)
        return views
    
    def b2c_or_c2c(sell=0, pages=2):
        # sell=0: individual (c2c) listings; sell=1: merchant (b2c) listings
        if sell == 0:
            links = ['http://bj.58.com/pbdn/0/pn{}/'.format(i) for i in range(1, pages + 1)]
        else:
            links = ['http://bj.58.com/pbdn/1/pn{}/'.format(i) for i in range(1, pages + 1)]
        for link in links:
            get_ipads_links(link)
    
    b2c_or_c2c(sell=1)  # run: scrape the merchant (b2c) listings
    #ipad_detail(ipad_detail_url= 'http://jm.58.com/pingbandiannao/27602121535432x.shtml?adtype=1&entinfo=27602121535432_0&psid=198431012193429552236749177',id='1')
    #ipad_detail(ipad_detail_url= 'http://zhuanzhuan.58.com/detail/783469171049086980z.shtml?fullCate=5%2C38484%2C23094&fullLocal=1&from=pc',id='0')
    #get_detail_views()
    #ipad_detail(ipad_detail_url= 'http://jump.zhineng.58.com/jump?target=pZwY0jCfsvFJsWN3shPfUiqkpyOMmh78uA-6UhO6UztzP1N3nj0Qn1D3PHTLng980v6YUykKnH93njEdPjmOnHb1PjcOnW0knjTQnH9zP1nLTHE3rHnkrjTvTHEKnW0drjTLnHnQrjNkP1DKnHTkTH91THc1njbYTHDKnHELPH0YrjcQPj9knTDQTyQG0Lw_uyuYTHDKnE7wTiYQTEDkTHTKTyu6UZP-TyDQuj6-uHI-syNOn1nVPjIhPBd6ujmzsHcdPWb1nj7-PHDLPkDzrH9YPHbYnkDQrHckn1bzP1D3rH9Ynj9KTHc1njbYTHDKnEDKTEDKpZwY0jCfsvFJsWN3shPfUiqkmhw8s1Df0A3Qsk78IyQ_THDznz33ra3znW08nH0vTHTKPvwhmWTdnAcLn1TLuWn3nk&psid=188045469193429270001182737&entinfo=27580713185071_0',id='1')
    

    Summary

    0. The CSS paths copied from Chrome's inspector seem better than the ones from Firefox.
    1. The format() syntax: could there be more complex cases (multiple parameters)?? (A multi-parameter sketch follows the snippet below.)

    links = ['http://bj.58.com/pbdn/1/pn{}/'.format(i) for i in range(1, pages + 1)]
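
    To answer the question above: yes, format() accepts several arguments, positional or by keyword. A minimal sketch (the city/sell/page values are just for illustration):

    city, sell, page = 'bj', 1, 3
    url = 'http://{}.58.com/pbdn/{}/pn{}/'.format(city, sell, page)
    print(url)  # http://bj.58.com/pbdn/1/pn3/
    # the same with named placeholders
    url = 'http://{city}.58.com/pbdn/{sell}/pn{page}/'.format(city=city, sell=sell, page=page)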
    

    2. Use the generation time as a "random" suffix in the output file name (a date-stamped variant is sketched after the snippet below).

    import time
    time1 = time.strftime("%H:%M:%S").replace(':','-')
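
    If runs from different days should also get separate files, the date can go into the same pattern; a small variation, not part of the original code:

    import time
    time1 = time.strftime("%Y-%m-%d_%H-%M-%S")  # e.g. "2016-10-07_21-49-00"; no colons, so no replace() needed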
    

    3. urllib's geturl() returns the post-redirect address, so there is no need to read the Location header (a requests-based alternative is sketched below).

    from urllib import request
    with request.urlopen(ipad_detail_url) as f:
        ipad_detail_url = f.geturl()
        print(ipad_detail_url)
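
    For comparison, requests also follows redirects by default, so the final landing URL is available without urllib; a minimal sketch assuming an ordinary HTTP redirect:

    import requests
    r = requests.get(ipad_detail_url)
    print(r.url)  # the URL after any redirects, equivalent to geturl() above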
    

    4. How were the url and the Cookie in get_detail_views obtained???! (A small probe to check the Cookie's effect is sketched after the snippet below.)

    url = 'http://jst1.58.com/counter?infoid={}'.format(infoid)
    headers = {
        'User-Agent': r'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36',
        'Cookie': r'id58=c5/ns1ct99sKkWWeFSQCAg==; city=bj; 58home=bj; ipcity=yiwu%7C%u4E49%u4E4C%7C0; als=0; myfeet_tooltip=end; bj58_id58s="NTZBZ1Mrd3JmSDdENzQ4NA=="; sessionid=021b1d13-b32e-407d-a76f-924ec040579e; bangbigtip2=1; 58tj_uuid=0ed4f4ba-f709-4c42-8972-77708fcfc553; new_session=0; new_uv=1; utm_source=; spm=; init_refer=; final_history={}; bj58_new_session=0; bj58_init_refer=""; bj58_new_uv=1'.format(str(infoid)),
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'jst1.58.com',
        'Referer': r'http://bj.58.com/pingbandiannao/{}x.shtml'.format(infoid)
    }
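
    One way to see why the Cookie matters is to fire the counter request twice, with and without it, and compare the responses; a hypothetical probe that reuses the url, headers, and infoid from the snippet above, not part of the original code:

    with_cookie = requests.get(url, headers=headers).text
    without_cookie = requests.get(url, headers={'User-Agent': headers['User-Agent']}).text
    print(with_cookie)     # per the parsing above, the view count should follow the last '='
    print(without_cookie)  # if this one differs or lacks the count, the Cookie is what makes the request work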
    

    5. No need to keep looping with for; categorys[0].get_text() is enough.

    date = {  # no need to loop; just take the first match
                        'title': titles[0].get_text(),
                        'category': categorys[0].get_text().replace('\n','>').replace('\r','').replace('\t','').replace(' ','') if len(categorys) > 0 else 'none',  # guard against missing elements
                        'price': prices[0].get_text() if len(prices) > 0 else 'none',
                        'region': regions[0].get_text().replace('\n','').replace('\r','').replace('\t','').replace(' ','') if len(regions) > 0 else 'none',
                        'view': view
                    }
    
    
                # the earlier loop-based version, kept for comparison:
                for title, category, price, region, view in zip(titles, categorys, prices, regions, views):
                    date = {
                            'title': title[0].get_text(),
                            'category': category.get_text().replace('\n','>').replace('\r','').replace('\t','').replace(' ',''),
                            'price': price.get_text(),
                            'region': region.get_text(),
                            'view': view.get_text()
                    }
    
    

    6. split() and strip() are very useful (see the caveat after the snippet below).

        url = url.split('?')[0]
        url = url.split('/')[-1]
        infoid = url.strip('x.shtml')  # strip() removes the given characters from both ends (whitespace by default); split() cuts the string
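
    One caveat: strip('x.shtml') removes any of those characters from both ends rather than the literal suffix, so it only works here because the info id is purely numeric. A safer variant, sketched for illustration:

    url = '27609273161793x.shtml'
    infoid = url.replace('x.shtml', '')                                  # remove the literal substring
    infoid = url[:-len('x.shtml')] if url.endswith('x.shtml') else url   # or cut the suffix explicitly
    print(infoid)  # 27609273161793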
    
