美文网首页
python:爬取某个微博指定时间段的数据

python:爬取某个微博指定时间段的数据

作者: 獨孤記憶 | 来源:发表于2018-08-13 01:11 被阅读0次

    成果:爬下2017年4月份到2018年8月份的个人原创微博数据(微博内容,发微博时间,设备及该条微博网址)


    image.png
    #start_chrome --> input-date --> scroll_down --> find_cards_info -->save -->find_next
    
    from  selenium import webdriver
    from selenium.webdriver.common.keys import Keys
    import time
    import csv
    import os
    
    def start_chrome():
        driver  = webdriver.Chrome(executable_path='./chromedriver')
        #打开chrome客户端
        driver.start_client()
        return driver
    
    def q(st,et):
        return f'?is_ori=1&key_word=&start_time={st}&end_time={et}&is_search=1&is_searchadv=1#_0'
    
    def scroll_down():
        #定位页面
        html_page = driver.find_element_by_tag_name('html')
        for i in range(15):
            print(i)
            #模拟发送END按键(网页滑到底)
            html_page.send_keys(Keys.END)
            #间隔时间0.6秒
            time.sleep(0.6)
    
    def find_cards_info():
    
        #单条微博区块,CSS路径
        cards_sel  = 'div.WB_feed_detail'
        #定位到单条微博区块
        cards      = driver.find_elements_by_css_selector(cards_sel)
        info_list  = []
        for card in cards :
            content_sel = 'div.WB_text.W_f14'
            time_sel    = 'div.WB_from.S_txt2'
            link_sel    = 'div.WB_from.S_txt2>a:nth-child(1)'
    
            content     = card.find_element_by_css_selector(content_sel).text
            time        = card.find_element_by_css_selector(time_sel).text
            link        = card.find_element_by_css_selector(link_sel).get_attribute('href')
            info_list.append([content,time,link])
        # int_sel = 'span>span.line.S_line1>span>em:nth-child(2)'
        #
        # most = driver.find_elements_by_css_selector(int_sel)
        # rep,comm,like = [str(el.text) for el in most[1:]]
        # print(most)
        # print(rep)
    
    
        return info_list
    
    def find_next():
        next_sel  = 'a.page.next'
        next_page = driver.find_elements_by_css_selector(next_sel)
        if next_page:
            return next_page[0].get_attribute('href')
    
    def save(info_list,name):
        full_path = './'+name + '.csv'
        #如果当前目录下已有该名字的文件,则在该文件中添加数据;没有则重新创建并添加数据
        if os.path.exists(full_path):
            with open(full_path,'a',newline='',encoding='utf-8') as f :
                writer = csv.writer(f)
                writer.writerows(info_list)
                print('Done!')
        else:
            with open(full_path,'w+',newline='',encoding='utf-8') as f :
                writer = csv.writer(f)
                writer.writerows(info_list)
                print('Done!!')
    
    def run_crawler(base,duration ):
        if not base.endswith('feedtop'):
            st,et = duration.split('~')
            #第一个为开始时间,第二个为结束时间
            driver.get(base+q(st,et))
        else:
            driver.get(base)
        time.sleep(5)
        scroll_down()
        time.sleep(5)
        info_list=find_cards_info()
        save(info_list,duration)
        next_page=find_next()
        if next_page:
            run_crawler(next_page,duration)
    
    #个人微博主页
    base   = 'https://weibo.com/p/1005055793083716'
    driver = start_chrome()
    input()
    #时间可替换
    run_crawler(base,'2017-04-20~2018-08-10')

    相关文章

      网友评论

          本文标题:python:爬取某个微博指定时间段的数据

          本文链接:https://www.haomeiwen.com/subject/ywzebftx.html