
Zhihu Login and Keyword Crawling

Author: ckawyh | Published 2018-05-16 17:13
    import csv
    import time
    import urllib.parse as parse
    from selenium import webdriver
    from pyquery import PyQuery as pq
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.wait import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    
    driver = webdriver.Chrome()
    driver.set_window_size(1920, 1200)
    wait = WebDriverWait(driver, 10)
    
    
    def login_zhihu(username, password):
        """Log in to Zhihu.
        :param username: Zhihu username
        :param password: Zhihu password
        """
        try:
            login_url = 'https://www.zhihu.com/signin'
            driver.get(login_url)
    
            # These absolute XPaths match the sign-in form as of 2018 and will
            # break if Zhihu restructures the page
            username_input = wait.until(EC.presence_of_element_located((By.XPATH,
                                                                        '//*[@id="root"]/div/main/div/div/div/div[2]/div[1]/form/div[1]/div[2]/div[1]/input')))
            username_input.send_keys(username)
            password_input = wait.until(EC.presence_of_element_located((By.XPATH,
                                                                        '//*[@id="root"]/div/main/div/div/div/div[2]/div[1]/form/div[2]/div/div[1]/input')))
            password_input.send_keys(password)
    
            # Wait 20 seconds so the captcha can be entered manually
            time.sleep(20)
            button = wait.until(EC.element_to_be_clickable((By.XPATH,
                                                            '//*[@id="root"]/div/main/div/div/div/div[2]/div[1]/form/button')))
            # Click rather than submit: the form is rendered by JavaScript,
            # so a native form submission may bypass its handlers
            button.click()
    
            # Wait for the cookies to load
            time.sleep(10)
            cookies = driver.get_cookies()
            print(cookies)
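
            # Optional sketch (an assumption, not in the original post): persist
            # the cookies so a later run can restore the session, e.g.
            #   import json
            #   with open('zhihu_cookies.json', 'w') as f:
            #       json.dump(cookies, f)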
        except Exception as e:
            print(e)
        finally:
            # Note: closing the driver here throws away the logged-in session,
            # so the crawl below starts its own, anonymous session
            driver.close()
    
    
    def topic_title_spider(keyword='王宝强', filename='wangbaoqiang', scroll_times=10):
        """Collect the titles and comment counts from a keyword search.
        :param keyword: search keyword
        :param filename: name of the CSV file to save
        :param scroll_times: how many times to scroll the page down
        """
        start = time.time()
    
        # Open a CSV file to collect the data; utf-8 keeps Chinese titles intact
        csvFile = open('./%s.csv' % filename, 'a+', newline='', encoding='utf-8')
        writer = csv.writer(csvFile)
        writer.writerow(('title', 'review_num'))
    
        # Percent-encode the keyword and splice it into the search URL
        kw = parse.quote(keyword)
        url = 'https://www.zhihu.com/search?type=content&q=%s' % kw
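        # e.g. parse.quote('王宝强') -> '%E7%8E%8B%E5%AE%9D%E5%BC%BA'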
        driver.get(url)
    
        # Scroll down repeatedly so the page loads as much data as possible
        # (the original range(1, scroll_times, 1) ran one scroll short)
        for i in range(1, scroll_times + 1):
            driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
            duration = time.time() - start
            print('%s -- the little spider has scrolled down %d time(s) now, running for %.2f seconds. So tired!' % (keyword, i, duration))
            time.sleep(5)
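
        # Optional sketch (an assumption, not in the original post): rather than a
        # fixed number of scrolls, stop once the page height stops growing, e.g.
        #   height = driver.execute_script('return document.body.scrollHeight')
        #   while True:
        #       driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
        #       time.sleep(5)
        #       new_height = driver.execute_script('return document.body.scrollHeight')
        #       if new_height == height:
        #           break  # no new results were loaded
        #       height = new_height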
    
        html = driver.page_source
    
        # Parse the page and extract the fields we need
        doc = pq(html)
        items = doc('.Card .List-item').items()
        count = 0
        for i, item in enumerate(items):
            button = item.find('.ContentItem-action')
            if button:
                try:
                    # The action-bar text apparently ends in a pair like '56 条评论',
                    # so the second-to-last space-separated token is the comment count
                    review_num = button.text().split(' ')[-2]
                    title = item.find('.ContentItem-title').text().replace('\r', '').replace('\n', '')
                    writer.writerow((title, review_num))
                    count += 1
                    print('Saved ' + str(count) + ' rows!')
                    print(title, review_num)
                    print()
                except Exception as e:
                    print(e)
                    continue
    
        csvFile.close()
        driver.quit()
    
    
    if __name__ == '__main__':
        # Fill in real credentials and uncomment to log in first
        # login_zhihu('XXX', 'XXX')
        topic_title_spider(keyword='Python', filename='python', scroll_times=20)
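
One structural caveat: login_zhihu() closes the browser in its finally block, so topic_title_spider() always runs in a fresh, anonymous session. A minimal sketch for reusing the login (assuming the hypothetical zhihu_cookies.json file from the commented sketch above) is to load the saved cookies into a new driver before opening the search page:

    import json
    from selenium import webdriver

    driver = webdriver.Chrome()
    # Cookies can only be added for the domain that is currently loaded
    driver.get('https://www.zhihu.com')
    with open('zhihu_cookies.json') as f:
        for cookie in json.load(f):
            # Some driver builds reject optional fields such as 'sameSite'
            cookie.pop('sameSite', None)
            driver.add_cookie(cookie)
    # Reload so the page is rendered with the session cookies applied
    driver.refresh()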
    
    
