import csv
import time
import urllib.parse as parse
from selenium import webdriver
from pyquery import PyQuery as pq
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome()
driver.set_window_size(1920, 1200)
wait = WebDriverWait(driver, 10)
def login_zhihu(username, password):
    """Log in to Zhihu.
    :param username: user name
    :param password: password
    """
    try:
        login_url = 'https://www.zhihu.com/signin'
        driver.get(login_url)
        username_input = wait.until(EC.presence_of_element_located((By.XPATH,
            '//*[@id="root"]/div/main/div/div/div/div[2]/div[1]/form/div[1]/div[2]/div[1]/input')))
        username_input.send_keys(username)
        password_input = wait.until(EC.presence_of_element_located((By.XPATH,
            '//*[@id="root"]/div/main/div/div/div/div[2]/div[1]/form/div[2]/div/div[1]/input')))
        password_input.send_keys(password)
        # Wait 20 seconds so the captcha can be entered manually
        time.sleep(20)
        button = wait.until(EC.element_to_be_clickable((By.XPATH,
            '//*[@id="root"]/div/main/div/div/div/div[2]/div[1]/form/button')))
        button.click()
        # Wait for the cookies to be set
        time.sleep(10)
        cookies = driver.get_cookies()
        print(cookies)
    except Exception as e:
        print(e)
    finally:
        # Close the browser window once the cookies have been printed
        driver.close()
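# --- Optional: persisting the login session (a minimal sketch, not part of the original script) ---
# login_zhihu() only prints the cookies it obtains. If you want to reuse the
# login in a later run, one common approach is to dump the cookie list to a
# JSON file and load it back into a fresh driver before crawling. The file
# name 'zhihu_cookies.json' and both helper names below are illustrative.
import json

def save_cookies(cookies, path='zhihu_cookies.json'):
    """Write the list of cookie dicts returned by driver.get_cookies() to disk."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(cookies, f)

def load_cookies(drv, path='zhihu_cookies.json'):
    """Open the Zhihu domain first, then attach each saved cookie to the session."""
    drv.get('https://www.zhihu.com')
    with open(path, encoding='utf-8') as f:
        for cookie in json.load(f):
            # Some Selenium versions reject a non-integer 'expiry' field, so drop it
            cookie.pop('expiry', None)
            drv.add_cookie(cookie)
    drv.refresh()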
def topic_title_spider(keyword='王宝强', filename='wangbaoqiang', scroll_times=10):
    """Collect the titles and comment counts of the search results for a keyword.
    :param keyword: search keyword
    :param filename: name of the output file
    :param scroll_times: how many times to scroll the page down
    """
    start = time.time()
    # Create a CSV file to collect the data
    csvFile = open('./%s.csv' % filename, 'a+', newline='', encoding='utf-8')
    writer = csv.writer(csvFile)
    writer.writerow(('title', 'review_num'))
    # Percent-encode the keyword and build the search URL
    kw = parse.quote(keyword)
    url = 'https://www.zhihu.com/search?type=content&q=%s' % kw
    driver.get(url)
    # Scroll down repeatedly to load as many results as possible
    for i in range(1, scroll_times + 1):
        driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
        duration = time.time() - start
        print('%s -- the little spider has scrolled down %d time(s), running for %.2f seconds, so tired!' % (keyword, i, duration))
        time.sleep(5)
    html = driver.page_source
    # Parse the page and extract the fields we need
    doc = pq(html)
    items = doc('.Card .List-item').items()
    count = 0
    for i, item in enumerate(items):
        button = item.find('.ContentItem-action')
        if button:
            try:
                review_num = button.text().split(' ')[-2]
                title = item.find('.ContentItem-title').text().replace('\r', '').replace('\n', '')
                writer.writerow((title, review_num))
                count += 1
                print('Saved %d rows so far!' % count)
                print(title, review_num)
                print()
            except Exception as e:
                print(e)
                continue
    csvFile.close()
    driver.quit()
if __name__ == '__main__':
    # Log in first (with real credentials) if you need results from an authenticated session
    # login_zhihu('XXX', 'XXX')
    topic_title_spider(keyword='Python', filename='python', scroll_times=20)
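    # --- Optional: quick sanity check (illustrative, reuses the csv module already imported) ---
    # Assumes the call above wrote ./python.csv in the current directory.
    with open('./python.csv', newline='', encoding='utf-8') as f:
        print('python.csv now holds %d lines.' % sum(1 for _ in csv.reader(f)))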