爬虫数据
代码
# -*-coding:utf-8 -*-
# BY WANGCC
import requests
from bs4 import BeautifulSoup
# Browser-like User-Agent header passed to requests.get in get_html.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:48.0) Gecko/20100101 Firefox/48.0',
}
def get_html(url, timeout=10):
    """Download a page and return its raw body.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, ``requests.get`` can block indefinitely when the
            host stops responding.

    Returns:
        The undecoded response body (``Response.content``).

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
    """
    # The module-level ``headers`` dict supplies the User-Agent.
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.content
def get_juzi(html):
    """Print the text of every ``<a class="xlistju">`` element in the page.

    Args:
        html: Markup of the page, parsed with the ``lxml`` parser.

    Each text is followed by a separate ``print("\\n")`` call, which leaves
    two blank lines between consecutive entries.
    """
    quote_links = BeautifulSoup(html, "lxml").find_all('a', class_="xlistju")
    # Extract each link's text lazily, right before it is printed.
    for quote_text in (link.get_text() for link in quote_links):
        print(quote_text)
        print("\n")
def get_title(html):
    """Print the page title with the '_句子迷' marker removed.

    Args:
        html: Markup of the page, parsed with the ``lxml`` parser.

    Prints nothing when the document has no ``<title>`` element.
    """
    soup = BeautifulSoup(html, "lxml")
    title_tag = soup.title
    if title_tag is None:
        # soup.title is None when the page has no <title>; skip the title
        # instead of raising AttributeError so the caller can continue.
        return
    # Remove every occurrence of the '_句子迷' marker from the title text.
    print(title_tag.get_text().replace('_句子迷', ''))
if __name__ == '__main__':
    # URL pattern, e.g. http://www.juzimi.com/article/316132?page=0
    page_url = 'https://www.juzimi.com/article/20657?page=%s'
    page_count = 8  # number of pages, set by hand
    for page in range(page_count):
        page_html = get_html(page_url % page)
        if not page:
            # Only the first page's title is printed, before its quotes.
            get_title(page_html)
        get_juzi(page_html)
参考了网上的代码，但目前还有问题，需要明天调整。
网友评论