import requests
from lxml import etree
from bs4 import BeautifulSoup as bf
# Desktop-browser User-Agent so the site does not reject the request as a bot.
headers = {'user-agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36'}
# Index page of the novel to scrape on soxscc.com; swap this URL for a different novel.
url = 'https://www.soxscc.com/WoZaiLingQiFuSuDiShiJieBaiDiTan/'
# Accumulates relative chapter links collected by get_urls().
hrefs = []
# Per-chapter output template: title, blank line, body, then spacer newlines.
output = '{}\n\n{}\n\n\n\n\n\n\n'
def get_org_url(url):
    """Fetch *url* and return the response body as text.

    Uses the module-level ``headers`` (browser User-Agent). Raises
    ``requests.HTTPError`` on a non-2xx status and ``requests.Timeout``
    if the server does not respond within 10 seconds.
    """
    # timeout prevents the scraper from hanging forever on a stalled connection
    resp = requests.get(url, headers=headers, timeout=10)
    # fail loudly on 4xx/5xx instead of silently parsing an error page
    resp.raise_for_status()
    return resp.text
def get_urls(text, novel_id='novel160576'):
    """Parse the chapter-index HTML and collect relative chapter links.

    Parameters
    ----------
    text : str
        HTML of the novel's index page.
    novel_id : str
        ``id`` attribute of the ``div`` holding the chapter list; each
        novel on the site has its own id, so pass it to scrape a
        different novel (default keeps the original hard-coded value).

    Returns
    -------
    list[str]
        The module-level ``hrefs`` list, extended in place with the
        relative chapter URLs found on the page.
    """
    tree = etree.HTML(text)
    links = tree.xpath(f"//div[@id='{novel_id}']//dd//a/@href")
    hrefs.extend(links)
    return hrefs
# Fetch the index page and collect every chapter link for this novel.
text = get_org_url(url)
hrefsl = get_urls(text)
# Open the output file once (append mode) instead of reopening per chapter.
with open('摆地摊.txt', 'a', encoding='utf-8') as f:
    # range(3): scrape only the first 3 chapters; raise to fetch more.
    for i in range(3):
        chapter_url = 'https://www.soxscc.com' + hrefsl[i]
        chapter_html = get_org_url(chapter_url)
        soup = bf(chapter_html, 'lxml')
        # Chapter title lives in <div class="read_title"><h1>…</h1></div>.
        title = soup.find('div', class_='read_title').find('h1').string
        # Chapter body is the text of <div class="content">.
        content = soup.find('div', class_="content").get_text()
        f.write(output.format(title, content))
本脚本爬取的是搜小说(soxscc)网站的内容。更换 url 即可选择要爬取的小说;更换 get_urls 中 div 的 id(如 novel160576)以匹配对应小说的章节列表;调整 range() 中的数字即可指定爬取的章节数量。
网友评论