import requests
from lxml import etree
from bs4 import BeautifulSoup as bf

# Chapter pages look like:
# https://www.soxscc.com/SuiTangWoLaoPoShiChangSunWuGou/157152.html
# https://www.soxscc.com/SuiTangWoLaoPoShiChangSunWuGou/157153.html
# /SuiTangWoLaoPoShiChangSunWuGou/864881.html
url = 'https://www.soxscc.com/SuiTangWoLaoPoShiChangSunWuGou/'
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0"}
resp = requests.get(url, headers=headers)
resp_xpath = etree.HTML(resp.text)
# Grab the relative href of every chapter from the table of contents
hrefs = resp_xpath.xpath("//div[@id='novel150661']//dd/a/@href")
for i in range(400):  # first 400 chapters
    url = 'https://www.soxscc.com' + hrefs[i]
    resp = requests.get(url, headers=headers)
    # BeautifulSoup returns the chapter body as a single string
    soup = bf(resp.text, 'lxml')
    content = soup.find('div', class_='content').get_text()
    # The chapter title is still pulled out with XPath
    r_x = etree.HTML(resp.text)
    title = r_x.xpath("//div[@class='read_title']/h1/text()")
    output = "\n{}\n\n{}-----------\n"
    outputs = output.format(title[0], content)
    print(outputs)
    with open('biquge.txt', 'a', encoding='utf-8') as f:
        f.write(outputs)
At first I scraped the novel purely with XPath, but that ran into a problem: once I had the content, it would not write to the file cleanly.
output = "\n{}\n\n{}-----------\n"
Writing to the file with this format string, the problem was that every line of text came out attached to a chapter title, because the content arrived as a list of text fragments and each fragment got formatted separately. In the end I switched to BeautifulSoup to parse the content (see the sketch below).
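A minimal sketch of the difference, assuming the chapter body still sits in <div class='content'> as in the code above and using one of the chapter URLs from the comments: lxml's text() yields one string per text node, i.e. a list of paragraph fragments, while BeautifulSoup's get_text() concatenates them into a single string up front.

import requests
from lxml import etree
from bs4 import BeautifulSoup

url = 'https://www.soxscc.com/SuiTangWoLaoPoShiChangSunWuGou/157152.html'
headers = {"User-Agent": "Mozilla/5.0"}
resp = requests.get(url, headers=headers)

# Pure XPath: text() returns a list with one string per text node,
# so each paragraph fragment is a separate entry, not one block of text.
fragments = etree.HTML(resp.text).xpath("//div[@class='content']/text()")
content_from_xpath = '\n'.join(s.strip() for s in fragments)

# BeautifulSoup: get_text() already merges all the text nodes into one string.
soup = BeautifulSoup(resp.text, 'lxml')
content_from_bs = soup.find('div', class_='content').get_text()

With the fragments joined first, a single format() call per chapter writes the title exactly once, which is the same effect get_text() gives for free.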