I've been learning web scraping recently and used Qiushibaike (qiushibaike.com) for practice. Go easy on me, experts!
#!/usr/local/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2018/6/29 14:20
# @Author : 老虎傻傻
# @FileName: qiushibaike.py
# @Software: PyCharm
import re
import requests
from bs4 import BeautifulSoup
url = 'https://www.qiushibaike.com/text/page/'

def get_page(url):
    """Fetch one page of the text channel and return its HTML."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    # Pass the headers as a keyword argument; passed positionally,
    # requests.get() would treat them as query params and the User-Agent would be lost.
    html = requests.get(url, headers=headers)
    return html.text

def parse_page(html):
    """Extract author names and joke text from one page of HTML."""
    soup = BeautifulSoup(html, 'lxml')
    # Select the <h2> author tags and <span> joke tags, then pull the text
    # between the tags out of the stringified results with a regex.
    authors = soup.select('#content-left > div > div.author.clearfix > a > h2')
    author = re.findall('.*?<h2>\n(.*?)\n</h2>', str(authors), re.S)
    jokes = soup.select('#content-left > div > a.contentHerf > div.content > span')
    joke = re.findall('.*?<span>\n(.*?)\n</span>', str(jokes), re.S)
    for a, j in zip(author, joke):
        data = {
            'author': a,
            'joke': j.strip()
        }
        print(data)

def main(url):
    # The text channel's pages are numbered starting from 1, so skip page 0.
    for i in range(1, 14):
        urls = url + str(i)
        html = get_page(urls)
        parse_page(html)


if __name__ == '__main__':
    main(url)
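
As a side note, the regex step in parse_page can be avoided by reading the tag text directly with BeautifulSoup's get_text(), which is less sensitive to the exact whitespace inside the tags. Below is a minimal sketch of that variant (parse_page_text is just an illustrative name; the selectors are the same ones used above and assume the page structure has not changed):

from bs4 import BeautifulSoup

def parse_page_text(html):
    # Same CSS selectors as parse_page above, but pull the text out of each
    # tag with get_text() instead of matching the raw HTML with a regex.
    soup = BeautifulSoup(html, 'lxml')
    authors = soup.select('#content-left > div > div.author.clearfix > a > h2')
    jokes = soup.select('#content-left > div > a.contentHerf > div.content > span')
    for a, j in zip(authors, jokes):
        data = {
            'author': a.get_text(strip=True),
            'joke': j.get_text(strip=True)
        }
        print(data)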