本周学习内容爬虫,已理解标签的选择。
目前问题:
1. 对 Python 函数不熟练
2. 解析索引页获取详情页 URL,解析详情页获取目标内容
3. Ajax 请求,翻页请求
4. BeautifulSoup,正则表达式(以后学)
5. 存取数据库
学完后再进行全面总结
```
# 《爬取58招聘信息》
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
# Pretend to be a desktop Chrome browser so 58.com serves the normal page.
headers = {'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36"}


def get_page_index(url):
    """Fetch *url* and return its HTML text, or None on any failure.

    Returns None both for non-200 status codes and for network errors,
    so callers only need one falsy check.  (The original returned the
    string '请求错误' from the except branch, which the caller would
    then try to parse as HTML.)
    """
    try:
        # timeout prevents the script from hanging forever on a dead server
        response = requests.get(url, headers=headers, timeout=10)
        response.encoding = 'utf-8'
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('请求错误')  # "request error" — report, then signal failure with None
        return None
def parse_page_index(html):
    """Parse a 58.com job-listing index page and print [address, name] per job.

    The original chained ``find_all(...)[0]`` lookups, which raise
    IndexError as soon as any element is missing; this version uses
    ``find`` and skips incomplete items instead of crashing.  It also
    tolerates ``html`` being None (failed fetch).
    """
    if not html:
        return
    soup = BeautifulSoup(html, 'lxml')
    container = soup.find('ul', id='list_con')
    if container is None:
        return  # page layout changed or anti-bot page returned
    for item in container.find_all('li', class_='job_item clearfix'):
        job_div = item.find('div', class_='job_name clearfix')
        anchor = job_div.find('a') if job_div else None
        if anchor is None:
            continue
        address = anchor.find('span', class_='address')
        name = anchor.find('span', class_='name')
        if address is not None and name is not None:
            print([address.text, name.text])
def main():
    """Entry point: fetch the 58.com nanny-job index page and print each job.

    Guards against a failed fetch — ``get_page_index`` returns a falsy
    value on error, and passing that straight into the parser would crash.
    """
    url = 'https://cd.58.com/jiazhengbaojiexin/?key=月嫂'
    html = get_page_index(url)
    if html:
        parse_page_index(html)


if __name__ == "__main__":
    main()
```
网友评论