安装环境
pip install requests
pip install pyquery
pip install beautifulsoup4
import requests
import os
import time
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
def get_html(url):
    """Download *url* and return its body as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded response text when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status or a failed request).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
    }
    try:
        r = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        # Only network/HTTP-level failures are swallowed; a bare ``except``
        # would also hide KeyboardInterrupt and genuine programming errors.
        print('出现异常')
        return None

    if r.status_code != 200:
        return None

    # Decode with the encoding guessed from the body rather than the one
    # derived from the response headers.
    r.encoding = r.apparent_encoding
    print(r.status_code)
    return r.text
def one_parser_html(html):
    """Collect the link targets listed on the index page.

    Looks at every ``<a>`` below ``div.right-content > ul`` and gathers its
    ``href`` attribute, printing each one as it is found.

    Args:
        html: Markup of the index page. May be ``None`` or empty, since
            ``get_html`` returns ``None`` when a download fails.

    Returns:
        A list of href strings in document order; empty when there is no
        markup to parse.
    """
    if not html:
        return []

    doc = pq(html)
    anchors = doc('div.right-content >ul').find('a').items()
    new_list = []
    for a_href in anchors:
        href = a_href.attr('href')
        if not href:
            # An anchor without an href has nothing to crawl.
            continue
        print(href)
        new_list.append(href)
    return new_list
def second_parser_html(html2):
    """Extract the fields of a single article page.

    Args:
        html2: Markup of the article page. May be ``None`` or empty, since
            ``get_html`` returns ``None`` when a download fails.

    Returns:
        A dict with the keys ``'title'``, ``'time'``, ``'text'`` and
        ``'edit'``. ``'title'`` is ``None`` when the page has no
        ``h1.main-title`` element. An empty dict is returned when there is
        no markup to parse.
    """
    if not html2:
        return {}

    soup = BeautifulSoup(html2, 'lxml')
    doc = pq(html2)

    # find() returns None when the heading is missing; guard before
    # dereferencing so one unusual page does not raise AttributeError.
    title_tag = soup.find('h1', attrs={'class': 'main-title'})

    result = {
        'title': title_tag.string if title_tag is not None else None,
        'time': doc('div.date-source >span').text(),
        'text': doc('div.article >p').text(),
        'edit': doc('p.show_author').text(),
    }
    print(result)
    return result
if __name__ == '__main__':
    # Script entry point: fetch the index page, then fetch and parse every
    # article it links to.
    from urllib.parse import urljoin

    url = 'https://news.sina.com.cn/china/'
    html = get_html(url)
    # get_html returns None on failure; treat that as "no links found".
    new_list = one_parser_html(html) if html else []
    for link in new_list:
        if not link:
            continue
        # Resolve relative or protocol-relative hrefs against the index URL;
        # absolute URLs pass through unchanged.
        u = get_html(urljoin(url, link))
        if u is None:
            # One failed download should not abort the remaining articles.
            continue
        second_parser_html(u)
PS:要是再有问题,给我留言,我再改改。
如果对 pyquery 不太熟悉,可以参考 pyquery 官方文档。
网友评论