利用 Bs4 爬取前程无忧职位信息，其中采用的是 CSS 选择器来提取标签内容。
代码如下:
from urllib.request import Request, urlopen

import xlwt
from bs4 import BeautifulSoup
class Bs4_Job:
    """Scrape 51job search-result pages and store the rows in a workbook.

    Each listing page is fetched with ``urllib``, parsed with BeautifulSoup
    using CSS selectors, and every complete row (title, company, location,
    link) is appended to an xlwt worksheet that is saved to ``filename``.
    """

    # Search-result URL; ``{}`` is replaced by the page number.
    URL_TEMPLATE = 'https://search.51job.com/list/360000,000000,0000,00,9,99,Java,2,{}.html'
    # Browser-like User-Agent sent with every request.
    USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0'

    def __init__(self, filename='bs4_51.xlsx'):
        """Create the workbook, write the header row and save it once.

        Args:
            filename: Path the workbook is saved to.
                NOTE(review): xlwt produces the legacy ``.xls`` format, so a
                ``.xlsx`` extension may be rejected by spreadsheet software;
                the default is kept for backward compatibility — consider
                passing a ``.xls`` path.
        """
        self.filename = filename
        self.count = 1  # next worksheet row to fill; row 0 holds the header
        self.f = xlwt.Workbook()  # create the workbook
        self.sheet1 = self.f.add_sheet('任务列表', cell_overwrite_ok=True)  # create the worksheet
        self.rowTitle = ['编号', '职位名', '公司名', '工作地点', '网站']
        for col, caption in enumerate(self.rowTitle):
            self.sheet1.write(0, col, caption)
        self.f.save(self.filename)

    def set_style(self, name, height, bold=False):
        """Build an xlwt cell style with the given font settings.

        Args:
            name: Font name.
            height: Value assigned to ``xlwt.Font.height``.
            bold: Whether the font is bold.

        Returns:
            A new ``xlwt.XFStyle`` instance whose font uses colour index 2.
        """
        # Instantiate the style; assigning the class itself would mutate
        # shared class state and return a class instead of a style object.
        style = xlwt.XFStyle()
        font = xlwt.Font()  # create the font
        font.name = name
        font.bold = bold
        font.colour_index = 2
        font.height = height
        style.font = font
        return style

    def geturl(self, first_page=1, last_page=2):
        """Scrape every listing page from ``first_page`` to ``last_page``.

        Args:
            first_page: First page number to fetch (inclusive).
            last_page: Last page number to fetch (inclusive).
        """
        for page in range(first_page, last_page + 1):
            self.getspider(self.URL_TEMPLATE.format(page))

    def getspider(self, url):
        """Fetch one listing page and append its job rows to the worksheet.

        Failures (network, parsing, saving) are reported on stdout and do
        not propagate, so one bad page does not stop the whole crawl.

        Args:
            url: Address of the listing page; ``None`` is ignored.

        Returns:
            None.
        """
        if url is None:
            return None
        try:
            request = Request(url, headers={'User-Agent': self.USER_AGENT})
            # Close the HTTP response as soon as the document is parsed.
            with urlopen(request) as response:
                soup = BeautifulSoup(response, 'lxml')
            # The first '.el' entry is skipped (presumably the table's
            # header row — verify against the page markup).
            rows = soup.select('.dw_table > .el')[1:]
            try:
                for row in rows:
                    self._store_row(row)
            finally:
                # Save once per page, and even when a row fails, so the
                # rows written so far are persisted.
                self.f.save(self.filename)
        except Exception as exc:
            print('出错!', exc)

    def _store_row(self, item):
        """Extract one job entry and write it to the sheet if it is complete."""
        title = self._first_text(item.select('p.t1 > span > a'))
        company_links = item.select('span.t2 > a')
        company = self._first_text(company_links)
        work = self._first_text(item.select('span.t3 '))
        # A missing ``href`` attribute yields '' instead of raising KeyError.
        href = company_links[0].get('href', '') if company_links else ''
        print(title, company, work, href)

        record = (self.count, title, company, work, href)
        if not all(record[1:]):
            return  # incomplete entry: skip it without consuming a row
        for col, value in enumerate(record):
            self.sheet1.write(self.count, col, value)
        self.count += 1

    @staticmethod
    def _first_text(nodes):
        """Return the stripped text of the first node, or '' if there is none."""
        if not nodes:
            return ''
        # get_text() also works for tags with several children, where
        # ``.string`` would be None.
        return nodes[0].get_text().strip()
# Run the crawl only when executed as a script. A non-empty string literal
# is always truthy, so the guard must compare ``__name__`` instead;
# otherwise merely importing this module would start the scrape.
if __name__ == '__main__':
    job = Bs4_Job()
    job.geturl()


网友评论