1 导入 BeautifulSoup
from bs4 import BeautifulSoup
2 解析html
# Parse the response body into a BeautifulSoup tree using the "lxml" parser.
# NOTE(review): `response` is not defined in this snippet — presumably an
# HTTP response object exposing the page HTML as `.text`; confirm at the call site.
soup = BeautifulSoup(response.text,'lxml')
3 搜索并去除空格
# Take the first element matched by the CSS selector, then read its text
# with leading/trailing whitespace stripped from each text fragment.
# Indexing [0] raises IndexError when the selector matches nothing.
heading_matches = soup.select("div .info-group h1")
first_heading = heading_matches[0]
name = first_heading.get_text(strip=True)
4 按冒号(:)分割文本
# Read the stripped text of `r` and split it on every ":" into a list of parts.
# NOTE(review): `r` is not defined in this snippet — presumably a tag selected
# earlier. The result is not assigned here; bind it to a name to use it.
r.get_text(strip=True).split(":")
5 检索表格
# Collect every <table> element in the document, in document order.
# `find_all` is the current Beautiful Soup 4 name; `findAll` is a deprecated alias.
tables = soup.find_all("table")
6 获取tr td
# Rows of the first table.
trs_0 = tables[0].find_all("tr")
# Data cells (<td>) and header cells (<th>) are both read from the second
# row (index 1); this raises IndexError if the table has fewer than two rows.
tds_0 = trs_0[1].find_all("td")
ths_0 = trs_0[1].find_all("th")
7 获取内容并去除空格
# Take the first child node of the first <th> cell and strip surrounding whitespace.
# NOTE(review): assumes that first child is a text node (string-like); if it is
# a nested tag, `.strip()` will not behave as intended — confirm against the page.
# The result is not assigned here; bind it to a name to use it.
ths_0[0].contents[0].strip()
8 拼接list
# Build one record per data row of the second table and append it to
# `affiliations`.
# NOTE: `affiliations` is not created here; it must already exist as a list
# (e.g. `affiliations = []`) before this loop runs.
trs_1 = tables[1].find_all("tr")
for tr in trs_1:
    tds_1 = tr.find_all("td")
    # Rows with fewer than three <td> cells (for example a header row made
    # of <th> cells) cannot supply all three fields; skip them instead of
    # failing with IndexError.
    if len(tds_1) < 3:
        continue
    affiliation = {
        'business': tds_1[0].text.strip(),
        'hire_date': tds_1[1].text.strip(),
        'termination_date': tds_1[2].text.strip(),
    }
    affiliations.append(affiliation)
9 json 输出
# Serialize the collected records to a JSON string.
# The return value is discarded here; assign or print it to use the output.
json.dumps(affiliations)
10 格式化日期
# Re-format a date given as month/day/four-digit-year into
# "YYYY-MM-DD HH:MM:SS". The input carries no time of day, so the
# time portion comes out as 00:00:00.
parsed_expiration = time.strptime(expiration, '%m/%d/%Y')
item['expiration'] = time.strftime("%Y-%m-%d %H:%M:%S", parsed_expiration)
11 判定数组是否越界.
if(len(mailing_address)>=3):
12
网友评论