美文网首页
python抓取百度百科

python抓取百度百科

作者: 原上的小木屋 | 来源:发表于2021-03-17 08:20 被阅读0次

Python 抓取百度百科结构化信息

import pymysql
import re
import requests
from lxml import html
import xlwt,xlrd
def baidubaike(name):
    """Fetch the Baidu Baike entry for *name* and return its infobox as a dict.

    The page at ``https://baike.baidu.com/item/<name>`` is downloaded, a few
    inline tags are flattened so that every infobox cell holds one text node,
    and the ``basicInfo-item name`` / ``basicInfo-item value`` cells are
    paired up in document order.

    Args:
        name: Entry title; it is converted with ``str()`` and appended to the
            base URL.

    Returns:
        A dict mapping infobox field names to field values, with
        non-breaking spaces removed from both. An empty dict is returned
        (after printing a message) when the number of names and values
        differ, because they can then no longer be paired reliably.

    Network errors and timeouts raised by ``requests`` are not caught here.
    """
    baseurl = 'https://baike.baidu.com/item/'
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'}
    url = baseurl + str(name)

    # A timeout keeps one stalled connection from hanging the whole run.
    response = requests.get(url, headers=headers, timeout=10)
    print(response.status_code)
    print(response.url)
    text = response.content.decode("utf-8").replace('\n', '')

    # Flatten inline markup so the text()-based XPath below yields a single
    # text node per cell. The \b anchors keep the patterns from also eating
    # unrelated tags that merely start with the same letters (<article>,
    # <aside>, <embed>, ...).
    text = text.replace('</a>', '')
    text = re.sub(r'<a\b[^>]*>', '', text)
    text = text.replace('<br/>', '、')
    text = re.sub(r'<em\b[^>]*>', '', text)
    text = text.replace('</em>', '、')
    text = re.sub(r'<sup\b[^>]*>', '', text)
    text = text.replace('</sup>', '、')
    text = text.replace('<i>', '')
    text = text.replace('</i>', '')
    tree = html.fromstring(text)

    raw_names = tree.xpath('//dt[@class="basicInfo-item name"]/text()')
    raw_values = tree.xpath('//dd[@class="basicInfo-item value"]/text()')

    field_names = [item.replace('\xa0', '') for item in raw_names]
    field_values = [item.replace('\xa0', '') for item in raw_values]

    if len(field_names) != len(field_values):
        # Names and values cannot be aligned; report it and give the caller
        # an empty result instead of failing on an unassigned variable.
        print(name, "出现了一个错误")
        return {}

    # Later duplicates of a field name overwrite earlier ones.
    return dict(zip(field_names, field_values))

# Demo: scrape a single entry and show the resulting field -> value dict.
aa = baidubaike("刘诗诗")
print(aa)

Python 从 Excel 读取数据,并将抓取到的数据存入 Excel

import pymysql
import re
import requests
from lxml import html

import xlwt,xlrd
def baidubaike(name):
    """Fetch the Baidu Baike entry for *name* and return its infobox as a dict.

    The page at ``https://baike.baidu.com/item/<name>`` is downloaded, a few
    inline tags are flattened so that every infobox cell holds one text node,
    and the ``basicInfo-item name`` / ``basicInfo-item value`` cells are
    paired up in document order.

    Args:
        name: Entry title; it is converted with ``str()`` and appended to the
            base URL.

    Returns:
        A dict mapping infobox field names to field values, with
        non-breaking spaces removed from both. An empty dict is returned
        (after printing a message) when the number of names and values
        differ, because they can then no longer be paired reliably.

    Network errors and timeouts raised by ``requests`` are not caught here.
    """
    baseurl = 'https://baike.baidu.com/item/'
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'}
    url = baseurl + str(name)

    # A timeout keeps one stalled connection from hanging the whole run.
    response = requests.get(url, headers=headers, timeout=10)
    print(response.status_code)
    print(response.url)
    text = response.content.decode("utf-8").replace('\n', '')

    # Flatten inline markup so the text()-based XPath below yields a single
    # text node per cell. The \b anchors keep the patterns from also eating
    # unrelated tags that merely start with the same letters (<article>,
    # <aside>, <embed>, ...).
    text = text.replace('</a>', '')
    text = re.sub(r'<a\b[^>]*>', '', text)
    text = text.replace('<br/>', '、')
    text = re.sub(r'<em\b[^>]*>', '', text)
    text = text.replace('</em>', '、')
    text = re.sub(r'<sup\b[^>]*>', '', text)
    text = text.replace('</sup>', '、')
    text = text.replace('<i>', '')
    text = text.replace('</i>', '')
    tree = html.fromstring(text)

    raw_names = tree.xpath('//dt[@class="basicInfo-item name"]/text()')
    raw_values = tree.xpath('//dd[@class="basicInfo-item value"]/text()')

    field_names = [item.replace('\xa0', '') for item in raw_names]
    field_values = [item.replace('\xa0', '') for item in raw_values]

    if len(field_names) != len(field_values):
        # Names and values cannot be aligned; report it and give the caller
        # an empty result instead of failing on an unassigned variable.
        print(name, "出现了一个错误")
        return {}

    # Later duplicates of a field name overwrite earlier ones.
    return dict(zip(field_names, field_values))




# --- Read the spreadsheet ---------------------------------------------------
# The entry names to look up are the values of the first column of the first
# sheet of the input workbook.
readbook = xlrd.open_workbook('C:\\Users\\root\\Desktop\\6.xls')
sheet = readbook.sheet_by_index(0)
data = sheet.col_values(0)

# Scrape every entry exactly once and keep the result, so the write pass
# below does not download each page a second time.
records = []
# headlist holds the union of all field names in first-seen order;
# column_of maps each field name to its column index for O(1) lookup.
headlist = []
column_of = {}
for entry_name in data:
    print(entry_name)
    info = baidubaike(entry_name)
    records.append(info)
    a = info.keys()
    print(a)
    for field in a:
        if field not in column_of:
            column_of[field] = len(headlist)
            headlist.append(field)
print(headlist)
print(1234)

# --- Write the results ------------------------------------------------------
workbook = xlwt.Workbook(encoding='utf-8')
# Step 2: create a worksheet.
worksheet = workbook.add_sheet('My Worksheet')
# Step 3: fill it in; write() takes (row, column, value).
# Row 0 is the header row with one column per distinct field name.
for k, title in enumerate(headlist):
    worksheet.write(0, k, title)
# Row i+1 holds the fields of the i-th entry, each under its header column.
for row, info in enumerate(records, start=1):
    for field, value in info.items():
        indexkey = column_of[field]
        print(indexkey)
        print(headlist[indexkey])
        worksheet.write(row, indexkey, value)
# Step 4: save (nothing reaches disk until this call).
workbook.save('C:\\Users\\root\\Desktop\\ls.xls')

相关文章

网友评论

      本文标题:python抓取百度百科

      本文链接:https://www.haomeiwen.com/subject/xulscltx.html