Python爬取中药网站信息并对其进行简单的分析

作者: 9ba4bd5525b9 | 来源:发表于2019-08-02 15:22 被阅读54次

开发工具

Python版本：3.5.4
相关模块：

爬虫：

import requests
from bs4 import BeautifulSoup

词云：

from wordcloud import WordCloud
import jieba
from os import path
import matplotlib.pyplot as plt

(1).主要思路：

写了两个文件，具体如下：

1).数据爬取并生成txt文件的py文件

2).利用Python相关的包（jieba、wordcloud、matplotlib）生成词云的py文件

(2).遇到的问题以及解决方案：

wordcloud包的安装配置出现了很大的问题：本机系统同时安装了两个Python版本，导致安装过程中出现了很多额外的问题。

解决：在同学的帮助下安装了whl文件并删除了本机中的另一个python版本。

信息爬取过慢

解决：暂未解决。需要爬取的页面预计超过100页，因此可能需要借助其他技术（例如多线程或异步请求）来提高爬取速度。

使用演示

程序运行截图：

image.png

导出文档：

image.png

数据爬取并生成txt文件的py文件：

import requests
from bs4 import BeautifulSoup
'''
遇到不懂的问题？Python学习交流群：821460695满足你的需求，资料都已经上传群文件，可以自行下载！
'''
# Fetch -----------------------------------------------------------------
def catchSoup(url, timeout=10):
    """Download a page and parse it into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch, for example
            'http://www.18ladys.com/post/buchong/'.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` may wait indefinitely on a stalled
            connection, which would freeze the whole crawl.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
    """
    res = requests.get(url, timeout=timeout)
    # Force UTF-8 decoding of the body instead of relying on the encoding
    # that requests guesses from the response headers.
    res.encoding = 'utf-8'
    return BeautifulSoup(res.text, 'html.parser')

# Category names and their page addresses (looked up on the home page) ---
def kindSearch(soup):
    """Collect the link of every ``<li>`` element on a page.

    Args:
        soup: Parsed page; only ``soup.select('li')`` is used.

    Returns:
        A list of ``[text, href]`` pairs in document order, where ``text``
        is the text of the ``<li>`` and ``href`` is the target of its first
        ``<a>``. The item whose text is exactly '首页' is left out, and so
        are items that contain no ``<a>`` or whose first ``<a>`` has no
        ``href``.
    """
    herbKind = []
    for item in soup.select('li'):
        if item.text == '首页':
            continue
        anchors = item.select('a')
        if not anchors:
            # A list item without a link has no address to crawl; skipping
            # it avoids an IndexError on anchors[0].
            continue
        href = anchors[0].attrs.get('href')
        if href is None:
            # An <a> without an href (e.g. a named anchor) is not a link.
            continue
        herbKind.append([item.text, href])
    return herbKind

# Herb-name lookup (takes a parsed listing page) -------------------------
def nameSearch(soup):
    """Extract the bare herb names from the ``<h3>`` headings of a page.

    Each heading text is reduced to a name by, in order: keeping the part
    before the first '_', dropping a trailing '图片', dropping leading
    non-breaking spaces, cutting at the first '的功效', at the first '(',
    and at the first '功效', and finally dropping trailing '的' characters.

    Args:
        soup: Parsed page; only ``soup.select('h3')`` is used.

    Returns:
        A list with one cleaned name per ``<h3>``, in document order.
    """
    suffix = '图片'
    herbName = []
    for heading in soup.select('h3'):
        name = heading.text.split('_')[0]
        # Remove the literal suffix only. str.rstrip('图片') treats its
        # argument as a set of characters and would also eat a trailing
        # '片' or '图' that belongs to the herb name itself.
        if name.endswith(suffix):
            name = name[:-len(suffix)]
        name = name.lstrip('\xa0')
        name = name.split('的功效')[0].split('(')[0].split('功效')[0]
        herbName.append(name.rstrip('的'))
    return herbName

# Pagination links and their addresses -----------------------------------
def perPage(soup):
    """Return the pager links of a listing page, without the outer two.

    Every ``<a>`` inside an element matching ``.post.pagebar`` is collected
    as ``[text, href]``; the first and the last collected link are then
    dropped.
    NOTE(review): presumably the first link is the page already being
    viewed and the last one is a "next"/"last page" shortcut - confirm
    against the site's markup.

    Args:
        soup: Parsed listing page.

    Returns:
        A list of ``[text, href]`` pairs. Empty when the page has fewer
        than three pager links (including no pager at all).

    Raises:
        KeyError: If a pager ``<a>`` has no ``href`` attribute.
    """
    kindPage = [
        [link.text, link.attrs['href']]
        for bar in soup.select('.post.pagebar')
        for link in bar.select('a')
    ]
    # Slicing drops the ends by position, so it is safe on short or empty
    # lists and never removes an equal-valued entry from the middle.
    return kindPage[1:-1]
# Crawl every herb name of one category. `kind` is a 0-based index into
# the list that kindSearch returns for the home page.
def herbDetail(kind):
    """Crawl all listing pages of one category and collect the herb names.

    Args:
        kind: Index into ``kindSearch(<home page>)`` selecting the category.

    Returns:
        A list of lists: one list of herb names per listing page, with the
        category's first page first, followed by the pages returned by
        ``perPage`` in order.

    Raises:
        IndexError: If ``kind`` is outside the category list.
    """
    soup = catchSoup('http://www.18ladys.com/post/buchong/')  # start from the home page
    # Look the category up once: its display name and first-page address.
    kindName, adds = kindSearch(soup)[kind]
    print("正在爬取 "+str(kind)+'.'+kindName)
    # Download the first page a single time and reuse the parsed tree for
    # both the names on it and the pager links that lead to the other pages.
    firstPage = catchSoup(adds)
    totalRecord = [nameSearch(firstPage)]
    for add in perPage(firstPage):
        # add is a [link text, href] pair; only the address is needed.
        totalRecord.append(nameSearch(catchSoup(add[1])))
    print(totalRecord)
    return totalRecord

#===========================================================
#                      Script entry point
#===========================================================
def _formatReport(totalKind, recordsByKind):
    """Build the report text: a numbered directory, then one section per category.

    Args:
        totalKind: ``[name, href]`` pairs for every category (the directory).
        recordsByKind: For each crawled category, a list of pages, each
            page being a list of herb names. Entry ``i`` belongs to
            ``totalKind[i]``.

    Returns:
        The complete report as a single string.
    """
    parts = ['目录：\n']
    for index, kindInfo in enumerate(totalKind, 1):
        parts.append(str(index) + '.' + kindInfo[0] + ' ')
    for kindInfo, pages in zip(totalKind, recordsByKind):
        parts.append('\n' + str(kindInfo[0]) + ':\n')
        index = 1  # herb numbering restarts in every category
        for names in pages:
            for name in names:
                parts.append(str(index) + '.' + name + ' ')
                index += 1
    return ''.join(parts)


def main(kindLimit=20, outputPath='herbDetail.txt'):
    """Crawl the first ``kindLimit`` categories and append the report to a file.

    Args:
        kindLimit: Maximum number of categories to crawl. Any value at
            least as large as the number of categories crawls the whole
            site (the article's note suggests 41).
        outputPath: File the report is appended to, encoded as UTF-8.
    """
    # Category names and addresses, taken from the home page.
    totalKind = kindSearch(catchSoup('http://www.18ladys.com/post/buchong/'))
    # Never ask for more categories than the home page actually lists.
    kindCount = min(kindLimit, len(totalKind))
    # herbDetail(kind) crawls entry `kind` of the same category list, so
    # recordsByKind[i] lines up with totalKind[i] and every section is
    # labelled with the category it was really crawled from.
    recordsByKind = [herbDetail(kind) for kind in range(kindCount)]
    detailContent = _formatReport(totalKind, recordsByKind)
    # 'a+' appends: running the script again adds a second report to the
    # same file rather than replacing the previous one.
    with open(outputPath, 'a+', encoding='utf-8') as f:
        f.write(detailContent)


if __name__=="__main__":
    main()

词云生成部分

from wordcloud import WordCloud
import jieba
from os import path
import matplotlib.pyplot as plt
'''
遇到不懂的问题？Python学习交流群：821460695满足你的需求，资料都已经上传群文件，可以自行下载！
'''
# Load the crawled herb names. The context manager closes the file as soon
# as the text has been read.
# NOTE(review): the crawler writes 'herbDetail.txt' relative to its working
# directory; this absolute path assumes that directory was D:\ - confirm.
with open('D:\\herbDetail.txt', 'r', encoding='utf-8') as herb_file:
    comment_text = herb_file.read()
# Segment the Chinese text with jieba and join the tokens with spaces so
# that the word cloud can count them as separate words.
cut_text = " ".join(jieba.cut(comment_text))
cloud = WordCloud(
    # SimHei is used so that Chinese characters can be rendered.
    font_path="C:\\Windows\\Fonts\\simhei.ttf",
    background_color='white',
    max_words=2000,
    max_font_size=40
)
word_cloud = cloud.generate(cut_text)
word_cloud.to_file("cloud4herb.jpg")
# Show the word-cloud image ======================
plt.imshow(word_cloud)
plt.axis('off')
plt.show()

网友评论

本文标题：Python爬取中药网站信息并对其进行简单的分析

本文链接：https://www.haomeiwen.com/subject/chmldctx.html

延伸阅读

深度阅读

您也可以注册成为美文阅读网的作者，发表您的原创作品、分享您的心情！

Python爬取中药网站信息并对其进行简单的分析

使用演示

相关文章

网友评论

延伸阅读

深度阅读

栏目导航

热点阅读

虫虫

Python精选