美文网首页
Python-网络爬虫爬取陶素药物价格

Python-网络爬虫爬取陶素药物价格

作者: ZeroDesigner | 来源:发表于2019-08-09 10:05 被阅读0次
    定义:

    网络爬虫(又被称为网页蜘蛛),是一种按照一定的规则,自动地抓取万维网信息的程序或者脚本。(来源于百度,懒得写定义)

    目的:

    爬取陶素官网药物的售价信息
    https://tsbiochem.com/alltargets

    工具

    python的爬虫工具很多,这次使用Beautiful Soup为例,简单,快速。

    步骤:
    #安装
    conda install -c conda-forge beautifulsoup4
    conda install requests
    # Import packages (one import per line, stdlib first).
    import os
    import re
    import random
    import sys
    import time

    import requests
    import bs4
    from bs4 import BeautifulSoup
    from fake_useragent import UserAgent

    # Fetch the target-overview page with requests and parse it with BeautifulSoup.
    response = requests.get('https://tsbiochem.com/alltargets')
    soup = BeautifulSoup(response.content, "html.parser")
    # BUG FIX: prettify is a method and must be called; the original
    # `soup.prettify` merely evaluated to the bound method object
    # (as the "<bound method Tag.prettify ...>" output below shows).
    soup.prettify()
    #显示
    <bound method Tag.prettify of <!DOCTYPE html>
    
    <html>
    <head>
    <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
    <meta content="IE=Edge" http-equiv="X-UA-Compatible"/>
    <meta content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no" name="viewport"/>
    <meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
    <meta content="true" name="HandheldFriendly"/>
    <link href="/css/bootstrap.min.css" media="all" rel="stylesheet" type="text/css"/>
    <link href="/images/ticontransparent.png" rel="icon"/>
    <link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/4.7.0/css/font-awesome.min.css" rel="stylesheet"/>
    <link href="/css/tsweb_base.css?ver=1.3" rel="stylesheet" type="text/css">
    <link href="/css/tsweb_layout.css?ver=1.35" rel="stylesheet" type="text/css">
    <link href="/css/tsweb_shopcart.css" rel="stylesheet" type="text/css">
    <link href="/css/tsweb_orderway.css" rel="stylesheet" type="text/css"/>
    <link href="/css/tsweb_compound.css" rel="stylesheet" type="text/css"/>
    <link href="/css/tsweb_library.css" rel="stylesheet" type="text/css"/>
    <link href="/css/tsweb_alltargets.css" rel="stylesheet" type="text/css"/>
    <link href="/css/tsweb_pathway.css" rel="stylesheet" type="text/css"/>
    <link href="/css/tsweb_search.css" rel="stylesheet" type="text/css"/>
    <link href="/css/tsweb_target.css" rel="stylesheet" type="text/css"/>
    <link href="/css/tsweb_contact.css" rel="stylesheet" type="text/css"/>
    <link href="/css/tsweb_about.css" rel="stylesheet" type="text/css"/>
    <link href="/css/tsweb_partner.css" rel="stylesheet" type="text/css"/>
    <link href="/css/tsweb_calculator.css" rel="stylesheet" type="text/css"/>
    <link href="/css/tsweb_news.css" rel="stylesheet" type="text/css"/>
    <link href="/css/tsweb_join_us.css" rel="stylesheet" type="text/css"/>
    <link href="/css/tsweb_faq.css" rel="stylesheet" type="text/css"/>
    <link href="/css/tsweb_alllibraries.css" rel="stylesheet" type="text/css"/>
    <script src="/js/jquery-3.1.1.min.js"></script>
    <script src="/js/bootstrap.min.js"></script>
    <script src="/js/tsweb_base.js"></script>
    <meta content="dtb30YDZRhiTZGGnIbEL36fKUTMGnLcU7DTY1Sdd" name="csrf-token"/>
    <title>通路靶点 | 陶素生化</title>
    # Extract the content we need: every element whose CSS class is
    # "content_a_en target_a", and collect the URL from its href attribute.
    target_anchors = soup.find_all(class_="content_a_en target_a")
    http1 = []
    for anchor in target_anchors:
        http1.append(anchor['href'])
    http1
    #结果
    'https://tsbiochem.com/target/Salt-Inducible%20Kinase',
     'https://tsbiochem.com/target/Serine-Protease',
     'https://tsbiochem.com/target/Serine-threonin-kinase',
     'https://tsbiochem.com/target/SGK',
     'https://tsbiochem.com/target/SGLT',
     'https://tsbiochem.com/target/Sigma-receptor',
     'https://tsbiochem.com/target/Sirtuin',
     'https://tsbiochem.com/target/Sodium-Channel',
     'https://tsbiochem.com/target/Somatostatin',
     'https://tsbiochem.com/target/Src',
     'https://tsbiochem.com/target/STAT',
     'https://tsbiochem.com/target/Survivin',
     'https://tsbiochem.com/target/Syk',
     'https://tsbiochem.com/target/TAM-Receptor',
     'https://tsbiochem.com/target/Telomerase',
     'https://tsbiochem.com/target/TGF-beta-Smad',
     'https://tsbiochem.com/target/Thioredoxin',
     'https://tsbiochem.com/target/Thrombin',
    
    # Rotate the User-Agent header per request to reduce the chance of being
    # blocked by anti-scraping measures. Build UserAgent() once, not per loop
    # iteration (it loads a browser database on construction).
    ua = UserAgent()
    http2 = []
    for target_url in http1:
        headers = {"User-Agent": ua.random}
        response1 = requests.get(target_url, headers=headers)
        soup1 = BeautifulSoup(response1.content, "html.parser")
        # Find every <td> tag on the target page; each one may wrap an <a>
        # linking to a compound page.
        body1 = soup1.find_all('td')
        for cell in body1:
            link = cell.find('a')
            # Replace the original bare `except: True` (which silently
            # swallowed every error) with explicit checks: find() returns
            # None when the cell has no <a>, and a link may lack an href.
            if link is not None and link.has_attr('href'):
                http2.append(link['href'])
    # Deduplicate the compound links (set() does not preserve order).
    http3 = list(set(http2))
    
    #看下我们都做了什么,我这里直接展示最后的结果,如果你想看看每一步都发生了什么,取http1中的一个元素测试一下就好
    http3[1:10]
    #结果
    ['https://tsbiochem.com/compound/N-Acetyl-5-hydroxytryptamine',
     'https://tsbiochem.com/compound/Atipamezole',
     'https://tsbiochem.com/compound/Zacopride%20hydrochloride',
     'https://tsbiochem.com/compound/GTS-21-dihydrochloride',
     'https://tsbiochem.com/compound/Carvedilol',
     'https://tsbiochem.com/compound/Isoetharine%20mesylate%20salt',
     'https://tsbiochem.com/compound/Sotalol-hydrochloride',
     'https://tsbiochem.com/compound/Urapidil-hydrochloride',
     'https://tsbiochem.com/compound/Pirenzepine-hydrochloride']
    # Scrape package sizes and prices for every compound URL; store them in a
    # dict keyed by CAS number (or by URL when no CAS number is on the page).
    fin_dict = {}
    ua = UserAgent()
    # CAS registry numbers look like "62-51-1"; compile once outside the loop.
    cas_pattern = re.compile(r'\s\d+-\d+-\d+\s')
    for compound_url in http3:
        headers = {"User-Agent": ua.random}
        response3 = requests.get(compound_url, headers=headers)
        soup3 = BeautifulSoup(response3.content, "html.parser")
        # BUG FIX: the original ran the same regex search twice and, when no
        # CAS number was found, `cas.group(0)` raised AttributeError and the
        # bare except threw away the scraped prices, storing the placeholder
        # instead. Search once and fall back to the URL as the key.
        match = cas_pattern.search(str(soup3))
        key = match.group(0) if match else compound_url
        # Each element with class "qtyInput" carries one package size and its
        # price as tag attributes.
        package = [[item['package'], item['price']]
                   for item in soup3.find_all(class_="qtyInput")]
        # Keep the [['0', '0']] placeholder semantics for pages where nothing
        # could be scraped.
        fin_dict[key] = package if package else [['0', '0']]
    #看下结果
    fin_dict
    #结果,键为cas编号,值以列表储存,列表中第一个元素为规格,第二个元素为价格
    {' 62-51-1 ': [['25 mg', '154.00'],
      ['50 mg', '278.00'],
      ['100 mg', '500.00'],
      ['200 mg', '850.00'],
      ['1 mL * 10 mM (in DMSO)', '459.00']],
     ' 1210-83-9 ': [['5 mg', '336.00'],
      ['10 mg', '664.00'],
      ['25 mg', '1064.00'],
      ['50 mg', '1808.00'],
      ['100 mg', '3254.00'],
      ['200 mg', '5858.00']],
     ' 104054-27-5 ': [['5 mg', '369.00'],
      ['10 mg', '665.00'],
      ['25 mg', '1197.00'],
      ['50 mg', '1975.00'],
      ['100 mg', '3555.00'],
      ['200 mg', '6399.00'],
      ['1 mL * 10 mM (in DMSO)', '369.00']],
    .....}
    # Export fin_dict to out.csv: one row per compound -- the key (CAS number
    # or URL) followed by alternating package-size and price columns.
    import csv

    # Use a context manager so the file is closed even if a write fails, fix
    # the platform-dependent default encoding, and let the stdlib csv module
    # handle quoting (this also drops the stray trailing comma the original
    # hand-rolled writer appended to every row).
    with open('out.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        for key, packages in fin_dict.items():
            row = [key]
            for size, price in packages:
                row.extend([size, price])
            writer.writerow(row)
    
    

    相关文章

      网友评论

          本文标题:Python-网络爬虫爬取陶素药物价格

          本文链接:https://www.haomeiwen.com/subject/isnbdctx.html