美文网首页
【request爬虫3】批量爬取Cell Blast

【request爬虫3】批量爬取Cell Blast

作者: Geekero | 来源:发表于2021-06-09 20:46 被阅读0次

    特别声明:

    1. 供交流学习使用,不得用作商业用途。
    2. 如有违规侵权,请联系删除。
    import json
    import os
    import re
    import subprocess
    import sys
    import time

    import requests
    # from pyquery import PyQuery as pq
    
    # Metadata endpoint to query, and the directory where downloads are stored.
    url = "https://cblast.gao-lab.org/datasets_meta"
    wd = "/share/disk1/Data/Users/luohb/spider/Cell_BLAST/result/"

    # HTTP headers for the metadata request; the User-Agent is a desktop
    # Chrome string, split across lines only for readability.
    headers = {
        "Accept": "application/json, text/plain, */*",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,ja;q=0.7",
        "Connection": "keep-alive",
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/80.0.3987.122 Safari/537.36"
        ),
    }
    
    # Request the dataset metadata from `url`. The response object `res` is
    # parsed further down, so a failed request must stop the script here.
    try:
        # A timeout keeps the script from hanging forever on a stalled connection.
        res = requests.post(url=url, headers=headers, timeout=60)
        print(res.status_code)
        # Turn HTTP 4xx/5xx into an exception instead of parsing an error page later.
        res.raise_for_status()
    except requests.RequestException as err:
        print('request fail...please check!')
        # Without a usable response there is nothing to parse; exit with a
        # non-zero status rather than failing later on an undefined `res`.
        sys.exit('metadata request failed: {}'.format(err))
    
    # Parse the JSON response and download the files of every listed dataset.
    json_list = json.loads(res.text)
    for i, item in enumerate(json_list):
        time.sleep(2)  # throttle so the server is not hit too quickly

        # Collect the metadata fields used for the directory name and the URLs.
        try:
            dataset_name = str(item['dataset_name']).replace(' ', '_')
            organism = str(item['organism']).replace(' ', '_')
            organ = str(item['organ']).replace(' ', '_')
            platform = str(item['platform'])
            cell_number = str(item['cell_number'])
            # Comma-separated file names; blanks are dropped so that no bare
            # directory URL is requested for an empty entry.
            visualization = [
                entry.strip()
                for entry in item['visualization'].split(',')
                if entry.strip()
            ]
        except (KeyError, TypeError, AttributeError):
            print('item {} has key error, please check!'.format(i))
            # Skip this entry; otherwise the values of the previous item (or
            # undefined names on the very first item) would be used below.
            continue

        # Create the per-dataset directory and switch into it so that wget
        # saves its output there. A '/' inside a metadata field would split
        # the name into nested directories, so it is replaced.
        dir_name = '-'.join(
            [dataset_name, organism, organ, platform, cell_number]
        ).replace('/', '_')
        path = os.path.join(wd, dir_name)
        os.makedirs(path, exist_ok=True)  # tolerate re-runs where it already exists
        os.chdir(path)
        print(os.getcwd())

        # Download the h5 file. The command is passed as an argument list
        # (no shell), so text coming from the server cannot be interpreted as
        # shell syntax. check=False keeps downloads best-effort: a failed
        # wget does not abort the remaining items.
        h5_url = 'https://cblast.gao-lab.org/{name}/{name}.h5'.format(name=dataset_name)
        subprocess.run(['wget', h5_url], check=False)

        # Download each visualization file listed for the dataset.
        for viz in visualization:
            svg_path = 'https://cblast.gao-lab.org/{name}/{svg_type}'.format(name=dataset_name, svg_type=viz)
            print(svg_path)
            subprocess.run(['wget', svg_path], check=False)
    

    相关文章

      网友评论

          本文标题:【request爬虫3】批量爬取Cell Blast

          本文链接:https://www.haomeiwen.com/subject/wsgeeltx.html