爬取NCBI中GEO中的数据

作者: 一只烟酒僧 | 来源:发表于2020-03-01 10:53 被阅读0次

    获取GEO中GSE网页的信息

    # Scrape the GEO series (GSE) page of every accession in GEO_id.
    # Expects these objects to exist already: GEO_id, table_dir,
    # download_sample_info, sample_info_dir, and the accumulators
    # GSE_information (data.frame) and sample_list (list).
    #
    # Text of every node matched by `xpath`; "NA" when nothing matches, so that
    # data.frame() below never receives a zero-length column.
    node_text <- function(doc, xpath) {
      values <- unlist(lapply(getNodeSet(doc, xpath), xmlValue))
      if (length(values) == 0) "NA" else values
    }
    # One column per matched <tr>: the row text split on newlines.
    row_cells <- function(doc, xpath) {
      sapply(getNodeSet(doc, xpath), function(x) {
        unlist(str_split(xmlValue(x), "\n", simplify = TRUE))
      })
    }
    # seq_along() keeps the loop from running on indices 1 and 0 for empty input.
    for (i in seq_along(GEO_id)) {
      url <- paste0("https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=", GEO_id[i])
      GEO <- getURL(url, .encoding = "utf-8")
      # Keep a raw copy of the page. writeLines() stores the HTML itself, whereas
      # write.table() wrapped it in quotes and added row/column labels.
      writeLines(GEO, paste0(table_dir, "GEO", GEO_id[i], ".html"))
      GEO_tree <- htmlParse(GEO, encoding = "utf-8")
      status <- node_text(GEO_tree, '//td[contains(text(),"Status")]/following-sibling::td')
      # Only the first "Title" match is used.
      title <- node_text(GEO_tree, '//tr/child::td[contains(text(),"Title")]/following-sibling::td')[1]
      # Sample rows: the part hidden behind the "more" toggle and the visible part.
      sample_hide <- row_cells(GEO_tree, '//td[contains(@onmouseout,"geo_empty_help")]/div/table//tr')
      sample_visual <- row_cells(GEO_tree, '//td[contains(@onmouseout,"geo_empty_help")]/table//tr')
      if (!is.matrix(sample_visual)) {
        stop("Could not parse the sample table of ", GEO_id[i], call. = FALSE)
      }
      # Drop the third text fragment of every row, then put one sample per row.
      # drop = FALSE keeps a matrix even when the series has a single sample.
      sample_visual <- t(sample_visual[-3, , drop = FALSE])
      # A logical mask is used because x[-grep(...), ] removes every row when
      # grep() finds no platform (GPL) row.
      sample_visual <- sample_visual[!grepl("GPL", sample_visual[, 1]), , drop = FALSE]
      if (length(sample_hide) > 0) {
        sample_hide <- t(sample_hide[-3, , drop = FALSE])
        gsm_table <- rbind(sample_visual, sample_hide)
      } else {
        gsm_table <- sample_visual
      }
      sample_list[[i]] <- gsm_table
      Organism <- node_text(GEO_tree, '//tr/td[contains(text(),"Organism")]/following-sibling::td')
      Experiment.type <- node_text(GEO_tree, '//tr/td[contains(text(),"Experiment type")]/following-sibling::td')
      summary <- node_text(GEO_tree, '//tr/td[contains(text(),"Summary")]/following-sibling::td')
      Overall.design <- node_text(GEO_tree, '//tr/td[contains(text(),"Overall design")]/following-sibling::td')
      Citation <- node_text(GEO_tree, '//td[contains(text(),"Citation")]/following-sibling::td/span')
      # Download the compressed sample-information (Series Matrix) files.
      # sample_info is initialised on every iteration so the data.frame below
      # never sees an undefined or stale value when nothing is downloaded.
      sample_info <- "NA"
      if (isTRUE(as.logical(download_sample_info)) && !is.null(sample_info_dir)) {
        matrix_links <- getNodeSet(GEO_tree, '//a[contains(text(),"Series Matrix File")]')
        hrefs <- unlist(lapply(matrix_links, xmlGetAttr, name = "href"))
        if (length(hrefs) == 0) {
          warning("No Series Matrix link found for ", GEO_id[i], call. = FALSE)
        } else {
          sample_info <- hrefs
          # The link points at a directory listing; file names are the
          # space-separated tokens that contain "GSE".
          sample_web <- getURL(sample_info)
          tokens <- as.character(t(str_split(sample_web, " ", simplify = TRUE)))
          sample_name <- tokens[grepl("GSE", tokens)]
          if (length(sample_name) == 0) {
            warning("No Series Matrix file listed for ", GEO_id[i], call. = FALSE)
          } else {
            sample_name <- str_split(sample_name, "\\r", simplify = TRUE)[, 1]
            sample_info_url <- paste0(sample_info, sample_name)
            # sample_info_dir is used as a plain prefix, so it has to end with a
            # path separator.
            sample_info_dir_new <- paste0(sample_info_dir, GEO_id[i])
            dir.create(sample_info_dir_new, showWarnings = FALSE, recursive = TRUE)
            for (j in seq_along(sample_info_url)) {
              download.file(url = sample_info_url[j],
                            destfile = paste(sample_info_dir_new, sample_name[j], sep = "/"),
                            mode = "wb")
            }
            cat(paste(paste0(GEO_id[i], "的样本信息文件名为", paste(sample_name, collapse = "、")),
                      paste0("保存在", sample_info_dir, "中"),
                      sep = "\n"))
          }
        }
      }
      GSE_information_sub <- data.frame(GEO_ID = GEO_id[i],
                                        Status = status,
                                        Title = title,
                                        Organism = Organism,
                                        Experiment.type = Experiment.type,
                                        Summary = summary,
                                        Overall.design = Overall.design,
                                        PMID = Citation,
                                        sample_NO = nrow(gsm_table),
                                        sample_info_url = sample_info,
                                        stringsAsFactors = FALSE)
      GSE_information <- rbind(GSE_information, GSE_information_sub)
      # Pause between requests to the NCBI server.
      Sys.sleep(5)
    }
    

    获取GEO中GSM的信息

    #-------------------------------------------------------
    #Function2:获得GSM信息
    #-------------------------------------------------------
    # Write the GSE overview sheet, then scrape every sample (GSM) page and
    # append one sheet per series to the same workbook.
    # Expects these objects to exist already: GSE_information, sample_list
    # (named by GSE accession), table_dir and download_GSM_info.
    write.xlsx(as.data.frame(GSE_information), paste0(table_dir, "GSM_information.xlsx"),
               sheetName = "GSE_information",
               col.names = TRUE)
    # A series with a single sample may have been stored as a plain vector;
    # turn it back into a one-row matrix so row indexing works below.
    sample_list <- lapply(sample_list, function(x) {
      if (is.null(x) || is.matrix(x)) x else matrix(x, nrow = 1)
    })
    # Number of samples per series and in total (used for the progress bar).
    sum_GSM <- vapply(sample_list, NROW, integer(1))
    sum_GSM_sum <- sum(sum_GSM)
    #-------------------------------------------------------
    #Function3: progress bar
    #-------------------------------------------------------
    # Formats a fraction in [0, 1] as "Running:----NN%". The fraction is
    # rounded down to a whole percent; 0.001 is added first so values a hair
    # below a whole percent are not pushed down. findInterval() always yields
    # exactly one value, also when the input lies exactly on the grid.
    running_time <- function(num) {
      grid <- seq(0, 100, 1) / 100
      persent <- grid[findInterval(num + 0.001, grid)] * 100
      jindu <- paste(rep("--", round(persent / 3)), collapse = "")
      paste0("Running:", jindu, persent, "%")
    }
    # Text of every node matched by `xpath`; "NA" when nothing matches.
    node_text <- function(doc, xpath) {
      values <- unlist(lapply(getNodeSet(doc, xpath), xmlValue))
      if (length(values) == 0) "NA" else values
    }
    # Column names and the matching row labels on the GSM page.
    info_name <- c("Title", "Organism", "Source.name", "Characteristics", "Treatment.protocol",
                   "Growth.protocol", "Extracted.molecule", "Extraction.protocol", "Library.strategy", "Library.source",
                   "Library.selection", "Instrument.model", "Description", "Data.processing")
    info_key <- c("Title", "Organism", "Source name", "Characteristics", "Treatment protocol",
                  "Growth protocol", "Extracted molecule", "Extraction protocol", "Library strategy", "Library source",
                  "Library selection", "Instrument model", "Description", "Data processing")
    if (isTRUE(as.logical(download_GSM_info))) {
      done <- 0
      for (m in seq_along(sample_list)) {
        n_gsm <- NROW(sample_list[[m]])
        # Rows are collected in a list and bound once instead of growing a
        # data.frame with rbind() on every iteration.
        gsm_rows <- vector("list", n_gsm)
        for (n in seq_len(n_gsm)) {
          GSMid <- sample_list[[m]][n, 1]
          url <- paste0("https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=", GSMid)
          GSM <- getURL(url, .encoding = "utf-8")
          GSM_url <- htmlParse(GSM)
          # Fields that are missing on the page are stored as "NA".
          info_list <- lapply(info_key, function(key) {
            node_text(GSM_url, paste0('//td[contains(text(),"', key, '")]/following-sibling::td'))
          })
          names(info_list) <- info_name
          # Only the href attribute is taken: xmlAttrs() returns every attribute
          # of the link, which changes the shape of the result from page to page.
          sra_all <- getNodeSet(GSM_url, '//td[contains(text(),"SRA")]/following-sibling::td/a')
          sra <- unlist(lapply(sra_all, xmlValue))
          sra_url <- unlist(lapply(sra_all, xmlGetAttr, name = "href"))
          if (length(sra) == 0) sra <- "NA"
          if (length(sra_url) == 0) sra_url <- "NA"
          # NOTE(review): Library.strategy is collected above but was never part
          # of the output table; the column set is left unchanged here.
          gsm_rows[[n]] <- data.frame(GSM_ID = GSMid,
                                      Title = info_list$Title,
                                      Organism = info_list$Organism,
                                      Source.name = info_list$Source.name,
                                      Characteristics = info_list$Characteristics,
                                      Treatment.protocol = info_list$Treatment.protocol,
                                      Growth.protocol = info_list$Growth.protocol,
                                      Extracted.molecule = info_list$Extracted.molecule,
                                      Extraction.protocol = info_list$Extraction.protocol,
                                      Library.source = info_list$Library.source,
                                      Description = info_list$Description,
                                      Library.selection = info_list$Library.selection,
                                      Instrument.model = info_list$Instrument.model,
                                      Data.processing = info_list$Data.processing,
                                      sra_id = sra,
                                      sra_url = sra_url,
                                      stringsAsFactors = FALSE)
          # Share of all samples processed so far, across every series.
          done <- done + 1
          print(running_time(done / sum_GSM_sum))
        }
        # A series without samples gets no sheet.
        if (n_gsm > 0) {
          sample_info <- do.call(rbind, gsm_rows)
          write.xlsx(sample_info, paste0(table_dir, "GSM_information.xlsx"),
                     sheetName = paste0(names(sample_list)[m], "_sampleinformation"),
                     append = TRUE, row.names = FALSE)
        }
      }
    }
    

    汇总写成函数

    library(RCurl)
    library(XML)
    library(stringr)
    library(rvest)
    library(xlsx)
    # Scrape NCBI GEO series (GSE) pages and, optionally, the pages of their
    # samples (GSM), and store the results as text/Excel tables.
    #
    # Args:
    #   GEO_id: character vector of GSE accessions; duplicates are removed.
    #   table_dir: directory in which the output folder "myGEO_search/" is
    #     created.
    #   download_sample_info: if TRUE, download the Series Matrix files.
    #   download_GSM_info: if TRUE, scrape every GSM page and append one sheet
    #     per series to "GSM_information.xlsx".
    #   sample_info_dir: prefix for the per-series download folders. It is
    #     pasted directly in front of the accession, so it has to end with a
    #     path separator. Required when download_sample_info is TRUE.
    #
    # Returns (invisibly) the data.frame with one or more rows per series.
    # Side effects: writes "GEO<id>.html", "GSE_information.txt" and
    # "GSM_information.xlsx" below table_dir, and downloads files below
    # sample_info_dir.
    GEO_get <- function(GEO_id = GEO_id, table_dir = ".", download_sample_info = TRUE,
                        download_GSM_info = TRUE, sample_info_dir = NULL) {
      download_sample_info <- isTRUE(as.logical(download_sample_info))
      download_GSM_info <- isTRUE(as.logical(download_GSM_info))
      # Scalar && is used because these are single conditions inside if().
      if (download_sample_info && is.null(sample_info_dir)) {
        stop("快交出保存样本信息文件的地址,不然不给你下载")
      }
      if (!download_sample_info && !is.null(sample_info_dir)) {
        stop("download_sample_info设为true啊小兄弟")
      }
      GEO_id <- GEO_id[!duplicated(GEO_id)]
      if (length(GEO_id) == 0) {
        stop("GEO_id is empty", call. = FALSE)
      }
      # Text of every node matched by `xpath`; "NA" when nothing matches, so
      # that data.frame() never receives a zero-length column.
      node_text <- function(doc, xpath) {
        values <- unlist(lapply(getNodeSet(doc, xpath), xmlValue))
        if (length(values) == 0) "NA" else values
      }
      # One column per matched <tr>: the row text split on newlines.
      row_cells <- function(doc, xpath) {
        sapply(getNodeSet(doc, xpath), function(x) {
          unlist(str_split(xmlValue(x), "\n", simplify = TRUE))
        })
      }
      # Formats a fraction in [0, 1] as "Running:----NN%". The fraction is
      # rounded down to a whole percent; 0.001 is added first so values a hair
      # below a whole percent are not pushed down. findInterval() always yields
      # exactly one value, also when the input lies exactly on the grid.
      running_time <- function(num) {
        grid <- seq(0, 100, 1) / 100
        persent <- grid[findInterval(num + 0.001, grid)] * 100
        jindu <- paste(rep("--", round(persent / 3)), collapse = "")
        paste0("Running:", jindu, persent, "%")
      }
      #-------------------------------------------------------
      #Function1: collect the GSE information
      #-------------------------------------------------------
      # Add the missing separator first: pasting "." and "myGEO_search/"
      # directly produced the hidden directory ".myGEO_search/".
      if (!grepl("[/\\\\]$", table_dir)) {
        table_dir <- paste0(table_dir, "/")
      }
      table_dir <- paste0(table_dir, "myGEO_search/")
      dir.create(table_dir, showWarnings = FALSE, recursive = TRUE)
      gse_rows <- vector("list", length(GEO_id))
      sample_list <- vector("list", length(GEO_id))
      for (i in seq_along(GEO_id)) {
        url <- paste0("https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=", GEO_id[i])
        GEO <- getURL(url, .encoding = "utf-8")
        # Keep a raw copy of the page. writeLines() stores the HTML itself,
        # whereas write.table() wrapped it in quotes and added labels.
        writeLines(GEO, paste0(table_dir, "GEO", GEO_id[i], ".html"))
        GEO_tree <- htmlParse(GEO, encoding = "utf-8")
        status <- node_text(GEO_tree, '//td[contains(text(),"Status")]/following-sibling::td')
        # Only the first "Title" match is used.
        title <- node_text(GEO_tree, '//tr/child::td[contains(text(),"Title")]/following-sibling::td')[1]
        # Sample rows: the part hidden behind the "more" toggle and the
        # visible part.
        sample_hide <- row_cells(GEO_tree, '//td[contains(@onmouseout,"geo_empty_help")]/div/table//tr')
        sample_visual <- row_cells(GEO_tree, '//td[contains(@onmouseout,"geo_empty_help")]/table//tr')
        if (!is.matrix(sample_visual)) {
          stop("Could not parse the sample table of ", GEO_id[i], call. = FALSE)
        }
        # Drop the third text fragment of every row, then put one sample per
        # row. drop = FALSE keeps a matrix even for a single sample.
        sample_visual <- t(sample_visual[-3, , drop = FALSE])
        # A logical mask is used because x[-grep(...), ] removes every row
        # when grep() finds no platform (GPL) row.
        sample_visual <- sample_visual[!grepl("GPL", sample_visual[, 1]), , drop = FALSE]
        if (length(sample_hide) > 0) {
          sample_hide <- t(sample_hide[-3, , drop = FALSE])
          gsm_table <- rbind(sample_visual, sample_hide)
        } else {
          gsm_table <- sample_visual
        }
        sample_list[[i]] <- gsm_table
        Organism <- node_text(GEO_tree, '//tr/td[contains(text(),"Organism")]/following-sibling::td')
        Experiment.type <- node_text(GEO_tree, '//tr/td[contains(text(),"Experiment type")]/following-sibling::td')
        summary <- node_text(GEO_tree, '//tr/td[contains(text(),"Summary")]/following-sibling::td')
        Overall.design <- node_text(GEO_tree, '//tr/td[contains(text(),"Overall design")]/following-sibling::td')
        Citation <- node_text(GEO_tree, '//td[contains(text(),"Citation")]/following-sibling::td/span')
        # Download the compressed sample-information (Series Matrix) files.
        # sample_info is initialised on every iteration so the data.frame
        # below never sees an undefined or stale value.
        sample_info <- "NA"
        if (download_sample_info && !is.null(sample_info_dir)) {
          matrix_links <- getNodeSet(GEO_tree, '//a[contains(text(),"Series Matrix File")]')
          hrefs <- unlist(lapply(matrix_links, xmlGetAttr, name = "href"))
          if (length(hrefs) == 0) {
            warning("No Series Matrix link found for ", GEO_id[i], call. = FALSE)
          } else {
            sample_info <- hrefs
            # The link points at a directory listing; file names are the
            # space-separated tokens that contain "GSE".
            sample_web <- getURL(sample_info)
            tokens <- as.character(t(str_split(sample_web, " ", simplify = TRUE)))
            sample_name <- tokens[grepl("GSE", tokens)]
            if (length(sample_name) == 0) {
              warning("No Series Matrix file listed for ", GEO_id[i], call. = FALSE)
            } else {
              sample_name <- str_split(sample_name, "\\r", simplify = TRUE)[, 1]
              sample_info_url <- paste0(sample_info, sample_name)
              sample_info_dir_new <- paste0(sample_info_dir, GEO_id[i])
              dir.create(sample_info_dir_new, showWarnings = FALSE, recursive = TRUE)
              for (j in seq_along(sample_info_url)) {
                download.file(url = sample_info_url[j],
                              destfile = paste(sample_info_dir_new, sample_name[j], sep = "/"),
                              mode = "wb")
              }
              cat(paste(paste0(GEO_id[i], "的样本信息文件名为", paste(sample_name, collapse = "、")),
                        paste0("保存在", sample_info_dir, "中"),
                        sep = "\n"))
            }
          }
        }
        gse_rows[[i]] <- data.frame(GEO_ID = GEO_id[i],
                                    Status = status,
                                    Title = title,
                                    Organism = Organism,
                                    Experiment.type = Experiment.type,
                                    Summary = summary,
                                    Overall.design = Overall.design,
                                    PMID = Citation,
                                    sample_NO = nrow(gsm_table),
                                    sample_info_url = sample_info,
                                    stringsAsFactors = FALSE)
        # Pause between requests to the NCBI server.
        Sys.sleep(5)
      }
      # Bind once instead of growing a data.frame inside the loop.
      GSE_information <- do.call(rbind, gse_rows)
      write.table(GSE_information, paste0(table_dir, "GSE_information.txt"))
      names(sample_list) <- GEO_id
      #-------------------------------------------------------
      #Function2: collect the GSM information
      #-------------------------------------------------------
      write.xlsx(as.data.frame(GSE_information), paste0(table_dir, "GSM_information.xlsx"),
                 sheetName = "GSE_information",
                 col.names = TRUE)
      # Number of samples per series and in total (used for the progress bar).
      sum_GSM <- vapply(sample_list, nrow, integer(1))
      sum_GSM_sum <- sum(sum_GSM)
      # Column names and the matching row labels on the GSM page.
      info_name <- c("Title", "Organism", "Source.name", "Characteristics", "Treatment.protocol",
                     "Growth.protocol", "Extracted.molecule", "Extraction.protocol", "Library.strategy", "Library.source",
                     "Library.selection", "Instrument.model", "Description", "Data.processing")
      info_key <- c("Title", "Organism", "Source name", "Characteristics", "Treatment protocol",
                    "Growth protocol", "Extracted molecule", "Extraction protocol", "Library strategy", "Library source",
                    "Library selection", "Instrument model", "Description", "Data processing")
      if (download_GSM_info) {
        done <- 0
        for (m in seq_along(sample_list)) {
          n_gsm <- nrow(sample_list[[m]])
          gsm_rows <- vector("list", n_gsm)
          for (n in seq_len(n_gsm)) {
            GSMid <- sample_list[[m]][n, 1]
            url <- paste0("https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=", GSMid)
            GSM <- getURL(url, .encoding = "utf-8")
            GSM_url <- htmlParse(GSM)
            # Fields that are missing on the page are stored as "NA".
            info_list <- lapply(info_key, function(key) {
              node_text(GSM_url, paste0('//td[contains(text(),"', key, '")]/following-sibling::td'))
            })
            names(info_list) <- info_name
            # Only the href attribute is taken: xmlAttrs() returns every
            # attribute of the link, which changes the shape of the result
            # from page to page.
            sra_all <- getNodeSet(GSM_url, '//td[contains(text(),"SRA")]/following-sibling::td/a')
            sra <- unlist(lapply(sra_all, xmlValue))
            sra_url <- unlist(lapply(sra_all, xmlGetAttr, name = "href"))
            if (length(sra) == 0) sra <- "NA"
            if (length(sra_url) == 0) sra_url <- "NA"
            # NOTE(review): Library.strategy is collected above but was never
            # part of the output table; the column set is left unchanged.
            gsm_rows[[n]] <- data.frame(GSM_ID = GSMid,
                                        Title = info_list$Title,
                                        Organism = info_list$Organism,
                                        Source.name = info_list$Source.name,
                                        Characteristics = info_list$Characteristics,
                                        Treatment.protocol = info_list$Treatment.protocol,
                                        Growth.protocol = info_list$Growth.protocol,
                                        Extracted.molecule = info_list$Extracted.molecule,
                                        Extraction.protocol = info_list$Extraction.protocol,
                                        Library.source = info_list$Library.source,
                                        Description = info_list$Description,
                                        Library.selection = info_list$Library.selection,
                                        Instrument.model = info_list$Instrument.model,
                                        Data.processing = info_list$Data.processing,
                                        sra_id = sra,
                                        sra_url = sra_url,
                                        stringsAsFactors = FALSE)
            #-------------------------------------------------------
            #Function3: show the progress bar
            #-------------------------------------------------------
            # Share of all samples processed so far, across every series.
            done <- done + 1
            print(running_time(done / sum_GSM_sum))
          }
          # A series without samples gets no sheet.
          if (n_gsm > 0) {
            sample_info <- do.call(rbind, gsm_rows)
            write.xlsx(sample_info, paste0(table_dir, "GSM_information.xlsx"),
                       sheetName = paste0(names(sample_list)[m], "_sampleinformation"),
                       append = TRUE, row.names = FALSE)
          }
        }
      }
      invisible(GSE_information)
    }
    

    获取附加材料文件

    ######################################################## 
    #-------------------------------------------------------
    # Topic:爬取并下载NCBI中GEO的附件文件
    # Author:Wang Haiquan
    # Date:Sun Mar  1 17:25:03 2020
    # Mail:mg1835020@smail.nju.edu.cn
    #-------------------------------------------------------
    ########################################################
    
    
    library(RCurl)
    library(rvest)
    library(XML)
    
    # List (and optionally download) the supplementary files of one GEO series.
    #
    # Args:
    #   GSEid: a single GSE accession.
    #   download_file: if TRUE, every listed file is downloaded into the
    #     current working directory under the name shown on the page.
    #
    # Returns a data.frame with the columns
    #   item     - text of the first cell of each supplementary-file row,
    #   item_url - download URL (relative links get the NCBI host prepended),
    #   idex     - the prefix that was prepended ("" for links left as they are).
    # The data.frame has zero rows when the page lists no supplementary file.
    get_supply <- function(GSEid = GSEid, download_file = FALSE) {
      item_xpath <- '//strong[contains(text(),"Supplementary file")]/parent::td/parent::tr/following-sibling::tr/td[1]'
      link_xpath <- '//strong[contains(text(),"Supplementary file")]/parent::td/parent::tr/following-sibling::tr/td[3]/a[1]'
      url <- paste0("https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=", GSEid)
      GSE_web <- htmlParse(getURL(url, .encoding = "utf-8"))
      # Only the href attribute is read; xmlAttrs() would return every attribute
      # of the link and change the shape of the result. Missing values become NA
      # so that items and links stay aligned by position.
      item_url <- vapply(getNodeSet(GSE_web, link_xpath), xmlGetAttr, character(1),
                         name = "href", default = NA_character_)
      item <- vapply(getNodeSet(GSE_web, item_xpath), function(node) {
        value <- xmlValue(node)
        if (length(value) == 0) NA_character_ else value
      }, character(1))
      if (length(item_url) == 0) {
        return(data.frame(item = character(0), item_url = character(0),
                          idex = character(0), stringsAsFactors = FALSE))
      }
      # The rows after the last link belong to other parts of the table.
      item <- item[seq_along(item_url)]
      # Links that are already absolute (ftp... or http(s)://) are used as they
      # are; everything else is relative to the NCBI host.
      is_absolute <- grepl("^ftp", item_url) | grepl("^https?://", item_url)
      idex <- ifelse(is_absolute, "", "https://www.ncbi.nlm.nih.gov/")
      # Plain character vectors are used so that the result does not depend on
      # the stringsAsFactors default of the running R version.
      supply <- data.frame(item = item,
                           item_url = paste0(idex, item_url),
                           idex = idex,
                           stringsAsFactors = FALSE)
      if (isTRUE(as.logical(download_file))) {
        # Note: files are downloaded into the working directory!
        for (i in seq_len(nrow(supply))) {
          if (is.na(supply$item[i]) || is.na(item_url[i])) {
            warning("Skipping supplementary entry ", i, " of ", GSEid,
                    ": missing name or link", call. = FALSE)
            next
          }
          # basename() keeps a name taken from the web page from pointing
          # outside the working directory.
          download.file(supply$item_url[i], basename(supply$item[i]), mode = "wb")
        }
      }
      supply
    }
    `````

    相关文章

      网友评论

        本文标题:爬取NCBI中GEO中的数据

        本文链接:https://www.haomeiwen.com/subject/zcjxkhtx.html