美文网首页ggplot集锦
R语言将一个fasta文件拆分成多个

R语言将一个fasta文件拆分成多个

作者: 气旋_c8b6 | 来源:发表于2020-06-18 18:04 被阅读0次

    有一个蛋白质序列文件:

    >tr|A0A023T6R1|A0A023T6R1_HUMAN Mago nashi protein OS=Homo sapiens OX=9606 GN=FLJ10292 PE=2 SV=1
    MAVASDFYLRYYVGHKGKFGHEFLEFEFRPDGKLRYANNSNYKNDVMIRKEAYVHKSVME
    ELKRIIDDSEITKEDDALWPPPDRVGRQELEIVIGDEHISFTTSKIGSLIDVNQSKDPEG
    LRVFYYLVQDLKCLVFSLIGLHFKIKPI
    >tr|A0A024QYU9|A0A024QYU9_HUMAN Eukaryotic translation initiation factor 3 subunit C OS=Homo sapiens OX=9606 GN=EIF3S8 PE=3 SV=1
    MSRFFTTGSDSESESSLSGEELVTKPVGGNYGKQPLLLSEDEEDTKRVVRSAKDKRFEEL
    TNLIRTIRNAMKIRDVTKCLEEFELLGKAYGKAKSIVDKEGVPRFYIRILADLEDYLNEL
    WEDKEGKKKMNKNNAKALSTLRQKIRKYNRDFESHITSYKQNPEQSADEDAEKNEEDSEG
    SSDEDEDEDGVSAATFLKKKSEAPSGESRKFLKKMDDEDEDSEDSEDDEDWDTGSTSSDS
    DSEEEEGKQTALASRFLKKAPTTDEDKKAAEKKREDKAKKKHDRKSKRLDEEEEDNEGGE
    WERVRGGVPLVKEKPKMFAKGTEITHAVVIKKLNEILQARGKKGTDRAAQIELLQLLVQI
    AAENNLGEGVIVKIKFNIIASLYDYNPNLATYMKPEMWGKCLDCINELMDILFANPNIFV
    GENILEESENLHNADQPLRVRGCILTLVERMDEEFTKIMQNTDPHSQEYVEHLKDEAQVC
    AIIERVQRYLEEKGTTEEVCRIYLLRILHTYYKFDYKAHQRQLTPPEGSSKSEQDQAENE
    GEDSAVLMERLCKYIYAKDRTDRIRTCAILCHIYHHALHSRWYQARDLMLMSHLQDNIQH
    ADPPVQILYNRTMVQLGICAFRQGLTKDAHNALLDIQSSGRAKELLGQGLLLRSLQERNQ
    EQEKVERRRQVPFHLHINLELLECVYLVSAMLLEIPYMAAHESDARRRMISKQFHHQLRV
    GERQPLLGPPESMREHVVAASKAMKMGDWKTCHSFIINEKMNGKVWDLFPEADKVRTMLV
    RKIQEESLRTYLFTYSSVYDSISMETLSDMFELDLPTVHSIISKMIINEELMASLDQPTQ
    TVVMHRTEPTAQQNLALQLAEKLGSLVENNERVFDHKQGTYGGYFRDQKDGYRKNEGYMR
    RGGYRQQQSQTAY
    >tr|A0A024QYX0|A0A024QYX0_HUMAN Emopamil binding protein OS=Homo sapiens OX=9606 GN=EBP PE=2 SV=1
    MTTNAGPLHPYWPQHLRLDNFVPNDRPTWHILAGLFSVTGVLVVTTWLLSGRAAVVPLGT
    WRRLSLCWFAVCGFIHLVIEGWFVLYYEDLLGDQAFLSQLWKEYAKGDSRYILGDNFTVC
    METITACLWGPLSLWVVIAFLRQHPLRFILQLVVSVGQIYGDVLYFLTEHRDGFQHGELG
    HPLYFWFYFVFMNALWLVLPGVLVLDAVKHLTHAQSTLDAKATKAKSKKN
    >tr|A0A024QYX3|A0A024QYX3_HUMAN RNA binding motif (RNP1, RRM) protein 3, isoform CRA_c OS=Homo sapiens OX=9606 GN=RBM3 PE=4 SV=1
    MSSEEGKLFVGGLNFNTDEQALEDHFSSFGPISEVVVVKDRETQRSRGFGFITFTNPEHA
    SVAMRAMNGESLDGRQIRVDHAGKSARGTRGGGFGAHGRGRSYSRGGGDQGYGSGRYYDS
    RPGGYGYGYGRSRDYNGRNQGGYDRYSGGNYRDNYDN
    .................
    

    需要按蛋白将它们拆分成多个fasta文件,每条蛋白序列保存为一个单独的文件。

    library(tidyverse)
    # Split a multi-record FASTA file into one file per record.
    # "蛋白文件" is a placeholder for the input path; replace it with the
    # real file before running.
    text <- read_lines("蛋白文件")

    # Line numbers of the header lines (those starting with ">"); each header
    # opens a new record. Kept as a one-column tibble named `po` so any later
    # code that refers to `po$lines` keeps working.
    po <- tibble(lines = which(str_detect(text, "^>")))
    if (nrow(po) == 0) {
      stop("no FASTA header line (starting with '>') found in input",
           call. = FALSE)
    }

    # A record ends on the line before the next header; the last record ends
    # at the end of the file. Computing the ends up front removes the need for
    # a special-cased final write and makes a single-record file work.
    ends <- c(po$lines[-1] - 1L, length(text))

    # Accession = second "|"-delimited field of each header line,
    # e.g. ">tr|A0A023T6R1|..." gives "A0A023T6R1".
    # NOTE(review): headers without at least two "|"-separated fields will
    # make the `[, 2]` subscript fail, as in the original script.
    AC <- str_split(text[po$lines], pattern = "\\|", simplify = TRUE)[, 2]

    # "输出目录" is a placeholder prefix for the output location. str_c() joins
    # its pieces with no separator, so the real value should end with "/".
    # The target file is passed positionally because the argument was renamed
    # from `path` to `file` in newer readr releases.
    for (i in seq_len(nrow(po))) {
      write_lines(text[po$lines[i]:ends[i]],
                  str_c("输出目录", AC[i], ".fasta"))
    }
    
    

    使用时,将代码中的输入文件名(“蛋白文件”)和输出目录替换为实际路径即可(输出目录需以“/”结尾)。

    相关文章

      网友评论

        本文标题:R语言将一个fasta文件拆分成多个

        本文链接:https://www.haomeiwen.com/subject/xmtpxktx.html