生信练习题
参考:
https://mp.weixin.qq.com/s/loKVJJPxGyRWYOF3pYDOdw
在公众号上看到了大神用c实现的一个操作,我尝试了用R,数据结构是list,同时细化了一下其他操作,比如排序,指定输出等;我的思路是先提取id,把list按id分割,然后计算每个id对应chr的长度,再提取list里的元素
作为新手,我对写代码不是很熟,高手勿喷
rm(list=ls())
#输入文件
con0 <- file("test.faa","r")
all_test<- readLines(con0,n=-1)
#length(all_test)
#提取所有id:
id = list()
sum = 0
for (i in 1:length(all_test)) {
if(strsplit(all_test[i],split = "")[[1]][1] == ">"){
sum = sum + 1
id[[sum]] <- all_test[i]
}
}
id <- unlist(id)
close(con0)
if(file.exists("id.txt") == F){
write(id,file = "id.txt")
}
#匹配id和在text里的位置
pos<- match(id, all_test)
# length(pos)
#1295
con0 <- file("test.faa","r")
#如果是按id号分割:可先测试一下1:5,并write
tex = list()
for(i in 1:(length(pos)-1)){
tex[[i]] = readLines(con0,n=(pos[i+1]-pos[i]))
if (i == (length(pos)-1)){
tex[[i+1]] <- readLines(con0,n=-1)
}#不要忘记添加最后一行
#write(tex,file = paste("tes",i,".txt"))
}
close(con0)
#目的是将读取的字符串变成list
# tex[[1295]][1]
#怎么对tex排序?
tail(id)
id2<- sort(id)#排序
tex2 <- list()#创建一个新的list用来存放排序后的tex
for(i in 1:length(id2)){
for(j in 1:length(tex)){
if(tex[[j]][1] == id2[i]){
tex2[[i]] <- tex[[j]]
}
}
}
# tex2[[1]]
# id2
if(file.exists("sorted.txt") == F){
write(unlist(tex2),file = "sorted.txt")
}
#测试
subset(tex2,tex2[1:unlist(t)])
#下面的方法可以计算出有多少个chr1的片段
t<- lapply(lapply(tex2, function(x){grep(strsplit(x,split = ":")[[1]][1],
pattern = paste(">chr",1,sep = ""))}),
function(x){ifelse(x == 0, FALSE, TRUE)})
#这样就可以把每个单独的染色体序列写入一个list了
chr1 <- list()
for(i in 1:length(unlist(t))){
chr1[[i]] <- tex2[[i]]
}
#接下来的问题是,把上面的步骤包装成函数,一步写出文件
#获取每条染色体的行数(一共7条,chr1-chr7),然后放到函数里面,输出全部染色体
pos2 <- list()
#step1:获取每个chr的len
for(eachchr in seq(1,7)){
t <- unlist(lapply(lapply(tex2, function(x){grep(strsplit(x,split = ":")[[1]][1],
pattern = paste(">chr",eachchr,sep = ""))}),
function(x){ifelse(x == 0, FALSE, TRUE)}))
pos2[eachchr] <- length(t)#储存每个片段的长度
}
#step2:利用len取出每条染色体
CountChr2 <- function(tex = tex2,chr=chr){
for(eachchr in chr){
t <- unlist(lapply(lapply(tex, function(x){grep(strsplit(x,split = ":")[[1]][1],
pattern = paste(">chr",eachchr,sep = ""))}),
function(x){ifelse(x == 0, FALSE, TRUE)}))
pos2[eachchr] <- length(t)#储存每个片段的长度
}
fil <- list()
for(eachchr in chr){
if(eachchr == 1){#分支
fil[[1]] <- unlist(tex[1:pos2[[eachchr]]])
write(fil[[eachchr]],file = paste("chr",eachchr,".txt",sep = ""))
}
else{
fil[[eachchr]] <- unlist(tex[(pos2[[eachchr-1]]+1):((pos2[[eachchr-1]]+1) + pos2[[eachchr]]-1)])
pos2[[eachchr]]<- pos2[[eachchr-1]]+1 + pos2[[eachchr]]-1#比如从chr1-chr2,为了计算每次要从tex中提取多少行对应的chr,比如485-690,迭代时需要累加上已经提取的行数
write(fil[[eachchr]],file = paste("chr",eachchr,".txt",sep = ""))#这里可以设置一个条件判断,只输出指定的chr
}
}
}
#测试
CountChr2(tex = tex2,chr = seq(1,7))
#指定染色体输出------------------------------------------------------------------------
##参数说明:tex为排序好的chr_list,chr为完整的染色体数(写成向量),
#cfile为指定分割出的染色体,比如c(3,4)
CountChr3 <- function(tex = tex2,chr=chr,cfile = c()){
for(eachchr in chr){
t <- unlist(lapply(lapply(tex, function(x){grep(strsplit(x,split = ":")[[1]][1],
pattern = paste(">chr",eachchr,sep = ""))}),
function(x){ifelse(x == 0, FALSE, TRUE)}))
pos2[eachchr] <- length(t)#储存每个片段的长度
}
fil <- list()
for(eachchr in chr){
if(eachchr == 1){
fil[[1]] <- unlist(tex[1:pos2[[eachchr]]])
if(eachchr %in% cfile){
write(fil[[eachchr]],file = paste("chr",eachchr,".txt",sep = ""))
}
}
else{
fil[[eachchr]] <- unlist(tex[(pos2[[eachchr-1]]+1):((pos2[[eachchr-1]]+1) + pos2[[eachchr]]-1)])
pos2[[eachchr]]<- pos2[[eachchr-1]]+1 + pos2[[eachchr]]-1
if(eachchr %in% cfile){
write(fil[[eachchr]],file = paste("chr",eachchr,".txt",sep = ""))
}
#这里可以设置一个条件判断,只输出指定的chr
}
}
}
#测试
CountChr2(tex = tex2,chr = seq(1,7),cfile = c(3,5))
网友评论