library(stats)
library(msgps)
library(glmnet)
set.seed(6300)
#################
setwd("/Users/mumu/Documents/Documents/R_DataAnalyzed/lm_mgps/")
dat<-read.table(file="Data2.txt", head=F, fill=F, sep="\t")
rownames(dat)<-paste("TF", seq(1:100), "")
colnames(dat)<-paste("Sample", seq(1:373), "")
tf.names<-paste("TF", seq(1,100,20), "")
train_id <- sample(1:ncol(dat), ncol(dat)*0.66)
test_id<-(-train_id)
train_dat<-t(dat[,train_id])
test_dat<-t(dat[,test_id])
lm.out<-LM(train_dat)
msgps.out<-MSGPS(train_dat)
glmnet.out<-GLMNET(train_dat)
lm.out[1:10,1:10]
msgps.out[1:10,1:10]
glmnet.out[1:10,1:10]
########### test the performance by std
rsq.lm<-RSQ(test_dat, lm.out)
rsq.msgps<-RSQ(test_dat,msgps.out)
rsq.glmnet<-RSQ(test_dat,glmnet.out)
summary(rsq.lm)
summary(rsq.msgps)
summary(rsq.glmnet)
#####################
lm.out2<-LM2(train_dat,tf.names)
msgps.out2<-MSGPS2(train_dat,tf.names)
glmnet.out2<-GLMNET2(train_dat,tf.names)
lm.out2[1:10,1:5]
msgps.out2[1:10,1:5]
glmnet.out2[1:10,1:5]
rsq.lm2<-RSQ2(test_dat,tf.names, lm.out2)
rsq.msgps2<-RSQ2(test_dat,tf.names, msgps.out2)
rsq.glmnet2<-RSQ2(test_dat,tf.names, glmnet.out2)
summary(rsq.lm2)
summary(rsq.msgps2)
summary(rsq.glmnet2)
input data format : a matrix with row names<-sample, colnames<-genes
LM<-function(dat){
size<-dim(dat)[2]
reg.coef<-matrix(0,size,size+1)
for (i in 1:size){
lm_out<-lm(dat[,i]~dat[,-i])
coef<-lm_out$coefficients
# summary(lm_out)
for (j in 1:(size+1)){
if (j<i){
reg.coef[i,j]=coef[j]
}else if (j==i){
reg.coef[i,j]=0
}else{
reg.coef[i,j]=coef[j-1]
}
}
}
return(reg.coef)
}
LM2<-function(dat, tf.names){
size<-dim(dat)[2]
len.coef<-length(tf.names)+1
reg.coef<-matrix(0,ncol(dat), length(tf.names)+1)
tf.pos<-match(tf.names, colnames(dat))
tf.dat<-dat[,tf.pos]
for (i in 1:size){
print (i)
i.pos<-match(row.names(dat)[i],tf.names)
if (is.na(i.pos)==TRUE){
lm_out<-lm(dat[,i]~tf.dat)
coef<-lm_out$coefficients
# summary(lm_out)
for (j in 1:len.coef){
reg.coef[i,j]=coef[j]}
}else{
lm_out<-lm(dat[,i]~tf.dat[,-i.pos])
coef<-lm_out$coefficients
if (j<i.pos){
reg.coef[i,j]=coef[j]
}else if (j==i.pos){
reg.coef[i,j]<-0
}else{
reg.coef[i,j]=coef[j-1]
}
}
}
return(reg.coef)
}
MSGPS<-function(dat){
size<-dim(dat)[2]
reg.coef<-matrix(0,size,size+1)
for (i in 1:size){
print (i)
msgps_out<-msgps(dat[,-i], dat[,i])
coef<-coef(msgps_out)[,2]
for (j in 1:(size+1)){
if (j<i){
reg.coef[i,j]=coef[j]
}else if (j==i){
reg.coef[i,j]=0
}else{
reg.coef[i,j]=coef[j-1]
}
}
}
return (reg.coef)
}
MSGPS2<-function(dat, tf.names){
size<-dim(dat)[2]
len.coef<-length(tf.names)+1
reg.coef<-matrix(0,ncol(dat), length(tf.names)+1)
tf.pos<-match(tf.names, colnames(dat))
tf.dat<-dat[,tf.pos]
for (i in 1:size){
print (i)
i.pos<-match(row.names(dat)[i],tf.names)
if (is.na(i.pos)==TRUE){
msgps_out<-msgps(tf.dat, dat[,i])
coef<-coef(msgps_out)[,2]
# summary(lm_out)
for (j in 1:len.coef){
reg.coef[i,j]=coef[j]}
}else{
msgps_out<-msgps(tf.dat[,-i.pos], dat[,i])
coef<-coef(msgps_out)[,2]
if (j<i.pos){
reg.coef[i,j]=coef[j]
}else if (j==i.pos){
reg.coef[i,j]<-0
}else{
reg.coef[i,j]=coef[j-1]
}
}
}
return(reg.coef)
}
glmnet
GLMNET<-function(dat){
size<-dim(dat)[2]
reg.coef<-matrix(0,size,size+1)
for (i in 1:size){
print (i)
cv_output <- cv.glmnet(dat[,-i], dat[,i],
alpha = 1)
identifying best lamda
best_lam <- cv_output$lambda.min
best_lam
lasso_best <- glmnet(dat[,-i], dat[,i], alpha = 1, lambda = best_lam)
coef<-coef(lasso_best)
for (j in 1:(size+1)){
if (j<i){
reg.coef[i,j]=coef[j]
}else if (j==i){
reg.coef[i,j]<-0
}else{
reg.coef[i,j]<-coef[j-1]
}
}
}
return(reg.coef)
}
GLMNET2<-function(dat, tf.names){
size<-dim(dat)[2]
len.coef<-length(tf.names)+1
reg.coef<-matrix(0,ncol(dat), length(tf.names)+1)
tf.pos<-match(tf.names, colnames(dat))
tf.dat<-dat[,tf.pos]
for (i in 1:size){
print (i)
i.pos<-match(row.names(dat)[i],tf.names)
if (is.na(i.pos)==TRUE){
cv_output <- cv.glmnet(tf.dat, dat[,i],
alpha = 1)
best_lam <- cv_output$lambda.min
best_lam
lasso_best <- glmnet(tf.dat, dat[,i], alpha = 1, lambda = best_lam)
coef<-coef(lasso_best)
# summary(lm_out)
for (j in 1:len.coef){
reg.coef[i,j]=coef[j]}
}else{
# identifying best lamda
cv_output <- cv.glmnet(tf.dat[,-i.pos], dat[,i],
alpha = 1)
best_lam <- cv_output$lambda.min
best_lam
lasso_best <- glmnet(tf.dat[,-i.pos], dat[,i], alpha = 1, lambda = best_lam)
coef<-coef(lasso_best)
if (j<i.pos){
reg.coef[i,j]=coef[j]
}else if (j==i.pos){
reg.coef[i,j]<-0
}else{
reg.coef[i,j]=coef[j-1]
}
}
}
return(reg.coef)
}
######### get the rsq of test data set
rss <- sum((preds - actual) ^ 2)
tss <- sum((actual - mean(actual)) ^ 2)
rsq <- 1 - rss/tss
rsq
RSQ<-function(dat,reg.coef){
size<-dim(dat)[2]
dif<-matrix(0,dim(dat)[1],dim(dat)[2])
rsq<-rep(0,size)
rss<-rep(0,size)
tss<-rep(0,size)
predict.x<-dif
reg.coef.x<-reg.coef[,-1]
for (i in 1:dim(dat)[1]){
print (i)
for (j in 1:dim(dat)[2]){
predict.x[i,j]<-reg.coef[j,1]+sum(reg.coef.x[j,]*dat[i,])
dif[i,j]<-dat[i,j]-predict.x[i,j]
}
}
predict.x[predict.x<0]<-0
for (j in 1:size){
print (j)
rss[j]<-sum(dif[,j]^2)
tss[j]<-sum((dat[,j]-mean(predict.x[,j]))^2)
rsq[j]<-1-rss[j]/tss[j]
}
return (rsq)
}
RSQ2<-function(dat,tf.names, reg.coef){
size<-dim(dat)[2]
tf.pos<-match(tf.names, colnames(dat))
tf.dat<-dat[,tf.pos]
dif<-matrix(0,dim(dat)[1],dim(dat)[2])
rsq<-rep(0,size)
rss<-rep(0,size)
tss<-rep(0,size)
predict.x<-dif
reg.coef.x<-reg.coef[,-1]
for (i in 1:dim(dat)[1]){
print (i)
for (j in 1:dim(dat)[2]){
predict.x[i,j]<-reg.coef[j,1]+sum(reg.coef.x[j,]*tf.dat[i,])
dif[i,j]<-dat[i,j]-predict.x[i,j]
}
}
predict.x[predict.x<0]<-0
for (j in 1:size){
print (j)
rss[j]<-sum(dif[,j]^2)
tss[j]<-sum((dat[,j]-mean(predict.x[,j]))^2)
rsq[j]<-1-rss[j]/tss[j]
}
return (rsq)
}
########### v2.0 input [gene_expression_matrix, names of TFs]
########### input [gene_expression_matrix, names of TFs, reg network matrix]
网友评论