- 删除指定的行
# Row numbers of every row that holds "unknown" in at least one column.
# which() applied directly to a data-frame comparison returns positions in the
# flattened logical matrix rather than row numbers, so the matches are first
# counted per row. NA cells are ignored via na.rm = TRUE.
w1 <- which(rowSums(mydata == "unknown", na.rm = TRUE) > 0)
# Drop those rows. Selecting the rows to keep (instead of using -w1) stays
# correct when w1 is empty: mydata[-integer(0), ] would return zero rows.
mydata[setdiff(seq_len(nrow(mydata)), w1), ]
# If w2 is a logical vector marking the rows to remove, negate it instead:
mydata[!w2, ]
- 读入excel表的不同sheets
library(tidyverse)
library(readxl)
library(writexl)
# Workbook whose sheets are all read and stacked.
path <- "LongData.xlsx"
# Read every sheet by name and row-bind the results into one data frame.
df <- excel_sheets(path) %>%
  map_dfr(function(sheet_name) read_xlsx(path, sheet = sheet_name))
- Try to use write.xlsx from the openxlsx package, which does not depend on Java.
# Write the data frame `dataframe` to the file "dataframe.xlsx" with openxlsx.
openxlsx::write.xlsx(x = dataframe, file = "dataframe.xlsx")
# Original note: when opened again, the variable name is still "dataframe".
# NOTE(review): an .xlsx file does not store the R object name — confirm what
# this note refers to (possibly the file name).
- 批量转换数列数据类型
# Method 1: convert only the character columns of mydat to numeric.
library(taRifx)
# is.character() is tested per column with vapply(), so the column mask is
# always a plain logical vector (sapply(mydat, class) == "character" can
# misbehave when a column carries more than one class).
# japply() then applies as.numeric to the selected columns; values that are
# not valid numbers become NA with a warning.
dat <- japply(
  mydat,
  which(vapply(mydat, is.character, logical(1))),
  as.numeric
)
# Method 2: collect the names of the first 876 columns of ap_test, then
# overwrite those columns with their numeric versions in a single assignment.
instanceconvert_test <- names(ap_test[1:876])
ap_test[instanceconvert_test] <- lapply(
  ap_test[instanceconvert_test],
  as.numeric
)
# Method 3: same conversion for ap_train, one column at a time in a loop.
instanceconvert <- names(ap_train[1:876])
for (col_name in instanceconvert) {
  # Replace the column with its numeric version.
  ap_train[[col_name]] <- as.numeric(ap_train[[col_name]])
}
- 提取重复的行
library(dplyr)
# Show every row whose (id1, id2, id3) combination occurs more than once.
dat %>%
  # Group on the key columns so that n() counts the rows per key combination.
  group_by(id1, id2, id3) %>%
  # mutate() (not summarize()) keeps all original rows and stores the group
  # size in a new column, index.
  mutate(index = n()) %>%
  # Drop the grouping; index is already stored as a regular column.
  ungroup() %>%
  # Keep only rows whose key combination appears more than once.
  # Optionally append select(1:3) to keep just the first three columns;
  # leaving it out keeps index visible so the repeat count can be inspected.
  filter(index > 1)
# Reference: http://guangzheng.name/2017/10/07/%E5%A6%82%E4%BD%95%E6%9F%A5%E6%89%BE%E6%95%B0%E6%8D%AE%E6%A1%86%E4%B8%AD%E9%87%8D%E5%A4%8D%E7%9A%84%E6%95%B0%E6%8D%AE/
- 修改某个值
# Overwrite the cell in row 2, column 3 of df1 with 160, then print df1.
df1[2, 3] <- 160
df1
# Build df2 from the vectors name2, birth and accept, keeping strings as
# character vectors. FALSE is spelled out because F is an ordinary variable
# that can be reassigned, whereas FALSE is a reserved word.
df2 <- data.frame(name2, birth, accept, stringsAsFactors = FALSE)
# Replace every "ok" in the accept column with "yes".
df2$accept[df2$accept == "ok"] <- "yes"
df2 # df2 now holds the updated values
# Replace text inside a column: every ">400" in AFP becomes ">=400".
# Usage: gsub(pattern, replacement, x)
mydata$AFP <- gsub(pattern = ">400", replacement = ">=400", x = mydata$AFP)
- 对某列进行运算
r, x, y 为列名
# Add a column named "Var", computed row-wise from the columns r, x and y.
data <- dplyr::mutate(data, Var = (r + 1) * x / y)
- 长数据改为宽数据(某些observation有缺失) 。注意:转换的前提是ID和Sex必须一致,如果不一致,函数会认为是两个observation.
library(reshape2)
# Long -> wide: one row per ID + Sex combination, one column per level of
# Test, filled with the entries of Value.
md <- dcast(LongData, ID + Sex ~ Test, value.var = "Value")
Picture1.png
宽数据改为长数据
# Wide -> long: stack ALT1..ALT3 into a single Values column, with the source
# column recorded in Timepoints and ID repeated for each stacked row.
df1_long1 <- melt(
  data = df1,
  id.vars = c("ID"), # identifier column(s) kept as-is, not stacked
  measure.vars = c("ALT1", "ALT2", "ALT3"), # columns to stack
  variable.name = "Timepoints",
  value.name = "Values"
)
- 去除重复行
a.若某行的name和test(test为分类变量)两个条目同时一致才视作重复的话,去重可以使用:
# Flag rows whose (name, test) pair already appeared in an earlier row,
# then keep only the rows that are not flagged.
is_repeat <- duplicated(data.frame(mydata$name, mydata$test))
mydata2 <- mydata[!is_repeat, ]
b.若mydata中1、2列数据完全相同则删除此行(test为连续变量),代码实现:
# Flag a row only when columns 1 and 2 *together* repeat an earlier row.
# Testing each column separately and combining with & is wrong: it also flags
# rows whose two values each appeared earlier but in different rows, e.g.
# (a, 1), (b, 2), (a, 2) would lose the unique third row.
index <- duplicated(mydata[, c(1, 2)]) # logical, TRUE for repeated pairs
# Keep the rows that are not flagged.
mydata2 <- mydata[!index, ]
- ifelse 函数用于转换成分类变量
# Label each column of `a` from characters 14-15 of its name: codes below 10
# become "tumor", codes of 10 or more become "normal".
# NOTE(review): looks like TCGA sample barcodes — confirm.
sample_code <- as.numeric(str_sub(colnames(a), 14, 15))
group_list <- ifelse(sample_code < 10, "tumor", "normal")
# Or: collapse CLASS to a 0/1 indicator (0 stays 0, any other value becomes 1).
mydata$CLASS <- ifelse(mydata$CLASS == 0, 0, 1)
# Or: bin AFP into three labelled ranges with nested ifelse() calls.
a$AFP <- ifelse(
  a$AFP < 20,
  "<20",
  ifelse(a$AFP > 400, ">400", "20-400")
)
网友评论