1.数据探索常用代码段
# Data exploration: shape, missing values and per-column value distribution.
# Assumes `data` is a data frame. Columns are visited one at a time with
# vapply()/lapply() instead of apply(), because apply() first coerces the
# whole data frame to a matrix.

# Dimensions (rows, columns)
dim(data)

# Missing values: overall total, then one count per column
sum(is.na(data))
na_counts <- vapply(data, function(x) sum(is.na(x)), integer(1))
na_status <- data.frame(na_status = na_counts)

# Value distribution: number of distinct values per column
# (unique() keeps NA, so a column with missing values counts NA once)
unique_counts <- vapply(data, function(x) length(unique(x)), integer(1))
# Row names are the variable names; `items` repeats them as a regular column
# so the table can also be filtered by name.
unique_status <- data.frame(
  unique_status = unique_counts,
  items = names(unique_counts)
)

# Distinct values of every column, always returned as a named list
unique_values <- lapply(data, unique)
2.数据类型转换
## Convert every column to numeric.
## Columns are converted one at a time rather than through apply(): apply()
## goes through as.matrix(), which format()s numeric columns of a mixed-type
## data frame (rounding them to the printed precision) and collapses a
## one-row data frame to a plain vector. Factor columns are parsed from
## their labels, not replaced by their internal level codes.
data <- as.data.frame(data)
data[] <- lapply(data, function(x) {
  if (is.factor(x)) as.numeric(as.character(x)) else as.numeric(x)
})
str(data)

## Convert variables with fewer than n distinct values to character.
n <- 5
## Variable names are the row names of unique_status and the distinct-value
## counts sit in its first column; both are read by position so the lookup
## does not depend on how the columns are labelled.
to_char <- rownames(unique_status)[unique_status[[1]] < n]
is_low_card <- colnames(data) %in% to_char
## List-style indexing keeps a data frame even when exactly one (or no)
## column matches; data[, mask] would drop to a vector in that case.
data[is_low_card] <- lapply(data[is_low_card], as.character)
str(data)
3.正态分布检验
## Shapiro-Wilk normality test on the continuous (numeric) variables.
## Numeric columns are picked with base R; list-style indexing always
## returns a data frame, even for a single column.
contiouns_vars <- data[vapply(data, is.numeric, logical(1))]

## One p-value per variable, in a one-column table whose row names are the
## variable names. shapiro.test() stops with an error unless it gets between
## 3 and 5000 non-missing values that are not all identical, so columns it
## cannot handle get NA instead of aborting the whole run.
plist <- data.frame(
  pvalue = vapply(contiouns_vars, function(x) {
    x <- x[!is.na(x)]
    testable <- length(x) >= 3 && length(x) <= 5000 &&
      all(is.finite(x)) && diff(range(x)) >= 1e-10
    if (testable) shapiro.test(x)$p.value else NA_real_
  }, numeric(1))
)

## Names of the variables whose p-value is below 0.05, i.e. normality is
## rejected; variables with an NA p-value are left out.
nonnormal_vars <- rownames(plist)[which(plist$pvalue < 0.05)]
网友评论