老样子,跟着文档学习
1.readxl包:读取结果为tibble
read_excel(path, #excel文件名,自动判断xls或xlsx
sheet = NULL, #工作表名(sheet1)或表的序号(1)
skip = 0,#读取时要跳过的最小行数,如果依旧有空行,继续跳过该行
n_max = Inf, #最大的读取行数,如果实际存在有效内容的行数(不为NA)小于该参数值,则读取会提前终止
range = NULL, #读取的工作表的范围,比如"B1:D8"或"R1C2:R2C5"或"sheet1!B1:D8",优先级高于"skip","nmax"和"sheet"参数
col_names = TRUE,#TRUE表示使用第一行作为列名,FALSE表示获取默认名称,或者一个字符向量表示每个列的名称。
col_types = NULL, #NULL表示函数自己猜测列类型
na = "", #默认情况下,readxl将空白单元格视为丢失的数据"NA"
trim_ws = TRUE, #默认为TRUE,将自动去除表前后的空白行,优先级低于"range",指定"range"时该参数值TRUE无效
guess_max = min(1000, n_max),#用于猜测列类型的最大数据行数。
progress = readxl_progress(), #是否显示处理进程,默认只出现在交互式会话中
.name_repair = "unique"#列名的处理。默认情况下,readxl将确保列名不为空并且是唯一的
)
1.正常读取:
read_excel("test.xlsx",sheet=2)
read_excel("test.xlsx",sheet="sheet2")
2.读取某一范围:
read_excel("test.xlsx", range = "A1:E7")#读取表1的A1:E7
read_excel("test.xlsx", range = "R1C1:R7C5")#读取从表1的1行1列开始,到7行5列结束
read_excel("test.xlsx", range = "sheet1!A1:E7")#表必须为表名,不能为表的编号
read_excel("test.xlsx", range = "sheet1!R1C1:R7C5")#这样也可以
#以上四种读法结果一致
3.指定读取行或列的范围:
read_excel("test.xlsx", range = cell_rows(10:15), col_names = FALSE)#读取10到15行
#列名称由函数自动分配,样式为"...1","...2","...3",......
read_excel("test.xlsx",range = cell_cols("A:B"))#读取A到B列,也就是1到2列
read_excel("test.xlsx", range = cell_cols(c(NA, 2)))#读取到第2列
read_excel("test.xlsx", range = cell_cols(c(NA, "B")))#读取到第2列
#上述三种方法结果相同
read_excel("test.xlsx", range = cell_cols(2))#只读取第2列
read_excel("test.xlsx", range = cell_cols(c("B", NA)))#从第2列开始读取
read_excel("test.xlsx", range = cell_cols(c(2, NA)))#从第2列开始读取
read_excel("test.xlsx", range = cell_limits(c(4, 3), c(NA, NA)))#从4行3列(C4)开始读取至工作表结束
read_excel("test.xlsx", range = cell_limits(c(4, NA), c(NA, 8)))#从第4行1列开始读取至第n行8列
4.跳过xx行,设置列名为xx,设置列名类型为xx:
read_excel("test.xlsx",
skip = 10,#最少要跳过前10行
col_names = c("ID","Description","Generatio","Bgratio","pvalue"),#指定每一列名称
col_types = c("text","guess","guess","skip","numeric"))
#第四列直接跳过读取,其余列依次指定为文本类型,自动预测,自动预测,数值类型
## A tibble: 6 x 4
# ID Description Generatio pvalue
# <chr> <chr> <chr> <dbl>
#1 GO:0120161 regulation of cold-induced thermogenesis 11/139 0.00000000107
#2 GO:1990845 adaptive thermogenesis 11/139 0.00000000307
#3 GO:0032922 circadian regulation of gene expression 8/139 0.00000000897
#4 GO:0001676 long-chain fatty acid metabolic process 9/139 0.0000000220
#5 GO:0033559 unsaturated fatty acid metabolic process 9/139 0.0000000436
#6 GO:0120254 olefinic compound metabolic process 9/139 0.0000000666
5.列名重复时,函数也会自动进行处理:
read_excel("test.xlsx",
skip = 10,
col_names = c("ID","Description","1ratio","1ratio","pvalue"),#支持数字开头,引用使用` `就可以,列名重复会自动修改
col_types = c("text","skip","guess","guess","numeric"))
#New names:
#* `1ratio` -> `1ratio...2`
#* `1ratio` -> `1ratio...3`
## A tibble: 6 x 4
# ID `1ratio...2` `1ratio...3` pvalue
# <chr> <chr> <chr> <dbl>
#1 GO:0120161 11/139 144/23328 0.00000000107
#2 GO:1990845 11/139 159/23328 0.00000000307
#3 GO:0032922 8/139 70/23328 0.00000000897
#4 GO:0001676 9/139 111/23328 0.0000000220
#5 GO:0033559 9/139 120/23328 0.0000000436
#6 GO:0120254 9/139 126/23328 0.0000000666
6.列名中带有空格:
#默认状态,依旧为空格
read_excel("test.xlsx")
#更改为其他形式
my_custom_name_repair <- function(name) gsub(" ", "_", name)
read_excel("test.xlsx",
.name_repair = my_custom_name_repair)
#同样适用于其他字符的替换,如"."
my_custom_name_repair <- function(name) gsub("\\.", "_", name)
##my_custom_name_repair <- function(name) gsub("[.]", "_", name)
read_excel("test.xlsx",
.name_repair = my_custom_name_repair)
7.读取带有行合并,列合并和行列合并的表格
![](https://img.haomeiwen.com/i19039742/b59d4a71df43d66f.png)
read_excel("test.xlsx")
#New names:
#* `` -> ...2
#* `` -> ...3
#* `` -> ...4
#* `` -> ...5
#* `` -> ...8
#* ...
## A tibble: 6 x 10
# ID ...2 ...3 ...4 ...5 Description Gene.Ratio. ...8 pvalue...9 pvalue...10
# <chr> <lgl> <lgl> <lgl> <lgl> <chr> <chr> <chr> <dbl> <dbl>
#1 NA NA NA NA NA response to stilbenoid 8/139 22/23328 3.86e-13 0.05
#2 NA NA NA NA NA fat cell differentiation 15/139 252/23328 3.11e-11 0.0500
#3 GO:0106106 NA NA NA NA cold-induced thermogenesis 11/139 144/23328 1.07e- 9 0.0500
#4 NA NA NA NA NA NA NA NA NA NA
#5 NA NA NA NA NA NA NA NA NA NA
#6 GO:0001659 NA NA NA NA temperature homeostasis 13/139 181/23328 6.66e-11 0.0500
#其会自动保留左上角的值,其余用NA替代,等同于读取有表格中整行整列NA的数据,无法实现中间行列全为NA的过滤
2.openxlsx包:读取结果为数据框
read.xlsx(
xlsxFile,#xlsx文件名,不支持"xls"文件
sheet = 1,#工作表名称或序号
startRow = 1,#默认从第一行开始查找数据。另外,不管startRow的值是多少,文件顶部的空行总是会被跳过。
colNames = TRUE,#如果为TRUE,第一行数据将用作列名。
rowNames = FALSE,#如果为TRUE,第一列将用作行名。
detectDates = FALSE,#如果为TRUE,尝试识别日期并执行转换。
skipEmptyRows = TRUE,#如果为TRUE,则跳过空行,否则在包含数据的第一行之后的空行将返回一行NAs。
skipEmptyCols = TRUE,#如果为TRUE,则跳过空列
rows = NULL,#一个数字向量,指定要读取Excel文件中的哪些行。如果为NULL,则读取所有行。
cols = NULL,#一个数字向量,指定要读取Excel文件中的哪些列。如果为NULL,则读取所有列。
check.names = FALSE,#如果为TRUE,则检查数据框中的变量名,以确保它们在语法上是有效的变量名
sep.names = ".",#一个字符,默认情况下,用"."来代替列名中的空格
namedRegion = NULL,#对于普通的xlsx文件来说不必理会
na.strings = "NA",#空白单元格将返回为NA
fillMergedCells = FALSE#如果为TRUE,合并单元格中的值将给予合并单元格中的所有单元格。
)
1.默认读取
read.xlsx("test.xlsx",sheet=1)
read.xlsx("test.xlsx",sheet="sheet1")
2.指定读取行或列的范围:
![](https://img.haomeiwen.com/i19039742/6fa938ff11654cdc.png)
read.xlsx("test.xlsx",
sheet = 1,
colNames=FALSE,#不让读取到的第一行成为列名
skipEmptyRows = FALSE,#不跳过空行,即在包含数据的第一行之后的空行将返回一行NAs
skipEmptyCols = FALSE,#不跳过空列,即在包含数据的第一列之后的空列将返回一行NAs
cols = c(1, 4, 8), #读取第1,4,8列,但保持完整的表格形式,第2,3,5,6,7行用NA填充
rows = c(1, 2, 6))#读取第1,2,6行,但保持完整的表格形式,第3,4,5行用NA填充
# X1 X2 X3 X4 X5 X6 X7 X8
#1 NA NA NA ID NA NA NA pvalue
#2 NA NA NA <NA> NA NA NA <NA>
#3 NA NA NA <NA> NA NA NA <NA>
#4 NA NA NA <NA> NA NA NA <NA>
#5 NA NA NA GO:0106106 NA NA NA 1.0727869301115299E-9
########注意:文件顶部的空行被跳过了,不然应该一共有6行8列,现在只有5行8列。########
read.xlsx("test.xlsx",
sheet = 1,
colNames=FALSE,#不让读取到的第一行成为列名
skipEmptyRows = T,#跳过空行
skipEmptyCols = T,#跳过空列
cols = c(1, 4, 8), #读取第1,4,8列,简化的表格形式
rows = c(1, 2, 6))#读取第1,2,6行,简化的表格形式
# X1 X2
#1 ID pvalue
#2 GO:0106106 1.0727869301115299E-9
#######这是简化后的版本,对于读取内容中有整行整列NA的数据十分友好########
3.当列名重复时:
![](https://img.haomeiwen.com/i19039742/5e94f9481edff962.png)
#默认状态,不进行处理,会导致语义不明
x <- read.xlsx("test.xlsx")
x
# ID Description GeneRatio BgRatio pvalue pvalue
#1 GO:0035634 response to stilbenoid 8/139 22/23328 3.863005e-13 0.05
#2 GO:0045444 fat cell differentiation 15/139 252/23328 3.112811e-11 0.05
#3 GO:0001659 temperature homeostasis 13/139 181/23328 6.657958e-11 0.05
#4 GO:0106106 cold-induced thermogenesis 11/139 144/23328 1.072787e-09 0.05
x$pvalue
#[1] 3.863005e-13 3.112811e-11 6.657958e-11 1.072787e-09
#check.names状态为TRUE
x <- read.xlsx("test.xlsx",check.names = T)
x
# ID Description GeneRatio BgRatio pvalue pvalue.1
#1 GO:0035634 response to stilbenoid 8/139 22/23328 3.863005e-13 0.05
#2 GO:0045444 fat cell differentiation 15/139 252/23328 3.112811e-11 0.05
#3 GO:0001659 temperature homeostasis 13/139 181/23328 6.657958e-11 0.05
#4 GO:0106106 cold-induced thermogenesis 11/139 144/23328 1.072787e-09 0.05
x$pvalue
#[1] 3.863005e-13 3.112811e-11 6.657958e-11 1.072787e-09
x$pvalue.1
#[1] 0.05 0.05 0.05 0.05
4.列名中带有空格:
#由sep.names指定,默认替换为".",与参数check.names的值无关
read.xlsx("test.xlsx")
#可以保持不被更改,依旧为空格
read.xlsx("test.xlsx",sep.names = " ")
#更改为其他形式
read.xlsx("test.xlsx",sep.names = "_")
5.读取带有行合并,列合并和行列合并的表格:
![](https://img.haomeiwen.com/i19039742/bcc15a639290036d.png)
read.xlsx("test.xlsx")#只保留左上角的值,然后其余用NA代替,再自动去除表中的整行整列NA
# ID Description Gene.Ratio. X4 pvalue pvalue
#1 <NA> response to stilbenoid 8/139 22/23328 3.863005e-13 0.05
#2 <NA> fat cell differentiation 15/139 252/23328 3.112811e-11 0.05
#3 GO:0106106 cold-induced thermogenesis 11/139 144/23328 1.072787e-09 0.05
#4 GO:0001659 temperature homeostasis 13/139 181/23328 6.657958e-11 0.05
read.xlsx("test.xlsx",fillMergedCells = T)#选择用左上角的值填充整个合并单元格
# ID ID.1 ID.2 ID.3 Description Gene.Ratio. Gene.Ratio..1 pvalue pvalue.1
#1 ID ID ID ID response to stilbenoid 8/139 22/23328 3.863005e-13 0.05
#2 ID ID ID ID fat cell differentiation 15/139 252/23328 3.112811e-11 0.05
#3 GO:0106106 GO:0106106 GO:0106106 GO:0106106 cold-induced thermogenesis 11/139 144/23328 1.072787e-09 0.05
#4 GO:0106106 GO:0106106 GO:0106106 GO:0106106 <NA> <NA> <NA> NA NA
#5 GO:0106106 GO:0106106 GO:0106106 GO:0106106 <NA> <NA> <NA> NA NA
#6 GO:0001659 GO:0001659 GO:0001659 GO:0001659 temperature homeostasis 13/139 181/23328 6.657958e-11 0.05
附:基准测试
library(bench)
set.seed(42)
create_df <- function(rows, cols) {
as.data.frame(replicate(cols, runif(rows, 1, 100), simplify = T))
}
results <- bench::press(
rows = c(1000, 10000),
cols = c(2, 10),
{
dat <- create_df(rows, cols)
write.xlsx(dat,file = "1.xlsx")
bench::mark(
min_iterations = 100,
read_excel = as.data.frame(read_excel("1.xlsx")),
read.xlsx = read.xlsx("1.xlsx"),
)
}
)
#Running with:
# rows cols
#1 1000 2
#2 10000 2
#3 1000 10
#4 10000 10
results
## A tibble: 8 x 15
# expression rows cols min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
# <bch:expr> <dbl> <dbl> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <lis> <list>
#1 read_excel 1000 2 10.4ms 10.5ms 94.3 133.02KB 1.92 98 2 1.04s <df[,2~ <Rprof~ <bch~ <tibb~
#2 read.xlsx 1000 2 29.4ms 29.9ms 33.1 586.05KB 0.335 99 1 2.99s <df[,2~ <Rprof~ <bch~ <tibb~
#3 read_excel 10000 2 32.2ms 33.3ms 30.0 1.23MB 0.929 97 3 3.23s <df[,2~ <Rprof~ <bch~ <tibb~
#4 read.xlsx 10000 2 76.8ms 78ms 12.7 4.99MB 1.74 88 12 6.91s <df[,2~ <Rprof~ <bch~ <tibb~
#5 read_excel 1000 10 19.1ms 20.2ms 49.2 546.29KB 1.52 97 3 1.97s <df[,1~ <Rprof~ <bch~ <tibb~
#6 read.xlsx 1000 10 48.7ms 50ms 19.8 2.47MB 0.826 96 4 4.84s <df[,1~ <Rprof~ <bch~ <tibb~
#7 read_excel 10000 10 114.7ms 118.9ms 8.42 5.33MB 1.15 88 12 10.46s <df[,1~ <Rprof~ <bch~ <tibb~
#8 read.xlsx 10000 10 265ms 269.2ms 3.71 23.28MB 2.37 61 39 16.43s <df[,1~ <Rprof~ <bch~ <tibb~
library(ggplot2)
autoplot(results)+
coord_trans()
![](https://img.haomeiwen.com/i19039742/b5950d8cd1f16b00.png)
网友评论