美文网首页R语言杂记
R 读取Excel文件 学习笔记

R 读取Excel文件 学习笔记

作者: leoxiaobei | 来源:发表于2021-01-21 23:25 被阅读0次

老样子,跟着文档学习

1.readxl包:读取结果为tibble

read_excel(path, #excel文件名,自动判断xls或xlsx
          sheet = NULL, #工作表名(sheet1)或表的序号(1)
          skip = 0,#读取时要跳过的最小行数,如果依旧有空行,继续跳过该行
          n_max = Inf, #最大的读取行数,如果实际存在有效内容的行数(不为NA)小于该参数值,则读取会提前终止
          range = NULL, #读取的工作表的范围,比如"B1:D8"或"R1C2:R2C5"或"sheet1!B1:D8",优先级高于"skip","nmax"和"sheet"参数
          col_names = TRUE,#TRUE表示使用第一行作为列名,FALSE表示获取默认名称,或者一个字符向量表示每个列的名称。
          col_types = NULL, #NULL表示函数自己猜测列类型
          na = "", #默认情况下,readxl将空白单元格视为丢失的数据"NA"
          trim_ws = TRUE, #默认为TRUE,将自动去除表前后的空白行,优先级低于"range",指定"range"时该参数值TRUE无效
          guess_max = min(1000, n_max),#用于猜测列类型的最大数据行数。
          progress = readxl_progress(), #是否显示处理进程,默认只出现在交互式会话中
          .name_repair = "unique"#列名的处理。默认情况下,readxl将确保列名不为空并且是唯一的
)

1.正常读取:
read_excel("test.xlsx",sheet=2)
read_excel("test.xlsx",sheet="sheet2")
2.读取某一范围:

read_excel("test.xlsx", range = "A1:E7")#读取表1的A1:E7
read_excel("test.xlsx", range = "R1C1:R7C5")#读取从表1的1行1列开始,到7行5列结束
read_excel("test.xlsx", range = "sheet1!A1:E7")#表必须为表名,不能为表的编号
read_excel("test.xlsx", range = "sheet1!R1C1:R7C5")#这样也可以
#以上四种读法结果一致

3.指定读取行或列的范围:

read_excel("test.xlsx", range = cell_rows(10:15), col_names = FALSE)#读取10到15行
#列名称由函数自动分配,样式为"...1","...2","...3",......

read_excel("test.xlsx",range = cell_cols("A:B"))#读取A到B列,也就是1到2列
read_excel("test.xlsx", range = cell_cols(c(NA, 2)))#读取到第2列
read_excel("test.xlsx", range = cell_cols(c(NA, "B")))#读取到第2列
#上述三种方法结果相同

read_excel("test.xlsx", range = cell_cols(2))#只读取第2列

read_excel("test.xlsx", range = cell_cols(c("B", NA)))#从第2列开始读取
read_excel("test.xlsx", range = cell_cols(c(2, NA)))#从第2列开始读取

read_excel("test.xlsx", range = cell_limits(c(4, 3), c(NA, NA)))#从4行3列(C4)开始读取至工作表结束
read_excel("test.xlsx", range = cell_limits(c(4, NA), c(NA, 8)))#从第4行1列开始读取至第n行8列

4.跳过xx行,设置列名为xx,设置列名类型为xx:

read_excel("test.xlsx",
           skip = 10,#最少要跳过前10行
           col_names = c("ID","Description","Generatio","Bgratio","pvalue"),#指定每一列名称
           col_types = c("text","guess","guess","skip","numeric"))
#第四列直接跳过读取,其余列依次指定为文本类型,自动预测,自动预测,数值类型

## A tibble: 6 x 4
#  ID         Description                              Generatio        pvalue
#  <chr>      <chr>                                    <chr>             <dbl>
#1 GO:0120161 regulation of cold-induced thermogenesis 11/139    0.00000000107
#2 GO:1990845 adaptive thermogenesis                   11/139    0.00000000307
#3 GO:0032922 circadian regulation of gene expression  8/139     0.00000000897
#4 GO:0001676 long-chain fatty acid metabolic process  9/139     0.0000000220 
#5 GO:0033559 unsaturated fatty acid metabolic process 9/139     0.0000000436 
#6 GO:0120254 olefinic compound metabolic process      9/139     0.0000000666 

5.列名重复时,函数也会自动进行处理:

read_excel("test.xlsx",
           skip = 10,
           col_names = c("ID","Description","1ratio","1ratio","pvalue"),#支持数字开头,引用使用` `就可以,列名重复会自动修改
           col_types = c("text","skip","guess","guess","numeric"))

#New names:
#* `1ratio` -> `1ratio...2`
#* `1ratio` -> `1ratio...3`
## A tibble: 6 x 4
#  ID         `1ratio...2` `1ratio...3`        pvalue
#  <chr>      <chr>        <chr>                <dbl>
#1 GO:0120161 11/139       144/23328    0.00000000107
#2 GO:1990845 11/139       159/23328    0.00000000307
#3 GO:0032922 8/139        70/23328     0.00000000897
#4 GO:0001676 9/139        111/23328    0.0000000220 
#5 GO:0033559 9/139        120/23328    0.0000000436 
#6 GO:0120254 9/139        126/23328    0.0000000666 

6.列名中带有空格:

#默认状态,依旧为空格
read_excel("test.xlsx")
#更改为其他形式
my_custom_name_repair <- function(name) gsub(" ", "_", name)
read_excel("test.xlsx",
           .name_repair = my_custom_name_repair)
#同样适用于其他字符的替换,如"."
my_custom_name_repair <- function(name) gsub("\\.", "_", name)
##my_custom_name_repair <- function(name) gsub("[.]", "_", name)
read_excel("test.xlsx",
            .name_repair = my_custom_name_repair)

7.读取带有行合并,列合并和行列合并的表格


read_excel("test.xlsx")

#New names:
#* `` -> ...2
#* `` -> ...3
#* `` -> ...4
#* `` -> ...5
#* `` -> ...8
#* ...
## A tibble: 6 x 10
#  ID         ...2  ...3  ...4  ...5  Description                Gene.Ratio. ...8      pvalue...9 pvalue...10
#  <chr>      <lgl> <lgl> <lgl> <lgl> <chr>                      <chr>       <chr>          <dbl>       <dbl>
#1 NA         NA    NA    NA    NA    response to stilbenoid     8/139       22/23328    3.86e-13      0.05  
#2 NA         NA    NA    NA    NA    fat cell differentiation   15/139      252/23328   3.11e-11      0.0500
#3 GO:0106106 NA    NA    NA    NA    cold-induced thermogenesis 11/139      144/23328   1.07e- 9      0.0500
#4 NA         NA    NA    NA    NA    NA                         NA          NA         NA            NA     
#5 NA         NA    NA    NA    NA    NA                         NA          NA         NA            NA     
#6 GO:0001659 NA    NA    NA    NA    temperature homeostasis    13/139      181/23328   6.66e-11      0.0500
#其会自动保留左上角的值,其余用NA替代,等同于读取有表格中整行整列NA的数据,无法实现中间行列全为NA的过滤

2.openxlsx包:读取结果为数据框

read.xlsx(
  xlsxFile,#xlsx文件名,不支持"xls"文件
  sheet = 1,#工作表名称或序号
  startRow = 1,#默认从第一行开始查找数据。另外,不管startRow的值是多少,文件顶部的空行总是会被跳过。
  colNames = TRUE,#如果为TRUE,第一行数据将用作列名。
  rowNames = FALSE,#如果为TRUE,第一列将用作行名。
  detectDates = FALSE,#如果为TRUE,尝试识别日期并执行转换。
  skipEmptyRows = TRUE,#如果为TRUE,则跳过空行,否则在包含数据的第一行之后的空行将返回一行NAs。
  skipEmptyCols = TRUE,#如果为TRUE,则跳过空列
  rows = NULL,#一个数字向量,指定要读取Excel文件中的哪些行。如果为NULL,则读取所有行。
  cols = NULL,#一个数字向量,指定要读取Excel文件中的哪些列。如果为NULL,则读取所有列。
  check.names = FALSE,#如果为TRUE,则检查数据框中的变量名,以确保它们在语法上是有效的变量名
  sep.names = ".",#一个字符,默认情况下,用"."来代替列名中的空格
  namedRegion = NULL,#对于普通的xlsx文件来说不必理会
  na.strings = "NA",#空白单元格将返回为NA
  fillMergedCells = FALSE#如果为TRUE,合并单元格中的值将给予合并单元格中的所有单元格。
)

1.默认读取
read.xlsx("test.xlsx",sheet=1)
read.xlsx("test.xlsx",sheet="sheet1")
2.指定读取行或列的范围:

原始excel
read.xlsx("test.xlsx",
          sheet = 1, 
          colNames=FALSE,#不让读取到的第一行成为列名
          skipEmptyRows = FALSE,#不跳过空行,即在包含数据的第一行之后的空行将返回一行NAs
          skipEmptyCols = FALSE,#不跳过空列,即在包含数据的第一列之后的空列将返回一行NAs
          cols = c(1, 4, 8), #读取第1,4,8列,但保持完整的表格形式,第2,3,5,6,7行用NA填充
          rows = c(1, 2, 6))#读取第1,2,6行,但保持完整的表格形式,第3,4,5行用NA填充

#  X1 X2 X3         X4 X5 X6 X7                    X8
#1 NA NA NA         ID NA NA NA                pvalue
#2 NA NA NA       <NA> NA NA NA                  <NA>
#3 NA NA NA       <NA> NA NA NA                  <NA>
#4 NA NA NA       <NA> NA NA NA                  <NA>
#5 NA NA NA GO:0106106 NA NA NA 1.0727869301115299E-9
########注意:文件顶部的空行被跳过了,不然应该一共有6行8列,现在只有5行8列。########


read.xlsx("test.xlsx",
          sheet = 1, 
          colNames=FALSE,#不让读取到的第一行成为列名
          skipEmptyRows = T,#跳过空行
          skipEmptyCols = T,#跳过空列
          cols = c(1, 4, 8), #读取第1,4,8列,简化的表格形式
          rows = c(1, 2, 6))#读取第1,2,6行,简化的表格形式

#          X1                    X2
#1         ID                pvalue
#2 GO:0106106 1.0727869301115299E-9
#######这是简化后的版本,对于读取内容中有整行整列NA的数据十分友好########

3.当列名重复时:


原始excel1
#默认状态,不进行处理,会导致语义不明
x <- read.xlsx("test.xlsx")
x
#          ID                Description GeneRatio   BgRatio       pvalue pvalue
#1 GO:0035634     response to stilbenoid     8/139  22/23328 3.863005e-13   0.05
#2 GO:0045444   fat cell differentiation    15/139 252/23328 3.112811e-11   0.05
#3 GO:0001659    temperature homeostasis    13/139 181/23328 6.657958e-11   0.05
#4 GO:0106106 cold-induced thermogenesis    11/139 144/23328 1.072787e-09   0.05
x$pvalue
#[1] 3.863005e-13 3.112811e-11 6.657958e-11 1.072787e-09

#check.names状态为TRUE
x <- read.xlsx("test.xlsx",check.names = T)
x
#          ID                Description GeneRatio   BgRatio       pvalue pvalue.1
#1 GO:0035634     response to stilbenoid     8/139  22/23328 3.863005e-13     0.05
#2 GO:0045444   fat cell differentiation    15/139 252/23328 3.112811e-11     0.05
#3 GO:0001659    temperature homeostasis    13/139 181/23328 6.657958e-11     0.05
#4 GO:0106106 cold-induced thermogenesis    11/139 144/23328 1.072787e-09     0.05
x$pvalue
#[1] 3.863005e-13 3.112811e-11 6.657958e-11 1.072787e-09
x$pvalue.1
#[1] 0.05 0.05 0.05 0.05

4.列名中带有空格:

#由sep.names指定,默认替换为".",与参数check.names的值无关
read.xlsx("test.xlsx")
#可以保持不被更改,依旧为空格
read.xlsx("test.xlsx",sep.names = " ")
#更改为其他形式
read.xlsx("test.xlsx",sep.names = "_")

5.读取带有行合并,列合并和行列合并的表格:


同样的表
read.xlsx("test.xlsx")#只保留左上角的值,然后其余用NA代替,再自动去除表中的整行整列NA
#          ID                Description Gene.Ratio.        X4       pvalue pvalue
#1       <NA>     response to stilbenoid       8/139  22/23328 3.863005e-13   0.05
#2       <NA>   fat cell differentiation      15/139 252/23328 3.112811e-11   0.05
#3 GO:0106106 cold-induced thermogenesis      11/139 144/23328 1.072787e-09   0.05
#4 GO:0001659    temperature homeostasis      13/139 181/23328 6.657958e-11   0.05


read.xlsx("test.xlsx",fillMergedCells = T)#选择用左上角的值填充整个合并单元格
#          ID       ID.1       ID.2       ID.3                Description Gene.Ratio. Gene.Ratio..1       pvalue pvalue.1
#1         ID         ID         ID         ID     response to stilbenoid       8/139      22/23328 3.863005e-13     0.05
#2         ID         ID         ID         ID   fat cell differentiation      15/139     252/23328 3.112811e-11     0.05
#3 GO:0106106 GO:0106106 GO:0106106 GO:0106106 cold-induced thermogenesis      11/139     144/23328 1.072787e-09     0.05
#4 GO:0106106 GO:0106106 GO:0106106 GO:0106106                       <NA>        <NA>          <NA>           NA       NA
#5 GO:0106106 GO:0106106 GO:0106106 GO:0106106                       <NA>        <NA>          <NA>           NA       NA
#6 GO:0001659 GO:0001659 GO:0001659 GO:0001659    temperature homeostasis      13/139     181/23328 6.657958e-11     0.05

附:基准测试

library(bench)
set.seed(42)

create_df <- function(rows, cols) {
  as.data.frame(replicate(cols, runif(rows, 1, 100), simplify = T))
}

results <- bench::press(
  rows = c(1000, 10000),
  cols = c(2, 10),
  {
    dat <- create_df(rows, cols)
    write.xlsx(dat,file = "1.xlsx")
    bench::mark(
      min_iterations = 100,
      read_excel = as.data.frame(read_excel("1.xlsx")),
      read.xlsx = read.xlsx("1.xlsx"),
    )
  }
)

#Running with:
#   rows  cols
#1  1000     2
#2 10000     2
#3  1000    10
#4 10000    10

results
## A tibble: 8 x 15
#  expression  rows  cols      min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result  memory  time  gc    
#  <bch:expr> <dbl> <dbl> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list>  <list>  <lis> <list>
#1 read_excel  1000     2   10.4ms   10.5ms     94.3   133.02KB    1.92     98     2      1.04s <df[,2~ <Rprof~ <bch~ <tibb~
#2 read.xlsx   1000     2   29.4ms   29.9ms     33.1   586.05KB    0.335    99     1      2.99s <df[,2~ <Rprof~ <bch~ <tibb~
#3 read_excel 10000     2   32.2ms   33.3ms     30.0     1.23MB    0.929    97     3      3.23s <df[,2~ <Rprof~ <bch~ <tibb~
#4 read.xlsx  10000     2   76.8ms     78ms     12.7     4.99MB    1.74     88    12      6.91s <df[,2~ <Rprof~ <bch~ <tibb~
#5 read_excel  1000    10   19.1ms   20.2ms     49.2   546.29KB    1.52     97     3      1.97s <df[,1~ <Rprof~ <bch~ <tibb~
#6 read.xlsx   1000    10   48.7ms     50ms     19.8     2.47MB    0.826    96     4      4.84s <df[,1~ <Rprof~ <bch~ <tibb~
#7 read_excel 10000    10  114.7ms  118.9ms      8.42    5.33MB    1.15     88    12     10.46s <df[,1~ <Rprof~ <bch~ <tibb~
#8 read.xlsx  10000    10    265ms  269.2ms      3.71   23.28MB    2.37     61    39     16.43s <df[,1~ <Rprof~ <bch~ <tibb~

library(ggplot2)
autoplot(results)+
  coord_trans()
read_excel和read.xlsx对比
小结:感觉两个函数总体上差不多,但是readxl::read_excel基准测试(单次执行时间,每秒执行次数,运行分配内存)上还是要更优秀一些(且其明明还多执行了一个函数as.data.frame),然后读取方式也更多样化一些,而read.xlsx对于读取合并单元格和读取内容中间NA行(列)的数据支持的更好一点,各有千秋吧

相关文章

网友评论

    本文标题:R 读取Excel文件 学习笔记

    本文链接:https://www.haomeiwen.com/subject/vxdlzktx.html