美文网首页
DEXseq2分析

DEXseq2分析

作者: 就是大饼 | 来源:发表于2024-07-18 10:09 被阅读0次

    isoform usage是常见的RNA seq下游分析流程,常见的包也是DEXseq2。但是用它跑自己的数据一直报错,跑示例数据是能够work的。

    names(x) <- value: 'names' atrribute [9] must be the same length as the vector [1]
    

    网上很多教程都是对示例数据进行讲解,解决不了我的问题。看报错信息感觉是gff文件格式对不上,看了源代码,它的版本很久了,要求gff文件里“exon_number”为“exonic_part_number”、“transcript_id”为“transcripts”、第三列类型为“exonic_part”。要改的地方太多了,所以我自己改了它的源代码来跑。

    要求输入文件有:

    1. 以exon为单元的reads count文件
    2. gff文件

    获得reads count文件——HTseq

    # 安装
    conda install -c bioconda htseq
    htseq-count -f bam -c all.exon.count.csv -n 40 --append-output  -t exon -i gene_id -i exon_number \
    /data/workdir/FRAL190036196.sort.bam \
    /data/workdir/FRAL190036198.sort.bam \
    /data/workdir/FRAL190036258.sort.bam.........\
    /data/workdir/information/hg38/gencode_new.v40.gtf
    # -f  输入文件格式
    # -c  输出文件名
    # -n 线程数
    #  --append-output  所有结果合并输出一个结果文件,每列为一个样本
    # -t  识别gtf第三列的exon为计算reads count的单元
    # -i  结果文件第一列为id,为gene_id:exon_number的格式
    !非常耗时,建议用多点线程跑。一个样本单线程差不多一个半小时
    

    在这我多输出了一列gene name,完全没必要


    结果文件

    分隔各列为一个文件的脚本

    # samples.txt为样本名,每行一个样本
    sf="samples.txt"
    for i in {2..469};
    do
        s_num=$((i-1))
        s=$(sed -n "${s_num}p" "${sf}")
        awk -v col="$((i+1))" '{OFS = "\t"} {print $1,$col}' all.exon.count.tsv > "split_sample_exon_count/${s}.txt"
    done
    

    接下来用R跑

    # DEXseq分析
    library("DEXSeq")
    samples_label <- read.delim("D:/data/465_sample_group.txt")
    countFiles <- list.files("D:/data/extData_exonnumber/", pattern=".txt$", full.names=TRUE)
    gffFile <- list.files("D:/data/DEXseq", pattern="gff$", full.names=TRUE)
    sampleTable <- data.frame(row.names=samples_label$Sample,
                              condition=samples_label$group)
    sample_names <- samples_label$Sample
    countFileNames <- sub("\\.txt$", "", basename(countFiles))
    # 根据样本名称排序 countFiles
    sorted_countFiles <- countFiles[order(match(countFileNames, sample_names))]
    
    # 大部分都没有动,只做了细微改变
    modify_DEXSeqDataSetFromHTSeq = function (sorted_countFiles, sampleTable, design = ~sample + exon + 
                                             condition:exon, gffFile = NULL){
      if (!all(sapply(sorted_countFiles, class) == "character")) {
        stop("The countfiles parameter must be a character vector")
      }
      lf <- lapply(sorted_countFiles, function(x) read.table(x, header = FALSE, 
                                                             stringsAsFactors = FALSE))
      if (!all(sapply(lf[-1], function(x) all(x$V1 == lf[1]$V1)))) 
        stop("Count files have differing gene ID column.")
      dcounts <- sapply(lf, `[[`, "V2")
      rownames(dcounts) <- gsub("_PAR_Y","",lf[[1]][, 1])
      dcounts <- dcounts[substr(rownames(dcounts), 1, 1) != "_",]
      rownames(dcounts) <- sub(":", ":E", rownames(dcounts))
      colnames(dcounts) <- sorted_countFiles
      splitted <- strsplit(rownames(dcounts), ":")
      exons <- sapply(splitted, "[[", 2)
      genesrle <- sapply(splitted, "[[", 1)
      if (!is.null(gffFile)) {
        aggregates <- read.delim(gffFile, stringsAsFactors = FALSE, header = FALSE)
        colnames(aggregates) <- c("chr", "source", "class", 
                                  "start", "end", "ex", "strand", "ex2", "attr")
        aggregates$strand <- gsub("\\.", "*", aggregates$strand)
        aggregates <- aggregates[which(aggregates$class == "exon"),]
        aggregates$attr <- gsub("\"|=|;", " ", aggregates$attr)
        aggregates$gene_id <- sub(".*gene_id\\s(\\S+).*", "\\1", aggregates$attr)
        transcripts <- gsub(".*transcript_id\\s(\\S+).*", "\\1", aggregates$attr)
        exonids <- gsub(".*exon_number\\s(\\S+).*", "\\1", aggregates$attr)
        exoninfo <- GRanges(as.character(aggregates$chr), IRanges(start = aggregates$start, 
                                                                  end = aggregates$end), strand = aggregates$strand)
        names(exoninfo) <- paste(aggregates$gene_id, exonids, sep = ":E")
        names(transcripts) <- rownames(exoninfo)
        if (!all(rownames(dcounts) %in% names(exoninfo))) {
          stop("Count files do not correspond to the flattened annotation file")
        }
        matching <- match(rownames(dcounts), names(exoninfo))
        stopifnot(all(names(exoninfo[matching]) == rownames(dcounts)))
        stopifnot(all(names(transcripts[matching]) == rownames(dcounts)))
        dxd <- DEXSeqDataSet(dcounts, sampleTable, design, exons, 
                             genesrle, exoninfo[matching], transcripts[matching])
        return(dxd)
      }
      else {
        dxd <- DEXSeqDataSet(dcounts, sampleTable, design, exons, 
                             genesrle)
        return(dxd)
      }
    }
    
    dxd <- modify_DEXSeqDataSetFromHTSeq(
      sorted_countFiles,
      sampleData=sampleTable,
      design= ~sample + exon + condition:exon,
      flattenedfile=gffFile)
    
    # 差异分析
    dxr <- DEXSeq(dxd)
    

    没跑完,实在是太慢了,数据量太大,R直接死机了

    相关文章

      网友评论

          本文标题:DEXseq2分析

          本文链接:https://www.haomeiwen.com/subject/uikzcjtx.html