分析群体结构的几种方法

作者: 千万别加香菜 | 来源:发表于2021-11-19 10:09 被阅读0次
    常见的群体结构的分析方法有admixture分析、系统发生数分析以及主成分分析等。

    1、admixture分析

    ###过滤数据
    常用plink软件过滤,在此就不做介绍了,直接开始后续操作。
    
    ###dmixture进行群体遗传结构分析(群体数自己决定)
    for K in 3 4 5 6 7; do /home/software/admixture_linux-1.3.0/admixture --cv ld.QC.75_noinclude0-502502-geno02-maf03.bed $K | tee log${K}.out; done
    
    ###提取CV值:CV error最小的为最佳K值
    grep -h CV log*.out
    
    分析结束后生成了自己设定k值的Q文件,用于在R中绘图
    
    R语言绘图
    admixture的可视化分为两种
    ###最佳K值的可视化
    ta1 = read.table("ld.QC.75_noinclude0-502502-geno02-maf03.ped.map.3.4.Q")    ##用的是最佳K值的那个Q文件    
    head(ta1)
    barplot(t(as.matrix(ta1)),col = rainbow(3),
            xlab = "Individual",
            ylab = "Ancestry",
            border = NA)
    
    ####全部K值的可视化(较复杂)
    利用表格根据fam文件(三列 1.地区Asia  2.ID名称,与fam文件的一致  3.样本品种)作三列的order.txt并用制表符分隔形式保存
    (###可将order.txt文件的第一列地区Asia改成真正的个体名称,这样图中就会显示每个个体名称
      ###可将order.txt文件中的顺序进行调整则图中的顺序即为order.txt文件的个体顺序)
    Session中上传工作目录,需建立一个文件夹(包括Q文件,fam、bed、bim文件,order.txt文件)
    ##安装软件
    install packages(Rcolorbrewer)
    ##(导入含有Q order.txt bed bid fam的文件夹,修改以下程序中的文件名和K值)
    sort.admixture <- function(admix.data){
      ## sort columns according to the cor
      k <- length(admix.data)
      n.ind <- nrow(admix.data[[1]])
      name.ind <- rownames(admix.data[[1]])
      admix.sorted <- list()
      
      if (admix.data[[1]][1,1] > admix.data[[1]][1,2]){
        admix.sorted[[1]] <- admix.data[[1]]
      }else{
        admix.sorted[[1]] <- admix.data[[1]][,c(2,1)]
      }
       for (i in 1:(k-1)){
        admix <- matrix(nrow = n.ind, ncol = (i + 2))
        cors <- cor(admix.sorted[[i]], admix.data[[i + 1]])
        sorted.loc <- c()
        for (j in 1:nrow(cors)){
          cor <- cors[j,]
          cor[sorted.loc] <- NA
          sorted.loc <- c(sorted.loc, which.max(cor))
        }
        
        sorted.loc <- c(sorted.loc, which(! 1:ncol(cors) %in% sorted.loc))
        cat("n_max = ", sorted.loc, "\n")
        admix <- admix.data[[i + 1]][,sorted.loc]
        rownames(admix) <- name.ind
        admix.sorted[[i + 1]] <- admix
      }
      return(admix.sorted)
    }
    
    sort.iid <- function(k.values, groups){
      ##k.values <- admix.values[[1]]
      ##groups <- admix.fam
      max.col <- which.max(colSums(k.values))
      k.values <- cbind(k.values, groups[match(rownames(k.values), as.character(groups$iid)),])
      k.values <- transform(k.values, group = as.factor(k.values$fid))
      k.means <- tapply(k.values[,max.col], k.values$group, mean)
      k.means <- k.means[order(k.means)]
      k.sort <- data.frame(id = names(k.means), 
                           order = order(k.means),
                           mean = k.means)
      k.values$order <- k.sort[match(as.character(k.values$group), k.sort$id), 3]
      k.values <- k.values[order(k.values$order, k.values[,max.col]),]
      return(rownames(k.values))
    }
    
    sort.fid <- function(iid.order, fid.order, fam.table){
      new.order <- c()
      for (fid in fid.order){
        new.order <- c(new.order, which(iid.order %in% fam.table[fam.table$fid == fid, "iid"]))
      }
      return(iid.order[new.order])
    }
    
    read.structure <- function(file, type = "structure"){
      if (type == "structure"){
        k.values <- read.table(file = file, header = F)
        rownames(k.values) <- k.values[,1]
        k.values[,1:3] <- NULL
      }else{
        k.values <- read.table(file = file, header = F)
      }
      return(k.values)
    }
    
    add.black.line <- function(data, groups, nline = 1){
      # data <- as.matrix(plot.data[[2]])
      # groups <- group.name
      # nline <- 3
      
      data <- as.matrix(data)
      group.name <- unique(groups)
      new.data <- matrix(NA, ncol = ncol(data))
      black.data <- matrix(NA, nrow = nline, ncol = ncol(data))
      new.name <- c(NA)
      for (name in group.name){
        new.data <- rbind(new.data, black.data)
        new.data <- rbind(new.data, data[which(groups == name),])
        new.name <- c(new.name, rep(NA,nline))
        new.name <- c(new.name, rownames(data)[which(groups == name)])
      }
      
      added.data <- new.data[(nline + 2):nrow(new.data),]
      rownames(added.data) <- new.name[(nline + 2):nrow(new.data)]
      return(added.data)
    }
    
    ##============================================================================
    ## 
    header <- "ld.QC.75_noinclude0-502502-geno02-maf03"         (######plink格式的文件,改为自己使用的文件名)
    max.k <- 4
    admix.fn <- paste(header, 2:max.k, "Q", sep = ".")
    fam.fn <- paste(header, "fam", sep = ".")
    admix.fam <- read.table(fam.fn, stringsAsFactors = F,
                            col.names = c("fid", "iid", "pid", "mid", "sex", "pheno"))
    admix.values <- lapply(admix.fn, read.table, header = F, 
                           row.names = as.character(admix.fam$iid))
    order.fn <- paste(header, "order.txt", sep = ".")
    admix.order <- read.table(order.fn, col.names = c("region", "iid", "fid"), stringsAsFactors = F)
    id.order <- admix.order$iid
    #id.order <- sort.iid(admix.values[[1]], admix.order)
    admix.data <- list()
    for (i in 1:length(admix.values)){
      admix.data[[i]] <- admix.values[[i]][id.order,]
    }
    species <- as.character(admix.order[,1])
    sorted.data <- sort.admixture(admix.data)
    ## add black line in plot
    nline <- 1
    plot.data <- list()
    group.order <- admix.order[match(id.order, admix.order$iid),3]
    for (i in 1:length(sorted.data)){
      plot.data[[i]] <- add.black.line(sorted.data[[i]], group.order, nline = nline)
    }
    ## add xlab to plot
    plot.id.list <- rownames(plot.data[[1]])
    #plot.xlab <- admix.fam[match(x = plot.id.list, table = admix.fam$iid),]
    plot.xlab <- admix.order[match(x = plot.id.list, table = admix.order$iid),]
    plot.lab <- unique(plot.xlab$fid)
    plot.lab <- plot.lab[!is.na(plot.lab)]
    plot.at <- c()
    start <- 0
    for (fid in plot.lab){
      xlen <- length(which(plot.xlab$fid == fid))
      gap <- start + floor(xlen / 2)
      plot.at <- c(plot.at, gap)
      start <- start + nline + xlen
    }
    ##=============================================================================
    ## barplot admixture and structure
    library(RColorBrewer)
    my.colours <- c(brewer.pal(8, "Dark2"), "mediumblue", "darkred", "coral4",
                    "purple3", "lawngreen", "dodgerblue1", "paleturquoise3",
                    "navyblue",  "green3", "red1", "cyan", 
                    "orange", "blue", "magenta4", "yellowgreen", "darkorange3", 
                    "grey60", "black")
    max.k <- length(plot.data)
    n <- dim(plot.data[[1]])[1]
    #pdf(file=paste(header, "admix.plot.pdf", sep = "."), width = 16, height = 12)
    png(file=paste(header, "admix.plot.png", sep = "."), res=300, width = 2400, height = 2000)
    par(mfrow = c(max.k,1),  mar=c(0.5,2,0,0), oma=c(6,0,1,0))
    par(las=2)
    #for (i in 1:(max.k - 1)){
    for (i in 1:max.k){
      barplot(t(as.matrix(plot.data[[i]])), names.arg = rep(c(""), n), 
              col = my.colours, border = NA, space = 0, axes = F, 
              ylab = "")
      axis(side = 2, at = 0.5, labels = as.character(i+1), tick = F, hadj = 0)
    }
    axis(side = 1, at = plot.at, labels = plot.lab, tick = F, lty = 15, cex.axis = 0.5)
    #barplot(t(as.matrix(plot.data[[max.k]])), names.arg = rownames(plot.data[[1]]), axes = F,
    #        col = my.colours, border = NA, space = 0, 
    #        ylab = paste("K=", max.k + 1, sep = ""), cex.axis=0.6(坐标轴字体大小), cex.names=0.6)
    dev.off()
    
    

    2、系统发生树分析

    有很多种建树的方法,我们这里介绍下MEGA用NJ法建树和IQ-TREE用ML法建树
    过滤的步骤略去
    1)MEGA建树
    ###计算genome
    /home/software/plink  --bfile XXX-502502-geno02-maf03 --allow-extra-chr --chr-set 26  --genome
    
    
    编写perl脚本 .pl结尾(Linux下新建.pl文件,直接进入编辑即可,当然,底下这个是现成的,改改可以直接用)
    (第一个open里改成自己的genome文件
    第二个open里改成自己的fam文件
    第三个open里将>后面的改成输出的文件名.meg
    在下面的sample_size里,将数量改成自己使用的数量)
    #!usr/bin/perl
    # define array of input and output files
    open (AAA,"ld.QC.0-1-cattle-290-chr1-29-snp-indel.filter.pass-502502-geno01-maf00.genome.genome") || die "can't open AAA"; ##用自己上一步的genome文件
    open (BBB,"290-cattle-breed.fam.txt") || die "can't open BBB"; ##使用自己的fam文件
    open (CCC,">4-22-lzx_Dis.meg"); ##输出文件名在>后面改
    my @aa=<AAA>;
    my @bb=<BBB>;
    
    $sample_size=290; ###  个体数目,改成自己用的数目
    print CCC "#mega\n!Title: $sample_size pigs;\n!Format DataType=Distance DataFormat=UpperRight NTaxa=$sample_size;\n\n"; 
    
    foreach ($num1=0;$num1<=$#bb;$num1++){
        chomp $bb[$num1];
        @arraynum1=split(/\s+/,$bb[$num1]);
        print CCC "#$arraynum1[1]\n";       ##个体的ID名称
        }
    print CCC "\n";
    
    @array=();
    foreach ($num2=1;$num2<=$#aa;$num2++){
        chomp $aa[$num2];
        @arraynum1=split(/\s+/,$aa[$num2]);
        push(@array,1-$arraynum1[12]);
        }
        
    @array2=(0);
    $i=$sample_size;
    while ($i>0){   
        push(@array2,$array2[$#array2]+$i);
        $i=$i-1;
        }
    print "@array2";
    
    for ($i=($sample_size-1); $i>=0; $i=$i-1){
        print CCC " " x ($sample_size-($i+1));
            for ($j=$array2[$sample_size-$i-1]; $j<=$array2[$sample_size-$i]-1; $j++){
                                                                                      print CCC "$array[$j] ";  
                                                                                     }
            print  CCC "\n";
        }
    close AAA;
    close BBB;
    close CCC;
    

    #######生成.meg文件利用MEGA软件可视化

    2)IQ-TREE建树
    #####转成map ped
    /home/software/plink --allow-extra-chr --chr-set 27 -bfile ld.QC.xll_noinclude0.recode-502502-geno02-maf03 --recode --out ld.QC.xll_noinclude0.recode-502502-geno02-maf03  
    
    #####转成fa
    nohup python /home/hmy/ped2fa.py ld.QC.xll_noinclude0.recode-502502-geno02-maf03.ped ld.QC.xll_noinclude0.recode-502502-geno02-maf03.fa &
         打开fa文件修改个体ID,当然也可以不改
    
    #####过滤I成N
    sed -i "s/I/N/g" ld.QC.xll_noinclude0.recode-502502-geno02-maf03.fa
    
    #####mafft比对
    ####多序列比对:是指把多条(3 条或以上)有系统进化关系的蛋白质或核酸序列进行比对,尽可能地把相同的碱基或氨基酸残基排在同一列上。
    ###这样做的意义是,对齐的碱基或氨基酸残基在进化上是同源的,即来自共同祖先(common ancestor)。
    nohup mafft --auto ld.QC.xll_noinclude0.recode-502502-geno02-maf03.fa > ld.xll_noinclude0.recode-502502-geno02-maf03_aligned.fa &
    
    #####Iqtree(构建进化树)
    nohup /home/software/iqtree-1.6.12-Linux/bin/iqtree -s ld.QC.75_xiugai_noinclude0-502502-geno02-maf03.fa -m TEST -st DNA -bb 1000 -nt AUTO &
    
    #####itol画树(网页搜索)
    用bionj文件
    
    

    3、主成分分析

    (gcta)(gcta用的是数字染色体,若不是,注意替换下)
    ###make germ
    nohup /home/software/gcta_1.92.3beta3/gcta64 --bfile ld.QC.75_noinclude0-502502-geno02-maf03 --make-grm --autosome-num 26 --out ld.QC.75_noinclude0-502502-geno02-maf03.gcta &
           # autosome表示只选出常染色体来运行
    ###pca
    nohup /home/software/gcta_1.92.3beta3/gcta64 --grm ld.QC.75_noinclude0-502502-geno02-maf03.gcta --pca 5 --out ld.QC.75_noinclude0-502502-geno02-maf03.gcta.out &
        ########输入的文件就是.gcta,这个文件不是上一步的生成文件,pca后面跟的是要做几个主成分的比较
    
    R绘图
    ################
    a=read.table("pca.eigenvec",header=F)
    head(a)
    dim(a)
    
    b=read.table("pca.txt",header=F)  ######pca.txt为四列的文件,(1列为品种,2列和4列为个体ID,3列为排序数字)
    head(b)
    library("ggplot2")  
    qplot(a[,3],a[,4],col=b[,1])
    Breed=b[,1]
    p = ggplot(data  = a ,
           aes(x = a[,3],
               y = a[,4],    ############x = a[,3], y = a[,4]代表第3列和第4列比较也就是pc1与pc2比较
               group = Breed,
               shape = Breed,
               color = Breed)
    )+geom_point(size=2) +scale_shape_manual(values = seq(0,75))
    p + labs(x = "pc1", y = "pc2")     #############修改x轴和y轴的坐标名称
    
    
    好了,群体结构分析这块就会这么多了

    相关文章

      网友评论

        本文标题:分析群体结构的几种方法

        本文链接:https://www.haomeiwen.com/subject/jmjdtrtx.html