2020-07-08 R Basic Plotting + Statistics

Author: 程凉皮儿 | Published 2020-07-08 13:33

    R statistics

    Install and load the required R packages

    ## Use domestic mirrors for Bioconductor and CRAN, and download via libcurl ##
    options(BioC_mirror="https://mirrors.tuna.tsinghua.edu.cn/bioconductor/")
    options("repos" = c(CRAN="http://mirrors.cloud.tencent.com/CRAN/"))
    options(download.file.method = 'libcurl')
    options(url.method='libcurl')
    
    #install.packages(c("beanplot", "plotrix", "pastecs", "reshape2"))
    
    library(beanplot)
    library(plotrix)
    #> Warning: package 'plotrix' was built under R version 3.6.2
    library(pastecs)
    library(reshape2)
    #> Warning: package 'reshape2' was built under R version 3.6.2
    
    Data types

    QUALITATIVE DATA

    ## power analysis ##
    power.prop.test(p1 = .25, p2 = .7, power = .8, sig.level = 0.05)
    #> 
    #>      Two-sample comparison of proportions power calculation 
    #> 
    #>               n = 18.10585
    #>              p1 = 0.25
    #>              p2 = 0.7
    #>       sig.level = 0.05
    #>           power = 0.8
    #>     alternative = two.sided
    #> 
    #> NOTE: n is number in *each* group
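
    The same function can be run in reverse: fix the group size and ask what power it achieves. A minimal sketch (the n = 20 per group below is an arbitrary illustration, not a value from the original analysis):

    ## Hypothetical check: power achieved with 20 cats per group ##
    power.prop.test(n = 20, p1 = .25, p2 = .7, sig.level = 0.05)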
    
    ## Load the data ##
    cats.data<-read.table("cats.dat", sep="\t",header=T)
    
    ## Look at the data ##
    head(cats.data)
    #>         Training Dance
    #> 1 Food as Reward   Yes
    #> 2 Food as Reward   Yes
    #> 3 Food as Reward   Yes
    #> 4 Food as Reward   Yes
    #> 5 Food as Reward   Yes
    #> 6 Food as Reward   Yes
    #View(cats.data)
    
    ## Plot the data and contingency table ##
    plot(cats.data$Training, cats.data$Dance, xlab = "Training", ylab = "Dance")
    
    plot(table(cats.data))
    
    
    contingency.table <- table(cats.data)
    contingency.table100<-prop.table(contingency.table,1)
    contingency.table100<-round(contingency.table100*100)
    contingency.table100
    #>                      Dance
    #> Training              No Yes
    #>   Affection as Reward 70  30
    #>   Food as Reward      26  74
    
    contingency.table100<-cbind(contingency.table100[,"Yes"],contingency.table100[,"No"])
    colnames(contingency.table100) <- c("Yes", "No")
    contingency.table100
    #>                     Yes No
    #> Affection as Reward  30 70
    #> Food as Reward       74 26
    
    barplot(t(contingency.table100), 
            legend.text = TRUE,
            ylab = "Percentages",
            las=1)
    
    
    #### Pretty ###
    par(lwd=2) 
    barplot(t(contingency.table100), 
            col=c("chartreuse3","lemonchiffon2"),
            cex.axis=1.2, 
            cex.names=1.5,
            cex.lab=1.5,
            ylab = "Percentages",
            las=1
            )
    
    legend("topright",   
           title="Dancing", 
           inset=.05, 
           c("Yes","No"), 
           horiz=TRUE, 
           pch=15,
           col=c("chartreuse3","lemonchiffon2"))
    
    
    dev.off()
    #> null device 
    #>           1
    

    卡方检验 / Chi-squared test and Fisher's exact test

    ## Chi2  ##
    chisq.test(contingency.table)
    #> 
    #>  Pearson's Chi-squared test with Yates' continuity correction
    #> 
    #> data:  contingency.table
    #> X-squared = 23.52, df = 1, p-value = 1.236e-06
    
    ## Chi2 without continuity correction #
    chisq.test(contingency.table, correct=F)
    #> 
    #>  Pearson's Chi-squared test
    #> 
    #> data:  contingency.table
    #> X-squared = 25.356, df = 1, p-value = 4.767e-07
    
    ## Fisher's exact test ##
    fisher.test(contingency.table)
    #> 
    #>  Fisher's Exact Test for Count Data
    #> 
    #> data:  contingency.table
    #> p-value = 1.312e-06
    #> alternative hypothesis: true odds ratio is not equal to 1
    #> 95 percent confidence interval:
    #>   2.837773 16.429686
    #> sample estimates:
    #> odds ratio 
    #>   6.579265
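
    The object returned by chisq.test() also stores the expected counts and the standardized residuals, which show which cells drive the association. A short sketch:

    ## Expected counts and standardized residuals from the chi-squared test ##
    chi2.result <- chisq.test(contingency.table, correct = FALSE)
    chi2.result$expected  # counts expected under independence
    chi2.result$stdres    # standardized residuals per cell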
    

    QUANTITATIVE DATA

    t-tests

    ## Import data ##
    coyote<-read.csv("coyote.csv")
    #View(coyote)
    head(coyote)
    #>   length gender
    #> 1   93.0 female
    #> 2   97.0 female
    #> 3   92.0 female
    #> 4  101.6 female
    #> 5   93.0 female
    #> 6   84.5 female
    
    ## Power calculation # 
    #power.t.test(n = , d = , sig.level = , power = , type = c("two.sample", "one.sample", "paired"))  
    
    # mean 1 = 92 and mean 2 = 87.4
    # sd = 7
    
    delta <- 92 - 87.4
    power.t.test(delta = delta, sd = 7, sig.level = 0.05, power = 0.8)
    #> 
    #>      Two-sample t test power calculation 
    #> 
    #>               n = 37.33624
    #>           delta = 4.6
    #>              sd = 7
    #>       sig.level = 0.05
    #>           power = 0.8
    #>     alternative = two.sided
    #> 
    #> NOTE: n is number in *each* group
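
    As with the proportions example earlier, the call can be inverted to ask what power a given sample size achieves; a small sketch with a hypothetical 30 animals per group:

    ## Hypothetical: power reached with n = 30 per group ##
    power.t.test(n = 30, delta = 92 - 87.4, sd = 7, sig.level = 0.05)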
    

    Graphical exploration of the data

    ## Stripchart ##
    stripchart(coyote$length~coyote$gender)
    
    stripchart(coyote$length~coyote$gender,vertical=TRUE, method="jitter")
    
    
    stripchart(coyote$length~coyote$gender,
               vertical=TRUE, 
               method="jitter", 
               las=1, 
               ylab="Length", 
               pch=16,
               col=c("darkorange","purple"), 
               cex=1.5
               )
    
    # Add group means #
    length.means <- tapply(coyote$length,coyote$gender,mean) 
    
    segments(1:2-0.15,length.means, 1:2+0.15, length.means, col="black", lwd=3)
    
    
    ## Boxplot or beanplot # beanplot package ##
    boxplot(coyote$length~coyote$gender,notch = TRUE)
    
    
    # Prettier: 
    boxplot(coyote$length~coyote$gender, col=c("orange","purple"), ylab = 'Length (cm)', las = 1)
    
    
    ## Beanplot ##
    beanplot(coyote$length~coyote$gender,las=1,ylab="Length (cm)") 
    
    
    ## Combination boxplot and scatterplot ##
    boxplot(coyote$length~coyote$gender, 
            lwd = 2, 
            ylab = 'Length', 
            cex.axis=1.5,
            las=1, 
            cex.lab=1.5)
    
    stripchart(coyote$length~coyote$gender, 
               vertical = TRUE, 
               method = "jitter", 
               pch = 20, 
               col = 'red', 
               cex=2, 
               add = TRUE)
    
    
    ## Combination boxplot and beanplot ##
    beanplot(coyote$length~coyote$gender,
             las=1, overallline = "median", 
             ylab = 'Length', 
             cex.lab=1.5, 
             col="bisque",
             what = c(0,1,1,0),
             cex.axis=1.5) 
    
    boxplot(coyote$length~coyote$gender, 
            add=TRUE, 
            col=rgb(0.2,0.5,0.3, alpha=0.5), 
            pch = 20,  
            cex=2, 
            lwd=2, 
            yaxt='n',
            xaxt='n')
    
    
    ## Histogram ##
    par(mfrow=c(1,2))
    hist(coyote[coyote$gender=="male",]$length)
    hist(coyote[coyote$gender=="female",]$length)
    
    
    # Prettier:
    hist(coyote[coyote$gender=="male",]$length,main="Male", xlab="Length",col="lightgreen")
    hist(coyote[coyote$gender=="female",]$length,main="Female", xlab="Length",col="tomato1")
    
    
    ### Stats summary of test for normality (1st assumption)  ## pastecs package ###
    #by(coyote$length,coyote$gender, stat.desc, basic=F, desc=F, norm=T)#
    tapply(coyote$length,coyote$gender, stat.desc, basic=F, desc=F, norm=T)
    #> $female
    #>   skewness   skew.2SE   kurtosis   kurt.2SE normtest.W normtest.p 
    #> -0.5290403 -0.7320175  0.5029741  0.3546892  0.9700101  0.3164448 
    #> 
    #> $male
    #>    skewness    skew.2SE    kurtosis    kurt.2SE  normtest.W  normtest.p 
    #> -0.08453876 -0.11697379 -0.67756124 -0.47780521  0.98445697  0.81898309
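
    The normtest.W and normtest.p values reported by stat.desc() are the Shapiro-Wilk statistic and p-value; the same test can be run directly in base R as a cross-check:

    ## Shapiro-Wilk normality test per group (base R) ##
    tapply(coyote$length, coyote$gender, shapiro.test)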
    
    ### Homogeneity of variance (test 2nd assumption) 
    bartlett.test(coyote$length~coyote$gender)
    #> 
    #>  Bartlett test of homogeneity of variances
    #> 
    #> data:  coyote$length by coyote$gender
    #> Bartlett's K-squared = 0.02021, df = 1, p-value = 0.887
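
    With only two groups, base R's var.test() provides an equivalent F test of equal variances; a minimal alternative sketch:

    ## F test comparing the two group variances ##
    var.test(coyote$length ~ coyote$gender)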
    
    ###### Independent t-test ###########
    t.test(coyote$length~coyote$gender, var.equal=T)
    #> 
    #>  Two Sample t-test
    #> 
    #> data:  coyote$length by coyote$gender
    #> t = -1.6411, df = 84, p-value = 0.1045
    #> alternative hypothesis: true difference in means is not equal to 0
    #> 95 percent confidence interval:
    #>  -5.184747  0.496375
    #> sample estimates:
    #> mean in group female   mean in group male 
    #>             89.71163             92.05581
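
    If the equal-variance assumption were in doubt, omitting var.equal=T gives Welch's t-test, which is R's default and does not pool the variances:

    ## Welch's t-test (unequal variances allowed) ##
    t.test(coyote$length ~ coyote$gender)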
    
    ## Plot a bar chart ##
    dev.off()
    #> null device 
    #>           1
    bar.length<-barplot(length.means) 
    bar.length<-barplot(length.means,xlim=c(0,1),width=0.3, ylim=c(0,100)) 
    
    # Prettier:
    bar.length<-barplot(length.means, col=c("darkslategray1","darkseagreen1"), xlim=c(0,1),width=0.3,ylim=c(50,100), ylab="Mean length",las=1, xpd=FALSE)
    
    ## Calculation of SEM for error bars # plotrix package ##
    length.se<-tapply(coyote$length,coyote$gender,std.error)
    length.se
    #>    female      male 
    #> 0.9988377 1.0211241
    
    # Alternative #
    length.se<-tapply(coyote$length,coyote$gender,function(x) sd(x)/sqrt(length(x)))
    
    ## Barplot returns the values for the centers of the bars ##
    bar.length
    #>      [,1]
    #> [1,] 0.21
    #> [2,] 0.57
    
    # Adding error bars #
    arrows(x0=bar.length, y0=length.means-length.se, x1=bar.length, y1=length.means+length.se, length=0.3, angle=90,code=3)
    
    ###### Independent t-test ###########
    t.test(coyote$length~coyote$gender, var.equal=T)
    #> 
    #>  Two Sample t-test
    #> 
    #> data:  coyote$length by coyote$gender
    #> t = -1.6411, df = 84, p-value = 0.1045
    #> alternative hypothesis: true difference in means is not equal to 0
    #> 95 percent confidence interval:
    #>  -5.184747  0.496375
    #> sample estimates:
    #> mean in group female   mean in group male 
    #>             89.71163             92.05581
    
    ###### Paired t-test ###########
    working.memory<-read.csv("Working.memory.csv", header=T)
    head(working.memory)
    #>   Subject placebo DA.depletion
    #> 1      M1       9            7
    #> 2      M2      10            7
    #> 3      M3      15           10
    #> 4      M4      18           12
    #> 5      M5      19           13
    #> 6      M6      22           15
    
    ### Graphical exploration of the data ###
    
    ## Boxplot ##
    boxplot(working.memory[2:3])
    beanplot(working.memory[2:3],las=1,ylab="Scores") 
    
    ## Stripchart ##
    stripchart(working.memory[2:3])
    stripchart(working.memory[2:3],vertical=TRUE, method="jitter", las=1, ylab="Scores", pch=16,col=c("darkred","darkgreen"), cex=1.5)
    
    # Add group means #
    scores.means <- colMeans(working.memory[2:3])
    scores.means
    #>      placebo DA.depletion 
    #>     27.26667     18.86667
    segments(1:2-0.15,scores.means, 1:2+0.15, scores.means, col="black", lwd=3)
    
    ## Check assumption for normality ##
    stat.desc(working.memory$placebo, basic=F, desc=F, norm=T)
    #>   skewness   skew.2SE   kurtosis   kurt.2SE normtest.W normtest.p 
    #>  0.3338775  0.2877662 -1.0318530 -0.4602800  0.9591353  0.6773687
    stat.desc(working.memory$DA.depletion, basic=F, desc=F, norm=T)
    #>   skewness   skew.2SE   kurtosis   kurt.2SE normtest.W normtest.p 
    #>  0.4334375  0.3735762 -0.9792473 -0.4368141  0.9427404  0.4180816
    
    ## Running paired t-test ##
    t.test(working.memory$placebo, working.memory$DA.depletion, paired=T)
    #> 
    #>  Paired t-test
    #> 
    #> data:  working.memory$placebo and working.memory$DA.depletion
    #> t = 8.6161, df = 14, p-value = 5.715e-07
    #> alternative hypothesis: true difference in means is not equal to 0
    #> 95 percent confidence interval:
    #>   6.308997 10.491003
    #> sample estimates:
    #> mean of the differences 
    #>                     8.4
    
    ## Plot the difference ##
    working.memory$Difference<- working.memory$placebo-working.memory$DA.depletion
    head(working.memory)
    #>   Subject placebo DA.depletion Difference
    #> 1      M1       9            7          2
    #> 2      M2      10            7          3
    #> 3      M3      15           10          5
    #> 4      M4      18           12          6
    #> 5      M5      19           13          6
    #> 6      M6      22           15          7
    stat.desc(working.memory$Difference, basic=F, desc=F, norm=T)
    #>    skewness    skew.2SE    kurtosis    kurt.2SE  normtest.W  normtest.p 
    #>  0.08857017  0.07633789 -1.04670910 -0.46690688  0.97726706  0.94740752
    
    ## Difference as stripchart ##
    stripchart(working.memory$Difference)
    stripchart(working.memory$Difference,vertical=TRUE, method="jitter", las=1, ylab="Differences",pch=16, col="blue", cex=2)
    
    diff.mean <- mean(working.memory$Difference)
    loc.strip<-1
    segments(loc.strip-0.15,diff.mean, loc.strip+0.15, diff.mean, col="black", lwd=3)
    diff.se <- std.error(working.memory$Difference) # plotrix package #
    
    ## Confidence intervals ##
    lower<-diff.mean-1.96*diff.se
    upper<-diff.mean+1.96*diff.se
    
    ## Plot confidence intervals on a scatterplot ##
    arrows(x0=loc.strip, y0=lower, x1=loc.strip, y1=upper, length=0.3, angle=90,code=3,lwd=3)
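
    The 1.96 multiplier is the large-sample normal approximation; with only 15 differences an exact interval would use the t quantile instead, as in this sketch:

    ## Exact 95% CI for the mean difference using the t distribution ##
    n.diff <- length(working.memory$Difference)
    t.crit <- qt(0.975, df = n.diff - 1)
    c(diff.mean - t.crit * diff.se, diff.mean + t.crit * diff.se)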
    
    ## Plot beanplots ##
    beanplot(working.memory$Difference,las=1,ylab="Differences") 
    
    ## One-sample t-test ##
    t.test(working.memory$Difference ,mu=0)
    #> 
    #>  One Sample t-test
    #> 
    #> data:  working.memory$Difference
    #> t = 8.6161, df = 14, p-value = 5.715e-07
    #> alternative hypothesis: true mean is not equal to 0
    #> 95 percent confidence interval:
    #>   6.308997 10.491003
    #> sample estimates:
    #> mean of x 
    #>       8.4
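
    The one-sample t-test on the differences reproduces the paired t-test above. If the differences looked clearly non-normal, a non-parametric alternative would be the Wilcoxon signed-rank test, sketched here:

    ## Non-parametric alternative: Wilcoxon signed-rank test ##
    wilcox.test(working.memory$placebo, working.memory$DA.depletion, paired = TRUE)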
    

    ANOVA

    # Import and plot dataset #
    protein<-read.csv("protein.expression.csv",header=T)
    head(protein)
    #>      A    B    C    D    E
    #> 1 0.40 0.26 0.24 1.04 0.74
    #> 2 1.50 0.47 0.25 2.78 0.99
    #> 3 0.98 0.42 1.01 0.82 1.26
    #> 4 0.33 0.64 0.77 1.65 1.50
    #> 5 0.75 0.32 0.47 0.49 0.30
    #> 6 1.48 0.65 0.47 0.97 0.34
    
    protein.stack<-melt(protein) ## reshape2 package ## this will generate a warning because we do not specify the ID ##
    colnames(protein.stack)<-c("line","expression")
    protein.stack.clean <- protein.stack[!is.na(protein.stack$expression),]
    head(protein.stack.clean)
    #>   line expression
    #> 1    A       0.40
    #> 2    A       1.50
    #> 3    A       0.98
    #> 4    A       0.33
    #> 5    A       0.75
    #> 6    A       1.48
    
    ## stripchart ##
    
    stripchart(protein.stack.clean$expression~protein.stack.clean$line,
               vertical=TRUE,
               method="jitter",
               las=1,
               ylab="Protein Expression", 
               pch=16, 
               col=rainbow(5))
    
    expression.means<-tapply(protein.stack.clean$expression,protein.stack.clean$line,mean)
    segments(1:5-0.15,expression.means, 1:5+0.15, expression.means, col="black", lwd=3)
    
    
    ## boxplot, beanplot and normality test ##
    boxplot(protein.stack.clean$expression~protein.stack.clean$line)
    
    boxplot(protein.stack.clean$expression~protein.stack.clean$line,col=rainbow(5), las=1,ylab="Protein Expression") 
    
    
    beanplot(protein.stack.clean$expression~protein.stack.clean$line,overallline = "median",log="",ylab="Protein Expression", las=1) ## beanplot package ##
    
    
    tapply(protein.stack.clean$expression,protein.stack.clean$line,stat.desc,desc=F, basic=F, norm=T) ## pastecs package ##
    #> $A
    #>    skewness    skew.2SE    kurtosis    kurt.2SE  normtest.W  normtest.p 
    #> -0.02086018 -0.01636601 -1.16300793 -0.47190556  0.92956709  0.37554602 
    #> 
    #> $B
    #>    skewness    skew.2SE    kurtosis    kurt.2SE  normtest.W  normtest.p 
    #>  0.11228032  0.08809036 -1.38388312 -0.56152854  0.95351440  0.68878672 
    #> 
    #> $C
    #>    skewness    skew.2SE    kurtosis    kurt.2SE  normtest.W  normtest.p 
    #> 1.277108966 1.190715643 0.565595525 0.272498653 0.819684006 0.002921089 
    #> 
    #> $D
    #>     skewness     skew.2SE     kurtosis     kurt.2SE   normtest.W   normtest.p 
    #> 1.9465885553 1.8149065614 3.6022084100 1.7355104443 0.7530719928 0.0003548725 
    #> 
    #> $E
    #>   skewness   skew.2SE   kurtosis   kurt.2SE normtest.W normtest.p 
    #>  0.4569725  0.4260594 -0.4885993 -0.2354026  0.9670693  0.7411281
    
    ## stripchart again (expression still on the original scale; the log10 transform is applied below) ##
    stripchart(protein.stack.clean$expression~protein.stack.clean$line,
               vertical=TRUE,
               method="jitter",
               las=1,
               ylab="Protein Expression", 
               pch=16, 
               col=rainbow(5)
    )
    
    expression.log.means<-tapply(protein.stack.clean$expression,protein.stack.clean$line,mean)
    segments(1:5-0.15,expression.log.means,1:5+0.15, expression.log.means, col="black", lwd=3)
    
    
    beanplot(protein.stack.clean$expression~protein.stack.clean$line,overallline = "median",ylab="Protein Expression") ## beanplot package ##
    
    
    # log transform y and test again #
    protein.stack.clean$log10.expression<-log10(protein.stack.clean$expression)
    head(protein.stack.clean)
    #>   line expression log10.expression
    #> 1    A       0.40     -0.397940009
    #> 2    A       1.50      0.176091259
    #> 3    A       0.98     -0.008773924
    #> 4    A       0.33     -0.481486060
    #> 5    A       0.75     -0.124938737
    #> 6    A       1.48      0.170261715
    
    tapply(protein.stack.clean$log10.expression,protein.stack.clean$line,stat.desc,basic=F, norm=T, desc=F)
    #> $A
    #>    skewness    skew.2SE    kurtosis    kurt.2SE  normtest.W  normtest.p 
    #> -0.61963191 -0.48613680 -1.28185977 -0.52013124  0.85424637  0.04143953 
    #> 
    #> $B
    #>   skewness   skew.2SE   kurtosis   kurt.2SE normtest.W normtest.p 
    #> -0.3231837 -0.2535561 -1.2210293 -0.4954485  0.9458450  0.5772532 
    #> 
    #> $C
    #>   skewness   skew.2SE   kurtosis   kurt.2SE normtest.W normtest.p 
    #>  0.1851959  0.1726679 -1.0600052 -0.5107006  0.9657060  0.7141796 
    #> 
    #> $D
    #>   skewness   skew.2SE   kurtosis   kurt.2SE normtest.W normtest.p 
    #>  0.3345489  0.3119174 -0.4044918 -0.1948804  0.9868425  0.9934883 
    #> 
    #> $E
    #>   skewness   skew.2SE   kurtosis   kurt.2SE normtest.W normtest.p 
    #> -0.7155948 -0.6671865 -0.3741176 -0.1802464  0.9313425  0.2050270
    
    ## boxplot log transformed data ##
    boxplot(protein.stack.clean$log10.expression~protein.stack.clean$line,col=rainbow(5))
    
    
    # equal variance (2nd assumption) #
    bartlett.test(protein.stack.clean$log10.expression~protein.stack.clean$line)
    #> 
    #>  Bartlett test of homogeneity of variances
    #> 
    #> data:  protein.stack.clean$log10.expression by protein.stack.clean$line
    #> Bartlett's K-squared = 5.8261, df = 4, p-value = 0.2125
    
    #run the anova
    anova.log.protein<-aov(log10.expression~line,data=protein.stack.clean)
    summary(anova.log.protein)
    #>             Df Sum Sq Mean Sq F value   Pr(>F)    
    #> line         4  2.691  0.6728   8.123 1.78e-05 ***
    #> Residuals   73  6.046  0.0828                     
    #> ---
    #> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
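
    Had the homogeneity-of-variance assumption failed, base R also offers Welch's one-way ANOVA, which does not pool the group variances; a quick sketch:

    ## Welch's one-way ANOVA (no equal-variance assumption) ##
    oneway.test(log10.expression ~ line, data = protein.stack.clean)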
    
    #par(mfrow=c(2,2))
    #plot(anova.log.protein)
    
    ## Bonferroni ##
    pairwise.t.test(protein.stack.clean$log10.expression,protein.stack.clean$line, p.adj = "bonf")
    #> 
    #>  Pairwise comparisons using t tests with pooled SD 
    #> 
    #> data:  protein.stack.clean$log10.expression and protein.stack.clean$line 
    #> 
    #>   A      B       C      D     
    #> B 0.3655 -       -      -     
    #> C 1.0000 1.0000  -      -     
    #> D 0.0571 1.9e-05 0.0017 -     
    #> E 1.0000 0.0062  0.3318 0.7675
    #> 
    #> P value adjustment method: bonferroni
    
    ## Tukey ##
    TukeyHSD(anova.log.protein)
    #>   Tukey multiple comparisons of means
    #>     95% family-wise confidence level
    #> 
    #> Fit: aov(formula = log10.expression ~ line, data = protein.stack.clean)
    #> 
    #> $line
    #>            diff          lwr        upr     p adj
    #> B-A -0.25024832 -0.578882494 0.07838585 0.2187264
    #> C-A -0.07499724 -0.374997820 0.22500335 0.9560187
    #> D-A  0.30549397  0.005493391 0.60549456 0.0438762
    #> E-A  0.13327517 -0.166725416 0.43327575 0.7265567
    #> C-B  0.17525108 -0.124749499 0.47525167 0.4809387
    #> D-B  0.55574230  0.255741712 0.85574288 0.0000183
    #> E-B  0.38352349  0.083522904 0.68352407 0.0054767
    #> D-C  0.38049121  0.112162532 0.64881989 0.0015431
    #> E-C  0.20827240 -0.060056276 0.47660108 0.2023355
    #> E-D -0.17221881 -0.440547487 0.09610987 0.3841989
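
    The TukeyHSD object also has a plot method that draws the family-wise confidence intervals for each pairwise difference, which is often easier to scan than the table:

    ## Plot the Tukey pairwise confidence intervals ##
    plot(TukeyHSD(anova.log.protein))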
    
    ## as a bar plot ##
    bar.expression<-barplot(expression.means, beside=TRUE, ylab="Mean expression",ylim=c(0,3),las=1)
    expression.se <- tapply(protein.stack.clean$expression, protein.stack.clean$line,std.error) 
    
    arrows(x0=bar.expression, y0=expression.means-expression.se, x1=bar.expression, y1=expression.means+expression.se, length=0.2, angle=90,code=3)
    

    Correlation

    #dev.off()
    ## Import and plot correlation data ##
    exam.anxiety<-read.table("exam.anxiety.dat", sep="\t",header=T)
    head(exam.anxiety)
    #>   Code Revise Exam Anxiety Gender
    #> 1    1      4   40  86.298   Male
    #> 2    2     11   65  88.716 Female
    #> 3    3     27   80  70.178   Male
    #> 4    4     53   80  61.312   Male
    #> 5    5      4   40  89.522   Male
    #> 6    6     22   70  60.506 Female
    
    ## Plot the data ##
    plot(exam.anxiety$Anxiety~exam.anxiety$Revise,col=exam.anxiety$Gender,pch=16)
    
    ## Add legend  ##
    legend("topright", title="Gender",inset=.05, c("Female","Male"), horiz=TRUE, pch=16,col=1:2)
    
    # also works: 
    legend("topright", title="Gender",inset=.05, c("Female","Male"), horiz=TRUE, pch=16,col=c("black","red"))
    
    ## Fit the model ##
    fit.male <- lm(Anxiety~Revise, data=exam.anxiety[exam.anxiety$Gender=="Male",])
    fit.female <- lm(Anxiety~Revise, data=exam.anxiety[exam.anxiety$Gender=="Female",])
    
    ## Add a line of best-fit ##
    abline((fit.male), col="red")
    abline((fit.female), col="black")
    
    
    ## Check the assumptions ##
    par(mfrow=c(2,2))
    plot(fit.male)
    
    plot(fit.female)
    
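    The diagnostic plots flag observations with unusually large influence on the fit; their Cook's distances can also be listed directly, as in this quick sketch:

    ## Largest Cook's distances for the male fit ##
    head(sort(cooks.distance(fit.male), decreasing = TRUE))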
    
    ## Test the significance of the model ##
    summary(fit.male)
    #> 
    #> Call:
    #> lm(formula = Anxiety ~ Revise, data = exam.anxiety[exam.anxiety$Gender == 
    #>     "Male", ])
    #> 
    #> Residuals:
    #>     Min      1Q  Median      3Q     Max 
    #> -73.124  -2.900   2.221   6.750  16.600 
    #> 
    #> Coefficients:
    #>             Estimate Std. Error t value Pr(>|t|)    
    #> (Intercept)  84.1941     2.6213  32.119  < 2e-16 ***
    #> Revise       -0.5353     0.1016  -5.267 2.94e-06 ***
    #> ---
    #> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
    #> 
    #> Residual standard error: 13.3 on 50 degrees of freedom
    #> Multiple R-squared:  0.3568, Adjusted R-squared:  0.344 
    #> F-statistic: 27.74 on 1 and 50 DF,  p-value: 2.937e-06
    summary(fit.female)
    #> 
    #> Call:
    #> lm(formula = Anxiety ~ Revise, data = exam.anxiety[exam.anxiety$Gender == 
    #>     "Female", ])
    #> 
    #> Residuals:
    #>     Min      1Q  Median      3Q     Max 
    #> -22.687  -6.263  -1.204   4.197  38.628 
    #> 
    #> Coefficients:
    #>             Estimate Std. Error t value Pr(>|t|)    
    #> (Intercept) 91.94181    2.27858   40.35  < 2e-16 ***
    #> Revise      -0.82380    0.08173  -10.08 1.54e-13 ***
    #> ---
    #> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
    #> 
    #> Residual standard error: 10.42 on 49 degrees of freedom
    #> Multiple R-squared:  0.6746, Adjusted R-squared:  0.668 
    #> F-statistic: 101.6 on 1 and 49 DF,  p-value: 1.544e-13
    
    ## Correlation ##
    cor(exam.anxiety[exam.anxiety$Gender=="Male",c("Exam","Anxiety","Revise")])
    #>               Exam    Anxiety     Revise
    #> Exam     1.0000000 -0.5056874  0.3593981
    #> Anxiety -0.5056874  1.0000000 -0.5973682
    #> Revise   0.3593981 -0.5973682  1.0000000
    cor(exam.anxiety[exam.anxiety$Gender == "Female", c("Exam","Anxiety","Revise")])
    #>               Exam    Anxiety     Revise
    #> Exam     1.0000000 -0.3813845  0.4399865
    #> Anxiety -0.3813845  1.0000000 -0.8213698
    #> Revise   0.4399865 -0.8213698  1.0000000
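
    cor() only returns the coefficients; for a p-value and confidence interval on a single pair, cor.test() can be used, sketched here for anxiety versus revision time in males:

    ## Significance test for one correlation (males: Anxiety vs Revise) ##
    cor.test(~ Anxiety + Revise, data = exam.anxiety[exam.anxiety$Gender == "Male", ])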
    
    ## Let's filter out the misbehaving students ##
    exam.anxiety.filtered <- exam.anxiety[c(-78,-87),]
    
    ## Model without the outlier/influential case ##
    fit.male2 <- lm(Anxiety~Revise, data=exam.anxiety.filtered[exam.anxiety.filtered$Gender=="Male",])
    summary(fit.male2)
    #> 
    #> Call:
    #> lm(formula = Anxiety ~ Revise, data = exam.anxiety.filtered[exam.anxiety.filtered$Gender == 
    #>     "Male", ])
    #> 
    #> Residuals:
    #>      Min       1Q   Median       3Q      Max 
    #> -22.0296  -3.8704   0.5626   6.0786  14.2525 
    #> 
    #> Coefficients:
    #>             Estimate Std. Error t value Pr(>|t|)    
    #> (Intercept) 86.97461    1.64755  52.790  < 2e-16 ***
    #> Revise      -0.60752    0.06326  -9.603 7.59e-13 ***
    #> ---
    #> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
    #> 
    #> Residual standard error: 8.213 on 49 degrees of freedom
    #> Multiple R-squared:  0.653,  Adjusted R-squared:  0.6459 
    #> F-statistic: 92.22 on 1 and 49 DF,  p-value: 7.591e-13
    
    fit.female2 <- lm(Anxiety~Revise, data=exam.anxiety.filtered[exam.anxiety.filtered$Gender=="Female",])
    summary(fit.female2)
    #> 
    #> Call:
    #> lm(formula = Anxiety ~ Revise, data = exam.anxiety.filtered[exam.anxiety.filtered$Gender == 
    #>     "Female", ])
    #> 
    #> Residuals:
    #>      Min       1Q   Median       3Q      Max 
    #> -18.7518  -5.7069  -0.7782   3.2117  18.5538 
    #> 
    #> Coefficients:
    #>             Estimate Std. Error t value Pr(>|t|)    
    #> (Intercept) 92.24536    1.93591   47.65   <2e-16 ***
    #> Revise      -0.87504    0.07033  -12.44   <2e-16 ***
    #> ---
    #> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
    #> 
    #> Residual standard error: 8.849 on 48 degrees of freedom
    #> Multiple R-squared:  0.7633, Adjusted R-squared:  0.7584 
    #> F-statistic: 154.8 on 1 and 48 DF,  p-value: < 2.2e-16
    
    ## New correlation ##
    cor(exam.anxiety.filtered[exam.anxiety.filtered$Gender=="Male",c("Exam","Anxiety","Revise")])
    #>               Exam    Anxiety     Revise
    #> Exam     1.0000000 -0.4653914  0.4028863
    #> Anxiety -0.4653914  1.0000000 -0.8080950
    #> Revise   0.4028863 -0.8080950  1.0000000
    cor(exam.anxiety.filtered[exam.anxiety.filtered$Gender=="Female",c("Exam","Anxiety","Revise")])
    #>               Exam    Anxiety     Revise
    #> Exam     1.0000000 -0.4070663  0.4312691
    #> Anxiety -0.4070663  1.0000000 -0.8736684
    #> Revise   0.4312691 -0.8736684  1.0000000
    
    ## New plot ##
    #dev.off()
    plot(exam.anxiety$Anxiety~exam.anxiety$Revise,col=exam.anxiety$Gender,pch=16)
    
    ## Add legend  ##
    legend("topright", title="Gender", inset=.05, c("Female","Male"), horiz=TRUE, pch=16,col=1:2)
    
    ## Lines of best fit ##
    abline((fit.male), col="red")
    abline((fit.female), col="black")
    abline((fit.male2), col="red",lty=3)
    abline((fit.female2), col="black",lty=3)
    
