R统计
安装加载必要的R包
options(BioC_mirror="https://mirrors.tuna.tsinghua.edu.cn/bioconductor/")
options("repos" = c(CRAN="http://mirrors.cloud.tencent.com/CRAN/"))
options(download.file.method = 'libcurl')
options(url.method='libcurl')
#install.packages(c("beanplot", "plotrix", "pastecs", "reshape2"))
library(beanplot)
library(plotrix)
#> Warning: package 'plotrix' was built under R version 3.6.2
library(pastecs)
library(reshape2)
#> Warning: package 'reshape2' was built under R version 3.6.2
数据类型定义
QUALITATIVE DATA
## power analysis ##
power.prop.test(p1 = .25, p2 = .7, power = .8, sig.level = 0.05)
#>
#> Two-sample comparison of proportions power calculation
#>
#> n = 18.10585
#> p1 = 0.25
#> p2 = 0.7
#> sig.level = 0.05
#> power = 0.8
#> alternative = two.sided
#>
#> NOTE: n is number in *each* group
## Load the data ##
cats.data<-read.table("cats.dat", sep="\t",header=T)
## Look at the data ##
head(cats.data)
#> Training Dance
#> 1 Food as Reward Yes
#> 2 Food as Reward Yes
#> 3 Food as Reward Yes
#> 4 Food as Reward Yes
#> 5 Food as Reward Yes
#> 6 Food as Reward Yes
#View(cats.data)
## Plot the data and contingency table ##
plot(cats.data$Training, cats.data$Dance, xlab = "Training", ylab = "Dance")
image.png
plot(table(cats.data))
image.png
contingency.table <- table(cats.data)
contingency.table100<-prop.table(contingency.table,1)
contingency.table100<-round(contingency.table100*100)
contingency.table100
#> Dance
#> Training No Yes
#> Affection as Reward 70 30
#> Food as Reward 26 74
contingency.table100<-cbind(contingency.table100[,"Yes"],contingency.table100[,"No"])
colnames(contingency.table100) <- c("Yes", "No")
contingency.table100
#> Yes No
#> Affection as Reward 30 70
#> Food as Reward 74 26
barplot(t(contingency.table100),
legend.text = TRUE,
ylab = "Percentages",
las=1)
image.png
#### Pretty ###
par(lwd=2)
barplot(t(contingency.table100),
col=c("chartreuse3","lemonchiffon2"),
cex.axis=1.2,
cex.names=1.5,
cex.lab=1.5,
ylab = "Percentages",
las=1
)
legend("topright",
title="Dancing",
inset=.05,
c("Yes","No"),
horiz=TRUE,
pch=15,
col=c("chartreuse3","lemonchiffon2"))
image.png
dev.off()
#> null device
#> 1
QUANTITATIVE DATA
卡方检验,Fishers检验
## Chi2 ##
chisq.test(contingency.table)
#>
#> Pearson's Chi-squared test with Yates' continuity correction
#>
#> data: contingency.table
#> X-squared = 23.52, df = 1, p-value = 1.236e-06
## Chi2 without continuity correction #
chisq.test(contingency.table, correct=F)
#>
#> Pearson's Chi-squared test
#>
#> data: contingency.table
#> X-squared = 25.356, df = 1, p-value = 4.767e-07
## Fishers test ##
fisher.test(contingency.table)
#>
#> Fisher's Exact Test for Count Data
#>
#> data: contingency.table
#> p-value = 1.312e-06
#> alternative hypothesis: true odds ratio is not equal to 1
#> 95 percent confidence interval:
#> 2.837773 16.429686
#> sample estimates:
#> odds ratio
#> 6.579265
T检验
## Import data ##
coyote<-read.csv("coyote.csv")
#View(coyote)
head(coyote)
#> length gender
#> 1 93.0 female
#> 2 97.0 female
#> 3 92.0 female
#> 4 101.6 female
#> 5 93.0 female
#> 6 84.5 female
## Power calculation #
#power.t.test(n = , d = , sig.level = , power = , type = c("two.sample", "one.sample", "paired"))
# mean 1 = 92 and mean 2 = 87.4
# sd = 7
delta <- 92 - 87.4
power.t.test(delta = 92 - 87.4, sd = 7, sig.level = 0.05, power = 0.8)
#>
#> Two-sample t test power calculation
#>
#> n = 37.33624
#> delta = 4.6
#> sd = 7
#> sig.level = 0.05
#> power = 0.8
#> alternative = two.sided
#>
#> NOTE: n is number in *each* group
Graphical exploration of the data
## Stripchart ##
stripchart(coyote$length~coyote$gender)
image.png
stripchart(coyote$length~coyote$gender,vertical=TRUE, method="jitter")
image.png
stripchart(coyote$length~coyote$gender,
vertical=TRUE,
method="jitter",
las=1,
ylab="Length",
pch=16,
col=c("darkorange","purple"),
cex=1.5
)
# Add group means #
length.means <- tapply(coyote$length,coyote$gender,mean)
segments(1:2-0.15,length.means, 1:2+0.15, length.means, col="black", lwd=3)
image.png
## Boxplot or beanplot # beanplot package ##
boxplot(coyote$length~coyote$gender,notch = TRUE)
image.png
# Prettier:
boxplot(coyote$length~coyote$gender, col=c("orange","purple"), ylab = 'Length (cm)', las = 1)
image.png
## Beanplot ##
beanplot(coyote$length~coyote$gender,las=1,ylab="Length (cm)")
image.png
## Combination boxplot and scatterplot ##
boxplot(coyote$length~coyote$gender,
lwd = 2,
ylab = 'Length',
cex.axis=1.5,
las=1,
cex.lab=1.5)
stripchart(coyote$length~coyote$gender,
vertical = TRUE,
method = "jitter",
pch = 20,
col = 'red',
cex=2,
add = TRUE)
image.png
## Combination boxplot and beanplot ##
beanplot(coyote$length~coyote$gender,
las=1, overallline = "median",
ylab = 'Length',
cex.lab=1.5,
col="bisque",
what = c(0,1,1,0),
cex.axis=1.5)
boxplot(coyote$length~coyote$gender,
add=TRUE,
col=rgb(0.2,0.5,0.3, alpha=0.5),
pch = 20,
cex=2,
lwd=2,
yaxt='n',
xaxt='n')
image.png
## Histogram ##
par(mfrow=c(1,2))
hist(coyote[coyote$gender=="male",]$length)
hist(coyote[coyote$gender=="female",]$length)
image.png
# Prettier:
hist(coyote[coyote$gender=="male",]$length,main="Male", xlab="Length",col="lightgreen")
hist(coyote[coyote$gender=="female",]$length,main="Female", xlab="Length",col="tomato1")
image.png
### Stats summary of test for normality (1st assumption) ## pastecs package ###
#by(coyote$length,coyote$gender, stat.desc, basic=F, desc=F, norm=T)#
tapply(coyote$length,coyote$gender, stat.desc, basic=F, desc=F, norm=T)
#> $female
#> skewness skew.2SE kurtosis kurt.2SE normtest.W normtest.p
#> -0.5290403 -0.7320175 0.5029741 0.3546892 0.9700101 0.3164448
#>
#> $male
#> skewness skew.2SE kurtosis kurt.2SE normtest.W normtest.p
#> -0.08453876 -0.11697379 -0.67756124 -0.47780521 0.98445697 0.81898309
### Homogeneity of variance (test 2nd assumption)
bartlett.test(coyote$length~coyote$gender)
#>
#> Bartlett test of homogeneity of variances
#>
#> data: coyote$length by coyote$gender
#> Bartlett's K-squared = 0.02021, df = 1, p-value = 0.887
###### Independent t-test ###########
t.test(coyote$length~coyote$gender, var.equal=T)
#>
#> Two Sample t-test
#>
#> data: coyote$length by coyote$gender
#> t = -1.6411, df = 84, p-value = 0.1045
#> alternative hypothesis: true difference in means is not equal to 0
#> 95 percent confidence interval:
#> -5.184747 0.496375
#> sample estimates:
#> mean in group female mean in group male
#> 89.71163 92.05581
## Plot a bar chart ##
dev.off()
#> null device
#> 1
bar.length<-barplot(length.means)
bar.length<-barplot(length.means,xlim=c(0,1),width=0.3, ylim=c(0,100))
# Prettier:
bar.length<-barplot(length.means, col=c("darkslategray1","darkseagreen1"), xlim=c(0,1),width=0.3,ylim=c(50,100), ylab="Mean length",las=1, xpd=FALSE)
## Calculation of SEM for error bars # plotrix package ##
length.se<-tapply(coyote$length,coyote$gender,std.error)
length.se
#> female male
#> 0.9988377 1.0211241
# Alternative #
length.se<-tapply(coyote$length,coyote$gender,function(x) sd(x)/sqrt(length(x)))
## Barplot returns the values for the centers of the bars ##
bar.length
#> [,1]
#> [1,] 0.21
#> [2,] 0.57
# Adding error bars #
arrows(x0=bar.length, y0=length.means-length.se, x1=bar.length, y1=length.means+length.se, length=0.3, angle=90,code=3)
###### Independent t-test ###########
t.test(coyote$length~coyote$gender, var.equal=T)
#>
#> Two Sample t-test
#>
#> data: coyote$length by coyote$gender
#> t = -1.6411, df = 84, p-value = 0.1045
#> alternative hypothesis: true difference in means is not equal to 0
#> 95 percent confidence interval:
#> -5.184747 0.496375
#> sample estimates:
#> mean in group female mean in group male
#> 89.71163 92.05581
###### Paired t-test ###########
working.memory<-read.csv("Working.memory.csv", header=T)
head(working.memory)
#> Subject placebo DA.depletion
#> 1 M1 9 7
#> 2 M2 10 7
#> 3 M3 15 10
#> 4 M4 18 12
#> 5 M5 19 13
#> 6 M6 22 15
### Graphical exploration of the data ###
## Boxplot ##
boxplot(working.memory[2:3])
beanplot(working.memory[2:3],las=1,ylab="Scores")
## Stripchart ##
stripchart(working.memory[2:3])
stripchart(working.memory[2:3],vertical=TRUE, method="jitter", las=1, ylab="Scores", pch=16,col=c("darkred","darkgreen"), cex=1.5)
# Add group means #
scores.means <- colMeans(working.memory[2:3])
scores.means
#> placebo DA.depletion
#> 27.26667 18.86667
segments(1:2-0.15,scores.means, 1:2+0.15, scores.means, col="black", lwd=3)
## Check assumption for normality ##
stat.desc(working.memory$placebo, basic=F, desc=F, norm=T)
#> skewness skew.2SE kurtosis kurt.2SE normtest.W normtest.p
#> 0.3338775 0.2877662 -1.0318530 -0.4602800 0.9591353 0.6773687
stat.desc(working.memory$DA.depletion, basic=F, desc=F, norm=T)
#> skewness skew.2SE kurtosis kurt.2SE normtest.W normtest.p
#> 0.4334375 0.3735762 -0.9792473 -0.4368141 0.9427404 0.4180816
## Running paired t-test ##
t.test(working.memory$placebo, working.memory$DA.depletion, paired=T)
#>
#> Paired t-test
#>
#> data: working.memory$placebo and working.memory$DA.depletion
#> t = 8.6161, df = 14, p-value = 5.715e-07
#> alternative hypothesis: true difference in means is not equal to 0
#> 95 percent confidence interval:
#> 6.308997 10.491003
#> sample estimates:
#> mean of the differences
#> 8.4
## Plot the difference ##
working.memory$Difference<- working.memory$placebo-working.memory$DA.depletion
head(working.memory)
#> Subject placebo DA.depletion Difference
#> 1 M1 9 7 2
#> 2 M2 10 7 3
#> 3 M3 15 10 5
#> 4 M4 18 12 6
#> 5 M5 19 13 6
#> 6 M6 22 15 7
stat.desc(working.memory$Difference, basic=F, desc=F, norm=T)
#> skewness skew.2SE kurtosis kurt.2SE normtest.W normtest.p
#> 0.08857017 0.07633789 -1.04670910 -0.46690688 0.97726706 0.94740752
## Difference as stripchart ##
stripchart(working.memory$Difference)
stripchart(working.memory$Difference,vertical=TRUE, method="jitter", las=1, ylab="Differences",pch=16, col="blue", cex=2)
diff.mean <- mean(working.memory$Difference)
loc.strip<-1
segments(loc.strip-0.15,diff.mean, loc.strip+0.15, diff.mean, col="black", lwd=3)
diff.se <- std.error(working.memory$Difference) # plotrix package #
## Confidence intervals ##
lower<-diff.mean-1.96*diff.se
upper<-diff.mean+1.96*diff.se
## Plot confidence intervals on a scatterplot ##
arrows(x0=loc.strip, y0=lower, x1=loc.strip, y1=upper, length=0.3, angle=90,code=3,lwd=3)
## Plot beanplots ##
beanplot(working.memory$Difference,las=1,ylab="Differences")
## One-sample t-test ##
t.test(working.memory$Difference ,mu=0)
#>
#> One Sample t-test
#>
#> data: working.memory$Difference
#> t = 8.6161, df = 14, p-value = 5.715e-07
#> alternative hypothesis: true mean is not equal to 0
#> 95 percent confidence interval:
#> 6.308997 10.491003
#> sample estimates:
#> mean of x
#> 8.4
ANOVA
# Import and plot dataset #
protein<-read.csv("protein.expression.csv",header=T)
head(protein)
#> A B C D E
#> 1 0.40 0.26 0.24 1.04 0.74
#> 2 1.50 0.47 0.25 2.78 0.99
#> 3 0.98 0.42 1.01 0.82 1.26
#> 4 0.33 0.64 0.77 1.65 1.50
#> 5 0.75 0.32 0.47 0.49 0.30
#> 6 1.48 0.65 0.47 0.97 0.34
protein.stack<-melt(protein) ## reshape2 package ## this will generate a warning because we do not specify the ID ##
colnames(protein.stack)<-c("line","expression")
protein.stack.clean <- protein.stack[!is.na(protein.stack$expression),]
head(protein.stack.clean)
#> line expression
#> 1 A 0.40
#> 2 A 1.50
#> 3 A 0.98
#> 4 A 0.33
#> 5 A 0.75
#> 6 A 1.48
## stripchart ##
stripchart(protein.stack.clean$expression~protein.stack.clean$line,
vertical=TRUE,
method="jitter",
las=1,
ylab="Protein Expression",
pch=16,
col=rainbow(5))
expression.means<-tapply(protein.stack.clean$expression,protein.stack.clean$line,mean)
segments(1:5-0.15,expression.means, 1:5+0.15, expression.means, col="black", lwd=3)
image.png
## boxplot, beanplot and normality test ##
boxplot(protein.stack.clean$expression~protein.stack.clean$line)
image.png
boxplot(protein.stack.clean$expression~protein.stack.clean$line,col=rainbow(5), las=1,ylab="Protein Expression")
image.png
beanplot(protein.stack.clean$expression~protein.stack.clean$line,overallline = "median",log="",ylab="Protein Expression", las=1) ## beanplot package ##
image.png
tapply(protein.stack.clean$expression,protein.stack.clean$line,stat.desc,desc=F, basic=F, norm=T) ## pastecs package ##
#> $A
#> skewness skew.2SE kurtosis kurt.2SE normtest.W normtest.p
#> -0.02086018 -0.01636601 -1.16300793 -0.47190556 0.92956709 0.37554602
#>
#> $B
#> skewness skew.2SE kurtosis kurt.2SE normtest.W normtest.p
#> 0.11228032 0.08809036 -1.38388312 -0.56152854 0.95351440 0.68878672
#>
#> $C
#> skewness skew.2SE kurtosis kurt.2SE normtest.W normtest.p
#> 1.277108966 1.190715643 0.565595525 0.272498653 0.819684006 0.002921089
#>
#> $D
#> skewness skew.2SE kurtosis kurt.2SE normtest.W normtest.p
#> 1.9465885553 1.8149065614 3.6022084100 1.7355104443 0.7530719928 0.0003548725
#>
#> $E
#> skewness skew.2SE kurtosis kurt.2SE normtest.W normtest.p
#> 0.4569725 0.4260594 -0.4885993 -0.2354026 0.9670693 0.7411281
## stripchart of log transformed data ##
stripchart(protein.stack.clean$expression~protein.stack.clean$line,
vertical=TRUE,
method="jitter",
las=1,
ylab="Protein Expression",
pch=16,
col=rainbow(5)
)
expression.log.means<-tapply(protein.stack.clean$expression,protein.stack.clean$line,mean)
segments(1:5-0.15,expression.log.means,1:5+0.15, expression.log.means, col="black", lwd=3)
image.png
beanplot(protein.stack.clean$expression~protein.stack.clean$line,overallline = "median",ylab="Protein Expression") ## beanplot package ##
image.png
# log transform y and test again #
protein.stack.clean$log10.expression<-log10(protein.stack.clean$expression)
head(protein.stack.clean)
#> line expression log10.expression
#> 1 A 0.40 -0.397940009
#> 2 A 1.50 0.176091259
#> 3 A 0.98 -0.008773924
#> 4 A 0.33 -0.481486060
#> 5 A 0.75 -0.124938737
#> 6 A 1.48 0.170261715
tapply(protein.stack.clean$log10.expression,protein.stack.clean$line,stat.desc,basic=F, norm=T, desc=F)
#> $A
#> skewness skew.2SE kurtosis kurt.2SE normtest.W normtest.p
#> -0.61963191 -0.48613680 -1.28185977 -0.52013124 0.85424637 0.04143953
#>
#> $B
#> skewness skew.2SE kurtosis kurt.2SE normtest.W normtest.p
#> -0.3231837 -0.2535561 -1.2210293 -0.4954485 0.9458450 0.5772532
#>
#> $C
#> skewness skew.2SE kurtosis kurt.2SE normtest.W normtest.p
#> 0.1851959 0.1726679 -1.0600052 -0.5107006 0.9657060 0.7141796
#>
#> $D
#> skewness skew.2SE kurtosis kurt.2SE normtest.W normtest.p
#> 0.3345489 0.3119174 -0.4044918 -0.1948804 0.9868425 0.9934883
#>
#> $E
#> skewness skew.2SE kurtosis kurt.2SE normtest.W normtest.p
#> -0.7155948 -0.6671865 -0.3741176 -0.1802464 0.9313425 0.2050270
## boxplot log transformed data ##
boxplot(protein.stack.clean$log10.expression~protein.stack.clean$line,col=rainbow(5))
image.png
# equal variance (2nd assumption) #
bartlett.test(protein.stack.clean$log10.expression~protein.stack.clean$line)
#>
#> Bartlett test of homogeneity of variances
#>
#> data: protein.stack.clean$log10.expression by protein.stack.clean$line
#> Bartlett's K-squared = 5.8261, df = 4, p-value = 0.2125
#run the anova
anova.log.protein<-aov(log10.expression~line,data=protein.stack.clean)
summary(anova.log.protein)
#> Df Sum Sq Mean Sq F value Pr(>F)
#> line 4 2.691 0.6728 8.123 1.78e-05 ***
#> Residuals 73 6.046 0.0828
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#par(mfrow=c(2,2))
#plot(anova.log.protein)
## Bonferroni ##
pairwise.t.test(protein.stack.clean$log10.expression,protein.stack.clean$line, p.adj = "bonf")
#>
#> Pairwise comparisons using t tests with pooled SD
#>
#> data: protein.stack.clean$log10.expression and protein.stack.clean$line
#>
#> A B C D
#> B 0.3655 - - -
#> C 1.0000 1.0000 - -
#> D 0.0571 1.9e-05 0.0017 -
#> E 1.0000 0.0062 0.3318 0.7675
#>
#> P value adjustment method: bonferroni
## Tukey ##
TukeyHSD(anova.log.protein)
#> Tukey multiple comparisons of means
#> 95% family-wise confidence level
#>
#> Fit: aov(formula = log10.expression ~ line, data = protein.stack.clean)
#>
#> $line
#> diff lwr upr p adj
#> B-A -0.25024832 -0.578882494 0.07838585 0.2187264
#> C-A -0.07499724 -0.374997820 0.22500335 0.9560187
#> D-A 0.30549397 0.005493391 0.60549456 0.0438762
#> E-A 0.13327517 -0.166725416 0.43327575 0.7265567
#> C-B 0.17525108 -0.124749499 0.47525167 0.4809387
#> D-B 0.55574230 0.255741712 0.85574288 0.0000183
#> E-B 0.38352349 0.083522904 0.68352407 0.0054767
#> D-C 0.38049121 0.112162532 0.64881989 0.0015431
#> E-C 0.20827240 -0.060056276 0.47660108 0.2023355
#> E-D -0.17221881 -0.440547487 0.09610987 0.3841989
## as a bar plot ##
bar.expression<-barplot(expression.means, beside=TRUE, ylab="Mean expression",ylim=c(0,3),las=1)
expression.se <- tapply(protein.stack.clean$expression, protein.stack.clean$line,std.error)
arrows(x0=bar.expression, y0=expression.means-expression.se, x1=bar.expression, y1=expression.means+expression.se, length=0.2, angle=90,code=3)
image.png
Correlation
#dev.off()
## Import and plot correlation data ##
exam.anxiety<-read.table("exam.anxiety.dat", sep="\t",header=T)
head(exam.anxiety)
#> Code Revise Exam Anxiety Gender
#> 1 1 4 40 86.298 Male
#> 2 2 11 65 88.716 Female
#> 3 3 27 80 70.178 Male
#> 4 4 53 80 61.312 Male
#> 5 5 4 40 89.522 Male
#> 6 6 22 70 60.506 Female
## Plot the data ##
plot(exam.anxiety$Anxiety~exam.anxiety$Revise,col=exam.anxiety$Gender,pch=16)
## Add legend ##
legend("topright", title="Gender",inset=.05, c("Female","Male"), horiz=TRUE, pch=16,col=1:2)
# also works:
legend("topright", title="Gender",inset=.05, c("Female","Male"), horiz=TRUE, pch=16,col=c("black","red"))
## Fit the model ##
fit.male <- lm(Anxiety~Revise, data=exam.anxiety[exam.anxiety$Gender=="Male",])
fit.female <- lm(Anxiety~Revise, data=exam.anxiety[exam.anxiety$Gender=="Female",])
## Add a line of best-fit ##
abline((fit.male), col="red")
abline((fit.female), col="black")
image.png
## Check the assumptions ##
par(mfrow=c(2,2))
plot(fit.male)
image.png
plot(fit.female)
image.png
## Test the significance of the model ##
summary(fit.male)
#>
#> Call:
#> lm(formula = Anxiety ~ Revise, data = exam.anxiety[exam.anxiety$Gender ==
#> "Male", ])
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -73.124 -2.900 2.221 6.750 16.600
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 84.1941 2.6213 32.119 < 2e-16 ***
#> Revise -0.5353 0.1016 -5.267 2.94e-06 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 13.3 on 50 degrees of freedom
#> Multiple R-squared: 0.3568, Adjusted R-squared: 0.344
#> F-statistic: 27.74 on 1 and 50 DF, p-value: 2.937e-06
summary(fit.female)
#>
#> Call:
#> lm(formula = Anxiety ~ Revise, data = exam.anxiety[exam.anxiety$Gender ==
#> "Female", ])
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -22.687 -6.263 -1.204 4.197 38.628
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 91.94181 2.27858 40.35 < 2e-16 ***
#> Revise -0.82380 0.08173 -10.08 1.54e-13 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 10.42 on 49 degrees of freedom
#> Multiple R-squared: 0.6746, Adjusted R-squared: 0.668
#> F-statistic: 101.6 on 1 and 49 DF, p-value: 1.544e-13
## Correlation ##
cor(exam.anxiety[exam.anxiety$Gender=="Male",c("Exam","Anxiety","Revise")])
#> Exam Anxiety Revise
#> Exam 1.0000000 -0.5056874 0.3593981
#> Anxiety -0.5056874 1.0000000 -0.5973682
#> Revise 0.3593981 -0.5973682 1.0000000
cor(exam.anxiety[exam.anxiety$Gender == "Female", c("Exam","Anxiety","Revise")])
#> Exam Anxiety Revise
#> Exam 1.0000000 -0.3813845 0.4399865
#> Anxiety -0.3813845 1.0000000 -0.8213698
#> Revise 0.4399865 -0.8213698 1.0000000
## Let's filter out the misbehaving students ##
exam.anxiety.filtered <- exam.anxiety[c(-78,-87),]
## Model without the outlier/influential case ##
fit.male2 <- lm(Anxiety~Revise, data=exam.anxiety.filtered[exam.anxiety.filtered$Gender=="Male",])
summary(fit.male2)
#>
#> Call:
#> lm(formula = Anxiety ~ Revise, data = exam.anxiety.filtered[exam.anxiety.filtered$Gender ==
#> "Male", ])
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -22.0296 -3.8704 0.5626 6.0786 14.2525
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 86.97461 1.64755 52.790 < 2e-16 ***
#> Revise -0.60752 0.06326 -9.603 7.59e-13 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 8.213 on 49 degrees of freedom
#> Multiple R-squared: 0.653, Adjusted R-squared: 0.6459
#> F-statistic: 92.22 on 1 and 49 DF, p-value: 7.591e-13
fit.female2 <- lm(Anxiety~Revise, data=exam.anxiety.filtered[exam.anxiety.filtered$Gender=="Female",])
summary(fit.female2)
#>
#> Call:
#> lm(formula = Anxiety ~ Revise, data = exam.anxiety.filtered[exam.anxiety.filtered$Gender ==
#> "Female", ])
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -18.7518 -5.7069 -0.7782 3.2117 18.5538
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 92.24536 1.93591 47.65 <2e-16 ***
#> Revise -0.87504 0.07033 -12.44 <2e-16 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 8.849 on 48 degrees of freedom
#> Multiple R-squared: 0.7633, Adjusted R-squared: 0.7584
#> F-statistic: 154.8 on 1 and 48 DF, p-value: < 2.2e-16
## New correlation ##
cor(exam.anxiety.filtered[exam.anxiety.filtered$Gender=="Male",c("Exam","Anxiety","Revise")])
#> Exam Anxiety Revise
#> Exam 1.0000000 -0.4653914 0.4028863
#> Anxiety -0.4653914 1.0000000 -0.8080950
#> Revise 0.4028863 -0.8080950 1.0000000
cor(exam.anxiety.filtered[exam.anxiety.filtered$Gender=="Female",c("Exam","Anxiety","Revise")])
#> Exam Anxiety Revise
#> Exam 1.0000000 -0.4070663 0.4312691
#> Anxiety -0.4070663 1.0000000 -0.8736684
#> Revise 0.4312691 -0.8736684 1.0000000
## New plot ##
#dev.off()
plot(exam.anxiety$Anxiety~exam.anxiety$Revise,col=exam.anxiety$Gender,pch=16)
## Add legend ##
legend("topright", title="Gender", inset=.05, c("Female","Male"), horiz=TRUE, pch=16,col=1:2)
## Lines of best fit ##
abline((fit.male), col="red")
abline((fit.female), col="black")
abline((fit.male2), col="red",lty=3)
abline((fit.female2), col="black",lty=3)
image.png
网友评论