什么是Shapiro-Wilk检验
Shapiro-Wilk检验用来检验小样本数据是否数据符合正态分布。类似于回归的方法一样,计算一个相关系数,它越接近1就越表明数据和正态分布拟合得越好。
-
构建检验统计量W
W检验统计量 -
建立原假设与备择假设
原假设为H0:数据集符合正态分布;备择假设H1:数据集不符合正态分布。 -
计算p值
非正态分布的小样本数据在检验时也可能出现较大的W值。因此需要通过模拟或者查表来估计其概率。由于原假设是其符合正态分布,如果p值小于所选择的α水平,则拒绝零假设,如果p值大于所选择的α水平,则表明没有证据拒绝原假设。
R 语言计算Shapiro-Wilk检验
> x <- 1:10
> shapiro.test(x)
shapiro.test(x)
Shapiro-Wilk normality test
data: x
W = 0.97016, p-value = 0.8924
在scala和java中没有直接使用的方法,本文在现有的其他语言基础上对该方法进行重构
import breeze.linalg.DenseMatrix
import breeze.numerics.sqrt
import breeze.stats.distributions.Gaussian
import scala.math.{log, pow, exp, asin}
// 均值
def mean(ds: Array[Double]): Double = {
ds.sum / (ds.length)
}
// K阶中心距 E(X-EX)^k vk
def centerDistance(ds: Array[Double], k: Int) = {
mean(ds.map(i => pow(i - mean(ds), k)))
}
// 峰度
def kurtosis(ds: Array[Double]) =
centerDistance(ds, 4) / pow(centerDistance(ds, 2), 2) - 3
def swtest(x: Array[Double]): (Double, Double) = {
val gaussian = new Gaussian(0, 1)
var W: Double = 0D
var pValue: Double = 0D
var count = 0
var phi = 0D
var mu = 0D
var sigma = 0D
var gam = 0D
var newSWstatistic = 0D
var NormalSWstatistic = 0D
val n = x.length
val mean = x.sum / n
val icdfArray = x.map(i => {
// quantile
gaussian.inverseCdf((i - 3d / 8) / (n + 0.25))
})
val vx: DenseMatrix[Double] = DenseMatrix.create(n, 1, x)
val vxt: DenseMatrix[Double] = DenseMatrix.create(n, 1, x.map(_ - mean))
val mtilde: DenseMatrix[Double] = DenseMatrix.create(n, 1, icdfArray)
var weights: DenseMatrix[Double] = DenseMatrix.zeros[Double](n, 1)
//////////////// kurtosis < 3 /////////////
// The Shapiro-Francia test is better for leptokurtic samples.
if (skewness(x) > 3) {
val che = sqrt(mtilde.t * mtilde)
weights = 1 / che(0, 0) * mtilde
W = pow((weights.t * vx).data(0), 2) / (vxt.t * vxt).data(0)
val nu = log(n)
val u1 = log(nu) - nu
val u2 = log(nu) + 2 / nu
mu = -1.2725 + (1.0521 * u1)
sigma = 1.0308 - (0.26758 * u2)
newSWstatistic = log(1 - W)
// Compute the normalized Shapiro-Francia statistic and its p-value.
val NormalSFstatistic: Double = (newSWstatistic - mu) / sigma
// the next p-value is for the tail = 1 test.
pValue = 1 - gaussian.cdf(NormalSFstatistic)
} else {
// % The Shapiro-Wilk test is better for platykurtic samples.
val c: DenseMatrix[Double] = 1 / sqrt(mtilde.t * mtilde).data(0) * mtilde
val u: Double = 1 / sqrt(n)
/* polynomial coefficients */
val g = Array(-2.273, 0.459)
val c1 = Array(c.data(n - 1), 0.221157, -0.147981, -2.07119, 4.434685, -2.706056)
val c2 = Array(c.data(n - 2), 0.042981, -0.293762,-1.752461, 5.682633,-3.582633)
val c3 = Array(0.544, -0.39978, 0.025054, -6.714e-4)
val c4 = Array(1.3822, -0.77857, 0.062767, -0.0020322)
val c5 = Array(-1.5861, -0.31082, -0.083751, 0.0038915)
val c6 = Array(-0.4803, -0.082676, 0.0030302)
def polyval(arr: Array[Double], x: Double) = {
arr.zipWithIndex
.map(tp => {
tp._1 * math.pow(x, tp._2)
})
.sum
}
weights.update(n - 1, 0, polyval(c1, u))
weights.update(0, 0, -polyval(c1, u))
// Special attention when n=3 (this is a special case).
if (n == 3) {
weights.update(0, 0, 0.707106781)
weights.update(n - 1, 0, -0.707106781)
} else if (n >= 6) {
weights.update(n - 2, 0, polyval(c2, u))
weights.update(1, 0, -polyval(c2, u))
count = 3
phi = ((mtilde.t * mtilde - 2 * pow(mtilde(n - 1, 0), 2) - 2 * pow(
mtilde(n - 2, 0),
2
)) /
(1 - 2 * pow(weights(n - 1, 0), 2) - 2 * pow(weights(n - 2, 0), 2)))
.data(0)
} else {
count = 2
phi = ((mtilde.t * mtilde - 2 * pow(mtilde(n - 1, 0), 2)) /
(1 - 2 * pow(weights(n - 1, 0), 2)))
.data(0)
}
// The vector 'WEIGHTS' obtained next corresponds to the same coefficients
// listed by Shapiro-Wilk in their original test for small samples.
for (i <- count - 1 to n - count) {
weights.update(i, 0, mtilde(i, 0) / math.sqrt(phi))
}
// The Shapiro-Wilk statistic W is calculated to avoid excessive rounding
// errors for W close to 1 (a potential problem in very large samples).
W = pow((weights.t * vx).data(0), 2) / (vxt.t * vxt).data(0)
// Calculate the significance level for W (exact for n=3).
val newn = log(n)
if (n > 3 && n <= 11) {
mu = polyval(c3, n)
sigma = exp(polyval(c4, n))
gam = polyval(g, n)
newSWstatistic = -log(gam - log(1 - W))
} else if (n >= 12) {
mu = polyval(c5, newn)
sigma = exp(polyval(c6, newn))
newSWstatistic = log(1 - W)
} else {
mu = 0
sigma = 1
newSWstatistic = 0
}
// Compute the normalized Shapiro-Wilk statistic and its p-value.
// Special attention when n=3 (this is a special case)
if (n == 3) {
pValue = 1.909859 * (asin(math.sqrt(W)) - 1.047198)
NormalSWstatistic = gaussian.inverseCdf(pValue)
} else {
NormalSWstatistic = (newSWstatistic - mu) / sigma
// % The next p-value is for the tail = 1 test.
pValue = 1 - gaussian.cdf(NormalSWstatistic)
}
}
(pValue, W)
}
// 方法调用
val x: Array[Double] = (1 to 10).map(_.toDouble).toArray
println(swtest(x))
// 返回p值和W统计量
(0.892367307523924,0.970164611230666)
以上代码重构于一份Matlab(Matlab真好用)代码以及参考自 github。
参考资料
https://github.com/rniwa/js-shapiro-wilk
http://blog.sina.com.cn/s/blog_403aa80a01019lwd.html
https://en.wikipedia.org/wiki/Shapiro%E2%80%93Wilk_test
网友评论