美文网首页手机好文
第2章 无序多分类和有序多分类

第2章 无序多分类和有序多分类

作者: 养猪场小老板 | 来源:发表于2020-07-16 09:24 被阅读0次
    第2章
    image.png
    image.png 代码案例
    数据形式
    image.png
    #第 2章 代码开始
    > ## 利用 nnet 包中的函数multinom ,建立多元logistic回归模型:
    > library(foreign)
    > library(nnet)
    > library(ggplot2)
    > library(reshape2)
    > ml <- read.dta("hsbdemo.dta")#读取.dta格式数据
    > #View(ml)#查看数据的具体形式
    > with(ml, table(ses, prog))#ses行和prog列的表格
            prog
    ses      general academic vocation
      low         16       19       12
      middle      20       44       31
      high         9       42        7
    > #with在数据框上的操作
    > #do.call(操作,数据)
    > with(ml, do.call(rbind, tapply(write, prog, function(x) c(M = mean(x), SD = sd(x)))))
                    M       SD
    general  51.33333 9.397775
    academic 56.25714 7.943343
    vocation 46.76000 9.318754
    > ml$prog2 <- relevel(ml$prog, ref = "academic")#以academic作为参考
    > test <- multinom(prog2 ~ ses + write, data = ml)#回归
    # weights:  15 (8 variable)
    initial  value 219.722458 
    iter  10 value 179.982880
    final  value 179.981726 
    converged
    > summary(test)
    Call:
    multinom(formula = prog2 ~ ses + write, data = ml)
    
    Coefficients:
             (Intercept)  sesmiddle    seshigh      write
    general     2.852198 -0.5332810 -1.1628226 -0.0579287
    vocation    5.218260  0.2913859 -0.9826649 -0.1136037
    
    #行general 和vocation是指相对于academic的值,列sesmiddle   seshigh是指相对于seslow的值
    
    Std. Errors:
             (Intercept) sesmiddle   seshigh      write
    general     1.166441 0.4437323 0.5142196 0.02141097
    vocation    1.163552 0.4763739 0.5955665 0.02221996
    
    Residual Deviance: 359.9635 
    AIC: 375.9635 
    > # 2-tailed z test
    > z <- summary(test)$coefficients/summary(test)$standard.errors #Z值的计算
    > z
             (Intercept)  sesmiddle   seshigh     write
    general     2.445214 -1.2018081 -2.261334 -2.705562
    vocation    4.484769  0.6116747 -1.649967 -5.112689
    > p <- (1 - pnorm(abs(z), 0, 1)) * 2##P值的计算
    > p
              (Intercept) sesmiddle    seshigh        write
    general  0.0144766100 0.2294379 0.02373856 6.818902e-03
    vocation 0.0000072993 0.5407530 0.09894976 3.176045e-07
    #general 相对于academic,因素seshigh和write是有意义的
    #vocation  相对于academic,因素write是有意义的
    > # extract the coefficients from the model and exponentiate
    > exp(coef(test)) #OR值
             (Intercept) sesmiddle   seshigh     write
    general     17.32582 0.5866769 0.3126026 0.9437172
    vocation   184.61262 1.3382809 0.3743123 0.8926116
    #解释OR值,write每提高一个单位,产生general是产生academic的0.9437172倍
    #write每提高一个单位,产生vocation是产生academic的0.8926116倍
    #
    > head(pp <- fitted(test))
       academic   general  vocation
    1 0.1482764 0.3382454 0.5134781
    2 0.1202017 0.1806283 0.6991700
    3 0.4186747 0.2368082 0.3445171
    4 0.1726885 0.3508384 0.4764731
    5 0.1001231 0.1689374 0.7309395
    6 0.3533566 0.2377976 0.4088458
    
    > dses <- data.frame(ses = c("low", "middle", "high"), write = mean(ml$write))
    #这里把写作write取均值,即,每个人的写作贡献固定了,只有社会经济地位的变化了
    > predict(test, newdata = dses, "probs")#预测
       academic   general  vocation
    1 0.4396845 0.3581917 0.2021238
    2 0.4777488 0.2283353 0.2939159
    3 0.7009007 0.1784939 0.1206054
    > dwrite <- data.frame(ses = rep(c("low", "middle", "high"), each = 41), write = rep(c(30:70),3))
    > # store the predicted probabilities for each value of ses and write
    > pp.write <- cbind(dwrite, predict(test, newdata = dwrite, type = "probs", se = TRUE))
    > # calculate the mean probabilities within each level of ses
    > by(pp.write[, 3:5], pp.write$ses, colMeans)
    pp.write$ses: high
     academic   general  vocation 
    0.6164315 0.1808037 0.2027648 
    -------------------------------------------------------- 
    pp.write$ses: low
     academic   general  vocation 
    0.3972977 0.3278174 0.2748849 
    -------------------------------------------------------- 
    pp.write$ses: middle
     academic   general  vocation 
    0.4256198 0.2010864 0.3732938 
    > #melt data set to long for ggplot2
    > lpp <- melt(pp.write, id.vars = c("ses", "write"), value.name = "probability")
    > head(lpp)  # view first few rows
      ses write variable probability
    1 low    30 academic  0.09843588
    2 low    31 academic  0.10716868
    3 low    32 academic  0.11650390
    4 low    33 academic  0.12645834
    5 low    34 academic  0.13704576
    6 low    35 academic  0.14827643
    > # plot predicted probabilities across write values for each level of ses
    > # facetted by program type
    > ggplot(lpp, aes(x = write, y = probability, colour = ses)) + geom_line() + facet_grid(variable ~., scales = "free")
    
    
    
    > install.packages("mlogit")
    > library(Formula)
    > library(maxLik)
    > library(miscTools)
    > library(mlogit)
    > data("Fishing", package = "mlogit")
    > Fish <- mlogit.data(Fishing,shape = "wide",choice = "mode")
    > summary(mlogit(mode ~ 0|income, data = Fish))
    
    Call:
    mlogit(formula = mode ~ 0 | income, data = Fish, method = "nr")
    
    Frequencies of alternatives:choice
      beach    boat charter    pier 
    0.11337 0.35364 0.38240 0.15059 
    
    nr method
    4 iterations, 0h:0m:0s 
    g'(-H)^-1g = 8.32E-07 
    gradient close to zero 
    
    Coefficients :
                           Estimate  Std. Error z-value  Pr(>|z|)    
    (Intercept):boat     7.3892e-01  1.9673e-01  3.7560 0.0001727 ***
    (Intercept):charter  1.3413e+00  1.9452e-01  6.8955 5.367e-12 ***
    (Intercept):pier     8.1415e-01  2.2863e-01  3.5610 0.0003695 ***
    income:boat          9.1906e-05  4.0664e-05  2.2602 0.0238116 *  
    income:charter      -3.1640e-05  4.1846e-05 -0.7561 0.4495908    
    income:pier         -1.4340e-04  5.3288e-05 -2.6911 0.0071223 ** 
    ---
    Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
    
    Log-Likelihood: -1477.2
    McFadden R^2:  0.013736 
    Likelihood ratio test : chisq = 41.145 (p.value = 6.0931e-09)
    
    

    2.2章 有序分类logistic回归 即 等级资料

    有序分类logistic回归
    案例

    有可能把等级资料转换成二分类资料就转换,否则只能当作等级资料或多分类资料;


    image.png

    是否从学校毕业:不可能,有可能,极有可能;父母是否有学位:1表示有,0表示无;公立还是私立学校:1公里,0私立

    ## 程序包MASS提供polr()函数可以进行ordered logit或probit回归
    
    require(foreign)
    require(ggplot2)
    require(MASS)
    require(Hmisc)
    require(reshape2)
    
    dat <- read.dta("ologit.dta")
    #View(dat)
    head(dat)
    ## one at a time, table apply, pared, and public
    lapply(dat[, c("apply", "pared", "public")], table)
    ## three way cross tabs (xtabs) and flatten the table
    ftable(xtabs(~ public + apply + pared, data = dat))
    summary(dat$gpa)
    sd(dat$gpa)
    
    ggplot(dat, aes(x = apply, y = gpa)) +
      geom_boxplot(size = .75) +
      geom_jitter(alpha = .5) +
      facet_grid(pared ~ public, margins = TRUE) +
      theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1))
    
    ## fit ordered logit model and store results 'm'
    m <- polr(apply ~ pared + public + gpa, data = dat, Hess=TRUE)
    ## view a summary of the model
    summary(m)
    
    ## store table
    (ctable <- coef(summary(m)))
    ## calculate and store p values
    p <- pnorm(abs(ctable[, "t value"]), lower.tail = FALSE) * 2
    ## combined table
    (ctable <- cbind(ctable, "p value" = p))
    
    (ci <- confint(m)) # default method gives profiled CIs
    confint.default(m) # CIs assuming normality
    ## odds ratios
    exp(coef(m))
    ## OR and CI
    exp(cbind(OR = coef(m), ci))
    
    
    sf <- function(y) {
      c('Y>=1' = qlogis(mean(y >= 1)),
        'Y>=2' = qlogis(mean(y >= 2)),
        'Y>=3' = qlogis(mean(y >= 3)))
    }
    (s <- with(dat, summary(as.numeric(apply) ~ pared + public + gpa, fun=sf)))
    
    glm(I(as.numeric(apply) >= 2) ~ pared, family="binomial", data = dat)
    
    glm(I(as.numeric(apply) >= 3) ~ pared, family="binomial", data = dat)
    
    s[, 4] <- s[, 4] - s[, 3]
    s[, 3] <- s[, 3] - s[, 3]
    s # print
    
    plot(s, which=1:3, pch=1:3, xlab='logit', main=' ', xlim=range(s[,3:4]))
    
    newdat <- data.frame(
      pared = rep(0:1, 200),
      public = rep(0:1, each = 200),
      gpa = rep(seq(from = 1.9, to = 4, length.out = 100), 4))
    newdat <- cbind(newdat, predict(m, newdat, type = "probs"))
    
    ##show first few rows
    head(newdat)
    
    lnewdat <- melt(newdat, id.vars = c("pared", "public", "gpa"),
                    variable.name = "Level", value.name="Probability")
    ## view first few rows
    head(lnewdat)
    
    ggplot(lnewdat, aes(x = gpa, y = Probability, colour = Level)) +
      geom_line() + facet_grid(pared ~ public, labeller="label_both")
    

    相关文章

      网友评论

        本文标题:第2章 无序多分类和有序多分类

        本文链接:https://www.haomeiwen.com/subject/relshktx.html