### 统计与R讲座的演示程序。 ########### 演示用的数据 ############ ## 19个学生的身高体重数据。 d.class <- structure(list(name = structure(c(2L, 3L, 5L, 10L, 11L, 12L, 15L, 16L, 17L, 1L, 4L, 6L, 7L, 8L, 9L, 13L, 14L, 18L, 19L), .Label = c("Alfred", "Alice", "Becka", "Duke", "Gail", "Guido", "James", "Jeffrey", "John", "Karen", "Kathy", "Mary", "Philip", "Robert", "Sandy", "Sharon", "Tammy", "Thomas", "William"), class = "factor"), sex = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("F", "M"), class = "factor"), age = c(13L, 13L, 14L, 12L, 12L, 15L, 11L, 15L, 14L, 14L, 14L, 15L, 12L, 13L, 12L, 16L, 12L, 11L, 15L), height = c(56.5, 65.3, 64.3, 56.3, 59.8, 66.5, 51.3, 62.5, 62.8, 69, 63.5, 67, 57.3, 62.5, 59, 72, 64.8, 57.5, 66.5), weight = c(84, 98, 90, 77, 84.5, 112, 50.5, 112.5, 102.5, 112.5, 102.5, 133, 83, 84, 99.5, 150, 128, 85, 112)), .Names = c("name", "sex", "age", "height", "weight"), class = "data.frame", row.names = c(NA, -19L)) ## 100个学生的语文、数学、英语、物理、化学、生物成绩。主成份和因子分析。 d.scores <- structure(list(学号 = c("225040819", "210050718", "226170534", "226191045", "229120703", "225102012", "210060331", "226090835", "225180415", "225072225", "228192837", "228160809", "226071147", "226191019", "228032508", "205093429", "228121617", "210070122", "226190523", "205154816", "205021825", "228221122", "203050439", "205253820", "228211614", "225052428", "210070332", "203102326", "228200918", "226020307", "205134827", "226121101", "229120707", "205134418", "225092127", "228171508", "221060722", "226080741", "229080931", "228020803", "205034506", "203041406", "226080420", "205063410", "225172323", "225040827", "205134308", "203131832", "205173806", "226190443", "228212122", "226141416", "203020227", "228051109", "205053908", "228070939", "210111213", "229081026", "203021502", "226171716", "225080604", "226081301", "205241919", "203010233", "210070218", "228122620", "225122710", "225051018", "226081020", "226020322", "203071407", "225031614", "203060719", "228182104", "221020221", "205172627", "229091404", "228072710", "205011714", "226191139", "226151322", "205172106", "203060550", "203161626", "203061014", "205233330", "203211013", "205084213", "203112725", "217040329", "225040817", "203121211", "228032313", "225020208", "203142140", "226020335", "205223328", "210091321", "225102218", "225071708"), 语文 = c(97, 88, 93, 81, 89, 98, 82, 97, 93, 84, 84, 105, 92, 88, 85, 75, 87, 77, 94, 83, 92, 103, 84, 88, 94, 79, 80, 92, 87, 102, 73, 85, 75, 87, 79, 103, 80, 86, 80, 107, 76, 93, 79, 83, 94, 93, 84, 83, 83, 81, 81, 100, 94, 88, 79, 99, 72, 98, 73, 89, 96, 98, 92, 98, 93, 91, 67, 95, 92, 88, 91, 84, 103, 91, 87, 98, 82, 92, 96, 94, 91, 87, 98, 0, 89, 77, 96, 83, 38, 78, 102, 80, 110, 96, 77, 103, 94, 70, 74, 80), 数学 = c(81, 31, 96, 95, 75, 76, 40, 107, 98, 62, 25, 90, 116, 102, 0, 79, 90, 57, 116, 50, 111, 105, 102, 96, 80, 28, 45, 69, 87, 100, 58, 90, 101, 45, 44, 98, 5, 116, 65, 107, 56, 66, 98, 77, 42, 93, 60, 82, 55, 107, 86, 93, 102, 102, 80, 96, 15, 40, 88, 73, 114, 104, 81, 112, 55, 76, 28, 89, 103, 109, 56, 76, 70, 89, 52, 91, 20, 77, 68, 91, 98, 96, 82, 0, 85, 68, 105, 59, 0, 52, 89, 106, 0, 110, 58, 118, 77, 15, 65, 72), 英语 = c(78.5, 33, 70.5, 74, 60.5, 65.5, 61, 71, 91, 72, 52.5, 82.5, 78, 65, 0, 47.5, 63.5, 41, 70.5, 52, 74, 90.5, 81, 49.5, 91.5, 48, 63.5, 38.5, 82, 82, 71.5, 80, 55.5, 42, 62.5, 71, 46, 94.5, 55.5, 101, 53.5, 72.5, 74, 59.5, 64.5, 79, 37, 58, 43, 70, 58, 71, 97, 84, 44, 95, 28.5, 59, 48, 64.5, 94, 62.5, 72, 90.5, 73.5, 66, 58.5, 85, 79, 71.5, 64.5, 70, 85, 82.5, 79, 70, 57.5, 68.5, 81, 64.5, 75.5, 57, 94.5, 0, 48.5, 29, 56.5, 57, 60.5, 51.5, 82, 57, 0, 90.5, 52, 79.5, 75.5, 28.5, 49, 82), 物理 = c(49, 34, 33, 3, 40, 50, 51, 22, 22, 49, 15, 43, 11, 23, 21, 89, 36, 61, 20, 44, 15, 58, 20, 46, 36, 36, 39, 59, 17, 21, 57, 34, 36, 43, 45, 42, 28, 29, 61, 29, 19, 67, 34, 48, 58, 32, 66, 24, 27, 17, 40, 0, 9, 27, 26, 14, 15, 56, 72, 82, 39, 23, 44, 19, 53, 30, 34, 52, 38, 65, 57, 26, 54, 53, 15, 17, 17, 61, 6, 21, 36, 43, 49, 21, 56, 49, 50, 63, 26, 43, 46, 9, 39, 20, 29, 16, 67, 33, 43, 14 ), 化学 = c(56, 24, 57, 18, 17, 13, 51, 19, 22, 57, 14, 49, 32, 20, 24, 92, 46, 56, 28, 33, 31, 68, 12, 62, 70, 39, 18, 69, 8, 25, 50, 20, 18, 48, 49, 40, 32, 47, 84, 68, 16, 60, 53, 47, 47, 25, 32, 35, 22, 22, 32, 0, 25, 39, 59, 31, 18, 56, 59, 85, 24, 26, 22, 28, 74, 35, 28, 39, 55, 67, 63, 31, 53, 75, 23, 42, 40, 49, 16, 44, 41, 52, 71, 19, 18, 72, 50, 87, 14, 81, 40, 14, 9, 63, 35, 28, 80, 22, 46, 8), 生物 = c(42, 28, 59, 16, 42, 43, 59, 44, 44, 60, 9, 60, 26, 7, 7, 84, 54, 46, 41, 63, 35, 58, 4, 32, 66, 53, 44, 61, 31, 30, 60, 20, 27, 67, 46, 80, 20, 46, 63, 44, 22, 64, 51, 60, 70, 30, 65, 47, 44, 29, 59, 0, 27, 51, 71, 65, 27, 21, 65, 79, 29, 17, 35, 26, 56, 40, 22, 58, 61, 67, 57, 21, 51, 46, 11, 48, 36, 56, 17, 52, 64, 51, 72, 30, 54, 36, 61, 60, 42, 69, 67, 27, 34, 44, 49, 22, 69, 33, 60, 11)), .Names = c("学号", "语文", "数学", "英语", "物理", "化学", "生物"), row.names = c("225040819", "210050718", "226170534", "226191045", "229120703", "225102012", "210060331", "226090835", "225180415", "225072225", "228192837", "228160809", "226071147", "226191019", "228032508", "205093429", "228121617", "210070122", "226190523", "205154816", "205021825", "228221122", "203050439", "205253820", "228211614", "225052428", "210070332", "203102326", "228200918", "226020307", "205134827", "226121101", "229120707", "205134418", "225092127", "228171508", "221060722", "226080741", "229080931", "228020803", "205034506", "203041406", "226080420", "205063410", "225172323", "225040827", "205134308", "203131832", "205173806", "226190443", "228212122", "226141416", "203020227", "228051109", "205053908", "228070939", "210111213", "229081026", "203021502", "226171716", "225080604", "226081301", "205241919", "203010233", "210070218", "228122620", "225122710", "225051018", "226081020", "226020322", "203071407", "225031614", "203060719", "228182104", "221020221", "205172627", "229091404", "228072710", "205011714", "226191139", "226151322", "205172106", "203060550", "203161626", "203061014", "205233330", "203211013", "205084213", "203112725", "217040329", "225040817", "203121211", "228032313", "225020208", "203142140", "226020335", "205223328", "210091321", "225102218", "225071708"), class = "data.frame") ## 癌症病人康复的logistic回归。 d.remiss <- structure(list(remiss = c(1L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 0L), cell = c(0.8, 0.9, 0.8, 1, 0.9, 1, 0.95, 0.95, 1, 0.95, 0.85, 0.7, 0.8, 0.2, 1, 1, 0.65, 1, 0.5, 1, 1, 0.9, 1, 0.95, 1, 1, 1), smear = c(0.83, 0.36, 0.88, 0.87, 0.75, 0.65, 0.97, 0.87, 0.45, 0.36, 0.39, 0.76, 0.46, 0.39, 0.9, 0.84, 0.42, 0.75, 0.44, 0.63, 0.33, 0.93, 0.58, 0.32, 0.6, 0.69, 0.73), infil = c(0.66, 0.32, 0.7, 0.87, 0.68, 0.65, 0.92, 0.83, 0.45, 0.34, 0.33, 0.53, 0.37, 0.08, 0.9, 0.84, 0.27, 0.75, 0.22, 0.63, 0.33, 0.84, 0.58, 0.3, 0.6, 0.69, 0.73), li = c(1.9, 1.4, 0.8, 0.7, 1.3, 0.6, 1, 1.9, 0.8, 0.5, 0.7, 1.2, 0.4, 0.8, 1.1, 1.9, 0.5, 1, 0.6, 1.1, 0.4, 0.6, 1, 1.6, 1.7, 0.9, 0.7), blast = c(1.1, 0.74, 0.176, 1.053, 0.519, 0.519, 1.23, 1.354, 0.322, 0, 0.279, 0.146, 0.38, 0.114, 1.037, 2.064, 0.114, 1.322, 0.114, 1.072, 0.176, 1.591, 0.531, 0.886, 0.964, 0.398, 0.398), temp = c(0.996, 0.992, 0.982, 0.986, 0.98, 0.982, 0.992, 1.02, 0.999, 1.038, 0.988, 0.982, 1.006, 0.99, 0.99, 1.02, 1.014, 1.004, 0.99, 0.986, 1.01, 1.02, 1.002, 0.988, 0.99, 0.986, 0.986)), .Names = c("remiss", "cell", "smear", "infil", "li", "blast", "temp"), class = "data.frame", row.names = c(NA, -27L)) ## 工作性质与工作满意度关系。典型相关分析。 d.jobs <- structure(list(Career = c(72L, 63L, 96L, 96L, 84L, 66L, 31L, 45L, 42L, 79L, 39L, 54L, 60L, 63L), Supervis = c(26L, 76L, 31L, 98L, 94L, 10L, 40L, 14L, 18L, 74L, 12L, 35L, 75L, 45L), Finance = c(9L, 7L, 7L, 6L, 6L, 5L, 9L, 2L, 6L, 4L, 2L, 3L, 5L, 5L), Variety = c(10L, 85L, 83L, 82L, 36L, 28L, 64L, 19L, 33L, 23L, 37L, 23L, 45L, 22L ), Feedback = c(11L, 22L, 63L, 75L, 77L, 24L, 23L, 15L, 13L, 14L, 13L, 74L, 58L, 67L), Autonomy = c(70L, 93L, 73L, 97L, 97L, 75L, 75L, 50L, 70L, 90L, 70L, 53L, 83L, 53L)), .Names = c("Career", "Supervis", "Finance", "Variety", "Feedback", "Autonomy"), class = "data.frame", row.names = c(NA, -14L)) ## 北京地区1949--1964年洪水受灾面积与成灾面积数据。时间序列。 d.flood <- matrix(c( 1949 , 1 , 331.12 , 243.96 , 1950 , 2 , 380.44 , 293.90 , 1951 , 3 , 59.63 , 59.63, 1952 , 4 , 37.89 , 18.09, 1953 , 5 , 103.66 ,72.92, 1954 , 6 , 316.67 , 244.57, 1955 , 7 , 208.72 , 155.77, 1956 , 8 , 288.79 , 255.22, 1957 , 9 , 25.00 , 0.50, 1958 , 10 , 84.72 , 48.59, 1959 , 11 , 260.89 ,202.96, 1960, 12 , 27.18 ,15.02, 1961, 13 , 20.74 ,17.09, 1962 , 14 , 52.99 ,14.66, 1963 , 15 , 99.25 , 45.61, 1964 , 16 , 55.36 ,41.90), byrow=T, ncol=4, dimnames=list(1949:1964, c("year", "t", "area1", "area2"))) ########### 第一讲 ################# #################################### ### R初步演示。 x1 <- 0:100 x2 <- x1 * 2 * pi / 100 y1 <- sin(x2) plot(x2, y1, type="l") abline(h=0, lwd=2) abline(v=(0:4)/2*pi, lty=3, col="gray") y2 <- cos(x2) lines(x2, y2, lty=2, col="green") demo("graphics") demo("image") cl <- read.csv("class.csv", header=TRUE) summary(cl) mean(cl$height) var(cl$height) lm1 <- lm(weight ~ height + age + sex, data=cl) print(summary(lm1)) lm2 <- step(lm1, weight ~ height + age + sex, data=cl) sink("myres.txt", split=TRUE) print(A) print(Ai) sink() ### R向量演示。 marks <- c(10, 6, 4, 7, 8) x <- c(1:3, 10:13) x1 <- c(1, 2) x2 <- c(3, 4) x <- c(x1, x2) x x <- c(1, 10) x + 2 x - 2 x * 2 x / 2 x ^ 2 2 / x 2 ^x x1 <- c(1, 10) x2 <- c(4, 2) x1 + x2 x1 - x2 x1 * x2 x1 / x2 x1 <- c(1, 10) x2 <- c(1, 3, 5, 7) x1 + x2 x1 * x2 sort(c(3, 5, 1)) cl[order(cl$height),] ages <- c("李明"=30, "张聪"=25, "刘颖"=28) ages <- c(30, 25, 28) names(ages) <- c("李明", "张聪", "刘颖") ### R矩阵演示。 A <- matrix(1:6, nrow=3, ncol=2) B <- matrix(1,-1, 1,1, nrow=2, ncol=2, byrow=TRUE) C1 <- A %*% B C2 <- A + 2 C3 <- A / 2 colnames(A) <- c("x", "y") A[,"y"] ### R函数演示。 fsub <- function(x, y=0){ cat("x=", x, " y=", y, "\n") x - y } fsub(5,3) fsub(5) fsub(x=5, y=3) fsub(y=3, x=5) ### R探索性数据分析演示。 x <- cl$sex print(x) table(x) table(x)/length(x) barplot(table(x)) x <- rnorm(30, 10, 2) summary(x) hist(x) boxplot(x) qqnorm(x);qqline(x) x <- rexp(30) summary(x) hist(x) boxplot(x) qqnorm(x);qqline(x) ########### 第二讲 ################# #################################### ### 用optim求正态MLE的演示。 objf <- function(theta, x){ mu <- theta[1] s2 <- exp(theta[2]) n <- length(x) res <- n*log(s2) + 1/s2*sum((x - mu)^2) res } sim <- function(n=30){ mu0 <- 20 sigma0 <- 2 x <- rnorm(n, mu0, sigma0) theta0 <- c(0,0) ores <- optim(theta0, objf, x=x) print(ores) theta <- ores$par mu <- theta[1] sigma <- exp(0.5*theta[2]) cat("mu: ", mu, " ==> ", mu0, "\n") cat("sigma: ", sigma, " ==> ", sigma0, "\n") } ### 用optimize求正态MLE的演示。 sim1 <- function(n=30){ mu0 <- 20 sigma0 <- 2 x <- rnorm(n, mu0, sigma0) mu <- mean(x) ss <- sum((x - mu)^2)/length(x) objf <- function(delta,ss) log(delta) + 1/delta*ss ores <- optimize(objf, lower=0.0001, upper=1000, ss=ss) delta <- ores$minimum sigma <- sqrt(delta) print(ores) cat("mu: ", mu, " ==> ", mu0, "\n") cat("sigma: ", sigma, " ==> ", sigma0, "\n") } ### R置信区间演示。 x <- rnorm(100,10,3) mu <- mean(x) sig <- sd(x) p = pnorm(15,mu,sig) - pnorm(5,mu,sig) cat("Prob in [5,15] = ", p, "\n") ### 用t.test求正态均值置信区间。 x <- c(11.67, 9.29, 10.45, 9.01, 12.67, 16.24, 11.64, 7.73, 12.23) t.test(x, conf.level=0.95) ### 单总体双侧均值t检验。 x <- c(490, 506, 508, 502, 498, 511, 510, 515, 512) t.test(x, mu=500, side="two-sided") ### 单总体右侧方差卡方检验。 x <- c(42,65,75,78,59,71,57,68,54,55) n <- length(x) chi2 <- (n-1)*var(x)/80 p <- 1-pchisq(chi2,n-1) cat("Chi-squared = ", chi2, " p-value = ", p, "\n") ### 两样本t检验。 x <- c(20.5, 19.8, 19.7, 20.4, 20.1, 20.0, 19.0, 19.9) y <- c(20.7, 19.8, 19.5, 20.8, 20.4, 19.6, 20.2) t.test(x, y) ### 方差比较。 var.test(x, y) ### 成对t检验。 x <- c(20.5, 18.8, 19.8, 20.9, 21.5, 19.5, 21.0, 21.2) y <- c(17.7, 20.3, 20.0, 18.8, 19.0, 20.1, 20.0, 19.1) t.test(x, y, paired=TRUE) ### 单总体比例检验。 ### 12次中5次成功,与40\%比较。 binom.test(5, 12, 0.4) prop.test(5, 12, 0.4) ### 单总体比例检验。 ### 120次中35次成功,与25\%比较。 prop.test(35, 120, 0.25) ### 两个比例的比较。102中成功23和135中成功25的比例比较。 prop.test(c(23,25), c(102,135)) ########### 第三讲 方差分析 ######## #################################### ## Wilcoxon秩和检验。 x <- c(20.5, 19.8, 19.7, 20.4, 20.1, 20.0, 19.0, 19.9) y <- c(20.7, 19.8, 19.5, 20.8, 20.4, 19.6, 20.2) wilcox.test(x, y) ## 符号检验。 x <- c(20.5, 18.8, 19.8, 20.9, 21.5, 19.5, 21.0, 21.2) y <- c(17.7, 20.3, 20.0, 18.8, 19.0, 20.1, 20.0, 19.1) t.test(x, y, paired=TRUE) z <- x - y binom.test(sum(z>0), sum(z != 0), p=0.5) ## Wilcoxon 符号秩检验 wilcox.test(x, y, paired=TRUE) ## 单总体拟合优度卡方检验。 x <- c(18, 13, 17, 21, 15, 16) p <- rep(1/6, 6) chisq.test(x, p) ## 独立性卡方检验 tab <- matrix(c(60, 3, 32, 11), nrow=2, ncol=2, byrow=TRUE, dimnames=list(c("病人", "健康人"), c("吸烟", "不吸烟"))) chisq.test(tab) ## 单因素方差分析 A <- factor(rep(1:5, each=4)) y <- c(25.6, 22.2, 28.0, 29.8, 24.4, 30.0, 29.0, 27.5, 25.0, 27.7, 23.0, 32.2, 28.8, 28.0, 31.5, 25.9, 20.6, 21.2, 22.0, 21.2) d <- data.frame(A, y) plot(y ~ A, data=d) summary(aov(y ~ A, data=d)) tapply(d$y, d$A, mean) ## 控制单次比较错误率的多重比较 pairwise.t.test(y, A, p.adjust="none") ## 控制总错误率的多重比较 pairwise.t.test(y, A) ## 控制错误发现率的多重比较 pairwise.t.test(y, A, p.adjust="fdr") ## 用Tukey同时置信区间做多重比较。 TukeyHSD(aov(y ~ A, data=d)) ## Bartlett方差齐性检验 bartlett.test(y ~ A, data=d) ## Levene方差齐性检验 require(car) leveneTest(y ~ A, data=d) ## 方差不相等情形下单因素方差分析的Welch检验 oneway.test(y ~ A, data=d) ## 非参数单因素方差分析的Kruskal-Wallis检验 kruskal.test(y ~ A, data=d) ## 两因素方差分析 rats <- data.frame( y = c(0.31, 0.45, 0.46, 0.43, # (1,1) 0.82, 1.10, 0.88, 0.72, # (1,2) 0.43, 0.45, 0.63, 0.76, # (1,3) 0.45, 0.71, 0.66, 0.62, # (1,4) 0.36, 0.29, 0.40, 0.23, # (2,1) 0.92, 0.61, 0.49, 1.24, # (2,2) 0.44, 0.35, 0.31, 0.40, # (2,3) 0.56, 1.02, 0.71, 0.38, # (2,4) 0.22, 0.21, 0.18, 0.23, # (3,1) 0.30, 0.37, 0.38, 0.29, # (3,2) 0.23, 0.25, 0.24, 0.22, # (3,3) 0.30, 0.36, 0.31, 0.33), # (3,4) Toxicant=factor(rep(1:3, each=4*4)), Cure=factor(rep(rep(1:4, each=4), 3))) ## 比较各组的盒形图 opar <- par(mfrow=c(1,2)) plot(y ~ Toxicant + Cure, data=rats) par(opar) ## 带交互作用的两因素方差分析 res <- aov(y ~ Toxicant + Cure + Toxicant:Cure, data=rats) summary(res) ## 无交互作用的两因素方差分析 res2 <- aov(y ~ Toxicant + Cure, data=rats) summary(res2) tapply(rats$y, rats$Toxicant, mean) tapply(rats$y, rats$Cure, mean) ## 协方差分析, 用HH包。 require(HH) ancova(y ~ A + x, data=d) ## 协方差分析,用lm函数。 anova(lm(y ~ A + x, data=d)) ## 三个三水平因素的正交设计。烟灰砖试验 d <- data.frame( A = factor(rep(1:3, each=3)), B = factor(rep(1:3, 3)), C = factor(c(1,2,3, 2,3,1, 3,1,2)), y = c(16.9, 19.1, 16.7, 19.8, 23.7, 19.0, 25.0, 20.4, 23.1)) ## 画图比较 opar <- par(mfrow=c(1,3)) plot(y ~ A + B + C, data=d) par(opar) ## 方差分析 summary(aov(y ~ A + B + C, data=d)) ############ 第四讲 统计模型 ################# ############################################## ### 相关与回归 ### ## 生成线性回归模拟数据 nsamp <- 30 x <- runif(nsamp, -10, 10) y <- 20 + 0.5*x + rnorm(nsamp,0,0.5) plot(x, y) ## 生成二次曲线回归模拟数据 y2 <- 0.5*x^2 + rnorm(nsamp,0,2) plot(x, y2) ## 生成指数函数回归模拟数据 y3 <- exp(0.2*(x+10)) + rnorm(nsamp,0,2) plot(x, y3) ## 生成对数函数回归模拟数据 y4 <- log(10*(x+12)) + rnorm(nsamp,0,0.1) plot(x, y4) ## 计算相关系数并检验:线性相关例子 cor(x, y) cor.test(x, y) ## 计算相关系数并检验:二次曲线相关例子 cor(x, y2) cor.test(x, y2) ## 模拟数据的一元回归 plot(x, y) abline(lm(y ~ x), col="red", lwd=2) res <- lm(y ~ x); res summary(res) ## 19个学生身高体重的回归。 d <- read.csv("class.csv", header=TRUE) lm1 <- lm(weight ~ height, data=d); summary(lm1) plot(weight ~ height, data=d) abline(lm1, col="red", lwd=2) plot(lm1) ## 样条回归例子。 nsamp <- 30 x <- runif(nsamp, -10, 10) x <- sort(x) y <- 10*sin(x/10*pi)^2 + rnorm(nsamp,0,0.2) plot(x, y) require(splines) res <- smooth.spline(x, y) lines(spline(x, res$y), col="red") ## 体重对身高和年龄的多元回归。 lm2 <- lm(weight ~ height + age, data=d) summary(lm2) ## 只对年龄回归。 lm3 <- lm(weight ~ age, data=d) summary(lm3) ## 逐步回归。 lm4 <- step(lm(weight ~ height + age + sex, data=d)) summary(lm4) ## 以因子为自变量的回归。 lm5 <- lm(weight ~ height + sex, data=d) summary(lm5) ## 癌症病人康复概率的logistic回归。 remiss <- read.csv("remiss.csv", header=TRUE) res1 <- glm(remiss ~ cell+smear+infil+li +blast+temp, family=binomial, data=remiss) summary(res1) ## 逐步剔除不重要的自变量。 res2 <- glm(remiss ~ cell+smear+infil+li +temp, family=binomial, data=remiss) summary(res2) res3 <- glm(remiss ~ cell+infil+li +temp, family=binomial, data=remiss) summary(res3) res4 <- glm(remiss ~ cell+li+temp, family=binomial, data=remiss) summary(res4) ## step()逐步回归。 ress <- step(glm(remiss ~ cell+smear+infil+li +blast+temp, family=binomial, data=remiss)) summary(ress) ### 多元分析 ### ### 主成份分析。 ## 模拟例子 nsamp <- 30; s <- 1 p01 <- runif(nsamp, -10, 10) p02 <- runif(nsamp, 0, 5) x1 <- 0.5*p01 + 0.1*p02 + rnorm(nsamp, 0, s) x2 <- 0.1*p01 - 0.5*p02 + rnorm(nsamp, 0, s) x3 <- 0.5*p01 - 0.1*p02 + rnorm(nsamp, 0, s) x4 <- -0.1*p01 + 0.5*p02 + rnorm(nsamp, 0, s) ## 用协方差阵。 res1 <- princomp(~x1+x2+x3+x4, cor=FALSE, scores=TRUE) summary(res1) ## 用相关阵。 res2 <- princomp(~x1+x2+x3+x4, cor=TRUE) summary(res2) ## 计算主成份得分。 pr1 <- predict(res1) plot(d[,1], pr1[,1]) ### 因子分析。 ## 100个学生的语文、数学、英语、物理、化学、生物成绩。 scores <- read.csv("scores.csv", header=TRUE) dd <- scores[,2:7] res <- factanal(dd, factors=2) res ### 判别分析。 ## 鸢尾花数据的Fisher判别分析。 data(iris) plot(Sepal.Length ~ Species, data=iris) require(MASS) res <- lda(Species ~ Sepal.Length+Sepal.Width +Petal.Length+Petal.Width, data=iris) res ## 对比判别效果。 pr <- predict(res)$class table(iris[,"Species"], pr) ### 聚类分析。 ## 鸢尾花数据的聚类分析。 data(iris) ## 计算距离。 di <- dist(iris[,1:4], method="euclidean") print(as.matrix(di)[1:10,1:10]) ## 谱系聚类。 res <- hclust(di, method="complete") plot(res, labels=FALSE) ## 在结果树形图中切分出三个类。 rect.hclust(res, k=3) ## 得到分三个类的分类结果。 clus <- cutree(res, k=3) ## 与真实类对比。 table(iris[,"Species"], clus) ### 典型相关分析 ## 鸢尾花数据花萼长、宽与花瓣长、宽的典型相关。 data(iris) res <- cancor(iris[,1:2], iris[,3:4]) print(res) ## 工作性质与工作满意度的典型相关。 jobs <- read.csv("jobs.csv", header=TRUE) res <- cancor(jobs[,1:3], jobs[,4:6]) print(res) ### 时间序列分析 ### ## 北京地区1949-1964年洪灾受灾和成灾面积数据,年数据。 print(d.flood) flood.area1 <- ts(d.flood[,"area1"], frequency=1, start=1949) flood.area2 <- ts(d.flood[,"area2"], frequency=1, start=1949) plot(flood.area1) lines(flood.area2, col="red", lty=2) ## 自相关系数函数ACF。 acf(flood.area1) ## 居民用煤消耗季度数据,从1991年到1996年。 coal.consumption <- ts(c( 6878.4 , 5343.7 , 4847.9 , 6421.9 , 6815.4 , 5532.6 , 4745.6 , 6406.2 , 6634.4 , 5658.5 , 4674.8 , 6445.5 , 7130.2 , 5532.6 , 4989.6 , 6642.3 , 7413.5 , 5863.1 , 4997.4 , 6776.1 , 7476.5 , 5965.5 , 5202.1 , 6894.1 ), frequency=4, start=c(1991,1)) plot(coal.consumption) ## 用car包中的durbin.watson()函数检验洪灾受灾面积是否白噪声。 require(car) durbinWatsonTest(lm(flood.area1 ~ 1)) ## 用Box.test()函数检验洪灾受灾面积是否白噪声。 require(ts) Box.test(flood.area1) Box.test(flood.area1, type="Ljung-Box") ## 用tseries包中的adf.test进行Dickey-Fuller单位根检验。 require(tseries) adf.test(flood.area1) adf.test(coal.consumption) ### AR 模型 ## coal.consumption的AR模型。 pacf(coal.consumption) res <- arma(coal.consumption, order=c(3,0,0)) summary(res) ## 拟合 pr <- fitted(res) plot(coal.consumption) lines(pr, col="red", lty=2) ############ 第五讲 质量控制与管理 ############ ############################################## ## dpmo 计算公式。设公差为设计值加减k*sigma, ## 生产过程均值距离设计值a*sigma。 ## dpmo = pnorm(a-k) aarr <- seq(0, 2, by=0.25) karr <- seq(3, 6, by=0.5) dpmo.tab <- outer(aarr, karr, function(a, k) pnorm(a-k)*1E6) rownames(dpmo.tab) <- format(aarr) colnames(dpmo.tab) <- format(karr) for(ii in seq(nrow(dpmo.tab))) for(jj in seq(ncol(dpmo.tab))){ a <- dpmo.tab[ii,jj] if(a>=10) a <- round(a) else if(a>=1) a <- round(a,1) else if(a>=0.1) a <- round(a,2) else a <- round(a,3) dpmo.tab[ii,jj] <- a } print(dpmo.tab) ## 运行图的例子。 demo.run <- function(use.pdf=FALSE){ if(use.pdf){ pdf("run-chart.pdf", width=10, height=4) on.exit(dev.off()) } x <- c(5, -7, -5, -7, 4, -2, 9, 8, -5, 1, -1, 6, -6, -1, -0.5, 2, -1) mu <- mean(x) plot(seq(along=x), x, type="p", pch=16, xlab="Time", ylab="Measurement") lines(seq(along=x), x) abline(h=mu, lwd=2) } demo.run() ## 控制图的例子。 demo.control <- function(use.pdf=FALSE){ if(use.pdf){ pdf("control-chart.pdf", width=10, height=4) on.exit(dev.off()) } UCL <- 10; LCL <- -10 mu <- mean(x) x <- c(5, -7, -5, -7, 4, -2, 9, 8, -5, 1, -1, 6, -6, -1, -0.5, 2, -1) plot(seq(along=x), x, type="p", pch=16, ylim=range(c(LCL, UCL, x)), xlab="Time", ylab="Measurement") lines(seq(along=x), x) abline(h=mu, lwd=2) abline(h=c(UCL, LCL), lty=3) } demo.control() ## 离散变量的直方图。 x <- sample(1:3, 100, replace=TRUE, prob=c(0.5, 0.3, 0.2)) barplot(table(x)) ## 连续变量的直方图。 x <- exp(rnorm(100)) hist(x) ## qcc质量控制包。pistonrings数据。 require(qcc) data(pistonrings) attach(pistonrings) print(pistonrings[1:8,]) ##直方图。 hist(diameter) ## 重排数据框使每个时间的5个观测在同一行。 diameter <- qcc.groups(diameter, sample) print(diameter[1:5,]) ## 均值的控制图。仅适用前25次观测。 obj <- qcc(diameter[1:25,], type="xbar") ## 过程能力分析。 process.capability(obj, spec.limits=c(73.95, 74.05)) ## 控制图示例。 obj <- qcc(diameter[1:25,], type="xbar") summary(obj) obj2 <- qcc(diameter[1:25,], type="xbar", newdata=diameter[26:40,]) ## 标准差控制图。 obj3 <- qcc(diameter[1:25,], type="S") obj4 <- qcc(diameter[1:25,], type="S", newdata=diameter[26:40,]) ## 计数值的控制图。 data(orangejuice) head(orangejuice) attach(orangejuice) obj <- qcc(D[trial], sizes=size[trial], type="p") summary(obj) obj2 <- qcc(D[trial], sizes=size[trial], type="p", newdata=D[!trial], newsizes=size[!trial])