### 本科生《数理统计》的一些计算例子。 ### 节4.2的例2.1,合成纤维强度 fiber <- function(){ ## 自变量,拉伸倍数 x <- c(1.9, 2.0, 2.1, 2.5, 2.7, 2.7, 3.5, 3.5, 4.0, 4.0, 4.5, 4.6, 5.0, 5.2, 6.0, 6.3, 6.5, 7.1, 8.0, 8.0, 8.9, 9.0, 9.5, 10.0) ## 因变量,强度 y <- c(1.4, 1.3, 1.8, 2.5, 2.8, 2.5, 3.0, 2.7, 4.0, 3.5, 4.2, 3.5, 5.5, 5.0, 5.5, 6.4, 6.0, 5.3, 6.5, 7.0, 8.5, 8.0, 8.1, 8.1) plot(x, y) ## 直接用P.185公式计算。 n <- length(y) lyy <- sum((y - mean(y))^2) lxy <- sum((x - mean(x))*(y - mean(y))) lxx <- sum((x - mean(x))^2) b <- lxy / lxx a <- mean(y) - b*mean(x) pred <- a + b*x err <- y - pred Q <- sum(err^2) sigma <- sqrt(Q/(n-2)) U <- lyy - Q F <- U / (Q / (n-2)) pvalue <- 1 - pf(F, 1, n-2) cat("lyy=", lyy, " lxy=", lxy, " lxx=", lxx, "\n") cat("a=", a, " b=", b, " sigma=", sigma, "\n") cat("Q=", Q, " U=", U, "\n", " F=", F, " Pr>F =", pvalue, "\n") abline(a=a, b=b, col="green", lwd=3) ## 预测限。 xx <- seq(min(x), max(x), length=30) yy <- a + b*xx lambda <- qt(1 - 0.05/2, n-2) lb <- yy - lambda*sigma * sqrt(1 + 1/n + (xx - mean(x))^2/lxx) ub <- yy + lambda*sigma * sqrt(1 + 1/n + (xx - mean(x))^2/lxx) lines(xx, lb, col="cyan", lwd=2, lty=3) lines(xx, ub, col="cyan", lwd=2, lty=3) ## 一般用专用的回归函数计算: lm1 <- lm(y ~ x) cat("\n\n") print(summary(lm1)) ## 预测 yhat <- predict(lm1) ## 残差 ehat <- resid(lm1) ## 回归诊断 plot(lm1) } ### 节4.2的例2.2,哈勃定律。 Harbor <- function(){ d <- cbind(dist=c(22, 68, 108, 137, 255, 315, 390, 405, 685, 700, 1100), velo=c(0.75, 2.4, 3.2, 4.7, 9.3, 12.0, 13.4, 14.4, 24.5, 26.0, 38.0)) d <- as.data.frame(d) plot(d$dist, d$velo) lm1 <- lm(velo ~ -1 + dist, data=as.data.frame(d)) abline(lm1, col="red") print(summary(lm1)) browser() } ### 节4.2的例2.3,哺乳动物开始走动时间与开始玩耍时间 mammals <- function(){ ## 自变量,开始走动时间 d <- data.frame(x = c(360, 165, 21, 23, 11, 18, 18, 150), y = c(90, 105, 21, 26, 14, 28, 21, 105)) rownames(d) <- c("人", "大猩猩", "猫", "家犬", "挪威鼠", "乌鸫", "混血猕猴", "黑猩猩") plot(d$x, d$y, main="哺乳动物开始走动时间与开始玩耍时间", xlab="开始走动时间", ylab="开始玩耍时间", xlim=c(0, 400), ylim=c(0, 160)) text(d["人", "x"], d["人", "y"]+5, "人") text(d["黑猩猩", "x"]-30, d["黑猩猩", "y"], "黑猩猩") text(d["大猩猩", "x"]+30, d["大猩猩", "y"], "大猩猩") ## 线性回归 lm1 <- lm(y ~ x, data=d) abline(lm1, lty=2, col="cyan") ## 幂曲线 d$logx <- log(d$x) d$logy <- log(d$y) lm2 <- lm(logy ~ logx, data=d) yhat <- exp(predict(lm2)) lines(spline(d$x, yhat), lwd=2) cat("所有8种动物的线性回归:\n") print(summary(lm1)) cat("所有8种动物的幂曲线回归:\n") print(summary(lm2)) ## 去掉人、黑猩猩、大猩猩后的结果 locator(1) d <- d[-c(1, 2, 8),] plot(d$x, d$y, main="哺乳动物开始走动时间与开始玩耍时间\n(去掉三个灵长类)", xlab="开始走动时间", ylab="开始玩耍时间", xlim=c(0, 40), ylim=c(0, 40)) ## 线性回归 lm1 <- lm(y ~ x, data=d) abline(lm1, lty=2, col="cyan") ## 幂曲线 d$logx <- log(d$x) d$logy <- log(d$y) lm2 <- lm(logy ~ logx, data=d) cat("5种动物的线性回归:\n") print(summary(lm1)) cat("5种动物的幂曲线回归:\n") print(summary(lm2)) yhat <- exp(predict(lm2)) lines(spline(d$x, yhat), lwd=2) } ### 节4.3例3.2(P.207),温度随时间的变化 temp <- function(){ time <- seq(5, 55, by=5) temp <- c(99.2, 99.7, 99.9, 100.2, 100.3, 100.4, 100.4, 100.3, 100.0, 99.8, 99.4) plot(time, temp) x <- (time - 30)/5 y <- (temp - 99.0)*10 x2 <- x^2 x4 <- x^4 print(cbind("时间"=time, "x"=x, "温度"=temp, "y"=y, "x2"=x2, "x4"=x4)) ## 直接用矩阵计算 n <- length(y) X <- cbind(1, x, x2) cat("X矩阵:\n"); print(X) beta <- solve(crossprod(X), crossprod(X, y)) cat("回归系数:\n"); print(beta) Q <- sum((y - X %*% beta)^2) cat("Q = ", Q, "\n") sigma <- sqrt(Q/(n-3)) inv <- solve(crossprod(X)) varest <- sigma^2 * inv cat("系数协方差阵估计:\n") print(varest) } ### 节4.5例5.1(P.233). 汽车数与车祸数。 cars <- function(){ d <- data.frame(year=seq(1947, 1957), y = c(166, 153, 177, 201, 216, 208, 227, 238, 268, 268, 274), x = c(352, 373, 411, 441, 462, 490, 529, 577, 641, 692, 743)) lm1 <- lm(y ~ x, data=d) print(summary(lm1)) x0 <- 1000 y0 <- predict(lm1, newdata=list(x=x0)) cat("对x0=", x0, "时y的点预测=", y0, "\n") ## 预测区间。 n <- nrow(d) sigma <- summary(lm1)$sigma lxx <- sum((d$x-mean(d$x))^2) lambda <- qt(1 - 0.05/2, n-2) rad <- lambda*sigma*sqrt(1 + 1/n + (x0 - mean(d$x))^2/lxx) lb <- y0 - rad ub <- y0 + rad cat("预测区间半径:", rad, " 区间为:[", y0-rad, ", ", y0+rad, "]\n", sep="") ## 检验 x0 <- 800; y0 <- 270 yhat <- predict(lm1, newdata=list(x=x0)) tstat <- (y0 - yhat) / (sigma * sqrt(1 + 1/n + (x0-mean(d$x))^2/lxx)) pvalue <- 2*(1 - pt(abs(tstat), n-2)) cat("T统计量=", tstat, " p值=", pvalue, "\n") } ### 英国进口数据。因变量为进口指数,自变量X1为总产品指数,X2为价格指数。 import <- function(){ d <- data.frame(year=seq(1948, 1956), y=c(100, 106, 107, 120, 110, 116, 123, 133, 137), x1=c(100, 104, 106, 111, 111, 115, 120, 124, 126), x2=c(100, 99, 110, 126, 113, 103, 102, 103, 98)) n <- nrow(d) C <- cbind(d$x1, d$x2) X <- cbind(1, C) xbar <- apply(C, 2, mean) C.centered <- C - cbind(rep(1,n)) %*% rbind(xbar) L <- crossprod(C.centered) cat("L=\n"); print(L) Lxy <- crossprod(C.centered, d$y) cat("Lxy=\n"); print(Lxy) lyy <- sum((d$y - mean(d$y))^2) cat("lyy=", lyy, "\n") beta1 <- solve(L, Lxy) beta0 <- mean(d$y) - sum(beta1 * xbar) beta <- c(beta0, beta1) cat("回归系数:", beta, "\n") U <- sum(beta1 * Lxy) Q <- lyy - U cat("Q=", Q, " U=", U, "\n") ## 检验所有斜率项为0 F <- (U/2)/(Q/(n-3)) pvalue <- 1 - pf(F, 2, n-3) cat("F统计量=", F, " p值=", pvalue, "\n") ## 检验单个斜率项。 Linv <- solve(L) F1 <- beta1[1]^2 / (Linv[1,1] * Q / (n-3)) pv1 <- 1 - pf(F1, 1, n-3) cat("beta1=0检验F统计量=", F1, " p值=", pv1, "\n") F2 <- beta1[2]^2 / (Linv[2,2] * Q / (n-3)) pv2 <- 1 - pf(F2, 1, n-3) cat("beta2=0检验F统计量=", F2, " p值=", pv2, "\n") ## 使用R的专用函数计算: lm1 <- lm(y ~ x1 + x2, data=d) cat("\n\n使用R的lm()函数计算的结果:\n") print(summary(lm1)) } ### 例4.5,回归诊断。 anscombe <- function(){ x <- 1:8 y1 <- c(2.0, 0.5, 1.0, 3.5, 2.0, 4.5, 5.0, 3.5) y2 <- c(0.5, 1.0, 2.5, 2.0, 5.5, 3.0, 3.5, 4.0) plot(x, y1, xlab="x", ylab="y", main="两组数据", col="red", ylim=c(0, 6)) points(x, y2, col="blue", pch=2) lm1 <- lm(y1 ~ x) cat("第一组数据的回归结果:\n") print(summary(lm1)) cat("第二组数据的回归结果:\n") lm2 <- lm(y2 ~ x) print(summary(lm2)) resid1 <- rstudent(lm1) resid2 <- rstudent(lm2) locator(1) plot(x, resid2, xlab="x", ylab="y", main="两组残差", col="red") points(x, resid1, col="blue", pch=2) } ### 节4.5例5.6,放射性金元素衰变。 gold <- function(){ x <- c(1,1, 2,2,2, 3, 5, 6,6, 7) y <- c(94.5, 86.4, 71.0, 80.5, 81.4, 67.4, 49.3, 46.8, 42.3, 36.6) plot(x, y, main="放射性金元素衰变", xlab="天数", ylab="残留百分比") lm1 <- lm(y ~ x) cat("线性回归:\n"); print(summary(lm1)) abline(lm1, lty=2, col="blue") ## 负指数 lm2 <- lm(I(log(y)) ~ x) cat("负指数回归:\n"); print(summary(lm2)) yhat2 <- exp(predict(lm2)) lines(spline(x, yhat2), lwd=2, col="red") }