diff --git a/_quarto.yml b/_quarto.yml index 3f504c47..2165fec2 100755 --- a/_quarto.yml +++ b/_quarto.yml @@ -64,6 +64,7 @@ book: - analyze-text-data.qmd - analyze-time-series-data.qmd - analyze-spatial-data.qmd + - analyze-areal-data.qmd - part: "优化建模" chapters: - statistical-computation.qmd @@ -73,6 +74,7 @@ book: chapters: - probabilistic-reasoning-framework.qmd - generalized-linear-models.qmd + - hierarchical-normal-models.qmd - mixed-effects-models.qmd - generalized-additive-models.qmd - gaussian-processes-regression.qmd diff --git a/analyze-areal-data.qmd b/analyze-areal-data.qmd new file mode 100644 index 00000000..7780a5ea --- /dev/null +++ b/analyze-areal-data.qmd @@ -0,0 +1,137 @@ +# 区域数据分析 {#sec-analyze-areal-data} + +## 苏格兰唇癌数据分析 {#sec-scotland-lip-cancer} + +> Everything is related to everything else, but near things are more related than distant things. +> +> --- Waldo Tobler [@Tobler1970] + +::: {#spatial-areal-data .callout-note title="空间区域数据分析"} +空间区域数据的贝叶斯建模 + +- Bayesian spatial and spatio-temporal GLMMs with possible extremes [glmmfields](https://github.com/seananderson/glmmfields) +- Bayesian spatial analysis [geostan](https://github.com/ConnorDonegan/geostan/) +- [Spatial Models in Stan: Intrinsic Auto-Regressive Models for Areal Data](https://mc-stan.org/users/documentation/case-studies/icar_stan.html) +- [Exact sparse CAR models in Stan](https://github.com/mbjoseph/CARstan) [网页文档](https://mc-stan.org/users/documentation/case-studies/mbjoseph-CARStan.html) +- [Spatial Models in Stan: Intrinsic Auto-Regressive Models for Areal Data](https://github.com/stan-dev/example-models/tree/master/knitr/car-iar-poisson) [网页文档](https://mc-stan.org/users/documentation/case-studies/icar_stan.html) 原始数据和代码,接上面苏格兰唇癌数据分析,用 CmdStanR 更新后的[代码](https://github.com/stan-dev/example-models/tree/master/knitr/car-iar-poisson) +- [Spatial modeling of areal data. 
Lip cancer in Scotland](https://www.paulamoraga.com/book-geospatial/sec-arealdataexamplespatial.html) INLA 建模 +- [CAR models Scotland Lip cancer dataset](https://rafaelcabral96.github.io/nigstan/sar-and-car-models.html#car-models) Stan 建模 +- 空间计量 [区域数据分析](https://rsbivand.github.io/emos_talk_2304/bivand_emos_230419.pdf) [on-the-use-of-r-for-spatial-econometrics](https://github.com/rsbivand/emos_talk_2304) +::: + +响应变量服从泊松分布 + +- BYM-INLA [@blangiardo2013;@moraga2020] +- BYM-Stan [@morris2019; @donegan2022; @cabral2022] + +记录 1975-1986 年苏格兰 56 个地区的唇癌病例数,这是一个按地区汇总的数据。 + +```{r} +library(sf) +scotlips <- st_read('data/scotland/scotland.shp', crs = st_crs("EPSG:27700")) +str(scotlips) +``` + +```{r} +#| label: fig-lip-cancer-map +#| fig-cap: 苏格兰各地区唇癌病例数分布 +#| fig-width: 5 +#| fig-height: 5 +#| fig-showtext: true + +library(ggplot2) +ggplot() + + geom_sf(data = scotlips, aes(fill = Observed)) + + scale_fill_viridis_c() + + theme_minimal() +``` + +## 美国各州犯罪率分析 + +响应变量服从高斯分布的调查数据 [@bivand2001] + +数据集 USArrests 记录 1973 年美国各州每 10 万居民中因谋杀 Murder、袭击 Assault 和强奸 Rape 被警察逮捕的人数以及城市人口所占百分比(可以看作城市化率)。 + +```{r} +#| echo: false +#| label: tbl-us-arrests +#| tbl-cap: "数据集 USArrests(部分)" + +us_arrests <- data.frame( + state_name = rownames(USArrests), + state_region = state.region, + USArrests, check.names = FALSE +) + +knitr::kable(head(us_arrests), col.names = c( + "州名", "区域划分", "谋杀犯", "袭击犯", "城市化率", "强奸犯" +), row.names = FALSE) +``` + +```{r} +#| label: fig-us-arrests-sf +#| fig-cap: 因袭击被逮捕的人数分布 +#| fig-showtext: true +#| fig-width: 7 +#| fig-height: 4 + +library(sf) +# 州数据 +us_state_sf <- readRDS("data/us-state-map-2010.rds") +# 观测数据 +us_state_df <- merge(x = us_state_sf, y = us_arrests, + by.x = "NAME", by.y = "state_name", all.x = TRUE) + +ggplot() + + geom_sf( + data = us_state_df, aes(fill = Assault), color = "gray80", lwd = 0.25) + + scale_fill_viridis_c(option = "plasma", na.value = "white") + + theme_void() +``` + +1973 年美国各州因袭击被逮捕的人数与城市化率的关系:相关分析 + +```{r} +#| label: 
fig-us-arrests-point +#| fig-cap: 逮捕人数比例与城市化率的关系 +#| fig-width: 7 +#| fig-height: 5.5 +#| code-fold: true +#| echo: !expr knitr::is_html_output() +#| fig-showtext: true + +library(ggrepel) +ggplot(data = us_arrests, aes(x = UrbanPop, y = Assault)) + + geom_point(aes(color = state_region)) + + geom_text_repel(aes(label = state_name), size = 3, seed = 2022) + + theme_classic() + + labs(x = "城市化率(%)", y = "因袭击被逮捕人数", color = "区域划分") +``` + +阿拉斯加州和夏威夷州与其它州都不相连,属于孤立的情况,下面在空间相关性的分析中排除这两个州。 + +```{r} +# 州的中心 +centers48 <- subset( + x = data.frame(x = state.center$x, y = state.center$y), + subset = !state.name %in% c("Alaska", "Hawaii") +) +# 观测数据 +arrests48 <- subset( + x = USArrests, + subset = !rownames(USArrests) %in% c("Alaska", "Hawaii") +) +``` + +```{r} +#| message: false + +library(spData) +library(spdep) +# KNN +k4.48 <- knn2nb(knearneigh(as.matrix(centers48), k = 4)) +# Moran I test +moran.test(x = arrests48$Assault, listw = nb2listw(k4.48)) +# Permutation test for Moran's I statistic +moran.mc(x = arrests48$Assault, listw = nb2listw(k4.48), nsim = 499) +``` diff --git a/common-statistical-tests.qmd b/common-statistical-tests.qmd index 344d7a32..d608a189 100644 --- a/common-statistical-tests.qmd +++ b/common-statistical-tests.qmd @@ -111,6 +111,20 @@ qnorm(p = 1 - 0.05, mean = 0, sd = 1) 1 - pnorm(q = u) ``` +::: callout-important +随机变量 $X$ 服从标准正态分布,它的概率分布函数如下: + +$$ +P(X \leq u)= \Phi(u) = \frac{1}{\sqrt{2\pi}}\int_{-\infty}^{u}\mathrm{e}^{-t^2/2}\mathrm{d}t +$$ + +若已知概率 $p = 0.95$ ,则对应的下分位点可用函数 `qnorm()` 计算。 + +```{r} +qnorm(p = 0.95, mean = 0, sd = 1) +``` +::: + #### 方差未知 $$ @@ -150,6 +164,25 @@ qt(p = 1 - 0.05, df = n - 1) ::: callout-note 英国统计学家 William Sealy Gosset (1876-1937) 于 1908 年在杂志 《Biometrics》 上以笔名 Student 发表论文《The Probable Error of a Mean》[@Gosset1908],论文中展示了独立同正态分布的样本 $x_1, \ldots, x_n \stackrel{i.i.d}{\sim} \mathcal{N}(\mu,\sigma^2)$ 的样本方差 $s^2$ 和样本标准差 $s$ 的抽样分布,根据均值和标准差不相关的性质导出 t 分布,宣告 t 分布的诞生,因其在小样本领域的突出贡献,W. S.
Gosset 进入世纪名人录 [@Heyde2001]。 + +```{r} +#| label: tbl-t-quantile +#| tbl-cap: $t$ 分布的分位数表 +#| echo: false + +vec_prob <- c( + 0.75, 0.80, 0.90, 0.95, + 0.975, 0.99, 0.995, 0.999 +) +vec_df <- 1:10 + +tmp <- mapply(FUN = qt, + p = vec_prob, + MoreArgs = list(df = vec_df), SIMPLIFY = TRUE +) +row.names(tmp) <- vec_df +knitr::kable(tmp, row.names = TRUE, col.names = vec_prob, digits = 4) +``` ::: ### 正态总体方差检验 @@ -181,6 +214,41 @@ qchisq(p = 1 - 0.05, df = n -1) 1 - pchisq(q = chi, df = n -1) ``` +::: callout-important +R 软件提供很多统计分布的计算,因此,不再需要查分位数表,现算即可。计算自由度为 $n$ 概率为 $p$ 的 $\chi^2$ 分布的分位数 $\chi^2_p(n)$ ,即 + +$$ +P(\chi^2(n) \leq \chi^2_p(n)) = p +$$ + +若已知自由度为 1 ,概率为 0.05,则可借助分位数函数 `qchisq()` 计算对应的(下)分位点。 + +```{r} +qchisq(p = 0.05, df = 1) +``` + +同理,也可以获得 $\chi^2$ 分布的分位数 @tbl-chisq-quantile ,计算出来的分位数保留 4 位小数。 + +```{r} +#| label: tbl-chisq-quantile +#| tbl-cap: $\chi^2$ 分布的分位数表 +#| echo: false + +vec_prob <- c( + 0.005, 0.01, 0.025, 0.05, 0.1, + 0.9, 0.95, 0.975, 0.99, 0.995 +) +vec_df <- 1:10 + +tmp <- mapply(FUN = qchisq, + p = vec_prob, + MoreArgs = list(df = vec_df), SIMPLIFY = TRUE +) +row.names(tmp) <- vec_df +knitr::kable(tmp, row.names = TRUE, col.names = vec_prob, digits = 4) +``` +::: + ### 总体未知均值检验 有了均值和方差,为什么还要位置参数和尺度参数?为了更一般地描述问题,扩展范围。特别是在总体分布未知或知之甚少的情况下做检验,不再仅限于均值和方差这样的特征量。 @@ -531,7 +599,7 @@ flowchart LR B1 --> C1(均值检验) C1 --> D2(方差相等) --> E2(F 检验) C1 --> D3(方差不等) --> E3(F 检验) - B1 --> C2(方差检验) --> E4(Bartlett 检验) + B1 --> C2(方差检验) --> E4(Hartley 检验\n Bartlett 检验\n 修正的 Bartlett 检验\n Levene 检验) B2 --> C3(均值检验) --> E5(Kruskal-Wallis 秩和检验\n Friedman 秩和检验\n Quade 检验) B2 --> C4(方差检验) --> E7(Fligner-Killeen 检验) ``` @@ -644,7 +712,16 @@ logLik(fit_gls) ### 正态总体方差检验 -后面总体分布未知的情况下的方差检验也都可以用在这里。 +总体服从正态分布,有四种常见的参数检验方法: + +1. Hartley 检验:各组样本量必须相等。 +2. Bartlett 检验:各组样本量可以相等或不等,但每个组的样本量必须不低于 5。 +3. 修正的 Bartlett 检验:在样本量较大或较小、相等或不等场合都可使用。 +4. 
Levene 检验:相当于单因素组间方差分析,相比于 Bartlett 检验,Levene 检验更加稳健。 + +::: callout-tip +在总体分布未知的情况下,检验方差齐性的非参数方法也都可以用在这里。 +::: 设 $x_1,\cdots,x_{n_1}$ 是来自总体 $\mathcal{N}(\mu_1,\sigma_1^2)$ 的样本,设 $y_1,\cdots,y_{n_2}$ 是来自总体 $\mathcal{N}(\mu_2,\sigma_2^2)$ 的样本,设 $z_1,\cdots,z_{n_3}$ 是来自总体 $\mathcal{N}(\mu_3,\sigma_3^2)$ 的样本。 @@ -652,7 +729,7 @@ $$ \sigma_1^2 = \sigma_2^2 = \sigma_3^2 \quad vs. \quad \sigma_1^2,\sigma_2^2,\sigma_3^2 \quad \text{不全相等} $$ -Bartlett (巴特利特)检验 `bartlett.test()` 要求总体的分布为正态分布,检验各个组的方差是否有显著性差异,即方差齐性检验,属于参数检验,适用于多个样本的情况。相比于 Bartlett 检验,Levene 检验更加稳健。 +Bartlett (巴特利特)检验 `bartlett.test()` 要求总体的分布为正态分布,检验各个组的方差是否有显著性差异,即方差齐性检验,属于参数检验,适用于多个样本的情况。 ```{r} # 三样本 @@ -1018,11 +1095,11 @@ apply(Wish, MARGIN = 1:2, var) | [W. Kruskal](https://en.wikipedia.org/wiki/William_Kruskal) | 美国 | 1919-10-10 | 2005-04-21 | 85 | Kruskal-Wallis 检验 | | [George E. P. Box](https://en.wikipedia.org/wiki/George_E._P._Box) | 英国、美国 | 1919-10-18 | 2013-03-28 | 93 | Box-Pierce 检验 | | [C. R. Rao](https://en.wikipedia.org/wiki/C._R._Rao) | 印度、美国 | 1920-09-10 | 2023-08-22 | 102 | Score 检验 | -| [M. Wilk](https://en.wikipedia.org/wiki/Martin_Wilk) | 加拿大 | 1922-12-18 | 2013-02-19 | 90 | Shapiro-Wilk 正态性检验 | +| [M. Wilk](https://en.wikipedia.org/wiki/Martin_Wilk) | 加拿大 | 1922-12-18 | 2013-02-19 | 90 | Shapiro-Wilk 检验 | | [J. Durbin](https://en.wikipedia.org/wiki/James_Durbin) | 英国 | 1923-06-30 | 2012-06-23 | 88 | Durbin 检验 | | [L. Le Cam](https://en.wikipedia.org/wiki/Lucien_Le_Cam) | 法国 | 1924-11-18 | 2000-04-25 | 75 | 渐近理论 | | [H. Lilliefors](https://en.wikipedia.org/wiki/Hubert_Lilliefors) | 美国 | 1928-06-14 | 2008-02-23 | 80 | Lilliefors 检验 | -| [S. S. Shapiro](https://en.wikipedia.org/wiki/Samuel_Sanford_Shapiro) | 美国 | 1930-07-13 | \- | 93 | Shapiro-Wilk 正态性检验 | +| [S. S. 
Shapiro](https://en.wikipedia.org/wiki/Samuel_Sanford_Shapiro) | 美国 | 1930-07-13 | \- | 93 | Shapiro-Wilk 检验 | : 对假设检验理论有重要贡献的学者 {#tbl-statistican tbl-colwidths="\[20,15,15,15,7,28\]"} diff --git a/data/scotland/scotland.dbf b/data/scotland/scotland.dbf new file mode 100755 index 00000000..44ee8812 Binary files /dev/null and b/data/scotland/scotland.dbf differ diff --git a/data/scotland/scotland.shp b/data/scotland/scotland.shp new file mode 100755 index 00000000..d826e4fb Binary files /dev/null and b/data/scotland/scotland.shp differ diff --git a/data/scotland/scotland.shx b/data/scotland/scotland.shx new file mode 100755 index 00000000..7c787ada Binary files /dev/null and b/data/scotland/scotland.shx differ diff --git a/hierarchical-normal-models.qmd b/hierarchical-normal-models.qmd new file mode 100644 index 00000000..37697033 --- /dev/null +++ b/hierarchical-normal-models.qmd @@ -0,0 +1,572 @@ +# 分层正态模型 {#sec-hierarchical-normal-models} + +```{r} +#| echo: false + +source("_common.R") +``` + +以分层正态模型介绍 **rstan** 包的用法 + +```{r} +#| message: false + +library(StanHeaders) +library(ggplot2) +library(rstan) +rstan_options(auto_write = TRUE) +# 如果CPU和内存足够,设置成与马尔科夫链一样多 +options(mc.cores = max(c(floor(parallel::detectCores() / 2), 1L))) + +custom_colors <- c( + "#4285f4", # GoogleBlue + "#34A853", # GoogleGreen + "#FBBC05", # GoogleYellow + "#EA4335" # GoogleRed +) +rstan_ggtheme_options( + panel.background = element_rect(fill = "white"), + legend.position = "top" +) +rstan_gg_options( + fill = "#4285f4", color = "white", + pt_color = "#EA4335", chain_colors = custom_colors +) +``` + +## 8schools 数据 {#sec-eight-schools} + +分层正态模型 + +$$ +\begin{aligned} +y_j &\sim \mathcal{N}(\theta_j,\sigma_j) \quad +\theta_j = \mu + \tau * \eta_j \\ +\theta_j &\sim \mathcal{N}(\mu, \tau) \quad +\eta_j \sim \mathcal{N}(0,1) +\end{aligned} +$$ + +### 拟合模型 + +用 rstan 包来拟合模型 + +```{r} +# 编译模型 +eight_schools_fit <- stan( + model_name = "eight_schools", + # file = "code/stan/8schools.stan", 
+ model_code = " + // saved as 8schools.stan + data { + int<lower=0> J; // number of schools + real y[J]; // estimated treatment effects + real<lower=0> sigma[J]; // standard error of effect estimates + } + parameters { + real mu; // population treatment effect + real<lower=0> tau; // standard deviation in treatment effects + vector[J] eta; // unscaled deviation from mu by school + } + transformed parameters { + vector[J] theta = mu + tau * eta; // school treatment effects + } + model { + target += normal_lpdf(eta | 0, 1); // prior log-density + target += normal_lpdf(y | theta, sigma); // log-likelihood + } + ", + data = list( # 观测数据 + J = 8, + y = c(28, 8, -3, 7, -1, 1, 18, 12), + sigma = c(15, 10, 16, 11, 9, 11, 10, 18) + ), + warmup = 1000, # 每条链预处理迭代次数 + iter = 2000, # 每条链总迭代次数 + chains = 2, # 马尔科夫链的数目 + cores = 2, # 指定 CPU 核心数,可以给每条链分配一个 + verbose = FALSE, # 不显示迭代的中间过程 + refresh = 0, # 不显示采样的进度 + seed = 20190425 # 设置随机数种子,不要使用 set.seed() 函数 +) +``` + +### 模型输出 + +```{r} +print(eight_schools_fit, digits = 1) +``` + +提取任意一个参数的结果,如查看参数 $\tau$ 的 95% 可信区间。 + +```{r} +print(eight_schools_fit, pars = "tau", probs = c(0.025, 0.975)) +``` + +从迭代抽样数据获得与 `print(fit)` 一样的结果。以便后续对原始采样数据做任意的进一步分析。rstan 包扩展泛型函数 `summary()` 以支持对 stanfit 数据对象汇总,输出各个参数分链条和合并链条的后验分布结果。 + +### 操作数据 + +合并两条马氏链的结果 + +```{r} +eight_schools_sim <- extract(eight_schools_fit, permuted = TRUE) +``` + +返回的结果是一个列表 + +```{r} +str(eight_schools_sim) +class(eight_schools_sim) +``` + +计算参数 $\eta,\theta$ 的均值 + +```{r} +apply(eight_schools_sim$eta, 2, mean) +apply(eight_schools_sim$theta, 2, mean) +``` + +计算参数 $\eta,\theta$ 的分位点 + +```{r} +t(apply(eight_schools_sim$eta, 2, quantile, probs = c(2.5, 25, 50, 75, 97.5) / 100)) +t(apply(eight_schools_sim$theta, 2, quantile, probs = c(2.5, 25, 50, 75, 97.5) / 100)) +``` + +计算参数 $\mu,\tau$ 的均值 + +```{r} +lapply(eight_schools_sim["mu"], mean) +lapply(eight_schools_sim["tau"], mean) +lapply(eight_schools_sim["lp__"], mean) +``` + +计算参数 $\mu,\tau$ 的分位点 + +```{r} +lapply(eight_schools_sim["mu"],
quantile, probs = c(2.5, 25, 50, 75, 97.5) / 100) +lapply(eight_schools_sim["tau"], quantile, probs = c(2.5, 25, 50, 75, 97.5) / 100) +lapply(eight_schools_sim["lp__"], quantile, probs = c(2.5, 25, 50, 75, 97.5) / 100) +``` + +### 采样诊断 + +获取马尔科夫链迭代点列数据 + +```{r} +eight_schools_sim <- extract(eight_schools_fit, permuted = FALSE) +``` + +`eight_schools_sim` 是一个三维数组,1000(次迭代)\* 2 (条链)\* 19(个参数)。如果 `permuted = TRUE` 则会合并两条马氏链的迭代结果,变成一个列表。 + +```{r} +# 数据类型 +class(eight_schools_sim) +# 1000(次迭代)* 2 (条链)* 19(个参数) +str(eight_schools_sim) +``` + +提取参数 $\mu$ 的两条迭代点列 + +```{r} +eight_schools_mu_sim <- eight_schools_sim[, , "mu"] +``` + +```{r} +#| label: fig-8schools-mu-base +#| fig-cap: 参数 $\mu$ 的迭代轨迹 +#| fig-showtext: true +#| par: true + +matplot(eight_schools_mu_sim, + xlab = "Iteration", ylab = expression(mu), + type = "l", lty = "solid", col = custom_colors +) +abline(h = apply(eight_schools_mu_sim, 2, mean), col = custom_colors) +legend("bottomleft", + legend = paste0("chain:", 1:2), box.col = "white", inset = 0.01, + lty = "solid", horiz = TRUE, col = custom_colors +) +``` + +或者使用 rstan 提供的 `traceplot` 函数或者 `stan_trace` 函数,rstan 大量依赖 ggplot2 绘图,所以如果你熟悉 ggplot2 可以很方便地定制属于自己的风格,除了 rstan 提供的 `rstan_ggtheme_options` 和 `rstan_gg_options` 两个函数外,还可以使用 ggplot2 自带的大量配置选项和主题,如 `theme_minimal` 主题,因为 `stan_trace`等作图函数返回的是一个 ggplot 对象。 + +```{r} +#| label: fig-8schools-mu-gg +#| fig-cap: 马氏链的迭代序列 +#| fig-showtext: true + +stan_trace(eight_schools_fit, pars = "mu") + + theme_minimal() + + labs(x = "Iteration", y = expression(mu)) +``` + +序列的自相关图,类似地,我们这里也使用 `stan_ac` 函数绘制自相关图 + +```{r} +#| label: fig-8schools-mu-acf +#| fig-cap: 马氏链的自相关图 +#| fig-showtext: true +#| fig-height: 4 + +stan_ac(eight_schools_fit, pars = "mu", separate_chains = TRUE, color = "white") + + theme_minimal() +``` + +### 后验分布 + +可以用 `stan_hist` 函数绘制参数 $\mu$ 的后验分布图,它没有 `separate_chains` 参数,所以不能分链条绘制 + +```{r} +#| label: fig-8schools-mu-hist +#| fig-cap: 参数 $\mu$ 的后验分布 +#| fig-showtext: true +
+stan_hist(eight_schools_fit, pars = "mu", bins = 30) + theme_minimal() +``` + +参数 $\mu$ 和 $\tau$ 的散点图 + +```{r} +#| label: fig-8schools-mu-tau +#| fig-cap: 参数 $\mu$ 和 $\tau$ 的联合后验分布 +#| fig-showtext: true + +stan_scat(eight_schools_fit, pars = c("mu","tau")) + theme_minimal() +``` + +参数 $\mu$ 的后验概率密度分布图 + +```{r} +#| label: fig-8schools-mu-dens +#| fig-cap: 参数 $\mu$ 的后验概率密度分布图 +#| fig-showtext: true + +stan_dens(eight_schools_fit, pars = "mu", separate_chains = TRUE) + + theme_minimal() + + labs(x = expression(mu), y = "Density") +``` + +**bayesplot** 包的函数 `mcmc_pairs()` 以矩阵图展示多个参数的分布。 + +```{r} +#| label: fig-parameters-posterior +#| fig-cap: 参数 $\mu$,$\tau$ 和 $\mathrm{lp\_\_}$ 的后验分布图 +#| fig-showtext: true + +bayesplot::mcmc_pairs(eight_schools_fit, pars = c("mu", "tau", "lp__")) +``` + +### 模型导出 + +rstan 包还支持从外部磁盘读取代码 + +```{r} +#| eval: false + +fit <- stan(file = 'code/stan/8schools.stan', ...) + +schools_dat <- read_rdump('data/8schools.rdump') +source('data/8schools.rdump') +``` + +### 其它 R 包 + +接下来,分别用 nlme 和 lme4 包拟合模型。 + +```{r} +# 成绩 +y <- c(28, 8, -3, 7, -1, 1, 18, 12) +# 标准差 +sigma <- c(15, 10, 16, 11, 9, 11, 10, 18) +# 学校编号 +g <- 1:8 +``` + +首先,调用 **nlme** 包的函数 `lme()` 拟合模型。 + +```{r} +library(nlme) +fit_lme <- lme(y ~ 1, random = ~ 1 | g, weights = varFixed(~ sigma^2)) # varFixed: residual variance proportional to sigma^2, matching lmer weights = 1 / sigma^2 +summary(fit_lme) +``` + +接着,采用 lme4 包拟合模型,发现 lme4 包可以获得与 nlme 包一致的结果。 + +```{r} +control <- lme4::lmerControl( + check.conv.singular = "ignore", + check.nobs.vs.nRE = "ignore", + check.nobs.vs.nlev = "ignore" +) +fit_lme4 <- lme4::lmer(y ~ 1 + (1 | g), weights = 1 / sigma^2, control = control, REML = FALSE) +summary(fit_lme4) +``` + +最后,使用 blme 包 [@Chung2013] ,blme 包基于 lme4 包,结果完全一致。 + +```{r} +### Example using a residual variance prior ### +# This is the "eight schools" data set; +# the mode should be at the boundary of the space.
+ +fit_blme <- blme::blmer( + y ~ 1 + (1 | g), + control = control, REML = FALSE, + resid.prior = point, cov.prior = NULL, + weights = 1 / sigma^2 +) +summary(fit_blme) +``` + +## rats 数据 {#sec-thirty-rats} + +rats 数据最早来自 @gelfand1990 ,记录 30 只小鼠每隔一周的重量,一共进行了 5 周。第一次记录是小鼠第 8 天的时候,第二次测量记录是第 15 天的时候,一直持续到第 36 天。下面在 R 环境中准备数据。 + +```{r} +# 总共 30 只老鼠 +N <- 30 +# 总共进行 5 周 +T <- 5 +# 小鼠重量 +y <- structure(c( + 151, 145, 147, 155, 135, 159, 141, 159, 177, 134, + 160, 143, 154, 171, 163, 160, 142, 156, 157, 152, 154, 139, 146, + 157, 132, 160, 169, 157, 137, 153, 199, 199, 214, 200, 188, 210, + 189, 201, 236, 182, 208, 188, 200, 221, 216, 207, 187, 203, 212, + 203, 205, 190, 191, 211, 185, 207, 216, 205, 180, 200, 246, 249, + 263, 237, 230, 252, 231, 248, 285, 220, 261, 220, 244, 270, 242, + 248, 234, 243, 259, 246, 253, 225, 229, 250, 237, 257, 261, 248, + 219, 244, 283, 293, 312, 272, 280, 298, 275, 297, 350, 260, 313, + 273, 289, 326, 281, 288, 280, 283, 307, 286, 298, 267, 272, 285, + 286, 303, 295, 289, 258, 286, 320, 354, 328, 297, 323, 331, 305, + 338, 376, 296, 352, 314, 325, 358, 312, 324, 316, 317, 336, 321, + 334, 302, 302, 323, 331, 345, 333, 316, 291, 324 +), .Dim = c(30, 5)) +# 第几天 +x <- c(8.0, 15.0, 22.0, 29.0, 36.0) +xbar <- 22.0 +``` + +重复测量的小鼠重量数据 rats 如下 @tbl-rats 所示。 + +```{r} +#| label: tbl-rats +#| tbl-cap: 小鼠重量数据(部分) +#| echo: false + +rownames(y) <- 1:30 +knitr::kable(head(y), col.names = paste("第", c(8, 15, 22, 29, 36), "天"), row.names = TRUE) +``` + +小鼠重量数据的分布情况见下 @fig-rats ,由图可以假定 30 只小鼠的重量服从正态分布。 + +```{r} +#| label: fig-rats +#| fig-cap: 30 只小鼠 5 次测量的数据 +#| fig-showtext: true +#| par: true +#| echo: false +#| fig-width: 5 +#| fig-height: 4.5 + +matplot(y, xlab = "小鼠编号", ylab = "小鼠重量") +``` + +由 @fig-rats-growth 可见, 30 只小鼠的重量增长趋势呈现一种线性趋势。 + +```{r} +#| label: fig-rats-growth +#| fig-cap: 30 只小鼠 5 次测量的数据 +#| fig-showtext: true +#| par: true +#| echo: false +#| fig-width: 5 +#| fig-height: 4.5 + +matplot(t(y), xlab = "测量次数", ylab = "小鼠重量") +``` + 
+### rstan {#sec-rats-rstan} + +初始化模型参数,设置采样算法的参数。 + +```{r} +# 迭代链 +chains <- 4 +# 迭代次数 +iter <- 1000 +# 初始值 +init <- rep(list(list( + alpha = rep(250, 30), beta = rep(6, 30), + alpha_c = 150, beta_c = 10, + tausq_c = 1, tausq_alpha = 1, + tausq_beta = 1 +)), chains) +``` + +接下来,基于重复测量数据,建立线性生长曲线模型: + +$$ +\begin{aligned} +\alpha_c &\sim \mathcal{N}(0,100) \quad \beta_c \sim \mathcal{N}(0,100) \\ +\tau^2_c &\sim \mathrm{inv\_gamma}(0.001, 0.001) \\ +\tau^2_{\alpha} &\sim \mathrm{inv\_gamma}(0.001, 0.001) \\ +\tau^2_{\beta} &\sim \mathrm{inv\_gamma}(0.001, 0.001) \\ +\alpha &\sim \mathcal{N}(\alpha_c, \tau_{\alpha}) \quad +\beta \sim \mathcal{N}(\beta_c, \tau_{\beta}) \\ +y_{nt} &\sim \mathcal{N}(\alpha_n + \beta_n * (x_t - \bar{x}), \tau_c) \\ +& n = 1,2,\ldots,N \quad t = 1,2,\ldots,T +\end{aligned} +$$ + +其中, $\alpha_c,\beta_c,\tau_c,\tau_{\alpha},\tau_{\beta}$ 为无信息先验,$N = 30$ 和 $T = 5$ 分别表示实验中的小鼠数量和测量次数,下面采用 Stan 编码、编译、采样和拟合模型。 + +```{r} +rats_fit <- stan( + model_name = "rats", + model_code = " + data { + int<lower=0> N; + int<lower=0> T; + vector[T] x; + matrix[N,T] y; + real xbar; + } + parameters { + vector[N] alpha; + vector[N] beta; + + real alpha_c; + real beta_c; // beta.c in original bugs model + + real<lower=0> tausq_c; + real<lower=0> tausq_alpha; + real<lower=0> tausq_beta; + } + transformed parameters { + real<lower=0> tau_c; // sigma in original bugs model + real<lower=0> tau_alpha; + real<lower=0> tau_beta; + + tau_c = sqrt(tausq_c); + tau_alpha = sqrt(tausq_alpha); + tau_beta = sqrt(tausq_beta); + } + model { + alpha_c ~ normal(0, 100); + beta_c ~ normal(0, 100); + tausq_c ~ inv_gamma(0.001, 0.001); + tausq_alpha ~ inv_gamma(0.001, 0.001); + tausq_beta ~ inv_gamma(0.001, 0.001); + alpha ~ normal(alpha_c, tau_alpha); // vectorized + beta ~ normal(beta_c, tau_beta); // vectorized + for (n in 1:N) + for (t in 1:T) + y[n,t] ~ normal(alpha[n] + beta[n] * (x[t] - xbar), tau_c); + } + generated quantities { + real alpha0; + alpha0 = alpha_c - xbar * beta_c; + } + ", + data = list(N = N, T = T, y = y, x = x, xbar = xbar), +
chains = chains, init = init, iter = iter, + verbose = FALSE, refresh = 0, seed = 20190425 +) +``` + +模型输出结果如下: + +```{r} +print(rats_fit, pars = c("alpha", "beta"), include = FALSE, digits = 1) +``` + +对于分量众多的参数向量,比较适合用岭线图展示后验分布,下面调用 bayesplot 包绘制参数向量 $\alpha,\beta$ 的后验分布。 + +```{r} +#| label: fig-rats-alpha +#| fig-cap: 参数 $\alpha$ 的后验分布 +#| fig-showtext: true +#| fig-width: 6 +#| fig-height: 8 + +# plot(rats_fit, pars = "alpha", show_density = TRUE, ci_level = 0.8, outer_level = 0.95) +bayesplot::mcmc_areas_ridges(rats_fit, pars = paste0("alpha", "[", 1:30, "]")) +``` + +```{r} +#| label: fig-rats-beta +#| fig-cap: 参数 $\beta$ 的后验分布 +#| fig-showtext: true +#| fig-width: 6 +#| fig-height: 8 + +# plot(rats_fit, pars = "beta", ci_level = 0.8, outer_level = 0.95) +bayesplot::mcmc_areas_ridges(rats_fit, pars = paste0("beta", "[", 1:30, "]")) +``` + +### nlme {#sec-rats-nlme} + +**nlme** 包适合处理长格式的数据,因此,先将小鼠数据整理成长格式。 + +```{r} +weight <- c( + 151, 145, 147, 155, 135, 159, 141, 159, 177, 134, + 160, 143, 154, 171, 163, 160, 142, 156, 157, 152, 154, 139, 146, + 157, 132, 160, 169, 157, 137, 153, 199, 199, 214, 200, 188, 210, + 189, 201, 236, 182, 208, 188, 200, 221, 216, 207, 187, 203, 212, + 203, 205, 190, 191, 211, 185, 207, 216, 205, 180, 200, 246, 249, + 263, 237, 230, 252, 231, 248, 285, 220, 261, 220, 244, 270, 242, + 248, 234, 243, 259, 246, 253, 225, 229, 250, 237, 257, 261, 248, + 219, 244, 283, 293, 312, 272, 280, 298, 275, 297, 350, 260, 313, + 273, 289, 326, 281, 288, 280, 283, 307, 286, 298, 267, 272, 285, + 286, 303, 295, 289, 258, 286, 320, 354, 328, 297, 323, 331, 305, + 338, 376, 296, 352, 314, 325, 358, 312, 324, 316, 317, 336, 321, + 334, 302, 302, 323, 331, 345, 333, 316, 291, 324 +) +rats <- rep(1:30, times = 5) # weight is ordered day by day (30 rats per day), so ids 1..30 repeat for each of the 5 days +days <- rep(c(8, 15, 22, 29, 36), each = 30) +rats_data <- data.frame(weight = weight, rats = rats, days = days) +``` + +小鼠的重量随时间增长,不同小鼠的情况又会有所不同。可用随机效应模型表示生长曲线模型,下面加载 **nlme** 包调用函数 `lme()` 拟合该模型。 + +```{r} +library(nlme) +rats_lme <- lme(data
= rats_data, fixed = weight ~ days, random = ~ 1 | rats) +summary(rats_lme) +``` + +模型输出结果中,固定效应中的截距项 `(Intercept)` 对应 106.56762,斜率 `days` 对应 6.18571。Stan 模型中截距参数 `alpha0` 的后验估计是 106.332,斜率参数 `beta_c` 的后验估计是 6.188。对比 Stan 和 **nlme** 包的拟合结果,可以发现贝叶斯和频率方法的结果是非常接近的。 + +### lme4 + +当采用 **lme4** 包拟合数据的时候,发现参数取值落在参数空间的边界上,除此之外,输出结果与 **nlme** 包几乎相同。 + +```{r} +rats_lme4 <- lme4::lmer(weight ~ days + (1 | rats), data = rats_data) +summary(rats_lme4) +``` + +**lme4** 包依赖 **nloptr** 包提供数值优化求解器,也做了更多的矩阵奇异性检查,对于复杂的重复测量模型,往往会发生奇异,需要适当降低复杂性,更多详情见 **lme4** 包的帮助文档 `help('isSingular')` 。 + +### blme + +blme 包基于 lme4 包 + +```{r} +control <- lme4::lmerControl( + check.conv.singular = "ignore", + check.nobs.vs.nRE = "ignore", + check.nobs.vs.nlev = "ignore" +) +rats_blme <- blme::blmer( + weight ~ days + (1 | rats), + data = rats_data, + resid.prior = point, cov.prior = NULL, + control = control, REML = FALSE +) +summary(rats_blme) +``` diff --git a/references.bib b/references.bib index af6b9eb9..132a0633 100755 --- a/references.bib +++ b/references.bib @@ -1674,5 +1674,110 @@ @article{shapiro1965 volume = {52}, number = {3-4}, doi = {10.1093/biomet/52.3-4.591}, - url = {https://doi.org/10.1093/biomet/52.3-4.591} +} + +@article{blangiardo2013, + title = {Spatial and spatio-temporal models with {R-INLA}}, + author = {Blangiardo, Marta and Cameletti, Michela and Baio, Gianluca and Rue, {Håvard}}, + year = {2013}, + month = {12}, + date = {2013-12-01}, + journal = {Spatial and Spatio-temporal Epidemiology}, + pages = {39--55}, + volume = {7}, + doi = {10.1016/j.sste.2013.07.003}, +} + +@book{moraga2020, + title = {Geospatial health data: modeling and visualization with R-INLA and Shiny}, + author = {Moraga, Paula}, + year = {2020}, + publisher = {Chapman and Hall/CRC}, + address = {Boca Raton, Florida}, +} + +@article{donegan2022, + title = {{geostan}: An R package for Bayesian spatial analysis}, + author = {Donegan, Connor}, + year = {2022}, + month = {11}, + date = {2022-11-16}, +
journal = {Journal of Open Source Software}, + pages = {4716}, + volume = {7}, + number = {79}, + doi = {10.21105/joss.04716}, +} + +@article{morris2019, + title = {Bayesian hierarchical spatial models: Implementing the Besag York Mollié model in stan}, + author = {Morris, Mitzi and Wheeler-Martin, Katherine and Simpson, Dan and Mooney, Stephen J. and Gelman, Andrew and DiMaggio, Charles}, + year = {2019}, + month = {11}, + date = {2019-11}, + journal = {Spatial and Spatio-temporal Epidemiology}, + pages = {100301}, + volume = {31}, + doi = {10.1016/j.sste.2019.100301}, +} + +@article{cabral2022, + title = {Controlling the Flexibility of Non-Gaussian Processes Through Shrinkage Priors}, + author = {Cabral, Rafael and Bolin, David and Rue, {Håvard}}, + year = {2022}, + month = {01}, + date = {2022-01}, + journal = {Bayesian Analysis}, + pages = {1--24}, + volume = {-1}, + number = {-1}, + doi = {10.1214/22-BA1342}, +} + +@article{bivand2001, + title = {More on Spatial Data Analysis}, + author = {Bivand, Roger}, + year = {2001}, + month = {09}, + date = {2001-09-01}, + journal = {R News}, + pages = {13--17}, + volume = {1}, + number = {3}, + url = {https://www.r-project.org/doc/Rnews/Rnews_2001-3.pdf} +} + +@Article{Tobler1970, + title = {A computer movie simulating urban growth in the Detroit region}, + author = {Waldo Tobler}, + journal = {Economic Geography}, + year = {1970}, + volume = {46}, + number = {Supplement}, + pages = {234--240}, + doi = {10.2307/143141}, +} + +@article{gelfand1990, + title = {Illustration of Bayesian Inference in Normal Data Models Using Gibbs Sampling}, + author = {Gelfand, Alan E. and Hills, Susan E. and Racine-Poon, Amy and Smith, Adrian F. 
M.}, + year = {1990}, + date = {1990}, + journal = {Journal of the American Statistical Association}, + pages = {972--985}, + volume = {85}, + number = {412}, + doi = {10.2307/2289594}, +} + +@Article{Chung2013, + title = {A nondegenerate penalized likelihood estimator for variance parameters in multilevel models}, + author = {Yeojin Chung and Sophia Rabe-Hesketh and Vincent Dorie and Andrew Gelman and Jingchen Liu}, + year = {2013}, + journal = {Psychometrika}, + volume = {78}, + number = {4}, + pages = {685--709}, + publisher = {Springer}, + doi = {10.1007/s11336-013-9328-2}, } diff --git a/regression-and-correlation.qmd b/regression-and-correlation.qmd index 1e139d94..e915188b 100644 --- a/regression-and-correlation.qmd +++ b/regression-and-correlation.qmd @@ -161,8 +161,8 @@ str(state_x77) ```{r} #| label: fig-state-x77-scatter #| fig-cap: "预期寿命与人均收入的关系图" -#| fig-width: 4.5 -#| fig-height: 3.5 +#| fig-width: 6 +#| fig-height: 4.5 #| code-fold: true #| echo: !expr knitr::is_html_output() #| fig-showtext: true