diff --git a/_main.pdf b/_main.pdf index 9b577e7d..d8f27c6f 100644 Binary files a/_main.pdf and b/_main.pdf differ diff --git a/analyze-areal-data.html b/analyze-areal-data.html index a380f73d..da3364ec 100644 --- a/analyze-areal-data.html +++ b/analyze-areal-data.html @@ -860,7 +860,7 @@

select = c("Package", "Maintainer") ) |> transform(Maintainer = gsub( - x = Maintainer, - pattern = '(<([^<>]*)>)|(")', - replacement = "" - )) |> - transform(Maintainer = gsub( - x = Maintainer, - pattern = "(R-core)|(R Core Team)", - replacement = "CRAN Team" - )) |> - transform(Maintainer = gsub( - x = Maintainer, - pattern = "(S. M. Iacus)|(Stefano M.Iacus)|(Stefano Maria Iacus)", - replacement = "Stefano M. Iacus" + x = Maintainer, pattern = '(<([^<>]*)>)|(")', replacement = "" + )) |> + transform(Maintainer = gsub( + x = Maintainer, pattern = "(R-core)|(R Core Team)", replacement = "CRAN Team" + )) |> + transform(Maintainer = gsub( + x = Maintainer, + pattern = "(S. M. Iacus)|(Stefano M.Iacus)|(Stefano Maria Iacus)", + replacement = "Stefano M. Iacus" + )) |> + transform(Maintainer = gsub( + x = Maintainer, pattern = "(Toby Hocking)", + replacement = "Toby Dylan Hocking" )) |> transform(Maintainer = gsub( - x = Maintainer, - pattern = "(Toby Hocking)", - replacement = "Toby Dylan Hocking" - )) |> - transform(Maintainer = gsub( - x = Maintainer, - pattern = "(John M Chambers)", - replacement = "John Chambers" - )) - -cran_dev <- aggregate(data = cran_dev, Package ~ Maintainer, FUN = function(x) length(unique(x))) -cran_dev <- cran_dev[order(cran_dev$Package, decreasing = TRUE), ] - -knitr::kable(head(cran_dev, ceiling(nrow(cran_dev) / 2)), - col.names = c("团队成员", "R 包数量"), row.names = FALSE -) + x = Maintainer, pattern = "(John M Chambers)", replacement = "John Chambers" + )) +cran_dev <- aggregate(data = cran_dev, Package ~ Maintainer, FUN = function(x) length(unique(x))) +cran_dev <- cran_dev[order(cran_dev$Package, decreasing = TRUE), ] +knitr::kable(head(cran_dev, ceiling(nrow(cran_dev) / 2)), + col.names = c("团队成员", "R 包数量"), row.names = FALSE +)
代码
knitr::kable(tail(cran_dev, floor(nrow(cran_dev) / 2)),
   col.names = c("团队成员", "R 包数量"), row.names = FALSE
 )
@@ -1162,13 +1153,11 @@

select = c("Package", "Maintainer") ) |> transform(Maintainer = extract_maintainer(Maintainer)) - -rstudio_dev <- aggregate(data = rstudio_dev, Package ~ Maintainer, FUN = function(x) length(unique(x))) -rstudio_dev <- rstudio_dev[order(rstudio_dev$Package, decreasing = TRUE), ] - -knitr::kable(head(rstudio_dev, ceiling(nrow(rstudio_dev) / 2)), - col.names = c("团队成员", "R 包数量"), row.names = FALSE -) +rstudio_dev <- aggregate(data = rstudio_dev, Package ~ Maintainer, FUN = function(x) length(unique(x))) +rstudio_dev <- rstudio_dev[order(rstudio_dev$Package, decreasing = TRUE), ] +knitr::kable(head(rstudio_dev, ceiling(nrow(rstudio_dev) / 2)), + col.names = c("团队成员", "R 包数量"), row.names = FALSE +)

代码
knitr::kable(tail(rstudio_dev, floor(nrow(rstudio_dev) / 2)),
   col.names = c("团队成员", "R 包数量"), row.names = FALSE
 )
@@ -1366,54 +1355,65 @@

  • -Dirk Eddelbuettel 维护了 Rcpp、 RcppEigen 等流行的 R 包,通过 Rcpp 包将很多优秀的 C++ 库引入 R 语言社区。
  • +Dirk Eddelbuettel 维护了 Rcpp、RcppEigen 等流行的 R 包,通过 Rcpp 包将很多优秀的 C++ 库引入 R 语言社区。
  • -Stéphane Laurent 维护了很多与 shiny 、htmlwidgets 相关的 R 包,比如 rAmCharts4 包。
  • +Stéphane Laurent 维护了很多与 shiny、htmlwidgets 相关的 R 包,比如 rAmCharts4 包。
  • Gábor Csárdi 维护了 igraph 包以及大量帮助 R 包开发的基础设施,RStudio 雇员。
  • Hadley Wickham 维护了 ggplot2、dplyr、devtools 等流行的 R 包,RStudio 雇员。
  • -Jeroen Ooms 维护了 magick、 curl 以及大量帮助 R 包开发的基础设施。
  • +Jeroen Ooms 维护了 magick、curl 以及大量帮助 R 包开发的基础设施。
  • Scott Chamberlain 维护了很多与 HTTP/Web 相关的 R 包,rOpenSci 联合创始人。
  • Robin K. S. Hankin 维护了很多与贝叶斯、多元统计相关的 R 包。
  • Henrik Bengtsson 维护了 future 和 parallelly 等流行的 R 包,在并行计算方面有很多贡献。
  • -
  • Jan Wijffels 维护了很多与自然语言处理、图像识别相关的 R 包,比如 udpipe 、BTM 和 word2vec 等包,Bnosac 团队成员。
  • -
  • Kurt Hornik 参与维护 R 软件代码并许多与自然语言处理相关的 R 包,CRAN 核心团队成员。
  • -
  • Martin Maechler 维护了 Matrix 包,CRAN 核心团队成员。
  • +
  • +Jan Wijffels 维护了很多与自然语言处理、图像识别相关的 R 包,比如 udpipe 、BTM 和 word2vec 等包,Bnosac 团队成员。
  • +
  • +Kurt Hornik 参与维护 R 软件代码,并维护了许多与自然语言处理相关的 R 包,R 核心团队成员。
  • +
  • +Martin Maechler 维护了 Matrix 包,R 核心团队成员。
  • Max Kuhn 维护了 tidymodels 等包,RStudio 雇员。
  • Bob Rudis 维护了一些与 ggplot2 相关的 R 包,如 ggalt、hrbrthemes 和 statebins 等。
  • -
  • Kartikeya Bolar 维护了很多统计与 shiny 结合的 R 包,比如方差分析、逻辑回归、列联表、聚类分析等。
  • +
  • +Kartikeya Bolar 维护了很多统计与 shiny 结合的 R 包,比如方差分析、逻辑回归、列联表、聚类分析等。
  • Kirill Müller 维护了 DBI 等大量与数据库连接的 R 包。
  • -
  • Shannon T. Holloway 维护了许多与生存分析相关的 R 包。
  • -Simon Urbanek 维护了 rJava、Rserve 等流行的 R 包,CRAN 核心团队成员,负责维护 R 软件中与 MacOS 平台相关的部分。
  • +Shannon T. Holloway 维护了许多与生存分析相关的 R 包。 +
  • +Simon Urbanek 维护了 rJava、Rserve 等流行的 R 包,R 核心团队成员,负责维护 R 软件中与 MacOS 平台相关的部分。
  • +
  • +Achim Zeileis 维护了 colorspace 等流行的 R 包,R 核心团队成员。
  • +
  • +Muhammad Yaseen 维护了多个与 Multiple Indicator Cluster Survey 相关的 R 包。
  • -Achim Zeileis 维护了 colorspace 等流行的 R 包,CRAN 核心团队成员。
  • -
  • Muhammad Yaseen 维护了多个与 Multiple Indicator Cluster Survey 相关的 R 包。
  • -
  • Pablo Sanchez 维护了多个与市场营销平台连接的 R 语言接口,Windsor.ai 组织成员。
  • +Pablo Sanchez 维护了多个与市场营销平台连接的 R 语言接口,Windsor.ai 组织成员。
  • Thomas Lin Pedersen 维护了 patchwork、 gganimate 和 ggraph 等流行的 R 包,RStudio 雇员。
  • -
  • Torsten Hothorn 在统计检验方面贡献了不少内容,比如 coin 和 multcomp 等包,CRAN 核心团队成员。
  • +
  • +Torsten Hothorn 在统计检验方面贡献了不少内容,比如 coin 和 multcomp 等包,R 核心团队成员。
  • Richard Cotton 维护了 assertive 和 rebus 系列 R 包,代码可读性检查。
  • -
  • Florian Schwendinger 维护了大量运筹优化方面的 R 包,扩展了 ROI 包的能力。
  • +
  • +Florian Schwendinger 维护了大量运筹优化方面的 R 包,扩展了 ROI 包的能力。
  • Guangchuang Yu 维护了 ggtree 和 ggimage 等 R 包,在生物信息和可视化领域有不少贡献。
  • Winston Chang 维护了 shiny 等流行的 R 包,RStudio 雇员。
  • John Muschelli 维护了多个关于神经图像的 R 包。
  • -
  • Kevin R. Coombes 维护了多个关于生物信息的 R 包,如 oompaBase 和 oompaData 等。
  • +
  • +Kevin R. Coombes 维护了多个关于生物信息的 R 包,如 oompaBase 和 oompaData 等。
  • Yihui Xie 维护了 knitr 、rmarkdown 等流行的 R 包,RStudio 雇员。
  • -
  • Carl Boettiger 维护了多个接口包,比如 rfishbase 等,rOpenSci 团队成员。
  • +
  • +Carl Boettiger 维护了多个接口包,比如 rfishbase 等,rOpenSci 团队成员。
  • Michael D. Sumner 维护了多个空间统计相关的 R 包。
  • @@ -1422,7 +1422,8 @@

    Georgi N. Boshnakov 维护了多个金融时间序列相关的 R 包,如 fGarch、timeDate 和 timeSeries 等包。

  • Hana Sevcikova 维护了多个与贝叶斯人口统计相关的 R 包。
  • -
  • Joe Thorley 维护了多个与贝叶斯 MCMC 相关的 R 包,Poisson Consulting 雇员。
  • +
  • +Joe Thorley 维护了多个与贝叶斯 MCMC 相关的 R 包,Poisson Consulting 雇员。
  • 统计开发者数量随维护 R 包数量的分布,可以发现:开发 1 个 R 包的开发者有 6732 人,开发 2 个 R 包的开发者有 1685 人,后者约为前者的四分之一,递减规律非常符合指数分布。

    @@ -1546,7 +1547,8 @@

    -
    pdb_authors_dt[ ,.(cnt = length(Package)) , by = c("Maintainer", "Authors")][cnt >= 10, ][order(cnt, decreasing = T), ]
    +
    pdb_authors_dt[ ,.(cnt = length(Package)) , by = c("Maintainer", "Authors")
    +                ][cnt >= 10, ][order(cnt, decreasing = T), ]
    #>                 Maintainer               Authors   cnt
     #>                     <char>                <char> <int>
    @@ -1602,7 +1604,8 @@ 

    Poisson Consulting 提供一系列用于计算生物学和统计生态学的 R 包和相关研究论文。
  • Amazon.com, Inc. 提供一系列用于存储、管理、操作等 Amazon 云服务的 R 包,形成一个 paws 套件。
  • -
  • Eli Lilly and Company 可能是 rOpenSci 的一员,赞助了旗下的 targets、jagstargets 等 R 包。
  • +
  • +Eli Lilly and Company 可能是 rOpenSci 的一员,赞助了旗下的 targets、jagstargets 等 R 包。
  • 最后,统计协作次数的分布,网络中边的权重的分布。

    @@ -1621,18 +1624,20 @@

    23.3.3 节点出入度分布

    下面简化这个网络,仅考虑贡献者也是维护者的情况,就是说网络中所有节点既是维护者也是贡献者,这会过滤掉组织机构、大量没有在 CRAN 发过 R 包的贡献者、从没给其它维护者做贡献的维护者。简化后,网络节点的出度、入度的分布图如下。

    # Maintainer 的入度
    -pdb_authors_net_indegree <- pdb_authors_dt[Authors %in% Maintainer, ][, .(in_degree = length(Authors)), by = "Maintainer"]
    -# Authors 的出度
    -pdb_authors_net_outdegree <- pdb_authors_dt[Authors %in% Maintainer, ][, .(out_degree = length(Maintainer)), by = "Authors"]
    -
    -ggplot(pdb_authors_net_indegree, aes(x = in_degree)) +
    -  geom_histogram(binwidth = 1) +
    -  geom_freqpoly(binwidth = 1) +
    -  theme_classic()
    -ggplot(pdb_authors_net_outdegree, aes(x = out_degree)) +
    -  geom_histogram(binwidth = 1) +
    -  geom_freqpoly(binwidth = 1) +
    -  theme_classic()
    +pdb_authors_net_indegree <- pdb_authors_dt[Authors %in% Maintainer, + ][, .(in_degree = length(Authors)), by = "Maintainer"] +# Authors 的出度 +pdb_authors_net_outdegree <- pdb_authors_dt[Authors %in% Maintainer, + ][, .(out_degree = length(Maintainer)), by = "Authors"] + +ggplot(pdb_authors_net_indegree, aes(x = in_degree)) + + geom_histogram(binwidth = 1) + + geom_freqpoly(binwidth = 1) + + theme_classic() +ggplot(pdb_authors_net_outdegree, aes(x = out_degree)) + + geom_histogram(binwidth = 1) + + geom_freqpoly(binwidth = 1) + + theme_classic()

    @@ -1667,8 +1672,9 @@

    # 边
    -pdb_authors_net_edge <- pdb_authors_dt[Authors %in% Maintainer, ][, .(edge_cnt = .N), by = c("Authors", "Maintainer")][edge_cnt > 1,]
    -pdb_authors_net_edge[order(edge_cnt, decreasing = TRUE),]
    +pdb_authors_net_edge <- pdb_authors_dt[Authors %in% Maintainer, + ][, .(edge_cnt = .N), by = c("Authors", "Maintainer")][edge_cnt > 1, ] +pdb_authors_net_edge[order(edge_cnt, decreasing = TRUE),]

    #>                      Authors            Maintainer edge_cnt
     #>                       <char>                <char>    <int>
    @@ -1685,8 +1691,9 @@ 

    # 顶点
    -pdb_authors_net_vertex <- pdb_authors_dt[, .(vertex_cnt = length(unique(Package))), by = "Maintainer"][Maintainer %in% c(pdb_authors_net_edge$Maintainer, pdb_authors_net_edge$Authors),]
    -pdb_authors_net_vertex[order(vertex_cnt, decreasing = TRUE),]

    +pdb_authors_net_vertex <- pdb_authors_dt[, .(vertex_cnt = length(unique(Package))), by = "Maintainer" + ][Maintainer %in% c(pdb_authors_net_edge$Maintainer, pdb_authors_net_edge$Authors),] +pdb_authors_net_vertex[order(vertex_cnt, decreasing = TRUE),]
    #>                Maintainer vertex_cnt
     #>                    <char>      <int>
    diff --git a/analyze-network-data_files/figure-pdf/fig-active-maintainer-1.pdf b/analyze-network-data_files/figure-pdf/fig-active-maintainer-1.pdf
    index 80cff474..88dc80e1 100644
    Binary files a/analyze-network-data_files/figure-pdf/fig-active-maintainer-1.pdf and b/analyze-network-data_files/figure-pdf/fig-active-maintainer-1.pdf differ
    diff --git a/analyze-network-data_files/figure-pdf/fig-dist-maintainer-1.pdf b/analyze-network-data_files/figure-pdf/fig-dist-maintainer-1.pdf
    index 9fc5c793..8cdf10cd 100644
    Binary files a/analyze-network-data_files/figure-pdf/fig-dist-maintainer-1.pdf and b/analyze-network-data_files/figure-pdf/fig-dist-maintainer-1.pdf differ
    diff --git a/analyze-network-data_files/figure-pdf/fig-dist-maintainer-2.pdf b/analyze-network-data_files/figure-pdf/fig-dist-maintainer-2.pdf
    index 86ab4486..691e9685 100644
    Binary files a/analyze-network-data_files/figure-pdf/fig-dist-maintainer-2.pdf and b/analyze-network-data_files/figure-pdf/fig-dist-maintainer-2.pdf differ
    diff --git a/analyze-network-data_files/figure-pdf/fig-network-cluster-1.pdf b/analyze-network-data_files/figure-pdf/fig-network-cluster-1.pdf
    index d2d21184..ca077916 100644
    Binary files a/analyze-network-data_files/figure-pdf/fig-network-cluster-1.pdf and b/analyze-network-data_files/figure-pdf/fig-network-cluster-1.pdf differ
    diff --git a/analyze-network-data_files/figure-pdf/fig-network-degree-1.pdf b/analyze-network-data_files/figure-pdf/fig-network-degree-1.pdf
    index 182e20f8..db88945a 100644
    Binary files a/analyze-network-data_files/figure-pdf/fig-network-degree-1.pdf and b/analyze-network-data_files/figure-pdf/fig-network-degree-1.pdf differ
    diff --git a/analyze-network-data_files/figure-pdf/fig-network-degree-2.pdf b/analyze-network-data_files/figure-pdf/fig-network-degree-2.pdf
    index 59f496b8..94599475 100644
    Binary files a/analyze-network-data_files/figure-pdf/fig-network-degree-2.pdf and b/analyze-network-data_files/figure-pdf/fig-network-degree-2.pdf differ
    diff --git a/analyze-network-data_files/figure-pdf/fig-network-ggraph-1.pdf b/analyze-network-data_files/figure-pdf/fig-network-ggraph-1.pdf
    index b2e2a220..c965463d 100644
    Binary files a/analyze-network-data_files/figure-pdf/fig-network-ggraph-1.pdf and b/analyze-network-data_files/figure-pdf/fig-network-ggraph-1.pdf differ
    diff --git a/analyze-network-data_files/figure-pdf/fig-network-igraph-1.pdf b/analyze-network-data_files/figure-pdf/fig-network-igraph-1.pdf
    index 15e88572..3334e9aa 100644
    Binary files a/analyze-network-data_files/figure-pdf/fig-network-igraph-1.pdf and b/analyze-network-data_files/figure-pdf/fig-network-igraph-1.pdf differ
    diff --git a/analyze-network-data_files/figure-pdf/fig-network-visnetwork-1.pdf b/analyze-network-data_files/figure-pdf/fig-network-visnetwork-1.pdf
    index 4279ea58..70f35a57 100644
    Binary files a/analyze-network-data_files/figure-pdf/fig-network-visnetwork-1.pdf and b/analyze-network-data_files/figure-pdf/fig-network-visnetwork-1.pdf differ
    diff --git a/analyze-network-data_files/figure-pdf/fig-top-maintainer-1.pdf b/analyze-network-data_files/figure-pdf/fig-top-maintainer-1.pdf
    index c7f00b7c..443b67c2 100644
    Binary files a/analyze-network-data_files/figure-pdf/fig-top-maintainer-1.pdf and b/analyze-network-data_files/figure-pdf/fig-top-maintainer-1.pdf differ
    diff --git a/analyze-network-data_files/figure-pdf/fig-updated-package-1.pdf b/analyze-network-data_files/figure-pdf/fig-updated-package-1.pdf
    index db2b8f89..05579e18 100644
    Binary files a/analyze-network-data_files/figure-pdf/fig-updated-package-1.pdf and b/analyze-network-data_files/figure-pdf/fig-updated-package-1.pdf differ
    diff --git a/analyze-point-pattern_files/figure-pdf/fig-convex-hull-1.pdf b/analyze-point-pattern_files/figure-pdf/fig-convex-hull-1.pdf
    index 159ca1a8..0c90d3d7 100644
    Binary files a/analyze-point-pattern_files/figure-pdf/fig-convex-hull-1.pdf and b/analyze-point-pattern_files/figure-pdf/fig-convex-hull-1.pdf differ
    diff --git a/analyze-point-pattern_files/figure-pdf/fig-convex-hull-2.pdf b/analyze-point-pattern_files/figure-pdf/fig-convex-hull-2.pdf
    index 22e10b20..7ae61e55 100644
    Binary files a/analyze-point-pattern_files/figure-pdf/fig-convex-hull-2.pdf and b/analyze-point-pattern_files/figure-pdf/fig-convex-hull-2.pdf differ
    diff --git a/analyze-point-pattern_files/figure-pdf/fig-kernel-heatmap-1.pdf b/analyze-point-pattern_files/figure-pdf/fig-kernel-heatmap-1.pdf
    index 3801ac34..4427f16d 100644
    Binary files a/analyze-point-pattern_files/figure-pdf/fig-kernel-heatmap-1.pdf and b/analyze-point-pattern_files/figure-pdf/fig-kernel-heatmap-1.pdf differ
    diff --git a/analyze-point-pattern_files/figure-pdf/fig-kernel-heatmap-2.pdf b/analyze-point-pattern_files/figure-pdf/fig-kernel-heatmap-2.pdf
    index 78189635..784f48bd 100644
    Binary files a/analyze-point-pattern_files/figure-pdf/fig-kernel-heatmap-2.pdf and b/analyze-point-pattern_files/figure-pdf/fig-kernel-heatmap-2.pdf differ
    diff --git a/analyze-point-pattern_files/figure-pdf/fig-quakes-ggplot2-grid-1.pdf b/analyze-point-pattern_files/figure-pdf/fig-quakes-ggplot2-grid-1.pdf
    index 5eeb19fd..3e696f3e 100644
    Binary files a/analyze-point-pattern_files/figure-pdf/fig-quakes-ggplot2-grid-1.pdf and b/analyze-point-pattern_files/figure-pdf/fig-quakes-ggplot2-grid-1.pdf differ
    diff --git a/analyze-point-pattern_files/figure-pdf/fig-quakes-ggplot2-grid-2.pdf b/analyze-point-pattern_files/figure-pdf/fig-quakes-ggplot2-grid-2.pdf
    index a65ec997..6cb7b05d 100644
    Binary files a/analyze-point-pattern_files/figure-pdf/fig-quakes-ggplot2-grid-2.pdf and b/analyze-point-pattern_files/figure-pdf/fig-quakes-ggplot2-grid-2.pdf differ
    diff --git a/analyze-spatial-data_files/figure-pdf/fig-matern-fun-1.pdf b/analyze-spatial-data_files/figure-pdf/fig-matern-fun-1.pdf
    index f2410e45..81bf1ed4 100644
    Binary files a/analyze-spatial-data_files/figure-pdf/fig-matern-fun-1.pdf and b/analyze-spatial-data_files/figure-pdf/fig-matern-fun-1.pdf differ
    diff --git a/analyze-spatial-data_files/figure-pdf/fig-point-to-polygon-1.pdf b/analyze-spatial-data_files/figure-pdf/fig-point-to-polygon-1.pdf
    index 8682705c..1bfa6bd7 100644
    Binary files a/analyze-spatial-data_files/figure-pdf/fig-point-to-polygon-1.pdf and b/analyze-spatial-data_files/figure-pdf/fig-point-to-polygon-1.pdf differ
    diff --git a/analyze-spatial-data_files/figure-pdf/fig-point-to-polygon-2.pdf b/analyze-spatial-data_files/figure-pdf/fig-point-to-polygon-2.pdf
    index 52d8bffe..f1d4faef 100644
    Binary files a/analyze-spatial-data_files/figure-pdf/fig-point-to-polygon-2.pdf and b/analyze-spatial-data_files/figure-pdf/fig-point-to-polygon-2.pdf differ
    diff --git a/analyze-spatial-data_files/figure-pdf/fig-poisson-residuals-1.pdf b/analyze-spatial-data_files/figure-pdf/fig-poisson-residuals-1.pdf
    index 634951f6..1ac6c6a2 100644
    Binary files a/analyze-spatial-data_files/figure-pdf/fig-poisson-residuals-1.pdf and b/analyze-spatial-data_files/figure-pdf/fig-poisson-residuals-1.pdf differ
    diff --git a/analyze-spatial-data_files/figure-pdf/fig-poisson-residuals-2.pdf b/analyze-spatial-data_files/figure-pdf/fig-poisson-residuals-2.pdf
    index 8f2b5c37..c2982c50 100644
    Binary files a/analyze-spatial-data_files/figure-pdf/fig-poisson-residuals-2.pdf and b/analyze-spatial-data_files/figure-pdf/fig-poisson-residuals-2.pdf differ
    diff --git a/analyze-spatial-data_files/figure-pdf/fig-poisson-residuals-3.pdf b/analyze-spatial-data_files/figure-pdf/fig-poisson-residuals-3.pdf
    index 347411f9..0b59ad86 100644
    Binary files a/analyze-spatial-data_files/figure-pdf/fig-poisson-residuals-3.pdf and b/analyze-spatial-data_files/figure-pdf/fig-poisson-residuals-3.pdf differ
    diff --git a/analyze-spatial-data_files/figure-pdf/fig-rongelap-atoll-1.pdf b/analyze-spatial-data_files/figure-pdf/fig-rongelap-atoll-1.pdf
    index 2b979d4c..0ca23fdc 100644
    Binary files a/analyze-spatial-data_files/figure-pdf/fig-rongelap-atoll-1.pdf and b/analyze-spatial-data_files/figure-pdf/fig-rongelap-atoll-1.pdf differ
    diff --git a/analyze-spatial-data_files/figure-pdf/fig-rongelap-buffer-1.pdf b/analyze-spatial-data_files/figure-pdf/fig-rongelap-buffer-1.pdf
    index 40af7c05..92d782e3 100644
    Binary files a/analyze-spatial-data_files/figure-pdf/fig-rongelap-buffer-1.pdf and b/analyze-spatial-data_files/figure-pdf/fig-rongelap-buffer-1.pdf differ
    diff --git a/analyze-spatial-data_files/figure-pdf/fig-rongelap-coastline-grid-1.pdf b/analyze-spatial-data_files/figure-pdf/fig-rongelap-coastline-grid-1.pdf
    index e8dd53f5..f400caf2 100644
    Binary files a/analyze-spatial-data_files/figure-pdf/fig-rongelap-coastline-grid-1.pdf and b/analyze-spatial-data_files/figure-pdf/fig-rongelap-coastline-grid-1.pdf differ
    diff --git a/analyze-spatial-data_files/figure-pdf/fig-rongelap-concentration-1.pdf b/analyze-spatial-data_files/figure-pdf/fig-rongelap-concentration-1.pdf
    index c826caa4..8752513e 100644
    Binary files a/analyze-spatial-data_files/figure-pdf/fig-rongelap-concentration-1.pdf and b/analyze-spatial-data_files/figure-pdf/fig-rongelap-concentration-1.pdf differ
    diff --git a/analyze-spatial-data_files/figure-pdf/fig-rongelap-grid-1.pdf b/analyze-spatial-data_files/figure-pdf/fig-rongelap-grid-1.pdf
    index b89da207..c454ea46 100644
    Binary files a/analyze-spatial-data_files/figure-pdf/fig-rongelap-grid-1.pdf and b/analyze-spatial-data_files/figure-pdf/fig-rongelap-grid-1.pdf differ
    diff --git a/analyze-spatial-data_files/figure-pdf/fig-rongelap-location-1.pdf b/analyze-spatial-data_files/figure-pdf/fig-rongelap-location-1.pdf
    index 33ba1283..f1372b40 100644
    Binary files a/analyze-spatial-data_files/figure-pdf/fig-rongelap-location-1.pdf and b/analyze-spatial-data_files/figure-pdf/fig-rongelap-location-1.pdf differ
    diff --git a/analyze-spatial-data_files/figure-pdf/fig-rongelap-location-2.pdf b/analyze-spatial-data_files/figure-pdf/fig-rongelap-location-2.pdf
    index 9454c707..290d8d89 100644
    Binary files a/analyze-spatial-data_files/figure-pdf/fig-rongelap-location-2.pdf and b/analyze-spatial-data_files/figure-pdf/fig-rongelap-location-2.pdf differ
    diff --git a/analyze-spatial-data_files/figure-pdf/fig-rongelap-location-zoom-1.pdf b/analyze-spatial-data_files/figure-pdf/fig-rongelap-location-zoom-1.pdf
    index cb84fd09..ab005923 100644
    Binary files a/analyze-spatial-data_files/figure-pdf/fig-rongelap-location-zoom-1.pdf and b/analyze-spatial-data_files/figure-pdf/fig-rongelap-location-zoom-1.pdf differ
    diff --git a/analyze-spatial-data_files/figure-pdf/fig-rongelap-poisson-residuals-1.pdf b/analyze-spatial-data_files/figure-pdf/fig-rongelap-poisson-residuals-1.pdf
    index 0113cac8..0eda167c 100644
    Binary files a/analyze-spatial-data_files/figure-pdf/fig-rongelap-poisson-residuals-1.pdf and b/analyze-spatial-data_files/figure-pdf/fig-rongelap-poisson-residuals-1.pdf differ
    diff --git a/analyze-spatial-data_files/figure-pdf/fig-rongelap-pred-1.pdf b/analyze-spatial-data_files/figure-pdf/fig-rongelap-pred-1.pdf
    index c5ae65ac..353edc41 100644
    Binary files a/analyze-spatial-data_files/figure-pdf/fig-rongelap-pred-1.pdf and b/analyze-spatial-data_files/figure-pdf/fig-rongelap-pred-1.pdf differ
    diff --git a/analyze-spatial-data_files/figure-pdf/fig-rongelap-pred-sp-1.pdf b/analyze-spatial-data_files/figure-pdf/fig-rongelap-pred-sp-1.pdf
    index 102035f0..42d53c95 100644
    Binary files a/analyze-spatial-data_files/figure-pdf/fig-rongelap-pred-sp-1.pdf and b/analyze-spatial-data_files/figure-pdf/fig-rongelap-pred-sp-1.pdf differ
    diff --git a/analyze-spatial-data_files/figure-pdf/fig-rongelap-var-1.pdf b/analyze-spatial-data_files/figure-pdf/fig-rongelap-var-1.pdf
    index 3b432527..c61c5639 100644
    Binary files a/analyze-spatial-data_files/figure-pdf/fig-rongelap-var-1.pdf and b/analyze-spatial-data_files/figure-pdf/fig-rongelap-var-1.pdf differ
    diff --git a/analyze-spatial-data_files/figure-pdf/fig-rongelap-vario-1.pdf b/analyze-spatial-data_files/figure-pdf/fig-rongelap-vario-1.pdf
    index ef40d51d..632b6501 100644
    Binary files a/analyze-spatial-data_files/figure-pdf/fig-rongelap-vario-1.pdf and b/analyze-spatial-data_files/figure-pdf/fig-rongelap-vario-1.pdf differ
    diff --git a/analyze-spatial-data_files/figure-pdf/fig-rongelap-vario-norm-1.pdf b/analyze-spatial-data_files/figure-pdf/fig-rongelap-vario-norm-1.pdf
    index 3651a598..b13aa44b 100644
    Binary files a/analyze-spatial-data_files/figure-pdf/fig-rongelap-vario-norm-1.pdf and b/analyze-spatial-data_files/figure-pdf/fig-rongelap-vario-norm-1.pdf differ
    diff --git a/analyze-spatial-data_files/figure-pdf/fig-rongelap-vario-theory-1.pdf b/analyze-spatial-data_files/figure-pdf/fig-rongelap-vario-theory-1.pdf
    index a17bf302..a34d4232 100644
    Binary files a/analyze-spatial-data_files/figure-pdf/fig-rongelap-vario-theory-1.pdf and b/analyze-spatial-data_files/figure-pdf/fig-rongelap-vario-theory-1.pdf differ
    diff --git a/analyze-survival-data.html b/analyze-survival-data.html
    index dfd63cb0..c7eb8b6d 100644
    --- a/analyze-survival-data.html
    +++ b/analyze-survival-data.html
    @@ -785,8 +785,8 @@ 

    Fixed effects:
                      mean    sd 0.025quant 0.5quant 0.975quant   mode kld
    -(Intercept)    -4.172 0.376     -4.910   -4.172     -3.434 -4.172   0
    -xNonmaintained  0.983 0.482      0.038    0.983      1.929  0.983   0
    +(Intercept)    -4.173 0.378     -4.913   -4.173     -3.432 -4.173   0
    +xNonmaintained  0.984 0.483      0.036    0.984      1.931  0.984   0
     
      is computed 

    diff --git a/analyze-survival-data_files/figure-html/fig-aml-1.png b/analyze-survival-data_files/figure-html/fig-aml-1.png index 7c58953f..4aea89cf 100644 Binary files a/analyze-survival-data_files/figure-html/fig-aml-1.png and b/analyze-survival-data_files/figure-html/fig-aml-1.png differ diff --git a/analyze-survival-data_files/figure-pdf/fig-aml-1.pdf b/analyze-survival-data_files/figure-pdf/fig-aml-1.pdf index be79b9a4..ab311305 100644 Binary files a/analyze-survival-data_files/figure-pdf/fig-aml-1.pdf and b/analyze-survival-data_files/figure-pdf/fig-aml-1.pdf differ diff --git a/analyze-survival-data_files/figure-pdf/fig-leukemia-surv-1.pdf b/analyze-survival-data_files/figure-pdf/fig-leukemia-surv-1.pdf index bbdc37d3..63bd1edd 100644 Binary files a/analyze-survival-data_files/figure-pdf/fig-leukemia-surv-1.pdf and b/analyze-survival-data_files/figure-pdf/fig-leukemia-surv-1.pdf differ diff --git a/analyze-text-data.html b/analyze-text-data.html index fab55ee8..01de50d2 100644 --- a/analyze-text-data.html +++ b/analyze-text-data.html @@ -778,8 +778,8 @@

    n_check_convergence = 25, progressbar = FALSE )

    -
    #> INFO  [06:14:42.698] early stopping at 175 iteration
    -#> INFO  [06:14:43.321] early stopping at 50 iteration
    +
    #> INFO  [05:04:54.962] early stopping at 175 iteration
    +#> INFO  [05:04:55.568] early stopping at 50 iteration

    下图展示主题的分布,各个主题及其所占比例。

    diff --git a/analyze-text-data_files/figure-pdf/fig-topic-distr-1.pdf b/analyze-text-data_files/figure-pdf/fig-topic-distr-1.pdf index 4b04e228..e980c075 100644 Binary files a/analyze-text-data_files/figure-pdf/fig-topic-distr-1.pdf and b/analyze-text-data_files/figure-pdf/fig-topic-distr-1.pdf differ diff --git a/analyze-text-data_files/figure-pdf/fig-yihui-cn-1.pdf b/analyze-text-data_files/figure-pdf/fig-yihui-cn-1.pdf index f162d65c..b9d69159 100644 Binary files a/analyze-text-data_files/figure-pdf/fig-yihui-cn-1.pdf and b/analyze-text-data_files/figure-pdf/fig-yihui-cn-1.pdf differ diff --git a/analyze-text-data_files/figure-pdf/fig-yihui-wordcloud-1.pdf b/analyze-text-data_files/figure-pdf/fig-yihui-wordcloud-1.pdf index ad3da7b9..017f40e9 100644 Binary files a/analyze-text-data_files/figure-pdf/fig-yihui-wordcloud-1.pdf and b/analyze-text-data_files/figure-pdf/fig-yihui-wordcloud-1.pdf differ diff --git a/analyze-time-series-data.html b/analyze-time-series-data.html index 0eb31082..04638cd4 100644 --- a/analyze-time-series-data.html +++ b/analyze-time-series-data.html @@ -896,8 +896,8 @@

    dyUnzoom()
    -
    - +
    +
    图 26.6: 美团股价变化趋势 diff --git a/analyze-time-series-data_files/figure-pdf/fig-airpassengers-acf-1.pdf b/analyze-time-series-data_files/figure-pdf/fig-airpassengers-acf-1.pdf index 35b47a35..27e8f98d 100644 Binary files a/analyze-time-series-data_files/figure-pdf/fig-airpassengers-acf-1.pdf and b/analyze-time-series-data_files/figure-pdf/fig-airpassengers-acf-1.pdf differ diff --git a/analyze-time-series-data_files/figure-pdf/fig-airpassengers-add-fitted-1.pdf b/analyze-time-series-data_files/figure-pdf/fig-airpassengers-add-fitted-1.pdf index e924a0ed..f78ae3ab 100644 Binary files a/analyze-time-series-data_files/figure-pdf/fig-airpassengers-add-fitted-1.pdf and b/analyze-time-series-data_files/figure-pdf/fig-airpassengers-add-fitted-1.pdf differ diff --git a/analyze-time-series-data_files/figure-pdf/fig-airpassengers-adjusted-1.pdf b/analyze-time-series-data_files/figure-pdf/fig-airpassengers-adjusted-1.pdf index 7241581b..8dfcdeb6 100644 Binary files a/analyze-time-series-data_files/figure-pdf/fig-airpassengers-adjusted-1.pdf and b/analyze-time-series-data_files/figure-pdf/fig-airpassengers-adjusted-1.pdf differ diff --git a/analyze-time-series-data_files/figure-pdf/fig-airpassengers-decomp-1.pdf b/analyze-time-series-data_files/figure-pdf/fig-airpassengers-decomp-1.pdf index 0a048a2e..fb081c85 100644 Binary files a/analyze-time-series-data_files/figure-pdf/fig-airpassengers-decomp-1.pdf and b/analyze-time-series-data_files/figure-pdf/fig-airpassengers-decomp-1.pdf differ diff --git a/analyze-time-series-data_files/figure-pdf/fig-airpassengers-exp-fitted-1.pdf b/analyze-time-series-data_files/figure-pdf/fig-airpassengers-exp-fitted-1.pdf index 98e2dcdd..78dcc98c 100644 Binary files a/analyze-time-series-data_files/figure-pdf/fig-airpassengers-exp-fitted-1.pdf and b/analyze-time-series-data_files/figure-pdf/fig-airpassengers-exp-fitted-1.pdf differ diff --git a/analyze-time-series-data_files/figure-pdf/fig-airpassengers-exp-pred-1.pdf 
b/analyze-time-series-data_files/figure-pdf/fig-airpassengers-exp-pred-1.pdf index 9091496c..806ef646 100644 Binary files a/analyze-time-series-data_files/figure-pdf/fig-airpassengers-exp-pred-1.pdf and b/analyze-time-series-data_files/figure-pdf/fig-airpassengers-exp-pred-1.pdf differ diff --git a/analyze-time-series-data_files/figure-pdf/fig-airpassengers-holt-fitted-1.pdf b/analyze-time-series-data_files/figure-pdf/fig-airpassengers-holt-fitted-1.pdf index 8fdc680f..86dff806 100644 Binary files a/analyze-time-series-data_files/figure-pdf/fig-airpassengers-holt-fitted-1.pdf and b/analyze-time-series-data_files/figure-pdf/fig-airpassengers-holt-fitted-1.pdf differ diff --git a/analyze-time-series-data_files/figure-pdf/fig-airpassengers-mult-fitted-1.pdf b/analyze-time-series-data_files/figure-pdf/fig-airpassengers-mult-fitted-1.pdf index d7b27ef9..d46420a0 100644 Binary files a/analyze-time-series-data_files/figure-pdf/fig-airpassengers-mult-fitted-1.pdf and b/analyze-time-series-data_files/figure-pdf/fig-airpassengers-mult-fitted-1.pdf differ diff --git a/analyze-time-series-data_files/figure-pdf/fig-airpassengers-pacf-1.pdf b/analyze-time-series-data_files/figure-pdf/fig-airpassengers-pacf-1.pdf index 968acf19..d3e03d57 100644 Binary files a/analyze-time-series-data_files/figure-pdf/fig-airpassengers-pacf-1.pdf and b/analyze-time-series-data_files/figure-pdf/fig-airpassengers-pacf-1.pdf differ diff --git a/analyze-time-series-data_files/figure-pdf/fig-airpassengers-stl-1.pdf b/analyze-time-series-data_files/figure-pdf/fig-airpassengers-stl-1.pdf index 2145bc6a..df503f34 100644 Binary files a/analyze-time-series-data_files/figure-pdf/fig-airpassengers-stl-1.pdf and b/analyze-time-series-data_files/figure-pdf/fig-airpassengers-stl-1.pdf differ diff --git a/analyze-time-series-data_files/figure-pdf/fig-meituan-by-year-1.pdf b/analyze-time-series-data_files/figure-pdf/fig-meituan-by-year-1.pdf index 56a1fdb8..ea4082e7 100644 Binary files 
a/analyze-time-series-data_files/figure-pdf/fig-meituan-by-year-1.pdf and b/analyze-time-series-data_files/figure-pdf/fig-meituan-by-year-1.pdf differ diff --git a/analyze-time-series-data_files/figure-pdf/fig-meituan-ggfortify-1.pdf b/analyze-time-series-data_files/figure-pdf/fig-meituan-ggfortify-1.pdf index 17d5844a..9f625a80 100644 Binary files a/analyze-time-series-data_files/figure-pdf/fig-meituan-ggfortify-1.pdf and b/analyze-time-series-data_files/figure-pdf/fig-meituan-ggfortify-1.pdf differ diff --git a/analyze-time-series-data_files/figure-pdf/fig-meituan-ggplot2-1.pdf b/analyze-time-series-data_files/figure-pdf/fig-meituan-ggplot2-1.pdf index 12d81aa0..4c8a623d 100644 Binary files a/analyze-time-series-data_files/figure-pdf/fig-meituan-ggplot2-1.pdf and b/analyze-time-series-data_files/figure-pdf/fig-meituan-ggplot2-1.pdf differ diff --git a/analyze-time-series-data_files/figure-pdf/fig-meituan-plot-1.pdf b/analyze-time-series-data_files/figure-pdf/fig-meituan-plot-1.pdf index eba56afd..d7cb7d25 100644 Binary files a/analyze-time-series-data_files/figure-pdf/fig-meituan-plot-1.pdf and b/analyze-time-series-data_files/figure-pdf/fig-meituan-plot-1.pdf differ diff --git a/analyze-time-series-data_files/figure-pdf/fig-meituan-plot-xts-1.pdf b/analyze-time-series-data_files/figure-pdf/fig-meituan-plot-xts-1.pdf index ff1dadbd..8cec86c6 100644 Binary files a/analyze-time-series-data_files/figure-pdf/fig-meituan-plot-xts-1.pdf and b/analyze-time-series-data_files/figure-pdf/fig-meituan-plot-xts-1.pdf differ diff --git a/categorical-data-analysis_files/figure-pdf/fig-titanic-alluvial-1.pdf b/categorical-data-analysis_files/figure-pdf/fig-titanic-alluvial-1.pdf index 711712b3..a614988a 100644 Binary files a/categorical-data-analysis_files/figure-pdf/fig-titanic-alluvial-1.pdf and b/categorical-data-analysis_files/figure-pdf/fig-titanic-alluvial-1.pdf differ diff --git a/categorical-data-analysis_files/figure-pdf/fig-titanic-ggstats-1.pdf 
b/categorical-data-analysis_files/figure-pdf/fig-titanic-ggstats-1.pdf index 796fca29..254ff534 100644 Binary files a/categorical-data-analysis_files/figure-pdf/fig-titanic-ggstats-1.pdf and b/categorical-data-analysis_files/figure-pdf/fig-titanic-ggstats-1.pdf differ diff --git a/categorical-data-analysis_files/figure-pdf/fig-titanic-mosaic-1.pdf b/categorical-data-analysis_files/figure-pdf/fig-titanic-mosaic-1.pdf index 70ab54a9..a8d2f686 100644 Binary files a/categorical-data-analysis_files/figure-pdf/fig-titanic-mosaic-1.pdf and b/categorical-data-analysis_files/figure-pdf/fig-titanic-mosaic-1.pdf differ diff --git a/categorical-data-analysis_files/figure-pdf/fig-titanic-mosaicplot-1.pdf b/categorical-data-analysis_files/figure-pdf/fig-titanic-mosaicplot-1.pdf index 1016e0b8..97165e41 100644 Binary files a/categorical-data-analysis_files/figure-pdf/fig-titanic-mosaicplot-1.pdf and b/categorical-data-analysis_files/figure-pdf/fig-titanic-mosaicplot-1.pdf differ diff --git a/categorical-data-analysis_files/figure-pdf/fig-ucb-admissions-fourfoldplot-1.pdf b/categorical-data-analysis_files/figure-pdf/fig-ucb-admissions-fourfoldplot-1.pdf index d6530a0d..6daae90d 100644 Binary files a/categorical-data-analysis_files/figure-pdf/fig-ucb-admissions-fourfoldplot-1.pdf and b/categorical-data-analysis_files/figure-pdf/fig-ucb-admissions-fourfoldplot-1.pdf differ diff --git a/categorical-data-analysis_files/figure-pdf/fig-ucb-admissions-mosaicplot-1.pdf b/categorical-data-analysis_files/figure-pdf/fig-ucb-admissions-mosaicplot-1.pdf index aca7c83a..9cc78287 100644 Binary files a/categorical-data-analysis_files/figure-pdf/fig-ucb-admissions-mosaicplot-1.pdf and b/categorical-data-analysis_files/figure-pdf/fig-ucb-admissions-mosaicplot-1.pdf differ diff --git a/classification-problems.html b/classification-problems.html index 2389df14..518beedf 100644 --- a/classification-problems.html +++ b/classification-problems.html @@ -1010,11 +1010,11 @@

    @@ -1064,20 +1064,20 @@

    a 4-4-3 network with 35 weights
     options were - softmax modelling 
    - b->h1 i1->h1 i2->h1 i3->h1 i4->h1 
    -  0.14   0.26   0.60   0.76   1.01 
    - b->h2 i1->h2 i2->h2 i3->h2 i4->h2 
    -  0.72   3.10  11.52 -13.44  -6.45 
    - b->h3 i1->h3 i2->h3 i3->h3 i4->h3 
    --71.48 -86.52  38.38  86.85  52.74 
    - b->h4 i1->h4 i2->h4 i3->h4 i4->h4 
    -  3.07  -0.60   2.10  -0.40  -2.95 
    - b->o1 h1->o1 h2->o1 h3->o1 h4->o1 
    - -0.13  -0.24  30.32 -10.63   5.79 
    - b->o2 h1->o2 h2->o2 h3->o2 h4->o2 
    - 21.04  -0.40 -28.63 -18.75  16.31 
    - b->o3 h1->o3 h2->o3 h3->o3 h4->o3 
    --19.47   0.37  -1.86  29.78 -20.79 
    + b->h1 i1->h1 i2->h1 i3->h1 i4->h1 + 148.05 169.39 702.94 -1218.15 -537.34 + b->h2 i1->h2 i2->h2 i3->h2 i4->h2 + -51.93 -119.60 -91.22 276.26 130.38 + b->h3 i1->h3 i2->h3 i3->h3 i4->h3 +-1371.99 -76.88 -144.45 349.89 337.51 + b->h4 i1->h4 i2->h4 i3->h4 i4->h4 + -24.41 -191.75 -58.38 -229.31 -91.16 + b->o1 h1->o1 h2->o1 h3->o1 h4->o1 + 71.09 512.43 -293.22 -543.92 4.49 + b->o2 h1->o2 h2->o2 h3->o2 h4->o2 + -3.08 -325.66 368.75 16.87 37.73 + b->o3 h1->o3 h2->o3 h3->o3 h4->o3 + -67.71 -185.66 -74.37 528.55 -42.73

    size 隐藏层中的神经元数量

    diff --git a/classification-problems_files/figure-html/fig-iris-rf-1.png b/classification-problems_files/figure-html/fig-iris-rf-1.png index 2d95d721..626aa8fa 100644 Binary files a/classification-problems_files/figure-html/fig-iris-rf-1.png and b/classification-problems_files/figure-html/fig-iris-rf-1.png differ diff --git a/classification-problems_files/figure-html/fig-iris-vi-1.png b/classification-problems_files/figure-html/fig-iris-vi-1.png index 6b663b68..993e8611 100644 Binary files a/classification-problems_files/figure-html/fig-iris-vi-1.png and b/classification-problems_files/figure-html/fig-iris-vi-1.png differ diff --git a/classification-problems_files/figure-pdf/fig-iris-rf-1.pdf b/classification-problems_files/figure-pdf/fig-iris-rf-1.pdf index c2da30ac..ede143ce 100644 Binary files a/classification-problems_files/figure-pdf/fig-iris-rf-1.pdf and b/classification-problems_files/figure-pdf/fig-iris-rf-1.pdf differ diff --git a/classification-problems_files/figure-pdf/fig-iris-rpart-1.pdf b/classification-problems_files/figure-pdf/fig-iris-rpart-1.pdf index 41515e05..cee5c1ae 100644 Binary files a/classification-problems_files/figure-pdf/fig-iris-rpart-1.pdf and b/classification-problems_files/figure-pdf/fig-iris-rpart-1.pdf differ diff --git a/classification-problems_files/figure-pdf/fig-iris-vi-1.pdf b/classification-problems_files/figure-pdf/fig-iris-vi-1.pdf index 83d13d2a..6ccfc142 100644 Binary files a/classification-problems_files/figure-pdf/fig-iris-vi-1.pdf and b/classification-problems_files/figure-pdf/fig-iris-vi-1.pdf differ diff --git a/classification-problems_files/figure-pdf/fig-multinom-glmnet-1.pdf b/classification-problems_files/figure-pdf/fig-multinom-glmnet-1.pdf index 0537e319..4feeefaf 100644 Binary files a/classification-problems_files/figure-pdf/fig-multinom-glmnet-1.pdf and b/classification-problems_files/figure-pdf/fig-multinom-glmnet-1.pdf differ diff --git 
a/classification-problems_files/figure-pdf/fig-multinom-glmnet-2.pdf b/classification-problems_files/figure-pdf/fig-multinom-glmnet-2.pdf index 406f565a..0b32cd42 100644 Binary files a/classification-problems_files/figure-pdf/fig-multinom-glmnet-2.pdf and b/classification-problems_files/figure-pdf/fig-multinom-glmnet-2.pdf differ diff --git a/classification-problems_files/figure-pdf/fig-multinom-glmnet-3.pdf b/classification-problems_files/figure-pdf/fig-multinom-glmnet-3.pdf index 30b07e7a..c787041c 100644 Binary files a/classification-problems_files/figure-pdf/fig-multinom-glmnet-3.pdf and b/classification-problems_files/figure-pdf/fig-multinom-glmnet-3.pdf differ diff --git a/classification-problems_files/figure-pdf/fig-multinom-glmnet-4.pdf b/classification-problems_files/figure-pdf/fig-multinom-glmnet-4.pdf index 37d3caa1..f41857a4 100644 Binary files a/classification-problems_files/figure-pdf/fig-multinom-glmnet-4.pdf and b/classification-problems_files/figure-pdf/fig-multinom-glmnet-4.pdf differ diff --git a/common-statistical-tests_files/figure-html/fig-two-samples-means-1.pdf b/common-statistical-tests_files/figure-html/fig-two-samples-means-1.pdf index 695074a3..1756daf5 100644 Binary files a/common-statistical-tests_files/figure-html/fig-two-samples-means-1.pdf and b/common-statistical-tests_files/figure-html/fig-two-samples-means-1.pdf differ diff --git a/common-statistical-tests_files/figure-html/fig-two-samples-means-1.tex b/common-statistical-tests_files/figure-html/fig-two-samples-means-1.tex index 83894769..21a63a48 100644 --- a/common-statistical-tests_files/figure-html/fig-two-samples-means-1.tex +++ b/common-statistical-tests_files/figure-html/fig-two-samples-means-1.tex @@ -1,4 +1,4 @@ -% Created by tikzDevice version 0.12.6 on 2024-02-01 06:11:37 +% Created by tikzDevice version 0.12.6 on 2024-02-05 05:02:04 % !TEX encoding = UTF-8 Unicode \documentclass[tikz]{standalone} diff --git 
a/common-statistical-tests_files/figure-pdf/fig-ToothGrowth-interaction-plot-1.pdf b/common-statistical-tests_files/figure-pdf/fig-ToothGrowth-interaction-plot-1.pdf index d6ce1a91..959c945f 100644 Binary files a/common-statistical-tests_files/figure-pdf/fig-ToothGrowth-interaction-plot-1.pdf and b/common-statistical-tests_files/figure-pdf/fig-ToothGrowth-interaction-plot-1.pdf differ diff --git a/common-statistical-tests_files/figure-pdf/fig-chickwts-i-1.pdf b/common-statistical-tests_files/figure-pdf/fig-chickwts-i-1.pdf index 60232ddc..b9301627 100644 Binary files a/common-statistical-tests_files/figure-pdf/fig-chickwts-i-1.pdf and b/common-statistical-tests_files/figure-pdf/fig-chickwts-i-1.pdf differ diff --git a/common-statistical-tests_files/figure-pdf/fig-chickwts-ii-1.pdf b/common-statistical-tests_files/figure-pdf/fig-chickwts-ii-1.pdf index f298e398..fb785d82 100644 Binary files a/common-statistical-tests_files/figure-pdf/fig-chickwts-ii-1.pdf and b/common-statistical-tests_files/figure-pdf/fig-chickwts-ii-1.pdf differ diff --git a/common-statistical-tests_files/figure-pdf/fig-iris-ridgeline-1.pdf b/common-statistical-tests_files/figure-pdf/fig-iris-ridgeline-1.pdf index 07f32032..368d0455 100644 Binary files a/common-statistical-tests_files/figure-pdf/fig-iris-ridgeline-1.pdf and b/common-statistical-tests_files/figure-pdf/fig-iris-ridgeline-1.pdf differ diff --git a/common-statistical-tests_files/figure-pdf/fig-morley-1.pdf b/common-statistical-tests_files/figure-pdf/fig-morley-1.pdf index 6fe22b13..f07445d4 100644 Binary files a/common-statistical-tests_files/figure-pdf/fig-morley-1.pdf and b/common-statistical-tests_files/figure-pdf/fig-morley-1.pdf differ diff --git a/common-statistical-tests_files/figure-pdf/fig-plant-growth-1.pdf b/common-statistical-tests_files/figure-pdf/fig-plant-growth-1.pdf index 0df56eff..25a58200 100644 Binary files a/common-statistical-tests_files/figure-pdf/fig-plant-growth-1.pdf and 
b/common-statistical-tests_files/figure-pdf/fig-plant-growth-1.pdf differ diff --git a/common-statistical-tests_files/figure-pdf/fig-sleep-1.pdf b/common-statistical-tests_files/figure-pdf/fig-sleep-1.pdf index e0619863..a04b8595 100644 Binary files a/common-statistical-tests_files/figure-pdf/fig-sleep-1.pdf and b/common-statistical-tests_files/figure-pdf/fig-sleep-1.pdf differ diff --git a/common-statistical-tests_files/figure-pdf/fig-two-samples-means-1.pdf b/common-statistical-tests_files/figure-pdf/fig-two-samples-means-1.pdf index 0da6ba74..7dded41d 100644 Binary files a/common-statistical-tests_files/figure-pdf/fig-two-samples-means-1.pdf and b/common-statistical-tests_files/figure-pdf/fig-two-samples-means-1.pdf differ diff --git a/common-statistical-tests_files/figure-pdf/fig-two-samples-means-1.png b/common-statistical-tests_files/figure-pdf/fig-two-samples-means-1.png index 0c8c0846..f2499047 100644 Binary files a/common-statistical-tests_files/figure-pdf/fig-two-samples-means-1.png and b/common-statistical-tests_files/figure-pdf/fig-two-samples-means-1.png differ diff --git a/common-statistical-tests_files/figure-pdf/fig-two-samples-means-1.tex b/common-statistical-tests_files/figure-pdf/fig-two-samples-means-1.tex index 6374d58f..e9465c44 100644 --- a/common-statistical-tests_files/figure-pdf/fig-two-samples-means-1.tex +++ b/common-statistical-tests_files/figure-pdf/fig-two-samples-means-1.tex @@ -1,4 +1,4 @@ -% Created by tikzDevice version 0.12.6 on 2024-02-01 06:11:50 +% Created by tikzDevice version 0.12.6 on 2024-02-05 05:02:18 % !TEX encoding = UTF-8 Unicode \documentclass[tikz]{standalone} diff --git a/documents-html_files/figure-pdf/fig-iris-ggplot2-1.pdf b/documents-html_files/figure-pdf/fig-iris-ggplot2-1.pdf index e884cd8c..6623139b 100644 Binary files a/documents-html_files/figure-pdf/fig-iris-ggplot2-1.pdf and b/documents-html_files/figure-pdf/fig-iris-ggplot2-1.pdf differ diff --git 
a/gaussian-processes-regression_files/figure-pdf/fig-2d-gp-1.pdf b/gaussian-processes-regression_files/figure-pdf/fig-2d-gp-1.pdf index 45c87cb7..1435d256 100644 Binary files a/gaussian-processes-regression_files/figure-pdf/fig-2d-gp-1.pdf and b/gaussian-processes-regression_files/figure-pdf/fig-2d-gp-1.pdf differ diff --git a/gaussian-processes-regression_files/figure-pdf/fig-bivar-scatter-1.pdf b/gaussian-processes-regression_files/figure-pdf/fig-bivar-scatter-1.pdf index 62b30f62..ceb95712 100644 Binary files a/gaussian-processes-regression_files/figure-pdf/fig-bivar-scatter-1.pdf and b/gaussian-processes-regression_files/figure-pdf/fig-bivar-scatter-1.pdf differ diff --git a/gaussian-processes-regression_files/figure-pdf/fig-location-bayes-1.pdf b/gaussian-processes-regression_files/figure-pdf/fig-location-bayes-1.pdf index f86f4f70..4183f834 100644 Binary files a/gaussian-processes-regression_files/figure-pdf/fig-location-bayes-1.pdf and b/gaussian-processes-regression_files/figure-pdf/fig-location-bayes-1.pdf differ diff --git a/gaussian-processes-regression_files/figure-pdf/fig-rongelap-grid-1.pdf b/gaussian-processes-regression_files/figure-pdf/fig-rongelap-grid-1.pdf index b47d6425..ca311781 100644 Binary files a/gaussian-processes-regression_files/figure-pdf/fig-rongelap-grid-1.pdf and b/gaussian-processes-regression_files/figure-pdf/fig-rongelap-grid-1.pdf differ diff --git a/gaussian-processes-regression_files/figure-pdf/fig-rongelap-poisson-dens-1.pdf b/gaussian-processes-regression_files/figure-pdf/fig-rongelap-poisson-dens-1.pdf index 676ca05a..a72cf22e 100644 Binary files a/gaussian-processes-regression_files/figure-pdf/fig-rongelap-poisson-dens-1.pdf and b/gaussian-processes-regression_files/figure-pdf/fig-rongelap-poisson-dens-1.pdf differ diff --git a/gaussian-processes-regression_files/figure-pdf/fig-rongelap-poisson-fitted-1.pdf b/gaussian-processes-regression_files/figure-pdf/fig-rongelap-poisson-fitted-1.pdf index 7c3429ca..85fe1118 100644 
Binary files a/gaussian-processes-regression_files/figure-pdf/fig-rongelap-poisson-fitted-1.pdf and b/gaussian-processes-regression_files/figure-pdf/fig-rongelap-poisson-fitted-1.pdf differ diff --git a/gaussian-processes-regression_files/figure-pdf/fig-rongelap-poisson-ppcheck-1.pdf b/gaussian-processes-regression_files/figure-pdf/fig-rongelap-poisson-ppcheck-1.pdf index adc222fe..dfda1a46 100644 Binary files a/gaussian-processes-regression_files/figure-pdf/fig-rongelap-poisson-ppcheck-1.pdf and b/gaussian-processes-regression_files/figure-pdf/fig-rongelap-poisson-ppcheck-1.pdf differ diff --git a/gaussian-processes-regression_files/figure-pdf/fig-rongelap-poisson-pred-1.pdf b/gaussian-processes-regression_files/figure-pdf/fig-rongelap-poisson-pred-1.pdf index d62e4630..9973ac47 100644 Binary files a/gaussian-processes-regression_files/figure-pdf/fig-rongelap-poisson-pred-1.pdf and b/gaussian-processes-regression_files/figure-pdf/fig-rongelap-poisson-pred-1.pdf differ diff --git a/gaussian-processes-regression_files/figure-pdf/fig-rongelap-poisson-trace-1.pdf b/gaussian-processes-regression_files/figure-pdf/fig-rongelap-poisson-trace-1.pdf index 882086e2..5cbfcf02 100644 Binary files a/gaussian-processes-regression_files/figure-pdf/fig-rongelap-poisson-trace-1.pdf and b/gaussian-processes-regression_files/figure-pdf/fig-rongelap-poisson-trace-1.pdf differ diff --git a/gaussian-processes-regression_files/figure-pdf/fig-trace-dens-1.pdf b/gaussian-processes-regression_files/figure-pdf/fig-trace-dens-1.pdf index 874281ae..bc18dcbe 100644 Binary files a/gaussian-processes-regression_files/figure-pdf/fig-trace-dens-1.pdf and b/gaussian-processes-regression_files/figure-pdf/fig-trace-dens-1.pdf differ diff --git a/gaussian-processes-regression_files/figure-pdf/fig-trace-dens-2.pdf b/gaussian-processes-regression_files/figure-pdf/fig-trace-dens-2.pdf index be40ba78..d7f1d690 100644 Binary files a/gaussian-processes-regression_files/figure-pdf/fig-trace-dens-2.pdf and 
b/gaussian-processes-regression_files/figure-pdf/fig-trace-dens-2.pdf differ diff --git a/gaussian-processes-regression_files/figure-pdf/fig-trivar-bayes-1.pdf b/gaussian-processes-regression_files/figure-pdf/fig-trivar-bayes-1.pdf index c5099aff..33e477d0 100644 Binary files a/gaussian-processes-regression_files/figure-pdf/fig-trivar-bayes-1.pdf and b/gaussian-processes-regression_files/figure-pdf/fig-trivar-bayes-1.pdf differ diff --git a/generalized-additive-models.html b/generalized-additive-models.html index 5bb47fdf..6c220a1a 100644 --- a/generalized-additive-models.html +++ b/generalized-additive-models.html @@ -1354,14 +1354,14 @@

    res$summary.fixed
    #>        mean         sd 0.025quant 0.5quant 0.975quant     mode          kld
    -#> b0 1.828027 0.06147354   1.706422 1.828284   1.948169 1.828279 1.782545e-08
    +#> b0 1.828027 0.06147357 1.706422 1.828283 1.948169 1.828279 1.782558e-08
    # 超参数
     res$summary.hyperpar
    -
    #>                   mean        sd 0.025quant  0.5quant 0.975quant      mode
    -#> Theta1 for s  2.000684 0.0623506   1.876512  2.001169   2.122006  2.003209
    -#> Theta2 for s -4.851258 0.1297349  -5.105062 -4.851807  -4.594250 -4.854094
    +
    #>                   mean         sd 0.025quant  0.5quant 0.975quant      mode
    +#> Theta1 for s  2.000684 0.06235036   1.876512  2.001169   2.122006  2.003209
    +#> Theta2 for s -4.851258 0.12973214  -5.105057 -4.851807  -4.594256 -4.854095

    提取预测数据,并整理数据。

    diff --git a/generalized-additive-models_files/figure-html/fig-mcycle-smooths-2.png b/generalized-additive-models_files/figure-html/fig-mcycle-smooths-2.png index 7d1ebdc6..7a988f05 100644 Binary files a/generalized-additive-models_files/figure-html/fig-mcycle-smooths-2.png and b/generalized-additive-models_files/figure-html/fig-mcycle-smooths-2.png differ diff --git a/generalized-additive-models_files/figure-html/fig-rongelap-inla-1.png b/generalized-additive-models_files/figure-html/fig-rongelap-inla-1.png index 84bd6491..30aae5c4 100644 Binary files a/generalized-additive-models_files/figure-html/fig-rongelap-inla-1.png and b/generalized-additive-models_files/figure-html/fig-rongelap-inla-1.png differ diff --git a/generalized-additive-models_files/figure-pdf/fig-city-as-1.pdf b/generalized-additive-models_files/figure-pdf/fig-city-as-1.pdf index 8df313cb..5927b8cc 100644 Binary files a/generalized-additive-models_files/figure-pdf/fig-city-as-1.pdf and b/generalized-additive-models_files/figure-pdf/fig-city-as-1.pdf differ diff --git a/generalized-additive-models_files/figure-pdf/fig-city-as-2.pdf b/generalized-additive-models_files/figure-pdf/fig-city-as-2.pdf index c8eb03f8..95734cf5 100644 Binary files a/generalized-additive-models_files/figure-pdf/fig-city-as-2.pdf and b/generalized-additive-models_files/figure-pdf/fig-city-as-2.pdf differ diff --git a/generalized-additive-models_files/figure-pdf/fig-city-border-1.pdf b/generalized-additive-models_files/figure-pdf/fig-city-border-1.pdf index 10c6b6f8..fe1582c1 100644 Binary files a/generalized-additive-models_files/figure-pdf/fig-city-border-1.pdf and b/generalized-additive-models_files/figure-pdf/fig-city-border-1.pdf differ diff --git a/generalized-additive-models_files/figure-pdf/fig-city-data-1.pdf b/generalized-additive-models_files/figure-pdf/fig-city-data-1.pdf index 18d2fdcd..f0690b38 100644 Binary files a/generalized-additive-models_files/figure-pdf/fig-city-data-1.pdf and 
b/generalized-additive-models_files/figure-pdf/fig-city-data-1.pdf differ diff --git a/generalized-additive-models_files/figure-pdf/fig-city-hg-1.pdf b/generalized-additive-models_files/figure-pdf/fig-city-hg-1.pdf index 01c64555..83bc3edc 100644 Binary files a/generalized-additive-models_files/figure-pdf/fig-city-hg-1.pdf and b/generalized-additive-models_files/figure-pdf/fig-city-hg-1.pdf differ diff --git a/generalized-additive-models_files/figure-pdf/fig-city-topo-1.pdf b/generalized-additive-models_files/figure-pdf/fig-city-topo-1.pdf index 82099162..05a0e9b1 100644 Binary files a/generalized-additive-models_files/figure-pdf/fig-city-topo-1.pdf and b/generalized-additive-models_files/figure-pdf/fig-city-topo-1.pdf differ diff --git a/generalized-additive-models_files/figure-pdf/fig-mcycle-1.pdf b/generalized-additive-models_files/figure-pdf/fig-mcycle-1.pdf index 07c3e706..49deff8a 100644 Binary files a/generalized-additive-models_files/figure-pdf/fig-mcycle-1.pdf and b/generalized-additive-models_files/figure-pdf/fig-mcycle-1.pdf differ diff --git a/generalized-additive-models_files/figure-pdf/fig-mcycle-ggplot2-1.pdf b/generalized-additive-models_files/figure-pdf/fig-mcycle-ggplot2-1.pdf index c9dbf8e7..e4c444c8 100644 Binary files a/generalized-additive-models_files/figure-pdf/fig-mcycle-ggplot2-1.pdf and b/generalized-additive-models_files/figure-pdf/fig-mcycle-ggplot2-1.pdf differ diff --git a/generalized-additive-models_files/figure-pdf/fig-mcycle-smooths-1.pdf b/generalized-additive-models_files/figure-pdf/fig-mcycle-smooths-1.pdf index 066a7fd4..5f717ef2 100644 Binary files a/generalized-additive-models_files/figure-pdf/fig-mcycle-smooths-1.pdf and b/generalized-additive-models_files/figure-pdf/fig-mcycle-smooths-1.pdf differ diff --git a/generalized-additive-models_files/figure-pdf/fig-mcycle-smooths-2.pdf b/generalized-additive-models_files/figure-pdf/fig-mcycle-smooths-2.pdf index 9413d6d1..8d43a3d7 100644 Binary files 
a/generalized-additive-models_files/figure-pdf/fig-mcycle-smooths-2.pdf and b/generalized-additive-models_files/figure-pdf/fig-mcycle-smooths-2.pdf differ diff --git a/generalized-additive-models_files/figure-pdf/fig-mcycle-viz-1.pdf b/generalized-additive-models_files/figure-pdf/fig-mcycle-viz-1.pdf index 3403c3ea..81a7ccf2 100644 Binary files a/generalized-additive-models_files/figure-pdf/fig-mcycle-viz-1.pdf and b/generalized-additive-models_files/figure-pdf/fig-mcycle-viz-1.pdf differ diff --git a/generalized-additive-models_files/figure-pdf/fig-rongelap-inla-1.pdf b/generalized-additive-models_files/figure-pdf/fig-rongelap-inla-1.pdf index fa2fbedf..10013354 100644 Binary files a/generalized-additive-models_files/figure-pdf/fig-rongelap-inla-1.pdf and b/generalized-additive-models_files/figure-pdf/fig-rongelap-inla-1.pdf differ diff --git a/generalized-additive-models_files/figure-pdf/fig-rongelap-mgcv-gam-1.pdf b/generalized-additive-models_files/figure-pdf/fig-rongelap-mgcv-gam-1.pdf index 362f4c77..34d57025 100644 Binary files a/generalized-additive-models_files/figure-pdf/fig-rongelap-mgcv-gam-1.pdf and b/generalized-additive-models_files/figure-pdf/fig-rongelap-mgcv-gam-1.pdf differ diff --git a/generalized-additive-models_files/figure-pdf/fig-rongelap-mgcv-ginla-1.pdf b/generalized-additive-models_files/figure-pdf/fig-rongelap-mgcv-ginla-1.pdf index e1adcad9..7f0f7bd1 100644 Binary files a/generalized-additive-models_files/figure-pdf/fig-rongelap-mgcv-ginla-1.pdf and b/generalized-additive-models_files/figure-pdf/fig-rongelap-mgcv-ginla-1.pdf differ diff --git a/generalized-additive-models_files/figure-pdf/fig-rongelap-scatter3d-1.pdf b/generalized-additive-models_files/figure-pdf/fig-rongelap-scatter3d-1.pdf index 7c132f9b..ca1a1c77 100644 Binary files a/generalized-additive-models_files/figure-pdf/fig-rongelap-scatter3d-1.pdf and b/generalized-additive-models_files/figure-pdf/fig-rongelap-scatter3d-1.pdf differ diff --git 
a/generalized-linear-models_files/figure-pdf/fig-cdplot-orings-1.pdf b/generalized-linear-models_files/figure-pdf/fig-cdplot-orings-1.pdf index 191af9ca..bd987aee 100644 Binary files a/generalized-linear-models_files/figure-pdf/fig-cdplot-orings-1.pdf and b/generalized-linear-models_files/figure-pdf/fig-cdplot-orings-1.pdf differ diff --git a/generalized-linear-models_files/figure-pdf/fig-density-filled-1.pdf b/generalized-linear-models_files/figure-pdf/fig-density-filled-1.pdf index 8d28bc7b..e4e359b1 100644 Binary files a/generalized-linear-models_files/figure-pdf/fig-density-filled-1.pdf and b/generalized-linear-models_files/figure-pdf/fig-density-filled-1.pdf differ diff --git a/generalized-linear-models_files/figure-pdf/fig-esoph-1.pdf b/generalized-linear-models_files/figure-pdf/fig-esoph-1.pdf index ab6260d8..21b713d8 100644 Binary files a/generalized-linear-models_files/figure-pdf/fig-esoph-1.pdf and b/generalized-linear-models_files/figure-pdf/fig-esoph-1.pdf differ diff --git a/generalized-linear-models_files/figure-pdf/fig-stan-areas-1.pdf b/generalized-linear-models_files/figure-pdf/fig-stan-areas-1.pdf index 3a4c4bf0..4c6bf5bf 100644 Binary files a/generalized-linear-models_files/figure-pdf/fig-stan-areas-1.pdf and b/generalized-linear-models_files/figure-pdf/fig-stan-areas-1.pdf differ diff --git a/generalized-linear-models_files/figure-pdf/fig-stan-dens-1.pdf b/generalized-linear-models_files/figure-pdf/fig-stan-dens-1.pdf index 621ad61e..9d9fff87 100644 Binary files a/generalized-linear-models_files/figure-pdf/fig-stan-dens-1.pdf and b/generalized-linear-models_files/figure-pdf/fig-stan-dens-1.pdf differ diff --git a/generalized-linear-models_files/figure-pdf/fig-stan-hex-1.pdf b/generalized-linear-models_files/figure-pdf/fig-stan-hex-1.pdf index 7652d93b..0edb75fb 100644 Binary files a/generalized-linear-models_files/figure-pdf/fig-stan-hex-1.pdf and b/generalized-linear-models_files/figure-pdf/fig-stan-hex-1.pdf differ diff --git 
a/generalized-linear-models_files/figure-pdf/fig-stan-nuts-1.pdf b/generalized-linear-models_files/figure-pdf/fig-stan-nuts-1.pdf index ea3a89ac..27a2c83c 100644 Binary files a/generalized-linear-models_files/figure-pdf/fig-stan-nuts-1.pdf and b/generalized-linear-models_files/figure-pdf/fig-stan-nuts-1.pdf differ diff --git a/generalized-linear-models_files/figure-pdf/fig-stan-ppcheck-dens-1.pdf b/generalized-linear-models_files/figure-pdf/fig-stan-ppcheck-dens-1.pdf index 62799bcd..e847445c 100644 Binary files a/generalized-linear-models_files/figure-pdf/fig-stan-ppcheck-dens-1.pdf and b/generalized-linear-models_files/figure-pdf/fig-stan-ppcheck-dens-1.pdf differ diff --git a/generalized-linear-models_files/figure-pdf/fig-stan-ppcheck-intervals-1.pdf b/generalized-linear-models_files/figure-pdf/fig-stan-ppcheck-intervals-1.pdf index c384d952..f92dbb43 100644 Binary files a/generalized-linear-models_files/figure-pdf/fig-stan-ppcheck-intervals-1.pdf and b/generalized-linear-models_files/figure-pdf/fig-stan-ppcheck-intervals-1.pdf differ diff --git a/generalized-linear-models_files/figure-pdf/fig-stan-ridges-1.pdf b/generalized-linear-models_files/figure-pdf/fig-stan-ridges-1.pdf index e96b8087..7c5beac0 100644 Binary files a/generalized-linear-models_files/figure-pdf/fig-stan-ridges-1.pdf and b/generalized-linear-models_files/figure-pdf/fig-stan-ridges-1.pdf differ diff --git a/generalized-linear-models_files/figure-pdf/fig-stan-scatter-1.pdf b/generalized-linear-models_files/figure-pdf/fig-stan-scatter-1.pdf index 102c787d..378c69cd 100644 Binary files a/generalized-linear-models_files/figure-pdf/fig-stan-scatter-1.pdf and b/generalized-linear-models_files/figure-pdf/fig-stan-scatter-1.pdf differ diff --git a/generalized-linear-models_files/figure-pdf/fig-stan-trace-1.pdf b/generalized-linear-models_files/figure-pdf/fig-stan-trace-1.pdf index bd24781f..86024d68 100644 Binary files a/generalized-linear-models_files/figure-pdf/fig-stan-trace-1.pdf and 
b/generalized-linear-models_files/figure-pdf/fig-stan-trace-1.pdf differ diff --git a/hierarchical-normal-models.html b/hierarchical-normal-models.html index 71be0785..d5b475b5 100644 --- a/hierarchical-normal-models.html +++ b/hierarchical-normal-models.html @@ -790,7 +790,7 @@

    str(eight_schools_sim)
    #> List of 5
    -#>  $ mu   : num [1:2000(1d)] 10.9 7.32 11.08 6.01 7.17 ...
    +#>  $ mu   : num [1:2000(1d)] 4.379 8.171 12.021 -0.284 7.584 ...
     #>   ..- attr(*, "dimnames")=List of 1
     #>   .. ..$ iterations: NULL
    -#>  $ tau  : num [1:2000(1d)] 0.0611 13.9378 0.2101 10.2247 8.0898 ...
    +#>  $ tau  : num [1:2000(1d)] 14.523 0.742 13.614 14.05 10.707 ...
     #>   ..- attr(*, "dimnames")=List of 1
     #>   .. ..$ iterations: NULL
    -#>  $ eta  : num [1:2000, 1:8] -0.0263 1.2116 0.6171 0.4923 1.2238 ...
    +#>  $ eta  : num [1:2000, 1:8] 0.8887 1.7902 -0.3404 1.1472 0.0448 ...
     #>   ..- attr(*, "dimnames")=List of 2
     #>   .. ..$ iterations: NULL
     #>   .. ..$           : NULL
    -#>  $ theta: num [1:2000, 1:8] 10.9 24.2 11.2 11 17.1 ...
    +#>  $ theta: num [1:2000, 1:8] 17.29 9.5 7.39 15.83 8.06 ...
     #>   ..- attr(*, "dimnames")=List of 2
     #>   .. ..$ iterations: NULL
     #>   .. ..$           : NULL
    -#>  $ lp__ : num [1:2000(1d)] -54 -46.3 -54 -48.5 -51 ...
    +#>  $ lp__ : num [1:2000(1d)] -48.4 -51.8 -49.8 -46.1 -49 ...
     #>   ..- attr(*, "dimnames")=List of 1
     #>   .. ..$ iterations: NULL
    @@ -2267,7 +2267,7 @@

    R 语言数据分析实战

    发布日期
    -

    2024年2月1日

    +

    2024年2月5日

    @@ -628,7 +628,7 @@

    R 语言数据分析实战

    本书初稿是在 RStudio IDE 内使用 Quarto 编辑的,Quarto 是继R Markdown之后,一个新的开源的科学和技术发布系统,它基于 Pandoc支持输出多种格式的书稿,比如 HTML 网页、EPUB 电子书、DOCX 文档和 PDF 便携式文档等。Quarto 吸收了过去 10 年 R Markdown 取得的经验和教训,在书籍写作、创建博客、制作简历和幻灯片等系列场景中支持更加统一的使用语法,一份源文档输出多种格式,使文档内容在不同场景中的迁移成本更低。了解更多 Quarto 特性,请访问 https://quarto.org/

    书中的代码字体采用美观的 Source Code Pro 字体, 为方便跨操作系统编译书籍电子版,正文的中文字体采用开源的 fandol 字体。此外,考虑到美观性,本书图形使用了 Noto 系列中英文字体,它们来自 Google Fonts 字体库,分别是 Noto Sans 无衬线英文字体和 Noto Serif SC 宋体中文字体。

    书中 R 包名以粗体表示,如 knitr 包,函数名以等宽体表示,如 plot(),函数的参数名同理。代码块内注释用 # 表示,运行结果每一行开头以 #> 标记。本书写作过程中,依赖 knitr (Xie 2015)ggplot2 (Wickham 2016)lattice (Sarkar 2008) 等众多 R 包。考虑到要同时支持 DOCX、EPUB、PDF 和 HTML 四种书籍格式,书中使用 knitr 包和 gt 包制作静态的表格。

    -

    为方便测试贡献者提供的 PR,本书托管在 Github 上,同时启用 Github Action 服务,为书籍自定义了一个可复现全书内容的运行环境,包括 R 软件、扩展包和系统软件依赖,详见仓库中的 DESCRIPTION 文件。你现在看到的是在线编译版本,使用 Quarto 1.4.549,最新一次编译时间是 2024-02-01 14:00:55。

    +

    为方便测试贡献者提供的 PR,本书托管在 Github 上,同时启用 Github Action 服务,为书籍自定义了一个可复现全书内容的运行环境,包括 R 软件、扩展包和系统软件依赖,详见仓库中的 DESCRIPTION 文件。你现在看到的是在线编译版本,使用 Quarto 1.4.549,最新一次编译时间是 2024-02-05 12:50:52。

    xfun::session_info(packages = c(
       "ggplot2", "gganimate", "ggrepel", "ggdensity",
    @@ -653,10 +653,10 @@ 

    R 语言数据分析实战

    #> gganimate_1.0.8 ggbeeswarm_0.7.2 ggdensity_1.0.0 ggeffects_1.3.4 #> ggforce_0.4.1 ggnewscale_0.4.9 ggplot2_3.5.0 ggraph_2.1.0.9000 #> ggrepel_0.9.5 ggridges_0.5.6 ggsignif_0.6.4 gifski_1.12.0.2 -#> glmnet_4.1.8 gt_0.10.1 httr_1.4.7 igraph_2.0.1 +#> glmnet_4.1.8 gt_0.10.1 httr_1.4.7 igraph_2.0.1.1 #> INLA_23.9.9 knitr_1.45 lattice_0.21.9 lme4_1.1.35.1 #> magick_2.8.2 mgcv_1.9.0 patchwork_1.2.0 plotly_4.10.4 -#> purrr_1.0.2 rmarkdown_2.25 rsconnect_1.2.0 sf_1.0.15 +#> purrr_1.0.2 rmarkdown_2.25 rsconnect_1.2.1 sf_1.0.15 #> shiny_1.8.0 showtext_0.9.6 spaMM_4.4.16 stars_0.6.4 #> terra_1.7.71 tidygraph_1.3.1 tidyr_1.3.1 tinytex_0.49 #> xgboost_1.7.7.1 @@ -664,7 +664,7 @@

    R 语言数据分析实战

    #> Pandoc version: 3.1.11 #> #> LaTeX version used: -#> TeX Live 2023 (TinyTeX) with tlmgr 2024-01-13
    +#> TeX Live 2023 (TinyTeX) with tlmgr 2024-01-31
    diff --git a/interactive-applications.html b/interactive-applications.html index 7d0f1320..9425d2c1 100644 --- a/interactive-applications.html +++ b/interactive-applications.html @@ -1141,8 +1141,8 @@

    -
    - +
    +
    图 15.1: Shiny 生态系统 diff --git a/interactive-applications_files/figure-pdf/fig-shiny-ecosystem-1.pdf b/interactive-applications_files/figure-pdf/fig-shiny-ecosystem-1.pdf index dfebe062..2ca8663c 100644 Binary files a/interactive-applications_files/figure-pdf/fig-shiny-ecosystem-1.pdf and b/interactive-applications_files/figure-pdf/fig-shiny-ecosystem-1.pdf differ diff --git a/interactive-graphics.html b/interactive-graphics.html index 431b5c1c..9fb1640c 100644 --- a/interactive-graphics.html +++ b/interactive-graphics.html @@ -763,8 +763,8 @@

    plotly::add_markers(color = ~mag)

    -
    - +
    +
    图 13.2: 给散点图配色 @@ -783,8 +783,8 @@

    )

    -
    - +
    +
    图 13.3: 设置刻度及标签 @@ -811,8 +811,8 @@

    )

    -
    - +
    +
    图 13.4: 添加各处标题 @@ -839,8 +839,8 @@

    )

    -
    - +
    +
    图 13.5: 设置主题风格 @@ -943,8 +943,8 @@

    )

    -
    - +
    +
    图 13.7: 柱形图 @@ -964,8 +964,8 @@

    )

    -
    - +
    +
    图 13.8: 曲线图 @@ -1061,8 +1061,8 @@

    )

    -
    - +
    +
    图 13.12: 不同深度下地震震级的分布 @@ -1082,8 +1082,8 @@

    )

    -
    - +
    +
    图 13.13: 不同深度下地震震级的分布 @@ -1199,8 +1199,8 @@

    )

    -
    - +
    +
    图 13.16: 1999-2022 年 Martin Maechler 和 Brian Ripley 的代码提交量变化 @@ -1289,8 +1289,8 @@

    plotly::ggplotly(p)
    -
    - +
    +

    当使用配置函数 config() 设置参数选项 staticPlot = TRUE,可将原本交互式的动态图形转为非交互式的静态图形。

    @@ -1298,8 +1298,8 @@

    plotly::ggplotly(p) |> 
       plotly::config(staticPlot = TRUE)
    -
    - +
    +
    @@ -1318,8 +1318,8 @@

    plotly::style(hoveron = "points", hoverinfo = "x+y+text", hoverlabel = list(bgcolor = "white"))

    -
    - +
    +
    @@ -1405,8 +1405,8 @@

    )
    -
    - +
    +
    图 13.20: 添加水印图片 @@ -1437,9 +1437,9 @@

    htmltools::tagList(p1, p2)

    -
    -
    - +
    +
    +
    图 13.21: 上下布局 @@ -1453,8 +1453,8 @@

    nrows = 2, margin = 0.05, shareX = TRUE, titleY = TRUE)

    -
    - +
    +
    图 13.22: 上下布局 @@ -1474,8 +1474,8 @@

    )

    -
    - +
    +
    图 13.23: 灵活布局 @@ -1502,9 +1502,9 @@

    -
    -
    - +
    +
    +

    diff --git a/interactive-graphics_files/figure-pdf/fig-ggplot-to-ggplotly-1.pdf b/interactive-graphics_files/figure-pdf/fig-ggplot-to-ggplotly-1.pdf index c402eb9b..723d44f4 100644 Binary files a/interactive-graphics_files/figure-pdf/fig-ggplot-to-ggplotly-1.pdf and b/interactive-graphics_files/figure-pdf/fig-ggplot-to-ggplotly-1.pdf differ diff --git a/interactive-tables.html b/interactive-tables.html index 5a4806ae..0b02d98b 100644 --- a/interactive-tables.html +++ b/interactive-tables.html @@ -681,8 +681,8 @@

    -
    - +
    +
    @@ -721,8 +721,8 @@

    -
    - +
    +
    diff --git a/interactive-tables_files/figure-pdf/tbl-datasets-1.pdf b/interactive-tables_files/figure-pdf/tbl-datasets-1.pdf index ab144b8f..dccc9e01 100644 Binary files a/interactive-tables_files/figure-pdf/tbl-datasets-1.pdf and b/interactive-tables_files/figure-pdf/tbl-datasets-1.pdf differ diff --git a/interactive-tables_files/figure-pdf/tbl-table-colorize-1.pdf b/interactive-tables_files/figure-pdf/tbl-table-colorize-1.pdf index de0a27ef..0f961e1b 100644 Binary files a/interactive-tables_files/figure-pdf/tbl-table-colorize-1.pdf and b/interactive-tables_files/figure-pdf/tbl-table-colorize-1.pdf differ diff --git a/intro.html b/intro.html index 64e5730d..d62355bf 100644 --- a/intro.html +++ b/intro.html @@ -811,23 +811,23 @@

    介绍< 表格 2: anscombe 数据集
    -
    - diff --git a/intro_files/figure-pdf/fig-anscombe-1.pdf b/intro_files/figure-pdf/fig-anscombe-1.pdf index fab62cf9..b4137795 100644 Binary files a/intro_files/figure-pdf/fig-anscombe-1.pdf and b/intro_files/figure-pdf/fig-anscombe-1.pdf differ diff --git a/intro_files/figure-pdf/fig-anscombe-2.pdf b/intro_files/figure-pdf/fig-anscombe-2.pdf index 6e7d994e..be7c30bc 100644 Binary files a/intro_files/figure-pdf/fig-anscombe-2.pdf and b/intro_files/figure-pdf/fig-anscombe-2.pdf differ diff --git a/intro_files/figure-pdf/fig-anscombe-3.pdf b/intro_files/figure-pdf/fig-anscombe-3.pdf index 4dceb9e7..002eed17 100644 Binary files a/intro_files/figure-pdf/fig-anscombe-3.pdf and b/intro_files/figure-pdf/fig-anscombe-3.pdf differ diff --git a/intro_files/figure-pdf/fig-anscombe-4.pdf b/intro_files/figure-pdf/fig-anscombe-4.pdf index d19b35d2..c1ffd297 100644 Binary files a/intro_files/figure-pdf/fig-anscombe-4.pdf and b/intro_files/figure-pdf/fig-anscombe-4.pdf differ diff --git a/intro_files/figure-pdf/fig-datasaurus-dozen-1.pdf b/intro_files/figure-pdf/fig-datasaurus-dozen-1.pdf index 4369551d..0e0a2b8c 100644 Binary files a/intro_files/figure-pdf/fig-datasaurus-dozen-1.pdf and b/intro_files/figure-pdf/fig-datasaurus-dozen-1.pdf differ diff --git a/mixed-effects-models.html b/mixed-effects-models.html index 4246234c..b9ebfaf0 100644 --- a/mixed-effects-models.html +++ b/mixed-effects-models.html @@ -1179,14 +1179,14 @@

    nlp <- ROI_solve(op, solver = "nloptr.directL") nlp$solution

    -
    #> [1]  0.00000 22.22222
    +
    #> [1] 22.22222  0.00000
    nlp$objval
    @@ -2358,7 +2358,7 @@

    # 目标函数值 nlp$objval

    -
    #> [1] 368.1061
    +
    #> [1] 368.106
    # 最优解
     nlp$solution
    @@ -2719,11 +2719,11 @@

    nlp <- ROI_solve(op, solver = "nloptr.isres", start = c(1, 5, 5, 1))
     nlp$solution

    -
    #> [1] 1.111263 4.830733 3.723643 1.251758
    +
    #> [1] 1.284840 4.793586 3.749755 1.145225
    nlp$objval
    -
    #> [1] 17.16886
    +
    #> [1] 18.21124

    可以看出,nloptr 提供的优化能力可以覆盖 Ipopt 求解器,从以上求解的情况来看,推荐使用 nloptr.slsqp 求解器,这也是 Octave 的选择。

    diff --git a/numerical-optimization_files/figure-html/fig-one-dimensional-optimization-1.pdf b/numerical-optimization_files/figure-html/fig-one-dimensional-optimization-1.pdf index 080b1f18..d8b8cd2f 100644 Binary files a/numerical-optimization_files/figure-html/fig-one-dimensional-optimization-1.pdf and b/numerical-optimization_files/figure-html/fig-one-dimensional-optimization-1.pdf differ diff --git a/numerical-optimization_files/figure-html/fig-one-dimensional-optimization-1.tex b/numerical-optimization_files/figure-html/fig-one-dimensional-optimization-1.tex index b453cb05..7fbd9323 100644 --- a/numerical-optimization_files/figure-html/fig-one-dimensional-optimization-1.tex +++ b/numerical-optimization_files/figure-html/fig-one-dimensional-optimization-1.tex @@ -1,4 +1,4 @@ -% Created by tikzDevice version 0.12.6 on 2024-02-01 06:17:49 +% Created by tikzDevice version 0.12.6 on 2024-02-05 05:07:45 % !TEX encoding = UTF-8 Unicode \documentclass[tikz]{standalone} diff --git a/numerical-optimization_files/figure-pdf/fig-complex-constrained-1.pdf b/numerical-optimization_files/figure-pdf/fig-complex-constrained-1.pdf index 8ff53023..7291bbae 100644 Binary files a/numerical-optimization_files/figure-pdf/fig-complex-constrained-1.pdf and b/numerical-optimization_files/figure-pdf/fig-complex-constrained-1.pdf differ diff --git a/numerical-optimization_files/figure-pdf/fig-cone-1.pdf b/numerical-optimization_files/figure-pdf/fig-cone-1.pdf index 230bb46e..c7d2fed4 100644 Binary files a/numerical-optimization_files/figure-pdf/fig-cone-1.pdf and b/numerical-optimization_files/figure-pdf/fig-cone-1.pdf differ diff --git a/numerical-optimization_files/figure-pdf/fig-implicit-function-1.pdf b/numerical-optimization_files/figure-pdf/fig-implicit-function-1.pdf index d4e44e06..8c964599 100644 Binary files a/numerical-optimization_files/figure-pdf/fig-implicit-function-1.pdf and b/numerical-optimization_files/figure-pdf/fig-implicit-function-1.pdf differ diff --git 
a/numerical-optimization_files/figure-pdf/fig-one-dimensional-optimization-1.pdf b/numerical-optimization_files/figure-pdf/fig-one-dimensional-optimization-1.pdf index 92706e75..ef9da79b 100644 Binary files a/numerical-optimization_files/figure-pdf/fig-one-dimensional-optimization-1.pdf and b/numerical-optimization_files/figure-pdf/fig-one-dimensional-optimization-1.pdf differ diff --git a/numerical-optimization_files/figure-pdf/fig-one-dimensional-optimization-1.png b/numerical-optimization_files/figure-pdf/fig-one-dimensional-optimization-1.png index 83796fba..6dda068d 100644 Binary files a/numerical-optimization_files/figure-pdf/fig-one-dimensional-optimization-1.png and b/numerical-optimization_files/figure-pdf/fig-one-dimensional-optimization-1.png differ diff --git a/numerical-optimization_files/figure-pdf/fig-one-dimensional-optimization-1.tex b/numerical-optimization_files/figure-pdf/fig-one-dimensional-optimization-1.tex index 261ae35e..bf2c169b 100644 --- a/numerical-optimization_files/figure-pdf/fig-one-dimensional-optimization-1.tex +++ b/numerical-optimization_files/figure-pdf/fig-one-dimensional-optimization-1.tex @@ -1,4 +1,4 @@ -% Created by tikzDevice version 0.12.6 on 2024-02-01 06:18:18 +% Created by tikzDevice version 0.12.6 on 2024-02-05 05:08:11 % !TEX encoding = UTF-8 Unicode \documentclass[tikz]{standalone} diff --git a/numerical-optimization_files/figure-pdf/fig-quadprog-1.pdf b/numerical-optimization_files/figure-pdf/fig-quadprog-1.pdf index 967384b4..1f3ec745 100644 Binary files a/numerical-optimization_files/figure-pdf/fig-quadprog-1.pdf and b/numerical-optimization_files/figure-pdf/fig-quadprog-1.pdf differ diff --git a/numerical-optimization_files/figure-pdf/fig-rastrigin-1.pdf b/numerical-optimization_files/figure-pdf/fig-rastrigin-1.pdf index 1a0d1943..48d1fa58 100644 Binary files a/numerical-optimization_files/figure-pdf/fig-rastrigin-1.pdf and b/numerical-optimization_files/figure-pdf/fig-rastrigin-1.pdf differ diff --git 
a/numerical-optimization_files/figure-pdf/fig-super-function-1.pdf b/numerical-optimization_files/figure-pdf/fig-super-function-1.pdf index bdcf6a73..e0289895 100644 Binary files a/numerical-optimization_files/figure-pdf/fig-super-function-1.pdf and b/numerical-optimization_files/figure-pdf/fig-super-function-1.pdf differ diff --git a/numerical-optimization_files/figure-pdf/fig-super-function-2.pdf b/numerical-optimization_files/figure-pdf/fig-super-function-2.pdf index f0b11666..c7b64d58 100644 Binary files a/numerical-optimization_files/figure-pdf/fig-super-function-2.pdf and b/numerical-optimization_files/figure-pdf/fig-super-function-2.pdf differ diff --git a/numerical-optimization_files/figure-pdf/fig-valley-persp-1.pdf b/numerical-optimization_files/figure-pdf/fig-valley-persp-1.pdf index cbaada5d..a8400141 100644 Binary files a/numerical-optimization_files/figure-pdf/fig-valley-persp-1.pdf and b/numerical-optimization_files/figure-pdf/fig-valley-persp-1.pdf differ diff --git a/optimization-problems.html b/optimization-problems.html index 3cc941e7..3817d7d0 100644 --- a/optimization-problems.html +++ b/optimization-problems.html @@ -1281,33 +1281,34 @@

    control = list(maximize = TRUE) )
    -
    #> iter:  0  f-value:  -2290.408  pgrad:  110.367 
    -#> iter:  10  f-value:  -1998.362  pgrad:  2.092643 
    -#> iter:  20  f-value:  -1998.255  pgrad:  2.135778 
    -#> iter:  30  f-value:  -1989.95  pgrad:  0.2171396 
    -#> iter:  40  f-value:  -1989.946  pgrad:  0.2082606 
    -#> iter:  50  f-value:  -1989.946  pgrad:  0.360415 
    +
    #> iter:  0  f-value:  -2822.177  pgrad:  6.550888 
    +#> iter:  10  f-value:  -1990.894  pgrad:  2.158326 
    +#> iter:  20  f-value:  -1990.057  pgrad:  1.542383 
    +#> iter:  30  f-value:  -1989.991  pgrad:  2.684052 
    +#> iter:  40  f-value:  -1989.946  pgrad:  0.05826678 
    +#> iter:  50  f-value:  -1989.946  pgrad:  0.005381935 
    +#> iter:  60  f-value:  -1989.946  pgrad:  0.0008139978 
     #>   Successful convergence.
    ans
    #> $par
    -#> [1] 0.3598829 1.2560907 2.6634012
    +#> [1] 0.3598724 1.2560702 2.6633898
     #> 
     #> $value
     #> [1] -1989.946
     #> 
     #> $gradient
    -#> [1] 6.82121e-06
    +#> [1] 0.0002114575
     #> 
     #> $fn.reduction
    -#> [1] -300.4626
    +#> [1] -832.2316
     #> 
     #> $iter
    -#> [1] 56
    +#> [1] 66
     #> 
     #> $feval
    -#> [1] 58
    +#> [1] 68
     #> 
     #> $convergence
     #> [1] 0
    @@ -1327,15 +1328,15 @@ 

    hess

    #>           [,1]       [,2]       [,3]
    -#> [1,] -907.1104  270.22856  341.25434
    -#> [2,]  270.2286 -113.47936  -61.68191
    -#> [3,]  341.2543  -61.68191 -192.78218
    +#> [1,] -907.1271 270.22673 341.26114 +#> [2,] 270.2267 -113.47615 -61.68078 +#> [3,] 341.2611 -61.68078 -192.78769
    # 标准差
     se <- sqrt(diag(solve(-hess)))
     se
    -
    #> [1] 0.1946833 0.3500301 0.2504766
    +
    #> [1] 0.1946820 0.3500371 0.2504738

    multiStart 从不同初始值出发寻找全局最大值,先找一系列局部极大值,通过比较获得全局最大值。

    @@ -1356,12 +1357,8 @@

    pmat[!duplicated(pmat), ]
    #>         fvalue parameter 1 parameter 2 parameter 3
    -#> [1,] -1992.839      0.1764      3.6867      1.8441
    -#> [2,] -1989.946      0.6401      2.6634      1.2561
    -#> [3,] -1998.253      0.5668      2.3774      1.8620
    -#> [4,] -2000.055      0.9896      2.1176      6.5843
    -#> [5,] -1999.731      0.0131      6.3280      2.1094
    -#> [6,] -1989.946      0.3599      1.2561      2.6634
    +#> [1,] -1989.946 0.3599 1.2561 2.6634 +#> [2,] -1989.946 0.6401 2.6634 1.2561

    diff --git a/optimization-problems_files/figure-pdf/fig-dgamma-1.pdf b/optimization-problems_files/figure-pdf/fig-dgamma-1.pdf index aff333d6..8fd68274 100644 Binary files a/optimization-problems_files/figure-pdf/fig-dgamma-1.pdf and b/optimization-problems_files/figure-pdf/fig-dgamma-1.pdf differ diff --git a/optimization-problems_files/figure-pdf/fig-topo-loglik-contour-1.pdf b/optimization-problems_files/figure-pdf/fig-topo-loglik-contour-1.pdf index 0fee721c..ee1c4541 100644 Binary files a/optimization-problems_files/figure-pdf/fig-topo-loglik-contour-1.pdf and b/optimization-problems_files/figure-pdf/fig-topo-loglik-contour-1.pdf differ diff --git a/optimization-problems_files/figure-pdf/fig-topo-loglik-persp-1.pdf b/optimization-problems_files/figure-pdf/fig-topo-loglik-persp-1.pdf index 9af1ff3a..58d203df 100644 Binary files a/optimization-problems_files/figure-pdf/fig-topo-loglik-persp-1.pdf and b/optimization-problems_files/figure-pdf/fig-topo-loglik-persp-1.pdf differ diff --git a/optimization-problems_files/figure-pdf/fig-tsp-problem-1.pdf b/optimization-problems_files/figure-pdf/fig-tsp-problem-1.pdf index b0eae845..92f0fcb8 100644 Binary files a/optimization-problems_files/figure-pdf/fig-tsp-problem-1.pdf and b/optimization-problems_files/figure-pdf/fig-tsp-problem-1.pdf differ diff --git a/optimization-problems_files/figure-pdf/fig-tsp-solution-1.pdf b/optimization-problems_files/figure-pdf/fig-tsp-solution-1.pdf index f1ec2d67..fff75950 100644 Binary files a/optimization-problems_files/figure-pdf/fig-tsp-solution-1.pdf and b/optimization-problems_files/figure-pdf/fig-tsp-solution-1.pdf differ diff --git a/power-analysis_files/figure-html/fig-power-t-test-1.pdf b/power-analysis_files/figure-html/fig-power-t-test-1.pdf index 39d9f9eb..851f21cd 100644 Binary files a/power-analysis_files/figure-html/fig-power-t-test-1.pdf and b/power-analysis_files/figure-html/fig-power-t-test-1.pdf differ diff --git 
a/power-analysis_files/figure-html/fig-power-t-test-1.tex b/power-analysis_files/figure-html/fig-power-t-test-1.tex index 1c77800c..91ccc4f1 100644 --- a/power-analysis_files/figure-html/fig-power-t-test-1.tex +++ b/power-analysis_files/figure-html/fig-power-t-test-1.tex @@ -1,4 +1,4 @@ -% Created by tikzDevice version 0.12.6 on 2024-02-01 06:13:08 +% Created by tikzDevice version 0.12.6 on 2024-02-05 05:03:24 % !TEX encoding = UTF-8 Unicode \documentclass[tikz]{standalone} diff --git a/power-analysis_files/figure-pdf/fig-power-t-test-1.pdf b/power-analysis_files/figure-pdf/fig-power-t-test-1.pdf index 9da9143f..1d4136df 100644 Binary files a/power-analysis_files/figure-pdf/fig-power-t-test-1.pdf and b/power-analysis_files/figure-pdf/fig-power-t-test-1.pdf differ diff --git a/power-analysis_files/figure-pdf/fig-power-t-test-1.png b/power-analysis_files/figure-pdf/fig-power-t-test-1.png index 114b3aa0..a42235b7 100644 Binary files a/power-analysis_files/figure-pdf/fig-power-t-test-1.png and b/power-analysis_files/figure-pdf/fig-power-t-test-1.png differ diff --git a/power-analysis_files/figure-pdf/fig-power-t-test-1.tex b/power-analysis_files/figure-pdf/fig-power-t-test-1.tex index 4db56a8d..69125335 100644 --- a/power-analysis_files/figure-pdf/fig-power-t-test-1.tex +++ b/power-analysis_files/figure-pdf/fig-power-t-test-1.tex @@ -1,4 +1,4 @@ -% Created by tikzDevice version 0.12.6 on 2024-02-01 06:13:14 +% Created by tikzDevice version 0.12.6 on 2024-02-05 05:03:30 % !TEX encoding = UTF-8 Unicode \documentclass[tikz]{standalone} diff --git a/preface_files/figure-pdf/fig-plant-growth-1.pdf b/preface_files/figure-pdf/fig-plant-growth-1.pdf index 400c0058..bb48aa13 100644 Binary files a/preface_files/figure-pdf/fig-plant-growth-1.pdf and b/preface_files/figure-pdf/fig-plant-growth-1.pdf differ diff --git a/preface_files/figure-pdf/fig-plant-growth-2.pdf b/preface_files/figure-pdf/fig-plant-growth-2.pdf index 778b46a8..bc3c8023 100644 Binary files 
a/preface_files/figure-pdf/fig-plant-growth-2.pdf and b/preface_files/figure-pdf/fig-plant-growth-2.pdf differ diff --git a/probabilistic-reasoning-framework.html b/probabilistic-reasoning-framework.html index 0975bd85..753654b9 100644 --- a/probabilistic-reasoning-framework.html +++ b/probabilistic-reasoning-framework.html @@ -1599,7 +1599,7 @@

    seed = 20232023 # 随机数种子 )
    -
    #> Finished in  0.3 seconds.
    +
    #> Finished in  0.2 seconds.
    fit_optim_logit$summary(c("alpha", "beta", "lambda", "lp__"))
    @@ -1635,7 +1635,7 @@

    seed = 20232023 # 随机数种子 )

    -
    #> Finished in  2.3 seconds.
    +
    #> Finished in  1.8 seconds.
    fit_advi_logit$summary(c("alpha", "beta", "lambda", "lp__"))
    @@ -1670,8 +1670,8 @@

    seed = 20232023 # 随机数种子 )

    -
    #> Finished in  0.1 seconds.
    -#> Finished in  1.9 seconds.
    +
    #> Finished in  0.3 seconds.
    +#> Finished in  1.7 seconds.
    fit_laplace_logit$summary(c("alpha", "beta", "lambda", "lp__"))
    @@ -1706,7 +1706,7 @@

    seed = 20232023 # 随机数种子 )

    -
    #> Finished in  5.2 seconds.
    +
    #> Finished in  3.8 seconds.
    fit_pathfinder_logit$summary(c("alpha", "beta", "lambda", "lp__"))
    diff --git a/probabilistic-reasoning-framework_files/figure-html/fig-prior-1.pdf b/probabilistic-reasoning-framework_files/figure-html/fig-prior-1.pdf index aba77d5f..062be927 100644 Binary files a/probabilistic-reasoning-framework_files/figure-html/fig-prior-1.pdf and b/probabilistic-reasoning-framework_files/figure-html/fig-prior-1.pdf differ diff --git a/probabilistic-reasoning-framework_files/figure-html/fig-prior-1.tex b/probabilistic-reasoning-framework_files/figure-html/fig-prior-1.tex index b061f1f5..6788a428 100644 --- a/probabilistic-reasoning-framework_files/figure-html/fig-prior-1.tex +++ b/probabilistic-reasoning-framework_files/figure-html/fig-prior-1.tex @@ -1,4 +1,4 @@ -% Created by tikzDevice version 0.12.6 on 2024-02-01 06:20:59 +% Created by tikzDevice version 0.12.6 on 2024-02-05 05:10:16 % !TEX encoding = UTF-8 Unicode \documentclass[tikz]{standalone} diff --git a/probabilistic-reasoning-framework_files/figure-pdf/fig-faithful-mixture-1.pdf b/probabilistic-reasoning-framework_files/figure-pdf/fig-faithful-mixture-1.pdf index 23258840..861fe22f 100644 Binary files a/probabilistic-reasoning-framework_files/figure-pdf/fig-faithful-mixture-1.pdf and b/probabilistic-reasoning-framework_files/figure-pdf/fig-faithful-mixture-1.pdf differ diff --git a/probabilistic-reasoning-framework_files/figure-pdf/fig-prior-1.pdf b/probabilistic-reasoning-framework_files/figure-pdf/fig-prior-1.pdf index a2bbd170..01be46ae 100644 Binary files a/probabilistic-reasoning-framework_files/figure-pdf/fig-prior-1.pdf and b/probabilistic-reasoning-framework_files/figure-pdf/fig-prior-1.pdf differ diff --git a/probabilistic-reasoning-framework_files/figure-pdf/fig-prior-1.png b/probabilistic-reasoning-framework_files/figure-pdf/fig-prior-1.png index 4f02dd8c..a7769b91 100644 Binary files a/probabilistic-reasoning-framework_files/figure-pdf/fig-prior-1.png and b/probabilistic-reasoning-framework_files/figure-pdf/fig-prior-1.png differ diff --git 
a/probabilistic-reasoning-framework_files/figure-pdf/fig-prior-1.tex b/probabilistic-reasoning-framework_files/figure-pdf/fig-prior-1.tex index 62ef753b..6f398b02 100644 --- a/probabilistic-reasoning-framework_files/figure-pdf/fig-prior-1.tex +++ b/probabilistic-reasoning-framework_files/figure-pdf/fig-prior-1.tex @@ -1,4 +1,4 @@ -% Created by tikzDevice version 0.12.6 on 2024-02-01 06:26:00 +% Created by tikzDevice version 0.12.6 on 2024-02-05 05:14:11 % !TEX encoding = UTF-8 Unicode \documentclass[tikz]{standalone} diff --git a/probabilistic-reasoning-framework_files/figure-pdf/fig-stan-lp-1.pdf b/probabilistic-reasoning-framework_files/figure-pdf/fig-stan-lp-1.pdf index bd6cd1b2..3dc27ab9 100644 Binary files a/probabilistic-reasoning-framework_files/figure-pdf/fig-stan-lp-1.pdf and b/probabilistic-reasoning-framework_files/figure-pdf/fig-stan-lp-1.pdf differ diff --git a/regression-and-correlation_files/figure-pdf/fig-galton-1.pdf b/regression-and-correlation_files/figure-pdf/fig-galton-1.pdf index 7895c927..17df6a02 100644 Binary files a/regression-and-correlation_files/figure-pdf/fig-galton-1.pdf and b/regression-and-correlation_files/figure-pdf/fig-galton-1.pdf differ diff --git a/regression-and-correlation_files/figure-pdf/fig-galton-bivar-1.pdf b/regression-and-correlation_files/figure-pdf/fig-galton-bivar-1.pdf index 5021be19..ecd3c769 100644 Binary files a/regression-and-correlation_files/figure-pdf/fig-galton-bivar-1.pdf and b/regression-and-correlation_files/figure-pdf/fig-galton-bivar-1.pdf differ diff --git a/regression-and-correlation_files/figure-pdf/fig-galton-gender-1.pdf b/regression-and-correlation_files/figure-pdf/fig-galton-gender-1.pdf index a086278c..de6222aa 100644 Binary files a/regression-and-correlation_files/figure-pdf/fig-galton-gender-1.pdf and b/regression-and-correlation_files/figure-pdf/fig-galton-gender-1.pdf differ diff --git a/regression-and-correlation_files/figure-pdf/fig-state-x77-bubble-1.pdf 
b/regression-and-correlation_files/figure-pdf/fig-state-x77-bubble-1.pdf index f3912dfe..12f124f5 100644 Binary files a/regression-and-correlation_files/figure-pdf/fig-state-x77-bubble-1.pdf and b/regression-and-correlation_files/figure-pdf/fig-state-x77-bubble-1.pdf differ diff --git a/regression-and-correlation_files/figure-pdf/fig-state-x77-lm-1.pdf b/regression-and-correlation_files/figure-pdf/fig-state-x77-lm-1.pdf index 8365a60e..98726747 100644 Binary files a/regression-and-correlation_files/figure-pdf/fig-state-x77-lm-1.pdf and b/regression-and-correlation_files/figure-pdf/fig-state-x77-lm-1.pdf differ diff --git a/regression-and-correlation_files/figure-pdf/fig-state-x77-scatter-1.pdf b/regression-and-correlation_files/figure-pdf/fig-state-x77-scatter-1.pdf index 7bdef055..102a3c1a 100644 Binary files a/regression-and-correlation_files/figure-pdf/fig-state-x77-scatter-1.pdf and b/regression-and-correlation_files/figure-pdf/fig-state-x77-scatter-1.pdf differ diff --git a/regression-problems.html b/regression-problems.html index 08db17f2..6c19a3f3 100644 --- a/regression-problems.html +++ b/regression-problems.html @@ -818,8 +818,8 @@

    @@ -1690,7 +1690,7 @@

    # RMSE rmse(Boston$medv, pred_medv_svm)

    -
    #> [1] 2.892347
    +
    #> [1] 2.811383

    @@ -1708,7 +1708,7 @@

    pred_medv_nnet <- predict(fit_nnet, newdata = Boston[, -14], type = "raw") rmse(Boston$medv, pred_medv_nnet)
    -
    #> [1] 2.78669
    +
    #> [1] 2.687467

    @@ -1754,13 +1754,13 @@

    pred_medv_rf <- predict(fit_rf, newdata = Boston[, -14])
     rmse(Boston$medv, pred_medv_rf)
    -
    #> [1] 1.399153
    +
    #> [1] 1.412122

    diff --git a/regression-problems_files/figure-html/fig-pls-1.png b/regression-problems_files/figure-html/fig-pls-1.png index 6591b7f5..4d0d01ee 100644 Binary files a/regression-problems_files/figure-html/fig-pls-1.png and b/regression-problems_files/figure-html/fig-pls-1.png differ diff --git a/regression-problems_files/figure-pdf/fig-Boston-rpart-1.pdf b/regression-problems_files/figure-pdf/fig-Boston-rpart-1.pdf index ffdeba55..fc4124c5 100644 Binary files a/regression-problems_files/figure-pdf/fig-Boston-rpart-1.pdf and b/regression-problems_files/figure-pdf/fig-Boston-rpart-1.pdf differ diff --git a/regression-problems_files/figure-pdf/fig-abess-lambda-1.pdf b/regression-problems_files/figure-pdf/fig-abess-lambda-1.pdf index 5856a313..40e02b92 100644 Binary files a/regression-problems_files/figure-pdf/fig-abess-lambda-1.pdf and b/regression-problems_files/figure-pdf/fig-abess-lambda-1.pdf differ diff --git a/regression-problems_files/figure-pdf/fig-abess-lambda-2.pdf b/regression-problems_files/figure-pdf/fig-abess-lambda-2.pdf index f238e219..845f3746 100644 Binary files a/regression-problems_files/figure-pdf/fig-abess-lambda-2.pdf and b/regression-problems_files/figure-pdf/fig-abess-lambda-2.pdf differ diff --git a/regression-problems_files/figure-pdf/fig-adaptive-lasso-1.pdf b/regression-problems_files/figure-pdf/fig-adaptive-lasso-1.pdf index ae99e21a..c31de2c9 100644 Binary files a/regression-problems_files/figure-pdf/fig-adaptive-lasso-1.pdf and b/regression-problems_files/figure-pdf/fig-adaptive-lasso-1.pdf differ diff --git a/regression-problems_files/figure-pdf/fig-adaptive-lasso-2.pdf b/regression-problems_files/figure-pdf/fig-adaptive-lasso-2.pdf index 823afd92..e010b509 100644 Binary files a/regression-problems_files/figure-pdf/fig-adaptive-lasso-2.pdf and b/regression-problems_files/figure-pdf/fig-adaptive-lasso-2.pdf differ diff --git a/regression-problems_files/figure-pdf/fig-cv-lars-1.pdf 
b/regression-problems_files/figure-pdf/fig-cv-lars-1.pdf index 6c34df12..a1c4aece 100644 Binary files a/regression-problems_files/figure-pdf/fig-cv-lars-1.pdf and b/regression-problems_files/figure-pdf/fig-cv-lars-1.pdf differ diff --git a/regression-problems_files/figure-pdf/fig-cv-lars-2.pdf b/regression-problems_files/figure-pdf/fig-cv-lars-2.pdf index 670e71e4..115b728e 100644 Binary files a/regression-problems_files/figure-pdf/fig-cv-lars-2.pdf and b/regression-problems_files/figure-pdf/fig-cv-lars-2.pdf differ diff --git a/regression-problems_files/figure-pdf/fig-elasticnet-glmnet-1.pdf b/regression-problems_files/figure-pdf/fig-elasticnet-glmnet-1.pdf index 4c591b8b..3bfb3f6a 100644 Binary files a/regression-problems_files/figure-pdf/fig-elasticnet-glmnet-1.pdf and b/regression-problems_files/figure-pdf/fig-elasticnet-glmnet-1.pdf differ diff --git a/regression-problems_files/figure-pdf/fig-elasticnet-glmnet-2.pdf b/regression-problems_files/figure-pdf/fig-elasticnet-glmnet-2.pdf index f9efafc9..9d3048b4 100644 Binary files a/regression-problems_files/figure-pdf/fig-elasticnet-glmnet-2.pdf and b/regression-problems_files/figure-pdf/fig-elasticnet-glmnet-2.pdf differ diff --git a/regression-problems_files/figure-pdf/fig-lars-lasso-1.pdf b/regression-problems_files/figure-pdf/fig-lars-lasso-1.pdf index 8412a50b..c8c288cf 100644 Binary files a/regression-problems_files/figure-pdf/fig-lars-lasso-1.pdf and b/regression-problems_files/figure-pdf/fig-lars-lasso-1.pdf differ diff --git a/regression-problems_files/figure-pdf/fig-lars-lasso-2.pdf b/regression-problems_files/figure-pdf/fig-lars-lasso-2.pdf index 0c019450..6e1536ed 100644 Binary files a/regression-problems_files/figure-pdf/fig-lars-lasso-2.pdf and b/regression-problems_files/figure-pdf/fig-lars-lasso-2.pdf differ diff --git a/regression-problems_files/figure-pdf/fig-lasso-glmnet-1.pdf b/regression-problems_files/figure-pdf/fig-lasso-glmnet-1.pdf index 09642bc8..93731ce1 100644 Binary files 
a/regression-problems_files/figure-pdf/fig-lasso-glmnet-1.pdf and b/regression-problems_files/figure-pdf/fig-lasso-glmnet-1.pdf differ diff --git a/regression-problems_files/figure-pdf/fig-lasso-glmnet-2.pdf b/regression-problems_files/figure-pdf/fig-lasso-glmnet-2.pdf index 5b3a8bd1..20cbe404 100644 Binary files a/regression-problems_files/figure-pdf/fig-lasso-glmnet-2.pdf and b/regression-problems_files/figure-pdf/fig-lasso-glmnet-2.pdf differ diff --git a/regression-problems_files/figure-pdf/fig-mcp-lambda-1.pdf b/regression-problems_files/figure-pdf/fig-mcp-lambda-1.pdf index 18056785..9435e481 100644 Binary files a/regression-problems_files/figure-pdf/fig-mcp-lambda-1.pdf and b/regression-problems_files/figure-pdf/fig-mcp-lambda-1.pdf differ diff --git a/regression-problems_files/figure-pdf/fig-mcp-ncvreg-1.pdf b/regression-problems_files/figure-pdf/fig-mcp-ncvreg-1.pdf index 554ec2b9..f374d1a9 100644 Binary files a/regression-problems_files/figure-pdf/fig-mcp-ncvreg-1.pdf and b/regression-problems_files/figure-pdf/fig-mcp-ncvreg-1.pdf differ diff --git a/regression-problems_files/figure-pdf/fig-pls-1.pdf b/regression-problems_files/figure-pdf/fig-pls-1.pdf index 415fab9d..8e4cb8e5 100644 Binary files a/regression-problems_files/figure-pdf/fig-pls-1.pdf and b/regression-problems_files/figure-pdf/fig-pls-1.pdf differ diff --git a/regression-problems_files/figure-pdf/fig-relax-lasso-1.pdf b/regression-problems_files/figure-pdf/fig-relax-lasso-1.pdf index d2dbf6ed..ed7e06ce 100644 Binary files a/regression-problems_files/figure-pdf/fig-relax-lasso-1.pdf and b/regression-problems_files/figure-pdf/fig-relax-lasso-1.pdf differ diff --git a/regression-problems_files/figure-pdf/fig-ridge-glmnet-1.pdf b/regression-problems_files/figure-pdf/fig-ridge-glmnet-1.pdf index 39a29c40..c9b386cd 100644 Binary files a/regression-problems_files/figure-pdf/fig-ridge-glmnet-1.pdf and b/regression-problems_files/figure-pdf/fig-ridge-glmnet-1.pdf differ diff --git 
a/regression-problems_files/figure-pdf/fig-ridge-glmnet-2.pdf b/regression-problems_files/figure-pdf/fig-ridge-glmnet-2.pdf index 9419990a..a8aeaa22 100644 Binary files a/regression-problems_files/figure-pdf/fig-ridge-glmnet-2.pdf and b/regression-problems_files/figure-pdf/fig-ridge-glmnet-2.pdf differ diff --git a/regression-problems_files/figure-pdf/fig-scad-lambda-1.pdf b/regression-problems_files/figure-pdf/fig-scad-lambda-1.pdf index 69648c96..396793bb 100644 Binary files a/regression-problems_files/figure-pdf/fig-scad-lambda-1.pdf and b/regression-problems_files/figure-pdf/fig-scad-lambda-1.pdf differ diff --git a/regression-problems_files/figure-pdf/fig-scad-ncvreg-1.pdf b/regression-problems_files/figure-pdf/fig-scad-ncvreg-1.pdf index dacaed26..322d49a9 100644 Binary files a/regression-problems_files/figure-pdf/fig-scad-ncvreg-1.pdf and b/regression-problems_files/figure-pdf/fig-scad-ncvreg-1.pdf differ diff --git a/search.json b/search.json index 963c7d36..9976592c 100644 --- a/search.json +++ b/search.json @@ -4,7 +4,7 @@ "href": "index.html", "title": "R 语言数据分析实战", "section": "", - "text": "欢迎\n\n\n\n\n\n\n警告\n\n\n\nBook in early development. 
Planned release in 2024.\n\n\n本书初稿是在 RStudio IDE 内使用 Quarto 编辑的,Quarto 是继R Markdown之后,一个新的开源的科学和技术发布系统,它基于 Pandoc支持输出多种格式的书稿,比如 HTML 网页、EPUB 电子书、DOCX 文档和 PDF 便携式文档等。Quarto 吸收了过去 10 年 R Markdown 取得的经验和教训,在书籍写作、创建博客、制作简历和幻灯片等系列场景中支持更加统一的使用语法,一份源文档输出多种格式,使文档内容在不同场景中的迁移成本更低。了解更多 Quarto 特性,请访问 https://quarto.org/。\n书中的代码字体采用美观的 Source Code Pro 字体, 为方便跨操作系统编译书籍电子版,正文的中文字体采用开源的 fandol 字体。此外,考虑到美观性,本书图形使用了 Noto 系列中英文字体,它们来自 Google Fonts 字体库,分别是 Noto Sans 无衬线英文字体和 Noto Serif SC 宋体中文字体。\n书中 R 包名以粗体表示,如 knitr 包,函数名以等宽体表示,如 plot(),函数的参数名同理。代码块内注释用 # 表示,运行结果每一行开头以 #> 标记。本书写作过程中,依赖 knitr (Xie 2015)、ggplot2 (Wickham 2016) 和 lattice (Sarkar 2008) 等众多 R 包。考虑到要同时支持 DOCX、EPUB、PDF 和 HTML 四种书籍格式,书中使用 knitr 包和 gt 包制作静态的表格。\n为方便测试贡献者提供的 PR,本书托管在 Github 上,同时启用 Github Action 服务,为书籍自定义了一个可复现全书内容的运行环境,包括 R 软件、扩展包和系统软件依赖,详见仓库中的 DESCRIPTION 文件。你现在看到的是在线编译版本,使用 Quarto 1.4.549,最新一次编译时间是 2024-02-01 14:00:55。\n\nxfun::session_info(packages = c(\n \"ggplot2\", \"gganimate\", \"ggrepel\", \"ggdensity\",\n \"ggridges\", \"ggsignif\", \"ggforce\", \"ggbeeswarm\",\n \"ggeffects\", \"ggnewscale\", \"patchwork\", \"shiny\",\n \"plotly\", \"lattice\", \"igraph\", \"tidygraph\", \"ggraph\",\n \"dplyr\", \"purrr\", \"tidyr\", \"httr\", \"data.table\",\n \"rsconnect\", \"knitr\", \"rmarkdown\", \"gt\", \"DT\",\n \"mgcv\", \"glmnet\", \"lme4\", \"xgboost\", \"spaMM\",\n \"sf\", \"stars\", \"terra\", \"INLA\", \"cmdstanr\",\n \"showtext\", \"gifski\", \"tinytex\", \"magick\"\n), dependencies = FALSE)\n\n#> R version 4.3.2 (2023-10-31)\n#> Platform: x86_64-apple-darwin20 (64-bit)\n#> Running under: macOS Ventura 13.6.3\n#> \n#> Locale: en_US.UTF-8 / en_US.UTF-8 / en_US.UTF-8 / C / en_US.UTF-8 / en_US.UTF-8\n#> \n#> Package version:\n#> cmdstanr_0.7.1 data.table_1.15.0 dplyr_1.1.4 DT_0.31 \n#> gganimate_1.0.8 ggbeeswarm_0.7.2 ggdensity_1.0.0 ggeffects_1.3.4 \n#> ggforce_0.4.1 ggnewscale_0.4.9 ggplot2_3.5.0 ggraph_2.1.0.9000\n#> ggrepel_0.9.5 ggridges_0.5.6 ggsignif_0.6.4 gifski_1.12.0.2 \n#> glmnet_4.1.8 
gt_0.10.1 httr_1.4.7 igraph_2.0.1 \n#> INLA_23.9.9 knitr_1.45 lattice_0.21.9 lme4_1.1.35.1 \n#> magick_2.8.2 mgcv_1.9.0 patchwork_1.2.0 plotly_4.10.4 \n#> purrr_1.0.2 rmarkdown_2.25 rsconnect_1.2.0 sf_1.0.15 \n#> shiny_1.8.0 showtext_0.9.6 spaMM_4.4.16 stars_0.6.4 \n#> terra_1.7.71 tidygraph_1.3.1 tidyr_1.3.1 tinytex_0.49 \n#> xgboost_1.7.7.1 \n#> \n#> Pandoc version: 3.1.11\n#> \n#> LaTeX version used: \n#> TeX Live 2023 (TinyTeX) with tlmgr 2024-01-13\n\n\n\n\n\n\nSarkar, Deepayan. 2008. lattice: Multivariate Data Visualization with R. New York: Springer. http://lmdvr.r-forge.r-project.org.\n\n\nWickham, Hadley. 2016. ggplot2: Elegant Graphics for Data Analysis. 2nd 本. Springer-Verlag New York. https://ggplot2.tidyverse.org.\n\n\nXie, Yihui. 2015. Dynamic Documents with R and knitr. 2nd 本. Boca Raton, Florida: Chapman; Hall/CRC. https://yihui.org/knitr/.", + "text": "欢迎\n\n\n\n\n\n\n警告\n\n\n\nBook in early development. Planned release in 2024.\n\n\n本书初稿是在 RStudio IDE 内使用 Quarto 编辑的,Quarto 是继R Markdown之后,一个新的开源的科学和技术发布系统,它基于 Pandoc支持输出多种格式的书稿,比如 HTML 网页、EPUB 电子书、DOCX 文档和 PDF 便携式文档等。Quarto 吸收了过去 10 年 R Markdown 取得的经验和教训,在书籍写作、创建博客、制作简历和幻灯片等系列场景中支持更加统一的使用语法,一份源文档输出多种格式,使文档内容在不同场景中的迁移成本更低。了解更多 Quarto 特性,请访问 https://quarto.org/。\n书中的代码字体采用美观的 Source Code Pro 字体, 为方便跨操作系统编译书籍电子版,正文的中文字体采用开源的 fandol 字体。此外,考虑到美观性,本书图形使用了 Noto 系列中英文字体,它们来自 Google Fonts 字体库,分别是 Noto Sans 无衬线英文字体和 Noto Serif SC 宋体中文字体。\n书中 R 包名以粗体表示,如 knitr 包,函数名以等宽体表示,如 plot(),函数的参数名同理。代码块内注释用 # 表示,运行结果每一行开头以 #> 标记。本书写作过程中,依赖 knitr (Xie 2015)、ggplot2 (Wickham 2016) 和 lattice (Sarkar 2008) 等众多 R 包。考虑到要同时支持 DOCX、EPUB、PDF 和 HTML 四种书籍格式,书中使用 knitr 包和 gt 包制作静态的表格。\n为方便测试贡献者提供的 PR,本书托管在 Github 上,同时启用 Github Action 服务,为书籍自定义了一个可复现全书内容的运行环境,包括 R 软件、扩展包和系统软件依赖,详见仓库中的 DESCRIPTION 文件。你现在看到的是在线编译版本,使用 Quarto 1.4.549,最新一次编译时间是 2024-02-05 12:50:52。\n\nxfun::session_info(packages = c(\n \"ggplot2\", \"gganimate\", \"ggrepel\", \"ggdensity\",\n \"ggridges\", \"ggsignif\", \"ggforce\", \"ggbeeswarm\",\n \"ggeffects\", 
\"ggnewscale\", \"patchwork\", \"shiny\",\n \"plotly\", \"lattice\", \"igraph\", \"tidygraph\", \"ggraph\",\n \"dplyr\", \"purrr\", \"tidyr\", \"httr\", \"data.table\",\n \"rsconnect\", \"knitr\", \"rmarkdown\", \"gt\", \"DT\",\n \"mgcv\", \"glmnet\", \"lme4\", \"xgboost\", \"spaMM\",\n \"sf\", \"stars\", \"terra\", \"INLA\", \"cmdstanr\",\n \"showtext\", \"gifski\", \"tinytex\", \"magick\"\n), dependencies = FALSE)\n\n#> R version 4.3.2 (2023-10-31)\n#> Platform: x86_64-apple-darwin20 (64-bit)\n#> Running under: macOS Ventura 13.6.3\n#> \n#> Locale: en_US.UTF-8 / en_US.UTF-8 / en_US.UTF-8 / C / en_US.UTF-8 / en_US.UTF-8\n#> \n#> Package version:\n#> cmdstanr_0.7.1 data.table_1.15.0 dplyr_1.1.4 DT_0.31 \n#> gganimate_1.0.8 ggbeeswarm_0.7.2 ggdensity_1.0.0 ggeffects_1.3.4 \n#> ggforce_0.4.1 ggnewscale_0.4.9 ggplot2_3.5.0 ggraph_2.1.0.9000\n#> ggrepel_0.9.5 ggridges_0.5.6 ggsignif_0.6.4 gifski_1.12.0.2 \n#> glmnet_4.1.8 gt_0.10.1 httr_1.4.7 igraph_2.0.1.1 \n#> INLA_23.9.9 knitr_1.45 lattice_0.21.9 lme4_1.1.35.1 \n#> magick_2.8.2 mgcv_1.9.0 patchwork_1.2.0 plotly_4.10.4 \n#> purrr_1.0.2 rmarkdown_2.25 rsconnect_1.2.1 sf_1.0.15 \n#> shiny_1.8.0 showtext_0.9.6 spaMM_4.4.16 stars_0.6.4 \n#> terra_1.7.71 tidygraph_1.3.1 tidyr_1.3.1 tinytex_0.49 \n#> xgboost_1.7.7.1 \n#> \n#> Pandoc version: 3.1.11\n#> \n#> LaTeX version used: \n#> TeX Live 2023 (TinyTeX) with tlmgr 2024-01-31\n\n\n\n\n\n\nSarkar, Deepayan. 2008. lattice: Multivariate Data Visualization with R. New York: Springer. http://lmdvr.r-forge.r-project.org.\n\n\nWickham, Hadley. 2016. ggplot2: Elegant Graphics for Data Analysis. 2nd 本. Springer-Verlag New York. https://ggplot2.tidyverse.org.\n\n\nXie, Yihui. 2015. Dynamic Documents with R and knitr. 2nd 本. Boca Raton, Florida: Chapman; Hall/CRC. 
https://yihui.org/knitr/.", "crumbs": [ "欢迎" ] @@ -156,7 +156,7 @@ "href": "wrangling-objects.html#sec-data-structure", "title": "1  数据对象", "section": "\n1.2 数据结构", - "text": "1.2 数据结构\n\n1.2.1 向量\n所有元素都是同一类型\n\n1.2.2 矩阵\n所有元素都是同一类型\n\n1.2.3 数组\n所有元素都是同一类型\n\n1.2.4 列表\n元素可以属于不同类型\n\n1.2.5 因子\n\n1.2.6 数据框\n同列的元素类型必须一致,不同列的元素类型可以不同。\n\n1.2.7 ts\nts 类型用于表示时间序列数据,是继承自数组类型的。给定数据、采样初始时间、采样频率的情况下,利用内置的函数 ts() 构造一个 ts 类型的分钟级的时间序列对象。\n\nx <- ts(\n data = rnorm(100), \n start = c(2017, 1), \n frequency = 365.25 * 24 * 60, \n class = \"ts\", names = \"Time_Series\"\n)\n\nts() 函数的 start 和 frequency 参数很关键,前者指定了时间单位是天,后者指定每个时间单位下的数据点的数量。其中 365.25 是因为每隔 4 年有 366 天,平均下来,每年算 365.25 天。每隔 1 / (24 * 60) 天(即 1 分钟)采样一个点。如果初始时间不是从一年的第1分钟开始,而是从此时此刻 2023-01-31 10:43:30 CST 开始,则可以换算成今年的第 30 * 24 * 60 + 9 * 60 + 43 = 43783 分钟,则 Start = c(2023, 43783)。\n以数据集 x 为例,它是一个 ts 类型的时间序列数据对象。时间序列对象有很多方法,如函数 class() 、 mode() 和 str() 分别可以查看其数据类型、存储类型和数据结构。\n\n# 数据类型\nclass(x)\n\n[1] \"ts\"\n\n# 存储类型\nmode(x)\n\n[1] \"numeric\"\n\n# 数据结构\nstr(x)\n\n Time-Series [1:100] from 2017 to 2017: 0.2767 -1.8463 2.4645 -0.1078 0.0194 ...\n\n\n函数 start() 和 end() 查看开始和结束的时间点。\n\nc(start(x), end(x))\n\n[1] 2017 1 2017 100\n\n\n函数 time() 可以查看在以上时间区间的划分。\n\ntime(x)\n\nTime Series:\nStart = c(2017, 1) \nEnd = c(2017, 100) \nFrequency = 525960 \n [1] 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017\n [16] 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017\n [31] 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017\n [46] 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017\n [61] 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017\n [76] 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017\n [91] 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017\n\n\n函数 tsp() 可以查看其期初、期末和周期。\n\ntsp(x)\n\n[1] 2017 2017 525960", + "text": "1.2 数据结构\n\n1.2.1 向量\n所有元素都是同一类型\n\n1.2.2 矩阵\n所有元素都是同一类型\n\n1.2.3 
数组\n所有元素都是同一类型\n\n1.2.4 列表\n元素可以属于不同类型\n\n1.2.5 因子\n\n1.2.6 数据框\n同列的元素类型必须一致,不同列的元素类型可以不同。\n\n1.2.7 ts\nts 类型用于表示时间序列数据,是继承自数组类型的。给定数据、采样初始时间、采样频率的情况下,利用内置的函数 ts() 构造一个 ts 类型的分钟级的时间序列对象。\n\nx <- ts(\n data = rnorm(100), \n start = c(2017, 1), \n frequency = 365.25 * 24 * 60, \n class = \"ts\", names = \"Time_Series\"\n)\n\nts() 函数的 start 和 frequency 参数很关键,前者指定了时间单位是天,后者指定每个时间单位下的数据点的数量。其中 365.25 是因为每隔 4 年有 366 天,平均下来,每年算 365.25 天。每隔 1 / (24 * 60) 天(即 1 分钟)采样一个点。如果初始时间不是从一年的第1分钟开始,而是从此时此刻 2023-01-31 10:43:30 CST 开始,则可以换算成今年的第 30 * 24 * 60 + 9 * 60 + 43 = 43783 分钟,则 Start = c(2023, 43783)。\n以数据集 x 为例,它是一个 ts 类型的时间序列数据对象。时间序列对象有很多方法,如函数 class() 、 mode() 和 str() 分别可以查看其数据类型、存储类型和数据结构。\n\n# 数据类型\nclass(x)\n\n[1] \"ts\"\n\n# 存储类型\nmode(x)\n\n[1] \"numeric\"\n\n# 数据结构\nstr(x)\n\n Time-Series [1:100] from 2017 to 2017: 0.338 -0.609 0.641 -1.209 0.613 ...\n\n\n函数 start() 和 end() 查看开始和结束的时间点。\n\nc(start(x), end(x))\n\n[1] 2017 1 2017 100\n\n\n函数 time() 可以查看在以上时间区间的划分。\n\ntime(x)\n\nTime Series:\nStart = c(2017, 1) \nEnd = c(2017, 100) \nFrequency = 525960 \n [1] 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017\n [16] 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017\n [31] 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017\n [46] 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017\n [61] 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017\n [76] 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017\n [91] 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017\n\n\n函数 tsp() 可以查看其期初、期末和周期。\n\ntsp(x)\n\n[1] 2017 2017 525960", "crumbs": [ "数据准备", "1  数据对象" @@ -211,7 +211,7 @@ "href": "wrangling-collection.html#sec-rest-api", "title": "2  数据获取", "section": "\n2.4 从数据接口中获取", - "text": "2.4 从数据接口中获取\n\n2.4.1 Github\n从 Github API 接口中获取托管在 Github 上的 R 包的信息,比如点赞、关注和转发的数量。首先从 CRAN 上获得 R 包元数据信息,接着筛选出托管在 Github 上的 R 包,清理出 R 包在 Github 上的网址。\n\npdb <- 
readRDS(file = \"data/cran-package-db-20231231.rds\")\n# 过滤出 Github \npdb <- subset(\n x = pdb, subset = !duplicated(Package) & grepl(pattern = \"github\", x = BugReports),\n select = c(\"Package\", \"Maintainer\", \"Title\", \"BugReports\")\n)\n# 掐头去尾\npdb$repo <- sub(x = pdb$BugReports, pattern = \"(http|https)://(www\\\\.){0,1}github\\\\.com/\", replacement = \"\")\npdb$repo <- sub(x = pdb$repo, pattern = \"/{1,}(issues|blob).*\", replacement = \"\")\npdb$repo <- sub(x = pdb$repo, pattern = \"/{1,}(discussions|wiki)\", replacement = \"\")\npdb$repo <- sub(x = pdb$repo, pattern = \"/$\", replacement = \"\")\n\n获取某代码仓库信息的 Github API 是 https://api.github.com/repos ,为了批量地访问 API ,收集想要的数据,将数据请求、结果整理的过程打包成一个函数。\n\ngithub_stats <- function(repo) {\n url <- paste(\"https://api.github.com/repos\", repo, sep = \"/\")\n # 最多允许失败 5 次,每失败一次休息 5s\n req <- xfun::retry(curl::curl_fetch_memory, url = url, .times = 5, .pause = 5)\n x <- jsonlite::fromJSON(rawToChar(req$content))\n # 爬失败的标记一下\n if(is.null(x$stargazers_count)) x$stargazers_count <- x$subscribers_count <- x$forks_count <- -1\n # 爬一个休息 1s\n Sys.sleep(1)\n data.frame(\n repo = repo,\n # 点赞 仓库上 star 的人数\n stargazers_count = x$stargazers_count,\n # 关注 仓库上 watch 的人数\n subscribers_count = x$subscribers_count,\n # 转发 仓库上 fork 的人数\n forks_count = x$forks_count\n )\n}\n\n下面测试一下这段代码,获取代码仓库 yihui/knitr 的点赞、关注和转发的人数。\n\n# 测试代码\ngithub_stats(repo = \"yihui/knitr\")\n\n repo stargazers_count subscribers_count forks_count\n1 yihui/knitr 2328 115 878\n\n\n理论上,使用函数 lapply() 遍历所有 R 包可得所需数据,将数据收集函数应用到每一个 R 包上再合并结果,即如下操作。\n\n# 合并数据\ngh_repo_db <- data.table::rbindlist(lapply(pdb$repo, github_stats))\n\n实际上,在没有访问令牌的情况下,Github API 的访问次数是有限制的,只有 60 次(一段时间内)。首先在 Github 开发者设置中申请一个应用,获得应用名称(appname)、客户端 ID(key)和密钥(secret),下面借助 httr 包配置 OAuth 凭证。\n\nlibrary(httr)\n# Github API Oauth2\noauth_endpoints(\"github\")\n# 应用名称(appname)、客户端 ID(key)和密钥(secret)\nmyapp <- oauth_app(\n appname = \"Application Name\", key = \"Client ID\",\n secret = 
\"Client Secrets\"\n)\n# 获取 OAuth 凭证\ngithub_token <- oauth2.0_token(oauth_endpoints(\"github\"), myapp)\n# 使用 API\ngtoken <- config(token = github_token)\n\n修改函数 github_stats() 中请求 Github API 的一行代码,发送带密钥的 GET 请求。\n\nreq <- xfun::retry(GET, url = url, config = gtoken, .times = 5, .pause = 5)\n\n此外,请求难免出现意外,按照上面的方式,一旦报错,数据都将丢失。因此,要预先准备存储空间,每获取一条数据就存进去,如果报错了,就打个标记。\n\n# 准备存储数据\ngh_repo_db <- data.frame(\n repo = pdb$repo, stargazers_count = rep(-1, length(pdb$repo)),\n subscribers_count = rep(-1, length(pdb$repo)),\n forks_count = rep(-1, length(pdb$repo))\n)\n# 不断更新数据\nwhile (any(gh_repo_db$stargazers_count == -1)) {\n tmp <- gh_repo_db[gh_repo_db$stargazers_count == -1, ]\n for (repo in tmp$repo) {\n gh_repo_db[gh_repo_db$repo == repo, ] <- github_stats(repo = repo)\n }\n if(repo == tmp$repo[length(tmp$repo)]) break\n}\n\n最后,将收集整理好的数据保存到磁盘上,下面按点赞数量给 R 包排序,篇幅所限,仅展示前 20。\n\ngh_repo_db <- readRDS(file = \"data/gh-repo-db-2023.rds\")\ngh_repo_db <- gh_repo_db[!duplicated(gh_repo_db$repo),]\ngh_repo_db <- gh_repo_db[order(gh_repo_db$stargazers_count, decreasing = T),] \nhead(gh_repo_db, 20)\n\n repo stargazers_count subscribers_count forks_count\n8434 dmlc/xgboost 25266 909 8707\n5553 facebook/prophet 17415 425 4474\n4307 mlflow/mlflow 16365 292 3793\n3807 Microsoft/LightGBM 15821 437 3798\n265 apache/arrow 13080 356 3220\n3080 h2oai/h2o-3 6624 384 2016\n2790 tidyverse/ggplot2 6210 308 2028\n3430 interpretml/interpret 5894 141 706\n6921 rstudio/shiny 5180 339 1818\n4317 mlpack/mlpack 4668 185 1577\n1754 tidyverse/dplyr 4612 246 2131\n640 rstudio/bookdown 3565 122 1263\n1430 Rdatatable/data.table 3437 170 977\n6316 rstudio/rmarkdown 2758 146 977\n2269 wesm/feather 2708 97 174\n5324 plotly/plotly.R 2467 117 628\n5084 thomasp85/patchwork 2344 49 159\n1389 r-lib/devtools 2340 120 760\n3658 yihui/knitr 2326 115 877\n6868 satijalab/seurat 2034 75 867\n\n\n将发布在 Github 上的受欢迎的 R 包列出来了,方便读者选用,也看到一些有意思的结果。\n\n机器学习相关的 R 包靠在最前面,实际上,它们(占十之七八)多是对应软件的 R 
语言接口,点赞的数目应当算上其它语言接口的贡献。\n在机器学习之后,依次是数据可视化(ggplot2、shiny、plotly.R、patchwork)、数据操作(dplyr、data.table、feather)和可重复性计算(bookdown、rmarkdown、knitr)、R 包开发(devtools)和生物信息(seurat)。\n\n最后,简要说明数据的情况:以上观察结果是基于 CRAN 在 2023-12-31 发布的 R 包元数据,8475 个 R 包在 Github 托管源代码,这些 R 包的点赞、关注和转发数据是在 2024-01-30 爬取的。其中,共有 29 个 R 包不按规矩填写、改名字、换地方、甚至删库了,这些 R 包是可忽略的。当然,也存在一些 R 包并未托管在 Github 上,但质量不错,比如 glmnet 包、colorspace 包、fGarch 包等,应当是少量的。\n\n2.4.2 中国地震台网\n中国地震台网 可以想象后台有一个数据库,在页面的小窗口中输入查询条件,转化为某种 SQL 语句,传递给数据库管理系统,执行查询语句,返回查询结果,即数据。\n\n2.4.3 美国地质调查局\n美国地质调查局提供一些选项窗口,可供选择数据范围,直接下载 CSV 或 XLS 文件。\n\n2.4.4 美国人口调查局\n美国人口调查局\ntidycensus 需要注册账号,获取使用 API 接口的访问令牌,可以想象后台不仅有一个数据库,在此之上,还有一层数据鉴权。\n\n2.4.5 世界银行\n世界银行和国际货币基金组织\nwbstats 包封装世界银行提供的数据接口 REST API", + "text": "2.4 从数据接口中获取\n\n2.4.1 Github\n从 Github API 接口中获取托管在 Github 上的 R 包的信息,比如点赞、关注和转发的数量。首先从 CRAN 上获得 R 包元数据信息,接着筛选出托管在 Github 上的 R 包,清理出 R 包在 Github 上的网址。\n\npdb <- readRDS(file = \"data/cran-package-db-20231231.rds\")\n# 过滤出 Github \npdb <- subset(\n x = pdb, subset = !duplicated(Package) & grepl(pattern = \"github\", x = BugReports),\n select = c(\"Package\", \"Maintainer\", \"Title\", \"BugReports\")\n)\n# 掐头去尾\npdb$repo <- sub(x = pdb$BugReports, pattern = \"(http|https)://(www\\\\.){0,1}github\\\\.com/\", replacement = \"\")\npdb$repo <- sub(x = pdb$repo, pattern = \"/{1,}(issues|blob).*\", replacement = \"\")\npdb$repo <- sub(x = pdb$repo, pattern = \"/{1,}(discussions|wiki)\", replacement = \"\")\npdb$repo <- sub(x = pdb$repo, pattern = \"/$\", replacement = \"\")\n\n获取某代码仓库信息的 Github API 是 https://api.github.com/repos ,为了批量地访问 API ,收集想要的数据,将数据请求、结果整理的过程打包成一个函数。\n\ngithub_stats <- function(repo) {\n url <- paste(\"https://api.github.com/repos\", repo, sep = \"/\")\n # 最多允许失败 5 次,每失败一次休息 5s\n req <- xfun::retry(curl::curl_fetch_memory, url = url, .times = 5, .pause = 5)\n x <- jsonlite::fromJSON(rawToChar(req$content))\n # 爬失败的标记一下\n if(is.null(x$stargazers_count)) x$stargazers_count <- x$subscribers_count <- x$forks_count <- -1\n # 爬一个休息 1s\n 
Sys.sleep(1)\n data.frame(\n repo = repo,\n # 点赞 仓库上 star 的人数\n stargazers_count = x$stargazers_count,\n # 关注 仓库上 watch 的人数\n subscribers_count = x$subscribers_count,\n # 转发 仓库上 fork 的人数\n forks_count = x$forks_count\n )\n}\n\n下面测试一下这段代码,获取代码仓库 yihui/knitr 的点赞、关注和转发的人数。\n\n# 测试代码\ngithub_stats(repo = \"yihui/knitr\")\n\n repo stargazers_count subscribers_count forks_count\n1 yihui/knitr -1 -1 -1\n\n\n理论上,使用函数 lapply() 遍历所有 R 包可得所需数据,将数据收集函数应用到每一个 R 包上再合并结果,即如下操作。\n\n# 合并数据\ngh_repo_db <- data.table::rbindlist(lapply(pdb$repo, github_stats))\n\n实际上,在没有访问令牌的情况下,Github API 的访问次数是有限制的,只有 60 次(一段时间内)。首先在 Github 开发者设置中申请一个应用,获得应用名称(appname)、客户端 ID(key)和密钥(secret),下面借助 httr 包配置 OAuth 凭证。\n\nlibrary(httr)\n# Github API Oauth2\noauth_endpoints(\"github\")\n# 应用名称(appname)、客户端 ID(key)和密钥(secret)\nmyapp <- oauth_app(\n appname = \"Application Name\", key = \"Client ID\",\n secret = \"Client Secrets\"\n)\n# 获取 OAuth 凭证\ngithub_token <- oauth2.0_token(oauth_endpoints(\"github\"), myapp)\n# 使用 API\ngtoken <- config(token = github_token)\n\n修改函数 github_stats() 中请求 Github API 的一行代码,发送带密钥的 GET 请求。\n\nreq <- xfun::retry(GET, url = url, config = gtoken, .times = 5, .pause = 5)\n\n此外,请求难免出现意外,按照上面的方式,一旦报错,数据都将丢失。因此,要预先准备存储空间,每获取一条数据就存进去,如果报错了,就打个标记。\n\n# 准备存储数据\ngh_repo_db <- data.frame(\n repo = pdb$repo, stargazers_count = rep(-1, length(pdb$repo)),\n subscribers_count = rep(-1, length(pdb$repo)),\n forks_count = rep(-1, length(pdb$repo))\n)\n# 不断更新数据\nwhile (any(gh_repo_db$stargazers_count == -1)) {\n tmp <- gh_repo_db[gh_repo_db$stargazers_count == -1, ]\n for (repo in tmp$repo) {\n gh_repo_db[gh_repo_db$repo == repo, ] <- github_stats(repo = repo)\n }\n if(repo == tmp$repo[length(tmp$repo)]) break\n}\n\n最后,将收集整理好的数据保存到磁盘上,下面按点赞数量给 R 包排序,篇幅所限,仅展示前 20。\n\ngh_repo_db <- readRDS(file = \"data/gh-repo-db-2023.rds\")\ngh_repo_db <- gh_repo_db[!duplicated(gh_repo_db$repo),]\ngh_repo_db <- gh_repo_db[order(gh_repo_db$stargazers_count, decreasing = T),] \nhead(gh_repo_db, 20)\n\n repo 
stargazers_count subscribers_count forks_count\n8434 dmlc/xgboost 25266 909 8707\n5553 facebook/prophet 17415 425 4474\n4307 mlflow/mlflow 16365 292 3793\n3807 Microsoft/LightGBM 15821 437 3798\n265 apache/arrow 13080 356 3220\n3080 h2oai/h2o-3 6624 384 2016\n2790 tidyverse/ggplot2 6210 308 2028\n3430 interpretml/interpret 5894 141 706\n6921 rstudio/shiny 5180 339 1818\n4317 mlpack/mlpack 4668 185 1577\n1754 tidyverse/dplyr 4612 246 2131\n640 rstudio/bookdown 3565 122 1263\n1430 Rdatatable/data.table 3437 170 977\n6316 rstudio/rmarkdown 2758 146 977\n2269 wesm/feather 2708 97 174\n5324 plotly/plotly.R 2467 117 628\n5084 thomasp85/patchwork 2344 49 159\n1389 r-lib/devtools 2340 120 760\n3658 yihui/knitr 2326 115 877\n6868 satijalab/seurat 2034 75 867\n\n\n将发布在 Github 上的受欢迎的 R 包列出来了,方便读者选用,也看到一些有意思的结果。\n\n机器学习相关的 R 包靠在最前面,实际上,它们(占十之七八)多是对应软件的 R 语言接口,点赞的数目应当算上其它语言接口的贡献。\n在机器学习之后,依次是数据可视化(ggplot2、shiny、plotly.R、patchwork)、数据操作(dplyr、data.table、feather)和可重复性计算(bookdown、rmarkdown、knitr)、R 包开发(devtools)和生物信息(seurat)。\n\n最后,简要说明数据的情况:以上观察结果是基于 CRAN 在 2023-12-31 发布的 R 包元数据,8475 个 R 包在 Github 托管源代码,这些 R 包的点赞、关注和转发数据是在 2024-01-30 爬取的。其中,共有 29 个 R 包不按规矩填写、改名字、换地方、甚至删库了,这些 R 包是可忽略的。当然,也存在一些 R 包并未托管在 Github 上,但质量不错,比如 glmnet 包、colorspace 包、fGarch 包等,应当是少量的。\n\n2.4.2 中国地震台网\n中国地震台网 可以想象后台有一个数据库,在页面的小窗口中输入查询条件,转化为某种 SQL 语句,传递给数据库管理系统,执行查询语句,返回查询结果,即数据。\n\n2.4.3 美国地质调查局\n美国地质调查局提供一些选项窗口,可供选择数据范围,直接下载 CSV 或 XLS 文件。\n\n2.4.4 美国人口调查局\n美国人口调查局\ntidycensus 需要注册账号,获取使用 API 接口的访问令牌,可以想象后台不仅有一个数据库,在此之上,还有一层数据鉴权。\n\n2.4.5 世界银行\n世界银行和国际货币基金组织\nwbstats 包封装世界银行提供的数据接口 REST API", "crumbs": [ "数据准备", "2  数据获取" @@ -1586,7 +1586,7 @@ "href": "analyze-network-data.html#sec-community-org", "title": "23  网络数据分析", "section": "\n23.2 R 语言社区的组织", - "text": "23.2 R 语言社区的组织\n除了 RStudio 公司出品的 tidyverse (Wickham 等 2019) 和 tidymodels (Kuhn 和 Wickham 2020),还有一些数据分析、建模的工具箱,如 mlr3verse (Lang 和 Schratz 2023)、easystats (Lüdecke 等 2022)、strengejacke (Lüdecke 2019) 和 DrWhy (Biecek 2023)。也有的组织基本停止了开发,如 
Omegahat。还有的被商业公司收购后,不再活跃了,如 Revolution Analytics。它们作为解决方案大都属于一些组织,还有深藏功与名,有待笔者挖掘的。因不存在明显的规律,下面从开发者的邮箱出发,隶属企业、组织往往有统一的邮箱后缀。\n\nstr_extract <- function(text, pattern, ...) regmatches(text, regexpr(pattern, text, ...))\n# 移除 ORPHANED\npdb <- subset(pdb, subset = Maintainer != \"ORPHANED\")\n# 抽取邮件后缀\nextract_email_suffix <- function(x) {\n x <- str_extract(text = x, pattern = \"<.*?>\")\n sub(x = x, pattern = \".*?@(.*?)>\", replacement = \"\\\\1\")\n}\npdb$Email_suffix <- extract_email_suffix(pdb$Maintainer)\n\n按组织统计扩展包的数量(总的 R 包数量约 2 万),即各个组织开发的 R 包。\n\npdb_pkg <- aggregate(\n data = pdb, Package ~ Email_suffix, FUN = function(x) { length(unique(x)) }\n)\nhead(pdb_pkg[order(pdb_pkg$Package, decreasing = TRUE), ], 20)\n\n#> Email_suffix Package\n#> 876 gmail.com 6968\n#> 2044 rstudio.com 208\n#> 979 hotmail.com 185\n#> 1825 outlook.com 152\n#> 1971 R-project.org 106\n#> 2 163.com 94\n#> 210 berkeley.edu 91\n#> 2559 umich.edu 91\n#> 2819 uw.edu 74\n#> 1927 protonmail.com 73\n#> 2564 umn.edu 69\n#> 581 debian.org 68\n#> 2951 yahoo.com 68\n#> 1828 outlook.fr 63\n#> 2212 stanford.edu 58\n#> 155 auckland.ac.nz 57\n#> 887 gmx.de 55\n#> 2911 wisc.edu 55\n#> 895 googlemail.com 50\n#> 1970 r-project.org 50\n\n\n不难看出,至少有如下几类:\n\n邮件服务提供商。6968 个 R 包使用 gmail 邮箱作为联系维护者的方式,googlemail.com 也是谷歌提供的服务。hotmail.com 和 outlook.com 都是微软提供的邮箱服务,outlook.fr (法国)也是,除此之外,比较大的邮件服务提供商就是 163.com(网易)、 protonmail.com 和 yahoo.com (雅虎)等。\n商业组织。208 个 R 包来自 RStudio 公司的员工,这些维护者使用 RStudio 公司提供的邮箱。\n开源组织。R-project.org 和 r-project.org 都是 R 语言组织的联系方式,自不必多说,R 语言核心团队成员不仅维护 R 软件源码,还维护了很多 R 包。debian.org 是 Debian 组织的联系方式,都是开源组织(Open Source Org)。\n教育机构。berkeley.edu 、umich.edu 等以 edu 结尾的北美(国)的大学,gmx.de、 posteo.de 等以 de 结尾的德国大学,ucl.ac.uk 等以 uk 结尾的英国的大学,auckland.ac.nz 等以 nz 结尾的新西兰的大学,uwaterloo.ca 等以 ca 结尾的加拿大的大学。\n\n按组织统计开发者的数量(总的开发者数量约 1 万),即各个组织的 R 包开发者。\n\npdb_org <- aggregate(\n data = pdb, Maintainer2 ~ Email_suffix, FUN = function(x) { length(unique(x)) }\n)\nhead(pdb_org[order(pdb_org$Maintainer2, decreasing = 
TRUE), ], 20)\n\n#> Email_suffix Maintainer2\n#> 876 gmail.com 3800\n#> 979 hotmail.com 110\n#> 1825 outlook.com 87\n#> 2 163.com 57\n#> 2559 umich.edu 54\n#> 2951 yahoo.com 51\n#> 2564 umn.edu 47\n#> 1927 protonmail.com 46\n#> 2819 uw.edu 46\n#> 887 gmx.de 34\n#> 210 berkeley.edu 33\n#> 2044 rstudio.com 30\n#> 895 googlemail.com 28\n#> 2212 stanford.edu 27\n#> 468 columbia.edu 26\n#> 1114 inrae.fr 26\n#> 2451 ucl.ac.uk 25\n#> 2964 yale.edu 25\n#> 635 duke.edu 23\n#> 1906 posteo.de 23\n\n\n可见,大部分开发者采用邮件服务提供商的邮件地址。3800 个开发者使用来自谷歌的 gmail.com、197 个开发者使用来自微软的 hotmail.com 或 outlook.com,57 个开发者使用来自网易的 163.com,51 个开发者使用来自雅虎的 yahoo.com,46 个开发者使用来自 Proton 的 protonmail.com。\n无论从开发者数量还是 R 包数量的角度看,都有两个显著特点。其一马太效应,往头部集中,其二,长尾分布,尾部占比接近甚至超过 50%。\n\n23.2.1 美国、英国和加拿大\n1666 个开发者来自以 edu 为后缀的邮箱。各个组织(主要是大学)及其 R 包开发者数据如下:\n\nsum(pdb_org[grepl(pattern = \"edu$\", x = pdb_org$Email_suffix), \"Maintainer2\"])\n\n#> [1] 1666\n\npdb_org_edu <- pdb_org[grepl(pattern = \"edu$\", x = pdb_org$Email_suffix), ]\npdb_org_edu[order(pdb_org_edu$Maintainer2, decreasing = TRUE), ] |> head(20)\n\n#> Email_suffix Maintainer2\n#> 2559 umich.edu 54\n#> 2564 umn.edu 47\n#> 2819 uw.edu 46\n#> 210 berkeley.edu 33\n#> 2212 stanford.edu 27\n#> 468 columbia.edu 26\n#> 2964 yale.edu 25\n#> 635 duke.edu 23\n#> 2911 wisc.edu 23\n#> 482 cornell.edu 22\n#> 2444 ucdavis.edu 21\n#> 1929 psu.edu 19\n#> 2449 uchicago.edu 19\n#> 2830 vanderbilt.edu 19\n#> 1660 ncsu.edu 18\n#> 1663 nd.edu 18\n#> 1008 iastate.edu 17\n#> 1919 princeton.edu 17\n#> 1815 osu.edu 16\n#> 2523 uiowa.edu 16\n\n\n好吧,几乎全是美国各个 NB 大学的,比如华盛顿大学( uw.edu)、密歇根大学(umich.edu)、加州伯克利大学(berkeley.edu)等等。顺便一说,美国各个大学的网站,特别是统计院系很厉害的,已经帮大家收集得差不多了,有留学打算的读者自取,邮箱后缀就是学校/院官网。\n有些邮箱后缀带有院系,但是并没有向上合并到学校这一级,比如 stanford.edu 、stat.stanford.edu 和 alumni.stanford.edu 等没有合并统计。实际上,使用 edu 邮箱的教育机构大部份位于美国。有的邮箱来自教育机构,但是不以 edu 结尾,比如新西兰奥克兰大学 auckland.ac.nz 、瑞士苏黎世联邦理工学院 stat.math.ethz.ch 等美国以外的教育机构。下面分别查看英国和加拿大的情况。\n350 个开发者来自以 uk 为后缀的邮箱。各个组织(主要是大学)及其 R 
包开发者数据如下:\n\nsum(pdb_org[grepl(pattern = \"uk$\", x = pdb_org$Email_suffix), \"Maintainer2\"])\n\n#> [1] 350\n\npdb_org_uk <- pdb_org[grepl(pattern = \"uk$\", x = pdb_org$Email_suffix), ]\npdb_org_uk[order(pdb_org_uk$Maintainer2, decreasing = TRUE), ] |> head(20)\n\n#> Email_suffix Maintainer2\n#> 2451 ucl.ac.uk 25\n#> 329 cam.ac.uk 17\n#> 295 bristol.ac.uk 15\n#> 1088 imperial.ac.uk 14\n#> 658 ed.ac.uk 13\n#> 1286 lancaster.ac.uk 11\n#> 1363 lse.ac.uk 9\n#> 1605 mrc-bsu.cam.ac.uk 9\n#> 2878 warwick.ac.uk 9\n#> 870 glasgow.ac.uk 8\n#> 1364 lshtm.ac.uk 8\n#> 1424 manchester.ac.uk 8\n#> 636 durham.ac.uk 7\n#> 744 exeter.ac.uk 7\n#> 2260 statslab.cam.ac.uk 7\n#> 2188 soton.ac.uk 6\n#> 2972 york.ac.uk 6\n#> 978 hotmail.co.uk 5\n#> 1948 qmul.ac.uk 5\n#> 248 bioss.ac.uk 4\n\n\n258 个开发者来自以 ca 为后缀的邮箱。各个组织(主要是大学)及其 R 包开发者数据如下:\n\nsum(pdb_org[grepl(pattern = \"ca$\", x = pdb_org$Email_suffix), \"Maintainer2\"])\n\n#> [1] 258\n\npdb_org_ca <- pdb_org[grepl(pattern = \"ca$\", x = pdb_org$Email_suffix), ]\npdb_org_ca[order(pdb_org_ca$Maintainer2, decreasing = TRUE), ] |> head(10)\n\n#> Email_suffix Maintainer2\n#> 2822 uwaterloo.ca 19\n#> 1397 mail.mcgill.ca 14\n#> 2123 sfu.ca 12\n#> 2801 utoronto.ca 12\n#> 2426 ualberta.ca 11\n#> 2239 stat.ubc.ca 9\n#> 2434 ubc.ca 9\n#> 2813 uvic.ca 8\n#> 952 hec.ca 7\n#> 1416 mail.utoronto.ca 7\n\n\n\n23.2.2 CRAN 和 RStudio\n下面根据邮箱后缀匹配抽取 CRAN 团队及开发的 R 包,规则也许不能覆盖所有的情况,比如署名 CRAN Team 的维护者代表的是 CRAN 团队,XML 和 RCurl 包就由他们维护。再比如,Brian Ripley 的邮箱 ripley@stats.ox.ac.uk 就不是 CRAN 官网域名。读者若有补充,欢迎 PR 给我。\n代码cran_dev <- subset(pdb,\n subset = grepl(\n x = Maintainer,\n pattern = paste0(c(\n \"(@[Rr]-project\\\\.org)\", # 官方邮箱\n \"(ripley@stats.ox.ac.uk)\", # Brian Ripley\n \"(p.murrell@auckland.ac.nz)\", # Paul Murrell\n \"(paul@stat.auckland.ac.nz)\", # Paul Murrell\n \"(maechler@stat.math.ethz.ch)\", # Martin Maechler\n \"(mmaechler+Matrix@gmail.com)\", # Martin Maechler\n \"(bates@stat.wisc.edu)\", # Douglas Bates\n \"(pd.mes@cbs.dk)\", # Peter Dalgaard\n 
\"(ligges@statistik.tu-dortmund.de)\", # Uwe Ligges\n \"(tlumley@u.washington.edu)\", # Thomas Lumley\n \"(t.lumley@auckland.ac.nz)\", # Thomas Lumley\n \"(martyn.plummer@gmail.com)\", # Martyn Plummer\n \"(luke-tierney@uiowa.edu)\", # Luke Tierney\n \"(stefano.iacus@unimi.it)\", # Stefano M. Iacus\n \"(murdoch.duncan@gmail.com)\", # Duncan Murdoch\n \"(michafla@gene.com)\" # Michael Lawrence\n ), collapse = \"|\")\n ),\n select = c(\"Package\", \"Maintainer\")\n) |>\n transform(Maintainer = gsub(\n x = Maintainer,\n pattern = '(<([^<>]*)>)|(\")',\n replacement = \"\"\n )) |>\n transform(Maintainer = gsub(\n x = Maintainer,\n pattern = \"(R-core)|(R Core Team)\",\n replacement = \"CRAN Team\"\n )) |>\n transform(Maintainer = gsub(\n x = Maintainer,\n pattern = \"(S. M. Iacus)|(Stefano M.Iacus)|(Stefano Maria Iacus)\",\n replacement = \"Stefano M. Iacus\"\n )) |>\n transform(Maintainer = gsub(\n x = Maintainer,\n pattern = \"(Toby Hocking)\",\n replacement = \"Toby Dylan Hocking\"\n )) |>\n transform(Maintainer = gsub(\n x = Maintainer,\n pattern = \"(John M Chambers)\",\n replacement = \"John Chambers\"\n ))\n\ncran_dev <- aggregate(data = cran_dev, Package ~ Maintainer, FUN = function(x) length(unique(x)))\ncran_dev <- cran_dev[order(cran_dev$Package, decreasing = TRUE), ]\n\nknitr::kable(head(cran_dev, ceiling(nrow(cran_dev) / 2)),\n col.names = c(\"团队成员\", \"R 包数量\"), row.names = FALSE\n)\n代码knitr::kable(tail(cran_dev, floor(nrow(cran_dev) / 2)),\n col.names = c(\"团队成员\", \"R 包数量\"), row.names = FALSE\n)\n\n\n表格 23.1: CRAN 团队开发维护 R 包数量情况\n\n\n\n\n\n(a) 表\n\n\n\n团队成员\nR 包数量\n\n\n\nKurt Hornik\n28\n\n\nSimon Urbanek\n26\n\n\nAchim Zeileis\n25\n\n\nMartin Maechler\n25\n\n\nTorsten Hothorn\n25\n\n\nPaul Murrell\n19\n\n\nToby Dylan Hocking\n17\n\n\nBrian Ripley\n12\n\n\nThomas Lumley\n12\n\n\nUwe Ligges\n9\n\n\nDuncan Murdoch\n7\n\n\nDavid Meyer\n6\n\n\nCRAN Team\n5\n\n\n\n\n\n\n\n\n\n\n(b) 续表\n\n\n\n团队成员\nR 包数量\n\n\n\nFriedrich Leisch\n5\n\n\nLuke 
Tierney\n5\n\n\nMichael Lawrence\n5\n\n\nStefan Theussl\n5\n\n\nBettina Grün\n3\n\n\nJohn Chambers\n3\n\n\nSimon Wood\n3\n\n\nBettina Gruen\n2\n\n\nDeepayan Sarkar\n2\n\n\nDouglas Bates\n2\n\n\nMartyn Plummer\n2\n\n\nPeter Dalgaard\n1\n\n\n\n\n\n\n\n\n\n\n\nKurt Hornik、Simon Urbanek、Achim Zeileis 等真是高产呐!除了维护 R 语言核心代码,还开发维护了那么多 R 包。以 Brian Ripley 为例,看看他都具体维护了哪些 R 包。\n\n代码subset(pdb,\n subset = grepl(x = Maintainer, pattern = \"Brian Ripley\"),\n select = c(\"Package\", \"Title\"), drop = TRUE\n) |>\n unique(by = \"Package\") |>\n transform(Title = gsub(pattern = \"(\\\\\\n)\", replacement = \" \", x = Title)) |>\n knitr::kable(row.names = FALSE)\n\n\n表格 23.2: Brian Ripley 维护的 R 包\n\n\n\n\n\n\n\n\nPackage\nTitle\n\n\n\nboot\nBootstrap Functions (Originally by Angelo Canty for S)\n\n\nclass\nFunctions for Classification\n\n\nfastICA\nFastICA Algorithms to Perform ICA and Projection Pursuit\n\n\ngee\nGeneralized Estimation Equation Solver\n\n\nKernSmooth\nFunctions for Kernel Smoothing Supporting Wand & Jones (1995)\n\n\nMASS\nSupport Functions and Datasets for Venables and Ripley’s MASS\n\n\nmix\nEstimation/Multiple Imputation for Mixed Categorical and Continuous Data\n\n\nnnet\nFeed-Forward Neural Networks and Multinomial Log-Linear Models\n\n\npspline\nPenalized Smoothing Splines\n\n\nRODBC\nODBC Database Access\n\n\nspatial\nFunctions for Kriging and Point Pattern Analysis\n\n\ntree\nClassification and Regression Trees\n\n\n\n\n\n\n\n\n震惊!有一半收录在 R 软件中,所以已经持续维护 20 多年了。下面继续根据邮箱后缀将 RStudio 团队的情况统计出来,结果见下表。\n代码rstudio_dev <- subset(pdb,\n subset = grepl(x = Maintainer, pattern = \"(posit.co)|(rstudio.com)|(yihui.name)\"),\n select = c(\"Package\", \"Maintainer\")\n) |>\n transform(Maintainer = extract_maintainer(Maintainer))\n\nrstudio_dev <- aggregate(data = rstudio_dev, Package ~ Maintainer, FUN = function(x) length(unique(x)))\nrstudio_dev <- rstudio_dev[order(rstudio_dev$Package, decreasing = TRUE), ]\n\nknitr::kable(head(rstudio_dev, ceiling(nrow(rstudio_dev) / 
2)),\n col.names = c(\"团队成员\", \"R 包数量\"), row.names = FALSE\n)\n代码knitr::kable(tail(rstudio_dev, floor(nrow(rstudio_dev) / 2)),\n col.names = c(\"团队成员\", \"R 包数量\"), row.names = FALSE\n)\n\n\n表格 23.3: RStudio 团队开发维护 R 包数量情况(部分)\n\n\n\n\n\n(a) 表\n\n\n\n团队成员\nR 包数量\n\n\n\nHadley Wickham\n48\n\n\nYihui Xie\n22\n\n\nMax Kuhn\n18\n\n\nLionel Henry\n15\n\n\nWinston Chang\n15\n\n\nDaniel Falbel\n13\n\n\nJennifer Bryan\n13\n\n\nDavis Vaughan\n11\n\n\nCarson Sievert\n10\n\n\nTomasz Kalinowski\n8\n\n\nBarret Schloerke\n6\n\n\nThomas Lin Pedersen\n6\n\n\nHannah Frick\n5\n\n\nChristophe Dervieux\n4\n\n\nJoe Cheng\n4\n\n\nJulia Silge\n4\n\n\n\n\n\n\n\n\n\n\n(b) 续表\n\n\n\n团队成员\nR 包数量\n\n\n\nCole Arendt\n3\n\n\nEdgar Ruiz\n3\n\n\nJJ Allaire\n3\n\n\nKevin Kuo\n3\n\n\nKevin Ushey\n3\n\n\nRichard Iannone\n3\n\n\nAron Atkins\n2\n\n\nRomain François\n2\n\n\nYitao Li\n2\n\n\nBrian Smith\n1\n\n\nEmil Hvitfeldt\n1\n\n\nGarrick Aden-Buie\n1\n\n\nJames Blair\n1\n\n\nNathan Stephens\n1\n\n\nNick Strayer\n1\n\n\n\n\n\n\n\n\n\n\n\nCRAN 和 RStudio 团队是 R 语言社区最为熟悉的,其它团队需借助一些网络分析算法挖掘了。", + "text": "23.2 R 语言社区的组织\n除了 RStudio 公司出品的 tidyverse (Wickham 等 2019) 和 tidymodels (Kuhn 和 Wickham 2020),还有一些数据分析、建模的工具箱,如 mlr3verse (Lang 和 Schratz 2023)、easystats (Lüdecke 等 2022)、strengejacke (Lüdecke 2019) 和 DrWhy (Biecek 2023)。也有的组织基本停止了开发,如 Omegahat。还有的被商业公司收购后,不再活跃了,如 Revolution Analytics。它们作为解决方案大都属于一些组织,还有深藏功与名,有待笔者挖掘的。因不存在明显的规律,下面从开发者的邮箱出发,隶属企业、组织往往有统一的邮箱后缀。\n\nstr_extract <- function(text, pattern, ...) 
regmatches(text, regexpr(pattern, text, ...))\n# 移除 ORPHANED\npdb <- subset(pdb, subset = Maintainer != \"ORPHANED\")\n# 抽取邮件后缀\nextract_email_suffix <- function(x) {\n x <- str_extract(text = x, pattern = \"<.*?>\")\n sub(x = x, pattern = \".*?@(.*?)>\", replacement = \"\\\\1\")\n}\npdb$Email_suffix <- extract_email_suffix(pdb$Maintainer)\n\n按组织统计扩展包的数量(总的 R 包数量约 2 万),即各个组织开发的 R 包。\n\npdb_pkg <- aggregate(\n data = pdb, Package ~ Email_suffix, FUN = function(x) { length(unique(x)) }\n)\nhead(pdb_pkg[order(pdb_pkg$Package, decreasing = TRUE), ], 20)\n\n#> Email_suffix Package\n#> 876 gmail.com 6968\n#> 2044 rstudio.com 208\n#> 979 hotmail.com 185\n#> 1825 outlook.com 152\n#> 1971 R-project.org 106\n#> 2 163.com 94\n#> 210 berkeley.edu 91\n#> 2559 umich.edu 91\n#> 2819 uw.edu 74\n#> 1927 protonmail.com 73\n#> 2564 umn.edu 69\n#> 581 debian.org 68\n#> 2951 yahoo.com 68\n#> 1828 outlook.fr 63\n#> 2212 stanford.edu 58\n#> 155 auckland.ac.nz 57\n#> 887 gmx.de 55\n#> 2911 wisc.edu 55\n#> 895 googlemail.com 50\n#> 1970 r-project.org 50\n\n\n不难看出,至少有如下几类:\n\n邮件服务提供商。6968 个 R 包使用 gmail 邮箱作为联系维护者的方式,googlemail.com 也是谷歌提供的服务。hotmail.com 和 outlook.com 都是微软提供的邮箱服务,outlook.fr (法国)也是,除此之外,比较大的邮件服务提供商就是 163.com(网易)、 protonmail.com 和 yahoo.com (雅虎)等。\n商业组织。208 个 R 包来自 RStudio 公司的员工,这些维护者使用 RStudio 公司提供的邮箱。\n开源组织。R-project.org 和 r-project.org 都是 R 语言组织的联系方式,自不必多说,R 语言核心团队成员不仅维护 R 软件源码,还维护了很多 R 包。debian.org 是 Debian 组织的联系方式,都是开源组织(Open Source Org)。\n教育机构。berkeley.edu 、umich.edu 等以 edu 结尾的北美(国)的大学,gmx.de、 posteo.de 等以 de 结尾的德国大学,ucl.ac.uk 等以 uk 结尾的英国的大学,auckland.ac.nz 等以 nz 结尾的新西兰的大学,uwaterloo.ca 等以 ca 结尾的加拿大的大学。\n\n按组织统计开发者的数量(总的开发者数量约 1 万),即各个组织的 R 包开发者。\n\npdb_org <- aggregate(\n data = pdb, Maintainer2 ~ Email_suffix, FUN = function(x) { length(unique(x)) }\n)\nhead(pdb_org[order(pdb_org$Maintainer2, decreasing = TRUE), ], 20)\n\n#> Email_suffix Maintainer2\n#> 876 gmail.com 3800\n#> 979 hotmail.com 110\n#> 1825 outlook.com 87\n#> 2 163.com 57\n#> 2559 umich.edu 54\n#> 2951 
yahoo.com 51\n#> 2564 umn.edu 47\n#> 1927 protonmail.com 46\n#> 2819 uw.edu 46\n#> 887 gmx.de 34\n#> 210 berkeley.edu 33\n#> 2044 rstudio.com 30\n#> 895 googlemail.com 28\n#> 2212 stanford.edu 27\n#> 468 columbia.edu 26\n#> 1114 inrae.fr 26\n#> 2451 ucl.ac.uk 25\n#> 2964 yale.edu 25\n#> 635 duke.edu 23\n#> 1906 posteo.de 23\n\n\n可见,大部分开发者采用邮件服务提供商的邮件地址。3800 个开发者使用来自谷歌的 gmail.com、197 个开发者使用来自微软的 hotmail.com 或 outlook.com,57 个开发者使用来自网易的 163.com,51 个开发者使用来自雅虎的 yahoo.com,46 个开发者使用来自 Proton 的 protonmail.com。\n无论从开发者数量还是 R 包数量的角度看,都有两个显著特点。其一马太效应,往头部集中,其二,长尾分布,尾部占比接近甚至超过 50%。\n\n23.2.1 美国、英国和加拿大\n1666 个开发者来自以 edu 为后缀的邮箱。各个组织(主要是大学)及其 R 包开发者数据如下:\n\nsum(pdb_org[grepl(pattern = \"edu$\", x = pdb_org$Email_suffix), \"Maintainer2\"])\n\n#> [1] 1666\n\npdb_org_edu <- pdb_org[grepl(pattern = \"edu$\", x = pdb_org$Email_suffix), ]\npdb_org_edu[order(pdb_org_edu$Maintainer2, decreasing = TRUE), ] |> head(20)\n\n#> Email_suffix Maintainer2\n#> 2559 umich.edu 54\n#> 2564 umn.edu 47\n#> 2819 uw.edu 46\n#> 210 berkeley.edu 33\n#> 2212 stanford.edu 27\n#> 468 columbia.edu 26\n#> 2964 yale.edu 25\n#> 635 duke.edu 23\n#> 2911 wisc.edu 23\n#> 482 cornell.edu 22\n#> 2444 ucdavis.edu 21\n#> 1929 psu.edu 19\n#> 2449 uchicago.edu 19\n#> 2830 vanderbilt.edu 19\n#> 1660 ncsu.edu 18\n#> 1663 nd.edu 18\n#> 1008 iastate.edu 17\n#> 1919 princeton.edu 17\n#> 1815 osu.edu 16\n#> 2523 uiowa.edu 16\n\n\n好吧,几乎全是美国各个 NB 大学的,比如华盛顿大学( uw.edu)、密歇根大学(umich.edu)、加州伯克利大学(berkeley.edu)等等。顺便一说,美国各个大学的网站,特别是统计院系很厉害的,已经帮大家收集得差不多了,有留学打算的读者自取,邮箱后缀就是学校/院官网。\n有些邮箱后缀带有院系,但是并没有向上合并到学校这一级,比如 stanford.edu 、stat.stanford.edu 和 alumni.stanford.edu 等没有合并统计。实际上,使用 edu 邮箱的教育机构大部份位于美国。有的邮箱来自教育机构,但是不以 edu 结尾,比如新西兰奥克兰大学 auckland.ac.nz 、瑞士苏黎世联邦理工学院 stat.math.ethz.ch 等美国以外的教育机构。下面分别查看英国和加拿大的情况。\n350 个开发者来自以 uk 为后缀的邮箱。各个组织(主要是大学)及其 R 包开发者数据如下:\n\nsum(pdb_org[grepl(pattern = \"uk$\", x = pdb_org$Email_suffix), \"Maintainer2\"])\n\n#> [1] 350\n\npdb_org_uk <- pdb_org[grepl(pattern = \"uk$\", x = pdb_org$Email_suffix), 
]\npdb_org_uk[order(pdb_org_uk$Maintainer2, decreasing = TRUE), ] |> head(20)\n\n#> Email_suffix Maintainer2\n#> 2451 ucl.ac.uk 25\n#> 329 cam.ac.uk 17\n#> 295 bristol.ac.uk 15\n#> 1088 imperial.ac.uk 14\n#> 658 ed.ac.uk 13\n#> 1286 lancaster.ac.uk 11\n#> 1363 lse.ac.uk 9\n#> 1605 mrc-bsu.cam.ac.uk 9\n#> 2878 warwick.ac.uk 9\n#> 870 glasgow.ac.uk 8\n#> 1364 lshtm.ac.uk 8\n#> 1424 manchester.ac.uk 8\n#> 636 durham.ac.uk 7\n#> 744 exeter.ac.uk 7\n#> 2260 statslab.cam.ac.uk 7\n#> 2188 soton.ac.uk 6\n#> 2972 york.ac.uk 6\n#> 978 hotmail.co.uk 5\n#> 1948 qmul.ac.uk 5\n#> 248 bioss.ac.uk 4\n\n\n258 个开发者来自以 ca 为后缀的邮箱。各个组织(主要是大学)及其 R 包开发者数据如下:\n\nsum(pdb_org[grepl(pattern = \"ca$\", x = pdb_org$Email_suffix), \"Maintainer2\"])\n\n#> [1] 258\n\npdb_org_ca <- pdb_org[grepl(pattern = \"ca$\", x = pdb_org$Email_suffix), ]\npdb_org_ca[order(pdb_org_ca$Maintainer2, decreasing = TRUE), ] |> head(10)\n\n#> Email_suffix Maintainer2\n#> 2822 uwaterloo.ca 19\n#> 1397 mail.mcgill.ca 14\n#> 2123 sfu.ca 12\n#> 2801 utoronto.ca 12\n#> 2426 ualberta.ca 11\n#> 2239 stat.ubc.ca 9\n#> 2434 ubc.ca 9\n#> 2813 uvic.ca 8\n#> 952 hec.ca 7\n#> 1416 mail.utoronto.ca 7\n\n\n\n23.2.2 CRAN 和 RStudio\n下面根据邮箱后缀匹配抽取 CRAN 团队及开发的 R 包,规则也许不能覆盖所有的情况,比如署名 CRAN Team 的维护者代表的是 CRAN 团队,XML 和 RCurl 包就由他们维护。再比如,Brian Ripley 的邮箱 ripley@stats.ox.ac.uk 就不是 CRAN 官网域名。读者若有补充,欢迎 PR 给我。\n代码cran_dev <- subset(pdb,\n subset = grepl(\n x = Maintainer,\n pattern = paste0(c(\n \"(@[Rr]-project\\\\.org)\", # 官方邮箱\n \"(ripley@stats.ox.ac.uk)\", # Brian Ripley\n \"(p.murrell@auckland.ac.nz)\", # Paul Murrell\n \"(paul@stat.auckland.ac.nz)\", # Paul Murrell\n \"(maechler@stat.math.ethz.ch)\", # Martin Maechler\n \"(mmaechler+Matrix@gmail.com)\", # Martin Maechler\n \"(bates@stat.wisc.edu)\", # Douglas Bates\n \"(pd.mes@cbs.dk)\", # Peter Dalgaard\n \"(ligges@statistik.tu-dortmund.de)\", # Uwe Ligges\n \"(tlumley@u.washington.edu)\", # Thomas Lumley\n \"(t.lumley@auckland.ac.nz)\", # Thomas Lumley\n \"(martyn.plummer@gmail.com)\", 
# Martyn Plummer\n \"(luke-tierney@uiowa.edu)\", # Luke Tierney\n \"(stefano.iacus@unimi.it)\", # Stefano M. Iacus\n \"(murdoch.duncan@gmail.com)\", # Duncan Murdoch\n \"(michafla@gene.com)\" # Michael Lawrence\n ), collapse = \"|\")\n ),\n select = c(\"Package\", \"Maintainer\")\n) |>\n transform(Maintainer = gsub(\n x = Maintainer, pattern = '(<([^<>]*)>)|(\")', replacement = \"\"\n )) |>\n transform(Maintainer = gsub(\n x = Maintainer, pattern = \"(R-core)|(R Core Team)\", replacement = \"CRAN Team\"\n )) |>\n transform(Maintainer = gsub(\n x = Maintainer,\n pattern = \"(S. M. Iacus)|(Stefano M.Iacus)|(Stefano Maria Iacus)\",\n replacement = \"Stefano M. Iacus\"\n )) |>\n transform(Maintainer = gsub(\n x = Maintainer, pattern = \"(Toby Hocking)\",\n replacement = \"Toby Dylan Hocking\"\n )) |>\n transform(Maintainer = gsub(\n x = Maintainer, pattern = \"(John M Chambers)\", replacement = \"John Chambers\"\n ))\ncran_dev <- aggregate(data = cran_dev, Package ~ Maintainer, FUN = function(x) length(unique(x)))\ncran_dev <- cran_dev[order(cran_dev$Package, decreasing = TRUE), ]\nknitr::kable(head(cran_dev, ceiling(nrow(cran_dev) / 2)),\n col.names = c(\"团队成员\", \"R 包数量\"), row.names = FALSE\n)\n代码knitr::kable(tail(cran_dev, floor(nrow(cran_dev) / 2)),\n col.names = c(\"团队成员\", \"R 包数量\"), row.names = FALSE\n)\n\n\n表格 23.1: CRAN 团队开发维护 R 包数量情况\n\n\n\n\n\n(a) 表\n\n\n\n团队成员\nR 包数量\n\n\n\nKurt Hornik\n28\n\n\nSimon Urbanek\n26\n\n\nAchim Zeileis\n25\n\n\nMartin Maechler\n25\n\n\nTorsten Hothorn\n25\n\n\nPaul Murrell\n19\n\n\nToby Dylan Hocking\n17\n\n\nBrian Ripley\n12\n\n\nThomas Lumley\n12\n\n\nUwe Ligges\n9\n\n\nDuncan Murdoch\n7\n\n\nDavid Meyer\n6\n\n\nCRAN Team\n5\n\n\n\n\n\n\n\n\n\n\n(b) 续表\n\n\n\n团队成员\nR 包数量\n\n\n\nFriedrich Leisch\n5\n\n\nLuke Tierney\n5\n\n\nMichael Lawrence\n5\n\n\nStefan Theussl\n5\n\n\nBettina Grün\n3\n\n\nJohn Chambers\n3\n\n\nSimon Wood\n3\n\n\nBettina Gruen\n2\n\n\nDeepayan Sarkar\n2\n\n\nDouglas Bates\n2\n\n\nMartyn 
Plummer\n2\n\n\nPeter Dalgaard\n1\n\n\n\n\n\n\n\n\n\n\n\nKurt Hornik、Simon Urbanek、Achim Zeileis 等真是高产呐!除了维护 R 语言核心代码,还开发维护了那么多 R 包。以 Brian Ripley 为例,看看他都具体维护了哪些 R 包。\n\n代码subset(pdb,\n subset = grepl(x = Maintainer, pattern = \"Brian Ripley\"),\n select = c(\"Package\", \"Title\"), drop = TRUE\n) |>\n unique(by = \"Package\") |>\n transform(Title = gsub(pattern = \"(\\\\\\n)\", replacement = \" \", x = Title)) |>\n knitr::kable(row.names = FALSE)\n\n\n表格 23.2: Brian Ripley 维护的 R 包\n\n\n\n\n\n\n\n\nPackage\nTitle\n\n\n\nboot\nBootstrap Functions (Originally by Angelo Canty for S)\n\n\nclass\nFunctions for Classification\n\n\nfastICA\nFastICA Algorithms to Perform ICA and Projection Pursuit\n\n\ngee\nGeneralized Estimation Equation Solver\n\n\nKernSmooth\nFunctions for Kernel Smoothing Supporting Wand & Jones (1995)\n\n\nMASS\nSupport Functions and Datasets for Venables and Ripley’s MASS\n\n\nmix\nEstimation/Multiple Imputation for Mixed Categorical and Continuous Data\n\n\nnnet\nFeed-Forward Neural Networks and Multinomial Log-Linear Models\n\n\npspline\nPenalized Smoothing Splines\n\n\nRODBC\nODBC Database Access\n\n\nspatial\nFunctions for Kriging and Point Pattern Analysis\n\n\ntree\nClassification and Regression Trees\n\n\n\n\n\n\n\n\n震惊!有一半收录在 R 软件中,所以已经持续维护 20 多年了。下面继续根据邮箱后缀将 RStudio 团队的情况统计出来,结果见下表。\n代码rstudio_dev <- subset(pdb,\n subset = grepl(x = Maintainer, pattern = \"(posit.co)|(rstudio.com)|(yihui.name)\"),\n select = c(\"Package\", \"Maintainer\")\n) |>\n transform(Maintainer = extract_maintainer(Maintainer))\nrstudio_dev <- aggregate(data = rstudio_dev, Package ~ Maintainer, FUN = function(x) length(unique(x)))\nrstudio_dev <- rstudio_dev[order(rstudio_dev$Package, decreasing = TRUE), ]\nknitr::kable(head(rstudio_dev, ceiling(nrow(rstudio_dev) / 2)),\n col.names = c(\"团队成员\", \"R 包数量\"), row.names = FALSE\n)\n代码knitr::kable(tail(rstudio_dev, floor(nrow(rstudio_dev) / 2)),\n col.names = c(\"团队成员\", \"R 包数量\"), row.names = FALSE\n)\n\n\n表格 23.3: 
RStudio 团队开发维护 R 包数量情况(部分)\n\n\n\n\n\n(a) 表\n\n\n\n团队成员\nR 包数量\n\n\n\nHadley Wickham\n48\n\n\nYihui Xie\n22\n\n\nMax Kuhn\n18\n\n\nLionel Henry\n15\n\n\nWinston Chang\n15\n\n\nDaniel Falbel\n13\n\n\nJennifer Bryan\n13\n\n\nDavis Vaughan\n11\n\n\nCarson Sievert\n10\n\n\nTomasz Kalinowski\n8\n\n\nBarret Schloerke\n6\n\n\nThomas Lin Pedersen\n6\n\n\nHannah Frick\n5\n\n\nChristophe Dervieux\n4\n\n\nJoe Cheng\n4\n\n\nJulia Silge\n4\n\n\n\n\n\n\n\n\n\n\n(b) 续表\n\n\n\n团队成员\nR 包数量\n\n\n\nCole Arendt\n3\n\n\nEdgar Ruiz\n3\n\n\nJJ Allaire\n3\n\n\nKevin Kuo\n3\n\n\nKevin Ushey\n3\n\n\nRichard Iannone\n3\n\n\nAron Atkins\n2\n\n\nRomain François\n2\n\n\nYitao Li\n2\n\n\nBrian Smith\n1\n\n\nEmil Hvitfeldt\n1\n\n\nGarrick Aden-Buie\n1\n\n\nJames Blair\n1\n\n\nNathan Stephens\n1\n\n\nNick Strayer\n1\n\n\n\n\n\n\n\n\n\n\n\nCRAN 和 RStudio 团队是 R 语言社区最为熟悉的,其它团队需借助一些网络分析算法挖掘了。", "crumbs": [ "数据建模", "23  网络数据分析" @@ -1597,7 +1597,7 @@ "href": "analyze-network-data.html#sec-community-developer", "title": "23  网络数据分析", "section": "\n23.3 R 语言社区的开发者", - "text": "23.3 R 语言社区的开发者\n\n23.3.1 最高产的开发者\n继续基于数据集 pdb ,将维护 R 包数量比较多的开发者统计出来。\n\n代码pdb_ctb <- aggregate(data = pdb, Package ~ Maintainer2, FUN = length)\nggplot(data = pdb_ctb[pdb_ctb$Package >= 20, ]) +\n geom_col(aes(x = Package, y = reorder(Maintainer2, Package)), width = .1) +\n theme_classic() +\n labs(x = \"R 包数量\", y = \"开发者\")\n\n\n\n\n\n\n图 23.3: 高产的 R 包开发者\n\n\n\n\n这些开发者的主页和主要的 R 社区贡献如下:\n\n\nDirk Eddelbuettel 维护了 Rcpp、 RcppEigen 等流行的 R 包,通过 Rcpp 包将很多优秀的 C++ 库引入 R 语言社区。\n\nStéphane Laurent 维护了很多与 shiny 、htmlwidgets 相关的 R 包,比如 rAmCharts4 包。\n\nGábor Csárdi 维护了 igraph 包以及大量帮助 R 包开发的基础设施,RStudio 雇员。\n\nHadley Wickham 维护了 ggplot2、dplyr、devtools 等流行的 R 包,RStudio 雇员。\n\nJeroen Ooms 维护了 magick、 curl 以及大量帮助 R 包开发的基础设施。\n\nScott Chamberlain 维护了很多与 HTTP/Web 相关的 R 包,rOpenSci 联合创始人。\n\nRobin K. S. 
Hankin 维护了很多与贝叶斯、多元统计相关的 R 包。\n\nHenrik Bengtsson 维护了 future 和 parallelly 等流行的 R 包,在并行计算方面有很多贡献。\nJan Wijffels 维护了很多与自然语言处理、图像识别相关的 R 包,比如 udpipe 、BTM 和 word2vec 等包,Bnosac 团队成员。\nKurt Hornik 参与维护 R 软件代码并许多与自然语言处理相关的 R 包,CRAN 核心团队成员。\nMartin Maechler 维护了 Matrix 包,CRAN 核心团队成员。\n\nMax Kuhn 维护了 tidymodels 等包,RStudio 雇员。\n\nBob Rudis 维护了一些与 ggplot2 相关的 R 包,如 ggalt、hrbrthemes 和 statebins 等。\nKartikeya Bolar 维护了很多统计与 shiny 结合的 R 包,比如方差分析、逻辑回归、列联表、聚类分析等。\n\nKirill Müller 维护了 DBI 等大量与数据库连接的 R 包。\nShannon T. Holloway 维护了许多与生存分析相关的 R 包。\n\nSimon Urbanek 维护了 rJava、Rserve 等流行的 R 包,CRAN 核心团队成员,负责维护 R 软件中与 MacOS 平台相关的部分。\n\nAchim Zeileis 维护了 colorspace 等流行的 R 包,CRAN 核心团队成员。\nMuhammad Yaseen 维护了多个与 Multiple Indicator Cluster Survey 相关的 R 包。\nPablo Sanchez 维护了多个与市场营销平台连接的 R 语言接口,Windsor.ai 组织成员。\n\nThomas Lin Pedersen 维护了 patchwork、 gganimate 和 ggraph 等流行的 R 包,RStudio 雇员。\nTorsten Hothorn 在统计检验方面贡献了不少内容,比如 coin 和 multcomp 等包,CRAN 核心团队成员。\n\nRichard Cotton 维护了 assertive 和 rebus 系列 R 包,代码可读性检查。\nFlorian Schwendinger 维护了大量运筹优化方面的 R 包,扩展了 ROI 包的能力。\n\nGuangchuang Yu 维护了 ggtree 和 ggimage 等 R 包,在生物信息和可视化领域有不少贡献。\n\nWinston Chang 维护了 shiny 等流行的 R 包,RStudio 雇员。\n\nJohn Muschelli 维护了多个关于神经图像的 R 包。\nKevin R. Coombes 维护了多个关于生物信息的 R 包,如 oompaBase 和 oompaData 等。\n\nYihui Xie 维护了 knitr 、rmarkdown 等流行的 R 包,RStudio 雇员。\nCarl Boettiger 维护了多个接口包,比如 rfishbase 等,rOpenSci 团队成员。\n\nMichael D. Sumner 维护了多个空间统计相关的 R 包。\n\nEmil Hvitfeldt 维护了多个统计学习相关的 R 包,如 fastTextR 包等,RStudio 雇员。\n\nGeorgi N. 
Boshnakov 维护了多个金融时间序列相关的 R 包,如 fGarch、timeDate 和 timeSeries 等包。\n\nHana Sevcikova 维护了多个与贝叶斯人口统计相关的 R 包。\nJoe Thorley 维护了多个与贝叶斯 MCMC 相关的 R 包,Poisson Consulting 雇员。\n\n统计开发者数量随维护 R 包数量的分布,发现,开发 1 个 R 包的开发者有 6732 人,开发 2 个 R 包的开发者有 1685 人,第二名是第一名的五分之一,递减规律非常符合指数分布。\n\ntable(pdb_ctb$Package)\n\n#> \n#> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 \n#> 6732 1685 725 328 177 82 80 52 37 37 29 15 18 8 11 7 \n#> 17 18 19 20 21 22 23 24 25 26 27 28 31 32 33 52 \n#> 1 3 4 4 2 3 3 1 5 5 2 1 1 1 1 3 \n#> 58 63 69 \n#> 1 1 1\n\n\n过滤掉非常高产的开发者,可以发现变化规律服从幂律分布。\nggplot(data = pdb_ctb, aes(x = Package)) +\n geom_histogram(binwidth = 1) +\n theme_classic() +\n labs(x = \"R 包数量\", y = \"开发者\")\nggplot(data = pdb_ctb[pdb_ctb$Package <= 20, ], aes(x = Package)) +\n geom_histogram(binwidth = 1, fill = NA, color = \"gray20\") +\n scale_y_log10() +\n theme_classic() +\n labs(x = \"R 包数量\", y = \"开发者\")\n\n\n\n\n\n\n\n\n\n(a) 直方图\n\n\n\n\n\n\n\n\n\n(b) 直方图(对数尺度)\n\n\n\n\n\n\n图 23.4: 开发者数量的分布\n\n\n最高产 Top 1% 的开发者 131 人(开发 R 包超过 10 个的开发者)贡献了 2329 / 18976 = 12.3% 的扩展包 ,高产的是商业公司、开源组织、大学机构。\n\ndim(pdb_ctb[pdb_ctb$Package > 10, ])\n\n#> [1] 131 2\n\nsum(pdb_ctb[pdb_ctb$Package > 10, \"Package\"])\n\n#> [1] 2329\n\n\n最低产 Bottom 的开发者 6732 人(仅开发一个 R 包的开发者)占总开发者的比例 6732 / 10067 = 66.87%, 贡献了 6732 / 18976 = 35.5 % 的扩展包 ,低产的人是主体。\n\n23.3.2 开发者协作关系\n如果一个开发者维护了一个 R 包,就成为维护者。一个 R 包有唯一的一个维护者,可能有一个至多个贡献者,这样,维护者和贡献者之间就形成了有向关系,贡献者可能又是另一个 R 包的维护者,也可能不是。不仅有向而且可能存在环。在一个 R 包中,A 是 B 的贡献者,而在另一个 R 包中,B 是 A 的贡献者,A 和 B 之间可能通过多个 R 包存在多次互相协作关系,这也表明 A 和 B 之间的关系密切。有向环的节点可能有 2 个以上,一个人可能同时属于多个环。\n维护者 A 接受来自多个开发者的贡献,接受次数(所有贡献者人数的累和,A 的每个 R 包的贡献者人数相加)视为 A 的入度。维护者 A 作为开发者给多个维护者贡献,贡献次数(作为开发者给其它 R 包做贡献的次数,向外参与贡献的 R 包数目)视为 A 的出度。注意,A 作为维护者,必然包含 A 作为开发者,忽略 A 到 A 的贡献,只考虑贡献/协作关系。\n\n# 过滤重复和缺失的记录\npdb <- subset(\n x = pdb, subset = !duplicated(Package) & !is.na(`Authors@R`),\n select = c(\"Package\", \"Maintainer\", \"Authors@R\")\n)\n# 提取维护者的名字\npdb$Maintainer <- extract_maintainer(pdb$Maintainer)\n\n有些包的元数据中没有 Authors@R 字段,有可能是没有贡献者,比如 
mgcv 包、gam 包等,但也有可能是有贡献者,只是维护者没有填写这个字段,比如 Rcpp 包、RcppEigen 包等,因此将这些先过滤出来。总之,本文是以 Authors@R 字段作为贡献者的来源,共计 12503 个 R 包含有 Authors@R ,有 6000+ 个 R 包没有该字段,缺失约占 R 包总数的 1/3,在不那么考虑准确性的情况下,也可以使用。Author 字段是一段没有结构的文本,相比于 Author 字段,Authors@R 字段是以 R 语言中的 person 类型为存储结构的,比较规范,因此,提取贡献者的操作比较方便。作为示例,下面提取 Matrix 包的贡献者。\n\ntmp <- eval(parse(text = pdb[pdb$Package == \"Matrix\", \"Authors@R\"]))\ntmp <- unlist(lapply(tmp, function(x) format(x, include = c(\"given\", \"family\"))))\n# 返回一个整洁的数据框\ntmp <- data.frame(Package = \"Matrix\", Maintainer = pdb[pdb$Package == \"Matrix\", \"Maintainer\"], Authors = tmp)\n# 去掉 Authors 是 Maintainer 的记录\nsubset(tmp, subset = Maintainer != Authors)\n\n#> Package Maintainer Authors\n#> 1 Matrix Martin Maechler Douglas Bates\n#> 3 Matrix Martin Maechler Mikael Jagan\n#> 4 Matrix Martin Maechler Timothy A. Davis\n#> 5 Matrix Martin Maechler Jens Oehlschlägel\n#> 6 Matrix Martin Maechler Jason Riedy\n#> 7 Matrix Martin Maechler R Core Team\n\n\n数据框包含 R 包(Package 字段)、及其维护者(Maintainer 字段)和贡献者(Authors 字段)。将上述过程写成一个函数,接着,将所有 R 包的贡献者提取出来,形成一个大的数据框。\n\nextract_authors <- function(pkg) {\n sub_pdb <- pdb[pdb$Package == pkg, ]\n tmp <- eval(parse(text = sub_pdb[, \"Authors@R\"]))\n tmp <- unlist(lapply(tmp, function(x) format(x, include = c(\"given\", \"family\"))))\n tmp <- data.frame(Package = pkg, Maintainer = sub_pdb[, \"Maintainer\"], Authors = tmp)\n subset(tmp, subset = Maintainer != Authors)\n}\nextract_authors(\"Matrix\")\n\n#> Package Maintainer Authors\n#> 1 Matrix Martin Maechler Douglas Bates\n#> 3 Matrix Martin Maechler Mikael Jagan\n#> 4 Matrix Martin Maechler Timothy A. 
Davis\n#> 5 Matrix Martin Maechler Jens Oehlschlägel\n#> 6 Matrix Martin Maechler Jason Riedy\n#> 7 Matrix Martin Maechler R Core Team\n\n# lapply(c(\"Matrix\", \"gt\"), extract_authors)\n# 抽取所有 R 包的贡献者,运行需要1-2分钟时间\npdb_authors_list <- lapply(pdb[, \"Package\"], extract_authors)\n# 合并列表\npdb_authors_dt <- data.table::rbindlist(pdb_authors_list)\n\n最后整理出来的大数据框 pdb_authors_dt 含有近 26000 条记录,即边的规模大小。考虑到有些维护者和贡献者之间可能存在多次合作的情况,下面统计一下合作次数。\n\npdb_authors_dt[ ,.(cnt = length(Package)) , by = c(\"Maintainer\", \"Authors\")][cnt >= 10, ][order(cnt, decreasing = T), ]\n\n#> Maintainer Authors cnt\n#> <char> <char> <int>\n#> 1: Hadley Wickham RStudio 36\n#> 2: Pablo Sanchez Windsor.ai 25\n#> 3: Jan Wijffels BNOSAC 24\n#> 4: Gábor Csárdi RStudio 19\n#> 5: Hong Ooi Microsoft 16\n#> 6: Max Kuhn RStudio 14\n#> 7: Lionel Henry RStudio 14\n#> 8: Robrecht Cannoodt Wouter Saelens 13\n#> 9: Scott Chamberlain rOpenSci 13\n#> 10: Joe Thorley Poisson Consulting 13\n#> 11: Frederic Bertrand Myriam Maumy-Bertrand 12\n#> 12: Winston Chang RStudio 12\n#> 13: Daniel Falbel RStudio 12\n#> 14: David Kretch Adam Banker 12\n#> 15: David Kretch Amazon.com, Inc. 
12\n#> 16: Victor Perrier Fanny Meyer 11\n#> 17: Jennifer Bryan RStudio 11\n#> 18: William Michael Landau Eli Lilly and Company 11\n#> 19: Adrian Baddeley Ege Rubak 11\n#> 20: Gábor Csárdi Jim Hester 10\n#> 21: Kirill Müller RStudio 10\n#> 22: Carson Sievert RStudio 10\n#> 23: Thomas Lin Pedersen RStudio 10\n#> 24: Lionel Henry Hadley Wickham 10\n#> 25: Adrian Baddeley Rolf Turner 10\n#> Maintainer Authors cnt\n\n\nAuthors 字段出现了不少组织的名字,这是因为有许多 R 包的维护者受雇于该组织,版权归属于该组织,组织不仅提供持续的资金,而且还提供其它帮助。以 dplyr 包为例,Hadley Wickham 受雇于 RStudio 公司,在 dplyr 包的元数据中,字段 Authors@R 中 RStudio 的角色是 cph 和 fnd ,即版权所有和资金支持。角色 cre 就是维护者,负责与 CRAN 团队的沟通。角色 aut 就是对 R 包有实质贡献的人。\n\nformat(eval(parse(text = pdb[pdb$Package == \"dplyr\", \"Authors@R\"])),\n include = c(\"given\", \"family\", \"role\"))\n\n#> [1] \"Hadley Wickham [aut, cre]\" \"Romain François [aut]\" \n#> [3] \"Lionel Henry [aut]\" \"Kirill Müller [aut]\" \n#> [5] \"RStudio [cph, fnd]\"\n\n\n此外,同属于一个组织的维护者之间常常合作紧密,从上面的结果可以看到,Gábor Csárdi 和 Jim Hester ,Lionel Henry 和 Hadley Wickham,Carson Sievert 和 Joe Cheng ,Jennifer Bryan 和 Hadley Wickham 等同属于 RStudio 公司,常常协作开发项目。对 RStudio、CRAN Team 和 rOpenSci 不再赘述,下面对排名靠前的其它组织略作说明。\n\n\nWindsor.ai 提供一系列可以连接各大营销平台,获取营销效果数据 R 包。\n\nBNOSAC 提供一系列计算机视觉、图像识别、自然语言处理方面的 R 包,比如 udpipe、word2vec、doc2vec 等包。\nMicrosoft 提供一系列连接和操作 Azure 云套件的 R 包,比如 AzureR 包。\n\nWouter Saelens 提供一系列单细胞轨迹推理(single-cell trajectory inference)相关的 R 包,形成一个 dynverse 家族。\n\n\nPoisson Consulting 提供一系列用于计算生物学和统计生态学的 R 包和相关研究论文。\n\nAmazon.com, Inc. 
提供一系列用于存储、管理、操作等 Amazon 云服务的 R 包,形成一个 paws 套件。\nEli Lilly and Company 可能是 rOpenSci 的一员,赞助了旗下的 targets 和 jagstargets 等 R 包。\n\n最后,统计协作次数的分布,网络中边的权重的分布。\n\npdb_authors_net <- pdb_authors_dt[, .(cnt = .N), by = c(\"Maintainer\", \"Authors\")]\ntable(pdb_authors_net$cnt)\n\n#> \n#> 1 2 3 4 5 6 7 8 9 10 11 12 13 \n#> 20432 1511 365 121 44 28 14 8 3 6 4 5 3 \n#> 14 16 19 24 25 36 \n#> 2 1 1 1 1 1\n\n\n可以发现,绝大多数人之间协作只有一次。\n\n23.3.3 节点出入度分布\n下面简化这个网络,仅考虑贡献者也是维护者的情况,就是说网络中所有节点既是维护者也是贡献者,这会过滤掉组织机构、大量没有在 CRAN 发过 R 包的贡献者、从没给其它维护者做贡献的维护者。简化后,网络节点的出度、入度的分布图如下。\n# Maintainer 的入度\npdb_authors_net_indegree <- pdb_authors_dt[Authors %in% Maintainer, ][, .(in_degree = length(Authors)), by = \"Maintainer\"]\n# Authors 的出度\npdb_authors_net_outdegree <- pdb_authors_dt[Authors %in% Maintainer, ][, .(out_degree = length(Maintainer)), by = \"Authors\"]\n\nggplot(pdb_authors_net_indegree, aes(x = in_degree)) +\n geom_histogram(binwidth = 1) +\n geom_freqpoly(binwidth = 1) +\n theme_classic()\nggplot(pdb_authors_net_outdegree, aes(x = out_degree)) +\n geom_histogram(binwidth = 1) +\n geom_freqpoly(binwidth = 1) +\n theme_classic()\n\n\n\n\n\n\n\n\n\n(a) 入度的分布\n\n\n\n\n\n\n\n\n\n(b) 出度的分布\n\n\n\n\n\n\n图 23.5: 节点的入度和出度的分布\n\n\n\n23.3.4 可视化协作网络\n节点的大小以维护者维护的 R 包数量来表示,边的大小以维护者之间协作次数来表示。为了美观起见,更为了突出重点,仅保留协作次数大于 1 的边。\n\n# 边\npdb_authors_net_edge <- pdb_authors_dt[Authors %in% Maintainer, ][, .(edge_cnt = .N), by = c(\"Authors\", \"Maintainer\")][edge_cnt > 1,]\npdb_authors_net_edge[order(edge_cnt, decreasing = TRUE),]\n\n#> Authors Maintainer edge_cnt\n#> <char> <char> <int>\n#> 1: Jim Hester Gábor Csárdi 10\n#> 2: Hadley Wickham Lionel Henry 10\n#> 3: Joe Cheng Carson Sievert 9\n#> 4: Hadley Wickham Jennifer Bryan 8\n#> 5: Steven Andrew Culpepper James Joseph Balamuta 8\n#> --- \n#> 526: Aaron Wolen Scott Chamberlain 2\n#> 527: Bob Rudis Simon Garnier 2\n#> 528: Marco Sciaini Simon Garnier 2\n#> 529: Carlos Morales Martin Chan 2\n#> 530: Md Yeasin Ranjit Kumar Paul 2\n\n# 
顶点\npdb_authors_net_vertex <- pdb_authors_dt[, .(vertex_cnt = length(unique(Package))), by = \"Maintainer\"][Maintainer %in% c(pdb_authors_net_edge$Maintainer, pdb_authors_net_edge$Authors),]\npdb_authors_net_vertex[order(vertex_cnt, decreasing = TRUE),]\n\n#> Maintainer vertex_cnt\n#> <char> <int>\n#> 1: Hadley Wickham 43\n#> 2: Gábor Csárdi 33\n#> 3: Jeroen Ooms 28\n#> 4: Scott Chamberlain 28\n#> 5: Yihui Xie 21\n#> --- \n#> 579: Katriona Goldmann 1\n#> 580: Carlo Pacioni 1\n#> 581: Michael Scholz 1\n#> 582: Javier Roca-Pardinas 1\n#> 583: Xianying Tan 1\n\n\n这是一个有向图,其各个字段含义如下。\n\nMaintainer 维护者(代表流 to)\nAuthors 贡献者(代表源 from)\n\nedge_cnt 边的大小表示维护者 Maintainer 和贡献者 Authors 的协作次数\n\nvertex_cnt 顶点大小表示维护者 Maintainer 维护的 R 包数量\n\n下面先考虑用 igraph 包可视化这个复杂的有向带权网络。pdb_authors_net_edge 和 pdb_authors_net_vertex 都是数据框,首先调用 igraph 包的函数 graph_from_data_frame() 将其转化为网络类型 igraph ,然后使用函数 plot() 绘制网络图。\n\n代码# 构造图\nlibrary(igraph)\npdb_authors_graph <- graph_from_data_frame(d = pdb_authors_net_edge, vertices = pdb_authors_net_vertex, directed = TRUE)\n# 可视化\nop <- par(mar = rep(0, 4))\nplot(pdb_authors_graph,\n edge.width = (E(pdb_authors_graph)$edge_cnt) / 2,\n edge.arrow.size = .01,\n edge.curved = .1,\n layout = layout.kamada.kawai,\n vertex.size = (V(pdb_authors_graph)$vertex_cnt) / 8,\n vertex.label.cex = sqrt(V(pdb_authors_graph)$vertex_cnt) / 8\n)\non.exit(par(op), add = TRUE)\n\n\n\n\n\n\n图 23.6: 开发者的协作关系网络\n\n\n\n\n协作关系弱的开发者占大部分,构成一个「月亮」的造型,其中,不乏维护多个 R 包的开发者,这些人要么单干,要么在专业小领域、小组织内协作。与之相对应的是协作关系较强的开发者,人数虽少,影响力却大,构成一个「太阳」的造型。协作得多往往意味着维护的 R 包也不少,甚至同属于一个组织,因此,高产的开发者、影响力大的组织聚集在一起,如 R Core Team、RStudio、rOpenSci 等。\n\neb <- cluster_edge_betweenness(pdb_authors_graph)\neb\n\n#> IGRAPH clustering edge betweenness, groups: 181, mod: 0.88\n#> + groups:\n#> $`1`\n#> [1] \"Matt Nunes\" \"Daniel Grose\" \"Guy Nason\" \n#> [4] \"Rebecca Killick\" \"Idris Eckley\" \"Alessandro Cardinali\"\n#> \n#> $`2`\n#> [1] \"Jin Zhu\" \"Shiyun Lin\"\n#> \n#> $`3`\n#> [1] \"Julio Trecenti\" \"Henrik 
Bengtsson\" \"Morgane Pierre-Jean\" \n#> [4] \"Zhian N. Kamvar\" \"Pierre Neuvial\" \"Michal Bojanowski\" \n#> + ... omitted several groups/vertices\n\n\nigraph 包提供多种社区探测的算法,上面简单使用函数 cluster_edge_betweenness() 来探测,结果显示有 181 个社区。社区 1 包含的成员如下:\n\neb$names[eb$membership == 1]\n\n#> [1] \"Matt Nunes\" \"Daniel Grose\" \"Guy Nason\" \n#> [4] \"Rebecca Killick\" \"Idris Eckley\" \"Alessandro Cardinali\"\n\n\n社区 3、14、21、34、46、52、75 的成员是比较多的。其中,社区 3 是以 RStudio 为核心的大社区,社区 14 是以 CRAN 为核心的大社区。\n\n# RStudio 为核心的大社区\neb$names[eb$membership == 3]\n\n#> [1] \"Julio Trecenti\" \"Henrik Bengtsson\" \"Morgane Pierre-Jean\" \n#> [4] \"Zhian N. Kamvar\" \"Pierre Neuvial\" \"Michal Bojanowski\" \n#> [7] \"Ian Lyttle\" \"Thomas Lin Pedersen\" \"Yihui Xie\" \n#> [10] \"Dirk Schumacher\" \"Jeroen Ooms\" \"Gábor Csárdi\" \n#> [13] \"Sean Kross\" \"Carl Boettiger\" \"Neal Richardson\" \n#> [16] \"Ryan Hafen\" \"Matthew Fidler\" \"Hadley Wickham\" \n#> [19] \"Mark Edmondson\" \"Kirill Müller\" \"Richard Iannone\" \n#> [22] \"Carson Sievert\" \"Winston Chang\" \"Lionel Henry\" \n#> [25] \"Jennifer Bryan\" \"Michael Sumner\" \"Scott Chamberlain\" \n#> [28] \"Garrick Aden-Buie\" \"Daniel Falbel\" \"Matthew B. 
Jones\" \n#> [31] \"Hiroaki Yutani\" \"Taiyun Wei\" \"Jim Hester\" \n#> [34] \"Romain François\" \"Greg Freedman Ellis\" \"Rhian Davies\" \n#> [37] \"Bryce Mecum\" \"Steph Locke\" \"Christophe Dervieux\" \n#> [40] \"Jonathan Keane\" \"Thibaut Jombart\" \"Dewey Dunnington\" \n#> [43] \"Anne Cori\" \"Bill Denney\" \"Jared Huling\" \n#> [46] \"Wush Wu\" \"Atsushi Yasumoto\" \"Barret Schloerke\" \n#> [49] \"Yuan Tang\" \"Duncan Garmonsway\" \"Edzer Pebesma\" \n#> [52] \"Sebastian Meyer\" \"Derek Burk\" \"Tim Taylor\" \n#> [55] \"Alicia Schep\" \"Tomasz Kalinowski\" \"Michael Rustler\" \n#> [58] \"Joe Cheng\" \"Bhaskar Karambelkar\" \"Sebastian Kreutzer\" \n#> [61] \"JJ Allaire\" \"JooYoung Seo\" \"Zachary Foster\" \n#> [64] \"Malcolm Barrett\" \"Aaron Wolen\" \"Bruno Tremblay\" \n#> [67] \"Justin Wilkins\" \"Yixuan Qiu\" \"Johannes Friedrich\" \n#> [70] \"Kevin Ushey\" \"Steven M. Mortimer\" \"Karthik Ram\" \n#> [73] \"Jorrit Poelen\" \"Maëlle Salmon\" \"Aron Atkins\" \n#> [76] \"Ramnath Vaidyanathan\" \"Thomas Leeper\" \"Dirk Eddelbuettel\" \n#> [79] \"Xianying Tan\"\n\n# CRAN 为核心的大社区\neb$names[eb$membership == 14]\n\n#> [1] \"Achim Zeileis\" \"Michael Hahsler\" \"Michel Lang\" \n#> [4] \"Nikolaus Umlauf\" \"Vincent Dorie\" \"Bettina Gruen\" \n#> [7] \"Bernd Bischl\" \"Ben Bolker\" \"Marc Becker\" \n#> [10] \"Friedrich Leisch\" \"Brian Ripley\" \"Michael Friendly\" \n#> [13] \"John Fox\" \"Kurt Hornik\" \"Patrick Schratz\" \n#> [16] \"Volodymyr Melnykov\" \"Martin Maechler\" \"George Ostrouchov\" \n#> [19] \"Drew Schmidt\" \"Georgi N. 
Boshnakov\" \"Wei-Chen Chen\" \n#> [22] \"Stefan Theussl\" \"David Meyer\" \"Jakob Bossek\" \n#> [25] \"Francois Michonneau\" \"Marius Hofert\" \"Florian Schwendinger\"\n#> [28] \"Felix Zimmer\" \"Martin Binder\" \"Phil Chalmers\" \n#> [31] \"Lukas Sablica\" \"Sebastian Fischer\" \"Lennart Schneider\" \n#> [34] \"Jakob Richter\" \"Florian Wickelmaier\" \"Rudolf Debelak\" \n#> [37] \"Duncan Murdoch\" \"Alexander Brenning\" \"Ingo Feinerer\"\n\n\n同时,在 RStudio 这个大社区下,有一些与之紧密相关的小社区,比如 Rob Hyndman 等人的时间序列社区、Roger Bivand 等人的空间统计社区。\n\n# 时间序列 Rob Hyndman\neb$names[eb$membership == 52]\n\n#> [1] \"Asael Alonzo Matamoros\" \"Nicholas Tierney\" \n#> [3] \"Sevvandi Kandanaarachchi\" \"Rob Hyndman\" \n#> [5] \"Di Cook\" \"Mitchell O'Hara-Wild\" \n#> [7] \"Han Lin Shang\" \"Sayani Gupta\" \n#> [9] \"Earo Wang\" \"Christoph Bergmeir\"\n\n# 空间统计 Roger Bivand\neb$names[eb$membership == 75]\n\n#> [1] \"Sebastian Jeworutzki\" \"Roger Bivand\" \"Colin Rundel\" \n#> [4] \"Angela Li\" \"Gianfranco Piras\" \"Patrick Giraudoux\" \n#> [7] \"Giovanni Millo\"\n\n\n结合前面的 图 23.6 ,知道有很多小圈圈,这些放一边,重点关注那些大的圈圈,见下图。\n\n代码op <- par(mar = rep(0, 4))\nplot(eb, pdb_authors_graph,\n edge.width = (E(pdb_authors_graph)$edge_cnt) / 4,\n edge.arrow.size = .01,\n edge.curved = .1,\n layout = layout.kamada.kawai,\n vertex.size = (V(pdb_authors_graph)$vertex_cnt) / 8,\n vertex.label.cex = sqrt(V(pdb_authors_graph)$vertex_cnt) / 8\n)\non.exit(par(op), add = TRUE)\n\n\n\n\n\n\n图 23.7: 探测协作关系网络中的社区\n\n\n\n\n下面使用 tidygraph 包构造图数据、计算节点中心度,dplyr 包操作数据。中心度代表节点(开发者)的影响力(或者重要性)。最后,借助 ggraph 包绘制维护者之间的贡献网络,节点的大小代表维护者影响力的强弱。\n\n代码pdb_authors_g <- tidygraph::as_tbl_graph(pdb_authors_net_edge, directed = T) |> \n dplyr::mutate(Popularity = tidygraph::centrality_degree(mode = 'in'))\nlibrary(ggraph)\nggraph(pdb_authors_g, layout = \"kk\") +\n geom_edge_fan(aes(alpha = after_stat(index)), show.legend = FALSE) +\n geom_node_point(aes(size = Popularity), alpha = 0.5) +\n theme_graph(base_family = \"sans\")\n\n\n\n\n\n\n图 23.8: 
开发者的影响力网络\n\n\n\n\n前面两个网络图基于同一份数据、同样的网络布局算法,得到非常类似的结果。静态图上的标签相互重叠,影响细节的观察和探索,比如连接 CRAN 和 RStudio 两大阵营的通道。下面使用 visNetwork 包制作交互式网络图形,它是 JS 库 vis-network 的 R 语言接口, 使用 visNetwork 包绘制交互式网络图后,可以在图上使用鼠标放大、拖拽。可以发现在 CRAN 社区的 Achim Zeileis 和 RStudio 社区的 Max Kuhn 之间是由 Andri Signorell 牵线搭桥。此外,读者若有兴趣,可以使用 Richard Iannone 开发的 DiagrammeR 包制作静态的矢量网页图形。\n\n代码library(visNetwork)\n# 将 igraph 对象转为 visNetwork 包可用的数据\ndat <- toVisNetworkData(pdb_authors_graph)\nnodes_df <- dat$nodes\nnodes_df$value <- nodes_df$vertex_cnt\nedges_df <- dat$edges\nedges_df$value <- edges_df$edge_cnt\n# 输入节点和边的数据\nvisNetwork(nodes = nodes_df, edges = edges_df, height = \"600px\") |> \n visIgraphLayout(randomSeed = 20232023, layout = \"layout.kamada.kawai\")\n\n\n\n\n\n\n图 23.9: 开发者的影响力网络(visNetwork)", + "text": "23.3 R 语言社区的开发者\n\n23.3.1 最高产的开发者\n继续基于数据集 pdb ,将维护 R 包数量比较多的开发者统计出来。\n\n代码pdb_ctb <- aggregate(data = pdb, Package ~ Maintainer2, FUN = length)\nggplot(data = pdb_ctb[pdb_ctb$Package >= 20, ]) +\n geom_col(aes(x = Package, y = reorder(Maintainer2, Package)), width = .1) +\n theme_classic() +\n labs(x = \"R 包数量\", y = \"开发者\")\n\n\n\n\n\n\n图 23.3: 高产的 R 包开发者\n\n\n\n\n这些开发者的主页和主要的 R 社区贡献如下:\n\n\nDirk Eddelbuettel 维护了 Rcpp、RcppEigen 等流行的 R 包,通过 Rcpp 包将很多优秀的 C++ 库引入 R 语言社区。\n\nStéphane Laurent 维护了很多与 shiny、htmlwidgets 相关的 R 包,比如 rAmCharts4 包。\n\nGábor Csárdi 维护了 igraph 包以及大量帮助 R 包开发的基础设施,RStudio 雇员。\n\nHadley Wickham 维护了 ggplot2、dplyr、devtools 等流行的 R 包,RStudio 雇员。\n\nJeroen Ooms 维护了 magick、curl 以及大量帮助 R 包开发的基础设施。\n\nScott Chamberlain 维护了很多与 HTTP/Web 相关的 R 包,rOpenSci 联合创始人。\n\nRobin K. S. 
Hankin 维护了很多与贝叶斯、多元统计相关的 R 包。\n\nHenrik Bengtsson 维护了 future 和 parallelly 等流行的 R 包,在并行计算方面有很多贡献。\n\nJan Wijffels 维护了很多与自然语言处理、图像识别相关的 R 包,比如 udpipe 、BTM 和 word2vec 等包,Bnosac 团队成员。\n\nKurt Hornik 参与维护 R 软件代码并许多与自然语言处理相关的 R 包,R 核心团队成员。\n\nMartin Maechler 维护了 Matrix 包,R 核心团队成员。\n\nMax Kuhn 维护了 tidymodels 等包,RStudio 雇员。\n\nBob Rudis 维护了一些与 ggplot2 相关的 R 包,如 ggalt、hrbrthemes 和 statebins 等。\n\nKartikeya Bolar 维护了很多统计与 shiny 结合的 R 包,比如方差分析、逻辑回归、列联表、聚类分析等。\n\nKirill Müller 维护了 DBI 等大量与数据库连接的 R 包。\n\nShannon T. Holloway 维护了许多与生存分析相关的 R 包。\n\nSimon Urbanek 维护了 rJava、Rserve 等流行的 R 包,R 核心团队成员,负责维护 R 软件中与 MacOS 平台相关的部分。\n\nAchim Zeileis 维护了 colorspace 等流行的 R 包,R 核心团队成员。\n\nMuhammad Yaseen 维护了多个与 Multiple Indicator Cluster Survey 相关的 R 包。\n\nPablo Sanchez 维护了多个与市场营销平台连接的 R 语言接口,Windsor.ai 组织成员。\n\nThomas Lin Pedersen 维护了 patchwork、 gganimate 和 ggraph 等流行的 R 包,RStudio 雇员。\n\nTorsten Hothorn 在统计检验方面贡献了不少内容,比如 coin 和 multcomp 等包,R 核心团队成员。\n\nRichard Cotton 维护了 assertive 和 rebus 系列 R 包,代码可读性检查。\n\nFlorian Schwendinger 维护了大量运筹优化方面的 R 包,扩展了 ROI 包的能力。\n\nGuangchuang Yu 维护了 ggtree 和 ggimage 等 R 包,在生物信息和可视化领域有不少贡献。\n\nWinston Chang 维护了 shiny 等流行的 R 包,RStudio 雇员。\n\nJohn Muschelli 维护了多个关于神经图像的 R 包。\n\nKevin R. Coombes 维护了多个关于生物信息的 R 包,如 oompaBase 和 oompaData 等。\n\nYihui Xie 维护了 knitr 、rmarkdown 等流行的 R 包,RStudio 雇员。\n\nCarl Boettiger 维护了多个接口包,比如 rfishbase 等,rOpenSci 团队成员。\n\nMichael D. Sumner 维护了多个空间统计相关的 R 包。\n\nEmil Hvitfeldt 维护了多个统计学习相关的 R 包,如 fastTextR 包等,RStudio 雇员。\n\nGeorgi N. 
Boshnakov 维护了多个金融时间序列相关的 R 包,如 fGarch、timeDate 和 timeSeries 等包。\n\nHana Sevcikova 维护了多个与贝叶斯人口统计相关的 R 包。\n\nJoe Thorley 维护了多个与贝叶斯 MCMC 相关的 R 包,Poisson Consulting 雇员。\n\n统计开发者数量随维护 R 包数量的分布,发现,开发 1 个 R 包的开发者有 6732 人,开发 2 个 R 包的开发者有 1685 人,第二名是第一名的五分之一,递减规律非常符合指数分布。\n\ntable(pdb_ctb$Package)\n\n#> \n#> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 \n#> 6732 1685 725 328 177 82 80 52 37 37 29 15 18 8 11 7 \n#> 17 18 19 20 21 22 23 24 25 26 27 28 31 32 33 52 \n#> 1 3 4 4 2 3 3 1 5 5 2 1 1 1 1 3 \n#> 58 63 69 \n#> 1 1 1\n\n\n过滤掉非常高产的开发者,可以发现变化规律服从幂律分布。\nggplot(data = pdb_ctb, aes(x = Package)) +\n geom_histogram(binwidth = 1) +\n theme_classic() +\n labs(x = \"R 包数量\", y = \"开发者\")\nggplot(data = pdb_ctb[pdb_ctb$Package <= 20, ], aes(x = Package)) +\n geom_histogram(binwidth = 1, fill = NA, color = \"gray20\") +\n scale_y_log10() +\n theme_classic() +\n labs(x = \"R 包数量\", y = \"开发者\")\n\n\n\n\n\n\n\n\n\n(a) 直方图\n\n\n\n\n\n\n\n\n\n(b) 直方图(对数尺度)\n\n\n\n\n\n\n图 23.4: 开发者数量的分布\n\n\n最高产 Top 1% 的开发者 131 人(开发 R 包超过 10 个的开发者)贡献了 2329 / 18976 = 12.3% 的扩展包 ,高产的是商业公司、开源组织、大学机构。\n\ndim(pdb_ctb[pdb_ctb$Package > 10, ])\n\n#> [1] 131 2\n\nsum(pdb_ctb[pdb_ctb$Package > 10, \"Package\"])\n\n#> [1] 2329\n\n\n最低产 Bottom 的开发者 6732 人(仅开发一个 R 包的开发者)占总开发者的比例 6732 / 10067 = 66.87%, 贡献了 6732 / 18976 = 35.5 % 的扩展包 ,低产的人是主体。\n\n23.3.2 开发者协作关系\n如果一个开发者维护了一个 R 包,就成为维护者。一个 R 包有唯一的一个维护者,可能有一个至多个贡献者,这样,维护者和贡献者之间就形成了有向关系,贡献者可能又是另一个 R 包的维护者,也可能不是。不仅有向而且可能存在环。在一个 R 包中,A 是 B 的贡献者,而在另一个 R 包中,B 是 A 的贡献者,A 和 B 之间可能通过多个 R 包存在多次互相协作关系,这也表明 A 和 B 之间的关系密切。有向环的节点可能有 2 个以上,一个人可能同时属于多个环。\n维护者 A 接受来自多个开发者的贡献,接受次数(所有贡献者人数的累和,A 的每个 R 包的贡献者人数相加)视为 A 的入度。维护者 A 作为开发者给多个维护者贡献,贡献次数(作为开发者给其它 R 包做贡献的次数,向外参与贡献的 R 包数目)视为 A 的出度。注意,A 作为维护者,必然包含 A 作为开发者,忽略 A 到 A 的贡献,只考虑贡献/协作关系。\n\n# 过滤重复和缺失的记录\npdb <- subset(\n x = pdb, subset = !duplicated(Package) & !is.na(`Authors@R`),\n select = c(\"Package\", \"Maintainer\", \"Authors@R\")\n)\n# 提取维护者的名字\npdb$Maintainer <- extract_maintainer(pdb$Maintainer)\n\n有些包的元数据中没有 Authors@R 字段,有可能是没有贡献者,比如 
mgcv 包、gam 包等,但也有可能是有贡献者,只是维护者没有填写这个字段,比如 Rcpp 包、RcppEigen 包等,因此将这些先过滤出来。总之,本文是以 Authors@R 字段作为贡献者的来源,共计 12503 个 R 包含有 Authors@R ,有 6000+ 个 R 包没有该字段,缺失约占 R 包总数的 1/3,在不那么考虑准确性的情况下,也可以使用。Author 字段是一段没有结构的文本,相比于 Author 字段,Authors@R 字段是以 R 语言中的 person 类型为存储结构的,比较规范,因此,提取贡献者的操作比较方便。作为示例,下面提取 Matrix 包的贡献者。\n\ntmp <- eval(parse(text = pdb[pdb$Package == \"Matrix\", \"Authors@R\"]))\ntmp <- unlist(lapply(tmp, function(x) format(x, include = c(\"given\", \"family\"))))\n# 返回一个整洁的数据框\ntmp <- data.frame(Package = \"Matrix\", Maintainer = pdb[pdb$Package == \"Matrix\", \"Maintainer\"], Authors = tmp)\n# 去掉 Authors 是 Maintainer 的记录\nsubset(tmp, subset = Maintainer != Authors)\n\n#> Package Maintainer Authors\n#> 1 Matrix Martin Maechler Douglas Bates\n#> 3 Matrix Martin Maechler Mikael Jagan\n#> 4 Matrix Martin Maechler Timothy A. Davis\n#> 5 Matrix Martin Maechler Jens Oehlschlägel\n#> 6 Matrix Martin Maechler Jason Riedy\n#> 7 Matrix Martin Maechler R Core Team\n\n\n数据框包含 R 包(Package 字段)、及其维护者(Maintainer 字段)和贡献者(Authors 字段)。将上述过程写成一个函数,接着,将所有 R 包的贡献者提取出来,形成一个大的数据框。\n\nextract_authors <- function(pkg) {\n sub_pdb <- pdb[pdb$Package == pkg, ]\n tmp <- eval(parse(text = sub_pdb[, \"Authors@R\"]))\n tmp <- unlist(lapply(tmp, function(x) format(x, include = c(\"given\", \"family\"))))\n tmp <- data.frame(Package = pkg, Maintainer = sub_pdb[, \"Maintainer\"], Authors = tmp)\n subset(tmp, subset = Maintainer != Authors)\n}\nextract_authors(\"Matrix\")\n\n#> Package Maintainer Authors\n#> 1 Matrix Martin Maechler Douglas Bates\n#> 3 Matrix Martin Maechler Mikael Jagan\n#> 4 Matrix Martin Maechler Timothy A. 
Davis\n#> 5 Matrix Martin Maechler Jens Oehlschlägel\n#> 6 Matrix Martin Maechler Jason Riedy\n#> 7 Matrix Martin Maechler R Core Team\n\n# lapply(c(\"Matrix\", \"gt\"), extract_authors)\n# 抽取所有 R 包的贡献者,运行需要1-2分钟时间\npdb_authors_list <- lapply(pdb[, \"Package\"], extract_authors)\n# 合并列表\npdb_authors_dt <- data.table::rbindlist(pdb_authors_list)\n\n最后整理出来的大数据框 pdb_authors_dt 含有近 26000 条记录,即边的规模大小。考虑到有些维护者和贡献者之间可能存在多次合作的情况,下面统计一下合作次数。\n\npdb_authors_dt[ ,.(cnt = length(Package)) , by = c(\"Maintainer\", \"Authors\")\n ][cnt >= 10, ][order(cnt, decreasing = T), ]\n\n#> Maintainer Authors cnt\n#> <char> <char> <int>\n#> 1: Hadley Wickham RStudio 36\n#> 2: Pablo Sanchez Windsor.ai 25\n#> 3: Jan Wijffels BNOSAC 24\n#> 4: Gábor Csárdi RStudio 19\n#> 5: Hong Ooi Microsoft 16\n#> 6: Max Kuhn RStudio 14\n#> 7: Lionel Henry RStudio 14\n#> 8: Robrecht Cannoodt Wouter Saelens 13\n#> 9: Scott Chamberlain rOpenSci 13\n#> 10: Joe Thorley Poisson Consulting 13\n#> 11: Frederic Bertrand Myriam Maumy-Bertrand 12\n#> 12: Winston Chang RStudio 12\n#> 13: Daniel Falbel RStudio 12\n#> 14: David Kretch Adam Banker 12\n#> 15: David Kretch Amazon.com, Inc. 
12\n#> 16: Victor Perrier Fanny Meyer 11\n#> 17: Jennifer Bryan RStudio 11\n#> 18: William Michael Landau Eli Lilly and Company 11\n#> 19: Adrian Baddeley Ege Rubak 11\n#> 20: Gábor Csárdi Jim Hester 10\n#> 21: Kirill Müller RStudio 10\n#> 22: Carson Sievert RStudio 10\n#> 23: Thomas Lin Pedersen RStudio 10\n#> 24: Lionel Henry Hadley Wickham 10\n#> 25: Adrian Baddeley Rolf Turner 10\n#> Maintainer Authors cnt\n\n\nAuthors 字段出现了不少组织的名字,这是因为有许多 R 包的维护者受雇于该组织,版权归属于该组织,组织不仅提供持续的资金,而且还提供其它帮助。以 dplyr 包为例,Hadley Wickham 受雇于 RStudio 公司,在 dplyr 包的元数据中,字段 Authors@R 中 RStudio 的角色是 cph 和 fnd ,即版权所有和资金支持。角色 cre 就是维护者,负责与 CRAN 团队的沟通。角色 aut 就是对 R 包有实质贡献的人。\n\nformat(eval(parse(text = pdb[pdb$Package == \"dplyr\", \"Authors@R\"])),\n include = c(\"given\", \"family\", \"role\"))\n\n#> [1] \"Hadley Wickham [aut, cre]\" \"Romain François [aut]\" \n#> [3] \"Lionel Henry [aut]\" \"Kirill Müller [aut]\" \n#> [5] \"RStudio [cph, fnd]\"\n\n\n此外,同属于一个组织的维护者之间常常合作紧密,从上面的结果可以看到,Gábor Csárdi 和 Jim Hester ,Lionel Henry 和 Hadley Wickham,Carson Sievert 和 Joe Cheng ,Jennifer Bryan 和 Hadley Wickham 等同属于 RStudio 公司,常常协作开发项目。对 RStudio、CRAN Team 和 rOpenSci 不再赘述,下面对排名靠前的其它组织略作说明。\n\n\nWindsor.ai 提供一系列可以连接各大营销平台,获取营销效果数据 R 包。\n\nBNOSAC 提供一系列计算机视觉、图像识别、自然语言处理方面的 R 包,比如 udpipe、word2vec、doc2vec 等包。\nMicrosoft 提供一系列连接和操作 Azure 云套件的 R 包,比如 AzureR 包。\n\nWouter Saelens 提供一系列单细胞轨迹推理(single-cell trajectory inference)相关的 R 包,形成一个 dynverse 家族。\n\n\nPoisson Consulting 提供一系列用于计算生物学和统计生态学的 R 包和相关研究论文。\n\nAmazon.com, Inc. 
提供一系列用于存储、管理、操作等 Amazon 云服务的 R 包,形成一个 paws 套件。\n\nEli Lilly and Company 可能是 rOpenSci 的一员,赞助了旗下的 targets 和 jagstargets 等 R 包。\n\n最后,统计协作次数的分布,网络中边的权重的分布。\n\npdb_authors_net <- pdb_authors_dt[, .(cnt = .N), by = c(\"Maintainer\", \"Authors\")]\ntable(pdb_authors_net$cnt)\n\n#> \n#> 1 2 3 4 5 6 7 8 9 10 11 12 13 \n#> 20432 1511 365 121 44 28 14 8 3 6 4 5 3 \n#> 14 16 19 24 25 36 \n#> 2 1 1 1 1 1\n\n\n可以发现,绝大多数人之间协作只有一次。\n\n23.3.3 节点出入度分布\n下面简化这个网络,仅考虑贡献者也是维护者的情况,就是说网络中所有节点既是维护者也是贡献者,这会过滤掉组织机构、大量没有在 CRAN 发过 R 包的贡献者、从没给其它维护者做贡献的维护者。简化后,网络节点的出度、入度的分布图如下。\n# Maintainer 的入度\npdb_authors_net_indegree <- pdb_authors_dt[Authors %in% Maintainer, \n ][, .(in_degree = length(Authors)), by = \"Maintainer\"]\n# Authors 的出度\npdb_authors_net_outdegree <- pdb_authors_dt[Authors %in% Maintainer, \n ][, .(out_degree = length(Maintainer)), by = \"Authors\"]\n\nggplot(pdb_authors_net_indegree, aes(x = in_degree)) +\n geom_histogram(binwidth = 1) +\n geom_freqpoly(binwidth = 1) +\n theme_classic()\nggplot(pdb_authors_net_outdegree, aes(x = out_degree)) +\n geom_histogram(binwidth = 1) +\n geom_freqpoly(binwidth = 1) +\n theme_classic()\n\n\n\n\n\n\n\n\n\n(a) 入度的分布\n\n\n\n\n\n\n\n\n\n(b) 出度的分布\n\n\n\n\n\n\n图 23.5: 节点的入度和出度的分布\n\n\n\n23.3.4 可视化协作网络\n节点的大小以维护者维护的 R 包数量来表示,边的大小以维护者之间协作次数来表示。为了美观起见,更为了突出重点,仅保留协作次数大于 1 的边。\n\n# 边\npdb_authors_net_edge <- pdb_authors_dt[Authors %in% Maintainer, \n ][, .(edge_cnt = .N), by = c(\"Authors\", \"Maintainer\")][edge_cnt > 1, ]\npdb_authors_net_edge[order(edge_cnt, decreasing = TRUE),]\n\n#> Authors Maintainer edge_cnt\n#> <char> <char> <int>\n#> 1: Jim Hester Gábor Csárdi 10\n#> 2: Hadley Wickham Lionel Henry 10\n#> 3: Joe Cheng Carson Sievert 9\n#> 4: Hadley Wickham Jennifer Bryan 8\n#> 5: Steven Andrew Culpepper James Joseph Balamuta 8\n#> --- \n#> 526: Aaron Wolen Scott Chamberlain 2\n#> 527: Bob Rudis Simon Garnier 2\n#> 528: Marco Sciaini Simon Garnier 2\n#> 529: Carlos Morales Martin Chan 2\n#> 530: Md Yeasin Ranjit Kumar Paul 2\n\n# 
顶点\npdb_authors_net_vertex <- pdb_authors_dt[, .(vertex_cnt = length(unique(Package))), by = \"Maintainer\"\n ][Maintainer %in% c(pdb_authors_net_edge$Maintainer, pdb_authors_net_edge$Authors),]\npdb_authors_net_vertex[order(vertex_cnt, decreasing = TRUE),]\n\n#> Maintainer vertex_cnt\n#> <char> <int>\n#> 1: Hadley Wickham 43\n#> 2: Gábor Csárdi 33\n#> 3: Jeroen Ooms 28\n#> 4: Scott Chamberlain 28\n#> 5: Yihui Xie 21\n#> --- \n#> 579: Katriona Goldmann 1\n#> 580: Carlo Pacioni 1\n#> 581: Michael Scholz 1\n#> 582: Javier Roca-Pardinas 1\n#> 583: Xianying Tan 1\n\n\n这是一个有向图,其各个字段含义如下。\n\nMaintainer 维护者(代表流 to)\nAuthors 贡献者(代表源 from)\n\nedge_cnt 边的大小表示维护者 Maintainer 和贡献者 Authors 的协作次数\n\nvertex_cnt 顶点大小表示维护者 Maintainer 维护的 R 包数量\n\n下面先考虑用 igraph 包可视化这个复杂的有向带权网络。pdb_authors_net_edge 和 pdb_authors_net_vertex 都是数据框,首先调用 igraph 包的函数 graph_from_data_frame() 将其转化为网络类型 igraph ,然后使用函数 plot() 绘制网络图。\n\n代码# 构造图\nlibrary(igraph)\npdb_authors_graph <- graph_from_data_frame(d = pdb_authors_net_edge, vertices = pdb_authors_net_vertex, directed = TRUE)\n# 可视化\nop <- par(mar = rep(0, 4))\nplot(pdb_authors_graph,\n edge.width = (E(pdb_authors_graph)$edge_cnt) / 2,\n edge.arrow.size = .01,\n edge.curved = .1,\n layout = layout.kamada.kawai,\n vertex.size = (V(pdb_authors_graph)$vertex_cnt) / 8,\n vertex.label.cex = sqrt(V(pdb_authors_graph)$vertex_cnt) / 8\n)\non.exit(par(op), add = TRUE)\n\n\n\n\n\n\n图 23.6: 开发者的协作关系网络\n\n\n\n\n协作关系弱的开发者占大部分,构成一个「月亮」的造型,其中,不乏维护多个 R 包的开发者,这些人要么单干,要么在专业小领域、小组织内协作。与之相对应的是协作关系较强的开发者,人数虽少,影响力却大,构成一个「太阳」的造型。协作得多往往意味着维护的 R 包也不少,甚至同属于一个组织,因此,高产的开发者、影响力大的组织聚集在一起,如 R Core Team、RStudio、rOpenSci 等。\n\neb <- cluster_edge_betweenness(pdb_authors_graph)\neb\n\n#> IGRAPH clustering edge betweenness, groups: 181, mod: 0.88\n#> + groups:\n#> $`1`\n#> [1] \"Matt Nunes\" \"Daniel Grose\" \"Guy Nason\" \n#> [4] \"Rebecca Killick\" \"Idris Eckley\" \"Alessandro Cardinali\"\n#> \n#> $`2`\n#> [1] \"Jin Zhu\" \"Shiyun Lin\"\n#> \n#> $`3`\n#> [1] \"Julio Trecenti\" \"Henrik 
Bengtsson\" \"Morgane Pierre-Jean\" \n#> [4] \"Zhian N. Kamvar\" \"Pierre Neuvial\" \"Michal Bojanowski\" \n#> + ... omitted several groups/vertices\n\n\nigraph 包提供多种社区探测的算法,上面简单使用函数 cluster_edge_betweenness() 来探测,结果显示有 181 个社区。社区 1 包含的成员如下:\n\neb$names[eb$membership == 1]\n\n#> [1] \"Matt Nunes\" \"Daniel Grose\" \"Guy Nason\" \n#> [4] \"Rebecca Killick\" \"Idris Eckley\" \"Alessandro Cardinali\"\n\n\n社区 3、14、21、34、46、52、75 的成员是比较多的。其中,社区 3 是以 RStudio 为核心的大社区,社区 14 是以 CRAN 为核心的大社区。\n\n# RStudio 为核心的大社区\neb$names[eb$membership == 3]\n\n#> [1] \"Julio Trecenti\" \"Henrik Bengtsson\" \"Morgane Pierre-Jean\" \n#> [4] \"Zhian N. Kamvar\" \"Pierre Neuvial\" \"Michal Bojanowski\" \n#> [7] \"Ian Lyttle\" \"Thomas Lin Pedersen\" \"Yihui Xie\" \n#> [10] \"Dirk Schumacher\" \"Jeroen Ooms\" \"Gábor Csárdi\" \n#> [13] \"Sean Kross\" \"Carl Boettiger\" \"Neal Richardson\" \n#> [16] \"Ryan Hafen\" \"Matthew Fidler\" \"Hadley Wickham\" \n#> [19] \"Mark Edmondson\" \"Kirill Müller\" \"Richard Iannone\" \n#> [22] \"Carson Sievert\" \"Winston Chang\" \"Lionel Henry\" \n#> [25] \"Jennifer Bryan\" \"Michael Sumner\" \"Scott Chamberlain\" \n#> [28] \"Garrick Aden-Buie\" \"Daniel Falbel\" \"Matthew B. 
Jones\" \n#> [31] \"Hiroaki Yutani\" \"Taiyun Wei\" \"Jim Hester\" \n#> [34] \"Romain François\" \"Greg Freedman Ellis\" \"Rhian Davies\" \n#> [37] \"Bryce Mecum\" \"Steph Locke\" \"Christophe Dervieux\" \n#> [40] \"Jonathan Keane\" \"Thibaut Jombart\" \"Dewey Dunnington\" \n#> [43] \"Anne Cori\" \"Bill Denney\" \"Jared Huling\" \n#> [46] \"Wush Wu\" \"Atsushi Yasumoto\" \"Barret Schloerke\" \n#> [49] \"Yuan Tang\" \"Duncan Garmonsway\" \"Edzer Pebesma\" \n#> [52] \"Sebastian Meyer\" \"Derek Burk\" \"Tim Taylor\" \n#> [55] \"Alicia Schep\" \"Tomasz Kalinowski\" \"Michael Rustler\" \n#> [58] \"Joe Cheng\" \"Bhaskar Karambelkar\" \"Sebastian Kreutzer\" \n#> [61] \"JJ Allaire\" \"JooYoung Seo\" \"Zachary Foster\" \n#> [64] \"Malcolm Barrett\" \"Aaron Wolen\" \"Bruno Tremblay\" \n#> [67] \"Justin Wilkins\" \"Yixuan Qiu\" \"Johannes Friedrich\" \n#> [70] \"Kevin Ushey\" \"Steven M. Mortimer\" \"Karthik Ram\" \n#> [73] \"Jorrit Poelen\" \"Maëlle Salmon\" \"Aron Atkins\" \n#> [76] \"Ramnath Vaidyanathan\" \"Thomas Leeper\" \"Dirk Eddelbuettel\" \n#> [79] \"Xianying Tan\"\n\n# CRAN 为核心的大社区\neb$names[eb$membership == 14]\n\n#> [1] \"Achim Zeileis\" \"Michael Hahsler\" \"Michel Lang\" \n#> [4] \"Nikolaus Umlauf\" \"Vincent Dorie\" \"Bettina Gruen\" \n#> [7] \"Bernd Bischl\" \"Ben Bolker\" \"Marc Becker\" \n#> [10] \"Friedrich Leisch\" \"Brian Ripley\" \"Michael Friendly\" \n#> [13] \"John Fox\" \"Kurt Hornik\" \"Patrick Schratz\" \n#> [16] \"Volodymyr Melnykov\" \"Martin Maechler\" \"George Ostrouchov\" \n#> [19] \"Drew Schmidt\" \"Georgi N. 
Boshnakov\" \"Wei-Chen Chen\" \n#> [22] \"Stefan Theussl\" \"David Meyer\" \"Jakob Bossek\" \n#> [25] \"Francois Michonneau\" \"Marius Hofert\" \"Florian Schwendinger\"\n#> [28] \"Felix Zimmer\" \"Martin Binder\" \"Phil Chalmers\" \n#> [31] \"Lukas Sablica\" \"Sebastian Fischer\" \"Lennart Schneider\" \n#> [34] \"Jakob Richter\" \"Florian Wickelmaier\" \"Rudolf Debelak\" \n#> [37] \"Duncan Murdoch\" \"Alexander Brenning\" \"Ingo Feinerer\"\n\n\n同时,在 RStudio 这个大社区下,有一些与之紧密相关的小社区,比如 Rob Hyndman 等人的时间序列社区、Roger Bivand 等人的空间统计社区。\n\n# 时间序列 Rob Hyndman\neb$names[eb$membership == 52]\n\n#> [1] \"Asael Alonzo Matamoros\" \"Nicholas Tierney\" \n#> [3] \"Sevvandi Kandanaarachchi\" \"Rob Hyndman\" \n#> [5] \"Di Cook\" \"Mitchell O'Hara-Wild\" \n#> [7] \"Han Lin Shang\" \"Sayani Gupta\" \n#> [9] \"Earo Wang\" \"Christoph Bergmeir\"\n\n# 空间统计 Roger Bivand\neb$names[eb$membership == 75]\n\n#> [1] \"Sebastian Jeworutzki\" \"Roger Bivand\" \"Colin Rundel\" \n#> [4] \"Angela Li\" \"Gianfranco Piras\" \"Patrick Giraudoux\" \n#> [7] \"Giovanni Millo\"\n\n\n结合前面的 图 23.6 ,知道有很多小圈圈,这些放一边,重点关注那些大的圈圈,见下图。\n\n代码op <- par(mar = rep(0, 4))\nplot(eb, pdb_authors_graph,\n edge.width = (E(pdb_authors_graph)$edge_cnt) / 4,\n edge.arrow.size = .01,\n edge.curved = .1,\n layout = layout.kamada.kawai,\n vertex.size = (V(pdb_authors_graph)$vertex_cnt) / 8,\n vertex.label.cex = sqrt(V(pdb_authors_graph)$vertex_cnt) / 8\n)\non.exit(par(op), add = TRUE)\n\n\n\n\n\n\n图 23.7: 探测协作关系网络中的社区\n\n\n\n\n下面使用 tidygraph 包构造图数据、计算节点中心度,dplyr 包操作数据。中心度代表节点(开发者)的影响力(或者重要性)。最后,借助 ggraph 包绘制维护者之间的贡献网络,节点的大小代表维护者影响力的强弱。\n\n代码pdb_authors_g <- tidygraph::as_tbl_graph(pdb_authors_net_edge, directed = T) |> \n dplyr::mutate(Popularity = tidygraph::centrality_degree(mode = 'in'))\nlibrary(ggraph)\nggraph(pdb_authors_g, layout = \"kk\") +\n geom_edge_fan(aes(alpha = after_stat(index)), show.legend = FALSE) +\n geom_node_point(aes(size = Popularity), alpha = 0.5) +\n theme_graph(base_family = \"sans\")\n\n\n\n\n\n\n图 23.8: 
开发者的影响力网络\n\n\n\n\n前面两个网络图基于同一份数据、同样的网络布局算法,得到非常类似的结果。静态图上的标签相互重叠,影响细节的观察和探索,比如连接 CRAN 和 RStudio 两大阵营的通道。下面使用 visNetwork 包制作交互式网络图形,它是 JS 库 vis-network 的 R 语言接口, 使用 visNetwork 包绘制交互式网络图后,可以在图上使用鼠标放大、拖拽。可以发现在 CRAN 社区的 Achim Zeileis 和 RStudio 社区的 Max Kuhn 之间是由 Andri Signorell 牵线搭桥。此外,读者若有兴趣,可以使用 Richard Iannone 开发的 DiagrammeR 包制作静态的矢量网页图形。\n\n代码library(visNetwork)\n# 将 igraph 对象转为 visNetwork 包可用的数据\ndat <- toVisNetworkData(pdb_authors_graph)\nnodes_df <- dat$nodes\nnodes_df$value <- nodes_df$vertex_cnt\nedges_df <- dat$edges\nedges_df$value <- edges_df$edge_cnt\n# 输入节点和边的数据\nvisNetwork(nodes = nodes_df, edges = edges_df, height = \"600px\") |> \n visIgraphLayout(randomSeed = 20232023, layout = \"layout.kamada.kawai\")\n\n\n\n\n\n\n图 23.9: 开发者的影响力网络(visNetwork)", "crumbs": [ "数据建模", "23  网络数据分析" @@ -1674,7 +1674,7 @@ "href": "analyze-text-data.html#sec-topic-models", "title": "24  文本数据分析", "section": "\n24.4 主题的探索", - "text": "24.4 主题的探索\n益辉的日志是没有分类和标签的,所以,先聚类,接着逐个分析每个类代表的实际含义。然后,将聚类的结果作为结果标签,再应用多分类回归模型,最后联合聚类、分类模型,从无监督转化到有监督模型。\ntopicmodels (Grün 和 Hornik 2011) 基于 tm (Feinerer, Hornik, 和 Meyer 2008) 支持潜在狄利克雷分配(Latent Dirichlet Allocation,简称 LDA) 和 Correlated Topics Models (CTM) 文本主题建模,这一套工具比较适合英文文本分词、向量化和建模。text2vec 包支持多个统计模型,如LDA 、LSA 、GloVe 等,文本向量化后,结合统计学习模型,可用于分类、回归、聚类等任务,更多详情见 https://text2vec.org。\n接下来使用 David M. 
Blei 等提出 LDA 算法做主题建模,详情见 LDA 算法原始论文。\n\nlibrary(text2vec)\n\n首先将所有日志分词、向量化,构建文档-词矩阵 document-term matrix (DTM)\n\n# 移除链接\nremove_links <- function(x) {\n gsub(pattern = \"(<http.*?>)|(\\\\(http.*?\\\\))|(<www.*?>)|(\\\\(www.*?>\\\\))\", replacement = \"\", x)\n}\n# 清理、分词、清理\nfile_list1 <- lapply(file_list, remove_yaml)\nfile_list1 <- lapply(file_list1, remove_links)\nfile_list1 <- lapply(file_list1, segment, jiebar = jieba_seg)\nfile_list1 <- lapply(file_list1, remove_number_english)\n\n去掉没啥实际意义的词(比如单个字),极高频词和极低频词。\n\n# Token 化\nit <- itoken(file_list1, ids = 1:length(file_list1), progressbar = FALSE)\nv <- create_vocabulary(it)\n# 去掉单个字 减少 3K\nv <- v[nchar(v$term) > 1,]\n# 去掉极高频词和极低频词 减少 1.4W\nv <- prune_vocabulary(v, term_count_min = 10, doc_proportion_max = 0.2)\n\n采用 LDA(Latent Dirichlet Allocation)算法建模\n\n# 词向量化\nvectorizer <- vocab_vectorizer(v)\n# 文档-词矩阵 DTM\ndtm <- create_dtm(it, vectorizer, type = \"dgTMatrix\")\n# 10 个主题\nlda_model <- LDA$new(n_topics = 9, doc_topic_prior = 0.1, topic_word_prior = 0.01)\n# 训练模型\ndoc_topic_distr <- lda_model$fit_transform(\n x = dtm, n_iter = 1000, convergence_tol = 0.001, \n n_check_convergence = 25, progressbar = FALSE\n )\n\n#> INFO [06:14:42.698] early stopping at 175 iteration\n#> INFO [06:14:43.321] early stopping at 50 iteration\n\n\n下图展示主题的分布,各个主题及其所占比例。\n\nbarplot(\n doc_topic_distr[1, ], xlab = \"主题\", ylab = \"比例\", \n ylim = c(0, 1), names.arg = 1:ncol(doc_topic_distr)\n)\n\n\n\n\n\n\n图 24.3: 主题分布\n\n\n\n\n将 9 个主题的 Top 12 词分别打印出来。\n\nlda_model$get_top_words(n = 12, topic_number = 1L:9L, lambda = 0.3)\n\n#> [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] \n#> [1,] \"例子\" \"一首\" \"社会\" \"统计\" \"代码\" \"记得\" \"吱吱\" \"时代\" \"网站\" \n#> [2,] \"翻译\" \"歌词\" \"观点\" \"会议\" \"函数\" \"不知\" \"照片\" \"意义\" \"数据\" \n#> [3,] \"字符\" \"手机\" \"痛苦\" \"模型\" \"文档\" \"同学\" \"好吃\" \"媒体\" \"图形\" \n#> [4,] \"特征\" \"这首\" \"教育\" \"论文\" \"文件\" \"阿姨\" \"我家\" \"文化\" \"域名\" \n#> [5,] \"作品\" \"首歌\" \"人类\" \"老师\" \"变量\" \"居然\" \"家里\" \"现实\" \"软件\" 
\n#> [6,] \"中文\" \"遗憾\" \"追求\" \"分布\" \"字体\" \"看见\" \"味道\" \"社交\" \"服务器\"\n#> [7,] \"排版\" \"艺术\" \"思考\" \"小子\" \"元素\" \"学校\" \"厨房\" \"社区\" \"邮件\" \n#> [8,] \"意思\" \"小说\" \"强烈\" \"统计学\" \"语法\" \"路上\" \"在家\" \"眼中\" \"提供\" \n#> [9,] \"风格\" \"生活\" \"成功\" \"参加\" \"编译\" \"听说\" \"黄瓜\" \"避免\" \"编辑\" \n#> [10,] \"主题\" \"诗词\" \"接受\" \"报告\" \"图片\" \"印象\" \"包子\" \"造成\" \"系统\" \n#> [11,] \"伟大\" \"鸡蛋\" \"工作\" \"检验\" \"参数\" \"当时\" \"叶子\" \"事实\" \"浏览器\"\n#> [12,] \"表示\" \"人间\" \"努力\" \"学生\" \"生成\" \"名字\" \"辣椒\" \"政治\" \"注册\"\n\n\n结果有点意思,说明益辉喜欢读书写作(主题 1、3、8)、诗词歌赋(主题 2)、统计图形(主题 4)、代码编程(主题 5)、回忆青春(主题 6)、做菜吃饭(7)、倒腾网站(主题 9)。\n\n\n\n\n\n\n注释\n\n\n\n提示:参考论文 (Zhang, Li, 和 Zhang 2023) 根据 perplexities 做交叉验证选择最合适的主题数量。", + "text": "24.4 主题的探索\n益辉的日志是没有分类和标签的,所以,先聚类,接着逐个分析每个类代表的实际含义。然后,将聚类的结果作为结果标签,再应用多分类回归模型,最后联合聚类、分类模型,从无监督转化到有监督模型。\ntopicmodels (Grün 和 Hornik 2011) 基于 tm (Feinerer, Hornik, 和 Meyer 2008) 支持潜在狄利克雷分配(Latent Dirichlet Allocation,简称 LDA) 和 Correlated Topics Models (CTM) 文本主题建模,这一套工具比较适合英文文本分词、向量化和建模。text2vec 包支持多个统计模型,如LDA 、LSA 、GloVe 等,文本向量化后,结合统计学习模型,可用于分类、回归、聚类等任务,更多详情见 https://text2vec.org。\n接下来使用 David M. 
Blei 等提出 LDA 算法做主题建模,详情见 LDA 算法原始论文。\n\nlibrary(text2vec)\n\n首先将所有日志分词、向量化,构建文档-词矩阵 document-term matrix (DTM)\n\n# 移除链接\nremove_links <- function(x) {\n gsub(pattern = \"(<http.*?>)|(\\\\(http.*?\\\\))|(<www.*?>)|(\\\\(www.*?>\\\\))\", replacement = \"\", x)\n}\n# 清理、分词、清理\nfile_list1 <- lapply(file_list, remove_yaml)\nfile_list1 <- lapply(file_list1, remove_links)\nfile_list1 <- lapply(file_list1, segment, jiebar = jieba_seg)\nfile_list1 <- lapply(file_list1, remove_number_english)\n\n去掉没啥实际意义的词(比如单个字),极高频词和极低频词。\n\n# Token 化\nit <- itoken(file_list1, ids = 1:length(file_list1), progressbar = FALSE)\nv <- create_vocabulary(it)\n# 去掉单个字 减少 3K\nv <- v[nchar(v$term) > 1,]\n# 去掉极高频词和极低频词 减少 1.4W\nv <- prune_vocabulary(v, term_count_min = 10, doc_proportion_max = 0.2)\n\n采用 LDA(Latent Dirichlet Allocation)算法建模\n\n# 词向量化\nvectorizer <- vocab_vectorizer(v)\n# 文档-词矩阵 DTM\ndtm <- create_dtm(it, vectorizer, type = \"dgTMatrix\")\n# 10 个主题\nlda_model <- LDA$new(n_topics = 9, doc_topic_prior = 0.1, topic_word_prior = 0.01)\n# 训练模型\ndoc_topic_distr <- lda_model$fit_transform(\n x = dtm, n_iter = 1000, convergence_tol = 0.001, \n n_check_convergence = 25, progressbar = FALSE\n )\n\n#> INFO [05:04:54.962] early stopping at 175 iteration\n#> INFO [05:04:55.568] early stopping at 50 iteration\n\n\n下图展示主题的分布,各个主题及其所占比例。\n\nbarplot(\n doc_topic_distr[1, ], xlab = \"主题\", ylab = \"比例\", \n ylim = c(0, 1), names.arg = 1:ncol(doc_topic_distr)\n)\n\n\n\n\n\n\n图 24.3: 主题分布\n\n\n\n\n将 9 个主题的 Top 12 词分别打印出来。\n\nlda_model$get_top_words(n = 12, topic_number = 1L:9L, lambda = 0.3)\n\n#> [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] \n#> [1,] \"例子\" \"一首\" \"社会\" \"统计\" \"代码\" \"记得\" \"吱吱\" \"时代\" \"网站\" \n#> [2,] \"翻译\" \"歌词\" \"观点\" \"会议\" \"函数\" \"不知\" \"照片\" \"意义\" \"数据\" \n#> [3,] \"字符\" \"手机\" \"痛苦\" \"模型\" \"文档\" \"同学\" \"好吃\" \"媒体\" \"图形\" \n#> [4,] \"特征\" \"这首\" \"教育\" \"论文\" \"文件\" \"阿姨\" \"我家\" \"文化\" \"域名\" \n#> [5,] \"作品\" \"首歌\" \"人类\" \"老师\" \"变量\" \"居然\" \"家里\" \"现实\" \"软件\" 
\n#> [6,] \"中文\" \"遗憾\" \"追求\" \"分布\" \"字体\" \"看见\" \"味道\" \"社交\" \"服务器\"\n#> [7,] \"排版\" \"艺术\" \"思考\" \"小子\" \"元素\" \"学校\" \"厨房\" \"社区\" \"邮件\" \n#> [8,] \"意思\" \"小说\" \"强烈\" \"统计学\" \"语法\" \"路上\" \"在家\" \"眼中\" \"提供\" \n#> [9,] \"风格\" \"生活\" \"成功\" \"参加\" \"编译\" \"听说\" \"黄瓜\" \"避免\" \"编辑\" \n#> [10,] \"主题\" \"诗词\" \"接受\" \"报告\" \"图片\" \"印象\" \"包子\" \"造成\" \"系统\" \n#> [11,] \"伟大\" \"鸡蛋\" \"工作\" \"检验\" \"参数\" \"当时\" \"叶子\" \"事实\" \"浏览器\"\n#> [12,] \"表示\" \"人间\" \"努力\" \"学生\" \"生成\" \"名字\" \"辣椒\" \"政治\" \"注册\"\n\n\n结果有点意思,说明益辉喜欢读书写作(主题 1、3、8)、诗词歌赋(主题 2)、统计图形(主题 4)、代码编程(主题 5)、回忆青春(主题 6)、做菜吃饭(7)、倒腾网站(主题 9)。\n\n\n\n\n\n\n注释\n\n\n\n提示:参考论文 (Zhang, Li, 和 Zhang 2023) 根据 perplexities 做交叉验证选择最合适的主题数量。", "crumbs": [ "数据建模", "24  文本数据分析" @@ -1718,7 +1718,7 @@ "href": "analyze-survival-data.html#模型拟合", "title": "25  生存数据分析", "section": "\n25.2 模型拟合", - "text": "25.2 模型拟合\nCox 比例风险回归模型与 Box-Cox 变换 (Box 和 Cox 1964)\n\n\nsurvival::coxph() Cox 比例风险回归模型\n\nMASS::boxcox() Box-Cox 变换\nglmnet::glmnet(family = \"cox\")\nINLA 包的函数 inla() 与 inla.surv() 一起拟合,链接\n\n\nsurvstan Stan 与生存分析\nrstanarm 包的函数 stan_jm() 使用说明 Estimating Joint Models for Longitudinal and Time-to-Event Data with rstanarm 链接\n\nrstanarm 包的生存分析分支\n\n\n\n25.2.1 survival\nR 软件内置了 survival 包,它是实现生存分析的核心 R 包 (Terry M. Therneau 和 Patricia M. 
Grambsch 2000),其函数 survfit() 拟合模型。\n\naml_survival <- survfit(Surv(time, status) ~ x, data = aml)\nsummary(aml_survival)\n\nCall: survfit(formula = Surv(time, status) ~ x, data = aml)\n\n x=Maintained \n time n.risk n.event survival std.err lower 95% CI upper 95% CI\n 9 11 1 0.909 0.0867 0.7541 1.000\n 13 10 1 0.818 0.1163 0.6192 1.000\n 18 8 1 0.716 0.1397 0.4884 1.000\n 23 7 1 0.614 0.1526 0.3769 0.999\n 31 5 1 0.491 0.1642 0.2549 0.946\n 34 4 1 0.368 0.1627 0.1549 0.875\n 48 2 1 0.184 0.1535 0.0359 0.944\n\n x=Nonmaintained \n time n.risk n.event survival std.err lower 95% CI upper 95% CI\n 5 12 2 0.8333 0.1076 0.6470 1.000\n 8 10 2 0.6667 0.1361 0.4468 0.995\n 12 8 1 0.5833 0.1423 0.3616 0.941\n 23 6 1 0.4861 0.1481 0.2675 0.883\n 27 5 1 0.3889 0.1470 0.1854 0.816\n 30 4 1 0.2917 0.1387 0.1148 0.741\n 33 3 1 0.1944 0.1219 0.0569 0.664\n 43 2 1 0.0972 0.0919 0.0153 0.620\n 45 1 1 0.0000 NaN NA NA\n\n\n拟合 Cox 比例风险回归模型(Cox Proportional Hazards Regression Model)\n\naml_coxph <- coxph(Surv(time, status) ~ 1 + x, data = aml)\nsummary(aml_coxph)\n\nCall:\ncoxph(formula = Surv(time, status) ~ 1 + x, data = aml)\n\n n= 23, number of events= 18 \n\n coef exp(coef) se(coef) z Pr(>|z|) \nxNonmaintained 0.9155 2.4981 0.5119 1.788 0.0737 .\n---\nSignif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 
0.1 ' ' 1\n\n exp(coef) exp(-coef) lower .95 upper .95\nxNonmaintained 2.498 0.4003 0.9159 6.813\n\nConcordance= 0.619 (se = 0.063 )\nLikelihood ratio test= 3.38 on 1 df, p=0.07\nWald test = 3.2 on 1 df, p=0.07\nScore (logrank) test = 3.42 on 1 df, p=0.06\n\n\n展示拟合结果。可以绘制生存分析的图的 R 包有很多,比如 ggfortify 包、ggsurvfit 包和 survminer 包等。ggfortify 包可以直接针对函数 survfit() 的返回对象绘图,ggsurvfit 包提供新函数 survfit2() 拟合模型、函数 ggsurvfit() 绘制图形,画面内容更加丰富,而 survminer 包依赖很多。\n\nlibrary(ggplot2)\nlibrary(ggfortify)\nautoplot(aml_survival, data = aml) +\n theme_minimal()\n\n\n\n\n\n\n图 25.2: 急性粒细胞白血病生存数据\n\n\n\n\n参数化的生存分析模型(参数模型,相对于非参数模型而言)\n\naml_surv_reg <- survreg(Surv(time, status) ~ x, data = aml, dist = \"weibull\")\nsummary(aml_surv_reg)\n\n\nCall:\nsurvreg(formula = Surv(time, status) ~ x, data = aml, dist = \"weibull\")\n Value Std. Error z p\n(Intercept) 4.109 0.300 13.70 <2e-16\nxNonmaintained -0.929 0.383 -2.43 0.015\nLog(scale) -0.235 0.178 -1.32 0.188\n\nScale= 0.791 \n\nWeibull distribution\nLoglik(model)= -80.5 Loglik(intercept only)= -83.2\n Chisq= 5.31 on 1 degrees of freedom, p= 0.021 \nNumber of Newton-Raphson Iterations: 5 \nn= 23 \n\n\n\n25.2.2 glmnet\nglmnet 包拟合 Cox 比例风险回归模型 (Simon 等 2011) 适合需要多变量筛选的情况。\n\nlibrary(glmnet)\n# alpha = 1 lasso\naml_glmnet <- glmnet(x = aml$x, y = Surv(aml$time, aml$status), family = \"cox\", alpha = 1)\naml_glmnet_cv <- cv.glmnet(x = aml$x, y = Surv(aml$time, aml$status), family = \"cox\", alpha = 1)\n\n\n25.2.3 INLA\nINLA 包拟合 Cox 比例风险回归模型 (Gómez-Rubio 2020) 采用近似贝叶斯推断。\n\nlibrary(INLA)\ninla.setOption(short.summary = TRUE)\naml_inla <- inla(inla.surv(time, status) ~ x, data = aml, family = \"exponential.surv\", num.threads = \"1:1\")\nsummary(aml_inla)\n\nFixed effects:\n mean sd 0.025quant 0.5quant 0.975quant mode kld\n(Intercept) -4.172 0.376 -4.910 -4.172 -3.434 -4.172 0\nxNonmaintained 0.983 0.482 0.038 0.983 1.929 0.983 0\n\n is computed", + "text": "25.2 模型拟合\nCox 比例风险回归模型与 Box-Cox 变换 (Box 和 Cox 1964)\n\n\nsurvival::coxph() Cox 
比例风险回归模型\n\nMASS::boxcox() Box-Cox 变换\nglmnet::glmnet(family = \"cox\")\nINLA 包的函数 inla() 与 inla.surv() 一起拟合,链接\n\n\nsurvstan Stan 与生存分析\nrstanarm 包的函数 stan_jm() 使用说明 Estimating Joint Models for Longitudinal and Time-to-Event Data with rstanarm 链接\n\nrstanarm 包的生存分析分支\n\n\n\n25.2.1 survival\nR 软件内置了 survival 包,它是实现生存分析的核心 R 包 (Terry M. Therneau 和 Patricia M. Grambsch 2000),其函数 survfit() 拟合模型。\n\naml_survival <- survfit(Surv(time, status) ~ x, data = aml)\nsummary(aml_survival)\n\nCall: survfit(formula = Surv(time, status) ~ x, data = aml)\n\n x=Maintained \n time n.risk n.event survival std.err lower 95% CI upper 95% CI\n 9 11 1 0.909 0.0867 0.7541 1.000\n 13 10 1 0.818 0.1163 0.6192 1.000\n 18 8 1 0.716 0.1397 0.4884 1.000\n 23 7 1 0.614 0.1526 0.3769 0.999\n 31 5 1 0.491 0.1642 0.2549 0.946\n 34 4 1 0.368 0.1627 0.1549 0.875\n 48 2 1 0.184 0.1535 0.0359 0.944\n\n x=Nonmaintained \n time n.risk n.event survival std.err lower 95% CI upper 95% CI\n 5 12 2 0.8333 0.1076 0.6470 1.000\n 8 10 2 0.6667 0.1361 0.4468 0.995\n 12 8 1 0.5833 0.1423 0.3616 0.941\n 23 6 1 0.4861 0.1481 0.2675 0.883\n 27 5 1 0.3889 0.1470 0.1854 0.816\n 30 4 1 0.2917 0.1387 0.1148 0.741\n 33 3 1 0.1944 0.1219 0.0569 0.664\n 43 2 1 0.0972 0.0919 0.0153 0.620\n 45 1 1 0.0000 NaN NA NA\n\n\n拟合 Cox 比例风险回归模型(Cox Proportional Hazards Regression Model)\n\naml_coxph <- coxph(Surv(time, status) ~ 1 + x, data = aml)\nsummary(aml_coxph)\n\nCall:\ncoxph(formula = Surv(time, status) ~ 1 + x, data = aml)\n\n n= 23, number of events= 18 \n\n coef exp(coef) se(coef) z Pr(>|z|) \nxNonmaintained 0.9155 2.4981 0.5119 1.788 0.0737 .\n---\nSignif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 
0.1 ' ' 1\n\n exp(coef) exp(-coef) lower .95 upper .95\nxNonmaintained 2.498 0.4003 0.9159 6.813\n\nConcordance= 0.619 (se = 0.063 )\nLikelihood ratio test= 3.38 on 1 df, p=0.07\nWald test = 3.2 on 1 df, p=0.07\nScore (logrank) test = 3.42 on 1 df, p=0.06\n\n\n展示拟合结果。可以绘制生存分析的图的 R 包有很多,比如 ggfortify 包、ggsurvfit 包和 survminer 包等。ggfortify 包可以直接针对函数 survfit() 的返回对象绘图,ggsurvfit 包提供新函数 survfit2() 拟合模型、函数 ggsurvfit() 绘制图形,画面内容更加丰富,而 survminer 包依赖很多。\n\nlibrary(ggplot2)\nlibrary(ggfortify)\nautoplot(aml_survival, data = aml) +\n theme_minimal()\n\n\n\n\n\n\n图 25.2: 急性粒细胞白血病生存数据\n\n\n\n\n参数化的生存分析模型(参数模型,相对于非参数模型而言)\n\naml_surv_reg <- survreg(Surv(time, status) ~ x, data = aml, dist = \"weibull\")\nsummary(aml_surv_reg)\n\n\nCall:\nsurvreg(formula = Surv(time, status) ~ x, data = aml, dist = \"weibull\")\n Value Std. Error z p\n(Intercept) 4.109 0.300 13.70 <2e-16\nxNonmaintained -0.929 0.383 -2.43 0.015\nLog(scale) -0.235 0.178 -1.32 0.188\n\nScale= 0.791 \n\nWeibull distribution\nLoglik(model)= -80.5 Loglik(intercept only)= -83.2\n Chisq= 5.31 on 1 degrees of freedom, p= 0.021 \nNumber of Newton-Raphson Iterations: 5 \nn= 23 \n\n\n\n25.2.2 glmnet\nglmnet 包拟合 Cox 比例风险回归模型 (Simon 等 2011) 适合需要多变量筛选的情况。\n\nlibrary(glmnet)\n# alpha = 1 lasso\naml_glmnet <- glmnet(x = aml$x, y = Surv(aml$time, aml$status), family = \"cox\", alpha = 1)\naml_glmnet_cv <- cv.glmnet(x = aml$x, y = Surv(aml$time, aml$status), family = \"cox\", alpha = 1)\n\n\n25.2.3 INLA\nINLA 包拟合 Cox 比例风险回归模型 (Gómez-Rubio 2020) 采用近似贝叶斯推断。\n\nlibrary(INLA)\ninla.setOption(short.summary = TRUE)\naml_inla <- inla(inla.surv(time, status) ~ x, data = aml, family = \"exponential.surv\", num.threads = \"1:1\")\nsummary(aml_inla)\n\nFixed effects:\n mean sd 0.025quant 0.5quant 0.975quant mode kld\n(Intercept) -4.173 0.378 -4.913 -4.173 -3.432 -4.173 0\nxNonmaintained 0.984 0.483 0.036 0.984 1.931 0.984 0\n\n is computed", "crumbs": [ "数据建模", "25  生存数据分析" @@ -1949,7 +1949,7 @@ "href": "analyze-areal-data.html#美国各州犯罪率分析", 
"title": "29  区域数据分析", "section": "\n29.2 美国各州犯罪率分析", - "text": "29.2 美国各州犯罪率分析\n响应变量服从高斯分布的调查数据 (Bivand 2001)\n数据集 USArrests 记录 1973 年美国各州每 10 万居民中因谋杀 Murder、袭击 Assault 和强奸 Rape 被警察逮捕的人数以及城市人口所占百分比(可以看作城市化率)。\n\n\n\n表格 29.1: 数据集 USArrests(部分)\n\n\n\n\n州名\n区域划分\n谋杀犯\n袭击犯\n城市化率\n强奸犯\n\n\n\nAlabama\nSouth\n13.2\n236\n58\n21.2\n\n\nAlaska\nWest\n10.0\n263\n48\n44.5\n\n\nArizona\nWest\n8.1\n294\n80\n31.0\n\n\nArkansas\nSouth\n8.8\n190\n50\n19.5\n\n\nCalifornia\nWest\n9.0\n276\n91\n40.6\n\n\nColorado\nWest\n7.9\n204\n78\n38.7\n\n\n\n\n\n\n\n\n\nlibrary(sf)\n# 州数据\nus_state_sf <- readRDS(\"data/us-state-map-2010.rds\")\n# 观测数据\nus_state_df <- merge(x = us_state_sf, y = us_arrests,\n by.x = \"NAME\", by.y = \"state_name\", all.x = TRUE)\n\nggplot() +\n geom_sf(\n data = us_state_df, aes(fill = Assault), color = \"gray80\", lwd = 0.25) +\n scale_fill_viridis_c(option = \"plasma\", na.value = \"white\") +\n theme_void()\n\n\n\n\n\n\n图 29.2: 因袭击被逮捕的人数分布\n\n\n\n\n1973 年美国各州因袭击被逮捕的人数与城市化率的关系:相关分析\n\n代码library(ggrepel)\nggplot(data = us_arrests, aes(x = UrbanPop, y = Assault)) +\n geom_point(aes(color = state_region)) +\n geom_text_repel(aes(label = state_name), size = 3, seed = 2022) +\n theme_classic() +\n labs(x = \"城市化率(%)\", y = \"因袭击被逮捕人数\", color = \"区域划分\")\n\n\n\n\n\n\n图 29.3: 逮捕人数比例与城市化率的关系\n\n\n\n\n阿拉斯加州和夏威夷州与其它州都不相连,属于孤立的情况,下面在空间相关性的分析中排除这两个州。\n\n# 州的中心\ncenters48 <- subset(\n x = data.frame(x = state.center$x, y = state.center$y),\n subset = !state.name %in% c(\"Alaska\", \"Hawaii\")\n)\n# 观测数据\narrests48 <- subset(\n x = USArrests,\n subset = !rownames(USArrests) %in% c(\"Alaska\", \"Hawaii\")\n)\n\n\nlibrary(spData)\nlibrary(spdep)\n# KNN\nk4.48 <- knn2nb(knearneigh(as.matrix(centers48), k = 4))\n# Moran I test\nmoran.test(x = arrests48$Assault, listw = nb2listw(k4.48))\n\n\n Moran I test under randomisation\n\ndata: arrests48$Assault \nweights: nb2listw(k4.48) \n\nMoran I statistic standard deviate = 3.4216, p-value = 0.0003113\nalternative hypothesis: 
greater\nsample estimates:\nMoran I statistic Expectation Variance \n 0.294385644 -0.021276596 0.008511253 \n\n# Permutation test for Moran's I statistic\nmoran.mc(x = arrests48$Assault, listw = nb2listw(k4.48), nsim = 499)\n\n\n Monte-Carlo simulation of Moran I\n\ndata: arrests48$Assault \nweights: nb2listw(k4.48) \nnumber of simulations + 1: 500 \n\nstatistic = 0.29439, observed rank = 500, p-value = 0.002\nalternative hypothesis: greater\n\n\n\n\n\n\nBivand, Roger. 2001. 《More on Spatial Data Analysis》. R News 1 (3): 13–17. https://www.r-project.org/doc/Rnews/Rnews_2001-3.pdf.\n\n\nBlangiardo, Marta, Michela Cameletti, Gianluca Baio, 和 Håvard Rue. 2013. 《Spatial and spatio-temporal models with R-INLA》. Spatial and Spatio-temporal Epidemiology 7 (十二月): 39–55. https://doi.org/10.1016/j.sste.2013.07.003.\n\n\nCabral, Rafael, David Bolin, 和 Håvard Rue. 2022. 《Controlling the Flexibility of Non-Gaussian Processes Through Shrinkage Priors》. Bayesian Analysis -1 (-1): 1–24. https://doi.org/10.1214/22-BA1342.\n\n\nDonegan, Connor. 2022. 《geostan: An R package for Bayesian spatialanalysis》. Journal of Open Source Software 7 (79): 4716. https://doi.org/10.21105/joss.04716.\n\n\nMoraga, Paula. 2020. Geospatial health data: modeling and visualization with R-INLA and Shiny. Boca Raton, Florida: Chapman; Hall/CRC. https://www.paulamoraga.com/book-geospatial/.\n\n\nMorris, Mitzi, Katherine Wheeler-Martin, Dan Simpson, Stephen J. Mooney, Andrew Gelman, 和 Charles DiMaggio. 2019. 《Bayesian hierarchical spatial models: Implementing the Besag York Mollié model in stan》. Spatial and Spatio-temporal Epidemiology 31 (十一月): 100301. https://doi.org/10.1016/j.sste.2019.100301.\n\n\nTobler, Waldo. 1970. 《A computer movie simulating urban growth in the Detroit region》. Economic Geography 46 (Supplement): 234–40. 
https://doi.org/10.2307/143141.", + "text": "29.2 美国各州犯罪率分析\n响应变量服从高斯分布的调查数据 (Bivand 2001)\n数据集 USArrests 记录 1973 年美国各州每 10 万居民中因谋杀 Murder、袭击 Assault 和强奸 Rape 被警察逮捕的人数以及城市人口所占百分比(可以看作城市化率)。\n\n\n\n表格 29.1: 数据集 USArrests(部分)\n\n\n\n\n州名\n区域划分\n谋杀犯\n袭击犯\n城市化率\n强奸犯\n\n\n\nAlabama\nSouth\n13.2\n236\n58\n21.2\n\n\nAlaska\nWest\n10.0\n263\n48\n44.5\n\n\nArizona\nWest\n8.1\n294\n80\n31.0\n\n\nArkansas\nSouth\n8.8\n190\n50\n19.5\n\n\nCalifornia\nWest\n9.0\n276\n91\n40.6\n\n\nColorado\nWest\n7.9\n204\n78\n38.7\n\n\n\n\n\n\n\n\n\nlibrary(sf)\n# 州数据\nus_state_sf <- readRDS(\"data/us-state-map-2010.rds\")\n# 观测数据\nus_state_df <- merge(x = us_state_sf, y = us_arrests,\n by.x = \"NAME\", by.y = \"state_name\", all.x = TRUE)\n\nggplot() +\n geom_sf(\n data = us_state_df, aes(fill = Assault), color = \"gray80\", lwd = 0.25) +\n scale_fill_viridis_c(option = \"plasma\", na.value = \"white\") +\n theme_void()\n\n\n\n\n\n\n图 29.2: 因袭击被逮捕的人数分布\n\n\n\n\n1973 年美国各州因袭击被逮捕的人数与城市化率的关系:相关分析\n\n代码library(ggrepel)\nggplot(data = us_arrests, aes(x = UrbanPop, y = Assault)) +\n geom_point(aes(color = state_region)) +\n geom_text_repel(aes(label = state_name), size = 3, seed = 2022) +\n theme_classic() +\n labs(x = \"城市化率(%)\", y = \"因袭击被逮捕人数\", color = \"区域划分\")\n\n\n\n\n\n\n图 29.3: 逮捕人数比例与城市化率的关系\n\n\n\n\n阿拉斯加州和夏威夷州与其它州都不相连,属于孤立的情况,下面在空间相关性的分析中排除这两个州。\n\n# 州的中心\ncenters48 <- subset(\n x = data.frame(x = state.center$x, y = state.center$y),\n subset = !state.name %in% c(\"Alaska\", \"Hawaii\")\n)\n# 观测数据\narrests48 <- subset(\n x = USArrests,\n subset = !rownames(USArrests) %in% c(\"Alaska\", \"Hawaii\")\n)\n\n\nlibrary(spData)\nlibrary(spdep)\n# KNN\nk4.48 <- knn2nb(knearneigh(as.matrix(centers48), k = 4))\n# Moran I test\nmoran.test(x = arrests48$Assault, listw = nb2listw(k4.48))\n\n\n Moran I test under randomisation\n\ndata: arrests48$Assault \nweights: nb2listw(k4.48) \n\nMoran I statistic standard deviate = 3.4216, p-value = 0.0003113\nalternative hypothesis: greater\nsample 
estimates:\nMoran I statistic Expectation Variance \n 0.294385644 -0.021276596 0.008511253 \n\n# Permutation test for Moran's I statistic\nmoran.mc(x = arrests48$Assault, listw = nb2listw(k4.48), nsim = 499)\n\n\n Monte-Carlo simulation of Moran I\n\ndata: arrests48$Assault \nweights: nb2listw(k4.48) \nnumber of simulations + 1: 500 \n\nstatistic = 0.29439, observed rank = 499, p-value = 0.002\nalternative hypothesis: greater\n\n\n\n\n\n\nBivand, Roger. 2001. 《More on Spatial Data Analysis》. R News 1 (3): 13–17. https://www.r-project.org/doc/Rnews/Rnews_2001-3.pdf.\n\n\nBlangiardo, Marta, Michela Cameletti, Gianluca Baio, 和 Håvard Rue. 2013. 《Spatial and spatio-temporal models with R-INLA》. Spatial and Spatio-temporal Epidemiology 7 (十二月): 39–55. https://doi.org/10.1016/j.sste.2013.07.003.\n\n\nCabral, Rafael, David Bolin, 和 Håvard Rue. 2022. 《Controlling the Flexibility of Non-Gaussian Processes Through Shrinkage Priors》. Bayesian Analysis -1 (-1): 1–24. https://doi.org/10.1214/22-BA1342.\n\n\nDonegan, Connor. 2022. 《geostan: An R package for Bayesian spatialanalysis》. Journal of Open Source Software 7 (79): 4716. https://doi.org/10.21105/joss.04716.\n\n\nMoraga, Paula. 2020. Geospatial health data: modeling and visualization with R-INLA and Shiny. Boca Raton, Florida: Chapman; Hall/CRC. https://www.paulamoraga.com/book-geospatial/.\n\n\nMorris, Mitzi, Katherine Wheeler-Martin, Dan Simpson, Stephen J. Mooney, Andrew Gelman, 和 Charles DiMaggio. 2019. 《Bayesian hierarchical spatial models: Implementing the Besag York Mollié model in stan》. Spatial and Spatio-temporal Epidemiology 31 (十一月): 100301. https://doi.org/10.1016/j.sste.2019.100301.\n\n\nTobler, Waldo. 1970. 《A computer movie simulating urban growth in the Detroit region》. Economic Geography 46 (Supplement): 234–40. 
https://doi.org/10.2307/143141.", "crumbs": [ "空间分析", "29  区域数据分析" @@ -2048,7 +2048,7 @@ "href": "numerical-optimization.html#sec-nonlinear-optimization", "title": "31  数值优化", "section": "\n31.4 非线性优化", - "text": "31.4 非线性优化\n非线性优化按是否带有约束,以及约束是线性还是非线性,分为无约束优化、箱式约束优化、线性约束优化和非线性约束优化。箱式约束可看作是线性约束的特殊情况。\n\nR 软件内置的非线性优化函数\n\n\nnlm()\nnlminb()\nconstrOptim()\noptim()\n\n\n\n无约束\n支持\n支持\n不支持\n支持\n\n\n箱式约束\n不支持\n支持\n支持\n支持\n\n\n线性约束\n不支持\n不支持\n支持\n不支持\n\n\n\nR 软件内置的 stats 包有 4 个数值优化方面的函数,函数 nlm() 可求解无约束优化问题,函数 nlminb() 可求解无约束、箱式约束优化问题,函数 constrOptim() 可求解箱式和线性约束优化。函数 optim() 是通用型求解器,包含多个优化算法,可求解无约束、箱式约束优化问题。尽管这些函数在 R 语言中长期存在,在统计中有广泛的使用,如非线性最小二乘 stats::nls(),极大似然估计 stats4::mle() 和广义最小二乘估计 nlme::gls() 等。但是,这些优化函数的求解能力有重合,使用语法不尽相同,对于非线性约束无能为力,下面仍然主要使用 ROI 包来求解多维非线性优化问题。\n\n31.4.1 一元非线性优化\n求如下一维分段非线性函数的最小值,其函数图像见 图 31.5 ,这个函数是不连续的,更不光滑。\n\\[\nf(x) =\n\\begin{cases}\n10 & x \\in (-\\infty,-1] \\\\\n\\exp(-\\frac{1}{|x-1|}) & x \\in (-1,4) \\\\\n10 & x \\in [4, +\\infty)\n\\end{cases}\n\\]\n\nfn <- function(x) ifelse(x > -1, ifelse(x < 4, exp(-1 / abs(x - 1)), 10), 10)\n\n\n代码op <- par(mar = c(4, 4, 0.5, 0.5))\ncurve(\n expr = fn, from = -2, to = 5, lwd = 2,\n panel.first = grid(),\n xlab = \"$x$\", ylab = \"$f(x)$\"\n)\non.exit(par(op), add = TRUE)\n\n\n\n\n\n\n图 31.5: 一维函数图像\n\n\n\n\n函数 optimize() 可以求解一元函数的极值问题,默认求极小值,参数 f 表示目标函数,参数 interval 表示搜索在此区间内最小值。函数返回一个列表,元素 minimum 表示极小值点,objective 表示极值点对应的目标函数值。\n\noptimize(f = fn, interval = c(-4, 20), maximum = FALSE)\n\n#> $minimum\n#> [1] 19.99995\n#> \n#> $objective\n#> [1] 10\n\noptimize(f = fn, interval = c(-7, 20), maximum = FALSE)\n\n#> $minimum\n#> [1] 0.9992797\n#> \n#> $objective\n#> [1] 0\n\n\n值得注意,对于不连续的分段函数,在不同的区间内搜索极值,可能获得不同的结果,可以绘制函数图像帮助选择最小值。\n\n31.4.2 多元隐函数优化\n这个优化问题来自 1stOpt 软件的帮助文档,下面利用 R 语言来求该多元隐函数的极值。\n\\[\n\\begin{aligned}\n\\min_{\\boldsymbol{x}} y = & ~\\sin\\Big((yx_1 -0.5)^2 + 2x_1 x_2^2 - \\frac{y}{10} \\Big)\\cdot \\\\\n&~\\exp\\Big(-\\Big( \\big(x_1 - 0.5 -\\exp(-x_2 + y)\\big)^2 + x_2^2 - \\frac{y}{5} 
+ 3 \\Big)\\Big)\n\\end{aligned}\n\\]\n其中, \\(x_1 \\in [-1,7],x_2 \\in [-2,2]\\) 。\n对于隐函数 \\(f(x_1,x_2,y)=0\\) ,常规的做法是先计算隐函数的偏导数,并令偏导数为 0,再求解非线性方程组,得到各个驻点,最后,将驻点代入原方程,比较驻点处函数值,根据优化目标选择最大或最小值。\n\\[\n\\begin{aligned}\n\\frac{\\partial f(x_1,x_2,y)}{\\partial x_1} = 0 \\\\\n\\frac{\\partial f(x_1,x_2,y)}{\\partial x_2} = 0\n\\end{aligned}\n\\]\n如果目标函数很复杂,隐函数偏导数难以计算,可以考虑暴力网格搜索。先估计隐函数值 \\(z\\) 的大致范围,给定 \\(x,y\\) 时,计算一元非线性方程的根。\n\nfn <- function(m) {\n subfun <- function(x) {\n f1 <- (m[1] * x - 0.5)^2 + 2 * m[1] * m[2]^2 - x / 10\n f2 <- -((m[1] - 0.5 - exp(-m[2] + x))^2 + m[2]^2 - x / 5 + 3)\n x - sin(f1) * exp(f2)\n }\n uniroot(f = subfun, interval = c(-1, 1))$root\n}\n\n在位置 \\((1,2)\\) 处函数值为 0.0007368468。\n\n# 测试函数 fn\nfn(m = c(1, 2))\n\n#> [1] 0.0007368468\n\n\n将目标区域网格化,通过一元非线性方程求根的方式获得每个格点处的函数值。\n\ndf <- expand.grid(\n x1 = seq(from = -1, to = 7, length.out = 81),\n x2 = seq(from = -2, to = 2, length.out = 41)\n)\n# 计算格点处的函数值\ndf$fn <- apply(df, 1, FUN = fn)\n\n在此基础上,绘制隐函数图像,如 图 31.6 所示,可以获得关于隐函数的大致情况。\n\n代码# 绘图\nwireframe(\n data = df, fn ~ x1 * x2,\n shade = TRUE, drape = FALSE,\n xlab = expression(x[1]), ylab = expression(x[2]),\n zlab = list(expression(\n italic(f) ~ group(\"(\", list(x[1], x[2]), \")\")\n ), rot = 90),\n scales = list(arrows = FALSE, col = \"black\"),\n shade.colors.palette = custom_palette,\n # 减少三维图形的边空\n lattice.options = list(\n layout.widths = list(\n left.padding = list(x = -0.5, units = \"inches\"),\n right.padding = list(x = -1.0, units = \"inches\")\n ),\n layout.heights = list(\n bottom.padding = list(x = -1.5, units = \"inches\"),\n top.padding = list(x = -1.5, units = \"inches\")\n )\n ),\n par.settings = list(axis.line = list(col = \"transparent\")),\n screen = list(z = 30, x = -65, y = 0)\n)\n\n\n\n\n\n\n图 31.6: 隐函数图像\n\n\n\n\n最后,获得暴力网格搜索的结果,目标函数在 \\((2.8,-0.9)\\) 处取得最小值 \\(-0.02159723\\)。总的来说,这是一个近似结果,如果进一步缩小搜索区域,将网格划分得越细,搜索的结果将越接近全局最小值。\n\ndf[df$fn == min(df$fn), ]\n\n#> x1 x2 fn\n#> 930 2.8 -0.9 
-0.02159723\n\n\n将求隐函数极值的问题转为含非线性等式约束的非线性优化问题。\n\\[\n\\begin{aligned}\n\\min_{\\boldsymbol{x}} \\quad & y \\\\\n\\text{s.t.} \\quad & f(x_1,x_2,y) = 0\n\\end{aligned}\n\\]\n由于等式约束非常复杂,手动计算等式约束的雅可比矩阵不可行,可以用 numDeriv 包的函数 jacobian() 计算等式约束的雅可比矩阵。考虑到本例中仅含有一个等式约束,雅可比矩阵退化为梯度向量,这可以用 numDeriv 包的另一个函数 grad() 计算。\n\n# 等式约束\nheq <- function(x) {\n f1 <- (x[1] * x[3] - 0.5)^2 + 2 * x[1] * x[2]^2 - x[3] / 10\n f2 <- (x[1] - 0.5 - exp(-x[2] + x[3]))^2 + x[2]^2 - x[3] / 5 + 3\n x[3] - sin(f1) * exp(-f2)\n}\n# 等式约束的梯度\nheq.jac <- function(x) {\n numDeriv::grad(func = heq, x = x)\n}\n\n函数 L_objective() 表示含 1 个决策变量的线性目标函数,函数 F_constraint() 表示非线性等式约束。\n\n# 定义优化问题\nop <- OP(\n objective = L_objective(L = c(0, 0, 1)),\n constraints = F_constraint(\n # 等式约束\n F = list(heq = heq),\n dir = \"==\",\n rhs = 0,\n # 等式约束的雅可比\n J = list(heq.jac = heq.jac)\n ),\n bounds = V_bound(\n ld = -Inf, ud = Inf,\n li = c(1, 2), ui = c(1, 2),\n lb = c(-1, -2), ub = c(7, 2),\n nobj = 3L\n ),\n maximum = FALSE # 求最小\n)\nop\n\n#> ROI Optimization Problem:\n#> \n#> Minimize a linear objective function of length 3 with\n#> - 3 continuous objective variables,\n#> \n#> subject to\n#> - 1 constraint of type nonlinear.\n#> - 3 lower and 2 upper non-standard variable bounds.\n\n\n将网格搜索的结果作为初值,继续寻找更优的目标函数值。\n\nnlp <- ROI_solve(op,\n solver = \"nloptr.slsqp\", start = c(2.8, -0.9, -0.02159723)\n)\n# 最优解\nnlp$solution\n\n#> [1] 2.89826224 -0.85731584 -0.02335409\n\n# 目标函数值\nnlp$objval\n\n#> [1] -0.02335409\n\n\n可以发现,更优的目标函数值 \\(-0.02335\\) 在 \\((2.898,-0.8573)\\) 取得。\n\n31.4.3 多元无约束优化\n\n31.4.3.1 示例 1\nRastrigin 函数是一个 \\(n\\) 维优化问题测试函数。\n\\[\n\\min_{\\boldsymbol{x}} \\sum_{i=1}^{n}\\big(x_i^2 - 10 \\cos(2\\pi x_i) + 10\\big)\n\\]\n计算函数值的 R 代码如下:\n\nfn <- function(x) {\n sum(x^2 - 10 * cos(2 * pi * x) + 10)\n}\n\n绘制二维情形下的 Rastrigin 函数图像,如 图 31.7 所示,这是一个多模态的函数,有许多局部极小值。如果采用 BFGS 算法寻优容易陷入局部极值点。\n\n代码df <- expand.grid(\n x = seq(-4, 4, length.out = 151),\n y = seq(-4, 4, length.out = 151)\n)\n\ndf$fnxy <- apply(df, 1, 
fn)\nwireframe(\n data = df, fnxy ~ x * y,\n shade = TRUE, drape = FALSE,\n xlab = expression(x[1]),\n ylab = expression(x[2]),\n zlab = list(expression(\n italic(f) ~ group(\"(\", list(x[1], x[2]), \")\")\n ), rot = 90),\n scales = list(arrows = FALSE, col = \"black\"),\n shade.colors.palette = custom_palette,\n # 减少三维图形的边空\n lattice.options = list(\n layout.widths = list(\n left.padding = list(x = -0.5, units = \"inches\"),\n right.padding = list(x = -1.0, units = \"inches\")\n ),\n layout.heights = list(\n bottom.padding = list(x = -1.5, units = \"inches\"),\n top.padding = list(x = -1.5, units = \"inches\")\n )\n ),\n par.settings = list(axis.line = list(col = \"transparent\")),\n screen = list(z = 30, x = -65, y = 0)\n)\n\n\n\n\n\n\n图 31.7: 二维 Rastrigin 函数图像\n\n\n\n\n不失一般性,考虑函数维数 \\(n=20\\) ,决策变量 \\(x_i \\in [-50,50], i = 1,2,\\ldots,n\\) 的情况。\n\nop <- OP(\n objective = F_objective(fn, n = 20L),\n bounds = V_bound(ld = -50, ud = 50, nobj = 20L)\n)\n\n调全局优化器求解优化问题。\n\nnlp <- ROI_solve(op, solver = \"nloptr.directL\")\n# 最优解\nnlp$solution\n\n#> [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n\n# 目标函数值\nnlp$objval\n\n#> [1] 0\n\n\n\n代码# R 语言内置的非线性优化函数\n# 无约束\nnlm(f = fn, p = rep(1, 20))\noptim(par = rep(1, 20), fn = fn, method = \"BFGS\")\noptim(par = rep(1, 20), fn = fn, method = \"Nelder-Mead\")\n\n# 箱式约束\noptim(par = rep(1, 20), fn = fn, \n lower = -50, upper = 50, method = \"L-BFGS-B\")\nnlminb(start = rep(1, 20), objective = fn, lower = -50, upper = 50)\nconstrOptim(\n theta = rep(1, 20), f = fn, grad = NULL,\n ui = rbind(diag(rep(1, 20)), diag(rep(-1, 20))),\n ci = c(rep(-50, 20), rep(-50, 20))\n)\n\n\n\n31.4.3.2 示例 2\n下面这个优化问题来自 1stOpt 软件帮助手册,是一个无约束非线性优化问题,它的目标函数非常复杂,一般的求解器都无法求解。最优解在 \\((7.999982, 7.999982)\\) 取得,目标函数值为 -7.978832。\n\\[\n\\begin{aligned}\n & \\min_{\\boldsymbol{x}} ~ \\cos(x_1)\\cos(x_2) - \\sum_{i=1}^{5}\\Big( (-1)^i \\cdot i \\cdot 2 \\cdot \\exp\\big(-500 \\cdot ( (x_1 - i \\cdot 2)^2 + (x_2 - i\\cdot 2)^2 ) \\big) 
\\Big)\n\\end{aligned}\n\\]\n目标函数分两步计算,先计算累加部分的通项,然后代入计算目标函数。\n\nsubfun <- function(i, m) {\n (-1)^i * i * 2 * exp(-500 * ((m[1] - i * 2)^2 + (m[2] - i * 2)^2))\n}\nfn <- function(x) {\n cos(x[1]) * cos(x[2]) -\n sum(mapply(FUN = subfun, i = 1:5, MoreArgs = list(m = x)))\n}\n\n直观起见,绘制目标函数在区域 \\([-50, 50] \\times [-50, 50]\\) 内的图像,如 图 31.8 (a) 所示,可以看到几乎没有变化的梯度,给寻优过程带来很大困难。再将区域 \\([0, 12] \\times [0, 12]\\) 上的三维图像绘制出来,如 图 31.8 (b) 所示,可见,有不少局部陷阱,且分布在 \\(x_2 = x_1\\) 的直线上。\n代码df <- expand.grid(\n x = seq(-50, 50, length.out = 101),\n y = seq(-50, 50, length.out = 101)\n)\ndf$fnxy <- apply(df, 1, fn)\nwireframe(\n data = df, fnxy ~ x * y,\n shade = TRUE, drape = FALSE,\n xlab = expression(x[1]),\n ylab = expression(x[2]),\n zlab = list(expression(\n italic(f) ~ group(\"(\", list(x[1], x[2]), \")\")\n ), rot = 90),\n scales = list(arrows = FALSE, col = \"black\"),\n shade.colors.palette = custom_palette,\n # 减少三维图形的边空\n lattice.options = list(\n layout.widths = list(\n left.padding = list(x = -0.5, units = \"inches\"),\n right.padding = list(x = -1.0, units = \"inches\")\n ),\n layout.heights = list(\n bottom.padding = list(x = -1.5, units = \"inches\"),\n top.padding = list(x = -1.5, units = \"inches\")\n )\n ),\n par.settings = list(axis.line = list(col = \"transparent\")),\n screen = list(z = 30, x = -65, y = 0)\n)\n\ndf <- expand.grid(\n x = seq(0, 12, length.out = 151),\n y = seq(0, 12, length.out = 151)\n)\ndf$fnxy <- apply(df, 1, fn)\nwireframe(\n data = df, fnxy ~ x * y,\n shade = TRUE, drape = FALSE,\n xlab = expression(x[1]),\n ylab = expression(x[2]),\n zlab = list(expression(\n italic(f) ~ group(\"(\", list(x[1], x[2]), \")\")\n ), rot = 90), alpha = 0.75, \n scales = list(arrows = FALSE, col = \"black\"),\n shade.colors.palette = custom_palette,\n # 减少三维图形的边空\n lattice.options = list(\n layout.widths = list(\n left.padding = list(x = -0.5, units = \"inches\"),\n right.padding = list(x = -1.0, units = \"inches\")\n ),\n layout.heights = list(\n bottom.padding 
= list(x = -1.5, units = \"inches\"),\n top.padding = list(x = -1.5, units = \"inches\")\n )\n ),\n par.settings = list(axis.line = list(col = \"transparent\")),\n screen = list(z = 30, x = -65, y = 0)\n)\n\n\n\n\n\n\n\n\n\n(a) 区域 \\([-50,50]\\times[-50,50]\\) 内的函数图像\n\n\n\n\n\n\n\n\n\n\n\n(b) 区域 \\([0,12]\\times[0,12]\\) 内的函数图像\n\n\n\n\n\n\n图 31.8: 局部放大前后的函数图像\n\n\n不失一般性,下面考虑 \\(x_1,x_2 \\in [-50,50]\\) ,面对如此复杂的函数,调用全局优化器 nloptr.directL 寻优。\n\nop <- OP(\n objective = F_objective(fn, n = 2L),\n bounds = V_bound(ld = -50, ud = 50, nobj = 2L)\n)\nnlp <- ROI_solve(op, solver = \"nloptr.directL\")\nnlp$solution\n\n#> [1] 0.00000 22.22222\n\nnlp$objval\n\n#> [1] -0.9734211\n\n\n结果还是陷入局部最优解。运筹优化方面的商业软件,著名的有 Lingo 和 Matlab,下面采用 Lingo 20 求解,Lingo 代码如下:\nSETS:\nP/1..5/;\nEndsets\nMin=@cos(x1) * @cos(x2) - @Sum(P(j): (-1)^j * j * 2 * @exp(-500 * ((x1 - j * 2)^2 + (x2 - j * 2)^2)));\n@Bnd(-50, x1, 50);\n@Bnd(-50, x2, 50);\n启用全局优化求解器后,在 \\((x_1 = 7.999982, x_2 = 7.999982)\\) 取得最小值 -7.978832。而默认未启用全局优化求解器的情况下,在 \\((x_1 = 18.84956, x_2 = -40.84070)\\) 取得局部极小值 -1.000000。\n在这种情况下,数值优化算法遇到瓶颈,可以采用一些全局随机优化算法,比如 GA 包 (Scrucca 2013) 实现的遗传算法。经过对参数的一些调优,可以获得与商业软件几乎一样的结果。\n\nnlp <- GA::ga(\n type = \"real-valued\",\n fitness = function(x) -fn(x),\n lower = c(0, 0), upper = c(12, 12),\n popSize = 500, maxiter = 100, \n monitor = FALSE, seed = 20232023\n)\n# 最优解\nnlp@solution\n\n#> x1 x2\n#> [1,] 7.999982 7.999981\n\n# 目标函数值\nnlp@fitnessValue\n\n#> [1] 7.978832\n\n\n其中,参数 type 指定决策变量的类型,type = \"real-valued\" 表示目标函数中的决策变量是实值连续的,参数 fitness 是目标函数,函数 ga() 对目标函数求极大,所以,对当前优化问题,添加了一个负号。 参数 popSize 控制种群大小,值越大,运行时间越长,搜索范围越广,获得的全局优化解越好。对于复杂的优化问题,可以不断增加种群大小来寻优,直至增加种群大小也不能获得更好的解。参数 maxiter 控制种群进化的次数,值越大,搜索次数可以越多,获得的解越好。参数 popSize 的影响大于参数 maxiter ,减少陷入局部最优解(陷阱)的可能。根据已知条件尽可能缩小可行域,以减少种群数量,进而缩短算法迭代时间。\n\n31.4.4 多元箱式约束优化\n有如下带箱式约束的多元非线性优化问题,该示例来自函数 nlminb() 的帮助文档,如果没有箱式约束,全局极小值点在 \\((1,1,\\cdots,1)\\) 处取得。\n\\[\n\\begin{aligned}\n \\min_{\\boldsymbol{x}} \\quad & (x_1 - 1)^2 + 4\\sum_{i =1}^{n -1}(x_{i+1} 
-x_i^2)^2 \\\\\n \\text{s.t.} \\quad & 2 \\leq x_1,x_2,\\cdots,x_n \\leq 4\n\\end{aligned}\n\\]\nR 语言编码的函数代码如下:\n\nfn <- function(x) {\n n <- length(x)\n sum(c(1, rep(4, n - 1)) * (x - c(1, x[-n])^2)^2)\n}\n\n在二维的情形下,可以绘制目标函数的三维图像,见 图 31.9 ,函数曲面和香蕉函数有些相似。\n\n代码dat <- expand.grid(\n x1 = seq(from = 0, to = 4, length.out = 41),\n x2 = seq(from = 0, to = 4, length.out = 41)\n)\ndat$fn <- apply(dat, 1, fn)\n\nwireframe(\n data = dat, fn ~ x1 * x2,\n shade = TRUE, drape = FALSE,\n xlab = expression(x[1]), ylab = expression(x[2]),\n zlab = list(expression(\n italic(f) ~ group(\"(\", list(x[1], x[2]), \")\")\n ), rot = 90),\n scales = list(arrows = FALSE, col = \"black\"),\n shade.colors.palette = custom_palette,\n # 减少三维图形的边空\n lattice.options = list(\n layout.widths = list(\n left.padding = list(x = -0.5, units = \"inches\"),\n right.padding = list(x = -1.0, units = \"inches\")\n ),\n layout.heights = list(\n bottom.padding = list(x = -1.5, units = \"inches\"),\n top.padding = list(x = -1.5, units = \"inches\")\n )\n ),\n par.settings = list(axis.line = list(col = \"transparent\")),\n screen = list(z = 30, x = -65, y = 0)\n)\n\n\n\n\n\n\n图 31.9: 类香蕉函数的曲面图\n\n\n\n\nBase R 有 3 个函数可以求解这个优化问题,分别是 nlminb() 、constrOptim()和optim() ,因此,不妨在这个示例上,用这 3 个函数分别求解该优化问题,介绍它们的用法,最后,介绍 ROI 包实现的方法。这个优化问题的目标函数是 \\(n\\) 维非线性的,不失一般性,又不让问题变得太过简单,下面考虑 25 维的情况,\n\n31.4.4.1 nlminb()\n\n函数 nlminb() 参数 start 指定迭代初始值,参数 objective 指定目标函数,参数 lower 和 upper 分别指定箱式约束中的下界和上界。给定初值 \\((3, 3, \\cdots, 3)\\),下界 \\((2,2,\\cdots,2)\\) 和上界 \\((4,4,\\cdots,4)\\) 。nlminb() 帮助文档说该函数出于历史兼容性的原因尚且存在,一般来说,这个函数会一直维护下去的。\n\nnlminb(\n start = rep(3, 25), objective = fn,\n lower = rep(2, 25), upper = rep(4, 25)\n)\n\n#> $par\n#> [1] 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000\n#> [9] 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000\n#> [17] 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.109093\n#> [25] 4.000000\n#> \n#> $objective\n#> [1] 368.1059\n#> 
\n#> $convergence\n#> [1] 0\n#> \n#> $iterations\n#> [1] 6\n#> \n#> $evaluations\n#> function gradient \n#> 10 177 \n#> \n#> $message\n#> [1] \"relative convergence (4)\"\n\n\n从返回结果来看,求解过程成功收敛,最优解的前 23 个决策变量取值为 2,在箱式约束的边界上,第 24 个分量没有边界上,而在内部,第 25 个决策变量取值为 4,也在边界上。目标函数值为 368.1059。\n\n31.4.4.2 constrOptim()\n\n使用 constrOptim() 函数求解,默认求极小,需将箱式或线性不等式约束写成矩阵形式,即 \\(Ax \\geq b\\) 的形式,参数 ui 是 \\(k \\times n\\) 的约束矩阵 \\(A\\),ci 是右侧 \\(k\\) 维约束向量 \\(b\\)。以上面的优化问题为例,将箱式约束 \\(2 \\leq x_1,x_2 \\leq 4\\) 转化为矩阵形式,约束矩阵和向量分别为:\n\\[\nA = \\begin{bmatrix}\n1 & 0 \\\\\n0 & 1 \\\\\n-1 & 0 \\\\\n0 & -1\n\\end{bmatrix}, \\quad\nb = \\begin{bmatrix}\n2 \\\\\n2 \\\\\n-4 \\\\\n-4\n\\end{bmatrix}\n\\]\n\nconstrOptim(\n theta = rep(3, 25), # 初始值\n f = fn, # 目标函数\n method = \"Nelder-Mead\", # 没有提供梯度,则必须用 Nelder-Mead 方法\n ui = rbind(diag(rep(1, 25)), diag(rep(-1, 25))),\n ci = c(rep(2, 25), rep(-4, 25))\n)\n\n#> $par\n#> [1] 2.006142 2.002260 2.003971 2.003967 2.004143 2.004255 2.001178 2.002990\n#> [9] 2.003883 2.006029 2.017345 2.009236 2.000949 2.007793 2.025831 2.007896\n#> [17] 2.004514 2.004381 2.008771 2.015695 2.005803 2.009127 2.017988 2.257782\n#> [25] 3.999846\n#> \n#> $value\n#> [1] 378.4208\n#> \n#> $counts\n#> function gradient \n#> 12048 NA \n#> \n#> $convergence\n#> [1] 1\n#> \n#> $message\n#> NULL\n#> \n#> $outer.iterations\n#> [1] 25\n#> \n#> $barrier.value\n#> [1] -0.003278963\n\n\n返回结果中 convergence = 1 表示迭代次数到达默认的极限 maxit = 500 。参考函数 nlminb() 的求解结果,可知还没有收敛。如果没有提供梯度,则必须用 Nelder-Mead 方法,下面增加迭代次数到 1000。\n\nconstrOptim(\n theta = rep(3, 25), # 初始值\n f = fn, # 目标函数\n method = \"Nelder-Mead\", \n control = list(maxit = 1000),\n ui = rbind(diag(rep(1, 25)), diag(rep(-1, 25))),\n ci = c(rep(2, 25), rep(-4, 25))\n)\n\n#> $par\n#> [1] 2.000081 2.000142 2.001919 2.000584 2.000007 2.000003 2.001097 2.001600\n#> [9] 2.000207 2.000042 2.000250 2.000295 2.000580 2.002165 2.000453 2.000932\n#> [17] 2.000456 2.000363 2.000418 2.000474 2.009483 2.001156 2.003173 2.241046\n#> [25] 3.990754\n#> 
\n#> $value\n#> [1] 370.8601\n#> \n#> $counts\n#> function gradient \n#> 18036 NA \n#> \n#> $convergence\n#> [1] 1\n#> \n#> $message\n#> NULL\n#> \n#> $outer.iterations\n#> [1] 19\n#> \n#> $barrier.value\n#> [1] -0.003366467\n\n\n结果有改善,目标函数值从 378.4208 减小到 370.8601,但还是没有收敛,可见 Nelder-Mead 方法在这个优化问题上收敛速度比较慢。下面考虑调用基于梯度的 BFGS 优化算法,这得先计算出来目标函数的梯度。\n\n# 输入 n 维向量,输出 n 维向量\ngr <- function(x) {\n n <- length(x)\n c(2 * (x[1] - 2), rep(0, n - 1))\n +8 * c(0, x[-1] - x[-n]^2)\n -16 * c(x[-n], 0) * c(x[-1] - x[-n]^2, 0)\n}\nconstrOptim(\n theta = rep(3, 25), # 初始值\n f = fn, # 目标函数\n grad = gr,\n method = \"BFGS\", \n control = list(maxit = 1000),\n ui = rbind(diag(rep(1, 25)), diag(rep(-1, 25))),\n ci = c(rep(2, 25), rep(-4, 25))\n)\n\n#> $par\n#> [1] 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000\n#> [9] 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000\n#> [17] 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000001\n#> [25] 3.000000\n#> \n#> $value\n#> [1] 373\n#> \n#> $counts\n#> function gradient \n#> 3721 464 \n#> \n#> $convergence\n#> [1] 0\n#> \n#> $message\n#> NULL\n#> \n#> $outer.iterations\n#> [1] 3\n#> \n#> $barrier.value\n#> [1] -0.003327104\n\n\n从结果来看,虽然已经收敛,但相比于 Nelder-Mead 方法,目标函数值变大了,可见已陷入局部最优解。\n\n31.4.4.3 optim()\n\n下面再使用函数 optim() 提供的 L-BFGS-B 算法求解优化问题。\n\noptim(\n par = rep(3, 25), fn = fn, gr = NULL, method = \"L-BFGS-B\",\n lower = rep(2, 25), upper = rep(4, 25)\n)\n\n#> $par\n#> [1] 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000\n#> [9] 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000\n#> [17] 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.109093\n#> [25] 4.000000\n#> \n#> $value\n#> [1] 368.1059\n#> \n#> $counts\n#> function gradient \n#> 6 6 \n#> \n#> $convergence\n#> [1] 0\n#> \n#> $message\n#> [1] \"CONVERGENCE: REL_REDUCTION_OF_F <= FACTR*EPSMCH\"\n\n\n发现结果和函数 nlminb() 的结果差不多了。\n\noptim(\n par = rep(3, 25), fn = fn, gr = 
gr, method = \"L-BFGS-B\",\n lower = rep(2, 25), upper = rep(4, 25)\n)\n\n#> $par\n#> [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3\n#> \n#> $value\n#> [1] 373\n#> \n#> $counts\n#> function gradient \n#> 2 2 \n#> \n#> $convergence\n#> [1] 0\n#> \n#> $message\n#> [1] \"CONVERGENCE: NORM OF PROJECTED GRADIENT <= PGTOL\"\n\n\n然而,当在函数 optim() 里提供梯度信息的时候,虽然目标函数及梯度的计算次数变少了,求解速度提升了,但是最优解反而变差了,最优解和在函数 constrOptim() 中设置 method = \"BFGS\" 算法基本一致。\n\n31.4.4.4 ROI 包\n下面通过 ROI 包,分别调用求解器 nloptr.lbfgs 和 nloptr.directL ,发现前者同样陷入局部最优解,而后者可以获得与 nlminb() 函数一致的结果。\n\nop <- OP(\n objective = F_objective(fn, n = 25L, G = gr),\n bounds = V_bound(ld = 2, ud = 4, nobj = 25L)\n)\nnlp <- ROI_solve(op, solver = \"nloptr.lbfgs\", start = rep(3, 25))\n# 目标函数值\nnlp$objval\n\n#> [1] 373\n\n# 最优解\nnlp$solution\n\n#> [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3\n\n\n调全局优化算法。\n\nnlp <- ROI_solve(op, solver = \"nloptr.directL\")\n# 目标函数值\nnlp$objval\n\n#> [1] 368.1061\n\n# 最优解\nnlp$solution\n\n#> [1] 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000\n#> [9] 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000\n#> [17] 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.109093\n#> [25] 4.000000\n\n\n\n31.4.5 多元线性约束优化\n对于带线性约束的多元非线性优化问题,Base R 提供函数 constrOptim() 来求解,下面的示例来自其帮助文档,这是一个带线性约束的二次规划问题。\n\\[\n\\begin{aligned}\n\\min_{\\boldsymbol{x}}\n\\quad & - \\begin{bmatrix}\n0 \\\\\n5 \\\\\n0\n\\end{bmatrix}^{\\top} \\boldsymbol{x} +\\frac{1}{2} \\boldsymbol{x}^{\\top}\\boldsymbol{x} \\\\\n\\text{s.t.} \\quad & \\begin{bmatrix}\n-4 & 2 & 0 \\\\\n-3 & 1 & -2 \\\\\n0 & 0 & 1\n\\end{bmatrix}^{\\top}\\boldsymbol{x} \\geq \\begin{bmatrix}\n-8 \\\\\n2 \\\\\n0\n\\end{bmatrix}\n\\end{aligned}\n\\]\n\nfQP <- function(x) {\n -sum(c(0, 5, 0) * x) + 0.5 * sum(x * x)\n}\nAmat <- matrix(c(-4, -3, 0, 2, 1, 0, 0, -2, 1),\n ncol = 3, nrow = 3, byrow = FALSE\n)\nbvec <- c(-8, 2, 0)\n# 目标函数的梯度\ngQP <- function(x) {\n -c(0, 5, 0) + x\n}\nconstrOptim(\n 
theta = c(2, -1, -1), \n f = fQP, g = gQP, \n ui = t(Amat), ci = bvec\n)\n\n#> $par\n#> [1] 0.4761908 1.0476188 2.0952376\n#> \n#> $value\n#> [1] -2.380952\n#> \n#> $counts\n#> function gradient \n#> 406 81 \n#> \n#> $convergence\n#> [1] 0\n#> \n#> $message\n#> NULL\n#> \n#> $outer.iterations\n#> [1] 3\n#> \n#> $barrier.value\n#> [1] -0.0006243894\n\n\n在上一节,箱式约束可以看作线性约束的一种特殊情况,ROI 包是支持箱式、线性、二次、锥和非线性约束的。因此,下面给出调用 ROI 包求解上述优化问题的代码。\n\nDmat <- diag(rep(1,3))\ndvec <- c(0, 5, 0)\nop <- OP(\n objective = Q_objective(Q = Dmat, L = -dvec),\n constraints = L_constraint(L = t(Amat), dir = rep(\">=\", 3), rhs = bvec),\n maximum = FALSE\n)\nnlp <- ROI_solve(op, solver = \"nloptr.slsqp\", start = c(0, 1, 2))\n# 最优解\nnlp$solution\n\n#> [1] 0.4761905 1.0476190 2.0952381\n\n# 目标函数值\nnlp$objval\n\n#> [1] -2.380952\n\n\n可见输出结果与函数 constrOptim() 是一致的。\n\n代码# quadprog\nlibrary(quadprog)\nsol <- solve.QP(\n Dmat = Dmat, dvec = dvec, Amat = Amat, bvec = bvec\n)\nsol\n\n\n\n31.4.6 多元非线性约束优化\nnloptr 包的非线性优化能力覆盖开源优化软件 Octave 和 Ipopt 。通过插件包 ROI.plugin.nloptr,ROI 包可以调用 nloptr 包内置的所有求解器,常用的求解器见下表。表中从优化器类型(局部还是全局优化器),支持的约束条件类型(箱式还是非线性),是否需要提供目标函数的梯度、黑塞和约束条件的雅可比矩阵信息等方面归纳各个求解器的能力。\n\n常用的非线性优化求解器\n\n求解器\n类型\n约束\n梯度\n黑塞\n雅可比\n\n\n\nnloptr.lbfgs\n局部\n箱式\n需要\n不需要\n不需要\n\n\nnloptr.slsqp\n局部\n非线性\n需要\n不需要\n需要\n\n\nnloptr.auglag\n局部\n非线性\n需要\n不需要\n需要\n\n\nnloptr.directL\n全局\n箱式\n不需要\n不需要\n不需要\n\n\nnloptr.isres\n全局\n非线性\n不需要\n不需要\n不需要\n\n\n\n\n31.4.6.1 非线性等式约束\n下面这个示例来自 Octave 软件的非线性优化帮助文档,Octave 中的函数 sqp() 使用序列二次优化求解器(successive quadratic programming solver)求解非线性优化问题,示例中该优化问题包含多个非线性等式约束。\n\\[\n\\begin{aligned}\n\\min_{\\boldsymbol{x}} \\quad & \\exp\\big(\\prod_{i=1}^{5} x_i\\big) - \\frac{1}{2}(x_1^3 + x_2^3 + 1)^2 \\\\\n\\text{s.t.} \\quad & \\left\\{\n \\begin{array}{l}\n \\sum_{i=1}^{5}x_i^2 - 10 = 0 \\\\\n x_2 x_3 - 5x_4 x_5 = 0 \\\\\n x_1^3 + x_2^3 + 1 = 0\n \\end{array} \\right.\n\\end{aligned}\n\\]\n目标函数是非线性的,有 5 个变量,约束条件也是非线性的,有 3 个等式约束。先手动计算目标函数的梯度,等式约束的雅可比矩阵。\n\n# 目标函数\nfn <- function(x) {\n 
exp(prod(x)) - 0.5 * (x[1]^3 + x[2]^3 + 1)^2\n}\n# 目标函数的梯度\ngr <- function(x) {\n c(\n exp(prod(x)) * prod(x[-1]) - 3 * (x[1]^3 + x[2]^3 + 1) * x[1]^2,\n exp(prod(x)) * prod(x[-2]) - 3 * (x[1]^3 + x[2]^3 + 1) * x[2]^2,\n exp(prod(x)) * prod(x[-3]),\n exp(prod(x)) * prod(x[-4]),\n exp(prod(x)) * prod(x[-5])\n )\n}\n# 等式约束\nheq <- function(x) {\n c(\n sum(x^2) - 10,\n x[2] * x[3] - 5 * x[4] * x[5],\n x[1]^3 + x[2]^3 + 1\n )\n}\n# 等式约束的雅可比矩阵\nheq.jac <- function(x) {\n matrix(c(2 * x[1], 2 * x[2], 2 * x[3], 2 * x[4], 2 * x[5],\n 0, x[3], x[2], -5 * x[5], -5 * x[4],\n 3 * x[1]^2, 3 * x[2]^2, 0, 0, 0),\n ncol = 5, byrow = TRUE\n )\n}\n\n在 OP() 函数里定义目标优化的各个成分。\n\n# 定义目标优化\nop <- OP(\n # 5 个决策变量\n objective = F_objective(F = fn, n = 5L, G = gr), \n constraints = F_constraint(\n F = list(heq = heq),\n dir = \"==\",\n rhs = 0,\n # 等式约束的雅可比矩阵\n J = list(heq.jac = heq.jac)\n ),\n bounds = V_bound(ld = -Inf, ud = Inf, nobj = 5L),\n maximum = FALSE # 求最小\n)\nop\n\n#> ROI Optimization Problem:\n#> \n#> Minimize a nonlinear objective function of length 5 with\n#> - 5 continuous objective variables,\n#> \n#> subject to\n#> - 1 constraint of type nonlinear.\n#> - 5 lower and 0 upper non-standard variable bounds.\n\n\n调用 SQP(序列二次优化) 求解器 nloptr.slsqp 。\n\nnlp <- ROI_solve(op,\n solver = \"nloptr.slsqp\",\n start = c(-1.8, 1.7, 1.9, -0.8, -0.8)\n)\n# 最优解\nnlp$solution\n\n#> [1] -1.7171435 1.5957096 1.8272458 -0.7636431 -0.7636431\n\n# 目标函数值\nnlp$objval\n\n#> [1] 0.05394985\n\n\n计算结果和 Octave 的示例一致。\n\n31.4.6.2 多种非线性约束\n\n非线性等式约束\n非线性不等式约束,不等式约束包含等号\n箱式约束\n\n此优化问题来源于 Ipopt 官网的帮助文档,约束条件比较复杂。提供的初始值为 \\(x_0 = (1,5,5,1)\\),最优解为 \\(x_{\\star} = (1.00000000,4.74299963,3.82114998,1.37940829)\\)。优化问题的具体内容如下:\n\\[\n\\begin{aligned}\n\\min_{\\boldsymbol{x}} & \\quad x_1 x_4 (x_1 + x_2 + x_3) + x_3 \\\\\n\\text{s.t.} & \\quad \\left\\{\n \\begin{array}{l}\n x_1^2 + x_2^2 + x_3^2 + x_4^2 = 40 \\\\\n x_1 x_2 x_3 x_4 \\geq 25 \\\\\n 1 \\leq x_1, x_2, x_3, x_4 \\leq 5\n \\end{array} 
\\right.\n\\end{aligned}\n\\]\n下面用 ROI 调 nloptr 包求解,看结果是否和例子一致,nloptr 支持箱式约束且支持不等式约束包含等号。\n\n# 一个 4 维的目标函数\nfn <- function(x) {\n x[1] * x[4] * (x[1] + x[2] + x[3]) + x[3]\n}\n# 目标函数的梯度\ngr <- function(x) {\n c(\n x[4] * (2 * x[1] + x[2] + x[3]), x[1] * x[4],\n x[1] * x[4] + 1, x[1] * (x[1] + x[2] + x[3])\n )\n}\n# 等式约束\nheq <- function(x) {\n sum(x^2)\n}\n# 等式约束的雅可比\nheq.jac <- function(x) {\n 2 * c(x[1], x[2], x[3], x[4])\n}\n# 不等式约束\nhin <- function(x) {\n prod(x)\n}\n# 不等式约束的雅可比\nhin.jac <- function(x) {\n c(prod(x[-1]), prod(x[-2]), prod(x[-3]), prod(x[-4]))\n}\n# 定义目标优化\nop <- OP(\n objective = F_objective(F = fn, n = 4L, G = gr), # 4 个决策变量\n constraints = F_constraint(\n F = list(heq = heq, hin = hin),\n dir = c(\"==\", \">=\"),\n rhs = c(40, 25),\n # 等式和不等式约束的雅可比\n J = list(heq.jac = heq.jac, hin.jac = hin.jac)\n ),\n bounds = V_bound(ld = 1, ud = 5, nobj = 4L),\n maximum = FALSE # 求最小\n)\n\n作为对比参考,先计算目标函数的初始值和最优值。\n\n# 目标函数初始值\nfn(c(1, 5, 5, 1))\n\n#> [1] 16\n\n# 目标函数最优值\nfn(c(1.00000000, 4.74299963, 3.82114998, 1.37940829))\n\n#> [1] 17.01402\n\n\n求解一般的非线性约束问题。\n\n求解器 nloptr.mma / nloptr.cobyla 仅支持非线性不等式约束,不支持等式约束。\n函数 nlminb() 只支持等式约束。\n\n因此,下面分别调用 nloptr.auglag、nloptr.slsqp 和 nloptr.isres 来求解上述优化问题。\n\nnlp <- ROI_solve(op, solver = \"nloptr.auglag\", start = c(1, 5, 5, 1))\nnlp$solution\n\n#> [1] 1.000000 4.743174 3.820922 1.379440\n\nnlp$objval\n\n#> [1] 17.01402\n\n\n\nnlp <- ROI_solve(op, solver = \"nloptr.slsqp\", start = c(1, 5, 5, 1))\nnlp$solution\n\n#> [1] 1.000000 4.742996 3.821155 1.379408\n\nnlp$objval\n\n#> [1] 17.01402\n\n\n\nnlp <- ROI_solve(op, solver = \"nloptr.isres\", start = c(1, 5, 5, 1))\nnlp$solution\n\n#> [1] 1.111263 4.830733 3.723643 1.251758\n\nnlp$objval\n\n#> [1] 17.16886\n\n\n可以看出,nloptr 提供的优化能力可以覆盖 Ipopt 求解器,从以上求解的情况来看,推荐使用 nloptr.slsqp 求解器,这也是 Octave 的选择。", + "text": "31.4 非线性优化\n非线性优化按是否带有约束,以及约束是线性还是非线性,分为无约束优化、箱式约束优化、线性约束优化和非线性约束优化。箱式约束可看作是线性约束的特殊情况。\n\nR 
软件内置的非线性优化函数\n\n\nnlm()\nnlminb()\nconstrOptim()\noptim()\n\n\n\n无约束\n支持\n支持\n不支持\n支持\n\n\n箱式约束\n不支持\n支持\n支持\n支持\n\n\n线性约束\n不支持\n不支持\n支持\n不支持\n\n\n\nR 软件内置的 stats 包有 4 个数值优化方面的函数,函数 nlm() 可求解无约束优化问题,函数 nlminb() 可求解无约束、箱式约束优化问题,函数 constrOptim() 可求解箱式和线性约束优化。函数 optim() 是通用型求解器,包含多个优化算法,可求解无约束、箱式约束优化问题。尽管这些函数在 R 语言中长期存在,在统计中有广泛的使用,如非线性最小二乘 stats::nls(),极大似然估计 stats4::mle() 和广义最小二乘估计 nlme::gls() 等。但是,这些优化函数的求解能力有重合,使用语法不尽相同,对于非线性约束无能为力,下面仍然主要使用 ROI 包来求解多维非线性优化问题。\n\n31.4.1 一元非线性优化\n求如下一维分段非线性函数的最小值,其函数图像见 图 31.5 ,这个函数是不连续的,更不光滑。\n\\[\nf(x) =\n\\begin{cases}\n10 & x \\in (-\\infty,-1] \\\\\n\\exp(-\\frac{1}{|x-1|}) & x \\in (-1,4) \\\\\n10 & x \\in [4, +\\infty)\n\\end{cases}\n\\]\n\nfn <- function(x) ifelse(x > -1, ifelse(x < 4, exp(-1 / abs(x - 1)), 10), 10)\n\n\n代码op <- par(mar = c(4, 4, 0.5, 0.5))\ncurve(\n expr = fn, from = -2, to = 5, lwd = 2,\n panel.first = grid(),\n xlab = \"$x$\", ylab = \"$f(x)$\"\n)\non.exit(par(op), add = TRUE)\n\n\n\n\n\n\n图 31.5: 一维函数图像\n\n\n\n\n函数 optimize() 可以求解一元函数的极值问题,默认求极小值,参数 f 表示目标函数,参数 interval 表示搜索在此区间内最小值。函数返回一个列表,元素 minimum 表示极小值点,objective 表示极值点对应的目标函数值。\n\noptimize(f = fn, interval = c(-4, 20), maximum = FALSE)\n\n#> $minimum\n#> [1] 19.99995\n#> \n#> $objective\n#> [1] 10\n\noptimize(f = fn, interval = c(-7, 20), maximum = FALSE)\n\n#> $minimum\n#> [1] 0.9992797\n#> \n#> $objective\n#> [1] 0\n\n\n值得注意,对于不连续的分段函数,在不同的区间内搜索极值,可能获得不同的结果,可以绘制函数图像帮助选择最小值。\n\n31.4.2 多元隐函数优化\n这个优化问题来自 1stOpt 软件的帮助文档,下面利用 R 语言来求该多元隐函数的极值。\n\\[\n\\begin{aligned}\n\\min_{\\boldsymbol{x}} y = & ~\\sin\\Big((yx_1 -0.5)^2 + 2x_1 x_2^2 - \\frac{y}{10} \\Big)\\cdot \\\\\n&~\\exp\\Big(-\\Big( \\big(x_1 - 0.5 -\\exp(-x_2 + y)\\big)^2 + x_2^2 - \\frac{y}{5} + 3 \\Big)\\Big)\n\\end{aligned}\n\\]\n其中, \\(x_1 \\in [-1,7],x_2 \\in [-2,2]\\) 。\n对于隐函数 \\(f(x_1,x_2,y)=0\\) ,常规的做法是先计算隐函数的偏导数,并令偏导数为 0,再求解非线性方程组,得到各个驻点,最后,将驻点代入原方程,比较驻点处函数值,根据优化目标选择最大或最小值。\n\\[\n\\begin{aligned}\n\\frac{\\partial f(x_1,x_2,y)}{\\partial x_1} = 0 \\\\\n\\frac{\\partial 
f(x_1,x_2,y)}{\\partial x_2} = 0\n\\end{aligned}\n\\]\n如果目标函数很复杂,隐函数偏导数难以计算,可以考虑暴力网格搜索。先估计隐函数值 \\(z\\) 的大致范围,给定 \\(x,y\\) 时,计算一元非线性方程的根。\n\nfn <- function(m) {\n subfun <- function(x) {\n f1 <- (m[1] * x - 0.5)^2 + 2 * m[1] * m[2]^2 - x / 10\n f2 <- -((m[1] - 0.5 - exp(-m[2] + x))^2 + m[2]^2 - x / 5 + 3)\n x - sin(f1) * exp(f2)\n }\n uniroot(f = subfun, interval = c(-1, 1))$root\n}\n\n在位置 \\((1,2)\\) 处函数值为 0.0007368468。\n\n# 测试函数 fn\nfn(m = c(1, 2))\n\n#> [1] 0.0007368468\n\n\n将目标区域网格化,通过一元非线性方程求根的方式获得每个格点处的函数值。\n\ndf <- expand.grid(\n x1 = seq(from = -1, to = 7, length.out = 81),\n x2 = seq(from = -2, to = 2, length.out = 41)\n)\n# 计算格点处的函数值\ndf$fn <- apply(df, 1, FUN = fn)\n\n在此基础上,绘制隐函数图像,如 图 31.6 所示,可以获得关于隐函数的大致情况。\n\n代码# 绘图\nwireframe(\n data = df, fn ~ x1 * x2,\n shade = TRUE, drape = FALSE,\n xlab = expression(x[1]), ylab = expression(x[2]),\n zlab = list(expression(\n italic(f) ~ group(\"(\", list(x[1], x[2]), \")\")\n ), rot = 90),\n scales = list(arrows = FALSE, col = \"black\"),\n shade.colors.palette = custom_palette,\n # 减少三维图形的边空\n lattice.options = list(\n layout.widths = list(\n left.padding = list(x = -0.5, units = \"inches\"),\n right.padding = list(x = -1.0, units = \"inches\")\n ),\n layout.heights = list(\n bottom.padding = list(x = -1.5, units = \"inches\"),\n top.padding = list(x = -1.5, units = \"inches\")\n )\n ),\n par.settings = list(axis.line = list(col = \"transparent\")),\n screen = list(z = 30, x = -65, y = 0)\n)\n\n\n\n\n\n\n图 31.6: 隐函数图像\n\n\n\n\n最后,获得暴力网格搜索的结果,目标函数在 \\((2.8,-0.9)\\) 处取得最小值 \\(-0.02159723\\)。总的来说,这是一个近似结果,如果进一步缩小搜索区域,将网格划分得越细,搜索的结果将越接近全局最小值。\n\ndf[df$fn == min(df$fn), ]\n\n#> x1 x2 fn\n#> 930 2.8 -0.9 -0.02159723\n\n\n将求隐函数极值的问题转为含非线性等式约束的非线性优化问题。\n\\[\n\\begin{aligned}\n\\min_{\\boldsymbol{x}} \\quad & y \\\\\n\\text{s.t.} \\quad & f(x_1,x_2,y) = 0\n\\end{aligned}\n\\]\n由于等式约束非常复杂,手动计算等式约束的雅可比矩阵不可行,可以用 numDeriv 包的函数 jacobian() 计算等式约束的雅可比矩阵。考虑到本例中仅含有一个等式约束,雅可比矩阵退化为梯度向量,这可以用 numDeriv 包的另一个函数 grad() 计算。\n\n# 等式约束\nheq 
<- function(x) {\n f1 <- (x[1] * x[3] - 0.5)^2 + 2 * x[1] * x[2]^2 - x[3] / 10\n f2 <- (x[1] - 0.5 - exp(-x[2] + x[3]))^2 + x[2]^2 - x[3] / 5 + 3\n x[3] - sin(f1) * exp(-f2)\n}\n# 等式约束的梯度\nheq.jac <- function(x) {\n numDeriv::grad(func = heq, x = x)\n}\n\n函数 L_objective() 表示含 1 个决策变量的线性目标函数,函数 F_constraint() 表示非线性等式约束。\n\n# 定义优化问题\nop <- OP(\n objective = L_objective(L = c(0, 0, 1)),\n constraints = F_constraint(\n # 等式约束\n F = list(heq = heq),\n dir = \"==\",\n rhs = 0,\n # 等式约束的雅可比\n J = list(heq.jac = heq.jac)\n ),\n bounds = V_bound(\n ld = -Inf, ud = Inf,\n li = c(1, 2), ui = c(1, 2),\n lb = c(-1, -2), ub = c(7, 2),\n nobj = 3L\n ),\n maximum = FALSE # 求最小\n)\nop\n\n#> ROI Optimization Problem:\n#> \n#> Minimize a linear objective function of length 3 with\n#> - 3 continuous objective variables,\n#> \n#> subject to\n#> - 1 constraint of type nonlinear.\n#> - 3 lower and 2 upper non-standard variable bounds.\n\n\n将网格搜索的结果作为初值,继续寻找更优的目标函数值。\n\nnlp <- ROI_solve(op,\n solver = \"nloptr.slsqp\", start = c(2.8, -0.9, -0.02159723)\n)\n# 最优解\nnlp$solution\n\n#> [1] 2.89826224 -0.85731584 -0.02335409\n\n# 目标函数值\nnlp$objval\n\n#> [1] -0.02335409\n\n\n可以发现,更优的目标函数值 \\(-0.02335\\) 在 \\((2.898,-0.8573)\\) 取得。\n\n31.4.3 多元无约束优化\n\n31.4.3.1 示例 1\nRastrigin 函数是一个 \\(n\\) 维优化问题测试函数。\n\\[\n\\min_{\\boldsymbol{x}} \\sum_{i=1}^{n}\\big(x_i^2 - 10 \\cos(2\\pi x_i) + 10\\big)\n\\]\n计算函数值的 R 代码如下:\n\nfn <- function(x) {\n sum(x^2 - 10 * cos(2 * pi * x) + 10)\n}\n\n绘制二维情形下的 Rastrigin 函数图像,如 图 31.7 所示,这是一个多模态的函数,有许多局部极小值。如果采用 BFGS 算法寻优容易陷入局部极值点。\n\n代码df <- expand.grid(\n x = seq(-4, 4, length.out = 151),\n y = seq(-4, 4, length.out = 151)\n)\n\ndf$fnxy <- apply(df, 1, fn)\nwireframe(\n data = df, fnxy ~ x * y,\n shade = TRUE, drape = FALSE,\n xlab = expression(x[1]),\n ylab = expression(x[2]),\n zlab = list(expression(\n italic(f) ~ group(\"(\", list(x[1], x[2]), \")\")\n ), rot = 90),\n scales = list(arrows = FALSE, col = \"black\"),\n shade.colors.palette = custom_palette,\n # 
减少三维图形的边空\n lattice.options = list(\n layout.widths = list(\n left.padding = list(x = -0.5, units = \"inches\"),\n right.padding = list(x = -1.0, units = \"inches\")\n ),\n layout.heights = list(\n bottom.padding = list(x = -1.5, units = \"inches\"),\n top.padding = list(x = -1.5, units = \"inches\")\n )\n ),\n par.settings = list(axis.line = list(col = \"transparent\")),\n screen = list(z = 30, x = -65, y = 0)\n)\n\n\n\n\n\n\n图 31.7: 二维 Rastrigin 函数图像\n\n\n\n\n不失一般性,考虑函数维数 \\(n=20\\) ,决策变量 \\(x_i \\in [-50,50], i = 1,2,\\ldots,n\\) 的情况。\n\nop <- OP(\n objective = F_objective(fn, n = 20L),\n bounds = V_bound(ld = -50, ud = 50, nobj = 20L)\n)\n\n调全局优化器求解优化问题。\n\nnlp <- ROI_solve(op, solver = \"nloptr.directL\")\n# 最优解\nnlp$solution\n\n#> [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n\n# 目标函数值\nnlp$objval\n\n#> [1] 0\n\n\n\n代码# R 语言内置的非线性优化函数\n# 无约束\nnlm(f = fn, p = rep(1, 20))\noptim(par = rep(1, 20), fn = fn, method = \"BFGS\")\noptim(par = rep(1, 20), fn = fn, method = \"Nelder-Mead\")\n\n# 箱式约束\noptim(par = rep(1, 20), fn = fn, \n lower = -50, upper = 50, method = \"L-BFGS-B\")\nnlminb(start = rep(1, 20), objective = fn, lower = -50, upper = 50)\nconstrOptim(\n theta = rep(1, 20), f = fn, grad = NULL,\n ui = rbind(diag(rep(1, 20)), diag(rep(-1, 20))),\n ci = c(rep(-50, 20), rep(-50, 20))\n)\n\n\n\n31.4.3.2 示例 2\n下面这个优化问题来自 1stOpt 软件帮助手册,是一个无约束非线性优化问题,它的目标函数非常复杂,一般的求解器都无法求解。最优解在 \\((7.999982, 7.999982)\\) 取得,目标函数值为 -7.978832。\n\\[\n\\begin{aligned}\n & \\min_{\\boldsymbol{x}} ~ \\cos(x_1)\\cos(x_2) - \\sum_{i=1}^{5}\\Big( (-1)^i \\cdot i \\cdot 2 \\cdot \\exp\\big(-500 \\cdot ( (x_1 - i \\cdot 2)^2 + (x_2 - i\\cdot 2)^2 ) \\big) \\Big)\n\\end{aligned}\n\\]\n目标函数分两步计算,先计算累加部分的通项,然后代入计算目标函数。\n\nsubfun <- function(i, m) {\n (-1)^i * i * 2 * exp(-500 * ((m[1] - i * 2)^2 + (m[2] - i * 2)^2))\n}\nfn <- function(x) {\n cos(x[1]) * cos(x[2]) -\n sum(mapply(FUN = subfun, i = 1:5, MoreArgs = list(m = x)))\n}\n\n直观起见,绘制目标函数在区域 \\([-50, 50] \\times [-50, 50]\\) 内的图像,如 图 31.8 
(a) 所示,可以看到几乎没有变化的梯度,给寻优过程带来很大困难。再将区域 \\([0, 12] \\times [0, 12]\\) 上的三维图像绘制出来,如 图 31.8 (b) 所示,可见,有不少局部陷阱,且分布在 \\(x_2 = x_1\\) 的直线上。\n代码df <- expand.grid(\n x = seq(-50, 50, length.out = 101),\n y = seq(-50, 50, length.out = 101)\n)\ndf$fnxy <- apply(df, 1, fn)\nwireframe(\n data = df, fnxy ~ x * y,\n shade = TRUE, drape = FALSE,\n xlab = expression(x[1]),\n ylab = expression(x[2]),\n zlab = list(expression(\n italic(f) ~ group(\"(\", list(x[1], x[2]), \")\")\n ), rot = 90),\n scales = list(arrows = FALSE, col = \"black\"),\n shade.colors.palette = custom_palette,\n # 减少三维图形的边空\n lattice.options = list(\n layout.widths = list(\n left.padding = list(x = -0.5, units = \"inches\"),\n right.padding = list(x = -1.0, units = \"inches\")\n ),\n layout.heights = list(\n bottom.padding = list(x = -1.5, units = \"inches\"),\n top.padding = list(x = -1.5, units = \"inches\")\n )\n ),\n par.settings = list(axis.line = list(col = \"transparent\")),\n screen = list(z = 30, x = -65, y = 0)\n)\n\ndf <- expand.grid(\n x = seq(0, 12, length.out = 151),\n y = seq(0, 12, length.out = 151)\n)\ndf$fnxy <- apply(df, 1, fn)\nwireframe(\n data = df, fnxy ~ x * y,\n shade = TRUE, drape = FALSE,\n xlab = expression(x[1]),\n ylab = expression(x[2]),\n zlab = list(expression(\n italic(f) ~ group(\"(\", list(x[1], x[2]), \")\")\n ), rot = 90), alpha = 0.75, \n scales = list(arrows = FALSE, col = \"black\"),\n shade.colors.palette = custom_palette,\n # 减少三维图形的边空\n lattice.options = list(\n layout.widths = list(\n left.padding = list(x = -0.5, units = \"inches\"),\n right.padding = list(x = -1.0, units = \"inches\")\n ),\n layout.heights = list(\n bottom.padding = list(x = -1.5, units = \"inches\"),\n top.padding = list(x = -1.5, units = \"inches\")\n )\n ),\n par.settings = list(axis.line = list(col = \"transparent\")),\n screen = list(z = 30, x = -65, y = 0)\n)\n\n\n\n\n\n\n\n\n\n(a) 区域 \\([-50,50]\\times[-50,50]\\) 内的函数图像\n\n\n\n\n\n\n\n\n\n\n\n(b) 区域 \\([0,12]\\times[0,12]\\) 
内的函数图像\n\n\n\n\n\n\n图 31.8: 局部放大前后的函数图像\n\n\n不失一般性,下面考虑 \\(x_1,x_2 \\in [-50,50]\\) ,面对如此复杂的函数,调用全局优化器 nloptr.directL 寻优。\n\nop <- OP(\n objective = F_objective(fn, n = 2L),\n bounds = V_bound(ld = -50, ud = 50, nobj = 2L)\n)\nnlp <- ROI_solve(op, solver = \"nloptr.directL\")\nnlp$solution\n\n#> [1] 22.22222 0.00000\n\nnlp$objval\n\n#> [1] -0.9734211\n\n\n结果还是陷入局部最优解。运筹优化方面的商业软件,著名的有 Lingo 和 Matlab,下面采用 Lingo 20 求解,Lingo 代码如下:\nSETS:\nP/1..5/;\nEndsets\nMin=@cos(x1) * @cos(x2) - @Sum(P(j): (-1)^j * j * 2 * @exp(-500 * ((x1 - j * 2)^2 + (x2 - j * 2)^2)));\n@Bnd(-50, x1, 50);\n@Bnd(-50, x2, 50);\n启用全局优化求解器后,在 \\((x_1 = 7.999982, x_2 = 7.999982)\\) 取得最小值 -7.978832。而默认未启用全局优化求解器的情况下,在 \\((x_1 = 18.84956, x_2 = -40.84070)\\) 取得局部极小值 -1.000000。\n在这种情况下,数值优化算法遇到瓶颈,可以采用一些全局随机优化算法,比如 GA 包 (Scrucca 2013) 实现的遗传算法。经过对参数的一些调优,可以获得与商业软件几乎一样的结果。\n\nnlp <- GA::ga(\n type = \"real-valued\",\n fitness = function(x) -fn(x),\n lower = c(0, 0), upper = c(12, 12),\n popSize = 500, maxiter = 100, \n monitor = FALSE, seed = 20232023\n)\n# 最优解\nnlp@solution\n\n#> x1 x2\n#> [1,] 7.999982 7.999981\n\n# 目标函数值\nnlp@fitnessValue\n\n#> [1] 7.978832\n\n\n其中,参数 type 指定决策变量的类型,type = \"real-valued\" 表示目标函数中的决策变量是实值连续的,参数 fitness 是目标函数,函数 ga() 对目标函数求极大,所以,对当前优化问题,添加了一个负号。 参数 popSize 控制种群大小,值越大,运行时间越长,搜索范围越广,获得的全局优化解越好。对于复杂的优化问题,可以不断增加种群大小来寻优,直至增加种群大小也不能获得更好的解。参数 maxiter 控制种群进化的次数,值越大,搜索次数可以越多,获得的解越好。参数 popSize 的影响大于参数 maxiter ,减少陷入局部最优解(陷阱)的可能。根据已知条件尽可能缩小可行域,以减少种群数量,进而缩短算法迭代时间。\n\n31.4.4 多元箱式约束优化\n有如下带箱式约束的多元非线性优化问题,该示例来自函数 nlminb() 的帮助文档,如果没有箱式约束,全局极小值点在 \\((1,1,\\cdots,1)\\) 处取得。\n\\[\n\\begin{aligned}\n \\min_{\\boldsymbol{x}} \\quad & (x_1 - 1)^2 + 4\\sum_{i =1}^{n -1}(x_{i+1} -x_i^2)^2 \\\\\n \\text{s.t.} \\quad & 2 \\leq x_1,x_2,\\cdots,x_n \\leq 4\n\\end{aligned}\n\\]\nR 语言编码的函数代码如下:\n\nfn <- function(x) {\n n <- length(x)\n sum(c(1, rep(4, n - 1)) * (x - c(1, x[-n])^2)^2)\n}\n\n在二维的情形下,可以绘制目标函数的三维图像,见 图 31.9 ,函数曲面和香蕉函数有些相似。\n\n代码dat <- expand.grid(\n x1 = seq(from = 0, to = 4, length.out = 
41),\n x2 = seq(from = 0, to = 4, length.out = 41)\n)\ndat$fn <- apply(dat, 1, fn)\n\nwireframe(\n data = dat, fn ~ x1 * x2,\n shade = TRUE, drape = FALSE,\n xlab = expression(x[1]), ylab = expression(x[2]),\n zlab = list(expression(\n italic(f) ~ group(\"(\", list(x[1], x[2]), \")\")\n ), rot = 90),\n scales = list(arrows = FALSE, col = \"black\"),\n shade.colors.palette = custom_palette,\n # 减少三维图形的边空\n lattice.options = list(\n layout.widths = list(\n left.padding = list(x = -0.5, units = \"inches\"),\n right.padding = list(x = -1.0, units = \"inches\")\n ),\n layout.heights = list(\n bottom.padding = list(x = -1.5, units = \"inches\"),\n top.padding = list(x = -1.5, units = \"inches\")\n )\n ),\n par.settings = list(axis.line = list(col = \"transparent\")),\n screen = list(z = 30, x = -65, y = 0)\n)\n\n\n\n\n\n\n图 31.9: 类香蕉函数的曲面图\n\n\n\n\nBase R 有 3 个函数可以求解这个优化问题,分别是 nlminb() 、constrOptim()和optim() ,因此,不妨在这个示例上,用这 3 个函数分别求解该优化问题,介绍它们的用法,最后,介绍 ROI 包实现的方法。这个优化问题的目标函数是 \\(n\\) 维非线性的,不失一般性,又不让问题变得太过简单,下面考虑 25 维的情况,\n\n31.4.4.1 nlminb()\n\n函数 nlminb() 参数 start 指定迭代初始值,参数 objective 指定目标函数,参数 lower 和 upper 分别指定箱式约束中的下界和上界。给定初值 \\((3, 3, \\cdots, 3)\\),下界 \\((2,2,\\cdots,2)\\) 和上界 \\((4,4,\\cdots,4)\\) 。nlminb() 帮助文档说该函数出于历史兼容性的原因尚且存在,一般来说,这个函数会一直维护下去的。\n\nnlminb(\n start = rep(3, 25), objective = fn,\n lower = rep(2, 25), upper = rep(4, 25)\n)\n\n#> $par\n#> [1] 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000\n#> [9] 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000\n#> [17] 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.109093\n#> [25] 4.000000\n#> \n#> $objective\n#> [1] 368.1059\n#> \n#> $convergence\n#> [1] 0\n#> \n#> $iterations\n#> [1] 6\n#> \n#> $evaluations\n#> function gradient \n#> 10 177 \n#> \n#> $message\n#> [1] \"relative convergence (4)\"\n\n\n从返回结果来看,求解过程成功收敛,最优解的前 23 个决策变量取值为 2,在箱式约束的边界上,第 24 个分量没有边界上,而在内部,第 25 个决策变量取值为 4,也在边界上。目标函数值为 368.1059。\n\n31.4.4.2 constrOptim()\n\n使用 
constrOptim() 函数求解,默认求极小,需将箱式或线性不等式约束写成矩阵形式,即 \\(Ax \\geq b\\) 的形式,参数 ui 是 \\(k \\times n\\) 的约束矩阵 \\(A\\),ci 是右侧 \\(k\\) 维约束向量 \\(b\\)。以上面的优化问题为例,将箱式约束 \\(2 \\leq x_1,x_2 \\leq 4\\) 转化为矩阵形式,约束矩阵和向量分别为:\n\\[\nA = \\begin{bmatrix}\n1 & 0 \\\\\n0 & 1 \\\\\n-1 & 0 \\\\\n0 & -1\n\\end{bmatrix}, \\quad\nb = \\begin{bmatrix}\n2 \\\\\n2 \\\\\n-4 \\\\\n-4\n\\end{bmatrix}\n\\]\n\nconstrOptim(\n theta = rep(3, 25), # 初始值\n f = fn, # 目标函数\n method = \"Nelder-Mead\", # 没有提供梯度,则必须用 Nelder-Mead 方法\n ui = rbind(diag(rep(1, 25)), diag(rep(-1, 25))),\n ci = c(rep(2, 25), rep(-4, 25))\n)\n\n#> $par\n#> [1] 2.006142 2.002260 2.003971 2.003967 2.004143 2.004255 2.001178 2.002990\n#> [9] 2.003883 2.006029 2.017345 2.009236 2.000949 2.007793 2.025831 2.007896\n#> [17] 2.004514 2.004381 2.008771 2.015695 2.005803 2.009127 2.017988 2.257782\n#> [25] 3.999846\n#> \n#> $value\n#> [1] 378.4208\n#> \n#> $counts\n#> function gradient \n#> 12048 NA \n#> \n#> $convergence\n#> [1] 1\n#> \n#> $message\n#> NULL\n#> \n#> $outer.iterations\n#> [1] 25\n#> \n#> $barrier.value\n#> [1] -0.003278963\n\n\n返回结果中 convergence = 1 表示迭代次数到达默认的极限 maxit = 500 。参考函数 nlminb() 的求解结果,可知还没有收敛。如果没有提供梯度,则必须用 Nelder-Mead 方法,下面增加迭代次数到 1000。\n\nconstrOptim(\n theta = rep(3, 25), # 初始值\n f = fn, # 目标函数\n method = \"Nelder-Mead\", \n control = list(maxit = 1000),\n ui = rbind(diag(rep(1, 25)), diag(rep(-1, 25))),\n ci = c(rep(2, 25), rep(-4, 25))\n)\n\n#> $par\n#> [1] 2.000081 2.000142 2.001919 2.000584 2.000007 2.000003 2.001097 2.001600\n#> [9] 2.000207 2.000042 2.000250 2.000295 2.000580 2.002165 2.000453 2.000932\n#> [17] 2.000456 2.000363 2.000418 2.000474 2.009483 2.001156 2.003173 2.241046\n#> [25] 3.990754\n#> \n#> $value\n#> [1] 370.8601\n#> \n#> $counts\n#> function gradient \n#> 18036 NA \n#> \n#> $convergence\n#> [1] 1\n#> \n#> $message\n#> NULL\n#> \n#> $outer.iterations\n#> [1] 19\n#> \n#> $barrier.value\n#> [1] -0.003366467\n\n\n结果有改善,目标函数值从 378.4208 减小到 370.8601,但还是没有收敛,可见 Nelder-Mead 
方法在这个优化问题上收敛速度比较慢。下面考虑调用基于梯度的 BFGS 优化算法,这得先计算出来目标函数的梯度。\n\n# 输入 n 维向量,输出 n 维向量\ngr <- function(x) {\n n <- length(x)\n c(2 * (x[1] - 2), rep(0, n - 1))\n +8 * c(0, x[-1] - x[-n]^2)\n -16 * c(x[-n], 0) * c(x[-1] - x[-n]^2, 0)\n}\nconstrOptim(\n theta = rep(3, 25), # 初始值\n f = fn, # 目标函数\n grad = gr,\n method = \"BFGS\", \n control = list(maxit = 1000),\n ui = rbind(diag(rep(1, 25)), diag(rep(-1, 25))),\n ci = c(rep(2, 25), rep(-4, 25))\n)\n\n#> $par\n#> [1] 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000\n#> [9] 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000\n#> [17] 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000001\n#> [25] 3.000000\n#> \n#> $value\n#> [1] 373\n#> \n#> $counts\n#> function gradient \n#> 3721 464 \n#> \n#> $convergence\n#> [1] 0\n#> \n#> $message\n#> NULL\n#> \n#> $outer.iterations\n#> [1] 3\n#> \n#> $barrier.value\n#> [1] -0.003327104\n\n\n从结果来看,虽然已经收敛,但相比于 Nelder-Mead 方法,目标函数值变大了,可见已陷入局部最优解。\n\n31.4.4.3 optim()\n\n下面再使用函数 optim() 提供的 L-BFGS-B 算法求解优化问题。\n\noptim(\n par = rep(3, 25), fn = fn, gr = NULL, method = \"L-BFGS-B\",\n lower = rep(2, 25), upper = rep(4, 25)\n)\n\n#> $par\n#> [1] 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000\n#> [9] 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000\n#> [17] 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.109093\n#> [25] 4.000000\n#> \n#> $value\n#> [1] 368.1059\n#> \n#> $counts\n#> function gradient \n#> 6 6 \n#> \n#> $convergence\n#> [1] 0\n#> \n#> $message\n#> [1] \"CONVERGENCE: REL_REDUCTION_OF_F <= FACTR*EPSMCH\"\n\n\n发现结果和函数 nlminb() 的结果差不多了。\n\noptim(\n par = rep(3, 25), fn = fn, gr = gr, method = \"L-BFGS-B\",\n lower = rep(2, 25), upper = rep(4, 25)\n)\n\n#> $par\n#> [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3\n#> \n#> $value\n#> [1] 373\n#> \n#> $counts\n#> function gradient \n#> 2 2 \n#> \n#> $convergence\n#> [1] 0\n#> \n#> $message\n#> [1] \"CONVERGENCE: 
NORM OF PROJECTED GRADIENT <= PGTOL\"\n\n\n然而,当在函数 optim() 里提供梯度信息的时候,虽然目标函数及梯度的计算次数变少了,求解速度提升了,但是最优解反而变差了,最优解和在函数 constrOptim() 中设置 method = \"BFGS\" 算法基本一致。\n\n31.4.4.4 ROI 包\n下面通过 ROI 包,分别调用求解器 nloptr.lbfgs 和 nloptr.directL ,发现前者同样陷入局部最优解,而后者可以获得与 nlminb() 函数一致的结果。\n\nop <- OP(\n objective = F_objective(fn, n = 25L, G = gr),\n bounds = V_bound(ld = 2, ud = 4, nobj = 25L)\n)\nnlp <- ROI_solve(op, solver = \"nloptr.lbfgs\", start = rep(3, 25))\n# 目标函数值\nnlp$objval\n\n#> [1] 373\n\n# 最优解\nnlp$solution\n\n#> [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3\n\n\n调全局优化算法。\n\nnlp <- ROI_solve(op, solver = \"nloptr.directL\")\n# 目标函数值\nnlp$objval\n\n#> [1] 368.106\n\n# 最优解\nnlp$solution\n\n#> [1] 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000\n#> [9] 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000\n#> [17] 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.109093\n#> [25] 4.000000\n\n\n\n31.4.5 多元线性约束优化\n对于带线性约束的多元非线性优化问题,Base R 提供函数 constrOptim() 来求解,下面的示例来自其帮助文档,这是一个带线性约束的二次规划问题。\n\\[\n\\begin{aligned}\n\\min_{\\boldsymbol{x}}\n\\quad & - \\begin{bmatrix}\n0 \\\\\n5 \\\\\n0\n\\end{bmatrix}^{\\top} \\boldsymbol{x} +\\frac{1}{2} \\boldsymbol{x}^{\\top}\\boldsymbol{x} \\\\\n\\text{s.t.} \\quad & \\begin{bmatrix}\n-4 & 2 & 0 \\\\\n-3 & 1 & -2 \\\\\n0 & 0 & 1\n\\end{bmatrix}^{\\top}\\boldsymbol{x} \\geq \\begin{bmatrix}\n-8 \\\\\n2 \\\\\n0\n\\end{bmatrix}\n\\end{aligned}\n\\]\n\nfQP <- function(x) {\n -sum(c(0, 5, 0) * x) + 0.5 * sum(x * x)\n}\nAmat <- matrix(c(-4, -3, 0, 2, 1, 0, 0, -2, 1),\n ncol = 3, nrow = 3, byrow = FALSE\n)\nbvec <- c(-8, 2, 0)\n# 目标函数的梯度\ngQP <- function(x) {\n -c(0, 5, 0) + x\n}\nconstrOptim(\n theta = c(2, -1, -1), \n f = fQP, g = gQP, \n ui = t(Amat), ci = bvec\n)\n\n#> $par\n#> [1] 0.4761908 1.0476188 2.0952376\n#> \n#> $value\n#> [1] -2.380952\n#> \n#> $counts\n#> function gradient \n#> 406 81 \n#> \n#> $convergence\n#> [1] 0\n#> \n#> $message\n#> NULL\n#> \n#> 
$outer.iterations\n#> [1] 3\n#> \n#> $barrier.value\n#> [1] -0.0006243894\n\n\n在上一节,箱式约束可以看作线性约束的一种特殊情况,ROI 包是支持箱式、线性、二次、锥和非线性约束的。因此,下面给出调用 ROI 包求解上述优化问题的代码。\n\nDmat <- diag(rep(1,3))\ndvec <- c(0, 5, 0)\nop <- OP(\n objective = Q_objective(Q = Dmat, L = -dvec),\n constraints = L_constraint(L = t(Amat), dir = rep(\">=\", 3), rhs = bvec),\n maximum = FALSE\n)\nnlp <- ROI_solve(op, solver = \"nloptr.slsqp\", start = c(0, 1, 2))\n# 最优解\nnlp$solution\n\n#> [1] 0.4761905 1.0476190 2.0952381\n\n# 目标函数值\nnlp$objval\n\n#> [1] -2.380952\n\n\n可见输出结果与函数 constrOptim() 是一致的。\n\n代码# quadprog\nlibrary(quadprog)\nsol <- solve.QP(\n Dmat = Dmat, dvec = dvec, Amat = Amat, bvec = bvec\n)\nsol\n\n\n\n31.4.6 多元非线性约束优化\nnloptr 包的非线性优化能力覆盖开源优化软件 Octave 和 Ipopt 。通过插件包 ROI.plugin.nloptr,ROI 包可以调用 nloptr 包内置的所有求解器,常用的求解器见下表。表中从优化器类型(局部还是全局优化器),支持的约束条件类型(箱式还是非线性),是否需要提供目标函数的梯度、黑塞和约束条件的雅可比矩阵信息等方面归纳各个求解器的能力。\n\n常用的非线性优化求解器\n\n求解器\n类型\n约束\n梯度\n黑塞\n雅可比\n\n\n\nnloptr.lbfgs\n局部\n箱式\n需要\n不需要\n不需要\n\n\nnloptr.slsqp\n局部\n非线性\n需要\n不需要\n需要\n\n\nnloptr.auglag\n局部\n非线性\n需要\n不需要\n需要\n\n\nnloptr.directL\n全局\n箱式\n不需要\n不需要\n不需要\n\n\nnloptr.isres\n全局\n非线性\n不需要\n不需要\n不需要\n\n\n\n\n31.4.6.1 非线性等式约束\n下面这个示例来自 Octave 软件的非线性优化帮助文档,Octave 中的函数 sqp() 使用序列二次优化求解器(successive quadratic programming solver)求解非线性优化问题,示例中该优化问题包含多个非线性等式约束。\n\\[\n\\begin{aligned}\n\\min_{\\boldsymbol{x}} \\quad & \\exp\\big(\\prod_{i=1}^{5} x_i\\big) - \\frac{1}{2}(x_1^3 + x_2^3 + 1)^2 \\\\\n\\text{s.t.} \\quad & \\left\\{\n \\begin{array}{l}\n \\sum_{i=1}^{5}x_i^2 - 10 = 0 \\\\\n x_2 x_3 - 5x_4 x_5 = 0 \\\\\n x_1^3 + x_2^3 + 1 = 0\n \\end{array} \\right.\n\\end{aligned}\n\\]\n目标函数是非线性的,有 5 个变量,约束条件也是非线性的,有 3 个等式约束。先手动计算目标函数的梯度,等式约束的雅可比矩阵。\n\n# 目标函数\nfn <- function(x) {\n exp(prod(x)) - 0.5 * (x[1]^3 + x[2]^3 + 1)^2\n}\n# 目标函数的梯度\ngr <- function(x) {\n c(\n exp(prod(x)) * prod(x[-1]) - 3 * (x[1]^3 + x[2]^3 + 1) * x[1]^2,\n exp(prod(x)) * prod(x[-2]) - 3 * (x[1]^3 + x[2]^3 + 1) * x[2]^2,\n exp(prod(x)) * prod(x[-3]),\n exp(prod(x)) * 
prod(x[-4]),\n exp(prod(x)) * prod(x[-5])\n )\n}\n# 等式约束\nheq <- function(x) {\n c(\n sum(x^2) - 10,\n x[2] * x[3] - 5 * x[4] * x[5],\n x[1]^3 + x[2]^3 + 1\n )\n}\n# 等式约束的雅可比矩阵\nheq.jac <- function(x) {\n matrix(c(2 * x[1], 2 * x[2], 2 * x[3], 2 * x[4], 2 * x[5],\n 0, x[3], x[2], -5 * x[5], -5 * x[4],\n 3 * x[1]^2, 3 * x[2]^2, 0, 0, 0),\n ncol = 5, byrow = TRUE\n )\n}\n\n在 OP() 函数里定义目标优化的各个成分。\n\n# 定义目标优化\nop <- OP(\n # 5 个决策变量\n objective = F_objective(F = fn, n = 5L, G = gr), \n constraints = F_constraint(\n F = list(heq = heq),\n dir = \"==\",\n rhs = 0,\n # 等式约束的雅可比矩阵\n J = list(heq.jac = heq.jac)\n ),\n bounds = V_bound(ld = -Inf, ud = Inf, nobj = 5L),\n maximum = FALSE # 求最小\n)\nop\n\n#> ROI Optimization Problem:\n#> \n#> Minimize a nonlinear objective function of length 5 with\n#> - 5 continuous objective variables,\n#> \n#> subject to\n#> - 1 constraint of type nonlinear.\n#> - 5 lower and 0 upper non-standard variable bounds.\n\n\n调用 SQP(序列二次优化) 求解器 nloptr.slsqp 。\n\nnlp <- ROI_solve(op,\n solver = \"nloptr.slsqp\",\n start = c(-1.8, 1.7, 1.9, -0.8, -0.8)\n)\n# 最优解\nnlp$solution\n\n#> [1] -1.7171435 1.5957096 1.8272458 -0.7636431 -0.7636431\n\n# 目标函数值\nnlp$objval\n\n#> [1] 0.05394985\n\n\n计算结果和 Octave 的示例一致。\n\n31.4.6.2 多种非线性约束\n\n非线性等式约束\n非线性不等式约束,不等式约束包含等号\n箱式约束\n\n此优化问题来源于 Ipopt 官网的帮助文档,约束条件比较复杂。提供的初始值为 \\(x_0 = (1,5,5,1)\\),最优解为 \\(x_{\\star} = (1.00000000,4.74299963,3.82114998,1.37940829)\\)。优化问题的具体内容如下:\n\\[\n\\begin{aligned}\n\\min_{\\boldsymbol{x}} & \\quad x_1 x_4 (x_1 + x_2 + x_3) + x_3 \\\\\n\\text{s.t.} & \\quad \\left\\{\n \\begin{array}{l}\n x_1^2 + x_2^2 + x_3^2 + x_4^2 = 40 \\\\\n x_1 x_2 x_3 x_4 \\geq 25 \\\\\n 1 \\leq x_1, x_2, x_3, x_4 \\leq 5\n \\end{array} \\right.\n\\end{aligned}\n\\]\n下面用 ROI 调 nloptr 包求解,看结果是否和例子一致,nloptr 支持箱式约束且支持不等式约束包含等号。\n\n# 一个 4 维的目标函数\nfn <- function(x) {\n x[1] * x[4] * (x[1] + x[2] + x[3]) + x[3]\n}\n# 目标函数的梯度\ngr <- function(x) {\n c(\n x[4] * (2 * x[1] + x[2] + x[3]), x[1] * x[4],\n x[1] * x[4] + 1, x[1] 
* (x[1] + x[2] + x[3])\n )\n}\n# 等式约束\nheq <- function(x) {\n sum(x^2)\n}\n# 等式约束的雅可比\nheq.jac <- function(x) {\n 2 * c(x[1], x[2], x[3], x[4])\n}\n# 不等式约束\nhin <- function(x) {\n prod(x)\n}\n# 不等式约束的雅可比\nhin.jac <- function(x) {\n c(prod(x[-1]), prod(x[-2]), prod(x[-3]), prod(x[-4]))\n}\n# 定义目标优化\nop <- OP(\n objective = F_objective(F = fn, n = 4L, G = gr), # 4 个决策变量\n constraints = F_constraint(\n F = list(heq = heq, hin = hin),\n dir = c(\"==\", \">=\"),\n rhs = c(40, 25),\n # 等式和不等式约束的雅可比\n J = list(heq.jac = heq.jac, hin.jac = hin.jac)\n ),\n bounds = V_bound(ld = 1, ud = 5, nobj = 4L),\n maximum = FALSE # 求最小\n)\n\n作为对比参考,先计算目标函数的初始值和最优值。\n\n# 目标函数初始值\nfn(c(1, 5, 5, 1))\n\n#> [1] 16\n\n# 目标函数最优值\nfn(c(1.00000000, 4.74299963, 3.82114998, 1.37940829))\n\n#> [1] 17.01402\n\n\n求解一般的非线性约束问题。\n\n求解器 nloptr.mma / nloptr.cobyla 仅支持非线性不等式约束,不支持等式约束。\n函数 nlminb() 只支持等式约束。\n\n因此,下面分别调用 nloptr.auglag、nloptr.slsqp 和 nloptr.isres 来求解上述优化问题。\n\nnlp <- ROI_solve(op, solver = \"nloptr.auglag\", start = c(1, 5, 5, 1))\nnlp$solution\n\n#> [1] 1.000000 4.743174 3.820922 1.379440\n\nnlp$objval\n\n#> [1] 17.01402\n\n\n\nnlp <- ROI_solve(op, solver = \"nloptr.slsqp\", start = c(1, 5, 5, 1))\nnlp$solution\n\n#> [1] 1.000000 4.742996 3.821155 1.379408\n\nnlp$objval\n\n#> [1] 17.01402\n\n\n\nnlp <- ROI_solve(op, solver = \"nloptr.isres\", start = c(1, 5, 5, 1))\nnlp$solution\n\n#> [1] 1.284840 4.793586 3.749755 1.145225\n\nnlp$objval\n\n#> [1] 18.21124\n\n\n可以看出,nloptr 提供的优化能力可以覆盖 Ipopt 求解器,从以上求解的情况来看,推荐使用 nloptr.slsqp 求解器,这也是 Octave 的选择。", "crumbs": [ "优化建模", "31  数值优化" @@ -2125,7 +2125,7 @@ "href": "optimization-problems.html#sec-poisson-mixture-distributions", "title": "32  优化问题", "section": "\n32.4 泊松混合分布", - "text": "32.4 泊松混合分布\n有限混合模型(Finite Mixtures of Distributions)的应用非常广泛,本节参考 BB 包 (Varadhan 和 Gilbert 2009) 的帮助手册,以泊松混合分布为例,介绍其参数的极大似然估计。更多详细的理论和算法介绍从略,感兴趣的读者可以查阅相关文献 (Hasselblad 1969)。BB 包比内置函数 optim() 功能更强,可以求解大规模非线性方程组,也可以求解带简单约束的非线性优化问题,还可以从多个初始值出发寻找全局最优解。\n两个泊松分布以一定比例 
\\(p\\) 混合,以概率 \\(p\\) 服从泊松分布 \\(\\mathrm{Poisson}(\\lambda_1)\\) ,而以概率 \\(1-p\\) 服从泊松分布 \\(\\mathrm{Poisson}(\\lambda_1)\\) 。\n\\[\np\\times \\mathrm{Poisson}(\\lambda_1) + (1 - p)\\times \\mathrm{Poisson}(\\lambda_2)\n\\]\n泊松混合分布的概率密度函数 \\(f(x;p,\\lambda_1,\\lambda_2)\\) 如下:\n\\[\nf(x;p,\\lambda_1,\\lambda_2) = p \\times \\frac{\\lambda_1^x \\exp(-\\lambda_1)}{x!} + (1 - p) \\times \\frac{\\lambda_2^x \\exp(-\\lambda_2)}{x!}\n\\]\n随机变量 \\(X\\) 服从参数为 \\(p\\) 的伯努利分布 \\(X \\sim \\mathrm{Bernoulli}(1, p)\\) ,随机变量 \\(Y\\) 服从泊松混合分布,在伯努利分布的基础上,泊松混合分布也可作如下定义:\n\\[\n\\begin{array}{l}\nY \\sim \\left\\{\n\\begin{array}{l}\n\\mathrm{Poisson}(\\lambda_1), \\quad \\text{当} ~ X = 1 ~ \\text{时},\\\\\n\\mathrm{Poisson}(\\lambda_2), \\quad \\text{当} ~ X = 0 ~ \\text{时}.\n\\end{array} \\right.\n\\end{array}\n\\]\n对数似然函数如下:\n\\[\n\\ell(p,\\lambda_1,\\lambda_2) = \\sum_{i=0}^{n}y_i \\log\\big(p\\times \\exp(-\\lambda_1) \\times\\frac{\\lambda_1^{x_i}}{x_i!} + (1 - p)\\times \\exp(-\\lambda_2) \\times\\frac{\\lambda_2 ^{x_i}}{x_i!} \\big)\n\\]\n下 表格 32.1 数据来自 1947 年 Walter Schilling 发表在 JASA 的一篇文章 (Schilling 1947)。连续三年搜集伦敦《泰晤士报》刊登的死亡告示,每天的告示发布 80 岁及以上女性死亡人数。经过汇总统计,发现,在三年里,没有人死亡的告示出现 162 次,死亡 1 人的告示出现 267 次。\n\n\n表格 32.1: 死亡人数的统计\n\n\n\n死亡人数\n0\n1\n2\n3\n4\n5\n6\n7\n8\n9\n\n\n发生频次\n162\n267\n271\n185\n111\n61\n27\n8\n3\n1\n\n\n\n\n\n考虑到夏季和冬季对老人死亡率的影响是不同的,因此,引入泊松混合分布来对数据建模。\n\n# 对数似然函数\n# p 是一个长度为 3 的向量\n# y 是观测数据向量\npoissmix_loglik <- function(p, y) {\n i <- 0:(length(y) - 1)\n loglik <- y * log(p[1] * exp(-p[2]) * p[2]^i / exp(lgamma(i + 1)) +\n (1 - p[1]) * exp(-p[3]) * p[3]^i / exp(lgamma(i + 1)))\n sum(loglik)\n}\n# lgamma(i + 1) 表示整数 i 的阶乘的对数\n# 参数的下限\nlo <- c(0, 0, 0)\n# 参数的上限\nhi <- c(1, Inf, Inf)\n# 随机生成一组参数初始值\np0 <- runif(3, c(0.2, 1, 1), c(0.8, 5, 8)) \n# 汇总统计出来的死亡人数的频次分布\ny <- c(162, 267, 271, 185, 111, 61, 27, 8, 3, 1)\n\n调用 BB 包的函数 BBoptim() 求解多元非线性箱式约束优化问题。\n\nlibrary(BB)\n# 参数估计\nans <- BBoptim(\n par = p0, fn = poissmix_loglik, y = y,\n lower = lo, upper = hi, \n 
control = list(maximize = TRUE)\n)\n\n#> iter: 0 f-value: -2290.408 pgrad: 110.367 \n#> iter: 10 f-value: -1998.362 pgrad: 2.092643 \n#> iter: 20 f-value: -1998.255 pgrad: 2.135778 \n#> iter: 30 f-value: -1989.95 pgrad: 0.2171396 \n#> iter: 40 f-value: -1989.946 pgrad: 0.2082606 \n#> iter: 50 f-value: -1989.946 pgrad: 0.360415 \n#> Successful convergence.\n\nans\n\n#> $par\n#> [1] 0.3598829 1.2560907 2.6634012\n#> \n#> $value\n#> [1] -1989.946\n#> \n#> $gradient\n#> [1] 6.82121e-06\n#> \n#> $fn.reduction\n#> [1] -300.4626\n#> \n#> $iter\n#> [1] 56\n#> \n#> $feval\n#> [1] 58\n#> \n#> $convergence\n#> [1] 0\n#> \n#> $message\n#> [1] \"Successful convergence\"\n#> \n#> $cpar\n#> method M \n#> 2 50\n\n\nnumDeriv::hessian 计算极大似然点的黑塞矩阵,然后计算参数估计的标准差。\n\n# 黑塞矩阵\nhess <- numDeriv::hessian(x = ans$par, func = poissmix_loglik, y = y)\nhess\n\n#> [,1] [,2] [,3]\n#> [1,] -907.1104 270.22856 341.25434\n#> [2,] 270.2286 -113.47936 -61.68191\n#> [3,] 341.2543 -61.68191 -192.78218\n\n# 标准差\nse <- sqrt(diag(solve(-hess)))\nse\n\n#> [1] 0.1946833 0.3500301 0.2504766\n\n\nmultiStart 从不同初始值出发寻找全局最大值,先找一系列局部极大值,通过比较获得全局最大值。\n\n# 随机生成 10 组初始值\np0 <- matrix(runif(30, c(0.2, 1, 1), c(0.8, 8, 8)), \n nrow = 10, ncol = 3, byrow = TRUE)\nans <- multiStart(\n par = p0, fn = poissmix_loglik, action = \"optimize\",\n y = y, lower = lo, upper = hi, quiet = TRUE,\n control = list(maximize = TRUE, trace = FALSE)\n)\n# 筛选出迭代收敛的解\npmat <- round(cbind(ans$fvalue[ans$conv], ans$par[ans$conv, ]), 4)\ndimnames(pmat) <- list(NULL, c(\"fvalue\", \"parameter 1\", \n \"parameter 2\", \"parameter 3\"))\n# 去掉结果一样的重复解\npmat[!duplicated(pmat), ]\n\n#> fvalue parameter 1 parameter 2 parameter 3\n#> [1,] -1992.839 0.1764 3.6867 1.8441\n#> [2,] -1989.946 0.6401 2.6634 1.2561\n#> [3,] -1998.253 0.5668 2.3774 1.8620\n#> [4,] -2000.055 0.9896 2.1176 6.5843\n#> [5,] -1999.731 0.0131 6.3280 2.1094\n#> [6,] -1989.946 0.3599 1.2561 2.6634", + "text": "32.4 泊松混合分布\n有限混合模型(Finite Mixtures of Distributions)的应用非常广泛,本节参考 BB 包 
(Varadhan 和 Gilbert 2009) 的帮助手册,以泊松混合分布为例,介绍其参数的极大似然估计。更多详细的理论和算法介绍从略,感兴趣的读者可以查阅相关文献 (Hasselblad 1969)。BB 包比内置函数 optim() 功能更强,可以求解大规模非线性方程组,也可以求解带简单约束的非线性优化问题,还可以从多个初始值出发寻找全局最优解。\n两个泊松分布以一定比例 \\(p\\) 混合,以概率 \\(p\\) 服从泊松分布 \\(\\mathrm{Poisson}(\\lambda_1)\\) ,而以概率 \\(1-p\\) 服从泊松分布 \\(\\mathrm{Poisson}(\\lambda_2)\\) 。\n\\[\np\\times \\mathrm{Poisson}(\\lambda_1) + (1 - p)\\times \\mathrm{Poisson}(\\lambda_2)\n\\]\n泊松混合分布的概率密度函数 \\(f(x;p,\\lambda_1,\\lambda_2)\\) 如下:\n\\[\nf(x;p,\\lambda_1,\\lambda_2) = p \\times \\frac{\\lambda_1^x \\exp(-\\lambda_1)}{x!} + (1 - p) \\times \\frac{\\lambda_2^x \\exp(-\\lambda_2)}{x!}\n\\]\n随机变量 \\(X\\) 服从参数为 \\(p\\) 的伯努利分布 \\(X \\sim \\mathrm{Bernoulli}(1, p)\\) ,随机变量 \\(Y\\) 服从泊松混合分布,在伯努利分布的基础上,泊松混合分布也可作如下定义:\n\\[\n\\begin{array}{l}\nY \\sim \\left\\{\n\\begin{array}{l}\n\\mathrm{Poisson}(\\lambda_1), \\quad \\text{当} ~ X = 1 ~ \\text{时},\\\\\n\\mathrm{Poisson}(\\lambda_2), \\quad \\text{当} ~ X = 0 ~ \\text{时}.\n\\end{array} \\right.\n\\end{array}\n\\]\n对数似然函数如下:\n\\[\n\\ell(p,\\lambda_1,\\lambda_2) = \\sum_{i=0}^{n}y_i \\log\\big(p\\times \\exp(-\\lambda_1) \\times\\frac{\\lambda_1^{x_i}}{x_i!} + (1 - p)\\times \\exp(-\\lambda_2) \\times\\frac{\\lambda_2 ^{x_i}}{x_i!} \\big)\n\\]\n下 表格 32.1 数据来自 1947 年 Walter Schilling 发表在 JASA 的一篇文章 (Schilling 1947)。连续三年搜集伦敦《泰晤士报》刊登的死亡告示,每天的告示发布 80 岁及以上女性死亡人数。经过汇总统计,发现,在三年里,没有人死亡的告示出现 162 次,死亡 1 人的告示出现 267 次。\n\n\n表格 32.1: 死亡人数的统计\n\n\n\n死亡人数\n0\n1\n2\n3\n4\n5\n6\n7\n8\n9\n\n\n发生频次\n162\n267\n271\n185\n111\n61\n27\n8\n3\n1\n\n\n\n\n\n考虑到夏季和冬季对老人死亡率的影响是不同的,因此,引入泊松混合分布来对数据建模。\n\n# 对数似然函数\n# p 是一个长度为 3 的向量\n# y 是观测数据向量\npoissmix_loglik <- function(p, y) {\n i <- 0:(length(y) - 1)\n loglik <- y * log(p[1] * exp(-p[2]) * p[2]^i / exp(lgamma(i + 1)) +\n (1 - p[1]) * exp(-p[3]) * p[3]^i / exp(lgamma(i + 1)))\n sum(loglik)\n}\n# lgamma(i + 1) 表示整数 i 的阶乘的对数\n# 参数的下限\nlo <- c(0, 0, 0)\n# 参数的上限\nhi <- c(1, Inf, Inf)\n# 随机生成一组参数初始值\np0 <- runif(3, c(0.2, 1, 1), c(0.8, 5, 8)) \n# 汇总统计出来的死亡人数的频次分布\ny <- c(162, 
267, 271, 185, 111, 61, 27, 8, 3, 1)\n\n调用 BB 包的函数 BBoptim() 求解多元非线性箱式约束优化问题。\n\nlibrary(BB)\n# 参数估计\nans <- BBoptim(\n par = p0, fn = poissmix_loglik, y = y,\n lower = lo, upper = hi, \n control = list(maximize = TRUE)\n)\n\n#> iter: 0 f-value: -2822.177 pgrad: 6.550888 \n#> iter: 10 f-value: -1990.894 pgrad: 2.158326 \n#> iter: 20 f-value: -1990.057 pgrad: 1.542383 \n#> iter: 30 f-value: -1989.991 pgrad: 2.684052 \n#> iter: 40 f-value: -1989.946 pgrad: 0.05826678 \n#> iter: 50 f-value: -1989.946 pgrad: 0.005381935 \n#> iter: 60 f-value: -1989.946 pgrad: 0.0008139978 \n#> Successful convergence.\n\nans\n\n#> $par\n#> [1] 0.3598724 1.2560702 2.6633898\n#> \n#> $value\n#> [1] -1989.946\n#> \n#> $gradient\n#> [1] 0.0002114575\n#> \n#> $fn.reduction\n#> [1] -832.2316\n#> \n#> $iter\n#> [1] 66\n#> \n#> $feval\n#> [1] 68\n#> \n#> $convergence\n#> [1] 0\n#> \n#> $message\n#> [1] \"Successful convergence\"\n#> \n#> $cpar\n#> method M \n#> 2 50\n\n\nnumDeriv::hessian 计算极大似然点的黑塞矩阵,然后计算参数估计的标准差。\n\n# 黑塞矩阵\nhess <- numDeriv::hessian(x = ans$par, func = poissmix_loglik, y = y)\nhess\n\n#> [,1] [,2] [,3]\n#> [1,] -907.1271 270.22673 341.26114\n#> [2,] 270.2267 -113.47615 -61.68078\n#> [3,] 341.2611 -61.68078 -192.78769\n\n# 标准差\nse <- sqrt(diag(solve(-hess)))\nse\n\n#> [1] 0.1946820 0.3500371 0.2504738\n\n\nmultiStart 从不同初始值出发寻找全局最大值,先找一系列局部极大值,通过比较获得全局最大值。\n\n# 随机生成 10 组初始值\np0 <- matrix(runif(30, c(0.2, 1, 1), c(0.8, 8, 8)), \n nrow = 10, ncol = 3, byrow = TRUE)\nans <- multiStart(\n par = p0, fn = poissmix_loglik, action = \"optimize\",\n y = y, lower = lo, upper = hi, quiet = TRUE,\n control = list(maximize = TRUE, trace = FALSE)\n)\n# 筛选出迭代收敛的解\npmat <- round(cbind(ans$fvalue[ans$conv], ans$par[ans$conv, ]), 4)\ndimnames(pmat) <- list(NULL, c(\"fvalue\", \"parameter 1\", \n \"parameter 2\", \"parameter 3\"))\n# 去掉结果一样的重复解\npmat[!duplicated(pmat), ]\n\n#> fvalue parameter 1 parameter 2 parameter 3\n#> [1,] -1989.946 0.3599 1.2561 2.6634\n#> [2,] -1989.946 0.6401 2.6634 
1.2561", "crumbs": [ "优化建模", "32  优化问题" @@ -2191,7 +2191,7 @@ "href": "probabilistic-reasoning-framework.html#sec-choose-inference", "title": "33  概率推理框架", "section": "\n33.4 推理算法", - "text": "33.4 推理算法\n开篇提及 Stan 内置了多种推理算法,不同的算法获得的结果是存在差异的。\n\nfull Bayesian statistical inference with MCMC sampling (NUTS, HMC)\napproximate Bayesian inference with variational inference (ADVI)\npenalized maximum likelihood estimation with optimization (L-BFGS)\n\n\n33.4.1 惩罚极大似然算法\nL-BFGS 算法拟合模型,速度非常快。\n\n# L-BFGS 算法拟合模型\nfit_optim_logit <- mod_logit_lasso$optimize(\n data = mdata, # 观测数据\n init = 0, # 所有参数初值设为 0\n refresh = 0, # 不显示迭代进程\n algorithm = \"lbfgs\", # 优化器\n threads = 1, # 单线程\n seed = 20232023 # 随机数种子\n)\n\n#> Finished in 0.3 seconds.\n\nfit_optim_logit$summary(c(\"alpha\", \"beta\", \"lambda\", \"lp__\"))\n\n#> # A tibble: 13 × 2\n#> variable estimate\n#> <chr> <dbl>\n#> 1 alpha 0.981 \n#> 2 beta[1] 3.05 \n#> 3 beta[2] -1.96 \n#> 4 beta[3] 0.0488 \n#> 5 beta[4] -0.0166 \n#> 6 beta[5] 0.00528\n#> 7 beta[6] 0.0126 \n#> 8 beta[7] 0.0923 \n#> 9 beta[8] -0.0204 \n#> 10 beta[9] -0.0777 \n#> 11 beta[10] 0.0721 \n#> 12 lambda 0.488 \n#> 13 lp__ -768.\n\n\n\n33.4.2 变分近似推断算法\nADVI 算法拟合模型,可选的优化器有 meanfield 和 fullrank ,相比于 L-BFGS 稍慢\n\n# ADVI 算法拟合模型\nfit_advi_logit <- mod_logit_lasso$variational(\n data = mdata, # 观测数据\n init = 0, # 所有参数初值设为 0\n refresh = 0, # 不显示迭代进程\n algorithm = \"meanfield\", # 优化器\n threads = 1, # 单线程\n seed = 20232023 # 随机数种子\n)\n\n#> Finished in 2.3 seconds.\n\nfit_advi_logit$summary(c(\"alpha\", \"beta\", \"lambda\", \"lp__\"))\n\n#> # A tibble: 13 × 7\n#> variable mean median sd mad q5 q95\n#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>\n#> 1 alpha 1.02 1.02 0.0615 0.0630 0.914 1.11 \n#> 2 beta[1] 3.07 3.07 0.0899 0.0870 2.93 3.22 \n#> 3 beta[2] -1.98 -1.98 0.0675 0.0666 -2.08 -1.86 \n#> 4 beta[3] 0.0161 0.0159 0.0678 0.0670 -0.0945 0.129 \n#> 5 beta[4] -0.0199 -0.0221 0.0639 0.0621 -0.121 0.0857\n#> 6 beta[5] -0.00128 -0.00202 0.0722 0.0713 -0.116 0.121 
\n#> 7 beta[6] -0.0423 -0.0446 0.0705 0.0689 -0.156 0.0754\n#> 8 beta[7] 0.0760 0.0750 0.0490 0.0489 -0.00517 0.152 \n#> 9 beta[8] -0.0742 -0.0752 0.0659 0.0637 -0.181 0.0347\n#> 10 beta[9] -0.0495 -0.0501 0.0802 0.0818 -0.185 0.0805\n#> 11 beta[10] 0.0444 0.0440 0.0520 0.0522 -0.0444 0.128 \n#> 12 lambda 0.698 0.662 0.243 0.228 0.379 1.13 \n#> 13 lp__ -777. -777. 3.39 3.22 -783. -773.\n\n\n\n33.4.3 拉普拉斯近似算法\nStan 内置的 Laplace 近似算法是对后验分布的 Laplace 正态近似,再从近似的后验分布中采样获得样本,最后,对样本进行统计分析获得参数的后验估计。详见 Stan 语言参考手册的Laplace Approximation 一章。\n\n# Laplace 算法\nfit_laplace_logit <- mod_logit_lasso$laplace(\n data = mdata, # 观测数据\n init = 0, # 所有参数初值设为 0\n refresh = 0, # 不显示迭代进程\n threads = 1, # 单线程\n seed = 20232023 # 随机数种子\n)\n\n#> Finished in 0.1 seconds.\n#> Finished in 1.9 seconds.\n\nfit_laplace_logit$summary(c(\"alpha\", \"beta\", \"lambda\", \"lp__\"))\n\n#> # A tibble: 13 × 7\n#> variable mean median sd mad q5 q95\n#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>\n#> 1 alpha 0.983 0.981 0.0701 0.0719 0.874 1.10 \n#> 2 beta[1] 3.05 3.05 0.131 0.129 2.84 3.28 \n#> 3 beta[2] -1.97 -1.97 0.0960 0.0977 -2.12 -1.80 \n#> 4 beta[3] 0.0516 0.0493 0.0620 0.0605 -0.0432 0.150 \n#> 5 beta[4] -0.0200 -0.0216 0.0647 0.0602 -0.121 0.0904\n#> 6 beta[5] 0.00546 0.00505 0.0639 0.0645 -0.0971 0.110 \n#> 7 beta[6] 0.0135 0.0138 0.0642 0.0629 -0.0929 0.116 \n#> 8 beta[7] 0.0920 0.0917 0.0638 0.0659 -0.0179 0.195 \n#> 9 beta[8] -0.0231 -0.0217 0.0641 0.0669 -0.128 0.0867\n#> 10 beta[9] -0.0810 -0.0798 0.0646 0.0640 -0.194 0.0212\n#> 11 beta[10] 0.0732 0.0745 0.0639 0.0614 -0.0328 0.176 \n#> 12 lambda 0.562 0.536 0.168 0.154 0.333 0.866 \n#> 13 lp__ -775. -775. 2.63 2.46 -780. 
-772.\n\n\n\n33.4.4 探路者变分算法\n探路者算法 Pathfinder 属于变分法,针对可微的对数目标密度函数,沿着逆牛顿优化算法的迭代路径,获得目标密度函数的正态近似。正态近似中的局部协方差的估计采用 LBFGS 计算的负逆 Hessian 矩阵。探路者算法的优势是可以极大地减少对数密度函数和梯度的计算次数,缓解迭代陷入局部最优点和鞍点(何为鞍点,一个可视化示例详见 章节 32.3 )。\n\n# Pathfinder 算法\nfit_pathfinder_logit <- mod_logit_lasso$pathfinder(\n data = mdata, # 观测数据\n init = 0, # 所有参数初值设为 0\n refresh = 0, # 不显示迭代进程\n num_threads = 1, # 单线程\n seed = 20232023 # 随机数种子\n)\n\n#> Finished in 5.2 seconds.\n\nfit_pathfinder_logit$summary(c(\"alpha\", \"beta\", \"lambda\", \"lp__\"))\n\n#> # A tibble: 13 × 7\n#> variable mean median sd mad q5 q95\n#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>\n#> 1 alpha 0.995 0.993 0.0765 0.0816 0.875 1.12 \n#> 2 beta[1] 3.08 3.07 0.136 0.130 2.88 3.32 \n#> 3 beta[2] -1.98 -1.98 0.102 0.106 -2.15 -1.81 \n#> 4 beta[3] 0.0555 0.0594 0.0616 0.0586 -0.0509 0.158 \n#> 5 beta[4] -0.0217 -0.0209 0.0627 0.0614 -0.124 0.0761\n#> 6 beta[5] 0.0125 0.0132 0.0653 0.0631 -0.0885 0.118 \n#> 7 beta[6] 0.0202 0.0198 0.0620 0.0648 -0.0816 0.125 \n#> 8 beta[7] 0.0968 0.0969 0.0596 0.0617 -0.00574 0.193 \n#> 9 beta[8] -0.0286 -0.0247 0.0600 0.0572 -0.139 0.0643\n#> 10 beta[9] -0.0795 -0.0748 0.0598 0.0623 -0.183 0.0102\n#> 11 beta[10] 0.0748 0.0713 0.0618 0.0612 -0.0234 0.173 \n#> 12 lambda 0.598 0.581 0.171 0.163 0.361 0.914 \n#> 13 lp__ -775. -775. 2.60 2.41 -780. 
-771.", + "text": "33.4 推理算法\n开篇提及 Stan 内置了多种推理算法,不同的算法获得的结果是存在差异的。\n\nfull Bayesian statistical inference with MCMC sampling (NUTS, HMC)\napproximate Bayesian inference with variational inference (ADVI)\npenalized maximum likelihood estimation with optimization (L-BFGS)\n\n\n33.4.1 惩罚极大似然算法\nL-BFGS 算法拟合模型,速度非常快。\n\n# L-BFGS 算法拟合模型\nfit_optim_logit <- mod_logit_lasso$optimize(\n data = mdata, # 观测数据\n init = 0, # 所有参数初值设为 0\n refresh = 0, # 不显示迭代进程\n algorithm = \"lbfgs\", # 优化器\n threads = 1, # 单线程\n seed = 20232023 # 随机数种子\n)\n\n#> Finished in 0.2 seconds.\n\nfit_optim_logit$summary(c(\"alpha\", \"beta\", \"lambda\", \"lp__\"))\n\n#> # A tibble: 13 × 2\n#> variable estimate\n#> <chr> <dbl>\n#> 1 alpha 0.981 \n#> 2 beta[1] 3.05 \n#> 3 beta[2] -1.96 \n#> 4 beta[3] 0.0488 \n#> 5 beta[4] -0.0166 \n#> 6 beta[5] 0.00528\n#> 7 beta[6] 0.0126 \n#> 8 beta[7] 0.0923 \n#> 9 beta[8] -0.0204 \n#> 10 beta[9] -0.0777 \n#> 11 beta[10] 0.0721 \n#> 12 lambda 0.488 \n#> 13 lp__ -768.\n\n\n\n33.4.2 变分近似推断算法\nADVI 算法拟合模型,可选的优化器有 meanfield 和 fullrank ,相比于 L-BFGS 稍慢\n\n# ADVI 算法拟合模型\nfit_advi_logit <- mod_logit_lasso$variational(\n data = mdata, # 观测数据\n init = 0, # 所有参数初值设为 0\n refresh = 0, # 不显示迭代进程\n algorithm = \"meanfield\", # 优化器\n threads = 1, # 单线程\n seed = 20232023 # 随机数种子\n)\n\n#> Finished in 1.8 seconds.\n\nfit_advi_logit$summary(c(\"alpha\", \"beta\", \"lambda\", \"lp__\"))\n\n#> # A tibble: 13 × 7\n#> variable mean median sd mad q5 q95\n#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>\n#> 1 alpha 1.02 1.02 0.0615 0.0630 0.914 1.11 \n#> 2 beta[1] 3.07 3.07 0.0899 0.0870 2.93 3.22 \n#> 3 beta[2] -1.98 -1.98 0.0675 0.0666 -2.08 -1.86 \n#> 4 beta[3] 0.0161 0.0159 0.0678 0.0670 -0.0945 0.129 \n#> 5 beta[4] -0.0199 -0.0221 0.0639 0.0621 -0.121 0.0857\n#> 6 beta[5] -0.00128 -0.00202 0.0722 0.0713 -0.116 0.121 \n#> 7 beta[6] -0.0423 -0.0446 0.0705 0.0689 -0.156 0.0754\n#> 8 beta[7] 0.0760 0.0750 0.0490 0.0489 -0.00517 0.152 \n#> 9 beta[8] -0.0742 -0.0752 0.0659 0.0637 -0.181 
0.0347\n#> 10 beta[9] -0.0495 -0.0501 0.0802 0.0818 -0.185 0.0805\n#> 11 beta[10] 0.0444 0.0440 0.0520 0.0522 -0.0444 0.128 \n#> 12 lambda 0.698 0.662 0.243 0.228 0.379 1.13 \n#> 13 lp__ -777. -777. 3.39 3.22 -783. -773.\n\n\n\n33.4.3 拉普拉斯近似算法\nStan 内置的 Laplace 近似算法是对后验分布的 Laplace 正态近似,再从近似的后验分布中采样获得样本,最后,对样本进行统计分析获得参数的后验估计。详见 Stan 语言参考手册的Laplace Approximation 一章。\n\n# Laplace 算法\nfit_laplace_logit <- mod_logit_lasso$laplace(\n data = mdata, # 观测数据\n init = 0, # 所有参数初值设为 0\n refresh = 0, # 不显示迭代进程\n threads = 1, # 单线程\n seed = 20232023 # 随机数种子\n)\n\n#> Finished in 0.3 seconds.\n#> Finished in 1.7 seconds.\n\nfit_laplace_logit$summary(c(\"alpha\", \"beta\", \"lambda\", \"lp__\"))\n\n#> # A tibble: 13 × 7\n#> variable mean median sd mad q5 q95\n#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>\n#> 1 alpha 0.983 0.981 0.0701 0.0719 0.874 1.10 \n#> 2 beta[1] 3.05 3.05 0.131 0.129 2.84 3.28 \n#> 3 beta[2] -1.97 -1.97 0.0960 0.0977 -2.12 -1.80 \n#> 4 beta[3] 0.0516 0.0493 0.0620 0.0605 -0.0432 0.150 \n#> 5 beta[4] -0.0200 -0.0216 0.0647 0.0602 -0.121 0.0904\n#> 6 beta[5] 0.00546 0.00505 0.0639 0.0645 -0.0971 0.110 \n#> 7 beta[6] 0.0135 0.0138 0.0642 0.0629 -0.0929 0.116 \n#> 8 beta[7] 0.0920 0.0917 0.0638 0.0659 -0.0179 0.195 \n#> 9 beta[8] -0.0231 -0.0217 0.0641 0.0669 -0.128 0.0867\n#> 10 beta[9] -0.0810 -0.0798 0.0646 0.0640 -0.194 0.0212\n#> 11 beta[10] 0.0732 0.0745 0.0639 0.0614 -0.0328 0.176 \n#> 12 lambda 0.562 0.536 0.168 0.154 0.333 0.866 \n#> 13 lp__ -775. -775. 2.63 2.46 -780. 
-772.\n\n\n\n33.4.4 探路者变分算法\n探路者算法 Pathfinder 属于变分法,针对可微的对数目标密度函数,沿着拟牛顿优化算法的迭代路径,获得目标密度函数的正态近似。正态近似中的局部协方差的估计采用 LBFGS 计算的负逆 Hessian 矩阵。探路者算法的优势是可以极大地减少对数密度函数和梯度的计算次数,缓解迭代陷入局部最优点和鞍点(何为鞍点,一个可视化示例详见 章节 32.3 )。\n\n# Pathfinder 算法\nfit_pathfinder_logit <- mod_logit_lasso$pathfinder(\n data = mdata, # 观测数据\n init = 0, # 所有参数初值设为 0\n refresh = 0, # 不显示迭代进程\n num_threads = 1, # 单线程\n seed = 20232023 # 随机数种子\n)\n\n#> Finished in 3.8 seconds.\n\nfit_pathfinder_logit$summary(c(\"alpha\", \"beta\", \"lambda\", \"lp__\"))\n\n#> # A tibble: 13 × 7\n#> variable mean median sd mad q5 q95\n#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>\n#> 1 alpha 0.995 0.993 0.0765 0.0816 0.875 1.12 \n#> 2 beta[1] 3.08 3.07 0.136 0.130 2.88 3.32 \n#> 3 beta[2] -1.98 -1.98 0.102 0.106 -2.15 -1.81 \n#> 4 beta[3] 0.0555 0.0594 0.0616 0.0586 -0.0509 0.158 \n#> 5 beta[4] -0.0217 -0.0209 0.0627 0.0614 -0.124 0.0761\n#> 6 beta[5] 0.0125 0.0132 0.0653 0.0631 -0.0885 0.118 \n#> 7 beta[6] 0.0202 0.0198 0.0620 0.0648 -0.0816 0.125 \n#> 8 beta[7] 0.0968 0.0969 0.0596 0.0617 -0.00574 0.193 \n#> 9 beta[8] -0.0286 -0.0247 0.0600 0.0572 -0.139 0.0643\n#> 10 beta[9] -0.0795 -0.0748 0.0598 0.0623 -0.183 0.0102\n#> 11 beta[10] 0.0748 0.0713 0.0618 0.0612 -0.0234 0.173 \n#> 12 lambda 0.598 0.581 0.171 0.163 0.361 0.914 \n#> 13 lp__ -775. -775. 2.60 2.41 -780. 
-771.", "crumbs": [ "贝叶斯建模", "33  概率推理框架" @@ -2312,7 +2312,7 @@ "href": "hierarchical-normal-models.html#sec-8schools-rstan", "title": "35  分层正态模型", "section": "", - "text": "35.1.1 拟合模型\n用 rstan 包来拟合模型,下面采用非中心的参数化表示,降低参数的相关性,减少发散的迭代次数,提高采样效率。\n\n# 编译模型\neight_schools_fit <- stan(\n model_name = \"eight_schools\",\n # file = \"code/eight_schools.stan\",\n model_code = \"\n // saved as eight_schools.stan\n data {\n int<lower=0> J; // number of schools\n array[J] real y; // estimated treatment effects\n array[J] real <lower=0> sigma; // standard error of effect estimates\n }\n parameters {\n real mu; // population treatment effect\n real<lower=0> tau; // standard deviation in treatment effects\n vector[J] eta; // unscaled deviation from mu by school\n }\n transformed parameters {\n vector[J] theta = mu + tau * eta; // school treatment effects\n }\n model {\n target += normal_lpdf(mu | 0, 100); \n target += normal_lpdf(tau | 0, 100);\n target += normal_lpdf(eta | 0, 1); // prior log-density\n target += normal_lpdf(y | theta, sigma); // log-likelihood\n }\n \",\n data = list( # 观测数据\n J = 8,\n y = c(28, 8, -3, 7, -1, 1, 18, 12),\n sigma = c(15, 10, 16, 11, 9, 11, 10, 18)\n ),\n warmup = 1000, # 每条链预处理迭代次数\n iter = 2000, # 每条链总迭代次数\n chains = 2, # 马尔科夫链的数目\n cores = 2, # 指定 CPU 核心数,可以给每条链分配一个\n verbose = FALSE, # 不显示迭代的中间过程\n refresh = 0, # 不显示采样的进度\n seed = 20232023 # 设置随机数种子,不要使用 set.seed() 函数\n)\n\n\n35.1.2 模型输出\n用函数 print() 打印输出结果,保留 2 位小数。\n\nprint(eight_schools_fit, digits = 2)\n\n#> Inference for Stan model: eight_schools.\n#> 2 chains, each with iter=2000; warmup=1000; thin=1; \n#> post-warmup draws per chain=1000, total post-warmup draws=2000.\n#> \n#> mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat\n#> mu 7.90 0.16 5.00 -1.79 4.60 7.71 11.05 18.37 988 1\n#> tau 6.43 0.20 5.35 0.22 2.49 5.23 8.94 19.73 733 1\n#> eta[1] 0.40 0.02 0.96 -1.57 -0.21 0.41 1.06 2.27 2252 1\n#> eta[2] 0.00 0.02 0.88 -1.74 -0.59 -0.01 0.57 1.77 1977 1\n#> eta[3] -0.17 0.02 0.92 -1.93 
-0.78 -0.18 0.41 1.71 2307 1\n#> eta[4] -0.04 0.02 0.91 -1.90 -0.64 -0.01 0.55 1.76 2053 1\n#> eta[5] -0.34 0.02 0.89 -2.03 -0.97 -0.38 0.25 1.43 1769 1\n#> eta[6] -0.22 0.02 0.87 -1.89 -0.81 -0.23 0.36 1.52 1959 1\n#> eta[7] 0.33 0.02 0.86 -1.30 -0.26 0.31 0.90 2.00 2020 1\n#> eta[8] 0.05 0.02 0.96 -1.89 -0.59 0.05 0.69 1.94 2597 1\n#> theta[1] 11.29 0.20 8.11 -1.69 5.88 10.14 15.34 31.08 1728 1\n#> theta[2] 7.80 0.13 6.25 -4.68 3.88 7.88 11.62 19.78 2362 1\n#> theta[3] 6.30 0.17 7.57 -10.90 2.15 6.63 10.87 20.04 1902 1\n#> theta[4] 7.72 0.14 6.53 -5.33 3.78 7.52 11.66 21.22 2249 1\n#> theta[5] 5.10 0.14 6.48 -9.20 1.04 5.69 9.50 16.71 2030 1\n#> theta[6] 6.06 0.16 6.88 -8.61 2.07 6.39 10.33 19.01 1766 1\n#> theta[7] 10.41 0.14 6.42 -0.18 6.03 9.59 13.98 24.97 2057 1\n#> theta[8] 8.45 0.19 8.00 -7.45 4.02 8.13 12.69 26.56 1728 1\n#> lp__ -50.67 0.11 2.64 -56.69 -52.25 -50.40 -48.78 -46.34 584 1\n#> \n#> Samples were drawn using NUTS(diag_e) at Thu Feb 1 06:32:23 2024.\n#> For each parameter, n_eff is a crude measure of effective sample size,\n#> and Rhat is the potential scale reduction factor on split chains (at \n#> convergence, Rhat=1).\n\n\n值得一提,数据有限而且规律不明确,数据隐含的信息不是很多,则先验分布的情况将会对参数估计结果产生很大影响。Stan 默认采用无信息的先验分布,当使用非常弱的信息先验时,结果就非常不同了。提取任意一个参数的结果,如查看参数 \\(\\tau\\) 的 95% 置信区间。\n\nprint(eight_schools_fit, pars = \"tau\", probs = c(0.025, 0.975))\n\n#> Inference for Stan model: eight_schools.\n#> 2 chains, each with iter=2000; warmup=1000; thin=1; \n#> post-warmup draws per chain=1000, total post-warmup draws=2000.\n#> \n#> mean se_mean sd 2.5% 97.5% n_eff Rhat\n#> tau 6.43 0.2 5.35 0.22 19.73 733 1\n#> \n#> Samples were drawn using NUTS(diag_e) at Thu Feb 1 06:32:23 2024.\n#> For each parameter, n_eff is a crude measure of effective sample size,\n#> and Rhat is the potential scale reduction factor on split chains (at \n#> convergence, Rhat=1).\n\n\n从迭代抽样数据获得与 print(fit) 一样的结果。以便后续对原始采样数据做任意的进一步分析。rstan 包扩展泛型函数 summary() 以支持对 stanfit 
数据对象汇总,输出各个参数分链条和合并链条的后验分布结果。\n\n35.1.3 操作数据\n抽取数据对象 eight_schools_fit 中的采样数据,合并几条马氏链的结果,返回的结果是一个列表。\n\neight_schools_sim <- extract(eight_schools_fit, permuted = TRUE)\n\n返回列表中的每个元素是一个数组,标量参数对应一维数组,向量参数对应二维数组。\n\nstr(eight_schools_sim)\n\n#> List of 5\n#> $ mu : num [1:2000(1d)] 10.9 7.32 11.08 6.01 7.17 ...\n#> ..- attr(*, \"dimnames\")=List of 1\n#> .. ..$ iterations: NULL\n#> $ tau : num [1:2000(1d)] 0.0611 13.9378 0.2101 10.2247 8.0898 ...\n#> ..- attr(*, \"dimnames\")=List of 1\n#> .. ..$ iterations: NULL\n#> $ eta : num [1:2000, 1:8] -0.0263 1.2116 0.6171 0.4923 1.2238 ...\n#> ..- attr(*, \"dimnames\")=List of 2\n#> .. ..$ iterations: NULL\n#> .. ..$ : NULL\n#> $ theta: num [1:2000, 1:8] 10.9 24.2 11.2 11 17.1 ...\n#> ..- attr(*, \"dimnames\")=List of 2\n#> .. ..$ iterations: NULL\n#> .. ..$ : NULL\n#> $ lp__ : num [1:2000(1d)] -54 -46.3 -54 -48.5 -51 ...\n#> ..- attr(*, \"dimnames\")=List of 1\n#> .. ..$ iterations: NULL\n\n\n对于列表,适合用函数 lapply() 配合算术函数计算 \\(\\mu,\\tau\\) 等参数的均值。\n\nfun_mean <- function(x) {\n if (length(dim(x)) > 1) {\n apply(x, 2, mean)\n } else {\n mean(x)\n }\n}\nlapply(eight_schools_sim, FUN = fun_mean)\n\n#> $mu\n#> [1] 7.896911\n#> \n#> $tau\n#> [1] 6.427487\n#> \n#> $eta\n#> [1] 0.39815800 -0.00403665 -0.17091492 -0.03835530 -0.34447579 -0.21592391\n#> [7] 0.33375651 0.04527884\n#> \n#> $theta\n#> [1] 11.293515 7.796730 6.300619 7.722628 5.100476 6.059899 10.411386\n#> [8] 8.451163\n#> \n#> $lp__\n#> [1] -50.66637\n\n\n类似地,计算 \\(\\mu,\\tau\\) 等参数的分位点。\n\nfun_quantile <- function(x, probs) {\n if (length(dim(x)) > 1) {\n t(apply(x, 2, quantile, probs = probs))\n } else {\n quantile(x, probs = probs)\n }\n}\nlapply(eight_schools_sim, fun_quantile, probs = c(2.5, 25, 50, 75, 97.5) / 100)\n\n#> $mu\n#> 2.5% 25% 50% 75% 97.5% \n#> -1.787025 4.603868 7.706957 11.054801 18.370903 \n#> \n#> $tau\n#> 2.5% 25% 50% 75% 97.5% \n#> 0.2221582 2.4933088 5.2289974 8.9369194 19.7296019 \n#> \n#> $eta\n#> \n#> 2.5% 25% 50% 75% 97.5%\n#> [1,] -1.571814 
-0.2051970 0.412478990 1.0592655 2.267580\n#> [2,] -1.740860 -0.5924433 -0.009587292 0.5740810 1.768344\n#> [3,] -1.933032 -0.7815080 -0.181728735 0.4102823 1.709490\n#> [4,] -1.896206 -0.6383988 -0.008634091 0.5491973 1.758581\n#> [5,] -2.029051 -0.9674769 -0.376669547 0.2505514 1.425229\n#> [6,] -1.890733 -0.8146685 -0.227713876 0.3642582 1.524509\n#> [7,] -1.295196 -0.2552945 0.310815774 0.9046577 1.995717\n#> [8,] -1.894366 -0.5853674 0.053723000 0.6912584 1.940257\n#> \n#> $theta\n#> \n#> 2.5% 25% 50% 75% 97.5%\n#> [1,] -1.6909041 5.879186 10.136104 15.343330 31.08250\n#> [2,] -4.6774314 3.880805 7.876314 11.616239 19.77814\n#> [3,] -10.9015495 2.149377 6.629543 10.872316 20.03882\n#> [4,] -5.3329649 3.779338 7.521149 11.663011 21.22172\n#> [5,] -9.2028941 1.035711 5.692159 9.501092 16.70603\n#> [6,] -8.6129198 2.068728 6.393843 10.333108 19.00727\n#> [7,] -0.1804168 6.033491 9.594300 13.980905 24.96505\n#> [8,] -7.4463975 4.015728 8.129632 12.694195 26.55720\n#> \n#> $lp__\n#> 2.5% 25% 50% 75% 97.5% \n#> -56.69300 -52.24867 -50.39536 -48.78482 -46.34192\n\n\n同理,可以计算最大值 max()、最小值 min() 和中位数 median() 等。\n\n35.1.4 采样诊断\n获取马尔科夫链迭代点列数据\n\neight_schools_sim <- extract(eight_schools_fit, permuted = FALSE)\n\neight_schools_sim 是一个三维数组,1000(次迭代)* 2 (条链)* 19(个参数)。如果 permuted = TRUE 则会合并马氏链的迭代结果,变成一个列表。\n\n# 数据类型\nclass(eight_schools_sim)\n\n#> [1] \"array\"\n\n# 1000(次迭代)* 2 (条链)* 19(个参数)\nstr(eight_schools_sim)\n\n#> num [1:1000, 1:2, 1:19] 14.18 2.19 12.29 12.51 5.02 ...\n#> - attr(*, \"dimnames\")=List of 3\n#> ..$ iterations: NULL\n#> ..$ chains : chr [1:2] \"chain:1\" \"chain:2\"\n#> ..$ parameters: chr [1:19] \"mu\" \"tau\" \"eta[1]\" \"eta[2]\" ...\n\n\n提取参数 \\(\\mu\\) 的迭代点列,绘制迭代轨迹。\n\neight_schools_mu_sim <- eight_schools_sim[, , \"mu\"]\nmatplot(\n eight_schools_mu_sim, xlab = \"迭代次数\", ylab = expression(mu),\n type = \"l\", lty = \"solid\", col = custom_colors\n)\nabline(h = apply(eight_schools_mu_sim, 2, mean), col = custom_colors)\nlegend(\n \"topleft\", 
legend = paste(\"chain\", 1:2), box.col = \"white\", \n inset = 0.01, lty = \"solid\", horiz = TRUE, col = custom_colors\n)\n\n\n\n\n\n\n图 35.1: Base R 绘制参数 \\(\\mu\\) 的迭代轨迹\n\n\n\n\n也可以使用 rstan 包提供的函数 traceplot() 或者 stan_trace() 绘制参数的迭代轨迹图。\n\nstan_trace(eight_schools_fit, pars = \"mu\") +\n labs(x = \"迭代次数\", y = expression(mu))\n\n\n\n\n\n\n图 35.2: rstan 绘制参数 \\(\\mu\\) 的迭代轨迹\n\n\n\n\n\n35.1.5 后验分布\n可以用函数 stan_hist() 或 stan_dens() 绘制后验分布图。下图分别展示参数 \\(\\mu\\)、\\(\\tau\\) 的直方图,以及二者的散点图,参数 \\(\\mu\\) 的后验概率密度分布图。\n\np1 <- stan_hist(eight_schools_fit, pars = c(\"mu\",\"tau\"), bins = 30)\np2 <- stan_scat(eight_schools_fit, pars = c(\"mu\",\"tau\"), size = 1) +\n labs(x = expression(mu), y = expression(tau))\np3 <- stan_dens(eight_schools_fit, pars = \"mu\") + labs(x = expression(mu))\nlibrary(patchwork)\np1 / (p2 + p3)\n\n\n\n\n\n\n图 35.3: rstan 包绘制后验分布图\n\n\n\n\n相比于 rstan 包,bayesplot 包可视化能力更强,支持对特定的参数做变换。bayesplot 包的函数 mcmc_pairs() 以矩阵图展示多个参数的分布,下图展示参数 \\(\\mu\\),\\(\\log(\\tau)\\) 后验分布图。但是,这些函数都固定了一些标题,不能修改。\n\nbayesplot::mcmc_pairs(\n eight_schools_fit, pars = c(\"mu\", \"tau\"), transform = list(tau = \"log\")\n)\n\n\n\n\n\n\n图 35.4: bayesplot 包绘制后验分布图", + "text": "35.1.1 拟合模型\n用 rstan 包来拟合模型,下面采用非中心的参数化表示,降低参数的相关性,减少发散的迭代次数,提高采样效率。\n\n# 编译模型\neight_schools_fit <- stan(\n model_name = \"eight_schools\",\n # file = \"code/eight_schools.stan\",\n model_code = \"\n // saved as eight_schools.stan\n data {\n int<lower=0> J; // number of schools\n array[J] real y; // estimated treatment effects\n array[J] real <lower=0> sigma; // standard error of effect estimates\n }\n parameters {\n real mu; // population treatment effect\n real<lower=0> tau; // standard deviation in treatment effects\n vector[J] eta; // unscaled deviation from mu by school\n }\n transformed parameters {\n vector[J] theta = mu + tau * eta; // school treatment effects\n }\n model {\n target += normal_lpdf(mu | 0, 100); \n target += normal_lpdf(tau | 0, 100);\n target += normal_lpdf(eta | 0, 1); // 
prior log-density\n target += normal_lpdf(y | theta, sigma); // log-likelihood\n }\n \",\n data = list( # 观测数据\n J = 8,\n y = c(28, 8, -3, 7, -1, 1, 18, 12),\n sigma = c(15, 10, 16, 11, 9, 11, 10, 18)\n ),\n warmup = 1000, # 每条链预处理迭代次数\n iter = 2000, # 每条链总迭代次数\n chains = 2, # 马尔科夫链的数目\n cores = 2, # 指定 CPU 核心数,可以给每条链分配一个\n verbose = FALSE, # 不显示迭代的中间过程\n refresh = 0, # 不显示采样的进度\n seed = 20232023 # 设置随机数种子,不要使用 set.seed() 函数\n)\n\n\n35.1.2 模型输出\n用函数 print() 打印输出结果,保留 2 位小数。\n\nprint(eight_schools_fit, digits = 2)\n\n#> Inference for Stan model: eight_schools.\n#> 2 chains, each with iter=2000; warmup=1000; thin=1; \n#> post-warmup draws per chain=1000, total post-warmup draws=2000.\n#> \n#> mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat\n#> mu 7.90 0.16 5.00 -1.79 4.60 7.71 11.05 18.37 988 1\n#> tau 6.43 0.20 5.35 0.22 2.49 5.23 8.94 19.73 733 1\n#> eta[1] 0.40 0.02 0.96 -1.57 -0.21 0.41 1.06 2.27 2252 1\n#> eta[2] 0.00 0.02 0.88 -1.74 -0.59 -0.01 0.57 1.77 1977 1\n#> eta[3] -0.17 0.02 0.92 -1.93 -0.78 -0.18 0.41 1.71 2307 1\n#> eta[4] -0.04 0.02 0.91 -1.90 -0.64 -0.01 0.55 1.76 2053 1\n#> eta[5] -0.34 0.02 0.89 -2.03 -0.97 -0.38 0.25 1.43 1769 1\n#> eta[6] -0.22 0.02 0.87 -1.89 -0.81 -0.23 0.36 1.52 1959 1\n#> eta[7] 0.33 0.02 0.86 -1.30 -0.26 0.31 0.90 2.00 2020 1\n#> eta[8] 0.05 0.02 0.96 -1.89 -0.59 0.05 0.69 1.94 2597 1\n#> theta[1] 11.29 0.20 8.11 -1.69 5.88 10.14 15.34 31.08 1728 1\n#> theta[2] 7.80 0.13 6.25 -4.68 3.88 7.88 11.62 19.78 2362 1\n#> theta[3] 6.30 0.17 7.57 -10.90 2.15 6.63 10.87 20.04 1902 1\n#> theta[4] 7.72 0.14 6.53 -5.33 3.78 7.52 11.66 21.22 2249 1\n#> theta[5] 5.10 0.14 6.48 -9.20 1.04 5.69 9.50 16.71 2030 1\n#> theta[6] 6.06 0.16 6.88 -8.61 2.07 6.39 10.33 19.01 1766 1\n#> theta[7] 10.41 0.14 6.42 -0.18 6.03 9.59 13.98 24.97 2057 1\n#> theta[8] 8.45 0.19 8.00 -7.45 4.02 8.13 12.69 26.56 1728 1\n#> lp__ -50.67 0.11 2.64 -56.69 -52.25 -50.40 -48.78 -46.34 584 1\n#> \n#> Samples were drawn using NUTS(diag_e) at Mon Feb 5 05:18:50 
2024.\n#> For each parameter, n_eff is a crude measure of effective sample size,\n#> and Rhat is the potential scale reduction factor on split chains (at \n#> convergence, Rhat=1).\n\n\n值得一提,数据有限而且规律不明确,数据隐含的信息不是很多,则先验分布的情况将会对参数估计结果产生很大影响。Stan 默认采用无信息的先验分布,当使用非常弱的信息先验时,结果就非常不同了。提取任意一个参数的结果,如查看参数 \\(\\tau\\) 的 95% 置信区间。\n\nprint(eight_schools_fit, pars = \"tau\", probs = c(0.025, 0.975))\n\n#> Inference for Stan model: eight_schools.\n#> 2 chains, each with iter=2000; warmup=1000; thin=1; \n#> post-warmup draws per chain=1000, total post-warmup draws=2000.\n#> \n#> mean se_mean sd 2.5% 97.5% n_eff Rhat\n#> tau 6.43 0.2 5.35 0.22 19.73 733 1\n#> \n#> Samples were drawn using NUTS(diag_e) at Mon Feb 5 05:18:50 2024.\n#> For each parameter, n_eff is a crude measure of effective sample size,\n#> and Rhat is the potential scale reduction factor on split chains (at \n#> convergence, Rhat=1).\n\n\n从迭代抽样数据获得与 print(fit) 一样的结果。以便后续对原始采样数据做任意的进一步分析。rstan 包扩展泛型函数 summary() 以支持对 stanfit 数据对象汇总,输出各个参数分链条和合并链条的后验分布结果。\n\n35.1.3 操作数据\n抽取数据对象 eight_schools_fit 中的采样数据,合并几条马氏链的结果,返回的结果是一个列表。\n\neight_schools_sim <- extract(eight_schools_fit, permuted = TRUE)\n\n返回列表中的每个元素是一个数组,标量参数对应一维数组,向量参数对应二维数组。\n\nstr(eight_schools_sim)\n\n#> List of 5\n#> $ mu : num [1:2000(1d)] 4.379 8.171 12.021 -0.284 7.584 ...\n#> ..- attr(*, \"dimnames\")=List of 1\n#> .. ..$ iterations: NULL\n#> $ tau : num [1:2000(1d)] 14.523 0.742 13.614 14.05 10.707 ...\n#> ..- attr(*, \"dimnames\")=List of 1\n#> .. ..$ iterations: NULL\n#> $ eta : num [1:2000, 1:8] 0.8887 1.7902 -0.3404 1.1472 0.0448 ...\n#> ..- attr(*, \"dimnames\")=List of 2\n#> .. ..$ iterations: NULL\n#> .. ..$ : NULL\n#> $ theta: num [1:2000, 1:8] 17.29 9.5 7.39 15.83 8.06 ...\n#> ..- attr(*, \"dimnames\")=List of 2\n#> .. ..$ iterations: NULL\n#> .. ..$ : NULL\n#> $ lp__ : num [1:2000(1d)] -48.4 -51.8 -49.8 -46.1 -49 ...\n#> ..- attr(*, \"dimnames\")=List of 1\n#> .. 
..$ iterations: NULL\n\n\n对于列表,适合用函数 lapply() 配合算术函数计算 \\(\\mu,\\tau\\) 等参数的均值。\n\nfun_mean <- function(x) {\n if (length(dim(x)) > 1) {\n apply(x, 2, mean)\n } else {\n mean(x)\n }\n}\nlapply(eight_schools_sim, FUN = fun_mean)\n\n#> $mu\n#> [1] 7.896911\n#> \n#> $tau\n#> [1] 6.427487\n#> \n#> $eta\n#> [1] 0.39815800 -0.00403665 -0.17091492 -0.03835530 -0.34447579 -0.21592391\n#> [7] 0.33375651 0.04527884\n#> \n#> $theta\n#> [1] 11.293515 7.796730 6.300619 7.722628 5.100476 6.059899 10.411386\n#> [8] 8.451163\n#> \n#> $lp__\n#> [1] -50.66637\n\n\n类似地,计算 \\(\\mu,\\tau\\) 等参数的分位点。\n\nfun_quantile <- function(x, probs) {\n if (length(dim(x)) > 1) {\n t(apply(x, 2, quantile, probs = probs))\n } else {\n quantile(x, probs = probs)\n }\n}\nlapply(eight_schools_sim, fun_quantile, probs = c(2.5, 25, 50, 75, 97.5) / 100)\n\n#> $mu\n#> 2.5% 25% 50% 75% 97.5% \n#> -1.787025 4.603868 7.706957 11.054801 18.370903 \n#> \n#> $tau\n#> 2.5% 25% 50% 75% 97.5% \n#> 0.2221582 2.4933088 5.2289974 8.9369194 19.7296019 \n#> \n#> $eta\n#> \n#> 2.5% 25% 50% 75% 97.5%\n#> [1,] -1.571814 -0.2051970 0.412478990 1.0592655 2.267580\n#> [2,] -1.740860 -0.5924433 -0.009587292 0.5740810 1.768344\n#> [3,] -1.933032 -0.7815080 -0.181728735 0.4102823 1.709490\n#> [4,] -1.896206 -0.6383988 -0.008634091 0.5491973 1.758581\n#> [5,] -2.029051 -0.9674769 -0.376669547 0.2505514 1.425229\n#> [6,] -1.890733 -0.8146685 -0.227713876 0.3642582 1.524509\n#> [7,] -1.295196 -0.2552945 0.310815774 0.9046577 1.995717\n#> [8,] -1.894366 -0.5853674 0.053723000 0.6912584 1.940257\n#> \n#> $theta\n#> \n#> 2.5% 25% 50% 75% 97.5%\n#> [1,] -1.6909041 5.879186 10.136104 15.343330 31.08250\n#> [2,] -4.6774314 3.880805 7.876314 11.616239 19.77814\n#> [3,] -10.9015495 2.149377 6.629543 10.872316 20.03882\n#> [4,] -5.3329649 3.779338 7.521149 11.663011 21.22172\n#> [5,] -9.2028941 1.035711 5.692159 9.501092 16.70603\n#> [6,] -8.6129198 2.068728 6.393843 10.333108 19.00727\n#> [7,] -0.1804168 6.033491 9.594300 13.980905 
24.96505\n#> [8,] -7.4463975 4.015728 8.129632 12.694195 26.55720\n#> \n#> $lp__\n#> 2.5% 25% 50% 75% 97.5% \n#> -56.69300 -52.24867 -50.39536 -48.78482 -46.34192\n\n\n同理,可以计算最大值 max()、最小值 min() 和中位数 median() 等。\n\n35.1.4 采样诊断\n获取马尔科夫链迭代点列数据\n\neight_schools_sim <- extract(eight_schools_fit, permuted = FALSE)\n\neight_schools_sim 是一个三维数组,1000(次迭代)* 2 (条链)* 19(个参数)。如果 permuted = TRUE 则会合并马氏链的迭代结果,变成一个列表。\n\n# 数据类型\nclass(eight_schools_sim)\n\n#> [1] \"array\"\n\n# 1000(次迭代)* 2 (条链)* 19(个参数)\nstr(eight_schools_sim)\n\n#> num [1:1000, 1:2, 1:19] 14.18 2.19 12.29 12.51 5.02 ...\n#> - attr(*, \"dimnames\")=List of 3\n#> ..$ iterations: NULL\n#> ..$ chains : chr [1:2] \"chain:1\" \"chain:2\"\n#> ..$ parameters: chr [1:19] \"mu\" \"tau\" \"eta[1]\" \"eta[2]\" ...\n\n\n提取参数 \\(\\mu\\) 的迭代点列,绘制迭代轨迹。\n\neight_schools_mu_sim <- eight_schools_sim[, , \"mu\"]\nmatplot(\n eight_schools_mu_sim, xlab = \"迭代次数\", ylab = expression(mu),\n type = \"l\", lty = \"solid\", col = custom_colors\n)\nabline(h = apply(eight_schools_mu_sim, 2, mean), col = custom_colors)\nlegend(\n \"topleft\", legend = paste(\"chain\", 1:2), box.col = \"white\", \n inset = 0.01, lty = \"solid\", horiz = TRUE, col = custom_colors\n)\n\n\n\n\n\n\n图 35.1: Base R 绘制参数 \\(\\mu\\) 的迭代轨迹\n\n\n\n\n也可以使用 rstan 包提供的函数 traceplot() 或者 stan_trace() 绘制参数的迭代轨迹图。\n\nstan_trace(eight_schools_fit, pars = \"mu\") +\n labs(x = \"迭代次数\", y = expression(mu))\n\n\n\n\n\n\n图 35.2: rstan 绘制参数 \\(\\mu\\) 的迭代轨迹\n\n\n\n\n\n35.1.5 后验分布\n可以用函数 stan_hist() 或 stan_dens() 绘制后验分布图。下图分别展示参数 \\(\\mu\\)、\\(\\tau\\) 的直方图,以及二者的散点图,参数 \\(\\mu\\) 的后验概率密度分布图。\n\np1 <- stan_hist(eight_schools_fit, pars = c(\"mu\",\"tau\"), bins = 30)\np2 <- stan_scat(eight_schools_fit, pars = c(\"mu\",\"tau\"), size = 1) +\n labs(x = expression(mu), y = expression(tau))\np3 <- stan_dens(eight_schools_fit, pars = \"mu\") + labs(x = expression(mu))\nlibrary(patchwork)\np1 / (p2 + p3)\n\n\n\n\n\n\n图 35.3: rstan 包绘制后验分布图\n\n\n\n\n相比于 rstan 包,bayesplot 
包可视化能力更强,支持对特定的参数做变换。bayesplot 包的函数 mcmc_pairs() 以矩阵图展示多个参数的分布,下图展示参数 \\(\\mu\\),\\(\\log(\\tau)\\) 后验分布图。但是,这些函数都固定了一些标题,不能修改。\n\nbayesplot::mcmc_pairs(\n eight_schools_fit, pars = c(\"mu\", \"tau\"), transform = list(tau = \"log\")\n)\n\n\n\n\n\n\n图 35.4: bayesplot 包绘制后验分布图", "crumbs": [ "贝叶斯建模", "35  分层正态模型" @@ -2356,7 +2356,7 @@ "href": "hierarchical-normal-models.html#sec-rats-bayesianism", "title": "35  分层正态模型", "section": "\n35.5 贝叶斯方法", - "text": "35.5 贝叶斯方法\n\n35.5.1 rstan\n初始化模型参数,设置采样算法的参数。\n\n# 迭代链\nchains <- 4\n# 迭代次数\niter <- 1000\n# 初始值\ninit <- rep(list(list(\n alpha = rep(250, 30), beta = rep(6, 30),\n alpha_c = 150, beta_c = 10,\n tausq_c = 1, tausq_alpha = 1,\n tausq_beta = 1\n)), chains)\n\n接下来,基于重复测量数据,建立线性生长曲线模型:\n\\[\n\\begin{aligned}\n\\alpha_c &\\sim \\mathcal{N}(0,100) \\quad \\beta_c \\sim \\mathcal{N}(0,100) \\\\\n\\tau^2_{\\alpha} &\\sim \\mathrm{inv\\_gamma}(0.001, 0.001) \\\\\n\\tau^2_{\\beta} &\\sim \\mathrm{inv\\_gamma}(0.001, 0.001) \\\\\n\\tau^2_c &\\sim \\mathrm{inv\\_gamma}(0.001, 0.001) \\\\\n\\alpha_n &\\sim \\mathcal{N}(\\alpha_c, \\tau_{\\alpha}) \\quad\n\\beta_n \\sim \\mathcal{N}(\\beta_c, \\tau_{\\beta}) \\\\\ny_{nt} &\\sim \\mathcal{N}(\\alpha_n + \\beta_n * (x_t - \\bar{x}), \\tau_c) \\\\\n& n = 1,2,\\ldots,N \\quad t = 1,2,\\ldots,T\n\\end{aligned}\n\\]\n其中, \\(\\alpha_c,\\beta_c,\\tau_c,\\tau_{\\alpha},\\tau_{\\beta}\\) 为无信息先验,\\(\\bar{x} = 22\\) 表示第 22 天,\\(N = 30\\) 和 \\(T = 5\\) 分别表示实验中的小鼠数量和测量次数,下面采用 Stan 编码、编译、采样和拟合模型。\n\nrats_fit <- stan(\n model_name = \"rats\",\n model_code = \"\n data {\n int<lower=0> N;\n int<lower=0> T;\n vector[T] x;\n matrix[N,T] y;\n real xbar;\n }\n parameters {\n vector[N] alpha;\n vector[N] beta;\n\n real alpha_c;\n real beta_c; // beta.c in original bugs model\n\n real<lower=0> tausq_c;\n real<lower=0> tausq_alpha;\n real<lower=0> tausq_beta;\n }\n transformed parameters {\n real<lower=0> tau_c; // sigma in original bugs model\n real<lower=0> tau_alpha;\n real<lower=0> tau_beta;\n\n 
tau_c = sqrt(tausq_c);\n tau_alpha = sqrt(tausq_alpha);\n tau_beta = sqrt(tausq_beta);\n }\n model {\n alpha_c ~ normal(0, 100);\n beta_c ~ normal(0, 100);\n tausq_c ~ inv_gamma(0.001, 0.001);\n tausq_alpha ~ inv_gamma(0.001, 0.001);\n tausq_beta ~ inv_gamma(0.001, 0.001);\n alpha ~ normal(alpha_c, tau_alpha); // vectorized\n beta ~ normal(beta_c, tau_beta); // vectorized\n for (n in 1:N)\n for (t in 1:T)\n y[n,t] ~ normal(alpha[n] + beta[n] * (x[t] - xbar), tau_c);\n }\n generated quantities {\n real alpha0;\n alpha0 = alpha_c - xbar * beta_c;\n }\n \",\n data = list(N = N, T = T, y = y, x = x, xbar = xbar),\n chains = chains, init = init, iter = iter, \n verbose = FALSE, refresh = 0, seed = 20190425\n)\n\n模型输出结果如下:\n\nprint(rats_fit, pars = c(\"alpha\", \"beta\"), include = FALSE, digits = 1)\n\n#> Inference for Stan model: rats.\n#> 4 chains, each with iter=1000; warmup=500; thin=1; \n#> post-warmup draws per chain=500, total post-warmup draws=2000.\n#> \n#> mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat\n#> alpha_c 242.5 0.1 2.7 237.1 240.6 242.5 244.3 247.7 1728 1\n#> beta_c 6.2 0.0 0.1 6.0 6.1 6.2 6.3 6.4 2205 1\n#> tausq_c 37.0 0.2 5.8 27.6 32.9 36.4 40.6 50.0 947 1\n#> tausq_alpha 218.0 1.6 64.1 125.3 172.0 207.8 253.0 372.8 1703 1\n#> tausq_beta 0.3 0.0 0.1 0.1 0.2 0.3 0.3 0.5 1481 1\n#> tau_c 6.1 0.0 0.5 5.3 5.7 6.0 6.4 7.1 938 1\n#> tau_alpha 14.6 0.0 2.1 11.2 13.1 14.4 15.9 19.3 1826 1\n#> tau_beta 0.5 0.0 0.1 0.4 0.5 0.5 0.6 0.7 1429 1\n#> alpha0 106.3 0.1 3.6 99.5 103.9 106.4 108.8 113.2 1965 1\n#> lp__ -438.0 0.3 7.0 -452.8 -442.6 -437.4 -433.2 -425.4 558 1\n#> \n#> Samples were drawn using NUTS(diag_e) at Thu Feb 1 06:34:04 2024.\n#> For each parameter, n_eff is a crude measure of effective sample size,\n#> and Rhat is the potential scale reduction factor on split chains (at \n#> convergence, Rhat=1).\n\n\nalpha_c 表示小鼠 5 次测量的平均重量,beta_c 表示小鼠体重的增长率,\\(\\alpha_i,\\beta_i\\) 分别表示第 \\(i\\) 只小鼠在第 22 天(第 3 次测量或 \\(x_t = \\bar{x}\\) 
)的重量和增长率(每日增加的重量)。\n对于分量众多的参数向量,比较适合用岭线图展示后验分布,下面调用 bayesplot 包绘制参数向量 \\(\\boldsymbol{\\alpha},\\boldsymbol{\\beta}\\) 的后验分布。\n\n# plot(rats_fit, pars = \"alpha\", show_density = TRUE, ci_level = 0.8, outer_level = 0.95)\nbayesplot::mcmc_areas_ridges(rats_fit, pars = paste0(\"alpha\", \"[\", 1:30, \"]\")) +\n scale_y_discrete(labels = scales::parse_format()) \n\n\n\n\n\n\n图 35.8: 参数 \\(\\boldsymbol{\\alpha}\\) 的后验分布\n\n\n\n\n参数向量 \\(\\boldsymbol{\\alpha}\\) 的后验估计可以看作 \\(x_t = \\bar{x}\\) 时小鼠的重量,上图即为各个小鼠重量的后验分布。\n\n# plot(rats_fit, pars = \"beta\", ci_level = 0.8, outer_level = 0.95)\nbayesplot::mcmc_areas_ridges(rats_fit, pars = paste0(\"beta\", \"[\", 1:30, \"]\")) +\n scale_y_discrete(labels = scales::parse_format()) \n\n\n\n\n\n\n图 35.9: 参数 \\(\\boldsymbol{\\beta}\\) 的后验分布\n\n\n\n\n参数向量 \\(\\boldsymbol{\\beta}\\) 的后验估计可以看作是小鼠的重量的增长率,上图即为各个小鼠重量的增长率的后验分布。\n\n35.5.2 cmdstanr\n从 rstan 包转 cmdstanr 包是非常容易的,只要语法兼容,模型代码可以原封不动。\n\nlibrary(cmdstanr)\nmod_rats <- cmdstan_model(\n stan_file = \"code/rats.stan\",\n compile = TRUE, cpp_options = list(stan_threads = TRUE)\n)\nfit_rats <- mod_rats$sample(\n data = list(N = N, T = T, y = y, x = x, xbar = xbar), # 数据\n chains = 2, # 总链条数\n parallel_chains = 2, # 并行数目\n iter_warmup = 1000, # 每条链预处理的迭代次数\n iter_sampling = 1000, # 每条链采样的迭代次数\n threads_per_chain = 2, # 每条链设置 2 个线程\n seed = 20232023, # 随机数种子\n show_messages = FALSE, # 不显示消息\n adapt_delta = 0.9, # 接受率\n refresh = 0 # 不显示采样迭代的进度\n)\n\n模型输出\n\n# 显示除了参数 alpha 和 beta 以外的结果\nvars <- setdiff(fit_rats$metadata()$stan_variables, c(\"alpha\", \"beta\"))\nfit_rats$summary(variables = vars)\n\n#> # A tibble: 10 × 10\n#> variable mean median sd mad q5 q95 rhat ess_bulk\n#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>\n#> 1 lp__ -438. -438. 6.87 6.83 -450. -427. 1.01 586.\n#> 2 alpha_c 243. 242. 2.79 2.70 238. 247. 1.00 3528.\n#> 3 beta_c 6.18 6.18 0.106 0.107 6.01 6.36 1.00 3494.\n#> 4 tausq_c 37.4 36.8 5.57 5.49 28.9 47.1 1.00 1720.\n#> 5 tausq_alp… 217. 208. 
63.5 57.7 134. 335. 1.00 3104.\n#> 6 tausq_beta 0.275 0.259 0.0975 0.0884 0.148 0.457 0.999 2070.\n#> 7 tau_c 6.10 6.07 0.451 0.453 5.38 6.86 1.00 1720.\n#> 8 tau_alpha 14.6 14.4 2.06 2.00 11.6 18.3 1.00 3104.\n#> 9 tau_beta 0.517 0.509 0.0896 0.0882 0.385 0.676 0.999 2070.\n#> 10 alpha0 106. 107. 3.63 3.68 100. 112. 0.999 3565.\n#> # ℹ 1 more variable: ess_tail <dbl>\n\n\n诊断信息\n\nfit_rats$diagnostic_summary()\n\n#> $num_divergent\n#> [1] 0 0\n#> \n#> $num_max_treedepth\n#> [1] 0 0\n#> \n#> $ebfmi\n#> [1] 0.8995806 0.8828313\n\n\n\n35.5.3 brms\nbrms 包是基于 rstan 包的,基于 Stan 语言做贝叶斯推断,提供与 lme4 包一致的公式语法,且扩展了模型种类。\n\nrats_brms <- brms::brm(weight ~ days + (days | rats), data = rats_data)\nsummary(rats_brms)\n\n Family: gaussian \n Links: mu = identity; sigma = identity \nFormula: weight ~ days + (days | rats) \n Data: rats_data (Number of observations: 150) \n Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;\n total post-warmup draws = 4000\n\nGroup-Level Effects: \n~rats (Number of levels: 30) \n Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS\nsd(Intercept) 11.27 2.23 7.36 16.08 1.00 2172 2939\nsd(days) 0.54 0.09 0.37 0.74 1.00 1380 2356\ncor(Intercept,days) -0.11 0.24 -0.53 0.39 1.00 920 1541\n\nPopulation-Level Effects: \n Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS\nIntercept 106.47 2.47 101.61 111.23 1.00 2173 2768\ndays 6.18 0.11 5.96 6.41 1.00 1617 2177\n\nFamily Specific Parameters: \n Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS\nsigma 6.15 0.47 5.30 7.14 1.00 1832 3151\n\nDraws were sampled using sampling(NUTS). 
For each parameter, Bulk_ESS\nand Tail_ESS are effective sample size measures, and Rhat is the potential\nscale reduction factor on split chains (at convergence, Rhat = 1).\n\n35.5.4 rstanarm\nrstanarm 包与 brms 包类似,区别是前者预编译了 Stan 模型,后者根据输入数据和模型编译即时编译,此外,后者支持的模型范围更加广泛。\n\nlibrary(rstanarm)\nrats_rstanarm <- stan_lmer(formula = weight ~ days + (days | rats), data = rats_data)\nsummary(rats_rstanarm)\n\nModel Info:\n function: stan_lmer\n family: gaussian [identity]\n formula: weight ~ days + (days | rats)\n algorithm: sampling\n sample: 4000 (posterior sample size)\n priors: see help('prior_summary')\n observations: 150\n groups: rats (30)\n\nEstimates:\n mean sd 10% 50% 90% \n(Intercept) 106.575 2.236 103.789 106.559 109.415\ndays 6.187 0.111 6.048 6.185 6.329\nsigma 6.219 0.497 5.626 6.183 6.862\nSigma[rats:(Intercept),(Intercept)] 103.927 42.705 57.329 98.128 159.086\nSigma[rats:days,(Intercept)] -0.545 1.492 -2.361 -0.402 1.162\nSigma[rats:days,days] 0.304 0.112 0.181 0.285 0.445\n\nMCMC diagnostics\n mcse Rhat n_eff\n(Intercept) 0.043 1.000 2753 \ndays 0.003 1.005 1694 \nsigma 0.015 1.001 1172 \nSigma[rats:(Intercept),(Intercept)] 1.140 1.000 1403 \nSigma[rats:days,(Intercept)] 0.054 1.006 772 \nSigma[rats:days,days] 0.003 1.000 1456 \n\nFor each parameter, mcse is Monte Carlo standard error, \nn_eff is a crude measure of effective sample size, \nand Rhat is the potential scale reduction factor \non split chains (at convergence Rhat=1).\n固定效应的部分,截距和斜率如下:\nEstimates:\n mean sd 10% 50% 90% \n(Intercept) 106.575 2.236 103.789 106.559 109.415\ndays 6.187 0.111 6.048 6.185 6.329\n模型残差的标准差 sigma、随机效应 Sigma 的随机截距的方差 103.927 、随机斜率的方差 0.304 及其协方差 -0.545。\nsigma 6.219 0.497 5.626 6.183 6.862\nSigma[rats:(Intercept),(Intercept)] 103.927 42.705 57.329 98.128 159.086\nSigma[rats:days,(Intercept)] -0.545 1.492 -2.361 -0.402 1.162\nSigma[rats:days,days] 0.304 0.112 0.181 0.285 0.445\nrstanarm 和 brms 包的结果基本一致的。\n\n35.5.5 blme\nblme 包 (Chung 等 2013) 基于 lme4 包 (Bates 等 2015) 
拟合贝叶斯线性混合效应模型。参考前面 rstan 小节中关于模型参数的先验设置,下面将残差方差的先验设置为逆伽马分布,随机效应的协方差设置为扁平分布。发现拟合结果和 nlme 和 lme4 包的几乎一样。\n\nrats_blme <- blme::blmer(\n weight ~ days + (days | rats), data = rats_data,\n resid.prior = invgamma, cov.prior = NULL\n)\nsummary(rats_blme)\n\n#> Resid prior: invgamma(shape = 0, scale = 0, posterior.scale = var)\n#> Prior dev : 7.1328\n#> \n#> Linear mixed model fit by REML ['blmerMod']\n#> Formula: weight ~ days + (days | rats)\n#> Data: rats_data\n#> \n#> REML criterion at convergence: 1095.4\n#> \n#> Scaled residuals: \n#> Min 1Q Median 3Q Max \n#> -2.6697 -0.5440 0.1202 0.4968 2.6317 \n#> \n#> Random effects:\n#> Groups Name Variance Std.Dev. Corr \n#> rats (Intercept) 116.3517 10.7866 \n#> days 0.2623 0.5121 -0.16\n#> Residual 35.3891 5.9489 \n#> Number of obs: 150, groups: rats, 30\n#> \n#> Fixed effects:\n#> Estimate Std. Error t value\n#> (Intercept) 106.5676 2.2977 46.38\n#> days 6.1857 0.1056 58.58\n#> \n#> Correlation of Fixed Effects:\n#> (Intr)\n#> days -0.343\n\n\n与 lme4 包的函数 lmer() 所不同的是参数 resid.prior 、fixef.prior 和 cov.prior ,它们设置参数的先验分布,其它参数的含义同 lme4 包。resid.prior = invgamma 表示残差方差参数使用逆伽马分布,cov.prior = NULL 表示随机效应的协方差参数使用扁平先验 flat priors。\n\n35.5.6 rjags\nrjags (Plummer 2021) 是 JAGS 软件的 R 语言接口,可以拟合分层正态模型,再借助 coda 包 (Plummer 等 2006) 可以分析 JAGS 返回的各项数据。\nJAGS 代码和 Stan 代码有不少相似之处,最大的共同点在于以直观的统计模型的符号表示编码模型,仿照 Stan 代码, JAGS 编码的模型(BUGS 代码)如下:\nmodel {\n alpha_c ~ dnorm(0, 1.0E-4);\n beta_c ~ dnorm(0, 1.0E-4);\n \n tau_c ~ dgamma(0.001, 0.001);\n tau_alpha ~ dgamma(0.001, 0.001);\n tau_beta ~ dgamma(0.001, 0.001);\n\n sigma_c <- 1.0 / sqrt(tau_c);\n sigma_alpha <- 1.0 / sqrt(tau_alpha);\n sigma_beta <- 1.0 / sqrt(tau_beta);\n \n for (n in 1:N){\n alpha[n] ~ dnorm(alpha_c, tau_alpha); \n beta[n] ~ dnorm(beta_c, tau_beta);\n for (t in 1:T) {\n y[n,t] ~ dnorm(alpha[n] + beta[n] * (x[t] - xbar), tau_c);\n }\n }\n}\n转化主要集中在模型块,注意二者概率分布的名称以及参数含义对应关系,JAGS 使用 precision 而不是 standard deviation or variance,比如正态分布中的方差(标准偏差)被替换为其倒数。JAGS 
可以省略类型声明(初始化模型时会补上),最后,JAGS 不支持 Stan 中的向量化操作,这种新特性是独特的。\n\nlibrary(rjags)\n# 初始值\nrats_inits <- list(\n list(\".RNG.name\" = \"base::Marsaglia-Multicarry\", \n \".RNG.seed\" = 20222022, \n \"alpha_c\" = 100, \"beta_c\" = 6, \"tau_c\" = 5, \"tau_alpha\" = 10, \"tau_beta\" = 0.5),\n list(\".RNG.name\" = \"base::Marsaglia-Multicarry\", \n \".RNG.seed\" = 20232023, \n \"alpha_c\" = 200, \"beta_c\" = 10, \"tau_c\" = 15, \"tau_alpha\" = 15, \"tau_beta\" = 1)\n)\n# 模型\nrats_model <- jags.model(\n file = \"code/rats.bugs\",\n data = list(x = x, y = y, N = 30, T = 5, xbar = 22.0),\n inits = rats_inits, \n n.chains = 2, quiet = TRUE\n)\n# burn-in\nupdate(rats_model, n.iter = 2000)\n# 抽样\nrats_samples <- coda.samples(rats_model,\n variable.names = c(\"alpha_c\", \"beta_c\", \"sigma_alpha\", \"sigma_beta\", \"sigma_c\"),\n n.iter = 4000, thin = 1\n)\n# 参数的后验估计\nsummary(rats_samples)\n\n#> \n#> Iterations = 2001:6000\n#> Thinning interval = 1 \n#> Number of chains = 2 \n#> Sample size per chain = 4000 \n#> \n#> 1. Empirical mean and standard deviation for each variable,\n#> plus standard error of the mean:\n#> \n#> Mean SD Naive SE Time-series SE\n#> alpha_c 242.4752 2.72749 0.030494 0.031571\n#> beta_c 6.1878 0.10798 0.001207 0.001481\n#> sigma_alpha 14.6233 2.05688 0.022997 0.025070\n#> sigma_beta 0.5176 0.09266 0.001036 0.001741\n#> sigma_c 6.0731 0.46425 0.005191 0.007984\n#> \n#> 2. 
Quantiles for each variable:\n#> \n#> 2.5% 25% 50% 75% 97.5%\n#> alpha_c 237.0333 240.6832 242.5024 244.2965 247.7816\n#> beta_c 5.9785 6.1150 6.1867 6.2593 6.4035\n#> sigma_alpha 11.1840 13.1802 14.4152 15.8340 19.2429\n#> sigma_beta 0.3571 0.4538 0.5098 0.5734 0.7187\n#> sigma_c 5.2384 5.7479 6.0455 6.3803 7.0413\n\n\n输出结果与 rstan 十分一致,且采样速度极快。类似地,alpha0 = alpha_c - xbar * beta_c 可得 alpha0 = 242.4752 - 22 * 6.1878 = 106.3436。\n\n35.5.7 MCMCglmm\n同前,先考虑变截距的混合效应模型,MCMCglmm 包 (Hadfield 2010) 给出的拟合结果与 nlme 包很接近。\n\n## 变截距模型\nprior1 <- list(\n R = list(V = 1, nu = 0.002),\n G = list(G1 = list(V = 1, nu = 0.002))\n)\nset.seed(20232023)\nrats_mcmc1 <- MCMCglmm::MCMCglmm(\n weight ~ days, random = ~ rats,\n data = rats_data, verbose = FALSE, prior = prior1\n)\nsummary(rats_mcmc1)\n\n#> \n#> Iterations = 3001:12991\n#> Thinning interval = 10\n#> Sample size = 1000 \n#> \n#> DIC: 1088.71 \n#> \n#> G-structure: ~rats\n#> \n#> post.mean l-95% CI u-95% CI eff.samp\n#> rats 213 108.4 336.4 1000\n#> \n#> R-structure: ~units\n#> \n#> post.mean l-95% CI u-95% CI eff.samp\n#> units 68.58 50.63 86.58 1000\n#> \n#> Location effects: weight ~ days \n#> \n#> post.mean l-95% CI u-95% CI eff.samp pMCMC \n#> (Intercept) 106.568 100.464 112.897 1000 <0.001 ***\n#> days 6.185 6.051 6.315 1000 <0.001 ***\n#> ---\n#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 
0.1 ' ' 1\n\n\n随机效应的方差(组间方差)为 211.4 ,则标准差为 14.539。残差方差(组内方差)为 68.77,则标准差为 8.293。\n再考虑变截距和斜率的混合效应模型。\n\n## 变截距、变斜率模型\nprior2 <- list(\n R = list(V = 1, nu = 0.002),\n G = list(G1 = list(V = diag(2), nu = 0.002))\n)\nset.seed(20232023)\nrats_mcmc2 <- MCMCglmm::MCMCglmm(weight ~ days,\n random = ~ us(1 + days):rats,\n data = rats_data, verbose = FALSE, prior = prior2\n)\nsummary(rats_mcmc2)\n\n#> \n#> Iterations = 3001:12991\n#> Thinning interval = 10\n#> Sample size = 1000 \n#> \n#> DIC: 1018.746 \n#> \n#> G-structure: ~us(1 + days):rats\n#> \n#> post.mean l-95% CI u-95% CI eff.samp\n#> (Intercept):(Intercept).rats 124.1327 41.5313 226.059 847.2\n#> days:(Intercept).rats -0.7457 -4.3090 2.571 896.6\n#> (Intercept):days.rats -0.7457 -4.3090 2.571 896.6\n#> days:days.rats 0.2783 0.1067 0.493 786.9\n#> \n#> R-structure: ~units\n#> \n#> post.mean l-95% CI u-95% CI eff.samp\n#> units 38.14 27.07 51.08 1000\n#> \n#> Location effects: weight ~ days \n#> \n#> post.mean l-95% CI u-95% CI eff.samp pMCMC \n#> (Intercept) 106.40 101.70 110.78 823.3 <0.001 ***\n#> days 6.19 5.99 6.41 963.4 <0.001 ***\n#> ---\n#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 
0.1 ' ' 1\n\n\nG-structure 代表随机效应部分,R-structure 代表残差效应部分,Location effects 代表固定效应部分。MCMCglmm 包的这套模型表示术语源自商业软件 ASReml 。\n随机截距的方差为 124.1327,标准差为 11.1415,随机斜率的方差 0.2783,标准差为 0.5275,随机截距和随机斜率的协方差 -0.7457,相关系数为 -0.1268,这与 nlme 包结果很接近。\n\n35.5.8 INLA\n同前,先考虑变截距的混合效应模型。\n\nlibrary(INLA)\ninla.setOption(short.summary = TRUE)\n# 数值稳定性考虑\nrats_data$weight <- rats_data$weight / 400\n# 变截距\nrats_inla1 <- inla(weight ~ days + f(rats, model = \"iid\", n = 30), \n family = \"gaussian\", data = rats_data)\n# 输出结果\nsummary(rats_inla1)\n\n#> Fixed effects:\n#> mean sd 0.025quant 0.5quant 0.975quant mode kld\n#> (Intercept) 0.266 0.008 0.252 0.266 0.281 0.266 0\n#> days 0.015 0.000 0.015 0.015 0.016 0.015 0\n#> \n#> Model hyperparameters:\n#> mean sd 0.025quant 0.5quant\n#> Precision for the Gaussian observations 2414.80 311.28 1852.68 2397.51\n#> Precision for rats 888.43 244.91 495.43 858.84\n#> 0.975quant mode\n#> Precision for the Gaussian observations 3076.02 2368.15\n#> Precision for rats 1451.76 804.50\n#> \n#> is computed\n\n\n再考虑变截距和斜率的混合效应模型。\n\n# https://inla.r-inla-download.org/r-inla.org/doc/latent/iid.pdf\n# 二维高斯随机效应的先验为 Wishart prior\nrats_data$rats <- as.integer(rats_data$rats)\nrats_data$slopeid <- 30 + rats_data$rats\n# 变截距、变斜率\nrats_inla2 <- inla(\n weight ~ 1 + days + f(rats, model = \"iid2d\", n = 2 * 30) + f(slopeid, days, copy = \"rats\"),\n data = rats_data, family = \"gaussian\"\n)\n# 输出结果\nsummary(rats_inla2)\n\n#> Fixed effects:\n#> mean sd 0.025quant 0.5quant 0.975quant mode kld\n#> (Intercept) 0.266 0.034 0.20 0.266 0.333 0.266 0\n#> days 0.015 0.033 -0.05 0.015 0.080 0.015 0\n#> \n#> Model hyperparameters:\n#> mean sd 0.025quant 0.5quant\n#> Precision for the Gaussian observations 4522.632 666.547 3334.879 4480.318\n#> Precision for rats (component 1) 32.097 7.956 18.939 31.260\n#> Precision for rats (component 2) 33.234 8.227 19.603 32.377\n#> Rho1:2 for rats -0.001 0.172 -0.335 -0.001\n#> 0.975quant mode\n#> Precision for the Gaussian observations 
5952.967 4407.841\n#> Precision for rats (component 1) 50.049 29.769\n#> Precision for rats (component 2) 51.773 30.859\n#> Rho1:2 for rats 0.334 -0.001\n#> \n#> is computed\n\n\n\n\n\n\n\n\n警告\n\n\n\n对于变截距和斜率混合效应模型,还未完全弄清楚 INLA 包的输出结果。固定效应部分和残差部分都是和前面一致的,但不清楚随机效应的方差协方差矩阵的估计与 INLA 输出的对应关系。参考《Bayesian inference with INLA》第 3 章第 3 小节。", + "text": "35.5 贝叶斯方法\n\n35.5.1 rstan\n初始化模型参数,设置采样算法的参数。\n\n# 迭代链\nchains <- 4\n# 迭代次数\niter <- 1000\n# 初始值\ninit <- rep(list(list(\n alpha = rep(250, 30), beta = rep(6, 30),\n alpha_c = 150, beta_c = 10,\n tausq_c = 1, tausq_alpha = 1,\n tausq_beta = 1\n)), chains)\n\n接下来,基于重复测量数据,建立线性生长曲线模型:\n\\[\n\\begin{aligned}\n\\alpha_c &\\sim \\mathcal{N}(0,100) \\quad \\beta_c \\sim \\mathcal{N}(0,100) \\\\\n\\tau^2_{\\alpha} &\\sim \\mathrm{inv\\_gamma}(0.001, 0.001) \\\\\n\\tau^2_{\\beta} &\\sim \\mathrm{inv\\_gamma}(0.001, 0.001) \\\\\n\\tau^2_c &\\sim \\mathrm{inv\\_gamma}(0.001, 0.001) \\\\\n\\alpha_n &\\sim \\mathcal{N}(\\alpha_c, \\tau_{\\alpha}) \\quad\n\\beta_n \\sim \\mathcal{N}(\\beta_c, \\tau_{\\beta}) \\\\\ny_{nt} &\\sim \\mathcal{N}(\\alpha_n + \\beta_n * (x_t - \\bar{x}), \\tau_c) \\\\\n& n = 1,2,\\ldots,N \\quad t = 1,2,\\ldots,T\n\\end{aligned}\n\\]\n其中, \\(\\alpha_c,\\beta_c,\\tau_c,\\tau_{\\alpha},\\tau_{\\beta}\\) 为无信息先验,\\(\\bar{x} = 22\\) 表示第 22 天,\\(N = 30\\) 和 \\(T = 5\\) 分别表示实验中的小鼠数量和测量次数,下面采用 Stan 编码、编译、采样和拟合模型。\n\nrats_fit <- stan(\n model_name = \"rats\",\n model_code = \"\n data {\n int<lower=0> N;\n int<lower=0> T;\n vector[T] x;\n matrix[N,T] y;\n real xbar;\n }\n parameters {\n vector[N] alpha;\n vector[N] beta;\n\n real alpha_c;\n real beta_c; // beta.c in original bugs model\n\n real<lower=0> tausq_c;\n real<lower=0> tausq_alpha;\n real<lower=0> tausq_beta;\n }\n transformed parameters {\n real<lower=0> tau_c; // sigma in original bugs model\n real<lower=0> tau_alpha;\n real<lower=0> tau_beta;\n\n tau_c = sqrt(tausq_c);\n tau_alpha = sqrt(tausq_alpha);\n tau_beta = sqrt(tausq_beta);\n }\n model {\n alpha_c ~ 
normal(0, 100);\n beta_c ~ normal(0, 100);\n tausq_c ~ inv_gamma(0.001, 0.001);\n tausq_alpha ~ inv_gamma(0.001, 0.001);\n tausq_beta ~ inv_gamma(0.001, 0.001);\n alpha ~ normal(alpha_c, tau_alpha); // vectorized\n beta ~ normal(beta_c, tau_beta); // vectorized\n for (n in 1:N)\n for (t in 1:T)\n y[n,t] ~ normal(alpha[n] + beta[n] * (x[t] - xbar), tau_c);\n }\n generated quantities {\n real alpha0;\n alpha0 = alpha_c - xbar * beta_c;\n }\n \",\n data = list(N = N, T = T, y = y, x = x, xbar = xbar),\n chains = chains, init = init, iter = iter, \n verbose = FALSE, refresh = 0, seed = 20190425\n)\n\n模型输出结果如下:\n\nprint(rats_fit, pars = c(\"alpha\", \"beta\"), include = FALSE, digits = 1)\n\n#> Inference for Stan model: rats.\n#> 4 chains, each with iter=1000; warmup=500; thin=1; \n#> post-warmup draws per chain=500, total post-warmup draws=2000.\n#> \n#> mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat\n#> alpha_c 242.5 0.1 2.7 237.1 240.6 242.5 244.3 247.7 1728 1\n#> beta_c 6.2 0.0 0.1 6.0 6.1 6.2 6.3 6.4 2205 1\n#> tausq_c 37.0 0.2 5.8 27.6 32.9 36.4 40.6 50.0 947 1\n#> tausq_alpha 218.0 1.6 64.1 125.3 172.0 207.8 253.0 372.8 1703 1\n#> tausq_beta 0.3 0.0 0.1 0.1 0.2 0.3 0.3 0.5 1481 1\n#> tau_c 6.1 0.0 0.5 5.3 5.7 6.0 6.4 7.1 938 1\n#> tau_alpha 14.6 0.0 2.1 11.2 13.1 14.4 15.9 19.3 1826 1\n#> tau_beta 0.5 0.0 0.1 0.4 0.5 0.5 0.6 0.7 1429 1\n#> alpha0 106.3 0.1 3.6 99.5 103.9 106.4 108.8 113.2 1965 1\n#> lp__ -438.0 0.3 7.0 -452.8 -442.6 -437.4 -433.2 -425.4 558 1\n#> \n#> Samples were drawn using NUTS(diag_e) at Mon Feb 5 05:20:20 2024.\n#> For each parameter, n_eff is a crude measure of effective sample size,\n#> and Rhat is the potential scale reduction factor on split chains (at \n#> convergence, Rhat=1).\n\n\nalpha_c 表示小鼠 5 次测量的平均重量,beta_c 表示小鼠体重的增长率,\\(\\alpha_i,\\beta_i\\) 分别表示第 \\(i\\) 只小鼠在第 22 天(第 3 次测量或 \\(x_t = \\bar{x}\\) )的重量和增长率(每日增加的重量)。\n对于分量众多的参数向量,比较适合用岭线图展示后验分布,下面调用 bayesplot 包绘制参数向量 \\(\\boldsymbol{\\alpha},\\boldsymbol{\\beta}\\) 的后验分布。\n\n# 
plot(rats_fit, pars = \"alpha\", show_density = TRUE, ci_level = 0.8, outer_level = 0.95)\nbayesplot::mcmc_areas_ridges(rats_fit, pars = paste0(\"alpha\", \"[\", 1:30, \"]\")) +\n scale_y_discrete(labels = scales::parse_format()) \n\n\n\n\n\n\n图 35.8: 参数 \\(\\boldsymbol{\\alpha}\\) 的后验分布\n\n\n\n\n参数向量 \\(\\boldsymbol{\\alpha}\\) 的后验估计可以看作 \\(x_t = \\bar{x}\\) 时小鼠的重量,上图即为各个小鼠重量的后验分布。\n\n# plot(rats_fit, pars = \"beta\", ci_level = 0.8, outer_level = 0.95)\nbayesplot::mcmc_areas_ridges(rats_fit, pars = paste0(\"beta\", \"[\", 1:30, \"]\")) +\n scale_y_discrete(labels = scales::parse_format()) \n\n\n\n\n\n\n图 35.9: 参数 \\(\\boldsymbol{\\beta}\\) 的后验分布\n\n\n\n\n参数向量 \\(\\boldsymbol{\\beta}\\) 的后验估计可以看作是小鼠的重量的增长率,上图即为各个小鼠重量的增长率的后验分布。\n\n35.5.2 cmdstanr\n从 rstan 包转 cmdstanr 包是非常容易的,只要语法兼容,模型代码可以原封不动。\n\nlibrary(cmdstanr)\nmod_rats <- cmdstan_model(\n stan_file = \"code/rats.stan\",\n compile = TRUE, cpp_options = list(stan_threads = TRUE)\n)\nfit_rats <- mod_rats$sample(\n data = list(N = N, T = T, y = y, x = x, xbar = xbar), # 数据\n chains = 2, # 总链条数\n parallel_chains = 2, # 并行数目\n iter_warmup = 1000, # 每条链预处理的迭代次数\n iter_sampling = 1000, # 每条链采样的迭代次数\n threads_per_chain = 2, # 每条链设置 2 个线程\n seed = 20232023, # 随机数种子\n show_messages = FALSE, # 不显示消息\n adapt_delta = 0.9, # 接受率\n refresh = 0 # 不显示采样迭代的进度\n)\n\n模型输出\n\n# 显示除了参数 alpha 和 beta 以外的结果\nvars <- setdiff(fit_rats$metadata()$stan_variables, c(\"alpha\", \"beta\"))\nfit_rats$summary(variables = vars)\n\n#> # A tibble: 10 × 10\n#> variable mean median sd mad q5 q95 rhat ess_bulk\n#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>\n#> 1 lp__ -438. -438. 6.87 6.83 -450. -427. 1.01 586.\n#> 2 alpha_c 243. 242. 2.79 2.70 238. 247. 1.00 3528.\n#> 3 beta_c 6.18 6.18 0.106 0.107 6.01 6.36 1.00 3494.\n#> 4 tausq_c 37.4 36.8 5.57 5.49 28.9 47.1 1.00 1720.\n#> 5 tausq_alp… 217. 208. 63.5 57.7 134. 335. 
1.00 3104.\n#> 6 tausq_beta 0.275 0.259 0.0975 0.0884 0.148 0.457 0.999 2070.\n#> 7 tau_c 6.10 6.07 0.451 0.453 5.38 6.86 1.00 1720.\n#> 8 tau_alpha 14.6 14.4 2.06 2.00 11.6 18.3 1.00 3104.\n#> 9 tau_beta 0.517 0.509 0.0896 0.0882 0.385 0.676 0.999 2070.\n#> 10 alpha0 106. 107. 3.63 3.68 100. 112. 0.999 3565.\n#> # ℹ 1 more variable: ess_tail <dbl>\n\n\n诊断信息\n\nfit_rats$diagnostic_summary()\n\n#> $num_divergent\n#> [1] 0 0\n#> \n#> $num_max_treedepth\n#> [1] 0 0\n#> \n#> $ebfmi\n#> [1] 0.8995806 0.8828313\n\n\n\n35.5.3 brms\nbrms 包是基于 rstan 包的,基于 Stan 语言做贝叶斯推断,提供与 lme4 包一致的公式语法,且扩展了模型种类。\n\nrats_brms <- brms::brm(weight ~ days + (days | rats), data = rats_data)\nsummary(rats_brms)\n\n Family: gaussian \n Links: mu = identity; sigma = identity \nFormula: weight ~ days + (days | rats) \n Data: rats_data (Number of observations: 150) \n Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;\n total post-warmup draws = 4000\n\nGroup-Level Effects: \n~rats (Number of levels: 30) \n Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS\nsd(Intercept) 11.27 2.23 7.36 16.08 1.00 2172 2939\nsd(days) 0.54 0.09 0.37 0.74 1.00 1380 2356\ncor(Intercept,days) -0.11 0.24 -0.53 0.39 1.00 920 1541\n\nPopulation-Level Effects: \n Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS\nIntercept 106.47 2.47 101.61 111.23 1.00 2173 2768\ndays 6.18 0.11 5.96 6.41 1.00 1617 2177\n\nFamily Specific Parameters: \n Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS\nsigma 6.15 0.47 5.30 7.14 1.00 1832 3151\n\nDraws were sampled using sampling(NUTS). 
For each parameter, Bulk_ESS\nand Tail_ESS are effective sample size measures, and Rhat is the potential\nscale reduction factor on split chains (at convergence, Rhat = 1).\n\n35.5.4 rstanarm\nrstanarm 包与 brms 包类似,区别是前者预编译了 Stan 模型,后者根据输入数据和模型编译即时编译,此外,后者支持的模型范围更加广泛。\n\nlibrary(rstanarm)\nrats_rstanarm <- stan_lmer(formula = weight ~ days + (days | rats), data = rats_data)\nsummary(rats_rstanarm)\n\nModel Info:\n function: stan_lmer\n family: gaussian [identity]\n formula: weight ~ days + (days | rats)\n algorithm: sampling\n sample: 4000 (posterior sample size)\n priors: see help('prior_summary')\n observations: 150\n groups: rats (30)\n\nEstimates:\n mean sd 10% 50% 90% \n(Intercept) 106.575 2.236 103.789 106.559 109.415\ndays 6.187 0.111 6.048 6.185 6.329\nsigma 6.219 0.497 5.626 6.183 6.862\nSigma[rats:(Intercept),(Intercept)] 103.927 42.705 57.329 98.128 159.086\nSigma[rats:days,(Intercept)] -0.545 1.492 -2.361 -0.402 1.162\nSigma[rats:days,days] 0.304 0.112 0.181 0.285 0.445\n\nMCMC diagnostics\n mcse Rhat n_eff\n(Intercept) 0.043 1.000 2753 \ndays 0.003 1.005 1694 \nsigma 0.015 1.001 1172 \nSigma[rats:(Intercept),(Intercept)] 1.140 1.000 1403 \nSigma[rats:days,(Intercept)] 0.054 1.006 772 \nSigma[rats:days,days] 0.003 1.000 1456 \n\nFor each parameter, mcse is Monte Carlo standard error, \nn_eff is a crude measure of effective sample size, \nand Rhat is the potential scale reduction factor \non split chains (at convergence Rhat=1).\n固定效应的部分,截距和斜率如下:\nEstimates:\n mean sd 10% 50% 90% \n(Intercept) 106.575 2.236 103.789 106.559 109.415\ndays 6.187 0.111 6.048 6.185 6.329\n模型残差的标准差 sigma、随机效应 Sigma 的随机截距的方差 103.927 、随机斜率的方差 0.304 及其协方差 -0.545。\nsigma 6.219 0.497 5.626 6.183 6.862\nSigma[rats:(Intercept),(Intercept)] 103.927 42.705 57.329 98.128 159.086\nSigma[rats:days,(Intercept)] -0.545 1.492 -2.361 -0.402 1.162\nSigma[rats:days,days] 0.304 0.112 0.181 0.285 0.445\nrstanarm 和 brms 包的结果基本一致的。\n\n35.5.5 blme\nblme 包 (Chung 等 2013) 基于 lme4 包 (Bates 等 2015) 
拟合贝叶斯线性混合效应模型。参考前面 rstan 小节中关于模型参数的先验设置,下面将残差方差的先验设置为逆伽马分布,随机效应的协方差设置为扁平分布。发现拟合结果和 nlme 和 lme4 包的几乎一样。\n\nrats_blme <- blme::blmer(\n weight ~ days + (days | rats), data = rats_data,\n resid.prior = invgamma, cov.prior = NULL\n)\nsummary(rats_blme)\n\n#> Resid prior: invgamma(shape = 0, scale = 0, posterior.scale = var)\n#> Prior dev : 7.1328\n#> \n#> Linear mixed model fit by REML ['blmerMod']\n#> Formula: weight ~ days + (days | rats)\n#> Data: rats_data\n#> \n#> REML criterion at convergence: 1095.4\n#> \n#> Scaled residuals: \n#> Min 1Q Median 3Q Max \n#> -2.6697 -0.5440 0.1202 0.4968 2.6317 \n#> \n#> Random effects:\n#> Groups Name Variance Std.Dev. Corr \n#> rats (Intercept) 116.3517 10.7866 \n#> days 0.2623 0.5121 -0.16\n#> Residual 35.3891 5.9489 \n#> Number of obs: 150, groups: rats, 30\n#> \n#> Fixed effects:\n#> Estimate Std. Error t value\n#> (Intercept) 106.5676 2.2977 46.38\n#> days 6.1857 0.1056 58.58\n#> \n#> Correlation of Fixed Effects:\n#> (Intr)\n#> days -0.343\n\n\n与 lme4 包的函数 lmer() 所不同的是参数 resid.prior 、fixef.prior 和 cov.prior ,它们设置参数的先验分布,其它参数的含义同 lme4 包。resid.prior = invgamma 表示残差方差参数使用逆伽马分布,cov.prior = NULL 表示随机效应的协方差参数使用扁平先验 flat priors。\n\n35.5.6 rjags\nrjags (Plummer 2021) 是 JAGS 软件的 R 语言接口,可以拟合分层正态模型,再借助 coda 包 (Plummer 等 2006) 可以分析 JAGS 返回的各项数据。\nJAGS 代码和 Stan 代码有不少相似之处,最大的共同点在于以直观的统计模型的符号表示编码模型,仿照 Stan 代码, JAGS 编码的模型(BUGS 代码)如下:\nmodel {\n alpha_c ~ dnorm(0, 1.0E-4);\n beta_c ~ dnorm(0, 1.0E-4);\n \n tau_c ~ dgamma(0.001, 0.001);\n tau_alpha ~ dgamma(0.001, 0.001);\n tau_beta ~ dgamma(0.001, 0.001);\n\n sigma_c <- 1.0 / sqrt(tau_c);\n sigma_alpha <- 1.0 / sqrt(tau_alpha);\n sigma_beta <- 1.0 / sqrt(tau_beta);\n \n for (n in 1:N){\n alpha[n] ~ dnorm(alpha_c, tau_alpha); \n beta[n] ~ dnorm(beta_c, tau_beta);\n for (t in 1:T) {\n y[n,t] ~ dnorm(alpha[n] + beta[n] * (x[t] - xbar), tau_c);\n }\n }\n}\n转化主要集中在模型块,注意二者概率分布的名称以及参数含义对应关系,JAGS 使用 precision 而不是 standard deviation or variance,比如正态分布中的方差(标准偏差)被替换为其倒数。JAGS 
可以省略类型声明(初始化模型时会补上),最后,JAGS 不支持 Stan 中的向量化操作,这种新特性是独特的。\n\nlibrary(rjags)\n# 初始值\nrats_inits <- list(\n list(\".RNG.name\" = \"base::Marsaglia-Multicarry\", \n \".RNG.seed\" = 20222022, \n \"alpha_c\" = 100, \"beta_c\" = 6, \"tau_c\" = 5, \"tau_alpha\" = 10, \"tau_beta\" = 0.5),\n list(\".RNG.name\" = \"base::Marsaglia-Multicarry\", \n \".RNG.seed\" = 20232023, \n \"alpha_c\" = 200, \"beta_c\" = 10, \"tau_c\" = 15, \"tau_alpha\" = 15, \"tau_beta\" = 1)\n)\n# 模型\nrats_model <- jags.model(\n file = \"code/rats.bugs\",\n data = list(x = x, y = y, N = 30, T = 5, xbar = 22.0),\n inits = rats_inits, \n n.chains = 2, quiet = TRUE\n)\n# burn-in\nupdate(rats_model, n.iter = 2000)\n# 抽样\nrats_samples <- coda.samples(rats_model,\n variable.names = c(\"alpha_c\", \"beta_c\", \"sigma_alpha\", \"sigma_beta\", \"sigma_c\"),\n n.iter = 4000, thin = 1\n)\n# 参数的后验估计\nsummary(rats_samples)\n\n#> \n#> Iterations = 2001:6000\n#> Thinning interval = 1 \n#> Number of chains = 2 \n#> Sample size per chain = 4000 \n#> \n#> 1. Empirical mean and standard deviation for each variable,\n#> plus standard error of the mean:\n#> \n#> Mean SD Naive SE Time-series SE\n#> alpha_c 242.4752 2.72749 0.030494 0.031571\n#> beta_c 6.1878 0.10798 0.001207 0.001481\n#> sigma_alpha 14.6233 2.05688 0.022997 0.025070\n#> sigma_beta 0.5176 0.09266 0.001036 0.001741\n#> sigma_c 6.0731 0.46425 0.005191 0.007984\n#> \n#> 2. 
Quantiles for each variable:\n#> \n#> 2.5% 25% 50% 75% 97.5%\n#> alpha_c 237.0333 240.6832 242.5024 244.2965 247.7816\n#> beta_c 5.9785 6.1150 6.1867 6.2593 6.4035\n#> sigma_alpha 11.1840 13.1802 14.4152 15.8340 19.2429\n#> sigma_beta 0.3571 0.4538 0.5098 0.5734 0.7187\n#> sigma_c 5.2384 5.7479 6.0455 6.3803 7.0413\n\n\n输出结果与 rstan 十分一致,且采样速度极快。类似地,alpha0 = alpha_c - xbar * beta_c 可得 alpha0 = 242.4752 - 22 * 6.1878 = 106.3436。\n\n35.5.7 MCMCglmm\n同前,先考虑变截距的混合效应模型,MCMCglmm 包 (Hadfield 2010) 给出的拟合结果与 nlme 包很接近。\n\n## 变截距模型\nprior1 <- list(\n R = list(V = 1, nu = 0.002),\n G = list(G1 = list(V = 1, nu = 0.002))\n)\nset.seed(20232023)\nrats_mcmc1 <- MCMCglmm::MCMCglmm(\n weight ~ days, random = ~ rats,\n data = rats_data, verbose = FALSE, prior = prior1\n)\nsummary(rats_mcmc1)\n\n#> \n#> Iterations = 3001:12991\n#> Thinning interval = 10\n#> Sample size = 1000 \n#> \n#> DIC: 1088.71 \n#> \n#> G-structure: ~rats\n#> \n#> post.mean l-95% CI u-95% CI eff.samp\n#> rats 213 108.4 336.4 1000\n#> \n#> R-structure: ~units\n#> \n#> post.mean l-95% CI u-95% CI eff.samp\n#> units 68.58 50.63 86.58 1000\n#> \n#> Location effects: weight ~ days \n#> \n#> post.mean l-95% CI u-95% CI eff.samp pMCMC \n#> (Intercept) 106.568 100.464 112.897 1000 <0.001 ***\n#> days 6.185 6.051 6.315 1000 <0.001 ***\n#> ---\n#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 
0.1 ' ' 1\n\n\n随机效应的方差(组间方差)为 211.4 ,则标准差为 14.539。残差方差(组内方差)为 68.77,则标准差为 8.293。\n再考虑变截距和斜率的混合效应模型。\n\n## 变截距、变斜率模型\nprior2 <- list(\n R = list(V = 1, nu = 0.002),\n G = list(G1 = list(V = diag(2), nu = 0.002))\n)\nset.seed(20232023)\nrats_mcmc2 <- MCMCglmm::MCMCglmm(weight ~ days,\n random = ~ us(1 + days):rats,\n data = rats_data, verbose = FALSE, prior = prior2\n)\nsummary(rats_mcmc2)\n\n#> \n#> Iterations = 3001:12991\n#> Thinning interval = 10\n#> Sample size = 1000 \n#> \n#> DIC: 1018.746 \n#> \n#> G-structure: ~us(1 + days):rats\n#> \n#> post.mean l-95% CI u-95% CI eff.samp\n#> (Intercept):(Intercept).rats 124.1327 41.5313 226.059 847.2\n#> days:(Intercept).rats -0.7457 -4.3090 2.571 896.6\n#> (Intercept):days.rats -0.7457 -4.3090 2.571 896.6\n#> days:days.rats 0.2783 0.1067 0.493 786.9\n#> \n#> R-structure: ~units\n#> \n#> post.mean l-95% CI u-95% CI eff.samp\n#> units 38.14 27.07 51.08 1000\n#> \n#> Location effects: weight ~ days \n#> \n#> post.mean l-95% CI u-95% CI eff.samp pMCMC \n#> (Intercept) 106.40 101.70 110.78 823.3 <0.001 ***\n#> days 6.19 5.99 6.41 963.4 <0.001 ***\n#> ---\n#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 
0.1 ' ' 1\n\n\nG-structure 代表随机效应部分,R-structure 代表残差效应部分,Location effects 代表固定效应部分。MCMCglmm 包的这套模型表示术语源自商业软件 ASReml 。\n随机截距的方差为 124.1327,标准差为 11.1415,随机斜率的方差 0.2783,标准差为 0.5275,随机截距和随机斜率的协方差 -0.7457,相关系数为 -0.1268,这与 nlme 包结果很接近。\n\n35.5.8 INLA\n同前,先考虑变截距的混合效应模型。\n\nlibrary(INLA)\ninla.setOption(short.summary = TRUE)\n# 数值稳定性考虑\nrats_data$weight <- rats_data$weight / 400\n# 变截距\nrats_inla1 <- inla(weight ~ days + f(rats, model = \"iid\", n = 30), \n family = \"gaussian\", data = rats_data)\n# 输出结果\nsummary(rats_inla1)\n\n#> Fixed effects:\n#> mean sd 0.025quant 0.5quant 0.975quant mode kld\n#> (Intercept) 0.266 0.008 0.252 0.266 0.281 0.266 0\n#> days 0.015 0.000 0.015 0.015 0.016 0.015 0\n#> \n#> Model hyperparameters:\n#> mean sd 0.025quant 0.5quant\n#> Precision for the Gaussian observations 2414.67 311.30 1852.63 2397.34\n#> Precision for rats 888.04 244.24 494.56 859.03\n#> 0.975quant mode\n#> Precision for the Gaussian observations 3076.07 2367.82\n#> Precision for rats 1448.25 806.15\n#> \n#> is computed\n\n\n再考虑变截距和斜率的混合效应模型。\n\n# https://inla.r-inla-download.org/r-inla.org/doc/latent/iid.pdf\n# 二维高斯随机效应的先验为 Wishart prior\nrats_data$rats <- as.integer(rats_data$rats)\nrats_data$slopeid <- 30 + rats_data$rats\n# 变截距、变斜率\nrats_inla2 <- inla(\n weight ~ 1 + days + f(rats, model = \"iid2d\", n = 2 * 30) + f(slopeid, days, copy = \"rats\"),\n data = rats_data, family = \"gaussian\"\n)\n# 输出结果\nsummary(rats_inla2)\n\n#> Fixed effects:\n#> mean sd 0.025quant 0.5quant 0.975quant mode kld\n#> (Intercept) 0.266 0.034 0.20 0.266 0.333 0.266 0\n#> days 0.015 0.033 -0.05 0.015 0.080 0.015 0\n#> \n#> Model hyperparameters:\n#> mean sd 0.025quant 0.5quant\n#> Precision for the Gaussian observations 4522.633 666.549 3334.882 4480.317\n#> Precision for rats (component 1) 32.097 7.956 18.939 31.260\n#> Precision for rats (component 2) 33.234 8.227 19.603 32.377\n#> Rho1:2 for rats -0.001 0.172 -0.335 -0.001\n#> 0.975quant mode\n#> Precision for the Gaussian observations 
5952.976 4407.835\n#> Precision for rats (component 1) 50.049 29.770\n#> Precision for rats (component 2) 51.773 30.859\n#> Rho1:2 for rats 0.334 -0.001\n#> \n#> is computed\n\n\n\n\n\n\n\n\n警告\n\n\n\n对于变截距和斜率混合效应模型,还未完全弄清楚 INLA 包的输出结果。固定效应部分和残差部分都是和前面一致的,但不清楚随机效应的方差协方差矩阵的估计与 INLA 输出的对应关系。参考《Bayesian inference with INLA》第 3 章第 3 小节。", "crumbs": [ "贝叶斯建模", "35  分层正态模型" @@ -2411,7 +2411,7 @@ "href": "mixed-effects-models.html#sec-lmm", "title": "36  混合效应模型", "section": "", - "text": "I think what we are seeking is the marginal variance-covariance matrix of the parameter estimators (marginal with respect to the random effects random variable, B), which would have the form of the inverse of the crossproduct of a \\((q+p)\\) by \\(p\\) matrix composed of the vertical concatenation of \\(-L^{-1}RZXRX^{-1}\\) and \\(RX^{-1}\\). (Note: You do not want to calculate the first term by inverting \\(L\\), use solve(L, RZX, system = \"L\")\n\n[…] don’t even think about using solve(L)\ndon’t!, don’t!, don’t!\nhave I made myself clear?\ndon’t do that (and we all know that someone will do exactly that for a very large \\(L\\) and then send out messages about “R is SOOOOO SLOOOOW!!!!” :-) )\n\n— Douglas Bates 2\n\n\n\n\n\n\n\n提示\n\n\n\n\n一般的模型结构和假设\n一般的模型表达公式\n\nnlme 包的函数 lme()\n\n公式语法和示例模型表示\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n36.1.1 nlme\n考虑两水平的混合效应模型,其中随机截距 \\(\\beta_{0j}\\) 和随机斜率 \\(\\beta_{1j}\\),指标 \\(j\\) 表示分组的编号,也叫变截距和变斜率模型\n\\[\n\\begin{aligned}\n\\mathrm{Reaction}_{ij} &= \\beta_{0j} + \\beta_{1j} \\cdot \\mathrm{Days}_{ij} + \\epsilon_{ij} \\\\\n\\beta_{0j} &= \\gamma_{00} + U_{0j} \\\\\n\\beta_{1j} &= \\gamma_{10} + U_{1j} \\\\\n\\begin{pmatrix}\nU_{0j} \\\\\nU_{1j}\n\\end{pmatrix} &\\sim \\mathcal{N}\n\\begin{bmatrix}\n\\begin{pmatrix}\n0 \\\\\n0\n\\end{pmatrix}\n,\n\\begin{pmatrix}\n\\tau^2_{00} & \\tau_{01} \\\\\n\\tau_{01} & \\tau^2_{10}\n\\end{pmatrix}\n\\end{bmatrix} \\\\\n\\epsilon_{ij} &\\sim \\mathcal{N}(0, \\sigma^2) \\\\\ni = 0,1,\\cdots,9 &\\quad j = 
308,309,\\cdots, 372.\n\\end{aligned}\n\\]\n下面用 nlme 包 (Pinheiro 和 Bates 2000) 拟合模型。\n\nlibrary(nlme)\nsleep_nlme <- lme(Reaction ~ Days, random = ~ Days | Subject, data = sleepstudy)\nsummary(sleep_nlme)\n\n#> Linear mixed-effects model fit by REML\n#> Data: sleepstudy \n#> AIC BIC logLik\n#> 1755.628 1774.719 -871.8141\n#> \n#> Random effects:\n#> Formula: ~Days | Subject\n#> Structure: General positive-definite, Log-Cholesky parametrization\n#> StdDev Corr \n#> (Intercept) 24.740241 (Intr)\n#> Days 5.922103 0.066 \n#> Residual 25.591843 \n#> \n#> Fixed effects: Reaction ~ Days \n#> Value Std.Error DF t-value p-value\n#> (Intercept) 251.40510 6.824516 161 36.83853 0\n#> Days 10.46729 1.545783 161 6.77151 0\n#> Correlation: \n#> (Intr)\n#> Days -0.138\n#> \n#> Standardized Within-Group Residuals:\n#> Min Q1 Med Q3 Max \n#> -3.95355735 -0.46339976 0.02311783 0.46339621 5.17925089 \n#> \n#> Number of Observations: 180\n#> Number of Groups: 18\n\n\n随机效应(Random effects)部分:\n\n# 前 6 个 subject\nhead(ranef(sleep_nlme))\n\n#> (Intercept) Days\n#> 308 2.258754 9.1989366\n#> 309 -40.398490 -8.6197167\n#> 310 -38.960098 -5.4489048\n#> 330 23.690228 -4.8142826\n#> 331 22.259981 -3.0698548\n#> 332 9.039458 -0.2721585\n\n\n固定效应(Fixed effects)部分:\n\nfixef(sleep_nlme)\n\n#> (Intercept) Days \n#> 251.40510 10.46729\n\n\nggeffects 包的函数 ggpredict() 和 ggeffect() 可以用来绘制混合效应模型的边际效应( Marginal Effects),ggPMX 包 可以用来绘制混合效应模型的诊断图。下 图 36.3 展示关于变量 Days 的边际效应图。\n\nlibrary(ggeffects)\nmydf <- ggpredict(sleep_nlme, terms = \"Days\")\nggplot(mydf, aes(x = x, y = predicted)) +\n geom_line() +\n geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = 0.2) +\n scale_x_continuous(n.breaks = 6) +\n theme_bw() +\n labs(x = \"Days\", y = \"Reaction\")\n\n\n\n\n\n\n图 36.3: 边际效应图\n\n\n\n\n\n36.1.2 MASS\n\nsleep_mass <- MASS::glmmPQL(Reaction ~ Days,\n random = ~ Days | Subject, verbose = FALSE,\n data = sleepstudy, family = gaussian\n)\nsummary(sleep_mass)\n\n#> Linear mixed-effects model fit by 
maximum likelihood\n#> Data: sleepstudy \n#> AIC BIC logLik\n#> NA NA NA\n#> \n#> Random effects:\n#> Formula: ~Days | Subject\n#> Structure: General positive-definite, Log-Cholesky parametrization\n#> StdDev Corr \n#> (Intercept) 23.780376 (Intr)\n#> Days 5.716807 0.081 \n#> Residual 25.591842 \n#> \n#> Variance function:\n#> Structure: fixed weights\n#> Formula: ~invwt \n#> Fixed effects: Reaction ~ Days \n#> Value Std.Error DF t-value p-value\n#> (Intercept) 251.40510 6.669396 161 37.69533 0\n#> Days 10.46729 1.510647 161 6.92901 0\n#> Correlation: \n#> (Intr)\n#> Days -0.138\n#> \n#> Standardized Within-Group Residuals:\n#> Min Q1 Med Q3 Max \n#> -3.94156355 -0.46559311 0.02894656 0.46361051 5.17933587 \n#> \n#> Number of Observations: 180\n#> Number of Groups: 18\n\n\n\n36.1.3 lme4\n\nsleep_lme4 <- lme4::lmer(Reaction ~ Days + (Days | Subject), data = sleepstudy)\nsummary(sleep_lme4)\n\n#> Linear mixed model fit by REML ['lmerMod']\n#> Formula: Reaction ~ Days + (Days | Subject)\n#> Data: sleepstudy\n#> \n#> REML criterion at convergence: 1743.6\n#> \n#> Scaled residuals: \n#> Min 1Q Median 3Q Max \n#> -3.9536 -0.4634 0.0231 0.4634 5.1793 \n#> \n#> Random effects:\n#> Groups Name Variance Std.Dev. Corr\n#> Subject (Intercept) 612.10 24.741 \n#> Days 35.07 5.922 0.07\n#> Residual 654.94 25.592 \n#> Number of obs: 180, groups: Subject, 18\n#> \n#> Fixed effects:\n#> Estimate Std. 
Error t value\n#> (Intercept) 251.405 6.825 36.838\n#> Days 10.467 1.546 6.771\n#> \n#> Correlation of Fixed Effects:\n#> (Intr)\n#> Days -0.138\n\n\n\n36.1.4 blme\n\nsleep_blme <- blme::blmer(\n Reaction ~ Days + (Days | Subject), data = sleepstudy,\n control = lme4::lmerControl(check.conv.grad = \"ignore\"),\n cov.prior = NULL)\nsummary(sleep_blme)\n\n#> Prior dev : 0\n#> \n#> Linear mixed model fit by REML ['blmerMod']\n#> Formula: Reaction ~ Days + (Days | Subject)\n#> Data: sleepstudy\n#> Control: lme4::lmerControl(check.conv.grad = \"ignore\")\n#> \n#> REML criterion at convergence: 1743.6\n#> \n#> Scaled residuals: \n#> Min 1Q Median 3Q Max \n#> -3.9536 -0.4634 0.0231 0.4634 5.1793 \n#> \n#> Random effects:\n#> Groups Name Variance Std.Dev. Corr\n#> Subject (Intercept) 612.10 24.741 \n#> Days 35.07 5.922 0.07\n#> Residual 654.94 25.592 \n#> Number of obs: 180, groups: Subject, 18\n#> \n#> Fixed effects:\n#> Estimate Std. Error t value\n#> (Intercept) 251.405 6.825 36.838\n#> Days 10.467 1.546 6.771\n#> \n#> Correlation of Fixed Effects:\n#> (Intr)\n#> Days -0.138\n\n\n\n36.1.5 brms\n\nsleep_brms <- brms::brm(Reaction ~ Days + (Days | Subject), data = sleepstudy)\nsummary(sleep_brms)\n\n Family: gaussian \n Links: mu = identity; sigma = identity \nFormula: Reaction ~ Days + (Days | Subject) \n Data: sleepstudy (Number of observations: 180) \n Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;\n total post-warmup draws = 4000\n\nGroup-Level Effects: \n~Subject (Number of levels: 18) \n Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS\nsd(Intercept) 27.03 6.60 15.88 42.13 1.00 1728 2469\nsd(Days) 6.61 1.50 4.18 9.97 1.00 1517 2010\ncor(Intercept,Days) 0.08 0.29 -0.46 0.65 1.00 991 1521\n\nPopulation-Level Effects: \n Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS\nIntercept 251.26 7.42 236.27 266.12 1.00 1982 2687\nDays 10.36 1.77 6.85 13.85 1.00 1415 1982\n\nFamily Specific Parameters: \n Estimate Est.Error l-95% CI u-95% 
CI Rhat Bulk_ESS Tail_ESS\nsigma 25.88 1.54 22.99 29.06 1.00 3204 2869\n\nDraws were sampled using sampling(NUTS). For each parameter, Bulk_ESS\nand Tail_ESS are effective sample size measures, and Rhat is the potential\nscale reduction factor on split chains (at convergence, Rhat = 1).\n\n# predictions\nconds <- brms::make_conditions(sleep_brms, \"Subject\")\nsleep_brms |>\n brms::marginal_effects(\n re_formula = NULL,\n conditions = conds\n ) |>\n plot(points = TRUE, ncol = 6)\n\n\n36.1.6 MCMCglmm\nMCMCglmm 包拟合变截距、变斜率模型,随机截距和随机斜率之间存在相关性。\n\n## 变截距、变斜率模型\nprior1 <- list(\n R = list(V = 1, fix = 1),\n G = list(G1 = list(V = diag(2), nu = 0.002))\n)\nset.seed(20232023)\nsleep_mcmcglmm <- MCMCglmm::MCMCglmm(\n Reaction ~ Days, random = ~ us(1 + Days):Subject, prior = prior1,\n data = sleepstudy, family = \"gaussian\", verbose = FALSE\n)\nsummary(sleep_mcmcglmm)\n\n#> \n#> Iterations = 3001:12991\n#> Thinning interval = 10\n#> Sample size = 1000 \n#> \n#> DIC: 94714.46 \n#> \n#> G-structure: ~us(1 + Days):Subject\n#> \n#> post.mean l-95% CI u-95% CI eff.samp\n#> (Intercept):(Intercept).Subject 1005.69 454.97 1840.04 1000.0\n#> Days:(Intercept).Subject -34.44 -167.15 84.09 1000.0\n#> (Intercept):Days.Subject -34.44 -167.15 84.09 1000.0\n#> Days:Days.Subject 52.36 22.74 95.60 902.3\n#> \n#> R-structure: ~units\n#> \n#> post.mean l-95% CI u-95% CI eff.samp\n#> units 1 1 1 0\n#> \n#> Location effects: Reaction ~ Days \n#> \n#> post.mean l-95% CI u-95% CI eff.samp pMCMC \n#> (Intercept) 251.374 235.935 265.961 1000 <0.001 ***\n#> Days 10.419 7.262 13.976 1000 <0.001 ***\n#> ---\n#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 
0.1 ' ' 1\n\n\n固定随机效应 R-structure 方差。固定效应 Location effects 截距 (Intercept) 为 251.374,斜率 Days 为 10.419 。\n\n36.1.7 INLA\n将数据集 sleepstudy 中的 Reaction 除以 1000,目的是数值稳定性,减小迭代序列的相关性。先考虑变截距模型\n\nlibrary(INLA)\ninla.setOption(short.summary = TRUE)\n# 做尺度变换\nsleepstudy$Reaction <- sleepstudy$Reaction / 1000\n# 变截距\nsleep_inla1 <- inla(Reaction ~ Days + f(Subject, model = \"iid\", n = 18), \n family = \"gaussian\", data = sleepstudy)\n# 输出结果\nsummary(sleep_inla1)\n\n#> Fixed effects:\n#> mean sd 0.025quant 0.5quant 0.975quant mode kld\n#> (Intercept) 0.251 0.010 0.232 0.251 0.270 0.251 0\n#> Days 0.010 0.001 0.009 0.010 0.012 0.010 0\n#> \n#> Model hyperparameters:\n#> mean sd 0.025quant 0.5quant\n#> Precision for the Gaussian observations 1054.07 116.99 840.14 1048.46\n#> Precision for Subject 843.97 300.88 391.25 798.36\n#> 0.975quant mode\n#> Precision for the Gaussian observations 1300.13 1038.99\n#> Precision for Subject 1559.54 714.37\n#> \n#> is computed\n\n\n再考虑变截距和变斜率模型\n\n# https://inla.r-inla-download.org/r-inla.org/doc/latent/iid.pdf\n# 二维高斯随机效应的先验为 Wishart prior\nsleepstudy$Subject <- as.integer(sleepstudy$Subject)\nsleepstudy$slopeid <- 18 + sleepstudy$Subject\n# 变截距、变斜率\nsleep_inla2 <- inla(\n Reaction ~ 1 + Days + f(Subject, model = \"iid2d\", n = 2 * 18) + f(slopeid, Days, copy = \"Subject\"),\n data = sleepstudy, family = \"gaussian\"\n)\n# 输出结果\nsummary(sleep_inla2)\n\n#> Fixed effects:\n#> mean sd 0.025quant 0.5quant 0.975quant mode kld\n#> (Intercept) 0.251 0.055 0.142 0.251 0.360 0.251 0\n#> Days 0.010 0.054 -0.097 0.010 0.118 0.010 0\n#> \n#> Model hyperparameters:\n#> mean sd 0.025quant 0.5quant\n#> Precision for the Gaussian observations 1549.517 181.247 1218.039 1540.851\n#> Precision for Subject (component 1) 20.871 6.507 10.626 20.024\n#> Precision for Subject (component 2) 21.224 6.611 10.804 20.367\n#> Rho1:2 for Subject -0.001 0.213 -0.414 -0.002\n#> 0.975quant mode\n#> Precision for the Gaussian observations 1930.552 1527.280\n#> Precision for 
Subject (component 1) 35.970 18.479\n#> Precision for Subject (component 2) 36.555 18.806\n#> Rho1:2 for Subject 0.413 -0.003\n#> \n#> is computed", + "text": "I think what we are seeking is the marginal variance-covariance matrix of the parameter estimators (marginal with respect to the random effects random variable, B), which would have the form of the inverse of the crossproduct of a \\((q+p)\\) by \\(p\\) matrix composed of the vertical concatenation of \\(-L^{-1}RZXRX^{-1}\\) and \\(RX^{-1}\\). (Note: You do not want to calculate the first term by inverting \\(L\\), use solve(L, RZX, system = \"L\")\n\n[…] don’t even think about using solve(L)\ndon’t!, don’t!, don’t!\nhave I made myself clear?\ndon’t do that (and we all know that someone will do exactly that for a very large \\(L\\) and then send out messages about “R is SOOOOO SLOOOOW!!!!” :-) )\n\n— Douglas Bates 2\n\n\n\n\n\n\n\n提示\n\n\n\n\n一般的模型结构和假设\n一般的模型表达公式\n\nnlme 包的函数 lme()\n\n公式语法和示例模型表示\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n36.1.1 nlme\n考虑两水平的混合效应模型,其中随机截距 \\(\\beta_{0j}\\) 和随机斜率 \\(\\beta_{1j}\\),指标 \\(j\\) 表示分组的编号,也叫变截距和变斜率模型\n\\[\n\\begin{aligned}\n\\mathrm{Reaction}_{ij} &= \\beta_{0j} + \\beta_{1j} \\cdot \\mathrm{Days}_{ij} + \\epsilon_{ij} \\\\\n\\beta_{0j} &= \\gamma_{00} + U_{0j} \\\\\n\\beta_{1j} &= \\gamma_{10} + U_{1j} \\\\\n\\begin{pmatrix}\nU_{0j} \\\\\nU_{1j}\n\\end{pmatrix} &\\sim \\mathcal{N}\n\\begin{bmatrix}\n\\begin{pmatrix}\n0 \\\\\n0\n\\end{pmatrix}\n,\n\\begin{pmatrix}\n\\tau^2_{00} & \\tau_{01} \\\\\n\\tau_{01} & \\tau^2_{10}\n\\end{pmatrix}\n\\end{bmatrix} \\\\\n\\epsilon_{ij} &\\sim \\mathcal{N}(0, \\sigma^2) \\\\\ni = 0,1,\\cdots,9 &\\quad j = 308,309,\\cdots, 372.\n\\end{aligned}\n\\]\n下面用 nlme 包 (Pinheiro 和 Bates 2000) 拟合模型。\n\nlibrary(nlme)\nsleep_nlme <- lme(Reaction ~ Days, random = ~ Days | Subject, data = sleepstudy)\nsummary(sleep_nlme)\n\n#> Linear mixed-effects model fit by REML\n#> Data: sleepstudy \n#> AIC BIC logLik\n#> 1755.628 1774.719 -871.8141\n#> \n#> 
Random effects:\n#> Formula: ~Days | Subject\n#> Structure: General positive-definite, Log-Cholesky parametrization\n#> StdDev Corr \n#> (Intercept) 24.740241 (Intr)\n#> Days 5.922103 0.066 \n#> Residual 25.591843 \n#> \n#> Fixed effects: Reaction ~ Days \n#> Value Std.Error DF t-value p-value\n#> (Intercept) 251.40510 6.824516 161 36.83853 0\n#> Days 10.46729 1.545783 161 6.77151 0\n#> Correlation: \n#> (Intr)\n#> Days -0.138\n#> \n#> Standardized Within-Group Residuals:\n#> Min Q1 Med Q3 Max \n#> -3.95355735 -0.46339976 0.02311783 0.46339621 5.17925089 \n#> \n#> Number of Observations: 180\n#> Number of Groups: 18\n\n\n随机效应(Random effects)部分:\n\n# 前 6 个 subject\nhead(ranef(sleep_nlme))\n\n#> (Intercept) Days\n#> 308 2.258754 9.1989366\n#> 309 -40.398490 -8.6197167\n#> 310 -38.960098 -5.4489048\n#> 330 23.690228 -4.8142826\n#> 331 22.259981 -3.0698548\n#> 332 9.039458 -0.2721585\n\n\n固定效应(Fixed effects)部分:\n\nfixef(sleep_nlme)\n\n#> (Intercept) Days \n#> 251.40510 10.46729\n\n\nggeffects 包的函数 ggpredict() 和 ggeffect() 可以用来绘制混合效应模型的边际效应( Marginal Effects),ggPMX 包 可以用来绘制混合效应模型的诊断图。下 图 36.3 展示关于变量 Days 的边际效应图。\n\nlibrary(ggeffects)\nmydf <- ggpredict(sleep_nlme, terms = \"Days\")\nggplot(mydf, aes(x = x, y = predicted)) +\n geom_line() +\n geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = 0.2) +\n scale_x_continuous(n.breaks = 6) +\n theme_bw() +\n labs(x = \"Days\", y = \"Reaction\")\n\n\n\n\n\n\n图 36.3: 边际效应图\n\n\n\n\n\n36.1.2 MASS\n\nsleep_mass <- MASS::glmmPQL(Reaction ~ Days,\n random = ~ Days | Subject, verbose = FALSE,\n data = sleepstudy, family = gaussian\n)\nsummary(sleep_mass)\n\n#> Linear mixed-effects model fit by maximum likelihood\n#> Data: sleepstudy \n#> AIC BIC logLik\n#> NA NA NA\n#> \n#> Random effects:\n#> Formula: ~Days | Subject\n#> Structure: General positive-definite, Log-Cholesky parametrization\n#> StdDev Corr \n#> (Intercept) 23.780376 (Intr)\n#> Days 5.716807 0.081 \n#> Residual 25.591842 \n#> \n#> Variance function:\n#> 
Structure: fixed weights\n#> Formula: ~invwt \n#> Fixed effects: Reaction ~ Days \n#> Value Std.Error DF t-value p-value\n#> (Intercept) 251.40510 6.669396 161 37.69533 0\n#> Days 10.46729 1.510647 161 6.92901 0\n#> Correlation: \n#> (Intr)\n#> Days -0.138\n#> \n#> Standardized Within-Group Residuals:\n#> Min Q1 Med Q3 Max \n#> -3.94156355 -0.46559311 0.02894656 0.46361051 5.17933587 \n#> \n#> Number of Observations: 180\n#> Number of Groups: 18\n\n\n\n36.1.3 lme4\n\nsleep_lme4 <- lme4::lmer(Reaction ~ Days + (Days | Subject), data = sleepstudy)\nsummary(sleep_lme4)\n\n#> Linear mixed model fit by REML ['lmerMod']\n#> Formula: Reaction ~ Days + (Days | Subject)\n#> Data: sleepstudy\n#> \n#> REML criterion at convergence: 1743.6\n#> \n#> Scaled residuals: \n#> Min 1Q Median 3Q Max \n#> -3.9536 -0.4634 0.0231 0.4634 5.1793 \n#> \n#> Random effects:\n#> Groups Name Variance Std.Dev. Corr\n#> Subject (Intercept) 612.10 24.741 \n#> Days 35.07 5.922 0.07\n#> Residual 654.94 25.592 \n#> Number of obs: 180, groups: Subject, 18\n#> \n#> Fixed effects:\n#> Estimate Std. Error t value\n#> (Intercept) 251.405 6.825 36.838\n#> Days 10.467 1.546 6.771\n#> \n#> Correlation of Fixed Effects:\n#> (Intr)\n#> Days -0.138\n\n\n\n36.1.4 blme\n\nsleep_blme <- blme::blmer(\n Reaction ~ Days + (Days | Subject), data = sleepstudy,\n control = lme4::lmerControl(check.conv.grad = \"ignore\"),\n cov.prior = NULL)\nsummary(sleep_blme)\n\n#> Prior dev : 0\n#> \n#> Linear mixed model fit by REML ['blmerMod']\n#> Formula: Reaction ~ Days + (Days | Subject)\n#> Data: sleepstudy\n#> Control: lme4::lmerControl(check.conv.grad = \"ignore\")\n#> \n#> REML criterion at convergence: 1743.6\n#> \n#> Scaled residuals: \n#> Min 1Q Median 3Q Max \n#> -3.9536 -0.4634 0.0231 0.4634 5.1793 \n#> \n#> Random effects:\n#> Groups Name Variance Std.Dev. 
Corr\n#> Subject (Intercept) 612.10 24.741 \n#> Days 35.07 5.922 0.07\n#> Residual 654.94 25.592 \n#> Number of obs: 180, groups: Subject, 18\n#> \n#> Fixed effects:\n#> Estimate Std. Error t value\n#> (Intercept) 251.405 6.825 36.838\n#> Days 10.467 1.546 6.771\n#> \n#> Correlation of Fixed Effects:\n#> (Intr)\n#> Days -0.138\n\n\n\n36.1.5 brms\n\nsleep_brms <- brms::brm(Reaction ~ Days + (Days | Subject), data = sleepstudy)\nsummary(sleep_brms)\n\n Family: gaussian \n Links: mu = identity; sigma = identity \nFormula: Reaction ~ Days + (Days | Subject) \n Data: sleepstudy (Number of observations: 180) \n Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;\n total post-warmup draws = 4000\n\nGroup-Level Effects: \n~Subject (Number of levels: 18) \n Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS\nsd(Intercept) 27.03 6.60 15.88 42.13 1.00 1728 2469\nsd(Days) 6.61 1.50 4.18 9.97 1.00 1517 2010\ncor(Intercept,Days) 0.08 0.29 -0.46 0.65 1.00 991 1521\n\nPopulation-Level Effects: \n Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS\nIntercept 251.26 7.42 236.27 266.12 1.00 1982 2687\nDays 10.36 1.77 6.85 13.85 1.00 1415 1982\n\nFamily Specific Parameters: \n Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS\nsigma 25.88 1.54 22.99 29.06 1.00 3204 2869\n\nDraws were sampled using sampling(NUTS). 
For each parameter, Bulk_ESS\nand Tail_ESS are effective sample size measures, and Rhat is the potential\nscale reduction factor on split chains (at convergence, Rhat = 1).\n\n# predictions\nconds <- brms::make_conditions(sleep_brms, \"Subject\")\nsleep_brms |>\n brms::marginal_effects(\n re_formula = NULL,\n conditions = conds\n ) |>\n plot(points = TRUE, ncol = 6)\n\n\n36.1.6 MCMCglmm\nMCMCglmm 包拟合变截距、变斜率模型,随机截距和随机斜率之间存在相关性。\n\n## 变截距、变斜率模型\nprior1 <- list(\n R = list(V = 1, fix = 1),\n G = list(G1 = list(V = diag(2), nu = 0.002))\n)\nset.seed(20232023)\nsleep_mcmcglmm <- MCMCglmm::MCMCglmm(\n Reaction ~ Days, random = ~ us(1 + Days):Subject, prior = prior1,\n data = sleepstudy, family = \"gaussian\", verbose = FALSE\n)\nsummary(sleep_mcmcglmm)\n\n#> \n#> Iterations = 3001:12991\n#> Thinning interval = 10\n#> Sample size = 1000 \n#> \n#> DIC: 94714.46 \n#> \n#> G-structure: ~us(1 + Days):Subject\n#> \n#> post.mean l-95% CI u-95% CI eff.samp\n#> (Intercept):(Intercept).Subject 1005.69 454.97 1840.04 1000.0\n#> Days:(Intercept).Subject -34.44 -167.15 84.09 1000.0\n#> (Intercept):Days.Subject -34.44 -167.15 84.09 1000.0\n#> Days:Days.Subject 52.36 22.74 95.60 902.3\n#> \n#> R-structure: ~units\n#> \n#> post.mean l-95% CI u-95% CI eff.samp\n#> units 1 1 1 0\n#> \n#> Location effects: Reaction ~ Days \n#> \n#> post.mean l-95% CI u-95% CI eff.samp pMCMC \n#> (Intercept) 251.374 235.935 265.961 1000 <0.001 ***\n#> Days 10.419 7.262 13.976 1000 <0.001 ***\n#> ---\n#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 
0.1 ' ' 1\n\n\n固定随机效应 R-structure 方差。固定效应 Location effects 截距 (Intercept) 为 251.374,斜率 Days 为 10.419 。\n\n36.1.7 INLA\n将数据集 sleepstudy 中的 Reaction 除以 1000,目的是数值稳定性,减小迭代序列的相关性。先考虑变截距模型\n\nlibrary(INLA)\ninla.setOption(short.summary = TRUE)\n# 做尺度变换\nsleepstudy$Reaction <- sleepstudy$Reaction / 1000\n# 变截距\nsleep_inla1 <- inla(Reaction ~ Days + f(Subject, model = \"iid\", n = 18), \n family = \"gaussian\", data = sleepstudy)\n# 输出结果\nsummary(sleep_inla1)\n\n#> Fixed effects:\n#> mean sd 0.025quant 0.5quant 0.975quant mode kld\n#> (Intercept) 0.251 0.010 0.232 0.251 0.270 0.251 0\n#> Days 0.010 0.001 0.009 0.010 0.012 0.010 0\n#> \n#> Model hyperparameters:\n#> mean sd 0.025quant 0.5quant\n#> Precision for the Gaussian observations 1054.07 116.99 840.14 1048.46\n#> Precision for Subject 843.97 300.88 391.25 798.36\n#> 0.975quant mode\n#> Precision for the Gaussian observations 1300.13 1038.99\n#> Precision for Subject 1559.54 714.37\n#> \n#> is computed\n\n\n再考虑变截距和变斜率模型\n\n# https://inla.r-inla-download.org/r-inla.org/doc/latent/iid.pdf\n# 二维高斯随机效应的先验为 Wishart prior\nsleepstudy$Subject <- as.integer(sleepstudy$Subject)\nsleepstudy$slopeid <- 18 + sleepstudy$Subject\n# 变截距、变斜率\nsleep_inla2 <- inla(\n Reaction ~ 1 + Days + f(Subject, model = \"iid2d\", n = 2 * 18) + f(slopeid, Days, copy = \"Subject\"),\n data = sleepstudy, family = \"gaussian\"\n)\n# 输出结果\nsummary(sleep_inla2)\n\n#> Fixed effects:\n#> mean sd 0.025quant 0.5quant 0.975quant mode kld\n#> (Intercept) 0.251 0.055 0.142 0.251 0.360 0.251 0\n#> Days 0.010 0.054 -0.097 0.010 0.118 0.010 0\n#> \n#> Model hyperparameters:\n#> mean sd 0.025quant 0.5quant\n#> Precision for the Gaussian observations 1549.517 181.248 1218.040 1540.851\n#> Precision for Subject (component 1) 20.871 6.507 10.626 20.024\n#> Precision for Subject (component 2) 21.225 6.611 10.804 20.367\n#> Rho1:2 for Subject -0.001 0.213 -0.414 -0.002\n#> 0.975quant mode\n#> Precision for the Gaussian observations 1930.554 1527.279\n#> Precision for 
Subject (component 1) 35.971 18.479\n#> Precision for Subject (component 2) 36.556 18.806\n#> Rho1:2 for Subject 0.413 -0.003\n#> \n#> is computed", "crumbs": [ "贝叶斯建模", "36  混合效应模型" @@ -2510,7 +2510,7 @@ "href": "generalized-additive-models.html#sec-rongelap-gamm", "title": "37  广义可加模型", "section": "\n37.2 案例:朗格拉普岛核污染", - "text": "37.2 案例:朗格拉普岛核污染\n从线性到可加,意味着从线性到非线性,可加模型容纳非线性的成分,比如高斯过程、样条。\n\n37.2.1 mgcv\n本节复用 章节 28 朗格拉普岛核污染数据,相关背景不再赘述,下面首先加载数据到 R 环境。\n\n# 加载数据\nrongelap <- readRDS(file = \"data/rongelap.rds\")\nrongelap_coastline <- readRDS(file = \"data/rongelap_coastline.rds\")\n\n接着,将岛上各采样点的辐射强度展示出来,算是简单回顾一下数据概况。\n\n代码library(plot3D)\nwith(rongelap, {\n opar <- par(mar = c(.1, 2.5, .1, .1), no.readonly = TRUE)\n rongelap_coastline$cZ <- 0\n scatter3D(\n x = cX, y = cY, z = counts / time, \n xlim = c(-6500, 50), ylim = c(-3800, 110),\n xlab = \"\\n横坐标(米)\", ylab = \"\\n纵坐标(米)\",\n zlab = \"\\n辐射强度\", lwd = 0.5, cex = 0.8,\n pch = 16, type = \"h\", ticktype = \"detailed\",\n phi = 40, theta = -30, r = 50, d = 1,\n expand = 0.5, box = TRUE, bty = \"b\",\n colkey = F, col = \"black\",\n panel.first = function(trans) {\n XY <- trans3D(\n x = rongelap_coastline$cX,\n y = rongelap_coastline$cY,\n z = rongelap_coastline$cZ,\n pmat = trans\n )\n lines(XY, col = \"gray50\", lwd = 2)\n }\n )\n rongelap_coastline$cZ <- NULL\n on.exit(par(opar), add = TRUE)\n})\n\n\n\n\n\n\n图 37.5: 岛上各采样点的辐射强度\n\n\n\n\n在这里,从广义可加混合效应模型的角度来对核污染数据建模,空间效应仍然是用高斯过程来表示,响应变量服从带漂移项的泊松分布。采用 mgcv 包 (S. N. 
Wood 2004) 的函数 gam() 拟合模型,其中,含 49 个参数的样条近似高斯过程,高斯过程的核函数为默认的梅隆型。更多详情见 mgcv 包的函数 s() 帮助文档参数的说明,默认值是梅隆型相关函数及默认的范围参数,作者自己定义了一套符号约定。\n\nlibrary(nlme)\nlibrary(mgcv)\nfit_rongelap_gam <- gam(\n counts ~ s(cX, cY, bs = \"gp\", k = 50), offset = log(time), \n data = rongelap, family = poisson(link = \"log\")\n)\n# 模型输出\nsummary(fit_rongelap_gam)\n\n#> \n#> Family: poisson \n#> Link function: log \n#> \n#> Formula:\n#> counts ~ s(cX, cY, bs = \"gp\", k = 50)\n#> \n#> Parametric coefficients:\n#> Estimate Std. Error z value Pr(>|z|) \n#> (Intercept) 1.976815 0.001642 1204 <2e-16 ***\n#> ---\n#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1\n#> \n#> Approximate significance of smooth terms:\n#> edf Ref.df Chi.sq p-value \n#> s(cX,cY) 48.98 49 34030 <2e-16 ***\n#> ---\n#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1\n#> \n#> R-sq.(adj) = 0.876 Deviance explained = 60.7%\n#> UBRE = 153.78 Scale est. = 1 n = 157\n\n# 随机效应\ngam.vcomp(fit_rongelap_gam)\n\n#> s(cX,cY) \n#> 2543.376\n\n\n值得一提的是核函数的类型和默认参数的选择,参数 m 接受一个向量, m[1] 取值为 1 至 5,分别代表球型 spherical, 幂指数 power exponential 和梅隆型 Matern with \\(\\kappa\\) = 1.5, 2.5 or 3.5 等 5 种相关/核函数。\n\n# 球型相关函数及范围参数为 0.5\nfit_rongelap_gam <- gam(\n counts ~ s(cX, cY, bs = \"gp\", k = 50, m = c(1, .5)),\n offset = log(time), data = rongelap, family = poisson(link = \"log\")\n)\n\n接下来,基于岛屿的海岸线数据划分出网格,将格点作为新的预测位置。\n\nlibrary(sf)\n\n#> Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE\n\nlibrary(abind)\nlibrary(stars)\n# 类型转化\nrongelap_sf <- st_as_sf(rongelap, coords = c(\"cX\", \"cY\"), dim = \"XY\")\nrongelap_coastline_sf <- st_as_sf(rongelap_coastline, coords = c(\"cX\", \"cY\"), dim = \"XY\")\nrongelap_coastline_sfp <- st_cast(st_combine(st_geometry(rongelap_coastline_sf)), \"POLYGON\")\n# 添加缓冲区\nrongelap_coastline_buffer <- st_buffer(rongelap_coastline_sfp, dist = 50)\n# 构造带边界约束的网格\nrongelap_coastline_grid <- st_make_grid(rongelap_coastline_buffer, n = c(150, 75))\n# 将 sfc 类型转化为 sf 
类型\nrongelap_coastline_grid <- st_as_sf(rongelap_coastline_grid)\nrongelap_coastline_buffer <- st_as_sf(rongelap_coastline_buffer)\nrongelap_grid <- rongelap_coastline_grid[rongelap_coastline_buffer, op = st_intersects]\n# 计算网格中心点坐标\nrongelap_grid_centroid <- st_centroid(rongelap_grid)\n# 共计 1612 个预测点\nrongelap_grid_df <- as.data.frame(st_coordinates(rongelap_grid_centroid))\ncolnames(rongelap_grid_df) <- c(\"cX\", \"cY\")\n\n模型对象 fit_rongelap_gam 在新的格点上预测核辐射强度,接着整理预测结果数据。\n\n# 预测\nrongelap_grid_df$ypred <- as.vector(predict(fit_rongelap_gam, newdata = rongelap_grid_df, type = \"response\")) \n# 整理预测数据\nrongelap_grid_sf <- st_as_sf(rongelap_grid_df, coords = c(\"cX\", \"cY\"), dim = \"XY\")\nrongelap_grid_stars <- st_rasterize(rongelap_grid_sf, nx = 150, ny = 75)\nrongelap_stars <- st_crop(x = rongelap_grid_stars, y = rongelap_coastline_sfp)\n\n最后,将岛上各个格点的核辐射强度绘制出来,给出全岛核辐射强度的空间分布。\n\n代码library(ggplot2)\nggplot() +\n geom_stars(data = rongelap_stars, aes(fill = ypred), na.action = na.omit) +\n geom_sf(data = rongelap_coastline_sfp, fill = NA, color = \"gray50\", linewidth = 0.5) +\n scale_fill_viridis_c(option = \"C\") +\n theme_bw() +\n labs(x = \"横坐标(米)\", y = \"纵坐标(米)\", fill = \"预测值\")\n\n\n\n\n\n\n图 37.6: 核辐射强度的预测分布\n\n\n\n\n\n37.2.2 cmdstanr\nFRK 包 (Sainsbury-Dale, Zammit-Mangion, 和 Cressie 2022)(Fixed Rank Kriging,固定秩克里金) 可对有一定规模的(时空)空间区域数据和点参考数据集建模,响应变量的分布从高斯分布扩展到指数族,放在(时空)空间广义线性混合效应模型的框架下统一建模。然而,不支持带漂移项的泊松分布。\nbrms 包支持一大类贝叶斯统计模型,但是对高斯过程建模十分低效,当遇到有一定规模的数据,建模是不可行的,因为经过对 brms 包生成的模型代码的分析,发现它采用潜变量高斯过程(latent variable GP)模型,这也是采样效率低下的一个关键因素。\n\n# 预计运行 1 个小时以上\nrongelap_brm <- brms::brm(counts ~ gp(cX, cY) + offset(log(time)),\n data = rongelap, family = poisson(link = \"log\")\n)\n# 基样条近似拟合也很慢\nrongelap_brm <- brms::brm(\n counts ~ gp(cX, cY, c = 5/4, k = 5) + offset(log(time)),\n data = rongelap, family = poisson(link = \"log\")\n)\n\n当设置 \\(k = 5\\) 时,用 5 个基函数来近似高斯过程,编译完成后,采样速度很快,但是结果不可靠,采样过程中的问题很多。当将横、纵坐标值同时缩小 6000 倍,采样效率并未得到改善。当设置 \\(k = 15\\) 
时,运行时间明显增加,采样过程的诊断结果类似 \\(k = 5\\) 的情况,还是不可靠。截止写作时间,函数 gp() 的参数 cov 只能取指数二次核函数(exponentiated-quadratic kernel) 。说明 brms 包不适合处理含高斯过程的模型。\n实际上,Stan 没有现成的有效算法或扩展包做有规模的高斯过程建模,详见 Bob Carpenter 在 2023 年 Stan 大会的报告,因此,必须采用一些近似方法,通过 Stan 编码实现。接下来,分别手动实现低秩和基样条两种方法近似边际高斯过程(marginal likelihood GP)(Rasmussen 和 Williams 2006),用 Stan 编码模型。代码文件分别是 rongelap_poisson_lr.stan 和 rongelap_poisson_splines.stan 。\n\nlibrary(cmdstanr)\n\n\n37.2.3 GINLA\nmgcv 包的函数 ginla() 实现简化版的 Integrated Nested Laplace Approximation, INLA (Simon N. Wood 2019)。\n\nrongelap_gam <- gam(\n counts ~ s(cX, cY, bs = \"gp\", k = 50), offset = log(time), \n data = rongelap, family = poisson(link = \"log\"), fit = FALSE\n)\n# 简化版 INLA\nrongelap_ginla <- ginla(G = rongelap_gam)\nstr(rongelap_ginla)\n\n#> List of 2\n#> $ density: num [1:50, 1:100] 2.49e-01 9.03e-06 3.51e-06 1.97e-06 1.17e-06 ...\n#> $ beta : num [1:50, 1:100] 1.97 -676.61 -572.67 4720.77 240.12 ...\n\n\n其中, \\(k = 50\\) 表示 49 个样条参数,每个参数的分布对应有 100 个采样点,另外,截距项的边际后验概率密度分布如下:\n\nplot(\n rongelap_ginla$beta[1, ], rongelap_ginla$density[1, ],\n type = \"l\", xlab = \"截距项\", ylab = \"概率密度\"\n)\n\n\n\n\n\n\n图 37.7: 截距项的边际后验概率密度分布\n\n\n\n\n不难看出,截距项在 1.976 至 1.978 之间,50个参数的最大后验估计分别如下:\n\nidx <- apply(rongelap_ginla$density, 1, function(x) x == max(x))\nrongelap_ginla$beta[t(idx)]\n\n#> [1] 1.977019e+00 -5.124099e+02 5.461183e+03 1.515296e+03 -2.822166e+03\n#> [6] -1.598371e+04 -6.417855e+03 1.938121e+02 -4.270878e+03 3.769951e+03\n#> [11] -1.002035e+04 1.914717e+03 -9.721572e+03 -3.794461e+04 -1.401549e+04\n#> [16] -5.376582e+04 -1.585899e+04 -2.338235e+04 6.239053e+04 -3.574500e+02\n#> [21] -4.587927e+04 1.723604e+04 -4.514781e+03 9.184026e-02 3.496526e-01\n#> [26] -1.477406e+02 4.585057e+03 9.153647e+03 1.929387e+04 -1.116512e+04\n#> [31] -1.166149e+04 8.079451e+02 3.627369e+03 -9.835680e+03 1.357777e+04\n#> [36] 1.487742e+04 3.880562e+04 -1.708858e+03 2.775844e+04 2.527415e+04\n#> [41] -3.932957e+04 3.548123e+04 -1.116341e+04 1.630910e+04 -9.789381e+02\n#> 
[46] -2.011250e+04 2.699657e+04 -4.744393e+04 2.753347e+04 2.834356e+04\n\n\n\n37.2.4 INLA\n接下来,介绍完整版的近似贝叶斯推断方法 INLA — 集成嵌套拉普拉斯近似 (Integrated Nested Laplace Approximations,简称 INLA) (Rue, Martino, 和 Chopin 2009)。根据研究区域的边界构造非凸的内外边界,处理边界效应。\n\nlibrary(INLA)\nlibrary(splancs)\n# 构造非凸的边界\nboundary <- list(\n inla.nonconvex.hull(\n points = as.matrix(rongelap_coastline[,c(\"cX\", \"cY\")]), \n convex = 100, concave = 150, resolution = 100),\n inla.nonconvex.hull(\n points = as.matrix(rongelap_coastline[,c(\"cX\", \"cY\")]), \n convex = 200, concave = 200, resolution = 200)\n)\n\n根据研究区域的情况构造网格,边界内部三角网格最大边长为 300,边界外部最大边长为 600,边界外凸出距离为 100 米。\n\n# 构造非凸的网格\nmesh <- inla.mesh.2d(\n loc = as.matrix(rongelap[, c(\"cX\", \"cY\")]), offset = 100,\n max.edge = c(300, 600), boundary = boundary\n)\n\n构建 SPDE,指定自协方差函数为指数型,则 \\(\\nu = 1/2\\) ,因是二维平面,则 \\(d = 2\\) ,根据 \\(\\alpha = \\nu + d/2\\) ,从而 alpha = 3/2 。\n\nspde <- inla.spde2.matern(mesh = mesh, alpha = 3/2, constr = TRUE)\n\n生成 SPDE 模型的指标集,也是随机效应部分。\n\nindexs <- inla.spde.make.index(name = \"s\", n.spde = spde$n.spde)\nlengths(indexs)\n\n#> s s.group s.repl \n#> 691 691 691\n\n\n投影矩阵,三角网格和采样点坐标之间的投影。观测数据 rongelap 和未采样待预测的位置数据 rongelap_grid_df\n\n# 观测位置投影到三角网格上\nA <- inla.spde.make.A(mesh = mesh, loc = as.matrix(rongelap[, c(\"cX\", \"cY\")]) )\n# 预测位置投影到三角网格上\ncoop <- as.matrix(rongelap_grid_df[, c(\"cX\", \"cY\")])\nAp <- inla.spde.make.A(mesh = mesh, loc = coop)\n# 1612 个预测位置\ndim(Ap)\n\n#> [1] 1612 691\n\n\n准备观测数据和预测位置,构造一个 INLA 可以使用的数据栈 Data Stack。\n\n# 在采样点的位置上估计 estimation stk.e\nstk.e <- inla.stack(\n tag = \"est\",\n data = list(y = rongelap$counts, E = rongelap$time),\n A = list(rep(1, 157), A),\n effects = list(data.frame(b0 = 1), s = indexs)\n)\n\n# 在新生成的位置上预测 prediction stk.p\nstk.p <- inla.stack(\n tag = \"pred\",\n data = list(y = NA, E = NA),\n A = list(rep(1, 1612), Ap),\n effects = list(data.frame(b0 = 1), s = indexs)\n)\n\n# 合并数据 stk.full has stk.e and stk.p\nstk.full <- inla.stack(stk.e, 
stk.p)\n\n指定响应变量与漂移项、联系函数、模型公式。\n\n# 精简输出\ninla.setOption(short.summary = TRUE)\n# 模型拟合\nres <- inla(formula = y ~ 0 + b0 + f(s, model = spde),\n data = inla.stack.data(stk.full),\n E = E, # E 已知漂移项\n control.family = list(link = \"log\"),\n control.predictor = list(\n compute = TRUE, \n link = 1, # 与 control.family 联系函数相同\n A = inla.stack.A(stk.full)\n ),\n control.compute = list(\n cpo = TRUE, \n waic = TRUE, # WAIC 统计量 通用信息准则\n dic = TRUE # DIC 统计量 偏差信息准则\n ),\n family = \"poisson\"\n)\n# 模型输出\nsummary(res)\n\n#> Fixed effects:\n#> mean sd 0.025quant 0.5quant 0.975quant mode kld\n#> b0 1.828 0.061 1.706 1.828 1.948 1.828 0\n#> \n#> Model hyperparameters:\n#> mean sd 0.025quant 0.5quant 0.975quant mode\n#> Theta1 for s 2.00 0.062 1.88 2.00 2.12 2.00\n#> Theta2 for s -4.85 0.130 -5.11 -4.85 -4.59 -4.85\n#> \n#> Deviance Information Criterion (DIC) ...............: 1834.57\n#> Deviance Information Criterion (DIC, saturated) ....: 314.90\n#> Effective number of parameters .....................: 156.46\n#> \n#> Watanabe-Akaike information criterion (WAIC) ...: 1789.32\n#> Effective number of parameters .................: 80.06\n#> \n#> is computed\n\n\n\nkld 表示 Kullback-Leibler divergence (KLD) 它的值描述标准高斯分布与 Simplified Laplace Approximation 之间的差别,值越小越表示拉普拉斯的近似效果好。\nDIC 和 WAIC 指标都是评估模型预测表现的。另外,还有两个量计算出来了,但是没有显示,分别是 CPO 和 PIT 。CPO 表示 Conditional Predictive Ordinate (CPO),PIT 表示 Probability Integral Transforms (PIT) 。\n\n固定效应(截距)和超参数部分\n\n# 截距\nres$summary.fixed\n\n#> mean sd 0.025quant 0.5quant 0.975quant mode kld\n#> b0 1.828027 0.06147354 1.706422 1.828284 1.948169 1.828279 1.782545e-08\n\n# 超参数\nres$summary.hyperpar\n\n#> mean sd 0.025quant 0.5quant 0.975quant mode\n#> Theta1 for s 2.000684 0.0623506 1.876512 2.001169 2.122006 2.003209\n#> Theta2 for s -4.851258 0.1297349 -5.105062 -4.851807 -4.594250 -4.854094\n\n\n提取预测数据,并整理数据。\n\n# 预测值对应的指标集合\nindex <- inla.stack.index(stk.full, tag = \"pred\")$data\n# 提取预测结果,后验均值\n# pred_mean <- res$summary.fitted.values[index, 
\"mean\"]\n# 95% 预测下限\n# pred_ll <- res$summary.fitted.values[index, \"0.025quant\"]\n# 95% 预测上限\n# pred_ul <- res$summary.fitted.values[index, \"0.975quant\"]\n# 整理数据\nrongelap_grid_df$ypred <- res$summary.fitted.values[index, \"mean\"]\n# 预测值数据\nrongelap_grid_sf <- st_as_sf(rongelap_grid_df, coords = c(\"cX\", \"cY\"), dim = \"XY\")\nrongelap_grid_stars <- st_rasterize(rongelap_grid_sf, nx = 150, ny = 75)\nrongelap_stars <- st_crop(x = rongelap_grid_stars, y = rongelap_coastline_sfp)\n\n最后,类似之前 mgcv 建模的最后一步,将 INLA 的预测结果绘制出来。\n\nggplot() +\n geom_stars(data = rongelap_stars, aes(fill = ypred), na.action = na.omit) +\n geom_sf(data = rongelap_coastline_sfp, fill = NA, color = \"gray50\", linewidth = 0.5) +\n scale_fill_viridis_c(option = \"C\") +\n theme_bw() +\n labs(x = \"横坐标(米)\", y = \"纵坐标(米)\", fill = \"预测值\")\n\n\n\n\n\n\n图 37.8: 核辐射强度的预测分布", + "text": "37.2 案例:朗格拉普岛核污染\n从线性到可加,意味着从线性到非线性,可加模型容纳非线性的成分,比如高斯过程、样条。\n\n37.2.1 mgcv\n本节复用 章节 28 朗格拉普岛核污染数据,相关背景不再赘述,下面首先加载数据到 R 环境。\n\n# 加载数据\nrongelap <- readRDS(file = \"data/rongelap.rds\")\nrongelap_coastline <- readRDS(file = \"data/rongelap_coastline.rds\")\n\n接着,将岛上各采样点的辐射强度展示出来,算是简单回顾一下数据概况。\n\n代码library(plot3D)\nwith(rongelap, {\n opar <- par(mar = c(.1, 2.5, .1, .1), no.readonly = TRUE)\n rongelap_coastline$cZ <- 0\n scatter3D(\n x = cX, y = cY, z = counts / time, \n xlim = c(-6500, 50), ylim = c(-3800, 110),\n xlab = \"\\n横坐标(米)\", ylab = \"\\n纵坐标(米)\",\n zlab = \"\\n辐射强度\", lwd = 0.5, cex = 0.8,\n pch = 16, type = \"h\", ticktype = \"detailed\",\n phi = 40, theta = -30, r = 50, d = 1,\n expand = 0.5, box = TRUE, bty = \"b\",\n colkey = F, col = \"black\",\n panel.first = function(trans) {\n XY <- trans3D(\n x = rongelap_coastline$cX,\n y = rongelap_coastline$cY,\n z = rongelap_coastline$cZ,\n pmat = trans\n )\n lines(XY, col = \"gray50\", lwd = 2)\n }\n )\n rongelap_coastline$cZ <- NULL\n on.exit(par(opar), add = TRUE)\n})\n\n\n\n\n\n\n图 37.5: 
岛上各采样点的辐射强度\n\n\n\n\n在这里,从广义可加混合效应模型的角度来对核污染数据建模,空间效应仍然是用高斯过程来表示,响应变量服从带漂移项的泊松分布。采用 mgcv 包 (S. N. Wood 2004) 的函数 gam() 拟合模型,其中,含 49 个参数的样条近似高斯过程,高斯过程的核函数为默认的梅隆型。更多详情见 mgcv 包的函数 s() 帮助文档参数的说明,默认值是梅隆型相关函数及默认的范围参数,作者自己定义了一套符号约定。\n\nlibrary(nlme)\nlibrary(mgcv)\nfit_rongelap_gam <- gam(\n counts ~ s(cX, cY, bs = \"gp\", k = 50), offset = log(time), \n data = rongelap, family = poisson(link = \"log\")\n)\n# 模型输出\nsummary(fit_rongelap_gam)\n\n#> \n#> Family: poisson \n#> Link function: log \n#> \n#> Formula:\n#> counts ~ s(cX, cY, bs = \"gp\", k = 50)\n#> \n#> Parametric coefficients:\n#> Estimate Std. Error z value Pr(>|z|) \n#> (Intercept) 1.976815 0.001642 1204 <2e-16 ***\n#> ---\n#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1\n#> \n#> Approximate significance of smooth terms:\n#> edf Ref.df Chi.sq p-value \n#> s(cX,cY) 48.98 49 34030 <2e-16 ***\n#> ---\n#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1\n#> \n#> R-sq.(adj) = 0.876 Deviance explained = 60.7%\n#> UBRE = 153.78 Scale est. 
= 1 n = 157\n\n# 随机效应\ngam.vcomp(fit_rongelap_gam)\n\n#> s(cX,cY) \n#> 2543.376\n\n\n值得一提的是核函数的类型和默认参数的选择,参数 m 接受一个向量, m[1] 取值为 1 至 5,分别代表球型 spherical, 幂指数 power exponential 和梅隆型 Matern with \\(\\kappa\\) = 1.5, 2.5 or 3.5 等 5 种相关/核函数。\n\n# 球型相关函数及范围参数为 0.5\nfit_rongelap_gam <- gam(\n counts ~ s(cX, cY, bs = \"gp\", k = 50, m = c(1, .5)),\n offset = log(time), data = rongelap, family = poisson(link = \"log\")\n)\n\n接下来,基于岛屿的海岸线数据划分出网格,将格点作为新的预测位置。\n\nlibrary(sf)\n\n#> Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE\n\nlibrary(abind)\nlibrary(stars)\n# 类型转化\nrongelap_sf <- st_as_sf(rongelap, coords = c(\"cX\", \"cY\"), dim = \"XY\")\nrongelap_coastline_sf <- st_as_sf(rongelap_coastline, coords = c(\"cX\", \"cY\"), dim = \"XY\")\nrongelap_coastline_sfp <- st_cast(st_combine(st_geometry(rongelap_coastline_sf)), \"POLYGON\")\n# 添加缓冲区\nrongelap_coastline_buffer <- st_buffer(rongelap_coastline_sfp, dist = 50)\n# 构造带边界约束的网格\nrongelap_coastline_grid <- st_make_grid(rongelap_coastline_buffer, n = c(150, 75))\n# 将 sfc 类型转化为 sf 类型\nrongelap_coastline_grid <- st_as_sf(rongelap_coastline_grid)\nrongelap_coastline_buffer <- st_as_sf(rongelap_coastline_buffer)\nrongelap_grid <- rongelap_coastline_grid[rongelap_coastline_buffer, op = st_intersects]\n# 计算网格中心点坐标\nrongelap_grid_centroid <- st_centroid(rongelap_grid)\n# 共计 1612 个预测点\nrongelap_grid_df <- as.data.frame(st_coordinates(rongelap_grid_centroid))\ncolnames(rongelap_grid_df) <- c(\"cX\", \"cY\")\n\n模型对象 fit_rongelap_gam 在新的格点上预测核辐射强度,接着整理预测结果数据。\n\n# 预测\nrongelap_grid_df$ypred <- as.vector(predict(fit_rongelap_gam, newdata = rongelap_grid_df, type = \"response\")) \n# 整理预测数据\nrongelap_grid_sf <- st_as_sf(rongelap_grid_df, coords = c(\"cX\", \"cY\"), dim = \"XY\")\nrongelap_grid_stars <- st_rasterize(rongelap_grid_sf, nx = 150, ny = 75)\nrongelap_stars <- st_crop(x = rongelap_grid_stars, y = rongelap_coastline_sfp)\n\n最后,将岛上各个格点的核辐射强度绘制出来,给出全岛核辐射强度的空间分布。\n\n代码library(ggplot2)\nggplot() +\n geom_stars(data 
= rongelap_stars, aes(fill = ypred), na.action = na.omit) +\n geom_sf(data = rongelap_coastline_sfp, fill = NA, color = \"gray50\", linewidth = 0.5) +\n scale_fill_viridis_c(option = \"C\") +\n theme_bw() +\n labs(x = \"横坐标(米)\", y = \"纵坐标(米)\", fill = \"预测值\")\n\n\n\n\n\n\n图 37.6: 核辐射强度的预测分布\n\n\n\n\n\n37.2.2 cmdstanr\nFRK 包 (Sainsbury-Dale, Zammit-Mangion, 和 Cressie 2022)(Fixed Rank Kriging,固定秩克里金) 可对有一定规模的(时空)空间区域数据和点参考数据集建模,响应变量的分布从高斯分布扩展到指数族,放在(时空)空间广义线性混合效应模型的框架下统一建模。然而,不支持带漂移项的泊松分布。\nbrms 包支持一大类贝叶斯统计模型,但是对高斯过程建模十分低效,当遇到有一定规模的数据,建模是不可行的,因为经过对 brms 包生成的模型代码的分析,发现它采用潜变量高斯过程(latent variable GP)模型,这也是采样效率低下的一个关键因素。\n\n# 预计运行 1 个小时以上\nrongelap_brm <- brms::brm(counts ~ gp(cX, cY) + offset(log(time)),\n data = rongelap, family = poisson(link = \"log\")\n)\n# 基样条近似拟合也很慢\nrongelap_brm <- brms::brm(\n counts ~ gp(cX, cY, c = 5/4, k = 5) + offset(log(time)),\n data = rongelap, family = poisson(link = \"log\")\n)\n\n当设置 \\(k = 5\\) 时,用 5 个基函数来近似高斯过程,编译完成后,采样速度很快,但是结果不可靠,采样过程中的问题很多。当将横、纵坐标值同时缩小 6000 倍,采样效率并未得到改善。当设置 \\(k = 15\\) 时,运行时间明显增加,采样过程的诊断结果类似 \\(k = 5\\) 的情况,还是不可靠。截止写作时间,函数 gp() 的参数 cov 只能取指数二次核函数(exponentiated-quadratic kernel) 。说明 brms 包不适合处理含高斯过程的模型。\n实际上,Stan 没有现成的有效算法或扩展包做有规模的高斯过程建模,详见 Bob Carpenter 在 2023 年 Stan 大会的报告,因此,必须采用一些近似方法,通过 Stan 编码实现。接下来,分别手动实现低秩和基样条两种方法近似边际高斯过程(marginal likelihood GP)(Rasmussen 和 Williams 2006),用 Stan 编码模型。代码文件分别是 rongelap_poisson_lr.stan 和 rongelap_poisson_splines.stan 。\n\nlibrary(cmdstanr)\n\n\n37.2.3 GINLA\nmgcv 包的函数 ginla() 实现简化版的 Integrated Nested Laplace Approximation, INLA (Simon N. 
Wood 2019)。\n\nrongelap_gam <- gam(\n counts ~ s(cX, cY, bs = \"gp\", k = 50), offset = log(time), \n data = rongelap, family = poisson(link = \"log\"), fit = FALSE\n)\n# 简化版 INLA\nrongelap_ginla <- ginla(G = rongelap_gam)\nstr(rongelap_ginla)\n\n#> List of 2\n#> $ density: num [1:50, 1:100] 2.49e-01 9.03e-06 3.51e-06 1.97e-06 1.17e-06 ...\n#> $ beta : num [1:50, 1:100] 1.97 -676.61 -572.67 4720.77 240.12 ...\n\n\n其中, \\(k = 50\\) 表示 49 个样条参数,每个参数的分布对应有 100 个采样点,另外,截距项的边际后验概率密度分布如下:\n\nplot(\n rongelap_ginla$beta[1, ], rongelap_ginla$density[1, ],\n type = \"l\", xlab = \"截距项\", ylab = \"概率密度\"\n)\n\n\n\n\n\n\n图 37.7: 截距项的边际后验概率密度分布\n\n\n\n\n不难看出,截距项在 1.976 至 1.978 之间,50个参数的最大后验估计分别如下:\n\nidx <- apply(rongelap_ginla$density, 1, function(x) x == max(x))\nrongelap_ginla$beta[t(idx)]\n\n#> [1] 1.977019e+00 -5.124099e+02 5.461183e+03 1.515296e+03 -2.822166e+03\n#> [6] -1.598371e+04 -6.417855e+03 1.938121e+02 -4.270878e+03 3.769951e+03\n#> [11] -1.002035e+04 1.914717e+03 -9.721572e+03 -3.794461e+04 -1.401549e+04\n#> [16] -5.376582e+04 -1.585899e+04 -2.338235e+04 6.239053e+04 -3.574500e+02\n#> [21] -4.587927e+04 1.723604e+04 -4.514781e+03 9.184026e-02 3.496526e-01\n#> [26] -1.477406e+02 4.585057e+03 9.153647e+03 1.929387e+04 -1.116512e+04\n#> [31] -1.166149e+04 8.079451e+02 3.627369e+03 -9.835680e+03 1.357777e+04\n#> [36] 1.487742e+04 3.880562e+04 -1.708858e+03 2.775844e+04 2.527415e+04\n#> [41] -3.932957e+04 3.548123e+04 -1.116341e+04 1.630910e+04 -9.789381e+02\n#> [46] -2.011250e+04 2.699657e+04 -4.744393e+04 2.753347e+04 2.834356e+04\n\n\n\n37.2.4 INLA\n接下来,介绍完整版的近似贝叶斯推断方法 INLA — 集成嵌套拉普拉斯近似 (Integrated Nested Laplace Approximations,简称 INLA) (Rue, Martino, 和 Chopin 2009)。根据研究区域的边界构造非凸的内外边界,处理边界效应。\n\nlibrary(INLA)\nlibrary(splancs)\n# 构造非凸的边界\nboundary <- list(\n inla.nonconvex.hull(\n points = as.matrix(rongelap_coastline[,c(\"cX\", \"cY\")]), \n convex = 100, concave = 150, resolution = 100),\n inla.nonconvex.hull(\n points = as.matrix(rongelap_coastline[,c(\"cX\", 
\"cY\")]), \n convex = 200, concave = 200, resolution = 200)\n)\n\n根据研究区域的情况构造网格,边界内部三角网格最大边长为 300,边界外部最大边长为 600,边界外凸出距离为 100 米。\n\n# 构造非凸的网格\nmesh <- inla.mesh.2d(\n loc = as.matrix(rongelap[, c(\"cX\", \"cY\")]), offset = 100,\n max.edge = c(300, 600), boundary = boundary\n)\n\n构建 SPDE,指定自协方差函数为指数型,则 \\(\\nu = 1/2\\) ,因是二维平面,则 \\(d = 2\\) ,根据 \\(\\alpha = \\nu + d/2\\) ,从而 alpha = 3/2 。\n\nspde <- inla.spde2.matern(mesh = mesh, alpha = 3/2, constr = TRUE)\n\n生成 SPDE 模型的指标集,也是随机效应部分。\n\nindexs <- inla.spde.make.index(name = \"s\", n.spde = spde$n.spde)\nlengths(indexs)\n\n#> s s.group s.repl \n#> 691 691 691\n\n\n投影矩阵,三角网格和采样点坐标之间的投影。观测数据 rongelap 和未采样待预测的位置数据 rongelap_grid_df\n\n# 观测位置投影到三角网格上\nA <- inla.spde.make.A(mesh = mesh, loc = as.matrix(rongelap[, c(\"cX\", \"cY\")]) )\n# 预测位置投影到三角网格上\ncoop <- as.matrix(rongelap_grid_df[, c(\"cX\", \"cY\")])\nAp <- inla.spde.make.A(mesh = mesh, loc = coop)\n# 1612 个预测位置\ndim(Ap)\n\n#> [1] 1612 691\n\n\n准备观测数据和预测位置,构造一个 INLA 可以使用的数据栈 Data Stack。\n\n# 在采样点的位置上估计 estimation stk.e\nstk.e <- inla.stack(\n tag = \"est\",\n data = list(y = rongelap$counts, E = rongelap$time),\n A = list(rep(1, 157), A),\n effects = list(data.frame(b0 = 1), s = indexs)\n)\n\n# 在新生成的位置上预测 prediction stk.p\nstk.p <- inla.stack(\n tag = \"pred\",\n data = list(y = NA, E = NA),\n A = list(rep(1, 1612), Ap),\n effects = list(data.frame(b0 = 1), s = indexs)\n)\n\n# 合并数据 stk.full has stk.e and stk.p\nstk.full <- inla.stack(stk.e, stk.p)\n\n指定响应变量与漂移项、联系函数、模型公式。\n\n# 精简输出\ninla.setOption(short.summary = TRUE)\n# 模型拟合\nres <- inla(formula = y ~ 0 + b0 + f(s, model = spde),\n data = inla.stack.data(stk.full),\n E = E, # E 已知漂移项\n control.family = list(link = \"log\"),\n control.predictor = list(\n compute = TRUE, \n link = 1, # 与 control.family 联系函数相同\n A = inla.stack.A(stk.full)\n ),\n control.compute = list(\n cpo = TRUE, \n waic = TRUE, # WAIC 统计量 通用信息准则\n dic = TRUE # DIC 统计量 偏差信息准则\n ),\n family = \"poisson\"\n)\n# 模型输出\nsummary(res)\n\n#> Fixed 
effects:\n#> mean sd 0.025quant 0.5quant 0.975quant mode kld\n#> b0 1.828 0.061 1.706 1.828 1.948 1.828 0\n#> \n#> Model hyperparameters:\n#> mean sd 0.025quant 0.5quant 0.975quant mode\n#> Theta1 for s 2.00 0.062 1.88 2.00 2.12 2.00\n#> Theta2 for s -4.85 0.130 -5.11 -4.85 -4.59 -4.85\n#> \n#> Deviance Information Criterion (DIC) ...............: 1834.57\n#> Deviance Information Criterion (DIC, saturated) ....: 314.90\n#> Effective number of parameters .....................: 156.46\n#> \n#> Watanabe-Akaike information criterion (WAIC) ...: 1789.32\n#> Effective number of parameters .................: 80.06\n#> \n#> is computed\n\n\n\nkld 表示 Kullback-Leibler divergence (KLD) 它的值描述标准高斯分布与 Simplified Laplace Approximation 之间的差别,值越小越表示拉普拉斯的近似效果好。\nDIC 和 WAIC 指标都是评估模型预测表现的。另外,还有两个量计算出来了,但是没有显示,分别是 CPO 和 PIT 。CPO 表示 Conditional Predictive Ordinate (CPO),PIT 表示 Probability Integral Transforms (PIT) 。\n\n固定效应(截距)和超参数部分\n\n# 截距\nres$summary.fixed\n\n#> mean sd 0.025quant 0.5quant 0.975quant mode kld\n#> b0 1.828027 0.06147357 1.706422 1.828283 1.948169 1.828279 1.782558e-08\n\n# 超参数\nres$summary.hyperpar\n\n#> mean sd 0.025quant 0.5quant 0.975quant mode\n#> Theta1 for s 2.000684 0.06235036 1.876512 2.001169 2.122006 2.003209\n#> Theta2 for s -4.851258 0.12973214 -5.105057 -4.851807 -4.594256 -4.854095\n\n\n提取预测数据,并整理数据。\n\n# 预测值对应的指标集合\nindex <- inla.stack.index(stk.full, tag = \"pred\")$data\n# 提取预测结果,后验均值\n# pred_mean <- res$summary.fitted.values[index, \"mean\"]\n# 95% 预测下限\n# pred_ll <- res$summary.fitted.values[index, \"0.025quant\"]\n# 95% 预测上限\n# pred_ul <- res$summary.fitted.values[index, \"0.975quant\"]\n# 整理数据\nrongelap_grid_df$ypred <- res$summary.fitted.values[index, \"mean\"]\n# 预测值数据\nrongelap_grid_sf <- st_as_sf(rongelap_grid_df, coords = c(\"cX\", \"cY\"), dim = \"XY\")\nrongelap_grid_stars <- st_rasterize(rongelap_grid_sf, nx = 150, ny = 75)\nrongelap_stars <- st_crop(x = rongelap_grid_stars, y = rongelap_coastline_sfp)\n\n最后,类似之前 mgcv 建模的最后一步,将 INLA 
的预测结果绘制出来。\n\nggplot() +\n geom_stars(data = rongelap_stars, aes(fill = ypred), na.action = na.omit) +\n geom_sf(data = rongelap_coastline_sfp, fill = NA, color = \"gray50\", linewidth = 0.5) +\n scale_fill_viridis_c(option = \"C\") +\n theme_bw() +\n labs(x = \"横坐标(米)\", y = \"纵坐标(米)\", fill = \"预测值\")\n\n\n\n\n\n\n图 37.8: 核辐射强度的预测分布", "crumbs": [ "贝叶斯建模", "37  广义可加模型" @@ -2609,7 +2609,7 @@ "href": "time-series-regression.html#随机波动率模型", "title": "39  时间序列回归", "section": "", - "text": "meituan_log_return <- diff(log(meituan[, \"3690.HK.Adjusted\"]))[-1]\nautoplot(meituan_log_return) +\n theme_classic() +\n labs(x = \"日期\", y = \"对数收益率\")\nggplot(data = meituan_log_return, aes(x = `3690.HK.Adjusted`)) +\n geom_histogram(color = \"black\", fill = \"gray\", bins = 30) +\n theme_classic() +\n labs(x = \"对数收益率\", y = \"频数(天数)\")\n\n\n\n\n\n\n\n\n\n(a) 对数收益率的变动\n\n\n\n\n\n\n\n\n\n(b) 对数收益率的分布\n\n\n\n\n\n\n图 39.2: 美团股价对数收益率的情况\n\n\n\n\n\n\n\n\n\n\n39.1.1 Stan 框架\n随机波动率模型如下\n\\[\n\\begin{aligned}\ny_t &= \\epsilon_t \\exp(h_t / 2) \\\\\nh_{t+1} &= \\mu + \\phi (h_t - \\mu) + \\delta_t \\sigma \\\\\nh_1 &\\sim \\textsf{normal}\\left( \\mu, \\frac{\\sigma}{\\sqrt{1 - \\phi^2}} \\right) \\\\\n\\epsilon_t &\\sim \\textsf{normal}(0,1) \\\\\n\\delta_t &\\sim \\textsf{normal}(0,1)\n\\end{aligned}\n\\]\n其中, \\(y_t\\) 表示在时间 \\(t\\) 时股价的回报(对数收益率),\\(\\epsilon_t\\) 表示股价回报在时间 \\(t\\) 时的白噪声扰/波动,\\(\\delta_t\\) 表示波动率在时间\\(t\\) 时的波动。\\(h_t\\) 表示对数波动率,带有参数 \\(\\mu\\) (对数波动率的均值),\\(\\phi\\) (对数波动率的趋势)。代表波动率的序列 \\(\\{h_t\\}\\) 假定是平稳 \\((|\\phi| < 1)\\) 的随机过程,\\(h_1\\) 来自平稳的分布(此处为正态分布),\\(\\epsilon_t\\) 和 \\(\\delta_t\\) 是服从不相关的标准正态分布。\nStan 代码如下\ndata {\n int<lower=0> T; // # time points (equally spaced)\n vector[T] y; // mean corrected return at time t\n}\nparameters {\n real mu; // mean log volatility\n real<lower=-1, upper=1> phi; // persistence of volatility\n real<lower=0> sigma; // white noise shock scale\n vector[T] h_std; // std log volatility time t\n}\ntransformed parameters {\n 
vector[T] h = h_std * sigma; // now h ~ normal(0, sigma)\n h[1] /= sqrt(1 - phi * phi); // rescale h[1]\n h += mu;\n for (t in 2:T) {\n h[t] += phi * (h[t - 1] - mu);\n }\n}\nmodel {\n phi ~ uniform(-1, 1);\n sigma ~ cauchy(0, 5);\n mu ~ cauchy(0, 10);\n \n h_std ~ std_normal();\n y ~ normal(0, exp(h / 2));\n}\n编译和拟合模型\n\nlibrary(cmdstanr)\n# 编译模型\nmod_volatility_normal <- cmdstan_model(\n stan_file = \"code/stochastic_volatility_models.stan\",\n compile = TRUE, cpp_options = list(stan_threads = TRUE)\n)\n# 准备数据\nmdata = list(T = 1274, y = as.vector(meituan_log_return))\n# 拟合模型\nfit_volatility_normal <- mod_volatility_normal$sample(\n data = mdata,\n chains = 2,\n parallel_chains = 2,\n iter_warmup = 1000, \n iter_sampling = 1000, \n threads_per_chain = 2, \n seed = 20232023,\n show_messages = FALSE,\n refresh = 0\n)\n# 输出结果\nfit_volatility_normal$summary(c(\"mu\", \"phi\", \"sigma\", \"lp__\"))\n\n#> # A tibble: 4 × 10\n#> variable mean median sd mad q5 q95 rhat ess_bulk\n#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>\n#> 1 mu -6.86 -6.86 0.116 0.111 -7.04 -6.67 1.00 1536.\n#> 2 phi 0.912 0.917 0.0342 0.0314 0.850 0.957 1.01 287.\n#> 3 sigma 0.299 0.294 0.0613 0.0575 0.209 0.407 1.00 309.\n#> 4 lp__ 3089. 3089. 29.1 29.8 3039. 3135. 
1.00 548.\n#> # ℹ 1 more variable: ess_tail <dbl>\n\n\n\n39.1.2 fGarch 包\n《金融时间序列分析讲义》两个波动率建模方法\n\n自回归条件异方差模型(Autoregressive Conditional Heteroskedasticity,简称 ARCH)。\n广义自回归条件异方差模型 (Generalized Autoregressive Conditional Heteroskedasticity,简称 GARCH )\n\n确定 ARCH 模型的阶,观察残差的平方的 ACF 和 PACF 。\nacf((meituan_log_return - mean(meituan_log_return))^2, main = \"\")\npacf((meituan_log_return - mean(meituan_log_return))^2, main = \"\")\n\n\n\n\n\n\n\n\n\n(a) 自相关图\n\n\n\n\n\n\n\n\n\n(b) 偏自相关图\n\n\n\n\n\n\n图 39.4: 对数收益率的残差平方\n\n\n发现 ACF 在滞后 1、2、3 阶比较突出,PACF 在滞后 1、2、16、18、29 阶比较突出。所以下面先来考虑低阶的 ARCH(2) 模型,设 \\(r_t\\) 为对数收益率。\n\\[\n\\begin{aligned}\nr_t &= \\mu + a_t, \\quad a_t = \\sigma_t \\epsilon_t, \\quad \\epsilon_t \\sim \\mathcal{N}(0,1) \\\\\n\\sigma_t^2 &= \\alpha_0 + \\alpha_1 a_{t-1}^2\n + \\alpha_2 a_{t-2}^2.\n\\end{aligned}\n\\]\n拟合 ARCH 模型,比较模型估计结果,根据系数显著性的结果,采纳 ARCH(2) 模型。\n\nlibrary(fGarch)\nmeituan_garch1 <- garchFit(\n formula = ~ 1 + garch(2, 0),\n data = meituan_log_return, trace = FALSE, cond.dist = \"std\"\n)\nsummary(meituan_garch1)\n\n#> \n#> Title:\n#> GARCH Modelling \n#> \n#> Call:\n#> garchFit(formula = ~1 + garch(2, 0), data = meituan_log_return, \n#> cond.dist = \"std\", trace = FALSE) \n#> \n#> Mean and Variance Equation:\n#> data ~ 1 + garch(2, 0)\n#> <environment: 0x7f786ea50758>\n#> [data = meituan_log_return]\n#> \n#> Conditional Distribution:\n#> std \n#> \n#> Coefficient(s):\n#> mu omega alpha1 alpha2 shape \n#> 0.0002577 0.0010729 0.1119940 0.1382923 4.9356152 \n#> \n#> Std. Errors:\n#> based on Hessian \n#> \n#> Error Analysis:\n#> Estimate Std. Error t value Pr(>|t|) \n#> mu 2.577e-04 8.970e-04 0.287 0.77390 \n#> omega 1.073e-03 9.432e-05 11.375 < 2e-16 ***\n#> alpha1 1.120e-01 4.292e-02 2.609 0.00907 ** \n#> alpha2 1.383e-01 4.725e-02 2.927 0.00343 ** \n#> shape 4.936e+00 7.008e-01 7.043 1.88e-12 ***\n#> ---\n#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 
0.1 ' ' 1\n#> \n#> Log Likelihood:\n#> 2459.345 normalized: 1.930412 \n#> \n#> Description:\n#> Thu Feb 1 07:15:02 2024 by user: \n#> \n#> \n#> Standardised Residuals Tests:\n#> Statistic p-Value\n#> Jarque-Bera Test R Chi^2 260.647924 0.000000e+00\n#> Shapiro-Wilk Test R W 0.975911 9.515126e-14\n#> Ljung-Box Test R Q(10) 21.212778 1.965775e-02\n#> Ljung-Box Test R Q(15) 24.773595 5.306786e-02\n#> Ljung-Box Test R Q(20) 33.252167 3.165160e-02\n#> Ljung-Box Test R^2 Q(10) 17.362563 6.671658e-02\n#> Ljung-Box Test R^2 Q(15) 29.329034 1.458467e-02\n#> Ljung-Box Test R^2 Q(20) 53.703334 6.400548e-05\n#> LM Arch Test R TR^2 19.254073 8.257864e-02\n#> \n#> Information Criterion Statistics:\n#> AIC BIC SIC HQIC \n#> -3.852975 -3.832764 -3.853006 -3.845384\n\n\n函数 garchFit() 的参数 cond.dist 默认值为 \"norm\" 表示标准正态分布,cond.dist = \"std\" 表示标准 t 分布。模型均值的估计值接近 0 是符合预期的,且显著性没通过,对数收益率在 0 上下波动。将估计结果代入模型,得到\n\\[\n\\begin{aligned}\nr_t &= -5.665 \\times 10^{-5} + a_t, \\quad a_t = \\sigma_t \\epsilon_t, \\quad \\epsilon_t \\sim \\mathcal{N}(0,1) \\\\\n\\sigma_t^2 &= 1.070 \\times 10^{-3} + 0.1156 a_{t-1}^2 + 0.1438a_{t-2}^2.\n\\end{aligned}\n\\]\n下面考虑 GARCH(1,1) 模型\n\\[\n\\begin{aligned}\nr_t &= \\mu + a_t, \\quad a_t = \\sigma_t \\epsilon_t, \\quad \\epsilon_t \\sim \\mathcal{N}(0,1) \\\\\n\\sigma_t^2 &= \\alpha_0 + \\alpha_1 a_{t-1}^2\n + \\beta_1 \\sigma_{t-1}^2.\n\\end{aligned}\n\\]\n\nmeituan_garch2 <- garchFit(\n formula = ~ 1 + garch(1, 1),\n data = meituan_log_return, trace = FALSE, cond.dist = \"std\"\n)\nsummary(meituan_garch2)\n\n#> \n#> Title:\n#> GARCH Modelling \n#> \n#> Call:\n#> garchFit(formula = ~1 + garch(1, 1), data = meituan_log_return, \n#> cond.dist = \"std\", trace = FALSE) \n#> \n#> Mean and Variance Equation:\n#> data ~ 1 + garch(1, 1)\n#> <environment: 0x7f7886af2b60>\n#> [data = meituan_log_return]\n#> \n#> Conditional Distribution:\n#> std \n#> \n#> Coefficient(s):\n#> mu omega alpha1 beta1 shape \n#> 2.8296e-04 3.4454e-05 5.9798e-02 9.1678e-01 5.4352e+00 
\n#> \n#> Std. Errors:\n#> based on Hessian \n#> \n#> Error Analysis:\n#> Estimate Std. Error t value Pr(>|t|) \n#> mu 2.830e-04 8.702e-04 0.325 0.74505 \n#> omega 3.445e-05 1.937e-05 1.779 0.07525 . \n#> alpha1 5.980e-02 1.855e-02 3.224 0.00127 ** \n#> beta1 9.168e-01 2.784e-02 32.933 < 2e-16 ***\n#> shape 5.435e+00 8.137e-01 6.680 2.39e-11 ***\n#> ---\n#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1\n#> \n#> Log Likelihood:\n#> 2478.473 normalized: 1.945427 \n#> \n#> Description:\n#> Thu Feb 1 07:15:02 2024 by user: \n#> \n#> \n#> Standardised Residuals Tests:\n#> Statistic p-Value\n#> Jarque-Bera Test R Chi^2 226.7198909 0.000000e+00\n#> Shapiro-Wilk Test R W 0.9781165 5.582467e-13\n#> Ljung-Box Test R Q(10) 16.0489249 9.824041e-02\n#> Ljung-Box Test R Q(15) 19.6491086 1.858104e-01\n#> Ljung-Box Test R Q(20) 27.2460587 1.284822e-01\n#> Ljung-Box Test R^2 Q(10) 7.8054550 6.478332e-01\n#> Ljung-Box Test R^2 Q(15) 9.8336579 8.300707e-01\n#> Ljung-Box Test R^2 Q(20) 24.7640454 2.106076e-01\n#> LM Arch Test R TR^2 9.5999700 6.510091e-01\n#> \n#> Information Criterion Statistics:\n#> AIC BIC SIC HQIC \n#> -3.883004 -3.862792 -3.883034 -3.875413\n\n\n波动率的贡献主要来自 \\(\\sigma_{t-1}^2\\) ,其系数 \\(\\beta_1\\) 为 0.918。通过对数似然的比较,可以发现 GARCH(1,1) 模型比 ARCH(2) 模型更好。", + "text": "meituan_log_return <- diff(log(meituan[, \"3690.HK.Adjusted\"]))[-1]\nautoplot(meituan_log_return) +\n theme_classic() +\n labs(x = \"日期\", y = \"对数收益率\")\nggplot(data = meituan_log_return, aes(x = `3690.HK.Adjusted`)) +\n geom_histogram(color = \"black\", fill = \"gray\", bins = 30) +\n theme_classic() +\n labs(x = \"对数收益率\", y = \"频数(天数)\")\n\n\n\n\n\n\n\n\n\n(a) 对数收益率的变动\n\n\n\n\n\n\n\n\n\n(b) 对数收益率的分布\n\n\n\n\n\n\n图 39.2: 美团股价对数收益率的情况\n\n\n\n\n\n\n\n\n\n\n39.1.1 Stan 框架\n随机波动率模型如下\n\\[\n\\begin{aligned}\ny_t &= \\epsilon_t \\exp(h_t / 2) \\\\\nh_{t+1} &= \\mu + \\phi (h_t - \\mu) + \\delta_t \\sigma \\\\\nh_1 &\\sim \\textsf{normal}\\left( \\mu, \\frac{\\sigma}{\\sqrt{1 - \\phi^2}} 
\\right) \\\\\n\\epsilon_t &\\sim \\textsf{normal}(0,1) \\\\\n\\delta_t &\\sim \\textsf{normal}(0,1)\n\\end{aligned}\n\\]\n其中, \\(y_t\\) 表示在时间 \\(t\\) 时股价的回报(对数收益率),\\(\\epsilon_t\\) 表示股价回报在时间 \\(t\\) 时的白噪声扰/波动,\\(\\delta_t\\) 表示波动率在时间\\(t\\) 时的波动。\\(h_t\\) 表示对数波动率,带有参数 \\(\\mu\\) (对数波动率的均值),\\(\\phi\\) (对数波动率的趋势)。代表波动率的序列 \\(\\{h_t\\}\\) 假定是平稳 \\((|\\phi| < 1)\\) 的随机过程,\\(h_1\\) 来自平稳的分布(此处为正态分布),\\(\\epsilon_t\\) 和 \\(\\delta_t\\) 是服从不相关的标准正态分布。\nStan 代码如下\ndata {\n int<lower=0> T; // # time points (equally spaced)\n vector[T] y; // mean corrected return at time t\n}\nparameters {\n real mu; // mean log volatility\n real<lower=-1, upper=1> phi; // persistence of volatility\n real<lower=0> sigma; // white noise shock scale\n vector[T] h_std; // std log volatility time t\n}\ntransformed parameters {\n vector[T] h = h_std * sigma; // now h ~ normal(0, sigma)\n h[1] /= sqrt(1 - phi * phi); // rescale h[1]\n h += mu;\n for (t in 2:T) {\n h[t] += phi * (h[t - 1] - mu);\n }\n}\nmodel {\n phi ~ uniform(-1, 1);\n sigma ~ cauchy(0, 5);\n mu ~ cauchy(0, 10);\n \n h_std ~ std_normal();\n y ~ normal(0, exp(h / 2));\n}\n编译和拟合模型\n\nlibrary(cmdstanr)\n# 编译模型\nmod_volatility_normal <- cmdstan_model(\n stan_file = \"code/stochastic_volatility_models.stan\",\n compile = TRUE, cpp_options = list(stan_threads = TRUE)\n)\n# 准备数据\nmdata = list(T = 1274, y = as.vector(meituan_log_return))\n# 拟合模型\nfit_volatility_normal <- mod_volatility_normal$sample(\n data = mdata,\n chains = 2,\n parallel_chains = 2,\n iter_warmup = 1000, \n iter_sampling = 1000, \n threads_per_chain = 2, \n seed = 20232023,\n show_messages = FALSE,\n refresh = 0\n)\n# 输出结果\nfit_volatility_normal$summary(c(\"mu\", \"phi\", \"sigma\", \"lp__\"))\n\n#> # A tibble: 4 × 10\n#> variable mean median sd mad q5 q95 rhat ess_bulk\n#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>\n#> 1 mu -6.86 -6.86 0.116 0.111 -7.04 -6.67 1.00 1536.\n#> 2 phi 0.912 0.917 0.0342 0.0314 0.850 0.957 1.01 287.\n#> 3 sigma 0.299 0.294 
0.0613 0.0575 0.209 0.407 1.00 309.\n#> 4 lp__ 3089. 3089. 29.1 29.8 3039. 3135. 1.00 548.\n#> # ℹ 1 more variable: ess_tail <dbl>\n\n\n\n39.1.2 fGarch 包\n《金融时间序列分析讲义》两个波动率建模方法\n\n自回归条件异方差模型(Autoregressive Conditional Heteroskedasticity,简称 ARCH)。\n广义自回归条件异方差模型 (Generalized Autoregressive Conditional Heteroskedasticity,简称 GARCH )\n\n确定 ARCH 模型的阶,观察残差的平方的 ACF 和 PACF 。\nacf((meituan_log_return - mean(meituan_log_return))^2, main = \"\")\npacf((meituan_log_return - mean(meituan_log_return))^2, main = \"\")\n\n\n\n\n\n\n\n\n\n(a) 自相关图\n\n\n\n\n\n\n\n\n\n(b) 偏自相关图\n\n\n\n\n\n\n图 39.4: 对数收益率的残差平方\n\n\n发现 ACF 在滞后 1、2、3 阶比较突出,PACF 在滞后 1、2、16、18、29 阶比较突出。所以下面先来考虑低阶的 ARCH(2) 模型,设 \\(r_t\\) 为对数收益率。\n\\[\n\\begin{aligned}\nr_t &= \\mu + a_t, \\quad a_t = \\sigma_t \\epsilon_t, \\quad \\epsilon_t \\sim \\mathcal{N}(0,1) \\\\\n\\sigma_t^2 &= \\alpha_0 + \\alpha_1 a_{t-1}^2\n + \\alpha_2 a_{t-2}^2.\n\\end{aligned}\n\\]\n拟合 ARCH 模型,比较模型估计结果,根据系数显著性的结果,采纳 ARCH(2) 模型。\n\nlibrary(fGarch)\nmeituan_garch1 <- garchFit(\n formula = ~ 1 + garch(2, 0),\n data = meituan_log_return, trace = FALSE, cond.dist = \"std\"\n)\nsummary(meituan_garch1)\n\n#> \n#> Title:\n#> GARCH Modelling \n#> \n#> Call:\n#> garchFit(formula = ~1 + garch(2, 0), data = meituan_log_return, \n#> cond.dist = \"std\", trace = FALSE) \n#> \n#> Mean and Variance Equation:\n#> data ~ 1 + garch(2, 0)\n#> <environment: 0x7fa3a717a788>\n#> [data = meituan_log_return]\n#> \n#> Conditional Distribution:\n#> std \n#> \n#> Coefficient(s):\n#> mu omega alpha1 alpha2 shape \n#> 0.0002577 0.0010729 0.1119940 0.1382923 4.9356152 \n#> \n#> Std. Errors:\n#> based on Hessian \n#> \n#> Error Analysis:\n#> Estimate Std. Error t value Pr(>|t|) \n#> mu 2.577e-04 8.970e-04 0.287 0.77390 \n#> omega 1.073e-03 9.432e-05 11.375 < 2e-16 ***\n#> alpha1 1.120e-01 4.292e-02 2.609 0.00907 ** \n#> alpha2 1.383e-01 4.725e-02 2.927 0.00343 ** \n#> shape 4.936e+00 7.008e-01 7.043 1.88e-12 ***\n#> ---\n#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 
0.1 ' ' 1\n#> \n#> Log Likelihood:\n#> 2459.345 normalized: 1.930412 \n#> \n#> Description:\n#> Mon Feb 5 05:55:54 2024 by user: \n#> \n#> \n#> Standardised Residuals Tests:\n#> Statistic p-Value\n#> Jarque-Bera Test R Chi^2 260.647924 0.000000e+00\n#> Shapiro-Wilk Test R W 0.975911 9.515126e-14\n#> Ljung-Box Test R Q(10) 21.212778 1.965775e-02\n#> Ljung-Box Test R Q(15) 24.773595 5.306786e-02\n#> Ljung-Box Test R Q(20) 33.252167 3.165160e-02\n#> Ljung-Box Test R^2 Q(10) 17.362563 6.671658e-02\n#> Ljung-Box Test R^2 Q(15) 29.329034 1.458467e-02\n#> Ljung-Box Test R^2 Q(20) 53.703334 6.400548e-05\n#> LM Arch Test R TR^2 19.254073 8.257864e-02\n#> \n#> Information Criterion Statistics:\n#> AIC BIC SIC HQIC \n#> -3.852975 -3.832764 -3.853006 -3.845384\n\n\n函数 garchFit() 的参数 cond.dist 默认值为 \"norm\" 表示标准正态分布,cond.dist = \"std\" 表示标准 t 分布。模型均值的估计值接近 0 是符合预期的,且显著性没通过,对数收益率在 0 上下波动。将估计结果代入模型,得到\n\\[\n\\begin{aligned}\nr_t &= -5.665 \\times 10^{-5} + a_t, \\quad a_t = \\sigma_t \\epsilon_t, \\quad \\epsilon_t \\sim \\mathcal{N}(0,1) \\\\\n\\sigma_t^2 &= 1.070 \\times 10^{-3} + 0.1156 a_{t-1}^2 + 0.1438a_{t-2}^2.\n\\end{aligned}\n\\]\n下面考虑 GARCH(1,1) 模型\n\\[\n\\begin{aligned}\nr_t &= \\mu + a_t, \\quad a_t = \\sigma_t \\epsilon_t, \\quad \\epsilon_t \\sim \\mathcal{N}(0,1) \\\\\n\\sigma_t^2 &= \\alpha_0 + \\alpha_1 a_{t-1}^2\n + \\beta_1 \\sigma_{t-1}^2.\n\\end{aligned}\n\\]\n\nmeituan_garch2 <- garchFit(\n formula = ~ 1 + garch(1, 1),\n data = meituan_log_return, trace = FALSE, cond.dist = \"std\"\n)\nsummary(meituan_garch2)\n\n#> \n#> Title:\n#> GARCH Modelling \n#> \n#> Call:\n#> garchFit(formula = ~1 + garch(1, 1), data = meituan_log_return, \n#> cond.dist = \"std\", trace = FALSE) \n#> \n#> Mean and Variance Equation:\n#> data ~ 1 + garch(1, 1)\n#> <environment: 0x7fa38c870aa8>\n#> [data = meituan_log_return]\n#> \n#> Conditional Distribution:\n#> std \n#> \n#> Coefficient(s):\n#> mu omega alpha1 beta1 shape \n#> 2.8296e-04 3.4454e-05 5.9798e-02 9.1678e-01 5.4352e+00 
\n#> \n#> Std. Errors:\n#> based on Hessian \n#> \n#> Error Analysis:\n#> Estimate Std. Error t value Pr(>|t|) \n#> mu 2.830e-04 8.702e-04 0.325 0.74505 \n#> omega 3.445e-05 1.937e-05 1.779 0.07525 . \n#> alpha1 5.980e-02 1.855e-02 3.224 0.00127 ** \n#> beta1 9.168e-01 2.784e-02 32.933 < 2e-16 ***\n#> shape 5.435e+00 8.137e-01 6.680 2.39e-11 ***\n#> ---\n#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1\n#> \n#> Log Likelihood:\n#> 2478.473 normalized: 1.945427 \n#> \n#> Description:\n#> Mon Feb 5 05:55:54 2024 by user: \n#> \n#> \n#> Standardised Residuals Tests:\n#> Statistic p-Value\n#> Jarque-Bera Test R Chi^2 226.7198909 0.000000e+00\n#> Shapiro-Wilk Test R W 0.9781165 5.582467e-13\n#> Ljung-Box Test R Q(10) 16.0489249 9.824041e-02\n#> Ljung-Box Test R Q(15) 19.6491086 1.858104e-01\n#> Ljung-Box Test R Q(20) 27.2460587 1.284822e-01\n#> Ljung-Box Test R^2 Q(10) 7.8054550 6.478332e-01\n#> Ljung-Box Test R^2 Q(15) 9.8336579 8.300707e-01\n#> Ljung-Box Test R^2 Q(20) 24.7640454 2.106076e-01\n#> LM Arch Test R TR^2 9.5999700 6.510091e-01\n#> \n#> Information Criterion Statistics:\n#> AIC BIC SIC HQIC \n#> -3.883004 -3.862792 -3.883034 -3.875413\n\n\n波动率的贡献主要来自 \\(\\sigma_{t-1}^2\\) ,其系数 \\(\\beta_1\\) 为 0.918。通过对数似然的比较,可以发现 GARCH(1,1) 模型比 ARCH(2) 模型更好。", "crumbs": [ "贝叶斯建模", "39  时间序列回归" @@ -2620,7 +2620,7 @@ "href": "time-series-regression.html#贝叶斯可加模型", "title": "39  时间序列回归", "section": "\n39.2 贝叶斯可加模型", - "text": "39.2 贝叶斯可加模型\n大规模时间序列回归,观察值是比较多的,可达数十万、数百万,乃至更多。粗粒度时时间跨度往往很长,比如数十年的天粒度数据,细粒度时时间跨度可短可长,比如数年的半小时级数据,总之,需要包含多个季节的数据,各种季节性重复出现。通过时序图可以观察到明显的季节性,而且往往是多种周期不同的季节性混合在一起,有时还包含一定的趋势性。举例来说,比如 2018-2023 年美国旧金山犯罪事件报告数据,事件数量的变化趋势,除了上述季节性因素,特殊事件疫情肯定会影响,数据规模约 200 M 。再比如 2018-2023 年美国境内和跨境旅游业中的航班数据,原始数据非常大,R 包 nycflights13 提供纽约机场的部分航班数据。\n为简单起见,下面以 R 内置的数据集 AirPassengers 为例,介绍 Stan 框架和 INLA 框架建模的过程。数据集 AirPassengers 包含周期性(季节性)和趋势性。作为对比的基础,下面建立非线性回归模型,趋势项和周期项是可加的形式:\n\\[\ny = at + b + c \\sin(\\frac{t}{12} \\times 2\\pi) + d \\cos(\\frac{t}{12} \\times 
2\\pi) + \\epsilon\n\\]\n根据数据变化的周期规律,设置周期为 12,还可以在模型中添加周期为 3 或 4 的小周期。其中,\\(y\\) 代表观察值, \\(a,b,c,d\\) 为待定的参数,\\(\\epsilon\\) 代表服从标准正态分布的随机误差。\n\nair_passengers_df <- data.frame(y = as.vector(AirPassengers), t = 1:144)\nfit_lm1 <- lm(y ~ t + sin(t / 12 * 2 * pi) + cos(t / 12 * 2 * pi), data = air_passengers_df)\nfit_lm2 <- update(fit_lm1, . ~ . +\n sin(t / 12 * 2 * 2 * pi) + cos(t / 12 * 2 * 2 * pi), data = air_passengers_df\n)\nfit_lm3 <- update(fit_lm2, . ~ . +\n sin(t / 12 * 3 * 2 * pi) + cos(t / 12 * 3 * 2 * pi), data = air_passengers_df\n)\nplot(y ~ t, air_passengers_df, type = \"l\")\nlines(x = air_passengers_df$t, y = fit_lm1$fitted.values, col = \"red\")\nlines(x = air_passengers_df$t, y = fit_lm2$fitted.values, col = \"green\")\nlines(x = air_passengers_df$t, y = fit_lm3$fitted.values, col = \"orange\")\n\n\n\n\n\n\n图 39.5: 非线性回归\n\n\n\n\n模型 1 已经很好地捕捉到趋势和周期信息,当添加小周期后,略有改善,继续添加更多的小周期,不再有明显改善。实际上,小周期对应的回归系数也将不再显著。所以,这类模型的优化空间见顶了,需要进一步观察和利用残差的规律,使用更加复杂的模型。\n\n39.2.1 Stan 框架\n非线性趋势、多季节性(多个周期混合)、特殊节假日、突发热点事件、残差成分(平稳),能同时应对这五种情况的建模方法是贝叶斯可加模型和神经网络模型,比如基于 Stan 实现的 prophet 包和 tensorflow 框架。\n\n\n\n\n\n\n提示\n\n\n\nprophet 包是如何同时处理这些情况,是否可以在 cmdstanr 包中实现,是否可以在 mgcv 和 INLA 中实现?\n\n\n\nlibrary(cmdstanr)\n\n\n39.2.2 INLA 框架\n阿卜杜拉国王科技大学(King Abdullah University of Science and Technology 简称 KAUST)的 Håvard Rue 等开发了 INLA 框架 (Rue, Martino, 和 Chopin 2009)。《贝叶斯推断与 INLA 》的第3章混合效应模型中随机游走部分 (Gómez-Rubio 2020),一个随机过程(如随机游走、AR(p) 过程)作为随机效应。AirPassengers 的方差在变大,取对数尺度后,方差基本保持不变,一阶差分后基本保持平稳。\nlibrary(ggfortify)\nautoplot(log(AirPassengers)) +\n theme_classic() +\n labs(x = \"年月\", y = \"对数值\")\nautoplot(diff(log(AirPassengers))) +\n theme_classic() +\n labs(x = \"年月\", y = \"差分对数值\")\n\n\n\n\n\n\n\n\n\n(a) 对数尺度\n\n\n\n\n\n\n\n\n\n(b) 一阶差分\n\n\n\n\n\n\n图 39.6: AirPassengers 的时序图\n\n\n因此,下面基于对数尺度建模。首先考虑 RW1 随机游走模型,而后考虑季节性。RW1 模型意味着取对数、一阶差分后序列平稳高斯过程,序列值服从高斯分布。下面设置似然函数的高斯先验 \\(\\mathcal{N}(1,0.2)\\) ,目的是防止过拟合。\n\nlibrary(INLA)\ninla.setOption(short.summary = TRUE)\nair_passengers_df <- 
data.frame(\n y = as.vector(AirPassengers),\n year = as.factor(rep(1949:1960, each = 12)),\n month = as.factor(rep(1:12, times = 12)),\n ID = 1:length(AirPassengers)\n)\nmod_inla_rw1 <- inla(\n formula = log(y) ~ year + f(ID, model = \"rw1\"),\n family = \"gaussian\", data = air_passengers_df,\n control.family = list(hyper = list(prec = list(param = c(1, 0.2)))),\n control.predictor = list(compute = TRUE)\n)\nsummary(mod_inla_rw1)\n\n#> Fixed effects:\n#> mean sd 0.025quant 0.5quant 0.975quant mode kld\n#> (Intercept) 5.159 0.252 4.666 5.158 5.657 5.158 0\n#> year1950 0.050 0.134 -0.215 0.050 0.313 0.050 0\n#> year1951 0.174 0.190 -0.201 0.174 0.546 0.174 0\n#> year1952 0.262 0.233 -0.198 0.262 0.717 0.262 0\n#> year1953 0.330 0.269 -0.201 0.331 0.857 0.331 0\n#> year1954 0.357 0.301 -0.236 0.358 0.945 0.358 0\n#> year1955 0.442 0.329 -0.208 0.443 1.086 0.443 0\n#> year1956 0.510 0.356 -0.193 0.511 1.206 0.511 0\n#> year1957 0.567 0.381 -0.184 0.568 1.311 0.568 0\n#> year1958 0.576 0.403 -0.220 0.577 1.365 0.577 0\n#> year1959 0.647 0.425 -0.191 0.648 1.478 0.648 0\n#> year1960 0.683 0.445 -0.195 0.684 1.555 0.684 0\n#> \n#> Model hyperparameters:\n#> mean sd 0.025quant 0.5quant\n#> Precision for the Gaussian observations 101.38 18.14 69.78 99.99\n#> Precision for ID 155.14 40.69 92.69 149.32\n#> 0.975quant mode\n#> Precision for the Gaussian observations 140.94 97.59\n#> Precision for ID 251.67 137.49\n#> \n#> is computed\n\n\n这里,将年份作为因子型变量,从输出结果可以看出,以1949年作为参照,回归系数的后验均值在逐年变大,这符合 AirPassengers 时序图呈现的趋势。\n存在周期性的波动规律,考虑季节性\n\nmod_inla_sea <- inla(\n formula = log(y) ~ year + f(ID, model = \"seasonal\", season.length = 12),\n family = \"gaussian\", data = air_passengers_df,\n control.family = list(hyper = list(prec = list(param = c(1, 0.2)))),\n control.predictor = list(compute = TRUE)\n)\nsummary(mod_inla_sea)\n\n#> Fixed effects:\n#> mean sd 0.025quant 0.5quant 0.975quant mode kld\n#> (Intercept) 4.836 0.020 4.797 4.836 4.875 4.836 0\n#> year1950 0.095 0.028 0.039 
0.095 0.150 0.095 0\n#> year1951 0.295 0.028 0.240 0.295 0.351 0.295 0\n#> year1952 0.441 0.028 0.386 0.441 0.496 0.441 0\n#> year1953 0.573 0.028 0.517 0.573 0.628 0.573 0\n#> year1954 0.630 0.028 0.575 0.630 0.686 0.630 0\n#> year1955 0.803 0.028 0.748 0.803 0.859 0.803 0\n#> year1956 0.948 0.028 0.893 0.948 1.004 0.948 0\n#> year1957 1.062 0.028 1.007 1.062 1.118 1.062 0\n#> year1958 1.094 0.028 1.039 1.094 1.150 1.094 0\n#> year1959 1.212 0.028 1.157 1.212 1.268 1.212 0\n#> year1960 1.318 0.028 1.263 1.318 1.373 1.318 0\n#> \n#> Model hyperparameters:\n#> mean sd 0.025quant 0.5quant\n#> Precision for the Gaussian observations 213.02 27.47 163.43 211.49\n#> Precision for ID 42123.02 27651.60 10119.85 35347.52\n#> 0.975quant mode\n#> Precision for the Gaussian observations 271.39 208.87\n#> Precision for ID 113902.57 24290.69\n#> \n#> is computed\n\n\n最后,将两个模型的拟合结果展示出来,见下图,黑线表示原对数值,红线表示拟合值,灰色区域表示在置信水平 95% 下的区间。区间更短说明季节性模型更好。\nmod_inla_rw1_fitted <- data.frame(\n ID = 1:length(AirPassengers),\n y = as.vector(log(AirPassengers)),\n mean = mod_inla_rw1$summary.fitted.values$mean,\n `0.025quant` = mod_inla_rw1$summary.fitted.values$`0.025quant`,\n `0.975quant` = mod_inla_rw1$summary.fitted.values$`0.975quant`,\n check.names = FALSE\n)\nmod_inla_sea_fitted <- data.frame(\n ID = 1:length(AirPassengers),\n y = as.vector(log(AirPassengers)),\n mean = mod_inla_sea$summary.fitted.values$mean,\n `0.025quant` = mod_inla_sea$summary.fitted.values$`0.025quant`,\n `0.975quant` = mod_inla_sea$summary.fitted.values$`0.975quant`,\n check.names = FALSE\n)\nggplot(data = mod_inla_rw1_fitted, aes(ID)) +\n geom_ribbon(aes(ymin = `0.025quant`, ymax = `0.975quant`), fill = \"gray\") +\n geom_line(aes(y = y)) +\n geom_line(aes(y = mean), color = \"red\") +\n theme_classic() +\n labs(x = \"序号\", y = \"对数值\")\nggplot(data = mod_inla_sea_fitted, aes(ID)) +\n geom_ribbon(aes(ymin = `0.025quant`, ymax = `0.975quant`), fill = \"gray\") +\n geom_line(aes(y = y)) +\n geom_line(aes(y = mean), 
color = \"red\") +\n theme_classic() +\n labs(x = \"序号\", y = \"对数值\")\n\n\n\n\n\n\n\n\n\n(a) 随机游走模型\n\n\n\n\n\n\n\n\n\n(b) 季节效应模型\n\n\n\n\n\n\n图 39.7: AirPassengers 的拟合图", + "text": "39.2 贝叶斯可加模型\n大规模时间序列回归,观察值是比较多的,可达数十万、数百万,乃至更多。粗粒度时时间跨度往往很长,比如数十年的天粒度数据,细粒度时时间跨度可短可长,比如数年的半小时级数据,总之,需要包含多个季节的数据,各种季节性重复出现。通过时序图可以观察到明显的季节性,而且往往是多种周期不同的季节性混合在一起,有时还包含一定的趋势性。举例来说,比如 2018-2023 年美国旧金山犯罪事件报告数据,事件数量的变化趋势,除了上述季节性因素,特殊事件疫情肯定会影响,数据规模约 200 M 。再比如 2018-2023 年美国境内和跨境旅游业中的航班数据,原始数据非常大,R 包 nycflights13 提供纽约机场的部分航班数据。\n为简单起见,下面以 R 内置的数据集 AirPassengers 为例,介绍 Stan 框架和 INLA 框架建模的过程。数据集 AirPassengers 包含周期性(季节性)和趋势性。作为对比的基础,下面建立非线性回归模型,趋势项和周期项是可加的形式:\n\\[\ny = at + b + c \\sin(\\frac{t}{12} \\times 2\\pi) + d \\cos(\\frac{t}{12} \\times 2\\pi) + \\epsilon\n\\]\n根据数据变化的周期规律,设置周期为 12,还可以在模型中添加周期为 3 或 4 的小周期。其中,\\(y\\) 代表观察值, \\(a,b,c,d\\) 为待定的参数,\\(\\epsilon\\) 代表服从标准正态分布的随机误差。\n\nair_passengers_df <- data.frame(y = as.vector(AirPassengers), t = 1:144)\nfit_lm1 <- lm(y ~ t + sin(t / 12 * 2 * pi) + cos(t / 12 * 2 * pi), data = air_passengers_df)\nfit_lm2 <- update(fit_lm1, . ~ . +\n sin(t / 12 * 2 * 2 * pi) + cos(t / 12 * 2 * 2 * pi), data = air_passengers_df\n)\nfit_lm3 <- update(fit_lm2, . ~ . 
+\n sin(t / 12 * 3 * 2 * pi) + cos(t / 12 * 3 * 2 * pi), data = air_passengers_df\n)\nplot(y ~ t, air_passengers_df, type = \"l\")\nlines(x = air_passengers_df$t, y = fit_lm1$fitted.values, col = \"red\")\nlines(x = air_passengers_df$t, y = fit_lm2$fitted.values, col = \"green\")\nlines(x = air_passengers_df$t, y = fit_lm3$fitted.values, col = \"orange\")\n\n\n\n\n\n\n图 39.5: 非线性回归\n\n\n\n\n模型 1 已经很好地捕捉到趋势和周期信息,当添加小周期后,略有改善,继续添加更多的小周期,不再有明显改善。实际上,小周期对应的回归系数也将不再显著。所以,这类模型的优化空间见顶了,需要进一步观察和利用残差的规律,使用更加复杂的模型。\n\n39.2.1 Stan 框架\n非线性趋势、多季节性(多个周期混合)、特殊节假日、突发热点事件、残差成分(平稳),能同时应对这五种情况的建模方法是贝叶斯可加模型和神经网络模型,比如基于 Stan 实现的 prophet 包和 tensorflow 框架。\n\n\n\n\n\n\n提示\n\n\n\nprophet 包是如何同时处理这些情况,是否可以在 cmdstanr 包中实现,是否可以在 mgcv 和 INLA 中实现?\n\n\n\nlibrary(cmdstanr)\n\n\n39.2.2 INLA 框架\n阿卜杜拉国王科技大学(King Abdullah University of Science and Technology 简称 KAUST)的 Håvard Rue 等开发了 INLA 框架 (Rue, Martino, 和 Chopin 2009)。《贝叶斯推断与 INLA 》的第3章混合效应模型中随机游走部分 (Gómez-Rubio 2020),一个随机过程(如随机游走、AR(p) 过程)作为随机效应。AirPassengers 的方差在变大,取对数尺度后,方差基本保持不变,一阶差分后基本保持平稳。\nlibrary(ggfortify)\nautoplot(log(AirPassengers)) +\n theme_classic() +\n labs(x = \"年月\", y = \"对数值\")\nautoplot(diff(log(AirPassengers))) +\n theme_classic() +\n labs(x = \"年月\", y = \"差分对数值\")\n\n\n\n\n\n\n\n\n\n(a) 对数尺度\n\n\n\n\n\n\n\n\n\n(b) 一阶差分\n\n\n\n\n\n\n图 39.6: AirPassengers 的时序图\n\n\n因此,下面基于对数尺度建模。首先考虑 RW1 随机游走模型,而后考虑季节性。RW1 模型意味着取对数、一阶差分后序列平稳高斯过程,序列值服从高斯分布。下面设置似然函数的高斯先验 \\(\\mathcal{N}(1,0.2)\\) ,目的是防止过拟合。\n\nlibrary(INLA)\ninla.setOption(short.summary = TRUE)\nair_passengers_df <- data.frame(\n y = as.vector(AirPassengers),\n year = as.factor(rep(1949:1960, each = 12)),\n month = as.factor(rep(1:12, times = 12)),\n ID = 1:length(AirPassengers)\n)\nmod_inla_rw1 <- inla(\n formula = log(y) ~ year + f(ID, model = \"rw1\"),\n family = \"gaussian\", data = air_passengers_df,\n control.family = list(hyper = list(prec = list(param = c(1, 0.2)))),\n control.predictor = list(compute = TRUE)\n)\nsummary(mod_inla_rw1)\n\n#> Fixed effects:\n#> mean sd 
0.025quant 0.5quant 0.975quant mode kld\n#> (Intercept) 5.159 0.252 4.666 5.158 5.657 5.158 0\n#> year1950 0.050 0.134 -0.215 0.050 0.313 0.050 0\n#> year1951 0.174 0.190 -0.201 0.174 0.546 0.174 0\n#> year1952 0.262 0.233 -0.198 0.262 0.717 0.262 0\n#> year1953 0.330 0.269 -0.201 0.331 0.857 0.331 0\n#> year1954 0.357 0.301 -0.236 0.358 0.945 0.358 0\n#> year1955 0.442 0.329 -0.208 0.443 1.086 0.443 0\n#> year1956 0.510 0.356 -0.193 0.511 1.206 0.511 0\n#> year1957 0.567 0.381 -0.184 0.568 1.311 0.568 0\n#> year1958 0.576 0.403 -0.220 0.577 1.365 0.577 0\n#> year1959 0.647 0.425 -0.191 0.648 1.478 0.648 0\n#> year1960 0.683 0.445 -0.195 0.684 1.555 0.684 0\n#> \n#> Model hyperparameters:\n#> mean sd 0.025quant 0.5quant\n#> Precision for the Gaussian observations 101.38 18.14 69.78 99.99\n#> Precision for ID 155.14 40.70 92.68 149.32\n#> 0.975quant mode\n#> Precision for the Gaussian observations 140.95 97.59\n#> Precision for ID 251.69 137.49\n#> \n#> is computed\n\n\n这里,将年份作为因子型变量,从输出结果可以看出,以1949年作为参照,回归系数的后验均值在逐年变大,这符合 AirPassengers 时序图呈现的趋势。\n存在周期性的波动规律,考虑季节性\n\nmod_inla_sea <- inla(\n formula = log(y) ~ year + f(ID, model = \"seasonal\", season.length = 12),\n family = \"gaussian\", data = air_passengers_df,\n control.family = list(hyper = list(prec = list(param = c(1, 0.2)))),\n control.predictor = list(compute = TRUE)\n)\nsummary(mod_inla_sea)\n\n#> Fixed effects:\n#> mean sd 0.025quant 0.5quant 0.975quant mode kld\n#> (Intercept) 4.836 0.020 4.797 4.836 4.875 4.836 0\n#> year1950 0.095 0.028 0.039 0.095 0.150 0.095 0\n#> year1951 0.295 0.028 0.240 0.295 0.351 0.295 0\n#> year1952 0.441 0.028 0.386 0.441 0.496 0.441 0\n#> year1953 0.573 0.028 0.517 0.573 0.628 0.573 0\n#> year1954 0.630 0.028 0.575 0.630 0.686 0.630 0\n#> year1955 0.803 0.028 0.748 0.803 0.859 0.803 0\n#> year1956 0.948 0.028 0.893 0.948 1.004 0.948 0\n#> year1957 1.062 0.028 1.007 1.062 1.118 1.062 0\n#> year1958 1.094 0.028 1.039 1.094 1.150 1.094 0\n#> year1959 1.212 0.028 1.157 1.212 
1.268 1.212 0\n#> year1960 1.318 0.028 1.263 1.318 1.373 1.318 0\n#> \n#> Model hyperparameters:\n#> mean sd 0.025quant 0.5quant\n#> Precision for the Gaussian observations 213.02 27.47 163.44 211.49\n#> Precision for ID 42091.49 27574.60 10100.56 35350.07\n#> 0.975quant mode\n#> Precision for the Gaussian observations 271.39 208.87\n#> Precision for ID 113626.53 24299.06\n#> \n#> is computed\n\n\n最后,将两个模型的拟合结果展示出来,见下图,黑线表示原对数值,红线表示拟合值,灰色区域表示在置信水平 95% 下的区间。区间更短说明季节性模型更好。\nmod_inla_rw1_fitted <- data.frame(\n ID = 1:length(AirPassengers),\n y = as.vector(log(AirPassengers)),\n mean = mod_inla_rw1$summary.fitted.values$mean,\n `0.025quant` = mod_inla_rw1$summary.fitted.values$`0.025quant`,\n `0.975quant` = mod_inla_rw1$summary.fitted.values$`0.975quant`,\n check.names = FALSE\n)\nmod_inla_sea_fitted <- data.frame(\n ID = 1:length(AirPassengers),\n y = as.vector(log(AirPassengers)),\n mean = mod_inla_sea$summary.fitted.values$mean,\n `0.025quant` = mod_inla_sea$summary.fitted.values$`0.025quant`,\n `0.975quant` = mod_inla_sea$summary.fitted.values$`0.975quant`,\n check.names = FALSE\n)\nggplot(data = mod_inla_rw1_fitted, aes(ID)) +\n geom_ribbon(aes(ymin = `0.025quant`, ymax = `0.975quant`), fill = \"gray\") +\n geom_line(aes(y = y)) +\n geom_line(aes(y = mean), color = \"red\") +\n theme_classic() +\n labs(x = \"序号\", y = \"对数值\")\nggplot(data = mod_inla_sea_fitted, aes(ID)) +\n geom_ribbon(aes(ymin = `0.025quant`, ymax = `0.975quant`), fill = \"gray\") +\n geom_line(aes(y = y)) +\n geom_line(aes(y = mean), color = \"red\") +\n theme_classic() +\n labs(x = \"序号\", y = \"对数值\")\n\n\n\n\n\n\n\n\n\n(a) 随机游走模型\n\n\n\n\n\n\n\n\n\n(b) 季节效应模型\n\n\n\n\n\n\n图 39.7: AirPassengers 的拟合图", "crumbs": [ "贝叶斯建模", "39  时间序列回归" @@ -2631,7 +2631,7 @@ "href": "time-series-regression.html#一些非参数模型", "title": "39  时间序列回归", "section": "\n39.3 一些非参数模型", - "text": "39.3 一些非参数模型\n\n39.3.1 mgcv 包\nmgcv 包 (S. N. 
Wood 2017) 是 R 软件内置的推荐组件,由 Simon Wood 开发和维护,历经多年,成熟稳定。函数 bam() 相比于函数 gam() 的优势是可以处理大规模的时间序列数据。对于时间序列数据预测,数万和百万级观测值都可以 (Simon N. Wood, Goude, 和 Shaw 2015)。\n\nair_passengers_tbl <- data.frame(\n y = as.vector(AirPassengers),\n year = rep(1949:1960, each = 12),\n month = rep(1:12, times = 12)\n)\nmod1 <- gam(y ~ s(year) + s(month, bs = \"cr\", k = 12),\n data = air_passengers_tbl, family = gaussian\n)\nsummary(mod1)\n\n#> \n#> Family: gaussian \n#> Link function: identity \n#> \n#> Formula:\n#> y ~ s(year) + s(month, bs = \"cr\", k = 12)\n#> \n#> Parametric coefficients:\n#> Estimate Std. Error t value Pr(>|t|) \n#> (Intercept) 280.299 1.957 143.2 <2e-16 ***\n#> ---\n#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1\n#> \n#> Approximate significance of smooth terms:\n#> edf Ref.df F p-value \n#> s(year) 6.102 7.265 441.39 <2e-16 ***\n#> s(month) 8.796 10.097 38.25 <2e-16 ***\n#> ---\n#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1\n#> \n#> R-sq.(adj) = 0.962 Deviance explained = 96.6%\n#> GCV = 619.9 Scale est. = 551.47 n = 144\n\n\n观察年和月的趋势变化,逐年增长趋势基本是线性的,略有波动,逐月变化趋势比较复杂,不过,可以明显看出在 7-9 月是高峰期,11 月和1-3月是低谷期。\n\nlayout(matrix(1:2, nrow = 1))\nplot(mod1, shade = TRUE)\n\n\n\n\n\n\n图 39.8: 年和月的趋势变化\n\n\n\n\n将拟合效果绘制出来,见下图,整体上,捕捉到了趋势和周期,不过,存在欠拟合,年周期内波动幅度随时间有变化趋势,趋势和周期存在交互作用。\n\nair_passengers_ts <- ts(mod1$fitted.values, start = c(1949, 1), frequency = 12)\nplot(AirPassengers)\nlines(air_passengers_ts, col = \"red\")\n\n\n\n\n\n\n图 39.9: 趋势拟合效果\n\n\n\n\n整体上,乘客数逐年呈线性增长,每年不同月份呈现波动,淡季和旺季出行的流量有很大差异,近年来,这种差异的波动在扩大。为了刻画这种情况,考虑年度趋势和月度波动的交互作用。\n\nmod2 <- gam(y ~ s(year, month), data = air_passengers_tbl, family = gaussian)\nsummary(mod2)\n\n#> \n#> Family: gaussian \n#> Link function: identity \n#> \n#> Formula:\n#> y ~ s(year, month)\n#> \n#> Parametric coefficients:\n#> Estimate Std. Error t value Pr(>|t|) \n#> (Intercept) 280.299 1.059 264.7 <2e-16 ***\n#> ---\n#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 
0.1 ' ' 1\n#> \n#> Approximate significance of smooth terms:\n#> edf Ref.df F p-value \n#> s(year,month) 28.21 28.96 435.5 <2e-16 ***\n#> ---\n#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1\n#> \n#> R-sq.(adj) = 0.989 Deviance explained = 99.1%\n#> GCV = 202.62 Scale est. = 161.52 n = 144\n\n\n可以看到,调整的 \\(R^2\\) 明显增加,拟合效果更好,各年各月份的乘客数变化,见下图。\n\nop <- par(mar = c(4, 4, 2, 0))\nplot(mod2)\non.exit(par(op), add = TRUE) \n\n\n\n\n\n\n图 39.10: 交互作用\n\n\n\n\n上图是轮廓图,下面用透视图展示趋势拟合的效果。\n\nop <- par(mar = c(0, 1.5, 0, 0))\nvis.gam(mod2, theta = -35, phi = 20, ticktype = \"detailed\", expand = .65, zlab = \"\")\non.exit(par(op), add = TRUE) \n\n\n\n\n\n\n图 39.11: 趋势拟合效果\n\n\n\n\n最后,在原始数据的基础上,添加拟合数据,得到如下拟合趋势图,与前面的拟合图比较,可以看出效果提升很明显。\n\nair_passengers_ts <- ts(mod2$fitted.values, start = c(1949, 1), frequency = 12)\nplot(AirPassengers)\nlines(air_passengers_ts, col = \"red\")\n\n\n\n\n\n\n图 39.12: 趋势拟合效果\n\n\n\n\n\n39.3.2 tensorflow 框架\n前面介绍的模型都具有非常强的可解释性,比如各个参数对模型的作用。对于复杂的时间序列数据,比较适合用复杂的模型来拟合,看重模型的泛化能力,而不那么关注模型的机理。\n多层感知机是一种全连接层的前馈神经网络。nnet 包的函数 nnet() 实现了单隐藏层的简单前馈神经网络,可用于时间序列预测,也可用于分类数据的预测。作为对比的基础,下面先用 nnet 包训练和预测数据。\n\n# 准备数据\nair_passengers <- as.matrix(embed(AirPassengers, 4))\ncolnames(air_passengers) <- c(\"y\", \"x3\", \"x2\", \"x1\")\ndata_size <- nrow(air_passengers)\n# 拆分数据集\ntrain_size <- floor(data_size * 0.67)\ntrain_data <- air_passengers[1:train_size, ]\ntest_data <- air_passengers[train_size:data_size, ]\n\n# 随机数种子对结果的影响非常大 试试 set.seed(20232023) \nset.seed(20222022) \n# 单隐藏层 8 个神经元\nmod_nnet <- nnet::nnet(\n y ~ x1 + x2 + x3,\n data = air_passengers, # 数据集\n subset = 1:train_size, # 训练数据的指标向量\n linout = TRUE, size = 4, rang = 0.1,\n decay = 5e-4, maxit = 400, trace = FALSE\n)\n# 预测\ntrain_pred <- predict(mod_nnet, newdata = air_passengers[1:train_size,], type = \"raw\")\n# 训练集 RMSE\nsqrt(mean((air_passengers[1:train_size, \"y\"] - train_pred )^2))\n\n#> [1] 21.59392\n\n# 预测\ntest_pred <- predict(mod_nnet, newdata = 
air_passengers[-(1:train_size),], type = \"raw\")\n# 测试集 RMSE\nsqrt(mean((air_passengers[-(1:train_size), \"y\"] - test_pred)^2))\n\n#> [1] 53.79107\n\n\n下面将原观测序列,训练集和测试集上的预测序列放在一张图上展示。图中,红色曲线表示训练集上的预测结果,绿色曲线为测试集上预测结果。\n\ntrain_pred_ts <- ts(data = train_pred, start = c(1949, 3), frequency = 12)\ntest_pred_ts <- ts(data = test_pred, start = c(1957, 1), frequency = 12)\nplot(AirPassengers)\nlines(train_pred_ts, col = \"red\")\nlines(test_pred_ts, col = \"green\")\n\n\n\n\n\n\n图 39.13: 单层感知机预测\n\n\n\n\n由图可知,在测试集上,随着时间拉长,预测越来越不准。\n下面使用 tensorflow 包构造多层感知机训练数据和预测。\n\nlibrary(tensorflow)\nlibrary(keras)\nset_random_seed(20222022)\n# 模型结构\nmod_mlp <- keras_model_sequential() |> \n layer_dense(units = 12, activation = \"relu\", input_shape = c(3)) |> \n layer_dense(units = 8, activation = \"relu\") |> \n layer_dense(units = 1)\n# 训练目标\ncompile(mod_mlp,\n loss = \"mse\", # 损失函数\n optimizer = \"adam\", # 优化器\n metrics = \"mae\" # 监控度量\n)\n# 模型概览\nsummary(mod_mlp)\n\n#> Model: \"sequential\"\n#> ________________________________________________________________________________\n#> Layer (type) Output Shape Param # \n#> ================================================================================\n#> dense_2 (Dense) (None, 12) 48 \n#> dense_1 (Dense) (None, 8) 104 \n#> dense (Dense) (None, 1) 9 \n#> ================================================================================\n#> Total params: 161 (644.00 Byte)\n#> Trainable params: 161 (644.00 Byte)\n#> Non-trainable params: 0 (0.00 Byte)\n#> ________________________________________________________________________________\n\n\n输入层为 3 个节点,中间两个隐藏层,第一层为 12 个节点,第二层为 8 个节点,全连接网络,最后输出为一层单节点,意味着单个输出。每一层都有节点和权重,参数总数为 161。\n\n# 拟合模型\nfit(mod_mlp,\n x = train_data[, c(\"x1\", \"x2\", \"x3\")],\n y = train_data[, \"y\"],\n epochs = 200,\n batch_size = 10, # 每次更新梯度所用的样本量\n validation_split = 0.2, # 从训练数据中拆分一部分用作验证集\n verbose = 0 # 不显示训练进度\n)\n# 将测试数据代入模型,计算损失函数和监控度量\nevaluate(mod_mlp, test_data[, c(\"x1\", \"x2\", \"x3\")], 
test_data[, \"y\"])\n\n#> 2/2 - 0s - loss: 2517.1836 - mae: 40.8876 - 22ms/epoch - 11ms/step\n\n\n#> loss mae \n#> 2517.18359 40.88759\n\n# 测试集上的预测\nmlp_test_pred <- predict(mod_mlp, test_data[, c(\"x1\", \"x2\", \"x3\")]) \n\n#> 2/2 - 0s - 69ms/epoch - 35ms/step\n\nmlp_train_pred <- predict(mod_mlp, train_data[, c(\"x1\", \"x2\", \"x3\")]) \n\n#> 3/3 - 0s - 19ms/epoch - 6ms/step\n\nsqrt(mean((test_data[, \"y\"] - mlp_test_pred)^2)) # 计算均方根误差\n\n#> [1] 50.17154\n\n\n从 RMSE 来看,MLP(多层感知机)预测效果比单层感知机稍好些,可网络复杂度是增加很多的。\n\nmlp_train_pred_ts <- ts(data = mlp_train_pred, start = c(1949, 3), frequency = 12)\nmlp_test_pred_ts <- ts(data = mlp_test_pred, start = c(1957, 1), frequency = 12)\nplot(AirPassengers)\nlines(mlp_train_pred_ts, col = \"red\")\nlines(mlp_test_pred_ts, col = \"green\")\n\n\n\n\n\n\n图 39.14: 多层感知机预测\n\n\n\n\n下面用 LSTM (长短期记忆)神经网络来训练时间序列数据,预测未来一周的趋势。输出不再是一天(单点输出),而是 7 天的预测值(多点输出)。参考 tensorflow 包的官网中 RNN 递归神经网络的介绍。", + "text": "39.3 一些非参数模型\n\n39.3.1 mgcv 包\nmgcv 包 (S. N. Wood 2017) 是 R 软件内置的推荐组件,由 Simon Wood 开发和维护,历经多年,成熟稳定。函数 bam() 相比于函数 gam() 的优势是可以处理大规模的时间序列数据。对于时间序列数据预测,数万和百万级观测值都可以 (Simon N. Wood, Goude, 和 Shaw 2015)。\n\nair_passengers_tbl <- data.frame(\n y = as.vector(AirPassengers),\n year = rep(1949:1960, each = 12),\n month = rep(1:12, times = 12)\n)\nmod1 <- gam(y ~ s(year) + s(month, bs = \"cr\", k = 12),\n data = air_passengers_tbl, family = gaussian\n)\nsummary(mod1)\n\n#> \n#> Family: gaussian \n#> Link function: identity \n#> \n#> Formula:\n#> y ~ s(year) + s(month, bs = \"cr\", k = 12)\n#> \n#> Parametric coefficients:\n#> Estimate Std. Error t value Pr(>|t|) \n#> (Intercept) 280.299 1.957 143.2 <2e-16 ***\n#> ---\n#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1\n#> \n#> Approximate significance of smooth terms:\n#> edf Ref.df F p-value \n#> s(year) 6.102 7.265 441.39 <2e-16 ***\n#> s(month) 8.796 10.097 38.25 <2e-16 ***\n#> ---\n#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 
0.1 ' ' 1\n#> \n#> R-sq.(adj) = 0.962 Deviance explained = 96.6%\n#> GCV = 619.9 Scale est. = 551.47 n = 144\n\n\n观察年和月的趋势变化,逐年增长趋势基本是线性的,略有波动,逐月变化趋势比较复杂,不过,可以明显看出在 7-9 月是高峰期,11 月和1-3月是低谷期。\n\nlayout(matrix(1:2, nrow = 1))\nplot(mod1, shade = TRUE)\n\n\n\n\n\n\n图 39.8: 年和月的趋势变化\n\n\n\n\n将拟合效果绘制出来,见下图,整体上,捕捉到了趋势和周期,不过,存在欠拟合,年周期内波动幅度随时间有变化趋势,趋势和周期存在交互作用。\n\nair_passengers_ts <- ts(mod1$fitted.values, start = c(1949, 1), frequency = 12)\nplot(AirPassengers)\nlines(air_passengers_ts, col = \"red\")\n\n\n\n\n\n\n图 39.9: 趋势拟合效果\n\n\n\n\n整体上,乘客数逐年呈线性增长,每年不同月份呈现波动,淡季和旺季出行的流量有很大差异,近年来,这种差异的波动在扩大。为了刻画这种情况,考虑年度趋势和月度波动的交互作用。\n\nmod2 <- gam(y ~ s(year, month), data = air_passengers_tbl, family = gaussian)\nsummary(mod2)\n\n#> \n#> Family: gaussian \n#> Link function: identity \n#> \n#> Formula:\n#> y ~ s(year, month)\n#> \n#> Parametric coefficients:\n#> Estimate Std. Error t value Pr(>|t|) \n#> (Intercept) 280.299 1.059 264.7 <2e-16 ***\n#> ---\n#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1\n#> \n#> Approximate significance of smooth terms:\n#> edf Ref.df F p-value \n#> s(year,month) 28.21 28.96 435.5 <2e-16 ***\n#> ---\n#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1\n#> \n#> R-sq.(adj) = 0.989 Deviance explained = 99.1%\n#> GCV = 202.62 Scale est. 
= 161.52 n = 144\n\n\n可以看到,调整的 \\(R^2\\) 明显增加,拟合效果更好,各年各月份的乘客数变化,见下图。\n\nop <- par(mar = c(4, 4, 2, 0))\nplot(mod2)\non.exit(par(op), add = TRUE) \n\n\n\n\n\n\n图 39.10: 交互作用\n\n\n\n\n上图是轮廓图,下面用透视图展示趋势拟合的效果。\n\nop <- par(mar = c(0, 1.5, 0, 0))\nvis.gam(mod2, theta = -35, phi = 20, ticktype = \"detailed\", expand = .65, zlab = \"\")\non.exit(par(op), add = TRUE) \n\n\n\n\n\n\n图 39.11: 趋势拟合效果\n\n\n\n\n最后,在原始数据的基础上,添加拟合数据,得到如下拟合趋势图,与前面的拟合图比较,可以看出效果提升很明显。\n\nair_passengers_ts <- ts(mod2$fitted.values, start = c(1949, 1), frequency = 12)\nplot(AirPassengers)\nlines(air_passengers_ts, col = \"red\")\n\n\n\n\n\n\n图 39.12: 趋势拟合效果\n\n\n\n\n\n39.3.2 tensorflow 框架\n前面介绍的模型都具有非常强的可解释性,比如各个参数对模型的作用。对于复杂的时间序列数据,比较适合用复杂的模型来拟合,看重模型的泛化能力,而不那么关注模型的机理。\n多层感知机是一种全连接层的前馈神经网络。nnet 包的函数 nnet() 实现了单隐藏层的简单前馈神经网络,可用于时间序列预测,也可用于分类数据的预测。作为对比的基础,下面先用 nnet 包训练和预测数据。\n\n# 准备数据\nair_passengers <- as.matrix(embed(AirPassengers, 4))\ncolnames(air_passengers) <- c(\"y\", \"x3\", \"x2\", \"x1\")\ndata_size <- nrow(air_passengers)\n# 拆分数据集\ntrain_size <- floor(data_size * 0.67)\ntrain_data <- air_passengers[1:train_size, ]\ntest_data <- air_passengers[train_size:data_size, ]\n\n# 随机数种子对结果的影响非常大 试试 set.seed(20232023) \nset.seed(20222022) \n# 单隐藏层 8 个神经元\nmod_nnet <- nnet::nnet(\n y ~ x1 + x2 + x3,\n data = air_passengers, # 数据集\n subset = 1:train_size, # 训练数据的指标向量\n linout = TRUE, size = 4, rang = 0.1,\n decay = 5e-4, maxit = 400, trace = FALSE\n)\n# 预测\ntrain_pred <- predict(mod_nnet, newdata = air_passengers[1:train_size,], type = \"raw\")\n# 训练集 RMSE\nsqrt(mean((air_passengers[1:train_size, \"y\"] - train_pred )^2))\n\n#> [1] 21.59392\n\n# 预测\ntest_pred <- predict(mod_nnet, newdata = air_passengers[-(1:train_size),], type = \"raw\")\n# 测试集 RMSE\nsqrt(mean((air_passengers[-(1:train_size), \"y\"] - test_pred)^2))\n\n#> [1] 53.79107\n\n\n下面将原观测序列,训练集和测试集上的预测序列放在一张图上展示。图中,红色曲线表示训练集上的预测结果,绿色曲线为测试集上预测结果。\n\ntrain_pred_ts <- ts(data = train_pred, start = c(1949, 3), frequency = 12)\ntest_pred_ts <- ts(data 
= test_pred, start = c(1957, 1), frequency = 12)\nplot(AirPassengers)\nlines(train_pred_ts, col = \"red\")\nlines(test_pred_ts, col = \"green\")\n\n\n\n\n\n\n图 39.13: 单层感知机预测\n\n\n\n\n由图可知,在测试集上,随着时间拉长,预测越来越不准。\n下面使用 tensorflow 包构造多层感知机训练数据和预测。\n\nlibrary(tensorflow)\nlibrary(keras)\nset_random_seed(20222022)\n# 模型结构\nmod_mlp <- keras_model_sequential() |> \n layer_dense(units = 12, activation = \"relu\", input_shape = c(3)) |> \n layer_dense(units = 8, activation = \"relu\") |> \n layer_dense(units = 1)\n# 训练目标\ncompile(mod_mlp,\n loss = \"mse\", # 损失函数\n optimizer = \"adam\", # 优化器\n metrics = \"mae\" # 监控度量\n)\n# 模型概览\nsummary(mod_mlp)\n\n#> Model: \"sequential\"\n#> ________________________________________________________________________________\n#> Layer (type) Output Shape Param # \n#> ================================================================================\n#> dense_2 (Dense) (None, 12) 48 \n#> dense_1 (Dense) (None, 8) 104 \n#> dense (Dense) (None, 1) 9 \n#> ================================================================================\n#> Total params: 161 (644.00 Byte)\n#> Trainable params: 161 (644.00 Byte)\n#> Non-trainable params: 0 (0.00 Byte)\n#> ________________________________________________________________________________\n\n\n输入层为 3 个节点,中间两个隐藏层,第一层为 12 个节点,第二层为 8 个节点,全连接网络,最后输出为一层单节点,意味着单个输出。每一层都有节点和权重,参数总数为 161。\n\n# 拟合模型\nfit(mod_mlp,\n x = train_data[, c(\"x1\", \"x2\", \"x3\")],\n y = train_data[, \"y\"],\n epochs = 200,\n batch_size = 10, # 每次更新梯度所用的样本量\n validation_split = 0.2, # 从训练数据中拆分一部分用作验证集\n verbose = 0 # 不显示训练进度\n)\n# 将测试数据代入模型,计算损失函数和监控度量\nevaluate(mod_mlp, test_data[, c(\"x1\", \"x2\", \"x3\")], test_data[, \"y\"])\n\n#> 2/2 - 0s - loss: 2517.1836 - mae: 40.8876 - 17ms/epoch - 8ms/step\n\n\n#> loss mae \n#> 2517.18359 40.88759\n\n# 测试集上的预测\nmlp_test_pred <- predict(mod_mlp, test_data[, c(\"x1\", \"x2\", \"x3\")]) \n\n#> 2/2 - 0s - 66ms/epoch - 33ms/step\n\nmlp_train_pred <- predict(mod_mlp, train_data[, c(\"x1\", 
\"x2\", \"x3\")]) \n\n#> 3/3 - 0s - 16ms/epoch - 5ms/step\n\nsqrt(mean((test_data[, \"y\"] - mlp_test_pred)^2)) # 计算均方根误差\n\n#> [1] 50.17154\n\n\n从 RMSE 来看,MLP(多层感知机)预测效果比单层感知机稍好些,可网络复杂度是增加很多的。\n\nmlp_train_pred_ts <- ts(data = mlp_train_pred, start = c(1949, 3), frequency = 12)\nmlp_test_pred_ts <- ts(data = mlp_test_pred, start = c(1957, 1), frequency = 12)\nplot(AirPassengers)\nlines(mlp_train_pred_ts, col = \"red\")\nlines(mlp_test_pred_ts, col = \"green\")\n\n\n\n\n\n\n图 39.14: 多层感知机预测\n\n\n\n\n下面用 LSTM (长短期记忆)神经网络来训练时间序列数据,预测未来一周的趋势。输出不再是一天(单点输出),而是 7 天的预测值(多点输出)。参考 tensorflow 包的官网中 RNN 递归神经网络的介绍。", "crumbs": [ "贝叶斯建模", "39  时间序列回归" @@ -2708,7 +2708,7 @@ "href": "classification-problems.html#sec-support-vector-machines", "title": "40  分类问题", "section": "\n40.5 支持向量机", - "text": "40.5 支持向量机\ne1071 包也提供支持向量机\n\n# e1071\niris_svm <- svm(Species ~ ., data = iris)\niris_svm\n\n\nCall:\nsvm(formula = Species ~ ., data = iris)\n\n\nParameters:\n SVM-Type: C-classification \n SVM-Kernel: radial \n cost: 1 \n\nNumber of Support Vectors: 51\n\n# 预测\niris_svm_pred <- predict(iris_svm, newdata = iris, probability = FALSE)\n# 预测结果\ntable(iris_svm_pred, iris[, 5])\n\n \niris_svm_pred setosa versicolor virginica\n setosa 50 0 0\n versicolor 0 48 2\n virginica 0 2 48\n\n\nkernlab 包提供核支持向量机。\n\nlibrary(kernlab)\niris_ksvm <- ksvm(Species ~ ., data = iris)\niris_ksvm\n\nSupport Vector Machine object of class \"ksvm\" \n\nSV type: C-svc (classification) \n parameter : cost C = 1 \n\nGaussian Radial Basis kernel function. 
\n Hyperparameter : sigma = 0.845469789207426 \n\nNumber of Support Vectors : 59 \n\nObjective Function Value : -4.658 -5.2078 -20.1766 \nTraining error : 0.026667 \n\n\nkernlab 包 (Karatzoglou 等 2004) 的绘图函数 plot() 仅支持二分类模型。\n\niris_pred_svm <- predict(iris_ksvm, iris[, -5], type = \"response\")\ntable(iris_pred_svm, iris[, 5])\n\n \niris_pred_svm setosa versicolor virginica\n setosa 50 0 0\n versicolor 0 48 2\n virginica 0 2 48", + "text": "40.5 支持向量机\ne1071 包也提供支持向量机\n\n# e1071\niris_svm <- svm(Species ~ ., data = iris)\niris_svm\n\n\nCall:\nsvm(formula = Species ~ ., data = iris)\n\n\nParameters:\n SVM-Type: C-classification \n SVM-Kernel: radial \n cost: 1 \n\nNumber of Support Vectors: 51\n\n# 预测\niris_svm_pred <- predict(iris_svm, newdata = iris, probability = FALSE)\n# 预测结果\ntable(iris_svm_pred, iris[, 5])\n\n \niris_svm_pred setosa versicolor virginica\n setosa 50 0 0\n versicolor 0 48 2\n virginica 0 2 48\n\n\nkernlab 包提供核支持向量机。\n\nlibrary(kernlab)\niris_ksvm <- ksvm(Species ~ ., data = iris)\niris_ksvm\n\nSupport Vector Machine object of class \"ksvm\" \n\nSV type: C-svc (classification) \n parameter : cost C = 1 \n\nGaussian Radial Basis kernel function. 
\n Hyperparameter : sigma = 0.623008360928226 \n\nNumber of Support Vectors : 59 \n\nObjective Function Value : -4.0414 -4.3676 -20.9506 \nTraining error : 0.026667 \n\n\nkernlab 包 (Karatzoglou 等 2004) 的绘图函数 plot() 仅支持二分类模型。\n\niris_pred_svm <- predict(iris_ksvm, iris[, -5], type = \"response\")\ntable(iris_pred_svm, iris[, 5])\n\n \niris_pred_svm setosa versicolor virginica\n setosa 50 0 0\n versicolor 0 48 2\n virginica 0 2 48", "crumbs": [ "机器学习", "40  分类问题" @@ -2719,7 +2719,7 @@ "href": "classification-problems.html#sec-k-nearest-neighbour", "title": "40  分类问题", "section": "\n40.6 K 最近邻", - "text": "40.6 K 最近邻\n\n# 将 iris3 数据集拆分为训练集和测试集\niris_train <- rbind(iris3[1:25, , 1], iris3[1:25, , 2], iris3[1:25, , 3])\niris_test <- rbind(iris3[26:50, , 1], iris3[26:50, , 2], iris3[26:50, , 3])\niris_species <- factor(rep(c(\"setosa\", \"versicolor\", \"virginica\"), each = 25))\n\n\nlibrary(class)\n# 分 3 类\niris_knn <- knn(\n train = iris_train, test = iris_test,\n cl = iris_species, k = 3, prob = TRUE\n)\n# 分类结果汇总\ntable(iris_knn, iris_species) \n\n iris_species\niris_knn setosa versicolor virginica\n setosa 25 0 0\n versicolor 0 23 4\n virginica 0 2 21", + "text": "40.6 K 最近邻\n\n# 将 iris3 数据集拆分为训练集和测试集\niris_train <- rbind(iris3[1:25, , 1], iris3[1:25, , 2], iris3[1:25, , 3])\niris_test <- rbind(iris3[26:50, , 1], iris3[26:50, , 2], iris3[26:50, , 3])\niris_species <- factor(rep(c(\"setosa\", \"versicolor\", \"virginica\"), each = 25))\n\n\nlibrary(class)\n# 分 3 类\niris_knn <- knn(\n train = iris_train, test = iris_test,\n cl = iris_species, k = 3, prob = TRUE\n)\n# 分类结果汇总\ntable(iris_knn, iris_species) \n\n iris_species\niris_knn setosa versicolor virginica\n setosa 25 0 0\n versicolor 0 23 3\n virginica 0 2 22", "crumbs": [ "机器学习", "40  分类问题" @@ -2730,7 +2730,7 @@ "href": "classification-problems.html#sec-neural-networks", "title": "40  分类问题", "section": "\n40.7 神经网络", - "text": "40.7 神经网络\n\nlibrary(nnet)\niris_nnet <- nnet(Species ~ ., data = iris, size = 4, 
trace = FALSE)\nsummary(iris_nnet)\n\na 4-4-3 network with 35 weights\noptions were - softmax modelling \n b->h1 i1->h1 i2->h1 i3->h1 i4->h1 \n 0.14 0.26 0.60 0.76 1.01 \n b->h2 i1->h2 i2->h2 i3->h2 i4->h2 \n 0.72 3.10 11.52 -13.44 -6.45 \n b->h3 i1->h3 i2->h3 i3->h3 i4->h3 \n-71.48 -86.52 38.38 86.85 52.74 \n b->h4 i1->h4 i2->h4 i3->h4 i4->h4 \n 3.07 -0.60 2.10 -0.40 -2.95 \n b->o1 h1->o1 h2->o1 h3->o1 h4->o1 \n -0.13 -0.24 30.32 -10.63 5.79 \n b->o2 h1->o2 h2->o2 h3->o2 h4->o2 \n 21.04 -0.40 -28.63 -18.75 16.31 \n b->o3 h1->o3 h2->o3 h3->o3 h4->o3 \n-19.47 0.37 -1.86 29.78 -20.79 \n\n\nsize 隐藏层中的神经元数量\n\niris_pred_nnet <- predict(iris_nnet, newdata = iris[,-5], type = \"class\")\ntable(iris_pred_nnet, iris[, 5])\n\n \niris_pred_nnet setosa versicolor virginica\n setosa 50 0 0\n versicolor 0 49 0\n virginica 0 1 50", + "text": "40.7 神经网络\n\nlibrary(nnet)\niris_nnet <- nnet(Species ~ ., data = iris, size = 4, trace = FALSE)\nsummary(iris_nnet)\n\na 4-4-3 network with 35 weights\noptions were - softmax modelling \n b->h1 i1->h1 i2->h1 i3->h1 i4->h1 \n 148.05 169.39 702.94 -1218.15 -537.34 \n b->h2 i1->h2 i2->h2 i3->h2 i4->h2 \n -51.93 -119.60 -91.22 276.26 130.38 \n b->h3 i1->h3 i2->h3 i3->h3 i4->h3 \n-1371.99 -76.88 -144.45 349.89 337.51 \n b->h4 i1->h4 i2->h4 i3->h4 i4->h4 \n -24.41 -191.75 -58.38 -229.31 -91.16 \n b->o1 h1->o1 h2->o1 h3->o1 h4->o1 \n 71.09 512.43 -293.22 -543.92 4.49 \n b->o2 h1->o2 h2->o2 h3->o2 h4->o2 \n -3.08 -325.66 368.75 16.87 37.73 \n b->o3 h1->o3 h2->o3 h3->o3 h4->o3 \n -67.71 -185.66 -74.37 528.55 -42.73 \n\n\nsize 隐藏层中的神经元数量\n\niris_pred_nnet <- predict(iris_nnet, newdata = iris[,-5], type = \"class\")\ntable(iris_pred_nnet, iris[, 5])\n\n \niris_pred_nnet setosa versicolor virginica\n setosa 50 0 0\n versicolor 0 49 0\n virginica 0 1 50", "crumbs": [ "机器学习", "40  分类问题" @@ -2862,7 +2862,7 @@ "href": "regression-problems.html#sec-linear-regressions", "title": "42  回归问题", "section": "", - "text": "42.1.1 
最小二乘回归\n\\[\n\\mathcal{L}(\\bm{\\beta}) = \\sum_{i=1}^{n}(y_i - \\bm{x}_i^{\\top}\\bm{\\beta})^2\n\\]\n\nfit_lm <- lm(medv ~ ., data = Boston)\nsummary(fit_lm)\n\n#> \n#> Call:\n#> lm(formula = medv ~ ., data = Boston)\n#> \n#> Residuals:\n#> Min 1Q Median 3Q Max \n#> -15.595 -2.730 -0.518 1.777 26.199 \n#> \n#> Coefficients:\n#> Estimate Std. Error t value Pr(>|t|) \n#> (Intercept) 3.646e+01 5.103e+00 7.144 3.28e-12 ***\n#> crim -1.080e-01 3.286e-02 -3.287 0.001087 ** \n#> zn 4.642e-02 1.373e-02 3.382 0.000778 ***\n#> indus 2.056e-02 6.150e-02 0.334 0.738288 \n#> chas 2.687e+00 8.616e-01 3.118 0.001925 ** \n#> nox -1.777e+01 3.820e+00 -4.651 4.25e-06 ***\n#> rm 3.810e+00 4.179e-01 9.116 < 2e-16 ***\n#> age 6.922e-04 1.321e-02 0.052 0.958229 \n#> dis -1.476e+00 1.995e-01 -7.398 6.01e-13 ***\n#> rad 3.060e-01 6.635e-02 4.613 5.07e-06 ***\n#> tax -1.233e-02 3.760e-03 -3.280 0.001112 ** \n#> ptratio -9.527e-01 1.308e-01 -7.283 1.31e-12 ***\n#> black 9.312e-03 2.686e-03 3.467 0.000573 ***\n#> lstat -5.248e-01 5.072e-02 -10.347 < 2e-16 ***\n#> ---\n#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1\n#> \n#> Residual standard error: 4.745 on 492 degrees of freedom\n#> Multiple R-squared: 0.7406, Adjusted R-squared: 0.7338 \n#> F-statistic: 108.1 on 13 and 492 DF, p-value: < 2.2e-16\n\n\n\n42.1.2 逐步回归\n逐步回归是筛选变量,有向前、向后和两个方向同时进行三个方法。\n\n\ndirection = \"both\" 双向\n\ndirection = \"backward\" 向后\n\ndirection = \"forward\" 向前\n\n\nfit_step <- step(fit_lm, direction = \"both\", trace = 0)\nsummary(fit_step)\n\n#> \n#> Call:\n#> lm(formula = medv ~ crim + zn + chas + nox + rm + dis + rad + \n#> tax + ptratio + black + lstat, data = Boston)\n#> \n#> Residuals:\n#> Min 1Q Median 3Q Max \n#> -15.5984 -2.7386 -0.5046 1.7273 26.2373 \n#> \n#> Coefficients:\n#> Estimate Std. 
Error t value Pr(>|t|) \n#> (Intercept) 36.341145 5.067492 7.171 2.73e-12 ***\n#> crim -0.108413 0.032779 -3.307 0.001010 ** \n#> zn 0.045845 0.013523 3.390 0.000754 ***\n#> chas 2.718716 0.854240 3.183 0.001551 ** \n#> nox -17.376023 3.535243 -4.915 1.21e-06 ***\n#> rm 3.801579 0.406316 9.356 < 2e-16 ***\n#> dis -1.492711 0.185731 -8.037 6.84e-15 ***\n#> rad 0.299608 0.063402 4.726 3.00e-06 ***\n#> tax -0.011778 0.003372 -3.493 0.000521 ***\n#> ptratio -0.946525 0.129066 -7.334 9.24e-13 ***\n#> black 0.009291 0.002674 3.475 0.000557 ***\n#> lstat -0.522553 0.047424 -11.019 < 2e-16 ***\n#> ---\n#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1\n#> \n#> Residual standard error: 4.736 on 494 degrees of freedom\n#> Multiple R-squared: 0.7406, Adjusted R-squared: 0.7348 \n#> F-statistic: 128.2 on 11 and 494 DF, p-value: < 2.2e-16\n\n\n\n42.1.3 偏最小二乘回归\n偏最小二乘回归适用于存在多重共线性问题或变量个数远大于样本量的情况。\n10 折交叉验证,ncomp = 6 表示 6 个主成分,拟合方法 kernelpls 表示核算法,validation = \"CV\" 表示采用交叉验证的方式调整参数。\n\nfit_pls <- pls::plsr(medv ~ ., ncomp = 6, data = Boston, validation = \"CV\")\nsummary(fit_pls)\n\n#> Data: X dimension: 506 13 \n#> Y dimension: 506 1\n#> Fit method: kernelpls\n#> Number of components considered: 6\n#> \n#> VALIDATION: RMSEP\n#> Cross-validated using 10 random segments.\n#> (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps\n#> CV 9.206 8.033 7.902 7.632 6.539 5.944 5.799\n#> adjCV 9.206 8.031 7.901 7.631 6.535 5.936 5.794\n#> \n#> TRAINING: % variance explained\n#> 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps\n#> X 80.51 94.45 98.97 99.34 99.80 99.91\n#> medv 24.23 26.94 32.05 51.05 60.08 62.49\n\n\n交叉验证的方法还可选留一交叉验证 validation = \"LOO\" 。预测的均方根误差 RMSEP 来评估交叉验证的结果。\n\npls::validationplot(fit_pls, val.type = \"RMSEP\")\n\n\n\n\n\n\n图 42.1: RMSE 随成分数量的变化\n\n\n\n\n\n42.1.4 主成分回归\n主成分回归采用降维的方法处理高维和多重共线性问题。\n10 折交叉验证,6 个主成分,拟合方法 svdpc 表示奇异值分解算法。\n\nfit_pcr <- pls::pcr(medv ~ ., ncomp = 6, data = Boston, validation = \"CV\")\nsummary(fit_pcr)\n\n#> Data: X 
dimension: 506 13 \n#> Y dimension: 506 1\n#> Fit method: svdpc\n#> Number of components considered: 6\n#> \n#> VALIDATION: RMSEP\n#> Cross-validated using 10 random segments.\n#> (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps\n#> CV 9.206 8.051 8.024 7.797 7.778 7.647 6.034\n#> adjCV 9.206 8.050 8.022 7.795 7.776 7.656 6.026\n#> \n#> TRAINING: % variance explained\n#> 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps\n#> X 80.58 96.89 99.02 99.72 99.85 99.92\n#> medv 23.71 24.28 28.77 29.33 32.71 57.77", + "text": "42.1.1 最小二乘回归\n\\[\n\\mathcal{L}(\\bm{\\beta}) = \\sum_{i=1}^{n}(y_i - \\bm{x}_i^{\\top}\\bm{\\beta})^2\n\\]\n\nfit_lm <- lm(medv ~ ., data = Boston)\nsummary(fit_lm)\n\n#> \n#> Call:\n#> lm(formula = medv ~ ., data = Boston)\n#> \n#> Residuals:\n#> Min 1Q Median 3Q Max \n#> -15.595 -2.730 -0.518 1.777 26.199 \n#> \n#> Coefficients:\n#> Estimate Std. Error t value Pr(>|t|) \n#> (Intercept) 3.646e+01 5.103e+00 7.144 3.28e-12 ***\n#> crim -1.080e-01 3.286e-02 -3.287 0.001087 ** \n#> zn 4.642e-02 1.373e-02 3.382 0.000778 ***\n#> indus 2.056e-02 6.150e-02 0.334 0.738288 \n#> chas 2.687e+00 8.616e-01 3.118 0.001925 ** \n#> nox -1.777e+01 3.820e+00 -4.651 4.25e-06 ***\n#> rm 3.810e+00 4.179e-01 9.116 < 2e-16 ***\n#> age 6.922e-04 1.321e-02 0.052 0.958229 \n#> dis -1.476e+00 1.995e-01 -7.398 6.01e-13 ***\n#> rad 3.060e-01 6.635e-02 4.613 5.07e-06 ***\n#> tax -1.233e-02 3.760e-03 -3.280 0.001112 ** \n#> ptratio -9.527e-01 1.308e-01 -7.283 1.31e-12 ***\n#> black 9.312e-03 2.686e-03 3.467 0.000573 ***\n#> lstat -5.248e-01 5.072e-02 -10.347 < 2e-16 ***\n#> ---\n#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 
0.1 ' ' 1\n#> \n#> Residual standard error: 4.745 on 492 degrees of freedom\n#> Multiple R-squared: 0.7406, Adjusted R-squared: 0.7338 \n#> F-statistic: 108.1 on 13 and 492 DF, p-value: < 2.2e-16\n\n\n\n42.1.2 逐步回归\n逐步回归是筛选变量,有向前、向后和两个方向同时进行三个方法。\n\n\ndirection = \"both\" 双向\n\ndirection = \"backward\" 向后\n\ndirection = \"forward\" 向前\n\n\nfit_step <- step(fit_lm, direction = \"both\", trace = 0)\nsummary(fit_step)\n\n#> \n#> Call:\n#> lm(formula = medv ~ crim + zn + chas + nox + rm + dis + rad + \n#> tax + ptratio + black + lstat, data = Boston)\n#> \n#> Residuals:\n#> Min 1Q Median 3Q Max \n#> -15.5984 -2.7386 -0.5046 1.7273 26.2373 \n#> \n#> Coefficients:\n#> Estimate Std. Error t value Pr(>|t|) \n#> (Intercept) 36.341145 5.067492 7.171 2.73e-12 ***\n#> crim -0.108413 0.032779 -3.307 0.001010 ** \n#> zn 0.045845 0.013523 3.390 0.000754 ***\n#> chas 2.718716 0.854240 3.183 0.001551 ** \n#> nox -17.376023 3.535243 -4.915 1.21e-06 ***\n#> rm 3.801579 0.406316 9.356 < 2e-16 ***\n#> dis -1.492711 0.185731 -8.037 6.84e-15 ***\n#> rad 0.299608 0.063402 4.726 3.00e-06 ***\n#> tax -0.011778 0.003372 -3.493 0.000521 ***\n#> ptratio -0.946525 0.129066 -7.334 9.24e-13 ***\n#> black 0.009291 0.002674 3.475 0.000557 ***\n#> lstat -0.522553 0.047424 -11.019 < 2e-16 ***\n#> ---\n#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 
0.1 ' ' 1\n#> \n#> Residual standard error: 4.736 on 494 degrees of freedom\n#> Multiple R-squared: 0.7406, Adjusted R-squared: 0.7348 \n#> F-statistic: 128.2 on 11 and 494 DF, p-value: < 2.2e-16\n\n\n\n42.1.3 偏最小二乘回归\n偏最小二乘回归适用于存在多重共线性问题或变量个数远大于样本量的情况。\n10 折交叉验证,ncomp = 6 表示 6 个主成分,拟合方法 kernelpls 表示核算法,validation = \"CV\" 表示采用交叉验证的方式调整参数。\n\nfit_pls <- pls::plsr(medv ~ ., ncomp = 6, data = Boston, validation = \"CV\")\nsummary(fit_pls)\n\n#> Data: X dimension: 506 13 \n#> Y dimension: 506 1\n#> Fit method: kernelpls\n#> Number of components considered: 6\n#> \n#> VALIDATION: RMSEP\n#> Cross-validated using 10 random segments.\n#> (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps\n#> CV 9.206 8.058 7.923 7.649 6.527 5.885 5.742\n#> adjCV 9.206 8.055 7.921 7.646 6.526 5.881 5.739\n#> \n#> TRAINING: % variance explained\n#> 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps\n#> X 80.51 94.45 98.97 99.34 99.80 99.91\n#> medv 24.23 26.94 32.05 51.05 60.08 62.49\n\n\n交叉验证的方法还可选留一交叉验证 validation = \"LOO\" 。预测的均方根误差 RMSEP 来评估交叉验证的结果。\n\npls::validationplot(fit_pls, val.type = \"RMSEP\")\n\n\n\n\n\n\n图 42.1: RMSE 随成分数量的变化\n\n\n\n\n\n42.1.4 主成分回归\n主成分回归采用降维的方法处理高维和多重共线性问题。\n10 折交叉验证,6 个主成分,拟合方法 svdpc 表示奇异值分解算法。\n\nfit_pcr <- pls::pcr(medv ~ ., ncomp = 6, data = Boston, validation = \"CV\")\nsummary(fit_pcr)\n\n#> Data: X dimension: 506 13 \n#> Y dimension: 506 1\n#> Fit method: svdpc\n#> Number of components considered: 6\n#> \n#> VALIDATION: RMSEP\n#> Cross-validated using 10 random segments.\n#> (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps\n#> CV 9.206 8.056 8.038 7.816 7.806 7.661 6.088\n#> adjCV 9.206 8.055 8.036 7.813 7.801 7.663 6.080\n#> \n#> TRAINING: % variance explained\n#> 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps\n#> X 80.58 96.89 99.02 99.72 99.85 99.92\n#> medv 23.71 24.28 28.77 29.33 32.71 57.77", "crumbs": [ "机器学习", "42  回归问题" @@ -2884,7 +2884,7 @@ "href": "regression-problems.html#sec-svm-regression", "title": "42  回归问题", 
"section": "\n42.3 支持向量机", - "text": "42.3 支持向量机\n\nlibrary(kernlab)\nfit_ksvm <- ksvm(medv ~ ., data = Boston)\nfit_ksvm\n\n#> Support Vector Machine object of class \"ksvm\" \n#> \n#> SV type: eps-svr (regression) \n#> parameter : epsilon = 0.1 cost C = 1 \n#> \n#> Gaussian Radial Basis kernel function. \n#> Hyperparameter : sigma = 0.0974585171913089 \n#> \n#> Number of Support Vectors : 337 \n#> \n#> Objective Function Value : -79.922 \n#> Training error : 0.098901\n\n\n\n# 预测\npred_medv_svm <- predict(fit_ksvm, newdata = Boston)\n# RMSE\nrmse(Boston$medv, pred_medv_svm)\n\n#> [1] 2.892347", + "text": "42.3 支持向量机\n\nlibrary(kernlab)\nfit_ksvm <- ksvm(medv ~ ., data = Boston)\nfit_ksvm\n\n#> Support Vector Machine object of class \"ksvm\" \n#> \n#> SV type: eps-svr (regression) \n#> parameter : epsilon = 0.1 cost C = 1 \n#> \n#> Gaussian Radial Basis kernel function. \n#> Hyperparameter : sigma = 0.117108604447976 \n#> \n#> Number of Support Vectors : 336 \n#> \n#> Objective Function Value : -79.0442 \n#> Training error : 0.093441\n\n\n\n# 预测\npred_medv_svm <- predict(fit_ksvm, newdata = Boston)\n# RMSE\nrmse(Boston$medv, pred_medv_svm)\n\n#> [1] 2.811383", "crumbs": [ "机器学习", "42  回归问题" @@ -2895,7 +2895,7 @@ "href": "regression-problems.html#sec-nnet-regression", "title": "42  回归问题", "section": "\n42.4 神经网络", - "text": "42.4 神经网络\n单隐藏层的神经网络\n\nlibrary(nnet)\nfit_nnet <- nnet(medv ~ .,\n data = Boston, trace = FALSE,\n size = 12, # 隐藏层单元数量\n maxit = 500, # 最大迭代次数\n linout = TRUE, # 线性输出单元\n decay = 0.01 # 权重下降的参数\n)\npred_medv_nnet <- predict(fit_nnet, newdata = Boston[, -14], type = \"raw\")\nrmse(Boston$medv, pred_medv_nnet)\n\n#> [1] 2.78669", + "text": "42.4 神经网络\n单隐藏层的神经网络\n\nlibrary(nnet)\nfit_nnet <- nnet(medv ~ .,\n data = Boston, trace = FALSE,\n size = 12, # 隐藏层单元数量\n maxit = 500, # 最大迭代次数\n linout = TRUE, # 线性输出单元\n decay = 0.01 # 权重下降的参数\n)\npred_medv_nnet <- predict(fit_nnet, newdata = Boston[, -14], type = \"raw\")\nrmse(Boston$medv, 
pred_medv_nnet)\n\n#> [1] 2.687467", "crumbs": [ "机器学习", "42  回归问题" @@ -2917,7 +2917,7 @@ "href": "regression-problems.html#sec-rf-regression", "title": "42  回归问题", "section": "\n42.6 随机森林", - "text": "42.6 随机森林\n\nlibrary(randomForest)\nfit_rf <- randomForest(medv ~ ., data = Boston)\nprint(fit_rf)\n\n#> \n#> Call:\n#> randomForest(formula = medv ~ ., data = Boston) \n#> Type of random forest: regression\n#> Number of trees: 500\n#> No. of variables tried at each split: 4\n#> \n#> Mean of squared residuals: 9.897626\n#> % Var explained: 88.28\n\npred_medv_rf <- predict(fit_rf, newdata = Boston[, -14])\nrmse(Boston$medv, pred_medv_rf)\n\n#> [1] 1.399153", + "text": "42.6 随机森林\n\nlibrary(randomForest)\nfit_rf <- randomForest(medv ~ ., data = Boston)\nprint(fit_rf)\n\n#> \n#> Call:\n#> randomForest(formula = medv ~ ., data = Boston) \n#> Type of random forest: regression\n#> Number of trees: 500\n#> No. of variables tried at each split: 4\n#> \n#> Mean of squared residuals: 9.832193\n#> % Var explained: 88.35\n\npred_medv_rf <- predict(fit_rf, newdata = Boston[, -14])\nrmse(Boston$medv, pred_medv_rf)\n\n#> [1] 1.412122", "crumbs": [ "机器学习", "42  回归问题" diff --git a/statistical-computation_files/figure-pdf/fig-log-logit-lik-1.pdf b/statistical-computation_files/figure-pdf/fig-log-logit-lik-1.pdf index 9e476bd3..50dc9134 100644 Binary files a/statistical-computation_files/figure-pdf/fig-log-logit-lik-1.pdf and b/statistical-computation_files/figure-pdf/fig-log-logit-lik-1.pdf differ diff --git a/statistical-computation_files/figure-pdf/fig-logistic-1.pdf b/statistical-computation_files/figure-pdf/fig-logistic-1.pdf index 3af07122..c5d45a36 100644 Binary files a/statistical-computation_files/figure-pdf/fig-logistic-1.pdf and b/statistical-computation_files/figure-pdf/fig-logistic-1.pdf differ diff --git a/statistical-computation_files/figure-pdf/fig-logistic-2.pdf b/statistical-computation_files/figure-pdf/fig-logistic-2.pdf index ba8db419..444e04c0 100644 Binary files 
a/statistical-computation_files/figure-pdf/fig-logistic-2.pdf and b/statistical-computation_files/figure-pdf/fig-logistic-2.pdf differ diff --git a/statistical-computation_files/figure-pdf/fig-logit-glmnet-1.pdf b/statistical-computation_files/figure-pdf/fig-logit-glmnet-1.pdf index 47dc03d7..f9af28c9 100644 Binary files a/statistical-computation_files/figure-pdf/fig-logit-glmnet-1.pdf and b/statistical-computation_files/figure-pdf/fig-logit-glmnet-1.pdf differ diff --git a/statistical-computation_files/figure-pdf/fig-logit-glmnet-lambda-1.pdf b/statistical-computation_files/figure-pdf/fig-logit-glmnet-lambda-1.pdf index a1fe41f0..86d0b4ea 100644 Binary files a/statistical-computation_files/figure-pdf/fig-logit-glmnet-lambda-1.pdf and b/statistical-computation_files/figure-pdf/fig-logit-glmnet-lambda-1.pdf differ diff --git a/statistical-computation_files/figure-pdf/fig-logit-roc-1.pdf b/statistical-computation_files/figure-pdf/fig-logit-roc-1.pdf index 645cd0e9..b0792032 100644 Binary files a/statistical-computation_files/figure-pdf/fig-logit-roc-1.pdf and b/statistical-computation_files/figure-pdf/fig-logit-roc-1.pdf differ diff --git a/time-series-regression.html b/time-series-regression.html index 034f0fa3..a652c705 100644 --- a/time-series-regression.html +++ b/time-series-regression.html @@ -900,7 +900,7 @@

    # 将测试数据代入模型,计算损失函数和监控度量 evaluate(mod_mlp, test_data[, c("x1", "x2", "x3")], test_data[, "y"])
    -
    #> 2/2 - 0s - loss: 2517.1836 - mae: 40.8876 - 22ms/epoch - 11ms/step
    +
    #> 2/2 - 0s - loss: 2517.1836 - mae: 40.8876 - 17ms/epoch - 8ms/step
    #>       loss        mae 
    @@ -1518,11 +1518,11 @@ 

    # 测试集上的预测
     mlp_test_pred <- predict(mod_mlp, test_data[, c("x1", "x2", "x3")]) 

    -
    #> 2/2 - 0s - 69ms/epoch - 35ms/step
    +
    #> 2/2 - 0s - 66ms/epoch - 33ms/step
    mlp_train_pred <- predict(mod_mlp, train_data[, c("x1", "x2", "x3")]) 
    -
    #> 3/3 - 0s - 19ms/epoch - 6ms/step
    +
    #> 3/3 - 0s - 16ms/epoch - 5ms/step
    sqrt(mean((test_data[, "y"] - mlp_test_pred)^2)) # 计算均方根误差
    diff --git a/time-series-regression_files/figure-html/fig-fitted-airpassengers-1.png b/time-series-regression_files/figure-html/fig-fitted-airpassengers-1.png index 9e161fd3..479569e0 100644 Binary files a/time-series-regression_files/figure-html/fig-fitted-airpassengers-1.png and b/time-series-regression_files/figure-html/fig-fitted-airpassengers-1.png differ diff --git a/time-series-regression_files/figure-html/fig-fitted-airpassengers-2.png b/time-series-regression_files/figure-html/fig-fitted-airpassengers-2.png index 52780366..47e00cfc 100644 Binary files a/time-series-regression_files/figure-html/fig-fitted-airpassengers-2.png and b/time-series-regression_files/figure-html/fig-fitted-airpassengers-2.png differ diff --git a/time-series-regression_files/figure-pdf/fig-fitted-airpassengers-1.pdf b/time-series-regression_files/figure-pdf/fig-fitted-airpassengers-1.pdf index 2f788b6e..1f72f4aa 100644 Binary files a/time-series-regression_files/figure-pdf/fig-fitted-airpassengers-1.pdf and b/time-series-regression_files/figure-pdf/fig-fitted-airpassengers-1.pdf differ diff --git a/time-series-regression_files/figure-pdf/fig-fitted-airpassengers-2.pdf b/time-series-regression_files/figure-pdf/fig-fitted-airpassengers-2.pdf index ae7ebaac..2deb7f5d 100644 Binary files a/time-series-regression_files/figure-pdf/fig-fitted-airpassengers-2.pdf and b/time-series-regression_files/figure-pdf/fig-fitted-airpassengers-2.pdf differ diff --git a/time-series-regression_files/figure-pdf/fig-lm-1.pdf b/time-series-regression_files/figure-pdf/fig-lm-1.pdf index a6604510..cb8c169d 100644 Binary files a/time-series-regression_files/figure-pdf/fig-lm-1.pdf and b/time-series-regression_files/figure-pdf/fig-lm-1.pdf differ diff --git a/time-series-regression_files/figure-pdf/fig-log-airpassengers-1.pdf b/time-series-regression_files/figure-pdf/fig-log-airpassengers-1.pdf index 2733bc00..2a728b22 100644 Binary files 
a/time-series-regression_files/figure-pdf/fig-log-airpassengers-1.pdf and b/time-series-regression_files/figure-pdf/fig-log-airpassengers-1.pdf differ diff --git a/time-series-regression_files/figure-pdf/fig-log-airpassengers-2.pdf b/time-series-regression_files/figure-pdf/fig-log-airpassengers-2.pdf index 70b6fd5b..fe31a56f 100644 Binary files a/time-series-regression_files/figure-pdf/fig-log-airpassengers-2.pdf and b/time-series-regression_files/figure-pdf/fig-log-airpassengers-2.pdf differ diff --git a/time-series-regression_files/figure-pdf/fig-log-return-1.pdf b/time-series-regression_files/figure-pdf/fig-log-return-1.pdf index f5eca13d..999e0dc7 100644 Binary files a/time-series-regression_files/figure-pdf/fig-log-return-1.pdf and b/time-series-regression_files/figure-pdf/fig-log-return-1.pdf differ diff --git a/time-series-regression_files/figure-pdf/fig-log-return-resid-1.pdf b/time-series-regression_files/figure-pdf/fig-log-return-resid-1.pdf index c9989d29..16d5f7dd 100644 Binary files a/time-series-regression_files/figure-pdf/fig-log-return-resid-1.pdf and b/time-series-regression_files/figure-pdf/fig-log-return-resid-1.pdf differ diff --git a/time-series-regression_files/figure-pdf/fig-log-return-resid-2.pdf b/time-series-regression_files/figure-pdf/fig-log-return-resid-2.pdf index 62494576..c8878533 100644 Binary files a/time-series-regression_files/figure-pdf/fig-log-return-resid-2.pdf and b/time-series-regression_files/figure-pdf/fig-log-return-resid-2.pdf differ diff --git a/time-series-regression_files/figure-pdf/fig-meituan-log-return-1.pdf b/time-series-regression_files/figure-pdf/fig-meituan-log-return-1.pdf index c7bc73b6..619a380e 100644 Binary files a/time-series-regression_files/figure-pdf/fig-meituan-log-return-1.pdf and b/time-series-regression_files/figure-pdf/fig-meituan-log-return-1.pdf differ diff --git a/time-series-regression_files/figure-pdf/fig-meituan-log-return-2.pdf 
b/time-series-regression_files/figure-pdf/fig-meituan-log-return-2.pdf index a076ecc8..231d9c82 100644 Binary files a/time-series-regression_files/figure-pdf/fig-meituan-log-return-2.pdf and b/time-series-regression_files/figure-pdf/fig-meituan-log-return-2.pdf differ diff --git a/time-series-regression_files/figure-pdf/fig-meituan-stack-1.pdf b/time-series-regression_files/figure-pdf/fig-meituan-stack-1.pdf index 2189f69f..14ea93b7 100644 Binary files a/time-series-regression_files/figure-pdf/fig-meituan-stack-1.pdf and b/time-series-regression_files/figure-pdf/fig-meituan-stack-1.pdf differ diff --git a/time-series-regression_files/figure-pdf/fig-mgcv-1-1.pdf b/time-series-regression_files/figure-pdf/fig-mgcv-1-1.pdf index 1dbaff1e..61868c3b 100644 Binary files a/time-series-regression_files/figure-pdf/fig-mgcv-1-1.pdf and b/time-series-regression_files/figure-pdf/fig-mgcv-1-1.pdf differ diff --git a/time-series-regression_files/figure-pdf/fig-mgcv-2-1.pdf b/time-series-regression_files/figure-pdf/fig-mgcv-2-1.pdf index 95b99704..9894787a 100644 Binary files a/time-series-regression_files/figure-pdf/fig-mgcv-2-1.pdf and b/time-series-regression_files/figure-pdf/fig-mgcv-2-1.pdf differ diff --git a/time-series-regression_files/figure-pdf/fig-mgcv-interaction-1.pdf b/time-series-regression_files/figure-pdf/fig-mgcv-interaction-1.pdf index 252e9892..8f6ec03e 100644 Binary files a/time-series-regression_files/figure-pdf/fig-mgcv-interaction-1.pdf and b/time-series-regression_files/figure-pdf/fig-mgcv-interaction-1.pdf differ diff --git a/time-series-regression_files/figure-pdf/fig-mgcv-persp-1.pdf b/time-series-regression_files/figure-pdf/fig-mgcv-persp-1.pdf index e11c609b..e80e4d07 100644 Binary files a/time-series-regression_files/figure-pdf/fig-mgcv-persp-1.pdf and b/time-series-regression_files/figure-pdf/fig-mgcv-persp-1.pdf differ diff --git a/time-series-regression_files/figure-pdf/fig-mgcv-trend-1.pdf 
b/time-series-regression_files/figure-pdf/fig-mgcv-trend-1.pdf index e6871e26..1e124e12 100644 Binary files a/time-series-regression_files/figure-pdf/fig-mgcv-trend-1.pdf and b/time-series-regression_files/figure-pdf/fig-mgcv-trend-1.pdf differ diff --git a/time-series-regression_files/figure-pdf/fig-nnet-1.pdf b/time-series-regression_files/figure-pdf/fig-nnet-1.pdf index 55cda04f..b0345258 100644 Binary files a/time-series-regression_files/figure-pdf/fig-nnet-1.pdf and b/time-series-regression_files/figure-pdf/fig-nnet-1.pdf differ diff --git a/time-series-regression_files/figure-pdf/fig-sunspots-1.pdf b/time-series-regression_files/figure-pdf/fig-sunspots-1.pdf index 04178e49..6158b73e 100644 Binary files a/time-series-regression_files/figure-pdf/fig-sunspots-1.pdf and b/time-series-regression_files/figure-pdf/fig-sunspots-1.pdf differ diff --git a/time-series-regression_files/figure-pdf/fig-tensorflow-mlp-1.pdf b/time-series-regression_files/figure-pdf/fig-tensorflow-mlp-1.pdf index 3612281f..8430b74b 100644 Binary files a/time-series-regression_files/figure-pdf/fig-tensorflow-mlp-1.pdf and b/time-series-regression_files/figure-pdf/fig-tensorflow-mlp-1.pdf differ diff --git a/visualization-advanced_files/figure-html/fig-ggplot2-stripchart-3.png b/visualization-advanced_files/figure-html/fig-ggplot2-stripchart-3.png index a30f62bc..eb2ac3dd 100644 Binary files a/visualization-advanced_files/figure-html/fig-ggplot2-stripchart-3.png and b/visualization-advanced_files/figure-html/fig-ggplot2-stripchart-3.png differ diff --git a/visualization-advanced_files/figure-html/fig-ridge-line-2.png b/visualization-advanced_files/figure-html/fig-ridge-line-2.png index 78f6cd0d..0e130e06 100644 Binary files a/visualization-advanced_files/figure-html/fig-ridge-line-2.png and b/visualization-advanced_files/figure-html/fig-ridge-line-2.png differ diff --git a/visualization-advanced_files/figure-html/fig-sina-1.png b/visualization-advanced_files/figure-html/fig-sina-1.png index 
28c53bb5..57266042 100644 Binary files a/visualization-advanced_files/figure-html/fig-sina-1.png and b/visualization-advanced_files/figure-html/fig-sina-1.png differ diff --git a/visualization-advanced_files/figure-html/fig-sina-2.png b/visualization-advanced_files/figure-html/fig-sina-2.png index a0405b61..661e0500 100644 Binary files a/visualization-advanced_files/figure-html/fig-sina-2.png and b/visualization-advanced_files/figure-html/fig-sina-2.png differ diff --git a/visualization-advanced_files/figure-pdf/fig-bump-1.pdf b/visualization-advanced_files/figure-pdf/fig-bump-1.pdf index 5b39241f..f201f8f1 100644 Binary files a/visualization-advanced_files/figure-pdf/fig-bump-1.pdf and b/visualization-advanced_files/figure-pdf/fig-bump-1.pdf differ diff --git a/visualization-advanced_files/figure-pdf/fig-china-raise-illiteracy-1.pdf b/visualization-advanced_files/figure-pdf/fig-china-raise-illiteracy-1.pdf index d99b96dd..9ccc6aac 100644 Binary files a/visualization-advanced_files/figure-pdf/fig-china-raise-illiteracy-1.pdf and b/visualization-advanced_files/figure-pdf/fig-china-raise-illiteracy-1.pdf differ diff --git a/visualization-advanced_files/figure-pdf/fig-china-raise-illiteracy-bubble-1.pdf b/visualization-advanced_files/figure-pdf/fig-china-raise-illiteracy-bubble-1.pdf index 3cab3f27..73180285 100644 Binary files a/visualization-advanced_files/figure-pdf/fig-china-raise-illiteracy-bubble-1.pdf and b/visualization-advanced_files/figure-pdf/fig-china-raise-illiteracy-bubble-1.pdf differ diff --git a/visualization-advanced_files/figure-pdf/fig-clopper-pearson-ci-1.pdf b/visualization-advanced_files/figure-pdf/fig-clopper-pearson-ci-1.pdf index 84d8ef12..beb89888 100644 Binary files a/visualization-advanced_files/figure-pdf/fig-clopper-pearson-ci-1.pdf and b/visualization-advanced_files/figure-pdf/fig-clopper-pearson-ci-1.pdf differ diff --git a/visualization-advanced_files/figure-pdf/fig-density-1.pdf 
b/visualization-advanced_files/figure-pdf/fig-density-1.pdf index 0d874c74..1a4dfc56 100644 Binary files a/visualization-advanced_files/figure-pdf/fig-density-1.pdf and b/visualization-advanced_files/figure-pdf/fig-density-1.pdf differ diff --git a/visualization-advanced_files/figure-pdf/fig-density-2d-1.pdf b/visualization-advanced_files/figure-pdf/fig-density-2d-1.pdf index 9951bb36..e79d9494 100644 Binary files a/visualization-advanced_files/figure-pdf/fig-density-2d-1.pdf and b/visualization-advanced_files/figure-pdf/fig-density-2d-1.pdf differ diff --git a/visualization-advanced_files/figure-pdf/fig-density-2d-filled-1.pdf b/visualization-advanced_files/figure-pdf/fig-density-2d-filled-1.pdf index 6c473a63..aea3add1 100644 Binary files a/visualization-advanced_files/figure-pdf/fig-density-2d-filled-1.pdf and b/visualization-advanced_files/figure-pdf/fig-density-2d-filled-1.pdf differ diff --git a/visualization-advanced_files/figure-pdf/fig-density-diamonds-1.pdf b/visualization-advanced_files/figure-pdf/fig-density-diamonds-1.pdf index 5d36822f..71a16c0e 100644 Binary files a/visualization-advanced_files/figure-pdf/fig-density-diamonds-1.pdf and b/visualization-advanced_files/figure-pdf/fig-density-diamonds-1.pdf differ diff --git a/visualization-advanced_files/figure-pdf/fig-density-diamonds-2.pdf b/visualization-advanced_files/figure-pdf/fig-density-diamonds-2.pdf index 0afc0389..141947bc 100644 Binary files a/visualization-advanced_files/figure-pdf/fig-density-diamonds-2.pdf and b/visualization-advanced_files/figure-pdf/fig-density-diamonds-2.pdf differ diff --git a/visualization-advanced_files/figure-pdf/fig-density-hdr-1.pdf b/visualization-advanced_files/figure-pdf/fig-density-hdr-1.pdf index ba6005b2..4c40e5fe 100644 Binary files a/visualization-advanced_files/figure-pdf/fig-density-hdr-1.pdf and b/visualization-advanced_files/figure-pdf/fig-density-hdr-1.pdf differ diff --git a/visualization-advanced_files/figure-pdf/fig-density-margins-1.pdf 
b/visualization-advanced_files/figure-pdf/fig-density-margins-1.pdf index 7404b0ff..b70eb986 100644 Binary files a/visualization-advanced_files/figure-pdf/fig-density-margins-1.pdf and b/visualization-advanced_files/figure-pdf/fig-density-margins-1.pdf differ diff --git a/visualization-advanced_files/figure-pdf/fig-density-stack-1.pdf b/visualization-advanced_files/figure-pdf/fig-density-stack-1.pdf index 933a386e..52e4915b 100644 Binary files a/visualization-advanced_files/figure-pdf/fig-density-stack-1.pdf and b/visualization-advanced_files/figure-pdf/fig-density-stack-1.pdf differ diff --git a/visualization-advanced_files/figure-pdf/fig-density-stack-2.pdf b/visualization-advanced_files/figure-pdf/fig-density-stack-2.pdf index 8428fc07..6b375e8e 100644 Binary files a/visualization-advanced_files/figure-pdf/fig-density-stack-2.pdf and b/visualization-advanced_files/figure-pdf/fig-density-stack-2.pdf differ diff --git a/visualization-advanced_files/figure-pdf/fig-ggplot2-boxplot-1.pdf b/visualization-advanced_files/figure-pdf/fig-ggplot2-boxplot-1.pdf index f4c18f93..c4aa3a1a 100644 Binary files a/visualization-advanced_files/figure-pdf/fig-ggplot2-boxplot-1.pdf and b/visualization-advanced_files/figure-pdf/fig-ggplot2-boxplot-1.pdf differ diff --git a/visualization-advanced_files/figure-pdf/fig-ggplot2-boxplot-2.pdf b/visualization-advanced_files/figure-pdf/fig-ggplot2-boxplot-2.pdf index 3cf39876..03e90ca9 100644 Binary files a/visualization-advanced_files/figure-pdf/fig-ggplot2-boxplot-2.pdf and b/visualization-advanced_files/figure-pdf/fig-ggplot2-boxplot-2.pdf differ diff --git a/visualization-advanced_files/figure-pdf/fig-ggplot2-boxplot-3.pdf b/visualization-advanced_files/figure-pdf/fig-ggplot2-boxplot-3.pdf index fe13a2c6..36aea0a0 100644 Binary files a/visualization-advanced_files/figure-pdf/fig-ggplot2-boxplot-3.pdf and b/visualization-advanced_files/figure-pdf/fig-ggplot2-boxplot-3.pdf differ diff --git 
a/visualization-advanced_files/figure-pdf/fig-ggplot2-stripchart-1.pdf b/visualization-advanced_files/figure-pdf/fig-ggplot2-stripchart-1.pdf index 69bdc44b..026abd9f 100644 Binary files a/visualization-advanced_files/figure-pdf/fig-ggplot2-stripchart-1.pdf and b/visualization-advanced_files/figure-pdf/fig-ggplot2-stripchart-1.pdf differ diff --git a/visualization-advanced_files/figure-pdf/fig-ggplot2-stripchart-2.pdf b/visualization-advanced_files/figure-pdf/fig-ggplot2-stripchart-2.pdf index c87a0f64..68df1a8d 100644 Binary files a/visualization-advanced_files/figure-pdf/fig-ggplot2-stripchart-2.pdf and b/visualization-advanced_files/figure-pdf/fig-ggplot2-stripchart-2.pdf differ diff --git a/visualization-advanced_files/figure-pdf/fig-ggplot2-stripchart-3.pdf b/visualization-advanced_files/figure-pdf/fig-ggplot2-stripchart-3.pdf index 946ed6ec..2d95f447 100644 Binary files a/visualization-advanced_files/figure-pdf/fig-ggplot2-stripchart-3.pdf and b/visualization-advanced_files/figure-pdf/fig-ggplot2-stripchart-3.pdf differ diff --git a/visualization-advanced_files/figure-pdf/fig-ggplot2-violin-1.pdf b/visualization-advanced_files/figure-pdf/fig-ggplot2-violin-1.pdf index 3a09f0f0..4cbbf4e8 100644 Binary files a/visualization-advanced_files/figure-pdf/fig-ggplot2-violin-1.pdf and b/visualization-advanced_files/figure-pdf/fig-ggplot2-violin-1.pdf differ diff --git a/visualization-advanced_files/figure-pdf/fig-ggplot2-violin-2.pdf b/visualization-advanced_files/figure-pdf/fig-ggplot2-violin-2.pdf index d40fa091..41ef2c15 100644 Binary files a/visualization-advanced_files/figure-pdf/fig-ggplot2-violin-2.pdf and b/visualization-advanced_files/figure-pdf/fig-ggplot2-violin-2.pdf differ diff --git a/visualization-advanced_files/figure-pdf/fig-ggplot2-violin-3.pdf b/visualization-advanced_files/figure-pdf/fig-ggplot2-violin-3.pdf index 4ec73128..2b5e3f2d 100644 Binary files a/visualization-advanced_files/figure-pdf/fig-ggplot2-violin-3.pdf and 
b/visualization-advanced_files/figure-pdf/fig-ggplot2-violin-3.pdf differ diff --git a/visualization-advanced_files/figure-pdf/fig-histogram-1.pdf b/visualization-advanced_files/figure-pdf/fig-histogram-1.pdf index d0aff74e..4a371ad9 100644 Binary files a/visualization-advanced_files/figure-pdf/fig-histogram-1.pdf and b/visualization-advanced_files/figure-pdf/fig-histogram-1.pdf differ diff --git a/visualization-advanced_files/figure-pdf/fig-histogram-2.pdf b/visualization-advanced_files/figure-pdf/fig-histogram-2.pdf index b528e07c..441666be 100644 Binary files a/visualization-advanced_files/figure-pdf/fig-histogram-2.pdf and b/visualization-advanced_files/figure-pdf/fig-histogram-2.pdf differ diff --git a/visualization-advanced_files/figure-pdf/fig-lm-diagnostics-1.pdf b/visualization-advanced_files/figure-pdf/fig-lm-diagnostics-1.pdf index 02161b69..6e7446d8 100644 Binary files a/visualization-advanced_files/figure-pdf/fig-lm-diagnostics-1.pdf and b/visualization-advanced_files/figure-pdf/fig-lm-diagnostics-1.pdf differ diff --git a/visualization-advanced_files/figure-pdf/fig-marginal-effects-1.pdf b/visualization-advanced_files/figure-pdf/fig-marginal-effects-1.pdf index 8830ca70..b2d3c477 100644 Binary files a/visualization-advanced_files/figure-pdf/fig-marginal-effects-1.pdf and b/visualization-advanced_files/figure-pdf/fig-marginal-effects-1.pdf differ diff --git a/visualization-advanced_files/figure-pdf/fig-plant-growth-1.pdf b/visualization-advanced_files/figure-pdf/fig-plant-growth-1.pdf index 8b665fcf..eb985bd2 100644 Binary files a/visualization-advanced_files/figure-pdf/fig-plant-growth-1.pdf and b/visualization-advanced_files/figure-pdf/fig-plant-growth-1.pdf differ diff --git a/visualization-advanced_files/figure-pdf/fig-ridge-line-1.pdf b/visualization-advanced_files/figure-pdf/fig-ridge-line-1.pdf index d8681c73..be336370 100644 Binary files a/visualization-advanced_files/figure-pdf/fig-ridge-line-1.pdf and 
b/visualization-advanced_files/figure-pdf/fig-ridge-line-1.pdf differ diff --git a/visualization-advanced_files/figure-pdf/fig-ridge-line-2.pdf b/visualization-advanced_files/figure-pdf/fig-ridge-line-2.pdf index 388b5f5b..dada1ffe 100644 Binary files a/visualization-advanced_files/figure-pdf/fig-ridge-line-2.pdf and b/visualization-advanced_files/figure-pdf/fig-ridge-line-2.pdf differ diff --git a/visualization-advanced_files/figure-pdf/fig-ridge-line-3.pdf b/visualization-advanced_files/figure-pdf/fig-ridge-line-3.pdf index 3f355fd4..e5e57ead 100644 Binary files a/visualization-advanced_files/figure-pdf/fig-ridge-line-3.pdf and b/visualization-advanced_files/figure-pdf/fig-ridge-line-3.pdf differ diff --git a/visualization-advanced_files/figure-pdf/fig-school-graph-1.pdf b/visualization-advanced_files/figure-pdf/fig-school-graph-1.pdf index f2f6613d..7740d431 100644 Binary files a/visualization-advanced_files/figure-pdf/fig-school-graph-1.pdf and b/visualization-advanced_files/figure-pdf/fig-school-graph-1.pdf differ diff --git a/visualization-advanced_files/figure-pdf/fig-signif-1.pdf b/visualization-advanced_files/figure-pdf/fig-signif-1.pdf index 32a05089..5f329286 100644 Binary files a/visualization-advanced_files/figure-pdf/fig-signif-1.pdf and b/visualization-advanced_files/figure-pdf/fig-signif-1.pdf differ diff --git a/visualization-advanced_files/figure-pdf/fig-sina-1.pdf b/visualization-advanced_files/figure-pdf/fig-sina-1.pdf index b46f153a..a90ee4eb 100644 Binary files a/visualization-advanced_files/figure-pdf/fig-sina-1.pdf and b/visualization-advanced_files/figure-pdf/fig-sina-1.pdf differ diff --git a/visualization-advanced_files/figure-pdf/fig-sina-2.pdf b/visualization-advanced_files/figure-pdf/fig-sina-2.pdf index ef2cef5a..b68a5ca9 100644 Binary files a/visualization-advanced_files/figure-pdf/fig-sina-2.pdf and b/visualization-advanced_files/figure-pdf/fig-sina-2.pdf differ diff --git a/visualization-advanced_files/figure-pdf/fig-sina-3.pdf 
b/visualization-advanced_files/figure-pdf/fig-sina-3.pdf index 663de5bb..eb7f8277 100644 Binary files a/visualization-advanced_files/figure-pdf/fig-sina-3.pdf and b/visualization-advanced_files/figure-pdf/fig-sina-3.pdf differ diff --git a/visualization-advanced_files/figure-pdf/fig-sina-4.pdf b/visualization-advanced_files/figure-pdf/fig-sina-4.pdf index 787d15b9..f1986859 100644 Binary files a/visualization-advanced_files/figure-pdf/fig-sina-4.pdf and b/visualization-advanced_files/figure-pdf/fig-sina-4.pdf differ diff --git a/visualization-advanced_files/figure-pdf/fig-smooth-1.pdf b/visualization-advanced_files/figure-pdf/fig-smooth-1.pdf index 598ab450..c89c1716 100644 Binary files a/visualization-advanced_files/figure-pdf/fig-smooth-1.pdf and b/visualization-advanced_files/figure-pdf/fig-smooth-1.pdf differ diff --git a/visualization-advanced_files/figure-pdf/fig-venn-1.pdf b/visualization-advanced_files/figure-pdf/fig-venn-1.pdf index 0f16f193..5a128c27 100644 Binary files a/visualization-advanced_files/figure-pdf/fig-venn-1.pdf and b/visualization-advanced_files/figure-pdf/fig-venn-1.pdf differ diff --git a/visualization-basic_files/figure-pdf/fig-animate-1.png b/visualization-basic_files/figure-pdf/fig-animate-1.png index 0e475e0c..85119c73 100644 Binary files a/visualization-basic_files/figure-pdf/fig-animate-1.png and b/visualization-basic_files/figure-pdf/fig-animate-1.png differ diff --git a/visualization-basic_files/figure-pdf/fig-animate-10.png b/visualization-basic_files/figure-pdf/fig-animate-10.png index ca69983e..d4cdc66e 100644 Binary files a/visualization-basic_files/figure-pdf/fig-animate-10.png and b/visualization-basic_files/figure-pdf/fig-animate-10.png differ diff --git a/visualization-basic_files/figure-pdf/fig-animate-11.png b/visualization-basic_files/figure-pdf/fig-animate-11.png index 3220da1e..45af515c 100644 Binary files a/visualization-basic_files/figure-pdf/fig-animate-11.png and 
b/visualization-basic_files/figure-pdf/fig-animate-11.png differ diff --git a/visualization-basic_files/figure-pdf/fig-animate-12.png b/visualization-basic_files/figure-pdf/fig-animate-12.png index 49e3ef0e..3edbd8fa 100644 Binary files a/visualization-basic_files/figure-pdf/fig-animate-12.png and b/visualization-basic_files/figure-pdf/fig-animate-12.png differ diff --git a/visualization-basic_files/figure-pdf/fig-animate-13.png b/visualization-basic_files/figure-pdf/fig-animate-13.png index ad432eaf..6f478a13 100644 Binary files a/visualization-basic_files/figure-pdf/fig-animate-13.png and b/visualization-basic_files/figure-pdf/fig-animate-13.png differ diff --git a/visualization-basic_files/figure-pdf/fig-animate-14.png b/visualization-basic_files/figure-pdf/fig-animate-14.png index 54f5a506..b6663ff3 100644 Binary files a/visualization-basic_files/figure-pdf/fig-animate-14.png and b/visualization-basic_files/figure-pdf/fig-animate-14.png differ diff --git a/visualization-basic_files/figure-pdf/fig-animate-15.png b/visualization-basic_files/figure-pdf/fig-animate-15.png index 68381f7b..e70c17d7 100644 Binary files a/visualization-basic_files/figure-pdf/fig-animate-15.png and b/visualization-basic_files/figure-pdf/fig-animate-15.png differ diff --git a/visualization-basic_files/figure-pdf/fig-animate-16.png b/visualization-basic_files/figure-pdf/fig-animate-16.png index aead69b9..aa13bc3f 100644 Binary files a/visualization-basic_files/figure-pdf/fig-animate-16.png and b/visualization-basic_files/figure-pdf/fig-animate-16.png differ diff --git a/visualization-basic_files/figure-pdf/fig-animate-17.png b/visualization-basic_files/figure-pdf/fig-animate-17.png index 708c4978..03fc3e15 100644 Binary files a/visualization-basic_files/figure-pdf/fig-animate-17.png and b/visualization-basic_files/figure-pdf/fig-animate-17.png differ diff --git a/visualization-basic_files/figure-pdf/fig-animate-18.png b/visualization-basic_files/figure-pdf/fig-animate-18.png index 
2c74aa38..1f4fdc00 100644 Binary files a/visualization-basic_files/figure-pdf/fig-animate-18.png and b/visualization-basic_files/figure-pdf/fig-animate-18.png differ diff --git a/visualization-basic_files/figure-pdf/fig-animate-19.png b/visualization-basic_files/figure-pdf/fig-animate-19.png index 2ebc6270..6a544c92 100644 Binary files a/visualization-basic_files/figure-pdf/fig-animate-19.png and b/visualization-basic_files/figure-pdf/fig-animate-19.png differ diff --git a/visualization-basic_files/figure-pdf/fig-animate-2.png b/visualization-basic_files/figure-pdf/fig-animate-2.png index bb2b8950..739c00b3 100644 Binary files a/visualization-basic_files/figure-pdf/fig-animate-2.png and b/visualization-basic_files/figure-pdf/fig-animate-2.png differ diff --git a/visualization-basic_files/figure-pdf/fig-animate-20.png b/visualization-basic_files/figure-pdf/fig-animate-20.png index 8b21ae38..39f6dcee 100644 Binary files a/visualization-basic_files/figure-pdf/fig-animate-20.png and b/visualization-basic_files/figure-pdf/fig-animate-20.png differ diff --git a/visualization-basic_files/figure-pdf/fig-animate-21.png b/visualization-basic_files/figure-pdf/fig-animate-21.png index bde9af4e..3383ec56 100644 Binary files a/visualization-basic_files/figure-pdf/fig-animate-21.png and b/visualization-basic_files/figure-pdf/fig-animate-21.png differ diff --git a/visualization-basic_files/figure-pdf/fig-animate-22.png b/visualization-basic_files/figure-pdf/fig-animate-22.png index 77046a99..dfd06d70 100644 Binary files a/visualization-basic_files/figure-pdf/fig-animate-22.png and b/visualization-basic_files/figure-pdf/fig-animate-22.png differ diff --git a/visualization-basic_files/figure-pdf/fig-animate-23.png b/visualization-basic_files/figure-pdf/fig-animate-23.png index 9c906c1f..db246c46 100644 Binary files a/visualization-basic_files/figure-pdf/fig-animate-23.png and b/visualization-basic_files/figure-pdf/fig-animate-23.png differ diff --git 
a/visualization-basic_files/figure-pdf/fig-animate-24.png b/visualization-basic_files/figure-pdf/fig-animate-24.png index fc087c5e..ed908b98 100644 Binary files a/visualization-basic_files/figure-pdf/fig-animate-24.png and b/visualization-basic_files/figure-pdf/fig-animate-24.png differ diff --git a/visualization-basic_files/figure-pdf/fig-animate-25.png b/visualization-basic_files/figure-pdf/fig-animate-25.png index 60780905..c9b0c1d1 100644 Binary files a/visualization-basic_files/figure-pdf/fig-animate-25.png and b/visualization-basic_files/figure-pdf/fig-animate-25.png differ diff --git a/visualization-basic_files/figure-pdf/fig-animate-26.png b/visualization-basic_files/figure-pdf/fig-animate-26.png index 8b1534cc..c10ea43f 100644 Binary files a/visualization-basic_files/figure-pdf/fig-animate-26.png and b/visualization-basic_files/figure-pdf/fig-animate-26.png differ diff --git a/visualization-basic_files/figure-pdf/fig-animate-27.png b/visualization-basic_files/figure-pdf/fig-animate-27.png index 9d4d1540..dd4a8dd5 100644 Binary files a/visualization-basic_files/figure-pdf/fig-animate-27.png and b/visualization-basic_files/figure-pdf/fig-animate-27.png differ diff --git a/visualization-basic_files/figure-pdf/fig-animate-28.png b/visualization-basic_files/figure-pdf/fig-animate-28.png index 750e0f60..59620192 100644 Binary files a/visualization-basic_files/figure-pdf/fig-animate-28.png and b/visualization-basic_files/figure-pdf/fig-animate-28.png differ diff --git a/visualization-basic_files/figure-pdf/fig-animate-29.png b/visualization-basic_files/figure-pdf/fig-animate-29.png index 3422532c..d73afea7 100644 Binary files a/visualization-basic_files/figure-pdf/fig-animate-29.png and b/visualization-basic_files/figure-pdf/fig-animate-29.png differ diff --git a/visualization-basic_files/figure-pdf/fig-animate-3.png b/visualization-basic_files/figure-pdf/fig-animate-3.png index ee53b36d..56fbfc6c 100644 Binary files 
a/visualization-basic_files/figure-pdf/fig-animate-3.png and b/visualization-basic_files/figure-pdf/fig-animate-3.png differ diff --git a/visualization-basic_files/figure-pdf/fig-animate-30.png b/visualization-basic_files/figure-pdf/fig-animate-30.png index 4d7e8c6a..e17398e0 100644 Binary files a/visualization-basic_files/figure-pdf/fig-animate-30.png and b/visualization-basic_files/figure-pdf/fig-animate-30.png differ diff --git a/visualization-basic_files/figure-pdf/fig-animate-4.png b/visualization-basic_files/figure-pdf/fig-animate-4.png index af4e7373..a5ed6684 100644 Binary files a/visualization-basic_files/figure-pdf/fig-animate-4.png and b/visualization-basic_files/figure-pdf/fig-animate-4.png differ diff --git a/visualization-basic_files/figure-pdf/fig-animate-5.png b/visualization-basic_files/figure-pdf/fig-animate-5.png index d5a8035d..20b16f8c 100644 Binary files a/visualization-basic_files/figure-pdf/fig-animate-5.png and b/visualization-basic_files/figure-pdf/fig-animate-5.png differ diff --git a/visualization-basic_files/figure-pdf/fig-animate-6.png b/visualization-basic_files/figure-pdf/fig-animate-6.png index 7cb95010..229dc6f6 100644 Binary files a/visualization-basic_files/figure-pdf/fig-animate-6.png and b/visualization-basic_files/figure-pdf/fig-animate-6.png differ diff --git a/visualization-basic_files/figure-pdf/fig-animate-7.png b/visualization-basic_files/figure-pdf/fig-animate-7.png index ac5373b5..52351c15 100644 Binary files a/visualization-basic_files/figure-pdf/fig-animate-7.png and b/visualization-basic_files/figure-pdf/fig-animate-7.png differ diff --git a/visualization-basic_files/figure-pdf/fig-animate-8.png b/visualization-basic_files/figure-pdf/fig-animate-8.png index fd8c50e8..e901eeea 100644 Binary files a/visualization-basic_files/figure-pdf/fig-animate-8.png and b/visualization-basic_files/figure-pdf/fig-animate-8.png differ diff --git a/visualization-basic_files/figure-pdf/fig-animate-9.png 
b/visualization-basic_files/figure-pdf/fig-animate-9.png index 47b7da1f..7b9ccaf3 100644 Binary files a/visualization-basic_files/figure-pdf/fig-animate-9.png and b/visualization-basic_files/figure-pdf/fig-animate-9.png differ diff --git a/visualization-basic_files/figure-pdf/fig-arts-1.pdf b/visualization-basic_files/figure-pdf/fig-arts-1.pdf index 828e7098..43f55a50 100644 Binary files a/visualization-basic_files/figure-pdf/fig-arts-1.pdf and b/visualization-basic_files/figure-pdf/fig-arts-1.pdf differ diff --git a/visualization-basic_files/figure-pdf/fig-arts-2.pdf b/visualization-basic_files/figure-pdf/fig-arts-2.pdf index 2410630e..f5671417 100644 Binary files a/visualization-basic_files/figure-pdf/fig-arts-2.pdf and b/visualization-basic_files/figure-pdf/fig-arts-2.pdf differ diff --git a/visualization-basic_files/figure-pdf/fig-arts-3.pdf b/visualization-basic_files/figure-pdf/fig-arts-3.pdf index 9c4087f6..4f953d34 100644 Binary files a/visualization-basic_files/figure-pdf/fig-arts-3.pdf and b/visualization-basic_files/figure-pdf/fig-arts-3.pdf differ diff --git a/visualization-basic_files/figure-pdf/fig-arts-4.pdf b/visualization-basic_files/figure-pdf/fig-arts-4.pdf index 4898dd98..04332368 100644 Binary files a/visualization-basic_files/figure-pdf/fig-arts-4.pdf and b/visualization-basic_files/figure-pdf/fig-arts-4.pdf differ diff --git a/visualization-basic_files/figure-pdf/fig-color-brewer-1.pdf b/visualization-basic_files/figure-pdf/fig-color-brewer-1.pdf index bcfa4d44..0b36aaf8 100644 Binary files a/visualization-basic_files/figure-pdf/fig-color-brewer-1.pdf and b/visualization-basic_files/figure-pdf/fig-color-brewer-1.pdf differ diff --git a/visualization-basic_files/figure-pdf/fig-color-manual-1.pdf b/visualization-basic_files/figure-pdf/fig-color-manual-1.pdf index 3c4793e1..7e6a197b 100644 Binary files a/visualization-basic_files/figure-pdf/fig-color-manual-1.pdf and b/visualization-basic_files/figure-pdf/fig-color-manual-1.pdf differ diff --git 
a/visualization-basic_files/figure-pdf/fig-facet-ncol-1.pdf b/visualization-basic_files/figure-pdf/fig-facet-ncol-1.pdf index 449f92f3..935f23c8 100644 Binary files a/visualization-basic_files/figure-pdf/fig-facet-ncol-1.pdf and b/visualization-basic_files/figure-pdf/fig-facet-ncol-1.pdf differ diff --git a/visualization-basic_files/figure-pdf/fig-facet-wrap-1.pdf b/visualization-basic_files/figure-pdf/fig-facet-wrap-1.pdf index 0da3f224..e425d78a 100644 Binary files a/visualization-basic_files/figure-pdf/fig-facet-wrap-1.pdf and b/visualization-basic_files/figure-pdf/fig-facet-wrap-1.pdf differ diff --git a/visualization-basic_files/figure-pdf/fig-label-1.pdf b/visualization-basic_files/figure-pdf/fig-label-1.pdf index 6a528ee0..e989d195 100644 Binary files a/visualization-basic_files/figure-pdf/fig-label-1.pdf and b/visualization-basic_files/figure-pdf/fig-label-1.pdf differ diff --git a/visualization-basic_files/figure-pdf/fig-label-2.pdf b/visualization-basic_files/figure-pdf/fig-label-2.pdf index 89aaf0ec..ca082c58 100644 Binary files a/visualization-basic_files/figure-pdf/fig-label-2.pdf and b/visualization-basic_files/figure-pdf/fig-label-2.pdf differ diff --git a/visualization-basic_files/figure-pdf/fig-layer-point-1.pdf b/visualization-basic_files/figure-pdf/fig-layer-point-1.pdf index 3339409f..fb5f7e64 100644 Binary files a/visualization-basic_files/figure-pdf/fig-layer-point-1.pdf and b/visualization-basic_files/figure-pdf/fig-layer-point-1.pdf differ diff --git a/visualization-basic_files/figure-pdf/fig-layer-point-2.pdf b/visualization-basic_files/figure-pdf/fig-layer-point-2.pdf index ff82bbe0..56c9a24e 100644 Binary files a/visualization-basic_files/figure-pdf/fig-layer-point-2.pdf and b/visualization-basic_files/figure-pdf/fig-layer-point-2.pdf differ diff --git a/visualization-basic_files/figure-pdf/fig-layer-point-3.pdf b/visualization-basic_files/figure-pdf/fig-layer-point-3.pdf index f3d80494..6eb4421e 100644 Binary files 
a/visualization-basic_files/figure-pdf/fig-layer-point-3.pdf and b/visualization-basic_files/figure-pdf/fig-layer-point-3.pdf differ diff --git a/visualization-basic_files/figure-pdf/fig-layer-point-4.pdf b/visualization-basic_files/figure-pdf/fig-layer-point-4.pdf index ef917a9d..d31436c5 100644 Binary files a/visualization-basic_files/figure-pdf/fig-layer-point-4.pdf and b/visualization-basic_files/figure-pdf/fig-layer-point-4.pdf differ diff --git a/visualization-basic_files/figure-pdf/fig-legend-label-1.pdf b/visualization-basic_files/figure-pdf/fig-legend-label-1.pdf index ab2158a0..fefde71f 100644 Binary files a/visualization-basic_files/figure-pdf/fig-legend-label-1.pdf and b/visualization-basic_files/figure-pdf/fig-legend-label-1.pdf differ diff --git a/visualization-basic_files/figure-pdf/fig-legend-show-1.pdf b/visualization-basic_files/figure-pdf/fig-legend-show-1.pdf index 70761ecc..2890cf6f 100644 Binary files a/visualization-basic_files/figure-pdf/fig-legend-show-1.pdf and b/visualization-basic_files/figure-pdf/fig-legend-show-1.pdf differ diff --git a/visualization-basic_files/figure-pdf/fig-patchwork-1.pdf b/visualization-basic_files/figure-pdf/fig-patchwork-1.pdf index a99a2558..913b9f8a 100644 Binary files a/visualization-basic_files/figure-pdf/fig-patchwork-1.pdf and b/visualization-basic_files/figure-pdf/fig-patchwork-1.pdf differ diff --git a/visualization-basic_files/figure-pdf/fig-scale-dollar-1.pdf b/visualization-basic_files/figure-pdf/fig-scale-dollar-1.pdf index 7cc22383..80413be3 100644 Binary files a/visualization-basic_files/figure-pdf/fig-scale-dollar-1.pdf and b/visualization-basic_files/figure-pdf/fig-scale-dollar-1.pdf differ diff --git a/visualization-basic_files/figure-pdf/fig-scale-labels-1.pdf b/visualization-basic_files/figure-pdf/fig-scale-labels-1.pdf index 6dbfdc59..03817367 100644 Binary files a/visualization-basic_files/figure-pdf/fig-scale-labels-1.pdf and b/visualization-basic_files/figure-pdf/fig-scale-labels-1.pdf 
differ diff --git a/visualization-basic_files/figure-pdf/fig-scale-limits-1.pdf b/visualization-basic_files/figure-pdf/fig-scale-limits-1.pdf index b56decbc..e5b09b7a 100644 Binary files a/visualization-basic_files/figure-pdf/fig-scale-limits-1.pdf and b/visualization-basic_files/figure-pdf/fig-scale-limits-1.pdf differ diff --git a/visualization-basic_files/figure-pdf/fig-scale-log10-1.pdf b/visualization-basic_files/figure-pdf/fig-scale-log10-1.pdf index fa92cf3d..191c37ab 100644 Binary files a/visualization-basic_files/figure-pdf/fig-scale-log10-1.pdf and b/visualization-basic_files/figure-pdf/fig-scale-log10-1.pdf differ diff --git a/visualization-basic_files/figure-pdf/fig-scale-minor-breaks-1.pdf b/visualization-basic_files/figure-pdf/fig-scale-minor-breaks-1.pdf index 0eaa7c0b..d775f6a1 100644 Binary files a/visualization-basic_files/figure-pdf/fig-scale-minor-breaks-1.pdf and b/visualization-basic_files/figure-pdf/fig-scale-minor-breaks-1.pdf differ diff --git a/visualization-basic_files/figure-pdf/fig-text-1.pdf b/visualization-basic_files/figure-pdf/fig-text-1.pdf index 580e3f8f..69e8f7e6 100644 Binary files a/visualization-basic_files/figure-pdf/fig-text-1.pdf and b/visualization-basic_files/figure-pdf/fig-text-1.pdf differ diff --git a/visualization-basic_files/figure-pdf/fig-text-ggrepel-1.pdf b/visualization-basic_files/figure-pdf/fig-text-ggrepel-1.pdf index 917ec897..308d2abc 100644 Binary files a/visualization-basic_files/figure-pdf/fig-text-ggrepel-1.pdf and b/visualization-basic_files/figure-pdf/fig-text-ggrepel-1.pdf differ diff --git a/visualization-basic_files/figure-pdf/fig-theme-classic-1.pdf b/visualization-basic_files/figure-pdf/fig-theme-classic-1.pdf index ec3f51ba..8a67ef7a 100644 Binary files a/visualization-basic_files/figure-pdf/fig-theme-classic-1.pdf and b/visualization-basic_files/figure-pdf/fig-theme-classic-1.pdf differ diff --git a/visualization-basic_files/figure-pdf/fig-theme-custom-1.pdf 
b/visualization-basic_files/figure-pdf/fig-theme-custom-1.pdf index 34350e2f..55c6dd32 100644 Binary files a/visualization-basic_files/figure-pdf/fig-theme-custom-1.pdf and b/visualization-basic_files/figure-pdf/fig-theme-custom-1.pdf differ diff --git a/visualization-basic_files/figure-pdf/fig-theme-position-1.pdf b/visualization-basic_files/figure-pdf/fig-theme-position-1.pdf index dc6b8f64..9d3016a2 100644 Binary files a/visualization-basic_files/figure-pdf/fig-theme-position-1.pdf and b/visualization-basic_files/figure-pdf/fig-theme-position-1.pdf differ diff --git a/visualization-basic_files/figure-pdf/fig-theme-tufte-1.pdf b/visualization-basic_files/figure-pdf/fig-theme-tufte-1.pdf index c157c1f5..2b92ecb4 100644 Binary files a/visualization-basic_files/figure-pdf/fig-theme-tufte-1.pdf and b/visualization-basic_files/figure-pdf/fig-theme-tufte-1.pdf differ diff --git a/visualization-graphics_files/figure-html/fig-contour-1.pdf b/visualization-graphics_files/figure-html/fig-contour-1.pdf index 445f52cd..37cafbe4 100644 Binary files a/visualization-graphics_files/figure-html/fig-contour-1.pdf and b/visualization-graphics_files/figure-html/fig-contour-1.pdf differ diff --git a/visualization-graphics_files/figure-html/fig-contour-1.tex b/visualization-graphics_files/figure-html/fig-contour-1.tex index c5b30f6f..15598744 100644 --- a/visualization-graphics_files/figure-html/fig-contour-1.tex +++ b/visualization-graphics_files/figure-html/fig-contour-1.tex @@ -1,4 +1,4 @@ -% Created by tikzDevice version 0.12.6 on 2024-02-01 06:05:23 +% Created by tikzDevice version 0.12.6 on 2024-02-05 04:55:16 % !TEX encoding = UTF-8 Unicode \documentclass[tikz]{standalone} diff --git a/visualization-graphics_files/figure-html/fig-filled-contour-1.pdf b/visualization-graphics_files/figure-html/fig-filled-contour-1.pdf index 71715f3b..5135306a 100644 Binary files a/visualization-graphics_files/figure-html/fig-filled-contour-1.pdf and 
b/visualization-graphics_files/figure-html/fig-filled-contour-1.pdf differ diff --git a/visualization-graphics_files/figure-html/fig-filled-contour-1.tex b/visualization-graphics_files/figure-html/fig-filled-contour-1.tex index c0f0334c..17322700 100644 --- a/visualization-graphics_files/figure-html/fig-filled-contour-1.tex +++ b/visualization-graphics_files/figure-html/fig-filled-contour-1.tex @@ -1,4 +1,4 @@ -% Created by tikzDevice version 0.12.6 on 2024-02-01 06:05:39 +% Created by tikzDevice version 0.12.6 on 2024-02-05 04:55:33 % !TEX encoding = UTF-8 Unicode \documentclass[tikz]{standalone} diff --git a/visualization-graphics_files/figure-html/fig-image-1.pdf b/visualization-graphics_files/figure-html/fig-image-1.pdf index d3ee156c..13ee8e37 100644 Binary files a/visualization-graphics_files/figure-html/fig-image-1.pdf and b/visualization-graphics_files/figure-html/fig-image-1.pdf differ diff --git a/visualization-graphics_files/figure-html/fig-image-1.tex b/visualization-graphics_files/figure-html/fig-image-1.tex index bbdc0083..6d3f858f 100644 --- a/visualization-graphics_files/figure-html/fig-image-1.tex +++ b/visualization-graphics_files/figure-html/fig-image-1.tex @@ -1,4 +1,4 @@ -% Created by tikzDevice version 0.12.6 on 2024-02-01 06:04:57 +% Created by tikzDevice version 0.12.6 on 2024-02-05 04:54:48 % !TEX encoding = UTF-8 Unicode \documentclass[tikz]{standalone} diff --git a/visualization-graphics_files/figure-html/fig-persp-1.pdf b/visualization-graphics_files/figure-html/fig-persp-1.pdf index d79f9ae7..f27baab7 100644 Binary files a/visualization-graphics_files/figure-html/fig-persp-1.pdf and b/visualization-graphics_files/figure-html/fig-persp-1.pdf differ diff --git a/visualization-graphics_files/figure-html/fig-persp-1.tex b/visualization-graphics_files/figure-html/fig-persp-1.tex index cacd854f..b2e346b8 100644 --- a/visualization-graphics_files/figure-html/fig-persp-1.tex +++ b/visualization-graphics_files/figure-html/fig-persp-1.tex @@ -1,4 
+1,4 @@ -% Created by tikzDevice version 0.12.6 on 2024-02-01 06:05:04 +% Created by tikzDevice version 0.12.6 on 2024-02-05 04:54:56 % !TEX encoding = UTF-8 Unicode \documentclass[tikz]{standalone} diff --git a/visualization-graphics_files/figure-pdf/fig-anscombe-1.pdf b/visualization-graphics_files/figure-pdf/fig-anscombe-1.pdf index 4742ee51..8c875a90 100644 Binary files a/visualization-graphics_files/figure-pdf/fig-anscombe-1.pdf and b/visualization-graphics_files/figure-pdf/fig-anscombe-1.pdf differ diff --git a/visualization-graphics_files/figure-pdf/fig-contour-1.pdf b/visualization-graphics_files/figure-pdf/fig-contour-1.pdf index 1013ebcd..7d8b3a40 100644 Binary files a/visualization-graphics_files/figure-pdf/fig-contour-1.pdf and b/visualization-graphics_files/figure-pdf/fig-contour-1.pdf differ diff --git a/visualization-graphics_files/figure-pdf/fig-contour-1.png b/visualization-graphics_files/figure-pdf/fig-contour-1.png index 94b26036..afe3cb5f 100644 Binary files a/visualization-graphics_files/figure-pdf/fig-contour-1.png and b/visualization-graphics_files/figure-pdf/fig-contour-1.png differ diff --git a/visualization-graphics_files/figure-pdf/fig-contour-1.tex b/visualization-graphics_files/figure-pdf/fig-contour-1.tex index 2ba30b85..b16466c8 100644 --- a/visualization-graphics_files/figure-pdf/fig-contour-1.tex +++ b/visualization-graphics_files/figure-pdf/fig-contour-1.tex @@ -1,4 +1,4 @@ -% Created by tikzDevice version 0.12.6 on 2024-02-01 06:06:25 +% Created by tikzDevice version 0.12.6 on 2024-02-05 04:56:22 % !TEX encoding = UTF-8 Unicode \documentclass[tikz]{standalone} diff --git a/visualization-graphics_files/figure-pdf/fig-filled-contour-1.pdf b/visualization-graphics_files/figure-pdf/fig-filled-contour-1.pdf index d078dae1..d8381bcb 100644 Binary files a/visualization-graphics_files/figure-pdf/fig-filled-contour-1.pdf and b/visualization-graphics_files/figure-pdf/fig-filled-contour-1.pdf differ diff --git 
a/visualization-graphics_files/figure-pdf/fig-filled-contour-1.png b/visualization-graphics_files/figure-pdf/fig-filled-contour-1.png index 6653a2ba..e440bc43 100644 Binary files a/visualization-graphics_files/figure-pdf/fig-filled-contour-1.png and b/visualization-graphics_files/figure-pdf/fig-filled-contour-1.png differ diff --git a/visualization-graphics_files/figure-pdf/fig-filled-contour-1.tex b/visualization-graphics_files/figure-pdf/fig-filled-contour-1.tex index c08457ff..191285f2 100644 --- a/visualization-graphics_files/figure-pdf/fig-filled-contour-1.tex +++ b/visualization-graphics_files/figure-pdf/fig-filled-contour-1.tex @@ -1,4 +1,4 @@ -% Created by tikzDevice version 0.12.6 on 2024-02-01 06:06:28 +% Created by tikzDevice version 0.12.6 on 2024-02-05 04:56:25 % !TEX encoding = UTF-8 Unicode \documentclass[tikz]{standalone} diff --git a/visualization-graphics_files/figure-pdf/fig-graphics-palette-1.pdf b/visualization-graphics_files/figure-pdf/fig-graphics-palette-1.pdf index ff2d8959..a8311b9b 100644 Binary files a/visualization-graphics_files/figure-pdf/fig-graphics-palette-1.pdf and b/visualization-graphics_files/figure-pdf/fig-graphics-palette-1.pdf differ diff --git a/visualization-graphics_files/figure-pdf/fig-image-1.pdf b/visualization-graphics_files/figure-pdf/fig-image-1.pdf index 7f53e604..8b49ab4f 100644 Binary files a/visualization-graphics_files/figure-pdf/fig-image-1.pdf and b/visualization-graphics_files/figure-pdf/fig-image-1.pdf differ diff --git a/visualization-graphics_files/figure-pdf/fig-image-1.png b/visualization-graphics_files/figure-pdf/fig-image-1.png index 3646315d..5b994177 100644 Binary files a/visualization-graphics_files/figure-pdf/fig-image-1.png and b/visualization-graphics_files/figure-pdf/fig-image-1.png differ diff --git a/visualization-graphics_files/figure-pdf/fig-image-1.tex b/visualization-graphics_files/figure-pdf/fig-image-1.tex index 2ec585a0..f09ef781 100644 --- 
a/visualization-graphics_files/figure-pdf/fig-image-1.tex +++ b/visualization-graphics_files/figure-pdf/fig-image-1.tex @@ -1,4 +1,4 @@ -% Created by tikzDevice version 0.12.6 on 2024-02-01 06:06:12 +% Created by tikzDevice version 0.12.6 on 2024-02-05 04:56:09 % !TEX encoding = UTF-8 Unicode \documentclass[tikz]{standalone} diff --git a/visualization-graphics_files/figure-pdf/fig-iris-color-1.pdf b/visualization-graphics_files/figure-pdf/fig-iris-color-1.pdf index 147937b4..97ddd3ce 100644 Binary files a/visualization-graphics_files/figure-pdf/fig-iris-color-1.pdf and b/visualization-graphics_files/figure-pdf/fig-iris-color-1.pdf differ diff --git a/visualization-graphics_files/figure-pdf/fig-iris-fonts-1.pdf b/visualization-graphics_files/figure-pdf/fig-iris-fonts-1.pdf index acdc2820..3c1b5add 100644 Binary files a/visualization-graphics_files/figure-pdf/fig-iris-fonts-1.pdf and b/visualization-graphics_files/figure-pdf/fig-iris-fonts-1.pdf differ diff --git a/visualization-graphics_files/figure-pdf/fig-iris-group1-1.pdf b/visualization-graphics_files/figure-pdf/fig-iris-group1-1.pdf index cec9e2fe..f01527d4 100644 Binary files a/visualization-graphics_files/figure-pdf/fig-iris-group1-1.pdf and b/visualization-graphics_files/figure-pdf/fig-iris-group1-1.pdf differ diff --git a/visualization-graphics_files/figure-pdf/fig-iris-group2-1.pdf b/visualization-graphics_files/figure-pdf/fig-iris-group2-1.pdf index 5c287a23..f19a89fa 100644 Binary files a/visualization-graphics_files/figure-pdf/fig-iris-group2-1.pdf and b/visualization-graphics_files/figure-pdf/fig-iris-group2-1.pdf differ diff --git a/visualization-graphics_files/figure-pdf/fig-iris-label-1.pdf b/visualization-graphics_files/figure-pdf/fig-iris-label-1.pdf index ea35406f..be7afa27 100644 Binary files a/visualization-graphics_files/figure-pdf/fig-iris-label-1.pdf and b/visualization-graphics_files/figure-pdf/fig-iris-label-1.pdf differ diff --git 
a/visualization-graphics_files/figure-pdf/fig-iris-legend1-1.pdf b/visualization-graphics_files/figure-pdf/fig-iris-legend1-1.pdf index d89bd514..4a6fb90e 100644 Binary files a/visualization-graphics_files/figure-pdf/fig-iris-legend1-1.pdf and b/visualization-graphics_files/figure-pdf/fig-iris-legend1-1.pdf differ diff --git a/visualization-graphics_files/figure-pdf/fig-iris-legend2-1.pdf b/visualization-graphics_files/figure-pdf/fig-iris-legend2-1.pdf index 49e80ac0..f70a2bb4 100644 Binary files a/visualization-graphics_files/figure-pdf/fig-iris-legend2-1.pdf and b/visualization-graphics_files/figure-pdf/fig-iris-legend2-1.pdf differ diff --git a/visualization-graphics_files/figure-pdf/fig-iris-legend3-1.pdf b/visualization-graphics_files/figure-pdf/fig-iris-legend3-1.pdf index e26c0c6b..94479eb0 100644 Binary files a/visualization-graphics_files/figure-pdf/fig-iris-legend3-1.pdf and b/visualization-graphics_files/figure-pdf/fig-iris-legend3-1.pdf differ diff --git a/visualization-graphics_files/figure-pdf/fig-iris-lm-1.pdf b/visualization-graphics_files/figure-pdf/fig-iris-lm-1.pdf index 5bc88ff0..a4318acb 100644 Binary files a/visualization-graphics_files/figure-pdf/fig-iris-lm-1.pdf and b/visualization-graphics_files/figure-pdf/fig-iris-lm-1.pdf differ diff --git a/visualization-graphics_files/figure-pdf/fig-iris-plot-1.pdf b/visualization-graphics_files/figure-pdf/fig-iris-plot-1.pdf index 27ba135e..11fae21d 100644 Binary files a/visualization-graphics_files/figure-pdf/fig-iris-plot-1.pdf and b/visualization-graphics_files/figure-pdf/fig-iris-plot-1.pdf differ diff --git a/visualization-graphics_files/figure-pdf/fig-iris-plot-2.pdf b/visualization-graphics_files/figure-pdf/fig-iris-plot-2.pdf index babc6177..7ef4a62e 100644 Binary files a/visualization-graphics_files/figure-pdf/fig-iris-plot-2.pdf and b/visualization-graphics_files/figure-pdf/fig-iris-plot-2.pdf differ diff --git a/visualization-graphics_files/figure-pdf/fig-iris-text-1.pdf 
b/visualization-graphics_files/figure-pdf/fig-iris-text-1.pdf index 29bb51a8..2d33743a 100644 Binary files a/visualization-graphics_files/figure-pdf/fig-iris-text-1.pdf and b/visualization-graphics_files/figure-pdf/fig-iris-text-1.pdf differ diff --git a/visualization-graphics_files/figure-pdf/fig-math-annotation-1.pdf b/visualization-graphics_files/figure-pdf/fig-math-annotation-1.pdf index 3e06aab4..810c7630 100644 Binary files a/visualization-graphics_files/figure-pdf/fig-math-annotation-1.pdf and b/visualization-graphics_files/figure-pdf/fig-math-annotation-1.pdf differ diff --git a/visualization-graphics_files/figure-pdf/fig-persp-1.pdf b/visualization-graphics_files/figure-pdf/fig-persp-1.pdf index 4b67d7f9..784bd6a9 100644 Binary files a/visualization-graphics_files/figure-pdf/fig-persp-1.pdf and b/visualization-graphics_files/figure-pdf/fig-persp-1.pdf differ diff --git a/visualization-graphics_files/figure-pdf/fig-persp-1.png b/visualization-graphics_files/figure-pdf/fig-persp-1.png index 102099a7..dadfc553 100644 Binary files a/visualization-graphics_files/figure-pdf/fig-persp-1.png and b/visualization-graphics_files/figure-pdf/fig-persp-1.png differ diff --git a/visualization-graphics_files/figure-pdf/fig-persp-1.tex b/visualization-graphics_files/figure-pdf/fig-persp-1.tex index 80ee8818..8eb3be68 100644 --- a/visualization-graphics_files/figure-pdf/fig-persp-1.tex +++ b/visualization-graphics_files/figure-pdf/fig-persp-1.tex @@ -1,4 +1,4 @@ -% Created by tikzDevice version 0.12.6 on 2024-02-01 06:06:17 +% Created by tikzDevice version 0.12.6 on 2024-02-05 04:56:14 % !TEX encoding = UTF-8 Unicode \documentclass[tikz]{standalone} diff --git a/visualization-graphics_files/figure-pdf/fig-plot2-density-1.pdf b/visualization-graphics_files/figure-pdf/fig-plot2-density-1.pdf index 3bfc10ef..61c3d644 100644 Binary files a/visualization-graphics_files/figure-pdf/fig-plot2-density-1.pdf and b/visualization-graphics_files/figure-pdf/fig-plot2-density-1.pdf differ 
diff --git a/visualization-graphics_files/figure-pdf/fig-plot2-iris-1.pdf b/visualization-graphics_files/figure-pdf/fig-plot2-iris-1.pdf index 8969b5b4..5bedbfa3 100644 Binary files a/visualization-graphics_files/figure-pdf/fig-plot2-iris-1.pdf and b/visualization-graphics_files/figure-pdf/fig-plot2-iris-1.pdf differ diff --git a/visualization-graphics_files/figure-pdf/fig-plot2-legend-1.pdf b/visualization-graphics_files/figure-pdf/fig-plot2-legend-1.pdf index 5b55ec7f..031c8522 100644 Binary files a/visualization-graphics_files/figure-pdf/fig-plot2-legend-1.pdf and b/visualization-graphics_files/figure-pdf/fig-plot2-legend-1.pdf differ diff --git a/visualization-graphics_files/figure-pdf/fig-volcano-plot3d-1.pdf b/visualization-graphics_files/figure-pdf/fig-volcano-plot3d-1.pdf index 4f1191b2..51270b6a 100644 Binary files a/visualization-graphics_files/figure-pdf/fig-volcano-plot3d-1.pdf and b/visualization-graphics_files/figure-pdf/fig-volcano-plot3d-1.pdf differ diff --git a/visualization-graphics_files/figure-pdf/fig-volcano-plot3d-2.pdf b/visualization-graphics_files/figure-pdf/fig-volcano-plot3d-2.pdf index 5f43380a..1e40975c 100644 Binary files a/visualization-graphics_files/figure-pdf/fig-volcano-plot3d-2.pdf and b/visualization-graphics_files/figure-pdf/fig-volcano-plot3d-2.pdf differ diff --git a/visualization-graphics_files/figure-pdf/fig-volcano-plt-1.pdf b/visualization-graphics_files/figure-pdf/fig-volcano-plt-1.pdf index 0318dd36..036d8168 100644 Binary files a/visualization-graphics_files/figure-pdf/fig-volcano-plt-1.pdf and b/visualization-graphics_files/figure-pdf/fig-volcano-plt-1.pdf differ diff --git a/visualization-intermediate_files/figure-pdf/fig-barchart-1.pdf b/visualization-intermediate_files/figure-pdf/fig-barchart-1.pdf index 9714cf61..e1959243 100644 Binary files a/visualization-intermediate_files/figure-pdf/fig-barchart-1.pdf and b/visualization-intermediate_files/figure-pdf/fig-barchart-1.pdf differ diff --git 
a/visualization-intermediate_files/figure-pdf/fig-calendar-1.pdf b/visualization-intermediate_files/figure-pdf/fig-calendar-1.pdf index 6679844f..fea69674 100644 Binary files a/visualization-intermediate_files/figure-pdf/fig-calendar-1.pdf and b/visualization-intermediate_files/figure-pdf/fig-calendar-1.pdf differ diff --git a/visualization-intermediate_files/figure-pdf/fig-column-1.pdf b/visualization-intermediate_files/figure-pdf/fig-column-1.pdf index 61c92a8f..4551361d 100644 Binary files a/visualization-intermediate_files/figure-pdf/fig-column-1.pdf and b/visualization-intermediate_files/figure-pdf/fig-column-1.pdf differ diff --git a/visualization-intermediate_files/figure-pdf/fig-column-better-1.pdf b/visualization-intermediate_files/figure-pdf/fig-column-better-1.pdf index cf1fe411..73d3a56a 100644 Binary files a/visualization-intermediate_files/figure-pdf/fig-column-better-1.pdf and b/visualization-intermediate_files/figure-pdf/fig-column-better-1.pdf differ diff --git a/visualization-intermediate_files/figure-pdf/fig-dotchart-1.pdf b/visualization-intermediate_files/figure-pdf/fig-dotchart-1.pdf index 65529ad0..1d733809 100644 Binary files a/visualization-intermediate_files/figure-pdf/fig-dotchart-1.pdf and b/visualization-intermediate_files/figure-pdf/fig-dotchart-1.pdf differ diff --git a/visualization-intermediate_files/figure-pdf/fig-github-profile-1.pdf b/visualization-intermediate_files/figure-pdf/fig-github-profile-1.pdf index 33b477e4..5b8ba997 100644 Binary files a/visualization-intermediate_files/figure-pdf/fig-github-profile-1.pdf and b/visualization-intermediate_files/figure-pdf/fig-github-profile-1.pdf differ diff --git a/visualization-intermediate_files/figure-pdf/fig-likert-fake-1.pdf b/visualization-intermediate_files/figure-pdf/fig-likert-fake-1.pdf index c8ee816a..02b4ca89 100644 Binary files a/visualization-intermediate_files/figure-pdf/fig-likert-fake-1.pdf and b/visualization-intermediate_files/figure-pdf/fig-likert-fake-1.pdf differ 
diff --git a/visualization-intermediate_files/figure-pdf/fig-likert-scale-1.pdf b/visualization-intermediate_files/figure-pdf/fig-likert-scale-1.pdf index 0d5051e6..480e5131 100644 Binary files a/visualization-intermediate_files/figure-pdf/fig-likert-scale-1.pdf and b/visualization-intermediate_files/figure-pdf/fig-likert-scale-1.pdf differ diff --git a/visualization-intermediate_files/figure-pdf/fig-mosaic-1.pdf b/visualization-intermediate_files/figure-pdf/fig-mosaic-1.pdf index 3ad676df..bb342c06 100644 Binary files a/visualization-intermediate_files/figure-pdf/fig-mosaic-1.pdf and b/visualization-intermediate_files/figure-pdf/fig-mosaic-1.pdf differ diff --git a/visualization-intermediate_files/figure-pdf/fig-pareto-1.pdf b/visualization-intermediate_files/figure-pdf/fig-pareto-1.pdf index ff20e1ab..11eb345a 100644 Binary files a/visualization-intermediate_files/figure-pdf/fig-pareto-1.pdf and b/visualization-intermediate_files/figure-pdf/fig-pareto-1.pdf differ diff --git a/visualization-intermediate_files/figure-pdf/fig-pareto-fill-1.pdf b/visualization-intermediate_files/figure-pdf/fig-pareto-fill-1.pdf index 5cdb9186..a21995da 100644 Binary files a/visualization-intermediate_files/figure-pdf/fig-pareto-fill-1.pdf and b/visualization-intermediate_files/figure-pdf/fig-pareto-fill-1.pdf differ diff --git a/visualization-intermediate_files/figure-pdf/fig-pareto-stack-1.pdf b/visualization-intermediate_files/figure-pdf/fig-pareto-stack-1.pdf index a7cd15b1..a77376ae 100644 Binary files a/visualization-intermediate_files/figure-pdf/fig-pareto-stack-1.pdf and b/visualization-intermediate_files/figure-pdf/fig-pareto-stack-1.pdf differ diff --git a/visualization-intermediate_files/figure-pdf/fig-pie-1.pdf b/visualization-intermediate_files/figure-pdf/fig-pie-1.pdf index 2c36071b..5a967aab 100644 Binary files a/visualization-intermediate_files/figure-pdf/fig-pie-1.pdf and b/visualization-intermediate_files/figure-pdf/fig-pie-1.pdf differ diff --git 
a/visualization-intermediate_files/figure-pdf/fig-pie-bad-1.pdf b/visualization-intermediate_files/figure-pdf/fig-pie-bad-1.pdf index 140584e9..80402d92 100644 Binary files a/visualization-intermediate_files/figure-pdf/fig-pie-bad-1.pdf and b/visualization-intermediate_files/figure-pdf/fig-pie-bad-1.pdf differ diff --git a/visualization-intermediate_files/figure-pdf/fig-pie-repel-1.pdf b/visualization-intermediate_files/figure-pdf/fig-pie-repel-1.pdf index f3578ff4..12a786b5 100644 Binary files a/visualization-intermediate_files/figure-pdf/fig-pie-repel-1.pdf and b/visualization-intermediate_files/figure-pdf/fig-pie-repel-1.pdf differ diff --git a/visualization-intermediate_files/figure-pdf/fig-ring-pie-1.pdf b/visualization-intermediate_files/figure-pdf/fig-ring-pie-1.pdf index 28e54e7c..7c45e86a 100644 Binary files a/visualization-intermediate_files/figure-pdf/fig-ring-pie-1.pdf and b/visualization-intermediate_files/figure-pdf/fig-ring-pie-1.pdf differ diff --git a/visualization-intermediate_files/figure-pdf/fig-rversion-chess-1.pdf b/visualization-intermediate_files/figure-pdf/fig-rversion-chess-1.pdf index b7f0f08b..0a2fe6f6 100644 Binary files a/visualization-intermediate_files/figure-pdf/fig-rversion-chess-1.pdf and b/visualization-intermediate_files/figure-pdf/fig-rversion-chess-1.pdf differ diff --git a/visualization-intermediate_files/figure-pdf/fig-rversion-major-1.pdf b/visualization-intermediate_files/figure-pdf/fig-rversion-major-1.pdf index d3a79817..d4366c4c 100644 Binary files a/visualization-intermediate_files/figure-pdf/fig-rversion-major-1.pdf and b/visualization-intermediate_files/figure-pdf/fig-rversion-major-1.pdf differ diff --git a/visualization-intermediate_files/figure-pdf/fig-rversion-timeline-1.pdf b/visualization-intermediate_files/figure-pdf/fig-rversion-timeline-1.pdf index 0e864bac..d60d5e38 100644 Binary files a/visualization-intermediate_files/figure-pdf/fig-rversion-timeline-1.pdf and 
b/visualization-intermediate_files/figure-pdf/fig-rversion-timeline-1.pdf differ diff --git a/visualization-intermediate_files/figure-pdf/fig-streamgraph-1.pdf b/visualization-intermediate_files/figure-pdf/fig-streamgraph-1.pdf index 8fb65248..c15e9f59 100644 Binary files a/visualization-intermediate_files/figure-pdf/fig-streamgraph-1.pdf and b/visualization-intermediate_files/figure-pdf/fig-streamgraph-1.pdf differ diff --git a/visualization-intermediate_files/figure-pdf/fig-treemap-1.pdf b/visualization-intermediate_files/figure-pdf/fig-treemap-1.pdf index a2946c7b..5f4499e6 100644 Binary files a/visualization-intermediate_files/figure-pdf/fig-treemap-1.pdf and b/visualization-intermediate_files/figure-pdf/fig-treemap-1.pdf differ diff --git a/visualization-intermediate_files/figure-pdf/fig-trunk-author-1.pdf b/visualization-intermediate_files/figure-pdf/fig-trunk-author-1.pdf index 175eb836..e7a57ae6 100644 Binary files a/visualization-intermediate_files/figure-pdf/fig-trunk-author-1.pdf and b/visualization-intermediate_files/figure-pdf/fig-trunk-author-1.pdf differ diff --git a/visualization-intermediate_files/figure-pdf/fig-trunk-hour-line-1.pdf b/visualization-intermediate_files/figure-pdf/fig-trunk-hour-line-1.pdf index a84ee0d8..4d2ee152 100644 Binary files a/visualization-intermediate_files/figure-pdf/fig-trunk-hour-line-1.pdf and b/visualization-intermediate_files/figure-pdf/fig-trunk-hour-line-1.pdf differ diff --git a/visualization-intermediate_files/figure-pdf/fig-trunk-month-heatmap-1.pdf b/visualization-intermediate_files/figure-pdf/fig-trunk-month-heatmap-1.pdf index 7496e5d2..b8acbe40 100644 Binary files a/visualization-intermediate_files/figure-pdf/fig-trunk-month-heatmap-1.pdf and b/visualization-intermediate_files/figure-pdf/fig-trunk-month-heatmap-1.pdf differ diff --git a/visualization-intermediate_files/figure-pdf/fig-trunk-month-line-1.pdf b/visualization-intermediate_files/figure-pdf/fig-trunk-month-line-1.pdf index 3a73a3b4..6dcc1db4 
100644 Binary files a/visualization-intermediate_files/figure-pdf/fig-trunk-month-line-1.pdf and b/visualization-intermediate_files/figure-pdf/fig-trunk-month-line-1.pdf differ diff --git a/visualization-intermediate_files/figure-pdf/fig-trunk-year-3dbarplot-1.pdf b/visualization-intermediate_files/figure-pdf/fig-trunk-year-3dbarplot-1.pdf index 8fd03180..ef02db66 100644 Binary files a/visualization-intermediate_files/figure-pdf/fig-trunk-year-3dbarplot-1.pdf and b/visualization-intermediate_files/figure-pdf/fig-trunk-year-3dbarplot-1.pdf differ diff --git a/visualization-intermediate_files/figure-pdf/fig-trunk-year-heatmap-1.pdf b/visualization-intermediate_files/figure-pdf/fig-trunk-year-heatmap-1.pdf index a4a92bd1..ba2c53d4 100644 Binary files a/visualization-intermediate_files/figure-pdf/fig-trunk-year-heatmap-1.pdf and b/visualization-intermediate_files/figure-pdf/fig-trunk-year-heatmap-1.pdf differ diff --git a/visualization-intermediate_files/figure-pdf/fig-trunk-year-line-1.pdf b/visualization-intermediate_files/figure-pdf/fig-trunk-year-line-1.pdf index 9022a4e5..5453161e 100644 Binary files a/visualization-intermediate_files/figure-pdf/fig-trunk-year-line-1.pdf and b/visualization-intermediate_files/figure-pdf/fig-trunk-year-line-1.pdf differ diff --git a/visualization-intermediate_files/figure-pdf/fig-trunk-year-loess-1.pdf b/visualization-intermediate_files/figure-pdf/fig-trunk-year-loess-1.pdf index a750f0d3..e06fbcb8 100644 Binary files a/visualization-intermediate_files/figure-pdf/fig-trunk-year-loess-1.pdf and b/visualization-intermediate_files/figure-pdf/fig-trunk-year-loess-1.pdf differ diff --git a/visualization-intermediate_files/figure-pdf/fig-trunk-year-nls-1.pdf b/visualization-intermediate_files/figure-pdf/fig-trunk-year-nls-1.pdf index b3d41170..633c3f34 100644 Binary files a/visualization-intermediate_files/figure-pdf/fig-trunk-year-nls-1.pdf and b/visualization-intermediate_files/figure-pdf/fig-trunk-year-nls-1.pdf differ diff --git 
a/visualization-intermediate_files/figure-pdf/fig-trunk-year-smooth-1.pdf b/visualization-intermediate_files/figure-pdf/fig-trunk-year-smooth-1.pdf index 64a2bb4c..ee3b8294 100644 Binary files a/visualization-intermediate_files/figure-pdf/fig-trunk-year-smooth-1.pdf and b/visualization-intermediate_files/figure-pdf/fig-trunk-year-smooth-1.pdf differ diff --git a/visualization-intermediate_files/figure-pdf/fig-trunk-year-wireframe-1.pdf b/visualization-intermediate_files/figure-pdf/fig-trunk-year-wireframe-1.pdf index 2372b6cd..486d9658 100644 Binary files a/visualization-intermediate_files/figure-pdf/fig-trunk-year-wireframe-1.pdf and b/visualization-intermediate_files/figure-pdf/fig-trunk-year-wireframe-1.pdf differ diff --git a/visualization-intermediate_files/figure-pdf/fig-trunk-year-xxspline-1.pdf b/visualization-intermediate_files/figure-pdf/fig-trunk-year-xxspline-1.pdf index 09475ce3..10640f14 100644 Binary files a/visualization-intermediate_files/figure-pdf/fig-trunk-year-xxspline-1.pdf and b/visualization-intermediate_files/figure-pdf/fig-trunk-year-xxspline-1.pdf differ diff --git a/visualization-intermediate_files/figure-pdf/fig-trunk-year-xxspline-2.pdf b/visualization-intermediate_files/figure-pdf/fig-trunk-year-xxspline-2.pdf index fdde87fa..448cac14 100644 Binary files a/visualization-intermediate_files/figure-pdf/fig-trunk-year-xxspline-2.pdf and b/visualization-intermediate_files/figure-pdf/fig-trunk-year-xxspline-2.pdf differ diff --git a/visualization-intermediate_files/figure-pdf/fig-trunk-year-xxspline-3.pdf b/visualization-intermediate_files/figure-pdf/fig-trunk-year-xxspline-3.pdf index 5c3318cc..fe6b4acb 100644 Binary files a/visualization-intermediate_files/figure-pdf/fig-trunk-year-xxspline-3.pdf and b/visualization-intermediate_files/figure-pdf/fig-trunk-year-xxspline-3.pdf differ diff --git a/visualization-intermediate_files/figure-pdf/fig-waterfall-1.pdf b/visualization-intermediate_files/figure-pdf/fig-waterfall-1.pdf index 
330ad8f1..2c900bd6 100644 Binary files a/visualization-intermediate_files/figure-pdf/fig-waterfall-1.pdf and b/visualization-intermediate_files/figure-pdf/fig-waterfall-1.pdf differ diff --git a/visualization-intermediate_files/figure-pdf/fig-waterfall-ggTimeSeries-1.pdf b/visualization-intermediate_files/figure-pdf/fig-waterfall-ggTimeSeries-1.pdf index cd4d728d..3f462410 100644 Binary files a/visualization-intermediate_files/figure-pdf/fig-waterfall-ggTimeSeries-1.pdf and b/visualization-intermediate_files/figure-pdf/fig-waterfall-ggTimeSeries-1.pdf differ diff --git a/visualization-intermediate_files/figure-pdf/fig-wind-rose-1.pdf b/visualization-intermediate_files/figure-pdf/fig-wind-rose-1.pdf index dba13ef7..e34c0d87 100644 Binary files a/visualization-intermediate_files/figure-pdf/fig-wind-rose-1.pdf and b/visualization-intermediate_files/figure-pdf/fig-wind-rose-1.pdf differ diff --git a/visualization-intermediate_files/figure-pdf/fig-wordcloud-1.pdf b/visualization-intermediate_files/figure-pdf/fig-wordcloud-1.pdf index d0e38162..e287d2a1 100644 Binary files a/visualization-intermediate_files/figure-pdf/fig-wordcloud-1.pdf and b/visualization-intermediate_files/figure-pdf/fig-wordcloud-1.pdf differ diff --git a/visualization-lattice_files/figure-pdf/fig-lattice-barchart-1.pdf b/visualization-lattice_files/figure-pdf/fig-lattice-barchart-1.pdf index fdba4497..c281d356 100644 Binary files a/visualization-lattice_files/figure-pdf/fig-lattice-barchart-1.pdf and b/visualization-lattice_files/figure-pdf/fig-lattice-barchart-1.pdf differ diff --git a/visualization-lattice_files/figure-pdf/fig-lattice-bwplot-1.pdf b/visualization-lattice_files/figure-pdf/fig-lattice-bwplot-1.pdf index 4528d30f..1a45b5cd 100644 Binary files a/visualization-lattice_files/figure-pdf/fig-lattice-bwplot-1.pdf and b/visualization-lattice_files/figure-pdf/fig-lattice-bwplot-1.pdf differ diff --git a/visualization-lattice_files/figure-pdf/fig-lattice-choropleth-1.pdf 
b/visualization-lattice_files/figure-pdf/fig-lattice-choropleth-1.pdf index 649d051d..7920b24f 100644 Binary files a/visualization-lattice_files/figure-pdf/fig-lattice-choropleth-1.pdf and b/visualization-lattice_files/figure-pdf/fig-lattice-choropleth-1.pdf differ diff --git a/visualization-lattice_files/figure-pdf/fig-lattice-cloud-1.pdf b/visualization-lattice_files/figure-pdf/fig-lattice-cloud-1.pdf index bdecf7cb..8ebcf4a3 100644 Binary files a/visualization-lattice_files/figure-pdf/fig-lattice-cloud-1.pdf and b/visualization-lattice_files/figure-pdf/fig-lattice-cloud-1.pdf differ diff --git a/visualization-lattice_files/figure-pdf/fig-lattice-colours-1.pdf b/visualization-lattice_files/figure-pdf/fig-lattice-colours-1.pdf index 71b7f6bd..bf862520 100644 Binary files a/visualization-lattice_files/figure-pdf/fig-lattice-colours-1.pdf and b/visualization-lattice_files/figure-pdf/fig-lattice-colours-1.pdf differ diff --git a/visualization-lattice_files/figure-pdf/fig-lattice-ecdfplot-1.pdf b/visualization-lattice_files/figure-pdf/fig-lattice-ecdfplot-1.pdf index fe3b2ed1..e4e8e20d 100644 Binary files a/visualization-lattice_files/figure-pdf/fig-lattice-ecdfplot-1.pdf and b/visualization-lattice_files/figure-pdf/fig-lattice-ecdfplot-1.pdf differ diff --git a/visualization-lattice_files/figure-pdf/fig-lattice-ellipse-1.pdf b/visualization-lattice_files/figure-pdf/fig-lattice-ellipse-1.pdf index 9bb51826..1b2d6804 100644 Binary files a/visualization-lattice_files/figure-pdf/fig-lattice-ellipse-1.pdf and b/visualization-lattice_files/figure-pdf/fig-lattice-ellipse-1.pdf differ diff --git a/visualization-lattice_files/figure-pdf/fig-lattice-extra-themes-1.pdf b/visualization-lattice_files/figure-pdf/fig-lattice-extra-themes-1.pdf index 30335244..50621bde 100644 Binary files a/visualization-lattice_files/figure-pdf/fig-lattice-extra-themes-1.pdf and b/visualization-lattice_files/figure-pdf/fig-lattice-extra-themes-1.pdf differ diff --git 
a/visualization-lattice_files/figure-pdf/fig-lattice-extra-themes-2.pdf b/visualization-lattice_files/figure-pdf/fig-lattice-extra-themes-2.pdf index 0b76e590..418f8f2e 100644 Binary files a/visualization-lattice_files/figure-pdf/fig-lattice-extra-themes-2.pdf and b/visualization-lattice_files/figure-pdf/fig-lattice-extra-themes-2.pdf differ diff --git a/visualization-lattice_files/figure-pdf/fig-lattice-levelplot-1.pdf b/visualization-lattice_files/figure-pdf/fig-lattice-levelplot-1.pdf index 8296513b..615262d4 100644 Binary files a/visualization-lattice_files/figure-pdf/fig-lattice-levelplot-1.pdf and b/visualization-lattice_files/figure-pdf/fig-lattice-levelplot-1.pdf differ diff --git a/visualization-lattice_files/figure-pdf/fig-lattice-par-1.pdf b/visualization-lattice_files/figure-pdf/fig-lattice-par-1.pdf index e90fbecf..28bb21dc 100644 Binary files a/visualization-lattice_files/figure-pdf/fig-lattice-par-1.pdf and b/visualization-lattice_files/figure-pdf/fig-lattice-par-1.pdf differ diff --git a/visualization-lattice_files/figure-pdf/fig-lattice-rongelap-1.pdf b/visualization-lattice_files/figure-pdf/fig-lattice-rongelap-1.pdf index 98a5f342..58c2e25b 100644 Binary files a/visualization-lattice_files/figure-pdf/fig-lattice-rongelap-1.pdf and b/visualization-lattice_files/figure-pdf/fig-lattice-rongelap-1.pdf differ diff --git a/visualization-lattice_files/figure-pdf/fig-lattice-segplot-1.pdf b/visualization-lattice_files/figure-pdf/fig-lattice-segplot-1.pdf index 8590f685..8be67a29 100644 Binary files a/visualization-lattice_files/figure-pdf/fig-lattice-segplot-1.pdf and b/visualization-lattice_files/figure-pdf/fig-lattice-segplot-1.pdf differ diff --git a/visualization-lattice_files/figure-pdf/fig-lattice-settings-1.pdf b/visualization-lattice_files/figure-pdf/fig-lattice-settings-1.pdf index 346e2803..6cb9b6f9 100644 Binary files a/visualization-lattice_files/figure-pdf/fig-lattice-settings-1.pdf and 
b/visualization-lattice_files/figure-pdf/fig-lattice-settings-1.pdf differ diff --git a/visualization-lattice_files/figure-pdf/fig-lattice-smoother-1.pdf b/visualization-lattice_files/figure-pdf/fig-lattice-smoother-1.pdf index 79073f89..82364732 100644 Binary files a/visualization-lattice_files/figure-pdf/fig-lattice-smoother-1.pdf and b/visualization-lattice_files/figure-pdf/fig-lattice-smoother-1.pdf differ diff --git a/visualization-lattice_files/figure-pdf/fig-lattice-smoother-2.pdf b/visualization-lattice_files/figure-pdf/fig-lattice-smoother-2.pdf index 203bb0d0..923f3c48 100644 Binary files a/visualization-lattice_files/figure-pdf/fig-lattice-smoother-2.pdf and b/visualization-lattice_files/figure-pdf/fig-lattice-smoother-2.pdf differ diff --git a/visualization-lattice_files/figure-pdf/fig-lattice-smoother-3.pdf b/visualization-lattice_files/figure-pdf/fig-lattice-smoother-3.pdf index f676921a..38fb8704 100644 Binary files a/visualization-lattice_files/figure-pdf/fig-lattice-smoother-3.pdf and b/visualization-lattice_files/figure-pdf/fig-lattice-smoother-3.pdf differ diff --git a/visualization-lattice_files/figure-pdf/fig-lattice-smoother-4.pdf b/visualization-lattice_files/figure-pdf/fig-lattice-smoother-4.pdf index 2cbc29bb..18e425c1 100644 Binary files a/visualization-lattice_files/figure-pdf/fig-lattice-smoother-4.pdf and b/visualization-lattice_files/figure-pdf/fig-lattice-smoother-4.pdf differ diff --git a/visualization-lattice_files/figure-pdf/fig-lattice-vs-base-1.pdf b/visualization-lattice_files/figure-pdf/fig-lattice-vs-base-1.pdf index 8c1ff2b7..730b6b22 100644 Binary files a/visualization-lattice_files/figure-pdf/fig-lattice-vs-base-1.pdf and b/visualization-lattice_files/figure-pdf/fig-lattice-vs-base-1.pdf differ diff --git a/visualization-lattice_files/figure-pdf/fig-lattice-vs-base-2.pdf b/visualization-lattice_files/figure-pdf/fig-lattice-vs-base-2.pdf index 457bc5de..e5aae2b6 100644 Binary files 
a/visualization-lattice_files/figure-pdf/fig-lattice-vs-base-2.pdf and b/visualization-lattice_files/figure-pdf/fig-lattice-vs-base-2.pdf differ diff --git a/visualization-lattice_files/figure-pdf/fig-lattice-wireframe-1.pdf b/visualization-lattice_files/figure-pdf/fig-lattice-wireframe-1.pdf index 2264e31d..169fe675 100644 Binary files a/visualization-lattice_files/figure-pdf/fig-lattice-wireframe-1.pdf and b/visualization-lattice_files/figure-pdf/fig-lattice-wireframe-1.pdf differ diff --git a/visualization-lattice_files/figure-pdf/fig-lattice-xyplot-1-1.pdf b/visualization-lattice_files/figure-pdf/fig-lattice-xyplot-1-1.pdf index df2e3352..526aedea 100644 Binary files a/visualization-lattice_files/figure-pdf/fig-lattice-xyplot-1-1.pdf and b/visualization-lattice_files/figure-pdf/fig-lattice-xyplot-1-1.pdf differ diff --git a/visualization-lattice_files/figure-pdf/fig-lattice-xyplot-2-1.pdf b/visualization-lattice_files/figure-pdf/fig-lattice-xyplot-2-1.pdf index 6a6f2635..24ca0528 100644 Binary files a/visualization-lattice_files/figure-pdf/fig-lattice-xyplot-2-1.pdf and b/visualization-lattice_files/figure-pdf/fig-lattice-xyplot-2-1.pdf differ diff --git a/visualization-lattice_files/figure-pdf/fig-lattice-xyplot-key-1.pdf b/visualization-lattice_files/figure-pdf/fig-lattice-xyplot-key-1.pdf index a54f97e2..bfae63f7 100644 Binary files a/visualization-lattice_files/figure-pdf/fig-lattice-xyplot-key-1.pdf and b/visualization-lattice_files/figure-pdf/fig-lattice-xyplot-key-1.pdf differ diff --git a/visualization-lattice_files/figure-pdf/fig-lattice-xyplot-key-2.pdf b/visualization-lattice_files/figure-pdf/fig-lattice-xyplot-key-2.pdf index f41c41f5..d05c3909 100644 Binary files a/visualization-lattice_files/figure-pdf/fig-lattice-xyplot-key-2.pdf and b/visualization-lattice_files/figure-pdf/fig-lattice-xyplot-key-2.pdf differ diff --git a/visualization-lattice_files/figure-pdf/fig-levelplot-topo-1.pdf 
b/visualization-lattice_files/figure-pdf/fig-levelplot-topo-1.pdf index b19c078b..3145a97f 100644 Binary files a/visualization-lattice_files/figure-pdf/fig-levelplot-topo-1.pdf and b/visualization-lattice_files/figure-pdf/fig-levelplot-topo-1.pdf differ diff --git a/visualization-lattice_files/figure-pdf/fig-levelplot-volcano-1.pdf b/visualization-lattice_files/figure-pdf/fig-levelplot-volcano-1.pdf index b59a9d2e..fe82ec64 100644 Binary files a/visualization-lattice_files/figure-pdf/fig-levelplot-volcano-1.pdf and b/visualization-lattice_files/figure-pdf/fig-levelplot-volcano-1.pdf differ diff --git a/visualization-lattice_files/figure-pdf/fig-wireframe-volcano-1.pdf b/visualization-lattice_files/figure-pdf/fig-wireframe-volcano-1.pdf index a463c907..eb8c7712 100644 Binary files a/visualization-lattice_files/figure-pdf/fig-wireframe-volcano-1.pdf and b/visualization-lattice_files/figure-pdf/fig-wireframe-volcano-1.pdf differ diff --git a/visualization-practice_files/figure-html/fig-china-household-sex-1.png b/visualization-practice_files/figure-html/fig-china-household-sex-1.png index 73589bf1..e9b0ba6a 100644 Binary files a/visualization-practice_files/figure-html/fig-china-household-sex-1.png and b/visualization-practice_files/figure-html/fig-china-household-sex-1.png differ diff --git a/visualization-practice_files/figure-html/fig-china-household-sex-2.png b/visualization-practice_files/figure-html/fig-china-household-sex-2.png index 4d7d42aa..72e043bc 100644 Binary files a/visualization-practice_files/figure-html/fig-china-household-sex-2.png and b/visualization-practice_files/figure-html/fig-china-household-sex-2.png differ diff --git a/visualization-practice_files/figure-html/fig-coverage-1.png b/visualization-practice_files/figure-html/fig-coverage-1.png index eeb84f8e..30a8b15a 100644 Binary files a/visualization-practice_files/figure-html/fig-coverage-1.png and b/visualization-practice_files/figure-html/fig-coverage-1.png differ diff --git 
a/visualization-practice_files/figure-pdf/fig-china-household-sex-1.pdf b/visualization-practice_files/figure-pdf/fig-china-household-sex-1.pdf index 05ab5621..42869317 100644 Binary files a/visualization-practice_files/figure-pdf/fig-china-household-sex-1.pdf and b/visualization-practice_files/figure-pdf/fig-china-household-sex-1.pdf differ diff --git a/visualization-practice_files/figure-pdf/fig-china-household-sex-2.pdf b/visualization-practice_files/figure-pdf/fig-china-household-sex-2.pdf index c2de2fc2..2ad230c3 100644 Binary files a/visualization-practice_files/figure-pdf/fig-china-household-sex-2.pdf and b/visualization-practice_files/figure-pdf/fig-china-household-sex-2.pdf differ diff --git a/visualization-practice_files/figure-pdf/fig-confidence-belt-1.pdf b/visualization-practice_files/figure-pdf/fig-confidence-belt-1.pdf index dfa25edb..663943cd 100644 Binary files a/visualization-practice_files/figure-pdf/fig-confidence-belt-1.pdf and b/visualization-practice_files/figure-pdf/fig-confidence-belt-1.pdf differ diff --git a/visualization-practice_files/figure-pdf/fig-coverage-1.pdf b/visualization-practice_files/figure-pdf/fig-coverage-1.pdf index 16f3501f..17fef64d 100644 Binary files a/visualization-practice_files/figure-pdf/fig-coverage-1.pdf and b/visualization-practice_files/figure-pdf/fig-coverage-1.pdf differ diff --git a/visualization-practice_files/figure-pdf/fig-diamonds-distr-1.pdf b/visualization-practice_files/figure-pdf/fig-diamonds-distr-1.pdf index 287e431e..80e238e0 100644 Binary files a/visualization-practice_files/figure-pdf/fig-diamonds-distr-1.pdf and b/visualization-practice_files/figure-pdf/fig-diamonds-distr-1.pdf differ diff --git a/visualization-practice_files/figure-pdf/fig-faithful-bkde2d-1.pdf b/visualization-practice_files/figure-pdf/fig-faithful-bkde2d-1.pdf index 1b277c71..ecdfeae1 100644 Binary files a/visualization-practice_files/figure-pdf/fig-faithful-bkde2d-1.pdf and 
b/visualization-practice_files/figure-pdf/fig-faithful-bkde2d-1.pdf differ diff --git a/visualization-practice_files/figure-pdf/fig-faithful-bkde2d-2.pdf b/visualization-practice_files/figure-pdf/fig-faithful-bkde2d-2.pdf index c0318294..e897640c 100644 Binary files a/visualization-practice_files/figure-pdf/fig-faithful-bkde2d-2.pdf and b/visualization-practice_files/figure-pdf/fig-faithful-bkde2d-2.pdf differ diff --git a/visualization-practice_files/figure-pdf/fig-faithful-bkde2d-3.pdf b/visualization-practice_files/figure-pdf/fig-faithful-bkde2d-3.pdf index 83fe5a56..ec20022d 100644 Binary files a/visualization-practice_files/figure-pdf/fig-faithful-bkde2d-3.pdf and b/visualization-practice_files/figure-pdf/fig-faithful-bkde2d-3.pdf differ diff --git a/visualization-practice_files/figure-pdf/fig-faithful-bkde2d-4.pdf b/visualization-practice_files/figure-pdf/fig-faithful-bkde2d-4.pdf index 390581ac..cc9ca39d 100644 Binary files a/visualization-practice_files/figure-pdf/fig-faithful-bkde2d-4.pdf and b/visualization-practice_files/figure-pdf/fig-faithful-bkde2d-4.pdf differ diff --git a/visualization-practice_files/figure-pdf/fig-usa-mortality-1.pdf b/visualization-practice_files/figure-pdf/fig-usa-mortality-1.pdf index 92a0a8b9..6dc7644a 100644 Binary files a/visualization-practice_files/figure-pdf/fig-usa-mortality-1.pdf and b/visualization-practice_files/figure-pdf/fig-usa-mortality-1.pdf differ diff --git a/visualization-practice_files/figure-pdf/fig-usa-mortality-heatmap-1.pdf b/visualization-practice_files/figure-pdf/fig-usa-mortality-heatmap-1.pdf index 4b964db3..b9cf654c 100644 Binary files a/visualization-practice_files/figure-pdf/fig-usa-mortality-heatmap-1.pdf and b/visualization-practice_files/figure-pdf/fig-usa-mortality-heatmap-1.pdf differ diff --git a/visualization-practice_files/figure-pdf/fig-virginia-age-group-1.pdf b/visualization-practice_files/figure-pdf/fig-virginia-age-group-1.pdf index a7f13dd5..cb1faa29 100644 Binary files 
a/visualization-practice_files/figure-pdf/fig-virginia-age-group-1.pdf and b/visualization-practice_files/figure-pdf/fig-virginia-age-group-1.pdf differ diff --git a/visualization-practice_files/figure-pdf/fig-virginia-population-group-1.pdf b/visualization-practice_files/figure-pdf/fig-virginia-population-group-1.pdf index 45c0fff4..2b2e0ddc 100644 Binary files a/visualization-practice_files/figure-pdf/fig-virginia-population-group-1.pdf and b/visualization-practice_files/figure-pdf/fig-virginia-population-group-1.pdf differ diff --git a/visualization-tikz_files/figure-html/fig-bessel-function-1.pdf b/visualization-tikz_files/figure-html/fig-bessel-function-1.pdf index 9f5beb23..dbef5582 100644 Binary files a/visualization-tikz_files/figure-html/fig-bessel-function-1.pdf and b/visualization-tikz_files/figure-html/fig-bessel-function-1.pdf differ diff --git a/visualization-tikz_files/figure-html/fig-bessel-function-1.tex b/visualization-tikz_files/figure-html/fig-bessel-function-1.tex index a2bc80a3..5f2689ee 100644 --- a/visualization-tikz_files/figure-html/fig-bessel-function-1.tex +++ b/visualization-tikz_files/figure-html/fig-bessel-function-1.tex @@ -1,4 +1,4 @@ -% Created by tikzDevice version 0.12.6 on 2024-02-01 06:08:39 +% Created by tikzDevice version 0.12.6 on 2024-02-05 04:58:50 % !TEX encoding = UTF-8 Unicode \documentclass[tikz]{standalone} diff --git a/visualization-tikz_files/figure-html/fig-bessel-function-2.pdf b/visualization-tikz_files/figure-html/fig-bessel-function-2.pdf index 3707c967..c3c0b201 100644 Binary files a/visualization-tikz_files/figure-html/fig-bessel-function-2.pdf and b/visualization-tikz_files/figure-html/fig-bessel-function-2.pdf differ diff --git a/visualization-tikz_files/figure-html/fig-bessel-function-2.tex b/visualization-tikz_files/figure-html/fig-bessel-function-2.tex index 1eb015a3..6312b9be 100644 --- a/visualization-tikz_files/figure-html/fig-bessel-function-2.tex +++ 
b/visualization-tikz_files/figure-html/fig-bessel-function-2.tex @@ -1,4 +1,4 @@ -% Created by tikzDevice version 0.12.6 on 2024-02-01 06:08:41 +% Created by tikzDevice version 0.12.6 on 2024-02-05 04:58:53 % !TEX encoding = UTF-8 Unicode \documentclass[tikz]{standalone} diff --git a/visualization-tikz_files/figure-html/fig-linear-model-1.pdf b/visualization-tikz_files/figure-html/fig-linear-model-1.pdf index 22cb4fdf..19e42433 100644 Binary files a/visualization-tikz_files/figure-html/fig-linear-model-1.pdf and b/visualization-tikz_files/figure-html/fig-linear-model-1.pdf differ diff --git a/visualization-tikz_files/figure-html/fig-linear-model-1.tex b/visualization-tikz_files/figure-html/fig-linear-model-1.tex index a2921b3d..4ff3a2b4 100644 --- a/visualization-tikz_files/figure-html/fig-linear-model-1.tex +++ b/visualization-tikz_files/figure-html/fig-linear-model-1.tex @@ -1,4 +1,4 @@ -% Created by tikzDevice version 0.12.6 on 2024-02-01 06:08:19 +% Created by tikzDevice version 0.12.6 on 2024-02-05 04:58:26 % !TEX encoding = UTF-8 Unicode \documentclass[tikz]{standalone} diff --git a/visualization-tikz_files/figure-html/fig-network-seven-bridges-1.pdf b/visualization-tikz_files/figure-html/fig-network-seven-bridges-1.pdf index 7e59c5a4..fb153d96 100644 Binary files a/visualization-tikz_files/figure-html/fig-network-seven-bridges-1.pdf and b/visualization-tikz_files/figure-html/fig-network-seven-bridges-1.pdf differ diff --git a/visualization-tikz_files/figure-html/fig-pstricks-1.pdf b/visualization-tikz_files/figure-html/fig-pstricks-1.pdf index 856216f5..716b4fa4 100644 Binary files a/visualization-tikz_files/figure-html/fig-pstricks-1.pdf and b/visualization-tikz_files/figure-html/fig-pstricks-1.pdf differ diff --git a/visualization-tikz_files/figure-html/fig-smartdiagram-bubble-1.pdf b/visualization-tikz_files/figure-html/fig-smartdiagram-bubble-1.pdf index f5c55cd3..8d9d98ac 100644 Binary files 
a/visualization-tikz_files/figure-html/fig-smartdiagram-bubble-1.pdf and b/visualization-tikz_files/figure-html/fig-smartdiagram-bubble-1.pdf differ diff --git a/visualization-tikz_files/figure-html/fig-smartdiagram-descriptive-1.pdf b/visualization-tikz_files/figure-html/fig-smartdiagram-descriptive-1.pdf index 8108bddf..9c047f3b 100644 Binary files a/visualization-tikz_files/figure-html/fig-smartdiagram-descriptive-1.pdf and b/visualization-tikz_files/figure-html/fig-smartdiagram-descriptive-1.pdf differ diff --git a/visualization-tikz_files/figure-html/fig-tikz-1.pdf b/visualization-tikz_files/figure-html/fig-tikz-1.pdf index 149cfc3b..793a9d7d 100644 Binary files a/visualization-tikz_files/figure-html/fig-tikz-1.pdf and b/visualization-tikz_files/figure-html/fig-tikz-1.pdf differ diff --git a/visualization-tikz_files/figure-html/fig-tikz-cool-1.pdf b/visualization-tikz_files/figure-html/fig-tikz-cool-1.pdf index a081bef8..a5f31f46 100644 Binary files a/visualization-tikz_files/figure-html/fig-tikz-cool-1.pdf and b/visualization-tikz_files/figure-html/fig-tikz-cool-1.pdf differ diff --git a/visualization-tikz_files/figure-html/fig-tikz-jet-1.pdf b/visualization-tikz_files/figure-html/fig-tikz-jet-1.pdf index 2e4ef2dc..25f094f6 100644 Binary files a/visualization-tikz_files/figure-html/fig-tikz-jet-1.pdf and b/visualization-tikz_files/figure-html/fig-tikz-jet-1.pdf differ diff --git a/visualization-tikz_files/figure-html/fig-tikz-mindmap-1.pdf b/visualization-tikz_files/figure-html/fig-tikz-mindmap-1.pdf index bb5b3ef0..4b61d766 100644 Binary files a/visualization-tikz_files/figure-html/fig-tikz-mindmap-1.pdf and b/visualization-tikz_files/figure-html/fig-tikz-mindmap-1.pdf differ diff --git a/visualization-tikz_files/figure-html/fig-tikz-pgf1-1.pdf b/visualization-tikz_files/figure-html/fig-tikz-pgf1-1.pdf index 1c93863d..8cefc361 100644 Binary files a/visualization-tikz_files/figure-html/fig-tikz-pgf1-1.pdf and 
b/visualization-tikz_files/figure-html/fig-tikz-pgf1-1.pdf differ diff --git a/visualization-tikz_files/figure-html/fig-tikz-pgf2-1.pdf b/visualization-tikz_files/figure-html/fig-tikz-pgf2-1.pdf index 371993b7..fd816cdb 100644 Binary files a/visualization-tikz_files/figure-html/fig-tikz-pgf2-1.pdf and b/visualization-tikz_files/figure-html/fig-tikz-pgf2-1.pdf differ diff --git a/visualization-tikz_files/figure-html/fig-tikz-pgf3-1.pdf b/visualization-tikz_files/figure-html/fig-tikz-pgf3-1.pdf index 3d1d20e6..dba9122c 100644 Binary files a/visualization-tikz_files/figure-html/fig-tikz-pgf3-1.pdf and b/visualization-tikz_files/figure-html/fig-tikz-pgf3-1.pdf differ diff --git a/visualization-tikz_files/figure-html/fig-tikz-viridis-1.pdf b/visualization-tikz_files/figure-html/fig-tikz-viridis-1.pdf index 2b047023..97970d8f 100644 Binary files a/visualization-tikz_files/figure-html/fig-tikz-viridis-1.pdf and b/visualization-tikz_files/figure-html/fig-tikz-viridis-1.pdf differ diff --git a/visualization-tikz_files/figure-pdf/fig-bessel-function-1.pdf b/visualization-tikz_files/figure-pdf/fig-bessel-function-1.pdf index 60366a6e..af5734d2 100644 Binary files a/visualization-tikz_files/figure-pdf/fig-bessel-function-1.pdf and b/visualization-tikz_files/figure-pdf/fig-bessel-function-1.pdf differ diff --git a/visualization-tikz_files/figure-pdf/fig-bessel-function-1.png b/visualization-tikz_files/figure-pdf/fig-bessel-function-1.png index e1d02bc5..d57f9f82 100644 Binary files a/visualization-tikz_files/figure-pdf/fig-bessel-function-1.png and b/visualization-tikz_files/figure-pdf/fig-bessel-function-1.png differ diff --git a/visualization-tikz_files/figure-pdf/fig-bessel-function-1.tex b/visualization-tikz_files/figure-pdf/fig-bessel-function-1.tex index 8f878f47..59021bfe 100644 --- a/visualization-tikz_files/figure-pdf/fig-bessel-function-1.tex +++ b/visualization-tikz_files/figure-pdf/fig-bessel-function-1.tex @@ -1,4 +1,4 @@ -% Created by tikzDevice version 0.12.6 on 
2024-02-01 06:09:54 +% Created by tikzDevice version 0.12.6 on 2024-02-05 05:00:09 % !TEX encoding = UTF-8 Unicode \documentclass[tikz]{standalone} diff --git a/visualization-tikz_files/figure-pdf/fig-bessel-function-2.pdf b/visualization-tikz_files/figure-pdf/fig-bessel-function-2.pdf index 8b978511..f4c05e14 100644 Binary files a/visualization-tikz_files/figure-pdf/fig-bessel-function-2.pdf and b/visualization-tikz_files/figure-pdf/fig-bessel-function-2.pdf differ diff --git a/visualization-tikz_files/figure-pdf/fig-bessel-function-2.png b/visualization-tikz_files/figure-pdf/fig-bessel-function-2.png index df147593..53b5bd35 100644 Binary files a/visualization-tikz_files/figure-pdf/fig-bessel-function-2.png and b/visualization-tikz_files/figure-pdf/fig-bessel-function-2.png differ diff --git a/visualization-tikz_files/figure-pdf/fig-bessel-function-2.tex b/visualization-tikz_files/figure-pdf/fig-bessel-function-2.tex index d34f42f8..8646cace 100644 --- a/visualization-tikz_files/figure-pdf/fig-bessel-function-2.tex +++ b/visualization-tikz_files/figure-pdf/fig-bessel-function-2.tex @@ -1,4 +1,4 @@ -% Created by tikzDevice version 0.12.6 on 2024-02-01 06:09:56 +% Created by tikzDevice version 0.12.6 on 2024-02-05 05:00:11 % !TEX encoding = UTF-8 Unicode \documentclass[tikz]{standalone} diff --git a/visualization-tikz_files/figure-pdf/fig-linear-model-1.pdf b/visualization-tikz_files/figure-pdf/fig-linear-model-1.pdf index 2cecf057..e036288d 100644 Binary files a/visualization-tikz_files/figure-pdf/fig-linear-model-1.pdf and b/visualization-tikz_files/figure-pdf/fig-linear-model-1.pdf differ diff --git a/visualization-tikz_files/figure-pdf/fig-linear-model-1.png b/visualization-tikz_files/figure-pdf/fig-linear-model-1.png index d6ad23fc..7a2d2361 100644 Binary files a/visualization-tikz_files/figure-pdf/fig-linear-model-1.png and b/visualization-tikz_files/figure-pdf/fig-linear-model-1.png differ diff --git 
a/visualization-tikz_files/figure-pdf/fig-linear-model-1.tex b/visualization-tikz_files/figure-pdf/fig-linear-model-1.tex index b168388a..2c71132b 100644 --- a/visualization-tikz_files/figure-pdf/fig-linear-model-1.tex +++ b/visualization-tikz_files/figure-pdf/fig-linear-model-1.tex @@ -1,4 +1,4 @@ -% Created by tikzDevice version 0.12.6 on 2024-02-01 06:09:50 +% Created by tikzDevice version 0.12.6 on 2024-02-05 05:00:04 % !TEX encoding = UTF-8 Unicode \documentclass[tikz]{standalone} diff --git a/visualization-tikz_files/figure-pdf/fig-network-seven-bridges-1.pdf b/visualization-tikz_files/figure-pdf/fig-network-seven-bridges-1.pdf index 7e233a2d..b288a8f3 100644 Binary files a/visualization-tikz_files/figure-pdf/fig-network-seven-bridges-1.pdf and b/visualization-tikz_files/figure-pdf/fig-network-seven-bridges-1.pdf differ diff --git a/visualization-tikz_files/figure-pdf/fig-pstricks-1.pdf b/visualization-tikz_files/figure-pdf/fig-pstricks-1.pdf index 58cfde70..5038d5d0 100644 Binary files a/visualization-tikz_files/figure-pdf/fig-pstricks-1.pdf and b/visualization-tikz_files/figure-pdf/fig-pstricks-1.pdf differ diff --git a/visualization-tikz_files/figure-pdf/fig-smartdiagram-bubble-1.pdf b/visualization-tikz_files/figure-pdf/fig-smartdiagram-bubble-1.pdf index 9467a955..cbee75fe 100644 Binary files a/visualization-tikz_files/figure-pdf/fig-smartdiagram-bubble-1.pdf and b/visualization-tikz_files/figure-pdf/fig-smartdiagram-bubble-1.pdf differ diff --git a/visualization-tikz_files/figure-pdf/fig-smartdiagram-descriptive-1.pdf b/visualization-tikz_files/figure-pdf/fig-smartdiagram-descriptive-1.pdf index 61bec5d9..1fabfdc0 100644 Binary files a/visualization-tikz_files/figure-pdf/fig-smartdiagram-descriptive-1.pdf and b/visualization-tikz_files/figure-pdf/fig-smartdiagram-descriptive-1.pdf differ diff --git a/visualization-tikz_files/figure-pdf/fig-tikz-1.pdf b/visualization-tikz_files/figure-pdf/fig-tikz-1.pdf index e61faf9b..115f469e 100644 Binary files 
a/visualization-tikz_files/figure-pdf/fig-tikz-1.pdf and b/visualization-tikz_files/figure-pdf/fig-tikz-1.pdf differ diff --git a/visualization-tikz_files/figure-pdf/fig-tikz-cool-1.pdf b/visualization-tikz_files/figure-pdf/fig-tikz-cool-1.pdf index 3be16f0b..1ccca25b 100644 Binary files a/visualization-tikz_files/figure-pdf/fig-tikz-cool-1.pdf and b/visualization-tikz_files/figure-pdf/fig-tikz-cool-1.pdf differ diff --git a/visualization-tikz_files/figure-pdf/fig-tikz-jet-1.pdf b/visualization-tikz_files/figure-pdf/fig-tikz-jet-1.pdf index 02a675fd..d16b0de4 100644 Binary files a/visualization-tikz_files/figure-pdf/fig-tikz-jet-1.pdf and b/visualization-tikz_files/figure-pdf/fig-tikz-jet-1.pdf differ diff --git a/visualization-tikz_files/figure-pdf/fig-tikz-mindmap-1.pdf b/visualization-tikz_files/figure-pdf/fig-tikz-mindmap-1.pdf index 5fc8269d..45b269c0 100644 Binary files a/visualization-tikz_files/figure-pdf/fig-tikz-mindmap-1.pdf and b/visualization-tikz_files/figure-pdf/fig-tikz-mindmap-1.pdf differ diff --git a/visualization-tikz_files/figure-pdf/fig-tikz-pgf1-1.pdf b/visualization-tikz_files/figure-pdf/fig-tikz-pgf1-1.pdf index 6f749dad..a65fd625 100644 Binary files a/visualization-tikz_files/figure-pdf/fig-tikz-pgf1-1.pdf and b/visualization-tikz_files/figure-pdf/fig-tikz-pgf1-1.pdf differ diff --git a/visualization-tikz_files/figure-pdf/fig-tikz-pgf2-1.pdf b/visualization-tikz_files/figure-pdf/fig-tikz-pgf2-1.pdf index 70028a7a..fbab4109 100644 Binary files a/visualization-tikz_files/figure-pdf/fig-tikz-pgf2-1.pdf and b/visualization-tikz_files/figure-pdf/fig-tikz-pgf2-1.pdf differ diff --git a/visualization-tikz_files/figure-pdf/fig-tikz-pgf3-1.pdf b/visualization-tikz_files/figure-pdf/fig-tikz-pgf3-1.pdf index afe1a956..e445a9ba 100644 Binary files a/visualization-tikz_files/figure-pdf/fig-tikz-pgf3-1.pdf and b/visualization-tikz_files/figure-pdf/fig-tikz-pgf3-1.pdf differ diff --git a/visualization-tikz_files/figure-pdf/fig-tikz-viridis-1.pdf 
b/visualization-tikz_files/figure-pdf/fig-tikz-viridis-1.pdf index 81f159a5..db1f253d 100644 Binary files a/visualization-tikz_files/figure-pdf/fig-tikz-viridis-1.pdf and b/visualization-tikz_files/figure-pdf/fig-tikz-viridis-1.pdf differ diff --git a/wrangling-collection.html b/wrangling-collection.html index 3fbfeeb6..1fd8a8f7 100644 --- a/wrangling-collection.html +++ b/wrangling-collection.html @@ -688,7 +688,7 @@

    github_stats(repo = "yihui/knitr")

             repo stargazers_count subscribers_count forks_count
    -1 yihui/knitr             2328               115         878
    +1 yihui/knitr -1 -1 -1

    理论上,使用函数 lapply() 遍历所有 R 包可得所需数据,将数据收集函数应用到每一个 R 包上再合并结果,即如下操作。

    diff --git a/wrangling-objects.html b/wrangling-objects.html index d1f7c12d..0eeab8c8 100644 --- a/wrangling-objects.html +++ b/wrangling-objects.html @@ -688,7 +688,7 @@

    # 数据结构
     str(x)
    -
     Time-Series [1:100] from 2017 to 2017: 0.2767 -1.8463 2.4645 -0.1078 0.0194 ...
    +
     Time-Series [1:100] from 2017 to 2017: 0.338 -0.609 0.641 -1.209 0.613 ...

    函数 start()end() 查看开始和结束的时间点。