课后作业8

作者

姓名

发布于

2025年4月9日

数据

下载airquality.xlsx,并读取数据。

# 下载至临时文件
if (FALSE) {
  tmpxlsxpath <- file.path(tempdir(), "airquality.xlsx")
  download.file(
    "https://git.drwater.net/course/RWEP/raw/branch/PUB/data/airquality.xlsx",
    destfile = tmpxlsxpath
  )
  airqualitydf <- readxl::read_xlsx(tmpxlsxpath, sheet = 2)
  metadf <- readxl::read_xlsx(tmpxlsxpath, sheet = 1)
  saveRDS(airqualitydf, "./airqualitydf.RDS")
  saveRDS(metadf, "./metadf.RDS")
}
airqualitydf <- readRDS("./airqualitydf.RDS")
metadf <- readRDS("./metadf.RDS")

描述统计

根据airqualitydf.xlsx,按采样点统计白天(8:00-20:00)与夜晚(20:00-8:00)中空气质量指数(AQI)中位数,按城市统计低于所有采样点AQI30%分位值的采样点占比,列出上述占比最高的10个城市(不考虑采样点数低于5个的城市)。

require(tidyverse)
airqualitydf |>
  select(datetime, site, AQI) |>
  filter(!is.na(AQI)) |>
  group_by(site) |>
  summarize(AQI.median = median(AQI, na.rm = TRUE)) |>
  left_join(metadf |> select(site, city = Area)) |>
  group_by(city) |>
  filter(n() > 5) |>
  summarize(
    p = sum(
      AQI.median < quantile(airqualitydf$AQI, probs = 0.5, na.rm = TRUE)
    ) /
      n()
  ) |>
  top_n(10, p)
# A tibble: 29 × 2
   city           p
   <chr>      <dbl>
 1 东莞市         1
 2 伊春市         1
 3 佛山市         1
 4 包头市         1
 5 南充市         1
 6 南宁市         1
 7 吉林市         1
 8 呼和浩特市     1
 9 哈尔滨市       1
10 大庆市         1
# ℹ 19 more rows
airqualitydf |>
  select(datetime, site, AQI) |>
  filter(!is.na(AQI)) |>
  group_by(site) |>
  summarize(AQI.median = median(AQI, na.rm = TRUE))
# A tibble: 1,626 × 2
   site  AQI.median
   <chr>      <dbl>
 1 1001A       69  
 2 1003A       63  
 3 1004A       69  
 4 1005A       57  
 5 1006A       71  
 6 1007A       66  
 7 1008A       55.5
 8 1009A       53  
 9 1010A       65  
10 1011A       63.5
# ℹ 1,616 more rows
airqualitydf |>
  select(datetime, site, AQI) |>
  filter(!is.na(AQI)) |>
  left_join(metadf |> select(site, city = Area)) |>
  group_by(city) |>
  filter(length(unique(site)) >= 5) |>
  summarize(
    p = sum(AQI < quantile(airqualitydf$AQI, probs = 0.2, na.rm = TRUE)) / n()
  ) |>
  slice_max(p, n = 10) |>
  knitr::kable()
city p
阜新市 0.9245283
清远市 0.8833333
玉林市 0.8421053
肇庆市 0.8305085
韶关市 0.7962963
海口市 0.7857143
伊春市 0.7375000
贵港市 0.7288136
东莞市 0.7285714
锦州市 0.7192982

统计检验

按照不同城市分组,统计白天与夜晚AQI中位数是否具有显著差异。

if (FALSE) {
  require(infer)
  require(tidyverse)
  testdf <- airqualitydf |>
    select(datetime, site, AQI) |>
    filter(!is.na(AQI)) |>
    left_join(metadf |> select(site, city = Area)) |>
    group_by(city) |>
    filter(length(unique(site)) >= 5) |>
    mutate(
      dayornight = factor(
        ifelse(between(hour(datetime), 8, 20), "day", "night"),
        levels = c("day", "night")
      )
    ) |>
    group_by(city) |>
    nest(citydf = -city) |>
    mutate(
      median_diff = purrr::map_dbl(
        citydf,
        ~ .x |>
          specify(AQI ~ dayornight) |>
          calculate(stat = "diff in medians", order = c("day", "night")) |>
          pull(stat)
      )
    ) |>
    ungroup() |>
    #  slice_sample(n = 12) |>
    mutate(
      null_dist = purrr::map(
        citydf,
        ~ .x |>
          specify(AQI ~ dayornight) |>
          hypothesize(null = "independence") |>
          generate(reps = 1000, type = "permute") |>
          calculate(stat = "diff in medians", order = c("day", "night"))
      )
    ) |>
    mutate(
      p_value = purrr::map2_dbl(
        null_dist,
        median_diff,
        ~ get_p_value(.x, obs_stat = .y, direction = "both") |>
          pull(p_value)
      )
    ) |>
    mutate(sigdiff = ifelse(p_value < 0.01, "显著差异", "无显著差异")) |>
    mutate(
      fig = purrr::pmap(
        list(null_dist, median_diff, city, sigdiff),
        ~ visualize(..1) +
          shade_p_value(obs_stat = ..2, direction = "both") +
          ggtitle(paste0(..3, ":", ..4)) +
          theme_sci(2, 2)
      )
    ) |>
    arrange(p_value)
  saveRDS(testdf, "./testdf.RDS")
}

if (FALSE) {
  lang <- "cn"
  require(dwfun)
  require(rmdify)
  require(drwateR)
  dwfun::init()
  rmdify::rmd_init()

  testdf <- readRDS("./testdf.RDS")
  require(tidyverse)
  testdf |>
    select(city, median_diff, p_value, sigdiff) |>
    knitr::kable()
  testdf |>
    mutate(grp = (row_number() - 1) %/% 12) |>
    group_by(grp) |>
    nest(grpdf = -grp) |>
    ungroup() |>
    #  slice(1) |>
    mutate(
      gp = purrr::map(
        grpdf,
        ~ (.x |>
          pull(fig)) |>
          patchwork::wrap_plots(ncol = 3) +
          dwfun::theme_sci(5, 7)
      )
    ) |>
    pull(gp)
}