# 下载至临时文件
if (FALSE) {
tmpxlsxpath <- file.path(tempdir(), "airquality.xlsx")
download.file(
"https://git.drwater.net/course/RWEP/raw/branch/PUB/data/airquality.xlsx",
destfile = tmpxlsxpath
)
airqualitydf <- readxl::read_xlsx(tmpxlsxpath, sheet = 2)
metadf <- readxl::read_xlsx(tmpxlsxpath, sheet = 1)
saveRDS(airqualitydf, "./airqualitydf.RDS")
saveRDS(metadf, "./metadf.RDS")
}
airqualitydf <- readRDS("./airqualitydf.RDS")
metadf <- readRDS("./metadf.RDS")
课后作业8
数据
下载airquality.xlsx,并读取数据。
描述统计
根据airqualitydf.xlsx
,按采样点统计白天(8:00-20:00)与夜晚(20:00-8:00)中空气质量指数(AQI)中位数,按城市统计低于所有采样点AQI30%分位值的采样点占比,列出上述占比最高的10个城市(不考虑采样点数低于5个的城市)。
require(tidyverse)
airqualitydf |>
select(datetime, site, AQI) |>
filter(!is.na(AQI)) |>
group_by(site) |>
summarize(AQI.median = median(AQI, na.rm = TRUE)) |>
left_join(metadf |> select(site, city = Area)) |>
group_by(city) |>
filter(n() > 5) |>
summarize(
p = sum(
AQI.median < quantile(airqualitydf$AQI, probs = 0.5, na.rm = TRUE)
) /
n()
) |>
top_n(10, p)
# A tibble: 29 × 2
city p
<chr> <dbl>
1 东莞市 1
2 伊春市 1
3 佛山市 1
4 包头市 1
5 南充市 1
6 南宁市 1
7 吉林市 1
8 呼和浩特市 1
9 哈尔滨市 1
10 大庆市 1
# ℹ 19 more rows
airqualitydf |>
select(datetime, site, AQI) |>
filter(!is.na(AQI)) |>
group_by(site) |>
summarize(AQI.median = median(AQI, na.rm = TRUE))
# A tibble: 1,626 × 2
site AQI.median
<chr> <dbl>
1 1001A 69
2 1003A 63
3 1004A 69
4 1005A 57
5 1006A 71
6 1007A 66
7 1008A 55.5
8 1009A 53
9 1010A 65
10 1011A 63.5
# ℹ 1,616 more rows
airqualitydf |>
select(datetime, site, AQI) |>
filter(!is.na(AQI)) |>
left_join(metadf |> select(site, city = Area)) |>
group_by(city) |>
filter(length(unique(site)) >= 5) |>
summarize(
p = sum(AQI < quantile(airqualitydf$AQI, probs = 0.2, na.rm = TRUE)) / n()
) |>
slice_max(p, n = 10) |>
knitr::kable()
city | p |
---|---|
阜新市 | 0.9245283 |
清远市 | 0.8833333 |
玉林市 | 0.8421053 |
肇庆市 | 0.8305085 |
韶关市 | 0.7962963 |
海口市 | 0.7857143 |
伊春市 | 0.7375000 |
贵港市 | 0.7288136 |
东莞市 | 0.7285714 |
锦州市 | 0.7192982 |
统计检验
按照不同城市分组,统计白天与夜晚AQI中位数是否具有显著差异。
if (FALSE) {
require(infer)
require(tidyverse)
testdf <- airqualitydf |>
select(datetime, site, AQI) |>
filter(!is.na(AQI)) |>
left_join(metadf |> select(site, city = Area)) |>
group_by(city) |>
filter(length(unique(site)) >= 5) |>
mutate(
dayornight = factor(
ifelse(between(hour(datetime), 8, 20), "day", "night"),
levels = c("day", "night")
)
) |>
group_by(city) |>
nest(citydf = -city) |>
mutate(
median_diff = purrr::map_dbl(
citydf,
~ .x |>
specify(AQI ~ dayornight) |>
calculate(stat = "diff in medians", order = c("day", "night")) |>
pull(stat)
)
) |>
ungroup() |>
# slice_sample(n = 12) |>
mutate(
null_dist = purrr::map(
citydf,
~ .x |>
specify(AQI ~ dayornight) |>
hypothesize(null = "independence") |>
generate(reps = 1000, type = "permute") |>
calculate(stat = "diff in medians", order = c("day", "night"))
)
) |>
mutate(
p_value = purrr::map2_dbl(
null_dist,
median_diff,
~ get_p_value(.x, obs_stat = .y, direction = "both") |>
pull(p_value)
)
) |>
mutate(sigdiff = ifelse(p_value < 0.01, "显著差异", "无显著差异")) |>
mutate(
fig = purrr::pmap(
list(null_dist, median_diff, city, sigdiff),
~ visualize(..1) +
shade_p_value(obs_stat = ..2, direction = "both") +
ggtitle(paste0(..3, ":", ..4)) +
theme_sci(2, 2)
)
) |>
arrange(p_value)
saveRDS(testdf, "./testdf.RDS")
}
if (FALSE) {
lang <- "cn"
require(dwfun)
require(rmdify)
require(drwateR)
dwfun::init()
rmdify::rmd_init()
testdf <- readRDS("./testdf.RDS")
require(tidyverse)
testdf |>
select(city, median_diff, p_value, sigdiff) |>
knitr::kable()
testdf |>
mutate(grp = (row_number() - 1) %/% 12) |>
group_by(grp) |>
nest(grpdf = -grp) |>
ungroup() |>
# slice(1) |>
mutate(
gp = purrr::map(
grpdf,
~ (.x |>
pull(fig)) |>
patchwork::wrap_plots(ncol = 3) +
dwfun::theme_sci(5, 7)
)
) |>
pull(gp)
}