---
title: "Copernicus Marine Extraction"
params:
yml: "meta/erddap_sst.yml"
subtitle: "metadata: *`r basename(params$yml)`*"
---
```{r}
#| label: setup
librarian::shelf(
dplyr, DT, fs, glue, here, logger, jsonlite, listviewer, lubridate, mapview,
purrr, RColorBrewer, readr, reticulate, sf, stringr, terra, tibble, tidyr,
units, yaml,
quiet = T)
options(readr.show_col_types = F)
mapviewOptions(
basemaps = "Esri.OceanBasemap",
vector.palette = colorRampPalette(brewer.pal(n=5, name="Spectral")))
if (!exists("params"))
params <- list(yml = here("meta/copernicus_phy.yml")) # DEBUG
cm_r_dflyrs <- function(r){
# parse Copernicus Marine raster layer names, of the form
# "{dataset_id} {var}[ depth={m}] {itime}", into a data frame of layers
d_lyrs <- tibble(
lyr = names(r)) |>
separate_wider_regex(
lyr, c(
dataset = "^\\S+", " ",
var = "\\S+",
depth = "(?:\\sdepth=[.0-9]+)?",
itime = "\\s[0-9]+$")) |>
mutate(
depth = str_replace(depth, "depth=", "") |> as.numeric(),
itime = as.integer(itime),
time = time(r))
d_lyrs
}
```
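As a quick check of the layer-name convention that `cm_r_dflyrs()` parses, here is a minimal sketch (the layer name, depth and time are illustrative):
```{r}
#| label: cm_r_dflyrs_example
#| eval: false
# illustrative layer name, as written during extraction below:
# "{dataset_id} {var} depth={m} {itime}"
r_demo <- rast(nrows = 2, ncols = 2, nlyrs = 1)
names(r_demo) <- "cmems_mod_glo_phy_my_0.083deg_P1D-m thetao depth=0.49 1"
time(r_demo) <- as.POSIXct("2021-07-01", tz = "UTC")
cm_r_dflyrs(r_demo)
# -> one row: dataset, var = "thetao", depth = 0.49, itime = 1, time
```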
## Polygons
Load the sanctuary polygons, buffer each by 20% of its characteristic length (the square root of its area), and derive bounding boxes for the Copernicus subsets.
```{r}
#| label: polygons
sanctuaries_rds <- here("../climate-dashboard/data/sanctuaries.rds")
buf_pct <- 0.20 # buffer = 20% of sqrt(area), e.g. 10,000 km^2 -> 20 km
ply_sanctuaries <- readRDS(sanctuaries_rds)
# buffer polygon and get bounding boxes
ply_sanctuaries <- ply_sanctuaries |>
mutate(
area_km2 = st_area(geom) |>
set_units("km^2") |>
as.numeric() |>
round(2),
buf_km = (sqrt(area_km2) * buf_pct) |>
round(2),
bbox_geom = map2(geom, buf_km, \(g, b) { # buffer each geometry by its own buf_km
st_sfc(g, crs = 4326) |>
st_buffer(b * 1000) |> # km to m
st_bbox() |>
round(2) |>
st_as_sfc() }),
bbox_chr = map_chr(bbox_geom, \(x) { # xmin, ymin, xmax and ymax
st_bbox(x) |>
as.numeric() |>
paste(collapse = ",") } ) ) |>
unnest(bbox_geom)
bb_sanctuaries <- st_set_geometry(ply_sanctuaries, "bbox_geom") |>
select(-geom)
ply_sanctuaries <- select(ply_sanctuaries, -bbox_geom)
# show map
mapView(ply_sanctuaries, zcol = "nms") +
mapView(bb_sanctuaries, zcol = "nms", legend = F)
ply_sanctuaries |>
st_drop_geometry() |>
datatable()
```
## Dataset
### Metadata
Contents of `r basename(params$yml)`:
```{r}
#| label: yml
#| results: asis
cat(
"```yaml",
readLines(params$yml),
"```",
sep = "\n")
```
- [Python interface - Copernicus Marine Toolbox documentation](https://toolbox-docs.marine.copernicus.eu/en/v2.0.1/python-interface.html#copernicusmarine.subset)
- [Command subset - Copernicus Marine Toolbox documentation](https://toolbox-docs.marine.copernicus.eu/en/v2.0.1/usage/subset-usage.html#option-coordinates-selection-method)
```{r}
#| label: dataset
# import copernicusmarine as cmt ----
# do once: create virtual environment and install the copernicusmarine Python module
# virtualenv_create(envname = "CopernicusMarine")
# virtualenv_install(envname = "CopernicusMarine", packages = c("copernicusmarine"))
# use CopernicusMarine env with copernicusmarine Python module
use_virtualenv(virtualenv = "CopernicusMarine", required = TRUE)
py_require("copernicusmarine>2.1")
cmt <- import("copernicusmarine")
# login to cmt ----
# register for username and password at https://data.marine.copernicus.eu/register
user <- "bbest1"
pass_txt <- ifelse(
Sys.info()[["sysname"]] == "Linux",
"/share/private/data.marine.copernicus.eu_bbest1-password.txt", # server
"~/My Drive/private/data.marine.copernicus.eu_bbest1-password.txt") # laptop
pass <- readLines(pass_txt)
logged_in <- cmt$login(user, pass, force_overwrite = T) # py_help(cmt$login)
stopifnot(logged_in)
meta <- read_yaml(params$yml)
copernicus_prod <- path_ext_remove(basename(params$yml))
dir_out <- glue("/share/data/noaa-onms/climate-dashboard-app/{copernicus_prod}")
dir_create(dir_out)
```
```{r}
#| label: get_times
# get times ----
get_dataset_description <- function(dataset_id){
cmt$describe(
dataset_id = dataset_id,
disable_progress_bar = T)$json() |>
fromJSON()
}
get_dataset_times <- function(dataset_description){
services <- pluck(
dataset_description, "products", "datasets", 1, "versions", 1, "parts", 1, "services", 1)
service_names <- pluck(
services, "service_name")
i_timeseries <- which(service_names == "arco-time-series")
variables <- pluck(
services, "variables", i_timeseries, "short_name")
coordinates <- pluck(
services, "variables", i_timeseries, "coordinates")
coordinate_ids <- pluck(
coordinates, 1, "coordinate_id") # assuming 1st variable
i_time <- which(coordinate_ids == "time")
# convert from milliseconds since 1970-01-01 00:00:00Z (no leap seconds)
# e.g. ms_to_posix(1625097600000) == as.POSIXct("2021-07-01", tz = "UTC")
ms_to_posix <- \(ms) as.POSIXct(
as.numeric(ms) / 1000, origin = "1970-01-01", tz = "UTC")
time_min <- pluck(
coordinates, 1, "minimum_value", i_time) |>
ms_to_posix()
time_max <- pluck(
coordinates, 1, "maximum_value", i_time) |>
ms_to_posix()
time_step_secs <- as.numeric(pluck(
coordinates, 1, "step", i_time)) / 1000 # ms -> s
seq.POSIXt(
from = time_min,
to = time_max,
by = time_step_secs)
}
d_ds <- tibble(
dataset_id = meta |>
pluck("datasets") |>
map_chr("id")) |>
mutate(
description = map(dataset_id, get_dataset_description),
version = map_chr(
description, pluck, "products", "datasets", 1, "versions", 1, "label"),
times = map(description, get_dataset_times),
time_min = map_vec(times, min),
time_max = map_vec(times, max),
n_times = map_int(times, length))
d_ds |>
select(dataset_id, version, time_min, time_max, n_times) |>
datatable(
caption = "Copernicus datasets: version and time range. Note the transition from the multi-year product (dataset_id containing *phy_my*) to the interim product (*phy_myint*) at 2021-07-01.")
```
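The nested `pluck()` paths above were worked out against the `describe()` output; to browse that structure interactively, `listviewer` (already loaded in setup) can help. A minimal sketch:
```{r}
#| label: describe_explore
#| eval: false
# interactively browse one dataset's nested describe() metadata
d_ds$description[[1]] |>
listviewer::jsonedit()
```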
```{r}
#| label: existing_lyr_dates
#| eval: false
#| echo: false
# scratch: inventory layers of existing GeoTIFFs (not run on render)
system.time({
d_tif <- tibble(
tif = dir_ls(dir_out, glob = "*.tif", recurse = T)) |>
mutate(
lyrs = map(tif, \(x) names(rast(x))) ) # ~18 s
})
lyrs <- d_tif$lyrs[[1]]
rast(d_tif$tif[1]) |>
cm_r_dflyrs()
d_lyrs <- d_tif |>
# slice(c(1:3,(nrow(d_tif) - 2):nrow(d_tif))) |>
mutate(
d = map(tif, \(x) {
r <- rast(x)
cm_r_dflyrs(r) }) ) |>
unnest(d) |>
mutate(
nms = dirname(tif) |> basename(),
yr = year(time))
d_lyrs_sum <- d_lyrs |>
group_by(nms, yr) |>
summarize(
dataset = unique(dataset) |> paste(collapse = ','),
.groups = "drop")
# select(-tif)
table(d_lyrs$dataset)
# View(d_tif)
system.time({
d_csv <- tibble(
csv = dir_ls(dir_out, glob = "*.csv", recurse = T)) |>
mutate(
d = map(csv, read_csv) ) # 7.652
})
```
## Polygon dataset years
Cross each sanctuary polygon with each dataset-year, keeping only the times not already present in that sanctuary's GeoTIFF.
```{r}
#| label: d_nms_ds_t
d_ds_yr <- d_ds |>
mutate(
d_yrs = map(times, \(ts) {
yrs <- unique(year(ts))
tibble(
yr = yrs) |>
mutate(
time_min = map_vec(yr, \(yr) min(ts[year(ts) == yr])),
time_max = map_vec(yr, \(yr) max(ts[year(ts) == yr])),
n_times = map_int(yr, \(yr) length(ts[year(ts) == yr])) ) }) ) |>
select(-description, -times, -time_min, -time_max, -n_times) |>
unnest(d_yrs) |>
select(dataset_id, yr, time_min, time_max, n_times)
# d_ds_yr |>
# filter(yr == 2021)
d_nms_ds_yr_todo <- ply_sanctuaries |>
st_drop_geometry() |>
arrange(nms) |>
select(nms) |>
cross_join(
d_ds_yr) |>
mutate(
csv = glue("{dir_out}/{nms}/{yr}.csv"),
tif = glue("{dir_out}/{nms}/{yr}.tif") ) |>
filter(
yr == 2021) |> # limit this run to 2021, the phy_my -> phy_myint transition year
mutate(
d_csv = pmap(
list(tif, yr, dataset_id, time_min, time_max),
\(tif, yr, dataset_id, time_min, time_max, ...) { # DEBUG e.g.: nms = "TBNMS"; yr = 2025
times_yr <- seq.POSIXt(time_min, time_max, "day")
# TODO: generalize for month, 6-hr, etc from new meta/*.yml entry
# if missing tif, return all times
if (!file.exists(tif))
return(list(
time_min = min(times_yr),
time_max = max(times_yr),
n_times = length(times_yr)))
# get times from tif
r <- rast(tif)
times_tif <- unique(time(r))
# if all times done, return NAs
if (all(times_yr %in% times_tif))
return(list(
time_min = NA,
time_max = NA,
n_times = 0))
# otherwise, return time range missing
i <- !times_yr %in% times_tif
list(
time_min = min(times_yr[i]),
time_max = max(times_yr[i]),
n_times = sum(i) ) })) |>
select(-time_min, -time_max, -n_times) |>
mutate(
time_min = map_vec(d_csv, pluck, "time_min"),
time_max = map_vec(d_csv, pluck, "time_max"),
n_times = map_int(d_csv, pluck, "n_times")) |>
select(-d_csv) |>
filter(!is.na(time_min)) |>
rowid_to_column("i") |>
relocate(i)
d_nms_ds_yr_todo |>
group_by(nms, dataset_id) %>%
{if (nrow(.) > 0)
summarize(
.,
n_times = sum(n_times),
time_min = min(time_min, na.rm = T),
time_max = max(time_max, na.rm = T),
.groups = "drop") else .} |>
datatable(
caption = "Sanctuaries missing available Copernicus Marine times.",
rownames = F,
options = list(
pageLength = 5,
lengthMenu = if( nrow(d_nms_ds_yr_todo) > 5){
ceiling(seq.int(5, nrow(d_nms_ds_yr_todo), length.out = 3))
} else {
c(5) } ) )
```
## Extract missing dataset times per polygon year
Using the [Copernicus Marine Toolbox](https://help.marine.copernicus.eu/en/collections/9080063-copernicus-marine-toolbox) in R. See:
- [How to download data via the Copernicus Marine Toolbox in R? | Copernicus Marine Help Center](https://help.marine.copernicus.eu/en/articles/8638253-how-to-download-data-via-the-copernicus-marine-toolbox-in-r)
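At its core, the extraction below is one `cmt$subset()` call per sanctuary-year; a minimal sketch with illustrative bounding box, dates and output (the full version, with GeoTIFF merging and zonal statistics, follows):
```{r}
#| label: subset_sketch
#| eval: false
# minimal subset call; all values illustrative
cmt$subset(
dataset_id = "cmems_mod_glo_phy_my_0.083deg_P1D-m",
variables = list("thetao"),
minimum_longitude = -123.2,
maximum_longitude = -121.7,
minimum_latitude = 35.5,
maximum_latitude = 37.1,
start_datetime = "2021-01-01",
end_datetime = "2021-06-30",
coordinates_selection_method = "inside",
output_directory = tempdir(),
output_filename = "demo.nc",
disable_progress_bar = TRUE)
```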
```{r}
#| label: iterate_subset
# subset one dataset for one sanctuary-year, merge into its GeoTIFF, and
# (re)write the zonal-statistics CSV; returns NA on success, else the error message
subset_nms_yr <- function(i, dataset_id, nms, time_min, time_max, ...){
# i = 1; nms = "CBNMS"; dataset_id = "cmems_mod_glo_phy_my_0.083deg_P1D-m"
# time_min = as.POSIXct("1993-01-01", tz = "UTC"); time_max = as.POSIXct("1993-12-01", tz = "UTC")
# time_min = as.POSIXct("1993-12-02", tz = "UTC"); time_max = as.POSIXct("1993-12-31", tz = "UTC")
# d_nms_ds_yr_todo |> slice(1) |> attach()
err <- tryCatch({
log_info("{sprintf('%03d', i)}: {nms}, {dataset_id}, {year(time_min)} ({time_min} to {time_max})")
yr <- year(time_min)
r_nc <- glue("{dir_out}/{nms}/{yr}.nc")
r_tif <- path_ext_set(r_nc, "tif")
p_csv <- path_ext_set(r_nc, "csv")
bb <- bb_sanctuaries |>
filter(nms == !!nms) |>
st_bbox()
p <- ply_sanctuaries |>
filter(nms == !!nms)
vars <- map_chr(meta$variables, "id")
dir_create(dirname(r_nc))
res <- cmt$subset( # py_help( cmt$subset)
dataset_id = dataset_id,
service = "timeseries",
variables = vars,
minimum_longitude = bb$xmin,
maximum_longitude = bb$xmax,
minimum_latitude = bb$ymin,
maximum_latitude = bb$ymax,
start_datetime = time_min,
end_datetime = time_max,
minimum_depth = meta$depth$min,
maximum_depth = meta$depth$max,
output_directory = dirname(r_nc),
output_filename = basename(r_nc),
overwrite = TRUE,
coordinates_selection_method = "inside",
netcdf_compression_level = 1,
disable_progress_bar = TRUE)
# TODO: merge with existing
r <- rast(r_nc)
# set dataset_id into layer name
names(r) <- glue("{dataset_id} {str_replace_all(names(r), '_',' ')}")
# TODO:
# - write dataset_version into layer name
# - handle ?terra::depth()
if (file_exists(r_tif)){
r_tmp_tif <- tempfile(fileext = ".tif")
r_tmp <- c(rast(r_tif), r)
r_tmp <- subset(r_tmp, which(!duplicated(names(r_tmp)))) # rm duplicates
writeRaster(r_tmp, r_tmp_tif)
file_delete(r_tif)
file_move(r_tmp_tif, r_tif)
rm(r); rm(r_tmp)
} else {
writeRaster(r, r_tif, overwrite = T)
}
file_delete(r_nc)
# zonal tif to csv ----
r <- rast(r_tif)
extract_tbl <- function(r, p, stat = "mean"){ # zonal stat per layer; parses layer names like cm_r_dflyrs()
terra::extract(r, p, fun = stat, na.rm=T) |>
pivot_longer(!ID, names_to = "lyr") |>
separate_wider_regex(
lyr, c(
dataset = "^\\S+", " ",
var = "\\S+",
depth = "(?:\\sdepth=[.0-9]+)?",
itime = "\\s[0-9]+$")) |>
mutate(
depth = str_replace(depth, "depth=", "") |> as.numeric(),
itime = as.integer(itime),
stat = !!stat,
time = time(r)) |>
select(dataset, var, depth, time, stat, value)
}
bind_rows(
extract_tbl(r, p, "mean"),
extract_tbl(r, p, "sd")) |>
write_csv(p_csv, na = "")
return(NA_character_) # success (pmap_chr() below requires character)
}, error = function(e) {
log_error(conditionMessage(e))
return(conditionMessage(e))
})
err
}
res <- d_nms_ds_yr_todo |>
# slice(1) |> # DEBUG
mutate(
error = pmap_chr(list(i, dataset_id, nms, time_min, time_max), subset_nms_yr))
```
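To spot-check one of the zonal series written above (the sanctuary, year and variable are illustrative):
```{r}
#| label: spot_check_csv
#| eval: false
# read back one zonal-statistics CSV written by subset_nms_yr()
read_csv(glue("{dir_out}/MBNMS/2021.csv")) |>
filter(stat == "mean", var == "thetao") |>
arrange(time) |>
head()
```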
### Successes
```{r}
#| label: success_summary
res |>
mutate(
success = is.na(error)) |>
group_by(success) |>
summarize(
n = n()) |>
datatable()
```
### Errors (if any)
```{r}
#| label: error_summary
res |>
filter(!is.na(error)) |>
group_by(nms) |>
summarize(
n_years = n(),
errors = unique(error) |> paste(collapse = "\n\n----\n\n")) |>
datatable()
```
::: {.content-hidden}
## TODO
`ed_extract()`:
- [x] delete *_nc dir
- [x] differentiate existing done vs todo for given year
- [ ] allow irregular datasets, like `meta/irregular/*.yaml`: `ERROR: x cell sizes are not regular`
- [ ] break up into functions, not exported
`erddap.qmd`:
- [ ] make `ed_extract()` to single MBNMS with features for main vs david; add "ALL" option to `ed_extract()`
- [ ] add buffer to all (and redo)
- [ ] wrap retry with `ed_dim()` too
:::
```{r}
#| label: rm_empty_nc_dirs
#| eval: false
#| echo: false
# find empty directories ending in _nc and delete them
d_dirs <- tibble(
dir = list.dirs(here("data"), full.names = T, recursive = T)) |>
filter(str_detect(dir, "_nc$")) |>
mutate(
n_files = map_int(dir, \(x) {length(list.files(x))}))
# show non-empty directories
d_dirs |>
filter(n_files > 0) |>
datatable()
# delete empty directories
d_dirs |>
filter(n_files == 0) |>
pull(dir) |>
walk(\(x) {
message(glue("Deleting {x}"))
unlink(x, recursive = T) })
```
::: {.callout-caution collapse="true"}
## R package versions
```{r}
#| label: session_info
devtools::session_info()
```
:::