Copernicus Marine Extraction

metadata: copernicus_phy.yml

Published

2025-07-04 00:00 UTC

Code
librarian::shelf(
  dplyr, DT, fs, glue, here, logger, jsonlite, listviewer, lubridate, mapview, 
  purrr, RColorBrewer, readr, reticulate, sf, stringr, terra, tibble, tidyr, 
  units, yaml, 
  quiet = T)
options(readr.show_col_types = F)
mapviewOptions(
  basemaps       = "Esri.OceanBasemap",
  vector.palette = colorRampPalette(brewer.pal(n=5, name="Spectral")))

if (!exists("params"))
  params <- list(yml = here("meta/copernicus_phy.yml")) # DEBUG

cm_r_dflyrs <- function(r){
  # CopernicusMarine raster to dataframe of layers
  
  d_lyrs <- tibble(
    lyr = names(r)) |> 
    separate_wider_regex(
      lyr, c(
        dataset = "^\\S+", " ", 
        var     = "\\S+", 
        depth   = "(?:\\sdepth=[.0-9]+)?",
        itime   = "\\s[0-9]+$")) |> 
    mutate(
      depth = str_replace(depth, "depth=", "") |> as.numeric(),
      itime = as.integer(itime),
      time  = time(r))
  d_lyrs
}

1 Polygons

Code
sanctuaries_rds <- here("../climate-dashboard/data/sanctuaries.rds")
buf_pct         <- 0.20 # 20% of area

ply_sanctuaries <- readRDS(sanctuaries_rds)

# buffer polygon and get bounding boxes
ply_sanctuaries <- ply_sanctuaries |>
  mutate(
    area_km2  = st_area(geom) |> 
      set_units("km^2") |> 
      as.numeric() |> 
      round(2),
    buf_km    = (sqrt(area_km2) * buf_pct) |> 
      round(2),
    bbox_geom = map2(geom, area_km2, \(g,a) {
      st_sfc(g, crs = 4326) |> 
        st_buffer(buf_km * 1000) |> 
        st_bbox() |>
        round(2) |> 
        st_as_sfc() }),
    bbox_chr  = map_chr(bbox_geom, \(x) {  # xmin, ymin, xmax and ymax
      st_bbox(x) |> 
        as.numeric() |> 
        paste(collapse = ",") } ) ) |> 
  unnest(bbox_geom)
bb_sanctuaries <- st_set_geometry(ply_sanctuaries, "bbox_geom") |> 
  select(-geom)
ply_sanctuaries <- select(ply_sanctuaries, -bbox_geom)

# show map
mapView(ply_sanctuaries, zcol = "nms") +
  mapView(bb_sanctuaries, zcol = "nms", legend = F)
Code
ply_sanctuaries |>
  st_drop_geometry() |>
  datatable()

2 Dataset

2.1 Metadata

Contents of copernicus_phy.yml:

Code
cat(
  "```yaml",
  readLines(params$yml),
  "```",
  sep = "\n")
# product: Global Ocean Physics Reanalysis (GLOBAL_MULTIYEAR_PHY_001_030)
datasets:
- id: cmems_mod_glo_phy_my_0.083deg_P1D-m     # Daily
- id: cmems_mod_glo_phy_myint_0.083deg_P1D-m  # Interim, daily
depth:
  min: 0.49402499198913574
  max: 0.49402499198913574
variables:
- name: Ocean mixed layer thickness
  units: m
  id: mlotst
- name: Sea surface temperature
  units: °C
  id: thetao
- name: Sea bottom temperature
  units: °C
  id: bottomT
- name: Sea water salinity
  units: g/kg
  id: so
Code
# import copernicusmarine as cmt ----
# do once: create virtual enviroment and install copernicusmarine Python module
# virtualenv_create(envname = "CopernicusMarine")
# virtualenv_install(envname = "CopernicusMarine", packages = c("copernicusmarine"))

# use CopernicusMarine env with copernicusmarine Python module
use_virtualenv(virtualenv = "CopernicusMarine", required = TRUE)
py_require("copernicusmarine>2.1")
cmt <- import("copernicusmarine")

# login to cmt ----
# register for username and password at https://data.marine.copernicus.eu/register
user     <- "bbest1"
pass_txt <- ifelse(
  Sys.info()[["sysname"]] == "Linux",
  "/share/private/data.marine.copernicus.eu_bbest1-password.txt",      # server
  "~/My Drive/private/data.marine.copernicus.eu_bbest1-password.txt")  # laptop

pass <- readLines(pass_txt)
logged_in <- cmt$login(user, pass, force_overwrite = T)  # py_help(cmt$login)
stopifnot(logged_in)

meta <- read_yaml(params$yml) 
copernicus_prod <- path_ext_remove(basename(params$yml))

dir_out <- glue("/share/data/noaa-onms/climate-dashboard-app/{copernicus_prod}")
dir_create(dir_out)
Code
# get times ----
get_dataset_description <- function(dataset_id){
  cmt$describe(
    dataset_id           = dataset_id,
    disable_progress_bar = T)$json() |> 
    fromJSON()
}

get_dataset_times <- function(dataset_description){
  services <- pluck(
    dataset_description, "products", "datasets", 1, "versions", 1, "parts", 1, "services", 1)
  service_names <- pluck(
    services, "service_name")
  i_timeseries <- which(service_names == "arco-time-series")
  variables <- pluck(
    services, "variables", i_timeseries, "short_name")
  coordinates <- pluck(
    services, "variables", i_timeseries, "coordinates")
  coordinate_ids <- pluck(
    coordinates, 1, "coordinate_id")   # assuming 1st variable
  i_time <- which(coordinate_ids == "time")
  time_min <- pluck(
    coordinates, 1, "minimum_value", i_time) |> 
    # convert from [milliseconds since 1970-01-01 00:00:00Z (no leap seconds)]
    as.numeric() %>% {./1000} |> 
    as.POSIXct(origin = "1970-01-01", tz = "UTC")
  time_max <- pluck(
    coordinates, 1, "maximum_value", i_time) |> 
    as.numeric() %>% {./1000} |> 
    as.POSIXct(origin = "1970-01-01", tz = "UTC")
  time_step_secs <- pluck(
    coordinates, 1, "step", i_time) |> 
    as.numeric() %>% 
    {./1000} 
  seq.POSIXt(
    from = time_min,
    to   = time_max,
    by   = time_step_secs)
}

d_ds <- tibble(
  dataset_id = meta |> 
    pluck("datasets") |> 
    map_chr("id")) |> 
  mutate(
    description = map(dataset_id, get_dataset_description),
    version     = map_chr(
      description, pluck, "products", "datasets", 1, "versions", 1, "label"),
    times       = map(description, get_dataset_times),
    time_min    = map_vec(times, min),
    time_max    = map_vec(times, max),
    n_times     = map_int(times, length))

d_ds |> 
  select(dataset_id, version, time_min, time_max, n_times) |> 
  datatable(
    caption  = "Copernicus datasets: version and times. Notice transition from Daily (dataset_id = *phy_my*) to Interim, daily (dataset_id = *phy_myint*) at 2021-07-01.")

3 Polygon dataset years

Code
d_ds_yr <- d_ds |>
  mutate(
    d_yrs = map(times, \(ts) {
      yrs <- unique(year(ts))
      tibble(
        yr = yrs) |> 
        mutate(
          time_min = map_vec(yr, \(yr) min(ts[year(ts) == yr])),
          time_max = map_vec(yr, \(yr) max(ts[year(ts) == yr])),
          n_times  = map_int(yr, \(yr) length(ts[year(ts) == yr])) ) }) ) |> 
  select(-description, -times, -time_min, -time_max, -n_times) |>
  unnest(d_yrs) |> 
  select(dataset_id, yr, time_min, time_max, n_times)

# d_ds_yr |> 
#   filter(yr == 2021)

d_nms_ds_yr_todo <- ply_sanctuaries |> 
  st_drop_geometry() |> 
  arrange(nms) |> 
  select(nms) |> 
  cross_join(
    d_ds_yr) |> 
  mutate(
    csv = glue("{dir_out}/{nms}/{yr}.csv"),  
    tif = glue("{dir_out}/{nms}/{yr}.tif") ) |> 
  filter(
    yr == 2021) |> 
  mutate(
    d_csv = pmap(
      list(tif, yr, dataset_id, time_min, time_max),
      \(tif, yr, dataset_id, time_min, time_max, ...) { # nms = "TBNMS"; year = 2025
        
        times_yr <- seq.POSIXt(time_min, time_max, "day") 
        # TODO: generalize for month, 6-hr, etc from new meta/*.yml entry

        # if missing tif, return all times
        if (!file.exists(tif))
          return(list(
            time_min = min(times_yr),
            time_max = max(times_yr),
            n_times  = length(times_yr)))
        
        # get times from tif
        r <- rast(tif)
        times_tif <- unique(time(r))
        
        # if all times done, return NAs
        if (all(times_yr %in% times_tif))
          return(list(
            time_min = NA,
            time_max = NA,
            n_times  = 0)) 
        
        # otherwise, return time range missing
        i <- !times_yr %in% times_tif
        list(
          time_min = min(times_yr[i]),
          time_max = max(times_yr[i]),
          n_times  = sum(i) ) })) |> 
  select(-time_min, -time_max, -n_times) |> 
  mutate(
    time_min = map_vec(d_csv, pluck, "time_min"),
    time_max = map_vec(d_csv, pluck, "time_max"),
    n_times  = map_int(d_csv, pluck, "n_times")) |> 
  select(-d_csv) |> 
  filter(!is.na(time_min)) |> 
  rowid_to_column("i") |>
  relocate(i)

d_nms_ds_yr_todo |> 
  group_by(nms, dataset_id) %>%
  {if (nrow(.) > 0)
    summarize(
      .,
      n_times  = sum(n_times),
      time_min = min(time_min, na.rm = T),
      time_max = max(time_max, na.rm = T),
      .groups = "drop") else .} |>
  datatable(
    caption  = "Sanctuaries missing available Copernicus Marine times.",
    rownames = F,
    options  = list(
      pageLength = 5,
      lengthMenu = if( nrow(d_nms_ds_yr_todo) > 5){
        ceiling(seq.int(5, nrow(d_nms_ds_yr_todo), length.out = 3))
      } else {
        c(5) } ) )

4 Extract missing times in datasets per polygon year

Using the Copernicus Marine Toolbox in R. See:

Code
fxn <- function(i, dataset_id, nms, time_min, time_max, ...){
  # i = 1; nms = "CBNMS"; dataset_id = "cmems_mod_glo_phy_my_0.083deg_P1D-m"
  # time_min = as.POSIXct("1993-01-01", tz = "UTC"); time_max = as.POSIXct("1993-12-01", tz = "UTC")
  # time_min = as.POSIXct("1993-12-02", tz = "UTC"); time_max = as.POSIXct("1993-12-31", tz = "UTC")
  # d_nms_ds_yr |> slice(1) |> attach()
  
  err <- tryCatch({
    
    log_info("{sprintf('%03d', i)}: {nms}, {dataset_id}, {year(time_min)} ({time_min} to {time_max})")
    
    yr    <- year(time_min)
    r_nc  <- glue("{dir_out}/{nms}/{yr}.nc")
    r_tif <- path_ext_set(r_nc, "tif")
    p_csv <- path_ext_set(r_nc, "csv")
    bb    <- bb_sanctuaries |>
      filter(nms == !!nms) |> 
      st_bbox()
    p     <- ply_sanctuaries |>
      filter(nms == !!nms)
    vars  <- map_chr(meta$variables, "id")
    
    dir_create(dirname(r_nc))
    
    res <- cmt$subset(  # py_help( cmt$subset)
      dataset_id                   = dataset_id,
      service                      = "timeseries",
      variables                    = vars,
      minimum_longitude            = bb$xmin,
      maximum_longitude            = bb$xmax,
      minimum_latitude             = bb$ymin,
      maximum_latitude             = bb$ymax,
      start_datetime               = time_min,
      end_datetime                 = time_max,
      minimum_depth                = meta$depth$min,
      maximum_depth                = meta$depth$max,
      output_directory             = dirname(r_nc),
      output_filename              = basename(r_nc),
      overwrite                    = TRUE,
      coordinates_selection_method = "inside",
      netcdf_compression_level     = 1,
      disable_progress_bar         = TRUE)

    # TODO: merge with existing
    r <- rast(r_nc)
    
    # set dataset_id into layer name
    names(r) <- glue("{dataset_id} {str_replace_all(names(r), '_',' ')}")
    # TODO:
    # - write dataset_version into layer name
    # - handle ?terra::depth()
    if (file_exists(r_tif)){
      r_tmp_tif <- tempfile(fileext = ".tif")
      r_tmp <- c(rast(r_tif), r)
      r_tmp <- subset(r_tmp, which(!duplicated(names(r_tmp)))) # rm duplicates
      writeRaster(r_tmp, r_tmp_tif)
      file_delete(r_tif)
      file_move(r_tmp_tif, r_tif)
      rm(r); rm(r_tmp)
    } else {
      writeRaster(r, r_tif, overwrite = T)  
    }
    file_delete(r_nc)
      
    # zonal tif to csv ---- 
    r <- rast(r_tif)
    extract_tbl <- function(r, p, stat = "mean"){
      terra::extract(r, p, fun = stat, na.rm=T) |> 
        pivot_longer(!ID, names_to = "lyr") |> 
        separate_wider_regex(
          lyr, c(
            dataset = "^\\S+", " ", 
            var     = "\\S+", 
            depth   = "(?:\\sdepth=[.0-9]+)?",
            itime   = "\\s[0-9]+$")) |> 
        mutate(
          depth = str_replace(depth, "depth=", "") |> as.numeric(),
          itime = as.integer(itime),
          stat = !!stat,
          time = time(r)) |> 
        select(dataset, var, depth, time, stat, value)
    }
    bind_rows(
      extract_tbl(r, p, "mean"),
      extract_tbl(r, p, "sd")) |> 
      write_csv(p_csv, na = "")

    return(NA)
  }, error = function(e) {
    
    log_error(conditionMessage(e))
    return(conditionMessage(e))
  })
  
  err
} 

res <- d_nms_ds_yr_todo |> 
  # slice(1) |>  # DEBUG
  mutate(
    error = pmap_chr(list(i, dataset_id, nms, time_min, time_max), fxn))

4.1 Successes

Code
res |> 
  mutate(
    success = is.na(error)) |> 
  group_by(success) |> 
  summarize(
    n = n()) |> 
  datatable()

4.2 Errors (if any)

Code
res |> 
  filter(!is.na(error)) |> 
  group_by(nms) |> 
  summarize(
    n_years = n(),
    errors  = unique(error) |> paste(collapse = "\n\n----\n\n")) |> 
  datatable()
Code
devtools::session_info()
─ Session info ───────────────────────────────────────────────────────────────
 setting  value
 version  R version 4.4.2 (2024-10-31)
 os       Ubuntu 24.04.1 LTS
 system   x86_64, linux-gnu
 ui       X11
 language (EN)
 collate  en_US.UTF-8
 ctype    en_US.UTF-8
 tz       Etc/UTC
 date     2025-07-04
 pandoc   3.5 @ /usr/bin/ (via rmarkdown)

─ Packages ───────────────────────────────────────────────────────────────────
 package           * version date (UTC) lib source
 base64enc           0.1-3   2015-07-28 [1] RSPM (R 4.4.0)
 brew                1.0-10  2023-12-16 [1] RSPM (R 4.4.0)
 bslib               0.9.0   2025-01-30 [1] RSPM (R 4.4.0)
 cachem              1.1.0   2024-05-16 [1] RSPM (R 4.4.0)
 class               7.3-22  2023-05-03 [2] CRAN (R 4.4.2)
 classInt            0.4-11  2025-01-08 [1] RSPM (R 4.4.0)
 cli                 3.6.5   2025-04-23 [1] RSPM (R 4.4.0)
 codetools           0.2-20  2024-03-31 [2] CRAN (R 4.4.2)
 colorspace          2.1-1   2024-07-26 [1] RSPM (R 4.4.0)
 crosstalk           1.2.1   2023-11-23 [1] RSPM (R 4.4.0)
 DBI                 1.2.3   2024-06-02 [1] RSPM (R 4.4.0)
 devtools            2.4.5   2022-10-11 [1] RSPM (R 4.4.0)
 dichromat           2.0-0.1 2022-05-02 [1] RSPM (R 4.4.0)
 digest              0.6.37  2024-08-19 [1] RSPM (R 4.4.0)
 dplyr             * 1.1.4   2023-11-17 [1] RSPM (R 4.4.0)
 DT                * 0.33    2024-04-04 [1] RSPM (R 4.4.0)
 e1071               1.7-16  2024-09-16 [1] RSPM (R 4.4.0)
 ellipsis            0.3.2   2021-04-29 [1] RSPM (R 4.4.0)
 evaluate            1.0.3   2025-01-10 [1] RSPM (R 4.4.0)
 farver              2.1.2   2024-05-13 [1] RSPM (R 4.4.0)
 fastmap             1.2.0   2024-05-15 [1] RSPM (R 4.4.0)
 fs                * 1.6.6   2025-04-12 [1] RSPM (R 4.4.0)
 generics            0.1.4   2025-05-09 [1] RSPM (R 4.4.0)
 glue              * 1.8.0   2024-09-30 [1] RSPM (R 4.4.0)
 here              * 1.0.1   2020-12-13 [1] RSPM (R 4.4.0)
 hms                 1.1.3   2023-03-21 [1] RSPM (R 4.4.0)
 htmltools           0.5.8.1 2024-04-04 [1] RSPM (R 4.4.0)
 htmlwidgets         1.6.4   2023-12-06 [1] RSPM (R 4.4.0)
 httpuv              1.6.16  2025-04-16 [1] RSPM (R 4.4.0)
 jquerylib           0.1.4   2021-04-26 [1] RSPM (R 4.4.0)
 jsonlite          * 2.0.0   2025-03-27 [1] RSPM (R 4.4.0)
 KernSmooth          2.23-24 2024-05-17 [2] CRAN (R 4.4.2)
 knitr               1.50    2025-03-16 [1] RSPM (R 4.4.0)
 later               1.4.2   2025-04-08 [1] RSPM (R 4.4.0)
 lattice             0.22-6  2024-03-20 [2] CRAN (R 4.4.2)
 leafem              0.2.4   2025-05-01 [1] RSPM (R 4.4.0)
 leaflet             2.2.2   2024-03-26 [1] RSPM (R 4.4.0)
 leaflet.providers   2.0.0   2023-10-17 [1] RSPM (R 4.4.0)
 leafpop             0.1.0   2021-05-22 [1] RSPM (R 4.4.0)
 librarian           1.8.1   2021-07-12 [1] RSPM (R 4.4.0)
 lifecycle           1.0.4   2023-11-07 [1] RSPM (R 4.4.0)
 listviewer        * 4.0.0   2023-09-30 [1] RSPM (R 4.4.0)
 logger            * 0.4.0   2024-10-22 [1] RSPM (R 4.4.0)
 lubridate         * 1.9.4   2024-12-08 [1] RSPM (R 4.4.0)
 magrittr            2.0.3   2022-03-30 [1] RSPM (R 4.4.0)
 mapview           * 2.11.2  2023-10-13 [1] RSPM (R 4.4.0)
 Matrix              1.7-1   2024-10-18 [2] CRAN (R 4.4.2)
 memoise             2.0.1   2021-11-26 [1] RSPM (R 4.4.0)
 mime                0.13    2025-03-17 [1] RSPM (R 4.4.0)
 miniUI              0.1.1.1 2018-05-18 [1] RSPM (R 4.4.0)
 pillar              1.10.2  2025-04-05 [1] RSPM (R 4.4.0)
 pkgbuild            1.4.5   2024-10-28 [1] RSPM (R 4.4.0)
 pkgconfig           2.0.3   2019-09-22 [1] RSPM (R 4.4.0)
 pkgload             1.4.0   2024-06-28 [1] RSPM (R 4.4.0)
 png                 0.1-8   2022-11-29 [1] RSPM (R 4.4.0)
 profvis             0.4.0   2024-09-20 [1] RSPM (R 4.4.0)
 promises            1.3.2   2024-11-28 [1] RSPM (R 4.4.0)
 proxy               0.4-27  2022-06-09 [1] RSPM (R 4.4.0)
 purrr             * 1.0.4   2025-02-05 [1] RSPM (R 4.4.0)
 R6                  2.6.1   2025-02-15 [1] RSPM (R 4.4.0)
 raster              3.6-32  2025-03-28 [1] RSPM (R 4.4.0)
 RColorBrewer      * 1.1-3   2022-04-03 [1] RSPM (R 4.4.0)
 Rcpp                1.0.14  2025-01-12 [1] RSPM (R 4.4.0)
 readr             * 2.1.5   2024-01-10 [1] RSPM (R 4.4.0)
 remotes             2.5.0   2024-03-17 [1] RSPM (R 4.4.0)
 reticulate        * 1.42.0  2025-03-25 [1] RSPM (R 4.4.0)
 rlang               1.1.6   2025-04-11 [1] RSPM (R 4.4.0)
 rmarkdown           2.29    2024-11-04 [1] RSPM (R 4.4.0)
 rprojroot           2.0.4   2023-11-05 [1] RSPM (R 4.4.0)
 s2                  1.1.9   2025-05-23 [1] RSPM (R 4.4.0)
 sass                0.4.10  2025-04-11 [1] RSPM (R 4.4.0)
 satellite           1.0.5   2024-02-10 [1] RSPM (R 4.4.0)
 scales              1.4.0   2025-04-24 [1] RSPM (R 4.4.0)
 sessioninfo         1.2.2   2021-12-06 [1] RSPM (R 4.4.0)
 sf                * 1.0-21  2025-05-15 [1] RSPM (R 4.4.2)
 shiny               1.10.0  2024-12-14 [1] RSPM (R 4.4.0)
 sp                  2.2-0   2025-02-01 [1] RSPM (R 4.4.0)
 stringi             1.8.7   2025-03-27 [1] RSPM (R 4.4.0)
 stringr           * 1.5.1   2023-11-14 [1] RSPM (R 4.4.0)
 svglite             2.2.1   2025-05-12 [1] RSPM (R 4.4.0)
 systemfonts         1.2.3   2025-04-30 [1] RSPM (R 4.4.0)
 terra             * 1.8-54  2025-06-01 [1] RSPM (R 4.4.0)
 textshaping         1.0.1   2025-05-01 [1] RSPM (R 4.4.0)
 tibble            * 3.3.0   2025-06-08 [1] RSPM (R 4.4.0)
 tidyr             * 1.3.1   2024-01-24 [1] RSPM (R 4.4.0)
 tidyselect          1.2.1   2024-03-11 [1] RSPM (R 4.4.0)
 timechange          0.3.0   2024-01-18 [1] RSPM (R 4.4.0)
 tzdb                0.5.0   2025-03-15 [1] RSPM (R 4.4.0)
 units             * 0.8-7   2025-03-11 [1] RSPM (R 4.4.0)
 urlchecker          1.0.1   2021-11-30 [1] RSPM (R 4.4.0)
 usethis             3.1.0   2024-11-26 [1] RSPM (R 4.4.0)
 uuid                1.2-1   2024-07-29 [1] RSPM (R 4.4.0)
 vctrs               0.6.5   2023-12-01 [1] RSPM (R 4.4.0)
 withr               3.0.2   2024-10-28 [1] RSPM (R 4.4.0)
 wk                  0.9.4   2024-10-11 [1] RSPM (R 4.4.0)
 xfun                0.52    2025-04-02 [1] RSPM (R 4.4.0)
 xtable              1.8-4   2019-04-21 [1] RSPM (R 4.4.0)
 yaml              * 2.3.10  2024-07-26 [1] RSPM (R 4.4.0)

 [1] /usr/local/lib/R/site-library
 [2] /usr/local/lib/R/library

─ Python configuration ───────────────────────────────────────────────────────
 python:         /home/admin/.virtualenvs/CopernicusMarine/bin/python
 libpython:      /usr/lib/python3.12/config-3.12-x86_64-linux-gnu/libpython3.12.so
 pythonhome:     /home/admin/.virtualenvs/CopernicusMarine:/home/admin/.virtualenvs/CopernicusMarine
 version:        3.12.3 (main, Nov  6 2024, 18:32:19) [GCC 13.2.0]
 numpy:          /home/admin/.virtualenvs/CopernicusMarine/lib/python3.12/site-packages/numpy
 numpy_version:  2.2.5
 copernicusmarine:/home/admin/.virtualenvs/CopernicusMarine/lib/python3.12/site-packages/copernicusmarine
 
 NOTE: Python version was forced by use_python() function

──────────────────────────────────────────────────────────────────────────────