Source code for weatherDB.utils.get_data

"""
Some utilities functions to download the needed data for the module to work.
"""
import requests
from pathlib import Path
from distutils.util import strtobool
import hashlib
import progressbar as pb

from ..config import config


[docs]
def download_ma_rasters(which="all", overwrite=None, update_user_config=False):
    """Download the multi annual rasters on which the regionalisation is based.

    The refined multi annual datasets that are downloaded are published on Zenodo [1]_.
    Every selected raster is first streamed to a ``*.part`` file next to its
    configured destination, checked against the MD5 checksum given in the
    Zenodo record and only then moved to the configured path. An already
    existing raster is therefore never replaced by a corrupted download.

    References
    ----------
    .. [1] Schmit, M.; Weiler, M. (2023). German weather services (DWD) multi annual meteorological rasters for the climate period 1991-2020 refined to 25m grid (1.0.0) [Data set]. Zenodo. https://doi.org/10.5281/zenodo.10066045

    Parameters
    ----------
    which : str or [str], optional
        Which raster to download.
        Options are "dwd", "hyras", "regnie" and "all".
        The default is "all".
    overwrite : bool, optional
        Should the multi annual rasters be downloaded even if they already exist?
        If None the user will be asked.
        The default is None.
    update_user_config : bool, optional
        Should the downloaded rasters be set as the regionalisation rasters in the user configuration file?
        The default is False.

    Raises
    ------
    ValueError
        If ``which`` contains an unknown raster name or
        if the checksum of a downloaded file doesn't match the Zenodo record.
    RuntimeError
        If a file request returns a status code other than 200 that is not an HTTP error.
    """
    # DOI of the multi annual dataset
    DOI = "10.5281/zenodo.10066045"

    # (connect, read) timeout in seconds, so a stalled server can't block forever
    timeout = (10, 60)

    # check which: validate every entry before "all" is expanded,
    # so that e.g. ["all", "foo"] is rejected as well
    if isinstance(which, str):
        which = [which]
    which = list(which)
    for w in which:
        if w not in ("all", "dwd", "hyras", "regnie"):
            raise ValueError(
                "which must be one of 'all', 'dwd', 'hyras' or 'regnie'.")
    if "all" in which:
        which = ["dwd", "hyras", "regnie"]

    # get zenodo record, the DOI redirects to the record page which ends with the record id
    doi_resp = requests.get(f"https://doi.org/{DOI}", timeout=timeout)
    doi_resp.raise_for_status()
    zenodo_id = doi_resp.url.rstrip("/").split("/")[-1]
    rec_resp = requests.get(
        f"https://zenodo.org/api/records/{zenodo_id}",
        timeout=timeout)
    rec_resp.raise_for_status()
    zenodo_rec = rec_resp.json()

    # download files
    for file in zenodo_rec["files"]:
        # the raster name is the first part of the file name, e.g. "hyras" or "dwd"
        file_key = file["key"].lower().split("_")[0].split("-")[0]
        if file_key not in which:
            continue

        # check if file is in config
        if f"data:rasters:{file_key}" not in config:
            print(f"Skipping {file_key} as it is not in your configuration.\nPlease add a section 'data:rasters:{file_key}' to your configuration file.")
            continue

        # check if file already exists
        # NOTE: strtobool is imported from distutils, which was removed in Python 3.12
        file_path = Path(config.get(f"data:rasters:{file_key}", "file"))
        if file_path.exists():
            skip = False
            if overwrite is False:
                skip = True
            elif overwrite is None:
                skip = not strtobool(input(
                    f"{file_key} already exists at {file_path}.\n"+
                    "Do you want to overwrite it? [y/n] "))

            if skip:
                print(f"Skipping {file_key} as overwriting is not allowed.")
                continue

        # check if the directory exists
        if not file_path.parent.exists():
            if not strtobool(input(
                    f"The directory \"{file_path.parent}\" does not exist.\n"+
                    "Do you want to create it? [y/n] ")):
                # without the directory the file can't be written, so don't even download it
                print(f"Skipping {file_key} as the directory \"{file_path.parent}\" was not created.")
                continue
            file_path.parent.mkdir(parents=True, exist_ok=True)

        # download to a temporary file and only replace the target after the checksum matched
        part_path = file_path.with_name(file_path.name + ".part")
        try:
            md5_hex = _stream_to_file(
                url=file["links"]["self"],
                target_path=part_path,
                label=file_key,
                timeout=timeout)

            # check checksum
            if md5_hex != file["checksum"].replace("md5:", ""):
                raise ValueError(
                    f"Checksum of {file_key} doesn't match. File might be corrupted.")

            part_path.replace(file_path)
        finally:
            # only still there if the download or the checksum test failed
            if part_path.exists():
                part_path.unlink()

        # update user config
        if update_user_config:
            if config.has_user_config:
                config.update_user_config(f"data:rasters:{file_key}", "file", str(file_path))
            else:
                print(f"No user configuration file found, therefor the raster '{file_key}' is not set in the user configuration file.")


def _stream_to_file(url, target_path, label, timeout):
    """Stream the content of ``url`` to ``target_path`` while showing a progress bar.

    Parameters
    ----------
    url : str
        The URL to download.
    target_path : pathlib.Path or str
        The file to write the downloaded bytes to. It gets overwritten if it exists.
    label : str
        The name shown in the prefix of the progress bar.
    timeout : float or tuple
        The timeout handed over to ``requests.get``.

    Returns
    -------
    str
        The hexadecimal MD5 digest of the bytes that were written.

    Raises
    ------
    RuntimeError
        If the request returns a status code other than 200 that is not an HTTP error.
    """
    md5 = hashlib.md5()

    # the context manager releases the connection, even if the download gets interrupted
    with requests.get(url, stream=True, timeout=timeout) as r:
        if r.status_code != 200:
            r.raise_for_status()  # only raises for 4xx and 5xx codes, so...
            raise RuntimeError(
                f'Request to {url} returned status code {r.status_code}')

        block_size = 64 * 1024
        file_size = int(r.headers.get('Content-Length', 0))
        pbar = pb.ProgressBar(
            max_value=file_size,
            prefix=f"downloading {label}: ",
            widgets=[ " ",
                pb.widgets.DataSize(),
                "/",
                pb.widgets.DataSize("max_value"),
                pb.widgets.AdaptiveTransferSpeed(
                    format='(%(scaled)5.1f %(prefix)s%(unit)-s/s) '),
                pb.widgets.Bar(), " ",
                pb.widgets.Percentage(),
                pb.widgets.ETA()],
            line_breaks=False,
            redirect_stdout=True
            ).start()
        try:
            n_bytes = 0
            with open(target_path, "wb") as f:
                for chunk in r.iter_content(block_size):
                    f.write(chunk)
                    md5.update(chunk)
                    n_bytes += len(chunk)
                    # clamp the value, as the bar rejects values above max_value,
                    # e.g. when the server sent no Content-Length header
                    pbar.update(min(n_bytes, file_size))
        finally:
            # finish also on errors, to give back the redirected stdout
            pbar.finish()

    return md5.hexdigest()




[docs]
def download_dem(overwrite=None, extent=(5.3, 46.1, 15.6, 55.4), update_user_config=False):
    """Download the newest DEM data from the Copernicus Sentinel dataset.

    Only the GLO-30 DEM, which has a 30m resolution, is downloaded as it is freely available.
    If you register as a scientific researcher also the EEA-10, with 10 m resolution, is available.
    You will have to download the data yourself and define it in the configuration file.

    After downloading the data, the files are merged and saved as a single tif file in the data directory in a subfolder called 'DEM'.
    To use the DEM data in the WeatherDB, you will have to define the path to the tif file in the configuration file.

    Source:
    Copernicus DEM - Global and European Digital Elevation Model. Digital Surface Model (DSM) provided in 3 different resolutions (90m, 30m, 10m) with varying geographical extent (EEA: European and GLO: global) and varying format (INSPIRE, DGED, DTED). DOI:10.5270/ESA-c5d3d65.

    Parameters
    ----------
    overwrite : bool, optional
        Should the DEM data be downloaded even if it already exists?
        If None the user will be asked.
        The default is None.
    extent : tuple, optional
        The extent in WGS84 of the DEM data to download,
        given as (min. longitude, min. latitude, max. longitude, max. latitude).
        Only tiles with a northern latitude and an eastern longitude in their name are considered.
        The default is the boundary of germany + ~40km = (5.3, 46.1, 15.6, 55.4).
    update_user_config : bool, optional
        Should the downloaded DEM be set as the used DEM in the user configuration file?
        The default is False.

    Raises
    ------
    ValueError
        If no DEM tile of the selected dataset lies inside of the given extent.
    """
    # import necessary modules
    import rasterio as rio
    from rasterio.merge import merge
    import tarfile
    import shutil
    from tempfile import TemporaryDirectory
    from contextlib import ExitStack
    import re
    import json

    # (connect, read) timeout in seconds, so a stalled server can't block forever
    timeout = (10, 120)

    # get dem_dir
    base_dir = Path(config.get("data", "base_dir"))
    dem_dir = base_dir / "DEM"
    dem_dir.mkdir(parents=True, exist_ok=True)

    # get available datasets
    prism_url = "https://prism-dem-open.copernicus.eu/pd-desk-open-access/publicDemURLs"
    avl_ds_resp = requests.get(
        prism_url,
        headers={"Accept": "json"},
        timeout=timeout)
    avl_ds_resp.raise_for_status()
    avl_ds_req = json.loads(avl_ds_resp.text)

    # the year, the part of the year and the resolution are parsed from the dataset ID
    avl_ds = [{
        "id": e["datasetId"],
        "year": int(e["datasetId"].split("/")[1].split("_")[0]),
        "year_part": int(e["datasetId"].split("/")[1].split("_")[1]),
        "resolution": int(e["datasetId"].split("-")[2]),
        } for e in avl_ds_req]

    # select newest and highest resolution dataset
    # the resolution is negated, so that the smallest cell size gets sorted to the end
    ds_id = sorted(
        avl_ds,
        key=lambda x: (-x["resolution"], x["year"], x["year_part"])
        )[-1]["id"]

    # check if dataset already exists
    # NOTE: strtobool is imported from distutils, which was removed in Python 3.12
    dem_file = dem_dir / f'{ds_id.replace("/", "__")}.tif'
    if dem_file.exists():
        print(f"The DEM data already exists at {dem_file}.")
        if overwrite is None:
            overwrite = strtobool(input("Do you want to overwrite it? [y/n] "))
        if not overwrite:
            print("Skipping, because overwritting was turned of.")
            return
        else:
            print("Overwriting the dataset.")

    # selecting DEM tiles
    print(f"getting available tiles for Copernicus dataset '{ds_id}'")
    ds_files_resp = requests.get(
        f"{prism_url}/{ds_id.replace('/', '__')}",
        headers={"Accept": "json"},
        timeout=timeout)
    ds_files_resp.raise_for_status()
    ds_files_req = json.loads(ds_files_resp.text)

    # the latitude and longitude are parsed from the tile file name
    re_tile = re.compile(r".*/Copernicus_DSM_\d{2}_N\d*_\d{2}_E\d*.*")
    ds_files_all = [
        {"lat": int(Path(f["nativeDemUrl"]).stem.split("_")[3][1:]),
         "long": int(Path(f["nativeDemUrl"]).stem.split("_")[5][1:]),
         **f} for f in ds_files_req if re_tile.match(f["nativeDemUrl"])]

    # the lower bounds are widened by one tile size (1 degree)
    res_deg = 1
    ds_files = list(filter(
        lambda x: (
            (extent[0] - res_deg) < x["long"] < extent[2] and
            (extent[1] - res_deg) < x["lat"] < extent[3]
            ),
        ds_files_all))
    if not ds_files:
        # fail early with a clear message, instead of an obscure error while merging
        raise ValueError(
            f"No DEM tiles of the dataset '{ds_id}' were found for the extent {extent}.")

    # download DEM tiles
    print("downloading tiles")
    with TemporaryDirectory() as tmp_dir:
        tmp_dir_fp = Path(tmp_dir)
        for f in pb.progressbar(ds_files):
            tile_resp = requests.get(f["nativeDemUrl"], timeout=timeout)
            # don't store an error page as tar file
            tile_resp.raise_for_status()
            with open(tmp_dir_fp / Path(f["nativeDemUrl"]).name, "wb") as d:
                d.write(tile_resp.content)
        print("downloaded all files")

        # extracting tifs from tars
        re_dem = re.compile(r"^.*\/DEM\/.*\.tif$")
        re_info = re.compile(r"^.*\/INFO\/.*\.pdf$")
        tmp_eula_fp = None
        for tar_fp in pb.progressbar(list(tmp_dir_fp.glob("*.tar"))):
            with tarfile.open(tar_fp) as t:
                names = t.getnames()

                # extract dem tif
                name = list(filter(re_dem.match, names))[0]
                with open(tmp_dir_fp/f"{name.split('/')[-1]}", "wb") as d:
                    d.write(t.extractfile(name).read())

                # extract info contract, one copy from the first tar that has one is enough
                if tmp_eula_fp is None:
                    info_name = next(filter(re_info.match, names), None)
                    if info_name is not None:
                        tmp_eula_fp = tmp_dir_fp/f"{info_name.split('/')[-1]}"
                        with open(tmp_eula_fp, "wb") as d:
                            d.write(t.extractfile(info_name).read())

            # remove tar
            tar_fp.unlink()

        # merge files
        # the ExitStack closes all the opened tiles again,
        # so that no file handle is left open when the temporary directory gets removed
        with ExitStack() as stack:
            srcs = [stack.enter_context(rio.open(f))
                    for f in tmp_dir_fp.glob("*.tif")]
            dem_np, dem_tr = merge(srcs)
            dem_meta = srcs[0].meta.copy()

        # index 0 of the merged array's shape is skipped; height and width come from indices 1 and 2
        dem_meta.update({
            "driver": "GTiff",
            "height": dem_np.shape[1],
            "width": dem_np.shape[2],
            "transform": dem_tr
        })
        with rio.open(dem_file, "w", **dem_meta) as d:
            d.write(dem_np)

        # copy info contract
        if tmp_eula_fp is not None:
            shutil.copyfile(
                tmp_eula_fp,
                dem_dir / tmp_eula_fp.name
            )
        else:
            print("No info contract (pdf) was found in the downloaded tiles.")

    print(f"created DEM at '{dem_file}'.")

    # update user config
    if update_user_config:
        if config.has_user_config:
            config.update_user_config("data:rasters", "dems", str(dem_file))
            return
        else:
            print("No user configuration file found, therefor the DEM is not set in the user configuration file.")

    print("To use the DEM data in the WeatherDB, you will have to define the path to the tif file in the user configuration file.")