Source code for weatherDB.utils.get_data

"""
Some utility functions to download the data needed for the module to work.
"""
import requests
from pathlib import Path
from distutils.util import strtobool
import hashlib
import progressbar as pb

from ..config import config

def download_ma_rasters(which="all", overwrite=None, update_user_config=False):
    """Get the multi annual rasters on which the regionalisation is based.

    The refined multi annual datasets that are downloaded are published on Zenodo [1]_

    References
    ----------
    .. [1] Schmit, M.; Weiler, M. (2023). German weather services (DWD) multi annual meteorological rasters for the climate period 1991-2020 refined to 25m grid (1.0.0) [Data set]. Zenodo. https://doi.org/10.5281/zenodo.10066045

    Parameters
    ----------
    which : str or [str], optional
        Which raster to download.
        Options are "dwd", "hyras", "regnie" and "all".
        The default is "all".
    overwrite : bool, optional
        Should the multi annual rasters be downloaded even if they already exist?
        If None the user will be asked.
        The default is None.
    update_user_config : bool, optional
        Should the downloaded rasters be set as the regionalisation rasters in the user configuration file?
        The default is False.

    Raises
    ------
    ValueError
        If ``which`` contains an unknown option or if the MD5 checksum of a
        downloaded file does not match the checksum published by Zenodo.
    RuntimeError
        If a file download answers with a non-200 status code that is not
        reported as an HTTP error by ``requests``.
    """
    # DOI of the multi annual dataset
    DOI = "10.5281/zenodo.10066045"
    valid_keys = ("dwd", "hyras", "regnie")

    # check which: validate *every* entry before expanding "all", so that
    # e.g. ["all", "typo"] is rejected the same way as ["typo", "all"]
    if isinstance(which, str):
        which = [which]
    which = list(which)
    for w in which:
        if w != "all" and w not in valid_keys:
            raise ValueError(
                "which must be one of 'all', 'dwd', 'hyras' or 'regnie'.")
    if "all" in which:
        which = list(valid_keys)

    # get zenodo record
    # the DOI resolver redirects to the record page; its last URL part is the id
    zenodo_id = requests.get(
        f"https://doi.org/{DOI}"
    ).url.split("/")[-1]
    zenodo_resp = requests.get(
        f"https://zenodo.org/api/records/{zenodo_id}"
    )
    zenodo_resp.raise_for_status()
    zenodo_rec = zenodo_resp.json()

    # download files
    for file in zenodo_rec["files"]:
        file_key = file["key"].lower().split("_")[0].split("-")[0]
        if file_key not in which:
            continue

        # check if file is in config
        section = f"data:rasters:{file_key}"
        if section not in config:
            print(f"Skipping {file_key} as it is not in your configuration.\nPlease add a section 'data:rasters:{file_key}' to your configuration file.")
            continue

        # check if file already exists
        file_path = Path(config.get(section, "file"))
        if file_path.exists():
            skip = False
            if overwrite is False:
                skip = True
            elif overwrite is None:
                skip = not strtobool(input(
                    f"{file_key} already exists at {file_path}.\n"+
                    "Do you want to overwrite it? [y/n] "))

            if skip:
                print(f"Skipping {file_key} as overwriting is not allowed.")
                continue

        # check if the directory exists
        if not file_path.parent.exists():
            if strtobool(input(
                    f"The directory \"{file_path.parent}\" does not exist.\n"+
                    "Do you want to create it? [y/n] ")):
                file_path.parent.mkdir(parents=True)
            else:
                # without the directory the file can not be written
                print(f"Skipping {file_key} as the directory \"{file_path.parent}\" does not exist.")
                continue

        # download file and check checksum
        _download_ma_raster_file(
            url=file["links"]["self"],
            file_path=file_path,
            file_key=file_key,
            expected_md5=file["checksum"].replace("md5:", ""))

        # update user config
        if update_user_config:
            if config.has_user_config:
                config.update_user_config(section, "file", str(file_path))
            else:
                print(f"No user configuration file found, therefor the raster '{file_key}' is not set in the user configuration file.")


def _download_ma_raster_file(url, file_path, file_key, expected_md5):
    """Stream one raster file to ``file_path`` and verify its MD5 checksum.

    The data is first written to a ``.part`` file next to the target and only
    moved into place after the checksum matched, so an interrupted or corrupted
    download never replaces an already existing raster.

    Parameters
    ----------
    url : str
        The download URL of the file.
    file_path : pathlib.Path
        The final location of the downloaded file.
    file_key : str
        The short name of the raster, used for messages and the progress bar.
    expected_md5 : str
        The expected MD5 hex digest of the file.

    Raises
    ------
    ValueError
        If the checksum of the downloaded data doesn't match ``expected_md5``.
    RuntimeError
        If the server answers with a non-200 status code that is not reported
        as an HTTP error by ``requests``.
    """
    block_size = 1024
    tmp_path = file_path.with_name(file_path.name + ".part")
    md5 = hashlib.md5()

    # the context manager releases the streamed connection in every case
    with requests.get(url, stream=True) as r:
        if r.status_code != 200:
            # raise_for_status() only raises for 4xx/5xx codes, so every
            # other unexpected status is reported explicitly
            r.raise_for_status()
            raise RuntimeError(
                f'Request to {url} returned status code {r.status_code}')

        file_size = int(r.headers.get('Content-Length', 0))
        try:
            # max_error=False: a missing or too small Content-Length lets the
            # bar grow instead of raising a ValueError in the middle of the download
            with pb.ProgressBar(
                    max_value=file_size,
                    max_error=False,
                    prefix=f"downloading {file_key}: ",
                    widgets=[
                        " ",
                        pb.widgets.DataSize(),
                        "/",
                        pb.widgets.DataSize("max_value"),
                        pb.widgets.AdaptiveTransferSpeed(
                            format='(%(scaled)5.1f %(prefix)s%(unit)-s/s) '),
                        pb.widgets.Bar(),
                        " ",
                        pb.widgets.Percentage(),
                        pb.widgets.ETA()],
                    line_breaks=False,
                    redirect_stdout=True) as pbar:
                downloaded = 0
                with open(tmp_path, "wb") as f:
                    for chunk in r.iter_content(block_size):
                        f.write(chunk)
                        md5.update(chunk)
                        # count the real amount of bytes, the last chunk is
                        # usually smaller than block_size
                        downloaded += len(chunk)
                        pbar.update(downloaded)

            # check checksum
            if md5.hexdigest() != expected_md5:
                raise ValueError(
                    f"Checksum of {file_key} doesn't match. File might be corrupted.")

            tmp_path.replace(file_path)
        finally:
            # only present if the download failed or the checksum didn't match
            if tmp_path.exists():
                tmp_path.unlink()
def download_dem(overwrite=None, extent=(5.3, 46.1, 15.6, 55.4), update_user_config=False):
    """Download the newest DEM data from the Copernicus Sentinel dataset.

    Only the GLO-30 DEM, which has a 30m resolution, is downloaded as it is freely available.
    If you register as a scientific researcher also the EEA-10, with 10 m resolution, is available.
    You will have to download the data yourself and define it in the configuration file.

    After downloading the data, the files are merged and saved as a single tif file in the data directory in a subfolder called 'DEM'.
    To use the DEM data in the WeatherDB, you will have to define the path to the tif file in the configuration file.

    Source:
    Copernicus DEM - Global and European Digital Elevation Model. Digital Surface Model (DSM) provided in 3 different resolutions (90m, 30m, 10m) with varying geographical extent (EEA: European and GLO: global) and varying format (INSPIRE, DGED, DTED). DOI:10.5270/ESA-c5d3d65.

    Parameters
    ----------
    overwrite : bool, optional
        Should the DEM data be downloaded even if it already exists?
        If None the user will be asked.
        The default is None.
    extent : tuple, optional
        The extent in WGS84 of the DEM data to download, given as
        (min longitude, min latitude, max longitude, max latitude).
        The default is the boundary of germany + ~40km = (5.3, 46.1, 15.6, 55.4).
    update_user_config : bool, optional
        Should the downloaded DEM be set as the used DEM in the user configuration file?
        The default is False.

    Raises
    ------
    ValueError
        If no dataset is offered by the Copernicus service, if no tile
        intersects ``extent`` or if a downloaded archive contains no DEM tif.
    requests.HTTPError
        If one of the requests to the Copernicus service fails.
    """
    # import necessary modules
    import rasterio as rio
    from rasterio.merge import merge
    import tarfile
    import shutil
    from contextlib import ExitStack
    from tempfile import TemporaryDirectory
    import re
    import json

    # get dem_dir
    base_dir = Path(config.get("data", "base_dir"))
    dem_dir = base_dir / "DEM"
    dem_dir.mkdir(parents=True, exist_ok=True)

    # get available datasets
    prism_url = "https://prism-dem-open.copernicus.eu/pd-desk-open-access/publicDemURLs"
    avl_ds_resp = requests.get(prism_url, headers={"Accept": "json"})
    avl_ds_resp.raise_for_status()
    avl_ds_req = json.loads(avl_ds_resp.text)
    avl_ds = [{
        "id": e["datasetId"],
        "year": int(e["datasetId"].split("/")[1].split("_")[0]),
        "year_part": int(e["datasetId"].split("/")[1].split("_")[1]),
        "resolution": int(e["datasetId"].split("-")[2]),
        } for e in avl_ds_req]
    if not avl_ds:
        raise ValueError(
            f"No DEM dataset is available at '{prism_url}'.")

    # select newest and highest resolution dataset
    # (the resolution is negated, as a smaller number means a finer grid)
    ds_id = sorted(
        avl_ds,
        key=lambda x: (-x["resolution"], x["year"], x["year_part"])
        )[-1]["id"]

    # check if dataset already exists
    dem_file = dem_dir / f'{ds_id.replace("/", "__")}.tif'
    if dem_file.exists():
        print(f"The DEM data already exists at {dem_file}.")
        if overwrite is None:
            overwrite = strtobool(input("Do you want to overwrite it? [y/n] "))
        if not overwrite:
            print("Skipping, because overwritting was turned of.")
            return
        else:
            print("Overwriting the dataset.")

    # selecting DEM tiles
    print(f"getting available tiles for Copernicus dataset '{ds_id}'")
    ds_files_resp = requests.get(
        f"{prism_url}/{ds_id.replace('/', '__')}",
        headers={"Accept": "json"})
    ds_files_resp.raise_for_status()
    ds_files_req = json.loads(ds_files_resp.text)

    # tile names look like "Copernicus_DSM_10_N46_00_E005_00"
    re_tile = re.compile(r".*/Copernicus_DSM_\d{2}_[NS]\d+_\d{2}_[EW]\d+.*")
    ds_files_all = []
    for f in ds_files_req:
        if not re_tile.match(f["nativeDemUrl"]):
            continue
        lat, long = _dem_tile_origin(f["nativeDemUrl"])
        ds_files_all.append({"lat": lat, "long": long, **f})

    # the tiles are named after their lower left corner and span res_deg degrees,
    # therefor tiles starting up to res_deg below the extent also intersect it
    res_deg = 1
    ds_files = [
        f for f in ds_files_all
        if (extent[0] - res_deg) < f["long"] < extent[2]
        and (extent[1] - res_deg) < f["lat"] < extent[3]]
    if not ds_files:
        raise ValueError(
            f"No DEM tiles of the dataset '{ds_id}' intersect the extent {tuple(extent)}.")

    # download DEM tiles
    print("downloading tiles")
    with TemporaryDirectory() as tmp_dir:
        tmp_dir_fp = Path(tmp_dir)
        for f in pb.progressbar(ds_files):
            tile_url = f["nativeDemUrl"]
            # stream to disk, so that a tile never has to fit into memory, and
            # fail early instead of saving an error page as tar archive
            with requests.get(tile_url, stream=True) as r:
                r.raise_for_status()
                with open(tmp_dir_fp / Path(tile_url).name, "wb") as d:
                    for chunk in r.iter_content(chunk_size=1024 * 1024):
                        d.write(chunk)
        print("downloaded all files")

        # extracting tifs from tars
        re_dem_tif = re.compile(r"^.*\/DEM\/.*\.tif$")
        re_info_pdf = re.compile(r"^.*\/INFO\/.*\.pdf$")
        eula_extracted = False
        # the tars are listed before the loop, as they get deleted inside of it
        for f in pb.progressbar(list(tmp_dir_fp.glob("*.tar"))):
            with tarfile.open(f) as t:
                names = t.getnames()

                # extract dem tif
                tif_names = list(filter(re_dem_tif.match, names))
                if not tif_names:
                    raise ValueError(
                        f"The archive '{f.name}' doesn't contain a DEM tif file.")
                _extract_tar_member(t, tif_names[0], tmp_dir_fp)

                # extract info contract, one copy is enough for all the tiles
                if not eula_extracted:
                    pdf_names = list(filter(re_info_pdf.match, names))
                    if pdf_names:
                        _extract_tar_member(t, pdf_names[0], tmp_dir_fp)
                        eula_extracted = True

            # remove tar
            f.unlink()

        # merge files
        # the ExitStack closes all the tiles again, otherwise the temporary
        # directory can't be removed on systems that lock open files
        with ExitStack() as stack:
            srcs = [stack.enter_context(rio.open(f))
                    for f in tmp_dir_fp.glob("*.tif")]
            dem_np, dem_tr = merge(srcs)
            dem_meta = srcs[0].meta.copy()
        dem_meta.update({
            "driver": "GTiff",
            "height": dem_np.shape[1],
            "width": dem_np.shape[2],
            "transform": dem_tr
        })
        with rio.open(dem_file, "w", **dem_meta) as d:
            d.write(dem_np)

        # copy info contract
        tmp_eula_fp = next(tmp_dir_fp.glob("*.pdf"), None)
        if tmp_eula_fp is not None:
            shutil.copyfile(
                tmp_eula_fp,
                dem_dir / tmp_eula_fp.name
            )

    print(f"created DEM at '{dem_file}'.")

    # update user config
    if update_user_config:
        if config.has_user_config:
            config.update_user_config("data:rasters", "dems", str(dem_file))
            return
        else:
            print("No user configuration file found, therefor the DEM is not set in the user configuration file.")

    print("To use the DEM data in the WeatherDB, you will have to define the path to the tif file in the user configuration file.")


def _dem_tile_origin(url):
    """Get the (lat, long) in degrees encoded in the name of a Copernicus DEM tile.

    The tokens "N46"/"S46" and "E005"/"W005" of the file name are converted
    to signed integers, southern and western values become negative.
    """
    parts = Path(url).stem.split("_")
    lat_token, long_token = parts[3], parts[5]
    lat = int(lat_token[1:])
    long = int(long_token[1:])
    if lat_token[0] == "S":
        lat = -lat
    if long_token[0] == "W":
        long = -long
    return lat, long


def _extract_tar_member(tar, name, target_dir):
    """Write the member ``name`` of the opened tar archive flat into ``target_dir``."""
    import shutil

    with open(target_dir / name.split("/")[-1], "wb") as d:
        shutil.copyfileobj(tar.extractfile(name), d)