Source code for c3s_sm.misc

import os
import warnings
from glob import glob
import pandas as pd
import yaml
from parse import parse
from c3s_sm.const import fntempl as _default_template
import xarray as xr
from repurpose.process import parallel_process


[docs]def collect_ts_cov(data_path: str, n_proc=1, progressbar=False): """ Open all time series files in a directory (slow) and detect the temporal coverage. Parameters ---------- data_path : str Path where the cell files are stored. progressbar: bool, optional (default: False) Show progress bar when looping through files. Returns ------- periods: dict Periods coverged by the time series files. {(start, end): [cell, cell, ...], ...} """ fl = glob( os.path.join(data_path, '**', "[0-9][0-9][0-9][0-9].nc"), recursive=True) if len(fl) == 0: raise ValueError(f"No matching files found in {data_path}") def _func(f: str) -> tuple: cell = int(os.path.basename(f).split('.')[0]) ds = xr.open_dataset(f) start = pd.to_datetime(min(ds['time'].values)).to_pydatetime() end = pd.to_datetime(max(ds['time'].values)).to_pydatetime() parameters = list(ds.data_vars) return start, end, cell, parameters se = parallel_process( _func, ITER_KWARGS=dict(f=fl), show_progress_bars=progressbar, backend='threading', n_proc=n_proc) periods = {} parameters = None for start, end, cell, param in se: if (start, end) not in periods.keys(): periods[(start, end)] = [] periods[(start, end)].append(cell) if parameters is None: parameters = sorted(param) else: if sorted(param) != parameters: warnings.warn(f"Found different parameters for cell {cell}") return periods, parameters
[docs]def img_infer_file_props(path: str, fntempl: str = _default_template, start_from='last') -> dict: """ Parse file names to retrieve properties from :func:`c3s_sm.const.fntempl`. Does not open any files. """ files = glob(os.path.join(path, '**', '*.nc'), recursive=True) files.sort() if len(files) == 0: raise ValueError(f"No matching files for chosen template found in " f"the directory {path}") else: if start_from.lower() == 'last': files = files[::-1] elif start_from.lower() == 'first': pass else: raise NotImplementedError(f"`start_from` must be one of: " f"`first`, `last`.") for f in files: file_args = parse(fntempl, os.path.basename(f)) if file_args is None: continue return file_args.named raise ValueError(f"No matching files for chosen template found in the " f"directory {path}")
[docs]def read_summary_yml(path: str) -> dict: """ Read image summary and return fields as dict. """ path = os.path.join(path, 'overview.yml') with open(path, 'r') as stream: props = yaml.safe_load(stream) return props
[docs]def get_first_image_date(path: str, fntempl: str = _default_template) -> str: """ Parse files in the given directory (or any subdir) using the passed filename template. props will contain all fields specified in the template. the `datetime` field is required and used to determine the first image date. Parameters ---------- path: str Path to the directory containing the image files fntempl: str, optional The filename template used to parse image file names. Must contain a field `datetime` that indicates the location of the image time stamp in the filename. Returns ------- date: str Parse date from the first found image file that matches `fntempl`. """ try: props = img_infer_file_props(path, fntempl=fntempl, start_from='first') startdate = props['datetime'] except ValueError: raise ValueError('Could not infer start date from image files. ' 'Please specify startdate manually.') return startdate
[docs]def get_last_image_date(path: str, fntempl: str) -> str: """ Parse files in the given directory (or any subdir) using the passed filename template. props will contain all fields specified in the template. the `datetime` field is required and used to determine the last image date. Parameters ---------- path: str Path to the directory containing the image files fntempl: str (optional) The filename template used to parse image file names. Must contain a field `datetime` that indicates the location of the image time stamp in the filename. Returns ------- date: str Parse date from the last found image file that matches `fntempl`. """ try: props = img_infer_file_props(path, fntempl=fntempl, start_from='last') enddate = props['datetime'] except ValueError: raise ValueError('Could not infer end date from image files. ' 'Please specify enddate manually.') return enddate
[docs]def update_image_summary_file(data_path: str, out_file=None, fntempl: str = _default_template): """ Summarize image metadata as yml file Parameters ---------- data_path: str Root path to the image archive out_file: str, optional (default: None) Path to summary file. File will be created/updated. If not specified, then `data_path` is used. If a file already exists, it will be overwritten. fntempl: str, optional The filename template used to parse image file names. Must contain a field `datetime` that indicates the location of the image time stamp in the filename. """ first_image_date = get_first_image_date(data_path, fntempl=fntempl) last_image_date = get_last_image_date(data_path, fntempl=fntempl) props = img_infer_file_props(data_path, start_from='first') _ = props.pop("datetime") props['period_from'] = str(pd.to_datetime(first_image_date).date()) props['period_to'] = str(pd.to_datetime(last_image_date).date()) if out_file is None: out_file = os.path.join(data_path, f"overview.yml") with open(out_file, 'w') as f: yaml.dump(props, f, default_flow_style=False)
[docs]def update_ts_summary_file(data_path, props=None, collect_cov=False, **kwargs): """ Create a summary yml file that contains the most relevant information for a dataset. This information is then also used to update/extent the record. Parameters ---------- data_path: str Path where the time series files are stored and where the file overview.yml will be stored. props: dict, optional Additional information to write down collect_cov: bool, optional (False) Loop through all available cell files and detect their tempora coverage. Will add one or multiple `periodX` fields to the summary file, which indicate the temporal coverage of each cell. kwargs: Additional keyword arguments are passed to `collect_ts_cov` """ if collect_cov: periods = collect_ts_cov(data_path, **kwargs) else: periods = None try: old_props = read_summary_yml(data_path) except FileNotFoundError: old_props = {} props = {} if props is None else props if periods is not None: i = 1 for startend, cells in periods.items(): props[f'period{i}'] = dict( start=str(startend[0]), end=str(startend[1]), N=len(cells), cells=sorted(cells)) out_file = os.path.join(data_path, f"overview.yml") # keep old props (except datetime) for k, v in old_props.items(): if k not in props and not k.startswith('period'): props[k] = v for k, v in props.items(): if isinstance(v, tuple): props[k] = list(v) with open(out_file, 'w') as f: yaml.dump(props, f, default_flow_style=False, sort_keys=False) return props
[docs]def delete_empty_directories(path: str): """ Delete empty dirs in path """ for root, dirs, files in os.walk(path, topdown=False): for dir_name in dirs: dir_path = os.path.join(root, dir_name) # Check if the directory is empty if not os.listdir(dir_path): os.rmdir(dir_path)