Source code for gridwxcomp.prep_metadata

# -*- coding: utf-8 -*-
"""
This module has tools to read a CSV of climate station metadata information and
verify it has the contents necessary to proceed with the later steps. The
output from this module will be in a standardized format that is used by
the :mod:`gridwxcomp.ee_download` and :mod:`gridwxcomp.calc_bias_ratios`
modules for the main bias correction workflows.
"""
import ee
import os

import numpy as np
import pandas as pd                                                             
from pathlib import Path
from gridwxcomp.util import read_config, reproject_crs_for_point


def _read_station_list(station_path):
    """
    Helper function that reads station list CSV file and return modified 
    version as a :obj:`Pandas.DataFrame` that includes file paths to each 
    station time series file. Renames some columns for consistency with other 
    ``gridwxcomp`` functions and scripts.

    Arguments:
        station_path (str): path to CSV file containing list of climate
            stations that will later be used to calculate monthly
            bias rations to gridded data.

    Returns:
        station_list (:class:`pandas.DataFrame`): ``Pandas.DataFrame`` that
            contains station name, latitude, longitude, and others for each
            climate station.

    """

    station_list = pd.read_csv(station_path)
    # mandatory columns 
    need_cols = ['Latitude', 'Longitude', 'Filename', 'Station']

    # make sure mandatory columns exist else abort
    station_cols = station_list.columns
    if not set(need_cols).issubset(set(station_cols)):
        err_msg = ('One or more of the mandatory columns is missing from the station input file, it must contain:',
                   ', '.join(c for c in need_cols))
        raise ValueError(err_msg)

    station_list.rename(
            columns={
                'Latitude': 'STATION_LAT',
                'Longitude': 'STATION_LON',
                'Elev_m': 'STATION_ELEV_M',
                'Elev_FT': 'STATION_ELEV_FT',
                'Station': 'STATION_ID',
                'Filename': 'STATION_FILE_PATH'}, inplace=True)

    # get station name only for matching to file name without extension
    station_list.STATION_FILE_PATH = station_list.STATION_FILE_PATH.str.split('.').str.get(0)

    # look at path for station CSV, look for time series files in same directory
    station_path_tuple = os.path.split(station_path)
    path_root = station_path_tuple[0]
    file_name = station_path_tuple[1]

    # look in parent directory that contains station CSV file
    if path_root != '' and file_name != '':
        file_names = os.listdir(path_root)
    # if station CSV file is in cwd look there
    else:
        file_names = os.listdir(os.getcwd())
    # match station name with time series Excel files full path,
    # assumes no other files in the directory have station names in their name
    # will accept files of any extension, e.g. xlx, csv, txt
    for i, station in enumerate(station_list.STATION_FILE_PATH):
        try:
            match = [s for s in file_names if station in s][0]
        except:
            match = None
        if match:
            station_list.loc[station_list.STATION_FILE_PATH == station, 'STATION_FILE_PATH'] = \
                os.path.abspath(os.path.join(path_root, match))
        else:
            missing_station = station_list.iloc[i]['STATION_ID']
            print('WARNING: no file was found that matches station: ', missing_station, '\nin directory: ',
                  os.path.abspath(path_root), '\nskipping.\n')
            continue

    return station_list


def prep_metadata(station_path, config_path, grid_name, out_path='formatted_input.csv'):
    """
    Read list of climate stations in metadata and verify all needed
    parameters exist. An output CSV file is saved that is formatted in a
    standardized way for the variables that are needed by the subsequent
    Earth Engine download and bias calculation modules. Station time series
    files must be in the same directory as the main input to this function,
    i.e., the `station_path` metadata file.

    Arguments:
        station_path (str): path to CSV file containing metadata of climate
            stations that will later be used to calculate bias ratios to the
            gridded dataset.
        config_path (str): path to config file containing projection info;
            its ``input_data_projection`` entry is used as the source
            coordinate reference system of the station coordinates.
        grid_name (str): name of the gridded dataset that is being used for
            comparison against observed data. It is used as the prefix of the
            ``GRID_ID`` column.
        out_path (str): path to save output CSV, default is to save as
            'formatted_input.csv' to current working directory.

    Returns:
        None

    Example:

        >>> from gridwxcomp.prep_metadata import prep_metadata
        >>> prep_metadata(
        ...     'example_metadata.txt',
        ...     'example_config.ini',
        ...     'my_grid',
        ...     out_path='outfile.csv')

        outfile.csv will be created containing station and corresponding
        gridded data. This file is later used as input for
        :mod:`gridwxcomp.ee_download` and :mod:`gridwxcomp.calc_bias_ratios`.

    Important:
        Make sure the following column headers exist in your input station
        metadata file (``station_path``) and are spelled exactly:

          * Latitude
          * Longitude
          * Station
          * Filename

        Also, the "Filename" column should match the names of the climate
        time series files that should be in the same directory as the station
        metadata file. For example, if one of the time series files is named
        "Bluebell_daily_data.csv" then the following are permissible entries
        as the "Filename": "Bluebell_daily_data" or
        "Bluebell_daily_data.csv".

    Raises:
        ValueError: if one or more of the following mandatory columns are
            missing from the input CSV file (``station_path`` parameter):
            'Longitude', 'Latitude', 'Station', or 'Filename'.

    """
    # Create parent directories if necessary
    path_root = Path(out_path).parent
    if not path_root.is_dir():
        print('The directory: ', path_root.absolute(), ' does not exist, creating directory')
        # exist_ok avoids failing if the directory appears between the check
        # above and this call
        os.makedirs(path_root, exist_ok=True)

    print('station list CSV: ', os.path.abspath(station_path))
    print('merged CSV will be saved to: ', os.path.abspath(out_path))

    config = read_config(config_path)
    stations = _read_station_list(station_path)

    # cast to str so station IDs that were parsed as numbers can be prefixed
    stations['GRID_ID'] = f'{grid_name}_' + stations['STATION_ID'].astype(str)

    # NOTE(review): _read_station_list renames 'Elev_m' to 'STATION_ELEV_M',
    # so this branch only runs when the input file has a literal 'ELEV_M'
    # column -- confirm whether 'STATION_ELEV_M' was intended here.
    if 'ELEV_M' in stations.columns:
        stations['ELEV_FT'] = stations.ELEV_M * 3.28084  # m to ft

    # Add WGS84 projection columns for earth engine requests.
    # reproject_crs_for_point is called with (lon, lat, source CRS, target
    # CRS) and its result is unpacked as (lon, lat).
    source_crs = config['input_data_projection']
    lon_wgs84 = []
    lat_wgs84 = []
    for lon, lat in zip(stations['STATION_LON'], stations['STATION_LAT']):
        new_lon, new_lat = reproject_crs_for_point(
            lon, lat, source_crs, 'EPSG:4326')
        lon_wgs84.append(new_lon)
        lat_wgs84.append(new_lat)

    stations['STATION_LAT_WGS84'] = lat_wgs84
    stations['STATION_LON_WGS84'] = lon_wgs84

    # save CSV
    stations.to_csv(out_path, index=False)