# Source code for gridwxcomp.prep_metadata

# -*- coding: utf-8 -*-
"""
This module has tools to read a CSV of climate station metadata information and
verify it has the contents necessary to proceed with the later steps. The
output from this module will be in a standardized format that is used by
the :mod:`gridwxcomp.ee_download` and :mod:`gridwxcomp.calc_bias_ratios`
modules for the main bias correction workflows.
"""
import ee
import os

import numpy as np
import pandas as pd                                                             
from pathlib import Path
from gridwxcomp.util import read_config, reproject_crs_for_point


def _read_station_list(station_path):
    """
    Helper function that reads the station list CSV file and returns a
    modified version as a :obj:`pandas.DataFrame` in which the file name of
    each station is replaced by the absolute path of its time series file.
    Renames some columns for consistency with other ``gridwxcomp`` functions
    and scripts.

    Time series files are searched for in the directory that contains
    ``station_path`` (or the current working directory when ``station_path``
    has no directory component). A file whose name up to the first ``'.'``
    equals the station file name is preferred; otherwise the first file (in
    sorted order) whose name contains the station file name is used. Stations
    without any matching file keep their extension-less file name and a
    warning is printed.

    Arguments:
        station_path (str): path to CSV file containing list of climate
            stations that will later be used to calculate monthly
            bias ratios to gridded data.

    Returns:
        station_list (:class:`pandas.DataFrame`): ``pandas.DataFrame`` that
            contains station name, latitude, longitude, and others for each
            climate station.

    Raises:
        ValueError: if any of the mandatory columns 'Latitude', 'Longitude',
            'Filename', or 'Station' is missing from the input file.
    """

    station_list = pd.read_csv(station_path)

    # make sure mandatory columns exist else abort
    need_cols = ['Latitude', 'Longitude', 'Filename', 'Station']
    if not set(need_cols).issubset(station_list.columns):
        # build a single string; passing a tuple to ValueError would render
        # the message as a tuple repr
        err_msg = (
            'One or more of the mandatory columns is missing from the '
            'station input file, it must contain: ' + ', '.join(need_cols)
        )
        raise ValueError(err_msg)

    station_list.rename(
            columns={
                'Latitude': 'STATION_LAT',
                'Longitude': 'STATION_LON',
                'Elev_m': 'STATION_ELEV_M',
                'Elev_FT': 'STATION_ELEV_FT',
                'Station': 'STATION_ID',
                'Filename': 'STATION_FILE_PATH'}, inplace=True)

    # get station name only for matching to file name without extension
    station_list['STATION_FILE_PATH'] = \
        station_list['STATION_FILE_PATH'].str.split('.').str.get(0)

    # time series files are expected next to the station CSV; an empty
    # directory component means the station CSV is in the cwd
    path_root = os.path.dirname(station_path)
    # sorted so that the chosen file does not depend on the arbitrary order
    # returned by os.listdir
    file_names = sorted(os.listdir(path_root if path_root else os.getcwd()))

    # index files by the part of their name before the first '.', mirroring
    # how the station file names were stripped above
    files_by_stem = {}
    for name in file_names:
        files_by_stem.setdefault(name.split('.')[0], name)

    # Work from snapshots of the columns and assign the result once at the
    # end: rewriting the column while iterating over it lets rows that share
    # a file name see an already-resolved path instead of the station name.
    stems = station_list['STATION_FILE_PATH'].tolist()
    station_ids = station_list['STATION_ID'].tolist()
    resolved = []
    for station, station_id in zip(stems, station_ids):
        match = None
        # missing (NaN) or empty file names cannot be matched to a file
        if isinstance(station, str) and station:
            # an exact name match wins over a substring match so that, e.g.,
            # "data" resolves to "data.csv" rather than "a_data.csv"
            match = files_by_stem.get(station)
            if match is None:
                # will accept files of any extension, e.g. xlx, csv, txt
                match = next((s for s in file_names if station in s), None)

        if match is None:
            print('WARNING: no file was found that matches station: ', station_id, '\nin directory: ',
                  os.path.abspath(path_root), '\nskipping.\n')
            resolved.append(station)
        else:
            resolved.append(os.path.abspath(os.path.join(path_root, match)))

    station_list['STATION_FILE_PATH'] = resolved

    return station_list


def prep_metadata(station_path, config_path, grid_name, 
        out_path='formatted_input.csv'):
    """
    Read list of climate stations in metadata and verify all needed parameters
    exist. An output CSV file is saved that is formatted in a way that
    is standardized for the variables that are needed by the subsequent 
    Earth Engine download and bias calculation modules. 

    Station time series files must be in the same directory as the main input
    to this function, i.e., the `station_path` metadata file.

    Arguments:
        station_path (str): path to CSV file containing metadata of climate
            stations that will later be used to calculate bias ratios to 
            the gridded dataset.
        config_path (str): path to config file containing projection info;
            its ``input_data_projection`` entry is used as the source
            coordinate reference system of the station coordinates.
        grid_name (str): name of the gridded dataset that is being used
            for comparison against observed data. It is used as the prefix
            of the ``GRID_ID`` column (``<grid_name>_<station ID>``).
        out_path (str): path to save output CSV, default is to save as 
            'formatted_input.csv' to current working directory. Missing
            parent directories are created.

    Returns:
        None

    Example:

        >>> from gridwxcomp import prep_metadata
        >>> prep_metadata(
        ...     'example_metadata.txt', 'config.ini', 'gridmet',
        ...     out_path='outfile.csv')
        
        outfile.csv will be created containing station and corresponding
        gridded data. This file is later used as input for
        :mod:`gridwxcomp.ee_download` and
        :mod:`gridwxcomp.calc_bias_ratios`.

    Important:
        Make sure the following column headers exist in your input station 
        metadata file (``station_path``) and are spelled exactly:

          * Latitude
          * Longitude
          * Station
          * Filename

        Also, the "Filename" column should match the names of the climate time
        series files that should be in the same directory as the station
        metadata file. For example, if one of the time series files is named
        "Bluebell_daily_data.csv" then the following are permissible entries
        as the "Filename": "Bluebell_daily_data" or "Bluebell_daily_data.csv".
        
    Raises:
        ValueError: if one or more of the following mandatory columns are 
            missing from the input CSV file (``station_path`` parameter): 
            'Longitude', 'Latitude', 'Station', or 'Filename'.   
    """

    # Create parent directories if necessary
    path_root = Path(out_path).parent
    if not path_root.is_dir():
        print('The directory: ', path_root.absolute(), ' does not exist, creating directory')
        os.makedirs(path_root, exist_ok=True)

    print('station list CSV: ', os.path.abspath(station_path))
    print('merged CSV will be saved to: ', os.path.abspath(out_path))

    config = read_config(config_path)

    stations = _read_station_list(station_path)
    # cast to str so station IDs that pandas parsed as numbers can still be
    # concatenated with the grid name prefix
    stations['GRID_ID'] = f'{grid_name}_' + stations['STATION_ID'].astype(str)

    # elevation in feet derived from meters
    if 'ELEV_M' in stations.columns:
        stations['ELEV_FT'] = stations.ELEV_M * 3.28084  # m to ft
    # _read_station_list renames 'Elev_m' to 'STATION_ELEV_M', so handle that
    # name as well, without overwriting a user-supplied elevation in feet
    if ('STATION_ELEV_M' in stations.columns
            and 'STATION_ELEV_FT' not in stations.columns):
        stations['STATION_ELEV_FT'] = stations['STATION_ELEV_M'] * 3.28084

    # Add WGS84 projection columns for earth engine requests
    source_crs = config['input_data_projection']
    lat_wgs84 = []
    lon_wgs84 = []
    for lon, lat in zip(stations['STATION_LON'], stations['STATION_LAT']):
        # helper takes and returns coordinates in (lon, lat) order
        new_lon, new_lat = reproject_crs_for_point(
            lon, lat, source_crs, 'EPSG:4326')
        lon_wgs84.append(new_lon)
        lat_wgs84.append(new_lat)

    stations['STATION_LAT_WGS84'] = lat_wgs84
    stations['STATION_LON_WGS84'] = lon_wgs84

    # save CSV
    stations.to_csv(out_path, index=False)