# Source code for pygeochemtools.geochem.aggregation

"""Functions to calculate the max chem value down hole

.. currentmodule:: pygeochemtools.geochem.aggregation
.. moduleauthor:: Rian Dutch <riandutch@gmail.com>
"""

import pandas as pd
import numpy as np
from pathlib import Path
from typing import Union


def max_dh_chem(
    input_data: Union[str, Path, pd.DataFrame], drillhole_id: str
) -> pd.DataFrame:
    """Return the row holding the maximum value in each drillhole.

    Aggregates processed single element geochemical data and keeps, for
    every drillhole, the row with the largest ``converted_ppm`` value.
    Requires long format data containing a ``converted_ppm`` column.

    Args:
        input_data (Union[str, Path, pd.DataFrame]): Path to a clean and
            processed single element dataset in csv format, or a pandas
            dataframe of the same data.
        drillhole_id (str): Column header containing the drillhole identifier.

    Raises:
        ValueError: If a path is given that is not an existing ``.csv`` file.

    Returns:
        pd.DataFrame: Dataframe containing only the row with the maximum
        value from each drillhole. Drillholes without any non-missing
        ``converted_ppm`` value are omitted.
    """
    if isinstance(input_data, (str, Path)):
        path = Path(input_data)
        if not (path.is_file() and path.suffix == ".csv"):
            raise ValueError("Ensure file is a valid .csv file")
        df = pd.read_csv(path)
    else:
        df = input_data

    # Rows without a value can never be a maximum. Dropping them first
    # (on a new frame, leaving the caller's data untouched) keeps idxmax
    # from yielding a missing label for drillholes whose values are all NaN,
    # which would otherwise make the .loc lookup below fail.
    df = df.dropna(subset=["converted_ppm"])

    max_labels = df.groupby([drillhole_id])["converted_ppm"].idxmax()
    return df.loc[max_labels]
def max_dh_chem_interval(
    input_data: Union[str, Path, pd.DataFrame],
    interval: int,
    drillhole_id: str,
    start_depth_label: str,
    end_depth_label: str,
) -> pd.DataFrame:
    """Return the maximum value in each down hole interval of each drillhole.

    Aggregates processed single element geochemical data by binning every
    sample on the median of its start and end depth, then keeping the row
    with the largest ``converted_ppm`` value per drillhole and bin.
    Requires long format data containing a ``converted_ppm`` column.

    Args:
        input_data (Union[str, Path, pd.DataFrame]): Input single element
            geochemical data, in long form, as either a path to a csv input
            file or a pandas dataframe. A dataframe input is not modified.
        interval (int): The interval, in whole meters, over which to
            aggregate down hole. Must be positive.
        drillhole_id (str): Column header containing the drillhole identifier.
        start_depth_label (str): Column header containing the start or from
            depth data.
        end_depth_label (str): Column header containing the finish or to
            depth data.

    Raises:
        ValueError: If a path is given that is not an existing ``.csv``
            file, or if ``interval`` is not positive.

    Returns:
        pd.DataFrame: Dataframe containing the row with the maximum value for
        each specified interval, with added ``median_depth`` and ``bin``
        columns.
    """
    if isinstance(input_data, (str, Path)):
        path = Path(input_data)
        if not (path.is_file() and path.suffix == ".csv"):
            raise ValueError("Ensure file is a valid .csv file")
        df = pd.read_csv(path)
    else:
        # Work on a copy so the helper columns added below and the row
        # filtering do not leak into the caller's dataframe.
        df = input_data.copy()

    if interval <= 0:
        raise ValueError("interval must be a positive number")

    # Median of the from/to depths of each sample, ignoring missing values.
    df["median_depth"] = df[[start_depth_label, end_depth_label]].median(axis=1)

    max_depth = df["median_depth"].max()
    if pd.isna(max_depth) or max_depth < 0:
        # No sample has a usable depth, so there is nothing to aggregate.
        df["bin"] = np.nan
        return df.iloc[0:0]

    # Bins are left-closed, so the last bin must end strictly beyond the
    # deepest sample; otherwise that sample falls outside every bin and is
    # silently lost. Specifying the number of bins (rather than an end
    # depth) avoids the end being rounded down to a multiple of `interval`.
    n_bins = int(np.floor(max_depth / interval)) + 1
    bins = pd.interval_range(start=0, periods=n_bins, freq=interval, closed="left")
    df["bin"] = pd.cut(df["median_depth"], bins=bins)

    # Samples without a value can never be a maximum.
    df = df.dropna(subset=["converted_ppm"])

    # observed=True restricts the grouping to drillhole/bin combinations that
    # actually occur, so idxmax is never asked about an empty group.
    grouped = df.groupby([drillhole_id, "bin"], observed=True)
    max_labels = grouped["converted_ppm"].idxmax()
    return df.loc[max_labels]