Source code for soilstats.analysis.analysis

from .model import Model


[docs] class Analyse: """Mixin class with analysis methods for soil data.""" # column names used in the analysis
[docs] _grouping = ["lat", "lon", "units"] # standard grouping for all methods
[docs] _valuename = "values." # e.g. "values.mean"
[docs] _propertyname = "property"
[docs] _depthname = "depth"
@property
[docs] def df(self): """Data from the SoilGrids API as a pandas DataFrame.""" return self.get_data()
[docs] def get_data(self): """Return data from the SoilGrids API as a data frame.""" return NotImplemented
[docs] def top_property(self, properties = None, value = "mean"): """Return dataframe with the highest scoring properties for each coordinate.""" properties = self.properties if properties is None else properties valuecol = f"{self._valuename}{value}" df = self._select_content(self.df, self._propertyname, properties) df = self._numeric_and_remove_nans(df, valuecol) max_indices = df.groupby(self._grouping)[valuecol].idxmax(skipna=True) columns = self._grouping + [self._propertyname, valuecol] return df.loc[max_indices, columns].reset_index(drop=True)
[docs] def max_values(self, properties = None, value = "mean"): """Return a dataframe with the highest values per property for each coordinate. Given multiple depths in the data frame, this method will return the highest value for each location. Args: properties (list, optional): properties to include. Defaults to None, which includes all properties. value (str, optional): value to consider. Defaults to "mean". """ properties = self.properties if properties is None else properties valuecol = f"{self._valuename}{value}" df = self._select_content(self.df, self._propertyname, properties) df = self._numeric_and_remove_nans(df, valuecol) grouping = self._grouping + [self._propertyname] max_indices = df.groupby(grouping)[valuecol].idxmax(skipna=True) columns = self._grouping + [self._propertyname, self._depthname, valuecol] return df.loc[max_indices, columns].reset_index(drop=True)
[docs] def mean_values(self, properties = None, value = "mean"): """Return a dataframe with the average values per property for each coordinate. Given multiple depths in the data frame, this method will return averages for each location. Args: properties (list, optional): properties to include. Defaults to None, which includes all properties. value (str, optional): value to consider. Defaults to "mean". """ properties = self.properties if properties is None else properties valuecol = f"{self._valuename}{value}" df = self._select_content(self.df, self._propertyname, properties) df = self._numeric_and_remove_nans(df, valuecol) grouping = self._grouping + [self._propertyname] return df.groupby(grouping).agg({valuecol: "mean"}).reset_index()
@classmethod
[docs] def _numeric_and_remove_nans(cls, df, col): """Convert specific column to numeric and remove NaNs.""" df[col] = df[col].astype(float) df.dropna(subset=[col], inplace=True) return df
@classmethod
[docs] def _select_content(cls, df, col, content): """Select rows with specific content in a column.""" return df[df[col].isin(content)]
[docs] def regression(self, formula): """Perform regression analysis.""" df = self._pivot_for_model(self.df) return Model(formula = formula, data = df)
[docs] def summary(self): """Return summary statistics.""" return NotImplemented
@classmethod
[docs] def _pivot_for_model(cls, df): """Perform a pivot on the data frame to prepare for regression analysis.""" # TODO remove hardcoded variable names pivot = (df.groupby(['lat', 'lon', 'property']) .agg(value=('values.mean', 'max')) .pivot_table(index=['lat', 'lon'], columns='property', values='value') .reset_index()) properties = df['property'].unique() for property in properties: pivot = cls._numeric_and_remove_nans(pivot, property) return pivot