from .model import Model
class Analyse:
    """Mixin class with analysis methods for soil data.

    The class reads a few things it does not define itself, so the host
    class has to supply them: a ``properties`` attribute (the default
    property selection), a ``_depthname`` attribute (name of the depth
    column, used by ``max_values``) and a working ``get_data`` method.
    """

    # column names used in the analysis
    _grouping = ["lat", "lon", "units"]  # standard grouping for all methods
    _valuename = "values."  # prefix of the value columns, e.g. "values.mean"
    _propertyname = "property"

    @property
    def df(self):
        """Data from the SoilGrids API as a pandas DataFrame."""
        return self.get_data()

    def get_data(self):
        """Return data from the SoilGrids API as a data frame.

        This base implementation only returns the ``NotImplemented``
        sentinel; the analysis methods need an override that returns a
        DataFrame.
        """
        # NOTE(review): raising NotImplementedError would be the usual way to
        # mark a missing override; kept as-is so callers see no change.
        return NotImplemented

    def _filtered_values(self, properties, value):
        """Return the cleaned rows for ``properties`` and the value column name.

        Args:
            properties (list, optional): properties to keep. ``None`` falls
                back to ``self.properties``.
            value (str): suffix appended to ``_valuename``.

        Returns:
            tuple: ``(df, valuecol)`` where ``df`` has a fresh 0..n-1 index.
        """
        properties = self.properties if properties is None else properties
        valuecol = f"{self._valuename}{value}"
        df = self._select_content(self.df, self._propertyname, properties)
        df = self._numeric_and_remove_nans(df, valuecol)
        # idxmax() yields index labels; a unique index guarantees that the
        # later .loc lookup picks exactly one row per group.
        return df.reset_index(drop=True), valuecol

    def top_property(self, properties=None, value="mean"):
        """Return dataframe with the highest scoring properties for each coordinate.

        Args:
            properties (list, optional): properties to include. Defaults to None,
                which uses ``self.properties``.
            value (str, optional): value to consider. Defaults to "mean".

        Returns:
            DataFrame with one row per ``_grouping`` combination, holding the
            property with the largest value and that value.
        """
        df, valuecol = self._filtered_values(properties, value)
        max_indices = df.groupby(self._grouping)[valuecol].idxmax(skipna=True)
        columns = self._grouping + [self._propertyname, valuecol]
        return df.loc[max_indices, columns].reset_index(drop=True)

    def max_values(self, properties=None, value="mean"):
        """Return a dataframe with the highest values per property for each coordinate.

        Given multiple depths in the data frame, this method will return the
        highest value for each location.

        Args:
            properties (list, optional): properties to include. Defaults to None,
                which uses ``self.properties``.
            value (str, optional): value to consider. Defaults to "mean".

        Returns:
            DataFrame with one row per ``_grouping`` + property combination,
            including the depth column named by ``self._depthname``.
        """
        df, valuecol = self._filtered_values(properties, value)
        grouping = self._grouping + [self._propertyname]
        max_indices = df.groupby(grouping)[valuecol].idxmax(skipna=True)
        # ``_depthname`` is not defined in this mixin; it must come from the
        # class this mixin is combined with.
        columns = self._grouping + [self._propertyname, self._depthname, valuecol]
        return df.loc[max_indices, columns].reset_index(drop=True)

    def mean_values(self, properties=None, value="mean"):
        """Return a dataframe with the average values per property for each coordinate.

        Given multiple depths in the data frame, this method will return
        averages for each location.

        Args:
            properties (list, optional): properties to include. Defaults to None,
                which uses ``self.properties``.
            value (str, optional): value to consider. Defaults to "mean".

        Returns:
            DataFrame with one row per ``_grouping`` + property combination
            and the averaged value column.
        """
        df, valuecol = self._filtered_values(properties, value)
        grouping = self._grouping + [self._propertyname]
        return df.groupby(grouping).agg({valuecol: "mean"}).reset_index()

    @classmethod
    def _numeric_and_remove_nans(cls, df, col):
        """Convert specific column to numeric and remove NaNs.

        Args:
            df: data frame to clean; it is left untouched.
            col: name of the column to cast to float.

        Returns:
            A new DataFrame with ``col`` as float and the rows where ``col``
            is NaN removed. Surviving rows keep their index labels.
        """
        # Work on a copy: the input is often a filtered slice of the caller's
        # data, and writing into it would either modify that data or trigger
        # pandas' chained-assignment warning.
        cleaned = df.copy()
        cleaned[col] = cleaned[col].astype(float)
        return cleaned.dropna(subset=[col])

    @classmethod
    def _select_content(cls, df, col, content):
        """Select rows with specific content in a column.

        Args:
            df: data frame to filter.
            col: column to test.
            content: list-like of accepted values for ``col``.

        Returns:
            The rows of ``df`` whose ``col`` value is in ``content``.
        """
        return df[df[col].isin(content)]

    def regression(self, formula):
        """Perform regression analysis.

        Args:
            formula: passed through to ``Model`` as ``formula``.

        Returns:
            A ``Model`` built from ``formula`` and the pivoted data.
        """
        df = self._pivot_for_model(self.df)
        return Model(formula=formula, data=df)

    def summary(self):
        """Return summary statistics.

        Not implemented yet: returns the ``NotImplemented`` sentinel.
        """
        return NotImplemented

    @classmethod
    def _pivot_for_model(cls, df):
        """Perform a pivot on the data frame to prepare for regression analysis.

        The maximum of the mean-value column is taken per location and
        property, and the properties are then spread into columns.

        Args:
            df: long-format data with "lat", "lon", the property column and
                the mean-value column.

        Returns:
            DataFrame with "lat", "lon" and one float column per property;
            locations lacking a value for any of those properties are dropped.
        """
        # TODO remove the remaining hardcoded coordinate names
        coords = ["lat", "lon"]
        property_col = cls._propertyname
        value_col = f"{cls._valuename}mean"
        pivot = (
            df.groupby(coords + [property_col])
            .agg(value=(value_col, "max"))
            .pivot_table(index=coords, columns=property_col, values="value")
            .reset_index()
        )
        for name in df[property_col].unique():
            # pivot_table leaves out properties that have no values at all;
            # skip those instead of failing on the missing column.
            if name in pivot.columns:
                pivot = cls._numeric_and_remove_nans(pivot, name)
        return pivot