Aggregate Grouped Data Conditionally Over Many Columns Doing Different Operations In Python/pandas
Solution 1:
If you know your column names in advance, you can build the dictionary before passing it to the agg function.
...
# Build the aggregation spec up front, then hand it to DataFrame.agg.
# Assumes the question's setup: `import datetime as dt`, `import pandas as pd`,
# and a DataFrame `df` with a datetime 'Date' column and a 'CustomerID' column.
cutoffDate = df['Date'].max() + dt.timedelta(days=1)
# Recency in days, measured against the day after the most recent date overall.
agg_dict = {'Date': lambda x: (cutoffDate - x.max()).days}
DemandColumns = ['MenswearDemand', 'HomeDemand']
# Use the named reducer 'sum' instead of a hand-rolled lambda: PEP 8 (E731)
# discourages assigning lambdas, and pandas dispatches named reducers to its
# fast cythonized implementation.
agg_dict.update({col_name: 'sum' for col_name in DemandColumns})
newdf = df.groupby('CustomerID').agg(agg_dict)
Another option (assuming you know the column names — DemandColumns in the previous example) is to first use the agg function to compute the Date column, and then use the filter function, passing the list of desired columns as the items argument, to keep only those exact columns.
...
# Assumes `df`, `pd`, `dt`, and DemandColumns from the previous snippet.
cutoffDate = df['Date'].max() + dt.timedelta(days=1)
groups = df.groupby('CustomerID')
# Day difference between the cutoff and each group's most recent date.
newdf = groups.agg(lambda x: (cutoffDate - x.max()).days)
# Pass the string 'sum' rather than the builtin `sum`: handing builtins to
# .agg is deprecated in pandas 2.x, and the named reducer uses the fast path.
newdf = pd.concat([newdf, groups.apply(lambda x: x.filter(items=DemandColumns).agg('sum'))], axis=1)
If the desired columns (DemandColumns) follow a naming pattern, you can skip building the list and instead use the filter function with the regex argument. In this case, the regex '.*Demand$' returns all columns whose names end with the string Demand.
newdf = pd.concat([newdf, groups.apply(lambda x: x.filter(regex='.*Demand$').agg(sum))], axis=1)
Solution 2:
Just to give a convtools-based alternative:
# NOTE(review): requires the third-party `convtools` package; the API shown
# (e.g. `c.this()` called as a function) matches older releases -- confirm
# against the installed version.
from datetime import datetime, timedelta
from convtools import conversion as c
from convtools.contrib.tables import Table
# Registry of supported metrics: each output column name maps to the input
# column it reads and the convtools reducer applied to it.
# this way you can define multiple meaningful metrics
metric_to_config = {
    "sum_MenswearDemand": {
        "column": "MenswearDemand",
        "reducer": c.ReduceFuncs.Sum,
    },
    "sum_HomeDemand": {"column": "HomeDemand", "reducer": c.ReduceFuncs.Sum},
    "median_Age": {"column": "Age", "reducer": c.ReduceFuncs.Median},
}
# pass required metric names as input; only these are computed below
required_metrics = ["sum_MenswearDemand", "sum_HomeDemand"]
# prepare aggregation config: parse a "YYYY-MM-DD" string into a datetime.date
parse_date = c.call_func(datetime.strptime, c.this(), "%Y-%m-%d").call_method(
    "date"
)
aggregate_config = {
    "CustomerID": c.item("CustomerID"),
    # per-customer max date, converted from string via parse_date
    "Date": c.ReduceFuncs.Max(c.item("Date")).pipe(parse_date),
}
# extend the config with one reducer per requested metric
for metric in required_metrics:
    config = metric_to_config[metric]
    reducer = config["reducer"]
    column = config["column"]
    aggregate_config[metric] = reducer(c.item(column))
# this is where code generation happens
converter = (
    c.group_by(c.item("CustomerID"))
    .aggregate(aggregate_config)
    .pipe(
        # the total max is calculated below and saved under the "max_date"
        # label; here we replace each row's "Date" with the day difference
        c.iter_mut(
            c.Mut.set_item(
                "Date", (c.label("max_date") - c.item("Date")).attr("days")
            )
        ),
        # calculate the cutoff (overall max date + 1 day) from the
        # aggregation results and store it under the "max_date" label
        label_input={
            "max_date": (
                c.call_func(max, c.iter(c.item("Date")))
                + timedelta(days=1)
            )
        },
    )
    .gen_converter()
)
# reading required columns from the (tab-delimited) input csv file
rows = (
    Table.from_csv(
        "tmp/input_1.csv",
        header=True,
        dialect=Table.csv_dialect(delimiter="\t"),
    )
    .take(
        "CustomerID",
        "Date",
        *{metric_to_config[metric]["column"] for metric in required_metrics},
    )
    .into_iter_rows(dict)
)
# aggregating input rows (lazy; consumed when written out below)
iterable_of_results = converter(rows)
# outputting to csv file if needed
Table.from_rows(iterable_of_results).into_csv(
    "tmp/out.csv",
    dialect=Table.csv_dialect(delimiter="\t"),
)
The benefit of using this library is that it is lightweight, has no dependencies, and allows for stream processing; thanks to the simplicity of the generated code, it is sometimes no slower than pandas or polars.
Post a Comment for "Aggregate Grouped Data Conditionally Over Many Columns Doing Different Operations In Python/pandas"