How To Generate Missing Weekids In Python Dataframes
I have a dataset given below with weekids indexed: product_name serial_number date sum 'A' '12' '202001' 150 'A' '12
Solution 1:
First, generate the sample data:
# Sample input: one product/serial pair observed in four weeks; week 202003
# is absent from 'date', which is the gap the solution below fills in.
data = {
    'product_name': ['a'] * 4,
    'serial_number': [12] * 4,
    'date': [202001, 202002, 202004, 202005],
    'Sum': [150, 350, 550, 1500],
}
df = pd.DataFrame(data=data)
this gives
  product_name  serial_number    date   Sum
0            a             12  202001   150
1            a             12  202002   350
2            a             12  202004   550
3            a             12  202005  1500
Then do the work. Note that this makes the big assumption that you want to fill in the missing weeks per product_name as well as per week number:
# Fill in the missing week ids, one product at a time.
#
# Result: `outdf` has the columns product_name, serial_number, date, Sum and
# a fresh 0..n-1 index. Every serial number of every product gets one row per
# week id in `s`; weeks that were absent from `df` get Sum == 0.

# Week ids 202001..202007 (the upper bound of range() is exclusive).
# NOTE(review): a plain integer range assumes all weeks fall in the same
# year; crossing a year boundary would need an explicit list of week ids.
s = range(202001, 202008)
columns = ['product_name', 'serial_number', 'date', 'Sum']

# Collect the per-product frames and concatenate once at the end, instead of
# growing `outdf` inside the loop.
frames = []
for name, subdf in df.groupby('product_name'):
    thisdf = (
        subdf.set_index(['serial_number', 'date'])['Sum']
        # Pivot weeks into columns; fill_value=0 so that a week present for
        # one serial number but not another becomes 0 rather than NaN.
        .unstack(fill_value=0)
        # Add the weeks that no serial number of this product has at all.
        .reindex(columns=s, fill_value=0)
        # Back to long format: one row per (serial_number, date).
        .stack()
        .reset_index(name='Sum')
    )
    thisdf.insert(0, 'product_name', name)
    frames.append(thisdf)

if frames:
    outdf = pd.concat(frames, ignore_index=True)[columns]
else:
    # Empty input: pd.concat([]) would raise, so return an empty frame with
    # the expected columns instead.
    outdf = pd.DataFrame(columns=columns)
and this yields
  product_name  serial_number    date   Sum
0            a             12  202001   150
1            a             12  202002   350
2            a             12  202003     0
3            a             12  202004   550
4            a             12  202005  1500
5            a             12  202006     0
6            a             12  202007     0
Solution 2:
# Sample input as a list of row records. Serial numbers and week ids are
# strings here; week '202003' is absent.
data = [
    {'product_name': 'A', 'serial_number': '12', 'date': week, 'sum': total}
    for week, total in zip(
        ('202001', '202002', '202004', '202005'),
        (150, 350, 550, 1500),
    )
]
df = pd.DataFrame(data)
Create a dataframe with the dates to add and append it to the original dataframe.
# First find out which dates to add -> {'202006', '202007', '202003'}.
# The week ids are compared as strings because df['date'] holds strings.
date_list = [str(date) for date in range(202001, 202008)]
date_to_add = set(date_list) - set(df['date'].values)
# Create a dataframe with date = list(date_to_add) and sum = 0, then append it
# to the original df. The added rows have no product_name / serial_number yet.
dfn = pd.DataFrame({'date': list(date_to_add), 'sum': 0})
# pd.concat replaces DataFrame.append (removed in pandas 2.0) and .ffill()
# replaces the deprecated fillna(method='ffill'). After sorting by date, the
# forward fill copies product_name and serial_number from the preceding row.
# The added rows keep their own index labels; pass ignore_index=True to
# sort_values if a clean 0..n-1 index is wanted.
df_result = pd.concat([df, dfn]).sort_values('date').ffill()
result:
# Show the completed frame. Rows are sorted by date, and the added rows keep
# the index labels of the helper frame, so index labels can repeat.
print(df_result)
  product_name serial_number    date   sum
0            A            12  202001   150
1            A            12  202002   350
2            A            12  202003     0
2            A            12  202004   550
3            A            12  202005  1500
0            A            12  202006     0
1            A            12  202007     0
Solution 3:
You could use the complete function from pyjanitor to expose the missing combinations; at the moment you have to install the latest development version from github:
import pandas as pd
# Sample frame with integer serial numbers and week ids; week 202003 is
# missing from 'date'.
df = pd.DataFrame(
    {
        'product_name': ['A'] * 4,
        'serial_number': [12] * 4,
        'date': [202001, 202002, 202004, 202005],
        'sum': [150, 350, 550, 1500],
    }
)
df  # notebook-style display of the frame
product_name serial_number date sum
0 A 12 202001 150
1 A 12 202002 350
2 A 12 202004 550
3 A 12 202005 1500
# Install the latest development version of pyjanitor first:
# pip install git+https://github.com/ericmjl/pyjanitor.git
import janitor
# NOTE(review): `np` is used below, but this snippet never runs
# `import numpy as np` -- add that import before running it.
# Going by the output shown below: the "date" entry supplies the full set of
# week ids 202001..202007, fill_value sets "sum" to 0 on the rows that get
# added, and .ffill() then copies product_name / serial_number down from the
# preceding row.
(
df.complete(columns = [{"date": lambda x: np.arange(202001, 202008)}],
fill_value={"sum": 0})
.ffill()
)
product_name serial_number date sum
0 A 12.0 202001 150.0
1 A 12.0 202002 350.0
2 A 12.0 202003 0.0
3 A 12.0 202004 550.0
4 A 12.0 202005 1500.0
5 A 12.0 202006 0.0
6 A 12.0 202007 0.0
With pandas only, you can create a Series that holds all the week ids from 202001 to 202007 (the upper bound 202008 is exclusive) and merge it with the existing dataframe:
# Every week id from 202001 through 202007 as an int64 Series named "date",
# so that it can be merged with the dataframe on the "date" column.
# Built from the built-in range() rather than np.arange(): the values are the
# same and the snippet no longer depends on a numpy import it never makes.
complete_array = pd.Series(range(202001, 202008), name="date")
complete_array  # notebook-style display of the series
0 202001
1 202002
2 202003
3 202004
4 202005
5 202006
6 202007
Name: date, dtype: int64
(
    # The outer join keeps every row of df plus every week id in
    # complete_array; weeks missing from df get NaN in the other columns,
    # which is why those columns show up as floats in the output.
    pd.merge(df, complete_array, how="outer", on="date")
    .sort_values(by="date")  # you can add ignore_index=True to reset the index
    # The added weeks get sum = 0 ...
    .fillna(value={"sum": 0})
    # ... and take product_name / serial_number from the preceding row.
    .ffill()
)
product_name serial_number date sum
0 A 12.0 202001 150.0
1 A 12.0 202002 350.0
4 A 12.0 202003 0.0
2 A 12.0 202004 550.0
3 A 12.0 202005 1500.0
5 A 12.0 202006 0.0
6 A 12.0 202007 0.0
Post a Comment for "How To Generate Missing Weekids In Python Dataframes"