Ex1:美国非法药物数据集
In [96]: df = pd.read_csv('data/drugs.csv').sort_values([
....: 'State','COUNTY','SubstanceName'],ignore_index=True)
....:
In [97]: res = df.pivot(index=['State','COUNTY','SubstanceName'
....: ], columns='YYYY', values='DrugReports'
....: ).reset_index().rename_axis(columns={'YYYY':''})
....:
In [98]: res.head(5)
Out[98]:
State COUNTY SubstanceName 2010 2011 2012 2013 2014 2015 2016 2017
0 KY ADAIR Buprenorphine NaN 3.0 5.0 4.0 27.0 5.0 7.0 10.0
1 KY ADAIR Codeine NaN NaN 1.0 NaN NaN NaN NaN 1.0
2 KY ADAIR Fentanyl NaN NaN 1.0 NaN NaN NaN NaN NaN
3 KY ADAIR Heroin NaN NaN 1.0 2.0 NaN 1.0 NaN 2.0
4 KY ADAIR Hydrocodone 6.0 9.0 10.0 10.0 9.0 7.0 11.0 3.0
In [99]: res_melted = res.melt(id_vars = ['State','COUNTY','SubstanceName'],
....: value_vars = res.columns[-8:],
....: var_name = 'YYYY',
....: value_name = 'DrugReports').dropna(
....: subset=['DrugReports'])
....:
In [100]: res_melted = res_melted[df.columns].sort_values([
.....: 'State','COUNTY','SubstanceName'],ignore_index=True
.....: ).astype({'YYYY':'int64', 'DrugReports':'int64'})
.....: res_melted.equals(df)
.....:
Out[100]: True
策略一:
In [101]: res = df.pivot_table(index='YYYY', columns='State',
.....: values='DrugReports', aggfunc='sum')
.....:
In [102]: res.head(3)
Out[102]:
State KY OH PA VA WV
YYYY
2010 10453 19707 19814 8685 2890
2011 10289 20330 19987 6749 3271
2012 10722 23145 19959 7831 3376
策略二:
In [103]: res = df.groupby(['State', 'YYYY'])['DrugReports'].sum(
.....: ).to_frame().unstack(0).droplevel(0,axis=1)
.....:
In [104]: res.head(3)
Out[104]:
State KY OH PA VA WV
YYYY
2010 10453 19707 19814 8685 2890
2011 10289 20330 19987 6749 3271
2012 10722 23145 19959 7831 3376
Ex2:特殊的wide_to_long方法
In [105]: df = pd.DataFrame({'Class':[1,2],
.....: 'Name':['San Zhang', 'Si Li'],
.....: 'Chinese':[80, 90],
.....: 'Math':[80, 75]})
.....:
In [106]: df
Out[106]:
Class Name Chinese Math
0 1 San Zhang 80 80
1 2 Si Li 90 75
In [107]: df = df.rename(columns={'Chinese':'pre_Chinese', 'Math':'pre_Math'})
In [108]: pd.wide_to_long(df,
.....: stubnames=['pre'],
.....: i = ['Class', 'Name'],
.....: j='Subject',
.....: sep='_',
.....: suffix='.+').reset_index().rename(columns={'pre':'Grade'})
.....:
Out[108]:
Class Name Subject Grade
0 1 San Zhang Chinese 80
1 1 San Zhang Math 80
2 2 Si Li Chinese 90
3 2 Si Li Math 75