Ex1:公司员工数据集
In [51]: df = pd.read_csv('data/company.csv')
In [52]: dpt = ['Dairy', 'Bakery']
In [53]: df.query("(age <= 40)&(department == @dpt)&(gender=='M')").head(3)
Out[53]:
EmployeeID birthdate_key age city_name department job_title gender
3611 5791 1/14/1975 40 Kelowna Dairy Dairy Person M
3613 5793 1/22/1975 40 Richmond Bakery Baker M
3615 5795 1/30/1975 40 Nanaimo Dairy Dairy Person M
In [54]: df.loc[(df.age<=40)&df.department.isin(dpt)&(df.gender=='M')].head(3)
Out[54]:
EmployeeID birthdate_key age city_name department job_title gender
3611 5791 1/14/1975 40 Kelowna Dairy Dairy Person M
3613 5793 1/22/1975 40 Richmond Bakery Baker M
3615 5795 1/30/1975 40 Nanaimo Dairy Dairy Person M
In [55]: df.iloc[(df.EmployeeID%2==1).values,[0,2,-2]].head()
Out[55]:
EmployeeID age job_title
1 1319 58 VP Stores
3 1321 56 VP Human Resources
5 1323 53 Exec Assistant, VP Stores
6 1325 51 Exec Assistant, Legal Counsel
8 1329 48 Store Manager
In [56]: df_op = df.copy()
In [57]: df_op = df_op.set_index(df_op.columns[-3:].tolist()).swaplevel(0,2,axis=0)
In [58]: df_op = df_op.reset_index(level=1)
In [59]: df_op = df_op.rename_axis(index={'gender':'Gender'})
In [60]: df_op.index = df_op.index.map(lambda x:'_'.join(x))
In [61]: df_op.index = df_op.index.map(lambda x:tuple(x.split('_')))
In [62]: df_op = df_op.rename_axis(index=['gender', 'department'])
In [63]: df_op = df_op.reset_index().reindex(df.columns, axis=1)
In [64]: df_op.equals(df)
Out[64]: True
Ex2:巧克力数据集
In [65]: df = pd.read_csv('data/chocolate.csv')
In [66]: df.columns = [' '.join(i.split('\n')) for i in df.columns]
In [67]: df.head(3)
Out[67]:
Company Review Date Cocoa Percent Company Location Rating
0 A. Morin 2016 63% France 3.75
1 A. Morin 2015 70% France 2.75
2 A. Morin 2015 70% France 3.00
In [68]: df['Cocoa Percent'] = df['Cocoa Percent'].apply(lambda x:float(x[:-1])/100)
In [69]: df.query('(Rating<3)&(`Cocoa Percent`>`Cocoa Percent`.median())').head(3)
Out[69]:
Company Review Date Cocoa Percent Company Location Rating
33 Akesson's (Pralus) 2010 0.75 Switzerland 2.75
34 Akesson's (Pralus) 2010 0.75 Switzerland 2.75
36 Alain Ducasse 2014 0.75 France 2.75
In [70]: idx = pd.IndexSlice
In [71]: exclude = ['France', 'Canada', 'Amsterdam', 'Belgium']
In [72]: res = df.set_index(['Review Date', 'Company Location']).sort_index(level=0)
In [73]: res.loc[idx[2012:,~res.index.get_level_values(1).isin(exclude)],:].head(3)
Out[73]:
Company Cocoa Percent Rating
Review Date Company Location
2012 Australia Bahen & Co. 0.7 3.0
Australia Bahen & Co. 0.7 2.5
Australia Bahen & Co. 0.7 2.5