Ex1:缺失值与类别的相关性检验
In [123]: df = pd.read_csv('data/missing_chi.csv')
In [124]: cat_1 = df['X_1'].isna().map({True: 'NaN', False: 'NotNaN'})  # label each row by whether X_1 is missing
In [125]: cat_2 = df['X_2'].isna().map({True: 'NaN', False: 'NotNaN'})  # same indicator for X_2
In [126]: df_1 = pd.crosstab(index=cat_1, columns=df['y'], margins=True)  # counts vs. y, with row/column totals
In [127]: df_2 = pd.crosstab(index=cat_2, columns=df['y'], margins=True)
In [128]: def compute_S(my_df):
.....:     """Pearson chi-square statistic of a contingency table with margins.
.....:
.....:     Parameters
.....:     ----------
.....:     my_df : pandas.DataFrame
.....:         Table of observed counts whose last row and last column hold the
.....:         totals, e.g. the result of ``pd.crosstab(..., margins=True)``.
.....:
.....:     Returns
.....:     -------
.....:     float
.....:         Sum over all non-margin cells of
.....:         ``(observed - expected) ** 2 / expected`` where
.....:         ``expected = row_total * column_total / grand_total``.
.....:
.....:     Notes
.....:     -----
.....:     Works for any r x c table, not only 2 x 2; the matching chi-square
.....:     distribution has ``(r - 1) * (c - 1)`` degrees of freedom.
.....:     A zero row or column total makes ``expected`` zero, so the division
.....:     is undefined; such tables are not handled here.
.....:     """
.....:     # The margins sit in the last row / last column, wherever that is.
.....:     last_row, last_col = my_df.shape[0] - 1, my_df.shape[1] - 1
.....:     grand_total = my_df.iat[last_row, last_col]
.....:     S = 0
.....:     for i in range(last_row):
.....:         for j in range(last_col):
.....:             observed = my_df.iat[i, j]
.....:             expected = my_df.iat[i, last_col] * my_df.iat[last_row, j] / grand_total
.....:             S += (observed - expected) ** 2 / expected
.....:     return S
.....:
In [129]: res1 = compute_S(df_1)  # statistic for the X_1 missingness table
In [130]: res2 = compute_S(df_2)  # statistic for the X_2 missingness table
In [131]: from scipy.stats import chi2
In [132]: chi2.sf(res1, 1) # p-value of the test for X_1 (1 degree of freedom): cannot conclude an association, so drop X_1
Out[132]: 0.9712760884395901
In [133]: chi2.sf(res2, 1) # p-value of the test for X_2 (1 degree of freedom): association is significant, so keep X_2
Out[133]: 7.459641265637543e-166
上述结果与 scipy.stats.chi2_contingency 在不使用 Yates 修正(correction=False)时的结果完全一致:
In [134]: from scipy.stats import chi2_contingency
In [135]: chi2_contingency(pd.crosstab(cat_1, df.y), correction=False)[1]  # element [1] is the p-value; Yates' correction disabled; matches Out[132]
Out[135]: 0.9712760884395901
In [136]: chi2_contingency(pd.crosstab(cat_2, df.y), correction=False)[1]  # same check for X_2; matches Out[133]
Out[136]: 7.459641265637543e-166
Ex2:用回归模型解决分类问题
In [137]: from sklearn.neighbors import KNeighborsRegressor
In [138]: df = pd.read_excel('data/color.xlsx')
In [139]: df_dummies = pd.get_dummies(df.Color)
In [140]: features, query_point = df.iloc[:, :2], [[0.8, -0.2]]
In [141]: stack_list = [  # one regressor per color indicator column; each yields a (1, 1) score
.....:     KNeighborsRegressor(n_neighbors=6)
.....:     .fit(features, df_dummies[col])
.....:     .predict(query_point)
.....:     .reshape(-1, 1)
.....:     for col in df_dummies.columns
.....: ]
.....:
In [142]: code_res = pd.Series(np.hstack(stack_list).argmax(axis=1))  # index of the highest-scoring color
In [143]: df_dummies.columns[code_res[0]]
Out[143]: 'Yellow'
In [144]: from sklearn.neighbors import KNeighborsRegressor
In [145]: df = pd.read_csv('data/audit.csv')
In [146]: res_df = df.copy()  # untouched copy that receives the imputed labels
In [147]: df = pd.concat([pd.get_dummies(df[['Marital', 'Gender']]),
.....:                 df[['Age','Income','Hours']].apply(
.....:                     lambda x:(x-x.min())/(x.max()-x.min())),  # min-max scale to [0, 1]
.....:                 df.Employment],
.....:                axis=1)  # axis must be a keyword: positional use fails on pandas >= 2.0
.....:
In [148]: X_train = df[df.Employment.notna()]  # rows with a known Employment label
In [149]: X_test = df[df.Employment.isna()]  # rows whose Employment must be imputed
In [150]: df_dummies = pd.get_dummies(X_train.Employment)
In [151]: stack_list = []
In [152]: for col in df_dummies.columns:
.....:     clf = KNeighborsRegressor(n_neighbors=6)
.....:     clf.fit(X_train.iloc[:,:-1], df_dummies[col])  # Employment is the last column, so [:-1] keeps only features
.....:     res = clf.predict(X_test.iloc[:,:-1]).reshape(-1,1)
.....:     stack_list.append(res)
.....:
In [153]: code_res = pd.Series(np.hstack(stack_list).argmax(1))  # per-row index of the highest-scoring category
In [154]: cat_res = code_res.map(dict(enumerate(df_dummies.columns)))  # column position -> category label
In [155]: res_df.loc[res_df.Employment.isna(), 'Employment'] = cat_res.values
In [156]: res_df.isna().sum()
Out[156]:
ID 0
Age 0
Employment 0
Marital 0
Income 0
Gender 0
Hours 0
dtype: int64