The dataset contains three files: LC.csv, LP.csv, and LCIS.csv.
LC.csv is the listing-feature table, with one record per listing. It has 21 fields: one primary key, 7 fields describing the listing itself, and 13 fields describing the borrower at the time of the deal. LP.csv is the repayment plan and repayment record table, with one record per listing per installment. It has 10 fields, including 2 primary keys, 2 repayment-plan fields, and 4 repayment-status fields. LCIS.csv contains every listing invested in by one particular customer among deals closed since January 1, 2015, with 36 fields: 1 primary key, 7 listing fields, 13 borrower fields as of the deal, and 15 fields related to the customer's investment and returns.
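As a quick orientation, here is a minimal sketch for loading and inspecting the three files described above (the paths and `gbk` encoding are my assumptions; adjust to your local copy):

```python
import pandas as pd

# Hypothetical local paths; adjust as needed.
lc = pd.read_csv('LC.csv', encoding='gbk')      # listing features, one row per listing
lp = pd.read_csv('LP.csv', encoding='gbk')      # repayment plan/records, one row per listing per installment
lcis = pd.read_csv('LCIS.csv', encoding='gbk')  # one customer's invested listings

for name, df in [('LC', lc), ('LP', lp), ('LCIS', lcis)]:
    print(name, df.shape, list(df.columns)[:5])  # sanity-check field counts against the description
```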
## II. Read the datasets and merge the training and test sets

### 1. Import libraries and read the data

The code is as follows:
```python
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Read the training set (local paths; adjust as needed)
path = r'F:\教师培训\ppd6\PPD-First-Round-Data-Updated\PPD-First-Round-Data-Update\Training Set'
train_loginfo = pd.read_csv(os.path.join(path, 'PPD_LogInfo_3_1_Training_Set.csv'), encoding='gbk')
train_master = pd.read_csv(os.path.join(path, 'PPD_Training_Master_GBK_3_1_Training_Set.csv'), encoding='gbk')
train_userupdate = pd.read_csv(os.path.join(path, 'PPD_Userupdate_Info_3_1_Training_Set.csv'), encoding='gbk')
train_master.shape

# Read the test set
path = r'F:\教师培训\ppd3\PPD-First-Round-Data-Update\Test Set'
test_loginfo = pd.read_csv(os.path.join(path, 'PPD_LogInfo_2_Test_Set.csv'), encoding='gbk')
test_master = pd.read_csv(os.path.join(path, 'PPD_Master_GBK_2_Test_Set.csv'), encoding='gb18030')
test_userupdate = pd.read_csv(os.path.join(path, 'PPD_Userupdate_Info_2_Test_Set.csv'), encoding='gbk')

# Mark which samples come from the training set and which from the test set
train_master['sample_status'] = 'train'
test_master['sample_status'] = 'test'

# Concatenate training and test sets (axis=0 appends rows)
df_Master = pd.concat([train_master, test_master], axis=0).reset_index(drop=True)
df_loginfo = pd.concat([train_loginfo, test_loginfo], axis=0).reset_index(drop=True)
df_userupdate = pd.concat([train_userupdate, test_userupdate], axis=0).reset_index(drop=True)

# Missing values are coded as -1; replace them with NaN
df_Master = df_Master.replace({-1: np.nan})
```
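A quick check (my own addition, not in the original) that the merge and the -1 → NaN replacement behaved as expected:

```python
# Rows from each source after the merge
print(df_Master['sample_status'].value_counts())

# -1 should no longer appear in any numeric column
num_cols = df_Master.select_dtypes(include=[np.number]).columns
print((df_Master[num_cols] == -1).any().any())  # expect False
```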
### 2. Missing value handling

The code is as follows:
```python
# Drop columns whose missing-value ratio exceeds 0.8
p = (
    (df_Master.isnull().sum().sort_values(ascending=False) / 49999)  # 49999 = total row count
    .reset_index()
    .rename(columns={'index': 'feat_name', 0: 'rate'})
)
df_Master = df_Master.drop(columns=p[p.rate > 0.8]['feat_name'])

# Plot the 30 features with the highest remaining missing rates
plt.figure(figsize=(20, 5))
(df_Master.isnull().sum().sort_values(ascending=False) / 49999)[:30].plot.bar(rot=45)
```

![在这里插入图片描述](https://img-blog.csdnimg.cn/1d8e676d32aa415994acde183a839981.png?x-oss-process=image/watermark,type_ZHJvaWRzYW5zZmFsbGJhY2s,shadow_50,text_Q1NETiBAc3VubnVhbjAx,size_20,color_FFFFFF,t_70,g_se,x_16#pic_center)

```python
# Drop rows with more than 100 missing values
df_Master.isnull().sum(axis=1).sort_values(ascending=True).reset_index(drop=True).plot.line()
```

![在这里插入图片描述](https://img-blog.csdnimg.cn/ce279c8bc4634360b084a9aceff47fe9.png?x-oss-process=image/watermark,type_ZHJvaWRzYW5zZmFsbGJhY2s,shadow_50,text_Q1NETiBAc3VubnVhbjAx,size_11,color_FFFFFF,t_70,g_se,x_16#pic_center)

```python
a = (
    df_Master.isnull().sum(axis=1).sort_values(ascending=False)
    .reset_index()
    .rename(columns={0: 'count_lack_row'})
)
df_Master = df_Master.drop(index=a[a.count_lack_row > 100]['index'])
df_Master.isnull().sum(axis=1).sort_values(ascending=True).reset_index(drop=True).plot.line()
```

![在这里插入图片描述](https://img-blog.csdnimg.cn/6ebab8938e644fa599274f0ce3c7f2fd.png?x-oss-process=image/watermark,type_ZHJvaWRzYW5zZmFsbGJhY2s,shadow_50,text_Q1NETiBAc3VubnVhbjAx,size_12,color_FFFFFF,t_70,g_se,x_16#pic_center)
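The same column/row filtering can be wrapped in a small reusable helper (a sketch of the logic above; the argument names and the use of `isnull().mean()` instead of the hard-coded row count are mine):

```python
def drop_sparse(df, col_rate=0.8, row_count=100):
    """Drop columns with a missing ratio above col_rate,
    then rows with more than row_count missing values."""
    col_miss = df.isnull().mean()                      # per-column missing ratio
    df = df.drop(columns=col_miss[col_miss > col_rate].index)
    row_miss = df.isnull().sum(axis=1)                 # per-row missing count
    return df[row_miss <= row_count]

# df_Master = drop_sparse(df_Master)   # equivalent to the two steps above
```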
### 3. Split features into numeric and categorical types and process each separately

```python
# Numeric features: fill missing values with the column median
b = df_Master.select_dtypes(include=np.float64).drop(columns=['Idx', 'target'])
for i in b.columns:
    b[i].fillna(b[i].median(), inplace=True)

# Drop near-constant numeric features (standard deviation below 0.1)
c = b.std().sort_values().reset_index().rename(columns={0: 'std'})
b = b.drop(columns=c[c['std'] < 0.1]['index'])
b = pd.concat([b, df_Master[['Idx', 'target']]], axis=1)
b.head()

# Categorical features
m = df_Master.select_dtypes(include='object')

# Normalize the UserInfo_9 format (strip trailing-space variants)
m.replace({'UserInfo_9': {'中国移动 ': '中国移动', '中国电信 ': '中国电信', '中国联通 ': '中国联通'}}, inplace=True)

# Strip trailing 市/省 so city and province names are consistent for later steps
m['UserInfo_8'] = m['UserInfo_8'].apply(lambda x: x if x.find('市') == -1 else x[:-1])
m['UserInfo_20'] = m['UserInfo_20'].apply(lambda x: x if x.find('市') == -1 else x[:-1])
m['UserInfo_19'] = m['UserInfo_19'].apply(lambda x: x if x.find('省') == -1 else x[:-1])
m.UserInfo_19.replace({'广西壮族自治区': '广西', '宁夏回族自治区': '宁夏', '新疆维吾尔自治区': '新疆', '西藏自治区': '西藏', '内蒙古自治区': '内蒙古'}, inplace=True)
m['UserInfo_19'] = m['UserInfo_19'].apply(lambda x: x if x.find('市') == -1 else x[:-1])

# For the province fields UserInfo_7 and UserInfo_19, select the six provinces
# with the highest bad rate, binarize them, and merge the dummies into m
def get_badrate(x):
    n = pd.concat([m[x], b['target']], axis=1)
    n = n[n['target'].notnull()]
    p = (n.groupby(x)['target'].sum() / n.groupby(x)['target'].count()).reset_index()
    p = p.rename(columns={x: 'province', 'target': 'bad_rate'})
    return p.sort_values('bad_rate', ascending=False)[:6]

# UserInfo_7
p = get_badrate('UserInfo_7')
c = ['UserInfo_7_is__' + prov for prov in p['province']]
n = pd.get_dummies(m['UserInfo_7'], prefix='UserInfo_7_is_')[c]
m = pd.concat([m, n], axis=1)

# UserInfo_19
p = get_badrate('UserInfo_19')
c = ['UserInfo_19_is__' + prov for prov in p['province']]
n = pd.get_dummies(m['UserInfo_19'], prefix='UserInfo_19_is_')[c]
m = pd.concat([m, n], axis=1)

# Drop the raw province columns from m
m = m.drop(columns=['UserInfo_7', 'UserInfo_19'])

# Next, process the city fields UserInfo_2, UserInfo_4, UserInfo_8, UserInfo_20.
# Use XGBoost to find the most important cities.
from xgboost.sklearn import XGBClassifier
from xgboost import plot_importance
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']

# One-hot encode each city field and plot the top feature importances
fig = plt.figure(figsize=(20, 8))
for i, c in enumerate(['UserInfo_2', 'UserInfo_4', 'UserInfo_8', 'UserInfo_20']):
    k = pd.concat([m[c], b.target], axis=1)
    z = pd.get_dummies(k[k.target.notnull()])
    ax = fig.add_subplot(2, 2, i + 1)
    clf = XGBClassifier(random_state=0).fit(z.drop(columns='target'), z['target'])
    plot_importance(clf, ax=ax, max_num_features=10, height=0.6)
```

![在这里插入图片描述](https://img-blog.csdnimg.cn/4bd798fd00c64c958bb60bc491e78e3d.png?x-oss-process=image/watermark,type_ZHJvaWRzYW5zZmFsbGJhY2s,shadow_50,text_Q1NETiBAc3VubnVhbjAx,size_20,color_FFFFFF,t_70,g_se,x_16#pic_center)

```python
# Keep only the important cities and build new dummy features
k = pd.concat([m['UserInfo_2'], b.target], axis=1)
z = pd.get_dummies(k)
c = ['UserInfo_2_' + city for city in ['淄博', '成都', '衡水', '梧州', '菏泽']]
m = pd.concat([m, z[c]], axis=1)

k = pd.concat([m['UserInfo_4'], b.target], axis=1)
z = pd.get_dummies(k)
c = ['UserInfo_4_' + city for city in ['汕头', '梧州', '青岛', '淄博', '成都']]
m = pd.concat([m, z[c]], axis=1)

k = pd.concat([m['UserInfo_8'], b.target], axis=1)
z = pd.get_dummies(k)
c = ['UserInfo_8_' + city for city in ['汕头', '梧州', '青岛', '淄博', '成都']]
m = pd.concat([m, z[c]], axis=1)

k = pd.concat([m['UserInfo_20'], b.target], axis=1)
z = pd.get_dummies(k)
c = ['UserInfo_20_' + city for city in ['汕头', '梧州', '青岛', '淄博', '成都']]
m = pd.concat([m, z[c]], axis=1)

# Dummies for the carrier field UserInfo_9
m = pd.concat([m, pd.get_dummies(m['UserInfo_9'])], axis=1)
m.rename(columns={'不详': '未知运营商'}, inplace=True)

# Derive an "address change count" feature from the four city fields
m['地址变化次数'] = m[['UserInfo_2', 'UserInfo_4', 'UserInfo_8', 'UserInfo_20']].apply(lambda x: x.nunique(), axis=1)

# Drop the raw city and carrier columns
m = m.drop(columns=['UserInfo_2', 'UserInfo_4', 'UserInfo_8', 'UserInfo_9', 'UserInfo_20'])

# Merge the numeric and categorical features back together
df_Master = pd.concat([m, b], axis=1)

# Move sample_status to the last column
sample_status = df_Master.pop('sample_status')
df_Master.insert(loc=df_Master.shape[1], column='sample_status', value=sample_status, allow_duplicates=False)

# Fill the categorical columns WeblogInfo_19/20/21 with their modes,
# then one-hot encode the remaining categorical features
df_Master.select_dtypes(include='object').isnull().sum()
df_Master.select_dtypes(include='object')[['WeblogInfo_19', 'WeblogInfo_20', 'WeblogInfo_21']].mode()
df_Master = df_Master.fillna(value={'WeblogInfo_19': 'I', 'WeblogInfo_20': 'I5', 'WeblogInfo_21': 'D'})
df_Master.select_dtypes(include='object').isnull().sum()

# First stage of feature processing complete
a = ['UserInfo_22', 'UserInfo_23', 'UserInfo_24', 'Education_Info2', 'Education_Info3', 'Education_Info4', 'Education_Info6', 'Education_Info7', 'Education_Info8', 'WeblogInfo_19', 'WeblogInfo_20', 'WeblogInfo_21']
pd.get_dummies(df_Master[a]).shape
b = ['ListingInfo', 'sample_status']
k = pd.get_dummies(df_Master.drop(columns=b))
df_Master1 = pd.concat([k, df_Master[b]], axis=1)
df_Master1.to_csv(r'F:\教师培训\ppd6\df_Master1.csv', encoding='gb18030')
```
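The province handling above repeats the same pattern twice; a generalized helper (my own sketch of the same logic, with names of my choosing) makes the intent clearer:

```python
def top_badrate_dummies(cat, target, top_n=6, prefix=None):
    """One-hot encode the top_n categories of the Series `cat` that have
    the highest mean of `target` among labeled (non-null target) rows."""
    prefix = prefix or cat.name + '_is_'
    labeled = target.notnull()
    bad_rate = target[labeled].groupby(cat[labeled]).mean().sort_values(ascending=False)
    top = bad_rate.index[:top_n]
    # pd.get_dummies joins prefix and value with '_' by default
    return pd.get_dummies(cat, prefix=prefix)[[prefix + '_' + v for v in top]]

# Equivalent to the UserInfo_7 block above:
# m = pd.concat([m, top_badrate_dummies(m['UserInfo_7'], b['target'])], axis=1)
```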
### 4. Preliminary feature selection with LightGBM

```python
select = pd.concat([pd.get_dummies(df_Master[a]), df_Master['target']], axis=1)

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve, auc
import lightgbm as lgb
from multiprocessing import cpu_count

c = ['target']
x = select[select['target'].notnull()].drop(columns=c)
y = select[select['target'].notnull()]['target']
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=2, test_size=0.2)

def roc_auc_plot(clf, x, y):
    auc = roc_auc_score(y, clf.predict_proba(x)[:, 1])
    fpr, tpr, _ = roc_curve(y, clf.predict_proba(x)[:, 1])
    ks = abs(fpr - tpr).max()
    print('ks = ', ks)
    print('auc = ', auc)
    from matplotlib import pyplot as plt
    plt.plot(fpr, tpr, label='train_or_test')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC Curve')
    plt.legend(loc='best')
    plt.show()

clf = lgb.LGBMClassifier(
    boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
    max_depth=2, n_estimators=800,   # 800 boosting rounds
    objective='binary', subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
    learning_rate=0.05, min_child_weight=50,
    random_state=None, n_jobs=cpu_count() - 1
)
clf = clf.fit(x_train, y_train, eval_set=[(x_train, y_train), (x_test, y_test)],
              eval_metric='auc', early_stopping_rounds=100)
roc_auc_plot(clf, x_test, y_test)
```

![在这里插入图片描述](https://img-blog.csdnimg.cn/6b327736c1ea4139864a799bfab05934.png?x-oss-process=image/watermark,type_ZHJvaWRzYW5zZmFsbGJhY2s,shadow_50,text_Q1NETiBAc3VubnVhbjAx,size_12,color_FFFFFF,t_70,g_se,x_16)

```python
# Keep only the features with positive importance
feature = pd.DataFrame({'name': clf.booster_.feature_name(),
                        'importance': clf.feature_importances_}).set_index('name')
mm = feature.sort_values('importance', ascending=False)
kk = mm[mm.importance > 0].index.to_list()

a = ['UserInfo_22', 'UserInfo_23', 'UserInfo_24', 'Education_Info2', 'Education_Info3', 'Education_Info4', 'Education_Info6', 'Education_Info7', 'Education_Info8', 'WeblogInfo_19', 'WeblogInfo_20', 'WeblogInfo_21']
df_Master2 = pd.concat([df_Master.drop(columns=a), select[kk]], axis=1)
df_Master2.to_csv(r'F:\教师培训\ppd6\df_Master2.csv', encoding='gb18030')
```
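The importance-based filter can also be expressed as a small function (a sketch; `min_importance` is my parameter name):

```python
def select_by_importance(clf, min_importance=1):
    """Return the names of features whose split importance reaches min_importance."""
    imp = pd.Series(clf.feature_importances_, index=clf.booster_.feature_name())
    return imp[imp >= min_importance].sort_values(ascending=False).index.to_list()

# kk = select_by_importance(clf)   # same result as the mm/kk lines above
```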
### 5. Rebuild the model with LightGBM

```python
# Model the pruned feature set df_Master2
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve, auc
import lightgbm as lgb
from multiprocessing import cpu_count
import pandas as pd

df_Master2 = pd.read_csv(r'F:\教师培训\ppd6\df_Master2.csv', encoding='gb18030')
df_Master2 = df_Master2.drop(columns='Unnamed: 0')
c = ['target', 'ListingInfo', 'sample_status']
x = df_Master2[df_Master2['target'].notnull()].drop(columns=c)
y = df_Master2[df_Master2['target'].notnull()]['target']
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=2, test_size=0.2)

def roc_auc_plot(clf, x_test, y_test, x_train, y_train):
    y_pre = clf.predict_proba(x_test)[:, 1]
    test_auc = roc_auc_score(y_test, y_pre)
    test_fpr, test_tpr, _ = roc_curve(y_test, y_pre)
    test_ks = abs(test_fpr - test_tpr).max()
    print('test_ks = ', test_ks)
    print('test_auc = ', test_auc)
    y_pre = clf.predict_proba(x_train)[:, 1]
    train_auc = roc_auc_score(y_train, y_pre)
    train_fpr, train_tpr, _ = roc_curve(y_train, y_pre)
    train_ks = abs(train_fpr - train_tpr).max()
    print('train_ks = ', train_ks)
    print('train_auc = ', train_auc)
    from matplotlib import pyplot as plt
    plt.plot(train_fpr, train_tpr, label='train')
    plt.plot(test_fpr, test_tpr, label='test')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC Curve')
    plt.legend(loc='best')
    plt.show()

clf = lgb.LGBMClassifier(
    boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
    max_depth=2, n_estimators=800, objective='binary',
    subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
    learning_rate=0.05, min_child_weight=50,
    random_state=None, n_jobs=cpu_count() - 1
)
clf = clf.fit(x_train, y_train, eval_set=[(x_train, y_train), (x_test, y_test)],
              eval_metric='auc', early_stopping_rounds=100)
roc_auc_plot(clf, x_test, y_test, x_train, y_train)
```

![效果图](https://img-blog.csdnimg.cn/b0f0fc72fd6e4efabd5a255d04d45c2a.png?x-oss-process=image/watermark,type_ZHJvaWRzYW5zZmFsbGJhY2s,shadow_50,text_Q1NETiBAc3VubnVhbjAx,size_12,color_FFFFFF,t_70,g_se,x_16)![在这里插入图片描述](https://img-blog.csdnimg.cn/f6bf32b8405b4b01b0aff07ada353146.png)

```python
# Model the full (un-pruned) feature set df_Master1
df_Master1 = pd.read_csv(r'F:\教师培训\ppd6\df_Master1.csv', encoding='gb18030')
df_Master1 = df_Master1.drop(columns='Unnamed: 0')
c = ['target', 'ListingInfo', 'sample_status']
x = df_Master1[df_Master1['target'].notnull()].drop(columns=c)
y = df_Master1[df_Master1['target'].notnull()]['target']
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=2, test_size=0.2)

clf = lgb.LGBMClassifier(
    boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
    max_depth=2, n_estimators=800, objective='binary',
    subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
    learning_rate=0.05, min_child_weight=50,
    random_state=None, n_jobs=cpu_count() - 1
)
clf = clf.fit(x_train, y_train, eval_set=[(x_train, y_train), (x_test, y_test)],
              eval_metric='auc', early_stopping_rounds=100)
roc_auc_plot(clf, x_test, y_test, x_train, y_train)
```
![效果图](https://img-blog.csdnimg.cn/3e64ddd0c6f049d3bb0d16192b6965d2.png?x-oss-process=image/watermark,type_ZHJvaWRzYW5zZmFsbGJhY2s,shadow_50,text_Q1NETiBAc3VubnVhbjAx,size_12,color_FFFFFF,t_70,g_se,x_16)![在这里插入图片描述](https://img-blog.csdnimg.cn/d6eab3099e3f463298a402b8cfe984cb.png)
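Since the two runs above differ only in which CSV they load, a small helper (my own consolidation of the code above, not in the original) avoids the copy-paste risk:

```python
def fit_and_report(csv_path):
    """Load a prepared master table, train the LGBMClassifier used above,
    and plot ROC/KS for train and test splits."""
    df = pd.read_csv(csv_path, encoding='gb18030').drop(columns='Unnamed: 0')
    labeled = df[df['target'].notnull()]
    x = labeled.drop(columns=['target', 'ListingInfo', 'sample_status'])
    y = labeled['target']
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=2, test_size=0.2)
    clf = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=31, reg_lambda=1,
                             max_depth=2, n_estimators=800, objective='binary',
                             subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
                             learning_rate=0.05, min_child_weight=50)
    clf.fit(x_train, y_train, eval_set=[(x_test, y_test)], eval_metric='auc',
            early_stopping_rounds=100)
    roc_auc_plot(clf, x_test, y_test, x_train, y_train)
    return clf

# fit_and_report(r'F:\教师培训\ppd6\df_Master2.csv')
# fit_and_report(r'F:\教师培训\ppd6\df_Master1.csv')
```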
### 6. Derive features from the df_userupdate table

```python
# Lower-case the modified-field names so variants collapse together
df_userupdate['UserupdateInfo1'] = df_userupdate['UserupdateInfo1'].apply(lambda x: x.lower())

# Derived variables from the modification-info table:
# 1) days between the latest modification and the deal date
# 2) total number of modifications
# 3) number of modifications per field
# 4) number of distinct modification dates
info1 = df_userupdate.groupby('Idx')['UserupdateInfo1'].count()

df_userupdate['ListingInfo1'] = pd.to_datetime(df_userupdate['ListingInfo1'])
df_userupdate['UserupdateInfo2'] = pd.to_datetime(df_userupdate['UserupdateInfo2'])
info2 = df_userupdate.groupby('Idx')['ListingInfo1'].max() - df_userupdate.groupby('Idx')['UserupdateInfo2'].max()
info2 = info2.apply(lambda x: x.days)

info3 = df_userupdate.pivot_table(index='Idx', columns='UserupdateInfo1', values='UserupdateInfo2',
                                  aggfunc={'UserupdateInfo2': 'count'}).fillna(0)

# 4) number of distinct modification dates
info4 = df_userupdate.groupby('Idx')['UserupdateInfo2'].nunique()

df_userupdate_info = pd.concat([info1, info2, info3, info4], axis=1)
df_userupdate_info.rename(columns={0: 'df_use_update0'}, inplace=True)

# Persist to disk
df_userupdate_info.to_csv(r'F:\教师培训\ppd6\df_userupdate_info.csv', encoding='gb18030', index=True)
```
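A toy example (synthetic rows, purely for illustration) shows what each groupby derivation produces:

```python
toy = pd.DataFrame({
    'Idx': [1, 1, 1, 2],
    'UserupdateInfo1': ['_idnumber', '_mobilephone', '_idnumber', '_hasbuycar'],
    'UserupdateInfo2': pd.to_datetime(['2014-01-01', '2014-01-01', '2014-01-05', '2014-02-10']),
    'ListingInfo1': pd.to_datetime(['2014-01-10'] * 3 + ['2014-02-12']),
})
print(toy.groupby('Idx')['UserupdateInfo1'].count())            # total modifications: 3 and 1
print((toy.groupby('Idx')['ListingInfo1'].max()
       - toy.groupby('Idx')['UserupdateInfo2'].max()).dt.days)  # days since last change: 5 and 2
print(toy.groupby('Idx')['UserupdateInfo2'].nunique())          # distinct change dates: 2 and 1
```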
### 7. Derive features from the df_loginfo table

```python
# Derived variables:
# 1) total number of logins
# 2) average interval between logins (in days)
# 3) days between the latest login and the deal date
df_loginfo['Listinginfo1'] = pd.to_datetime(df_loginfo['Listinginfo1'])
df_loginfo['LogInfo3'] = pd.to_datetime(df_loginfo['LogInfo3'])

info1 = df_loginfo.groupby('Idx')['LogInfo3'].count()

def f(x):
    # mean gap in days between consecutive logins
    x = x.sort_values(ascending=True)
    y = x - x.shift()
    p = y.apply(lambda z: z.days)
    res = p.sum() / len(x)
    return round(res, 3)

info2 = df_loginfo.groupby('Idx')['LogInfo3'].apply(f)

info3 = df_loginfo.groupby('Idx')['Listinginfo1'].max() - df_loginfo.groupby('Idx')['LogInfo3'].max()
info3 = info3.apply(lambda x: x.days)

df_loginfo_info = pd.concat([info1, info2, info3], axis=1)
# info1 and info2 are both named LogInfo3, so a rename dict cannot tell them
# apart; set the column names directly instead
df_loginfo_info.columns = ['login_info1', 'login_info2', 'login_info3']
df_loginfo_info.to_csv(r'F:\教师培训\ppd6\df_loginfo_info.csv', encoding='gb18030', index=True)
```
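For reference, the per-group function `f` can also be written with vectorized datetime operations (a sketch with the same semantics, dividing the summed gaps by the group size as above):

```python
def mean_login_gap(dates):
    """Average gap in days between consecutive login timestamps."""
    d = dates.sort_values()
    gaps = d.diff().dt.days          # NaN for the first login, like shift() above
    return round(gaps.sum() / len(d), 3)

# info2 = df_loginfo.groupby('Idx')['LogInfo3'].apply(mean_login_gap)
```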
### 8. Merge the derived tables (df_userupdate_info, df_loginfo_info) into df_Master1/df_Master2 and rebuild the model

```python
# df_loginfo_update_info: the two derived tables joined on Idx
# (reconstructed here; the original used this name without showing its construction)
df_loginfo_update_info = df_userupdate_info.merge(df_loginfo_info, on='Idx')

df_Master1 = df_Master1.merge(df_loginfo_update_info, on='Idx')
df_Master2 = pd.read_csv(r'F:\教师培训\ppd6\df_Master2.csv', encoding='gb18030')
df_Master2 = df_Master2.drop(columns='Unnamed: 0')
df_Master2 = df_Master2.merge(df_loginfo_update_info, on='Idx')

# The LGBMClassifier performs best on df_Master2, so this model is the basis
# for further prediction, refinement, and tuning
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve, auc
import lightgbm as lgb
from multiprocessing import cpu_count
import pandas as pd

c = ['target', 'ListingInfo', 'sample_status', 'Idx']
x = df_Master2[df_Master2['target'].notnull()].drop(columns=c)
y = df_Master2[df_Master2['target'].notnull()]['target']
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=2, test_size=0.2)

def roc_auc_plot(clf, x_test, y_test, x_train, y_train):
    y_pre = clf.predict_proba(x_test)[:, 1]
    test_auc = roc_auc_score(y_test, y_pre)
    test_fpr, test_tpr, _ = roc_curve(y_test, y_pre)
    test_ks = abs(test_fpr - test_tpr).max()
    print('test_ks = ', test_ks)
    print('test_auc = ', test_auc)
    y_pre = clf.predict_proba(x_train)[:, 1]
    train_auc = roc_auc_score(y_train, y_pre)
    train_fpr, train_tpr, _ = roc_curve(y_train, y_pre)
    train_ks = abs(train_fpr - train_tpr).max()
    print('train_ks = ', train_ks)
    print('train_auc = ', train_auc)
    from matplotlib import pyplot as plt
    plt.plot(train_fpr, train_tpr, label='train')
    plt.plot(test_fpr, test_tpr, label='test')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC Curve')
    plt.legend(loc='best')
    plt.show()

clf = lgb.LGBMClassifier(
    boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
    max_depth=2, n_estimators=800, objective='binary',
    subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
    learning_rate=0.05, min_child_weight=50,
    random_state=None, n_jobs=cpu_count() - 1
)
clf = clf.fit(x_train, y_train, eval_set=[(x_train, y_train), (x_test, y_test)],
              eval_metric='auc', early_stopping_rounds=100)
roc_auc_plot(clf, x_test, y_test, x_train, y_train)
```

![模型效果](https://img-blog.csdnimg.cn/fae49700e2594a70be46ed9b4235cfca.png?x-oss-process=image/watermark,type_ZHJvaWRzYW5zZmFsbGJhY2s,shadow_50,text_Q1NETiBAc3VubnVhbjAx,size_18,color_FFFFFF,t_70,g_se,x_16)
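With a trained model in hand, the held-out competition rows (marked `sample_status == 'test'` during the merge, which is why the two sets were concatenated in the first place) can be scored. A minimal sketch; the output file name and `score` column are my choices:

```python
test_rows = df_Master2[df_Master2['sample_status'] == 'test']
scores = clf.predict_proba(test_rows.drop(columns=c))[:, 1]   # probability of default
pd.DataFrame({'Idx': test_rows['Idx'], 'score': scores}).to_csv('submission.csv', index=False)
```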
### 9. Hyperparameter tuning

```python
# Tune num_leaves with grid search, first with a step of 5
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
import time
import lightgbm as lgb

df_Master2 = pd.read_csv(r'F:\教师培训\ppd6\df_Master2.csv', encoding='gb18030')
df_Master2 = df_Master2.drop(columns='Unnamed: 0')
c = ['target', 'ListingInfo', 'sample_status', 'Idx']
x = df_Master2[df_Master2['target'].notnull()].drop(columns=c)
y = df_Master2[df_Master2['target'].notnull()]['target']
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=2, test_size=0.2)

param_find1 = {'num_leaves': range(10, 50, 5)}
cv_fold = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)
start = time.time()
grid_search1 = GridSearchCV(
    estimator=lgb.LGBMClassifier(learning_rate=0.1, n_estimators=28, max_depth=-1,
                                 min_child_weight=0.001, min_child_samples=20,
                                 subsample=0.8, colsample_bytree=0.8,
                                 reg_lambda=0, reg_alpha=0),
    cv=cv_fold, n_jobs=-1, param_grid=param_find1, scoring='roc_auc')
grid_search1.fit(x_train, y_train)
end = time.time()
print('Elapsed time: {}'.format(round(end - start, 0)))
print(grid_search1.get_params)
print('\t')
print(grid_search1.best_params_)
print('\t')
print(grid_search1.best_score_)
```

![在这里插入图片描述](https://img-blog.csdnimg.cn/fb0498a127bf45c4be1b6f0ea2804234.png)

```python
# Refine num_leaves with a step of 1
param_find2 = {'num_leaves': range(20, 30, 1)}
grid_search2 = GridSearchCV(
    estimator=lgb.LGBMClassifier(n_estimators=28, learning_rate=0.1,
                                 min_child_weight=0.001, min_child_samples=20,
                                 subsample=0.8, colsample_bytree=0.8,
                                 reg_alpha=0, reg_lambda=0),
    cv=StratifiedKFold(n_splits=5, random_state=0, shuffle=True),
    n_jobs=-1, scoring='roc_auc', param_grid=param_find2)
grid_search2.fit(x_train, y_train)
print(grid_search2.get_params, '\t', grid_search2.best_params_, '\t', grid_search2.best_score_)
```

![在这里插入图片描述](https://img-blog.csdnimg.cn/c21fcf66e0b14c8592eb640034e765c2.png)

```python
# With num_leaves fixed at 25, tune min_child_samples and min_child_weight,
# with a step of 3 for the sample counts
param_find3 = {'min_child_weight': np.linspace(0.001, 0.003, 3),
               'min_child_samples': range(15, 25, 3)}
grid_search3 = GridSearchCV(
    estimator=lgb.LGBMClassifier(n_estimators=28, learning_rate=0.1, num_leaves=25,
                                 subsample=0.8, colsample_bytree=0.8,
                                 reg_alpha=0, reg_lambda=0),
    cv=StratifiedKFold(n_splits=5, random_state=0, shuffle=True),
    n_jobs=-1, scoring='roc_auc', param_grid=param_find3)
grid_search3.fit(x_train, y_train)
print(grid_search3.get_params, '\t', grid_search3.best_params_, '\t', grid_search3.best_score_)
```

![在这里插入图片描述](https://img-blog.csdnimg.cn/8f81ef2415534bee98983f6fbc85b7a7.png)

```python
# With {'min_child_samples': 24, 'min_child_weight': 0.001} fixed,
# tune subsample and colsample_bytree
param_find4 = {'subsample': np.linspace(0.5, 1, 5),
               'colsample_bytree': np.linspace(0.5, 1, 5)}
grid_search4 = GridSearchCV(
    estimator=lgb.LGBMClassifier(n_estimators=28, learning_rate=0.1, num_leaves=25,
                                 min_child_weight=0.001, min_child_samples=24,
                                 reg_alpha=0, reg_lambda=0),
    cv=StratifiedKFold(n_splits=5, random_state=0, shuffle=True),
    n_jobs=-1, scoring='roc_auc', param_grid=param_find4)
grid_search4.fit(x_train, y_train)
print(grid_search4.get_params, '\t', grid_search4.best_params_, '\t', grid_search4.best_score_)
```

![在这里插入图片描述](https://img-blog.csdnimg.cn/02ea3cf0b1cc4fa7a6ef28cca8a2a752.png)

```python
# Refit with the tuned parameters
lgb_single_model = lgb.LGBMClassifier(n_estimators=28, learning_rate=0.11188888888888888,
                                      num_leaves=25, min_child_weight=0.001,
                                      min_child_samples=24, subsample=0.5,
                                      colsample_bytree=0.75, reg_alpha=0.3, reg_lambda=0.126)
lgb_single_model.fit(x_train, y_train)
pre = lgb_single_model.predict_proba(x_test)[:, 1]
acu_curve(y_test, pre)
```
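`acu_curve` is not defined in this post (it comes from the referenced article); a plausible stand-in in the same spirit as `roc_auc_plot` above:

```python
from sklearn.metrics import roc_auc_score, roc_curve
from matplotlib import pyplot as plt

def acu_curve(y_true, y_score):
    """Plot the ROC curve and print AUC and KS for one set of scores.
    (Reconstruction; the original helper may differ in details.)"""
    fpr, tpr, _ = roc_curve(y_true, y_score)
    print('auc = ', roc_auc_score(y_true, y_score))
    print('ks  = ', abs(fpr - tpr).max())
    plt.plot(fpr, tpr, label='test')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC Curve')
    plt.legend(loc='best')
    plt.show()
```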
## Summary

The code above is adapted from https://zhuanlan.zhihu.com/p/56864235, with my own understanding and improvements added. The dataset is available at https://www.kesci.com/home/competition/56cd5f02b89b5bd026cb39c9/content/1