特征选择和建模

数据预处理

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
# 确定哪些包是安装好的

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
import warnings
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss
warnings.filterwarnings('ignore')
1
2
data_train =pd.read_csv('./train.csv')
data_test_a = pd.read_csv('./testA.csv')
1
2
data_train.info()
is_default = data_train['isDefault']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800000 entries, 0 to 799999
Data columns (total 47 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   id                  800000 non-null  int64  
 1   loanAmnt            800000 non-null  float64
 2   term                800000 non-null  int64  
 3   interestRate        800000 non-null  float64
 4   installment         800000 non-null  float64
 5   grade               800000 non-null  object 
 6   subGrade            800000 non-null  object 
 7   employmentTitle     799999 non-null  float64
 8   employmentLength    753201 non-null  object 
 9   homeOwnership       800000 non-null  int64  
 10  annualIncome        800000 non-null  float64
 11  verificationStatus  800000 non-null  int64  
 12  issueDate           800000 non-null  object 
 13  isDefault           800000 non-null  int64  
 14  purpose             800000 non-null  int64  
 15  postCode            799999 non-null  float64
 16  regionCode          800000 non-null  int64  
 17  dti                 799761 non-null  float64
 18  delinquency_2years  800000 non-null  float64
 19  ficoRangeLow        800000 non-null  float64
 20  ficoRangeHigh       800000 non-null  float64
 21  openAcc             800000 non-null  float64
 22  pubRec              800000 non-null  float64
 23  pubRecBankruptcies  799595 non-null  float64
 24  revolBal            800000 non-null  float64
 25  revolUtil           799469 non-null  float64
 26  totalAcc            800000 non-null  float64
 27  initialListStatus   800000 non-null  int64  
 28  applicationType     800000 non-null  int64  
 29  earliesCreditLine   800000 non-null  object 
 30  title               799999 non-null  float64
 31  policyCode          800000 non-null  float64
 32  n0                  759730 non-null  float64
 33  n1                  759730 non-null  float64
 34  n2                  759730 non-null  float64
 35  n2.1                759730 non-null  float64
 36  n4                  766761 non-null  float64
 37  n5                  759730 non-null  float64
 38  n6                  759730 non-null  float64
 39  n7                  759730 non-null  float64
 40  n8                  759729 non-null  float64
 41  n9                  759730 non-null  float64
 42  n10                 766761 non-null  float64
 43  n11                 730248 non-null  float64
 44  n12                 759730 non-null  float64
 45  n13                 759730 non-null  float64
 46  n14                 759730 non-null  float64
dtypes: float64(33), int64(9), object(5)
memory usage: 286.9+ MB

缺失值填充

EDA 中发现数据仍存在较多缺失值,因此尝试多种缺失值填充方式,比较结果后选择效果最优的一种

1
2
3
4
5
6
# Numeric features: every column whose dtype is not `object`
numerical_fea = data_train.select_dtypes(exclude=['object']).columns.tolist()
# Remaining (object-dtype) columns are treated as categorical features
category_fea = [col for col in data_train.columns if col not in numerical_fea]
# The target 'isDefault' is not a feature, so drop it from the numeric list
numerical_fea.remove('isDefault')
1
print(len(numerical_fea))
41

有三种缺失值填充的方式

  • 缺失值替换为指定的值(’0’)
  • 用上面的值替换缺失值
  • 纵向用下面的值来替换,且最多只填充两个连续的
1
2
# 显示缺失值的情况
data_train.isnull().sum()
id                        0
loanAmnt                  0
term                      0
interestRate              0
installment               0
grade                     0
subGrade                  0
employmentTitle           1
employmentLength      46799
homeOwnership             0
annualIncome              0
verificationStatus        0
issueDate                 0
isDefault                 0
purpose                   0
postCode                  1
regionCode                0
dti                     239
delinquency_2years        0
ficoRangeLow              0
ficoRangeHigh             0
openAcc                   0
pubRec                    0
pubRecBankruptcies      405
revolBal                  0
revolUtil               531
totalAcc                  0
initialListStatus         0
applicationType           0
earliesCreditLine         0
title                     1
policyCode                0
n0                    40270
n1                    40270
n2                    40270
n2.1                  40270
n4                    33239
n5                    40270
n6                    40270
n7                    40270
n8                    40271
n9                    40270
n10                   33239
n11                   69752
n12                   40270
n13                   40270
n14                   40270
dtype: int64
1
2
3
4
5
6
7
8
9
10
# Selects one of three optional missing-value strategies; 0 disables them all
fill_type = 0
if fill_type == 1:
    # Replace missing values with a fixed value
    data_train = data_train.fillna(0)
elif fill_type == 2:
    # Forward-fill: propagate the previous row's value down each column
    # (ffill() replaces the deprecated fillna(method='ffill'))
    data_train = data_train.ffill(axis=0)
elif fill_type == 3:
    # Backward-fill from the next row, filling at most two consecutive gaps
    data_train = data_train.bfill(axis=0, limit=2)
1
2
3
4
5
6
# Fill numeric features with the training-set median (the same statistic is
# reused for the test set so both frames are imputed consistently)
train_median = data_train[numerical_fea].median()
data_train[numerical_fea] = data_train[numerical_fea].fillna(train_median)
data_test_a[numerical_fea] = data_test_a[numerical_fea].fillna(train_median)
# Fill categorical features with the training-set mode.
# DataFrame.mode() returns a DataFrame; passing it to fillna() aligns on the
# row index and would only touch row 0, so take the first mode row as a Series
# keyed by column name.
train_mode = data_train[category_fea].mode().iloc[0]
data_train[category_fea] = data_train[category_fea].fillna(train_mode)
data_test_a[category_fea] = data_test_a[category_fea].fillna(train_mode)
1
2
3
4
5
6
7
8
# Handle the non-numeric date column 'issueDate'
data_train.isnull().sum()
# Date-like columns need dedicated handling: ['issueDate'] ['earliesCreditLine']
# Reference date used to turn issue dates into a day offset (loop-invariant)
startdate = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d')
for data in [data_train, data_test_a]:
    data['issueDate'] = pd.to_datetime(data['issueDate'], format='%Y-%m-%d')
    # Build a numeric time feature: days elapsed since the reference date
    data['issueDateDT'] = (data['issueDate'] - startdate).dt.days
1
2
3
4
5
6
7
8
9
10
11
data_train['employmentLength'].value_counts(dropna=False).sort_index()
# employmentLength 这一列特征中主要存储的是带信息的字符串稍微处理下变成int类型
def employmentLength_to_int(s):
    """Convert an employment-length string such as '10 years' to ``np.int8``.

    The leading whitespace-separated token is parsed as the number of years.
    Missing values (anything ``pd.isnull`` reports as null) are returned
    unchanged so they can be imputed later.
    """
    if pd.isnull(s):
        return s
    return np.int8(int(s.split()[0]))
for data in [data_train, data_test_a]:
    # Normalise the two non-numeric spellings first. The result is assigned
    # back rather than using replace(inplace=True) on a column selection,
    # which is not guaranteed to update the parent frame.
    data['employmentLength'] = data['employmentLength'].replace(
        {'10+ years': '10 years', '< 1 year': '0 years'}
    )
    data['employmentLength'] = data['employmentLength'].apply(employmentLength_to_int)
1
2
data['employmentLength'].value_counts(dropna=False).sort_index()
# 可以看出数据的分布主要还是集中在10年以上
0.0     15989
1.0     13182
2.0     18207
3.0     16011
4.0     11833
5.0     12543
6.0      9328
7.0      8823
8.0      8976
9.0      7594
10.0    65772
NaN     11742
Name: employmentLength, dtype: int64

‘earliesCreditLine’这个特征表示借款人最早报告的信用额度开立的月份,存储时采用的是 str 类型

1
data_train['earliesCreditLine'].sample(5)
631614    May-2002
657973    Mar-1995
518715    Jan-2003
16630     Apr-1993
308502    Jun-2009
Name: earliesCreditLine, dtype: object
1
2
3
# The time span of this feature is large, so only the year is kept:
# values look like 'May-2002', i.e. the last four characters are the year
for data in [data_train, data_test_a]:
    data['earliesCreditLine'] = data['earliesCreditLine'].str[-4:].astype('int64')

处理一些类别特征
用nunique() 可以找到不重复的类别有几种
grade 类型数: 7
subGrade 类型数: 35
employmentTitle 类型数: 79282
homeOwnership 类型数: 6
verificationStatus 类型数: 3
purpose 类型数: 14
postCode 类型数: 889
regionCode 类型数: 51
applicationType 类型数: 2
initialListStatus 类型数: 2
title 类型数: 12058
policyCode 类型数: 1

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
# Human-readable description of each column, keyed by column name.
# The 'id' key must be quoted: a bare `id` would use the builtin function
# object as the key instead of the column name.
semantic_dict = {
    'id': '为贷款清单分配的唯一信用证标识',
    'loanAmnt': '贷款金额',
    'term': '贷款期限(year)',
    'interestRate': '贷款利率',
    'installment': '分期付款金额',
    'grade': '贷款等级',
    'subGrade': '贷款等级之子级',
    'employmentTitle': '就业职称',
    'employmentLength': '就业年限(年)',
    'homeOwnership': '借款人在登记时提供的房屋所有权状况',
    'annualIncome': '年收入',
    'verificationStatus': '验证状态',
    'issueDate': '贷款发放的月份',
    'purpose': '借款人在贷款申请时的贷款用途类别',
    'postCode': '借款人在贷款申请中提供的邮政编码的前3位数字',
    'regionCode': '地区编码',
    'dti': '债务收入比',
    'delinquency_2years': '借款人过去2年信用档案中逾期30天以上的违约事件数',
    'ficoRangeLow': '借款人在贷款发放时的fico所属的下限范围',
    'ficoRangeHigh': '借款人在贷款发放时的fico所属的上限范围',
    'openAcc': '借款人信用档案中未结信用额度的数量',
    'pubRec': '贬损公共记录的数量',
    'pubRecBankruptcies': '公开记录清除的数量',
    'revolBal': '信贷周转余额合计',
    'revolUtil': '循环额度利用率,或借款人使用的相对于所有可用循环信贷的信贷金额',
    'totalAcc': '借款人信用档案中当前的信用额度总数',
    'initialListStatus': '贷款的初始列表状态',
    'applicationType': '表明贷款是个人申请还是与两个共同借款人的联合申请',
    'earliesCreditLine': '借款人最早报告的信用额度开立的月份',
    'title': '借款人提供的贷款名称',
    # NOTE(review): this value also appears to contain the description of the
    # anonymous n0-n14 features; kept verbatim, consider splitting it.
    'policyCode': '公开可用的策略_代码=1新产品不公开可用的策略_代码=2n系列匿名特征 匿名特征n0-n14,为一些贷款人行为计数特征的处理',
}
1
2
3
4
5
# Map the ordinal loan grade A..G onto the integers 1..7
for data in [data_train, data_test_a]:
    data['grade'] = data['grade'].map({'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7})
# NOTE(review): the one-hot result is bound only to the loop variable `data`,
# so data_train / data_test_a are NOT modified by this loop. Later steps still
# use the original columns (the 3-sigma pass iterates numerical_fea, and
# 'subGrade' is label-encoded), so writing the dummies back would require
# adjusting those steps as well.
for data in [data_train, data_test_a]:
    data = pd.get_dummies(
        data,
        columns=['subGrade', 'homeOwnership', 'verificationStatus', 'purpose', 'regionCode'],
        drop_first=True,
    )

异常值处理

  • 首先,如果这一异常值并不代表一种规律性的,而是极其偶然的现象,或者说你并不想研究这种偶然的现象,这时可以将其删除。
  • 其次,如果异常值存在且代表了一种真实存在的现象,那就不能随便删除。在现有的欺诈场景中很多时候欺诈数据本身相对于正常数据说就是异常的,我们要把这些异常点纳入,重新拟合模型,研究其规律。能用监督的用监督模型,不能用的还可以考虑用异常检测的算法来做。
  • 注意test的数据不能删。
    让我们来分析一下数值型数据的异常值
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
# 使用3sigma分析来找异常值,此时的data是[200000, 148], 增加的列编程了
def find_outliers_by_3segama(data, fea):
    """Flag values of column ``fea`` that lie outside mean +/- 3 * std.

    Adds a string column ``fea + '_outliers'`` to ``data`` (in place) holding
    '异常值' for values strictly above the upper bound or strictly below the
    lower bound and '正常值' otherwise, then returns the same frame.
    The population standard deviation (ddof=0) is used. Missing values
    compare false against both bounds and are therefore labelled '正常值'.
    """
    values = data[fea]
    data_mean = values.mean()
    data_std = values.std(ddof=0)
    outliers_cut_off = data_std * 3
    lower_rule = data_mean - outliers_cut_off
    upper_rule = data_mean + outliers_cut_off
    # Vectorised comparison instead of a per-row Python lambda
    is_outlier = (values > upper_rule) | (values < lower_rule)
    data[fea + '_outliers'] = np.where(is_outlier, '异常值', '正常值')
    return data

# Work on a copy so the flag columns are added to an independent frame
data_train = data_train.copy()
for fea in numerical_fea:
    data_train = find_outliers_by_3segama(data_train, fea)
    flag_col = fea + '_outliers'
    # How many rows are flagged, and how many defaults fall in each group
    print(data_train[flag_col].value_counts())
    print(data_train.groupby(flag_col)['isDefault'].sum())
    print('*' * 10)
# data_train[80w, 89]
正常值    800000
Name: id_outliers, dtype: int64
id_outliers
正常值    159610
Name: isDefault, dtype: int64
**********
正常值    800000
Name: loanAmnt_outliers, dtype: int64
loanAmnt_outliers
正常值    159610
Name: isDefault, dtype: int64
**********
正常值    800000
Name: term_outliers, dtype: int64
term_outliers
正常值    159610
Name: isDefault, dtype: int64
**********
正常值    794259
异常值      5741
Name: interestRate_outliers, dtype: int64
interestRate_outliers
异常值      2916
正常值    156694
Name: isDefault, dtype: int64
**********
正常值    792046
异常值      7954
Name: installment_outliers, dtype: int64
installment_outliers
异常值      2152
正常值    157458
Name: isDefault, dtype: int64
**********
正常值    800000
Name: employmentTitle_outliers, dtype: int64
employmentTitle_outliers
正常值    159610
Name: isDefault, dtype: int64
**********
正常值    799701
异常值       299
Name: homeOwnership_outliers, dtype: int64
homeOwnership_outliers
异常值        62
正常值    159548
Name: isDefault, dtype: int64
**********
正常值    793973
异常值      6027
Name: annualIncome_outliers, dtype: int64
annualIncome_outliers
异常值       756
正常值    158854
Name: isDefault, dtype: int64
**********
正常值    800000
Name: verificationStatus_outliers, dtype: int64
verificationStatus_outliers
正常值    159610
Name: isDefault, dtype: int64
**********
正常值    783003
异常值     16997
Name: purpose_outliers, dtype: int64
purpose_outliers
异常值      3635
正常值    155975
Name: isDefault, dtype: int64
**********
正常值    798931
异常值      1069
Name: postCode_outliers, dtype: int64
postCode_outliers
异常值       221
正常值    159389
Name: isDefault, dtype: int64
**********
正常值    799994
异常值         6
Name: regionCode_outliers, dtype: int64
regionCode_outliers
异常值         1
正常值    159609
Name: isDefault, dtype: int64
**********
正常值    798440
异常值      1560
Name: dti_outliers, dtype: int64
dti_outliers
异常值       466
正常值    159144
Name: isDefault, dtype: int64
**********
正常值    778245
异常值     21755
Name: delinquency_2years_outliers, dtype: int64
delinquency_2years_outliers
异常值      5089
正常值    154521
Name: isDefault, dtype: int64
**********
正常值    788261
异常值     11739
Name: ficoRangeLow_outliers, dtype: int64
ficoRangeLow_outliers
异常值       778
正常值    158832
Name: isDefault, dtype: int64
**********
正常值    788261
异常值     11739
Name: ficoRangeHigh_outliers, dtype: int64
ficoRangeHigh_outliers
异常值       778
正常值    158832
Name: isDefault, dtype: int64
**********
正常值    790889
异常值      9111
Name: openAcc_outliers, dtype: int64
openAcc_outliers
异常值      2195
正常值    157415
Name: isDefault, dtype: int64
**********
正常值    792471
异常值      7529
Name: pubRec_outliers, dtype: int64
pubRec_outliers
异常值      1701
正常值    157909
Name: isDefault, dtype: int64
**********
正常值    794120
异常值      5880
Name: pubRecBankruptcies_outliers, dtype: int64
pubRecBankruptcies_outliers
异常值      1423
正常值    158187
Name: isDefault, dtype: int64
**********
正常值    790001
异常值      9999
Name: revolBal_outliers, dtype: int64
revolBal_outliers
异常值      1359
正常值    158251
Name: isDefault, dtype: int64
**********
正常值    799948
异常值        52
Name: revolUtil_outliers, dtype: int64
revolUtil_outliers
异常值        23
正常值    159587
Name: isDefault, dtype: int64
**********
正常值    791663
异常值      8337
Name: totalAcc_outliers, dtype: int64
totalAcc_outliers
异常值      1668
正常值    157942
Name: isDefault, dtype: int64
**********
正常值    800000
Name: initialListStatus_outliers, dtype: int64
initialListStatus_outliers
正常值    159610
Name: isDefault, dtype: int64
**********
正常值    784586
异常值     15414
Name: applicationType_outliers, dtype: int64
applicationType_outliers
异常值      3875
正常值    155735
Name: isDefault, dtype: int64
**********
正常值    775134
异常值     24866
Name: title_outliers, dtype: int64
title_outliers
异常值      3900
正常值    155710
Name: isDefault, dtype: int64
**********
正常值    800000
Name: policyCode_outliers, dtype: int64
policyCode_outliers
正常值    159610
Name: isDefault, dtype: int64
**********
正常值    782773
异常值     17227
Name: n0_outliers, dtype: int64
n0_outliers
异常值      3485
正常值    156125
Name: isDefault, dtype: int64
**********
正常值    790500
异常值      9500
Name: n1_outliers, dtype: int64
n1_outliers
异常值      2491
正常值    157119
Name: isDefault, dtype: int64
**********
正常值    789067
异常值     10933
Name: n2_outliers, dtype: int64
n2_outliers
异常值      3205
正常值    156405
Name: isDefault, dtype: int64
**********
正常值    789067
异常值     10933
Name: n2.1_outliers, dtype: int64
n2.1_outliers
异常值      3205
正常值    156405
Name: isDefault, dtype: int64
**********
正常值    788660
异常值     11340
Name: n4_outliers, dtype: int64
n4_outliers
异常值      2476
正常值    157134
Name: isDefault, dtype: int64
**********
正常值    790355
异常值      9645
Name: n5_outliers, dtype: int64
n5_outliers
异常值      1858
正常值    157752
Name: isDefault, dtype: int64
**********
正常值    786006
异常值     13994
Name: n6_outliers, dtype: int64
n6_outliers
异常值      3182
正常值    156428
Name: isDefault, dtype: int64
**********
正常值    788430
异常值     11570
Name: n7_outliers, dtype: int64
n7_outliers
异常值      2746
正常值    156864
Name: isDefault, dtype: int64
**********
正常值    789625
异常值     10375
Name: n8_outliers, dtype: int64
n8_outliers
异常值      2131
正常值    157479
Name: isDefault, dtype: int64
**********
正常值    786384
异常值     13616
Name: n9_outliers, dtype: int64
n9_outliers
异常值      3953
正常值    155657
Name: isDefault, dtype: int64
**********
正常值    788979
异常值     11021
Name: n10_outliers, dtype: int64
n10_outliers
异常值      2639
正常值    156971
Name: isDefault, dtype: int64
**********
正常值    799434
异常值       566
Name: n11_outliers, dtype: int64
n11_outliers
异常值       112
正常值    159498
Name: isDefault, dtype: int64
**********
正常值    797585
异常值      2415
Name: n12_outliers, dtype: int64
n12_outliers
异常值       545
正常值    159065
Name: isDefault, dtype: int64
**********
正常值    788907
异常值     11093
Name: n13_outliers, dtype: int64
n13_outliers
异常值      2482
正常值    157128
Name: isDefault, dtype: int64
**********
正常值    788884
异常值     11116
Name: n14_outliers, dtype: int64
n14_outliers
异常值      3364
正常值    156246
Name: isDefault, dtype: int64
**********
1
2
3
4
# Drop every training row flagged as an outlier for any numeric feature.
# A single combined mask keeps the same rows as filtering feature by feature,
# and the index is reset once so it is contiguous again afterwards.
outlier_cols = [fea + '_outliers' for fea in numerical_fea]
keep_mask = (data_train[outlier_cols] == '正常值').all(axis=1)
data_train = data_train[keep_mask].reset_index(drop=True)

数据分桶

将连续型数值(例如 999、99、8 这类取值)离散化后再处理

1
2
3
4
5
6
# Apply the binning to both frames explicitly; writing to the leftover loop
# variable `data` would leave data_train / data_test_a without these columns.
for data in [data_train, data_test_a]:
    # Fixed-width bins: integer division maps loanAmnt to bins of width 1000
    data['loanAmnt_bin1'] = np.floor_divide(data['loanAmnt'], 1000)
    # Exponential-width bins via the base-10 logarithm
    data['loanAmnt_bin2'] = np.floor(np.log10(data['loanAmnt']))
# Quantile bins: derive the decile edges from the training set only, then
# reuse them for the test set so the same amount maps to the same bin.
_, loan_bin_edges = pd.qcut(
    data_train['loanAmnt'], 10, labels=False, retbins=True, duplicates='drop'
)
# Open the outer edges so amounts outside the training range still get a bin
loan_bin_edges[0], loan_bin_edges[-1] = -np.inf, np.inf
for data in [data_train, data_test_a]:
    data['loanAmnt_bin3'] = pd.cut(data['loanAmnt'], bins=loan_bin_edges, labels=False)

特征编码

labelEncode 直接放入树模型中

1
2
3
4
5
6
7
8
# Label-encode the high-cardinality categorical features
# (employmentTitle, postCode, title, subGrade)
for col in tqdm(['employmentTitle', 'postCode', 'title', 'subGrade']):
    le = LabelEncoder()
    train_values = data_train[col].astype(str).values
    test_values = data_test_a[col].astype(str).values
    # Fit on train and test together so every category seen in either frame
    # has a code and transform() cannot hit an unseen label
    le.fit(list(train_values) + list(test_values))
    data_train[col] = le.transform(train_values)
    data_test_a[col] = le.transform(test_values)
print('Label Encoding 完成')
100%|██████████| 4/4 [00:03<00:00,  1.13it/s]


Label Encoding 完成

逻辑回归等模型要单独增加的特征工程

  • 对特征做归一化,去除相关性高的特征
  • 归一化目的是让训练过程更好更快的收敛,避免特征大吃小的问题
  • 去除相关性是增加模型的可解释性,加快预测过程。

特征选择

特征选择技术可以精简掉无用的特征,以降低最终模型的复杂性,它的最终目的是得到一个简约模型,
在不降低预测准确率或对预测准确率影响不大的情况下提高计算速度。特征选择不是为了减少训练时间
(实际上,一些技术会增加总体训练时间),而是为了减少模型评分时间。

  • 1 Filter
    • 方差选择法
    • 相关系数法(pearson 相关系数)
    • 卡方检验
    • 互信息法
  • 2 Wrapper (RFE)
    • 递归特征消除法
  • 3 Embedded
    • 基于惩罚项的特征选择法
    • 基于树模型的特征选择
1
2
# Forward-fill remaining gaps column-wise with the value from the row above
# (ffill() replaces the deprecated fillna(method='ffill'))
data_train = data_train.ffill(axis=0)
1
2
3
4
5
6
x_train = data_train
# Correlation (not covariance) of each feature with the target.
# Only numeric columns are used: the frame also holds string '_outliers'
# flags and a datetime column, for which a correlation is undefined.
numeric_part = x_train.select_dtypes(include='number')
data_corr = numeric_part.corrwith(data_train['isDefault'])
result = pd.DataFrame({'features': data_corr.index, 'corr': data_corr.values})
1
2
3
4
5
6
7
# The pairwise correlations can also be inspected visually
data_numeric = data_train[numerical_fea]
correlation = data_numeric.corr()

f, ax = plt.subplots(figsize=(7, 7))
# The heatmap shows feature-to-feature correlation; there is no 'Price' column
plt.title('Correlation of Numeric Features', y=1, size=16)
sns.heatmap(correlation, square=True, vmax=0.8)
<matplotlib.axes._subplots.AxesSubplot at 0x16522246898>

png

1
2
3
4
# Columns that must not be fed to the model: identifier, raw date, target
excluded = ['id', 'issueDate', 'isDefault']
# Keep everything else except the helper '_outliers' flag columns
features = [name for name in data_train.columns
            if '_outliers' not in name and name not in excluded]
y_train = data_train['isDefault']
x_train = data_train[features]
x_test = data_test_a[features]
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
def cv_model(clf, train_x, train_y, test_x, clf_name):
    """Run 5-fold cross-validation for one gradient-boosting backend.

    Args:
        clf: Backend entry point. For ``"lgb"`` / ``"xgb"`` an object exposing
            ``Dataset`` / ``DMatrix`` and ``train`` (the imported module); for
            ``"cat"`` a model class that is instantiated per fold.
        train_x: Training features; rows are selected with ``.iloc``.
        train_y: Training labels, positionally aligned with ``train_x``.
        test_x: Features to predict; its row count sizes the test output.
        clf_name: One of ``"lgb"``, ``"xgb"`` or ``"cat"``.

    Returns:
        ``(train, test)``: out-of-fold predictions for every training row and
        the test predictions averaged over all folds.

    Raises:
        ValueError: If ``clf_name`` is not a supported backend name.
    """
    if clf_name not in ("lgb", "xgb", "cat"):
        raise ValueError("unsupported clf_name: %r" % (clf_name,))

    folds = 5
    seed = 2020
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

    train = np.zeros(train_x.shape[0])
    test = np.zeros(test_x.shape[0])
    # Positional label access: KFold yields positions, not index labels
    labels = np.asarray(train_y)

    cv_scores = []

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, val_x = train_x.iloc[train_index], train_x.iloc[valid_index]
        trn_y, val_y = labels[train_index], labels[valid_index]

        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)

            # The recorded run log shows 'n_jobs' overriding 'nthread' and
            # 'silent' being reported as an unknown parameter.
            params = {
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'metric': 'auc',
                'min_child_weight': 5,
                'num_leaves': 2 ** 5,
                'lambda_l2': 10,
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'bagging_freq': 4,
                'learning_rate': 0.1,
                'seed': 2020,
                'nthread': 28,
                'n_jobs': 24,
                'silent': True,
                'verbose': -1,
            }

            model = clf.train(params, train_matrix, 50000, valid_sets=[train_matrix, valid_matrix], verbose_eval=200, early_stopping_rounds=200)
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)

        elif clf_name == "xgb":
            train_matrix = clf.DMatrix(trn_x, label=trn_y)
            valid_matrix = clf.DMatrix(val_x, label=val_y)

            params = {
                'booster': 'gbtree',
                'objective': 'binary:logistic',
                'eval_metric': 'auc',
                'gamma': 1,
                'min_child_weight': 1.5,
                'max_depth': 5,
                'lambda': 10,
                'subsample': 0.7,
                'colsample_bytree': 0.7,
                'colsample_bylevel': 0.7,
                'eta': 0.04,
                'tree_method': 'exact',
                'seed': 2020,
                'nthread': 36,
                "silent": True,
            }

            watchlist = [(train_matrix, 'train'), (valid_matrix, 'eval')]

            model = clf.train(params, train_matrix, num_boost_round=50000, evals=watchlist, verbose_eval=200, early_stopping_rounds=200)
            val_pred = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
            # The booster predicts on a DMatrix, so wrap the raw test features
            test_pred = model.predict(clf.DMatrix(test_x), ntree_limit=model.best_ntree_limit)

        else:  # clf_name == "cat"
            params = {'learning_rate': 0.05, 'depth': 5, 'l2_leaf_reg': 10, 'bootstrap_type': 'Bernoulli',
                      'od_type': 'Iter', 'od_wait': 50, 'random_seed': 11, 'allow_writing_files': False}

            model = clf(iterations=20000, **params)
            model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                      cat_features=[], use_best_model=True, verbose=500)

            val_pred = model.predict(val_x)
            test_pred = model.predict(test_x)

        train[valid_index] = val_pred
        # Accumulate so the result is the mean over all folds; plain
        # assignment would keep only the last fold's prediction / n_splits
        test += test_pred / kf.n_splits
        cv_scores.append(roc_auc_score(val_y, val_pred))

        print(cv_scores)

    print("%s_scotrainre_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    return train, test
1
2
3
4
5
6
7
8
9
10
11
12
def lgb_model(x_train, y_train, x_test):
    """Cross-validate with the LightGBM backend.

    Returns the ``(train, test)`` pair produced by ``cv_model``.
    """
    lgb_train, lgb_test = cv_model(lgb, x_train, y_train, x_test, "lgb")
    return lgb_train, lgb_test

def xgb_model(x_train, y_train, x_test):
    """Cross-validate with the XGBoost backend.

    Returns the ``(train, test)`` pair produced by ``cv_model``.
    """
    xgb_train, xgb_test = cv_model(xgb, x_train, y_train, x_test, "xgb")
    return xgb_train, xgb_test

def cat_model(x_train, y_train, x_test):
    """Cross-validate with the CatBoost backend.

    Returns the ``(train, test)`` pair produced by ``cv_model``, matching
    ``lgb_model`` and ``xgb_model``.
    """
    cat_train, cat_test = cv_model(CatBoostRegressor, x_train, y_train, x_test, "cat")
    # Return the predictions; without this the function yields None
    return cat_train, cat_test

lgb_train, lgb_test = lgb_model(x_train, y_train, x_test)
************************************ 1 ************************************
[LightGBM] [Warning] num_threads is set with nthread=28, will be overridden by n_jobs=24. Current value: num_threads=24
[LightGBM] [Warning] Unknown parameter: silent
Training until validation scores don't improve for 200 rounds
[200]    training's auc: 0.748799    valid_1's auc: 0.730081
[400]    training's auc: 0.764154    valid_1's auc: 0.730891
[600]    training's auc: 0.777375    valid_1's auc: 0.730927
Early stopping, best iteration is:
[439]    training's auc: 0.766861    valid_1's auc: 0.731
[0.7310002011064074]
************************************ 2 ************************************
[LightGBM] [Warning] num_threads is set with nthread=28, will be overridden by n_jobs=24. Current value: num_threads=24
[LightGBM] [Warning] Unknown parameter: silent
Training until validation scores don't improve for 200 rounds
[200]    training's auc: 0.748535    valid_1's auc: 0.731631
[400]    training's auc: 0.764256    valid_1's auc: 0.732332
Early stopping, best iteration is:
[345]    training's auc: 0.76031    valid_1's auc: 0.732483
[0.7310002011064074, 0.7324829219213177]
************************************ 3 ************************************
[LightGBM] [Warning] num_threads is set with nthread=28, will be overridden by n_jobs=24. Current value: num_threads=24
[LightGBM] [Warning] Unknown parameter: silent
Training until validation scores don't improve for 200 rounds
[200]    training's auc: 0.747855    valid_1's auc: 0.73267
[400]    training's auc: 0.763207    valid_1's auc: 0.733776
[600]    training's auc: 0.776409    valid_1's auc: 0.734096
[800]    training's auc: 0.788911    valid_1's auc: 0.733663
Early stopping, best iteration is:
[628]    training's auc: 0.778126    valid_1's auc: 0.734146
[0.7310002011064074, 0.7324829219213177, 0.7341455481432986]
************************************ 4 ************************************
[LightGBM] [Warning] num_threads is set with nthread=28, will be overridden by n_jobs=24. Current value: num_threads=24
[LightGBM] [Warning] Unknown parameter: silent
Training until validation scores don't improve for 200 rounds
[200]    training's auc: 0.749233    valid_1's auc: 0.727753
[400]    training's auc: 0.764767    valid_1's auc: 0.728777
[600]    training's auc: 0.777702    valid_1's auc: 0.728611
Early stopping, best iteration is:
[420]    training's auc: 0.766087    valid_1's auc: 0.728853
[0.7310002011064074, 0.7324829219213177, 0.7341455481432986, 0.7288532795103251]
************************************ 5 ************************************
[LightGBM] [Warning] num_threads is set with nthread=28, will be overridden by n_jobs=24. Current value: num_threads=24
[LightGBM] [Warning] Unknown parameter: silent
Training until validation scores don't improve for 200 rounds
[200]    training's auc: 0.748281    valid_1's auc: 0.733124
[400]    training's auc: 0.763463    valid_1's auc: 0.733781
[600]    training's auc: 0.776921    valid_1's auc: 0.733684
Early stopping, best iteration is:
[536]    training's auc: 0.772805    valid_1's auc: 0.733891
[0.7310002011064074, 0.7324829219213177, 0.7341455481432986, 0.7288532795103251, 0.7338908945947943]
lgb_scotrainre_list: [0.7310002011064074, 0.7324829219213177, 0.7341455481432986, 0.7288532795103251, 0.7338908945947943]
lgb_score_mean: 0.7320745690552286
lgb_score_std: 0.0019639611876869616
1
2
testA_result = pd.read_csv('./testA.csv')
# testA.csv carries no 'isDefault' label (an unguarded lookup raises
# KeyError: 'isDefault'), so the test predictions cannot be scored directly.
# Fall back to the out-of-fold predictions, which are aligned with y_train.
if 'isDefault' in testA_result.columns:
    eval_true, eval_pred = testA_result['isDefault'].values, lgb_test
else:
    eval_true, eval_pred = y_train.values, lgb_train
roc_auc_score(eval_true, eval_pred)
---------------------------------------------------------------------------

KeyError                                  Traceback (most recent call last)

~\anaconda3\envs\tf13\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
   2888             try:
-> 2889                 return self._engine.get_loc(casted_key)
   2890             except KeyError as err:


pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()


pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()


pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()


pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()


KeyError: 'isDefault'


The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)

<ipython-input-69-262e8275a2db> in <module>
      1 testA_result = pd.read_csv('./testA.csv')
----> 2 roc_auc_score(testA_result['isDefault'].values, lgb_test)
      3 


~\anaconda3\envs\tf13\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
   2897             if self.columns.nlevels > 1:
   2898                 return self._getitem_multilevel(key)
-> 2899             indexer = self.columns.get_loc(key)
   2900             if is_integer(indexer):
   2901                 indexer = [indexer]


~\anaconda3\envs\tf13\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
   2889                 return self._engine.get_loc(casted_key)
   2890             except KeyError as err:
-> 2891                 raise KeyError(key) from err
   2892 
   2893         if tolerance is not None:


KeyError: 'isDefault'
-------------本文结束感谢您的阅读 :D -------------
Show comments from Gitment