Quantitative Finance Credit and Risk Control Analysis, Part 2

2019/06/18

A simple hands-on exercise:

Data: Lending Club loan data from 2015

Goal: train a model that separates good borrowers from bad ones based on user profile and behavioral data

1. Preprocessing

# coding: utf-8
# # Data Preprocessing
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import pylab

# ## Load the data
# - data_all is a subset of Lending Club's 2015 loan data
raw_df = pd.read_csv('../data/data_all.csv')
print('Data shape:', raw_df.shape)

# ## Drop the following features
# - non-numeric features
# - features that are meaningless for identifying fraud: id, member_id
remain_features = [feat for feat in raw_df.select_dtypes(include=['float64', 'int64']).columns
                   if feat not in ['id', 'loan_status', 'member_id', 'issue_d']]
feature_df = raw_df[remain_features]
df = raw_df[remain_features + ['loan_status', 'issue_d']].copy()

# ## Convert the issue month to a number
def map_month(x):
    """Map the month strings to integers; missing values map to 0.
    """
    if x != x:  # NaN is the only value that is not equal to itself
        return 0
    month_map = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
                 'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12}
    for name, number in month_map.items():
        if name in x:
            return number

df.issue_d = df.issue_d.map(map_month)  # Series.map; the built-in map() returns an iterator in Python 3
month_max = max(df.issue_d)
print('The data covers ' + str(month_max) + ' months')

# ## Compute the missing-value ratio of each feature
# - This excludes the label columns used later for KS, IV, and PSI: loan_status, issue_d
def get_nan_cnt(feature_df):
    """feature_df is a DataFrame.
       Return the missing-value count of every feature.
    """
    return feature_df.isnull().sum(axis=0)

nan_cnt = get_nan_cnt(feature_df)
total = raw_df.shape[0]
nan_cnt = nan_cnt / total  # turn counts into ratios
nan_df = pd.DataFrame(nan_cnt, columns=['nan_ratio'])
nan_df.index.name = 'feature'
print('Features with the highest missing ratios:')
print(nan_df.sort_values(by='nan_ratio', ascending=False).head(20))

# ## Save the cleaned data and the missing ratios
df.to_csv('../data/data_clean.csv',index=False)
nan_df.to_csv('../output/Completeness.csv')

2. Compute the KS and IV of each feature and compare their effectiveness
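
For reference, these are the standard formulas the code below implements (a textbook formulation; the original notes rely on the code alone). After sorting accounts by score and splitting them into bins,

$$KS = \max_k \left| \sum_{i \le k} \frac{good_i}{good_{total}} - \sum_{i \le k} \frac{bad_i}{bad_{total}} \right|, \qquad IV = \sum_i \left( \frac{good_i}{good_{total}} - \frac{bad_i}{bad_{total}} \right) \ln \frac{good_i/good_{total}}{bad_i/bad_{total}}$$

where $good_i$ and $bad_i$ count the good and bad accounts in bin $i$. Larger values mean better separation; a common scorecard rule of thumb (not from the original notes) treats IV below 0.02 as unpredictive, 0.02–0.1 as weak, 0.1–0.3 as medium, and above 0.3 as strong.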

# coding: utf-8
"""Compute the KS and IV values of different features and compare them.
"""
import math
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import pylab

# 1. Compute KS and IV for a single variable
# 1.1 Load the data
# 
# KS_IV_example.csv is the data from the in-class KS and IV example; each row is one account
# 
# The data has 2 fields: category, score
# category = 0/1 marks a good/bad account
# score is the account's credit score

print('=====================================')
print('Compute KS and IV for a single variable')
print('=====================================')

df = pd.read_csv('../data/KS_IV_example.csv')
print("Data shape:", df.shape)

value_counts = df['category'].value_counts()
good_total = value_counts[0]
bad_total = value_counts[1]
total = good_total + bad_total
print('Good accounts:', good_total)
print('Bad accounts:', bad_total)

# 1.2 Bin the accounts by credit score
# 
# - Requirements (an example follows below)
#     - every bin holds at least 10% of the accounts (the last bin may be an exception)
#     - accounts with the same score must fall into the same bin
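# For example (an illustrative walk-through, not from the original notes): with 1,000 accounts,
# bin_size = ceil(1000 * 0.1) = 100; if the 100th and 101st sorted accounts share the same score,
# the first bin stretches to include the 101st, and the next bin starts from the 102nd.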

# Re-sort df by score
df_temp = df.sort_values(by='score')
df_sorted = df_temp.reset_index(level=0, drop=True)
bin_size = int(math.ceil(total * 0.1))
bins = []  # index of the last account in each bin
bin_size_list = []  # number of accounts in each bin
num_bins = 0
i = 0
start_index = 0
while i < total:
    end_index = start_index + bin_size - 1
    if end_index >= total - 1:
        # last bin: close it directly
        end_index = total - 1
    else:
        # otherwise, if the bin's last account has the same score as the next account,
        # pull the next account into the current bin (and repeat)
        while end_index + 1 <= total - 1 and df_sorted.loc[end_index, 'score'] == df_sorted.loc[end_index + 1, 'score']:
            end_index = end_index + 1
    bins.append(end_index)
    bin_size_list.append(end_index - start_index + 1)  # +1: the bin spans start_index..end_index inclusive
    num_bins = num_bins + 1
    start_index = end_index + 1
    i = end_index + 1

# 1.3 Compute the KS and IV values

cum_good_ratio = 0
cum_bad_ratio = 0
cum_good_ratio_list = [0]
cum_bad_ratio_list = [0]

IV = 0
KS = 0
start_index = 0

i = 0
while i < num_bins:
    s1 = df_sorted[start_index:(bins[i] + 1)]
    s2 = s1[s1['category'] == 0]
    s3 = s1[s1['category'] == 1]
    good_in_bin = s2.index.size
    bad_in_bin = s3.index.size
    good_ratio_in_bin = good_in_bin / good_total
    bad_ratio_in_bin = bad_in_bin / bad_total
    cum_good_ratio = cum_good_ratio + good_ratio_in_bin
    cum_bad_ratio = cum_bad_ratio + bad_ratio_in_bin
    cum_good_ratio_list.append(cum_good_ratio)
    cum_bad_ratio_list.append(cum_bad_ratio)
    margin = abs(cum_good_ratio - cum_bad_ratio)
    if margin > KS:
        KS = margin
    # note: this assumes every bin contains at least one good and one bad account,
    # otherwise the log term is undefined
    iv = (good_ratio_in_bin - bad_ratio_in_bin) * math.log(good_ratio_in_bin / bad_ratio_in_bin)
    IV = IV + iv
    start_index = bins[i] + 1
    i = i + 1

print('KS: ', round(KS * 100, 1), '%')
print('IV: ', IV)

bin_ratio = [0]+[i*1.0/total for i in bin_size_list]
pylab.figure()
pylab.plot(range(len(cum_good_ratio_list)), cum_good_ratio_list, '-o',label='good')
pylab.plot(range(len(cum_bad_ratio_list)), cum_bad_ratio_list, '-o',label='bad')
pylab.legend(loc='upper left')
pylab.bar(range(len(bin_ratio)),bin_ratio)
pylab.ylabel("cum ratio")
pylab.xlabel("bin")
pylab.title('KS = '+str(round(KS * 100, 1))+"%")
pylab.savefig('../output/KS_example.png')

# 2. Compute the KS and IV of every variable in the dataset

def get_KS_IV(category, score):
    """category and score are both list-like.
       Return the KS and IV values.
    """
    cur_df = pd.DataFrame(list(zip(category, score)), columns=['category', 'feature_score'])
    cur_df = cur_df.sort_values(by='feature_score')
    cur_df = cur_df.reset_index(level=0, drop=True)
    value_counts = cur_df['category'].value_counts()
    good_total = value_counts[0]
    bad_total = value_counts[1]
    total = good_total + bad_total
    bin_size = int(math.ceil(total * 0.1))
    bins = []  # index of the last account in each bin
    num_bins = 0
    i = 0
    start_index = 0
    while i < total:
        end_index = start_index + bin_size - 1
        if end_index >= total - 1:
            # last bin: close it directly
            end_index = total - 1
        else:
            # otherwise keep pulling the next account into the current bin while it
            # shares the bin's last feature_score value
            while end_index + 1 <= total - 1 and cur_df.loc[end_index, 'feature_score'] == cur_df.loc[end_index + 1, 'feature_score']:
                end_index = end_index + 1
        bins.append(end_index)
        num_bins = num_bins + 1
        start_index = end_index + 1
        i = end_index + 1
    cum_good_ratio = 0
    cum_bad_ratio = 0
    start_index = 0
    IV = 0
    KS = 0
    i = 0
    while i < num_bins:
        s1 = cur_df[start_index:(bins[i] + 1)]
        s2 = s1[s1['category'] == 0]
        s3 = s1[s1['category'] == 1]
        good_in_bin = s2.index.size
        bad_in_bin = s3.index.size
        # +0.01 smoothing keeps the log term finite when a bin has no good or no bad accounts
        good_ratio_in_bin = good_in_bin / good_total + 0.01
        bad_ratio_in_bin = bad_in_bin / bad_total + 0.01
        cum_good_ratio = cum_good_ratio + good_ratio_in_bin
        cum_bad_ratio = cum_bad_ratio + bad_ratio_in_bin
        margin = abs(cum_good_ratio - cum_bad_ratio)
        if margin > KS:
            KS = margin
        iv = (good_ratio_in_bin - bad_ratio_in_bin) * math.log(good_ratio_in_bin / bad_ratio_in_bin)
        IV = IV + iv
        start_index = bins[i] + 1
        i = i + 1
    return KS, IV


def get_KS_IV_features(category, feature_df):
    """category is the list indicating whether each account is good.
       feature_df is a DataFrame.
       Return the list of [KS, IV] pairs, one per feature.
    """
    KS_IV = []
    for feature in feature_df.columns:
        cur_KS, cur_IV = get_KS_IV(category, feature_df[feature])
        KS_IV.append([cur_KS, cur_IV])
        print('Done:', feature)
    return KS_IV


# 2.1 Load the data
# 
# - data_clean.csv is the output of 1_preprocess
#     - each row is one loan account
#     - loan_status = 0/1 marks a good/bad account
# - LCDataDictionary.csv holds the meaning of each variable in the Lending Club data
# - Completeness.csv holds the missing ratio of each variable

print('\n=====================================')
print('Compute KS and IV for the whole dataset')
print('=====================================')

print('Loading data')
df = pd.read_csv('../data/data_clean.csv')
dict_df = pd.read_csv('../data/LCDataDictionary_clean.csv')
dict_df = dict_df.set_index('feature')
comp_df = pd.read_csv('../output/Completeness.csv')
comp_df = comp_df.set_index('feature')

print('Computing KS and IV')
features = [i for i in df.columns if i not in ['loan_status', 'issue_d']]
KS_IV = get_KS_IV_features(df.loan_status, df[features])
KS_IV_df = pd.DataFrame(KS_IV, columns=['KS', 'IV'], index=features)
KS_IV_df.index.name = 'feature'

show_features = ['delinq_2yrs', 'fico_range_low', 'fico_range_high', 'inq_last_6mths', 'mths_since_last_record']
show_KS_IV_df = KS_IV_df.loc[show_features]
description_list = []
for feature in show_features:
    description_list.append(dict_df.loc[feature, 'Description'])
show_KS_IV_df['feature description'] = description_list
show_KS_IV_df = pd.concat([comp_df.loc[show_features], show_KS_IV_df], axis=1)
print(show_KS_IV_df)

plt.figure()
show_KS_IV_df['KS'].plot.barh()
plt.title('KS of Different Features')
plt.tight_layout()
plt.savefig('../output/KS.png')
plt.figure()
show_KS_IV_df['IV'].plot.barh()
plt.title('IV of Different Features')
plt.tight_layout()
plt.savefig('../output/IV.png')
plt.figure()
show_KS_IV_df[['KS','IV']].plot.barh()
plt.title('Effectiveness of Different Features')
plt.tight_layout()
plt.savefig('../output/Effectiveness.png')
KS_IV_df.to_csv('../output/Effectiveness.csv')

print('\nSee the KS and IV figures and Effectiveness.csv in the output directory')

3. Compute the PSI of each feature and compare their stability
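
For reference, the standard PSI formula the code below implements (again a textbook formulation, not spelled out in the original notes): bin the feature, let $E_i$ be bin $i$'s share of the baseline ("expected") month and $A_i$ its share of the comparison ("actual") month, then

$$PSI = \sum_i (A_i - E_i) \ln \frac{A_i}{E_i}$$

A PSI near 0 means the feature's distribution barely moved between the two months; here each later month is compared against January, and then against the previous month.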


# coding: utf-8

"""Compute the PSI values of different features and compare their stability.
"""

import math
import numpy as np

import pandas as pd
from matplotlib import pylab
import matplotlib.pyplot as plt

# 1. Load the data
# data_clean.csv is the output of 1_preprocess
# each row is one loan account
# issue_d is the month the loan was issued
# LCDataDictionary.csv holds the meaning of each variable in the Lending Club data
# Completeness.csv holds the missing ratio of each variable
# Effectiveness.csv holds the KS and IV of each variable

df = pd.read_csv('../data/data_clean.csv')
# df = pd.read_csv('../data/PSI_6m.csv')
month_max = max(df.issue_d)

print("Data shape:", df.shape)
print('The data covers ' + str(month_max) + ' months')

dict_df = pd.read_csv('../data/LCDataDictionary_clean.csv')
dict_df = dict_df.set_index('feature')
comp_df = pd.read_csv('../output/Completeness.csv')
comp_df = comp_df.set_index('feature')
effe_df = pd.read_csv('../output/Effectiveness.csv')
effe_df = effe_df.set_index('feature')

# 2. Compute the PSI of tot_cur_bal between January and June

print('=====================================')
print('Compute the PSI of tot_cur_bal between January and June')
print('=====================================')

df_temp = df[['issue_d', 'tot_cur_bal']]
df_temp = df_temp[df_temp['issue_d'].isin([1, 6])]
before_total = df_temp[df_temp['issue_d'] == 1].shape[0]
after_total = df_temp[df_temp['issue_d'] == 6].shape[0]
total = before_total + after_total
print('Rows from January:', before_total)
print('Rows from June:', after_total)

# If there are missing values, the rows without tot_cur_bal form a bin of their own and contribute a PSI term
df_null_index = np.isnan(df_temp['tot_cur_bal'])
df_null = df_temp.loc[df_null_index]
if len(df_null) > 0:
    before_in_bin = len(df_null[df_null['issue_d'] == 1])  # the baseline month is January here
    after_in_bin = len(df_null[df_null['issue_d'] == 6])   # and the comparison month is June
    # +0.01 smoothing keeps the log term finite when a bin is empty on one side
    before_ratio_in_bin = before_in_bin / before_total + 0.01
    after_ratio_in_bin = after_in_bin / after_total + 0.01
    PSI = (before_ratio_in_bin - after_ratio_in_bin) * math.log(before_ratio_in_bin / after_ratio_in_bin)
    total_no_nan = total - before_in_bin - after_in_bin
else:
    PSI = 0
    total_no_nan = total

#   Drop the missing values, then sort and bin the data by tot_cur_bal
# 
#   Requirements
#     - every bin holds at least 10% of the accounts (the last bin may be an exception)
#     - accounts with the same tot_cur_bal must fall into the same bin

df_temp = df_temp.dropna(how='any')  # the result must be assigned back; dropna is not in-place
df_temp = df_temp.sort_values(by='tot_cur_bal')
df_temp = df_temp.reset_index(level=0, drop=True)

bin_size = int(math.ceil(total_no_nan * 0.1))  # bin on the rows that remain after dropping NaNs

bins = []  # index of the last account in each bin

num_bins = 0

i = 0
start_index = 0
while i < total_no_nan:
    end_index = start_index + bin_size - 1
    if end_index >= total_no_nan - 1:
        # last bin: close it directly
        end_index = total_no_nan - 1
    else:
        # otherwise keep pulling the next account into the current bin while it
        # shares the bin's last tot_cur_bal value
        while end_index + 1 <= total_no_nan - 1 and df_temp.loc[end_index, 'tot_cur_bal'] == df_temp.loc[end_index + 1, 'tot_cur_bal']:
            end_index = end_index + 1

    bins.append(end_index)
    num_bins = num_bins + 1

    start_index = end_index + 1
    i = end_index + 1

# Compute the PSI

start_index = 0
i = 0
while i < num_bins:
    s1 = df_temp[start_index:(bins[i] + 1)]
    s2 = s1[s1['issue_d'] == 1]
    s3 = s1[s1['issue_d'] == 6]
    before_in_bin = s2.index.size
    after_in_bin = s3.index.size
    before_ratio_in_bin = before_in_bin / before_total + 0.01
    after_ratio_in_bin = after_in_bin / after_total + 0.01
    psi = (before_ratio_in_bin - after_ratio_in_bin) * math.log(before_ratio_in_bin / after_ratio_in_bin)
    PSI = PSI + psi
    start_index = bins[i] + 1
    i = i + 1

print('PSI: ', PSI)


# 3. Compute the PSI of every feature, comparing each later month with January: Feb vs Jan, Mar vs Jan, and so on


def get_PSI(time, feature):
    """time and feature are both list-like.
       Return the PSI value.
    """
    df_temp = pd.DataFrame(list(zip(time, feature)), columns=['issue_d', 'feature_score'])
    issue_d_values = sorted(list(set(time)))
    before_time = issue_d_values[0]
    after_time = issue_d_values[1]
    before_total = df_temp[df_temp['issue_d'] == before_time].shape[0]
    after_total = df_temp[df_temp['issue_d'] == after_time].shape[0]
    total = before_total + after_total

    # rows with a missing feature value form a bin of their own
    df_null_index = np.isnan(df_temp['feature_score'])
    df_null = df_temp.loc[df_null_index]
    if len(df_null) > 0:
        before_in_bin = len(df_null[df_null['issue_d'] == before_time])
        after_in_bin = len(df_null[df_null['issue_d'] == after_time])
        # +0.01 smoothing keeps the log term finite when a bin is empty on one side
        before_ratio_in_bin = before_in_bin / before_total + 0.01
        after_ratio_in_bin = after_in_bin / after_total + 0.01
        PSI = (before_ratio_in_bin - after_ratio_in_bin) * math.log(before_ratio_in_bin / after_ratio_in_bin)
        total_no_nan = total - before_in_bin - after_in_bin
    else:
        PSI = 0
        total_no_nan = total

    df_temp = df_temp.dropna(how='any')
    df_temp = df_temp.sort_values(by='feature_score')
    df_temp = df_temp.reset_index(level=0, drop=True)

    bin_size = int(math.ceil(total_no_nan * 0.1))

    bins = []  # index of the last account in each bin
    num_bins = 0

    i = 0
    start_index = 0
    while i < total_no_nan:
        end_index = start_index + bin_size - 1
        if end_index >= total_no_nan - 1:
            # last bin: close it directly
            end_index = total_no_nan - 1
        else:
            # otherwise keep pulling the next account into the current bin while it
            # shares the bin's last feature_score value
            while end_index + 1 <= total_no_nan - 1 and df_temp.loc[end_index, 'feature_score'] == df_temp.loc[end_index + 1, 'feature_score']:
                end_index = end_index + 1

        bins.append(end_index)
        num_bins = num_bins + 1

        start_index = end_index + 1
        i = end_index + 1
    start_index = 0
    # note: do not reset PSI here; it already holds the missing-value bin's contribution from above
    i = 0

    while i < num_bins:
        s1 = df_temp[start_index:(bins[i] + 1)]
        s2 = s1[s1['issue_d'] == before_time]
        s3 = s1[s1['issue_d'] == after_time]

        before_in_bin = s2.index.size
        after_in_bin = s3.index.size

        before_ratio_in_bin = before_in_bin / before_total + 0.01
        after_ratio_in_bin = after_in_bin / after_total + 0.01

        psi = (before_ratio_in_bin - after_ratio_in_bin) * math.log(before_ratio_in_bin / after_ratio_in_bin)
        PSI = PSI + psi

        start_index = bins[i] + 1
        i = i + 1

    return PSI


def get_PSI_features(time, feature_df):
    """time is a list and feature_df is a data frame.
       return the PSI values of every feature in the feature_df.
    """    
    PSI = []
    for feature in feature_df.columns:
        cur_PSI = get_PSI(time,feature_df[feature])
        PSI.append(cur_PSI)
    return PSI


# Select features and compute each month's PSI against January
print('\n=====================================')
print('Compute the PSI of several features across months')
print('=====================================')

PSI_list = []
before_time = 1
feature_cols = [i for i in df.columns if i not in ['issue_d', 'loan_status']]


print('PSI of each month compared with January:')
for after_time in range(2, month_max + 1):
    cur_df = df[df['issue_d'].isin([before_time, after_time])]
    PSI_list.append(get_PSI_features(cur_df['issue_d'], cur_df[feature_cols]))
    print('Finished comparing all features between January and month ' + str(after_time))
index_names = ['PSI_1_' + str(i) for i in range(2, month_max + 1)]
PSI_df = pd.DataFrame(PSI_list, columns=feature_cols, index=index_names).T
PSI_df.index.name = 'feature'

show_features = ['delinq_2yrs','fico_range_low','fico_range_high','inq_last_6mths','mths_since_last_record']
pylab.figure(figsize=[8,5])
for feature in show_features:
    pylab.plot(range(2,month_max+1), PSI_df.loc[feature], '-o',label=feature)
pylab.legend(loc='upper left')
pylab.ylabel("PSI")
pylab.xlabel("Month")
pylab.ylim([0,0.01])
pylab.title('Stability of Different Features: Compared with Jan')
pylab.savefig('../output/Stability_compare_with_Jan.png')

print('\nPSI of each month compared with the previous month:')
PSI_list2 = []
for after_time in range(2, month_max + 1):
    before_time = after_time - 1
    cur_df = df[df['issue_d'].isin([before_time, after_time])]
    PSI_list2.append(get_PSI_features(cur_df['issue_d'], cur_df[feature_cols]))
    print('Finished comparing all features between month ' + str(before_time) + ' and month ' + str(after_time))
index_names = ['PSI_' + str(i - 1) + '_' + str(i) for i in range(2, month_max + 1)]
PSI_df2 = pd.DataFrame(PSI_list2, columns=feature_cols, index=index_names).T
pylab.figure(figsize=[8, 5])
for feature in show_features:
    pylab.plot(range(2, month_max + 1), PSI_df2.loc[feature], '-o', label=feature)
pylab.legend(loc='upper left')
pylab.ylabel("PSI")
pylab.xlabel("Month")
pylab.ylim([0,0.01])
pylab.title('Stability of Different Features: Compared with Last Month')
pylab.savefig('../output/Stability_compare_with_last_month.png')

stab_df = pd.concat([PSI_df, PSI_df2], axis=1)
stab_df.index.name = 'feature'  # was stab_df.index_name, which silently set an unused attribute
stab_df.to_csv('../output/Stability.csv')
# reindex tolerates feature names missing from the dictionary, like the deprecated .ix did
pd.concat([dict_df.reindex(feature_cols), comp_df, effe_df, stab_df], axis=1).\
          to_csv('../output/Comp_Effe_Stab.csv')

print('\nSee the stability figures and Comp_Effe_Stab.csv in the output directory')

4. Build a stacking prediction model


# coding: utf-8

# # Build a Stacking Prediction Model
# - Goals:
#    - Select the model's training features based on each feature's completeness (nan_ratio), effectiveness (KS/IV), and stability (PSI).
#    - Build a risk model that, for every account in the test set data_test.csv, predicts the probability that it is (or becomes) a bad account (a number in [0, 1]).

import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold  # sklearn.cross_validation was removed in sklearn 0.20


#  # 1. Load the data
# - data_all.csv is the Lending Club loan data from January to December 2015, used for training
#     - each row is one loan account
#     - loan_status = 0/1 marks a good/bad account
#     - apart from loan_status, all 85 fields are features of the loan account
# - data_test.csv is the Lending Club data from the first quarter of 2016, used for testing; same layout as above
# - Comp_Effe_Stab.csv stores the nan_ratio, KS/IV, and monthly PSI values computed in the previous two steps

raw_df_train = pd.read_csv('../data/data_all.csv') 
df_test = pd.read_csv('../data/data_test.csv')
df_choose = pd.read_csv('../output/Comp_Effe_Stab.csv')

print('Number of samples: train %d, test %d' % (raw_df_train.shape[0], df_test.shape[0]))
print('Number of features: %d' % (raw_df_train.shape[1]))
df_choose.head()


# # 2. Feature selection
# - nan_ratio is the fraction of missing values; KS/IV measure effectiveness, i.e. the ability to separate good accounts from bad ones; PSI measures a feature's stability
# - Feature selection proceeds in the following steps:
#     - drop non-numeric features and features that are meaningless for identifying fraud (such as id and member_id), producing data_clean.csv (see 1_preprocess.ipynb)
#     - keep the features in data_clean.csv whose PSI between any two months is below 0.1
#     - drop the features whose computed nan_ratio exceeds a threshold, since a large nan_ratio means too much of the feature is missing
#     - prefer features with larger KS/IV values; in this dataset the computed KS/IV values are all close, so this criterion carries little weight here
#     
#   Note: the reason for requiring PSI < 0.1
#   ![](../others/PSI的含义.bmp)
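# (In case the image does not render: a common rule of thumb, not from the original notes, reads
#  PSI < 0.1 as a stable population, 0.1–0.25 as a moderate shift worth monitoring, and > 0.25 as a
#  significant shift that calls for reviewing the feature or rebuilding the model.)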

"""选择data_clean.csv中任何两个月份的PSI值小于0.1的特征,得到集合PSI_features
"""
m_df_choose = df_choose.shape[0]  
n_df_choose = df_choose.shape[1]
PSI_features = []
for i in range(m_df_choose):
    for j in range(5,n_df_choose):  #"5"表示PSI_1_2所在的列
        if (df_choose.loc[i][j] < 0.1):
            if j == n_df_choose - 1:
                PSI_features.append(df_choose.feature[i])
"""将data_clean.csv中计算出的nan_ratio大于一定阈值的特征删去,得到集合nan_feature
"""
df_del = df_choose[df_choose.nan_ratio < 0.05]  #将nan_ratio > 0.05的缺失严重的特征删掉
nan_feature =  list(df_del.feature)
"""求PSI_features和nan_feature两个集合的交集"""
features = list(set(PSI_features).intersection(set(nan_feature)))


df_train = raw_df_train[features + ["loan_status"]]

# Simple strategy to fill the nan terms
df_train = df_train.fillna(value=0, inplace=False)

print('Number of samples:', df_train.shape[0], df_test.shape[0])
sample_size_dict = dict(df_train['loan_status'].value_counts())
print("Negative sample size: %d\nPositive sample size: %d\nImbalance ratio: %.3f"
      % (sample_size_dict[0], sample_size_dict[1], float(sample_size_dict[1]) / sample_size_dict[0]))
df_train.head(3)


# # 3. Preprocessing: standardization

# - Standardize the training data

sscaler = StandardScaler()  # StandardScaler from sklearn.preprocessing
sscaler.fit(df_train[features])  # fit on the training data to get each feature's mean and variance
train_matrix = sscaler.transform(df_train[features])  # transform the training data into standardized vectors

train_labels = np.array(df_train['loan_status'])
print(train_matrix.shape, train_labels.shape)


# - Standardize the test data with the same means and variances

df_test = df_test.fillna(value=0, inplace=False)  # simply fill the nan values with zero
test_matrix = sscaler.transform(df_test[features])  # standardize the test data
test_labels = np.array(df_test['loan_status'])
print(test_matrix.shape, test_labels.shape)
df_test[features].head(3)


# # 4. The stacking model
# - Train a model to combine several other models: first train multiple different models, then train a second-level model that takes the first-level models' outputs as inputs and produces the final output.
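# (To make the data flow below concrete: each first-level model produces one out-of-fold
#  probability column of shape (ntrain, 1) for the training set and (ntest, 1) for the test
#  set, so with four base models the second-level inputs x_train and x_test each have 4 columns.)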

# ## 4.1 A wrapper class
# - The SklearnHelper class wraps the built-in methods shared by all sklearn classifiers (train, predict, fit, etc.), so calling the 4 first-level classifiers stays concise ([reference](https://www.kaggle.com/arthurtok/introduction-to-ensembling-stacking-in-python))

# Some useful parameters which will come in handy later on
ntrain = df_train.shape[0]
ntest = df_test[features].shape[0]
SEED = 0 # for reproducibility
NFOLDS = 5 # set folds for out-of-fold prediction
kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)  # model_selection API; random_state requires shuffle=True

# Class to extend the Sklearn classifier
class SklearnHelper(object):
    def __init__(self, clf, seed=0, params=None):
        params = dict(params or {})  # copy, and tolerate params=None
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict_proba(self, x):
        return self.clf.predict_proba(x)[:, 1]
    
    def fit(self,x,y):
        return self.clf.fit(x,y)
    
    def feature_importances(self,x,y):
        print(self.clf.fit(x,y).feature_importances_) 


# ## 4.2 K-fold cross-validation
# - Generate out-of-fold predictions for the training set and averaged predictions for the test set, [reference](http://blog.csdn.net/yc1203968305/article/details/73526615)
#   ![](../others/Stacking.png)

def get_oof(clf, x_train, y_train, test_matrix, test_labels):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf.split(x_train)):  # model_selection.KFold yields indices via .split()
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]

        clf.train(x_tr, y_tr)

        # out-of-fold predictions for the held-out part of the training set
        oof_train[test_index] = clf.predict_proba(x_te)
        # predictions for the test set from the model trained on this fold
        oof_test_skf[i, :] = clf.predict_proba(test_matrix)

    # average the NFOLDS test-set predictions
    oof_test[:] = oof_test_skf.mean(axis=0)
    auc_score = metrics.roc_auc_score(y_true=test_labels, y_score=oof_test)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1), auc_score


# ## 4.3 创建第一级分类器
# - Random Forest classifier
# - AdaBoost classifier
# - Neural_network MLP classifier
# - GradientBoosting classifier

# Put in our parameters for said classifiers
# Random Forest parameters
rf_params = {
    'n_jobs': -1,
    'n_estimators': 100,
    'criterion': 'entropy',
    'max_depth': 11,
}

# AdaBoost parameters
ada_params = {
    'n_estimators': 200,
    'learning_rate' : 0.75
}

# Neural_network MLP classifier
mlp_params = {
    'hidden_layer_sizes': (150,),  # one hidden layer with 150 units; note (150) would just be the int 150
    'solver': 'adam',
    'activation': 'logistic',
    'alpha': 0.0001,
    'verbose': 1,
    'learning_rate_init': 0.01,
    'warm_start': True,
}

# Gradientboosting classifier
gb_params = {
    'learning_rate':0.01,
    'n_estimators': 1200,
    'max_depth': 9, 
    'min_samples_split': 60,
    'random_state': 10,
    'subsample': 0.85,
    'max_features':7,
    'warm_start': True,
}


# Create the 4 first-level models via the SklearnHelper class
rf = SklearnHelper(clf=RandomForestClassifier, seed=SEED, params=rf_params)
ada = SklearnHelper(clf=AdaBoostClassifier, seed=SEED, params=ada_params)
mlp = SklearnHelper(clf=MLPClassifier, seed=SEED, params=mlp_params)
gb = SklearnHelper(clf=GradientBoostingClassifier, seed=SEED, params=gb_params)


# ## 4.4 First-level predictions
# - XX_oof_train and XX_oof_test are each first-level classifier's predicted labels for the training and test sets

# Create our OOF train and test predictions. These base results will be used as new features
rf_oof_train, rf_oof_test, rf_auc_score = get_oof(rf, train_matrix,train_labels, test_matrix,test_labels) # Random Forest
print("Random Forest is complete")
ada_oof_train, ada_oof_test, ada_auc_score = get_oof(ada,train_matrix,train_labels, test_matrix,test_labels) # AdaBoost 
print("AdaBoost  is complete")
mlp_oof_train, mlp_oof_test, mlp_auc_score = get_oof(mlp,train_matrix,train_labels, test_matrix,test_labels) # Neural_network MLP classifier
print("Neural_network MLP  is complete")
gb_oof_train, gb_oof_test, gb_auc_score = get_oof(gb,train_matrix,train_labels, test_matrix,test_labels) # GradientBoosting classifier
print("GradientBoosting is complete")


print "Random Forest模型的AUC值:%.5f \nAdaBoost模型的AUC值:%.5f \nNeural_network MLP模型的AUC值:%.5f \nGradientBoosting模型的AUC值:%.5f"     % (rf_auc_score, ada_auc_score,mlp_auc_score,gb_auc_score)


# ## 4.5 Concatenate the first-level classifiers' predictions

x_train = np.concatenate(( rf_oof_train, ada_oof_train, mlp_oof_train,gb_oof_train), axis=1)
x_test = np.concatenate(( rf_oof_test, ada_oof_test, mlp_oof_test,gb_oof_test), axis=1)


# ## 4.6 Correlation between the first-level classifiers
# corr() computes correlation coefficients (Pearson by default); 0 < |r| < 1 marks increasing degrees of linear correlation:
# - 0.8-1.0 very strong
# - 0.6-0.8 strong
# - 0.4-0.6 moderate
# - 0.2-0.4 weak
# - 0.0-0.2 very weak or none

df = pd.DataFrame(x_train, columns= ['rf','ada','mlp','gb'])
df_test = pd.DataFrame(x_test, columns= ['rf','ada','mlp','gb'])
df.corr()


# ## 4.7 Train the second-level model
# - The second-level model is ridge regression

combinerModel = Ridge(alpha=5000, fit_intercept=False)
combinerModel.fit(x_train, train_labels)
print("Influence of each first-level model on the final prediction:\n"
      "Random Forest: %.5f\nAdaBoost: %.5f\nNeural network MLP: %.5f\nGradientBoosting: %.5f"
      % (combinerModel.coef_[0], combinerModel.coef_[1], combinerModel.coef_[2], combinerModel.coef_[3]))
test_predictions = combinerModel.predict(x_test)
Stacking_auc_score = metrics.roc_auc_score(y_true=test_labels, y_score=test_predictions)
print("Final AUC of the stacking model: %.5f" % Stacking_auc_score)
