以German信用数据为例的logistic regression算法在评分卡上的实践

发布时间：2020-07-08 10:45:15 作者：espanol
来源：网络阅读：7845

以德国信用数据为例，用logistic regression算法做信用评分卡的原理性实现；由于只演示原理，并未考虑feature selection。

第一步：导入必要的库

import numpy as np
import pandas as pd

try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18 only provides the old module
    from sklearn.cross_validation import train_test_split

第二步：导入数据

# Load the space-separated German credit file; it has no header row, so the
# column names are assigned by hand below.
german = pd.read_csv('D:/CreditDatasets/german.data', sep=' ', header=None)
german.columns = [
    'Status_of_existing_checking_account',
    'Duration_in_month',
    'Credit_history',
    'Purpose',
    'Credit_amount',
    'Savings_account',
    'Present_employment_since',
    'Installment_rate',
    'Personal_status_and_sex',
    'Other_debtors',
    'Present_residence_since',
    'Property',
    'Age',
    'Other_installment_plans',
    'Housing',
    'Number_of_existing_credits',
    'Job',
    'Number_of_people',
    'Telephone',
    'foreign_worker',
    'default',
]
# Row counts per label of 'default': label 1 is counted as good, label 2 as bad.
Grp = german.groupby('default')
total_good, total_bad = (Grp.size()[label] for label in (1, 2))

第三步：分别计算名义变量和数值变量的woe值，对取值较少的数值变量也用名义变量woe计算方法实现，其余数值变量均5等分

def CalcWOE(VarName):
    """Compute weight of evidence (WOE) and information value (IV) per distinct value.

    Reads the module-level ``german`` DataFrame and the class totals
    ``total_good`` / ``total_bad``. For every distinct value of the column,
    the share of good rows (``default == 1``) and bad rows (``default == 2``)
    is taken relative to the class totals, and
    ``WOE = ln(bad_ratio / good_ratio)``, ``IV = (bad_ratio - good_ratio) * WOE``.

    Args:
        VarName: Name of the column of ``german`` to analyse.

    Returns:
        DataFrame with one row per distinct value (in ``np.unique`` order) and
        the columns ``variable``, ``class``, ``woe`` and ``iv``.

    Raises:
        KeyError: If some value of the column has no row with label 1 or no
            row with label 2, so that the per-class count lookup fails.
    """
    rows = []
    for v in np.unique(german[VarName]):
        counts = german[german[VarName] == v].groupby('default').size()
        good_ratio = float(counts[1]) / total_good
        bad_ratio = float(counts[2]) / total_bad
        WOE = np.log(bad_ratio / good_ratio)
        IV = (bad_ratio - good_ratio) * WOE
        rows.append([VarName, v, WOE, IV])
    # Build the table once: DataFrame.append was removed in pandas 2.0 and
    # re-copied the whole table on every iteration.
    return pd.DataFrame(rows, columns=['variable', 'class', 'woe', 'iv'])

# WOE tables for the nominal (categorical) attributes, one row per category.
status_checking_account_woe = CalcWOE('Status_of_existing_checking_account')
Credit_history_woe = CalcWOE('Credit_history')
Purpose_woe = CalcWOE('Purpose')
Savings_account_woe = CalcWOE('Savings_account')
Present_employment_since_woe = CalcWOE('Present_employment_since')
Personal_status_and_sex_woe = CalcWOE('Personal_status_and_sex')
Other_debtors_woe = CalcWOE('Other_debtors')
Property_woe = CalcWOE('Property')
Other_installment_plans_woe = CalcWOE('Other_installment_plans')
Housing_woe = CalcWOE('Housing')
Job_woe = CalcWOE('Job')
Telephone_woe = CalcWOE('Telephone')
foreign_worker_woe = CalcWOE('foreign_worker')

# Numeric attributes that are not binned: each distinct value gets its own
# WOE, exactly like a category.
Installment_rate_woe = CalcWOE('Installment_rate')
Present_residence_since_woe = CalcWOE('Present_residence_since')
Number_of_existing_credits_woe = CalcWOE('Number_of_existing_credits')
Number_of_people_woe = CalcWOE('Number_of_people')


def CalcWOE_bin(VarName, N):
    """Compute WOE and IV of a numeric column over ``N`` equal-width bins.

    Reads the module-level ``german`` DataFrame and the class totals
    ``total_good`` / ``total_bad``. The range ``[min, max]`` of the column is
    cut into ``N`` bins of equal width. The first bin is closed on both sides
    so that it contains the minimum; every later bin is ``(lower, upper]``.
    Good rows are those with ``default == 1``, bad rows those with
    ``default == 2``.

    Args:
        VarName: Name of the numeric column of ``german`` to bin.
        N: Number of equal-width bins.

    Returns:
        DataFrame with one row per bin and the columns ``variable``,
        ``class+woe`` (a list ``[lower, upper, woe]``), ``woe`` and ``iv``.

    Raises:
        KeyError: If a bin has no row with label 1 or no row with label 2.
        ZeroDivisionError: If ``N`` is 0.
    """
    values = german[VarName]
    min_value = min(values)
    max_value = max(values)
    width = float(max_value - min_value) / N
    rows = []
    for i in range(N):
        # Derive both edges from the minimum so that neighbouring bins share
        # exactly the same boundary (no gaps or overlaps from rounding).
        bin_L = min_value + i * width
        # Pin the last edge to the observed maximum so rounding cannot drop it.
        bin_U = float(max_value) if i == N - 1 else min_value + (i + 1) * width
        if i == 0:
            # Only the first bin includes its lower edge, i.e. the minimum.
            in_bin = (values >= bin_L) & (values <= bin_U)
        else:
            in_bin = (values > bin_L) & (values <= bin_U)
        counts = german[in_bin].groupby('default').size()
        good_ratio = float(counts[1]) / total_good
        bad_ratio = float(counts[2]) / total_bad
        WOE = np.log(bad_ratio / good_ratio)
        IV = (bad_ratio - good_ratio) * WOE
        rows.append([VarName, [bin_L, bin_U, WOE], WOE, IV])
    # Build the table once: DataFrame.append was removed in pandas 2.0.
    return pd.DataFrame(rows, columns=['variable', 'class+woe', 'woe', 'iv'])

# Continuous attributes: WOE over five equal-width bins each.
Duration_in_month_woe = CalcWOE_bin('Duration_in_month', N=5)
Credit_amount_woe = CalcWOE_bin('Credit_amount', N=5)
Age_woe = CalcWOE_bin('Age', N=5)

第四步：用woe值替代原来的值

def ReplaceWOE(VarName, SourceDF, VarWOE):
    """Replace every value of a column by its WOE, modifying ``SourceDF`` in place.

    Args:
        VarName: Name of the column of ``SourceDF`` to overwrite.
        SourceDF: DataFrame whose column is replaced; it is mutated.
        VarWOE: Table with a ``class`` column (raw values) and a ``woe``
            column (their WOE), row-aligned.

    Returns:
        The same ``SourceDF`` object, to allow chaining. Values that do not
        occur in ``VarWOE['class']`` become NaN (``Series.map`` semantics).
    """
    # Pair each class with the WOE on the same row. Zipping the two columns
    # is positional, so it does not depend on VarWOE having a 0..n-1 index.
    woe_by_class = dict(zip(VarWOE['class'], VarWOE['woe']))
    SourceDF[VarName] = SourceDF[VarName].map(woe_by_class)
    return SourceDF

# Work on a copy: ReplaceWOE overwrites columns in place, and a plain
# assignment would only create a second name for the raw `german` frame.
german_woe = german.copy()

# Each entry pairs a column with the WOE table computed for it. The columns
# are independent of each other, so the order here does not affect the result.
temp = german_woe
for _column, _woe_table in (
    # nominal attributes
    ('Status_of_existing_checking_account', status_checking_account_woe),
    ('Credit_history', Credit_history_woe),
    ('Purpose', Purpose_woe),
    ('Savings_account', Savings_account_woe),
    ('Present_employment_since', Present_employment_since_woe),
    ('Personal_status_and_sex', Personal_status_and_sex_woe),
    ('Other_debtors', Other_debtors_woe),
    ('Property', Property_woe),
    ('Other_installment_plans', Other_installment_plans_woe),
    ('Housing', Housing_woe),
    ('Job', Job_woe),
    ('Telephone', Telephone_woe),
    ('foreign_worker', foreign_worker_woe),
    # numeric attributes handled per distinct value
    ('Installment_rate', Installment_rate_woe),
    ('Present_residence_since', Present_residence_since_woe),
    ('Number_of_existing_credits', Number_of_existing_credits_woe),
    ('Number_of_people', Number_of_people_woe),
):
    temp = ReplaceWOE(_column, temp, _woe_table)
# Keep both historical names bound to the converted frame.
temp1 = temp

def ReplaceWOE_bin(VarName, SourceDF, VarWOE):
    """Replace every value of a binned numeric column by the WOE of its bin, in place.

    Args:
        VarName: Name of the numeric column of ``SourceDF`` to overwrite.
        SourceDF: DataFrame whose column is replaced; it is mutated.
        VarWOE: Table whose ``class+woe`` column holds one
            ``[lower, upper, woe]`` triple per bin, in ascending bin order.

    Returns:
        The same ``SourceDF`` object, to allow chaining.

    Raises:
        ValueError: If the column has values but ``VarWOE`` has no bins.

    A value is assigned to the first bin whose upper edge is not below it.
    Values at or below the first bin therefore take the first bin's WOE
    (this covers the column minimum), and values above the last upper edge
    take the last bin's WOE instead of silently turning into NaN. Missing
    values stay missing.
    """
    bins = list(VarWOE['class+woe'])
    items = np.unique(SourceDF[VarName])
    if len(items) and not bins:
        raise ValueError('VarWOE contains no bins')
    woe_by_value = {}
    for it in items:
        if pd.isna(it):
            # Leave NaN unmapped so that Series.map keeps it as NaN.
            continue
        last_woe = bins[-1][2]
        woe_by_value[it] = next((w for _, upper, w in bins if it <= upper), last_woe)
    SourceDF[VarName] = SourceDF[VarName].map(woe_by_value)
    return SourceDF

# Swap the three binned numeric columns for their bin WOE, one after another.
for _column, _bin_table in (
    ('Duration_in_month', Duration_in_month_woe),
    ('Credit_amount', Credit_amount_woe),
    ('Age', Age_woe),
):
    temp = ReplaceWOE_bin(_column, temp, _bin_table)
# ReplaceWOE_bin hands back the frame it was given, so both names end up on
# the fully converted data.
temp1 = temp

第五步：将数据集拆分为训练集和测试集

# Features are all columns but the last one ('default' in the column list above).
X = temp1[temp1.columns[:-1].tolist()]
# Shift the labels down by one: 1/2 becomes 0/1.
y = temp1['default'].sub(1)
# Hold out 10% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.1
)

第六步：在训练集上应用logistic regression算法

# Import from the public package: the private module path
# `sklearn.linear_model.logistic` is not available in current scikit-learn
# releases, while this form works on old and new versions alike.
from sklearn.linear_model import LogisticRegression

# Fit on the training split with the library's default settings, then predict
# labels for the held-out rows.
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)

第七步：评估模型分类精度

from sklearn.metrics import accuracy_score
# `sklearn.cross_validation` was removed from scikit-learn; prefer its
# replacement and fall back only on very old installations.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:  # scikit-learn < 0.18 only provides the old module
    from sklearn.cross_validation import cross_val_score

# Accuracy on the held-out test rows:
# print('Accuracy:', accuracy_score(y_test, predictions))

# 5-fold cross-validated scores on the training rows.
scores = cross_val_score(classifier, X_train, y_train, cv=5)
# print(np.mean(scores), scores)

第八步：创建评分卡

# Scorecard scaling: score = A - B*ln(theta), with theta the odds.
# Calibration: the score equals P0 at odds theta0 and drops by PDO every time
# the odds double:
#   P0       = A - B*ln(theta0)
#   P0 - PDO = A - B*ln(2*theta0)
# Solving both equations gives B = PDO/ln(2) and A = P0 + B*ln(theta0).
PDO = 20
P0 = 600
theta0 = 1.0 / 60
B = PDO / np.log(2)
A = P0 + B * np.log(theta0)
coef = classifier.coef_
beta0 = classifier.intercept_

# One WOE table per model feature, listed in the column order of the training
# matrix so that table k lines up with coefficient coef[0][k].
_score_tables = [
    status_checking_account_woe,
    Duration_in_month_woe,
    Credit_history_woe,
    Purpose_woe,
    Credit_amount_woe,
    Savings_account_woe,
    Present_employment_since_woe,
    Installment_rate_woe,
    Personal_status_and_sex_woe,
    Other_debtors_woe,
    Present_residence_since_woe,
    Property_woe,
    Age_woe,
    Other_installment_plans_woe,
    Housing_woe,
    Number_of_existing_credits_woe,
    Job_woe,
    Number_of_people_woe,
    Telephone_woe,
    foreign_worker_woe,
]
if len(_score_tables) != coef.shape[1]:
    raise ValueError('number of WOE tables does not match the number of model coefficients')

# score = A - B*(beta0 + sum_k beta_k*woe_k). The constant part (A - B*beta0)
# is spread evenly over the features, so the divisor is the feature count
# (not PDO, which merely happens to be 20 as well). The intercept is taken as
# a scalar instead of broadcasting a length-1 array against each Series.
_base_points = (A - B * beta0[0]) / len(_score_tables)
for _beta, _table in zip(coef[0], _score_tables):
    _table['score'] = _base_points - B * _beta * _table['woe']

初次用python实现，不当之处请不吝批评指正！

以German信用数据为例的logistic regression算法在评分卡上的实践

相关阅读