# Kaggle payment anti-fraud: feature-engineering code from the
# IEEE-CIS Fraud Detection 1st-place solution.
import pandas as pd
import numpy as np
"""
kaggle支付反欺诈:IEEE-CIS Fraud Detection第一名方案 特征处理
"""
###### 1. Load data
df_train = pd.read_csv(r"C:\Users\ld\Desktop\yc18\train1.csv", encoding="cp936")
df_test = pd.read_csv(r"C:\Users\ld\Desktop\yc18\test1.csv", encoding="cp936")

# "sdate" and "data" are excluded from the feature matrices; "data" is the label.
_drop = ("sdate", "data")
X_train = df_train[[c for c in df_train.columns if c not in _drop]]  # training features
y_train = df_train["data"]                                           # training labels
X_test = df_test[[c for c in df_test.columns if c not in _drop]]     # test features
y_test = df_test["data"]                                             # test labels
cols = X_train.columns.tolist()  # list of feature names
#########1. frequency encode
def encode_FE(df1, df2, cols):
    """Frequency-encode categorical columns in place.

    For each column in ``cols``, compute normalized value frequencies over
    the concatenated train+test values, then map every row to its category's
    frequency, stored as a new ``<col>FE`` float32 column on both frames.

    :param df1: training DataFrame (modified in place)
    :param df2: test DataFrame (modified in place)
    :param cols: list of column names to encode
    """
    for col in cols:
        # Pool train and test so both sides share one frequency table.
        combined = pd.concat([df1[col], df2[col]])
        freq = combined.value_counts(dropna=True, normalize=True).to_dict()
        # -1 appears to be used as a missing-value sentinel upstream; map it
        # to a distinctive -1 encoding rather than a real frequency.
        freq[-1] = -1
        new_col = col + "FE"
        df1[new_col] = df1[col].map(freq).astype("float32")
        df2[new_col] = df2[col].map(freq).astype("float32")
# encode_FE(df_train, X_test, cols)
######2. label encode
def encode_LE(col, train=X_train, test=X_test, verbose=True):
    """Label-encode one column in place with integer codes.

    Factorizes the concatenated train+test values so both frames share a
    single code space, then writes the codes back over the original column,
    downcast to float16 when the max code is <= 32000, float32 otherwise.

    :param col: name of the column to encode
    :param train: training DataFrame (default: module-level X_train)
    :param test: test DataFrame (default: module-level X_test)
    :param verbose: if True, print the column name when done
    """
    combined = pd.concat([train[col], test[col]], axis=0)
    codes, _ = pd.factorize(combined)
    # NOTE(review): float16 represents integers exactly only up to 2048;
    # the original kernel's 32000 threshold is kept for compatibility.
    dtype = "float32" if codes.max() > 32000 else "float16"
    train[col] = codes[: len(train)].astype(dtype)
    test[col] = codes[len(train):].astype(dtype)
    del codes
    if verbose:
        print(col)
######3.组合特征不同取值下求其他特征统计值
def encode_AG(main_columns, uids, aggregations=("mean",), df_train=X_train,
              df_test=X_test, fillna=True, usena=False):
    """Add grouped aggregation statistics as new features, in place.

    For every (main_column, uid, aggregation) triple, aggregate
    ``main_column`` per value of ``uid`` over the pooled train+test rows and
    map the result back onto both frames as a new float32 column named
    ``<main>_<uid>_<agg>``.

    :param main_columns: list of columns to aggregate
    :param uids: list of grouping ("uid") columns
    :param aggregations: aggregation names accepted by pandas agg (e.g. "mean", "std")
    :param df_train: training DataFrame (default: module-level X_train)
    :param df_test: test DataFrame (default: module-level X_test)
    :param fillna: fill unmapped rows with -1
    :param usena: treat the -1 sentinel as NaN so it is ignored by the aggregation
    """
    for main_column in main_columns:
        for col in uids:
            for agg_type in aggregations:
                new_column = main_column + "_" + col + "_" + agg_type
                pooled = pd.concat([df_train[[col, main_column]],
                                    df_test[[col, main_column]]])
                if usena:
                    pooled.loc[pooled[main_column] == -1, main_column] = np.nan
                # Per-uid statistic of main_column, turned into a uid -> value dict.
                stats = (pooled.groupby([col])[main_column]
                         .agg([agg_type])
                         .reset_index()
                         .rename(columns={agg_type: new_column}))
                stats.index = list(stats[col])
                mapping = stats[new_column].to_dict()
                df_train[new_column] = df_train[col].map(mapping).astype("float32")
                df_test[new_column] = df_test[col].map(mapping).astype("float32")
                if fillna:
                    df_train[new_column].fillna(-1, inplace=True)
                    df_test[new_column].fillna(-1, inplace=True)
                print(new_column)
######4.COMBINE FEATURES交叉特征
def encode_CB(col1, col2, df1=X_train, df2=X_test):
    """Cross two features into one, in place.

    Creates ``<col1>_<col2>`` as the '_'-joined string concatenation of the
    two columns on both frames, then label-encodes the new column.

    :param col1: first feature name
    :param col2: second feature name
    :param df1: training DataFrame (default: module-level X_train)
    :param df2: test DataFrame (default: module-level X_test)
    """
    nm = col1 + '_' + col2
    df1[nm] = df1[col1].astype(str) + '_' + df1[col2].astype(str)
    df2[nm] = df2[col1].astype(str) + '_' + df2[col2].astype(str)
    # Bug fix: pass df1/df2 through instead of relying on encode_LE's
    # module-global defaults, so non-default frame pairs are encoded too.
    encode_LE(nm, train=df1, test=df2, verbose=False)
    print(nm, ', ', end='')
######5.GROUP AGGREGATION NUNIQUE 组合特征不同取值下求其他特征唯一值个数
def encode_AG2(main_columns, uids, train_df=X_train, test_df=X_test):
    """Add grouped nunique-count features, in place.

    For every (main_column, uid) pair, count the distinct values of
    ``main_column`` under each value of ``uid`` over the pooled train+test
    rows, and map that count onto both frames as a new float32 column named
    ``<uid>_<main>_ct``.

    :param main_columns: list of columns whose distinct values are counted
    :param uids: list of grouping ("uid") columns
    :param train_df: training DataFrame (default: module-level X_train)
    :param test_df: test DataFrame (default: module-level X_test)
    """
    for main_column in main_columns:
        for col in uids:
            pooled = pd.concat([train_df[[col] + [main_column]],
                                test_df[[col] + [main_column]]], axis=0)
            # uid value -> number of distinct main_column values under it
            nunique_map = pooled.groupby(col)[main_column].agg(['nunique'])['nunique'].to_dict()
            new_col = col + '_' + main_column + '_ct'
            train_df[new_col] = train_df[col].map(nunique_map).astype('float32')
            test_df[new_col] = test_df[col].map(nunique_map).astype('float32')
            print(new_col + ', ', end='')
# Example run: count distinct values of the two target columns under each
# holiday-month / humidity-bucket group.
main_columns = ["日地回水温_change", "上周同期均值"]
uids = ["holiday_月", "日均六楼湿度_分桶_change"]
encode_AG2(main_columns, uids, train_df=X_train, test_df=X_test)
# References:
# https://www.pianshen.com/article/85891157639/
# https://zhuanlan.zhihu.com/p/85947569
# https://blog.csdn.net/ThomasCai001/article/details/102799177
# https://www.jianshu.com/p/80ed05362f58