import pandas as pd
import numpy as np

"""
kaggle支付反欺诈:IEEE-CIS Fraud Detection第一名方案 特征处理
"""
###### 1. Load data
df_train = pd.read_csv(r"C:\Users\ld\Desktop\yc18\train1.csv",encoding="cp936")
df_test = pd.read_csv(r"C:\Users\ld\Desktop\yc18\test1.csv",encoding="cp936")

X_train = df_train[[i for i in df_train.columns.tolist() if i not in ["sdate", "data"]]]  # training-set features
y_train = df_train["data"]                                                                # training labels
X_test = df_test[[i for i in df_test.columns.tolist() if i not in ["sdate", "data"]]]     # test-set features
y_test = df_test["data"]                                                                  # test labels
cols = X_train.columns.tolist()  # list of feature names

###### 1. Frequency encode
def encode_FE(df1, df2, cols):
    """
    Frequency encoding: replace each category with its relative frequency,
    computed over the combined train + test data.
    :param df1: training set (X_train)
    :param df2: test set (X_test)
    :param cols: list of column names to encode
    """
    for col in cols:  # for each feature
        df = pd.concat([df1[col], df2[col]])  # concatenate train and test
        vc = df.value_counts(dropna=True, normalize=True).to_dict()  # category -> relative frequency
        print("vc:", vc)
        vc[-1] = -1  # keep the -1 missing-value placeholder as -1 instead of a frequency
        nm = col + "FE"
        df1[nm] = df1[col].map(vc)
        df1[nm] = df1[nm].astype("float32")
        df2[nm] = df2[col].map(vc)
        df2[nm] = df2[nm].astype("float32")
    # df1.to_csv(r"C:\Users\ld\Desktop\yc18\df1.csv",encoding="cp936")
    # df2.to_csv(r"C:\Users\ld\Desktop\yc18\df2.csv",encoding="cp936")
# encode_FE(df_train, X_test, cols)
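
# A quick self-contained sketch (not from the original solution) of what encode_FE
# produces; the frames and the column name "cat" below are made up for illustration:
# demo_tr = pd.DataFrame({"cat": ["a", "a", "b", "c"]})
# demo_te = pd.DataFrame({"cat": ["a", "b", "b"]})
# encode_FE(demo_tr, demo_te, ["cat"])
# print(demo_tr["catFE"].tolist())  # a -> 3/7, b -> 3/7, c -> 1/7, counted over train + test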

###### 2. Label encode
def encode_LE(col, train=X_train, test=X_test, verbose=True):
    """
    Label encoding: map each distinct (possibly non-numeric) value to an integer code,
    fitted on the combined train + test column.
    :param col: column name
    :param train: training set
    :param test: test set
    :param verbose: print the column name when done
    """
    df_comb = pd.concat([train[col], test[col]], axis=0)
    df_comb, _ = pd.factorize(df_comb)  # integer codes shared by train and test
    nm = col
    # Use integer dtypes: float16 cannot represent integers above 2048 exactly
    if df_comb.max() > 32000:
        train[nm] = df_comb[0: len(train)].astype("int32")
        test[nm] = df_comb[len(train):].astype("int32")
    else:
        train[nm] = df_comb[0: len(train)].astype("int16")
        test[nm] = df_comb[len(train):].astype("int16")
    del df_comb
    if verbose:
        print(col)
    # print("------------------------>",nm)
    # for i in train[col]:
    #     print(i)

# col = 'holiday_月'
# encode_LE(col, train=X_train, test=X_test, verbose=True)
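
# A minimal sketch of encode_LE on made-up frames (the column "city" and its values
# are hypothetical); each distinct string gets an integer code shared by both sets:
# demo_tr = pd.DataFrame({"city": ["bj", "sh", "bj"]})
# demo_te = pd.DataFrame({"city": ["sh", "gz"]})
# encode_LE("city", train=demo_tr, test=demo_te, verbose=False)
# print(demo_tr["city"].tolist(), demo_te["city"].tolist())  # e.g. [0, 1, 0] [1, 2]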


###### 3. Group aggregation: statistics of a feature within each value of a uid feature
def encode_AG(main_columns, uids, aggregations=["mean"], df_train=X_train, df_test=X_test, fillna=True, usena=False):
    """
    For each uid feature, compute statistics (mean, std, ...) of the main features
    within each uid value, over the combined train + test data.
    :param main_columns: list of feature names to aggregate
    :param uids: list of uid (group) feature names
    :param aggregations: list of aggregation functions, e.g. ["mean", "std"]
    :param df_train: training set
    :param df_test: test set
    :param fillna: fill missing aggregated values with -1
    :param usena: treat the -1 placeholder in main_column as NaN before aggregating
    """
    for main_column in main_columns:
        for col in uids:
            for agg_type in aggregations:
                new_column = main_column + "_" + col + "_" + agg_type
                temp_df = pd.concat([df_train[[col, main_column]], df_test[[col, main_column]]])
                if usena:
                    temp_df.loc[temp_df[main_column] == -1, main_column] = np.nan

                # mean / std (etc.) of main_column within each value of the uid feature
                temp_df = temp_df.groupby([col])[main_column].agg([agg_type]).reset_index().rename(
                    columns={agg_type: new_column})
                # use the uid values as the index so the result can be turned into a mapping dict
                temp_df.index = list(temp_df[col])
                temp_df = temp_df[new_column].to_dict()
                # temp_df is now a dict: uid value -> aggregated statistic
                df_train[new_column] = df_train[col].map(temp_df).astype("float32")
                df_test[new_column] = df_test[col].map(temp_df).astype("float32")
                if fillna:
                    df_train[new_column] = df_train[new_column].fillna(-1)
                    df_test[new_column] = df_test[new_column].fillna(-1)
                print(new_column)
    # debug check for one generated column (dataset-specific), kept commented out:
    # for i in df_train["上周同期均值_日均六楼湿度_分桶_change_mean"]:
    #     print(i)

# main_columns = ["日地回水温_change","上周同期均值"]
# uids = ["holiday_月","日均六楼湿度_分桶_change"]
# encode_AG(main_columns, uids, aggregations=["mean"], df_train=X_train, df_test=X_test, fillna=True, usena=False)
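
# A toy sketch of encode_AG (frames and the names "grp"/"amt" are hypothetical):
# for each value of the uid column "grp", the mean of "amt" is computed over
# train + test and mapped back onto every row:
# demo_tr = pd.DataFrame({"grp": ["x", "x", "y"], "amt": [1.0, 3.0, 10.0]})
# demo_te = pd.DataFrame({"grp": ["y", "x"], "amt": [20.0, 2.0]})
# encode_AG(["amt"], ["grp"], aggregations=["mean"], df_train=demo_tr, df_test=demo_te)
# print(demo_tr["amt_grp_mean"].tolist())  # [2.0, 2.0, 15.0]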

###### 4. Combine features (crossed features)
def encode_CB(col1, col2, df1=X_train, df2=X_test):
    """
    Combine two features: concatenate their string values (a Cartesian-product style
    crossed feature), then label-encode the combined column.
    :param col1: first feature name
    :param col2: second feature name
    :param df1: training set
    :param df2: test set
    """
    nm = col1 + '_' + col2
    df1[nm] = df1[col1].astype(str) + '_' + df1[col2].astype(str)
    df2[nm] = df2[col1].astype(str) + '_' + df2[col2].astype(str)
    # pass df1/df2 explicitly so the label encoding is applied to the same frames
    encode_LE(nm, train=df1, test=df2, verbose=False)
    print(nm, ', ', end='')
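
# A small usage sketch with hypothetical columns "c1"/"c2": the combined string
# column "c1_c2" is immediately re-encoded to integer codes by encode_LE:
# demo_tr = pd.DataFrame({"c1": ["a", "b"], "c2": [1, 2]})
# demo_te = pd.DataFrame({"c1": ["a", "a"], "c2": [2, 1]})
# encode_CB("c1", "c2", df1=demo_tr, df2=demo_te)
# print(demo_tr["c1_c2"].tolist(), demo_te["c1_c2"].tolist())  # e.g. [0, 1] [2, 0]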

###### 5. Group aggregation NUNIQUE: count distinct values of a feature within each uid value
def encode_AG2(main_columns, uids, train_df=X_train, test_df=X_test):
    """
    For each uid feature, count how many distinct values main_column takes within
    each uid value, over the combined train + test data.
    :param main_columns: list of feature names to count
    :param uids: list of uid (group) feature names
    :param train_df: training set
    :param test_df: test set
    """
    for main_column in main_columns:
        for col in uids:
            comb = pd.concat([train_df[[col, main_column]], test_df[[col, main_column]]], axis=0)
            mp = comb.groupby(col)[main_column].agg(['nunique'])['nunique'].to_dict()  # uid value -> number of distinct main_column values
            train_df[col + '_' + main_column + '_ct'] = train_df[col].map(mp).astype('float32')
            test_df[col + '_' + main_column + '_ct'] = test_df[col].map(mp).astype('float32')
            print(col + '_' + main_column + '_ct, ', end='')

main_columns = ["日地回水温_change","上周同期均值"]
uids = ["holiday_月","日均六楼湿度_分桶_change"]
encode_AG2(main_columns, uids, train_df=X_train, test_df=X_test)
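
# A toy sketch of encode_AG2 (frames and the names "grp"/"amt" are hypothetical):
# for each value of the uid column "grp", count the distinct "amt" values
# appearing across train + test:
# demo_tr = pd.DataFrame({"grp": ["x", "x", "y"], "amt": [1, 2, 5]})
# demo_te = pd.DataFrame({"grp": ["y", "x"], "amt": [5, 3]})
# encode_AG2(["amt"], ["grp"], train_df=demo_tr, test_df=demo_te)
# print(demo_tr["grp_amt_ct"].tolist())  # [3.0, 3.0, 1.0]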


