流程
2022-06-20 20:03:42 4 举报
AI智能生成
408专用
作者其他创作
大纲/内容
分支主题
分支主题
分支主题
确定分类或回归
数据集YX
训练集、验证集划分(test_size=0.33, random_state=42)
选择算法,及评价指标:回归:MSE,分类:准确率
例子
import numpy as np
import pandas as pd

# Load the competition training data.
train = pd.read_csv(r"C:\Users\Administrator\Desktop\train.csv")

# BUG FIX: the original previewed `data[:10]`, but `data` is not defined at
# this point — the frame just loaded is `train`.
train[:10]
# Inspect dtypes / non-null counts of the loaded frame.
train.info()

# BUG FIX: the original fused three value_counts() calls onto a single line
# (invalid syntax); they were separate notebook cells. Run one per column.
for col in ('feature_1', 'feature_2', 'feature_3'):
    train[col].value_counts()

# One-hot encode the two nominal features (feature_3 stays numeric).
train = pd.get_dummies(train, columns=['feature_1', 'feature_2'])
# Assemble target and design matrix from the one-hot-encoded frame.
Y = train['target']
feature_cols = [
    'feature_3', 'feature_1_1',
    'feature_1_2', 'feature_1_3', 'feature_1_4', 'feature_1_5',
    'feature_2_1', 'feature_2_2', 'feature_2_3',
]
X = train[feature_cols]

# Hold out a third of the rows for validation (fixed seed for repeatability).
from sklearn.model_selection import train_test_split
trainX, valX, trainy, valy = train_test_split(
    X, Y, test_size=0.33, random_state=42)

# Fit an ordinary least-squares baseline.
from sklearn.linear_model import LinearRegression
lr_model = LinearRegression().fit(trainX, trainy)

# Validation-set MSE (symmetric in its two arguments).
from sklearn.metrics import mean_squared_error
mean_squared_error(lr_model.predict(valX), valy)
子主题
# Reload the raw file and keep only the target plus the three features
# for exploratory analysis.
data = pd.read_csv(r"C:\Users\Administrator\Desktop\train.csv")
data = data[['target', 'feature_1', 'feature_2', 'feature_3']]

import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise linear correlations, drawn on a fixed [-1, 1] colour scale.
sns.heatmap(data.corr(), vmin=-1, vmax=1, square=True, annot=True)
import seaborn as sns <br>import matplotlib.pyplot as plt<br>%matplotlib inline<br><br>fig, ax = plt.subplots(1, 3, figsize = (16, 6))<br>plt.suptitle('Violineplots for features and target')<br>sns.violinplot(x="feature_1", y="target", data=data, ax=ax[0], title='feature_1')<br>sns.violinplot(x="feature_2", y="target", data=data, ax=ax[1], title='feature_2')<br>sns.violinplot(x="feature_3", y="target", data=data, ax=ax[2], title='feature_3')
# Pull the full card-transaction history and preview it.
historical_transactions = pd.read_csv(
    r"C:\Users\Administrator\Desktop\historical_transactions.csv")
historical_transactions[:5]

# Keep only the most recent three months (month_lag of -3, -2, -1, 0).
recent = historical_transactions['month_lag'] > -4
historical_transactions_3month = historical_transactions.loc[recent]
historical_transactions_3month.info()

# The full table is large; drop it and force a collection pass to
# release the memory before further processing.
del historical_transactions

import gc
gc.collect()
# These numeric ids are nominal, not quantitative; cast them to object so
# describe()/get_dummies treat them as categories.
col_list = ['city_id', 'installments', 'merchant_category_id',
            'category_2', 'state_id', 'subsector_id']
for name in col_list:
    historical_transactions_3month[name] = (
        historical_transactions_3month[name].astype(object))

# Confirm the dtype change took effect.
historical_transactions_3month[col_list].info()
# Summary statistics for the categorical columns.
categories = ['authorized_flag', 'city_id', 'installments', 'category_1',
              'category_2', 'category_3', 'merchant_category_id',
              'merchant_id', 'state_id', 'subsector_id']
historical_transactions_3month[categories].describe()

# Distribution of the one numeric column: summary stats, then a histogram.
historical_transactions_3month['purchase_amount'].describe()
historical_transactions_3month['purchase_amount'].hist()
<br>def aggregate_historical_transactions_3month(trans, prefix):<br> """<br> Input: <br> trans: 用于抽取特征的数据集,如historical_transactions_3month;<br> prefix: 用于生成特征的前缀;<br> <br> Return:<br> agg_trans: 按照card_id汇总后的特征数据集,可以用于与train.csv关联后建模<br> <br> """<br><br> # 将authorized_flag字段类型转换为数字,”Y“转换为1, ”N“转换为0。<br><br> trans['authorized_flag'] = trans['authorized_flag'].apply(lambda x: 1 if x == 'Y' else 0)<br> <br> # 将category_1字段类型转换为数字,”Y“转换为1, ”N“转换为0。<br> trans['category_1'] = trans['category_1'].apply(lambda x: 1 if x == 'Y' else 0)<br> <br> # 将category_2字段中缺失值定义为单独类别,用“6”表示该类<br> trans['category_2'] = trans['category_2'].fillna(6)<br> <br> # 将category_3字段中字符映射为数字,缺失值用“3”表示单独一类<br> map_dict = {'A': 0, 'B': 1, 'C': 2, 'nan': 3}<br> trans['category_3'] = trans['category_3'].apply(lambda x: map_dict[str(x)])<br> <br> # 将installmens, category_2, category_3进行独热编码<br> <br> trans = pd.get_dummies(trans, columns=['installments', 'category_2', 'category_3'])<br> <br> # 定义agg_func字典<br> agg_func = {<br> 'authorized_flag': ['sum', 'mean'],<br> 'category_1': ['sum', 'mean'],<br> 'category_2_1.0': ['mean', 'sum'],<br> 'category_2_2.0': ['mean', 'sum'],<br> 'category_2_3.0': ['mean', 'sum'],<br> 'category_2_4.0': ['mean', 'sum'],<br> 'category_2_5.0': ['mean', 'sum'],<br> 'category_2_6.0': ['mean', 'sum'],<br> 'category_3_1': ['sum', 'mean'],<br> 'category_3_2': ['sum', 'mean'],<br> 'category_3_3': ['sum', 'mean'],<br> 'installments_0': ['sum', 'mean'],<br> 'installments_1': ['sum', 'mean'],<br> 'installments_2': ['sum', 'mean'],<br> 'installments_3': ['sum', 'mean'],<br> 'installments_4': ['sum', 'mean'],<br> 'installments_5': ['sum', 'mean'],<br> 'installments_6': ['sum', 'mean'],<br> 'installments_7': ['sum', 'mean'],<br> 'installments_8': ['sum', 'mean'],<br> 'installments_9': ['sum', 'mean'],<br> 'installments_10': ['sum', 'mean'],<br> 'installments_11': ['sum', 'mean'],<br> 'installments_12': ['sum', 'mean'],<br> 'installments_-1': ['sum', 'mean'],<br> 'installments_999': 
['sum', 'mean'], <br> 'merchant_id': ['nunique'],<br> 'purchase_amount': ['sum', 'mean', 'max', 'min'], <br> 'merchant_category_id': ['nunique'],<br> 'state_id': ['nunique'],<br> 'subsector_id': ['nunique'],<br> 'city_id': ['nunique']<br> }<br> <br> # 基于agg_func,按照card_id进行特征抽取<br> agg_trans = trans.groupby(['card_id']).agg(agg_func)<br> <br> # 为新特征增加前缀<br> agg_trans.columns = [prefix + '_'.join(col).strip() for col in agg_trans.columns.values]<br> agg_trans.reset_index(inplace=True)<br><br> # 按照card_id汇总消费笔数<br> df = (trans.groupby('card_id')<br> .size()<br> .reset_index(name='{}transactions_count'.format(prefix)))<br><br> # 将agg_trans数据集与df数据集合并为新的agg_trans数据集,使用card_id作为关联主键<br> agg_trans = pd.merge(df, agg_trans, on='card_id', how='left')<br><br> return agg_trans<br><br># 对historical_transactions_3month数据集执行特征抽取函数<br>history_3month = aggregate_historical_transactions_3month(historical_transactions_3month, prefix='hist_')
合并新合成的特征数据集至train.csv数据集,得到新的训练数据;<br>合并过程中,可能存在部分持卡人在近3个月内无消费行为的情况,导致存在缺失值。因此可以将合并后数据集中缺失值填充为0。
# Join the aggregated history features onto the training frame. Cards with
# no transactions in the 3-month window come back as NaN, so zero-fill.
train_add_history_3month = pd.merge(
    train, history_3month, on='card_id', how='left')
train_add_history_3month.fillna(0, inplace=True)
train_add_history_3month.info()

# Target plus every generated history feature (all carry the 'hist' prefix).
Y = train_add_history_3month['target']
feature = [c for c in train_add_history_3month.columns.values if 'hist' in c]
X = train_add_history_3month[feature]

# 80/20 split with a fixed seed; these names are reused by the
# feature-importance plot below.
from sklearn.model_selection import train_test_split

train_X, val_X, train_y, val_y = train_test_split(
    X, Y, test_size=0.20, random_state=42)
通过绘制特征重要性分布图,分析模型特征重要性
# #############################################################################
# Plot relative feature importance.
import numpy as np

# NOTE(review): `est` is a fitted model from a step not shown here —
# presumably a tree ensemble; confirm it exposes feature_importances_.
feature_importance = est.feature_importances_
# Rescale so the most important feature reads as 100.
feature_importance = 100.0 * (feature_importance / feature_importance.max())
order = np.argsort(feature_importance)
bar_pos = np.arange(order.shape[0]) + .5

plt.figure(figsize=(16, 16))
plt.barh(bar_pos, feature_importance[order], align='center')
plt.yticks(bar_pos, train_X.columns[order])
plt.xlabel('Relative Importance')
plt.title('Variable Importance')
plt.show()
保存
# Persist the enriched training set without the row index.
# FIX (idiom): the original passed `index=0`, relying on 0 being falsy;
# `index=False` states the intent explicitly.
train_add_history_3month.to_csv(
    "./data/loyalty/train_add_hist_3month.csv", index=False)
0 条评论
下一页