文章目录
- MLR
- deepctr实现MLR
MLR



https://zhuanlan.zhihu.com/p/100532677
https://blog.csdn.net/fyneru_xiaohui/article/details/106390266
deepctr实现MLR
import os, warnings, time, sys
import pickle
import matplotlib.pyplot as plt
import pandas as pd, numpy as np
from sklearn.utils import shuffle
from sklearn.metrics import f1_score, accuracy_score, roc_curve, precision_score, recall_score, roc_auc_score
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OneHotEncoder
from deepctr.models import DeepFM, xDeepFM, MLR, DeepFEFM, DIN, AFM
from deepctr.feature_column import SparseFeat, DenseFeat, get_feature_names
from deepctr.layers import custom_objects
from tensorflow.python.keras.models import save_model, load_model
from tensorflow.keras.models import model_from_yaml
import tensorflow as tf
from tensorflow.python.ops import array_ops
import tensorflow.keras.backend as K
from sklearn import datasets
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical
from keras.models import model_from_json
from tensorflow.keras.callbacks import *
from tensorflow.keras.models import *
from tensorflow.keras.layers import *
from tensorflow.keras.optimizers import *
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import one_hot
from keras.layers.embeddings import Embedding
from toolsnn import *


def train_MLR():
    """Train, save and evaluate a deepctr MLR (Mixed Logistic Regression) CTR model.

    Pipeline:
      1. Load the ASCII-encoded train/test CSVs and merge the test-set
         negatives into the training pool.
      2. Feature engineering: one-hot encode sparse categoricals, min-max
         scale dense columns, ASCII-sum-encode id-like strings; persist every
         fitted encoder with pickle so serving can replay the transforms.
      3. Split into train/val/test, build deepctr feature columns, fit the
         MLR model, save it, and report AUC/Precision/Recall/F1 on the test
         set, then run a full evaluation.

    NOTE(review): relies on module-level names star-imported from ``toolsnn``
    (``train_path_ascii``, ``test_path_ascii``, ``feature_encode_path``,
    ``transformDF``, ``getRata2``, ``negBpow``, ``GeneratorRandomPatchs``,
    ``save_path``, ``full_evaluate2``) -- confirm against that module.
    """
    print('MLR 模型训练开始 ', time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time())))
    start_time_start = time.time()
    # Dataset statistics for reference:
    #   pdtrain:     565485 pos / 1133910 neg  (1 : 2.0052)
    #   pdtest:      565485 pos / 1134505 neg  (1 : 2.0063)
    #   pdeval_full:     46 pos /    8253 neg  (1 : 179.413)
    pdtrain = pd.read_csv(train_path_ascii)
    pdtest = pd.read_csv(test_path_ascii)
    # Training pool = full train set + the negative rows of the test set.
    data = pd.concat([pdtrain, pdtest[pdtest['y'] == 0]], axis=0, ignore_index=True)
    data = data.drop(['WilsonClickRate_all', 'WilsonClickRate_yesterday',
                      'WilsonAd_clickRate_all', 'WilsonAd_clickRate_yesterday'], axis=1)
    # Numericize user id / ad id / device model strings by summing the ASCII
    # code of every character, so they can be fed to embedding layers.
    data['suuid'] = data['suuid'].apply(lambda x: sum(ord(c) for c in x))
    data['advertisement'] = data['advertisement'].apply(lambda x: sum(ord(c) for c in x))
    # 'position' (ad-slot id) is already numeric and is embedded directly.
    data['user_modelMake'] = data['user_modelMake'].apply(lambda x: sum(ord(c) for c in x))
    # double -> float
    data = transformDF(data, ['reserve_price', 'reserve_price_cpc', 'clickRate_all',
                              'clickRate_yesterday', 'ad_clickRate_yesterday'], float)

    # ---------------- feature processing ----------------
    global sparsecols, densecols
    # sparse categoricals -> one-hot
    sparsecols = ['hour', 'advert_place', 'province_id', 'port_type', 'user_osID',
                  'is_holidays', 'is_being', 'is_outflow', 'advertiser', 'ad_from', 'payment']
    # ASCII-encoded ids -> embedding
    sparse_ascii = ['suuid', 'advertisement', 'position', 'user_modelMake']
    # dense columns -> min-max normalization
    densecols = ['W', 'H', 'reserve_price', 'reserve_price_cpc', 'is_rest_click',
                 'clickPerHour_yesterday', 'display_nums_all', 'click_nums_all',
                 'display_nums_yesterday', 'click_nums_yesterday', 'ad_display_all',
                 'ad_click_all', 'ad_display_yesterday', 'ad_click_yesterday']
    # click-rate columns, used as-is (already in [0, 1])
    ratecols = ['WHrate', 'clickRate_all', 'clickRate_yesterday', 'ad_clickRate_yesterday']
    global namesoh
    namesoh = {}  # sparse column name -> list of generated one-hot column names
    for sparse in sparsecols:
        onehot = OneHotEncoder()
        arrays = onehot.fit_transform(np.array(data[sparse]).reshape(-1, 1)).toarray()
        names = [sparse + '_' + str(n) for n in range(len(arrays[0]))]
        namesoh[sparse] = names
        # Splice the one-hot columns back into the frame and drop the original.
        data = pd.concat([data, pd.DataFrame(arrays, columns=names)], axis=1)
        data = data.drop([sparse], axis=1)
        # Persist the fitted encoder for serving-time reuse.
        with open(feature_encode_path.format(sparse) + '.pkl', 'wb') as f:
            pickle.dump(onehot, f)
    print(' onehot完成', time.strftime("%H:%M:%S", time.localtime(time.time())))
    for dense in densecols:
        mms = MinMaxScaler(feature_range=(0, 1))
        data[dense] = mms.fit_transform(np.array(data[dense]).reshape(-1, 1))
        with open(feature_encode_path.format(dense) + '.pkl', 'wb') as f:
            pickle.dump(mms, f)
    print(' 归一化完成', time.strftime("%H:%M:%S", time.localtime(time.time())))
    print(' columns: ', len(list(data.columns)))

    # ---------------- train / test / validation split ----------------
    train_data, test_data = getRata2(data, num=1)
    _, val_data = train_test_split(test_data, test_size=0.2, random_state=1, shuffle=True)
    train_data = shuffle(train_data)
    test_data = shuffle(test_data)
    val_data = shuffle(val_data)
    negBpow(train_data, '训练集')
    negBpow(val_data, '验证集')
    negBpow(test_data, '测试集')
    print(' train_data shape: ', train_data.shape)
    print(' val_data shape: ', val_data.shape)
    print(' test_data shape: ', test_data.shape)

    # deepctr feature columns
    sparse_features = [v for value in namesoh.values() for v in value]
    dense_features = densecols + ratecols
    sparse_feature_columns1 = [SparseFeat(feat, vocabulary_size=int(train_data[feat].max() + 1),
                                          embedding_dim=4)
                               for feat in sparse_features]
    sparse_feature_columns2 = [SparseFeat(feat, vocabulary_size=int(train_data[feat].max() + 1),
                                          embedding_dim=4)
                               for feat in sparse_ascii]
    sparse_feature_columns = sparse_feature_columns1 + sparse_feature_columns2
    dense_feature_columns = [DenseFeat(feat, 1) for feat in dense_features]
    print(' sparse_features count: ', len(sparse_features))
    print(' dense_features count: ', len(dense_features))
    linear_feature_columns = sparse_feature_columns + dense_feature_columns
    # One-hot names of user-side features: candidates for MLR's region
    # (clustering) input. Currently unused -- the full feature set is passed
    # as region features below.
    tmp_user = ['hour', 'province_id', 'user_osID', 'is_holidays', 'is_being', 'is_outflow']
    region_feature_columns = [v for key, value in namesoh.items() if key in tmp_user
                              for v in value]
    base_feature_columns = linear_feature_columns
    global feature_names
    feature_names = get_feature_names(linear_feature_columns)
    print(' feature_names: ', feature_names)

    # ---------------- feed input ----------------
    train_x = {name: train_data[name].values for name in feature_names}
    test_x = {name: test_data[name].values for name in feature_names}
    val_x = {name: val_data[name].values for name in feature_names}
    train_y = train_data[['y']].values
    test_y = test_data[['y']].values
    val_y = val_data[['y']].values
    print(' 数据处理完成', time.strftime("%H:%M:%S", time.localtime(time.time())))

    # MLR arguments:
    #   region_feature_columns: features used to cluster users into regions
    #   base_feature_columns:   base-model features; may be the full feature
    #                           set or only the ad-side features
    #   l2_reg_linear:          L2 regularization strength of the LR parts
    #   bias_feature_columns:   preference features -- users in the same
    #                           cluster share similar ad-click preference
    deep = MLR(region_feature_columns=base_feature_columns, region_num=4,
               l2_reg_linear=1e-5, task='binary')
    # NOTE(review): `lr` is the legacy alias of `learning_rate` in tf.keras
    # optimizers -- kept for compatibility with the TF version in use.
    mNadam = Adam(lr=1e-4, beta_1=0.95, beta_2=0.96)
    deep.compile(optimizer=mNadam, loss='binary_crossentropy',
                 metrics=['AUC', 'Precision', 'Recall'])
    print(' 组网完成', time.strftime("%H:%M:%S", time.localtime(time.time())))
    print(' 训练开始 ', time.strftime("%H:%M:%S", time.localtime(time.time())))
    start_time = time.time()

    # ---------------- training ----------------
    # Early stopping: stop when val precision improves by less than min_delta.
    # (Currently disabled: the callbacks list below is commented out.)
    earlystop_callback = EarlyStopping(monitor='val_precision', min_delta=0.001, mode='max',
                                       verbose=2, patience=3)
    generator_flag = False  # False: fit on in-memory arrays; True: fit_generator
    if not generator_flag:
        history = deep.fit(train_x, train_y,
                           validation_data=(val_x, val_y),
                           batch_size=2000,
                           epochs=3,
                           verbose=2,
                           shuffle=True,
                           # callbacks=[earlystop_callback]
                           )
    else:
        batch_size = 2000
        train_nums = len(train_data)
        # NOTE(review): fit_generator is deprecated in recent tf.keras; kept
        # for compatibility with the version in use.
        history = deep.fit_generator(GeneratorRandomPatchs(train_x, train_y, batch_size,
                                                           train_nums, feature_names),
                                     validation_data=(val_x, val_y),
                                     steps_per_epoch=train_nums // batch_size,
                                     epochs=3000,
                                     verbose=2,
                                     shuffle=True,
                                     # callbacks=[earlystop_callback]
                                     )
    end_time = time.time()
    print(' 训练完成', time.strftime("%H:%M:%S", time.localtime(time.time())))
    print((' 训练运行时间: {:.0f}分 {:.0f}秒'.format((end_time - start_time) // 60,
                                             (end_time - start_time) % 60)))
    # Save the trained model.
    save_model(deep, save_path)
    print(' 模型保存完成', time.strftime("%H:%M:%S", time.localtime(time.time())))
    # # training curves
    # visualization(history, saveflag=True, showflag=False,
    #               path1=loss_plt_path.format('loss_auc.jpg'),
    #               path2=loss_plt_path.format('precision_recall.jpg'))

    # ---------------- test-set evaluation ----------------
    scores = deep.evaluate(test_x, test_y, verbose=0)
    print(' %s: %.4f' % (deep.metrics_names[0], scores[0]))
    print(' %s: %.4f' % (deep.metrics_names[1], scores[1]))
    print(' %s: %.4f' % (deep.metrics_names[2], scores[2]))
    print(' %s: %.4f' % (deep.metrics_names[3], scores[3]))
    # Guard against 0/0 when precision and recall are both zero (the original
    # expression raised ZeroDivisionError in that case).
    precision, recall = scores[2], scores[3]
    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) else 0.0
    print(' %s: %.4f' % ('F1', f1))
    print(' 验证集再评估完成', time.strftime("%H:%M:%S", time.localtime(time.time())))
    # Full evaluation over the complete dataset.
    full_evaluate2()
    end_time_end = time.time()
    print(('MLR 模型训练运行时间: {:.0f}分 {:.0f}秒'.format((end_time_end - start_time_start) // 60,
                                                 (end_time_end - start_time_start) % 60)))
    print(('{:.0f}小时'.format((end_time_end - start_time_start) // 60 / 60)))















