vintage整理
--------仅用于个人学习知识整理和sas/R语言/python代码整理
####1 . 前言
Vintage表,将不同时间层面的顾客拉平到同一时间周期上进行比较,观察不同入口时间的顾客在不同生命周期上的表现。
vintage一般有三种用法:
1.横看:得到同一时间入口的顾客,在不同生命周期上回来购买的表现
2.竖看:不同时间入口客人质量的差别
3.斜看:可以直观地看出不同时期公司的营销活动的效果
下图是一个实例(以下数字为模拟数据)
2. 代码分享
2.1 大促期间顾客vintage
需求:需要分新客的入口产品统计,首单购买后的mob时间段内,二回的人数,花费等指标
注意点:
2.1.1. 这里多了一个大促时间段内回柜的概念(mob0.5),以及mob1及之后的时间判定是以大促结束时间求间隔的
2.1.2. 二回的定义会有修改:因为给品牌做的目标是提升新客二回,在看二回的时候,只想看二回的时间段-----所以就只看顾客第一次及第二次回来的交易
def vintage(promotion_start,promotion_end,sales_start,sales_end,repeat_flag):###加一步判断 要算回来一次即可还是算多次if repeat_flag=='repeat_once':trans_cus=trans_cus_raw\.join(first_date,['md5'],'left')\.withColumn('first_purchase_date',when(col('mindate_fromtrans')<col('first_purchase_date'),col('mindate_fromtrans')).\otherwise(col('first_purchase_date')))\.withColumn('rnk',dense_rank().over(Window.partitionBy('md5').orderBy('purchase_date')))\.where(col('rnk')<=2)else:trans_cus=trans_cus_raw\.join(first_date,['md5'],'left')\.withColumn('first_purchase_date',when(col('mindate_fromtrans')<col('first_purchase_date'),col('mindate_fromtrans')).otherwise(col('first_purchase_date')))#########销售占比大于1%sales_per=trans_cus\.where(col('purchase_date').between(sales_start,sales_end))\.groupBy('productname_std').agg(sum('amount').alias('sales'))\.withColumn('sales0',sum('sales').over(Window.partitionBy().orderBy()))\.withColumn('per',col('sales')/col('sales0')).where(col('per')>=0.01)\.agg(collect_set('productname_std')).collect()[0][0]###首单 及 购买产品first_pur=trans_cus\.where(col('purchase_date')==col('first_purchase_date'))\.where(col('first_purchase_date').between(promotion_start,promotion_end))\.groupBy('md5','productname_std').count().drop('count').where(col('productname_std').isin(sales_per))###匹配trans 计算mob ###在大促时间内回柜的人 记为0.5mob_trans=trans_cus\.groupBy('md5','purchase_date','first_purchase_date')\.agg(sum('amount').alias('amount'),sum('quantity').alias('quantity'))\.join(first_pur,['md5'],'inner')\.withColumn('mob_raw',floor(datediff(col('purchase_date'),lit(promotion_end))/7))\.withColumn('mob',when((col('mob_raw')<=0) & (col('purchase_date')==col('first_purchase_date')),0)\.otherwise(when((col('mob_raw')<=0) & (col('purchase_date')<=promotion_end),0.5).otherwise(col('mob_raw')+lit(1))))\.withColumn('promotion_end',lit(promotion_end))\.withColumn('period_start',when(col('mob')<1,promotion_start).otherwise(expr('date_add(promotion_end,7*(mob-1))')))\.withColumn('period_end',when(col('mob')<1,promotion_end).otherwise(expr('date_add(promotion_end,7*(mob))')))
2.2 正常vintage的变种
需求的更改:
2.2.1. mob月份算的是自然月/ 差额周
2.2.2. 计算二回的时候 回来一次还是回来多次
def vintage(repeat_flag,time_flag):
###加一步判断 要算回来一次即可还是算多次if repeat_flag=='repeat_once':trans_input=trans_input\.withColumn('rnk',dense_rank().over(Window.partitionBy('md5').orderBy('purchase_date')))\.where(col('rnk')<=2)\.withColumn('first_purchase_date',col('first_purchase_date').cast('date'))\.withColumn('purchase_year',year(col('purchase_date')))\.withColumn('purchase_month',month(col('purchase_date')))\.withColumn('first_year',year(col('first_purchase_date')))\.withColumn('first_month',month(col('first_purchase_date')))else:trans_input=trans_input\.withColumn('first_purchase_date',col('first_purchase_date').cast('date'))\.withColumn('purchase_year',year(col('purchase_date')))\.withColumn('purchase_month',month(col('purchase_date')))\.withColumn('first_year',year(col('first_purchase_date')))\.withColumn('first_month',month(col('first_purchase_date')))###自然月/mob差额月if time_flag='natural':total = trans_input\.withColumn('mob_months',12*(col('purchase_year')-col('first_year'))+col('purchase_month')-col('first_month'))\.withColumn('mob_months',when((col('purchase_date')==col('first_purchase_date')) & (col('rnk')==1),0).\otherwise(when((col('mob_months')==0) & (col('rnk')==2),0.5).otherwise(col('mob_months'))))\.withColumn('firstmonth',substring('first_purchase_date',1,7))else:####这里如果是mob 月 下边除以7应该改为30total = trans_cus\.withColumn('first_purchase_date',col('first_purchase_date').cast('date'))\.withColumn('mob_months',floor(datediff(col('purchase_date'),col('first_purchase_date'))/7))\.withColumn('mob_months',when((col('purchase_date')==col('first_purchase_date')) & (col('rnk')==1),0).\otherwise(when((col('mob_months')==0) & (col('rnk')==2),0.5).otherwise(col('mob_months'))))\.withColumn('firstmonth',substring('first_purchase_date',1,7))
2.3 基于2.1入口产品的修改
需求的更改:
2.3.1. 入口产品是一个集合,计算这个集合进来的顾客的回柜
trans_cus=trans_cus_raw\
.join(first_date,['md5'],'left')\
.withColumn('first_purchase_date',when(col('mindate_fromtrans')<col('first_purchase_date'),col('mindate_fromtrans')).\otherwise(col('first_purchase_date')))\
.withColumn('rnk',dense_rank().over(Window.partitionBy('md5').orderBy('purchase_date')))\
.where(col('rnk')<=2)
######
###销售占比大于1%
sales_per=trans_cus\.where(col('productname_std').rlike('防晒乳'))\.groupBy('productname_std').agg(sum('amount').alias('sales'))\.withColumn('sales0',sum('sales').over(Window.partitionBy().orderBy()))\.withColumn('per',col('sales')/col('sales0')).where(col('per')>=0.01)\.agg(collect_set('productname_std')).collect()[0][0]###首单 及 购买产品
first_pur=trans_cus\.where(col('productname_std').isin(sales_per))\.where(col('purchase_date')==col('first_purchase_date'))\.groupBy('md5').count().drop('count')###匹配trans 计算mob
###在大促时间内回柜的人 记为0.5
mob_trans=trans_cus\.groupBy('md5','purchase_date','first_purchase_date')\.agg(sum('amount').alias('amount'),sum('quantity').alias('quantity'))\.join(first_pur,['md5'],'inner')\.withColumn('mob',floor(datediff(col('purchase_date'),col('first_purchase_date'))/30))\.withColumn('firstmonth',substring('first_purchase_date',1,7))
2.4 变种2 mob改为一年内的自然周
需求的更改:
2.4.1. mob月份算的是自然周—主要用到了weekofyear
2.4.2 入口月份变为自然周
###### mob变成自然周
trans_raw=spark.read.parquet(trans_path2)\.where(col('cut')!='中小样')\.where(col('flag')==1).where(col('group_sale')==0)\.withColumn('week',weekofyear('purchase_date'))week_add=trans_raw\.groupBy('purchase_year','purchase_month','week')\.count().drop('count')\.withColumn('purchase_week',when((col('week')==52) & (col('purchase_month')==1),0).otherwise(col('week')))\.withColumn('purchase_week',when((col('week')==1) & (col('purchase_month')==12),53).otherwise(col('purchase_week')))\.withColumn('lag_week',lag('purchase_week').over(Window.partitionBy().orderBy('purchase_year','purchase_month','purchase_week')))\.withColumn('week_diff',when(-col('lag_week')+col('purchase_week')==-51,1).\otherwise(when(-col('lag_week')+col('purchase_week')==-52,0).otherwise(-col('lag_week')+col('purchase_week')))).fillna(1)\.orderBy('purchase_year','purchase_month','purchase_week')\.withColumn('purchase_week1',sum('week_diff').over(Window.orderBy('purchase_year','purchase_month','purchase_week').rangeBetween(Window.unboundedPreceding,0)))trans_raw=trans_raw\.join(week_add,['purchase_year','purchase_month','week'],'left')###人天销售大于0
cus_raw=trans_raw\.groupBy('customer_id','purchase_year','purchase_month','purchase_week1','purchase_date','first_purchase_date','productname_std')\.agg(sum('amount').alias('amount'),sum('quantity').alias('quantity'))\.withColumn('sales0',sum('amount').over(Window.partitionBy('customer_id','purchase_date').orderBy('customer_id')))\.where(col('sales0')>0)first_date=cus_raw\.groupBy('customer_id')\.agg(min('purchase_date').alias('mindate_fromtrans'))cus=cus_raw\.join(first_date,['customer_id'],'left')\.withColumn('first_purchase_date',when(col('mindate_fromtrans')<col('first_purchase_date'),col('mindate_fromtrans')).otherwise(col('first_purchase_date')))###trans首单时间
cus=cus\.withColumn('rnk',dense_rank().over(Window.partitionBy('customer_id').orderBy('purchase_date')))\
# .where(col('rnk')<=2)total = cus\.withColumn('first_purchase_date',col('first_purchase_date').cast('date'))\.withColumn('first_year',year(col('first_purchase_date')))\.withColumn('first_month',month(col('first_purchase_date')))\.withColumn('first_week',weekofyear(col('first_purchase_date')))\.join(week_add.selectExpr('purchase_year as first_year','purchase_month as first_month','week as first_week','purchase_week1 as first_week1'),['first_year','first_month','first_week'],'left')\.where(col('first_purchase_date')>='2018-01-01')\.where(~col('first_week1').isNull())\.withColumn('mob_weeks',col('purchase_week1')-col('first_week1'))\.withColumn('mob_weeks',when((col('purchase_date')==col('first_purchase_date')) & (col('rnk')==1),0).\otherwise(when((col('mob_weeks')==0) & (col('rnk')==2),0.5).otherwise(col('mob_weeks'))))\.withColumn('firstweek',col('first_week1'))summary=total\.groupBy('mob_weeks','firstweek')\.agg(countDistinct('customer_id').alias('cnt'),sum('amount').alias('sales'),sum('quantity').alias('quantity'),max('purchase_date').alias('max_mobdate'),min('purchase_date').alias('min_mobdate'))