涉及知识点:concat、concat_ws、collect_set、date_add、next_day、last_day、date_format、hive动态分区、if函数
1、dws层dws_uv_detail_day、dws_uv_detail_wk、dws_uv_detail_mn,ads层的ads_uv_count
--需求一:用户活跃主题 --dws层的日活跃(DAU) create external table dws_uv_detail_day( `mid_id` string COMMENT '设备唯一标识', `user_id` string COMMENT '用户标识', `version_code` string COMMENT '程序版本号', `version_name` string COMMENT '程序版本名', `lang` string COMMENT '系统语言', `source` string COMMENT '渠道号', `os` string COMMENT '安卓系统版本', `area` string COMMENT '区域', `model` string COMMENT '手机型号', `brand` string COMMENT '手机品牌', `sdk_version` string COMMENT 'sdkVersion', `gmail` string COMMENT 'gmail', `height_width` string COMMENT '屏幕宽高', `app_time` string COMMENT '客户端日志产生时的时间', `network` string COMMENT '网络模式', `lng` string COMMENT '经度', `lat` string COMMENT '纬度' )partitioned by(dt string) insert overwrite table dws_uv_detail_day partition(dt='2022-04-12') select mid_id, concat_ws('|', collect_set(user_id)) user_id, concat_ws('|', collect_set(version_code)) version_code, concat_ws('|', collect_set(version_name)) version_name, concat_ws('|', collect_set(lang))lang, concat_ws('|', collect_set(source)) source, concat_ws('|', collect_set(os)) os, concat_ws('|', collect_set(area)) area, concat_ws('|', collect_set(model)) model, concat_ws('|', collect_set(brand)) brand, concat_ws('|', collect_set(sdk_version)) sdk_version, concat_ws('|', collect_set(gmail)) gmail, concat_ws('|', collect_set(height_width)) height_width, concat_ws('|', collect_set(app_time)) app_time, concat_ws('|', collect_set(network)) network, concat_ws('|', collect_set(lng)) lng, concat_ws('|', collect_set(lat)) lat from dwd_start_log where dt='2022-04-12' group by mid_id; -- next_day 获取指定时间的下周几是几号 select next_day('2022-05-03','mo') -- date_add 对指定时间进行加减操作 select date_add('2022-05-03',+7) -- 本周一到周日 select date_add(next_day('2022-04-02','MO'),-7) select date_add(next_day('2022-04-02','MO'),-1) --dws层的周活跃 create external table dws_uv_detail_wk( `mid_id` string COMMENT '设备唯一标识', `user_id` string COMMENT '用户标识', `version_code` string COMMENT '程序版本号', `version_name` string COMMENT '程序版本名', `lang` string COMMENT '系统语言', `source` string COMMENT '渠道号', `os` string COMMENT '安卓系统版本', `area` string COMMENT '区域', `model` string COMMENT '手机型号', `brand` string COMMENT '手机品牌', `sdk_version` string COMMENT 'sdkVersion', `gmail` string COMMENT 'gmail', `height_width` string COMMENT '屏幕宽高', `app_time` string COMMENT '客户端日志产生时的时间', `network` string COMMENT '网络模式', `lng` string COMMENT '经度', `lat` string COMMENT '纬度', `monday_date` string COMMENT '周一日期', `sunday_date` string COMMENT '周日日期' ) COMMENT '活跃用户按周明细' PARTITIONED BY (`wk_dt` string) --将动态分区设置为非严格模式 set hive.exec.dynamic.partition.mode=nonstrict; insert overwrite table dws_uv_detail_wk partition(wk_dt) select mid_id, concat_ws('|', collect_set(user_id)) user_id, concat_ws('|', collect_set(version_code)) version_code, concat_ws('|', collect_set(version_name)) version_name, concat_ws('|', collect_set(lang)) lang, concat_ws('|', collect_set(source)) source, concat_ws('|', collect_set(os)) os, concat_ws('|', collect_set(area)) area, concat_ws('|', collect_set(model)) model, concat_ws('|', collect_set(brand)) brand, concat_ws('|', collect_set(sdk_version)) sdk_version, concat_ws('|', collect_set(gmail)) gmail, concat_ws('|', collect_set(height_width)) height_width, concat_ws('|', collect_set(app_time)) app_time, concat_ws('|', collect_set(network)) network, concat_ws('|', collect_set(lng)) lng, concat_ws('|', collect_set(lat)) lat, date_add(next_day('2022-04-12','MO'),-7), date_add(next_day('2022-04-12','MO'),-1), concat(date_add(next_day('2022-04-12','MO'),-7), '_',date_add(next_day('2022-04-12','MO'),-1)) from dws_uv_detail_day where dt >= date_add(next_day('2022-04-12','MO'),-7) and dt<= date_add(next_day('2022-04-12','MO'),-1) group by mid_id; select * from dws_uv_detail_wk select wk_dt,count(*) from dws_uv_detail_wk group by wk_dt --dws层的月活跃 drop table dws_uv_detail_mn create external table dws_uv_detail_mn( `mid_id` string COMMENT '设备唯一标识', `user_id` string COMMENT '用户标识', `version_code` string COMMENT '程序版本号', `version_name` string COMMENT '程序版本名', `lang` string COMMENT '系统语言', `source` string COMMENT '渠道号', `os` string COMMENT '安卓系统版本', `area` string COMMENT '区域', `model` string COMMENT '手机型号', `brand` string COMMENT '手机品牌', `sdk_version` string COMMENT 'sdkVersion', `gmail` string COMMENT 'gmail', `height_width` string COMMENT '屏幕宽高', `app_time` string COMMENT '客户端日志产生时的时间', `network` string COMMENT '网络模式', `lng` string COMMENT '经度', `lat` string COMMENT '纬度' ) COMMENT '活跃用户按月明细' PARTITIONED BY (`mn` string) --测试date_format函数 select date_format('2022-05-03','yyyy-MM') insert overwrite table dws_uv_detail_mn partition(mn) select mid_id, concat_ws('|', collect_set(user_id)) user_id, concat_ws('|', collect_set(version_code)) version_code, concat_ws('|', collect_set(version_name)) version_name, concat_ws('|', collect_set(lang)) lang, concat_ws('|', collect_set(source)) source, concat_ws('|', collect_set(os)) os, concat_ws('|', collect_set(area)) area, concat_ws('|', collect_set(model)) model, concat_ws('|', collect_set(brand)) brand, concat_ws('|', collect_set(sdk_version)) sdk_version, concat_ws('|', collect_set(gmail)) gmail, concat_ws('|', collect_set(height_width)) height_width, concat_ws('|', collect_set(app_time)) app_time, concat_ws('|', collect_set(network)) network, concat_ws('|', collect_set(lng)) lng, concat_ws('|', collect_set(lat)) lat, date_format('2022-04-12','yyyy-MM') from dws_uv_detail_day where date_format(dt,'yyyy-MM')=date_format('2022-04-12','yyyy-MM') group by mid_id; select * from dws_uv_detail_mn --ads层的活跃设备统计结果表 create external table ads_uv_count( `dt` string COMMENT '统计日期', `day_count` bigint COMMENT '当日用户数量', `wk_count` bigint COMMENT '当周用户数量', `mn_count` bigint COMMENT '当月用户数量', `is_weekend` string COMMENT 'Y,N是否是周末,用于得到本周最终结果', `is_monthend` string COMMENT 'Y,N是否是月末,用于得到本月最终结果' ) COMMENT '活跃设备数' row format delimited fields terminated by 't' --测试if函数 select if('2022-05-08'=date_add(next_day('2022-05-03','mo'),-1),'Y','N') --last_day()返回本月最后一天 select last_day('2022-05-08') insert into table ads_uv_count select '2022-04-12' dt,t1.c,t2.cc,t3.ccc, if('2022-04-12'=date_add(next_day('2022-04-12','mo'),-1),'Y','N'), if('2022-04-12'=last_day('2022-04-12'),'Y','N') from (select '2022-04-12' dt,count(*) c from dws_uv_detail_day where dt='2022-04-12') t1 left join (select '2022-04-12' dt,count(*) cc from dws_uv_detail_wk where wk_dt = concat(date_add(next_day('2022-04-12','mo'),-7), '_' ,date_add(next_day('2022-04-12','mo'),-1))) t2 on t1.dt = t2.dt left join (select '2022-04-12' dt,count(*) ccc from dws_uv_detail_mn where mn=date_format('2022-04-12','yyyy-MM')) t3 on t2.dt = t3.dt需求二 用户新增主题(每日新增)
--dws层的每日新增 create external table dws_new_mid_day ( `mid_id` string COMMENT '设备唯一标识', `user_id` string COMMENT '用户标识', `version_code` string COMMENT '程序版本号', `version_name` string COMMENT '程序版本名', `lang` string COMMENT '系统语言', `source` string COMMENT '渠道号', `os` string COMMENT '安卓系统版本', `area` string COMMENT '区域', `model` string COMMENT '手机型号', `brand` string COMMENT '手机品牌', `sdk_version` string COMMENT 'sdkVersion', `gmail` string COMMENT 'gmail', `height_width` string COMMENT '屏幕宽高', `app_time` string COMMENT '客户端日志产生时的时间', `network` string COMMENT '网络模式', `lng` string COMMENT '经度', `lat` string COMMENT '纬度', `create_date` string comment '创建时间' ) COMMENT '每日新增设备信息' select * from dws_new_mid_day insert into table dws_new_mid_day select u.mid_id, u.user_id , u.version_code , u.version_name , u.lang , u.source, u.os, u.area, u.model, u.brand, u.sdk_version, u.gmail, u.height_width, u.app_time, u.network, u.lng, u.lat, '2022-04-12' from dws_uv_detail_day u left join dws_new_mid_day n on u.mid_id = n.mid_id where u.dt = '2022-04-12' and n.mid_id is null; --ads层的每日新增统计 create external table ads_new_mid_count( `create_date` string comment '创建时间', `new_mid_count` BIGINT comment '新增设备数量' ) COMMENT '每日新增设备信息数量' row format delimited fields terminated by 't' select * from ads_new_mid_count insert into table ads_new_mid_count select create_date,count(*) from dws_new_mid_day where create_date='2022-04-12' group by create_date;需求三 用户留存主题(一日、二日、三日留存)
-- dws层用户每日新增 create external table dws_user_retention_day ( `mid_id` string COMMENT '设备唯一标识', `user_id` string COMMENT '用户标识', `version_code` string COMMENT '程序版本号', `version_name` string COMMENT '程序版本名', `lang` string COMMENT '系统语言', `source` string COMMENT '渠道号', `os` string COMMENT '安卓系统版本', `area` string COMMENT '区域', `model` string COMMENT '手机型号', `brand` string COMMENT '手机品牌', `sdk_version` string COMMENT 'sdkVersion', `gmail` string COMMENT 'gmail', `height_width` string COMMENT '屏幕宽高', `app_time` string COMMENT '客户端日志产生时的时间', `network` string COMMENT '网络模式', `lng` string COMMENT '经度', `lat` string COMMENT '纬度', `create_date` string comment '设备新增时间', `retention_day` int comment '截止当前日期留存天数' ) COMMENT '每日用户留存情况' PARTITIONED BY (`dt` string) select * from dws_user_retention_day -- 求出4.1——1日、2日、3日留存情况 insert overwrite table dws_user_retention_day partition(dt='2022-04-05') select nm.mid_id, nm.user_id, nm.version_code, nm.version_name, nm.lang, nm.source, nm.os, nm.area, nm.model, nm.brand, nm.sdk_version, nm.gmail, nm.height_width, nm.app_time, nm.network, nm.lng, nm.lat, nm.create_date, 1 retention_day from dws_uv_detail_day ud join dws_new_mid_day nm on ud.mid_id =nm.mid_id where ud.dt='2022-04-05' and nm.create_date=date_add('2022-04-05',-1) union all select nm.mid_id, nm.user_id , nm.version_code , nm.version_name , nm.lang , nm.source, nm.os, nm.area, nm.model, nm.brand, nm.sdk_version, nm.gmail, nm.height_width, nm.app_time, nm.network, nm.lng, nm.lat, nm.create_date, 2 retention_day from dws_uv_detail_day ud join dws_new_mid_day nm on ud.mid_id =nm.mid_id where ud.dt='2022-04-05' and nm.create_date=date_add('2022-04-05',-2) union all select nm.mid_id, nm.user_id, nm.version_code, nm.version_name, nm.lang, nm.source, nm.os, nm.area, nm.model, nm.brand, nm.sdk_version, nm.gmail, nm.height_width, nm.app_time, nm.network, nm.lng, nm.lat, nm.create_date, 3 retention_day from dws_uv_detail_day ud join dws_new_mid_day nm on ud.mid_id =nm.mid_id where ud.dt='2022-04-05' and nm.create_date=date_add('2022-04-05',-3); --ads用户留存统计表 create external table ads_user_retention_day_count( `create_date` string comment '设备新增日期', `retention_day` int comment '截止当前日期留存天数', `retention_count` bigint comment '留存数量' ) COMMENT '每日用户留存情况' row format delimited fields terminated by 't' select * from ads_user_retention_day_count insert into table ads_user_retention_day_count select create_date, retention_day, count(*) retention_count from dws_user_retention_day where dt='2022-04-05' group by create_date,retention_day; --asd 用户留存率 create external table ads_user_retention_day_rate ( `stat_date` string comment '统计日期', `create_date` string comment '设备新增日期', `retention_day` int comment '截止当前日期留存天数', `retention_count` bigint comment '留存数量', `new_mid_count` bigint comment '当日设备新增数量', `retention_ratio` decimal(10,2) comment '留存率' ) COMMENT '每日用户留存情况' row format delimited fields terminated by 't' insert into table ads_user_retention_day_rate select '2022-04-05', t1.create_date, t1.retention_day, t1.retention_count, t2.new_mid_count, t1.retention_count / t2.new_mid_count * 100 from ads_user_retention_day_count t1 join ads_new_mid_count t2 on t1.create_date = t2.create_date where date_add(t1.create_date,t1.retention_day) = '2022-04-05' select * from ads_user_retention_day_rate需求四 沉默用户主题
--ads层的沉默用户表 create external table ads_silent_count( `dt` string COMMENT '统计日期', `silent_count` bigint COMMENT '沉默设备数' ) row format delimited fields terminated by 't' insert into table ads_silent_count select '2022-04-12'dt,count(*) from ( select mid_id,count(mid_id) cc from dws_uv_detail_day where dt<='2022-04-12' group by mid_id having count(mid_id)=1 and max(dt)需求五 本周回流用户 --ads层的本周回流 create external table ads_back_count( `dt` string COMMENT '统计日期', `wk_dt` string COMMENT '统计日期所在周', `wastage_count` bigint COMMENT '回流设备数' ) row format delimited fields terminated by 't' insert into table ads_back_count select '2022-04-05' dt,concat(date_add(next_day('2022-04-05','mo'),-7), '_',date_add(next_day('2022-04-05','mo'),-1)) wk_dt,count(*) from --select t.mid_id from ( select mid_id from dws_uv_detail_wk where wk_dt= concat(date_add(next_day('2022-04-05','mo'),-7), '_',date_add(next_day('2022-04-05','mo'),-1)) ) t left join ( select mid_id from dws_new_mid_day where create_date>=date_add(next_day('2022-04-05','mo'),-7) and create_date<=date_add(next_day('2022-04-05','mo'),-1) ) tt on t.mid_id=tt.mid_id left join ( select mid_id from dws_uv_detail_wk where wk_dt= concat(date_add(next_day('2022-04-05','mo'),-14), '_',date_add(next_day('2022-04-05','mo'),-8)) ) ttt on t.mid_id=ttt.mid_id where tt.mid_id is null and ttt.mid_id is null select * from ads_back_count需求六 流失用户数--ads层的流失用户数 create external table ads_wastage_count( `dt` string COMMENT '统计日期', `wastage_count` bigint COMMENT '流失设备数' )row format delimited fields terminated by 't' select * from ads_wastage_count insert into table ads_wastage_count select '2022-04-12',count(*) from ( select mid_id from dws_uv_detail_day group by mid_id having max(dt)<=date_add('2022-04-12',-7) ) tt