hive 常用函数
作者:小教学发布时间:2023-10-03分类:程序开发学习浏览:212
1.分位数
percentile_approx(DOUBLE col, p [, B]) Returns an approximate pth percentile of a numeric column (including floating point types) in the group
含义: 在col列中返回p%的分位数
select
percentile_approx( arr_id , 0.5 )
from
(
select
arr_id
from
(
select
array(1, 2, 3, 4, 5, 6, 7, 8, 9, 1000) as arr
) a lateral view explode(arr) tt as arr_id
) a |

2.url解析
解码url: URL转码,encodeURI,encodeURIComponent — 在线工具
解析json: 在线JSON校验格式化工具(Be JSON)
上报:
:path: /log/web?0000171612422098683https%3A%2F%2Fwww.bilibili.com%2Fbangumi%2Flist%2Fsl17662%3Ffrom_spmid%3Dpgc.cinema-tab.0.0%26intentFrom%3D15%26native.theme%3D1%26navhide%3D1%26share_medium%3Dandroid%26share_plat%3Dandroid%26share_source%3DCOPY%26share_tag%3Ds_i%26timestamp%3D1612343552%26unique_k%3DHKz3gy|666.49.selfDef.click_unfollow||1612422098000|0|0|980x1743|2|{%22event%22:%22click_unfollow%22,%22value%22:{%22sl_id%22:%2217662%22,%22season_id%22:35582,%22season_type%22:3,%22mid%22:36865977},%22bsource%22:%22share_source_copy_link%22}|{}|https%3A%2F%2Fm.bilibili.com%2F|FF7C614F-2539-A3E3-1AE8-E4265F23111B03042infoc|zh-CN|null
msg:
{%22event%22:%22click_season%22,%22value%22:{%22sl_id%22:%2218665%22,%22season_id%22:32429,%22season_type%22:2,%22mid%22:179753067},%22bsource%22:%22%22}
解码url: https://www.sojson.com/encodeurl.html
解析json: https://www.bejson.com/
解析msg 字段:
select
*
,get_json_object(msg, '$.event') as event1
,get_json_object(msg, '$.value') as value1
,get_json_object(get_json_object(msg, '$.value'), '$.sl_id') as sub_value_sl
,get_json_object(get_json_object(msg, '$.value'), '$.season_id') as sub_value_season
,get_json_object(get_json_object(msg, '$.value'), '$.season_type') as sub_value_season_type
,get_json_object(get_json_object(msg, '$.value'), '$.mid') as sub_value_mid
from
(
select '{"event":"click_season","value":{"sl_id":"18665","season_id":32429,"season_type":2,"mid":179753067},"bsource":""}' as msg union all
select '{"event":"unload","value":{"enter":1612292714266,"leave":1612292745642},"bsource":""}' as msg union all
select '{"event":"show_hover","value":{"sl_id":"20233","season_id":33987,"season_type":2,"mid":172278354},"bsource":""}' as msg union all
select '{"event":"click_season","value":{"sl_id":"17834","season_id":34565,"season_type":2,"mid":271658710},"bsource":"search_baidu"}' as msg union all
select '{"event":"show_hover","value":{"sl_id":"17834","season_id":28274,"season_type":2,"mid":281264988},"bsource":""}' as msg
) a
limit 2000
select
*,
reflect("java.net.URLDecoder", "decode", msg, "UTF-8") as url
,get_json_object(reflect("java.net.URLDecoder", "decode", msg, "UTF-8"), '$.event') as event_type
,get_json_object(reflect("java.net.URLDecoder", "decode", msg, "UTF-8"), '$.value') as event_value
,get_json_object(get_json_object(reflect("java.net.URLDecoder", "decode", msg, "UTF-8"), '$.value'), '$.sl_id') as sub_value_sl
,get_json_object(get_json_object(reflect("java.net.URLDecoder", "decode", msg, "UTF-8"), '$.value'), '$.season_id') as sub_value_season
,get_json_object(get_json_object(reflect("java.net.URLDecoder", "decode", msg, "UTF-8"), '$.value'), '$.season_type') as sub_value_season_type
,get_json_object(get_json_object(reflect("java.net.URLDecoder", "decode", msg, "UTF-8"), '$.value'), '$.mid') as sub_value_mid
from
b_dwd.dwd_flow_web_click_l_d
WHERE
log_date between '<%= log_date %>' and '<%= log_date %>'
and substr(spm_id,1,6)= '666.49'
limit 2000 |
3.正则
滑动验证页面
select
*,
a.from_spmid rlike concat('^','dynamic') ,
a.from_spmid rlike concat('^','dt+') ,
a.from_spmid rlike '^dt'
from
(
select 'ott-platform.ott-dynamic.0.0' as from_spmid union ALL
select 'out_open_deeplink_gdtlh-and-yf-31yaz-17776718' as from_spmid union ALL
select 'out_open_deeplink_gdtlh-and-yf-31yaz-18253662' as from_spmid union ALL
select 'ott-platform.ott-dynamic.0.0' as from_spmid union ALL
select 'dynamic.video-tab.0.0' as from_spmid union ALL
select 'default-value' as from_spmid
) a
3.hsql 间隔日期补数
方法1:窗口函数
with tmp_table as (
select dt ,city , val
from
(
select '20220101' as dt ,'上海' as city , 100 val union all
select '20220102' as dt ,'上海' as city , null val union all
select '20220104' as dt ,'上海' as city , null val union all
select '20220106' as dt ,'上海' as city , 200 val
) a
)
,tmp_a as (
select dt ,city , val ,
case when val is null then 0 else 1 end val1 -- 将空值设置为0,非空值设置为1
from tmp_table
),
tmp_b as (
select dt ,city , val ,val1,
sum(val1) over(partition by '1' order by dt) as cnt_val1 -- 区分 空和费控分空分组
from tmp_a
)
select dt ,city , val ,val1,cnt_val1,
max(val) over(partition by cnt_val1 order by dt) as fill_val -- 使用开窗函数按照分组求最大值
from tmp_b
|

方法2:udf
4. 用户大会员在期的天数
with tmp_dt as ( select id,sdt,edt from ( select 1 id, '2022-02-02' as sdt, '2022-02-07' as edt union all select 1 id, '2022-02-07' as sdt, '2022-02-18' as edt union all select 1 id, '2022-02-10' as sdt, '2022-02-23' as edt ) a ) select id,base_line,max(dt) dt ,datediff(max(dt),base_line)+1 as days from ( select id ,dt,max(base_f) over (PARTITION BY id ORDER BY dt) base_line from ( select id ,dt,flag,sum(flag) over(partition by id order by concat(dt,flag) desc) as sm ,case when sum(flag) over(partition by id order by concat(dt,flag) desc)=0 then dt else 0 end as base_f from ( select id,sdt as dt,1 as flag from tmp_dt union all select id,edt,-1 as flag from tmp_dt ) a ) b ) c group by id ,base_line |

5.取每个mid同时播放的设备数分布。
已经整理好buvid、mid、开始播放时间、结束播放时间。主要背景是想看下现在“共享账号”的用户量级有多少
https://berserker.bilibili.co/?URL=http://berserker.bilibili.co/#/adhoc?sqlId=100670985
with
ogv_play as (
select mid,buvid,season_id,epid,log_date,
arch_play_timestamp as stime,
arch_play_timestamp+total_duration as etime
from bili_ogv.dwd_flow_ogv_play_app_timed_i_d
where log_date between '20230226' and '20230226' and season_id >0 and total_duration>0
union
select mid,buvid,season_id,epid,log_date,
arch_play_timestamp as stime,
arch_play_timestamp+played_duration as etime
from bili_ogv.dwd_flow_ogv_play_web_timed_i_d
where log_date between '20230226' and '20230226' and season_id >0 and played_duration>0
),
vip_user as (
select mid,log_date
from bili_ogv.dim_user_full_d
where log_date between '20230226' and '20230226' and is_vip_user = 1
),
vip_play_pay_ep as (
select a.mid,a.buvid,a.epid,a.stime,a.etime ,a.log_date
from ogv_play a
join vip_user b
on a.mid = b.mid and a.log_date = b.log_date
join (
select epid,log_date
from bili_ogv.dim_ep_av_full_d
where log_date between '20230226' and '20230226' and ep_status not in (2,14)
) c
on a.epid = c.epid and a.log_date = c.log_date
)
-- mid同时播放的设备数分布
select
i.buvid_cnt
,count(*) as mid_cnt
from
(
select
-- mid 的设备数去重 buvid
j.mid
,count(distinct j.buvid ) as buvid_cnt
from
(
select
-- 删除重复进入播放记录, 保留同时进入播放的记录
h.log_date
,h.mid
,h.buvid
,h.epid
,h.in_out_time
,h.label
,h.num -- 每个mid 同时在线计数
,h.in_out_time_rank -- 每个mid 播放开始时间排序
,h.num1 -- 用户进入次数计数
,h.diff
,h.max_num
,h.max_inc_rank -- 升序的最大值
,h.max_inc_rank_for_row -- 升序的最大值存放在每一行
,h.acc_by_mid_num1 -- 每个mid重复进入播放计数
from
(
select
g.log_date
,g.mid
,g.buvid
,g.epid
,g.in_out_time
,g.label
,g.num -- 每个mid 同时在线计数
,g.in_out_time_rank -- 每个mid 播放开始时间排序
,g.num1 -- 用户进入次数计数
,g.diff
,g.max_num
,g.max_inc_rank -- 升序的最大值
,g.max_inc_rank_for_row -- 升序的最大值存放在每一行
,sum(1) over(partition by mid,num1 ) as acc_by_mid_num1 -- 每个mid重复进入播放计数
from
(
select
-- 得到同时在线num 的升序部分
f.log_date
,f.mid
,f.buvid
,f.epid
,f.in_out_time
,f.label
,f.num -- 每个mid 同时在线计数
,f.in_out_time_rank -- 每个mid 播放开始时间排序
,f.num1 -- 用户进入次数计数
,f.diff
,f.max_num
,f.max_inc_rank -- 升序的最大值
,f.max_inc_rank_for_row -- 升序的最大值存放在每一行
from
(
select
e.log_date
,e.mid
,e.buvid
,e.epid
,e.in_out_time
,e.label
,e.num -- 每个mid 同时在线计数
,e.in_out_time_rank
,e.num1 -- 用户进入次数计数
,e.diff
,e.max_num
,e.max_inc_rank -- 升序的最大值
,max(e.max_inc_rank) over(partition by e.mid) max_inc_rank_for_row -- 升序的最大值存放在每一行
from
(
select
d.log_date
,d.mid
,d.buvid
,d.epid
,d.in_out_time
,d.label
,d.num -- 每个mid 同时在线计数
,d.in_out_time_rank
,d.num1 -- 用户进入次数计数
,d.diff
,d.max_num
,if(d.max_num=d.num,d.in_out_time_rank,0) as max_inc_rank -- 升序的最大值
from
(
select
c.log_date
,c.mid
,c.buvid
,c.epid
,c.in_out_time
,c.label
,c.num -- 每个mid 同时在线计数
,c.in_out_time_rank
,c.num1 -- 用户进入次数计数
,c.diff
,first_value(c.num) over(partition by c.mid order by c.num desc ) as max_num
from
(
select
b.log_date
,b.mid
,b.buvid
,b.epid
,b.in_out_time
,b.label
,b.num -- 每个mid 同时在线计数
,b.in_out_time_rank
,sum(b.label1) over(partition by b.mid order by b.in_out_time ) as num1 -- 用户进入次数计数
,b.in_out_time_rank-b.num as diff
from
(
select
a.log_date
,a.mid
,a.buvid
,a.epid
,a.in_out_time
,a.label
,sum(a.label) over(partition by a.mid order by a.in_out_time ) as num -- 每个mid 同时在线计数
,row_number() over(partition by a.mid order by a.in_out_time ) as in_out_time_rank
,if(a.label=-1 ,0,a.label) as label1 -- 退出分组
from
(
-- 开始时间
select
log_date
,mid
,buvid
,epid
,stime as in_out_time
,1 as label
from vip_play_pay_ep
-- where mid = 57706 -- 1645967203
union all
-- 结束时间
select
log_date
,mid
,buvid
,epid
,etime as in_out_time
,-1 as label
from vip_play_pay_ep
-- where mid = 57706 -- 1645967203
) a
) b
) c
) d
) e
) f
where f.in_out_time_rank between 1 and f.max_inc_rank_for_row
) g
) h
where h.acc_by_mid_num1 =1 -- 每个mid重复进入播放计数
) j
group by j.mid
) i
group by i.buvid_cnt
|
思路:
1.每个mid 同时在线计数
2.同时播放数提取升序部分
3.删除重复进入播放部分

5. 提取一个字符串中重复的item_id 后缀数字
select
regexp_extract(sub_str,'item_id\\":\\"([0-9]+)(\\")',1)
FROM
(
select '{"item_infos":[{"item_id":"701969"},{"item_id":"701965"}]},{"item_infos":[{"item_id":"701964"},{"item_id":"701963"}]}' as string_test
) a
lateral view explode(split(string_test,',')) t as sub_str |

标签:hive 常用函数
- 程序开发学习排行
- 最近发表


