需求
hive中有一张表存储所有省市地区信息或者公司组织信息, 查询所有最末枝的地区/组织,如下所示, 查询结果应该是沙河镇, 马池口镇, 中关村,上地,济南市
......创建hive 模拟表
create table area (id int, name string, parent_id int);
with area_info as
(select 1 as id, '北京市' as name, 0 as parent_id union all
select 2 as id, '山东省' as name, 0 as parent_id union all
select 3 as id, '昌平区' as name, 1 as parent_id union all
select 4 as id, '海淀区' as name, 1 as parent_id union all
select 5 as id, '沙河镇' as name, 3 as parent_id union all
select 6 as id, '马池口镇' as name, 3 as parent_id union all
select 7 as id, '中关村' as name, 4 as parent_id union all
select 8 as id, '上地' as name, 4 as parent_id union all
select 9 as id, '烟台市' as name, 2 as parent_id union all
select 10 as id, '即墨区' as name, 9 as parent_id union all
select 11 as id, '牟平区' as name, 9 as parent_id union all
select 12 as id, '济南市' as name, 2 as parent_id
insert overwrite table area select * from area_info;
重点是SQL
通过Hive的CTE实现组织树的遍历
假如有N层架构, 就要写N-1个CTE子句
需要注意保留没有子节点的父节点
with p1 as (select t1.* from area t1 --子
join area t2 --父
on t1.parent_id = t2.id
where t2.name in ('北京市', '山东省')),
p2 as (
--所有子节点
select t3.* from area t3 --子
join p1 --父
on t3.parent_id = p1.id
union all
--没有子节点的父节点
select p1.* from area t3 --子
right join p1 --父
on t3.parent_id = p1.id
where t3.id is null
select * from p2 ;
p2.id
p2.name
p2.parent_id