Files
2026-05-13 11:03:00 +08:00

139 lines
4.7 KiB
SQL
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
-- =====================================================================
-- @Name: HIVE-D-SQL-{表名}-ETL
-- @Version: 1.0
-- @Desc: Hive ETL 数据处理模板(临时表链式处理)
-- @TargetDatabase: Hive
-- @说明: 和 Spark 类似,禁止 CTE每步物化为临时表
-- =====================================================================
-- ============================================================================
-- Step01: 基础清洗与过滤
-- ============================================================================
-- 说明:从源表读取数据,进行基础过滤和清洗
-- 输入:{源表名}
-- 输出tmp_etl_01
DROP TABLE IF EXISTS ${db_tmp_env}.tmp_xxx_01;
CREATE TABLE ${db_tmp_env}.tmp_xxx_01 AS
SELECT
id,
name,
amount,
status,
dept_id,
category_id,
created_at,
day_id
FROM db_name.source_table
WHERE day_id = '${day_id}' -- 分区过滤(必须)
AND status IN ('active', 'valid') -- 业务过滤
AND amount > 0 -- 数据质量过滤
AND id IS NOT NULL; -- NULL过滤
-- ============================================================================
-- Step02: 多表关联与维度补全
-- ============================================================================
-- 说明:关联维度表,补全业务属性字段
-- 输入tmp_xxx_01, dim_department, dim_category
-- 输出tmp_xxx_02
DROP TABLE IF EXISTS ${db_tmp_env}.tmp_xxx_02;
CREATE TABLE ${db_tmp_env}.tmp_xxx_02 AS
SELECT
a.id,
a.name,
a.amount,
a.status,
b.dept_name, -- 维度补全:部门名称
c.category_name, -- 维度补全:类别名称
a.created_at,
a.day_id
FROM ${db_tmp_env}.tmp_xxx_01 a
LEFT JOIN db_name.dim_department b
ON a.dept_id = b.dept_id
AND b.day_id = '${day_id}' -- 维度表分区过滤
LEFT JOIN db_name.dim_category c
ON a.category_id = c.category_id
AND c.day_id = '${day_id}'; -- 维度表分区过滤
-- ============================================================================
-- Step03: 聚合计算与指标生成
-- ============================================================================
-- 说明:按业务维度聚合,计算统计指标
-- 输入tmp_xxx_02
-- 输出tmp_xxx_03
DROP TABLE IF EXISTS ${db_tmp_env}.tmp_xxx_03;
CREATE TABLE ${db_tmp_env}.tmp_xxx_03 AS
SELECT
day_id,
dept_name,
category_name,
COUNT(*) AS record_count, -- 记录数
COUNT(DISTINCT id) AS unique_count, -- 唯一计数
SUM(amount) AS total_amount, -- 总金额
AVG(amount) AS avg_amount, -- 平均金额
MAX(amount) AS max_amount, -- 最大金额
MIN(amount) AS min_amount -- 最小金额
FROM ${db_tmp_env}.tmp_xxx_02
GROUP BY day_id, dept_name, category_name;
-- ============================================================================
-- Step04: 最终输出写入目标表
-- ============================================================================
-- 说明:补全目标表标准字段,写入结果表
-- 输入tmp_xxx_03
-- 输出:目标表
INSERT OVERWRITE TABLE ${db_eda_env}.target_table
PARTITION (day_id = '${day_id}')
SELECT
-- 业务字段
dept_name,
category_name,
record_count,
unique_count,
total_amount,
avg_amount,
max_amount,
min_amount,
-- 技术字段
current_timestamp() AS etl_time, -- 数据加工时间
'${day_id}' AS stat_date; -- 统计日期
-- ============================================================================
-- 关键规则说明
-- ============================================================================
/*
1. 禁止使用 CTE (WITH 子句)
- 每个步骤必须物化为临时表
- 原因Hive CTE 可能在某些版本有性能问题
- 物化临时表便于调试和断点续跑
2. 先 DROP 再 CREATE
- 每个临时表创建前必须先 DROP TABLE IF EXISTS
- 原因:防止表已存在导致失败
3. 分区过滤必须前置
- 所有源表和维度表查询必须带 day_id 过滤
- 原因:避免全表扫描,提升性能
4. JOIN 条件下推
- 维度表关联时带上分区过滤条件
- 原因:减少关联数据量
5. 临时表命名规范
- 格式tmp_{业务简称}_{步骤序号}
- 示例tmp_order_stats_01, tmp_order_stats_02
6. 目标表写入规范
- 使用 INSERT OVERWRITE覆盖写入幂等
- 明确指定分区
- 补全技术字段etl_time 等)
7. 存储格式建议
- 临时表:默认格式即可(中间结果不需要优化存储)
- 如需优化STORED AS ORC
*/