Files
2026-05-13 11:03:00 +08:00

147 lines
5.1 KiB
SQL
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
-- =====================================================================
-- @Name: KUDU-D-SQL-{表名}-ETL
-- @Version: 2.0
-- @Desc: Kudu (via Impala) ETL 数据处理模板(临时表链式处理)
-- @TargetDatabase: Apache Kudu (via Impala)
-- @说明: 统一规范:禁止 CTE每步物化为临时表先 DROP 再 CREATE
-- 最后一步用 UPSERT INTO 写入 Kudu 目标表
-- =====================================================================
-- ============================================================================
-- Step01: 基础清洗与过滤
-- ============================================================================
-- 说明:从源表读取数据,进行基础过滤和清洗
-- 输入:{源表名}
-- 输出:${db_tmp_env}.tmp_xxx_01
DROP TABLE IF EXISTS ${db_tmp_env}.tmp_xxx_01;
CREATE TABLE ${db_tmp_env}.tmp_xxx_01 AS
SELECT
order_id,
user_id,
dept_id,
product_id,
quantity,
amount,
status,
stat_date
FROM db_name.source_table
WHERE stat_date = '${day_id}'
AND status IN ('completed', 'shipped') -- 业务过滤
AND amount > 0 -- 数据质量过滤
AND user_id IS NOT NULL; -- NULL过滤
-- ============================================================================
-- Step02: 多表关联与维度补全
-- ============================================================================
-- 说明:关联维度表,补全业务属性字段
-- 输入:${db_tmp_env}.tmp_xxx_01, dim_department, dim_product
-- 输出:${db_tmp_env}.tmp_xxx_02
DROP TABLE IF EXISTS ${db_tmp_env}.tmp_xxx_02;
CREATE TABLE ${db_tmp_env}.tmp_xxx_02 AS
SELECT
a.order_id,
a.user_id,
a.amount,
a.quantity,
b.dept_name, -- 维度补全:部门名称
c.category_name, -- 维度补全:类别名称
a.stat_date
FROM ${db_tmp_env}.tmp_xxx_01 a
LEFT JOIN db_name.dim_department b
ON a.dept_id = b.dept_id
LEFT JOIN db_name.dim_product c
ON a.product_id = c.product_id;
-- ============================================================================
-- Step03: 聚合计算与指标生成
-- ============================================================================
-- 说明:按业务维度聚合,计算统计指标
-- 输入:${db_tmp_env}.tmp_xxx_02
-- 输出:${db_tmp_env}.tmp_xxx_03
DROP TABLE IF EXISTS ${db_tmp_env}.tmp_xxx_03;
CREATE TABLE ${db_tmp_env}.tmp_xxx_03 AS
SELECT
stat_date,
dept_name,
category_name,
COUNT(*) AS record_count, -- 记录数
COUNT(DISTINCT user_id) AS unique_users, -- 去重用户数
SUM(amount) AS total_amount, -- 总金额
SUM(quantity) AS total_quantity, -- 总数量
AVG(amount) AS avg_amount, -- 平均金额
MAX(amount) AS max_amount -- 最大金额
FROM ${db_tmp_env}.tmp_xxx_02
GROUP BY stat_date, dept_name, category_name;
-- ============================================================================
-- Step04: 最终输出写入 Kudu 目标表
-- ============================================================================
-- 说明:使用 UPSERT 写入 Kudu 目标表
-- 输入:${db_tmp_env}.tmp_xxx_03
-- 输出Kudu 目标表
-- 方式1UPSERT推荐主键存在则更新不存在则插入
UPSERT INTO ${db_eda_env}.target_table
SELECT
-- 主键字段Kudu 表必须有主键)
dept_name,
category_name,
stat_date,
-- 指标字段
record_count,
unique_users,
total_amount,
total_quantity,
avg_amount,
max_amount,
-- 技术字段
NOW() AS etl_time -- 数据加工时间
FROM ${db_tmp_env}.tmp_xxx_03;
-- 方式2需要全量刷新时先删后插
-- DELETE FROM ${db_eda_env}.target_table WHERE stat_date = '${day_id}';
-- INSERT INTO ${db_eda_env}.target_table
-- SELECT ... FROM ${db_tmp_env}.tmp_xxx_03;
-- ============================================================================
-- 关键规则说明
-- ============================================================================
/*
1. 禁止使用 CTE (WITH 子句)
- 每个步骤必须物化为临时表
- 原因:便于调试、断点续跑、统一编码规范
2. 先 DROP 再 CREATE
- 每个临时表创建前必须先 DROP TABLE IF EXISTS
- 原因:防止表已存在导致失败
3. Kudu 写入方式
- 推荐 UPSERT INTOKudu 核心优势)
- 主键存在 → 更新(整行替换)
- 主键不存在 → 插入新行
- 需要全量刷新 → 先 DELETE 再 INSERT
4. Kudu 表约束
- 不支持 INSERT OVERWRITE用 UPSERT 或 DELETE + INSERT 替代)
- 必须有 PRIMARY KEY
- 主键列不能为 NULL
- 支持 UPDATE 和 DELETE
5. 过滤条件前置
- 所有过滤在最早阶段应用
- 减少中间数据量
6. 临时表命名规范
- 格式tmp_{业务简称}_{步骤序号}
7. Kudu 特有注意事项
- CONCAT 只接受 2 个参数,多参数用 CONCAT_WS
- 不支持 collect_list/collect_set用 GROUP_CONCAT 替代
- 近似去重用 NDV() 函数
*/