147 lines
5.1 KiB
SQL
147 lines
5.1 KiB
SQL
-- =====================================================================
|
||
-- @Name: KUDU-D-SQL-{表名}-ETL
|
||
-- @Version: 2.0
|
||
-- @Desc: Kudu (via Impala) ETL 数据处理模板(临时表链式处理)
|
||
-- @TargetDatabase: Apache Kudu (via Impala)
|
||
-- @说明: 统一规范:禁止 CTE,每步物化为临时表,先 DROP 再 CREATE
|
||
-- 最后一步用 UPSERT INTO 写入 Kudu 目标表
|
||
-- =====================================================================
|
||
|
||
-- ============================================================================
|
||
-- Step01: 基础清洗与过滤
|
||
-- ============================================================================
|
||
-- 说明:从源表读取数据,进行基础过滤和清洗
|
||
-- 输入:{源表名}
|
||
-- 输出:${db_tmp_env}.tmp_xxx_01
|
||
|
||
DROP TABLE IF EXISTS ${db_tmp_env}.tmp_xxx_01;
|
||
CREATE TABLE ${db_tmp_env}.tmp_xxx_01 AS
|
||
SELECT
|
||
order_id,
|
||
user_id,
|
||
dept_id,
|
||
product_id,
|
||
quantity,
|
||
amount,
|
||
status,
|
||
stat_date
|
||
FROM db_name.source_table
|
||
WHERE stat_date = '${day_id}'
|
||
AND status IN ('completed', 'shipped') -- 业务过滤
|
||
AND amount > 0 -- 数据质量过滤
|
||
AND user_id IS NOT NULL; -- NULL过滤
|
||
|
||
-- ============================================================================
|
||
-- Step02: 多表关联与维度补全
|
||
-- ============================================================================
|
||
-- 说明:关联维度表,补全业务属性字段
|
||
-- 输入:${db_tmp_env}.tmp_xxx_01, dim_department, dim_product
|
||
-- 输出:${db_tmp_env}.tmp_xxx_02
|
||
|
||
DROP TABLE IF EXISTS ${db_tmp_env}.tmp_xxx_02;
|
||
CREATE TABLE ${db_tmp_env}.tmp_xxx_02 AS
|
||
SELECT
|
||
a.order_id,
|
||
a.user_id,
|
||
a.amount,
|
||
a.quantity,
|
||
b.dept_name, -- 维度补全:部门名称
|
||
c.category_name, -- 维度补全:类别名称
|
||
a.stat_date
|
||
FROM ${db_tmp_env}.tmp_xxx_01 a
|
||
LEFT JOIN db_name.dim_department b
|
||
ON a.dept_id = b.dept_id
|
||
LEFT JOIN db_name.dim_product c
|
||
ON a.product_id = c.product_id;
|
||
|
||
-- ============================================================================
|
||
-- Step03: 聚合计算与指标生成
|
||
-- ============================================================================
|
||
-- 说明:按业务维度聚合,计算统计指标
|
||
-- 输入:${db_tmp_env}.tmp_xxx_02
|
||
-- 输出:${db_tmp_env}.tmp_xxx_03
|
||
|
||
DROP TABLE IF EXISTS ${db_tmp_env}.tmp_xxx_03;
|
||
CREATE TABLE ${db_tmp_env}.tmp_xxx_03 AS
|
||
SELECT
|
||
stat_date,
|
||
dept_name,
|
||
category_name,
|
||
COUNT(*) AS record_count, -- 记录数
|
||
COUNT(DISTINCT user_id) AS unique_users, -- 去重用户数
|
||
SUM(amount) AS total_amount, -- 总金额
|
||
SUM(quantity) AS total_quantity, -- 总数量
|
||
AVG(amount) AS avg_amount, -- 平均金额
|
||
MAX(amount) AS max_amount -- 最大金额
|
||
FROM ${db_tmp_env}.tmp_xxx_02
|
||
GROUP BY stat_date, dept_name, category_name;
|
||
|
||
-- ============================================================================
|
||
-- Step04: 最终输出写入 Kudu 目标表
|
||
-- ============================================================================
|
||
-- 说明:使用 UPSERT 写入 Kudu 目标表
|
||
-- 输入:${db_tmp_env}.tmp_xxx_03
|
||
-- 输出:Kudu 目标表
|
||
|
||
-- 方式1:UPSERT(推荐,主键存在则更新,不存在则插入)
|
||
UPSERT INTO ${db_eda_env}.target_table
|
||
SELECT
|
||
-- 主键字段(Kudu 表必须有主键)
|
||
dept_name,
|
||
category_name,
|
||
stat_date,
|
||
|
||
-- 指标字段
|
||
record_count,
|
||
unique_users,
|
||
total_amount,
|
||
total_quantity,
|
||
avg_amount,
|
||
max_amount,
|
||
|
||
-- 技术字段
|
||
NOW() AS etl_time -- 数据加工时间
|
||
FROM ${db_tmp_env}.tmp_xxx_03;
|
||
|
||
-- 方式2:需要全量刷新时(先删后插)
|
||
-- DELETE FROM ${db_eda_env}.target_table WHERE stat_date = '${day_id}';
|
||
-- INSERT INTO ${db_eda_env}.target_table
|
||
-- SELECT ... FROM ${db_tmp_env}.tmp_xxx_03;
|
||
|
||
-- ============================================================================
|
||
-- 关键规则说明
|
||
-- ============================================================================
|
||
/*
|
||
1. 禁止使用 CTE (WITH 子句)
|
||
- 每个步骤必须物化为临时表
|
||
- 原因:便于调试、断点续跑、统一编码规范
|
||
|
||
2. 先 DROP 再 CREATE
|
||
- 每个临时表创建前必须先 DROP TABLE IF EXISTS
|
||
- 原因:防止表已存在导致失败
|
||
|
||
3. Kudu 写入方式
|
||
- 推荐 UPSERT INTO(Kudu 核心优势)
|
||
- 主键存在 → 更新(整行替换)
|
||
- 主键不存在 → 插入新行
|
||
- 需要全量刷新 → 先 DELETE 再 INSERT
|
||
|
||
4. Kudu 表约束
|
||
- 不支持 INSERT OVERWRITE(用 UPSERT 或 DELETE + INSERT 替代)
|
||
- 必须有 PRIMARY KEY
|
||
- 主键列不能为 NULL
|
||
- 支持 UPDATE 和 DELETE
|
||
|
||
5. 过滤条件前置
|
||
- 所有过滤在最早阶段应用
|
||
- 减少中间数据量
|
||
|
||
6. 临时表命名规范
|
||
- 格式:tmp_{业务简称}_{步骤序号}
|
||
|
||
7. Kudu 特有注意事项
|
||
- CONCAT 只接受 2 个参数,多参数用 CONCAT_WS
|
||
- 不支持 collect_list/collect_set,用 GROUP_CONCAT 替代
|
||
- 近似去重用 NDV() 函数
|
||
*/
|