-- ===================================================================== -- @Name: KUDU-D-SQL-{表名}-ETL -- @Version: 2.0 -- @Desc: Kudu (via Impala) ETL 数据处理模板(临时表链式处理) -- @TargetDatabase: Apache Kudu (via Impala) -- @说明: 统一规范:禁止 CTE,每步物化为临时表,先 DROP 再 CREATE -- 最后一步用 UPSERT INTO 写入 Kudu 目标表 -- ===================================================================== -- ============================================================================ -- Step01: 基础清洗与过滤 -- ============================================================================ -- 说明:从源表读取数据,进行基础过滤和清洗 -- 输入:{源表名} -- 输出:${db_tmp_env}.tmp_xxx_01 DROP TABLE IF EXISTS ${db_tmp_env}.tmp_xxx_01; CREATE TABLE ${db_tmp_env}.tmp_xxx_01 AS SELECT order_id, user_id, dept_id, product_id, quantity, amount, status, stat_date FROM db_name.source_table WHERE stat_date = '${day_id}' AND status IN ('completed', 'shipped') -- 业务过滤 AND amount > 0 -- 数据质量过滤 AND user_id IS NOT NULL; -- NULL过滤 -- ============================================================================ -- Step02: 多表关联与维度补全 -- ============================================================================ -- 说明:关联维度表,补全业务属性字段 -- 输入:${db_tmp_env}.tmp_xxx_01, dim_department, dim_product -- 输出:${db_tmp_env}.tmp_xxx_02 DROP TABLE IF EXISTS ${db_tmp_env}.tmp_xxx_02; CREATE TABLE ${db_tmp_env}.tmp_xxx_02 AS SELECT a.order_id, a.user_id, a.amount, a.quantity, b.dept_name, -- 维度补全:部门名称 c.category_name, -- 维度补全:类别名称 a.stat_date FROM ${db_tmp_env}.tmp_xxx_01 a LEFT JOIN db_name.dim_department b ON a.dept_id = b.dept_id LEFT JOIN db_name.dim_product c ON a.product_id = c.product_id; -- ============================================================================ -- Step03: 聚合计算与指标生成 -- ============================================================================ -- 说明:按业务维度聚合,计算统计指标 -- 输入:${db_tmp_env}.tmp_xxx_02 -- 输出:${db_tmp_env}.tmp_xxx_03 DROP TABLE IF EXISTS ${db_tmp_env}.tmp_xxx_03; CREATE TABLE ${db_tmp_env}.tmp_xxx_03 AS SELECT stat_date, dept_name, category_name, COUNT(*) AS record_count, -- 记录数 COUNT(DISTINCT user_id) AS unique_users, -- 去重用户数 SUM(amount) AS total_amount, -- 总金额 SUM(quantity) AS total_quantity, -- 总数量 AVG(amount) AS avg_amount, -- 平均金额 MAX(amount) AS max_amount -- 最大金额 FROM ${db_tmp_env}.tmp_xxx_02 GROUP BY stat_date, dept_name, category_name; -- ============================================================================ -- Step04: 最终输出写入 Kudu 目标表 -- ============================================================================ -- 说明:使用 UPSERT 写入 Kudu 目标表 -- 输入:${db_tmp_env}.tmp_xxx_03 -- 输出:Kudu 目标表 -- 方式1:UPSERT(推荐,主键存在则更新,不存在则插入) UPSERT INTO ${db_eda_env}.target_table SELECT -- 主键字段(Kudu 表必须有主键) dept_name, category_name, stat_date, -- 指标字段 record_count, unique_users, total_amount, total_quantity, avg_amount, max_amount, -- 技术字段 NOW() AS etl_time -- 数据加工时间 FROM ${db_tmp_env}.tmp_xxx_03; -- 方式2:需要全量刷新时(先删后插) -- DELETE FROM ${db_eda_env}.target_table WHERE stat_date = '${day_id}'; -- INSERT INTO ${db_eda_env}.target_table -- SELECT ... FROM ${db_tmp_env}.tmp_xxx_03; -- ============================================================================ -- 关键规则说明 -- ============================================================================ /* 1. 禁止使用 CTE (WITH 子句) - 每个步骤必须物化为临时表 - 原因:便于调试、断点续跑、统一编码规范 2. 先 DROP 再 CREATE - 每个临时表创建前必须先 DROP TABLE IF EXISTS - 原因:防止表已存在导致失败 3. Kudu 写入方式 - 推荐 UPSERT INTO(Kudu 核心优势) - 主键存在 → 更新(整行替换) - 主键不存在 → 插入新行 - 需要全量刷新 → 先 DELETE 再 INSERT 4. Kudu 表约束 - 不支持 INSERT OVERWRITE(用 UPSERT 或 DELETE + INSERT 替代) - 必须有 PRIMARY KEY - 主键列不能为 NULL - 支持 UPDATE 和 DELETE 5. 过滤条件前置 - 所有过滤在最早阶段应用 - 减少中间数据量 6. 临时表命名规范 - 格式:tmp_{业务简称}_{步骤序号} 7. Kudu 特有注意事项 - CONCAT 只接受 2 个参数,多参数用 CONCAT_WS - 不支持 collect_list/collect_set,用 GROUP_CONCAT 替代 - 近似去重用 NDV() 函数 */