Files
smart-data-dev-skill/one-skill/smart-data-developer/references/sql/templates/doris/etl-template.sql
2026-05-13 11:03:00 +08:00

129 lines
4.6 KiB
SQL
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
-- =====================================================================
-- @Name: DORIS-D-SQL-{表名}-ETL
-- @Version: 2.0
-- @Desc: Apache Doris ETL 数据处理模板(临时表链式处理)
-- @TargetDatabase: Apache Doris
-- @说明: 统一规范:禁止 CTE,每步物化为临时表,先 DROP 再 CREATE
-- =====================================================================
-- ============================================================================
-- Step01: Base cleansing and filtering
-- ============================================================================
-- Purpose: read one business day (${day_id}) from the source table and apply
--          every row-level filter here, so later steps work on less data.
-- Input:   db_name.source_table
-- Output:  ${db_tmp_env}.tmp_xxx_01
-- Note:    DROP-then-CREATE keeps the step re-runnable.
DROP TABLE IF EXISTS ${db_tmp_env}.tmp_xxx_01;
CREATE TABLE ${db_tmp_env}.tmp_xxx_01 AS
SELECT
    order_id,
    user_id,
    dept_id,        -- join key for dim_department in Step02
    category_id,    -- join key for dim_category in Step02; without it Step02
                    -- fails on an unknown column
                    -- (assumes source_table exposes category_id - TODO confirm against source DDL)
    total_amount,
    status,
    order_date
FROM db_name.source_table
WHERE order_date = '${day_id}'
  AND status IN ('completed', 'shipped')    -- business filter
  AND total_amount > 0                      -- data-quality filter (also drops NULL amounts)
  AND user_id IS NOT NULL;                  -- discard rows without a user
-- ============================================================================
-- Step02: Multi-table join and dimension enrichment
-- ============================================================================
-- Purpose: attach department and category names to each cleansed order row.
-- Input:   ${db_tmp_env}.tmp_xxx_01, db_name.dim_department, db_name.dim_category
-- Output:  ${db_tmp_env}.tmp_xxx_02
-- Note:    both lookups are LEFT JOINs, so an order with no matching dimension
--          row is kept and its name column is NULL.
--          The category lookup reads category_id from tmp_xxx_01.
--          NOTE(review): presumably dept_id / category_id are unique in the
--          dimension tables; duplicates there would multiply order rows - confirm.
DROP TABLE IF EXISTS ${db_tmp_env}.tmp_xxx_02;
CREATE TABLE ${db_tmp_env}.tmp_xxx_02 AS
SELECT
    ord.order_id,
    ord.user_id,
    ord.total_amount,
    ord.status,
    dept.dept_name,         -- enriched: department name
    cat.category_name,      -- enriched: category name
    ord.order_date
FROM ${db_tmp_env}.tmp_xxx_01 AS ord
LEFT JOIN db_name.dim_department AS dept
    ON dept.dept_id = ord.dept_id
LEFT JOIN db_name.dim_category AS cat
    ON cat.category_id = ord.category_id;
-- ============================================================================
-- Step03: Aggregation and metric generation
-- ============================================================================
-- Purpose: roll the enriched rows up to one row per
--          (order_date, dept_name, category_name) and compute the metrics.
-- Input:   ${db_tmp_env}.tmp_xxx_02
-- Output:  ${db_tmp_env}.tmp_xxx_03
-- Note:    input columns are qualified with the table alias so the output alias
--          total_amount cannot be confused with the input column of the same name.
DROP TABLE IF EXISTS ${db_tmp_env}.tmp_xxx_03;
CREATE TABLE ${db_tmp_env}.tmp_xxx_03 AS
SELECT
    enr.order_date,
    enr.dept_name,
    enr.category_name,
    COUNT(*)                    AS record_count,    -- number of input rows in the group
    COUNT(DISTINCT enr.user_id) AS unique_users,    -- distinct users in the group
    SUM(enr.total_amount)       AS total_amount,    -- summed amount
    AVG(enr.total_amount)       AS avg_amount,      -- mean amount
    MAX(enr.total_amount)       AS max_amount       -- largest single amount
FROM ${db_tmp_env}.tmp_xxx_02 AS enr
GROUP BY
    enr.order_date,
    enr.dept_name,
    enr.category_name;
-- ============================================================================
-- Step04: Final output - write to the target table
-- ============================================================================
-- Purpose: add the standard technical columns and load the aggregated result.
-- Input:   ${db_tmp_env}.tmp_xxx_03
-- Output:  ${db_eda_env}.target_table
-- Note:    the explicit column list maps values by name, not by position, so a
--          reordered or extended target DDL cannot silently mis-load columns.
--          NOTE(review): the target column names below are taken from the
--          SELECT aliases - confirm against the real target_table DDL.
--          NOTE(review): a plain INSERT INTO appends; whether a re-run for the
--          same ${day_id} duplicates rows depends on the target's key model - confirm.
INSERT INTO ${db_eda_env}.target_table (
    dept_name,
    category_name,
    record_count,
    unique_users,
    total_amount,
    avg_amount,
    max_amount,
    etl_time,
    stat_date
)
SELECT
    -- business columns
    dept_name,
    category_name,
    record_count,
    unique_users,
    total_amount,
    avg_amount,
    max_amount,
    -- technical columns
    NOW()       AS etl_time,    -- load timestamp
    '${day_id}' AS stat_date    -- statistics date, taken from the run parameter
FROM ${db_tmp_env}.tmp_xxx_03;
-- ============================================================================
-- 关键规则说明
-- ============================================================================
/*
1. 禁止使用 CTE (WITH 子句)
- 每个步骤必须物化为临时表
- 原因:便于调试、断点续跑、统一编码规范
2. 先 DROP 再 CREATE
- 每个临时表创建前必须先 DROP TABLE IF EXISTS
- 原因:防止表已存在导致失败
3. Doris 写入方式
- 默认使用 INSERT INTO
- Aggregate Key 表:自动合并相同 Key 的数据
- Unique Key 表:自动按主键去重,保留最新数据
- Doris 2.0+ 也支持 INSERT OVERWRITE
4. 过滤条件前置
- 所有过滤在最早阶段应用
- 减少中间数据量
5. 临时表命名规范
- 格式:tmp_{业务简称}_{步骤序号}
- 示例:tmp_order_stats_01, tmp_order_stats_02
6. Doris 特有注意事项
- 不支持 LEFT SEMI JOIN / LEFT ANTI JOIN(注:Doris 官方文档的 JOIN 语法包含二者,此限制可能已过时,请按目标集群版本核实)
- 日期函数用 MySQL 风格:DATE_FORMAT, DATE_ADD(INTERVAL)
- 不支持 collect_list/collect_set,用 GROUP_CONCAT 替代(注:较新版本 Doris 已提供 collect_list/collect_set,请按目标集群版本核实)
*/