Files
smart-data-dev-skill/one-skill/smart-data-developer/references/sql/templates/spark/insert-template.sql
2026-05-13 11:03:00 +08:00

131 lines
4.4 KiB
SQL
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
-- =====================================================================
-- @SparkSqlName: PAIMONA-D-SQL-{表名}-INSERT
-- @Version: 1.0
-- @Desc: 数据插入模板(INSERT OVERWRITE)
-- @TargetTables: ${db_eda_env}.{目标表名}
-- @SourceTables: {源表列表}
-- @TargetDatabase: Paimon
-- @SourceDatabase: Paimon
-- =====================================================================
-- ============================================================================
-- Scenario 1: overwrite one partition of a partitioned table
-- ============================================================================
-- Use for: daily / weekly / monthly incremental loads into a partitioned table.
-- The partition value is fixed in the PARTITION clause, so the SELECT list
-- carries only the non-partition columns.
INSERT OVERWRITE TABLE ${db_eda_env}.target_table
    PARTITION (day_id = '${day_id}')
SELECT
    src.field1,
    src.field2,
    src.field3,
    current_timestamp() AS etl_time  -- technical column: load timestamp
FROM source_table AS src
WHERE src.day_id = '${day_id}';
-- ============================================================================
-- Scenario 2: dynamic-partition write
-- ============================================================================
-- Use for: tables with several partition columns whose values come from the data.
-- Column order matters: Spark maps the SELECT list to the target table by
-- position and expects the dynamic partition columns LAST, in the same order
-- as the PARTITION clause. etl_time therefore comes before day_id / region
-- (business columns -> technical columns -> partition columns).
-- NOTE(review): Spark's default partitionOverwriteMode is "static"; with no
-- static partition value an INSERT OVERWRITE may then replace every existing
-- partition. Confirm the job runs with
-- spark.sql.sources.partitionOverwriteMode=dynamic before using this template.
INSERT OVERWRITE TABLE ${db_eda_env}.target_table
    PARTITION (day_id, region)  -- dynamic partition columns
SELECT
    src.field1,
    src.field2,
    src.field3,
    current_timestamp() AS etl_time,  -- technical column: load timestamp
    src.day_id,                       -- partition column 1 (taken from the data)
    src.region                        -- partition column 2 (taken from the data)
FROM source_table AS src
WHERE src.day_id BETWEEN '${start_day}' AND '${end_day}';
-- ============================================================================
-- Scenario 3: overwrite the whole table
-- ============================================================================
-- Use for: full refreshes and initial data loads.
-- No PARTITION clause and no WHERE filter: every row of the source is written.
INSERT OVERWRITE TABLE ${db_eda_env}.target_table
SELECT
    src.field1,
    src.field2,
    src.field3,
    current_timestamp() AS etl_time  -- technical column: load timestamp
FROM source_table AS src;
-- ============================================================================
-- Scenario 4: append write (use with caution)
-- ============================================================================
-- Use for: log tables and transaction-journal tables (unpartitioned, or where
-- duplicate rows are acceptable).
-- INSERT INTO appends; existing rows in the target are left in place.
INSERT INTO TABLE ${db_eda_env}.target_table
SELECT
    src.field1,
    src.field2,
    src.field3,
    current_timestamp() AS etl_time  -- technical column: load timestamp
FROM source_table AS src
WHERE src.day_id = '${day_id}';
-- ============================================================================
-- Scenario 5: write from a temporary table into the target table
-- ============================================================================
-- Use for: the last step of an ETL flow.
-- The partition value is static, so day_id is not part of the SELECT list.
INSERT OVERWRITE TABLE ${db_eda_env}.target_table
    PARTITION (day_id = '${day_id}')
SELECT
    -- Business columns (same order as the target table's columns)
    fin.user_id,
    fin.user_name,
    fin.order_count,
    fin.total_amount,
    -- Technical columns
    current_timestamp() AS etl_time,
    '${day_id}'         AS stat_date
FROM ${db_tmp_env}.tmp_xxx_final AS fin;
-- ============================================================================
-- Scenario 6: MERGE INTO (update or insert)
-- ============================================================================
-- Use for: incremental updates and corrections to historical data.
-- Rows are matched on (id, day_id): matched rows get name / amount / etl_time
-- refreshed, unmatched source rows are inserted as new rows.
MERGE INTO ${db_eda_env}.target_table AS tgt
USING ${db_tmp_env}.tmp_xxx_source AS src
    ON  tgt.id = src.id
    AND tgt.day_id = src.day_id
WHEN MATCHED THEN
    UPDATE SET
        tgt.name     = src.name,
        tgt.amount   = src.amount,
        tgt.etl_time = current_timestamp()
WHEN NOT MATCHED THEN
    INSERT (id, day_id, name, amount, etl_time)
    VALUES (src.id, src.day_id, src.name, src.amount, current_timestamp());
-- ============================================================================
-- 关键规则说明
-- ============================================================================
/*
1. INSERT OVERWRITE vs INSERT INTO
- INSERT OVERWRITE:覆盖写入(推荐)
- INSERT INTO:追加写入(可能导致重复数据)
2. 分区表写入必须指定分区
- 避免全表覆盖导致历史数据丢失
- 格式:PARTITION (day_id = '${day_id}')
3. 字段顺序必须与目标表一致
- 目标表字段顺序:业务字段 → 技术字段 → 分区字段
- SELECT 字段顺序必须匹配
4. 技术字段补全
- etl_time:数据写入时间
- stat_date:统计日期(可选)
- etl_remark:备注信息(可选)
5. MERGE INTO 注意事项
- Spark 3.x+ 支持
- 目标表必须支持事务(如 Paimon/Delta)
- 关联字段必须唯一(避免多条匹配)
*/