Files
smart-data-dev-skill/one-skill/smart-data-developer/references/sql/templates/kudu/insert-template.sql
2026-05-13 11:03:00 +08:00

161 lines
4.9 KiB
SQL
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
-- =====================================================================
-- @Name: KUDU-D-SQL-{表名}-INSERT
-- @Version: 1.0
-- @Desc: Kudu (via Impala) 数据插入模板
-- @TargetDatabase: Apache Kudu (via Impala)
-- @说明: Kudu 表不支持 INSERT OVERWRITE支持 INSERT INTO 和 UPSERT
-- =====================================================================
-- ============================================================================
-- 场景1INSERT INTO追加写入
-- ============================================================================
-- 适用:向 Kudu 表追加新数据
INSERT INTO db_name.kudu_table
SELECT
id,
stat_date,
name,
department,
amount,
current_timestamp() AS etl_time
FROM db_name.source_table
WHERE stat_date = '${day_id}';
-- ============================================================================
-- 场景2UPSERT INTO更新插入Kudu 特有)
-- ============================================================================
-- 适用:如果主键存在则更新,不存在则插入
-- 这是 Kudu 的核心优势,其他 Hive/Spark 表不支持
-- 基础 UPSERT
UPSERT INTO db_name.kudu_table
SELECT
id,
stat_date,
name,
department,
amount,
current_timestamp() AS etl_time
FROM db_name.staging_table
WHERE stat_date = '${day_id}';
-- 聚合后 UPSERT增量更新指标表
UPSERT INTO db_name.kudu_metrics
SELECT
department,
'${day_id}' AS stat_date,
COUNT(*) AS order_count,
SUM(amount) AS total_amount,
current_timestamp() AS etl_time
FROM db_name.incremental_orders
WHERE stat_date = '${day_id}'
GROUP BY department;
-- ============================================================================
-- 场景3UPDATEKudu 表特有)
-- ============================================================================
-- 适用:修改已有数据
-- 注意:主键列不能被 UPDATE
-- 单条更新
UPDATE db_name.kudu_table
SET status = 'processed',
updated_at = current_timestamp()
WHERE id = 12345;
-- 批量条件更新
UPDATE db_name.kudu_table
SET status = 'expired',
updated_at = current_timestamp()
WHERE stat_date < '2026-01-01'
AND status = 'active';
-- 关联更新(用子查询)
UPDATE db_name.kudu_table t
SET t.department = d.new_dept_name
FROM db_name.dept_mapping d
WHERE t.department = d.old_dept_name;
-- ============================================================================
-- 场景4DELETEKudu 表特有)
-- ============================================================================
-- 适用:删除数据
-- 注意Kudu 的 DELETE 比 Hive/Spark 方便得多
-- 条件删除
DELETE FROM db_name.kudu_table
WHERE stat_date < '2026-01-01';
-- 按主键删除
DELETE FROM db_name.kudu_table
WHERE id IN (1001, 1002, 1003);
-- 关联删除(用子查询)
DELETE FROM db_name.kudu_table
WHERE user_id IN (
SELECT user_id FROM db_name.blacklist
);
-- ============================================================================
-- 场景5从查询结果写入
-- ============================================================================
-- 简单 ETL清洗后写入
INSERT INTO db_name.kudu_target
SELECT
id,
'${day_id}' AS stat_date,
name,
COALESCE(department, '未知') AS department,
amount,
current_timestamp() AS etl_time
FROM db_name.raw_data
WHERE stat_date = '${day_id}'
AND id IS NOT NULL
AND amount > 0;
-- ============================================================================
-- 场景6批量 VALUES 写入
-- ============================================================================
INSERT INTO db_name.kudu_table (id, stat_date, name, amount)
VALUES
(1, '2026-05-01', '测试1', 100.00),
(2, '2026-05-01', '测试2', 200.00),
(3, '2026-05-01', '测试3', 300.00);
-- ============================================================================
-- 关键规则说明
-- ============================================================================
/*
1. Kudu 表与 Hive/Spark 表的核心区别
- 支持 INSERT INTO
- 支持 INSERT OVERWRITE不支持
- 支持 UPSERTKudu 独有,核心能力)
- 支持 UPDATEKudu 独有)
- 支持 DELETEKudu 独有)
2. UPSERT 是 Kudu 的核心优势
- 主键存在 → 更新(整行替换)
- 主键不存在 → 插入新行
- 适用于:增量更新、数据修正、指标回填
3. INSERT INTO 注意事项
- 如果主键冲突会报错(不会自动去重)
- 需要确保写入数据的主键不重复,或使用 UPSERT
4. UPDATE 限制
- 主键列不能被 UPDATE
- WHERE 条件建议包含主键或分区列(性能)
5. DELETE 建议
- 删除大量数据时按分区范围删除
- 定期清理历史数据
6. 性能建议
- 批量写入优于逐条写入
- UPSERT 比 DELETE + INSERT 更高效
- 利用主键做点查,避免全表扫描
*/