161 lines
4.9 KiB
SQL
161 lines
4.9 KiB
SQL
-- =====================================================================
|
||
-- @Name: KUDU-D-SQL-{表名}-INSERT
|
||
-- @Version: 1.0
|
||
-- @Desc: Kudu (via Impala) 数据插入模板
|
||
-- @TargetDatabase: Apache Kudu (via Impala)
|
||
-- @说明: Kudu 表不支持 INSERT OVERWRITE,支持 INSERT INTO 和 UPSERT
|
||
-- =====================================================================
|
||
|
||
-- ============================================================================
|
||
-- 场景1:INSERT INTO(追加写入)
|
||
-- ============================================================================
|
||
-- 适用:向 Kudu 表追加新数据
|
||
|
||
INSERT INTO db_name.kudu_table
|
||
SELECT
|
||
id,
|
||
stat_date,
|
||
name,
|
||
department,
|
||
amount,
|
||
current_timestamp() AS etl_time
|
||
FROM db_name.source_table
|
||
WHERE stat_date = '${day_id}';
|
||
|
||
-- ============================================================================
|
||
-- 场景2:UPSERT INTO(更新插入,Kudu 特有)
|
||
-- ============================================================================
|
||
-- 适用:如果主键存在则更新,不存在则插入
|
||
-- 这是 Kudu 的核心优势,其他 Hive/Spark 表不支持
|
||
|
||
-- 基础 UPSERT
|
||
UPSERT INTO db_name.kudu_table
|
||
SELECT
|
||
id,
|
||
stat_date,
|
||
name,
|
||
department,
|
||
amount,
|
||
current_timestamp() AS etl_time
|
||
FROM db_name.staging_table
|
||
WHERE stat_date = '${day_id}';
|
||
|
||
-- 聚合后 UPSERT(增量更新指标表)
|
||
UPSERT INTO db_name.kudu_metrics
|
||
SELECT
|
||
department,
|
||
'${day_id}' AS stat_date,
|
||
COUNT(*) AS order_count,
|
||
SUM(amount) AS total_amount,
|
||
current_timestamp() AS etl_time
|
||
FROM db_name.incremental_orders
|
||
WHERE stat_date = '${day_id}'
|
||
GROUP BY department;
|
||
|
||
-- ============================================================================
|
||
-- 场景3:UPDATE(Kudu 表特有)
|
||
-- ============================================================================
|
||
-- 适用:修改已有数据
|
||
-- 注意:主键列不能被 UPDATE
|
||
|
||
-- 单条更新
|
||
UPDATE db_name.kudu_table
|
||
SET status = 'processed',
|
||
updated_at = current_timestamp()
|
||
WHERE id = 12345;
|
||
|
||
-- 批量条件更新
|
||
UPDATE db_name.kudu_table
|
||
SET status = 'expired',
|
||
updated_at = current_timestamp()
|
||
WHERE stat_date < '2026-01-01'
|
||
AND status = 'active';
|
||
|
||
-- 关联更新(用子查询)
|
||
UPDATE db_name.kudu_table t
|
||
SET t.department = d.new_dept_name
|
||
FROM db_name.dept_mapping d
|
||
WHERE t.department = d.old_dept_name;
|
||
|
||
-- ============================================================================
|
||
-- 场景4:DELETE(Kudu 表特有)
|
||
-- ============================================================================
|
||
-- 适用:删除数据
|
||
-- 注意:Kudu 的 DELETE 比 Hive/Spark 方便得多
|
||
|
||
-- 条件删除
|
||
DELETE FROM db_name.kudu_table
|
||
WHERE stat_date < '2026-01-01';
|
||
|
||
-- 按主键删除
|
||
DELETE FROM db_name.kudu_table
|
||
WHERE id IN (1001, 1002, 1003);
|
||
|
||
-- 关联删除(用子查询)
|
||
DELETE FROM db_name.kudu_table
|
||
WHERE user_id IN (
|
||
SELECT user_id FROM db_name.blacklist
|
||
);
|
||
|
||
-- ============================================================================
|
||
-- 场景5:从查询结果写入
|
||
-- ============================================================================
|
||
|
||
-- 简单 ETL:清洗后写入
|
||
INSERT INTO db_name.kudu_target
|
||
SELECT
|
||
id,
|
||
'${day_id}' AS stat_date,
|
||
name,
|
||
COALESCE(department, '未知') AS department,
|
||
amount,
|
||
current_timestamp() AS etl_time
|
||
FROM db_name.raw_data
|
||
WHERE stat_date = '${day_id}'
|
||
AND id IS NOT NULL
|
||
AND amount > 0;
|
||
|
||
-- ============================================================================
|
||
-- 场景6:批量 VALUES 写入
|
||
-- ============================================================================
|
||
|
||
INSERT INTO db_name.kudu_table (id, stat_date, name, amount)
|
||
VALUES
|
||
(1, '2026-05-01', '测试1', 100.00),
|
||
(2, '2026-05-01', '测试2', 200.00),
|
||
(3, '2026-05-01', '测试3', 300.00);
|
||
|
||
-- ============================================================================
|
||
-- 关键规则说明
|
||
-- ============================================================================
|
||
/*
|
||
1. Kudu 表与 Hive/Spark 表的核心区别
|
||
- 支持 INSERT INTO:✅
|
||
- 支持 INSERT OVERWRITE:❌(不支持!)
|
||
- 支持 UPSERT:✅(Kudu 独有,核心能力)
|
||
- 支持 UPDATE:✅(Kudu 独有)
|
||
- 支持 DELETE:✅(Kudu 独有)
|
||
|
||
2. UPSERT 是 Kudu 的核心优势
|
||
- 主键存在 → 更新(整行替换)
|
||
- 主键不存在 → 插入新行
|
||
- 适用于:增量更新、数据修正、指标回填
|
||
|
||
3. INSERT INTO 注意事项
|
||
- 如果主键冲突会报错(不会自动去重)
|
||
- 需要确保写入数据的主键不重复,或使用 UPSERT
|
||
|
||
4. UPDATE 限制
|
||
- 主键列不能被 UPDATE
|
||
- WHERE 条件建议包含主键或分区列(性能)
|
||
|
||
5. DELETE 建议
|
||
- 删除大量数据时按分区范围删除
|
||
- 定期清理历史数据
|
||
|
||
6. 性能建议
|
||
- 批量写入优于逐条写入
|
||
- UPSERT 比 DELETE + INSERT 更高效
|
||
- 利用主键做点查,避免全表扫描
|
||
*/
|