-- ===================================================================== -- @Name: KUDU-D-SQL-{表名}-INSERT -- @Version: 1.0 -- @Desc: Kudu (via Impala) 数据插入模板 -- @TargetDatabase: Apache Kudu (via Impala) -- @说明: Kudu 表不支持 INSERT OVERWRITE,支持 INSERT INTO 和 UPSERT -- ===================================================================== -- ============================================================================ -- 场景1:INSERT INTO(追加写入) -- ============================================================================ -- 适用:向 Kudu 表追加新数据 INSERT INTO db_name.kudu_table SELECT id, stat_date, name, department, amount, current_timestamp() AS etl_time FROM db_name.source_table WHERE stat_date = '${day_id}'; -- ============================================================================ -- 场景2:UPSERT INTO(更新插入,Kudu 特有) -- ============================================================================ -- 适用:如果主键存在则更新,不存在则插入 -- 这是 Kudu 的核心优势,其他 Hive/Spark 表不支持 -- 基础 UPSERT UPSERT INTO db_name.kudu_table SELECT id, stat_date, name, department, amount, current_timestamp() AS etl_time FROM db_name.staging_table WHERE stat_date = '${day_id}'; -- 聚合后 UPSERT(增量更新指标表) UPSERT INTO db_name.kudu_metrics SELECT department, '${day_id}' AS stat_date, COUNT(*) AS order_count, SUM(amount) AS total_amount, current_timestamp() AS etl_time FROM db_name.incremental_orders WHERE stat_date = '${day_id}' GROUP BY department; -- ============================================================================ -- 场景3:UPDATE(Kudu 表特有) -- ============================================================================ -- 适用:修改已有数据 -- 注意:主键列不能被 UPDATE -- 单条更新 UPDATE db_name.kudu_table SET status = 'processed', updated_at = current_timestamp() WHERE id = 12345; -- 批量条件更新 UPDATE db_name.kudu_table SET status = 'expired', updated_at = current_timestamp() WHERE stat_date < '2026-01-01' AND status = 'active'; -- 关联更新(用子查询) UPDATE db_name.kudu_table t SET t.department = d.new_dept_name FROM db_name.dept_mapping d WHERE t.department = d.old_dept_name; -- ============================================================================ -- 场景4:DELETE(Kudu 表特有) -- ============================================================================ -- 适用:删除数据 -- 注意:Kudu 的 DELETE 比 Hive/Spark 方便得多 -- 条件删除 DELETE FROM db_name.kudu_table WHERE stat_date < '2026-01-01'; -- 按主键删除 DELETE FROM db_name.kudu_table WHERE id IN (1001, 1002, 1003); -- 关联删除(用子查询) DELETE FROM db_name.kudu_table WHERE user_id IN ( SELECT user_id FROM db_name.blacklist ); -- ============================================================================ -- 场景5:从查询结果写入 -- ============================================================================ -- 简单 ETL:清洗后写入 INSERT INTO db_name.kudu_target SELECT id, '${day_id}' AS stat_date, name, COALESCE(department, '未知') AS department, amount, current_timestamp() AS etl_time FROM db_name.raw_data WHERE stat_date = '${day_id}' AND id IS NOT NULL AND amount > 0; -- ============================================================================ -- 场景6:批量 VALUES 写入 -- ============================================================================ INSERT INTO db_name.kudu_table (id, stat_date, name, amount) VALUES (1, '2026-05-01', '测试1', 100.00), (2, '2026-05-01', '测试2', 200.00), (3, '2026-05-01', '测试3', 300.00); -- ============================================================================ -- 关键规则说明 -- ============================================================================ /* 1. Kudu 表与 Hive/Spark 表的核心区别 - 支持 INSERT INTO:✅ - 支持 INSERT OVERWRITE:❌(不支持!) - 支持 UPSERT:✅(Kudu 独有,核心能力) - 支持 UPDATE:✅(Kudu 独有) - 支持 DELETE:✅(Kudu 独有) 2. UPSERT 是 Kudu 的核心优势 - 主键存在 → 更新(整行替换) - 主键不存在 → 插入新行 - 适用于:增量更新、数据修正、指标回填 3. INSERT INTO 注意事项 - 如果主键冲突会报错(不会自动去重) - 需要确保写入数据的主键不重复,或使用 UPSERT 4. UPDATE 限制 - 主键列不能被 UPDATE - WHERE 条件建议包含主键或分区列(性能) 5. DELETE 建议 - 删除大量数据时按分区范围删除 - 定期清理历史数据 6. 性能建议 - 批量写入优于逐条写入 - UPSERT 比 DELETE + INSERT 更高效 - 利用主键做点查,避免全表扫描 */