-- ===================================================================== -- @Name: HIVE-D-SQL-{表名}-INSERT -- @Version: 1.0 -- @Desc: Hive 数据插入模板 -- @TargetDatabase: Hive -- ===================================================================== -- ============================================================================ -- 场景1:分区表覆盖写入(最常用) -- ============================================================================ -- 适用:每日/每周/每月增量写入分区表 INSERT OVERWRITE TABLE db_name.target_table PARTITION (day_id = '${day_id}') SELECT user_id, user_name, order_count, total_amount, current_timestamp() AS etl_time FROM db_name.source_table WHERE day_id = '${day_id}'; -- ============================================================================ -- 场景2:动态分区写入 -- ============================================================================ -- 适用:数据中包含分区值,自动写入对应分区 -- 先启用动态分区 SET hive.exec.dynamic.partition = true; SET hive.exec.dynamic.partition.mode = nonstrict; INSERT OVERWRITE TABLE db_name.target_table PARTITION (day_id, region) -- 动态分区字段 SELECT user_id, user_name, order_count, total_amount, current_timestamp() AS etl_time, day_id, -- 分区字段1(数据中包含) region -- 分区字段2(数据中包含) FROM db_name.source_table WHERE day_id BETWEEN '${start_day}' AND '${end_day}'; -- ============================================================================ -- 场景3:追加写入 -- ============================================================================ -- 适用:日志表、流水表(允许追加) INSERT INTO TABLE db_name.target_table SELECT field1, field2, field3, current_timestamp() AS etl_time FROM db_name.source_table WHERE day_id = '${day_id}'; -- ============================================================================ -- 场景4:多分区插入(Multi-Insert) -- ============================================================================ -- 适用:一次扫描,写入多个目标(提高效率) FROM db_name.source_table INSERT OVERWRITE TABLE db_name.target_summary PARTITION (day_id = '${day_id}') SELECT department, COUNT(*) AS record_count, SUM(amount) AS total_amount WHERE day_id = '${day_id}' GROUP BY department INSERT OVERWRITE TABLE db_name.target_detail PARTITION (day_id = '${day_id}') SELECT id, name, amount, department WHERE day_id = '${day_id}' AND amount > 1000; -- ============================================================================ -- 场景5:导出到文件 -- ============================================================================ INSERT OVERWRITE DIRECTORY '/output/data/export/' ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE SELECT id, name, amount, day_id FROM db_name.target_table WHERE day_id = '${day_id}'; -- ============================================================================ -- 场景6:CTAS(Create Table As Select) -- ============================================================================ -- 从查询结果创建新表 CREATE TABLE db_name.new_table AS SELECT department, COUNT(*) AS employee_count, AVG(salary) AS avg_salary FROM db_name.employees WHERE day_id = '${day_id}' GROUP BY department; -- ============================================================================ -- 关键规则说明 -- ============================================================================ /* 1. INSERT OVERWRITE vs INSERT INTO - INSERT OVERWRITE:覆盖分区/表数据(推荐,幂等) - INSERT INTO:追加数据(可能产生重复) 2. 分区表写入必须指定分区 - 静态分区:PARTITION (day_id = '${day_id}') - 动态分区:需先 SET 配置,PARTITION (day_id) - 混合分区:PARTITION (day_id = '2026-05-01', region) 3. 动态分区配置 SET hive.exec.dynamic.partition = true; SET hive.exec.dynamic.partition.mode = nonstrict; -- 允许全动态 SET hive.exec.max.dynamic.partitions = 1000; -- 最大动态分区数 4. 字段顺序 - SELECT 字段顺序必须与目标表列定义一致 - 分区字段在 SELECT 最后(动态分区时) 5. 性能优化 - 多分区插入(Multi-Insert):一次扫描多次写入 - INSERT OVERWRITE 比 INSERT INTO 更安全(幂等性) - 大数据量写入时注意 reducer 数量设置 */