142 lines
4.6 KiB
SQL
142 lines
4.6 KiB
SQL
-- =====================================================================
|
||
-- @Name: HIVE-D-SQL-{表名}-INSERT
|
||
-- @Version: 1.0
|
||
-- @Desc: Hive 数据插入模板
|
||
-- @TargetDatabase: Hive
|
||
-- =====================================================================
|
||
|
||
-- ============================================================================
|
||
-- 场景1:分区表覆盖写入(最常用)
|
||
-- ============================================================================
|
||
-- 适用:每日/每周/每月增量写入分区表
|
||
|
||
INSERT OVERWRITE TABLE db_name.target_table
|
||
PARTITION (day_id = '${day_id}')
|
||
SELECT
|
||
user_id,
|
||
user_name,
|
||
order_count,
|
||
total_amount,
|
||
current_timestamp() AS etl_time
|
||
FROM db_name.source_table
|
||
WHERE day_id = '${day_id}';
|
||
|
||
-- ============================================================================
|
||
-- 场景2:动态分区写入
|
||
-- ============================================================================
|
||
-- 适用:数据中包含分区值,自动写入对应分区
|
||
|
||
-- 先启用动态分区
|
||
SET hive.exec.dynamic.partition = true;
|
||
SET hive.exec.dynamic.partition.mode = nonstrict;
|
||
|
||
INSERT OVERWRITE TABLE db_name.target_table
|
||
PARTITION (day_id, region) -- 动态分区字段
|
||
SELECT
|
||
user_id,
|
||
user_name,
|
||
order_count,
|
||
total_amount,
|
||
current_timestamp() AS etl_time,
|
||
day_id, -- 分区字段1(数据中包含)
|
||
region -- 分区字段2(数据中包含)
|
||
FROM db_name.source_table
|
||
WHERE day_id BETWEEN '${start_day}' AND '${end_day}';
|
||
|
||
-- ============================================================================
|
||
-- 场景3:追加写入
|
||
-- ============================================================================
|
||
-- 适用:日志表、流水表(允许追加)
|
||
|
||
INSERT INTO TABLE db_name.target_table
|
||
SELECT
|
||
field1,
|
||
field2,
|
||
field3,
|
||
current_timestamp() AS etl_time
|
||
FROM db_name.source_table
|
||
WHERE day_id = '${day_id}';
|
||
|
||
-- ============================================================================
|
||
-- 场景4:多分区插入(Multi-Insert)
|
||
-- ============================================================================
|
||
-- 适用:一次扫描,写入多个目标(提高效率)
|
||
|
||
FROM db_name.source_table
|
||
INSERT OVERWRITE TABLE db_name.target_summary
|
||
PARTITION (day_id = '${day_id}')
|
||
SELECT
|
||
department,
|
||
COUNT(*) AS record_count,
|
||
SUM(amount) AS total_amount
|
||
WHERE day_id = '${day_id}'
|
||
GROUP BY department
|
||
|
||
INSERT OVERWRITE TABLE db_name.target_detail
|
||
PARTITION (day_id = '${day_id}')
|
||
SELECT
|
||
id,
|
||
name,
|
||
amount,
|
||
department
|
||
WHERE day_id = '${day_id}'
|
||
AND amount > 1000;
|
||
|
||
-- ============================================================================
|
||
-- 场景5:导出到文件
|
||
-- ============================================================================
|
||
|
||
INSERT OVERWRITE DIRECTORY '/output/data/export/'
|
||
ROW FORMAT DELIMITED
|
||
FIELDS TERMINATED BY ','
|
||
STORED AS TEXTFILE
|
||
SELECT
|
||
id,
|
||
name,
|
||
amount,
|
||
day_id
|
||
FROM db_name.target_table
|
||
WHERE day_id = '${day_id}';
|
||
|
||
-- ============================================================================
|
||
-- 场景6:CTAS(Create Table As Select)
|
||
-- ============================================================================
|
||
|
||
-- 从查询结果创建新表
|
||
CREATE TABLE db_name.new_table AS
|
||
SELECT
|
||
department,
|
||
COUNT(*) AS employee_count,
|
||
AVG(salary) AS avg_salary
|
||
FROM db_name.employees
|
||
WHERE day_id = '${day_id}'
|
||
GROUP BY department;
|
||
|
||
-- ============================================================================
|
||
-- 关键规则说明
|
||
-- ============================================================================
|
||
/*
|
||
1. INSERT OVERWRITE vs INSERT INTO
|
||
- INSERT OVERWRITE:覆盖分区/表数据(推荐,幂等)
|
||
- INSERT INTO:追加数据(可能产生重复)
|
||
|
||
2. 分区表写入必须指定分区
|
||
- 静态分区:PARTITION (day_id = '${day_id}')
|
||
- 动态分区:需先 SET 配置,PARTITION (day_id)
|
||
- 混合分区:PARTITION (day_id = '2026-05-01', region)
|
||
|
||
3. 动态分区配置
|
||
SET hive.exec.dynamic.partition = true;
|
||
SET hive.exec.dynamic.partition.mode = nonstrict; -- 允许全动态
|
||
SET hive.exec.max.dynamic.partitions = 1000; -- 最大动态分区数
|
||
|
||
4. 字段顺序
|
||
- SELECT 字段顺序必须与目标表列定义一致
|
||
- 分区字段在 SELECT 最后(动态分区时)
|
||
|
||
5. 性能优化
|
||
- 多分区插入(Multi-Insert):一次扫描多次写入
|
||
- INSERT OVERWRITE 比 INSERT INTO 更安全(幂等性)
|
||
- 大数据量写入时注意 reducer 数量设置
|
||
*/
|