Add one-skill
This commit is contained in:
@@ -0,0 +1,141 @@
|
||||
-- =====================================================================
|
||||
-- @Name: HIVE-D-SQL-{表名}-INSERT
|
||||
-- @Version: 1.0
|
||||
-- @Desc: Hive 数据插入模板
|
||||
-- @TargetDatabase: Hive
|
||||
-- =====================================================================
|
||||
|
||||
-- ============================================================================
|
||||
-- 场景1:分区表覆盖写入(最常用)
|
||||
-- ============================================================================
|
||||
-- 适用:每日/每周/每月增量写入分区表
|
||||
|
||||
INSERT OVERWRITE TABLE db_name.target_table
|
||||
PARTITION (day_id = '${day_id}')
|
||||
SELECT
|
||||
user_id,
|
||||
user_name,
|
||||
order_count,
|
||||
total_amount,
|
||||
current_timestamp() AS etl_time
|
||||
FROM db_name.source_table
|
||||
WHERE day_id = '${day_id}';
|
||||
|
||||
-- ============================================================================
|
||||
-- 场景2:动态分区写入
|
||||
-- ============================================================================
|
||||
-- 适用:数据中包含分区值,自动写入对应分区
|
||||
|
||||
-- 先启用动态分区
|
||||
SET hive.exec.dynamic.partition = true;
|
||||
SET hive.exec.dynamic.partition.mode = nonstrict;
|
||||
|
||||
INSERT OVERWRITE TABLE db_name.target_table
|
||||
PARTITION (day_id, region) -- 动态分区字段
|
||||
SELECT
|
||||
user_id,
|
||||
user_name,
|
||||
order_count,
|
||||
total_amount,
|
||||
current_timestamp() AS etl_time,
|
||||
day_id, -- 分区字段1(数据中包含)
|
||||
region -- 分区字段2(数据中包含)
|
||||
FROM db_name.source_table
|
||||
WHERE day_id BETWEEN '${start_day}' AND '${end_day}';
|
||||
|
||||
-- ============================================================================
|
||||
-- 场景3:追加写入
|
||||
-- ============================================================================
|
||||
-- 适用:日志表、流水表(允许追加)
|
||||
|
||||
INSERT INTO TABLE db_name.target_table
|
||||
SELECT
|
||||
field1,
|
||||
field2,
|
||||
field3,
|
||||
current_timestamp() AS etl_time
|
||||
FROM db_name.source_table
|
||||
WHERE day_id = '${day_id}';
|
||||
|
||||
-- ============================================================================
|
||||
-- 场景4:多分区插入(Multi-Insert)
|
||||
-- ============================================================================
|
||||
-- 适用:一次扫描,写入多个目标(提高效率)
|
||||
|
||||
FROM db_name.source_table
|
||||
INSERT OVERWRITE TABLE db_name.target_summary
|
||||
PARTITION (day_id = '${day_id}')
|
||||
SELECT
|
||||
department,
|
||||
COUNT(*) AS record_count,
|
||||
SUM(amount) AS total_amount
|
||||
WHERE day_id = '${day_id}'
|
||||
GROUP BY department
|
||||
|
||||
INSERT OVERWRITE TABLE db_name.target_detail
|
||||
PARTITION (day_id = '${day_id}')
|
||||
SELECT
|
||||
id,
|
||||
name,
|
||||
amount,
|
||||
department
|
||||
WHERE day_id = '${day_id}'
|
||||
AND amount > 1000;
|
||||
|
||||
-- ============================================================================
|
||||
-- 场景5:导出到文件
|
||||
-- ============================================================================
|
||||
|
||||
INSERT OVERWRITE DIRECTORY '/output/data/export/'
|
||||
ROW FORMAT DELIMITED
|
||||
FIELDS TERMINATED BY ','
|
||||
STORED AS TEXTFILE
|
||||
SELECT
|
||||
id,
|
||||
name,
|
||||
amount,
|
||||
day_id
|
||||
FROM db_name.target_table
|
||||
WHERE day_id = '${day_id}';
|
||||
|
||||
-- ============================================================================
|
||||
-- 场景6:CTAS(Create Table As Select)
|
||||
-- ============================================================================
|
||||
|
||||
-- 从查询结果创建新表
|
||||
CREATE TABLE db_name.new_table AS
|
||||
SELECT
|
||||
department,
|
||||
COUNT(*) AS employee_count,
|
||||
AVG(salary) AS avg_salary
|
||||
FROM db_name.employees
|
||||
WHERE day_id = '${day_id}'
|
||||
GROUP BY department;
|
||||
|
||||
-- ============================================================================
|
||||
-- 关键规则说明
|
||||
-- ============================================================================
|
||||
/*
|
||||
1. INSERT OVERWRITE vs INSERT INTO
|
||||
- INSERT OVERWRITE:覆盖分区/表数据(推荐,幂等)
|
||||
- INSERT INTO:追加数据(可能产生重复)
|
||||
|
||||
2. 分区表写入必须指定分区
|
||||
- 静态分区:PARTITION (day_id = '${day_id}')
|
||||
- 动态分区:需先 SET 配置,PARTITION (day_id)
|
||||
- 混合分区:PARTITION (day_id = '2026-05-01', region)
|
||||
|
||||
3. 动态分区配置
|
||||
SET hive.exec.dynamic.partition = true;
|
||||
SET hive.exec.dynamic.partition.mode = nonstrict; -- 允许全动态
|
||||
SET hive.exec.max.dynamic.partitions = 1000; -- 最大动态分区数
|
||||
|
||||
4. 字段顺序
|
||||
- SELECT 字段顺序必须与目标表列定义一致
|
||||
- 分区字段在 SELECT 最后(动态分区时)
|
||||
|
||||
5. 性能优化
|
||||
- 多分区插入(Multi-Insert):一次扫描多次写入
|
||||
- INSERT OVERWRITE 比 INSERT INTO 更安全(幂等性)
|
||||
- 大数据量写入时注意 reducer 数量设置
|
||||
*/
|
||||
Reference in New Issue
Block a user