Add one-skill

This commit is contained in:
Xin Wang
2026-05-13 11:03:00 +08:00
parent a4c8b29176
commit f9e36ef92d
34 changed files with 7656 additions and 0 deletions

View File

@@ -0,0 +1,141 @@
-- =====================================================================
-- @Name: HIVE-D-SQL-{表名}-INSERT
-- @Version: 1.0
-- @Desc: Hive 数据插入模板
-- @TargetDatabase: Hive
-- =====================================================================
-- ============================================================================
-- 场景1分区表覆盖写入最常用
-- ============================================================================
-- 适用:每日/每周/每月增量写入分区表
INSERT OVERWRITE TABLE db_name.target_table
PARTITION (day_id = '${day_id}')
SELECT
user_id,
user_name,
order_count,
total_amount,
current_timestamp() AS etl_time
FROM db_name.source_table
WHERE day_id = '${day_id}';
-- ============================================================================
-- 场景2动态分区写入
-- ============================================================================
-- 适用:数据中包含分区值,自动写入对应分区
-- 先启用动态分区
SET hive.exec.dynamic.partition = true;
SET hive.exec.dynamic.partition.mode = nonstrict;
INSERT OVERWRITE TABLE db_name.target_table
PARTITION (day_id, region) -- 动态分区字段
SELECT
user_id,
user_name,
order_count,
total_amount,
current_timestamp() AS etl_time,
day_id, -- 分区字段1数据中包含
region -- 分区字段2数据中包含
FROM db_name.source_table
WHERE day_id BETWEEN '${start_day}' AND '${end_day}';
-- ============================================================================
-- 场景3追加写入
-- ============================================================================
-- 适用:日志表、流水表(允许追加)
INSERT INTO TABLE db_name.target_table
SELECT
field1,
field2,
field3,
current_timestamp() AS etl_time
FROM db_name.source_table
WHERE day_id = '${day_id}';
-- ============================================================================
-- 场景4多分区插入Multi-Insert
-- ============================================================================
-- 适用:一次扫描,写入多个目标(提高效率)
FROM db_name.source_table
INSERT OVERWRITE TABLE db_name.target_summary
PARTITION (day_id = '${day_id}')
SELECT
department,
COUNT(*) AS record_count,
SUM(amount) AS total_amount
WHERE day_id = '${day_id}'
GROUP BY department
INSERT OVERWRITE TABLE db_name.target_detail
PARTITION (day_id = '${day_id}')
SELECT
id,
name,
amount,
department
WHERE day_id = '${day_id}'
AND amount > 1000;
-- ============================================================================
-- 场景5导出到文件
-- ============================================================================
INSERT OVERWRITE DIRECTORY '/output/data/export/'
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TEXTFILE
SELECT
id,
name,
amount,
day_id
FROM db_name.target_table
WHERE day_id = '${day_id}';
-- ============================================================================
-- 场景6CTASCreate Table As Select
-- ============================================================================
-- 从查询结果创建新表
CREATE TABLE db_name.new_table AS
SELECT
department,
COUNT(*) AS employee_count,
AVG(salary) AS avg_salary
FROM db_name.employees
WHERE day_id = '${day_id}'
GROUP BY department;
-- ============================================================================
-- 关键规则说明
-- ============================================================================
/*
1. INSERT OVERWRITE vs INSERT INTO
- INSERT OVERWRITE覆盖分区/表数据(推荐,幂等)
- INSERT INTO追加数据可能产生重复
2. 分区表写入必须指定分区
- 静态分区PARTITION (day_id = '${day_id}')
- 动态分区:需先 SET 配置PARTITION (day_id)
- 混合分区PARTITION (day_id = '2026-05-01', region)
3. 动态分区配置
SET hive.exec.dynamic.partition = true;
SET hive.exec.dynamic.partition.mode = nonstrict; -- 允许全动态
SET hive.exec.max.dynamic.partitions = 1000; -- 最大动态分区数
4. 字段顺序
- SELECT 字段顺序必须与目标表列定义一致
- 分区字段在 SELECT 最后(动态分区时)
5. 性能优化
- 多分区插入Multi-Insert一次扫描多次写入
- INSERT OVERWRITE 比 INSERT INTO 更安全(幂等性)
- 大数据量写入时注意 reducer 数量设置
*/