179 lines
5.6 KiB
SQL
179 lines
5.6 KiB
SQL
-- =====================================================================
|
||
-- @SparkSqlName: PAIMONA-D-SQL-{表名}-PARTITION
|
||
-- @Version: 1.0
|
||
-- @Desc: 分区表操作模板
|
||
-- @TargetTables: {分区表名}
|
||
-- @TargetDatabase: Paimon
|
||
-- =====================================================================
|
||
|
||
-- ============================================================================
|
||
-- 分区表创建
|
||
-- ============================================================================
|
||
|
||
CREATE TABLE IF NOT EXISTS ${db_eda_env}.daily_partition_table (
|
||
id BIGINT COMMENT '主键ID',
|
||
user_id STRING COMMENT '用户ID',
|
||
amount DECIMAL(18,2) COMMENT '金额',
|
||
etl_time TIMESTAMP COMMENT '数据加工时间'
|
||
)
|
||
COMMENT '按日分区表'
|
||
PARTITIONED BY (day_id STRING COMMENT '统计日期')
|
||
STORED AS PARQUET;
|
||
|
||
-- ============================================================================
|
||
-- 分区写入操作
|
||
-- ============================================================================
|
||
|
||
-- 1. 静态分区写入(指定分区值)
|
||
INSERT OVERWRITE TABLE ${db_eda_env}.daily_partition_table
|
||
PARTITION (day_id = '2026-05-09')
|
||
SELECT
|
||
id,
|
||
user_id,
|
||
amount,
|
||
current_timestamp() AS etl_time
|
||
FROM source_table
|
||
WHERE day_id = '${day_id}';
|
||
|
||
-- 2. 动态分区写入(数据中包含分区值)
|
||
-- 需要先设置动态分区模式
|
||
SET spark.sql.partitionOverwriteMode = dynamic;
|
||
|
||
INSERT OVERWRITE TABLE ${db_eda_env}.daily_partition_table
|
||
PARTITION (day_id) -- 动态分区字段
|
||
SELECT
|
||
id,
|
||
user_id,
|
||
amount,
|
||
current_timestamp() AS etl_time,
|
||
day_id -- 数据中包含分区值
|
||
FROM source_table
|
||
WHERE day_id BETWEEN '2026-05-01' AND '2026-05-09';
|
||
|
||
-- ============================================================================
|
||
-- 分区查询操作
|
||
-- ============================================================================
|
||
|
||
-- 3. 单分区查询
|
||
SELECT *
|
||
FROM ${db_eda_env}.daily_partition_table
|
||
WHERE day_id = '2026-05-09';
|
||
|
||
-- 4. 多分区查询
|
||
SELECT *
|
||
FROM ${db_eda_env}.daily_partition_table
|
||
WHERE day_id IN ('2026-05-01', '2026-05-02', '2026-05-03');
|
||
|
||
-- 5. 分区范围查询
|
||
SELECT *
|
||
FROM ${db_eda_env}.daily_partition_table
|
||
WHERE day_id >= '2026-05-01'
|
||
AND day_id <= '2026-05-09';
|
||
|
||
-- 6. 最近 N 天分区查询(动态计算)
|
||
SELECT *
|
||
FROM ${db_eda_env}.daily_partition_table
|
||
WHERE day_id >= date_format(date_sub(current_date(), 30), 'yyyy-MM-dd');
|
||
|
||
-- ============================================================================
|
||
-- 分区管理操作
|
||
-- ============================================================================
|
||
|
||
-- 7. 查看分区列表
|
||
SHOW PARTITIONS ${db_eda_env}.daily_partition_table;
|
||
|
||
-- 8. 查看特定分区详情
|
||
DESCRIBE EXTENDED ${db_eda_env}.daily_partition_table PARTITION (day_id = '2026-05-09');
|
||
|
||
-- 9. 添加分区(手动创建空分区,部分表类型支持)
|
||
ALTER TABLE ${db_eda_env}.daily_partition_table
|
||
ADD IF NOT EXISTS PARTITION (day_id = '2026-05-10');
|
||
|
||
-- 10. 删除分区(清理历史数据)
|
||
ALTER TABLE ${db_eda_env}.daily_partition_table
|
||
DROP IF EXISTS PARTITION (day_id = '2026-01-01');
|
||
|
||
-- ============================================================================
|
||
-- 多分区字段操作
|
||
-- ============================================================================
|
||
|
||
-- 11. 多分区字段表创建
|
||
CREATE TABLE IF NOT EXISTS ${db_eda_env}.multi_partition_table (
|
||
id BIGINT,
|
||
name STRING,
|
||
amount DECIMAL(18,2),
|
||
etl_time TIMESTAMP
|
||
)
|
||
PARTITIONED BY (year_id STRING, month_id STRING)
|
||
STORED AS PARQUET;
|
||
|
||
-- 12. 多分区字段写入
|
||
INSERT OVERWRITE TABLE ${db_eda_env}.multi_partition_table
|
||
PARTITION (year_id = '2026', month_id = '05')
|
||
SELECT
|
||
id,
|
||
name,
|
||
amount,
|
||
current_timestamp() AS etl_time
|
||
FROM source_table
|
||
WHERE year_id = '2026' AND month_id = '05';
|
||
|
||
-- 13. 多分区字段动态写入
|
||
SET spark.sql.partitionOverwriteMode = dynamic;
|
||
|
||
INSERT OVERWRITE TABLE ${db_eda_env}.multi_partition_table
|
||
PARTITION (year_id, month_id)
|
||
SELECT
|
||
id,
|
||
name,
|
||
amount,
|
||
current_timestamp() AS etl_time,
|
||
year_id,
|
||
month_id
|
||
FROM source_table;
|
||
|
||
-- ============================================================================
|
||
-- 分区数据清理
|
||
-- ============================================================================
|
||
|
||
-- 14. 清理指定分区数据
|
||
INSERT OVERWRITE TABLE ${db_eda_env}.daily_partition_table
|
||
PARTITION (day_id = '2026-05-09')
|
||
SELECT * FROM ${db_eda_env}.daily_partition_table
|
||
WHERE day_id = '2026-05-09'
|
||
AND status = 'valid'; -- 只保留有效数据
|
||
|
||
-- 15. 清理 N 天前分区(批量)
|
||
-- 使用脚本或程序循环执行
|
||
-- ALTER TABLE xxx DROP PARTITION (day_id = '历史分区')
|
||
|
||
-- ============================================================================
|
||
-- 分区最佳实践
|
||
-- ============================================================================
|
||
/*
|
||
1. 分区字段选择原则
|
||
- 查询高频过滤字段
|
||
- 数据量分布均匀的字段
|
||
- 时间字段最常用(day_id, month_id)
|
||
|
||
2. 分区粒度选择
|
||
- 日增量数据 → day_id 分区
|
||
- 月增量数据 → month_id 分区
|
||
- 大数据量 → 可细分到 hour_id
|
||
|
||
3. 分区数量控制
|
||
- 单表分区数建议 < 10000
|
||
- 过多分区影响元数据性能
|
||
|
||
4. 查询必须带分区过滤
|
||
- 避免:SELECT * FROM table(全表扫描)
|
||
- 推荐:SELECT * FROM table WHERE day_id = '${day_id}'
|
||
|
||
5. 动态分区写入设置
|
||
- SET spark.sql.partitionOverwriteMode = dynamic;
|
||
- 避免误覆盖其他分区
|
||
|
||
6. 分区数据清理
|
||
- 定期清理历史分区(如保留近90天)
|
||
- 使用 ALTER TABLE DROP PARTITION
|
||
*/ |