Files
smart-data-dev-skill/one-skill/smart-data-developer/references/sql/templates/kudu/create-table-template.sql
2026-05-13 11:03:00 +08:00

212 lines
8.5 KiB
SQL
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
-- =====================================================================
-- @Name: KUDU-D-SQL-{表名}-CREATE
-- @Version: 1.0
-- @Desc: Kudu (via Impala) 建表模板
-- @TargetDatabase: Apache Kudu (via Impala)
-- @说明: Kudu 通过 Impala 访问,使用 Impala DDL 操作 Kudu 表
-- =====================================================================
-- ============================================================================
-- 场景1:基础表创建(Hash 分区)
-- ============================================================================
-- 适用:按主键 Hash 分布数据,写入和点查性能好
-- Template: basic Kudu table, hash-partitioned on its single-column primary key.
CREATE TABLE IF NOT EXISTS db_name.kudu_basic (
    -- Primary-key column: every Kudu table needs one; key columns are listed
    -- first and must be NOT NULL.
    id         BIGINT        NOT NULL COMMENT '主键ID',
    -- Business columns (nullable).
    name       STRING        COMMENT '名称',
    category   STRING        COMMENT '类别',
    amount     DECIMAL(18,2) COMMENT '金额',
    status     STRING        COMMENT '状态',
    created_at TIMESTAMP     COMMENT '创建时间',
    updated_at TIMESTAMP     COMMENT '更新时间',
    -- With an explicit column list, Impala expects the PRIMARY KEY clause
    -- inside the parentheses; placing it after ")" is a syntax error.
    PRIMARY KEY (id)
)
-- 8 hash buckets on id.
PARTITION BY HASH (id) PARTITIONS 8
STORED AS KUDU
TBLPROPERTIES (
    -- Replication factor of each tablet.
    'kudu.num_tablet_replicas' = '3'
);
-- ============================================================================
-- 场景2:Hash + Range 组合分区
-- ============================================================================
-- 适用:按时间范围 + Hash 组合,兼顾范围查询和写入性能
-- Template: Kudu table combining hash partitioning on id with monthly range
-- partitions on stat_date.
CREATE TABLE IF NOT EXISTS db_name.kudu_range_hash (
    -- Primary-key columns: listed first, NOT NULL, and they must include every
    -- partitioning column (id and stat_date).
    id           BIGINT        NOT NULL COMMENT '主键ID',
    stat_date    STRING        NOT NULL COMMENT '统计日期 yyyy-MM-dd',
    -- Business columns. Compression in Kudu is a per-column attribute, so LZ4
    -- is requested here rather than through a table property.
    department   STRING        COMPRESSION LZ4 COMMENT '部门',
    metric_name  STRING        COMPRESSION LZ4 COMMENT '指标名称',
    metric_value DECIMAL(18,2) COMMENT '指标值',
    etl_time     TIMESTAMP     COMMENT '加工时间',
    -- With an explicit column list, Impala expects the PRIMARY KEY clause
    -- inside the parentheses; placing it after ")" is a syntax error.
    PRIMARY KEY (id, stat_date)
)
PARTITION BY
    HASH (id) PARTITIONS 4,
    -- Half-open monthly ranges; stat_date is a 'yyyy-MM-dd' string, so the
    -- bounds compare lexicographically.
    RANGE (stat_date) (
        PARTITION '2026-01-01' <= VALUES < '2026-02-01',
        PARTITION '2026-02-01' <= VALUES < '2026-03-01',
        PARTITION '2026-03-01' <= VALUES < '2026-04-01',
        PARTITION '2026-04-01' <= VALUES < '2026-05-01',
        PARTITION '2026-05-01' <= VALUES < '2026-06-01'
    )
STORED AS KUDU
TBLPROPERTIES (
    -- Replication factor of each tablet.
    'kudu.num_tablet_replicas' = '3'
);
-- ============================================================================
-- 场景3:多列主键
-- ============================================================================
-- Template: Kudu table with a three-column composite primary key, hash
-- partitioned on user_id and range partitioned by month on order_date.
CREATE TABLE IF NOT EXISTS db_name.kudu_composite_pk (
    -- Primary-key columns: listed first, in key order, all NOT NULL.
    user_id      BIGINT        NOT NULL COMMENT '用户ID',
    order_date   STRING        NOT NULL COMMENT '订单日期',
    order_seq    INT           NOT NULL COMMENT '当日订单序号',
    -- Business columns (nullable).
    user_name    STRING        COMMENT '用户姓名',
    product_name STRING        COMMENT '商品名称',
    quantity     INT           COMMENT '数量',
    total_amount DECIMAL(18,2) COMMENT '总金额',
    status       STRING        COMMENT '状态',
    create_time  TIMESTAMP     COMMENT '创建时间',
    -- With an explicit column list, Impala expects the PRIMARY KEY clause
    -- inside the parentheses; placing it after ")" is a syntax error.
    PRIMARY KEY (user_id, order_date, order_seq)
)
PARTITION BY
    HASH (user_id) PARTITIONS 8,
    -- Half-open monthly ranges on the order_date string.
    RANGE (order_date) (
        PARTITION '2026-01-01' <= VALUES < '2026-02-01',
        PARTITION '2026-02-01' <= VALUES < '2026-03-01',
        PARTITION '2026-03-01' <= VALUES < '2026-04-01'
    )
STORED AS KUDU
TBLPROPERTIES (
    -- Replication factor of each tablet.
    'kudu.num_tablet_replicas' = '3'
);
-- ============================================================================
-- 场景4:纯 Range 分区
-- ============================================================================
-- 适用:按时间顺序写入,范围查询多
-- Template: Kudu table partitioned only by quarterly ranges on stat_date.
CREATE TABLE IF NOT EXISTS db_name.kudu_range_only (
    -- Primary-key columns: listed first and NOT NULL; stat_date is part of the
    -- key because it is the range-partitioning column.
    id           BIGINT        NOT NULL COMMENT '主键ID',
    stat_date    STRING        NOT NULL COMMENT '统计日期',
    -- Business columns (nullable).
    metric_value DECIMAL(18,2) COMMENT '指标值',
    dimension    STRING        COMMENT '维度',
    etl_time     TIMESTAMP     COMMENT '加工时间',
    -- With an explicit column list, Impala expects the PRIMARY KEY clause
    -- inside the parentheses; placing it after ")" is a syntax error.
    PRIMARY KEY (id, stat_date)
)
-- Half-open quarterly ranges covering 2026. Rows whose stat_date falls outside
-- every declared range are rejected, so add new ranges before they are needed.
PARTITION BY RANGE (stat_date) (
    PARTITION '2026-01-01' <= VALUES < '2026-04-01',
    PARTITION '2026-04-01' <= VALUES < '2026-07-01',
    PARTITION '2026-07-01' <= VALUES < '2026-10-01',
    PARTITION '2026-10-01' <= VALUES < '2027-01-01'
)
STORED AS KUDU
TBLPROPERTIES (
    -- Replication factor of each tablet.
    'kudu.num_tablet_replicas' = '3'
);
-- ============================================================================
-- 场景5:外部表映射已有 Kudu 表
-- ============================================================================
-- Template: expose an already-existing Kudu table to Impala as an external
-- table. No column list is declared in this statement.
CREATE EXTERNAL TABLE IF NOT EXISTS db_name.kudu_external
    STORED AS KUDU
    TBLPROPERTIES (
        -- Name of the underlying table as known to Kudu.
        -- NOTE(review): placeholder value; confirm the real Kudu-side name
        -- (tables created through Impala are commonly named
        -- 'impala::db.table') before using this template.
        'kudu.table_name' = 'impala.db_name.existing_table',
        -- Comma-separated host:port list of the Kudu masters.
        'kudu.master_addresses' = 'kudu-master-1:7051,kudu-master-2:7051,kudu-master-3:7051'
    );
-- ============================================================================
-- 场景6:带压缩和副本配置
-- ============================================================================
-- Template: Kudu table showing compression and replica configuration.
CREATE TABLE IF NOT EXISTS db_name.kudu_with_props (
    -- Primary-key columns: listed first and NOT NULL.
    id        BIGINT NOT NULL COMMENT '主键ID',
    data_date STRING NOT NULL COMMENT '数据日期',
    -- Compression is configured per column with the COMPRESSION attribute
    -- (LZ4, SNAPPY or ZLIB); there is no table-wide compression property.
    content   STRING COMPRESSION LZ4 COMMENT '内容',
    value     DOUBLE COMMENT '数值',
    -- With an explicit column list, Impala expects the PRIMARY KEY clause
    -- inside the parentheses; placing it after ")" is a syntax error.
    PRIMARY KEY (id, data_date)
)
PARTITION BY
    HASH (id) PARTITIONS 8,
    -- Half-open monthly ranges on the data_date string.
    RANGE (data_date) (
        PARTITION '2026-01-01' <= VALUES < '2026-02-01',
        PARTITION '2026-02-01' <= VALUES < '2026-03-01'
    )
STORED AS KUDU
TBLPROPERTIES (
    -- Replication factor of each tablet.
    -- Compression and encryption are intentionally not set here: compression
    -- is the column attribute above, and encryption at rest is a cluster-level
    -- Kudu setting, so table properties for them would have no effect.
    'kudu.num_tablet_replicas' = '3'
);
-- ============================================================================
-- 字段类型速查(Kudu 支持的类型)
-- ============================================================================
/*
| 类型 | 说明 | 适用场景 |
|---------------|----------------|------------------------|
| BOOLEAN | 布尔 | 状态标志 |
| TINYINT | 1字节整数 | 小范围枚举 |
| SMALLINT | 2字节整数 | 小范围数值 |
| INT | 4字节整数 | 数量、等级 |
| BIGINT | 8字节整数 | ID、计数 |
| FLOAT | 4字节浮点 | 近似计算 |
| DOUBLE | 8字节浮点 | 科学计算 |
| DECIMAL(p,s) | 定点数 | 金额、精确数值 |
| STRING | 变长字符串 | 名称、描述 |
| VARCHAR(n) | 变长字符串 | 限定长度字符串 |
| CHAR(n) | 定长字符串 | Impala 的 Kudu 表不支持,请改用 STRING 或 VARCHAR(n) |
| TIMESTAMP | 时间戳 | 时间字段(微秒精度) |
| DATE | 日期 | 日期字段 |
| BINARY | 二进制 | 二进制数据 |
注意:Kudu 表不支持 CHAR,也不支持 ARRAY, MAP, STRUCT 等复杂类型
*/
-- ============================================================================
-- 建表规范说明
-- ============================================================================
/*
1. 主键约束(Kudu 特有)
- 每张 Kudu 表必须有 PRIMARY KEY
- 主键列不能为 NULL,必须 NOT NULL
- 主键值不可 UPDATE,只能删除后重新插入
- 分区列必须是主键列的子集(即主键必须包含所有分区列)
2. 分区策略
- Hash 分区:均匀分布,适合写入和点查
- Range 分区:按范围查询,适合时间序列
- Hash + Range 组合:兼顾两者优势(推荐)
- 分区数 = tablet 数量,影响并行度
3. 分区设计建议
- Hash 分区数:建议 4 的倍数,参考数据量
- Range 分区:按时间维度,定期添加新分区
- 单个 tablet 建议 1GB~10GB
4. 副本数
- 生产环境建议 3 副本(默认)
- Raft 协议保证一致性
5. 压缩
- 推荐 LZ4(速度和压缩比平衡)
- 可选:SNAPPY, ZLIB, LZ4
- 压缩按列设置:在列定义上使用 COMPRESSION 属性,而不是 TBLPROPERTIES
6. 与 Hive/Spark 表的区别
- Kudu 表支持 UPDATE 和 DELETE
- Kudu 表不支持 INSERT OVERWRITE
- Kudu 表不支持复杂类型(ARRAY, MAP, STRUCT)
- Kudu 表主键有约束,Hive/Spark 无约束
*/