210 lines
8.6 KiB
SQL
210 lines
8.6 KiB
SQL
-- =====================================================================
|
||
-- @Name: DORIS-D-SQL-{表名}-CREATE
|
||
-- @Version: 1.0
|
||
-- @Desc: Apache Doris 建表模板(OLAP 多模型)
|
||
-- @TargetDatabase: Apache Doris
|
||
-- =====================================================================
|
||
|
||
-- ============================================================================
-- Scenario 1: Duplicate Key model (detail table)
-- ============================================================================
-- Use when: raw detail rows are kept as-is, without pre-aggregation and
--           without redundant data.
-- Traits:   rows are stored sorted by the Key columns; every column can be
--           queried and aggregated.

CREATE TABLE IF NOT EXISTS db_name.detail_table (
    -- Key columns (sort columns); they are listed first and match DUPLICATE KEY
    order_id      BIGINT          COMMENT '订单ID',
    order_date    DATE            COMMENT '订单日期',
    user_id       BIGINT          COMMENT '用户ID',

    -- Value columns
    user_name     VARCHAR(50)     COMMENT '用户姓名',
    product_id    BIGINT          COMMENT '商品ID',
    product_name  VARCHAR(200)    COMMENT '商品名称',
    quantity      INT             COMMENT '购买数量',
    unit_price    DECIMAL(18,2)   COMMENT '单价',
    total_amount  DECIMAL(18,2)   COMMENT '总金额',
    status        VARCHAR(20)     COMMENT '订单状态',
    create_time   DATETIME        COMMENT '创建时间'
)
DUPLICATE KEY (order_id, order_date, user_id)
COMMENT '订单明细表'
-- One partition per month; each upper bound is exclusive (LESS THAN).
PARTITION BY RANGE (order_date) (
    PARTITION p202601 VALUES LESS THAN ('2026-02-01'),
    PARTITION p202602 VALUES LESS THAN ('2026-03-01'),
    PARTITION p202603 VALUES LESS THAN ('2026-04-01')
)
DISTRIBUTED BY HASH (order_id) BUCKETS 8
PROPERTIES (
    'replication_num' = '3',
    'storage_format'  = 'V2'
);
|
||
|
||
-- ============================================================================
-- Scenario 2: Aggregate Key model (aggregate table)
-- ============================================================================
-- Use when: data should be pre-aggregated; rows sharing the same Key are
--           merged automatically.
-- Traits:   every Value column must declare an aggregate function
--           (SUM, REPLACE, MAX, MIN, HLL_UNION, BITMAP_UNION).

CREATE TABLE IF NOT EXISTS db_name.agg_table (
    -- Key columns (aggregation dimensions)
    stat_date     DATE                    COMMENT '统计日期',
    department    VARCHAR(100)            COMMENT '部门名称',
    region        VARCHAR(100)            COMMENT '地区',

    -- Value columns (each carries its aggregate function)
    order_count   BIGINT         SUM      COMMENT '订单总数',
    total_amount  DECIMAL(18,2)  SUM      COMMENT '总金额',
    unique_users  BIGINT         REPLACE  COMMENT '去重用户数(预计算值)',
    max_amount    DECIMAL(18,2)  MAX      COMMENT '最大金额',
    last_update   DATETIME       REPLACE  COMMENT '最后更新时间'
)
AGGREGATE KEY (stat_date, department, region)
COMMENT '部门销售聚合表'
-- One partition per month; each upper bound is exclusive (LESS THAN).
PARTITION BY RANGE (stat_date) (
    PARTITION p202601 VALUES LESS THAN ('2026-02-01'),
    PARTITION p202602 VALUES LESS THAN ('2026-03-01')
)
DISTRIBUTED BY HASH (department) BUCKETS 8
PROPERTIES (
    'replication_num' = '3',
    'storage_format'  = 'V2'
);
|
||
|
||
-- ============================================================================
-- Scenario 3: Unique Key model (unique primary-key table)
-- ============================================================================
-- Use when: rows must be updated or de-duplicated by primary key.
-- Traits:   for rows sharing the same key only the latest one is kept
--           (whole-row replacement).

CREATE TABLE IF NOT EXISTS db_name.unique_table (
    -- Key column (primary key, must be unique)
    user_id        BIGINT        COMMENT '用户ID',

    -- Value columns
    user_name      VARCHAR(50)   COMMENT '用户姓名',
    phone          VARCHAR(20)   COMMENT '手机号',
    email          VARCHAR(100)  COMMENT '邮箱',
    vip_level      INT           COMMENT 'VIP等级',
    register_date  DATE          COMMENT '注册日期',
    last_login     DATETIME      COMMENT '最后登录时间',
    status         VARCHAR(10)   COMMENT '状态'
)
UNIQUE KEY (user_id)
COMMENT '用户信息表(按主键更新)'
DISTRIBUTED BY HASH (user_id) BUCKETS 16
PROPERTIES (
    'replication_num' = '3',
    -- Merge-on-write switch for the Unique Key model. The property name is
    -- 'enable_unique_key_merge_on_write'; the earlier
    -- 'enable_unique_key_merge_based_on_replica' is not a Doris table property
    -- and makes CREATE TABLE fail with an unknown-property error.
    'enable_unique_key_merge_on_write' = 'true'
);
|
||
|
||
-- ============================================================================
-- Scenario 4: table with dynamic-partition properties
-- ============================================================================
-- Use when: daily partitions should be created and managed automatically.

CREATE TABLE IF NOT EXISTS db_name.auto_partition_table (
    -- Key columns
    stat_date     DATE                COMMENT '统计日期',
    department    VARCHAR(100)        COMMENT '部门',

    -- Value columns (summed on merge)
    metric_value  DECIMAL(18,2)  SUM  COMMENT '指标值',
    record_count  BIGINT         SUM  COMMENT '记录数'
)
AGGREGATE KEY (stat_date, department)
COMMENT '自动分区示例表'
-- No partitions are listed here; the dynamic_partition.* properties below
-- describe how they are managed.
PARTITION BY RANGE (stat_date) ()
DISTRIBUTED BY HASH (department) BUCKETS 8
PROPERTIES (
    'replication_num'             = '3',
    'dynamic_partition.enable'    = 'true',
    'dynamic_partition.time_unit' = 'DAY',
    'dynamic_partition.start'     = '-30',  -- keep 30 days of history
    'dynamic_partition.end'       = '3',    -- pre-create 3 days ahead
    'dynamic_partition.prefix'    = 'p',
    'dynamic_partition.buckets'   = '8'
);
|
||
|
||
-- ============================================================================
-- Scenario 5: multiple partitions + multiple buckets
-- ============================================================================

CREATE TABLE IF NOT EXISTS db_name.multi_partition_table (
    -- Key columns
    stat_date  DATE                COMMENT '统计日期',
    region     VARCHAR(50)         COMMENT '地区',
    city       VARCHAR(50)         COMMENT '城市',
    user_id    BIGINT              COMMENT '用户ID',

    -- Value column (summed on merge)
    amount     DECIMAL(18,2)  SUM  COMMENT '金额'
)
AGGREGATE KEY (stat_date, region, city, user_id)
COMMENT '多维度分区示例'
-- One partition per month; each upper bound is exclusive (LESS THAN).
PARTITION BY RANGE (stat_date) (
    PARTITION p202601 VALUES LESS THAN ('2026-02-01'),
    PARTITION p202602 VALUES LESS THAN ('2026-03-01')
)
DISTRIBUTED BY HASH (user_id) BUCKETS 32
PROPERTIES (
    'replication_num' = '3',
    'in_memory'       = 'false',
    'storage_format'  = 'V2',
    'compression'     = 'LZ4'
);
|
||
|
||
-- ============================================================================
|
||
-- 字段类型速查
|
||
-- ============================================================================
|
||
/*
|
||
| 类型 | 说明 | 适用场景 |
|
||
|---------------|----------------|------------------------|
|
||
| BOOLEAN | 布尔 | 状态标志 |
|
||
| TINYINT | 1字节整数 | 小范围枚举 |
|
||
| SMALLINT | 2字节整数 | 小范围数值 |
|
||
| INT | 4字节整数 | 数量、等级 |
|
||
| BIGINT | 8字节整数 | ID、计数、大数值 |
|
||
| LARGEINT | 16字节整数 | 超大数值 |
|
||
| FLOAT | 4字节浮点 | 近似计算 |
|
||
| DOUBLE | 8字节浮点 | 科学计算 |
|
||
| DECIMAL(p,s) | 定点数 | 金额、精确数值 |
|
||
| DATE | 日期 | 日期字段(无时间) |
|
||
| DATETIME | 日期时间 | 时间戳(精确到秒) |
|
||
| CHAR(n) | 定长字符串 | 固定长度编码 |
|
||
| VARCHAR(n) | 变长字符串 | 名称、描述 |
|
||
| STRING | 变长字符串 | 大文本(无长度限制) |
|
||
| BITMAP | 位图 | 精确去重(仅聚合模型) |
|
||
| HLL | HyperLogLog | 近似去重(仅聚合模型) |
|
||
| JSON | JSON | JSON数据存储 |
|
||
*/
|
||
|
||
-- ============================================================================
|
||
-- 建表规范说明
|
||
-- ============================================================================
|
||
/*
|
||
1. 模型选择
|
||
- Duplicate Key:保留原始明细,不做预聚合
|
||
- Aggregate Key:预聚合,相同 Key 的 Value 自动合并
|
||
- Unique Key:按主键去重,保留最新数据
|
||
|
||
2. 分区设计
|
||
- 按时间字段 RANGE 分区(最常用)
|
||
- 支持动态分区自动管理
|
||
- 单分区数据量建议 1GB~10GB
|
||
|
||
3. 分桶设计
|
||
- 使用高基数列做 HASH 分桶
|
||
- 分桶数 = BE节点数 × CPU核数(参考值)
|
||
- 单桶数据量建议 100MB~1GB
|
||
|
||
4. 副本数
|
||
- 生产环境建议 3 副本
|
||
- 测试环境可设 1 副本
|
||
|
||
5. Key 列选择
|
||
- Duplicate Key:高频过滤/排序字段
|
||
- Aggregate Key:聚合维度字段
|
||
- Unique Key:业务主键
|
||
|
||
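6. Notes
- Key columns must be declared before Value columns
- Partition columns must be Key columns
- Bucketing columns must be Key columns in the Aggregate and Unique models; the Duplicate model also allows Value columns
- BITMAP/HLL are only used as Value columns of the Aggregate model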
|
||
*/
|