Files
smart-data-dev-skill/one-skill/smart-data-developer/references/sql/templates/doris/create-table-template.sql
2026-05-13 11:03:00 +08:00

210 lines
8.6 KiB
SQL
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
-- =====================================================================
-- @Name: DORIS-D-SQL-{表名}-CREATE
-- @Version: 1.0
-- @Desc: Apache Doris 建表模板(OLAP 多模型)
-- @TargetDatabase: Apache Doris
-- =====================================================================
-- ============================================================================
-- 场景1: Duplicate Key 模型(明细表)
-- ============================================================================
-- 适用:保留原始明细数据,不做预聚合,数据无冗余
-- 特点:数据按 Key 排序存储,支持所有列的查询和聚合
-- Duplicate Key (detail) table template: raw rows are kept as-is, with no
-- pre-aggregation; data is stored sorted by the key columns.
CREATE TABLE IF NOT EXISTS db_name.detail_table (
    -- Key columns (sort columns; listed in DUPLICATE KEY below)
    order_id      BIGINT          COMMENT '订单ID',
    order_date    DATE            COMMENT '订单日期',
    user_id       BIGINT          COMMENT '用户ID',

    -- Value columns
    user_name     VARCHAR(50)     COMMENT '用户姓名',
    product_id    BIGINT          COMMENT '商品ID',
    product_name  VARCHAR(200)    COMMENT '商品名称',
    quantity      INT             COMMENT '购买数量',
    unit_price    DECIMAL(18, 2)  COMMENT '单价',
    total_amount  DECIMAL(18, 2)  COMMENT '总金额',
    status        VARCHAR(20)     COMMENT '订单状态',
    create_time   DATETIME        COMMENT '创建时间'
)
DUPLICATE KEY (order_id, order_date, user_id)
COMMENT '订单明细表'
-- One range partition per month; each upper bound is exclusive.
PARTITION BY RANGE (order_date) (
    PARTITION p202601 VALUES LESS THAN ('2026-02-01'),
    PARTITION p202602 VALUES LESS THAN ('2026-03-01'),
    PARTITION p202603 VALUES LESS THAN ('2026-04-01')
)
DISTRIBUTED BY HASH (order_id) BUCKETS 8
PROPERTIES (
    'replication_num' = '3',
    'storage_format'  = 'V2'
);
-- ============================================================================
-- 场景2: Aggregate Key 模型(聚合表)
-- ============================================================================
-- 适用:预聚合场景,相同 Key 的数据自动合并
-- 特点: Value 列必须指定聚合函数(SUM, REPLACE, MAX, MIN, HLL_UNION, BITMAP_UNION)
-- Aggregate Key table template: rows sharing the same key are merged
-- automatically, each value column using the aggregate function declared on it.
CREATE TABLE IF NOT EXISTS db_name.agg_table (
    -- Key columns (aggregation dimensions)
    stat_date     DATE                    COMMENT '统计日期',
    department    VARCHAR(100)            COMMENT '部门名称',
    region        VARCHAR(100)            COMMENT '地区',

    -- Value columns (each declares its aggregate function)
    order_count   BIGINT          SUM     COMMENT '订单总数',
    total_amount  DECIMAL(18, 2)  SUM     COMMENT '总金额',
    unique_users  BIGINT          REPLACE COMMENT '去重用户数(预计算值)',
    max_amount    DECIMAL(18, 2)  MAX     COMMENT '最大金额',
    last_update   DATETIME        REPLACE COMMENT '最后更新时间'
)
AGGREGATE KEY (stat_date, department, region)
COMMENT '部门销售聚合表'
-- One range partition per month; each upper bound is exclusive.
PARTITION BY RANGE (stat_date) (
    PARTITION p202601 VALUES LESS THAN ('2026-02-01'),
    PARTITION p202602 VALUES LESS THAN ('2026-03-01')
)
DISTRIBUTED BY HASH (department) BUCKETS 8
PROPERTIES (
    'replication_num' = '3',
    'storage_format'  = 'V2'
);
-- ============================================================================
-- 场景3: Unique Key 模型(唯一主键表)
-- ============================================================================
-- 适用:需要按主键更新/去重的场景
-- 特点:相同主键的数据保留最新一条(整行替换)
-- Unique Key table template: for data that is updated or de-duplicated by
-- primary key; for rows with the same key only the latest one is kept
-- (whole-row replacement).
CREATE TABLE IF NOT EXISTS db_name.unique_table (
    -- Key column (primary key, must be unique)
    user_id        BIGINT        COMMENT '用户ID',

    -- Value columns
    user_name      VARCHAR(50)   COMMENT '用户姓名',
    phone          VARCHAR(20)   COMMENT '手机号',
    email          VARCHAR(100)  COMMENT '邮箱',
    vip_level      INT           COMMENT 'VIP等级',
    register_date  DATE          COMMENT '注册日期',
    last_login     DATETIME      COMMENT '最后登录时间',
    status         VARCHAR(10)   COMMENT '状态'
)
UNIQUE KEY (user_id)
COMMENT '用户信息表(按主键更新)'
DISTRIBUTED BY HASH (user_id) BUCKETS 16
PROPERTIES (
    'replication_num' = '3',
    -- Doris's merge-on-write switch for the Unique Key model. The previous
    -- key, 'enable_unique_key_merge_based_on_replica', is not a Doris table
    -- property, and Doris rejects CREATE TABLE with unknown properties.
    'enable_unique_key_merge_on_write' = 'true'
);
-- ============================================================================
-- 场景4: 带动态分区属性
-- ============================================================================
-- 适用:按日自动创建和管理分区
-- Dynamic-partition table template: daily partitions are created and managed
-- automatically through the dynamic_partition.* properties.
CREATE TABLE IF NOT EXISTS db_name.auto_partition_table (
    stat_date     DATE                COMMENT '统计日期',
    department    VARCHAR(100)        COMMENT '部门',
    metric_value  DECIMAL(18, 2)  SUM COMMENT '指标值',
    record_count  BIGINT          SUM COMMENT '记录数'
)
AGGREGATE KEY (stat_date, department)
COMMENT '自动分区示例表'
-- The partition list is intentionally empty; no partitions are declared here.
PARTITION BY RANGE (stat_date) ()
DISTRIBUTED BY HASH (department) BUCKETS 8
PROPERTIES (
    'replication_num' = '3',
    'dynamic_partition.enable' = 'true',
    'dynamic_partition.time_unit' = 'DAY',
    -- Keep 30 days of history.
    'dynamic_partition.start' = '-30',
    -- Pre-create partitions 3 days ahead.
    'dynamic_partition.end' = '3',
    'dynamic_partition.prefix' = 'p',
    'dynamic_partition.buckets' = '8'
);
-- ============================================================================
-- 场景5: 多分区 + 多分桶
-- ============================================================================
-- Multi-partition / multi-bucket table template: monthly range partitions on
-- stat_date, hash-distributed on user_id into 32 buckets.
CREATE TABLE IF NOT EXISTS db_name.multi_partition_table (
    -- Key columns (listed in AGGREGATE KEY below)
    stat_date  DATE                COMMENT '统计日期',
    region     VARCHAR(50)         COMMENT '地区',
    city       VARCHAR(50)         COMMENT '城市',
    user_id    BIGINT              COMMENT '用户ID',

    -- Value column, summed across rows with the same key
    amount     DECIMAL(18, 2)  SUM COMMENT '金额'
)
AGGREGATE KEY (stat_date, region, city, user_id)
COMMENT '多维度分区示例'
-- One range partition per month; each upper bound is exclusive.
PARTITION BY RANGE (stat_date) (
    PARTITION p202601 VALUES LESS THAN ('2026-02-01'),
    PARTITION p202602 VALUES LESS THAN ('2026-03-01')
)
DISTRIBUTED BY HASH (user_id) BUCKETS 32
PROPERTIES (
    'replication_num' = '3',
    'in_memory'       = 'false',
    'storage_format'  = 'V2',
    'compression'     = 'LZ4'
);
-- ============================================================================
-- 字段类型速查
-- ============================================================================
/*
| 类型 | 说明 | 适用场景 |
|---------------|----------------|------------------------|
| BOOLEAN | 布尔 | 状态标志 |
| TINYINT | 1字节整数 | 小范围枚举 |
| SMALLINT | 2字节整数 | 小范围数值 |
| INT | 4字节整数 | 数量、等级 |
| BIGINT | 8字节整数 | ID、计数、大数值 |
| LARGEINT | 16字节整数 | 超大数值 |
| FLOAT | 4字节浮点 | 近似计算 |
| DOUBLE | 8字节浮点 | 科学计算 |
| DECIMAL(p,s) | 定点数 | 金额、精确数值 |
| DATE | 日期 | 日期字段(无时间) |
| DATETIME | 日期时间 | 时间戳(精确到秒) |
| CHAR(n) | 定长字符串 | 固定长度编码 |
| VARCHAR(n) | 变长字符串 | 名称、描述 |
| STRING | 变长字符串 | 大文本(无长度限制) |
| BITMAP | 位图 | 精确去重(仅聚合模型) |
| HLL | HyperLogLog | 近似去重(仅聚合模型) |
| JSON | JSON | JSON数据存储 |
*/
-- ============================================================================
-- 建表规范说明
-- ============================================================================
/*
1. 模型选择
- Duplicate Key: 保留原始明细, 不做预聚合
- Aggregate Key: 预聚合, 相同 Key 的 Value 自动合并
- Unique Key: 按主键去重, 保留最新数据
2. 分区设计
- 按时间字段 RANGE 分区(最常用)
- 支持动态分区自动管理
- 单分区数据量建议 1GB~10GB
3. 分桶设计
- 使用高基数列做 HASH 分桶
- 分桶数 = BE节点数 × CPU核数(参考值)
- 单桶数据量建议 100MB~1GB
4. 副本数
- 生产环境建议 3 副本
- 测试环境可设 1 副本
5. Key 列选择
- Duplicate Key: 高频过滤/排序字段
- Aggregate Key: 聚合维度字段
- Unique Key: 业务主键
6. 注意事项
- Key 列必须在 Value 列之前
- 分区列必须是 Key 列
- 分桶列必须是 Key 列(Aggregate/Unique 模型; Duplicate 模型可使用 Value 列)
- BITMAP/HLL 仅用于 Aggregate 模型的 Value 列
*/