Add one-skill
This commit is contained in:
@@ -0,0 +1,459 @@
|
||||
# 聚合模式速查
|
||||
|
||||
## 基本聚合
|
||||
|
||||
### 聚合函数列表
|
||||
|
||||
| 函数 | 说明 | 示例 |
|
||||
|------|------|------|
|
||||
| COUNT(*) | 计数(含NULL行) | 总行数 |
|
||||
| COUNT(col) | 计数(不含NULL) | 有效数据数 |
|
||||
| COUNT(DISTINCT col) | 去重计数 | 用户数 |
|
||||
| SUM(col) | 求和 | 总销售额 |
|
||||
| AVG(col) | 平均值 | 平均薪资 |
|
||||
| MIN(col) | 最小值 | 最小年龄 |
|
||||
| MAX(col) | 最大值 | 最高分 |
|
||||
|
||||
### 基本用法
|
||||
|
||||
```sql
|
||||
-- 单列聚合
|
||||
SELECT
|
||||
COUNT(*) AS total_rows,
|
||||
COUNT(DISTINCT user_id) AS unique_users,
|
||||
SUM(amount) AS total_amount,
|
||||
AVG(amount) AS avg_amount,
|
||||
MIN(amount) AS min_amount,
|
||||
MAX(amount) AS max_amount
|
||||
FROM orders
|
||||
|
||||
-- 分组聚合
|
||||
SELECT
|
||||
department,
|
||||
COUNT(*) AS employee_count,
|
||||
AVG(salary) AS avg_salary,
|
||||
MAX(salary) AS max_salary
|
||||
FROM employees
|
||||
GROUP BY department
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## GROUP BY
|
||||
|
||||
```sql
|
||||
-- 单字段分组
|
||||
SELECT
|
||||
category,
|
||||
COUNT(*) AS count
|
||||
FROM products
|
||||
GROUP BY category
|
||||
|
||||
-- 多字段分组
|
||||
SELECT
|
||||
category,
|
||||
brand,
|
||||
COUNT(*) AS count,
|
||||
SUM(price) AS total_price
|
||||
FROM products
|
||||
GROUP BY category, brand
|
||||
|
||||
-- 分组 + 排序
|
||||
SELECT
|
||||
department,
|
||||
COUNT(*) AS count
|
||||
FROM employees
|
||||
GROUP BY department
|
||||
ORDER BY count DESC
|
||||
|
||||
-- 分组 + LIMIT(取Top N组)
|
||||
SELECT
|
||||
category,
|
||||
COUNT(*) AS count
|
||||
FROM products
|
||||
GROUP BY category
|
||||
ORDER BY count DESC
|
||||
LIMIT 10
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## HAVING(分组过滤)
|
||||
|
||||
```sql
|
||||
-- HAVING vs WHERE
|
||||
-- WHERE:过滤原始行(GROUP BY 前)
|
||||
-- HAVING:过滤分组结果(GROUP BY 后)
|
||||
|
||||
-- 示例:筛选订单数大于10的用户
|
||||
SELECT
|
||||
user_id,
|
||||
COUNT(*) AS order_count,
|
||||
SUM(amount) AS total_amount
|
||||
FROM orders
|
||||
GROUP BY user_id
|
||||
HAVING COUNT(*) > 10
|
||||
|
||||
-- 多条件 HAVING
|
||||
SELECT
|
||||
department,
|
||||
AVG(salary) AS avg_salary
|
||||
FROM employees
|
||||
GROUP BY department
|
||||
HAVING AVG(salary) > 5000
|
||||
AND COUNT(*) >= 5
|
||||
|
||||
-- HAVING + ORDER BY
|
||||
SELECT
|
||||
user_id,
|
||||
COUNT(*) AS order_count
|
||||
FROM orders
|
||||
GROUP BY user_id
|
||||
HAVING COUNT(*) >= 5
|
||||
ORDER BY order_count DESC
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 去重计数
|
||||
|
||||
```sql
|
||||
-- COUNT(DISTINCT)
|
||||
SELECT
|
||||
COUNT(DISTINCT user_id) AS unique_users
|
||||
FROM orders
|
||||
|
||||
-- 分组去重计数
|
||||
SELECT
|
||||
date,
|
||||
COUNT(DISTINCT user_id) AS unique_users,
|
||||
COUNT(*) AS total_orders
|
||||
FROM orders
|
||||
GROUP BY date
|
||||
|
||||
-- 多字段去重计数
|
||||
SELECT
|
||||
COUNT(DISTINCT user_id, product_id) AS unique_user_product_pairs
|
||||
FROM order_items
|
||||
|
||||
-- 大数据量近似去重(性能优化)
|
||||
SELECT
|
||||
approx_count_distinct(user_id) AS approx_unique_users
|
||||
FROM orders
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 集合聚合(数组结果)
|
||||
|
||||
```sql
|
||||
-- collect_list:收集为数组(不去重)
|
||||
SELECT
|
||||
user_id,
|
||||
collect_list(product_id) AS products
|
||||
FROM orders
|
||||
GROUP BY user_id
|
||||
|
||||
-- collect_set:收集为数组(去重)
|
||||
SELECT
|
||||
user_id,
|
||||
collect_set(product_id) AS unique_products
|
||||
FROM orders
|
||||
GROUP BY user_id
|
||||
|
||||
-- 取数组大小
|
||||
SELECT
|
||||
user_id,
|
||||
size(collect_list(product_id)) AS product_count,
|
||||
size(collect_set(product_id)) AS unique_product_count
|
||||
FROM orders
|
||||
GROUP BY user_id
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 多级聚合(ROLLUP / CUBE / GROUPING SETS)
|
||||
|
||||
### ROLLUP(层级汇总)
|
||||
|
||||
```sql
|
||||
-- 从右到左递减分组级别
|
||||
SELECT
|
||||
COALESCE(year, '总计') AS year,
|
||||
COALESCE(month, '全年') AS month,
|
||||
COALESCE(region, '全国') AS region,
|
||||
SUM(sales) AS total_sales
|
||||
FROM sales_data
|
||||
GROUP BY ROLLUP (year, month, region)
|
||||
|
||||
-- 结果包含:
|
||||
-- 1. year + month + region 分组
|
||||
-- 2. year + month 汇总(region为NULL)
|
||||
-- 3. year 汇总(month,region为NULL)
|
||||
-- 4. 全表汇总(year,month,region为NULL)
|
||||
```
|
||||
|
||||
### CUBE(全维度组合)
|
||||
|
||||
```sql
|
||||
-- 所有分组组合
|
||||
SELECT
|
||||
COALESCE(year, '总计') AS year,
|
||||
COALESCE(month, '全月') AS month,
|
||||
COALESCE(region, '全国') AS region,
|
||||
SUM(sales) AS total_sales
|
||||
FROM sales_data
|
||||
GROUP BY CUBE (year, month, region)
|
||||
|
||||
-- 结果包含所有组合:
|
||||
-- year+month+region, year+month, year+region, month+region
|
||||
-- year, month, region, 全表汇总
|
||||
```
|
||||
|
||||
### GROUPING SETS(自定义组合)
|
||||
|
||||
```sql
|
||||
-- 指定分组组合
|
||||
SELECT
|
||||
year,
|
||||
month,
|
||||
region,
|
||||
SUM(sales) AS total_sales
|
||||
FROM sales_data
|
||||
GROUP BY GROUPING SETS (
|
||||
(year, month),
|
||||
(year, region),
|
||||
(region),
|
||||
()
|
||||
)
|
||||
|
||||
-- 等价于对每个分组组合分别 GROUP BY 后再 UNION ALL(未参与分组的列为 NULL)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## GROUPING 函数(判断汇总级别)
|
||||
|
||||
```sql
|
||||
-- GROUPING(col):判断该列是否为汇总产生的NULL
|
||||
-- 0 = 真实值, 1 = 汇总NULL
|
||||
|
||||
SELECT
|
||||
year,
|
||||
month,
|
||||
SUM(sales) AS total_sales,
|
||||
GROUPING(year) AS is_year_total,
|
||||
GROUPING(month) AS is_month_total
|
||||
FROM sales_data
|
||||
GROUP BY ROLLUP (year, month)
|
||||
|
||||
-- 用 GROUPING 区分真实NULL和汇总NULL
|
||||
SELECT
|
||||
CASE WHEN GROUPING(region) = 1 THEN '全国汇总' ELSE region END AS region,
|
||||
SUM(sales) AS total_sales
|
||||
FROM sales_data
|
||||
GROUP BY ROLLUP (region)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 条件聚合(CASE WHEN + 聚合)
|
||||
|
||||
```sql
|
||||
-- 分条件统计
|
||||
SELECT
|
||||
date,
|
||||
COUNT(*) AS total_orders,
|
||||
SUM(CASE WHEN status = 'completed' THEN 1 ELSE 0 END) AS completed_count,
|
||||
SUM(CASE WHEN status = 'cancelled' THEN 1 ELSE 0 END) AS cancelled_count,
|
||||
SUM(CASE WHEN status = 'pending' THEN 1 ELSE 0 END) AS pending_count
|
||||
FROM orders
|
||||
GROUP BY date
|
||||
|
||||
-- 分条件求和
|
||||
SELECT
|
||||
department,
|
||||
SUM(salary) AS total_salary,
|
||||
SUM(CASE WHEN gender = 'M' THEN salary ELSE 0 END) AS male_salary,
|
||||
SUM(CASE WHEN gender = 'F' THEN salary ELSE 0 END) AS female_salary
|
||||
FROM employees
|
||||
GROUP BY department
|
||||
|
||||
-- 条件平均值
|
||||
SELECT
|
||||
category,
|
||||
AVG(CASE WHEN price > 100 THEN price ELSE NULL END) AS high_price_avg
|
||||
FROM products
|
||||
GROUP BY category
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 聚合 + 窗口函数
|
||||
|
||||
```sql
|
||||
-- 分组内占比
|
||||
SELECT
|
||||
department,
|
||||
salary,
|
||||
ROUND(salary / SUM(salary) OVER (PARTITION BY department) * 100, 2) AS salary_pct
|
||||
FROM employees
|
||||
|
||||
-- 分组累计
|
||||
SELECT
|
||||
date,
|
||||
department,
|
||||
amount,
|
||||
SUM(amount) OVER (PARTITION BY department ORDER BY date) AS cumulative
|
||||
FROM sales
|
||||
|
||||
-- 分组排名
|
||||
SELECT
|
||||
*,
|
||||
RANK() OVER (PARTITION BY department ORDER BY salary DESC) AS salary_rank
|
||||
FROM employees
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 多表聚合
|
||||
|
||||
```sql
|
||||
-- JOIN 后聚合
|
||||
SELECT
|
||||
u.department,
|
||||
COUNT(o.id) AS order_count,
|
||||
SUM(o.amount) AS total_amount
|
||||
FROM users u
|
||||
LEFT JOIN orders o ON u.id = o.user_id
|
||||
GROUP BY u.department
|
||||
|
||||
-- 子查询聚合
|
||||
SELECT
|
||||
dept_stats.department,
|
||||
dept_stats.avg_salary,
|
||||
emp_count.employee_count
|
||||
FROM (
|
||||
SELECT department, AVG(salary) AS avg_salary
|
||||
FROM employees
|
||||
GROUP BY department
|
||||
) dept_stats
|
||||
JOIN (
|
||||
SELECT department, COUNT(*) AS employee_count
|
||||
FROM employees
|
||||
GROUP BY department
|
||||
) emp_count ON dept_stats.department = emp_count.department
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 常见问题
|
||||
|
||||
### 问题1:GROUP BY 字段遗漏
|
||||
|
||||
```sql
|
||||
-- 错误:SELECT 字段不在 GROUP BY 中
|
||||
SELECT
|
||||
department,
|
||||
name, -- 错误!name 未分组
|
||||
AVG(salary)
|
||||
FROM employees
|
||||
GROUP BY department
|
||||
|
||||
-- 解决1:添加到 GROUP BY
|
||||
SELECT
|
||||
department,
|
||||
name,
|
||||
AVG(salary)
|
||||
FROM employees
|
||||
GROUP BY department, name
|
||||
|
||||
-- 解决2:使用聚合函数处理
|
||||
SELECT
|
||||
department,
|
||||
collect_list(name) AS names, -- 收集所有name
|
||||
AVG(salary)
|
||||
FROM employees
|
||||
GROUP BY department
|
||||
```
|
||||
|
||||
### 问题2:NULL 影响聚合
|
||||
|
||||
```sql
|
||||
-- COUNT(*) 包含 NULL 行
|
||||
-- COUNT(col) 不包含 NULL
|
||||
|
||||
SELECT
|
||||
COUNT(*) AS total, -- 包含 NULL 行
|
||||
COUNT(amount) AS valid, -- 不包含 amount 为 NULL 的行
|
||||
COUNT(DISTINCT amount) AS unique_values
|
||||
FROM orders
|
||||
|
||||
-- SUM/AVG/MIN/MAX 自动忽略 NULL
|
||||
SELECT AVG(price) FROM products -- NULL 自动排除
|
||||
```
|
||||
|
||||
### 问题3:聚合结果精度
|
||||
|
||||
```sql
|
||||
-- AVG 可能精度丢失
|
||||
SELECT
|
||||
AVG(amount) AS avg_amount, -- 可能精度不够
|
||||
AVG(CAST(amount AS DECIMAL(18,6))) AS precise_avg -- 高精度
|
||||
FROM orders
|
||||
|
||||
-- ROUND 控制精度
|
||||
SELECT
|
||||
ROUND(AVG(amount), 2) AS avg_amount
|
||||
FROM orders
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 聚合性能优化
|
||||
|
||||
```sql
|
||||
-- 1. 先过滤再聚合
|
||||
SELECT
|
||||
department,
|
||||
COUNT(*) AS count
|
||||
FROM employees
|
||||
WHERE hire_date >= '2024-01-01' -- 先过滤
|
||||
GROUP BY department
|
||||
|
||||
-- 2. 大数据量用近似聚合
|
||||
SELECT
|
||||
approx_count_distinct(user_id) AS users -- 比 COUNT(DISTINCT) 快
|
||||
FROM orders
|
||||
|
||||
-- 3. 减少分组字段数量
|
||||
SELECT
|
||||
category, -- 减少分组字段
|
||||
COUNT(*) AS count
|
||||
FROM products
|
||||
GROUP BY category -- 比 GROUP BY category, brand 快
|
||||
|
||||
-- 4. 避免复杂计算在 GROUP BY 前
|
||||
SELECT
|
||||
department,
|
||||
AVG(salary) * 1.1 AS adjusted_avg -- 先聚合再计算(每组只乘一次,而不是逐行计算)
|
||||
FROM employees
|
||||
GROUP BY department
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 聚合模式选择指南
|
||||
|
||||
| 需求 | 推荐方式 |
|
||||
|------|----------|
|
||||
| 简单统计 | GROUP BY + 聚合函数 |
|
||||
| 分条件统计 | CASE WHEN + SUM/COUNT |
|
||||
| 去重计数 | COUNT(DISTINCT) |
|
||||
| 大数据去重 | approx_count_distinct |
|
||||
| 收集数组 | collect_list / collect_set |
|
||||
| 层级汇总 | ROLLUP |
|
||||
| 全维度汇总 | CUBE |
|
||||
| 自定义组合 | GROUPING SETS |
|
||||
| 分组内计算 | 窗口函数 |
|
||||
| 多条件过滤 | HAVING |
|
||||
@@ -0,0 +1,320 @@
|
||||
# Apache Doris SQL 语法参考
|
||||
|
||||
## 数据类型
|
||||
|
||||
| 类型 | 说明 | 示例 |
|
||||
|------|------|------|
|
||||
| BOOLEAN | 布尔 | active BOOLEAN |
|
||||
| TINYINT | 1字节整数 | level TINYINT |
|
||||
| SMALLINT | 2字节整数 | age SMALLINT |
|
||||
| INT | 4字节整数 | count INT |
|
||||
| BIGINT | 8字节整数 | id BIGINT |
|
||||
| LARGEINT | 16字节整数 | hash_key LARGEINT |
|
||||
| FLOAT | 4字节浮点 | score FLOAT |
|
||||
| DOUBLE | 8字节浮点 | price DOUBLE |
|
||||
| DECIMAL(p,s) | 定点数 | amount DECIMAL(18,2) |
|
||||
| DATE | 日期 | birth_date DATE |
|
||||
| DATETIME | 日期时间(精确到秒) | created_at DATETIME |
|
||||
| CHAR(n) | 定长字符串 | code CHAR(10) |
|
||||
| VARCHAR(n) | 变长字符串 | name VARCHAR(100) |
|
||||
| STRING | 变长字符串(无长度限制) | description STRING |
|
||||
| BITMAP | 位图(精确去重) | user_bitmap BITMAP |
|
||||
| HLL | HyperLogLog(近似去重) | user_hll HLL |
|
||||
| JSON | JSON 数据 | props JSON |
|
||||
| ARRAY\<type\> | 数组 | tags ARRAY\<STRING\> |
|
||||
| MAP\<k,v\> | 映射 | props MAP\<STRING,STRING\> |
|
||||
| STRUCT\<field:type,...\> | 结构体 | info STRUCT\<id:INT,name:STRING\> |
|
||||
|
||||
---
|
||||
|
||||
## 时间函数
|
||||
|
||||
```sql
|
||||
-- 当前时间
|
||||
NOW() -- 当前日期时间
|
||||
CURDATE() -- 当前日期
|
||||
CURRENT_TIMESTAMP() -- 当前时间戳
|
||||
|
||||
-- 格式转换
|
||||
DATE_FORMAT(date_col, '%Y-%m-%d') -- 日期格式化
|
||||
DATE_FORMAT(datetime_col, '%Y-%m-%d %H:%i:%s') -- 时间格式化
|
||||
STR_TO_DATE(str, '%Y-%m-%d') -- 字符串转日期
|
||||
|
||||
-- 日期计算
|
||||
DATE_ADD(date_col, INTERVAL 7 DAY) -- 加7天
|
||||
DATE_SUB(date_col, INTERVAL 7 DAY) -- 减7天
|
||||
DATEDIFF(end_date, start_date) -- 日期差(天数)
|
||||
TIMESTAMPDIFF(unit, start, end) -- 时间差(指定单位)
|
||||
TIMESTAMPADD(unit, interval, datetime) -- 时间加
|
||||
|
||||
-- 日期提取
|
||||
YEAR(date_col) -- 年
|
||||
MONTH(date_col) -- 月
|
||||
DAY(date_col) -- 日
|
||||
HOUR(datetime_col) -- 时
|
||||
MINUTE(datetime_col) -- 分
|
||||
SECOND(datetime_col) -- 秒
|
||||
QUARTER(date_col) -- 季度 (1-4)
|
||||
WEEK(date_col) -- 年中第几周
|
||||
DAYOFWEEK(date_col) -- 周几 (1=周日)
|
||||
DAYOFYEAR(date_col) -- 年中第几天
|
||||
|
||||
-- Unix 时间戳
|
||||
UNIX_TIMESTAMP() -- 当前 Unix 时间戳
|
||||
UNIX_TIMESTAMP(datetime_col) -- 转换为 Unix 时间戳
|
||||
FROM_UNIXTIME(timestamp) -- Unix 时间戳转时间字符串
|
||||
FROM_UNIXTIME(timestamp, fmt) -- 带格式的转换
|
||||
|
||||
-- Doris 日期格式符(不同于 Spark)
|
||||
-- %Y: 4位年, %m: 2位月, %d: 2位日
|
||||
-- %H: 24小时制, %i: 分钟, %s: 秒
|
||||
-- %j: 年中天数, %W: 周名
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 字符串函数
|
||||
|
||||
```sql
|
||||
-- 常用函数
|
||||
CONCAT(str1, str2, ...) -- 字符串拼接
|
||||
CONCAT_WS('-', str1, str2, ...) -- 用分隔符拼接
|
||||
LOWER(str) -- 转小写
|
||||
UPPER(str) -- 转大写
|
||||
TRIM(str) -- 去两端空格
|
||||
LTRIM(str) -- 去左空格
|
||||
RTRIM(str) -- 去右空格
|
||||
LENGTH(str) -- 字节长度(UTF-8 下一个中文通常占 3 字节)
|
||||
CHAR_LENGTH(str) -- 字符数(中文友好)
|
||||
SUBSTRING(str, pos, len) -- 截取字符串(pos从1开始)
|
||||
LEFT(str, len) -- 取左边len个字符
|
||||
RIGHT(str, len) -- 取右边len个字符
|
||||
REVERSE(str) -- 反转字符串
|
||||
REPEAT(str, n) -- 重复n次
|
||||
SPACE(n) -- 生成n个空格
|
||||
|
||||
-- 查找与替换
|
||||
INSTR(str, substr) -- 查找子串位置
|
||||
LOCATE(substr, str, pos) -- 从pos位置查找
|
||||
REPLACE(str, old, new) -- 替换
|
||||
REGEXP_EXTRACT(str, pattern, idx) -- 正则提取(注意和 Spark 参数不同)
|
||||
REGEXP_REPLACE(str, pattern, replacement) -- 正则替换
|
||||
|
||||
-- 分割
|
||||
SPLIT_BY_STRING(str, delimiter) -- 分割(Doris 特有)
|
||||
SPLIT_PART(str, delimiter, idx) -- 取分割后的第idx部分
|
||||
|
||||
-- 其他
|
||||
INITCAP(str) -- 首字母大写
|
||||
LPAD(str, len, pad) -- 左填充
|
||||
RPAD(str, len, pad) -- 右填充
|
||||
HEX(str) -- 转16进制
|
||||
UNHEX(str) -- 16进制转字符串
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 聚合函数
|
||||
|
||||
```sql
|
||||
-- 基础聚合
|
||||
COUNT(*) -- 计数(含NULL行)
|
||||
COUNT(col) -- 计数(不含NULL)
|
||||
COUNT(DISTINCT col) -- 去重计数
|
||||
SUM(col) -- 求和
|
||||
AVG(col) -- 平均值
|
||||
MIN(col) -- 最小值
|
||||
MAX(col) -- 最大值
|
||||
|
||||
-- 集合聚合
|
||||
GROUP_CONCAT(col SEPARATOR ',') -- 字符串聚合(类似 collect_list)
|
||||
APPROX_COUNT_DISTINCT(col) -- 近似去重计数
|
||||
|
||||
-- 统计函数
|
||||
VARIANCE(col) -- 方差
|
||||
VAR_POP(col) -- 总体方差
|
||||
VAR_SAMP(col) -- 样本方差
|
||||
STDDEV(col) -- 标准差
|
||||
STDDEV_POP(col) -- 总体标准差
|
||||
STDDEV_SAMP(col) -- 样本标准差
|
||||
|
||||
-- Bitmap 精确去重(Doris 特有)
|
||||
-- 用于 Aggregate Key 模型中定义为 BITMAP 的列
|
||||
-- bitmap_union_count(bitmap_col) -- 精确去重计数
|
||||
-- bitmap_union(bitmap_col) -- 合并 bitmap
|
||||
-- bitmap_hash(col) -- 将值转为 bitmap(用于写入)
|
||||
|
||||
-- HLL 近似去重(Doris 特有)
|
||||
-- hll_union_agg(hll_col) -- 近似去重计数
|
||||
-- hll_cardinality(hll_col) -- 返回 HLL 基数
|
||||
-- hll_hash(col) -- 将值转为 HLL(用于写入)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 条件表达式
|
||||
|
||||
```sql
|
||||
-- CASE WHEN
|
||||
CASE
|
||||
WHEN condition1 THEN result1
|
||||
WHEN condition2 THEN result2
|
||||
ELSE default_result
|
||||
END
|
||||
|
||||
-- CASE 字段匹配
|
||||
CASE field
|
||||
WHEN value1 THEN result1
|
||||
WHEN value2 THEN result2
|
||||
ELSE default_result
|
||||
END
|
||||
|
||||
-- COALESCE(取第一个非空值)
|
||||
COALESCE(col1, col2, default_value)
|
||||
|
||||
-- NULLIF(相等返回NULL)
|
||||
NULLIF(col1, col2)
|
||||
|
||||
-- IF(简单条件)
|
||||
IF(condition, true_value, false_value)
|
||||
|
||||
-- IFNULL(空值替换,等同 NVL)
|
||||
IFNULL(col, default_value)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## JSON 函数
|
||||
|
||||
```sql
|
||||
-- 解析与提取
|
||||
JSON_PARSE(json_str) -- 解析 JSON 字符串
|
||||
JSON_EXTRACT(json_str, '$.field') -- 提取 JSON 字段(返回 JSON 类型)
|
||||
JSON_EXTRACT_STRING(json_str, '$.field') -- 提取 JSON 字段(返回 STRING)
|
||||
JSON_EXTRACT_INT(json_str, '$.field') -- 提取 JSON 字段(返回 INT)
|
||||
JSON_EXTRACT_DOUBLE(json_str, '$.field') -- 提取 JSON 字段(返回 DOUBLE)
|
||||
JSON_EXTRACT_BOOL(json_str, '$.field') -- 提取 JSON 字段(返回 BOOLEAN)
|
||||
|
||||
-- 路径语法
|
||||
-- $ : 根节点
|
||||
-- $.field : 对象字段
|
||||
-- $[0] : 数组索引
|
||||
-- $.a.b : 嵌套字段
|
||||
|
||||
-- 构造
|
||||
JSON_OBJECT('key1', val1, 'key2', val2) -- 构造 JSON 对象
|
||||
JSON_ARRAY(val1, val2, ...) -- 构造 JSON 数组
|
||||
|
||||
-- 查询
|
||||
JSON_LENGTH(json_str) -- JSON 长度
|
||||
JSON_KEYS(json_str) -- JSON 所有 key
|
||||
JSON_VALID(json_str) -- 是否合法 JSON
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## ARRAY 函数
|
||||
|
||||
```sql
|
||||
-- 创建
|
||||
ARRAY(val1, val2, ...) -- 创建数组
|
||||
|
||||
-- 访问
|
||||
array_contains(arr, val) -- 判断是否包含
|
||||
element_at(arr, idx) -- 取元素(idx从1开始)
|
||||
arr[idx] -- 取元素(idx从1开始,与 element_at 一致)
|
||||
|
||||
-- 操作
|
||||
SIZE(arr) -- 数组长度
|
||||
ARRAY_JOIN(arr, delimiter) -- 数组转字符串
|
||||
ARRAY_CONCAT(arr1, arr2) -- 数组拼接(CONCAT 仅用于字符串)
|
||||
|
||||
-- 展开与排序
|
||||
EXPLODE(arr) -- 展开数组为多行(LATERAL VIEW 中使用)
|
||||
ARRAY_SORT(arr) -- 排序
|
||||
ARRAY_DISTINCT(arr) -- 去重
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 与 Spark SQL 的主要差异
|
||||
|
||||
| 特性 | Spark SQL | Apache Doris | 说明 |
|
||||
|------|-----------|-------------|------|
|
||||
| **日期格式符** | `yyyy-MM-dd` | `%Y-%m-%d` | Doris 用 MySQL 风格格式符 |
|
||||
| **当前时间** | `current_timestamp()` | `NOW()` 或 `CURRENT_TIMESTAMP()` | 都支持,Doris 偏好 NOW() |
|
||||
| **日期加减** | `date_add(col, 7)` | `DATE_ADD(col, INTERVAL 7 DAY)` | Doris 需要 INTERVAL 语法 |
|
||||
| **正则提取** | `regexp_extract(str, pattern, idx)` | `REGEXP_EXTRACT(str, pattern, idx)` | 参数名可能不同 |
|
||||
| **LEFT SEMI JOIN** | ✅ 支持 | ✅ 支持 | 也可用 IN / EXISTS 子查询改写 |
|
||||
| **LEFT ANTI JOIN** | ✅ 支持 | ✅ 支持 | 也可用 NOT EXISTS 改写(NOT IN 遇到 NULL 会返回空结果) |
|
||||
| **INSERT OVERWRITE** | ✅ 支持 | ✅ 2.0+ 支持 | 早期版本不支持 |
|
||||
| **CTE (WITH)** | ✅ 支持 | ✅ 支持 | 都支持 |
|
||||
| **MERGE INTO** | ✅ 支持 | ❌ 不支持 | Doris 用 Unique Key 模型按主键覆盖写入替代 |
|
||||
| **临时表链式处理** | ✅ 推荐 | ✅ 统一规范 | 编码规范层面统一禁止 CTE |
|
||||
| **精确去重** | COUNT(DISTINCT) | COUNT(DISTINCT) 或 BITMAP | Doris BITMAP 性能更好 |
|
||||
| **近似去重** | `approx_count_distinct` | `APPROX_COUNT_DISTINCT` 或 HLL 类型 | Doris 另有内置 HLL 类型,适合预聚合 |
|
||||
| **UPDATE/DELETE** | ❌ 不支持 | ✅ 支持 | Doris 支持 DML |
|
||||
| **UPSERT** | ❌ 不支持 | ❌ 不直接支持 | 用 Aggregate/Unique Key 模型替代 |
|
||||
| **分页** | LIMIT(无OFFSET) | LIMIT + OFFSET | Doris 完整支持 |
|
||||
| **JSON** | `get_json_object` | `JSON_EXTRACT_STRING` 等系列函数 | 函数名完全不同 |
|
||||
| **collect_list/set** | ✅ 支持 | ✅ 1.2+ 支持(返回 ARRAY) | 需要拼接为字符串时用 GROUP_CONCAT |
|
||||
| **LATERAL VIEW explode** | ✅ 支持 | ✅ 支持 | 语法兼容 |
|
||||
|
||||
---
|
||||
|
||||
## SQL 生成规则
|
||||
|
||||
### 通用规则(所有引擎统一)
|
||||
|
||||
1. **禁止使用 CTE (WITH 子句)**,每个主要逻辑步骤必须物化为临时表
|
||||
2. **先 DROP 再 CREATE**:`DROP TABLE IF EXISTS ...; CREATE TABLE ... AS SELECT ...;`
|
||||
3. **禁止 `SELECT *`**,必须明确列出所有字段
|
||||
4. 多表查询时所有表必须使用简短别名
|
||||
5. 每个步骤前添加注释说明
|
||||
6. **谓词下推**:过滤条件前置,JOIN 时在 WHERE 中一并添加过滤
|
||||
7. 临时表命名:`${db_tmp_env}.tmp_{业务简称}_{步骤序号}`
|
||||
|
||||
### Doris 特有规则
|
||||
|
||||
1. 使用 `INSERT INTO` 写入目标表(Doris 2.0+ 也支持 `INSERT OVERWRITE`)
|
||||
2. Aggregate Key 表自动合并相同 Key 的数据
|
||||
3. Unique Key 表自动按主键去重,保留最新数据
|
||||
4. 支持 `LEFT SEMI JOIN` / `LEFT ANTI JOIN`;如改写为子查询,半连接用 `IN` / `EXISTS`,反连接优先用 `NOT EXISTS`(`NOT IN` 遇到 NULL 会返回空结果)
|
||||
5. 精确去重推荐 `BITMAP`,近似去重推荐 `HLL`
|
||||
6. 日期函数用 MySQL 风格:
|
||||
- `DATE_FORMAT(col, '%Y-%m-%d')`(不是 `yyyy-MM-dd`)
|
||||
- `DATE_ADD(col, INTERVAL 7 DAY)`(不是 `date_add(col, 7)`)
|
||||
- `CURDATE()` / `NOW()`
|
||||
7. `collect_list` / `collect_set` 自 Doris 1.2 起可用(返回 ARRAY);需要拼接为字符串时用 `GROUP_CONCAT`
|
||||
8. 时间范围筛选:
|
||||
```sql
|
||||
-- 日账期过滤
|
||||
WHERE stat_date = '${day_id}'
|
||||
-- 最近N个月
|
||||
WHERE stat_date >= DATE_FORMAT(DATE_SUB(STR_TO_DATE('${month_id}', '%Y%m'), INTERVAL N MONTH), '%Y%m')
|
||||
AND stat_date < '${month_id}'
|
||||
```
|
||||
|
||||
### SQL 脚本结构
|
||||
|
||||
```sql
|
||||
-- =====================================================================
|
||||
-- @SqlName: doris-D-SQL-{表名}
|
||||
-- @Engine: doris
|
||||
-- ...(头注释)
|
||||
-- =====================================================================
|
||||
|
||||
-- Step01: {步骤描述}
|
||||
DROP TABLE IF EXISTS ${db_tmp_env}.tmp_xxx_01;
|
||||
CREATE TABLE ${db_tmp_env}.tmp_xxx_01 AS
|
||||
SELECT ...;
|
||||
|
||||
-- Step02: {步骤描述}
|
||||
DROP TABLE IF EXISTS ${db_tmp_env}.tmp_xxx_02;
|
||||
CREATE TABLE ${db_tmp_env}.tmp_xxx_02 AS
|
||||
SELECT ...;
|
||||
|
||||
-- 最后一步:写入目标表
|
||||
INSERT INTO ${db_eda_env}.target_table
|
||||
SELECT ...;
|
||||
```
|
||||
@@ -0,0 +1,334 @@
|
||||
# Hive SQL 语法参考
|
||||
|
||||
## 数据类型
|
||||
|
||||
| 类型 | 说明 | 示例 |
|
||||
|------|------|------|
|
||||
| TINYINT | 1字节整数 | level TINYINT |
|
||||
| SMALLINT | 2字节整数 | age SMALLINT |
|
||||
| INT | 4字节整数 | count INT |
|
||||
| BIGINT | 8字节整数 | id BIGINT |
|
||||
| FLOAT | 4字节浮点 | score FLOAT |
|
||||
| DOUBLE | 8字节浮点 | price DOUBLE |
|
||||
| DECIMAL(p,s) | 定点数 | amount DECIMAL(18,2) |
|
||||
| BOOLEAN | 布尔 | active BOOLEAN |
|
||||
| STRING | 变长字符串 | name STRING |
|
||||
| VARCHAR(n) | 变长字符串(限长) | code VARCHAR(50) |
|
||||
| CHAR(n) | 定长字符串 | flag CHAR(1) |
|
||||
| DATE | 日期 | birth_date DATE |
|
||||
| TIMESTAMP | 时间戳(纳秒精度) | created_at TIMESTAMP |
|
||||
| BINARY | 二进制 | data BINARY |
|
||||
| ARRAY\<type\> | 数组 | tags ARRAY\<STRING\> |
|
||||
| MAP\<k,v\> | 映射 | props MAP\<STRING,STRING\> |
|
||||
| STRUCT\<f1:t1,...\> | 结构体 | user STRUCT\<id:INT,name:STRING\> |
|
||||
| UNIONTYPE\<t1,t2,...\> | 联合类型 | value UNIONTYPE\<INT,STRING\> |
|
||||
|
||||
---
|
||||
|
||||
## 时间函数
|
||||
|
||||
```sql
|
||||
-- 当前时间
|
||||
current_date() -- 当前日期
|
||||
current_timestamp() -- 当前时间戳
|
||||
unix_timestamp() -- 当前 Unix 时间戳(秒)
|
||||
|
||||
-- 格式转换
|
||||
date_format(date_col, 'yyyy-MM-dd') -- 日期格式化
|
||||
date_format(timestamp_col, 'yyyy-MM-dd HH:mm:ss') -- 时间格式化
|
||||
to_date(string_col) -- 字符串转日期
|
||||
to_date(from_unixtime(unix_timestamp(string_col, 'yyyy-MM-dd'))) -- 字符串转日期(带格式;Hive 的 to_date 只接受一个参数)
|
||||
from_unixtime(timestamp) -- Unix 时间戳转字符串
|
||||
from_unixtime(timestamp, 'yyyy-MM-dd') -- 带格式转换
|
||||
|
||||
-- 日期计算
|
||||
date_add(date_col, 7) -- 加7天
|
||||
date_sub(date_col, 7) -- 减7天
|
||||
add_months(date_col, 3) -- 加3个月
|
||||
datediff(end_date, start_date) -- 日期差(天数)
|
||||
months_between(date1, date2) -- 月份差
|
||||
|
||||
-- 日期提取
|
||||
year(date_col) -- 年
|
||||
month(date_col) -- 月
|
||||
day(date_col) -- 日
|
||||
dayofmonth(date_col) -- 月中第几天
|
||||
dayofweek(date_col) -- 周几 (1=周日, 7=周六)
|
||||
hour(timestamp_col) -- 时
|
||||
minute(timestamp_col) -- 分
|
||||
second(timestamp_col) -- 秒
|
||||
quarter(date_col) -- 季度 (1-4)
|
||||
weekofyear(date_col) -- 年中第几周
|
||||
last_day(date_col) -- 月末日期
|
||||
trunc(date_col, 'MM') -- 月初日期
|
||||
trunc(date_col, 'YY') -- 年初日期
|
||||
|
||||
-- Unix 时间戳
|
||||
unix_timestamp(date_col) -- 转 Unix 时间戳
|
||||
unix_timestamp(string_col, 'yyyy-MM-dd') -- 指定格式转换
|
||||
from_unixtime(timestamp) -- Unix 时间戳转字符串
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 字符串函数
|
||||
|
||||
```sql
|
||||
-- 常用函数
|
||||
concat(str1, str2, ...) -- 字符串拼接
|
||||
concat_ws('-', str1, str2, ...) -- 用分隔符拼接
|
||||
lower(str) -- 转小写
|
||||
upper(str) -- 转大写
|
||||
trim(str) -- 去两端空格
|
||||
ltrim(str) -- 去左空格
|
||||
rtrim(str) -- 去右空格
|
||||
length(str) -- 字符串长度
|
||||
substring(str, pos, len) -- 截取字符串(pos从1开始)
|
||||
substr(str, pos, len) -- 同 substring
|
||||
left(str, len) -- 取左边len个字符
|
||||
right(str, len) -- 取右边len个字符
|
||||
reverse(str) -- 反转字符串
|
||||
repeat(str, n) -- 重复n次
|
||||
space(n) -- 生成n个空格
|
||||
|
||||
-- 查找与替换
|
||||
instr(str, substr) -- 查找子串位置
|
||||
locate(substr, str, pos) -- 从pos位置查找
|
||||
replace(str, old, new) -- 替换
|
||||
regexp_extract(str, pattern, idx) -- 正则提取
|
||||
regexp_replace(str, pattern, replacement) -- 正则替换
|
||||
|
||||
-- 分割
|
||||
split(str, delimiter) -- 分割成数组
|
||||
|
||||
-- 其他
|
||||
initcap(str) -- 首字母大写
|
||||
lpad(str, len, pad) -- 左填充
|
||||
rpad(str, len, pad) -- 右填充
|
||||
hex(col) -- 转16进制
|
||||
unhex(str) -- 16进制转字符串
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 聚合函数
|
||||
|
||||
```sql
|
||||
-- 基础聚合
|
||||
COUNT(*) -- 计数(含NULL行)
|
||||
COUNT(col) -- 计数(不含NULL)
|
||||
COUNT(DISTINCT col) -- 去重计数
|
||||
SUM(col) -- 求和
|
||||
AVG(col) -- 平均值
|
||||
MIN(col) -- 最小值
|
||||
MAX(col) -- 最大值
|
||||
|
||||
-- 集合聚合
|
||||
collect_list(col) -- 返回数组(不去重)
|
||||
collect_set(col) -- 返回数组(去重)
|
||||
|
||||
-- 统计函数
|
||||
variance(col) -- 方差
|
||||
var_pop(col) -- 总体方差
|
||||
var_samp(col) -- 样本方差
|
||||
stddev(col) -- 标准差
|
||||
stddev_pop(col) -- 总体标准差
|
||||
stddev_samp(col) -- 样本标准差
|
||||
|
||||
-- 近似函数
|
||||
approx_count_distinct(col) -- 近似去重计数(大数据量优化)
|
||||
|
||||
-- 其他
|
||||
first(col) -- 第一个值
|
||||
last(col) -- 最后一个值
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 条件表达式
|
||||
|
||||
```sql
|
||||
-- CASE WHEN
|
||||
CASE
|
||||
WHEN condition1 THEN result1
|
||||
WHEN condition2 THEN result2
|
||||
ELSE default_result
|
||||
END
|
||||
|
||||
-- CASE 字段匹配
|
||||
CASE field
|
||||
WHEN value1 THEN result1
|
||||
WHEN value2 THEN result2
|
||||
ELSE default_result
|
||||
END
|
||||
|
||||
-- COALESCE(取第一个非空值)
|
||||
COALESCE(col1, col2, default_value)
|
||||
|
||||
-- NULLIF(相等返回NULL)
|
||||
NULLIF(col1, col2)
|
||||
|
||||
-- IF(简单条件)
|
||||
IF(condition, true_value, false_value)
|
||||
|
||||
-- NVL(空值替换)
|
||||
NVL(col, default_value)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 复杂类型操作
|
||||
|
||||
```sql
|
||||
-- ARRAY 操作
|
||||
array(val1, val2, ...) -- 创建数组
|
||||
array_contains(arr, val) -- 判断是否包含
|
||||
element_at(arr, idx) -- 取元素(idx从1开始)
|
||||
arr[idx] -- 取元素(idx从0开始)
|
||||
size(arr) -- 数组长度
|
||||
array_join(arr, delimiter) -- 数组转字符串
|
||||
sort_array(arr) -- 排序
|
||||
array_distinct(arr) -- 去重
|
||||
|
||||
-- 展开(LATERAL VIEW + explode)
|
||||
-- 展开数组
|
||||
SELECT id, tag
|
||||
FROM table
|
||||
LATERAL VIEW explode(tags) t AS tag;
|
||||
|
||||
-- 展开数组带索引
|
||||
SELECT id, pos, tag
|
||||
FROM table
|
||||
LATERAL VIEW posexplode(tags) t AS pos, tag;
|
||||
|
||||
-- 展开 Map
|
||||
SELECT id, map_key, map_value
|
||||
FROM table
|
||||
LATERAL VIEW explode(props) m AS map_key, map_value;
|
||||
|
||||
-- MAP 操作
|
||||
map(key1, val1, key2, val2) -- 创建 Map
|
||||
str_to_map(str, delim1, delim2) -- 字符串转 Map
|
||||
map_contains(map, key) -- 判断是否包含key
|
||||
map_keys(map) -- 所有 key(返回数组)
|
||||
map_values(map) -- 所有 value(返回数组)
|
||||
size(map) -- Map大小
|
||||
|
||||
-- STRUCT 操作
|
||||
named_struct('name1', val1, 'name2', val2) -- 创建结构体
|
||||
struct_col.field_name -- 访问结构体字段
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 分区表操作
|
||||
|
||||
```sql
|
||||
-- 创建分区表
|
||||
CREATE TABLE target_table (
|
||||
id BIGINT,
|
||||
name STRING,
|
||||
amount DECIMAL(18,2)
|
||||
)
|
||||
PARTITIONED BY (day_id STRING)
|
||||
STORED AS ORC;
|
||||
|
||||
-- 静态分区写入
|
||||
INSERT OVERWRITE TABLE target_table
|
||||
PARTITION (day_id = '${day_id}')
|
||||
SELECT id, name, amount FROM source_table;
|
||||
|
||||
-- 动态分区写入
|
||||
SET hive.exec.dynamic.partition = true;
|
||||
SET hive.exec.dynamic.partition.mode = nonstrict;
|
||||
|
||||
INSERT OVERWRITE TABLE target_table
|
||||
PARTITION (day_id)
|
||||
SELECT id, name, amount, day_id FROM source_table;
|
||||
|
||||
-- 分区管理
|
||||
SHOW PARTITIONS target_table;
|
||||
ALTER TABLE target_table ADD IF NOT EXISTS PARTITION (day_id = '2026-05-10');
|
||||
ALTER TABLE target_table DROP IF EXISTS PARTITION (day_id = '2026-01-01');
|
||||
|
||||
-- MSCK REPAIR(恢复分区元数据)
|
||||
MSCK REPAIR TABLE target_table;
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 与 Spark SQL 的主要差异
|
||||
|
||||
| 特性 | Spark SQL | Hive | 说明 |
|
||||
|------|-----------|------|------|
|
||||
| **LEFT SEMI JOIN** | ✅ 独立语法 | ✅ 支持(语义相同) | Hive 也可用 IN 子查询替代 |
|
||||
| **LEFT ANTI JOIN** | ✅ 独立语法 | ✅ 支持(语义相同) | Hive 也可用 NOT EXISTS 替代(NOT IN 遇到 NULL 会返回空结果) |
|
||||
| **CTE (WITH)** | ✅ 支持 | ✅ Hive 0.13+ 支持 | 都支持但 Hive 中推荐物化临时表 |
|
||||
| **INSERT OVERWRITE** | ✅ 支持 | ✅ 支持 | 写法一致 |
|
||||
| **MERGE INTO** | ✅ 支持 | 仅 ACID 表支持(Hive 2.2+) | 普通 Hive 表不支持 |
|
||||
| **UPDATE/DELETE** | ❌ 不支持 | 仅 ACID 表支持 | 普通 Hive 表不支持 |
|
||||
| **collect_list/set** | ✅ 支持 | ✅ 支持 | 完全一致 |
|
||||
| **LATERAL VIEW** | ✅ 支持 | ✅ 支持(Hive 原生) | Hive 首创的语法 |
|
||||
| **分桶 JOIN** | 可优化 | 可优化(SMB JOIN) | Hive 分桶优化更成熟 |
|
||||
| **日期格式** | `yyyy-MM-dd` | `yyyy-MM-dd` | 格式一致 |
|
||||
| **临时表** | CREATE TEMPORARY VIEW | CREATE TEMPORARY TABLE | Spark 只有临时视图,Hive 有会话级临时表 |
|
||||
| **存储格式** | PARQUET/ORC | ORC/PARQUET/TEXTFILE | Hive 支持 TEXTFILE |
|
||||
| **分区发现** | 自动 | 需 MSCK REPAIR 或 ALTER | Hive 需手动恢复 |
|
||||
| **复杂类型** | 完整支持 | 完整支持 | 基本一致 |
|
||||
| **窗口函数** | ✅ 完整支持 | ✅ 完整支持 | 语法一致 |
|
||||
|
||||
---
|
||||
|
||||
## SQL 生成规则
|
||||
|
||||
### 通用规则(所有引擎统一)
|
||||
|
||||
1. **禁止使用 CTE (WITH 子句)**,每个主要逻辑步骤必须物化为临时表
|
||||
2. **先 DROP 再 CREATE**:`DROP TABLE IF EXISTS ...; CREATE TABLE ... AS SELECT ...;`
|
||||
3. **禁止 `SELECT *`**,必须明确列出所有字段
|
||||
4. 多表查询时所有表必须使用简短别名
|
||||
5. 每个步骤前添加注释说明
|
||||
6. **谓词下推**:过滤条件前置,JOIN 时在 WHERE 中一并添加过滤
|
||||
7. 临时表命名:`${db_tmp_env}.tmp_{业务简称}_{步骤序号}`
|
||||
|
||||
### Hive 特有规则
|
||||
|
||||
1. 使用 `INSERT OVERWRITE TABLE ... PARTITION (...)` 写入目标表
|
||||
2. 动态分区需先 `SET hive.exec.dynamic.partition = true;`
|
||||
3. 分区列不能出现在表定义的列中(Hive 特有约束)
|
||||
4. 支持 `collect_list` / `collect_set` 聚合
|
||||
5. 支持 `LATERAL VIEW explode()` 展开数组
|
||||
6. 日期函数:`date_format()`, `to_date()`, `date_add()`, `add_months()`(和 Spark 一致)
|
||||
7. 时间范围筛选:
|
||||
```sql
|
||||
-- 日账期过滤
|
||||
WHERE day_id = '${day_id}'
|
||||
-- 最近N个月(月份格式 yyyyMM)
|
||||
WHERE month_id >= date_format(add_months(from_unixtime(unix_timestamp('${month_id}', 'yyyyMM')), -N), 'yyyyMM')
|
||||
AND month_id < '${month_id}'
|
||||
```
|
||||
|
||||
### SQL 脚本结构
|
||||
|
||||
```sql
|
||||
-- =====================================================================
|
||||
-- @SqlName: hive-D-SQL-{表名}
|
||||
-- @Engine: hive
|
||||
-- ...(头注释)
|
||||
-- =====================================================================
|
||||
|
||||
-- Step01: {步骤描述}
|
||||
DROP TABLE IF EXISTS ${db_tmp_env}.tmp_xxx_01;
|
||||
CREATE TABLE ${db_tmp_env}.tmp_xxx_01 AS
|
||||
SELECT ...;
|
||||
|
||||
-- Step02: {步骤描述}
|
||||
DROP TABLE IF EXISTS ${db_tmp_env}.tmp_xxx_02;
|
||||
CREATE TABLE ${db_tmp_env}.tmp_xxx_02 AS
|
||||
SELECT ...;
|
||||
|
||||
-- 最后一步:写入目标表
|
||||
INSERT OVERWRITE TABLE ${db_eda_env}.target_table
|
||||
PARTITION (day_id = '${day_id}')
|
||||
SELECT ...;
|
||||
```
|
||||
@@ -0,0 +1,369 @@
|
||||
# JOIN 模式速查
|
||||
|
||||
## JOIN 类型
|
||||
|
||||
| 类型 | 说明 | 结果特点 |
|
||||
|------|------|----------|
|
||||
| INNER JOIN | 内连接 | 只返回匹配的行 |
|
||||
| LEFT JOIN | 左外连接 | 左表全部,右表匹配(无匹配为NULL) |
|
||||
| RIGHT JOIN | 右外连接 | 右表全部,左表匹配(无匹配为NULL) |
|
||||
| FULL OUTER JOIN | 全外连接 | 两表全部,无匹配处为NULL |
|
||||
| CROSS JOIN | 交叉连接 | 笛卡尔积(每行与每行组合) |
|
||||
| LEFT SEMI JOIN | 左半连接 | 左表中在右表有匹配的行(不返回右表列) |
|
||||
| LEFT ANTI JOIN | 左反连接 | 左表中在右表无匹配的行 |
|
||||
|
||||
---
|
||||
|
||||
## INNER JOIN
|
||||
|
||||
```sql
|
||||
-- 基本语法
|
||||
SELECT a.*, b.*
|
||||
FROM table_a a
|
||||
INNER JOIN table_b b ON a.id = b.id
|
||||
|
||||
-- 等价写法(逗号连接)
|
||||
SELECT a.*, b.*
|
||||
FROM table_a a, table_b b
|
||||
WHERE a.id = b.id
|
||||
|
||||
-- 多字段关联
|
||||
SELECT a.*, b.*
|
||||
FROM table_a a
|
||||
INNER JOIN table_b b
|
||||
ON a.user_id = b.user_id
|
||||
AND a.order_date = b.order_date
|
||||
```
|
||||
|
||||
**使用场景**:只需要两表都有匹配的数据时使用。
|
||||
|
||||
---
|
||||
|
||||
## LEFT JOIN
|
||||
|
||||
```sql
|
||||
-- 基本语法
|
||||
SELECT a.*, b.*
|
||||
FROM table_a a
|
||||
LEFT JOIN table_b b ON a.id = b.id
|
||||
|
||||
-- 处理右表NULL值
|
||||
SELECT
|
||||
a.id,
|
||||
a.name,
|
||||
COALESCE(b.amount, 0) AS amount, -- NULL转0
|
||||
IF(b.id IS NULL, '无匹配', '有匹配') AS match_status
|
||||
FROM table_a a
|
||||
LEFT JOIN table_b b ON a.id = b.id
|
||||
|
||||
-- 找出左表中无匹配的行(差集)
|
||||
SELECT a.*
|
||||
FROM table_a a
|
||||
LEFT JOIN table_b b ON a.id = b.id
|
||||
WHERE b.id IS NULL
|
||||
```
|
||||
|
||||
**使用场景**:保留左表所有数据,右表补充信息时使用。
|
||||
|
||||
---
|
||||
|
||||
## RIGHT JOIN
|
||||
|
||||
```sql
|
||||
-- 基本语法
|
||||
SELECT a.*, b.*
|
||||
FROM table_a a
|
||||
RIGHT JOIN table_b b ON a.id = b.id
|
||||
|
||||
-- 等价于 LEFT JOIN 反过来
|
||||
SELECT a.*, b.*
|
||||
FROM table_b b
|
||||
LEFT JOIN table_a a ON b.id = a.id
|
||||
```
|
||||
|
||||
**使用场景**:保留右表所有数据时使用(可改用 LEFT JOIN 反转)。
|
||||
|
||||
---
|
||||
|
||||
## FULL OUTER JOIN
|
||||
|
||||
```sql
|
||||
-- 基本语法
|
||||
SELECT
|
||||
COALESCE(a.id, b.id) AS id, -- 统一ID
|
||||
a.name,
|
||||
b.amount
|
||||
FROM table_a a
|
||||
FULL OUTER JOIN table_b b ON a.id = b.id
|
||||
|
||||
-- 找出两表差异
|
||||
SELECT
|
||||
a.id AS a_id,
|
||||
b.id AS b_id,
|
||||
CASE
|
||||
WHEN a.id IS NULL THEN '仅在B表'
|
||||
WHEN b.id IS NULL THEN '仅在A表'
|
||||
ELSE '两表都有'
|
||||
END AS status
|
||||
FROM table_a a
|
||||
FULL OUTER JOIN table_b b ON a.id = b.id
|
||||
WHERE a.id IS NULL OR b.id IS NULL
|
||||
```
|
||||
|
||||
**使用场景**:需要两表完整数据,分析差异时使用。
|
||||
|
||||
---
|
||||
|
||||
## CROSS JOIN
|
||||
|
||||
```sql
|
||||
-- 基本语法(笛卡尔积)
|
||||
SELECT a.name, b.color
|
||||
FROM products a
|
||||
CROSS JOIN colors b
|
||||
|
||||
-- 结果:products每行与colors每行组合
|
||||
-- products有10行,colors有5行 → 结果50行
|
||||
|
||||
-- 等价写法
|
||||
SELECT a.name, b.color
|
||||
FROM products a, colors b -- 无WHERE条件
|
||||
```
|
||||
|
||||
**使用场景**:生成所有组合、测试数据生成时使用。
|
||||
|
||||
**注意**:数据量大时慎用,可能产生巨量结果。
|
||||
|
||||
---
|
||||
|
||||
## LEFT SEMI JOIN(Spark SQL / Hive / Impala 支持,非标准 SQL)
|
||||
|
||||
```sql
|
||||
-- 基本语法
|
||||
SELECT a.*
|
||||
FROM table_a a
|
||||
LEFT SEMI JOIN table_b b ON a.id = b.id
|
||||
|
||||
-- 效果:返回A表中在B表有匹配的行,不返回B表的列
|
||||
-- 等价于 IN 子查询
|
||||
SELECT a.*
|
||||
FROM table_a a
|
||||
WHERE a.id IN (SELECT id FROM table_b b)
|
||||
```
|
||||
|
||||
**使用场景**:只需要判断左表是否在右表存在,不需要右表数据。
|
||||
|
||||
---
|
||||
|
||||
## LEFT ANTI JOIN(Spark SQL / Impala 支持,非标准 SQL)
|
||||
|
||||
```sql
|
||||
-- 基本语法
|
||||
SELECT a.*
|
||||
FROM table_a a
|
||||
LEFT ANTI JOIN table_b b ON a.id = b.id
|
||||
|
||||
-- 效果:返回A表中在B表无匹配的行
|
||||
-- 近似等价于 NOT IN 子查询(仅当 a.id 与 table_b.id 均不含 NULL 时等价;table_b.id 含 NULL 时 NOT IN 不返回任何行)
|
||||
SELECT a.*
|
||||
FROM table_a a
|
||||
WHERE a.id NOT IN (SELECT id FROM table_b b)
|
||||
|
||||
-- 或 NOT EXISTS
|
||||
SELECT a.*
|
||||
FROM table_a a
|
||||
WHERE NOT EXISTS (SELECT 1 FROM table_b b WHERE b.id = a.id)
|
||||
```
|
||||
|
||||
**使用场景**:找出差集(左表中不存在于右表的数据)。
|
||||
|
||||
---
|
||||
|
||||
## 多表 JOIN
|
||||
|
||||
```sql
|
||||
-- 三表关联
|
||||
SELECT
|
||||
o.order_id,
|
||||
u.user_name,
|
||||
p.product_name,
|
||||
oi.quantity
|
||||
FROM orders o
|
||||
JOIN users u ON o.user_id = u.id
|
||||
JOIN order_items oi ON o.order_id = oi.order_id
|
||||
JOIN products p ON oi.product_id = p.id
|
||||
|
||||
-- 四表及以上
|
||||
SELECT
|
||||
a.col1,
|
||||
b.col2,
|
||||
c.col3,
|
||||
d.col4
|
||||
FROM table_a a
|
||||
JOIN table_b b ON a.id = b.a_id
|
||||
JOIN table_c c ON b.id = c.b_id
|
||||
JOIN table_d d ON c.id = d.c_id
|
||||
```
|
||||
|
||||
**建议**:多表 JOIN 时,从最小表开始,逐步关联。
|
||||
|
||||
---
|
||||
|
||||
## 自连接(Self Join)
|
||||
|
||||
```sql
|
||||
-- 员工与经理关联
|
||||
SELECT
|
||||
e.name AS employee,
|
||||
m.name AS manager
|
||||
FROM employees e
|
||||
LEFT JOIN employees m ON e.manager_id = m.id
|
||||
|
||||
-- 查找重复数据
|
||||
SELECT
|
||||
a.id,
|
||||
a.name,
|
||||
b.id AS duplicate_id
|
||||
FROM table_a a
|
||||
JOIN table_a b ON a.name = b.name AND a.id < b.id
|
||||
|
||||
-- 紧邻数据比较(前后行)
|
||||
SELECT
|
||||
a.date,
|
||||
a.amount,
|
||||
b.amount AS prev_amount
|
||||
FROM sales a
|
||||
LEFT JOIN sales b ON a.date = b.date + 1
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## JOIN 条件下推优化
|
||||
|
||||
**原则**:过滤条件前置,减少 JOIN 数据量。
|
||||
|
||||
```sql
|
||||
-- 推荐:过滤前置
|
||||
SELECT a.*, b.*
|
||||
FROM table_a a
|
||||
JOIN table_b b ON a.id = b.id
|
||||
WHERE a.date = '${day_id}' -- 先过滤A表
|
||||
AND b.status = 'active' -- 先过滤B表
|
||||
|
||||
-- 不推荐:JOIN后再过滤
|
||||
SELECT a.*, b.*
|
||||
FROM (
|
||||
SELECT * FROM table_a -- 未过滤
|
||||
) a
|
||||
JOIN (
|
||||
SELECT * FROM table_b -- 未过滤
|
||||
) b ON a.id = b.id
|
||||
WHERE a.date = '${day_id}'
|
||||
AND b.status = 'active'
|
||||
```
|
||||
|
||||
**性能差异**:
|
||||
- 推荐:JOIN 前各表已过滤,数据量小,JOIN 快
|
||||
- 不推荐:全表 JOIN 后过滤,数据量大,性能差
|
||||
|
||||
---
|
||||
|
||||
## JOIN 常见问题
|
||||
|
||||
### 问题1:关联字段类型不一致
|
||||
|
||||
```sql
|
||||
-- 错误:STRING 与 BIGINT 关联
|
||||
SELECT a.*, b.*
|
||||
FROM table_a a
|
||||
JOIN table_b b ON a.id = b.id -- a.id是STRING,b.id是BIGINT
|
||||
|
||||
-- 解决:类型转换
|
||||
SELECT a.*, b.*
|
||||
FROM table_a a
|
||||
JOIN table_b b ON CAST(a.id AS BIGINT) = b.id
|
||||
```
|
||||
|
||||
### 问题2:关联字段含NULL
|
||||
|
||||
```sql
|
||||
-- 问题:NULL关联不上
|
||||
SELECT a.*, b.*
|
||||
FROM table_a a
|
||||
LEFT JOIN table_b b ON a.ref_id = b.id -- a.ref_id有NULL
|
||||
|
||||
-- 解决:先过滤NULL或用COALESCE
|
||||
SELECT a.*, b.*
|
||||
FROM table_a a
|
||||
LEFT JOIN table_b b ON COALESCE(a.ref_id, 'N/A') = b.id
|
||||
```
|
||||
|
||||
### 问题3:多字段关联效率低
|
||||
|
||||
```sql
|
||||
-- 问题:多字段关联
|
||||
SELECT a.*, b.*
|
||||
FROM table_a a
|
||||
JOIN table_b b
|
||||
ON a.user_id = b.user_id
|
||||
AND a.order_date = b.order_date
|
||||
AND a.product_id = b.product_id
|
||||
|
||||
-- 可选方案:合并关联字段(注意:通常并不比多字段等值关联更快,且需保证分隔符不出现在字段值中;采用前请先用执行计划验证)
|
||||
SELECT a.*, b.*
|
||||
FROM table_a a
|
||||
JOIN table_b b
|
||||
ON CONCAT(a.user_id, '_', a.order_date, '_', a.product_id)
|
||||
= CONCAT(b.user_id, '_', b.order_date, '_', b.product_id)
|
||||
```
|
||||
|
||||
### 问题4:大表 JOIN 大表
|
||||
|
||||
```sql
|
||||
-- 问题:两表都很大,JOIN 慢
|
||||
SELECT a.*, b.*
|
||||
FROM large_table_a a
|
||||
JOIN large_table_b b ON a.id = b.id
|
||||
|
||||
-- 解决方案:
|
||||
-- 1. 尽量前置过滤
|
||||
-- 2. 使用分区表,按分区关联
|
||||
-- 3. 调整 Spark 并行度
|
||||
-- 4. 使用 BROADCAST JOIN(其中一表较小)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## BROADCAST JOIN(小表广播)
|
||||
|
||||
```sql
|
||||
-- Spark SQL 自动判断(需配置阈值)
|
||||
-- 小表自动广播到所有节点,避免 Shuffle
|
||||
|
||||
-- 手动指定广播
|
||||
SELECT /*+ BROADCAST(b) */ a.*, b.*
|
||||
FROM large_table_a a
|
||||
JOIN small_table_b b ON a.id = b.id
|
||||
|
||||
-- 多表广播
|
||||
SELECT /*+ BROADCAST(b), BROADCAST(c) */ a.*, b.*, c.*
|
||||
FROM large_table_a a
|
||||
JOIN small_table_b b ON a.id = b.id
|
||||
JOIN small_table_c c ON a.category = c.category
|
||||
```
|
||||
|
||||
**适用条件**:其中一表数据量较小(通常 < 10MB)。
|
||||
|
||||
---
|
||||
|
||||
## JOIN 类型选择指南
|
||||
|
||||
| 需求 | 推荐 JOIN | 说明 |
|
||||
|------|-----------|------|
|
||||
| 两表都有才保留 | INNER JOIN | 最常用 |
|
||||
| 左表全部保留 | LEFT JOIN | 补充右表信息 |
|
||||
| 右表全部保留 | RIGHT JOIN | 或反转用 LEFT JOIN |
|
||||
| 两表全部保留 | FULL OUTER JOIN | 分析差异 |
|
||||
| 判断左表是否在右表存在 | LEFT SEMI JOIN | 不需要右表列 |
|
||||
| 左表不在右表的行 | LEFT ANTI JOIN | 差集查询 |
|
||||
| 生成所有组合 | CROSS JOIN | 慎用 |
|
||||
@@ -0,0 +1,336 @@
|
||||
# Kudu (via Impala) SQL 语法参考
|
||||
|
||||
> **重要**:Kudu 本身没有 SQL 引擎,通过 Impala 访问。本文档是 Impala SQL 操作 Kudu 表的语法参考。
|
||||
|
||||
## 数据类型
|
||||
|
||||
| 类型 | 说明 | 示例 |
|
||||
|------|------|------|
|
||||
| BOOLEAN | 布尔 | active BOOLEAN |
|
||||
| TINYINT | 1字节整数 | level TINYINT |
|
||||
| SMALLINT | 2字节整数 | age SMALLINT |
|
||||
| INT | 4字节整数 | count INT |
|
||||
| BIGINT | 8字节整数 | id BIGINT |
|
||||
| FLOAT | 4字节浮点 | score FLOAT |
|
||||
| DOUBLE | 8字节浮点 | price DOUBLE |
|
||||
| DECIMAL(p,s) | 定点数 | amount DECIMAL(18,2) |
|
||||
| STRING | 变长字符串 | name STRING |
|
||||
| VARCHAR(n) | 变长字符串(限长) | code VARCHAR(50) |
|
||||
| CHAR(n) | 定长字符串 | flag CHAR(1) |
|
||||
| TIMESTAMP | 时间戳(微秒精度) | created_at TIMESTAMP |
|
||||
| DATE | 日期 | birth_date DATE |
|
||||
| BINARY | 二进制 | data BINARY |
|
||||
|
||||
**注意**:Kudu 不支持 ARRAY、MAP、STRUCT 等复杂类型。
|
||||
|
||||
---
|
||||
|
||||
## 时间函数
|
||||
|
||||
```sql
|
||||
-- 当前时间
|
||||
NOW() -- 当前日期时间
|
||||
CURRENT_TIMESTAMP() -- 当前时间戳
|
||||
UNIX_TIMESTAMP() -- 当前 Unix 时间戳(秒)
|
||||
TO_DATE(NOW()) -- 当前日期
|
||||
|
||||
-- 格式转换
|
||||
FROM_UNIXTIME(timestamp, 'yyyy-MM-dd') -- Unix 时间戳转格式化字符串
|
||||
FROM_UNIXTIME(timestamp, 'yyyy-MM-dd HH:mm:ss')
|
||||
CAST(string_col AS TIMESTAMP) -- 字符串转时间戳
|
||||
CAST(timestamp_col AS STRING) -- 时间戳转字符串
|
||||
|
||||
-- 日期计算
|
||||
DAYS_ADD(date_col, 7) -- 加7天
|
||||
DAYS_SUB(date_col, 7) -- 减7天
|
||||
ADD_MONTHS(date_col, 3) -- 加3个月
|
||||
DATE_ADD(date_col, INTERVAL 7 DAY) -- 加7天(标准语法)
|
||||
DATEDIFF(end_date, start_date) -- 日期差(天数)
|
||||
MONTHS_BETWEEN(date1, date2) -- 月份差
|
||||
|
||||
-- 日期提取
|
||||
YEAR(date_col) -- 年
|
||||
MONTH(date_col) -- 月
|
||||
DAY(date_col) -- 日
|
||||
DAYOFWEEK(date_col) -- 周几 (1=周日)
|
||||
DAYOFYEAR(date_col) -- 年中第几天
|
||||
HOUR(timestamp_col) -- 时
|
||||
MINUTE(timestamp_col) -- 分
|
||||
SECOND(timestamp_col) -- 秒
|
||||
QUARTER(date_col) -- 季度 (1-4)
|
||||
WEEKOFYEAR(date_col) -- 年中第几周
|
||||
|
||||
-- Impala 日期格式符
|
||||
-- yyyy: 4位年, MM: 2位月, dd: 2位日
|
||||
-- HH: 24小时制, mm: 分钟, ss: 秒
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 字符串函数
|
||||
|
||||
```sql
|
||||
-- 常用函数
|
||||
CONCAT(str1, str2, ...) -- 字符串拼接(支持多个参数)
|
||||
CONCAT_WS('-', str1, str2, ...) -- 用分隔符拼接
|
||||
LOWER(str) -- 转小写
|
||||
UPPER(str) -- 转大写
|
||||
TRIM(str) -- 去两端空格
|
||||
LTRIM(str) -- 去左空格
|
||||
RTRIM(str) -- 去右空格
|
||||
LENGTH(str) -- 字符串长度
|
||||
SUBSTR(str, pos, len) -- 截取字符串(pos从1开始)
|
||||
SUBSTRING(str, pos, len) -- 同上
|
||||
LEFT(str, len) -- 取左边len个字符
|
||||
RIGHT(str, len) -- 取右边len个字符
|
||||
REVERSE(str) -- 反转字符串
|
||||
REPEAT(str, n) -- 重复n次
|
||||
SPACE(n) -- 生成n个空格
|
||||
|
||||
-- 查找与替换
|
||||
INSTR(str, substr) -- 查找子串位置
|
||||
LOCATE(substr, str, pos) -- 从pos位置查找
|
||||
REPLACE(str, old, new) -- 替换
|
||||
REGEXP_EXTRACT(str, pattern, idx) -- 正则提取
|
||||
REGEXP_REPLACE(str, pattern, replacement) -- 正则替换
|
||||
|
||||
-- 分割
|
||||
SPLIT_PART(str, delimiter, idx) -- 取分割后第idx部分
|
||||
|
||||
-- 其他
|
||||
INITCAP(str) -- 首字母大写
|
||||
LPAD(str, len, pad) -- 左填充
|
||||
RPAD(str, len, pad) -- 右填充
|
||||
HEX(col) -- 转16进制
|
||||
UNHEX(str) -- 16进制转字符串
|
||||
```
|
||||
|
||||
**注意**:Impala 的 `CONCAT` 支持任意多个参数;需要统一分隔符时用 `CONCAT_WS`。
|
||||
|
||||
---
|
||||
|
||||
## 聚合函数
|
||||
|
||||
```sql
|
||||
-- 基础聚合
|
||||
COUNT(*) -- 计数(含NULL行)
|
||||
COUNT(col) -- 计数(不含NULL)
|
||||
COUNT(DISTINCT col) -- 去重计数
|
||||
SUM(col) -- 求和
|
||||
AVG(col) -- 平均值
|
||||
MIN(col) -- 最小值
|
||||
MAX(col) -- 最大值
|
||||
|
||||
-- 集合聚合
|
||||
GROUP_CONCAT(col, ',') -- 字符串聚合(分隔符为第二个参数)
|
||||
|
||||
-- 统计函数
|
||||
VARIANCE(col) -- 方差
|
||||
VAR_POP(col) -- 总体方差
|
||||
VAR_SAMP(col) -- 样本方差
|
||||
STDDEV(col) -- 标准差
|
||||
STDDEV_POP(col) -- 总体标准差
|
||||
STDDEV_SAMP(col) -- 样本标准差
|
||||
|
||||
-- 近似函数
|
||||
NDV(col) -- 近似去重计数(Impala 特有,比 COUNT(DISTINCT) 快)
|
||||
APPROX_COUNT_DISTINCT(col) -- 近似去重计数
|
||||
|
||||
-- 其他
|
||||
FIRST_VALUE(col) -- 窗口内第一个值
|
||||
LAST_VALUE(col) -- 窗口内最后一个值
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 条件表达式
|
||||
|
||||
```sql
|
||||
-- CASE WHEN
|
||||
CASE
|
||||
WHEN condition1 THEN result1
|
||||
WHEN condition2 THEN result2
|
||||
ELSE default_result
|
||||
END
|
||||
|
||||
-- CASE 字段匹配
|
||||
CASE field
|
||||
WHEN value1 THEN result1
|
||||
WHEN value2 THEN result2
|
||||
ELSE default_result
|
||||
END
|
||||
|
||||
-- COALESCE(取第一个非空值)
|
||||
COALESCE(col1, col2, default_value)
|
||||
|
||||
-- NULLIF(相等返回NULL)
|
||||
NULLIF(col1, col2)
|
||||
|
||||
-- IF(简单条件)
|
||||
IF(condition, true_value, false_value)
|
||||
|
||||
-- NULL 判断(Impala 没有单参数的 ISNULL / ISNOTNULL;ISNULL(a, b) 是 IFNULL 的别名)
|
||||
NULLVALUE(col) -- 判断是否为NULL(也可直接写 col IS NULL)
|
||||
NONNULLVALUE(col) -- 判断是否不为NULL(也可直接写 col IS NOT NULL)
|
||||
|
||||
-- NVL(空值替换)
|
||||
NVL(col, default_value)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Kudu 特有操作
|
||||
|
||||
### INSERT INTO
|
||||
|
||||
```sql
|
||||
-- 追加写入
|
||||
INSERT INTO kudu_table VALUES (1, 'test', 100.00);
|
||||
INSERT INTO kudu_table SELECT * FROM other_table WHERE ...;
|
||||
```
|
||||
|
||||
### UPSERT INTO(Kudu 核心能力)
|
||||
|
||||
```sql
|
||||
-- 主键存在则更新,不存在则插入
|
||||
UPSERT INTO kudu_table VALUES (1, 'test', 100.00);
|
||||
UPSERT INTO kudu_table SELECT * FROM staging_table WHERE ...;
|
||||
```
|
||||
|
||||
### UPDATE
|
||||
|
||||
```sql
|
||||
-- 更新数据(主键列不可更新)
|
||||
UPDATE kudu_table SET status = 'active' WHERE id = 1;
|
||||
UPDATE kudu_table SET amount = amount * 1.1 WHERE date < '2026-01-01';
|
||||
```
|
||||
|
||||
### DELETE
|
||||
|
||||
```sql
|
||||
-- 删除数据
|
||||
DELETE FROM kudu_table WHERE id = 1;
|
||||
DELETE FROM kudu_table WHERE date < '2026-01-01';
|
||||
```
|
||||
|
||||
### ALTER TABLE(Kudu 特有)
|
||||
|
||||
```sql
|
||||
-- 添加列
|
||||
ALTER TABLE kudu_table ADD COLUMNS (new_col STRING COMMENT '新列');
|
||||
|
||||
-- 删除列
|
||||
ALTER TABLE kudu_table DROP COLUMN old_col;
|
||||
|
||||
-- 重命名列(Kudu 表不支持修改已有列的类型,CHANGE 仅用于改列名)
|
||||
ALTER TABLE kudu_table CHANGE old_name new_name STRING;
|
||||
|
||||
-- 添加范围分区
|
||||
ALTER TABLE kudu_table ADD RANGE PARTITION
|
||||
'2026-06-01' <= VALUES < '2026-07-01';
|
||||
|
||||
-- 删除范围分区
|
||||
ALTER TABLE kudu_table DROP RANGE PARTITION
|
||||
'2026-01-01' <= VALUES < '2026-02-01';
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 分区操作
|
||||
|
||||
```sql
|
||||
-- Hash 分区(建表时指定)
|
||||
PARTITION BY HASH(id) PARTITIONS 8
|
||||
|
||||
-- Range 分区(建表时指定)
|
||||
PARTITION BY RANGE(stat_date) (
|
||||
PARTITION '2026-01-01' <= VALUES < '2026-02-01',
|
||||
PARTITION '2026-02-01' <= VALUES < '2026-03-01'
|
||||
)
|
||||
|
||||
-- Hash + Range 组合
|
||||
PARTITION BY
|
||||
HASH(id) PARTITIONS 4,
|
||||
RANGE(stat_date) (...)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 与 Spark SQL / Hive 的主要差异
|
||||
|
||||
| 特性 | Spark SQL | Hive | Kudu (Impala) | 说明 |
|
||||
|------|-----------|------|--------------|------|
|
||||
| **INSERT OVERWRITE** | ✅ | ✅ | ❌ | Kudu 不支持,用 DELETE + INSERT 替代 |
|
||||
| **UPSERT** | ❌ | ❌ | ✅ | Kudu 独有核心能力 |
|
||||
| **UPDATE** | ❌ | 仅ACID表 | ✅ | Kudu 原生支持 |
|
||||
| **DELETE** | ❌ | 仅ACID表 | ✅ | Kudu 原生支持 |
|
||||
| **主键约束** | ❌ 无约束 | ❌ 无约束 | ✅ 强制主键 | Kudu 表必须有主键 |
|
||||
| **复杂类型** | ✅ ARRAY/MAP/STRUCT | ✅ 完整支持 | ❌ 不支持 | Kudu 限制 |
|
||||
| **CONCAT 多参数** | ✅ 任意个数 | ✅ 任意个数 | ✅ 任意个数 | 三者一致;带分隔符拼接用 CONCAT_WS |
|
||||
| **近似去重** | `approx_count_distinct` | `approx_count_distinct` | `NDV` | Impala 特有函数名 |
|
||||
| **临时表链式处理** | ✅ 推荐 | ✅ 推荐 | ❌ 不需要 | Kudu 用 UPSERT 单步 |
|
||||
| **CTE (WITH)** | ✅ 支持 | ✅ 支持 | ✅ 支持 | 都支持 |
|
||||
| **MERGE INTO** | 仅 Delta/Iceberg 等表格式 | 仅ACID表 | ❌ | Kudu 用 UPSERT 替代 |
|
||||
| **分区类型** | 目录分区 | 目录分区 | Hash/Range 内置 | Kudu 分区机制不同 |
|
||||
| **分桶** | 可选 | 可选 | Hash 分区替代 | 概念类似但实现不同 |
|
||||
| **日期格式** | `yyyy-MM-dd` | `yyyy-MM-dd` | `yyyy-MM-dd` | 格式一致 |
|
||||
| **collect_list/set** | ✅ | ✅ | ❌ | Impala 用 GROUP_CONCAT |
|
||||
| **ALTER ADD COLUMN** | ✅ | ✅ | ✅ | Kudu 支持在线加列 |
|
||||
| **ALTER DROP COLUMN** | 部分支持 | ✅ | ✅ | Kudu 支持在线删列 |
|
||||
|
||||
---
|
||||
|
||||
## SQL 生成规则
|
||||
|
||||
### 通用规则(所有引擎统一)
|
||||
|
||||
1. **禁止使用 CTE (WITH 子句)**,每个主要逻辑步骤必须物化为临时表
|
||||
2. **先 DROP 再 CREATE**:`DROP TABLE IF EXISTS ...; CREATE TABLE ... AS SELECT ...;`
|
||||
3. **禁止 `SELECT *`**,必须明确列出所有字段
|
||||
4. 多表查询时所有表必须使用简短别名
|
||||
5. 每个步骤前添加注释说明
|
||||
6. **谓词下推**:过滤条件前置,JOIN 时在 WHERE 中一并添加过滤
|
||||
7. 临时表命名:`${db_tmp_env}.tmp_{业务简称}_{步骤序号}`
|
||||
|
||||
### Kudu 特有规则
|
||||
|
||||
1. **Kudu 表不支持 `INSERT OVERWRITE`**,用 `DELETE + INSERT` 或 `UPSERT` 替代
|
||||
2. **最后一步优先使用 `UPSERT INTO`**(Kudu 核心优势:主键存在则更新,不存在则插入)
|
||||
3. 需要全量刷新时:先 `DELETE FROM` 再 `INSERT INTO`
|
||||
4. 支持 `UPDATE` 和 `DELETE`(Kudu 表独有)
|
||||
5. Kudu 表必须有 `PRIMARY KEY`,主键列不能为 NULL
|
||||
6. `CONCAT` 支持多个参数;需要统一分隔符时用 `CONCAT_WS`
|
||||
7. 不支持 `collect_list` / `collect_set`,用 `GROUP_CONCAT` 替代
|
||||
8. 近似去重用 `NDV()` 函数
|
||||
9. 时间范围筛选:
|
||||
```sql
|
||||
-- 日账期过滤
|
||||
WHERE stat_date = '${day_id}'
|
||||
-- 日期范围
|
||||
WHERE stat_date >= DAYS_SUB(TO_DATE('${day_id}'), 30)
|
||||
AND stat_date < '${day_id}'
|
||||
```
|
||||
|
||||
### SQL 脚本结构
|
||||
|
||||
```sql
|
||||
-- =====================================================================
|
||||
-- @SqlName: kudu-D-SQL-{表名}
|
||||
-- @Engine: kudu
|
||||
-- ...(头注释)
|
||||
-- =====================================================================
|
||||
|
||||
-- Step01: {步骤描述}
|
||||
DROP TABLE IF EXISTS ${db_tmp_env}.tmp_xxx_01;
|
||||
CREATE TABLE ${db_tmp_env}.tmp_xxx_01 AS
|
||||
SELECT ...;
|
||||
|
||||
-- Step02: {步骤描述}
|
||||
DROP TABLE IF EXISTS ${db_tmp_env}.tmp_xxx_02;
|
||||
CREATE TABLE ${db_tmp_env}.tmp_xxx_02 AS
|
||||
SELECT ...;
|
||||
|
||||
-- 最后一步:UPSERT 写入 Kudu 目标表
|
||||
UPSERT INTO ${db_eda_env}.target_table
|
||||
SELECT ...;
|
||||
```
|
||||
@@ -0,0 +1,414 @@
|
||||
# Spark SQL 语法要点
|
||||
|
||||
## 数据类型
|
||||
|
||||
| 类型 | 说明 | 示例 |
|
||||
|------|------|------|
|
||||
| STRING | 字符串 | name STRING |
|
||||
| INT | 整数 | age INT |
|
||||
| BIGINT | 大整数 | id BIGINT |
|
||||
| DOUBLE | 浮点数 | price DOUBLE |
|
||||
| DECIMAL(p,s) | 定点数 | amount DECIMAL(18,2) |
|
||||
| BOOLEAN | 布尔 | active BOOLEAN |
|
||||
| DATE | 日期 | birth_date DATE |
|
||||
| TIMESTAMP | 时间戳 | created_at TIMESTAMP |
|
||||
| ARRAY<type> | 数组 | tags ARRAY<STRING> |
|
||||
| MAP<key,value> | 映射 | props MAP<STRING,STRING> |
|
||||
| STRUCT<field:type,...> | 结构体 | user STRUCT<id:INT,name:STRING> |
|
||||
|
||||
---
|
||||
|
||||
## 时间函数
|
||||
|
||||
```sql
|
||||
-- 当前时间
|
||||
current_date()
|
||||
current_timestamp()
|
||||
now()
|
||||
|
||||
-- 格式转换
|
||||
date_format(date_col, 'yyyy-MM-dd')
|
||||
date_format(timestamp_col, 'yyyy-MM-dd HH:mm:ss')
|
||||
to_date(string_col, 'yyyy-MM-dd')
|
||||
to_timestamp(string_col, 'yyyy-MM-dd HH:mm:ss')
|
||||
|
||||
-- 日期计算
|
||||
date_add(date_col, 7) -- 加7天
|
||||
date_sub(date_col, 7) -- 减7天
|
||||
add_months(date_col, 3) -- 加3个月
|
||||
datediff(end_date, start_date) -- 日期差(天数)
|
||||
|
||||
-- 日期提取
|
||||
year(date_col)
|
||||
month(date_col)
|
||||
day(date_col)
|
||||
dayofweek(date_col)
|
||||
hour(timestamp_col)
|
||||
minute(timestamp_col)
|
||||
second(timestamp_col)
|
||||
|
||||
-- 季度、周
|
||||
quarter(date_col) -- 季度 (1-4)
|
||||
weekofyear(date_col) -- 年中第几周
|
||||
|
||||
-- Unix 时间戳
|
||||
unix_timestamp(date_col) -- 转 Unix 时间戳
|
||||
from_unixtime(timestamp) -- Unix 时间戳转时间字符串
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 字符串函数
|
||||
|
||||
```sql
|
||||
-- 常用函数
|
||||
concat(str1, str2, ...) -- 字符串拼接
|
||||
concat_ws('-', str1, str2, ...) -- 用分隔符拼接
|
||||
lower(str) -- 转小写
|
||||
upper(str) -- 转大写
|
||||
trim(str) -- 去两端空格
|
||||
ltrim(str) -- 去左空格
|
||||
rtrim(str) -- 去右空格
|
||||
length(str) -- 字符串长度
|
||||
substring(str, pos, len) -- 截取字符串
|
||||
left(str, len) -- 取左边len个字符
|
||||
right(str, len) -- 取右边len个字符
|
||||
reverse(str) -- 反转字符串
|
||||
repeat(str, n) -- 重复n次
|
||||
space(n) -- 生成n个空格
|
||||
|
||||
-- 查找与替换
|
||||
instr(str, substr) -- 查找子串位置
|
||||
locate(substr, str, pos) -- 从pos位置查找
|
||||
replace(str, old, new) -- 替换
|
||||
regexp_extract(str, pattern, idx) -- 正则提取
|
||||
regexp_replace(str, pattern, replacement) -- 正则替换
|
||||
|
||||
-- 分割
|
||||
split(str, delimiter) -- 分割成数组
|
||||
split_part(str, delimiter, idx) -- 取分割后的第idx部分
|
||||
|
||||
-- 其他
|
||||
initcap(str) -- 首字母大写
|
||||
lpad(str, len, pad) -- 左填充
|
||||
rpad(str, len, pad) -- 右填充
|
||||
levenshtein(str1, str2) -- 编辑距离
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 条件表达式
|
||||
|
||||
```sql
|
||||
-- CASE WHEN
|
||||
CASE
|
||||
WHEN condition1 THEN result1
|
||||
WHEN condition2 THEN result2
|
||||
ELSE default_result
|
||||
END
|
||||
|
||||
-- CASE 字段匹配
|
||||
CASE field
|
||||
WHEN value1 THEN result1
|
||||
WHEN value2 THEN result2
|
||||
ELSE default_result
|
||||
END
|
||||
|
||||
-- COALESCE(取第一个非空值)
|
||||
COALESCE(col1, col2, default_value)
|
||||
|
||||
-- NULLIF(相等返回NULL)
|
||||
NULLIF(col1, col2)
|
||||
|
||||
-- IF(简单条件)
|
||||
IF(condition, true_value, false_value)
|
||||
|
||||
-- NVL(空值替换)
|
||||
NVL(col, default_value)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 聚合函数
|
||||
|
||||
```sql
|
||||
-- 基础聚合
|
||||
COUNT(*) -- 计数(含NULL行)
|
||||
COUNT(col) -- 计数(不含NULL)
|
||||
COUNT(DISTINCT col) -- 去重计数
|
||||
SUM(col) -- 求和
|
||||
AVG(col) -- 平均值
|
||||
MIN(col) -- 最小值
|
||||
MAX(col) -- 最大值
|
||||
|
||||
-- 集合聚合
|
||||
collect_list(col) -- 返回数组(不去重)
|
||||
collect_set(col) -- 返回数组(去重)
|
||||
|
||||
-- 统计函数
|
||||
variance(col) -- 方差
|
||||
var_pop(col) -- 总体方差
|
||||
var_samp(col) -- 样本方差
|
||||
stddev(col) -- 标准差
|
||||
stddev_pop(col) -- 总体标准差
|
||||
stddev_samp(col) -- 样本标准差
|
||||
|
||||
-- 近似函数
|
||||
approx_count_distinct(col) -- 近似去重计数(大数据量优化)
|
||||
|
||||
-- 其他
|
||||
first(col) -- 第一个值
|
||||
last(col) -- 最后一个值
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 数学函数
|
||||
|
||||
```sql
|
||||
-- 基础运算
|
||||
abs(col) -- 绝对值
|
||||
round(col, digits) -- 四舍五入
|
||||
ceil(col) -- 向上取整
|
||||
floor(col) -- 向下取整
|
||||
sign(col) -- 符号 (-1, 0, 1)
|
||||
|
||||
-- 指数与对数
|
||||
exp(col) -- e的指数
|
||||
log(col) -- 自然对数
|
||||
log10(col) -- 10为底对数
|
||||
log2(col) -- 2为底对数
|
||||
pow(col, n) -- 幂运算
|
||||
sqrt(col) -- 平方根
|
||||
|
||||
-- 三角函数
|
||||
sin(col), cos(col), tan(col)
|
||||
asin(col), acos(col), atan(col)
|
||||
|
||||
-- 随机数
|
||||
rand() -- 随机数 (0-1)
|
||||
rand(seed) -- 指定种子随机数
|
||||
|
||||
-- 其他
|
||||
cbrt(col) -- 立方根
|
||||
hex(col) -- 转16进制
|
||||
unhex(col) -- 16进制转字符串
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 数组函数
|
||||
|
||||
```sql
|
||||
-- 创建数组
|
||||
array(val1, val2, ...) -- 创建数组
|
||||
|
||||
-- 访问
|
||||
array_contains(arr, val) -- 判断是否包含
|
||||
element_at(arr, idx) -- 取元素(idx从1开始)
|
||||
arr[idx] -- 取元素(idx从0开始)
|
||||
|
||||
-- 操作
|
||||
size(arr) -- 数组长度
|
||||
array_join(arr, delimiter) -- 数组转字符串
|
||||
concat(arr1, arr2) -- 数组拼接
|
||||
|
||||
-- 展开
|
||||
explode(arr) -- 展开数组为多行
|
||||
posexplode(arr) -- 展开数组(带位置索引)
|
||||
|
||||
-- 排序与去重
|
||||
sort_array(arr) -- 排序
|
||||
array_distinct(arr) -- 去重
|
||||
array_remove(arr, val) -- 移除元素
|
||||
array_union(arr1, arr2) -- 并集
|
||||
array_intersect(arr1, arr2) -- 交集
|
||||
array_except(arr1, arr2) -- 差集
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Map 函数
|
||||
|
||||
```sql
|
||||
-- 创建 Map
|
||||
map(key1, val1, key2, val2, ...) -- 创建 Map
|
||||
str_to_map(str, delim1, delim2) -- 字符串转 Map
|
||||
|
||||
-- 访问
|
||||
map_contains_key(map, key) -- 判断是否包含key(Spark 3.3+)
|
||||
element_at(map, key) -- 取值
|
||||
map[key] -- 取值
|
||||
map_keys(map) -- 取所有key(返回数组)
|
||||
map_values(map) -- 取所有value(返回数组)
|
||||
|
||||
-- 操作
|
||||
size(map) -- Map大小
|
||||
map_concat(map1, map2) -- Map合并
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## JSON 函数
|
||||
|
||||
```sql
|
||||
-- 解析
|
||||
get_json_object(json_str, path) -- 提取JSON字段
|
||||
json_tuple(json_str, field1, ...) -- 提取多个字段
|
||||
|
||||
-- 转换
|
||||
from_json(json_str, schema) -- JSON转结构体
|
||||
to_json(struct_col) -- 结构体转JSON
|
||||
|
||||
-- Schema 定义示例
|
||||
from_json('{"name":"张三","age":25}', 'name STRING, age INT')
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 分区表操作
|
||||
|
||||
```sql
|
||||
-- 创建分区表
|
||||
CREATE TABLE target_table (
|
||||
id BIGINT,
|
||||
name STRING,
|
||||
amount DECIMAL(18,2)
|
||||
)
|
||||
PARTITIONED BY (day_id STRING)
|
||||
STORED AS PARQUET;
|
||||
|
||||
-- 写入指定分区
|
||||
INSERT OVERWRITE TABLE target_table
|
||||
PARTITION (day_id = '${day_id}')
|
||||
SELECT id, name, amount
|
||||
FROM source_table
|
||||
WHERE ...
|
||||
|
||||
-- 动态分区写入
|
||||
INSERT OVERWRITE TABLE target_table
|
||||
PARTITION (day_id)
|
||||
SELECT id, name, amount, day_id
|
||||
FROM source_table;
|
||||
|
||||
-- 查看分区
|
||||
SHOW PARTITIONS target_table;
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 临时表与视图
|
||||
|
||||
```sql
|
||||
-- 创建临时表(注意:Spark SQL 不支持 CREATE TEMPORARY TABLE ... AS SELECT,实际请改用下面的临时视图,或物化为普通表)
|
||||
CREATE TEMPORARY TABLE tmp_table AS
|
||||
SELECT ...
|
||||
|
||||
-- 创建临时视图
|
||||
CREATE TEMPORARY VIEW tmp_view AS
|
||||
SELECT ...
|
||||
|
||||
-- 全局临时视图(跨Session)
|
||||
CREATE GLOBAL TEMPORARY VIEW global_view AS
|
||||
SELECT ...
|
||||
|
||||
-- 删除
|
||||
DROP TABLE IF EXISTS tmp_table;
|
||||
DROP VIEW IF EXISTS tmp_view;
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## MERGE INTO(更新插入)
|
||||
|
||||
```sql
|
||||
-- MERGE INTO 语法
|
||||
MERGE INTO target_table t
|
||||
USING source_table s
|
||||
ON t.id = s.id
|
||||
WHEN MATCHED THEN UPDATE SET t.name = s.name, t.amount = s.amount
|
||||
WHEN NOT MATCHED THEN INSERT (id, name, amount) VALUES (s.id, s.name, s.amount)
|
||||
|
||||
-- 仅更新
|
||||
MERGE INTO target_table t
|
||||
USING source_table s
|
||||
ON t.id = s.id
|
||||
WHEN MATCHED THEN UPDATE SET *
|
||||
|
||||
-- 仅插入
|
||||
MERGE INTO target_table t
|
||||
USING source_table s
|
||||
ON t.id = s.id
|
||||
WHEN NOT MATCHED THEN INSERT *
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Spark SQL 不支持的特性
|
||||
|
||||
| PostgreSQL 特性 | Spark SQL | 替代方案 |
|
||||
|------------------|-----------|----------|
|
||||
| CREATE INDEX | ❌ 不支持 | 依赖存储格式优化(Parquet/ORC) |
|
||||
| CREATE TRIGGER | ❌ 不支持 | 使用程序逻辑处理 |
|
||||
| FOREIGN KEY 约束 | ❌ 不强制 | 数据关联靠 JOIN 保证 |
|
||||
| CHECK 约束 | ❌ 不支持 | 使用过滤条件 |
|
||||
| ON CONFLICT (UPSERT) | ❌ 不支持 | 使用 MERGE INTO(需 Delta/Iceberg 等表格式) |
|
||||
| WITH RECURSIVE | ❌ Spark 3.x 不支持 | 用程序迭代或多步临时表 |
|
||||
| 物化视图 | ❌ 不支持 | 使用缓存或临时表 |
|
||||
| 存储过程 | ❌ 不支持 | 使用外部程序 |
|
||||
| FOR UPDATE 锁 | ❌ 不支持 | 无行级锁概念 |
|
||||
|
||||
---
|
||||
|
||||
## SQL 生成规则
|
||||
|
||||
### 通用规则(所有引擎统一)
|
||||
|
||||
1. **禁止使用 CTE (WITH 子句)**,每个主要逻辑步骤必须物化为临时表
|
||||
2. **先 DROP 再 CREATE**:`DROP TABLE IF EXISTS ...; CREATE TABLE ... AS SELECT ...;`
|
||||
3. **禁止 `SELECT *`**,必须明确列出所有字段
|
||||
4. 多表查询时所有表必须使用简短别名
|
||||
5. 每个步骤前添加注释说明
|
||||
6. **谓词下推**:过滤条件前置,JOIN 时在 WHERE 中一并添加过滤
|
||||
7. 临时表命名:`${db_tmp_env}.tmp_{业务简称}_{步骤序号}`
|
||||
8. 目标表命名:`${db_eda_env}.{目标表名}`
|
||||
|
||||
### Spark 特有规则
|
||||
|
||||
1. 使用 `INSERT OVERWRITE TABLE` 写入目标表
|
||||
2. 分区表必须指定分区:`PARTITION (day_id = '${day_id}')`
|
||||
3. 最后一步写入目标表,中间步骤物化临时表
|
||||
4. 日期函数:`date_format()`, `to_date()`, `date_add()`, `add_months()`
|
||||
5. 时间范围筛选:
|
||||
```sql
|
||||
-- 日账期过滤
|
||||
WHERE day_id = '${day_id}'
|
||||
-- 最近N个月(月份格式 yyyyMM)
|
||||
WHERE month_id >= date_format(add_months(to_date('${month_id}', 'yyyyMM'), -N), 'yyyyMM')
|
||||
AND month_id < '${month_id}'
|
||||
```
|
||||
|
||||
### SQL 脚本结构
|
||||
|
||||
```sql
|
||||
-- =====================================================================
|
||||
-- @SqlName: spark-D-SQL-{表名}
|
||||
-- @Engine: spark
|
||||
-- ...(头注释)
|
||||
-- =====================================================================
|
||||
|
||||
-- Step01: {步骤描述}
|
||||
DROP TABLE IF EXISTS ${db_tmp_env}.tmp_xxx_01;
|
||||
CREATE TABLE ${db_tmp_env}.tmp_xxx_01 AS
|
||||
SELECT ...;
|
||||
|
||||
-- Step02: {步骤描述}
|
||||
DROP TABLE IF EXISTS ${db_tmp_env}.tmp_xxx_02;
|
||||
CREATE TABLE ${db_tmp_env}.tmp_xxx_02 AS
|
||||
SELECT ...;
|
||||
|
||||
-- 最后一步:写入目标表
|
||||
INSERT OVERWRITE TABLE ${db_eda_env}.target_table
|
||||
PARTITION (day_id = '${day_id}')
|
||||
SELECT ...;
|
||||
```
|
||||
@@ -0,0 +1,306 @@
|
||||
# 窗口函数速查
|
||||
|
||||
## 基本语法
|
||||
|
||||
```sql
|
||||
函数名() OVER (
|
||||
PARTITION BY 分组字段 -- 可选:分组
|
||||
ORDER BY 排序字段 -- 可选:排序
|
||||
ROWS/RANGE 窗口范围 -- 可选:窗口范围
|
||||
)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 排序函数
|
||||
|
||||
| 函数 | 说明 | 特点 | 适用场景 |
|
||||
|------|------|------|----------|
|
||||
| ROW_NUMBER() | 连续排名 | 不跳号,相同值不同排名 | 每组取前N条、去重 |
|
||||
| RANK() | 排名 | 跳号,相同值相同排名 | 成绩排名 |
|
||||
| DENSE_RANK() | 紧密排名 | 不跳号,相同值相同排名 | 连续名次 |
|
||||
| NTILE(n) | 分桶 | 分成n组 | 数据分片、抽样 |
|
||||
|
||||
### ROW_NUMBER 示例
|
||||
|
||||
```sql
|
||||
-- 每个部门薪资最高的员工(取第一名)
|
||||
SELECT *
|
||||
FROM (
|
||||
SELECT
|
||||
name,
|
||||
department,
|
||||
salary,
|
||||
ROW_NUMBER() OVER (PARTITION BY department ORDER BY salary DESC) AS rn
|
||||
FROM employees
|
||||
) t
|
||||
WHERE rn = 1
|
||||
|
||||
-- 去重:每个用户取最新订单
|
||||
SELECT *
|
||||
FROM (
|
||||
SELECT
|
||||
*,
|
||||
ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY created_at DESC) AS rn
|
||||
FROM orders
|
||||
) t
|
||||
WHERE rn = 1
|
||||
```
|
||||
|
||||
### RANK 与 DENSE_RANK 示例
|
||||
|
||||
```sql
|
||||
-- 成绩排名(跳号)
|
||||
SELECT
|
||||
name,
|
||||
score,
|
||||
RANK() OVER (ORDER BY score DESC) AS rank, -- 1,2,2,4,5...
|
||||
DENSE_RANK() OVER (ORDER BY score DESC) AS dense -- 1,2,2,3,4...
|
||||
FROM students
|
||||
```
|
||||
|
||||
### NTILE 示例
|
||||
|
||||
```sql
|
||||
-- 将用户分成4组(业绩分位)
|
||||
SELECT
|
||||
name,
|
||||
sales,
|
||||
NTILE(4) OVER (ORDER BY sales DESC) AS quartile -- 1(最高)到4(最低)
|
||||
FROM sales_data
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 聚合函数(窗口内)
|
||||
|
||||
| 函数 | 说明 |
|
||||
|------|------|
|
||||
| SUM(col) OVER | 累计求和 |
|
||||
| AVG(col) OVER | 累计平均 |
|
||||
| COUNT(col) OVER | 窗口内计数 |
|
||||
| MAX(col) OVER | 窗口内最大值 |
|
||||
| MIN(col) OVER | 窗口内最小值 |
|
||||
|
||||
### 累计求和示例
|
||||
|
||||
```sql
|
||||
-- 累计销售额(从开始到当前)
|
||||
SELECT
|
||||
date,
|
||||
amount,
|
||||
SUM(amount) OVER (ORDER BY date) AS cumulative_amount
|
||||
FROM daily_sales
|
||||
|
||||
-- 分组累计(每个部门累计)
|
||||
SELECT
|
||||
date,
|
||||
department,
|
||||
amount,
|
||||
SUM(amount) OVER (PARTITION BY department ORDER BY date) AS dept_cumulative
|
||||
FROM sales_data
|
||||
```
|
||||
|
||||
### 移动平均示例
|
||||
|
||||
```sql
|
||||
-- 7天移动平均
|
||||
SELECT
|
||||
date,
|
||||
amount,
|
||||
AVG(amount) OVER (
|
||||
ORDER BY date
|
||||
ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
|
||||
) AS moving_avg_7d
|
||||
FROM daily_sales
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 偏移函数
|
||||
|
||||
| 函数 | 说明 | 适用场景 |
|
||||
|------|------|----------|
|
||||
| LAG(col, n) | 取前n行的值 | 环比、查看历史值 |
|
||||
| LAG(col, n, default) | 取前n行,无值返回default | 防止NULL |
|
||||
| LEAD(col, n) | 取后n行的值 | 查看未来值 |
|
||||
| LEAD(col, n, default) | 取后n行,无值返回default | 防止NULL |
|
||||
| FIRST_VALUE(col) | 窗口第一个值 | 组内首个值 |
|
||||
| LAST_VALUE(col) | 窗口最后一个值 | 组内末尾值 |
|
||||
|
||||
### LAG 示例(环比计算)
|
||||
|
||||
```sql
|
||||
-- 日环比增长
|
||||
SELECT
|
||||
date,
|
||||
amount,
|
||||
LAG(amount, 1) OVER (ORDER BY date) AS prev_day,
|
||||
amount - LAG(amount, 1) OVER (ORDER BY date) AS daily_growth,
|
||||
ROUND((amount - LAG(amount, 1) OVER (ORDER BY date))
|
||||
/ LAG(amount, 1) OVER (ORDER BY date) * 100, 2) AS growth_rate_pct
|
||||
FROM daily_sales
|
||||
|
||||
-- 月度同比(取去年同期)
|
||||
SELECT
|
||||
month,
|
||||
revenue,
|
||||
LAG(revenue, 12) OVER (ORDER BY month) AS prev_year_revenue,
|
||||
ROUND((revenue - LAG(revenue, 12) OVER (ORDER BY month))
|
||||
/ LAG(revenue, 12) OVER (ORDER BY month) * 100, 2) AS yoy_growth_pct
|
||||
FROM monthly_revenue
|
||||
```
|
||||
|
||||
### LEAD 示例
|
||||
|
||||
```sql
|
||||
-- 查看下一行数据
|
||||
SELECT
|
||||
date,
|
||||
amount,
|
||||
LEAD(amount, 1) OVER (ORDER BY date) AS next_day_amount
|
||||
FROM daily_sales
|
||||
```
|
||||
|
||||
### FIRST_VALUE / LAST_VALUE 示例
|
||||
|
||||
```sql
|
||||
-- 每个部门薪资最高和最低的人
|
||||
SELECT
|
||||
name,
|
||||
department,
|
||||
salary,
|
||||
FIRST_VALUE(name) OVER (PARTITION BY department ORDER BY salary DESC) AS highest_paid,
|
||||
LAST_VALUE(name) OVER (
|
||||
PARTITION BY department
|
||||
ORDER BY salary DESC
|
||||
ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
|
||||
) AS lowest_paid
|
||||
FROM employees
|
||||
```
|
||||
|
||||
**注意**:带 ORDER BY 时 LAST_VALUE 的默认窗口是 `RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW`(只到当前行),需要显式指定全窗口才能取到最后值。
|
||||
|
||||
---
|
||||
|
||||
## 窗口范围定义
|
||||
|
||||
### ROWS(基于行数)
|
||||
|
||||
```sql
|
||||
ROWS BETWEEN 3 PRECEDING AND CURRENT ROW -- 前3行到当前行
|
||||
ROWS BETWEEN 6 PRECEDING AND 1 FOLLOWING -- 前6行到后1行
|
||||
ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW -- 从开始到当前
|
||||
ROWS BETWEEN CURRENT ROW AND 3 FOLLOWING -- 当前行到后3行
|
||||
ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING -- 全窗口
|
||||
ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING -- 当前行及前后各1行(共3行)
|
||||
```
|
||||
|
||||
### RANGE(基于值范围)
|
||||
|
||||
```sql
|
||||
RANGE BETWEEN 100 PRECEDING AND CURRENT ROW -- 值差100以内
|
||||
RANGE BETWEEN INTERVAL '7' DAY PRECEDING AND CURRENT ROW -- 7天内
|
||||
```
|
||||
|
||||
**ROWS vs RANGE**:
|
||||
- ROWS:固定行数,不受值影响
|
||||
- RANGE:根据排序字段的值计算范围
|
||||
|
||||
---
|
||||
|
||||
## 实战案例
|
||||
|
||||
### 案例1:分组取Top N
|
||||
|
||||
```sql
|
||||
-- 每个部门薪资前3名
|
||||
SELECT *
|
||||
FROM (
|
||||
SELECT
|
||||
name,
|
||||
department,
|
||||
salary,
|
||||
ROW_NUMBER() OVER (PARTITION BY department ORDER BY salary DESC) AS rn
|
||||
FROM employees
|
||||
) t
|
||||
WHERE rn <= 3
|
||||
```
|
||||
|
||||
### 案例2:连续问题判断
|
||||
|
||||
```sql
|
||||
-- 判断是否连续增长(连续3天增长)
|
||||
SELECT
|
||||
date,
|
||||
amount,
|
||||
CASE
|
||||
WHEN amount > LAG(amount, 1) OVER (ORDER BY date)
|
||||
AND LAG(amount, 1) OVER (ORDER BY date) > LAG(amount, 2) OVER (ORDER BY date)
|
||||
THEN '连续增长'
|
||||
ELSE '非连续增长'
|
||||
END AS trend
|
||||
FROM daily_sales
|
||||
```
|
||||
|
||||
### 案例3:组内占比
|
||||
|
||||
```sql
|
||||
-- 每个部门各员工薪资占比
|
||||
SELECT
|
||||
name,
|
||||
department,
|
||||
salary,
|
||||
ROUND(salary / SUM(salary) OVER (PARTITION BY department) * 100, 2) AS salary_pct
|
||||
FROM employees
|
||||
```
|
||||
|
||||
### 案例4:累计百分比(帕累托分析)
|
||||
|
||||
```sql
|
||||
-- 80/20分析:哪些客户贡献了80%销售额
|
||||
SELECT
|
||||
customer_id,
|
||||
sales_amount,
|
||||
ROUND(SUM(sales_amount) OVER (ORDER BY sales_amount DESC)
|
||||
/ SUM(sales_amount) OVER () * 100, 2) AS cumulative_pct
|
||||
FROM customer_sales
|
||||
ORDER BY sales_amount DESC
|
||||
```
|
||||
|
||||
### 案例5:缺失值填充
|
||||
|
||||
```sql
|
||||
-- 用前一个有效值填充NULL
|
||||
SELECT
|
||||
date,
|
||||
amount,
|
||||
LAST_VALUE(amount IGNORE NULLS) OVER (ORDER BY date) AS filled_amount
|
||||
FROM sales_data
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 性能优化建议
|
||||
|
||||
1. **减少 PARTITION BY 分组数量**:分组越多,计算越慢
|
||||
2. **合理使用窗口范围**:避免全窗口扫描
|
||||
3. **先过滤再窗口**:WHERE 条件前置,减少数据量
|
||||
4. **避免嵌套窗口函数**:多次调用会重复计算
|
||||
|
||||
```sql
|
||||
-- 推荐:先过滤
|
||||
SELECT
|
||||
*,
|
||||
ROW_NUMBER() OVER (PARTITION BY dept ORDER BY salary DESC) AS rn
|
||||
FROM employees
|
||||
WHERE hire_date >= '2024-01-01' -- 先过滤,减少数据量
|
||||
|
||||
-- 不推荐:先窗口再过滤(窗口函数在全部数据上执行)
|
||||
SELECT *
|
||||
FROM (
|
||||
SELECT *, ROW_NUMBER() OVER (...) AS rn
|
||||
FROM employees
|
||||
) t
|
||||
WHERE hire_date >= '2024-01-01'
|
||||
```
|
||||
@@ -0,0 +1,209 @@
|
||||
-- =====================================================================
|
||||
-- @Name: DORIS-D-SQL-{表名}-CREATE
|
||||
-- @Version: 1.0
|
||||
-- @Desc: Apache Doris 建表模板(OLAP 多模型)
|
||||
-- @TargetDatabase: Apache Doris
|
||||
-- =====================================================================
|
||||
|
||||
-- ============================================================================
|
||||
-- 场景1:Duplicate Key 模型(明细表)
|
||||
-- ============================================================================
|
||||
-- 适用:保留原始明细数据,不做预聚合,数据无冗余
|
||||
-- 特点:数据按 Key 排序存储,支持所有列的查询和聚合
|
||||
|
||||
CREATE TABLE IF NOT EXISTS db_name.detail_table (
|
||||
-- Key 列(排序字段)
|
||||
order_id BIGINT COMMENT '订单ID',
|
||||
order_date DATE COMMENT '订单日期',
|
||||
user_id BIGINT COMMENT '用户ID',
|
||||
|
||||
-- Value 列
|
||||
user_name VARCHAR(50) COMMENT '用户姓名',
|
||||
product_id BIGINT COMMENT '商品ID',
|
||||
product_name VARCHAR(200) COMMENT '商品名称',
|
||||
quantity INT COMMENT '购买数量',
|
||||
unit_price DECIMAL(18,2) COMMENT '单价',
|
||||
total_amount DECIMAL(18,2) COMMENT '总金额',
|
||||
status VARCHAR(20) COMMENT '订单状态',
|
||||
create_time DATETIME COMMENT '创建时间'
|
||||
)
|
||||
DUPLICATE KEY(order_id, order_date, user_id)
|
||||
COMMENT '订单明细表'
|
||||
PARTITION BY RANGE(order_date) (
|
||||
PARTITION p202601 VALUES LESS THAN ('2026-02-01'),
|
||||
PARTITION p202602 VALUES LESS THAN ('2026-03-01'),
|
||||
PARTITION p202603 VALUES LESS THAN ('2026-04-01')
|
||||
)
|
||||
DISTRIBUTED BY HASH(order_id) BUCKETS 8
|
||||
PROPERTIES (
|
||||
'replication_num' = '3',
|
||||
'storage_format' = 'V2'
|
||||
);
|
||||
|
||||
-- ============================================================================
|
||||
-- 场景2:Aggregate Key 模型(聚合表)
|
||||
-- ============================================================================
|
||||
-- 适用:预聚合场景,相同 Key 的数据自动合并
|
||||
-- 特点:Value 列必须指定聚合函数(SUM, REPLACE, MAX, MIN, HLL_UNION, BITMAP_UNION)
|
||||
|
||||
CREATE TABLE IF NOT EXISTS db_name.agg_table (
|
||||
-- Key 列(聚合维度)
|
||||
stat_date DATE COMMENT '统计日期',
|
||||
department VARCHAR(100) COMMENT '部门名称',
|
||||
region VARCHAR(100) COMMENT '地区',
|
||||
|
||||
-- Value 列(带聚合函数)
|
||||
order_count BIGINT SUM COMMENT '订单总数',
|
||||
total_amount DECIMAL(18,2) SUM COMMENT '总金额',
|
||||
unique_users BIGINT REPLACE COMMENT '去重用户数(预计算值)',
|
||||
max_amount DECIMAL(18,2) MAX COMMENT '最大金额',
|
||||
last_update DATETIME REPLACE COMMENT '最后更新时间'
|
||||
)
|
||||
AGGREGATE KEY(stat_date, department, region)
|
||||
COMMENT '部门销售聚合表'
|
||||
PARTITION BY RANGE(stat_date) (
|
||||
PARTITION p202601 VALUES LESS THAN ('2026-02-01'),
|
||||
PARTITION p202602 VALUES LESS THAN ('2026-03-01')
|
||||
)
|
||||
DISTRIBUTED BY HASH(department) BUCKETS 8
|
||||
PROPERTIES (
|
||||
'replication_num' = '3',
|
||||
'storage_format' = 'V2'
|
||||
);
|
||||
|
||||
-- ============================================================================
|
||||
-- 场景3:Unique Key 模型(唯一主键表)
|
||||
-- ============================================================================
|
||||
-- 适用:需要按主键更新/去重的场景
|
||||
-- 特点:相同主键的数据保留最新一条(整行替换)
|
||||
|
||||
-- Unique Key model: rows sharing the same user_id are collapsed to the most
-- recently loaded row (whole-row replace).
CREATE TABLE IF NOT EXISTS db_name.unique_table (
    -- Key column (primary key, must be unique)
    user_id         BIGINT          COMMENT '用户ID',

    -- Value columns
    user_name       VARCHAR(50)     COMMENT '用户姓名',
    phone           VARCHAR(20)     COMMENT '手机号',
    email           VARCHAR(100)    COMMENT '邮箱',
    vip_level       INT             COMMENT 'VIP等级',
    register_date   DATE            COMMENT '注册日期',
    last_login      DATETIME        COMMENT '最后登录时间',
    status          VARCHAR(10)     COMMENT '状态'
)
UNIQUE KEY(user_id)
COMMENT '用户信息表(按主键更新)'
DISTRIBUTED BY HASH(user_id) BUCKETS 16
PROPERTIES (
    'replication_num' = '3',
    -- Merge-on-write switch for Unique Key tables. The previous name
    -- 'enable_unique_key_merge_based_on_replica' is not a Doris property.
    'enable_unique_key_merge_on_write' = 'true'
);
|
||||
|
||||
-- ============================================================================
|
||||
-- 场景4:带动态分区属性
|
||||
-- ============================================================================
|
||||
-- 适用:按日自动创建和管理分区
|
||||
|
||||
CREATE TABLE IF NOT EXISTS db_name.auto_partition_table (
|
||||
stat_date DATE COMMENT '统计日期',
|
||||
department VARCHAR(100) COMMENT '部门',
|
||||
metric_value DECIMAL(18,2) SUM COMMENT '指标值',
|
||||
record_count BIGINT SUM COMMENT '记录数'
|
||||
)
|
||||
AGGREGATE KEY(stat_date, department)
|
||||
COMMENT '自动分区示例表'
|
||||
PARTITION BY RANGE(stat_date) ()
|
||||
DISTRIBUTED BY HASH(department) BUCKETS 8
|
||||
PROPERTIES (
|
||||
'replication_num' = '3',
|
||||
'dynamic_partition.enable' = 'true',
|
||||
'dynamic_partition.time_unit' = 'DAY',
|
||||
'dynamic_partition.start' = '-30', -- 保留30天历史
|
||||
'dynamic_partition.end' = '3', -- 预创建3天
|
||||
'dynamic_partition.prefix' = 'p',
|
||||
'dynamic_partition.buckets' = '8'
|
||||
);
|
||||
|
||||
-- ============================================================================
|
||||
-- 场景5:多分区 + 多分桶
|
||||
-- ============================================================================
|
||||
|
||||
CREATE TABLE IF NOT EXISTS db_name.multi_partition_table (
|
||||
stat_date DATE COMMENT '统计日期',
|
||||
region VARCHAR(50) COMMENT '地区',
|
||||
city VARCHAR(50) COMMENT '城市',
|
||||
user_id BIGINT COMMENT '用户ID',
|
||||
amount DECIMAL(18,2) SUM COMMENT '金额'
|
||||
)
|
||||
AGGREGATE KEY(stat_date, region, city, user_id)
|
||||
COMMENT '多维度分区示例'
|
||||
PARTITION BY RANGE(stat_date) (
|
||||
PARTITION p202601 VALUES LESS THAN ('2026-02-01'),
|
||||
PARTITION p202602 VALUES LESS THAN ('2026-03-01')
|
||||
)
|
||||
DISTRIBUTED BY HASH(user_id) BUCKETS 32
|
||||
PROPERTIES (
|
||||
'replication_num' = '3',
|
||||
'in_memory' = 'false',
|
||||
'storage_format' = 'V2',
|
||||
'compression' = 'LZ4'
|
||||
);
|
||||
|
||||
-- ============================================================================
|
||||
-- 字段类型速查
|
||||
-- ============================================================================
|
||||
/*
|
||||
| 类型 | 说明 | 适用场景 |
|
||||
|---------------|----------------|------------------------|
|
||||
| BOOLEAN | 布尔 | 状态标志 |
|
||||
| TINYINT | 1字节整数 | 小范围枚举 |
|
||||
| SMALLINT | 2字节整数 | 小范围数值 |
|
||||
| INT | 4字节整数 | 数量、等级 |
|
||||
| BIGINT | 8字节整数 | ID、计数、大数值 |
|
||||
| LARGEINT | 16字节整数 | 超大数值 |
|
||||
| FLOAT | 4字节浮点 | 近似计算 |
|
||||
| DOUBLE | 8字节浮点 | 科学计算 |
|
||||
| DECIMAL(p,s) | 定点数 | 金额、精确数值 |
|
||||
| DATE | 日期 | 日期字段(无时间) |
|
||||
| DATETIME | 日期时间 | 时间戳(精确到秒) |
|
||||
| CHAR(n) | 定长字符串 | 固定长度编码 |
|
||||
| VARCHAR(n) | 变长字符串 | 名称、描述 |
|
||||
| STRING | 变长字符串 | 大文本(无长度限制) |
|
||||
| BITMAP | 位图 | 精确去重(仅聚合模型) |
|
||||
| HLL | HyperLogLog | 近似去重(仅聚合模型) |
|
||||
| JSON | JSON | JSON数据存储 |
|
||||
*/
|
||||
|
||||
-- ============================================================================
|
||||
-- 建表规范说明
|
||||
-- ============================================================================
|
||||
/*
|
||||
1. 模型选择
|
||||
- Duplicate Key:保留原始明细,不做预聚合
|
||||
- Aggregate Key:预聚合,相同 Key 的 Value 自动合并
|
||||
- Unique Key:按主键去重,保留最新数据
|
||||
|
||||
2. 分区设计
|
||||
- 按时间字段 RANGE 分区(最常用)
|
||||
- 支持动态分区自动管理
|
||||
- 单分区数据量建议 1GB~10GB
|
||||
|
||||
3. 分桶设计
|
||||
- 使用高基数列做 HASH 分桶
|
||||
- 分桶数 = BE节点数 × CPU核数(参考值)
|
||||
- 单桶数据量建议 100MB~1GB
|
||||
|
||||
4. 副本数
|
||||
- 生产环境建议 3 副本
|
||||
- 测试环境可设 1 副本
|
||||
|
||||
5. Key 列选择
|
||||
- Duplicate Key:高频过滤/排序字段
|
||||
- Aggregate Key:聚合维度字段
|
||||
- Unique Key:业务主键
|
||||
|
||||
6. 注意事项
|
||||
- Key 列必须在 Value 列之前
|
||||
- 分区列必须是 Key 列
|
||||
- Aggregate / Unique 模型的分桶列必须是 Key 列(Duplicate 模型的分桶列可以是任意列)
|
||||
- BITMAP/HLL 仅用于 Aggregate 模型的 Value 列
|
||||
*/
|
||||
@@ -0,0 +1,128 @@
|
||||
-- =====================================================================
|
||||
-- @Name: DORIS-D-SQL-{表名}-ETL
|
||||
-- @Version: 2.0
|
||||
-- @Desc: Apache Doris ETL 数据处理模板(临时表链式处理)
|
||||
-- @TargetDatabase: Apache Doris
|
||||
-- @说明: 统一规范:禁止 CTE,每步物化为临时表,先 DROP 再 CREATE
|
||||
-- =====================================================================
|
||||
|
||||
-- ============================================================================
|
||||
-- Step01: 基础清洗与过滤
|
||||
-- ============================================================================
|
||||
-- 说明:从源表读取数据,进行基础过滤和清洗
|
||||
-- 输入:{源表名}
|
||||
-- 输出:${db_tmp_env}.tmp_xxx_01
|
||||
|
||||
-- Step01: read one day of source rows and apply the basic filters.
-- Drop first so the step can be re-run without failing on an existing table.
DROP TABLE IF EXISTS ${db_tmp_env}.tmp_xxx_01;
CREATE TABLE ${db_tmp_env}.tmp_xxx_01 AS
SELECT
    order_id,
    user_id,
    dept_id,
    category_id,    -- carried forward: Step02 joins dim_category on this key
    total_amount,
    status,
    order_date
FROM db_name.source_table
WHERE order_date = '${day_id}'
  AND status IN ('completed', 'shipped')    -- business filter
  AND total_amount > 0                      -- data-quality filter
  AND user_id IS NOT NULL;                  -- NULL filter
|
||||
|
||||
-- ============================================================================
|
||||
-- Step02: 多表关联与维度补全
|
||||
-- ============================================================================
|
||||
-- 说明:关联维度表,补全业务属性字段
|
||||
-- 输入:${db_tmp_env}.tmp_xxx_01, dim_department, dim_category
|
||||
-- 输出:${db_tmp_env}.tmp_xxx_02
|
||||
|
||||
DROP TABLE IF EXISTS ${db_tmp_env}.tmp_xxx_02;
|
||||
CREATE TABLE ${db_tmp_env}.tmp_xxx_02 AS
|
||||
SELECT
|
||||
a.order_id,
|
||||
a.user_id,
|
||||
a.total_amount,
|
||||
a.status,
|
||||
b.dept_name, -- 维度补全:部门名称
|
||||
c.category_name, -- 维度补全:类别名称
|
||||
a.order_date
|
||||
FROM ${db_tmp_env}.tmp_xxx_01 a
|
||||
LEFT JOIN db_name.dim_department b
|
||||
ON a.dept_id = b.dept_id
|
||||
LEFT JOIN db_name.dim_category c
|
||||
ON a.category_id = c.category_id;
|
||||
|
||||
-- ============================================================================
|
||||
-- Step03: 聚合计算与指标生成
|
||||
-- ============================================================================
|
||||
-- 说明:按业务维度聚合,计算统计指标
|
||||
-- 输入:${db_tmp_env}.tmp_xxx_02
|
||||
-- 输出:${db_tmp_env}.tmp_xxx_03
|
||||
|
||||
DROP TABLE IF EXISTS ${db_tmp_env}.tmp_xxx_03;
|
||||
CREATE TABLE ${db_tmp_env}.tmp_xxx_03 AS
|
||||
SELECT
|
||||
order_date,
|
||||
dept_name,
|
||||
category_name,
|
||||
COUNT(*) AS record_count, -- 记录数
|
||||
COUNT(DISTINCT user_id) AS unique_users, -- 去重用户数
|
||||
SUM(total_amount) AS total_amount, -- 总金额
|
||||
AVG(total_amount) AS avg_amount, -- 平均金额
|
||||
MAX(total_amount) AS max_amount -- 最大金额
|
||||
FROM ${db_tmp_env}.tmp_xxx_02
|
||||
GROUP BY order_date, dept_name, category_name;
|
||||
|
||||
-- ============================================================================
|
||||
-- Step04: 最终输出写入目标表
|
||||
-- ============================================================================
|
||||
-- 说明:补全目标表标准字段,写入结果表
|
||||
-- 输入:${db_tmp_env}.tmp_xxx_03
|
||||
-- 输出:目标表
|
||||
|
||||
INSERT INTO ${db_eda_env}.target_table
|
||||
SELECT
|
||||
-- 业务字段
|
||||
dept_name,
|
||||
category_name,
|
||||
record_count,
|
||||
unique_users,
|
||||
total_amount,
|
||||
avg_amount,
|
||||
max_amount,
|
||||
|
||||
-- 技术字段
|
||||
NOW() AS etl_time, -- 数据加工时间
|
||||
'${day_id}' AS stat_date -- 统计日期
|
||||
FROM ${db_tmp_env}.tmp_xxx_03;
|
||||
|
||||
-- ============================================================================
|
||||
-- 关键规则说明
|
||||
-- ============================================================================
|
||||
/*
|
||||
1. 禁止使用 CTE (WITH 子句)
|
||||
- 每个步骤必须物化为临时表
|
||||
- 原因:便于调试、断点续跑、统一编码规范
|
||||
|
||||
2. 先 DROP 再 CREATE
|
||||
- 每个临时表创建前必须先 DROP TABLE IF EXISTS
|
||||
- 原因:防止表已存在导致失败
|
||||
|
||||
3. Doris 写入方式
|
||||
- 默认使用 INSERT INTO
|
||||
- Aggregate Key 表:自动合并相同 Key 的数据
|
||||
- Unique Key 表:自动按主键去重,保留最新数据
|
||||
- Doris 2.0+ 也支持 INSERT OVERWRITE
|
||||
|
||||
4. 过滤条件前置
|
||||
- 所有过滤在最早阶段应用
|
||||
- 减少中间数据量
|
||||
|
||||
5. 临时表命名规范
|
||||
- 格式:tmp_{业务简称}_{步骤序号}
|
||||
- 示例:tmp_order_stats_01, tmp_order_stats_02
|
||||
|
||||
6. Doris 特有注意事项
|
||||
- 支持 LEFT SEMI JOIN / LEFT ANTI JOIN(也可以用 EXISTS / NOT EXISTS 子查询表达同样的语义)
|
||||
- 日期函数用 MySQL 风格:DATE_FORMAT, DATE_ADD(INTERVAL)
|
||||
- 新版本 Doris 已支持 collect_list/collect_set;旧版本不支持时可用 GROUP_CONCAT 替代
|
||||
*/
|
||||
@@ -0,0 +1,147 @@
|
||||
-- =====================================================================
|
||||
-- @Name: DORIS-D-SQL-{表名}-INSERT
|
||||
-- @Version: 1.0
|
||||
-- @Desc: Apache Doris 数据插入模板
|
||||
-- @TargetDatabase: Apache Doris
|
||||
-- =====================================================================
|
||||
|
||||
-- ============================================================================
|
||||
-- 场景1:INSERT INTO(追加写入)
|
||||
-- ============================================================================
|
||||
-- 适用:向 Doris 表追加数据,不会删除已有数据
|
||||
|
||||
INSERT INTO db_name.target_table
|
||||
SELECT
|
||||
stat_date,
|
||||
department,
|
||||
region,
|
||||
order_count,
|
||||
total_amount
|
||||
FROM db_name.source_table
|
||||
WHERE stat_date = '${day_id}';
|
||||
|
||||
-- ============================================================================
|
||||
-- 场景2:INSERT OVERWRITE(覆盖写入)
|
||||
-- ============================================================================
|
||||
-- 适用:覆盖目标表(或指定分区)的全部数据
|
||||
-- 注意:Doris 2.0+ 支持,且仅适用于 Partition 表
|
||||
|
||||
-- 覆盖整表
|
||||
-- Replace the entire contents of the target table with the query result.
-- Doris syntax is INSERT OVERWRITE TABLE <name>; the TABLE keyword is required.
INSERT OVERWRITE TABLE db_name.target_table
SELECT
    stat_date,
    department,
    region,
    order_count,
    total_amount
FROM db_name.source_table;
|
||||
|
||||
-- 覆盖指定分区(推荐)
|
||||
-- Replace only partition p202605 of the target table.
-- Doris syntax is INSERT OVERWRITE TABLE <name> PARTITION(...).
-- In Doris the partition column is an ordinary column, so stat_date must
-- still be selected; the column list matches the other target_table examples.
INSERT OVERWRITE TABLE db_name.target_table
PARTITION(p202605)
SELECT
    stat_date,
    department,
    region,
    order_count,
    total_amount
FROM db_name.source_table
-- Half-open range covering exactly the rows that belong to p202605.
WHERE stat_date >= '2026-05-01'
  AND stat_date <  '2026-06-01';
|
||||
|
||||
-- ============================================================================
|
||||
-- 场景3:从查询结果写入(ETL 场景)
|
||||
-- ============================================================================
|
||||
|
||||
-- 简单转换后写入
|
||||
INSERT INTO db_name.target_table
|
||||
SELECT
|
||||
order_date,
|
||||
department,
|
||||
COUNT(*) AS order_count,
|
||||
COUNT(DISTINCT user_id) AS unique_users,
|
||||
SUM(total_amount) AS total_amount,
|
||||
AVG(total_amount) AS avg_amount
|
||||
FROM db_name.source_orders o
|
||||
LEFT JOIN db_name.dim_department d ON o.dept_id = d.dept_id
|
||||
WHERE o.order_date = '${day_id}'
|
||||
GROUP BY order_date, department;
|
||||
|
||||
-- ============================================================================
|
||||
-- 场景4:批量 VALUES 写入
|
||||
-- ============================================================================
|
||||
|
||||
INSERT INTO db_name.target_table (stat_date, department, amount)
|
||||
VALUES
|
||||
('2026-05-01', '市场部', 10000.00),
|
||||
('2026-05-01', '技术部', 25000.00),
|
||||
('2026-05-01', '运营部', 18000.00);
|
||||
|
||||
-- ============================================================================
|
||||
-- 场景5:Stream Load(数据导入)
|
||||
-- ============================================================================
|
||||
-- 适用:大批量数据导入(百万级以上)
|
||||
-- 注意:Stream Load 通过 HTTP 协议提交,不是 SQL 语法
|
||||
|
||||
/*
|
||||
-- curl 命令示例
|
||||
curl --location-trusted -u user:password \
|
||||
-H "label:load_order_20260501" \
|
||||
-H "column_separator:," \
|
||||
-H "columns:order_id,order_date,user_id,total_amount" \
|
||||
-T data.csv \
|
||||
http://fe_host:8030/api/db_name/orders/_stream_load
|
||||
*/
|
||||
|
||||
-- ============================================================================
|
||||
-- 场景6:Broker Load(外部数据源导入)
|
||||
-- ============================================================================
|
||||
|
||||
/*
|
||||
LOAD LABEL db_name.load_label_20260501
|
||||
(
|
||||
DATA INFILE('hdfs://namenode:8020/path/to/data/*')
|
||||
INTO TABLE target_table
|
||||
COLUMNS TERMINATED BY ','
|
||||
(stat_date, department, region, amount)
|
||||
SET (amount = amount * 1.0)
|
||||
)
|
||||
WITH BROKER 'broker_name'
|
||||
(
|
||||
'username' = 'hdfs_user',
|
||||
'password' = 'hdfs_password'
|
||||
)
|
||||
PROPERTIES
|
||||
(
|
||||
'timeout' = '3600',
|
||||
'max_filter_ratio' = '0.01'
|
||||
);
|
||||
*/
|
||||
|
||||
-- ============================================================================
|
||||
-- 关键规则说明
|
||||
-- ============================================================================
|
||||
/*
|
||||
1. INSERT INTO vs INSERT OVERWRITE
|
||||
- INSERT INTO:追加数据,不删除已有数据
|
||||
- INSERT OVERWRITE:覆盖数据(Doris 2.0+ 支持)
|
||||
- 日常增量推荐 INSERT INTO,全量刷新推荐 INSERT OVERWRITE
|
||||
|
||||
2. 单步写入与多步 ETL 的区别
|
||||
- 简单写入可直接用单条 INSERT INTO ... SELECT ... 完成
|
||||
- 多步骤 ETL 请遵循 ETL 模板的统一规范:禁止 CTE,每步物化为临时表
|
||||
|
||||
3. 字段顺序
|
||||
- SELECT 字段顺序必须与目标表列定义顺序一致
|
||||
- 或显式指定列名:INSERT INTO table (col1, col2) SELECT ...
|
||||
|
||||
4. 数据导入方式选择
|
||||
- 少量数据:INSERT INTO ... SELECT ... 或 INSERT INTO ... VALUES ...
|
||||
- 大批量导入:Stream Load(HTTP PUT,最高性能)
|
||||
- HDFS 导入:Broker Load
|
||||
- 外部数据源:Routine Load(Kafka 等)
|
||||
|
||||
5. 性能建议
|
||||
- 批量写入优于逐条写入
|
||||
- Stream Load 是最高性能的导入方式
|
||||
- 建议攒批后一次性写入,避免频繁小批量导入
|
||||
*/
|
||||
@@ -0,0 +1,189 @@
|
||||
-- =====================================================================
|
||||
-- @Name: DORIS-D-SQL-{表名}-QUERY
|
||||
-- @Version: 1.0
|
||||
-- @Desc: Apache Doris 查询模板
|
||||
-- @TargetDatabase: Apache Doris
|
||||
-- =====================================================================
|
||||
|
||||
-- ============================================================================
|
||||
-- 1. 单表查询
|
||||
-- ============================================================================
|
||||
|
||||
SELECT
|
||||
order_id,
|
||||
user_name,
|
||||
total_amount,
|
||||
create_time
|
||||
FROM db_name.orders
|
||||
WHERE order_date = '${day_id}'
|
||||
AND status = 'completed'
|
||||
ORDER BY total_amount DESC
|
||||
LIMIT 100;
|
||||
|
||||
-- ============================================================================
|
||||
-- 2. JOIN 查询
|
||||
-- ============================================================================
|
||||
|
||||
-- 两表 JOIN
|
||||
SELECT
|
||||
o.order_id,
|
||||
o.total_amount,
|
||||
u.user_name,
|
||||
u.vip_level
|
||||
FROM db_name.orders o
|
||||
JOIN db_name.users u ON o.user_id = u.user_id
|
||||
WHERE o.order_date = '${day_id}'
|
||||
AND o.status = 'completed';
|
||||
|
||||
-- 多表 JOIN
|
||||
SELECT
|
||||
o.order_id,
|
||||
u.user_name,
|
||||
p.product_name,
|
||||
oi.quantity,
|
||||
oi.unit_price
|
||||
FROM db_name.orders o
|
||||
JOIN db_name.users u ON o.user_id = u.user_id
|
||||
JOIN db_name.order_items oi ON o.order_id = oi.order_id
|
||||
JOIN db_name.products p ON oi.product_id = p.product_id
|
||||
WHERE o.order_date BETWEEN '${start_date}' AND '${end_date}';
|
||||
|
||||
-- ============================================================================
|
||||
-- 3. 聚合查询
|
||||
-- ============================================================================
|
||||
|
||||
-- GROUP BY + HAVING
|
||||
SELECT
|
||||
department,
|
||||
COUNT(*) AS order_count,
|
||||
SUM(total_amount) AS total_amount,
|
||||
AVG(total_amount) AS avg_amount
|
||||
FROM db_name.orders
|
||||
WHERE order_date = '${day_id}'
|
||||
GROUP BY department
|
||||
HAVING COUNT(*) >= 5
|
||||
ORDER BY total_amount DESC;
|
||||
|
||||
-- 多字段分组 + 去重计数
|
||||
SELECT
|
||||
order_date,
|
||||
region,
|
||||
COUNT(*) AS order_count,
|
||||
COUNT(DISTINCT user_id) AS unique_users,
|
||||
SUM(total_amount) AS total_amount
|
||||
FROM db_name.orders
|
||||
WHERE order_date BETWEEN '${start_date}' AND '${end_date}'
|
||||
GROUP BY order_date, region;
|
||||
|
||||
-- ============================================================================
|
||||
-- 4. 窗口函数
|
||||
-- ============================================================================
|
||||
|
||||
-- ROW_NUMBER(分组取Top N)
|
||||
SELECT *
|
||||
FROM (
|
||||
SELECT
|
||||
department,
|
||||
user_name,
|
||||
total_amount,
|
||||
ROW_NUMBER() OVER (PARTITION BY department ORDER BY total_amount DESC) AS rn
|
||||
FROM db_name.orders
|
||||
WHERE order_date = '${day_id}'
|
||||
) t
|
||||
WHERE rn <= 3;
|
||||
|
||||
-- 累计聚合
|
||||
SELECT
|
||||
order_date,
|
||||
daily_amount,
|
||||
SUM(daily_amount) OVER (ORDER BY order_date) AS cumulative_amount,
|
||||
AVG(daily_amount) OVER (
|
||||
ORDER BY order_date
|
||||
ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
|
||||
) AS moving_avg_7d
|
||||
FROM (
|
||||
SELECT order_date, SUM(total_amount) AS daily_amount
|
||||
FROM db_name.orders
|
||||
GROUP BY order_date
|
||||
) t;
|
||||
|
||||
-- LAG/LEAD(环比计算)
|
||||
-- Day-over-day comparison of daily order totals.
-- The previous day's total is computed once in a derived table and reused
-- for the absolute change and the percentage growth rate.
SELECT
    order_date,
    daily_amount,
    prev_amount,
    daily_amount - prev_amount AS daily_change,
    ROUND((daily_amount - prev_amount) / prev_amount * 100, 2) AS growth_rate_pct
FROM (
    SELECT
        order_date,
        daily_amount,
        -- NULL on the first date, which makes the change and rate NULL too
        LAG(daily_amount, 1) OVER (ORDER BY order_date) AS prev_amount
    FROM (
        -- One row per order_date with that day's summed amount
        SELECT order_date, SUM(total_amount) AS daily_amount
        FROM db_name.orders
        GROUP BY order_date
    ) daily
) with_prev;
|
||||
|
||||
-- ============================================================================
|
||||
-- 5. 分页查询
|
||||
-- ============================================================================
|
||||
|
||||
-- LIMIT OFFSET 分页(Doris 原生支持)
|
||||
SELECT
|
||||
order_id, user_name, total_amount
|
||||
FROM db_name.orders
|
||||
WHERE order_date = '${day_id}'
|
||||
ORDER BY order_id
|
||||
LIMIT 20 OFFSET 0; -- 第1页,每页20条
|
||||
|
||||
-- ============================================================================
|
||||
-- 6. 子查询
|
||||
-- ============================================================================
|
||||
|
||||
-- IN 子查询
|
||||
SELECT *
|
||||
FROM db_name.orders
|
||||
WHERE user_id IN (
|
||||
SELECT user_id FROM db_name.users WHERE vip_level >= 3
|
||||
)
|
||||
AND order_date = '${day_id}';
|
||||
|
||||
-- EXISTS 子查询
|
||||
SELECT *
|
||||
FROM db_name.products p
|
||||
WHERE EXISTS (
|
||||
SELECT 1 FROM db_name.inventory i
|
||||
WHERE i.product_id = p.product_id
|
||||
AND i.quantity > 0
|
||||
);
|
||||
|
||||
-- ============================================================================
|
||||
-- 7. 条件聚合(CASE WHEN + 聚合)
|
||||
-- ============================================================================
|
||||
|
||||
SELECT
|
||||
order_date,
|
||||
COUNT(*) AS total_orders,
|
||||
SUM(CASE WHEN status = 'completed' THEN 1 ELSE 0 END) AS completed_count,
|
||||
SUM(CASE WHEN status = 'cancelled' THEN 1 ELSE 0 END) AS cancelled_count,
|
||||
SUM(CASE WHEN status = 'pending' THEN 1 ELSE 0 END) AS pending_count,
|
||||
SUM(CASE WHEN total_amount > 1000 THEN total_amount ELSE 0 END) AS high_value_amount
|
||||
FROM db_name.orders
|
||||
WHERE order_date = '${day_id}'
|
||||
GROUP BY order_date;
|
||||
|
||||
-- ============================================================================
|
||||
-- 8. Bitmap 精确去重(Doris 特有)
|
||||
-- ============================================================================
|
||||
|
||||
-- 使用 bitmap 做精确去重(需要在 Aggregate Key 模型中定义 BITMAP 类型列)
|
||||
-- 注意:bitmap 函数只能用于包含 BITMAP 类型列的表
|
||||
|
||||
-- 精确去重计数(预计算场景,在 Aggregate Key 表中使用 BITMAP_UNION)
|
||||
-- 建表时 Value 列定义:user_id_bitmap BITMAP BITMAP_UNION
|
||||
-- 查询时:
|
||||
-- SELECT date, bitmap_union_count(user_id_bitmap) AS uv FROM table GROUP BY date;
|
||||
|
||||
-- HLL 近似去重
|
||||
-- 建表时 Value 列定义:user_id_hll HLL HLL_UNION
|
||||
-- 查询时:
|
||||
-- SELECT date, hll_union_agg(user_id_hll) AS approx_uv FROM table GROUP BY date;
|
||||
@@ -0,0 +1,211 @@
|
||||
-- =====================================================================
|
||||
-- @Name: HIVE-D-SQL-{表名}-CREATE
|
||||
-- @Version: 1.0
|
||||
-- @Desc: Hive 建表模板(内部表/外部表/分区/分桶)
|
||||
-- @TargetDatabase: Hive
|
||||
-- =====================================================================
|
||||
|
||||
-- ============================================================================
|
||||
-- 场景1:内部表(Managed Table)
|
||||
-- ============================================================================
|
||||
-- 适用:Hive 管理数据和元数据,DROP TABLE 时数据一并删除
|
||||
|
||||
CREATE TABLE IF NOT EXISTS db_name.managed_table (
|
||||
id BIGINT COMMENT '主键ID',
|
||||
name STRING COMMENT '名称',
|
||||
category STRING COMMENT '类别',
|
||||
amount DECIMAL(18,2) COMMENT '金额',
|
||||
status STRING COMMENT '状态',
|
||||
created_at TIMESTAMP COMMENT '创建时间',
|
||||
updated_at TIMESTAMP COMMENT '更新时间',
|
||||
etl_time TIMESTAMP COMMENT '数据加工时间',
|
||||
etl_remark STRING COMMENT '备注信息'
|
||||
)
|
||||
COMMENT '内部表示例'
|
||||
STORED AS ORC; -- 推荐存储格式
|
||||
|
||||
-- ============================================================================
|
||||
-- 场景2:外部表(External Table)
|
||||
-- ============================================================================
|
||||
-- 适用:数据由外部系统管理,DROP TABLE 只删元数据不删数据
|
||||
|
||||
CREATE EXTERNAL TABLE IF NOT EXISTS db_name.external_table (
|
||||
id BIGINT COMMENT '主键ID',
|
||||
user_id STRING COMMENT '用户ID',
|
||||
action STRING COMMENT '操作类型',
|
||||
page_url STRING COMMENT '页面URL',
|
||||
ip_address STRING COMMENT 'IP地址',
|
||||
event_time TIMESTAMP COMMENT '事件时间'
|
||||
)
|
||||
COMMENT '日志外部表'
|
||||
ROW FORMAT DELIMITED
|
||||
FIELDS TERMINATED BY '\t'
|
||||
LINES TERMINATED BY '\n'
|
||||
STORED AS TEXTFILE
|
||||
LOCATION '/data/external/logs/';
|
||||
|
||||
-- ============================================================================
|
||||
-- 场景3:分区表(单分区)
|
||||
-- ============================================================================
|
||||
|
||||
CREATE TABLE IF NOT EXISTS db_name.partitioned_table (
|
||||
id BIGINT COMMENT '主键ID',
|
||||
user_id STRING COMMENT '用户ID',
|
||||
user_name STRING COMMENT '用户姓名',
|
||||
order_count BIGINT COMMENT '订单数',
|
||||
total_amount DECIMAL(18,2) COMMENT '总金额',
|
||||
department STRING COMMENT '部门',
|
||||
region STRING COMMENT '地区',
|
||||
etl_time TIMESTAMP COMMENT '数据加工时间'
|
||||
)
|
||||
COMMENT '按日分区的统计表'
|
||||
PARTITIONED BY (day_id STRING COMMENT '统计日期,格式yyyy-MM-dd')
|
||||
STORED AS ORC;
|
||||
|
||||
-- ============================================================================
|
||||
-- 场景4:多分区字段
|
||||
-- ============================================================================
|
||||
|
||||
CREATE TABLE IF NOT EXISTS db_name.multi_partition_table (
|
||||
id BIGINT COMMENT '主键ID',
|
||||
name STRING COMMENT '名称',
|
||||
amount DECIMAL(18,2) COMMENT '金额',
|
||||
etl_time TIMESTAMP COMMENT '数据加工时间'
|
||||
)
|
||||
COMMENT '多分区字段示例表'
|
||||
PARTITIONED BY (
|
||||
year_id STRING COMMENT '年份',
|
||||
month_id STRING COMMENT '月份'
|
||||
)
|
||||
STORED AS ORC;
|
||||
|
||||
-- ============================================================================
|
||||
-- 场景5:分桶表
|
||||
-- ============================================================================
|
||||
|
||||
CREATE TABLE IF NOT EXISTS db_name.bucketed_table (
|
||||
id BIGINT COMMENT '主键ID',
|
||||
user_id BIGINT COMMENT '用户ID',
|
||||
user_name STRING COMMENT '用户姓名',
|
||||
amount DECIMAL(18,2) COMMENT '金额'
|
||||
)
|
||||
COMMENT '分桶表示例'
|
||||
PARTITIONED BY (day_id STRING)
|
||||
CLUSTERED BY (user_id) -- 分桶列
|
||||
SORTED BY (amount DESC) -- 桶内排序
|
||||
INTO 16 BUCKETS -- 桶数量
|
||||
STORED AS ORC;
|
||||
|
||||
-- ============================================================================
|
||||
-- 场景6:ORC 格式 + 表属性
|
||||
-- ============================================================================
|
||||
|
||||
CREATE TABLE IF NOT EXISTS db_name.orc_table (
|
||||
id BIGINT COMMENT '主键ID',
|
||||
name STRING COMMENT '名称',
|
||||
amount DECIMAL(18,2) COMMENT '金额',
|
||||
etl_time TIMESTAMP COMMENT '数据加工时间'
|
||||
)
|
||||
COMMENT 'ORC格式带属性配置'
|
||||
PARTITIONED BY (day_id STRING)
|
||||
STORED AS ORC
|
||||
TBLPROPERTIES (
|
||||
'orc.compress' = 'SNAPPY', -- 压缩格式
|
||||
'orc.create.index' = 'true', -- 创建索引
|
||||
'transactional' = 'false' -- 非事务表
|
||||
);
|
||||
|
||||
-- ============================================================================
|
||||
-- 场景7:Parquet 格式
|
||||
-- ============================================================================
|
||||
|
||||
CREATE TABLE IF NOT EXISTS db_name.parquet_table (
|
||||
id BIGINT COMMENT '主键ID',
|
||||
name STRING COMMENT '名称',
|
||||
amount DECIMAL(18,2) COMMENT '金额',
|
||||
tags ARRAY<STRING> COMMENT '标签数组',
|
||||
props MAP<STRING,STRING> COMMENT '属性映射'
|
||||
)
|
||||
COMMENT 'Parquet格式表示例'
|
||||
PARTITIONED BY (day_id STRING)
|
||||
STORED AS PARQUET
|
||||
TBLPROPERTIES (
|
||||
'parquet.compression' = 'SNAPPY'
|
||||
);
|
||||
|
||||
-- ============================================================================
|
||||
-- 场景8:临时表
|
||||
-- ============================================================================
|
||||
|
||||
-- 会话级临时表(会话结束自动删除)
|
||||
CREATE TEMPORARY TABLE tmp_processing (
|
||||
id BIGINT,
|
||||
name STRING,
|
||||
amount DECIMAL(18,2)
|
||||
);
|
||||
|
||||
-- CTAS 快速创建临时表
|
||||
CREATE TEMPORARY TABLE tmp_source AS
|
||||
SELECT id, name, amount
|
||||
FROM source_table
|
||||
WHERE day_id = '${day_id}';
|
||||
|
||||
-- ============================================================================
|
||||
-- 字段类型速查
|
||||
-- ============================================================================
|
||||
/*
|
||||
| 类型 | 说明 | 适用场景 |
|
||||
|-------------------|----------------|------------------------|
|
||||
| TINYINT | 1字节整数 | 小范围数值 |
|
||||
| SMALLINT | 2字节整数 | 小范围数值 |
|
||||
| INT | 4字节整数 | 数量、等级 |
|
||||
| BIGINT | 8字节整数 | ID、计数 |
|
||||
| FLOAT | 4字节浮点 | 近似计算 |
|
||||
| DOUBLE | 8字节浮点 | 科学计算 |
|
||||
| DECIMAL(p,s) | 定点数 | 金额、精确数值 |
|
||||
| BOOLEAN | 布尔 | 状态标志 |
|
||||
| STRING | 变长字符串 | 名称、描述(最常用) |
|
||||
| VARCHAR(n) | 变长字符串 | 限定长度字符串 |
|
||||
| CHAR(n) | 定长字符串 | 固定长度编码 |
|
||||
| DATE | 日期 | 日期字段 |
|
||||
| TIMESTAMP | 时间戳 | 时间字段 |
|
||||
| BINARY | 二进制 | 二进制数据 |
|
||||
| ARRAY<type> | 数组 | 多值字段 |
|
||||
| MAP<k,v> | 映射 | 属性字典 |
|
||||
| STRUCT<f1:t1,...> | 结构体 | 嵌套结构 |
|
||||
*/
|
||||
|
||||
-- ============================================================================
|
||||
-- 建表规范说明
|
||||
-- ============================================================================
|
||||
/*
|
||||
1. 内部表 vs 外部表
|
||||
- 内部表:Hive 管理数据,DROP 删数据和元数据
|
||||
- 外部表:外部管理数据,DROP 只删元数据
|
||||
- 生产推荐:原始数据用外部表,加工结果用内部表
|
||||
|
||||
2. 存储格式选择
|
||||
- ORC(推荐):压缩好,列存储,支持谓词下推
|
||||
- PARQUET:跨平台兼容好,列存储
|
||||
- TEXTFILE:原始数据导入,性能最差
|
||||
|
||||
3. 分区设计
|
||||
- 按时间分区最常用(day_id, month_id)
|
||||
- 分区列不能出现在表定义的列中(Hive 特有)
|
||||
- 查询时分区列作为普通字段使用
|
||||
|
||||
4. 分桶设计
|
||||
- 选择高基数列做分桶列
|
||||
- 用于优化 JOIN(分桶列相同可做 map-side join)
|
||||
- 用于数据抽样(TABLESAMPLE)
|
||||
|
||||
5. 字段命名规范
|
||||
- snake_case 格式:user_id, total_amount
|
||||
- 主键:id 或 {业务}_id
|
||||
- 技术字段:etl_time, etl_remark
|
||||
- 分区字段:day_id, month_id, year_id
|
||||
|
||||
6. COMMENT 必须添加
|
||||
- 每个字段必须有 COMMENT
|
||||
- 表必须有 COMMENT
|
||||
*/
|
||||
@@ -0,0 +1,138 @@
|
||||
-- =====================================================================
|
||||
-- @Name: HIVE-D-SQL-{表名}-ETL
|
||||
-- @Version: 1.0
|
||||
-- @Desc: Hive ETL 数据处理模板(临时表链式处理)
|
||||
-- @TargetDatabase: Hive
|
||||
-- @说明: 和 Spark 类似,禁止 CTE,每步物化为临时表
|
||||
-- =====================================================================
|
||||
|
||||
-- ============================================================================
|
||||
-- Step01: 基础清洗与过滤
|
||||
-- ============================================================================
|
||||
-- 说明:从源表读取数据,进行基础过滤和清洗
|
||||
-- 输入:{源表名}
|
||||
-- 输出:tmp_xxx_01
|
||||
|
||||
DROP TABLE IF EXISTS ${db_tmp_env}.tmp_xxx_01;
|
||||
CREATE TABLE ${db_tmp_env}.tmp_xxx_01 AS
|
||||
SELECT
|
||||
id,
|
||||
name,
|
||||
amount,
|
||||
status,
|
||||
dept_id,
|
||||
category_id,
|
||||
created_at,
|
||||
day_id
|
||||
FROM db_name.source_table
|
||||
WHERE day_id = '${day_id}' -- 分区过滤(必须)
|
||||
AND status IN ('active', 'valid') -- 业务过滤
|
||||
AND amount > 0 -- 数据质量过滤
|
||||
AND id IS NOT NULL; -- NULL过滤
|
||||
|
||||
-- ============================================================================
|
||||
-- Step02: 多表关联与维度补全
|
||||
-- ============================================================================
|
||||
-- 说明:关联维度表,补全业务属性字段
|
||||
-- 输入:tmp_xxx_01, dim_department, dim_category
|
||||
-- 输出:tmp_xxx_02
|
||||
|
||||
DROP TABLE IF EXISTS ${db_tmp_env}.tmp_xxx_02;
|
||||
CREATE TABLE ${db_tmp_env}.tmp_xxx_02 AS
|
||||
SELECT
|
||||
a.id,
|
||||
a.name,
|
||||
a.amount,
|
||||
a.status,
|
||||
b.dept_name, -- 维度补全:部门名称
|
||||
c.category_name, -- 维度补全:类别名称
|
||||
a.created_at,
|
||||
a.day_id
|
||||
FROM ${db_tmp_env}.tmp_xxx_01 a
|
||||
LEFT JOIN db_name.dim_department b
|
||||
ON a.dept_id = b.dept_id
|
||||
AND b.day_id = '${day_id}' -- 维度表分区过滤
|
||||
LEFT JOIN db_name.dim_category c
|
||||
ON a.category_id = c.category_id
|
||||
AND c.day_id = '${day_id}'; -- 维度表分区过滤
|
||||
|
||||
-- ============================================================================
|
||||
-- Step03: 聚合计算与指标生成
|
||||
-- ============================================================================
|
||||
-- 说明:按业务维度聚合,计算统计指标
|
||||
-- 输入:tmp_xxx_02
|
||||
-- 输出:tmp_xxx_03
|
||||
|
||||
DROP TABLE IF EXISTS ${db_tmp_env}.tmp_xxx_03;
|
||||
CREATE TABLE ${db_tmp_env}.tmp_xxx_03 AS
|
||||
SELECT
|
||||
day_id,
|
||||
dept_name,
|
||||
category_name,
|
||||
COUNT(*) AS record_count, -- 记录数
|
||||
COUNT(DISTINCT id) AS unique_count, -- 唯一计数
|
||||
SUM(amount) AS total_amount, -- 总金额
|
||||
AVG(amount) AS avg_amount, -- 平均金额
|
||||
MAX(amount) AS max_amount, -- 最大金额
|
||||
MIN(amount) AS min_amount -- 最小金额
|
||||
FROM ${db_tmp_env}.tmp_xxx_02
|
||||
GROUP BY day_id, dept_name, category_name;
|
||||
|
||||
-- ============================================================================
|
||||
-- Step04: 最终输出写入目标表
|
||||
-- ============================================================================
|
||||
-- 说明:补全目标表标准字段,写入结果表
|
||||
-- 输入:tmp_xxx_03
|
||||
-- 输出:目标表
|
||||
|
||||
-- Overwrite the ${day_id} partition of the target table with the Step03
|
||||
-- aggregates (idempotent on re-run). The FROM clause is required: without it
|
||||
-- the column references in the SELECT list cannot be resolved.
|
||||
INSERT OVERWRITE TABLE ${db_eda_env}.target_table
|
||||
PARTITION (day_id = '${day_id}')
|
||||
SELECT
|
||||
    -- Business columns
|
||||
    dept_name,
|
||||
    category_name,
|
||||
    record_count,
|
||||
    unique_count,
|
||||
    total_amount,
|
||||
    avg_amount,
|
||||
    max_amount,
|
||||
    min_amount,
|
||||
|
||||
    -- Technical columns
|
||||
    current_timestamp() AS etl_time,          -- load timestamp
|
||||
    '${day_id}' AS stat_date                  -- statistics date
|
||||
FROM ${db_tmp_env}.tmp_xxx_03;
|
||||
|
||||
-- ============================================================================
|
||||
-- 关键规则说明
|
||||
-- ============================================================================
|
||||
/*
|
||||
1. 禁止使用 CTE (WITH 子句)
|
||||
- 每个步骤必须物化为临时表
|
||||
- 原因:Hive CTE 可能在某些版本有性能问题
|
||||
- 物化临时表便于调试和断点续跑
|
||||
|
||||
2. 先 DROP 再 CREATE
|
||||
- 每个临时表创建前必须先 DROP TABLE IF EXISTS
|
||||
- 原因:防止表已存在导致失败
|
||||
|
||||
3. 分区过滤必须前置
|
||||
- 所有源表和维度表查询必须带 day_id 过滤
|
||||
- 原因:避免全表扫描,提升性能
|
||||
|
||||
4. JOIN 条件下推
|
||||
- 维度表关联时带上分区过滤条件
|
||||
- 原因:减少关联数据量
|
||||
|
||||
5. 临时表命名规范
|
||||
- 格式:tmp_{业务简称}_{步骤序号}
|
||||
- 示例:tmp_order_stats_01, tmp_order_stats_02
|
||||
|
||||
6. 目标表写入规范
|
||||
- 使用 INSERT OVERWRITE(覆盖写入,幂等)
|
||||
- 明确指定分区
|
||||
- 补全技术字段(etl_time 等)
|
||||
|
||||
7. 存储格式建议
|
||||
- 临时表:默认格式即可(中间结果不需要优化存储)
|
||||
- 如需优化:STORED AS ORC
|
||||
*/
|
||||
@@ -0,0 +1,141 @@
|
||||
-- =====================================================================
|
||||
-- @Name: HIVE-D-SQL-{表名}-INSERT
|
||||
-- @Version: 1.0
|
||||
-- @Desc: Hive 数据插入模板
|
||||
-- @TargetDatabase: Hive
|
||||
-- =====================================================================
|
||||
|
||||
-- ============================================================================
|
||||
-- 场景1:分区表覆盖写入(最常用)
|
||||
-- ============================================================================
|
||||
-- 适用:每日/每周/每月增量写入分区表
|
||||
|
||||
INSERT OVERWRITE TABLE db_name.target_table
|
||||
PARTITION (day_id = '${day_id}')
|
||||
SELECT
|
||||
user_id,
|
||||
user_name,
|
||||
order_count,
|
||||
total_amount,
|
||||
current_timestamp() AS etl_time
|
||||
FROM db_name.source_table
|
||||
WHERE day_id = '${day_id}';
|
||||
|
||||
-- ============================================================================
|
||||
-- 场景2:动态分区写入
|
||||
-- ============================================================================
|
||||
-- 适用:数据中包含分区值,自动写入对应分区
|
||||
|
||||
-- 先启用动态分区
|
||||
SET hive.exec.dynamic.partition = true;
|
||||
SET hive.exec.dynamic.partition.mode = nonstrict;
|
||||
|
||||
INSERT OVERWRITE TABLE db_name.target_table
|
||||
PARTITION (day_id, region) -- 动态分区字段
|
||||
SELECT
|
||||
user_id,
|
||||
user_name,
|
||||
order_count,
|
||||
total_amount,
|
||||
current_timestamp() AS etl_time,
|
||||
day_id, -- 分区字段1(数据中包含)
|
||||
region -- 分区字段2(数据中包含)
|
||||
FROM db_name.source_table
|
||||
WHERE day_id BETWEEN '${start_day}' AND '${end_day}';
|
||||
|
||||
-- ============================================================================
|
||||
-- 场景3:追加写入
|
||||
-- ============================================================================
|
||||
-- 适用:日志表、流水表(允许追加)
|
||||
|
||||
INSERT INTO TABLE db_name.target_table
|
||||
SELECT
|
||||
field1,
|
||||
field2,
|
||||
field3,
|
||||
current_timestamp() AS etl_time
|
||||
FROM db_name.source_table
|
||||
WHERE day_id = '${day_id}';
|
||||
|
||||
-- ============================================================================
|
||||
-- 场景4:多分区插入(Multi-Insert)
|
||||
-- ============================================================================
|
||||
-- 适用:一次扫描,写入多个目标(提高效率)
|
||||
|
||||
FROM db_name.source_table
|
||||
INSERT OVERWRITE TABLE db_name.target_summary
|
||||
PARTITION (day_id = '${day_id}')
|
||||
SELECT
|
||||
department,
|
||||
COUNT(*) AS record_count,
|
||||
SUM(amount) AS total_amount
|
||||
WHERE day_id = '${day_id}'
|
||||
GROUP BY department
|
||||
|
||||
INSERT OVERWRITE TABLE db_name.target_detail
|
||||
PARTITION (day_id = '${day_id}')
|
||||
SELECT
|
||||
id,
|
||||
name,
|
||||
amount,
|
||||
department
|
||||
WHERE day_id = '${day_id}'
|
||||
AND amount > 1000;
|
||||
|
||||
-- ============================================================================
|
||||
-- 场景5:导出到文件
|
||||
-- ============================================================================
|
||||
|
||||
INSERT OVERWRITE DIRECTORY '/output/data/export/'
|
||||
ROW FORMAT DELIMITED
|
||||
FIELDS TERMINATED BY ','
|
||||
STORED AS TEXTFILE
|
||||
SELECT
|
||||
id,
|
||||
name,
|
||||
amount,
|
||||
day_id
|
||||
FROM db_name.target_table
|
||||
WHERE day_id = '${day_id}';
|
||||
|
||||
-- ============================================================================
|
||||
-- 场景6:CTAS(Create Table As Select)
|
||||
-- ============================================================================
|
||||
|
||||
-- 从查询结果创建新表
|
||||
CREATE TABLE db_name.new_table AS
|
||||
SELECT
|
||||
department,
|
||||
COUNT(*) AS employee_count,
|
||||
AVG(salary) AS avg_salary
|
||||
FROM db_name.employees
|
||||
WHERE day_id = '${day_id}'
|
||||
GROUP BY department;
|
||||
|
||||
-- ============================================================================
|
||||
-- 关键规则说明
|
||||
-- ============================================================================
|
||||
/*
|
||||
1. INSERT OVERWRITE vs INSERT INTO
|
||||
- INSERT OVERWRITE:覆盖分区/表数据(推荐,幂等)
|
||||
- INSERT INTO:追加数据(可能产生重复)
|
||||
|
||||
2. 分区表写入必须指定分区
|
||||
- 静态分区:PARTITION (day_id = '${day_id}')
|
||||
- 动态分区:需先 SET 配置,PARTITION (day_id)
|
||||
- 混合分区:PARTITION (day_id = '2026-05-01', region)
|
||||
|
||||
3. 动态分区配置
|
||||
SET hive.exec.dynamic.partition = true;
|
||||
SET hive.exec.dynamic.partition.mode = nonstrict; -- 允许全动态
|
||||
SET hive.exec.max.dynamic.partitions = 1000; -- 最大动态分区数
|
||||
|
||||
4. 字段顺序
|
||||
- SELECT 字段顺序必须与目标表列定义一致
|
||||
- 分区字段在 SELECT 最后(动态分区时)
|
||||
|
||||
5. 性能优化
|
||||
- 多分区插入(Multi-Insert):一次扫描多次写入
|
||||
- INSERT OVERWRITE 比 INSERT INTO 更安全(幂等性)
|
||||
- 大数据量写入时注意 reducer 数量设置
|
||||
*/
|
||||
@@ -0,0 +1,235 @@
|
||||
-- =====================================================================
|
||||
-- @Name: HIVE-D-SQL-{表名}-QUERY
|
||||
-- @Version: 1.0
|
||||
-- @Desc: Hive 查询模板
|
||||
-- @TargetDatabase: Hive
|
||||
-- =====================================================================
|
||||
|
||||
-- ============================================================================
|
||||
-- 1. 单表查询
|
||||
-- ============================================================================
|
||||
|
||||
SELECT
|
||||
id,
|
||||
name,
|
||||
amount,
|
||||
created_at
|
||||
FROM db_name.source_table
|
||||
WHERE day_id = '${day_id}' -- 分区过滤(必须)
|
||||
AND status = 'active'
|
||||
ORDER BY created_at DESC
|
||||
LIMIT 1000;
|
||||
|
||||
-- ============================================================================
|
||||
-- 2. JOIN 查询
|
||||
-- ============================================================================
|
||||
|
||||
-- 两表 JOIN
|
||||
SELECT
|
||||
a.id,
|
||||
a.name,
|
||||
a.amount,
|
||||
b.category_name
|
||||
FROM db_name.main_table a
|
||||
JOIN db_name.dim_table b ON a.category_id = b.id
|
||||
WHERE a.day_id = '${day_id}';
|
||||
|
||||
-- 多表 JOIN(带别名)
|
||||
SELECT
|
||||
o.order_id,
|
||||
u.user_name,
|
||||
p.product_name,
|
||||
oi.quantity,
|
||||
oi.unit_price
|
||||
FROM db_name.orders o
|
||||
JOIN db_name.users u ON o.user_id = u.id
|
||||
JOIN db_name.order_items oi ON o.order_id = oi.order_id
|
||||
JOIN db_name.products p ON oi.product_id = p.id
|
||||
WHERE o.day_id = '${day_id}'
|
||||
AND o.status IN ('completed', 'shipped');
|
||||
|
||||
-- ============================================================================
|
||||
-- 3. 聚合查询
|
||||
-- ============================================================================
|
||||
|
||||
-- GROUP BY + HAVING
|
||||
SELECT
|
||||
department,
|
||||
COUNT(*) AS employee_count,
|
||||
SUM(salary) AS total_salary,
|
||||
AVG(salary) AS avg_salary,
|
||||
MAX(salary) AS max_salary
|
||||
FROM db_name.employees
|
||||
WHERE day_id = '${day_id}'
|
||||
GROUP BY department
|
||||
HAVING COUNT(*) >= 5
|
||||
ORDER BY total_salary DESC;
|
||||
|
||||
-- Multi-column grouping with a distinct count.
|
||||
-- `date` is a reserved keyword in Hive, so the column name is back-quoted.
|
||||
SELECT
|
||||
    `date`,
|
||||
    region,
|
||||
    COUNT(*) AS order_count,
|
||||
    COUNT(DISTINCT user_id) AS unique_users,
|
||||
    SUM(amount) AS total_amount
|
||||
FROM db_name.orders
|
||||
WHERE day_id = '${day_id}'
|
||||
GROUP BY `date`, region;
|
||||
|
||||
-- ============================================================================
|
||||
-- 4. 窗口函数
|
||||
-- ============================================================================
|
||||
|
||||
-- ROW_NUMBER(分组取Top N)
|
||||
SELECT *
|
||||
FROM (
|
||||
SELECT
|
||||
department,
|
||||
name,
|
||||
salary,
|
||||
ROW_NUMBER() OVER (PARTITION BY department ORDER BY salary DESC) AS rn
|
||||
FROM db_name.employees
|
||||
WHERE day_id = '${day_id}'
|
||||
) t
|
||||
WHERE rn <= 3;
|
||||
|
||||
-- Running total and 7-row moving average.
|
||||
-- The window must span several days, so the partition filter is a date range;
|
||||
-- a single day_id would leave nothing to accumulate or average over.
|
||||
-- `date` is a reserved keyword in Hive, so the column name is back-quoted.
|
||||
SELECT
|
||||
    `date`,
|
||||
    amount,
|
||||
    SUM(amount) OVER (ORDER BY `date`) AS cumulative_amount,
|
||||
    AVG(amount) OVER (
|
||||
        ORDER BY `date`
|
||||
        ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
|
||||
    ) AS moving_avg_7d
|
||||
FROM daily_sales
|
||||
WHERE day_id BETWEEN '${start_day}' AND '${end_day}';
|
||||
|
||||
-- LAG/LEAD: day-over-day change.
|
||||
-- LAG needs the previous day's row in scope, so the partition filter is a
|
||||
-- date range rather than a single day_id.
|
||||
-- `date` is a reserved keyword in Hive, so the column name is back-quoted.
|
||||
SELECT
|
||||
    `date`,
|
||||
    amount,
|
||||
    LAG(amount, 1) OVER (ORDER BY `date`) AS prev_amount,
|
||||
    amount - LAG(amount, 1) OVER (ORDER BY `date`) AS daily_change
|
||||
FROM daily_sales
|
||||
WHERE day_id BETWEEN '${start_day}' AND '${end_day}';
|
||||
|
||||
-- ============================================================================
|
||||
-- 5. 子查询
|
||||
-- ============================================================================
|
||||
|
||||
-- IN 子查询
|
||||
SELECT *
|
||||
FROM db_name.orders
|
||||
WHERE user_id IN (
|
||||
SELECT id FROM db_name.users WHERE vip_level >= 3
|
||||
)
|
||||
AND day_id = '${day_id}';
|
||||
|
||||
-- EXISTS 子查询
|
||||
SELECT *
|
||||
FROM db_name.products p
|
||||
WHERE EXISTS (
|
||||
SELECT 1 FROM db_name.inventory i
|
||||
WHERE i.product_id = p.id
|
||||
AND i.quantity > 0
|
||||
)
|
||||
AND p.day_id = '${day_id}';
|
||||
|
||||
-- ============================================================================
|
||||
-- 6. 条件聚合(CASE WHEN + 聚合)
|
||||
-- ============================================================================
|
||||
|
||||
-- Conditional aggregation: per-date counts and amounts split by CASE WHEN.
|
||||
-- `date` is a reserved keyword in Hive, so the column name is back-quoted.
|
||||
SELECT
|
||||
    `date`,
|
||||
    COUNT(*) AS total_orders,
|
||||
    SUM(CASE WHEN status = 'completed' THEN 1 ELSE 0 END) AS completed_count,
|
||||
    SUM(CASE WHEN status = 'cancelled' THEN 1 ELSE 0 END) AS cancelled_count,
|
||||
    SUM(CASE WHEN amount > 1000 THEN amount ELSE 0 END) AS high_value_amount
|
||||
FROM db_name.orders
|
||||
WHERE day_id = '${day_id}'
|
||||
GROUP BY `date`;
|
||||
|
||||
-- ============================================================================
|
||||
-- 7. LATERAL VIEW + explode(Hive 特有)
|
||||
-- ============================================================================
|
||||
|
||||
-- 展开数组字段
|
||||
SELECT
|
||||
id,
|
||||
name,
|
||||
tag
|
||||
FROM db_name.articles
|
||||
LATERAL VIEW explode(tags) t AS tag
|
||||
WHERE day_id = '${day_id}';
|
||||
|
||||
-- 展开数组并统计
|
||||
SELECT
|
||||
tag,
|
||||
COUNT(*) AS article_count
|
||||
FROM db_name.articles
|
||||
LATERAL VIEW explode(tags) t AS tag
|
||||
WHERE day_id = '${day_id}'
|
||||
GROUP BY tag;
|
||||
|
||||
-- 展开 Map
|
||||
SELECT
|
||||
id,
|
||||
map_key,
|
||||
map_value
|
||||
FROM db_name.data_table
|
||||
LATERAL VIEW explode(props) m AS map_key, map_value
|
||||
WHERE day_id = '${day_id}';
|
||||
|
||||
-- posexplode(带索引展开)
|
||||
SELECT
|
||||
id,
|
||||
pos,
|
||||
tag
|
||||
FROM db_name.articles
|
||||
LATERAL VIEW posexplode(tags) t AS pos, tag
|
||||
WHERE day_id = '${day_id}';
|
||||
|
||||
-- ============================================================================
|
||||
-- 8. 复杂类型查询
|
||||
-- ============================================================================
|
||||
|
||||
-- ARRAY 操作
|
||||
SELECT
|
||||
id,
|
||||
size(tags) AS tag_count, -- 数组长度
|
||||
array_contains(tags, '大数据') AS has_tag, -- 包含判断
|
||||
tags[0] AS first_tag -- 取第一个元素
|
||||
FROM db_name.articles
|
||||
WHERE day_id = '${day_id}';
|
||||
|
||||
-- MAP 操作
|
||||
SELECT
|
||||
id,
|
||||
props['city'] AS city, -- 取值
|
||||
map_keys(props) AS all_keys, -- 所有 key
|
||||
map_values(props) AS all_values -- 所有 value
|
||||
FROM db_name.user_table
|
||||
WHERE day_id = '${day_id}';
|
||||
|
||||
-- STRUCT 操作
|
||||
SELECT
|
||||
id,
|
||||
user_info.name AS user_name, -- 结构体字段访问
|
||||
user_info.age AS user_age
|
||||
FROM db_name.data_table
|
||||
WHERE day_id = '${day_id}';
|
||||
|
||||
-- ============================================================================
|
||||
-- 9. 集合聚合
|
||||
-- ============================================================================
|
||||
|
||||
-- collect_list / collect_set
|
||||
SELECT
|
||||
department,
|
||||
collect_list(name) AS all_names, -- 收集为数组(不去重)
|
||||
collect_set(name) AS unique_names, -- 收集为数组(去重)
|
||||
size(collect_set(name)) AS unique_count
|
||||
FROM db_name.employees
|
||||
WHERE day_id = '${day_id}'
|
||||
GROUP BY department;
|
||||
@@ -0,0 +1,211 @@
|
||||
-- =====================================================================
|
||||
-- @Name: KUDU-D-SQL-{表名}-CREATE
|
||||
-- @Version: 1.0
|
||||
-- @Desc: Kudu (via Impala) 建表模板
|
||||
-- @TargetDatabase: Apache Kudu (via Impala)
|
||||
-- @说明: Kudu 通过 Impala 访问,使用 Impala DDL 操作 Kudu 表
|
||||
-- =====================================================================
|
||||
|
||||
-- ============================================================================
|
||||
-- 场景1:基础表创建(Hash 分区)
|
||||
-- ============================================================================
|
||||
-- 适用:按主键 Hash 分布数据,写入和点查性能好
|
||||
|
||||
-- Basic Kudu table, hash-partitioned on its primary key.
|
||||
-- In Impala DDL the PRIMARY KEY clause belongs inside the column list.
|
||||
CREATE TABLE IF NOT EXISTS db_name.kudu_basic (
|
||||
    -- Primary key column (every Kudu table needs one; key columns are listed first)
|
||||
    id          BIGINT NOT NULL COMMENT '主键ID',
|
||||
|
||||
    -- Business columns
|
||||
    name        STRING COMMENT '名称',
|
||||
    category    STRING COMMENT '类别',
|
||||
    amount      DECIMAL(18,2) COMMENT '金额',
|
||||
    status      STRING COMMENT '状态',
|
||||
    created_at  TIMESTAMP COMMENT '创建时间',
|
||||
    updated_at  TIMESTAMP COMMENT '更新时间',
|
||||
|
||||
    PRIMARY KEY (id)
|
||||
)
|
||||
PARTITION BY HASH(id) PARTITIONS 8
|
||||
STORED AS KUDU
|
||||
TBLPROPERTIES (
|
||||
    'kudu.num_tablet_replicas' = '3'
|
||||
);
|
||||
|
||||
-- ============================================================================
|
||||
-- 场景2:Hash + Range 组合分区
|
||||
-- ============================================================================
|
||||
-- 适用:按时间范围 + Hash 组合,兼顾范围查询和写入性能
|
||||
|
||||
-- Hash + range partitioned Kudu table.
|
||||
-- PRIMARY KEY is declared inside the column list (required by Impala DDL);
|
||||
-- both partition columns (id, stat_date) are part of the primary key.
|
||||
CREATE TABLE IF NOT EXISTS db_name.kudu_range_hash (
|
||||
    -- Primary key columns (must include every partition column)
|
||||
    id           BIGINT NOT NULL COMMENT '主键ID',
|
||||
    stat_date    STRING NOT NULL COMMENT '统计日期 yyyy-MM-dd',
|
||||
|
||||
    -- Business columns
|
||||
    department   STRING COMMENT '部门',
|
||||
    -- Compression is a per-column attribute in Kudu, not a table property
|
||||
    metric_name  STRING COMPRESSION LZ4 COMMENT '指标名称',
|
||||
    metric_value DECIMAL(18,2) COMMENT '指标值',
|
||||
    etl_time     TIMESTAMP COMMENT '加工时间',
|
||||
|
||||
    PRIMARY KEY (id, stat_date)
|
||||
)
|
||||
PARTITION BY
|
||||
    HASH(id) PARTITIONS 4,
|
||||
    RANGE(stat_date) (
|
||||
        PARTITION '2026-01-01' <= VALUES < '2026-02-01',
|
||||
        PARTITION '2026-02-01' <= VALUES < '2026-03-01',
|
||||
        PARTITION '2026-03-01' <= VALUES < '2026-04-01',
|
||||
        PARTITION '2026-04-01' <= VALUES < '2026-05-01',
|
||||
        PARTITION '2026-05-01' <= VALUES < '2026-06-01'
|
||||
    )
|
||||
STORED AS KUDU
|
||||
TBLPROPERTIES (
|
||||
    'kudu.num_tablet_replicas' = '3'
|
||||
);
|
||||
|
||||
-- ============================================================================
|
||||
-- 场景3:多列主键
|
||||
-- ============================================================================
|
||||
|
||||
-- Kudu table with a three-column composite primary key.
|
||||
-- PRIMARY KEY is declared inside the column list (required by Impala DDL);
|
||||
-- the key columns are listed first and are all NOT NULL.
|
||||
CREATE TABLE IF NOT EXISTS db_name.kudu_composite_pk (
|
||||
    user_id      BIGINT NOT NULL COMMENT '用户ID',
|
||||
    order_date   STRING NOT NULL COMMENT '订单日期',
|
||||
    order_seq    INT NOT NULL COMMENT '当日订单序号',
|
||||
|
||||
    user_name    STRING COMMENT '用户姓名',
|
||||
    product_name STRING COMMENT '商品名称',
|
||||
    quantity     INT COMMENT '数量',
|
||||
    total_amount DECIMAL(18,2) COMMENT '总金额',
|
||||
    status       STRING COMMENT '状态',
|
||||
    create_time  TIMESTAMP COMMENT '创建时间',
|
||||
|
||||
    PRIMARY KEY (user_id, order_date, order_seq)
|
||||
)
|
||||
PARTITION BY
|
||||
    HASH(user_id) PARTITIONS 8,
|
||||
    RANGE(order_date) (
|
||||
        PARTITION '2026-01-01' <= VALUES < '2026-02-01',
|
||||
        PARTITION '2026-02-01' <= VALUES < '2026-03-01',
|
||||
        PARTITION '2026-03-01' <= VALUES < '2026-04-01'
|
||||
    )
|
||||
STORED AS KUDU
|
||||
TBLPROPERTIES (
|
||||
    'kudu.num_tablet_replicas' = '3'
|
||||
);
|
||||
|
||||
-- ============================================================================
|
||||
-- 场景4:纯 Range 分区
|
||||
-- ============================================================================
|
||||
-- 适用:按时间顺序写入,范围查询多
|
||||
|
||||
-- Range-only partitioned Kudu table (quarterly ranges on stat_date).
|
||||
-- PRIMARY KEY is declared inside the column list (required by Impala DDL).
|
||||
CREATE TABLE IF NOT EXISTS db_name.kudu_range_only (
|
||||
    id           BIGINT NOT NULL COMMENT '主键ID',
|
||||
    stat_date    STRING NOT NULL COMMENT '统计日期',
|
||||
    metric_value DECIMAL(18,2) COMMENT '指标值',
|
||||
    dimension    STRING COMMENT '维度',
|
||||
    etl_time     TIMESTAMP COMMENT '加工时间',
|
||||
|
||||
    PRIMARY KEY (id, stat_date)
|
||||
)
|
||||
PARTITION BY RANGE(stat_date) (
|
||||
    PARTITION '2026-01-01' <= VALUES < '2026-04-01',
|
||||
    PARTITION '2026-04-01' <= VALUES < '2026-07-01',
|
||||
    PARTITION '2026-07-01' <= VALUES < '2026-10-01',
|
||||
    PARTITION '2026-10-01' <= VALUES < '2027-01-01'
|
||||
)
|
||||
STORED AS KUDU
|
||||
TBLPROPERTIES (
|
||||
    'kudu.num_tablet_replicas' = '3'
|
||||
);
|
||||
|
||||
-- ============================================================================
|
||||
-- 场景5:外部表映射已有 Kudu 表
|
||||
-- ============================================================================
|
||||
|
||||
CREATE EXTERNAL TABLE IF NOT EXISTS db_name.kudu_external
|
||||
STORED AS KUDU
|
||||
TBLPROPERTIES (
|
||||
'kudu.master_addresses' = 'kudu-master-1:7051,kudu-master-2:7051,kudu-master-3:7051',
|
||||
'kudu.table_name' = 'impala.db_name.existing_table'
|
||||
);
|
||||
|
||||
-- ============================================================================
|
||||
-- 场景6:带压缩和副本配置
|
||||
-- ============================================================================
|
||||
|
||||
-- Kudu table with explicit compression and replica settings.
|
||||
-- PRIMARY KEY is declared inside the column list (required by Impala DDL).
|
||||
-- Compression is set per column with the COMPRESSION attribute; the replica
|
||||
-- count is the only setting here that is a table property.
|
||||
-- NOTE(review): the former 'kudu.encryption' property was dropped because it
|
||||
-- is not a recognised Kudu table property; confirm how encryption is
|
||||
-- configured for the target cluster.
|
||||
CREATE TABLE IF NOT EXISTS db_name.kudu_with_props (
|
||||
    id        BIGINT NOT NULL COMMENT '主键ID',
|
||||
    data_date STRING NOT NULL COMMENT '数据日期',
|
||||
    content   STRING COMPRESSION LZ4 COMMENT '内容',
|
||||
    value     DOUBLE COMMENT '数值',
|
||||
|
||||
    PRIMARY KEY (id, data_date)
|
||||
)
|
||||
PARTITION BY
|
||||
    HASH(id) PARTITIONS 8,
|
||||
    RANGE(data_date) (
|
||||
        PARTITION '2026-01-01' <= VALUES < '2026-02-01',
|
||||
        PARTITION '2026-02-01' <= VALUES < '2026-03-01'
|
||||
    )
|
||||
STORED AS KUDU
|
||||
TBLPROPERTIES (
|
||||
    'kudu.num_tablet_replicas' = '3'
|
||||
);
|
||||
|
||||
-- ============================================================================
|
||||
-- 字段类型速查(Kudu 支持的类型)
|
||||
-- ============================================================================
|
||||
/*
|
||||
| 类型 | 说明 | 适用场景 |
|
||||
|---------------|----------------|------------------------|
|
||||
| BOOLEAN | 布尔 | 状态标志 |
|
||||
| TINYINT | 1字节整数 | 小范围枚举 |
|
||||
| SMALLINT | 2字节整数 | 小范围数值 |
|
||||
| INT | 4字节整数 | 数量、等级 |
|
||||
| BIGINT | 8字节整数 | ID、计数 |
|
||||
| FLOAT | 4字节浮点 | 近似计算 |
|
||||
| DOUBLE | 8字节浮点 | 科学计算 |
|
||||
| DECIMAL(p,s) | 定点数 | 金额、精确数值 |
|
||||
| STRING | 变长字符串 | 名称、描述 |
|
||||
| VARCHAR(n) | 变长字符串 | 限定长度字符串 |
|
||||
| CHAR(n) | 定长字符串 | 固定长度编码 |
|
||||
| TIMESTAMP | 时间戳 | 时间字段(微秒精度) |
|
||||
| DATE | 日期 | 日期字段 |
|
||||
| BINARY | 二进制 | 二进制数据 |
|
||||
|
||||
注意:Kudu 不支持 ARRAY, MAP, STRUCT 等复杂类型
|
||||
*/
|
||||
|
||||
-- ============================================================================
|
||||
-- 建表规范说明
|
||||
-- ============================================================================
|
||||
/*
|
||||
1. 主键约束(Kudu 特有)
|
||||
- 每张 Kudu 表必须有 PRIMARY KEY
|
||||
- 主键列不能为 NULL(必须 NOT NULL)
|
||||
- 主键值不可 UPDATE(只能删除后重新插入)
|
||||
   - 分区列必须是主键列的子集(即分区列必须包含在主键列中)
|
||||
|
||||
2. 分区策略
|
||||
- Hash 分区:均匀分布,适合写入和点查
|
||||
- Range 分区:按范围查询,适合时间序列
|
||||
- Hash + Range 组合:兼顾两者优势(推荐)
|
||||
- 分区数 = tablet 数量,影响并行度
|
||||
|
||||
3. 分区设计建议
|
||||
- Hash 分区数:建议 4 的倍数,参考数据量
|
||||
- Range 分区:按时间维度,定期添加新分区
|
||||
- 单个 tablet 建议 1GB~10GB
|
||||
|
||||
4. 副本数
|
||||
- 生产环境建议 3 副本(默认)
|
||||
- Raft 协议保证一致性
|
||||
|
||||
5. 压缩
|
||||
- 推荐 LZ4(速度和压缩比平衡)
|
||||
- 可选:SNAPPY, ZLIB, LZ4
|
||||
|
||||
6. 与 Hive/Spark 表的区别
|
||||
- Kudu 表支持 UPDATE 和 DELETE
|
||||
- Kudu 表不支持 INSERT OVERWRITE
|
||||
- Kudu 表不支持复杂类型(ARRAY, MAP, STRUCT)
|
||||
- Kudu 表主键有约束,Hive/Spark 无约束
|
||||
*/
|
||||
@@ -0,0 +1,146 @@
|
||||
-- =====================================================================
|
||||
-- @Name: KUDU-D-SQL-{表名}-ETL
|
||||
-- @Version: 2.0
|
||||
-- @Desc: Kudu (via Impala) ETL 数据处理模板(临时表链式处理)
|
||||
-- @TargetDatabase: Apache Kudu (via Impala)
|
||||
-- @说明: 统一规范:禁止 CTE,每步物化为临时表,先 DROP 再 CREATE
|
||||
-- 最后一步用 UPSERT INTO 写入 Kudu 目标表
|
||||
-- =====================================================================
|
||||
|
||||
-- ============================================================================
|
||||
-- Step01: 基础清洗与过滤
|
||||
-- ============================================================================
|
||||
-- 说明:从源表读取数据,进行基础过滤和清洗
|
||||
-- 输入:{源表名}
|
||||
-- 输出:${db_tmp_env}.tmp_xxx_01
|
||||
|
||||
DROP TABLE IF EXISTS ${db_tmp_env}.tmp_xxx_01;
|
||||
CREATE TABLE ${db_tmp_env}.tmp_xxx_01 AS
|
||||
SELECT
|
||||
order_id,
|
||||
user_id,
|
||||
dept_id,
|
||||
product_id,
|
||||
quantity,
|
||||
amount,
|
||||
status,
|
||||
stat_date
|
||||
FROM db_name.source_table
|
||||
WHERE stat_date = '${day_id}'
|
||||
AND status IN ('completed', 'shipped') -- 业务过滤
|
||||
AND amount > 0 -- 数据质量过滤
|
||||
AND user_id IS NOT NULL; -- NULL过滤
|
||||
|
||||
-- ============================================================================
|
||||
-- Step02: 多表关联与维度补全
|
||||
-- ============================================================================
|
||||
-- 说明:关联维度表,补全业务属性字段
|
||||
-- 输入:${db_tmp_env}.tmp_xxx_01, dim_department, dim_product
|
||||
-- 输出:${db_tmp_env}.tmp_xxx_02
|
||||
|
||||
-- Step02: enrich the cleaned rows with department and category names.
|
||||
-- The LEFT JOINs keep rows that have no dimension match; their names are
|
||||
-- replaced with a placeholder because dept_name and category_name are later
|
||||
-- written as primary-key columns of the Kudu target, which cannot be NULL.
|
||||
DROP TABLE IF EXISTS ${db_tmp_env}.tmp_xxx_02;
|
||||
CREATE TABLE ${db_tmp_env}.tmp_xxx_02 AS
|
||||
SELECT
|
||||
    a.order_id,
|
||||
    a.user_id,
|
||||
    a.amount,
|
||||
    a.quantity,
|
||||
    COALESCE(b.dept_name, '未知') AS dept_name,          -- dimension: department name
|
||||
    COALESCE(c.category_name, '未知') AS category_name,  -- dimension: category name
|
||||
    a.stat_date
|
||||
FROM ${db_tmp_env}.tmp_xxx_01 a
|
||||
LEFT JOIN db_name.dim_department b
|
||||
    ON a.dept_id = b.dept_id
|
||||
LEFT JOIN db_name.dim_product c
|
||||
    ON a.product_id = c.product_id;
|
||||
|
||||
-- ============================================================================
|
||||
-- Step03: 聚合计算与指标生成
|
||||
-- ============================================================================
|
||||
-- 说明:按业务维度聚合,计算统计指标
|
||||
-- 输入:${db_tmp_env}.tmp_xxx_02
|
||||
-- 输出:${db_tmp_env}.tmp_xxx_03
|
||||
|
||||
DROP TABLE IF EXISTS ${db_tmp_env}.tmp_xxx_03;
|
||||
CREATE TABLE ${db_tmp_env}.tmp_xxx_03 AS
|
||||
SELECT
|
||||
stat_date,
|
||||
dept_name,
|
||||
category_name,
|
||||
COUNT(*) AS record_count, -- 记录数
|
||||
COUNT(DISTINCT user_id) AS unique_users, -- 去重用户数
|
||||
SUM(amount) AS total_amount, -- 总金额
|
||||
SUM(quantity) AS total_quantity, -- 总数量
|
||||
AVG(amount) AS avg_amount, -- 平均金额
|
||||
MAX(amount) AS max_amount -- 最大金额
|
||||
FROM ${db_tmp_env}.tmp_xxx_02
|
||||
GROUP BY stat_date, dept_name, category_name;
|
||||
|
||||
-- ============================================================================
|
||||
-- Step04: 最终输出写入 Kudu 目标表
|
||||
-- ============================================================================
|
||||
-- 说明:使用 UPSERT 写入 Kudu 目标表
|
||||
-- 输入:${db_tmp_env}.tmp_xxx_03
|
||||
-- 输出:Kudu 目标表
|
||||
|
||||
-- 方式1:UPSERT(推荐,主键存在则更新,不存在则插入)
|
||||
UPSERT INTO ${db_eda_env}.target_table
|
||||
SELECT
|
||||
-- 主键字段(Kudu 表必须有主键)
|
||||
dept_name,
|
||||
category_name,
|
||||
stat_date,
|
||||
|
||||
-- 指标字段
|
||||
record_count,
|
||||
unique_users,
|
||||
total_amount,
|
||||
total_quantity,
|
||||
avg_amount,
|
||||
max_amount,
|
||||
|
||||
-- 技术字段
|
||||
NOW() AS etl_time -- 数据加工时间
|
||||
FROM ${db_tmp_env}.tmp_xxx_03;
|
||||
|
||||
-- 方式2:需要全量刷新时(先删后插)
|
||||
-- DELETE FROM ${db_eda_env}.target_table WHERE stat_date = '${day_id}';
|
||||
-- INSERT INTO ${db_eda_env}.target_table
|
||||
-- SELECT ... FROM ${db_tmp_env}.tmp_xxx_03;
|
||||
|
||||
-- ============================================================================
|
||||
-- 关键规则说明
|
||||
-- ============================================================================
|
||||
/*
|
||||
1. 禁止使用 CTE (WITH 子句)
|
||||
- 每个步骤必须物化为临时表
|
||||
- 原因:便于调试、断点续跑、统一编码规范
|
||||
|
||||
2. 先 DROP 再 CREATE
|
||||
- 每个临时表创建前必须先 DROP TABLE IF EXISTS
|
||||
- 原因:防止表已存在导致失败
|
||||
|
||||
3. Kudu 写入方式
|
||||
- 推荐 UPSERT INTO(Kudu 核心优势)
|
||||
- 主键存在 → 更新(整行替换)
|
||||
- 主键不存在 → 插入新行
|
||||
- 需要全量刷新 → 先 DELETE 再 INSERT
|
||||
|
||||
4. Kudu 表约束
|
||||
- 不支持 INSERT OVERWRITE(用 UPSERT 或 DELETE + INSERT 替代)
|
||||
- 必须有 PRIMARY KEY
|
||||
- 主键列不能为 NULL
|
||||
- 支持 UPDATE 和 DELETE
|
||||
|
||||
5. 过滤条件前置
|
||||
- 所有过滤在最早阶段应用
|
||||
- 减少中间数据量
|
||||
|
||||
6. 临时表命名规范
|
||||
- 格式:tmp_{业务简称}_{步骤序号}
|
||||
|
||||
7. Kudu 特有注意事项
|
||||
   - Impala 的 CONCAT 支持多个参数;需要按分隔符拼接时用 CONCAT_WS
|
||||
- 不支持 collect_list/collect_set,用 GROUP_CONCAT 替代
|
||||
- 近似去重用 NDV() 函数
|
||||
*/
|
||||
@@ -0,0 +1,160 @@
|
||||
-- =====================================================================
|
||||
-- @Name: KUDU-D-SQL-{表名}-INSERT
|
||||
-- @Version: 1.0
|
||||
-- @Desc: Kudu (via Impala) 数据插入模板
|
||||
-- @TargetDatabase: Apache Kudu (via Impala)
|
||||
-- @说明: Kudu 表不支持 INSERT OVERWRITE,支持 INSERT INTO 和 UPSERT
|
||||
-- =====================================================================
|
||||
|
||||
-- ============================================================================
|
||||
-- 场景1:INSERT INTO(追加写入)
|
||||
-- ============================================================================
|
||||
-- 适用:向 Kudu 表追加新数据
|
||||
|
||||
INSERT INTO db_name.kudu_table
|
||||
SELECT
|
||||
id,
|
||||
stat_date,
|
||||
name,
|
||||
department,
|
||||
amount,
|
||||
current_timestamp() AS etl_time
|
||||
FROM db_name.source_table
|
||||
WHERE stat_date = '${day_id}';
|
||||
|
||||
-- ============================================================================
|
||||
-- 场景2:UPSERT INTO(更新插入,Kudu 特有)
|
||||
-- ============================================================================
|
||||
-- 适用:如果主键存在则更新,不存在则插入
|
||||
-- 这是 Kudu 的核心优势,其他 Hive/Spark 表不支持
|
||||
|
||||
-- 基础 UPSERT
|
||||
UPSERT INTO db_name.kudu_table
|
||||
SELECT
|
||||
id,
|
||||
stat_date,
|
||||
name,
|
||||
department,
|
||||
amount,
|
||||
current_timestamp() AS etl_time
|
||||
FROM db_name.staging_table
|
||||
WHERE stat_date = '${day_id}';
|
||||
|
||||
-- 聚合后 UPSERT(增量更新指标表)
|
||||
UPSERT INTO db_name.kudu_metrics
|
||||
SELECT
|
||||
department,
|
||||
'${day_id}' AS stat_date,
|
||||
COUNT(*) AS order_count,
|
||||
SUM(amount) AS total_amount,
|
||||
current_timestamp() AS etl_time
|
||||
FROM db_name.incremental_orders
|
||||
WHERE stat_date = '${day_id}'
|
||||
GROUP BY department;
|
||||
|
||||
-- ============================================================================
|
||||
-- 场景3:UPDATE(Kudu 表特有)
|
||||
-- ============================================================================
|
||||
-- 适用:修改已有数据
|
||||
-- 注意:主键列不能被 UPDATE
|
||||
|
||||
-- 单条更新
|
||||
UPDATE db_name.kudu_table
|
||||
SET status = 'processed',
|
||||
updated_at = current_timestamp()
|
||||
WHERE id = 12345;
|
||||
|
||||
-- 批量条件更新
|
||||
UPDATE db_name.kudu_table
|
||||
SET status = 'expired',
|
||||
updated_at = current_timestamp()
|
||||
WHERE stat_date < '2026-01-01'
|
||||
AND status = 'active';
|
||||
|
||||
-- Join-based update: rename departments using a mapping table.
|
||||
-- Impala's multi-table UPDATE names the target by the alias defined in the
|
||||
-- FROM clause, and the Kudu target table itself must appear in that FROM.
|
||||
UPDATE t
|
||||
SET department = d.new_dept_name
|
||||
FROM db_name.kudu_table t
|
||||
JOIN db_name.dept_mapping d
|
||||
    ON t.department = d.old_dept_name;
|
||||
|
||||
-- ============================================================================
|
||||
-- 场景4:DELETE(Kudu 表特有)
|
||||
-- ============================================================================
|
||||
-- 适用:删除数据
|
||||
-- 注意:Kudu 的 DELETE 比 Hive/Spark 方便得多
|
||||
|
||||
-- 条件删除
|
||||
DELETE FROM db_name.kudu_table
|
||||
WHERE stat_date < '2026-01-01';
|
||||
|
||||
-- 按主键删除
|
||||
DELETE FROM db_name.kudu_table
|
||||
WHERE id IN (1001, 1002, 1003);
|
||||
|
||||
-- 关联删除(用子查询)
|
||||
DELETE FROM db_name.kudu_table
|
||||
WHERE user_id IN (
|
||||
SELECT user_id FROM db_name.blacklist
|
||||
);
|
||||
|
||||
-- ============================================================================
|
||||
-- 场景5:从查询结果写入
|
||||
-- ============================================================================
|
||||
|
||||
-- 简单 ETL:清洗后写入
|
||||
INSERT INTO db_name.kudu_target
|
||||
SELECT
|
||||
id,
|
||||
'${day_id}' AS stat_date,
|
||||
name,
|
||||
COALESCE(department, '未知') AS department,
|
||||
amount,
|
||||
current_timestamp() AS etl_time
|
||||
FROM db_name.raw_data
|
||||
WHERE stat_date = '${day_id}'
|
||||
AND id IS NOT NULL
|
||||
AND amount > 0;
|
||||
|
||||
-- ============================================================================
|
||||
-- 场景6:批量 VALUES 写入
|
||||
-- ============================================================================
|
||||
|
||||
INSERT INTO db_name.kudu_table (id, stat_date, name, amount)
|
||||
VALUES
|
||||
(1, '2026-05-01', '测试1', 100.00),
|
||||
(2, '2026-05-01', '测试2', 200.00),
|
||||
(3, '2026-05-01', '测试3', 300.00);
|
||||
|
||||
-- ============================================================================
|
||||
-- 关键规则说明
|
||||
-- ============================================================================
|
||||
/*
|
||||
1. Kudu 表与 Hive/Spark 表的核心区别
|
||||
- 支持 INSERT INTO:✅
|
||||
- 支持 INSERT OVERWRITE:❌(不支持!)
|
||||
- 支持 UPSERT:✅(Kudu 独有,核心能力)
|
||||
- 支持 UPDATE:✅(Kudu 独有)
|
||||
- 支持 DELETE:✅(Kudu 独有)
|
||||
|
||||
2. UPSERT 是 Kudu 的核心优势
|
||||
- 主键存在 → 更新(整行替换)
|
||||
- 主键不存在 → 插入新行
|
||||
- 适用于:增量更新、数据修正、指标回填
|
||||
|
||||
3. INSERT INTO 注意事项
|
||||
   - 主键冲突的行会被丢弃并产生警告(语句本身不会失败,也不会更新已有行)
|
||||
- 需要确保写入数据的主键不重复,或使用 UPSERT
|
||||
|
||||
4. UPDATE 限制
|
||||
- 主键列不能被 UPDATE
|
||||
- WHERE 条件建议包含主键或分区列(性能)
|
||||
|
||||
5. DELETE 建议
|
||||
- 删除大量数据时按分区范围删除
|
||||
- 定期清理历史数据
|
||||
|
||||
6. 性能建议
|
||||
- 批量写入优于逐条写入
|
||||
- UPSERT 比 DELETE + INSERT 更高效
|
||||
- 利用主键做点查,避免全表扫描
|
||||
*/
|
||||
@@ -0,0 +1,179 @@
|
||||
-- =====================================================================
|
||||
-- @Name: KUDU-D-SQL-{表名}-QUERY
|
||||
-- @Version: 1.0
|
||||
-- @Desc: Kudu (via Impala) 查询模板
|
||||
-- @TargetDatabase: Apache Kudu (via Impala)
|
||||
-- =====================================================================
|
||||
|
||||
-- ============================================================================
|
||||
-- 1. 单表查询
|
||||
-- ============================================================================
|
||||
|
||||
SELECT
|
||||
id,
|
||||
name,
|
||||
amount,
|
||||
created_at
|
||||
FROM db_name.kudu_table
|
||||
WHERE stat_date = '${day_id}'
|
||||
AND status = 'active'
|
||||
ORDER BY created_at DESC
|
||||
LIMIT 1000;
|
||||
|
||||
-- ============================================================================
|
||||
-- 2. JOIN 查询
|
||||
-- ============================================================================
|
||||
|
||||
-- 两表 JOIN(Kudu 表 JOIN 非 Kudu 表也支持)
|
||||
SELECT
|
||||
k.id,
|
||||
k.name,
|
||||
k.amount,
|
||||
d.dept_name
|
||||
FROM db_name.kudu_table k
|
||||
JOIN db_name.dim_department d ON k.dept_id = d.dept_id
|
||||
WHERE k.stat_date = '${day_id}';
|
||||
|
||||
-- 多表 JOIN
|
||||
SELECT
|
||||
k.id,
|
||||
k.user_name,
|
||||
p.product_name,
|
||||
k.quantity,
|
||||
k.total_amount
|
||||
FROM db_name.kudu_orders k
|
||||
JOIN db_name.dim_users u ON k.user_id = u.user_id
|
||||
JOIN db_name.dim_products p ON k.product_id = p.product_id
|
||||
WHERE k.stat_date BETWEEN '${start_date}' AND '${end_date}';
|
||||
|
||||
-- ============================================================================
|
||||
-- 3. 聚合查询
|
||||
-- ============================================================================
|
||||
|
||||
SELECT
|
||||
department,
|
||||
COUNT(*) AS record_count,
|
||||
SUM(amount) AS total_amount,
|
||||
AVG(amount) AS avg_amount,
|
||||
MAX(amount) AS max_amount,
|
||||
MIN(amount) AS min_amount
|
||||
FROM db_name.kudu_table
|
||||
WHERE stat_date = '${day_id}'
|
||||
GROUP BY department
|
||||
HAVING COUNT(*) >= 5
|
||||
ORDER BY total_amount DESC;
|
||||
|
||||
-- 多字段分组 + 去重计数
|
||||
SELECT
|
||||
stat_date,
|
||||
region,
|
||||
COUNT(*) AS order_count,
|
||||
COUNT(DISTINCT user_id) AS unique_users,
|
||||
SUM(amount) AS total_amount
|
||||
FROM db_name.kudu_table
|
||||
WHERE stat_date BETWEEN '${start_date}' AND '${end_date}'
|
||||
GROUP BY stat_date, region;
|
||||
|
||||
-- ============================================================================
|
||||
-- 4. 窗口函数
|
||||
-- ============================================================================
|
||||
|
||||
-- ROW_NUMBER(分组取Top N)
|
||||
SELECT *
|
||||
FROM (
|
||||
SELECT
|
||||
department,
|
||||
user_name,
|
||||
amount,
|
||||
ROW_NUMBER() OVER (PARTITION BY department ORDER BY amount DESC) AS rn
|
||||
FROM db_name.kudu_table
|
||||
WHERE stat_date = '${day_id}'
|
||||
) t
|
||||
WHERE rn <= 3;
|
||||
|
||||
-- 累计聚合
|
||||
SELECT
|
||||
stat_date,
|
||||
daily_amount,
|
||||
SUM(daily_amount) OVER (ORDER BY stat_date) AS cumulative_amount
|
||||
FROM (
|
||||
SELECT stat_date, SUM(amount) AS daily_amount
|
||||
FROM db_name.kudu_table
|
||||
GROUP BY stat_date
|
||||
) t;
|
||||
|
||||
-- LAG/LEAD(环比)
|
||||
SELECT
|
||||
stat_date,
|
||||
daily_amount,
|
||||
LAG(daily_amount, 1) OVER (ORDER BY stat_date) AS prev_amount,
|
||||
daily_amount - LAG(daily_amount, 1) OVER (ORDER BY stat_date) AS daily_change
|
||||
FROM (
|
||||
SELECT stat_date, SUM(amount) AS daily_amount
|
||||
FROM db_name.kudu_table
|
||||
GROUP BY stat_date
|
||||
) t;
|
||||
|
||||
-- ============================================================================
|
||||
-- 5. 子查询
|
||||
-- ============================================================================
|
||||
|
||||
-- IN 子查询
|
||||
SELECT *
|
||||
FROM db_name.kudu_table
|
||||
WHERE user_id IN (
|
||||
SELECT user_id FROM db_name.vip_users WHERE vip_level >= 3
|
||||
)
|
||||
AND stat_date = '${day_id}';
|
||||
|
||||
-- EXISTS 子查询
|
||||
SELECT *
|
||||
FROM db_name.kudu_products p
|
||||
WHERE EXISTS (
|
||||
SELECT 1 FROM db_name.kudu_inventory i
|
||||
WHERE i.product_id = p.product_id
|
||||
AND i.quantity > 0
|
||||
);
|
||||
|
||||
-- ============================================================================
|
||||
-- 6. 条件聚合(CASE WHEN + 聚合)
|
||||
-- ============================================================================
|
||||
|
||||
SELECT
|
||||
stat_date,
|
||||
COUNT(*) AS total_count,
|
||||
SUM(CASE WHEN status = 'completed' THEN 1 ELSE 0 END) AS completed_count,
|
||||
SUM(CASE WHEN status = 'cancelled' THEN 1 ELSE 0 END) AS cancelled_count,
|
||||
SUM(CASE WHEN amount > 1000 THEN amount ELSE 0 END) AS high_value_amount
|
||||
FROM db_name.kudu_table
|
||||
WHERE stat_date = '${day_id}'
|
||||
GROUP BY stat_date;
|
||||
|
||||
-- ============================================================================
|
||||
-- 7. LIMIT / OFFSET(分页)
|
||||
-- ============================================================================
|
||||
|
||||
SELECT id, name, amount
|
||||
FROM db_name.kudu_table
|
||||
WHERE stat_date = '${day_id}'
|
||||
ORDER BY id
|
||||
LIMIT 20 OFFSET 0;
|
||||
|
||||
-- ============================================================================
|
||||
-- 8. Kudu 特有:通过主键高效点查
|
||||
-- ============================================================================
|
||||
-- Kudu 主键查询可跳过扫描,直接定位 tablet
|
||||
|
||||
-- 单主键点查
|
||||
SELECT * FROM db_name.kudu_table
|
||||
WHERE id = 12345;
|
||||
|
||||
-- 复合主键点查
|
||||
SELECT * FROM db_name.kudu_composite_pk
|
||||
WHERE user_id = 1001
|
||||
AND order_date = '2026-05-01'
|
||||
AND order_seq = 1;
|
||||
|
||||
-- 主键 IN 查询
|
||||
SELECT * FROM db_name.kudu_table
|
||||
WHERE id IN (1001, 1002, 1003, 1004, 1005);
|
||||
@@ -0,0 +1,176 @@
|
||||
-- =====================================================================
|
||||
-- @SparkSqlName: PAIMONA-D-SQL-{表名}-CREATE
|
||||
-- @Version: 1.0
|
||||
-- @Desc: 建表模板(CREATE TABLE)
|
||||
-- @TargetTables: {新表名}
|
||||
-- @TargetDatabase: Paimon
|
||||
-- =====================================================================
|
||||
|
||||
-- ============================================================================
|
||||
-- 场景1:基础表创建(非分区)
|
||||
-- ============================================================================
|
||||
CREATE TABLE IF NOT EXISTS ${db_eda_env}.basic_table (
|
||||
-- 主键/标识字段
|
||||
id BIGINT COMMENT '主键ID',
|
||||
|
||||
-- 业务字段
|
||||
name STRING COMMENT '名称',
|
||||
category STRING COMMENT '类别',
|
||||
amount DECIMAL(18,2) COMMENT '金额',
|
||||
status STRING COMMENT '状态',
|
||||
|
||||
-- 时间字段
|
||||
created_at TIMESTAMP COMMENT '创建时间',
|
||||
updated_at TIMESTAMP COMMENT '更新时间',
|
||||
|
||||
-- 技术字段
|
||||
etl_time TIMESTAMP COMMENT '数据加工时间',
|
||||
etl_remark STRING COMMENT '备注信息'
|
||||
)
|
||||
COMMENT '基础业务表'
|
||||
STORED AS PARQUET; -- 存储格式
|
||||
|
||||
-- ============================================================================
|
||||
-- 场景2:分区表创建(单分区字段)
|
||||
-- ============================================================================
|
||||
CREATE TABLE IF NOT EXISTS ${db_eda_env}.partitioned_table (
|
||||
-- 主键/标识字段
|
||||
id BIGINT COMMENT '主键ID',
|
||||
|
||||
-- 业务字段
|
||||
user_id STRING COMMENT '用户ID',
|
||||
user_name STRING COMMENT '用户姓名',
|
||||
order_count BIGINT COMMENT '订单数',
|
||||
total_amount DECIMAL(18,2) COMMENT '总金额',
|
||||
|
||||
-- 维度字段
|
||||
department STRING COMMENT '部门',
|
||||
region STRING COMMENT '地区',
|
||||
|
||||
-- 技术字段
|
||||
etl_time TIMESTAMP COMMENT '数据加工时间'
|
||||
)
|
||||
COMMENT '按日分区的统计表'
|
||||
PARTITIONED BY (day_id STRING COMMENT '统计日期,格式yyyy-MM-dd')
|
||||
STORED AS PARQUET;
|
||||
|
||||
-- ============================================================================
|
||||
-- 场景3:多分区字段表
|
||||
-- ============================================================================
|
||||
CREATE TABLE IF NOT EXISTS ${db_eda_env}.multi_partition_table (
|
||||
id BIGINT COMMENT '主键ID',
|
||||
name STRING COMMENT '名称',
|
||||
amount DECIMAL(18,2) COMMENT '金额',
|
||||
etl_time TIMESTAMP COMMENT '数据加工时间'
|
||||
)
|
||||
COMMENT '多分区字段示例表'
|
||||
PARTITIONED BY (
|
||||
year_id STRING COMMENT '年份',
|
||||
month_id STRING COMMENT '月份'
|
||||
)
|
||||
STORED AS PARQUET;
|
||||
|
||||
-- ============================================================================
|
||||
-- 场景4:带表属性配置
|
||||
-- ============================================================================
|
||||
-- Scenario 4: table created with explicit table properties.
-- Only storage-level options are kept in TBLPROPERTIES. The partition
-- overwrite mode is a Spark session setting, so it is applied per job with
--   SET spark.sql.sources.partitionOverwriteMode = dynamic;
-- and is not stored on the table, where it would have no effect.
-- The partition column carries a COMMENT like every other column, as the
-- naming rules of this template require.
CREATE TABLE IF NOT EXISTS ${db_eda_env}.configured_table (
    id          BIGINT          COMMENT '主键ID',
    name        STRING          COMMENT '名称',
    amount      DECIMAL(18,2)   COMMENT '金额',
    etl_time    TIMESTAMP       COMMENT '数据加工时间'
)
COMMENT '带属性配置的表'
PARTITIONED BY (day_id STRING COMMENT '统计日期,格式yyyy-MM-dd')
STORED AS PARQUET
TBLPROPERTIES (
    'parquet.compression' = 'SNAPPY'    -- compression codec
);
|
||||
|
||||
-- ============================================================================
|
||||
-- 场景5:Paimon 表创建(主键表)
|
||||
-- ============================================================================
|
||||
CREATE TABLE IF NOT EXISTS ${db_eda_env}.paimon_pk_table (
|
||||
-- 主键字段(Paimon 主键表必须包含所有主键字段)
|
||||
id BIGINT COMMENT '主键ID',
|
||||
day_id STRING COMMENT '分区日期',
|
||||
|
||||
-- 业务字段
|
||||
name STRING COMMENT '名称',
|
||||
amount DECIMAL(18,2) COMMENT '金额',
|
||||
status STRING COMMENT '状态',
|
||||
|
||||
-- 技术字段
|
||||
etl_time TIMESTAMP COMMENT '数据加工时间'
|
||||
)
|
||||
COMMENT 'Paimon 主键表(支持 MERGE INTO)'
|
||||
PARTITIONED BY (day_id)
|
||||
TBLPROPERTIES (
|
||||
'primary-key' = 'id,day_id', -- 主键定义
|
||||
'bucket' = '4', -- 分桶数
|
||||
'changelog-producer' = 'input' -- 变更日志生产
|
||||
);
|
||||
|
||||
-- ============================================================================
|
||||
-- 场景6:临时表创建(注意:Spark SQL 不支持不带 USING 的 CREATE TEMPORARY TABLE,请改用 CREATE TEMPORARY VIEW 或在临时库中建物理表)
|
||||
-- ============================================================================
|
||||
CREATE TEMPORARY TABLE tmp_processing_table (
|
||||
id BIGINT,
|
||||
name STRING,
|
||||
amount DECIMAL(18,2)
|
||||
);
|
||||
|
||||
-- 或使用 AS 创建临时表
|
||||
-- Build the temporary dataset directly from a query.
-- Spark SQL rejects CREATE TEMPORARY TABLE ... AS SELECT, so a session-scoped
-- temporary view is used instead; it is still referenced as tmp_source.
-- OR REPLACE keeps the statement re-runnable within the same session.
CREATE OR REPLACE TEMPORARY VIEW tmp_source AS
SELECT
    id,
    name,
    amount
FROM source_table
WHERE day_id = '${day_id}';
|
||||
|
||||
-- ============================================================================
|
||||
-- 字段类型速查
|
||||
-- ============================================================================
|
||||
/*
|
||||
| 类型 | 说明 | 适用场景 |
|
||||
|---------------|----------------|------------------------|
|
||||
| STRING | 字符串 | 名称、编码、描述 |
|
||||
| INT | 整数 | 数量、等级、标志 |
|
||||
| BIGINT | 大整数 | ID、计数、金额(整数) |
|
||||
| DECIMAL(p,s) | 定点数 | 金额、比例、精度数值 |
|
||||
| DOUBLE | 浮点数 | 科学计算(慎用于金额) |
|
||||
| BOOLEAN | 布尔 | 状态标志 |
|
||||
| DATE | 日期 | 日期字段 |
|
||||
| TIMESTAMP | 时间戳 | 时间字段 |
|
||||
| ARRAY<type> | 数组 | 多值字段 |
|
||||
| MAP<k,v> | 映射 | 属性字典 |
|
||||
*/
|
||||
|
||||
-- ============================================================================
|
||||
-- 建表规范说明
|
||||
-- ============================================================================
|
||||
/*
|
||||
1. 字段命名规范
|
||||
- 使用 snake_case:user_id, total_amount
|
||||
- 主键:id 或 {业务}_id
|
||||
- 技术字段:etl_time, etl_remark
|
||||
|
||||
2. COMMENT 必须添加
|
||||
- 每个字段必须有 COMMENT
|
||||
- 表必须有 COMMENT
|
||||
|
||||
3. 分区字段选择
|
||||
- 按时间分区:day_id, month_id
|
||||
- 分区粒度:日分区最常用
|
||||
|
||||
4. 存储格式
|
||||
- 推荐:PARQUET(列存储,压缩好)
|
||||
- 可选:ORC、AVRO
|
||||
|
||||
5. Paimon 表特性
|
||||
- primary-key:主键字段列表
|
||||
- bucket:分桶数(影响并发)
|
||||
- 支持 MERGE INTO 操作
|
||||
|
||||
6. 表属性配置
|
||||
- 压缩格式:SNAPPY(推荐)、GZIP、LZ4
|
||||
- 动态分区模式:dynamic(推荐)
|
||||
*/
|
||||
@@ -0,0 +1,148 @@
|
||||
-- =====================================================================
|
||||
-- @SparkSqlName: PAIMONA-D-SQL-{表名}-ETL
|
||||
-- @Version: 1.0
|
||||
-- @Desc: ETL 数据处理模板(临时表链式处理)
|
||||
-- @TargetTables: ${db_eda_env}.{目标表名}
|
||||
-- @SourceTables: {源表列表}
|
||||
-- @TargetDatabase: Paimon
|
||||
-- @SourceDatabase: Paimon
|
||||
-- @任务调度频度: {日/周/月}
|
||||
-- @修改记录:
|
||||
-- 版本号 更新时间 更新人员 更新内容
|
||||
-- V1.0 {日期} {人员} 创建脚本
|
||||
-- @数据处理步骤:
|
||||
-- Step01: {步骤描述}
|
||||
-- Step02: {步骤描述}
|
||||
-- Step03: {步骤描述}
|
||||
-- 参数说明
|
||||
-- 账期参数:
|
||||
-- ${day_id} 日账期,格式:20250101
|
||||
-- 环境变量:
|
||||
-- 变量名 测试环境值 生产环境值
|
||||
-- ${db_tmp_env} {库名} {库名}
|
||||
-- ${db_eda_env} {库名} {库名}
|
||||
-- =====================================================================
|
||||
|
||||
-- ============================================================================
|
||||
-- Step01: 基础清洗与过滤
|
||||
-- ============================================================================
|
||||
-- 说明:从源表读取数据,进行基础过滤和清洗
|
||||
-- 输入:{源表名}
|
||||
-- 输出:${db_tmp_env}.tmp_{表名}_01
|
||||
|
||||
DROP TABLE IF EXISTS ${db_tmp_env}.tmp_xxx_01;
|
||||
-- Step01: materialize one day of cleaned source rows into tmp_xxx_01.
-- category_id and department_id are carried through because Step02 joins the
-- dimension tables on a.category_id / a.department_id
-- (assumes source_table exposes both columns - confirm against the real source).
-- The terminating semicolon is kept outside the trailing line comment so the
-- statement really ends here instead of running into the next DROP TABLE.
CREATE TABLE ${db_tmp_env}.tmp_xxx_01 AS
SELECT
    id,
    name,
    category_id,
    department_id,
    amount,
    status,
    created_at,
    day_id
FROM source_table
WHERE day_id = '${day_id}'                    -- partition filter (mandatory)
  AND status IN ('active', 'valid')           -- business filter
  AND amount > 0                              -- data-quality filter
  AND id IS NOT NULL;                         -- reject rows without a key
|
||||
|
||||
-- ============================================================================
|
||||
-- Step02: 多表关联与维度补全
|
||||
-- ============================================================================
|
||||
-- 说明:关联维度表,补全业务属性字段
|
||||
-- 输入:${db_tmp_env}.tmp_xxx_01, {维度表1}, {维度表2}
|
||||
-- 输出:${db_tmp_env}.tmp_xxx_02
|
||||
|
||||
DROP TABLE IF EXISTS ${db_tmp_env}.tmp_xxx_02;
|
||||
CREATE TABLE ${db_tmp_env}.tmp_xxx_02 AS
|
||||
SELECT
|
||||
a.id,
|
||||
a.name,
|
||||
a.amount,
|
||||
a.status,
|
||||
b.category_name, -- 维度补全:类别名称
|
||||
c.department_name, -- 维度补全:部门名称
|
||||
a.created_at,
|
||||
a.day_id
|
||||
FROM ${db_tmp_env}.tmp_xxx_01 a
|
||||
LEFT JOIN dim_category b
|
||||
ON a.category_id = b.id
|
||||
AND b.day_id = '${day_id}' -- 维度表分区过滤
|
||||
LEFT JOIN dim_department c
|
||||
ON a.department_id = c.id
|
||||
AND c.day_id = '${day_id}'; -- 维度表分区过滤
|
||||
|
||||
-- ============================================================================
|
||||
-- Step03: 聚合计算与指标生成
|
||||
-- ============================================================================
|
||||
-- 说明:按业务维度聚合,计算统计指标
|
||||
-- 输入:${db_tmp_env}.tmp_xxx_02
|
||||
-- 输出:${db_tmp_env}.tmp_xxx_03
|
||||
|
||||
DROP TABLE IF EXISTS ${db_tmp_env}.tmp_xxx_03;
|
||||
CREATE TABLE ${db_tmp_env}.tmp_xxx_03 AS
|
||||
SELECT
|
||||
day_id,
|
||||
category_name,
|
||||
department_name,
|
||||
COUNT(*) AS record_count, -- 记录数
|
||||
COUNT(DISTINCT id) AS unique_count, -- 唯一计数
|
||||
SUM(amount) AS total_amount, -- 总金额
|
||||
AVG(amount) AS avg_amount, -- 平均金额
|
||||
MAX(amount) AS max_amount, -- 最大金额
|
||||
MIN(amount) AS min_amount -- 最小金额
|
||||
FROM ${db_tmp_env}.tmp_xxx_02
|
||||
GROUP BY day_id, category_name, department_name;
|
||||
|
||||
-- ============================================================================
|
||||
-- Step04: 最终输出写入目标表
|
||||
-- ============================================================================
|
||||
-- 说明:补全目标表标准字段,写入结果表
|
||||
-- 输入:${db_tmp_env}.tmp_xxx_03
|
||||
-- 输出:${db_eda_env}.{目标表名}
|
||||
|
||||
-- Step04: overwrite the target day partition with the aggregated metrics.
-- Rows come from the Step03 output table (tmp_xxx_03), which is the input
-- declared for this step. The semicolon is placed after the FROM clause and
-- outside any line comment so the statement is properly terminated.
INSERT OVERWRITE TABLE ${db_eda_env}.target_table
PARTITION (day_id = '${day_id}')
SELECT
    -- business columns
    category_name,
    department_name,
    record_count,
    unique_count,
    total_amount,
    avg_amount,
    max_amount,
    min_amount,

    -- technical columns
    current_timestamp() AS etl_time,          -- load timestamp
    '${day_id}'         AS stat_date          -- statistics date
FROM ${db_tmp_env}.tmp_xxx_03;
|
||||
|
||||
-- ============================================================================
|
||||
-- 关键规则说明
|
||||
-- ============================================================================
|
||||
/*
|
||||
1. 禁止使用 CTE (WITH 子句)
|
||||
- 每个步骤必须物化为临时表
|
||||
- 原因:避免内存溢出,便于调试和断点续跑
|
||||
|
||||
2. 先 DROP 再 CREATE
|
||||
- 每个临时表创建前必须先 DROP
|
||||
- 原因:防止表已存在导致失败
|
||||
|
||||
3. 分区过滤必须前置
|
||||
- 所有源表和维度表查询必须带 day_id 过滤
|
||||
- 原因:避免全表扫描,提升性能
|
||||
|
||||
4. JOIN 条件下推
|
||||
- 维度表关联时带上分区过滤条件
|
||||
- 原因:减少关联数据量
|
||||
|
||||
5. 临时表命名规范
|
||||
- 格式:tmp_{业务简称}_{步骤序号}
|
||||
- 示例:tmp_order_stats_01, tmp_order_stats_02
|
||||
|
||||
6. 目标表写入规范
|
||||
- 使用 INSERT OVERWRITE(覆盖写入)
|
||||
- 明确指定分区
|
||||
- 补全技术字段(etl_time 等)
|
||||
*/
|
||||
@@ -0,0 +1,131 @@
|
||||
-- =====================================================================
|
||||
-- @SparkSqlName: PAIMONA-D-SQL-{表名}-INSERT
|
||||
-- @Version: 1.0
|
||||
-- @Desc: 数据插入模板(INSERT OVERWRITE)
|
||||
-- @TargetTables: ${db_eda_env}.{目标表名}
|
||||
-- @SourceTables: {源表列表}
|
||||
-- @TargetDatabase: Paimon
|
||||
-- @SourceDatabase: Paimon
|
||||
-- =====================================================================
|
||||
|
||||
-- ============================================================================
|
||||
-- 场景1:分区表覆盖写入
|
||||
-- ============================================================================
|
||||
-- 适用:每日/每周/每月增量写入分区表
|
||||
|
||||
INSERT OVERWRITE TABLE ${db_eda_env}.target_table
|
||||
PARTITION (day_id = '${day_id}')
|
||||
SELECT
|
||||
field1,
|
||||
field2,
|
||||
field3,
|
||||
current_timestamp() AS etl_time
|
||||
FROM source_table
|
||||
WHERE day_id = '${day_id}';
|
||||
|
||||
-- ============================================================================
|
||||
-- 场景2:动态分区写入
|
||||
-- ============================================================================
|
||||
-- 适用:多分区字段,数据中包含分区值
|
||||
|
||||
-- Scenario 2: dynamic-partition overwrite; partition values come from the data.
-- Dynamic partition columns are matched by position and must be the LAST
-- columns of the SELECT, in the same order as the PARTITION clause, so
-- etl_time is emitted before day_id and region.
-- The session is switched to dynamic overwrite mode first so that only the
-- partitions produced by the query are replaced.
SET spark.sql.sources.partitionOverwriteMode = dynamic;

INSERT OVERWRITE TABLE ${db_eda_env}.target_table
PARTITION (day_id, region)
SELECT
    field1,
    field2,
    field3,
    current_timestamp() AS etl_time,
    day_id,                                   -- partition column 1 (from data)
    region                                    -- partition column 2 (from data)
FROM source_table
WHERE day_id BETWEEN '${start_day}' AND '${end_day}';
|
||||
|
||||
-- ============================================================================
|
||||
-- 场景3:全表覆盖写入
|
||||
-- ============================================================================
|
||||
-- 适用:全量刷新、初始化数据
|
||||
|
||||
INSERT OVERWRITE TABLE ${db_eda_env}.target_table
|
||||
SELECT
|
||||
field1,
|
||||
field2,
|
||||
field3,
|
||||
current_timestamp() AS etl_time
|
||||
FROM source_table;
|
||||
|
||||
-- ============================================================================
|
||||
-- 场景4:追加写入(慎用)
|
||||
-- ============================================================================
|
||||
-- 适用:日志表、流水表(无分区或允许重复)
|
||||
|
||||
INSERT INTO TABLE ${db_eda_env}.target_table
|
||||
SELECT
|
||||
field1,
|
||||
field2,
|
||||
field3,
|
||||
current_timestamp() AS etl_time
|
||||
FROM source_table
|
||||
WHERE day_id = '${day_id}';
|
||||
|
||||
-- ============================================================================
|
||||
-- 场景5:从临时表写入目标表
|
||||
-- ============================================================================
|
||||
-- 适用:ETL 流程最后一步
|
||||
|
||||
INSERT OVERWRITE TABLE ${db_eda_env}.target_table
|
||||
PARTITION (day_id = '${day_id}')
|
||||
SELECT
|
||||
-- 业务字段(与目标表字段顺序一致)
|
||||
user_id,
|
||||
user_name,
|
||||
order_count,
|
||||
total_amount,
|
||||
|
||||
-- 技术字段
|
||||
current_timestamp() AS etl_time,
|
||||
'${day_id}' AS stat_date
|
||||
FROM ${db_tmp_env}.tmp_xxx_final;
|
||||
|
||||
-- ============================================================================
|
||||
-- 场景6:MERGE INTO(更新插入)
|
||||
-- ============================================================================
|
||||
-- 适用:增量更新、修正历史数据
|
||||
|
||||
MERGE INTO ${db_eda_env}.target_table t
|
||||
USING ${db_tmp_env}.tmp_xxx_source s
|
||||
ON t.id = s.id AND t.day_id = s.day_id
|
||||
WHEN MATCHED THEN
|
||||
UPDATE SET
|
||||
t.name = s.name,
|
||||
t.amount = s.amount,
|
||||
t.etl_time = current_timestamp()
|
||||
WHEN NOT MATCHED THEN
|
||||
INSERT (id, day_id, name, amount, etl_time)
|
||||
VALUES (s.id, s.day_id, s.name, s.amount, current_timestamp());
|
||||
|
||||
-- ============================================================================
|
||||
-- 关键规则说明
|
||||
-- ============================================================================
|
||||
/*
|
||||
1. INSERT OVERWRITE vs INSERT INTO
|
||||
- INSERT OVERWRITE:覆盖写入(推荐)
|
||||
- INSERT INTO:追加写入(可能导致重复数据)
|
||||
|
||||
2. 分区表写入必须指定分区
|
||||
- 避免全表覆盖导致历史数据丢失
|
||||
- 格式:PARTITION (day_id = '${day_id}')
|
||||
|
||||
3. 字段顺序必须与目标表一致
|
||||
- 目标表字段顺序:业务字段 → 技术字段 → 分区字段
|
||||
- SELECT 字段顺序必须匹配
|
||||
|
||||
4. 技术字段补全
|
||||
- etl_time:数据写入时间
|
||||
- stat_date:统计日期(可选)
|
||||
- etl_remark:备注信息(可选)
|
||||
|
||||
5. MERGE INTO 注意事项
|
||||
- Spark 3.x+ 支持
|
||||
- 目标表必须支持事务(如 Paimon/Delta)
|
||||
- 关联字段必须唯一(避免多条匹配)
|
||||
*/
|
||||
@@ -0,0 +1,179 @@
|
||||
-- =====================================================================
|
||||
-- @SparkSqlName: PAIMONA-D-SQL-{表名}-PARTITION
|
||||
-- @Version: 1.0
|
||||
-- @Desc: 分区表操作模板
|
||||
-- @TargetTables: {分区表名}
|
||||
-- @TargetDatabase: Paimon
|
||||
-- =====================================================================
|
||||
|
||||
-- ============================================================================
|
||||
-- 分区表创建
|
||||
-- ============================================================================
|
||||
|
||||
CREATE TABLE IF NOT EXISTS ${db_eda_env}.daily_partition_table (
|
||||
id BIGINT COMMENT '主键ID',
|
||||
user_id STRING COMMENT '用户ID',
|
||||
amount DECIMAL(18,2) COMMENT '金额',
|
||||
etl_time TIMESTAMP COMMENT '数据加工时间'
|
||||
)
|
||||
COMMENT '按日分区表'
|
||||
PARTITIONED BY (day_id STRING COMMENT '统计日期')
|
||||
STORED AS PARQUET;
|
||||
|
||||
-- ============================================================================
|
||||
-- 分区写入操作
|
||||
-- ============================================================================
|
||||
|
||||
-- 1. 静态分区写入(指定分区值)
|
||||
-- 1. Static partition write: the target partition is named explicitly.
-- The partition value uses the same ${day_id} parameter as the source filter,
-- so the rows read for a day are always written into that day's partition.
INSERT OVERWRITE TABLE ${db_eda_env}.daily_partition_table
PARTITION (day_id = '${day_id}')
SELECT
    id,
    user_id,
    amount,
    current_timestamp() AS etl_time
FROM source_table
WHERE day_id = '${day_id}';
|
||||
|
||||
-- 2. 动态分区写入(数据中包含分区值)
|
||||
-- 需要先设置动态分区模式
|
||||
-- The Spark setting is spark.sql.sources.partitionOverwriteMode. With
-- "dynamic", INSERT OVERWRITE replaces only the partitions present in the
-- query result instead of every partition matching the PARTITION clause.
SET spark.sql.sources.partitionOverwriteMode = dynamic;
|
||||
|
||||
INSERT OVERWRITE TABLE ${db_eda_env}.daily_partition_table
|
||||
PARTITION (day_id) -- 动态分区字段
|
||||
SELECT
|
||||
id,
|
||||
user_id,
|
||||
amount,
|
||||
current_timestamp() AS etl_time,
|
||||
day_id -- 数据中包含分区值
|
||||
FROM source_table
|
||||
WHERE day_id BETWEEN '2026-05-01' AND '2026-05-09';
|
||||
|
||||
-- ============================================================================
|
||||
-- 分区查询操作
|
||||
-- ============================================================================
|
||||
|
||||
-- 3. 单分区查询
|
||||
SELECT *
|
||||
FROM ${db_eda_env}.daily_partition_table
|
||||
WHERE day_id = '2026-05-09';
|
||||
|
||||
-- 4. 多分区查询
|
||||
SELECT *
|
||||
FROM ${db_eda_env}.daily_partition_table
|
||||
WHERE day_id IN ('2026-05-01', '2026-05-02', '2026-05-03');
|
||||
|
||||
-- 5. 分区范围查询
|
||||
SELECT *
|
||||
FROM ${db_eda_env}.daily_partition_table
|
||||
WHERE day_id >= '2026-05-01'
|
||||
AND day_id <= '2026-05-09';
|
||||
|
||||
-- 6. 最近 N 天分区查询(动态计算)
|
||||
SELECT *
|
||||
FROM ${db_eda_env}.daily_partition_table
|
||||
WHERE day_id >= date_format(date_sub(current_date(), 30), 'yyyy-MM-dd');
|
||||
|
||||
-- ============================================================================
|
||||
-- 分区管理操作
|
||||
-- ============================================================================
|
||||
|
||||
-- 7. 查看分区列表
|
||||
SHOW PARTITIONS ${db_eda_env}.daily_partition_table;
|
||||
|
||||
-- 8. 查看特定分区详情
|
||||
DESCRIBE EXTENDED ${db_eda_env}.daily_partition_table PARTITION (day_id = '2026-05-09');
|
||||
|
||||
-- 9. 添加分区(手动创建空分区,部分表类型支持)
|
||||
ALTER TABLE ${db_eda_env}.daily_partition_table
|
||||
ADD IF NOT EXISTS PARTITION (day_id = '2026-05-10');
|
||||
|
||||
-- 10. 删除分区(清理历史数据)
|
||||
ALTER TABLE ${db_eda_env}.daily_partition_table
|
||||
DROP IF EXISTS PARTITION (day_id = '2026-01-01');
|
||||
|
||||
-- ============================================================================
|
||||
-- 多分区字段操作
|
||||
-- ============================================================================
|
||||
|
||||
-- 11. 多分区字段表创建
|
||||
CREATE TABLE IF NOT EXISTS ${db_eda_env}.multi_partition_table (
|
||||
id BIGINT,
|
||||
name STRING,
|
||||
amount DECIMAL(18,2),
|
||||
etl_time TIMESTAMP
|
||||
)
|
||||
PARTITIONED BY (year_id STRING, month_id STRING)
|
||||
STORED AS PARQUET;
|
||||
|
||||
-- 12. 多分区字段写入
|
||||
INSERT OVERWRITE TABLE ${db_eda_env}.multi_partition_table
|
||||
PARTITION (year_id = '2026', month_id = '05')
|
||||
SELECT
|
||||
id,
|
||||
name,
|
||||
amount,
|
||||
current_timestamp() AS etl_time
|
||||
FROM source_table
|
||||
WHERE year_id = '2026' AND month_id = '05';
|
||||
|
||||
-- 13. 多分区字段动态写入
|
||||
-- The Spark setting is spark.sql.sources.partitionOverwriteMode. With
-- "dynamic", only the (year_id, month_id) partitions produced by the query
-- below are overwritten; all other partitions are left untouched.
SET spark.sql.sources.partitionOverwriteMode = dynamic;
|
||||
|
||||
INSERT OVERWRITE TABLE ${db_eda_env}.multi_partition_table
|
||||
PARTITION (year_id, month_id)
|
||||
SELECT
|
||||
id,
|
||||
name,
|
||||
amount,
|
||||
current_timestamp() AS etl_time,
|
||||
year_id,
|
||||
month_id
|
||||
FROM source_table;
|
||||
|
||||
-- ============================================================================
|
||||
-- 分区数据清理
|
||||
-- ============================================================================
|
||||
|
||||
-- 14. 清理指定分区数据
|
||||
-- 14. Rewrite one partition, keeping only the rows that pass a validity rule.
-- Columns are listed explicitly: with a static PARTITION clause the SELECT
-- must supply only the non-partition columns, and SELECT * would also return
-- day_id and break the column match.
-- The filter uses amount because daily_partition_table, as created in this
-- template, has no status column; substitute your own validity rule.
-- NOTE(review): this reads and overwrites the same table - confirm the
-- engine/table format in use allows that before running it.
INSERT OVERWRITE TABLE ${db_eda_env}.daily_partition_table
PARTITION (day_id = '2026-05-09')
SELECT
    id,
    user_id,
    amount,
    etl_time                                  -- keep the original load time
FROM ${db_eda_env}.daily_partition_table
WHERE day_id = '2026-05-09'
  AND amount > 0;                             -- keep valid rows only
|
||||
|
||||
-- 15. 清理 N 天前分区(批量)
|
||||
-- 使用脚本或程序循环执行
|
||||
-- ALTER TABLE xxx DROP PARTITION (day_id = '历史分区')
|
||||
|
||||
-- ============================================================================
|
||||
-- 分区最佳实践
|
||||
-- ============================================================================
|
||||
/*
|
||||
1. 分区字段选择原则
|
||||
- 查询高频过滤字段
|
||||
- 数据量分布均匀的字段
|
||||
- 时间字段最常用(day_id, month_id)
|
||||
|
||||
2. 分区粒度选择
|
||||
- 日增量数据 → day_id 分区
|
||||
- 月增量数据 → month_id 分区
|
||||
- 大数据量 → 可细分到 hour_id
|
||||
|
||||
3. 分区数量控制
|
||||
- 单表分区数建议 < 10000
|
||||
- 过多分区影响元数据性能
|
||||
|
||||
4. 查询必须带分区过滤
|
||||
- 避免:SELECT * FROM table(全表扫描)
|
||||
- 推荐:SELECT * FROM table WHERE day_id = '${day_id}'
|
||||
|
||||
5. 动态分区写入设置
|
||||
   - SET spark.sql.sources.partitionOverwriteMode = dynamic;
|
||||
- 避免误覆盖其他分区
|
||||
|
||||
6. 分区数据清理
|
||||
- 定期清理历史分区(如保留近90天)
|
||||
- 使用 ALTER TABLE DROP PARTITION
|
||||
*/
|
||||
@@ -0,0 +1,160 @@
|
||||
-- =====================================================================
|
||||
-- @SparkSqlName: PAIMONA-D-SQL-{表名}-QUERY
|
||||
-- @Version: 1.0
|
||||
-- @Desc: 标准 SELECT 查询模板
|
||||
-- @TargetTables: 无(查询输出)
|
||||
-- @SourceTables: {源表列表}
|
||||
-- @TargetDatabase: Paimon
|
||||
-- @SourceDatabase: Paimon
|
||||
-- =====================================================================
|
||||
|
||||
-- ============================================================================
|
||||
-- 基础查询示例
|
||||
-- ============================================================================
|
||||
|
||||
-- 1. 单表查询
|
||||
SELECT
|
||||
id,
|
||||
name,
|
||||
amount,
|
||||
created_at
|
||||
FROM source_table
|
||||
WHERE day_id = '${day_id}' -- 分区过滤
|
||||
AND status = 'active' -- 业务过滤
|
||||
ORDER BY created_at DESC
|
||||
LIMIT 1000;
|
||||
|
||||
-- ============================================================================
|
||||
-- JOIN 查询示例
|
||||
-- ============================================================================
|
||||
|
||||
-- 2. 两表 JOIN
|
||||
SELECT
|
||||
a.id,
|
||||
a.name,
|
||||
b.category_name
|
||||
FROM main_table a
|
||||
JOIN dim_table b ON a.category_id = b.id
|
||||
WHERE a.day_id = '${day_id}'
|
||||
AND b.is_active = true;
|
||||
|
||||
-- 3. 多表 JOIN(带别名)
|
||||
SELECT
|
||||
o.order_id,
|
||||
u.user_name,
|
||||
p.product_name,
|
||||
oi.quantity,
|
||||
oi.unit_price
|
||||
FROM orders o
|
||||
JOIN users u ON o.user_id = u.id
|
||||
JOIN order_items oi ON o.order_id = oi.order_id
|
||||
JOIN products p ON oi.product_id = p.id
|
||||
WHERE o.day_id = '${day_id}'
|
||||
AND o.status IN ('completed', 'shipped');
|
||||
|
||||
-- ============================================================================
|
||||
-- 聚合查询示例
|
||||
-- ============================================================================
|
||||
|
||||
-- 4. GROUP BY 聚合
|
||||
SELECT
|
||||
department,
|
||||
COUNT(*) AS employee_count,
|
||||
SUM(salary) AS total_salary,
|
||||
AVG(salary) AS avg_salary,
|
||||
MAX(salary) AS max_salary,
|
||||
MIN(salary) AS min_salary
|
||||
FROM employees
|
||||
WHERE day_id = '${day_id}'
|
||||
GROUP BY department
|
||||
HAVING COUNT(*) >= 5
|
||||
ORDER BY total_salary DESC;
|
||||
|
||||
-- 5. 多字段分组 + 去重计数
|
||||
SELECT
|
||||
date,
|
||||
region,
|
||||
COUNT(*) AS order_count,
|
||||
COUNT(DISTINCT user_id) AS unique_users,
|
||||
SUM(amount) AS total_amount
|
||||
FROM orders
|
||||
WHERE day_id = '${day_id}'
|
||||
GROUP BY date, region;
|
||||
|
||||
-- ============================================================================
|
||||
-- 窗口函数示例
|
||||
-- ============================================================================
|
||||
|
||||
-- 6. ROW_NUMBER(分组取Top N)
|
||||
SELECT *
|
||||
FROM (
|
||||
SELECT
|
||||
department,
|
||||
name,
|
||||
salary,
|
||||
ROW_NUMBER() OVER (PARTITION BY department ORDER BY salary DESC) AS rn
|
||||
FROM employees
|
||||
WHERE day_id = '${day_id}'
|
||||
) t
|
||||
WHERE rn <= 3; -- 每个部门薪资前3名
|
||||
|
||||
-- 7. 累计聚合
|
||||
SELECT
|
||||
date,
|
||||
amount,
|
||||
SUM(amount) OVER (ORDER BY date) AS cumulative_amount,
|
||||
AVG(amount) OVER (
|
||||
ORDER BY date
|
||||
ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
|
||||
) AS moving_avg_7d
|
||||
FROM daily_sales
|
||||
WHERE day_id = '${day_id}';
|
||||
|
||||
-- 8. LAG/LEAD(环比计算)
|
||||
-- Day-over-day comparison with LAG: previous value, absolute change and
-- growth rate in percent.
-- NULLIF turns a zero previous amount into NULL so the growth rate is NULL
-- instead of a division-by-zero error when ANSI mode is enabled.
-- The first row has no predecessor, so its derived columns are NULL.
SELECT
    date,
    amount,
    LAG(amount, 1) OVER (ORDER BY date)          AS prev_amount,
    amount - LAG(amount, 1) OVER (ORDER BY date) AS daily_change,
    ROUND(
        (amount - LAG(amount, 1) OVER (ORDER BY date))
        / NULLIF(LAG(amount, 1) OVER (ORDER BY date), 0) * 100,
        2
    ) AS growth_rate_pct
FROM daily_sales
WHERE day_id = '${day_id}';
|
||||
|
||||
-- ============================================================================
|
||||
-- 子查询示例
|
||||
-- ============================================================================
|
||||
|
||||
-- 9. IN 子查询
|
||||
SELECT *
|
||||
FROM orders
|
||||
WHERE user_id IN (
|
||||
SELECT id FROM users WHERE vip_level >= 3
|
||||
)
|
||||
AND day_id = '${day_id}';
|
||||
|
||||
-- 10. EXISTS 子查询
|
||||
SELECT *
|
||||
FROM products p
|
||||
WHERE EXISTS (
|
||||
SELECT 1 FROM inventory i
|
||||
WHERE i.product_id = p.id
|
||||
AND i.quantity > 0
|
||||
)
|
||||
AND p.day_id = '${day_id}';
|
||||
|
||||
-- ============================================================================
|
||||
-- 条件聚合示例
|
||||
-- ============================================================================
|
||||
|
||||
-- 11. CASE WHEN + 聚合
|
||||
SELECT
|
||||
date,
|
||||
COUNT(*) AS total_orders,
|
||||
SUM(CASE WHEN status = 'completed' THEN 1 ELSE 0 END) AS completed_count,
|
||||
SUM(CASE WHEN status = 'cancelled' THEN 1 ELSE 0 END) AS cancelled_count,
|
||||
SUM(CASE WHEN status = 'pending' THEN 1 ELSE 0 END) AS pending_count,
|
||||
SUM(CASE WHEN amount > 1000 THEN amount ELSE 0 END) AS high_value_amount
|
||||
FROM orders
|
||||
WHERE day_id = '${day_id}'
|
||||
GROUP BY date;
|
||||
Reference in New Issue
Block a user