Files
smart-data-dev-skill/one-skill/smart-data-developer/references/sql/reference/aggregation-patterns.md
2026-05-13 11:03:00 +08:00

459 lines
8.8 KiB
Markdown
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# 聚合模式速查
## 基本聚合
### 聚合函数列表
| 函数 | 说明 | 示例 |
|------|------|------|
| COUNT(*) | 计数（含 NULL 行） | 总行数 |
| COUNT(col) | 计数（不含 NULL） | 有效数据数 |
| COUNT(DISTINCT col) | 去重计数 | 用户数 |
| SUM(col) | 求和 | 总销售额 |
| AVG(col) | 平均值 | 平均薪资 |
| MIN(col) | 最小值 | 最小年龄 |
| MAX(col) | 最大值 | 最高分 |
### 基本用法
```sql
-- Whole-table aggregation: each aggregate is evaluated over every row of orders
SELECT
    COUNT(*)                AS total_rows,
    COUNT(DISTINCT user_id) AS unique_users,
    SUM(amount)             AS total_amount,
    AVG(amount)             AS avg_amount,
    MIN(amount)             AS min_amount,
    MAX(amount)             AS max_amount
FROM orders

-- Grouped aggregation: one output row per department
SELECT
    department,
    COUNT(*)    AS employee_count,
    AVG(salary) AS avg_salary,
    MAX(salary) AS max_salary
FROM employees
GROUP BY department
```
---
## GROUP BY
```sql
-- Group by a single column
SELECT
    category,
    COUNT(*) AS count
FROM products
GROUP BY category

-- Group by several columns: one row per (category, brand) combination
SELECT
    category,
    brand,
    COUNT(*)   AS count,
    SUM(price) AS total_price
FROM products
GROUP BY
    category,
    brand

-- Grouping combined with ordering: departments with the most rows first
SELECT
    department,
    COUNT(*) AS count
FROM employees
GROUP BY department
ORDER BY count DESC

-- Grouping with LIMIT: keep only the top N groups (N = 10 here)
SELECT
    category,
    COUNT(*) AS count
FROM products
GROUP BY category
ORDER BY count DESC
LIMIT 10
```
---
## HAVING（分组过滤）
```sql
-- HAVING vs WHERE
--   WHERE  filters the raw rows      (applied before GROUP BY)
--   HAVING filters the grouped rows  (applied after GROUP BY)

-- Example: users that placed more than 10 orders
SELECT
    user_id,
    COUNT(*)    AS order_count,
    SUM(amount) AS total_amount
FROM orders
GROUP BY user_id
HAVING COUNT(*) > 10

-- HAVING with several conditions: both must hold for a department to be kept
SELECT
    department,
    AVG(salary) AS avg_salary
FROM employees
GROUP BY department
HAVING
    AVG(salary) > 5000
    AND COUNT(*) >= 5

-- HAVING followed by ORDER BY: filter the groups, then sort what remains
SELECT
    user_id,
    COUNT(*) AS order_count
FROM orders
GROUP BY user_id
HAVING COUNT(*) >= 5
ORDER BY order_count DESC
```
---
## 去重计数
```sql
-- COUNT(DISTINCT): number of different user_id values in the whole table
SELECT
    COUNT(DISTINCT user_id) AS unique_users
FROM orders

-- Distinct count per group: distinct users and total orders for each date
SELECT
    date,
    COUNT(DISTINCT user_id) AS unique_users,
    COUNT(*)                AS total_orders
FROM orders
GROUP BY date

-- Distinct count over a combination of columns
SELECT
    COUNT(DISTINCT user_id, product_id) AS unique_user_product_pairs
FROM order_items

-- Approximate distinct count for large data volumes (performance optimisation)
SELECT
    approx_count_distinct(user_id) AS approx_unique_users
FROM orders
```
---
## 集合聚合(数组结果)
```sql
-- collect_list: gather the group's values into an array, duplicates kept
SELECT
    user_id,
    collect_list(product_id) AS products
FROM orders
GROUP BY user_id

-- collect_set: gather the group's values into an array, duplicates removed
SELECT
    user_id,
    collect_set(product_id) AS unique_products
FROM orders
GROUP BY user_id

-- size(): number of elements in each collected array
SELECT
    user_id,
    size(collect_list(product_id)) AS product_count,
    size(collect_set(product_id))  AS unique_product_count
FROM orders
GROUP BY user_id
```
---
## 多级聚合（ROLLUP / CUBE / GROUPING SETS）
### ROLLUP（层级汇总）
```sql
-- ROLLUP removes grouping columns from right to left, adding one subtotal level per step.
-- GROUPING(col) = 1 identifies a NULL that the rollup generated, so the subtotal label
-- is applied only to subtotal rows; a genuine NULL stored in the data stays NULL
-- (COALESCE would have mislabelled it as a subtotal).
-- CAST keeps both CASE branches the same string type, since the labels are strings.
SELECT
    CASE WHEN GROUPING(year)   = 1 THEN '总计' ELSE CAST(year   AS STRING) END AS year,
    CASE WHEN GROUPING(month)  = 1 THEN '全年' ELSE CAST(month  AS STRING) END AS month,
    CASE WHEN GROUPING(region) = 1 THEN '全国' ELSE CAST(region AS STRING) END AS region,
    SUM(sales) AS total_sales
FROM sales_data
GROUP BY ROLLUP (year, month, region);
-- The result contains:
-- 1. year + month + region groups
-- 2. year + month subtotals (region rolled up)
-- 3. year subtotals         (month and region rolled up)
-- 4. the grand total        (year, month and region rolled up)
```
### CUBE（全维度组合）
```sql
-- CUBE aggregates over every combination of the listed grouping columns.
-- GROUPING(col) = 1 identifies a NULL that the cube generated, so the label is applied
-- only to subtotal rows; a genuine NULL stored in the data stays NULL
-- (COALESCE would have mislabelled it as a subtotal).
-- CAST keeps both CASE branches the same string type, since the labels are strings.
SELECT
    CASE WHEN GROUPING(year)   = 1 THEN '总计' ELSE CAST(year   AS STRING) END AS year,
    CASE WHEN GROUPING(month)  = 1 THEN '全月' ELSE CAST(month  AS STRING) END AS month,
    CASE WHEN GROUPING(region) = 1 THEN '全国' ELSE CAST(region AS STRING) END AS region,
    SUM(sales) AS total_sales
FROM sales_data
GROUP BY CUBE (year, month, region);
-- The result contains every combination:
-- year+month+region, year+month, year+region, month+region,
-- year, month, region, and the grand total
```
### GROUPING SETS（自定义组合）
```sql
-- GROUPING SETS: aggregate over exactly the listed column combinations
SELECT
    year,
    month,
    region,
    SUM(sales) AS total_sales
FROM sales_data
GROUP BY
    GROUPING SETS (
        (year, month),
        (year, region),
        (region),
        ()
    )
-- Equivalent to running one GROUP BY per listed set and combining the results
```
---
## GROUPING 函数(判断汇总级别)
```sql
-- GROUPING(col) tells whether a NULL in col was produced by the subtotal level
--   0 = real value, 1 = NULL generated for a subtotal row
SELECT
    year,
    month,
    SUM(sales)      AS total_sales,
    GROUPING(year)  AS is_year_total,
    GROUPING(month) AS is_month_total
FROM sales_data
GROUP BY ROLLUP (year, month)

-- Use GROUPING to tell a genuine NULL apart from a subtotal NULL
SELECT
    CASE
        WHEN GROUPING(region) = 1 THEN '全国汇总'
        ELSE region
    END        AS region,
    SUM(sales) AS total_sales
FROM sales_data
GROUP BY ROLLUP (region)
```
---
## 条件聚合（CASE WHEN + 聚合）
```sql
-- Counting per condition: each CASE yields 1 for a matching row and 0 otherwise
SELECT
    date,
    COUNT(*) AS total_orders,
    SUM(CASE WHEN status = 'completed' THEN 1 ELSE 0 END) AS completed_count,
    SUM(CASE WHEN status = 'cancelled' THEN 1 ELSE 0 END) AS cancelled_count,
    SUM(CASE WHEN status = 'pending'   THEN 1 ELSE 0 END) AS pending_count
FROM orders
GROUP BY date

-- Summing per condition: non-matching rows contribute 0 to the sum
SELECT
    department,
    SUM(salary) AS total_salary,
    SUM(CASE WHEN gender = 'M' THEN salary ELSE 0 END) AS male_salary,
    SUM(CASE WHEN gender = 'F' THEN salary ELSE 0 END) AS female_salary
FROM employees
GROUP BY department

-- Conditional average: a CASE without ELSE yields NULL for non-matching rows,
-- and AVG skips NULLs, so only prices above 100 enter the average
SELECT
    category,
    AVG(CASE WHEN price > 100 THEN price END) AS high_price_avg
FROM products
GROUP BY category
```
---
## 聚合 + 窗口函数
```sql
-- Share of each salary within its department, as a percentage.
-- NULLIF turns a zero department total into NULL, so the ratio becomes NULL
-- instead of dividing by zero.
SELECT
    department,
    salary,
    ROUND(
        salary / NULLIF(SUM(salary) OVER (PARTITION BY department), 0) * 100,
        2
    ) AS salary_pct
FROM employees;

-- Running total per department, ordered by date.
-- The frame is spelled out as ROWS: with only ORDER BY, the implicit RANGE frame
-- gives every row sharing the same date the same cumulative value.
-- NOTE(review): if a department can have several rows on one date, add a unique
-- tiebreaker column to ORDER BY so the row-by-row order is deterministic.
SELECT
    date,
    department,
    amount,
    SUM(amount) OVER (
        PARTITION BY department
        ORDER BY date
        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) AS cumulative
FROM sales;

-- Rank within each department by salary, highest first.
-- RANK() gives tied salaries the same rank and leaves gaps after ties.
SELECT
    *,
    RANK() OVER (PARTITION BY department ORDER BY salary DESC) AS salary_rank
FROM employees;
```
---
## 多表聚合
```sql
-- Aggregate after a join: order statistics per user department.
-- LEFT JOIN keeps departments whose users have no orders. COUNT(o.id) is 0 for them,
-- and COALESCE converts the NULL that SUM returns when nothing matched into 0.
SELECT
    u.department,
    COUNT(o.id)                AS order_count,
    COALESCE(SUM(o.amount), 0) AS total_amount
FROM users AS u
LEFT JOIN orders AS o
    ON u.id = o.user_id
GROUP BY u.department;

-- Aggregate inside subqueries, then join the per-department results.
-- <=> is null-safe equality: employees whose department is NULL form a group in both
-- subqueries, and a plain = comparison would silently drop that group from the join.
SELECT
    dept_stats.department,
    dept_stats.avg_salary,
    emp_count.employee_count
FROM (
    SELECT
        department,
        AVG(salary) AS avg_salary
    FROM employees
    GROUP BY department
) AS dept_stats
INNER JOIN (
    SELECT
        department,
        COUNT(*) AS employee_count
    FROM employees
    GROUP BY department
) AS emp_count
    ON dept_stats.department <=> emp_count.department;
```
---
## 常见问题
### 问题1：GROUP BY 字段遗漏
```sql
-- WRONG (kept on purpose as the counter-example): a selected column is missing from GROUP BY
SELECT
    department,
    name,          -- error: name is neither grouped nor aggregated
    AVG(salary)
FROM employees
GROUP BY department

-- Fix 1: add the column to GROUP BY (one row per department + name)
SELECT
    department,
    name,
    AVG(salary)
FROM employees
GROUP BY
    department,
    name

-- Fix 2: wrap the column in an aggregate function (still one row per department)
SELECT
    department,
    collect_list(name) AS names,   -- gathers every name of the group into an array
    AVG(salary)
FROM employees
GROUP BY department
```
### 问题2：NULL 影响聚合
```sql
-- COUNT(*)   counts every row, including rows that hold NULLs
-- COUNT(col) counts only the rows where col is not NULL
SELECT
    COUNT(*)               AS total,          -- all rows
    COUNT(amount)          AS valid,          -- rows whose amount is not NULL
    COUNT(DISTINCT amount) AS unique_values
FROM orders

-- SUM / AVG / MIN / MAX skip NULL inputs automatically
SELECT
    AVG(price)   -- NULL prices are left out of the average
FROM products
```
### 问题3：聚合结果精度
```sql
-- AVG may lose precision, depending on the type of the averaged column.
-- (The select items are separated by a comma; without it the statement does not parse.)
SELECT
    AVG(amount)                         AS avg_amount,   -- precision may be insufficient
    AVG(CAST(amount AS DECIMAL(18, 6))) AS precise_avg   -- cast first for a higher-precision average
FROM orders;

-- Use ROUND to fix the number of decimal places in the result
SELECT
    ROUND(AVG(amount), 2) AS avg_amount
FROM orders;
```
---
## 聚合性能优化
```sql
-- 1. Filter first, then aggregate: WHERE removes rows before they are grouped
SELECT
    department,
    COUNT(*) AS count
FROM employees
WHERE hire_date >= '2024-01-01'
GROUP BY department;

-- 2. Use approximate aggregation on large data volumes.
--    approx_count_distinct returns an estimate rather than an exact count,
--    in exchange for being faster than COUNT(DISTINCT).
SELECT
    approx_count_distinct(user_id) AS users
FROM orders;

-- 3. Keep the number of grouping columns small:
--    GROUP BY category is cheaper than GROUP BY category, brand
SELECT
    category,
    COUNT(*) AS count
FROM products
GROUP BY category;

-- 4. Avoid per-row computation ahead of the aggregation.
--    Scaling by a constant commutes with AVG, so aggregate first and multiply once
--    per group instead of once per input row.
SELECT
    department,
    AVG(salary) * 1.1 AS adjusted_avg
FROM employees
GROUP BY department;
```
---
## 聚合模式选择指南
| 需求 | 推荐方式 |
|------|----------|
| 简单统计 | GROUP BY + 聚合函数 |
| 分条件统计 | CASE WHEN + SUM/COUNT |
| 去重计数 | COUNT(DISTINCT) |
| 大数据去重 | approx_count_distinct |
| 收集数组 | collect_list / collect_set |
| 层级汇总 | ROLLUP |
| 全维度汇总 | CUBE |
| 自定义组合 | GROUPING SETS |
| 分组内计算 | 窗口函数 |
| 多条件过滤 | HAVING |