459 lines
8.8 KiB
Markdown
459 lines
8.8 KiB
Markdown
# 聚合模式速查
|
||
|
||
## 基本聚合
|
||
|
||
### 聚合函数列表
|
||
|
||
| 函数 | 说明 | 示例 |
|
||
|------|------|------|
|
||
| COUNT(*) | 计数(含NULL行) | 总行数 |
|
||
| COUNT(col) | 计数(不含NULL) | 有效数据数 |
|
||
| COUNT(DISTINCT col) | 去重计数 | 用户数 |
|
||
| SUM(col) | 求和 | 总销售额 |
|
||
| AVG(col) | 平均值 | 平均薪资 |
|
||
| MIN(col) | 最小值 | 最小年龄 |
|
||
| MAX(col) | 最大值 | 最高分 |
|
||
|
||
### 基本用法
|
||
|
||
```sql
|
||
-- 单列聚合
|
||
SELECT
|
||
COUNT(*) AS total_rows,
|
||
COUNT(DISTINCT user_id) AS unique_users,
|
||
SUM(amount) AS total_amount,
|
||
AVG(amount) AS avg_amount,
|
||
MIN(amount) AS min_amount,
|
||
MAX(amount) AS max_amount
|
||
FROM orders
|
||
|
||
-- 分组聚合
|
||
SELECT
|
||
department,
|
||
COUNT(*) AS employee_count,
|
||
AVG(salary) AS avg_salary,
|
||
MAX(salary) AS max_salary
|
||
FROM employees
|
||
GROUP BY department
|
||
```
|
||
|
||
---
|
||
|
||
## GROUP BY
|
||
|
||
```sql
|
||
-- 单字段分组
|
||
SELECT
|
||
category,
|
||
COUNT(*) AS count
|
||
FROM products
|
||
GROUP BY category
|
||
|
||
-- 多字段分组
|
||
SELECT
|
||
category,
|
||
brand,
|
||
COUNT(*) AS count,
|
||
SUM(price) AS total_price
|
||
FROM products
|
||
GROUP BY category, brand
|
||
|
||
-- 分组 + 排序
|
||
SELECT
|
||
department,
|
||
COUNT(*) AS count
|
||
FROM employees
|
||
GROUP BY department
|
||
ORDER BY count DESC
|
||
|
||
-- 分组 + LIMIT(取Top N组)
|
||
SELECT
|
||
category,
|
||
COUNT(*) AS count
|
||
FROM products
|
||
GROUP BY category
|
||
ORDER BY count DESC
|
||
LIMIT 10
|
||
```
|
||
|
||
---
|
||
|
||
## HAVING(分组过滤)
|
||
|
||
```sql
|
||
-- HAVING vs WHERE
|
||
-- WHERE:过滤原始行(GROUP BY 前)
|
||
-- HAVING:过滤分组结果(GROUP BY 后)
|
||
|
||
-- 示例:筛选订单数大于10的用户
|
||
SELECT
|
||
user_id,
|
||
COUNT(*) AS order_count,
|
||
SUM(amount) AS total_amount
|
||
FROM orders
|
||
GROUP BY user_id
|
||
HAVING COUNT(*) > 10
|
||
|
||
-- 多条件 HAVING
|
||
SELECT
|
||
department,
|
||
AVG(salary) AS avg_salary
|
||
FROM employees
|
||
GROUP BY department
|
||
HAVING AVG(salary) > 5000
|
||
AND COUNT(*) >= 5
|
||
|
||
-- HAVING + ORDER BY
|
||
SELECT
|
||
user_id,
|
||
COUNT(*) AS order_count
|
||
FROM orders
|
||
GROUP BY user_id
|
||
HAVING COUNT(*) >= 5
|
||
ORDER BY order_count DESC
|
||
```
|
||
|
||
---
|
||
|
||
## 去重计数
|
||
|
||
```sql
|
||
-- COUNT(DISTINCT)
|
||
SELECT
|
||
COUNT(DISTINCT user_id) AS unique_users
|
||
FROM orders
|
||
|
||
-- 分组去重计数
|
||
SELECT
|
||
date,
|
||
COUNT(DISTINCT user_id) AS unique_users,
|
||
COUNT(*) AS total_orders
|
||
FROM orders
|
||
GROUP BY date
|
||
|
||
-- 多字段去重计数(任一字段为 NULL 的行不计入)
|
||
SELECT
|
||
COUNT(DISTINCT user_id, product_id) AS unique_user_product_pairs
|
||
FROM order_items
|
||
|
||
-- 大数据量近似去重(性能优化)
|
||
SELECT
|
||
approx_count_distinct(user_id) AS approx_unique_users
|
||
FROM orders
|
||
```
|
||
|
||
---
|
||
|
||
## 集合聚合(数组结果)
|
||
|
||
```sql
|
||
-- collect_list:收集为数组(不去重)
|
||
SELECT
|
||
user_id,
|
||
collect_list(product_id) AS products
|
||
FROM orders
|
||
GROUP BY user_id
|
||
|
||
-- collect_set:收集为数组(去重)
|
||
SELECT
|
||
user_id,
|
||
collect_set(product_id) AS unique_products
|
||
FROM orders
|
||
GROUP BY user_id
|
||
|
||
-- 取数组大小
|
||
SELECT
|
||
user_id,
|
||
size(collect_list(product_id)) AS product_count,
|
||
size(collect_set(product_id)) AS unique_product_count
|
||
FROM orders
|
||
GROUP BY user_id
|
||
```
|
||
|
||
---
|
||
|
||
## 多级聚合(ROLLUP / CUBE / GROUPING SETS)
|
||
|
||
### ROLLUP(层级汇总)
|
||
|
||
```sql
|
||
-- 从右到左递减分组级别(注意:COALESCE 无法区分原始数据中的 NULL 与汇总产生的 NULL,需要区分时请用 GROUPING())
|
||
SELECT
|
||
COALESCE(year, '总计') AS year,
|
||
COALESCE(month, '全年') AS month,
|
||
COALESCE(region, '全国') AS region,
|
||
SUM(sales) AS total_sales
|
||
FROM sales_data
|
||
GROUP BY ROLLUP (year, month, region)
|
||
|
||
-- 结果包含:
|
||
-- 1. year + month + region 分组
|
||
-- 2. year + month 汇总(region为NULL)
|
||
-- 3. year 汇总(month,region为NULL)
|
||
-- 4. 全表汇总(year,month,region为NULL)
|
||
```
|
||
|
||
### CUBE(全维度组合)
|
||
|
||
```sql
|
||
-- 所有分组组合
|
||
SELECT
|
||
COALESCE(year, '总计') AS year,
|
||
COALESCE(month, '全月') AS month,
|
||
COALESCE(region, '全国') AS region,
|
||
SUM(sales) AS total_sales
|
||
FROM sales_data
|
||
GROUP BY CUBE (year, month, region)
|
||
|
||
-- 结果包含所有组合:
|
||
-- year+month+region, year+month, year+region, month+region
|
||
-- year, month, region, 全表汇总
|
||
```
|
||
|
||
### GROUPING SETS(自定义组合)
|
||
|
||
```sql
|
||
-- 指定分组组合
|
||
SELECT
|
||
year,
|
||
month,
|
||
region,
|
||
SUM(sales) AS total_sales
|
||
FROM sales_data
|
||
GROUP BY GROUPING SETS (
|
||
(year, month),
|
||
(year, region),
|
||
(region),
|
||
()
|
||
)
|
||
|
||
-- 等价于多个 GROUP BY 合并
|
||
```
|
||
|
||
---
|
||
|
||
## GROUPING 函数(判断汇总级别)
|
||
|
||
```sql
|
||
-- GROUPING(col):判断该列是否为汇总产生的NULL
|
||
-- 0 = 真实值, 1 = 汇总NULL
|
||
|
||
SELECT
|
||
year,
|
||
month,
|
||
SUM(sales) AS total_sales,
|
||
GROUPING(year) AS is_year_total,
|
||
GROUPING(month) AS is_month_total
|
||
FROM sales_data
|
||
GROUP BY ROLLUP (year, month)
|
||
|
||
-- 用 GROUPING 区分真实NULL和汇总NULL
|
||
SELECT
|
||
CASE WHEN GROUPING(region) = 1 THEN '全国汇总' ELSE region END AS region,
|
||
SUM(sales) AS total_sales
|
||
FROM sales_data
|
||
GROUP BY ROLLUP (region)
|
||
```
|
||
|
||
---
|
||
|
||
## 条件聚合(CASE WHEN + 聚合)
|
||
|
||
```sql
|
||
-- 分条件统计
|
||
SELECT
|
||
date,
|
||
COUNT(*) AS total_orders,
|
||
SUM(CASE WHEN status = 'completed' THEN 1 ELSE 0 END) AS completed_count,
|
||
SUM(CASE WHEN status = 'cancelled' THEN 1 ELSE 0 END) AS cancelled_count,
|
||
SUM(CASE WHEN status = 'pending' THEN 1 ELSE 0 END) AS pending_count
|
||
FROM orders
|
||
GROUP BY date
|
||
|
||
-- 分条件求和
|
||
SELECT
|
||
department,
|
||
SUM(salary) AS total_salary,
|
||
SUM(CASE WHEN gender = 'M' THEN salary ELSE 0 END) AS male_salary,
|
||
SUM(CASE WHEN gender = 'F' THEN salary ELSE 0 END) AS female_salary
|
||
FROM employees
|
||
GROUP BY department
|
||
|
||
-- 条件平均值
|
||
SELECT
|
||
category,
|
||
AVG(CASE WHEN price > 100 THEN price ELSE NULL END) AS high_price_avg
|
||
FROM products
|
||
GROUP BY category
|
||
```
|
||
|
||
---
|
||
|
||
## 聚合 + 窗口函数
|
||
|
||
```sql
|
||
-- 分组内占比
|
||
SELECT
|
||
department,
|
||
salary,
|
||
ROUND(salary / SUM(salary) OVER (PARTITION BY department) * 100, 2) AS salary_pct
|
||
FROM employees
|
||
|
||
-- 分组累计(注意:同一 date 有多行时,默认窗口帧会把并列行一起累加;需逐行累计请加 ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)
|
||
SELECT
|
||
date,
|
||
department,
|
||
amount,
|
||
SUM(amount) OVER (PARTITION BY department ORDER BY date) AS cumulative
|
||
FROM sales
|
||
|
||
-- 分组排名
|
||
SELECT
|
||
*,
|
||
RANK() OVER (PARTITION BY department ORDER BY salary DESC) AS salary_rank
|
||
FROM employees
|
||
```
|
||
|
||
---
|
||
|
||
## 多表聚合
|
||
|
||
```sql
|
||
-- JOIN 后聚合
|
||
SELECT
|
||
u.department,
|
||
COUNT(o.id) AS order_count,
|
||
SUM(o.amount) AS total_amount
|
||
FROM users u
|
||
LEFT JOIN orders o ON u.id = o.user_id
|
||
GROUP BY u.department
|
||
|
||
-- 子查询聚合
|
||
SELECT
|
||
dept_stats.department,
|
||
dept_stats.avg_salary,
|
||
emp_count.employee_count
|
||
FROM (
|
||
SELECT department, AVG(salary) AS avg_salary
|
||
FROM employees
|
||
GROUP BY department
|
||
) dept_stats
|
||
JOIN (
|
||
SELECT department, COUNT(*) AS employee_count
|
||
FROM employees
|
||
GROUP BY department
|
||
) emp_count ON dept_stats.department = emp_count.department
|
||
```
|
||
|
||
---
|
||
|
||
## 常见问题
|
||
|
||
### 问题1:GROUP BY 字段遗漏
|
||
|
||
```sql
|
||
-- 错误:SELECT 字段不在 GROUP BY 中
|
||
SELECT
|
||
department,
|
||
name, -- 错误!name 未分组
|
||
AVG(salary)
|
||
FROM employees
|
||
GROUP BY department
|
||
|
||
-- 解决1:添加到 GROUP BY
|
||
SELECT
|
||
department,
|
||
name,
|
||
AVG(salary)
|
||
FROM employees
|
||
GROUP BY department, name
|
||
|
||
-- 解决2:使用聚合函数处理
|
||
SELECT
|
||
department,
|
||
collect_list(name) AS names, -- 收集所有name
|
||
AVG(salary)
|
||
FROM employees
|
||
GROUP BY department
|
||
```
|
||
|
||
### 问题2:NULL 影响聚合
|
||
|
||
```sql
|
||
-- COUNT(*) 包含 NULL 行
|
||
-- COUNT(col) 不包含 NULL
|
||
|
||
SELECT
|
||
COUNT(*) AS total, -- 包含 NULL 行
|
||
COUNT(amount) AS valid, -- 不包含 amount 为 NULL 的行
|
||
COUNT(DISTINCT amount) AS unique_values
|
||
FROM orders
|
||
|
||
-- SUM/AVG/MIN/MAX 自动忽略 NULL
|
||
SELECT AVG(price) FROM products -- NULL 自动排除
|
||
```
|
||
|
||
### 问题3:聚合结果精度
|
||
|
||
```sql
|
||
-- AVG 可能精度丢失
|
||
SELECT
|
||
AVG(amount) AS avg_amount, -- 可能精度不够
|
||
AVG(CAST(amount AS DECIMAL(18,6))) AS precise_avg -- 高精度
|
||
FROM orders
|
||
|
||
-- ROUND 控制精度
|
||
SELECT
|
||
ROUND(AVG(amount), 2) AS avg_amount
|
||
FROM orders
|
||
```
|
||
|
||
---
|
||
|
||
## 聚合性能优化
|
||
|
||
```sql
|
||
-- 1. 先过滤再聚合
|
||
SELECT
|
||
department,
|
||
COUNT(*) AS count
|
||
FROM employees
|
||
WHERE hire_date >= '2024-01-01' -- 先过滤
|
||
GROUP BY department
|
||
|
||
-- 2. 大数据量用近似聚合
|
||
SELECT
|
||
approx_count_distinct(user_id) AS users -- 比 COUNT(DISTINCT) 快
|
||
FROM orders
|
||
|
||
-- 3. 减少分组字段数量
|
||
SELECT
|
||
category, -- 减少分组字段
|
||
COUNT(*) AS count
|
||
FROM products
|
||
GROUP BY category -- 比 GROUP BY category, brand 快
|
||
|
||
-- 4. 避免复杂计算在 GROUP BY 前
|
||
SELECT
|
||
department,
|
||
AVG(salary * 1.1) AS adjusted_avg -- 逐行计算后再聚合(应避免);等价的推荐写法:AVG(salary) * 1.1,先聚合再计算
|
||
FROM employees
|
||
GROUP BY department
|
||
```
|
||
|
||
---
|
||
|
||
## 聚合模式选择指南
|
||
|
||
| 需求 | 推荐方式 |
|
||
|------|----------|
|
||
| 简单统计 | GROUP BY + 聚合函数 |
|
||
| 分条件统计 | CASE WHEN + SUM/COUNT |
|
||
| 去重计数 | COUNT(DISTINCT) |
|
||
| 大数据去重 | approx_count_distinct |
|
||
| 收集数组 | collect_list / collect_set |
|
||
| 层级汇总 | ROLLUP |
|
||
| 全维度汇总 | CUBE |
|
||
| 自定义组合 | GROUPING SETS |
|
||
| 分组内计算 | 窗口函数 |
|
||
| 分组结果过滤 | HAVING |