8.8 KiB
8.8 KiB
聚合模式速查
基本聚合
聚合函数列表
| 函数 | 说明 | 示例 |
|---|---|---|
| COUNT(*) | 计数(含NULL行) | 总行数 |
| COUNT(col) | 计数(不含NULL) | 有效数据数 |
| COUNT(DISTINCT col) | 去重计数 | 用户数 |
| SUM(col) | 求和 | 总销售额 |
| AVG(col) | 平均值 | 平均薪资 |
| MIN(col) | 最小值 | 最小年龄 |
| MAX(col) | 最大值 | 最高分 |
基本用法
-- 单列聚合
SELECT
COUNT(*) AS total_rows,
COUNT(DISTINCT user_id) AS unique_users,
SUM(amount) AS total_amount,
AVG(amount) AS avg_amount,
MIN(amount) AS min_amount,
MAX(amount) AS max_amount
FROM orders
-- 分组聚合
SELECT
department,
COUNT(*) AS employee_count,
AVG(salary) AS avg_salary,
MAX(salary) AS max_salary
FROM employees
GROUP BY department
GROUP BY
-- 单字段分组
SELECT
category,
COUNT(*) AS count
FROM products
GROUP BY category
-- 多字段分组
SELECT
category,
brand,
COUNT(*) AS count,
SUM(price) AS total_price
FROM products
GROUP BY category, brand
-- 分组 + 排序
SELECT
department,
COUNT(*) AS count
FROM employees
GROUP BY department
ORDER BY count DESC
-- 分组 + LIMIT(取Top N组)
SELECT
category,
COUNT(*) AS count
FROM products
GROUP BY category
ORDER BY count DESC
LIMIT 10
HAVING(分组过滤)
-- HAVING vs WHERE
-- WHERE:过滤原始行(GROUP BY 前)
-- HAVING:过滤分组结果(GROUP BY 后)
-- 示例:筛选订单数大于10的用户
SELECT
user_id,
COUNT(*) AS order_count,
SUM(amount) AS total_amount
FROM orders
GROUP BY user_id
HAVING COUNT(*) > 10
-- 多条件 HAVING
SELECT
department,
AVG(salary) AS avg_salary
FROM employees
GROUP BY department
HAVING AVG(salary) > 5000
AND COUNT(*) >= 5
-- HAVING + ORDER BY
SELECT
user_id,
COUNT(*) AS order_count
FROM orders
GROUP BY user_id
HAVING COUNT(*) >= 5
ORDER BY order_count DESC
去重计数
-- COUNT(DISTINCT)
SELECT
COUNT(DISTINCT user_id) AS unique_users
FROM orders
-- 分组去重计数
SELECT
date,
COUNT(DISTINCT user_id) AS unique_users,
COUNT(*) AS total_orders
FROM orders
GROUP BY date
-- 多字段去重计数
SELECT
COUNT(DISTINCT user_id, product_id) AS unique_user_product_pairs
FROM order_items
-- 大数据量近似去重(性能优化)
SELECT
approx_count_distinct(user_id) AS approx_unique_users
FROM orders
集合聚合(数组结果)
-- collect_list:收集为数组(不去重)
SELECT
user_id,
collect_list(product_id) AS products
FROM orders
GROUP BY user_id
-- collect_set:收集为数组(去重)
SELECT
user_id,
collect_set(product_id) AS unique_products
FROM orders
GROUP BY user_id
-- 取数组大小
SELECT
user_id,
size(collect_list(product_id)) AS product_count,
size(collect_set(product_id)) AS unique_product_count
FROM orders
GROUP BY user_id
多级聚合(ROLLUP / CUBE / GROUPING SETS)
ROLLUP(层级汇总)
-- 从右到左递减分组级别
SELECT
COALESCE(year, '总计') AS year,
COALESCE(month, '全年') AS month,
COALESCE(region, '全国') AS region,
SUM(sales) AS total_sales
FROM sales_data
GROUP BY ROLLUP (year, month, region)
-- 结果包含:
-- 1. year + month + region 分组
-- 2. year + month 汇总(region为NULL)
-- 3. year 汇总(month,region为NULL)
-- 4. 全表汇总(year,month,region为NULL)
CUBE(全维度组合)
-- 所有分组组合
SELECT
COALESCE(year, '总计') AS year,
COALESCE(month, '全月') AS month,
COALESCE(region, '全国') AS region,
SUM(sales) AS total_sales
FROM sales_data
GROUP BY CUBE (year, month, region)
-- 结果包含所有组合:
-- year+month+region, year+month, year+region, month+region
-- year, month, region, 全表汇总
GROUPING SETS(自定义组合)
-- 指定分组组合
SELECT
year,
month,
region,
SUM(sales) AS total_sales
FROM sales_data
GROUP BY GROUPING SETS (
(year, month),
(year, region),
(region),
()
)
-- 等价于多个 GROUP BY 合并
GROUPING 函数(判断汇总级别)
-- GROUPING(col):判断该列是否为汇总产生的NULL
-- 0 = 真实值, 1 = 汇总NULL
SELECT
year,
month,
SUM(sales) AS total_sales,
GROUPING(year) AS is_year_total,
GROUPING(month) AS is_month_total
FROM sales_data
GROUP BY ROLLUP (year, month)
-- 用 GROUPING 区分真实NULL和汇总NULL
SELECT
CASE WHEN GROUPING(region) = 1 THEN '全国汇总' ELSE region END AS region,
SUM(sales) AS total_sales
FROM sales_data
GROUP BY ROLLUP (region)
条件聚合(CASE WHEN + 聚合)
-- 分条件统计
SELECT
date,
COUNT(*) AS total_orders,
SUM(CASE WHEN status = 'completed' THEN 1 ELSE 0 END) AS completed_count,
SUM(CASE WHEN status = 'cancelled' THEN 1 ELSE 0 END) AS cancelled_count,
SUM(CASE WHEN status = 'pending' THEN 1 ELSE 0 END) AS pending_count
FROM orders
GROUP BY date
-- 分条件求和
SELECT
department,
SUM(salary) AS total_salary,
SUM(CASE WHEN gender = 'M' THEN salary ELSE 0 END) AS male_salary,
SUM(CASE WHEN gender = 'F' THEN salary ELSE 0 END) AS female_salary
FROM employees
GROUP BY department
-- 条件平均值
SELECT
category,
AVG(CASE WHEN price > 100 THEN price ELSE NULL END) AS high_price_avg
FROM products
GROUP BY category
聚合 + 窗口函数
-- 分组内占比
SELECT
department,
salary,
ROUND(salary / SUM(salary) OVER (PARTITION BY department) * 100, 2) AS salary_pct
FROM employees
-- Running total within each group
-- NOTE: with ORDER BY and no explicit frame, the default window frame is
-- RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, so rows that share the
-- same date within a department all receive the same cumulative value.
-- Add ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW (plus a unique
-- tie-breaker in ORDER BY) if a strictly row-by-row running total is needed.
SELECT
date,
department,
amount,
SUM(amount) OVER (PARTITION BY department ORDER BY date) AS cumulative
FROM sales
-- 分组排名
SELECT
*,
RANK() OVER (PARTITION BY department ORDER BY salary DESC) AS salary_rank
FROM employees
多表聚合
-- Aggregate after a JOIN
-- LEFT JOIN keeps departments whose users have no orders. For those groups
-- COUNT(o.id) is 0, but SUM over only NULLs is NULL, so COALESCE reports 0
-- to keep the two result columns consistent.
SELECT
    u.department,
    COUNT(o.id) AS order_count,
    COALESCE(SUM(o.amount), 0) AS total_amount
FROM users u
LEFT JOIN orders o ON u.id = o.user_id
GROUP BY u.department
-- 子查询聚合
SELECT
dept_stats.department,
dept_stats.avg_salary,
emp_count.employee_count
FROM (
SELECT department, AVG(salary) AS avg_salary
FROM employees
GROUP BY department
) dept_stats
JOIN (
SELECT department, COUNT(*) AS employee_count
FROM employees
GROUP BY department
) emp_count ON dept_stats.department = emp_count.department
常见问题
问题1:GROUP BY 字段遗漏
-- 错误:SELECT 字段不在 GROUP BY 中
SELECT
department,
name, -- 错误!name 未分组
AVG(salary)
FROM employees
GROUP BY department
-- 解决1:添加到 GROUP BY
SELECT
department,
name,
AVG(salary)
FROM employees
GROUP BY department, name
-- 解决2:使用聚合函数处理
SELECT
department,
collect_list(name) AS names, -- 收集所有name
AVG(salary)
FROM employees
GROUP BY department
问题2:NULL 影响聚合
-- COUNT(*) 包含 NULL 行
-- COUNT(col) 不包含 NULL
SELECT
COUNT(*) AS total, -- 包含 NULL 行
COUNT(amount) AS valid, -- 不包含 amount 为 NULL 的行
COUNT(DISTINCT amount) AS unique_values
FROM orders
-- SUM/AVG/MIN/MAX 自动忽略 NULL
SELECT AVG(price) FROM products -- NULL 自动排除
问题3:聚合结果精度
-- AVG may lose precision
SELECT
    AVG(amount) AS avg_amount,  -- precision may be insufficient
    AVG(CAST(amount AS DECIMAL(18,6))) AS precise_avg  -- higher precision
FROM orders
-- ROUND 控制精度
SELECT
ROUND(AVG(amount), 2) AS avg_amount
FROM orders
聚合性能优化
-- 1. 先过滤再聚合
SELECT
department,
COUNT(*) AS count
FROM employees
WHERE hire_date >= '2024-01-01' -- 先过滤
GROUP BY department
-- 2. 大数据量用近似聚合
SELECT
approx_count_distinct(user_id) AS users -- 比 COUNT(DISTINCT) 快
FROM orders
-- 3. 减少分组字段数量
SELECT
category, -- 减少分组字段
COUNT(*) AS count
FROM products
GROUP BY category -- 比 GROUP BY category, brand 快
-- 4. Avoid per-row computation before aggregation
-- AVG(salary * 1.1) multiplies every input row; AVG(salary) * 1.1 is
-- mathematically equivalent and applies the factor once per group.
SELECT
    department,
    AVG(salary) * 1.1 AS adjusted_avg  -- aggregate first, then compute
FROM employees
GROUP BY department
聚合模式选择指南
| 需求 | 推荐方式 |
|---|---|
| 简单统计 | GROUP BY + 聚合函数 |
| 分条件统计 | CASE WHEN + SUM/COUNT |
| 去重计数 | COUNT(DISTINCT) |
| 大数据去重 | approx_count_distinct |
| 收集数组 | collect_list / collect_set |
| 层级汇总 | ROLLUP |
| 全维度汇总 | CUBE |
| 自定义组合 | GROUPING SETS |
| 分组内计算 | 窗口函数 |
| 分组后过滤(对聚合结果设条件) | HAVING |