# 聚合模式速查

## 基本聚合

### 聚合函数列表

| 函数 | 说明 | 示例 |
|------|------|------|
| COUNT(*) | 计数(含NULL行) | 总行数 |
| COUNT(col) | 计数(不含NULL) | 有效数据数 |
| COUNT(DISTINCT col) | 去重计数 | 用户数 |
| SUM(col) | 求和 | 总销售额 |
| AVG(col) | 平均值 | 平均薪资 |
| MIN(col) | 最小值 | 最小年龄 |
| MAX(col) | 最大值 | 最高分 |

### 基本用法

```sql
-- 单列聚合
SELECT
    COUNT(*) AS total_rows,
    COUNT(DISTINCT user_id) AS unique_users,
    SUM(amount) AS total_amount,
    AVG(amount) AS avg_amount,
    MIN(amount) AS min_amount,
    MAX(amount) AS max_amount
FROM orders

-- 分组聚合
SELECT
    department,
    COUNT(*) AS employee_count,
    AVG(salary) AS avg_salary,
    MAX(salary) AS max_salary
FROM employees
GROUP BY department
```

---

## GROUP BY

```sql
-- 单字段分组
SELECT category, COUNT(*) AS count
FROM products
GROUP BY category

-- 多字段分组
SELECT
    category,
    brand,
    COUNT(*) AS count,
    SUM(price) AS total_price
FROM products
GROUP BY category, brand

-- 分组 + 排序
SELECT department, COUNT(*) AS count
FROM employees
GROUP BY department
ORDER BY count DESC

-- 分组 + LIMIT(取Top N组)
SELECT category, COUNT(*) AS count
FROM products
GROUP BY category
ORDER BY count DESC
LIMIT 10
```

---

## HAVING(分组过滤)

```sql
-- HAVING vs WHERE
-- WHERE:过滤原始行(GROUP BY 前)
-- HAVING:过滤分组结果(GROUP BY 后)

-- 示例:筛选订单数大于10的用户
SELECT
    user_id,
    COUNT(*) AS order_count,
    SUM(amount) AS total_amount
FROM orders
GROUP BY user_id
HAVING COUNT(*) > 10

-- 多条件 HAVING
SELECT department, AVG(salary) AS avg_salary
FROM employees
GROUP BY department
HAVING AVG(salary) > 5000 AND COUNT(*) >= 5

-- HAVING + ORDER BY
SELECT user_id, COUNT(*) AS order_count
FROM orders
GROUP BY user_id
HAVING COUNT(*) >= 5
ORDER BY order_count DESC
```

---

## 去重计数

```sql
-- COUNT(DISTINCT)
SELECT COUNT(DISTINCT user_id) AS unique_users
FROM orders

-- 分组去重计数
SELECT
    date,
    COUNT(DISTINCT user_id) AS unique_users,
    COUNT(*) AS total_orders
FROM orders
GROUP BY date

-- 多字段去重计数
SELECT COUNT(DISTINCT user_id, product_id) AS unique_user_product_pairs
FROM order_items

-- 大数据量近似去重(性能优化)
SELECT approx_count_distinct(user_id) AS approx_unique_users
FROM
    orders
```

---

## 集合聚合(数组结果)

```sql
-- collect_list:收集为数组(不去重)
SELECT user_id, collect_list(product_id) AS products
FROM orders
GROUP BY user_id

-- collect_set:收集为数组(去重)
SELECT user_id, collect_set(product_id) AS unique_products
FROM orders
GROUP BY user_id

-- 取数组大小
SELECT
    user_id,
    size(collect_list(product_id)) AS product_count,
    size(collect_set(product_id)) AS unique_product_count
FROM orders
GROUP BY user_id
```

---

## 多级聚合(ROLLUP / CUBE / GROUPING SETS)

### ROLLUP(层级汇总)

```sql
-- 从右到左递减分组级别
-- 注意:COALESCE 会把数据中真实的 NULL 也替换成汇总标签;
--       需要严格区分时请改用下文的 GROUPING 函数
SELECT
    COALESCE(year, '总计') AS year,
    COALESCE(month, '全年') AS month,
    COALESCE(region, '全国') AS region,
    SUM(sales) AS total_sales
FROM sales_data
GROUP BY ROLLUP (year, month, region)

-- 结果包含:
-- 1. year + month + region 分组
-- 2. year + month 汇总(region为NULL)
-- 3. year 汇总(month,region为NULL)
-- 4. 全表汇总(year,month,region为NULL)
```

### CUBE(全维度组合)

```sql
-- 所有分组组合
SELECT
    COALESCE(year, '总计') AS year,
    COALESCE(month, '全年') AS month,
    COALESCE(region, '全国') AS region,
    SUM(sales) AS total_sales
FROM sales_data
GROUP BY CUBE (year, month, region)

-- 结果包含所有组合:
-- year+month+region, year+month, year+region, month+region
-- year, month, region, 全表汇总
```

### GROUPING SETS(自定义组合)

```sql
-- 指定分组组合
SELECT
    year,
    month,
    region,
    SUM(sales) AS total_sales
FROM sales_data
GROUP BY GROUPING SETS (
    (year, month),
    (year, region),
    (region),
    ()
)

-- 等价于对每个分组集分别 GROUP BY 后再 UNION ALL(未参与分组的列为 NULL)
```

---

## GROUPING 函数(判断汇总级别)

```sql
-- GROUPING(col):判断该列是否为汇总产生的NULL
-- 0 = 真实值, 1 = 汇总NULL
SELECT
    year,
    month,
    SUM(sales) AS total_sales,
    GROUPING(year) AS is_year_total,
    GROUPING(month) AS is_month_total
FROM sales_data
GROUP BY ROLLUP (year, month)

-- 用 GROUPING 区分真实NULL和汇总NULL
SELECT
    CASE WHEN GROUPING(region) = 1 THEN '全国汇总' ELSE region END AS region,
    SUM(sales) AS total_sales
FROM sales_data
GROUP BY ROLLUP (region)
```

---

## 条件聚合(CASE WHEN + 聚合)

```sql
-- 分条件统计
SELECT
    date,
    COUNT(*) AS total_orders,
    SUM(CASE WHEN status = 'completed' THEN 1 ELSE 0 END) AS completed_count,
    SUM(CASE WHEN status = 'cancelled' THEN 1 ELSE 0 END)
        AS cancelled_count,
    SUM(CASE WHEN status = 'pending' THEN 1 ELSE 0 END) AS pending_count
FROM orders
GROUP BY date

-- 分条件求和
SELECT
    department,
    SUM(salary) AS total_salary,
    SUM(CASE WHEN gender = 'M' THEN salary ELSE 0 END) AS male_salary,
    SUM(CASE WHEN gender = 'F' THEN salary ELSE 0 END) AS female_salary
FROM employees
GROUP BY department

-- 条件平均值
SELECT
    category,
    AVG(CASE WHEN price > 100 THEN price ELSE NULL END) AS high_price_avg
FROM products
GROUP BY category
```

---

## 聚合 + 窗口函数

```sql
-- 分组内占比
SELECT
    department,
    salary,
    ROUND(salary / SUM(salary) OVER (PARTITION BY department) * 100, 2) AS salary_pct
FROM employees

-- 分组累计
SELECT
    date,
    department,
    amount,
    SUM(amount) OVER (PARTITION BY department ORDER BY date) AS cumulative
FROM sales

-- 分组排名
SELECT
    *,
    RANK() OVER (PARTITION BY department ORDER BY salary DESC) AS salary_rank
FROM employees
```

---

## 多表聚合

```sql
-- JOIN 后聚合
SELECT
    u.department,
    COUNT(o.id) AS order_count,
    SUM(o.amount) AS total_amount
FROM users u
LEFT JOIN orders o ON u.id = o.user_id
GROUP BY u.department

-- 子查询聚合
SELECT
    dept_stats.department,
    dept_stats.avg_salary,
    emp_count.employee_count
FROM (
    SELECT department, AVG(salary) AS avg_salary
    FROM employees
    GROUP BY department
) dept_stats
JOIN (
    SELECT department, COUNT(*) AS employee_count
    FROM employees
    GROUP BY department
) emp_count ON dept_stats.department = emp_count.department
```

---

## 常见问题

### 问题1:GROUP BY 字段遗漏

```sql
-- 错误:SELECT 字段不在 GROUP BY 中
SELECT
    department,
    name,           -- 错误!name 未分组
    AVG(salary)
FROM employees
GROUP BY department

-- 解决1:添加到 GROUP BY
SELECT
    department,
    name,
    AVG(salary)
FROM employees
GROUP BY department, name

-- 解决2:使用聚合函数处理
SELECT
    department,
    collect_list(name) AS names,  -- 收集所有name
    AVG(salary)
FROM employees
GROUP BY department
```

### 问题2:NULL 影响聚合

```sql
-- COUNT(*) 包含 NULL 行
-- COUNT(col) 不包含 NULL
SELECT
    COUNT(*) AS total,                       -- 包含 NULL 行
    COUNT(amount) AS valid,                  -- 不包含 amount 为 NULL 的行
    COUNT(DISTINCT amount) AS unique_values
FROM orders

-- SUM/AVG/MIN/MAX
-- 自动忽略 NULL
SELECT AVG(price) FROM products  -- NULL 自动排除
```

### 问题3:聚合结果精度

```sql
-- AVG 可能精度丢失
SELECT
    AVG(amount) AS avg_amount,                          -- 可能精度不够
    AVG(CAST(amount AS DECIMAL(18,6))) AS precise_avg   -- 高精度
FROM orders

-- ROUND 控制精度
SELECT ROUND(AVG(amount), 2) AS avg_amount
FROM orders
```

---

## 聚合性能优化

```sql
-- 1. 先过滤再聚合
SELECT department, COUNT(*) AS count
FROM employees
WHERE hire_date >= '2024-01-01'  -- 先过滤
GROUP BY department

-- 2. 大数据量用近似聚合
SELECT approx_count_distinct(user_id) AS users  -- 比 COUNT(DISTINCT) 快
FROM orders

-- 3. 减少分组字段数量
SELECT
    category,  -- 减少分组字段
    COUNT(*) AS count
FROM products
GROUP BY category  -- 比 GROUP BY category, brand 快

-- 4. 避免在聚合前做逐行的复杂计算
SELECT
    department,
    AVG(salary) * 1.1 AS adjusted_avg  -- 先聚合再计算(而不是 AVG(salary * 1.1))
FROM employees
GROUP BY department
```

---

## 聚合模式选择指南

| 需求 | 推荐方式 |
|------|----------|
| 简单统计 | GROUP BY + 聚合函数 |
| 分条件统计 | CASE WHEN + SUM/COUNT |
| 去重计数 | COUNT(DISTINCT) |
| 大数据去重 | approx_count_distinct |
| 收集数组 | collect_list / collect_set |
| 层级汇总 | ROLLUP |
| 全维度汇总 | CUBE |
| 自定义组合 | GROUPING SETS |
| 分组内计算 | 窗口函数 |
| 分组后过滤(按聚合结果) | HAVING |