Files
smart-data-dev-skill/one-skill/smart-data-developer/references/sql/reference/aggregation-patterns.md
2026-05-13 11:03:00 +08:00

8.8 KiB
Raw Blame History

聚合模式速查

基本聚合

聚合函数列表

函数 说明 示例
COUNT(*) 计数（含NULL行） 总行数
COUNT(col) 计数（不含NULL） 有效数据数
COUNT(DISTINCT col) 去重计数 用户数
SUM(col) 求和 总销售额
AVG(col) 平均值 平均薪资
MIN(col) 最小值 最小年龄
MAX(col) 最大值 最高分

基本用法

-- Whole-table aggregation: there is no GROUP BY, so one summary row is returned
SELECT
    COUNT(*)                  AS total_rows,
    COUNT(DISTINCT o.user_id) AS unique_users,
    SUM(o.amount)             AS total_amount,
    AVG(o.amount)             AS avg_amount,
    MIN(o.amount)             AS min_amount,
    MAX(o.amount)             AS max_amount
FROM orders AS o

-- Grouped aggregation: one output row per distinct department
SELECT
    e.department,
    COUNT(*)      AS employee_count,
    AVG(e.salary) AS avg_salary,
    MAX(e.salary) AS max_salary
FROM employees AS e
GROUP BY e.department

GROUP BY

-- Group by a single column
SELECT
    p.category,
    COUNT(*) AS count
FROM products AS p
GROUP BY p.category

-- Group by several columns: one row per (category, brand) pair
SELECT
    p.category,
    p.brand,
    COUNT(*)     AS count,
    SUM(p.price) AS total_price
FROM products AS p
GROUP BY
    p.category,
    p.brand

-- Grouping plus ordering: departments with the most rows come first
SELECT
    e.department,
    COUNT(*) AS count
FROM employees AS e
GROUP BY e.department
ORDER BY COUNT(*) DESC

-- Grouping plus LIMIT: keep only the top N groups (here the 10 largest categories)
SELECT
    p.category,
    COUNT(*) AS count
FROM products AS p
GROUP BY p.category
ORDER BY COUNT(*) DESC
LIMIT 10

HAVING（分组过滤）

-- HAVING vs WHERE
--   WHERE  filters the raw rows (before GROUP BY)
--   HAVING filters the grouped result (after GROUP BY)

-- Example: keep only users with more than 10 orders
SELECT
    o.user_id,
    COUNT(*)      AS order_count,
    SUM(o.amount) AS total_amount
FROM orders AS o
GROUP BY o.user_id
HAVING COUNT(*) > 10

-- HAVING with several conditions: both must hold for a group to be kept
SELECT
    e.department,
    AVG(e.salary) AS avg_salary
FROM employees AS e
GROUP BY e.department
HAVING COUNT(*) >= 5
   AND AVG(e.salary) > 5000

-- HAVING combined with ORDER BY: filter the groups, then sort what is left
SELECT
    o.user_id,
    COUNT(*) AS order_count
FROM orders AS o
GROUP BY o.user_id
HAVING COUNT(*) >= 5
ORDER BY COUNT(*) DESC

去重计数

-- COUNT(DISTINCT ...): number of different user_id values in the table
SELECT
    COUNT(DISTINCT o.user_id) AS unique_users
FROM orders AS o

-- Distinct count per group: unique users and total orders for each date
SELECT
    o.date,
    COUNT(DISTINCT o.user_id) AS unique_users,
    COUNT(*)                  AS total_orders
FROM orders AS o
GROUP BY o.date

-- Distinct count over several columns: number of different (user_id, product_id) pairs
SELECT
    COUNT(DISTINCT oi.user_id, oi.product_id) AS unique_user_product_pairs
FROM order_items AS oi

-- Approximate distinct count for large data volumes (performance optimisation)
SELECT
    approx_count_distinct(o.user_id) AS approx_unique_users
FROM orders AS o

集合聚合(数组结果)

-- collect_list: gather the group's values into an array, duplicates kept
SELECT
    o.user_id,
    collect_list(o.product_id) AS products
FROM orders AS o
GROUP BY o.user_id

-- collect_set: gather the group's values into an array, duplicates removed
SELECT
    o.user_id,
    collect_set(o.product_id) AS unique_products
FROM orders AS o
GROUP BY o.user_id

-- Array size: how many values were collected, with and without duplicates
SELECT
    o.user_id,
    size(collect_list(o.product_id)) AS product_count,
    size(collect_set(o.product_id))  AS unique_product_count
FROM orders AS o
GROUP BY o.user_id

多级聚合（ROLLUP / CUBE / GROUPING SETS）

ROLLUP（层级汇总）

-- ROLLUP removes grouping columns from right to left, adding a subtotal per level.
-- GROUPING(col) = 1 marks a NULL produced by the rollup, so the label is applied
-- only to subtotal rows; a NULL that is real data stays NULL instead of being
-- mislabelled as a subtotal (which COALESCE would do).
-- CAST keeps both CASE branches the same type when the source column is numeric.
SELECT 
    CASE WHEN GROUPING(year) = 1 THEN '总计' ELSE CAST(year AS STRING) END AS year,
    CASE WHEN GROUPING(month) = 1 THEN '全年' ELSE CAST(month AS STRING) END AS month,
    CASE WHEN GROUPING(region) = 1 THEN '全国' ELSE CAST(region AS STRING) END AS region,
    SUM(sales) AS total_sales
FROM sales_data
GROUP BY ROLLUP (year, month, region)

-- The result contains:
-- 1. year + month + region groups
-- 2. year + month subtotals (region rolled up)
-- 3. year subtotals (month and region rolled up)
-- 4. the grand total (year, month and region all rolled up)

CUBE（全维度组合）

-- CUBE produces every combination of the grouping columns.
-- GROUPING(col) = 1 marks a NULL produced by the cube, so the label is applied
-- only to subtotal rows; a NULL that is real data stays NULL instead of being
-- mislabelled as a subtotal (which COALESCE would do).
-- CAST keeps both CASE branches the same type when the source column is numeric.
SELECT 
    CASE WHEN GROUPING(year) = 1 THEN '总计' ELSE CAST(year AS STRING) END AS year,
    CASE WHEN GROUPING(month) = 1 THEN '全月' ELSE CAST(month AS STRING) END AS month,
    CASE WHEN GROUPING(region) = 1 THEN '全国' ELSE CAST(region AS STRING) END AS region,
    SUM(sales) AS total_sales
FROM sales_data
GROUP BY CUBE (year, month, region)

-- The result contains every combination:
-- year+month+region, year+month, year+region, month+region
-- year, month, region, and the grand total

GROUPING SETS（自定义组合）

-- List exactly which grouping combinations to compute
SELECT
    s.year,
    s.month,
    s.region,
    SUM(s.sales) AS total_sales
FROM sales_data AS s
GROUP BY GROUPING SETS (
    (s.year, s.month),
    (s.year, s.region),
    (s.region),
    ()
)

-- Equivalent to running one GROUP BY per listed set and combining the results

GROUPING 函数(判断汇总级别)

-- GROUPING(col) tells whether the NULL in that column was produced by the rollup:
--   0 = real value, 1 = NULL introduced by a subtotal row

SELECT
    s.year,
    s.month,
    SUM(s.sales)      AS total_sales,
    GROUPING(s.year)  AS is_year_total,
    GROUPING(s.month) AS is_month_total
FROM sales_data AS s
GROUP BY ROLLUP (s.year, s.month)

-- Use GROUPING to tell a real NULL apart from a subtotal NULL
SELECT
    CASE
        WHEN GROUPING(s.region) = 1 THEN '全国汇总'
        ELSE s.region
    END AS region,
    SUM(s.sales) AS total_sales
FROM sales_data AS s
GROUP BY ROLLUP (s.region)

条件聚合（CASE WHEN + 聚合）

-- Counts per condition: each CASE yields 1 for matching rows and 0 otherwise
SELECT
    o.date,
    COUNT(*) AS total_orders,
    SUM(CASE WHEN o.status = 'completed' THEN 1 ELSE 0 END) AS completed_count,
    SUM(CASE WHEN o.status = 'cancelled' THEN 1 ELSE 0 END) AS cancelled_count,
    SUM(CASE WHEN o.status = 'pending'   THEN 1 ELSE 0 END) AS pending_count
FROM orders AS o
GROUP BY o.date

-- Sums per condition: salary is added only for rows that match
SELECT
    e.department,
    SUM(e.salary) AS total_salary,
    SUM(CASE WHEN e.gender = 'M' THEN e.salary ELSE 0 END) AS male_salary,
    SUM(CASE WHEN e.gender = 'F' THEN e.salary ELSE 0 END) AS female_salary
FROM employees AS e
GROUP BY e.department

-- Conditional average: non-matching rows become NULL (no ELSE branch),
-- so AVG leaves them out
SELECT
    p.category,
    AVG(CASE WHEN p.price > 100 THEN p.price END) AS high_price_avg
FROM products AS p
GROUP BY p.category

聚合 + 窗口函数

-- Each employee's salary as a percentage of the department total.
-- NULLIF turns a zero department total into NULL, so the percentage is NULL
-- instead of a division-by-zero error on engines/modes that raise one.
SELECT 
    department,
    salary,
    ROUND(salary / NULLIF(SUM(salary) OVER (PARTITION BY department), 0) * 100, 2) AS salary_pct
FROM employees

-- Running total per department, ordered by date.
-- The explicit ROWS frame makes each row add only itself to the total; with the
-- default frame (RANGE ... CURRENT ROW) all rows sharing the same date would
-- show the same, already-summed value.
-- NOTE(review): rows with the same date are accumulated in an arbitrary order;
-- add a unique tiebreaker column to ORDER BY if a stable order is required.
SELECT 
    date,
    department,
    amount,
    SUM(amount) OVER (
        PARTITION BY department
        ORDER BY date
        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) AS cumulative
FROM sales

-- Rank employees by salary inside each department (highest salary gets rank 1)
SELECT
    e.*,
    RANK() OVER (
        PARTITION BY e.department
        ORDER BY e.salary DESC
    ) AS salary_rank
FROM employees AS e

多表聚合

-- Aggregate after a join: order count and order amount per user department.
-- LEFT JOIN keeps departments whose users have no orders. COUNT(o.id) is 0 for
-- them, and COALESCE makes the amount 0 as well (SUM over only-NULL input is NULL).
SELECT 
    u.department,
    COUNT(o.id) AS order_count,
    COALESCE(SUM(o.amount), 0) AS total_amount
FROM users u
LEFT JOIN orders o ON u.id = o.user_id
GROUP BY u.department

-- Aggregate in separate named subqueries (CTEs), then join the per-department results
WITH dept_stats AS (
    SELECT
        department,
        AVG(salary) AS avg_salary
    FROM employees
    GROUP BY department
),
emp_count AS (
    SELECT
        department,
        COUNT(*) AS employee_count
    FROM employees
    GROUP BY department
)
SELECT
    dept_stats.department,
    dept_stats.avg_salary,
    emp_count.employee_count
FROM dept_stats
INNER JOIN emp_count
    ON dept_stats.department = emp_count.department

常见问题

问题1：GROUP BY 字段遗漏

-- Wrong: a selected column is not in GROUP BY
-- (intentionally invalid counter-example; the fixes follow below)
SELECT 
    department,
    name,         -- error: name is neither grouped nor aggregated
    AVG(salary)
FROM employees
GROUP BY department

-- Fix 1: add the column to GROUP BY (one row per department + name)
SELECT
    department,
    name,
    AVG(salary)
FROM employees
GROUP BY
    department,
    name

-- Fix 2: wrap the column in an aggregate function
SELECT
    department,
    collect_list(name) AS names,    -- gathers every name in the department
    AVG(salary)
FROM employees
GROUP BY
    department

问题2：NULL 影响聚合

-- COUNT(*) counts every row, including rows that contain NULLs
-- COUNT(col) skips rows where col is NULL

SELECT
    COUNT(*)                 AS total,    -- all rows, NULLs included
    COUNT(o.amount)          AS valid,    -- rows whose amount is not NULL
    COUNT(DISTINCT o.amount) AS unique_values
FROM orders AS o

-- SUM / AVG / MIN / MAX ignore NULL inputs automatically
SELECT
    AVG(price)    -- NULL prices are left out of the average
FROM products

问题3：聚合结果精度

-- AVG may lose precision; casting the input to a wider DECIMAL first gives a
-- higher-precision average. (The two select items must be separated by a comma.)
SELECT 
    AVG(amount) AS avg_amount,                          -- precision may be insufficient
    AVG(CAST(amount AS DECIMAL(18,6))) AS precise_avg   -- higher precision
FROM orders

-- Use ROUND to fix the number of decimal places in the result (here 2)
SELECT
    ROUND(AVG(o.amount), 2) AS avg_amount
FROM orders AS o

聚合性能优化

-- 1. Filter first, then aggregate
SELECT
    e.department,
    COUNT(*) AS count
FROM employees AS e
WHERE e.hire_date >= '2024-01-01'    -- rows are filtered before grouping
GROUP BY e.department

-- 2. Use approximate aggregation on large data volumes
SELECT
    approx_count_distinct(o.user_id) AS users    -- faster than COUNT(DISTINCT)
FROM orders AS o

-- 3. Group by as few columns as possible
SELECT
    p.category,            -- fewer grouping columns
    COUNT(*) AS count
FROM products AS p
GROUP BY p.category        -- faster than GROUP BY category, brand

-- 4. Avoid per-row computation ahead of the aggregation where it can be moved out
SELECT 
    department,
    AVG(salary) * 1.1 AS adjusted_avg    -- aggregate first, then scale once per group (mathematically equal to AVG(salary * 1.1))
FROM employees
GROUP BY department

聚合模式选择指南

需求 推荐方式
简单统计 GROUP BY + 聚合函数
分条件统计 CASE WHEN + SUM/COUNT
去重计数 COUNT(DISTINCT)
大数据去重 approx_count_distinct
收集数组 collect_list / collect_set
层级汇总 ROLLUP
全维度汇总 CUBE
自定义组合 GROUPING SETS
分组内计算 窗口函数
多条件过滤 HAVING