369 lines
7.6 KiB
Markdown
369 lines
7.6 KiB
Markdown
# JOIN 模式速查
|
||
|
||
## JOIN 类型
|
||
|
||
| 类型 | 说明 | 结果特点 |
|------|------|----------|
| INNER JOIN | 内连接 | 只返回匹配的行 |
| LEFT JOIN | 左外连接 | 左表全部,右表匹配(无匹配为NULL) |
| RIGHT JOIN | 右外连接 | 右表全部,左表匹配(无匹配为NULL) |
| FULL OUTER JOIN | 全外连接 | 两表全部,无匹配处为NULL |
| CROSS JOIN | 交叉连接 | 笛卡尔积(每行与每行组合) |
| LEFT SEMI JOIN | 左半连接 | 左表中在右表有匹配的行(不返回右表列) |
| LEFT ANTI JOIN | 左反连接 | 左表中在右表无匹配的行 |
|
||
|
||
---
|
||
|
||
## INNER JOIN
|
||
|
||
```sql
|
||
-- 基本语法
|
||
SELECT a.*, b.*
|
||
FROM table_a a
|
||
INNER JOIN table_b b ON a.id = b.id
|
||
|
||
-- 等价写法(逗号连接,不推荐:容易漏写关联条件而变成笛卡尔积)
|
||
SELECT a.*, b.*
|
||
FROM table_a a, table_b b
|
||
WHERE a.id = b.id
|
||
|
||
-- 多字段关联
|
||
SELECT a.*, b.*
|
||
FROM table_a a
|
||
INNER JOIN table_b b
|
||
ON a.user_id = b.user_id
|
||
AND a.order_date = b.order_date
|
||
```
|
||
|
||
**使用场景**:只需要两表都有匹配的数据时使用。
|
||
|
||
---
|
||
|
||
## LEFT JOIN
|
||
|
||
```sql
|
||
-- 基本语法
|
||
SELECT a.*, b.*
|
||
FROM table_a a
|
||
LEFT JOIN table_b b ON a.id = b.id
|
||
|
||
-- 处理右表NULL值
|
||
SELECT
|
||
a.id,
|
||
a.name,
|
||
COALESCE(b.amount, 0) AS amount, -- NULL转0
|
||
IF(b.id IS NULL, '无匹配', '有匹配') AS match_status
|
||
FROM table_a a
|
||
LEFT JOIN table_b b ON a.id = b.id
|
||
|
||
-- 找出左表中无匹配的行(差集)
|
||
SELECT a.*
|
||
FROM table_a a
|
||
LEFT JOIN table_b b ON a.id = b.id
|
||
WHERE b.id IS NULL
|
||
```
|
||
|
||
**使用场景**:保留左表所有数据,右表补充信息时使用。
|
||
|
||
---
|
||
|
||
## RIGHT JOIN
|
||
|
||
```sql
|
||
-- 基本语法
|
||
SELECT a.*, b.*
|
||
FROM table_a a
|
||
RIGHT JOIN table_b b ON a.id = b.id
|
||
|
||
-- 等价于 LEFT JOIN 反过来
|
||
SELECT a.*, b.*
|
||
FROM table_b b
|
||
LEFT JOIN table_a a ON b.id = a.id
|
||
```
|
||
|
||
**使用场景**:保留右表所有数据时使用(可改用 LEFT JOIN 反转)。
|
||
|
||
---
|
||
|
||
## FULL OUTER JOIN
|
||
|
||
```sql
|
||
-- 基本语法
|
||
SELECT
|
||
COALESCE(a.id, b.id) AS id, -- 统一ID
|
||
a.name,
|
||
b.amount
|
||
FROM table_a a
|
||
FULL OUTER JOIN table_b b ON a.id = b.id
|
||
|
||
-- 找出两表差异
|
||
SELECT
|
||
a.id AS a_id,
|
||
b.id AS b_id,
|
||
CASE
|
||
WHEN a.id IS NULL THEN '仅在B表'
|
||
WHEN b.id IS NULL THEN '仅在A表'
|
||
ELSE '两表都有'
|
||
END AS status
|
||
FROM table_a a
|
||
FULL OUTER JOIN table_b b ON a.id = b.id
|
||
WHERE a.id IS NULL OR b.id IS NULL
|
||
```
|
||
|
||
**使用场景**:需要两表完整数据,分析差异时使用。
|
||
|
||
---
|
||
|
||
## CROSS JOIN
|
||
|
||
```sql
|
||
-- 基本语法(笛卡尔积)
|
||
SELECT a.name, b.color
|
||
FROM products a
|
||
CROSS JOIN colors b
|
||
|
||
-- 结果:products每行与colors每行组合
|
||
-- products有10行,colors有5行 → 结果50行
|
||
|
||
-- 等价写法
|
||
SELECT a.name, b.color
|
||
FROM products a, colors b -- 无WHERE条件
|
||
```
|
||
|
||
**使用场景**:生成所有组合、测试数据生成时使用。
|
||
|
||
**注意**:数据量大时慎用,可能产生巨量结果。
|
||
|
||
---
|
||
|
||
## LEFT SEMI JOIN(非 ANSI 标准,Hive / Spark SQL 等支持)
|
||
|
||
```sql
|
||
-- 基本语法
|
||
SELECT a.*
|
||
FROM table_a a
|
||
LEFT SEMI JOIN table_b b ON a.id = b.id
|
||
|
||
-- 效果:返回A表中在B表有匹配的行,不返回B表的列
|
||
-- 等价于 IN 子查询
|
||
SELECT a.*
|
||
FROM table_a a
|
||
WHERE a.id IN (SELECT id FROM table_b b)
|
||
```
|
||
|
||
**使用场景**:只需要判断左表是否在右表存在,不需要右表数据。
|
||
|
||
---
|
||
|
||
## LEFT ANTI JOIN(非 ANSI 标准,Spark SQL 等支持)
|
||
|
||
```sql
|
||
-- 基本语法
|
||
SELECT a.*
|
||
FROM table_a a
|
||
LEFT ANTI JOIN table_b b ON a.id = b.id
|
||
|
||
-- 效果:返回A表中在B表无匹配的行
|
||
-- 近似等价于 NOT IN 子查询
-- 注意:table_b.id 含 NULL 时 NOT IN 不返回任何行,a.id 为 NULL 的行也会被丢弃,与 LEFT ANTI JOIN 结果不同
|
||
SELECT a.*
|
||
FROM table_a a
|
||
WHERE a.id NOT IN (SELECT id FROM table_b b)
|
||
|
||
-- 或 NOT EXISTS(对 NULL 的处理与 LEFT ANTI JOIN 一致,推荐)
|
||
SELECT a.*
|
||
FROM table_a a
|
||
WHERE NOT EXISTS (SELECT 1 FROM table_b b WHERE b.id = a.id)
|
||
```
|
||
|
||
**使用场景**:找出差集(左表中不存在于右表的数据)。
|
||
|
||
---
|
||
|
||
## 多表 JOIN
|
||
|
||
```sql
|
||
-- 三表关联
|
||
SELECT
|
||
o.order_id,
|
||
u.user_name,
|
||
p.product_name,
|
||
oi.quantity
|
||
FROM orders o
|
||
JOIN users u ON o.user_id = u.id
|
||
JOIN order_items oi ON o.order_id = oi.order_id
|
||
JOIN products p ON oi.product_id = p.id
|
||
|
||
-- 四表及以上
|
||
SELECT
|
||
a.col1,
|
||
b.col2,
|
||
c.col3,
|
||
d.col4
|
||
FROM table_a a
|
||
JOIN table_b b ON a.id = b.a_id
|
||
JOIN table_c c ON b.id = c.b_id
|
||
JOIN table_d d ON c.id = d.c_id
|
||
```
|
||
|
||
**建议**:多表 JOIN 时,从最小表开始,逐步关联。
|
||
|
||
---
|
||
|
||
## 自连接(Self Join)
|
||
|
||
```sql
|
||
-- 员工与经理关联
|
||
SELECT
|
||
e.name AS employee,
|
||
m.name AS manager
|
||
FROM employees e
|
||
LEFT JOIN employees m ON e.manager_id = m.id
|
||
|
||
-- 查找重复数据
|
||
SELECT
|
||
a.id,
|
||
a.name,
|
||
b.id AS duplicate_id
|
||
FROM table_a a
|
||
JOIN table_a b ON a.name = b.name AND a.id < b.id
|
||
|
||
-- 相邻日期比较(当天 vs 前一天;若要按行序比较前后行,可改用 LAG/LEAD 窗口函数)
|
||
SELECT
|
||
a.date,
|
||
a.amount,
|
||
b.amount AS prev_amount
|
||
FROM sales a
|
||
LEFT JOIN sales b ON a.date = DATE_ADD(b.date, 1)
|
||
```
|
||
|
||
---
|
||
|
||
## JOIN 条件下推优化
|
||
|
||
**原则**:过滤条件前置,减少 JOIN 数据量。
|
||
|
||
```sql
|
||
-- 推荐:过滤前置(先在子查询中过滤,再 JOIN)
SELECT a.*, b.*
FROM (
    SELECT * FROM table_a WHERE date = '${day_id}'   -- 先过滤A表
) a
JOIN (
    SELECT * FROM table_b WHERE status = 'active'    -- 先过滤B表
) b ON a.id = b.id
|
||
|
||
-- 不推荐:JOIN后再过滤
|
||
SELECT a.*, b.*
|
||
FROM (
|
||
SELECT * FROM table_a -- 未过滤
|
||
) a
|
||
JOIN (
|
||
SELECT * FROM table_b -- 未过滤
|
||
) b ON a.id = b.id
|
||
WHERE a.date = '${day_id}'
|
||
AND b.status = 'active'
|
||
```
|
||
|
||
**性能差异**:
|
||
- 推荐:JOIN 前各表已过滤,数据量小,JOIN 快
|
||
- 不推荐:全表 JOIN 后过滤,数据量大,性能差
|
||
|
||
---
|
||
|
||
## JOIN 常见问题
|
||
|
||
### 问题1:关联字段类型不一致
|
||
|
||
```sql
|
||
-- 错误:STRING 与 BIGINT 关联
|
||
SELECT a.*, b.*
|
||
FROM table_a a
|
||
JOIN table_b b ON a.id = b.id -- a.id是STRING,b.id是BIGINT
|
||
|
||
-- 解决:类型转换
|
||
SELECT a.*, b.*
|
||
FROM table_a a
|
||
JOIN table_b b ON CAST(a.id AS BIGINT) = b.id
|
||
```
|
||
|
||
### 问题2:关联字段含NULL
|
||
|
||
```sql
|
||
-- 问题:NULL关联不上
|
||
SELECT a.*, b.*
|
||
FROM table_a a
|
||
LEFT JOIN table_b b ON a.ref_id = b.id -- a.ref_id有NULL
|
||
|
||
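-- 解决:先过滤NULL或用COALESCE
-- 注意:COALESCE 的占位值须与 b.id 类型一致,且仅当右表存在该占位值时才会匹配;
--       若需要 NULL 与 NULL 相互匹配,可使用 NULL 安全等号:a.ref_id <=> b.id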
|
||
SELECT a.*, b.*
|
||
FROM table_a a
|
||
LEFT JOIN table_b b ON COALESCE(a.ref_id, 'N/A') = b.id
|
||
```
|
||
|
||
### 问题3:多字段关联效率低
|
||
|
||
```sql
|
||
-- 问题:多字段关联
|
||
SELECT a.*, b.*
|
||
FROM table_a a
|
||
JOIN table_b b
|
||
ON a.user_id = b.user_id
|
||
AND a.order_date = b.order_date
|
||
AND a.product_id = b.product_id
|
||
|
||
-- 解决:合并关联字段(慎用,建议先用 EXPLAIN 对比执行计划)
-- 注意:字段值本身含分隔符 '_' 时可能产生误匹配;多字段等值关联通常已能直接走哈希/排序合并关联,拼接后未必更快
|
||
SELECT a.*, b.*
|
||
FROM table_a a
|
||
JOIN table_b b
|
||
ON CONCAT(a.user_id, '_', a.order_date, '_', a.product_id)
|
||
= CONCAT(b.user_id, '_', b.order_date, '_', b.product_id)
|
||
```
|
||
|
||
### 问题4:大表 JOIN 大表
|
||
|
||
```sql
|
||
-- 问题:两表都很大,JOIN 慢
|
||
SELECT a.*, b.*
|
||
FROM large_table_a a
|
||
JOIN large_table_b b ON a.id = b.id
|
||
|
||
-- 解决方案:
|
||
-- 1. 尽量前置过滤
|
||
-- 2. 使用分区表,按分区关联
|
||
-- 3. 调整 Spark 并行度
|
||
-- 4. 使用 BROADCAST JOIN(其中一表较小)
|
||
```
|
||
|
||
---
|
||
|
||
## BROADCAST JOIN(小表广播)
|
||
|
||
```sql
|
||
-- Spark SQL 自动判断(阈值由 spark.sql.autoBroadcastJoinThreshold 控制,默认 10MB)
|
||
-- 小表自动广播到所有节点,避免 Shuffle
|
||
|
||
-- 手动指定广播
|
||
SELECT /*+ BROADCAST(b) */ a.*, b.*
|
||
FROM large_table_a a
|
||
JOIN small_table_b b ON a.id = b.id
|
||
|
||
-- 多表广播
|
||
SELECT /*+ BROADCAST(b), BROADCAST(c) */ a.*, b.*, c.*
|
||
FROM large_table_a a
|
||
JOIN small_table_b b ON a.id = b.id
|
||
JOIN small_table_c c ON a.category = c.category
|
||
```
|
||
|
||
**适用条件**:其中一表数据量较小(通常 < 10MB,即自动广播阈值的默认值;使用 Hint 强制广播时,小表仍需能放入 Driver 与各 Executor 内存)。
|
||
|
||
---
|
||
|
||
## JOIN 类型选择指南
|
||
|
||
| 需求 | 推荐 JOIN | 说明 |
|------|-----------|------|
| 两表都有才保留 | INNER JOIN | 最常用 |
| 左表全部保留 | LEFT JOIN | 补充右表信息 |
| 右表全部保留 | RIGHT JOIN | 或反转用 LEFT JOIN |
| 两表全部保留 | FULL OUTER JOIN | 分析差异 |
| 判断左表是否在右表存在 | LEFT SEMI JOIN | 不需要右表列 |
| 左表不在右表的行 | LEFT ANTI JOIN | 差集查询 |
| 生成所有组合 | CROSS JOIN | 慎用 |