Rate Limiting
Understanding and handling rate limits in the FastGPT API.
Understanding Rate Limits
The FastGPT API may enforce rate limits to:
- Prevent API abuse
- Ensure fair resource allocation
- Maintain system stability
When you exceed the rate limit, you'll receive a 429 Too Many Requests response.
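At the HTTP level this looks roughly like the following; the standard Retry-After header, when present, suggests how long to wait before retrying (the exact response body varies by deployment):

    HTTP/1.1 429 Too Many Requests
    Retry-After: 30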
RateLimitError
The SDK raises RateLimitError when rate limits are exceeded:
from fastgpt_client import ChatClient
from fastgpt_client.exceptions import RateLimitError

client = ChatClient()  # configure with your credentials as needed

try:
    response = client.create_chat_completion(
        messages=[{"role": "user", "content": "Hello"}]
    )
except RateLimitError as e:
    print("Rate limit exceeded!")
    print(f"Status code: {e.status_code}")  # 429
    print(f"Retry after: {e.retry_after}")  # Suggested wait time
Handling Rate Limits
1. Simple Retry with Delay
import time

from fastgpt_client.exceptions import RateLimitError

def chat_with_retry(client, messages, max_retries=3):
    """Retry on rate limit with a fixed delay."""
    for attempt in range(max_retries):
        try:
            response = client.create_chat_completion(
                messages=messages,
                stream=False
            )
            response.raise_for_status()
            return response.json()
        except RateLimitError as e:
            if attempt < max_retries - 1:
                # Use the server-suggested Retry-After value or default to 5 seconds
                wait_time = int(e.retry_after) if e.retry_after else 5
                print(f"Rate limited. Waiting {wait_time} seconds...")
                time.sleep(wait_time)
            else:
                print("Max retries exceeded")
                raise
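For example, with a configured client:

    result = chat_with_retry(client, [{"role": "user", "content": "Hello"}])
    print(result)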
2. Exponential Backoff
import random
import time

from fastgpt_client.exceptions import RateLimitError

def chat_with_backoff(client, messages, max_retries=5):
    """Retry with exponential backoff."""
    base_delay = 1  # Start with 1 second

    for attempt in range(max_retries):
        try:
            response = client.create_chat_completion(
                messages=messages,
                stream=False
            )
            response.raise_for_status()
            return response.json()
        except RateLimitError:
            if attempt < max_retries - 1:
                # Exponential backoff: 1s, 2s, 4s, 8s, ...
                delay = base_delay * (2 ** attempt)
                # Add jitter to avoid the thundering-herd problem
                jitter = random.uniform(0, 0.5 * delay)
                wait_time = delay + jitter
                print(f"Rate limited. Waiting {wait_time:.1f} seconds...")
                time.sleep(wait_time)
            else:
                raise
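With base_delay=1 and max_retries=5, the waits before jitter are 1, 2, 4, and 8 seconds; the random jitter of up to half of each delay keeps many clients that were rate-limited at the same moment from retrying in lockstep.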
3. Async Retry with Backoff
import asyncio

from fastgpt_client.exceptions import RateLimitError

async def async_chat_with_retry(client, messages, max_retries=5):
    """Async retry with exponential backoff."""
    base_delay = 1

    for attempt in range(max_retries):
        try:
            response = await client.create_chat_completion(
                messages=messages,
                stream=False
            )
            response.raise_for_status()
            return response.json()
        except RateLimitError:
            if attempt < max_retries - 1:
                delay = base_delay * (2 ** attempt)
                print(f"Rate limited. Waiting {delay} seconds...")
                await asyncio.sleep(delay)
            else:
                raise
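A minimal way to run it, assuming your SDK version ships an async client (the AsyncChatClient name below is illustrative, not confirmed by the SDK):

    async def main():
        client = AsyncChatClient()  # hypothetical async client; substitute your SDK's class
        result = await async_chat_with_retry(
            client, [{"role": "user", "content": "Hello"}]
        )
        print(result)

    asyncio.run(main())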
4. Rate Limiter Class
import time
from threading import Lock
from typing import Optional

class RateLimiter:
    """Token bucket rate limiter."""

    def __init__(self, rate: int, per: float = 60.0):
        """
        Args:
            rate: Number of requests allowed
            per: Time period in seconds
        """
        self.rate = rate
        self.per = per
        self.allowance = rate
        self.last_check = time.time()
        self.lock = Lock()

    def acquire(self, block: bool = True, timeout: Optional[float] = None) -> bool:
        """Acquire a token from the bucket."""
        with self.lock:
            current = time.time()
            time_passed = current - self.last_check
            self.last_check = current

            # Refill the bucket at rate/per tokens per second
            self.allowance += time_passed * (self.rate / self.per)
            if self.allowance > self.rate:
                self.allowance = self.rate

            if self.allowance < 1.0:
                if not block:
                    return False
                # Wait until one full token has accumulated
                sleep_time = (1.0 - self.allowance) * (self.per / self.rate)
                if timeout is not None and sleep_time > timeout:
                    return False
                # Note: sleeping while holding the lock serializes concurrent callers
                time.sleep(sleep_time)
                self.allowance = 0.0
            else:
                self.allowance -= 1.0

            return True
# Usage
rate_limiter = RateLimiter(rate=10, per=60)  # 10 requests per minute

for i in range(15):
    # Non-blocking acquire, so requests beyond the limit are skipped
    if rate_limiter.acquire(block=False):
        response = client.create_chat_completion(
            messages=[{"role": "user", "content": f"Message {i}"}]
        )
        print(f"Sent message {i}")
    else:
        print(f"Rate limited, skipping message {i}")
5. Decorator for Rate Limiting
import functools
import time

from fastgpt_client.exceptions import RateLimitError

def rate_limit_retry(max_retries=3, base_delay=1):
    """Decorator to handle rate limiting with retries."""
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(max_retries):
                try:
                    return func(*args, **kwargs)
                except RateLimitError as e:
                    if attempt < max_retries - 1:
                        # Prefer the server-suggested wait over the computed backoff
                        delay = base_delay * (2 ** attempt)
                        wait_time = int(e.retry_after) if e.retry_after else delay
                        print(f"Rate limited. Waiting {wait_time} seconds...")
                        time.sleep(wait_time)
                    else:
                        raise
        return wrapper
    return decorator
# Usage
@rate_limit_retry(max_retries=3, base_delay=2)
def send_message(client, message: str):
    response = client.create_chat_completion(
        messages=[{"role": "user", "content": message}],
        stream=False
    )
    response.raise_for_status()
    return response.json()
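Because the wrapper uses functools.wraps, the decorated send_message keeps its original name and docstring, which keeps stack traces and logs readable.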
Monitoring Rate Limits
import time
from collections import defaultdict
from threading import Lock

class RequestMonitor:
    """Monitor API request rates."""

    def __init__(self, window_seconds=60):
        self.window = window_seconds
        self.requests = defaultdict(list)
        self.lock = Lock()

    def record_request(self, endpoint: str):
        """Record an API request."""
        with self.lock:
            now = time.time()
            self.requests[endpoint].append(now)
            # Remove old requests outside the window
            cutoff = now - self.window
            self.requests[endpoint] = [
                t for t in self.requests[endpoint] if t > cutoff
            ]

    def get_rate(self, endpoint: str) -> float:
        """Get requests per second for an endpoint."""
        with self.lock:
            # Ignore entries that have aged out since the last record_request
            cutoff = time.time() - self.window
            recent = [t for t in self.requests[endpoint] if t > cutoff]
            if not recent:
                return 0.0
            return len(recent) / self.window

    def is_rate_limited(self, endpoint: str, limit: int) -> bool:
        """Check whether an endpoint has hit its request limit."""
        with self.lock:
            cutoff = time.time() - self.window
            recent = [t for t in self.requests[endpoint] if t > cutoff]
            return len(recent) >= limit

# Usage
monitor = RequestMonitor(window_seconds=60)

def make_request(client, messages):
    endpoint = "/api/v1/chat/completions"

    # Check whether we are approaching the rate limit
    if monitor.is_rate_limited(endpoint, limit=100):
        print("Approaching rate limit, slowing down...")
        time.sleep(1)

    monitor.record_request(endpoint)
    response = client.create_chat_completion(messages=messages)
    return response
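Note that this monitor is client-side and approximate: it only observes requests made by this process, so treat it as an early warning rather than a guarantee that the server will not return 429.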
Best Practices
- Implement backoff - Use exponential backoff with jitter for retries
- Respect Retry-After - Use the retry_after value from the exception when the server provides it
- Monitor usage - Track request rates to avoid hitting limits
- Queue requests - For batch operations, apply client-side rate limiting
- Handle gracefully - Show user-friendly messages when rate limited
- Use async - Better resource utilization for concurrent requests (see the sketch below)
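To illustrate the last point, here is a minimal sketch that caps the number of in-flight requests with an asyncio.Semaphore, reusing the async_chat_with_retry helper from above (the async client setup is assumed, as in section 3):

    import asyncio

    async def send_all(client, prompts, max_concurrent=5):
        """Send many prompts while capping concurrent in-flight requests."""
        semaphore = asyncio.Semaphore(max_concurrent)

        async def send_one(prompt):
            async with semaphore:  # at most max_concurrent requests at once
                return await async_chat_with_retry(
                    client, [{"role": "user", "content": prompt}]
                )

        return await asyncio.gather(*(send_one(p) for p in prompts))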
See Also
- Error Handling - Comprehensive error handling guide
- Exceptions Reference - Exception types and attributes