Here's a truth that took me three production outages to learn: your application will break. Not might break. Will break. The question isn't whether errors will happen, but whether you'll know about them when they do.
I remember the first time a major bug hit production. Users were seeing 500 errors for six hours before someone told me. Six hours. I had no logging, no monitoring, no alerts. I was flying blind. That night, I set up proper logging, and it changed everything. Now I know about errors within minutes, sometimes seconds.
In this chapter, I'll show you how to build observability into your Django application so you can sleep peacefully at night.
18.1 Django Logging Configuration: Handlers, Formatters, Filters
Logging is your first line of defense. It tells you what happened, when, and why. Here's how to set it up properly.
# ========== PRODUCTION LOGGING CONFIGURATION ==========
# settings/production.py
import os
import logging
from logging.handlers import RotatingFileHandler, TimedRotatingFileHandler
# Django LOGGING setting (logging.config.dictConfig schema, version 1).
#
# Layout: formatters -> filters -> handlers -> loggers -> root.
# Every named logger below sets 'propagate': False so a record is written by
# exactly one logger's handlers and is not duplicated by the root logger.
LOGGING = {
    'version': 1,
    # Keep loggers created before this config is applied (e.g. at import time).
    'disable_existing_loggers': False,

    # Formatters control how log messages look
    'formatters': {
        'verbose': {
            'format': '{levelname} {asctime} {module} {process:d} {thread:d} {message}',
            'style': '{',
        },
        'standard': {
            'format': '{asctime} [{levelname}] {name}: {message}',
            'style': '{',
            'datefmt': '%Y-%m-%d %H:%M:%S',
        },
        'detailed': {
            'format': '{asctime} - {levelname} - {pathname}:{lineno} - {funcName} - {message}',
            'style': '{',
        },
        # %-style template that only *looks* like JSON: the message is not
        # escaped, so a quote or newline in it yields invalid JSON.
        'json': {
            'format': '{"time": "%(asctime)s", "level": "%(levelname)s", "name": "%(name)s", "message": "%(message)s"}',
        },
    },

    # Filters add extra conditions
    'filters': {
        'require_debug_false': {
            '()': 'django.utils.log.RequireDebugFalse',
        },
        'require_debug_true': {
            '()': 'django.utils.log.RequireDebugTrue',
        },
        # Custom filter; 'ips' is passed to the factory as a keyword argument.
        'ip_filter': {
            '()': 'myproject.logging_filters.IPFilter',
            'ips': ['192.168.1.100'],
        },
    },

    # Handlers determine where logs go
    'handlers': {
        # Console output (for development)
        'console': {
            'level': 'DEBUG',
            'class': 'logging.StreamHandler',
            'formatter': 'standard',
            'filters': ['require_debug_true'],
        },
        # File output with rotation (by size).
        # Must not share a path with 'daily_file': two rotating handlers on
        # one file rename it underneath each other.
        'file': {
            'level': 'INFO',
            'class': 'logging.handlers.RotatingFileHandler',
            'filename': '/var/log/django/django-size.log',
            'maxBytes': 1024 * 1024 * 50,  # 50 MB
            'backupCount': 10,
            'formatter': 'detailed',
        },
        # Daily rotation (better for long-term)
        'daily_file': {
            'level': 'INFO',
            'class': 'logging.handlers.TimedRotatingFileHandler',
            'filename': '/var/log/django/django.log',
            'when': 'midnight',
            'interval': 1,
            'backupCount': 30,  # Keep 30 days
            'formatter': 'detailed',
        },
        # Error-specific file
        'error_file': {
            'level': 'ERROR',
            'class': 'logging.handlers.RotatingFileHandler',
            'filename': '/var/log/django/errors.log',
            'maxBytes': 1024 * 1024 * 10,  # 10 MB
            'backupCount': 5,
            'formatter': 'detailed',
        },
        # Separate log for security events.
        # maxBytes/backupCount are required for rotation to happen at all;
        # with the default maxBytes=0 the file grows without bound.
        'security_file': {
            'level': 'WARNING',
            'class': 'logging.handlers.RotatingFileHandler',
            'filename': '/var/log/django/security.log',
            'maxBytes': 1024 * 1024 * 10,  # 10 MB
            'backupCount': 5,
            'formatter': 'detailed',
        },
        # Send to syslog (for central logging)
        'syslog': {
            'level': 'INFO',
            'class': 'logging.handlers.SysLogHandler',
            'address': '/dev/log',
            'facility': 'local7',
            'formatter': 'standard',
        },
        # Email alerts for critical errors
        'mail_admins': {
            'level': 'ERROR',
            'class': 'django.utils.log.AdminEmailHandler',
            'formatter': 'verbose',
            'filters': ['require_debug_false'],
        },
    },

    # Loggers capture messages from specific sources.
    # The root logger is configured once, under the top-level 'root' key; a
    # '' entry here would configure the same logger and be overwritten.
    'loggers': {
        # Django core logger
        'django': {
            'handlers': ['console', 'daily_file'],
            'level': 'INFO',
            'propagate': False,
        },
        # Django request logger
        'django.request': {
            'handlers': ['mail_admins', 'error_file'],
            'level': 'ERROR',
            'propagate': False,
        },
        # Database queries (be careful - very verbose at DEBUG, where every
        # SQL statement is logged). WARNING suppresses the per-query output;
        # it does not select slow queries.
        'django.db.backends': {
            'handlers': ['console'],
            'level': 'WARNING',
            'propagate': False,
        },
        # Security logger
        'django.security': {
            'handlers': ['security_file', 'mail_admins'],
            'level': 'WARNING',
            'propagate': False,
        },
        # Your application loggers
        'myapp': {
            'handlers': ['console', 'daily_file', 'mail_admins'],
            'level': 'INFO',
            'propagate': False,
        },
        'myapp.api': {
            'handlers': ['console', 'daily_file'],
            'level': 'DEBUG',
            'propagate': False,
        },
        # Third-party loggers
        'celery': {
            'handlers': ['daily_file', 'mail_admins'],
            'level': 'INFO',
            # Root also writes to daily_file; without this every WARNING+
            # Celery record would be written there twice.
            'propagate': False,
        },
    },

    # Root logger configuration - catches everything not handled above
    'root': {
        'handlers': ['console', 'daily_file'],
        'level': 'WARNING',
    },
}
# ========== CUSTOM LOGGING FILTERS ==========
# logging_filters.py
import logging
from django.core.cache import cache
class IPFilter(logging.Filter):
    """Filter out log entries from specific IPs.

    Records are dropped when they carry a ``request`` attribute whose
    ``META['REMOTE_ADDR']`` is one of ``ips``. Records without a request,
    or whose ``request`` has no ``META`` mapping, are always kept.

    Args:
        ips: Iterable of IP address strings to suppress. ``None`` or empty
            means nothing is suppressed.
    """

    def __init__(self, ips=None):
        # Initialise the base Filter so its attributes exist.
        super().__init__()
        self.ips = list(ips or [])

    def filter(self, record):
        """Return False to drop ``record``, True to keep it."""
        request = getattr(record, 'request', None)
        # ``record.request`` is not guaranteed to be an object with ``META``;
        # never let the filter itself raise inside a logging call.
        meta = getattr(request, 'META', None)
        if meta is None:
            return True
        return meta.get('REMOTE_ADDR') not in self.ips
class RateLimitFilter(logging.Filter):
    """Prevent log flooding from repeated errors.

    Allows at most ``max_count`` records per ``timeout`` seconds for each
    (logger name, level name) pair, counted in the Django cache.

    Args:
        key_prefix: Prefix for the cache keys used as counters.
        timeout: Length of the counting window in seconds.
        max_count: Records allowed per window; values <= 0 drop everything.
    """

    def __init__(self, key_prefix='log_limit', timeout=60, max_count=10):
        super().__init__()
        self.key_prefix = key_prefix
        self.timeout = timeout
        self.max_count = max_count

    def filter(self, record):
        """Return True while the record's counter is within ``max_count``."""
        if self.max_count <= 0:
            return False
        cache_key = f'{self.key_prefix}_{record.name}_{record.levelname}'
        try:
            # add() only succeeds when the key is absent, so it both starts
            # the window (setting the TTL once) and counts the first record.
            if cache.add(cache_key, 1, self.timeout):
                return True
            # incr() avoids the read-modify-write race of get() + set() and
            # does not reset the TTL, so the window cannot be extended forever.
            count = cache.incr(cache_key)
        except ValueError:
            # Key expired between add() and incr(): a new window is starting.
            return True
        except Exception:
            # Fail open: a cache outage must not raise out of a logging call
            # or silence the very errors that would report it.
            return True
        return count <= self.max_count
# ========== USING LOGGERS IN YOUR CODE ==========
# views.py
import logging
# Create logger for this module
logger = logging.getLogger(__name__)


def post_create(request, post_id=None):
    """Render and process the post form, logging each step.

    Args:
        request: The incoming HttpRequest.
        post_id: Optional id of an existing Post to look up first. The
            original snippet referenced ``post_id`` without defining it;
            it is now an optional argument so ``post_create(request)``
            keeps working.

    Returns:
        A redirect to ``post_detail`` after a successful save, otherwise the
        rendered ``post_form.html`` (unbound form on GET, bound form with
        errors on an invalid POST).

    Raises:
        Http404: If ``post_id`` is given and no such Post exists.
    """
    username = request.user.username
    # Lazy %-style arguments: the message is only built if the level is enabled.
    logger.info("User %s accessed post creation view", username)

    if post_id is not None:
        try:
            post = Post.objects.get(id=post_id)
        except Post.DoesNotExist:
            logger.warning("Post %s not found by user %s", post_id, username)
            raise Http404()
        logger.debug("Retrieved post: %s", post.title)

    if request.method == 'POST':
        form = PostForm(request.POST)
        if form.is_valid():
            post = form.save()
            logger.info("Post %s created by %s", post.id, username)
            return redirect('post_detail', pk=post.pk)
        logger.warning("Invalid form submission for user %s: %s", username, form.errors)
    else:
        # Without this branch ``form`` is unbound on GET and render() fails.
        form = PostForm()

    return render(request, 'post_form.html', {'form': form})
# ========== LOGGING EXCEPTIONS WITH TRACEBACK ==========
import traceback
def risky_operation(request):
    """Call the external API and return its result as JSON.

    Any exception is logged once with its full traceback and converted into
    a generic 500 JSON response, so internal details are not sent to the client.
    """
    try:
        result = call_external_api()
        return JsonResponse({'result': result})
    except Exception as e:
        # logger.exception() logs at ERROR level and attaches the traceback.
        # It is equivalent to logger.error(..., exc_info=True); use one or
        # the other, never both, or every failure is logged twice.
        logger.exception("API call failed: %s", e)
        return JsonResponse({'error': 'Internal server error'}, status=500)
# ========== CONTEXTUAL LOGGING ==========
# Add request ID to every log for tracking
import contextvars
import uuid
from django.utils.deprecation import MiddlewareMixin
# Request ID of the request currently being handled in this context.
# '-' is used for records emitted outside of any request.
_request_id_var = contextvars.ContextVar('request_id', default='-')


def _install_request_id_factory():
    """Install, at most once, a record factory that stamps ``request_id``.

    The factory reads the ID from ``_request_id_var`` at record-creation
    time, so it never needs to be replaced per request.
    """
    current_factory = logging.getLogRecordFactory()
    if getattr(current_factory, '_adds_request_id', False):
        return  # already installed; wrapping again would stack factories

    def record_factory(*args, **kwargs):
        record = current_factory(*args, **kwargs)
        record.request_id = _request_id_var.get()
        return record

    record_factory._adds_request_id = True
    logging.setLogRecordFactory(record_factory)


class RequestIDMiddleware:
    """Attach a request ID to the request, the response and every log record.

    The ID is taken from the incoming ``X-Request-ID`` header when present,
    otherwise a new UUID4 is generated. It is exposed as
    ``request.request_id``, echoed in the ``X-Request-ID`` response header,
    and available to formatters as ``request_id``.
    """

    def __init__(self, get_response):
        self.get_response = get_response
        # Install the factory once here instead of on every request:
        # re-wrapping per request grows an unbounded chain of closures and
        # makes the latest request's ID leak into other threads' records.
        _install_request_id_factory()

    def __call__(self, request):
        request_id = request.headers.get('X-Request-ID') or str(uuid.uuid4())
        request.request_id = request_id
        token = _request_id_var.set(request_id)
        try:
            response = self.get_response(request)
        finally:
            # Restore the previous value so the ID does not outlive the request.
            _request_id_var.reset(token)
        response['X-Request-ID'] = request_id
        return response
# Use in logs
# request_id is already set on every record by the middleware's record
# factory, so reference it from a formatter (e.g. '{request_id}') rather than
# passing it here: logging raises KeyError when an `extra` key would overwrite
# an attribute that already exists on the LogRecord.
logger.info("Processing payment", extra={'user_id': user.id})
The log that saved me: I once had a bug that only happened once every 500 requests. Without logs, I would never have found it. But I had detailed logging, saw the pattern, and fixed it in an hour. Logs aren't just for debugging - they're for understanding how your application actually behaves.
18.2 Logging to Files, Console, and External Services
Files are fine for small sites. For production, you need centralized logging.
# ========== LOGGING TO EXTERNAL SERVICES ==========
# 1. Logstash (ELK Stack)
# Install: pip install python-logstash
import logstash
import logging
# Attach a Logstash handler directly to the 'myapp' logger.
logger = logging.getLogger('myapp')
logger.setLevel(logging.INFO)
# NOTE(review): assumes a Logstash input is listening on localhost:5959 - confirm for your deployment.
logstash_handler = logstash.LogstashHandler('localhost', 5959, version=1)
logger.addHandler(logstash_handler)
# Structured fields are supplied through `extra`; `user` and `post` must be
# defined by the surrounding code (not shown in this snippet).
logger.info('User action', extra={
'user_id': user.id,
'action': 'post_created',
'post_id': post.id
})
# 2. Graylog
# Install: pip install graypy
from graypy import GELFUDPHandler
# Send 'myapp' records to Graylog as GELF over UDP.
# NOTE(review): assumes a GELF UDP input on localhost:12201 - confirm for your deployment.
logger = logging.getLogger('myapp')
logger.addHandler(GELFUDPHandler('localhost', 12201))
# 3. Papertrail (cloud logging)
# Configure in settings:
# Register a remote syslog handler named 'papertrail' in the LOGGING dict.
# The handler only takes effect once it is listed in some logger's 'handlers'.
LOGGING['handlers']['papertrail'] = {
'level': 'INFO',
'class': 'logging.handlers.SysLogHandler',
# (host, port) tuple = remote syslog; 12345 is a placeholder - presumably
# replaced by the port assigned to your Papertrail account.
'address': ('logs.papertrailapp.com', 12345),
'formatter': 'standard',
}
# ========== LOG ROTATION STRATEGIES ==========
# Using TimedRotatingFileHandler (recommended for production)
from logging.handlers import TimedRotatingFileHandler
# Option A: let Python rotate the file itself, once per day at midnight.
handler = TimedRotatingFileHandler(
'/var/log/django/app.log',
when='midnight',
interval=1,
backupCount=30, # Keep 30 days
encoding='utf-8'
)
# Using WatchedFileHandler (with logrotate)
from logging.handlers import WatchedFileHandler
# Option B (alternative to A - this rebinds `handler`): rotation is done by an
# external tool, and the handler reopens the file when it detects a change.
handler = WatchedFileHandler('/var/log/django/app.log')
# Then configure system logrotate:
# /etc/logrotate.d/django
"""
/var/log/django/*.log {
daily
missingok
rotate 30
compress
delaycompress
notifempty
create 640 www-data www-data
sharedscripts
postrotate
systemctl reload gunicorn
endscript
}
"""
# ========== STRUCTURED LOGGING (JSON format) ==========
import json
from pythonjsonlogger import jsonlogger # pip install python-json-logger
class CustomJsonFormatter(jsonlogger.JsonFormatter):
    """JSON formatter that adds ``level``, ``logger`` and ``timestamp`` keys."""

    def add_fields(self, log_record, record, message_dict):
        """Populate ``log_record`` via the base class, then set three keys."""
        super().add_fields(log_record, record, message_dict)
        log_record.update(
            level=record.levelname,
            logger=record.name,
            timestamp=record.created,
        )


# Handler configuration: stream handler emitting one JSON object per record
json_handler = logging.StreamHandler()
json_handler.setFormatter(
    CustomJsonFormatter('%(timestamp)s %(level)s %(name)s %(message)s')
)
# Now logs are JSON - easy to parse with tools like jq
# {"timestamp": 1234567890, "level": "INFO", "logger": "myapp", "message": "User logged in"}
# ========== LOGGING SLOW REQUESTS ==========
class SlowRequestMiddleware:
    """Log a warning for every request that takes longer than a threshold.

    Attributes:
        threshold_seconds: Duration above which a request counts as slow.
            Override on a subclass to tune it.
    """

    threshold_seconds = 1.0  # 1 second threshold

    def __init__(self, get_response):
        self.get_response = get_response

    def __call__(self, request):
        import time

        # perf_counter() is monotonic; time.time() can jump when the system
        # clock is adjusted and produce negative or inflated durations.
        start = time.perf_counter()
        response = self.get_response(request)
        duration = time.perf_counter() - start

        if duration > self.threshold_seconds:
            logger.warning(
                "Slow request: %s %s took %.2fs",
                request.method,
                request.path,
                duration,
                extra={
                    'duration': duration,
                    'method': request.method,
                    'path': request.path,
                    # request.user is only present if an authentication
                    # middleware ran before this one.
                    'user': str(getattr(request, 'user', 'unknown')),
                    'ip': request.META.get('REMOTE_ADDR'),
                },
            )
        return response
# ========== LOGGING DATABASE QUERIES ==========
# For debugging slow queries
from django.db import connection
from contextlib import contextmanager
@contextmanager
def query_logger(operation_name, slow_threshold=0.1):
    """Log all queries executed within this context.

    Relies on ``connection.queries``, which Django only records when
    ``DEBUG`` is True; otherwise the count is always zero.

    Args:
        operation_name: Label used in the summary log line.
        slow_threshold: Queries slower than this many seconds are logged
            individually at WARNING level (default 100ms).
    """
    initial_queries = len(connection.queries)
    try:
        yield
    finally:
        # Report even if the wrapped block raised: the queries leading up to
        # a failure are usually the ones worth seeing.
        executed = connection.queries[initial_queries:]
        logger.debug("%s executed %d queries", operation_name, len(executed))
        # Log individual slow queries
        for query in executed:
            if float(query['time']) > slow_threshold:
                logger.warning("Slow query (%ss): %s", query['time'], query['sql'])
# Usage
def post_list_view(request):
    """Example view: render published posts while logging the queries used."""
    # render() must run inside the context: the QuerySet is lazy and is only
    # evaluated when the template iterates over it.
    with query_logger("post_list_view"):
        posts = Post.objects.filter(is_published=True).select_related('author')
        return render(request, 'posts.html', {'posts': posts})
18.3 Error Tracking with Sentry
Sentry changed my life. No more waiting for users to report errors. No more vague "something went wrong" messages.
# ========== SENTRY SETUP ==========
# Install: pip install sentry-sdk
# settings.py
import sentry_sdk
from sentry_sdk.integrations.django import DjangoIntegration
from sentry_sdk.integrations.celery import CeleryIntegration
from sentry_sdk.integrations.redis import RedisIntegration
# Initialise the Sentry SDK once, at settings import time.
sentry_sdk.init(
# Read from the environment so the DSN is not committed to source control.
dsn=os.environ.get('SENTRY_DSN'),
integrations=[
DjangoIntegration(),
CeleryIntegration(),
RedisIntegration(),
],
# Set traces_sample_rate to 1.0 to capture 100% of transactions
traces_sample_rate=0.1, # Capture 10% of transactions in production
# Set profiles_sample_rate to 1.0 to profile 100% of sampled transactions
profiles_sample_rate=0.1,
# Environment name reported with each event; defaults to 'production'
environment=os.environ.get('ENVIRONMENT', 'production'),
# Release version, built from the GIT_COMMIT environment variable ('unknown' if unset)
release=f"myproject@{os.environ.get('GIT_COMMIT', 'unknown')}",
# Send PII (user emails) - use with caution
send_default_pii=False,
)
# ========== MANUAL ERROR CAPTURE ==========
# views.py
import sentry_sdk
from sentry_sdk import capture_exception, capture_message
def payment_view(request):
    """Process a payment; on PaymentError report it to Sentry with context.

    Returns:
        A 400 JSON response when the payment fails. The success path is not
        part of this snippet.
    """
    try:
        process_payment()
    except PaymentError as e:
        user = request.user
        # Capture with additional context
        with sentry_sdk.push_scope() as scope:
            scope.set_tag('payment_provider', 'stripe')
            scope.set_context('payment_data', {
                'amount': request.POST.get('amount'),
                'customer_id': user.id,
            })
            # getattr(): an anonymous user object has no `email`, and raising
            # here would mask the PaymentError being reported.
            # NOTE: this sends user PII explicitly, regardless of the
            # send_default_pii setting.
            scope.set_user({
                'id': user.id,
                'email': getattr(user, 'email', None),
                'username': getattr(user, 'username', None),
            })
            capture_exception(e)
        return JsonResponse({'error': 'Payment failed'}, status=400)
# Capture message for non-exception events
def suspicious_activity(request):
    """Send a warning-level Sentry message naming the client's REMOTE_ADDR."""
    remote_ip = request.META.get('REMOTE_ADDR')
    text = f"Suspicious login attempt from IP {remote_ip}"
    capture_message(text, level='warning')
# ========== ADDING BREADCRUMBS ==========
# Breadcrumbs track what happened before an error
from sentry_sdk import add_breadcrumb
def user_action(request):
    """Record an 'auth' breadcrumb describing the user's post_create action."""
    crumb_message = f'User {request.user.username} performed action'
    crumb_data = {
        'action': 'post_create',
        'post_title': request.POST.get('title'),
    }
    add_breadcrumb(
        category='auth',
        message=crumb_message,
        level='info',
        data=crumb_data,
    )
    # Continue with operation
# ========== PERFORMANCE MONITORING WITH SENTRY ==========
from sentry_sdk import start_transaction, start_span
def expensive_operation():
    """Fetch, process and cache published posts inside one Sentry transaction.

    Each phase runs in its own span so the trace shows where the time goes.
    """
    with start_transaction(name="expensive_operation"):
        with start_span(op="database", description="fetch_posts"):
            posts = Post.objects.filter(is_published=True)
            # QuerySets are lazy: without forcing evaluation here the SQL
            # would run during the loop below and be attributed to the
            # "processing" span. len() evaluates and caches the results.
            len(posts)
        with start_span(op="processing", description="process_posts"):
            for post in posts:
                process_post(post)
        with start_span(op="cache", description="update_cache"):
            cache.set('latest_posts', posts, 300)
# ========== DJANGO-DEBUG-TOOLBAR + SENTRY ==========
# For development, capture errors locally
if DEBUG:
    from django.core.mail import mail_admins

    def send_to_sentry(request, exception):
        """Forward ``exception`` to Sentry; ``request`` is accepted but unused."""
        capture_exception(exception)
# Override Django's default error handling
from django.views.decorators.csrf import requires_csrf_token
from django.views.defaults import server_error
@requires_csrf_token
def custom_server_error(request, template_name='500.html'):
    """500 handler: report to Sentry, then delegate to Django's server_error."""
    # Called without arguments, so the SDK decides which exception to report.
    capture_exception()
    response = server_error(request, template_name)
    return response
# ========== IGNORING SPECIFIC ERRORS ==========
# In sentry_sdk.init() or later
def _drop_http404(event, hint):
    """Sentry ``before_send`` hook: discard events whose first exception is Http404.

    Returns:
        ``None`` to drop the event, otherwise the unmodified ``event``.
    """
    # Tolerate events with no exception, an empty 'values' list, or a
    # missing/None 'type' - indexing blindly raises inside the hook.
    values = (event.get('exception') or {}).get('values') or [{}]
    if 'Http404' in (values[0].get('type') or ''):
        return None
    return event


# NOTE: in a real project pass before_send to the single, main
# sentry_sdk.init() call; calling init() a second time with only this
# argument does not carry over the integrations and sample rates given there.
sentry_sdk.init(
    before_send=_drop_http404
)
# Or ignore by exception type
from sentry_sdk.integrations.logging import LoggingIntegration
# Configure how stdlib logging records are forwarded to Sentry.
# NOTE(review): this object is only constructed here; presumably it still has
# to be added to the `integrations` list of sentry_sdk.init() to take effect.
sentry_logging = LoggingIntegration(
level=logging.INFO, # Capture info and above as breadcrumbs
event_level=logging.ERROR, # Send errors as events
)
How Sentry saved my weekend: I was on vacation when my phone buzzed. Sentry alert. A critical error was happening in production. I opened the app on my phone, saw the full traceback, realized it was a database migration issue, and fixed it in 10 minutes from the beach. Without Sentry, I would have come back to a week of angry customer emails.
18.4 Health Check Endpoints and Monitoring with Uptime Robot
You need to know when your site is down before your users do.
# ========== HEALTH CHECK ENDPOINTS ==========
# views.py
from django.http import JsonResponse
from django.db import connections
from django.db.utils import OperationalError
from django.core.cache import cache
from django.views.decorators.csrf import csrf_exempt
import json
def _check_database():
    """Run a trivial query on the default database and report the outcome."""
    try:
        # The context manager closes the cursor; SELECT 1 proves the server
        # actually answers, not just that a cursor object could be created.
        with connections['default'].cursor() as cursor:
            cursor.execute('SELECT 1')
            cursor.fetchone()
    except Exception as e:
        return {'status': 'unhealthy', 'error': str(e)}
    return {'status': 'healthy'}


def _check_cache():
    """Write and read back a short-lived key in the default cache."""
    try:
        cache.set('health_check_key', 'ok', 10)
        if cache.get('health_check_key') != 'ok':
            raise Exception("Cache read failed")
    except Exception as e:
        return {'status': 'unhealthy', 'error': str(e)}
    return {'status': 'healthy'}


def _check_celery():
    """Ping Celery workers; return None when Celery is not installed."""
    try:
        from celery import current_app
    except ImportError:
        return None  # Celery not in use: omit the check instead of failing it
    try:
        result = current_app.control.ping(timeout=1.0)
    except Exception as e:
        return {'status': 'unhealthy', 'error': str(e)}
    if result:
        return {'status': 'healthy', 'workers': len(result)}
    return {'status': 'unhealthy', 'error': 'No workers responding'}


def _check_disk():
    """Report free space on '/': warning at <= 10% free, unhealthy below 5%."""
    import shutil

    disk_usage = shutil.disk_usage('/')
    disk_free_percent = (disk_usage.free / disk_usage.total) * 100
    if disk_free_percent < 5:
        disk_status = 'unhealthy'
    elif disk_free_percent > 10:
        disk_status = 'healthy'
    else:
        disk_status = 'warning'
    return {
        'status': disk_status,
        'free_gb': round(disk_usage.free / (1024**3), 2),
        'free_percent': round(disk_free_percent, 2),
    }


@csrf_exempt  # Health checks don't need CSRF
def health_check(request):
    """
    Comprehensive health check for load balancers and monitoring.

    Runs the database, cache, Celery (when installed) and disk checks.

    Returns:
        JsonResponse with keys ``status``, ``timestamp``, ``version`` and
        ``checks``. HTTP 200 when every check is healthy or warning, HTTP 503
        as soon as any single check is unhealthy.

    NOTE: error strings from failed checks are included in the response;
    restrict access to this endpoint if they could expose internal details.
    """
    # Local imports: this views snippet does not import these at module level.
    import os
    from django.utils import timezone

    checks = {
        'database': _check_database(),
        'cache': _check_cache(),
    }
    celery_check = _check_celery()
    if celery_check is not None:
        checks['celery'] = celery_check
    checks['disk'] = _check_disk()

    # One rule for every check, so a failing check can never be reported
    # alongside an overall "healthy" status.
    is_healthy = all(check['status'] != 'unhealthy' for check in checks.values())
    status = {
        'status': 'healthy' if is_healthy else 'unhealthy',
        'timestamp': timezone.now().isoformat(),
        'version': os.environ.get('GIT_COMMIT', 'unknown'),
        'checks': checks,
    }
    # Return appropriate HTTP status code
    http_status = 200 if is_healthy else 503
    return JsonResponse(status, status=http_status)
def readiness_check(request):
    """
    Readiness probe for Kubernetes - checks if app is ready to receive traffic.

    Responds 503 with a reason when unapplied migrations exist on the default
    database or when the check itself raises; otherwise 200 ``ready``.
    """
    try:
        from django.db.migrations.executor import MigrationExecutor

        migration_executor = MigrationExecutor(connections['default'])
        targets = migration_executor.loader.graph.leaf_nodes()
        # A non-empty plan means some migrations have not been applied yet.
        if migration_executor.migration_plan(targets):
            payload, code = {'status': 'not ready', 'reason': 'pending migrations'}, 503
        else:
            payload, code = {'status': 'ready'}, 200
        return JsonResponse(payload, status=code)
    except Exception as e:
        return JsonResponse({'status': 'not ready', 'reason': str(e)}, status=503)
def metrics(request):
    """
    Prometheus metrics endpoint.

    Serialises every metric in the default registry. Metrics themselves must
    be defined once at module level (e.g. next to the middleware that records
    them): creating a Counter/Histogram inside this view registers the same
    name again on each request, which prometheus_client rejects.
    """
    # Install: pip install prometheus-client
    from django.http import HttpResponse
    from prometheus_client import CONTENT_TYPE_LATEST, REGISTRY, generate_latest

    # CONTENT_TYPE_LATEST is the content type matching generate_latest()'s output.
    return HttpResponse(generate_latest(REGISTRY), content_type=CONTENT_TYPE_LATEST)
# ========== URL CONFIGURATION ==========
# urls.py
# Routes for the monitoring views defined above.
# NOTE(review): none of these routes applies access control here; presumably
# /metrics/ (and possibly /health/) should be restricted at the proxy or view level.
urlpatterns = [
path('health/', health_check, name='health_check'),
path('ready/', readiness_check, name='readiness_check'),
path('metrics/', metrics, name='metrics'),
]
# ========== UPTIME ROBOT CONFIGURATION ==========
# 1. Create account on uptimerobot.com (free tier available)
# 2. Add new monitor:
# - Type: HTTP(s)
# - Friendly Name: Django Production
# - URL: https://yoursite.com/health/
# - Monitoring Interval: 5 minutes
# - Alert Contacts: Your email, SMS, Slack
# 3. Configure alert conditions:
# - Alert when status changes to down
# - Alert when response time > 2000ms
# - Alert when SSL certificate expires in < 14 days
# ========== CUSTOM MONITORING MIDDLEWARE ==========
from prometheus_client import Counter, Histogram

# Metrics are registered once, at import time. Creating them per request
# raises "Duplicated timeseries" from the second request onwards.
REQUEST_COUNT = Counter(
    'http_requests_total',
    'Total HTTP requests',
    ['method', 'endpoint', 'status'],
)
REQUEST_DURATION = Histogram(
    'http_request_duration_seconds',
    'HTTP request duration',
    ['method', 'endpoint'],
)


class MetricsMiddleware:
    """Record a request counter and a latency histogram for every request.

    The ``endpoint`` label is the first path segment ('root' for '/').
    """

    def __init__(self, get_response):
        self.get_response = get_response

    def __call__(self, request):
        import time

        # Monotonic clock: immune to system clock adjustments.
        start = time.perf_counter()
        response = self.get_response(request)
        duration = time.perf_counter() - start

        # Record metrics
        endpoint = request.path.split('/')[1] if request.path != '/' else 'root'
        REQUEST_COUNT.labels(
            method=request.method,
            endpoint=endpoint,
            status=response.status_code,
        ).inc()
        REQUEST_DURATION.labels(method=request.method, endpoint=endpoint).observe(duration)
        return response
# ========== SIMPLE MONITORING WITH CRON ==========
# monitoring.py - Run via cron every 5 minutes
import requests
import smtplib
from email.mime.text import MIMEText
def check_site_health():
    """Poll the health endpoint and send exactly one alert per failure.

    Alerts when the site is unreachable, returns a non-200 status, returns a
    body that is not JSON, or reports a status other than 'healthy'.
    """
    try:
        response = requests.get('https://yoursite.com/health/', timeout=10)
    except requests.RequestException as e:
        send_alert(f"Cannot reach site: {str(e)}")
        return

    if response.status_code != 200:
        send_alert(f"Site returned {response.status_code}")
        # Stop here: the body of an error response may not be JSON, and one
        # alert per failed check is enough.
        return

    try:
        data = response.json()
    except ValueError:
        # The site answered, so this is not a reachability problem.
        send_alert("Health check returned invalid JSON")
        return

    if data.get('status') != 'healthy':
        send_alert(f"Health check failed: {data.get('checks')}")
def send_alert(message):
    """Deliver ``message`` by email and to Slack, independently of each other.

    A failure of one channel is logged and does not prevent the other
    channel from being tried.
    """
    import logging

    log = logging.getLogger(__name__)

    # Send email
    msg = MIMEText(message)
    msg['Subject'] = 'ALERT: Site Health Check Failed'
    msg['From'] = 'alerts@yoursite.com'
    msg['To'] = 'admin@yoursite.com'
    try:
        with smtplib.SMTP('localhost') as server:
            server.send_message(msg)
    except OSError:  # includes smtplib.SMTPException and connection errors
        log.exception("Failed to send alert email")

    # Also send to Slack webhook
    import requests
    try:
        # Timeout so a hung webhook cannot stall the cron job indefinitely.
        requests.post('https://hooks.slack.com/...', json={'text': message}, timeout=10)
    except requests.RequestException:
        log.exception("Failed to send alert to Slack")
18.5 Performance Monitoring with New Relic or Datadog
For deeper insights, use APM tools. They show you exactly where your time is going.
# ========== NEW RELIC SETUP ==========
# Install: pip install newrelic
# Generate newrelic.ini
# newrelic-admin generate-config YOUR_LICENSE_KEY newrelic.ini
# Wrap your application
# newrelic-admin run-program gunicorn myproject.wsgi
# Or in code
import newrelic.agent
# Load the agent configuration from newrelic.ini
newrelic.agent.initialize('newrelic.ini')


# Add custom instrumentation
@newrelic.agent.function_trace()
def expensive_database_query():
    """Placeholder for a function traced by New Relic."""
    # This function will show up in New Relic dashboard
    return None
# Record custom events
# Record a custom 'PaymentProcessed' event with example attributes.
# `user` must be defined by the surrounding code (not shown in this snippet).
newrelic.agent.record_custom_event('PaymentProcessed', {
'amount': 99.99,
'customer_id': user.id,
'payment_method': 'credit_card'
})
# ========== DATADOG SETUP ==========
# Install: pip install ddtrace
# Run with ddtrace
# ddtrace-run gunicorn myproject.wsgi
# Configure in settings.py
# Datadog tracing settings: default service name and tags applied to traces.
# NOTE(review): presumably read by ddtrace's Django integration - confirm the
# setting name against the ddtrace version in use.
DATADOG_TRACE = {
'DEFAULT_SERVICE': 'myproject',
'TAGS': {'env': 'production', 'version': '1.0'},
}
# Add custom spans
from ddtrace import tracer
@tracer.wrap(name='process_payment', service='payment')
def process_payment(amount, customer_id):
    """Create a Transaction for ``amount`` inside a 'database.query' span.

    ``customer_id`` is accepted but not used by this function.
    """
    with tracer.trace('database.query') as db_span:
        db_span.set_tag('table', 'transactions')
        return Transaction.objects.create(amount=amount)
# ========== NGINX MONITORING ==========
# Enable stub_status in nginx.conf
server {
location /nginx_status {
stub_status on;
access_log off;
allow 127.0.0.1;
deny all;
}
}
# Check with curl
# curl http://localhost/nginx_status
"""
Active connections: 42
server accepts handled requests
12345 12345 23456
Reading: 0 Writing: 2 Waiting: 40
"""
# ========== ALERTING RULES ==========
# What to monitor and alert on:
"""
CRITICAL ALERTS (Page immediately):
- Error rate > 5% over 5 minutes
- Response time > 3 seconds over 5 minutes
- Database connection pool exhausted
- Disk space < 10%
- SSL certificate expires in < 7 days
WARNING ALERTS (Send to Slack/Email):
- CPU usage > 80% for 10 minutes
- Memory usage > 85%
- Slow queries > 1 second (more than 10 per minute)
- 404 rate > 100 per minute (potential attack)
- Failed logins > 50 per minute (brute force)
INFO ALERTS (Log only):
- Deployment successful
- Database migration applied
- Cache cleared
- New user registration spikes
"""
# ========== SIMPLE STATUS DASHBOARD ==========
# views.py
from django.contrib.admin.views.decorators import staff_member_required
def _count_error_lines(path='/var/log/django/errors.log'):
    """Return the number of lines containing 'ERROR' in ``path`` as a string.

    Returns 'N/A' when the file cannot be read.
    """
    try:
        with open(path, encoding='utf-8', errors='replace') as log_file:
            return str(sum(1 for line in log_file if 'ERROR' in line))
    except OSError:
        return 'N/A'


@staff_member_required
def admin_dashboard(request):
    """Simple admin dashboard with system status.

    Collects host metrics via psutil, the row count of ``blog_post``, a cache
    hit rate when the cache backend exposes one, and an error count from the
    error log, then renders ``admin/dashboard.html``.
    """
    # Local imports: this views snippet does not import these at module level.
    import time
    import psutil  # pip install psutil
    from django.shortcuts import render

    context = {
        # interval=1 blocks for one second while CPU usage is sampled.
        'cpu_percent': psutil.cpu_percent(interval=1),
        'memory_percent': psutil.virtual_memory().percent,
        'disk_percent': psutil.disk_usage('/').percent,
        'load_avg': psutil.getloadavg(),
        'uptime': time.time() - psutil.boot_time(),
    }

    # Database stats
    from django.db import connection
    with connection.cursor() as cursor:
        cursor.execute("SELECT count(*) FROM blog_post")
        context['post_count'] = cursor.fetchone()[0]

    # Cache stats
    from django.core.cache import cache
    context['cache_hit_rate'] = getattr(cache, 'hit_rate', 'N/A')

    # Recent errors (from log file). Counted in Python rather than through a
    # shell pipeline. The key name is kept for the template, but the count
    # covers the whole current log file, not a 24-hour window.
    context['error_count_24h'] = _count_error_lines()

    return render(request, 'admin/dashboard.html', context)
Monitoring Checklist
- ✅ Logging configured with rotation (daily or by size)
- ✅ Different log levels for different environments
- ✅ Sensitive data filtered from logs (passwords, tokens)
- ✅ Sentry (or similar) configured for error tracking
- ✅ Health check endpoint returning system status
- ✅ Uptime monitoring (Uptime Robot, Pingdom)
- ✅ Performance monitoring (New Relic, Datadog, or open source)
- ✅ Alerting configured for critical conditions
- ✅ Dashboard for quick status overview
- ✅ On-call rotation for after-hours alerts
- ✅ Runbook documented for common incidents
- ✅ Regular log review (daily or weekly)
Summary
Good monitoring is like insurance. You hope you never need it, but when disaster strikes, you're glad you have it. In this chapter, we covered:
- Configuring Django logging with multiple handlers and formatters
- Logging to files, console, and external services
- Error tracking with Sentry for real-time alerts
- Health check endpoints for load balancers and monitoring services
- Uptime monitoring with Uptime Robot
- Performance monitoring with New Relic and Datadog
In the next chapter, we'll build hands-on mini projects to apply everything you've learned.