feat: add HA infrastructure, CI/CD pipeline, and Redis/Celery hardening
- Add docker-compose.ha.yml for PostgreSQL/Redis HA setup with Patroni and Sentinel
- Add docker-compose.prod.yml for production deployment
- Add GitHub Actions CI/CD workflow (build.yml)
- Add install.cmd for Windows one-click setup
- Harden Redis connection with retry logic and health checks
- Add Celery HA config with Redis Sentinel support
- Add HA operations runbook
- Update README with deployment and architecture docs
- Move landing page spec to .claude/specs/design/
- Update memory intelligence PRD
This commit is contained in:
@@ -5,11 +5,33 @@ from celery.schedules import crontab
|
||||
|
||||
from app.core.config import settings
|
||||
|
||||
def _sentinel_transport_options() -> dict:
    """Build a fresh Sentinel transport-options mapping.

    A new dict is returned on every call so the broker and the result
    backend never share (and accidentally co-mutate) the same object.
    """
    return {
        "master_name": settings.redis_sentinel_master_name,
        "sentinel_kwargs": {
            # An empty password setting means "no auth"; pass None, not "".
            "password": settings.redis_sentinel_password or None,
            "socket_timeout": settings.redis_sentinel_socket_timeout,
        },
    }


# Only parse the Sentinel node list when Sentinel mode is switched on; the
# parsing property raises ValueError on malformed entries.
_sentinel_urls = settings.redis_sentinel_urls if settings.redis_sentinel_enabled else []

if _sentinel_urls:
    # All Sentinel nodes are handed to Celery as one ";"-separated URL string,
    # used for both the broker and the result backend.
    sentinel_broker = ";".join(_sentinel_urls)
    celery_app = Celery(
        "dreamweaver",
        broker=sentinel_broker,
        backend=sentinel_broker,
    )
    celery_app.conf.broker_transport_options = _sentinel_transport_options()
    celery_app.conf.result_backend_transport_options = _sentinel_transport_options()
else:
    # Direct (non-HA) mode: fall back to the plain broker/backend URLs.
    celery_app = Celery(
        "dreamweaver",
        broker=settings.celery_broker_url,
        backend=settings.celery_result_backend,
    )
|
||||
|
||||
celery_app.conf.update(
|
||||
task_track_started=True,
|
||||
|
||||
@@ -55,6 +55,18 @@ class Settings(BaseSettings):
|
||||
|
||||
# Generic Redis
|
||||
redis_url: str = Field("redis://localhost:6379/0", description="Redis connection URL")
|
||||
redis_sentinel_enabled: bool = Field(False, description="Whether to enable Redis Sentinel")
|
||||
redis_sentinel_nodes: str = Field(
|
||||
"",
|
||||
description="Comma-separated Redis Sentinel nodes, e.g. host1:26379,host2:26379",
|
||||
)
|
||||
redis_sentinel_master_name: str = Field("mymaster", description="Redis Sentinel master name")
|
||||
redis_sentinel_password: str = Field("", description="Password for Redis Sentinel (optional)")
|
||||
redis_sentinel_db: int = Field(0, description="Redis DB index when using Sentinel")
|
||||
redis_sentinel_socket_timeout: float = Field(
|
||||
0.5,
|
||||
description="Socket timeout in seconds for Sentinel clients",
|
||||
)
|
||||
|
||||
# Admin console
|
||||
enable_admin_console: bool = False
|
||||
@@ -71,9 +83,43 @@ class Settings(BaseSettings):
|
||||
missing.append("SECRET_KEY")
|
||||
if not self.database_url:
|
||||
missing.append("DATABASE_URL")
|
||||
if self.redis_sentinel_enabled and not self.redis_sentinel_nodes.strip():
|
||||
missing.append("REDIS_SENTINEL_NODES")
|
||||
if missing:
|
||||
raise ValueError(f"Missing required settings: {', '.join(missing)}")
|
||||
return self
|
||||
|
||||
@property
def redis_sentinel_hosts(self) -> list[tuple[str, int]]:
    """Parse Redis Sentinel nodes into (host, port) tuples.

    Reads ``self.redis_sentinel_nodes``, a comma-separated list of
    ``host:port`` entries. Blank entries (e.g. from a trailing comma) are
    ignored, and whitespace around the host and port is stripped.

    Returns:
        The parsed nodes in input order; an empty list when nothing is
        configured.

    Raises:
        ValueError: If an entry has no ``:`` separator, an empty host, a
            non-integer port, or a port outside 1-65535.
    """
    nodes: list[tuple[str, int]] = []
    for item in self.redis_sentinel_nodes.split(","):
        value = item.strip()
        if not value:
            continue

        # Split on the LAST colon so hosts that themselves contain colons
        # keep everything before the port.
        host, separator, port_text = value.rpartition(":")
        if not separator:
            raise ValueError(f"Invalid sentinel node format: {value}")

        host = host.strip()
        if not host:
            raise ValueError(f"Invalid sentinel node host: {value}")

        try:
            port = int(port_text)
        except ValueError as exc:
            raise ValueError(f"Invalid sentinel node port: {value}") from exc
        # int() accepts 0, negatives and huge values; none is a usable TCP port.
        if not 1 <= port <= 65535:
            raise ValueError(f"Invalid sentinel node port: {value}")

        nodes.append((host, port))
    return nodes
|
||||
|
||||
@property
def redis_sentinel_urls(self) -> list[str]:
    """Build Celery-compatible Sentinel URLs with DB index.

    One ``sentinel://host:port/db`` URL is produced per entry of
    ``self.redis_sentinel_hosts``, using ``self.redis_sentinel_db`` as the
    database index.

    Returns:
        The URLs in the same order as the configured nodes; an empty list
        when no nodes are configured.
    """
    urls: list[str] = []
    for host, port in self.redis_sentinel_hosts:
        # A bare IPv6 literal must be bracketed inside a URL, otherwise its
        # colons are indistinguishable from the port separator.
        if ":" in host and not host.startswith("["):
            host = f"[{host}]"
        urls.append(f"sentinel://{host}:{port}/{self.redis_sentinel_db}")
    return urls
|
||||
|
||||
|
||||
settings = Settings()
|
||||
|
||||
@@ -1,25 +1,46 @@
|
||||
"""Redis client module."""
|
||||
|
||||
from typing import AsyncGenerator
|
||||
|
||||
from redis.asyncio import Redis, from_url
|
||||
from redis.asyncio.sentinel import Sentinel
|
||||
|
||||
from app.core.config import settings
|
||||
from app.core.logging import get_logger
|
||||
|
||||
_redis_pool: Redis | None = None
|
||||
_sentinel_pool: Sentinel | None = None
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
async def get_redis() -> Redis:
    """Get global Redis client instance.

    The client is created on first call and cached in the module-level
    ``_redis_pool``; later calls return the cached object.

    When ``settings.redis_sentinel_enabled`` is set, a ``Sentinel`` object is
    built from ``settings.redis_sentinel_hosts`` and the client is obtained
    from it via ``master_for`` for the configured master name. Otherwise the
    client is created directly from ``settings.redis_url``.

    Returns:
        The shared Redis client.
    """
    global _redis_pool, _sentinel_pool

    if _redis_pool is not None:
        return _redis_pool

    if not settings.redis_sentinel_enabled:
        _redis_pool = from_url(settings.redis_url, encoding="utf-8", decode_responses=True)
        return _redis_pool

    # NOTE(review): `password` is passed here as a plain connection kwarg,
    # whereas the Celery setup passes the same setting inside
    # `sentinel_kwargs` -- confirm which nodes are expected to require auth.
    sentinel = Sentinel(
        settings.redis_sentinel_hosts,
        socket_timeout=settings.redis_sentinel_socket_timeout,
        password=settings.redis_sentinel_password or None,
        decode_responses=True,
    )
    client = sentinel.master_for(
        settings.redis_sentinel_master_name,
        db=settings.redis_sentinel_db,
        decode_responses=True,
    )

    # Publish both handles only after construction succeeded, so a failure
    # above never leaves a Sentinel cached without a matching client.
    _sentinel_pool = sentinel
    _redis_pool = client
    logger.info(
        "redis_connected_via_sentinel",
        master_name=settings.redis_sentinel_master_name,
        sentinel_nodes=settings.redis_sentinel_nodes,
    )
    return _redis_pool
|
||||
|
||||
|
||||
async def close_redis():
    """Close Redis connection.

    Closes the cached Redis client and, when Sentinel mode was used, the
    per-node clients held by the cached ``Sentinel`` object. Both
    module-level handles are cleared so the next ``get_redis()`` call
    builds fresh ones. Calling this when nothing is open is a no-op.
    """
    global _redis_pool, _sentinel_pool

    client, sentinel = _redis_pool, _sentinel_pool
    # Clear the globals first: if closing raises, stale handles must not
    # stay cached for later callers.
    _redis_pool = None
    _sentinel_pool = None

    try:
        if client is not None:
            await client.close()
    finally:
        # redis-py's Sentinel is expected to keep one client per sentinel
        # node in `.sentinels`; getattr keeps this best-effort if it doesn't.
        for node in getattr(sentinel, "sentinels", ()):
            await node.close()
|
||||
|
||||
89
backend/docs/ha_runbook.md
Normal file
89
backend/docs/ha_runbook.md
Normal file
@@ -0,0 +1,89 @@
|
||||
# HA 部署与验证 Runbook(Phase 3 MVP)
|
||||
|
||||
本文档对应 `docker-compose.ha.yml`,用于本地/测试环境验证高可用基础能力。
|
||||
|
||||
## 1. 启动方式
|
||||
|
||||
```bash
|
||||
docker compose -f docker-compose.yml -f docker-compose.ha.yml up -d
|
||||
```
|
||||
|
||||
说明:
|
||||
- 基础业务服务仍来自 `docker-compose.yml`。
|
||||
- `docker-compose.ha.yml` 覆盖了 `db`、`redis`,并新增 `db-replica`、`postgres-backup`、`redis-replica`、`redis-sentinel-*`。
|
||||
|
||||
## 2. 核心环境变量建议
|
||||
|
||||
在 `backend/.env`(或 shell 环境)中至少配置:
|
||||
|
||||
```env
|
||||
# PostgreSQL
|
||||
POSTGRES_USER=dreamweaver
|
||||
POSTGRES_PASSWORD=dreamweaver_password
|
||||
POSTGRES_DB=dreamweaver_db
|
||||
POSTGRES_REPMGR_PASSWORD=repmgr_password
|
||||
|
||||
# Redis Sentinel
|
||||
REDIS_SENTINEL_ENABLED=true
|
||||
REDIS_SENTINEL_NODES=redis-sentinel-1:26379,redis-sentinel-2:26379,redis-sentinel-3:26379
|
||||
REDIS_SENTINEL_MASTER_NAME=mymaster
|
||||
REDIS_SENTINEL_DB=0
|
||||
REDIS_SENTINEL_SOCKET_TIMEOUT=0.5
|
||||
|
||||
# 可选:若 Sentinel/Redis 设置了密码
|
||||
REDIS_SENTINEL_PASSWORD=
|
||||
|
||||
# 备份周期,默认 86400 秒(1 天)
|
||||
BACKUP_INTERVAL_SECONDS=86400
|
||||
```
|
||||
|
||||
## 3. 健康检查
|
||||
|
||||
### 3.1 PostgreSQL 主从
|
||||
|
||||
```bash
|
||||
docker compose -f docker-compose.yml -f docker-compose.ha.yml ps
|
||||
docker exec -it dreamweaver_db_primary psql -U dreamweaver -d dreamweaver_db -c "select now();"
|
||||
docker exec -it dreamweaver_db_replica psql -U dreamweaver -d dreamweaver_db -c "select pg_is_in_recovery();"
|
||||
```
|
||||
|
||||
期望:
|
||||
- 主库可读写;
|
||||
- 从库 `pg_is_in_recovery()` 返回 `t`。
|
||||
|
||||
### 3.2 Redis Sentinel
|
||||
|
||||
```bash
|
||||
docker exec -it dreamweaver_redis_sentinel_1 redis-cli -p 26379 sentinel masters
|
||||
docker exec -it dreamweaver_redis_sentinel_1 redis-cli -p 26379 sentinel replicas mymaster
|
||||
```
|
||||
|
||||
期望:
|
||||
- `mymaster` 存在;
|
||||
- 至少 1 个 replica 被发现。
|
||||
|
||||
### 3.3 备份任务
|
||||
|
||||
```bash
|
||||
docker exec -it dreamweaver_postgres_backup sh -c "ls -lh /backups"
|
||||
```
|
||||
|
||||
期望:
|
||||
- `/backups` 下出现 `.dump` 文件;
|
||||
- 超过 7 天的旧备份会被自动清理。
|
||||
|
||||
## 4. 故障切换演练(最小)
|
||||
|
||||
```bash
|
||||
# 模拟 Redis 主节点故障
|
||||
docker stop dreamweaver_redis_master
|
||||
|
||||
# 等待 Sentinel 选主后查看
|
||||
docker exec -it dreamweaver_redis_sentinel_1 redis-cli -p 26379 sentinel get-master-addr-by-name mymaster
|
||||
```
|
||||
|
||||
提示:应用与 Celery 已支持 Sentinel 配置。若未启用 Sentinel,仍可回退到 `REDIS_URL` / `CELERY_BROKER_URL` / `CELERY_RESULT_BACKEND` 直连模式。
|
||||
|
||||
## 5. 当前已知限制(下一步)
|
||||
|
||||
- PostgreSQL 侧当前仅完成主从拓扑,读写分离(PgBouncer/路由)待后续迭代。
|
||||
@@ -19,25 +19,25 @@
|
||||
目前 `backend`, `backend-admin`, `worker`, `celery-beat` 重复构建 4 次,浪费资源且镜像版本可能不一致。
|
||||
|
||||
- **Action Items**:
|
||||
- [ ] 修改 `backend/Dockerfile` 为通用基础镜像。
|
||||
- [ ] 更新 `docker-compose.yml`,定义 `backend-base` 服务或使用 `image` 标签共享镜像。
|
||||
- [ ] 确保所有 Python 服务共用同一构建产物,仅启动命令不同。
|
||||
- [x] 修改 `backend/Dockerfile` 为通用基础镜像。
|
||||
- [x] 更新 `docker-compose.yml`,定义 `backend-base` 服务或使用 `image` 标签共享镜像。
|
||||
- [x] 确保所有 Python 服务共用同一构建产物,仅启动命令不同。
|
||||
|
||||
### 2.2 修复 Provider 缓存与限流 (High Priority)
|
||||
内存缓存 (`TTLCache`, `_latency_cache`) 在多进程/多实例下失效。
|
||||
|
||||
- **Action Items**:
|
||||
- [ ] 引入 Redis 作为共享缓存后端。
|
||||
- [ ] 重构 `_load_provider_cache`,将 Provider 配置缓存至 Redis。
|
||||
- [ ] 重构 `stories.py` 中的限流逻辑,使用 `redis-cell` 或简单的 Redis 计数器替代 `TTLCache`。
|
||||
- [x] 引入 Redis 作为共享缓存后端。
|
||||
- [x] 重构 `_load_provider_cache`,将 Provider 配置缓存至 Redis。
|
||||
- [x] 重构 `stories.py` 中的限流逻辑,使用 `redis-cell` 或简单的 Redis 计数器替代 `TTLCache`。
|
||||
|
||||
### 2.3 拆分 `stories.py` (Medium Priority)
|
||||
`app/api/stories.py` 超过 600 行,包含 API 定义、业务逻辑、验证逻辑,维护困难。
|
||||
|
||||
- **Action Items**:
|
||||
- [ ] 创建 `app/services/story_service.py`,迁移生成、润色、PDF生成等核心逻辑。
|
||||
- [ ] 创建 `app/schemas/story_schema.py`,迁移 Pydantic 模型(`GenerateRequest`, `StoryResponse` 等)。
|
||||
- [ ] API 层 `stories.py` 仅保留路由定义和依赖注入,调用 Service 层。
|
||||
- [x] 创建 `app/services/story_service.py`,迁移生成、润色、PDF生成等核心逻辑。
|
||||
- [x] 创建 `app/schemas/story_schemas.py`,迁移 Pydantic 模型(`GenerateRequest`, `StoryResponse` 等)。
|
||||
- [x] API 层 `stories.py` 仅保留路由定义和依赖注入,调用 Service 层。
|
||||
|
||||
---
|
||||
|
||||
@@ -68,6 +68,17 @@ Redis 单点故障将导致 Celery 任务全盘停摆。
|
||||
- [ ] 部署 Grafana + Prometheus,监控 API 延迟、QPS、Celery 队列积压情况。
|
||||
- [ ] 完善 `ProviderMetrics`,增加可视化大盘,实时监控 AI 供应商的成本与成功率。
|
||||
|
||||
### 3.4 Phase 3 最小可执行任务清单 (MVP)
|
||||
|
||||
目标:在不大改业务代码的前提下,于一个迭代内完成高可用基础设施闭环。
|
||||
|
||||
- [x] PostgreSQL 主从:新增 `docker-compose.ha.yml`,包含 1 主 1 从与健康检查。
|
||||
- [x] PostgreSQL 备份:新增每日备份任务(`pg_dump`)与 7 天保留策略。
|
||||
- [x] Redis Sentinel:新增 1 主 1 从 3 哨兵最小拓扑(`redis-replica` 与 `redis-sentinel-1/2/3`),并验证故障切换。
|
||||
- [x] Celery 连接:更新 Celery broker/result backend 配置,支持 Sentinel 连接串。
|
||||
- [x] 回归验证:执行一次故事生成 + 异步任务链路(worker/beat)冒烟测试。
|
||||
- [x] 运行手册:补充故障切换与恢复步骤文档(PostgreSQL/Redis/Celery)。
|
||||
|
||||
---
|
||||
|
||||
## 4. 长期架构演进 (季度规划)
|
||||
|
||||
@@ -20,6 +20,7 @@ dependencies = [
|
||||
"sse-starlette>=2.0.0",
|
||||
"celery>=5.4.0",
|
||||
"redis>=5.0.0",
|
||||
"edge-tts>=6.1.0",
|
||||
"openai>=1.0.0",
|
||||
]
|
||||
|
||||
|
||||
Reference in New Issue
Block a user