feat: professionalize control plane and standalone delivery
This commit is contained in:
138
src/biliup_next/modules/song_detect/providers/codex.py
Normal file
138
src/biliup_next/modules/song_detect/providers/codex.py
Normal file
@ -0,0 +1,138 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from biliup_next.core.errors import ModuleError
|
||||
from biliup_next.core.models import Artifact, Task, utc_now_iso
|
||||
from biliup_next.core.providers import ProviderManifest
|
||||
from biliup_next.infra.adapters.codex_cli import CodexCliAdapter
|
||||
|
||||
# JSON type of every field a detected song must carry. Insertion order is
# significant: it fixes the order of both "properties" and "required" in the
# serialized schema.
_SONG_FIELD_TYPES = {
    "start": "string",
    "end": "string",
    "title": "string",
    "artist": "string",
    "confidence": "number",
    "evidence": "string",
}

# JSON Schema for the detector output: an object whose only key, "songs", is
# an array of song objects. Every song field is required and unknown keys are
# rejected at both levels.
SONG_SCHEMA = {
    "type": "object",
    "properties": {
        "songs": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    field: {"type": json_type}
                    for field, json_type in _SONG_FIELD_TYPES.items()
                },
                "required": list(_SONG_FIELD_TYPES),
                "additionalProperties": False,
            },
        }
    },
    "required": ["songs"],
    "additionalProperties": False,
}
|
||||
|
||||
TASK_PROMPT = """你是音乐片段识别助手。当前目录下有一个字幕文件。
|
||||
任务:
|
||||
1. 结合字幕内容并允许联网搜索进行纠错(识别同音字、唱错等)。
|
||||
2. 识别出直播中唱过的所有歌曲,给出精确的开始和结束时间。歌曲开始时间规则:
|
||||
- 歌曲开始时间应使用“上一句字幕的结束时间”作为 start_time。
|
||||
- 这样可以尽量保留歌曲可能存在的前奏。
|
||||
3. 同一首歌间隔 ≤160s 合并,>160s 分开。若连续识别出相同歌曲,且中间只有短暂对白、空白、转场或无歌词段,应合并为同一首歌.
|
||||
4. 忽略纯聊天片段。
|
||||
5. 无法确认的歌曲丢弃,宁缺毋滥:你的输出将直接面向最终用户。
|
||||
6. 忽略短片段:如果一段演唱持续时间总和少于 15 秒,视为随口哼唱,请直接忽略,不计入列表。
|
||||
7. 仔细分析每一句歌词,识别出相关歌曲后, 使用该歌曲歌词上下文对比字幕上下文,确定歌曲起始与停止时间
|
||||
8.歌曲标注规则:
|
||||
- 可以在歌曲名称后使用括号 () 添加补充说明。
|
||||
- 常见标注示例:
|
||||
- (片段):歌曲演唱时间较短,例如 < 60 秒
|
||||
- (清唱):无伴奏演唱
|
||||
- (副歌):只演唱副歌部分
|
||||
- 标注应简洁,仅在确有必要时使用。
|
||||
9. 通过歌曲起始和结束时间自检, 一般歌曲长度在5分钟以内, 1分钟以上, 可疑片段重新联网搜索检查.
|
||||
最后请严格按照 Schema 生成 JSON 数据。"""
|
||||
|
||||
|
||||
class CodexSongDetector:
    """Song-detection provider that delegates recognition to the Codex CLI.

    The detector writes the output JSON schema next to the subtitle file, runs
    the CLI through ``CodexCliAdapter.run_song_detect`` in that directory, and
    wraps the resulting ``songs.json`` / ``songs.txt`` files as artifacts.
    """

    def __init__(self, adapter: CodexCliAdapter | None = None) -> None:
        """Store the CLI adapter, creating a default ``CodexCliAdapter`` if none is given."""
        self.adapter = adapter or CodexCliAdapter()

    # Static provider metadata; the entrypoint string points back at this class.
    manifest = ProviderManifest(
        id="codex",
        name="Codex Song Detector",
        version="0.1.0",
        provider_type="song_detector",
        entrypoint="biliup_next.modules.song_detect.providers.codex:CodexSongDetector",
        capabilities=["song_detect"],
        enabled_by_default=True,
    )

    def detect(self, task: Task, subtitle_srt: Artifact, settings: dict[str, Any]) -> tuple[Artifact, Artifact]:
        """Run song detection in the subtitle's directory and return the outputs.

        Args:
            task: Task whose ``id`` is stamped onto the returned artifacts.
            subtitle_srt: Subtitle artifact; only its ``path`` is read, and the
                resolved parent directory becomes the working directory.
            settings: Provider settings; ``codex_cmd`` (default ``"codex"``) is
                passed to the adapter as the command to run.

        Returns:
            A ``(songs_json, songs_txt)`` pair of artifacts with resolved paths.

        Raises:
            ModuleError: ``SONG_DETECT_FAILED`` when the adapter reports a
                non-zero return code, ``SONG_DETECT_OUTPUT_MISSING`` when either
                output file is absent afterwards, or
                ``SONGS_TXT_GENERATE_FAILED`` from the ``songs.txt`` fallback.
        """
        work_dir = Path(subtitle_srt.path).resolve().parent
        schema_path = work_dir / "song_schema.json"
        songs_json_path = work_dir / "songs.json"
        songs_txt_path = work_dir / "songs.txt"
        schema_path.write_text(json.dumps(SONG_SCHEMA, ensure_ascii=False, indent=2), encoding="utf-8")

        codex_cmd = str(settings.get("codex_cmd", "codex"))
        result = self.adapter.run_song_detect(
            codex_cmd=codex_cmd,
            work_dir=work_dir,
            prompt=TASK_PROMPT,
        )

        if result.returncode != 0:
            raise ModuleError(
                code="SONG_DETECT_FAILED",
                message="codex exec 执行失败",
                retryable=True,
                # Only the tail of each stream is kept to bound the error payload.
                details={"stdout": result.stdout[-2000:], "stderr": result.stderr[-2000:]},
            )

        # songs.json alone is enough: the text listing can be derived from it.
        if songs_json_path.exists() and not songs_txt_path.exists():
            self._generate_txt_fallback(songs_json_path, songs_txt_path)

        if not songs_json_path.exists() or not songs_txt_path.exists():
            raise ModuleError(
                code="SONG_DETECT_OUTPUT_MISSING",
                message=f"未生成 songs.json/songs.txt: {work_dir}",
                retryable=True,
                details={"stdout": result.stdout[-2000:], "stderr": result.stderr[-2000:]},
            )

        return (
            Artifact(
                id=None,
                task_id=task.id,
                artifact_type="songs_json",
                path=str(songs_json_path.resolve()),
                metadata_json=json.dumps({"provider": "codex"}),
                created_at=utc_now_iso(),
            ),
            Artifact(
                id=None,
                task_id=task.id,
                artifact_type="songs_txt",
                path=str(songs_txt_path.resolve()),
                metadata_json=json.dumps({"provider": "codex"}),
                created_at=utc_now_iso(),
            ),
        )

    def _generate_txt_fallback(self, songs_json_path: Path, songs_txt_path: Path) -> None:
        """Derive ``songs.txt`` from ``songs.json``, one line per song.

        Each line is ``"<start> <title> — <artist>"`` where ``<start>`` is the
        song's ``start`` value with anything after the first ``,`` or ``.``
        removed.

        All lines are rendered in memory before the file is opened, so a
        malformed entry (for example a missing ``title``) cannot leave a
        truncated ``songs.txt`` behind. ``detect`` only calls this fallback
        when ``songs.txt`` is absent, so a partial file would otherwise be
        accepted as valid output on a later run.

        Args:
            songs_json_path: Existing JSON file with a top-level ``songs`` list.
            songs_txt_path: Destination text file, written as UTF-8.

        Raises:
            ModuleError: ``SONGS_TXT_GENERATE_FAILED`` (not retryable) on any
                failure to read, parse, render, or write.
        """
        try:
            data = json.loads(songs_json_path.read_text(encoding="utf-8"))
            lines = []
            for song in data.get("songs", []):
                start_time = str(song["start"]).split(",")[0].split(".")[0]
                lines.append(f"{start_time} {song['title']} — {song['artist']}\n")
            # Single write after successful rendering; nothing is created on error above.
            songs_txt_path.write_text("".join(lines), encoding="utf-8")
        except Exception as exc:  # noqa: BLE001
            raise ModuleError(
                code="SONGS_TXT_GENERATE_FAILED",
                message=f"生成 songs.txt 失败: {songs_txt_path}",
                retryable=False,
                details={"error": str(exc)},
            ) from exc
|
||||
Reference in New Issue
Block a user