feat: professionalize control plane and standalone delivery
This commit is contained in:
@ -0,0 +1,151 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import random
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from biliup_next.core.errors import ModuleError
|
||||
from biliup_next.core.models import Task
|
||||
from biliup_next.core.providers import ProviderManifest
|
||||
from biliup_next.infra.adapters.bilibili_api import BilibiliApiAdapter
|
||||
from biliup_next.infra.adapters.full_video_locator import resolve_full_video_bvid
|
||||
|
||||
|
||||
class BilibiliCollectionProvider:
    """Adds an uploaded video to a Bilibili collection (season/section).

    Two targets are supported:
      * ``"a"`` — the full-video collection; the BV id is resolved via
        ``resolve_full_video_bvid``.
      * any other value ("b") — the clip collection; the BV id is read from
        ``bvid.txt`` in the task's session directory.

    Idempotency is tracked with ``collection_{a,b}_done.flag`` marker files
    inside the session directory.
    """

    def __init__(self, bilibili_api: BilibiliApiAdapter | None = None) -> None:
        self.bilibili_api = bilibili_api or BilibiliApiAdapter()
        # season_id -> first section id, or None when the lookup found nothing.
        self._section_cache: dict[int, int | None] = {}

    # Static descriptor used by the provider registry to discover this class.
    manifest = ProviderManifest(
        id="bilibili_collection",
        name="Bilibili Collection Provider",
        version="0.1.0",
        provider_type="collection_provider",
        entrypoint="biliup_next.modules.collection.providers.bilibili_collection:BilibiliCollectionProvider",
        capabilities=["collection"],
        enabled_by_default=True,
    )

    def sync(self, task: Task, target: str, settings: dict[str, Any]) -> dict[str, object]:
        """Add the task's video to the collection selected by ``target``.

        Returns a status dict ({"status": "added" | "already_exists" |
        "skipped", ...}).

        Raises:
            ModuleError: COOKIE_CSRF_MISSING, COLLECTION_BVID_MISSING,
                COLLECTION_SECTION_NOT_FOUND, or COLLECTION_ADD_FAILED.
        """
        session_dir = Path(str(settings["session_dir"])) / task.title
        cookies = self.bilibili_api.load_cookies(Path(str(settings["cookies_file"])))
        # bili_jct is the CSRF token required by Bilibili's write APIs.
        csrf = cookies.get("bili_jct")
        if not csrf:
            raise ModuleError(code="COOKIE_CSRF_MISSING", message="Cookie 缺少 bili_jct", retryable=False)

        session = self.bilibili_api.build_session(
            cookies=cookies,
            referer="https://member.bilibili.com/platform/upload-manager/distribution",
        )

        if target == "a":
            season_id = int(settings["season_id_a"])
            bvid = resolve_full_video_bvid(task.title, session_dir, settings)
            if not bvid:
                # No full video to link: mark done so the task is not retried forever.
                (session_dir / "collection_a_done.flag").touch()
                return {"status": "skipped", "reason": "full_video_bvid_not_found"}
            flag_path = session_dir / "collection_a_done.flag"
        else:
            season_id = int(settings["season_id_b"])
            bvid_path = session_dir / "bvid.txt"
            if not bvid_path.exists():
                raise ModuleError(code="COLLECTION_BVID_MISSING", message=f"缺少 bvid.txt: {session_dir}", retryable=True)
            bvid = bvid_path.read_text(encoding="utf-8").strip()
            flag_path = session_dir / "collection_b_done.flag"

        # A non-positive season id disables the collection for this target.
        if season_id <= 0:
            flag_path.touch()
            return {"status": "skipped", "reason": "season_disabled"}

        section_id = self._resolve_section_id(session, season_id)
        if not section_id:
            raise ModuleError(code="COLLECTION_SECTION_NOT_FOUND", message=f"未找到合集 section: {season_id}", retryable=True)

        info = self._get_video_info(session, bvid)
        add_result = self._add_videos_batch(session, csrf, section_id, [info])
        if add_result["status"] == "failed":
            raise ModuleError(
                code="COLLECTION_ADD_FAILED",
                message=str(add_result["message"]),
                retryable=True,
                details=add_result,
            )

        flag_path.touch()
        if add_result["status"] == "added":
            # Optionally reorder so the newly added video sits at the end.
            append_key = "append_collection_a_new_to_end" if target == "a" else "append_collection_b_new_to_end"
            if settings.get(append_key, True):
                self._move_videos_to_section_end(session, csrf, section_id, [int(info["aid"])])
        return {"status": add_result["status"], "target": target, "bvid": bvid, "season_id": season_id}

    def _resolve_section_id(self, session, season_id: int) -> int | None:  # type: ignore[no-untyped-def]
        """Return the first section id of the season, caching the result.

        Negative lookups (API error excepted) are cached as None as well.
        """
        if season_id in self._section_cache:
            return self._section_cache[season_id]
        result = self.bilibili_api.list_seasons(session)
        # API errors are not cached, so a later call can retry the listing.
        if result.get("code") != 0:
            return None
        for season in result.get("data", {}).get("seasons", []):
            if season.get("season", {}).get("id") == season_id:
                sections = season.get("sections", {}).get("sections", [])
                # Videos are always added to the season's first section.
                section_id = sections[0]["id"] if sections else None
                self._section_cache[season_id] = section_id
                return section_id
        self._section_cache[season_id] = None
        return None

    def _get_video_info(self, session, bvid: str) -> dict[str, object]:  # type: ignore[no-untyped-def]
        """Fetch aid/cid/title for ``bvid``, shaped as a section episode entry."""
        data = self.bilibili_api.get_video_view(
            session,
            bvid,
            error_code="COLLECTION_VIDEO_INFO_FAILED",
            error_message="获取视频信息失败",
        )
        return {"aid": data["aid"], "cid": data["cid"], "title": data["title"], "charging_pay": 0}

    def _add_videos_batch(self, session, csrf: str, section_id: int, episodes: list[dict[str, object]]) -> dict[str, object]:  # type: ignore[no-untyped-def]
        """Add ``episodes`` to the section; map API codes to a status dict.

        Returns {"status": "added"} on success, "already_exists" for API
        code 20080, otherwise "failed" with message/code.
        """
        # Random pause before the write call — presumably rate-limit avoidance; confirm.
        time.sleep(random.uniform(5.0, 10.0))
        result = self.bilibili_api.add_section_episodes(
            session,
            csrf=csrf,
            section_id=section_id,
            episodes=episodes,
        )
        if result.get("code") == 0:
            return {"status": "added"}
        # 20080: episode already present in the section — treated as success.
        if result.get("code") == 20080:
            return {"status": "already_exists", "message": result.get("message", "")}
        return {"status": "failed", "message": result.get("message", "unknown error"), "code": result.get("code")}

    def _move_videos_to_section_end(self, session, csrf: str, section_id: int, added_aids: list[int]) -> bool:  # type: ignore[no-untyped-def]
        """Reorder the section so episodes with ``added_aids`` come last.

        Returns True when the reorder succeeded (or there was nothing to do),
        False when either API call failed.
        """
        detail = self.bilibili_api.get_section_detail(session, section_id=section_id)
        if detail.get("code") != 0:
            return False
        section = detail.get("data", {}).get("section", {})
        episodes = detail.get("data", {}).get("episodes", []) or []
        if not episodes:
            return True
        target_aids = {int(aid) for aid in added_aids}
        # Partition into untouched episodes (keep order) and the ones to push to the end.
        existing = []
        appended = []
        for episode in episodes:
            item = {"id": episode.get("id")}
            if item["id"] is None:
                continue
            if episode.get("aid") in target_aids:
                appended.append(item)
            else:
                existing.append(item)
        ordered = existing + appended
        payload = {
            "section": {
                "id": section["id"],
                "seasonId": section["seasonId"],
                "title": section["title"],
                "type": section["type"],
            },
            # 1-based sort positions in the new order.
            "sorts": [{"id": item["id"], "sort": index + 1} for index, item in enumerate(ordered)],
        }
        result = self.bilibili_api.edit_section(session, csrf=csrf, payload=payload)
        return result.get("code") == 0
||||
@ -0,0 +1,161 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from biliup_next.core.errors import ModuleError
|
||||
from biliup_next.core.models import Task
|
||||
from biliup_next.core.providers import ProviderManifest
|
||||
from biliup_next.infra.adapters.bilibili_api import BilibiliApiAdapter
|
||||
from biliup_next.infra.adapters.full_video_locator import resolve_full_video_bvid
|
||||
|
||||
|
||||
class BilibiliTopCommentProvider:
    """Posts and pins song-list comments on Bilibili uploads.

    Two comments are managed per task: a numbered song list on the split
    (clip) upload, and the raw timeline from ``songs.txt`` on the full video.
    Per-target flag files under the task's session directory make repeated
    runs idempotent.
    """

    def __init__(self, bilibili_api: BilibiliApiAdapter | None = None) -> None:
        self.bilibili_api = bilibili_api or BilibiliApiAdapter()

    # Static descriptor used by the provider registry to discover this class.
    manifest = ProviderManifest(
        id="bilibili_top_comment",
        name="Bilibili Top Comment Provider",
        version="0.1.0",
        provider_type="comment_provider",
        entrypoint="biliup_next.modules.comment.providers.bilibili_top_comment:BilibiliTopCommentProvider",
        capabilities=["comment"],
        enabled_by_default=True,
    )

    def comment(self, task: Task, settings: dict[str, Any]) -> dict[str, object]:
        """Post/pin the split and full-video comments for ``task``.

        Returns {"status": ..., "split": ..., "full": ...} where each side is
        either the post result or a skip record with a reason.

        Raises:
            ModuleError: COMMENT_INPUT_MISSING when songs.txt/bvid.txt are
                absent; COOKIE_CSRF_MISSING when the cookie lacks bili_jct.
        """
        session_dir = Path(str(settings["session_dir"])) / task.title
        songs_path = session_dir / "songs.txt"
        songs_json_path = session_dir / "songs.json"
        bvid_path = session_dir / "bvid.txt"
        if not songs_path.exists() or not bvid_path.exists():
            raise ModuleError(
                code="COMMENT_INPUT_MISSING",
                message=f"缺少评论所需文件: {session_dir}",
                retryable=True,
            )

        timeline_content = songs_path.read_text(encoding="utf-8").strip()
        split_content = self._build_split_comment_content(songs_json_path, songs_path)
        if not timeline_content and not split_content:
            # Nothing to post at all: mark both sides done and bail out.
            self._touch_comment_flags(session_dir, split_done=True, full_done=True)
            return {"status": "skipped", "reason": "comment_content_empty"}

        cookies = self.bilibili_api.load_cookies(Path(str(settings["cookies_file"])))
        # bili_jct is the CSRF token required by Bilibili's write APIs.
        csrf = cookies.get("bili_jct")
        if not csrf:
            raise ModuleError(code="COOKIE_CSRF_MISSING", message="Cookie 缺少 bili_jct", retryable=False)

        session = self.bilibili_api.build_session(
            cookies=cookies,
            referer="https://www.bilibili.com/",
            origin="https://www.bilibili.com",
        )

        split_result = {"status": "skipped", "reason": "disabled"}
        full_result = {"status": "skipped", "reason": "disabled"}
        # Flag files record which side already completed in a previous run.
        split_done = (session_dir / "comment_split_done.flag").exists()
        full_done = (session_dir / "comment_full_done.flag").exists()

        if settings.get("post_split_comment", True) and not split_done:
            split_bvid = bvid_path.read_text(encoding="utf-8").strip()
            if split_content:
                split_result = self._post_and_top_comment(session, csrf, split_bvid, split_content, "split")
            else:
                split_result = {"status": "skipped", "reason": "split_comment_empty"}
            split_done = True
            (session_dir / "comment_split_done.flag").touch()
        elif not split_done:
            # Feature disabled: still flag as done so the task can finish.
            split_done = True
            (session_dir / "comment_split_done.flag").touch()

        if settings.get("post_full_video_timeline_comment", True) and not full_done:
            full_bvid = resolve_full_video_bvid(task.title, session_dir, settings)
            if full_bvid and timeline_content:
                full_result = self._post_and_top_comment(session, csrf, full_bvid, timeline_content, "full")
            else:
                reason = "full_video_bvid_not_found" if not full_bvid else "timeline_comment_empty"
                full_result = {"status": "skipped", "reason": reason}
            full_done = True
            (session_dir / "comment_full_done.flag").touch()
        elif not full_done:
            # Feature disabled: still flag as done so the task can finish.
            full_done = True
            (session_dir / "comment_full_done.flag").touch()

        if split_done and full_done:
            (session_dir / "comment_done.flag").touch()
        return {"status": "ok", "split": split_result, "full": full_result}

    def _post_and_top_comment(
        self,
        session,
        csrf: str,
        bvid: str,
        content: str,
        target: str,
    ) -> dict[str, object]:
        """Post ``content`` as a comment on ``bvid`` and pin it to the top.

        ``target`` ("split"/"full") only feeds the error messages.
        """
        view = self.bilibili_api.get_video_view(
            session,
            bvid,
            error_code="COMMENT_VIEW_FAILED",
            error_message=f"获取{target}视频信息失败",
        )
        aid = int(view["aid"])
        add_res = self.bilibili_api.add_reply(
            session,
            csrf=csrf,
            aid=aid,
            content=content,
            error_message=f"发布{target}评论失败",
        )
        rpid = int(add_res["rpid"])
        # Pause between posting and pinning — presumably to let the reply
        # propagate / avoid rate limits; confirm against the API behavior.
        time.sleep(3)
        self.bilibili_api.top_reply(
            session,
            csrf=csrf,
            aid=aid,
            rpid=rpid,
            error_message=f"置顶{target}评论失败",
        )
        return {"status": "ok", "bvid": bvid, "aid": aid, "rpid": rpid}

    @staticmethod
    def _build_split_comment_content(songs_json_path: Path, songs_txt_path: Path) -> str:
        """Build the numbered song-list comment for the split upload.

        Prefers songs.json ({"songs": [{"title", "artist"}, ...]}); falls back
        to songs.txt, where a leading timestamp-style first token (detected by
        a ":" in it) is stripped from each line. Returns "" when neither file
        yields content.

        NOTE(review): skipped entries (untitled songs, blank lines) still
        consume their enumeration index, so the visible numbering can have
        gaps — confirm whether that is intentional.
        """
        if songs_json_path.exists():
            try:
                data = json.loads(songs_json_path.read_text(encoding="utf-8"))
                lines = []
                for index, song in enumerate(data.get("songs", []), 1):
                    title = str(song.get("title", "")).strip()
                    artist = str(song.get("artist", "")).strip()
                    if not title:
                        continue
                    suffix = f" — {artist}" if artist else ""
                    lines.append(f"{index}. {title}{suffix}")
                if lines:
                    return "\n".join(lines)
            except json.JSONDecodeError:
                # Malformed JSON: silently fall through to the txt fallback.
                pass
        if songs_txt_path.exists():
            lines = []
            for index, raw in enumerate(songs_txt_path.read_text(encoding="utf-8").splitlines(), 1):
                text = raw.strip()
                if not text:
                    continue
                parts = text.split(" ", 1)
                # Drop a leading "HH:MM:SS"-like token, keep the rest of the line.
                song_text = parts[1] if len(parts) == 2 and ":" in parts[0] else text
                lines.append(f"{index}. {song_text}")
            return "\n".join(lines)
        return ""

    @staticmethod
    def _touch_comment_flags(session_dir: Path, *, split_done: bool, full_done: bool) -> None:
        """Create the per-target done flags, plus the overall flag when both are set."""
        if split_done:
            (session_dir / "comment_split_done.flag").touch()
        if full_done:
            (session_dir / "comment_full_done.flag").touch()
        if split_done and full_done:
            (session_dir / "comment_done.flag").touch()
||||
@ -1,26 +1,51 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import time
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
from zoneinfo import ZoneInfo
|
||||
|
||||
from biliup_next.core.errors import ModuleError
|
||||
from biliup_next.core.models import Artifact, Task, TaskStep, utc_now_iso
|
||||
from biliup_next.core.models import Artifact, Task, TaskContext, TaskStep, utc_now_iso
|
||||
from biliup_next.core.registry import Registry
|
||||
from biliup_next.infra.task_repository import TaskRepository
|
||||
|
||||
# All title-derived timestamps are interpreted in China Standard Time.
SHANGHAI_TZ = ZoneInfo("Asia/Shanghai")
# Matches recording titles like "<streamer> 03月05日 21时30分" and captures
# the streamer name plus the (yearless) local month/day/hour/minute.
TITLE_PATTERN = re.compile(
    r"^(?P<streamer>.+?)\s+(?P<month>\d{2})月(?P<day>\d{2})日\s+(?P<hour>\d{2})时(?P<minute>\d{2})分"
)
|
||||
|
||||
|
||||
class IngestService:
    """Creates tasks from staged recording files and infers session context."""

    def __init__(self, registry: Registry, repo: TaskRepository):
        # Provider registry (resolves ingest providers by id) and task store.
        self.registry = registry
        self.repo = repo
|
||||
|
||||
def create_task_from_file(self, source_path: Path, settings: dict[str, object]) -> Task:
|
||||
def create_task_from_file(
|
||||
self,
|
||||
source_path: Path,
|
||||
settings: dict[str, object],
|
||||
*,
|
||||
context_payload: dict[str, object] | None = None,
|
||||
) -> Task:
|
||||
provider_id = str(settings.get("provider", "local_file"))
|
||||
provider = self.registry.get("ingest_provider", provider_id)
|
||||
provider.validate_source(source_path, settings)
|
||||
source_path = source_path.resolve()
|
||||
session_dir = Path(str(settings["session_dir"])).resolve()
|
||||
try:
|
||||
source_path.relative_to(session_dir)
|
||||
except ValueError as exc:
|
||||
raise ModuleError(
|
||||
code="SOURCE_OUTSIDE_WORKSPACE",
|
||||
message=f"源文件不在 session 工作区内: {source_path}",
|
||||
retryable=False,
|
||||
details={"session_dir": str(session_dir), "hint": "请先使用 stage/import 或 stage/upload 导入文件"},
|
||||
) from exc
|
||||
|
||||
task_id = source_path.stem
|
||||
if self.repo.get_task(task_id):
|
||||
@ -31,10 +56,11 @@ class IngestService:
|
||||
)
|
||||
|
||||
now = utc_now_iso()
|
||||
context_payload = context_payload or {}
|
||||
task = Task(
|
||||
id=task_id,
|
||||
source_type="local_file",
|
||||
source_path=str(source_path.resolve()),
|
||||
source_path=str(source_path),
|
||||
title=source_path.stem,
|
||||
status="created",
|
||||
created_at=now,
|
||||
@ -59,11 +85,22 @@ class IngestService:
|
||||
id=None,
|
||||
task_id=task_id,
|
||||
artifact_type="source_video",
|
||||
path=str(source_path.resolve()),
|
||||
path=str(source_path),
|
||||
metadata_json=json.dumps({"provider": provider_id}),
|
||||
created_at=now,
|
||||
)
|
||||
)
|
||||
context = self._build_task_context(
|
||||
task,
|
||||
context_payload,
|
||||
created_at=now,
|
||||
updated_at=now,
|
||||
session_gap_minutes=int(settings.get("session_gap_minutes", 60)),
|
||||
)
|
||||
self.repo.upsert_task_context(context)
|
||||
full_video_bvid = (context.full_video_bvid or "").strip()
|
||||
if full_video_bvid.startswith("BV"):
|
||||
(source_path.parent / "full_video_bvid.txt").write_text(full_video_bvid, encoding="utf-8")
|
||||
return task
|
||||
|
||||
def scan_stage(self, settings: dict[str, object]) -> dict[str, object]:
|
||||
@ -123,10 +160,27 @@ class IngestService:
|
||||
)
|
||||
continue
|
||||
|
||||
sidecar_meta = self._load_sidecar_metadata(
|
||||
source_path,
|
||||
enabled=bool(settings.get("meta_sidecar_enabled", True)),
|
||||
suffix=str(settings.get("meta_sidecar_suffix", ".meta.json")),
|
||||
)
|
||||
task_dir = session_dir / task_id
|
||||
task_dir.mkdir(parents=True, exist_ok=True)
|
||||
target_source = self._move_to_directory(source_path, task_dir)
|
||||
task = self.create_task_from_file(target_source, settings)
|
||||
if sidecar_meta["meta_path"] is not None:
|
||||
self._move_optional_metadata_file(sidecar_meta["meta_path"], task_dir)
|
||||
context_payload = {
|
||||
"source_title": source_path.stem,
|
||||
"segment_duration_seconds": duration_seconds,
|
||||
"segment_started_at": sidecar_meta["payload"].get("segment_started_at"),
|
||||
"streamer": sidecar_meta["payload"].get("streamer"),
|
||||
"room_id": sidecar_meta["payload"].get("room_id"),
|
||||
"session_key": sidecar_meta["payload"].get("session_key"),
|
||||
"full_video_bvid": sidecar_meta["payload"].get("full_video_bvid"),
|
||||
"reference_timestamp": sidecar_meta["payload"].get("reference_timestamp") or source_path.stat().st_mtime,
|
||||
}
|
||||
task = self.create_task_from_file(target_source, settings, context_payload=context_payload)
|
||||
accepted.append(
|
||||
{
|
||||
"task_id": task.id,
|
||||
@ -199,3 +253,202 @@ class IngestService:
|
||||
if not candidate.exists():
|
||||
return candidate
|
||||
index += 1
|
||||
|
||||
    @staticmethod
    def _load_sidecar_metadata(source_path: Path, *, enabled: bool, suffix: str) -> dict[str, object]:
        """Load the optional sidecar metadata JSON that sits next to ``source_path``.

        Returns {"meta_path": Path | None, "payload": dict}; ``meta_path`` is
        None when the sidecar feature is disabled or the file is absent.

        Raises:
            ModuleError: STAGE_META_INVALID when the sidecar exists but is not
                valid JSON, or parses to a non-object.
        """
        if not enabled:
            return {"meta_path": None, "payload": {}}
        # Fall back to the default suffix when the configured one is blank.
        suffix = suffix.strip() or ".meta.json"
        meta_path = source_path.with_name(f"{source_path.stem}{suffix}")
        payload: dict[str, object] = {}
        if meta_path.exists():
            try:
                payload = json.loads(meta_path.read_text(encoding="utf-8"))
            except json.JSONDecodeError as exc:
                raise ModuleError(
                    code="STAGE_META_INVALID",
                    message=f"元数据文件不是合法 JSON: {meta_path.name}",
                    retryable=False,
                ) from exc
            if not isinstance(payload, dict):
                raise ModuleError(
                    code="STAGE_META_INVALID",
                    message=f"元数据文件必须是对象: {meta_path.name}",
                    retryable=False,
                )
        # Second exists() check guards against the file vanishing mid-call.
        return {"meta_path": meta_path if meta_path.exists() else None, "payload": payload}
|
||||
|
||||
def _move_optional_metadata_file(self, meta_path: Path, task_dir: Path) -> None:
|
||||
if not meta_path.exists():
|
||||
return
|
||||
self._move_to_directory(meta_path, task_dir)
|
||||
|
||||
    def _build_task_context(
        self,
        task: Task,
        context_payload: dict[str, object],
        *,
        created_at: str,
        updated_at: str,
        session_gap_minutes: int,
    ) -> TaskContext:
        """Assemble the TaskContext row for a newly created task.

        Cleans the raw sidecar payload, fills missing streamer/start time by
        parsing the title, then resolves a session key and the full-video BV
        id — from adjacent recent contexts, from other contexts sharing an
        explicit session key, or from a stored session binding — before
        falling back to a task-scoped key.
        """
        source_title = self._clean_text(context_payload.get("source_title")) or task.title
        streamer = self._clean_text(context_payload.get("streamer"))
        room_id = self._clean_text(context_payload.get("room_id"))
        session_key = self._clean_text(context_payload.get("session_key"))
        full_video_bvid = self._clean_bvid(context_payload.get("full_video_bvid"))
        segment_duration = self._coerce_float(context_payload.get("segment_duration_seconds"))
        segment_started_at = self._coerce_iso_datetime(context_payload.get("segment_started_at"))

        # Sidecar metadata may be missing or partial; recover streamer/start
        # time from the title pattern when possible.
        if streamer is None or segment_started_at is None:
            inferred = self._infer_from_title(
                source_title,
                reference_timestamp=context_payload.get("reference_timestamp"),
            )
            if streamer is None:
                streamer = inferred.get("streamer")
            if segment_started_at is None:
                segment_started_at = inferred.get("segment_started_at")

        if session_key is None:
            session_key, inherited_bvid = self._infer_session_key(
                streamer=streamer,
                room_id=room_id,
                segment_started_at=segment_started_at,
                segment_duration_seconds=segment_duration,
                fallback_task_id=task.id,
                gap_minutes=session_gap_minutes,
            )
            # NOTE(review): inherited_bvid is stored un-cleaned here, unlike
            # every other bvid path — confirm it is always a valid BV id.
            if full_video_bvid is None:
                full_video_bvid = inherited_bvid
        elif full_video_bvid is None:
            # Session key supplied explicitly: reuse the BV id recorded by
            # earlier segments of the same session, if any.
            full_video_bvid = self._find_full_video_bvid_by_session_key(session_key)

        if full_video_bvid is None:
            binding = self.repo.get_session_binding(session_key=session_key, source_title=source_title)
            if binding is not None:
                if session_key is None and binding.session_key:
                    session_key = binding.session_key
                full_video_bvid = self._clean_bvid(binding.full_video_bvid)

        if session_key is None:
            # Last resort: a key unique to this task keeps the context addressable.
            session_key = f"task:{task.id}"

        return TaskContext(
            id=None,
            task_id=task.id,
            session_key=session_key,
            streamer=streamer,
            room_id=room_id,
            source_title=source_title,
            segment_started_at=segment_started_at,
            segment_duration_seconds=segment_duration,
            full_video_bvid=full_video_bvid,
            created_at=created_at,
            updated_at=updated_at,
        )
|
||||
|
||||
@staticmethod
|
||||
def _clean_text(value: object) -> str | None:
|
||||
if value is None:
|
||||
return None
|
||||
text = str(value).strip()
|
||||
return text or None
|
||||
|
||||
@staticmethod
|
||||
def _clean_bvid(value: object) -> str | None:
|
||||
text = IngestService._clean_text(value)
|
||||
if text and text.startswith("BV"):
|
||||
return text
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def _coerce_float(value: object) -> float | None:
|
||||
if value is None or value == "":
|
||||
return None
|
||||
try:
|
||||
return float(value)
|
||||
except (TypeError, ValueError):
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def _coerce_iso_datetime(value: object) -> str | None:
|
||||
if value is None:
|
||||
return None
|
||||
text = str(value).strip()
|
||||
if not text:
|
||||
return None
|
||||
try:
|
||||
return datetime.fromisoformat(text).astimezone(SHANGHAI_TZ).isoformat()
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
    def _infer_from_title(self, title: str, *, reference_timestamp: object) -> dict[str, str | None]:
        """Infer streamer name and segment start time from a recording title.

        Titles look like "<streamer> MM月DD日 HH时MM分" and carry no year. The
        year comes from ``reference_timestamp`` (epoch seconds, or "now" when
        not numeric); both fields are None when the title does not match.
        """
        match = TITLE_PATTERN.match(title)
        if not match:
            return {"streamer": None, "segment_started_at": None}
        reference_dt = self._reference_datetime(reference_timestamp)
        month = int(match.group("month"))
        day = int(match.group("day"))
        hour = int(match.group("hour"))
        minute = int(match.group("minute"))
        year = reference_dt.year
        # A month/day after the reference date cannot be this year's — it must
        # be from the previous year (e.g. a December title seen in January).
        if (month, day) > (reference_dt.month, reference_dt.day):
            year -= 1
        started_at = datetime(year, month, day, hour, minute, tzinfo=SHANGHAI_TZ)
        return {
            "streamer": match.group("streamer").strip(),
            "segment_started_at": started_at.isoformat(),
        }
|
||||
|
||||
@staticmethod
|
||||
def _reference_datetime(reference_timestamp: object) -> datetime:
|
||||
if isinstance(reference_timestamp, (int, float)):
|
||||
return datetime.fromtimestamp(float(reference_timestamp), tz=SHANGHAI_TZ)
|
||||
return datetime.now(tz=SHANGHAI_TZ)
|
||||
|
||||
    def _infer_session_key(
        self,
        *,
        streamer: str | None,
        room_id: str | None,
        segment_started_at: str | None,
        segment_duration_seconds: float | None,
        fallback_task_id: str,
        gap_minutes: int,
    ) -> tuple[str | None, str | None]:
        """Find an existing session this segment continues, or mint a new key.

        Returns ``(session_key, inherited_full_video_bvid)``; both are None
        when streamer or start time is unknown/unparsable. A recent context
        matches when this segment starts at or after that context's segment
        end and within ``gap_minutes`` of it.

        NOTE(review): ``fallback_task_id`` and ``segment_duration_seconds``
        are unused in this body — the task-based fallback key is applied by
        the caller instead; confirm the parameters are kept for API symmetry.
        """
        if not streamer or not segment_started_at:
            return None, None
        try:
            segment_start = datetime.fromisoformat(segment_started_at)
        except ValueError:
            return None, None

        # Negative gaps collapse to zero tolerance.
        tolerance = timedelta(minutes=max(gap_minutes, 0))
        for context in self.repo.find_recent_task_contexts(streamer):
            # When both sides know the room id, differing rooms can never be
            # the same session.
            if room_id and context.room_id and room_id != context.room_id:
                continue
            candidate_end = self._context_end_time(context)
            if candidate_end is None:
                continue
            if segment_start >= candidate_end and segment_start - candidate_end <= tolerance:
                return context.session_key, context.full_video_bvid
        # No adjacent session found: derive a fresh key from streamer + start time.
        date_tag = segment_start.astimezone(SHANGHAI_TZ).strftime("%Y%m%dT%H%M")
        return f"{streamer}:{date_tag}", None
|
||||
|
||||
@staticmethod
|
||||
def _context_end_time(context: TaskContext) -> datetime | None:
|
||||
if not context.segment_started_at or context.segment_duration_seconds is None:
|
||||
return None
|
||||
try:
|
||||
started_at = datetime.fromisoformat(context.segment_started_at)
|
||||
except ValueError:
|
||||
return None
|
||||
return started_at + timedelta(seconds=float(context.segment_duration_seconds))
|
||||
|
||||
def _find_full_video_bvid_by_session_key(self, session_key: str) -> str | None:
|
||||
for context in self.repo.list_task_contexts_by_session_key(session_key):
|
||||
bvid = self._clean_bvid(context.full_video_bvid)
|
||||
if bvid:
|
||||
return bvid
|
||||
return None
|
||||
|
||||
247
src/biliup_next/modules/publish/providers/biliup_cli.py
Normal file
247
src/biliup_next/modules/publish/providers/biliup_cli.py
Normal file
@ -0,0 +1,247 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import random
|
||||
import re
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from biliup_next.core.errors import ModuleError
|
||||
from biliup_next.core.models import PublishRecord, Task, utc_now_iso
|
||||
from biliup_next.core.providers import ProviderManifest
|
||||
from biliup_next.infra.adapters.biliup_cli import BiliupCliAdapter
|
||||
|
||||
|
||||
class BiliupCliPublishProvider:
    """Publishes clip batches to Bilibili by shelling out to the biliup CLI."""

    def __init__(self, adapter: BiliupCliAdapter | None = None) -> None:
        self.adapter = adapter or BiliupCliAdapter()

    # Static descriptor used by the provider registry to discover this class.
    manifest = ProviderManifest(
        id="biliup_cli",
        name="biliup CLI Publish Provider",
        version="0.1.0",
        provider_type="publish_provider",
        entrypoint="biliup_next.modules.publish.providers.biliup_cli:BiliupCliPublishProvider",
        capabilities=["publish"],
        enabled_by_default=True,
    )
|
||||
|
||||
    def publish(self, task: Task, clip_videos: list, settings: dict[str, Any]) -> PublishRecord:
        """Upload clip videos to Bilibili via the biliup CLI in batches of five.

        The first batch creates the submission (yielding a BV id persisted to
        ``bvid.txt``); remaining batches are appended to that submission. An
        ``upload_done.flag`` marker plus a saved BV id makes repeated calls
        idempotent.

        Raises:
            ModuleError: PUBLISH_NO_CLIPS when ``clip_videos`` is empty;
                upload/append failures propagate from the batch helpers.
        """
        work_dir = Path(str(settings["session_dir"])) / task.title
        bvid_file = work_dir / "bvid.txt"
        upload_done = work_dir / "upload_done.flag"
        config = self._load_upload_config(Path(str(settings["upload_config_file"])))

        video_files = [artifact.path for artifact in clip_videos]
        if not video_files:
            raise ModuleError(
                code="PUBLISH_NO_CLIPS",
                message=f"没有可上传的切片: {task.id}",
                retryable=False,
            )

        # Derive streamer/date from the task title via configured regexes.
        parsed = self._parse_filename(task.title, config)
        streamer = parsed.get("streamer", task.title)
        date = parsed.get("date", "")

        songs_txt = work_dir / "songs.txt"
        songs_json = work_dir / "songs.json"
        songs_list = songs_txt.read_text(encoding="utf-8").strip() if songs_txt.exists() else ""
        song_count = 0
        if songs_json.exists():
            song_count = len(json.loads(songs_json.read_text(encoding="utf-8")).get("songs", []))

        # Template variables available to the configurable title/description/
        # dynamic/tag templates below.
        quote = self._get_random_quote(config)
        template_vars = {
            "streamer": streamer,
            "date": date,
            "song_count": song_count,
            "songs_list": songs_list,
            "daily_quote": quote.get("text", ""),
            "quote_author": quote.get("author", ""),
        }
        template = config.get("template", {})
        title = template.get("title", "{streamer}_{date}").format(**template_vars)
        description = template.get("description", "{songs_list}").format(**template_vars)
        dynamic = template.get("dynamic", "").format(**template_vars)
        tags = template.get("tag", "翻唱,唱歌,音乐").format(**template_vars)
        # Per-streamer tag override, when configured.
        streamer_cfg = config.get("streamers", {})
        if streamer in streamer_cfg:
            tags = streamer_cfg[streamer].get("tags", tags)

        upload_settings = config.get("upload_settings", {})
        tid = upload_settings.get("tid", 31)
        biliup_path = str(settings["biliup_path"])
        cookie_file = str(settings["cookie_file"])
        retry_count = max(1, int(settings.get("retry_count", 5)))

        # Cookie refresh before uploading — run_optional presumably tolerates
        # failure; confirm against the adapter.
        self.adapter.run_optional([biliup_path, "-u", cookie_file, "renew"])

        first_batch = video_files[:5]
        remaining_batches = [video_files[i:i + 5] for i in range(5, len(video_files), 5)]

        existing_bvid = bvid_file.read_text(encoding="utf-8").strip() if bvid_file.exists() else ""
        if upload_done.exists() and existing_bvid.startswith("BV"):
            # Fully uploaded in a previous run: replay the record without touching the CLI.
            return PublishRecord(
                id=None,
                task_id=task.id,
                platform="bilibili",
                aid=None,
                bvid=existing_bvid,
                title=title,
                published_at=utc_now_iso(),
            )

        # Resume from a saved BV id when the first batch already succeeded.
        # NOTE(review): a resumed run re-appends every remaining batch —
        # confirm the CLI/remote side deduplicates already-appended parts.
        bvid = existing_bvid if existing_bvid.startswith("BV") else self._upload_first_batch(
            biliup_path=biliup_path,
            cookie_file=cookie_file,
            first_batch=first_batch,
            title=title,
            tid=tid,
            tags=tags,
            description=description,
            dynamic=dynamic,
            upload_settings=upload_settings,
            retry_count=retry_count,
        )
        bvid_file.write_text(bvid, encoding="utf-8")

        for batch_index, batch in enumerate(remaining_batches, start=2):
            self._append_batch(
                biliup_path=biliup_path,
                cookie_file=cookie_file,
                bvid=bvid,
                batch=batch,
                batch_index=batch_index,
                retry_count=retry_count,
            )

        upload_done.touch()
        return PublishRecord(
            id=None,
            task_id=task.id,
            platform="bilibili",
            aid=None,
            bvid=bvid,
            title=title,
            published_at=utc_now_iso(),
        )
|
||||
|
||||
    def _upload_first_batch(
        self,
        *,
        biliup_path: str,
        cookie_file: str,
        first_batch: list[str],
        title: str,
        tid: int,
        tags: str,
        description: str,
        dynamic: str,
        upload_settings: dict[str, Any],
        retry_count: int,
    ) -> str:
        """Upload the first batch and return the new submission's BV id.

        Retries with exponential backoff between attempts.

        Raises:
            ModuleError: PUBLISH_UPLOAD_FAILED after ``retry_count`` attempts.

        NOTE(review): a run that exits 0 but prints no BV id is retried as a
        whole new upload — confirm the CLI cannot succeed silently, otherwise
        this could double-post.
        """
        upload_cmd = [
            biliup_path,
            "-u",
            cookie_file,
            "upload",
            *first_batch,
            "--title",
            title,
            "--tid",
            str(tid),
            "--tag",
            tags,
            "--copyright",
            str(upload_settings.get("copyright", 2)),
            "--source",
            str(upload_settings.get("source", "直播回放")),
            "--desc",
            description,
        ]
        if dynamic:
            upload_cmd.extend(["--dynamic", dynamic])
        cover = str(upload_settings.get("cover", "")).strip()
        if cover and Path(cover).exists():
            upload_cmd.extend(["--cover", cover])

        for attempt in range(1, retry_count + 1):
            result = self.adapter.run(upload_cmd, label=f"首批上传[{attempt}/{retry_count}]")
            if result.returncode == 0:
                # Prefer the structured "bvid":"BV..." form; fall back to any BV token.
                match = re.search(r'"bvid":"(BV[A-Za-z0-9]+)"', result.stdout) or re.search(r"(BV[A-Za-z0-9]+)", result.stdout)
                if match:
                    return match.group(1)
            if attempt < retry_count:
                time.sleep(self._wait_seconds(attempt - 1))
                continue
            raise ModuleError(
                code="PUBLISH_UPLOAD_FAILED",
                message="首批上传失败",
                retryable=True,
                details={"stdout": result.stdout[-2000:], "stderr": result.stderr[-2000:]},
            )
        # The loop always returns or raises on its last iteration.
        raise AssertionError("unreachable")
|
||||
|
||||
def _append_batch(
    self,
    *,
    biliup_path: str,
    cookie_file: str,
    bvid: str,
    batch: list[str],
    batch_index: int,
    retry_count: int,
    settle_seconds: int = 45,
) -> None:
    """Append *batch* to the already-published video *bvid*.

    Sleeps ``settle_seconds`` first (default 45, matching the previous
    hard-coded pause) so Bilibili can finish processing the prior batch,
    then runs ``biliup append`` with up to ``retry_count`` attempts and
    exponential backoff between them.

    Raises:
        ModuleError: PUBLISH_APPEND_FAILED (retryable) once all attempts fail.
    """
    # Previously a hard-coded time.sleep(45); parameterized (backward
    # compatible default) so tests and faster pipelines can shorten it.
    time.sleep(settle_seconds)
    append_cmd = [biliup_path, "-u", cookie_file, "append", "--vid", bvid, *batch]
    for attempt in range(1, retry_count + 1):
        result = self.adapter.run(append_cmd, label=f"追加第{batch_index}批[{attempt}/{retry_count}]")
        if result.returncode == 0:
            return
        if attempt < retry_count:
            # Exponential, capped backoff before the next attempt.
            time.sleep(self._wait_seconds(attempt - 1))
            continue
        raise ModuleError(
            code="PUBLISH_APPEND_FAILED",
            message=f"追加第 {batch_index} 批失败",
            retryable=True,
            details={"stdout": result.stdout[-2000:], "stderr": result.stderr[-2000:]},
        )
|
||||
|
||||
@staticmethod
def _wait_seconds(retry_index: int) -> int:
    """Return the backoff delay for retry *retry_index*.

    Starts at 300 seconds, doubles per retry, and is capped at one hour.
    """
    backoff = 300 * (2 ** retry_index)
    return backoff if backoff < 3600 else 3600
|
||||
|
||||
@staticmethod
def _load_upload_config(path: Path) -> dict[str, Any]:
    """Read the JSON upload config at *path*; a missing file yields {}."""
    if path.exists():
        return json.loads(path.read_text(encoding="utf-8"))
    return {}
|
||||
|
||||
@staticmethod
def _parse_filename(filename: str, config: dict[str, Any] | None = None) -> dict[str, str]:
    """Extract streamer/date fields from *filename* using configured patterns.

    Tries each ``filename_patterns.patterns`` entry in order; the first
    regex that matches wins. The matched group dict may have its ``date``
    reformatted through the entry's ``date_format`` template. When nothing
    matches, the whole filename is treated as the streamer name.
    """
    pattern_entries = (config or {}).get("filename_patterns", {}).get("patterns", [])
    for entry in pattern_entries:
        expression = entry.get("regex")
        if not expression:
            continue
        hit = re.match(expression, filename)
        if hit is None:
            continue
        fields = hit.groupdict()
        template = entry.get("date_format", "{date}")
        try:
            fields["date"] = template.format(**fields)
        except KeyError:
            # Template references a group the regex did not capture; keep
            # the raw captured value instead of failing.
            pass
        return fields
    return {"streamer": filename, "date": ""}
|
||||
|
||||
@staticmethod
def _get_random_quote(config: dict[str, Any]) -> dict[str, str]:
    """Pick one configured quote at random; empty fields when none exist."""
    pool = config.get("quotes", [])
    if pool:
        return random.choice(pool)
    return {"text": "", "author": ""}
|
||||
138
src/biliup_next/modules/song_detect/providers/codex.py
Normal file
138
src/biliup_next/modules/song_detect/providers/codex.py
Normal file
@ -0,0 +1,138 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from biliup_next.core.errors import ModuleError
|
||||
from biliup_next.core.models import Artifact, Task, utc_now_iso
|
||||
from biliup_next.core.providers import ProviderManifest
|
||||
from biliup_next.infra.adapters.codex_cli import CodexCliAdapter
|
||||
|
||||
# JSON Schema handed to the Codex CLI agent: the detector must return an
# object whose "songs" array lists each performed song with string
# start/end timestamps, title, artist, a numeric confidence, and the
# subtitle "evidence" backing the identification. additionalProperties is
# disabled at both levels so the agent cannot emit extra fields.
SONG_SCHEMA = {
    "type": "object",
    "properties": {
        "songs": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "start": {"type": "string"},
                    "end": {"type": "string"},
                    "title": {"type": "string"},
                    "artist": {"type": "string"},
                    "confidence": {"type": "number"},
                    "evidence": {"type": "string"},
                },
                "required": ["start", "end", "title", "artist", "confidence", "evidence"],
                "additionalProperties": False,
            },
        }
    },
    "required": ["songs"],
    "additionalProperties": False,
}
|
||||
|
||||
TASK_PROMPT = """你是音乐片段识别助手。当前目录下有一个字幕文件。
|
||||
任务:
|
||||
1. 结合字幕内容并允许联网搜索进行纠错(识别同音字、唱错等)。
|
||||
2. 识别出直播中唱过的所有歌曲,给出精确的开始和结束时间。歌曲开始时间规则:
|
||||
- 歌曲开始时间应使用“上一句字幕的结束时间”作为 start_time。
|
||||
- 这样可以尽量保留歌曲可能存在的前奏。
|
||||
3. 同一首歌间隔 ≤160s 合并,>160s 分开。若连续识别出相同歌曲,且中间只有短暂对白、空白、转场或无歌词段,应合并为同一首歌.
|
||||
4. 忽略纯聊天片段。
|
||||
5. 无法确认的歌曲丢弃,宁缺毋滥:你的输出将直接面向最终用户。
|
||||
6. 忽略短片段:如果一段演唱持续时间总和少于 15 秒,视为随口哼唱,请直接忽略,不计入列表。
|
||||
7. 仔细分析每一句歌词,识别出相关歌曲后, 使用该歌曲歌词上下文对比字幕上下文,确定歌曲起始与停止时间
|
||||
8.歌曲标注规则:
|
||||
- 可以在歌曲名称后使用括号 () 添加补充说明。
|
||||
- 常见标注示例:
|
||||
- (片段):歌曲演唱时间较短,例如 < 60 秒
|
||||
- (清唱):无伴奏演唱
|
||||
- (副歌):只演唱副歌部分
|
||||
- 标注应简洁,仅在确有必要时使用。
|
||||
9. 通过歌曲起始和结束时间自检, 一般歌曲长度在5分钟以内, 1分钟以上, 可疑片段重新联网搜索检查.
|
||||
最后请严格按照 Schema 生成 JSON 数据。"""
|
||||
|
||||
|
||||
class CodexSongDetector:
    """Song-detection provider backed by the Codex CLI agent.

    Writes the target JSON schema next to the subtitle file, runs the CLI
    in that directory with TASK_PROMPT, and returns the resulting
    ``songs.json``/``songs.txt`` files as artifacts.
    """

    def __init__(self, adapter: CodexCliAdapter | None = None) -> None:
        # Adapter is injectable for tests; defaults to the real CLI wrapper.
        self.adapter = adapter or CodexCliAdapter()

    # Static metadata consumed by the provider registry for discovery.
    manifest = ProviderManifest(
        id="codex",
        name="Codex Song Detector",
        version="0.1.0",
        provider_type="song_detector",
        entrypoint="biliup_next.modules.song_detect.providers.codex:CodexSongDetector",
        capabilities=["song_detect"],
        enabled_by_default=True,
    )

    def detect(self, task: Task, subtitle_srt: Artifact, settings: dict[str, Any]) -> tuple[Artifact, Artifact]:
        """Run song detection for *task* and return (songs_json, songs_txt) artifacts.

        Raises:
            ModuleError: SONG_DETECT_FAILED (retryable) when the CLI exits
                non-zero, or SONG_DETECT_OUTPUT_MISSING (retryable) when the
                expected output files were not produced.
        """
        # Work in the subtitle's directory: the prompt tells the agent that
        # the subtitle file is in its current directory.
        work_dir = Path(subtitle_srt.path).resolve().parent
        schema_path = work_dir / "song_schema.json"
        songs_json_path = work_dir / "songs.json"
        songs_txt_path = work_dir / "songs.txt"
        # Materialize the schema file for the agent to read.
        schema_path.write_text(json.dumps(SONG_SCHEMA, ensure_ascii=False, indent=2), encoding="utf-8")

        codex_cmd = str(settings.get("codex_cmd", "codex"))
        result = self.adapter.run_song_detect(
            codex_cmd=codex_cmd,
            work_dir=work_dir,
            prompt=TASK_PROMPT,
        )

        if result.returncode != 0:
            raise ModuleError(
                code="SONG_DETECT_FAILED",
                message="codex exec 执行失败",
                retryable=True,
                # Keep only the output tails to bound the error payload size.
                details={"stdout": result.stdout[-2000:], "stderr": result.stderr[-2000:]},
            )

        # If the agent produced the JSON but skipped the text listing,
        # derive songs.txt locally from songs.json.
        if songs_json_path.exists() and not songs_txt_path.exists():
            self._generate_txt_fallback(songs_json_path, songs_txt_path)

        if not songs_json_path.exists() or not songs_txt_path.exists():
            raise ModuleError(
                code="SONG_DETECT_OUTPUT_MISSING",
                message=f"未生成 songs.json/songs.txt: {work_dir}",
                retryable=True,
                details={"stdout": result.stdout[-2000:], "stderr": result.stderr[-2000:]},
            )

        return (
            Artifact(
                id=None,
                task_id=task.id,
                artifact_type="songs_json",
                path=str(songs_json_path.resolve()),
                metadata_json=json.dumps({"provider": "codex"}),
                created_at=utc_now_iso(),
            ),
            Artifact(
                id=None,
                task_id=task.id,
                artifact_type="songs_txt",
                path=str(songs_txt_path.resolve()),
                metadata_json=json.dumps({"provider": "codex"}),
                created_at=utc_now_iso(),
            ),
        )

    def _generate_txt_fallback(self, songs_json_path: Path, songs_txt_path: Path) -> None:
        """Render songs.txt ("HH:MM:SS title — artist" per line) from songs.json.

        Raises:
            ModuleError: SONGS_TXT_GENERATE_FAILED (non-retryable) on any
                parse or write error.
        """
        try:
            data = json.loads(songs_json_path.read_text(encoding="utf-8"))
            songs = data.get("songs", [])
            with songs_txt_path.open("w", encoding="utf-8") as file_handle:
                for song in songs:
                    # Drop the millisecond part of an SRT-style timestamp,
                    # whether it uses "," or "." as the separator.
                    start_time = str(song["start"]).split(",")[0].split(".")[0]
                    file_handle.write(f"{start_time} {song['title']} — {song['artist']}\n")
        except Exception as exc:  # noqa: BLE001
            raise ModuleError(
                code="SONGS_TXT_GENERATE_FAILED",
                message=f"生成 songs.txt 失败: {songs_txt_path}",
                retryable=False,
                details={"error": str(exc)},
            ) from exc
|
||||
101
src/biliup_next/modules/split/providers/ffmpeg_copy.py
Normal file
101
src/biliup_next/modules/split/providers/ffmpeg_copy.py
Normal file
@ -0,0 +1,101 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from biliup_next.core.errors import ModuleError
|
||||
from biliup_next.core.models import Artifact, Task, utc_now_iso
|
||||
from biliup_next.core.providers import ProviderManifest
|
||||
|
||||
|
||||
class FfmpegCopySplitProvider:
    """Split provider that cuts per-song clips from the source video using
    stream-copy ffmpeg (fast, no re-encode)."""

    # Static metadata consumed by the provider registry for discovery.
    manifest = ProviderManifest(
        id="ffmpeg_copy",
        name="FFmpeg Copy Split Provider",
        version="0.1.0",
        provider_type="split_provider",
        entrypoint="biliup_next.modules.split.providers.ffmpeg_copy:FfmpegCopySplitProvider",
        capabilities=["split"],
        enabled_by_default=True,
    )

    def split(self, task: Task, songs_json: Artifact, source_video: Artifact, settings: dict[str, Any]) -> list[Artifact]:
        """Cut one clip per song listed in *songs_json* and return clip artifacts.

        Idempotent: a ``split_done.flag`` marker short-circuits to collecting
        already-produced clips, and individual outputs are skipped when present.

        Raises:
            ModuleError: SPLIT_SONGS_EMPTY (non-retryable), FFMPEG_NOT_FOUND
                (non-retryable), or SPLIT_FFMPEG_FAILED (retryable).
        """
        work_dir = Path(songs_json.path).resolve().parent
        split_dir = work_dir / "split_video"
        split_done = work_dir / "split_done.flag"
        # Resume support: a previous completed run just re-collects its clips.
        if split_done.exists() and split_dir.exists():
            return self._collect_existing_clips(task.id, split_dir)

        with Path(songs_json.path).open("r", encoding="utf-8") as file_handle:
            data = json.load(file_handle)
        songs = data.get("songs", [])
        if not songs:
            raise ModuleError(
                code="SPLIT_SONGS_EMPTY",
                message=f"songs.json 中没有歌曲: {songs_json.path}",
                retryable=False,
            )

        split_dir.mkdir(parents=True, exist_ok=True)
        ffmpeg_bin = str(settings.get("ffmpeg_bin", "ffmpeg"))
        video_path = Path(source_video.path).resolve()

        for index, song in enumerate(songs, 1):
            # SRT timestamps use "," before milliseconds; ffmpeg expects ".".
            start = str(song.get("start", "00:00:00,000")).replace(",", ".")
            end = str(song.get("end", "00:00:00,000")).replace(",", ".")
            # Sanitize path separators only; other characters pass through.
            title = str(song.get("title", "UNKNOWN")).replace("/", "_").replace("\\", "_")
            output_path = split_dir / f"{index:02d}_{title}{video_path.suffix}"
            # Skip clips already cut by a previous (partial) run.
            if output_path.exists():
                continue
            # -ss/-to before -i seek on the input; -c copy avoids re-encoding.
            cmd = [
                ffmpeg_bin,
                "-y",
                "-ss",
                start,
                "-to",
                end,
                "-i",
                str(video_path),
                "-c",
                "copy",
                "-map_metadata",
                "0",
                str(output_path),
            ]
            try:
                subprocess.run(cmd, capture_output=True, text=True, check=True)
            except FileNotFoundError as exc:
                raise ModuleError(
                    code="FFMPEG_NOT_FOUND",
                    message=f"找不到 ffmpeg: {ffmpeg_bin}",
                    retryable=False,
                ) from exc
            except subprocess.CalledProcessError as exc:
                raise ModuleError(
                    code="SPLIT_FFMPEG_FAILED",
                    message=f"ffmpeg 切割失败: {output_path.name}",
                    retryable=True,
                    # Keep only the output tails to bound the error payload.
                    details={"stderr": exc.stderr[-2000:], "stdout": exc.stdout[-2000:]},
                ) from exc

        split_done.touch()
        return self._collect_existing_clips(task.id, split_dir)

    def _collect_existing_clips(self, task_id: str, split_dir: Path) -> list[Artifact]:
        """Build clip_video artifacts for every file in *split_dir*, name-sorted."""
        artifacts: list[Artifact] = []
        for path in sorted(split_dir.iterdir()):
            if not path.is_file():
                continue
            artifacts.append(
                Artifact(
                    id=None,
                    task_id=task_id,
                    artifact_type="clip_video",
                    path=str(path.resolve()),
                    metadata_json=json.dumps({"provider": "ffmpeg_copy"}),
                    created_at=utc_now_iso(),
                )
            )
        return artifacts
|
||||
191
src/biliup_next/modules/transcribe/providers/groq.py
Normal file
191
src/biliup_next/modules/transcribe/providers/groq.py
Normal file
@ -0,0 +1,191 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import math
|
||||
import shutil
|
||||
import subprocess
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from biliup_next.core.errors import ModuleError
|
||||
from biliup_next.core.models import Artifact, Task, utc_now_iso
|
||||
from biliup_next.core.providers import ProviderManifest
|
||||
|
||||
|
||||
# Language hint passed to Whisper (Mandarin Chinese).
LANGUAGE = "zh"
# MP3 encode bitrate for extracted audio; also drives segment sizing.
BITRATE_KBPS = 64
# Groq-hosted Whisper model used for all transcription calls.
MODEL_NAME = "whisper-large-v3-turbo"
|
||||
|
||||
|
||||
class GroqTranscribeProvider:
    """Transcription provider using Groq's hosted Whisper.

    Extracts mono MP3 audio from the source video, slices it into
    size-bounded segments, transcribes each segment, and stitches the
    results into a single SRT subtitle artifact.
    """

    # Static metadata consumed by the provider registry for discovery.
    manifest = ProviderManifest(
        id="groq",
        name="Groq Transcribe Provider",
        version="0.1.0",
        provider_type="transcribe_provider",
        entrypoint="biliup_next.modules.transcribe.providers.groq:GroqTranscribeProvider",
        capabilities=["transcribe"],
        enabled_by_default=True,
    )

    def transcribe(self, task: Task, source_video: Artifact, settings: dict[str, Any]) -> Artifact:
        """Transcribe *source_video* to an SRT file and return it as an artifact.

        Settings: ``groq_api_key`` (required), ``ffmpeg_bin`` (default
        "ffmpeg"), ``max_file_size_mb`` (default 23, per-segment upload cap).

        Raises:
            ModuleError: on missing API key/dependency/source video, failed
                audio extraction, missing segments, or a failed API call.
        """
        groq_api_key = str(settings.get("groq_api_key", "")).strip()
        if not groq_api_key:
            raise ModuleError(
                code="GROQ_API_KEY_MISSING",
                message="未配置 transcribe.groq_api_key",
                retryable=False,
            )
        # Lazy import so the provider can be loaded without the optional
        # groq dependency installed.
        try:
            from groq import Groq
        except ModuleNotFoundError as exc:
            raise ModuleError(
                code="GROQ_DEPENDENCY_MISSING",
                message="未安装 groq 依赖,请在 biliup-next 环境中执行 pip install -e .",
                retryable=False,
            ) from exc

        source_path = Path(source_video.path).resolve()
        if not source_path.exists():
            raise ModuleError(
                code="TRANSCRIBE_SOURCE_MISSING",
                message=f"源视频不存在: {source_path}",
                retryable=False,
            )

        ffmpeg_bin = str(settings.get("ffmpeg_bin", "ffmpeg"))
        max_file_size_mb = int(settings.get("max_file_size_mb", 23))
        work_dir = source_path.parent
        temp_audio_dir = work_dir / "temp_audio"
        temp_audio_dir.mkdir(parents=True, exist_ok=True)
        # Segment length (seconds) such that each MP3 stays under the size
        # cap: MB -> kilobits (x8 x1024) divided by the constant bitrate.
        segment_duration = max(1, math.floor((max_file_size_mb * 8 * 1024) / BITRATE_KBPS))
        output_pattern = temp_audio_dir / "part_%03d.mp3"

        self._extract_audio_segments(
            ffmpeg_bin=ffmpeg_bin,
            source_path=source_path,
            output_pattern=output_pattern,
            segment_duration=segment_duration,
        )

        segments = sorted(temp_audio_dir.glob("part_*.mp3"))
        if not segments:
            raise ModuleError(
                code="TRANSCRIBE_AUDIO_SEGMENTS_MISSING",
                message=f"未生成音频分片: {source_path.name}",
                retryable=False,
            )

        client = Groq(api_key=groq_api_key)
        # NOTE(review): assumes task.title is filesystem-safe — confirm upstream.
        srt_path = work_dir / f"{task.title}.srt"
        global_idx = 1  # SRT cue numbering is global across all segments

        try:
            with srt_path.open("w", encoding="utf-8") as srt_file:
                for index, segment in enumerate(segments):
                    # Segments were cut with -reset_timestamps 1, so each
                    # chunk's times are local; shift by the segment offset.
                    offset_seconds = index * segment_duration
                    segment_data = self._transcribe_with_retry(client, segment)
                    for chunk in segment_data:
                        start = self._format_srt_time(float(chunk["start"]) + offset_seconds)
                        end = self._format_srt_time(float(chunk["end"]) + offset_seconds)
                        text = str(chunk["text"]).strip()
                        srt_file.write(f"{global_idx}\n{start} --> {end}\n{text}\n\n")
                        global_idx += 1
        finally:
            # Always remove the intermediate audio, even on failure.
            shutil.rmtree(temp_audio_dir, ignore_errors=True)

        return Artifact(
            id=None,
            task_id=task.id,
            artifact_type="subtitle_srt",
            path=str(srt_path.resolve()),
            metadata_json=json.dumps(
                {
                    "provider": "groq",
                    "model": MODEL_NAME,
                    "segment_duration_seconds": segment_duration,
                }
            ),
            created_at=utc_now_iso(),
        )

    def _extract_audio_segments(
        self,
        *,
        ffmpeg_bin: str,
        source_path: Path,
        output_pattern: Path,
        segment_duration: int,
    ) -> None:
        """Extract mono MP3 audio split into fixed-length numbered segments.

        Raises:
            ModuleError: FFMPEG_NOT_FOUND (non-retryable) or
                FFMPEG_AUDIO_EXTRACT_FAILED (retryable).
        """
        cmd = [
            ffmpeg_bin,
            "-y",
            "-i",
            str(source_path),
            "-vn",  # drop the video stream
            "-acodec",
            "libmp3lame",
            "-b:a",
            f"{BITRATE_KBPS}k",
            "-ac",
            "1",
            "-ar",
            "22050",
            "-f",
            "segment",  # segment muxer: one file per segment_duration
            "-segment_time",
            str(segment_duration),
            "-reset_timestamps",
            "1",
            str(output_pattern),
        ]
        try:
            subprocess.run(cmd, check=True, capture_output=True, text=True)
        except FileNotFoundError as exc:
            raise ModuleError(
                code="FFMPEG_NOT_FOUND",
                message=f"找不到 ffmpeg: {ffmpeg_bin}",
                retryable=False,
            ) from exc
        except subprocess.CalledProcessError as exc:
            raise ModuleError(
                code="FFMPEG_AUDIO_EXTRACT_FAILED",
                message=f"音频提取失败: {source_path.name}",
                retryable=True,
                # Keep only the output tails to bound the error payload.
                details={"stderr": exc.stderr[-2000:], "stdout": exc.stdout[-2000:]},
            ) from exc

    def _transcribe_with_retry(self, client: Any, audio_file: Path) -> list[dict[str, Any]]:
        """Transcribe one audio segment and return its verbose_json segments.

        NOTE(review): 429/rate-limit errors retry forever (25s pause, no
        cap) — confirm this is intended; any other error raises
        GROQ_TRANSCRIBE_FAILED (retryable).
        """
        retry_count = 0
        while True:
            try:
                with audio_file.open("rb") as file_handle:
                    response = client.audio.transcriptions.create(
                        file=(audio_file.name, file_handle.read()),
                        model=MODEL_NAME,
                        response_format="verbose_json",  # includes timings
                        language=LANGUAGE,
                        temperature=0.0,  # deterministic decoding
                    )
                return [dict(segment) for segment in response.segments]
            except Exception as exc:  # noqa: BLE001
                retry_count += 1
                err_str = str(exc)
                if "429" in err_str or "rate_limit" in err_str.lower():
                    time.sleep(25)
                    continue
                raise ModuleError(
                    code="GROQ_TRANSCRIBE_FAILED",
                    message=f"Groq 转录失败: {audio_file.name}",
                    retryable=True,
                    details={"error": err_str, "retry_count": retry_count},
                ) from exc

    @staticmethod
    def _format_srt_time(seconds: float) -> str:
        """Format *seconds* as an SRT timestamp (HH:MM:SS,mmm)."""
        td_hours = int(seconds // 3600)
        td_mins = int((seconds % 3600) // 60)
        td_secs = int(seconds % 60)
        # Truncates (not rounds) the fractional part to milliseconds.
        td_millis = int((seconds - int(seconds)) * 1000)
        return f"{td_hours:02}:{td_mins:02}:{td_secs:02},{td_millis:03}"
|
||||
Reference in New Issue
Block a user