diff --git a/demohouse/video_analyser/README.md b/demohouse/video_analyser/README.md
index 326f4893..c687096a 100644
--- a/demohouse/video_analyser/README.md
+++ b/demohouse/video_analyser/README.md
@@ -14,8 +14,7 @@
 
 ### Related Models
 
-- Doubao-pro-32k: mainly handles memory processing; when the current frame cannot directly answer the user's question, the large language model combines historical memory to give a precise answer.
-- Doubao-vision-pro-32k: performs visual understanding of the video frames captured by the camera in real time.
+- Doubao-Seed-1.6-flash: performs visual understanding of the video frames captured by the camera in real time and answers questions; when the current frame cannot directly answer the user's question, the model combines historical memory to give a precise answer.
 - Doubao Speech Synthesis (TTS): converts the text answers generated by the model into natural, fluent speech.
 - Doubao Streaming Speech Recognition (ASR): transcribes the user's spoken question into text so the model can understand and answer it.
 
@@ -29,8 +28,7 @@
 - [Node.js](https://nodejs.org/) (version 16.2.0 or later; the Node.js 18 LTS release is recommended)
 - A Volcengine Ark API Key has been obtained [reference](https://www.volcengine.com/docs/82379/1298459#api-key-%E7%AD%BE%E5%90%8D%E9%89%B4%E6%9D%83)
 - The APP ID and Access Token for the speech products have been obtained; see the Appendix for how to get them
-- An endpoint for Doubao-Vision-Pro 32K has been created [reference](https://www.volcengine.com/docs/82379/1099522#594199f1)
-- An endpoint for Doubao-Pro 32K has been created [reference](https://www.volcengine.com/docs/82379/1099522#594199f1)
+- The Doubao-Seed-1.6-flash model has been enabled
 
 ## Quick Start
 
@@ -44,12 +42,10 @@
    ```
 
 2. Modify the configuration
-   - Update the settings in `backend/code/config.py` with the API keys, endpoint IDs, APP ID, and Access Token obtained above
+   - Update the settings in `backend/code/config.py` with the APP ID and Access Token obtained above
 
    | Configuration variable | Description |
   | ------------ | --------------------------------- |
-   | VLM_ENDPOINT | doubao-vision-pro 32k endpoint id |
-   | LLM_ENDPOINT | doubao-pro 32k endpoint id |
    | TTS_APP_ID | APP ID of the speech synthesis service |
   | TTS_ACCESS_TOKEN | Access Token of the speech synthesis service |
 
@@ -106,7 +102,7 @@
 - Image-only requests from real-time frame sampling: once per second the frontend sends a request containing a single frame; the backend stores it in long-term memory for later answers and returns nothing for this request.
 - Requests containing a user question: when the frontend detects a question via VAD, it sends the transcribed question text together with the frame captured while the user was speaking, and the backend replies with synthesized speech.
 
-2. Since every `bot/chat` request is stateless, the frontend passes an X-Context-Id in the header to help the backend store and recall the user's historical video information.
+2. Since every `bot/chat` request is stateless, the frontend passes a context_id in the metadata to help the backend store and recall the user's historical video information.
 
 ### Model Response Strategy
 
diff --git a/demohouse/video_analyser/backend/code/config.py b/demohouse/video_analyser/backend/code/config.py
index 9f04c747..196cf715 100644
--- a/demohouse/video_analyser/backend/code/config.py
+++ b/demohouse/video_analyser/backend/code/config.py
@@ -9,8 +9,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-VLM_ENDPOINT = ""
-LLM_ENDPOINT = ""  # 256K model for a short term memory
+VISUAL_SUMMARY_ENDPOINT = "doubao-seed-1-6-flash-250615"
+QUESTION_ANSWER_ENDPOINT = "doubao-seed-1-6-flash-250615"
 TTS_APP_ID = ""
 TTS_ACCESS_TOKEN = ""
diff --git a/demohouse/video_analyser/backend/code/main.py b/demohouse/video_analyser/backend/code/main.py
index 0d80b632..0a04573c 100644
--- a/demohouse/video_analyser/backend/code/main.py
+++ b/demohouse/video_analyser/backend/code/main.py
@@ -20,16 +20,19 @@
 import prompt
 import utils
 
-from config import LLM_ENDPOINT, VLM_ENDPOINT, TTS_ACCESS_TOKEN, TTS_APP_ID
+from config import VISUAL_SUMMARY_ENDPOINT, QUESTION_ANSWER_ENDPOINT, TTS_ACCESS_TOKEN, TTS_APP_ID
+
+from volcenginesdkarkruntime.types.chat.completion_create_params import Thinking
+
+from volcenginesdkarkruntime.types.chat.chat_completion_content_part_text_param import ChatCompletionContentPartTextParam
 
 from arkitect.core.component.llm import BaseChatLanguageModel
-from arkitect.core.component.llm.model import (
+from arkitect.types.llm.model import (
     ArkChatCompletionChunk,
     ArkChatParameters,
     ArkChatRequest,
     ArkChatResponse,
     ArkMessage,
-    ChatCompletionMessageTextPart,
     Response,
 )
 from arkitect.core.component.tts import (
@@ -58,7 +61,7 @@ async def get_request_messages_for_llm(
     request_messages = await contexts.get_history(context_id)
     if isinstance(request.messages[-1].content, list):
         assert isinstance(
-            request.messages[-1].content[0], ChatCompletionMessageTextPart
+            request.messages[-1].content[0], ChatCompletionContentPartTextParam
         )
         text = request.messages[-1].content[0].text
     else:
@@ -74,7 +77,7 @@ async def chat_with_vlm(
     parameters: ArkChatParameters,
 ) -> Tuple[bool, Optional[AsyncIterable[ArkChatCompletionChunk]]]:
     vlm = BaseChatLanguageModel(
-        endpoint_id=VLM_ENDPOINT,
+        endpoint_id=VISUAL_SUMMARY_ENDPOINT,
         messages=[ArkMessage(role="system", content=prompt.VLM_CHAT_PROMPT)]
         + [request.messages[-1]],
         parameters=parameters,
@@ -108,7 +111,7 @@ async def llm_answer(
         contexts, context_id, request, prompt.LLM_PROMPT
     )
     llm = BaseChatLanguageModel(
-        endpoint_id=LLM_ENDPOINT,
+        endpoint_id=QUESTION_ANSWER_ENDPOINT,
         messages=request_messages,
         parameters=parameters,
     )
@@ -180,7 +183,7 @@ async def summarize_image(
         ArkMessage(role="system", content=prompt.VLM_PROMPT)
     ] + request.messages
     vlm = BaseChatLanguageModel(
-        endpoint_id=VLM_ENDPOINT,
+        endpoint_id=VISUAL_SUMMARY_ENDPOINT,
         messages=request_messages,
         parameters=parameters,
     )
@@ -195,7 +198,7 @@ async def default_model_calling(
     request: ArkChatRequest,
 ) -> AsyncIterable[Union[ArkChatCompletionChunk, ArkChatResponse]]:
     # local in-memory storage should be changed to other storage in production
-    context_id: Optional[str] = get_headers().get("X-Context-Id", None)
+    context_id: Optional[str] = request.metadata["context_id"]
     assert context_id is not None
     contexts: utils.Storage = utils.CoroutineSafeMap.get_instance_sync()
     if not await contexts.contains(context_id):
@@ -205,10 +208,11 @@ async def default_model_calling(
     # Use VLM to summarize the image asynchronously and return immediately
     is_image = (
         isinstance(request.messages[-1].content, list)
-        and isinstance(request.messages[-1].content[0], ChatCompletionMessageTextPart)
+        and isinstance(request.messages[-1].content[0], ChatCompletionContentPartTextParam)
         and request.messages[-1].content[0].text == ""
     )
     parameters = ArkChatParameters(**request.__dict__)
+    parameters.thinking = Thinking(type="disabled")
     if is_image:
         _ = asyncio.create_task(
             summarize_image(contexts, request, parameters, context_id)
@@ -248,7 +252,7 @@ async def default_model_calling(
         await tts_client.close()
     text = ""
     if isinstance(request.messages[-1].content, list) and isinstance(
-        request.messages[-1].content[0], ChatCompletionMessageTextPart
+        request.messages[-1].content[0], ChatCompletionContentPartTextParam
     ):
         text = request.messages[-1].content[0].text
     elif isinstance(request.messages[-1].content, str):
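
For reference, below is a minimal client-side sketch of the new contract introduced by this change: the session identifier travels in the request metadata as context_id instead of the X-Context-Id header. The local URL, port, bot name, session id, and image placeholder are illustrative assumptions, not values defined by this diff.

# Hypothetical caller of the bot/chat route after this change.
# context_id is passed in "metadata" rather than an X-Context-Id header.
# URL, port, bot name, and session id below are assumed for illustration.
import requests

resp = requests.post(
    "http://localhost:8888/api/v3/bots/chat/completions",  # assumed local bot route
    json={
        "model": "video-analyser-bot",                # placeholder bot name
        "stream": True,
        "metadata": {"context_id": "session-1234"},   # replaces the old X-Context-Id header
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What am I holding right now?"},
                    {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,..."}},
                ],
            }
        ],
    },
    stream=True,
)
# Print the streamed response chunks as they arrive.
for line in resp.iter_lines():
    if line:
        print(line.decode("utf-8"))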