Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 4 additions & 8 deletions demohouse/video_analyser/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,7 @@

### 相关模型

- Doubao-pro-32k:主要参与记忆信息的处理,在当前画面无法直接回答用户问题时,大语言模型将结合历史记忆提供精准答案。
- Doubao-vision-pro-32k:负责对摄像头实时捕捉的视频画面进行视觉内容理解。
- Doubao-Seed-1.6-flash:负责对摄像头实时捕捉的视频画面进行视觉内容理解以及问题回答,在当前画面无法直接回答用户问题时,大语言模型将结合历史记忆提供精准答案。
- Doubao-语音合成:负责将模型生成的文本回答转化为自然流畅的语音输出。
- Doubao-流式语音识别:将用户的语音提问转写为文本,以便于大模型对用户问题的理解与回复。

Expand All @@ -29,8 +28,7 @@
- [Node.js](https://nodejs.org/) (版本 16.2.0 或更高,推荐 Node.js 18 的 LTS 版本)
- 已获取火山方舟 API Key [参考文档](https://www.volcengine.com/docs/82379/1298459#api-key-%E7%AD%BE%E5%90%8D%E9%89%B4%E6%9D%83)
- 获取语音技术产品的 APP ID 和 Access Token,获取方式参见【附录】
- 已创建 Doubao-Vision-Pro 32K 的 endpoint [参考文档](https://www.volcengine.com/docs/82379/1099522#594199f1)
- 已创建 Doubao-Pro 32K 的endpoint [参考文档](https://www.volcengine.com/docs/82379/1099522#594199f1)
- 已开通 Doubao-Seed-1.6-flash 模型

## 快速开始

Expand All @@ -44,12 +42,10 @@
```
2. 修改配置

- 修改`backend/code/config.py` 中配置,填入刚刚获取的API keys, endpoint id 和 APP ID和 Access Token
- 修改 `backend/code/config.py` 中的配置,填入刚刚获取的 APP ID 和 Access Token

| 配置变量名 | 说明 |
| ------------ | --------------------------------- |
| VLM_ENDPOINT | doubao-vision-pro 32k endpoint id |
| LLM_ENDPOINT | doubao-pro 32k endpoint id |
| TTS_APP_ID | 语音合成模型 APP ID |
| TTS_ACCESS_TOKEN | 语音合成模型 Access Token |

Expand Down Expand Up @@ -106,7 +102,7 @@
- 实时抽帧的纯图片:前端每秒会发送一个只包含一帧图片的请求,后端会将其存入长期记忆,用于之后的回答;后端对这个请求不回复任何内容。
- 包含用户提问的请求:当前端通过 VAD 识别到用户提问时,会将识别出来的问题文字和用户讲话时的当前图片一起发送给后端,后端会以语音形式进行回复。

2. 由于每次`bot/chat` 的请求都是无状态的,前端会在header中传入一个X-Context-Id,帮助后端存储和召回用户的历史视频信息。
2. 由于每次 `bot/chat` 的请求都是无状态的,前端会在 metadata 中传入一个 context_id,帮助后端存储和召回用户的历史视频信息。

### 模型回复策略

Expand Down
4 changes: 2 additions & 2 deletions demohouse/video_analyser/backend/code/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

VLM_ENDPOINT = "<ENDPOINT_ID_FOR_DOUBAO_VISION_PRO>"
LLM_ENDPOINT = "<ENDPOINT_ID_FOR_LLM>" # 256K model for a short term memory
# Ark model ID used to summarize camera frames (visual content understanding);
# frame summaries are stored as memory for later question answering.
VISUAL_SUMMARY_ENDPOINT = "doubao-seed-1-6-flash-250615"
# Ark model ID used to answer user questions — intentionally the same
# flash model as the visual summarizer (one model handles both roles).
QUESTION_ANSWER_ENDPOINT = "doubao-seed-1-6-flash-250615"

# Speech-synthesis (TTS) credentials — placeholders; fill in the APP ID and
# Access Token obtained from the Volcengine speech console (see README appendix).
TTS_APP_ID = "<TTS_APP_ID>"
TTS_ACCESS_TOKEN = "<TTS_ACCESS_TOKEN>"
24 changes: 14 additions & 10 deletions demohouse/video_analyser/backend/code/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,19 @@

import prompt
import utils
from config import LLM_ENDPOINT, VLM_ENDPOINT, TTS_ACCESS_TOKEN, TTS_APP_ID
from config import VISUAL_SUMMARY_ENDPOINT, QUESTION_ANSWER_ENDPOINT, TTS_ACCESS_TOKEN, TTS_APP_ID

from volcenginesdkarkruntime.types.chat.completion_create_params import Thinking

from volcenginesdkarkruntime.types.chat.chat_completion_content_part_text_param import ChatCompletionContentPartTextParam

from arkitect.core.component.llm import BaseChatLanguageModel
from arkitect.core.component.llm.model import (
from arkitect.types.llm.model import (
ArkChatCompletionChunk,
ArkChatParameters,
ArkChatRequest,
ArkChatResponse,
ArkMessage,
ChatCompletionMessageTextPart,
Response,
)
from arkitect.core.component.tts import (
Expand Down Expand Up @@ -58,7 +61,7 @@ async def get_request_messages_for_llm(
request_messages = await contexts.get_history(context_id)
if isinstance(request.messages[-1].content, list):
assert isinstance(
request.messages[-1].content[0], ChatCompletionMessageTextPart
request.messages[-1].content[0], ChatCompletionContentPartTextParam
)
text = request.messages[-1].content[0].text
else:
Expand All @@ -74,7 +77,7 @@ async def chat_with_vlm(
parameters: ArkChatParameters,
) -> Tuple[bool, Optional[AsyncIterable[ArkChatCompletionChunk]]]:
vlm = BaseChatLanguageModel(
endpoint_id=VLM_ENDPOINT,
endpoint_id=VISUAL_SUMMARY_ENDPOINT,
messages=[ArkMessage(role="system", content=prompt.VLM_CHAT_PROMPT)]
+ [request.messages[-1]],
parameters=parameters,
Expand Down Expand Up @@ -108,7 +111,7 @@ async def llm_answer(
contexts, context_id, request, prompt.LLM_PROMPT
)
llm = BaseChatLanguageModel(
endpoint_id=LLM_ENDPOINT,
endpoint_id=QUESTION_ANSWER_ENDPOINT,
messages=request_messages,
parameters=parameters,
)
Expand Down Expand Up @@ -180,7 +183,7 @@ async def summarize_image(
ArkMessage(role="system", content=prompt.VLM_PROMPT)
] + request.messages
vlm = BaseChatLanguageModel(
endpoint_id=VLM_ENDPOINT,
endpoint_id=VISUAL_SUMMARY_ENDPOINT,
messages=request_messages,
parameters=parameters,
)
Expand All @@ -195,7 +198,7 @@ async def default_model_calling(
request: ArkChatRequest,
) -> AsyncIterable[Union[ArkChatCompletionChunk, ArkChatResponse]]:
# local in-memory storage should be changed to other storage in production
context_id: Optional[str] = get_headers().get("X-Context-Id", None)
context_id: Optional[str] = request.metadata["context_id"]
assert context_id is not None
contexts: utils.Storage = utils.CoroutineSafeMap.get_instance_sync()
if not await contexts.contains(context_id):
Expand All @@ -205,10 +208,11 @@ async def default_model_calling(
# Use VLM to summarize the image asynchronously and return immediately
is_image = (
isinstance(request.messages[-1].content, list)
and isinstance(request.messages[-1].content[0], ChatCompletionMessageTextPart)
and isinstance(request.messages[-1].content[0], ChatCompletionContentPartTextParam)
and request.messages[-1].content[0].text == ""
)
parameters = ArkChatParameters(**request.__dict__)
parameters.thinking = Thinking(type="disabled")
if is_image:
_ = asyncio.create_task(
summarize_image(contexts, request, parameters, context_id)
Expand Down Expand Up @@ -248,7 +252,7 @@ async def default_model_calling(
await tts_client.close()
text = ""
if isinstance(request.messages[-1].content, list) and isinstance(
request.messages[-1].content[0], ChatCompletionMessageTextPart
request.messages[-1].content[0], ChatCompletionContentPartTextParam
):
text = request.messages[-1].content[0].text
elif isinstance(request.messages[-1].content, str):
Expand Down