|
20 | 20 | list_files |
21 | 21 | ) |
22 | 22 | from services.vectordatabase_service import ElasticSearchService, get_vector_db_core |
23 | | -from utils.attachment_utils import convert_image_to_text, convert_long_text_to_text |
24 | 23 | from utils.config_utils import tenant_config_manager, get_model_name_from_config |
25 | | -from utils.prompt_template_utils import get_file_processing_messages_template |
26 | 24 | from utils.file_management_utils import save_upload_file |
27 | 25 |
|
28 | 26 | from nexent import MessageObserver |
@@ -188,225 +186,6 @@ async def list_files_impl(prefix: str, limit: Optional[int] = None): |
188 | 186 | return files |
189 | 187 |
|
190 | 188 |
|
191 | | -def get_parsing_file_data(index: int, total_files: int, filename: str) -> dict: |
192 | | - """ |
193 | | - Get structured data for parsing file message |
194 | | -
|
195 | | - Args: |
196 | | - index: Current file index (0-based) |
197 | | - total_files: Total number of files |
198 | | - filename: Name of the file being parsed |
199 | | -
|
200 | | - Returns: |
201 | | - dict: Structured data with parameters for internationalization |
202 | | - """ |
203 | | - return { |
204 | | - "params": { |
205 | | - "index": index + 1, |
206 | | - "total": total_files, |
207 | | - "filename": filename |
208 | | - } |
209 | | - } |
210 | | - |
211 | | - |
212 | | -def get_truncation_data(filename: str, truncation_percentage: int) -> dict: |
213 | | - """ |
214 | | - Get structured data for truncation message |
215 | | -
|
216 | | - Args: |
217 | | - filename: Name of the file being truncated |
218 | | - truncation_percentage: Percentage of content that was read |
219 | | -
|
220 | | - Returns: |
221 | | - dict: Structured data with parameters for internationalization |
222 | | - """ |
223 | | - return { |
224 | | - "params": { |
225 | | - "filename": filename, |
226 | | - "percentage": truncation_percentage |
227 | | - } |
228 | | - } |
229 | | - |
230 | | - |
231 | | -async def preprocess_files_generator( |
232 | | - query: str, |
233 | | - file_cache: List[dict], |
234 | | - tenant_id: str, |
235 | | - language: str, |
236 | | - task_id: str, |
237 | | - conversation_id: int |
238 | | -) -> AsyncGenerator[str, None]: |
239 | | - """ |
240 | | - Generate streaming response for file preprocessing |
241 | | -
|
242 | | - Args: |
243 | | - query: User query string |
244 | | - file_cache: List of cached file data |
245 | | - tenant_id: Tenant ID |
246 | | - language: Language preference |
247 | | - task_id: Unique task ID |
248 | | - conversation_id: Conversation ID |
249 | | -
|
250 | | - Yields: |
251 | | - str: JSON formatted streaming messages |
252 | | - """ |
253 | | - file_descriptions = [] |
254 | | - total_files = len(file_cache) |
255 | | - |
256 | | - # Create and register the preprocess task |
257 | | - task = asyncio.current_task() |
258 | | - if task: |
259 | | - preprocess_manager.register_preprocess_task( |
260 | | - task_id, conversation_id, task) |
261 | | - |
262 | | - try: |
263 | | - for index, file_data in enumerate(file_cache): |
264 | | - if task and task.done(): |
265 | | - logger.info(f"Preprocess task {task_id} was cancelled") |
266 | | - break |
267 | | - |
268 | | - progress = int((index / total_files) * 100) |
269 | | - progress_message = json.dumps({ |
270 | | - "type": "progress", |
271 | | - "progress": progress, |
272 | | - "message_data": get_parsing_file_data(index, total_files, file_data['filename']) |
273 | | - }, ensure_ascii=False) |
274 | | - yield f"data: {progress_message}\n\n" |
275 | | - await asyncio.sleep(0.1) |
276 | | - |
277 | | - try: |
278 | | - # Check if file already has an error |
279 | | - if "error" in file_data: |
280 | | - raise Exception(file_data["error"]) |
281 | | - |
282 | | - description = "" |
283 | | - truncation_percentage = None |
284 | | - file_descriptions.append(description) |
285 | | - |
286 | | - # Send processing result for each file |
287 | | - file_message_data = { |
288 | | - "type": "file_processed", |
289 | | - "filename": file_data["filename"], |
290 | | - "description": description |
291 | | - } |
292 | | - file_message = json.dumps( |
293 | | - file_message_data, ensure_ascii=False) |
294 | | - yield f"data: {file_message}\n\n" |
295 | | - await asyncio.sleep(0.1) |
296 | | - |
297 | | - # Send truncation notice immediately if file was truncated |
298 | | - if truncation_percentage is not None and int(truncation_percentage) < 100: |
299 | | - if int(truncation_percentage) == 0: |
300 | | - truncation_percentage = "< 1" |
301 | | - |
302 | | - truncation_message = json.dumps({ |
303 | | - "type": "truncation", |
304 | | - "message_data": get_truncation_data(file_data['filename'], truncation_percentage) |
305 | | - }, ensure_ascii=False) |
306 | | - yield f"data: {truncation_message}\n\n" |
307 | | - await asyncio.sleep(0.1) |
308 | | - except Exception as e: |
309 | | - error_description = f"Error parsing file {file_data['filename']}: {str(e)}" |
310 | | - logger.exception(error_description) |
311 | | - file_descriptions.append(error_description) |
312 | | - error_message = json.dumps({ |
313 | | - "type": "error", |
314 | | - "filename": file_data["filename"], |
315 | | - "message": error_description |
316 | | - }, ensure_ascii=False) |
317 | | - yield f"data: {error_message}\n\n" |
318 | | - await asyncio.sleep(0.1) |
319 | | - |
320 | | - # Send completion message |
321 | | - complete_message = json.dumps({ |
322 | | - "type": "complete", |
323 | | - "progress": 100, |
324 | | - "final_query": query |
325 | | - }, ensure_ascii=False) |
326 | | - yield f"data: {complete_message}\n\n" |
327 | | - finally: |
328 | | - preprocess_manager.unregister_preprocess_task(task_id) |
329 | | - |
330 | | - |
331 | | -async def process_image_file(query: str, filename: str, file_content: bytes, tenant_id: str, language: str = LANGUAGE["ZH"]) -> str: |
332 | | - """ |
333 | | - Process image file, convert to text using external API |
334 | | - """ |
335 | | - # Load messages based on language |
336 | | - messages = get_file_processing_messages_template(language) |
337 | | - |
338 | | - try: |
339 | | - image_stream = BytesIO(file_content) |
340 | | - text = convert_image_to_text(query, image_stream, tenant_id, language) |
341 | | - return messages["IMAGE_CONTENT_SUCCESS"].format(filename=filename, content=text) |
342 | | - except Exception as e: |
343 | | - return messages["IMAGE_CONTENT_ERROR"].format(filename=filename, error=str(e)) |
344 | | - |
345 | | - |
346 | | -async def process_text_file(query: str, filename: str, file_content: bytes, tenant_id: str, language: str = LANGUAGE["ZH"]) -> tuple[str, Optional[str]]: |
347 | | - """ |
348 | | - Process text file, convert to text using external API |
349 | | - """ |
350 | | - # Load messages based on language |
351 | | - messages = get_file_processing_messages_template(language) |
352 | | - |
353 | | - # file_content is byte data, need to send to API through file upload |
354 | | - data_process_service_url = DATA_PROCESS_SERVICE |
355 | | - api_url = f"{data_process_service_url}/tasks/process_text_file" |
356 | | - logger.info(f"Processing text file {filename} with API: {api_url}") |
357 | | - |
358 | | - try: |
359 | | - # Upload byte data as a file |
360 | | - files = { |
361 | | - 'file': (filename, file_content, 'application/octet-stream') |
362 | | - } |
363 | | - data = { |
364 | | - 'chunking_strategy': 'basic', |
365 | | - 'timeout': 60 |
366 | | - } |
367 | | - async with httpx.AsyncClient() as client: |
368 | | - response = await client.post(api_url, files=files, data=data, timeout=60) |
369 | | - |
370 | | - if response.status_code == 200: |
371 | | - result = response.json() |
372 | | - raw_text = result.get("text", "") |
373 | | - logger.info( |
374 | | - f"File processed successfully: {raw_text[:200]}...{raw_text[-200:]}..., length: {len(raw_text)}") |
375 | | - else: |
376 | | - error_detail = response.json().get('detail', 'unknown error') if response.headers.get( |
377 | | - 'content-type', '').startswith('application/json') else response.text |
378 | | - logger.error( |
379 | | - f"File processing failed (status code: {response.status_code}): {error_detail}") |
380 | | - raise Exception( |
381 | | - messages["FILE_PROCESSING_ERROR"].format(status_code=response.status_code, error_detail=error_detail)) |
382 | | - |
383 | | - except Exception as e: |
384 | | - return messages["FILE_CONTENT_ERROR"].format(filename=filename, error=str(e)), None |
385 | | - |
386 | | - try: |
387 | | - text, truncation_percentage = convert_long_text_to_text( |
388 | | - query, raw_text, tenant_id, language) |
389 | | - return messages["FILE_CONTENT_SUCCESS"].format(filename=filename, content=text), truncation_percentage |
390 | | - except Exception as e: |
391 | | - return messages["FILE_CONTENT_ERROR"].format(filename=filename, error=str(e)), None |
392 | | - |
393 | | - |
394 | | -def get_file_description(files: List[UploadFile]) -> str: |
395 | | - """ |
396 | | - Generate file description text |
397 | | - """ |
398 | | - if not files: |
399 | | - return "User provided some reference files:\nNo files provided" |
400 | | - |
401 | | - description = "User provided some reference files:\n" |
402 | | - for file in files: |
403 | | - ext = os.path.splitext(file.filename or "")[1].lower() |
404 | | - if ext in ['.jpg', '.jpeg', '.png', '.gif', '.bmp']: |
405 | | - description += f"- Image file {file.filename or ''}\n" |
406 | | - else: |
407 | | - description += f"- File {file.filename or ''}\n" |
408 | | - return description |
409 | | - |
410 | 189 | def get_llm_model(tenant_id: str): |
411 | 190 | # Get the tenant config |
412 | 191 | main_model_config = tenant_config_manager.get_model_config( |
|
0 commit comments