diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 0d679e7..0a88ce1 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -1,11 +1,8 @@ name: Publish to PyPI on: - push: - branches: - - main - tags: - - "v*" + release: + types: [published] jobs: deploy: diff --git a/README.md b/README.md index d7ac2ba..5f5aa70 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,33 @@ -# edubotics-core -## Welcome to edubotics-core by Edubotics AI! 👋 - -![PyPI](https://img.shields.io/pypi/v/edubotics-core.svg) -![GitHub stars](https://img.shields.io/github/stars/edubotics-ai/edubot-core.svg) -![License](https://img.shields.io/github/license/edubotics-ai/edubot-core.svg) -![PyPI Downloads](https://img.shields.io/pypi/dm/edubotics-core.svg) -[![GitHub Contributors](https://img.shields.io/github/contributors/edubotics-ai/edubot-core)](https://github.com/edubotics-ai/edubot-core/graphs/contributors) +

+ + edubotics-ai + +

+

+ Edubotics AI - Empower Education with AI: Create Intelligent Chatbots Quickly and Efficiently +

+

+ + PyPI + + + GitHub stars + + + License + + + PyPI Downloads + + + GitHub Contributors + +

-**Empower Education with AI: Create Intelligent Chatbots Quickly and Efficiently 🚀** +## Welcome to edubotics-core by Edubotics AI! 👋 -edubotics-core is an open-source Python library that allows developers to build LLM-based chatbots efficiently. It provides a comprehensive set of core modules for vector storage, retrieval, processing, with more to come. +**edubotics-core** is an open-source Python library that allows developers to build LLM-based chatbots efficiently. It provides a comprehensive set of core modules for vector storage, retrieval, processing, with more to come. ## 🛠 Installation @@ -20,7 +37,7 @@ You can install edubotics-core using pip: pip install edubotics-core ``` -Full documentation can be found [here](https://edubotics-ai.github.io/edubot-core/). +Full documentation can be found [here](http://docs.edubotics.ai/). ## ✨ Key Features - Modular and Extensible: Easily create, modify, and extend to the core modules. @@ -38,4 +55,4 @@ We welcome contributions to edubotics-core! If you're interested in contributing ## 📜 License -edubotics-core is licensed under the MIT License. See the [LICENSE](LICENSE) file for more details. \ No newline at end of file +edubotics-core is licensed under the MIT License. See the [LICENSE](LICENSE) file for more details. 
diff --git a/apps/ai_tutor/.env.example b/apps/ai_tutor/.env.example new file mode 100644 index 0000000..44344e8 --- /dev/null +++ b/apps/ai_tutor/.env.example @@ -0,0 +1,11 @@ +LLAMA_CLOUD_API_KEY= +OPENAI_API_KEY= +HF_TOKEN= +HUGGING_FACE_TOKEN= +LITERAL_API_KEY_LOGGING= +LITERAL_API_URL=https://cloud.getliteral.ai +CHAINLIT_AUTH_SECRET= +CHAINLIT_URL=http://localhost:8000 +OAUTH_GOOGLE_CLIENT_ID= +OAUTH_GOOGLE_CLIENT_SECRET= +EMAIL_ENCRYPTION_KEY= diff --git a/apps/ai_tutor/chainlit_app.py b/apps/ai_tutor/chainlit_app.py index ba8768a..c361806 100644 --- a/apps/ai_tutor/chainlit_app.py +++ b/apps/ai_tutor/chainlit_app.py @@ -10,7 +10,6 @@ import chainlit as cl from edubotics_core.chat.llm_tutor import LLMTutor from edubotics_core.chat.helpers import ( - get_sources, get_history_chat_resume, get_history_setup_llm, # get_last_config, @@ -22,6 +21,7 @@ from helpers import ( check_user_cooldown, reset_tokens_for_user, + get_sources, ) from helpers import get_time import copy diff --git a/apps/ai_tutor/config/config.yml b/apps/ai_tutor/config/config.yml index eed8ee7..28d2341 100644 --- a/apps/ai_tutor/config/config.yml +++ b/apps/ai_tutor/config/config.yml @@ -8,7 +8,7 @@ vectorstore: data_path: 'storage/data' # str url_file_path: 'storage/data/urls.txt' # str expand_urls: True # bool - db_option : 'RAGatouille' # str [FAISS, Chroma, RAGatouille, RAPTOR] + db_option : 'FAISS' # str [FAISS, Chroma, RAGatouille, RAPTOR] db_path : 'vectorstores' # str model : 'sentence-transformers/all-MiniLM-L6-v2' # str [sentence-transformers/all-MiniLM-L6-v2, text-embedding-ada-002'] search_top_k : 3 # int @@ -25,7 +25,7 @@ vectorstore: index_name: "new_idx" # str llm_params: - llm_arch: 'langchain' # [langchain] + llm_arch: 'langgraph' # [langchain, langgraph] use_history: True # bool generate_follow_up: False # bool memory_window: 3 # int diff --git a/apps/ai_tutor/config/config_manager.py b/apps/ai_tutor/config/config_manager.py index 6cc5edf..b01b8ed 100644 --- 
a/apps/ai_tutor/config/config_manager.py +++ b/apps/ai_tutor/config/config_manager.py @@ -1,6 +1,7 @@ from pydantic import BaseModel, conint, confloat, HttpUrl -from typing import Optional, List +from typing import Optional, List, Dict, Any import yaml +from .prompts import prompts class FaissParams(BaseModel): @@ -112,6 +113,10 @@ class APIConfig(BaseModel): timeout: conint(gt=0) = 60 +class PromptsConfig(BaseModel): + prompts: Dict[str, Any] = prompts + + class Config(BaseModel): log_dir: str = "storage/logs" log_chunk_dir: str = "storage/logs/chunks" @@ -126,6 +131,7 @@ class Config(BaseModel): token_config: TokenConfig misc: MiscConfig api_config: APIConfig + prompts_dict: PromptsConfig = PromptsConfig(prompts=prompts) class ConfigManager: @@ -142,6 +148,9 @@ def load_config(self) -> Config: with open(self.project_config_path, "r") as f: project_config_data = yaml.safe_load(f) + # Add prompts to the project config + project_config_data["prompts_dict"] = prompts + # Merge the two configurations merged_config = {**config_data, **project_config_data} diff --git a/apps/ai_tutor/config/prompts.py b/apps/ai_tutor/config/prompts.py index bdd6611..c93642a 100644 --- a/apps/ai_tutor/config/prompts.py +++ b/apps/ai_tutor/config/prompts.py @@ -1,5 +1,5 @@ prompts = { - "openai": { + "gpt-4o-mini": { "rephrase_prompt": ( "You are someone that rephrases statements. Rephrase the student's question to add context from their chat history if relevant, ensuring it remains from the student's point of view. " "Incorporate relevant details from the chat history to make the question clearer and more specific. 
" diff --git a/apps/ai_tutor/helpers.py b/apps/ai_tutor/helpers.py index a47d81c..f754f14 100644 --- a/apps/ai_tutor/helpers.py +++ b/apps/ai_tutor/helpers.py @@ -1,6 +1,93 @@ from datetime import datetime, timedelta, timezone import tiktoken from edubotics_core.chat_processor.helpers import update_user_info, convert_to_dict +import chainlit as cl + + +def get_sources(res, answer, stream=True, view_sources=False): + source_elements = [] + source_dict = {} # Dictionary to store URL elements + + for idx, source in enumerate(res["context"]): + print(source) + source_metadata = source.metadata + url = source_metadata.get("source", "N/A") + score = source_metadata.get("score", "N/A") + page = source_metadata.get("page", 1) + + lecture_tldr = source_metadata.get("tldr", "N/A") + lecture_recording = source_metadata.get("lecture_recording", "N/A") + suggested_readings = source_metadata.get("suggested_readings", "N/A") + date = source_metadata.get("date", "N/A") + + source_type = source_metadata.get("source_type", "N/A") + + url_name = f"{url}_{page}" + if url_name not in source_dict: + source_dict[url_name] = { + "text": source.page_content, + "url": url, + "score": score, + "page": page, + "lecture_tldr": lecture_tldr, + "lecture_recording": lecture_recording, + "suggested_readings": suggested_readings, + "date": date, + "source_type": source_type, + } + else: + source_dict[url_name]["text"] += f"\n\n{source.page_content}" + + full_answer = "" # Not to include the answer again if streaming + + if not stream: # First, display the answer if not streaming + # full_answer = "**Answer:**\n" + full_answer += answer + + if view_sources: + # Then, display the sources + # check if the answer has sources + if len(source_dict) == 0: + full_answer += "\n\n**No sources found.**" + return full_answer, source_elements, source_dict + else: + full_answer += "\n\n**Sources:**\n" + for idx, (url_name, source_data) in enumerate(source_dict.items()): + full_answer += f"\nSource {idx + 1} 
(Score: {source_data['score']}): {source_data['url']}\n" + + name = f"Source {idx + 1} Text\n" + full_answer += name + source_elements.append( + cl.Text(name=name, content=source_data["text"], display="side") + ) + + # Add a PDF element if the source is a PDF file + if source_data["url"].lower().endswith(".pdf"): + name = f"Source {idx + 1} PDF\n" + full_answer += name + pdf_url = f"{source_data['url']}#page={source_data['page']+1}" + source_elements.append( + cl.Pdf(name=name, url=pdf_url, display="side") + ) + + full_answer += "\n**Metadata:**\n" + for idx, (url_name, source_data) in enumerate(source_dict.items()): + full_answer += f"\nSource {idx + 1} Metadata:\n" + source_elements.append( + cl.Text( + name=f"Source {idx + 1} Metadata", + content=f"Source: {source_data['url']}\n" + f"Page: {source_data['page']}\n" + f"Type: {source_data['source_type']}\n" + f"Date: {source_data['date']}\n" + f"TL;DR: {source_data['lecture_tldr']}\n" + f"Lecture Recording: {source_data['lecture_recording']}\n" + f"Suggested Readings: {source_data['suggested_readings']}\n", + display="side", + ) + ) + + return full_answer, source_elements, source_dict def get_time(): diff --git a/apps/chainlit_base/chainlit_base.py b/apps/chainlit_base/chainlit_base.py index 740015f..393d1a1 100644 --- a/apps/chainlit_base/chainlit_base.py +++ b/apps/chainlit_base/chainlit_base.py @@ -4,9 +4,9 @@ import chainlit as cl from edubotics_core.chat.llm_tutor import LLMTutor from edubotics_core.chat.helpers import ( - get_sources, get_history_setup_llm, ) +from helpers import get_sources import copy from langchain_community.callbacks import get_openai_callback from config.config_manager import config_manager diff --git a/apps/chainlit_base/config/config_manager.py b/apps/chainlit_base/config/config_manager.py index da88660..b01b8ed 100644 --- a/apps/chainlit_base/config/config_manager.py +++ b/apps/chainlit_base/config/config_manager.py @@ -1,6 +1,7 @@ from pydantic import BaseModel, conint, confloat, 
HttpUrl -from typing import Optional, List +from typing import Optional, List, Dict, Any import yaml +from .prompts import prompts class FaissParams(BaseModel): @@ -24,7 +25,8 @@ class VectorStoreConfig(BaseModel): db_option: str = "RAGatouille" # Options: [FAISS, Chroma, RAGatouille, RAPTOR] db_path: str = "vectorstores" model: str = ( - "sentence-transformers/all-MiniLM-L6-v2" # Options: [sentence-transformers/all-MiniLM-L6-v2, text-embedding-ada-002] + # Options: [sentence-transformers/all-MiniLM-L6-v2, text-embedding-ada-002] + "sentence-transformers/all-MiniLM-L6-v2" ) search_top_k: conint(gt=0) = 3 score_threshold: confloat(ge=0.0, le=1.0) = 0.2 @@ -95,10 +97,26 @@ class MetadataConfig(BaseModel): slide_base_link: HttpUrl = "https://dl4ds.github.io" +class TokenConfig(BaseModel): + cooldown_time: conint(gt=0) = 60 + regen_time: conint(gt=0) = 180 + tokens_left: conint(gt=0) = 2000 + all_time_tokens_allocated: conint(gt=0) = 1000000 + + +class MiscConfig(BaseModel): + github_repo: HttpUrl = "https://github.com/edubotics-ai/edubot-core" + docs_website: HttpUrl = "https://dl4ds.github.io/dl4ds_tutor/" + + class APIConfig(BaseModel): timeout: conint(gt=0) = 60 +class PromptsConfig(BaseModel): + prompts: Dict[str, Any] = prompts + + class Config(BaseModel): log_dir: str = "storage/logs" log_chunk_dir: str = "storage/logs/chunks" @@ -110,7 +128,10 @@ class Config(BaseModel): splitter_options: SplitterOptions retriever: RetrieverConfig metadata: MetadataConfig + token_config: TokenConfig + misc: MiscConfig api_config: APIConfig + prompts_dict: PromptsConfig = PromptsConfig(prompts=prompts) class ConfigManager: @@ -127,6 +148,9 @@ def load_config(self) -> Config: with open(self.project_config_path, "r") as f: project_config_data = yaml.safe_load(f) + # Add prompts to the project config + project_config_data["prompts_dict"] = prompts + # Merge the two configurations merged_config = {**config_data, **project_config_data} diff --git 
a/apps/chainlit_base/config/constants.py b/apps/chainlit_base/config/constants.py new file mode 100644 index 0000000..506d0af --- /dev/null +++ b/apps/chainlit_base/config/constants.py @@ -0,0 +1,26 @@ +from dotenv import load_dotenv +import os + +load_dotenv() + +# API Keys - Loaded from the .env file + +OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") +LLAMA_CLOUD_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY") +HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN") +LITERAL_API_KEY_LOGGING = os.getenv("LITERAL_API_KEY_LOGGING") +LITERAL_API_URL = os.getenv("LITERAL_API_URL") +CHAINLIT_URL = os.getenv("CHAINLIT_URL") +EMAIL_ENCRYPTION_KEY = os.getenv("EMAIL_ENCRYPTION_KEY") + +OAUTH_GOOGLE_CLIENT_ID = os.getenv("OAUTH_GOOGLE_CLIENT_ID") +OAUTH_GOOGLE_CLIENT_SECRET = os.getenv("OAUTH_GOOGLE_CLIENT_SECRET") + +opening_message = "Hey, What Can I Help You With?\n\nYou can me ask me questions about the course logistics, course content, about the final project, or anything else!" +chat_end_message = ( + "I hope I was able to help you. If you have any more questions, feel free to ask!" +) + +# Model Paths + +LLAMA_PATH = "../storage/models/tinyllama" diff --git a/apps/chainlit_base/config/prompts.py b/apps/chainlit_base/config/prompts.py index bdd6611..c93642a 100644 --- a/apps/chainlit_base/config/prompts.py +++ b/apps/chainlit_base/config/prompts.py @@ -1,5 +1,5 @@ prompts = { - "openai": { + "gpt-4o-mini": { "rephrase_prompt": ( "You are someone that rephrases statements. Rephrase the student's question to add context from their chat history if relevant, ensuring it remains from the student's point of view. " "Incorporate relevant details from the chat history to make the question clearer and more specific. 
" diff --git a/apps/chainlit_base/helpers.py b/apps/chainlit_base/helpers.py new file mode 100644 index 0000000..f754f14 --- /dev/null +++ b/apps/chainlit_base/helpers.py @@ -0,0 +1,177 @@ +from datetime import datetime, timedelta, timezone +import tiktoken +from edubotics_core.chat_processor.helpers import update_user_info, convert_to_dict +import chainlit as cl + + +def get_sources(res, answer, stream=True, view_sources=False): + source_elements = [] + source_dict = {} # Dictionary to store URL elements + + for idx, source in enumerate(res["context"]): + print(source) + source_metadata = source.metadata + url = source_metadata.get("source", "N/A") + score = source_metadata.get("score", "N/A") + page = source_metadata.get("page", 1) + + lecture_tldr = source_metadata.get("tldr", "N/A") + lecture_recording = source_metadata.get("lecture_recording", "N/A") + suggested_readings = source_metadata.get("suggested_readings", "N/A") + date = source_metadata.get("date", "N/A") + + source_type = source_metadata.get("source_type", "N/A") + + url_name = f"{url}_{page}" + if url_name not in source_dict: + source_dict[url_name] = { + "text": source.page_content, + "url": url, + "score": score, + "page": page, + "lecture_tldr": lecture_tldr, + "lecture_recording": lecture_recording, + "suggested_readings": suggested_readings, + "date": date, + "source_type": source_type, + } + else: + source_dict[url_name]["text"] += f"\n\n{source.page_content}" + + full_answer = "" # Not to include the answer again if streaming + + if not stream: # First, display the answer if not streaming + # full_answer = "**Answer:**\n" + full_answer += answer + + if view_sources: + # Then, display the sources + # check if the answer has sources + if len(source_dict) == 0: + full_answer += "\n\n**No sources found.**" + return full_answer, source_elements, source_dict + else: + full_answer += "\n\n**Sources:**\n" + for idx, (url_name, source_data) in enumerate(source_dict.items()): + full_answer += 
f"\nSource {idx + 1} (Score: {source_data['score']}): {source_data['url']}\n" + + name = f"Source {idx + 1} Text\n" + full_answer += name + source_elements.append( + cl.Text(name=name, content=source_data["text"], display="side") + ) + + # Add a PDF element if the source is a PDF file + if source_data["url"].lower().endswith(".pdf"): + name = f"Source {idx + 1} PDF\n" + full_answer += name + pdf_url = f"{source_data['url']}#page={source_data['page']+1}" + source_elements.append( + cl.Pdf(name=name, url=pdf_url, display="side") + ) + + full_answer += "\n**Metadata:**\n" + for idx, (url_name, source_data) in enumerate(source_dict.items()): + full_answer += f"\nSource {idx + 1} Metadata:\n" + source_elements.append( + cl.Text( + name=f"Source {idx + 1} Metadata", + content=f"Source: {source_data['url']}\n" + f"Page: {source_data['page']}\n" + f"Type: {source_data['source_type']}\n" + f"Date: {source_data['date']}\n" + f"TL;DR: {source_data['lecture_tldr']}\n" + f"Lecture Recording: {source_data['lecture_recording']}\n" + f"Suggested Readings: {source_data['suggested_readings']}\n", + display="side", + ) + ) + + return full_answer, source_elements, source_dict + + +def get_time(): + return datetime.now(timezone.utc).isoformat() + + +async def check_user_cooldown( + user_info, current_time, COOLDOWN_TIME, TOKENS_LEFT, REGEN_TIME +): + # # Check if no tokens left + tokens_left = user_info.metadata.get("tokens_left", 0) + if tokens_left > 0 and not user_info.metadata.get("in_cooldown", False): + return False, None + + user_info = convert_to_dict(user_info) + last_message_time_str = user_info["metadata"].get("last_message_time") + + # Convert from ISO format string to datetime object and ensure UTC timezone + last_message_time = datetime.fromisoformat(last_message_time_str).replace( + tzinfo=timezone.utc + ) + current_time = datetime.fromisoformat(current_time).replace(tzinfo=timezone.utc) + + # Calculate the elapsed time + elapsed_time = current_time - last_message_time + 
elapsed_time_in_seconds = elapsed_time.total_seconds() + + # Calculate when the cooldown period ends + cooldown_end_time = last_message_time + timedelta(seconds=COOLDOWN_TIME) + cooldown_end_time_iso = cooldown_end_time.isoformat() + + # Check if the user is still in cooldown + if elapsed_time_in_seconds < COOLDOWN_TIME: + return True, cooldown_end_time_iso # Return in ISO 8601 format + + user_info["metadata"]["in_cooldown"] = False + # If not in cooldown, regenerate tokens + await reset_tokens_for_user(user_info, TOKENS_LEFT, REGEN_TIME) + + return False, None + + +async def reset_tokens_for_user(user_info, TOKENS_LEFT, REGEN_TIME): + user_info = convert_to_dict(user_info) + last_message_time_str = user_info["metadata"].get("last_message_time") + + last_message_time = datetime.fromisoformat(last_message_time_str).replace( + tzinfo=timezone.utc + ) + current_time = datetime.fromisoformat(get_time()).replace(tzinfo=timezone.utc) + + # Calculate the elapsed time since the last message + elapsed_time_in_seconds = (current_time - last_message_time).total_seconds() + + # Current token count (can be negative) + current_tokens = user_info["metadata"].get("tokens_left_at_last_message", 0) + current_tokens = min(current_tokens, TOKENS_LEFT) + + # Maximum tokens that can be regenerated + max_tokens = user_info["metadata"].get("max_tokens", TOKENS_LEFT) + + # Calculate how many tokens should have been regenerated proportionally + if current_tokens < max_tokens: + # Calculate the regeneration rate per second based on REGEN_TIME for full regeneration + # If current_tokens is close to 0, then the regeneration rate is relatively high, and if current_tokens is close to max_tokens, then the regeneration rate is relatively low + regeneration_rate_per_second = ( + max_tokens - max(current_tokens, 0) + ) / REGEN_TIME + + # Calculate how many tokens should have been regenerated based on the elapsed time + tokens_to_regenerate = int( + elapsed_time_in_seconds * 
regeneration_rate_per_second + ) + + # Ensure the new token count does not exceed max_tokens + new_token_count = min(current_tokens + tokens_to_regenerate, max_tokens) + + # Update the user's token count + user_info["metadata"]["tokens_left"] = new_token_count + + await update_user_info(user_info) + + +def get_num_tokens(text, model): + encoding = tiktoken.encoding_for_model(model) + tokens = encoding.encode(text) + return len(tokens) diff --git a/edubotics_core/chat/helpers.py b/edubotics_core/chat/helpers.py index 62ac2a5..9a1fd06 100644 --- a/edubotics_core/chat/helpers.py +++ b/edubotics_core/chat/helpers.py @@ -1,111 +1,41 @@ -from config.prompts import prompts -import chainlit as cl - - -def get_sources(res, answer, stream=True, view_sources=False): - source_elements = [] - source_dict = {} # Dictionary to store URL elements - - for idx, source in enumerate(res["context"]): - source_metadata = source.metadata - url = source_metadata.get("source", "N/A") - score = source_metadata.get("score", "N/A") - page = source_metadata.get("page", 1) - - lecture_tldr = source_metadata.get("tldr", "N/A") - lecture_recording = source_metadata.get("lecture_recording", "N/A") - suggested_readings = source_metadata.get("suggested_readings", "N/A") - date = source_metadata.get("date", "N/A") - - source_type = source_metadata.get("source_type", "N/A") - - url_name = f"{url}_{page}" - if url_name not in source_dict: - source_dict[url_name] = { - "text": source.page_content, - "url": url, - "score": score, - "page": page, - "lecture_tldr": lecture_tldr, - "lecture_recording": lecture_recording, - "suggested_readings": suggested_readings, - "date": date, - "source_type": source_type, - } - else: - source_dict[url_name]["text"] += f"\n\n{source.page_content}" - - full_answer = "" # Not to include the answer again if streaming - - if not stream: # First, display the answer if not streaming - # full_answer = "**Answer:**\n" - full_answer += answer - - if view_sources: - # Then, 
display the sources - # check if the answer has sources - if len(source_dict) == 0: - full_answer += "\n\n**No sources found.**" - return full_answer, source_elements, source_dict - else: - full_answer += "\n\n**Sources:**\n" - for idx, (url_name, source_data) in enumerate(source_dict.items()): - full_answer += f"\nSource {idx + 1} (Score: {source_data['score']}): {source_data['url']}\n" - - name = f"Source {idx + 1} Text\n" - full_answer += name - source_elements.append( - cl.Text(name=name, content=source_data["text"], display="side") - ) - - # Add a PDF element if the source is a PDF file - if source_data["url"].lower().endswith(".pdf"): - name = f"Source {idx + 1} PDF\n" - full_answer += name - pdf_url = f"{source_data['url']}#page={source_data['page']+1}" - source_elements.append( - cl.Pdf(name=name, url=pdf_url, display="side") - ) - - full_answer += "\n**Metadata:**\n" - for idx, (url_name, source_data) in enumerate(source_dict.items()): - full_answer += f"\nSource {idx + 1} Metadata:\n" - source_elements.append( - cl.Text( - name=f"Source {idx + 1} Metadata", - content=f"Source: {source_data['url']}\n" - f"Page: {source_data['page']}\n" - f"Type: {source_data['source_type']}\n" - f"Date: {source_data['date']}\n" - f"TL;DR: {source_data['lecture_tldr']}\n" - f"Lecture Recording: {source_data['lecture_recording']}\n" - f"Suggested Readings: {source_data['suggested_readings']}\n", - display="side", - ) - ) - - return full_answer, source_elements, source_dict - - -def get_prompt(config, prompt_type): +def get_prompt(config, prompt_type, all_prompts): llm_params = config["llm_params"] llm_loader = llm_params["llm_loader"] use_history = llm_params["use_history"] - llm_style = llm_params["llm_style"].lower() + llm_style = llm_params.get("llm_style", "normal").lower() + + print(all_prompts.keys()) + + # Validate llm_loader + if llm_loader not in all_prompts: + raise ValueError(f"Invalid llm_loader: {llm_loader}") + + loader_prompts = all_prompts[llm_loader] + + # 
Determine the appropriate prompt key + if use_history: + history_key = "prompt_with_history" + else: + history_key = "prompt_no_history" + + # Handle the case where the prompt type is a specific one like 'rephrase_prompt' + if prompt_type in loader_prompts: + selected_prompt = loader_prompts[prompt_type] + # Handle the case where the prompt type is a generic one like 'prompt_with_history' or 'prompt_no_history' + elif history_key in loader_prompts: + selected_prompt = loader_prompts[history_key] + else: + raise ValueError( + f"No valid prompt found for {prompt_type} or {history_key} in {llm_loader}" + ) - if prompt_type == "qa": - if llm_loader == "local_llm": - if use_history: - return prompts["tiny_llama"]["prompt_with_history"] - else: - return prompts["tiny_llama"]["prompt_no_history"] - else: - if use_history: - return prompts["openai"]["prompt_with_history"][llm_style] - else: - return prompts["openai"]["prompt_no_history"] - elif prompt_type == "rephrase": - return prompts["openai"]["rephrase_prompt"] + # If selected_prompt is a dictionary (e.g., different styles), return the appropriate style + if isinstance(selected_prompt, dict): + if llm_style not in selected_prompt: + raise ValueError(f"Invalid llm_style: {llm_style} for {llm_loader}") + return selected_prompt[llm_style] + else: + return selected_prompt # TODO: Do this better diff --git a/edubotics_core/chat/langchain/utils.py b/edubotics_core/chat/langchain/utils.py index 8f250de..3efb631 100644 --- a/edubotics_core/chat/langchain/utils.py +++ b/edubotics_core/chat/langchain/utils.py @@ -7,7 +7,6 @@ from langchain_core.runnables import Runnable, RunnableBranch, RunnablePassthrough from langchain_core.runnables.history import RunnableWithMessageHistory from langchain_core.chat_history import BaseChatMessageHistory -from langchain_core.pydantic_v1 import BaseModel, Field from langchain.chains.combine_documents.base import ( DEFAULT_DOCUMENT_PROMPT, DEFAULT_DOCUMENT_SEPARATOR, @@ -21,6 +20,7 @@ from 
langchain_core.callbacks.manager import AsyncCallbackManagerForChainRun import inspect from langchain_core.messages import BaseMessage +from pydantic import BaseModel, Field CHAT_TURN_TYPE = Union[Tuple[str, str], BaseMessage] @@ -240,7 +240,7 @@ def create_stuff_documents_chain( document_separator: str = DEFAULT_DOCUMENT_SEPARATOR, ) -> Runnable[Dict[str, Any], Any]: """Create a chain for passing a list of Documents to a model.""" - _validate_prompt(prompt) + _validate_prompt(prompt, "context") _document_prompt = document_prompt or DEFAULT_DOCUMENT_PROMPT _output_parser = output_parser or StrOutputParser() diff --git a/edubotics_core/chat/langgraph/langgraph_agentic_rag.py b/edubotics_core/chat/langgraph/langgraph_agentic_rag.py new file mode 100644 index 0000000..939bde0 --- /dev/null +++ b/edubotics_core/chat/langgraph/langgraph_agentic_rag.py @@ -0,0 +1,404 @@ +from edubotics_core.chat.langchain.utils import ( + BaseChatMessageHistory, + InMemoryHistory, +) +from edubotics_core.chat.base import BaseRAG +from langchain_community.chat_message_histories import ChatMessageHistory +from langchain_openai import ChatOpenAI +from langgraph.prebuilt import ToolNode, tools_condition +from langgraph.graph import StateGraph, START, END +from langchain_core.pydantic_v1 import BaseModel, Field +from typing import Annotated, Optional, TypedDict, Sequence, Literal +from langchain_core.messages import BaseMessage, HumanMessage, AIMessage +from langgraph.graph.message import add_messages +from langchain_core.prompts import PromptTemplate, BasePromptTemplate +from langchain_core.callbacks import Callbacks +from langchain_core.retrievers import BaseRetriever +from langchain_core.tools.simple import Tool +from langchain.schema import Document +from functools import partial +from langchain_core.messages import SystemMessage + + +class RetrieverInput(BaseModel): + """Input schema for the retriever tool.""" + + input: str = Field(description="Query to look up in retriever") + + +def 
_get_relevant_documents( + input: str, + retriever: BaseRetriever, + document_prompt: BasePromptTemplate, + document_separator: str, + callbacks: Callbacks = None, +) -> list: + """Retrieve relevant documents synchronously.""" + docs = retriever.get_relevant_documents(input) + docs_list = [] + for doc in docs: + docs_list.append( + { + "page_content": doc.page_content, + "metadata": doc.metadata, + "source": doc.metadata.get("source"), + "page": doc.metadata.get("page"), + "score": doc.metadata.get("score"), + } + ) + return docs_list + + +async def _aget_relevant_documents( + input: str, + retriever: BaseRetriever, + document_prompt: BasePromptTemplate, + document_separator: str, + callbacks: Callbacks = None, +) -> list: + """Retrieve relevant documents asynchronously.""" + docs = await retriever.aget_relevant_documents(input) + docs_list = [] + for doc in docs: + docs_list.append( + { + "page_content": doc.page_content, + "metadata": doc.metadata, + "source": doc.metadata.get("source"), + "page": doc.metadata.get("page"), + "score": doc.metadata.get("score"), + } + ) + return docs_list + + +def create_retriever_tool( + retriever: BaseRetriever, + name: str, + description: str, + *, + document_prompt: Optional[BasePromptTemplate] = None, + document_separator: str = "\n\n", +) -> Tool: + """Create a tool for document retrieval.""" + document_prompt = document_prompt or PromptTemplate.from_template("{page_content}") + func = partial( + _get_relevant_documents, + retriever=retriever, + document_prompt=document_prompt, + document_separator=document_separator, + ) + afunc = partial( + _aget_relevant_documents, + retriever=retriever, + document_prompt=document_prompt, + document_separator=document_separator, + ) + return Tool( + name=name, + description=description, + func=func, + coroutine=afunc, + args_schema=RetrieverInput, + ) + + +class AgentState(TypedDict): + """State representation for the agent.""" + + messages: Annotated[Sequence[BaseMessage], add_messages] + 
context: dict + + +class GradeDocuments(BaseModel): + """Binary score for relevance check on retrieved documents.""" + + binary_score: str = Field( + description="Documents are relevant to the question, 'yes' or 'no'" + ) + + +class LanggraphAgenticRAG(BaseRAG): + def __init__( + self, + llm, + memory, + retriever, + qa_prompt: str, + rephrase_prompt: str, + config: dict, + callbacks=None, + ): + """ + Initialize the LanggraphAgenticRAG class. + + Args: + llm: The language model instance. + memory: The chat message history instance. + retriever: The retriever instance. + qa_prompt (str): The QA prompt string. + rephrase_prompt (str): The rephrase prompt string. + config (dict): Configuration dictionary. + callbacks (Optional[list]): Optional list of callbacks. + """ + self.llm = llm + self.memory = self.add_history_from_list(memory) + self.retriever = retriever + self.retriever_tool = create_retriever_tool( + self.retriever, + "retrieve_contextual_answers", + "Search and return information about the course content.", + ) + self.tools = [self.retriever_tool] + + self.qa_prompt = qa_prompt + self.rephrase_prompt = rephrase_prompt + + self.summary_prompt = """ + Summarize the main points from the retrieved documents in a concise manner, highlighting key information that directly answers the user’s query. Avoid additional questions or exploration. + """ + + self.action_prompt = """ + Extract actionable insights from the retrieved documents. Summarize any relevant information and suggest specific next steps or follow-up actions the user can take based on the provided content. 
+ """ + + self.config = config + self.store = {} + + # Initialize the agentic graph workflow + self.graph = self.initialize_graph() + + def rewrite(self, state): + print("---TRANSFORM QUERY---") + question = state["messages"][0].content + messages = [ + SystemMessage( + content="Your task is to rephrase the user's question to make it more precise and clear, focusing on specificity to improve retrieval accuracy. Do not ask any questions back to the user. Only provide the rephrased question." + ), + HumanMessage(content=question), + ] + + # LLM + model = ChatOpenAI(temperature=0, model_name="gpt-4o-mini", streaming=True) + response = model.invoke(messages) + return {"messages": [response]} # Return the rephrased question + + def generate(self, state): + print("---GENERATE---") + question = state["messages"][0].content + docs = state["context"] + + messages = [ + SystemMessage( + content="Provide a concise answer to the user's question using the provided context. Only include the final answer without any additional explanations or reasoning." + ), + HumanMessage(content=f"Context:\n{docs}\n\nQuestion:\n{question}"), + ] + + # LLM + llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0, streaming=True) + + # Run + response = llm.invoke(messages) + return {"messages": [response], "context": docs} + + def extract_actionable_insights(self, state): + print("---EXTRACT ACTIONABLE INSIGHTS---") + # retrieved_docs = state["context"] + question = state["messages"][0].content + + messages = [ + SystemMessage( + content="Based on the provided context, provide actionable insights that address the user's question. Summarize relevant information and suggest specific next steps or follow-up actions. Do not include your reasoning or the context in your final response." 
+ ), + HumanMessage(content=f"Question: {question}"), + ] + + # LLM + llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0, streaming=True) + + # Run + response = llm.invoke(messages) + return {"messages": [response], "context": state["context"]} + + def grade_documents(self, state) -> Literal["extract_insights", "rewrite"]: + print("---CHECK RELEVANCE---") + model = ChatOpenAI(temperature=0, model_name="gpt-4o-mini", streaming=True) + llm_with_tool = model.with_structured_output(GradeDocuments) + + prompt = PromptTemplate( + template="""Evaluate whether the retrieved documents contain information directly relevant to the user's question. Respond with 'yes' if relevant, or 'no' if not, without any additional explanation.""", + input_variables=["context", "question"], + ) + + chain = prompt | llm_with_tool + question = state["messages"][0].content + docs = state["context"] + scored_result = chain.invoke({"question": question, "context": docs}) + score = scored_result.binary_score.lower() + if score == "yes": + print("---DECISION: DOCS RELEVANT---") + return "extract_insights" + else: + print("---DECISION: DOCS NOT RELEVANT---") + return "rewrite" + + def agent(self, state): + print("---CALL AGENT---") + messages = state["messages"] + model = ChatOpenAI(temperature=0, streaming=True, model="gpt-4o-mini") + model = model.bind_tools(self.tools) + response = model.invoke(messages) + # We return a list, because this will get added to the existing list + return {"messages": [response], "context": state["context"]} + + def initialize_graph(self): + """ + Initialize the agentic graph workflow. + + Returns: + StateGraph: The initialized graph for the RAG process. 
+ """ + # Define a new graph + workflow = StateGraph(AgentState) + + # Define the nodes + workflow.add_node("agent", self.agent) + retrieve = ToolNode([self.retriever_tool]) + workflow.add_node("retrieve", retrieve) + workflow.add_node("rewrite", self.rewrite) + workflow.add_node("generate", self.generate) + workflow.add_node("extract_insights", self.extract_actionable_insights) + + # Start with the agent + workflow.add_edge(START, "agent") + + # Agent decides whether to retrieve or not + workflow.add_conditional_edges( + "agent", + tools_condition, + { + "tools": "retrieve", + END: END, + }, + ) + + # After retrieval, go to generate + workflow.add_edge("retrieve", "generate") + + # After generating, grade the documents + workflow.add_conditional_edges( + "generate", + self.grade_documents, + { + "extract_insights": "extract_insights", + "rewrite": "rewrite", + }, + ) + + # After extracting insights, end the process + workflow.add_edge("extract_insights", END) + + # If rewriting is necessary, loop back to the agent + workflow.add_edge("rewrite", "agent") + + # Compile the workflow + graph = workflow.compile() + return graph + + def add_history_from_list(self, conversation_list): + """ + Add messages from a list to the chat history. + + Args: + conversation_list (list): The list of messages to add. 
+ """ + history = ChatMessageHistory() + for message in conversation_list: + message_type = ( + message.get("type", None) + if isinstance(message, dict) + else getattr(message, "type", None) + ) + message_content = ( + message.get("content", None) + if isinstance(message, dict) + else getattr(message, "content", None) + ) + + if message_type in ["human", "user_message"]: + history.add_user_message(message_content) + elif message_type in ["ai", "ai_message"]: + history.add_ai_message(message_content) + return history + + def get_session_history( + self, user_id: str, conversation_id: str, memory_window: int + ) -> BaseChatMessageHistory: + """ + Get the session history for a user and conversation. + + Args: + user_id (str): The user identifier. + conversation_id (str): The conversation identifier. + memory_window (int): The number of conversations to consider for context. + + Returns: + BaseChatMessageHistory: The chat message history. + """ + if (user_id, conversation_id) not in self.store: + self.store[(user_id, conversation_id)] = InMemoryHistory() + self.store[(user_id, conversation_id)].add_messages(self.memory.messages) + return self.store[(user_id, conversation_id)] + + async def invoke(self, user_query, config, **kwargs): + """ + Invoke the agentic RAG process with the given user query. + + Args: + user_query (dict): The user query containing 'input'. + config (dict): Configuration for the process. + + Returns: + dict: The response containing 'answer' and 'context'. 
+ """ + inputs = {"messages": [HumanMessage(content=user_query["input"])]} + output = {} + + # Stream outputs from the graph + for node_output in self.graph.stream(inputs, {"recursion_limit": 10}): + output.update(node_output) + + print("---END---") + + # Extract the final answer and context + key = next( + ( + k + for k in ["extract_insights", "rewrite", "generate", "agent"] + if k in output + ), + None, + ) + + if key: + context_data = output[key].get("context", []) + # Convert context_data to a list of Document objects + res_context = ( + [ + Document(page_content=doc["page_content"], metadata=doc["metadata"]) + for doc in context_data + ] + if isinstance(context_data, list) + else [] + ) + answer_message = output[key]["messages"][0] + if isinstance(answer_message, AIMessage): + answer = answer_message.content + else: + answer = answer_message + res = {"answer": answer, "context": res_context} + else: + res = {"answer": None, "context": []} + + return res diff --git a/edubotics_core/chat/llm_tutor.py b/edubotics_core/chat/llm_tutor.py index 4499887..ca1038f 100644 --- a/edubotics_core/chat/llm_tutor.py +++ b/edubotics_core/chat/llm_tutor.py @@ -6,6 +6,9 @@ Langchain_RAG_V2, QuestionGenerator, ) +from edubotics_core.chat.langgraph.langgraph_agentic_rag import ( + LanggraphAgenticRAG, +) class LLMTutor: @@ -23,9 +26,11 @@ def __init__(self, config, user, logger=None): self.user = user self.logger = logger self.vector_db = VectorStoreManager(config, logger=self.logger).load_database() - self.qa_prompt = get_prompt(config, "qa") # Initialize qa_prompt + self.qa_prompt = get_prompt( + config, "qa", all_prompts=self.config["prompts_dict"]["prompts"] + ) # Initialize qa_prompt self.rephrase_prompt = get_prompt( - config, "rephrase" + config, "rephrase", all_prompts=self.config["prompts_dict"]["prompts"] ) # Initialize rephrase_prompt # TODO: Removed this functionality for now, don't know if we need it @@ -57,7 +62,7 @@ def update_llm(self, old_config, new_config): if 
"llm_params.llm_style" in changes: self.qa_prompt = get_prompt( - self.config, "qa" + self.config, "qa", all_prompts=self.config["prompts_dict"]["prompts"] ) # Update qa_prompt if ELI5 changes def get_config_changes(self, old_config, new_config): @@ -117,12 +122,21 @@ def retrieval_qa_chain( config=self.config, callbacks=callbacks, ) - - self.question_generator = QuestionGenerator() + elif self.config["llm_params"]["llm_arch"] == "langgraph": + self.qa_chain = LanggraphAgenticRAG( + llm=llm, + memory=memory, + retriever=retriever, + qa_prompt=qa_prompt, + rephrase_prompt=rephrase_prompt, + config=self.config, + callbacks=callbacks, + ) else: raise ValueError( f"Invalid LLM Architecture: {self.config['llm_params']['llm_arch']}" ) + self.question_generator = QuestionGenerator() return self.qa_chain def load_llm(self): diff --git a/edubotics_core/chat_processor/literal_ai.py b/edubotics_core/chat_processor/literal_ai.py index 82dd697..ca7fb13 100644 --- a/edubotics_core/chat_processor/literal_ai.py +++ b/edubotics_core/chat_processor/literal_ai.py @@ -1,7 +1,7 @@ -from chainlit.data import ChainlitDataLayer +from chainlit.data import LiteralDataLayer # update custom methods here (Ref: https://github.com/Chainlit/chainlit/blob/4b533cd53173bcc24abe4341a7108f0070d60099/backend/chainlit/data/__init__.py) -class CustomLiteralDataLayer(ChainlitDataLayer): +class CustomLiteralDataLayer(LiteralDataLayer): def __init__(self, **kwargs): super().__init__(**kwargs) diff --git a/edubotics_core/config/constants.py b/edubotics_core/config/constants.py index 6c0feb7..ef2b51d 100644 --- a/edubotics_core/config/constants.py +++ b/edubotics_core/config/constants.py @@ -3,10 +3,12 @@ import os from dotenv import load_dotenv -load_dotenv() +load_dotenv(".env") -# Required Constants # TODO: MOVE THIS TO APP SPECIFIC DIRECTORY +# Centralized definition of required constants for easy management and access TIMEOUT = os.getenv("TIMEOUT", 60) -OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") 
-LLAMA_CLOUD_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY") -HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN") +OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "") +LLAMA_CLOUD_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY", "") +HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN", "") +GITHUB_PERSONAL_ACCESS_TOKEN = os.getenv("GITHUB_PERSONAL_ACCESS_TOKEN", "") +GITHUB_USERNAME = os.getenv("GITHUB_USERNAME", "") diff --git a/edubotics_core/dataloader/data_loader.py b/edubotics_core/dataloader/data_loader.py index 8c24ea9..653d652 100644 --- a/edubotics_core/dataloader/data_loader.py +++ b/edubotics_core/dataloader/data_loader.py @@ -17,10 +17,12 @@ from urllib.parse import urljoin import html2text import bs4 -import PyPDF2 from edubotics_core.dataloader.pdf_readers.base import PDFReader from edubotics_core.dataloader.pdf_readers.llama import LlamaParser from edubotics_core.dataloader.pdf_readers.gpt import GPTParser +from edubotics_core.dataloader.repo_readers.github import GithubReader +from edubotics_core.dataloader.repo_readers.helpers import read_notebook_from_file +from edubotics_core.dataloader.metadata_extractor import LLMMetadataExtractor from edubotics_core.dataloader.helpers import get_metadata from edubotics_core.config.constants import TIMEOUT @@ -55,9 +57,10 @@ def check_links(self, base_url, html_content): resp = requests.head(absolute_url, timeout=TIMEOUT) if resp.status_code != 200: - logger.warning( - f"Link {absolute_url} is broken. Status code: {resp.status_code}" - ) + # logger.warning( + # f"Link {absolute_url} is broken. 
Status code: {resp.status_code}" + # ) + pass return str(soup) @@ -75,30 +78,24 @@ def read_html(self, url): class FileReader: - def __init__(self, logger, kind): + def __init__(self, logger, config, kind): self.logger = logger + self.config = config self.kind = kind + if kind == "llama": self.pdf_reader = LlamaParser() elif kind == "gpt": self.pdf_reader = GPTParser() else: self.pdf_reader = PDFReader() + self.web_reader = HTMLReader() + self.github_reader = GithubReader() self.logger.info( f"Initialized FileReader with {kind} PDF reader and HTML reader" ) - def extract_text_from_pdf(self, pdf_path): - text = "" - with open(pdf_path, "rb") as file: - reader = PyPDF2.PdfReader(file) - num_pages = len(reader.pages) - for page_num in range(num_pages): - page = reader.pages[page_num] - text += page.extract_text() - return text - def read_pdf(self, temp_file_path: str): documents = self.pdf_reader.parse(temp_file_path) return documents @@ -135,6 +132,31 @@ def read_tex_from_url(self, tex_url): self.logger.error(f"Failed to fetch .tex file from URL: {tex_url}") return None + def read_github_repo(self, github_url: str): + repo_contents = self.github_reader.get_repo_contents(github_url) + docs = [ + Document(page_content=content, metadata={"source": file}) + for file, content in repo_contents.items() + if content is not None + ] + for i, doc in enumerate(docs): + doc.metadata["page"] = i + + return docs + + def read_notebook(self, notebook_path): + if "github.com" in notebook_path and "blob" in notebook_path: + notebook_path = notebook_path.replace( + "github.com", "raw.githubusercontent.com" + ) + notebook_path = notebook_path.replace("/blob/", "/") + self.logger.info(f"Changed notebook path to {notebook_path}") + + return read_notebook_from_file( + notebook_path, + headers_to_split_on=self.config["content"]["notebookheaders_to_split_on"], + ) + class ChunkProcessor: def __init__(self, config, logger): @@ -223,7 +245,7 @@ def process_chunks( def chunk_docs(self, 
file_reader, uploaded_files, weblinks): addl_metadata = get_metadata( *self.config["metadata"]["metadata_links"], self.config - ) # For any additional metadata + ) # For any additional metadata''' # remove already processed files if reparse_files is False if not self.config["vectorstore"]["reparse_files"]: @@ -280,17 +302,29 @@ def process_documents( file_data = {} file_metadata = {} - for doc in documents: - # if len(doc.page_content) <= 400: # better approach to filter out non-informative documents - # continue - - page_num = doc.metadata.get("page", 0) + for i, doc in enumerate(documents): + page_num = doc.metadata.get("page", i) file_data[page_num] = doc.page_content # Create a new dictionary for metadata in each iteration - metadata = addl_metadata.get(file_path, {}).copy() - metadata["page"] = page_num + metadata = doc.metadata metadata["source"] = file_path + metadata["page"] = page_num + + if self.config["metadata"]["lectures_pattern"] in file_path: + addl_metadata_copy = addl_metadata.copy() + metadata.update(addl_metadata_copy) + metadata["content_type"] = "lecture" + elif self.config["metadata"]["assignments_pattern"] in file_path: + addl_metadata = LLMMetadataExtractor( + fields=self.config["metadata"]["assignment_metadata_fields"] + ).extract_metadata(file_path) + + metadata.update(addl_metadata) + metadata["content_type"] = "assignment" + else: + metadata["content_type"] = "other" + file_metadata[page_num] = metadata if self.config["vectorstore"]["db_option"] not in ["RAGatouille"]: @@ -318,6 +352,7 @@ def process_file(self, file_path, file_index, file_reader, addl_metadata): "docx": file_reader.read_docx, "srt": file_reader.read_srt, "tex": file_reader.read_tex_from_url, + "ipynb": file_reader.read_notebook, } if file_type not in read_methods: self.logger.warning(f"Unsupported file type: {file_type}") @@ -340,14 +375,16 @@ def process_file(self, file_path, file_index, file_reader, addl_metadata): self.logger.error(f"Error processing file {file_name}: 
{str(e)}") def process_weblink(self, link, link_index, file_reader, addl_metadata): + self.logger.info(f"Reading link {link_index + 1} : {link}") + if link in self.document_data: return - self.logger.info(f"Reading link {link_index + 1} : {link}") - try: if "youtube" in link: documents = file_reader.read_youtube_transcript(link) + elif "github.com" in link: + documents = file_reader.read_github_repo(link) else: documents = file_reader.read_html(link) @@ -405,7 +442,7 @@ def load_document_data(self): class DataLoader: def __init__(self, config, logger=None): self.file_reader = FileReader( - logger=logger, kind=config["llm_params"]["pdf_reader"] + logger=logger, config=config, kind=config["llm_params"]["pdf_reader"] ) self.chunk_processor = ChunkProcessor(config, logger=logger) @@ -419,10 +456,7 @@ def get_chunks(self, uploaded_files, weblinks): import yaml import argparse - parser = argparse.ArgumentParser(description="Process some links.") - parser.add_argument( - "--links", nargs="+", required=True, help="List of links to process." 
- ) + parser = argparse.ArgumentParser(description="Data Loader") parser.add_argument( "--config_file", type=str, help="Path to the main config file", required=True ) @@ -434,7 +468,6 @@ def get_chunks(self, uploaded_files, weblinks): ) args = parser.parse_args() - links_to_process = args.links logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -455,6 +488,15 @@ def get_chunks(self, uploaded_files, weblinks): if file != "urls.txt" ] + urls_file = os.path.join(STORAGE_DIR, "urls.txt") + with open(urls_file, "r") as f: + weblinks = f.readlines() + + weblinks = [link.strip() for link in weblinks] + + print(f"Uploaded files: {uploaded_files}") + print(f"Web links: {weblinks}") + data_loader = DataLoader(config, logger=logger) # Just for testing ( @@ -463,8 +505,8 @@ def get_chunks(self, uploaded_files, weblinks): documents, document_metadata, ) = data_loader.get_chunks( - links_to_process, - [], + uploaded_files, + weblinks, ) print(document_names[:5]) diff --git a/edubotics_core/dataloader/metadata_extractor.py b/edubotics_core/dataloader/metadata_extractor.py new file mode 100644 index 0000000..83790da --- /dev/null +++ b/edubotics_core/dataloader/metadata_extractor.py @@ -0,0 +1,114 @@ +import requests +import json +from typing import List +from bs4 import BeautifulSoup +from dotenv import load_dotenv +from openai import OpenAI + +load_dotenv() + + +def gather_metadata(files, urls, config): + pass + + +def filter_assignment_urls(files, config): + assignment_pattern = config["metadata"]["assignment_base_link"] + assignment_urls = [] + for file in files: + if assignment_pattern in file: + assignment_urls.append(file) + + return assignment_urls + + +def filter_lecture_urls(files, urls, config): + lecture_pattern = config["metadata"]["lectures_pattern"] + lecture_urls = [] + for file in files: + if lecture_pattern in file: + lecture_urls.append(file) + + return lecture_urls + + +class LLMMetadataExtractor: + """ + Extracts metadata from a given 
webpage using an LLM. + """ + + def __init__(self, fields: List[str]): + self.client = OpenAI() + self.fields = fields + + def extract_metadata(self, url: str) -> dict: + # Fetch and parse the webpage + response = requests.get(url, timeout=60) + soup = BeautifulSoup(response.text, "html.parser") + + # Extract the main content (you might need to adjust this based on the page structure) + content = soup.find("main") or soup.find("body") + text = content.get_text(separator="\n", strip=True) + + fields_str = ", ".join(self.fields) + + prompt = f""" + Extract the following metadata from the given webpage about a course assignment: + {fields_str} + + Please format the output as a JSON object with keys: {fields_str}. + If applicable, the source_file is the link that points to an assignment file (e.g. .ipynb, .pdf, etc). + Usually, it's under an tag with the texts "Download", "View" or "notebook". + If any information is not found, set the value to null. + + Text: + {text[:4000]} + + JSON Output: + """ + + # Call the OpenAI API + response = self.client.chat.completions.create( + model="gpt-3.5-turbo", + messages=[ + { + "role": "system", + "content": "You are a helpful assistant that extracts metadata from course assignment texts.", + }, + {"role": "user", "content": prompt}, + ], + temperature=0.2, + ) + + try: + metadata = ( + response.choices[0] + .message.content.replace("```json\n", "") + .replace("\n```", "") + ) + metadata = json.loads(metadata) + + # TODO: This is a hack to get the source_file. We need to improve the LLM output. 
+ try: + source_file = soup.find("a", string=metadata["source_file"]) + metadata["source_file"] = source_file["href"] + except Exception as e: + print("Error: Could not find source_file in the webpage") + print(e) + + except json.JSONDecodeError as e: + print("Error: Could not parse JSON from LLM response") + print(e) + metadata = {} + + return metadata + + +if __name__ == "__main__": + extractor = LLMMetadataExtractor( + fields=["title", "due_date", "release_date", "source_file"] + ) + metadata = extractor.extract_metadata( + "https://tools4ds.github.io/fa2024/assignments/01_assignment.html" + ) + print(json.dumps(metadata, indent=2)) diff --git a/edubotics_core/dataloader/pdf_readers/gpt.py b/edubotics_core/dataloader/pdf_readers/gpt.py index 67af0dd..d495ca7 100644 --- a/edubotics_core/dataloader/pdf_readers/gpt.py +++ b/edubotics_core/dataloader/pdf_readers/gpt.py @@ -1,12 +1,11 @@ import base64 -import os import requests from io import BytesIO from openai import OpenAI from pdf2image import convert_from_path from langchain.schema import Document -from edubotics_core.config.constants import TIMEOUT +from edubotics_core.config.constants import TIMEOUT, OPENAI_API_KEY class GPTParser: @@ -17,7 +16,7 @@ class GPTParser: def __init__(self): self.client = OpenAI() - self.api_key = os.getenv("OPENAI_API_KEY") + self.api_key = OPENAI_API_KEY self.prompt = """ The provided documents are images of PDFs of lecture slides of deep learning material. They contain LaTeX equations, images, and text. 
diff --git a/edubotics_core/dataloader/repo_readers/__init__.py b/edubotics_core/dataloader/repo_readers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/edubotics_core/dataloader/repo_readers/github.py b/edubotics_core/dataloader/repo_readers/github.py new file mode 100644 index 0000000..dd13953 --- /dev/null +++ b/edubotics_core/dataloader/repo_readers/github.py @@ -0,0 +1,222 @@ +import requests +import base64 +from edubotics_core.dataloader.repo_readers.helpers import extract_notebook_content +from edubotics_core.config.constants import ( + GITHUB_USERNAME, + GITHUB_PERSONAL_ACCESS_TOKEN, +) +import argparse + + +class GithubReader: + def __init__(self, username=None, personal_access_token=None): + """ + Initialize the GithubReader with the username and personal access token. + + Args: + username (str): The GitHub username for authentication. + personal_access_token (str): The GitHub personal access token for authentication. + """ + if username is None: + self.username = GITHUB_USERNAME + else: + self.username = username + if personal_access_token is None: + self.personal_access_token = GITHUB_PERSONAL_ACCESS_TOKEN + else: + self.personal_access_token = personal_access_token + + self.ignore_files = [ + "README.md", + ".DS_Store", + "requirements.txt", + "LICENSE", + "COPYING", + "COPYRIGHT", + "NOTICE", + "AUTHORS", + "CONTRIBUTORS", + ".gitignore", + ] + + self.ignore_ext = [ + "csv", + "pyc", + "jpg", + "png", + "gif", + "jpeg", + ] + + self.repo_allow_list = ["release/", "contents/"] + + if not self.personal_access_token: + raise Warning( + "Personal access token is not set. You may need to use a personal access token with the correct scopes for private repositories." + ) + + def get_repo_contents(self, url): + """ + Fetch the contents of a private GitHub repository. + + Args: + repo_owner (str): The owner of the repository. + repo_name (str): The name of the repository. + branch (str, optional): The branch to fetch the contents from. 
Defaults to 'main'. + path (str, optional): The path to the repository. Defaults to ''. + """ + repo_owner, repo_name, branch = self.parse_github_url(url) + + # top level path is '' + return self.read_github_repo_contents(repo_owner, repo_name, branch) + + def read_github_repo_contents(self, repo_owner, repo_name, branch="main", path=""): + """ + Fetch the contents of a private GitHub repository. + + Args: + repo_owner (str): The owner of the repository. + repo_name (str): The name of the repository. + branch (str, optional): The branch to fetch the contents from. Defaults to 'main'. + + Returns: + dict: The contents of the repository, with file paths as keys and file contents as values. + """ + repo_contents = {} + + url = f"https://api.github.com/repos/{repo_owner}/{repo_name}/contents/{path}?ref={branch}" + auth_string = f"{self.username}:{self.personal_access_token}" + auth_bytes = auth_string.encode("ascii") + auth_b64 = base64.b64encode(auth_bytes).decode("ascii") + + headers = {"Authorization": f"Basic {auth_b64}"} + + response = requests.get(url, headers=headers, timeout=60) + + if response.status_code == 200: + for item in response.json(): + if item["type"] == "file": + + file_path = item["path"] + extension = file_path.split(".")[-1] + + if self.repo_allow_list: + if not any( + pattern in file_path for pattern in self.repo_allow_list + ): + continue + if file_path in self.ignore_files or extension in self.ignore_ext: + continue + + file_content = self.get_github_file_content( + repo_owner, repo_name, file_path, branch + ) + + full_path = f"https://github.com/{repo_owner}/{repo_name}/blob/{branch}/{file_path}" + repo_contents[full_path] = file_content + + elif item["type"] == "dir": + + sub_dir_path = item["path"] + sub_dir_contents = self.read_github_repo_contents( + repo_owner, repo_name, branch, sub_dir_path + ) + + repo_contents.update(sub_dir_contents) + else: + print( + f"Failed to fetch repository contents: {response.status_code}. 
You may need to use a personal access token with the correct scopes." + ) + return repo_contents + + def get_github_file_content( + self, repo_owner: str, repo_name: str, file_path: str, branch: str = "main" + ): + """ + Fetch the content of a file from a private GitHub repository. + + Args: + repo_owner (str): The owner of the repository. + repo_name (str): The name of the repository. + file_path (str): The path to the file within the repository. + branch (str, optional): The branch to fetch the file from. Defaults to 'main'. + + Returns: + str: The content of the file, or None if the request fails. + """ + url = f"https://api.github.com/repos/{repo_owner}/{repo_name}/contents/{file_path}?ref={branch}" + auth_string = f"{self.username}:{self.personal_access_token}" + auth_bytes = auth_string.encode("ascii") + auth_b64 = base64.b64encode(auth_bytes).decode("ascii") + + headers = {"Authorization": f"Basic {auth_b64}"} + response = requests.get(url, headers=headers, timeout=60) + if response.status_code == 200: + content = response.json()["content"] + decoded_content = base64.b64decode(content).decode("utf-8") + + if not decoded_content.strip(): + print(f"File {file_path} is empty.") + return None + + if file_path.endswith(".ipynb"): + decoded_content = extract_notebook_content(decoded_content) + + return decoded_content + else: + print(f"Failed to fetch file: {response.status_code}") + return None + + @staticmethod + def parse_github_url(url): + """ + Parse a GitHub URL to extract the repository owner, name, and branch. + + Args: + url (str): The GitHub repository URL. + + Returns: + tuple: A tuple containing (repo_owner, repo_name, branch). + If branch is not specified in the URL, it defaults to 'main'. 
+ """ + from urllib.parse import urlparse + + parsed_url = urlparse(url) + path_parts = parsed_url.path.strip("/").split("/") + + if len(path_parts) < 2: + raise ValueError("Invalid GitHub URL") + + repo_owner = path_parts[0] + repo_name = path_parts[1] + branch = "main" # Default branch + + if len(path_parts) > 3 and path_parts[2] == "tree": + branch = path_parts[3] + + return repo_owner, repo_name, branch + + +# Usage example +if __name__ == "__main__": + # Set up argparse to get username and github_url as arguments + parser = argparse.ArgumentParser(description="Read GitHub repository contents.") + parser.add_argument( + "--github_url", type=str, help="GitHub repository URL", required=True + ) + + args = parser.parse_args() + github_url = args.github_url + + reader = GithubReader() # Initialize the GithubReader + owner, name, branch = GithubReader.parse_github_url(github_url) + print(f"Owner: {owner}, Repo: {name}, Branch: {branch}") + repo_contents = reader.get_repo_contents(github_url) + + # The repo_contents dictionary now contains the contents of all files in the repository + for file_path, file_content in repo_contents.items(): + if file_content is None: + continue + print(f"File: {file_path}") + print(file_content[:20]) + print("---") diff --git a/edubotics_core/dataloader/repo_readers/helpers.py b/edubotics_core/dataloader/repo_readers/helpers.py new file mode 100644 index 0000000..0fa4a93 --- /dev/null +++ b/edubotics_core/dataloader/repo_readers/helpers.py @@ -0,0 +1,98 @@ +import os +import nbformat +import requests +import argparse +from langchain_text_splitters import MarkdownHeaderTextSplitter + + +def read_notebook_from_url(notebook_url): + """ + Read the contents of a Jupyter notebook from a URL. + + Args: + notebook_url (str): The URL of the Jupyter notebook file. + + Returns: + str: The contents of the Jupyter notebook. 
+ """ + response = requests.get(notebook_url, timeout=60) + if response.status_code == 200: + notebook_content = response.text + return notebook_content + else: + print(f"Failed to fetch notebook from URL: {response.status_code}") + return None + + +def read_notebook_from_file(notebook_path, headers_to_split_on): + """ + Read the contents of a Jupyter notebook from a file. + + Args: + notebook_path (str): The path to the Jupyter notebook file. + + Returns: + str: The contents of the Jupyter notebook. + """ + if not os.path.exists(notebook_path): + print(f"File {notebook_path} does not exist. Using filepath as URL instead.") + notebook_content = read_notebook_from_url(notebook_path) + else: + with open(notebook_path, "r") as file: + notebook_content = file.read() + return extract_notebook_content(notebook_content, headers_to_split_on) + + +def extract_notebook_content( + notebook_content, + headers_to_split_on=[("###", "Section"), ("##", "Subsection"), ("#", "Title")], +): + """ + Extract the content from a Jupyter notebook, preserving the order of the cells. + + Args: + notebook_content (str): The contents of the Jupyter notebook. + headers_to_split_on (list): A list of headers to split the notebook content by. Default is [("###", "Section"), ("##", "Subsection"), ("#", "Title")]. + + Returns: + List[Document]: The contents of the notebook, split by the headers_to_split_on. 
+ """ + notebook = nbformat.reads(notebook_content, as_version=4) + content = "" + for cell in notebook.cells: + if cell.cell_type == "markdown": + content += cell.source + "\n" + elif cell.cell_type == "code": + content += "```python\n" + cell.source + "\n```\n" + elif cell.cell_type == "raw": + content += cell.source + "\n" + + markdown_splitter = MarkdownHeaderTextSplitter( + headers_to_split_on=headers_to_split_on, strip_headers=False + ) + + chunks = markdown_splitter.split_text(content) + return chunks + + +if __name__ == "__main__": + # Initialize argument parser + parser = argparse.ArgumentParser( + description="Read and print notebook content from a file." + ) + + # Add notebook_path as an argument + parser.add_argument( + "--notebook_path", + type=str, + help="The path to the Jupyter notebook file (.ipynb) to read.", + ) + + # Parse arguments + args = parser.parse_args() + + # Read the notebook path from args + notebook_content = read_notebook_from_file(args.notebook_path) + for doc in notebook_content: + print(doc) + print("---") diff --git a/edubotics_core/dataloader/webpage_crawler.py b/edubotics_core/dataloader/webpage_crawler.py index 5ecb3c6..519bce1 100644 --- a/edubotics_core/dataloader/webpage_crawler.py +++ b/edubotics_core/dataloader/webpage_crawler.py @@ -3,7 +3,7 @@ import asyncio import requests from bs4 import BeautifulSoup -from urllib.parse import urljoin, urldefrag +from urllib.parse import urljoin, urldefrag, urlparse from edubotics_core.config.constants import TIMEOUT @@ -19,6 +19,8 @@ async def fetch(self, session: ClientSession, url: str) -> str: return await response.text(encoding="latin1") def url_exists(self, url: str) -> bool: + if url.startswith("mailto:"): + return False try: response = requests.head(url, timeout=TIMEOUT) return response.status_code == 200 @@ -26,6 +28,11 @@ def url_exists(self, url: str) -> bool: return False async def get_links(self, session: ClientSession, website_link: str, base_url: str): + if not 
website_link.startswith(base_url): + return [] + elif website_link.startswith("mailto:"): + return [] + html_data = await self.fetch(session, website_link) soup = BeautifulSoup(html_data, "html.parser") list_links = [] @@ -35,7 +42,7 @@ async def get_links(self, session: ClientSession, website_link: str, base_url: s normalized_url = self.normalize_url(full_url) # sections removed if ( normalized_url not in self.dict_href_links - and self.is_child_url(normalized_url, base_url) + # and self.is_child_url(normalized_url, base_url) and self.url_exists(normalized_url) ): self.dict_href_links[normalized_url] = None @@ -89,12 +96,16 @@ async def get_all_pages(self, url: str, base_url: str): return checked_urls def is_webpage(self, url: str) -> bool: - try: - response = requests.head(url, allow_redirects=True, timeout=TIMEOUT) - content_type = response.headers.get("Content-Type", "").lower() - return "text/html" in content_type - except requests.RequestException: + + if url.endswith(".ipynb") or url.endswith(".pdf"): return False + else: + try: + response = requests.head(url, allow_redirects=True, timeout=TIMEOUT) + content_type = response.headers.get("Content-Type", "").lower() + return "text/html" in content_type + except requests.RequestException: + return False def clean_url_list(self, urls): files, webpages = [], [] @@ -112,5 +123,42 @@ def is_child_url(self, url, base_url): def normalize_url(self, url: str): # Strip the fragment identifier + if url.startswith("url: "): + url = url[5:] defragged_url, _ = urldefrag(url) return defragged_url + + async def find_target_url(self, base_url: str, target_url: str, depth: int) -> str: + async with aiohttp.ClientSession() as session: + visited = set() # To keep track of visited URLs + return await self._search_links( + session, base_url, target_url, visited, depth + ) + + async def _search_links( + self, + session: ClientSession, + current_url: str, + target_url: str, + visited: set, + depth: int, + ) -> str: + if 
current_url.startswith("mailto:"): + return None + if current_url in visited or depth < 0: + return None + visited.add(current_url) + + base_url = urlparse(current_url).netloc + print(f"base_url: {base_url}") + links = await self.get_links(session, current_url, base_url) + for link in links: + if link == target_url: + return link + found_url = await self._search_links( + session, link, target_url, visited, depth - 1 + ) + if found_url: + return found_url + + return None diff --git a/edubotics_core/vectorstore/store_manager.py b/edubotics_core/vectorstore/store_manager.py index 54d4479..178a581 100644 --- a/edubotics_core/vectorstore/store_manager.py +++ b/edubotics_core/vectorstore/store_manager.py @@ -53,8 +53,10 @@ def load_files(self): files = [ os.path.join(self.config["vectorstore"]["data_path"], file) for file in files + if file != "urls.txt" ] - urls = get_urls_from_file(self.config["vectorstore"]["url_file_path"]) + url_file_path = self.config["vectorstore"]["url_file_path"] + urls = get_urls_from_file(url_file_path) if self.config["vectorstore"]["expand_urls"]: all_urls = [] for url in urls: @@ -103,12 +105,15 @@ def create_database(self): start_time = time.time() # Start time for creating database data_loader = DataLoader(self.config, self.logger) self.logger.info("Loading data") - files, urls = self.load_files() + local_files, urls = self.load_files() + # print(f"Local files: {local_files}") + # print(f"URLs: {urls}") files, webpages = self.webpage_crawler.clean_url_list(urls) + files.extend(local_files) self.logger.info(f"Number of files: {len(files)}") self.logger.info(f"Number of webpages: {len(webpages)}") if f"{self.config['vectorstore']['url_file_path']}" in files: - files.remove(f"{self.config['vectorstores']['url_file_path']}") # cleanup + files.remove(f"{self.config['vectorstore']['url_file_path']}") # cleanup ( document_chunks, document_names, @@ -166,15 +171,19 @@ def __len__(self): def main(): # Add argument parsing for config files + CWD = 
os.getcwd() parser = argparse.ArgumentParser(description="Load configuration files.") parser.add_argument( - "--config_file", type=str, help="Path to the main config file", required=True + "--config_file", + type=str, + help="Path to the main config file", + default=os.path.join(CWD, "config/config.yml"), ) parser.add_argument( "--project_config_file", type=str, help="Path to the project config file", - required=True, + default=os.path.join(CWD, "config/project_config.yml"), ) args = parser.parse_args() diff --git a/eval_code/true_lens.ipynb b/eval_code/true_lens.ipynb index 6b0cc71..f13da75 100644 --- a/eval_code/true_lens.ipynb +++ b/eval_code/true_lens.ipynb @@ -655,7 +655,7 @@ " },\n", " )\n", "\n", - " records.append(rec)\n" + " records.append(rec)" ] }, { @@ -690,23 +690,20 @@ "\n", "# Question/statement relevance between question and each context chunk.\n", "f_context_relevance = (\n", - " Feedback(provider.context_relevance_with_cot_reasons)\n", - " .on_input()\n", - " .on(context)\n", + " Feedback(provider.context_relevance_with_cot_reasons).on_input().on(context)\n", ")\n", "\n", "# Define a groundedness feedback function\n", "f_groundedness = (\n", - " Feedback(provider.groundedness_measure_with_cot_reasons, name = \"Groundedness\")\n", + " Feedback(provider.groundedness_measure_with_cot_reasons, name=\"Groundedness\")\n", " .on(context.collect())\n", " .on_output()\n", ")\n", "\n", "# Question/answer relevance between overall question and answer.\n", - "f_qa_relevance = (\n", - " Feedback(provider.relevance_with_cot_reasons, name = \"Answer Relevance\")\n", - " .on_input_output()\n", - ")" + "f_qa_relevance = Feedback(\n", + " provider.relevance_with_cot_reasons, name=\"Answer Relevance\"\n", + ").on_input_output()" ] }, { @@ -730,7 +727,7 @@ " app_id=\"AI Tutor - GPT3 - FAISS\",\n", " app=virtual_app,\n", " feedbacks=[f_context_relevance, f_groundedness, f_qa_relevance],\n", - " feedback_mode = \"deferred\" # optional\n", + " 
feedback_mode=\"deferred\", # optional\n", ")" ] }, @@ -793,6 +790,7 @@ ], "source": [ "from trulens_eval import Tru\n", + "\n", "tru = Tru()\n", "\n", "tru.run_dashboard(force=True)" diff --git a/requirements.txt b/requirements.txt index f6dd818..616b140 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,37 +1,322 @@ -aiohttp -beautifulsoup4 -chainlit -langchain -langchain-community -langchain-core -literalai -llama-parse -numpy -pandas -pysrt -python-dotenv -PyYAML -RAGatouille -requests -scikit-learn -torch -tqdm -transformers -trulens_eval -umap-learn -llama-cpp-python -pymupdf -websockets -langchain-openai -langchain-experimental -html2text -PyPDF2 -pdf2image -black -flake8 -bandit -fastapi -google-auth -google-auth-oauthlib -Jinja2 -cryptography +aiofiles==23.2.1 +aiohappyeyeballs==2.4.0 +aiohttp==3.10.5 +aiosignal==1.3.1 +alembic==1.13.2 +altair==5.4.1 +annotated-types==0.7.0 +anyio==4.4.0 +appnope==0.1.4 +argon2-cffi==23.1.0 +argon2-cffi-bindings==21.2.0 +arrow==1.3.0 +asttokens==2.4.1 +async-lru==2.0.4 +asyncer==0.0.7 +attrs==24.2.0 +babel==2.16.0 +backports.tarfile==1.2.0 +bandit==1.7.9 +beautifulsoup4==4.12.3 +bidict==0.23.1 +bitarray==2.9.2 +black==24.8.0 +bleach==6.1.0 +blinker==1.8.2 +cachetools==5.5.0 +catalogue==2.0.10 +certifi==2024.7.4 +cffi==1.17.1 +chainlit==1.2.0 +chardet==5.2.0 +charset-normalizer==3.3.2 +chevron==0.14.0 +click==8.1.7 +colbert-ai==0.2.19 +comm==0.2.2 +contourpy==1.3.0 +cryptography==43.0.1 +cycler==0.12.1 +dataclasses-json==0.6.7 +datasets==2.21.0 +debugpy==1.8.5 +decorator==5.1.1 +defusedxml==0.7.1 +Deprecated==1.2.14 +dill==0.3.8 +dirtyjson==1.0.8 +diskcache==5.6.3 +distro==1.9.0 +docutils==0.21.2 +entrypoints==0.4 +executing==2.1.0 +faiss-cpu==1.8.0.post1 +Faker==28.4.1 +fast-pytorch-kmeans==0.2.0.1 +fastapi==0.110.3 +fastjsonschema==2.20.0 +favicon==0.7.0 +filelock==3.16.0 +filetype==1.2.0 +flake8==7.1.1 +Flask==3.0.3 +fonttools==4.53.1 +fqdn==1.5.1 +frozenlist==1.4.1 +fsspec==2024.6.1 +git-python==1.0.3 
+gitdb==4.0.11 +GitPython==3.1.43 +google-auth==2.34.0 +google-auth-oauthlib==1.2.1 +googleapis-common-protos==1.65.0 +greenlet==3.0.3 +grpcio==1.66.1 +h11==0.14.0 +htbuilder==0.6.2 +html2text==2024.2.26 +httpcore==1.0.5 +httpx==0.27.2 +huggingface-hub==0.24.6 +idna==3.8 +importlib_metadata==8.4.0 +ipykernel==6.29.5 +ipython==8.27.0 +ipywidgets==8.1.5 +isoduration==20.11.0 +itsdangerous==2.2.0 +jaraco.classes==3.4.0 +jaraco.context==6.0.1 +jaraco.functools==4.0.2 +jedi==0.19.1 +Jinja2==3.1.4 +jiter==0.5.0 +joblib==1.4.2 +json5==0.9.25 +jsonpatch==1.33 +jsonpointer==3.0.0 +jsonschema==4.23.0 +jsonschema-specifications==2023.12.1 +jupyter==1.1.1 +jupyter-console==6.6.3 +jupyter-events==0.10.0 +jupyter-lsp==2.2.5 +jupyter_client==8.6.2 +jupyter_core==5.7.2 +jupyter_server==2.14.2 +jupyter_server_terminals==0.5.3 +jupyterlab==4.2.5 +jupyterlab_pygments==0.3.0 +jupyterlab_server==2.27.3 +jupyterlab_widgets==3.0.13 +keyring==25.3.0 +kiwisolver==1.4.7 +langchain==0.2.16 +langchain-community==0.2.16 +langchain-core==0.2.38 +langchain-experimental==0.0.65 +langchain-openai==0.1.23 +langchain-text-splitters==0.2.4 +langsmith==0.1.116 +Lazify==0.4.0 +literalai==0.0.607 +llama-cloud==0.0.17 +llama-index==0.11.7 +llama-index-agent-openai==0.3.1 +llama-index-cli==0.3.0 +llama-index-core==0.11.7 +llama-index-embeddings-openai==0.2.4 +llama-index-indices-managed-llama-cloud==0.3.0 +llama-index-legacy==0.9.48.post3 +llama-index-llms-openai==0.2.3 +llama-index-multi-modal-llms-openai==0.2.0 +llama-index-program-openai==0.2.0 +llama-index-question-gen-openai==0.2.0 +llama-index-readers-file==0.2.1 +llama-index-readers-llama-parse==0.3.0 +llama-parse==0.5.2 +llama_cpp_python==0.2.90 +llvmlite==0.43.0 +lxml==5.3.0 +Mako==1.3.5 +Markdown==3.7 +markdown-it-py==3.0.0 +markdownlit==0.0.7 +MarkupSafe==2.1.5 +marshmallow==3.22.0 +matplotlib==3.9.2 +matplotlib-inline==0.1.7 +mccabe==0.7.0 +mdurl==0.1.2 +mistune==3.0.2 +more-itertools==10.4.0 +mpmath==1.3.0 +multidict==6.0.5 
+multiprocess==0.70.16 +munch==2.5.0 +mypy-extensions==1.0.0 +narwhals==1.6.4 +nbclient==0.10.0 +nbconvert==7.16.4 +nbformat==5.10.4 +nest-asyncio==1.6.0 +networkx==3.3 +nh3==0.2.18 +ninja==1.11.1.1 +nltk==3.9.1 +notebook==7.2.2 +notebook_shim==0.2.4 +numba==0.60.0 +numpy==1.26.4 +oauthlib==3.2.2 +onnx==1.16.2 +openai==1.44.0 +opentelemetry-api==1.27.0 +opentelemetry-exporter-otlp==1.27.0 +opentelemetry-exporter-otlp-proto-common==1.27.0 +opentelemetry-exporter-otlp-proto-grpc==1.27.0 +opentelemetry-exporter-otlp-proto-http==1.27.0 +opentelemetry-instrumentation==0.48b0 +opentelemetry-proto==1.27.0 +opentelemetry-sdk==1.27.0 +opentelemetry-semantic-conventions==0.48b0 +orjson==3.10.7 +overrides==7.7.0 +packaging==23.2 +pandas==2.2.2 +pandocfilters==1.5.1 +parso==0.8.4 +pathspec==0.12.1 +pbr==6.1.0 +pdf2image==1.17.0 +pexpect==4.9.0 +pillow==10.4.0 +pkginfo==1.10.0 +platformdirs==4.3.1 +plotly==5.24.0 +prometheus_client==0.20.0 +prompt_toolkit==3.0.47 +protobuf==4.25.4 +psutil==5.9.8 +ptyprocess==0.7.0 +pure_eval==0.2.3 +pyarrow==17.0.0 +pyasn1==0.6.0 +pyasn1_modules==0.4.0 +pycodestyle==2.12.1 +pycparser==2.22 +pydantic==2.9.1 +pydantic_core==2.23.3 +pydeck==0.9.1 +pyflakes==3.2.0 +Pygments==2.18.0 +PyJWT==2.9.0 +pymdown-extensions==10.9 +PyMuPDF==1.24.10 +PyMuPDFb==1.24.10 +pynndescent==0.5.13 +pynvml==11.5.3 +pyparsing==3.1.4 +pypdf==4.3.1 +PyPDF2==3.0.1 +pysrt==1.1.2 +python-dateutil==2.9.0.post0 +python-decouple==3.8 +python-dotenv==1.0.1 +python-engineio==4.9.1 +python-json-logger==2.0.7 +python-multipart==0.0.9 +python-socketio==5.11.4 +pytz==2024.1 +PyYAML==6.0.2 +pyzmq==26.2.0 +RAGatouille==0.0.8.post4 +readme_renderer==44.0 +referencing==0.35.1 +regex==2024.7.24 +requests==2.32.3 +requests-oauthlib==2.0.0 +requests-toolbelt==1.0.0 +rfc3339-validator==0.1.4 +rfc3986==2.0.0 +rfc3986-validator==0.1.1 +rich==13.8.0 +rpds-py==0.20.0 +rsa==4.9 +safetensors==0.4.5 +scikit-learn==1.5.1 +scipy==1.14.1 +Send2Trash==1.8.3 +sentence-transformers==2.7.0 
+simple-websocket==1.0.0 +six==1.16.0 +smmap==5.0.1 +sniffio==1.3.1 +soupsieve==2.6 +SQLAlchemy==2.0.34 +srsly==2.4.8 +st-annotated-text==4.0.1 +st-theme==1.2.3 +stack-data==0.6.3 +starlette==0.37.2 +stevedore==5.3.0 +streamlit==1.38.0 +streamlit-aggrid==0.3.4 +streamlit-camera-input-live==0.2.0 +streamlit-card==1.0.2 +streamlit-embedcode==0.1.2 +streamlit-extras==0.4.7 +streamlit-faker==0.0.3 +streamlit-image-coordinates==0.1.9 +streamlit-keyup==0.2.4 +streamlit-pills==0.3.0 +streamlit-toggle-switch==1.0.2 +streamlit-vertical-slider==2.5.5 +striprtf==0.0.26 +sympy==1.13.2 +syncer==2.0.3 +tenacity==8.5.0 +terminado==0.18.1 +threadpoolctl==3.5.0 +tiktoken==0.7.0 +tinycss2==1.3.0 +tokenize-rt==6.0.0 +tokenizers==0.19.1 +toml==0.10.2 +tomli==2.0.1 +torch==2.4.1 +tornado==6.4.1 +tqdm==4.66.5 +traitlets==5.14.3 +transformers==4.44.2 +trulens==1.0.1 +trulens-core==1.0.1 +trulens-dashboard==1.0.1 +trulens-feedback==1.0.1 +trulens_eval==1.0.1 +twine==5.1.1 +types-python-dateutil==2.9.0.20240906 +typing-inspect==0.9.0 +typing_extensions==4.12.2 +tzdata==2024.1 +ujson==5.10.0 +umap-learn==0.5.6 +uptrace==1.26.0 +uri-template==1.3.0 +urllib3==2.2.2 +uvicorn==0.25.0 +validators==0.34.0 +voyager==2.0.9 +watchfiles==0.20.0 +wcwidth==0.2.13 +webcolors==24.8.0 +webencodings==0.5.1 +websocket-client==1.8.0 +websockets==13.0.1 +Werkzeug==3.0.4 +widgetsnbextension==4.0.13 +wrapt==1.16.0 +wsproto==1.2.0 +xxhash==3.5.0 +yarl==1.11.0 +zipp==3.20.1 diff --git a/setup.py b/setup.py index 8feaa85..617cf80 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,5 @@ from setuptools import setup, find_packages +import os # Read the contents of requirements.txt with open("requirements.txt") as f: @@ -7,9 +8,21 @@ with open("README.md") as f: readme = f.read() +# Tag is in the format v0.1.0, so we need to remove the v +git_tag = os.environ.get("GITHUB_REF_NAME", "") + +if git_tag.startswith("v"): + version = git_tag[1:] +else: + version = "0.0.0" # Fall back to 0.0.0 if we can't find the tag + +if not 
version: + print("No version found, defaulting to 0.0.0") + version = "0.0.0" + setup( name="edubotics-core", - version="0.1.0", + version=version, packages=find_packages(), package_dir={"edubotics-core": "edubotics_core"}, python_requires=">=3.7",