Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 62 additions & 6 deletions backend/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,10 @@
import logging
import os
import re
import sqlite3
from typing import Optional
import requests
import json

import weaviate
from bs4 import BeautifulSoup, SoupStrainer
Expand All @@ -15,6 +18,7 @@
from backend.constants import WEAVIATE_GENERAL_GUIDES_AND_TUTORIALS_INDEX_NAME
from backend.embeddings import get_embeddings_model
from backend.parser import langchain_docs_extractor
from backend.utils import parse_openapi_spec

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -56,12 +60,10 @@ def simple_extractor(html: str | BeautifulSoup) -> str:
)
return re.sub(r"\n\n+", "\n\n", soup.text).strip()


#########################
# General Guides and Tutorials
#########################


# NOTE: To be deprecated once LangChain docs are migrated to new site.
def load_langchain_python_docs():
return SitemapLoader(
Expand All @@ -77,7 +79,6 @@ def load_langchain_python_docs():
meta_function=metadata_extractor,
).load()


# NOTE: To be deprecated once LangChain docs are migrated to new site.
def load_langchain_js_docs():
return SitemapLoader(
Expand All @@ -93,7 +94,6 @@ def load_langchain_js_docs():
filter_urls=["https://js.langchain.com/docs/"],
).load()


def load_aggregated_docs_site():
return SitemapLoader(
"https://docs.langchain.com/sitemap.xml",
Expand All @@ -114,7 +114,6 @@ def ingest_general_guides_and_tutorials():
aggregated_site_docs = load_aggregated_docs_site()
return langchain_python_docs + langchain_js_docs + aggregated_site_docs


def ingest_docs():
text_splitter = RecursiveCharacterTextSplitter(chunk_size=4000, chunk_overlap=200)
embedding = get_embeddings_model()
Expand Down Expand Up @@ -173,6 +172,63 @@ def ingest_docs():
f"General Guides and Tutorials now has this many vectors: {num_vecs}",
)

#########################
# API and SDK Docs
#########################

# Location of the SQLite database holding the API/SDK reference docs.
# The path is relative, so it resolves against the process's working directory.
SQLITE_DB_PATH: str = "api_sdk_docs.db"

def create_sqlite_db():
    """Open the API/SDK docs database and ensure the ``docs`` FTS5 table exists.

    Returns:
        An open ``sqlite3.Connection`` to ``SQLITE_DB_PATH``. The caller owns
        the connection and is responsible for closing it.
    """
    schema_sql = """
    CREATE VIRTUAL TABLE IF NOT EXISTS docs USING fts5(
    domain,
    title,
    url,
    content
    );
    """
    connection = sqlite3.connect(SQLITE_DB_PATH)
    schema_cursor = connection.cursor()
    schema_cursor.execute(schema_sql)
    connection.commit()
    return connection

def load_langsmith_api_docs():
    """Download the LangSmith OpenAPI spec and convert it into doc records.

    Returns:
        Whatever ``parse_openapi_spec`` produces for the spec, tagged with the
        ``langsmith_api`` domain and linked against the ReDoc base URL.

    Raises:
        requests.RequestException: If the request times out, fails, or the
            server answers with an HTTP error status.
    """
    url = "https://api.smith.langchain.com/openapi.json"
    # Bound the request so a stalled connection cannot hang ingestion forever.
    response = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as a spec.
    response.raise_for_status()
    spec = response.json()
    return parse_openapi_spec(
        spec,
        base_url="https://api.smith.langchain.com/redoc",
        domain="langsmith_api",
    )

def load_langgraph_platform_api_docs():
    """Load the LangGraph Platform OpenAPI spec from disk and convert it into doc records.

    The spec is read from ``docs/langgraph-platform.json``, a relative path
    that resolves against the current working directory.

    Returns:
        Whatever ``parse_openapi_spec`` produces for the spec, tagged with the
        ``langgraph_platform_api`` domain.

    Raises:
        OSError: If the spec file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON is UTF-8 by specification; do not depend on the platform's default encoding.
    with open("docs/langgraph-platform.json", "r", encoding="utf-8") as f:
        spec = json.load(f)
    return parse_openapi_spec(
        spec,
        base_url="https://langchain-ai.github.io/langgraph/cloud/reference/api/api_ref.html",
        domain="langgraph_platform_api",
    )

def ingest_sdk_and_api_docs():
    """Rebuild the LangSmith and LangGraph Platform API rows in the SQLite index.

    Existing rows for both domains are deleted and replaced with freshly
    loaded docs. The delete and insert share one transaction, so a failure
    part-way through leaves the previous contents in place.

    Returns:
        The list of doc records that were inserted.
    """
    conn = create_sqlite_db()
    try:
        docs = []
        docs.extend(load_langsmith_api_docs())
        docs.extend(load_langgraph_platform_api_docs())

        cur = conn.cursor()
        cur.execute("DELETE FROM docs WHERE domain = 'langsmith_api'")
        cur.execute("DELETE FROM docs WHERE domain = 'langgraph_platform_api'")

        cur.executemany(
            """
            INSERT INTO docs (domain, title, url, content)
            VALUES (?, ?, ?, ?)
            """,
            [
                (doc["domain"], doc["title"], doc["url"], doc["content"])
                for doc in docs
            ],
        )
        conn.commit()
    finally:
        # Always release the connection; closing without a commit discards
        # any partially applied delete/insert work.
        conn.close()

    logger.info("Inserted %d documents into SQLite database", len(docs))
    return docs

#########################
# Main
#########################

if __name__ == "__main__":
    # NOTE(review): this view shows both an active and a commented-out
    # ingest_docs() call, which look like the removed and added sides of a
    # diff hunk. Confirm which one is intended before running.
    ingest_docs()
    # ingest_docs()
    # Rebuild the SQLite API/SDK reference index.
    ingest_sdk_and_api_docs()
131 changes: 131 additions & 0 deletions backend/retrieval_graph/deepagent/deepagent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
import os
import json
import sqlite3
from typing import Literal
from pydantic import BaseModel
from langchain_core.runnables import RunnableConfig
from typing import Optional
from deepagents import create_deep_agent, SubAgent, DeepAgentState
from backend.retrieval_graph.deepagent.prompts import RAG_TOOL_DESCRIPTION, DEEP_AGENT_DEFAULT_INSTRUCTIONS, LANGSMITH_API_TOOL_DESCRIPTION, LANGGRAPH_PLATFORM_API_TOOL_DESCRIPTION, LIST_API_SDK_ENDPOINTS_TOOL_DESCRIPTION, REACT_AGENT_INSTRUCTIONS, NO_LISTING_TOOL_INSTRUCTIONS
from langchain_core.tools import tool
from backend.retrieval import make_retriever
from langchain_core.documents import Document

# TODO: Make this configurable eventually?
# Module-wide runnable config: selects the embedding model and the retriever
# backend that the tools and the deep agent below are built with.
config = dict(
    configurable=dict(
        embedding_model="openai/text-embedding-3-small",
        retriever_provider="weaviate",
    ),
)

@tool(description=RAG_TOOL_DESCRIPTION)
async def guide_rag_search(query: str) -> dict:
    # Runs `query` through the retriever built from the module-level `config`
    # and returns {"documents": [...]}, one entry per retrieved document with
    # its page content plus the "source" and "title" metadata fields.
    # The return annotation is `dict` because that is what is actually returned.
    # NOTE(review): a document missing "source" or "title" metadata raises
    # KeyError here; confirm every indexed document carries both.
    with make_retriever(config) as retriever:
        response = await retriever.ainvoke(query, config)
        return {
            "documents": [
                {
                    "page_content": doc.page_content,
                    "source": doc.metadata["source"],
                    "title": doc.metadata["title"],
                }
                for doc in response
            ]
        }

@tool(description=LIST_API_SDK_ENDPOINTS_TOOL_DESCRIPTION)
async def list_api_sdk_endpoints(domain: str) -> str:
    # Lists the title of every doc stored for `domain` in the API/SDK SQLite
    # index, formatted as a message with one title per line.
    conn = sqlite3.connect("api_sdk_docs.db")
    try:
        cur = conn.cursor()
        cur.execute("""
            SELECT title
            FROM docs
            WHERE domain = ?
        """, (domain,))
        results = cur.fetchall()
    finally:
        # Release the connection even if the query fails.
        conn.close()
    # fetchall() yields 1-tuples; unpack them so each title renders as plain
    # text rather than as "('title',)".
    all_titles = [f"{title}\n" for (title,) in results]
    return f"There are {len(results)} endpoints or functions for {domain}. Here are the titles that you can search for specifically to get details on an endpoint or function:\n\n{''.join(all_titles)}"

def _search_api_docs_by_title(domain: str, match_string: str) -> dict:
    """Return up to three docs in ``domain`` whose title contains ``match_string``.

    Args:
        domain: Value of the ``domain`` column to restrict the search to.
        match_string: Substring matched against titles with SQL ``LIKE``
            (so ``%`` and ``_`` inside it act as wildcards).

    Returns:
        ``{"results": [...], "count": n}`` with ``title``/``url``/``content``
        per hit, or ``{"message": ...}`` when nothing matches.
    """
    conn = sqlite3.connect("api_sdk_docs.db")
    try:
        cur = conn.cursor()
        cur.execute("""
            SELECT title, url, content
            FROM docs
            WHERE domain = ?
            AND title LIKE ?
            LIMIT 3
        """, (domain, f"%{match_string}%"))
        results = cur.fetchall()
    finally:
        # Release the connection even if the query fails.
        conn.close()

    if not results:
        return {"message": f"No results found for '{match_string}'"}

    return {
        "results": [
            {"title": title, "url": url, "content": content}
            for title, url, content in results
        ],
        "count": len(results),
    }


@tool(description=LANGSMITH_API_TOOL_DESCRIPTION)
async def langsmith_api_search(match_string: str) -> dict:
    # Title search restricted to the LangSmith API reference docs.
    return _search_api_docs_by_title("langsmith_api", match_string)


@tool(description=LANGGRAPH_PLATFORM_API_TOOL_DESCRIPTION)
async def langgraph_platform_api_search(match_string: str) -> dict:
    # Title search restricted to the LangGraph Platform API reference docs.
    return _search_api_docs_by_title("langgraph_platform_api", match_string)


class AgentConfig(BaseModel):
    # Per-run settings read from the "configurable" section of a runnable config.

    # System instructions for the deep agent; falls back to the packaged default.
    instructions: Optional[str] = DEEP_AGENT_DEFAULT_INSTRUCTIONS

    # Sub-agent definitions handed to create_deep_agent; none by default.
    subagents: Optional[list[dict]] = []

class StateSchema(DeepAgentState):
    # Deep-agent state extended with a list of documents.
    documents: list[Document]

# Every tool defined in this module, in the order the agents receive them.
all_tools = [
    guide_rag_search,
    langsmith_api_search,
    langgraph_platform_api_search,
    list_api_sdk_endpoints,
]

from langchain_anthropic import ChatAnthropic

def deep_agent_factory(config: RunnableConfig):
    """Build a deep agent from the ``configurable`` section of ``config``.

    Args:
        config: Runnable config; its ``"configurable"`` mapping (empty if
            absent) is validated through ``AgentConfig``.

    Returns:
        The agent produced by ``create_deep_agent``, wired with the four
        search/listing tools and a recursion limit of 100.
    """
    settings = AgentConfig(**config.get("configurable", {}))
    agent = create_deep_agent(
        tools=[
            guide_rag_search,
            langsmith_api_search,
            langgraph_platform_api_search,
            list_api_sdk_endpoints,
        ],
        instructions=settings.instructions,
        subagents=settings.subagents,
        context_schema=AgentConfig,
    )
    return agent.with_config({"recursion_limit": 100})


# Module-level deep agent built from the default module config.
deep_agent = deep_agent_factory(config)


################
# ReAct Agent
################

from langgraph.prebuilt import create_react_agent

# Tools exposed to the full ReAct agent.
all_tools = [
    guide_rag_search,
    langsmith_api_search,
    langgraph_platform_api_search,
    list_api_sdk_endpoints,
]

# ReAct agent with every tool; the model has the tools pre-bound with an
# ephemeral cache_control setting.
react_agent = create_react_agent(
    model=ChatAnthropic(
        model="claude-sonnet-4-20250514",
        max_tokens=64000,
    ).bind_tools(tools=all_tools, cache_control={"type": "ephemeral"}),
    tools=[
        guide_rag_search,
        langsmith_api_search,
        langgraph_platform_api_search,
        list_api_sdk_endpoints,
    ],
    prompt=REACT_AGENT_INSTRUCTIONS,
)

# Variant without the endpoint-listing tool, using its own instructions.
react_agent_no_listing_tool = create_react_agent(
    model=ChatAnthropic(model="claude-sonnet-4-20250514", max_tokens=64000),
    tools=[
        guide_rag_search,
        langsmith_api_search,
        langgraph_platform_api_search,
    ],
    prompt=NO_LISTING_TOOL_INSTRUCTIONS,
)
Loading
Loading