-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathapp.py
More file actions
117 lines (95 loc) · 4.54 KB
/
Copy pathapp.py
File metadata and controls
117 lines (95 loc) · 4.54 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import streamlit as st
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.retrievers import BM25Retriever
from langchain_groq import ChatGroq
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain, create_history_aware_retriever
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.messages import HumanMessage, AIMessage
from pathlib import Path
PDF_FOLDER = "./data"  # folder holding the source PDFs. NOTE(review): not referenced by the visible code; get_bm25_retriever hard-codes the same path as its default.
def load_pdf(folder_path):
    """Load every PDF directly inside ``folder_path`` into LangChain documents.

    Args:
        folder_path: Directory to scan. Only top-level entries matching
            ``*.pdf`` are read; subdirectories are not searched.

    Returns:
        A flat list containing whatever ``PyPDFLoader(...).load()`` returns
        for each file, concatenated in file-name order. The list is empty
        when no entry matches.
    """
    all_documents = []
    # glob() gives no ordering guarantee, so sort the paths to keep the
    # document order (and therefore the chunk order) reproducible.
    for pdf in sorted(Path(folder_path).glob("*.pdf")):
        all_documents.extend(PyPDFLoader(str(pdf)).load())
    return all_documents
def chunk_documents(documents):
    """Split ``documents`` into overlapping chunks for retrieval.

    Args:
        documents: Documents to split, as accepted by
            ``RecursiveCharacterTextSplitter.split_documents``.

    Returns:
        The chunks produced by the splitter, configured with a chunk size
        of 500 and an overlap of 50.
    """
    # Separators in the order they are handed to the splitter: blank line,
    # newline, full stop, then a single space.
    split_points = ["\n\n", "\n", ".", " "]
    text_splitter = RecursiveCharacterTextSplitter(
        separators=split_points,
        chunk_overlap=50,
        chunk_size=500,
    )
    pieces = text_splitter.split_documents(documents)
    return pieces
def get_bm25_retriever(folder_path="./data"):
    """Build a BM25 keyword retriever over the PDFs in ``folder_path``.

    The PDFs are loaded with ``load_pdf``, split with ``chunk_documents``
    and indexed with ``BM25Retriever.from_documents``.

    Args:
        folder_path: Directory containing the PDF files. Defaults to
            ``"./data"``.

    Returns:
        A ``BM25Retriever`` with ``k`` set to 3.

    Raises:
        ValueError: If the folder produced no chunks (no PDF files, or no
            text could be extracted from them).
    """
    docs = load_pdf(folder_path)
    chunks = chunk_documents(docs)
    if not chunks:
        # Fail here with an actionable message instead of letting the empty
        # corpus reach the BM25 index build, where the error is opaque.
        raise ValueError(
            f"No text chunks could be built from PDFs in {folder_path!r}; "
            "add at least one readable PDF to that folder."
        )
    retriever = BM25Retriever.from_documents(chunks)
    retriever.k = 3  # return top 3 chunks
    return retriever
def build_chain(retriever):
    """Assemble the history-aware retrieval chain used to answer questions.

    The chain has two LLM-backed stages: the first rewrites the incoming
    question as a standalone question using the chat history and passes it
    to ``retriever``; the second answers from the retrieved context.

    Args:
        retriever: Retriever handed to ``create_history_aware_retriever``.

    Returns:
        The chain built by ``create_retrieval_chain``. It is invoked with a
        dict holding ``"input"`` and ``"chat_history"``.
    """
    import os

    # Read the key from the environment so a real secret never has to be
    # committed. The original placeholder is kept as the fallback so the
    # behaviour is unchanged when GROQ_API_KEY is unset; the placeholder is
    # not a real key.
    groq_api_key = os.environ.get("GROQ_API_KEY", "your_api_key")
    llm = ChatGroq(
        api_key=groq_api_key,
        model_name="llama-3.3-70b-versatile",
        temperature=0.2,
    )

    # Stage 1: turn a follow-up question into a standalone one.
    contextualize_prompt = ChatPromptTemplate.from_messages([
        ("system", "Reformulate the question as standalone given the chat history."),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ])
    history_aware_retriever = create_history_aware_retriever(
        llm, retriever, contextualize_prompt
    )

    # Stage 2: answer strictly from the retrieved context.
    answer_prompt = ChatPromptTemplate.from_messages([
        (
            "system",
            "You are a document assistant. Answer ONLY from context.\n"
            "If not in context, say 'I can only answer about the uploaded documents.'\n"
            "Context: {context}",
        ),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ])
    document_chain = create_stuff_documents_chain(llm, answer_prompt)
    return create_retrieval_chain(history_aware_retriever, document_chain)
# ── Streamlit UI ──────────────────────────────────────────
st.set_page_config(page_title="Vectorless RAG Chatbot", page_icon="🤖")
st.title("🤖 RAG Chatbot")
st.caption("Ask questions about your PDF!")

# ── Session state — persists across reruns ────────────────
# "chat_history" holds the LangChain messages passed to the chain;
# "messages" holds the role/content dicts rendered as chat bubbles.
for state_key in ("chat_history", "messages"):
    if state_key not in st.session_state:
        st.session_state[state_key] = []

# Build the retriever and chain only when this session has none yet.
if "chain" not in st.session_state:
    with st.spinner("Loading knowledge base..."):
        bm25_retriever = get_bm25_retriever()  # keyword index, no vector store
        st.session_state.chain = build_chain(bm25_retriever)
# ── Display chat history as bubbles ──────────────────────
# Replay every stored message so the conversation is redrawn on each run.
for entry in st.session_state.messages:
    bubble = st.chat_message(entry["role"])
    with bubble:
        st.write(entry["content"])
# ── Chat input ────────────────────────────────────────────
if question := st.chat_input("Ask a question about your PDF..."):
    # Show user message
    with st.chat_message("user"):
        st.write(question)
    st.session_state.messages.append({"role": "user", "content": question})

    # Get answer
    with st.chat_message("assistant"):
        with st.spinner("Thinking..."):
            result = st.session_state.chain.invoke({
                "input": question,
                "chat_history": st.session_state.chat_history,
            })
            answer = result["answer"]
        st.write(answer)

        # Show sources in expander
        with st.expander("📄 Sources"):
            for i, doc in enumerate(result["context"], start=1):
                # PyPDFLoader stores a zero-based page index; shift it so the
                # label matches the page number a reader sees in the PDF.
                # Non-integer or missing values are shown unchanged / as "?".
                page = doc.metadata.get("page", "?")
                if isinstance(page, int):
                    page += 1
                st.markdown(f"**Chunk {i} (page {page}):**")
                st.caption(doc.page_content[:200])

    # Save to session: the UI list drives the bubbles, the LangChain list
    # is passed back to the chain as chat history on the next question.
    st.session_state.messages.append({"role": "assistant", "content": answer})
    st.session_state.chat_history.append(HumanMessage(content=question))
    st.session_state.chat_history.append(AIMessage(content=answer))