
Commit 84fc919

committed
MariaDB Knowledge Base Chat
1 parent 801e8bc commit 84fc919

File tree

kb_chat/README.md
kb_chat/chat.py
kb_chat/create_vectorstore.py

3 files changed: 187 additions, 0 deletions


kb_chat/README.md

Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
# MariaDB KB Chat and Vector Store Generator

This script scrapes web pages from the MariaDB Knowledge Base, cleans and processes the content, and then generates a FAISS index with OpenAI embeddings for each document. The vector store is saved as a pickle file and is then used by a chatbot to answer questions about the MariaDB server.
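At its core the generator turns each scraped page into a langchain `Document` (with the page URL kept as metadata), splits the documents into overlapping chunks, embeds them with OpenAI, and pickles the resulting FAISS index. Below is a condensed sketch of that flow, using a hypothetical in-memory document in place of the scraped pages (see `create_vectorstore.py` later in this commit for the full version):

    import pickle
    from langchain.docstore.document import Document
    from langchain.embeddings.openai import OpenAIEmbeddings
    from langchain.text_splitter import CharacterTextSplitter
    from langchain.vectorstores import FAISS

    # One Document per KB page; the URL is stored as metadata so the bot can cite it.
    docs = [Document(page_content="CREATE TABLE creates a new table ...",
                     metadata={"source": "https://mariadb.com/kb/en/create-table/"})]

    # Split into overlapping chunks, embed them, and build the FAISS index.
    splitter = CharacterTextSplitter(separator=" ", chunk_size=4000, chunk_overlap=200)
    chunks = splitter.split_documents(docs)
    faiss_index = FAISS.from_documents(chunks, OpenAIEmbeddings())

    with open("vectorstore.pkl", "wb") as f:
        pickle.dump(faiss_index, f)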
## Requirements

Install the required packages with the following command:

    pip install beautifulsoup4 python-dotenv faiss-cpu langchain openai requests numpy streamlit
## Setup

1. Download the MariaDB KB CSV file from https://github.com/Icerath/mariadb_kb_server/blob/main/kb_urls.csv
2. Create a `.env` file in the same directory as the script.
3. Add your OpenAI API key to the `.env` file as follows:

        OPENAI_API_KEY=your_api_key_here
19+
## Preprocessing

Run the script with the following command:

    python create_vectorstore.py --csv-file kb_urls.csv --tmp-dir tmp --vectorstore-path vectorstore.pkl --chunk-size 4000 --chunk-overlap 200

This will create a file `vectorstore.pkl`, which is used to answer questions.
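To sanity-check the result (a minimal sketch, assuming the default `vectorstore.pkl` name and the same Python environment the index was built in), load the pickle and query it directly:

    import pickle

    # Load the FAISS index generated by create_vectorstore.py.
    with open("vectorstore.pkl", "rb") as f:
        faiss_index = pickle.load(f)

    # Print the score and source URL of the four most similar chunks.
    for doc, score in faiss_index.similarity_search_with_score("How do I create a table?", k=4):
        print(score, doc.metadata["source"])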
## Run chat

    streamlit run chat.py

You now have a self-hosted version of the chat over the MariaDB KB.

kb_chat/chat.py

Lines changed: 70 additions & 0 deletions
@@ -0,0 +1,70 @@
import streamlit as st
import pickle
from langchain.vectorstores import FAISS
from dotenv import load_dotenv
import openai
import os

load_dotenv()

openai.api_key = os.getenv("OPENAI_API_KEY")

def gen_prompts(content, question):
    system_msg_content = "You are a question answering expert about MariaDB. You only respond based on the facts that are given to you and ignore your prior knowledge."
    user_msg_content = f"{content}\n---\n\nGiven the above content about MariaDB along with the URL of the content, respond to this question {question} and mention the URL as a source. If the question is not about MariaDB and you cannot answer it based on the provided content, politely decline to answer. Simply state that you couldn't find any relevant information instead of going into details. Do not say the phrase 'in the provided content'. If the information I provide contains the word obsolete, emphasize that the response is obsolete. Also, suggest newer MariaDB versions if the question is about versions older than 10.3 and say that the others are no longer maintained. Do not add the URL as a source if you cannot answer based on the provided content. If there are exceptions for a particular MariaDB version, specify the exceptions that apply. Also, if the provided score is lower than 0.2 decline to answer and say you found no relevant information. If the source URL repeats, only use it once."
    system_msg = {"role": "system", "content": system_msg_content}
    user_msg = {"role": "user", "content": user_msg_content}

    return system_msg, user_msg

def process_doc(content, question, model_type="gpt-4", max_tokens=30000):
    # Trim overly long context (a character-based budget, not a true token count).
    if len(content) > max_tokens:
        print('Trimmed')
        content = content[:max_tokens]
    system_msg, user_msg = gen_prompts(content, question)

    try:
        response = openai.ChatCompletion.create(
            model=model_type,
            messages=[system_msg, user_msg],
        )
    except Exception:
        return "Sorry, there was an error. Please try again!"

    return response.choices[0].message['content']

# Load the FAISS index built by create_vectorstore.py.
with open("vectorstore.pkl", "rb") as f:
    faiss_index = pickle.load(f)

def search_similar_docs(question, k=4):
    # Prepend the source URL and similarity score to each retrieved chunk so the
    # model can cite its source and apply the score threshold from the prompt.
    docs = faiss_index.similarity_search_with_score(question, k=k)
    docs_with_url = []
    for doc, score in docs:
        url = doc.metadata["source"]
        doc.page_content = f"URL: {url}\n{doc.page_content}\nSCORE:{score}\n"
        docs_with_url.append(doc)
    print(docs)
    return docs_with_url

def main():
    st.title("MariaDB KB Chatbot")

    if 'chat_history' not in st.session_state:
        st.session_state.chat_history = []

    user_input = st.text_input("Ask a question:", "")
    if st.button("Send"):
        st.session_state.chat_history.append(("User", user_input))
        docs = search_similar_docs(user_input)
        content = "\n".join(doc.page_content for doc in docs)
        results = process_doc(content, user_input)

        st.session_state.chat_history.append(("Bot", results))

    for role, message in st.session_state.chat_history:
        if role == "User":
            st.markdown(f"> **{role}**: {message}")
        else:
            st.markdown(f"**{role}**: {message}")

if __name__ == "__main__":
    main()

kb_chat/create_vectorstore.py

Lines changed: 86 additions & 0 deletions
@@ -0,0 +1,86 @@
import argparse
import pickle
import os
import csv
import openai
import re
import requests

from langchain.document_loaders import BSHTMLLoader
from langchain.vectorstores import FAISS
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from dotenv import load_dotenv

load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

def parse_args():
    parser = argparse.ArgumentParser(description='MariaDB KB Vector Store Generator')
    parser.add_argument('--csv-file', type=str, default='kb_urls.csv', help='Path to the input CSV file containing the URLs')
    parser.add_argument('--tmp-dir', type=str, default='tmp', help='Directory where the temporary HTML files will be stored')
    parser.add_argument('--vectorstore-path', type=str, default='vectorstore.pkl', help='Path to save the generated FAISS vector store pickle file')
    parser.add_argument('--chunk-size', type=int, default=4000, help='Chunk size for splitting the documents')
    parser.add_argument('--chunk-overlap', type=int, default=200, help='Overlap size between chunks when splitting documents')
    return parser.parse_args()

def download_web_page(url, tmp_dir):
    response = requests.get(url)

    if response.status_code == 200:
        content = response.text
        filename = url.replace('://', '_').replace('/', '_').strip() + '.html'

        with open(os.path.join(tmp_dir, filename), 'w', encoding='utf-8') as file:
            file.write(content)
    else:
        print(f"Error: Unable to fetch the web page. Status code: {response.status_code}")

def read_csv(csv_file):
    urls = []

    with open(csv_file, newline='', encoding='utf-8') as csvfile:
        csv_reader = csv.reader(csvfile)
        for row in csv_reader:
            if row[0].strip():
                urls.append(row[0])

    # Skip the header row.
    return urls[1:]

def main():
    args = parse_args()

    os.makedirs(args.tmp_dir, exist_ok=True)
    urls = read_csv(args.csv_file)
    all_docs = []
    for url in urls:
        filename = url.replace('://', '_').replace('/', '_').strip() + '.html'
        doc_path = os.path.join(args.tmp_dir, filename)
        if not os.path.exists(doc_path):
            download_web_page(url, args.tmp_dir)
        loader = BSHTMLLoader(doc_path)
        doc = loader.load()[0]

        # Collapse whitespace and keep the source URL so the chatbot can cite it.
        doc.page_content = re.sub(r'\s+', ' ', doc.page_content)
        doc.metadata["source"] = url

        all_docs.append(doc)

    text_splitter = CharacterTextSplitter(
        separator=" ",
        chunk_size=args.chunk_size,
        chunk_overlap=args.chunk_overlap,
        length_function=len,
    )
    print("Loaded {} documents".format(len(all_docs)))
    all_docs = text_splitter.split_documents(all_docs)
    print("After split: {} documents".format(len(all_docs)))

    faiss_index = FAISS.from_documents(all_docs, OpenAIEmbeddings())

    with open(args.vectorstore_path, "wb") as f:
        pickle.dump(faiss_index, f)

if __name__ == "__main__":
    main()
