-
Notifications
You must be signed in to change notification settings - Fork 38
Expand file tree
/
Copy pathdata_loader.py
More file actions
28 lines (22 loc) · 766 Bytes
/
data_loader.py
File metadata and controls
28 lines (22 loc) · 766 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
from openai import OpenAI
from llama_index.readers.file import PDFReader
from llama_index.core.node_parser import SentenceSplitter
from dotenv import load_dotenv
# Load variables from a .env file into the process environment. This runs
# before the client is built -- presumably so OpenAI() can find its API key
# there; confirm against the deployment setup.
load_dotenv()
# Module-wide OpenAI client, constructed with default settings and used by
# embed_texts().
client = OpenAI()
# Name of the embedding model sent with every embeddings request.
EMBED_MODEL = "text-embedding-3-large"
# Presumably the length of the vectors EMBED_MODEL returns; nothing in the
# visible code reads it -- TODO confirm it matches the vector store schema.
EMBED_DIM = 3072
# Shared splitter used by load_and_chunk_pdf(). NOTE(review): chunk_size and
# chunk_overlap look like token counts per the llama-index docs -- confirm.
splitter = SentenceSplitter(chunk_size=1000, chunk_overlap=200)
def load_and_chunk_pdf(path: str):
    """Read the PDF at *path* and return its text as a flat list of chunks.

    The file is parsed with ``PDFReader``; every returned document that has
    no (or empty) ``text`` is ignored. Each remaining text is cut up by the
    module-level ``splitter`` and the pieces are concatenated in document
    order.
    """
    documents = PDFReader().load_data(file=path)

    # Keep only documents that actually carry text.
    usable_texts = []
    for document in documents:
        if getattr(document, "text", None):
            usable_texts.append(document.text)

    # Flatten the per-document chunk lists into one list.
    return [
        piece
        for body in usable_texts
        for piece in splitter.split_text(body)
    ]
def embed_texts(
    texts: list[str], *, batch_size: int = 128
) -> list[list[float]]:
    """Return one embedding vector per string in *texts*, in input order.

    The strings are sent to the OpenAI embeddings endpoint using the
    module-level ``client`` and ``EMBED_MODEL``. Requests are issued in
    slices of at most *batch_size* strings so that a large document does
    not exceed the endpoint's per-request input limits.

    Args:
        texts: Strings to embed. An empty list yields an empty result
            without contacting the API.
        batch_size: Maximum number of strings per request. Keyword-only;
            the default keeps existing call sites working unchanged.

    Returns:
        A list with the same length as *texts*; element ``i`` is the
        embedding of ``texts[i]``.

    Raises:
        ValueError: If *batch_size* is less than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    if not texts:
        # The endpoint rejects an empty input array, so skip the round trip.
        return []

    vectors: list[list[float]] = []
    for start in range(0, len(texts), batch_size):
        response = client.embeddings.create(
            model=EMBED_MODEL,
            input=texts[start:start + batch_size],
        )
        # Batches are processed in order, so appending preserves the
        # one-to-one alignment between texts and vectors.
        vectors.extend(item.embedding for item in response.data)
    return vectors