|
import tempfile
from pathlib import Path

from langchain_chroma import Chroma
from langchain_community.document_loaders import FileSystemBlobLoader
from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers import PyMuPDFParser
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
|
|
def load_data(documents):
    """
    Load and parse data from one or more uploaded PDF files.

    The uploads are written into a private temporary directory, parsed page
    by page, and the temporary directory is removed again before returning.
    Files that already live in the current working directory are neither
    parsed nor deleted.

    Args:
        documents Union[UploadedFile, list(UploadedFile)]: A single UploadedFile or list of UploadedFile objects. Strict for PDF only. Each object must expose ``name`` and ``getbuffer()``.

    Returns:
        List[Document]: A list of parsed LangChain Document class. Empty when no file was supplied.
    """
    if documents is None:
        return []

    # Wrap a lone upload so the loop below always iterates over upload objects.
    if hasattr(documents, "getbuffer"):
        documents = [documents]

    uploads = list(documents)
    if not uploads:
        # Nothing uploaded: do not touch the file system at all.
        return []

    with tempfile.TemporaryDirectory() as tmp_dir:
        for file in uploads:
            # Keep only the final path component so a crafted name cannot
            # place the file outside the temporary directory.
            target = Path(tmp_dir) / Path(file.name).name
            with open(target, "wb") as f:
                f.write(file.getbuffer())

        loader = GenericLoader(
            blob_loader=FileSystemBlobLoader(path=tmp_dir, glob="*.pdf"),
            blob_parser=PyMuPDFParser(mode="page"),
        )
        # Parse while the temporary files still exist.
        loaded_docs = loader.load()

    # Metadata values that point into the (now deleted) temporary directory
    # are reduced to the bare file name, so they stay meaningful to callers.
    for doc in loaded_docs:
        for key, value in list(doc.metadata.items()):
            if isinstance(value, str) and value.startswith(tmp_dir):
                doc.metadata[key] = Path(value).name

    return loaded_docs
|
|
|
def split_data(loaded_docs):
    """
    Split a list of loaded documents into smaller chunks.

    Chunks are at most 1000 characters long, overlap by 200 characters, and
    record their start offset in the source document (``add_start_index``).

    Args:
        loaded_docs List[Document]: A list of loaded LangChain Document class.

    Returns:
        List[Document]: A list of smaller chunks of parsed document. Empty when no documents are given.
    """
    if not loaded_docs:
        return []

    splitter = RecursiveCharacterTextSplitter(
        separators=[
            "\n\n",
            "\n",
            " ",
            ".",
            ",",
            "\u200b",  # Zero-width space
            "\uff0c",  # Fullwidth comma
            "\u3001",  # Ideographic comma
            "\uff0e",  # Fullwidth full stop
            "\u3002",  # Ideographic full stop
            # The empty string is the last-resort separator (split between
            # characters). It must come last and be its own list element;
            # without a trailing comma it would merge with a neighbouring
            # string literal and disappear from the list.
            "",
        ],
        chunk_size=1000,
        chunk_overlap=200,
        add_start_index=True,
        is_separator_regex=False,
    )

    return splitter.split_documents(loaded_docs)
|
|
|
def upsert_chromadb(splitted_docs, embedding, idx, collection_name, db_name):
    """
    Create a Chroma vector store from document chunks.

    The work is delegated to ``Chroma.from_documents``; the store is given
    a persist directory of ``"./" + db_name``.

    Args:
        splitted_docs List[Document]: A list of smaller chunks of parsed document.
        embedding: The embedding model.
        idx List[str]: A list of unique identifiers for each document.
        collection_name str: The name of the Chroma collection.
        db_name str: The name of the database, used as the directory name.

    Returns:
        The vector store object returned by ``Chroma.from_documents``.
    """
    storage_dir = "./" + db_name
    store_options = {
        "ids": idx,
        "collection_name": collection_name,
        "persist_directory": storage_dir,
    }
    return Chroma.from_documents(splitted_docs, embedding, **store_options)
|
|