Libidrave committed on
Commit
c8d159b
·
verified ·
1 Parent(s): 1a2db48

Create utils/preprocess.py

Browse files
Files changed (1) hide show
  1. utils/preprocess.py +78 -0
utils/preprocess.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_community.document_loaders.generic import GenericLoader
2
+ from langchain_community.document_loaders import FileSystemBlobLoader
3
+ from langchain_community.document_loaders.parsers import PyMuPDFParser
4
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
5
+
6
+ from langchain_chroma import Chroma
7
+
8
+ from pathlib import Path
9
+
10
def load_data(documents):
    """
    Load and parse uploaded PDF files into LangChain documents.

    Every upload is written to a private temporary directory, parsed with
    PyMuPDF in ``page`` mode, and the directory is removed afterwards (also
    when parsing fails). Only the uploaded files are read and deleted; PDFs
    that already live in the current working directory are never touched.

    Args:
        documents Union[UploadedFile, list(UploadedFile)]: A single UploadedFile or list of
            UploadedFile objects (file-like objects exposing ``name`` and ``getbuffer()``).
            Strict for PDF only. ``None`` or an empty list yields an empty result.

    Returns:
        List[Document]: A list of parsed LangChain Document class.
    """
    # Local import keeps this function self-contained without touching the
    # module's import block.
    import tempfile

    if documents is None:
        return []

    # A single upload is wrapped so the loop below always iterates over files
    # (iterating a lone file-like object would walk its bytes line by line).
    if hasattr(documents, "getbuffer"):
        uploads = [documents]
    else:
        uploads = list(documents)

    if not uploads:
        return []

    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_path = Path(tmp_dir)

        for file in uploads:
            # Keep only the last path component so a crafted upload name
            # cannot write outside the temporary directory.
            target = tmp_path / Path(file.name).name
            with open(target, "wb") as f:
                f.write(file.getbuffer())

        # Load and parse only what was just written.
        loader = GenericLoader(
            blob_loader=FileSystemBlobLoader(path=tmp_dir, glob="*.pdf"),
            blob_parser=PyMuPDFParser(mode="page"),
        )
        loaded_docs = loader.load()

    # The temp directory is gone at this point, so any path recorded in the
    # metadata would dangle. Report the bare file name instead, which is what
    # loading from "./" produced.
    # NOTE(review): assumes the parser stores the path under "source" and
    # "file_path" - confirm against the installed langchain_community version.
    for doc in loaded_docs:
        for key in ("source", "file_path"):
            value = doc.metadata.get(key)
            if isinstance(value, str):
                doc.metadata[key] = Path(value).name

    return loaded_docs
36
+
37
# Separators tried in order by split_data. The empty string must stay LAST:
# it is the character-level fallback that guarantees a chunk can always be
# cut down to chunk_size, and it must be its own list element (a missing
# comma after it would silently merge it into the next string literal).
SPLIT_SEPARATORS = (
    "\n\n",
    "\n",
    " ",
    ".",
    ",",
    "\u200b",  # Zero-width space
    "\uff0c",  # Fullwidth comma
    "\u3001",  # Ideographic comma
    "\uff0e",  # Fullwidth full stop
    "\u3002",  # Ideographic full stop
    "",
)


def split_data(loaded_docs):
    """
    Split a list of loaded documents into smaller chunks.

    Chunks target 1000 characters with a 200-character overlap, and each
    chunk's metadata receives its start index (``add_start_index=True``).
    Separators are matched literally, not as regular expressions.

    Args:
        loaded_docs List[Document]: A list of loaded LangChain Document class.

    Returns:
        List[Document]: A list of smaller chunks of parsed document.
    """
    splitter = RecursiveCharacterTextSplitter(
        separators=list(SPLIT_SEPARATORS),
        chunk_size=1000,
        chunk_overlap=200,
        add_start_index=True,
        is_separator_regex=False,
    )

    return splitter.split_documents(loaded_docs)
62
+
63
def upsert_chromadb(splitted_docs, embedding, idx, collection_name, db_name):
    """
    Store document chunks in a persisted Chroma collection.

    Args:
        splitted_docs List[Document]: A list of smaller chunks of parsed document.
        embedding: The embedding model.
        idx List[str]: A list of unique identifiers, one for each document.
        collection_name str: The name of the Chroma collection.
        db_name str: The name of the database; it is used as the persistence
            directory, relative to the current working directory.

    Returns:
        The vector store object returned by ``Chroma.from_documents``.
    """
    # The database is persisted in a folder named after db_name.
    persist_path = "./" + db_name

    return Chroma.from_documents(
        documents=splitted_docs,
        embedding=embedding,
        ids=idx,
        collection_name=collection_name,
        persist_directory=persist_path,
    )