rahgadda committed on
Commit
ea4fe04
·
verified ·
1 Parent(s): 4257878

Initial Draft

Browse files
Files changed (1) hide show
  1. lib/api/vector/VectorStore.py +139 -0
lib/api/vector/VectorStore.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from sentence_transformers import SentenceTransformer
4
+ import faiss
5
+ import os
6
+
7
+ # Function to create a vector store
8
def fn_create_vector_store(lv_file_name, lv_domain):
    """
    Create a vector store by encoding and storing embeddings of column descriptions from an Excel file.

    Every sheet of the workbook is read. For each sheet the first column is
    taken as the column name and the second column as its description. The
    descriptions are embedded with the 'all-MiniLM-L6-v2' sentence-transformer
    model and added to a single Faiss ``IndexFlatL2``. A parquet file with one
    row per embedded description (``sheet_name``, ``column_name``,
    ``column_description``) is written alongside it, in the same order as the
    vectors in the index.

    Args:
        lv_file_name (str): The path to the Excel file.
        lv_domain (str): The domain name; used as the prefix of the files
            written under ``db/``.

    Returns:
        str: "Data Already Exist" when both output files are already present,
        otherwise "Record Added Successfully".

    Raises:
        ValueError: If no sheet contains at least one row with two columns.
        Exception: Any error raised while reading the workbook, encoding or
            writing the output files is propagated unchanged.
    """
    # File Names
    lv_faiss_file_name = 'db/'+lv_domain+'_index.faiss'
    lv_rowdata_file_name = 'db/'+lv_domain+'_row_mapping.parquet'

    # Nothing to do when the store for this domain has already been built
    if os.path.exists(lv_faiss_file_name) and os.path.exists(lv_rowdata_file_name):
        return "Data Already Exist"

    # Load the pre-trained model
    # Details available here -> https://www.sbert.net/docs/pretrained_models.html
    lv_model = SentenceTransformer('all-MiniLM-L6-v2')

    # Excel to Dataframe (sheet_name=None loads every sheet into a dict)
    lv_excel_data = pd.read_excel(lv_file_name, sheet_name=None)

    # Per-sheet embeddings and the index -> row mapping, kept in the same order
    lv_embeddings_list = []
    lv_row_mapping = []

    # Reading each sheet
    for lv_sheet_name, lv_sheet_data in lv_excel_data.items():
        # A sheet without rows or without a description column has nothing to
        # embed; skipping it also keeps np.vstack from seeing an empty array.
        if lv_sheet_data.empty or lv_sheet_data.shape[1] < 2:
            continue

        lv_column_names = lv_sheet_data.iloc[:, 0].tolist()
        # Non-breaking spaces are replaced with plain spaces before encoding
        lv_column_descriptions = [
            str(lv_value).replace(u'\xa0', u' ')
            for lv_value in lv_sheet_data.iloc[:, 1]
        ]

        # Creating Embeddings
        lv_embeddings = np.asarray(lv_model.encode(lv_column_descriptions), dtype='float32')
        lv_embeddings_list.append(lv_embeddings)

        # Merging all table, columns, description/hint into table
        lv_row_mapping.extend(
            {
                'sheet_name': lv_sheet_name,
                'column_name': lv_column_name,
                'column_description': lv_column_description,
            }
            for lv_column_name, lv_column_description
            in zip(lv_column_names, lv_column_descriptions)
        )

    if not lv_embeddings_list:
        raise ValueError("No column descriptions found in " + str(lv_file_name))

    # Combine all embeddings into one array
    lv_merged_embeddings_list = np.vstack(lv_embeddings_list)

    # Create a Faiss index
    lv_dimension = lv_merged_embeddings_list.shape[1]
    lv_index = faiss.IndexFlatL2(lv_dimension)
    lv_index.add(lv_merged_embeddings_list)

    # Make sure the output directory exists before anything is written
    os.makedirs(os.path.dirname(lv_faiss_file_name), exist_ok=True)

    # Saving the Faiss index to a file
    faiss.write_index(lv_index, lv_faiss_file_name)

    # Saving the Row Data to a file
    lv_row_mapping_df = pd.DataFrame(lv_row_mapping)
    lv_row_mapping_df.to_parquet(lv_rowdata_file_name, index=False)

    return "Record Added Successfully"
74
+
75
+
76
def fn_map_data(lv_saved_file_name,lv_file_name,lv_source_domain):
    """
    Map every described column of an Excel workbook to its closest source-domain column.

    The Faiss index and row-mapping parquet stored under ``db/`` for
    ``lv_source_domain`` are loaded. For each sheet of the workbook the first
    column is taken as the target column name and the second column as its
    description. Each description is embedded with the 'all-MiniLM-L6-v2'
    model and the single nearest vector in the index selects the source row.
    The resulting mapping is written to an Excel file and returned as JSON.

    Args:
        lv_saved_file_name (str): Path of the Excel workbook to map.
        lv_file_name (str): File name used for the output; the mapping is
            written to ``db/<lv_source_domain>_<lv_file_name>``.
        lv_source_domain (str): Domain whose stored index and row mapping
            are searched.

    Returns:
        str: JSON array (``orient='records'``) with the keys
        ``source_sheet_name``, ``source_column``, ``target_sheet_name`` and
        ``target_column``.

    Raises:
        Exception: "Source Domain Data Not Found" when the index or the row
            mapping file for ``lv_source_domain`` does not exist.
    """
    # File Names
    lv_faiss_file_name = 'db/'+lv_source_domain+'_index.faiss'
    lv_sourcedata_file_name = 'db/'+lv_source_domain+'_row_mapping.parquet'
    lv_mapping_file_name = 'db/'+lv_source_domain+"_"+lv_file_name

    # Both source artefacts are required before any mapping can be done
    if not (os.path.exists(lv_faiss_file_name) and os.path.exists(lv_sourcedata_file_name)):
        raise Exception("Source Domain Data Not Found")

    # Load the pre-trained model
    # Details available here -> https://www.sbert.net/docs/pretrained_models.html
    lv_model = SentenceTransformer('all-MiniLM-L6-v2')

    # Load the Faiss index
    lv_index = faiss.read_index(lv_faiss_file_name)

    # Load the Row Data; position i corresponds to vector i of the index
    lv_source_mapping_df = pd.read_parquet(lv_sourcedata_file_name)
    lv_source_sheets = lv_source_mapping_df['sheet_name'].tolist()
    lv_source_columns = lv_source_mapping_df['column_name'].tolist()
    lv_source_count = len(lv_source_mapping_df)

    # Excel to Dataframe (sheet_name=None loads every sheet into a dict)
    lv_excel_data = pd.read_excel(lv_saved_file_name, sheet_name=None)

    # New Mapping rows, collected in a list and turned into a Dataframe once
    lv_mapping_columns = ['source_sheet_name','source_column','target_sheet_name','target_column']
    lv_mapped_rows = []

    # Reading each sheet
    for lv_sheet_name, lv_sheet_data in lv_excel_data.items():
        # Without a description column there is nothing to search for
        if lv_sheet_data.shape[1] < 2:
            continue

        # (target column, query text) for every row that has a description.
        # The text is normalised the same way as the stored descriptions.
        lv_targets = [
            (row[0], str(row[1]).replace(u'\xa0', u' '))
            for row in lv_sheet_data.itertuples(index=False)
            if not pd.isna(row[1])
        ]
        if not lv_targets:
            continue

        # Creating Embeddings for the whole sheet in one call
        lv_query_embeddings = np.asarray(
            lv_model.encode([lv_query for _, lv_query in lv_targets]),
            dtype='float32',
        )

        # Search for the single most similar vector of every query
        _, lv_indices = lv_index.search(lv_query_embeddings, 1)

        for (lv_target_column, _), lv_hit in zip(lv_targets, lv_indices[:, 0]):
            lv_hit = int(lv_hit)
            # Faiss reports -1 when it has no neighbour; a negative or
            # out-of-range position must not be used to pick a source row.
            if not 0 <= lv_hit < lv_source_count:
                continue

            lv_mapped_rows.append({
                'source_sheet_name': lv_source_sheets[lv_hit],
                'source_column': lv_source_columns[lv_hit],
                'target_sheet_name': lv_sheet_name,
                'target_column': lv_target_column
            })

    lv_row_mapping_df = pd.DataFrame(lv_mapped_rows, columns=lv_mapping_columns)

    # Saving the Row Data to a file
    lv_row_mapping_df.to_excel(lv_mapping_file_name, index=False)

    return lv_row_mapping_df.to_json(orient='records')