rahgadda committed on
Commit
ae0839f
·
verified ·
1 Parent(s): e1b47f7

Initial Draft

Browse files
Files changed (1) hide show
  1. lib/api/endpoints/VectorStoreAPI.py +31 -131
lib/api/endpoints/VectorStoreAPI.py CHANGED
@@ -1,139 +1,39 @@
1
- import pandas as pd
2
- import numpy as np
3
- from sentence_transformers import SentenceTransformer
4
- import faiss
5
- import os
6
 
7
- # Function to create a vector store
8
def fn_create_vector_store(lv_file_name, lv_domain):
    """
    Create a vector store by encoding and storing embeddings of column descriptions from an Excel file.

    Every sheet of the workbook is read. In each sheet the first column is
    stored as the column name and the second column is the description that
    gets embedded with the 'all-MiniLM-L6-v2' sentence-transformer model.
    The embeddings of all sheets are added to one Faiss ``IndexFlatL2`` and
    a parquet file keeps the row -> (sheet, column, description) mapping in
    the same order as the index.

    Args:
        lv_file_name (str): The path to the Excel file.
        lv_domain (str): The domain name; used to build the output file names under ``db/``.

    Returns:
        str: "Data Already Exist" when both output files are already present,
        otherwise "Record Added Successfully".

    Raises:
        ValueError: If the workbook contains no rows to index.
    """
    # File Names
    lv_faiss_file_name = 'db/'+lv_domain+'_index.faiss'
    lv_rowdata_file_name = 'db/'+lv_domain+'_row_mapping.parquet'

    # Nothing to do when the store for this domain was already built
    if os.path.exists(lv_faiss_file_name) and os.path.exists(lv_rowdata_file_name):
        return "Data Already Exist"

    # Load the pre-trained model
    # Details available here -> https://www.sbert.net/docs/pretrained_models.html
    lv_model = SentenceTransformer('all-MiniLM-L6-v2')

    # Excel to Dataframe (sheet_name=None -> dict of every sheet)
    lv_excel_data = pd.read_excel(lv_file_name, sheet_name=None)

    # Embeddings per sheet and the index -> row mapping, kept in the same order
    lv_embeddings_list = []
    lv_row_mapping = []

    for lv_sheet_name, lv_sheet_data in lv_excel_data.items():
        # An empty sheet has nothing to embed
        if lv_sheet_data.empty:
            continue

        lv_column_names = lv_sheet_data.iloc[:, 0].tolist()
        # Build the cleaned descriptions as a plain list instead of writing
        # strings back into the DataFrame column, which may have a
        # non-string dtype (e.g. an all-blank column read as float).
        lv_column_descriptions = [
            str(lv_value).replace(u'\xa0', u' ')
            for lv_value in lv_sheet_data.iloc[:, 1]
        ]

        lv_embeddings = lv_model.encode(lv_column_descriptions).astype('float32')
        lv_embeddings_list.append(lv_embeddings)

        # Merging all table, columns, description/hint into table
        lv_row_mapping.extend(
            {
                'sheet_name': lv_sheet_name,
                'column_name': lv_column_name,
                'column_description': lv_column_description,
            }
            for lv_column_name, lv_column_description
            in zip(lv_column_names, lv_column_descriptions)
        )

    if not lv_embeddings_list:
        # np.vstack([]) would fail with an unrelated-looking message
        raise ValueError("No rows found to index in " + str(lv_file_name))

    # Combine all embeddings into one array
    lv_merged_embeddings_list = np.vstack(lv_embeddings_list)

    # Create a Faiss index
    lv_dimension = lv_merged_embeddings_list.shape[1]
    lv_index = faiss.IndexFlatL2(lv_dimension)
    lv_index.add(lv_merged_embeddings_list)

    # Make sure the output directory exists before writing into it
    os.makedirs('db', exist_ok=True)

    # Saving the Faiss index to a file
    faiss.write_index(lv_index, lv_faiss_file_name)

    # Saving the Row Data to a file
    pd.DataFrame(lv_row_mapping).to_parquet(lv_rowdata_file_name, index=False)

    return "Record Added Successfully"
74
-
75
-
76
def fn_map_data(lv_saved_file_name,lv_file_name,lv_source_domain):
    """
    Map every row of an Excel file to its nearest column in a stored source domain.

    For each row of each sheet, the value in the second column is embedded
    with the 'all-MiniLM-L6-v2' model and the single nearest vector is looked
    up in the source domain's Faiss index. The matched source sheet/column is
    paired with the current sheet name and the row's first-column value.
    Rows that cannot be processed are skipped (and logged).

    Args:
        lv_saved_file_name (str): Path of the Excel file to read; all sheets are processed.
        lv_file_name (str): File name appended to ``db/<source domain>_`` to form the output Excel path.
        lv_source_domain (str): Domain whose Faiss index and row mapping are loaded from ``db/``.

    Returns:
        str: JSON (``orient='records'``) with the keys ``source_sheet_name``,
        ``source_column``, ``target_sheet_name`` and ``target_column``.

    Raises:
        Exception: "Source Domain Data Not Found" when the index or the row
        mapping file of the source domain does not exist.
    """
    # Local import: the module does not import logging at file level
    import logging

    lv_logger = logging.getLogger(__name__)

    # File Names
    lv_faiss_file_name = 'db/'+lv_source_domain+'_index.faiss'
    lv_sourcedata_file_name = 'db/'+lv_source_domain+'_row_mapping.parquet'
    lv_mapping_file_name = 'db/'+lv_source_domain+"_"+lv_file_name

    if not (os.path.exists(lv_faiss_file_name) and os.path.exists(lv_sourcedata_file_name)):
        raise Exception("Source Domain Data Not Found")

    # Load the pre-trained model
    # Details available here -> https://www.sbert.net/docs/pretrained_models.html
    lv_model = SentenceTransformer('all-MiniLM-L6-v2')

    # Load the Faiss index and the matching row data
    lv_index = faiss.read_index(lv_faiss_file_name)
    lv_source_mapping_df = pd.read_parquet(lv_sourcedata_file_name)

    # Excel to Dataframe (sheet_name=None -> dict of every sheet)
    lv_excel_data = pd.read_excel(lv_saved_file_name, sheet_name=None)

    # Collected as plain dicts; the DataFrame is built once at the end
    lv_mapped_rows = []

    for lv_sheet_name, lv_sheet_data in lv_excel_data.items():
        for row in lv_sheet_data.itertuples(index=False):
            try:
                lv_query_embedding = lv_model.encode([row[1]])

                # Search for the single most similar vector
                _, lv_indices = lv_index.search(np.array(lv_query_embedding), 1)
                lv_match = int(lv_indices[0][0])
                if lv_match < 0:
                    # Faiss reports -1 when no neighbour exists; iloc[-1]
                    # would silently pick the last source row instead.
                    continue

                lv_source_row = lv_source_mapping_df.iloc[lv_match]
                lv_mapped_rows.append({
                    'source_sheet_name': lv_source_row['sheet_name'],
                    'source_column': lv_source_row['column_name'],
                    'target_sheet_name': lv_sheet_name,
                    'target_column': row[0]
                })
            except Exception:
                # Best effort: one bad row must not abort the whole mapping,
                # but leave a trace instead of dropping it silently.
                lv_logger.warning(
                    "Skipping row %r of sheet %r", row, lv_sheet_name, exc_info=True
                )

    lv_row_mapping_df = pd.DataFrame(
        lv_mapped_rows,
        columns=['source_sheet_name','source_column','target_sheet_name','target_column']
    )

    # Saving the Row Data to a file
    lv_row_mapping_df.to_excel(lv_mapping_file_name,index=False)

    return lv_row_mapping_df.to_json(orient='records')
 
1
+ from flask.views import MethodView
2
+ from flask import request,Response
 
 
 
3
 
4
+ import json
5
+ import traceback
6
+ import logging
 
7
 
8
+ import lib.api.vector.VectorStore as cv
 
 
9
 
10
class VectorStoreAPI(MethodView):
    """
    Flask ``MethodView`` for building a vector store from an uploaded Excel file.

    GET returns a fixed test message. POST saves the uploaded file under
    ``storage/`` and passes it to ``cv.fn_create_vector_store``.
    """

    lv_logger = logging.getLogger(__name__)

    @staticmethod
    def _json_response(lv_payload, lv_status):
        # Single place that serialises a dict into a JSON ``Response``.
        return Response(json.dumps(lv_payload), status=lv_status, mimetype='application/json')

    def get(self):
        """Return a fixed JSON test message with status 200."""
        return self._json_response({ "test message": "working"}, 200)

    def post(self):
        """
        Save the uploaded Excel file and create the vector store for its domain.

        Expects a multipart form with a ``file`` part and a ``domain`` field.

        Returns:
            Response: ``{"status": ...}`` with 200 on success,
            ``{"error_message": ...}`` with 400 for a missing or invalid
            field, or with 500 when saving or processing fails.
        """
        lv_file = request.files.get('file')
        lv_domain = request.form.get('domain', '')

        # A missing field is a client error, not a server failure
        if lv_file is None or not lv_domain:
            return self._json_response(
                {"error_message": "Both 'file' and 'domain' are required"}, 400
            )

        # The domain is concatenated into file paths, so reject anything that
        # is not letters, digits, '_' or '-' (no separators, no '..').
        if not lv_domain.replace('_', '').replace('-', '').isalnum():
            return self._json_response(
                {"error_message": "Invalid domain name"}, 400
            )

        try:
            # Saving file
            lv_file_name = 'storage/' + lv_domain + ".xlsx"
            lv_file.save(lv_file_name)

            # Processing the file
            lv_status = cv.fn_create_vector_store(lv_file_name, lv_domain)

            return self._json_response({"status":lv_status}, 200)
        except Exception as e:
            # logger.exception records the message together with the traceback
            self.lv_logger.exception("%s: %s", type(e).__name__, e)

            return self._json_response({"error_message":str(e)}, 500)