import os

import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer


def fn_create_vector_store(lv_file_name, lv_domain):
    """
    Create a vector store by encoding and storing embeddings of column
    descriptions from an Excel file.

    Args:
        lv_file_name (str): The path to the Excel file.
        lv_domain (str): The domain name.

    Returns:
        str: A message indicating the status of the operation.
    """
    # File names
    lv_faiss_file_name = 'db/' + lv_domain + '_index.faiss'
    lv_rowdata_file_name = 'db/' + lv_domain + '_row_mapping.parquet'

    # Nothing to do if the index and row mapping already exist
    if os.path.exists(lv_faiss_file_name) and os.path.exists(lv_rowdata_file_name):
        return "Data Already Exists"

    # Ensure the output directory exists before writing the index files
    os.makedirs('db', exist_ok=True)

    # Load the pre-trained model
    # Model details -> https://www.sbert.net/docs/pretrained_models.html
    lv_model = SentenceTransformer('all-MiniLM-L6-v2')

    # Excel to DataFrame (sheet_name=None loads every sheet into a dict)
    lv_excel_data = pd.read_excel(lv_file_name, sheet_name=None)

    # Embeddings per sheet, plus an index-to-row mapping
    lv_embeddings_list = []
    lv_row_mapping = []

    # Read each sheet; column 0 holds the column name, column 1 its description
    for lv_sheet_name, lv_sheet_data in lv_excel_data.items():
        # Replace non-breaking spaces, then encode the descriptions
        lv_sheet_data.iloc[:, 1] = lv_sheet_data.iloc[:, 1].apply(
            lambda x: str(x).replace(u'\xa0', u' '))
        lv_column_descriptions = lv_sheet_data.iloc[:, 1].astype(str).tolist()
        lv_embeddings = lv_model.encode(lv_column_descriptions).astype('float32')
        lv_embeddings_list.append(lv_embeddings)

        # Record sheet name, column name, and description/hint for each row
        for row in lv_sheet_data.itertuples(index=False):
            lv_row_mapping.append({
                'sheet_name': lv_sheet_name,
                'column_name': row[0],
                'column_description': row[1]
            })

    # Combine all embeddings into one array
    lv_merged_embeddings = np.vstack(lv_embeddings_list)

    # Create a Faiss index (exact L2 search) and add all vectors
    lv_dimension = lv_merged_embeddings.shape[1]
    lv_index = faiss.IndexFlatL2(lv_dimension)
    lv_index.add(lv_merged_embeddings)

    # Save the Faiss index; vector i corresponds to row i of the mapping
    faiss.write_index(lv_index, lv_faiss_file_name)

    # Save the row mapping
    lv_row_mapping_df = pd.DataFrame(lv_row_mapping)
    lv_row_mapping_df.to_parquet(lv_rowdata_file_name, index=False)

    return "Record Added Successfully"


def fn_map_data(lv_saved_file_name, lv_file_name, lv_source_domain):
    """
    Map each column of a target Excel file to its closest source-domain column
    by searching the saved Faiss index.

    Args:
        lv_saved_file_name (str): Path to the target Excel file.
        lv_file_name (str): File name used for the saved mapping workbook.
        lv_source_domain (str): The source domain name.

    Returns:
        str: The mapping as a JSON array of records.
    """
    # File names
    lv_faiss_file_name = 'db/' + lv_source_domain + '_index.faiss'
    lv_sourcedata_file_name = 'db/' + lv_source_domain + '_row_mapping.parquet'
    lv_mapping_file_name = 'db/' + lv_source_domain + '_' + lv_file_name

    if not (os.path.exists(lv_faiss_file_name) and os.path.exists(lv_sourcedata_file_name)):
        raise Exception("Source Domain Data Not Found")

    # Load the pre-trained model
    # Model details -> https://www.sbert.net/docs/pretrained_models.html
    lv_model = SentenceTransformer('all-MiniLM-L6-v2')

    # Load the Faiss index and the source row mapping
    lv_index = faiss.read_index(lv_faiss_file_name)
    lv_source_mapping_df = pd.read_parquet(lv_sourcedata_file_name)

    # Excel to DataFrame (all sheets)
    lv_excel_data = pd.read_excel(lv_saved_file_name, sheet_name=None)

    # Collected mapping rows
    lv_mapped_rows = []

    # Read each sheet of the target workbook
    for lv_sheet_name, lv_sheet_data in lv_excel_data.items():
        # Process each row of the sheet
        for row in lv_sheet_data.itertuples(index=False):
            try:
                # Encode the target column description as the query
                lv_query = str(row[1])
                lv_query_embedding = lv_model.encode([lv_query]).astype('float32')

                # Search for the single most similar source vector
                lv_distances, lv_indices = lv_index.search(lv_query_embedding, 1)

                # Look up the matched source row by vector position
                lv_row = lv_source_mapping_df.iloc[[lv_indices[0][0]]]

                lv_mapped_rows.append({
                    'source_sheet_name': lv_row['sheet_name'].values[0],
                    'source_column': lv_row['column_name'].values[0],
                    'target_sheet_name': lv_sheet_name,
                    'target_column': row[0]
                })
            except Exception:
                # Skip rows that cannot be encoded or matched
                pass

    lv_row_mapping_df = pd.DataFrame(
        lv_mapped_rows,
        columns=['source_sheet_name', 'source_column', 'target_sheet_name', 'target_column'])

    # Save the mapping to an Excel file and return it as JSON
    lv_row_mapping_df.to_excel(lv_mapping_file_name, index=False)
    return lv_row_mapping_df.to_json(orient='records')
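
# ---------------------------------------------------------------------------
# Design note: IndexFlatL2 ranks matches by Euclidean distance. For sentence
# embeddings, cosine similarity is a common alternative; a minimal sketch of
# that variant (an assumption, not part of the pipeline above) replaces the
# index construction in fn_create_vector_store with:
#
#     lv_index = faiss.IndexFlatIP(lv_dimension)   # inner-product index
#     faiss.normalize_L2(lv_merged_embeddings)     # unit-normalize in place
#     lv_index.add(lv_merged_embeddings)
#
# With unit-length vectors, inner product equals cosine similarity; query
# embeddings must be normalized the same way before calling lv_index.search().
# ---------------------------------------------------------------------------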
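
# A minimal usage sketch. The domain 'finance' and the file names
# 'source_columns.xlsx', 'target.xlsx', and 'mapping.xlsx' are hypothetical
# examples, not part of the module above: the first call indexes the source
# domain's column descriptions, the second maps the target workbook's columns
# against that index.
if __name__ == '__main__':
    print(fn_create_vector_store('source_columns.xlsx', 'finance'))
    print(fn_map_data('target.xlsx', 'mapping.xlsx', 'finance'))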