import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import os
# Function to create a vector store
def fn_create_vector_store(lv_file_name, lv_domain):
    """
    Create a vector store by encoding and storing embeddings of column descriptions from an Excel file.

    Args:
        lv_file_name (str): The path to the Excel file.
        lv_domain (str): The domain name.

    Returns:
        str: A message indicating the status of the operation.
    """
    # File Names
    lv_faiss_file_name = 'db/' + lv_domain + '_index.faiss'
    lv_rowdata_file_name = 'db/' + lv_domain + '_row_mapping.parquet'
    # Ensure the output directory exists before writing
    os.makedirs('db', exist_ok=True)
    # Check if files exist
    if os.path.exists(lv_faiss_file_name) and os.path.exists(lv_rowdata_file_name):
        return "Data Already Exists"
    else:
        try:
            # Load the pre-trained model
            lv_model = SentenceTransformer('all-MiniLM-L6-v2')
            # Excel to DataFrame (sheet_name=None loads every sheet into a dict)
            lv_excel_data = pd.read_excel(lv_file_name, sheet_name=None)
            # Lists to collect the embeddings and the index-to-row mapping
            lv_embeddings_list = []
            lv_row_mapping = []
            # Reading each sheet
            for lv_sheet_name, lv_sheet_data in lv_excel_data.items():
                # Creating embeddings for the description column (second column)
                # Model details available here -> https://www.sbert.net/docs/pretrained_models.html
                lv_sheet_data.iloc[:, 1] = lv_sheet_data.iloc[:, 1].apply(lambda x: str(x).replace('\xa0', ' '))
                lv_column_descriptions = lv_sheet_data.iloc[:, 1].astype(str).tolist()
                lv_embeddings = lv_model.encode(lv_column_descriptions).astype('float32')
                lv_embeddings_list.append(lv_embeddings)
                # Record sheet name, column name, and description/hint for each row,
                # in the same order the embeddings were appended
                for i, row in enumerate(lv_sheet_data.itertuples(index=False)):
                    lv_row_mapping.append({
                        'sheet_name': lv_sheet_name,
                        'column_name': row[0],
                        'column_description': row[1]
                    })
            # Combine all embeddings into one array
            lv_merged_embeddings_list = np.vstack(lv_embeddings_list)
            # Create a Faiss index
            lv_dimension = lv_merged_embeddings_list.shape[1]
            lv_index = faiss.IndexFlatL2(lv_dimension)
            lv_index.add(lv_merged_embeddings_list)
            # Saving the Faiss index to a file
            faiss.write_index(lv_index, lv_faiss_file_name)
            # Saving the row data to a file
            lv_row_mapping_df = pd.DataFrame(lv_row_mapping)
            lv_row_mapping_df.to_parquet(lv_rowdata_file_name, index=False)
            return "Record Added Successfully"
        except Exception:
            # Re-raise with the original traceback
            raise
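# Illustrative, hypothetical helper (not part of the original app): a minimal
# sketch of a single lookup against the saved artifacts, assuming the domain
# files created by fn_create_vector_store exist under db/. It shows the data
# contract the code above relies on: the Faiss id returned by a search is the
# positional row in the parquet mapping, because embeddings were appended in
# the same order as lv_row_mapping.
def fn_demo_lookup(lv_query, lv_domain):
    lv_model = SentenceTransformer('all-MiniLM-L6-v2')
    lv_index = faiss.read_index('db/' + lv_domain + '_index.faiss')
    lv_mapping_df = pd.read_parquet('db/' + lv_domain + '_row_mapping.parquet')
    # Encode the query and retrieve the single nearest description
    lv_embedding = lv_model.encode([str(lv_query)]).astype('float32')
    lv_distances, lv_indices = lv_index.search(lv_embedding, 1)
    # Positional lookup: Faiss id i corresponds to row i of the mapping
    return lv_mapping_df.iloc[int(lv_indices[0][0])].to_dict()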
def fn_map_data(lv_saved_file_name, lv_file_name, lv_source_domain):
    """
    Map each column of a target Excel file to the most similar source-domain column.

    Args:
        lv_saved_file_name (str): The path to the target Excel file.
        lv_file_name (str): The file name for the generated mapping Excel file.
        lv_source_domain (str): The source domain name.

    Returns:
        str: The column mapping as a JSON string of records.
    """
    # File Names
    lv_faiss_file_name = 'db/' + lv_source_domain + '_index.faiss'
    lv_sourcedata_file_name = 'db/' + lv_source_domain + '_row_mapping.parquet'
    lv_mapping_file_name = 'db/' + lv_source_domain + "_" + lv_file_name
    # Loading Data
    if os.path.exists(lv_faiss_file_name) and os.path.exists(lv_sourcedata_file_name):
        # Load the pre-trained model
        lv_model = SentenceTransformer('all-MiniLM-L6-v2')
        # Load the Faiss index
        lv_index = faiss.read_index(lv_faiss_file_name)
        # Load the row data; reset the index so positions line up with Faiss ids
        lv_source_mapping_df = pd.read_parquet(lv_sourcedata_file_name)
        lv_source_mapping_df = lv_source_mapping_df.reset_index(drop=True)
        # Excel to DataFrame (sheet_name=None loads every sheet into a dict)
        lv_excel_data = pd.read_excel(lv_saved_file_name, sheet_name=None)
        # New mapping DataFrame
        lv_row_mapping_df = pd.DataFrame(columns=['source_sheet_name', 'source_column', 'target_sheet_name', 'target_column'])
        # Reading each sheet
        for lv_sheet_name, lv_sheet_data in lv_excel_data.items():
            # Processing each row of the sheet
            for i, row in enumerate(lv_sheet_data.itertuples(index=False)):
                try:
                    # Embed the target column description (second column)
                    # Model details available here -> https://www.sbert.net/docs/pretrained_models.html
                    lv_query = str(row[1])
                    lv_query_embedding = lv_model.encode([lv_query]).astype('float32')
                    # Search for the single most similar source vector
                    lv_distances, lv_indices = lv_index.search(lv_query_embedding, 1)
                    # Mapped row: the Faiss id is the positional row in the source mapping
                    lv_row = lv_source_mapping_df.iloc[[lv_indices[0][0]]]
                    lv_new_row = {
                        'source_sheet_name': lv_row['sheet_name'].values[0],
                        'source_column': lv_row['column_name'].values[0],
                        'target_sheet_name': lv_sheet_name,
                        'target_column': row[0]
                    }
                    # Adding to the DataFrame
                    lv_row_mapping_df = pd.concat([lv_row_mapping_df, pd.DataFrame([lv_new_row])], ignore_index=True)
                except Exception:
                    # Skip rows that fail to encode or search
                    continue
        # Saving the mapping to an Excel file
        lv_row_mapping_df.to_excel(lv_mapping_file_name, index=False)
        return lv_row_mapping_df.to_json(orient='records')
    else:
        raise Exception("Source Domain Data Not Found")
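# A minimal end-to-end sketch, assuming hypothetical file names:
# 'source_schema.xlsx' and 'target_schema.xlsx' are placeholders, and each
# sheet in them is expected to hold column names in its first column and
# descriptions/hints in its second column. Build the source-domain vector
# store once, then map a target workbook against it.
if __name__ == "__main__":
    print(fn_create_vector_store('source_schema.xlsx', 'sales'))
    # Writes the mapping to db/sales_mapping.xlsx and returns it as JSON
    lv_mapping_json = fn_map_data('target_schema.xlsx', 'mapping.xlsx', 'sales')
    print(lv_mapping_json)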