import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import os
# Function to create a vector store
def fn_create_vector_store(lv_file_name, lv_domain):
    """
    Create a vector store by encoding and storing embeddings of column descriptions from an Excel file.

    Args:
        lv_file_name (str): The path to the Excel file.
        lv_domain (str): The domain name used to name the index files.

    Returns:
        str: A message indicating the status of the operation.
    """
    # File names for the FAISS index and the row-mapping parquet file
    lv_faiss_file_name = 'db/' + lv_domain + '_index.faiss'
    lv_rowdata_file_name = 'db/' + lv_domain + '_row_mapping.parquet'
    # Skip the build if both files already exist
    if os.path.exists(lv_faiss_file_name) and os.path.exists(lv_rowdata_file_name):
        return "Data Already Exists"
    # Ensure the output directory exists
    os.makedirs('db', exist_ok=True)
    # Load the pre-trained sentence-embedding model
    lv_model = SentenceTransformer('all-MiniLM-L6-v2')
    # Read every sheet of the Excel workbook into a dict of DataFrames
    lv_excel_data = pd.read_excel(lv_file_name, sheet_name=None)
    # Collect embeddings and the index-to-row mapping across all sheets
    lv_embeddings_list = []
    lv_row_mapping = []
    for lv_sheet_name, lv_sheet_data in lv_excel_data.items():
        # Create embeddings from the description column (second column)
        # Model details -> https://www.sbert.net/docs/pretrained_models.html
        lv_sheet_data.iloc[:, 1] = lv_sheet_data.iloc[:, 1].apply(lambda x: str(x).replace(u'\xa0', u' '))
        lv_column_descriptions = lv_sheet_data.iloc[:, 1].astype(str).tolist()
        lv_embeddings = lv_model.encode(lv_column_descriptions).astype('float32')
        lv_embeddings_list.append(lv_embeddings)
        # Record sheet name, column name, and description/hint for each row
        for row in lv_sheet_data.itertuples(index=False):
            lv_row_mapping.append({
                'sheet_name': lv_sheet_name,
                'column_name': row[0],
                'column_description': row[1]
            })
    # Combine all embeddings into one array
    lv_merged_embeddings = np.vstack(lv_embeddings_list)
    # Create a FAISS index over L2 distance and add the embeddings
    lv_dimension = lv_merged_embeddings.shape[1]
    lv_index = faiss.IndexFlatL2(lv_dimension)
    lv_index.add(lv_merged_embeddings)
    # Save the FAISS index to a file
    faiss.write_index(lv_index, lv_faiss_file_name)
    # Save the row mapping to a parquet file
    lv_row_mapping_df = pd.DataFrame(lv_row_mapping)
    lv_row_mapping_df.to_parquet(lv_rowdata_file_name, index=False)
    return "Record Added Successfully"
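
# Usage sketch for fn_create_vector_store (hypothetical path and domain, not
# part of the app). It assumes a workbook whose first column holds column
# names and whose second column holds their descriptions/hints:
#
#   fn_create_vector_store('source_columns.xlsx', 'demo_domain')
#
# This writes db/demo_domain_index.faiss and db/demo_domain_row_mapping.parquet.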
def fn_map_data(lv_saved_file_name, lv_file_name, lv_source_domain):
    """
    Map each column of an uploaded Excel file to the most similar source-domain
    column via nearest-neighbour search over the stored embeddings.

    Args:
        lv_saved_file_name (str): The path to the uploaded Excel file.
        lv_file_name (str): The file name used for the generated mapping file.
        lv_source_domain (str): The source domain whose index is searched.

    Returns:
        str: The mapping as a JSON string of records.
    """
    # File names for the stored index, the source row mapping, and the output
    lv_faiss_file_name = 'db/' + lv_source_domain + '_index.faiss'
    lv_sourcedata_file_name = 'db/' + lv_source_domain + '_row_mapping.parquet'
    lv_mapping_file_name = 'db/' + lv_source_domain + '_' + lv_file_name
    # Both the index and the row mapping must exist for this domain
    if os.path.exists(lv_faiss_file_name) and os.path.exists(lv_sourcedata_file_name):
        # Load the pre-trained sentence-embedding model
        lv_model = SentenceTransformer('all-MiniLM-L6-v2')
        # Load the FAISS index
        lv_index = faiss.read_index(lv_faiss_file_name)
        # Load the source row mapping
        lv_source_mapping_df = pd.read_parquet(lv_sourcedata_file_name)
        # Read every sheet of the uploaded workbook into a dict of DataFrames
        lv_excel_data = pd.read_excel(lv_saved_file_name, sheet_name=None)
        # Collect one mapping record per target column
        lv_mapping_records = []
        for lv_sheet_name, lv_sheet_data in lv_excel_data.items():
            # Process each row (column name + description) of the sheet
            for row in lv_sheet_data.itertuples(index=False):
                try:
                    # Embed the column description
                    # Model details -> https://www.sbert.net/docs/pretrained_models.html
                    lv_query = str(row[1])
                    lv_query_embedding = lv_model.encode([lv_query]).astype('float32')
                    # Search for the single most similar stored vector
                    lv_distances, lv_indices = lv_index.search(lv_query_embedding, 1)
                    # Look up the matched source row
                    lv_row = lv_source_mapping_df.iloc[[lv_indices[0][0]]]
                    lv_mapping_records.append({
                        'source_sheet_name': lv_row['sheet_name'].values[0],
                        'source_column': lv_row['column_name'].values[0],
                        'target_sheet_name': lv_sheet_name,
                        'target_column': row[0]
                    })
                except Exception:
                    # Skip rows that cannot be embedded or matched
                    pass
        # Save the mapping to an Excel file and return it as JSON
        lv_row_mapping_df = pd.DataFrame(
            lv_mapping_records,
            columns=['source_sheet_name', 'source_column', 'target_sheet_name', 'target_column']
        )
        lv_row_mapping_df.to_excel(lv_mapping_file_name, index=False)
        return lv_row_mapping_df.to_json(orient='records')
    else:
        raise Exception("Source Domain Data Not Found")
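
# A minimal end-to-end sketch of how the two functions above fit together.
# The file names and domain below are hypothetical examples, not part of the
# app; both workbooks are assumed to have column names in the first column
# and descriptions/hints in the second.
if __name__ == '__main__':
    # Build (or reuse) the vector store for the source domain
    print(fn_create_vector_store('source_columns.xlsx', 'demo_domain'))
    # Map a target workbook's columns to the closest source columns
    lv_mapping_json = fn_map_data('target_columns.xlsx', 'target_mapping.xlsx', 'demo_domain')
    print(lv_mapping_json)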