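"""
Utilities for building a Faiss vector store of column-description embeddings
from an Excel file and for mapping the columns of a target file to their
closest matches in a stored source domain.
"""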
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import os

# Function to create a vector store
def fn_create_vector_store(lv_file_name, lv_domain):
    """
    Create a vector store by encoding and storing embeddings of column descriptions from an Excel file.

    Each sheet is expected to hold column names in its first column and
    column descriptions/hints in its second column.

    Args:
        lv_file_name (str): The path to the Excel file.
        lv_domain (str): The domain name used to name the output files.

    Returns:
        str: A message indicating the status of the operation.
    """
    # Output file names (the 'db' directory is created if it does not exist)
    os.makedirs('db', exist_ok=True)
    lv_faiss_file_name = os.path.join('db', lv_domain + '_index.faiss')
    lv_rowdata_file_name = os.path.join('db', lv_domain + '_row_mapping.parquet')

    # Skip the build if both output files already exist
    if os.path.exists(lv_faiss_file_name) and os.path.exists(lv_rowdata_file_name):
        return "Data Already Exists"
    else:
        try:
            # Load the pre-trained model
            lv_model = SentenceTransformer('all-MiniLM-L6-v2')

            # Load every sheet of the Excel workbook into DataFrames
            lv_excel_data = pd.read_excel(lv_file_name, sheet_name=None)
            
            # Lists to collect per-sheet embeddings and the index-to-row mapping
            lv_embeddings_list = []
            lv_row_mapping = []

            # Reading each sheet
            for lv_sheet_name, lv_sheet_data in lv_excel_data.items():
                # Creating embeddings from the description column
                # Model details -> https://www.sbert.net/docs/pretrained_models.html
                # Replace non-breaking spaces so descriptions encode cleanly
                lv_sheet_data.iloc[:, 1] = lv_sheet_data.iloc[:, 1].astype(str).str.replace('\xa0', ' ')
                lv_column_descriptions = lv_sheet_data.iloc[:, 1].tolist()

                lv_embeddings = lv_model.encode(lv_column_descriptions).astype('float32')
                lv_embeddings_list.append(lv_embeddings)

                # Record sheet name, column name, and description for each row
                for row in lv_sheet_data.itertuples(index=False):
                    lv_row_mapping.append({
                        'sheet_name': lv_sheet_name,
                        'column_name': row[0],
                        'column_description': row[1]
                    })
            
            # Combine all embeddings into one array
            lv_merged_embeddings_list = np.vstack(lv_embeddings_list)

            # Create a Faiss index (IndexFlatL2 performs exact L2-distance search)
            lv_dimension = lv_merged_embeddings_list.shape[1]
            lv_index = faiss.IndexFlatL2(lv_dimension)
            lv_index.add(lv_merged_embeddings_list)

            # Saving the Faiss index to a file
            faiss.write_index(lv_index, lv_faiss_file_name)

            # Saving the row mapping to a Parquet file
            lv_row_mapping_df = pd.DataFrame(lv_row_mapping)
            lv_row_mapping_df.to_parquet(lv_rowdata_file_name, index=False)

            return "Record Added Successfully"
        except Exception:
            # Re-raise with the original traceback intact
            raise


def fn_map_data(lv_saved_file_name, lv_file_name, lv_source_domain):
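    """
    Map each column of a target Excel file to the closest source-domain column.

    For every row of the target file, the description in the second column is
    embedded and searched against the source domain's Faiss index; the nearest
    source column is recorded. The mapping is saved as an Excel file under 'db/'.

    Args:
        lv_saved_file_name (str): Path to the target Excel file on disk.
        lv_file_name (str): Original file name, used to name the mapping output.
        lv_source_domain (str): Source domain whose vector store is searched.

    Returns:
        str: The mapping as a JSON string of records.

    Raises:
        Exception: If the source domain's vector store has not been created.
    """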

    # File names for the source domain's vector store and the mapping output
    lv_faiss_file_name = os.path.join('db', lv_source_domain + '_index.faiss')
    lv_sourcedata_file_name = os.path.join('db', lv_source_domain + '_row_mapping.parquet')
    lv_mapping_file_name = os.path.join('db', lv_source_domain + '_' + lv_file_name)

    # Proceed only if the source domain's vector store exists
    if os.path.exists(lv_faiss_file_name) and os.path.exists(lv_sourcedata_file_name):
        # Load the pre-trained model
        lv_model = SentenceTransformer('all-MiniLM-L6-v2')

        # Load the Faiss index
        lv_index = faiss.read_index(lv_faiss_file_name)

        # Load the row mapping (row positions align with the Faiss index)
        lv_source_mapping_df = pd.read_parquet(lv_sourcedata_file_name)

        # Load every sheet of the target Excel workbook
        lv_excel_data = pd.read_excel(lv_saved_file_name, sheet_name=None)

        # Collected mapping rows (converted to a DataFrame after the loop)
        lv_row_mapping_rows = []

        # Reading each sheet
        for lv_sheet_name, lv_sheet_data in lv_excel_data.items():

            # Processing each row of the sheet
            for row in lv_sheet_data.itertuples(index=False):
                try:
                    # Embed the description of the target column
                    # Model details -> https://www.sbert.net/docs/pretrained_models.html
                    lv_query = str(row[1])
                    lv_query_embedding = lv_model.encode([lv_query])

                    # Search for the single most similar source vector
                    lv_distances, lv_indices = lv_index.search(np.asarray(lv_query_embedding, dtype='float32'), 1)

                    # Nearest source row
                    lv_row = lv_source_mapping_df.iloc[int(lv_indices[0][0])]

                    lv_new_row = {
                        'source_sheet_name': lv_row['sheet_name'],
                        'source_column': lv_row['column_name'],
                        'target_sheet_name': lv_sheet_name,
                        'target_column': row[0]
                    }

                    # Collecting the mapped row
                    lv_row_mapping_rows.append(lv_new_row)
                except Exception:
                    # Skip rows that cannot be embedded or matched (e.g. empty descriptions)
                    continue

        # Build the mapping DataFrame and save it to an Excel file
        lv_row_mapping_df = pd.DataFrame(
            lv_row_mapping_rows,
            columns=['source_sheet_name', 'source_column', 'target_sheet_name', 'target_column']
        )
        lv_row_mapping_df.to_excel(lv_mapping_file_name, index=False)

        return lv_row_mapping_df.to_json(orient='records')
    else:
        raise Exception("Source Domain Data Not Found")
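

# A minimal usage sketch, not part of the module's original API: the file and
# domain names below are hypothetical placeholders; substitute real ones.
if __name__ == '__main__':
    # Build the vector store for the source domain once
    print(fn_create_vector_store('source_columns.xlsx', 'sales'))

    # Map a target file's columns against the stored source domain;
    # the mapping is also written to db/sales_target_columns.xlsx
    print(fn_map_data('uploads/target_columns.xlsx', 'target_columns.xlsx', 'sales'))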