import re
import string

import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer, util
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance

nltk.download('stopwords')
nltk.download('punkt')

STOP_WORDS = list(stopwords.words('english'))

BERTOPIC_REPRESENTATIONS = [
    "KeyBERTInspired",
    "MaximalMarginalRelevance",
]

TRANSFORMERS = ["all-mpnet-base-v2", "multi-qa-mpnet-base-dot-v1"]

TRANSFORMERS_INFO = [
    "all-mpnet-base-v2: All-round model tuned for many use-cases. "
    "Trained on a large and diverse dataset of over 1 billion training pairs.",
    "multi-qa-mpnet-base-dot-v1: This model was tuned for semantic search: given a query/question, "
    "it can find relevant passages. "
    "It was trained on a large and diverse set of (question, answer) pairs.",
]


def get_bertopic_representation(representation: str):
    """
    Returns the BERTopic representation model matching the given name.

    :param representation: one of BERTOPIC_REPRESENTATIONS
    :return: a representation instance, or None if the name is not recognized
    """
    if representation == BERTOPIC_REPRESENTATIONS[0]:
        return KeyBERTInspired()
    elif representation == BERTOPIC_REPRESENTATIONS[1]:
        return MaximalMarginalRelevance()
    return None


def tokenize_explode(df, col):
    """
    Tokenizes the given column and explodes the dataframe so that each row
    holds a single stripped, lower-cased token.

    :param df: input dataframe
    :param col: name of the text column to tokenize
    :return: exploded dataframe with a new 'tokenized' column
    """
    df['tokenized'] = df[col].apply(word_tokenize)
    df = df.explode('tokenized')
    df['tokenized'] = df['tokenized'].str.strip().str.lower()
    return df


def cleanup_tokens(df, col):
    """
    Removes short tokens, purely numeric tokens, punctuation, and English
    stopwords from the given token column.

    :param df: input dataframe
    :param col: name of the token column to clean
    :return: filtered dataframe
    """
    # Drop tokens with fewer than three characters.
    df = df[df[col].apply(lambda x: len(x) > 2)]
    # Drop purely numeric tokens (integers or decimals); the non-capturing
    # group avoids the pandas warning about match groups in str.contains.
    df = df[~df[col].str.contains(r'^(?:\d+\.?\d*)$', regex=True)]
    # Drop single punctuation characters.
    df = df[~df[col].isin(list(string.punctuation))]
    # Drop English stopwords.
    df = df[~df[col].isin(STOP_WORDS)]
    return df


def get_embedding_model(transformer=TRANSFORMERS[0]) -> SentenceTransformer:
    """
    Loads the given sentence-transformer model.

    :param transformer: model name, one of TRANSFORMERS
    :return: a SentenceTransformer instance
    """
    return SentenceTransformer(transformer)


def str_to_vector_list(text_list, sentence_model, replace_dict=None):
    """
    Embeds the given text list using the provided embedding model.

    :param text_list: list of strings to embed
    :param sentence_model: a SentenceTransformer instance
    :param replace_dict: optional mapping of substrings to replace before encoding
    :return: list of embedding vectors
    """
    # str.replace performs literal replacement, so strip punctuation with re.sub.
    text_list = [re.sub(r'[^\w\s]', '', str(x)) for x in text_list]
    if replace_dict:
        for stp in replace_dict:
            text_list = [str(x).replace(stp, replace_dict[stp]) for x in text_list]
    embeddings = sentence_model.encode(text_list, show_progress_bar=True, batch_size=1000)
    return embeddings.tolist()


def remove_unnecessary_tokens_from_df(df, columns, extra_stopwords=None) -> None:
    """
    Removes punctuation and any extra stopwords from the given columns
    of the dataframe, modifying it in place.

    :param df: input dataframe
    :param columns: columns to clean
    :param extra_stopwords: optional iterable of substrings to drop
    :return: None
    """
    # regex=True is required: recent pandas versions treat the pattern
    # literally by default.
    df[columns] = df[columns].apply(lambda x: x.str.replace(r'[^\w\s]', '', regex=True))
    if extra_stopwords:
        for stp in extra_stopwords:
            df[columns] = df[columns].apply(lambda x: x.str.replace(stp, ' ', regex=False))


def cosine_sim_matrix(embeddings_a, embeddings_b) -> np.ndarray:
    """
    Computes the cosine similarity matrix for the given embeddings.

    :param embeddings_a: first set of embeddings
    :param embeddings_b: second set of embeddings
    :return: numpy array holding the pairwise cosine similarities
    """
    return np.array(util.pytorch_cos_sim(embeddings_a, embeddings_b))
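

# --- Usage sketch ---
# A minimal, illustrative example of chaining the helpers above; the sample
# dataframe and texts below are made up for demonstration and are not part
# of any real dataset.
if __name__ == "__main__":
    import pandas as pd

    # Token pipeline: tokenize a text column, explode to one token per row,
    # then drop short tokens, numbers, punctuation, and stopwords.
    sample = pd.DataFrame({"text": ["The quick brown fox, 123 times!"]})
    tokens = cleanup_tokens(tokenize_explode(sample, "text"), "tokenized")
    print(tokens["tokenized"].tolist())  # ['quick', 'brown', 'fox', 'times']

    # Embedding pipeline: encode two small text lists with the default
    # all-mpnet-base-v2 model and compare them pairwise.
    model = get_embedding_model(TRANSFORMERS[0])
    queries = ["how do I reset my password"]
    passages = [
        "To reset your password, open your account settings.",
        "Refunds are processed within five business days.",
    ]
    sims = cosine_sim_matrix(
        str_to_vector_list(queries, model),
        str_to_vector_list(passages, model),
    )
    # Row i holds the similarity of queries[i] to every passage; the first
    # passage should score noticeably higher than the second.
    print(sims)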