import numpy as np
from sentence_transformers import SentenceTransformer, util
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import re

nltk.download('stopwords')
nltk.download('punkt')

STOP_WORDS = list(stopwords.words('english'))
BERTOPIC_REPRESENTATIONS = [
    "KeyBERTInspired",
    "MaximalMarginalRelevance",
]

TRANSFORMERS = ["all-mpnet-base-v2", "multi-qa-mpnet-base-dot-v1"]

TRANSFORMERS_INFO = [
    "all-mpnet-base-v2: All-round model tuned for many use cases. "
    "Trained on a large and diverse dataset of over 1 billion training pairs.",
    "multi-qa-mpnet-base-dot-v1: This model was tuned for semantic search: given a query/question, "
    "it can find relevant passages. "
    "It was trained on a large and diverse set of (question, answer) pairs.",
]

def get_bertopic_representation(representation: str):
    """
    maps a representation name to the corresponding BERTopic representation model
    :param representation: one of BERTOPIC_REPRESENTATIONS
    :return: the representation model instance, or None if the name is unknown
    """
    if representation == BERTOPIC_REPRESENTATIONS[0]:
        return KeyBERTInspired()
    elif representation == BERTOPIC_REPRESENTATIONS[1]:
        return MaximalMarginalRelevance()
    else:
        return None
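
# A minimal usage sketch (kept in comments so the module stays import-safe) of
# plugging the selected representation into BERTopic; `docs` is a hypothetical
# list of input documents.
#
#   from bertopic import BERTopic
#
#   representation = get_bertopic_representation("KeyBERTInspired")
#   topic_model = BERTopic(representation_model=representation)
#   topics, probs = topic_model.fit_transform(docs)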

def tokenize_explode(df, col):
    """
    tokenizes the given column and explodes the dataframe to one token per row
    :param df:
    :param col: name of the text column to tokenize
    :return: exploded dataframe with a stripped, lowercased 'tokenized' column
    """
    df['tokenized'] = df[col].apply(word_tokenize)
    df = df.explode('tokenized')
    df['tokenized'] = df['tokenized'].str.strip().str.lower()
    return df

def cleanup_tokens(df, col):
    """
    drops short tokens, pure numbers, punctuation and stopwords
    :param df:
    :param col: name of the token column to clean
    :return: filtered dataframe
    """
    df = df[df[col].apply(lambda x: len(x) > 2)]
    # non-capturing group avoids the pandas "match groups" UserWarning
    df = df[~df[col].str.contains(r'^(?:\d+\.?\d*)$', regex=True)]
    df = df[~df[col].isin(list(string.punctuation))]
    df = df[~df[col].isin(STOP_WORDS)]
    return df
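
# A minimal sketch of the tokenize-then-clean flow, assuming pandas is
# available; the 'text' column name is hypothetical.
#
#   import pandas as pd
#
#   df = pd.DataFrame({'text': ["The quick brown fox, 42 times!"]})
#   df = tokenize_explode(df, 'text')
#   df = cleanup_tokens(df, 'tokenized')
#   # remaining tokens: 'quick', 'brown', 'fox', 'times'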

def get_embedding_model(transformer=TRANSFORMERS[0]) -> SentenceTransformer:
    """
    loads the given sentence transformer model
    :param transformer: model name, one of TRANSFORMERS
    :return: the loaded SentenceTransformer model
    """
    sentence_model = SentenceTransformer(transformer)
    return sentence_model

def str_to_vector_list(text_list, sentence_model, replace_dict=None):
    """
    embeds the given text list using the provided embedding model
    :param text_list:
    :param sentence_model:
    :param replace_dict: mapping of substrings to replace in each string before encoding
    :return: list of embedding vectors
    """
    # strip punctuation; str.replace treats the pattern literally, so use re.sub
    text_list = [re.sub(r'[^\w\s]', '', str(x)) for x in text_list]
    if replace_dict:
        for stp in replace_dict:
            text_list = [str(x).replace(stp, replace_dict[stp]) for x in text_list]
    embeddings = sentence_model.encode(text_list, show_progress_bar=True, batch_size=1000)
    return embeddings.tolist()

def remove_unnecessary_tokens_from_df(df, columns, extra_stopwords=None) -> None:
    """
    removes punctuation and extra stopwords from the given columns, modifying the dataframe in place
    :param df:
    :param columns:
    :param extra_stopwords: additional substrings to blank out
    :return:
    """
    # regex=True is required: recent pandas treats str.replace patterns literally by default
    df[columns] = df[columns].apply(lambda x: x.str.replace(r'[^\w\s]', '', regex=True))
    if extra_stopwords:
        for stp in extra_stopwords:
            df[columns] = df[columns].apply(lambda x: x.str.replace(stp, ' ', regex=False))
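
# A minimal sketch of the in-place cleanup, assuming pandas is available;
# the 'title' column and the 'DRAFT' stopword are hypothetical.
#
#   import pandas as pd
#
#   df = pd.DataFrame({'title': ["Hello, world! [DRAFT]"]})
#   remove_unnecessary_tokens_from_df(df, ['title'], extra_stopwords=['DRAFT'])
#   # punctuation is stripped and 'DRAFT' is blanked out in place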

def cosine_sim_matrix(embeddings_a, embeddings_b) -> np.ndarray:
    """
    computes the cosine similarity matrix for the given embeddings
    :param embeddings_a:
    :param embeddings_b:
    :return: numpy array holding the pairwise cosine similarities
    """
    return np.array(
        util.pytorch_cos_sim(embeddings_a, embeddings_b)
    )
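
# A minimal end-to-end sketch: embed two hypothetical string lists and compare
# them. The model is downloaded on first use.
#
#   model = get_embedding_model(TRANSFORMERS[0])
#   queries = str_to_vector_list(["how do i reset my password"], model)
#   passages = str_to_vector_list(["to reset your password open settings"], model)
#   sims = cosine_sim_matrix(queries, passages)
#   print(sims.shape)  # (1, 1); sims[0][0] is the cosine similarity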