Spaces:

MilindChawre
/

erav3-s11-hindi-tokenizer

Sleeping

App Files Files Community

erav3-s11-hindi-tokenizer / app.py

MilindChawre

Adding hindi BPE tokenizer

da971a5 about 2 months ago

raw

history blame contribute delete

6.12 kB

	import streamlit as st
	from pathlib import Path
	from hindi_tokenizer import load_tokenizer, encode_text, decode_text

	def load_hindi_tokenizer():
	    """Return the trained Hindi BPE tokenizer stored under ``output/``.

	    When ``output/hindi_encoder.json`` is absent, an error is shown through
	    Streamlit and ``st.stop()`` is called before the load is attempted.
	    """
	    encoder_file = Path("output", "hindi_encoder.json")
	    config_missing = not encoder_file.exists()

	    if config_missing:
	        st.error("Error: Tokenizer configuration not found! Please train the tokenizer first.")
	        st.stop()

	    # load_tokenizer is handed the path as a plain string.
	    return load_tokenizer(str(encoder_file))

	def _parse_token_ids(raw_ids):
	    """Convert a comma-separated string of token IDs into a list of ints.

	    Empty segments, such as the one left by a trailing comma, are ignored so
	    that pasted ID lists like ``"12, 7, "`` still parse.

	    Args:
	        raw_ids: Text containing integer token IDs separated by commas.

	    Returns:
	        The parsed token IDs, in input order.

	    Raises:
	        ValueError: If a non-empty segment is not an integer, or if the
	            text contains no token IDs at all.
	    """
	    segments = (segment.strip() for segment in raw_ids.split(","))
	    token_ids = [int(segment) for segment in segments if segment]
	    if not token_ids:
	        raise ValueError("no token IDs found")
	    return token_ids


	def _apply_custom_css():
	    """Inject the app's custom CSS (dark theme, green buttons and inputs)."""
	    st.markdown(
	        """
	        <style>
	        .stApp {
	        background-color: #2E2E2E; /* Dark background */
	        color: #FFFFFF; /* White text for better contrast */
	        }
	        .stButton {
	        background-color: #4CAF50; /* Green button */
	        color: white;
	        border-radius: 8px; /* Rounded corners */
	        padding: 10px 20px; /* Padding for buttons */
	        font-size: 16px; /* Larger font size */
	        }
	        .stButton:hover {
	        background-color: #45a049; /* Darker green on hover */
	        }
	        .stTextInput, .stTextArea {
	        background-color: #ffffff; /* White input fields */
	        border: 2px solid #4CAF50; /* Green border */
	        border-radius: 8px; /* Rounded corners */
	        padding: 10px; /* Padding for input fields */
	        font-size: 16px; /* Larger font size */
	        }
	        .stHeader {
	        color: #FFD700; /* Gold color for header text */
	        font-size: 28px; /* Larger header font size */
	        font-weight: bold; /* Bold header */
	        }
	        .stMarkdown {
	        color: #FFFFFF; /* White markdown text */
	        font-size: 16px; /* Larger markdown font size */
	        }
	        .stTextInput:focus, .stTextArea:focus {
	        border-color: #45a049; /* Change border color on focus */
	        box-shadow: 0 0 5px rgba(76, 175, 80, 0.5); /* Add shadow on focus */
	        }
	        </style>
	        """,
	        unsafe_allow_html=True
	    )


	def _render_encode_section(tokenizer):
	    """Render the text-to-token-IDs panel and handle the Encode button."""
	    st.header("Encode Hindi Text")
	    st.markdown("Convert Hindi text into token IDs")

	    input_text = st.text_area(
	        "Enter Hindi Text",
	        placeholder="यहाँ हिंदी टेक्स्ट लिखें...",
	        height=150,
	        key="encode_input"
	    )

	    if not st.button("Encode", key="encode_button"):
	        return
	    if not input_text.strip():
	        st.warning("Please enter some text to encode")
	        return

	    try:
	        token_ids, tokens = encode_text(tokenizer, input_text)

	        st.subheader("Results:")
	        st.markdown("Tokens:")
	        st.write(tokens)

	        st.markdown("Token IDs:")
	        st.write(token_ids)

	        # Display as comma-separated string for easy copying
	        st.markdown("Token IDs (comma-separated):")
	        st.code(", ".join(map(str, token_ids)))
	    except Exception as exc:
	        st.error(f"Error during encoding: {exc}")


	def _render_decode_section(tokenizer):
	    """Render the token-IDs-to-text panel and handle the Decode button."""
	    st.header("Decode Token IDs")
	    st.markdown("Convert token IDs back to Hindi text")

	    input_ids = st.text_area(
	        "Enter Token IDs (comma-separated)",
	        placeholder="2197, 1024, 402, 7, 924...",
	        height=150,
	        key="decode_input"
	    )

	    if not st.button("Decode", key="decode_button"):
	        return
	    if not input_ids.strip():
	        st.warning("Please enter token IDs to decode")
	        return

	    # Parse in its own try block so that only malformed input produces the
	    # "invalid format" message; a ValueError raised while decoding is
	    # reported as a decoding error instead.
	    try:
	        token_ids = _parse_token_ids(input_ids)
	    except ValueError:
	        st.error("Invalid input format. Please enter comma-separated numbers.")
	        return

	    try:
	        decoded_text = decode_text(tokenizer, token_ids)

	        st.subheader("Results:")
	        st.markdown("Decoded Text:")
	        st.write(decoded_text)

	        # Display in a box for better visibility
	        st.text_area(
	            "Decoded Text (copyable)",
	            value=decoded_text,
	            height=100,
	            key="decoded_output"
	        )
	    except Exception as exc:
	        st.error(f"Error during decoding: {exc}")


	def _render_about_section():
	    """Render the static "About the Tokenizer" information block."""
	    st.markdown("---")
	    st.markdown("### About the Tokenizer")

	    info_col1, info_col2 = st.columns(2)

	    with info_col1:
	        st.markdown("""
	        Tokenizer Details:
	        - Type: Byte Pair Encoding (BPE)
	        - Vocabulary Size: 5000 tokens
	        - Special Tokens: `<pad>`, `<unk>`, `<s>`, `</s>`
	        - Minimum Token Frequency: 2
	        """)

	    with info_col2:
	        st.markdown("""
	        Preprocessing:
	        - Retains Hindi Unicode (\\u0900-\\u097F)
	        - Removes digits and special characters
	        - Normalizes punctuation
	        - Cleans whitespace
	        """)


	def main():
	    """Build the Streamlit page for encoding and decoding Hindi text.

	    Configures the page, applies the custom CSS, loads the tokenizer and
	    renders the encode panel, the decode panel and the information section,
	    in that order. If loading the tokenizer raises, the error is displayed
	    and ``st.stop()`` is called.
	    """
	    st.set_page_config(
	        page_title="Hindi BPE Tokenizer",
	        page_icon="🇮🇳",
	        layout="wide",
	        initial_sidebar_state="expanded"
	    )

	    _apply_custom_css()

	    st.title("Hindi BPE Tokenizer")
	    st.markdown("A web interface for encoding and decoding Hindi text using BPE tokenization")

	    # Load tokenizer
	    try:
	        tokenizer = load_hindi_tokenizer()
	    except Exception as exc:
	        st.error(f"Error loading tokenizer: {exc}")
	        st.stop()

	    # Encoding and decoding panels sit side by side.
	    encode_col, decode_col = st.columns(2)

	    with encode_col:
	        _render_encode_section(tokenizer)

	    with decode_col:
	        _render_decode_section(tokenizer)

	    # Add information section at the bottom
	    _render_about_section()

	# Script entry point: build the page only when this file is executed as the
	# main script, not when it is imported as a module.
	if __name__ == "__main__":
	    main()