Spaces:
Sleeping
Sleeping
File size: 1,876 Bytes
2aa73f0 671b9b5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 |
from transformers import AutoTokenizer, MT5ForConditionalGeneration
from transformers import T5Tokenizer
import streamlit as st
import pandas as pd
from datasets import Dataset
import torch
from datasets import Dataset, DatasetDict
from transformers import Trainer, TrainingArguments
# Load the pretrained mT5 base checkpoint: the tokenizer and the seq2seq model
# that the rest of the script fine-tunes.
tokenizer = T5Tokenizer.from_pretrained('google/mt5-base')
model = MT5ForConditionalGeneration.from_pretrained("google/mt5-base")
#st.write(model)
# Read the proverb corpus; preprocess_function below reads its 'Proverb' and
# 'Meaning' columns.
df = pd.read_csv('proverbs.csv')
# NOTE(review): bare expression with no effect in plain Python; presumably it
# relies on Streamlit "magic" to render the DataFrame in the app -- confirm.
df
# Wrap the DataFrame as a `datasets.Dataset` so it can be mapped and split.
dataset = Dataset.from_pandas(df)
def preprocess_function(examples):
    """Tokenize proverb/meaning pairs into model inputs and labels.

    Args:
        examples: Mapping with a ``'Proverb'`` entry (source text) and a
            ``'Meaning'`` entry (target text). The script calls this through
            ``dataset.map(..., batched=True)``, so each entry is normally a
            list of strings; a single string pair is also accepted.

    Returns:
        The tokenizer output for the proverbs (truncated/padded to 128
        tokens) with an added ``"labels"`` entry holding the target token
        ids, where every padding position is replaced by ``-100``.
    """
    inputs = examples['Proverb']
    targets = examples['Meaning']
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    # `text_target=` is the supported replacement for the deprecated
    # `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=targets, max_length=128, truncation=True, padding="max_length")

    pad_id = tokenizer.pad_token_id

    def _mask_padding(token_ids):
        # -100 is the index the seq2seq loss ignores; without this the model
        # is also trained to predict the padding that fills up to max_length.
        return [-100 if token == pad_id else token for token in token_ids]

    label_ids = labels["input_ids"]
    if isinstance(targets, str):
        # Un-batched call: input_ids is a single flat list of token ids.
        model_inputs["labels"] = _mask_padding(label_ids)
    else:
        model_inputs["labels"] = [_mask_padding(ids) for ids in label_ids]
    return model_inputs
# Tokenize the whole corpus, handing preprocess_function batches of rows.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Hold out 20% of the tokenized rows as a test split.
dataset_split = tokenized_dataset.train_test_split(test_size=0.2)
train_dataset, test_dataset = dataset_split['train'], dataset_split['test']

# Report the resulting split sizes.
print(f"Training dataset size: {len(train_dataset)}")
print(f"Testing dataset size: {len(test_dataset)}")
# Hyperparameters and checkpointing settings for the fine-tuning run.
training_args = TrainingArguments(
    # Checkpoint location and retention.
    output_dir="./results",
    save_steps=500,
    save_total_limit=2,
    # Run evaluation on an epoch schedule.
    evaluation_strategy="epoch",
    # Optimisation settings.
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Per-device batch sizes for training and evaluation.
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
)
# Initialize the Trainer on the train/test split computed above, so that the
# per-epoch evaluation runs on rows the model was not trained on (the full
# tokenized dataset was previously passed for both roles).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)
# Fine-tune the model, then write the model and the tokenizer to the same
# directory.
trainer.train()
model.save_pretrained("./fine-tuned-mt5-marathi-proverbs")
tokenizer.save_pretrained("./fine-tuned-mt5-marathi-proverbs")