"""Gradio app that classifies a free-text spending note into a category.

A note is first matched against the keyword table in ``CATEGORIES``; when no
keyword matches, the text is handed to a token-classification pipeline and
the raw entities are returned instead.
"""

import re
import unicodedata

import gradio as gr
from optimum.onnxruntime import ORTModelForTokenClassification
from transformers import pipeline, AutoTokenizer

# Define categories and keywords.
# Keywords are written in lowercase ASCII (no diacritics); input text is
# lowercased and passed through normalize_vietnamese() before matching.
CATEGORIES = {
    "Need": {
        "Housing": ["nha", "thue", "sua nha"],
        "Groceries": ["thuc pham", "rau cu", "sieu thi"],
    },
    "Want": {
        "Entertainment": ["phim", "karaoke", "game", "do choi"],
        "Dining Out": ["cafe", "nha hang", "tra sua"],
    },
    "Saving/Investment": {
        "Savings": ["quy tiet kiem", "dau tu", "tai san"],
    },
}


# Normalize Vietnamese text
def normalize_vietnamese(text):
    """Return *text* with Vietnamese diacritics folded to their base letters.

    Accented letters are mapped to the unaccented letter (``"nhà"`` ->
    ``"nha"``) rather than being dropped, and ``đ``/``Đ`` become ``d``/``D``,
    so the result can be compared against the ASCII keywords in
    ``CATEGORIES``.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    # NFD splits each accented letter into base letter + combining marks,
    # which lets us drop only the marks and keep the letter itself.
    decomposed = unicodedata.normalize("NFD", text)
    without_marks = "".join(
        ch for ch in decomposed if not unicodedata.combining(ch)
    )
    # "đ" is a distinct letter, not "d" plus a combining mark, so NFD leaves
    # it untouched and it has to be mapped explicitly.
    folded = without_marks.replace("đ", "d").replace("Đ", "D")
    # Recompose anything NFD split apart that was not a stripped diacritic.
    return unicodedata.normalize("NFC", folded)


# Load tokenizer and quantized model
# NOTE(review): this is a base checkpoint name; whether it ships ONNX weights
# and a trained token-classification head is not visible from this file --
# confirm the NER fallback produces meaningful labels.
model_name = "distilbert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
quantized_model = ORTModelForTokenClassification.from_pretrained(model_name)

# Create NER pipeline
ner_model = pipeline(
    "ner",
    model=quantized_model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)


# Classify input
def classify_and_extract(user_input):
    """Classify a spending note and extract the first number as the amount.

    Args:
        user_input: Free-text note. ``None`` or a blank string is treated as
            empty input.

    Returns:
        A dict with the keys ``"Main Category"``, ``"Sub Category"``,
        ``"Amount"`` (the first run of digits as a string, or ``"Unknown"``)
        and ``"NER Entities"`` (empty when a keyword matched or the input was
        blank; otherwise the output of the NER pipeline).
    """
    text = user_input or ""
    normalized_input = normalize_vietnamese(text.lower())

    amount_match = re.search(r"\d+", normalized_input)
    amount = amount_match.group(0) if amount_match else "Unknown"

    # Rule-based matching.
    # Keep the longest matching keyword so that a specific phrase such as
    # "nha hang" is not shadowed by a shorter keyword it contains ("nha").
    # The strict ">" keeps the first match in table order on equal lengths.
    best_match = None  # (keyword length, main category, sub category)
    for main_cat, subcategories in CATEGORIES.items():
        for sub_cat, keywords in subcategories.items():
            for keyword in keywords:
                if keyword not in normalized_input:
                    continue
                if best_match is None or len(keyword) > best_match[0]:
                    best_match = (len(keyword), main_cat, sub_cat)

    if best_match is not None:
        _, main_cat, sub_cat = best_match
        return {
            "Main Category": main_cat,
            "Sub Category": sub_cat,
            "Amount": amount,
            "NER Entities": [],
        }

    # Fallback to NER model (skipped for blank input: nothing to tag).
    ner_results = ner_model(text) if text.strip() else []
    return {
        "Main Category": "Uncategorized",
        "Sub Category": "Unknown",
        "Amount": amount,
        "NER Entities": ner_results,
    }


# Gradio app
def process_user_input(user_input):
    """Format the classification of *user_input* as a multi-line string.

    Args:
        user_input: Free-text note entered in the UI.

    Returns:
        Four lines of text: main category, sub category, amount and entities.
    """
    result = classify_and_extract(user_input)
    return (
        f"Main Category: {result['Main Category']}\n"
        f"Sub Category: {result['Sub Category']}\n"
        f"Amount: {result['Amount']}\n"
        f"Entities: {result['NER Entities']}"
    )


iface = gr.Interface(
    fn=process_user_input,
    inputs="text",
    outputs="text",
    title="Expenditure Classifier",
    description="Classify and categorize spending.",
)

iface.launch(share=True)