Spaces:

Meomap
/

Spend

Running

App Files Files Community

Meomap committed 6 days ago

Commit

9839b62

verified ·

1 Parent(s): 89789fd

Update app.py

Browse files

Files changed (1) hide show

app.py +26 -45

app.py CHANGED Viewed

@@ -1,90 +1,71 @@
-import re
 from transformers import pipeline, AutoTokenizer
 from optimum.onnxruntime import ORTModelForTokenClassification
 import gradio as gr
-# Define categories and their keywords
 CATEGORIES = {
     "Need": {
-        "Utilities": ["dien", "nuoc", "gas", "internet", "dienthoai"],
-        "Housing": ["nha", "thue", "sua chua", "sua nha"],
-        "Groceries": ["thuc pham", "sieu thi", "rau cu", "do an"],
-        "Transportation": ["xang", "xe", "ve xe", "bao duong"],
-        "Education": ["hoc phi", "sach", "truong", "khoa hoc"],
-        "Medical": ["bao hiem", "bac si", "thuoc"],
-        "Insurance": ["bao hiem", "nha", "oto", "suc khoe"],
-        "Childcare": ["tre em", "truong mam non", "nguoi giup viec"],
     },
     "Want": {
-        "Dining Out": ["nha hang", "quan an", "cafe", "tra sua"],
-        "Entertainment": ["phim", "karaoke", "game", "nhac", "do choi", "bup be"],
-        "Travel": ["du lich", "ve may bay", "khach san"],
-        "Fitness": ["gym", "yoga", "the thao"],
-        "Shopping": ["quan ao", "phu kien", "dien thoai", "luxury"],
-        "Hobbies": ["so thich", "do choi", "my thuat"],
-        "Personal Care": ["spa", "toc", "lam dep", "my pham"],
     },
     "Saving/Investment": {
-        "Emergency Fund": ["quy du phong"],
-        "Retirement": ["nghi huu"],
-        "Investments": ["chung khoan", "bat dong san"],
-        "Debt Repayment": ["tra no"],
-        "Education Fund": ["quy hoc tap"],
-        "Savings for Goals": ["quy tiet kiem"],
-        "Health Savings": ["bao hiem y te"],
-    }
 }
-# Normalize Vietnamese input (remove accents)
 def normalize_vietnamese(text):
     return re.sub(
         r'[àáạảãâầấậẩẫăằắặẳẵèéẹẻẽêềếệểễìíịỉĩòóọỏõôồốộổỗơờớợởỡùúụủũưừứựửữỳýỵỷỹđ]', '', text
     ).replace("đ", "d")
-# Load and quantize the model
 model_name = "distilbert-base-multilingual-cased"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
-quantized_model = ORTModelForTokenClassification.from_pretrained(model_name, from_transformers=True)
-# Create the NER pipeline with the quantized model
 ner_model = pipeline("ner", model=quantized_model, tokenizer=tokenizer, aggregation_strategy="simple")
 # Classify input
 def classify_and_extract(user_input):
     normalized_input = normalize_vietnamese(user_input.lower())
-    # Extract amount using regex
-    amount_match = re.search(r"(\d+(\.\d{1,2})?)", normalized_input)
-    amount = amount_match.group(0) if amount_match else "Unknown"
-    # Rule-based matching for categories
-    for main_category, subcategories in CATEGORIES.items():
-        for subcategory, keywords in subcategories.items():
             if any(keyword in normalized_input for keyword in keywords):
                 return {
-                    "Main Category": main_category,
-                    "Sub Category": subcategory,
                     "Amount": amount,
-                    "Entities": []  # Skip NER if matched via rules
                 }
-    # Fallback to NER model for unmatched cases
     ner_results = ner_model(user_input)
     return {
         "Main Category": "Uncategorized",
         "Sub Category": "Unknown",
         "Amount": amount,
-        "Entities": ner_results,
     }
-# Gradio interface
 def process_user_input(user_input):
     result = classify_and_extract(user_input)
     return (
         f"Main Category: {result['Main Category']}\n"
         f"Sub Category: {result['Sub Category']}\n"
         f"Amount: {result['Amount']}\n"
-        f"Entities: {result['Entities']}"
     )
 iface = gr.Interface(
@@ -92,7 +73,7 @@ iface = gr.Interface(
     inputs="text",
     outputs="text",
     title="Expenditure Classifier",
-    description="Classify expenditures into main and subcategories (Need, Want, Saving/Investment) and extract amounts."
 )
-iface.launch()

 from transformers import pipeline, AutoTokenizer
 from optimum.onnxruntime import ORTModelForTokenClassification
+import re
 import gradio as gr
+# Define categories and keywords
 CATEGORIES = {
     "Need": {
+        "Housing": ["nha", "thue", "sua nha"],
+        "Groceries": ["thuc pham", "rau cu", "sieu thi"],
     },
     "Want": {
+        "Entertainment": ["phim", "karaoke", "game", "do choi"],
+        "Dining Out": ["cafe", "nha hang", "tra sua"],
     },
     "Saving/Investment": {
+        "Savings": ["quy tiet kiem", "dau tu", "tai san"],
+    },
 }
+# Normalize Vietnamese text
 def normalize_vietnamese(text):
     """Return *text* with Vietnamese diacritics folded to unaccented letters.

     Each accented letter is replaced by its base letter (for example
     "điện" becomes "dien"), so that accented user input can be compared
     against unaccented keyword lists. Characters without diacritics are
     returned unchanged.

     Args:
         text: The string to normalize.

     Returns:
         The string with combining marks removed and "đ"/"Đ" mapped to
         "d"/"D".
     """
     # Imported locally so the function does not depend on the file-level
     # import list.
     import unicodedata

     # "đ"/"Đ" have no canonical decomposition, so NFD leaves them intact;
     # map them explicitly before stripping marks.
     text = text.replace("đ", "d").replace("Đ", "D")

     # NFD splits every accented letter into base letter + combining marks;
     # dropping the marks keeps the base letter instead of deleting it.
     decomposed = unicodedata.normalize("NFD", text)
     return "".join(ch for ch in decomposed if not unicodedata.combining(ch))
+# Load tokenizer and quantized model
 model_name = "distilbert-base-multilingual-cased"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
+quantized_model = ORTModelForTokenClassification.from_pretrained(model_name)
+# Create NER pipeline
 ner_model = pipeline("ner", model=quantized_model, tokenizer=tokenizer, aggregation_strategy="simple")
 # Classify input
 def classify_and_extract(user_input):
     """Classify a free-text spending note and extract its first amount.

     The input is lower-cased and passed through ``normalize_vietnamese``
     before keyword matching. If any keyword from ``CATEGORIES`` occurs in
     the normalized text, the category of the longest matching keyword is
     returned and the NER model is not run. Otherwise the raw input is sent
     to ``ner_model`` and the result is reported as uncategorized.

     Args:
         user_input: The text entered by the user.

     Returns:
         A dict with the keys "Main Category", "Sub Category", "Amount"
         (the matched digits as a string, or "Unknown") and "NER Entities"
         (an empty list when a keyword matched, else the NER output).
     """
     normalized_input = normalize_vietnamese(user_input.lower())

     # Keep "."/"," separated digit groups together so that an amount such
     # as "50.000" is not truncated to "50".
     amount_match = re.search(r"\d+(?:[.,]\d+)*", normalized_input)
     amount = amount_match.group(0) if amount_match else "Unknown"

     # Rule-based matching. Prefer the longest keyword that occurs in the
     # text: with first-match semantics a short keyword such as "nha"
     # shadows the more specific "nha hang". Ties keep the first keyword
     # encountered in CATEGORIES order.
     best_match = None  # (keyword length, main category, sub category)
     for main_cat, subcategories in CATEGORIES.items():
         for sub_cat, keywords in subcategories.items():
             for keyword in keywords:
                 if keyword not in normalized_input:
                     continue
                 if best_match is None or len(keyword) > best_match[0]:
                     best_match = (len(keyword), main_cat, sub_cat)

     if best_match is not None:
         _, main_cat, sub_cat = best_match
         return {
             "Main Category": main_cat,
             "Sub Category": sub_cat,
             "Amount": amount,
             "NER Entities": [],
         }

     # Fallback to NER model; it receives the original, un-normalized text.
     ner_results = ner_model(user_input)
     return {
         "Main Category": "Uncategorized",
         "Sub Category": "Unknown",
         "Amount": amount,
         "NER Entities": ner_results,
     }
+# Gradio app
 def process_user_input(user_input):
     """Render the classification of *user_input* as a four-line text report.

     Calls ``classify_and_extract`` and formats its "Main Category",
     "Sub Category", "Amount" and "NER Entities" values, one per line, with
     the last line labelled "Entities".
     """
     classified = classify_and_extract(user_input)
     report_lines = (
         f"Main Category: {classified['Main Category']}\n",
         f"Sub Category: {classified['Sub Category']}\n",
         f"Amount: {classified['Amount']}\n",
         f"Entities: {classified['NER Entities']}",
     )
     return "".join(report_lines)
 iface = gr.Interface(
     inputs="text",
     outputs="text",
     title="Expenditure Classifier",
+    description="Classify and categorize spending."
 )
+iface.launch(share=True)