Meomap committed on
Commit
9839b62
·
verified ·
1 Parent(s): 89789fd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -45
app.py CHANGED
@@ -1,90 +1,71 @@
1
- import re
2
  from transformers import pipeline, AutoTokenizer
3
  from optimum.onnxruntime import ORTModelForTokenClassification
 
4
  import gradio as gr
5
 
6
- # Define categories and their keywords
7
  CATEGORIES = {
8
  "Need": {
9
- "Utilities": ["dien", "nuoc", "gas", "internet", "dienthoai"],
10
- "Housing": ["nha", "thue", "sua chua", "sua nha"],
11
- "Groceries": ["thuc pham", "sieu thi", "rau cu", "do an"],
12
- "Transportation": ["xang", "xe", "ve xe", "bao duong"],
13
- "Education": ["hoc phi", "sach", "truong", "khoa hoc"],
14
- "Medical": ["bao hiem", "bac si", "thuoc"],
15
- "Insurance": ["bao hiem", "nha", "oto", "suc khoe"],
16
- "Childcare": ["tre em", "truong mam non", "nguoi giup viec"],
17
  },
18
  "Want": {
19
- "Dining Out": ["nha hang", "quan an", "cafe", "tra sua"],
20
- "Entertainment": ["phim", "karaoke", "game", "nhac", "do choi", "bup be"],
21
- "Travel": ["du lich", "ve may bay", "khach san"],
22
- "Fitness": ["gym", "yoga", "the thao"],
23
- "Shopping": ["quan ao", "phu kien", "dien thoai", "luxury"],
24
- "Hobbies": ["so thich", "do choi", "my thuat"],
25
- "Personal Care": ["spa", "toc", "lam dep", "my pham"],
26
  },
27
  "Saving/Investment": {
28
- "Emergency Fund": ["quy du phong"],
29
- "Retirement": ["nghi huu"],
30
- "Investments": ["chung khoan", "bat dong san"],
31
- "Debt Repayment": ["tra no"],
32
- "Education Fund": ["quy hoc tap"],
33
- "Savings for Goals": ["quy tiet kiem"],
34
- "Health Savings": ["bao hiem y te"],
35
- }
36
  }
37
 
38
- # Normalize Vietnamese input (remove accents)
39
  def normalize_vietnamese(text):
40
  return re.sub(
41
  r'[àáạảãâầấậẩẫăằắặẳẵèéẹẻẽêềếệểễìíịỉĩòóọỏõôồốộổỗơờớợởỡùúụủũưừứựửữỳýỵỷỹđ]', '', text
42
  ).replace("đ", "d")
43
 
44
- # Load and quantize the model
45
  model_name = "distilbert-base-multilingual-cased"
46
  tokenizer = AutoTokenizer.from_pretrained(model_name)
47
- quantized_model = ORTModelForTokenClassification.from_pretrained(model_name, from_transformers=True)
48
 
49
- # Create the NER pipeline with the quantized model
50
  ner_model = pipeline("ner", model=quantized_model, tokenizer=tokenizer, aggregation_strategy="simple")
51
 
52
  # Classify input
53
  def classify_and_extract(user_input):
54
  normalized_input = normalize_vietnamese(user_input.lower())
 
 
55
 
56
- # Extract amount using regex
57
- amount_match = re.search(r"(\d+(\.\d{1,2})?)", normalized_input)
58
- amount = amount_match.group(0) if amount_match else "Unknown"
59
-
60
- # Rule-based matching for categories
61
- for main_category, subcategories in CATEGORIES.items():
62
- for subcategory, keywords in subcategories.items():
63
  if any(keyword in normalized_input for keyword in keywords):
64
  return {
65
- "Main Category": main_category,
66
- "Sub Category": subcategory,
67
  "Amount": amount,
68
- "Entities": [] # Skip NER if matched via rules
69
  }
70
 
71
- # Fallback to NER model for unmatched cases
72
  ner_results = ner_model(user_input)
73
  return {
74
  "Main Category": "Uncategorized",
75
  "Sub Category": "Unknown",
76
  "Amount": amount,
77
- "Entities": ner_results,
78
  }
79
 
80
- # Gradio interface
81
  def process_user_input(user_input):
82
  result = classify_and_extract(user_input)
83
  return (
84
  f"Main Category: {result['Main Category']}\n"
85
  f"Sub Category: {result['Sub Category']}\n"
86
  f"Amount: {result['Amount']}\n"
87
- f"Entities: {result['Entities']}"
88
  )
89
 
90
  iface = gr.Interface(
@@ -92,7 +73,7 @@ iface = gr.Interface(
92
  inputs="text",
93
  outputs="text",
94
  title="Expenditure Classifier",
95
- description="Classify expenditures into main and subcategories (Need, Want, Saving/Investment) and extract amounts."
96
  )
97
 
98
- iface.launch()
 
 
1
  from transformers import pipeline, AutoTokenizer
2
  from optimum.onnxruntime import ORTModelForTokenClassification
3
+ import re
4
  import gradio as gr
5
 
6
+ # Define categories and keywords
7
  CATEGORIES = {
8
  "Need": {
9
+ "Housing": ["nha", "thue", "sua nha"],
10
+ "Groceries": ["thuc pham", "rau cu", "sieu thi"],
 
 
 
 
 
 
11
  },
12
  "Want": {
13
+ "Entertainment": ["phim", "karaoke", "game", "do choi"],
14
+ "Dining Out": ["cafe", "nha hang", "tra sua"],
 
 
 
 
 
15
  },
16
  "Saving/Investment": {
17
+ "Savings": ["quy tiet kiem", "dau tu", "tai san"],
18
+ },
 
 
 
 
 
 
19
  }
20
 
21
+ # Normalize Vietnamese text
22
  def normalize_vietnamese(text):
23
  return re.sub(
24
  r'[àáạảãâầấậẩẫăằắặẳẵèéẹẻẽêềếệểễìíịỉĩòóọỏõôồốộổỗơờớợởỡùúụủũưừứựửữỳýỵỷỹđ]', '', text
25
  ).replace("đ", "d")
26
 
27
+ # Load tokenizer and quantized model
28
  model_name = "distilbert-base-multilingual-cased"
29
  tokenizer = AutoTokenizer.from_pretrained(model_name)
30
+ quantized_model = ORTModelForTokenClassification.from_pretrained(model_name)
31
 
32
+ # Create NER pipeline
33
  ner_model = pipeline("ner", model=quantized_model, tokenizer=tokenizer, aggregation_strategy="simple")
34
 
35
  # Classify input
36
  def classify_and_extract(user_input):
37
  normalized_input = normalize_vietnamese(user_input.lower())
38
+ amount = re.search(r"\d+", normalized_input)
39
+ amount = amount.group(0) if amount else "Unknown"
40
 
41
+ # Rule-based matching
42
+ for main_cat, subcategories in CATEGORIES.items():
43
+ for sub_cat, keywords in subcategories.items():
 
 
 
 
44
  if any(keyword in normalized_input for keyword in keywords):
45
  return {
46
+ "Main Category": main_cat,
47
+ "Sub Category": sub_cat,
48
  "Amount": amount,
49
+ "NER Entities": [],
50
  }
51
 
52
+ # Fallback to NER model
53
  ner_results = ner_model(user_input)
54
  return {
55
  "Main Category": "Uncategorized",
56
  "Sub Category": "Unknown",
57
  "Amount": amount,
58
+ "NER Entities": ner_results,
59
  }
60
 
61
+ # Gradio app
62
  def process_user_input(user_input):
63
  result = classify_and_extract(user_input)
64
  return (
65
  f"Main Category: {result['Main Category']}\n"
66
  f"Sub Category: {result['Sub Category']}\n"
67
  f"Amount: {result['Amount']}\n"
68
+ f"Entities: {result['NER Entities']}"
69
  )
70
 
71
  iface = gr.Interface(
 
73
  inputs="text",
74
  outputs="text",
75
  title="Expenditure Classifier",
76
+ description="Classify and categorize spending."
77
  )
78
 
79
+ iface.launch(share=True)