Spaces:
Sleeping
Sleeping
Commit
·
b9f1938
1
Parent(s):
1bf5976
add IndoNLU model
Browse files
- app.py +9 -11
- model/indoBERT-indoNLU-Fold-5.pth +3 -0
- src/__pycache__/helper.cpython-310.pyc +0 -0
- src/__pycache__/legalNER.cpython-310.pyc +0 -0
- src/helper.py +1 -0
- src/legalNER.py +13 -8
app.py
CHANGED
@@ -1,9 +1,7 @@
|
|
1 |
from gradio_pdf import PDF
|
2 |
from src.helper import *
|
3 |
import gradio as gr
|
4 |
-
from
|
5 |
-
|
6 |
-
dir_ = Path(__file__).parent
|
7 |
|
8 |
with gr.Blocks() as ner:
|
9 |
gr.Markdown("# Sistem Ekstraksi Informasi Dokumen Putusan Hukum")
|
@@ -24,7 +22,7 @@ with gr.Blocks() as ner:
|
|
24 |
]
|
25 |
gr.Markdown("## Penjelasan Label")
|
26 |
gr.DataFrame(keterangan_label, headers=["Label", "Keterangan"], height=200)
|
27 |
-
gr.Markdown("##
|
28 |
# Input Text
|
29 |
with gr.Row():
|
30 |
with gr.Column(scale=2):
|
@@ -48,7 +46,7 @@ with gr.Blocks() as ner:
|
|
48 |
fn=text_extraction,
|
49 |
)
|
50 |
|
51 |
-
gr.Markdown("##
|
52 |
# Input PDF
|
53 |
with gr.Row():
|
54 |
with gr.Column(scale=2):
|
@@ -63,12 +61,12 @@ with gr.Blocks() as ner:
|
|
63 |
button_pdf.click(fn=pdf_extraction, inputs=[doc, model_pdf], outputs=output_pdf, api_name="pdf")
|
64 |
|
65 |
gr.Examples(
|
66 |
-
["428_pid.b_2021_pn_jkt.brt_20240529091234.pdf",
|
67 |
-
"1558_pid.b_2020_pn_jkt.brt_20240529091451.pdf",
|
68 |
-
"329_pid.b_2023_pn_jkt.brt_20240529090837.pdf",
|
69 |
-
"168_Pid.Sus_2023_PN_Bkl.pdf",
|
70 |
-
"169_Pid.Sus_2023_PN_Bkl.pdf",
|
71 |
-
"167_Pid.Sus_2023_PN_Bkl.pdf"],
|
72 |
inputs=[doc],
|
73 |
outputs=output_pdf,
|
74 |
fn=pdf_extraction,
|
|
|
1 |
from gradio_pdf import PDF
|
2 |
from src.helper import *
|
3 |
import gradio as gr
|
4 |
+
from gradio_pdf import PDF
|
|
|
|
|
5 |
|
6 |
with gr.Blocks() as ner:
|
7 |
gr.Markdown("# Sistem Ekstraksi Informasi Dokumen Putusan Hukum")
|
|
|
22 |
]
|
23 |
gr.Markdown("## Penjelasan Label")
|
24 |
gr.DataFrame(keterangan_label, headers=["Label", "Keterangan"], height=200)
|
25 |
+
gr.Markdown("## Ekstraksi Entitas pada Potongan Kalimat")
|
26 |
# Input Text
|
27 |
with gr.Row():
|
28 |
with gr.Column(scale=2):
|
|
|
46 |
fn=text_extraction,
|
47 |
)
|
48 |
|
49 |
+
gr.Markdown("## Ekstraksi Entitas pada Dokumen Putusan Hukum")
|
50 |
# Input PDF
|
51 |
with gr.Row():
|
52 |
with gr.Column(scale=2):
|
|
|
61 |
button_pdf.click(fn=pdf_extraction, inputs=[doc, model_pdf], outputs=output_pdf, api_name="pdf")
|
62 |
|
63 |
gr.Examples(
|
64 |
+
["data/428_pid.b_2021_pn_jkt.brt_20240529091234.pdf",
|
65 |
+
"data/1558_pid.b_2020_pn_jkt.brt_20240529091451.pdf",
|
66 |
+
"data/329_pid.b_2023_pn_jkt.brt_20240529090837.pdf",
|
67 |
+
"data/168_Pid.Sus_2023_PN_Bkl.pdf",
|
68 |
+
"data/169_Pid.Sus_2023_PN_Bkl.pdf",
|
69 |
+
"data/167_Pid.Sus_2023_PN_Bkl.pdf"],
|
70 |
inputs=[doc],
|
71 |
outputs=output_pdf,
|
72 |
fn=pdf_extraction,
|
model/indoBERT-indoNLU-Fold-5.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:486b48b22c41570e5768fc574ff11eb61ca6b5b9751c06e7884108e52ca90c07
|
3 |
+
size 495564052
|
src/__pycache__/helper.cpython-310.pyc
CHANGED
Binary files a/src/__pycache__/helper.cpython-310.pyc and b/src/__pycache__/helper.cpython-310.pyc differ
|
|
src/__pycache__/legalNER.cpython-310.pyc
CHANGED
Binary files a/src/__pycache__/legalNER.cpython-310.pyc and b/src/__pycache__/legalNER.cpython-310.pyc differ
|
|
src/helper.py
CHANGED
@@ -2,6 +2,7 @@ from transformers import BertTokenizerFast, BertForTokenClassification
|
|
2 |
import gradio as gr
|
3 |
from src.legalNER import *
|
4 |
|
|
|
5 |
ids_to_labels = {0: 'B_ADVO', 1: 'B_ARTV', 2: 'B_CRIA', 3: 'B_DEFN', 4: 'B_JUDG', 5: 'B_JUDP', 6: 'B_PENA', 7: 'B_PROS', 8: 'B_PUNI', 9: 'B_REGI', 10: 'B_TIMV', 11: 'B_VERN', 12: 'I_ADVO', 13: 'I_ARTV', 14: 'I_CRIA', 15: 'I_DEFN', 16: 'I_JUDG', 17: 'I_JUDP', 18: 'I_PENA', 19: 'I_PROS', 20: 'I_PUNI', 21: 'I_REGI', 22: 'I_TIMV', 23: 'I_VERN', 24: 'O'}
|
6 |
indolem = 'indolem/indobert-base-uncased'
|
7 |
indonlu = 'indobenchmark/indobert-base-p2'
|
|
|
2 |
import gradio as gr
|
3 |
from src.legalNER import *
|
4 |
|
5 |
+
|
6 |
ids_to_labels = {0: 'B_ADVO', 1: 'B_ARTV', 2: 'B_CRIA', 3: 'B_DEFN', 4: 'B_JUDG', 5: 'B_JUDP', 6: 'B_PENA', 7: 'B_PROS', 8: 'B_PUNI', 9: 'B_REGI', 10: 'B_TIMV', 11: 'B_VERN', 12: 'I_ADVO', 13: 'I_ARTV', 14: 'I_CRIA', 15: 'I_DEFN', 16: 'I_JUDG', 17: 'I_JUDP', 18: 'I_PENA', 19: 'I_PROS', 20: 'I_PUNI', 21: 'I_REGI', 22: 'I_TIMV', 23: 'I_VERN', 24: 'O'}
|
7 |
indolem = 'indolem/indobert-base-uncased'
|
8 |
indonlu = 'indobenchmark/indobert-base-p2'
|
src/legalNER.py
CHANGED
@@ -1,5 +1,10 @@
|
|
1 |
import gradio as gr
|
2 |
import torch
|
|
|
|
|
|
|
|
|
|
|
3 |
|
4 |
class LegalNER():
|
5 |
def __init__(self, model, tokenizer, ids_to_labels, check_point='IndoBERT (IndoLEM)'):
|
@@ -132,23 +137,23 @@ class LegalNER():
|
|
132 |
result = ''
|
133 |
for i, (label, data) in enumerate(sorted_entitu_result.items()):
|
134 |
if label in ['PENA', 'ARTV']:
|
135 |
-
result += f'{i+1}. {self.label_convert[label]}\t =
|
136 |
elif label in ['PROS']:
|
137 |
if (i+1) >= 10:
|
138 |
-
result += f'{i+1}. {self.label_convert[label]}\t =
|
139 |
else:
|
140 |
-
result += f'{i+1}. {self.label_convert[label]}\t\t =
|
141 |
elif label in ['JUDP', 'CRIA']:
|
142 |
-
result += f'{i+1}. {self.label_convert[label]}\t\t\t =
|
143 |
elif label in ['ADVO']:
|
144 |
-
result += f'{i+1}. {self.label_convert[label]}\t\t\t\t =
|
145 |
elif label in ['REGI']:
|
146 |
if (i+1) >= 10:
|
147 |
-
result += f'{i+1}. {self.label_convert[label]}\t\t\t\t\t =
|
148 |
else:
|
149 |
-
result += f'{i+1}. {self.label_convert[label]}\t\t\t\t\t\t =
|
150 |
else:
|
151 |
-
result += f'{i+1}. {self.label_convert[label]}\t\t =
|
152 |
|
153 |
return result
|
154 |
|
|
|
1 |
import gradio as gr
|
2 |
import torch
|
3 |
+
import requests
|
4 |
+
import PyPDF2
|
5 |
+
import re
|
6 |
+
# import nltk
|
7 |
+
# nltk.download('punkt')
|
8 |
|
9 |
class LegalNER():
|
10 |
def __init__(self, model, tokenizer, ids_to_labels, check_point='IndoBERT (IndoLEM)'):
|
|
|
137 |
result = ''
|
138 |
for i, (label, data) in enumerate(sorted_entitu_result.items()):
|
139 |
if label in ['PENA', 'ARTV']:
|
140 |
+
result += f'{i+1}. {self.label_convert[label]}\t = {data.capitalize()}\n'
|
141 |
elif label in ['PROS']:
|
142 |
if (i+1) >= 10:
|
143 |
+
result += f'{i+1}. {self.label_convert[label]}\t = {data.capitalize()}\n'
|
144 |
else:
|
145 |
+
result += f'{i+1}. {self.label_convert[label]}\t\t = {data.capitalize()}\n'
|
146 |
elif label in ['JUDP', 'CRIA']:
|
147 |
+
result += f'{i+1}. {self.label_convert[label]}\t\t\t = {data.capitalize()}\n'
|
148 |
elif label in ['ADVO']:
|
149 |
+
result += f'{i+1}. {self.label_convert[label]}\t\t\t\t = {data.capitalize()}\n'
|
150 |
elif label in ['REGI']:
|
151 |
if (i+1) >= 10:
|
152 |
+
result += f'{i+1}. {self.label_convert[label]}\t\t\t\t\t = {data.capitalize()}\n'
|
153 |
else:
|
154 |
+
result += f'{i+1}. {self.label_convert[label]}\t\t\t\t\t\t = {data.capitalize()}\n'
|
155 |
else:
|
156 |
+
result += f'{i+1}. {self.label_convert[label]}\t\t = {data.capitalize()}\n'
|
157 |
|
158 |
return result
|
159 |
|