hertogateis committed on
Commit
7797cc9
·
verified ·
1 Parent(s): 0cb6df9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +79 -147
app.py CHANGED
@@ -1,131 +1,64 @@
1
  import os
2
- import streamlit as st
3
- from st_aggrid import AgGrid
4
  import pandas as pd
5
- from transformers import pipeline, T5ForConditionalGeneration, T5Tokenizer
6
-
7
- # Set the page layout for Streamlit
8
- st.set_page_config(layout="wide")
9
-
10
- # CSS styling
11
- style = '''
12
- <style>
13
- body {background-color: #F5F5F5; color: #000000;}
14
- header {visibility: hidden;}
15
- div.block-container {padding-top:4rem;}
16
- section[data-testid="stSidebar"] div:first-child {
17
- padding-top: 0;
18
- }
19
- .font {
20
- text-align:center;
21
- font-family:sans-serif;font-size: 1.25rem;}
22
- </style>
23
- '''
24
- st.markdown(style, unsafe_allow_html=True)
25
-
26
- st.markdown('<p style="font-family:sans-serif;font-size: 1.5rem;text-align: right;"> HertogAI Table Q&A using TAPAS and Model Language</p>', unsafe_allow_html=True)
27
- st.markdown('<p style="font-family:sans-serif;font-size: 0.7rem;text-align: right;"> This code is based on Jordan Skinner. I enhanced his work for Data analysis COUNT, AVG, TOTAL, MEAN & StdDev </p>', unsafe_allow_html=True)
28
- st.markdown("<p style='font-family:sans-serif;font-size: 0.6rem;text-align: right;'>Pre-trained TAPAS model runs on max 64 rows and 32 columns data. Make sure the file data doesn't exceed these dimensions.</p>", unsafe_allow_html=True)
29
-
30
-
31
- # Initialize TAPAS pipeline
32
- tqa = pipeline(task="table-question-answering",
33
- model="google/tapas-large-finetuned-wtq",
34
- device="cpu")
35
-
36
- # Initialize T5 tokenizer and model for text generation
37
- t5_tokenizer = T5Tokenizer.from_pretrained("t5-small")
38
- t5_model = T5ForConditionalGeneration.from_pretrained("t5-small")
39
-
40
- # File uploader in the sidebar
41
- file_name = st.sidebar.file_uploader("Upload file:", type=['csv', 'xlsx'])
42
-
43
- # File processing and question answering
44
- if file_name is None:
45
-
46
- st.markdown('<p class="custom-font">Please click left side bar to upload an excel or csv file </p>', unsafe_allow_html=True)
47
- else:
48
- try:
49
- # Check file type and handle reading accordingly
50
- if file_name.name.endswith('.csv'):
51
- df = pd.read_csv(file_name, sep=';', encoding='ISO-8859-1') # Adjust encoding if needed
52
- elif file_name.name.endswith('.xlsx'):
53
- df = pd.read_excel(file_name, engine='openpyxl') # Use openpyxl to read .xlsx files
54
- else:
55
- st.error("Unsupported file type")
56
- df = None
57
-
58
- # Continue with further processing if df is loaded
59
- if df is not None:
60
- numeric_columns = df.select_dtypes(include=['object']).columns
61
- for col in numeric_columns:
62
- df[col] = pd.to_numeric(df[col], errors='ignore')
63
-
64
- st.write("Original Data:")
65
- st.write(df)
66
-
67
- # Create a copy for numerical operations
68
- df_numeric = df.copy()
69
- df = df.astype(str)
70
-
71
- # Display the first 5 rows of the dataframe in an editable grid
72
- grid_response = AgGrid(
73
- df.head(5),
74
- columns_auto_size_mode='FIT_CONTENTS',
75
- editable=True,
76
- height=300,
77
- width='100%',
78
- )
79
-
80
- except Exception as e:
81
- st.error(f"Error reading file: {str(e)}")
82
-
83
- # User input for the question
84
- question = st.text_input('Type your question')
85
-
86
- # Process the answer using TAPAS and T5
87
- with st.spinner():
88
- if st.button('Answer'):
89
- try:
90
- # Get the raw answer from TAPAS
91
- raw_answer = tqa(table=df, query=question, truncation=True)
92
-
93
- st.markdown("<p style='font-family:sans-serif;font-size: 0.9rem;'> Raw Result From TAPAS: </p>",
94
- unsafe_allow_html=True)
95
- st.success(raw_answer)
96
-
97
- # Extract relevant information from the TAPAS result
98
- answer = raw_answer['answer']
99
- aggregator = raw_answer.get('aggregator', '')
100
- coordinates = raw_answer.get('coordinates', [])
101
- cells = raw_answer.get('cells', [])
102
-
103
- # Construct a base sentence replacing 'SUM' with the query term
104
- base_sentence = f"The {question.lower()} of the selected data is {answer}."
105
- if coordinates and cells:
106
- rows_info = [f"Row {coordinate[0] + 1}, Column '{df.columns[coordinate[1]]}' with value {cell}"
107
- for coordinate, cell in zip(coordinates, cells)]
108
- rows_description = " and ".join(rows_info)
109
- base_sentence += f" This includes the following data: {rows_description}."
110
-
111
- # Generate a fluent response using the T5 model, rephrasing the base sentence
112
- input_text = f"Given the question: '{question}', generate a more human-readable response: {base_sentence}"
113
-
114
- # Tokenize the input and generate a fluent response using T5
115
- inputs = t5_tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
116
- summary_ids = t5_model.generate(inputs, max_length=150, num_beams=4, early_stopping=True)
117
-
118
- # Decode the generated text
119
- generated_text = t5_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
120
-
121
- # Display the final generated response
122
- st.markdown("<p style='font-family:sans-serif;font-size: 0.9rem;'> Final Generated Response with LLM: </p>", unsafe_allow_html=True)
123
- st.success(generated_text)
124
-
125
- except Exception as e:
126
- st.warning("Please retype your question and make sure to use the column name and cell value correctly.")
127
-
128
-
129
  # Manually fix the aggregator if it returns an incorrect one
130
  if 'MEDIAN' in question.upper() and 'AVERAGE' in aggregator.upper():
131
  aggregator = 'MEDIAN'
@@ -139,23 +72,27 @@ elif 'TOTAL' in question.upper() and 'SUM' in aggregator.upper():
139
  # Use the corrected aggregator for further processing
140
  summary_type = aggregator.lower()
141
 
142
- # Now, calculate the correct value using pandas based on the corrected aggregator
143
- if summary_type == 'sum':
144
- numeric_value = df_numeric[column_name].sum()
145
- elif summary_type == 'max':
146
- numeric_value = df_numeric[column_name].max()
147
- elif summary_type == 'min':
148
- numeric_value = df_numeric[column_name].min()
149
- elif summary_type == 'average':
150
- numeric_value = df_numeric[column_name].mean()
151
- elif summary_type == 'count':
152
- numeric_value = df_numeric[column_name].count()
153
- elif summary_type == 'median':
154
- numeric_value = df_numeric[column_name].median()
155
- elif summary_type == 'std_dev':
156
- numeric_value = df_numeric[column_name].std()
 
 
 
 
157
  else:
158
- numeric_value = processed_answer # Fallback if something went wrong
159
 
160
  # Construct a natural language response
161
  if summary_type == 'sum':
@@ -186,8 +123,3 @@ st.success(f"""
186
  Additional Context:
187
  • Query Asked: "{question}"
188
  """)
189
-
190
-
191
-
192
-
193
-
 
1
  import os
 
 
2
  import pandas as pd
3
+ import streamlit as st
4
+ from tapas import tqa, t5_tokenizer, t5_model
5
+
6
+ # Assuming 'df' is the DataFrame you are using and has numeric columns
7
+ df_numeric = df.select_dtypes(include='number')
8
+
9
+ # Ensure that `column_name` is defined and valid
10
+ column_name = None # Make sure this is defined later from TAPAS response
11
+
12
+ # User input for the question
13
+ question = st.text_input('Type your question')
14
+
15
+ # Process the answer using TAPAS and T5
16
+ with st.spinner():
17
+ if st.button('Answer'):
18
+ try:
19
+ # Get the raw answer from TAPAS
20
+ raw_answer = tqa(table=df, query=question, truncation=True)
21
+
22
+ st.markdown("<p style='font-family:sans-serif;font-size: 0.9rem;'> Raw Result From TAPAS: </p>", unsafe_allow_html=True)
23
+ st.success(raw_answer)
24
+
25
+ # Extract relevant information from the TAPAS result
26
+ answer = raw_answer['answer']
27
+ aggregator = raw_answer.get('aggregator', '')
28
+ coordinates = raw_answer.get('coordinates', [])
29
+ cells = raw_answer.get('cells', [])
30
+
31
+ # Extract the column name based on coordinates
32
+ if coordinates:
33
+ row, col = coordinates[0] # assuming single cell result
34
+ column_name = df.columns[col] # Get the column name
35
+
36
+ # Construct a base sentence replacing 'SUM' with the query term
37
+ base_sentence = f"The {question.lower()} of the selected data is {answer}."
38
+ if coordinates and cells:
39
+ rows_info = [f"Row {coordinate[0] + 1}, Column '{df.columns[coordinate[1]]}' with value {cell}"
40
+ for coordinate, cell in zip(coordinates, cells)]
41
+ rows_description = " and ".join(rows_info)
42
+ base_sentence += f" This includes the following data: {rows_description}."
43
+
44
+ # Generate a fluent response using the T5 model, rephrasing the base sentence
45
+ input_text = f"Given the question: '{question}', generate a more human-readable response: {base_sentence}"
46
+
47
+ # Tokenize the input and generate a fluent response using T5
48
+ inputs = t5_tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
49
+ summary_ids = t5_model.generate(inputs, max_length=150, num_beams=4, early_stopping=True)
50
+
51
+ # Decode the generated text
52
+ generated_text = t5_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
53
+
54
+ # Display the final generated response
55
+ st.markdown("<p style='font-family:sans-serif;font-size: 0.9rem;'> Final Generated Response with LLM: </p>", unsafe_allow_html=True)
56
+ st.success(generated_text)
57
+
58
+ except Exception as e:
59
+ st.warning("Please retype your question and make sure to use the column name and cell value correctly.")
60
+
61
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  # Manually fix the aggregator if it returns an incorrect one
63
  if 'MEDIAN' in question.upper() and 'AVERAGE' in aggregator.upper():
64
  aggregator = 'MEDIAN'
 
72
  # Use the corrected aggregator for further processing
73
  summary_type = aggregator.lower()
74
 
75
+ # Check if `column_name` is valid before proceeding
76
+ if column_name and column_name in df_numeric.columns:
77
+ # Now, calculate the correct value using pandas based on the corrected aggregator
78
+ if summary_type == 'sum':
79
+ numeric_value = df_numeric[column_name].sum()
80
+ elif summary_type == 'max':
81
+ numeric_value = df_numeric[column_name].max()
82
+ elif summary_type == 'min':
83
+ numeric_value = df_numeric[column_name].min()
84
+ elif summary_type == 'average':
85
+ numeric_value = df_numeric[column_name].mean()
86
+ elif summary_type == 'count':
87
+ numeric_value = df_numeric[column_name].count()
88
+ elif summary_type == 'median':
89
+ numeric_value = df_numeric[column_name].median()
90
+ elif summary_type == 'std_dev':
91
+ numeric_value = df_numeric[column_name].std()
92
+ else:
93
+ numeric_value = answer # Fallback if something went wrong
94
  else:
95
+ numeric_value = "Invalid column"
96
 
97
  # Construct a natural language response
98
  if summary_type == 'sum':
 
123
  Additional Context:
124
  • Query Asked: "{question}"
125
  """)