Spaces:

hertogateis
/

Table_QandA_v2

Sleeping

App Files Files Community

hertogateis committed on Jan 3

Commit

7797cc9

verified ·

1 Parent(s): 0cb6df9

Update app.py

Browse files

Files changed (1) hide show

app.py +79 -147

app.py CHANGED Viewed

@@ -1,131 +1,64 @@
 import os
-import streamlit as st
-from st_aggrid import AgGrid
 import pandas as pd
-from transformers import pipeline, T5ForConditionalGeneration, T5Tokenizer
-# Set the page layout for Streamlit
-st.set_page_config(layout="wide")
-# CSS styling
-style = '''
-    <style>
-        body {background-color: #F5F5F5; color: #000000;}
-        header {visibility: hidden;}
-        div.block-container {padding-top:4rem;}
-        section[data-testid="stSidebar"] div:first-child {
-        padding-top: 0;
-    }
-     .font {
-    text-align:center;
-    font-family:sans-serif;font-size: 1.25rem;}
-    </style>
-'''
-st.markdown(style, unsafe_allow_html=True)
-st.markdown('<p style="font-family:sans-serif;font-size: 1.5rem;text-align: right;"> HertogAI Table Q&A using TAPAS and Model Language</p>', unsafe_allow_html=True)
-st.markdown('<p style="font-family:sans-serif;font-size: 0.7rem;text-align: right;"> This code is based on Jordan Skinner. I enhanced his work for Data analysis COUNT, AVG, TOTAL, MEAN & StdDev </p>', unsafe_allow_html=True)
-st.markdown("<p style='font-family:sans-serif;font-size: 0.6rem;text-align: right;'>Pre-trained TAPAS model runs on max 64 rows and 32 columns data. Make sure the file data doesn't exceed these dimensions.</p>", unsafe_allow_html=True)
-# Initialize TAPAS pipeline
-tqa = pipeline(task="table-question-answering",
-              model="google/tapas-large-finetuned-wtq",
-              device="cpu")
-# Initialize T5 tokenizer and model for text generation
-t5_tokenizer = T5Tokenizer.from_pretrained("t5-small")
-t5_model = T5ForConditionalGeneration.from_pretrained("t5-small")
-# File uploader in the sidebar
-file_name = st.sidebar.file_uploader("Upload file:", type=['csv', 'xlsx'])
-# File processing and question answering
-if file_name is None:
-    st.markdown('<p class="custom-font">Please click left side bar to upload an excel or csv file </p>', unsafe_allow_html=True)
-else:
-    try:
-        # Check file type and handle reading accordingly
-        if file_name.name.endswith('.csv'):
-            df = pd.read_csv(file_name, sep=';', encoding='ISO-8859-1')  # Adjust encoding if needed
-        elif file_name.name.endswith('.xlsx'):
-            df = pd.read_excel(file_name, engine='openpyxl')  # Use openpyxl to read .xlsx files
-        else:
-            st.error("Unsupported file type")
-            df = None
-        # Continue with further processing if df is loaded
-        if df is not None:
-            numeric_columns = df.select_dtypes(include=['object']).columns
-            for col in numeric_columns:
-                df[col] = pd.to_numeric(df[col], errors='ignore')
-            st.write("Original Data:")
-            st.write(df)
-            # Create a copy for numerical operations
-            df_numeric = df.copy()
-            df = df.astype(str)
-            # Display the first 5 rows of the dataframe in an editable grid
-            grid_response = AgGrid(
-                df.head(5),
-                columns_auto_size_mode='FIT_CONTENTS',
-                editable=True,
-                height=300,
-                width='100%',
-            )
-    except Exception as e:
-        st.error(f"Error reading file: {str(e)}")
-    # User input for the question
-    question = st.text_input('Type your question')
-    # Process the answer using TAPAS and T5
-    with st.spinner():
-        if st.button('Answer'):
-            try:
-                # Get the raw answer from TAPAS
-                raw_answer = tqa(table=df, query=question, truncation=True)
-                st.markdown("<p style='font-family:sans-serif;font-size: 0.9rem;'> Raw Result From TAPAS: </p>",
-                           unsafe_allow_html=True)
-                st.success(raw_answer)
-                # Extract relevant information from the TAPAS result
-                answer = raw_answer['answer']
-                aggregator = raw_answer.get('aggregator', '')
-                coordinates = raw_answer.get('coordinates', [])
-                cells = raw_answer.get('cells', [])
-                # Construct a base sentence replacing 'SUM' with the query term
-                base_sentence = f"The {question.lower()} of the selected data is {answer}."
-                if coordinates and cells:
-                    rows_info = [f"Row {coordinate[0] + 1}, Column '{df.columns[coordinate[1]]}' with value {cell}"
-                                 for coordinate, cell in zip(coordinates, cells)]
-                    rows_description = " and ".join(rows_info)
-                    base_sentence += f" This includes the following data: {rows_description}."
-                # Generate a fluent response using the T5 model, rephrasing the base sentence
-                input_text = f"Given the question: '{question}', generate a more human-readable response: {base_sentence}"
-                # Tokenize the input and generate a fluent response using T5
-                inputs = t5_tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
-                summary_ids = t5_model.generate(inputs, max_length=150, num_beams=4, early_stopping=True)
-                # Decode the generated text
-                generated_text = t5_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
-                # Display the final generated response
-                st.markdown("<p style='font-family:sans-serif;font-size: 0.9rem;'> Final Generated Response with LLM: </p>", unsafe_allow_html=True)
-                st.success(generated_text)
-            except Exception as e:
-                st.warning("Please retype your question and make sure to use the column name and cell value correctly.")
 # Manually fix the aggregator if it returns an incorrect one
 if 'MEDIAN' in question.upper() and 'AVERAGE' in aggregator.upper():
     aggregator = 'MEDIAN'
@@ -139,23 +72,27 @@ elif 'TOTAL' in question.upper() and 'SUM' in aggregator.upper():
 # Use the corrected aggregator for further processing
 summary_type = aggregator.lower()
-# Now, calculate the correct value using pandas based on the corrected aggregator
-if summary_type == 'sum':
-    numeric_value = df_numeric[column_name].sum()
-elif summary_type == 'max':
-    numeric_value = df_numeric[column_name].max()
-elif summary_type == 'min':
-    numeric_value = df_numeric[column_name].min()
-elif summary_type == 'average':
-    numeric_value = df_numeric[column_name].mean()
-elif summary_type == 'count':
-    numeric_value = df_numeric[column_name].count()
-elif summary_type == 'median':
-    numeric_value = df_numeric[column_name].median()
-elif summary_type == 'std_dev':
-    numeric_value = df_numeric[column_name].std()
 else:
-    numeric_value = processed_answer  # Fallback if something went wrong
 # Construct a natural language response
 if summary_type == 'sum':
@@ -186,8 +123,3 @@ st.success(f"""
     Additional Context:
     • Query Asked: "{question}"
 """)

 import os
 import pandas as pd
+import streamlit as st
+from tapas import tqa, t5_tokenizer, t5_model
+# Assuming 'df' is the DataFrame you are using and has numeric columns
+df_numeric = df.select_dtypes(include='number')
+# Ensure that `column_name` is defined and valid
+column_name = None  # Make sure this is defined later from TAPAS response
+# User input for the question
+question = st.text_input('Type your question')
+# Process the answer using TAPAS and T5
+with st.spinner():
+    if st.button('Answer'):
+        try:
+            # Get the raw answer from TAPAS
+            raw_answer = tqa(table=df, query=question, truncation=True)
+            st.markdown("<p style='font-family:sans-serif;font-size: 0.9rem;'> Raw Result From TAPAS: </p>", unsafe_allow_html=True)
+            st.success(raw_answer)
+            # Extract relevant information from the TAPAS result
+            answer = raw_answer['answer']
+            aggregator = raw_answer.get('aggregator', '')
+            coordinates = raw_answer.get('coordinates', [])
+            cells = raw_answer.get('cells', [])
+            # Extract the column name based on coordinates
+            if coordinates:
+                row, col = coordinates[0]  # assuming single cell result
+                column_name = df.columns[col]  # Get the column name
+            # Construct a base sentence replacing 'SUM' with the query term
+            base_sentence = f"The {question.lower()} of the selected data is {answer}."
+            if coordinates and cells:
+                rows_info = [f"Row {coordinate[0] + 1}, Column '{df.columns[coordinate[1]]}' with value {cell}"
+                             for coordinate, cell in zip(coordinates, cells)]
+                rows_description = " and ".join(rows_info)
+                base_sentence += f" This includes the following data: {rows_description}."
+            # Generate a fluent response using the T5 model, rephrasing the base sentence
+            input_text = f"Given the question: '{question}', generate a more human-readable response: {base_sentence}"
+            # Tokenize the input and generate a fluent response using T5
+            inputs = t5_tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
+            summary_ids = t5_model.generate(inputs, max_length=150, num_beams=4, early_stopping=True)
+            # Decode the generated text
+            generated_text = t5_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+            # Display the final generated response
+            st.markdown("<p style='font-family:sans-serif;font-size: 0.9rem;'> Final Generated Response with LLM: </p>", unsafe_allow_html=True)
+            st.success(generated_text)
+        except Exception as e:
+            st.warning("Please retype your question and make sure to use the column name and cell value correctly.")
 # Manually fix the aggregator if it returns an incorrect one
 if 'MEDIAN' in question.upper() and 'AVERAGE' in aggregator.upper():
     aggregator = 'MEDIAN'
 # Use the corrected aggregator for further processing
 summary_type = aggregator.lower()
+# Check if `column_name` is valid before proceeding
+if column_name and column_name in df_numeric.columns:
+    # Now, calculate the correct value using pandas based on the corrected aggregator
+    if summary_type == 'sum':
+        numeric_value = df_numeric[column_name].sum()
+    elif summary_type == 'max':
+        numeric_value = df_numeric[column_name].max()
+    elif summary_type == 'min':
+        numeric_value = df_numeric[column_name].min()
+    elif summary_type == 'average':
+        numeric_value = df_numeric[column_name].mean()
+    elif summary_type == 'count':
+        numeric_value = df_numeric[column_name].count()
+    elif summary_type == 'median':
+        numeric_value = df_numeric[column_name].median()
+    elif summary_type == 'std_dev':
+        numeric_value = df_numeric[column_name].std()
+    else:
+        numeric_value = answer  # Fallback if something went wrong
 else:
+    numeric_value = "Invalid column"
 # Construct a natural language response
 if summary_type == 'sum':
     Additional Context:
     • Query Asked: "{question}"
 """)