TheoLvs committed on
Commit
088e816
·
1 Parent(s): 481f3b1

Connecting to front

Browse files
app.py CHANGED
@@ -4,7 +4,7 @@ embeddings_function = get_embeddings_function()
4
  from climateqa.papers.openalex import OpenAlex
5
  from sentence_transformers import CrossEncoder
6
 
7
- reranker = CrossEncoder("mixedbread-ai/mxbai-rerank-xsmall-v1")
8
  oa = OpenAlex()
9
 
10
  import gradio as gr
@@ -29,16 +29,19 @@ from utils import create_user_id
29
 
30
  # ClimateQ&A imports
31
  from climateqa.engine.llm import get_llm
32
- from climateqa.engine.chains.answer_rag import make_rag_chain
33
  from climateqa.engine.vectorstore import get_pinecone_vectorstore
34
  from climateqa.engine.retriever import ClimateQARetriever
 
35
  from climateqa.engine.embeddings import get_embeddings_function
36
  from climateqa.engine.chains.prompts import audience_prompts
37
  from climateqa.sample_questions import QUESTIONS
38
  from climateqa.constants import POSSIBLE_REPORTS
39
  from climateqa.utils import get_image_from_azure_blob_storage
40
  from climateqa.engine.keywords import make_keywords_chain
41
- from climateqa.engine.chains.answer_rag import make_rag_papers_chain
 
 
 
42
 
43
  # Load environment variables in local mode
44
  try:
@@ -81,48 +84,21 @@ user_id = create_user_id()
81
 
82
 
83
 
84
- def parse_output_llm_with_sources(output):
85
- # Split the content into a list of text and "[Doc X]" references
86
- content_parts = re.split(r'\[(Doc\s?\d+(?:,\s?Doc\s?\d+)*)\]', output)
87
- parts = []
88
- for part in content_parts:
89
- if part.startswith("Doc"):
90
- subparts = part.split(",")
91
- subparts = [subpart.lower().replace("doc","").strip() for subpart in subparts]
92
- subparts = [f"""<a href="#doc{subpart}" class="a-doc-ref" target="_self"><span class='doc-ref'><sup>{subpart}</sup></span></a>""" for subpart in subparts]
93
- parts.append("".join(subparts))
94
- else:
95
- parts.append(part)
96
- content_parts = "".join(parts)
97
- return content_parts
98
-
99
-
100
  # Create vectorstore and retriever
101
  vectorstore = get_pinecone_vectorstore(embeddings_function)
102
  llm = get_llm(provider="openai",max_tokens = 1024,temperature = 0.0)
 
 
103
 
104
 
105
- def make_pairs(lst):
106
- """from a list of even lenght, make tupple pairs"""
107
- return [(lst[i], lst[i + 1]) for i in range(0, len(lst), 2)]
108
-
109
-
110
- def serialize_docs(docs):
111
- new_docs = []
112
- for doc in docs:
113
- new_doc = {}
114
- new_doc["page_content"] = doc.page_content
115
- new_doc["metadata"] = doc.metadata
116
- new_docs.append(new_doc)
117
- return new_docs
118
-
119
 
120
 
121
  async def chat(query,history,audience,sources,reports):
122
  """taking a query and a message history, use a pipeline (reformulation, retriever, answering) to yield a tuple of:
123
  (messages in gradio format, messages in langchain format, source documents)"""
124
 
125
- print(f">> NEW QUESTION : {query}")
 
126
 
127
  if audience == "Children":
128
  audience_prompt = audience_prompts["children"]
@@ -139,59 +115,39 @@ async def chat(query,history,audience,sources,reports):
139
 
140
  if len(reports) == 0:
141
  reports = []
142
-
143
- retriever = ClimateQARetriever(vectorstore=vectorstore,sources = sources,min_size = 200,reports = reports,k_summary = 3,k_total = 15,threshold=0.5)
144
- rag_chain = make_rag_chain(retriever,llm)
145
 
146
- inputs = {"query": query,"audience": audience_prompt}
147
- result = rag_chain.astream_log(inputs) #{"callbacks":[MyCustomAsyncHandler()]})
148
  # result = rag_chain.stream(inputs)
149
 
150
- path_reformulation = "/logs/reformulation/final_output"
151
- path_keywords = "/logs/keywords/final_output"
152
- path_retriever = "/logs/find_documents/final_output"
153
- path_answer = "/logs/answer/streamed_output_str/-"
154
 
 
155
  docs_html = ""
156
  output_query = ""
157
  output_language = ""
158
  output_keywords = ""
159
  gallery = []
 
160
 
161
- try:
162
- async for op in result:
163
-
164
- op = op.ops[0]
 
165
 
166
- if op['path'] == path_reformulation: # reforulated question
167
- try:
168
- output_language = op['value']["language"] # str
169
- output_query = op["value"]["question"]
170
- except Exception as e:
171
- raise gr.Error(f"ClimateQ&A Error: {e} - The error has been noted, try another question and if the error remains, you can contact us :)")
172
-
173
- if op["path"] == path_keywords:
174
- try:
175
- output_keywords = op['value']["keywords"] # str
176
- output_keywords = " AND ".join(output_keywords)
177
- except Exception as e:
178
- pass
179
-
180
 
181
- elif op['path'] == path_retriever: # documents
182
- try:
183
- docs = op['value']['docs'] # List[Document]
184
- docs_html = []
185
- for i, d in enumerate(docs, 1):
186
- docs_html.append(make_html_source(d, i))
187
- docs_html = "".join(docs_html)
188
- except TypeError:
189
- print("No documents found")
190
- print("op: ",op)
191
- continue
192
 
193
- elif op['path'] == path_answer: # final answer
194
- new_token = op['value'] # str
195
  # time.sleep(0.01)
196
  previous_answer = history[-1][1]
197
  previous_answer = previous_answer if previous_answer is not None else ""
@@ -199,10 +155,47 @@ async def chat(query,history,audience,sources,reports):
199
  answer_yet = parse_output_llm_with_sources(answer_yet)
200
  history[-1] = (query,answer_yet)
201
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
 
203
 
204
- else:
205
- continue
206
 
207
  history = [tuple(x) for x in history]
208
  yield history,docs_html,output_query,output_language,gallery,output_query,output_keywords
@@ -276,68 +269,6 @@ async def chat(query,history,audience,sources,reports):
276
  yield history,docs_html,output_query,output_language,gallery,output_query,output_keywords
277
 
278
 
279
- def make_html_source(source,i):
280
- meta = source.metadata
281
- # content = source.page_content.split(":",1)[1].strip()
282
- content = source.page_content.strip()
283
-
284
- toc_levels = []
285
- for j in range(2):
286
- level = meta[f"toc_level{j}"]
287
- if level != "N/A":
288
- toc_levels.append(level)
289
- else:
290
- break
291
- toc_levels = " > ".join(toc_levels)
292
-
293
- if len(toc_levels) > 0:
294
- name = f"<b>{toc_levels}</b><br/>{meta['name']}"
295
- else:
296
- name = meta['name']
297
-
298
- if meta["chunk_type"] == "text":
299
-
300
- card = f"""
301
- <div class="card" id="doc{i}">
302
- <div class="card-content">
303
- <h2>Doc {i} - {meta['short_name']} - Page {int(meta['page_number'])}</h2>
304
- <p>{content}</p>
305
- </div>
306
- <div class="card-footer">
307
- <span>{name}</span>
308
- <a href="{meta['url']}#page={int(meta['page_number'])}" target="_blank" class="pdf-link">
309
- <span role="img" aria-label="Open PDF">🔗</span>
310
- </a>
311
- </div>
312
- </div>
313
- """
314
-
315
- else:
316
-
317
- if meta["figure_code"] != "N/A":
318
- title = f"{meta['figure_code']} - {meta['short_name']}"
319
- else:
320
- title = f"{meta['short_name']}"
321
-
322
- card = f"""
323
- <div class="card card-image">
324
- <div class="card-content">
325
- <h2>Image {i} - {title} - Page {int(meta['page_number'])}</h2>
326
- <p>{content}</p>
327
- <p class='ai-generated'>AI-generated description</p>
328
- </div>
329
- <div class="card-footer">
330
- <span>{name}</span>
331
- <a href="{meta['url']}#page={int(meta['page_number'])}" target="_blank" class="pdf-link">
332
- <span role="img" aria-label="Open PDF">🔗</span>
333
- </a>
334
- </div>
335
- </div>
336
- """
337
-
338
- return card
339
-
340
-
341
 
342
  # else:
343
  # docs_string = "No relevant passages found in the climate science reports (IPCC and IPBES)"
@@ -390,54 +321,54 @@ papers_cols_widths = {
390
  papers_cols = list(papers_cols_widths.keys())
391
  papers_cols_widths = list(papers_cols_widths.values())
392
 
393
- async def find_papers(query, keywords,after):
394
 
395
- summary = ""
396
 
397
- df_works = oa.search(keywords,after = after)
398
- df_works = df_works.dropna(subset=["abstract"])
399
- df_works = oa.rerank(query,df_works,reranker)
400
- df_works = df_works.sort_values("rerank_score",ascending=False)
401
- G = oa.make_network(df_works)
402
 
403
- height = "750px"
404
- network = oa.show_network(G,color_by = "rerank_score",notebook=False,height = height)
405
- network_html = network.generate_html()
406
 
407
- network_html = network_html.replace("'", "\"")
408
- css_to_inject = "<style>#mynetwork { border: none !important; } .card { border: none !important; }</style>"
409
- network_html = network_html + css_to_inject
410
 
411
 
412
- network_html = f"""<iframe style="width: 100%; height: {height};margin:0 auto" name="result" allow="midi; geolocation; microphone; camera;
413
- display-capture; encrypted-media;" sandbox="allow-modals allow-forms
414
- allow-scripts allow-same-origin allow-popups
415
- allow-top-navigation-by-user-activation allow-downloads" allowfullscreen=""
416
- allowpaymentrequest="" frameborder="0" srcdoc='{network_html}'></iframe>"""
417
 
418
 
419
- docs = df_works["content"].head(15).tolist()
420
 
421
- df_works = df_works.reset_index(drop = True).reset_index().rename(columns = {"index":"doc"})
422
- df_works["doc"] = df_works["doc"] + 1
423
- df_works = df_works[papers_cols]
424
 
425
- yield df_works,network_html,summary
426
 
427
- chain = make_rag_papers_chain(llm)
428
- result = chain.astream_log({"question": query,"docs": docs,"language":"English"})
429
- path_answer = "/logs/StrOutputParser/streamed_output/-"
430
 
431
- async for op in result:
432
 
433
- op = op.ops[0]
434
 
435
- if op['path'] == path_answer: # reforulated question
436
- new_token = op['value'] # str
437
- summary += new_token
438
- else:
439
- continue
440
- yield df_works,network_html,summary
441
 
442
 
443
 
@@ -560,9 +491,6 @@ with gr.Blocks(title="Climate Q&A", css="style.css", theme=theme,elem_id = "main
560
 
561
 
562
 
563
-
564
-
565
-
566
  #---------------------------------------------------------------------------------------
567
  # OTHER TABS
568
  #---------------------------------------------------------------------------------------
@@ -571,25 +499,25 @@ with gr.Blocks(title="Climate Q&A", css="style.css", theme=theme,elem_id = "main
571
  with gr.Tab("Figures",elem_id = "tab-images",elem_classes = "max-height other-tabs"):
572
  gallery_component = gr.Gallery()
573
 
574
- with gr.Tab("Papers (beta)",elem_id = "tab-papers",elem_classes = "max-height other-tabs"):
575
 
576
- with gr.Row():
577
- with gr.Column(scale=1):
578
- query_papers = gr.Textbox(placeholder="Question",show_label=False,lines = 1,interactive = True,elem_id="query-papers")
579
- keywords_papers = gr.Textbox(placeholder="Keywords",show_label=False,lines = 1,interactive = True,elem_id="keywords-papers")
580
- after = gr.Slider(minimum=1950,maximum=2023,step=1,value=1960,label="Publication date",show_label=True,interactive=True,elem_id="date-papers")
581
- search_papers = gr.Button("Search",elem_id="search-papers",interactive=True)
582
 
583
- with gr.Column(scale=7):
584
 
585
- with gr.Tab("Summary",elem_id="papers-summary-tab"):
586
- papers_summary = gr.Markdown(visible=True,elem_id="papers-summary")
587
 
588
- with gr.Tab("Relevant papers",elem_id="papers-results-tab"):
589
- papers_dataframe = gr.Dataframe(visible=True,elem_id="papers-table",headers = papers_cols)
590
 
591
- with gr.Tab("Citations network",elem_id="papers-network-tab"):
592
- citations_network = gr.HTML(visible=True,elem_id="papers-citations-network")
593
 
594
 
595
 
@@ -609,13 +537,13 @@ with gr.Blocks(title="Climate Q&A", css="style.css", theme=theme,elem_id = "main
609
 
610
  (textbox
611
  .submit(start_chat, [textbox,chatbot], [textbox,tabs,chatbot],queue = False,api_name = "start_chat_textbox")
612
- .then(chat, [textbox,chatbot,dropdown_audience, dropdown_sources,dropdown_reports], [chatbot,sources_textbox,output_query,output_language,gallery_component,query_papers,keywords_papers],concurrency_limit = 8,api_name = "chat_textbox")
613
  .then(finish_chat, None, [textbox],api_name = "finish_chat_textbox")
614
  )
615
 
616
  (examples_hidden
617
  .change(start_chat, [examples_hidden,chatbot], [textbox,tabs,chatbot],queue = False,api_name = "start_chat_examples")
618
- .then(chat, [examples_hidden,chatbot,dropdown_audience, dropdown_sources,dropdown_reports], [chatbot,sources_textbox,output_query,output_language,gallery_component,query_papers,keywords_papers],concurrency_limit = 8,api_name = "chat_examples")
619
  .then(finish_chat, None, [textbox],api_name = "finish_chat_examples")
620
  )
621
 
@@ -630,47 +558,8 @@ with gr.Blocks(title="Climate Q&A", css="style.css", theme=theme,elem_id = "main
630
 
631
  dropdown_samples.change(change_sample_questions,dropdown_samples,samples)
632
 
633
- query_papers.submit(generate_keywords,[query_papers], [keywords_papers])
634
- search_papers.click(find_papers,[query_papers,keywords_papers,after], [papers_dataframe,citations_network,papers_summary])
635
-
636
- # # textbox.submit(predict_climateqa,[textbox,bot],[None,bot,sources_textbox])
637
- # (textbox
638
- # .submit(answer_user, [textbox,examples_hidden, bot], [textbox, bot],queue = False)
639
- # .success(change_tab,None,tabs)
640
- # .success(fetch_sources,[textbox,dropdown_sources], [textbox,sources_textbox,docs_textbox,output_query,output_language])
641
- # .success(answer_bot, [textbox,bot,docs_textbox,output_query,output_language,dropdown_audience], [textbox,bot],queue = True)
642
- # .success(lambda x : textbox,[textbox],[textbox])
643
- # )
644
-
645
- # (examples_hidden
646
- # .change(answer_user_example, [textbox,examples_hidden, bot], [textbox, bot],queue = False)
647
- # .success(change_tab,None,tabs)
648
- # .success(fetch_sources,[textbox,dropdown_sources], [textbox,sources_textbox,docs_textbox,output_query,output_language])
649
- # .success(answer_bot, [textbox,bot,docs_textbox,output_query,output_language,dropdown_audience], [textbox,bot],queue=True)
650
- # .success(lambda x : textbox,[textbox],[textbox])
651
- # )
652
- # submit_button.click(answer_user, [textbox, bot], [textbox, bot], queue=True).then(
653
- # answer_bot, [textbox,bot,dropdown_audience,dropdown_sources], [textbox,bot,sources_textbox]
654
- # )
655
-
656
-
657
- # with Modal(visible=True) as first_modal:
658
- # gr.Markdown("# Welcome to ClimateQ&A !")
659
-
660
- # gr.Markdown("### Examples")
661
-
662
- # examples = gr.Examples(
663
- # ["Yo ça roule","ça boume"],
664
- # [examples_hidden],
665
- # examples_per_page=8,
666
- # run_on_click=False,
667
- # elem_id="examples",
668
- # api_name="examples",
669
- # )
670
-
671
-
672
- # submit.click(lambda: Modal(visible=True), None, config_modal)
673
-
674
 
675
  demo.queue()
676
 
 
4
  from climateqa.papers.openalex import OpenAlex
5
  from sentence_transformers import CrossEncoder
6
 
7
+ # reranker = CrossEncoder("mixedbread-ai/mxbai-rerank-xsmall-v1")
8
  oa = OpenAlex()
9
 
10
  import gradio as gr
 
29
 
30
  # ClimateQ&A imports
31
  from climateqa.engine.llm import get_llm
 
32
  from climateqa.engine.vectorstore import get_pinecone_vectorstore
33
  from climateqa.engine.retriever import ClimateQARetriever
34
+ from climateqa.engine.reranker import get_reranker
35
  from climateqa.engine.embeddings import get_embeddings_function
36
  from climateqa.engine.chains.prompts import audience_prompts
37
  from climateqa.sample_questions import QUESTIONS
38
  from climateqa.constants import POSSIBLE_REPORTS
39
  from climateqa.utils import get_image_from_azure_blob_storage
40
  from climateqa.engine.keywords import make_keywords_chain
41
+ # from climateqa.engine.chains.answer_rag import make_rag_papers_chain
42
+ from climateqa.engine.graph import make_graph_agent,display_graph
43
+
44
+ from front.utils import make_html_source,parse_output_llm_with_sources,serialize_docs,make_toolbox
45
 
46
  # Load environment variables in local mode
47
  try:
 
84
 
85
 
86
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
  # Create vectorstore and retriever
88
  vectorstore = get_pinecone_vectorstore(embeddings_function)
89
  llm = get_llm(provider="openai",max_tokens = 1024,temperature = 0.0)
90
+ reranker = get_reranker("nano")
91
+ agent = make_graph_agent(llm,vectorstore,reranker)
92
 
93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
 
95
 
96
  async def chat(query,history,audience,sources,reports):
97
  """taking a query and a message history, use a pipeline (reformulation, retriever, answering) to yield a tuple of:
98
  (messages in gradio format, messages in langchain format, source documents)"""
99
 
100
+ date_now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
101
+ print(f">> NEW QUESTION ({date_now}) : {query}")
102
 
103
  if audience == "Children":
104
  audience_prompt = audience_prompts["children"]
 
115
 
116
  if len(reports) == 0:
117
  reports = []
 
 
 
118
 
119
+ inputs = {"user_input": query,"audience": audience_prompt,"sources":sources}
120
+ result = agent.astream_events(inputs,version = "v1") #{"callbacks":[MyCustomAsyncHandler()]})
121
  # result = rag_chain.stream(inputs)
122
 
123
+ # path_reformulation = "/logs/reformulation/final_output"
124
+ # path_keywords = "/logs/keywords/final_output"
125
+ # path_retriever = "/logs/find_documents/final_output"
126
+ # path_answer = "/logs/answer/streamed_output_str/-"
127
 
128
+ docs = []
129
  docs_html = ""
130
  output_query = ""
131
  output_language = ""
132
  output_keywords = ""
133
  gallery = []
134
+ start_streaming = False
135
 
136
+ steps_display = {
137
+ "categorize_intent":("🔄️ Analyzing user message",True),
138
+ "transform_query":("🔄️ Thinking step by step to answer the question",True),
139
+ "retrieve_documents":("🔄️ Searching in the knowledge base",False),
140
+ }
141
 
142
+ try:
143
+ async for event in result:
 
 
 
 
 
 
 
 
 
 
 
 
144
 
145
+ if event["event"] == "on_chat_model_stream":
146
+ if start_streaming == False:
147
+ start_streaming = True
148
+ history[-1] = (query,"")
 
 
 
 
 
 
 
149
 
150
+ new_token = event["data"]["chunk"].content
 
151
  # time.sleep(0.01)
152
  previous_answer = history[-1][1]
153
  previous_answer = previous_answer if previous_answer is not None else ""
 
155
  answer_yet = parse_output_llm_with_sources(answer_yet)
156
  history[-1] = (query,answer_yet)
157
 
158
+
159
+ elif event["name"] == "retrieve_documents" and event["event"] == "on_chain_end":
160
+ try:
161
+ docs = event["data"]["output"]["documents"]
162
+ docs_html = []
163
+ for i, d in enumerate(docs, 1):
164
+ docs_html.append(make_html_source(d, i))
165
+ docs_html = "".join(docs_html)
166
+ except Exception as e:
167
+ print(f"Error getting documents: {e}")
168
+ print(event)
169
+
170
+
171
+ for event_name,(event_description,display_output) in steps_display.items():
172
+ if event["name"] == event_name:
173
+ if event["event"] == "on_chain_start":
174
+ # answer_yet = f"<p><span class='loader'></span>{event_description}</p>"
175
+ # answer_yet = make_toolbox(event_description, "", checked = False)
176
+ answer_yet = event_description
177
+ history[-1] = (query,answer_yet)
178
+ # elif event["event"] == "on_chain_end":
179
+ # answer_yet = ""
180
+ # history[-1] = (query,answer_yet)
181
+ # if display_output:
182
+ # print(event["data"]["output"])
183
+
184
+ # if op['path'] == path_reformulation: # reforulated question
185
+ # try:
186
+ # output_language = op['value']["language"] # str
187
+ # output_query = op["value"]["question"]
188
+ # except Exception as e:
189
+ # raise gr.Error(f"ClimateQ&A Error: {e} - The error has been noted, try another question and if the error remains, you can contact us :)")
190
+
191
+ # if op["path"] == path_keywords:
192
+ # try:
193
+ # output_keywords = op['value']["keywords"] # str
194
+ # output_keywords = " AND ".join(output_keywords)
195
+ # except Exception as e:
196
+ # pass
197
 
198
 
 
 
199
 
200
  history = [tuple(x) for x in history]
201
  yield history,docs_html,output_query,output_language,gallery,output_query,output_keywords
 
269
  yield history,docs_html,output_query,output_language,gallery,output_query,output_keywords
270
 
271
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
272
 
273
  # else:
274
  # docs_string = "No relevant passages found in the climate science reports (IPCC and IPBES)"
 
321
  papers_cols = list(papers_cols_widths.keys())
322
  papers_cols_widths = list(papers_cols_widths.values())
323
 
324
+ # async def find_papers(query, keywords,after):
325
 
326
+ # summary = ""
327
 
328
+ # df_works = oa.search(keywords,after = after)
329
+ # df_works = df_works.dropna(subset=["abstract"])
330
+ # df_works = oa.rerank(query,df_works,reranker)
331
+ # df_works = df_works.sort_values("rerank_score",ascending=False)
332
+ # G = oa.make_network(df_works)
333
 
334
+ # height = "750px"
335
+ # network = oa.show_network(G,color_by = "rerank_score",notebook=False,height = height)
336
+ # network_html = network.generate_html()
337
 
338
+ # network_html = network_html.replace("'", "\"")
339
+ # css_to_inject = "<style>#mynetwork { border: none !important; } .card { border: none !important; }</style>"
340
+ # network_html = network_html + css_to_inject
341
 
342
 
343
+ # network_html = f"""<iframe style="width: 100%; height: {height};margin:0 auto" name="result" allow="midi; geolocation; microphone; camera;
344
+ # display-capture; encrypted-media;" sandbox="allow-modals allow-forms
345
+ # allow-scripts allow-same-origin allow-popups
346
+ # allow-top-navigation-by-user-activation allow-downloads" allowfullscreen=""
347
+ # allowpaymentrequest="" frameborder="0" srcdoc='{network_html}'></iframe>"""
348
 
349
 
350
+ # docs = df_works["content"].head(15).tolist()
351
 
352
+ # df_works = df_works.reset_index(drop = True).reset_index().rename(columns = {"index":"doc"})
353
+ # df_works["doc"] = df_works["doc"] + 1
354
+ # df_works = df_works[papers_cols]
355
 
356
+ # yield df_works,network_html,summary
357
 
358
+ # chain = make_rag_papers_chain(llm)
359
+ # result = chain.astream_log({"question": query,"docs": docs,"language":"English"})
360
+ # path_answer = "/logs/StrOutputParser/streamed_output/-"
361
 
362
+ # async for op in result:
363
 
364
+ # op = op.ops[0]
365
 
366
+ # if op['path'] == path_answer: # reforulated question
367
+ # new_token = op['value'] # str
368
+ # summary += new_token
369
+ # else:
370
+ # continue
371
+ # yield df_works,network_html,summary
372
 
373
 
374
 
 
491
 
492
 
493
 
 
 
 
494
  #---------------------------------------------------------------------------------------
495
  # OTHER TABS
496
  #---------------------------------------------------------------------------------------
 
499
  with gr.Tab("Figures",elem_id = "tab-images",elem_classes = "max-height other-tabs"):
500
  gallery_component = gr.Gallery()
501
 
502
+ # with gr.Tab("Papers (beta)",elem_id = "tab-papers",elem_classes = "max-height other-tabs"):
503
 
504
+ # with gr.Row():
505
+ # with gr.Column(scale=1):
506
+ # query_papers = gr.Textbox(placeholder="Question",show_label=False,lines = 1,interactive = True,elem_id="query-papers")
507
+ # keywords_papers = gr.Textbox(placeholder="Keywords",show_label=False,lines = 1,interactive = True,elem_id="keywords-papers")
508
+ # after = gr.Slider(minimum=1950,maximum=2023,step=1,value=1960,label="Publication date",show_label=True,interactive=True,elem_id="date-papers")
509
+ # search_papers = gr.Button("Search",elem_id="search-papers",interactive=True)
510
 
511
+ # with gr.Column(scale=7):
512
 
513
+ # with gr.Tab("Summary",elem_id="papers-summary-tab"):
514
+ # papers_summary = gr.Markdown(visible=True,elem_id="papers-summary")
515
 
516
+ # with gr.Tab("Relevant papers",elem_id="papers-results-tab"):
517
+ # papers_dataframe = gr.Dataframe(visible=True,elem_id="papers-table",headers = papers_cols)
518
 
519
+ # with gr.Tab("Citations network",elem_id="papers-network-tab"):
520
+ # citations_network = gr.HTML(visible=True,elem_id="papers-citations-network")
521
 
522
 
523
 
 
537
 
538
  (textbox
539
  .submit(start_chat, [textbox,chatbot], [textbox,tabs,chatbot],queue = False,api_name = "start_chat_textbox")
540
+ .then(chat, [textbox,chatbot,dropdown_audience, dropdown_sources,dropdown_reports], [chatbot,sources_textbox,output_query,output_language,gallery_component],concurrency_limit = 8,api_name = "chat_textbox")
541
  .then(finish_chat, None, [textbox],api_name = "finish_chat_textbox")
542
  )
543
 
544
  (examples_hidden
545
  .change(start_chat, [examples_hidden,chatbot], [textbox,tabs,chatbot],queue = False,api_name = "start_chat_examples")
546
+ .then(chat, [examples_hidden,chatbot,dropdown_audience, dropdown_sources,dropdown_reports], [chatbot,sources_textbox,output_query,output_language,gallery_component],concurrency_limit = 8,api_name = "chat_examples")
547
  .then(finish_chat, None, [textbox],api_name = "finish_chat_examples")
548
  )
549
 
 
558
 
559
  dropdown_samples.change(change_sample_questions,dropdown_samples,samples)
560
 
561
+ # query_papers.submit(generate_keywords,[query_papers], [keywords_papers])
562
+ # search_papers.click(find_papers,[query_papers,keywords_papers,after], [papers_dataframe,citations_network,papers_summary])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
563
 
564
  demo.queue()
565
 
climateqa/engine/chains/answer_rag.py CHANGED
@@ -2,15 +2,11 @@ from operator import itemgetter
2
 
3
  from langchain_core.prompts import ChatPromptTemplate
4
  from langchain_core.output_parsers import StrOutputParser
5
- from langchain_core.runnables import RunnablePassthrough, RunnableLambda, RunnableBranch
6
  from langchain_core.prompts.prompt import PromptTemplate
7
  from langchain_core.prompts.base import format_document
8
 
9
- from climateqa.engine.chains.reformulation import make_reformulation_chain
10
- from climateqa.engine.prompts import answer_prompt_template,answer_prompt_without_docs_template,answer_prompt_images_template
11
- from climateqa.engine.prompts import papers_prompt_template
12
- from climateqa.engine.utils import pass_values, flatten_dict,prepare_chain,rename_chain
13
- from climateqa.engine.keywords import make_keywords_chain
14
 
15
  DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template="{page_content}")
16
 
@@ -40,105 +36,64 @@ def get_text_docs(x):
40
  def get_image_docs(x):
41
  return [doc for doc in x if doc.metadata["chunk_type"] == "image"]
42
 
43
-
44
- def make_rag_chain(retriever,llm):
45
-
46
- # Construct the prompt
47
  prompt = ChatPromptTemplate.from_template(answer_prompt_template)
48
- prompt_without_docs = ChatPromptTemplate.from_template(answer_prompt_without_docs_template)
49
-
50
- # ------- CHAIN 0 - Reformulation
51
- reformulation = make_reformulation_chain(llm)
52
- reformulation = prepare_chain(reformulation,"reformulation")
53
-
54
- # ------- Find all keywords from the reformulated query
55
- keywords = make_keywords_chain(llm)
56
- keywords = {"keywords":itemgetter("question") | keywords}
57
- keywords = prepare_chain(keywords,"keywords")
58
-
59
- # ------- CHAIN 1
60
- # Retrieved documents
61
- find_documents = {"docs": itemgetter("question") | retriever} | RunnablePassthrough()
62
- find_documents = prepare_chain(find_documents,"find_documents")
63
-
64
- # ------- CHAIN 2
65
- # Construct inputs for the llm
66
- input_documents = {
67
- "context":lambda x : _combine_documents(x["docs"]),
68
- **pass_values(["question","audience","language","keywords"])
69
- }
70
-
71
- # ------- CHAIN 3
72
- # Bot answer
73
- llm_final = rename_chain(llm,"answer")
74
-
75
- answer_with_docs = {
76
- "answer": input_documents | prompt | llm_final | StrOutputParser(),
77
- **pass_values(["question","audience","language","query","docs","keywords"]),
78
- }
79
-
80
- answer_without_docs = {
81
- "answer": prompt_without_docs | llm_final | StrOutputParser(),
82
- **pass_values(["question","audience","language","query","docs","keywords"]),
83
- }
84
-
85
- # def has_images(x):
86
- # image_docs = [doc for doc in x["docs"] if doc.metadata["chunk_type"]=="image"]
87
- # return len(image_docs) > 0
88
-
89
- def has_docs(x):
90
- return len(x["docs"]) > 0
91
 
92
- answer = RunnableBranch(
93
- (lambda x: has_docs(x), answer_with_docs),
94
- answer_without_docs,
95
- )
96
 
97
 
98
- # ------- FINAL CHAIN
99
- # Build the final chain
100
- rag_chain = reformulation | keywords | find_documents | answer
101
 
102
- return rag_chain
 
 
 
103
 
 
 
 
104
 
105
- def make_rag_papers_chain(llm):
106
 
107
- prompt = ChatPromptTemplate.from_template(papers_prompt_template)
108
 
109
- input_documents = {
110
- "context":lambda x : _combine_documents(x["docs"]),
111
- **pass_values(["question","language"])
112
- }
113
 
114
- chain = input_documents | prompt | llm | StrOutputParser()
115
- chain = rename_chain(chain,"answer")
116
 
117
- return chain
118
 
 
 
 
 
 
119
 
 
 
120
 
 
121
 
122
 
123
 
124
- def make_illustration_chain(llm):
125
 
126
- prompt_with_images = ChatPromptTemplate.from_template(answer_prompt_images_template)
127
 
128
- input_description_images = {
129
- "images":lambda x : _combine_documents(get_image_docs(x["docs"])),
130
- **pass_values(["question","audience","language","answer"]),
131
- }
132
 
133
- illustration_chain = input_description_images | prompt_with_images | llm | StrOutputParser()
134
- return illustration_chain
135
 
 
136
 
137
- def make_answer_rag_node(llm):
 
 
 
138
 
139
-
140
- def answer_rag(state):
141
- answer = "\n".join([x["question"] for x in state["questions"]])
142
- return {"answer":answer}
143
-
144
- return answer_rag
 
2
 
3
  from langchain_core.prompts import ChatPromptTemplate
4
  from langchain_core.output_parsers import StrOutputParser
 
5
  from langchain_core.prompts.prompt import PromptTemplate
6
  from langchain_core.prompts.base import format_document
7
 
8
+ from climateqa.engine.chains.prompts import answer_prompt_template,answer_prompt_without_docs_template,answer_prompt_images_template
9
+ from climateqa.engine.chains.prompts import papers_prompt_template
 
 
 
10
 
11
  DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template="{page_content}")
12
 
 
36
  def get_image_docs(x):
37
  return [doc for doc in x if doc.metadata["chunk_type"] == "image"]
38
 
39
def make_rag_chain(llm):
    """Build the answering chain used when documents are available.

    The chain input must provide ``documents``, ``query``, ``language`` and
    ``audience``. ``documents`` is passed through ``_combine_documents`` to
    produce the ``context`` prompt variable; the other three keys are
    forwarded unchanged. The mapping feeds ``answer_prompt_template``, then
    ``llm``, then ``StrOutputParser``.

    Args:
        llm: the chat model runnable placed after the prompt.

    Returns:
        The composed runnable ``inputs | prompt | llm | StrOutputParser()``.
    """
    # Prompt variables derived from the chain input.
    prompt_inputs = {
        "context":lambda x : _combine_documents(x["documents"]),
        "query":itemgetter("query"),
        "language":itemgetter("language"),
        "audience":itemgetter("audience"),
    }
    answer_prompt = ChatPromptTemplate.from_template(answer_prompt_template)
    return prompt_inputs | answer_prompt | llm | StrOutputParser()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
 
49
def make_rag_chain_without_docs(llm):
    """Build the answering chain used when no documents are provided.

    The chain input is handed directly to ``answer_prompt_without_docs_template``;
    the formatted prompt goes to ``llm`` and the model output is passed
    through ``StrOutputParser``.

    Args:
        llm: the chat model runnable placed after the prompt.

    Returns:
        The composed runnable ``prompt | llm | StrOutputParser()``.
    """
    return (
        ChatPromptTemplate.from_template(answer_prompt_without_docs_template)
        | llm
        | StrOutputParser()
    )
53
 
54
 
55
def make_rag_node(llm,with_docs = True):
    """Create an async node function that answers from the given state.

    Args:
        llm: the chat model handed to the underlying chain builder.
        with_docs: when true the chain from ``make_rag_chain`` is used,
            otherwise the one from ``make_rag_chain_without_docs``.

    Returns:
        A coroutine function ``answer_rag(state, config)`` that awaits the
        selected chain on ``state`` (forwarding ``config``) and returns
        ``{"answer": <chain output>}``.
    """
    # Pick the chain builder once; the chain is built a single time per node.
    build_chain = make_rag_chain if with_docs else make_rag_chain_without_docs
    rag_chain = build_chain(llm)

    async def answer_rag(state,config):
        return {"answer": await rag_chain.ainvoke(state,config)}

    return answer_rag
67
 
 
68
 
 
 
 
 
69
 
 
 
70
 
71
+ # def make_rag_papers_chain(llm):
72
 
73
+ # prompt = ChatPromptTemplate.from_template(papers_prompt_template)
74
+ # input_documents = {
75
+ # "context":lambda x : _combine_documents(x["docs"]),
76
+ # **pass_values(["question","language"])
77
+ # }
78
 
79
+ # chain = input_documents | prompt | llm | StrOutputParser()
80
+ # chain = rename_chain(chain,"answer")
81
 
82
+ # return chain
83
 
84
 
85
 
 
86
 
 
87
 
 
 
 
 
88
 
89
+ # def make_illustration_chain(llm):
 
90
 
91
+ # prompt_with_images = ChatPromptTemplate.from_template(answer_prompt_images_template)
92
 
93
+ # input_description_images = {
94
+ # "images":lambda x : _combine_documents(get_image_docs(x["docs"])),
95
+ # **pass_values(["question","audience","language","answer"]),
96
+ # }
97
 
98
+ # illustration_chain = input_description_images | prompt_with_images | llm | StrOutputParser()
99
+ # return illustration_chain
 
 
 
 
climateqa/engine/chains/{intent_routing.py → intent_categorization.py} RENAMED
@@ -7,7 +7,7 @@ from langchain_core.utils.function_calling import convert_to_openai_function
7
  from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
8
 
9
 
10
- class IntentRouter(BaseModel):
11
  """Analyzing the user message input"""
12
 
13
  language: str = Field(
@@ -37,31 +37,31 @@ class IntentRouter(BaseModel):
37
 
38
 
39
 
40
- def make_intent_router_chain(llm):
41
 
42
- openai_functions = [convert_to_openai_function(IntentRouter)]
43
- llm_with_router = llm.bind(functions = openai_functions,function_call={"name":"IntentRouter"})
44
 
45
  prompt = ChatPromptTemplate.from_messages([
46
  ("system", "You are a helpful assistant, you will analyze, translate and reformulate the user input message using the function provided"),
47
  ("user", "input: {input}")
48
  ])
49
 
50
- chain = prompt | llm_with_router | JsonOutputFunctionsParser()
51
  return chain
52
 
53
 
54
- def make_intent_router_node(llm):
55
 
56
- router_chain = make_intent_router_chain(llm)
57
 
58
- def route_input_message(state):
59
- output = router_chain.invoke({"input":state["user_input"]})
60
  if "language" not in output: output["language"] = "English"
61
  output["query"] = state["user_input"]
62
  return output
63
 
64
- return route_input_message
65
 
66
 
67
 
 
7
  from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
8
 
9
 
10
+ class IntentCategorizer(BaseModel):
11
  """Analyzing the user message input"""
12
 
13
  language: str = Field(
 
37
 
38
 
39
 
40
def make_intent_categorization_chain(llm):
    """Build the chain that analyzes a user message through function calling.

    ``llm`` is bound to the ``IntentCategorizer`` function schema and forced
    to call it; the function-call arguments are parsed by
    ``JsonOutputFunctionsParser``. The chain expects an ``input`` variable.
    """
    categorizer_schema = convert_to_openai_function(IntentCategorizer)
    llm_with_functions = llm.bind(
        functions=[categorizer_schema],
        function_call={"name": "IntentCategorizer"},
    )

    messages = [
        ("system", "You are a helpful assistant, you will analyze, translate and reformulate the user input message using the function provided"),
        ("user", "input: {input}"),
    ]
    prompt = ChatPromptTemplate.from_messages(messages)

    return prompt | llm_with_functions | JsonOutputFunctionsParser()
52
 
53
 
54
def make_intent_categorization_node(llm):
    """Create the graph node that categorizes the incoming user message.

    The node reads ``state["user_input"]``, runs the categorization chain on
    it and returns the chain output, completed with a default ``language``
    ("English") when the model did not provide one and with ``query`` set to
    the raw user input.
    """
    categorization_chain = make_intent_categorization_chain(llm)

    def categorize_message(state):
        user_input = state["user_input"]
        output = categorization_chain.invoke({"input": user_input})
        # Fall back to English when the model returned no language.
        output.setdefault("language", "English")
        output["query"] = user_input
        return output

    return categorize_message
65
 
66
 
67
 
climateqa/engine/chains/prompts.py CHANGED
@@ -56,7 +56,7 @@ Passages:
56
  {context}
57
 
58
  -----------------------
59
- Question: {question} - Explained to {audience}
60
  Answer in {language} with the passages citations:
61
  """
62
 
@@ -137,7 +137,7 @@ Guidelines:
137
  - If the question is not related to environmental issues, never never answer it. Say it's not your role.
138
  - Make paragraphs by starting new lines to make your answers more readable.
139
 
140
- Question: {question}
141
  Answer in {language}:
142
  """
143
 
 
56
  {context}
57
 
58
  -----------------------
59
+ Question: {query} - Explained to {audience}
60
  Answer in {language} with the passages citations:
61
  """
62
 
 
137
  - If the question is not related to environmental issues, never never answer it. Say it's not your role.
138
  - Make paragraphs by starting new lines to make your answers more readable.
139
 
140
+ Question: {query}
141
  Answer in {language}:
142
  """
143
 
climateqa/engine/chains/{query_transform.py → query_transformation.py} RENAMED
File without changes
climateqa/engine/chains/retriever.py CHANGED
@@ -45,7 +45,7 @@ def suppress_output():
45
 
46
 
47
 
48
- def make_retriever_node(vectorstore,reranker):
49
 
50
  def retrieve_documents(state):
51
 
@@ -53,15 +53,12 @@ def make_retriever_node(vectorstore,reranker):
53
  questions = state["questions"]
54
 
55
  # Use sources from the user input or from the LLM detection
56
- sources_input = state["sources_input"] if "sources_input" in state else ["auto"]
 
 
 
57
  auto_mode = "auto" in sources_input
58
-
59
- # Constants
60
- k_final = 15
61
- k_before_reranking = 100
62
- k_summary = 5
63
- rerank_by_question = True
64
-
65
  # There are several options to get the final top k
66
  # Option 1 - Get 100 documents by question and rerank by question
67
  # Option 2 - Get 100/n documents by question and rerank the total
@@ -96,9 +93,14 @@ def make_retriever_node(vectorstore,reranker):
96
  docs_question = retriever.get_relevant_documents(question)
97
 
98
  # Rerank
99
- with suppress_output():
100
- docs_question = rerank_docs(reranker,docs_question,question)
101
-
 
 
 
 
 
102
  # If rerank by question we select the top documents for each question
103
  if rerank_by_question:
104
  docs_question = docs_question[:k_by_question[i]]
@@ -112,7 +114,7 @@ def make_retriever_node(vectorstore,reranker):
112
 
113
  # Sorting the list in descending order by rerank_score
114
  # Then select the top k
115
- docs = sorted(docs, key=lambda x: x.metadata["rerank_score"], reverse=True)
116
  docs = docs[:k_final]
117
 
118
  new_state = {"documents":docs}
 
45
 
46
 
47
 
48
+ def make_retriever_node(vectorstore,reranker,rerank_by_question=True, k_final=15, k_before_reranking=100, k_summary=5):
49
 
50
  def retrieve_documents(state):
51
 
 
53
  questions = state["questions"]
54
 
55
  # Use sources from the user input or from the LLM detection
56
+ if "sources_input" not in state or state["sources_input"] is None:
57
+ sources_input = ["auto"]
58
+ else:
59
+ sources_input = state["sources_input"]
60
  auto_mode = "auto" in sources_input
61
+
 
 
 
 
 
 
62
  # There are several options to get the final top k
63
  # Option 1 - Get 100 documents by question and rerank by question
64
  # Option 2 - Get 100/n documents by question and rerank the total
 
93
  docs_question = retriever.get_relevant_documents(question)
94
 
95
  # Rerank
96
+ if reranker is not None:
97
+ with suppress_output():
98
+ docs_question = rerank_docs(reranker,docs_question,question)
99
+ else:
100
+ # Add a default reranking score
101
+ for doc in docs_question:
102
+ doc.metadata["reranking_score"] = doc.metadata["similarity_score"]
103
+
104
  # If rerank by question we select the top documents for each question
105
  if rerank_by_question:
106
  docs_question = docs_question[:k_by_question[i]]
 
114
 
115
  # Sorting the list in descending order by reranking_score
116
  # Then select the top k
117
+ docs = sorted(docs, key=lambda x: x.metadata["reranking_score"], reverse=True)
118
  docs = docs[:k_final]
119
 
120
  new_state = {"documents":docs}
climateqa/engine/graph.py CHANGED
@@ -4,14 +4,20 @@ from contextlib import contextmanager
4
 
5
  from langchain.schema import Document
6
  from langgraph.graph import END, StateGraph
 
 
7
  from typing_extensions import TypedDict
8
  from typing import List
9
 
 
 
10
  from .chains.answer_chitchat import make_chitchat_node
11
  from .chains.answer_ai_impact import make_ai_impact_node
12
- from .chains.query_transform import make_query_transform_node
13
  from .chains.translation import make_translation_node
14
- from .chains.intent_routing import make_intent_router_node
 
 
15
 
16
 
17
  class GraphState(TypedDict):
@@ -24,9 +30,109 @@ class GraphState(TypedDict):
24
  query: str
25
  questions : List[dict]
26
  answer: str
27
- audience: str
28
- sources_input: str
29
  documents: List[Document]
30
 
31
  def search(state):
32
- return {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
  from langchain.schema import Document
6
  from langgraph.graph import END, StateGraph
7
+ from langchain_core.runnables.graph import CurveStyle, NodeColors, MermaidDrawMethod
8
+
9
  from typing_extensions import TypedDict
10
  from typing import List
11
 
12
+ from IPython.display import display, HTML, Image
13
+
14
  from .chains.answer_chitchat import make_chitchat_node
15
  from .chains.answer_ai_impact import make_ai_impact_node
16
+ from .chains.query_transformation import make_query_transform_node
17
  from .chains.translation import make_translation_node
18
+ from .chains.intent_categorization import make_intent_categorization_node
19
+ from .chains.retriever import make_retriever_node
20
+ from .chains.answer_rag import make_rag_node
21
 
22
 
23
  class GraphState(TypedDict):
 
30
  query: str
31
  questions : List[dict]
32
  answer: str
33
+ audience: str = "experts"
34
+ sources_input: List[str] = ["auto"]
35
  documents: List[Document]
36
 
37
def search(state):
    """Pass-through node: returns an empty update so the state is left untouched."""
    no_update = {}
    return no_update
39
+
40
def route_intent(state):
    """Return the name of the next node according to ``state["intent"]``.

    ``chitchat`` and ``esg`` go to ``answer_chitchat``, ``ai_impact`` goes to
    ``answer_ai_impact`` and every other intent goes to ``search``.
    """
    detected = state["intent"]
    if detected in ("chitchat", "esg"):
        return "answer_chitchat"
    if detected == "ai_impact":
        return "answer_ai_impact"
    # Any other intent follows the search route.
    return "search"
49
+
50
def route_translation(state):
    """Choose the next node depending on the language stored in the state.

    English input (case-insensitive) goes straight to ``transform_query``;
    any other language goes through ``translate_query`` first.
    """
    is_english = state["language"].lower() == "english"
    return "transform_query" if is_english else "translate_query"
55
+
56
def route_based_on_relevant_docs(state,threshold_docs=0.2):
    """Route to the RAG answer with or without documents.

    Args:
        state: Graph state; ``state["documents"]`` is an iterable of documents
            whose ``metadata`` holds a ``reranking_score``.
        threshold_docs: A document counts as relevant when its reranking score
            is strictly greater than this value.

    Returns:
        ``"answer_rag"`` if at least one document is relevant, otherwise
        ``"answer_rag_no_docs"``.
    """
    # any() stops at the first relevant document instead of building a list.
    has_relevant_doc = any(
        doc.metadata["reranking_score"] > threshold_docs
        for doc in state["documents"]
    )
    return "answer_rag" if has_relevant_doc else "answer_rag_no_docs"
62
+
63
+
64
def make_id_dict(values):
    """Return an identity mapping ``{value: value}`` for every item of ``values``."""
    identity = {name: name for name in values}
    return identity
66
+
67
def make_graph_agent(llm,vectorstore,reranker,threshold_docs = 0.2):
    """Assemble and compile the question-answering state graph.

    Args:
        llm: Language model shared by every LLM-backed node.
        vectorstore: Vector store handed to the retriever node.
        reranker: Reranker handed to the retriever node.
        threshold_docs: Reranking-score threshold used to decide whether the
            retrieved documents are relevant enough for ``answer_rag``.

    Returns:
        The compiled graph.
    """
    workflow = StateGraph(GraphState)

    # Build the node callables.
    categorize_intent = make_intent_categorization_node(llm)
    transform_query = make_query_transform_node(llm)
    translate_query = make_translation_node(llm)
    answer_chitchat = make_chitchat_node(llm)
    answer_ai_impact = make_ai_impact_node(llm)
    retrieve_documents = make_retriever_node(vectorstore,reranker)
    answer_rag = make_rag_node(llm,with_docs=True)
    answer_rag_no_docs = make_rag_node(llm,with_docs=False)

    # Register the nodes.
    workflow.add_node("categorize_intent", categorize_intent)
    workflow.add_node("search", search)
    workflow.add_node("transform_query", transform_query)
    workflow.add_node("translate_query", translate_query)
    workflow.add_node("answer_chitchat", answer_chitchat)
    workflow.add_node("answer_ai_impact", answer_ai_impact)
    workflow.add_node("retrieve_documents",retrieve_documents)
    workflow.add_node("answer_rag",answer_rag)
    workflow.add_node("answer_rag_no_docs",answer_rag_no_docs)

    # Entry point
    workflow.set_entry_point("categorize_intent")

    # Conditional edges
    workflow.add_conditional_edges(
        "categorize_intent",
        route_intent,
        make_id_dict(["answer_chitchat","answer_ai_impact","search"])
    )

    workflow.add_conditional_edges(
        "search",
        route_translation,
        make_id_dict(["translate_query","transform_query"])
    )

    workflow.add_conditional_edges(
        "retrieve_documents",
        lambda x : route_based_on_relevant_docs(x,threshold_docs=threshold_docs),
        make_id_dict(["answer_rag","answer_rag_no_docs"])
    )

    # Plain edges.
    # NOTE: there is deliberately no direct "retrieve_documents" -> "answer_rag"
    # edge. The conditional edge above is the only way out of
    # "retrieve_documents"; a direct edge would schedule "answer_rag" even when
    # the router selects "answer_rag_no_docs".
    workflow.add_edge("translate_query", "transform_query")
    workflow.add_edge("transform_query", "retrieve_documents")
    workflow.add_edge("answer_rag", END)
    workflow.add_edge("answer_rag_no_docs", END)
    workflow.add_edge("answer_chitchat", END)
    workflow.add_edge("answer_ai_impact", END)

    # Compile
    app = workflow.compile()
    return app
126
+
127
+
128
+
129
+
130
def display_graph(app):
    """Render the compiled graph as a Mermaid PNG and display it.

    The PNG is produced by ``draw_mermaid_png`` with the
    ``MermaidDrawMethod.API`` draw method and shown through IPython's
    ``display``.
    """
    graph = app.get_graph(xray = True)
    png = graph.draw_mermaid_png(draw_method=MermaidDrawMethod.API)
    display(Image(png))
climateqa/engine/reranker.py CHANGED
@@ -34,7 +34,7 @@ def rerank_docs(reranker,docs,query):
34
  for result in results.results:
35
  doc_id = result.document.doc_id
36
  doc = docs[doc_id]
37
- doc.metadata["rerank_score"] = result.score
38
  doc.metadata["query_used_for_retrieval"] = query
39
  docs_reranked.append(doc)
40
  return docs_reranked
 
34
  for result in results.results:
35
  doc_id = result.document.doc_id
36
  doc = docs[doc_id]
37
+ doc.metadata["reranking_score"] = result.score
38
  doc.metadata["query_used_for_retrieval"] = query
39
  docs_reranked.append(doc)
40
  return docs_reranked
front/__init__.py ADDED
File without changes
front/callbacks.py ADDED
File without changes
front/utils.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import re
3
+
4
def make_pairs(lst):
    """Group a sequence of even length into consecutive 2-tuples.

    Example: ``[a, b, c, d]`` becomes ``[(a, b), (c, d)]``.

    Raises:
        IndexError: If ``lst`` has an odd number of items (the last item would
            have no partner).
    """
    if len(lst) % 2:
        # Fail up front with a clear message instead of part-way through pairing.
        raise IndexError(
            f"make_pairs expects an even number of items, got {len(lst)}"
        )
    return list(zip(lst[::2], lst[1::2]))
7
+
8
+
9
def serialize_docs(docs):
    """Convert document objects into plain dictionaries.

    Args:
        docs: Iterable of objects exposing ``page_content`` and ``metadata``.

    Returns:
        A list of ``{"page_content": ..., "metadata": ...}`` dictionaries, in
        the same order as ``docs``. The metadata object is referenced, not
        copied.
    """
    return [
        {"page_content": doc.page_content, "metadata": doc.metadata}
        for doc in docs
    ]
17
+
18
+
19
+
20
def parse_output_llm_with_sources(output):
    """Replace ``[Doc X]`` / ``[Doc X, Doc Y]`` citations with HTML anchors.

    Each cited document becomes a superscript link pointing to ``#docX``.
    Text outside the bracketed citations is returned unchanged, including
    text that merely starts with "Doc" (e.g. "Doctors ...").

    Args:
        output: Raw answer text produced by the LLM.

    Returns:
        The answer with every citation replaced by its anchor markup.
    """
    def _render_citation(match):
        # "Doc 1, Doc 2" -> ["1", "2"]
        doc_ids = [
            ref.lower().replace("doc","").strip()
            for ref in match.group(1).split(",")
        ]
        return "".join(
            f"""<a href="#doc{doc_id}" class="a-doc-ref" target="_self"><span class='doc-ref'><sup>{doc_id}</sup></span></a>"""
            for doc_id in doc_ids
        )

    # Only actual regex matches are rewritten; the surrounding text is never
    # inspected, so it cannot be mistaken for a citation.
    return re.sub(r'\[(Doc\s?\d+(?:,\s?Doc\s?\d+)*)\]', _render_citation, output)
34
+
35
+
36
def make_html_source(source,i):
    """Build the HTML card describing one retrieved source.

    Args:
        source: Document exposing ``page_content`` and a ``metadata`` mapping
            with the keys read below (``toc_level0``/``toc_level1``, ``name``,
            ``reranking_score``, ``chunk_type``, ``short_name``,
            ``page_number``, ``url`` and, for non-text chunks, ``figure_code``).
        i: Number shown in the card title (and used as the ``doc{i}`` id of
            text cards).

    Returns:
        The card markup as a string.
    """
    meta = source.metadata
    content = source.page_content.strip()

    # Keep the leading table-of-contents levels, stopping at the first "N/A".
    breadcrumb_parts = []
    for depth in range(2):
        toc_entry = meta[f"toc_level{depth}"]
        if toc_entry == "N/A":
            break
        breadcrumb_parts.append(toc_entry)
    toc_levels = " > ".join(breadcrumb_parts)

    name = f"<b>{toc_levels}</b><br/>{meta['name']}" if len(toc_levels) > 0 else meta['name']

    # Color class for the relevancy score.
    score = meta['reranking_score']
    if score > 0.8:
        color = "score-green"
    elif score > 0.4:
        color = "score-orange"
    else:
        color = "score-red"

    relevancy_score = f"<p class=relevancy-score>Relevancy score: <span class='{color}'>{score:.1%}</span></p>"

    if meta["chunk_type"] == "text":
        return f"""
    <div class="card" id="doc{i}">
        <div class="card-content">
            <h2>Doc {i} - {meta['short_name']} - Page {int(meta['page_number'])}</h2>
            <p>{content}</p>
            {relevancy_score}
        </div>
        <div class="card-footer">
            <span>{name}</span>
            <a href="{meta['url']}#page={int(meta['page_number'])}" target="_blank" class="pdf-link">
                <span role="img" aria-label="Open PDF">🔗</span>
            </a>
        </div>
    </div>
    """

    # Non-text chunk: rendered as an image card with an AI-generated description.
    if meta["figure_code"] != "N/A":
        title = f"{meta['figure_code']} - {meta['short_name']}"
    else:
        title = f"{meta['short_name']}"

    return f"""
    <div class="card card-image">
        <div class="card-content">
            <h2>Image {i} - {title} - Page {int(meta['page_number'])}</h2>
            <p>{content}</p>
            <p class='ai-generated'>AI-generated description</p>
            {relevancy_score}
        </div>
        <div class="card-footer">
            <span>{name}</span>
            <a href="{meta['url']}#page={int(meta['page_number'])}" target="_blank" class="pdf-link">
                <span role="img" aria-label="Open PDF">🔗</span>
            </a>
        </div>
    </div>
    """
108
+
109
+
110
+
111
def make_toolbox(tool_name,description = "",checked = False,elem_id = "toggle"):
    """Build the HTML snippet showing the status of a tool.

    Args:
        tool_name: Label displayed next to the status icon.
        description: Currently not rendered; kept in the signature for callers.
        checked: When true a checkmark is shown, otherwise a loader.
        elem_id: Value of the label's ``for`` attribute.

    Returns:
        The toolbox markup as a string.
    """
    span = "<span class='checkmark'>&#10003;</span>" if checked else "<span class='loader'></span>"

    return f"""
    <div class="dropdown">
    <label for="{elem_id}" class="dropdown-toggle">
        {span}
        {tool_name}
    </label>
    </div>
    """
requirements.txt CHANGED
@@ -2,13 +2,16 @@ gradio==4.19.1
2
  azure-storage-file-share==12.11.1
3
  azure-storage-blob
4
  python-dotenv==1.0.0
5
- langchain==0.1.4
6
- langchain_openai==0.0.6
7
- pinecone-client==3.0.2
 
8
  sentence-transformers==2.6.0
9
  huggingface-hub
10
- msal
11
  pyalex==0.13
12
  networkx==3.2.1
13
  pyvis==0.3.2
14
  flashrank==0.2.5
 
 
 
 
2
  azure-storage-file-share==12.11.1
3
  azure-storage-blob
4
  python-dotenv==1.0.0
5
+ langchain==0.2.1
6
+ langchain_openai==0.1.7
7
+ langgraph==0.0.55
8
+ pinecone-client==4.1.0
9
  sentence-transformers==2.6.0
10
  huggingface-hub
 
11
  pyalex==0.13
12
  networkx==3.2.1
13
  pyvis==0.3.2
14
  flashrank==0.2.5
15
+ rerankers==0.3.0
16
+ torch==2.3.0
17
+ nvidia-cudnn-cu12==8.9.2.26
sandbox/20240310 - CQA - Semantic Routing 1.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
style.css CHANGED
@@ -363,3 +363,105 @@ span.chatbot > p > img{
363
  .a-doc-ref{
364
  text-decoration: none !important;
365
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
363
  .a-doc-ref{
364
  text-decoration: none !important;
365
  }
366
+
367
+
368
/* Status "toolbox" shown while a tool is running or finished */
.dropdown {
  position: relative;
  display:inline-block;
  margin-bottom: 10px;
}

.dropdown-toggle {
  background-color: #f2f2f2;
  color: black;
  padding: 10px;
  font-size: 16px;
  cursor: pointer;
  width: 400px; /* Adjust width as needed */
  position: relative;
  display: flex;
  align-items: center; /* Vertically center the contents */
  justify-content: left;
}

.dropdown-toggle .caret {
  content: "";
  position: absolute;
  right: 10px;
  top: 50%;
  border-left: 5px solid transparent;
  border-right: 5px solid transparent;
  border-top: 5px solid black;
  transform: translateY(-50%);
}

/* Hide only the toggle checkbox of the dropdown, not every checkbox on the page */
.dropdown input[type="checkbox"] {
  display: none !important;
}

input[type="checkbox"]:checked + .dropdown-content {
  display: block;
}

.dropdown-content {
  display: none;
  position: absolute;
  background-color: #f9f9f9;
  min-width: 300px;
  box-shadow: 0 8px 16px 0 rgba(0,0,0,0.2);
  z-index: 1;
  padding: 12px;
  border: 1px solid #ccc;
}

input[type="checkbox"]:checked + .dropdown-toggle + .dropdown-content {
  display: block;
}

input[type="checkbox"]:checked + .dropdown-toggle .caret {
  border-top: 0;
  border-bottom: 5px solid black;
}

.loader {
  border: 1px solid #d0d0d0 !important; /* Light grey background */
  border-top: 1px solid #3498db !important; /* Blue color */
  border-right: 1px solid #3498db !important; /* Blue color */
  border-radius: 50%;
  width: 20px;
  height: 20px;
  animation: spin 2s linear infinite;
  display:inline-block;
  margin-right:10px !important;
}

.checkmark{
  color:green !important;
  font-size:18px;
  margin-right:10px !important;
}

@keyframes spin {
  0% { transform: rotate(0deg); }
  100% { transform: rotate(360deg); }
}

/* Relevancy score displayed on the source cards */
.relevancy-score{
  margin-top:10px !important;
  font-size:10px !important;
  font-style:italic;
}

.score-green{
  color:green !important;
}

.score-orange{
  color:orange !important;
}

.score-red{
  color:red !important;
}