Uploading everything from Microsoft data
- .gitattributes +7 -0
- .gitignore +398 -0
- CODE_OF_CONDUCT.md +9 -0
- LICENSE +21 -0
- Makefile +16 -0
- README.md +265 -3
- RESPONSIBLE_AI.md +41 -0
- SECURITY.md +41 -0
- demos/aquarat/.env +8 -0
- demos/aquarat/configs/prompt_library.yaml +36 -0
- demos/aquarat/configs/promptopt_config.yaml +52 -0
- demos/aquarat/configs/setup_config.yaml +14 -0
- demos/aquarat/demo.ipynb +296 -0
- demos/bbh/.env +8 -0
- demos/bbh/configs/prompt_library.yaml +36 -0
- demos/bbh/configs/promptopt_config.yaml +52 -0
- demos/bbh/configs/setup_config.yaml +14 -0
- demos/bbh/demo.ipynb +428 -0
- demos/bbh/description.py +97 -0
- demos/gsm8k/.env +8 -0
- demos/gsm8k/configs/prompt_library.yaml +36 -0
- demos/gsm8k/configs/promptopt_config.yaml +52 -0
- demos/gsm8k/configs/setup_config.yaml +14 -0
- demos/gsm8k/demo.ipynb +298 -0
- demos/scenarios/.env +8 -0
- demos/scenarios/configs/prompt_library.yaml +36 -0
- demos/scenarios/configs/promptopt_config.yaml +53 -0
- demos/scenarios/configs/setup_config.yaml +14 -0
- demos/scenarios/dataset_scenarios_demo.ipynb +1146 -0
- demos/svamp/.env +8 -0
- demos/svamp/configs/prompt_library.yaml +36 -0
- demos/svamp/configs/promptopt_config.yaml +52 -0
- demos/svamp/configs/setup_config.yaml +14 -0
- demos/svamp/demo.ipynb +295 -0
- docs/images/arithmetic_task.png +0 -0
- docs/images/bigbench.png +0 -0
- docs/images/comaprision.png +3 -0
- docs/images/cost_analysis.png +0 -0
- docs/images/curve.png +0 -0
- docs/images/github.png +0 -0
- docs/images/icl_results.png +0 -0
- docs/images/iterative_flowchart-1.png +3 -0
- docs/images/msr_blog.png +0 -0
- docs/images/overview.png +3 -0
- docs/images/ppc.png +3 -0
- docs/images/ppc_1.png +3 -0
- docs/images/prompting.png +0 -0
- docs/images/sequential_flowchart-1.png +0 -0
- docs/images/slm_prompt.png +0 -0
- docs/index.html +784 -0
.gitattributes
CHANGED
@@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+docs/images/comaprision.png filter=lfs diff=lfs merge=lfs -text
+docs/images/iterative_flowchart-1.png filter=lfs diff=lfs merge=lfs -text
+docs/images/overview.png filter=lfs diff=lfs merge=lfs -text
+docs/images/ppc_1.png filter=lfs diff=lfs merge=lfs -text
+docs/images/ppc.png filter=lfs diff=lfs merge=lfs -text
+images/iterative_flowchart-1.png filter=lfs diff=lfs merge=lfs -text
+images/overview.png filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,398 @@
## Ignore Visual Studio temporary files, build results, and
## files generated by popular Visual Studio add-ons.
##
## Get latest from https://github.com/github/gitignore/blob/main/VisualStudio.gitignore

# User-specific files
*.rsuser
*.suo
*.user
*.userosscache
*.sln.docstates

# User-specific files (MonoDevelop/Xamarin Studio)
*.userprefs

# Mono auto generated files
mono_crash.*

# Build results
[Dd]ebug/
[Dd]ebugPublic/
[Rr]elease/
[Rr]eleases/
x64/
x86/
[Ww][Ii][Nn]32/
[Aa][Rr][Mm]/
[Aa][Rr][Mm]64/
bld/
[Bb]in/
[Oo]bj/
[Ll]og/
[Ll]ogs/

# Visual Studio 2015/2017 cache/options directory
.vs/
# Uncomment if you have tasks that create the project's static files in wwwroot
#wwwroot/

# Visual Studio 2017 auto generated files
Generated\ Files/

# MSTest test Results
[Tt]est[Rr]esult*/
[Bb]uild[Ll]og.*

# NUnit
*.VisualState.xml
TestResult.xml
nunit-*.xml

# Build Results of an ATL Project
[Dd]ebugPS/
[Rr]eleasePS/
dlldata.c

# Benchmark Results
BenchmarkDotNet.Artifacts/

# .NET Core
project.lock.json
project.fragment.lock.json
artifacts/

# ASP.NET Scaffolding
ScaffoldingReadMe.txt

# StyleCop
StyleCopReport.xml

# Files built by Visual Studio
*_i.c
*_p.c
*_h.h
*.ilk
*.meta
*.obj
*.iobj
*.pch
*.pdb
*.ipdb
*.pgc
*.pgd
*.rsp
*.sbr
*.tlb
*.tli
*.tlh
*.tmp
*.tmp_proj
*_wpftmp.csproj
*.log
*.tlog
*.vspscc
*.vssscc
.builds
*.pidb
*.svclog
*.scc

# Chutzpah Test files
_Chutzpah*

# Visual C++ cache files
ipch/
*.aps
*.ncb
*.opendb
*.opensdf
*.sdf
*.cachefile
*.VC.db
*.VC.VC.opendb

# Visual Studio profiler
*.psess
*.vsp
*.vspx
*.sap

# Visual Studio Trace Files
*.e2e

# TFS 2012 Local Workspace
$tf/

# Guidance Automation Toolkit
*.gpState

# ReSharper is a .NET coding add-in
_ReSharper*/
*.[Rr]e[Ss]harper
*.DotSettings.user

# TeamCity is a build add-in
_TeamCity*

# DotCover is a Code Coverage Tool
*.dotCover

# AxoCover is a Code Coverage Tool
.axoCover/*
!.axoCover/settings.json

# Coverlet is a free, cross platform Code Coverage Tool
coverage*.json
coverage*.xml
coverage*.info

# Visual Studio code coverage results
*.coverage
*.coveragexml

# NCrunch
_NCrunch_*
.*crunch*.local.xml
nCrunchTemp_*

# MightyMoose
*.mm.*
AutoTest.Net/

# Web workbench (sass)
.sass-cache/

# Installshield output folder
[Ee]xpress/

# DocProject is a documentation generator add-in
DocProject/buildhelp/
DocProject/Help/*.HxT
DocProject/Help/*.HxC
DocProject/Help/*.hhc
DocProject/Help/*.hhk
DocProject/Help/*.hhp
DocProject/Help/Html2
DocProject/Help/html

# Click-Once directory
publish/

# Publish Web Output
*.[Pp]ublish.xml
*.azurePubxml
# Note: Comment the next line if you want to checkin your web deploy settings,
# but database connection strings (with potential passwords) will be unencrypted
*.pubxml
*.publishproj

# Microsoft Azure Web App publish settings. Comment the next line if you want to
# checkin your Azure Web App publish settings, but sensitive information contained
# in these scripts will be unencrypted
PublishScripts/

# NuGet Packages
*.nupkg
# NuGet Symbol Packages
*.snupkg
# The packages folder can be ignored because of Package Restore
**/[Pp]ackages/*
# except build/, which is used as an MSBuild target.
!**/[Pp]ackages/build/
# Uncomment if necessary however generally it will be regenerated when needed
#!**/[Pp]ackages/repositories.config
# NuGet v3's project.json files produces more ignorable files
*.nuget.props
*.nuget.targets

# Microsoft Azure Build Output
csx/
*.build.csdef

# Microsoft Azure Emulator
ecf/
rcf/

# Windows Store app package directories and files
AppPackages/
BundleArtifacts/
Package.StoreAssociation.xml
_pkginfo.txt
*.appx
*.appxbundle
*.appxupload

# Visual Studio cache files
# files ending in .cache can be ignored
*.[Cc]ache
# but keep track of directories ending in .cache
!?*.[Cc]ache/

# Others
ClientBin/
~$*
*~
*.dbmdl
*.dbproj.schemaview
*.jfm
*.pfx
*.publishsettings
orleans.codegen.cs

# Including strong name files can present a security risk
# (https://github.com/github/gitignore/pull/2483#issue-259490424)
#*.snk

# Since there are multiple workflows, uncomment next line to ignore bower_components
# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
#bower_components/

# RIA/Silverlight projects
Generated_Code/

# Backup & report files from converting an old project file
# to a newer Visual Studio version. Backup files are not needed,
# because we have git ;-)
_UpgradeReport_Files/
Backup*/
UpgradeLog*.XML
UpgradeLog*.htm
ServiceFabricBackup/
*.rptproj.bak

# SQL Server files
*.mdf
*.ldf
*.ndf

# Business Intelligence projects
*.rdl.data
*.bim.layout
*.bim_*.settings
*.rptproj.rsuser
*- [Bb]ackup.rdl
*- [Bb]ackup ([0-9]).rdl
*- [Bb]ackup ([0-9][0-9]).rdl

# Microsoft Fakes
FakesAssemblies/

# GhostDoc plugin setting file
*.GhostDoc.xml

# Node.js Tools for Visual Studio
.ntvs_analysis.dat
node_modules/

# Visual Studio 6 build log
*.plg

# Visual Studio 6 workspace options file
*.opt

# Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
*.vbw

# Visual Studio 6 auto-generated project file (contains which files were open etc.)
*.vbp

# Visual Studio 6 workspace and project file (working project files containing files to include in project)
*.dsw
*.dsp

# Visual Studio 6 technical files
*.ncb
*.aps

# Visual Studio LightSwitch build output
**/*.HTMLClient/GeneratedArtifacts
**/*.DesktopClient/GeneratedArtifacts
**/*.DesktopClient/ModelManifest.xml
**/*.Server/GeneratedArtifacts
**/*.Server/ModelManifest.xml
_Pvt_Extensions

# Paket dependency manager
.paket/paket.exe
paket-files/

# FAKE - F# Make
.fake/

# CodeRush personal settings
.cr/personal

# Python Tools for Visual Studio (PTVS)
__pycache__/
*.pyc

# Cake - Uncomment if you are using it
# tools/**
# !tools/packages.config

# Tabs Studio
*.tss

# Telerik's JustMock configuration file
*.jmconfig

# BizTalk build output
*.btp.cs
*.btm.cs
*.odx.cs
*.xsd.cs

# OpenCover UI analysis results
OpenCover/

# Azure Stream Analytics local run output
ASALocalRun/

# MSBuild Binary and Structured Log
*.binlog

# NVidia Nsight GPU debugger configuration file
*.nvuser

# MFractors (Xamarin productivity tool) working folder
.mfractor/

# Local History for Visual Studio
.localhistory/

# Visual Studio History (VSHistory) files
.vshistory/

# BeatPulse healthcheck temp database
healthchecksdb

# Backup folder for Package Reference Convert tool in Visual Studio 2017
MigrationBackup/

# Ionide (cross platform F# VS Code tools) working folder
.ionide/

# Fody - auto-generated XML schema
FodyWeavers.xsd

# VS Code files for those working on multiple tools
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
*.code-workspace

# Local History for Visual Studio Code
.history/

# Windows Installer files from build outputs
*.cab
*.msi
*.msix
*.msm
*.msp

# JetBrains Rider
*.sln.iml
CODE_OF_CONDUCT.md
ADDED
@@ -0,0 +1,9 @@
# Microsoft Open Source Code of Conduct

This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).

Resources:

- [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
- [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
- Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) Microsoft Corporation.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE
Makefile
ADDED
@@ -0,0 +1,16 @@
.PHONY: install style test

PYTHON := python
CHECK_DIRS := promptwizard tests

install:
	@${PYTHON} setup.py bdist_wheel
	@${PYTHON} -m pip install dist/sdtools*

style:
	black $(CHECK_DIRS)
	isort -rc $(CHECK_DIRS)
	flake8 $(CHECK_DIRS)

test:
	@${PYTHON} -m pytest -n auto --dist=loadfile -s -v ./tests/
README.md
CHANGED
@@ -1,3 +1,265 @@

# PromptWizard 🧙

<p align="left">
  <a href='https://arxiv.org/abs/2405.18369'>
    <img src=https://img.shields.io/badge/arXiv-2409.10566-b31b1b.svg>
  </a>
  <a href='https://www.microsoft.com/en-us/research/blog/promptwizard-the-future-of-prompt-optimization-through-feedback-driven-self-evolving-prompts/'>
    <img src=images/msr_blog.png width="16">
    Blog Post
  </a>
  <a href='https://microsoft.github.io/PromptWizard/'>
    <img src=images/github.png width="16">
    Project Website
  </a>
</p>

> **PromptWizard: Task-Aware Prompt Optimization Framework**<br>
> Eshaan Agarwal, Joykirat Singh, Vivek Dani, Raghav Magazine, Tanuja Ganu, Akshay Nambi <br>

## Overview 🌟
<p align="center">Overview of the PromptWizard framework</p>
<img src="./images/overview.png" >

PromptWizard is a discrete prompt optimization framework that employs a self-evolving mechanism where the LLM generates, critiques, and refines its own prompts and examples, continuously improving through iterative feedback and synthesis. This self-adaptive approach ensures holistic optimization by evolving both the instructions and in-context learning examples for better task performance.

The three key components of PromptWizard are the following:

- Feedback-driven refinement: the LLM generates, critiques, and refines its own prompts and examples, continuously improving through iterative feedback and synthesis
- Critique and synthesis of diverse examples: generates synthetic examples that are robust, diverse and task-aware; it also optimizes both the prompt and the examples in tandem
- Self-generated Chain of Thought (CoT) steps using a combination of positive, negative and synthetic examples

<p align="center">Stage 1: Iterative optimization of instructions</p>
<p align="center">
  <img src="./images/iterative_flowchart-1.png" width="49.5%" />
</p>

<p align="center">Stage 2: Sequential optimization of instruction and examples</p>
<p align="center">
  <img src="./images/sequential_flowchart-1.png" width="49.5%" />
</p>

## Installation ⬇️

Follow these steps to set up the development environment and install the package:

1) Clone the repository
```
git clone https://github.com/microsoft/PromptWizard
cd PromptWizard
```
2) Create and activate a virtual environment

On Windows:
```
python -m venv venv
venv\Scripts\activate
```
On macOS/Linux:
```
python -m venv venv
source venv/bin/activate
```
3) Install the package in development mode:
```
pip install -e .
```
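To sanity-check the installation, the imports used at the top of the demo notebooks should succeed:

```python
# Quick post-install check; these are the same imports the demo notebooks use.
import promptwizard
from promptwizard.glue.promptopt.instantiate import GluePromptOpt  # entry point used in the demos
```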

## Quickstart 🏃

There are three main ways to use PromptWizard:
- Scenario 1: Optimizing prompts without examples
- Scenario 2: Generating synthetic examples and using them to optimize prompts
- Scenario 3: Optimizing prompts with training data

**NOTE**: Refer to this [notebook](demos/scenarios/dataset_scenarios_demo.ipynb) for a detailed understanding of the usage for each of the scenarios. **This serves as a starting point to understand the usage of PromptWizard**

#### High level overview of using PromptWizard
- Decide your scenario
- Set the configuration and environment variables for API calling
  - Use ```promptopt_config.yaml``` to set configurations. For example, for GSM8k this [file](demos/gsm8k/configs/promptopt_config.yaml) can be used
  - Use ```.env``` to set environment variables. For GSM8k this [file](demos/gsm8k/.env) can be used
```
USE_OPENAI_API_KEY="XXXX"
# Replace with True/False based on whether or not to use an OPENAI API key

# If the first variable is set to True then fill the following two
OPENAI_API_KEY="XXXX"
OPENAI_MODEL_NAME ="XXXX"

# If the first variable is set to False then fill the following three
AZURE_OPENAI_ENDPOINT="XXXXX"
# Replace with your Azure OpenAI endpoint

OPENAI_API_VERSION="XXXX"
# Replace with the version of your API

AZURE_OPENAI_CHAT_DEPLOYMENT_NAME="XXXXX"
# Create a deployment for the model and place the deployment name here.
```
- Run the code (a minimal end-to-end sketch follows this list)
- To run PromptWizard on your custom dataset please jump [here](#run-on-custom-dataset)

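The sketch below strings these steps together for a GSM8k-style layout. The imports are taken verbatim from the demo notebooks; the ```GluePromptOpt``` constructor and ```get_best_prompt``` argument names follow the demos and should be read as indicative, not as a stable documented API:

```python
# Indicative end-to-end sketch based on the demo notebooks; the GluePromptOpt
# and get_best_prompt argument names are assumptions drawn from the demos.
from dotenv import load_dotenv
from promptwizard.glue.promptopt.instantiate import GluePromptOpt

load_dotenv(override=True)  # pick up the .env variables shown above

gp = GluePromptOpt(
    "configs/promptopt_config.yaml",  # optimization hyperparameters
    "configs/setup_config.yaml",      # experiment setup
    "data/train.jsonl",               # training data (see the custom-dataset section)
    dataset_processor,                # your DatasetSpecificProcessing subclass instance, defined below
)
best_prompt, expert_profile = gp.get_best_prompt(
    use_examples=True,                     # Scenario 3: optimize with training data
    run_without_train_examples=False,
    generate_synthetic_examples=False,
)
print(best_prompt)
```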
#### Running PromptWizard with training data (Scenario 3)
- We support the [GSM8k](https://huggingface.co/datasets/openai/gsm8k), [SVAMP](https://huggingface.co/datasets/ChilleD/SVAMP), [AQUARAT](https://huggingface.co/datasets/deepmind/aqua_rat) and [Instruction_Induction(BBII)](https://github.com/xqlin98/INSTINCT/tree/main/Induction/experiments/data/instruction_induction/raw) datasets
- Please note that the time taken for prompt optimization depends on the dataset. In our experiments on the above datasets, it took around 20-30 minutes on average.

#### Running on GSM8k (AQUARAT/SVAMP)

- Please note that this code requires access to LLMs via API calling; we support AZURE endpoints and OPENAI keys
- Set the AZURE endpoint configurations in [.env](demos/gsm8k/.env)
- Follow the steps in [demo.ipynb](demos/gsm8k/demo.ipynb) to download the data, run the prompt optimization and carry out inference.

#### Running on BBII

- BBII contains many datasets; based on the dataset, set the configs [here](demos/bbh/configs/promptopt_config.yaml)
- In the configs, ```task_description```, ```base_instruction``` and ```answer_format``` need to be changed for the different datasets in BBII; the rest of the configs remain the same
- A demo is presented in [demo.ipynb](demos/bbh/demo.ipynb)



## Run on Custom Datasets 🗃️

### Create Custom Dataset
- Our code expects the dataset to be in ```.jsonl``` file format
- Both the train and test set follow the same format
- Every sample in the ```.jsonl``` should have 2 fields (a sample line is shown below):
  1) ```question```: It should contain the complete question that is to be asked to the LLM
  2) ```answer```: It should contain the ground truth answer, which can be verbose or concise

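For illustration only, one line of such a ```train.jsonl``` could look as follows; the two field names are the required ones above, while the question and answer values are invented:

```
{"question": "A car travels 60 km in 1.5 hours. What is its average speed in km/h? A)30 B)40 C)45 D)50 E)60", "answer": "B"}
```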

### Run on Custom Dataset

NOTE: Refer to the [demos](demos) folder for example folders for four datasets. The ```.ipynb``` in each of the folders shows how to run PromptWizard on that particular dataset. A similar procedure can be followed for a new dataset. Below is a detailed explanation of each of the components of the ```.ipynb``` and the dataset-specific folder structure

#### Steps to be followed for custom datasets

1) Every new dataset needs to have the following
    - ```configs``` folder to store files for defining optimization hyperparameters and setup configs
    - ```data``` folder to store ```train.jsonl``` and ```test.jsonl``` as curated [here](#create-custom-dataset) (this is done in the notebooks)
    - ```.env``` file for environment variables to be used for API calling
    - ```.py/.ipynb``` script to run the code

2) Set the hyperparameters like number of mutations, refine steps, in-context examples etc.
    - Set the following in [promptopt_config.yaml](demos/gsm8k/configs/promptopt_config.yaml):
        - ```task_description```: Description of the task at hand, which will be fed into the prompt
            - For GSM8k a description like the following can be used
            ```
            You are a mathematics expert. You will be given a mathematics problem which you need to solve
            ```
        - ```base_instruction```: Base instruction, in line with the dataset
            - A commonly used base instruction could be
            ```
            Lets think step by step.
            ```
        - ```answer_format```: Instruction for specifying the answer format
            - It is crucial to set the ```answer_format``` properly to ensure correct extraction by ```def extract_final_answer()```
            - The answer format could be:
            ```
            At the end, wrap only your final option between <ANS_START> and <ANS_END> tags
            ```
            Then in ```def extract_final_answer()``` we can simply write code to extract the string between the tags (see the sketch after this list)
        - ```seen_set_size```: The number of train samples to be used for prompt optimization
            - In our experiments we set this to 25. In general, any number between 20-50 would work
        - ```few_shot_count```: The number of in-context examples needed in the prompt
            - The value can be set to any positive integer based on the requirement
            - For generating zero-shot prompts, set the value to a small number (i.e. between 2-5); after the final prompt is generated, the in-context examples can be removed. We suggest using some in-context examples, because during the optimization process the instructions in the prompt are refined using in-context examples, so setting it to a small number will give better zero-shot instructions in the prompt
        - ```generate_reasoning```: Whether or not to generate reasoning for the in-context examples
            - In our experiments we found it to improve the prompt overall, as it provides a step-by-step approach to reach the final answer. However, if there is a constraint on the prompt length or number of prompt tokens, it can be turned off to get smaller prompts
        - ```generate_expert_identity``` and ```generate_intent_keywords```: Having these helped improve the prompt, as they help make the prompt relevant to the task
        - Refer to the ```promptopt_config.yaml``` files in the folders present [here](demos) for the descriptions used for AQUARAT, SVAMP and GSM8k. For BBII refer to [description.py](demos/bbh/description.py), which has the meta instructions for each of the datasets
    - Following are the global parameters, which can be set based on the availability of training data
        - ```run_without_train_examples``` is a global hyperparameter which can be used when there are no training samples and in-context examples are not required in the final prompt
        - ```generate_synthetic_examples``` is a global hyperparameter which can be used when there are no training samples and we want to generate synthetic data for training
        - ```use_examples``` is a global hyperparameter which can be used to optimize prompts using training data
3) Create a dataset-specific class which inherits ```class DatasetSpecificProcessing```, similar to ```GSM8k(DatasetSpecificProcessing)``` in [demo.ipynb](demos/gsm8k/demo.ipynb), and define the following functions in it (a condensed sketch follows this list)
    1) ```def extract_answer_from_output()```: This is a dataset-specific function; given the ```answer``` from the dataset, it should extract and return a concise form of the answer. Note that based on the dataset it can also simply return the ```answer``` as-is, as in the case of the SVAMP and AQUARAT datasets
    2) ```def extract_final_answer()```: This is an LLM-output-specific function; given the verbose answer from the LLM, it should extract and return the concise final answer
    3) ```def access_answer()```: This function takes as input the LLM output, then does the following:
        - Extracts the concise answer using ```def extract_final_answer()``` from the LLM output, as defined above
        - Evaluates the extracted answer against the ground truth and returns
            - the extracted answer from the LLM output
            - a boolean value indicating whether the answer is correct or not
        - The evaluation done here is dataset specific: for datasets like GSM8k, SVAMP and AQUARAT, which have a number as the final answer, we can do a direct match between the generated answer and the ground truth, while for datasets where the answer is a sentence or paragraph it would be better to evaluate with LLM-as-a-judge, comparing the generated and ground truth paragraph/sentence. An example is available in ```def access_answer()``` in [this](demos/bbh/demo.ipynb) notebook


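A condensed sketch of steps 2 and 3, mirroring the ```extract_between``` helper and the ```AQUARAT``` class from the AQuA-RAT demo notebook reproduced later in this diff (```dataset_to_jsonl```, which curates the ```.jsonl``` files, is defined there as well and omitted here for brevity):

```python
from promptwizard.glue.promptopt.techniques.common_logic import DatasetSpecificProcessing

def extract_between(start: str, end: str, text: str) -> str:
    """Return the substring of `text` between `start` and `end`, or '' if absent."""
    start_index = text.find(start)
    if start_index == -1:
        return ""
    start_index += len(start)
    end_index = text.find(end, start_index)
    return "" if end_index == -1 else text[start_index:end_index]

class MyDataset(DatasetSpecificProcessing):
    # `dataset_to_jsonl` is omitted here; see the AQUARAT demo later in this diff.

    def extract_final_answer(self, answer: str):
        # Relies on the <ANS_START>/<ANS_END> tags requested in answer_format.
        return extract_between("<ANS_START>", "<ANS_END>", answer)

    def access_answer(self, llm_output: str, gt_answer: str):
        predicted_answer = self.extract_final_answer(llm_output)
        # Exact-match evaluation; use an LLM-as-a-judge comparison instead
        # when ground truths are verbose sentences or paragraphs.
        is_correct = bool(predicted_answer) and predicted_answer.lower() == gt_answer.lower()
        return is_correct, predicted_answer
```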
## How PromptWizard Works 🔍
- Using the problem description and initial prompt instruction, PW generates variations of the instruction by prompting LLMs to mutate it. Based on performance, the best prompt is selected. PW incorporates a critique component that provides feedback, thus guiding and refining the prompt over multiple iterations.
- PW also optimizes in-context examples. PW selects a diverse set of examples from the training data, identifying positive and negative examples based on their performance with the modified prompt. Negative examples help inform further prompt refinements.
- Examples and instructions are sequentially optimized, using the critique to generate synthetic examples that address the current prompt's weaknesses. These examples are integrated to further refine the prompt.
- PW generates detailed reasoning chains via Chain-of-Thought (CoT), enriching the prompt's capacity for problem-solving.
- PW aligns prompts with human reasoning by integrating task intent and expert personas, enhancing both model performance and interpretability.

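As a mental model only, the mutate-score-critique-refine loop above can be sketched as follows; every function here is a toy stand-in for an LLM call, not PromptWizard code:

```python
# Toy schematic of the optimization loop; NOT PromptWizard's implementation.
import random

def mutate(prompt: str) -> list[str]:
    return [f"{prompt} [thinking style {i}]" for i in range(5)]  # style variations

def score(prompt: str) -> float:
    return random.random()  # stand-in for mini-batch accuracy on train samples

def critique(prompt: str) -> str:
    return "address failure cases more explicitly"  # stand-in for LLM feedback

def refine(prompt: str, feedback: str) -> str:
    return f"{prompt} ({feedback})"

prompt = "You are a mathematics expert. Solve the given problem."
for _ in range(3):                              # cf. mutate_refine_iterations
    candidates = mutate(prompt)                 # generate instruction variations
    prompt = max(candidates, key=score)         # keep the best-scoring candidate
    prompt = refine(prompt, critique(prompt))   # critique-guided refinement
print(prompt)
```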
## Configurations ⚙️

Here we define the various hyperparameters used in the prompt optimization process, found in [promptopt_config.yaml](demos/gsm8k/configs/promptopt_config.yaml)

- ```mutate_refine_iterations```: Number of iterations for conducting mutation of the task description followed by refinement of instructions
- ```mutation_rounds```: Number of rounds of mutation to be performed when generating different styles
- ```refine_task_eg_iterations```: Number of iterations for refining the task description and in-context examples
- ```style_variation```: Number of thinking-style variations to be used in prompt mutation
- ```questions_batch_size```: Number of questions to be asked to the LLM in a single batch during the training step
- ```min_correct_count```: Minimum number of batches of questions to be answered correctly for a prompt to be considered as performing well
- ```max_eval_batches```: Maximum number of mini-batches on which we should evaluate the prompt
- ```top_n```: Number of top best prompts to be considered from the scoring stage for the next stage
- ```seen_set_size```: Number of samples from the train set to be used for training
- ```few_shot_count```: Number of in-context examples required in the final prompt

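For reference, these keys with the values used in the AQuA-RAT demo (excerpted from the ```promptopt_config.yaml``` reproduced later in this diff):

```
mutate_refine_iterations: 3
mutation_rounds: 3
refine_task_eg_iterations: 3
style_variation: 5
questions_batch_size: 1
min_correct_count: 3
max_eval_batches: 6
top_n: 1
seen_set_size: 25
few_shot_count: 5
```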
## Best Practices 💡

Following are some of the best practices we followed during our experiments
- Regarding the parameters in [promptopt_config.yaml](demos/gsm8k/configs/promptopt_config.yaml)
    - We found the best performing values for ```mutate_refine_iterations```, ```mutation_rounds``` and ```refine_task_eg_iterations``` to be 3 or 5
    - Other parameters have been set to their ideal values. ```seen_set_size``` can be increased to 50, and ```few_shot_count``` can be set based on the use case
- The prompts generated at the end of the training process are usually very detailed; however, user supervision can help tune them further for the task at hand
- Both configurations, synthetic in-context examples and in-context examples from the train set, can be tried to find the best prompt for the use case

## Results 📈

<p align="center">
  <img src="./images/curve.png" width="45%" />
</p>
<p align="center">PromptWizard consistently outperforms other methods across various thresholds, maintaining the highest p(τ) values, indicating that it consistently performs near the best possible accuracy across all tasks</p>

- The figure shows the performance profile curve for the instruction induction tasks. The performance profile curve visualizes how frequently different approaches' performance is within a given distance of the best performance. In this curve, the x-axis (τ) represents the performance ratio relative to the best-performing method, and the y-axis (p(τ)) reflects the fraction of tasks where a method's performance is within this ratio. So for a given method, the curve tells what percentage of the tasks are within τ distance of the best performance; for example, p(0.9) = 0.8 would mean the method reaches at least 90% of the best method's accuracy on 80% of the tasks.

## How to contribute: ✋
This project welcomes contributions and suggestions. Most contributions require you to agree to a Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us the rights to use your contribution. For details, visit https://cla.microsoft.com.

When you submit a pull request, a CLA-bot will automatically determine whether you need to provide a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions provided by the bot. You will only need to do this once across all repositories using our CLA.

This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact opencode@microsoft.com with any additional questions or comments.

## Citation 📝

If you make use of our work, please cite our paper:

```
@misc{agarwal2024promptwizardtaskawarepromptoptimization,
      title={PromptWizard: Task-Aware Prompt Optimization Framework},
      author={Eshaan Agarwal and Joykirat Singh and Vivek Dani and Raghav Magazine and Tanuja Ganu and Akshay Nambi},
      year={2024},
      eprint={2405.18369},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2405.18369},
}
```
## Responsible AI Considerations
For guidelines and best practices related to Responsible AI, please refer to our [Responsible AI Guidelines](RESPONSIBLE_AI.md).

RESPONSIBLE_AI.md
ADDED
@@ -0,0 +1,41 @@
### PromptWizard: Responsible AI FAQ

- What is PromptWizard?

  PromptWizard is a novel framework for prompt optimization that helps tune a good prompt for a given task and dataset, so that the LLM's output/accuracy can be optimized. PromptWizard is solely designed for research settings, and its testing has only been carried out in such environments. It should not be used in downstream applications without additional analysis and mitigation to address potential harm or bias in the proposed application. Please refer to the paper, [PromptWizard: Task-Aware Agent-driven Prompt Optimization Framework (arxiv.org)](https://arxiv.org/abs/2405.18369), for more details.

- What can PromptWizard do?

  PromptWizard is an AI-based framework that internally uses an LLM to find the optimal prompt for a given task. It takes as input a task description, the dataset format & a few training examples, and hyperparameter configurations, and outputs an optimized prompt for the given LLM and task intent.
  Unlike existing approaches, PromptWizard optimizes both prompt instructions and in-context examples, maximizing LLM performance. It iteratively refines prompts by mutating instructions and incorporating negative examples. It further enhances both instructions and examples with the aid of a critique provided by the LLM on a candidate prompt.
  New synthetic instructions and examples are generated with detailed reasoning steps using the LLM.

- What is/are PromptWizard's intended use(s)?

  Please note that PromptWizard is an open-source framework under active development and intended for use for research purposes. It should not be used in any downstream applications without additional detailed evaluation of robustness and safety issues and assessment of any potential harm or bias in the proposed application. For all GenAI applications, prompt design and tuning are tedious, skillful and laborious tasks. PromptWizard's intended use is to design and optimize the prompt, along with the few-shot examples, for a given task/domain and dataset. This well-crafted prompt enables the LLM to provide more accurate and higher-quality answers. We have also integrated the Azure AI Content Safety service to avoid/slow down malicious uses.

- How was PromptWizard evaluated? What metrics are used to measure performance?

  The PromptWizard framework is generic enough to work on any domain/dataset/task. However, we have evaluated the performance of PromptWizard across 35 tasks on 8 datasets. More details can be found in [PromptWizard: Task-Aware Agent-driven Prompt Optimization Framework (arxiv.org)](https://arxiv.org/abs/2405.18369)

  The open-source datasets used for evaluation include
  - Medical challenges ([MedQA](https://github.com/jind11/MedQA), [PubMedQA](https://pubmedqa.github.io/))
  - Commonsense reasoning ([CSQA](https://amritasaha1812.github.io/CSQA/), [SQA](https://www.microsoft.com/en-in/download/details.aspx?id=54253))
  - Math reasoning problems ([GSM8k](https://huggingface.co/datasets/openai/gsm8k))
  - Hate speech classification ([Ethos](https://link.springer.com/article/10.1007/s40747-021-00608-2))
  - Complex domain-specific tasks ([MMLU](https://huggingface.co/datasets/cais/mmlu) 6 medical tasks, [Big-Bench-Hard-23](https://huggingface.co/datasets/maveriq/bigbenchhard))

  Additionally, the team has also conducted "red team" analysis to evaluate whether PromptWizard optimizes harmful intent. With appropriate Azure content moderation deployed in the pipeline on the input to and output from PromptWizard, it didn't optimize prompts for harmful intent. Please refer to the details for Azure content moderation [here](https://learn.microsoft.com/en-us/azure/ai-services/content-moderator/overview).

- What are the limitations of PromptWizard? How can users minimize the impact of PromptWizard's limitations when using the system?

  - The framework is evaluated primarily on English language tasks, as described in the earlier section. The framework has not yet been evaluated for multilingual settings.
  - The framework generates synthetic examples for few-shot learning based on the task description. The user is required to validate the correctness and diversity of the generated synthetic examples.
  - PromptWizard utilizes existing LLMs and does not train a new model. Hence, it inherits the capabilities and limitations of its base model, as well as common limitations among other large language models or limitations caused by its training process. We therefore suggest using an appropriate base LLM suitable for your use case to work with PromptWizard.

- What operational factors and settings allow for effective and responsible use of PromptWizard?

  - Input considerations: Better performance with PromptWizard can be achieved by specifying the input components, like task and intent, as clearly and concisely as possible.
  - Human involvement: PromptWizard optimizes the prompt, with prompt instruction and few-shot examples, for the given intent and task. We suggest human oversight to review the optimized prompts before they are executed with LLMs.
  - LLMs: Users can choose the LLM that is optimized for responsible use. The default LLM is GPT-4, which inherits the existing RAI mechanisms and filters from the LLM provider. Caching is enabled by default to increase reliability and control cost. We encourage developers to review [OpenAI's Usage policies](https://openai.com/policies/usage-policies/) and [Azure OpenAI's Code of Conduct](https://learn.microsoft.com/en-us/legal/cognitive-services/openai/code-of-conduct) when using GPT-4.
  - Content Safety: We have integrated the [Azure AI Content Safety](https://learn.microsoft.com/en-us/azure/ai-services/content-safety/overview) service for content moderation. We suggest deploying PromptWizard with such a content safety system in the pipeline.
SECURITY.md
ADDED
@@ -0,0 +1,41 @@
<!-- BEGIN MICROSOFT SECURITY.MD V0.0.9 BLOCK -->

## Security

Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet) and [Xamarin](https://github.com/xamarin).

If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/security.md/definition), please report it to us as described below.

## Reporting Security Issues

**Please do not report security vulnerabilities through public GitHub issues.**

Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/security.md/msrc/create-report).

If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/security.md/msrc/pgp).

You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc).

Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:

* Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
* Full paths of source file(s) related to the manifestation of the issue
* The location of the affected source code (tag/branch/commit or direct URL)
* Any special configuration required to reproduce the issue
* Step-by-step instructions to reproduce the issue
* Proof-of-concept or exploit code (if possible)
* Impact of the issue, including how an attacker might exploit the issue

This information will help us triage your report more quickly.

If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/security.md/msrc/bounty) page for more details about our active programs.

## Preferred Languages

We prefer all communications to be in English.

## Policy

Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/security.md/cvd).

<!-- END MICROSOFT SECURITY.MD BLOCK -->
demos/aquarat/.env
ADDED
@@ -0,0 +1,8 @@
USE_OPENAI_API_KEY="False"

OPENAI_API_KEY=""
OPENAI_MODEL_NAME =""

OPENAI_API_VERSION=""
AZURE_OPENAI_ENDPOINT=""
AZURE_OPENAI_DEPLOYMENT_NAME=""
demos/aquarat/configs/prompt_library.yaml
ADDED
@@ -0,0 +1,36 @@
system_prompts: |
  You are a helpful assistant that assists research students in understanding research papers.
system_guidelines: |
  Guidelines
  - Your role must always be a helpful assistant that assists students in understanding research papers.
  - Only answer questions that are directly or indirectly related to the referenced paper(s).

mode:
  chat:
    - name: CHAT-FIRST-MESSAGE
      llm_request_type: rag-query
      prompt_template: |
        {user_msg}
      emb_model_id: text embedding ada 002 [vellm-openai2]
      llm_model_id: gpt 35 Turbo [vellm-openai2]
      prepend_system_prompts: False
      prepend_system_guidelines: False

    - name: CHAT-NEXT-MESSAGES
      llm_request_type: rag-query
      prompt_template: |
        {user_msg}
      emb_model_id: text embedding ada 002 [vellm-openai2]
      llm_model_id: gpt 35 Turbo [vellm-openai2]
      prepend_system_prompts: False
      prepend_system_guidelines: False

  generation:
    - name: FLASH_PROFILE
      prompt_template: |
        {user_msg}
      prepend_system_prompts: False
      prepend_system_guidelines: False
      llm_request_type: rag-query
      emb_model_id: text embedding ada 002 [vellm-openai2]
      llm_model_id: gpt 35 Turbo [vellm-openai2]
demos/aquarat/configs/promptopt_config.yaml
ADDED
@@ -0,0 +1,52 @@
# Specify one or more prompt refinement techniques to be used. If you specify more than one prompt refinement technique,
# all these techniques would run on the same seed data. The result, iterations needed & cost incurred for each of these
# techniques would be logged. And the winning technique for each data instance and overall would be logged.

# Supported prompt refinement techniques: Basic, RecursiveEval, MedPrompt
# Uncomment techniques that you want to use
############################ Critique Task Description Start ############################
prompt_technique_name: "critique_n_refine"
# unique_model_id of model defined in llm_config.yaml
unique_model_id: gpt-4o
# Number of iterations for conducting <mutation_rounds> rounds of mutation of task description
# followed by refinement of instructions
mutate_refine_iterations: 3
# Number of rounds of mutation to be performed when generating different styles
mutation_rounds: 3
# Refine instruction post mutation
refine_instruction: true
# Number of iterations for refining task description and in context examples for few-shot
refine_task_eg_iterations: 3
# Number of variations of prompts to generate in given iteration
style_variation: 5
# Number of questions to be asked to LLM in a single batch, during training step
questions_batch_size: 1
# Number of batches of questions to be correctly answered, for a prompt to be considered as performing good
min_correct_count: 3
# Max number of mini-batches on which we should evaluate our prompt
max_eval_batches: 6
# Number of top best performing prompts to be considered for next iterations
top_n: 1
# Description of task. This will be fed to prompt
task_description: "You are a mathematics expert. You will be given a mathematics problem which you need to solve"
# Base instruction, in line with your dataset. This will be fed to prompt
base_instruction: "Lets think step by step."
# Instruction for specifying answer format
answer_format: "At the end, wrap only your final option between <ANS_START> and <ANS_END> tags"
# Number of samples from dataset, set aside as training data. In every iteration we would be drawing
# `questions_batch_size` examples from training data with replacement.
seen_set_size: 25
# Number of examples to be given for few shots
few_shot_count: 5
# Number of synthetic training examples to be generated
num_train_examples: 20
# Generate synthetic reasoning
generate_reasoning: true
# Generate description of an expert which can solve the task at hand
generate_expert_identity: true
# Generate keywords that describe the intent of the task
generate_intent_keywords: false
############################ Critique Task Description End ############################
demos/aquarat/configs/setup_config.yaml
ADDED
@@ -0,0 +1,14 @@
assistant_llm:
  # put the unique_model_id that you specified in llm_config.yaml
  prompt_opt: gpt-4o
dir_info:
  # Base directory for everything
  base_dir: logs
  log_dir_name: glue_logs
experiment_name: aquarat
# Many features are different for mode: online/offline. For eg
# 1) Print of logs happens on console for offline mode
# 2) LLM Queue gets instantiated only in online mode
mode: offline
# Full length description of the experiment. This would be logged.
description:
demos/aquarat/demo.ipynb
ADDED
@@ -0,0 +1,296 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "695a1a9b",
   "metadata": {},
   "source": [
    "#### Set environment variables in [.env](.env) for LLM API calling"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8042a9cc",
   "metadata": {},
   "source": [
    "### Import Dependencies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f1fb3d81-16b6-4b8c-a028-880fdce5e14a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "sys.path.insert(0, \"../../\")\n",
    "import promptwizard\n",
    "from promptwizard.glue.promptopt.instantiate import GluePromptOpt\n",
    "from promptwizard.glue.promptopt.techniques.common_logic import DatasetSpecificProcessing\n",
    "from promptwizard.glue.common.utils.file import save_jsonlist\n",
    "from typing import Any\n",
    "from tqdm import tqdm\n",
    "import json\n",
    "import os\n",
    "from datasets import load_dataset\n",
    "\n",
    "from dotenv import load_dotenv\n",
    "load_dotenv(override = True)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5bbe055e",
   "metadata": {},
   "source": [
    "### Create a dataset specific class and define the required functions "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "5f325d33",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "def extract_between(start, end, text):\n",
    "    \"\"\"\n",
    "    Extracts the substring from 'text' that is between 'start' and 'end' strings.\n",
    "\n",
    "    Parameters:\n",
    "    - start (str): The starting delimiter string.\n",
    "    - end (str): The ending delimiter string.\n",
    "    - text (str): The text to search within.\n",
    "\n",
    "    Returns:\n",
    "    - str: The extracted substring between the start and end delimiters.\n",
    "    \"\"\"\n",
    "    start_index = text.find(start)\n",
    "    if start_index == -1:\n",
    "        return ''\n",
    "\n",
    "    start_index += len(start)\n",
    "\n",
    "    end_index = text.find(end, start_index)\n",
    "    if end_index == -1:\n",
    "        return ''\n",
    "    return text[start_index:end_index]\n",
    "\n",
    "class AQUARAT(DatasetSpecificProcessing):\n",
    "\n",
    "    def dataset_to_jsonl(self, dataset_jsonl: str, **kwargs: Any) -> None:\n",
    "        def extract_answer_from_output(completion):\n",
    "\n",
    "            return completion\n",
    "\n",
    "        examples_set = []\n",
    "\n",
    "        for _, sample in tqdm(enumerate(kwargs[\"dataset\"]), desc=\"Evaluating samples\"):\n",
    "            example = {\n",
    "                DatasetSpecificProcessing.QUESTION_LITERAL: sample['question'],\n",
    "                DatasetSpecificProcessing.ANSWER_WITH_REASON_LITERAL: sample['answer'],\n",
    "                DatasetSpecificProcessing.FINAL_ANSWER_LITERAL: extract_answer_from_output(sample[\"answer\"])\n",
    "            }\n",
    "            examples_set.append(example)\n",
    "\n",
    "        save_jsonlist(dataset_jsonl, examples_set, \"w\")\n",
    "\n",
    "    def extract_final_answer(self, answer: str):\n",
    "\n",
    "        final_answer = extract_between(text=answer, start=\"<ANS_START>\", end=\"<ANS_END>\")\n",
    "        return final_answer\n",
    "\n",
    "    def access_answer(self, llm_output: str, gt_answer: str):\n",
    "\n",
    "        predicted_answer = self.extract_final_answer(llm_output)\n",
    "        is_correct = False\n",
    "        if predicted_answer and (predicted_answer.lower() == gt_answer.lower()):\n",
    "            is_correct = True\n",
    "\n",
    "        return is_correct, predicted_answer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "f384eb57",
   "metadata": {},
   "outputs": [],
   "source": [
    "aquarat_processor = AQUARAT()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "976681bd-4f43-4dbc-947e-cdb94d4824f0",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "if not os.path.exists(\"data\"):\n",
    "    os.mkdir(\"data\")\n",
    "dataset = load_dataset(\"deepmind/aqua_rat\", \"raw\")\n",
    "num_samples = 1\n",
    "for dataset_type in ['train', 'test']:\n",
    "    data_list = []\n",
    "    for data in dataset[dataset_type]:\n",
    "        options = data['options'][0]\n",
    "        for i in range(1, len(data['options'])):\n",
    "            options = options + \" \" + data['options'][i]\n",
    "        data_list.append({\"question\": data['question'] + \"\\n\" + options, \"answer\": data['correct']})\n",
    "        if num_samples == 100 and dataset_type == 'train':  # We sample only 100 train examples and use 25 out of them for training randomly\n",
    "            break\n",
    "        num_samples += 1\n",
    "    aquarat_processor.dataset_to_jsonl(\"data/\" + dataset_type + '.jsonl', dataset=data_list)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "db891c34",
   "metadata": {},
   "source": [
    "### Set paths"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "f43482f1-3e10-4cf7-8ea6-ff42c04067a6",
   "metadata": {},
   "outputs": [],
164 |
+
"source": [
|
165 |
+
"train_file_name = os.path.join(\"data\", \"train.jsonl\")\n",
|
166 |
+
"test_file_name = os.path.join(\"data\", \"test.jsonl\")\n",
|
167 |
+
"path_to_config = \"configs\"\n",
|
168 |
+
"llm_config_path = os.path.join(path_to_config, \"llm_config.yaml\")\n",
|
169 |
+
"promptopt_config_path = os.path.join(path_to_config, \"promptopt_config.yaml\")\n",
|
170 |
+
"setup_config_path = os.path.join(path_to_config, \"setup_config.yaml\")"
|
171 |
+
]
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"cell_type": "markdown",
|
175 |
+
"id": "26ba1a62",
|
176 |
+
"metadata": {},
|
177 |
+
"source": [
|
178 |
+
"### Create an object for calling prompt optimization and inference functionalities"
|
179 |
+
]
|
180 |
+
},
|
181 |
+
{
|
182 |
+
"cell_type": "code",
|
183 |
+
"execution_count": null,
|
184 |
+
"id": "8af4246f-db32-4b37-a73a-f9e2e5125d09",
|
185 |
+
"metadata": {},
|
186 |
+
"outputs": [],
|
187 |
+
"source": [
|
188 |
+
"gp = GluePromptOpt(promptopt_config_path,\n",
|
189 |
+
" setup_config_path,\n",
|
190 |
+
" train_file_name,\n",
|
191 |
+
" aquarat_processor)"
|
192 |
+
]
|
193 |
+
},
|
194 |
+
{
|
195 |
+
"cell_type": "markdown",
|
196 |
+
"id": "6b25843b",
|
197 |
+
"metadata": {},
|
198 |
+
"source": [
|
199 |
+
"### Call prompt optmization function\n",
|
200 |
+
"1. ```use_examples``` can be used when there are training samples and a mixture of real and synthetic in-context examples are required in the final prompt. When set to ```False``` all the in-context examples will be real\n",
|
201 |
+
"2. ```generate_synthetic_examples``` can be used when there are no training samples and we want to generate synthetic examples \n",
|
202 |
+
"3. ```run_without_train_examples``` can be used when there are no training samples and in-context examples are not required in the final prompt "
|
203 |
+
]
|
204 |
+
},
|
205 |
+
{
|
206 |
+
"cell_type": "code",
|
207 |
+
"execution_count": null,
|
208 |
+
"id": "573c6151-2c03-45d9-9904-1724a1e20f1b",
|
209 |
+
"metadata": {
|
210 |
+
"scrolled": true
|
211 |
+
},
|
212 |
+
"outputs": [],
|
213 |
+
"source": [
|
214 |
+
"# Function call to generate optimal prompt and expert profile \n",
|
215 |
+
"best_prompt, expert_profile = gp.get_best_prompt(use_examples=True,run_without_train_examples=False,generate_synthetic_examples=False)"
|
216 |
+
]
|
217 |
+
},
|
218 |
+
{
|
219 |
+
"cell_type": "markdown",
|
220 |
+
"id": "97549dd2",
|
221 |
+
"metadata": {},
|
222 |
+
"source": [
|
223 |
+
"### Save the optimized prompt and expert profile"
|
224 |
+
]
|
225 |
+
},
|
226 |
+
{
|
227 |
+
"cell_type": "code",
|
228 |
+
"execution_count": null,
|
229 |
+
"id": "34a716af-0d77-4c7d-b1c2-6438d66096ce",
|
230 |
+
"metadata": {
|
231 |
+
"scrolled": true
|
232 |
+
},
|
233 |
+
"outputs": [],
|
234 |
+
"source": [
|
235 |
+
"import pickle \n",
|
236 |
+
"\n",
|
237 |
+
"if not os.path.exists(\"results\"):\n",
|
238 |
+
" os.system(\"mkdir results\")\n",
|
239 |
+
"\n",
|
240 |
+
"with open(\"results/best_prompt.pkl\", 'wb') as f:\n",
|
241 |
+
" pickle.dump(best_prompt, f)\n",
|
242 |
+
"with open(\"results/expert_profile.pkl\", 'wb') as f:\n",
|
243 |
+
" pickle.dump(expert_profile, f)\n",
|
244 |
+
"\n",
|
245 |
+
"print(f\"Best prompt: {best_prompt} \\nExpert profile: {expert_profile}\")"
|
246 |
+
]
|
247 |
+
},
|
248 |
+
{
|
249 |
+
"cell_type": "markdown",
|
250 |
+
"id": "bdbb7e07",
|
251 |
+
"metadata": {},
|
252 |
+
"source": [
|
253 |
+
"### Evaluate the optimized prompt"
|
254 |
+
]
|
255 |
+
},
|
256 |
+
{
|
257 |
+
"cell_type": "code",
|
258 |
+
"execution_count": null,
|
259 |
+
"id": "c49b5711-82dd-4d18-8cd4-ee447cf8d74c",
|
260 |
+
"metadata": {
|
261 |
+
"scrolled": true
|
262 |
+
},
|
263 |
+
"outputs": [],
|
264 |
+
"source": [
|
265 |
+
"gp.EXPERT_PROFILE = expert_profile\n",
|
266 |
+
"gp.BEST_PROMPT = best_prompt\n",
|
267 |
+
"\n",
|
268 |
+
"# Function call to evaluate the prompt\n",
|
269 |
+
"accuracy = gp.evaluate(test_file_name)\n",
|
270 |
+
"\n",
|
271 |
+
"print(f\"Final Accuracy: {accuracy}\")"
|
272 |
+
]
|
273 |
+
}
|
274 |
+
],
|
275 |
+
"metadata": {
|
276 |
+
"kernelspec": {
|
277 |
+
"display_name": "PromptWizard",
|
278 |
+
"language": "python",
|
279 |
+
"name": "python3"
|
280 |
+
},
|
281 |
+
"language_info": {
|
282 |
+
"codemirror_mode": {
|
283 |
+
"name": "ipython",
|
284 |
+
"version": 3
|
285 |
+
},
|
286 |
+
"file_extension": ".py",
|
287 |
+
"mimetype": "text/x-python",
|
288 |
+
"name": "python",
|
289 |
+
"nbconvert_exporter": "python",
|
290 |
+
"pygments_lexer": "ipython3",
|
291 |
+
"version": "3.12.4"
|
292 |
+
}
|
293 |
+
},
|
294 |
+
"nbformat": 4,
|
295 |
+
"nbformat_minor": 5
|
296 |
+
}
|
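For quick reference, the notebook above reduces to the following flow (a condensed sketch using the same PromptWizard calls as the cells; `aquarat_processor` is the `AQUARAT()` instance defined in the notebook, and the config paths are the demo's own):

```python
# Condensed flow of the AQUA-RAT demo above (same PromptWizard calls as the cells).
import os

train_file_name = os.path.join("data", "train.jsonl")
test_file_name = os.path.join("data", "test.jsonl")

gp = GluePromptOpt("configs/promptopt_config.yaml",
                   "configs/setup_config.yaml",
                   train_file_name,
                   aquarat_processor)  # the AQUARAT() instance from the notebook

# Optimize: mix real and synthetic in-context examples drawn from train.jsonl
best_prompt, expert_profile = gp.get_best_prompt(
    use_examples=True,
    run_without_train_examples=False,
    generate_synthetic_examples=False,
)

# Evaluate the winning prompt on the held-out test split
gp.EXPERT_PROFILE = expert_profile
gp.BEST_PROMPT = best_prompt
print(f"Final Accuracy: {gp.evaluate(test_file_name)}")
```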
demos/bbh/.env
ADDED
@@ -0,0 +1,8 @@
+USE_OPENAI_API_KEY="False"
+
+OPENAI_API_KEY=""
+OPENAI_MODEL_NAME=""
+
+OPENAI_API_VERSION=""
+AZURE_OPENAI_ENDPOINT=""
+AZURE_OPENAI_DEPLOYMENT_NAME=""
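Each demo notebook loads these variables with `load_dotenv(override=True)`. A minimal sketch of how the switch between key-based OpenAI and Azure OpenAI access is typically consumed (the variable names come from this file; the branching itself is an assumption about the wiring, not repo code):

```python
# Sketch: consuming the .env switches above (assumed wiring, not repo code).
import os
from dotenv import load_dotenv

load_dotenv(override=True)  # same call the notebooks make

if os.getenv("USE_OPENAI_API_KEY") == "True":
    # Key-based OpenAI access
    api_key = os.getenv("OPENAI_API_KEY")
    model_name = os.getenv("OPENAI_MODEL_NAME")
else:
    # Azure OpenAI deployment
    api_version = os.getenv("OPENAI_API_VERSION")
    endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
    deployment = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME")
```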
demos/bbh/configs/prompt_library.yaml
ADDED
@@ -0,0 +1,36 @@
+system_prompts: |
+  You are a helpful assistant that assists research students in understanding research papers.
+system_guidelines: |
+  Guidelines
+  - Your role must always be a helpful assistant that assists students in understanding research papers.
+  - Only answer questions that are directly or indirectly related to the referenced paper(s).
+
+mode:
+  chat:
+    - name: CHAT-FIRST-MESSAGE
+      llm_request_type: rag-query
+      prompt_template: |
+        {user_msg}
+      emb_model_id: text embedding ada 002 [vellm-openai2]
+      llm_model_id: gpt 35 Turbo [vellm-openai2]
+      prepend_system_prompts: False
+      prepend_system_guidelines: False
+
+    - name: CHAT-NEXT-MESSAGES
+      llm_request_type: rag-query
+      prompt_template: |
+        {user_msg}
+      emb_model_id: text embedding ada 002 [vellm-openai2]
+      llm_model_id: gpt 35 Turbo [vellm-openai2]
+      prepend_system_prompts: False
+      prepend_system_guidelines: False
+
+  generation:
+    - name: FLASH_PROFILE
+      prompt_template: |
+        {user_msg}
+      prepend_system_prompts: False
+      prepend_system_guidelines: False
+      llm_request_type: rag-query
+      emb_model_id: text embedding ada 002 [vellm-openai2]
+      llm_model_id: gpt 35 Turbo [vellm-openai2]
demos/bbh/configs/promptopt_config.yaml
ADDED
@@ -0,0 +1,52 @@
+# Specify one or more prompt refinement techniques to be used. If you specify more than one,
+# all of them run on the same seed data; the result, iterations needed and cost incurred for
+# each technique are logged, along with the winning technique per data instance and overall.
+
+# Supported prompt refinement techniques: Basic, RecursiveEval, MedPrompt
+# Uncomment the techniques that you want to use
+############################ Critique Task Description Start ############################
+prompt_technique_name: "critique_n_refine"
+# unique_model_id of the model defined in llm_config.yaml
+unique_model_id: gpt-4o
+# Number of iterations of <mutation_rounds> rounds of task-description mutation,
+# each followed by refinement of instructions
+mutate_refine_iterations: 3
+# Number of rounds of mutation to be performed when generating different styles
+mutation_rounds: 3
+# Refine the instruction after mutation
+refine_instruction: true
+# Number of iterations for refining the task description and in-context examples for few-shot
+refine_task_eg_iterations: 3
+# Number of prompt variations to generate in a given iteration
+style_variation: 5
+# Number of questions asked to the LLM in a single batch during the training step
+questions_batch_size: 1
+# Number of question batches that must be answered correctly for a prompt to count as performing well
+min_correct_count: 3
+# Max number of mini-batches on which we should evaluate our prompt
+max_eval_batches: 6
+# Number of top best-performing prompts to be considered for the next iterations
+top_n: 1
+# Description of the task. This will be fed to the prompt
+task_description: 'Extract the second letter from the input word.'
+# Base instruction, in line with your dataset. This will be fed to the prompt
+base_instruction: 'Output the second letter. Think step by step to arrive at the solution.'
+# Instruction for specifying the answer format
+answer_format: 'For each input word, present the reasoning followed by the extracted letter (only a single letter) between <ANS_START> and <ANS_END> tags'
+# Number of samples from the dataset set aside as training data. In every iteration we draw
+# `questions_batch_size` examples from the training data with replacement.
+seen_set_size: 25
+# Number of examples to be given as few-shot examples
+few_shot_count: 5
+# Number of synthetic training examples to be generated
+num_train_examples: 20
+# Generate synthetic reasoning
+generate_reasoning: true
+# Generate a description of an expert who can solve the task at hand
+generate_expert_identity: true
+# Generate keywords that describe the intent of the task
+generate_intent_keywords: false
+############################ Critique Task Description End ############################
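The fields above are what the critique-and-refine loop consumes: `mutation_rounds` style mutations per `mutate_refine_iterations` pass, mini-batches of `questions_batch_size` questions scored until `min_correct_count` correct batches or `max_eval_batches` is reached, and the `task_description`/`base_instruction`/`answer_format` triple seeding the prompt. A small sketch for inspecting the file with PyYAML (an assumption for illustration; the framework uses its own config loader):

```python
# Sketch: spot-checking promptopt_config.yaml with PyYAML (assumed available).
import yaml

with open("configs/promptopt_config.yaml") as f:
    cfg = yaml.safe_load(f)

# The knobs that drive the search
print(cfg["prompt_technique_name"])                              # critique_n_refine
print(cfg["mutate_refine_iterations"], cfg["mutation_rounds"])   # 3 3
print(cfg["seen_set_size"], cfg["few_shot_count"])               # 25 5
```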
demos/bbh/configs/setup_config.yaml
ADDED
@@ -0,0 +1,14 @@
+assistant_llm:
+  # put the unique_model_id that you specified in llm_config.yaml
+  prompt_opt: gpt-4o
+dir_info:
+  # Base directory for everything
+  base_dir: logs
+  log_dir_name: glue_logs
+experiment_name: bbh
+# Many features differ between mode: online/offline. E.g.
+# 1) Logs are printed to the console in offline mode
+# 2) The LLM queue gets instantiated only in online mode
+mode: offline
+# Full-length description of the experiment. This will be logged.
+description:
demos/bbh/demo.ipynb
ADDED
@@ -0,0 +1,428 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "ece8514e",
+   "metadata": {},
+   "source": [
+    "#### Set environment variables in [.env](.env) for LLM API calling"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "678ed8db",
+   "metadata": {},
+   "source": [
+    "### Import Dependencies"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f1fb3d81-16b6-4b8c-a028-880fdce5e14a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "sys.path.insert(0, \"../../\")\n",
+    "import promptwizard\n",
+    "from promptwizard.glue.promptopt.instantiate import GluePromptOpt\n",
+    "from promptwizard.glue.promptopt.techniques.common_logic import DatasetSpecificProcessing\n",
+    "from promptwizard.glue.common.utils.file import save_jsonlist\n",
+    "from typing import Any\n",
+    "from tqdm import tqdm\n",
+    "import json\n",
+    "import os\n",
+    "from azure.identity import get_bearer_token_provider, AzureCliCredential\n",
+    "from openai import AzureOpenAI\n",
+    "\n",
+    "from dotenv import load_dotenv\n",
+    "load_dotenv(override=True)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "dc9b746c",
+   "metadata": {},
+   "source": [
+    "### The code below can be used for LLM-as-a-judge evaluation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "26719362",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def extract_between(start, end, text):\n",
+    "    \"\"\"\n",
+    "    Extract the substring of 'text' that lies between the 'start' and 'end' strings.\n",
+    "\n",
+    "    Parameters:\n",
+    "    - start (str): The starting delimiter string.\n",
+    "    - end (str): The ending delimiter string.\n",
+    "    - text (str): The text to search within.\n",
+    "\n",
+    "    Returns:\n",
+    "    - str: The extracted substring between the start and end delimiters.\n",
+    "    \"\"\"\n",
+    "    start_index = text.find(start)\n",
+    "    if start_index == -1:\n",
+    "        return ''\n",
+    "\n",
+    "    start_index += len(start)\n",
+    "\n",
+    "    end_index = text.find(end, start_index)\n",
+    "    if end_index == -1:\n",
+    "        return ''\n",
+    "    return text[start_index:end_index]\n",
+    "\n",
+    "def call_api(messages):\n",
+    "    token_provider = get_bearer_token_provider(\n",
+    "        AzureCliCredential(), \"https://cognitiveservices.azure.com/.default\"\n",
+    "    )\n",
+    "    client = AzureOpenAI(\n",
+    "        api_version=\"<OPENAI_API_VERSION>\",\n",
+    "        azure_endpoint=\"<AZURE_ENDPOINT>\",\n",
+    "        azure_ad_token_provider=token_provider\n",
+    "    )\n",
+    "    response = client.chat.completions.create(\n",
+    "        model=\"<MODEL_DEPLOYMENT_NAME>\",\n",
+    "        messages=messages,\n",
+    "        temperature=0.0,\n",
+    "    )\n",
+    "    prediction = response.choices[0].message.content\n",
+    "    return prediction\n",
+    "\n",
+    "def llm_eval(predicted_answer, gt_answer):\n",
+    "    EVAL_PROMPT = f\"\"\"Given the Predicted_Answer and Reference_Answer, compare them and check whether they mean the same.\n",
+    "    If they mean the same, return True between <ANS_START> and <ANS_END> tags;\n",
+    "    if they differ in meaning, return False between <ANS_START> and <ANS_END> tags.\n",
+    "    The following are given:\n",
+    "    Predicted_Answer: {predicted_answer}\n",
+    "    Reference_Answer: {gt_answer}\"\"\"\n",
+    "    messages = [\n",
+    "        {\"role\": \"system\", \"content\": \"\"},\n",
+    "        {\"role\": \"user\", \"content\": EVAL_PROMPT}\n",
+    "    ]\n",
+    "\n",
+    "    response = call_api(messages)\n",
+    "    final_judgement = extract_between(start=\"<ANS_START>\", end=\"<ANS_END>\", text=response)\n",
+    "    return final_judgement == \"True\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4a5084d7",
+   "metadata": {},
+   "source": [
+    "### Create a dataset-specific class and define the required functions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "5f325d33",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "llm_as_judge_eval = True\n",
+    "\n",
+    "class BBH(DatasetSpecificProcessing):\n",
+    "\n",
+    "    def dataset_to_jsonl(self, dataset_jsonl: str, **kwargs: Any) -> None:\n",
+    "        def extract_answer_from_output(completion):\n",
+    "            return completion\n",
+    "\n",
+    "        examples_set = []\n",
+    "\n",
+    "        for _, sample in tqdm(enumerate(kwargs[\"dataset\"]), desc=\"Evaluating samples\"):\n",
+    "            example = {\n",
+    "                DatasetSpecificProcessing.QUESTION_LITERAL: sample['question'],\n",
+    "                DatasetSpecificProcessing.ANSWER_WITH_REASON_LITERAL: sample['answer'],\n",
+    "                DatasetSpecificProcessing.FINAL_ANSWER_LITERAL: extract_answer_from_output(sample[\"answer\"])\n",
+    "            }\n",
+    "            examples_set.append(example)\n",
+    "\n",
+    "        save_jsonlist(dataset_jsonl, examples_set, \"w\")\n",
+    "\n",
+    "    def extract_final_answer(self, answer: str):\n",
+    "        final_answer = extract_between(text=answer, start=\"<ANS_START>\", end=\"<ANS_END>\")\n",
+    "        return final_answer\n",
+    "\n",
+    "    def access_answer(self, llm_output: str, gt_answer: str):\n",
+    "        predicted_answer = self.extract_final_answer(llm_output)\n",
+    "        is_correct = False\n",
+    "        if llm_as_judge_eval:\n",
+    "            if llm_eval(predicted_answer, gt_answer):\n",
+    "                is_correct = True\n",
+    "        else:\n",
+    "            if predicted_answer and (predicted_answer.lower() == gt_answer.lower()):\n",
+    "                is_correct = True\n",
+    "\n",
+    "        return is_correct, predicted_answer"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "f384eb57",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "bbh_processor = BBH()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ec7d1396",
+   "metadata": {},
+   "source": [
+    "### Load and save the dataset\n",
+    "Set the ```dataset_to_run``` variable to choose one of the 19 BBII datasets to run the optimization on"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "976681bd-4f43-4dbc-947e-cdb94d4824f0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if not os.path.exists(\"data\"):\n",
+    "    os.mkdir(\"data\")\n",
+    "dataset_list = ['informal_to_formal','letters_list','negation','orthography_starts_with','rhymes','second_word_letter','sum','diff','sentence_similarity','taxonomy_animal','auto_categorization','object_counting','odd_one_out','antonyms','word_unscrambling','cause_and_effect','common_concept','word_sorting','synonyms']\n",
+    "\n",
+    "# Set the dataset on which to run optimization out of the 19\n",
+    "dataset_to_run = 'second_word_letter'\n",
+    "\n",
+    "if not os.path.exists(\"data/\" + dataset_to_run):\n",
+    "    os.mkdir(\"data/\" + dataset_to_run)\n",
+    "\n",
+    "os.system(\"git clone https://github.com/xqlin98/INSTINCT\")\n",
+    "\n",
+    "import random\n",
+    "\n",
+    "for mode in ['execute', 'induce']:\n",
+    "    for dataset in dataset_list:\n",
+    "\n",
+    "        if dataset_to_run == dataset:\n",
+    "            data_list = []\n",
+    "\n",
+    "            file_path = 'INSTINCT/Induction/experiments/data/instruction_induction/raw/' + mode + '/' + dataset + '.json'\n",
+    "            with open(file_path, 'r') as file:\n",
+    "                data = json.load(file)\n",
+    "\n",
+    "            save_file_path = 'test.jsonl'\n",
+    "            if mode == 'execute':\n",
+    "                save_file_path = 'train.jsonl'\n",
+    "\n",
+    "            for key, sample in data['examples'].items():\n",
+    "                task = dataset\n",
+    "                if task == 'cause_and_effect':\n",
+    "                    cause = sample[\"cause\"]\n",
+    "                    effect = sample[\"effect\"]\n",
+    "                    pair = [cause, effect]\n",
+    "                    random.shuffle(pair)\n",
+    "                    question = f\"Sentence 1: {pair[0]} Sentence 2: {pair[1]}\"\n",
+    "                    answer = cause\n",
+    "                elif task == 'antonyms':\n",
+    "                    question = sample[\"input\"]\n",
+    "                    answer = sample[\"output\"]\n",
+    "                elif task == 'common_concept':\n",
+    "                    concept = sample[\"concept\"]\n",
+    "                    items = sample[\"items\"]\n",
+    "                    input = \", \".join(items)\n",
+    "                    question = f\"Objects: {input}\"\n",
+    "                    answer = f\"{concept}\"\n",
+    "                elif task == 'diff':\n",
+    "                    input = sample[\"input\"]\n",
+    "                    output = sample[\"output\"]\n",
+    "                    question = f\"{input}\"\n",
+    "                    answer = f\"{output}\"\n",
+    "                elif task == 'informal_to_formal':\n",
+    "                    informal = sample[\"input\"]\n",
+    "                    formal = sample[\"output\"]\n",
+    "                    question = f\"{informal}\"\n",
+    "                    answer = f\"{formal}\"\n",
+    "                elif task in ('synonyms', 'word_unscrambling', 'word_sorting', 'letters_list', 'negation', 'orthography_starts_with', 'second_word_letter', 'sentence_similarity', 'sum', 'taxonomy_animal', 'auto_categorization', 'object_counting', 'odd_one_out'):\n",
+    "                    informal = sample[\"input\"]\n",
+    "                    formal = sample[\"output\"]\n",
+    "                    question = f\"{informal}\"\n",
+    "                    answer = f\"{formal}\"\n",
+    "                elif task == 'rhymes':\n",
+    "                    input = sample[\"input\"]\n",
+    "                    output = sample[\"other_rhymes\"]\n",
+    "                    output = \", \".join(output)\n",
+    "                    question = f\"{input}\"\n",
+    "                    answer = f\"{output}\"\n",
+    "\n",
+    "                data_list.append({\"question\": question, \"answer\": answer})\n",
+    "            bbh_processor.dataset_to_jsonl(\"data/\" + dataset + \"/\" + save_file_path, dataset=data_list)\n",
+    "\n",
+    "os.system(\"rm -r INSTINCT\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "fe28a967",
+   "metadata": {},
+   "source": [
+    "### Set paths"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "f43482f1-3e10-4cf7-8ea6-ff42c04067a6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_file_name = os.path.join(\"data/\" + dataset_to_run, \"train.jsonl\")\n",
+    "test_file_name = os.path.join(\"data/\" + dataset_to_run, \"test.jsonl\")\n",
+    "path_to_config = \"configs\"\n",
+    "llm_config_path = os.path.join(path_to_config, \"llm_config.yaml\")\n",
+    "promptopt_config_path = os.path.join(path_to_config, \"promptopt_config.yaml\")\n",
+    "setup_config_path = os.path.join(path_to_config, \"setup_config.yaml\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "75ac5780",
+   "metadata": {},
+   "source": [
+    "### Create an object for calling prompt optimization and inference functionalities"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8af4246f-db32-4b37-a73a-f9e2e5125d09",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "gp = GluePromptOpt(promptopt_config_path,\n",
+    "                   setup_config_path,\n",
+    "                   train_file_name,\n",
+    "                   bbh_processor)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9a26af0d",
+   "metadata": {},
+   "source": [
+    "### Call prompt optimization function\n",
+    "1. ```use_examples``` can be used when there are training samples and a mixture of real and synthetic in-context examples is required in the final prompt. When set to ```False```, all the in-context examples will be real.\n",
+    "2. ```generate_synthetic_examples``` can be used when there are no training samples and we want to generate synthetic examples.\n",
+    "3. ```run_without_train_examples``` can be used when there are no training samples and in-context examples are not required in the final prompt."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "573c6151-2c03-45d9-9904-1724a1e20f1b",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "# Function call to generate the optimal prompt and expert profile\n",
+    "best_prompt, expert_profile = gp.get_best_prompt(use_examples=True, run_without_train_examples=False, generate_synthetic_examples=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ef923b11",
+   "metadata": {},
+   "source": [
+    "### Save the optimized prompt and expert profile"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "34a716af-0d77-4c7d-b1c2-6438d66096ce",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "import pickle\n",
+    "\n",
+    "if not os.path.exists(\"results\"):\n",
+    "    os.mkdir(\"results\")\n",
+    "\n",
+    "with open(\"results/best_prompt.pkl\", 'wb') as f:\n",
+    "    pickle.dump(best_prompt, f)\n",
+    "with open(\"results/expert_profile.pkl\", 'wb') as f:\n",
+    "    pickle.dump(expert_profile, f)\n",
+    "\n",
+    "print(f\"Best prompt: {best_prompt} \\nExpert profile: {expert_profile}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1942c67e",
+   "metadata": {},
+   "source": [
+    "### Evaluate the optimized prompt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c49b5711-82dd-4d18-8cd4-ee447cf8d74c",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "gp.EXPERT_PROFILE = expert_profile\n",
+    "gp.BEST_PROMPT = best_prompt\n",
+    "\n",
+    "# Function call to evaluate the prompt\n",
+    "accuracy = gp.evaluate(test_file_name)\n",
+    "\n",
+    "print(f\"Final Accuracy: {accuracy}\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "PromptWizard",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
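Since `call_api` in this notebook needs the `<OPENAI_API_VERSION>`, `<AZURE_ENDPOINT>` and `<MODEL_DEPLOYMENT_NAME>` placeholders filled in, here is a hedged sketch of exercising the judge helpers once that is done (`extract_between` and `llm_eval` are the functions defined in the cells above; the sample answers are made up):

```python
# Pure string helper: safe to run without any API access.
assert extract_between("<ANS_START>", "<ANS_END>", "x<ANS_START>True<ANS_END>y") == "True"

# Judge call: requires the AzureOpenAI placeholders in call_api to be filled in first.
# Expected to return True when the answers agree in meaning, e.g. "a dozen" vs "12".
print(llm_eval(predicted_answer="a dozen", gt_answer="12"))
```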
demos/bbh/description.py
ADDED
@@ -0,0 +1,97 @@
+# BBH Datasets
+# informal_to_formal
+task_description = 'In this task, you will be given a sentence in an informal style. Your job is to rewrite the sentence in a formal style.'
+base_instruction = 'For each given sentence, provide a formal paraphrase.'
+answer_format = 'For each input sentence, present the reasoning followed by the formal paraphrased sentence.'
+
+# letters_list
+task_description = 'In this task, you will be given a single word as input. Your job is to produce the output by adding a space between each character pair in the word.'
+base_instruction = 'For each given word, insert a space between each character pair in the word.'
+answer_format = 'For each input word, output only the space-separated characters.'
+
+# negation
+task_description = 'For each input, write a sentence that expresses the exact opposite meaning of the input.'
+base_instruction = 'For each given sentence, provide a new sentence that conveys the exact opposite meaning by using "not" in the input sentence, keeping the rest of the sentence unchanged.'
+answer_format = "For each input sentence, negate the meaning by adding 'not' to the input sentence."
+
+# orthography_starts_with
+task_description = 'For each input, output all the words in the sentence that begin with the character in brackets at the end of the sentence.'
+base_instruction = 'Output, separated by spaces, the words that begin with the character in brackets at the end of the following sentence='
+answer_format = 'For each input sentence, present the reasoning followed by the space-separated words.'
+
+# rhymes
+task_description = 'In this task, you will be given a single word as input. Your job is to produce a list of comma-separated words that rhyme with the input word.'
+base_instruction = 'For each given word, provide a list of words that rhyme with the input word='
+answer_format = 'For each input word, present the reasoning followed by the list of rhyming words.'
+
+# second_word_letter
+task_description = 'Extract the second letter from the input word.'
+base_instruction = 'Output the second letter. Think step by step to arrive at the solution.'
+answer_format = 'For each input word, present the reasoning followed by the extracted letter (only a single letter).'
+
+# sentence_similarity
+task_description = "Each input consists of two sentences (Sentence 1 and Sentence 2). Rate on a scale of 0 to 5 whether those sentences are paraphrases of each other, and also give a brief textual description of the rating (0 - definitely not, 2 - possibly, 3 - probably, 4 - almost perfectly and 5 - perfectly). Use \" - \" to separate them"
+base_instruction = """Rate the similarity of each pair of sentences according to the following scale:
+
+0 - Definitely not : The sentences are completely unrelated in meaning.
+1 - Probably not : The sentences have minor or superficial similarities but differ significantly in meaning.
+2 - Possibly : The sentences share some elements of meaning but are not strong paraphrases.
+3 - Probably : The sentences convey similar meanings but have some differences.
+4 - Almost perfectly : The sentences are very similar with only minor differences.
+5 - Perfectly : The sentences are nearly identical in meaning."""
+answer_format = 'Provide your rating and brief textual description for each pair of sentences from the 6 options. (0 - Definitely not, 1 - Probably not, 2 - Possibly, 3 - Probably, 4 - Almost perfectly, 5 - Perfectly)'
+
+# sum
+task_description = 'For each input, write the sum of the two numbers that appears there.'
+base_instruction = 'Output the sum of the following two numbers='
+answer_format = 'For each pair of numbers, present the reasoning followed by the sum.'
+
+# synonyms
+task_description = 'You will be given a word as input and need to output a word that is semantically similar.'
+base_instruction = 'Output a word that is semantically similar to the input word='
+answer_format = 'For each input word, present the reasoning followed by the synonym.'
+
+# taxonomy_animal
+task_description = 'In this task, you will be given a list of words. Your job is to identify and list all the animals from the given set of words.'
+base_instruction = 'For each given list of words, provide a new list containing only the animals.'
+answer_format = 'For each list of words, output the list of animals.'
+
+# auto_categorization
+task_description = 'Find the best categorization for the given set of words as input.'
+base_instruction = 'Output the best categorization for the following set of words='
+answer_format = 'For each set of words, present the reasoning followed by the best categorization.'
+
+# object_counting
+task_description = 'Find the number of objects in the given input.'
+base_instruction = 'Output the number of objects in the following input='
+answer_format = 'For each input, present the reasoning followed by the number of objects.'
+
+# odd_one_out
+task_description = 'Given the below list of words, find the odd one out'
+base_instruction = 'Output the word that does not belong to the group of words='
+answer_format = 'For each group of words, present the reasoning followed by the odd one out.'
+
+# word_sorting
+task_description = 'In this task, you will be given a set of words. Your job is to sort the words based on the first character of each word in alphabetical order.'
+base_instruction = 'For each given set of words, provide a sorted list of the words based on the first character of each word.'
+answer_format = 'For each input, output the list of sorted words based on the first character of each word.'
+
+# word_unscrambling
+task_description = 'In this task, output all possible meaningful words that can be formed by rearranging all the letters of the given word. Each character must be used exactly once and the words must be valid.'
+base_instruction = 'Output comma-separated words of the same length as the input word.'
+answer_format = 'Output all possible meaningful words, comma separated, that can be formed by rearranging the letters of the given word.'
+
+# antonyms
+task_description = 'In this task, you will be given a single word as input. Your job is to produce a word that has the exact opposite meaning (an antonym) to the input word.'
+base_instruction = 'For each given word, provide a word that is an antonym (has the exact opposite meaning).'
+answer_format = 'For each input word, output only a single word.'
+
+# cause_and_effect
+task_description = 'Find the cause in the following cause and effect pair. Each input consists of two sentences, where one is the cause and the other is the outcome.'
+base_instruction = 'Output the cause in the following cause and effect pair='
+answer_format = 'For each pair of sentences, present the reasoning followed by the cause.'
+
+# common_concept
+task_description = 'In this task, you will be given a list of objects. Your job is to identify and describe a common characteristic that links all the objects in the list.'
+base_instruction = 'The instruction is to "involve" the objects mentioned in the input.'
+answer_format = 'For each list of objects, output the common concept by "involving" the objects mentioned.'
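Each task's triple above is meant to replace `task_description`, `base_instruction` and `answer_format` in `configs/promptopt_config.yaml` before running the BBH notebook. A sketch of doing that programmatically (hypothetical glue code, not part of the repo; note that `yaml.safe_dump` drops the file's comments):

```python
# Sketch: copying one task's triple from description.py into promptopt_config.yaml.
import yaml  # PyYAML, assumed available

# Triple for object_counting, copied from description.py above
updates = {
    "task_description": "Find the number of objects in the given input.",
    "base_instruction": "Output the number of objects in the following input=",
    "answer_format": "For each input, present the reasoning followed by the number of objects.",
}

with open("configs/promptopt_config.yaml") as f:
    cfg = yaml.safe_load(f)
cfg.update(updates)
with open("configs/promptopt_config.yaml", "w") as f:
    yaml.safe_dump(cfg, f, sort_keys=False)  # comments in the original file are lost
```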
demos/gsm8k/.env
ADDED
@@ -0,0 +1,8 @@
+USE_OPENAI_API_KEY="False"
+
+OPENAI_API_KEY=""
+OPENAI_MODEL_NAME=""
+
+OPENAI_API_VERSION=""
+AZURE_OPENAI_ENDPOINT=""
+AZURE_OPENAI_DEPLOYMENT_NAME=""
demos/gsm8k/configs/prompt_library.yaml
ADDED
@@ -0,0 +1,36 @@
+system_prompts: |
+  You are a helpful assistant that assists research students in understanding research papers.
+system_guidelines: |
+  Guidelines
+  - Your role must always be a helpful assistant that assists students in understanding research papers.
+  - Only answer questions that are directly or indirectly related to the referenced paper(s).
+
+mode:
+  chat:
+    - name: CHAT-FIRST-MESSAGE
+      llm_request_type: rag-query
+      prompt_template: |
+        {user_msg}
+      emb_model_id: text embedding ada 002 [vellm-openai2]
+      llm_model_id: gpt 35 Turbo [vellm-openai2]
+      prepend_system_prompts: False
+      prepend_system_guidelines: False
+
+    - name: CHAT-NEXT-MESSAGES
+      llm_request_type: rag-query
+      prompt_template: |
+        {user_msg}
+      emb_model_id: text embedding ada 002 [vellm-openai2]
+      llm_model_id: gpt 35 Turbo [vellm-openai2]
+      prepend_system_prompts: False
+      prepend_system_guidelines: False
+
+  generation:
+    - name: FLASH_PROFILE
+      prompt_template: |
+        {user_msg}
+      prepend_system_prompts: False
+      prepend_system_guidelines: False
+      llm_request_type: rag-query
+      emb_model_id: text embedding ada 002 [vellm-openai2]
+      llm_model_id: gpt 35 Turbo [vellm-openai2]
demos/gsm8k/configs/promptopt_config.yaml
ADDED
@@ -0,0 +1,52 @@
+# Specify one or more prompt refinement techniques to be used. If you specify more than one,
+# all of them run on the same seed data; the result, iterations needed and cost incurred for
+# each technique are logged, along with the winning technique per data instance and overall.
+
+# Supported prompt refinement techniques: Basic, RecursiveEval, MedPrompt
+# Uncomment the techniques that you want to use
+############################ Critique Task Description Start ############################
+prompt_technique_name: "critique_n_refine"
+# unique_model_id of the model defined in llm_config.yaml
+unique_model_id: gpt-4o
+# Number of iterations of <mutation_rounds> rounds of task-description mutation,
+# each followed by refinement of instructions
+mutate_refine_iterations: 3
+# Number of rounds of mutation to be performed when generating different styles
+mutation_rounds: 3
+# Refine the instruction after mutation
+refine_instruction: true
+# Number of iterations for refining the task description and in-context examples for few-shot
+refine_task_eg_iterations: 3
+# Number of prompt variations to generate in a given iteration
+style_variation: 5
+# Number of questions asked to the LLM in a single batch during the training step
+questions_batch_size: 1
+# Number of question batches that must be answered correctly for a prompt to count as performing well
+min_correct_count: 3
+# Max number of mini-batches on which we should evaluate our prompt
+max_eval_batches: 6
+# Number of top best-performing prompts to be considered for the next iterations
+top_n: 1
+# Description of the task. This will be fed to the prompt
+task_description: "You are a mathematics expert. You will be given a mathematics problem which you need to solve"
+# Base instruction, in line with your dataset. This will be fed to the prompt
+base_instruction: "Let's think step by step."
+# Instruction for specifying the answer format
+answer_format: "For each question present the reasoning followed by the correct answer."
+# Number of samples from the dataset set aside as training data. In every iteration we draw
+# `questions_batch_size` examples from the training data with replacement.
+seen_set_size: 25
+# Number of examples to be given as few-shot examples
+few_shot_count: 5
+# Number of synthetic training examples to be generated
+num_train_examples: 20
+# Generate synthetic reasoning
+generate_reasoning: true
+# Generate a description of an expert who can solve the task at hand
+generate_expert_identity: true
+# Generate keywords that describe the intent of the task
+generate_intent_keywords: false
+############################ Critique Task Description End ############################
demos/gsm8k/configs/setup_config.yaml
ADDED
@@ -0,0 +1,14 @@
+assistant_llm:
+  # put the unique_model_id that you specified in llm_config.yaml
+  prompt_opt: gpt-4o
+dir_info:
+  # Base directory for everything
+  base_dir: logs
+  log_dir_name: glue_logs
+experiment_name: gsm8k
+# Many features differ between mode: online/offline. E.g.
+# 1) Logs are printed to the console in offline mode
+# 2) The LLM queue gets instantiated only in online mode
+mode: offline
+# Full-length description of the experiment. This will be logged.
+description:
demos/gsm8k/demo.ipynb
ADDED
@@ -0,0 +1,298 @@
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"id": "6eb94b72",
|
6 |
+
"metadata": {},
|
7 |
+
"source": [
|
8 |
+
"#### Set environment variables in [.env](.env) for LLM API calling"
|
9 |
+
]
|
10 |
+
},
|
11 |
+
{
|
12 |
+
"cell_type": "markdown",
|
13 |
+
"id": "388020c6",
|
14 |
+
"metadata": {},
|
15 |
+
"source": [
|
16 |
+
"### Import Dependencies"
|
17 |
+
]
|
18 |
+
},
|
19 |
+
{
|
20 |
+
"cell_type": "code",
|
21 |
+
"execution_count": null,
|
22 |
+
"id": "11efa138",
|
23 |
+
"metadata": {},
|
24 |
+
"outputs": [],
|
25 |
+
"source": [
|
26 |
+
"import sys\n",
|
27 |
+
"sys.path.insert(0, \"../../\")\n",
|
28 |
+
"import promptwizard\n",
|
29 |
+
"from promptwizard.glue.promptopt.instantiate import GluePromptOpt\n",
|
30 |
+
"from promptwizard.glue.promptopt.techniques.common_logic import DatasetSpecificProcessing\n",
|
31 |
+
"from promptwizard.glue.common.utils.file import save_jsonlist\n",
|
32 |
+
"from typing import Any\n",
|
33 |
+
"from tqdm import tqdm\n",
|
34 |
+
"from re import compile, findall\n",
|
35 |
+
"import os\n",
|
36 |
+
"from datasets import load_dataset\n",
|
37 |
+
"\n",
|
38 |
+
"from dotenv import load_dotenv\n",
|
39 |
+
"load_dotenv(override = True)"
|
40 |
+
]
|
41 |
+
},
|
42 |
+
{
|
43 |
+
"cell_type": "markdown",
|
44 |
+
"id": "beb14821",
|
45 |
+
"metadata": {},
|
46 |
+
"source": [
|
47 |
+
"### Create a dataset specific class and define the required functions "
|
48 |
+
]
|
49 |
+
},
|
50 |
+
{
|
51 |
+
"cell_type": "code",
|
52 |
+
"execution_count": 2,
|
53 |
+
"id": "5f325d33",
|
54 |
+
"metadata": {},
|
55 |
+
"outputs": [],
|
56 |
+
"source": [
|
57 |
+
"class GSM8k(DatasetSpecificProcessing):\n",
|
58 |
+
"\n",
|
59 |
+
" def dataset_to_jsonl(self, dataset_jsonl: str, **kwargs: Any) -> None:\n",
|
60 |
+
" def extract_answer_from_output(completion):\n",
|
61 |
+
" # Your functions for metrics and prompt building\n",
|
62 |
+
" ans_re = compile(r\"#### (\\-?[0-9\\.\\,]+)\")\n",
|
63 |
+
" self.INVALID_ANS = \"[invalid]\"\n",
|
64 |
+
"\n",
|
65 |
+
" match = ans_re.search(completion)\n",
|
66 |
+
" if match:\n",
|
67 |
+
" match_str = match.group(1).strip()\n",
|
68 |
+
" match_str = match_str.replace(\",\", \"\")\n",
|
69 |
+
" return match_str\n",
|
70 |
+
" else:\n",
|
71 |
+
" return self.INVALID_ANS\n",
|
72 |
+
"\n",
|
73 |
+
" examples_set = []\n",
|
74 |
+
"\n",
|
75 |
+
" for _, sample in tqdm(enumerate(kwargs[\"dataset\"]), desc=\"Evaluating samples\"):\n",
|
76 |
+
" example = {\n",
|
77 |
+
" DatasetSpecificProcessing.QUESTION_LITERAL: sample['question'],\n",
|
78 |
+
" DatasetSpecificProcessing.ANSWER_WITH_REASON_LITERAL: sample['answer'],\n",
|
79 |
+
" DatasetSpecificProcessing.FINAL_ANSWER_LITERAL: extract_answer_from_output(sample[\"answer\"])\n",
|
80 |
+
" }\n",
|
81 |
+
" examples_set.append(example)\n",
|
82 |
+
"\n",
|
83 |
+
" save_jsonlist(dataset_jsonl, examples_set, \"w\")\n",
|
84 |
+
"\n",
|
85 |
+
" def extract_final_answer(self, answer: str):\n",
|
86 |
+
" \n",
|
87 |
+
" if not answer:\n",
|
88 |
+
" return self.INVALID_ANS\n",
|
89 |
+
"\n",
|
90 |
+
" model_pred = answer.lower()\n",
|
91 |
+
" preds = model_pred.split(self.ANSWER_START.lower())\n",
|
92 |
+
" answer_flag = True if len(preds) > 1 else False\n",
|
93 |
+
"\n",
|
94 |
+
" pred = preds[-1].replace(\",\", \"\")\n",
|
95 |
+
" pred = [s for s in findall(r'-?\\d+\\.?\\d*', pred)]\n",
|
96 |
+
"\n",
|
97 |
+
" if len(pred) == 0:\n",
|
98 |
+
" return self.INVALID_ANS\n",
|
99 |
+
"\n",
|
100 |
+
" if answer_flag:\n",
|
101 |
+
" # choose the first element in list\n",
|
102 |
+
" pred = pred[0]\n",
|
103 |
+
" else:\n",
|
104 |
+
" # choose the last element in list\n",
|
105 |
+
" pred = pred[-1]\n",
|
106 |
+
"\n",
|
107 |
+
" # (For arithmetic tasks) if a word ends with period, it will be omitted ...\n",
|
108 |
+
" if pred[-1] == \".\":\n",
|
109 |
+
" pred = pred[:-1]\n",
|
110 |
+
" return pred"
|
111 |
+
]
|
112 |
+
},
|
113 |
+
{
|
114 |
+
"cell_type": "code",
|
115 |
+
"execution_count": 3,
|
116 |
+
"id": "f384eb57",
|
117 |
+
"metadata": {},
|
118 |
+
"outputs": [],
|
119 |
+
"source": [
|
120 |
+
"gsm8k_processor = GSM8k()"
|
121 |
+
]
|
122 |
+
},
|
123 |
+
{
|
124 |
+
"cell_type": "markdown",
|
125 |
+
"id": "11d2de75",
|
126 |
+
"metadata": {},
|
127 |
+
"source": [
|
128 |
+
"### Load and save the dataset "
|
129 |
+
]
|
130 |
+
},
|
131 |
+
{
|
132 |
+
"cell_type": "code",
|
133 |
+
"execution_count": null,
|
134 |
+
"id": "976681bd-4f43-4dbc-947e-cdb94d4824f0",
|
135 |
+
"metadata": {},
|
136 |
+
"outputs": [],
|
137 |
+
"source": [
|
138 |
+
"if not os.path.exists(\"data\"):\n",
|
139 |
+
" os.mkdir(\"data\")\n",
|
140 |
+
" \n",
|
141 |
+
"dataset = load_dataset(\"openai/gsm8k\", \"main\")\n",
|
142 |
+
"num_samples = 0\n",
|
143 |
+
"for dataset_type in ['train','test']:\n",
|
144 |
+
" data_list = []\n",
|
145 |
+
" for data in dataset[dataset_type]:\n",
|
146 |
+
" data_list.append({\"question\": data['question'], \"answer\": data['answer']})\n",
|
147 |
+
" if num_samples == 100 and dataset_type == 'train': # We sample only 100 train examples and use 25 out them for training randomly\n",
|
148 |
+
" break\n",
|
149 |
+
" num_samples += 1\n",
|
150 |
+
" gsm8k_processor.dataset_to_jsonl(\"data/\"+ dataset_type+'.jsonl', dataset=data_list)"
|
151 |
+
]
|
152 |
+
},
|
153 |
+
{
|
154 + "cell_type": "markdown",
155 + "id": "ac30e74f",
156 + "metadata": {},
157 + "source": [
158 + "### Set paths"
159 + ]
160 + },
161 + {
162 + "cell_type": "code",
163 + "execution_count": 5,
164 + "id": "f43482f1-3e10-4cf7-8ea6-ff42c04067a6",
165 + "metadata": {},
166 + "outputs": [],
167 + "source": [
168 + "train_file_name = os.path.join(\"data\", \"train.jsonl\")\n",
169 + "test_file_name = os.path.join(\"data\", \"test.jsonl\")\n",
170 + "path_to_config = \"configs\"\n",
171 + "promptopt_config_path = os.path.join(path_to_config, \"promptopt_config.yaml\")\n",
172 + "setup_config_path = os.path.join(path_to_config, \"setup_config.yaml\")"
173 + ]
174 + },
175 + {
176 + "cell_type": "markdown",
177 + "id": "3392594d",
178 + "metadata": {},
179 + "source": [
180 + "### Create an object for calling prompt optimization and inference functionalities"
181 + ]
182 + },
183 + {
184 + "cell_type": "code",
185 + "execution_count": null,
186 + "id": "8af4246f-db32-4b37-a73a-f9e2e5125d09",
187 + "metadata": {},
188 + "outputs": [],
189 + "source": [
190 + "gp = GluePromptOpt(promptopt_config_path,\n",
191 + "                   setup_config_path,\n",
192 + "                   train_file_name,\n",
193 + "                   gsm8k_processor)"
194 + ]
195 + },
196 + {
197 + "cell_type": "markdown",
198 + "id": "1784648c",
199 + "metadata": {},
200 + "source": [
201 + "### Call the prompt optimization function\n",
202 + "1. ```use_examples``` can be used when there are training samples and a mixture of real and synthetic in-context examples is required in the final prompt. When set to ```False```, all the in-context examples will be real\n",
203 + "2. ```generate_synthetic_examples``` can be used when there are no training samples and we want to generate synthetic examples\n",
204 + "3. ```run_without_train_examples``` can be used when there are no training samples and in-context examples are not required in the final prompt"
205 + ]
206 + },
207 + {
208 + "cell_type": "code",
209 + "execution_count": null,
210 + "id": "573c6151-2c03-45d9-9904-1724a1e20f1b",
211 + "metadata": {
212 + "scrolled": true
213 + },
214 + "outputs": [],
215 + "source": [
216 + "# Function call to generate the optimal prompt and expert profile\n",
217 + "best_prompt, expert_profile = gp.get_best_prompt(use_examples=True, run_without_train_examples=False, generate_synthetic_examples=False)"
218 + ]
219 + },
220 + {
221 + "cell_type": "markdown",
222 + "id": "1ee1aa99",
223 + "metadata": {},
224 + "source": [
225 + "### Save the optimized prompt and expert profile"
226 + ]
227 + },
228 + {
229 + "cell_type": "code",
230 + "execution_count": null,
231 + "id": "34a716af-0d77-4c7d-b1c2-6438d66096ce",
232 + "metadata": {
233 + "scrolled": true
234 + },
235 + "outputs": [],
236 + "source": [
237 + "import pickle\n",
238 + "\n",
239 + "if not os.path.exists(\"results\"):\n",
240 + "    os.mkdir(\"results\")\n",
241 + "\n",
242 + "with open(\"results/best_prompt.pkl\", 'wb') as f:\n",
243 + "    pickle.dump(best_prompt, f)\n",
244 + "with open(\"results/expert_profile.pkl\", 'wb') as f:\n",
245 + "    pickle.dump(expert_profile, f)\n",
246 + "\n",
247 + "print(f\"Best prompt: {best_prompt} \\nExpert profile: {expert_profile}\")"
248 + ]
249 + },
250 + {
251 + "cell_type": "markdown",
252 + "id": "aac42eed",
253 + "metadata": {},
254 + "source": [
255 + "### Evaluate the optimized prompt"
256 + ]
257 + },
258 + {
259 + "cell_type": "code",
260 + "execution_count": null,
261 + "id": "c49b5711-82dd-4d18-8cd4-ee447cf8d74c",
262 + "metadata": {
263 + "scrolled": true
264 + },
265 + "outputs": [],
266 + "source": [
267 + "gp.EXPERT_PROFILE = expert_profile\n",
268 + "gp.BEST_PROMPT = best_prompt\n",
269 + "\n",
270 + "# Function call to evaluate the prompt\n",
271 + "accuracy = gp.evaluate(test_file_name)\n",
272 + "\n",
273 + "print(f\"Final Accuracy: {accuracy}\")"
274 + ]
275 + }
276 + ],
277 + "metadata": {
278 + "kernelspec": {
279 + "display_name": "general",
280 + "language": "python",
281 + "name": "python3"
282 + },
283 + "language_info": {
284 + "codemirror_mode": {
285 + "name": "ipython",
286 + "version": 3
287 + },
288 + "file_extension": ".py",
289 + "mimetype": "text/x-python",
290 + "name": "python",
291 + "nbconvert_exporter": "python",
292 + "pygments_lexer": "ipython3",
293 + "version": "3.12.4"
294 + }
295 + },
296 + "nbformat": 4,
297 + "nbformat_minor": 5
298 + }
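
For quick reference, the three `get_best_prompt` flag combinations used across these demos map onto the three data-availability scenarios covered in `demos/scenarios/dataset_scenarios_demo.ipynb` below. This is a minimal sketch that only restates calls appearing in these notebooks, assuming `gp` is a `GluePromptOpt` instance built from the config paths above:

```python
# Sketch: the three get_best_prompt() flag combinations used in these demos.
# Assumes `gp` is a GluePromptOpt instance built as in the cells above.

# 1) Training data available, in-context examples wanted in the final prompt:
best_prompt, expert_profile = gp.get_best_prompt(
    use_examples=True,
    run_without_train_examples=False,
    generate_synthetic_examples=False,
)

# 2) No training data, no in-context examples in the final prompt:
best_prompt, expert_profile = gp.get_best_prompt(
    use_examples=False,
    run_without_train_examples=True,
    generate_synthetic_examples=False,
)

# 3) No training data, but synthetic examples should be generated first
#    (a later run then optimizes against the generated train_synthetic.jsonl):
best_prompt, expert_profile = gp.get_best_prompt(
    use_examples=False,
    run_without_train_examples=False,
    generate_synthetic_examples=True,
)
```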
demos/scenarios/.env
ADDED
@@ -0,0 +1,8 @@
1 + USE_OPENAI_API_KEY="False"
2 +
3 + OPENAI_API_KEY=""
4 + OPENAI_MODEL_NAME=""
5 +
6 + OPENAI_API_VERSION=""
7 + AZURE_OPENAI_ENDPOINT=""
8 + AZURE_OPENAI_DEPLOYMENT_NAME=""
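
These variables are picked up at runtime via python-dotenv (`load_dotenv(override=True)` in the scenarios notebook below). A minimal sketch of how they might be consumed; the branch on `USE_OPENAI_API_KEY` is an illustrative assumption, since the actual consumption happens inside promptwizard's LLM-calling code:

```python
# Sketch: reading the .env values with python-dotenv, as the demo notebooks do.
# The branching below is illustrative only.
import os
from dotenv import load_dotenv

load_dotenv(override=True)  # pick up demos/scenarios/.env

if os.getenv("USE_OPENAI_API_KEY") == "True":
    # Direct OpenAI access
    api_key = os.getenv("OPENAI_API_KEY")
    model_name = os.getenv("OPENAI_MODEL_NAME")
else:
    # Azure OpenAI access
    endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
    deployment = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME")
    api_version = os.getenv("OPENAI_API_VERSION")
```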
demos/scenarios/configs/prompt_library.yaml
ADDED
@@ -0,0 +1,36 @@
1 + system_prompts: |
2 +   You are a helpful assistant that assists research students in understanding research papers.
3 + system_guidelines: |
4 +   Guidelines
5 +   - Your role must always be a helpful assistant that assists students in understanding research papers.
6 +   - Only answer questions that are directly or indirectly related to the referenced paper(s).
7 +
8 + mode:
9 +   chat:
10 +   - name: CHAT-FIRST-MESSAGE
11 +     llm_request_type: rag-query
12 +     prompt_template: |
13 +       {user_msg}
14 +     emb_model_id: text embedding ada 002 [vellm-openai2]
15 +     llm_model_id: gpt 35 Turbo [vellm-openai2]
16 +     prepend_system_prompts: False
17 +     prepend_system_guidelines: False
18 +
19 +   - name: CHAT-NEXT-MESSAGES
20 +     llm_request_type: rag-query
21 +     prompt_template: |
22 +       {user_msg}
23 +     emb_model_id: text embedding ada 002 [vellm-openai2]
24 +     llm_model_id: gpt 35 Turbo [vellm-openai2]
25 +     prepend_system_prompts: False
26 +     prepend_system_guidelines: False
27 +
28 +   generation:
29 +   - name: FLASH_PROFILE
30 +     prompt_template: |
31 +       {user_msg}
32 +     prepend_system_prompts: False
33 +     prepend_system_guidelines: False
34 +     llm_request_type: rag-query
35 +     emb_model_id: text embedding ada 002 [vellm-openai2]
36 +     llm_model_id: gpt 35 Turbo [vellm-openai2]
demos/scenarios/configs/promptopt_config.yaml
ADDED
@@ -0,0 +1,53 @@
1 + # Specify one or more prompt refinement techniques to be used. If you specify more than one technique,
2 + # they will all run on the same seed data. The result, iterations needed & cost incurred for each
3 + # technique will be logged, along with the winning technique per data instance and overall.
4 +
5 + # Supported prompt refinement techniques: Basic, RecursiveEval, MedPrompt
6 + # Uncomment the techniques that you want to use
7 + ############################ Critique Task Description Start ############################
8 + prompt_technique_name: "critique_n_refine"
9 + # unique_model_id of the model defined in llm_config.yaml
10 + unique_model_id: gpt-4o
11 + # Number of iterations for conducting <mutation_rounds> rounds of mutation of the task description,
12 + # followed by refinement of instructions
13 + mutate_refine_iterations: 3
14 + # Number of rounds of mutation to be performed when generating different styles
15 + mutation_rounds: 3
16 + # Refine the instruction after mutation
17 + refine_instruction: true
18 + # Number of iterations for refining the task description and in-context examples for few-shot
19 + refine_task_eg_iterations: 3
20 + # Number of prompt variations to generate in a given iteration
21 + style_variation: 5
22 + # Number of questions to be asked to the LLM in a single batch during the training step
23 + questions_batch_size: 1
24 + # Number of question batches that must be answered correctly for a prompt to be considered as performing well
25 + min_correct_count: 3
26 + # Max number of mini-batches on which we should evaluate our prompt
27 + max_eval_batches: 6
28 + # Number of top best-performing prompts to be considered for the next iterations
29 + top_n: 1
30 + # Description of the task. This will be fed to the prompt
31 + task_description: "You are a mathematics expert. You will be given a mathematics problem which you need to solve"
32 + # Base instruction, in line with your dataset. This will be fed to the prompt
33 + base_instruction: "Lets think step by step."
34 + # Instruction for specifying the answer format
35 + answer_format: "For each question present the reasoning followed by the correct answer."
36 + # Number of samples from the dataset set aside as training data. In every iteration we draw
37 + # `questions_batch_size` examples from the training data with replacement.
38 + seen_set_size: 25
39 + # Number of examples to be given as few-shots
40 + few_shot_count: 5
41 + # Number of synthetic training examples to be generated
42 + num_train_examples: 20
43 + # Generate synthetic reasoning
44 + generate_reasoning: true
45 + # Generate a description of an expert which can solve the task at hand
46 + generate_expert_identity: true
47 + # Generate keywords that describe the intent of the task
48 + generate_intent_keywords: false
49 + ############################ Critique Task Description End ############################
50 +
51 +
52 +
53 +
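
Since the demos below repeatedly rewrite this file via `update_yaml_file`, a quick way to sanity-check the currently active settings is to load it with PyYAML. A minimal sketch, assuming the demo's `configs/promptopt_config.yaml` layout; the call-count estimate is a back-of-the-envelope reading of the knobs above, not a guarantee of the library's exact call count:

```python
# Sketch: inspecting the active optimization settings before a run.
# Uses PyYAML; the path matches the demo directory layout.
import yaml

with open("configs/promptopt_config.yaml") as f:
    cfg = yaml.safe_load(f)

print(cfg["prompt_technique_name"])  # -> critique_n_refine
# Rough reading of the knobs: each of <mutation_rounds> rounds asks for
# <style_variation> prompt variants.
print(cfg["mutation_rounds"] * cfg["style_variation"],
      "mutated prompts per refine iteration (approximate)")
```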
demos/scenarios/configs/setup_config.yaml
ADDED
@@ -0,0 +1,14 @@
1 + assistant_llm:
2 +   # put the unique_model_id that you specified in llm_config.yaml
3 +   prompt_opt: gpt-4o
4 + dir_info:
5 +   # Base directory for everything
6 +   base_dir: logs
7 +   log_dir_name: glue_logs
8 + experiment_name: gsm8k
9 + # Many features differ between mode: online/offline. For example:
10 + # 1) Logs are printed to the console in offline mode
11 + # 2) The LLM queue gets instantiated only in online mode
12 + mode: offline
13 + # Full-length description of the experiment. This will be logged.
14 + description:
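
The demos never read this file directly, but a sketch of how its fields compose can help when checking where logs land. The path assembly below is an assumption for illustration based on the field names (`base_dir`, `log_dir_name`, `experiment_name`); the library may compose them differently:

```python
# Sketch: composing a log directory from setup_config.yaml fields.
# The exact layout promptwizard uses internally may differ; this only
# illustrates how base_dir / log_dir_name / experiment_name relate.
import os
import yaml

with open("configs/setup_config.yaml") as f:
    setup = yaml.safe_load(f)

log_dir = os.path.join(setup["dir_info"]["base_dir"],
                       setup["dir_info"]["log_dir_name"])
print("logs for experiment", setup.get("experiment_name"), "under", log_dir)
```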
demos/scenarios/dataset_scenarios_demo.ipynb
ADDED
@@ -0,0 +1,1146 @@
1 + {
2 + "cells": [
3 + {
4 + "cell_type": "markdown",
5 + "id": "6eb94b72",
6 + "metadata": {},
7 + "source": [
8 + "## The following is a demo of running PromptWizard under different scenarios"
9 + ]
10 + },
11 + {
12 + "cell_type": "markdown",
13 + "id": "52c7ee0a",
14 + "metadata": {},
15 + "source": [
16 + "#### Set environment variables in [.env](.env) for LLM API calling"
17 + ]
18 + },
19 + {
20 + "cell_type": "markdown",
21 + "id": "3cffa5ef",
22 + "metadata": {},
23 + "source": [
24 + "#### Import Dependencies"
25 + ]
26 + },
27 + {
28 + "cell_type": "code",
29 + "execution_count": null,
30 + "id": "11efa138",
31 + "metadata": {},
32 + "outputs": [],
33 + "source": [
34 + "import sys\n",
35 + "sys.path.insert(0, \"../../\")\n",
36 + "import promptwizard\n",
37 + "from promptwizard.glue.promptopt.instantiate import GluePromptOpt\n",
38 + "from promptwizard.glue.promptopt.techniques.common_logic import DatasetSpecificProcessing\n",
39 + "from promptwizard.glue.common.utils.file import save_jsonlist\n",
40 + "from typing import Any\n",
41 + "from tqdm import tqdm\n",
42 + "from re import compile, findall\n",
43 + "import os\n",
44 + "from datasets import load_dataset\n",
45 + "import yaml\n",
46 + "from dotenv import load_dotenv\n",
47 + "load_dotenv(override=True)"
48 + ]
49 + },
50 + {
51 + "cell_type": "code",
52 + "execution_count": 2,
53 + "id": "9be22d5d",
54 + "metadata": {},
55 + "outputs": [],
56 + "source": [
57 + "def update_yaml_file(file_path, config_dict):\n",
58 + "    # Load the existing YAML, overwrite the given fields, and write it back\n",
59 + "    with open(file_path, 'r') as file:\n",
60 + "        data = yaml.safe_load(file)\n",
61 + "\n",
62 + "\n",
63 + "    for field, value in config_dict.items():\n",
64 + "        data[field] = value\n",
65 + "\n",
66 + "    with open(file_path, 'w') as file:\n",
67 + "        yaml.dump(data, file, default_flow_style=False)\n",
68 + "\n",
69 + "    print(\"YAML file updated successfully!\")"
70 + ]
71 + },
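
One caveat worth noting about this helper: `yaml.safe_load` followed by `yaml.dump` discards the comments that document every field in `promptopt_config.yaml`. If you want the config edits below to keep those comments intact, a round-trip parser such as ruamel.yaml could be swapped in. A minimal sketch, assuming `ruamel.yaml` is installed; this is an alternative, not what the demo itself uses:

```python
# Sketch: a comment-preserving variant of update_yaml_file using ruamel.yaml
# (pip install ruamel.yaml). Unlike yaml.safe_load/yaml.dump, the round-trip
# loader keeps the explanatory comments in promptopt_config.yaml.
from ruamel.yaml import YAML

def update_yaml_file_keep_comments(file_path, config_dict):
    yaml_rt = YAML()  # round-trip mode by default
    with open(file_path) as f:
        data = yaml_rt.load(f)
    for field, value in config_dict.items():
        data[field] = value
    with open(file_path, "w") as f:
        yaml_rt.dump(data, f)
```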
72 + {
73 + "cell_type": "markdown",
74 + "id": "78abb34a",
75 + "metadata": {},
76 + "source": [
77 + "Set the paths"
78 + ]
79 + },
80 + {
81 + "cell_type": "code",
82 + "execution_count": 3,
83 + "id": "14399d47",
84 + "metadata": {},
85 + "outputs": [],
86 + "source": [
87 + "path_to_config = \"configs\"\n",
88 + "promptopt_config_path = os.path.join(path_to_config, \"promptopt_config.yaml\")\n",
89 + "setup_config_path = os.path.join(path_to_config, \"setup_config.yaml\")"
90 + ]
91 + },
92 + {
93 + "cell_type": "markdown",
94 + "id": "0f274af9",
95 + "metadata": {},
96 + "source": [
97 + "### Now let us consider three scenarios with respect to the availability of training data"
98 + ]
99 + },
100 + {
101 + "cell_type": "markdown",
102 + "id": "5aaed236",
103 + "metadata": {},
104 + "source": [
105 + "#### Scenario 1: We have no training data, and we also don't want in-context examples in the final prompt"
106 + ]
107 + },
108 + {
109 + "cell_type": "markdown",
110 + "id": "4c34423d",
111 + "metadata": {},
112 + "source": [
113 + "Set the configurations to generate mutations"
114 + ]
115 + },
116 + {
117 + "cell_type": "code",
118 + "execution_count": null,
119 + "id": "ec4e7607",
120 + "metadata": {},
121 + "outputs": [],
122 + "source": [
123 + "file_path = 'configs/promptopt_config.yaml'\n",
124 + "# Set the following based on the use case\n",
125 + "config_dict = {\n",
126 + "    \"task_description\": \"You are a mathematics expert. You will be given a mathematics problem which you need to solve\",\n",
127 + "    \"base_instruction\": \"Lets think step by step.\",\n",
128 + "    \"mutation_rounds\": 5\n",
129 + "}\n",
130 + "update_yaml_file(file_path, config_dict)"
131 + ]
132 + },
133 + {
134 + "cell_type": "markdown",
135 + "id": "d984e84e",
136 + "metadata": {},
137 + "source": [
138 + "Create an object for calling prompt optimization and inference functionalities"
139 + ]
140 + },
141 + {
142 + "cell_type": "code",
143 + "execution_count": null,
144 + "id": "c7aa4ccb",
145 + "metadata": {},
146 + "outputs": [],
147 + "source": [
148 + "gp = GluePromptOpt(promptopt_config_path,\n",
149 + "                   setup_config_path,\n",
150 + "                   dataset_jsonl=None,\n",
151 + "                   data_processor=None)"
152 + ]
153 + },
154 + {
155 + "cell_type": "markdown",
156 + "id": "8d587065",
157 + "metadata": {},
158 + "source": [
159 + "Call the optimization function"
160 + ]
161 + },
162 + {
163 + "cell_type": "code",
164 + "execution_count": null,
165 + "id": "afe8de4f",
166 + "metadata": {},
167 + "outputs": [],
168 + "source": [
169 + "best_prompt, expert_profile = gp.get_best_prompt(use_examples=False, run_without_train_examples=True, generate_synthetic_examples=False)"
170 + ]
171 + },
172 + {
173 + "cell_type": "markdown",
174 + "id": "a30db274",
175 + "metadata": {},
176 + "source": [
177 + "Output: Five mutated prompts are printed on the terminal, as shown below:"
178 + ]
179 + },
180 + {
181 + "cell_type": "code",
182 + "execution_count": 7,
183 + "id": "e5cb1a65",
184 + "metadata": {},
185 + "outputs": [],
186 + "source": [
187 + "OUTPUT = \"\"\"\n",
188 + "Variations 1:\n",
189 + "Expert Profile:\n",
190 + "You are a mathematician with a strong background in various fields of mathematics, including algebra, calculus, geometry, and statistics. You have a deep understanding of mathematical theories and principles, and you are skilled at solving complex problems with precision and clarity. Your expertise allows you to approach mathematical problems methodically, breaking them down into manageable steps and applying appropriate techniques to find solutions. You are familiar with both theoretical and applied mathematics, and you can explain your reasoning and solutions in a clear and concise manner. Your ability to solve mathematical problems efficiently and accurately makes you an invaluable resource for anyone seeking help with mathematics.:\n",
191 + "Prompt:\n",
192 + "You are a mathematics expert. You will be given a mathematics problem which you need to solve\n",
193 + "Lets think step by step.\n",
194 + "\n",
195 + "\n",
196 + "For each question present the reasoning followed by the correct answer.\n",
197 + "Keywords: mathematics, problem-solving, step-by-step, logical reasoning, expert\n",
198 + "_______________________________________________________________________\n",
199 + "\n",
200 + "Variations 2:\n",
201 + "Expert Profile:\n",
202 + "You are a mathematician with a strong background in various fields of mathematics, including algebra, calculus, geometry, and statistics. You have a deep understanding of mathematical theories and principles, and you are skilled at solving complex problems with precision and clarity. Your expertise allows you to approach mathematical problems methodically, breaking them down into manageable steps and applying appropriate techniques to find solutions. You are familiar with both theoretical and applied mathematics, and you can explain your reasoning and solutions in a clear and concise manner. Your ability to solve mathematical problems efficiently and accurately makes you an invaluable resource for anyone seeking help with mathematics.:\n",
203 + "Prompt:\n",
204 + "Let's break this problem down step by step and devise an experiment to help solve it.\n",
205 + "\n",
206 + "\n",
207 + "For each question present the reasoning followed by the correct answer.\n",
208 + "Keywords: mathematics, problem-solving, step-by-step, logical reasoning, expert\n",
209 + "_______________________________________________________________________\n",
210 + "\n",
211 + "Variations 3:\n",
212 + "Expert Profile:\n",
213 + "You are a mathematics expert with a strong background in various fields of mathematics, including algebra, calculus, geometry, and statistics. You have a deep understanding of mathematical theories and principles, and you are skilled at solving complex problems with precision and clarity. Your expertise allows you to break down intricate problems into manageable steps, making it easier to find solutions. You are familiar with a wide range of mathematical techniques and tools, and you can apply them effectively to solve problems. Whether the problem involves solving equations, proving theorems, or analyzing data, you can provide a clear and accurate solution. Your ability to explain your reasoning and methodology ensures that others can follow and understand your approach, making you an invaluable resource for tackling challenging mathematical problems.:\n",
214 + "Prompt:\n",
215 + "Let's think through this problem step by step and make a list of ideas to solve it.\n",
216 + "\n",
217 + "\n",
218 + "For each question present the reasoning followed by the correct answer.\n",
219 + "Keywords: mathematics, problem-solving, step-by-step, logical reasoning, expert\n",
220 + "_______________________________________________________________________\n",
221 + "\n",
222 + "Variations 4:\n",
223 + "Expert Profile:\n",
224 + "You are a mathematics expert with a strong background in various fields of mathematics, including algebra, calculus, geometry, and statistics. You have a deep understanding of mathematical theories and principles, and you are skilled at solving complex problems with precision and clarity. Your expertise allows you to break down intricate problems into manageable steps, making it easier for others to follow your reasoning. You are familiar with a wide range of mathematical techniques and tools, and you can apply them effectively to find solutions. Whether the problem involves solving equations, proving theorems, or analyzing data, you can provide a clear, accurate, and well-explained solution. Your ability to communicate complex mathematical concepts in an understandable way makes you an invaluable resource for anyone seeking to solve mathematical problems.:\n",
225 + "Prompt:\n",
226 + "Let's approach this problem step by step and measure our progress as we go.\n",
227 + "\n",
228 + "\n",
229 + "For each question present the reasoning followed by the correct answer.\n",
230 + "Keywords: mathematics, problem-solving, step-by-step, logical reasoning, expert\n",
231 + "Iterations completed: 0%| | 0/3 [00:24<?, ?it/s]\n",
232 + "Time taken to find best prompt: 24.79972267150879 sec\n",
233 + "_______________________________________________________________________\n",
234 + "\n",
235 + "Variations 5:\n",
236 + "Expert Profile:\n",
237 + "You are a mathematics expert with a strong background in various fields of mathematics, including algebra, calculus, geometry, and statistics. You have a deep understanding of mathematical theories and principles, and you are skilled at solving complex problems with precision and clarity. Your expertise allows you to approach problems methodically, breaking them down into manageable steps and applying appropriate mathematical techniques to find solutions. You are also adept at explaining your reasoning and methods in a clear and concise manner, making it easy for others to follow your thought process. Whether the problem involves solving equations, proving theorems, or analyzing data, you have the knowledge and skills to tackle it effectively. Your proficiency in mathematics is highly valuable in both academic and practical applications, and you are well-equipped to provide accurate and insightful solutions to a wide range of mathematical problems.:\n",
238 + "Prompt:\n",
239 + "Let's simplify this problem step by step to make it easier to solve.\n",
240 + "\n",
241 + "\n",
242 + "For each question present the reasoning followed by the correct answer.\n",
243 + "Keywords: mathematics, problem-solving, step-by-step, logical reasoning, expert\"\"\""
244 + ]
245 + },
246 + {
247 + "cell_type": "markdown",
248 + "id": "dfd54818",
249 + "metadata": {},
250 + "source": [
251 + "#### Scenario 2: We have no training data, but we do want in-context examples in the final prompt"
252 + ]
253 + },
254 + {
255 + "cell_type": "markdown",
256 + "id": "b07d1862",
257 + "metadata": {},
258 + "source": [
259 + "This scenario has two steps:\n",
260 + "- Generate synthetic data\n",
261 + "- Optimize prompts using the synthetic data"
262 + ]
263 + },
264 + {
265 + "cell_type": "markdown",
266 + "id": "bf44d6d7",
267 + "metadata": {},
268 + "source": [
269 + "STEP 1: Generate synthetic data"
270 + ]
271 + },
272 + {
273 + "cell_type": "markdown",
274 + "id": "96d07ae3",
275 + "metadata": {},
276 + "source": [
277 + "Set the configurations to first generate synthetic training data. \\\n",
278 + "Any number of synthetic examples can be generated and then used for optimizing prompts as described in STEP 2"
279 + ]
280 + },
281 + {
282 + "cell_type": "code",
283 + "execution_count": null,
284 + "id": "3c7c1f19",
285 + "metadata": {},
286 + "outputs": [],
287 + "source": [
288 + "file_path = 'configs/promptopt_config.yaml'\n",
289 + "# Set the number of synthetic training examples to be generated\n",
290 + "config_dict = {\n",
291 + "    \"num_train_examples\": 20\n",
292 + "}\n",
293 + "update_yaml_file(file_path, config_dict)"
294 + ]
295 + },
296 + {
297 + "cell_type": "code",
298 + "execution_count": null,
299 + "id": "2311b4ad",
300 + "metadata": {},
301 + "outputs": [],
302 + "source": [
303 + "gp = GluePromptOpt(promptopt_config_path,\n",
304 + "                   setup_config_path,\n",
305 + "                   dataset_jsonl=None,\n",
306 + "                   data_processor=None)"
307 + ]
308 + },
309 + {
310 + "cell_type": "markdown",
311 + "id": "65ec6cd2",
312 + "metadata": {},
313 + "source": [
314 + "Call the function to generate synthetic examples, which are saved in train_synthetic.jsonl"
315 + ]
316 + },
317 + {
318 + "cell_type": "code",
319 + "execution_count": null,
320 + "id": "ff84f04e",
321 + "metadata": {},
322 + "outputs": [],
323 + "source": [
324 + "best_prompt, expert_profile = gp.get_best_prompt(use_examples=False, run_without_train_examples=False, generate_synthetic_examples=True)"
325 + ]
326 + },
327 + {
328 + "cell_type": "markdown",
329 + "id": "a286dcdf",
330 + "metadata": {},
331 + "source": [
332 + "STEP 2: Optimize prompts using the synthetic data"
333 + ]
334 + },
335 + {
336 + "cell_type": "markdown",
337 + "id": "bb0a4060",
338 + "metadata": {},
339 + "source": [
340 + "Create a dataset-specific class and define the required functions"
341 + ]
342 + },
343 + {
344 + "cell_type": "code",
345 + "execution_count": 11,
346 + "id": "7aaa5126",
347 + "metadata": {},
348 + "outputs": [],
349 + "source": [
350 + "class GSM8k(DatasetSpecificProcessing):\n",
351 + "\n",
352 + "    def dataset_to_jsonl(self, dataset_jsonl: str, **kwargs: Any) -> None:\n",
353 + "        def extract_answer_from_output(completion):\n",
354 + "            # Extract the gold answer that follows '#### ' in a GSM8K solution\n",
355 + "            ans_re = compile(r\"#### (\\-?[0-9\\.\\,]+)\")\n",
356 + "            self.INVALID_ANS = \"[invalid]\"\n",
357 + "\n",
358 + "            match = ans_re.search(completion)\n",
359 + "            if match:\n",
360 + "                match_str = match.group(1).strip()\n",
361 + "                match_str = match_str.replace(\",\", \"\")\n",
362 + "                return match_str\n",
363 + "            else:\n",
364 + "                return self.INVALID_ANS\n",
365 + "\n",
366 + "        examples_set = []\n",
367 + "\n",
368 + "        for _, sample in tqdm(enumerate(kwargs[\"dataset\"]), desc=\"Evaluating samples\"):\n",
369 + "            example = {\n",
370 + "                DatasetSpecificProcessing.QUESTION_LITERAL: sample['question'],\n",
371 + "                DatasetSpecificProcessing.ANSWER_WITH_REASON_LITERAL: sample['answer'],\n",
372 + "                DatasetSpecificProcessing.FINAL_ANSWER_LITERAL: extract_answer_from_output(sample[\"answer\"])\n",
373 + "            }\n",
374 + "            examples_set.append(example)\n",
375 + "\n",
376 + "        save_jsonlist(dataset_jsonl, examples_set, \"w\")\n",
377 + "\n",
378 + "    def extract_final_answer(self, answer: str):\n",
379 + "\n",
380 + "        if not answer:\n",
381 + "            return self.INVALID_ANS\n",
382 + "\n",
383 + "        model_pred = answer.lower()\n",
384 + "        preds = model_pred.split(self.ANSWER_START.lower())\n",
385 + "        answer_flag = True if len(preds) > 1 else False\n",
386 + "\n",
387 + "        pred = preds[-1].replace(\",\", \"\")\n",
388 + "        pred = [s for s in findall(r'-?\\d+\\.?\\d*', pred)]\n",
389 + "\n",
390 + "        if len(pred) == 0:\n",
391 + "            return self.INVALID_ANS\n",
392 + "\n",
393 + "        if answer_flag:\n",
394 + "            # choose the first element in the list\n",
395 + "            pred = pred[0]\n",
396 + "        else:\n",
397 + "            # choose the last element in the list\n",
398 + "            pred = pred[-1]\n",
399 + "\n",
400 + "        # (For arithmetic tasks) if a word ends with a period, it is omitted\n",
401 + "        if pred[-1] == \".\":\n",
402 + "            pred = pred[:-1]\n",
403 + "        return pred"
404 + ]
405 + },
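
To make the two extraction paths in `GSM8k` concrete, here is a small, self-contained sketch of what they do on example strings. The sample strings are hypothetical, and `ANSWER_START` is assumed to be the `<ANS_START>` tag seen in the optimized-prompt outputs below:

```python
# Sketch: what the two GSM8k extraction helpers do, on hypothetical inputs.
import re

# dataset_to_jsonl's inner helper: pull the gold answer after '#### '
# from a GSM8K reference solution.
gold = "Natalia sold 48/2 = 24 clips in May.\n#### 72"
m = re.search(r"#### (\-?[0-9\.\,]+)", gold)
print(m.group(1))  # -> 72

# extract_final_answer: split on the answer tag (assumed '<ANS_START>' here,
# matching the <ANS_START>...<ANS_END> markers in the outputs below), then
# keep the first number after the tag.
model_output = "Step-by-step reasoning ... <ANS_START> 38 <ANS_END>"
preds = model_output.lower().split("<ans_start>")
numbers = re.findall(r"-?\d+\.?\d*", preds[-1].replace(",", ""))
print(numbers[0])  # -> 38
```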
406 + {
407 + "cell_type": "code",
408 + "execution_count": 12,
409 + "id": "212bea42",
410 + "metadata": {},
411 + "outputs": [],
412 + "source": [
413 + "gsm8k_processor = GSM8k()"
414 + ]
415 + },
416 + {
417 + "cell_type": "markdown",
418 + "id": "36ae1f65",
419 + "metadata": {},
420 + "source": [
421 + "Set the configurations to optimize the prompt on the synthetic data"
422 + ]
423 + },
424 + {
425 + "cell_type": "code",
426 + "execution_count": null,
427 + "id": "67db60b0",
428 + "metadata": {},
429 + "outputs": [],
430 + "source": [
431 + "file_path = 'configs/promptopt_config.yaml'\n",
432 + "config_dict = {\n",
433 + "    \"task_description\": \"You are a mathematics expert. You will be given a mathematics problem which you need to solve\",\n",
434 + "    \"base_instruction\": \"Lets think step by step.\",\n",
435 + "    \"mutation_rounds\": 2,\n",
436 + "    \"few_shot_count\": 5,\n",
437 + "    \"generate_reasoning\": True,\n",
438 + "    \"mutate_refine_iterations\": 3,\n",
439 + "    \"seen_set_size\": 20\n",
440 + "}\n",
441 + "update_yaml_file(file_path, config_dict)"
442 + ]
443 + },
444 + {
445 + "cell_type": "markdown",
446 + "id": "fc8eb2c5",
447 + "metadata": {},
448 + "source": [
449 + "Call the optimization function"
450 + ]
451 + },
452 + {
453 + "cell_type": "code",
454 + "execution_count": null,
455 + "id": "e53934e6",
456 + "metadata": {},
457 + "outputs": [],
458 + "source": [
459 + "gp = GluePromptOpt(promptopt_config_path,\n",
460 + "                   setup_config_path,\n",
461 + "                   dataset_jsonl=\"train_synthetic.jsonl\",\n",
462 + "                   data_processor=gsm8k_processor)\n",
463 + "best_prompt, expert_profile = gp.get_best_prompt(use_examples=True, run_without_train_examples=False, generate_synthetic_examples=False)"
464 + ]
465 + },
466 + {
467 + "cell_type": "markdown",
468 + "id": "b4bcd46b",
469 + "metadata": {},
470 + "source": [
471 + "Output: The following prompt and expert profile are generated"
472 + ]
473 + },
474 + {
475 + "cell_type": "code",
476 + "execution_count": null,
477 + "id": "ee6006f0",
478 + "metadata": {},
479 + "outputs": [],
480 + "source": [
481 + "OUTPUT = \"\"\"\n",
482 + "Generating Expert Identity....\n",
483 + "Expert Identity: You are a mathematician with a strong background in various fields of mathematics, including algebra, calculus, geometry, and statistics. You have a deep understanding of mathematical theories and principles, and you are skilled at solving complex problems with precision and clarity. Your analytical skills and logical reasoning enable you to break down problems into manageable steps and find accurate solutions efficiently. You are familiar with a wide range of mathematical techniques and tools, and you can apply them to solve problems in both theoretical and applied contexts. Your expertise allows you to explain your solutions clearly and concisely, making complex concepts accessible to others. Whether the problem involves solving equations, proving theorems, or analyzing data, you are well-equipped to provide a thorough and correct solution.\n",
484 + "Final best prompt: Provide a clear and detailed solution, breaking down all necessary steps. Ensure that the final answer is clearly marked and separated from the solution steps. Use proper mathematical notation and formatting throughout. Verify the final answer by checking the solution steps for accuracy. Simplify all expressions and fractions where possible. Handle special cases or edge cases appropriately, and clearly state any assumptions or conditions applied during the solution process. Finally, review the entire solution to ensure logical consistency and correct formatting.\n",
485 + "\n",
486 + "[Question] Solve for \\( x \\) in the equation \\( 2x + 3 = 11 \\).\n",
487 + "[Answer] To solve for \\( x \\) in the equation \\( 2x + 3 = 11 \\), we will follow these steps:\n",
488 + "\n",
489 + "1. **Isolate the term containing \\( x \\)**:\n",
490 + "   We start by isolating the term with \\( x \\) on one side of the equation. To do this, we need to eliminate the constant term on the left side of the equation.\n",
491 + "\n",
492 + "   \\[\n",
493 + "   2x + 3 = 11\n",
494 + "   \\]\n",
495 + "\n",
496 + "   Subtract 3 from both sides of the equation:\n",
497 + "\n",
498 + "   \\[\n",
499 + "   2x + 3 - 3 = 11 - 3\n",
500 + "   \\]\n",
501 + "\n",
502 + "   Simplifying this, we get:\n",
503 + "\n",
504 + "   \\[\n",
505 + "   2x = 8\n",
506 + "   \\]\n",
507 + "\n",
508 + "2. **Solve for \\( x \\)**:\n",
509 + "   Now, we need to solve for \\( x \\) by isolating \\( x \\) itself. Since \\( x \\) is multiplied by 2, we will divide both sides of the equation by 2 to solve for \\( x \\).\n",
510 + "\n",
511 + "   \\[\n",
512 + "   \\frac{2x}{2} = \\frac{8}{2}\n",
513 + "   \\]\n",
514 + "\n",
515 + "   Simplifying this, we get:\n",
516 + "\n",
517 + "   \\[\n",
518 + "   x = 4\n",
519 + "   \\]\n",
520 + "\n",
521 + "3. **Verify the solution**:\n",
522 + "   To ensure our solution is correct, we substitute \\( x = 4 \\) back into the original equation and check if both sides are equal.\n",
523 + "\n",
524 + "   Original equation:\n",
525 + "\n",
526 + "   \\[\n",
527 + "   2x + 3 = 11\n",
528 + "   \\]\n",
529 + "\n",
530 + "   Substitute \\( x = 4 \\):\n",
531 + "\n",
532 + "   \\[\n",
533 + "   2(4) + 3 = 11\n",
534 + "   \\]\n",
535 + "\n",
536 + "   Simplifying this, we get:\n",
537 + "\n",
538 + "   \\[\n",
539 + "   8 + 3 = 11\n",
540 + "   \\]\n",
541 + "\n",
542 + "   \\[\n",
543 + "   11 = 11\n",
544 + "   \\]\n",
545 + "\n",
546 + "   Since both sides of the equation are equal, our solution is verified to be correct.\n",
547 + "\n",
548 + "**Final Answer**: \\( x = 4 \\) <ANS_START> \\( x = 4 \\) <ANS_END>\n",
549 + "\n",
550 + "[Question] Solve for \\( x \\) in the equation \\( x^2 - 4x + 4 = 0 \\).\n",
551 + "[Answer] To solve the quadratic equation \\( x^2 - 4x + 4 = 0 \\), we will follow these steps:\n",
552 + "\n",
553 + "1. **Identify the quadratic equation**: The given equation is \\( x^2 - 4x + 4 = 0 \\).\n",
554 + "\n",
555 + "2. **Recognize the standard form**: The standard form of a quadratic equation is \\( ax^2 + bx + c = 0 \\). Here, \\( a = 1 \\), \\( b = -4 \\), and \\( c = 4 \\).\n",
556 + "\n",
557 + "3. **Factor the quadratic expression**: We need to factor the quadratic expression on the left-hand side of the equation. We look for two numbers that multiply to \\( c \\) (which is 4) and add up to \\( b \\) (which is -4). These numbers are -2 and -2.\n",
558 + "\n",
559 + "4. **Write the factored form**: The quadratic expression \\( x^2 - 4x + 4 \\) can be factored as \\( (x - 2)(x - 2) \\) or \\( (x - 2)^2 \\).\n",
560 + "\n",
561 + "5. **Set the factored form equal to zero**: We now have \\( (x - 2)^2 = 0 \\).\n",
562 + "\n",
563 + "6. **Solve for \\( x \\)**: To find the value of \\( x \\), we take the square root of both sides of the equation:\n",
564 + "   \\[\n",
565 + "   \\sqrt{(x - 2)^2} = \\sqrt{0}\n",
566 + "   \\]\n",
567 + "   This simplifies to:\n",
568 + "   \\[\n",
569 + "   x - 2 = 0\n",
570 + "   \\]\n",
571 + "\n",
572 + "7. **Isolate \\( x \\)**: Add 2 to both sides of the equation to solve for \\( x \\):\n",
573 + "   \\[\n",
574 + "   x = 2\n",
575 + "   \\]\n",
576 + "\n",
577 + "8. **Verify the solution**: Substitute \\( x = 2 \\) back into the original equation to ensure it satisfies the equation:\n",
578 + "   \\[\n",
579 + "   (2)^2 - 4(2) + 4 = 4 - 8 + 4 = 0\n",
580 + "   \\]\n",
581 + "   Since the left-hand side equals the right-hand side (0), the solution \\( x = 2 \\) is verified.\n",
582 + "\n",
583 + "**Final Answer**: \\( x = 2 \\) <ANS_START> \\( x = 2 \\) <ANS_END>\n",
584 + "\n",
585 + "[Question] Find the derivative of \\( f(x) = 3x^2 \\cdot \\sin(x) \\).\n",
586 + "[Answer] To find the derivative of the function \\( f(x) = 3x^2 \\cdot \\sin(x) \\), we will use the product rule of differentiation. The product rule states that if we have a function \\( f(x) = u(x) \\cdot v(x) \\), then its derivative \\( f'(x) \\) is given by:\n",
587 + "\n",
588 + "\\[ f'(x) = u'(x) \\cdot v(x) + u(x) \\cdot v'(x) \\]\n",
589 + "\n",
590 + "Here, we identify \\( u(x) = 3x^2 \\) and \\( v(x) = \\sin(x) \\).\n",
591 + "\n",
592 + "Step 1: Differentiate \\( u(x) = 3x^2 \\)\n",
593 + "\\[ u'(x) = \\frac{d}{dx}(3x^2) = 3 \\cdot 2x = 6x \\]\n",
594 + "\n",
595 + "Step 2: Differentiate \\( v(x) = \\sin(x) \\)\n",
596 + "\\[ v'(x) = \\frac{d}{dx}(\\sin(x)) = \\cos(x) \\]\n",
597 + "\n",
598 + "Step 3: Apply the product rule\n",
599 + "\\[ f'(x) = u'(x) \\cdot v(x) + u(x) \\cdot v'(x) \\]\n",
600 + "\\[ f'(x) = (6x) \\cdot \\sin(x) + (3x^2) \\cdot \\cos(x) \\]\n",
601 + "\n",
602 + "Step 4: Simplify the expression\n",
603 + "\\[ f'(x) = 6x \\sin(x) + 3x^2 \\cos(x) \\]\n",
604 + "\n",
605 + "Thus, the derivative of the function \\( f(x) = 3x^2 \\cdot \\sin(x) \\) is:\n",
606 + "\n",
607 + "\\[ \\boxed{f'(x) = 6x \\sin(x) + 3x^2 \\cos(x)} \\]\n",
608 + "\n",
609 + "To verify the final answer, we can recheck each step to ensure accuracy:\n",
610 + "- The derivative of \\( 3x^2 \\) is correctly calculated as \\( 6x \\).\n",
611 + "- The derivative of \\( \\sin(x) \\) is correctly calculated as \\( \\cos(x) \\).\n",
612 + "- The product rule is correctly applied, and the terms are correctly combined and simplified.\n",
613 + "\n",
614 + "Therefore, the final answer is confirmed to be correct. <ANS_START> \\( f'(x) = 3x^2 \\cos(x) + 6x \\sin(x) \\) <ANS_END>\n",
615 + "\n",
616 + "[Question] Evaluate the definite integral \\( \\int_{0}^{1} (4x^3 - 2x + 1) \\, dx \\).\n",
617 + "[Answer] To evaluate the definite integral \\( \\int_{0}^{1} (4x^3 - 2x + 1) \\, dx \\), we will follow these steps:\n",
618 + "\n",
619 + "1. **Find the antiderivative** of the integrand \\( 4x^3 - 2x + 1 \\).\n",
620 + "2. **Evaluate the antiderivative** at the upper limit of integration (1).\n",
621 + "3. **Evaluate the antiderivative** at the lower limit of integration (0).\n",
622 + "4. **Subtract the value** of the antiderivative at the lower limit from the value at the upper limit to find the definite integral.\n",
623 + "\n",
624 + "### Step-by-Step Solution:\n",
625 + "\n",
626 + "1. **Find the antiderivative**:\n",
627 + "   - The antiderivative of \\( 4x^3 \\) is \\( \\frac{4x^4}{4} = x^4 \\).\n",
628 + "   - The antiderivative of \\( -2x \\) is \\( -\\frac{2x^2}{2} = -x^2 \\).\n",
629 + "   - The antiderivative of \\( 1 \\) is \\( x \\).\n",
630 + "\n",
631 + "   Therefore, the antiderivative of \\( 4x^3 - 2x + 1 \\) is:\n",
632 + "   \\[\n",
633 + "   F(x) = x^4 - x^2 + x\n",
634 + "   \\]\n",
635 + "\n",
636 + "2. **Evaluate the antiderivative at the upper limit (1)**:\n",
637 + "   \\[\n",
638 + "   F(1) = 1^4 - 1^2 + 1 = 1 - 1 + 1 = 1\n",
639 + "   \\]\n",
640 + "\n",
641 + "3. **Evaluate the antiderivative at the lower limit (0)**:\n",
642 + "   \\[\n",
643 + "   F(0) = 0^4 - 0^2 + 0 = 0\n",
644 + "   \\]\n",
645 + "\n",
646 + "4. **Subtract the value at the lower limit from the value at the upper limit**:\n",
647 + "   \\[\n",
648 + "   \\int_{0}^{1} (4x^3 - 2x + 1) \\, dx = F(1) - F(0) = 1 - 0 = 1\n",
649 + "   \\]\n",
650 + "\n",
651 + "### Final Answer:\n",
652 + "\\[\n",
653 + "\\boxed{1}\n",
654 + "\\] <ANS_START> \\( 1 \\) <ANS_END>\n",
655 + "\n",
656 + "[Question] Solve the system of equations:\n",
657 + "\\[ \\begin{cases} \n",
658 + "x + 2y + z = 6 \\\\\n",
659 + "2x - y + 3z = 14 \\\\\n",
660 + "3x + y - z = 2 \n",
661 + "\\end{cases} \\]\n",
662 + "[Answer] To solve the system of equations:\n",
663 + "\\[ \\begin{cases} \n",
664 + "x + 2y + z = 6 \\\\\n",
665 + "2x - y + 3z = 14 \\\\\n",
666 + "3x + y - z = 2 \n",
667 + "\\end{cases} \\]\n",
668 + "\n",
669 + "we will use the method of elimination and substitution to find the values of \\(x\\), \\(y\\), and \\(z\\).\n",
670 + "\n",
671 + "**Step 1: Eliminate \\(z\\) from the first two equations.**\n",
672 + "\n",
673 + "First, we multiply the first equation by 3 to align the coefficients of \\(z\\):\n",
674 + "\\[ 3(x + 2y + z) = 3 \\cdot 6 \\]\n",
675 + "\\[ 3x + 6y + 3z = 18 \\]\n",
676 + "\n",
677 + "Now, we subtract the second equation from this result:\n",
678 + "\\[ (3x + 6y + 3z) - (2x - y + 3z) = 18 - 14 \\]\n",
679 + "\\[ 3x + 6y + 3z - 2x + y - 3z = 4 \\]\n",
680 + "\\[ x + 7y = 4 \\]\n",
681 + "\\[ \\text{(Equation 4)} \\]\n",
682 + "\n",
683 + "**Step 2: Eliminate \\(z\\) from the first and third equations.**\n",
684 + "\n",
685 + "Next, we multiply the first equation by 1 and the third equation by 1 to align the coefficients of \\(z\\):\n",
686 + "\\[ 1(x + 2y + z) = 1 \\cdot 6 \\]\n",
687 + "\\[ x + 2y + z = 6 \\]\n",
688 + "\n",
689 + "\\[ 1(3x + y - z) = 1 \\cdot 2 \\]\n",
690 + "\\[ 3x + y - z = 2 \\]\n",
691 + "\n",
692 + "Now, we add these two equations:\n",
693 + "\\[ (x + 2y + z) + (3x + y - z) = 6 + 2 \\]\n",
694 + "\\[ x + 2y + z + 3x + y - z = 8 \\]\n",
695 + "\\[ 4x + 3y = 8 \\]\n",
696 + "\\[ \\text{(Equation 5)} \\]\n",
697 + "\n",
698 + "**Step 3: Solve the system of equations formed by Equation 4 and Equation 5.**\n",
699 + "\n",
700 + "We now have:\n",
701 + "\\[ \\begin{cases} \n",
702 + "x + 7y = 4 \\\\\n",
703 + "4x + 3y = 8 \n",
704 + "\\end{cases} \\]\n",
705 + "\n",
706 + "First, we solve Equation 4 for \\(x\\):\n",
707 + "\\[ x = 4 - 7y \\]\n",
708 + "\n",
709 + "Substitute \\(x = 4 - 7y\\) into Equation 5:\n",
710 + "\\[ 4(4 - 7y) + 3y = 8 \\]\n",
711 + "\\[ 16 - 28y + 3y = 8 \\]\n",
712 + "\\[ 16 - 25y = 8 \\]\n",
713 + "\\[ -25y = 8 - 16 \\]\n",
714 + "\\[ -25y = -8 \\]\n",
715 + "\\[ y = \\frac{8}{25} \\]\n",
716 + "\n",
717 + "**Step 4: Substitute \\(y\\) back into Equation 4 to find \\(x\\).**\n",
718 + "\n",
719 + "\\[ x + 7\\left(\\frac{8}{25}\\right) = 4 \\]\n",
720 + "\\[ x + \\frac{56}{25} = 4 \\]\n",
721 + "\\[ x = 4 - \\frac{56}{25} \\]\n",
722 + "\\[ x = \\frac{100}{25} - \\frac{56}{25} \\]\n",
723 + "\\[ x = \\frac{44}{25} \\]\n",
724 + "\n",
725 + "**Step 5: Substitute \\(x\\) and \\(y\\) back into the first original equation to find \\(z\\).**\n",
726 + "\n",
727 + "\\[ x + 2y + z = 6 \\]\n",
728 + "\\[ \\frac{44}{25} + 2\\left(\\frac{8}{25}\\right) + z = 6 \\]\n",
729 + "\\[ \\frac{44}{25} + \\frac{16}{25} + z = 6 \\]\n",
730 + "\\[ \\frac{60}{25} + z = 6 \\]\n",
731 + "\\[ \\frac{60}{25} = 2.4 \\]\n",
732 + "\\[ 2.4 + z = 6 \\]\n",
733 + "\\[ z = 6 - 2.4 \\]\n",
734 + "\\[ z = 3.6 \\]\n",
735 + "\n",
736 + "**Final Answer:**\n",
737 + "\\[ x = \\frac{44}{25}, y = \\frac{8}{25}, z = 3.6 \\]\n",
738 + "\n",
739 + "We have verified each step and simplified all expressions. The solution is logically consistent and correctly formatted. <ANS_START> \\( x = \\frac{44}{25}, y = \\frac{8}{25}, z = 3.6 \\) <ANS_END>\n",
740 + "\n",
741 + "\n",
742 + "For each question present the reasoning followed by the correct answer.\n",
743 + "\"\"\""
744 + ]
745 + },
746 + {
747 + "cell_type": "markdown",
748 + "id": "c61c2f84",
749 + "metadata": {},
750 + "source": [
751 + "#### Scenario 3: We have training data and also want in-context examples in the final prompt"
752 + ]
753 + },
754 + {
755 + "cell_type": "markdown",
756 + "id": "11d2de75",
757 + "metadata": {},
758 + "source": [
759 + "Load and save the dataset"
760 + ]
761 + },
762 + {
763 + "cell_type": "code",
764 + "execution_count": null,
765 + "id": "976681bd-4f43-4dbc-947e-cdb94d4824f0",
766 + "metadata": {},
767 + "outputs": [],
768 + "source": [
769 + "if not os.path.exists(\"data\"):\n",
770 + "    os.mkdir(\"data\")\n",
771 + "\n",
772 + "dataset = load_dataset(\"openai/gsm8k\", \"main\")\n",
773 + "num_samples = 0\n",
774 + "for dataset_type in ['train', 'test']:\n",
775 + "    data_list = []\n",
776 + "    for data in dataset[dataset_type]:\n",
777 + "        data_list.append({\"question\": data['question'], \"answer\": data['answer']})\n",
778 + "        if num_samples == 100 and dataset_type == 'train':  # We sample only 100 train examples and randomly use 25 of them for training\n",
779 + "            break\n",
780 + "        num_samples += 1\n",
781 + "    gsm8k_processor.dataset_to_jsonl(\"data/\" + dataset_type + '.jsonl', dataset=data_list)"
782 + ]
783 + },
784 + {
785 + "cell_type": "markdown",
786 + "id": "abf1671a",
787 + "metadata": {},
788 + "source": [
789 + "Set the configurations"
790 + ]
791 + },
792 + {
793 + "cell_type": "code",
794 + "execution_count": null,
795 + "id": "cc841576",
796 + "metadata": {},
797 + "outputs": [],
798 + "source": [
799 + "file_path = 'configs/promptopt_config.yaml'\n",
800 + "config_dict = {\n",
801 + "    \"task_description\": \"You are a mathematics expert. You will be given a mathematics problem which you need to solve\",\n",
802 + "    \"base_instruction\": \"Lets think step by step.\",\n",
803 + "    \"mutation_rounds\": 2,\n",
804 + "    \"few_shot_count\": 5,\n",
805 + "    \"generate_reasoning\": True,\n",
806 + "    \"mutate_refine_iterations\": 3,\n",
807 + "    \"seen_set_size\": 20\n",
808 + "}\n",
809 + "update_yaml_file(file_path, config_dict)"
810 + ]
811 + },
812 + {
813 + "cell_type": "markdown",
814 + "id": "3392594d",
815 + "metadata": {},
816 + "source": [
817 + "Create an object for calling prompt optimization and inference functionalities"
818 + ]
819 + },
820 + {
821 + "cell_type": "code",
822 + "execution_count": null,
823 + "id": "8af4246f-db32-4b37-a73a-f9e2e5125d09",
824 + "metadata": {},
825 + "outputs": [],
826 + "source": [
827 + "gp = GluePromptOpt(promptopt_config_path,\n",
828 + "                   setup_config_path,\n",
829 + "                   dataset_jsonl=os.path.join(\"data\", \"train.jsonl\"),\n",
830 + "                   data_processor=gsm8k_processor)"
831 + ]
832 + },
833 + {
834 + "cell_type": "markdown",
835 + "id": "6f421ce9",
836 + "metadata": {},
837 + "source": [
838 + "Call the optimization function"
839 + ]
840 + },
841 + {
842 + "cell_type": "code",
843 + "execution_count": null,
844 + "id": "09e3e6e1",
845 + "metadata": {},
846 + "outputs": [],
847 + "source": [
848 + "best_prompt, expert_profile = gp.get_best_prompt(use_examples=True, run_without_train_examples=False, generate_synthetic_examples=False)"
849 + ]
850 + },
851 + {
852 + "cell_type": "markdown",
853 + "id": "15bb0e80",
854 + "metadata": {},
855 + "source": [
856 + "Output: The following prompt and expert profile are generated"
857 + ]
858 + },
859 + {
860 + "cell_type": "code",
861 + "execution_count": null,
862 + "id": "696e6612",
863 + "metadata": {},
864 + "outputs": [],
865 + "source": [
866 + "OUTPUT = \"\"\"Expert Identity: You are a mathematics expert with a strong background in various fields of mathematics, including algebra, calculus, geometry, and statistics. You have a deep understanding of mathematical theories and principles, and you are skilled at solving complex problems with precision and clarity. Your expertise allows you to break down intricate problems into manageable steps, making it easier for others to follow your reasoning. You are familiar with a wide range of mathematical techniques and tools, and you can apply them effectively to find solutions. Whether the problem involves solving equations, proving theorems, or analyzing data, you can provide a clear, accurate, and well-explained solution. Your ability to communicate complex mathematical concepts in an understandable way makes you an invaluable resource for anyone seeking help with mathematics.\n",
867 + "\n",
868 + "Final best prompt: \n",
869 + "\n",
870 + "You are a mathematics expert. Your task is to solve a given mathematics problem accurately and provide a clear, detailed explanation of your solution process. Follow these steps to ensure a comprehensive and well-structured solution:\n",
871 + "\n",
872 + "1. **Understand the Problem**: Carefully read and comprehend the problem statement. Identify the key components and what is being asked.\n",
873 + "\n",
874 + "2. **Identify Components**: Break down the problem into its fundamental components, such as variables, constants, and relevant quantities (e.g., base pay, overtime pay, distances, speeds, etc.).\n",
875 + "\n",
876 + "3. **Apply Relevant Principles**: Use appropriate mathematical principles, formulas, and methods to solve the problem step by step.\n",
877 + "\n",
878 + "4. **Logical Reasoning**: Employ logical reasoning to explain each step of your solution process. Ensure that each step follows logically from the previous one.\n",
879 + "\n",
880 + "5. **Detailed Explanations**: Provide detailed explanations for each step to ensure clarity and understanding. Include intermediate results and how they contribute to the final solution.\n",
881 + "\n",
882 + "6. **Explicit Calculation Steps**: Show each calculation step in detail, including intermediate results. Use proper mathematical notation and formatting.\n",
883 + "\n",
884 + "7. **Verify Each Step**: Recheck each intermediate step of your calculation to verify the correctness of the final answer. Ensure that all arithmetic and algebraic operations are accurate.\n",
885 + "\n",
886 + "8. **Combine Results**: Clearly combine different components of the problem (e.g., base pay and overtime pay) before arriving at the final answer.\n",
887 + "\n",
888 + "9. **Simplify and Notate**: Simplify the final answer where possible, and use proper mathematical notation and symbols.\n",
889 + "\n",
890 + "10. **Mark the Final Answer**: Clearly mark the final answer within <ANS_START> and <ANS_END> tags.\n",
891 + "\n",
892 + "Ensure that your approach is tailored to the specific type of mathematical problem being solved, whether it involves arithmetic, algebra, geometry, calculus, or any other area of mathematics. Present the solutions in a clear and organized manner.\n",
893 + "\n",
894 + "**Additional Guidelines:**\n",
895 + "- **Contextual Understanding**: Pay close attention to the context of the problem to ensure that all relationships and quantities are correctly interpreted.\n",
896 + "- **Correct Application of Arithmetic Operations**: Double-check that all arithmetic operations are applied correctly and align with the problem's requirements.\n",
897 + "- **Verification of Final Answer**: Dedicate a step to verify the final answer by rechecking all intermediate steps and ensuring they logically lead to the correct final result.\n",
898 + "- **Clarity in Marking Final Answer**: Use the <ANS_START> and <ANS_END> tags to clearly mark the final answer.\n",
899 + "\n",
900 + "By following these steps and additional guidelines, you will ensure that the solution is accurate, well-explained, and clearly presented.\n",
901 + "\n",
902 + "\n",
903 + "[Question] Bella bought stamps at the post office. Some of the stamps had a snowflake design, some had a truck design, and some had a rose design. Bella bought 11 snowflake stamps. She bought 9 more truck stamps than snowflake stamps, and 13 fewer rose stamps than truck stamps. How many stamps did Bella buy in all?\n",
904 + "[Answer] 1. **Understand the Problem**: Bella bought three types of stamps: snowflake, truck, and rose. We need to determine the total number of stamps she bought, given the relationships between the quantities of each type.\n",
905 + "\n",
906 + "2. **Identify Components**:\n",
907 + "   - Number of snowflake stamps: 11.\n",
908 + "   - Number of truck stamps: 9 more than the number of snowflake stamps.\n",
909 + "   - Number of rose stamps: 13 fewer than the number of truck stamps.\n",
910 + "\n",
911 + "3. **Apply Relevant Principles**: Use basic arithmetic operations to find the quantities of truck and rose stamps, and then sum all the quantities to find the total number of stamps.\n",
912 + "\n",
913 + "4. **Logical Reasoning**:\n",
914 + "   - Number of snowflake stamps: 11.\n",
915 + "   - Number of truck stamps: 11 (snowflake stamps) + 9 = 20.\n",
916 + "   - Number of rose stamps: 20 (truck stamps) - 13 = 7.\n",
917 + "\n",
918 + "5. **Detailed Explanations**:\n",
919 + "   - Calculate the number of truck stamps: 11 (snowflake stamps) + 9 = 20.\n",
920 + "   - Calculate the number of rose stamps: 20 (truck stamps) - 13 = 7.\n",
921 + "   - Calculate the total number of stamps: 11 (snowflake) + 20 (truck) + 7 (rose) = 38.\n",
922 + "\n",
923 + "6. **Explicit Calculation Steps**:\n",
924 + "   - Truck stamps: 11 + 9 = $<11+9=20>20.\n",
925 + "   - Rose stamps: 20 - 13 = $<20-13=7>7.\n",
926 + "   - Total stamps: 11 + 20 + 7 = $<11+20+7=38>38.\n",
927 + "\n",
928 + "7. **Verify Each Step**: Recheck each calculation step to ensure correctness:\n",
929 + "   - Truck stamps: 11 + 9 = 20.\n",
930 + "   - Rose stamps: 20 - 13 = 7.\n",
931 + "   - Total stamps: 11 + 20 + 7 = 38.\n",
932 + "\n",
933 + "8. **Combine Results**: Combine the number of each type of stamp correctly to find the total number of stamps.\n",
934 + "\n",
935 + "9. **Simplify and Notate**: The final answer is already simplified.\n",
936 + "\n",
937 + "10. **Mark the Final Answer**: <ANS_START>38<ANS_END>\n",
938 + "\n",
939 + "By following these steps, we ensure that the solution is accurate, well-explained, and clearly presented. <ANS_START>38<ANS_END>\n",
940 + "\n",
941 + "[Question] It takes Roque two hours to walk to work and one hour to ride his bike to work. Roque walks to and from work three times a week and rides his bike to and from work twice a week. How many hours in total does he take to get to and from work a week with walking and biking?\n",
942 + "[Answer] 1. **Understand the Problem**: Roque has two modes of transportation to work: walking and biking. We need to calculate the total time he spends traveling to and from work in a week, considering the different times and frequencies for each mode.\n",
943 + "\n",
944 + "2. **Identify Components**:\n",
945 + "   - Time to walk to work: 2 hours (one way).\n",
946 + "   - Time to bike to work: 1 hour (one way).\n",
947 + "   - Frequency of walking: 3 times a week (to and from work).\n",
948 + "   - Frequency of biking: 2 times a week (to and from work).\n",
949 + "\n",
950 + "3. **Apply Relevant Principles**: Use basic arithmetic to calculate the total time spent walking and biking separately, then sum these times to get the total weekly travel time.\n",
951 + "\n",
952 + "4. **Logical Reasoning**:\n",
953 + "   - Calculate the total walking time for a week:\n",
954 + "     - One round trip (to and from work) by walking takes 2 hours (to work) + 2 hours (from work) = 4 hours.\n",
955 + "     - Roque walks to and from work 3 times a week, so the total walking time is 4 hours per round trip * 3 round trips = 12 hours.\n",
956 + "   - Calculate the total biking time for a week:\n",
957 + "     - One round trip (to and from work) by biking takes 1 hour (to work) + 1 hour (from work) = 2 hours.\n",
958 + "     - Roque bikes to and from work 2 times a week, so the total biking time is 2 hours per round trip * 2 round trips = 4 hours.\n",
959 + "\n",
960 + "5. **Detailed Explanations**:\n",
961 + "   - Walking time calculation:\n",
962 + "     - One round trip walking: 2 hours (to work) + 2 hours (from work) = 4 hours.\n",
963 + "     - Total walking time for the week: 4 hours per round trip * 3 round trips = 12 hours.\n",
964 + "   - Biking time calculation:\n",
965 + "     - One round trip biking: 1 hour (to work) + 1 hour (from work) = 2 hours.\n",
966 + "     - Total biking time for the week: 2 hours per round trip * 2 round trips = 4 hours.\n",
967 + "   - Combine the total walking and biking times to get the total weekly travel time:\n",
968 + "     - Total weekly travel time: 12 hours (walking) + 4 hours (biking) = 16 hours.\n",
969 + "\n",
970 + "6. **Explicit Calculation Steps**:\n",
971 + "   - Walking time: 2 hours (one way) * 2 (round trip) * 3 (times a week) = $<2*2*3=12>12 hours.\n",
972 + "   - Biking time: 1 hour (one way) * 2 (round trip) * 2 (times a week) = $<1*2*2=4>4 hours.\n",
973 + "   - Total time: 12 hours (walking) + 4 hours (biking) = $<12+4=16>16 hours.\n",
974 + "\n",
975 + "7. **Verify Each Step**: Recheck each calculation step to ensure correctness. Confirm that the arithmetic operations and logic used are accurate.\n",
976 + "\n",
977 + "8. **Combine Results**: Combine the total walking and biking times correctly to ensure the final answer is accurate.\n",
978 + "\n",
979 + "9. **Simplify and Notate**: The final answer is already simplified and clearly presented.\n",
|
980 |
+
"\n",
|
981 |
+
"10. **Mark the Final Answer**: <ANS_START>16<ANS_END>\n",
|
982 |
+
"\n",
|
983 |
+
"By following these steps, we ensure that the solution is accurate, well-explained, and clearly presented. <ANS_START>16<ANS_END>\n",
|
984 |
+
"\n",
|
985 |
+
"[Question] Betty is saving money for a new wallet which costs $100. Betty has only half of the money she needs. Her parents decided to give her $15 for that purpose, and her grandparents twice as much as her parents. How much more money does Betty need to buy the wallet?\n",
|
986 |
+
"[Answer] 1. **Understand the Problem**: Betty is saving money for a wallet that costs $100. She currently has half of the money she needs. Her parents and grandparents are contributing additional amounts to help her reach her goal. We need to determine how much more money Betty needs to buy the wallet.\n",
|
987 |
+
"\n",
|
988 |
+
"2. **Identify Components**:\n",
|
989 |
+
" - Total cost of the wallet: $100.\n",
|
990 |
+
" - Amount Betty currently has: half of $100.\n",
|
991 |
+
" - Contribution from parents: $15.\n",
|
992 |
+
" - Contribution from grandparents: twice the amount given by parents.\n",
|
993 |
+
"\n",
|
994 |
+
"3. **Apply Relevant Principles**: Use basic arithmetic to calculate the total amount of money Betty will have after receiving contributions from her parents and grandparents, and then determine how much more she needs to reach $100.\n",
|
995 |
+
"\n",
|
996 |
+
"4. **Logical Reasoning**:\n",
|
997 |
+
" - Calculate the amount Betty currently has: $100 / 2 = $50.\n",
|
998 |
+
" - Calculate the contribution from grandparents: 2 * $15 = $30.\n",
|
999 |
+
" - Calculate the total amount of money Betty will have: $50 (current amount) + $15 (parents' contribution) + $30 (grandparents' contribution).\n",
|
1000 |
+
"\n",
|
1001 |
+
"5. **Detailed Explanations**:\n",
|
1002 |
+
" - Betty currently has $50 because half of $100 is $50.\n",
|
1003 |
+
" - Her parents give her $15.\n",
|
1004 |
+
" - Her grandparents give her twice the amount her parents give, which is 2 * $15 = $30.\n",
|
1005 |
+
" - The total amount of money Betty will have is $50 (current amount) + $15 (parents' contribution) + $30 (grandparents' contribution) = $95.\n",
|
1006 |
+
"\n",
|
1007 |
+
"6. **Explicit Calculation Steps**:\n",
|
1008 |
+
" - Current amount: $100 / 2 = $<100/2=50>50.\n",
|
1009 |
+
" - Grandparents' contribution: 2 * $15 = $<2*15=30>30.\n",
|
1010 |
+
" - Total amount: $50 + $15 + $30 = $<50+15+30=95>95.\n",
|
1011 |
+
"\n",
|
1012 |
+
"7. **Verify Each Step**: Recheck each calculation step to ensure correctness.\n",
|
1013 |
+
" - Current amount: $100 / 2 = $50.\n",
|
1014 |
+
" - Grandparents' contribution: 2 * $15 = $30.\n",
|
1015 |
+
" - Total amount: $50 + $15 + $30 = $95.\n",
|
1016 |
+
"\n",
|
1017 |
+
"8. **Combine Results**: Combine the total amount of money Betty will have correctly.\n",
|
1018 |
+
" - Total amount: $50 (current amount) + $15 (parents' contribution) + $30 (grandparents' contribution) = $95.\n",
|
1019 |
+
"\n",
|
1020 |
+
"9. **Simplify and Notate**: The final answer is already simplified.\n",
|
1021 |
+
"\n",
|
1022 |
+
"10. **Mark the Final Answer**: \n",
|
1023 |
+
" - Amount Betty needs to buy the wallet: $100 - $95 = $<100-95=5>5.\n",
|
1024 |
+
"\n",
|
1025 |
+
"<ANS_START>5<ANS_END> <ANS_START>5<ANS_END>\n",
|
1026 |
+
"\n",
|
1027 |
+
"[Question] A rectangle has a length of 10 cm and a width of 5 cm. What is the area and perimeter of the rectangle?\n",
|
1028 |
+
"[Answer] 1. **Understand the Problem**: We need to find both the area and the perimeter of a rectangle given its length and width.\n",
|
1029 |
+
"\n",
|
1030 |
+
"2. **Identify Components**: \n",
|
1031 |
+
" - Length of the rectangle (L) = 10 cm\n",
|
1032 |
+
" - Width of the rectangle (W) = 5 cm\n",
|
1033 |
+
"\n",
|
1034 |
+
"3. **Apply Relevant Principles**: \n",
|
1035 |
+
" - The formula for the area of a rectangle is \\( \\text{Area} = \\text{Length} \\times \\text{Width} \\).\n",
|
1036 |
+
" - The formula for the perimeter of a rectangle is \\( \\text{Perimeter} = 2 \\times (\\text{Length} + \\text{Width}) \\).\n",
|
1037 |
+
"\n",
|
1038 |
+
"4. **Logical Reasoning**:\n",
|
1039 |
+
" - To find the area, multiply the length by the width.\n",
|
1040 |
+
" - To find the perimeter, add the length and the width, then multiply the result by 2.\n",
|
1041 |
+
"\n",
|
1042 |
+
"5. **Detailed Explanations**:\n",
|
1043 |
+
" - Calculate the area: \\( \\text{Area} = 10 \\, \\text{cm} \\times 5 \\, \\text{cm} \\).\n",
|
1044 |
+
" - Calculate the perimeter: \\( \\text{Perimeter} = 2 \\times (10 \\, \\text{cm} + 5 \\, \\text{cm}) \\).\n",
|
1045 |
+
"\n",
|
1046 |
+
"6. **Explicit Calculation Steps**:\n",
|
1047 |
+
" - Area: \\( 10 \\times 5 = 50 \\, \\text{cm}^2 \\).\n",
|
1048 |
+
" - Perimeter: \\( 2 \\times (10 + 5) = 2 \\times 15 = 30 \\, \\text{cm} \\).\n",
|
1049 |
+
"\n",
|
1050 |
+
"7. **Verify Each Step**: \n",
|
1051 |
+
" - Recheck the area calculation: \\( 10 \\times 5 = 50 \\, \\text{cm}^2 \\).\n",
|
1052 |
+
" - Recheck the perimeter calculation: \\( 2 \\times 15 = 30 \\, \\text{cm} \\).\n",
|
1053 |
+
"\n",
|
1054 |
+
"8. **Combine Results**: \n",
|
1055 |
+
" - The area of the rectangle is \\( 50 \\, \\text{cm}^2 \\).\n",
|
1056 |
+
" - The perimeter of the rectangle is \\( 30 \\, \\text{cm} \\).\n",
|
1057 |
+
"\n",
|
1058 |
+
"9. **Simplify and Notate**: \n",
|
1059 |
+
" - The final answers are already simplified.\n",
|
1060 |
+
"\n",
|
1061 |
+
"10. **Mark the Final Answer**: \n",
|
1062 |
+
" - Area: <ANS_START>50 \\, \\text{cm}^2<ANS_END>\n",
|
1063 |
+
" - Perimeter: <ANS_START>30 \\, \\text{cm}<ANS_END>\n",
|
1064 |
+
"\n",
|
1065 |
+
"By following these steps, we ensure that the solution is accurate, well-explained, and clearly presented. <ANS_START>50<ANS_END>\n",
|
1066 |
+
"\n",
|
1067 |
+
"[Question] Solve for x in the equation 2x + 3 = 11.\n",
|
1068 |
+
"[Answer] **Understand the Problem**: We need to solve for the variable \\( x \\) in the given linear equation \\( 2x + 3 = 11 \\).\n",
|
1069 |
+
"\n",
|
1070 |
+
"**Identify Components**: \n",
|
1071 |
+
"- The equation is \\( 2x + 3 = 11 \\).\n",
|
1072 |
+
"- We need to isolate \\( x \\) on one side of the equation.\n",
|
1073 |
+
"\n",
|
1074 |
+
"**Apply Relevant Principles**: \n",
|
1075 |
+
"- Use basic algebraic principles to isolate \\( x \\).\n",
|
1076 |
+
"\n",
|
1077 |
+
"**Logical Reasoning**:\n",
|
1078 |
+
"1. Start with the given equation: \\( 2x + 3 = 11 \\).\n",
|
1079 |
+
"2. Subtract 3 from both sides of the equation to isolate the term with \\( x \\):\n",
|
1080 |
+
" \\[\n",
|
1081 |
+
" 2x + 3 - 3 = 11 - 3\n",
|
1082 |
+
" \\]\n",
|
1083 |
+
"3. Simplify both sides:\n",
|
1084 |
+
" \\[\n",
|
1085 |
+
" 2x = 8\n",
|
1086 |
+
" \\]\n",
|
1087 |
+
"4. Divide both sides by 2 to solve for \\( x \\):\n",
|
1088 |
+
" \\[\n",
|
1089 |
+
" \\frac{2x}{2} = \\frac{8}{2}\n",
|
1090 |
+
" \\]\n",
|
1091 |
+
"5. Simplify the division:\n",
|
1092 |
+
" \\[\n",
|
1093 |
+
" x = 4\n",
|
1094 |
+
" \\]\n",
|
1095 |
+
"\n",
|
1096 |
+
"**Detailed Explanations**:\n",
|
1097 |
+
"- Subtracting 3 from both sides removes the constant term on the left side, leaving \\( 2x \\) isolated.\n",
|
1098 |
+
"- Dividing both sides by 2 isolates \\( x \\) by removing the coefficient of 2.\n",
|
1099 |
+
"\n",
|
1100 |
+
"**Explicit Calculation Steps**:\n",
|
1101 |
+
"1. \\( 2x + 3 = 11 \\)\n",
|
1102 |
+
"2. \\( 2x + 3 - 3 = 11 - 3 \\)\n",
|
1103 |
+
"3. \\( 2x = 8 \\)\n",
|
1104 |
+
"4. \\( \\frac{2x}{2} = \\frac{8}{2} \\)\n",
|
1105 |
+
"5. \\( x = 4 \\)\n",
|
1106 |
+
"\n",
|
1107 |
+
"**Verify Each Step**:\n",
|
1108 |
+
"- Recheck each step to ensure no arithmetic errors:\n",
|
1109 |
+
" - Subtracting 3 from 11 gives 8.\n",
|
1110 |
+
" - Dividing 8 by 2 gives 4.\n",
|
1111 |
+
"\n",
|
1112 |
+
"**Combine Results**: The final value of \\( x \\) is correctly isolated and calculated.\n",
|
1113 |
+
"\n",
|
1114 |
+
"**Simplify and Notate**: The final answer is already simplified.\n",
|
1115 |
+
"\n",
|
1116 |
+
"**Mark the Final Answer**: <ANS_START>4<ANS_END>\n",
|
1117 |
+
"\n",
|
1118 |
+
"By following these steps, we ensure that the solution is accurate, well-explained, and clearly presented. <ANS_START>4<ANS_END>\n",
|
1119 |
+
"\n",
|
1120 |
+
"\n",
|
1121 |
+
"For each question present the reasoning followed by the correct answer.\"\"\""
|
1122 |
+
]
|
1123 |
+
}
|
1124 |
+
],
|
1125 |
+
"metadata": {
|
1126 |
+
"kernelspec": {
|
1127 |
+
"display_name": "general",
|
1128 |
+
"language": "python",
|
1129 |
+
"name": "python3"
|
1130 |
+
},
|
1131 |
+
"language_info": {
|
1132 |
+
"codemirror_mode": {
|
1133 |
+
"name": "ipython",
|
1134 |
+
"version": 3
|
1135 |
+
},
|
1136 |
+
"file_extension": ".py",
|
1137 |
+
"mimetype": "text/x-python",
|
1138 |
+
"name": "python",
|
1139 |
+
"nbconvert_exporter": "python",
|
1140 |
+
"pygments_lexer": "ipython3",
|
1141 |
+
"version": "3.12.4"
|
1142 |
+
}
|
1143 |
+
},
|
1144 |
+
"nbformat": 4,
|
1145 |
+
"nbformat_minor": 5
|
1146 |
+
}
|
demos/svamp/.env
ADDED
@@ -0,0 +1,8 @@
1 |
+
USE_OPENAI_API_KEY="False"
|
2 |
+
|
3 |
+
OPENAI_API_KEY=""
|
4 |
+
OPENAI_MODEL_NAME=""
|
5 |
+
|
6 |
+
OPENAI_API_VERSION=""
|
7 |
+
AZURE_OPENAI_ENDPOINT=""
|
8 |
+
AZURE_OPENAI_DEPLOYMENT_NAME=""
|
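For reference, a minimal sketch of how these variables are typically consumed with python-dotenv (as the demo notebook below does); the branching on `USE_OPENAI_API_KEY` is an assumption about intent, not code from the repo:

```python
# Sketch only: load the .env above and pick the OpenAI vs. Azure settings.
import os
from dotenv import load_dotenv

load_dotenv(override=True)  # copies the key/value pairs from .env into os.environ

if os.environ.get("USE_OPENAI_API_KEY") == "True":
    api_key = os.environ["OPENAI_API_KEY"]          # direct OpenAI access
    model_name = os.environ["OPENAI_MODEL_NAME"]
else:
    endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]  # Azure OpenAI access
    deployment = os.environ["AZURE_OPENAI_DEPLOYMENT_NAME"]
    api_version = os.environ["OPENAI_API_VERSION"]
```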
demos/svamp/configs/prompt_library.yaml
ADDED
@@ -0,0 +1,36 @@
1 |
+
system_prompts: |
|
2 |
+
You are a helpful assistant that assists research students in understanding research papers.
|
3 |
+
system_guidelines: |
|
4 |
+
Guidelines
|
5 |
+
- Your role must always be a helpful assistant that assists students in understanding research papers.
|
6 |
+
- Only answer questions that are directly or indirectly related to the referenced paper(s).
|
7 |
+
|
8 |
+
mode:
|
9 |
+
chat:
|
10 |
+
- name: CHAT-FIRST-MESSAGE
|
11 |
+
llm_request_type: rag-query
|
12 |
+
prompt_template: |
|
13 |
+
{user_msg}
|
14 |
+
emb_model_id: text embedding ada 002 [vellm-openai2]
|
15 |
+
llm_model_id: gpt 35 Turbo [vellm-openai2]
|
16 |
+
prepend_system_prompts: False
|
17 |
+
prepend_system_guidelines: False
|
18 |
+
|
19 |
+
- name: CHAT-NEXT-MESSAGES
|
20 |
+
llm_request_type: rag-query
|
21 |
+
prompt_template: |
|
22 |
+
{user_msg}
|
23 |
+
emb_model_id: text embedding ada 002 [vellm-openai2]
|
24 |
+
llm_model_id: gpt 35 Turbo [vellm-openai2]
|
25 |
+
prepend_system_prompts: False
|
26 |
+
prepend_system_guidelines: False
|
27 |
+
|
28 |
+
generation:
|
29 |
+
- name: FLASH_PROFILE
|
30 |
+
prompt_template: |
|
31 |
+
{user_msg}
|
32 |
+
prepend_system_prompts: False
|
33 |
+
prepend_system_guidelines: False
|
34 |
+
llm_request_type: rag-query
|
35 |
+
emb_model_id: text embedding ada 002 [vellm-openai2]
|
36 |
+
llm_model_id: gpt 35 Turbo [vellm-openai2]
|
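A small sketch of reading this library with PyYAML and filling a template; the traversal simply mirrors the structure above, and the sample `user_msg` is made up:

```python
import yaml

with open("demos/svamp/configs/prompt_library.yaml") as f:
    lib = yaml.safe_load(f)

# First chat-mode entry (CHAT-FIRST-MESSAGE) and its template.
first_chat = lib["mode"]["chat"][0]
prompt = first_chat["prompt_template"].format(user_msg="What does Figure 2 show?")
print(first_chat["name"], "->", prompt)
```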
demos/svamp/configs/promptopt_config.yaml
ADDED
@@ -0,0 +1,52 @@
1 |
+
# Specify one or more prompt refinement techniques to be used. If you specify more than one prompt refinement technique,
|
2 |
+
# all these techniques would run on the same seed data. The result, iterations needed & cost incurred for each of these
|
3 |
+
# techniques would be logged, along with the winning technique for each data instance and overall.
|
4 |
+
|
5 |
+
# Supported prompt refinement techniques: Basic, RecursiveEval, MedPrompt
|
6 |
+
# Uncomment techniques that you want to use
|
7 |
+
############################ Critique Task Description Start ############################
|
8 |
+
prompt_technique_name: "critique_n_refine"
|
9 |
+
# unique_model_id of model defined in llm_config.yaml
|
10 |
+
unique_model_id: gpt-4o
|
11 |
+
# Number of iterations for conducting <mutation_rounds> rounds of mutation of task description
|
12 |
+
# followed by refinement of instructions
|
13 |
+
mutate_refine_iterations: 3
|
14 |
+
# Number of rounds of mutation to be performed when generating different styles
|
15 |
+
mutation_rounds: 3
|
16 |
+
# Refine instruction post mutation
|
17 |
+
refine_instruction: true
|
18 |
+
# Number of iterations for refining task description and in context examples for few-shot
|
19 |
+
refine_task_eg_iterations: 3
|
20 |
+
# Number of prompt variations to generate in a given iteration
|
21 |
+
style_variation: 5
|
22 |
+
# Number of questions to be asked to LLM in a single batch, during training step
|
23 |
+
questions_batch_size: 1
|
24 |
+
# Number of batches of questions to be answered correctly for a prompt to be considered as performing well
|
25 |
+
min_correct_count: 3
|
26 |
+
# Max number of mini-batches on which we should evaluate our prompt
|
27 |
+
max_eval_batches: 6
|
28 |
+
# Number of top best performing prompts to be considered for next iterations
|
29 |
+
top_n: 1
|
30 |
+
# Description of task. This will be fed to prompt
|
31 |
+
task_description: "You are a mathematics expert. You will be given a mathematics problem which you need to solve"
|
32 |
+
# Base instruction, in line with your dataset. This will be fed to prompt
|
33 |
+
base_instruction: "Let's think step by step."
|
34 |
+
# Instruction for specifying answer format
|
35 |
+
answer_format: "At the end, wrap your final answer and option if applicable between <ANS_START> and <ANS_END> tags"
|
36 |
+
# Number of samples from dataset, set aside as training data. In every iteration we would be drawing
|
37 |
+
# `questions_batch_size` examples from training data with replacement.
|
38 |
+
seen_set_size: 25
|
39 |
+
# Number of examples to be given for few shots
|
40 |
+
few_shot_count: 5
|
41 |
+
# Number of synthetic training examples to be generated
|
42 |
+
num_train_examples: 20
|
43 |
+
# Generate synthetic reasoning
|
44 |
+
generate_reasoning: true
|
45 |
+
# Generate description of an expert which can solve the task at hand
|
46 |
+
generate_expert_identity: true
|
47 |
+
# Generate keywords that describe the intent of the task
|
48 |
+
generate_intent_keywords: false
|
49 |
+
############################ Critique Task Description End ############################
|
50 |
+
|
51 |
+
|
52 |
+
|
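Several of these knobs describe a mini-batch scoring policy. The sketch below is one plausible reading of how `questions_batch_size`, `min_correct_count`, and `max_eval_batches` interact (an assumption, not PromptWizard's actual code; `answer_is_correct` is a hypothetical callback):

```python
import random

def prompt_passes(prompt, train_set, answer_is_correct,
                  questions_batch_size=1, min_correct_count=3, max_eval_batches=6):
    """Hypothetical reading of the knobs above: a prompt 'performs well' once
    min_correct_count mini-batches are fully correct, and is abandoned if that
    has not happened within max_eval_batches batches."""
    correct_batches = 0
    for _ in range(max_eval_batches):
        # The config notes batches are drawn from the seen set *with replacement*.
        batch = random.choices(train_set, k=questions_batch_size)
        if all(answer_is_correct(prompt, question) for question in batch):
            correct_batches += 1
        if correct_batches >= min_correct_count:
            return True
    return False
```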
demos/svamp/configs/setup_config.yaml
ADDED
@@ -0,0 +1,14 @@
1 |
+
assistant_llm:
|
2 |
+
# put the unique_model_id that you specified in llm_config.yaml
|
3 |
+
prompt_opt: gpt-4o
|
4 |
+
dir_info:
|
5 |
+
# Base directory for everything
|
6 |
+
base_dir: logs
|
7 |
+
log_dir_name: glue_logs
|
8 |
+
experiment_name: svamp
|
9 |
+
# Many features are different for mode: online/offline. For eg
|
10 |
+
# 1) Print of logs happens on console for offline mode
|
11 |
+
# 2) LLM Queue gets instantiated only in online mode
|
12 |
+
mode: offline
|
13 |
+
# Full length description of the experiment. This would be logged.
|
14 |
+
description:
|
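One plausible reading of the directory fields above (an assumption; PromptWizard may compose the path differently) is that logs for this run land under:

```python
import os

# Hypothetical composition: base_dir / log_dir_name / experiment_name
log_path = os.path.join("logs", "glue_logs", "svamp")
print(log_path)  # logs/glue_logs/svamp
```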
demos/svamp/demo.ipynb
ADDED
@@ -0,0 +1,295 @@
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"id": "14360485",
|
6 |
+
"metadata": {},
|
7 |
+
"source": [
|
8 |
+
"#### Set environment variables in [.env](.env) for LLM API calling"
|
9 |
+
]
|
10 |
+
},
|
11 |
+
{
|
12 |
+
"cell_type": "markdown",
|
13 |
+
"id": "6bd95c11",
|
14 |
+
"metadata": {},
|
15 |
+
"source": [
|
16 |
+
"### Import Dependencies"
|
17 |
+
]
|
18 |
+
},
|
19 |
+
{
|
20 |
+
"cell_type": "code",
|
21 |
+
"execution_count": null,
|
22 |
+
"id": "f1fb3d81-16b6-4b8c-a028-880fdce5e14a",
|
23 |
+
"metadata": {},
|
24 |
+
"outputs": [],
|
25 |
+
"source": [
|
26 |
+
"import sys\n",
|
27 |
+
"sys.path.insert(0, \"../../\")\n",
|
28 |
+
"import os\n",
|
29 |
+
"import promptwizard\n",
|
30 |
+
"from promptwizard.glue.promptopt.instantiate import GluePromptOpt\n",
|
31 |
+
"from promptwizard.glue.promptopt.techniques.common_logic import DatasetSpecificProcessing\n",
|
32 |
+
"from promptwizard.glue.common.utils.file import save_jsonlist\n",
|
33 |
+
"from typing import Any\n",
|
34 |
+
"from tqdm import tqdm\n",
|
35 |
+
"import json\n",
|
36 |
+
"from datasets import load_dataset\n",
|
37 |
+
"\n",
|
38 |
+
"from dotenv import load_dotenv\n",
|
39 |
+
"load_dotenv(override = True)\n"
|
40 |
+
]
|
41 |
+
},
|
42 |
+
{
|
43 |
+
"cell_type": "markdown",
|
44 |
+
"id": "f061d2fd",
|
45 |
+
"metadata": {},
|
46 |
+
"source": [
|
47 |
+
"### Create a dataset specific class and define the required functions "
|
48 |
+
]
|
49 |
+
},
|
50 |
+
{
|
51 |
+
"cell_type": "code",
|
52 |
+
"execution_count": null,
|
53 |
+
"id": "5f325d33",
|
54 |
+
"metadata": {},
|
55 |
+
"outputs": [],
|
56 |
+
"source": [
|
57 |
+
"\n",
|
58 |
+
"def extract_between(start, end, text):\n",
|
59 |
+
" \"\"\"\n",
|
60 |
+
" Extracts the substring from 'text' that is between 'start' and 'end' strings.\n",
|
61 |
+
" \n",
|
62 |
+
" Parameters:\n",
|
63 |
+
" - start (str): The starting delimiter string.\n",
|
64 |
+
" - end (str): The ending delimiter string.\n",
|
65 |
+
" - text (str): The text to search within.\n",
|
66 |
+
" \n",
|
67 |
+
" Returns:\n",
|
68 |
+
" - str: The extracted substring between the start and end delimiters.\n",
|
69 |
+
" \"\"\"\n",
|
70 |
+
" start_index = text.find(start)\n",
|
71 |
+
" if start_index == -1:\n",
|
72 |
+
" return '' \n",
|
73 |
+
" \n",
|
74 |
+
" start_index += len(start)\n",
|
75 |
+
" \n",
|
76 |
+
" end_index = text.find(end, start_index)\n",
|
77 |
+
" if end_index == -1:\n",
|
78 |
+
" return '' \n",
|
79 |
+
" return text[start_index:end_index]\n",
|
80 |
+
"\n",
|
81 |
+
"class SVAMP(DatasetSpecificProcessing):\n",
|
82 |
+
"\n",
|
83 |
+
" def dataset_to_jsonl(self, dataset_jsonl: str, **kwargs: Any) -> None:\n",
|
84 |
+
" def extract_answer_from_output(completion):\n",
|
85 |
+
"\n",
|
86 |
+
" return completion\n",
|
87 |
+
"\n",
|
88 |
+
" examples_set = []\n",
|
89 |
+
"\n",
|
90 |
+
" for _, sample in tqdm(enumerate(kwargs[\"dataset\"]), desc=\"Evaluating samples\"):\n",
|
91 |
+
" example = {\n",
|
92 |
+
" DatasetSpecificProcessing.QUESTION_LITERAL: sample['question'],\n",
|
93 |
+
" DatasetSpecificProcessing.ANSWER_WITH_REASON_LITERAL: sample['answer'],\n",
|
94 |
+
" DatasetSpecificProcessing.FINAL_ANSWER_LITERAL: extract_answer_from_output(sample[\"answer\"])\n",
|
95 |
+
" }\n",
|
96 |
+
" examples_set.append(example)\n",
|
97 |
+
"\n",
|
98 |
+
" save_jsonlist(dataset_jsonl, examples_set, \"w\")\n",
|
99 |
+
"\n",
|
100 |
+
" def extract_final_answer(self, answer: str):\n",
|
101 |
+
" \n",
|
102 |
+
" final_answer = extract_between(text=answer,start=\"<ANS_START>\",end=\"<ANS_END>\")\n",
|
103 |
+
" return final_answer\n",
|
104 |
+
" \n",
|
105 |
+
" def access_answer(self, llm_output: str, gt_answer: str):\n",
|
106 |
+
"\n",
|
107 |
+
" predicted_answer = self.extract_final_answer(llm_output)\n",
|
108 |
+
" is_correct = False\n",
|
109 |
+
" if predicted_answer and (predicted_answer.lower() == gt_answer.lower()):\n",
|
110 |
+
" is_correct = True\n",
|
111 |
+
"\n",
|
112 |
+
" return is_correct, predicted_answer"
|
113 |
+
]
|
114 |
+
},
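A quick, illustrative sanity check of the helpers defined above (the sample output string is made up):

```python
sample_output = "Step-by-step reasoning ... <ANS_START>38<ANS_END>"

print(extract_between(start="<ANS_START>", end="<ANS_END>", text=sample_output))  # 38

checker = SVAMP()
is_correct, predicted = checker.access_answer(sample_output, gt_answer="38")
print(is_correct, predicted)  # True 38
```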
|
115 |
+
{
|
116 |
+
"cell_type": "code",
|
117 |
+
"execution_count": null,
|
118 |
+
"id": "f384eb57",
|
119 |
+
"metadata": {},
|
120 |
+
"outputs": [],
|
121 |
+
"source": [
|
122 |
+
"svamp_processor = SVAMP()"
|
123 |
+
]
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"cell_type": "code",
|
127 |
+
"execution_count": null,
|
128 |
+
"id": "976681bd-4f43-4dbc-947e-cdb94d4824f0",
|
129 |
+
"metadata": {},
|
130 |
+
"outputs": [],
|
131 |
+
"source": [
|
132 |
+
"\n",
|
133 |
+
"if not os.path.exists(\"data\"):\n",
|
134 |
+
" os.mkdir(\"data\")\n",
|
135 |
+
"\n",
|
136 |
+
"dataset = load_dataset(\"ChilleD/SVAMP\")\n",
|
137 |
+
"\n",
|
138 |
+
"for dataset_type in ['train','test']:\n",
|
139 |
+
" data_list = []\n",
|
140 |
+
" num_samples = 0\n",
|
141 |
+
" for data in dataset[dataset_type]:\n",
|
142 |
+
" data_list.append({\"question\": data['question_concat'], \"answer\": data['Answer']})\n",
|
143 |
+
" if dataset_type == 'train' and num_samples == 100: # We sample only 100 train examples and use 25 out them for training randomly\n",
|
144 |
+
" break\n",
|
145 |
+
" num_samples += 1\n",
|
146 |
+
" svamp_processor.dataset_to_jsonl(\"data/\"+ dataset_type+'.jsonl', dataset=data_list)"
|
147 |
+
]
|
148 |
+
},
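As a quick check (not part of the original demo), each line of the dumped JSONL should parse independently:

```python
import json

with open("data/train.jsonl") as f:
    rows = [json.loads(line) for line in f]

print(len(rows), "examples; fields:", sorted(rows[0].keys()))
```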
|
149 |
+
{
|
150 |
+
"cell_type": "markdown",
|
151 |
+
"id": "4852b94b",
|
152 |
+
"metadata": {},
|
153 |
+
"source": [
|
154 |
+
"### Set paths"
|
155 |
+
]
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"cell_type": "code",
|
159 |
+
"execution_count": 47,
|
160 |
+
"id": "f43482f1-3e10-4cf7-8ea6-ff42c04067a6",
|
161 |
+
"metadata": {},
|
162 |
+
"outputs": [],
|
163 |
+
"source": [
|
164 |
+
"train_file_name = os.path.join(\"data\", \"train.jsonl\")\n",
|
165 |
+
"test_file_name = os.path.join(\"data\", \"test.jsonl\")\n",
|
166 |
+
"path_to_config = \"configs\"\n",
|
167 |
+
"llm_config_path = os.path.join(path_to_config, \"llm_config.yaml\")\n",
|
168 |
+
"promptopt_config_path = os.path.join(path_to_config, \"promptopt_config.yaml\")\n",
|
169 |
+
"setup_config_path = os.path.join(path_to_config, \"setup_config.yaml\")"
|
170 |
+
]
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"cell_type": "markdown",
|
174 |
+
"id": "f7ba6394",
|
175 |
+
"metadata": {},
|
176 |
+
"source": [
|
177 |
+
"### Create an object for calling prompt optimization and inference functionalities"
|
178 |
+
]
|
179 |
+
},
|
180 |
+
{
|
181 |
+
"cell_type": "code",
|
182 |
+
"execution_count": null,
|
183 |
+
"id": "8af4246f-db32-4b37-a73a-f9e2e5125d09",
|
184 |
+
"metadata": {},
|
185 |
+
"outputs": [],
|
186 |
+
"source": [
|
187 |
+
"gp = GluePromptOpt(promptopt_config_path,\n",
|
188 |
+
" setup_config_path,\n",
|
189 |
+
" train_file_name,\n",
|
190 |
+
" svamp_processor)"
|
191 |
+
]
|
192 |
+
},
|
193 |
+
{
|
194 |
+
"cell_type": "markdown",
|
195 |
+
"id": "6e38ea08",
|
196 |
+
"metadata": {},
|
197 |
+
"source": [
|
198 |
+
"### Call prompt optmization function\n",
|
199 |
+
"1. ```use_examples``` can be used when there are training samples and a mixture of real and synthetic in-context examples are required in the final prompt. When set to ```False``` all the in-context examples will be real\n",
|
200 |
+
"2. ```generate_synthetic_examples``` can be used when there are no training samples and we want to generate synthetic examples \n",
|
201 |
+
"3. ```run_without_train_examples``` can be used when there are no training samples and in-context examples are not required in the final prompt "
|
202 |
+
]
|
203 |
+
},
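A sketch of the three modes described in the cell above, using the `gp` object created earlier. Only one call is used per run; the flag combinations for modes 2 and 3 are inferred from the descriptions, not confirmed against the library:

```python
# Mode 1: real training samples, mixed real + synthetic in-context examples.
best_prompt, expert_profile = gp.get_best_prompt(
    use_examples=True, run_without_train_examples=False, generate_synthetic_examples=False)

# Mode 2 (assumed flags): no training samples, synthesize examples first.
# gp.get_best_prompt(use_examples=False, run_without_train_examples=False,
#                    generate_synthetic_examples=True)

# Mode 3 (assumed flags): no training samples, no in-context examples in the final prompt.
# gp.get_best_prompt(use_examples=False, run_without_train_examples=True,
#                    generate_synthetic_examples=False)
```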
|
204 |
+
{
|
205 |
+
"cell_type": "code",
|
206 |
+
"execution_count": null,
|
207 |
+
"id": "573c6151-2c03-45d9-9904-1724a1e20f1b",
|
208 |
+
"metadata": {
|
209 |
+
"scrolled": true
|
210 |
+
},
|
211 |
+
"outputs": [],
|
212 |
+
"source": [
|
213 |
+
"# Function call to generate optimal prompt and expert profile \n",
|
214 |
+
"best_prompt, expert_profile = gp.get_best_prompt(use_examples=True,run_without_train_examples=False,generate_synthetic_examples=False)"
|
215 |
+
]
|
216 |
+
},
|
217 |
+
{
|
218 |
+
"cell_type": "markdown",
|
219 |
+
"id": "bae1a791",
|
220 |
+
"metadata": {},
|
221 |
+
"source": [
|
222 |
+
"### Save the optimized prompt and expert profile"
|
223 |
+
]
|
224 |
+
},
|
225 |
+
{
|
226 |
+
"cell_type": "code",
|
227 |
+
"execution_count": null,
|
228 |
+
"id": "34a716af-0d77-4c7d-b1c2-6438d66096ce",
|
229 |
+
"metadata": {
|
230 |
+
"scrolled": true
|
231 |
+
},
|
232 |
+
"outputs": [],
|
233 |
+
"source": [
|
234 |
+
"import pickle \n",
|
235 |
+
"\n",
|
236 |
+
"if not os.path.exists(\"results\"):\n",
|
237 |
+
" os.system(\"mkdir results\")\n",
|
238 |
+
"\n",
|
239 |
+
"with open(\"results/best_prompt.pkl\", 'wb') as f:\n",
|
240 |
+
" pickle.dump(best_prompt, f)\n",
|
241 |
+
"with open(\"results/expert_profile.pkl\", 'wb') as f:\n",
|
242 |
+
" pickle.dump(expert_profile, f)\n",
|
243 |
+
"\n",
|
244 |
+
"print(f\"Best prompt: {best_prompt} \\nExpert profile: {expert_profile}\")"
|
245 |
+
]
|
246 |
+
},
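Since the prompt and expert profile are pickled, a later session can reload them without re-running the optimization (a small sketch using the paths saved above):

```python
import pickle

with open("results/best_prompt.pkl", "rb") as f:
    best_prompt = pickle.load(f)
with open("results/expert_profile.pkl", "rb") as f:
    expert_profile = pickle.load(f)
```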
|
247 |
+
{
|
248 |
+
"cell_type": "markdown",
|
249 |
+
"id": "b7691a87",
|
250 |
+
"metadata": {},
|
251 |
+
"source": [
|
252 |
+
"### Evaluate the optimized prompt"
|
253 |
+
]
|
254 |
+
},
|
255 |
+
{
|
256 |
+
"cell_type": "code",
|
257 |
+
"execution_count": null,
|
258 |
+
"id": "c49b5711-82dd-4d18-8cd4-ee447cf8d74c",
|
259 |
+
"metadata": {
|
260 |
+
"scrolled": true
|
261 |
+
},
|
262 |
+
"outputs": [],
|
263 |
+
"source": [
|
264 |
+
"gp.EXPERT_PROFILE = expert_profile\n",
|
265 |
+
"gp.BEST_PROMPT = best_prompt\n",
|
266 |
+
"\n",
|
267 |
+
"# Function call to evaluate the prompt\n",
|
268 |
+
"accuracy = gp.evaluate(test_file_name)\n",
|
269 |
+
"\n",
|
270 |
+
"print(f\"Final Accuracy: {accuracy}\")"
|
271 |
+
]
|
272 |
+
}
|
273 |
+
],
|
274 |
+
"metadata": {
|
275 |
+
"kernelspec": {
|
276 |
+
"display_name": "Python 3 (ipykernel)",
|
277 |
+
"language": "python",
|
278 |
+
"name": "python3"
|
279 |
+
},
|
280 |
+
"language_info": {
|
281 |
+
"codemirror_mode": {
|
282 |
+
"name": "ipython",
|
283 |
+
"version": 3
|
284 |
+
},
|
285 |
+
"file_extension": ".py",
|
286 |
+
"mimetype": "text/x-python",
|
287 |
+
"name": "python",
|
288 |
+
"nbconvert_exporter": "python",
|
289 |
+
"pygments_lexer": "ipython3",
|
290 |
+
"version": "3.10.12"
|
291 |
+
}
|
292 |
+
},
|
293 |
+
"nbformat": 4,
|
294 |
+
"nbformat_minor": 5
|
295 |
+
}
|
docs/images/arithmetic_task.png
ADDED
docs/images/bigbench.png
ADDED
docs/images/comaprision.png
ADDED
docs/images/cost_analysis.png
ADDED
docs/images/curve.png
ADDED
docs/images/github.png
ADDED
docs/images/icl_results.png
ADDED
docs/images/iterative_flowchart-1.png
ADDED
docs/images/msr_blog.png
ADDED
docs/images/overview.png
ADDED
docs/images/ppc.png
ADDED
docs/images/ppc_1.png
ADDED
docs/images/prompting.png
ADDED
docs/images/sequential_flowchart-1.png
ADDED
docs/images/slm_prompt.png
ADDED
docs/index.html
ADDED
@@ -0,0 +1,784 @@
1 |
+
<!DOCTYPE html>
|
2 |
+
<html>
|
3 |
+
<head>
|
4 |
+
<meta charset="utf-8">
|
5 |
+
<meta name="description"
|
6 |
+
content=" PromptWizard:Task-Aware Prompt Optimization Framework">
|
7 |
+
<meta name="keywords" content="PromptWizard">
|
8 |
+
<meta name="viewport" content="width=device-width, initial-scale=1">
|
9 |
+
<title>PromptWizard</title>
|
10 |
+
|
11 |
+
<!-- Global site tag (gtag.js) - Google Analytics -->
|
12 |
+
<script async src="https://www.googletagmanager.com/gtag/js?id=G-PYVRSFMDRL"></script>
|
13 |
+
<script>
|
14 |
+
window.dataLayer = window.dataLayer || [];
|
15 |
+
|
16 |
+
function gtag() {
|
17 |
+
dataLayer.push(arguments);
|
18 |
+
}
|
19 |
+
|
20 |
+
gtag('js', new Date());
|
21 |
+
|
22 |
+
gtag('config', 'G-PYVRSFMDRL');
|
23 |
+
</script>
|
24 |
+
|
25 |
+
<link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro"
|
26 |
+
rel="stylesheet">
|
27 |
+
|
28 |
+
<link rel="stylesheet" href="./static/css/bulma.min.css">
|
29 |
+
<link rel="stylesheet" href="./static/css/bulma-carousel.min.css">
|
30 |
+
<link rel="stylesheet" href="./static/css/bulma-slider.min.css">
|
31 |
+
<link rel="stylesheet" href="./static/css/fontawesome.all.min.css">
|
32 |
+
<link rel="stylesheet"
|
33 |
+
href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
|
34 |
+
<link rel="stylesheet" href="./static/css/index.css">
|
35 |
+
<link rel="icon" href="./static/images/favicon.svg">
|
36 |
+
|
37 |
+
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
|
38 |
+
<script defer src="./static/js/fontawesome.all.min.js"></script>
|
39 |
+
<script src="./static/js/bulma-carousel.min.js"></script>
|
40 |
+
<script src="./static/js/bulma-slider.min.js"></script>
|
41 |
+
<script src="./static/js/index.js"></script>
|
42 |
+
|
43 |
+
<style>
|
44 |
+
.red-text {
|
45 |
+
color: red;
|
46 |
+
}
|
47 |
+
|
48 |
+
/* Collapsible content - initially hidden */
|
49 |
+
.col_content_1 {
|
50 |
+
padding: 15px;
|
51 |
+
background-color: #f1f1f1;
|
52 |
+
display: none;
|
53 |
+
}
|
54 |
+
.col_content_2 {
|
55 |
+
padding: 15px;
|
56 |
+
background-color: #f1f1f1;
|
57 |
+
display: none;
|
58 |
+
}
|
59 |
+
.col_content_3 {
|
60 |
+
padding: 15px;
|
61 |
+
background-color: #f1f1f1;
|
62 |
+
display: none;
|
63 |
+
}
|
64 |
+
.col_content_4 {
|
65 |
+
padding: 15px;
|
66 |
+
background-color: #f1f1f1;
|
67 |
+
display: none;
|
68 |
+
}
|
69 |
+
.col_content_5 {
|
70 |
+
padding: 15px;
|
71 |
+
background-color: #f1f1f1;
|
72 |
+
display: none;
|
73 |
+
}
|
74 |
+
.col_content_6 {
|
75 |
+
padding: 15px;
|
76 |
+
background-color: #f1f1f1;
|
77 |
+
display: none;
|
78 |
+
}
|
79 |
+
.col_content_7 {
|
80 |
+
padding: 15px;
|
81 |
+
background-color: #f1f1f1;
|
82 |
+
display: none;
|
83 |
+
}
|
84 |
+
.col_content_8 {
|
85 |
+
padding: 15px;
|
86 |
+
background-color: #f1f1f1;
|
87 |
+
display: none;
|
88 |
+
}
|
89 |
+
.col_content_9 {
|
90 |
+
padding: 15px;
|
91 |
+
background-color: #f1f1f1;
|
92 |
+
display: none;
|
93 |
+
}
|
94 |
+
.col_content_10 {
|
95 |
+
padding: 15px;
|
96 |
+
background-color: #f1f1f1;
|
97 |
+
display: none;
|
98 |
+
}
|
99 |
+
.col_content_11 {
|
100 |
+
padding: 15px;
|
101 |
+
background-color: #f1f1f1;
|
102 |
+
display: none;
|
103 |
+
}
|
104 |
+
table {
|
105 |
+
width: 100%;
|
106 |
+
border-collapse: collapse;
|
107 |
+
}
|
108 |
+
table, th, td {
|
109 |
+
border: 1px solid black;
|
110 |
+
}
|
111 |
+
th, td {
|
112 |
+
padding: 8px;
|
113 |
+
text-align: left;
|
114 |
+
}
|
115 |
+
|
116 |
+
.btn {
|
117 |
+
display: flex; /* Use flexbox for layout */
|
118 |
+
justify-content: space-between; /* Space out content on left and right */
|
119 |
+
align-items: center; /* Center content vertically */
|
120 |
+
padding: 10px 20px; /* Add padding to the button */
|
121 |
+
font-size: 18px; /* Text size */
|
122 |
+
background-color: black;
|
123 |
+
color: white;
|
124 |
+
border: none;
|
125 |
+
border-radius: 5px;
|
126 |
+
cursor: pointer;
|
127 |
+
width: 100%; /* Button width (you can adjust this) */
|
128 |
+
}
|
129 |
+
|
130 |
+
/* Style for the + sign */
|
131 |
+
.btn .icon {
|
132 |
+
font-size: 24px; /* Size of the + sign */
|
133 |
+
}
|
134 |
+
|
135 |
+
.btn:hover {
|
136 |
+
background-color: gray; /* Hover effect */
|
137 |
+
}
|
138 |
+
|
139 |
+
/* Container for the slider */
|
140 |
+
.slider-container {
|
141 |
+
width: 80%; /* Set the width of the slider */
|
142 |
+
margin: 0 auto;
|
143 |
+
overflow: hidden;
|
144 |
+
position: relative;
|
145 |
+
}
|
146 |
+
|
147 |
+
/* Slide wrapper that holds all the images */
|
148 |
+
.slider-wrapper {
|
149 |
+
display: flex;
|
150 |
+
transition: transform 0.5s ease-in-out;
|
151 |
+
}
|
152 |
+
|
153 |
+
/* Each image box (b5 box) */
|
154 |
+
.box {
|
155 |
+
|
156 |
+
flex: 0 0 100%; /* Each image takes full width of the container */
|
157 |
+
display: flex;
|
158 |
+
justify-content: center;
|
159 |
+
align-items: center;
|
160 |
+
}
|
161 |
+
|
162 |
+
.box img {
|
163 |
+
width: 90%; /* Make images responsive to fit the container */
|
164 |
+
max-height: 400px; /* Control max height */
|
165 |
+
object-fit: cover; /* Ensure images maintain aspect ratio */
|
166 |
+
}
|
167 |
+
|
168 |
+
/* Navigation buttons (next and previous) */
|
169 |
+
.prev, .next {
|
170 |
+
position: absolute;
|
171 |
+
top: 50%;
|
172 |
+
transform: translateY(-50%);
|
173 |
+
background-color: rgba(0, 0, 0, 0.5);
|
174 |
+
color: white;
|
175 |
+
border: none;
|
176 |
+
padding: 10px;
|
177 |
+
cursor: pointer;
|
178 |
+
}
|
179 |
+
|
180 |
+
.prev {
|
181 |
+
left: 10px;
|
182 |
+
}
|
183 |
+
|
184 |
+
.next {
|
185 |
+
right: 10px;
|
186 |
+
}
|
187 |
+
|
188 |
+
* {box-sizing: border-box;}
|
189 |
+
body {font-family: Verdana, sans-serif;}
|
190 |
+
.mySlides {display: none;}
|
191 |
+
img {vertical-align: middle;}
|
192 |
+
|
193 |
+
/* Slideshow container */
|
194 |
+
.slideshow-container {
|
195 |
+
max-width: 1000px;
|
196 |
+
position: relative;
|
197 |
+
margin: auto;
|
198 |
+
}
|
199 |
+
|
200 |
+
/* Caption text */
|
201 |
+
.text {
|
202 |
+
color: #f2f2f2;
|
203 |
+
font-size: 15px;
|
204 |
+
padding: 8px 12px;
|
205 |
+
position: absolute;
|
206 |
+
bottom: 8px;
|
207 |
+
width: 100%;
|
208 |
+
text-align: center;
|
209 |
+
}
|
210 |
+
|
211 |
+
/* Number text (1/3 etc) */
|
212 |
+
.numbertext {
|
213 |
+
color: #f2f2f2;
|
214 |
+
font-size: 12px;
|
215 |
+
padding: 8px 12px;
|
216 |
+
position: absolute;
|
217 |
+
top: 0;
|
218 |
+
}
|
219 |
+
|
220 |
+
/* The dots/bullets/indicators */
|
221 |
+
.dot {
|
222 |
+
height: 15px;
|
223 |
+
width: 15px;
|
224 |
+
margin: 0 2px;
|
225 |
+
background-color: #bbb;
|
226 |
+
border-radius: 50%;
|
227 |
+
display: inline-block;
|
228 |
+
transition: background-color 0.6s ease;
|
229 |
+
}
|
230 |
+
|
231 |
+
.active {
|
232 |
+
background-color: #717171;
|
233 |
+
}
|
234 |
+
|
235 |
+
/* Fading animation */
|
236 |
+
.fade {
|
237 |
+
animation-name: fade;
|
238 |
+
animation-duration: 1.5s;
|
239 |
+
}
|
240 |
+
|
241 |
+
@keyframes fade {
|
242 |
+
from {opacity: .4}
|
243 |
+
to {opacity: 1}
|
244 |
+
}
|
245 |
+
|
246 |
+
/* On smaller screens, decrease text size */
|
247 |
+
@media only screen and (max-width: 300px) {
|
248 |
+
.text {font-size: 11px}
|
249 |
+
}
|
250 |
+
|
251 |
+
|
252 |
+
</style>
|
253 |
+
|
254 |
+
</head>
|
255 |
+
<body>
|
256 |
+
|
257 |
+
<section class="hero">
|
258 |
+
<div class="hero-body">
|
259 |
+
<div class="container is-max-desktop">
|
260 |
+
<div class="columns is-centered">
|
261 |
+
<div class="column has-text-centered">
|
262 |
+
<h1 class="title is-1 publication-title">🧙 PromptWizard<br><p style="white-space: nowrap;">Task-Aware Prompt Optimization Framework</p></h1>
|
263 |
+
<div class="is-size-5 publication-authors">
|
264 |
+
<span class="author-block">
|
265 |
+
<a>Eshaan Agarwal</a>,</span>
|
266 |
+
<span class="author-block">
|
267 |
+
<a>Joykirat Singh</a>,</span>
|
268 |
+
<span class="author-block">
|
269 |
+
<a>Vivek Dani</a>,
|
270 |
+
</span>
|
271 |
+
<span class="author-block">
|
272 |
+
<a>Raghav Magazine</a>,
|
273 |
+
</span>
|
274 |
+
<span class="author-block">
|
275 |
+
<a>Tanuja Ganu</a>,
|
276 |
+
</span>
|
277 |
+
<span class="author-block">
|
278 |
+
<a>Akshay Nambi</a>
|
279 |
+
</span>
|
280 |
+
</div>
|
281 |
+
|
282 |
+
<div class="is-size-5 publication-authors">
|
283 |
+
<span class="author-block">Microsoft Research</span>
|
284 |
+
</div>
|
285 |
+
|
286 |
+
<div class="column has-text-centered">
|
287 |
+
<div class="publication-links">
|
288 |
+
<!-- PDF Link. -->
|
289 |
+
<span class="link-block">
|
290 |
+
<a href="https://arxiv.org/pdf/2405.18369"
|
291 |
+
class="external-link button is-normal is-rounded is-dark">
|
292 |
+
<span class="icon">
|
293 |
+
<i class="fas fa-file-pdf"></i>
|
294 |
+
</span>
|
295 |
+
<span>Paper</span>
|
296 |
+
</a>
|
297 |
+
</span>
|
298 |
+
<span class="link-block">
|
299 |
+
<a href="https://arxiv.org/abs/2405.18369"
|
300 |
+
class="external-link button is-normal is-rounded is-dark">
|
301 |
+
<span class="icon">
|
302 |
+
<i class="ai ai-arxiv"></i>
|
303 |
+
</span>
|
304 |
+
<span>arXiv</span>
|
305 |
+
</a>
|
306 |
+
</span>
|
307 |
+
<!-- Code Link. -->
|
308 |
+
<span class="link-block">
|
309 |
+
<a href="https://github.com/microsoft/PromptWizard"
|
310 |
+
class="external-link button is-normal is-rounded is-dark">
|
311 |
+
<span class="icon">
|
312 |
+
<i class="fab fa-github"></i>
|
313 |
+
</span>
|
314 |
+
<span>Code</span>
|
315 |
+
</a>
|
316 |
+
</span>
|
317 |
+
|
318 |
+
</div>
|
319 |
+
|
320 |
+
</div>
|
321 |
+
</div>
|
322 |
+
</div>
|
323 |
+
</div>
|
324 |
+
</div>
|
325 |
+
</section>
|
326 |
+
|
327 |
+
<div class="slideshow-container">
|
328 |
+
|
329 |
+
<div class="mySlides fade">
|
330 |
+
<div class="numbertext">1 / 3</div>
|
331 |
+
<p align="center">
|
332 |
+
<img src="images/overview.png">
|
333 |
+
</p>
|
334 |
+
</div>
|
335 |
+
|
336 |
+
<div class="mySlides fade">
|
337 |
+
<div class="numbertext">2 / 3</div>
|
338 |
+
<p align="center">
|
339 |
+
<img width="700" height="700" src="images/iterative_flowchart-1.png">
|
340 |
+
</p>
|
341 |
+
</div>
|
342 |
+
|
343 |
+
<div class="mySlides fade">
|
344 |
+
<div class="numbertext">3 / 3</div>
|
345 |
+
<p align="center">
|
346 |
+
<img width="700" height="700" src="images/sequential_flowchart-1.png">
|
347 |
+
</p>
|
348 |
+
</p>
|
349 |
+
</div>
|
350 |
+
|
351 |
+
</div>
|
352 |
+
<br>
|
353 |
+
|
354 |
+
<div style="text-align:center">
|
355 |
+
<span class="dot"></span>
|
356 |
+
<span class="dot"></span>
|
357 |
+
<span class="dot"></span>
|
358 |
+
</div>
|
359 |
+
|
360 |
+
<script>
|
361 |
+
let slideIndex = 0;
|
362 |
+
showSlides();
|
363 |
+
|
364 |
+
function showSlides() {
|
365 |
+
let i;
|
366 |
+
let slides = document.getElementsByClassName("mySlides");
|
367 |
+
let dots = document.getElementsByClassName("dot");
|
368 |
+
for (i = 0; i < slides.length; i++) {
|
369 |
+
slides[i].style.display = "none";
|
370 |
+
}
|
371 |
+
slideIndex++;
|
372 |
+
if (slideIndex > slides.length) {slideIndex = 1}
|
373 |
+
for (i = 0; i < dots.length; i++) {
|
374 |
+
dots[i].className = dots[i].className.replace(" active", "");
|
375 |
+
}
|
376 |
+
slides[slideIndex-1].style.display = "block";
|
377 |
+
dots[slideIndex-1].className += " active";
|
378 |
+
setTimeout(showSlides, 2000); // Change image every 2 seconds
|
379 |
+
}
|
380 |
+
</script>
|
381 |
+
|
382 |
+
|
383 |
+
|
384 |
+
<section class="section">
|
385 |
+
<div class="container is-max-desktop">
|
386 |
+
<div class="columns is-centered has-text-centered">
|
387 |
+
<div class="column is-four-fifths">
|
388 |
+
<div class="content has-text-justified">
|
389 |
+
<b>PromptWizard</b> is an open source framework for automated prompt and example optimization, leveraging a feedback-driven critique and synthesis process to balance exploration and exploitation. It consistently outperforms state-of-the-art methods while significantly reducing computational costs, enabling efficient and scalable prompt engineering across diverse tasks and LLMs.
|
390 |
+
</div>
|
391 |
+
</div>
|
392 |
+
</div>
|
393 |
+
|
394 |
+
</section>
|
395 |
+
|
396 |
+
<section class="section">
|
397 |
+
<div class="container is-max-desktop">
|
398 |
+
<div class="columns is-centered has-text-centered">
|
399 |
+
<div class="column is-four-fifths">
|
400 |
+
<h2 class="title is-3">Overview</h2>
|
401 |
+
<div class="content has-text-justified">
|
402 |
+
Large language models (LLMs) like GPT-4 have achieved remarkable performance across diverse tasks. At the core of this success is prompting—the process of providing input instructions to guide models toward desired outputs. Studies have shown that prompting significantly influences LLM performance, making prompt engineering—the design and refinement of prompts—critical for maximizing accuracy. However, crafting effective prompts remains a labor-intensive and domain-specific task, requiring human expertise and subjective judgment. As models evolve and tasks vary, the need to repeatedly design prompts raises an important question: <br> <b>Can prompt engineering be automated to streamline this process and enhance scalability? </b>
|
403 |
+
</div>
|
404 |
+
</div>
|
405 |
+
</div>
|
406 |
+
|
407 |
+
</section>
|
408 |
+
|
409 |
+
|
410 |
+
<section class="section">
|
411 |
+
<div class="container is-max-desktop">
|
412 |
+
<!-- Motivation. -->
|
413 |
+
<div class="columns is-centered has-text-centered">
|
414 |
+
<div class="column is-four-fifths">
|
415 |
+
<h2 class="title is-3">Motivation</h2>
|
416 |
+
<div class="content has-text-justified">
|
417 |
+
|
418 |
+
<h3>Prompting is central to LLMs!</h3>
|
419 |
+
<ul>
|
420 |
+
<li><b>Prompting</b>: The process of providing input instructions to guide models towards desired output</li>
|
421 |
+
<li><b>Prompt Engineering</b>: The process of designing and refining of prompts</li>
|
422 |
+
<li>Crafting effective prompts is challenging because:</li>
|
423 |
+
<ol>
|
424 |
+
<li>The task is labor-intensive</li>
|
425 |
+
<li>Prompts need to be domain-specific to work effectively</li>
|
426 |
+
<li>It often requires human expertise and is subjective</li>
|
427 |
+
<li>Also as models and tasks evolve, there is a need for repeated design</li>
|
428 |
+
</ol>
|
429 |
+
</ul>
|
430 |
+
|
431 |
+
</div>
|
432 |
+
</div>
|
433 |
+
</div>
|
434 |
+
<!--/ Motivation. -->
|
435 |
+
|
436 |
+
</section>
|
437 |
+
|
438 |
+
|
439 |
+
|
440 |
+
|
441 |
+
|
442 |
+
|
443 |
+
|
444 |
+
<section class="section">
|
445 |
+
<div class="container is-max-desktop">
|
446 |
+
<!-- Abstract. -->
|
447 |
+
<div class="columns is-centered has-text-centered">
|
448 |
+
<div class="column is-four-fifths">
|
449 |
+
<h2 class="title is-3">PromptWizard Working </h2>
|
450 |
+
<div class="content has-text-justified">
|
451 |
+
<p>
|
452 |
+
PromptWizard (PW) is a discrete prompt optimization framework that employs a self-evolving mechanism where the LLM generates, critiques, and refines its own prompts and examples, continuously improving through iterative feedback and synthesis. This self-adaptive approach ensures holistic optimization by evolving both the instructions and in-context learning examples for better task performance.
|
453 |
+
</p>
|
454 |
+
<h3>Three Key Insights:</h3>
|
455 |
+
<p>
|
456 |
+
<ol>
|
457 |
+
<li><b>Feedback-driven Refinement</b>: LLM generates, critiques, and refines its own prompts and examples, continuously improving through iterative feedback and synthesis
|
458 |
+
</li>
|
459 |
+
<li><b>Critique and Synthesize diverse examples</b>: Generates synthetic examples that are robust, diverse, and task-aware, and optimizes both the prompt and the examples in tandem
|
460 |
+
</li>
|
461 |
+
<li><b>Self generated Chain of Thought (CoT)</b> steps with combination of positive, negative and synthetic examples</li>
|
462 |
+
</ol>
|
463 |
+
|
464 |
+
<p>
|
465 |
+
Following are the details of each step:
|
466 |
+
</p>
|
467 |
+
|
468 |
+
|
469 |
+
<button class="btn" onclick="toggleContent(this,'1')">1. Feedback driven Refinement <span class="icon">+</span></button>
|
470 |
+
<div class="col_content_1">
|
471 |
+
<ul>
|
472 |
+
<li>PromptWizard uses a systematic, feedback-driven process that incorporates a critique component whose feedback guides the refinement of the prompt over multiple iterations (a schematic of this loop follows this panel)</li>
|
473 |
+
<li>The following steps carry this out systematically:</li>
|
474 |
+
<ul>
|
475 |
+
<li><b>Mutate</b>: Takes an initial problem description + thinking Styles to generate prompts</li>
|
476 |
+
<li><b>Scoring</b>: Evaluates the performance of the generated prompts to determine the best prompt</li>
|
477 |
+
<li><b>Critique</b>: Reviews where the prompt succeeded and failed by analyzing cases where the LLM struggled</li>
|
478 |
+
<li><b>Synthesize</b>: Uses critique’s feedback to refine the best prompt</li>
|
479 |
+
</ul>
|
480 |
+
</li>
|
481 |
+
</ul>
|
482 |
+
</div>
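In code, the Mutate → Score → Critique → Synthesize loop in this panel might look roughly like the following schematic (not PromptWizard's actual implementation; all four helpers are hypothetical LLM-backed functions):

```python
def refine_prompt(task_description, thinking_styles, train_batch,
                  mutate, score, critique, synthesize, iterations=3):
    # Mutate: problem description + thinking styles -> candidate prompts.
    candidates = mutate(task_description, thinking_styles)
    # Scoring: keep the best-performing candidate on the training batch.
    best = max(candidates, key=lambda p: score(p, train_batch))
    for _ in range(iterations):
        feedback = critique(best, train_batch)   # where did the prompt fail?
        best = synthesize(best, feedback)        # refine using that feedback
    return best
```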
|
483 |
+
|
484 |
+
<script>
|
485 |
+
// Function to toggle the visibility of the collapsible content
|
486 |
+
function toggleContent(button,index) {
|
487 |
+
var content = document.querySelector(".col_content_"+index);
|
488 |
+
const icon = button.querySelector('.icon');
|
489 |
+
if (content.style.display === "block") {
|
490 |
+
content.style.display = "none"; // Hide content if it's visible
|
491 |
+
icon.textContent = icon.textContent.replace('-', '+');
|
492 |
+
} else {
|
493 |
+
content.style.display = "block"; // Show content if it's hidden
|
494 |
+
icon.textContent = icon.textContent.replace('+', '-');
|
495 |
+
}
|
496 |
+
}
|
497 |
+
</script>
|
498 |
+
<br>
|
499 |
+
<button class="btn" onclick="toggleContent(this,'2')">2. Critique and Synthesize diverse examples <span class="icon">+</span></button>
|
500 |
+
<div class="col_content_2">
|
501 |
+
<ul>
|
502 |
+
<li>PromptWizard improves both prompt instructions and few-shot examples in tandem</li>
|
503 |
+
<li>It uses self-reflection to synthesize examples that are diverse and task-relevant </li>
|
504 |
+
<li>An iterative feedback loop is used that continuously refines both the prompt and few-shot examples</li>
|
505 |
+
<li>Few shot example optimization:</li>
|
506 |
+
<ul>
|
507 |
+
<li><b>Critique</b>: Analyzes previously selected examples and use the feedback to determine how examples should evolve</li>
|
508 |
+
<li><b>Synthesize</b>: Incorporates feedback to generate new synthetic examples that are more diverse, robust, and task-relevant</li>
|
509 |
+
</ul>
|
510 |
+
<li>Prompt instruction optimization:</li>
|
511 |
+
<ul>
|
512 |
+
<li><b>Critique</b>: Identifies weaknesses and gaps that require addressing to further refine the prompt instruction</li>
|
513 |
+
<li><b>Synthesize</b>: Leverages feedback from the critique to synthesize and refine the prompt instruction</li>
|
514 |
+
</ul>
|
515 |
+
</ul>
|
516 |
+
|
517 |
+
</div>
|
518 |
+
<br>
|
519 |
+
<button class="btn" onclick="toggleContent(this,'3')">3. Chain of Thought Reasoning <span class="icon">+</span></button>
|
520 |
+
<div class="col_content_3">
|
521 |
+
<p>
|
522 |
+
<ul>
|
523 |
+
<li>Incorporating chain-of-thought (CoT) reasoning improves problem-solving abilities of the model</li>
|
524 |
+
<li>CoT Reasoning takes the selected few-shot examples and generates a detailed reasoning chain for each example to facilitate problem-solving</li>
|
525 |
+
<li>An LLM is used to check the coherence and relevance of examples</li>
|
526 |
+
</ul>
|
527 |
+
</p>
|
528 |
+
</div>
|
529 |
+
</p>
|
530 |
+
</div>
|
531 |
+
</div>
|
532 |
+
</div>
|
533 |
+
<!--/ Abstract. -->
|
534 |
+
|
535 |
+
</section>
|
536 |
+
|
537 |
+
<section class="section">
|
538 |
+
<div class="container is-max-desktop">
|
539 |
+
<!-- Results. -->
|
540 |
+
<div class="columns is-centered has-text-centered">
|
541 |
+
<div class="column is-four-fifths">
|
542 |
+
<h2 class="title is-3">Results</h2>
|
543 |
+
<div class="content has-text-justified">
|
544 |
+
|
545 |
+
<button class="btn" onclick="toggleContent(this,'4')">Instruction Induction Dataset<span class="icon">+</span></button>
|
546 |
+
<div class="col_content_4">
|
547 |
+
<p align="center">
|
548 |
+
<img src="./images/comaprision.png" >
|
549 |
+
</p>
|
550 |
+
<p align="center"><b>PromptWizard outperforms the baselines, achieving the highest accuracy on <b class="red-text">13/19 tasks (68%)</b> with 0-shot and <b class="red-text">16/19 (84%)</b> with 1-shot</b></p>
|
551 |
+
<p align="center">
|
552 |
+
<img src="./images/ppc.png" >
|
553 |
+
</p>
|
554 |
+
<p align="center"><b>PromptWizard consistently performs near the best possible accuracy across all tasks</b></p>
|
555 |
+
<p align="center">
|
556 |
+
<img src="./images/cost_analysis.png" >
|
557 |
+
</p>
|
558 |
+
<p align="center"><b>PromptWizard costs just $0.05 per task, <b class="red-text">5-60x reduction</b> in overall tokens/cost</b></p>
|
559 |
+
</div>
|
560 |
+
</div>
|
561 |
+
|
562 |
+
<button class="btn" onclick="toggleContent(this,'5')">Arithmetic Tasks<span class="icon">+</span></button>
|
563 |
+
<div class="col_content_5">
|
564 |
+
<p align="center">
|
565 |
+
<img src="./images/arithmetic_task.png" >
|
566 |
+
</p>
|
567 |
+
</div>
|
568 |
+
<br>
|
569 |
+
<button class="btn" onclick="toggleContent(this,'7')">Big Bench Hard<span class="icon">+</span></button>
|
570 |
+
<div class="col_content_7">
|
571 |
+
<p align="center">
|
572 |
+
<img src="./images/bigbench.png" >
|
573 |
+
</p>
|
574 |
+
</div>
|
575 |
+
<br>
|
576 |
+
<button class="btn" onclick="toggleContent(this,'8')">Prompts Using SLMs<span class="icon">+</span></button>
|
577 |
+
<div class="col_content_8">
|
578 |
+
<p align="center">
|
579 |
+
<img src="./images/slm_prompt.png" >
|
580 |
+
</p>
|
581 |
+
<p align="center"><b>PromptWizard using Llama-70B show a negligible <b b class="red-text">< 1% drop</b> in accuracy</b> </p>
|
582 |
+
</div>
|
583 |
+
<br>
|
584 |
+
<button class="btn" onclick="toggleContent(this,'9')">Varying the In-Context Examples<span class="icon">+</span></button>
|
585 |
+
<div class="col_content_9">
|
586 |
+
<p align="center">
|
587 |
+
<img src="./images/icl_results.png" >
|
588 |
+
</p>
|
589 |
+
<p align="center"><b>PromptWizard shows strong resilience even with fewer training samples mainly due to synthetic example generation and reasoning chains</b></p>
|
590 |
+
</div>
|
591 |
+
<br>
|
592 |
+
<button class="btn" onclick="toggleContent(this,'10')">Comparision with naive prompting<span class="icon">+</span></button>
|
593 |
+
<div class="col_content_10">
|
594 |
+
<p align="center">
|
595 |
+
<img src="./images/prompting.png" >
|
596 |
+
</p>
|
597 |
+
<p align="center"><b>Substantial performance improvements across all models when optimized prompts are generated by PromptWizard on GSM8k dataset</b></p>
|
598 |
+
</div>
|
599 |
+
<br>
<button class="btn" onclick="toggleContent(this,'11')">Comparison with Feedback-Based and Other Prompt Optimization Techniques<span class="icon">+</span></button>
<div class="col_content_11">
<div align="center">
<table>
<tr>
<td>Dataset</td>
<td colspan="4">Accuracy (higher is better)</td>
</tr>
<tr>
<td></td>
<td>DSPy</td>
<td>PromptAgent</td>
<td>APO</td>
<td>PW</td>
</tr>
<tr>
<td>GSM8k</td>
<td>78.2</td>
<td>68.84</td>
<td>25.67</td>
<td><b>90</b></td>
</tr>
<tr>
<td>AQUARAT</td>
<td>55.1</td>
<td>56.67</td>
<td>20.12</td>
<td><b>58.2</b></td>
</tr>
<tr>
<td>SVAMP</td>
<td>77</td>
<td>78.67</td>
<td>75.25</td>
<td><b>82.3</b></td>
</tr>
<tr>
<td>ETHOS</td>
<td>84.1</td>
<td>84.25</td>
<td>80.62</td>
<td><b>89.4</b></td>
</tr>
</table>
<br>
<table>
<tr>
<td>Dataset</td>
<td colspan="4">API calls (lower is better)</td>
</tr>
<tr>
<td></td>
<td>DSPy</td>
<td>PromptAgent</td>
<td>APO</td>
<td>PW</td>
</tr>
<tr>
<td>GSM8k</td>
<td>915</td>
<td>2115</td>
<td>8490</td>
<td><b>147</b></td>
</tr>
<tr>
<td>AQUARAT</td>
<td>920</td>
<td>2200</td>
<td>8500</td>
<td><b>112</b></td>
</tr>
<tr>
<td>SVAMP</td>
<td>2300</td>
<td>2111</td>
<td>8000</td>
<td><b>178</b></td>
</tr>
<tr>
<td>ETHOS</td>
<td>660</td>
<td>2217</td>
<td>8200</td>
<td><b>80</b></td>
</tr>
</table>
<br>
<table>
<tr>
<td>Dataset</td>
<td colspan="4">Tokens (lower is better)</td>
</tr>
<tr>
<td></td>
<td>DSPy</td>
<td>PromptAgent</td>
<td>APO</td>
<td>PW</td>
</tr>
<tr>
<td>GSM8k</td>
<td>262</td>
<td>500</td>
<td><b>109</b></td>
<td>237</td>
</tr>
<tr>
<td>AQUARAT</td>
<td>326</td>
<td>875</td>
<td><b>125</b></td>
<td>200</td>
</tr>
<tr>
<td>SVAMP</td>
<td>189</td>
<td>680</td>
<td><b>85</b></td>
<td>127</td>
</tr>
<tr>
<td>ETHOS</td>
<td>175</td>
<td>417</td>
<td><b>55</b></td>
<td>190</td>
</tr>
</table>
</div>
<br>
<p align="center"><b>PromptWizard outperforms feedback-based methods such as APO and PromptAgent, as well as other prompt optimization techniques such as DSPy, in both accuracy and the number of API calls required for optimization across datasets.</b></p>
</div>
</div>
</div>
</div>
</div>
<!--/ Results. -->

</section>


<section class="section" id="BibTeX">
|
743 |
+
<div class="container is-max-desktop content">
|
744 |
+
<h2 class="title">BibTeX</h2>
|
745 |
+
<pre><code>@misc{agarwal2024promptwizardtaskawarepromptoptimization,
|
746 |
+
title={PromptWizard: Task-Aware Prompt Optimization Framework},
|
747 |
+
author={Eshaan Agarwal and Joykirat Singh and Vivek Dani and Raghav Magazine and Tanuja Ganu and Akshay Nambi},
|
748 |
+
year={2024},
|
749 |
+
eprint={2405.18369},
|
750 |
+
archivePrefix={arXiv},
|
751 |
+
primaryClass={cs.CL},
|
752 |
+
url={https://arxiv.org/abs/2405.18369},
|
753 |
+
}</code></pre>
|
754 |
+
</div>
|
755 |
+
</section>
|
756 |
+
|
757 |
+
|
758 |
+
<footer class="footer">
|
759 |
+
<div class="container">
|
760 |
+
<div class="content has-text-centered">
|
761 |
+
</div>
|
762 |
+
<div class="columns is-centered">
|
763 |
+
<div class="column is-8">
|
764 |
+
<div class="content">
|
765 |
+
<p>
|
766 |
+
This website is licensed under a <a rel="license"
|
767 |
+
href="http://creativecommons.org/licenses/by-sa/4.0/">Creative
|
768 |
+
Commons Attribution-ShareAlike 4.0 International License</a>.
|
769 |
+
</p>
|
770 |
+
<p>
|
771 |
+
This means you are free to borrow the <a
|
772 |
+
href="https://github.com/nerfies/nerfies.github.io">source code</a> of this website,
|
773 |
+
we just ask that you link back to this page in the footer.
|
774 |
+
Please remember to remove the analytics code included in the header of the website which
|
775 |
+
you do not want on your website.
|
776 |
+
</p>
|
777 |
+
</div>
|
778 |
+
</div>
|
779 |
+
</div>
|
780 |
+
</div>
|
781 |
+
</footer>
|
782 |
+
|
783 |
+
</body>
|
784 |
+
</html>
|