linktimecloud committed on
Commit a228dd5 · verified · 1 Parent(s): 9bfb099

Upload folder using huggingface_hub

Files changed (7)
  1. .env.tpl +6 -0
  2. .gitignore +164 -0
  3. LICENSE +21 -0
  4. README.md +194 -7
  5. ask.py +618 -0
  6. instructions/links.txt +3 -0
  7. requirements.txt +9 -0
.env.tpl ADDED
@@ -0,0 +1,6 @@
1
+ # right now we use Google search API
2
+ SEARCH_API_KEY=your-google-search-api-key
3
+ SEARCH_PROJECT_KEY=your-google-cx-key
4
+
5
+ # right now we use OpenAI API
6
+ LLM_API_KEY=your-openai-api-key
.gitignore ADDED
@@ -0,0 +1,164 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110
+ .pdm.toml
111
+ .pdm-python
112
+ .pdm-build/
113
+
114
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115
+ __pypackages__/
116
+
117
+ # Celery stuff
118
+ celerybeat-schedule
119
+ celerybeat.pid
120
+
121
+ # SageMath parsed files
122
+ *.sage.py
123
+
124
+ # Environments
125
+ .env
126
+ .venv
127
+ env/
128
+ venv/
129
+ ENV/
130
+ env.bak/
131
+ venv.bak/
132
+
133
+ # Spyder project settings
134
+ .spyderproject
135
+ .spyproject
136
+
137
+ # Rope project settings
138
+ .ropeproject
139
+
140
+ # mkdocs documentation
141
+ /site
142
+
143
+ # mypy
144
+ .mypy_cache/
145
+ .dmypy.json
146
+ dmypy.json
147
+
148
+ # Pyre type checker
149
+ .pyre/
150
+
151
+ # pytype static type analyzer
152
+ .pytype/
153
+
154
+ # Cython debug symbols
155
+ cython_debug/
156
+
157
+ # PyCharm
158
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
161
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
162
+ #.idea/
163
+
164
+ .gradio
LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 pengfeng
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md CHANGED
@@ -1,12 +1,199 @@
1
  ---
2
- title: Ask.py
3
- emoji: 🏃
4
- colorFrom: red
5
- colorTo: pink
6
  sdk: gradio
7
  sdk_version: 5.3.0
8
- app_file: app.py
9
- pinned: false
10
  ---
 
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
  ---
2
+ title: ask.py
3
+ app_file: ask.py
 
 
4
  sdk: gradio
5
  sdk_version: 5.3.0
 
 
6
  ---
7
+ # ask.py
8
 
9
+ [![License](https://img.shields.io/github/license/pengfeng/ask.py)](LICENSE)
10
+
11
+ A single Python program to implement the search-extract-summarize flow, similar to AI search
12
+ engines such as Perplexity.
13
+
14
+ > [!NOTE]
15
+ > Our main goal is to illustrate the basic concepts of AI search engines using raw constructs.
16
+ > Performance and scalability are out of scope for this program.
17
+
18
+ ## The search-extract-summarize flow
19
+
20
+ Given a query, the program will (see the code sketch after this list):
21
+
22
+ - search Google for the top 10 web pages
23
+ - crawl and scrape the pages for their text content
24
+ - split the text content into chunks and save them into a vector DB
25
+ - perform a vector search with the query and find the top 10 matched chunks
26
+ - use the top 10 chunks as the context to ask an LLM to generate the answer
27
+ - output the answer with the references
28
+
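The ask.py file added later in this commit implements each of these steps as a method on its `Ask` class. The condensed sketch below shows roughly how they fit together, mirroring the `_run_query` helper in ask.py; logging and CLI handling are omitted, and the query/option variables are assumed to be supplied by the caller.

```python
# Condensed sketch of the flow implemented in ask.py (see _run_query); the
# API keys are read from the environment when Ask() is constructed.
ask = Ask()
links = ask.search_web(query, date_restrict, target_site)       # top-10 Google results
scrape_results = ask.scrape_urls(links)                          # {url: page text}
chunking_results = ask.chunk_results(scrape_results, 1000, 100)  # {url: [chunks]}, same size/overlap as ask.py
ask.save_to_db(chunking_results)                                 # embed the chunks and store them in DuckDB
matched_chunks = ask.vector_search(query)                        # top-10 most similar chunks
answer = ask.run_inference(
    query=query,
    model_name=model_name,
    matched_chunks=matched_chunks,
    output_language=output_language,
    output_length=output_length,
)
```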
29
+ Of course, this flow is a greatly simplified version of what real AI search engines do, but it is
30
+ a good starting point for understanding the basic concepts.
31
+
32
+ One benefit is that we can easily customize the search behavior and the output format.
33
+
34
+ For example, we can:
35
+
36
+ - search with date-restrict to only retrieve the latest information.
37
+ - search within a target site so that the answer is generated only from its contents.
38
+ - ask the LLM to answer in a specific language.
39
+ - ask the LLM to answer with a specific length.
40
+ - crawl a specific list of URLs and answer based on their contents only (see the sample command after this list).
41
+
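As an illustration, several of these options can be combined in a single run; the command below is a hypothetical example built from the flags documented in the Quick start section (the query text and values are placeholders).

```bash
# Restrict the search to openai.com pages from the last day and ask for
# an answer of roughly 200 words in English.
python ask.py -q "What is OpenAI Swarm?" \
    -d 1 -s openai.com \
    --output-language English --output-length 200
```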
42
+ ## Quick start
43
+
44
+ ```bash
45
+
46
+ pip install -r requirements.txt
47
+
48
+ # modify .env file to set the API keys or export them as environment variables as below
49
+
50
+ # right now we use Google search API
51
+ export SEARCH_API_KEY="your-google-search-api-key"
52
+ export SEARCH_PROJECT_KEY="your-google-cx-key"
53
+
54
+ # right now we use OpenAI API
55
+ export LLM_API_KEY="your-openai-api-key"
56
+
57
+ # run the program
58
+ python ask.py -q "What is an LLM agent?"
59
+
60
+ # we can specify more parameters to control the behavior such as date_restrict and target_site
61
+ python ask.py --help
62
+ Usage: ask.py [OPTIONS]
63
+
64
+ Search web for the query and summarize the results
65
+
66
+ Options:
67
+ -q, --query TEXT Query to search [required]
68
+ --url-list TEXT Instead of doing web search, scrape the
69
+ target URL list and answer the query based
70
+ on the content [default:
71
+ instructions/links.txt]
72
+ -d, --date-restrict INTEGER Restrict search results to a specific date
73
+ range, default is no restriction
74
+ -s, --target-site TEXT Restrict search results to a specific site,
75
+ default is no restriction
76
+ --output-language TEXT Output language for the answer
77
+ --output-length INTEGER Output length for the answer
78
+ -m, --model-name TEXT Model name to use for inference
79
+ -l, --log-level [DEBUG|INFO|WARNING|ERROR]
80
+ Set the logging level [default: INFO]
81
+ --help Show this message and exit.
82
+ ```
83
+
84
+ ## Libraries and APIs used
85
+
86
+ - [Google Search API](https://developers.google.com/custom-search/v1/overview)
87
+ - [OpenAI API](https://beta.openai.com/docs/api-reference/completions/create)
88
+ - [Jinja2](https://jinja.palletsprojects.com/en/3.0.x/)
89
+ - [bs4](https://www.crummy.com/software/BeautifulSoup/bs4/doc/)
90
+ - [duckdb](https://github.com/duckdb/duckdb)
91
+
92
+ ## Sample output
93
+
94
+ ### General Search
95
+
96
+ ```
97
+ % python ask.py -q "Why do we need agentic RAG even if we have ChatGPT?"
98
+
99
+ ✅ Found 10 links for query: Why do we need agentic RAG even if we have ChatGPT?
100
+ ✅ Scraping the URLs ...
101
+ ✅ Scraped 10 URLs ...
102
+ ✅ Chunking the text ...
103
+ ✅ Saving to vector DB ...
104
+ ✅ Querying the vector DB ...
105
+ ✅ Running inference with context ...
106
+
107
+ # Answer
108
+
109
+ Agentic RAG (Retrieval-Augmented Generation) is needed alongside ChatGPT for several reasons:
110
+
111
+ 1. **Precision and Contextual Relevance**: While ChatGPT offers generative responses, it may not
112
+ reliably provide precise answers, especially when specific, accurate information is critical[5].
113
+ Agentic RAG enhances this by integrating retrieval mechanisms that improve response context and
114
+ accuracy, allowing users to access the most relevant and recent data without the need for costly
115
+ model fine-tuning[2].
116
+
117
+ 2. **Customizability**: RAG allows businesses to create tailored chatbots that can securely
118
+ reference company-specific data[2]. In contrast, ChatGPT’s broader capabilities may not be
119
+ directly suited for specialized, domain-specific questions without comprehensive customization[3].
120
+
121
+ 3. **Complex Query Handling**: RAG can be optimized for complex queries and can be adjusted to
122
+ work better with specific types of inputs, such as comparing and contrasting information, a task
123
+ where ChatGPT may struggle under certain circumstances[9]. This level of customization can lead to
124
+ better performance in niche applications where precise retrieval of information is crucial.
125
+
126
+ 4. **Asynchronous Processing Capabilities**: Future agentic systems aim to integrate asynchronous
127
+ handling of actions, allowing for parallel processing and reducing wait times for retrieval and
128
+ computation, which is a limitation in the current form of ChatGPT[7]. This advancement would enhance
129
+ overall efficiency and responsiveness in conversations.
130
+
131
+ 5. **Incorporating Retrieved Information Effectively**: Using RAG can significantly improve how
132
+ retrieved information is utilized within a conversation. By effectively managing the context and
133
+ relevance of retrieved documents, RAG helps in framing prompts that can guide ChatGPT towards
134
+ delivering more accurate responses[10].
135
+
136
+ In summary, while ChatGPT excels in generating conversational responses, agentic RAG brings
137
+ precision, customization, and efficiency that can significantly enhance the overall conversational
138
+ AI experience.
139
+
140
+ # References
141
+
142
+ [1] https://community.openai.com/t/how-to-use-rag-properly-and-what-types-of-query-it-is-good-at/658204
143
+ [2] https://www.linkedin.com/posts/brianjuliusdc_dax-powerbi-chatgpt-activity-7235953280177041408-wQqq
144
+ [3] https://community.openai.com/t/how-to-use-rag-properly-and-what-types-of-query-it-is-good-at/658204
145
+ [4] https://community.openai.com/t/prompt-engineering-for-rag/621495
146
+ [5] https://www.ben-evans.com/benedictevans/2024/6/8/building-ai-products
147
+ [6] https://community.openai.com/t/prompt-engineering-for-rag/621495
148
+ [7] https://www.linkedin.com/posts/kurtcagle_agentic-rag-personalizing-and-optimizing-activity-7198097129993613312-z7Sm
149
+ [8] https://community.openai.com/t/how-to-use-rag-properly-and-what-types-of-query-it-is-good-at/658204
150
+ [9] https://community.openai.com/t/how-to-use-rag-properly-and-what-types-of-query-it-is-good-at/658204
151
+ [10] https://community.openai.com/t/prompt-engineering-for-rag/621495
152
+ ```
153
+
154
+ ### Only use the latest information from a specific site
155
+
156
+ The following query will only use information from openai.com that was updated in the previous
157
+ day. The behavior is similar to the "site:openai.com" and "date-restrict" search parameters in Google
158
+ search (a sketch of the underlying API request follows the sample output below).
159
+
160
+ ```
161
+ % python ask.py -q "OpenAI Swarm Framework" -d 1 -s openai.com
162
+ ✅ Found 10 links for query: OpenAI Swarm Framework
163
+ ✅ Scraping the URLs ...
164
+ ✅ Scraped 10 URLs ...
165
+ ✅ Chunking the text ...
166
+ ✅ Saving to vector DB ...
167
+ ✅ Querying the vector DB to get context ...
168
+ ✅ Running inference with context ...
169
+
170
+ # Answer
171
+
172
+ OpenAI Swarm Framework is an experimental platform designed for building, orchestrating, and
173
+ deploying multi-agent systems, enabling multiple AI agents to collaborate on complex tasks. It contrasts
174
+ with traditional single-agent models by facilitating agent interaction and coordination, thus enhancing
175
+ efficiency[5][9]. The framework provides developers with a way to orchestrate these agent systems in
176
+ a lightweight manner, leveraging Node.js for scalable applications[1][4].
177
+
178
+ One implementation of this framework is Swarm.js, which serves as a Node.js SDK, allowing users to
179
+ create and manage agents that perform tasks and hand off conversations. Swarm.js is positioned as
180
+ an educational tool, making it accessible for both beginners and experts, although it may still contain
181
+ bugs and is currently lightweight[1][3][7]. This new approach emphasizes multi-agent collaboration and is
182
+ well-suited for back-end development, requiring some programming expertise for effective implementation[9].
183
+
184
+ Overall, OpenAI Swarm facilitates a shift in how AI systems can collaborate, differing from existing
185
+ OpenAI tools by focusing on backend orchestration rather than user-interactive front-end applications[9].
186
+
187
+ # References
188
+
189
+ [1] https://community.openai.com/t/introducing-swarm-js-node-js-implementation-of-openai-swarm/977510
190
+ [2] https://community.openai.com/t/introducing-swarm-js-a-node-js-implementation-of-openai-swarm/977510
191
+ [3] https://community.openai.com/t/introducing-swarm-js-node-js-implementation-of-openai-swarm/977510
192
+ [4] https://community.openai.com/t/introducing-swarm-js-a-node-js-implementation-of-openai-swarm/977510
193
+ [5] https://community.openai.com/t/swarm-some-initial-insights/976602
194
+ [6] https://community.openai.com/t/swarm-some-initial-insights/976602
195
+ [7] https://community.openai.com/t/introducing-swarm-js-node-js-implementation-of-openai-swarm/977510
196
+ [8] https://community.openai.com/t/introducing-swarm-js-a-node-js-implementation-of-openai-swarm/977510
197
+ [9] https://community.openai.com/t/swarm-some-initial-insights/976602
198
+ [10] https://community.openai.com/t/swarm-some-initial-insights/976602
199
+ ```
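Under the hood, both restrictions map onto Google Custom Search API parameters inside ask.py's `search_web`. Below is a minimal sketch of the request it builds for the run above; the key and cx values are placeholders, and only the request construction is shown.

```python
import urllib.parse

import requests

# Sketch of the search request built by ask.py's search_web for "-d 1 -s openai.com".
query = "OpenAI Swarm Framework"
url = (
    "https://www.googleapis.com/customsearch/v1"
    "?key=your-google-search-api-key&cx=your-google-cx-key"
    f"&q={urllib.parse.quote(query)}"
    "&safe=active"
    "&dateRestrict=1"         # from -d 1 (intended to mean: updated within the last day)
    "&siteSearch=openai.com"  # from -s openai.com
    "&siteSearchFilter=i"     # "i" = include only results from the given site
)
items = requests.get(url).json().get("items", [])
links = [item["link"] for item in items]
```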
ask.py ADDED
@@ -0,0 +1,618 @@
1
+ import json
2
+ import logging
3
+ import os
4
+ import urllib.parse
5
+ from concurrent.futures import ThreadPoolExecutor
6
+ from functools import partial
7
+ from typing import Any, Dict, List, Optional, Tuple
8
+
9
+ import click
10
+ import duckdb
11
+ import gradio as gr
12
+ import requests
13
+ from bs4 import BeautifulSoup
14
+ from dotenv import load_dotenv
15
+ from jinja2 import BaseLoader, Environment
16
+ from openai import OpenAI
17
+
18
+ script_dir = os.path.dirname(os.path.abspath(__file__))
19
+ default_env_file = os.path.abspath(os.path.join(script_dir, ".env"))
20
+
21
+
22
+ def get_logger(log_level: str) -> logging.Logger:
23
+ logger = logging.getLogger(__name__)
24
+ logger.setLevel(log_level)
25
+ handler = logging.StreamHandler()
26
+ formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
27
+ handler.setFormatter(formatter)
28
+ logger.addHandler(handler)
29
+ return logger
30
+
31
+
32
+ class Ask:
33
+
34
+ def __init__(self, logger: Optional[logging.Logger] = None):
35
+ self.read_env_variables()
36
+
37
+ if logger is not None:
38
+ self.logger = logger
39
+ else:
40
+ self.logger = get_logger("INFO")
41
+
42
+ self.table_name = "document_chunks"
43
+ self.db_con = duckdb.connect(":memory:")
44
+
45
+ self.db_con.install_extension("vss")
46
+ self.db_con.load_extension("vss")
47
+ self.db_con.install_extension("fts")
48
+ self.db_con.load_extension("fts")
49
+ self.db_con.sql("CREATE SEQUENCE seq_docid START 1000")
50
+
51
+ self.db_con.execute(
52
+ f"""
53
+ CREATE TABLE {self.table_name} (
54
+ doc_id INTEGER PRIMARY KEY DEFAULT nextval('seq_docid'),
55
+ url TEXT,
56
+ chunk TEXT,
57
+ vec FLOAT[{self.embedding_dimensions}]
58
+ );
59
+ """
60
+ )
61
+
62
+ self.session = requests.Session()
63
+ user_agent: str = (
64
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
65
+ "AppleWebKit/537.36 (KHTML, like Gecko) "
66
+ "Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
67
+ )
68
+ self.session.headers.update({"User-Agent": user_agent})
69
+
70
+ def read_env_variables(self) -> None:
71
+ err_msg = ""
72
+
73
+ self.search_api_key = os.environ.get("SEARCH_API_KEY")
74
+ if self.search_api_key is None:
75
+ err_msg += "SEARCH_API_KEY env variable not set.\n"
76
+ self.search_project_id = os.environ.get("SEARCH_PROJECT_KEY")
77
+ if self.search_project_id is None:
78
+ err_msg += "SEARCH_PROJECT_KEY env variable not set.\n"
79
+ self.llm_api_key = os.environ.get("LLM_API_KEY")
80
+ if self.llm_api_key is None:
81
+ err_msg += "LLM_API_KEY env variable not set.\n"
82
+
83
+ if err_msg != "":
84
+ raise Exception(f"\n{err_msg}\n")
85
+
86
+ self.llm_base_url = os.environ.get("LLM_BASE_URL")
87
+ if self.llm_base_url is None:
88
+ self.llm_base_url = "https://api.openai.com/v1"
89
+
90
+ self.embedding_model = os.environ.get("EMBEDDING_MODEL")
91
+ self.embedding_dimensions = os.environ.get("EMBEDDING_DIMENSIONS")
92
+
93
+ if self.embedding_model is None or self.embedding_dimensions is None:
94
+ self.embedding_model = "text-embedding-3-small"
95
+ self.embedding_dimensions = 1536
96
+
97
+ def search_web(self, query: str, date_restrict: int, target_site: str) -> List[str]:
98
+ escaped_query = urllib.parse.quote(query)
99
+ url_base = (
100
+ f"https://www.googleapis.com/customsearch/v1?key={self.search_api_key}"
101
+ f"&cx={self.search_project_id}&q={escaped_query}"
102
+ )
103
+ url_paras = f"&safe=active"
104
+ if date_restrict is not None and date_restrict > 0:
105
+ url_paras += f"&dateRestrict={date_restrict}"
106
+ if target_site is not None and target_site != "":
107
+ url_paras += f"&siteSearch={target_site}&siteSearchFilter=i"
108
+ url = f"{url_base}{url_paras}"
109
+
110
+ self.logger.debug(f"Searching for query: {query}")
111
+
112
+ resp = requests.get(url)
113
+
114
+ if resp is None:
115
+ raise Exception("No response from search API")
116
+
117
+ search_results_dict = json.loads(resp.text)
118
+ if "error" in search_results_dict:
119
+ raise Exception(
120
+ f"Error in search API response: {search_results_dict['error']}"
121
+ )
122
+
123
+ if "searchInformation" not in search_results_dict:
124
+ raise Exception(
125
+ f"No search information in search API response: {resp.text}"
126
+ )
127
+
128
+ total_results = search_results_dict["searchInformation"].get("totalResults", 0)
129
+ if total_results == 0:
130
+ self.logger.warning(f"No results found for query: {query}")
131
+ return []
132
+
133
+ results = search_results_dict.get("items", [])
134
+ if results is None or len(results) == 0:
135
+ self.logger.warning(f"No result items in the response for query: {query}")
136
+ return []
137
+
138
+ found_links = []
139
+ for result in results:
140
+ link = result.get("link", None)
141
+ if link is None or link == "":
142
+ self.logger.warning(f"Search result link missing: {result}")
143
+ continue
144
+ found_links.append(link)
145
+ return found_links
146
+
147
+ def _scape_url(self, url: str) -> Tuple[str, str]:
148
+ try:
149
+ response = self.session.get(url, timeout=10)
150
+ soup = BeautifulSoup(response.content, "lxml", from_encoding="utf-8")
151
+
152
+ body_tag = soup.body
153
+ if body_tag:
154
+ body_text = body_tag.get_text()
155
+ body_text = " ".join(body_text.split()).strip()
156
+ self.logger.debug(f"Scraped {url}: {body_text}...")
157
+ if len(body_text) > 100:
158
+ return url, body_text
159
+ else:
160
+ self.logger.warning(
161
+ f"Body text too short for url: {url}, length: {len(body_text)}"
162
+ )
163
+ return url, ""
164
+ else:
165
+ self.logger.warning(f"No body tag found in the response for url: {url}")
166
+ return url, ""
167
+ except Exception as e:
168
+ self.logger.error(f"Scraping error {url}: {e}")
169
+ return url, ""
170
+
171
+ def scrape_urls(self, urls: List[str]) -> Dict[str, str]:
172
+ # the key is the url and the value is the body text
173
+ scrape_results: Dict[str, str] = {}
174
+
175
+ partial_scrape = partial(self._scape_url)
176
+ with ThreadPoolExecutor(max_workers=10) as executor:
177
+ results = executor.map(partial_scrape, urls)
178
+
179
+ for url, body_text in results:
180
+ if body_text != "":
181
+ scrape_results[url] = body_text
182
+
183
+ return scrape_results
184
+
185
+ def chunk_results(
186
+ self, scrape_results: Dict[str, str], size: int, overlap: int
187
+ ) -> Dict[str, List[str]]:
188
+ chunking_results: Dict[str, List[str]] = {}
189
+ for url, text in scrape_results.items():
190
+ chunks = []
191
+ for pos in range(0, len(text), size - overlap):
192
+ chunks.append(text[pos : pos + size])
193
+ chunking_results[url] = chunks
194
+ return chunking_results
195
+
196
+ def get_embedding(self, client: OpenAI, texts: List[str]) -> List[List[float]]:
197
+ if len(texts) == 0:
198
+ return []
199
+
200
+ response = client.embeddings.create(input=texts, model=self.embedding_model)
201
+ embeddings = []
202
+ for i in range(len(response.data)):
203
+ embeddings.append(response.data[i].embedding)
204
+ return embeddings
205
+
206
+ def batch_get_embedding(
207
+ self, client: OpenAI, chunk_batch: Tuple[str, List[str]]
208
+ ) -> Tuple[Tuple[str, List[str]], List[List[float]]]:
209
+ """
210
+ Return the chunk_batch as well as the embeddings for each chunk so that
211
+ we can aggregate them and save them to the database together.
212
+
213
+ Args:
214
+ - client: OpenAI client
215
+ - chunk_batch: Tuple of URL and list of chunks scraped from the URL
216
+
217
+ Returns:
218
+ - Tuple of chunk_batch and the list of result embeddings
219
+ """
220
+ texts = chunk_batch[1]
221
+ embeddings = self.get_embedding(client, texts)
222
+ return chunk_batch, embeddings
223
+
224
+ def save_to_db(self, chunking_results: Dict[str, List[str]]) -> None:
225
+ client = self._get_api_client()
226
+ embed_batch_size = 50
227
+ query_batch_size = 100
228
+ insert_data = []
229
+
230
+ batches: List[Tuple[str, List[str]]] = []
231
+ for url, list_chunks in chunking_results.items():
232
+ for i in range(0, len(list_chunks), embed_batch_size):
233
+ batch_chunks = list_chunks[i : i + embed_batch_size]
234
+ batches.append((url, batch_chunks))
235
+
236
+ self.logger.info(f"Embedding {len(batches)} batches of chunks ...")
237
+ partial_get_embedding = partial(self.batch_get_embedding, client)
238
+ with ThreadPoolExecutor(max_workers=10) as executor:
239
+ all_embeddings = executor.map(partial_get_embedding, batches)
240
+ self.logger.info(f"✅ Finished embedding.")
241
+
242
+ for chunk_batch, embeddings in all_embeddings:
243
+ url = chunk_batch[0]
244
+ list_chunks = chunk_batch[1]
245
+ insert_data.extend(
246
+ [
247
+ (url.replace("'", " "), chunk.replace("'", " "), embedding)
248
+ for chunk, embedding in zip(list_chunks, embeddings)
249
+ ]
250
+ )
251
+
252
+ for i in range(0, len(insert_data), query_batch_size):
253
+ # insert the batch into DuckDB
254
+ value_str = ", ".join(
255
+ [
256
+ f"('{url}', '{chunk}', {embedding})"
257
+ for url, chunk, embedding in insert_data[i : i + query_batch_size]
258
+ ]
259
+ )
260
+ query = f"""
261
+ INSERT INTO {self.table_name} (url, chunk, vec) VALUES {value_str};
262
+ """
263
+ self.db_con.execute(query)
264
+
265
+ self.db_con.execute(
266
+ f"""
267
+ CREATE INDEX cos_idx ON {self.table_name} USING HNSW (vec)
268
+ WITH (metric = 'cosine');
269
+ """
270
+ )
271
+ self.logger.info(f"✅ Created the vector index ...")
272
+ self.db_con.execute(
273
+ f"""
274
+ PRAGMA create_fts_index(
275
+ {self.table_name}, 'doc_id', 'chunk'
276
+ );
277
+ """
278
+ )
279
+ self.logger.info(f"✅ Created the full text search index ...")
280
+
281
+ def vector_search(self, query: str) -> List[Dict[str, Any]]:
282
+ client = self._get_api_client()
283
+ embeddings = self.get_embedding(client, [query])[0]
284
+
285
+ query_result: duckdb.DuckDBPyRelation = self.db_con.sql(
286
+ f"""
287
+ SELECT * FROM {self.table_name}
288
+ ORDER BY array_distance(vec, {embeddings}::FLOAT[{self.embedding_dimensions}])
289
+ LIMIT 10;
290
+ """
291
+ )
292
+
293
+ self.logger.debug(query_result)
294
+
295
+ matched_chunks = []
296
+ for record in query_result.fetchall():
297
+ result_record = {
298
+ "url": record[1],
299
+ "chunk": record[2],
300
+ }
301
+ matched_chunks.append(result_record)
302
+
303
+ return matched_chunks
304
+
305
+ def _get_api_client(self) -> OpenAI:
306
+ return OpenAI(api_key=self.llm_api_key, base_url=self.llm_base_url)
307
+
308
+ def _render_template(self, template_str: str, variables: Dict[str, Any]) -> str:
309
+ env = Environment(loader=BaseLoader(), autoescape=False)
310
+ template = env.from_string(template_str)
311
+ return template.render(variables)
312
+
313
+ def run_inference(
314
+ self,
315
+ query: str,
316
+ model_name: str,
317
+ matched_chunks: List[Dict[str, Any]],
318
+ output_language: str,
319
+ output_length: int,
320
+ ) -> str:
321
+ system_prompt = (
322
+ "You are an expert summarizing the answers based on the provided contents."
323
+ )
324
+ user_promt_template = """
325
+ Given the context as a sequence of references with a reference id in the
326
+ format of a leading [x], please answer the following question using {{ language }}:
327
+
328
+ {{ query }}
329
+
330
+ In the answer, use format [1], [2], ..., [n] in line where the reference is used.
331
+ For example, "According to the research from Google[3], ...".
332
+
333
+ Please create the answer strictly related to the context. If the context has no
334
+ information about the query, please write "No related information found in the context."
335
+ using {{ language }}.
336
+
337
+ {{ length_instructions }}
338
+
339
+ Here is the context:
340
+ {{ context }}
341
+ """
342
+ context = ""
343
+ for i, chunk in enumerate(matched_chunks):
344
+ context += f"[{i+1}] {chunk['chunk']}\n"
345
+
346
+ if output_length is None or output_length == 0:
347
+ length_instructions = ""
348
+ else:
349
+ length_instructions = (
350
+ f"Please provide the answer in { output_length } words."
351
+ )
352
+
353
+ user_prompt = self._render_template(
354
+ user_promt_template,
355
+ {
356
+ "query": query,
357
+ "context": context,
358
+ "language": output_language,
359
+ "length_instructions": length_instructions,
360
+ },
361
+ )
362
+
363
+ self.logger.debug(f"Running inference with model: {model_name}")
364
+ self.logger.debug(f"Final user prompt: {user_prompt}")
365
+
366
+ api_client = self._get_api_client()
367
+ completion = api_client.chat.completions.create(
368
+ model=model_name,
369
+ messages=[
370
+ {
371
+ "role": "system",
372
+ "content": system_prompt,
373
+ },
374
+ {
375
+ "role": "user",
376
+ "content": user_prompt,
377
+ },
378
+ ],
379
+ )
380
+ if completion is None:
381
+ raise Exception("No completion from the API")
382
+
383
+ response_str = completion.choices[0].message.content
384
+ return response_str
385
+
386
+
387
+ def _read_url_list(url_list_file: str) -> str:
388
+ if url_list_file is None:
389
+ return None
390
+
391
+ with open(url_list_file, "r") as f:
392
+ links = f.readlines()
393
+ links = [
394
+ link.strip()
395
+ for link in links
396
+ if link.strip() != "" and not link.startswith("#")
397
+ ]
398
+ return "\n".join(links)
399
+
400
+
401
+ def _run_query(
402
+ query: str,
403
+ date_restrict: int,
404
+ target_site: str,
405
+ output_language: str,
406
+ output_length: int,
407
+ url_list_str: str,
408
+ model_name: str,
409
+ log_level: str,
410
+ ) -> str:
411
+ logger = get_logger(log_level)
412
+
413
+ load_dotenv(dotenv_path=default_env_file, override=False)
414
+
415
+ ask = Ask(logger=logger)
416
+
417
+ if url_list_str is None or url_list_str.strip() == "":
418
+ logger.info("Searching the web ...")
419
+ links = ask.search_web(query, date_restrict, target_site)
420
+ logger.info(f"✅ Found {len(links)} links for query: {query}")
421
+ for i, link in enumerate(links):
422
+ logger.debug(f"{i+1}. {link}")
423
+ else:
424
+ links = url_list_str.split("\n")
425
+
426
+ logger.info("Scraping the URLs ...")
427
+ scrape_results = ask.scrape_urls(links)
428
+ logger.info(f"✅ Scraped {len(scrape_results)} URLs.")
429
+
430
+ logger.info("Chunking the text ...")
431
+ chunking_results = ask.chunk_results(scrape_results, 1000, 100)
432
+ total_chunks = 0
433
+ for url, chunks in chunking_results.items():
434
+ logger.debug(f"URL: {url}")
435
+ total_chunks += len(chunks)
436
+ for i, chunk in enumerate(chunks):
437
+ logger.debug(f"Chunk {i+1}: {chunk}")
438
+ logger.info(f"✅ Generated {total_chunks} chunks ...")
439
+
440
+ logger.info(f"Saving {total_chunks} chunks to DB ...")
441
+ ask.save_to_db(chunking_results)
442
+ logger.info(f"✅ Successfully embedded and saved chunks to DB.")
443
+
444
+ logger.info("Querying the vector DB to get context ...")
445
+ matched_chunks = ask.vector_search(query)
446
+ for i, result in enumerate(matched_chunks):
447
+ logger.debug(f"{i+1}. {result}")
448
+ logger.info(f"✅ Got {len(matched_chunks)} matched chunks.")
449
+
450
+ logger.info("Running inference with context ...")
451
+ answer = ask.run_inference(
452
+ query=query,
453
+ model_name=model_name,
454
+ matched_chunks=matched_chunks,
455
+ output_language=output_language,
456
+ output_length=output_length,
457
+ )
458
+ logger.info("✅ Finished inference API call.")
459
+ logger.info("generating output ...")
460
+
461
+ answer = f"# Answer\n\n{answer}\n"
462
+ references = "\n".join(
463
+ [f"[{i+1}] {result['url']}" for i, result in enumerate(matched_chunks)]
464
+ )
465
+ return f"{answer}\n\n# References\n\n{references}"
466
+
467
+
468
+ def launch_gradio(
469
+ query: str,
470
+ date_restrict: int,
471
+ target_site: str,
472
+ output_language: str,
473
+ output_length: int,
474
+ url_list_str: str,
475
+ model_name: str,
476
+ log_level: str,
477
+ ) -> None:
478
+ iface = gr.Interface(
479
+ fn=_run_query,
480
+ inputs=[
481
+ gr.Textbox(label="Query", value=query),
482
+ gr.Number(
483
+ label="Date Restrict (Optional) [0 or empty means no date limit.]",
484
+ value=date_restrict,
485
+ ),
486
+ gr.Textbox(
487
+ label="Target Sites (Optional) [Empty means search the whole web.]",
488
+ value=target_site,
489
+ ),
490
+ gr.Textbox(
491
+ label="Output Language (Optional) [Default is English.]",
492
+ value=output_language,
493
+ ),
494
+ gr.Number(
495
+ label="Output Length in words (Optional) [Default is automatically decided by LLM.]",
496
+ value=output_length,
497
+ ),
498
+ gr.Textbox(
499
+ label="URL List (Optional) [When specified, scrape the urls instead of searching the web.]",
500
+ lines=5,
501
+ max_lines=20,
502
+ value=url_list_str,
503
+ ),
504
+ ],
505
+ additional_inputs=[
506
+ gr.Textbox(label="Model Name", value=model_name),
507
+ gr.Textbox(label="Log Level", value=log_level),
508
+ ],
509
+ outputs="text",
510
+ show_progress=True,
511
+ flagging_options=[("Report Error", None)],
512
+ title="Ask.py - Web Search-Extract-Summarize",
513
+ description="Search the web with the query and summarize the results. Source code: https://github.com/pengfeng/ask.py",
514
+ )
515
+
516
+ iface.launch()
517
+
518
+
519
+ @click.command(help="Search web for the query and summarize the results")
520
+ @click.option(
521
+ "--web-ui",
522
+ is_flag=True,
523
+ help="Launch the web interface",
524
+ )
525
+ @click.option("--query", "-q", required=False, help="Query to search")
526
+ @click.option(
527
+ "--date-restrict",
528
+ "-d",
529
+ type=int,
530
+ required=False,
531
+ default=None,
532
+ help="Restrict search results to a specific date range, default is no restriction",
533
+ )
534
+ @click.option(
535
+ "--target-site",
536
+ "-s",
537
+ required=False,
538
+ default=None,
539
+ help="Restrict search results to a specific site, default is no restriction",
540
+ )
541
+ @click.option(
542
+ "--output-language",
543
+ required=False,
544
+ default="English",
545
+ help="Output language for the answer",
546
+ )
547
+ @click.option(
548
+ "--output-length",
549
+ type=int,
550
+ required=False,
551
+ default=None,
552
+ help="Output length for the answer",
553
+ )
554
+ @click.option(
555
+ "--url-list-file",
556
+ type=str,
557
+ required=False,
558
+ default=None,
559
+ show_default=True,
560
+ help="Instead of doing web search, scrape the target URL list and answer the query based on the content",
561
+ )
562
+ @click.option(
563
+ "--model-name",
564
+ "-m",
565
+ required=False,
566
+ default="gpt-4o-mini",
567
+ help="Model name to use for inference",
568
+ )
569
+ @click.option(
570
+ "-l",
571
+ "--log-level",
572
+ "log_level",
573
+ default="INFO",
574
+ type=click.Choice(["DEBUG", "INFO", "WARNING", "ERROR"], case_sensitive=False),
575
+ help="Set the logging level",
576
+ show_default=True,
577
+ )
578
+ def search_extract_summarize(
579
+ web_ui: bool,
580
+ query: str,
581
+ date_restrict: int,
582
+ target_site: str,
583
+ output_language: str,
584
+ output_length: int,
585
+ url_list_file: str,
586
+ model_name: str,
587
+ log_level: str,
588
+ ):
589
+ if web_ui:
590
+ launch_gradio(
591
+ query=query,
592
+ date_restrict=date_restrict,
593
+ target_site=target_site,
594
+ output_language=output_language,
595
+ output_length=output_length,
596
+ url_list_str=_read_url_list(url_list_file),
597
+ model_name=model_name,
598
+ log_level=log_level,
599
+ )
600
+ else:
601
+ if query is None:
602
+ raise Exception("Query is required for the command line mode")
603
+
604
+ result = _run_query(
605
+ query=query,
606
+ date_restrict=date_restrict,
607
+ target_site=target_site,
608
+ output_language=output_language,
609
+ output_length=output_length,
610
+ url_list_str=_read_url_list(url_list_file),
611
+ model_name=model_name,
612
+ log_level=log_level,
613
+ )
614
+ click.echo(result)
615
+
616
+
617
+ if __name__ == "__main__":
618
+ search_extract_summarize()
instructions/links.txt ADDED
@@ -0,0 +1,3 @@
1
+ # we will crawl these pages and answer the question based on their contents
2
+ https://en.wikipedia.org/wiki/Large_language_model
3
+ https://en.wikipedia.org/wiki/Retrieval-augmented_generation
requirements.txt ADDED
@@ -0,0 +1,9 @@
1
+ click==8.1.7
2
+ requests==2.31.0
3
+ openai==1.40.2
4
+ jinja2==3.1.3
5
+ bs4==0.0.2
6
+ lxml==4.8.0
7
+ python-dotenv==1.0.1
8
+ duckdb==1.1.2
9
+ gradio==5.3.0