Commit aec7bcb
Parent(s): 7af562a
Adding notebooks for hugging face transformers

Files changed:
- .gitignore +143 -0
- .ipynb_checkpoints/Example-Notebook-checkpoint.ipynb +0 -64
- .jupyter/desktop-settings.json +0 -3
- .jupyter/desktop-workspaces/default-37a8.jupyterlab-workspace +0 -1
- 01-Transformers/00-HF-basics.ipynb +653 -0
- 01-Transformers/01-Pipelines-for-NLP-Tasks.ipynb +711 -0
- 01-Transformers/02-LLMs.ipynb +0 -0
.gitignore
ADDED
@@ -0,0 +1,143 @@
+
+# https://github.com/jupyter/notebook/blob/main/.gitignore
+
+**/*.ipynb_checkpoints/
+**/*.jupyter/
+
+*.bundle.*
+lib/
+node_modules/
+*.egg-info/
+.ipynb_checkpoints
+*.tsbuildinfo
+
+# Created by https://www.gitignore.io/api/python
+# Edit at https://www.gitignore.io/?templates=python
+
+### Python ###
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# Mr Developer
+.mr.developer.cfg
+.project
+.pydevproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# OS X stuff
+*.DS_Store
+
+# End of https://www.gitignore.io/api/python
+
+_temp_extension
+junit.xml
+[uU]ntitled*
+notebook/static/*
+!notebook/static/favicons
+notebook/labextension
+notebook/schemas
+docs/source/changelog.md
+docs/source/contributing.md
+
+# playwright
+ui-tests/test-results
+ui-tests/playwright-report
+
+# VSCode
+.vscode
+
+# RTC
+.jupyter_ystore.db
+
+# yarn >=2.x local files
+.yarn/*
+.pnp.*
+ui-tests/.yarn/*
+ui-tests/.pnp.*
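The rules added above are why this commit can also delete the previously checked-in .ipynb_checkpoints/ and .jupyter/ files: once ignored, they will not be re-added by a broad git add. As a quick sanity check, here is a minimal sketch (assuming the third-party pathspec package, which implements gitignore-style matching, is installed; it is not part of this repo) that tests a few representative paths against the new patterns:

```python
import pathspec  # assumption: pip install pathspec

# Load the rules added above and check which paths they would ignore.
with open(".gitignore") as fh:
    spec = pathspec.PathSpec.from_lines("gitwildmatch", fh)

for path in ["junit.xml", "Untitled3.ipynb", "module.pyc", "notes/.DS_Store"]:
    print(f"{path}: {'ignored' if spec.match_file(path) else 'not ignored'}")
```

This only illustrates the pattern syntax; git itself remains the authority on how directory rules such as `**/*.ipynb_checkpoints/` cascade to the files inside those directories.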
.ipynb_checkpoints/Example-Notebook-checkpoint.ipynb
DELETED
@@ -1,64 +0,0 @@
-{
-"cells": [
-{
-"cell_type": "code",
-"execution_count": 2,
-"id": "79519eae-15e4-4f94-8ec5-c2a7724a3275",
-"metadata": {},
-"outputs": [],
-"source": [
-"# My First Notebook for Hugging face"
-]
-},
-{
-"cell_type": "code",
-"execution_count": 3,
-"id": "39677f86-b552-458f-9d40-366b11ea2f0e",
-"metadata": {},
-"outputs": [
-{
-"data": {
-"text/plain": [
-"24"
-]
-},
-"execution_count": 3,
-"metadata": {},
-"output_type": "execute_result"
-}
-],
-"source": [
-"12+12"
-]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"id": "7ed8225a-3777-48ba-a76a-aa6cc903d083",
-"metadata": {},
-"outputs": [],
-"source": []
-}
-],
-"metadata": {
-"kernelspec": {
-"display_name": "Python 3 (ipykernel)",
-"language": "python",
-"name": "python3"
-},
-"language_info": {
-"codemirror_mode": {
-"name": "ipython",
-"version": 3
-},
-"file_extension": ".py",
-"mimetype": "text/x-python",
-"name": "python",
-"nbconvert_exporter": "python",
-"pygments_lexer": "ipython3",
-"version": "3.8.18"
-}
-},
-"nbformat": 4,
-"nbformat_minor": 5
-}
.jupyter/desktop-settings.json
DELETED
@@ -1,3 +0,0 @@
-{
-"pythonPath": "/Users/milindchawre/Library/jupyterlab-desktop/jlab_server/bin/python"
-}
.jupyter/desktop-workspaces/default-37a8.jupyterlab-workspace
DELETED
@@ -1 +0,0 @@
-{"data":{"layout-restorer:data":{"main":{"dock":{"type":"tab-area","currentIndex":1,"widgets":["notebook:Example-Notebook.ipynb"]},"current":"notebook:Example-Notebook.ipynb"},"down":{"size":0,"widgets":[]},"left":{"collapsed":false,"visible":true,"current":"filebrowser","widgets":["filebrowser","running-sessions","@jupyterlab/toc:plugin","extensionmanager.main-view"]},"right":{"collapsed":true,"visible":true,"widgets":["jp-property-inspector","debugger-sidebar"]},"relativeSizes":[0.26227795193312436,0.7377220480668757,0],"top":{"simpleVisibility":true}},"notebook:Example-Notebook.ipynb":{"data":{"path":"Example-Notebook.ipynb","factory":"Notebook"}}},"metadata":{"id":"default"}}
01-Transformers/00-HF-basics.ipynb
ADDED
@@ -0,0 +1,653 @@
+{
+"cells": [
+{
+"cell_type": "code",
+"execution_count": 24,
+"id": "27b0322e-d6a8-4202-9f78-8d2754ebdd97",
+"metadata": {},
+"outputs": [],
+"source": [
+"#!pip list | grep hugging"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 40,
+"id": "da82a90f-7098-4d0c-9fe8-3e0cfc39671d",
+"metadata": {},
+"outputs": [],
+"source": [
+"#!pip install transformers datasets"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 12,
+"id": "829575c2-c292-4455-8cc6-48764e64c4b0",
+"metadata": {},
+"outputs": [],
+"source": [
+"#!pip install torch"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 1,
+"id": "ba0ced0b-35cd-40fd-934f-1013d4a1364d",
+"metadata": {},
+"outputs": [
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"/Users/milindchawre/.pyenv/versions/3.12.2/envs/hugging-face/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+" from .autonotebook import tqdm as notebook_tqdm\n"
+]
+}
+],
+"source": [
+"import transformers\n",
+"import datasets"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 2,
+"id": "d196c435-fa5a-4c3b-bec4-0181aa00e8bb",
+"metadata": {},
+"outputs": [
+{
+"data": {
+"text/plain": [
+"'4.44.0'"
+]
+},
+"execution_count": 2,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
+"source": [
+"transformers.__version__"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 3,
+"id": "55ddfdaa-3a22-4eab-ad36-24355cbb7fee",
+"metadata": {},
+"outputs": [
+{
+"data": {
+"text/plain": [
+"'2.21.0'"
+]
+},
+"execution_count": 3,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
+"source": [
+"datasets.__version__"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 4,
+"id": "13b8669b-5cc5-40b8-bd22-a7ff44fa43f3",
+"metadata": {},
+"outputs": [],
+"source": [
+"from datasets import load_dataset"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 5,
+"id": "a722a796-84b7-4c45-a104-3d863d52cbb5",
+"metadata": {},
+"outputs": [],
+"source": [
+"reviews = load_dataset('rotten_tomatoes')"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 6,
+"id": "607c352e-70c8-4697-b9a1-a4c68e55d502",
+"metadata": {},
+"outputs": [
+{
+"data": {
+"text/plain": [
+"datasets.dataset_dict.DatasetDict"
+]
+},
+"execution_count": 6,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
+"source": [
+"type(reviews)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 7,
+"id": "e0637ff3-90b9-41cf-bdd0-ba3dbe185225",
+"metadata": {},
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"DatasetDict({\n",
+" train: Dataset({\n",
+" features: ['text', 'label'],\n",
+" num_rows: 8530\n",
+" })\n",
+" validation: Dataset({\n",
+" features: ['text', 'label'],\n",
+" num_rows: 1066\n",
+" })\n",
+" test: Dataset({\n",
+" features: ['text', 'label'],\n",
+" num_rows: 1066\n",
+" })\n",
+"})\n"
+]
+}
+],
+"source": [
+"print(reviews)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 8,
+"id": "79f60106-7628-4605-920f-6bb8375e6cb5",
+"metadata": {},
+"outputs": [
+{
+"data": {
+"text/html": [
+"<div>\n",
+"<style scoped>\n",
+" .dataframe tbody tr th:only-of-type {\n",
+" vertical-align: middle;\n",
+" }\n",
+"\n",
+" .dataframe tbody tr th {\n",
+" vertical-align: top;\n",
+" }\n",
+"\n",
+" .dataframe thead th {\n",
+" text-align: right;\n",
+" }\n",
+"</style>\n",
+"<table border=\"1\" class=\"dataframe\">\n",
+" <thead>\n",
+" <tr style=\"text-align: right;\">\n",
+" <th></th>\n",
+" <th>text</th>\n",
+" <th>label</th>\n",
+" </tr>\n",
+" </thead>\n",
+" <tbody>\n",
+" <tr>\n",
+" <th>0</th>\n",
+" <td>the rock is destined to be the 21st century's ...</td>\n",
+" <td>1</td>\n",
+" </tr>\n",
+" <tr>\n",
+" <th>1</th>\n",
+" <td>the gorgeously elaborate continuation of \" the...</td>\n",
+" <td>1</td>\n",
+" </tr>\n",
+" <tr>\n",
+" <th>2</th>\n",
+" <td>effective but too-tepid biopic</td>\n",
+" <td>1</td>\n",
+" </tr>\n",
+" <tr>\n",
+" <th>3</th>\n",
+" <td>if you sometimes like to go to the movies to h...</td>\n",
+" <td>1</td>\n",
+" </tr>\n",
+" <tr>\n",
+" <th>4</th>\n",
+" <td>emerges as something rare , an issue movie tha...</td>\n",
+" <td>1</td>\n",
+" </tr>\n",
+" <tr>\n",
+" <th>...</th>\n",
+" <td>...</td>\n",
+" <td>...</td>\n",
+" </tr>\n",
+" <tr>\n",
+" <th>8525</th>\n",
+" <td>any enjoyment will be hinge from a personal th...</td>\n",
+" <td>0</td>\n",
+" </tr>\n",
+" <tr>\n",
+" <th>8526</th>\n",
+" <td>if legendary shlockmeister ed wood had ever ma...</td>\n",
+" <td>0</td>\n",
+" </tr>\n",
+" <tr>\n",
+" <th>8527</th>\n",
+" <td>hardly a nuanced portrait of a young woman's b...</td>\n",
+" <td>0</td>\n",
+" </tr>\n",
+" <tr>\n",
+" <th>8528</th>\n",
+" <td>interminably bleak , to say nothing of boring .</td>\n",
+" <td>0</td>\n",
+" </tr>\n",
+" <tr>\n",
+" <th>8529</th>\n",
+" <td>things really get weird , though not particula...</td>\n",
+" <td>0</td>\n",
+" </tr>\n",
+" </tbody>\n",
+"</table>\n",
+"<p>8530 rows × 2 columns</p>\n",
+"</div>"
+],
+"text/plain": [
+" text label\n",
+"0 the rock is destined to be the 21st century's ... 1\n",
+"1 the gorgeously elaborate continuation of \" the... 1\n",
+"2 effective but too-tepid biopic 1\n",
+"3 if you sometimes like to go to the movies to h... 1\n",
+"4 emerges as something rare , an issue movie tha... 1\n",
+"... ... ...\n",
+"8525 any enjoyment will be hinge from a personal th... 0\n",
+"8526 if legendary shlockmeister ed wood had ever ma... 0\n",
+"8527 hardly a nuanced portrait of a young woman's b... 0\n",
+"8528 interminably bleak , to say nothing of boring . 0\n",
+"8529 things really get weird , though not particula... 0\n",
+"\n",
+"[8530 rows x 2 columns]"
+]
+},
+"execution_count": 8,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
+"source": [
+"reviews['train'].to_pandas()"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 9,
+"id": "8971be5a-f5ba-4cb9-be88-7fa509b201ef",
+"metadata": {},
+"outputs": [
+{
+"data": {
+"text/plain": [
+"'the rock is destined to be the 21st century\\'s new \" conan \" and that he\\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'"
+]
+},
+"execution_count": 9,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
+"source": [
+"reviews['train'].to_pandas()['text'][0]"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 10,
+"id": "74db724b-063c-4c91-9681-37231a5a09fd",
+"metadata": {},
+"outputs": [],
+"source": [
+"from transformers import pipeline\n",
+"import torch"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 11,
+"id": "4a22ba6c-2db8-4dff-9abd-037f1a08fc7a",
+"metadata": {
+"editable": true,
+"slideshow": {
+"slide_type": ""
+},
+"tags": []
+},
+"outputs": [
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).\n",
+"Using a pipeline without specifying a model name and revision in production is not recommended.\n",
+"/Users/milindchawre/.pyenv/versions/3.12.2/envs/hugging-face/lib/python3.12/site-packages/transformers/tokenization_utils_base.py:1601: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n",
+" warnings.warn(\n"
+]
+}
+],
+"source": [
+"classifier = pipeline(\"sentiment-analysis\", device=0)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 12,
+"id": "2d64c34d-c2d8-4cda-9c04-91e5497304a9",
+"metadata": {},
+"outputs": [
+{
+"data": {
+"text/plain": [
+"[{'label': 'POSITIVE', 'score': 0.9998668432235718}]"
+]
+},
+"execution_count": 12,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
+"source": [
+"classifier(\"This was great movie!\")"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 13,
+"id": "fc11b406-3770-44ea-9654-4ec2e2e1081b",
+"metadata": {},
+"outputs": [
+{
+"data": {
+"text/plain": [
+"[{'label': 'POSITIVE', 'score': 0.9998465776443481}]"
+]
+},
+"execution_count": 13,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
+"source": [
+"classifier(\"This was great ok!\")"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 14,
+"id": "682a57fc-d2f6-4ad5-b05c-ad4a311216fa",
+"metadata": {},
+"outputs": [
+{
+"data": {
+"text/plain": [
+"[{'label': 'NEGATIVE', 'score': 0.9997976422309875}]"
+]
+},
+"execution_count": 14,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
+"source": [
+"classifier(\"This was not that good movie!\")"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 15,
+"id": "3e790b3e-cb65-46a8-9bcb-74503742fcb6",
+"metadata": {},
+"outputs": [
+{
+"data": {
+"text/plain": [
+"[{'label': 'NEGATIVE', 'score': 0.9997455477714539}]"
+]
+},
+"execution_count": 15,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
+"source": [
+"classifier(\"This was worst movie!\")"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 18,
+"id": "56f1f502-f683-4c3a-924d-9a16e5d6c55c",
+"metadata": {},
+"outputs": [
+{
+"data": {
+"text/plain": [
+"[{'label': 'NEGATIVE', 'score': 0.9991676807403564}]"
+]
+},
+"execution_count": 18,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
+"source": [
+"classifier(\"In the movie, the acting was fine, but the story was bad, while the costume was good but the daigloues are boring!\")"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 19,
+"id": "a8755bd0-5c4b-4da7-84d5-fb1986188d4e",
+"metadata": {},
+"outputs": [],
+"source": [
+"def score(review_text):\n",
+" return classifier(review_text)[0]['label']"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 20,
+"id": "1de830b3-bb2d-42ce-90b6-b2868017d499",
+"metadata": {},
+"outputs": [],
+"source": [
+"test_df = reviews['test'].to_pandas()"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 22,
+"id": "0b0004ea-562c-493b-8a0c-125dd20a185f",
+"metadata": {},
+"outputs": [],
+"source": [
+"test_df['model_prediction'] = test_df['text'].apply(score)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 24,
+"id": "21715d4f-51f2-4118-b5d8-23b5d432d3b6",
+"metadata": {},
+"outputs": [
+{
+"data": {
+"text/html": [
+"<div>\n",
+"<style scoped>\n",
+" .dataframe tbody tr th:only-of-type {\n",
+" vertical-align: middle;\n",
+" }\n",
+"\n",
+" .dataframe tbody tr th {\n",
+" vertical-align: top;\n",
+" }\n",
+"\n",
+" .dataframe thead th {\n",
+" text-align: right;\n",
+" }\n",
+"</style>\n",
+"<table border=\"1\" class=\"dataframe\">\n",
+" <thead>\n",
+" <tr style=\"text-align: right;\">\n",
+" <th></th>\n",
+" <th>text</th>\n",
+" <th>label</th>\n",
+" <th>model_prediction</th>\n",
+" </tr>\n",
+" </thead>\n",
+" <tbody>\n",
+" <tr>\n",
+" <th>0</th>\n",
+" <td>lovingly photographed in the manner of a golde...</td>\n",
+" <td>1</td>\n",
+" <td>POSITIVE</td>\n",
+" </tr>\n",
+" <tr>\n",
+" <th>1</th>\n",
+" <td>consistently clever and suspenseful .</td>\n",
+" <td>1</td>\n",
+" <td>POSITIVE</td>\n",
+" </tr>\n",
+" <tr>\n",
+" <th>2</th>\n",
+" <td>it's like a \" big chill \" reunion of the baade...</td>\n",
+" <td>1</td>\n",
+" <td>NEGATIVE</td>\n",
+" </tr>\n",
+" <tr>\n",
+" <th>3</th>\n",
+" <td>the story gives ample opportunity for large-sc...</td>\n",
+" <td>1</td>\n",
+" <td>POSITIVE</td>\n",
+" </tr>\n",
+" <tr>\n",
+" <th>4</th>\n",
+" <td>red dragon \" never cuts corners .</td>\n",
+" <td>1</td>\n",
+" <td>POSITIVE</td>\n",
+" </tr>\n",
+" <tr>\n",
+" <th>...</th>\n",
+" <td>...</td>\n",
+" <td>...</td>\n",
+" <td>...</td>\n",
+" </tr>\n",
+" <tr>\n",
+" <th>1061</th>\n",
+" <td>a terrible movie that some people will neverth...</td>\n",
+" <td>0</td>\n",
+" <td>NEGATIVE</td>\n",
+" </tr>\n",
+" <tr>\n",
+" <th>1062</th>\n",
+" <td>there are many definitions of 'time waster' bu...</td>\n",
+" <td>0</td>\n",
+" <td>NEGATIVE</td>\n",
+" </tr>\n",
+" <tr>\n",
+" <th>1063</th>\n",
+" <td>as it stands , crocodile hunter has the hurrie...</td>\n",
+" <td>0</td>\n",
+" <td>NEGATIVE</td>\n",
+" </tr>\n",
+" <tr>\n",
+" <th>1064</th>\n",
+" <td>the thing looks like a made-for-home-video qui...</td>\n",
+" <td>0</td>\n",
+" <td>NEGATIVE</td>\n",
+" </tr>\n",
+" <tr>\n",
+" <th>1065</th>\n",
+" <td>enigma is well-made , but it's just too dry an...</td>\n",
+" <td>0</td>\n",
+" <td>NEGATIVE</td>\n",
+" </tr>\n",
+" </tbody>\n",
+"</table>\n",
+"<p>1066 rows × 3 columns</p>\n",
+"</div>"
+],
+"text/plain": [
+" text label \\\n",
+"0 lovingly photographed in the manner of a golde... 1 \n",
+"1 consistently clever and suspenseful . 1 \n",
+"2 it's like a \" big chill \" reunion of the baade... 1 \n",
+"3 the story gives ample opportunity for large-sc... 1 \n",
+"4 red dragon \" never cuts corners . 1 \n",
+"... ... ... \n",
+"1061 a terrible movie that some people will neverth... 0 \n",
+"1062 there are many definitions of 'time waster' bu... 0 \n",
+"1063 as it stands , crocodile hunter has the hurrie... 0 \n",
+"1064 the thing looks like a made-for-home-video qui... 0 \n",
+"1065 enigma is well-made , but it's just too dry an... 0 \n",
+"\n",
+" model_prediction \n",
+"0 POSITIVE \n",
+"1 POSITIVE \n",
+"2 NEGATIVE \n",
+"3 POSITIVE \n",
+"4 POSITIVE \n",
+"... ... \n",
+"1061 NEGATIVE \n",
+"1062 NEGATIVE \n",
+"1063 NEGATIVE \n",
+"1064 NEGATIVE \n",
+"1065 NEGATIVE \n",
+"\n",
+"[1066 rows x 3 columns]"
+]
+},
+"execution_count": 24,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
+"source": [
+"test_df"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "3c916443-92ad-4dc6-a5bf-10aad52d8b4c",
+"metadata": {},
+"outputs": [],
+"source": []
+}
+],
+"metadata": {
+"kernelspec": {
+"display_name": "Python 3 (ipykernel)",
+"language": "python",
+"name": "python3"
+},
+"language_info": {
+"codemirror_mode": {
+"name": "ipython",
+"version": 3
+},
+"file_extension": ".py",
+"mimetype": "text/x-python",
+"name": "python",
+"nbconvert_exporter": "python",
+"pygments_lexer": "ipython3",
+"version": "3.12.2"
+}
+},
+"nbformat": 4,
+"nbformat_minor": 5
+}
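00-HF-basics.ipynb stops at displaying test_df with the new model_prediction column next to the gold label column. A natural follow-up, sketched below (this is not part of the committed notebook; it assumes the test_df built above, with integer labels where 1 = positive and string predictions 'POSITIVE'/'NEGATIVE'), is to map the pipeline's labels back to the dataset's 0/1 encoding and compute accuracy on the rotten_tomatoes test split:

```python
# Follow-up sketch, not in the committed notebook: score the predictions
# stored in test_df['model_prediction'] against the dataset labels.
label_map = {"POSITIVE": 1, "NEGATIVE": 0}

predicted = test_df["model_prediction"].map(label_map)
accuracy = (predicted == test_df["label"]).mean()
print(f"Accuracy on the rotten_tomatoes test split: {accuracy:.3f}")
```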
01-Transformers/01-Pipelines-for-NLP-Tasks.ipynb
ADDED
@@ -0,0 +1,711 @@
+{
+"cells": [
+{
+"cell_type": "markdown",
+"id": "b89da7e6-431a-4659-a5b3-45323d11fd03",
+"metadata": {},
+"source": [
+"# Pipelines for NLP Tasks"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 2,
+"id": "76d9d1e3-05e1-456d-b564-5d096896a778",
+"metadata": {},
+"outputs": [
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"/Users/milindchawre/.pyenv/versions/3.12.2/envs/hugging-face/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+" from .autonotebook import tqdm as notebook_tqdm\n"
+]
+}
+],
+"source": [
+"import transformers\n",
+"from transformers import pipeline"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 3,
+"id": "cf038c7c-13ce-4231-8acb-2a6b8de67de6",
+"metadata": {},
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"4.44.0\n"
+]
+}
+],
+"source": [
+"print(transformers.__version__)"
+]
+},
+{
+"cell_type": "markdown",
+"id": "1dbc51c2-1efd-4811-929a-54ef8424c30c",
+"metadata": {},
+"source": [
+"## Loading Tasks\n",
+"\n",
+"The task defining which pipeline will be returned. Currently accepted tasks are:\n",
+" \n",
+" - `\"audio-classification\"`: will return a [`AudioClassificationPipeline`].\n",
+" - `\"automatic-speech-recognition\"`: will return a [`AutomaticSpeechRecognitionPipeline`].\n",
+" - `\"conversational\"`: will return a [`ConversationalPipeline`].\n",
+" - `\"depth-estimation\"`: will return a [`DepthEstimationPipeline`].\n",
+" - `\"document-question-answering\"`: will return a [`DocumentQuestionAnsweringPipeline`].\n",
+" - `\"feature-extraction\"`: will return a [`FeatureExtractionPipeline`].\n",
+" - `\"fill-mask\"`: will return a [`FillMaskPipeline`]:.\n",
+" - `\"image-classification\"`: will return a [`ImageClassificationPipeline`].\n",
+" - `\"image-segmentation\"`: will return a [`ImageSegmentationPipeline`].\n",
+" - `\"image-to-text\"`: will return a [`ImageToTextPipeline`].\n",
+" - `\"object-detection\"`: will return a [`ObjectDetectionPipeline`].\n",
+" - `\"question-answering\"`: will return a [`QuestionAnsweringPipeline`].\n",
+" - `\"summarization\"`: will return a [`SummarizationPipeline`].\n",
+" - `\"table-question-answering\"`: will return a [`TableQuestionAnsweringPipeline`].\n",
+" - `\"text2text-generation\"`: will return a [`Text2TextGenerationPipeline`].\n",
+" - `\"text-classification\"` (alias `\"sentiment-analysis\"` available): will return a\n",
+" [`TextClassificationPipeline`].\n",
+" - `\"text-generation\"`: will return a [`TextGenerationPipeline`]:.\n",
+" - `\"token-classification\"` (alias `\"ner\"` available): will return a [`TokenClassificationPipeline`].\n",
+" - `\"translation\"`: will return a [`TranslationPipeline`].\n",
+" - `\"translation_xx_to_yy\"`: will return a [`TranslationPipeline`].\n",
+" - `\"video-classification\"`: will return a [`VideoClassificationPipeline`].\n",
+" - `\"visual-question-answering\"`: will return a [`VisualQuestionAnsweringPipeline`].\n",
+" - `\"zero-shot-classification\"`: will return a [`ZeroShotClassificationPipeline`].\n",
+" - `\"zero-shot-image-classification\"`: will return a [`ZeroShotImageClassificationPipeline`].\n",
+" - `\"zero-shot-object-detection\"`: will return a [`ZeroShotObjectDetectionPipeline`]."
+]
+},
+{
+"cell_type": "markdown",
+"id": "aaa893bc-026c-456a-99f0-8f56a47da96e",
+"metadata": {
+"tags": []
+},
+"source": [
+"## Classification \n",
+"\n",
+"### Default Models"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 4,
+"id": "c3498239-01f0-49b1-bf3e-d9cba22d41ac",
+"metadata": {},
+"outputs": [
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).\n",
+"Using a pipeline without specifying a model name and revision in production is not recommended.\n",
+"/Users/milindchawre/.pyenv/versions/3.12.2/envs/hugging-face/lib/python3.12/site-packages/transformers/tokenization_utils_base.py:1601: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n",
+" warnings.warn(\n"
+]
+},
+{
+"data": {
+"text/plain": [
+"[{'label': 'POSITIVE', 'score': 0.9998236298561096}]"
+]
+},
+"execution_count": 4,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
+"source": [
+"pipe = pipeline(task=\"text-classification\",device=0)\n",
+"pipe(\"This restaurant is ok\")"
+]
+},
+{
+"cell_type": "markdown",
+"id": "e88492a4-003a-4387-9590-0f3f621278ba",
+"metadata": {},
+"source": [
+"### Specific Models\n",
+"\n",
+"Perhaps you want to use a different model for different categories or text types, for example, financial news: https://huggingface.co/ProsusAI/finbert\n",
+"\n",
+"You can explore more details in the paper: https://arxiv.org/pdf/1908.10063"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 5,
+"id": "bcf0966e-892f-46ca-9321-c0b31c9862ab",
+"metadata": {},
+"outputs": [
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
+"To disable this warning, you can either:\n",
+"\t- Avoid using `tokenizers` before the fork if possible\n",
+"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
+"Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.\n"
+]
+}
+],
+"source": [
+"pipe = pipeline(model=\"ProsusAI/finbert\")"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 6,
+"id": "fdf85b86-1132-4ef7-80c8-36e729be2910",
+"metadata": {},
+"outputs": [
+{
+"data": {
+"text/plain": [
+"[{'label': 'positive', 'score': 0.9350943565368652}]"
+]
+},
+"execution_count": 6,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
+"source": [
+"pipe(\"Shares of food delivery companies surged despite the catastrophic impact of coronavirus on global markets.\")"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 7,
+"id": "6eca0d8e-1d32-4801-945a-34e9e7bbf83d",
+"metadata": {},
+"outputs": [],
+"source": [
+"tweets = ['Gonna buy AAPL, its about to surge up!',\n",
+" 'Gotta sell AAPL, its gonna plummet!']"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 8,
+"id": "bfc8d3e1-d7d4-457f-a61c-25c023f4851a",
+"metadata": {},
+"outputs": [
+{
+"data": {
+"text/plain": [
+"[{'label': 'positive', 'score': 0.523411750793457},\n",
+" {'label': 'neutral', 'score': 0.5528597831726074}]"
+]
+},
+"execution_count": 8,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
+"source": [
+"pipe(tweets)"
+]
+},
+{
+"cell_type": "markdown",
+"id": "d2e8be7b-2cb7-425d-ae25-fa0b57f67f6a",
+"metadata": {},
+"source": [
+"# Named Entity Recognition\n",
+"\n",
+"Let's explore another NLP task, such as NER - Named Entity Recognition\n",
+"\n",
+"**Note, this is a much larger model! If you run this it will download about 1.5 GB on to your computer inside of a cache folder!**"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 9,
+"id": "1486daa6-6108-4179-8fd3-682c17ad8f56",
+"metadata": {
+"editable": true,
+"slideshow": {
+"slide_type": ""
+},
+"tags": []
+},
+"outputs": [
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).\n",
+"Using a pipeline without specifying a model name and revision in production is not recommended.\n",
+"Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.\n"
+]
+}
+],
+"source": [
+"pipe = pipeline(task=\"text-classification\")"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 10,
+"id": "293aa275-9fde-4017-9345-ce7a4debf315",
+"metadata": {},
+"outputs": [
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).\n",
+"Using a pipeline without specifying a model name and revision in production is not recommended.\n",
+"Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']\n",
+"- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+"- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
+"Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.\n"
+]
+}
+],
+"source": [
+"ner_tag_pipe = pipeline('ner')"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 22,
+"id": "cbaaa69a-9dab-42a3-a1f5-1b109251c8c6",
+"metadata": {},
+"outputs": [],
+"source": [
+"result = ner_tag_pipe(\"After working at Tesla I started to study Nikola Tesla a lot more, especially at university in the USA.\")"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 23,
+"id": "88e7d85a-6811-47a5-b400-2b36eb32e87e",
+"metadata": {},
+"outputs": [],
+"source": [
+"#sentence =\"\"\"After working at Tomtom I started to study AI a lot more, especially at home in the Mumbai, Topics like RAG, hugging face and data science interest me more, Eating food like snacks and packed food with my laptop is my working setup.\"\"\"\n",
+"#result = ner_tag_pipe(sentence)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 24,
+"id": "1be4c2c4-c1ae-446a-9ce3-680934e7da9c",
+"metadata": {},
+"outputs": [
+{
+"data": {
+"text/plain": [
+"[{'entity': 'I-ORG',\n",
+" 'score': 0.9137765,\n",
+" 'index': 4,\n",
+" 'word': 'Te',\n",
+" 'start': 17,\n",
+" 'end': 19},\n",
+" {'entity': 'I-ORG',\n",
+" 'score': 0.3789888,\n",
+" 'index': 5,\n",
+" 'word': '##sla',\n",
+" 'start': 19,\n",
+" 'end': 22},\n",
+" {'entity': 'I-PER',\n",
+" 'score': 0.99693346,\n",
+" 'index': 10,\n",
+" 'word': 'Nikola',\n",
+" 'start': 42,\n",
+" 'end': 48},\n",
+" {'entity': 'I-PER',\n",
+" 'score': 0.9901416,\n",
+" 'index': 11,\n",
+" 'word': 'Te',\n",
+" 'start': 49,\n",
+" 'end': 51},\n",
+" {'entity': 'I-PER',\n",
+" 'score': 0.8931826,\n",
+" 'index': 12,\n",
+" 'word': '##sla',\n",
+" 'start': 51,\n",
+" 'end': 54},\n",
+" {'entity': 'I-LOC',\n",
+" 'score': 0.9997478,\n",
+" 'index': 22,\n",
+" 'word': 'USA',\n",
+" 'start': 99,\n",
+" 'end': 102}]"
+]
+},
+"execution_count": 24,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
+"source": [
+"result"
+]
+},
+{
+"cell_type": "markdown",
+"id": "88b446e2-8b25-4736-932b-4a7570c2570b",
+"metadata": {},
+"source": [
+"# Question Answering"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 25,
+"id": "2ab7745a-1fe3-4e5d-89cf-8399185acd9d",
+"metadata": {},
+"outputs": [
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).\n",
+"Using a pipeline without specifying a model name and revision in production is not recommended.\n",
+"Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.\n"
+]
+}
+],
+"source": [
+"qa_bot = pipeline('question-answering')"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 26,
+"id": "4380b8e5-4807-48da-bd3e-65e78c42e967",
+"metadata": {},
+"outputs": [],
+"source": [
+"text = \"\"\"\n",
+"D-Day, marked on June 6, 1944, stands as one of the most significant military operations in history, \n",
+"initiating the Allied invasion of Nazi-occupied Europe during World War II. Known as Operation Overlord, \n",
+"this massive amphibious assault involved nearly 160,000 Allied troops landing on the beaches of Normandy, \n",
+"France, across five sectors: Utah, Omaha, Gold, Juno, and Sword. Supported by over 5,000 ships and 13,000 \n",
+"aircraft, the operation was preceded by extensive aerial and naval bombardment and an airborne assault. \n",
+"The invasion set the stage for the liberation of Western Europe from Nazi control, despite the heavy \n",
+"casualties and formidable German defenses. This day not only demonstrated the logistical prowess \n",
+"and courage of the Allied forces but also marked a turning point in the war, leading to the eventual \n",
+"defeat of Nazi Germany.\n",
+"\"\"\""
+]
+},
+{
+"cell_type": "code",
+"execution_count": 38,
+"id": "c185e089-0966-44b6-b4d8-882faf504e3c",
+"metadata": {},
+"outputs": [],
+"source": [
+"question = \"What were the five beach sectors on D-Day?\"\n",
+"\n",
+"result = qa_bot(question=question,context=text)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 36,
+"id": "720fcd66-2e85-4034-a81a-f49901d2fbb7",
+"metadata": {},
+"outputs": [],
+"source": [
+"#\n",
+"#question = \"Who is sherlock holmes?\"\n",
+"#result = qa_bot(question=question,context=text)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 39,
+"id": "8d1918a0-70f3-4d6c-8a17-bac0d07a9761",
+"metadata": {},
+"outputs": [
+{
+"data": {
+"text/plain": [
+"{'score': 0.9430821537971497,\n",
+" 'start': 345,\n",
+" 'end': 379,\n",
+" 'answer': 'Utah, Omaha, Gold, Juno, and Sword'}"
+]
+},
+"execution_count": 39,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
+"source": [
+"result"
+]
+},
+{
+"cell_type": "markdown",
+"id": "f536e540-4080-4135-84e8-86fe13dc2fe6",
+"metadata": {},
+"source": [
+"## Translations\n",
+"\n",
+"Translates from one language to another.\n",
+"\n",
+"This translation pipeline can currently be loaded from pipeline() using the following task identifier: \"translation_xx_to_yy\".\n",
+"\n",
+"The models that this pipeline can use are models that have been fine-tuned on a translation task. See the up-to-date list of available models on www.huggingface.co/models. \n",
+"\n",
+"Note: You would typically call a specific model for translations: https://huggingface.co/models?pipeline_tag=translation"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 6,
+"id": "24f282e5-1f0e-44ca-9b92-22b211208274",
+"metadata": {},
+"outputs": [
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"No model was supplied, defaulted to t5-base and revision 686f1db (https://huggingface.co/t5-base).\n",
+"Using a pipeline without specifying a model name and revision in production is not recommended.\n"
+]
+},
+{
+"data": {
+"application/vnd.jupyter.widget-view+json": {
+"model_id": "453587fb51494963bf1c3ba521c405f4",
+"version_major": 2,
+"version_minor": 0
+},
+"text/plain": [
+"config.json: 0%| | 0.00/1.21k [00:00<?, ?B/s]"
+]
+},
+"metadata": {},
+"output_type": "display_data"
+},
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"C:\\Users\\Marcial\\AppData\\Roaming\\Python\\Python39\\site-packages\\huggingface_hub\\file_download.py:148: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\Marcial\\.cache\\huggingface\\hub\\models--t5-base. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n",
+"To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n",
+" warnings.warn(message)\n"
+]
+},
+{
+"data": {
+"application/vnd.jupyter.widget-view+json": {
+"model_id": "e099a45ebcfa45c48dfecbcfb3eec137",
+"version_major": 2,
+"version_minor": 0
+},
+"text/plain": [
+"pytorch_model.bin: 0%| | 0.00/892M [00:00<?, ?B/s]"
+]
+},
+"metadata": {},
+"output_type": "display_data"
+},
+{
+"data": {
+"application/vnd.jupyter.widget-view+json": {
+"model_id": "6607a675437748f586b034cbcbebd046",
+"version_major": 2,
+"version_minor": 0
+},
+"text/plain": [
+"generation_config.json: 0%| | 0.00/147 [00:00<?, ?B/s]"
+]
+},
+"metadata": {},
+"output_type": "display_data"
+},
+{
+"data": {
+"application/vnd.jupyter.widget-view+json": {
+"model_id": "eeaea7f972ce4f968a30bcf0c8cf7a30",
+"version_major": 2,
+"version_minor": 0
+},
+"text/plain": [
+"spiece.model: 0%| | 0.00/792k [00:00<?, ?B/s]"
+]
+},
+"metadata": {},
+"output_type": "display_data"
+},
+{
+"data": {
+"application/vnd.jupyter.widget-view+json": {
+"model_id": "21b2cfc97ada4a41bb3cefb2e2b95170",
+"version_major": 2,
+"version_minor": 0
+},
+"text/plain": [
+"tokenizer.json: 0%| | 0.00/1.39M [00:00<?, ?B/s]"
+]
+},
+"metadata": {},
+"output_type": "display_data"
+},
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"C:\\Users\\Marcial\\AppData\\Roaming\\Python\\Python39\\site-packages\\transformers\\models\\t5\\tokenization_t5_fast.py:155: FutureWarning: This tokenizer was incorrectly instantiated with a model max length of 512 which will be corrected in Transformers v5.\n",
+"For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.\n",
+"- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.\n",
+"- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.\n",
+"- To avoid this warning, please instantiate this tokenizer with `model_max_length` set to your preferred value.\n",
+" warnings.warn(\n"
+]
+}
+],
+"source": [
+"from transformers import pipeline\n",
+"translate = pipeline('translation_en_to_fr')"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 7,
+"id": "88c1074c-d1c2-4957-9a32-bb0275b9a3c2",
+"metadata": {},
+"outputs": [
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"C:\\Users\\Marcial\\AppData\\Roaming\\Python\\Python39\\site-packages\\transformers\\generation\\utils.py:1186: UserWarning: You have modified the pretrained model configuration to control generation. This is a deprecated strategy to control generation and will be removed soon, in a future version. Please use a generation configuration file (see https://huggingface.co/docs/transformers/main_classes/text_generation)\n",
+" warnings.warn(\n"
+]
+}
+],
+"source": [
+"result = translate(\"Hello, my name is Jose. What is your name?\")"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 8,
+"id": "f57e175b-c6e3-4645-941d-2d415295ff14",
+"metadata": {},
+"outputs": [
+{
+"data": {
+"text/plain": [
+"[{'translation_text': 'Bonjour, mon nom est Jose, quel est votre nom ?'}]"
+]
+},
+"execution_count": 8,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
+"source": [
+"result"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 9,
+"id": "619c77f5-9eda-47a0-b721-2bf4efa9cec5",
+"metadata": {},
+"outputs": [],
+"source": [
+"result = translate(\"Hello, my name is Jose.\")"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 10,
+"id": "10eea677-5ac7-477f-a67e-8f36c31e2acb",
+"metadata": {},
+"outputs": [
+{
+"data": {
+"text/plain": [
+"[{'translation_text': 'Bonjour, mon nom est Jose.'}]"
+]
+},
+"execution_count": 10,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
+"source": [
+"result"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 11,
+"id": "fb287c23-f3e2-47e9-b92c-a0714937e0cf",
+"metadata": {},
+"outputs": [],
+"source": [
+"result = translate(\"Hello, I am called Jose.\")"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 12,
+"id": "ca2a23c2-eed8-45ef-9d96-208dc2370b34",
+"metadata": {},
+"outputs": [
+{
+"data": {
+"text/plain": [
+"[{'translation_text': \"Bonjour, je m'appelle Jose.\"}]"
+]
+},
+"execution_count": 12,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
+"source": [
+"result"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "1b6b3d58-694b-4a0e-808d-84be7d5745ff",
+"metadata": {},
+"outputs": [],
+"source": []
+}
+],
+"metadata": {
+"kernelspec": {
+"display_name": "Python 3 (ipykernel)",
+"language": "python",
+"name": "python3"
+},
+"language_info": {
+"codemirror_mode": {
+"name": "ipython",
+"version": 3
+},
+"file_extension": ".py",
+"mimetype": "text/x-python",
+"name": "python",
+"nbconvert_exporter": "python",
+"pygments_lexer": "ipython3",
+"version": "3.12.2"
+}
+},
+"nbformat": 4,
+"nbformat_minor": 5
+}
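The NER output recorded above shows raw token-level tags, with "Tesla" split into the subword pieces 'Te' and '##sla'. Below is a hedged sketch (not part of the committed notebook) of how the same pipeline can merge those pieces into whole entities and pin the model explicitly, which also addresses the "No model was supplied" warnings captured in the notebook outputs:

```python
from transformers import pipeline

# Sketch only: pin the checkpoint the notebook received by default and group
# subword tokens into whole entities with aggregation_strategy="simple".
ner = pipeline(
    "ner",
    model="dbmdz/bert-large-cased-finetuned-conll03-english",
    aggregation_strategy="simple",
)

result = ner(
    "After working at Tesla I started to study Nikola Tesla a lot more, "
    "especially at university in the USA."
)
print(result)
# Expect grouped entries such as {'entity_group': 'ORG', 'word': 'Tesla', ...}
# rather than separate 'Te' / '##sla' tokens.
```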
01-Transformers/02-LLMs.ipynb
ADDED
The diff for this file is too large to render; see the raw diff.