rhoitjadhav committed
Commit b2577d0 · 1 Parent(s): 1bfba87

update dockerfile

Files changed (3)
  1. Dockerfile +20 -23
  2. load_data.py +100 -22
  3. start.sh +37 -7
Dockerfile CHANGED
@@ -1,39 +1,36 @@
- FROM ubuntu:20.04
+ FROM python:3.9-slim

  # Exposing ports
  EXPOSE 6900

- # Working Directory
- WORKDIR /app
-
  # Environment variables
- ENV ARGILLA_LOCAL_AUTH_USERS_DB_FILE=/app/users.yml
+ ENV ARGILLA_LOCAL_AUTH_USERS_DB_FILE=/packages/users.yml
  ENV UVICORN_PORT=6900

- # Install Python
+ # Copying argilla distribution files
+ COPY *.whl /packages/
+
+ # Copy users db file along with execution script
+ COPY start.sh /
+ COPY load_data.py /
+
+ # Install packages
  RUN apt update
- RUN apt -y install curl python3.9 python3.9-dev python3.9-distutils gcc gnupg apache2-utils sudo openssl
- RUN curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
- RUN python3.9 get-pip.py
+ RUN apt -y install python3.9-dev gcc gnupg apache2-utils systemctl curl sudo vim
+
+ # Create new user for starting elasticsearch
+ RUN useradd -ms /bin/bash user -p "$(openssl passwd -1 ubuntu)"
+ RUN echo 'user ALL=(ALL) ALL' >> /etc/sudoers

  # Install argilla
- RUN pip install argilla[server]
+ RUN chmod +x /start.sh \
+  && for wheel in /packages/*.whl; do pip install "$wheel"[server]; done

  # Install Elasticsearch
  RUN curl -fsSL https://artifacts.elastic.co/GPG-KEY-elasticsearch | apt-key add -
- RUN echo "deb https://artifacts.elastic.co/packages/7.x/apt stable main" | tee -a /etc/apt/sources.list.d/elastic-7.x.list
+ RUN echo "deb https://artifacts.elastic.co/packages/8.x/apt stable main" | tee -a /etc/apt/sources.list.d/elastic-8.x.list
  RUN apt update
- RUN apt -y install elasticsearch
-
- # Copy users db file along with execution script
- COPY users.yml /app
- COPY start.sh /app
- COPY load_data.py /app
- RUN chmod +x /app/start.sh
-
- RUN useradd -ms /bin/bash user -p "$(openssl passwd -1 ubuntu)"
- RUN echo 'user ALL=(ALL) ALL' >> /etc/sudoers
+ RUN apt -y install elasticsearch=8.5.3

  # Executing argilla along with elasticsearch
- ENTRYPOINT "/app/start.sh"
- #CMD ["sudo", "/bin/bash", "-c", "/etc/init.d/elasticsearch start; sleep 15; uvicorn argilla:app --host '0.0.0.0'"]
+ CMD /bin/bash /start.sh
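Note on the install step: instead of pulling `argilla[server]` from PyPI, the image now installs whatever wheels were copied into `/packages`, resolving each wheel's `server` extra. A minimal Python sketch of the same loop (the path and the `[server]` extra come from the Dockerfile; everything else is illustrative):

import glob
import subprocess
import sys

# Install every wheel in /packages, plus the dependencies each wheel
# declares under its "server" extra -- the same thing the Dockerfile's
# for-loop does with plain pip.
for wheel in glob.glob("/packages/*.whl"):
    subprocess.run(
        [sys.executable, "-m", "pip", "install", f"{wheel}[server]"],
        check=True,  # abort the build if any wheel fails to install
    )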
 
load_data.py CHANGED
@@ -1,13 +1,81 @@
- import time
  import requests
+ import time
+ import pandas as pd
+ import argilla as rg
+ from datasets import load_dataset
+ from argilla.labeling.text_classification import Rule, add_rules


- def load_data():
-     # install datasets library with pip install datasets
-     import argilla as rg
-     from datasets import load_dataset
+ def load_datasets():
+     # This is the code that you want to execute when the endpoint is available
+     print("Argilla is available! Loading datasets")

-     rg.init(api_key="admin.apikey")
+     rg.init(api_key="UPLOAD_API_KEY", workspace="huggingface")
+
+     # load dataset from json
+     my_dataframe = pd.read_json(
+         "https://raw.githubusercontent.com/recognai/datasets/main/sst-sentimentclassification.json")
+
+     # convert pandas dataframe to DatasetForTextClassification
+     dataset_rg = rg.DatasetForTextClassification.from_pandas(my_dataframe)
+
+     # Define labeling schema to avoid UI user modification
+     settings = rg.TextClassificationSettings(label_schema=["POSITIVE", "NEGATIVE"])
+     rg.configure_dataset(name="sst-sentiment-explainability", settings=settings)
+
+     # log the dataset
+     rg.log(
+         dataset_rg,
+         name="sst-sentiment-explainability",
+         tags={
+             "description": "The sst2 sentiment dataset with predictions from a pretrained pipeline and explanations from Transformers Interpret."
+         }
+     )
+
+     dataset = load_dataset("argilla/news-summary", split="train").select(range(100))
+     dataset_rg = rg.read_datasets(dataset, task="Text2Text")
+
+     # log the dataset
+     rg.log(
+         dataset_rg,
+         name="news-text-summarization",
+         tags={
+             "description": "A text summarization dataset with news pieces and their predicted summaries."
+         }
+     )
+
+     # Read dataset from Hub
+     dataset_rg = rg.read_datasets(
+         load_dataset("argilla/agnews_weak_labeling", split="train"),
+         task="TextClassification",
+     )
+
+     # Define labeling schema to avoid UI user modification
+     settings = rg.TextClassificationSettings(label_schema=["World", "Sports", "Sci/Tech", "Business"])
+     rg.configure_dataset(name="news-programmatic-labeling", settings=settings)
+
+     # log the dataset
+     rg.log(
+         dataset_rg,
+         name="news-programmatic-labeling",
+         tags={
+             "description": "The AG News with programmatic labeling rules (see weak labeling mode in the UI)."
+         }
+     )
+
+     # define queries and patterns for each category (using ES DSL)
+     queries = [
+         (["money", "financ*", "dollar*"], "Business"),
+         (["war", "gov*", "minister*", "conflict"], "World"),
+         (["*ball", "sport*", "game", "play*"], "Sports"),
+         (["sci*", "techno*", "computer*", "software", "web"], "Sci/Tech"),
+     ]
+
+     # define rules
+     rules = [Rule(query=term, label=label) for terms, label in queries for term in terms]
+
+     # add rules to the dataset
+     add_rules(dataset="news-programmatic-labeling", rules=rules)

      # load dataset from the hub
      dataset = load_dataset("argilla/gutenberg_spacy-ner", split="train")
@@ -15,21 +83,31 @@ def load_data():
      # read in dataset, assuming its a dataset for token classification
      dataset_rg = rg.read_datasets(dataset, task="TokenClassification")

+     # Define labeling schema to avoid UI user modification
+     labels = ["CARDINAL", "DATE", "EVENT", "FAC", "GPE", "LANGUAGE", "LAW", "LOC", "MONEY", "NORP", "ORDINAL", "ORG",
+               "PERCENT", "PERSON", "PRODUCT", "QUANTITY", "TIME", "WORK_OF_ART"]
+     settings = rg.TokenClassificationSettings(label_schema=labels)
+     rg.configure_dataset(name="gutenberg_spacy-ner-monitoring", settings=settings)
+
      # log the dataset
-     rg.log(dataset_rg, "gutenberg_spacy-ner")
-
-
- if __name__ == '__main__':
-     while True:
-         try:
-             response = requests.get("http://localhost:6900/")
-             if response.status_code == 200:
-                 load_data()
-                 print("Data Loaded!")
-                 break
-             else:
-                 time.sleep(5)
-
-         except Exception as e:
-             print(e)
+     rg.log(
+         dataset_rg,
+         "gutenberg_spacy-ner-monitoring",
+         tags={
+             "description": "A dataset containing text from books with predictions from two spaCy NER pre-trained models."
+         }
+     )
+
+
+ while True:
+     try:
+         response = requests.get("http://0.0.0.0:6900/")
+         if response.status_code == 200:
+             load_datasets()
+             break
+         else:
              time.sleep(10)
+     except Exception as e:
+         print(e)
+         time.sleep(10)
+         pass
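The script now polls the Argilla endpoint at module level and loads all four datasets once it answers 200. One caveat of the loop as written is that it retries forever; a compact variant with an overall deadline (a sketch: the URL and `load_datasets` come from the script above, while the timeout values are assumptions):

import time
import requests

def wait_for_argilla(url="http://0.0.0.0:6900/", timeout=300, interval=10):
    # Poll until Argilla answers 200 or the deadline passes; connection
    # errors just mean the server is still starting, so keep retrying.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            if requests.get(url).status_code == 200:
                return True
        except requests.exceptions.RequestException as e:
            print(e)
        time.sleep(interval)
    return False

if wait_for_argilla():
    load_datasets()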
start.sh CHANGED
@@ -2,16 +2,46 @@

  set -e

- echo "ubuntu" | sudo -S su user
+ # Changing user
+ sudo -S su user

- # Start elasticsearch
- sudo /etc/init.d/elasticsearch start
- echo "Waiting for elasticsearch to start"
- sleep 15
+ # Generate hashed passwords
+ admin_password=$(htpasswd -nbB "" "$ADMIN_PASSWORD" | cut -d ":" -f 2 | tr -d "\n")
+ argilla_password=$(htpasswd -nbB "" "$ARGILLA_PASSWORD" | cut -d ":" -f 2 | tr -d "\n")
+
+ # Create users.yml file
+ cat >/packages/users.yml <<EOF
+ - username: "admin"
+   api_key: $ADMIN_API_KEY
+   full_name: Hugging Face
+   hashed_password: $admin_password
+   workspaces: []
+
+ - username: "argilla"
+   api_key: $ARGILLA_API_KEY
+   full_name: Hugging Face
+   hashed_password: $argilla_password
+   workspaces: ["admin"]
+ EOF
+
+ # Disable security in elasticsearch configuration
+ sudo sed -i "s/xpack.security.enabled: true/xpack.security.enabled: false/g" /etc/elasticsearch/elasticsearch.yml
+ sudo sed -i "s/cluster.initial_master_nodes/#cluster.initial_master_nodes/g" /etc/elasticsearch/elasticsearch.yml
+ echo "cluster.routing.allocation.disk.threshold_enabled: false" | sudo tee -a /etc/elasticsearch/elasticsearch.yml
+
+ # Create elasticsearch directory and change ownership
+ sudo mkdir -p /var/run/elasticsearch
+ sudo chown -R elasticsearch:elasticsearch /var/run/elasticsearch
+
+ # Starting elasticsearch
+ sudo systemctl daemon-reload
+ sudo systemctl enable elasticsearch
+ sudo systemctl start elasticsearch

  # Load data
- pip3 install datasets
- python3.9 /app/load_data.py &
+ python /app/load_data.py &

  # Start argilla
  uvicorn argilla:app --host "0.0.0.0"
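For reference, `htpasswd -nbB "" "$PASSWORD"` prints `:<bcrypt-hash>` on stdout, and the `cut`/`tr` pipeline keeps only the hash for `users.yml`. The same extraction sketched in Python (assumes `htpasswd` from apache2-utils is on PATH, as the Dockerfile installs; the sample password is illustrative):

import subprocess

def bcrypt_hash(password: str) -> str:
    # htpasswd -nbB "" <password> emits ":<hash>\n"; splitting on the
    # first ":" mirrors `cut -d ":" -f 2`, and strip() mirrors `tr -d "\n"`.
    out = subprocess.run(
        ["htpasswd", "-nbB", "", password],
        capture_output=True, text=True, check=True,
    ).stdout
    return out.split(":", 1)[1].strip()

print(bcrypt_hash("example-password"))  # e.g. $2y$05$...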