Spaces:
Runtime error
Runtime error
MarkusStoll
committed on
Commit
·
5ee8932
1
Parent(s):
ebe1006
navigate ready
Browse files
- Dockerfile +1 -1
- README.md +1 -2
- layout.json +39 -41
- run.py +28 -7
Dockerfile
CHANGED
@@ -6,7 +6,7 @@ ENV HOME=/code
|
|
6 |
RUN apt install curl
|
7 |
RUN pip install pip -U
|
8 |
|
9 |
-
RUN pip install renumics-spotlight==1.3.
|
10 |
|
11 |
RUN pip install datasets
|
12 |
COPY prepare.py .
|
|
|
6 |
RUN apt install curl
|
7 |
RUN pip install pip -U
|
8 |
|
9 |
+
RUN pip install renumics-spotlight==1.3.0rc7 httpx
|
10 |
|
11 |
RUN pip install datasets
|
12 |
COPY prepare.py .
|
README.md
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
---
|
2 |
-
title: Cleanlab CIFAR-100 with Spotlight
|
3 |
emoji: 🧐
|
4 |
colorFrom: gray
|
5 |
colorTo: blue
|
@@ -14,7 +14,6 @@ tags:
|
|
14 |
- renumics
|
15 |
- spotlight
|
16 |
- EDA
|
17 |
-
duplicated_from: renumics/cifar10-cleanlab
|
18 |
---
|
19 |
|
20 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
---
|
2 |
+
title: Navigate Cleanlab Data Issues in CIFAR-100 with Spotlight
|
3 |
emoji: 🧐
|
4 |
colorFrom: gray
|
5 |
colorTo: blue
|
|
|
14 |
- renumics
|
15 |
- spotlight
|
16 |
- EDA
|
|
|
17 |
---
|
18 |
|
19 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
layout.json
CHANGED
@@ -3,12 +3,12 @@
|
|
3 |
"children": [
|
4 |
{
|
5 |
"kind": "split",
|
6 |
-
"weight":
|
7 |
"orientation": "vertical",
|
8 |
"children": [
|
9 |
{
|
10 |
"kind": "tab",
|
11 |
-
"weight":
|
12 |
"children": [
|
13 |
{
|
14 |
"kind": "widget",
|
@@ -17,10 +17,10 @@
|
|
17 |
"config": {
|
18 |
"tableView": "full",
|
19 |
"visibleColumns": [
|
20 |
-
"
|
21 |
-
"fine_label_str",
|
22 |
"label_score",
|
23 |
"near_duplicate_score",
|
|
|
24 |
"outlier_score"
|
25 |
],
|
26 |
"sorting": null,
|
@@ -31,7 +31,7 @@
|
|
31 |
},
|
32 |
{
|
33 |
"kind": "tab",
|
34 |
-
"weight":
|
35 |
"children": [
|
36 |
{
|
37 |
"kind": "widget",
|
@@ -43,7 +43,7 @@
|
|
43 |
},
|
44 |
{
|
45 |
"kind": "tab",
|
46 |
-
"weight":
|
47 |
"children": [
|
48 |
{
|
49 |
"kind": "widget",
|
@@ -54,50 +54,26 @@
|
|
54 |
{
|
55 |
"view": "ImageView",
|
56 |
"columns": [
|
57 |
-
"
|
58 |
],
|
59 |
-
"name": "
|
60 |
-
"key": "
|
61 |
},
|
62 |
{
|
63 |
"view": "TextLens",
|
64 |
"columns": [
|
65 |
-
"
|
66 |
],
|
67 |
"name": "view",
|
68 |
-
"key": "
|
69 |
},
|
70 |
{
|
71 |
"view": "TextLens",
|
72 |
"columns": [
|
73 |
-
"
|
74 |
],
|
75 |
"name": "view",
|
76 |
-
"key": "
|
77 |
-
},
|
78 |
-
{
|
79 |
-
"view": "ScalarView",
|
80 |
-
"columns": [
|
81 |
-
"label_score"
|
82 |
-
],
|
83 |
-
"name": "view",
|
84 |
-
"key": "4ae33ae9-919a-4b10-9216-cd7c9448f9ac"
|
85 |
-
},
|
86 |
-
{
|
87 |
-
"view": "ScalarView",
|
88 |
-
"columns": [
|
89 |
-
"outlier_score"
|
90 |
-
],
|
91 |
-
"name": "view",
|
92 |
-
"key": "13fb6430-3ffc-422c-92be-243b174b9a15"
|
93 |
-
},
|
94 |
-
{
|
95 |
-
"view": "ScalarView",
|
96 |
-
"columns": [
|
97 |
-
"near_duplicate_score"
|
98 |
-
],
|
99 |
-
"name": "view",
|
100 |
-
"key": "daf7c0b7-2185-4e50-9eb0-ffab8d1ff906"
|
101 |
}
|
102 |
],
|
103 |
"visibleColumns": 8
|
@@ -109,7 +85,7 @@
|
|
109 |
},
|
110 |
{
|
111 |
"kind": "tab",
|
112 |
-
"weight":
|
113 |
"children": [
|
114 |
{
|
115 |
"kind": "widget",
|
@@ -120,16 +96,38 @@
|
|
120 |
"embedding_reduced"
|
121 |
],
|
122 |
"reductionMethod": null,
|
123 |
-
"colorBy": "
|
124 |
"sizeBy": "is_label_issue",
|
125 |
-
"filter":
|
126 |
"umapNNeighbors": 20,
|
127 |
"umapMetric": "cosine",
|
128 |
"umapMinDist": 0.15,
|
129 |
"pcaNormalization": null,
|
130 |
-
"umapMenuLocalGlobalBalance":
|
131 |
"umapMenuIsAdvanced": false
|
132 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
133 |
}
|
134 |
]
|
135 |
}
|
|
|
3 |
"children": [
|
4 |
{
|
5 |
"kind": "split",
|
6 |
+
"weight": 44.24966799468792,
|
7 |
"orientation": "vertical",
|
8 |
"children": [
|
9 |
{
|
10 |
"kind": "tab",
|
11 |
+
"weight": 33.54784241752236,
|
12 |
"children": [
|
13 |
{
|
14 |
"kind": "widget",
|
|
|
17 |
"config": {
|
18 |
"tableView": "full",
|
19 |
"visibleColumns": [
|
20 |
+
"label",
|
|
|
21 |
"label_score",
|
22 |
"near_duplicate_score",
|
23 |
+
"pred",
|
24 |
"outlier_score"
|
25 |
],
|
26 |
"sorting": null,
|
|
|
31 |
},
|
32 |
{
|
33 |
"kind": "tab",
|
34 |
+
"weight": 23.686809949341544,
|
35 |
"children": [
|
36 |
{
|
37 |
"kind": "widget",
|
|
|
43 |
},
|
44 |
{
|
45 |
"kind": "tab",
|
46 |
+
"weight": 42.765347633136095,
|
47 |
"children": [
|
48 |
{
|
49 |
"kind": "widget",
|
|
|
54 |
{
|
55 |
"view": "ImageView",
|
56 |
"columns": [
|
57 |
+
"full_image"
|
58 |
],
|
59 |
+
"name": "full_image",
|
60 |
+
"key": "7hA9fgoBXsKTCCFVYZfhRb"
|
61 |
},
|
62 |
{
|
63 |
"view": "TextLens",
|
64 |
"columns": [
|
65 |
+
"label"
|
66 |
],
|
67 |
"name": "view",
|
68 |
+
"key": "a7fedf96-f36e-4836-9ffe-7b249c16db46"
|
69 |
},
|
70 |
{
|
71 |
"view": "TextLens",
|
72 |
"columns": [
|
73 |
+
"pred"
|
74 |
],
|
75 |
"name": "view",
|
76 |
+
"key": "527a66c2-b3d4-4be0-9879-8749ee4fd0ed"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
77 |
}
|
78 |
],
|
79 |
"visibleColumns": 8
|
|
|
85 |
},
|
86 |
{
|
87 |
"kind": "tab",
|
88 |
+
"weight": 55.75033200531208,
|
89 |
"children": [
|
90 |
{
|
91 |
"kind": "widget",
|
|
|
96 |
"embedding_reduced"
|
97 |
],
|
98 |
"reductionMethod": null,
|
99 |
+
"colorBy": "label",
|
100 |
"sizeBy": "is_label_issue",
|
101 |
+
"filter": false,
|
102 |
"umapNNeighbors": 20,
|
103 |
"umapMetric": "cosine",
|
104 |
"umapMinDist": 0.15,
|
105 |
"pcaNormalization": null,
|
106 |
+
"umapMenuLocalGlobalBalance": null,
|
107 |
"umapMenuIsAdvanced": false
|
108 |
}
|
109 |
+
},
|
110 |
+
{
|
111 |
+
"kind": "widget",
|
112 |
+
"name": "Scatter Plot",
|
113 |
+
"type": "scatterplot",
|
114 |
+
"config": {
|
115 |
+
"xAxisColumn": null,
|
116 |
+
"yAxisColumn": null,
|
117 |
+
"colorBy": null,
|
118 |
+
"sizeBy": null,
|
119 |
+
"filter": false
|
120 |
+
}
|
121 |
+
},
|
122 |
+
{
|
123 |
+
"kind": "widget",
|
124 |
+
"name": "Histogram",
|
125 |
+
"type": "histogram",
|
126 |
+
"config": {
|
127 |
+
"columnKey": null,
|
128 |
+
"stackByColumnKey": null,
|
129 |
+
"filter": false
|
130 |
+
}
|
131 |
}
|
132 |
]
|
133 |
}
|
run.py
CHANGED
@@ -17,26 +17,47 @@ if __name__ == "__main__":
|
|
17 |
df = pickle.load(file)
|
18 |
print("Dataset loaded from cache.")
|
19 |
|
|
|
20 |
label_issue_rows = df[df["is_label_issue"]].sort_values("label_score").index.tolist()
|
21 |
-
label_issue = DataIssue(
|
|
|
|
|
|
|
|
|
|
|
22 |
|
23 |
-
outlier_issue_row =
|
24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
|
26 |
-
near_duplicate_issue_row = df[df["is_near_duplicate_issue"]].sort_values("near_duplicate_score").index.tolist()
|
27 |
-
near_duplicate_issue = DataIssue(severity="medium", title="near-duplicate-issue", rows=near_duplicate_issue_row, description="Near duplicate issue found by cleanlab")
|
28 |
|
29 |
-
df = df.drop(["full_image"], axis=1)
|
30 |
while True:
|
31 |
dtypes = {
|
32 |
"image": spotlight.Image,
|
|
|
33 |
"embedding": spotlight.Embedding,
|
34 |
"embedding_reduced": spotlight.Embedding,
|
35 |
"probabilities": spotlight.Embedding,
|
36 |
}
|
37 |
|
38 |
view = spotlight.show(
|
39 |
-
df,
|
40 |
dtype=dtypes,
|
41 |
issues=[label_issue,outlier_issue,near_duplicate_issue],
|
42 |
layout="layout.json",
|
|
|
17 |
df = pickle.load(file)
|
18 |
print("Dataset loaded from cache.")
|
19 |
|
20 |
+
|
21 |
label_issue_rows = df[df["is_label_issue"]].sort_values("label_score").index.tolist()
|
22 |
+
label_issue = DataIssue(
|
23 |
+
severity="medium",
|
24 |
+
title="label-issue",
|
25 |
+
rows=label_issue_rows,
|
26 |
+
description="Label issue found by cleanlab - Review and correct if necessary",
|
27 |
+
)
|
28 |
|
29 |
+
outlier_issue_row = (
|
30 |
+
df[df["outlier_score"] < 0.6].sort_values("outlier_score").index.tolist()
|
31 |
+
)
|
32 |
+
outlier_issue = DataIssue(
|
33 |
+
severity="medium",
|
34 |
+
title="outlier-issue",
|
35 |
+
rows=outlier_issue_row,
|
36 |
+
description="Outlier score < 0.6 - Review and remove or collect more data",
|
37 |
+
)
|
38 |
+
|
39 |
+
near_duplicate_issue_row = (
|
40 |
+
df[df["is_near_duplicate_issue"]].sort_values("near_duplicate_score").index.tolist()
|
41 |
+
)
|
42 |
+
near_duplicate_issue = DataIssue(
|
43 |
+
severity="medium",
|
44 |
+
title="near-duplicate-issue",
|
45 |
+
rows=near_duplicate_issue_row,
|
46 |
+
description="Near duplicate issue found by cleanlab - Review and remove if necessary",
|
47 |
+
)
|
48 |
|
|
|
|
|
49 |
|
|
|
50 |
while True:
|
51 |
dtypes = {
|
52 |
"image": spotlight.Image,
|
53 |
+
"image_full": spotlight.Image,
|
54 |
"embedding": spotlight.Embedding,
|
55 |
"embedding_reduced": spotlight.Embedding,
|
56 |
"probabilities": spotlight.Embedding,
|
57 |
}
|
58 |
|
59 |
view = spotlight.show(
|
60 |
+
df.rename(columns={"fine_label_str": "label", "fine_label_prediction_str":"pred"}),
|
61 |
dtype=dtypes,
|
62 |
issues=[label_issue,outlier_issue,near_duplicate_issue],
|
63 |
layout="layout.json",
|