navigate-data-issues

Runtime error

App Files Files Community

MarkusStoll committed on Aug 3, 2023

Commit

5ee8932

1 Parent(s): ebe1006

navigate ready

Browse files

Files changed (4) hide show

Dockerfile +1 -1
README.md +1 -2
layout.json +39 -41
run.py +28 -7

Dockerfile CHANGED Viewed

@@ -6,7 +6,7 @@ ENV HOME=/code
 RUN apt install curl
 RUN pip install pip -U
-RUN pip install renumics-spotlight==1.3.0rc6
 RUN pip install datasets
 COPY prepare.py .

 RUN apt install curl
 RUN pip install pip -U
+RUN pip install renumics-spotlight==1.3.0rc7 httpx
 RUN pip install datasets
 COPY prepare.py .

README.md CHANGED Viewed

@@ -1,5 +1,5 @@
 ---
-title: Cleanlab CIFAR-100 with Spotlight
 emoji: 🧐
 colorFrom: gray
 colorTo: blue
@@ -14,7 +14,6 @@ tags:
 - renumics
 - spotlight
 - EDA
-duplicated_from: renumics/cifar10-cleanlab
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Navigate Cleanlab Data Issues in CIFAR-100 with Spotlight
 emoji: 🧐
 colorFrom: gray
 colorTo: blue
 - renumics
 - spotlight
 - EDA
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

layout.json CHANGED Viewed

@@ -3,12 +3,12 @@
     "children": [
         {
             "kind": "split",
-            "weight": 24.37657642133775,
             "orientation": "vertical",
             "children": [
                 {
                     "kind": "tab",
-                    "weight": 23.652554002465973,
                     "children": [
                         {
                             "kind": "widget",
@@ -17,10 +17,10 @@
                             "config": {
                                 "tableView": "full",
                                 "visibleColumns": [
-                                    "fine_label_prediction_str",
-                                    "fine_label_str",
                                     "label_score",
                                     "near_duplicate_score",
                                     "outlier_score"
                                 ],
                                 "sorting": null,
@@ -31,7 +31,7 @@
                 },
                 {
                     "kind": "tab",
-                    "weight": 11.033364782611708,
                     "children": [
                         {
                             "kind": "widget",
@@ -43,7 +43,7 @@
                 },
                 {
                     "kind": "tab",
-                    "weight": 38.67424218071708,
                     "children": [
                         {
                             "kind": "widget",
@@ -54,50 +54,26 @@
                                     {
                                         "view": "ImageView",
                                         "columns": [
-                                            "image"
                                         ],
-                                        "name": "image",
-                                        "key": "iW3ihwygEHg4QZv5YzJ8ww"
                                     },
                                     {
                                         "view": "TextLens",
                                         "columns": [
-                                            "fine_label_str"
                                         ],
                                         "name": "view",
-                                        "key": "346d7554-5395-44d6-b358-a351901cb02e"
                                     },
                                     {
                                         "view": "TextLens",
                                         "columns": [
-                                            "fine_label_prediction_str"
                                         ],
                                         "name": "view",
-                                        "key": "99761cf8-350a-469d-8dbc-0df7b0db4d48"
-                                    },
-                                    {
-                                        "view": "ScalarView",
-                                        "columns": [
-                                            "label_score"
-                                        ],
-                                        "name": "view",
-                                        "key": "4ae33ae9-919a-4b10-9216-cd7c9448f9ac"
-                                    },
-                                    {
-                                        "view": "ScalarView",
-                                        "columns": [
-                                            "outlier_score"
-                                        ],
-                                        "name": "view",
-                                        "key": "13fb6430-3ffc-422c-92be-243b174b9a15"
-                                    },
-                                    {
-                                        "view": "ScalarView",
-                                        "columns": [
-                                            "near_duplicate_score"
-                                        ],
-                                        "name": "view",
-                                        "key": "daf7c0b7-2185-4e50-9eb0-ffab8d1ff906"
                                     }
                                 ],
                                 "visibleColumns": 8
@@ -109,7 +85,7 @@
         },
         {
             "kind": "tab",
-            "weight": 51.915353562320064,
             "children": [
                 {
                     "kind": "widget",
@@ -120,16 +96,38 @@
                             "embedding_reduced"
                         ],
                         "reductionMethod": null,
-                        "colorBy": "fine_label_str",
                         "sizeBy": "is_label_issue",
-                        "filter": true,
                         "umapNNeighbors": 20,
                         "umapMetric": "cosine",
                         "umapMinDist": 0.15,
                         "pcaNormalization": null,
-                        "umapMenuLocalGlobalBalance": 0.5,
                         "umapMenuIsAdvanced": false
                     }
                 }
             ]
         }

     "children": [
         {
             "kind": "split",
+            "weight": 44.24966799468792,
             "orientation": "vertical",
             "children": [
                 {
                     "kind": "tab",
+                    "weight": 33.54784241752236,
                     "children": [
                         {
                             "kind": "widget",
                             "config": {
                                 "tableView": "full",
                                 "visibleColumns": [
+                                    "label",
                                     "label_score",
                                     "near_duplicate_score",
+                                    "pred",
                                     "outlier_score"
                                 ],
                                 "sorting": null,
                 },
                 {
                     "kind": "tab",
+                    "weight": 23.686809949341544,
                     "children": [
                         {
                             "kind": "widget",
                 },
                 {
                     "kind": "tab",
+                    "weight": 42.765347633136095,
                     "children": [
                         {
                             "kind": "widget",
                                     {
                                         "view": "ImageView",
                                         "columns": [
+                                            "full_image"
                                         ],
+                                        "name": "full_image",
+                                        "key": "7hA9fgoBXsKTCCFVYZfhRb"
                                     },
                                     {
                                         "view": "TextLens",
                                         "columns": [
+                                            "label"
                                         ],
                                         "name": "view",
+                                        "key": "a7fedf96-f36e-4836-9ffe-7b249c16db46"
                                     },
                                     {
                                         "view": "TextLens",
                                         "columns": [
+                                            "pred"
                                         ],
                                         "name": "view",
+                                        "key": "527a66c2-b3d4-4be0-9879-8749ee4fd0ed"
                                     }
                                 ],
                                 "visibleColumns": 8
         },
         {
             "kind": "tab",
+            "weight": 55.75033200531208,
             "children": [
                 {
                     "kind": "widget",
                             "embedding_reduced"
                         ],
                         "reductionMethod": null,
+                        "colorBy": "label",
                         "sizeBy": "is_label_issue",
+                        "filter": false,
                         "umapNNeighbors": 20,
                         "umapMetric": "cosine",
                         "umapMinDist": 0.15,
                         "pcaNormalization": null,
+                        "umapMenuLocalGlobalBalance": null,
                         "umapMenuIsAdvanced": false
                     }
+                },
+                {
+                    "kind": "widget",
+                    "name": "Scatter Plot",
+                    "type": "scatterplot",
+                    "config": {
+                        "xAxisColumn": null,
+                        "yAxisColumn": null,
+                        "colorBy": null,
+                        "sizeBy": null,
+                        "filter": false
+                    }
+                },
+                {
+                    "kind": "widget",
+                    "name": "Histogram",
+                    "type": "histogram",
+                    "config": {
+                        "columnKey": null,
+                        "stackByColumnKey": null,
+                        "filter": false
+                    }
                 }
             ]
         }

run.py CHANGED Viewed

@@ -17,26 +17,47 @@ if __name__ == "__main__":
             df = pickle.load(file)
         print("Dataset loaded from cache.")
         label_issue_rows = df[df["is_label_issue"]].sort_values("label_score").index.tolist()
-        label_issue = DataIssue(severity="medium", title="label-issue", rows=label_issue_rows, description="Label issue found by cleanlab")
-        outlier_issue_row = df[df["outlier_score"]<0.6].sort_values("outlier_score").index.tolist()
-        outlier_issue = DataIssue(severity="medium", title="outlier-issue", rows=outlier_issue_row, description="Outlier score < 0.6")
-        near_duplicate_issue_row = df[df["is_near_duplicate_issue"]].sort_values("near_duplicate_score").index.tolist()
-        near_duplicate_issue = DataIssue(severity="medium", title="near-duplicate-issue", rows=near_duplicate_issue_row, description="Near duplicate issue found by cleanlab")
-        df = df.drop(["full_image"], axis=1)
         while True:
             dtypes = {
                 "image": spotlight.Image,
                 "embedding": spotlight.Embedding,
                 "embedding_reduced": spotlight.Embedding,
                 "probabilities": spotlight.Embedding,
             }
             view = spotlight.show(
-                df,
                 dtype=dtypes,
                 issues=[label_issue,outlier_issue,near_duplicate_issue],
                 layout="layout.json",

             df = pickle.load(file)
         print("Dataset loaded from cache.")
         label_issue_rows = df[df["is_label_issue"]].sort_values("label_score").index.tolist()
+        label_issue = DataIssue(
+            severity="medium",
+            title="label-issue",
+            rows=label_issue_rows,
+            description="Label issue found by cleanlab - Review and correct if necessary",
+        )
+        outlier_issue_row = (
+            df[df["outlier_score"] < 0.6].sort_values("outlier_score").index.tolist()
+        )
+        outlier_issue = DataIssue(
+            severity="medium",
+            title="outlier-issue",
+            rows=outlier_issue_row,
+            description="Outlier score < 0.6 - Review and remove or collect more data",
+        )
+        near_duplicate_issue_row = (
+            df[df["is_near_duplicate_issue"]].sort_values("near_duplicate_score").index.tolist()
+        )
+        near_duplicate_issue = DataIssue(
+            severity="medium",
+            title="near-duplicate-issue",
+            rows=near_duplicate_issue_row,
+            description="Near duplicate issue found by cleanlab - Review and remove if necessary",
+        )
         while True:
             dtypes = {
                 "image": spotlight.Image,
+                "image_full": spotlight.Image,
                 "embedding": spotlight.Embedding,
                 "embedding_reduced": spotlight.Embedding,
                 "probabilities": spotlight.Embedding,
             }
             view = spotlight.show(
+                df.rename(columns={"fine_label_str": "label", "fine_label_prediction_str":"pred"}),
                 dtype=dtypes,
                 issues=[label_issue,outlier_issue,near_duplicate_issue],
                 layout="layout.json",