# import datetime
import json
import os
from pathlib import Path
import gradio as gr
import huggingface_hub as hfh
# from apscheduler.schedulers.background import BackgroundScheduler
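# Hub dataset repo that stores JSON snapshots of dataset IDs, one per date,
# under data/<ISO-date>.json.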
DATASET_ID = "albertvillanova/datasets-report"
DATASET_PATH = "dataset"
DATA_DIR = "data"
DATA_PATH = f"{DATASET_PATH}/{DATA_DIR}"

def pull_dataset_repo(repo_id=DATASET_ID, repo_path=DATASET_PATH):
    # Clone the dataset repo into a local directory (or reuse an existing
    # checkout) and pull the latest changes.
    token = os.environ.get("HUB_TOKEN")
    repo = hfh.Repository(
        local_dir=repo_path,
        clone_from=repo_id,
        repo_type="dataset",
        use_auth_token=token,
    )
    repo.git_pull()
    return repo

def load_dates():
    # Snapshot files are named <ISO-date>.json, so the file stems are the dates.
    return [data_path.stem for data_path in sorted(Path(DATA_PATH).iterdir())]
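
# Pull the snapshot repo, list the available snapshot dates, and fetch the
# current list of Hub datasets once at startup.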
repo = pull_dataset_repo()
dates = load_dates()
datasets = hfh.list_datasets()

def filter_datasets_by_date(date_from, date_to):
    # Datasets created between the two snapshots are those listed in the
    # "to" snapshot but not in the "from" snapshot.
    with open(f"{DATA_PATH}/{date_from}.json") as f:
        ids_from = json.load(f)
    with open(f"{DATA_PATH}/{date_to}.json") as f:
        ids_to = json.load(f)
    ids = set(ids_to) - set(ids_from)
    dss = [ds for ds in datasets if ds.id in ids]
    # Some entries have no "downloads" attribute; default it to 0 so sorting works.
    for ds in dss:
        if not hasattr(ds, "downloads"):
            ds.downloads = 0
    return sorted(dss, key=lambda item: item.downloads, reverse=True)

def filter_dataframe(date_from, date_to):
    # Build the rows shown in the Gradio Dataframe: one [id, downloads] pair per dataset.
    dss = filter_datasets_by_date(date_from, date_to)
    return [[ds.id, ds.downloads] for ds in dss]

# def update_datasets():
#     # Retrieve datasets
#     datasets = hfh.list_datasets()
#     # Save dataset IDs
#     repo = pull_dataset_repo()
#     os.makedirs(DATA_PATH, exist_ok=True)
#     today = datetime.datetime.now(datetime.timezone.utc).date().isoformat()
#     with repo.commit(f"Add {today} data file"):
#         with open(f"data/{today}.json", "w") as f:
#             json.dump([ds.id for ds in sorted(datasets, key=lambda item: item.id)], f)
#
#
# scheduler = BackgroundScheduler()
# scheduler.add_job(update_datasets, trigger="cron", hour=0, minute=1, timezone=datetime.timezone.utc)
# scheduler.start()

# UI: pick two snapshot dates and list the datasets created between them,
# sorted by downloads.
with gr.Blocks() as demo:
    with gr.Row():
        date_from = gr.Dropdown(choices=dates, label="Date from")
        date_to = gr.Dropdown(choices=dates, label="Date to")
    submit_btn = gr.Button("Submit")
    outputs = gr.Dataframe(
        headers=["Dataset", "Downloads"],
        datatype=["str", "number"],
        label="Created datasets",
    )
    submit_btn.click(fn=filter_dataframe, inputs=[date_from, date_to], outputs=outputs)

demo.launch()