microsoft/wham
katja-hofmann committed
Commit f1c8ee5 · 1 Parent(s): e199e7c

Initial commit

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +2 -0
  2. CODE_OF_CONDUCT.md +10 -0
  3. CONTRIBUTING.md +14 -0
  4. LICENSE.md +96 -0
  5. SECURITY.md +37 -0
  6. assets/Demonstrator/Fig_01.png +0 -0
  7. assets/Demonstrator/Fig_02.png +0 -0
  8. assets/Demonstrator/Fig_03.png +0 -0
  9. assets/Demonstrator/Fig_04.png +0 -0
  10. assets/Demonstrator/Fig_05.png +0 -0
  11. assets/Demonstrator/Fig_06.png +0 -0
  12. assets/Demonstrator/Fig_07.png +0 -0
  13. assets/Demonstrator/Fig_08.png +0 -0
  14. assets/Demonstrator/Fig_09.png +0 -0
  15. assets/Demonstrator/Fig_10.png +0 -0
  16. assets/Demonstrator/Fig_11.png +0 -0
  17. assets/Demonstrator/Fig_12.png +0 -0
  18. assets/Demonstrator/Fig_13.png +0 -0
  19. assets/Demonstrator/Fig_14.png +0 -0
  20. assets/Demonstrator/Fig_15.png +0 -0
  21. assets/Readme/model_capabilities.gif +3 -0
  22. assets/Readme/wham_gen_1.gif +3 -0
  23. assets/Readme/wham_gen_2.gif +3 -0
  24. assets/Readme/wham_gen_3.gif +3 -0
  25. assets/Readme/wham_gen_4.gif +3 -0
  26. assets/Readme/wham_gen_5.gif +3 -0
  27. assets/Readme/wham_gen_6.gif +3 -0
  28. assets/Readme/wham_gen_7.gif +3 -0
  29. assets/Readme/wham_gen_8.gif +3 -0
  30. assets/Readme/wham_gen_9.gif +3 -0
  31. configs/metadata_custom_tag.config +5 -0
  32. models/WHAM_1.6B_v1.ckpt +3 -0
  33. models/WHAM_200M.ckpt +3 -0
  34. requirements.txt +48 -0
  35. run_dreaming.py +264 -0
  36. run_server.py +519 -0
  37. setup_local.sh +21 -0
  38. wham/models/nn/model_blocks.py +49 -0
  39. wham/models/nn/nanoGPT.py +665 -0
  40. wham/models/pl/__init__.py +0 -0
  41. wham/models/pl/pl_base_model.py +5 -0
  42. wham/models/vqgan/taming/LICENSE +24 -0
  43. wham/models/vqgan/taming/model.py +696 -0
  44. wham/models/vqgan/taming/quantize.py +146 -0
  45. wham/models/vqgan/taming_vq_model.py +264 -0
  46. wham/models/vqgan/vqgan.py +236 -0
  47. wham/models/vqgan/vqgan_models.py +311 -0
  48. wham/models/vqvae/vqvae_utils.py +154 -0
  49. wham/models/wham_base/__init__.py +0 -0
  50. wham/models/wham_base/encode_predict_decode_base.py +256 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ fonts/arial.ttf filter=lfs diff=lfs merge=lfs -text
37
+ *.gif filter=lfs diff=lfs merge=lfs -text
CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,10 @@
1
+ # Microsoft Open Source Code of Conduct
2
+
3
+ This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
4
+
5
+ Resources:
6
+
7
+ - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
8
+ - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
9
+ - Contact [[email protected]](mailto:[email protected]) with questions or concerns
10
+ - Employees can reach out at [aka.ms/opensource/moderation-support](https://aka.ms/opensource/moderation-support)
CONTRIBUTING.md ADDED
@@ -0,0 +1,14 @@
1
+ # Contributing
2
+
3
+ This project welcomes contributions and suggestions. Most contributions require you to
4
+ agree to a Contributor License Agreement (CLA) declaring that you have the right to,
5
+ and actually do, grant us the rights to use your contribution. For details, visit
6
+ https://cla.microsoft.com.
7
+
8
+ When you submit a pull request, a CLA-bot will automatically determine whether you need
9
+ to provide a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the
10
+ instructions provided by the bot. You will only need to do this once across all repositories using our CLA.
11
+
12
+ This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
13
+ For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
14
+ or contact [[email protected]](mailto:[email protected]) with any additional questions or comments.
LICENSE.md ADDED
@@ -0,0 +1,96 @@
1
+ # MICROSOFT RESEARCH LICENSE TERMS
2
+
3
+ **IF YOU LIVE IN THE UNITED STATES, PLEASE READ THE “BINDING ARBITRATION AND CLASS ACTION WAIVER” SECTION BELOW. IT AFFECTS HOW DISPUTES ARE RESOLVED.**
4
+
5
+ These license terms are an agreement between you and Microsoft Corporation (or one of its affiliates). They apply to the source code, object code, machine learning models, or data (collectively “Materials”) that accompany this license. IF YOU COMPLY WITH THESE LICENSE TERMS, YOU HAVE THE RIGHTS BELOW. BY USING THE MATERIALS, YOU ACCEPT THESE TERMS.
6
+
7
+ ## 1) INSTALLATION AND USE RIGHTS TO THE MATERIALS.
8
+
9
+ Subject to the terms of this agreement, you have the below rights, if applicable, to use the Materials solely for non-commercial, non-revenue generating, research purposes:
10
+
11
+ a) **Source Code.** If source code is included, you may use and modify the source code, but you may not distribute the source code.
12
+
13
+ b) **Object Code.** If object code is included, you may use the object code, but you may not distribute the object code.
14
+
15
+ c) **Models.** If machine learning model(s) are included, you may use the model(s), but you may not distribute the models.
16
+
17
+ d) **Data.** If data is included, you may use the data, but your use must be consistent with the consent under which the data was provided and/or gathered and you may not modify or distribute the data.
18
+
19
+ ## 2) SCOPE OF LICENSE.
20
+
21
+ The Materials are licensed, not sold. Microsoft reserves all other rights. Unless applicable law gives you more rights despite this limitation, you will not (and have no right to):
22
+
23
+ a) Work around any technical limitations in the Materials that only allow you to use it in certain ways;
24
+
25
+ b) Reverse engineer, decompile or disassemble the Materials;
26
+
27
+ c) Remove, minimize, block, or modify any notices of Microsoft or its suppliers in the Materials;
28
+
29
+ d) Use the Materials in any way that is against the law or to create or propagate malware; or
30
+
31
+ e) Share, publish, distribute or lend the Materials, provide the Materials as a stand-alone hosted solution for others to use, or transfer the Materials or this agreement to any third party.
32
+
33
+ ## 3) PERSONAL DATA.
34
+
35
+ If the data (set forth in Section 1(d) above) includes or is found to include any data that enables any ability to identify an individual ("Personal Data"), you will not use such Personal Data for any purpose other than was authorized and consented to by the data subject/research participant. You will not use Personal Data to contact any person. You will keep Personal Data in strict confidence. You will not share any Personal Data that is collected or in your possession with any third party for any reason and as required under the original consent agreement. Further, you will destroy the Personal Data and any backup or copies, **immediately upon the completion of your research.**
36
+
37
+ ## 4) LICENSE TO MICROSOFT.
38
+
39
+ Notwithstanding the limitations in Section 1, you may distribute your modifications back to Microsoft, and if you do provide Microsoft with modifications of the Materials, you hereby grant Microsoft, without any restrictions or limitations, a non-exclusive, perpetual, irrevocable, royalty-free, assignable and sub-licensable license, to reproduce, publicly perform or display, install, use, modify, post, distribute, make and have made, sell and transfer such modifications and derivatives for any purpose.
40
+
41
+ ## 5) PUBLICATION.
42
+
43
+ You may publish (or present papers or articles) on your results from using the Materials provided that no material or substantial portion of the Materials is included in any such publication or presentation.
44
+
45
+ ## 6) FEEDBACK.
46
+
47
+ Any feedback about the Materials provided by you to us is voluntarily given, and Microsoft shall be free to use the feedback as it sees fit without obligation or restriction of any kind, even if the feedback is designated by you as confidential. Such feedback shall be considered a contribution and licensed to Microsoft under the terms of Section 4 above.
48
+
49
+ ## 7) COMPLIANCE WITH TRADE LAWS.
50
+
51
+ You acknowledge that the Materials may be subject to applicable trade laws in one or more countries. You will comply with all relevant laws and regulations applicable to the import or export of the Materials, including but not limited to, trade laws such as the U.S. Export Administration Regulations or other end-user, end use, and destination restrictions by the U.S. and other governments, as well as sanctions regulations administered by the U.S. Office of Foreign Assets Control. Microsoft may suspend or terminate the agreement immediately to the extent that Microsoft reasonably concludes that continued performance would violate trade laws or put it at risk of becoming subject to sanctions or penalties under trade laws. For additional information, see www.microsoft.com/exporting.
52
+
53
+ ## 8) SUPPORT SERVICES.
54
+
55
+ Microsoft is not obligated under this agreement to provide any support services for the Materials. Any support provided is “as is”, “with all faults”, and without warranty of any kind.
56
+
57
+ ## 9) BINDING ARBITRATION AND CLASS ACTION WAIVER.
58
+
59
+ **This Section applies if you live in (or, if a business, your principal place of business is in) the United States.** If you and Microsoft have a dispute, you and Microsoft agree to try for 60 days to resolve it informally. If you and Microsoft can’t, you and Microsoft agree to **binding individual arbitration before the American Arbitration Association** under the Federal Arbitration Act ("FAA"), and not to **sue in court in front of a judge or jury.** Instead, a neutral arbitrator will decide. **Class action lawsuits, class-wide arbitrations, private attorney-general actions,** and any other proceeding where someone acts in a representative capacity **are not allowed;** nor is combining individual proceedings without the consent of all parties. The complete Arbitration Agreement contains more terms and is at aka.ms/arb-agreement-1. You and Microsoft agree to these terms.
60
+
61
+ ## 10) ENTIRE AGREEMENT.
62
+
63
+ This agreement, and any other terms Microsoft may provide for supplements, updates, or third-party applications, is the entire agreement for the Materials.
64
+
65
+ ## 11) APPLICABLE LAW AND PLACE TO RESOLVE DISPUTES.
66
+
67
+ If you acquired the Materials in the United States or Canada, the laws of the state or province where you live (or, if a business, where your principal place of business is located) govern the interpretation of this agreement, claims for its breach, and all other claims (including consumer protection, unfair competition, and tort claims), regardless of conflict of laws principles, except that the FAA governs everything related to arbitration. If you acquired the Materials in any other country, its laws apply, except that the FAA governs everything related to arbitration. If U.S. federal jurisdiction exists, you and Microsoft consent to exclusive jurisdiction and venue in the federal court in King County, Washington for all disputes heard in court (excluding arbitration). If not, you and Microsoft consent to exclusive jurisdiction and venue in the Superior Court of King County, Washington for all disputes heard in court (excluding arbitration).
68
+
69
+ ## 12) CONSUMER RIGHTS; REGIONAL VARIATIONS.
70
+
71
+ This agreement describes certain legal rights. You may have other rights, including consumer rights, under the laws of your state, province, or country. Separate and apart from your relationship with Microsoft, you may also have rights with respect to the party from which you acquired the Materials. This agreement does not change those other rights if the laws of your state, province, or country do not permit it to do so. For example, if you acquired the Materials in one of the below regions, or mandatory country law applies, then the following provisions apply to you:
72
+
73
+ a) **Australia.** You have statutory guarantees under the Australian Consumer Law and nothing in this agreement is intended to affect those rights.
74
+
75
+ b) **Canada.** If you acquired this software in Canada, you may stop receiving updates by turning off the automatic update feature, disconnecting your device from the Internet (if and when you re-connect to the Internet, however, the Materials will resume checking for and installing updates), or uninstalling the Materials. The product documentation, if any, may also specify how to turn off updates for your specific device or software.
76
+
77
+ c) **Germany and Austria.**
78
+ i. **Warranty.** The properly licensed software will perform substantially as described in any Microsoft materials that accompany the Materials. However, Microsoft gives no contractual guarantee in relation to the licensed software.
79
+ ii. **Limitation of Liability.** In case of intentional conduct, gross negligence, claims based on the Product Liability Act, as well as, in case of death or personal or physical injury, Microsoft is liable according to the statutory law.
80
+
81
+ Subject to the foregoing clause (ii), Microsoft will only be liable for slight negligence if Microsoft is in breach of such material contractual obligations, the fulfillment of which facilitate the due performance of this agreement, the breach of which would endanger the purpose of this agreement and the compliance with which a party may constantly trust in (so-called "cardinal obligations"). In other cases of slight negligence, Microsoft will not be liable for slight negligence.
82
+
83
+ ## 13) DISCLAIMER OF WARRANTY.
84
+
85
+ THE MATERIALS ARE LICENSED "AS IS." YOU BEAR THE RISK OF USING THEM. MICROSOFT GIVES NO EXPRESS WARRANTIES, GUARANTEES, OR CONDITIONS. TO THE EXTENT PERMITTED UNDER APPLICABLE LAWS, MICROSOFT EXCLUDES ALL IMPLIED WARRANTIES, INCLUDING MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT.
86
+
87
+ ## 14) LIMITATION ON AND EXCLUSION OF DAMAGES.
88
+
89
+ IF YOU HAVE ANY BASIS FOR RECOVERING DAMAGES DESPITE THE PRECEDING DISCLAIMER OF WARRANTY, YOU CAN RECOVER FROM MICROSOFT AND ITS SUPPLIERS ONLY DIRECT DAMAGES UP TO U.S. $5.00. YOU CANNOT RECOVER ANY OTHER DAMAGES, INCLUDING CONSEQUENTIAL, LOST PROFITS, SPECIAL, INDIRECT OR INCIDENTAL DAMAGES.
90
+
91
+ This limitation applies to:
92
+ - (a) anything related to the Materials, services, content (including code) on third party Internet sites, or third party applications; and
93
+ - (b) claims for breach of contract, warranty, guarantee, or condition; strict liability, negligence, or other tort; or any other claim; in each case to the extent permitted by applicable law.
94
+
95
+ It also applies even if Microsoft knew or should have known about the possibility of the damages. The above limitation or exclusion may not apply to you because your state, province, or country may not allow the exclusion or limitation of incidental, consequential, or other damages.
96
+
SECURITY.md ADDED
@@ -0,0 +1,37 @@
1
+ ## Security
2
+
3
+ Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet) and [Xamarin](https://github.com/xamarin).
4
+
5
+ If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/security.md/definition), please report it to us as described below.
6
+
7
+ ## Reporting Security Issues
8
+
9
+ **Please do not report security vulnerabilities through public GitHub issues.**
10
+
11
+ Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/security.md/msrc/create-report).
12
+
13
+ If you prefer to submit without logging in, send email to [[email protected]](mailto:[email protected]). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/security.md/msrc/pgp).
14
+
15
+ You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc).
16
+
17
+ Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
18
+
19
+ * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
20
+ * Full paths of source file(s) related to the manifestation of the issue
21
+ * The location of the affected source code (tag/branch/commit or direct URL)
22
+ * Any special configuration required to reproduce the issue
23
+ * Step-by-step instructions to reproduce the issue
24
+ * Proof-of-concept or exploit code (if possible)
25
+ * Impact of the issue, including how an attacker might exploit the issue
26
+
27
+ This information will help us triage your report more quickly.
28
+
29
+ If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/security.md/msrc/bounty) page for more details about our active programs.
30
+
31
+ ## Preferred Languages
32
+
33
+ We prefer all communications to be in English.
34
+
35
+ ## Policy
36
+
37
+ Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/security.md/cvd).
assets/Demonstrator/Fig_01.png ADDED
assets/Demonstrator/Fig_02.png ADDED
assets/Demonstrator/Fig_03.png ADDED
assets/Demonstrator/Fig_04.png ADDED
assets/Demonstrator/Fig_05.png ADDED
assets/Demonstrator/Fig_06.png ADDED
assets/Demonstrator/Fig_07.png ADDED
assets/Demonstrator/Fig_08.png ADDED
assets/Demonstrator/Fig_09.png ADDED
assets/Demonstrator/Fig_10.png ADDED
assets/Demonstrator/Fig_11.png ADDED
assets/Demonstrator/Fig_12.png ADDED
assets/Demonstrator/Fig_13.png ADDED
assets/Demonstrator/Fig_14.png ADDED
assets/Demonstrator/Fig_15.png ADDED
assets/Readme/model_capabilities.gif ADDED

Git LFS Details

  • SHA256: 87cf1460b2779a1c85b70e2229a7e1e256c501a5e3db26ea74e445b9dc75e965
  • Pointer size: 132 Bytes
  • Size of remote file: 8.63 MB
assets/Readme/wham_gen_1.gif ADDED

Git LFS Details

  • SHA256: 96558d0ad8084eafaf60ee360f13fe8decfbc5ac737b0c2788c01310e81750d1
  • Pointer size: 132 Bytes
  • Size of remote file: 4.42 MB
assets/Readme/wham_gen_2.gif ADDED

Git LFS Details

  • SHA256: 1296bb4ccdac5c7d3a1e7e9adfc48a6ec255933ff252a31d4e45cd117a28aee7
  • Pointer size: 132 Bytes
  • Size of remote file: 4.15 MB
assets/Readme/wham_gen_3.gif ADDED

Git LFS Details

  • SHA256: cb8ea8b3d6c8ec737a9b03f4cd93aeb36ddddc33695849b9b83543a8c2242b6f
  • Pointer size: 132 Bytes
  • Size of remote file: 4.27 MB
assets/Readme/wham_gen_4.gif ADDED

Git LFS Details

  • SHA256: 45e895599dddae5e6d2eb31f66957726fb82662f41b149f4de206466083f5a42
  • Pointer size: 132 Bytes
  • Size of remote file: 4.3 MB
assets/Readme/wham_gen_5.gif ADDED

Git LFS Details

  • SHA256: e7e7675c737bf5cbdfb54dfcc568eeda4c4212dbe5726741205610ab29cfcabb
  • Pointer size: 132 Bytes
  • Size of remote file: 4.24 MB
assets/Readme/wham_gen_6.gif ADDED

Git LFS Details

  • SHA256: e536b1f88a92de4e116a6acd022987778f63ed5a841517758c14a0d7f2a3c2bd
  • Pointer size: 132 Bytes
  • Size of remote file: 4.09 MB
assets/Readme/wham_gen_7.gif ADDED

Git LFS Details

  • SHA256: eb7e6c63eb8c46fc8c824d93406550082b6532ea9473cd021bae72a7d6cbe7db
  • Pointer size: 132 Bytes
  • Size of remote file: 4.13 MB
assets/Readme/wham_gen_8.gif ADDED

Git LFS Details

  • SHA256: 366f3f92310f3cfa55c9f4da719b01c8399c42f7d7bb860c5f7153568e4991d5
  • Pointer size: 132 Bytes
  • Size of remote file: 3.98 MB
assets/Readme/wham_gen_9.gif ADDED

Git LFS Details

  • SHA256: 931713a1d9a9dbdef7b4a1821ef78d490282bf8475e65b39948f8b5f42dc9982
  • Pointer size: 132 Bytes
  • Size of remote file: 4.53 MB
configs/metadata_custom_tag.config ADDED
@@ -0,0 +1,5 @@
1
+ %Image::ExifTool::UserDefined = (
2
+ 'Image::ExifTool::XMP::xmp' => {
3
+ 'ProgramName' => { Name => 'ProgramName', Writable => 'string' }
4
+ }
5
+ );
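The config above defines a user-defined XMP tag, ProgramName, that ExifTool can write. A minimal sketch of how it might be used to stamp that tag on an output file, in the same way run_dreaming.py drives exiftool (assumes exiftool is installed, as in setup_local.sh; the path and tag value below are hypothetical):

import subprocess

def tag_program_name(file_path, program_name, config="configs/metadata_custom_tag.config"):
    # Load the user-defined ProgramName tag and write it in place (no *_original backup file).
    subprocess.run(
        ["exiftool", "-config", config, f"-ProgramName={program_name}", "-overwrite_original", file_path],
        check=True,
    )

# tag_program_name("dreaming_output/clip_0001.mp4", "WHAM")  # hypothetical path and value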
models/WHAM_1.6B_v1.ckpt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c4997074883aa1a39a5994a7dea91fb62b2382fc039523458827adb777af8e9
3
+ size 20339650059
models/WHAM_200M.ckpt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ddb8e03a33f0849a63da030fea3de4994d95e16888993b8ab92faa904f3b31f
3
+ size 3980245067
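Both checkpoints are stored via Git LFS. A minimal sketch of loading one of them the way run_dreaming.py and run_server.py do, assuming the LFS objects have been pulled and a CUDA device is available:

import torch as th
from wham.utils import load_model_from_checkpoint

model = load_model_from_checkpoint("models/WHAM_200M.ckpt").cuda()
th.set_float32_matmul_precision("high")      # run_server.py sets these before serving
th.backends.cuda.matmul.allow_tf32 = True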
requirements.txt ADDED
@@ -0,0 +1,48 @@
1
+ --find-links https://download.pytorch.org/whl/torch_stable.html
2
+ aiohttp==3.9.3
3
+ aiosignal==1.3.1
4
+ async-timeout==4.0.3
5
+ attrs==23.2.0
6
+ blinker==1.7.0
7
+ certifi==2024.2.2
8
+ charset-normalizer==3.3.2
9
+ click==8.1.7
10
+ cloudpickle==3.0.0
11
+ cmake==3.28.3
12
+ einops==0.6.0
13
+ ffmpegcv==0.3.10
14
+ filelock==3.13.1
15
+ Flask==3.0.2
16
+ frozenlist==1.4.1
17
+ fsspec==2024.2.0
18
+ idna==3.6
19
+ importlib_metadata==7.0.2
20
+ itsdangerous==2.1.2
21
+ Jinja2==3.1.3
22
+ lightning-utilities==0.10.1
23
+ lit==17.0.6
24
+ MarkupSafe==2.1.5
25
+ mpmath==1.3.0
26
+ multidict==6.0.5
27
+ networkx==3.2.1
28
+ numpy==1.25.2
29
+ opencv-python==4.6.0.66
30
+ opencv-python-headless==4.9.0.80
31
+ packaging==23.2
32
+ pillow==10.2.0
33
+ pytorch-lightning==1.9.4
34
+ PyYAML==6.0.1
35
+ requests==2.31.0
36
+ sympy==1.12
37
+ tensordict==0.1.2
38
+ torch==2.0.1+cu118
39
+ torchinfo==1.7.1
40
+ torchmetrics==0.11.4
41
+ torchvision==0.15.2+cu118
42
+ tqdm==4.66.2
43
+ triton==2.0.0
44
+ typing_extensions==4.10.0
45
+ urllib3==2.2.1
46
+ Werkzeug==3.0.1
47
+ yarl==1.9.4
48
+ zipp==3.17.0
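The pins target the cu118 PyTorch wheels pulled in through the --find-links line above. A quick post-install sanity check might look like this:

import torch
import pytorch_lightning as pl

print(torch.__version__)           # expected: 2.0.1+cu118
print(torch.cuda.is_available())   # run_dreaming.py and run_server.py assume a CUDA device
print(pl.__version__)              # expected: 1.9.4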
run_dreaming.py ADDED
@@ -0,0 +1,264 @@
1
+ """
2
+ Example script for running dreaming on a dataset.
3
+ The idea is that there are ground_truth ("reference") video clips, and we dream the same clips given some initial context.
4
+
5
+ After dreaming, we have two sets of videos which, barring the intrinsic noise of the game environment (e.g., randomness of other players),
6
+ should be identical if the model were ideal.
7
+ """
8
+
9
+ import argparse
10
+ from pathlib import Path
11
+ import os
12
+ import subprocess
13
+
14
+ import cv2
15
+ from tensordict import TensorDict
16
+ import torch as th
17
+ from tqdm import tqdm
18
+ import numpy as np
19
+ import ffmpegcv
20
+ from PIL import Image
21
+
22
+ import wham.utils as utils
23
+
24
+
25
+ parser = argparse.ArgumentParser(description="Run dreaming.")
26
+ parser.add_argument("--model_path", type=str, required=True, help="Path to the model checkpoint.")
27
+ parser.add_argument("--data_path", type=str, required=True, help="Path to the directory that contains the ground truth data to dream for.")
28
+ parser.add_argument("--output", type=str, default="dreaming_output", help="Path to the directory where output should be put.")
29
+ parser.add_argument("--max_files", type=int, default=None, help="Maximum number of files to process.")
30
+ parser.add_argument("--metadata_config", type=str, default="configs/metadata_custom_tag.config", help="Path to metadata tag config for origin field.")
31
+
32
+
33
+ parser.add_argument(
34
+ "--protocol",
35
+ type=str,
36
+ default="base",
37
+ choices=["base", "comprehensive"],
38
+ help="What protocol to use for the dreaming. base = action conditioned, comprehensive = dream actions as well.",
39
+ )
40
+ parser.add_argument("--batch_size", type=int, default=1, help="Batch size for dreaming. Higher batch_size uses more VRAM but overall is faster.")
41
+ parser.add_argument("--context_length", type=int, default=10, help="Number of frames to use an initial context.")
42
+ parser.add_argument("--steps_to_dream", type=int, default=10, help="Batch size for dreaming.")
43
+
44
+ parser.add_argument("--sampling_temperature", type=float, default=0.9, help="Temperature for sampling from the model.")
45
+ parser.add_argument("--sampling_top_k", type=int, default=None, help="Top-k for sampling from the model.")
46
+ parser.add_argument("--sampling_top_p", type=float, default=None, help="Top-p for sampling from the model.")
47
+
48
+
49
+ def get_context_data(image_context, action_context, action_sequences):
50
+ # Make sure we have CHW images:
51
+ assert image_context.shape[-3] == 3, "Image context should be CHW"
52
+
53
+ image_context = th.from_numpy(image_context).cuda()
54
+ action_data = th.from_numpy(action_context).float().cuda()
55
+ action_sequences = th.from_numpy(action_sequences).float().cuda() if action_sequences is not None else None
56
+
57
+ return TensorDict({"images": image_context, "actions_output": action_data}, batch_size=image_context.shape[:2])
58
+
59
+
60
+ def add_video_metadata(file_path, metadata_config):
61
+ # Construct the exiftool command
62
+ cmd = [
63
+ 'exiftool',
64
+ '-config', metadata_config,
65
+ f'-ProgramName=\"{utils.PROGRAM_NAME}\"',
66
+ '-overwrite_original',
67
+ file_path
68
+ ]
69
+
70
+ try:
71
+ # Execute the exiftool command
72
+ subprocess.run(cmd, check=True)
73
+ print(f"Metadata modified successfully.")
74
+ # Print the new file metadata
75
+ cmd_output = [
76
+ 'exiftool',
77
+ file_path
78
+ ]
79
+ subprocess.run(cmd_output, check=True)
80
+ except subprocess.CalledProcessError as e:
81
+ print(f"Error modifying metadata: {e}")
82
+
83
+
84
+ @th.no_grad()
85
+ def do_dreaming(model, image_context, action_context, args, action_sequences=None):
86
+ """
87
+ image_context and action_context provide the initial context for the model to dream from.
88
+
89
+ If action_sequences (batch_size, args.steps_to_dream, action_dim) is provided, then model will be prompted with these actions.
90
+ """
91
+ context_data = get_context_data(image_context, action_context, action_sequences)
92
+ encoded_context_data = model.encode_context(context_data)
93
+
94
+ encoded_action_sequences = None
95
+ if action_sequences is not None:
96
+ assert action_sequences.shape[1] == args.steps_to_dream, "action_sequences should have shape (batch_size, args.steps_to_dream, action_dim)"
97
+ action_sequences = TensorDict({"actions_output": action_sequences}, batch_size=action_sequences.shape[:2]).cuda()
98
+ encoded_action_sequences = model.encode_context(action_sequences)
99
+
100
+ encoded_dreamt_steps = []
101
+
102
+ for dream_step in range(args.steps_to_dream):
103
+ encoded_predicted_step, _ = model.predictor.predict_next_step(
104
+ encoded_context_data, temperature=args.sampling_temperature, top_k=args.sampling_top_k, top_p=args.sampling_top_p, min_tokens_to_keep=1
105
+ )
106
+
107
+ # Remove first step from context if we are at the max context length:
108
+ if encoded_context_data.shape[1] == args.context_length:
109
+ encoded_context_data = encoded_context_data[:, 1:]
110
+
111
+ # Add predicted image + action to the context
112
+ append_step = encoded_predicted_step
113
+ if encoded_action_sequences is not None:
114
+ # Replace predicted action with real action
115
+ append_step["actions_output"] = encoded_action_sequences["actions_output"][:, [dream_step], :]
116
+ encoded_context_data = th.cat((encoded_context_data, append_step), dim=1)
117
+
118
+ encoded_dreamt_steps.append(encoded_predicted_step)
119
+
120
+ # Decode everything
121
+ dreamed_images = []
122
+ actions_during_dream = []
123
+ for seq_i in range(args.steps_to_dream):
124
+ decoded_step = model.decode_context(encoded_dreamt_steps[seq_i])
125
+ dreamed_images.append(decoded_step["images"][:, [0]].cpu().numpy())
126
+ actions_during_dream.append(decoded_step["actions_output"][:, [0]].cpu().numpy())
127
+
128
+ dreamed_images = np.concatenate(dreamed_images, axis=1)
129
+ actions_during_dream = np.concatenate(actions_during_dream, axis=1)
130
+
131
+ return dreamed_images, actions_during_dream
132
+
133
+
134
+ @th.no_grad()
135
+ def encode_decode_images(model, images):
136
+ """
137
+ Pass ground_truth images through the encoding/decoding process of the model.
138
+ """
139
+ context = TensorDict({"images": th.from_numpy(images).cuda()}, batch_size=images.shape[:2])
140
+ output_images = []
141
+ for seq_i in range(images.shape[1]):
142
+ encoded_images = model.encode_context(context[:, [seq_i]])
143
+ decoded_images = model.decode_context(encoded_images)
144
+ output_images.append(decoded_images["images"].cpu().numpy())
145
+ return np.concatenate(output_images, axis=1)
146
+
147
+
148
+ def main(args):
149
+ total_video_length = args.context_length + args.steps_to_dream
150
+
151
+ # Now, load the model:
152
+ model_path = Path(args.model_path)
153
+ assert model_path.is_file(), "Could not find the model!"
154
+ model = utils.load_model_from_checkpoint(model_path).cuda()
155
+
156
+ # Glob the dataset to find all the ground truth segments we want to construct a dream for:
157
+ data_path = Path(args.data_path)
158
+ ground_truth_files = list(data_path.rglob("*.npz"))
159
+ num_dreams = len(ground_truth_files)
160
+
161
+ if args.max_files is not None:
162
+ # Sort to make sure we always get the same files
163
+ ground_truth_files = sorted(ground_truth_files)
164
+ ground_truth_files = ground_truth_files[: args.max_files]
165
+ num_dreams = len(ground_truth_files)
166
+
167
+ output_path = Path(args.output)
168
+ os.makedirs(output_path, exist_ok=True)
169
+
170
+ print("=" * 100)
171
+ print(f"GENERATING DREAMS OF {num_dreams} SEGMENTS")
172
+ print(f"WRITING TO {args.output}")
173
+ print("=" * 100)
174
+
175
+ dreams_created = 0
176
+ with tqdm(total=num_dreams, desc="Dreams") as pbar:
177
+ while ground_truth_files:
178
+ # Load batch_size headers:
179
+ batches = min(args.batch_size, len(ground_truth_files))
180
+ batched_image_context = []
181
+ batched_image_sequence = []
182
+ batched_action_context = []
183
+ batched_action_sequence = []
184
+ episode_names = []
185
+ for i in range(batches):
186
+ episode = ground_truth_files.pop()
187
+ episode_names.append(episode)
188
+ try:
189
+ data = np.load(episode)
190
+ images = data["images"]
191
+ actions = data["actions"]
192
+ except Exception:
193
+ print(f"Failed to load episode {episode} - skipping.")
194
+ continue
195
+
196
+ if actions.shape[0] < total_video_length:
197
+ # We want to make sure we have ground_truth comparisons for the entire dream, so we ensure the episode is long enough
198
+ raise ValueError(f"Episode {episode} is too short to dream from. It has {actions.shape[0]} steps, but we need at least {total_video_length}.")
199
+ batched_image_context.append(images[: args.context_length])
200
+ batched_image_sequence.append(images[args.context_length: total_video_length])
201
+ batched_action_context.append(actions[: args.context_length])
202
+ batched_action_sequence.append(actions[args.context_length: total_video_length])
203
+
204
+ image_context = np.array(batched_image_context)
205
+ image_sequences = np.array(batched_image_sequence)
206
+ action_context = np.array(batched_action_context)
207
+ action_sequences = np.array(batched_action_sequence)
208
+
209
+ if args.protocol == "comprehensive":
210
+ # We do not need to pass in the action sequences for the comprehensive protocol
211
+ action_sequences = None
212
+
213
+ full_image_sequence = np.concatenate((image_context, image_sequences), axis=1)
214
+
215
+ dreamt_images, actions_during_dream = do_dreaming(model, image_context, action_context, args, action_sequences=action_sequences)
216
+ encoded_decoded_images_batch = encode_decode_images(model, full_image_sequence)
217
+
218
+ pbar.update(batches)
219
+ dreams_created += batches
220
+
221
+ # Save the dreams:
222
+ # We are aiming to mimic the folder structure of the ground truth dataset, so use the episode names
223
+ # but make them relative to our output folder:
224
+ for i, dream in enumerate(dreamt_images):
225
+ episode = episode_names[i]
226
+ output_file = output_path / episode.relative_to(data_path)
227
+ output_file.parent.mkdir(parents=True, exist_ok=True)
228
+ np.savez(
229
+ output_file,
230
+ context_length=args.context_length,
231
+ steps_to_dream=args.steps_to_dream,
232
+ raw_context=image_context[i],
233
+ dreamt_images=dream,
234
+ all_actions=np.concatenate((action_context[i], actions_during_dream[i])),
235
+ encoded_decoded_ground_truth_images=encoded_decoded_images_batch[i],
236
+ )
237
+
238
+ video_file = str(output_file.with_suffix(".mp4"))
239
+ writer = ffmpegcv.VideoWriter(video_file, None, utils.DREAMING_FPS)
240
+ full_sequence = np.concatenate((image_context[i], dream), axis=0)
241
+ for frame in full_sequence:
242
+ img = frame.transpose(1, 2, 0).astype(np.uint8).copy()
243
+ # Please DO NOT remove this watermark. This will infringe upon the repo's license agreement
244
+ (text_width, _), _ = cv2.getTextSize(utils.WATERMARK_TEXT, utils.WATERMARK_FONT, utils.WATERMARK_FONT_SCALE, utils.WATERMARK_FONT_THICKNESS)
245
+ x = img.shape[1] - text_width - 10 # 10 pixels from the right edge
246
+ y = img.shape[0] - 10 # 10 pixels from the bottom edge
247
+ cv2.putText(img, utils.WATERMARK_TEXT, (x, y), utils.WATERMARK_FONT, utils.WATERMARK_FONT_SCALE, utils.WATERMARK_FONT_COLOR, utils.WATERMARK_FONT_THICKNESS)
248
+
249
+ # Add image metadata
250
+ pil_image = Image.fromarray(img)
251
+ pil_image.info['Id'] = 0x0131
252
+ pil_image.info['Type'] = 2
253
+ pil_image.info['Value'] = utils.PROGRAM_NAME.encode("utf-8")
254
+ pil_image.info['Len'] = len(utils.PROGRAM_NAME) + 1
255
+
256
+ # Convert pil_image to a CV2 format for the video writer
257
+ cv_image = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)
258
+ writer.write(cv_image)
259
+ writer.release()
260
+ add_video_metadata(video_file, args.metadata_config)
261
+
262
+ if __name__ == "__main__":
263
+ args = parser.parse_args()
264
+ main(args)
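run_dreaming.py is driven entirely by the argparse flags defined at the top of the file; it expects a directory of .npz episodes containing "images" and "actions" arrays and writes matching .npz/.mp4 outputs. A sketch of an invocation (the checkpoint and data paths below are placeholders):

import subprocess

subprocess.run(
    [
        "python", "run_dreaming.py",
        "--model_path", "models/WHAM_200M.ckpt",
        "--data_path", "data/ground_truth_episodes",   # hypothetical directory of .npz episodes
        "--output", "dreaming_output",
        "--protocol", "base",           # "base" = action conditioned, "comprehensive" = dream actions too
        "--context_length", "10",
        "--steps_to_dream", "10",
        "--batch_size", "1",
    ],
    check=True,
)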
run_server.py ADDED
@@ -0,0 +1,519 @@
1
+ import argparse
2
+ from dataclasses import dataclass, field
3
+ import json
4
+ import copy
5
+ import multiprocessing as mp
6
+ import uuid
7
+ from datetime import datetime, timedelta
8
+ from collections import defaultdict, deque
9
+ import io
10
+ import zipfile
11
+ import queue
12
+ import time
13
+ import random
14
+ import logging
15
+
16
+ from tensordict import TensorDict
17
+ import cv2
18
+ from flask import Flask, request, make_response, send_file
19
+ from PIL import Image
20
+ import torchvision.transforms as T
21
+ import numpy as np
22
+ import torch as th
23
+
24
+ from wham.utils import load_model_from_checkpoint, POS_BINS_BOUNDARIES, POS_BINS_MIDDLE
25
+
26
+ logging.basicConfig(level=logging.INFO)
27
+
28
+ parser = argparse.ArgumentParser(description="Simple Dreamer")
29
+ parser.add_argument("--model", type=str, required=True, help="Path to the model file for the local runs")
30
+ parser.add_argument("--debug", action="store_true", help="Enable flask debug mode.")
31
+ parser.add_argument("--random_model", action="store_true", help="Use randomly initialized model instead of the provided one")
32
+ parser.add_argument("--port", type=int, default=5000)
33
+
34
+ parser.add_argument("--max_concurrent_jobs", type=int, default=30, help="Maximum number of jobs that can be run concurrently on this server.")
35
+ parser.add_argument("--max_dream_steps_per_job", type=int, default=10, help="Maximum number of dream steps each job can request.")
36
+ parser.add_argument("--max_job_lifespan", type=int, default=60 * 10, help="Maximum number of seconds we keep run around if not polled.")
37
+
38
+ parser.add_argument("--image_width", type=int, default=300, help="Width of the image")
39
+ parser.add_argument("--image_height", type=int, default=180, help="Height of the image")
40
+
41
+ parser.add_argument("--max_batch_size", type=int, default=3, help="Maximum batch size for the dreamer workers")
42
+
43
+ PREDICTION_JSON_FILENAME = "predictions.json"
44
+ # Minimum time between times we check when to delete jobs. We do this when adding new jobs.
45
+ JOB_CLEANUP_CHECK_RATE = timedelta(seconds=10)
46
+
47
+ MAX_CANCELLED_ID_QUEUE_SIZE = 100
48
+
49
+ DEFAULT_SAMPLING_SETTINGS = {
50
+ "temperature": 0.9,
51
+ "top_k": None,
52
+ "top_p": 1.0,
53
+ "max_context_length": 10,
54
+ }
55
+
56
+
57
+ def float_or_none(string):
58
+ if string.lower() == "none":
59
+ return None
60
+ return float(string)
61
+
62
+
63
+ def be_image_preprocess(image, target_width, target_height):
64
+ # If target_width and target_height are specified, resize the image.
65
+ if target_width is not None and target_height is not None:
66
+ # Make sure we do not try to resize if the image is already the correct size.
67
+ if image.shape[1] != target_width or image.shape[0] != target_height:
68
+ image = cv2.resize(image, (target_width, target_height))
69
+ return np.transpose(image, (2, 0, 1))
70
+
71
+
72
+ def action_vector_to_be_action_vector(action):
73
+ # Preprocess a BE action vector from 16 numbers with:
74
+ # 12 buttons [0, 1] and 4 stick directions [-1, 1]
75
+ # to discrete actions valid for the token model
76
+ # 12 buttons [0, 1] and 4 stick directions {discrete bin}
77
+ action[-4:] = np.digitize(action[-4:], bins=POS_BINS_BOUNDARIES) - 1
78
+ return action
79
+
80
+
81
+ def be_action_vector_to_action_vector(action):
82
+ # Preprocess a BE action vector into unified space
83
+ for stick_index in range(-4, 0):
84
+ action[stick_index] = POS_BINS_MIDDLE[int(action[stick_index])]
85
+ return action
86
+
87
+
88
+
89
+ @dataclass
90
+ class DreamJob:
91
+ job_id: str
92
+ sampling_settings: dict
93
+ num_predictions_remaining: int
94
+ num_predictions_done: int
95
+ # (B, T, C, H, W)
96
+ context_images: th.Tensor
97
+ context_actions: th.Tensor
98
+ # Tokens that will replace the context_images if they are provided
99
+ context_tokens: list
100
+ # This will replace the dreamed action if provided.
101
+ # For every step, we remove the first action until exhausted
102
+ actions_to_take: th.Tensor = None
103
+
104
+
105
+ @dataclass
106
+ class DreamJobResult:
107
+ job_id: str
108
+ dream_step_index: int
109
+ # (B, 1, C, H, W)
110
+ dreamt_image: th.Tensor
111
+ dreamt_action: th.Tensor
112
+ dreamt_tokens: th.Tensor
113
+ result_creation_time: datetime = field(default_factory=datetime.now)
114
+
115
+
116
+
117
+ def setup_and_load_model_be_model(args):
118
+ model = load_model_from_checkpoint(args.model)
119
+ th.set_float32_matmul_precision("high")
120
+ th.backends.cuda.matmul.allow_tf32 = True
121
+ return model
122
+
123
+
124
+ def get_job_batchable_information(job):
125
+ """Return comparable object of job information. Used for batching"""
126
+ context_length = job.context_images.shape[1]
127
+ return (context_length, job.sampling_settings)
128
+
129
+
130
+ def fetch_list_of_batchable_jobs(job_queue, cancelled_ids_set, max_batch_size, timeout=1):
131
+ """Return a list of jobs (or empty list) that can be batched together"""
132
+ batchable_jobs = []
133
+ required_job_info = None
134
+ while len(batchable_jobs) < max_batch_size:
135
+ try:
136
+ job = job_queue.get(timeout=timeout)
137
+ except queue.Empty:
138
+ break
139
+ # If pipe breaks, also gracefully return
140
+ except OSError:
141
+ break
142
+ if job.job_id in cancelled_ids_set:
143
+ # This job was cancelled, so discard it completely
144
+ continue
145
+ job_info = get_job_batchable_information(job)
146
+ if required_job_info is None:
147
+ required_job_info = job_info
148
+ elif required_job_info != job_info:
149
+ # This job is not batchable, put it back
150
+ job_queue.put(job)
151
+ # we assume here that, generally, the other jobs would also be
152
+ # invalid. So we just return the batchable jobs we have instead
153
+ # of going through more.
154
+ break
155
+ batchable_jobs.append(job)
156
+ return batchable_jobs
157
+
158
+
159
+ def update_cancelled_jobs(cancelled_ids_queue, cancelled_ids_deque, cancelled_ids_set):
160
+ """IN-PLACE Update cancelled_ids_set with new ids from the queue"""
161
+ has_changed = False
162
+ while not cancelled_ids_queue.empty():
163
+ try:
164
+ cancelled_id = cancelled_ids_queue.get_nowait()
165
+ except queue.Empty:
166
+ break
167
+ cancelled_ids_deque.append(cancelled_id)
168
+ has_changed = True
169
+
170
+ if has_changed:
171
+ cancelled_ids_set.clear()
172
+ cancelled_ids_set.update(cancelled_ids_deque)
173
+
174
+
175
+ def predict_step(context_data, sampling_settings, model, tokens=None):
176
+ with th.no_grad():
177
+ predicted_step = model.predict_next_step(context_data, min_tokens_to_keep=1, tokens=tokens, **sampling_settings)
178
+ return predicted_step
179
+
180
+
181
+ def dreamer_worker(job_queue, result_queue, cancelled_jobs_queue, quit_flag, device_to_use, args):
182
+ logger = logging.getLogger(f"dreamer_worker {device_to_use}")
183
+ logger.info("Loading up model...")
184
+ model = setup_and_load_model_be_model(args)
185
+ model = model.to(device_to_use)
186
+ logger.info("Model loaded. Fetching results")
187
+
188
+ cancelled_ids_deque = deque(maxlen=MAX_CANCELLED_ID_QUEUE_SIZE)
189
+ cancelled_ids_set = set()
190
+
191
+ while not quit_flag.is_set():
192
+ update_cancelled_jobs(cancelled_jobs_queue, cancelled_ids_deque, cancelled_ids_set)
193
+ batchable_jobs = fetch_list_of_batchable_jobs(job_queue, cancelled_ids_set, max_batch_size=args.max_batch_size)
194
+ if len(batchable_jobs) == 0:
195
+ continue
196
+ sampling_settings = batchable_jobs[0].sampling_settings
197
+ # TODO: find a better way to pass these arguments around. sampling_settings
198
+ # is passed as kwargs to the prediction step, but max_context_length is not a valid
199
+ # key there, so we need to pop it out.
200
+ max_context_length = sampling_settings.pop("max_context_length")
201
+
202
+ images = [job.context_images[:, :max_context_length] for job in batchable_jobs]
203
+ actions = [job.context_actions[:, :max_context_length] for job in batchable_jobs]
204
+ tokens = [job.context_tokens for job in batchable_jobs]
205
+
206
+ images = th.concat(images, dim=0).to(device_to_use)
207
+ actions = th.concat(actions, dim=0).to(device_to_use)
208
+
209
+ context_data = TensorDict({
210
+ "images": images,
211
+ "actions_output": actions
212
+ }, batch_size=images.shape[:2])
213
+
214
+ predicted_step, predicted_image_tokens = predict_step(context_data, sampling_settings, model, tokens)
215
+
216
+ predicted_step = predicted_step.cpu()
217
+ predicted_images = predicted_step["images"]
218
+ predicted_actions = predicted_step["actions_output"]
219
+ predicted_image_tokens = predicted_image_tokens.cpu()
220
+
221
+ for job_i, job in enumerate(batchable_jobs):
222
+ image_context = job.context_images
223
+ action_context = job.context_actions
224
+ token_context = job.context_tokens
225
+ # Keep batch dimension
226
+ dreamt_image = predicted_images[job_i].unsqueeze(0)
227
+ dreamt_action = predicted_actions[job_i].unsqueeze(0)
228
+ dreamt_tokens = predicted_image_tokens[job_i].unsqueeze(0)
229
+
230
+ # Replace the dreamed action if provided
231
+ actions_to_take = job.actions_to_take
232
+ if actions_to_take is not None and actions_to_take.shape[1] > 0:
233
+ dreamt_action = actions_to_take[:, 0:1]
234
+ # Remove the action we took
235
+ actions_to_take = actions_to_take[:, 1:]
236
+ if actions_to_take.shape[1] == 0:
237
+ actions_to_take = None
238
+
239
+ result_queue.put(DreamJobResult(
240
+ job_id=job.job_id,
241
+ dream_step_index=job.num_predictions_done,
242
+ dreamt_image=dreamt_image,
243
+ dreamt_action=dreamt_action,
244
+ dreamt_tokens=dreamt_tokens
245
+ ))
246
+
247
+ # Add job back in the queue if we have more steps to do
248
+ if job.num_predictions_remaining > 0:
249
+ # Stack the dreamt image and action to the context
250
+ if image_context.shape[1] >= max_context_length:
251
+ image_context = image_context[:, 1:]
252
+ action_context = action_context[:, 1:]
253
+ token_context = token_context[1:]
254
+ image_context = th.cat([image_context, dreamt_image], dim=1)
255
+ action_context = th.cat([action_context, dreamt_action], dim=1)
256
+ token_context.append(dreamt_tokens[0, 0].tolist())
257
+ # We need to add context length back to sampling settings...
258
+ # TODO: find a better way of passing these settings around
259
+ job.sampling_settings["max_context_length"] = max_context_length
260
+ job_queue.put(DreamJob(
261
+ job_id=job.job_id,
262
+ sampling_settings=job.sampling_settings,
263
+ num_predictions_remaining=job.num_predictions_remaining - 1,
264
+ num_predictions_done=job.num_predictions_done + 1,
265
+ context_images=image_context,
266
+ context_actions=action_context,
267
+ context_tokens=token_context,
268
+ actions_to_take=actions_to_take
269
+ ))
270
+
271
+
272
+ class DreamerServer:
273
+ def __init__(self, num_workers, args):
274
+ self.num_workers = num_workers
275
+ self.args = args
276
+ self.model = None
277
+ self.jobs = mp.Queue(maxsize=args.max_concurrent_jobs)
278
+ self.results_queue = mp.Queue()
279
+ self.cancelled_jobs = set()
280
+ self.cancelled_jobs_queues = [mp.Queue() for _ in range(num_workers)]
281
+ # job_id -> results
282
+ self._last_result_cleanup = datetime.now()
283
+ self._max_job_lifespan_datetime = timedelta(seconds=args.max_job_lifespan)
284
+ self.local_results = defaultdict(list)
285
+ self.logger = logging.getLogger("DreamerServer")
286
+
287
+ def get_details(self):
288
+ details = {
289
+ "model_file": self.args.model,
290
+ "max_concurrent_jobs": self.args.max_concurrent_jobs,
291
+ "max_dream_steps_per_job": self.args.max_dream_steps_per_job,
292
+ "max_job_lifespan": self.args.max_job_lifespan,
293
+ }
294
+ return json.dumps(details)
295
+
296
+ def _check_if_should_remove_old_jobs(self):
297
+ time_now = datetime.now()
298
+ # Only cleanup every JOB_CLEANUP_CHECK_RATE seconds at most
299
+ if time_now - self._last_result_cleanup < JOB_CLEANUP_CHECK_RATE:
300
+ return
301
+
302
+ self._last_result_cleanup = time_now
303
+ # First add existing results to the local results
304
+ self._gather_new_results()
305
+ # Check if we should remove old jobs
306
+ job_ids = list(self.local_results.keys())
307
+ for job_id in job_ids:
308
+ results = self.local_results[job_id]
309
+ # If newest result is older than max_job_lifespan, remove the job
310
+ if time_now - results[-1].result_creation_time > self._max_job_lifespan_datetime:
311
+ self.logger.info(f"Deleted job {job_id} because it was too old. Last result was {results[-1].result_creation_time}")
312
+ del self.local_results[job_id]
313
+
314
+ def add_new_job(self, request, request_json):
315
+ """
316
+ Add new dreaming job to the queues.
317
+ Request should have:
318
+
319
+
320
+ Returns: json object with new job id
321
+ """
322
+ self._check_if_should_remove_old_jobs()
323
+
324
+ sampling_settings = copy.deepcopy(DEFAULT_SAMPLING_SETTINGS)
325
+ if "num_steps_to_predict" not in request_json:
326
+ return make_response("num_steps_to_predict not in request", 400)
327
+ num_steps_to_predict = request_json['num_steps_to_predict']
328
+ if num_steps_to_predict > self.args.max_dream_steps_per_job:
329
+ return make_response(f"num_steps_to_predict too large. Max {self.args.max_dream_steps_per_job}", 400)
330
+
331
+ num_parallel_predictions = int(request_json['num_parallel_predictions']) if 'num_parallel_predictions' in request_json else 1
332
+
333
+ if (self.jobs.qsize() + num_parallel_predictions) >= self.args.max_concurrent_jobs:
334
+ return make_response(f"Too many jobs already running. Max {self.args.max_concurrent_jobs}", 400)
335
+
336
+ for key in sampling_settings:
337
+ sampling_settings[key] = float_or_none(request_json[key]) if key in request_json else sampling_settings[key]
338
+
339
+ context_images = []
340
+ context_actions = []
341
+ context_tokens = []
342
+ future_actions = []
343
+
344
+ for step in request_json["steps"]:
345
+ image_path = step["image_name"]
346
+ image = np.array(Image.open(request.files[image_path].stream))
347
+ image = be_image_preprocess(image, target_width=self.args.image_width, target_height=self.args.image_height)
348
+ context_images.append(th.from_numpy(image))
349
+
350
+ action = step["action"]
351
+ action = action_vector_to_be_action_vector(action)
352
+ context_actions.append(th.tensor(action))
353
+
354
+ tokens = step["tokens"]
355
+ context_tokens.append(tokens)
356
+
357
+ future_actions = None
358
+ if "future_actions" in request_json:
359
+ future_actions = []
360
+ for step in request_json["future_actions"]:
361
+ # The rest is the action vector
362
+ action = step["action"]
363
+ action = action_vector_to_be_action_vector(action)
364
+ # Add sequence and batch dimensions
365
+ future_actions.append(th.tensor(action))
366
+
367
+ # Add batch dimensions
368
+ context_images = th.stack(context_images).unsqueeze(0)
369
+ context_actions = th.stack(context_actions).unsqueeze(0)
370
+ future_actions = th.stack(future_actions).unsqueeze(0) if future_actions is not None else None
371
+
372
+ list_of_job_ids = []
373
+ for _ in range(num_parallel_predictions):
374
+ job_id = uuid.uuid4().hex
375
+ self.jobs.put(DreamJob(
376
+ job_id=job_id,
377
+ sampling_settings=sampling_settings,
378
+ num_predictions_remaining=num_steps_to_predict,
379
+ num_predictions_done=0,
380
+ context_images=context_images,
381
+ context_actions=context_actions,
382
+ context_tokens=context_tokens,
383
+ actions_to_take=future_actions
384
+ ))
385
+ list_of_job_ids.append(job_id)
386
+
387
+ job_queue_size = self.jobs.qsize()
388
+ return json.dumps({"job_ids": list_of_job_ids, "current_jobs_in_queue": job_queue_size})
389
+
390
+ def _gather_new_results(self):
391
+ if not self.results_queue.empty():
392
+ for _ in range(self.results_queue.qsize()):
393
+ result = self.results_queue.get()
394
+ if result.job_id in self.cancelled_jobs:
395
+ # Discard result if job was cancelled
396
+ continue
397
+ self.local_results[result.job_id].append(result)
398
+
399
+ def get_new_results(self, request, request_json):
400
+ if "job_ids" not in request_json:
401
+ return make_response("job_ids not in request", 400)
402
+ self._gather_new_results()
403
+ job_ids = request_json["job_ids"]
404
+ if not isinstance(job_ids, list):
405
+ job_ids = [job_ids]
406
+ return_results = []
407
+ for job_id in job_ids:
408
+ if job_id in self.local_results:
409
+ return_results.append(self.local_results[job_id])
410
+ del self.local_results[job_id]
411
+
412
+ if len(return_results) == 0:
413
+ return make_response("No new responses", 204)
414
+
415
+ output_json = []
416
+ output_image_bytes = {}
417
+ for job_results in return_results:
418
+ for result in job_results:
419
+ action = result.dreamt_action.numpy()
420
+ # Remember to remove batch and sequence dimensions
421
+ action = be_action_vector_to_action_vector(action[0, 0].tolist())
422
+ dreamt_tokens = result.dreamt_tokens[0, 0].tolist()
423
+ image_filename = f"{result.job_id}_{result.dream_step_index}.png"
424
+ output_json.append({
425
+ "job_id": result.job_id,
426
+ "dream_step_index": result.dream_step_index,
427
+ "action": action,
428
+ "tokens": dreamt_tokens,
429
+ "image_filename": image_filename
430
+ })
431
+
432
+ image_bytes = io.BytesIO()
433
+ # this probably is not as smooth as it could be
434
+ T.ToPILImage()(result.dreamt_image[0, 0]).save(image_bytes, format="PNG")
435
+ output_image_bytes[image_filename] = image_bytes.getvalue()
436
+
437
+ # Write a zip file with all the images
438
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")[:-3]
439
+ zip_bytes = io.BytesIO()
440
+ with zipfile.ZipFile(zip_bytes, "w") as z:
441
+ for filename, bytes in output_image_bytes.items():
442
+ z.writestr(filename, bytes)
443
+ # Write the json
444
+ z.writestr(PREDICTION_JSON_FILENAME, json.dumps(output_json))
445
+
446
+ zip_bytes.seek(0)
447
+
448
+ return send_file(
449
+ zip_bytes,
450
+ mimetype="zip",
451
+ as_attachment=True,
452
+ download_name=f"dreaming_results_{timestamp}.zip"
453
+ )
454
+
455
+ def cancel_job(self, request, request_json):
456
+ if "job_id" not in request_json:
457
+ return make_response("job_id not in request", 400)
458
+ job_id = request_json["job_id"]
459
+ self.cancelled_jobs.add(job_id)
460
+ # Cancel all jobs in the queue with this id
461
+ for job_queue in self.cancelled_jobs_queues:
462
+ job_queue.put(job_id)
463
+ return make_response("OK", 200)
464
+
465
+
466
+ def main_run(args):
467
+ app = Flask(__name__)
468
+
469
+ num_workers = th.cuda.device_count()
470
+ if num_workers == 0:
471
+ raise RuntimeError("No CUDA devices found. Cannot run Dreamer.")
472
+
473
+ server = DreamerServer(num_workers, args)
474
+ quit_flag = mp.Event()
475
+
476
+ # Start the dreamer worker(s)
477
+ dreamer_worker_processes = []
478
+ for device_i in range(num_workers):
479
+ device = f"cuda:{device_i}"
480
+ dreamer_worker_process = mp.Process(
481
+ target=dreamer_worker,
482
+ args=(server.jobs, server.results_queue, server.cancelled_jobs_queues[device_i], quit_flag, device, args)
483
+ )
484
+ dreamer_worker_process.daemon = True
485
+ dreamer_worker_process.start()
486
+ dreamer_worker_processes.append(dreamer_worker_process)
487
+
488
+ # Add the API endpoints
489
+ @app.route('/')
490
+ def details():
491
+ return server.get_details()
492
+
493
+ @app.route('/new_job', methods=['POST'])
494
+ def new_job():
495
+ request_json = json.loads(request.form["json"])
496
+ return server.add_new_job(request, request_json)
497
+
498
+ @app.route('/get_job_results', methods=['GET'])
499
+ def get_results():
500
+ # the "Json" is now in regular GET payload/parameters
501
+ request_json = {"job_ids": request.args.getlist("job_ids")}
502
+ return server.get_new_results(request, request_json)
503
+
504
+ @app.route('/cancel_job', methods=['GET'])
505
+ def cancel_job():
506
+ request_json = request.args.to_dict()
507
+ return server.cancel_job(request, request_json)
508
+
509
+ app.run(host="0.0.0.0", port=args.port, debug=args.debug)
510
+
511
+ # Cleanup
512
+ quit_flag.set()
513
+ for dreamer_worker_process in dreamer_worker_processes:
514
+ dreamer_worker_process.join()
515
+
516
+
517
+ if __name__ == '__main__':
518
+ args = parser.parse_args()
519
+ main_run(args)
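run_server.py exposes POST /new_job (a multipart form with a "json" field plus the context images named in each step), GET /get_job_results (a zip of dreamt PNG frames plus predictions.json), and GET /cancel_job, with a details view at /. A minimal client sketch using the pinned requests package; the frame files, 16-element action vectors, and empty token lists below are illustrative only:

import io, json, zipfile, requests

SERVER = "http://localhost:5000"
steps = [{"image_name": f"frame_{i}.png", "action": [0.0] * 16, "tokens": []} for i in range(2)]
files = {step["image_name"]: open(step["image_name"], "rb") for step in steps}  # hypothetical local frames
payload = {"num_steps_to_predict": 5, "steps": steps}

resp = requests.post(f"{SERVER}/new_job", data={"json": json.dumps(payload)}, files=files)
job_ids = resp.json()["job_ids"]

# Poll for results; 204 means nothing is ready yet.
resp = requests.get(f"{SERVER}/get_job_results", params={"job_ids": job_ids})
if resp.status_code == 200:
    with zipfile.ZipFile(io.BytesIO(resp.content)) as archive:
        print(json.loads(archive.read("predictions.json")))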
setup_local.sh ADDED
@@ -0,0 +1,21 @@
1
+ # Tested using Python 3.9
2
+
3
+ echo "Creating a new virtual environment..."
4
+ python3.9 -m venv venv
5
+
6
+ echo "Activating the virtual environment..."
7
+ source venv/bin/activate
8
+
9
+ echo "Upgrading pip..."
10
+ pip install --upgrade pip
11
+
12
+ echo "Installing the required packages..."
13
+ pip install -r requirements.txt
14
+
15
+ echo "Installing the exiftool package for adding file metadata on Linux..."
16
+ sudo apt install -y exiftool
17
+
18
+ echo "Installing ffmpeg..."
19
+ sudo apt install -y ffmpeg
20
+
21
+ echo "All packages installed successfully!"
wham/models/nn/model_blocks.py ADDED
@@ -0,0 +1,49 @@
1
+ import torch.nn as nn
2
+
3
+ """
4
+ Some Utility blocks for ViT-VQGAN.
5
+
6
+ ConvNeXt blocks are based on:
7
+ Liu, Zhuang, et al. "A convnet for the 2020s."
8
+ Proceedings of the IEEE/CVF conference on computer vision and pattern recognition. 2022.
9
+ """
10
+
11
+
12
+ class ConvNextDownsampleBig(nn.Module):
13
+ def __init__(self, c_in, c_out):
14
+ super().__init__()
15
+ self.group_norm = nn.GroupNorm(c_in, c_in)
16
+ self.conv1 = nn.Conv2d(c_in, c_out, kernel_size=8, stride=4, padding=0)
17
+
18
+ def forward(self, x):
19
+ return self.conv1(self.group_norm(x))
20
+
21
+
22
+ class ConvNextBlock(nn.Module):
23
+ def __init__(self, channels):
24
+ super().__init__()
25
+ self.conv1 = nn.Conv2d(channels, channels, kernel_size=7, stride=1, padding=7 // 2, groups=channels) # 'Depthwise' conv
26
+ self.group_norm = nn.GroupNorm(channels, channels) # One group per channel (InstanceNorm-style); note the original ConvNeXt uses LayerNorm over channels
27
+
28
+ # Transformer-style non-linearity
29
+ self.conv2 = nn.Conv2d(channels, channels * 4, kernel_size=1, stride=1, padding=0)
30
+ self.activation = nn.GELU()
31
+ self.conv3 = nn.Conv2d(channels * 4, channels, kernel_size=1, stride=1, padding=0)
32
+
33
+ def forward(self, x):
34
+ y = self.conv1(x)
35
+ y = self.group_norm(y)
36
+ y = self.conv2(y)
37
+ y = self.activation(y)
38
+ y = self.conv3(y)
39
+ return x + y
40
+
41
+
42
+ class ConvNextDownsample(nn.Module):
43
+ def __init__(self, c_in, c_out):
44
+ super().__init__()
45
+ self.group_norm = nn.GroupNorm(c_in, c_in)
46
+ self.conv1 = nn.Conv2d(c_in, c_out, kernel_size=3, stride=2, padding=1)
47
+
48
+ def forward(self, x):
49
+ return self.conv1(self.group_norm(x))
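
To illustrate how these blocks compose, here is a small, self-contained sketch (illustrative only; the actual ViT-VQGAN encoder configuration lives in the VQGAN model code, not here):

import torch
import torch.nn as nn

# Assumes ConvNextBlock and ConvNextDownsample from the module above are in scope.
stage = nn.Sequential(
    ConvNextBlock(64),            # residual depthwise block, keeps the (B, 64, H, W) shape
    ConvNextDownsample(64, 128),  # stride-2 conv, halves the spatial resolution
    ConvNextBlock(128),
)

x = torch.randn(1, 64, 32, 32)
print(stage(x).shape)  # torch.Size([1, 128, 16, 16])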
wham/models/nn/nanoGPT.py ADDED
@@ -0,0 +1,665 @@
1
+ # From https://github.com/karpathy/nanoGPT/blob/master/model.py - Thanks Andrej Karpathy
2
+
3
+ # MIT License
4
+ # Copyright (c) 2022 Andrej Karpathy
5
+ # 2023 Microsoft Research
6
+
7
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
8
+ # of this software and associated documentation files (the "Software"), to deal
9
+ # in the Software without restriction, including without limitation the rights
10
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11
+ # copies of the Software, and to permit persons to whom the Software is
12
+ # furnished to do so, subject to the following conditions:
13
+
14
+ # The above copyright notice and this permission notice shall be included in all
15
+ # copies or substantial portions of the Software.
16
+
17
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
20
+ # IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
21
+ # DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
22
+ # OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
23
+ # OR OTHER DEALINGS IN THE SOFTWARE.
24
+
25
+
26
+ """
27
+ Full definition of a GPT Language Model, all of it in this single file.
28
+ References:
29
+ 1) the official GPT-2 TensorFlow implementation released by OpenAI:
30
+ https://github.com/openai/gpt-2/blob/master/src/model.py
31
+ 2) huggingface/transformers PyTorch implementation:
32
+ https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py
33
+ """
34
+
35
+ from dataclasses import dataclass
36
+ import inspect
37
+ import math
38
+
39
+ import torch
40
+ import torch.nn as nn
41
+ from torch.nn import functional as F
42
+
43
+ NEGATIVE_INFINITE_FLOAT = -float("inf")
44
+ CROSS_ENTROPY_INVALID_CLASS_TARGET = -1
45
+
46
+ # @torch.jit.script # good to enable when not using torch.compile, disable when using (our default)
47
+ def new_gelu(x):
48
+ """
49
+ Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT).
50
+ Reference: Gaussian Error Linear Units (GELU) paper: https://arxiv.org/abs/1606.08415
51
+ """
52
+ return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))
53
+
54
+
55
+ def limit_logits_to_valid_range(logits, valid_token_range):
56
+ """
57
+ MODIFIES logits INPLACE.
58
+ Mask out invalid positions in the logits tensor with -inf so they are not considered by the softmax.
59
+
60
+ Args:
61
+ logits: logits tensor of shape (batch_size, vocab_size)
62
+ valid_token_range: tuple of (start, end) indices of valid positions in the logits tensor (inclusive).
63
+ Everything outside is masked out with -inf.
64
+ """
65
+ logits[:, : valid_token_range[0]] = NEGATIVE_INFINITE_FLOAT
66
+ logits[:, valid_token_range[1] + 1 :] = NEGATIVE_INFINITE_FLOAT
67
+
68
+
69
+ def default_sample_token(logits, valid_token_range=None, temperature=1.0, deterministic=False, top_k=None, top_p=None, min_tokens_to_keep=1):
70
+ """
71
+ Given a vector of logits, sample and return an index according to settings.
72
+
73
+ logits: tensor of shape (batch_size, vocab_size)
74
+
75
+ valid_token_range should be a tuple, specifying start and end indices we'd like to sample from (inclusive).
76
+ If None, we'll sample from the full vocab.
77
+
78
+ If deterministic is True, we take the argmax of the logits (equivalent to top-k sampling with top_k = 1), so any user-provided top_p and top_k values are ignored.
79
+
80
+ Otherwise, either top-p (float) value can be specified or top-k (int) value can be specified.
81
+ Top-p (float top_p) : only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
82
+ Top-k (int top_k) : selects top_k tokens for generation.
83
+ min_tokens_to_keep: Used with both top_p and top_k sampling.
84
+ """
85
+ assert top_k is None or top_p is None, "Can only specify one of top-k or top-p sampling."
86
+ if temperature < 0.1:
87
+ # Avoid too low a temp, especially 0
88
+ temperature = 0.1
89
+ logits = logits / temperature
90
+ if valid_token_range is not None:
91
+ limit_logits_to_valid_range(logits, valid_token_range)
92
+ if deterministic:
93
+ selected_logits = select_logits(logits, top_k=1)
94
+ else:
95
+ selected_logits = select_logits(logits, top_p=top_p, top_k=top_k, min_tokens_to_keep=min_tokens_to_keep)
96
+ probs = F.softmax(selected_logits, dim=-1)
97
+ # TODO: handle errors in the sampling here more robustly (e.g. invalid probability vectors)
98
+ sampled_idx = torch.multinomial(probs, num_samples=1).squeeze(-1)
99
+ return sampled_idx
100
+
101
+
102
+ def select_logits(logits, top_k=None, top_p=None, min_tokens_to_keep=1):
103
+ """
104
+ Select from original logits using top-k or top-p sampling.
105
+
106
+ Args:
107
+ logits (torch.Tensor): Logits to sample from.
108
+ top_k (int, optional): Number of top elements to consider in top-k sampling.
109
+ top_p (float, optional): Threshold probability for top-p sampling.
110
+ min_tokens_to_keep (int, optional): Minimum number of tokens to keep in the output.
111
+
112
+ Returns:
113
+ logits: Selected logits after top-k or top-p sampling. Sets all logits outside the selected ones to NEGATIVE_INFINITE_FLOAT.
114
+ """
115
+ assert top_k is None or top_p is None, "Can only specify one of top-k or top-p sampling."
116
+ min_tokens_to_keep = min(min_tokens_to_keep, logits.size(-1))
117
+ if top_k is not None:
118
+ if not isinstance(top_k, int) or top_k <= 0:
119
+ raise ValueError(f"`top_k` has to be a strictly positive integer, but is {top_k}")
120
+
121
+ # Top-k sampling
122
+ top_k = max(top_k, min_tokens_to_keep)
123
+ top_k = min(top_k, logits.size(-1))
124
+ top_k_logits, _ = torch.topk(logits, top_k)
125
+ indices_to_remove = logits < top_k_logits[..., -1:]
126
+ logits = torch.where(indices_to_remove, NEGATIVE_INFINITE_FLOAT, logits)
127
+
128
+ elif top_p is not None:
129
+ top_p = float(top_p)
130
+ if top_p < 0 or top_p > 1.0:
131
+ raise ValueError(f"`top_p` has to be a float between 0 and 1, but is {top_p}")
132
+
133
+ # Top-p sampling
134
+ sorted_logits, sorted_indices = torch.sort(logits, descending=True)
135
+ sorted_probs = torch.softmax(sorted_logits, dim=-1)
136
+ cumulative_probs = torch.cumsum(sorted_probs, dim=-1)
137
+ sorted_indices_to_remove = cumulative_probs > top_p
138
+
139
+ # Remove tokens with cumulative probability above the threshold
140
+ sorted_indices_to_remove[..., :min_tokens_to_keep] = False
141
+
142
+ # scatter sorted tensors to original indexing
143
+ indices_to_remove = sorted_indices_to_remove.scatter(dim=-1, index=sorted_indices, src=sorted_indices_to_remove)
144
+ logits = torch.where(indices_to_remove, NEGATIVE_INFINITE_FLOAT, logits)
145
+
146
+ else:
147
+ # Return logits as is
148
+ pass
149
+
150
+ return logits
151
+
152
+
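+ # Illustration of the top-p branch above (hypothetical toy values, not part of the module's API).
+ # With logits [2.0, 1.0, 0.0, -1.0] the sorted softmax is roughly [0.64, 0.24, 0.09, 0.03], so the
+ # cumulative sums are roughly [0.64, 0.88, 0.97, 1.00]. With top_p=0.9 every entry whose cumulative
+ # probability exceeds 0.9 (here the last two) is masked to NEGATIVE_INFINITE_FLOAT; note that the
+ # first token to push the cumulative sum over top_p is masked as well, unless protected by
+ # min_tokens_to_keep.
+ #
+ #   toy_logits = torch.tensor([[2.0, 1.0, 0.0, -1.0]])
+ #   kept = select_logits(toy_logits, top_p=0.9)
+ #   F.softmax(kept, dim=-1)  # -> roughly [[0.73, 0.27, 0.00, 0.00]]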
153
+ class LayerNorm(nn.Module):
154
+ """LayerNorm but with an optional bias. PyTorch doesn't support simply bias=False"""
155
+
156
+ def __init__(self, ndim, bias):
157
+ super().__init__()
158
+ self.weight = nn.Parameter(torch.ones(ndim))
159
+ self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None
160
+
161
+ def forward(self, input):
162
+ return F.layer_norm(input, self.weight.shape, self.weight, self.bias, 1e-5)
163
+
164
+ class LayerNormMinimal(nn.Module):
165
+ """LayerNorm like above, but without learnable parameters"""
166
+
167
+ def __init__(self, ndim, bias):  # bias is accepted for interface parity with LayerNorm but is unused
168
+ super().__init__()
169
+ self.ndim = (ndim,)
170
+
171
+ def forward(self, input):
172
+ return F.layer_norm(input, self.ndim, eps=1e-5)
173
+
174
+
175
+ class CausalSelfAttention(nn.Module):
176
+ def __init__(self, config):
177
+ super().__init__()
178
+ assert config.n_embd % config.n_head == 0
179
+ # key, query, value projections for all heads, but in a batch
180
+ self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
181
+ # output projection
182
+ self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
183
+ # regularization
184
+ self.attn_dropout = nn.Dropout(config.dropout)
185
+ self.resid_dropout = nn.Dropout(config.dropout)
186
+ self.n_head = config.n_head
187
+ self.n_embd = config.n_embd
188
+ self.dropout = config.dropout
189
+ # flash attention makes the GPU go brrrrr but support is only in PyTorch nightly and still a bit scary
190
+ self.flash = hasattr(torch.nn.functional, "scaled_dot_product_attention") and self.dropout == 0.0
191
+ # causal mask to ensure that attention is only applied to the left in the input sequence
192
+ self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size)).view(1, 1, config.block_size, config.block_size), persistent=False)
193
+
194
+ self.cached_k = None
195
+ self.cached_v = None
196
+ self.current_cache_size = 0
197
+
198
+ def _manual_causal_attention(self, q, k, v, mask):
199
+ # q, k and v should be of shape (B, nh, T, hs)
200
+ token_len = q.size(-2)
201
+ att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
202
+ att = att.masked_fill(mask[:, :, :token_len, :token_len] == 0, float("-inf"))
203
+ att = F.softmax(att, dim=-1)
204
+ y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
205
+ return y
206
+
207
+ def forward(self, x, cache=False):
208
+ batch_size, token_len, n_embd = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
209
+
210
+ # calculate query, key, values for all heads in batch and move head forward to be the batch dim
211
+ q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
212
+ k = k.view(batch_size, token_len, self.n_head, n_embd // self.n_head).transpose(1, 2) # (B, nh, T, hs)
213
+ q = q.view(batch_size, token_len, self.n_head, n_embd // self.n_head).transpose(1, 2) # (B, nh, T, hs)
214
+ v = v.view(batch_size, token_len, self.n_head, n_embd // self.n_head).transpose(1, 2) # (B, nh, T, hs)
215
+
216
+ # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
217
+ if self.flash and not cache:
218
+ # efficient attention using Flash Attention CUDA kernels
219
+ y = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=self.dropout, is_causal=True)
220
+ elif cache:
221
+ # manual implementation of attention (as below), but cache arrays we can reuse
222
+ assert token_len == 1, "Cache only works for single step"
223
+ assert self.cached_k is not None, "Must call reset_cache() before using cache"
224
+ assert self.current_cache_size < self.cached_k.size(2), "Trying to generate more steps than provided in reset_cache() `num_steps_to_come`"
225
+ assert self.dropout == 0.0, "Dropout not supported with caching"
226
+ this_step_q = q
227
+ self.cached_k[:, :, self.current_cache_size, :] = k[:, :, 0, :]
228
+ self.cached_v[:, :, self.current_cache_size, :] = v[:, :, 0, :]
229
+ # Remove the zero parts
230
+ k = self.cached_k[:, :, : self.current_cache_size + 1, :]
231
+ # compute last row of the attention mask
232
+ this_step_att_row = (this_step_q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
233
+ this_step_att_row = F.softmax(this_step_att_row, dim=-1)
234
+ # We only need output for the current step
235
+ y = this_step_att_row @ self.cached_v[:, :, : self.current_cache_size + 1, :]
236
+ # Update cache
237
+ self.current_cache_size += 1
238
+ else:
239
+ y = self._manual_causal_attention(q, k, v, self.bias)
240
+ y = y.transpose(1, 2).contiguous().view(batch_size, token_len, n_embd) # re-assemble all head outputs side by side
241
+
242
+ # output projection
243
+ y = self.resid_dropout(self.c_proj(y))
244
+ return y
245
+
246
+ def reset_cache(self, x, num_steps_to_come):
247
+ """
248
+ Reset caches by doing initial pass with x data (returning same output as forward).
249
+ Also set the number of steps to come, which is used to initialize the buffers
250
+ """
251
+ batch_size, token_len, n_embd = x.size()
252
+
253
+ q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
254
+ k = k.view(batch_size, token_len, self.n_head, n_embd // self.n_head).transpose(1, 2) # (B, nh, T, hs)
255
+ q = q.view(batch_size, token_len, self.n_head, n_embd // self.n_head).transpose(1, 2) # (B, nh, T, hs)
256
+ v = v.view(batch_size, token_len, self.n_head, n_embd // self.n_head).transpose(1, 2) # (B, nh, T, hs)
257
+
258
+ # Use SDPA instead of a manual implementation
259
+ # y = self._manual_causal_attention(q, k, v, self.bias)
260
+ y = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=self.dropout, is_causal=True)
261
+
262
+ y = y.transpose(1, 2).contiguous().view(batch_size, token_len, n_embd)
263
+ # output projection
264
+ y = self.resid_dropout(self.c_proj(y))
265
+
266
+ # Create full k,q,v for predicting all future steps.
267
+ # Just null-out the last num_steps_to_come-1 steps
268
+ pad_size = num_steps_to_come
269
+ self.current_cache_size = token_len
270
+ self.cached_k = torch.cat([k, torch.zeros(batch_size, self.n_head, pad_size, n_embd // self.n_head, device=k.device)], dim=2)
271
+ self.cached_v = torch.cat([v, torch.zeros(batch_size, self.n_head, pad_size, n_embd // self.n_head, device=v.device)], dim=2)
272
+
273
+ return y
274
+
275
+ class SelfAttention(nn.Module):
276
+ """
277
+ Non-causal self-attention layer, the same as CausalSelfAttention but without the causal mask.
278
+ Duplicating the code to keep this separate for clarity.
279
+ """
280
+
281
+ def __init__(self, config):
282
+ super().__init__()
283
+ assert config.n_embd % config.n_head == 0
284
+ # key, query, value projections for all heads, but in a batch
285
+ self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
286
+ # output projection
287
+ self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
288
+ # regularization
289
+ self.attn_dropout = nn.Dropout(config.dropout)
290
+ self.resid_dropout = nn.Dropout(config.dropout)
291
+ self.n_head = config.n_head
292
+ self.n_embd = config.n_embd
293
+ self.dropout = config.dropout
294
+ # flash attention makes the GPU go brrrrr but support is only in PyTorch nightly and still a bit scary
295
+ self.flash = hasattr(torch.nn.functional, "scaled_dot_product_attention") and self.dropout == 0.0
296
+ assert self.flash, "SelfAttention only supports flash attention for now."
297
+
298
+ self.register_buffer("attn_mask", torch.ones((config.block_size, config.block_size)).bool().unsqueeze(0).unsqueeze(0))
299
+
300
+ def forward(self, x):
301
+ batch_size, token_len, n_embd = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
302
+
303
+ # calculate query, key, values for all heads in batch and move head forward to be the batch dim
304
+ q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
305
+ k = k.view(batch_size, token_len, self.n_head, n_embd // self.n_head).transpose(1, 2) # (B, nh, T, hs)
306
+ q = q.view(batch_size, token_len, self.n_head, n_embd // self.n_head).transpose(1, 2) # (B, nh, T, hs)
307
+ v = v.view(batch_size, token_len, self.n_head, n_embd // self.n_head).transpose(1, 2) # (B, nh, T, hs)
308
+
309
+ # self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
310
+ y = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=self.attn_mask, dropout_p=self.dropout, is_causal=False)
311
+ y = y.transpose(1, 2).contiguous().view(batch_size, token_len, n_embd) # re-assemble all head outputs side by side
312
+
313
+ # output projection
314
+ y = self.resid_dropout(self.c_proj(y))
315
+ return y
316
+
317
+ class MLP(nn.Module):
318
+ def __init__(self, config):
319
+ super().__init__()
320
+ self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias)
321
+ self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias)
322
+ self.dropout = nn.Dropout(config.dropout)
323
+
324
+ def forward(self, x):
325
+ x = self.c_fc(x)
326
+ x = new_gelu(x)
327
+ x = self.c_proj(x)
328
+ x = self.dropout(x)
329
+ return x
330
+
331
+ class GELU_MLP(nn.Module):
332
+ """MLP Block using PyTorch's native GELU activation function"""
333
+ def __init__(self, config):
334
+ super().__init__()
335
+ self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias)
336
+ self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias)
337
+ self.dropout = nn.Dropout(config.dropout)
338
+
339
+ def forward(self, x):
340
+ x = self.c_fc(x)
341
+ x = F.gelu(x, approximate="tanh")
342
+ x = self.c_proj(x)
343
+ x = self.dropout(x)
344
+ return x
345
+
346
+
347
+ class Block(nn.Module):
348
+ def __init__(self, config):
349
+ super().__init__()
350
+ self.ln_1 = LayerNorm(config.n_embd, bias=config.bias)
351
+ self.attn = CausalSelfAttention(config)
352
+ self.ln_2 = LayerNorm(config.n_embd, bias=config.bias)
353
+ self.mlp = MLP(config)
354
+
355
+ def forward(self, x, cache=False, reset_cache_with_num_steps_to_come=None):
356
+ """
357
+ Args:
358
+ cache: If True, use the cache to predict the next token (assumes model was initialized with `reset_cache`).
359
+ reset_cache_with_num_steps_to_come:
360
+ If not None, reset and prepare the cache for cached prediction of the next `reset_cache_with_num_steps_to_come` tokens.
361
+ This is same as calling `reset_cache` with the same argument, but we include option here in `forward` to support torch hook functions (used to get embeddings from this module output).
362
+
363
+ Caching example:
364
+ ```
365
+ # Initialize model with reset_cache_with_num_steps_to_come=10
366
+ outputs[0] = model(inputs, reset_cache_with_num_steps_to_come=10)
367
+ # Predict next 10 tokens using cache
368
+ for i in range(10):
369
+ outputs[i+1] = model(next_token_embedding, cache=True)  # pass only the newest token; the cache path expects token_len == 1
370
+ ```
371
+ """
372
+ if reset_cache_with_num_steps_to_come:
373
+ return self.reset_cache(x, num_steps_to_come=reset_cache_with_num_steps_to_come)
374
+ x = x + self.attn(self.ln_1(x), cache=cache)
375
+ x = x + self.mlp(self.ln_2(x))
376
+ return x
377
+
378
+ def reset_cache(self, x, num_steps_to_come):
379
+ x = x + self.attn.reset_cache(self.ln_1(x), num_steps_to_come=num_steps_to_come)
380
+ x = x + self.mlp(self.ln_2(x))
381
+ return x
382
+
383
+ class BlockV2(nn.Module):
384
+ """
385
+ Compared to the Block in the original implementation, this one uses non-parametric LayerNorm and Pytorch's GELU.
386
+ These two changes save significant vram but are incompatible with previously trained models.
387
+ Hence the separate class.
388
+ """
389
+
390
+ def __init__(self, config):
391
+ super().__init__()
392
+ self.ln_1 = LayerNormMinimal(config.n_embd, bias=config.bias)
393
+ self.attn = CausalSelfAttention(config)
394
+ self.ln_2 = LayerNormMinimal(config.n_embd, bias=config.bias)
395
+ self.mlp = GELU_MLP(config)
396
+
397
+ def forward(self, x, cache=False, reset_cache_with_num_steps_to_come=None):
398
+ if reset_cache_with_num_steps_to_come:
399
+ return self.reset_cache(x, num_steps_to_come=reset_cache_with_num_steps_to_come)
400
+ x = x + self.attn(self.ln_1(x), cache=cache)
401
+ x = x + self.mlp(self.ln_2(x))
402
+ return x
403
+
404
+ def reset_cache(self, x, num_steps_to_come):
405
+ x = x + self.attn.reset_cache(self.ln_1(x), num_steps_to_come=num_steps_to_come)
406
+ x = x + self.mlp(self.ln_2(x))
407
+ return x
408
+
409
+ class SelfAttentionBlock(nn.Module):
410
+ def __init__(self, config):
411
+ super().__init__()
412
+ self.ln_1 = LayerNorm(config.n_embd, bias=config.bias)
413
+ self.attn = SelfAttention(config)
414
+ self.ln_2 = LayerNorm(config.n_embd, bias=config.bias)
415
+ self.mlp = MLP(config)
416
+
417
+ def forward(self, x):
418
+ x = x + self.attn(self.ln_1(x))
419
+ x = x + self.mlp(self.ln_2(x))
420
+ return x
421
+
422
+ @dataclass
423
+ class GPTConfig:
424
+ block_size: int = 1024
425
+ vocab_size: int = 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency
426
+ n_layer: int = 12
427
+ n_head: int = 12
428
+ n_embd: int = 768
429
+ dropout: float = 0.0
430
+ bias: bool = True # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster
431
+ version: int = 1 # Version 1 is the original GPT, Version 2 is the one with non-parametric LayerNorm and Pytorch's GELU
432
+
433
+
434
+ class GPT(nn.Module):
435
+ def __init__(self, config):
436
+ super().__init__()
437
+ assert config.vocab_size is not None
438
+ assert config.block_size is not None
439
+ self.config = config
440
+
441
+ self.version = config.version
442
+
443
+ print(f"[nanoGPT] creating model with version {self.version}")
444
+
445
+ if self.version == 1:
446
+ transformer_dict = dict(
447
+ wpe=nn.Embedding(config.block_size, config.n_embd),
448
+ drop=nn.Dropout(config.dropout),
449
+ h=nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
450
+ ln_f=LayerNorm(config.n_embd, bias=config.bias),
451
+ )
452
+ elif self.version == 2:
453
+ transformer_dict = dict(
454
+ wpe=nn.Embedding(config.block_size, config.n_embd),
455
+ drop=nn.Dropout(config.dropout),
456
+ h=nn.ModuleList([BlockV2(config) for _ in range(config.n_layer)]),
457
+ ln_f=LayerNorm(config.n_embd, bias=config.bias), # This one is still parametric due to user error
458
+ )
459
+
460
+ transformer_dict["wte"] = nn.Embedding(config.vocab_size, config.n_embd)
461
+ self.transformer = nn.ModuleDict(transformer_dict)
462
+ self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
463
+ # with weight tying when using torch.compile() some warnings get generated:
464
+ # "UserWarning: functional_call was passed multiple values for tied weights.
465
+ # This behavior is deprecated and will be an error in future versions"
466
+ # not 100% sure what this is, so far seems to be harmless.
467
+ self.transformer.wte.weight = self.lm_head.weight # https://paperswithcode.com/method/weight-tying
468
+
469
+ # init all weights
470
+ self.apply(self._init_weights)
471
+ # apply special scaled init to the residual projections, per GPT-2 paper
472
+ for pn, p in self.named_parameters():
473
+ if pn.endswith("c_proj.weight"):
474
+ torch.nn.init.normal_(p, mean=0.0, std=0.02 / math.sqrt(2 * config.n_layer))
475
+
476
+ def get_num_params(self, non_embedding=True):
477
+ """
478
+ Return the number of parameters in the model.
479
+ For non-embedding count (default), the position embeddings get subtracted.
480
+ The token embeddings would too, except due to the parameter sharing these
481
+ params are actually used as weights in the final layer, so we include them.
482
+ """
483
+ n_params = sum(p.numel() for p in self.parameters())
484
+ if non_embedding:
485
+ n_params -= self.transformer.wpe.weight.numel()
486
+ return n_params
487
+
488
+ def _init_weights(self, module):
489
+ if isinstance(module, nn.Linear):
490
+ torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
491
+ if module.bias is not None:
492
+ torch.nn.init.zeros_(module.bias)
493
+ elif isinstance(module, nn.Embedding):
494
+ torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
495
+
496
+ def _apply_pos_encoding(self, x):
497
+ device = x.device
498
+ token_len = x.size(1)
499
+ pos = torch.arange(0, token_len, dtype=torch.long, device=device).unsqueeze(0)
500
+ pos_emb = self.transformer.wpe(pos)
501
+ x = x + pos_emb
502
+ return x
503
+
504
+ def original_forward(self, idx, targets=None, loss_mask=None, loss_reduction="mean"):
505
+ batch_size, seq_len = idx.shape[:2]
506
+ tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
507
+ x = self.transformer.drop(self._apply_pos_encoding(tok_emb))
508
+ for block in self.transformer.h:
509
+ x = block(x)
510
+ x = self.transformer.ln_f(x)
511
+
512
+ if targets is not None:
513
+ # if we are given some desired targets also calculate the loss
514
+ logits = self.lm_head(x)
515
+ if loss_mask is not None:
516
+ # Feeding target = CROSS_ENTROPY_INVALID_CLASS_TARGET to cross_entropy will ignore the loss
517
+ # for that position. This is useful for padding tokens.
518
+ targets[loss_mask == 0] = CROSS_ENTROPY_INVALID_CLASS_TARGET
519
+ loss = F.cross_entropy(
520
+ logits.view(batch_size * seq_len, self.config.vocab_size), targets.view(-1), ignore_index=CROSS_ENTROPY_INVALID_CLASS_TARGET, reduction=loss_reduction
521
+ )
522
+ if loss_reduction == "none":
523
+ # Reshape back into batch_size and seq_len
524
+ loss = loss.view(batch_size, seq_len)
525
+ else:
526
+ # inference-time mini-optimization: only forward the lm_head on the very last position
527
+ logits = self.lm_head(x[:, [-1], :]) # note: using list [-1] to preserve the time dim
528
+ loss = None
529
+
530
+ return logits, loss
531
+
532
+ def forward(self, x, targets=None, loss_mask=None, loss_reduction="mean"):
533
+ token_len = x.size(1)
534
+ assert token_len <= self.config.block_size, f"Cannot forward sequence of length {token_len}, block size is only {self.config.block_size}"
535
+ return self.original_forward(x, targets, loss_mask, loss_reduction)
536
+
537
+ @torch.no_grad()
538
+ def generate(self, idx, max_new_tokens, valid_token_range=None, temperature=1.0, top_k=None, raise_cropping=False, deterministic=False):
539
+ """
540
+ valid_token_range should be a tuple, specifying start and end indices we'd like to sample from (inclusive).
541
+ if None, we'll sample from the full vocab.
542
+
543
+ If raise_cropping is True, we'll raise an error if we need to crop the sequence context.
544
+ """
545
+ if valid_token_range is None:
546
+ valid_token_range = (0, self.config.vocab_size - 1)
547
+ assert len(valid_token_range) == 2
548
+ assert valid_token_range[0] < valid_token_range[1]
549
+ for _ in range(max_new_tokens):
550
+ # if the sequence context is growing too long we must crop it at block_size
551
+ idx_cond = idx
552
+ if idx.size(1) > self.config.block_size:
553
+ if raise_cropping:
554
+ raise ValueError("Tried to crop idxs but flag told to raise this")
555
+ else:
556
+ idx_cond = idx[:, -self.config.block_size :]
557
+ # forward the model to get the logits for the index in the sequence
558
+ logits, _ = self(idx_cond)
559
+ # pluck the logits at the final step and scale by desired temperature
560
+ logits = logits[:, -1, :] / temperature # logits is B T Vocabsize -> B Vocabsize
561
+ # optionally crop the logits to only the top k options
562
+ if top_k is not None:
563
+ v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
564
+ logits[logits < v[:, [-1]]] = NEGATIVE_INFINITE_FLOAT
565
+
566
+ # Crop out the logits we don't want to sample from
567
+ if valid_token_range is not None:
568
+ limit_logits_to_valid_range(logits, valid_token_range)
569
+
570
+ # apply softmax to convert logits to (normalized) probabilities
571
+ probs = F.softmax(logits, dim=-1)
572
+
573
+ if deterministic:
574
+ # Take max of the results
575
+ idx_next = torch.argmax(probs, dim=-1, keepdim=True)
576
+ else:
577
+ # sample from the distribution
578
+ idx_next = torch.multinomial(probs, num_samples=1)
579
+ # append sampled index to the running sequence and continue
580
+ idx = torch.cat((idx, idx_next), dim=1)
581
+
582
+ return idx
583
+
584
+ @torch.no_grad()
585
+ def optimized_generate(
586
+ self,
587
+ idx,
588
+ num_new_tokens,
589
+ valid_token_ranges=None,
590
+ temperature=1.0,
591
+ deterministic=False,
592
+ raise_cropping=False,
593
+ top_k=None,
594
+ top_p=None,
595
+ min_tokens_to_keep=1,
596
+ ):
597
+ """
598
+ Generate function but optimized by caching the results in transformer blocks (think this is referred to as "attention caching").
599
+ The higher the num_new_tokens, the more the speedup compared to original generate.
600
+
601
+ Caveat: the context length + num_new_tokens must be less than the block size. This means that the first
602
+ generated tokens do not have full context length.
603
+
604
+ valid_token_ranges should be None or list of length num_new_tokens, specifying valid range for tokens for every step
605
+ """
606
+ # Local aliases for the output head and token embedder (candidates for compilation/quantization for further speed).
607
+ logit_layer = self.lm_head
608
+ embedder_fn = self.transformer.wte
609
+
610
+ if valid_token_ranges is None:
611
+ valid_token_ranges = [[0, self.config.vocab_size] for _ in range(num_new_tokens)]
612
+ assert len(valid_token_ranges) == num_new_tokens, "valid_token_ranges should be list of length num_new_tokens or None"
613
+
614
+ _, token_len = idx.size()
615
+ if token_len + num_new_tokens > self.config.block_size:
616
+ raise ValueError("Can't use optimized generation with num_new_tokens + context_length > block_size")
617
+ new_idxs = torch.zeros(idx.size(0), num_new_tokens, dtype=torch.long, device=idx.device)
618
+ # First, we need to cull the sequence to the block size
619
+ # and remove first max_new_tokens so we can reuse same position embeddings
620
+ # and not have to recompute them
621
+ num_original_tokens = idx.size(1)
622
+ original_idx = idx
623
+ if (num_original_tokens + num_new_tokens) > self.config.block_size:
624
+ if raise_cropping:
625
+ raise ValueError("Tried to crop idxs but flag told to raise this")
626
+ original_idx = idx[:, -self.config.block_size + num_new_tokens :]
627
+ original_pos = torch.arange(0, original_idx.size(1), dtype=torch.long, device=idx.device).unsqueeze(0)
628
+ # Now cache results with the original context
629
+ original_tok_emb = embedder_fn(original_idx)
630
+ original_pos_emb = self.transformer.wpe(original_pos)
631
+ original_x = original_tok_emb + original_pos_emb
632
+ for block in self.transformer.h:
633
+ # Reset the cache for each block, and cache new result
634
+ original_x = block(original_x, reset_cache_with_num_steps_to_come=num_new_tokens)
635
+
636
+ # Sample the first token
637
+ original_x = self.transformer.ln_f(original_x)
638
+ last_logit = logit_layer(original_x[:, [-1], :])
639
+ new_idxs[:, 0] = default_sample_token(
640
+ last_logit[:, -1, :], valid_token_ranges[0], temperature, deterministic, top_k=top_k, top_p=top_p, min_tokens_to_keep=min_tokens_to_keep
641
+ )
642
+
643
+ # Generate rest of the steps
644
+ for generation_idx in range(1, num_new_tokens):
645
+ # forward the model to get the logits for the index in the sequence
646
+ # This is the position of the latest generated token, not the currently going-to-be-generated token
647
+ latest_token_pos = num_original_tokens + generation_idx - 1
648
+ # We only need to pass in the latest token
649
+ newest_idx = new_idxs[:, generation_idx - 1].unsqueeze(-1)
650
+ newest_tok_emb = embedder_fn(newest_idx)
651
+ newest_pos_emb = self.transformer.wpe(torch.tensor(latest_token_pos, dtype=torch.long, device=idx.device).unsqueeze(0))
652
+ newest_x = newest_tok_emb + newest_pos_emb
653
+ for block in self.transformer.h:
654
+ newest_x = block(newest_x, cache=True)
655
+
656
+ newest_x = self.transformer.ln_f(newest_x)
657
+ newest_logit = logit_layer(newest_x)
658
+ # TODO: check this sampling call isn't slowing things down noticeably
659
+ new_idxs[:, generation_idx] = default_sample_token(
660
+ newest_logit[:, -1, :], valid_token_ranges[generation_idx], temperature, deterministic, top_k=top_k, top_p=top_p, min_tokens_to_keep=min_tokens_to_keep
661
+ )
662
+
663
+ # Combine indices
664
+ new_idxs = torch.cat((idx, new_idxs), dim=1)
665
+ return new_idxs
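
As a quick orientation for the classes above, a minimal (hypothetical) way to instantiate the model and generate tokens would be the following sketch; the config values are arbitrary toy numbers, and the shipped WHAM checkpoints define their own configurations.

import torch

config = GPTConfig(block_size=128, vocab_size=512, n_layer=2, n_head=2, n_embd=64, dropout=0.0, version=2)
model = GPT(config).eval()
context = torch.randint(0, config.vocab_size, (1, 16))  # (batch, tokens)

# Plain generation: re-runs the full context for every new token.
tokens = model.generate(context, max_new_tokens=8, temperature=1.0)

# Cached generation: faster, but context length + new tokens must fit within block_size.
tokens_fast = model.optimized_generate(context, num_new_tokens=8, temperature=1.0)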
wham/models/pl/__init__.py ADDED
File without changes
wham/models/pl/pl_base_model.py ADDED
@@ -0,0 +1,5 @@
1
+ import pytorch_lightning as pl
2
+
3
+ class BaseTrainingModel(pl.LightningModule):
4
+ def __init__(self, **kwargs):
5
+ super().__init__(**kwargs)
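
For context, training modules presumably subclass this base class; a minimal (hypothetical) subclass following the standard PyTorch Lightning pattern would look like:

import torch
import torch.nn.functional as F

class ToyTrainingModel(BaseTrainingModel):
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(4, 1)

    def training_step(self, batch, batch_idx):
        inputs, targets = batch
        loss = F.mse_loss(self.layer(inputs), targets)
        self.log("train_loss", loss)
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-3)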
wham/models/vqgan/taming/LICENSE ADDED
@@ -0,0 +1,24 @@
1
+ All files under this directory are originally from the taming-transformers repository:
2
+ https://github.com/CompVis/taming-transformers
3
+
4
+ Below is a copy of the original license
5
+ ------------------------------------------------------------------------------
6
+ Copyright (c) 2020 Patrick Esser and Robin Rombach and Björn Ommer
7
+
8
+ Permission is hereby granted, free of charge, to any person obtaining a copy
9
+ of this software and associated documentation files (the "Software"), to deal
10
+ in the Software without restriction, including without limitation the rights
11
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12
+ copies of the Software, and to permit persons to whom the Software is
13
+ furnished to do so, subject to the following conditions:
14
+
15
+ The above copyright notice and this permission notice shall be included in all
16
+ copies or substantial portions of the Software.
17
+
18
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
22
+ DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
23
+ OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
24
+ OR OTHER DEALINGS IN THE SOFTWARE.
wham/models/vqgan/taming/model.py ADDED
@@ -0,0 +1,696 @@
1
+ # All files under this directory are originally from the taming-transformers repository:
2
+ # https://github.com/CompVis/taming-transformers
3
+
4
+ # MIT License
5
+ # Copyright (c) 2020 Patrick Esser and Robin Rombach and Björn Ommer
6
+ # 2023 Microsoft Research
7
+
8
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
9
+ # of this software and associated documentation files (the "Software"), to deal
10
+ # in the Software without restriction, including without limitation the rights
11
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12
+ # copies of the Software, and to permit persons to whom the Software is
13
+ # furnished to do so, subject to the following conditions:
14
+
15
+ # The above copyright notice and this permission notice shall be included in all
16
+ # copies or substantial portions of the Software.
17
+
18
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21
+ # IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
22
+ # DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
23
+ # OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
24
+ # OR OTHER DEALINGS IN THE SOFTWARE.
25
+
26
+ import math
27
+ import torch
28
+ import torch.nn as nn
29
+ import numpy as np
30
+
31
+
32
+ def get_timestep_embedding(timesteps, embedding_dim):
33
+ """
34
+ This matches the implementation in Denoising Diffusion Probabilistic Models:
35
+ From Fairseq.
36
+ Build sinusoidal embeddings.
37
+ This matches the implementation in tensor2tensor, but differs slightly
38
+ from the description in Section 3.5 of "Attention Is All You Need".
39
+ """
40
+ assert len(timesteps.shape) == 1
41
+
42
+ half_dim = embedding_dim // 2
43
+ emb = math.log(10000) / (half_dim - 1)
44
+ emb = torch.exp(torch.arange(half_dim, dtype=torch.float32) * -emb)
45
+ emb = emb.to(device=timesteps.device)
46
+ emb = timesteps.float()[:, None] * emb[None, :]
47
+ emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
48
+ if embedding_dim % 2 == 1: # zero pad
49
+ emb = torch.nn.functional.pad(emb, (0, 1, 0, 0))
50
+ return emb
51
+
52
+
53
+ def nonlinearity(x):
54
+ # swish
55
+ return x * torch.sigmoid(x)
56
+
57
+
58
+ def Normalize(in_channels):
59
+ return torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
60
+
61
+
62
+ class Upsample(nn.Module):
63
+ def __init__(self, in_channels, with_conv):
64
+ super().__init__()
65
+ self.with_conv = with_conv
66
+ if self.with_conv:
67
+ self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
68
+
69
+ def forward(self, x):
70
+ x = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
71
+ if self.with_conv:
72
+ x = self.conv(x)
73
+ return x
74
+
75
+
76
+ class Downsample(nn.Module):
77
+ def __init__(self, in_channels, with_conv):
78
+ super().__init__()
79
+ self.with_conv = with_conv
80
+ if self.with_conv:
81
+ # no asymmetric padding in torch conv, must do it ourselves
82
+ self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0)
83
+
84
+ def forward(self, x):
85
+ if self.with_conv:
86
+ pad = (0, 1, 0, 1)
87
+ x = torch.nn.functional.pad(x, pad, mode="constant", value=0)
88
+ x = self.conv(x)
89
+ else:
90
+ x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2)
91
+ return x
92
+
93
+
94
+ class ResnetBlock(nn.Module):
95
+ def __init__(self, *, in_channels, out_channels=None, conv_shortcut=False, dropout, temb_channels=512):
96
+ super().__init__()
97
+ self.in_channels = in_channels
98
+ out_channels = in_channels if out_channels is None else out_channels
99
+ self.out_channels = out_channels
100
+ self.use_conv_shortcut = conv_shortcut
101
+
102
+ self.norm1 = Normalize(in_channels)
103
+ self.conv1 = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
104
+ if temb_channels > 0:
105
+ self.temb_proj = torch.nn.Linear(temb_channels, out_channels)
106
+ self.norm2 = Normalize(out_channels)
107
+ self.dropout = torch.nn.Dropout(dropout)
108
+ self.conv2 = torch.nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
109
+ if self.in_channels != self.out_channels:
110
+ if self.use_conv_shortcut:
111
+ self.conv_shortcut = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
112
+ else:
113
+ self.nin_shortcut = torch.nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
114
+
115
+ def forward(self, x, temb):
116
+ h = x
117
+ h = self.norm1(h)
118
+ h = nonlinearity(h)
119
+ h = self.conv1(h)
120
+
121
+ if temb is not None:
122
+ h = h + self.temb_proj(nonlinearity(temb))[:, :, None, None]
123
+
124
+ h = self.norm2(h)
125
+ h = nonlinearity(h)
126
+ h = self.dropout(h)
127
+ h = self.conv2(h)
128
+
129
+ if self.in_channels != self.out_channels:
130
+ if self.use_conv_shortcut:
131
+ x = self.conv_shortcut(x)
132
+ else:
133
+ x = self.nin_shortcut(x)
134
+
135
+ return x + h
136
+
137
+
138
+ class AttnBlock(nn.Module):
139
+ def __init__(self, in_channels):
140
+ super().__init__()
141
+ self.in_channels = in_channels
142
+
143
+ self.norm = Normalize(in_channels)
144
+ self.q = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
145
+ self.k = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
146
+ self.v = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
147
+ self.proj_out = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
148
+
149
+ def forward(self, x):
150
+ h_ = x
151
+ h_ = self.norm(h_)
152
+ q = self.q(h_)
153
+ k = self.k(h_)
154
+ v = self.v(h_)
155
+
156
+ # compute attention
157
+ b, c, h, w = q.shape
158
+ q = q.reshape(b, c, h * w)
159
+ q = q.permute(0, 2, 1) # b,hw,c
160
+ k = k.reshape(b, c, h * w) # b,c,hw
161
+ w_ = torch.bmm(q, k) # b,hw,hw w[b,i,j]=sum_c q[b,i,c]k[b,c,j]
162
+ w_ = w_ * (int(c) ** (-0.5))
163
+ w_ = torch.nn.functional.softmax(w_, dim=2)
164
+
165
+ # attend to values
166
+ v = v.reshape(b, c, h * w)
167
+ w_ = w_.permute(0, 2, 1) # b,hw,hw (first hw of k, second of q)
168
+ h_ = torch.bmm(v, w_) # b, c,hw (hw of q) h_[b,c,j] = sum_i v[b,c,i] w_[b,i,j]
169
+ h_ = h_.reshape(b, c, h, w)
170
+
171
+ h_ = self.proj_out(h_)
172
+
173
+ return x + h_
174
+
175
+
176
+ class Model(nn.Module):
177
+ def __init__(
178
+ self, *, ch, out_ch, ch_mult=(1, 2, 4, 8), num_res_blocks, attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels, resolution, use_timestep=True
179
+ ):
180
+ super().__init__()
181
+ self.ch = ch
182
+ self.temb_ch = self.ch * 4
183
+ self.num_resolutions = len(ch_mult)
184
+ self.num_res_blocks = num_res_blocks
185
+ self.resolution = resolution
186
+ self.in_channels = in_channels
187
+
188
+ self.use_timestep = use_timestep
189
+ if self.use_timestep:
190
+ # timestep embedding
191
+ self.temb = nn.Module()
192
+ self.temb.dense = nn.ModuleList(
193
+ [
194
+ torch.nn.Linear(self.ch, self.temb_ch),
195
+ torch.nn.Linear(self.temb_ch, self.temb_ch),
196
+ ]
197
+ )
198
+
199
+ # downsampling
200
+ self.conv_in = torch.nn.Conv2d(in_channels, self.ch, kernel_size=3, stride=1, padding=1)
201
+
202
+ curr_res = resolution
203
+ in_ch_mult = (1,) + tuple(ch_mult)
204
+ self.down = nn.ModuleList()
205
+ for i_level in range(self.num_resolutions):
206
+ block = nn.ModuleList()
207
+ attn = nn.ModuleList()
208
+ block_in = ch * in_ch_mult[i_level]
209
+ block_out = ch * ch_mult[i_level]
210
+ for i_block in range(self.num_res_blocks):
211
+ block.append(ResnetBlock(in_channels=block_in, out_channels=block_out, temb_channels=self.temb_ch, dropout=dropout))
212
+ block_in = block_out
213
+ if curr_res in attn_resolutions:
214
+ attn.append(AttnBlock(block_in))
215
+ down = nn.Module()
216
+ down.block = block
217
+ down.attn = attn
218
+ if i_level != self.num_resolutions - 1:
219
+ down.downsample = Downsample(block_in, resamp_with_conv)
220
+ curr_res = curr_res // 2
221
+ self.down.append(down)
222
+
223
+ # middle
224
+ self.mid = nn.Module()
225
+ self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in, temb_channels=self.temb_ch, dropout=dropout)
226
+ self.mid.attn_1 = AttnBlock(block_in)
227
+ self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in, temb_channels=self.temb_ch, dropout=dropout)
228
+
229
+ # upsampling
230
+ self.up = nn.ModuleList()
231
+ for i_level in reversed(range(self.num_resolutions)):
232
+ block = nn.ModuleList()
233
+ attn = nn.ModuleList()
234
+ block_out = ch * ch_mult[i_level]
235
+ skip_in = ch * ch_mult[i_level]
236
+ for i_block in range(self.num_res_blocks + 1):
237
+ if i_block == self.num_res_blocks:
238
+ skip_in = ch * in_ch_mult[i_level]
239
+ block.append(ResnetBlock(in_channels=block_in + skip_in, out_channels=block_out, temb_channels=self.temb_ch, dropout=dropout))
240
+ block_in = block_out
241
+ if curr_res in attn_resolutions:
242
+ attn.append(AttnBlock(block_in))
243
+ up = nn.Module()
244
+ up.block = block
245
+ up.attn = attn
246
+ if i_level != 0:
247
+ up.upsample = Upsample(block_in, resamp_with_conv)
248
+ curr_res = curr_res * 2
249
+ self.up.insert(0, up) # prepend to get consistent order
250
+
251
+ # end
252
+ self.norm_out = Normalize(block_in)
253
+ self.conv_out = torch.nn.Conv2d(block_in, out_ch, kernel_size=3, stride=1, padding=1)
254
+
255
+ def forward(self, x, t=None):
256
+ # assert x.shape[2] == x.shape[3] == self.resolution
257
+
258
+ if self.use_timestep:
259
+ # timestep embedding
260
+ assert t is not None
261
+ temb = get_timestep_embedding(t, self.ch)
262
+ temb = self.temb.dense[0](temb)
263
+ temb = nonlinearity(temb)
264
+ temb = self.temb.dense[1](temb)
265
+ else:
266
+ temb = None
267
+
268
+ # downsampling
269
+ hs = [self.conv_in(x)]
270
+ for i_level in range(self.num_resolutions):
271
+ for i_block in range(self.num_res_blocks):
272
+ h = self.down[i_level].block[i_block](hs[-1], temb)
273
+ if len(self.down[i_level].attn) > 0:
274
+ h = self.down[i_level].attn[i_block](h)
275
+ hs.append(h)
276
+ if i_level != self.num_resolutions - 1:
277
+ hs.append(self.down[i_level].downsample(hs[-1]))
278
+
279
+ # middle
280
+ h = hs[-1]
281
+ h = self.mid.block_1(h, temb)
282
+ h = self.mid.attn_1(h)
283
+ h = self.mid.block_2(h, temb)
284
+
285
+ # upsampling
286
+ for i_level in reversed(range(self.num_resolutions)):
287
+ for i_block in range(self.num_res_blocks + 1):
288
+ h = self.up[i_level].block[i_block](torch.cat([h, hs.pop()], dim=1), temb)
289
+ if len(self.up[i_level].attn) > 0:
290
+ h = self.up[i_level].attn[i_block](h)
291
+ if i_level != 0:
292
+ h = self.up[i_level].upsample(h)
293
+
294
+ # end
295
+ h = self.norm_out(h)
296
+ h = nonlinearity(h)
297
+ h = self.conv_out(h)
298
+ return h
299
+
300
+
301
+ class Encoder(nn.Module):
302
+ def __init__(
303
+ self,
304
+ *,
305
+ ch,
306
+ out_ch,
307
+ ch_mult=(1, 2, 4, 8),
308
+ num_res_blocks,
309
+ attn_resolutions,
310
+ dropout=0.0,
311
+ resamp_with_conv=True,
312
+ in_channels,
313
+ resolution,
314
+ z_channels,
315
+ double_z=True,
316
+ **ignore_kwargs
317
+ ):
318
+ super().__init__()
319
+ self.ch = ch
320
+ self.temb_ch = 0
321
+ self.num_resolutions = len(ch_mult)
322
+ self.num_res_blocks = num_res_blocks
323
+ self.resolution = resolution
324
+ self.in_channels = in_channels
325
+
326
+ # downsampling
327
+ self.conv_in = torch.nn.Conv2d(in_channels, self.ch, kernel_size=3, stride=1, padding=1)
328
+
329
+ curr_res = resolution
330
+ in_ch_mult = (1,) + tuple(ch_mult)
331
+ self.down = nn.ModuleList()
332
+ for i_level in range(self.num_resolutions):
333
+ block = nn.ModuleList()
334
+ attn = nn.ModuleList()
335
+ block_in = ch * in_ch_mult[i_level]
336
+ block_out = ch * ch_mult[i_level]
337
+ for i_block in range(self.num_res_blocks):
338
+ block.append(ResnetBlock(in_channels=block_in, out_channels=block_out, temb_channels=self.temb_ch, dropout=dropout))
339
+ block_in = block_out
340
+ if curr_res in attn_resolutions:
341
+ attn.append(AttnBlock(block_in))
342
+ down = nn.Module()
343
+ down.block = block
344
+ down.attn = attn
345
+ if i_level != self.num_resolutions - 1:
346
+ down.downsample = Downsample(block_in, resamp_with_conv)
347
+ curr_res = curr_res // 2
348
+ self.down.append(down)
349
+
350
+ # middle
351
+ self.mid = nn.Module()
352
+ self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in, temb_channels=self.temb_ch, dropout=dropout)
353
+ self.mid.attn_1 = AttnBlock(block_in)
354
+ self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in, temb_channels=self.temb_ch, dropout=dropout)
355
+
356
+ # end
357
+ self.norm_out = Normalize(block_in)
358
+ self.conv_out = torch.nn.Conv2d(block_in, 2 * z_channels if double_z else z_channels, kernel_size=3, stride=1, padding=1)
359
+
360
+ def forward(self, x):
361
+ # assert x.shape[2] == x.shape[3] == self.resolution, "{}, {}, {}".format(x.shape[2], x.shape[3], self.resolution)
362
+
363
+ # timestep embedding
364
+ temb = None
365
+
366
+ # downsampling
367
+ hs = [self.conv_in(x)]
368
+ for i_level in range(self.num_resolutions):
369
+ for i_block in range(self.num_res_blocks):
370
+ h = self.down[i_level].block[i_block](hs[-1], temb)
371
+ if len(self.down[i_level].attn) > 0:
372
+ h = self.down[i_level].attn[i_block](h)
373
+ hs.append(h)
374
+ if i_level != self.num_resolutions - 1:
375
+ hs.append(self.down[i_level].downsample(hs[-1]))
376
+
377
+ # middle
378
+ h = hs[-1]
379
+ h = self.mid.block_1(h, temb)
380
+ h = self.mid.attn_1(h)
381
+ h = self.mid.block_2(h, temb)
382
+
383
+ # end
384
+ h = self.norm_out(h)
385
+ h = nonlinearity(h)
386
+ h = self.conv_out(h)
387
+ return h
388
+
389
+
390
+ class Decoder(nn.Module):
391
+ def __init__(
392
+ self,
393
+ *,
394
+ ch,
395
+ out_ch,
396
+ ch_mult=(1, 2, 4, 8),
397
+ num_res_blocks,
398
+ attn_resolutions,
399
+ dropout=0.0,
400
+ resamp_with_conv=True,
401
+ in_channels,
402
+ resolution,
403
+ z_channels,
404
+ give_pre_end=False,
405
+ **ignorekwargs
406
+ ):
407
+ super().__init__()
408
+ self.ch = ch
409
+ self.temb_ch = 0
410
+ self.num_resolutions = len(ch_mult)
411
+ self.num_res_blocks = num_res_blocks
412
+ self.resolution = resolution
413
+ self.in_channels = in_channels
414
+ self.give_pre_end = give_pre_end
415
+
416
+ # compute in_ch_mult, block_in and curr_res at lowest res
417
+ in_ch_mult = (1,) + tuple(ch_mult)
418
+ block_in = ch * ch_mult[self.num_resolutions - 1]
419
+ curr_res = resolution // 2 ** (self.num_resolutions - 1)
420
+ self.z_shape = (1, z_channels, curr_res, curr_res)
421
+
422
+ # z to block_in
423
+ self.conv_in = torch.nn.Conv2d(z_channels, block_in, kernel_size=3, stride=1, padding=1)
424
+
425
+ # middle
426
+ self.mid = nn.Module()
427
+ self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in, temb_channels=self.temb_ch, dropout=dropout)
428
+ self.mid.attn_1 = AttnBlock(block_in)
429
+ self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in, temb_channels=self.temb_ch, dropout=dropout)
430
+
431
+ # upsampling
432
+ self.up = nn.ModuleList()
433
+ for i_level in reversed(range(self.num_resolutions)):
434
+ block = nn.ModuleList()
435
+ attn = nn.ModuleList()
436
+ block_out = ch * ch_mult[i_level]
437
+ for i_block in range(self.num_res_blocks + 1):
438
+ block.append(ResnetBlock(in_channels=block_in, out_channels=block_out, temb_channels=self.temb_ch, dropout=dropout))
439
+ block_in = block_out
440
+ if curr_res in attn_resolutions:
441
+ attn.append(AttnBlock(block_in))
442
+ up = nn.Module()
443
+ up.block = block
444
+ up.attn = attn
445
+ if i_level != 0:
446
+ up.upsample = Upsample(block_in, resamp_with_conv)
447
+ curr_res = curr_res * 2
448
+ self.up.insert(0, up) # prepend to get consistent order
449
+
450
+ # end
451
+ self.norm_out = Normalize(block_in)
452
+ self.conv_out = torch.nn.Conv2d(block_in, out_ch, kernel_size=3, stride=1, padding=1)
453
+
454
+ def forward(self, z):
455
+ # assert z.shape[1:] == self.z_shape[1:]
456
+ self.last_z_shape = z.shape
457
+
458
+ # timestep embedding
459
+ temb = None
460
+
461
+ # z to block_in
462
+ h = self.conv_in(z)
463
+
464
+ # middle
465
+ h = self.mid.block_1(h, temb)
466
+ h = self.mid.attn_1(h)
467
+ h = self.mid.block_2(h, temb)
468
+
469
+ # upsampling
470
+ for i_level in reversed(range(self.num_resolutions)):
471
+ for i_block in range(self.num_res_blocks + 1):
472
+ h = self.up[i_level].block[i_block](h, temb)
473
+ if len(self.up[i_level].attn) > 0:
474
+ h = self.up[i_level].attn[i_block](h)
475
+ if i_level != 0:
476
+ h = self.up[i_level].upsample(h)
477
+
478
+ # end
479
+ if self.give_pre_end:
480
+ return h
481
+
482
+ h = self.norm_out(h)
483
+ h = nonlinearity(h)
484
+ h = self.conv_out(h)
485
+ return h
486
+
487
+
488
+ class VUNet(nn.Module):
489
+ def __init__(
490
+ self,
491
+ *,
492
+ ch,
493
+ out_ch,
494
+ ch_mult=(1, 2, 4, 8),
495
+ num_res_blocks,
496
+ attn_resolutions,
497
+ dropout=0.0,
498
+ resamp_with_conv=True,
499
+ in_channels,
500
+ c_channels,
501
+ resolution,
502
+ z_channels,
503
+ use_timestep=False,
504
+ **ignore_kwargs
505
+ ):
506
+ super().__init__()
507
+ self.ch = ch
508
+ self.temb_ch = self.ch * 4
509
+ self.num_resolutions = len(ch_mult)
510
+ self.num_res_blocks = num_res_blocks
511
+ self.resolution = resolution
512
+
513
+ self.use_timestep = use_timestep
514
+ if self.use_timestep:
515
+ # timestep embedding
516
+ self.temb = nn.Module()
517
+ self.temb.dense = nn.ModuleList(
518
+ [
519
+ torch.nn.Linear(self.ch, self.temb_ch),
520
+ torch.nn.Linear(self.temb_ch, self.temb_ch),
521
+ ]
522
+ )
523
+
524
+ # downsampling
525
+ self.conv_in = torch.nn.Conv2d(c_channels, self.ch, kernel_size=3, stride=1, padding=1)
526
+
527
+ curr_res = resolution
528
+ in_ch_mult = (1,) + tuple(ch_mult)
529
+ self.down = nn.ModuleList()
530
+ for i_level in range(self.num_resolutions):
531
+ block = nn.ModuleList()
532
+ attn = nn.ModuleList()
533
+ block_in = ch * in_ch_mult[i_level]
534
+ block_out = ch * ch_mult[i_level]
535
+ for i_block in range(self.num_res_blocks):
536
+ block.append(ResnetBlock(in_channels=block_in, out_channels=block_out, temb_channels=self.temb_ch, dropout=dropout))
537
+ block_in = block_out
538
+ if curr_res in attn_resolutions:
539
+ attn.append(AttnBlock(block_in))
540
+ down = nn.Module()
541
+ down.block = block
542
+ down.attn = attn
543
+ if i_level != self.num_resolutions - 1:
544
+ down.downsample = Downsample(block_in, resamp_with_conv)
545
+ curr_res = curr_res // 2
546
+ self.down.append(down)
547
+
548
+ self.z_in = torch.nn.Conv2d(z_channels, block_in, kernel_size=1, stride=1, padding=0)
549
+ # middle
550
+ self.mid = nn.Module()
551
+ self.mid.block_1 = ResnetBlock(in_channels=2 * block_in, out_channels=block_in, temb_channels=self.temb_ch, dropout=dropout)
552
+ self.mid.attn_1 = AttnBlock(block_in)
553
+ self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in, temb_channels=self.temb_ch, dropout=dropout)
554
+
555
+ # upsampling
556
+ self.up = nn.ModuleList()
557
+ for i_level in reversed(range(self.num_resolutions)):
558
+ block = nn.ModuleList()
559
+ attn = nn.ModuleList()
560
+ block_out = ch * ch_mult[i_level]
561
+ skip_in = ch * ch_mult[i_level]
562
+ for i_block in range(self.num_res_blocks + 1):
563
+ if i_block == self.num_res_blocks:
564
+ skip_in = ch * in_ch_mult[i_level]
565
+ block.append(ResnetBlock(in_channels=block_in + skip_in, out_channels=block_out, temb_channels=self.temb_ch, dropout=dropout))
566
+ block_in = block_out
567
+ if curr_res in attn_resolutions:
568
+ attn.append(AttnBlock(block_in))
569
+ up = nn.Module()
570
+ up.block = block
571
+ up.attn = attn
572
+ if i_level != 0:
573
+ up.upsample = Upsample(block_in, resamp_with_conv)
574
+ curr_res = curr_res * 2
575
+ self.up.insert(0, up) # prepend to get consistent order
576
+
577
+ # end
578
+ self.norm_out = Normalize(block_in)
579
+ self.conv_out = torch.nn.Conv2d(block_in, out_ch, kernel_size=3, stride=1, padding=1)
580
+
581
+ def forward(self, x, z, t=None):  # t is only used (and then required) when use_timestep=True
582
+ # assert x.shape[2] == x.shape[3] == self.resolution
583
+
584
+ if self.use_timestep:
585
+ # timestep embedding
586
+ assert t is not None
587
+ temb = get_timestep_embedding(t, self.ch)
588
+ temb = self.temb.dense[0](temb)
589
+ temb = nonlinearity(temb)
590
+ temb = self.temb.dense[1](temb)
591
+ else:
592
+ temb = None
593
+
594
+ # downsampling
595
+ hs = [self.conv_in(x)]
596
+ for i_level in range(self.num_resolutions):
597
+ for i_block in range(self.num_res_blocks):
598
+ h = self.down[i_level].block[i_block](hs[-1], temb)
599
+ if len(self.down[i_level].attn) > 0:
600
+ h = self.down[i_level].attn[i_block](h)
601
+ hs.append(h)
602
+ if i_level != self.num_resolutions - 1:
603
+ hs.append(self.down[i_level].downsample(hs[-1]))
604
+
605
+ # middle
606
+ h = hs[-1]
607
+ z = self.z_in(z)
608
+ h = torch.cat((h, z), dim=1)
609
+ h = self.mid.block_1(h, temb)
610
+ h = self.mid.attn_1(h)
611
+ h = self.mid.block_2(h, temb)
612
+
613
+ # upsampling
614
+ for i_level in reversed(range(self.num_resolutions)):
615
+ for i_block in range(self.num_res_blocks + 1):
616
+ h = self.up[i_level].block[i_block](torch.cat([h, hs.pop()], dim=1), temb)
617
+ if len(self.up[i_level].attn) > 0:
618
+ h = self.up[i_level].attn[i_block](h)
619
+ if i_level != 0:
620
+ h = self.up[i_level].upsample(h)
621
+
622
+ # end
623
+ h = self.norm_out(h)
624
+ h = nonlinearity(h)
625
+ h = self.conv_out(h)
626
+ return h
627
+
628
+
629
+ class SimpleDecoder(nn.Module):
630
+ def __init__(self, in_channels, out_channels, *args, **kwargs):
631
+ super().__init__()
632
+ self.model = nn.ModuleList(
633
+ [
634
+ nn.Conv2d(in_channels, in_channels, 1),
635
+ ResnetBlock(in_channels=in_channels, out_channels=2 * in_channels, temb_channels=0, dropout=0.0),
636
+ ResnetBlock(in_channels=2 * in_channels, out_channels=4 * in_channels, temb_channels=0, dropout=0.0),
637
+ ResnetBlock(in_channels=4 * in_channels, out_channels=2 * in_channels, temb_channels=0, dropout=0.0),
638
+ nn.Conv2d(2 * in_channels, in_channels, 1),
639
+ Upsample(in_channels, with_conv=True),
640
+ ]
641
+ )
642
+ # end
643
+ self.norm_out = Normalize(in_channels)
644
+ self.conv_out = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
645
+
646
+ def forward(self, x):
647
+ for i, layer in enumerate(self.model):
648
+ if i in [1, 2, 3]:
649
+ x = layer(x, None)
650
+ else:
651
+ x = layer(x)
652
+
653
+ h = self.norm_out(x)
654
+ h = nonlinearity(h)
655
+ x = self.conv_out(h)
656
+ return x
657
+
658
+
659
+ class UpsampleDecoder(nn.Module):
660
+ def __init__(self, in_channels, out_channels, ch, num_res_blocks, resolution, ch_mult=(2, 2), dropout=0.0):
661
+ super().__init__()
662
+ # upsampling
663
+ self.temb_ch = 0
664
+ self.num_resolutions = len(ch_mult)
665
+ self.num_res_blocks = num_res_blocks
666
+ block_in = in_channels
667
+ curr_res = resolution // 2 ** (self.num_resolutions - 1)
668
+ self.res_blocks = nn.ModuleList()
669
+ self.upsample_blocks = nn.ModuleList()
670
+ for i_level in range(self.num_resolutions):
671
+ res_block = []
672
+ block_out = ch * ch_mult[i_level]
673
+ for i_block in range(self.num_res_blocks + 1):
674
+ res_block.append(ResnetBlock(in_channels=block_in, out_channels=block_out, temb_channels=self.temb_ch, dropout=dropout))
675
+ block_in = block_out
676
+ self.res_blocks.append(nn.ModuleList(res_block))
677
+ if i_level != self.num_resolutions - 1:
678
+ self.upsample_blocks.append(Upsample(block_in, True))
679
+ curr_res = curr_res * 2
680
+
681
+ # end
682
+ self.norm_out = Normalize(block_in)
683
+ self.conv_out = torch.nn.Conv2d(block_in, out_channels, kernel_size=3, stride=1, padding=1)
684
+
685
+ def forward(self, x):
686
+ # upsampling
687
+ h = x
688
+ for k, i_level in enumerate(range(self.num_resolutions)):
689
+ for i_block in range(self.num_res_blocks + 1):
690
+ h = self.res_blocks[i_level][i_block](h, None)
691
+ if i_level != self.num_resolutions - 1:
692
+ h = self.upsample_blocks[k](h)
693
+ h = self.norm_out(h)
694
+ h = nonlinearity(h)
695
+ h = self.conv_out(h)
696
+ return h
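
The decoders above (Decoder in particular) turn a spatial latent back into an RGB image by mirroring the encoder's downsampling. As a minimal sketch of the shapes involved (the config values below are illustrative, loosely scaled down from the ddconfig used by TamingVQModel later in this diff; weights are random, so only shapes are meaningful):

import torch

# Assumes Decoder and its helpers (ResnetBlock, AttnBlock, Upsample, Normalize, nonlinearity)
# from this file are in scope.
ddconfig = dict(
    z_channels=16, resolution=128, in_channels=3, out_ch=3,
    ch=32, ch_mult=[1, 1, 1, 1, 1], num_res_blocks=1, attn_resolutions=[16], dropout=0.0,
)
decoder = Decoder(**ddconfig)
# len(ch_mult) == 5 gives four 2x upsampling stages, so an 8x8 latent becomes a 128x128 image.
z = torch.randn(2, ddconfig["z_channels"], 8, 8)
with torch.no_grad():
    img = decoder(z)
print(img.shape)  # torch.Size([2, 3, 128, 128])
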
wham/models/vqgan/taming/quantize.py ADDED
@@ -0,0 +1,146 @@
1
+ # All files under this directory are originally from the taming-transformers repository:
2
+ # https://github.com/CompVis/taming-transformers
3
+
4
+ # MIT License
5
+ # Copyright (c) 2020 Patrick Esser and Robin Rombach and Björn Ommer
6
+ # 2023 Microsoft Research
7
+
8
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
9
+ # of this software and associated documentation files (the "Software"), to deal
10
+ # in the Software without restriction, including without limitation the rights
11
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12
+ # copies of the Software, and to permit persons to whom the Software is
13
+ # furnished to do so, subject to the following conditions:
14
+
15
+ # The above copyright notice and this permission notice shall be included in all
16
+ # copies or substantial portions of the Software.
17
+
18
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21
+ # IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
22
+ # DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
23
+ # OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
24
+ # OR OTHER DEALINGS IN THE SOFTWARE.
25
+
26
+ import torch
27
+ import torch.nn as nn
28
+ import numpy as np
29
+ from einops import rearrange
30
+
31
+
32
+ class VectorQuantizer2(nn.Module):
33
+ """
34
+ Improved version over VectorQuantizer, can be used as a drop-in replacement. Mostly
35
+ avoids costly matrix multiplications and allows for post-hoc remapping of indices.
36
+ """
37
+
38
+ # NOTE: due to a bug the beta term was applied to the wrong term. for
39
+ # backwards compatibility we use the buggy version by default, but you can
40
+ # specify legacy=False to fix it.
41
+ def __init__(self, n_e, e_dim, beta, remap=None, unknown_index="random", sane_index_shape=False, legacy=True):
42
+ super().__init__()
43
+ self.n_e = n_e
44
+ self.e_dim = e_dim
45
+ self.beta = beta
46
+ self.legacy = legacy
47
+
48
+ self.embedding = nn.Embedding(self.n_e, self.e_dim)
49
+ self.embedding.weight.data.uniform_(-1.0 / self.n_e, 1.0 / self.n_e)
50
+
51
+ self.remap = remap
52
+ if self.remap is not None:
53
+ self.register_buffer("used", torch.tensor(np.load(self.remap)))
54
+ self.re_embed = self.used.shape[0]
55
+ self.unknown_index = unknown_index # "random" or "extra" or integer
56
+ if self.unknown_index == "extra":
57
+ self.unknown_index = self.re_embed
58
+ self.re_embed = self.re_embed + 1
59
+ print(f"Remapping {self.n_e} indices to {self.re_embed} indices. " f"Using {self.unknown_index} for unknown indices.")
60
+ else:
61
+ self.re_embed = n_e
62
+
63
+ self.sane_index_shape = sane_index_shape
64
+
65
+ def remap_to_used(self, inds):
66
+ ishape = inds.shape
67
+ assert len(ishape) > 1
68
+ inds = inds.reshape(ishape[0], -1)
69
+ used = self.used.to(inds)
70
+ match = (inds[:, :, None] == used[None, None, ...]).long()
71
+ new = match.argmax(-1)
72
+ unknown = match.sum(2) < 1
73
+ if self.unknown_index == "random":
74
+ new[unknown] = torch.randint(0, self.re_embed, size=new[unknown].shape).to(device=new.device)
75
+ else:
76
+ new[unknown] = self.unknown_index
77
+ return new.reshape(ishape)
78
+
79
+ def unmap_to_all(self, inds):
80
+ ishape = inds.shape
81
+ assert len(ishape) > 1
82
+ inds = inds.reshape(ishape[0], -1)
83
+ used = self.used.to(inds)
84
+ if self.re_embed > self.used.shape[0]: # extra token
85
+ inds[inds >= self.used.shape[0]] = 0 # simply set to zero
86
+ back = torch.gather(used[None, :][inds.shape[0] * [0], :], 1, inds)
87
+ return back.reshape(ishape)
88
+
89
+ def forward(self, z, temp=None, rescale_logits=False, return_logits=False):
90
+ assert temp is None or temp == 1.0, "Only for interface compatible with Gumbel"
91
+ assert rescale_logits == False, "Only for interface compatible with Gumbel"
92
+ assert return_logits == False, "Only for interface compatible with Gumbel"
93
+ # reshape z -> (batch, height, width, channel) and flatten
94
+ z = rearrange(z, "b c h w -> b h w c").contiguous()
95
+ z_flattened = z.view(-1, self.e_dim)
96
+ # distances from z to embeddings e_j (z - e)^2 = z^2 + e^2 - 2 e * z
97
+
98
+ d = (
99
+ torch.sum(z_flattened**2, dim=1, keepdim=True)
100
+ + torch.sum(self.embedding.weight**2, dim=1)
101
+ - 2 * torch.einsum("bd,dn->bn", z_flattened, rearrange(self.embedding.weight, "n d -> d n"))
102
+ )
103
+
104
+ min_encoding_indices = torch.argmin(d, dim=1)
105
+ z_q = self.embedding(min_encoding_indices).view(z.shape)
106
+ perplexity = None
107
+ min_encodings = None
108
+
109
+ # compute loss for embedding
110
+ if not self.legacy:
111
+ loss = self.beta * torch.mean((z_q.detach() - z) ** 2) + torch.mean((z_q - z.detach()) ** 2)
112
+ else:
113
+ loss = torch.mean((z_q.detach() - z) ** 2) + self.beta * torch.mean((z_q - z.detach()) ** 2)
114
+
115
+ # preserve gradients
116
+ z_q = z + (z_q - z).detach()
117
+
118
+ # reshape back to match original input shape
119
+ z_q = rearrange(z_q, "b h w c -> b c h w").contiguous()
120
+
121
+ if self.remap is not None:
122
+ min_encoding_indices = min_encoding_indices.reshape(z.shape[0], -1) # add batch axis
123
+ min_encoding_indices = self.remap_to_used(min_encoding_indices)
124
+ min_encoding_indices = min_encoding_indices.reshape(-1, 1) # flatten
125
+
126
+ if self.sane_index_shape:
127
+ min_encoding_indices = min_encoding_indices.reshape(z_q.shape[0], z_q.shape[2], z_q.shape[3])
128
+
129
+ return z_q, loss, (perplexity, min_encodings, min_encoding_indices)
130
+
131
+ def get_codebook_entry(self, indices, shape):
132
+ # shape specifying (batch, height, width, channel)
133
+ if self.remap is not None:
134
+ indices = indices.reshape(shape[0], -1) # add batch axis
135
+ indices = self.unmap_to_all(indices)
136
+ indices = indices.reshape(-1) # flatten again
137
+
138
+ # get quantized latent vectors
139
+ z_q = self.embedding(indices)
140
+
141
+ if shape is not None:
142
+ z_q = z_q.view(shape)
143
+ # reshape back to match original input shape
144
+ z_q = z_q.permute(0, 3, 1, 2).contiguous()
145
+
146
+ return z_q
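
For orientation, a minimal sketch of the quantizer's interface with default settings (no index remapping; the codebook is randomly initialised, so the indices are arbitrary):

import torch

vq = VectorQuantizer2(n_e=1024, e_dim=8, beta=0.25)
z = torch.randn(2, 8, 16, 16)      # (batch, e_dim, height, width), e.g. encoder features
z_q, loss, (_, _, indices) = vq(z)
print(z_q.shape)                   # torch.Size([2, 8, 16, 16]); straight-through gradients w.r.t. z
print(indices.shape)               # torch.Size([512]) = batch*height*width flat codebook ids
                                   # (pass sane_index_shape=True to get them as (batch, height, width))
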
wham/models/vqgan/taming_vq_model.py ADDED
@@ -0,0 +1,264 @@
1
+ # Wrapper for the VQ models from the taming-transformers repo
2
+ # https://github.com/CompVis/taming-transformers
3
+
4
+ from typing import Any, Mapping
5
+ import pytorch_lightning as pl
6
+ import torch
7
+ import torch.nn.functional as F
8
+
9
+ from wham.models.vqgan.taming.model import Encoder, Decoder
10
+ from wham.models.vqgan.taming.quantize import VectorQuantizer2 as VectorQuantizer
11
+
12
+ from wham.models.wham_base.tensor_spaces import TensorSpace
13
+ from wham.models.wham_base.encoder_decoder import EncoderDecoderBase
14
+
15
+
16
+ HARDCODED_IMAGE_SIZE = 128
17
+
18
+
19
+ def taming_vq_preprocess_images(imgs):
20
+ """Normalize images (as pytorch tensor uint8s) as in taming-transformers"""
21
+ return imgs.float() / 127.5 - 1.0
22
+
23
+
24
+ def taming_vq_revert_preprocess_images(imgs):
25
+ """Revert preprocessing of images from taming to uint8 as in taming-transformers"""
26
+ # Clamp first
27
+ imgs = torch.clamp(imgs, -1.0, 1.0)
28
+ return ((imgs + 1) * 127.5).byte()
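
A quick numeric check of the round trip defined by these two helpers (uint8 [0, 255] maps to float [-1, 1] and back):

import torch

pixels = torch.tensor([0, 255], dtype=torch.uint8)
normed = taming_vq_preprocess_images(pixels)           # tensor([-1., 1.])
restored = taming_vq_revert_preprocess_images(normed)  # tensor([0, 255], dtype=torch.uint8)
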
29
+
30
+
31
+ class _VQModelFromTamingRepository(pl.LightningModule):
32
+ """
33
+ This aims to be the original VQ model from the taming-transformers repo with as few modifications as possible. This should not be used directly.
34
+ Source: https://github.com/CompVis/taming-transformers/blob/master/taming/models/vqgan.py
35
+
36
+ MIT License
37
+ Copyright (c) 2020 Patrick Esser and Robin Rombach and Björn Ommer
38
+ 2023 Microsoft Research
39
+
40
+ Permission is hereby granted, free of charge, to any person obtaining a copy
41
+ of this software and associated documentation files (the "Software"), to deal
42
+ in the Software without restriction, including without limitation the rights
43
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
44
+ copies of the Software, and to permit persons to whom the Software is
45
+ furnished to do so, subject to the following conditions:
46
+
47
+ The above copyright notice and this permission notice shall be included in all
48
+ copies or substantial portions of the Software.
49
+
50
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
51
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
52
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
53
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
54
+ DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
55
+ OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
56
+ OR OTHER DEALINGS IN THE SOFTWARE.
57
+ """
58
+
59
+ def __init__(
60
+ self,
61
+ ddconfig,
62
+ n_embed,
63
+ embed_dim,
64
+ ckpt_path=None,
65
+ ignore_keys=[],
66
+ image_key="image",
67
+ colorize_nlabels=None,
68
+ monitor=None,
69
+ remap=None,
70
+ sane_index_shape=False, # tell vector quantizer to return indices as bhw
71
+ ):
72
+ super().__init__()
73
+ self.image_key = image_key
74
+ self.encoder = Encoder(**ddconfig)
75
+ self.decoder = Decoder(**ddconfig)
76
+ # NOTE: Loss is disabled for this repo (we only want inference)
77
+ self.quantize = VectorQuantizer(n_embed, embed_dim, beta=0.25, remap=remap, sane_index_shape=sane_index_shape)
78
+ self.quant_conv = torch.nn.Conv2d(ddconfig["z_channels"], embed_dim, 1)
79
+ self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1)
80
+ # Note: the '!= "None"' check is for checkpoints that mistakenly stored the None as a string
81
+ if ckpt_path is not None and ckpt_path != "None":
82
+ self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)
83
+ self.image_key = image_key
84
+ if colorize_nlabels is not None:
85
+ assert type(colorize_nlabels) == int
86
+ self.register_buffer("colorize", torch.randn(3, colorize_nlabels, 1, 1))
87
+ if monitor is not None:
88
+ self.monitor = monitor
89
+
90
+ def init_from_ckpt(self, path, ignore_keys=list()):
91
+ sd = torch.load(path, map_location="cpu")["state_dict"]
92
+ keys = list(sd.keys())
93
+ for k in keys:
94
+ for ik in ignore_keys:
95
+ if k.startswith(ik):
96
+ print("Deleting key {} from state_dict.".format(k))
97
+ del sd[k]
98
+ self.load_state_dict(sd, strict=False)
99
+ print(f"Restored from {path}")
100
+
101
+ def encode(self, x):
102
+ h = self.encoder(x)
103
+ h = self.quant_conv(h)
104
+ quant, emb_loss, info = self.quantize(h)
105
+ return quant, emb_loss, info
106
+
107
+ def decode(self, quant):
108
+ quant = self.post_quant_conv(quant)
109
+ dec = self.decoder(quant)
110
+ return dec
111
+
112
+ def forward(self, input):
113
+ quant, diff, _ = self.encode(input)
114
+ dec = self.decode(quant)
115
+ return dec, diff
116
+
117
+ def get_input(self, batch, k):
118
+ x = batch[k]
119
+ if len(x.shape) == 3:
120
+ x = x[..., None]
121
+ x = x.permute(0, 3, 1, 2).to(memory_format=torch.contiguous_format)
122
+ return x.float()
123
+
124
+ def training_step(self, batch, batch_idx, optimizer_idx):
125
+ raise NotImplementedError("This copy of the model code does not support training")
126
+
127
+ def validation_step(self, batch, batch_idx):
128
+ raise NotImplementedError("This copy of the model code does not support training")
129
+
130
+ def configure_optimizers(self):
131
+ raise NotImplementedError("This copy of the model code does not support training")
132
+
133
+ def get_last_layer(self):
134
+ return self.decoder.conv_out.weight
135
+
136
+ def log_images(self, batch, **kwargs):
137
+ log = dict()
138
+ x = self.get_input(batch, self.image_key)
139
+ x = x.to(self.device)
140
+ xrec, _ = self(x)
141
+ if x.shape[1] > 3:
142
+ # colorize with random projection
143
+ assert xrec.shape[1] > 3
144
+ x = self.to_rgb(x)
145
+ xrec = self.to_rgb(xrec)
146
+ log["inputs"] = x
147
+ log["reconstructions"] = xrec
148
+ return log
149
+
150
+ def to_rgb(self, x):
151
+ assert self.image_key == "segmentation"
152
+ if not hasattr(self, "colorize"):
153
+ self.register_buffer("colorize", torch.randn(3, x.shape[1], 1, 1).to(x))
154
+ x = F.conv2d(x, weight=self.colorize)
155
+ x = 2.0 * (x - x.min()) / (x.max() - x.min()) - 1.0
156
+ return x
157
+
158
+
159
+ class TamingVQModel(EncoderDecoderBase):
160
+
161
+ __DEBUG_CREATION_KWARGS__ = {
162
+ "ckpt_path": None,
163
+ "model_spec": {
164
+ "taming_n_embed": 16,
165
+ "taming_embed_dim": 8,
166
+ "taming_num_indices_per_axis": 8,
167
+ "taming_ddconfig": {
168
+ "double_z": False,
169
+ "z_channels": 16,
170
+ "resolution": 128,
171
+ "in_channels": 3,
172
+ "out_ch": 3,
173
+ "ch": 128,
174
+ "ch_mult": [1, 1, 1, 1, 1],
175
+ "num_res_blocks": 1,
176
+ "attn_resolutions": [16],
177
+ "dropout": 0.0,
178
+ },
179
+ },
180
+ }
181
+
182
+ def __init__(self, model_spec, ckpt_path, **kwargs):
183
+ super().__init__()
184
+ self._vocab_size = model_spec["taming_n_embed"]
185
+ self.num_indices_per_axis = model_spec["taming_num_indices_per_axis"]
186
+ self.num_indices_total = self.num_indices_per_axis**2
187
+ self.taming_embed_dim = model_spec["taming_embed_dim"]
188
+ taming_ddconfig = model_spec.get("taming_ddconfig", None)
189
+ if taming_ddconfig is None:
190
+ raise ValueError("To run TamingVQModel, specify model_spec.taming_ddconfig, which should match the ddconfig used when training the model")
191
+
192
+ self.vq_model = _VQModelFromTamingRepository(taming_ddconfig, self._vocab_size, self.taming_embed_dim, ckpt_path=ckpt_path)
193
+
194
+ resolution = taming_ddconfig["resolution"]
195
+ in_channels = taming_ddconfig["in_channels"]
196
+ self.world_space = TensorSpace((in_channels, resolution, resolution), dtype=torch.uint8, low=0, high=255)
197
+ self.encoder_space = TensorSpace((self.num_indices_total,), dtype=torch.long, low=0, high=self.vocab_size - 1)
198
+
199
+ @property
200
+ def vocab_size(self):
201
+ """Return the number of entries in the codebook."""
202
+ return self._vocab_size
203
+
204
+ @property
205
+ def encoded_bottleneck_dim(self):
206
+ """Return the dimensionality of the latent vector encoded into codebook indices."""
207
+ return self.num_indices_total
208
+
209
+ def _preprocess_images(self, images):
210
+ """Preprocess images (B, C, H, W)"""
211
+ return taming_vq_preprocess_images(images)
212
+
213
+ def _revert_image_preprocess(self, x_batch):
214
+ """Revert the preprocessing done in _preprocess_images"""
215
+ return taming_vq_revert_preprocess_images(x_batch)
216
+
217
+ def decode_from_encoding_indices(self, encoding_indices, return_vq_embeddings=False):
218
+ """Return decoded images (B, C, H, W) for a batch of encoding indices (B, self.encoded_bottleneck_dim)"""
219
+ batch_size = encoding_indices.shape[0]
220
+ z = self.vq_model.quantize.get_codebook_entry(encoding_indices, shape=(batch_size, self.num_indices_per_axis, self.num_indices_per_axis, self.taming_embed_dim))
221
+ data_recon = self.vq_model.decode(z)
222
+ # Denormalize and cast to uint8
223
+ data_recon = self._revert_image_preprocess(data_recon)
224
+ if return_vq_embeddings:
225
+ return data_recon, z
226
+ return data_recon
227
+
228
+ def get_encoding_indices_for_images(self, images):
229
+ """
230
+ Return encoding indices (B, self.encoded_bottleneck_dim) for a batch of images (B, C, H, W).
231
+ Useful auxiliary method for testing.
232
+ """
233
+ x_batch = self._preprocess_images(images)
234
+ _, _, (_, _, encoding_indices) = self.vq_model.encode(x_batch)
235
+ # Split back into (B, self.encoded_bottleneck_dim)
236
+ encoding_indices = encoding_indices.view(images.shape[0], -1)
237
+ return encoding_indices
238
+
239
+ def forward_returning_action_and_embedding(self, states, actions_input, timesteps, attention_mask, images):
240
+ seq_len_dim = 1
241
+ assert images.shape[seq_len_dim] == 1, f"We require seq_len==1, but provided {images.shape[seq_len_dim]}."
242
+ images = images.squeeze(dim=seq_len_dim) # get rid of timestep dimension
243
+ x_batch = self._preprocess_images(images)
244
+ quant, _, (_, _, encoding_indices) = self.vq_model.encode(x_batch)
245
+ # Split back into (B, self.encoded_bottleneck_dim)
246
+ encoding_indices = encoding_indices.reshape(quant.shape[0], 1, quant.shape[2], quant.shape[3])
247
+ quant = quant.unsqueeze(seq_len_dim)
248
+ return None, {"quantized": quant, "encoding_indices": encoding_indices}
249
+
250
+ def _encode(self, world_space_tensor: torch.tensor) -> torch.tensor:
251
+ batch, time = world_space_tensor.shape[:2]
252
+ world_space_tensor = world_space_tensor.view(batch * time, *world_space_tensor.shape[2:])
253
+ encodings = self.get_encoding_indices_for_images(world_space_tensor)
254
+ # Reshape back to (batch, time, ...)
255
+ encodings = encodings.view(batch, time, -1)
256
+ return encodings
257
+
258
+ def _decode(self, encoder_space_tensor: torch.tensor) -> torch.tensor:
259
+ batch, time = encoder_space_tensor.shape[:2]
260
+ encoder_space_tensor = encoder_space_tensor.view(batch * time, *encoder_space_tensor.shape[2:])
261
+ decoded = self.decode_from_encoding_indices(encoder_space_tensor)
262
+ # Reshape back to (batch, time, ...)
263
+ decoded = decoded.view(batch, time, *decoded.shape[1:])
264
+ return decoded
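
To make the (batch, time, ...) tensor shapes concrete, a minimal encode/decode round trip built from the class's own __DEBUG_CREATION_KWARGS__ (random weights, so the reconstruction is meaningless; the internal _encode/_decode helpers are called directly for brevity):

import torch

model = TamingVQModel(**TamingVQModel.__DEBUG_CREATION_KWARGS__).eval()
frames = torch.randint(0, 256, (1, 2, 3, 128, 128), dtype=torch.uint8)  # (batch, time, C, H, W)
with torch.no_grad():
    tokens = model._encode(frames)  # (1, 2, 64): an 8x8 grid of codebook indices per frame
    recon = model._decode(tokens)   # (1, 2, 3, 128, 128), uint8
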
wham/models/vqgan/vqgan.py ADDED
@@ -0,0 +1,236 @@
1
+ import numpy as np
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+
6
+ from wham.models.wham_base.tensor_spaces import TensorSpace
7
+ from wham.models.wham_base.encoder_decoder import EncoderDecoderBase
8
+
9
+ from wham.models.vqgan import vqgan_models as vqgan
10
+ from wham.models.vqvae.vqvae_utils import make_grid, normalise_rgb, rev_normalise_rgb
11
+
12
+ from pytorch_lightning.loggers.tensorboard import TensorBoardLogger
13
+ from pytorch_lightning.loggers.wandb import WandbLogger
14
+
15
+ TARGET_GAN_UPDATE = 5
16
+ GAN_DWEIGHT_MAX = 250
17
+ GAN_LOGIT_CAP = 5.0
18
+ MAX_PIXEL_WEIGHTING = 0.1
19
+
20
+ # The GAN parts are from Taming Transformers (https://github.com/CompVis/taming-transformers)
21
+ """
22
+ ViT-VQGAN is based on:
23
+ Yu, Jiahui, et al. "Vector-quantized image modeling with improved vqgan."
24
+ ICLR 2022
25
+ """
26
+
27
+
28
+ def create_vqgan_model_for_training(variant):
29
+ return VQGANModel(variant=variant)
30
+
31
+
32
+ class VQGANModel(EncoderDecoderBase):
33
+ @classmethod
34
+ def create_from_variant(cls, variant):
35
+ return VQGANModel(variant=variant)
36
+
37
+ def __init__(self, variant=None, ckpt_path=None, model_spec=None):
38
+ super().__init__()
39
+ self.save_hyperparameters()
40
+ self.variant = variant
41
+ if model_spec is not None:
42
+ self.model_spec = model_spec
43
+ else:
44
+ self.model_spec = variant["model_spec"]
45
+
46
+ # Batches of images we will use for logging
47
+ self.reference_x_batch = None # Same images used throughout training to see progress of the model
48
+ self.random_batch = None # Different images every iteration
49
+
50
+ if variant is None and "image_size_per_y_axis" in self.model_spec:
51
+ self.image_size_x = self.model_spec["image_size_per_x_axis"]
52
+ self.image_size_y = self.model_spec["image_size_per_y_axis"]
53
+ else:
54
+ assert "image_size_per_x_axis" in variant and "image_size_per_y_axis" in variant, "Please provide the image size as separate x and y for the VQGAN model"
55
+ self.image_size_x = variant["image_size_per_x_axis"]
56
+ self.image_size_y = variant["image_size_per_y_axis"]
57
+
58
+ self._embedding_dim = self.model_spec["embedding_dim"]
59
+ self.encoder = vqgan.ViTEncoder(
60
+ patch_size=self.model_spec["patch_size"],
61
+ transf_dim=self.model_spec["transf_dim"],
62
+ embedding_dim=self.model_spec["embedding_dim"],
63
+ image_size_x=self.image_size_x,
64
+ image_size_y=self.image_size_y,
65
+ num_layers=self.model_spec["num_layers"],
66
+ head_size=self.model_spec["head_size"],
67
+ )
68
+ self._bottleneck_size = self.encoder.bottleneck
69
+
70
+ self.vq_vae = vqgan.ViTVectorQuantizer(
71
+ self.model_spec["vocab_size"],
72
+ self.model_spec["embedding_dim"],
73
+ self.model_spec["commitment_cost"],
74
+ )
75
+
76
+ self.decoder = vqgan.ViTDecoder(
77
+ patch_size=self.model_spec["patch_size"],
78
+ transf_dim=self.model_spec["transf_dim"],
79
+ embedding_dim=self.model_spec["embedding_dim"],
80
+ image_size_x=self.image_size_x,
81
+ image_size_y=self.image_size_y,
82
+ num_layers=self.model_spec["num_layers"],
83
+ head_size=self.model_spec["head_size"],
84
+ expected_bottleneck=self._bottleneck_size,
85
+ )
86
+
87
+ self.is_perceptual = self.model_spec["is_perceptual"]
88
+ assert self.is_perceptual # This should be on
89
+
90
+ # Keep track of the usage of the codebook indices
91
+ self.codebook_index_usage = np.zeros(self.model_spec["vocab_size"], dtype=np.int64)
92
+
93
+ self.gan = self.model_spec.get("use_gan", False)
94
+ if self.gan:
95
+ # Only make the patchgan if we are using it. This makes it easier to experiment with GAN settings after pretraining the VQ-VAE for instance
96
+ self.patch_gan = vqgan.PatchGan(channel_start=self.model_spec["gan_channel_start"])
97
+ # Make a copy of the patchgan since we are only using a single optimizer
98
+ self.target_patchgan = vqgan.PatchGan(channel_start=self.model_spec["gan_channel_start"])
99
+ self.target_patchgan.requires_grad_(False)
100
+ self.target_patchgan.load_state_dict(self.patch_gan.state_dict())
101
+ self.target_update = TARGET_GAN_UPDATE
102
+
103
+ # At which iteration to start using the GAN loss
104
+ self.gan_start = self.model_spec["gan_start"]
105
+ # How much weight to give to the GAN loss gradients compared to the vq autoencoder loss
106
+ self.gan_weight = self.model_spec["gan_weight"]
107
+ # How many steps to train the discriminator before applying the gan loss.
108
+ self.gan_discrim_pretrain = self.model_spec["gan_discrim_pretrain"]
109
+ # How many steps to warmup the gan loss
110
+ self.gan_discrim_warmup = self.model_spec["gan_discrim_warmup"]
111
+ # Keeping track of the number of updates
112
+ self.updates = 0
113
+ print(f"Using GAN with weight {self.gan_weight} and target update {self.target_update} and gan start {self.gan_start} over {self.gan_discrim_warmup} steps")
114
+
115
+ self.lpips_model = None
116
+ # We don't need this for using the encoder/decoder
117
+ # self.lpips_model = lpips.LPIPS(net=self.model_spec["lpips_model"]).eval()
118
+ # for param in self.lpips_model.parameters():
119
+ # param.requires_grad = False
120
+
121
+ if ckpt_path is not None and ckpt_path != "None":
122
+ print(f"Initializing VQGAN model from {ckpt_path}")
123
+ loaded_ckpt = torch.load(ckpt_path, map_location="cpu")
124
+ # Can ignore stuff here
125
+ self.load_state_dict(loaded_ckpt["state_dict"], strict=False)
126
+
127
+ self.world_space = TensorSpace((3, self.image_size_y, self.image_size_x), dtype=torch.uint8, low=0, high=255)
128
+ self.encoder_space = TensorSpace((self._bottleneck_size,), dtype=torch.long, low=0, high=self.vocab_size - 1)
129
+
130
+ @property
131
+ def vocab_size(self):
132
+ """Return the number of entries in the codebook."""
133
+ return self.vq_vae._vocab_size
134
+
135
+ @property
136
+ def encoded_bottleneck_dim(self):
137
+ """Return the dimensionality of the latent vector encoded into codebook indices."""
138
+ return self._bottleneck_size
139
+
140
+ @property
141
+ def embedding_dim(self):
142
+ """The dimensionality of quantized vectors (the dimension of codebook vectors)."""
143
+ return self.vq_vae._embedding_dim
144
+
145
+ def _get_last_layer(self):
146
+ """
147
+ The last layer used for generating the image.
148
+ Used for balancing the gradients of the reconstruction and the GAN loss.
149
+ """
150
+ return self.decoder.get_last_layer()
151
+
152
+ def _preprocess_images(self, images):
153
+ """Preprocess images (B, C, H, W)"""
154
+ x_batch = images.float() / 255
155
+ x_batch = normalise_rgb(x_batch)
156
+ return x_batch
157
+
158
+ def _revert_image_preprocess(self, x_batch):
159
+ """Revert the preprocessing done in _preprocess_images"""
160
+ normalized_imgs = rev_normalise_rgb(x_batch.clone())
161
+ x_batch = torch.clip(normalized_imgs, 0, 1)
162
+ images = (x_batch * 255).byte()
163
+ return images
164
+
165
+ def _get_latent_continuous(self, batch):
166
+ z = self.encoder(batch)
167
+ return z
168
+
169
+ def _get_latent_discretized(self, z):
170
+ z_quantized, vq_loss, perplexity, indices = self.vq_vae(z)
171
+ return z_quantized, vq_loss, perplexity, indices
172
+
173
+ def _encode_decode(self, x_batch):
174
+ z = self._get_latent_continuous(x_batch)
175
+ z_quantized, vq_loss, perplexity, indices = self._get_latent_discretized(z)
176
+ data_recon = self.decoder(z_quantized)
177
+ return vq_loss, perplexity, data_recon, indices
178
+
179
+ def _log_vars(self, log_vars):
180
+ prefix = "train" if self.training else "val"
181
+ for key, val in log_vars.items():
182
+ self.log(f"{prefix}/{key}", val, on_step=True, on_epoch=True, prog_bar=False, logger=True, sync_dist=True)
183
+
184
+ def decode_from_encoding_indices(self, encoding_indices):
185
+ """Return decoded images (B, C, H, W) for a batch of encoding indices (B, self.encoded_bottleneck_dim)"""
186
+ z = self.vq_vae.convert_encoding_indices_to_quantized_embeddings(encoding_indices)
187
+ data_recon = self.decoder(z)
188
+ # Denormalize and cast to uint8
189
+ data_recon = self._revert_image_preprocess(data_recon)
190
+ return data_recon
191
+
192
+ def get_encoding_indices_for_images(self, images):
193
+ """
194
+ Return encoding indices (B, self.encoded_bottleneck_dim) for a batch of images (B, C, H, W).
195
+ Useful auxiliary method for testing.
196
+ """
197
+ x_batch = self._preprocess_images(images)
198
+ z = self._get_latent_continuous(x_batch)
199
+ encoding_indices = self.vq_vae(z, only_return_encoding_indices=True)
200
+ return encoding_indices
201
+
202
+ def forward_returning_action_and_embedding(self, states, actions_input, timesteps, attention_mask, images):
203
+ raise NotImplementedError
204
+
205
+ def get_encoding_output(self, images):
206
+ """
207
+ Return outputs from the encoder for a batch of images (B, C, H, W).
208
+ Returns:
209
+ quantized_z: (B, self.encoded_bottleneck_dim, self.embedding_dim), quantized latent vectors with straight-through gradient estimator
210
+ vq_loss: (B, ), VQ loss for each image
211
+ perplexity: (B, ), perplexity for each image
212
+ encoding_indices: (B, self.encoded_bottleneck_dim), encoding indices for each image
213
+ """
214
+ x_batch = self._preprocess_images(images)
215
+ z = self._get_latent_continuous(x_batch)
216
+ quantized_z, vq_loss, perplexity, encoding_indices = self.vq_vae(z)
217
+ quantized_z = quantized_z.view(quantized_z.shape[0], self.encoded_bottleneck_dim, self.embedding_dim)
218
+ return quantized_z, vq_loss, perplexity, encoding_indices
219
+
220
+ def _encode(self, world_space_tensor: torch.tensor) -> torch.tensor:
221
+ # Flatten time and batch dim into one
222
+ batch, time = world_space_tensor.shape[:2]
223
+ world_space_tensor = world_space_tensor.view(batch * time, *world_space_tensor.shape[2:])
224
+ encodings = self.get_encoding_indices_for_images(world_space_tensor)
225
+ # Reshape back to (batch, time, ...)
226
+ encodings = encodings.view(batch, time, -1)
227
+ return encodings
228
+
229
+ def _decode(self, encoder_space_tensor: torch.tensor) -> torch.tensor:
230
+ # Flatten time and batch dim into one
231
+ batch, time = encoder_space_tensor.shape[:2]
232
+ encoder_space_tensor = encoder_space_tensor.view(batch * time, *encoder_space_tensor.shape[2:])
233
+ decoded = self.decode_from_encoding_indices(encoder_space_tensor)
234
+ # Reshape back to (batch, time, ...)
235
+ decoded = decoded.view(batch, time, *decoded.shape[1:])
236
+ return decoded
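
For shape orientation, the same kind of (batch, time, ...) round trip with this ViT-based VQGAN. The model_spec values below are made up purely for illustration; in practice a trained checkpoint supplies matching hyperparameters and weights via ckpt_path:

import torch

spec = {
    "image_size_per_x_axis": 128, "image_size_per_y_axis": 128,
    "patch_size": 16, "transf_dim": 256, "head_size": 64, "num_layers": 2,
    "embedding_dim": 32, "vocab_size": 256, "commitment_cost": 0.25,
    "is_perceptual": True,
}
model = VQGANModel(model_spec=spec).eval()
frames = torch.randint(0, 256, (1, 2, 3, 128, 128), dtype=torch.uint8)  # (batch, time, C, H, W)
with torch.no_grad():
    tokens = model._encode(frames)  # (1, 2, 64): one codebook index per 16x16 patch
    recon = model._decode(tokens)   # (1, 2, 3, 128, 128), uint8
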
wham/models/vqgan/vqgan_models.py ADDED
@@ -0,0 +1,311 @@
1
+ # MIT License
2
+ # Copyright (c) 2018 Zalando Research
3
+ # 2023 Microsoft Research
4
+
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18
+ # IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
19
+ # DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20
+ # OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
21
+ # OR OTHER DEALINGS IN THE SOFTWARE.
22
+
23
+ from math import sqrt
24
+
25
+ import torch
26
+ import torch.nn as nn
27
+ import torch.nn.functional as F
28
+
29
+ from wham.models.nn.nanoGPT import GPTConfig, SelfAttentionBlock
30
+ from wham.models.nn.model_blocks import ConvNextBlock, ConvNextDownsample, ConvNextDownsampleBig
31
+
32
+ # Mainly following https://github.com/zalandoresearch/pytorch-vq-vae/blob/master/vq-vae.ipynb
33
+ """
34
+ ViT-VQGAN is based on:
35
+ Yu, Jiahui, et al. "Vector-quantized image modeling with improved vqgan."
36
+ ICLR 2022
37
+ """
38
+
39
+
40
+ def _convert_encoding_indices_to_quantized_embeddings(encoding_indices, embedding_layer, vocab_size, embedding_dim):
41
+ """
42
+ Args:
43
+ encoding_indices: tensor of integers (batch_size, bottleneck_size)
44
+ Each batch item represents a single image as a sequence of integers (indices of codebook vectors)
45
+ Output:
46
+ quantized: tensor of floats (batch_size, bottleneck_size, embedding_dim)
47
+ """
48
+ batch_dim, bottleneck_size = encoding_indices.shape[:2]
49
+
50
+ encoding_indices = encoding_indices.view(-1).unsqueeze(1)
51
+ one_hot_encoding_indices = torch.zeros(encoding_indices.shape[0], vocab_size, device=encoding_indices.device)
52
+ one_hot_encoding_indices.scatter_(1, encoding_indices, 1)
53
+
54
+ quantized = torch.matmul(one_hot_encoding_indices, embedding_layer)
55
+ quantized = quantized.view(batch_dim, bottleneck_size, embedding_dim).contiguous()
56
+ return quantized
57
+
58
+
59
+ class ViTVectorQuantizer(nn.Module):
60
+ """
61
+ Vector Quantizer for a Vision Transformer based VQ model using normalised codebook embeddings as in https://arxiv.org/abs/2110.04627.
62
+ """
63
+
64
+ def __init__(self, vocab_size, embedding_dim, commitment_cost, epsilon=1e-5):
65
+ super().__init__()
66
+
67
+ self._embedding_dim = embedding_dim
68
+ self._vocab_size = vocab_size
69
+ self._epsilon = epsilon
70
+
71
+ self._embedding = nn.Embedding(self._vocab_size, self._embedding_dim)
72
+ self._embedding.weight.data.uniform_(-1 / self._vocab_size, 1 / self._vocab_size)
73
+ self._commitment_cost = commitment_cost
74
+
75
+ @property
76
+ def vocab_size(self):
77
+ """Return the number of entries in the codebook."""
78
+ return self._vocab_size
79
+
80
+ def convert_encoding_indices_to_quantized_embeddings(self, encoding_indices):
81
+ """
82
+ Args:
83
+ encoding_indices: tensor of integers (batch_size, bottleneck_size)
84
+ Each batch item represents a single image as a sequence of integers (indices of codebook vectors)
85
+ Output:
86
+ quantized: tensor of floats (batch_size, bottleneck_size, self._embedding_dim)
87
+ """
88
+ return _convert_encoding_indices_to_quantized_embeddings(encoding_indices, F.normalize(self._embedding.weight), self._vocab_size, self._embedding_dim)
89
+
90
+ def forward(self, inputs, only_return_encoding_indices=False):
91
+ """
92
+ If only_return_encoding_indices is True, then only return the indices of codebook vectors
93
+ """
94
+ input_shape = inputs.shape
95
+
96
+ # Flatten input from Batch Tokens Embedding to B*T E
97
+ flat_input = inputs.view(-1, self._embedding_dim)
98
+ # Normalize inputs
99
+ flat_input = F.normalize(flat_input)
100
+
101
+ # Embeddings are always normalized
102
+ embeddings_to_use = F.normalize(self._embedding.weight)
103
+
104
+ # Calculate distances
105
+ distances = torch.sum(flat_input**2, dim=1, keepdim=True) + torch.sum(embeddings_to_use**2, dim=1) - 2 * torch.matmul(flat_input, embeddings_to_use.t())
106
+
107
+ # Encoding
108
+ encoding_indices = torch.argmin(distances, dim=1).unsqueeze(1)
109
+ if only_return_encoding_indices:
110
+ # Add back batch dimension
111
+ return encoding_indices.view(input_shape[0], -1)
112
+ one_hot_encoding_indices = torch.zeros(encoding_indices.shape[0], self._vocab_size, device=inputs.device)
113
+ one_hot_encoding_indices.scatter_(1, encoding_indices, 1)
114
+
115
+ # Quantize and unflatten
116
+ quantized = torch.matmul(one_hot_encoding_indices, embeddings_to_use).view(input_shape)
117
+
118
+ # Loss
119
+ e_latent_loss = F.mse_loss(quantized.detach(), inputs)
120
+ q_latent_loss = F.mse_loss(quantized, inputs.detach())
121
+ loss = q_latent_loss + self._commitment_cost * e_latent_loss
122
+
123
+ quantized = inputs + (quantized - inputs).detach()
124
+ avg_probs = torch.mean(one_hot_encoding_indices, dim=0)
125
+ perplexity = torch.exp(-torch.sum(avg_probs * torch.log(avg_probs + self._epsilon)))
126
+
127
+ return quantized, loss, perplexity, encoding_indices.view(input_shape[0], -1)
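
Because both the flattened inputs and the codebook rows are L2-normalised above, the squared-distance search is equivalent to picking the entry with the highest cosine similarity; a small standalone check of that identity:

import torch
import torch.nn.functional as F

z = F.normalize(torch.randn(5, 32), dim=1)            # unit-norm query vectors
codebook = F.normalize(torch.randn(256, 32), dim=1)   # unit-norm codebook entries
sim = z @ codebook.t()
d = (z ** 2).sum(1, keepdim=True) + (codebook ** 2).sum(1) - 2 * sim
assert torch.allclose(d, 2 - 2 * sim, atol=1e-5)      # hence argmin(d, 1) == argmax(sim, 1)
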
128
+
129
+
130
+ class ViTEncoder(nn.Module):
131
+ def __init__(self, patch_size, transf_dim, embedding_dim, image_size_x, image_size_y, num_layers, head_size):
132
+ super().__init__()
133
+
134
+ self.image_size_x = image_size_x
135
+ self.image_size_y = image_size_y
136
+ # We will pad the image to make it divisible by patch_size
137
+ self.x_pad = (patch_size - (self.image_size_x % patch_size)) % patch_size
138
+ self.y_pad = (patch_size - (self.image_size_y % patch_size)) % patch_size
139
+ assert (self.image_size_x + self.x_pad) % patch_size == 0 and (
140
+ self.image_size_y + self.y_pad
141
+ ) % patch_size == 0, "image_size_x and image_size_y must be divisible by patch_size"
142
+
143
+ self.vit_tokens = ((image_size_x + self.x_pad) // patch_size) * ((image_size_y + self.y_pad) // patch_size)
144
+ self._bottleneck = self.vit_tokens
145
+ print(f"Bottleneck is {self.bottleneck} for image size {image_size_x}x{image_size_y} with ViT Encoder and patch size {patch_size}")
146
+
147
+ self.patch_size = patch_size
148
+ self.transf_dim = transf_dim
149
+ self.embedding_dim = embedding_dim
150
+
151
+ self.proj1 = nn.Linear(3 * patch_size * patch_size, transf_dim)
152
+ self.pos_embeds = nn.Embedding(self.vit_tokens, transf_dim)
153
+
154
+ assert self.transf_dim % head_size == 0, "transf_dim must be divisible by head_size"
155
+ n_heads = self.transf_dim // head_size
156
+ transformer_config = GPTConfig(block_size=self.vit_tokens, n_layer=num_layers, n_head=n_heads, n_embd=transf_dim, bias=False, dropout=0)
157
+ self.vit = nn.Sequential(*[SelfAttentionBlock(transformer_config) for _ in range(num_layers)])
158
+
159
+ self.output_ln = nn.LayerNorm(transf_dim)
160
+ self.output_proj = nn.Linear(transf_dim, embedding_dim)
161
+
162
+ # init all weights
163
+ self.apply(self._init_weights)
164
+ # apply special scaled init to the residual projections, per GPT-2 paper
165
+ for pn, p in self.named_parameters():
166
+ if pn.endswith("c_proj.weight"):
167
+ torch.nn.init.normal_(p, mean=0.0, std=0.02 / sqrt(2 * transformer_config.n_layer))
168
+
169
+ @property
170
+ def bottleneck(self):
171
+ return self._bottleneck
172
+
173
+ def _init_weights(self, module):
174
+ if isinstance(module, nn.Linear):
175
+ torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
176
+ if module.bias is not None:
177
+ torch.nn.init.zeros_(module.bias)
178
+ elif isinstance(module, nn.Embedding):
179
+ torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
180
+
181
+ def forward(self, inputs):
182
+ # inputs: (batch_size, 3, image_size_x, image_size_y)
183
+
184
+ # Patch input images
185
+ batch_size = inputs.shape[0]
186
+ padded_inputs = F.pad(inputs, (0, self.x_pad, 0, self.y_pad), mode="constant", value=0)
187
+ x = padded_inputs.unfold(2, self.patch_size, self.patch_size).unfold(3, self.patch_size, self.patch_size)
188
+ num_x_patches = (self.image_size_x + self.x_pad) // self.patch_size
189
+ num_y_patches = (self.image_size_y + self.y_pad) // self.patch_size
190
+
191
+ # inputs is of shape (batch_size, 3, num_x_patches, num_y_patches, patch_size, patch_size)
192
+ # Turn it into (batch_size, patches, input_dim)
193
+ patches = x.permute(0, 2, 3, 1, 4, 5).contiguous().view(batch_size, num_x_patches * num_y_patches, 3 * self.patch_size * self.patch_size)
194
+
195
+ proj_patches = self.proj1(patches)
196
+
197
+ pos_embeds = self.pos_embeds.weight.unsqueeze(0).repeat(batch_size, 1, 1)
198
+ vit_input = proj_patches + pos_embeds
199
+ vit_output = self.vit(vit_input)
200
+
201
+ vit_output = self.output_ln(vit_output)
202
+ embeddings = self.output_proj(vit_output)
203
+ normalised_embeddings = F.normalize(embeddings, dim=-1)
204
+
205
+ return normalised_embeddings
206
+
207
+
208
+ class ViTDecoder(nn.Module):
209
+ def __init__(self, patch_size, transf_dim, embedding_dim, image_size_x, image_size_y, num_layers, head_size, expected_bottleneck=None):
210
+ super().__init__()
211
+
212
+ self.image_size_x = image_size_x
213
+ self.image_size_y = image_size_y
214
+ self.x_pad = (patch_size - (self.image_size_x % patch_size)) % patch_size
215
+ self.y_pad = (patch_size - (self.image_size_y % patch_size)) % patch_size
216
+
217
+ assert (self.image_size_x + self.x_pad) % patch_size == 0 and (
218
+ self.image_size_y + self.y_pad
219
+ ) % patch_size == 0, "image_size_x and image_size_y must be divisible by patch_size"
220
+
221
+ self.vit_tokens = ((image_size_x + self.x_pad) // patch_size) * ((image_size_y + self.y_pad) // patch_size)
222
+ if expected_bottleneck is not None:
223
+ assert (
224
+ self.vit_tokens == expected_bottleneck
225
+ ), f"Expected bottleneck of {expected_bottleneck} but got {self.vit_tokens} for image size {image_size_x}x{image_size_y} with ViT Decoder and patch size {patch_size}"
226
+
227
+ self.patch_size = patch_size
228
+ self.transf_dim = transf_dim
229
+ self.embedding_dim = embedding_dim
230
+
231
+ self.proj1 = nn.Linear(embedding_dim, transf_dim)
232
+ self.pos_embeds = nn.Embedding(self.vit_tokens, transf_dim)
233
+
234
+ assert self.transf_dim % head_size == 0, "transf_dim must be divisible by head_size"
235
+ n_heads = self.transf_dim // head_size
236
+ transformer_config = GPTConfig(block_size=self.vit_tokens, n_layer=num_layers, n_head=n_heads, n_embd=transf_dim, bias=False, dropout=0)
237
+ self.vit = nn.Sequential(*[SelfAttentionBlock(transformer_config) for _ in range(num_layers)])
238
+
239
+ self.output_ln = nn.LayerNorm(transf_dim)
240
+ self.output_proj = nn.Linear(transf_dim, 3 * patch_size * patch_size)
241
+
242
+ # Couldn't resist the name
243
+ self.folder = nn.Fold(
244
+ output_size=(self.image_size_y + self.y_pad, self.image_size_x + self.x_pad),
245
+ kernel_size=(self.patch_size, self.patch_size),
246
+ stride=(self.patch_size, self.patch_size),
247
+ )
248
+
249
+ # init all weights
250
+ self.apply(self._init_weights)
251
+ # apply special scaled init to the residual projections, per GPT-2 paper
252
+ for pn, p in self.named_parameters():
253
+ if pn.endswith("c_proj.weight"):
254
+ torch.nn.init.normal_(p, mean=0.0, std=0.02 / sqrt(2 * transformer_config.n_layer))
255
+
256
+ def _init_weights(self, module):
257
+ if isinstance(module, nn.Linear):
258
+ torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
259
+ if module.bias is not None:
260
+ torch.nn.init.zeros_(module.bias)
261
+ elif isinstance(module, nn.Embedding):
262
+ torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
263
+
264
+ def forward(self, inputs):
265
+ # Patch input images
266
+ batch_size = inputs.shape[0]
267
+
268
+ # Unproject the embeddings from the VQ embedding space to the transformer space
269
+ proj_patches = self.proj1(inputs).reshape(batch_size, self.vit_tokens, self.transf_dim)
270
+
271
+ pos_embeds = self.pos_embeds.weight.unsqueeze(0).repeat(batch_size, 1, 1)
272
+ vit_input = proj_patches + pos_embeds
273
+ vit_output = self.vit(vit_input)
274
+
275
+ vit_output = self.output_ln(vit_output)
276
+
277
+ predictions = self.output_proj(vit_output) # (batch, patches, 3 * patch_size * patch_size)
278
+
279
+ # Reassemble the image into (batch, 3, image_size_x, image_size_y)
280
+ fold_inputs = predictions.permute(0, 2, 1).contiguous()
281
+ image_pred = self.folder(fold_inputs)
282
+
283
+ unpadded_image_pred = image_pred[:, :, : self.image_size_y, : self.image_size_x] # Remove padding in the same way it was applied in the encoder
284
+
285
+ # Note: no output activation is applied; the unpadded prediction is returned as-is.
286
+ return unpadded_image_pred
287
+
288
+ def get_last_layer(self):
289
+ """
290
+ Return the last layer weights of the model, to use for loss balancing.
291
+ """
292
+ return self.output_proj.weight
293
+
294
+
295
+ class PatchGan(nn.Module):
296
+ def __init__(self, channel_start):
297
+ super().__init__()
298
+ x = channel_start
299
+ self.downsample1 = ConvNextDownsampleBig(3, x)
300
+ self.block1 = ConvNextBlock(x)
301
+ self.downsample2 = ConvNextDownsampleBig(x, x)
302
+ self.block2 = ConvNextBlock(x)
303
+ self.last = nn.Conv2d(x, 1, kernel_size=1, stride=1, padding=0)
304
+
305
+ def forward(self, x):
306
+ batch_size = x.shape[0]
307
+ y = torch.nn.functional.gelu(self.downsample1(x))
308
+ y = self.block1(y)
309
+ z = torch.nn.functional.gelu(self.downsample2(y))
310
+ z = self.block2(z)
311
+ return self.last(z).reshape(batch_size, -1)
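
The number of VQ tokens per image (the encoder's bottleneck, and hence encoded_bottleneck_dim of the VQGAN wrapper) follows directly from the padding arithmetic shared by ViTEncoder and ViTDecoder above; restated as a standalone helper:

def vit_token_count(image_size_x: int, image_size_y: int, patch_size: int) -> int:
    # Pad each axis up to a multiple of patch_size, then count patches along each axis.
    x_pad = (patch_size - image_size_x % patch_size) % patch_size
    y_pad = (patch_size - image_size_y % patch_size) % patch_size
    return ((image_size_x + x_pad) // patch_size) * ((image_size_y + y_pad) // patch_size)

print(vit_token_count(128, 128, 16))  # 64  (8 x 8 patches, no padding needed)
print(vit_token_count(300, 180, 16))  # 228 (300 pads to 304 -> 19 patches, 180 pads to 192 -> 12 patches)
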
wham/models/vqvae/vqvae_utils.py ADDED
@@ -0,0 +1,154 @@
1
+ import math
2
+ from typing import List, Optional, Tuple, Union
3
+
4
+ import torch
5
+
6
+
7
+ def normalise_rgb(X, channels_first=True):
8
+ """
9
+ Take in an image tensor with three RGB channels (channel dim 1 when channels_first, else the
10
+ last dim), assumed to have already been divided by 255 so X is in [0, 1]. This function does
11
+ additional normalisation, roughly ending up with zero mean and unit variance per channel.
12
+ The constants appear in most vision repos and are the standard normalisation constants based
13
+ on ImageNet statistics.
14
+ """
15
+ channel_dim = 1 if channels_first else -1
16
+ assert X.shape[channel_dim] == 3
17
+ if channels_first:
18
+ X[:, 0, ...] -= 0.485
19
+ X[:, 0, ...] /= 0.229
20
+ X[:, 1, ...] -= 0.456
21
+ X[:, 1, ...] /= 0.224
22
+ X[:, 2, ...] -= 0.406
23
+ X[:, 2, ...] /= 0.225
24
+ else:
25
+ X[..., 0] -= 0.485
26
+ X[..., 0] /= 0.229
27
+ X[..., 1] -= 0.456
28
+ X[..., 1] /= 0.224
29
+ X[..., 2] -= 0.406
30
+ X[..., 2] /= 0.225
31
+ return X
32
+
33
+
34
+ def rev_normalise_rgb(X, channels_first=True):
35
+ """
36
+ Reverse `normalise_rgb`, so the output lives in [0,1]. This function is needed for
37
+ reconstruction visualisation, etc.
38
+ """
39
+ channel_dim = 1 if channels_first else -1
40
+ assert X.shape[channel_dim] == 3
41
+ if channels_first:
42
+ X[:, 0, ...] *= 0.229
43
+ X[:, 0, ...] += 0.485
44
+ X[:, 1, ...] *= 0.224
45
+ X[:, 1, ...] += 0.456
46
+ X[:, 2, ...] *= 0.225
47
+ X[:, 2, ...] += 0.406
48
+ else:
49
+ X[..., 0] *= 0.229
50
+ X[..., 0] += 0.485
51
+ X[..., 1] *= 0.224
52
+ X[..., 1] += 0.456
53
+ X[..., 2] *= 0.225
54
+ X[..., 2] += 0.406
55
+ return X
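
The two helpers are exact inverses (note that both modify their input in place); a quick check on channels-first data already scaled to [0, 1]:

import torch

x = torch.rand(4, 3, 8, 8)
y = rev_normalise_rgb(normalise_rgb(x.clone()))  # clone, since both helpers work in place
assert torch.allclose(x, y, atol=1e-5)
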
56
+
57
+
58
+ @torch.no_grad()
59
+ def make_grid(
60
+ tensor: Union[torch.Tensor, List[torch.Tensor]],
61
+ nrow: int = 8,
62
+ padding: int = 2,
63
+ normalize: bool = False,
64
+ value_range: Optional[Tuple[int, int]] = None,
65
+ scale_each: bool = False,
66
+ pad_value: float = 0.0,
67
+ **kwargs,
68
+ ) -> torch.Tensor:
69
+ """
70
+ Make a grid of images.
71
+
72
+ Args:
73
+ tensor (Tensor or list): 4D mini-batch Tensor of shape (B x C x H x W)
74
+ or a list of images all of the same size.
75
+ nrow (int, optional): Number of images displayed in each row of the grid.
76
+ The final grid size is ``(B / nrow, nrow)``. Default: ``8``.
77
+ padding (int, optional): amount of padding. Default: ``2``.
78
+ normalize (bool, optional): If True, shift the image to the range (0, 1),
79
+ by the min and max values specified by ``value_range``. Default: ``False``.
80
+ value_range (tuple, optional): tuple (min, max) where min and max are numbers,
81
+ then these numbers are used to normalize the image. By default, min and max
82
+ are computed from the tensor.
83
+ scale_each (bool, optional): If ``True``, scale each image in the batch of
84
+ images separately rather than the (min, max) over all images. Default: ``False``.
85
+ pad_value (float, optional): Value for the padded pixels. Default: ``0``.
86
+
87
+ Returns:
88
+ grid (Tensor): the tensor containing grid of images.
89
+ """
90
+ if not torch.is_tensor(tensor):
91
+ if isinstance(tensor, list):
92
+ for t in tensor:
93
+ if not torch.is_tensor(t):
94
+ raise TypeError(f"tensor or list of tensors expected, got a list containing {type(t)}")
95
+ else:
96
+ raise TypeError(f"tensor or list of tensors expected, got {type(tensor)}")
97
+
98
+ # if list of tensors, convert to a 4D mini-batch Tensor
99
+ if isinstance(tensor, list):
100
+ tensor = torch.stack(tensor, dim=0)
101
+
102
+ if tensor.dim() == 2: # single image H x W
103
+ tensor = tensor.unsqueeze(0)
104
+ if tensor.dim() == 3: # single image
105
+ if tensor.size(0) == 1: # if single-channel, convert to 3-channel
106
+ tensor = torch.cat((tensor, tensor, tensor), 0)
107
+ tensor = tensor.unsqueeze(0)
108
+
109
+ if tensor.dim() == 4 and tensor.size(1) == 1: # single-channel images
110
+ tensor = torch.cat((tensor, tensor, tensor), 1)
111
+
112
+ if normalize is True:
113
+ tensor = tensor.clone() # avoid modifying tensor in-place
114
+ if value_range is not None and not isinstance(value_range, tuple):
115
+ raise TypeError("value_range has to be a tuple (min, max) if specified. min and max are numbers")
116
+
117
+ def norm_ip(img, low, high):
118
+ img.clamp_(min=low, max=high)
119
+ img.sub_(low).div_(max(high - low, 1e-5))
120
+
121
+ def norm_range(t, value_range):
122
+ if value_range is not None:
123
+ norm_ip(t, value_range[0], value_range[1])
124
+ else:
125
+ norm_ip(t, float(t.min()), float(t.max()))
126
+
127
+ if scale_each is True:
128
+ for t in tensor: # loop over mini-batch dimension
129
+ norm_range(t, value_range)
130
+ else:
131
+ norm_range(tensor, value_range)
132
+
133
+ if not isinstance(tensor, torch.Tensor):
134
+ raise TypeError("tensor should be of type torch.Tensor")
135
+ if tensor.size(0) == 1:
136
+ return tensor.squeeze(0)
137
+
138
+ # make the mini-batch of images into a grid
139
+ nmaps = tensor.size(0)
140
+ xmaps = min(nrow, nmaps)
141
+ ymaps = int(math.ceil(float(nmaps) / xmaps))
142
+ height, width = int(tensor.size(2) + padding), int(tensor.size(3) + padding)
143
+ num_channels = tensor.size(1)
144
+ grid = tensor.new_full((num_channels, height * ymaps + padding, width * xmaps + padding), pad_value)
145
+ k = 0
146
+ for y in range(ymaps):
147
+ for x in range(xmaps):
148
+ if k >= nmaps:
149
+ break
150
+ # Tensor.copy_() is a valid method but seems to be missing from the stubs
151
+ # https://pytorch.org/docs/stable/tensors.html#torch.Tensor.copy_
152
+ grid.narrow(1, y * height + padding, height - padding).narrow(2, x * width + padding, width - padding).copy_(tensor[k]) # type: ignore[attr-defined]
153
+ k = k + 1
154
+ return grid
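
Typical use of make_grid is tiling a mini-batch of reconstructions into a single image for logging, for example:

import torch

batch = torch.rand(6, 3, 32, 32)           # six RGB images in [0, 1]
grid = make_grid(batch, nrow=3, padding=2)
print(grid.shape)                           # torch.Size([3, 70, 104]): 2 rows x 3 columns plus padding
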
wham/models/wham_base/__init__.py ADDED
File without changes
wham/models/wham_base/encode_predict_decode_base.py ADDED
@@ -0,0 +1,256 @@
1
+ from typing import Any, Union, Type, Callable, Tuple, Mapping, Optional
+
+ import torch as th
+ import pytorch_lightning as pl
+ from tensordict import TensorDict  # type: ignore # requires installing stubs for tensordict
+
+ from .tensor_spaces import TensorDictSpace
+ from .encoder_decoder import EncoderDecoderBase
+ from .pl_creation_args import LightningModuleCreationArgs
+
+
+ def create_encoder_args_from_config_dict(
+     config_dict: dict[str, Union[dict[str, Any], tuple]], class_name_to_model: Callable[[str], Type[pl.LightningModule]]
+ ) -> Mapping[str, Union[LightningModuleCreationArgs, Tuple[LightningModuleCreationArgs, LightningModuleCreationArgs]]]:
+     """
+     Given a dictionary mapping modality names to their encoder-decoder arguments, create the corresponding
+     creation args (LightningModuleCreationArgs) for each modality.
+
+     See LightningModuleCreationArgs.from_dict for more details.
+
+     Args:
+         config_dict: A dictionary mapping modality names to their encoder-decoder arguments.
+             The root level of this dictionary should be the modality names we expect.
+         class_name_to_model: A function mapping class names to their corresponding model classes.
+
+     Returns:
+         A dictionary mapping modality names to their encoder-decoder creation args.
+         Each value may be a LightningModuleCreationArgs, or a tuple of two LightningModuleCreationArgs.
+         If the value is a LightningModuleCreationArgs, the same model is used for encoding and decoding.
+         If the value is a tuple of two LightningModuleCreationArgs, the first is used for encoding and the second for decoding.
+     """
+     # Giving explicit type hint here to make mypy happy
+     modalities: dict[str, Any] = {}
+     for modality_name, modality_config in config_dict.items():
+         if isinstance(modality_config, (list, tuple)):
+             assert len(modality_config) == 2, f"Expected two entries for modality {modality_name}, got {len(modality_config)}"
+             modalities[modality_name] = (
+                 LightningModuleCreationArgs.from_dict(modality_config[0], class_name_to_model),
+                 LightningModuleCreationArgs.from_dict(modality_config[1], class_name_to_model),
+             )
+         else:
+             modalities[modality_name] = LightningModuleCreationArgs.from_dict(modality_config, class_name_to_model)
+     return modalities
+
+
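For orientation, a minimal sketch of the outer config structure this function expects. Only the "modality name → single config or (encoder config, decoder config) pair" shape comes from the code above; the modality names, the inner `class_name`/`kwargs` keys, and the registry lookup are illustrative assumptions, not the repo's actual schema (which is defined by LightningModuleCreationArgs.from_dict in wham/models/wham_base/pl_creation_args.py).

```python
# Sketch only: inner dict keys are assumed; LightningModuleCreationArgs.from_dict
# defines the real schema.
example_config = {
    # Single config: the same module both encodes and decodes this modality.
    "images": {"class_name": "HypotheticalImageTokenizer", "kwargs": {"embedding_dim": 256}},
    # Pair of configs: the first entry is the encoder, the second the decoder.
    "actions": (
        {"class_name": "HypotheticalActionEncoder", "kwargs": {}},
        {"class_name": "HypotheticalActionDecoder", "kwargs": {}},
    ),
}

# Hypothetical registry standing in for the project's class resolution.
MODEL_REGISTRY = {}  # name -> pl.LightningModule subclass

def class_name_to_model(name):
    return MODEL_REGISTRY[name]

# encoder_args = create_encoder_args_from_config_dict(example_config, class_name_to_model)
```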
+ def create_encoder_modules_from_args(
+     encoders: Mapping[str, Union[LightningModuleCreationArgs, Tuple[LightningModuleCreationArgs, LightningModuleCreationArgs]]], remove_checkpoint_path: bool = True
+ ) -> th.nn.ModuleDict:
+     """
+     Create the encoder modules from the given creation args (LightningModuleCreationArgs).
+
+     Args:
+         encoders: A dictionary mapping modality names to their encoder-decoder creation args.
+             If the value is a LightningModuleCreationArgs, the same model is used for encoding and decoding.
+             If the value is a tuple of two LightningModuleCreationArgs, the first is used for encoding and the second for decoding.
+         remove_checkpoint_path: If True, remove the checkpoint_path from the creation args. This prepares the
+             created modules to be properly saved and loaded as part of the bigger model.
+
+     Returns:
+         A ModuleDict mapping modality names to their encoder-decoder modules.
+     """
+     modalities = {}
+     for modality_name, modality_args in encoders.items():
+         if isinstance(modality_args, (list, tuple)):
+             modalities[modality_name] = th.nn.ModuleList(
+                 [
+                     modality_args[0].create_module(remove_checkpoint_path=remove_checkpoint_path),
+                     modality_args[1].create_module(remove_checkpoint_path=remove_checkpoint_path),
+                 ]
+             )
+         else:
+             modalities[modality_name] = modality_args.create_module(remove_checkpoint_path=remove_checkpoint_path)
+     return th.nn.ModuleDict(modalities)
+
+
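Continuing the hypothetical sketch above, the creation args would then be materialized into modules; `encoder_args` is the (assumed) result of the earlier `create_encoder_args_from_config_dict` call, and only the function call and its ModuleDict return type come from the code above.

```python
# Continuation of the sketch (requires a populated encoder_args from the earlier example).
# remove_checkpoint_path=True strips checkpoint paths from the creation args so the
# resulting modules can be saved and reloaded as part of the larger model checkpoint.
context_encoders = create_encoder_modules_from_args(encoder_args, remove_checkpoint_path=True)
assert isinstance(context_encoders, th.nn.ModuleDict)
```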
+ class EncodePredictDecodeModule(pl.LightningModule):
+     """
+     Base class for models that encode, predict and decode.
+
+     Args:
+         predictor_args: Creation args (LightningModuleCreationArgs) for the predictor model.
+         context_encoders: A ModuleDict mapping modality names to their encoder-decoders.
+             If a value is a single EncoderDecoderBase module, the same model is used for encoding and decoding.
+             If a value is a ModuleList of two modules, the first is used for encoding and the second for decoding.
+         condition_encoders: Same as `context_encoders`, but for conditions.
+     """
+
+     def __init__(
+         self,
+         predictor_args: LightningModuleCreationArgs,
+         context_encoders: th.nn.ModuleDict,
+         condition_encoders: Optional[th.nn.ModuleDict] = None,
+     ):
+         if condition_encoders is None:
+             condition_encoders = th.nn.ModuleDict(dict())
+         self._assert_encoders(context_encoders)
+         self._assert_encoders(condition_encoders)
+         super().__init__()
+
+         self.context_encoders = context_encoders
+         self.condition_encoders = condition_encoders
+
+         self.context_world_space, self.context_encoder_space = self._get_spaces_from_encoders(context_encoders)
+         self.condition_world_space, self.condition_encoder_space = self._get_spaces_from_encoders(condition_encoders)
+
+         self.predictor = predictor_args.create_module(context_space=self.context_encoder_space, condition_space=self.condition_encoder_space)
+
+     def _assert_encoders(self, encoders: th.nn.ModuleDict) -> None:
+         """Check that the encoder dictionary is valid."""
+         assert isinstance(encoders, th.nn.ModuleDict), f"Invalid type for encoders: {type(encoders)}. Expected th.nn.ModuleDict"
+         for modality_name, encoder in encoders.items():
+             assert isinstance(encoder, EncoderDecoderBase) or isinstance(
+                 encoder, th.nn.ModuleList
+             ), f"Invalid type for modality {modality_name}: {type(encoder)}. Expected EncoderDecoderBase or Tuple[EncoderDecoderBase]"
+             if isinstance(encoder, th.nn.ModuleList):
+                 assert len(encoder) == 2, f"Invalid number of arguments for modality {modality_name}: {len(encoder)}. Expected two (encoder, decoder)"
+                 assert isinstance(
+                     encoder[0], EncoderDecoderBase
+                 ), f"Invalid type for encoder of modality {modality_name}: {type(encoder[0])}. Expected EncoderDecoderBase"
+                 assert isinstance(
+                     encoder[1], EncoderDecoderBase
+                 ), f"Invalid type for decoder of modality {modality_name}: {type(encoder[1])}. Expected EncoderDecoderBase"
+
+     def _get_spaces_from_encoders(self, encoders: th.nn.ModuleDict) -> Tuple[TensorDictSpace, TensorDictSpace]:
+         """
+         Given a modality dictionary mapping modality names to their encoders and decoders,
+         extract the world space and encoder space.
+         """
+         world_spaces = {}
+         encoder_spaces = {}
+         for modality_name, modality in encoders.items():
+             if isinstance(modality, EncoderDecoderBase):
+                 encoder_spaces[modality_name] = modality.encoder_space
+                 world_spaces[modality_name] = modality.world_space
+             elif isinstance(modality, th.nn.ModuleList):
+                 assert len(modality) == 2, f"Invalid number of modules for modality {modality_name}: {len(modality)}. Expected 2."
+                 # Make sure that both encoder and decoder spaces match the expected space
+                 encoder_encoder_space = modality[0].encoder_space
+                 decoder_encoder_space = modality[1].encoder_space
+                 assert (
+                     encoder_encoder_space == decoder_encoder_space
+                 ), f"Encoder and decoder spaces for modality {modality_name} do not match: {encoder_encoder_space} != {decoder_encoder_space}"
+                 encoder_world_space = modality[0].world_space
+                 decoder_world_space = modality[1].world_space
+                 assert (
+                     encoder_world_space == decoder_world_space
+                 ), f"Encoder and decoder world spaces for modality {modality_name} do not match: {encoder_world_space} != {decoder_world_space}"
+                 encoder_spaces[modality_name] = encoder_encoder_space
+                 world_spaces[modality_name] = encoder_world_space
+             else:
+                 raise TypeError(f"Invalid type for modality {modality_name}: {type(modality)}. Expected EncoderDecoderBase or th.nn.ModuleList")
+         return TensorDictSpace(world_spaces), TensorDictSpace(encoder_spaces)
+
+     def _encode(self, input_td: TensorDict, encoders: th.nn.ModuleDict, space: TensorDictSpace) -> TensorDict:
+         """
+         Encode input_td into the encoder space using the given encoders.
+
+         Args:
+             input_td: A tensordict mapping modality names to their inputs.
+             encoders: A dictionary mapping modality names to their encoders.
+             space: The expected space, used to determine the batch dimensions of the returned tensordict.
+
+         Returns:
+             An encoded tensordict.
+         """
+         encoded_context = {}
+         preceding_dims = space.get_preceding_dimensions(input_td, allow_key_subset=True)
+         for modality_name in input_td.keys():
+             encoder = encoders[modality_name]
+             if isinstance(encoder, EncoderDecoderBase):
+                 encoded_context[modality_name] = encoder.encode(input_td[modality_name])
+             elif isinstance(encoder, th.nn.ModuleList):
+                 encoded_context[modality_name] = encoder[0].encode(input_td[modality_name])
+             else:
+                 raise TypeError(f"Invalid type for modality {modality_name}: {type(encoder)}. Expected EncoderDecoderBase or th.nn.ModuleList")
+         return TensorDict(encoded_context, batch_size=preceding_dims)
+
+     def _decode(self, input_td: TensorDict, encoders: th.nn.ModuleDict, space: TensorDictSpace) -> TensorDict:
+         """
+         Decode input_td back into the original space using the given encoders.
+
+         Args:
+             input_td: A tensordict mapping modality names to their encoded inputs.
+             encoders: A dictionary mapping modality names to their encoders.
+             space: The expected space, used to determine the batch dimensions of the returned tensordict.
+
+         Returns:
+             A decoded tensordict.
+         """
+         decoded_context = {}
+         preceding_dims = space.get_preceding_dimensions(input_td, allow_key_subset=True)
+         for modality_name in input_td.keys():
+             encoder = encoders[modality_name]
+             if isinstance(encoder, EncoderDecoderBase):
+                 decoded_context[modality_name] = encoder.decode(input_td[modality_name])
+             elif isinstance(encoder, th.nn.ModuleList):
+                 decoded_context[modality_name] = encoder[1].decode(input_td[modality_name])
+             else:
+                 raise TypeError(f"Invalid type for modality {modality_name}: {type(encoder)}. Expected EncoderDecoderBase or th.nn.ModuleList")
+         return TensorDict(decoded_context, batch_size=preceding_dims)
+
+     def encode_context(self, context: TensorDict) -> TensorDict:
+         """
+         Encode the given context into the encoder space.
+
+         Args:
+             context: A tensordict mapping modality names to their inputs.
+
+         Returns:
+             An encoded tensordict.
+         """
+         assert self.context_world_space.contains(context, allow_key_subset=True), f"Context {context} is not contained in context world space {self.context_world_space}"
+         return self._encode(context, self.context_encoders, self.context_world_space)
+
+     def decode_context(self, encoded_context: TensorDict) -> TensorDict:
+         """
+         Decode the given encoded context into the original space.
+
+         Args:
+             encoded_context: A tensordict mapping modality names to their encoded inputs.
+
+         Returns:
+             A decoded tensordict.
+         """
+         assert self.context_encoder_space.contains(
+             encoded_context,
+             allow_key_subset=True,
+         ), f"Encoded context {encoded_context} is not contained in context encoder space {self.context_encoder_space}"
+         return self._decode(encoded_context, self.context_encoders, self.context_encoder_space)
+
+     def encode_condition(self, condition: TensorDict) -> TensorDict:
+         """
+         Encode the given condition into the encoder space.
+
+         Args:
+             condition: A tensordict mapping modality names to their inputs.
+
+         Returns:
+             An encoded tensordict.
+         """
+         assert self.condition_world_space.contains(
+             condition, allow_key_subset=True
+         ), f"Condition {condition} is not contained in condition world space {self.condition_world_space}"
+         return self._encode(condition, self.condition_encoders, self.condition_world_space)
+
+     def decode_condition(self, encoded_condition: TensorDict) -> TensorDict:
+         """
+         Decode the given encoded condition into the original space.
+
+         Args:
+             encoded_condition: A tensordict mapping modality names to their encoded inputs.
+
+         Returns:
+             A decoded tensordict.
+         """
+         assert self.condition_encoder_space.contains(
+             encoded_condition, allow_key_subset=True
+         ), f"Encoded condition {encoded_condition} is not contained in condition encoder space {self.condition_encoder_space}"
+         return self._decode(encoded_condition, self.condition_encoders, self.condition_encoder_space)
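To close the file, a hedged sketch of how the encode/decode helpers round-trip data once a module has been built. `model` stands for an already-constructed EncodePredictDecodeModule (or subclass); the modality names, tensor shapes, and (batch, time) leading dimensions are assumptions for illustration, and the prediction step is model-specific, so it is only indicated in a comment.

```python
import torch as th
from tensordict import TensorDict

# `model` is assumed to be an EncodePredictDecodeModule built with image/action
# encoders matching the hypothetical config sketched earlier in this section.
context = TensorDict(
    {
        "images": th.rand(2, 10, 3, 128, 128),  # (batch, time, C, H, W) -- assumed shape
        "actions": th.rand(2, 10, 16),          # (batch, time, action_dim) -- assumed shape
    },
    batch_size=(2, 10),
)

encoded = model.encode_context(context)   # world space -> encoder space
# ... run model.predictor on the encoded context (model-specific, not part of this base class) ...
decoded = model.decode_context(encoded)   # encoder space -> world space
```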