kernel-luso-comfort committed
Commit 6ba63c9 · Parent: cbd253a

Add initial module structure and entry points for modeling and utilities

This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
Files changed (50)
  1. .gitattributes +1 -0
  2. Dockerfile +77 -0
  3. README.md +5 -5
  4. colabs/ENVIRONMENT.md +6 -0
  5. colabs/biomedparse_inference_demo.py +156 -0
  6. colabs/environment.yml +149 -0
  7. colabs/requirements-colab-pip-freeze.txt +567 -0
  8. colabs/requirements-colab.txt +39 -0
  9. configs/biomedparse_inference.yaml +204 -0
  10. entrypoint.sh +5 -0
  11. examples/Part_1_516_pathology_breast.png +3 -0
  12. inference_utils/inference.py +149 -0
  13. inference_utils/output_processing.py +91 -0
  14. inference_utils/processing_utils.py +182 -0
  15. inference_utils/target_dist.json +1 -0
  16. main.py +106 -0
  17. modeling/BaseModel.py +45 -0
  18. modeling/__init__.py +1 -0
  19. modeling/architectures/__init__.py +5 -0
  20. modeling/architectures/build.py +22 -0
  21. modeling/architectures/seem_model_demo.py +923 -0
  22. modeling/architectures/seem_model_v0.py +1160 -0
  23. modeling/architectures/seem_model_v1.py +1179 -0
  24. modeling/architectures/xdecoder_model.py +937 -0
  25. modeling/body/__init__.py +10 -0
  26. modeling/body/build.py +13 -0
  27. modeling/body/xdecoder_head.py +126 -0
  28. modeling/interface/__init__.py +13 -0
  29. modeling/interface/build.py +14 -0
  30. modeling/interface/modules.py +200 -0
  31. modeling/interface/prototype/__init__.py +0 -0
  32. modeling/interface/prototype/attention_data_struct_seemdemo.py +265 -0
  33. modeling/interface/prototype/attention_data_struct_seemv0.py +264 -0
  34. modeling/interface/prototype/attention_data_struct_seemv1.py +302 -0
  35. modeling/interface/seem_demo.py +397 -0
  36. modeling/interface/seem_v0.py +392 -0
  37. modeling/interface/seem_v1.py +389 -0
  38. modeling/interface/xdecoder.py +497 -0
  39. modeling/language/LangEncoder/__init__.py +35 -0
  40. modeling/language/LangEncoder/build.py +16 -0
  41. modeling/language/LangEncoder/transformer.py +222 -0
  42. modeling/language/__init__.py +10 -0
  43. modeling/language/build.py +14 -0
  44. modeling/language/loss.py +232 -0
  45. modeling/language/misc.py +66 -0
  46. modeling/language/vlpencoder.py +206 -0
  47. modeling/modules/__init__.py +6 -0
  48. modeling/modules/attention.py +487 -0
  49. modeling/modules/criterion.py +874 -0
  50. modeling/modules/matcher.py +632 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.png filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,77 @@
+ # read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
+ # you will also find guides on how best to write your Dockerfile
+
+ FROM continuumio/miniconda3:latest
+
+ # Add build argument to force rebuild
+ ARG CACHEBUST=1
+
+ # Avoid tzdata interactive configuration
+ ENV DEBIAN_FRONTEND=noninteractive
+ ENV TZ=UTC
+
+ # Install system dependencies
+ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
+     git \
+     build-essential \
+     python3-dev \
+     wget \
+     openmpi-bin \
+     libopenmpi-dev \
+     libopenmpi3 \
+     libhwloc15 \
+     libevent-dev \
+     libpmix2 \
+     libgl1 \
+     libglib2.0-0 \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Set up OpenMPI environment
+ ENV OMPI_MCA_btl_vader_single_copy_mechanism=none \
+     OMPI_ALLOW_RUN_AS_ROOT=1 \
+     OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 \
+     PATH=/usr/lib/x86_64-linux-gnu/openmpi/bin:$PATH \
+     LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu/openmpi/lib:/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH
+
+ # Copy environment file
+ COPY colabs/environment.yml /tmp/environment.yml
+
+ # Create conda environment
+ RUN conda env create -f /tmp/environment.yml && \
+     conda run -n biomedparse pip install gradio==3.50.2
+
+ # Initialize conda in bash
+ RUN conda init bash
+
+ # Make RUN commands use the new environment
+ SHELL ["conda", "run", "-n", "biomedparse", "/bin/bash", "-c"]
+
+ # Set up a new user named "user" with user ID 1000
+ RUN useradd -m -u 1000 user
+
+ # Switch to the "user" user
+ USER user
+
+ # Set up HF token for the user
+ RUN --mount=type=secret,id=HF_TOKEN,mode=0444,required=true \
+     echo "export HF_TOKEN=$(cat /run/secrets/HF_TOKEN)" >> $HOME/.bashrc
+
+ # Set home to the user's home directory
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH
+
+ # Set the working directory to the user's home directory
+ WORKDIR $HOME/app
+
+ # Copy all files to the app directory
+ COPY --chown=user . $HOME/app
+
+ # Set permissions for entrypoint script
+ RUN chmod 755 $HOME/app/entrypoint.sh
+
+ # Add conda environment to user's path
+ RUN echo "conda activate biomedparse" >> $HOME/.bashrc
+
+ # Use entrypoint script to set up environment and run application
+ ENTRYPOINT ["/bin/bash", "-c"]
+ CMD ["exec /home/user/app/entrypoint.sh"]
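Note on the HF_TOKEN secret: the Dockerfile only exposes the token (via the build secret and .bashrc), and how main.py consumes it is not part of this diff. A minimal sketch of the assumed runtime usage, hypothetical rather than the actual main.py, would read the variable and pass it to huggingface_hub when pulling the gated checkpoint:

# Hypothetical sketch: consume the HF_TOKEN exposed by the Dockerfile/entrypoint
# to download the gated BiomedParse checkpoint at container start-up.
import os
from huggingface_hub import hf_hub_download

token = os.environ.get("HF_TOKEN")  # injected via the Space secret
model_file = hf_hub_download(
    repo_id="microsoft/BiomedParse",
    filename="biomedparse_v1.pt",
    local_dir="pretrained",
    token=token,  # required because the checkpoint is gated
)
print(f"Checkpoint available at {model_file}")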
README.md CHANGED
@@ -1,11 +1,11 @@
  ---
- title: BiomedParse
- emoji: 📊
- colorFrom: gray
- colorTo: purple
+ title: Biomedparse Docker
+ emoji: 📉
+ colorFrom: yellow
+ colorTo: blue
  sdk: docker
  pinned: false
- short_description: BiomedParse
+ license: cc-by-nc-sa-4.0
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
colabs/ENVIRONMENT.md ADDED
@@ -0,0 +1,6 @@
+ # Description of Google Colab Environment
+
+ - Hardware: Python 3 Google Compute Engine Backend on T4 GPU
+ - CUDA version: 12.2
+ - Driver version: 535.104.05
+ - Python version: 3.10.12
colabs/biomedparse_inference_demo.py ADDED
@@ -0,0 +1,156 @@
+ # -*- coding: utf-8 -*-
+ """biomedparse_inference_demo.ipynb
+
+ Automatically generated by Colab.
+
+ Original file is located at
+     https://colab.research.google.com/drive/1jL4wvdtBWz6G_yBkFn8tyDD0hV1RtKVZ
+
+ # BiomedParse Inference Demo Notebook
+
+ Welcome to the demo notebook for BiomedParse, a comprehensive tool for biomedical image analysis. BiomedParse is designed to simultaneously handle segmentation, detection, and recognition tasks across major biomedical image modalities, providing a unified solution for complex image analysis in biomedical research.
+
+ [[`Paper`](https://aka.ms/biomedparse-paper)] [[`Demo`](https://microsoft.github.io/BiomedParse/)] [[`Model`](https://huggingface.co/microsoft/BiomedParse)] [[`Data`](https://huggingface.co/datasets/microsoft/BiomedParseData)]
+
+ ## Model Checkpoint Access
+
+ The BiomedParse model checkpoint is hosted on [HuggingFace](https://huggingface.co/microsoft/BiomedParse). To access the model:
+
+ 1. Visit the [model page](https://huggingface.co/microsoft/BiomedParse).
+ 2. Make sure to review and accept the terms of use to gain access to the checkpoint.
+ 3. Retrieve your HuggingFace access token from your user profile.
+
+ ## Setting Up Access
+
+ To use the model, set your Hugging Face access token in the HF_TOKEN environment variable or as a Colab secret. This step ensures secure and authorized access to the model resources.
+ """
+
+ # Set your Hugging Face access token in your environment
+ # import os
+ # os.environ['HF_TOKEN'] = 'your_huggingface_access_token_here'
+
+ # Or, if you are using Google Colab, set HF_TOKEN in Colab secrets.
+
+ from google.colab import userdata
+ import huggingface_hub
+
+ huggingface_hub.login(userdata.get('HF_TOKEN'))
+
+ from huggingface_hub import hf_hub_download
+
+ model_file = hf_hub_download(repo_id="microsoft/BiomedParse", filename="biomedparse_v1.pt", local_dir="pretrained")
+
+ print(f"Downloaded model file to: {model_file}")
+
+ """## Environment Setup"""
+
+ !git clone https://github.com/microsoft/BiomedParse
+
+ !pip install -r BiomedParse/assets/requirements/requirements.txt
+
+ """# Restart Colab Runtime"""
+
+ # Make sure to restart the Colab runtime after installing dependencies
+ import os
+ try:
+     import google.colab
+     os._exit(0)
+ except ImportError:
+     pass
+
+ import os
+ os.chdir('/content/BiomedParse')
+ print(os.getcwd())
+
+ """## Load the model weights"""
+
+ from PIL import Image
+ import torch
+ import argparse
+ import numpy as np
+ from modeling.BaseModel import BaseModel
+ from modeling import build_model
+ from utilities.distributed import init_distributed  # changed from utils
+ from utilities.arguments import load_opt_from_config_files
+ from utilities.constants import BIOMED_CLASSES
+ from inference_utils.inference import interactive_infer_image
+
+ conf_files = "configs/biomedparse_inference.yaml"
+ opt = load_opt_from_config_files([conf_files])
+ opt = init_distributed(opt)
+
+ model_file = "../pretrained/biomedparse_v1.pt"
+
+ model = BaseModel(opt, build_model(opt)).from_pretrained(model_file).eval().cuda()
+ with torch.no_grad():
+     model.model.sem_seg_head.predictor.lang_encoder.get_text_embeddings(BIOMED_CLASSES + ["background"], is_eval=True)
+
+ """# Run Inference"""
+
+ # RGB image input of shape (H, W, 3). Currently only batch size 1 is supported.
+ image = Image.open('examples/Part_1_516_pathology_breast.png', formats=['png'])
+ image = image.convert('RGB')
+
+ # Text prompts querying objects in the image. Multiple prompts can be provided.
+ prompts = ['neoplastic cells', 'inflammatory cells']
+
+ pred_mask = interactive_infer_image(model, image, prompts)
+ pred_mask.shape
+
+ # Load the ground truth masks
+ gt_masks = []
+ for prompt in prompts:
+     gt_mask = Image.open(f"examples/Part_1_516_pathology_breast_{prompt.replace(' ', '+')}.png", formats=['png'])
+     gt_mask = 1*(np.array(gt_mask.convert('RGB'))[:,:,0] > 0)
+     gt_masks.append(gt_mask)
+
+ # Compare predictions with the ground truth masks
+ for i, pred in enumerate(pred_mask):
+     gt = gt_masks[i]
+     dice = (1*(pred>0.5) & gt).sum() * 2.0 / (1*(pred>0.5).sum() + gt.sum())
+     print(f'Dice score for {prompts[i]}: {dice:.4f}')
+
+ import numpy as np
+ import matplotlib.pyplot as plt
+ from PIL import Image
+ import matplotlib.patches as mpatches
+
+ def overlay_masks(image, masks, colors):
+     overlay = image.copy()
+     overlay = np.array(overlay, dtype=np.uint8)
+     for mask, color in zip(masks, colors):
+         overlay[mask > 0] = (overlay[mask > 0] * 0.4 + np.array(color) * 0.6).astype(np.uint8)
+     return Image.fromarray(overlay)
+
+ def generate_colors(n):
+     cmap = plt.get_cmap('tab10')
+     colors = [tuple(int(255 * val) for val in cmap(i)[:3]) for i in range(n)]
+     return colors
+
+ original_image = Image.open('examples/Part_1_516_pathology_breast.png').convert('RGB')
+
+ colors = generate_colors(len(prompts))
+
+ pred_overlay = overlay_masks(original_image, [1*(pred_mask[i] > 0.5) for i in range(len(prompts))], colors)
+
+ gt_overlay = overlay_masks(original_image, gt_masks, colors)
+
+ legend_patches = [mpatches.Patch(color=np.array(color) / 255, label=prompt) for color, prompt in zip(colors, prompts)]
+
+ fig, axes = plt.subplots(1, 3, figsize=(15, 5))
+ axes[0].imshow(original_image)
+ axes[0].set_title("Original Image")
+ axes[0].axis('off')
+
+ axes[1].imshow(pred_overlay)
+ axes[1].set_title("Predictions")
+ axes[1].axis('off')
+ axes[1].legend(handles=legend_patches, loc='upper right', fontsize='small')
+
+ axes[2].imshow(gt_overlay)
+ axes[2].set_title("Ground Truth")
+ axes[2].axis('off')
+ axes[2].legend(handles=legend_patches, loc='upper right', fontsize='small')
+
+ plt.tight_layout()
+ plt.show()
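As an aside, the Dice computation in the demo is written inline; an equivalent standalone helper (a sketch using only NumPy, with the same 0.5 threshold) makes the binarization step explicit:

import numpy as np

def dice_score(pred_prob, gt_mask, threshold=0.5):
    # Binarize the predicted probability map, then compare it with the ground-truth mask.
    pred = (pred_prob > threshold).astype(np.uint8)
    gt = (gt_mask > 0).astype(np.uint8)
    intersection = (pred & gt).sum()
    denom = pred.sum() + gt.sum()
    return 2.0 * intersection / denom if denom > 0 else 1.0

# Example: dice_score(pred_mask[i], gt_masks[i]) reproduces the per-prompt scores printed above.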
colabs/environment.yml ADDED
@@ -0,0 +1,149 @@
1
+ name: biomedparse
2
+ channels:
3
+ - pytorch
4
+ - nvidia
5
+ - defaults
6
+ dependencies:
7
+ - _libgcc_mutex=0.1=main
8
+ - _openmp_mutex=5.1=1_gnu
9
+ - blas=1.0=mkl
10
+ - brotli-python=1.0.9=py39h6a678d5_8
11
+ - bzip2=1.0.8=h5eee18b_6
12
+ - ca-certificates=2024.7.2=h06a4308_0
13
+ - certifi=2024.7.4=py39h06a4308_0
14
+ - charset-normalizer=3.3.2=pyhd3eb1b0_0
15
+ - cuda-cudart=12.4.127=0
16
+ - cuda-cupti=12.4.127=0
17
+ - cuda-libraries=12.4.0=0
18
+ - cuda-nvrtc=12.4.127=0
19
+ - cuda-nvtx=12.4.127=0
20
+ - cuda-opencl=12.6.37=0
21
+ - cuda-runtime=12.4.0=0
22
+ - cuda-version=12.6=3
23
+ - ffmpeg=4.3=hf484d3e_0
24
+ - filelock=3.13.1=py39h06a4308_0
25
+ - freetype=2.12.1=h4a9f257_0
26
+ - gmp=6.2.1=h295c915_3
27
+ - gmpy2=2.1.2=py39heeb90bb_0
28
+ - gnutls=3.6.15=he1e5248_0
29
+ - idna=3.7=py39h06a4308_0
30
+ - intel-openmp=2023.1.0=hdb19cb5_46306
31
+ - jinja2=3.1.4=py39h06a4308_0
32
+ - jpeg=9e=h5eee18b_3
33
+ - lame=3.100=h7b6447c_0
34
+ - lcms2=2.12=h3be6417_0
35
+ - ld_impl_linux-64=2.38=h1181459_1
36
+ - lerc=3.0=h295c915_0
37
+ - libcublas=12.4.2.65=0
38
+ - libcufft=11.2.0.44=0
39
+ - libcufile=1.11.0.15=0
40
+ - libcurand=10.3.7.37=0
41
+ - libcusolver=11.6.0.99=0
42
+ - libcusparse=12.3.0.142=0
43
+ - libdeflate=1.17=h5eee18b_1
44
+ - libffi=3.4.4=h6a678d5_1
45
+ - libgcc-ng=11.2.0=h1234567_1
46
+ - libgomp=11.2.0=h1234567_1
47
+ - libiconv=1.16=h5eee18b_3
48
+ - libidn2=2.3.4=h5eee18b_0
49
+ - libjpeg-turbo=2.0.0=h9bf148f_0
50
+ - libnpp=12.2.5.2=0
51
+ - libnvfatbin=12.6.20=0
52
+ - libnvjitlink=12.4.99=0
53
+ - libnvjpeg=12.3.1.89=0
54
+ - libpng=1.6.39=h5eee18b_0
55
+ - libstdcxx-ng=11.2.0=h1234567_1
56
+ - libtasn1=4.19.0=h5eee18b_0
57
+ - libtiff=4.5.1=h6a678d5_0
58
+ - libunistring=0.9.10=h27cfd23_0
59
+ - libwebp-base=1.3.2=h5eee18b_0
60
+ - llvm-openmp=14.0.6=h9e868ea_0
61
+ - lz4-c=1.9.4=h6a678d5_1
62
+ - markupsafe=2.1.3=py39h5eee18b_0
63
+ - mkl=2023.1.0=h213fc3f_46344
64
+ - mkl-service=2.4.0=py39h5eee18b_1
65
+ - mkl_fft=1.3.8=py39h5eee18b_0
66
+ - mkl_random=1.2.4=py39hdb19cb5_0
67
+ - mpc=1.1.0=h10f8cd9_1
68
+ - mpfr=4.0.2=hb69a4c5_1
69
+ - mpmath=1.3.0=py39h06a4308_0
70
+ - ncurses=6.4=h6a678d5_0
71
+ - nettle=3.7.3=hbbd107a_1
72
+ - networkx=3.2.1=py39h06a4308_0
73
+ - openh264=2.1.1=h4ff587b_0
74
+ - openjpeg=2.5.2=he7f1fd0_0
75
+ - openssl=3.0.14=h5eee18b_0
76
+ - pip=24.2=py39h06a4308_0
77
+ - pysocks=1.7.1=py39h06a4308_0
78
+ - python=3.9.19=h955ad1f_1
79
+ - pytorch=2.4.0=py3.9_cuda12.4_cudnn9.1.0_0
80
+ - pytorch-cuda=12.4=hc786d27_6
81
+ - pytorch-mutex=1.0=cuda
82
+ - pyyaml=6.0.1=py39h5eee18b_0
83
+ - readline=8.2=h5eee18b_0
84
+ - requests=2.32.3=py39h06a4308_0
85
+ - setuptools=72.1.0=py39h06a4308_0
86
+ - sqlite=3.45.3=h5eee18b_0
87
+ - sympy=1.12=py39h06a4308_0
88
+ - tbb=2021.8.0=hdb19cb5_0
89
+ - tk=8.6.14=h39e8969_0
90
+ - torchaudio=2.4.0=py39_cu124
91
+ - torchtriton=3.0.0=py39
92
+ - torchvision=0.19.0=py39_cu124
93
+ - typing_extensions=4.11.0=py39h06a4308_0
94
+ - tzdata=2024a=h04d1e81_0
95
+ - urllib3=2.2.2=py39h06a4308_0
96
+ - wheel=0.43.0=py39h06a4308_0
97
+ - xz=5.4.6=h5eee18b_1
98
+ - yaml=0.2.5=h7b6447c_0
99
+ - zlib=1.2.13=h5eee18b_1
100
+ - zstd=1.5.5=hc292b87_2
101
+ - pip:
102
+ - accelerate==0.23.0
103
+ - antlr4-python3-runtime==4.9.3
104
+ - appdirs==1.4.4
105
+ - black==21.4b2
106
+ - open-clip-torch==2.26.1
107
+ - cloudpickle==3.0.0
108
+ - cython==3.0.2
109
+ - deepspeed==0.10.3
110
+ - git+https://github.com/MaureenZOU/detectron2-xyz.git
111
+ - diffdist==0.1
112
+ - einops==0.8.0
113
+ - ftfy==6.1.1
114
+ - fvcore==0.1.5.post20221221
115
+ - hjson==3.1.0
116
+ - huggingface-hub==0.17.3
117
+ - hydra-core==1.3.2
118
+ - imageio==2.35.1
119
+ - infinibatch==0.1.1
120
+ - iopath==0.1.9
121
+ - json-tricks==3.17.3
122
+ - kornia==0.7.0
123
+ - mpi4py==3.1.5
124
+ - mup==1.0.0
125
+ - mypy-extensions==1.0.0
126
+ - ninja==1.11.1.1
127
+ - nltk==3.8.1
128
+ - numpy==1.23.1
129
+ - omegaconf==2.3.0
130
+ - opencv-python==4.8.1.78
131
+ - pandas==2.0.3
132
+ - pathspec==0.12.1
133
+ - pillow==9.4.0
134
+ - portalocker==2.10.1
135
+ - py-cpuinfo==9.0.0
136
+ - pycocotools==2.0.7
137
+ - pydantic==1.10.18
138
+ - pydot==3.0.1
139
+ - regex==2023.10.3
140
+ - scikit-image==0.21.0
141
+ - scikit-learn==1.3.1
142
+ - sentencepiece==0.1.99
143
+ - tabulate==0.9.0
144
+ - termcolor==2.4.0
145
+ - timm==0.4.12
146
+ - tokenizers==0.14.1
147
+ - transformers==4.34.0
148
+ - vision-datasets==0.2.2
149
+ - yacs==0.1.8
colabs/requirements-colab-pip-freeze.txt ADDED
@@ -0,0 +1,567 @@
1
+ absl-py==1.4.0
2
+ accelerate==0.23.0
3
+ aiohappyeyeballs==2.4.4
4
+ aiohttp==3.11.10
5
+ aiosignal==1.3.2
6
+ alabaster==1.0.0
7
+ albucore==0.0.19
8
+ albumentations==1.4.20
9
+ altair==5.5.0
10
+ annotated-types==0.7.0
11
+ antlr4-python3-runtime==4.9.3
12
+ anyio==3.7.1
13
+ appdirs==1.4.4
14
+ argon2-cffi==23.1.0
15
+ argon2-cffi-bindings==21.2.0
16
+ array_record==0.5.1
17
+ arviz==0.20.0
18
+ astropy==6.1.7
19
+ astropy-iers-data==0.2024.12.16.0.35.48
20
+ astunparse==1.6.3
21
+ async-timeout==4.0.3
22
+ atpublic==4.1.0
23
+ attrs==24.3.0
24
+ audioread==3.0.1
25
+ autograd==1.7.0
26
+ babel==2.16.0
27
+ backcall==0.2.0
28
+ beautifulsoup4==4.12.3
29
+ bigframes==1.29.0
30
+ bigquery-magics==0.4.0
31
+ black==21.4b2
32
+ bleach==6.2.0
33
+ blinker==1.9.0
34
+ blis==0.7.11
35
+ blosc2==2.7.1
36
+ bokeh==3.6.2
37
+ Bottleneck==1.4.2
38
+ bqplot==0.12.43
39
+ branca==0.8.1
40
+ CacheControl==0.14.1
41
+ cachetools==5.5.0
42
+ catalogue==2.0.10
43
+ certifi==2024.12.14
44
+ cffi==1.17.1
45
+ chardet==5.2.0
46
+ charset-normalizer==3.4.0
47
+ chex==0.1.88
48
+ clarabel==0.9.0
49
+ click==8.1.7
50
+ cloudpathlib==0.20.0
51
+ cloudpickle==3.1.0
52
+ cmake==3.31.2
53
+ cmdstanpy==1.2.5
54
+ colorcet==3.1.0
55
+ colorlover==0.3.0
56
+ colour==0.1.5
57
+ community==1.0.0b1
58
+ confection==0.1.5
59
+ cons==0.4.6
60
+ contourpy==1.3.1
61
+ cryptography==43.0.3
62
+ cuda-python==12.2.1
63
+ cudf-cu12 @ https://pypi.nvidia.com/cudf-cu12/cudf_cu12-24.10.1-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl
64
+ cufflinks==0.17.3
65
+ cupy-cuda12x==12.2.0
66
+ cvxopt==1.3.2
67
+ cvxpy==1.6.0
68
+ cycler==0.12.1
69
+ cymem==2.0.10
70
+ Cython==3.0.2
71
+ dask==2024.10.0
72
+ datascience==0.17.6
73
+ db-dtypes==1.3.1
74
+ dbus-python==1.2.18
75
+ debugpy==1.8.0
76
+ decorator==4.4.2
77
+ deepspeed==0.10.3
78
+ defusedxml==0.7.1
79
+ Deprecated==1.2.15
80
+ detectron2 @ git+https://github.com/MaureenZOU/detectron2-xyz.git@42121d75e10d9f858f3a91b6a39f5722c02868f0
81
+ diffdist==0.1
82
+ diffusers==0.31.0
83
+ distro==1.9.0
84
+ dlib==19.24.2
85
+ dm-tree==0.1.8
86
+ docker-pycreds==0.4.0
87
+ docstring_parser==0.16
88
+ docutils==0.21.2
89
+ dopamine_rl==4.1.0
90
+ duckdb==1.1.3
91
+ earthengine-api==1.4.3
92
+ easydict==1.13
93
+ editdistance==0.8.1
94
+ eerepr==0.0.4
95
+ einops==0.8.0
96
+ en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl#sha256=86cc141f63942d4b2c5fcee06630fd6f904788d2f0ab005cce45aadb8fb73889
97
+ entrypoints==0.4
98
+ et_xmlfile==2.0.0
99
+ etils==1.11.0
100
+ etuples==0.3.9
101
+ eval_type_backport==0.2.0
102
+ exceptiongroup==1.2.2
103
+ fastai==2.7.18
104
+ fastcore==1.7.27
105
+ fastdownload==0.0.7
106
+ fastjsonschema==2.21.1
107
+ fastprogress==1.0.3
108
+ fastrlock==0.8.3
109
+ filelock==3.16.1
110
+ firebase-admin==6.6.0
111
+ Flask==3.1.0
112
+ flatbuffers==24.3.25
113
+ flax==0.8.5
114
+ folium==0.19.2
115
+ fonttools==4.55.3
116
+ frozendict==2.4.6
117
+ frozenlist==1.5.0
118
+ fsspec==2024.10.0
119
+ ftfy==6.1.1
120
+ future==1.0.0
121
+ fvcore==0.1.5.post20221221
122
+ gast==0.6.0
123
+ gcsfs==2024.10.0
124
+ GDAL==3.6.4
125
+ gdown==5.2.0
126
+ geemap==0.35.1
127
+ gensim==4.3.3
128
+ geocoder==1.38.1
129
+ geographiclib==2.0
130
+ geopandas==1.0.1
131
+ geopy==2.4.1
132
+ gin-config==0.5.0
133
+ gitdb==4.0.11
134
+ GitPython==3.1.43
135
+ glob2==0.7
136
+ google==2.0.3
137
+ google-ai-generativelanguage==0.6.10
138
+ google-api-core==2.19.2
139
+ google-api-python-client==2.155.0
140
+ google-auth==2.27.0
141
+ google-auth-httplib2==0.2.0
142
+ google-auth-oauthlib==1.2.1
143
+ google-cloud-aiplatform==1.74.0
144
+ google-cloud-bigquery==3.25.0
145
+ google-cloud-bigquery-connection==1.17.0
146
+ google-cloud-bigquery-storage==2.27.0
147
+ google-cloud-bigtable==2.27.0
148
+ google-cloud-core==2.4.1
149
+ google-cloud-datastore==2.20.2
150
+ google-cloud-firestore==2.19.0
151
+ google-cloud-functions==1.19.0
152
+ google-cloud-iam==2.17.0
153
+ google-cloud-language==2.16.0
154
+ google-cloud-pubsub==2.27.1
155
+ google-cloud-resource-manager==1.14.0
156
+ google-cloud-storage==2.19.0
157
+ google-cloud-translate==3.19.0
158
+ google-colab @ file:///colabtools/dist/google_colab-1.0.0.tar.gz
159
+ google-crc32c==1.6.0
160
+ google-genai==0.3.0
161
+ google-generativeai==0.8.3
162
+ google-pasta==0.2.0
163
+ google-resumable-media==2.7.2
164
+ googleapis-common-protos==1.66.0
165
+ googledrivedownloader==0.4
166
+ graphviz==0.20.3
167
+ greenlet==3.1.1
168
+ grpc-google-iam-v1==0.13.1
169
+ grpcio==1.68.1
170
+ grpcio-status==1.62.3
171
+ gspread==6.0.2
172
+ gspread-dataframe==3.3.1
173
+ gym==0.25.2
174
+ gym-notices==0.0.8
175
+ h11==0.14.0
176
+ h5netcdf==1.4.1
177
+ h5py==3.12.1
178
+ hjson==3.1.0
179
+ holidays==0.63
180
+ holoviews==1.20.0
181
+ html5lib==1.1
182
+ httpcore==1.0.7
183
+ httpimport==1.4.0
184
+ httplib2==0.22.0
185
+ httpx==0.28.1
186
+ huggingface-hub==0.17.3
187
+ humanize==4.11.0
188
+ hydra-core==1.3.2
189
+ hyperopt==0.2.7
190
+ ibis-framework==9.2.0
191
+ idna==3.10
192
+ imageio==2.36.1
193
+ imageio-ffmpeg==0.5.1
194
+ imagesize==1.4.1
195
+ imbalanced-learn==0.12.4
196
+ imgaug==0.4.0
197
+ immutabledict==4.2.1
198
+ importlib_metadata==8.5.0
199
+ importlib_resources==6.4.5
200
+ imutils==0.5.4
201
+ infinibatch==0.1.1
202
+ inflect==7.4.0
203
+ iniconfig==2.0.0
204
+ intel-cmplr-lib-ur==2025.0.4
205
+ intel-openmp==2025.0.4
206
+ iopath==0.1.9
207
+ ipyevents==2.0.2
208
+ ipyfilechooser==0.6.0
209
+ ipykernel==5.5.6
210
+ ipyleaflet==0.19.2
211
+ ipyparallel==8.8.0
212
+ ipython==7.34.0
213
+ ipython-genutils==0.2.0
214
+ ipython-sql==0.5.0
215
+ ipytree==0.2.2
216
+ ipywidgets==7.7.1
217
+ itsdangerous==2.2.0
218
+ jax==0.4.33
219
+ jax-cuda12-pjrt==0.4.33
220
+ jax-cuda12-plugin==0.4.33
221
+ jaxlib==0.4.33
222
+ jeepney==0.7.1
223
+ jellyfish==1.1.0
224
+ jieba==0.42.1
225
+ Jinja2==3.1.4
226
+ jiter==0.8.2
227
+ joblib==1.4.2
228
+ json-tricks==3.17.3
229
+ jsonpatch==1.33
230
+ jsonpickle==4.0.1
231
+ jsonpointer==3.0.0
232
+ jsonschema==4.23.0
233
+ jsonschema-specifications==2024.10.1
234
+ jupyter-client==6.1.12
235
+ jupyter-console==6.1.0
236
+ jupyter-leaflet==0.19.2
237
+ jupyter-server==1.24.0
238
+ jupyter_core==5.7.2
239
+ jupyterlab_pygments==0.3.0
240
+ jupyterlab_widgets==3.0.13
241
+ kaggle==1.6.17
242
+ kagglehub==0.3.5
243
+ keras==3.5.0
244
+ keyring==23.5.0
245
+ kiwisolver==1.4.7
246
+ kornia==0.7.0
247
+ langchain==0.3.12
248
+ langchain-core==0.3.25
249
+ langchain-text-splitters==0.3.3
250
+ langcodes==3.5.0
251
+ langsmith==0.2.3
252
+ language_data==1.3.0
253
+ launchpadlib==1.10.16
254
+ lazr.restfulclient==0.14.4
255
+ lazr.uri==1.0.6
256
+ lazy_loader==0.4
257
+ libclang==18.1.1
258
+ libcudf-cu12 @ https://pypi.nvidia.com/libcudf-cu12/libcudf_cu12-24.10.1-py3-none-manylinux_2_28_x86_64.whl
259
+ librosa==0.10.2.post1
260
+ lightgbm==4.5.0
261
+ linkify-it-py==2.0.3
262
+ llvmlite==0.43.0
263
+ locket==1.0.0
264
+ logical-unification==0.4.6
265
+ lxml==5.3.0
266
+ marisa-trie==1.2.1
267
+ Markdown==3.7
268
+ markdown-it-py==3.0.0
269
+ MarkupSafe==3.0.2
270
+ matplotlib==3.8.0
271
+ matplotlib-inline==0.1.7
272
+ matplotlib-venn==1.1.1
273
+ mdit-py-plugins==0.4.2
274
+ mdurl==0.1.2
275
+ miniKanren==1.0.3
276
+ missingno==0.5.2
277
+ mistune==3.0.2
278
+ mizani==0.13.1
279
+ mkl==2025.0.1
280
+ ml-dtypes==0.4.1
281
+ mlxtend==0.23.3
282
+ more-itertools==10.5.0
283
+ moviepy==1.0.3
284
+ mpi4py==3.1.5
285
+ mpmath==1.3.0
286
+ msgpack==1.1.0
287
+ multidict==6.1.0
288
+ multipledispatch==1.0.0
289
+ multitasking==0.0.11
290
+ mup==1.0.0
291
+ murmurhash==1.0.11
292
+ music21==9.3.0
293
+ mypy-extensions==1.0.0
294
+ namex==0.0.8
295
+ narwhals==1.18.4
296
+ natsort==8.4.0
297
+ nbclassic==1.1.0
298
+ nbclient==0.10.1
299
+ nbconvert==7.16.4
300
+ nbformat==5.10.4
301
+ ndindex==1.9.2
302
+ nest-asyncio==1.6.0
303
+ networkx==3.4.2
304
+ nibabel==5.3.2
305
+ ninja==1.11.1.3
306
+ nltk==3.8.1
307
+ notebook==6.5.5
308
+ notebook_shim==0.2.4
309
+ numba==0.60.0
310
+ numexpr==2.10.2
311
+ numpy==1.26.4
312
+ nvidia-cublas-cu12==12.6.4.1
313
+ nvidia-cuda-cupti-cu12==12.6.80
314
+ nvidia-cuda-nvcc-cu12==12.6.85
315
+ nvidia-cuda-runtime-cu12==12.6.77
316
+ nvidia-cudnn-cu12==9.6.0.74
317
+ nvidia-cufft-cu12==11.3.0.4
318
+ nvidia-curand-cu12==10.3.7.77
319
+ nvidia-cusolver-cu12==11.7.1.2
320
+ nvidia-cusparse-cu12==12.5.4.2
321
+ nvidia-nccl-cu12==2.23.4
322
+ nvidia-nvjitlink-cu12==12.6.85
323
+ nvtx==0.2.10
324
+ nx-cugraph-cu12 @ https://pypi.nvidia.com/nx-cugraph-cu12/nx_cugraph_cu12-24.10.0-py3-none-any.whl
325
+ oauth2client==4.1.3
326
+ oauthlib==3.2.2
327
+ omegaconf==2.3.0
328
+ open_clip_torch==2.26.1
329
+ openai==1.57.4
330
+ opencv-contrib-python==4.10.0.84
331
+ opencv-python==4.8.1.78
332
+ opencv-python-headless==4.10.0.84
333
+ openpyxl==3.1.5
334
+ opentelemetry-api==1.29.0
335
+ opentelemetry-sdk==1.29.0
336
+ opentelemetry-semantic-conventions==0.50b0
337
+ opt_einsum==3.4.0
338
+ optax==0.2.4
339
+ optree==0.13.1
340
+ orbax-checkpoint==0.6.4
341
+ orjson==3.10.12
342
+ osqp==0.6.7.post3
343
+ packaging==24.2
344
+ pandas==2.0.3
345
+ pandas-datareader==0.10.0
346
+ pandas-gbq==0.25.0
347
+ pandas-stubs==2.2.2.240909
348
+ pandocfilters==1.5.1
349
+ panel==1.5.4
350
+ param==2.2.0
351
+ parso==0.8.4
352
+ parsy==2.1
353
+ partd==1.4.2
354
+ pathlib==1.0.1
355
+ pathspec==0.12.1
356
+ patsy==1.0.1
357
+ peewee==3.17.8
358
+ peft==0.14.0
359
+ pexpect==4.9.0
360
+ pickleshare==0.7.5
361
+ Pillow==9.4.0
362
+ platformdirs==4.3.6
363
+ plotly==5.24.1
364
+ plotnine==0.14.4
365
+ pluggy==1.5.0
366
+ ply==3.11
367
+ polars==1.9.0
368
+ pooch==1.8.2
369
+ portalocker==3.0.0
370
+ portpicker==1.5.2
371
+ preshed==3.0.9
372
+ prettytable==3.12.0
373
+ proglog==0.1.10
374
+ progressbar2==4.5.0
375
+ prometheus_client==0.21.1
376
+ promise==2.3
377
+ prompt_toolkit==3.0.48
378
+ propcache==0.2.1
379
+ prophet==1.1.6
380
+ proto-plus==1.25.0
381
+ protobuf==4.25.5
382
+ psutil==5.9.5
383
+ psycopg2==2.9.10
384
+ ptyprocess==0.7.0
385
+ py-cpuinfo==9.0.0
386
+ py4j==0.10.9.7
387
+ pyarrow==17.0.0
388
+ pyasn1==0.6.1
389
+ pyasn1_modules==0.4.1
390
+ pycocotools==2.0.7
391
+ pycparser==2.22
392
+ pydantic==1.10.19
393
+ pydantic_core==2.27.1
394
+ pydata-google-auth==1.9.0
395
+ pydot==3.0.3
396
+ pydotplus==2.0.2
397
+ PyDrive==1.3.1
398
+ PyDrive2==1.21.3
399
+ pyerfa==2.0.1.5
400
+ pygame==2.6.1
401
+ pygit2==1.16.0
402
+ Pygments==2.18.0
403
+ PyGObject==3.42.1
404
+ PyJWT==2.10.1
405
+ pylibcudf-cu12 @ https://pypi.nvidia.com/pylibcudf-cu12/pylibcudf_cu12-24.10.1-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl
406
+ pylibcugraph-cu12==24.10.0
407
+ pylibraft-cu12==24.10.0
408
+ pymc==5.19.1
409
+ pymystem3==0.2.0
410
+ pynvjitlink-cu12==0.4.0
411
+ pyogrio==0.10.0
412
+ Pyomo==6.8.2
413
+ PyOpenGL==3.1.7
414
+ pyOpenSSL==24.2.1
415
+ pyparsing==3.2.0
416
+ pyperclip==1.9.0
417
+ pyproj==3.7.0
418
+ pyshp==2.3.1
419
+ PySocks==1.7.1
420
+ pyspark==3.5.3
421
+ pytensor==2.26.4
422
+ pytest==8.3.4
423
+ python-apt==0.0.0
424
+ python-box==7.3.0
425
+ python-dateutil==2.8.2
426
+ python-louvain==0.16
427
+ python-slugify==8.0.4
428
+ python-utils==3.9.1
429
+ pytz==2024.2
430
+ pyviz_comms==3.0.3
431
+ PyWavelets==1.8.0
432
+ PyYAML==6.0.1
433
+ pyzmq==24.0.1
434
+ qdldl==0.1.7.post4
435
+ ratelim==0.1.6
436
+ referencing==0.35.1
437
+ regex==2023.10.3
438
+ requests==2.32.3
439
+ requests-oauthlib==1.3.1
440
+ requests-toolbelt==1.0.0
441
+ requirements-parser==0.9.0
442
+ rich==13.9.4
443
+ rmm-cu12==24.10.0
444
+ rpds-py==0.22.3
445
+ rpy2==3.4.2
446
+ rsa==4.9
447
+ safetensors==0.4.5
448
+ scikit-image==0.21.0
449
+ scikit-learn==1.3.1
450
+ scipy==1.13.1
451
+ scooby==0.10.0
452
+ scs==3.2.7
453
+ seaborn==0.13.2
454
+ SecretStorage==3.3.1
455
+ Send2Trash==1.8.3
456
+ sentence-transformers==3.3.1
457
+ sentencepiece==0.1.99
458
+ sentry-sdk==2.19.2
459
+ setproctitle==1.3.4
460
+ shap==0.46.0
461
+ shapely==2.0.6
462
+ shellingham==1.5.4
463
+ simple-parsing==0.1.6
464
+ six==1.17.0
465
+ sklearn-pandas==2.2.0
466
+ slicer==0.0.8
467
+ smart-open==7.1.0
468
+ smmap==5.0.1
469
+ sniffio==1.3.1
470
+ snowballstemmer==2.2.0
471
+ soundfile==0.12.1
472
+ soupsieve==2.6
473
+ soxr==0.5.0.post1
474
+ spacy==3.7.5
475
+ spacy-legacy==3.0.12
476
+ spacy-loggers==1.0.5
477
+ Sphinx==8.1.3
478
+ sphinxcontrib-applehelp==2.0.0
479
+ sphinxcontrib-devhelp==2.0.0
480
+ sphinxcontrib-htmlhelp==2.1.0
481
+ sphinxcontrib-jsmath==1.0.1
482
+ sphinxcontrib-qthelp==2.0.0
483
+ sphinxcontrib-serializinghtml==2.0.0
484
+ SQLAlchemy==2.0.36
485
+ sqlglot==25.1.0
486
+ sqlparse==0.5.3
487
+ srsly==2.5.0
488
+ stanio==0.5.1
489
+ statsmodels==0.14.4
490
+ StrEnum==0.4.15
491
+ stringzilla==3.11.1
492
+ sympy==1.13.1
493
+ tables==3.10.1
494
+ tabulate==0.9.0
495
+ tbb==2022.0.0
496
+ tcmlib==1.2.0
497
+ tenacity==9.0.0
498
+ tensorboard==2.17.1
499
+ tensorboard-data-server==0.7.2
500
+ tensorflow==2.17.1
501
+ tensorflow-datasets==4.9.7
502
+ tensorflow-hub==0.16.1
503
+ tensorflow-io-gcs-filesystem==0.37.1
504
+ tensorflow-metadata==1.13.1
505
+ tensorflow-probability==0.24.0
506
+ tensorstore==0.1.71
507
+ termcolor==2.5.0
508
+ terminado==0.18.1
509
+ text-unidecode==1.3
510
+ textblob==0.17.1
511
+ tf-slim==1.1.0
512
+ tf_keras==2.17.0
513
+ thinc==8.2.5
514
+ threadpoolctl==3.5.0
515
+ tifffile==2024.12.12
516
+ timm==0.4.12
517
+ tinycss2==1.4.0
518
+ tokenizers==0.14.1
519
+ toml==0.10.2
520
+ tomli==2.2.1
521
+ toolz==0.12.1
522
+ torch @ https://download.pytorch.org/whl/cu121_full/torch-2.5.1%2Bcu121-cp310-cp310-linux_x86_64.whl
523
+ torchaudio @ https://download.pytorch.org/whl/cu121/torchaudio-2.5.1%2Bcu121-cp310-cp310-linux_x86_64.whl
524
+ torchsummary==1.5.1
525
+ torchvision @ https://download.pytorch.org/whl/cu121/torchvision-0.20.1%2Bcu121-cp310-cp310-linux_x86_64.whl
526
+ tornado==6.3.3
527
+ tqdm==4.67.1
528
+ traitlets==5.7.1
529
+ traittypes==0.2.1
530
+ transformers==4.34.0
531
+ tweepy==4.14.0
532
+ typeguard==4.4.1
533
+ typer==0.15.1
534
+ types-pytz==2024.2.0.20241003
535
+ types-setuptools==75.6.0.20241126
536
+ typing_extensions==4.12.2
537
+ tzdata==2024.2
538
+ tzlocal==5.2
539
+ uc-micro-py==1.0.3
540
+ umf==0.9.1
541
+ uritemplate==4.1.1
542
+ urllib3==2.2.3
543
+ vega-datasets==0.9.0
544
+ vision-datasets==0.2.2
545
+ wadllib==1.3.6
546
+ wandb==0.19.1
547
+ wasabi==1.1.3
548
+ wcwidth==0.2.13
549
+ weasel==0.4.1
550
+ webcolors==24.11.1
551
+ webencodings==0.5.1
552
+ websocket-client==1.8.0
553
+ websockets==14.1
554
+ Werkzeug==3.1.3
555
+ widgetsnbextension==3.6.10
556
+ wordcloud==1.9.4
557
+ wrapt==1.17.0
558
+ xarray==2024.11.0
559
+ xarray-einstats==0.8.0
560
+ xgboost==2.1.3
561
+ xlrd==2.0.1
562
+ xyzservices==2024.9.0
563
+ yacs==0.1.8
564
+ yarl==1.18.3
565
+ yellowbrick==1.5
566
+ yfinance==0.2.50
567
+ zipp==3.21.0
colabs/requirements-colab.txt ADDED
@@ -0,0 +1,39 @@
+ pillow==9.4.0
+ opencv-python==4.8.1.78
+ pyyaml==6.0.1
+ json_tricks==3.17.3
+ yacs==0.1.8
+ scikit-learn==1.3.1
+ pandas==2.0.3
+ timm==0.4.12
+ numpy==1.26.4
+ einops==0.8.0
+ fvcore==0.1.5.post20221221
+ transformers==4.34.0
+ sentencepiece==0.1.99
+ ftfy==6.1.1
+ regex==2023.10.3
+ nltk==3.8.1
+ mpi4py==3.1.5
+ vision-datasets==0.2.2
+ cython==3.0.2
+ pycocotools==2.0.7
+ diffdist==0.1
+ #pyarrow==13.0.0
+ #cityscapesscripts==2.2.2
+ #shapely==1.8.0
+ scikit-image==0.21.0
+ mup==1.0.0
+ accelerate==0.23.0
+ kornia==0.7.0
+ deepspeed==0.10.3
+ #wandb==0.15.12
+ infinibatch==0.1.1
+ open-clip-torch==2.26.1
+ git+https://github.com/MaureenZOU/detectron2-xyz.git
+ #gradio==3.42.0
+ #torch==2.3.1 #2.0.1
+ #torchvision==0.15.2
+ #torchaudio==2.0.2
+ #torch==2.1.0
+ #torchvision==0.16.0
configs/biomedparse_inference.yaml ADDED
@@ -0,0 +1,204 @@
1
+ # Define Test/Trainer/Saving
2
+ PIPELINE: XDecoderPipeline
3
+ TRAINER: xdecoder
4
+ SAVE_DIR: "../../data/output/test"
5
+ base_path: "./"
6
+
7
+ # Resume Logistic
8
+ RESUME: false
9
+ WEIGHT: false
10
+ RESUME_FROM: ""
11
+ EVAL_AT_START: false
12
+
13
+ # Logging and Debug
14
+ WANDB: False
15
+ LOG_EVERY: 100
16
+ FIND_UNUSED_PARAMETERS: false
17
+
18
+ # Speed up training
19
+ FP16: false
20
+ PORT: "36873"
21
+
22
+ # misc
23
+ LOADER:
24
+ JOINT: False
25
+ KEY_DATASET: "coco"
26
+
27
+ STANDARD_TEXT_FOR_EVAL: False
28
+
29
+ ##################
30
+ # Task settings
31
+ ##################
32
+ VERBOSE: true
33
+ MODEL:
34
+ NAME: seem_model_demo
35
+ HEAD: xdecoder_head
36
+ DIM_PROJ: 512
37
+ TEXT:
38
+ ARCH: vlpencoder
39
+ NAME: transformer
40
+ TOKENIZER: clip
41
+ CONTEXT_LENGTH: 77 # 77
42
+ WIDTH: 512
43
+ HEADS: 8
44
+ LAYERS: 12 # 6
45
+ AUTOGRESSIVE: True
46
+ BACKBONE:
47
+ NAME: focal
48
+ PRETRAINED: ""
49
+ LOAD_PRETRAINED: false
50
+ FOCAL:
51
+ PRETRAIN_IMG_SIZE: 224
52
+ PATCH_SIZE: 4
53
+ EMBED_DIM: 192
54
+ DEPTHS: [2, 2, 18, 2]
55
+ FOCAL_LEVELS: [4, 4, 4, 4]
56
+ FOCAL_WINDOWS: [3, 3, 3, 3]
57
+ DROP_PATH_RATE: 0.3
58
+ MLP_RATIO: 4.0
59
+ DROP_RATE: 0.0
60
+ PATCH_NORM: True
61
+ USE_CONV_EMBED: True
62
+ SCALING_MODULATOR: True
63
+ USE_CHECKPOINT: False
64
+ USE_POSTLN: true
65
+ USE_POSTLN_IN_MODULATION: false
66
+ USE_LAYERSCALE: True
67
+ OUT_FEATURES: ["res2", "res3", "res4", "res5"]
68
+ OUT_INDICES: [0, 1, 2, 3]
69
+ ENCODER:
70
+ NAME: transformer_encoder_fpn
71
+ IGNORE_VALUE: 255
72
+ NUM_CLASSES: 16
73
+ BINARY_CLASSES: False
74
+ LOSS_WEIGHT: 1.0
75
+ CONVS_DIM: 512
76
+ MASK_DIM: 512
77
+ NORM: "GN"
78
+ IN_FEATURES: ["res2", "res3", "res4", "res5"]
79
+ DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
80
+ COMMON_STRIDE: 4
81
+ TRANSFORMER_ENC_LAYERS: 6
82
+ DECODER:
83
+ NAME: seem_demo
84
+ TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
85
+ MASK:
86
+ ENABLED: False
87
+ DETECTION: False
88
+ SPATIAL:
89
+ ENABLED: True
90
+ MAX_ITER: 1
91
+ GROUNDING:
92
+ ENABLED: True
93
+ MAX_LEN: 5
94
+ TEXT_WEIGHT: 2.0
95
+ CLASS_WEIGHT: 0.5
96
+ VISUAL:
97
+ ENABLED: False
98
+ AUDIO:
99
+ ENABLED: False
100
+ RETRIEVAL:
101
+ ENABLED: False
102
+ LVIS:
103
+ ENABLED: True
104
+ THRES: 0.7
105
+ OPENIMAGE:
106
+ ENABLED: False
107
+ NEGATIVE_SAMPLES: 5
108
+ GROUNDING:
109
+ ENABLED: False
110
+ MAX_LEN: 5
111
+ CAPTION:
112
+ ENABLED: False
113
+ PHRASE_PROB: 0.5
114
+ SIM_THRES: 0.95
115
+ DEEP_SUPERVISION: True
116
+ NO_OBJECT_WEIGHT: 0.1
117
+ GCLASS_WEIGHT: 0.4
118
+ GMASK_WEIGHT: 1.0
119
+ GDICE_WEIGHT: 1.0
120
+ SCLASS_WEIGHT: 0.4
121
+ SMASK_WEIGHT: 1.0
122
+ SDICE_WEIGHT: 1.0
123
+ OCLASS_WEIGHT: 0.4
124
+ OMASK_WEIGHT: 1.0
125
+ ODICE_WEIGHT: 1.0
126
+ CLASS_WEIGHT: 2.0
127
+ MASK_WEIGHT: 5.0
128
+ DICE_WEIGHT: 5.0
129
+ BBOX_WEIGHT: 5.0
130
+ GIOU_WEIGHT: 2.0
131
+ CAPTION_WEIGHT: 2.0
132
+ COST_SPATIAL:
133
+ CLASS_WEIGHT: 5.0
134
+ MASK_WEIGHT: 2.0
135
+ DICE_WEIGHT: 2.0
136
+ HIDDEN_DIM: 512
137
+ NUM_OBJECT_QUERIES: 101
138
+ NHEADS: 8
139
+ DROPOUT: 0.0
140
+ DIM_FEEDFORWARD: 2048
141
+ MAX_SPATIAL_LEN: [512, 512, 512, 512]
142
+ # ENC_LAYERS: 0
143
+ PRE_NORM: False
144
+ ENFORCE_INPUT_PROJ: False
145
+ SIZE_DIVISIBILITY: 32
146
+ TRAIN_NUM_POINTS: 12544
147
+ OVERSAMPLE_RATIO: 3.0
148
+ IMPORTANCE_SAMPLE_RATIO: 0.75
149
+ DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query
150
+ TOP_GROUNDING_LAYERS: 10
151
+ TOP_CAPTION_LAYERS: 10
152
+ TOP_SPATIAL_LAYERS: 10
153
+ TOP_OPENIMAGE_LAYERS: 10
154
+ TEST:
155
+ SEMANTIC_ON: True
156
+ INSTANCE_ON: True
157
+ PANOPTIC_ON: True
158
+ OVERLAP_THRESHOLD: 0.8
159
+ OBJECT_MASK_THRESHOLD: 0.4
160
+ SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE: false
161
+ DETECTIONS_PER_IMAGE: 100
162
+
163
+ # Multi-modal Architecture, order matters
164
+ ATTENTION_ARCH:
165
+ VARIABLE:
166
+ queries: ["object"]
167
+ tokens: ["grounding", "spatial", "visual", "audio"]
168
+ SELF_ATTENTION:
169
+ queries:
170
+ object:
171
+ [
172
+ "queries_object",
173
+ "tokens_grounding",
174
+ "tokens_spatial",
175
+ "tokens_visual",
176
+ "tokens_audio",
177
+ ]
178
+ tokens:
179
+ grounding: ["queries_object", "tokens_grounding"]
180
+ spatial: ["tokens_spatial"]
181
+ visual: ["tokens_visual"]
182
+ audio: ["queries_object", "tokens_audio"]
183
+ CROSS_ATTENTION:
184
+ queries:
185
+ object: True
186
+ tokens:
187
+ grounding: False
188
+ spatial: False
189
+ visual: False
190
+ audio: False
191
+ MASKING:
192
+ ["tokens_spatial", "tokens_grounding", "tokens_visual", "tokens_audio"]
193
+ DUPLICATION:
194
+ queries:
195
+ grounding: "queries_object"
196
+ spatial: "queries_object"
197
+ SPATIAL_MEMORIES: 32
198
+
199
+ INPUT:
200
+ PIXEL_MEAN: [123.675, 116.280, 103.530]
201
+ PIXEL_STD: [58.395, 57.120, 57.375]
202
+ # INPUT:
203
+ # PIXEL_MEAN: [64.284, 59.293, 59.962]
204
+ # PIXEL_STD: [62.484, 60.865, 59.835]
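The notebook loads this file through load_opt_from_config_files; for a quick look outside the repo, plain PyYAML works as well. A small sketch (keys follow the config above; the path assumes the repo root as working directory):

import yaml

with open("configs/biomedparse_inference.yaml") as f:
    cfg = yaml.safe_load(f)

# A few of the settings defined above.
print(cfg["PIPELINE"])        # XDecoderPipeline
print(cfg["SAVE_DIR"])        # ../../data/output/test
print(cfg["MODEL"]["NAME"])   # seem_model_demo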
entrypoint.sh ADDED
@@ -0,0 +1,5 @@
+ #!/bin/bash
+ if [ -f "/run/secrets/HF_TOKEN" ]; then
+     export HF_TOKEN=$(cat /run/secrets/HF_TOKEN)
+ fi
+ exec conda run --no-capture-output -n biomedparse python main.py
examples/Part_1_516_pathology_breast.png ADDED

Git LFS Details

  • SHA256: 473e76cd22df5b7d9da17ed49dc7139be1f6d62d4854c49236bd953b35b04c34
  • Pointer size: 131 Bytes
  • Size of remote file: 966 kB
inference_utils/inference.py ADDED
@@ -0,0 +1,149 @@
+ import torch
+ import numpy as np
+ import torch.nn.functional as F
+ from PIL import Image
+ from torchvision import transforms
+ #from utils.visualizer import Visualizer
+ # from detectron2.utils.colormap import random_color
+ # from detectron2.data import MetadataCatalog
+ # from detectron2.structures import BitMasks
+ from modeling.language.loss import vl_similarity
+ from utilities.constants import BIOMED_CLASSES
+ #from detectron2.data.datasets.builtin_meta import COCO_CATEGORIES
+
+ # import cv2
+ # import os
+ # import glob
+ # import subprocess
+ from PIL import Image
+ import random
+
+ t = []
+ t.append(transforms.Resize((1024, 1024), interpolation=Image.BICUBIC))
+ transform = transforms.Compose(t)
+ #metadata = MetadataCatalog.get('coco_2017_train_panoptic')
+ all_classes = ['background'] + [name.replace('-other','').replace('-merged','')
+                                 for name in BIOMED_CLASSES] + ["others"]
+ # colors_list = [(np.array(color['color'])/255).tolist() for color in COCO_CATEGORIES] + [[1, 1, 1]]
+
+ # use color list from matplotlib
+ import matplotlib.colors as mcolors
+ colors = dict(mcolors.TABLEAU_COLORS, **mcolors.BASE_COLORS)
+ colors_list = [list(colors.values())[i] for i in range(16)]
+
+ from .output_processing import mask_stats, combine_masks
+
+
+ @torch.no_grad()
+ def interactive_infer_image(model, image, prompts):
+
+     image_resize = transform(image)
+     width = image.size[0]
+     height = image.size[1]
+     image_resize = np.asarray(image_resize)
+     image = torch.from_numpy(image_resize.copy()).permute(2,0,1).cuda()
+
+     data = {"image": image, 'text': prompts, "height": height, "width": width}
+
+     # initialize task switches
+     model.model.task_switch['spatial'] = False
+     model.model.task_switch['visual'] = False
+     model.model.task_switch['grounding'] = True
+     model.model.task_switch['audio'] = False
+     model.model.task_switch['grounding'] = True
+
+
+     batch_inputs = [data]
+     results,image_size,extra = model.model.evaluate_demo(batch_inputs)
+
+     pred_masks = results['pred_masks'][0]
+     v_emb = results['pred_captions'][0]
+     t_emb = extra['grounding_class']
+
+     t_emb = t_emb / (t_emb.norm(dim=-1, keepdim=True) + 1e-7)
+     v_emb = v_emb / (v_emb.norm(dim=-1, keepdim=True) + 1e-7)
+
+     temperature = model.model.sem_seg_head.predictor.lang_encoder.logit_scale
+     out_prob = vl_similarity(v_emb, t_emb, temperature=temperature)
+
+     matched_id = out_prob.max(0)[1]
+     pred_masks_pos = pred_masks[matched_id,:,:]
+     pred_class = results['pred_logits'][0][matched_id].max(dim=-1)[1]
+
+     # interpolate mask to original size
+     pred_mask_prob = F.interpolate(pred_masks_pos[None,], (data['height'], data['width']),
+                                    mode='bilinear')[0,:,:data['height'],:data['width']].sigmoid().cpu().numpy()
+     pred_masks_pos = (1*(pred_mask_prob > 0.5)).astype(np.uint8)
+
+     return pred_mask_prob
+
+
+
+ # def interactive_infer_panoptic_biomedseg(model, image, tasks, reftxt=None):
+ #     image_ori = transform(image)
+ #     #mask_ori = image['mask']
+ #     width = image_ori.size[0]
+ #     height = image_ori.size[1]
+ #     image_ori = np.asarray(image_ori)
+ #     visual = Visualizer(image_ori, metadata=metadata)
+ #     images = torch.from_numpy(image_ori.copy()).permute(2,0,1).cuda()
+
+ #     data = {"image": images, "height": height, "width": width}
+ #     if len(tasks) == 0:
+ #         tasks = ["Panoptic"]
+
+ #     # initialize task switches
+ #     model.model.task_switch['spatial'] = False
+ #     model.model.task_switch['visual'] = False
+ #     model.model.task_switch['grounding'] = False
+ #     model.model.task_switch['audio'] = False
+
+ #     # check if reftxt is a list of strings
+ #     assert isinstance(reftxt, list), f"reftxt should be a list of strings, but got {type(reftxt)}"
+ #     model.model.task_switch['grounding'] = True
+ #     predicts = {}
+ #     for i, txt in enumerate(reftxt):
+ #         data['text'] = txt
+ #         batch_inputs = [data]
+
+ #         results,image_size,extra = model.model.evaluate_demo(batch_inputs)
+
+ #         pred_masks = results['pred_masks'][0]
+ #         v_emb = results['pred_captions'][0]
+ #         t_emb = extra['grounding_class']
+
+ #         t_emb = t_emb / (t_emb.norm(dim=-1, keepdim=True) + 1e-7)
+ #         v_emb = v_emb / (v_emb.norm(dim=-1, keepdim=True) + 1e-7)
+
+ #         temperature = model.model.sem_seg_head.predictor.lang_encoder.logit_scale
+ #         out_prob = vl_similarity(v_emb, t_emb, temperature=temperature)
+
+ #         matched_id = out_prob.max(0)[1]
+ #         pred_masks_pos = pred_masks[matched_id,:,:]
+ #         pred_class = results['pred_logits'][0][matched_id].max(dim=-1)[1]
+
+
+ #         # interpolate mask to original size
+ #         #pred_masks_pos = (F.interpolate(pred_masks_pos[None,], image_size[-2:], mode='bilinear')[0,:,:data['height'],:data['width']] > 0.0).float().cpu().numpy()
+ #         # masks.append(pred_masks_pos[0])
+ #         # mask = pred_masks_pos[0]
+ #         # masks.append(mask)
+ #         # interpolate mask to original size
+ #         pred_mask_prob = F.interpolate(pred_masks_pos[None,], image_size[-2:], mode='bilinear')[0,:,:data['height'],:data['width']].sigmoid().cpu().numpy()
+ #         #pred_masks_pos = 1*(pred_mask_prob > 0.5)
+ #         predicts[txt] = pred_mask_prob[0]
+
+ #     masks = combine_masks(predicts)
+
+ #     predict_mask_stats = {}
+ #     print(masks.keys())
+ #     for i, txt in enumerate(masks):
+ #         mask = masks[txt]
+ #         demo = visual.draw_binary_mask(mask, color=colors_list[i], text=txt)
+ #         predict_mask_stats[txt] = mask_stats((predicts[txt]*255), image_ori)
+
+ #     res = demo.get_image()
+ #     torch.cuda.empty_cache()
+ #     # return Image.fromarray(res), stroke_inimg, stroke_refimg
+ #     return Image.fromarray(res), None, predict_mask_stats
+
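The heart of interactive_infer_image is the text-to-mask matching: every predicted mask embedding is scored against each text prompt embedding, and the best-scoring proposal per prompt is kept. A minimal standalone sketch of that step (plain PyTorch with toy tensors standing in for v_emb and t_emb; the temperature scaling applied by vl_similarity is omitted):

import torch

# Toy stand-ins: 101 mask-proposal embeddings and 2 prompt embeddings, both 512-d.
v_emb = torch.randn(101, 512)   # mask/caption embeddings (results['pred_captions'])
t_emb = torch.randn(2, 512)     # text prompt embeddings (extra['grounding_class'])

# L2-normalize, exactly as in interactive_infer_image.
v_emb = v_emb / (v_emb.norm(dim=-1, keepdim=True) + 1e-7)
t_emb = t_emb / (t_emb.norm(dim=-1, keepdim=True) + 1e-7)

# Cosine-similarity logits between every proposal and every prompt,
# then pick the best proposal per prompt (what vl_similarity + max(0) do above).
logits = v_emb @ t_emb.t()      # shape (101, 2)
matched_id = logits.max(0)[1]   # one proposal index per prompt
print(matched_id.shape)         # torch.Size([2])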
inference_utils/output_processing.py ADDED
@@ -0,0 +1,91 @@
+ import json
+ from scipy import stats
+ import numpy as np
+
+ import huggingface_hub
+
+
+ def check_mask_stats(img, mask, modality_type, target):
+     # img: np.array, shape=(H, W, 3), RGB image with pixel values in [0, 255]
+     # mask: np.array, shape=(H, W, 1), mask probability in [0, 1] scaled to pixel values in [0, 255]
+     # modality_type: str, see target_dist.json for the list of modality types
+     # target: str, see target_dist.json for the list of targets
+
+     huggingface_hub.hf_hub_download('microsoft/BiomedParse', filename='target_dist.json', local_dir='./inference_utils')
+     huggingface_hub.hf_hub_download('microsoft/BiomedParse', filename="config.yaml", local_dir="./configs")
+     target_dist = json.load(open("inference_utils/target_dist.json"))
+
+     if modality_type not in target_dist:
+         raise ValueError(f"Currently supported modality types: {list(target_dist.keys())}")
+
+     if target not in target_dist[modality_type]:
+         raise ValueError(f"Currently supported targets for {modality_type}: {list(target_dist[modality_type].keys())}")
+
+     ms = mask_stats(mask, img)
+
+     ps = [stats.ks_1samp([ms[i]], stats.beta(param[0], param[1]).cdf).pvalue for i, param in enumerate(target_dist[modality_type][target])]
+     p_value = np.prod(ps)
+
+     adj_p_value = p_value**0.24  # adjustment for the product of four tests
+
+     return adj_p_value
+
+
+
+ def mask_stats(mask, img):
+     # mask is a prediction mask with pixel values in [0, 255] representing probabilities in [0, 1]
+     # img is an RGB image with pixel values in [0, 255]
+     if mask.max() <= 127:
+         return [0, 0, 0, 0]
+     return [mask[mask>=128].mean()/256, img[:,:,0][mask>=128].mean()/256,
+             img[:,:,1][mask>=128].mean()/256, img[:,:,2][mask>=128].mean()/256]
+
+
+
+ def combine_masks(predicts):
+     # predicts: a dictionary of pixel probabilities, {TARGET: pred_prob}
+     pixel_preds = {}
+     target_area = {}
+     target_probs = {}
+     for target in predicts:
+         pred = predicts[target]
+         pred_region = np.where(pred > 0.1)
+         target_area[target] = 0
+         target_probs[target] = 0
+         for (i,j) in zip(*pred_region):
+             if (i,j) not in pixel_preds:
+                 pixel_preds[(i,j)] = {}
+             pixel_preds[(i,j)][target] = pred[i,j]
+             target_area[target] += 1
+             target_probs[target] += pred[i,j]
+     for target in predicts:
+         if target_area[target] == 0:
+             continue
+         target_probs[target] /= target_area[target]
+
+     # generate combined masks
+     combined_areas = {t: 0 for t in predicts}
+     for index in pixel_preds:
+         pred_target = sorted(pixel_preds[index].keys(), key=lambda t: pixel_preds[index][t], reverse=True)[0]
+         combined_areas[pred_target] += 1
+
+     # discard targets with small areas
+     discard_targets = []
+     for target in predicts:
+         if combined_areas[target] < 0.6 * target_area[target]:
+             discard_targets.append(target)
+
+     # keep the most confident target
+     most_confident_target = sorted(predicts.keys(), key=lambda t: target_probs[t], reverse=True)[0]
+
+     discard_targets = [t for t in discard_targets if t != most_confident_target]
+
+     masks = {t: np.zeros_like(predicts[t]).astype(np.uint8) for t in predicts if t not in discard_targets}
+     for index in pixel_preds:
+         candidates = [t for t in pixel_preds[index] if t not in discard_targets and pixel_preds[index][t] > 0.5]
+         if len(candidates) == 0:
+             continue
+         pred_target = max(candidates, key=lambda t: pixel_preds[index][t])
+         masks[pred_target][index[0], index[1]] = 1
+
+     return masks
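For illustration, a hedged usage sketch of combine_masks with two synthetic, overlapping probability maps (the shapes and prompt names are made up; the import path assumes the repo root as working directory):

import numpy as np
from inference_utils.output_processing import combine_masks

# Two toy probability maps for overlapping prompts on a 64x64 image.
prob_a = np.zeros((64, 64)); prob_a[10:40, 10:40] = 0.9
prob_b = np.zeros((64, 64)); prob_b[30:60, 30:60] = 0.7

# Overlapping pixels go to the higher-probability target; targets that lose
# more than 40% of their area in this competition are discarded.
masks = combine_masks({"neoplastic cells": prob_a, "inflammatory cells": prob_b})
for name, m in masks.items():
    print(name, int(m.sum()))   # pixel count kept for each surviving target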
inference_utils/processing_utils.py ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from skimage import transform
3
+ import pydicom
4
+ from io import BytesIO
5
+ from PIL import Image
6
+ import nibabel as nib
7
+ import SimpleITK as sitk
8
+ from skimage import measure
9
+
10
+
11
+ """
12
+ This script contains utility functions for reading and processing different imaging modalities.
13
+ """
14
+
15
+
16
+ CT_WINDOWS = {'abdomen': [-150, 250],
17
+ 'lung': [-1000, 1000],
18
+ 'pelvis': [-55, 200],
19
+ 'liver': [-25, 230],
20
+ 'colon': [-68, 187],
21
+ 'pancreas': [-100, 200]}
22
+
23
+ def process_intensity_image(image_data, is_CT, site=None):
24
+ # process intensity-based image. If CT, apply site specific windowing
25
+
26
+ # image_data: 2D numpy array of shape (H, W)
27
+
28
+ # return: 3-channel numpy array of shape (H, W, 3) as model input
29
+
30
+ if is_CT:
31
+ # process image with windowing
32
+ if site and site in CT_WINDOWS:
33
+ window = CT_WINDOWS[site]
34
+ else:
35
+ raise ValueError(f'Please choose CT site from {CT_WINDOWS.keys()}')
36
+ lower_bound, upper_bound = window
37
+ else:
38
+ # process image with intensity range 0.5-99.5 percentile
39
+ lower_bound, upper_bound = np.percentile(
40
+ image_data[image_data > 0], 0.5
41
+ ), np.percentile(image_data[image_data > 0], 99.5)
42
+
43
+ image_data_pre = np.clip(image_data, lower_bound, upper_bound)
44
+ image_data_pre = (
45
+ (image_data_pre - image_data_pre.min())
46
+ / (image_data_pre.max() - image_data_pre.min())
47
+ * 255.0
48
+ )
49
+
50
+ # pad to square with equal padding on both sides
51
+ shape = image_data_pre.shape
52
+ if shape[0] > shape[1]:
53
+ pad = (shape[0]-shape[1])//2
54
+ pad_width = ((0,0), (pad, pad))
55
+ elif shape[0] < shape[1]:
56
+ pad = (shape[1]-shape[0])//2
57
+ pad_width = ((pad, pad), (0,0))
58
+ else:
59
+ pad_width = None
60
+
61
+ if pad_width is not None:
62
+ image_data_pre = np.pad(image_data_pre, pad_width, 'constant', constant_values=0)
63
+
64
+ # resize image to 1024x1024
65
+ image_size = 1024
66
+ resize_image = transform.resize(image_data_pre, (image_size, image_size), order=3,
67
+ mode='constant', preserve_range=True, anti_aliasing=True)
68
+
69
+ # convert to 3-channel image
70
+ resize_image = np.stack([resize_image]*3, axis=-1)
71
+
72
+ return resize_image.astype(np.uint8)
73
+
74
+
75
+
76
+ def read_dicom(image_path, is_CT, site=None):
77
+ # read dicom file and return pixel data
78
+
79
+ # dicom_file: str, path to dicom file
80
+ # is_CT: bool, whether image is CT or not
81
+ # site: str, one of CT_WINDOWS.keys()
82
+ # return: 2D numpy array of shape (H, W)
83
+
84
+ ds = pydicom.dcmread(image_path)
85
+ image_array = ds.pixel_array * ds.RescaleSlope + ds.RescaleIntercept
86
+
87
+ image_array = process_intensity_image(image_array, is_CT, site)
88
+
89
+ return image_array
90
+
91
+
92
+ def read_nifti(image_path, is_CT, slice_idx, site=None, HW_index=(0, 1), channel_idx=None):
93
+ # read nifti file and return pixel data
94
+
95
+ # image_path: str, path to nifti file
96
+ # is_CT: bool, whether image is CT or not
97
+ # slice_idx: int, slice index to read
98
+ # site: str, one of CT_WINDOWS.keys()
99
+ # HW_index: tuple, index of height and width in the image shape
100
+ # return: 2D numpy array of shape (H, W)
101
+
102
+
103
+ nii = nib.load(image_path)
104
+ image_array = nii.get_fdata()
105
+
106
+ if HW_index != (0, 1):
107
+ image_array = np.moveaxis(image_array, HW_index, (0, 1))
108
+
109
+ # get slice
110
+ if channel_idx is None:
111
+ image_array = image_array[:, :, slice_idx]
112
+ else:
113
+ image_array = image_array[:, :, slice_idx, channel_idx]
114
+
115
+ image_array = process_intensity_image(image_array, is_CT, site)
116
+ return image_array
117
+
118
+
119
+
120
+ def read_rgb(image_path):
121
+ # read RGB image and return resized pixel data
122
+
123
+ # image_path: str, path to RGB image
124
+ # return: BytesIO buffer
125
+
126
+ # read image into numpy array
127
+ image = Image.open(image_path)
128
+ image = np.array(image)
129
+ if len(image.shape) == 2:
130
+ image = np.stack([image]*3, axis=-1)
131
+ elif image.shape[2] == 4:
132
+ image = image[:,:,:3]
133
+
134
+ # pad to square with equal padding on both sides
135
+ shape = image.shape
136
+ if shape[0] > shape[1]:
137
+ pad = (shape[0]-shape[1])//2
138
+ pad_width = ((0,0), (pad, pad), (0,0))
139
+ elif shape[0] < shape[1]:
140
+ pad = (shape[1]-shape[0])//2
141
+ pad_width = ((pad, pad), (0,0), (0,0))
142
+ else:
143
+ pad_width = None
144
+
145
+ if pad_width is not None:
146
+ image = np.pad(image, pad_width, 'constant', constant_values=0)
147
+
148
+ # resize image to 1024x1024 for each channel
149
+ image_size = 1024
150
+ resize_image = np.zeros((image_size, image_size, 3), dtype=np.uint8)
151
+ for i in range(3):
152
+ resize_image[:,:,i] = transform.resize(image[:,:,i], (image_size, image_size), order=3,
153
+ mode='constant', preserve_range=True, anti_aliasing=True)
154
+
155
+ return resize_image
156
+
157
+
158
+
159
+ def get_instances(mask):
160
+ # get intances from binary mask
161
+ seg = sitk.GetImageFromArray(mask)
162
+ filled = sitk.BinaryFillhole(seg)
163
+ d = sitk.SignedMaurerDistanceMap(filled, insideIsPositive=False, squaredDistance=False, useImageSpacing=False)
164
+
165
+ ws = sitk.MorphologicalWatershed( d, markWatershedLine=False, level=1)
166
+ ws = sitk.Mask( ws, sitk.Cast(seg, ws.GetPixelID()))
167
+ ins_mask = sitk.GetArrayFromImage(ws)
168
+
169
+ # filter out instances with small area outliers
170
+ props = measure.regionprops_table(ins_mask, properties=('label', 'area'))
171
+ mean_area = np.mean(props['area'])
172
+ std_area = np.std(props['area'])
173
+
174
+ threshold = mean_area - 2*std_area - 1
175
+ ins_mask_filtered = ins_mask.copy()
176
+ for i, area in zip(props['label'], props['area']):
177
+ if area < threshold:
178
+ ins_mask_filtered[ins_mask == i] = 0
179
+
180
+ return ins_mask_filtered
181
+
182
+
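A minimal usage sketch for the readers defined above (the file paths are hypothetical, and 'abdomen' is assumed to be one of the CT_WINDOWS keys defined earlier in this file):

import numpy as np
from inference_utils.processing_utils import read_dicom, read_nifti, read_rgb, get_instances

# CT DICOM slice: rescaled, windowed for the chosen site, resized to 1024x1024x3
ct_slice = read_dicom("example_ct_slice.dcm", is_CT=True, site="abdomen")

# slice 40 of a non-CT NIfTI volume
mr_slice = read_nifti("example_mr_volume.nii.gz", is_CT=False, slice_idx=40)

# RGB image (e.g. pathology): padded to a square, then resized to 1024x1024x3
rgb_image = read_rgb("examples/Part_1_516_pathology_breast.png")

# split a binary segmentation mask into instances via watershed
pred_mask = (rgb_image[:, :, 0] > 128).astype(np.uint8)  # placeholder binary mask
instance_mask = get_instances(pred_mask)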
inference_utils/target_dist.json ADDED
@@ -0,0 +1 @@
1
+ {"CT-Abdomen": {"postcava": [[244.8001455798728, 5.314270814858824], [7.183679633251858, 5.168810995426391], [7.183679633251858, 5.168810995426391], [7.183679633251858, 5.168810995426391]], "aorta": [[570.5260544851909, 8.97527503179567], [3.3715049586348242, 1.4971164544774238], [3.3715049586348242, 1.4971164544774238], [3.3715049586348242, 1.4971164544774238]], "right kidney": [[831.8568013426873, 14.991866448573818], [4.970270375121704, 3.050385928796316], [4.970270375121704, 3.050385928796316], [4.970270375121704, 3.050385928796316]], "kidney": [[824.7288483151449, 17.740666994112335], [5.134294543833492, 3.188304874790919], [5.134294543833492, 3.188304874790919], [5.134294543833492, 3.188304874790919]], "left kidney": [[765.9269280548916, 14.314482540419498], [5.084499568327313, 3.2061871556243515], [5.084499568327313, 3.2061871556243515], [5.084499568327313, 3.2061871556243515]], "duodenum": [[121.5002253116006, 5.0616837393558045], [13.60882943690214, 15.313999640884173], [13.60882943690214, 15.313999640884173], [13.60882943690214, 15.313999640884173]], "pancreas": [[182.85416969377923, 6.9039775525067135], [17.489564177159146, 14.924761571311656], [17.489564177159146, 14.924761571311656], [17.489564177159146, 14.924761571311656]], "liver (non abdomen window)": [[481.5690096331249, 8.413924027868077], [6.047563882283547, 6.86712354789198], [6.047563882283547, 6.86712354789198], [6.047563882283547, 6.86712354789198]], "liver": [[497.88613290346797, 8.79208581405346], [20.552757782824486, 16.312687320589742], [20.552757782824486, 16.312687320589742], [20.552757782824486, 16.312687320589742]], "spleen": [[496.77984794364835, 8.498216025126785], [14.594250163059534, 10.71357260923987], [14.594250163059534, 10.71357260923987], [14.594250163059534, 10.71357260923987]], "stomach": [[137.7555592980079, 3.928159238756134], [5.978844398494112, 10.238758157160921], [5.978844398494112, 10.238758157160921], [5.978844398494112, 10.238758157160921]], "gallbladder": [[109.56988864543307, 3.4765854683723596], [32.35084093358493, 41.113482214152384], [32.35084093358493, 41.113482214152384], [32.35084093358493, 41.113482214152384]], "left adrenal gland": [[121.60075395406241, 4.266683492995461], [17.017417548383662, 18.48528509828753], [17.017417548383662, 18.48528509828753], [17.017417548383662, 18.48528509828753]], "adrenal gland": [[182.4265613513338, 7.813186080282246], [18.97442893128976, 20.599617257380345], [18.97442893128976, 20.599617257380345], [18.97442893128976, 20.599617257380345]], "right adrenal gland": [[158.21570288963346, 5.736947411814261], [17.17089273745977, 19.09450167978653], [17.17089273745977, 19.09450167978653], [17.17089273745977, 19.09450167978653]], "bladder": [[172.667607742299, 4.6885066612866835], [42.56984081338662, 56.45115036285909], [42.56984081338662, 56.45115036285909], [42.56984081338662, 56.45115036285909]], "esophagus": [[253.86092392814248, 6.886078359154348], [13.252110919965341, 15.437200766467301], [13.252110919965341, 15.437200766467301], [13.252110919965341, 15.437200766467301]]}, "CT-Chest": {"nodule": [[115.14726334918862, 3.0043952160348844], [5.275338876748403, 7.899248653413393], [5.275338876748403, 7.899248653413393], [5.275338876748403, 7.899248653413393]], "COVID-19 infection": [[226.93782607812352, 10.662200522447263], [11.74323002038987, 23.773784082857407], [11.74323002038987, 23.773784082857407], [11.74323002038987, 23.773784082857407]], "tumor": [[81.39154648592063, 3.0363381821985254], [9.799683628807484, 19.248706134279548], 
[9.799683628807484, 19.248706134279548], [9.799683628807484, 19.248706134279548]]}, "MRI-Abdomen": {"aorta": [[840.9822169946456, 13.699556855062456], [2.9798604461548766, 1.19765659474954], [2.9798604461548766, 1.19765659474954], [2.9798604461548766, 1.19765659474954]], "postcava": [[151.3891903352374, 4.700455115571472], [3.065810750535689, 2.074722812609995], [3.065810750535689, 2.074722812609995], [3.065810750535689, 2.074722812609995]], "right kidney": [[613.4017011464975, 11.282616103318485], [4.63815461741129, 2.2967740371944867], [4.63815461741129, 2.2967740371944867], [4.63815461741129, 2.2967740371944867]], "duodenum": [[88.51851857758399, 5.251374959142798], [9.350910364523573, 8.85976960554745], [9.350910364523573, 8.85976960554745], [9.350910364523573, 8.85976960554745]], "kidney": [[831.5762248415444, 18.739059302777875], [5.715871882386201, 2.6205541393599527], [5.715871882386201, 2.6205541393599527], [5.715871882386201, 2.6205541393599527]], "left kidney": [[255.4744196400276, 5.573793361388763], [6.081920320421431, 2.930383603114708], [6.081920320421431, 2.930383603114708], [6.081920320421431, 2.930383603114708]], "liver": [[491.1931789168259, 9.294627086787225], [10.138029098677139, 6.28829088692463], [10.138029098677139, 6.28829088692463], [10.138029098677139, 6.28829088692463]], "pancreas": [[136.2304629992425, 5.676744286342953], [19.631392824605342, 11.528214201070567], [19.631392824605342, 11.528214201070567], [19.631392824605342, 11.528214201070567]], "gallbladder": [[75.18767252055355, 2.8711737605829892], [14.500831537679415, 20.696868858705496], [14.500831537679415, 20.696868858705496], [14.500831537679415, 20.696868858705496]], "stomach": [[89.16380420023327, 4.461224829090838], [10.266772743753412, 16.943404348738376], [10.266772743753412, 16.943404348738376], [10.266772743753412, 16.943404348738376]], "spleen": [[413.92566589639046, 7.99961594912814], [7.267087388529462, 5.149714876028216], [7.267087388529462, 5.149714876028216], [7.267087388529462, 5.149714876028216]], "left adrenal gland": [[86.44109991236728, 4.826813402237061], [17.153928230900817, 14.858036650050408], [17.153928230900817, 14.858036650050408], [17.153928230900817, 14.858036650050408]], "adrenal gland": [[303.9642820935704, 16.729857009916806], [19.500678047021523, 17.02588768312544], [19.500678047021523, 17.02588768312544], [19.500678047021523, 17.02588768312544]], "right adrenal gland": [[172.36803145644578, 8.050377438528958], [15.257519917725558, 13.431078702905772], [15.257519917725558, 13.431078702905772], [15.257519917725558, 13.431078702905772]], "esophagus": [[193.1348898340059, 7.6397334220243325], [12.240331385391299, 16.812971132953354], [12.240331385391299, 16.812971132953354], [12.240331385391299, 16.812971132953354]]}, "MRI-Cardiac": {"left heart ventricle": [[964.9072936969454, 17.21177762137991], [5.880290818671821, 4.100959742819713], [5.880290818671821, 4.100959742819713], [5.880290818671821, 4.100959742819713]], "myocardium": [[448.3393673888417, 17.591805257426998], [5.208511169313307, 15.910705163394415], [5.208511169313307, 15.910705163394415], [5.208511169313307, 15.910705163394415]], "right heart ventricle": [[359.88937669636215, 9.392153523781843], [5.924076424141962, 5.554667293878979], [5.924076424141962, 5.554667293878979], [5.924076424141962, 5.554667293878979]]}, "MRI-FLAIR-Brain": {"edema": [[69.4159007224176, 5.568921766085619], [13.400334168570177, 4.965265405638592], [13.400334168570177, 4.965265405638592], [13.400334168570177, 4.965265405638592]], "tumor 
core": [[154.26935124167449, 8.089254912853598], [14.908340542645478, 4.820086393609397], [14.908340542645478, 4.820086393609397], [14.908340542645478, 4.820086393609397]], "whole tumor": [[485.48717118600956, 16.01178236475156], [25.74323915508559, 8.636438181178145], [25.74323915508559, 8.636438181178145], [25.74323915508559, 8.636438181178145]]}, "MRI-T1-Gd-Brain": {"enhancing tumor": [[175.6437881777937, 7.539344668413025], [17.864705093992068, 5.36432831714689], [17.864705093992068, 5.36432831714689], [17.864705093992068, 5.36432831714689]], "non-enhancing tumor": [[37.6625733247702, 3.8454536110058246], [6.568014639412233, 8.446289690167484], [6.568014639412233, 8.446289690167484], [6.568014639412233, 8.446289690167484]], "tumor core": [[180.88223552813486, 6.610443841067055], [9.70294999498087, 5.30262880784197], [9.70294999498087, 5.30262880784197], [9.70294999498087, 5.30262880784197]]}, "Pathology": {"connective tissue cells": [[46.71165884847293, 4.997126203483956], [9.942495884846476, 15.700775443760845], [4.328453739888501, 18.42621798468577], [9.798096322131162, 11.920352021312304]], "inflammatory cells": [[39.600337990197595, 3.1848025413959706], [6.287418328538852, 20.538379638162322], [2.9521703595392146, 25.264465092284006], [6.559595490616054, 12.004686961917436]], "neoplastic cells": [[82.29374052289526, 8.22429924322936], [9.592296798563375, 14.818916788142138], [4.948629785308088, 19.78516221506478], [10.729094314024243, 12.934345198477494]], "epithelial cells": [[91.75183574899573, 9.577544361042948], [13.469843493323452, 27.305962287612964], [4.696928248406198, 25.254143364646463], [11.077634907582583, 13.487595094752443]]}, "X-Ray-Chest": {"left lung": [[529.1669758355144, 7.465035502868491], [8.220284641505614, 11.62958600654364], [8.220284641505614, 11.62958600654364], [8.220284641505614, 11.62958600654364]], "lung": [[465.7809501354513, 7.147122106450173], [8.781306299078446, 12.335455073688102], [8.781306299078446, 12.335455073688102], [8.781306299078446, 12.335455073688102]], "right lung": [[567.6127039725319, 7.532428563004494], [8.067311420424144, 11.229763331648746], [8.067311420424144, 11.229763331648746], [8.067311420424144, 11.229763331648746]]}, "Ultrasound-Cardiac": {"left heart atrium": [[1188.687550702627, 24.234766943758856], [5.18832820435626, 13.705576921752291], [5.18832820435626, 13.705576921752291], [5.18832820435626, 13.705576921752291]], "left heart ventricle": [[2787.334986695437, 58.297232816307506], [15.28158405889985, 56.95469460140377], [15.28158405889985, 56.95469460140377], [15.28158405889985, 56.95469460140377]]}, "Endoscopy": {"neoplastic polyp": [[392.89875472390315, 5.4678888279040745], [7.477729277754545, 1.6522601344780465], [7.2704247484339035, 6.347521355120636], [4.3902399436060335, 6.543658310376327]], "polyp": [[163.7838288028474, 3.4851615302599117], [7.03659746479883, 1.9088902542177986], [6.992807172875011, 6.756628353721484], [5.185761648208865, 8.977427344868255]], "non-neoplastic polyp": [[214.9199548332033, 4.360826895414348], [7.303363948417486, 1.9789835935004905], [10.54652900087687, 9.009706115553772], [6.917879576439251, 10.404634951284532]]}, "Fundus": {"optic cup": [[1482.9561484784422, 35.78105120937013], [52.1031548324398, 1.5080077510381715], [10.023538467761934, 3.1641925551155046], [3.394564722036805, 2.4391933423559626]], "optic disc": [[626.9141229495486, 20.95002931507066], [18.278454005466408, 1.8261365514325893], [16.42282430959315, 11.171338052048034], [4.8937792939550135, 6.987302868644637]]}, 
"Dermoscopy": {"lesion": [[134.43456931870887, 4.743684855379663], [5.18053578956456, 2.3527492367343634], [3.809383004477107, 6.368793378843402], [2.3888068456218847, 6.655396307215968]], "melanoma": [[454.17848530764076, 9.6466178116726], [4.022144360826467, 7.870140640677671], [4.87109613458874, 18.93721534855073], [3.107895746664011, 13.604075970992069]]}, "OCT": {"edema": [[260.11475018501574, 7.379315940573871], [4.162158474003, 17.437425953761988], [12.65808078622105, 81.37165793634547], [1.763378481483125, 4.427309203795247]]}}
main.py ADDED
@@ -0,0 +1,106 @@
1
+ import os
2
+ import gradio as gr
3
+ import torch
4
+ from PIL import Image
5
+ import numpy as np
6
+ import matplotlib.pyplot as plt
7
+ from huggingface_hub import hf_hub_download
8
+ from modeling.BaseModel import BaseModel
9
+ from modeling import build_model
10
+ from utilities.distributed import init_distributed
11
+ from utilities.arguments import load_opt_from_config_files
12
+ from utilities.constants import BIOMED_CLASSES
13
+ from inference_utils.inference import interactive_infer_image
14
+
15
+
16
+ def overlay_masks(image, masks, colors):
17
+ overlay = image.copy()
18
+ overlay = np.array(overlay, dtype=np.uint8)
19
+ for mask, color in zip(masks, colors):
20
+ overlay[mask > 0] = (overlay[mask > 0] * 0.4 + np.array(color) * 0.6).astype(
21
+ np.uint8
22
+ )
23
+ return Image.fromarray(overlay)
24
+
25
+
26
+ def generate_colors(n):
27
+ cmap = plt.get_cmap("tab10")
28
+ colors = [tuple(int(255 * val) for val in cmap(i)[:3]) for i in range(n)]
29
+ return colors
30
+
31
+
32
+ def init_model():
33
+ # Download model
34
+ model_file = hf_hub_download(
35
+ repo_id="microsoft/BiomedParse",
36
+ filename="biomedparse_v1.pt",
37
+ token=os.getenv("HF_TOKEN"),
38
+ )
39
+
40
+ # Initialize model
41
+ conf_files = "configs/biomedparse_inference.yaml"
42
+ opt = load_opt_from_config_files([conf_files])
43
+ opt = init_distributed(opt)
44
+
45
+ model = BaseModel(opt, build_model(opt)).from_pretrained(model_file).eval().cuda()
46
+ with torch.no_grad():
47
+ model.model.sem_seg_head.predictor.lang_encoder.get_text_embeddings(
48
+ BIOMED_CLASSES + ["background"], is_eval=True
49
+ )
50
+
51
+ return model
52
+
53
+
54
+ def predict(image, prompts):
55
+ if not prompts:
56
+ return None
57
+
58
+ # Convert string input to list
59
+ prompts = [p.strip() for p in prompts.split(",")]
60
+
61
+ # Convert to RGB if needed
62
+ if image.mode != "RGB":
63
+ image = image.convert("RGB")
64
+
65
+ # Get predictions
66
+ pred_mask = interactive_infer_image(model, image, prompts)
67
+
68
+ # Generate visualization
69
+ colors = generate_colors(len(prompts))
70
+ pred_overlay = overlay_masks(
71
+ image, [1 * (pred_mask[i] > 0.5) for i in range(len(prompts))], colors
72
+ )
73
+
74
+ return pred_overlay
75
+
76
+
77
+ def run():
78
+ global model
79
+ model = init_model()
80
+
81
+ demo = gr.Interface(
82
+ fn=predict,
83
+ inputs=[
84
+ gr.Image(type="pil", label="Input Image"),
85
+ gr.Textbox(
86
+ label="Prompts",
87
+ placeholder="Enter prompts separated by commas (e.g., neoplastic cells, inflammatory cells)",
88
+ ),
89
+ ],
90
+ outputs=gr.Image(type="pil", label="Prediction"),
91
+ title="BiomedParse Demo",
92
+ description="Upload a biomedical image and enter prompts (separated by commas) to detect specific features.",
93
+ examples=[
94
+ [
95
+ "examples/Part_1_516_pathology_breast.png",
96
+ "neoplastic cells, inflammatory cells",
97
+ ]
98
+ ],
99
+ )
100
+
101
+ demo.launch(server_name="0.0.0.0", server_port=7860)
102
+
103
+
104
+ if __name__ == "__main__":
105
+ print(f"HF_TOKEN is {'set' if os.getenv('HF_TOKEN') else 'not set'}")
106
+ run()
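A minimal sketch of running the same pipeline without the Gradio UI (assumes a CUDA device and an HF_TOKEN environment variable with access to microsoft/BiomedParse):

from PIL import Image
from main import init_model, generate_colors, overlay_masks
from inference_utils.inference import interactive_infer_image

model = init_model()
image = Image.open("examples/Part_1_516_pathology_breast.png").convert("RGB")
prompts = ["neoplastic cells", "inflammatory cells"]

# one probability map per prompt
pred_mask = interactive_infer_image(model, image, prompts)

# threshold at 0.5 and draw the colored overlay, as predict() does above
colors = generate_colors(len(prompts))
masks = [(pred_mask[i] > 0.5).astype("uint8") for i in range(len(prompts))]
overlay_masks(image, masks, colors).save("prediction_overlay.png")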
modeling/BaseModel.py ADDED
@@ -0,0 +1,45 @@
1
+ import os
2
+ import logging
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+
7
+ from utilities.model import align_and_update_state_dicts
8
+
9
+ from utilities.distributed import init_distributed
10
+ from utilities.arguments import load_opt_from_config_files
11
+
12
+ import huggingface_hub
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
+ class BaseModel(nn.Module):
18
+ def __init__(self, opt, module: nn.Module):
19
+ super(BaseModel, self).__init__()
20
+ self.opt = opt
21
+ self.model = module
22
+
23
+ def forward(self, *inputs, **kwargs):
24
+ outputs = self.model(*inputs, **kwargs)
25
+ return outputs
26
+
27
+ def save_pretrained(self, save_dir):
28
+ torch.save(self.model.state_dict(), os.path.join(save_dir, "model_state_dict.pt"))
29
+
30
+ def from_pretrained(self, pretrained, filename: str = "biomedparse_v1.pt",
31
+ local_dir: str = "./pretrained", config_dir: str = "./configs"):
32
+ if pretrained.startswith("hf_hub:"):
33
+ hub_name = pretrained.split(":")[1]
34
+ huggingface_hub.hf_hub_download(hub_name, filename=filename,
35
+ local_dir=local_dir)
36
+ huggingface_hub.hf_hub_download(hub_name, filename="config.yaml",
37
+ local_dir=config_dir)
38
+ load_dir = os.path.join(local_dir, filename)
39
+ else:
40
+ load_dir = pretrained
41
+
42
+ state_dict = torch.load(load_dir, map_location=self.opt['device'])
43
+ state_dict = align_and_update_state_dicts(self.model.state_dict(), state_dict)
44
+ self.model.load_state_dict(state_dict, strict=False)
45
+ return self
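A sketch of the two loading paths supported by from_pretrained (assuming opt is built the same way as in main.py, so that opt['device'] is populated):

from modeling.BaseModel import BaseModel
from modeling import build_model
from utilities.arguments import load_opt_from_config_files
from utilities.distributed import init_distributed

opt = init_distributed(load_opt_from_config_files(["configs/biomedparse_inference.yaml"]))
model = BaseModel(opt, build_model(opt))

# 1) load weights from a local checkpoint file
model = model.from_pretrained("./pretrained/biomedparse_v1.pt")

# 2) or let from_pretrained download biomedparse_v1.pt and config.yaml from the Hub first
model = model.from_pretrained("hf_hub:microsoft/BiomedParse")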
modeling/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .architectures import build_model
modeling/architectures/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ from .xdecoder_model import *
2
+ from .seem_model_v0 import *
3
+ from .seem_model_v1 import *
4
+ from .seem_model_demo import *
5
+ from .build import build_model
modeling/architectures/build.py ADDED
@@ -0,0 +1,22 @@
1
+ _model_entrypoints = {}
2
+
3
+
4
+ def build_model(config, **kwargs):
5
+ model_name = config['MODEL']['NAME']
6
+
7
+ if not is_model(model_name):
8
+ raise ValueError(f'Unknown model: {model_name}')
9
+
10
+ return model_entrypoints(model_name)(config, **kwargs)
11
+
12
+ def register_model(fn):
13
+ module_name_split = fn.__module__.split('.')
14
+ model_name = module_name_split[-1]
15
+ _model_entrypoints[model_name] = fn
16
+ return fn
17
+
18
+ def model_entrypoints(model_name):
19
+ return _model_entrypoints[model_name]
20
+
21
+ def is_model(model_name):
22
+ return model_name in _model_entrypoints
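How the registry is keyed: register_model stores the decorated function under the name of the module that defines it (fn.__module__.split('.')[-1]), so the config's MODEL.NAME must match the module file name, not the function name. A hypothetical sketch (toy_model.py does not exist in this commit):

# hypothetical file: modeling/architectures/toy_model.py
import torch.nn as nn
from .build import register_model

class ToyModel(nn.Module):
    def forward(self, x):
        return x

@register_model
def get_toy_model(cfg, **kwargs):
    # registered under the key 'toy_model' (the module name)
    return ToyModel()

# elsewhere: build_model({'MODEL': {'NAME': 'toy_model'}}) returns a ToyModel instance,
# provided modeling/architectures/__init__.py imports toy_model so the decorator runs.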
modeling/architectures/seem_model_demo.py ADDED
@@ -0,0 +1,923 @@
1
+ # --------------------------------------------------------
2
+ # SEEM -- Segment Everything Everywhere All at Once
3
+ # Licensed under The Apache License 2.0 [see LICENSE for details]
4
+ # Written by Xueyan Zou ([email protected])
5
+ # --------------------------------------------------------
6
+
7
+ import random
8
+ from typing import Tuple
9
+
10
+ import numpy as np
11
+ import torch
12
+ from torch import nn
13
+ from torch.nn import functional as F
14
+ from kornia.contrib import distance_transform
15
+
16
+ from detectron2.structures import Boxes, ImageList, Instances, BitMasks
17
+ from detectron2.utils.memory import retry_if_cuda_oom
18
+ from detectron2.data import MetadataCatalog
19
+
20
+ from .build import register_model
21
+
22
+ from ..utils import configurable, get_class_names, get_iou
23
+ from ..vision.backbone import build_backbone, Backbone
24
+ from ..body import build_xdecoder_head
25
+ from ..modules import sem_seg_postprocess, SetCriterion, HungarianMatcher, bbox_postprocess
26
+ from ..language import build_language_encoder
27
+ from ..language.loss import vl_similarity
28
+ from utilities.prompt_engineering import prompt_engineering
29
+ from utilities.constants import COCO_PANOPTIC_CLASSES
30
+
31
+
32
+ class GeneralizedSEEM(nn.Module):
33
+
34
+ @configurable
35
+ def __init__(
36
+ self,
37
+ *,
38
+ backbone: Backbone,
39
+ sem_seg_head: nn.Module,
40
+ criterion: nn.Module,
41
+ losses: dict,
42
+ num_queries: int,
43
+ object_mask_threshold: float,
44
+ overlap_threshold: float,
45
+ metadata,
46
+ task_switch: dict,
47
+ phrase_prob: float,
48
+ size_divisibility: int,
49
+ sem_seg_postprocess_before_inference: bool,
50
+ pixel_mean: Tuple[float],
51
+ pixel_std: Tuple[float],
52
+ # inference
53
+ semantic_on: bool,
54
+ panoptic_on: bool,
55
+ instance_on: bool,
56
+ test_topk_per_image: int,
57
+ train_dataset_name: str,
58
+ interactive_mode: str,
59
+ interactive_iter: str,
60
+ dilation_kernel: torch.Tensor,
61
+ ):
62
+ super().__init__()
63
+ self.backbone = backbone
64
+ self.sem_seg_head = sem_seg_head
65
+ self.criterion = criterion
66
+ self.losses = losses
67
+ self.num_queries = num_queries
68
+ self.overlap_threshold = overlap_threshold
69
+ self.object_mask_threshold = object_mask_threshold
70
+ self.metadata = metadata
71
+ if size_divisibility < 0:
72
+ # use backbone size_divisibility if not set
73
+ size_divisibility = self.backbone.size_divisibility
74
+ self.size_divisibility = size_divisibility
75
+ self.sem_seg_postprocess_before_inference = sem_seg_postprocess_before_inference
76
+ self.register_buffer("pixel_mean", torch.Tensor(pixel_mean).view(-1, 1, 1), False)
77
+ self.register_buffer("pixel_std", torch.Tensor(pixel_std).view(-1, 1, 1), False)
78
+
79
+ # additional args
80
+ self.semantic_on = semantic_on
81
+ self.instance_on = instance_on
82
+ self.panoptic_on = panoptic_on
83
+
84
+ # caption argument
85
+ self.task_switch = task_switch
86
+ self.phrase_prob = phrase_prob
87
+
88
+ self.test_topk_per_image = test_topk_per_image
89
+ self.train_class_names = None
90
+ self.interactive_mode = interactive_mode
91
+ self.interactive_iter = interactive_iter
92
+
93
+ if not self.semantic_on:
94
+ assert self.sem_seg_postprocess_before_inference
95
+
96
+ self.register_buffer("dilation_kernel", dilation_kernel)
97
+
98
+ @classmethod
99
+ def from_config(cls, cfg):
100
+ enc_cfg = cfg['MODEL']['ENCODER']
101
+ dec_cfg = cfg['MODEL']['DECODER']
102
+
103
+ openimage_switch = {'grounding': dec_cfg['OPENIMAGE']['GROUNDING'].get('ENABLED', False),
104
+ 'mask': dec_cfg['OPENIMAGE'].get('ENABLED', False)}
105
+
106
+ task_switch = {'bbox': dec_cfg.get('DETECTION', False),
107
+ 'mask': dec_cfg.get('MASK', True),
108
+ 'spatial': dec_cfg['SPATIAL'].get('ENABLED', False),
109
+ 'grounding': dec_cfg['GROUNDING'].get('ENABLED', False),
110
+ 'openimage': openimage_switch,
111
+ 'visual': dec_cfg['VISUAL'].get('ENABLED', False),
112
+ 'audio': dec_cfg['AUDIO'].get('ENABLED', False)}
113
+
114
+ # build model
115
+ extra = {'task_switch': task_switch}
116
+ backbone = build_backbone(cfg)
117
+ lang_encoder = build_language_encoder(cfg)
118
+ sem_seg_head = build_xdecoder_head(cfg, backbone.output_shape(), lang_encoder, extra=extra)
119
+
120
+ # Training Settings.
121
+ loss_weights = {}
122
+ matcher = None
123
+ losses = {}
124
+ weight_dict = {}
125
+ grd_weight = {}
126
+ top_x_layers = {}
127
+ criterion = None
128
+ train_dataset_name = None
129
+ phrase_prob = None
130
+ # Loss parameters:
131
+ deep_supervision = None
132
+ no_object_weight = None
133
+
134
+ interactive_mode = 'best'
135
+ interactive_iter = 20
136
+ dilation = 3
137
+ dilation_kernel = torch.ones((1, 1, dilation, dilation), device=torch.cuda.current_device())
138
+
139
+ return {
140
+ "backbone": backbone,
141
+ "sem_seg_head": sem_seg_head,
142
+ "criterion": criterion,
143
+ "losses": losses,
144
+ "num_queries": dec_cfg['NUM_OBJECT_QUERIES'],
145
+ "object_mask_threshold": dec_cfg['TEST']['OBJECT_MASK_THRESHOLD'],
146
+ "overlap_threshold": dec_cfg['TEST']['OVERLAP_THRESHOLD'],
147
+ "metadata": None,
148
+ "size_divisibility": dec_cfg['SIZE_DIVISIBILITY'],
149
+ "sem_seg_postprocess_before_inference": (
150
+ dec_cfg['TEST']['SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE']
151
+ or dec_cfg['TEST']['PANOPTIC_ON']
152
+ or dec_cfg['TEST']['INSTANCE_ON']
153
+ ),
154
+ "pixel_mean": cfg['INPUT']['PIXEL_MEAN'],
155
+ "pixel_std": cfg['INPUT']['PIXEL_STD'],
156
+ "task_switch": task_switch,
157
+ "phrase_prob": phrase_prob,
158
+ # inference
159
+ "semantic_on": dec_cfg['TEST']['SEMANTIC_ON'],
160
+ "instance_on": dec_cfg['TEST']['INSTANCE_ON'],
161
+ "panoptic_on": dec_cfg['TEST']['PANOPTIC_ON'],
162
+ "test_topk_per_image": cfg['MODEL']['DECODER']['TEST']['DETECTIONS_PER_IMAGE'],
163
+ "train_dataset_name": train_dataset_name,
164
+ "interactive_mode": interactive_mode,
165
+ "interactive_iter": interactive_iter,
166
+ "dilation_kernel": dilation_kernel,
167
+ }
168
+
169
+ @property
170
+ def device(self):
171
+ return self.pixel_mean.device
172
+
173
+ def forward(self, batched_inputs, mode='default'):
174
+ if self.training:
175
+ losses = {}
176
+ if self.task_switch['mask']:
177
+ losses_seg = self.forward_seg(batched_inputs)
178
+ losses.update(losses_seg)
179
+ if self.task_switch['openimage'] and self.task_switch['openimage']['mask']:
180
+ losses_openimage = self.forward_openimage(batched_inputs['openimage'])
181
+ losses_openimage = {key.replace('mask', 'openimage'):value for key, value in losses_openimage.items()}
182
+ losses_openimage = {key.replace('grounding', 'grounding_openimage'):value for key, value in losses_openimage.items()}
183
+ losses.update(losses_openimage)
184
+ for k in list(losses.keys()):
185
+ if k in self.criterion.weight_dict:
186
+ losses[k] *= self.criterion.weight_dict[k]
187
+ else: # remove this loss if not specified in `weight_dict`
188
+ losses.pop(k)
189
+ return losses
190
+ else:
191
+ if mode == 'interactive':
192
+ return self.evaluate_interactive(batched_inputs)
193
+ elif mode == 'grounding_spatial':
194
+ return self.evaluate_grounding_sptial(batched_inputs, mode)
195
+ elif mode in ['grounding_phrasecut', 'grounding_refcoco']:
196
+ return self.evaluate_grounding(batched_inputs, mode)
197
+ else:
198
+ return self.evaluate(batched_inputs)
199
+
200
+
201
+ def forward_seg(self, batched_inputs):
202
+ images = [x["image"].to(self.device) for x in batched_inputs]
203
+ images = [(x - self.pixel_mean) / self.pixel_std for x in images]
204
+ images = ImageList.from_tensors(images, self.size_divisibility)
205
+
206
+ self.sem_seg_head.predictor.lang_encoder.get_text_embeddings(self.train_class_names, is_eval=False)
207
+
208
+ extra = {}
209
+ # mask classification target
210
+ if "instances" in batched_inputs[0]:
211
+ # input bounding box is checked to be correct.
212
+ targets = self.prepare_targets(batched_inputs, images)
213
+
214
+ if self.task_switch['grounding']:
215
+ grounding_tokens = [x['grounding_query_embs'] for x in targets] # need to pad for more than one grounding token
216
+ grounding_tokens = nn.utils.rnn.pad_sequence(grounding_tokens, padding_value=-1)
217
+ non_zero_query_mask = (grounding_tokens.sum(dim=-1) == -grounding_tokens.shape[-1])
218
+ grounding_tokens[non_zero_query_mask] = 0
219
+
220
+ extra['grounding_tokens'] = grounding_tokens
221
+ extra['grounding_nonzero_mask'] = non_zero_query_mask.t()
222
+
223
+ if self.task_switch['spatial']:
224
+ pos_masks = [x['spatial_query']['rand_shape'].to(self.device) for x in batched_inputs]
225
+ neg_masks = [(x['spatial_query']['rand_shape'].to(self.device) & False) for x in batched_inputs]
226
+ fp_masks = torch.stack([(x['spatial_query']['rand_shape'].to(self.device) & False) for x in batched_inputs])
227
+ extra.update({'spatial_query_pos_mask': pos_masks, 'spatial_query_neg_mask': neg_masks, 'false_positive_mask': fp_masks})
228
+
229
+ features = self.backbone(images.tensor)
230
+ mask_features, _, multi_scale_features = self.sem_seg_head.pixel_decoder.forward_features(features)
231
+
232
+ # forward spatial only without gradient
233
+ if self.task_switch['spatial']:
234
+ with torch.no_grad():
235
+ # generate random integeter between [0,3]
236
+ rand_iter_num = random.randint(0, 2)
237
+ for i in range(rand_iter_num):
238
+ outputs = self.sem_seg_head.predictor(multi_scale_features, mask_features, extra=extra, task='spatial')
239
+ extra.update(outputs)
240
+ extra.update(self.prepare_next_spaital_mask(extra, batched_inputs))
241
+
242
+ outputs = self.sem_seg_head.predictor(multi_scale_features, mask_features, extra=extra, task='seg')
243
+ extra = {'lang_logit': self.sem_seg_head.predictor.lang_encoder.logit_scale,
244
+ 'class_embeddings': getattr(self.sem_seg_head.predictor.lang_encoder, '{}_text_embeddings'.format('default')),
245
+ 'false_positive_mask': extra['false_positive_mask']}
246
+ # bipartite matching-based loss
247
+ self.criterion.losses = self.losses['seg'] # seg criterion losses
248
+ losses = self.criterion(outputs, targets, extra)
249
+
250
+ del outputs
251
+ return losses
252
+
253
+ def evaluate_demo(self, batched_inputs):
254
+ assert len(batched_inputs) == 1, "only support batch size equal to 1"
255
+ images = [x["image"].to(self.device) for x in batched_inputs]
256
+ images = [(x - self.pixel_mean) / self.pixel_std for x in images]
257
+ images = ImageList.from_tensors(images, self.size_divisibility)
258
+ img_bs = images.tensor.shape[0]
259
+
260
+ targets = targets_grounding = queries_grounding = None
261
+ features = self.backbone(images.tensor)
262
+ mask_features, transformer_encoder_features, multi_scale_features = self.sem_seg_head.pixel_decoder.forward_features(features)
263
+ image_sizes = [x["image"].shape[-2:] for x in batched_inputs]
264
+
265
+ extra = {}
266
+ if 'stroke' in batched_inputs[0]:
267
+ pos_masks = (batched_inputs[0]['stroke'].to(self.device)).unbind(0)
268
+ pos_masks = ImageList.from_tensors(pos_masks, self.size_divisibility).tensor.unbind(0)
269
+ neg_masks = (batched_inputs[0]['stroke'].to(self.device) & False).unbind(0)
270
+ neg_masks = ImageList.from_tensors(neg_masks, self.size_divisibility).tensor.unbind(0)
271
+ extra.update({'spatial_query_pos_mask': pos_masks, 'spatial_query_neg_mask': neg_masks})
272
+
273
+ if 'visual' in batched_inputs[0]:
274
+ extra.update(batched_inputs[0]['visual'])
275
+
276
+ if 'text' in batched_inputs[0]:
277
+ gtext = self.sem_seg_head.predictor.lang_encoder.get_text_token_embeddings(batched_inputs[0]['text'], name='grounding', token=False, norm=False)
278
+ token_emb = gtext['token_emb']
279
+ tokens = gtext['tokens']
280
+ query_emb = token_emb[tokens['attention_mask'].bool()]
281
+ non_zero_query_mask = torch.zeros(query_emb[:,None].shape[:-1], dtype=torch.bool, device=query_emb.device)
282
+ extra['grounding_tokens'] = query_emb[:,None]
283
+ extra['grounding_nonzero_mask'] = non_zero_query_mask.t()
284
+ extra['grounding_class'] = gtext['class_emb']
285
+
286
+ if 'audio' in batched_inputs[0]:
287
+ gtext = self.sem_seg_head.predictor.lang_encoder.get_text_token_embeddings(batched_inputs[0]['audio'], name='grounding', token=False, norm=False)
288
+ token_emb = gtext['token_emb']
289
+ tokens = gtext['tokens']
290
+ query_emb = token_emb[tokens['attention_mask'].bool()]
291
+ non_zero_query_mask = torch.zeros(query_emb[:,None].shape[:-1], dtype=torch.bool, device=query_emb.device)
292
+ extra['audio_tokens'] = query_emb[:,None]
293
+ extra['audio_nonzero_mask'] = non_zero_query_mask.t()
294
+ extra['audio_class'] = gtext['class_emb']
295
+
296
+ outputs = self.sem_seg_head.predictor(multi_scale_features, mask_features, target_queries=queries_grounding, extra=extra, task='demo')
297
+ return outputs, images.tensor.shape, extra
298
+ # note: everything below this return is unreachable (it duplicates evaluate_interactive)
299
+ assert self.task_switch['spatial']
300
+ assert 'spatial_query' in batched_inputs[0]
301
+ assert len(batched_inputs) == 1, "only support batch size equal to 1"
302
+
303
+ images = [x["image"].to(self.device) for x in batched_inputs]
304
+ images = [(x - self.pixel_mean) / self.pixel_std for x in images]
305
+ images = ImageList.from_tensors(images, self.size_divisibility)
306
+ img_bs = images.tensor.shape[0]
307
+
308
+ targets = targets_grounding = queries_grounding = None
309
+ extra = {}
310
+
311
+ features = self.backbone(images.tensor)
312
+ mask_features, transformer_encoder_features, multi_scale_features = self.sem_seg_head.pixel_decoder.forward_features(features)
313
+
314
+ image_sizes = [x["image"].shape[-2:] for x in batched_inputs]
315
+ nm = len(batched_inputs[0]['spatial_query']['rand_shape'])
316
+ multi_scale_features = [m.repeat(nm,1,1,1) for m in multi_scale_features]
317
+ mask_features = mask_features.repeat(nm,1,1,1)
318
+
319
+ all_batch_shape_iou = []
320
+ pred_smask_pointer = None
321
+ prev_smask_pointer = None
322
+ pred_smask_all = None
323
+
324
+ query_index = self.sem_seg_head.predictor.query_index
325
+ assert self.interactive_mode == 'best'
326
+ pos_masks = (batched_inputs[0]['spatial_query']['rand_shape'].to(self.device)).unbind(0)
327
+ pos_masks = ImageList.from_tensors(pos_masks, self.size_divisibility).tensor.unbind(0)
328
+
329
+ neg_masks = (batched_inputs[0]['spatial_query']['rand_shape'].to(self.device) & False).unbind(0)
330
+ neg_masks = ImageList.from_tensors(neg_masks, self.size_divisibility).tensor.unbind(0)
331
+ extra.update({'spatial_query_pos_mask': pos_masks, 'spatial_query_neg_mask': neg_masks})
332
+
333
+ for i in range(self.interactive_iter):
334
+ outputs = self.sem_seg_head.predictor(multi_scale_features, mask_features, target_queries=queries_grounding, extra=extra, task='spatial')
335
+ extra.update(outputs)
336
+ pred_smask = F.interpolate(outputs['prev_mask'], images.tensor.shape[-2:], mode='bicubic')
337
+
338
+ s = image_sizes[0]
339
+ b = batched_inputs[0]
340
+ pred_smask_all = F.interpolate(pred_smask[:,:,:s[0],:s[1]], (b['height'], b['width']), mode='bicubic')[:,0].sigmoid() > 0.5
341
+ gt_smask = b['gt_masks_orisize']
342
+ all_batch_shape_iou += [get_iou(gt_smask, pred_smask_all)]
343
+ extra.update(self.prepare_next_spaital_mask(extra, batched_inputs))
344
+
345
+ all_batch_shape_iou = torch.stack(all_batch_shape_iou)
346
+ processed_results = [{"mask_iou": all_batch_shape_iou[:,i]} for i in range(len(all_batch_shape_iou[0]))]
347
+ return processed_results
348
+
349
+ def evaluate(self, batched_inputs):
350
+ images = [x["image"].to(self.device) for x in batched_inputs]
351
+ images = [(x - self.pixel_mean) / self.pixel_std for x in images]
352
+
353
+ images = ImageList.from_tensors(images, self.size_divisibility)
354
+ img_bs = images.tensor.shape[0]
355
+
356
+ targets = targets_grounding = queries_grounding = None
357
+ features = self.backbone(images.tensor)
358
+ outputs = self.sem_seg_head(features, target_queries=queries_grounding)
359
+
360
+ mask_cls_results = outputs["pred_logits"]
361
+ mask_pred_results = outputs["pred_masks"]
362
+ box_pred_results = outputs["pred_boxes"] if self.task_switch['bbox'] else [None for i in range(len(mask_pred_results))]
363
+
364
+ # upsample masks
365
+ mask_pred_results = F.interpolate(
366
+ mask_pred_results,
367
+ size=(images.tensor.shape[-2], images.tensor.shape[-1]),
368
+ mode="bilinear",
369
+ align_corners=False,
370
+ )
371
+
372
+ input_size = mask_pred_results.shape[-2:]
373
+ del outputs
374
+
375
+ processed_results = []
376
+ for mask_cls_result, mask_pred_result, box_pred_result, input_per_image, image_size in zip(
377
+ mask_cls_results, mask_pred_results, box_pred_results, batched_inputs, images.image_sizes
378
+ ):
379
+ height = input_per_image.get("height", image_size[0])
380
+ width = input_per_image.get("width", image_size[1])
381
+ processed_results.append({})
382
+
383
+ if self.sem_seg_postprocess_before_inference:
384
+ mask_pred_result = retry_if_cuda_oom(sem_seg_postprocess)(
385
+ mask_pred_result, image_size, height, width
386
+ )
387
+ mask_cls_result = mask_cls_result.to(mask_pred_result)
388
+
389
+ # semantic segmentation inference
390
+ if self.semantic_on:
391
+ r = retry_if_cuda_oom(self.semantic_inference)(mask_cls_result, mask_pred_result)
392
+ if not self.sem_seg_postprocess_before_inference:
393
+ r = retry_if_cuda_oom(sem_seg_postprocess)(r, image_size, height, width)
394
+ processed_results[-1]["sem_seg"] = r
395
+
396
+ # panoptic segmentation inference
397
+ if self.panoptic_on:
398
+ panoptic_r = retry_if_cuda_oom(self.panoptic_inference)(mask_cls_result, mask_pred_result)
399
+ processed_results[-1]["panoptic_seg"] = panoptic_r
400
+
401
+ # instance segmentation inference
402
+ if self.instance_on:
403
+ if self.task_switch['bbox']:
404
+ box_pred_result = bbox_postprocess(box_pred_result, input_size, image_size, height, width)
405
+ instance_r = retry_if_cuda_oom(self.instance_inference)(mask_cls_result, mask_pred_result, box_pred_result)
406
+ processed_results[-1]["instances"] = instance_r
407
+
408
+ return processed_results
409
+
410
+ def evaluate_interactive(self, batched_inputs):
411
+ assert self.task_switch['spatial']
412
+ assert 'spatial_query' in batched_inputs[0]
413
+ assert len(batched_inputs) == 1, "only support batch size equal to 1"
414
+
415
+ images = [x["image"].to(self.device) for x in batched_inputs]
416
+ images = [(x - self.pixel_mean) / self.pixel_std for x in images]
417
+ images = ImageList.from_tensors(images, self.size_divisibility)
418
+ img_bs = images.tensor.shape[0]
419
+
420
+ targets = targets_grounding = queries_grounding = None
421
+ extra = {}
422
+
423
+ features = self.backbone(images.tensor)
424
+ mask_features, transformer_encoder_features, multi_scale_features = self.sem_seg_head.pixel_decoder.forward_features(features)
425
+
426
+ image_sizes = [x["image"].shape[-2:] for x in batched_inputs]
427
+ nm = len(batched_inputs[0]['spatial_query']['rand_shape'])
428
+ multi_scale_features = [m.repeat(nm,1,1,1) for m in multi_scale_features]
429
+ mask_features = mask_features.repeat(nm,1,1,1)
430
+
431
+ all_batch_shape_iou = []
432
+ pred_smask_pointer = None
433
+ prev_smask_pointer = None
434
+ pred_smask_all = None
435
+
436
+ query_index = self.sem_seg_head.predictor.query_index
437
+ assert self.interactive_mode == 'best'
438
+ pos_masks = (batched_inputs[0]['spatial_query']['rand_shape'].to(self.device)).unbind(0)
439
+ pos_masks = ImageList.from_tensors(pos_masks, self.size_divisibility).tensor.unbind(0)
440
+
441
+ neg_masks = (batched_inputs[0]['spatial_query']['rand_shape'].to(self.device) & False).unbind(0)
442
+ neg_masks = ImageList.from_tensors(neg_masks, self.size_divisibility).tensor.unbind(0)
443
+ extra.update({'spatial_query_pos_mask': pos_masks, 'spatial_query_neg_mask': neg_masks})
444
+
445
+ for i in range(self.interactive_iter):
446
+ outputs = self.sem_seg_head.predictor(multi_scale_features, mask_features, target_queries=queries_grounding, extra=extra, task='spatial')
447
+ extra.update(outputs)
448
+ pred_smask = F.interpolate(outputs['prev_mask'], images.tensor.shape[-2:], mode='bicubic')
449
+
450
+ s = image_sizes[0]
451
+ b = batched_inputs[0]
452
+ pred_smask_all = F.interpolate(pred_smask[:,:,:s[0],:s[1]], (b['height'], b['width']), mode='bicubic')[:,0].sigmoid() > 0.5
453
+ gt_smask = b['gt_masks_orisize']
454
+ all_batch_shape_iou += [get_iou(gt_smask, pred_smask_all)]
455
+ extra.update(self.prepare_next_spaital_mask(extra, batched_inputs))
456
+
457
+ all_batch_shape_iou = torch.stack(all_batch_shape_iou)
458
+ processed_results = [{"mask_iou": all_batch_shape_iou[:,i]} for i in range(len(all_batch_shape_iou[0]))]
459
+ return processed_results
460
+
461
+ def evaluate_referring_image(self, batched_inputs, extra={}):
462
+ assert self.task_switch['spatial']
463
+ assert len(batched_inputs) == 1, "only support batch size equal to 1"
464
+ assert self.interactive_mode == 'best'
465
+
466
+ images = [x["image"].to(self.device) for x in batched_inputs]
467
+ images = [(x - self.pixel_mean) / self.pixel_std for x in images]
468
+ images = ImageList.from_tensors(images, self.size_divisibility)
469
+ img_bs = images.tensor.shape[0]
470
+
471
+ targets = targets_grounding = queries_grounding = None
472
+ features = self.backbone(images.tensor)
473
+ mask_features, transformer_encoder_features, multi_scale_features = self.sem_seg_head.pixel_decoder.forward_features(features)
474
+
475
+ if 'spatial_query' in batched_inputs[0]:
476
+ image_sizes = [x["image"].shape[-2:] for x in batched_inputs]
477
+ nm = len(batched_inputs[0]['spatial_query']['rand_shape'])
478
+ multi_scale_features = [m.repeat(nm,1,1,1) for m in multi_scale_features]
479
+ mask_features = mask_features.repeat(nm,1,1,1)
480
+
481
+ query_index = self.sem_seg_head.predictor.query_index
482
+ pos_masks = (batched_inputs[0]['spatial_query']['rand_shape'].to(self.device)).unbind(0)
483
+ pos_masks = ImageList.from_tensors(pos_masks, self.size_divisibility).tensor.unbind(0)
484
+
485
+ neg_masks = (batched_inputs[0]['spatial_query']['rand_shape'].to(self.device) & False).unbind(0)
486
+ neg_masks = ImageList.from_tensors(neg_masks, self.size_divisibility).tensor.unbind(0)
487
+ extra.update({'spatial_query_pos_mask': pos_masks, 'spatial_query_neg_mask': neg_masks})
488
+
489
+ outputs = self.sem_seg_head.predictor(multi_scale_features, mask_features, target_queries=queries_grounding, extra=extra, task='refimg')
490
+ return outputs, images.tensor.shape
491
+
492
+ def evaluate_grounding(self, batched_inputs, mode):
493
+ images = [x["image"].to(self.device) for x in batched_inputs]
494
+ images = [(x - self.pixel_mean) / self.pixel_std for x in images]
495
+ images = ImageList.from_tensors(images, self.size_divisibility)
496
+ assert len(images.tensor) == 1, "grounding evaluation only support single batch size now"
497
+
498
+ extra = {}
499
+ # mask_pred_results = []
500
+ # for idx, batch_per_image in enumerate(batched_inputs):
501
+ # grd_texts = batch_per_image['groundings']['texts']
502
+ # grd_masks = []
503
+ # for anno_text in grd_texts:
504
+ # gtext = self.sem_seg_head.predictor.lang_encoder.get_text_token_embeddings([anno_text[0]], name='grounding', token=False, norm=False)
505
+ # token_emb = gtext['token_emb']
506
+ # tokens = gtext['tokens']
507
+
508
+ # grd_emb = token_emb[0][tokens['attention_mask'].bool()[0]]
509
+ # extra['grounding_tokens'] = grd_emb[:,None]
510
+
511
+ # assert len(images.tensor) == 1, "grounding evaluation only support single batch size now"
512
+ # features = self.backbone(images.tensor)
513
+ # outputs = self.sem_seg_head(features, extra=extra, task='grounding_eval')
514
+
515
+ # pred_gmasks = outputs['pred_masks'][idx,self.num_queries:2*self.num_queries-1]
516
+ # v_emb = outputs['pred_captions'][idx,self.num_queries:2*self.num_queries-1]
517
+ # t_emb = grd_emb[-1:]
518
+
519
+ # t_emb = t_emb / (t_emb.norm(dim=-1, keepdim=True) + 1e-7)
520
+ # v_emb = v_emb / (v_emb.norm(dim=-1, keepdim=True) + 1e-7)
521
+
522
+ # temperature = self.sem_seg_head.predictor.lang_encoder.logit_scale
523
+ # out_prob = vl_similarity(v_emb, t_emb, temperature=temperature)
524
+
525
+ # matched_id = out_prob.max(0)[1]
526
+ # grd_masks += [pred_gmasks[matched_id,:,:]]
527
+ # mask_pred_results += [torch.cat(grd_masks)]
528
+
529
+ # comment for multi object inference.
530
+ mask_pred_results = []
531
+ for idx, batch_per_image in enumerate(batched_inputs):
532
+ grd_texts = batch_per_image['groundings']['texts']
533
+ grd_texts = [x[0] for x in grd_texts]
534
+
535
+ gtext = self.sem_seg_head.predictor.lang_encoder.get_text_token_embeddings(grd_texts, name='grounding', token=False, norm=False)
536
+ token_emb = gtext['token_emb']
537
+ tokens = gtext['tokens']
538
+ query_emb = token_emb[tokens['attention_mask'].bool()]
539
+ non_zero_query_mask = torch.zeros(query_emb[:,None].shape[:-1], dtype=torch.bool, device=query_emb.device)
540
+
541
+ extra['grounding_tokens'] = query_emb[:,None]
542
+ extra['grounding_nonzero_mask'] = non_zero_query_mask.t()
543
+
544
+ features = self.backbone(images.tensor)
545
+ outputs = self.sem_seg_head(features, extra=extra, task='grounding_eval')
546
+
547
+ pred_gmasks = outputs['pred_gmasks'][idx]
548
+ v_emb = outputs['pred_gtexts'][idx]
549
+ t_emb = gtext['class_emb']
550
+
551
+ t_emb = t_emb / (t_emb.norm(dim=-1, keepdim=True) + 1e-7)
552
+ v_emb = v_emb / (v_emb.norm(dim=-1, keepdim=True) + 1e-7)
553
+
554
+ temperature = self.sem_seg_head.predictor.lang_encoder.logit_scale
555
+ out_prob = vl_similarity(v_emb, t_emb, temperature=temperature)
556
+
557
+ matched_id = out_prob.max(0)[1]
558
+ mask_pred_results += [pred_gmasks[matched_id,:,:]]
559
+
560
+ for i in range(len(mask_pred_results)):
561
+ # upsample masks
562
+ mask_pred_results[i] = F.interpolate(
563
+ mask_pred_results[i][None,],
564
+ size=(images.tensor.shape[-2], images.tensor.shape[-1]),
565
+ mode="bilinear",
566
+ align_corners=False,
567
+ )[0]
568
+
569
+ processed_results = []
570
+ for mask_pred_result, input_per_image, image_size in zip(
571
+ mask_pred_results, batched_inputs, images.image_sizes
572
+ ):
573
+ height = input_per_image.get("height", image_size[0])
574
+ width = input_per_image.get("width", image_size[1])
575
+ processed_results.append({})
576
+
577
+ mask_pred_result = retry_if_cuda_oom(sem_seg_postprocess)(
578
+ mask_pred_result, image_size, height, width
579
+ )
580
+ processed_results[-1]['grounding_mask'] = mask_pred_result
581
+
582
+ # compute bbox
583
+ # bbox = BitMasks(mask_pred_result > 0).get_bounding_boxes()
584
+ # bbox = BoxMode.convert(bbox.tensor, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
585
+ # processed_results[-1]['grounding_box'] = bbox
586
+
587
+ return processed_results
588
+
589
+ def evaluate_grounding_sptial(self, batched_inputs, mode):
590
+ images = [x["image"].to(self.device) for x in batched_inputs]
591
+ images = [(x - self.pixel_mean) / self.pixel_std for x in images]
592
+ images = ImageList.from_tensors(images, self.size_divisibility)
593
+ assert len(images.tensor) == 1, "grounding evaluation only support single batch size now"
594
+
595
+ extra = {}
596
+ dilation = 3
597
+ pos_masks = (batched_inputs[0]['spatial_query']['rand_shape'].to(self.device)).unbind(0)
598
+ pos_masks = ImageList.from_tensors(pos_masks, self.size_divisibility).tensor
599
+ pos_masks = (F.conv2d(pos_masks.float(), self.dilation_kernel, padding=dilation//2) > 0).unbind(0)
600
+
601
+ neg_masks = (batched_inputs[0]['spatial_query']['rand_shape'].to(self.device) & False).unbind(0)
602
+ neg_masks = ImageList.from_tensors(neg_masks, self.size_divisibility).tensor.unbind(0)
603
+
604
+ mask_pred_results = []
605
+ for idx, batch_per_image in enumerate(batched_inputs):
606
+ grd_texts = batch_per_image['groundings']['texts']
607
+ grd_masks = []
608
+ for idx2, anno_text in enumerate(grd_texts):
609
+ extra.update({'spatial_query_pos_mask': [pos_masks[idx2]], 'spatial_query_neg_mask': [neg_masks[idx2]]})
610
+
611
+ gtext = self.sem_seg_head.predictor.lang_encoder.get_text_token_embeddings([anno_text[0]], name='grounding', token=False, norm=False)
612
+ token_emb = gtext['token_emb']
613
+ tokens = gtext['tokens']
614
+
615
+ grd_emb = token_emb[0][tokens['attention_mask'].bool()[0]]
616
+ non_zero_query_mask = torch.zeros(grd_emb[:,None].shape[:-1], dtype=torch.bool, device=grd_emb.device)
617
+ extra['grounding_tokens'] = grd_emb[:,None]
618
+ extra['grounding_nonzero_mask'] = non_zero_query_mask.t()
619
+
620
+ assert len(images.tensor) == 1, "grounding evaluation only support single batch size now"
621
+ features = self.backbone(images.tensor)
622
+ outputs = self.sem_seg_head(features, extra=extra, task='grounding_eval')
623
+
624
+ pred_gmasks = outputs['pred_gmasks'][idx]
625
+ v_emb = outputs['pred_gtexts'][idx]
626
+ t_emb = gtext['class_emb']
627
+
628
+ t_emb = t_emb / (t_emb.norm(dim=-1, keepdim=True) + 1e-7)
629
+ v_emb = v_emb / (v_emb.norm(dim=-1, keepdim=True) + 1e-7)
630
+
631
+ temperature = self.sem_seg_head.predictor.lang_encoder.logit_scale
632
+ out_prob = vl_similarity(v_emb, t_emb, temperature=temperature)
633
+
634
+ matched_id = out_prob.max(0)[1]
635
+ grd_masks += [pred_gmasks[matched_id,:,:]]
636
+ mask_pred_results += [torch.cat(grd_masks)]
637
+
638
+ # comment for multi object inference.
639
+ # mask_pred_results = []
640
+ # for idx, batch_per_image in enumerate(batched_inputs):
641
+ # grd_texts = batch_per_image['groundings']['texts']
642
+ # grd_texts = [x[0] for x in grd_texts]
643
+
644
+ # gtext = self.sem_seg_head.predictor.lang_encoder.get_text_token_embeddings(grd_texts, name='grounding', token=False, norm=False)
645
+ # token_emb = gtext['token_emb']
646
+ # tokens = gtext['tokens']
647
+ # query_emb = token_emb[tokens['attention_mask'].bool()]
648
+ # non_zero_query_mask = torch.zeros(query_emb[:,None].shape[:-1], dtype=torch.bool, device=query_emb.device)
649
+
650
+ # extra['grounding_tokens'] = query_emb[:,None]
651
+ # extra['grounding_nonzero_mask'] = non_zero_query_mask.t()
652
+
653
+ # features = self.backbone(images.tensor)
654
+ # outputs = self.sem_seg_head(features, extra=extra, task='grounding_eval')
655
+
656
+ # pred_gmasks = outputs['pred_gmasks'][idx]
657
+ # v_emb = outputs['pred_gtexts'][idx]
658
+ # t_emb = gtext['class_emb']
659
+
660
+ # t_emb = t_emb / (t_emb.norm(dim=-1, keepdim=True) + 1e-7)
661
+ # v_emb = v_emb / (v_emb.norm(dim=-1, keepdim=True) + 1e-7)
662
+
663
+ # temperature = self.sem_seg_head.predictor.lang_encoder.logit_scale
664
+ # out_prob = vl_similarity(v_emb, t_emb, temperature=temperature)
665
+
666
+ # matched_id = out_prob.max(0)[1]
667
+ # mask_pred_results += [pred_gmasks[matched_id,:,:]]
668
+
669
+ for i in range(len(mask_pred_results)):
670
+ # upsample masks
671
+ mask_pred_results[i] = F.interpolate(
672
+ mask_pred_results[i][None,],
673
+ size=(images.tensor.shape[-2], images.tensor.shape[-1]),
674
+ mode="bilinear",
675
+ align_corners=False,
676
+ )[0]
677
+
678
+ processed_results = []
679
+ for mask_pred_result, input_per_image, image_size in zip(
680
+ mask_pred_results, batched_inputs, images.image_sizes
681
+ ):
682
+ height = input_per_image.get("height", image_size[0])
683
+ width = input_per_image.get("width", image_size[1])
684
+ processed_results.append({})
685
+
686
+ mask_pred_result = retry_if_cuda_oom(sem_seg_postprocess)(
687
+ mask_pred_result, image_size, height, width
688
+ )
689
+ processed_results[-1]['grounding_mask'] = mask_pred_result
690
+
691
+ return processed_results
692
+
693
+ def prepare_targets(self, batched_inputs, images):
694
+ h_pad, w_pad = images.tensor.shape[-2:]
695
+ new_targets = []
696
+ for idx, batch_per_image in enumerate(batched_inputs):
697
+ targets_per_image = batch_per_image['instances'].to(self.device)
698
+ # pad gt
699
+ gt_masks = targets_per_image.gt_masks.tensor
700
+ padded_masks = torch.zeros((gt_masks.shape[0], h_pad, w_pad), dtype=gt_masks.dtype, device=gt_masks.device)
701
+ padded_masks[:, : gt_masks.shape[1], : gt_masks.shape[2]] = gt_masks
702
+
703
+ gt_boxes = targets_per_image.gt_boxes.tensor
704
+ ratio = torch.tensor([w_pad,h_pad,w_pad,h_pad]).to(gt_boxes.device)[None,:]
705
+ gt_boxes = gt_boxes / ratio
706
+ xc,yc,w,h = (gt_boxes[:,0] + gt_boxes[:,2])/2, (gt_boxes[:,1] + gt_boxes[:,3])/2, gt_boxes[:,2] - gt_boxes[:,0], gt_boxes[:,3] - gt_boxes[:,1]
707
+ gt_boxes = torch.stack([xc,yc,w,h]).permute(1,0)
708
+
709
+ target_dict = {
710
+ "labels": targets_per_image.gt_classes,
711
+ "is_things": targets_per_image.is_things,
712
+ "masks": padded_masks,
713
+ "boxes": gt_boxes,
714
+ }
715
+
716
+ if self.task_switch['spatial']:
717
+ # prepare targets for spatial query
718
+ target_dict['gt_spatial_masks'] = batch_per_image['spatial_query']['gt_masks']
719
+
720
+ if self.task_switch['grounding']:
721
+ grd_masks = batch_per_image['groundings']['masks']
722
+ grd_texts = batch_per_image['groundings']['texts']
723
+ grd_hash = batch_per_image['groundings']['hash']
724
+ grd_task = batch_per_image['groundings']['mode']
725
+
726
+ if len(grd_masks) == 0:
727
+ padded_masks = None
728
+ else:
729
+ padded_masks = torch.zeros((grd_masks.shape[0], h_pad, w_pad), dtype=grd_masks.dtype, device=grd_masks.device)
730
+ padded_masks[:, : grd_masks.shape[1], : grd_masks.shape[2]] = grd_masks
731
+
732
+ gtext = self.sem_seg_head.predictor.lang_encoder.get_text_token_embeddings(grd_texts, name='grounding', token=False, norm=False)
733
+ token_emb = gtext['token_emb']
734
+ tokens = gtext['tokens']
735
+
736
+ unique_hash_id = np.unique(grd_hash, return_index=True)[1]
737
+ selected_mask = np.zeros(len(grd_hash)).astype(bool)
738
+ selected_mask[unique_hash_id] = True
739
+
740
+ selected_token_emb = token_emb[selected_mask]
741
+ selected_attn_mask = tokens['attention_mask'][selected_mask]
742
+ query_emb = selected_token_emb[selected_attn_mask.bool()]
743
+
744
+ class_idx = tokens['attention_mask'].sum(dim=-1) - 1
745
+ class_idx = torch.stack((torch.arange(len(class_idx), device=class_idx.device), class_idx)).tolist()
746
+ class_emb = token_emb[class_idx]
747
+
748
+ target_dict['grounding_masks'] = padded_masks
749
+ target_dict['grounding_query_embs'] = query_emb
750
+ target_dict['grounding_class_embs'] = class_emb
751
+ target_dict['grounding_hash'] = grd_hash
752
+ target_dict['grounding_task'] = grd_task
753
+
754
+ new_targets.append(target_dict)
755
+ return new_targets
756
+
757
+ def prepare_next_spaital_mask(self, outputs, batched_inputs):
758
+ gt_masks = [batched_inputs[i]['spatial_query']['gt_masks'] for i in range(len(batched_inputs))]
759
+ if self.training:
760
+ gt_masks = ImageList.from_tensors(gt_masks, self.size_divisibility).tensor
761
+ else:
762
+ gt_masks = ImageList.from_tensors(gt_masks, self.size_divisibility).tensor.transpose(0,1)
763
+
764
+ pred_masks = (F.interpolate(outputs['prev_mask'], size=gt_masks.shape[-2:], mode='bilinear', align_corners=False).sigmoid() > 0.5)
765
+ prev_masks = torch.stack(outputs['spatial_query_pos_mask']) | torch.stack(outputs['spatial_query_neg_mask'])
766
+
767
+ fn = gt_masks & (~(gt_masks & pred_masks)) & (~prev_masks) # fn: False Negative, gt:1, pred:0, prev:0
768
+ fp = (~gt_masks & pred_masks) & (~prev_masks) # fp: False Positive, gt:0, pred:1, prev:0
769
+
770
+ # compute iou between gt and pred
771
+ iou = (gt_masks & pred_masks).sum(list(range(1,len(fn.shape)))) / ((gt_masks | pred_masks).sum(dim=list(range(1,len(fn.shape)))) + 1e-8)
772
+ fn_sum = fn.sum(dim=list(range(1,len(fn.shape))))
773
+ fp_sum = fp.sum(dim=list(range(1,len(fp.shape))))
774
+
775
+ is_postive = fn_sum > fp_sum
776
+ # is_postive = torch.ones(len(fn_sum), device=torch.cuda.current_device()).bool()
777
+ select_mask = torch.stack([fn[i] if is_postive[i] else fp[i] for i in range(len(fn))])
778
+
779
+ # conv implementation
780
+ n,_,h,w=select_mask.shape
781
+ mask_dt = (distance_transform((~F.pad(select_mask, pad=(1, 1, 1, 1), mode='constant', value=0)).float())[:,:,1:-1,1:-1]).reshape(n,-1)
782
+ max_xy_idx = torch.stack([torch.arange(n), mask_dt.max(dim=-1)[1].cpu()]).tolist()
783
+ next_mask = torch.zeros(gt_masks.shape, device=torch.cuda.current_device()).bool()
784
+ next_mask = next_mask.view(n,-1)
785
+ next_mask[max_xy_idx] = True
786
+ next_mask = next_mask.reshape((n,1,h,w)).float()
787
+ dilation = 3
788
+ next_mask = F.conv2d(next_mask, self.dilation_kernel, padding=dilation//2) > 0
789
+
790
+ # determine whether next mask is zero
791
+ keep = (iou < 0.925)
792
+ next_mask = next_mask & keep.view(-1,1,1,1)
793
+
794
+ pos_mask = []
795
+ neg_mask = []
796
+ for idx, ip in enumerate(is_postive):
797
+ if ip:
798
+ pos_mask += [outputs['spatial_query_pos_mask'][idx] | next_mask[idx]]
799
+ neg_mask += [outputs['spatial_query_neg_mask'][idx]]
800
+ else:
801
+ pos_mask += [outputs['spatial_query_pos_mask'][idx]]
802
+ neg_mask += [outputs['spatial_query_neg_mask'][idx] | next_mask[idx]]
803
+
804
+ if 'false_positive_mask' in outputs:
805
+ fp = outputs['false_positive_mask'] | fp
806
+ return {'spatial_query_pos_mask': pos_mask, 'spatial_query_neg_mask': neg_mask, 'false_positive_mask': fp}
807
+
808
+ def semantic_inference(self, mask_cls, mask_pred):
809
+ mask_cls = F.softmax(mask_cls, dim=-1)[..., :-1]
810
+ mask_pred = mask_pred.sigmoid()
811
+ semseg = torch.einsum("qc,qhw->chw", mask_cls, mask_pred)
812
+ return semseg
813
+
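semantic_inference above fuses per-query class probabilities (softmax with the trailing no-object column dropped) and per-query mask probabilities into a per-class score map with a single einsum. A small shape sketch, with purely illustrative sizes:

    import torch
    import torch.nn.functional as F

    Q, K, H, W = 101, 133, 64, 64            # queries, classes, mask height/width (illustrative)
    mask_cls = torch.randn(Q, K + 1)         # class logits per query, incl. no-object column
    mask_pred = torch.randn(Q, H, W)         # mask logits per query

    cls_prob = F.softmax(mask_cls, dim=-1)[..., :-1]           # (Q, K)
    semseg = torch.einsum("qc,qhw->chw", cls_prob, mask_pred.sigmoid())
    assert semseg.shape == (K, H, W)         # per-pixel, per-class scores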
814
+ def panoptic_inference(self, mask_cls, mask_pred):
815
+ scores, labels = F.softmax(mask_cls, dim=-1).max(-1)
816
+ mask_pred = mask_pred.sigmoid()
817
+
818
+ keep = labels.ne(self.sem_seg_head.num_classes) & (scores > self.object_mask_threshold)
819
+ cur_scores = scores[keep]
820
+ cur_classes = labels[keep]
821
+ cur_masks = mask_pred[keep]
822
+ cur_mask_cls = mask_cls[keep]
823
+ cur_mask_cls = cur_mask_cls[:, :-1]
824
+
825
+ cur_prob_masks = cur_scores.view(-1, 1, 1) * cur_masks
826
+
827
+ h, w = cur_masks.shape[-2:]
828
+ panoptic_seg = torch.zeros((h, w), dtype=torch.int32, device=cur_masks.device)
829
+ segments_info = []
830
+
831
+ current_segment_id = 0
832
+
833
+ if cur_masks.shape[0] == 0:
834
+ # We didn't detect any mask :(
835
+ return panoptic_seg, segments_info
836
+ else:
837
+ # take argmax
838
+ cur_mask_ids = cur_prob_masks.argmax(0)
839
+ stuff_memory_list = {}
840
+ for k in range(cur_classes.shape[0]):
841
+ pred_class = cur_classes[k].item()
842
+ isthing = pred_class in self.metadata.thing_dataset_id_to_contiguous_id.values()
843
+ mask_area = (cur_mask_ids == k).sum().item()
844
+ original_area = (cur_masks[k] >= 0.5).sum().item()
845
+ mask = (cur_mask_ids == k) & (cur_masks[k] >= 0.5)
846
+
847
+ if mask_area > 0 and original_area > 0 and mask.sum().item() > 0:
848
+ if mask_area / original_area < self.overlap_threshold:
849
+ continue
850
+
851
+ # merge stuff regions
852
+ if not isthing:
853
+ if int(pred_class) in stuff_memory_list.keys():
854
+ panoptic_seg[mask] = stuff_memory_list[int(pred_class)]
855
+ continue
856
+ else:
857
+ stuff_memory_list[int(pred_class)] = current_segment_id + 1
858
+
859
+ current_segment_id += 1
860
+ panoptic_seg[mask] = current_segment_id
861
+
862
+ segments_info.append(
863
+ {
864
+ "id": current_segment_id,
865
+ "isthing": bool(isthing),
866
+ "category_id": int(pred_class),
867
+ }
868
+ )
869
+
870
+ return panoptic_seg, segments_info
871
+
872
+ def instance_inference(self, mask_cls, mask_pred, box_pred):
873
+ # mask_pred is already processed to have the same shape as original input
874
+ image_size = mask_pred.shape[-2:]
875
+
876
+ # [Q, K]
877
+ scores = F.softmax(mask_cls, dim=-1)[:, :-1]
878
+ labels = torch.arange(self.sem_seg_head.num_classes, device=self.device).unsqueeze(0).repeat(self.num_queries, 1).flatten(0, 1)
879
+ # scores_per_image, topk_indices = scores.flatten(0, 1).topk(self.num_queries, sorted=False)
880
+ scores_per_image, topk_indices = scores.flatten(0, 1).topk(self.test_topk_per_image, sorted=False)
881
+
882
+ labels_per_image = labels[topk_indices]
883
+ topk_indices = (topk_indices // self.sem_seg_head.num_classes)
884
+ # mask_pred = mask_pred.unsqueeze(1).repeat(1, self.sem_seg_head.num_classes, 1).flatten(0, 1)
885
+ mask_pred = mask_pred[topk_indices]
886
+ if box_pred is not None:
887
+ box_pred = box_pred[topk_indices]
888
+
889
+ # if this is panoptic segmentation, we only keep the "thing" classes
890
+ if self.panoptic_on:
891
+ keep = torch.zeros_like(scores_per_image).bool()
892
+ for i, lab in enumerate(labels_per_image):
893
+ keep[i] = lab in self.metadata.thing_dataset_id_to_contiguous_id.values()
894
+
895
+ scores_per_image = scores_per_image[keep]
896
+ labels_per_image = labels_per_image[keep]
897
+ mask_pred = mask_pred[keep]
898
+
899
+ if box_pred is not None:
900
+ box_pred = box_pred[keep]
901
+
902
+ result = Instances(image_size)
903
+ # mask (before sigmoid)
904
+ result.pred_masks = (mask_pred > 0).float()
905
+ # result.pred_boxes = Boxes(torch.zeros(mask_pred.size(0), 4))
906
+ # Uncomment the following to get boxes from masks (this is slow)
907
+
908
+ if box_pred is not None:
909
+ result.pred_boxes = BitMasks(mask_pred > 0).get_bounding_boxes()
910
+ else:
911
+ result.pred_boxes = Boxes(torch.zeros(mask_pred.size(0), 4))
912
+
913
+ # calculate average mask prob
914
+ mask_scores_per_image = (mask_pred.sigmoid().flatten(1) * result.pred_masks.flatten(1)).sum(1) / (result.pred_masks.flatten(1).sum(1) + 1e-6)
915
+ result.scores = scores_per_image * mask_scores_per_image
916
+ result.pred_classes = labels_per_image
917
+
918
+ return result
919
+
920
+
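instance_inference above selects detections by flattening the (num_queries x num_classes) score matrix and taking a global top-k; dividing the flat index by the number of classes recovers the query, while the precomputed label grid recovers the class. A minimal sketch with illustrative sizes:

    import torch

    Q, K, topk = 6, 4, 5                     # queries, classes, detections to keep (illustrative)
    scores = torch.rand(Q, K)                # per-query class probabilities (no-object dropped)
    labels = torch.arange(K).unsqueeze(0).repeat(Q, 1).flatten(0, 1)

    scores_per_det, flat_idx = scores.flatten(0, 1).topk(topk, sorted=False)
    classes_per_det = labels[flat_idx]       # equivalent to flat_idx % K
    query_per_det = flat_idx // K            # which query each detection came from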
921
+ @register_model
922
+ def get_seem_model(cfg, **kwargs):
923
+ return GeneralizedSEEM(cfg)
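The @register_model decorator and the get_seem_model factory are what let the entry points build this architecture by name from a config. The sketch below shows the usual shape of such a registry; the names here are hypothetical, and the actual implementation lives in modeling/architectures/build.py in this commit.

    # Hypothetical registry sketch, not the code from build.py.
    _MODEL_REGISTRY = {}

    def register_model(fn):
        _MODEL_REGISTRY[fn.__name__] = fn    # e.g. 'get_seem_model' -> factory function
        return fn

    def build_model_by_name(name, cfg, **kwargs):
        if name not in _MODEL_REGISTRY:
            raise KeyError(f"unknown model factory: {name}")
        return _MODEL_REGISTRY[name](cfg, **kwargs)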
modeling/architectures/seem_model_v0.py ADDED
@@ -0,0 +1,1160 @@
1
+ # --------------------------------------------------------
2
+ # SEEM -- Segment Everything Everywhere All at Once
3
+ # Licensed under The Apache License 2.0 [see LICENSE for details]
4
+ # Written by Xueyan Zou ([email protected])
5
+ # --------------------------------------------------------
6
+
7
+ import random
8
+ from typing import Tuple
9
+
10
+ import numpy as np
11
+ import torch
12
+ from torch import nn
13
+ from torch.nn import functional as F
14
+ from kornia.contrib import distance_transform
15
+
16
+ from detectron2.structures import Boxes, ImageList, Instances, BitMasks
17
+ from detectron2.utils.memory import retry_if_cuda_oom
18
+ from detectron2.data import MetadataCatalog
19
+
20
+ from .build import register_model
21
+
22
+ from ..utils import configurable, get_class_names, get_iou
23
+ from ..vision.backbone import build_backbone, Backbone
24
+ from ..body import build_xdecoder_head
25
+ from ..modules import sem_seg_postprocess, SetCriterion, HungarianMatcher, bbox_postprocess
26
+ from ..language import build_language_encoder
27
+ from ..language.loss import vl_similarity
28
+ from utilities.prompt_engineering import prompt_engineering
29
+ from utilities.constants import COCO_PANOPTIC_CLASSES
30
+
31
+
32
+ class GeneralizedSEEM(nn.Module):
33
+
34
+ @configurable
35
+ def __init__(
36
+ self,
37
+ *,
38
+ backbone: Backbone,
39
+ sem_seg_head: nn.Module,
40
+ criterion: nn.Module,
41
+ losses: dict,
42
+ num_queries: int,
43
+ object_mask_threshold: float,
44
+ overlap_threshold: float,
45
+ metadata,
46
+ task_switch: dict,
47
+ phrase_prob: float,
48
+ size_divisibility: int,
49
+ sem_seg_postprocess_before_inference: bool,
50
+ pixel_mean: Tuple[float],
51
+ pixel_std: Tuple[float],
52
+ # inference
53
+ semantic_on: bool,
54
+ panoptic_on: bool,
55
+ instance_on: bool,
56
+ test_topk_per_image: int,
57
+ train_dataset_name: str,
58
+ interactive_mode: str,
59
+ interactive_iter: int,
60
+ dilation_kernel: torch.Tensor,
61
+ train_max_iter: int,
62
+ ):
63
+ """
64
+ Args:
65
+ backbone: a backbone module, must follow detectron2's backbone interface
66
+ sem_seg_head: a module that predicts semantic segmentation from backbone features
67
+ criterion: a module that defines the loss
68
+ num_queries: int, number of queries
69
+ object_mask_threshold: float, threshold to filter query based on classification score
70
+ for panoptic segmentation inference
71
+ overlap_threshold: overlap threshold used in general inference for panoptic segmentation
72
+ metadata: dataset meta, get `thing` and `stuff` category names for panoptic
73
+ segmentation inference
74
+ size_divisibility: Some backbones require the input height and width to be divisible by a
75
+ specific integer. We can use this to override such requirement.
76
+ sem_seg_postprocess_before_inference: whether to resize the prediction back
77
+ to original input size before semantic segmentation inference or after.
78
+ For high-resolution dataset like Mapillary, resizing predictions before
79
+ inference will cause OOM error.
80
+ pixel_mean, pixel_std: list or tuple with #channels element, representing
81
+ the per-channel mean and std to be used to normalize the input image
82
+ semantic_on: bool, whether to output semantic segmentation prediction
83
+ instance_on: bool, whether to output instance segmentation prediction
84
+ panoptic_on: bool, whether to output panoptic segmentation prediction
85
+ test_topk_per_image: int, instance segmentation parameter, keep topk instances per image
86
+ """
87
+ super().__init__()
88
+ self.backbone = backbone
89
+ self.sem_seg_head = sem_seg_head
90
+ self.criterion = criterion
91
+ self.losses = losses
92
+ self.num_queries = num_queries
93
+ self.overlap_threshold = overlap_threshold
94
+ self.object_mask_threshold = object_mask_threshold
95
+ self.metadata = metadata
96
+ if size_divisibility < 0:
97
+ # use backbone size_divisibility if not set
98
+ size_divisibility = self.backbone.size_divisibility
99
+ self.size_divisibility = size_divisibility
100
+ self.sem_seg_postprocess_before_inference = sem_seg_postprocess_before_inference
101
+ self.register_buffer("pixel_mean", torch.Tensor(pixel_mean).view(-1, 1, 1), False)
102
+ self.register_buffer("pixel_std", torch.Tensor(pixel_std).view(-1, 1, 1), False)
103
+
104
+ # additional args
105
+ self.semantic_on = semantic_on
106
+ self.instance_on = instance_on
107
+ self.panoptic_on = panoptic_on
108
+
109
+ # caption argument
110
+ self.task_switch = task_switch
111
+ self.phrase_prob = phrase_prob
112
+ self.train_max_iter = train_max_iter
113
+
114
+ self.test_topk_per_image = test_topk_per_image
115
+ self.train_class_names = get_class_names(train_dataset_name)
116
+ self.interactive_mode = interactive_mode
117
+ self.interactive_iter = interactive_iter
118
+
119
+ if not self.semantic_on:
120
+ assert self.sem_seg_postprocess_before_inference
121
+
122
+ self.register_buffer("dilation_kernel", dilation_kernel)
123
+
124
+ @classmethod
125
+ def from_config(cls, cfg):
126
+ enc_cfg = cfg['MODEL']['ENCODER']
127
+ dec_cfg = cfg['MODEL']['DECODER']
128
+
129
+ # Loss parameters:
130
+ deep_supervision = dec_cfg['DEEP_SUPERVISION']
131
+ no_object_weight = dec_cfg['NO_OBJECT_WEIGHT']
132
+
133
+ # loss weights
134
+ loss_weights = {'mask': {'ce': dec_cfg['CLASS_WEIGHT'], 'dice': dec_cfg['DICE_WEIGHT'], 'bce': dec_cfg['MASK_WEIGHT']},
135
+ 'bbox': {'l1': dec_cfg['BBOX_WEIGHT'], 'giou': dec_cfg['GIOU_WEIGHT']},
136
+ 'spatial': {'ce': dec_cfg['SCLASS_WEIGHT'], 'dice': dec_cfg['SDICE_WEIGHT'], 'bce': dec_cfg['SMASK_WEIGHT']},
137
+ 'grounding': {'ce': dec_cfg['GCLASS_WEIGHT'], 'dice': dec_cfg['GDICE_WEIGHT'], 'bce': dec_cfg['GMASK_WEIGHT']},
138
+ 'openimage': {'ce': dec_cfg['OCLASS_WEIGHT'], 'dice': dec_cfg['ODICE_WEIGHT'], 'bce': dec_cfg['OMASK_WEIGHT']}}
139
+
140
+ openimage_switch = {'grounding': dec_cfg['OPENIMAGE']['GROUNDING'].get('ENABLED', False),
141
+ 'mask': dec_cfg['OPENIMAGE'].get('ENABLED', False)}
142
+
143
+ task_switch = {'bbox': dec_cfg.get('DETECTION', False),
144
+ 'mask': dec_cfg['MASK'].get('ENABLED', True),
145
+ 'spatial': dec_cfg['SPATIAL'].get('ENABLED', False),
146
+ 'grounding': dec_cfg['GROUNDING'].get('ENABLED', False),
147
+ 'openimage': openimage_switch}
148
+
149
+ top_x_layers = {'mask': dec_cfg.get('TOP_MASK_LAYERS', 10),
150
+ 'grounding': dec_cfg.get('TOP_GROUNDING_LAYERS', 10),
151
+ 'openimage': dec_cfg.get('TOP_OPENIMAGE_LAYERS', 10),
152
+ 'spatial': dec_cfg.get('TOP_SPATIAL_LAYERS', 10)}
153
+
154
+ spatial_cost = {"class_weight": dec_cfg['COST_SPATIAL']['CLASS_WEIGHT'],
155
+ "mask_weight": dec_cfg['COST_SPATIAL']['MASK_WEIGHT'],
156
+ "dice_weight": dec_cfg['COST_SPATIAL']['DICE_WEIGHT']}
157
+
158
+ extra = {'task_switch': task_switch}
159
+ backbone = build_backbone(cfg)
160
+ lang_encoder = build_language_encoder(cfg)
161
+ sem_seg_head = build_xdecoder_head(cfg, backbone.output_shape(), lang_encoder, extra=extra)
162
+
163
+ # building criterion
164
+ matcher = HungarianMatcher(
165
+ cost_class=loss_weights['mask']['ce'],
166
+ cost_mask=loss_weights['mask']['bce'],
167
+ cost_dice=loss_weights['mask']['dice'],
168
+ num_points=dec_cfg['TRAIN_NUM_POINTS'],
169
+ spatial_cost=spatial_cost,
170
+ )
171
+
172
+ # init weight dict and criterion loss functions.
173
+ losses = {'seg': [], 'openimage': []}
174
+ if task_switch['mask']:
175
+ losses['seg'] += ["labels", "masks"]
176
+ if task_switch['spatial']:
177
+ losses['seg'] += ["spatials"]
178
+ if task_switch['grounding']:
179
+ losses['seg'] += ["groundings"]
180
+ if task_switch['openimage']:
181
+ losses['openimage'] += ["labels_openimage", "masks"]
182
+ if task_switch['openimage']['grounding']:
183
+ losses['openimage'] += ["groundings"]
184
+
185
+ weight_dict = {}
186
+ for key, turn_on in task_switch.items():
187
+ if turn_on:
188
+ if isinstance(loss_weights[key], dict):
189
+ # HACK it should support bbox in the future
190
+ for key_, weight in loss_weights[key].items():
191
+ weight_dict["loss_{}_{}_0".format(key, key_)] = weight # NOTE: hard code for segmentation that has multiple loss
192
+ else:
193
+ weight_dict["loss_{}_0".format(key)] = loss_weights[key]
194
+
195
+ # generate full weight dict and remove not computed layers.
196
+ if deep_supervision:
197
+ dec_layers = dec_cfg['DEC_LAYERS']
198
+ aux_weight_dict = {}
199
+ for i in range(dec_layers - 1):
200
+ for k, v in weight_dict.items():
201
+ if (i+1) > (top_x_layers[k.split('_')[1]] - 1):
202
+ continue
203
+ aux_weight_dict.update({k.replace('_0', f"_{i+1}"): v})
204
+ weight_dict.update(aux_weight_dict)
205
+
206
+ grd_weight = {'text': dec_cfg['GROUNDING']['TEXT_WEIGHT'], 'class': dec_cfg['GROUNDING']['CLASS_WEIGHT']}
207
+ # generate criterion for the loss function.
208
+ criterion = SetCriterion(
209
+ sem_seg_head.num_classes,
210
+ matcher=matcher,
211
+ weight_dict=weight_dict,
212
+ top_x_layers=top_x_layers,
213
+ eos_coef=no_object_weight,
214
+ losses=[],
215
+ num_points=dec_cfg['TRAIN_NUM_POINTS'],
216
+ oversample_ratio=dec_cfg['OVERSAMPLE_RATIO'],
217
+ importance_sample_ratio=dec_cfg['IMPORTANCE_SAMPLE_RATIO'],
218
+ grounding_weight=grd_weight,
219
+ )
220
+
221
+ # extra logic
222
+ train_dataset_name = cfg['DATASETS']['TRAIN'][0] # HACK for only one training set.
223
+ train_max_iter = dec_cfg['SPATIAL'].get('MAX_ITER', 3)
224
+ phrase_prob = dec_cfg['CAPTION'].get('PHRASE_PROB', 0.5)
225
+ interactive_mode = cfg['STROKE_SAMPLER']['EVAL']['MODE']
226
+ interactive_iter = cfg['STROKE_SAMPLER']['EVAL']['MAX_ITER']
227
+
228
+ dilation = 3
229
+ dilation_kernel = torch.ones((1, 1, dilation, dilation), device=torch.cuda.current_device())
230
+
231
+ return {
232
+ "backbone": backbone,
233
+ "sem_seg_head": sem_seg_head,
234
+ "criterion": criterion,
235
+ "losses": losses,
236
+ "num_queries": dec_cfg['NUM_OBJECT_QUERIES'],
237
+ "object_mask_threshold": dec_cfg['TEST']['OBJECT_MASK_THRESHOLD'],
238
+ "overlap_threshold": dec_cfg['TEST']['OVERLAP_THRESHOLD'],
239
+ "metadata": MetadataCatalog.get(cfg['DATASETS']['TRAIN'][0]),
240
+ "size_divisibility": dec_cfg['SIZE_DIVISIBILITY'],
241
+ "sem_seg_postprocess_before_inference": (
242
+ dec_cfg['TEST']['SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE']
243
+ or dec_cfg['TEST']['PANOPTIC_ON']
244
+ or dec_cfg['TEST']['INSTANCE_ON']
245
+ ),
246
+ "pixel_mean": cfg['INPUT']['PIXEL_MEAN'],
247
+ "pixel_std": cfg['INPUT']['PIXEL_STD'],
248
+ "task_switch": task_switch,
249
+ "phrase_prob": phrase_prob,
250
+ # inference
251
+ "semantic_on": dec_cfg['TEST']['SEMANTIC_ON'],
252
+ "instance_on": dec_cfg['TEST']['INSTANCE_ON'],
253
+ "panoptic_on": dec_cfg['TEST']['PANOPTIC_ON'],
254
+ "test_topk_per_image": cfg['TEST']['DETECTIONS_PER_IMAGE'],
255
+ "train_dataset_name": train_dataset_name,
256
+ "interactive_mode": interactive_mode,
257
+ "interactive_iter": interactive_iter,
258
+ "dilation_kernel": dilation_kernel,
259
+ "train_max_iter": train_max_iter,
260
+ }
261
+
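With deep supervision enabled, from_config above replicates every layer-0 loss weight once per auxiliary decoder layer, skipping layers beyond the task's TOP_*_LAYERS limit. A small standalone sketch of that expansion (the weights and layer counts are illustrative, not the configured values):

    weight_dict = {"loss_mask_ce_0": 2.0, "loss_mask_dice_0": 5.0}   # illustrative weights
    top_x_layers = {"mask": 10}
    dec_layers = 3

    aux_weight_dict = {}
    for i in range(dec_layers - 1):
        for k, v in weight_dict.items():
            task = k.split("_")[1]                      # "mask"
            if (i + 1) > (top_x_layers[task] - 1):
                continue
            aux_weight_dict[k.replace("_0", f"_{i + 1}")] = v
    weight_dict.update(aux_weight_dict)
    # weight_dict now also holds loss_mask_ce_1/2 and loss_mask_dice_1/2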
262
+ @property
263
+ def device(self):
264
+ return self.pixel_mean.device
265
+
266
+ def forward(self, batched_inputs, mode='default'):
267
+ """
268
+ Args:
269
+ batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
270
+ Each item in the list contains the inputs for one image.
271
+ For now, each item in the list is a dict that contains:
272
+ * "image": Tensor, image in (C, H, W) format.
273
+ * "instances": per-region ground truth
274
+ * Other information that's included in the original dicts, such as:
275
+ "height", "width" (int): the output resolution of the model (may be different
276
+ from input resolution), used in inference.
277
+ Returns:
278
+ list[dict]:
279
+ each dict has the results for one image. The dict contains the following keys:
280
+
281
+ * "sem_seg":
282
+ A Tensor that represents the
283
+ per-pixel segmentation predicted by the head.
284
+ The prediction has shape KxHxW that represents the logits of
285
+ each class for each pixel.
286
+ * "panoptic_seg":
287
+ A tuple that represent panoptic output
288
+ panoptic_seg (Tensor): of shape (height, width) where the values are ids for each segment.
289
+ segments_info (list[dict]): Describe each segment in `panoptic_seg`.
290
+ Each dict contains keys "id", "category_id", "isthing".
291
+ """
292
+ if self.training:
293
+ losses = {}
294
+ if self.task_switch['mask'] or self.task_switch['grounding'] or self.task_switch['spatial']:
295
+ losses_seg = self.forward_seg(batched_inputs)
296
+ losses.update(losses_seg)
297
+ if self.task_switch['openimage'] and self.task_switch['openimage']['mask']:
298
+ losses_openimage = self.forward_openimage(batched_inputs['openimage'])
299
+ losses_openimage = {key.replace('mask', 'openimage'):value for key, value in losses_openimage.items()}
300
+ losses_openimage = {key.replace('grounding', 'grounding_openimage'):value for key, value in losses_openimage.items()}
301
+ losses.update(losses_openimage)
302
+ for k in list(losses.keys()):
303
+ if k in self.criterion.weight_dict:
304
+ losses[k] *= self.criterion.weight_dict[k]
305
+ else: # remove this loss if not specified in `weight_dict`
306
+ losses.pop(k)
307
+ return losses
308
+ else:
309
+ if mode == 'interactive':
310
+ return self.evaluate_interactive(batched_inputs)
311
+ elif mode == 'interactive_grounding':
312
+ return self.evaluate_interactive_grounding(batched_inputs)
313
+ elif mode == 'grounding_spatial':
314
+ return self.evaluate_grounding_sptial(batched_inputs, mode)
315
+ elif mode in ['grounding_phrasecut', 'grounding_refcoco']:
316
+ return self.evaluate_grounding(batched_inputs, mode)
317
+ else:
318
+ return self.evaluate(batched_inputs)
319
+
320
+
321
+ def forward_seg(self, batched_inputs):
322
+ images = [x["image"].to(self.device) for x in batched_inputs]
323
+ images = [(x - self.pixel_mean) / self.pixel_std for x in images]
324
+ images = ImageList.from_tensors(images, self.size_divisibility)
325
+
326
+ self.sem_seg_head.predictor.lang_encoder.get_text_embeddings(self.train_class_names, is_eval=False)
327
+
328
+ extra = {}
329
+ # mask classification target
330
+ if "instances" in batched_inputs[0]:
331
+ # input bounding box is checked to be correct.
332
+ targets = self.prepare_targets(batched_inputs, images)
333
+
334
+ if self.task_switch['grounding']:
335
+ grounding_tokens = [x['grounding_query_embs'] for x in targets] # need to pad for more than one grounding token
336
+ grounding_tokens = nn.utils.rnn.pad_sequence(grounding_tokens, padding_value=-1)
337
+ non_zero_query_mask = (grounding_tokens.sum(dim=-1) == -grounding_tokens.shape[-1])
338
+ grounding_tokens[non_zero_query_mask] = 0
339
+
340
+ extra['grounding_tokens'] = grounding_tokens
341
+ extra['grounding_nonzero_mask'] = non_zero_query_mask.t()
342
+
343
+ if self.task_switch['spatial']:
344
+ pos_masks = [x['spatial_query']['rand_shape'].to(self.device) for x in batched_inputs]
345
+ neg_masks = [(x['spatial_query']['rand_shape'].to(self.device) & False) for x in batched_inputs]
346
+ fp_masks = torch.stack([(x['spatial_query']['rand_shape'].to(self.device) & False) for x in batched_inputs])
347
+ extra.update({'spatial_query_pos_mask': pos_masks, 'spatial_query_neg_mask': neg_masks, 'false_positive_mask': fp_masks})
348
+
349
+ features = self.backbone(images.tensor)
350
+ mask_features, _, multi_scale_features = self.sem_seg_head.pixel_decoder.forward_features(features)
351
+
352
+ # forward spatial only without gradient
353
+ if self.task_switch['spatial']:
354
+ with torch.no_grad():
355
+ # sample a random integer in [0, train_max_iter]
356
+ rand_iter_num = random.randint(0, self.train_max_iter)
357
+ for i in range(rand_iter_num):
358
+ outputs = self.sem_seg_head.predictor(multi_scale_features, mask_features, extra=extra, task='spatial')
359
+ extra.update(outputs)
360
+ extra.update(self.prepare_next_spaital_mask(extra, batched_inputs))
361
+
362
+ outputs = self.sem_seg_head.predictor(multi_scale_features, mask_features, extra=extra, task='seg')
363
+
364
+ extra = {'lang_logit': self.sem_seg_head.predictor.lang_encoder.logit_scale,
365
+ 'class_embeddings': getattr(self.sem_seg_head.predictor.lang_encoder, '{}_text_embeddings'.format('default')),
366
+ 'false_positive_mask': extra['false_positive_mask']}
367
+ # bipartite matching-based loss
368
+ self.criterion.losses = self.losses['seg'] # seg criterion losses
369
+ losses = self.criterion(outputs, targets, extra)
370
+
371
+ del outputs
372
+ return losses
373
+
374
+ def evaluate(self, batched_inputs):
375
+ images = [x["image"].to(self.device) for x in batched_inputs]
376
+ images = [(x - self.pixel_mean) / self.pixel_std for x in images]
377
+
378
+ images = ImageList.from_tensors(images, self.size_divisibility)
379
+ img_bs = images.tensor.shape[0]
380
+
381
+ targets = targets_grounding = queries_grounding = None
382
+ features = self.backbone(images.tensor)
383
+ outputs = self.sem_seg_head(features, target_queries=queries_grounding)
384
+
385
+ mask_cls_results = outputs["pred_logits"]
386
+ mask_pred_results = outputs["pred_masks"]
387
+ box_pred_results = outputs["pred_boxes"] if self.task_switch['bbox'] else [None for i in range(len(mask_pred_results))]
388
+
389
+ # upsample masks
390
+ mask_pred_results = F.interpolate(
391
+ mask_pred_results,
392
+ size=(images.tensor.shape[-2], images.tensor.shape[-1]),
393
+ mode="bilinear",
394
+ align_corners=False,
395
+ )
396
+
397
+ input_size = mask_pred_results.shape[-2:]
398
+ del outputs
399
+
400
+ processed_results = []
401
+ for mask_cls_result, mask_pred_result, box_pred_result, input_per_image, image_size in zip(
402
+ mask_cls_results, mask_pred_results, box_pred_results, batched_inputs, images.image_sizes
403
+ ):
404
+ height = input_per_image.get("height", image_size[0])
405
+ width = input_per_image.get("width", image_size[1])
406
+ processed_results.append({})
407
+
408
+ if self.sem_seg_postprocess_before_inference:
409
+ mask_pred_result = retry_if_cuda_oom(sem_seg_postprocess)(
410
+ mask_pred_result, image_size, height, width
411
+ )
412
+ mask_cls_result = mask_cls_result.to(mask_pred_result)
413
+
414
+ # semantic segmentation inference
415
+ if self.semantic_on:
416
+ r = retry_if_cuda_oom(self.semantic_inference)(mask_cls_result, mask_pred_result)
417
+ if not self.sem_seg_postprocess_before_inference:
418
+ r = retry_if_cuda_oom(sem_seg_postprocess)(r, image_size, height, width)
419
+ processed_results[-1]["sem_seg"] = r
420
+
421
+ # panoptic segmentation inference
422
+ if self.panoptic_on:
423
+ panoptic_r = retry_if_cuda_oom(self.panoptic_inference)(mask_cls_result, mask_pred_result)
424
+ processed_results[-1]["panoptic_seg"] = panoptic_r
425
+
426
+ # instance segmentation inference
427
+ if self.instance_on:
428
+ if self.task_switch['bbox']:
429
+ box_pred_result = bbox_postprocess(box_pred_result, input_size, image_size, height, width)
430
+ instance_r = retry_if_cuda_oom(self.instance_inference)(mask_cls_result, mask_pred_result, box_pred_result)
431
+ processed_results[-1]["instances"] = instance_r
432
+
433
+ return processed_results
434
+
435
+ def evaluate_interactive(self, batched_inputs):
436
+ assert self.task_switch['spatial']
437
+ assert 'spatial_query' in batched_inputs[0]
438
+ assert len(batched_inputs) == 1, "only support batch size equal to 1"
439
+
440
+ images = [x["image"].to(self.device) for x in batched_inputs]
441
+ images = [(x - self.pixel_mean) / self.pixel_std for x in images]
442
+ images = ImageList.from_tensors(images, self.size_divisibility)
443
+ img_bs = images.tensor.shape[0]
444
+
445
+ targets = targets_grounding = queries_grounding = None
446
+ extra = {}
447
+
448
+ features = self.backbone(images.tensor)
449
+ mask_features, transformer_encoder_features, multi_scale_features = self.sem_seg_head.pixel_decoder.forward_features(features)
450
+
451
+ image_sizes = [x["image"].shape[-2:] for x in batched_inputs]
452
+ nm = len(batched_inputs[0]['spatial_query']['rand_shape'])
453
+ multi_scale_features = [m.repeat(nm,1,1,1) for m in multi_scale_features]
454
+ mask_features = mask_features.repeat(nm,1,1,1)
455
+
456
+ all_batch_shape_iou = []
457
+ pred_smask_pointer = None
458
+ prev_smask_pointer = None
459
+ pred_smask_all = None
460
+
461
+ # visualization code
462
+ # v_pred_mask = []
463
+ # v_pos_mask = []
464
+ # v_neg_mask = []
465
+ # v_gt_mask = batched_inputs[0]['spatial_query']['gt_masks'][0]
466
+ query_index = self.sem_seg_head.predictor.query_index
467
+ if self.interactive_mode in ['best', 'best_random']:
468
+ pos_masks = (batched_inputs[0]['spatial_query']['rand_shape'].to(self.device)).unbind(0)
469
+ pos_masks = ImageList.from_tensors(pos_masks, self.size_divisibility).tensor.unbind(0)
470
+
471
+ neg_masks = (batched_inputs[0]['spatial_query']['rand_shape'].to(self.device) & False).unbind(0)
472
+ neg_masks = ImageList.from_tensors(neg_masks, self.size_divisibility).tensor.unbind(0)
473
+ extra.update({'spatial_query_pos_mask': pos_masks, 'spatial_query_neg_mask': neg_masks})
474
+ elif self.interactive_mode == 'random':
475
+ pos_masks = (batched_inputs[0]['spatial_query']['rand_shape'].to(self.device)==1).unbind(0)
476
+ pos_masks = ImageList.from_tensors(pos_masks, self.size_divisibility).tensor
477
+
478
+ neg_masks = (batched_inputs[0]['spatial_query']['rand_shape'].to(self.device)==-1).unbind(0)
479
+ neg_masks = ImageList.from_tensors(neg_masks, self.size_divisibility).tensor
480
+ extra.update({'spatial_query_pos_mask': pos_masks[:,0:1].unbind(), 'spatial_query_neg_mask': neg_masks[:,0:1].unbind()})
481
+ else:
482
+ assert False, "invalid interactive mode"
483
+
484
+ for i in range(self.interactive_iter):
485
+ # v_pos_mask += [extra['spatial_query_pos_mask'][0][0][:image_sizes[0][0],:image_sizes[0][1]].float().cpu().numpy()]
486
+ # v_neg_mask += [extra['spatial_query_neg_mask'][0][0][:image_sizes[0][0],:image_sizes[0][1]].float().cpu().numpy()]
487
+ outputs = self.sem_seg_head.predictor(multi_scale_features, mask_features, target_queries=queries_grounding, extra=extra, task='spatial')
488
+ extra.update(outputs)
489
+ pred_smask = F.interpolate(outputs['prev_mask'], images.tensor.shape[-2:], mode='bilinear')
490
+ # v_pred_mask += [(pred_smask[0,0][:image_sizes[0][0],:image_sizes[0][1]].sigmoid() > 0.5).float().cpu().numpy()]
491
+
492
+ s = image_sizes[0]
493
+ b = batched_inputs[0]
494
+ pred_smask_all = F.interpolate(pred_smask[:,:,:s[0],:s[1]], (b['height'], b['width']), mode='bilinear')[:,0].sigmoid() > 0.5
495
+ gt_smask = b['gt_masks_orisize']
496
+ ious = get_iou(gt_smask, pred_smask_all)
497
+ all_batch_shape_iou += [ious]
498
+ if (ious > 0.9).sum() == len(ious):
499
+ all_batch_shape_iou += [ious for j in range(self.interactive_iter-i-1)]
500
+ break
501
+ if self.interactive_mode in ['best', 'best_random']:
502
+ extra.update(self.prepare_next_spaital_mask(extra, batched_inputs, mode=self.interactive_mode))
503
+ elif self.interactive_mode == 'random':
504
+ extra.update({'spatial_query_pos_mask': pos_masks[:,i+1:i+2].unbind(), 'spatial_query_neg_mask': neg_masks[:,i+1:i+2].unbind()})
505
+ else:
506
+ assert False, "invalid interactive mode"
507
+ all_batch_shape_iou = torch.stack(all_batch_shape_iou)
508
+ processed_results = [{"mask_iou": all_batch_shape_iou[:,i]} for i in range(len(all_batch_shape_iou[0]))]
509
+
510
+ return processed_results
511
+
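evaluate_interactive above stops refining once every mask reaches IoU > 0.9 against the ground truth; get_iou itself is imported from ..utils and is not part of this diff. A minimal per-mask IoU sketch, assuming boolean (N, H, W) tensors:

    import torch

    def iou_per_mask(gt: torch.Tensor, pred: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
        # gt, pred: bool tensors of shape (N, H, W); returns one IoU per mask.
        gt, pred = gt.flatten(1), pred.flatten(1)
        inter = (gt & pred).sum(-1).float()
        union = (gt | pred).sum(-1).float()
        return inter / (union + eps)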
512
+ def evaluate_interactive_single(self, batched_inputs, extra={}):
513
+ assert self.task_switch['spatial']
514
+ assert 'spatial_query' in batched_inputs[0]
515
+ assert len(batched_inputs) == 1, "only support batch size equal to 1"
516
+
517
+ images = [x["image"].to(self.device) for x in batched_inputs]
518
+ images = [(x - self.pixel_mean) / self.pixel_std for x in images]
519
+ images = ImageList.from_tensors(images, self.size_divisibility)
520
+ img_bs = images.tensor.shape[0]
521
+
522
+ targets = targets_grounding = queries_grounding = None
523
+
524
+ features = self.backbone(images.tensor)
525
+ mask_features, transformer_encoder_features, multi_scale_features = self.sem_seg_head.pixel_decoder.forward_features(features)
526
+
527
+ image_sizes = [x["image"].shape[-2:] for x in batched_inputs]
528
+ nm = len(batched_inputs[0]['spatial_query']['rand_shape'])
529
+ multi_scale_features = [m.repeat(nm,1,1,1) for m in multi_scale_features]
530
+ mask_features = mask_features.repeat(nm,1,1,1)
531
+
532
+ outputs = self.sem_seg_head.predictor(multi_scale_features, mask_features, target_queries=queries_grounding, extra=extra, task='spatial')
533
+ pred_smask = F.interpolate(outputs['prev_mask'], images.tensor.shape[-2:], mode='bicubic')
534
+
535
+ s = image_sizes[0]
536
+ b = batched_inputs[0]
537
+ pred_smask_ori = F.interpolate(pred_smask[:,:,:s[0],:s[1]], (b['height'], b['width']), mode='bicubic')[:,0].sigmoid() > 0.5
538
+ pred_smask_batch = pred_smask[:,:,:s[0],:s[1]].sigmoid() > 0.5
539
+ ious = []
540
+ if 'gt_masks_orisize' in b:
541
+ gt_smask = b['gt_masks_orisize'].to(pred_smask_ori.device)
542
+ ious = get_iou(gt_smask, pred_smask_ori)
543
+ processed_results = [{"mask_iou": ious, 'pred_mask_ori': pred_smask_ori, 'pred_mask_batch': pred_smask_batch}]
544
+ return processed_results
545
+
546
+ def evaluate_interactive_grounding(self, batched_inputs):
547
+ assert self.task_switch['spatial']
548
+ assert 'spatial_query' in batched_inputs[0]
549
+ assert len(batched_inputs) == 1, "only support batch size equal to 1"
550
+
551
+ images = [x["image"].to(self.device) for x in batched_inputs]
552
+ images = [(x - self.pixel_mean) / self.pixel_std for x in images]
553
+ images = ImageList.from_tensors(images, self.size_divisibility)
554
+ img_bs = images.tensor.shape[0]
555
+
556
+ targets = targets_grounding = queries_grounding = None
557
+ extra = {}
558
+
559
+ features = self.backbone(images.tensor)
560
+ mask_features, transformer_encoder_features, multi_scale_features = self.sem_seg_head.pixel_decoder.forward_features(features)
561
+
562
+ image_sizes = [x["image"].shape[-2:] for x in batched_inputs]
563
+ nm = len(batched_inputs[0]['spatial_query']['rand_shape'])
564
+ multi_scale_features = [m.repeat(nm,1,1,1) for m in multi_scale_features]
565
+ mask_features = mask_features.repeat(nm,1,1,1)
566
+
567
+ all_batch_shape_iou = []
568
+ pred_smask_pointer = None
569
+ prev_smask_pointer = None
570
+ pred_smask_all = None
571
+
572
+ # visualization code
573
+ # v_pred_mask = []
574
+ # v_pos_mask = []
575
+ # v_neg_mask = []
576
+ # v_gt_mask = batched_inputs[0]['spatial_query']['gt_masks'][0]
577
+ query_index = self.sem_seg_head.predictor.query_index
578
+ if self.interactive_mode in ['best', 'best_random']:
579
+ pos_masks = (batched_inputs[0]['spatial_query']['rand_shape'].to(self.device)).unbind(0)
580
+ pos_masks = ImageList.from_tensors(pos_masks, self.size_divisibility).tensor.unbind(0)
581
+
582
+ neg_masks = (batched_inputs[0]['spatial_query']['rand_shape'].to(self.device) & False).unbind(0)
583
+ neg_masks = ImageList.from_tensors(neg_masks, self.size_divisibility).tensor.unbind(0)
584
+ extra.update({'spatial_query_pos_mask': pos_masks, 'spatial_query_neg_mask': neg_masks})
585
+ elif self.interactive_mode == 'random':
586
+ pos_masks = (batched_inputs[0]['spatial_query']['rand_shape'].to(self.device)==1).unbind(0)
587
+ pos_masks = ImageList.from_tensors(pos_masks, self.size_divisibility).tensor
588
+
589
+ neg_masks = (batched_inputs[0]['spatial_query']['rand_shape'].to(self.device)==-1).unbind(0)
590
+ neg_masks = ImageList.from_tensors(neg_masks, self.size_divisibility).tensor
591
+ extra.update({'spatial_query_pos_mask': pos_masks[:,0:1].unbind(), 'spatial_query_neg_mask': neg_masks[:,0:1].unbind()})
592
+ else:
593
+ assert False, "invalid interactive mode"
594
+
595
+ grd_texts = batched_inputs[0]['classes']
596
+ gtext = self.sem_seg_head.predictor.lang_encoder.get_text_token_embeddings(grd_texts, name='grounding', token=False, norm=False)
597
+ token_emb = gtext['token_emb']
598
+ tokens = gtext['tokens']
599
+ query_emb = nn.utils.rnn.pad_sequence([_token_emb[_tokens.bool()] for _token_emb, _tokens in zip(token_emb, tokens['attention_mask'])], padding_value=-1)
600
+ non_zero_query_mask = (query_emb.sum(dim=-1) < 0)
601
+
602
+ extra['grounding_tokens'] = query_emb
603
+ extra['grounding_nonzero_mask'] = non_zero_query_mask.t()
604
+
605
+ for i in range(self.interactive_iter):
606
+ # v_pos_mask += [extra['spatial_query_pos_mask'][0][0][:image_sizes[0][0],:image_sizes[0][1]].float().cpu().numpy()]
607
+ # v_neg_mask += [extra['spatial_query_neg_mask'][0][0][:image_sizes[0][0],:image_sizes[0][1]].float().cpu().numpy()]
608
+ outputs = self.sem_seg_head.predictor(multi_scale_features, mask_features, target_queries=queries_grounding, extra=extra, task='spatial')
609
+ extra.update(outputs)
610
+ pred_smask = F.interpolate(outputs['prev_mask'], images.tensor.shape[-2:], mode='bilinear')
611
+ # v_pred_mask += [(pred_smask[0,0][:image_sizes[0][0],:image_sizes[0][1]].sigmoid() > 0.5).float().cpu().numpy()]
612
+
613
+ s = image_sizes[0]
614
+ b = batched_inputs[0]
615
+ pred_smask_all = F.interpolate(pred_smask[:,:,:s[0],:s[1]], (b['height'], b['width']), mode='bilinear')[:,0].sigmoid() > 0.5
616
+ gt_smask = b['gt_masks_orisize']
617
+ ious = get_iou(gt_smask, pred_smask_all)
618
+ all_batch_shape_iou += [ious]
619
+ if (ious > 0.9).sum() == len(ious):
620
+ all_batch_shape_iou += [ious for j in range(self.interactive_iter-i-1)]
621
+ break
622
+ if self.interactive_mode in ['best', 'best_random']:
623
+ extra.update(self.prepare_next_spaital_mask(extra, batched_inputs, mode=self.interactive_mode))
624
+ elif self.interactive_mode == 'random':
625
+ extra.update({'spatial_query_pos_mask': pos_masks[:,i+1:i+2].unbind(), 'spatial_query_neg_mask': neg_masks[:,i+1:i+2].unbind()})
626
+ else:
627
+ assert False, "invalid interactive mode"
628
+ all_batch_shape_iou = torch.stack(all_batch_shape_iou)
629
+ processed_results = [{"mask_iou": all_batch_shape_iou[:,i]} for i in range(len(all_batch_shape_iou[0]))]
630
+
631
+ # visualization
632
+ # VL.step()
633
+ # import cv2
634
+ # v_masks = []
635
+ # v_pos_masks = []
636
+ # v_neg_masks = []
637
+ # txt = []
638
+
639
+ # img = batched_inputs[0]['image'].permute(1,2,0).cpu().numpy()
640
+ # mask_img = VL.overlay_single_mask_to_image(img[:,:,::-1], v_gt_mask.cpu().float().numpy())
641
+ # acc_pos_mask = np.zeros(v_pos_mask[0].shape)
642
+ # acc_neg_mask = np.zeros(v_neg_mask[0].shape)
643
+ # for x,y,z,iou in zip(v_pos_mask, v_neg_mask, v_pred_mask, all_batch_shape_iou):
644
+ # # dilate x,y
645
+ # x = cv2.dilate(x, np.ones((5,5), np.uint8), iterations=3)
646
+ # y = cv2.dilate(y, np.ones((5,5), np.uint8), iterations=3)
647
+ # acc_pos_mask += x
648
+ # acc_neg_mask += y
649
+
650
+ # v_masks += [z]
651
+ # v_pos_masks += [acc_pos_mask.clip(0,1)]
652
+ # v_neg_masks += [acc_neg_mask.clip(0,1)]
653
+ # txt += ["pred_{}".format(str(iou[0].item())[0:5])]
654
+
655
+ # VL.add_image(img[:,:,::-1])
656
+ # VL.insert(mask_img, "gt_mask")
657
+ # VL.overlay_obj_mask_to_image_withposneg(img[:,:,::-1], v_masks, v_pos_masks, v_neg_masks, txt, max_len=20)
658
+ return processed_results
659
+
660
+ def evaluate_referring_image(self, batched_inputs, extra={}):
661
+ assert self.task_switch['spatial']
662
+ assert len(batched_inputs) == 1, "only support batch size equal to 1"
663
+ assert self.interactive_mode == 'best'
664
+
665
+ images = [x["image"].to(self.device) for x in batched_inputs]
666
+ images = [(x - self.pixel_mean) / self.pixel_std for x in images]
667
+ images = ImageList.from_tensors(images, self.size_divisibility)
668
+ img_bs = images.tensor.shape[0]
669
+
670
+ targets = targets_grounding = queries_grounding = None
671
+ features = self.backbone(images.tensor)
672
+ mask_features, transformer_encoder_features, multi_scale_features = self.sem_seg_head.pixel_decoder.forward_features(features)
673
+
674
+ if 'spatial_query' in batched_inputs[0]:
675
+ image_sizes = [x["image"].shape[-2:] for x in batched_inputs]
676
+ nm = len(batched_inputs[0]['spatial_query']['rand_shape'])
677
+ multi_scale_features = [m.repeat(nm,1,1,1) for m in multi_scale_features]
678
+ mask_features = mask_features.repeat(nm,1,1,1)
679
+
680
+ query_index = self.sem_seg_head.predictor.query_index
681
+ pos_masks = (batched_inputs[0]['spatial_query']['rand_shape'].to(self.device)).unbind(0)
682
+ pos_masks = ImageList.from_tensors(pos_masks, self.size_divisibility).tensor.unbind(0)
683
+
684
+ neg_masks = (batched_inputs[0]['spatial_query']['rand_shape'].to(self.device) & False).unbind(0)
685
+ neg_masks = ImageList.from_tensors(neg_masks, self.size_divisibility).tensor.unbind(0)
686
+ extra.update({'spatial_query_pos_mask': pos_masks, 'spatial_query_neg_mask': neg_masks})
687
+
688
+ outputs = self.sem_seg_head.predictor(multi_scale_features, mask_features, target_queries=queries_grounding, extra=extra, task='refimg')
689
+ return outputs, images.tensor.shape
690
+
691
+ def evaluate_grounding(self, batched_inputs, mode):
692
+ images = [x["image"].to(self.device) for x in batched_inputs]
693
+ images = [(x - self.pixel_mean) / self.pixel_std for x in images]
694
+ images = ImageList.from_tensors(images, self.size_divisibility)
695
+ assert len(images.tensor) == 1, "grounding evaluation only support single batch size now"
696
+
697
+ extra = {}
698
+ # mask_pred_results = []
699
+ # for idx, batch_per_image in enumerate(batched_inputs):
700
+ # grd_texts = batch_per_image['groundings']['texts']
701
+ # grd_masks = []
702
+ # for anno_text in grd_texts:
703
+ # gtext = self.sem_seg_head.predictor.lang_encoder.get_text_token_embeddings([anno_text[0]], name='grounding', token=False, norm=False)
704
+ # token_emb = gtext['token_emb']
705
+ # tokens = gtext['tokens']
706
+
707
+ # grd_emb = token_emb[0][tokens['attention_mask'].bool()[0]]
708
+ # extra['grounding_tokens'] = grd_emb[:,None]
709
+
710
+ # assert len(images.tensor) == 1, "grounding evaluation only support single batch size now"
711
+ # features = self.backbone(images.tensor)
712
+ # outputs = self.sem_seg_head(features, extra=extra, task='grounding_eval')
713
+
714
+ # pred_gmasks = outputs['pred_masks'][idx,self.num_queries:2*self.num_queries-1]
715
+ # v_emb = outputs['pred_captions'][idx,self.num_queries:2*self.num_queries-1]
716
+ # t_emb = grd_emb[-1:]
717
+
718
+ # t_emb = t_emb / (t_emb.norm(dim=-1, keepdim=True) + 1e-7)
719
+ # v_emb = v_emb / (v_emb.norm(dim=-1, keepdim=True) + 1e-7)
720
+
721
+ # temperature = self.sem_seg_head.predictor.lang_encoder.logit_scale
722
+ # out_prob = vl_similarity(v_emb, t_emb, temperature=temperature)
723
+
724
+ # matched_id = out_prob.max(0)[1]
725
+ # grd_masks += [pred_gmasks[matched_id,:,:]]
726
+ # mask_pred_results += [torch.cat(grd_masks)]
727
+
728
+ # comment for multi object inference.
729
+ mask_pred_results = []
730
+ for idx, batch_per_image in enumerate(batched_inputs):
731
+ grd_texts = batch_per_image['groundings']['texts']
732
+ grd_texts = [x[0] for x in grd_texts]
733
+
734
+ gtext = self.sem_seg_head.predictor.lang_encoder.get_text_token_embeddings(grd_texts, name='grounding', token=False, norm=False)
735
+ token_emb = gtext['token_emb']
736
+ tokens = gtext['tokens']
737
+ query_emb = token_emb[tokens['attention_mask'].bool()]
738
+ non_zero_query_mask = torch.zeros(query_emb[:,None].shape[:-1], dtype=torch.bool, device=query_emb.device)
739
+
740
+ extra['grounding_tokens'] = query_emb[:,None]
741
+ extra['grounding_nonzero_mask'] = non_zero_query_mask.t()
742
+
743
+ features = self.backbone(images.tensor)
744
+ outputs = self.sem_seg_head(features, extra=extra, task='grounding_eval')
745
+
746
+ pred_gmasks = outputs['pred_gmasks'][idx]
747
+ v_emb = outputs['pred_gtexts'][idx]
748
+ t_emb = gtext['class_emb']
749
+
750
+ t_emb = t_emb / (t_emb.norm(dim=-1, keepdim=True) + 1e-7)
751
+ v_emb = v_emb / (v_emb.norm(dim=-1, keepdim=True) + 1e-7)
752
+
753
+ temperature = self.sem_seg_head.predictor.lang_encoder.logit_scale
754
+ out_prob = vl_similarity(v_emb, t_emb, temperature=temperature)
755
+
756
+ matched_id = out_prob.max(0)[1]
757
+ mask_pred_results += [pred_gmasks[matched_id,:,:]]
758
+
759
+ for i in range(len(mask_pred_results)):
760
+ # upsample masks
761
+ mask_pred_results[i] = F.interpolate(
762
+ mask_pred_results[i][None,],
763
+ size=(images.tensor.shape[-2], images.tensor.shape[-1]),
764
+ mode="bilinear",
765
+ align_corners=False,
766
+ )[0]
767
+
768
+ processed_results = []
769
+ for mask_pred_result, input_per_image, image_size in zip(
770
+ mask_pred_results, batched_inputs, images.image_sizes
771
+ ):
772
+ height = input_per_image.get("height", image_size[0])
773
+ width = input_per_image.get("width", image_size[1])
774
+ processed_results.append({})
775
+
776
+ mask_pred_result = retry_if_cuda_oom(sem_seg_postprocess)(
777
+ mask_pred_result, image_size, height, width
778
+ )
779
+ processed_results[-1]['grounding_mask'] = mask_pred_result
780
+
781
+ # compute bbox
782
+ # bbox = BitMasks(mask_pred_result > 0).get_bounding_boxes()
783
+ # bbox = BoxMode.convert(bbox.tensor, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
784
+ # processed_results[-1]['grounding_box'] = bbox
785
+
786
+ return processed_results
787
+
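In evaluate_grounding above, each referring phrase is matched to the grounding query whose visual embedding is most similar to the phrase's class embedding, and that query's mask is returned. vl_similarity is defined in modeling/language/loss.py and is not shown here; the sketch below is an assumption of the matching step as a temperature-scaled cosine similarity, with an illustrative temperature value.

    import torch

    def match_queries_to_texts(v_emb, t_emb, temperature=100.0):
        # v_emb: (Q, D) visual embeddings of grounding queries
        # t_emb: (T, D) text embeddings, one per phrase
        v = v_emb / (v_emb.norm(dim=-1, keepdim=True) + 1e-7)
        t = t_emb / (t_emb.norm(dim=-1, keepdim=True) + 1e-7)
        logits = temperature * v @ t.t()         # (Q, T) scaled cosine similarity
        return logits.max(dim=0).indices         # best query index for each phrase

    # e.g. matched_id = match_queries_to_texts(v_emb, t_emb); pred_gmasks[matched_id]
    # then yields one mask per phrase.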
788
+ def evaluate_grounding_sptial(self, batched_inputs, mode):
789
+ images = [x["image"].to(self.device) for x in batched_inputs]
790
+ images = [(x - self.pixel_mean) / self.pixel_std for x in images]
791
+ images = ImageList.from_tensors(images, self.size_divisibility)
792
+ assert len(images.tensor) == 1, "grounding evaluation only support single batch size now"
793
+
794
+ extra = {}
795
+ dilation = 3
796
+ pos_masks = (batched_inputs[0]['spatial_query']['rand_shape'].to(self.device)).unbind(0)
797
+ pos_masks = ImageList.from_tensors(pos_masks, self.size_divisibility).tensor
798
+ pos_masks = (F.conv2d(pos_masks.float(), self.dilation_kernel, padding=dilation//2) > 0).unbind(0)
799
+
800
+ neg_masks = (batched_inputs[0]['spatial_query']['rand_shape'].to(self.device) & False).unbind(0)
801
+ neg_masks = ImageList.from_tensors(neg_masks, self.size_divisibility).tensor.unbind(0)
802
+
803
+ mask_pred_results = []
804
+ for idx, batch_per_image in enumerate(batched_inputs):
805
+ grd_texts = batch_per_image['groundings']['texts']
806
+ grd_masks = []
807
+ for idx2, anno_text in enumerate(grd_texts):
808
+ extra.update({'spatial_query_pos_mask': [pos_masks[idx2]], 'spatial_query_neg_mask': [neg_masks[idx2]]})
809
+
810
+ gtext = self.sem_seg_head.predictor.lang_encoder.get_text_token_embeddings([anno_text[0]], name='grounding', token=False, norm=False)
811
+ token_emb = gtext['token_emb']
812
+ tokens = gtext['tokens']
813
+
814
+ grd_emb = token_emb[0][tokens['attention_mask'].bool()[0]]
815
+ non_zero_query_mask = torch.zeros(grd_emb[:,None].shape[:-1], dtype=torch.bool, device=grd_emb.device)
816
+ extra['grounding_tokens'] = grd_emb[:,None]
817
+ extra['grounding_nonzero_mask'] = non_zero_query_mask.t()
818
+
819
+ assert len(images.tensor) == 1, "grounding evaluation only support single batch size now"
820
+ features = self.backbone(images.tensor)
821
+ outputs = self.sem_seg_head(features, extra=extra, task='grounding_eval')
822
+
823
+ pred_gmasks = outputs['pred_gmasks'][idx]
824
+ v_emb = outputs['pred_gtexts'][idx]
825
+ t_emb = gtext['class_emb']
826
+
827
+ t_emb = t_emb / (t_emb.norm(dim=-1, keepdim=True) + 1e-7)
828
+ v_emb = v_emb / (v_emb.norm(dim=-1, keepdim=True) + 1e-7)
829
+
830
+ temperature = self.sem_seg_head.predictor.lang_encoder.logit_scale
831
+ out_prob = vl_similarity(v_emb, t_emb, temperature=temperature)
832
+
833
+ matched_id = out_prob.max(0)[1]
834
+ grd_masks += [pred_gmasks[matched_id,:,:]]
835
+ # grd_masks += [outputs['prev_mask'][0]]
836
+
837
+ mask_pred_results += [torch.cat(grd_masks)]
838
+
839
+ # comment for multi object inference.
840
+ # mask_pred_results = []
841
+ # for idx, batch_per_image in enumerate(batched_inputs):
842
+ # grd_texts = batch_per_image['groundings']['texts']
843
+ # grd_texts = [x[0] for x in grd_texts]
844
+
845
+ # gtext = self.sem_seg_head.predictor.lang_encoder.get_text_token_embeddings(grd_texts, name='grounding', token=False, norm=False)
846
+ # token_emb = gtext['token_emb']
847
+ # tokens = gtext['tokens']
848
+ # query_emb = token_emb[tokens['attention_mask'].bool()]
849
+ # non_zero_query_mask = torch.zeros(query_emb[:,None].shape[:-1], dtype=torch.bool, device=query_emb.device)
850
+
851
+ # extra['grounding_tokens'] = query_emb[:,None]
852
+ # extra['grounding_nonzero_mask'] = non_zero_query_mask.t()
853
+
854
+ # features = self.backbone(images.tensor)
855
+ # outputs = self.sem_seg_head(features, extra=extra, task='grounding_eval')
856
+
857
+ # pred_gmasks = outputs['pred_gmasks'][idx]
858
+ # v_emb = outputs['pred_gtexts'][idx]
859
+ # t_emb = gtext['class_emb']
860
+
861
+ # t_emb = t_emb / (t_emb.norm(dim=-1, keepdim=True) + 1e-7)
862
+ # v_emb = v_emb / (v_emb.norm(dim=-1, keepdim=True) + 1e-7)
863
+
864
+ # temperature = self.sem_seg_head.predictor.lang_encoder.logit_scale
865
+ # out_prob = vl_similarity(v_emb, t_emb, temperature=temperature)
866
+
867
+ # matched_id = out_prob.max(0)[1]
868
+ # mask_pred_results += [pred_gmasks[matched_id,:,:]]
869
+
870
+ for i in range(len(mask_pred_results)):
871
+ # upsample masks
872
+ mask_pred_results[i] = F.interpolate(
873
+ mask_pred_results[i][None,],
874
+ size=(images.tensor.shape[-2], images.tensor.shape[-1]),
875
+ mode="bilinear",
876
+ align_corners=False,
877
+ )[0]
878
+
879
+ processed_results = []
880
+ for mask_pred_result, input_per_image, image_size in zip(
881
+ mask_pred_results, batched_inputs, images.image_sizes
882
+ ):
883
+ height = input_per_image.get("height", image_size[0])
884
+ width = input_per_image.get("width", image_size[1])
885
+ processed_results.append({})
886
+
887
+ mask_pred_result = retry_if_cuda_oom(sem_seg_postprocess)(
888
+ mask_pred_result, image_size, height, width
889
+ )
890
+ processed_results[-1]['grounding_mask'] = mask_pred_result
891
+
892
+ return processed_results
893
+
894
+ def prepare_targets(self, batched_inputs, images):
895
+ h_pad, w_pad = images.tensor.shape[-2:]
896
+ new_targets = []
897
+ for idx, batch_per_image in enumerate(batched_inputs):
898
+ targets_per_image = batch_per_image['instances'].to(self.device)
899
+ # pad gt
900
+ gt_masks = targets_per_image.gt_masks.tensor
901
+ padded_masks = torch.zeros((gt_masks.shape[0], h_pad, w_pad), dtype=gt_masks.dtype, device=gt_masks.device)
902
+ padded_masks[:, : gt_masks.shape[1], : gt_masks.shape[2]] = gt_masks
903
+
904
+ gt_boxes = targets_per_image.gt_boxes.tensor
905
+ ratio = torch.tensor([w_pad,h_pad,w_pad,h_pad]).to(gt_boxes.device)[None,:]
906
+ gt_boxes = gt_boxes / ratio
907
+ xc,yc,w,h = (gt_boxes[:,0] + gt_boxes[:,2])/2, (gt_boxes[:,1] + gt_boxes[:,3])/2, gt_boxes[:,2] - gt_boxes[:,0], gt_boxes[:,3] - gt_boxes[:,1]
908
+ gt_boxes = torch.stack([xc,yc,w,h]).permute(1,0)
909
+
910
+ target_dict = {
911
+ "labels": targets_per_image.gt_classes,
912
+ "is_things": targets_per_image.is_things,
913
+ "masks": padded_masks,
914
+ "boxes": gt_boxes,
915
+ }
916
+
917
+ if self.task_switch['spatial']:
918
+ # prepare targets for spatial query
919
+ target_dict['gt_spatial_masks'] = batch_per_image['spatial_query']['gt_masks']
920
+
921
+ if self.task_switch['grounding']:
922
+ grd_masks = batch_per_image['groundings']['masks']
923
+ grd_texts = batch_per_image['groundings']['texts']
924
+ grd_hash = batch_per_image['groundings']['hash']
925
+ grd_task = batch_per_image['groundings']['mode']
926
+
927
+ if len(grd_masks) == 0:
928
+ padded_masks = None
929
+ else:
930
+ padded_masks = torch.zeros((grd_masks.shape[0], h_pad, w_pad), dtype=grd_masks.dtype, device=grd_masks.device)
931
+ padded_masks[:, : grd_masks.shape[1], : grd_masks.shape[2]] = grd_masks
932
+
933
+ gtext = self.sem_seg_head.predictor.lang_encoder.get_text_token_embeddings(grd_texts, name='grounding', token=False, norm=False)
934
+ token_emb = gtext['token_emb']
935
+ tokens = gtext['tokens']
936
+
937
+ unique_hash_id = np.unique(grd_hash, return_index=True)[1]
938
+ selected_mask = np.zeros(len(grd_hash)).astype(bool)  # np.bool was removed in NumPy >= 1.24
939
+ selected_mask[unique_hash_id] = True
940
+
941
+ selected_token_emb = token_emb[selected_mask]
942
+ selected_attn_mask = tokens['attention_mask'][selected_mask]
943
+ query_emb = selected_token_emb[selected_attn_mask.bool()]
944
+
945
+ class_idx = tokens['attention_mask'].sum(dim=-1) - 1
946
+ class_idx = torch.stack((torch.arange(len(class_idx), device=class_idx.device), class_idx)).tolist()
947
+ class_emb = token_emb[class_idx]
948
+
949
+ target_dict['grounding_masks'] = padded_masks
950
+ target_dict['grounding_query_embs'] = query_emb
951
+ target_dict['grounding_class_embs'] = class_emb
952
+ target_dict['grounding_hash'] = grd_hash
953
+ target_dict['grounding_task'] = grd_task
954
+
955
+ new_targets.append(target_dict)
956
+ return new_targets
957
+
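prepare_targets above also converts absolute XYXY ground-truth boxes into (cx, cy, w, h) coordinates normalized by the padded image size, which is the format the box losses expect. A tiny worked example with an illustrative padded size and box:

    import torch

    h_pad, w_pad = 800, 1216                                   # illustrative padded size
    boxes = torch.tensor([[100., 200., 300., 600.]])           # absolute XYXY
    boxes = boxes / torch.tensor([w_pad, h_pad, w_pad, h_pad]) # normalize to [0, 1]
    xc = (boxes[:, 0] + boxes[:, 2]) / 2
    yc = (boxes[:, 1] + boxes[:, 3]) / 2
    w = boxes[:, 2] - boxes[:, 0]
    h = boxes[:, 3] - boxes[:, 1]
    print(torch.stack([xc, yc, w, h]).permute(1, 0))           # tensor([[0.1645, 0.5000, 0.1645, 0.5000]])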
958
+ def prepare_next_spaital_mask(self, outputs, batched_inputs, mode='best'):
959
+ gt_masks = [batched_inputs[i]['spatial_query']['gt_masks'] for i in range(len(batched_inputs))]
960
+ if self.training:
961
+ gt_masks = ImageList.from_tensors(gt_masks, self.size_divisibility).tensor
962
+ else:
963
+ gt_masks = ImageList.from_tensors(gt_masks, self.size_divisibility).tensor.transpose(0,1)
964
+
965
+ pred_masks = (F.interpolate(outputs['prev_mask'], size=gt_masks.shape[-2:], mode='bilinear', align_corners=False).sigmoid() > 0.5)
966
+ prev_masks = torch.stack(outputs['spatial_query_pos_mask']) | torch.stack(outputs['spatial_query_neg_mask'])
967
+
968
+ fn = gt_masks & (~(gt_masks & pred_masks)) & (~prev_masks) # fn: False Negative, gt:1, pred:0, prev:0
969
+ fp = (~gt_masks & pred_masks) & (~prev_masks) # fp: False Positive, gt:0, pred:1, prev:0
970
+
971
+ # compute iou between gt and pred
972
+ iou = (gt_masks & pred_masks).sum(list(range(1,len(fn.shape)))) / ((gt_masks | pred_masks).sum(dim=list(range(1,len(fn.shape)))) + 1e-8)
973
+ fn_sum = fn.sum(dim=list(range(1,len(fn.shape))))
974
+ fp_sum = fp.sum(dim=list(range(1,len(fp.shape))))
975
+
976
+ is_postive = fn_sum > fp_sum
977
+ # is_postive = torch.ones(len(fn_sum), device=torch.cuda.current_device()).bool()
978
+ select_mask = torch.stack([fn[i] if is_postive[i] else fp[i] for i in range(len(fn))])
979
+
980
+ # conv implementation
981
+ n,_,h,w = select_mask.shape
982
+ mask_dt = (distance_transform((~F.pad(select_mask, pad=(1, 1, 1, 1), mode='constant', value=0)).float())[:,:,1:-1,1:-1]).reshape(n,-1)
983
+ if mode == 'best':
984
+ max_xy_idx = torch.stack([torch.arange(n), mask_dt.max(dim=-1)[1].cpu()]).tolist()
985
+ elif mode == 'best_random':
986
+ max_xy_idx = torch.stack([torch.arange(n), torch.cat([(mask_dt[i] > 0).nonzero()[torch.randint(0, len((mask_dt[i] > 0).nonzero()), (1,))][0] for i in range(len(mask_dt))]).cpu()]).tolist()
987
+ next_mask = torch.zeros(gt_masks.shape, device=torch.cuda.current_device()).bool()
988
+ next_mask = next_mask.view(n,-1)
989
+ next_mask[max_xy_idx] = True
990
+ next_mask = next_mask.reshape((n,1,h,w)).float()
991
+ dilation = 3
992
+ next_mask = F.conv2d(next_mask, self.dilation_kernel, padding=dilation//2) > 0
993
+
994
+ # determine whether next mask is zero
995
+ keep = (iou < 0.925)
996
+ next_mask = next_mask & keep.view(-1,1,1,1)
997
+
998
+ pos_mask = []
999
+ neg_mask = []
1000
+ for idx, ip in enumerate(is_postive):
1001
+ if ip:
1002
+ pos_mask += [outputs['spatial_query_pos_mask'][idx] | next_mask[idx]]
1003
+ neg_mask += [outputs['spatial_query_neg_mask'][idx]]
1004
+ else:
1005
+ pos_mask += [outputs['spatial_query_pos_mask'][idx]]
1006
+ neg_mask += [outputs['spatial_query_neg_mask'][idx] | next_mask[idx]]
1007
+
1008
+ if 'false_positive_mask' in outputs:
1009
+ fp = outputs['false_positive_mask'] | fp
1010
+ return {'spatial_query_pos_mask': pos_mask, 'spatial_query_neg_mask': neg_mask, 'false_positive_mask': fp}
1011
+
1012
+ def semantic_inference(self, mask_cls, mask_pred):
1013
+ mask_cls = F.softmax(mask_cls, dim=-1)[..., :-1]
1014
+ mask_pred = mask_pred.sigmoid()
1015
+ semseg = torch.einsum("qc,qhw->chw", mask_cls, mask_pred)
1016
+ return semseg
1017
+
1018
+ def panoptic_inference(self, mask_cls, mask_pred):
1019
+ scores, labels = F.softmax(mask_cls, dim=-1).max(-1)
1020
+ mask_pred = mask_pred.sigmoid()
1021
+
1022
+ keep = labels.ne(self.sem_seg_head.num_classes) & (scores > self.object_mask_threshold)
1023
+ cur_scores = scores[keep]
1024
+ cur_classes = labels[keep]
1025
+ cur_masks = mask_pred[keep]
1026
+ cur_mask_cls = mask_cls[keep]
1027
+ cur_mask_cls = cur_mask_cls[:, :-1]
1028
+
1029
+ cur_prob_masks = cur_scores.view(-1, 1, 1) * cur_masks
1030
+
1031
+ h, w = cur_masks.shape[-2:]
1032
+ panoptic_seg = torch.zeros((h, w), dtype=torch.int32, device=cur_masks.device)
1033
+ segments_info = []
1034
+
1035
+ current_segment_id = 0
1036
+
1037
+ if cur_masks.shape[0] == 0:
1038
+ # We didn't detect any mask :(
1039
+ return panoptic_seg, segments_info
1040
+ else:
1041
+ # take argmax
1042
+ cur_mask_ids = cur_prob_masks.argmax(0)
1043
+ stuff_memory_list = {}
1044
+ for k in range(cur_classes.shape[0]):
1045
+ pred_class = cur_classes[k].item()
1046
+ isthing = pred_class in self.metadata.thing_dataset_id_to_contiguous_id.values()
1047
+ mask_area = (cur_mask_ids == k).sum().item()
1048
+ original_area = (cur_masks[k] >= 0.5).sum().item()
1049
+ mask = (cur_mask_ids == k) & (cur_masks[k] >= 0.5)
1050
+
1051
+ if mask_area > 0 and original_area > 0 and mask.sum().item() > 0:
1052
+ if mask_area / original_area < self.overlap_threshold:
1053
+ continue
1054
+
1055
+ # merge stuff regions
1056
+ if not isthing:
1057
+ if int(pred_class) in stuff_memory_list.keys():
1058
+ panoptic_seg[mask] = stuff_memory_list[int(pred_class)]
1059
+ continue
1060
+ else:
1061
+ stuff_memory_list[int(pred_class)] = current_segment_id + 1
1062
+
1063
+ current_segment_id += 1
1064
+ panoptic_seg[mask] = current_segment_id
1065
+
1066
+ segments_info.append(
1067
+ {
1068
+ "id": current_segment_id,
1069
+ "isthing": bool(isthing),
1070
+ "category_id": int(pred_class),
1071
+ }
1072
+ )
1073
+
1074
+ return panoptic_seg, segments_info
1075
+
1076
+ def instance_inference(self, mask_cls, mask_pred, box_pred):
1077
+ # mask_pred is already processed to have the same shape as original input
1078
+ image_size = mask_pred.shape[-2:]
1079
+
1080
+ # [Q, K]
1081
+ scores = F.softmax(mask_cls, dim=-1)[:, :-1]
1082
+ labels = torch.arange(self.sem_seg_head.num_classes, device=self.device).unsqueeze(0).repeat(self.num_queries, 1).flatten(0, 1)
1083
+ # scores_per_image, topk_indices = scores.flatten(0, 1).topk(self.num_queries, sorted=False)
1084
+ scores_per_image, topk_indices = scores.flatten(0, 1).topk(self.test_topk_per_image, sorted=False)
1085
+
1086
+ labels_per_image = labels[topk_indices]
1087
+ topk_indices = (topk_indices // self.sem_seg_head.num_classes)
1088
+ # mask_pred = mask_pred.unsqueeze(1).repeat(1, self.sem_seg_head.num_classes, 1).flatten(0, 1)
1089
+ mask_pred = mask_pred[topk_indices]
1090
+ if box_pred is not None:
1091
+ box_pred = box_pred[topk_indices]
1092
+
1093
+ # if this is panoptic segmentation, we only keep the "thing" classes
1094
+ if self.panoptic_on:
1095
+ keep = torch.zeros_like(scores_per_image).bool()
1096
+ for i, lab in enumerate(labels_per_image):
1097
+ keep[i] = lab in self.metadata.thing_dataset_id_to_contiguous_id.values()
1098
+
1099
+ scores_per_image = scores_per_image[keep]
1100
+ labels_per_image = labels_per_image[keep]
1101
+ mask_pred = mask_pred[keep]
1102
+
1103
+ if box_pred is not None:
1104
+ box_pred = box_pred[keep]
1105
+
1106
+ result = Instances(image_size)
1107
+ # mask (before sigmoid)
1108
+ result.pred_masks = (mask_pred > 0).float()
1109
+ # result.pred_boxes = Boxes(torch.zeros(mask_pred.size(0), 4))
1110
+ # Uncomment the following to get boxes from masks (this is slow)
1111
+
1112
+ if box_pred is not None:
1113
+ result.pred_boxes = BitMasks(mask_pred > 0).get_bounding_boxes()
1114
+ else:
1115
+ result.pred_boxes = Boxes(torch.zeros(mask_pred.size(0), 4))
1116
+
1117
+ # calculate average mask prob
1118
+ mask_scores_per_image = (mask_pred.sigmoid().flatten(1) * result.pred_masks.flatten(1)).sum(1) / (result.pred_masks.flatten(1).sum(1) + 1e-6)
1119
+ result.scores = scores_per_image * mask_scores_per_image
1120
+ result.pred_classes = labels_per_image
1121
+
1122
+ return result
1123
+
1124
+ def prepare_targets4query(self, targets, images, topk=5):
1125
+ h_pad, w_pad = images.tensor.shape[-2:]
1126
+ new_targets = []
1127
+ new_queries = []
1128
+ for targets_per_image in targets:
1129
+ # we randomly sample at most topk concepts
1130
+ unique_target_classes = [k for k in set(targets_per_image.gt_classes.tolist())]
1131
+ selected_target_classes = random.sample(unique_target_classes, min(topk, len(unique_target_classes)))
1132
+ new_targets_per_image = []
1133
+ new_queries_per_image = []
1134
+ for clss in selected_target_classes:
1135
+ indices = (targets_per_image.gt_classes == clss).nonzero().view(-1)
1136
+ # pad gt
1137
+ gt_masks = targets_per_image.gt_masks[indices]
1138
+ padded_masks = torch.zeros((gt_masks.shape[0], h_pad, w_pad), dtype=gt_masks.dtype, device=gt_masks.device)
1139
+ padded_masks[:, : gt_masks.shape[1], : gt_masks.shape[2]] = gt_masks
1140
+
1141
+ # convert class into concept name and then token seq
1142
+ self.sem_seg_head.predictor.lang_encoder.get_text_embeddings([COCO_PANOPTIC_CLASSES[clss]], name='grounding')
1143
+ query = getattr(self.sem_seg_head.predictor.lang_encoder, 'grounding_text_embeddings')
1144
+
1145
+ new_targets.append(
1146
+ {
1147
+ "labels": targets_per_image.gt_classes[indices],
1148
+ "masks": padded_masks,
1149
+ }
1150
+ )
1151
+ new_queries_per_image.append(query)
1152
+ new_queries.append(new_queries_per_image)
1153
+
1154
+ return new_targets, new_queries
1155
+
1156
+
1157
+
1158
+ @register_model
1159
+ def get_seem_model(cfg, **kwargs):
1160
+ return GeneralizedSEEM(cfg)
modeling/architectures/seem_model_v1.py ADDED
@@ -0,0 +1,1179 @@
1
+ # --------------------------------------------------------
2
+ # SEEM -- Segment Everything Everywhere All at Once
3
+ # Licensed under The Apache License 2.0 [see LICENSE for details]
4
+ # Written by Xueyan Zou ([email protected])
5
+ # --------------------------------------------------------
6
+
7
+ import random
8
+ from typing import Tuple
9
+
10
+ import numpy as np
11
+ import torch
12
+ from torch import nn
13
+ from torch.nn import functional as F
14
+ from kornia.contrib import distance_transform
15
+
16
+ from detectron2.structures import Boxes, ImageList, Instances, BitMasks
17
+ from detectron2.utils.memory import retry_if_cuda_oom
18
+ from detectron2.data import MetadataCatalog
19
+
20
+ from .build import register_model
21
+
22
+ from ..utils import configurable, get_class_names, get_iou, Spatial_ImageList
23
+ from ..vision.backbone import build_backbone, Backbone
24
+ from ..body import build_xdecoder_head
25
+ from ..modules import sem_seg_postprocess, SetCriterion, HungarianMatcher, bbox_postprocess
26
+ from ..language import build_language_encoder
27
+ from ..language.loss import vl_similarity
28
+ from utilities.prompt_engineering import prompt_engineering
29
+ from utilities.constants import COCO_PANOPTIC_CLASSES, BIOMED_CLASSES
30
+
31
+
32
+ class GeneralizedSEEM(nn.Module):
33
+
34
+ @configurable
35
+ def __init__(
36
+ self,
37
+ *,
38
+ backbone: Backbone,
39
+ sem_seg_head: nn.Module,
40
+ criterion: nn.Module,
41
+ losses: dict,
42
+ num_queries: int,
43
+ object_mask_threshold: float,
44
+ overlap_threshold: float,
45
+ metadata,
46
+ task_switch: dict,
47
+ phrase_prob: float,
48
+ size_divisibility: int,
49
+ sem_seg_postprocess_before_inference: bool,
50
+ pixel_mean: Tuple[float],
51
+ pixel_std: Tuple[float],
52
+ # inference
53
+ semantic_on: bool,
54
+ panoptic_on: bool,
55
+ instance_on: bool,
56
+ test_topk_per_image: int,
57
+ train_dataset_name: str,
58
+ interactive_mode: str,
59
+ interactive_iter: str,
60
+ dilation_kernel: torch.Tensor,
61
+ train_max_iter: int,
62
+ binary_classes: bool,
63
+ standard_text_for_eval: bool,
64
+ ):
65
+ """
66
+ Args:
67
+ backbone: a backbone module, must follow detectron2's backbone interface
68
+ sem_seg_head: a module that predicts semantic segmentation from backbone features
69
+ criterion: a module that defines the loss
70
+ num_queries: int, number of queries
71
+ object_mask_threshold: float, threshold to filter query based on classification score
72
+ for panoptic segmentation inference
73
+ overlap_threshold: overlap threshold used in general inference for panoptic segmentation
74
+ metadata: dataset meta, get `thing` and `stuff` category names for panoptic
75
+ segmentation inference
76
+ size_divisibility: Some backbones require the input height and width to be divisible by a
77
+ specific integer. We can use this to override such a requirement.
78
+ sem_seg_postprocess_before_inference: whether to resize the prediction back
79
+ to original input size before semantic segmentation inference or after.
80
+ For high-resolution datasets like Mapillary, resizing predictions before
81
+ inference will cause OOM error.
82
+ pixel_mean, pixel_std: list or tuple with #channels elements, representing
83
+ the per-channel mean and std to be used to normalize the input image
84
+ semantic_on: bool, whether to output semantic segmentation prediction
85
+ instance_on: bool, whether to output instance segmentation prediction
86
+ panoptic_on: bool, whether to output panoptic segmentation prediction
87
+ test_topk_per_image: int, instance segmentation parameter, keep topk instances per image
88
+ """
89
+ super().__init__()
90
+ self.backbone = backbone
91
+ self.sem_seg_head = sem_seg_head
92
+ self.criterion = criterion
93
+ self.losses = losses
94
+ self.num_queries = num_queries
95
+ self.overlap_threshold = overlap_threshold
96
+ self.object_mask_threshold = object_mask_threshold
97
+ self.metadata = metadata
98
+ if size_divisibility < 0:
99
+ # use backbone size_divisibility if not set
100
+ size_divisibility = self.backbone.size_divisibility
101
+ self.size_divisibility = size_divisibility
102
+ self.sem_seg_postprocess_before_inference = sem_seg_postprocess_before_inference
103
+ self.register_buffer("pixel_mean", torch.Tensor(pixel_mean).view(-1, 1, 1), False)
104
+ self.register_buffer("pixel_std", torch.Tensor(pixel_std).view(-1, 1, 1), False)
105
+
106
+ # additional args
107
+ self.semantic_on = semantic_on
108
+ self.instance_on = instance_on
109
+ self.panoptic_on = panoptic_on
110
+
111
+ # caption argument
112
+ self.task_switch = task_switch
113
+ self.phrase_prob = phrase_prob
114
+ self.train_max_iter = train_max_iter
115
+
116
+ self.test_topk_per_image = test_topk_per_image
117
+ self.train_class_names = get_class_names(train_dataset_name)
118
+ if binary_classes:
119
+ self.train_class_names = ['target', 'background']
120
+ self.interactive_mode = interactive_mode
121
+ self.interactive_iter = interactive_iter
122
+
123
+ if not self.semantic_on:
124
+ assert self.sem_seg_postprocess_before_inference
125
+
126
+ self.register_buffer("dilation_kernel", dilation_kernel)
127
+
128
+ self.standard_text_for_eval = standard_text_for_eval
129
+
130
+ @classmethod
131
+ def from_config(cls, cfg):
132
+ enc_cfg = cfg['MODEL']['ENCODER']
133
+ dec_cfg = cfg['MODEL']['DECODER']
134
+
135
+ # Loss parameters:
136
+ deep_supervision = dec_cfg['DEEP_SUPERVISION']
137
+ no_object_weight = dec_cfg['NO_OBJECT_WEIGHT']
138
+
139
+ # loss weights
140
+ loss_weights = {'mask': {'ce': dec_cfg['CLASS_WEIGHT'], 'dice': dec_cfg['DICE_WEIGHT'], 'bce': dec_cfg['MASK_WEIGHT']},
141
+ 'bbox': {'l1': dec_cfg['BBOX_WEIGHT'], 'giou': dec_cfg['GIOU_WEIGHT']},
142
+ 'spatial': {'ce': dec_cfg['SCLASS_WEIGHT'], 'dice': dec_cfg['SDICE_WEIGHT'], 'bce': dec_cfg['SMASK_WEIGHT']},
143
+ 'grounding': {'ce': dec_cfg['GCLASS_WEIGHT'], 'dice': dec_cfg['GDICE_WEIGHT'], 'bce': dec_cfg['GMASK_WEIGHT']},
144
+ 'openimage': {'ce': dec_cfg['OCLASS_WEIGHT'], 'dice': dec_cfg['ODICE_WEIGHT'], 'bce': dec_cfg['OMASK_WEIGHT']}}
145
+
146
+ openimage_switch = {'grounding': dec_cfg['OPENIMAGE']['GROUNDING'].get('ENABLED', False),
147
+ 'mask': dec_cfg['OPENIMAGE'].get('ENABLED', False)}
148
+
149
+ task_switch = {'bbox': dec_cfg.get('DETECTION', False),
150
+ 'mask': dec_cfg['MASK'].get('ENABLED', True),
151
+ 'spatial': dec_cfg['SPATIAL'].get('ENABLED', False),
152
+ 'grounding': dec_cfg['GROUNDING'].get('ENABLED', False),
153
+ 'openimage': openimage_switch}
154
+
155
+ top_x_layers = {'mask': dec_cfg.get('TOP_MASK_LAYERS', 10),
156
+ 'grounding': dec_cfg.get('TOP_GROUNDING_LAYERS', 10),
157
+ 'openimage': dec_cfg.get('TOP_OPENIMAGE_LAYERS', 10),
158
+ 'spatial': dec_cfg.get('TOP_SPATIAL_LAYERS', 10)}
159
+
160
+ spatial_cost = {"class_weight": dec_cfg['COST_SPATIAL']['CLASS_WEIGHT'],
161
+ "mask_weight": dec_cfg['COST_SPATIAL']['MASK_WEIGHT'],
162
+ "dice_weight": dec_cfg['COST_SPATIAL']['DICE_WEIGHT']}
163
+
164
+ extra = {'task_switch': task_switch}
165
+ backbone = build_backbone(cfg)
166
+ lang_encoder = build_language_encoder(cfg)
167
+ sem_seg_head = build_xdecoder_head(cfg, backbone.output_shape(), lang_encoder, extra=extra)
168
+
169
+ # building criterion
170
+ matcher = HungarianMatcher(
171
+ cost_class=loss_weights['mask']['ce'],
172
+ cost_mask=loss_weights['mask']['bce'],
173
+ cost_dice=loss_weights['mask']['dice'],
174
+ num_points=dec_cfg['TRAIN_NUM_POINTS'],
175
+ spatial_cost=spatial_cost,
176
+ )
177
+
178
+ # init weight dict and criterion loss functions.
179
+ losses = {'seg': [], 'openimage': []}
180
+ if task_switch['mask']:
181
+ losses['seg'] += ["labels", "masks"]
182
+ if task_switch['spatial']:
183
+ losses['seg'] += ["spatials"]
184
+ if task_switch['grounding']:
185
+ losses['seg'] += ["groundings"]
186
+ if task_switch['openimage']:
187
+ losses['openimage'] += ["labels_openimage", "masks"]
188
+ if task_switch['openimage']['grounding']:
189
+ losses['openimage'] += ["groundings"]
190
+
191
+ weight_dict = {}
192
+ for key, turn_on in task_switch.items():
193
+ if turn_on:
194
+ if isinstance(loss_weights[key], dict):
195
+ # HACK it should support bbox in the future
196
+ for key_, weight in loss_weights[key].items():
197
+ weight_dict["loss_{}_{}_0".format(key, key_)] = weight # NOTE: hard-coded for segmentation, which has multiple losses
198
+ else:
199
+ weight_dict["loss_{}_0".format(key)] = loss_weights[key]
200
+
201
+ # generate the full weight dict and drop layers that are not computed.
202
+ if deep_supervision:
203
+ dec_layers = dec_cfg['DEC_LAYERS']
204
+ aux_weight_dict = {}
205
+ for i in range(dec_layers - 1):
206
+ for k, v in weight_dict.items():
207
+ if (i+1) > (top_x_layers[k.split('_')[1]] - 1):
208
+ continue
209
+ aux_weight_dict.update({k.replace('_0', f"_{i+1}"): v})
210
+ weight_dict.update(aux_weight_dict)
211
+
212
+ grd_weight = {'text': dec_cfg['GROUNDING']['TEXT_WEIGHT'], 'class': dec_cfg['GROUNDING']['CLASS_WEIGHT']}
213
+ # generate criterion for the loss function.
214
+ criterion = SetCriterion(
215
+ sem_seg_head.num_classes,
216
+ matcher=matcher,
217
+ weight_dict=weight_dict,
218
+ top_x_layers=top_x_layers,
219
+ eos_coef=no_object_weight,
220
+ losses=[],
221
+ num_points=dec_cfg['TRAIN_NUM_POINTS'],
222
+ oversample_ratio=dec_cfg['OVERSAMPLE_RATIO'],
223
+ importance_sample_ratio=dec_cfg['IMPORTANCE_SAMPLE_RATIO'],
224
+ grounding_weight=grd_weight,
225
+ )
226
+
227
+ # extra logic
228
+ train_dataset_name = cfg['DATASETS']['TRAIN'][0] # HACK for only one training set.
229
+ train_max_iter = dec_cfg['SPATIAL'].get('MAX_ITER', 3)
230
+ phrase_prob = dec_cfg['CAPTION'].get('PHRASE_PROB', 0.5)
231
+ interactive_mode = cfg['STROKE_SAMPLER']['EVAL']['MODE']
232
+ interactive_iter = cfg['STROKE_SAMPLER']['EVAL']['MAX_ITER']
233
+
234
+ dilation = 3
235
+ dilation_kernel = torch.ones((1, 1, dilation, dilation), device=torch.cuda.current_device())
236
+
237
+ return {
238
+ "backbone": backbone,
239
+ "sem_seg_head": sem_seg_head,
240
+ "criterion": criterion,
241
+ "losses": losses,
242
+ "num_queries": dec_cfg['NUM_OBJECT_QUERIES'],
243
+ "object_mask_threshold": dec_cfg['TEST']['OBJECT_MASK_THRESHOLD'],
244
+ "overlap_threshold": dec_cfg['TEST']['OVERLAP_THRESHOLD'],
245
+ "metadata": MetadataCatalog.get(cfg['DATASETS']['TRAIN'][0]),
246
+ "size_divisibility": dec_cfg['SIZE_DIVISIBILITY'],
247
+ "sem_seg_postprocess_before_inference": (
248
+ dec_cfg['TEST']['SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE']
249
+ or dec_cfg['TEST']['PANOPTIC_ON']
250
+ or dec_cfg['TEST']['INSTANCE_ON']
251
+ ),
252
+ "pixel_mean": cfg['INPUT']['PIXEL_MEAN'],
253
+ "pixel_std": cfg['INPUT']['PIXEL_STD'],
254
+ "task_switch": task_switch,
255
+ "phrase_prob": phrase_prob,
256
+ # inference
257
+ "semantic_on": dec_cfg['TEST']['SEMANTIC_ON'],
258
+ "instance_on": dec_cfg['TEST']['INSTANCE_ON'],
259
+ "panoptic_on": dec_cfg['TEST']['PANOPTIC_ON'],
260
+ "test_topk_per_image": cfg['TEST']['DETECTIONS_PER_IMAGE'],
261
+ "train_dataset_name": train_dataset_name,
262
+ "interactive_mode": interactive_mode,
263
+ "interactive_iter": interactive_iter,
264
+ "dilation_kernel": dilation_kernel,
265
+ "train_max_iter": train_max_iter,
266
+ "binary_classes": enc_cfg['BINARY_CLASSES'],
267
+ "standard_text_for_eval": cfg['STANDARD_TEXT_FOR_EVAL'],
268
+ }
269
+
270
+ @property
271
+ def device(self):
272
+ return self.pixel_mean.device
273
+
274
+ def forward(self, batched_inputs, mode='default'):
275
+ """
276
+ Args:
277
+ batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
278
+ Each item in the list contains the inputs for one image.
279
+ For now, each item in the list is a dict that contains:
280
+ * "image": Tensor, image in (C, H, W) format.
281
+ * "instances": per-region ground truth
282
+ * Other information that's included in the original dicts, such as:
283
+ "height", "width" (int): the output resolution of the model (may be different
284
+ from input resolution), used in inference.
285
+ Returns:
286
+ list[dict]:
287
+ each dict has the results for one image. The dict contains the following keys:
288
+
289
+ * "sem_seg":
290
+ A Tensor that represents the
291
+ per-pixel segmentation predicted by the head.
292
+ The prediction has shape KxHxW that represents the logits of
293
+ each class for each pixel.
294
+ * "panoptic_seg":
295
+ A tuple that represents the panoptic output
296
+ panoptic_seg (Tensor): of shape (height, width) where the values are ids for each segment.
297
+ segments_info (list[dict]): Describe each segment in `panoptic_seg`.
298
+ Each dict contains keys "id", "category_id", "isthing".
299
+ """
300
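+ # Minimal illustrative call at inference time (a sketch only; the image size and
+ # variable names below are assumptions, not values taken from this repository):
+ #   inputs = [{"image": torch.zeros(3, 1024, 1024), "height": 1024, "width": 1024}]
+ #   results = model(inputs)              # mode='default' dispatches to self.evaluate
+ #   sem_seg = results[0]["sem_seg"]      # (K, height, width) per-class scores, if semantic_on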
+ if self.training:
301
+ losses = {}
302
+ if self.task_switch['mask'] or self.task_switch['grounding'] or self.task_switch['spatial']:
303
+ losses_seg = self.forward_seg(batched_inputs)
304
+ losses.update(losses_seg)
305
+ if self.task_switch['openimage'] and self.task_switch['openimage']['mask']:
306
+ losses_openimage = self.forward_openimage(batched_inputs['openimage'])
307
+ losses_openimage = {key.replace('mask', 'openimage'):value for key, value in losses_openimage.items()}
308
+ losses_openimage = {key.replace('grounding', 'grounding_openimage'):value for key, value in losses_openimage.items()}
309
+ losses.update(losses_openimage)
310
+ for k in list(losses.keys()):
311
+ if k in self.criterion.weight_dict:
312
+ losses[k] *= self.criterion.weight_dict[k]
313
+ else: # remove this loss if not specified in `weight_dict`
314
+ losses.pop(k)
315
+ return losses
316
+ else:
317
+ if mode == 'interactive':
318
+ return self.evaluate_interactive(batched_inputs)
319
+ elif mode == 'interactive_grounding':
320
+ return self.evaluate_interactive_grounding(batched_inputs)
321
+ elif mode == 'grounding_spatial':
322
+ return self.evaluate_grounding_sptial(batched_inputs, mode)
323
+ elif mode in ['grounding_phrasecut', 'grounding_refcoco']:
324
+ return self.evaluate_grounding(batched_inputs, mode)
325
+ else:
326
+ return self.evaluate(batched_inputs)
327
+
328
+
329
+ def forward_seg(self, batched_inputs):
330
+ images = [x["image"].to(self.device) for x in batched_inputs]
331
+ images = [(x - self.pixel_mean) / self.pixel_std for x in images]
332
+ images = ImageList.from_tensors(images, self.size_divisibility)
333
+ self.sem_seg_head.predictor.lang_encoder.get_text_embeddings(self.train_class_names, is_eval=False)
334
+
335
+ extra = {}
336
+ # mask classification target
337
+ if "instances" in batched_inputs[0]:
338
+ # input bounding box is checked to be correct.
339
+ targets = self.prepare_targets(batched_inputs, images)
340
+
341
+ if self.task_switch['grounding']:
342
+ grounding_tokens = [x['grounding_query_embs'] for x in targets] # need to pad for more than one grounding token
343
+ grounding_tokens = nn.utils.rnn.pad_sequence(grounding_tokens, padding_value=-1)
344
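+ # pad_sequence fills shorter sequences with -1; positions whose embedding sums to -C (every channel equal to the pad value) are flagged as padding and zeroed out below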
+ non_zero_query_mask = (grounding_tokens.sum(dim=-1) == -grounding_tokens.shape[-1])
345
+ grounding_tokens[non_zero_query_mask] = 0
346
+
347
+ extra['grounding_tokens'] = grounding_tokens
348
+ extra['grounding_nonzero_mask'] = non_zero_query_mask.t()
349
+
350
+ if self.task_switch['spatial']:
351
+ pos_masks = [x['spatial_query']['rand_shape'].to(self.device) for x in batched_inputs]
352
+ neg_masks = [(x['spatial_query']['rand_shape'].to(self.device) & False) for x in batched_inputs]
353
+ fp_masks = nn.utils.rnn.pad_sequence([(x['spatial_query']['rand_shape'].to(self.device) & False) for x in batched_inputs], padding_value=False, batch_first=True)
354
+ extra.update({'spatial_query_pos_mask': pos_masks, 'spatial_query_neg_mask': neg_masks, 'false_positive_mask': fp_masks})
355
+
356
+ features = self.backbone(images.tensor)
357
+ mask_features, _, multi_scale_features = self.sem_seg_head.pixel_decoder.forward_features(features)
358
+
359
+ # forward spatial only without gradient
360
+ if self.task_switch['spatial']:
361
+ with torch.no_grad():
362
+ # generate a random integer in [0, train_max_iter]
363
+ rand_iter_num = random.randint(0, self.train_max_iter)
364
+ for i in range(rand_iter_num):
365
+ outputs = self.sem_seg_head.predictor(multi_scale_features, mask_features, extra=extra, task='spatial')
366
+ extra.update(outputs)
367
+ extra.update(self.prepare_next_spaital_mask(extra, batched_inputs))
368
+
369
+ outputs = self.sem_seg_head.predictor(multi_scale_features, mask_features, extra=extra, task='seg')
370
+
371
+ extra = {'lang_logit': self.sem_seg_head.predictor.lang_encoder.logit_scale,
372
+ 'class_embeddings': getattr(self.sem_seg_head.predictor.lang_encoder, '{}_text_embeddings'.format('default')),
373
+ 'false_positive_mask': extra['false_positive_mask']}
374
+ # bipartite matching-based loss
375
+ self.criterion.losses = self.losses['seg'] # seg criterion losses
376
+
377
+ if self.task_switch['mask']:
378
+ losses = self.criterion(outputs, targets, extra)
379
+ else:
380
+ losses = self.criterion.forward_vlp(outputs, targets, extra)
381
+
382
+ del outputs
383
+ return losses
384
+
385
+ def evaluate(self, batched_inputs):
386
+ images = [x["image"].to(self.device) for x in batched_inputs]
387
+ images = [(x - self.pixel_mean) / self.pixel_std for x in images]
388
+
389
+ images = ImageList.from_tensors(images, self.size_divisibility)
390
+ img_bs = images.tensor.shape[0]
391
+
392
+ targets = targets_grounding = queries_grounding = None
393
+ features = self.backbone(images.tensor)
394
+ outputs = self.sem_seg_head(features, target_queries=queries_grounding)
395
+
396
+ mask_cls_results = outputs["pred_logits"]
397
+ mask_pred_results = outputs["pred_masks"]
398
+ box_pred_results = outputs["pred_boxes"] if self.task_switch['bbox'] else [None for i in range(len(mask_pred_results))]
399
+
400
+ # upsample masks
401
+ mask_pred_results = F.interpolate(
402
+ mask_pred_results,
403
+ size=(images.tensor.shape[-2], images.tensor.shape[-1]),
404
+ mode="bilinear",
405
+ align_corners=False,
406
+ )
407
+
408
+ input_size = mask_pred_results.shape[-2:]
409
+ del outputs
410
+
411
+ processed_results = []
412
+ for mask_cls_result, mask_pred_result, box_pred_result, input_per_image, image_size in zip(
413
+ mask_cls_results, mask_pred_results, box_pred_results, batched_inputs, images.image_sizes
414
+ ):
415
+ height = input_per_image.get("height", image_size[0])
416
+ width = input_per_image.get("width", image_size[1])
417
+ processed_results.append({})
418
+
419
+ if self.sem_seg_postprocess_before_inference:
420
+ mask_pred_result = retry_if_cuda_oom(sem_seg_postprocess)(
421
+ mask_pred_result, image_size, height, width
422
+ )
423
+ mask_cls_result = mask_cls_result.to(mask_pred_result)
424
+
425
+ # semantic segmentation inference
426
+ if self.semantic_on:
427
+ r = retry_if_cuda_oom(self.semantic_inference)(mask_cls_result, mask_pred_result)
428
+ if not self.sem_seg_postprocess_before_inference:
429
+ r = retry_if_cuda_oom(sem_seg_postprocess)(r, image_size, height, width)
430
+ processed_results[-1]["sem_seg"] = r
431
+
432
+ # panoptic segmentation inference
433
+ if self.panoptic_on:
434
+ panoptic_r = retry_if_cuda_oom(self.panoptic_inference)(mask_cls_result, mask_pred_result)
435
+ processed_results[-1]["panoptic_seg"] = panoptic_r
436
+
437
+ # instance segmentation inference
438
+ if self.instance_on:
439
+ if self.task_switch['bbox']:
440
+ box_pred_result = bbox_postprocess(box_pred_result, input_size, image_size, height, width)
441
+ instance_r = retry_if_cuda_oom(self.instance_inference)(mask_cls_result, mask_pred_result, box_pred_result)
442
+ processed_results[-1]["instances"] = instance_r
443
+
444
+ return processed_results
445
+
446
+ def evaluate_interactive(self, batched_inputs):
447
+ assert self.task_switch['spatial']
448
+ assert 'spatial_query' in batched_inputs[0]
449
+ assert len(batched_inputs) == 1, "only support batch size equal to 1"
450
+
451
+ images = [x["image"].to(self.device) for x in batched_inputs]
452
+ images = [(x - self.pixel_mean) / self.pixel_std for x in images]
453
+ images = ImageList.from_tensors(images, self.size_divisibility)
454
+ img_bs = images.tensor.shape[0]
455
+
456
+ targets = targets_grounding = queries_grounding = None
457
+ extra = {}
458
+
459
+ features = self.backbone(images.tensor)
460
+ mask_features, transformer_encoder_features, multi_scale_features = self.sem_seg_head.pixel_decoder.forward_features(features)
461
+
462
+ image_sizes = [x["image"].shape[-2:] for x in batched_inputs]
463
+
464
+ all_batch_shape_iou = []
465
+ pred_smask_pointer = None
466
+ prev_smask_pointer = None
467
+ pred_smask_all = None
468
+
469
+ # visualization code
470
+ # v_pred_mask = []
471
+ # v_pos_mask = []
472
+ # v_neg_mask = []
473
+ # v_gt_mask = batched_inputs[0]['spatial_query']['gt_masks'][0]
474
+ query_index = self.sem_seg_head.predictor.query_index
475
+ if self.interactive_mode in ['best', 'best_random']:
476
+ pos_masks = [x['spatial_query']['rand_shape'].to(self.device)[:,0] for x in batched_inputs]
477
+ pos_masks = ImageList.from_tensors(pos_masks, self.size_divisibility).tensor.unbind(0)
478
+
479
+ neg_masks = [(x['spatial_query']['rand_shape'].to(self.device) & False)[:,0] for x in batched_inputs]
480
+
481
+ neg_masks = ImageList.from_tensors(neg_masks, self.size_divisibility).tensor.unbind(0)
482
+ extra.update({'spatial_query_pos_mask': pos_masks, 'spatial_query_neg_mask': neg_masks})
483
+ elif self.interactive_mode == 'random':
484
+ assert False, "interactive mode not correctly implemented"
485
+ pos_masks = (batched_inputs[0]['spatial_query']['rand_shape'].to(self.device)==1).unbind(0)
486
+ pos_masks = ImageList.from_tensors(pos_masks, self.size_divisibility).tensor
487
+
488
+ neg_masks = (batched_inputs[0]['spatial_query']['rand_shape'].to(self.device)==-1).unbind(0)
489
+ neg_masks = ImageList.from_tensors(neg_masks, self.size_divisibility).tensor
490
+ extra.update({'spatial_query_pos_mask': pos_masks[:,0:1].unbind(), 'spatial_query_neg_mask': neg_masks[:,0:1].unbind()})
491
+ else:
492
+ assert False, "invalid interactive mode"
493
+
494
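+ # each iteration simulates one more user click: run the spatial decoder, score IoU against the ground truth at the original resolution, and stop early once every mask exceeds 0.9 IoU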
+ for i in range(self.interactive_iter):
495
+ # v_pos_mask += [extra['spatial_query_pos_mask'][0][0][:image_sizes[0][0],:image_sizes[0][1]].float().cpu().numpy()]
496
+ # v_neg_mask += [extra['spatial_query_neg_mask'][0][0][:image_sizes[0][0],:image_sizes[0][1]].float().cpu().numpy()]
497
+ outputs = self.sem_seg_head.predictor(multi_scale_features, mask_features, target_queries=queries_grounding, extra=extra, task='spatial')
498
+ extra.update(outputs)
499
+ pred_smask = F.interpolate(outputs['prev_mask'], images.tensor.shape[-2:], mode='bilinear')
500
+ # v_pred_mask += [(pred_smask[0,0][:image_sizes[0][0],:image_sizes[0][1]].sigmoid() > 0.5).float().cpu().numpy()]
501
+
502
+ s = image_sizes[0]
503
+ b = batched_inputs[0]
504
+ pred_smask_all = F.interpolate(pred_smask[:,:,:s[0],:s[1]], (b['height'], b['width']), mode='bilinear')[0].sigmoid() > 0.5
505
+ gt_smask = b['gt_masks_orisize']
506
+ ious = get_iou(gt_smask, pred_smask_all)
507
+ all_batch_shape_iou += [ious]
508
+ if (ious > 0.9).sum() == len(ious):
509
+ all_batch_shape_iou += [ious for j in range(self.interactive_iter-i-1)]
510
+ break
511
+ if self.interactive_mode in ['best', 'best_random']:
512
+ extra.update(self.prepare_next_spaital_mask(extra, batched_inputs, mode=self.interactive_mode))
513
+ elif self.interactive_mode == 'random':
514
+ extra.update({'spatial_query_pos_mask': pos_masks[:,i+1:i+2].unbind(), 'spatial_query_neg_mask': neg_masks[:,i+1:i+2].unbind()})
515
+ else:
516
+ assert False, "invalid interactive mode"
517
+ all_batch_shape_iou = torch.stack(all_batch_shape_iou)
518
+ processed_results = [{"mask_iou": all_batch_shape_iou[:,i]} for i in range(len(all_batch_shape_iou[0]))]
519
+
520
+ return processed_results
521
+
522
+ def evaluate_interactive_single(self, batched_inputs, extra={}):
523
+ assert self.task_switch['spatial']
524
+ assert 'spatial_query' in batched_inputs[0]
525
+ assert len(batched_inputs) == 1, "only support batch size equal to 1"
526
+
527
+ images = [x["image"].to(self.device) for x in batched_inputs]
528
+ images = [(x - self.pixel_mean) / self.pixel_std for x in images]
529
+ images = ImageList.from_tensors(images, self.size_divisibility)
530
+ img_bs = images.tensor.shape[0]
531
+
532
+ targets = targets_grounding = queries_grounding = None
533
+
534
+ features = self.backbone(images.tensor)
535
+ mask_features, transformer_encoder_features, multi_scale_features = self.sem_seg_head.pixel_decoder.forward_features(features)
536
+
537
+ image_sizes = [x["image"].shape[-2:] for x in batched_inputs]
538
+ nm = len(batched_inputs[0]['spatial_query']['rand_shape'])
539
+ multi_scale_features = [m.repeat(nm,1,1,1) for m in multi_scale_features]
540
+ mask_features = mask_features.repeat(nm,1,1,1)
541
+
542
+ outputs = self.sem_seg_head.predictor(multi_scale_features, mask_features, target_queries=queries_grounding, extra=extra, task='spatial')
543
+ pred_smask = F.interpolate(outputs['prev_mask'], images.tensor.shape[-2:], mode='bicubic')
544
+
545
+ s = image_sizes[0]
546
+ b = batched_inputs[0]
547
+ pred_smask_ori = F.interpolate(pred_smask[:,:,:s[0],:s[1]], (b['height'], b['width']), mode='bicubic')[:,0].sigmoid() > 0.5
548
+ pred_smask_batch = pred_smask[:,:,:s[0],:s[1]].sigmoid() > 0.5
549
+ ious = []
550
+ if 'gt_masks_orisize' in b:
551
+ gt_smask = b['gt_masks_orisize'].to(pred_smask_ori.device)
552
+ ious = get_iou(gt_smask, pred_smask_ori)
553
+ processed_results = [{"mask_iou": ious, 'pred_mask_ori': pred_smask_ori, 'pred_mask_batch': pred_smask_batch}]
554
+ return processed_results
555
+
556
+ def evaluate_interactive_grounding(self, batched_inputs):
557
+ assert self.task_switch['spatial']
558
+ assert 'spatial_query' in batched_inputs[0]
559
+ assert len(batched_inputs) == 1, "only support batch size equal to 1"
560
+
561
+ images = [x["image"].to(self.device) for x in batched_inputs]
562
+ images = [(x - self.pixel_mean) / self.pixel_std for x in images]
563
+ images = ImageList.from_tensors(images, self.size_divisibility)
564
+ img_bs = images.tensor.shape[0]
565
+
566
+ targets = targets_grounding = queries_grounding = None
567
+ extra = {}
568
+
569
+ features = self.backbone(images.tensor)
570
+ mask_features, transformer_encoder_features, multi_scale_features = self.sem_seg_head.pixel_decoder.forward_features(features)
571
+
572
+ image_sizes = [x["image"].shape[-2:] for x in batched_inputs]
573
+ nm = len(batched_inputs[0]['spatial_query']['rand_shape'])
574
+ multi_scale_features = [m.repeat(nm,1,1,1) for m in multi_scale_features]
575
+ mask_features = mask_features.repeat(nm,1,1,1)
576
+
577
+ all_batch_shape_iou = []
578
+ pred_smask_pointer = None
579
+ prev_smask_pointer = None
580
+ pred_smask_all = None
581
+
582
+ # visualization code
583
+ # v_pred_mask = []
584
+ # v_pos_mask = []
585
+ # v_neg_mask = []
586
+ # v_gt_mask = batched_inputs[0]['spatial_query']['gt_masks'][0]
587
+ query_index = self.sem_seg_head.predictor.query_index
588
+ if self.interactive_mode in ['best', 'best_random']:
589
+ pos_masks = (batched_inputs[0]['spatial_query']['rand_shape'].to(self.device)).unbind(0)
590
+ pos_masks = ImageList.from_tensors(pos_masks, self.size_divisibility).tensor.unbind(0)
591
+
592
+ neg_masks = (batched_inputs[0]['spatial_query']['rand_shape'].to(self.device) & False).unbind(0)
593
+ neg_masks = ImageList.from_tensors(neg_masks, self.size_divisibility).tensor.unbind(0)
594
+ extra.update({'spatial_query_pos_mask': pos_masks, 'spatial_query_neg_mask': neg_masks})
595
+ elif self.interactive_mode == 'random':
596
+ pos_masks = (batched_inputs[0]['spatial_query']['rand_shape'].to(self.device)==1).unbind(0)
597
+ pos_masks = ImageList.from_tensors(pos_masks, self.size_divisibility).tensor
598
+
599
+ neg_masks = (batched_inputs[0]['spatial_query']['rand_shape'].to(self.device)==-1).unbind(0)
600
+ neg_masks = ImageList.from_tensors(neg_masks, self.size_divisibility).tensor
601
+ extra.update({'spatial_query_pos_mask': pos_masks[:,0:1].unbind(), 'spatial_query_neg_mask': neg_masks[:,0:1].unbind()})
602
+ else:
603
+ assert False, "invalid interactive mode"
604
+
605
+ grd_texts = batched_inputs[0]['classes']
606
+ gtext = self.sem_seg_head.predictor.lang_encoder.get_text_token_embeddings(grd_texts, name='grounding', token=False, norm=False)
607
+ token_emb = gtext['token_emb']
608
+ tokens = gtext['tokens']
609
+ query_emb = nn.utils.rnn.pad_sequence([_token_emb[_tokens.bool()] for _token_emb, _tokens in zip(token_emb, tokens['attention_mask'])], padding_value=-1)
610
+ non_zero_query_mask = (query_emb.sum(dim=-1) < 0)
611
+
612
+ extra['grounding_tokens'] = query_emb
613
+ extra['grounding_nonzero_mask'] = non_zero_query_mask.t()
614
+
615
+ for i in range(self.interactive_iter):
616
+ # v_pos_mask += [extra['spatial_query_pos_mask'][0][0][:image_sizes[0][0],:image_sizes[0][1]].float().cpu().numpy()]
617
+ # v_neg_mask += [extra['spatial_query_neg_mask'][0][0][:image_sizes[0][0],:image_sizes[0][1]].float().cpu().numpy()]
618
+ outputs = self.sem_seg_head.predictor(multi_scale_features, mask_features, target_queries=queries_grounding, extra=extra, task='spatial')
619
+ extra.update(outputs)
620
+ pred_smask = F.interpolate(outputs['prev_mask'], images.tensor.shape[-2:], mode='bilinear')
621
+ # v_pred_mask += [(pred_smask[0,0][:image_sizes[0][0],:image_sizes[0][1]].sigmoid() > 0.5).float().cpu().numpy()]
622
+
623
+ s = image_sizes[0]
624
+ b = batched_inputs[0]
625
+ pred_smask_all = F.interpolate(pred_smask[:,:,:s[0],:s[1]], (b['height'], b['width']), mode='bilinear')[:,0].sigmoid() > 0.5
626
+ gt_smask = b['gt_masks_orisize']
627
+ ious = get_iou(gt_smask, pred_smask_all)
628
+ all_batch_shape_iou += [ious]
629
+ if (ious > 0.9).sum() == len(ious):
630
+ all_batch_shape_iou += [ious for j in range(self.interactive_iter-i-1)]
631
+ break
632
+ if self.interactive_mode in ['best', 'best_random']:
633
+ extra.update(self.prepare_next_spaital_mask(extra, batched_inputs, mode=self.interactive_mode))
634
+ elif self.interactive_mode == 'random':
635
+ extra.update({'spatial_query_pos_mask': pos_masks[:,i+1:i+2].unbind(), 'spatial_query_neg_mask': neg_masks[:,i+1:i+2].unbind()})
636
+ else:
637
+ assert False, "invalid interactive mode"
638
+ all_batch_shape_iou = torch.stack(all_batch_shape_iou)
639
+ processed_results = [{"mask_iou": all_batch_shape_iou[:,i]} for i in range(len(all_batch_shape_iou[0]))]
640
+
641
+ # visualization
642
+ # VL.step()
643
+ # import cv2
644
+ # v_masks = []
645
+ # v_pos_masks = []
646
+ # v_neg_masks = []
647
+ # txt = []
648
+
649
+ # img = batched_inputs[0]['image'].permute(1,2,0).cpu().numpy()
650
+ # mask_img = VL.overlay_single_mask_to_image(img[:,:,::-1], v_gt_mask.cpu().float().numpy())
651
+ # acc_pos_mask = np.zeros(v_pos_mask[0].shape)
652
+ # acc_neg_mask = np.zeros(v_neg_mask[0].shape)
653
+ # for x,y,z,iou in zip(v_pos_mask, v_neg_mask, v_pred_mask, all_batch_shape_iou):
654
+ # # dilate x,y
655
+ # x = cv2.dilate(x, np.ones((5,5), np.uint8), iterations=3)
656
+ # y = cv2.dilate(y, np.ones((5,5), np.uint8), iterations=3)
657
+ # acc_pos_mask += x
658
+ # acc_neg_mask += y
659
+
660
+ # v_masks += [z]
661
+ # v_pos_masks += [acc_pos_mask.clip(0,1)]
662
+ # v_neg_masks += [acc_neg_mask.clip(0,1)]
663
+ # txt += ["pred_{}".format(str(iou[0].item())[0:5])]
664
+
665
+ # VL.add_image(img[:,:,::-1])
666
+ # VL.insert(mask_img, "gt_mask")
667
+ # VL.overlay_obj_mask_to_image_withposneg(img[:,:,::-1], v_masks, v_pos_masks, v_neg_masks, txt, max_len=20)
668
+ return processed_results
669
+
670
+ def evaluate_referring_image(self, batched_inputs, extra={}):
671
+ assert self.task_switch['spatial']
672
+ assert len(batched_inputs) == 1, "only support batch size equal to 1"
673
+ assert self.interactive_mode == 'best'
674
+
675
+ images = [x["image"].to(self.device) for x in batched_inputs]
676
+ images = [(x - self.pixel_mean) / self.pixel_std for x in images]
677
+ images = ImageList.from_tensors(images, self.size_divisibility)
678
+ img_bs = images.tensor.shape[0]
679
+
680
+ targets = targets_grounding = queries_grounding = None
681
+ features = self.backbone(images.tensor)
682
+ mask_features, transformer_encoder_features, multi_scale_features = self.sem_seg_head.pixel_decoder.forward_features(features)
683
+
684
+ if 'spatial_query' in batched_inputs[0]:
685
+ image_sizes = [x["image"].shape[-2:] for x in batched_inputs]
686
+ nm = len(batched_inputs[0]['spatial_query']['rand_shape'])
687
+ multi_scale_features = [m.repeat(nm,1,1,1) for m in multi_scale_features]
688
+ mask_features = mask_features.repeat(nm,1,1,1)
689
+
690
+ query_index = self.sem_seg_head.predictor.query_index
691
+ pos_masks = (batched_inputs[0]['spatial_query']['rand_shape'].to(self.device)).unbind(0)
692
+ pos_masks = ImageList.from_tensors(pos_masks, self.size_divisibility).tensor.unbind(0)
693
+
694
+ neg_masks = (batched_inputs[0]['spatial_query']['rand_shape'].to(self.device) & False).unbind(0)
695
+ neg_masks = ImageList.from_tensors(neg_masks, self.size_divisibility).tensor.unbind(0)
696
+ extra.update({'spatial_query_pos_mask': pos_masks, 'spatial_query_neg_mask': neg_masks})
697
+
698
+ outputs = self.sem_seg_head.predictor(multi_scale_features, mask_features, target_queries=queries_grounding, extra=extra, task='refimg')
699
+ return outputs, images.tensor.shape
700
+
701
+ def evaluate_grounding(self, batched_inputs, mode):
702
+ images = [x["image"].to(self.device) for x in batched_inputs]
703
+ images = [(x - self.pixel_mean) / self.pixel_std for x in images]
704
+ images = ImageList.from_tensors(images, self.size_divisibility)
705
+ assert len(images.tensor) == 1, "grounding evaluation only support single batch size now"
706
+
707
+ extra = {}
708
+ # mask_pred_results = []
709
+ # for idx, batch_per_image in enumerate(batched_inputs):
710
+ # grd_texts = batch_per_image['groundings']['texts']
711
+ # grd_masks = []
712
+ # for anno_text in grd_texts:
713
+ # gtext = self.sem_seg_head.predictor.lang_encoder.get_text_token_embeddings([anno_text[0]], name='grounding', token=False, norm=False)
714
+ # token_emb = gtext['token_emb']
715
+ # tokens = gtext['tokens']
716
+
717
+ # grd_emb = token_emb[0][tokens['attention_mask'].bool()[0]]
718
+ # extra['grounding_tokens'] = grd_emb[:,None]
719
+
720
+ # assert len(images.tensor) == 1, "grounding evaluation only support single batch size now"
721
+ # features = self.backbone(images.tensor)
722
+ # outputs = self.sem_seg_head(features, extra=extra, task='grounding_eval')
723
+
724
+ # pred_gmasks = outputs['pred_masks'][idx,self.num_queries:2*self.num_queries-1]
725
+ # v_emb = outputs['pred_captions'][idx,self.num_queries:2*self.num_queries-1]
726
+ # t_emb = grd_emb[-1:]
727
+
728
+ # t_emb = t_emb / (t_emb.norm(dim=-1, keepdim=True) + 1e-7)
729
+ # v_emb = v_emb / (v_emb.norm(dim=-1, keepdim=True) + 1e-7)
730
+
731
+ # temperature = self.sem_seg_head.predictor.lang_encoder.logit_scale
732
+ # out_prob = vl_similarity(v_emb, t_emb, temperature=temperature)
733
+
734
+ # matched_id = out_prob.max(0)[1]
735
+ # grd_masks += [pred_gmasks[matched_id,:,:]]
736
+ # mask_pred_results += [torch.cat(grd_masks)]
737
+
738
+ # comment for multi object inference.
739
+ mask_pred_results = []
740
+ for idx, batch_per_image in enumerate(batched_inputs):
741
+ grd_texts = batch_per_image['groundings']['texts']
742
+ if self.standard_text_for_eval:
743
+ standard_texts = []
744
+ for grd in batch_per_image['grounding_info']:
745
+ mask_file = grd['mask_file'].split('.')[0].split('/')[-1]
746
+ target = mask_file.split('_')[-1].replace('+', ' ')
747
+ site = mask_file.split('_')[-2].replace('+', ' ')
748
+ modality = mask_file.split('_')[-3].replace('+', ' ')
749
+ standard_texts.append(f'{target} in {site} {modality}')
750
+ grd_texts = standard_texts
751
+ batch_per_image['groundings']['texts'] = standard_texts
752
+
753
+
754
+ gtext = self.sem_seg_head.predictor.lang_encoder.get_text_token_embeddings(grd_texts, name='grounding', token=False, norm=False)
755
+ token_emb = gtext['token_emb']
756
+ tokens = gtext['tokens']
757
+ query_emb = token_emb[tokens['attention_mask'].bool()]
758
+ non_zero_query_mask = torch.zeros(query_emb[:,None].shape[:-1], dtype=torch.bool, device=query_emb.device)
759
+
760
+ extra['grounding_tokens'] = query_emb[:,None]
761
+ extra['grounding_nonzero_mask'] = non_zero_query_mask.t()
762
+
763
+ features = self.backbone(images.tensor)
764
+ outputs = self.sem_seg_head(features, extra=extra, task='grounding_eval')
765
+
766
+ pred_gmasks = outputs['pred_gmasks'][idx]
767
+ v_emb = outputs['pred_gtexts'][idx]
768
+ t_emb = gtext['class_emb']
769
+
770
+ t_emb = t_emb / (t_emb.norm(dim=-1, keepdim=True) + 1e-7)
771
+ v_emb = v_emb / (v_emb.norm(dim=-1, keepdim=True) + 1e-7)
772
+
773
+ temperature = self.sem_seg_head.predictor.lang_encoder.logit_scale
774
+ out_prob = vl_similarity(v_emb, t_emb, temperature=temperature)
775
+
776
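+ # each text embedding selects the mask proposal with the highest vision-language similarity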
+ matched_id = out_prob.max(0)[1]
777
+ mask_pred_results += [pred_gmasks[matched_id,:,:]]
778
+
779
+ for i in range(len(mask_pred_results)):
780
+ # upsample masks
781
+ mask_pred_results[i] = F.interpolate(
782
+ mask_pred_results[i][None,],
783
+ size=(images.tensor.shape[-2], images.tensor.shape[-1]),
784
+ mode="bilinear",
785
+ align_corners=False,
786
+ )[0]
787
+
788
+ processed_results = []
789
+ for mask_pred_result, input_per_image, image_size in zip(
790
+ mask_pred_results, batched_inputs, images.image_sizes
791
+ ):
792
+ height = input_per_image.get("height", image_size[0])
793
+ width = input_per_image.get("width", image_size[1])
794
+ processed_results.append({})
795
+
796
+ mask_pred_result = retry_if_cuda_oom(sem_seg_postprocess)(
797
+ mask_pred_result, image_size, height, width
798
+ )
799
+ processed_results[-1]['grounding_mask'] = mask_pred_result
800
+
801
+ # compute bbox
802
+ # bbox = BitMasks(mask_pred_result > 0).get_bounding_boxes()
803
+ # bbox = BoxMode.convert(bbox.tensor, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
804
+ # processed_results[-1]['grounding_box'] = bbox
805
+
806
+ return processed_results
807
+
808
+ def evaluate_grounding_sptial(self, batched_inputs, mode):
809
+ images = [x["image"].to(self.device) for x in batched_inputs]
810
+ images = [(x - self.pixel_mean) / self.pixel_std for x in images]
811
+ images = ImageList.from_tensors(images, self.size_divisibility)
812
+ assert len(images.tensor) == 1, "grounding evaluation only support single batch size now"
813
+
814
+ extra = {}
815
+ dilation = 3
816
+ pos_masks = (batched_inputs[0]['spatial_query']['rand_shape'].to(self.device)).unbind(0)
817
+ pos_masks = ImageList.from_tensors(pos_masks, self.size_divisibility).tensor
818
+ pos_masks = (F.conv2d(pos_masks.float(), self.dilation_kernel, padding=dilation//2) > 0).unbind(0)
819
+
820
+ neg_masks = (batched_inputs[0]['spatial_query']['rand_shape'].to(self.device) & False).unbind(0)
821
+ neg_masks = ImageList.from_tensors(neg_masks, self.size_divisibility).tensor.unbind(0)
822
+
823
+ mask_pred_results = []
824
+ for idx, batch_per_image in enumerate(batched_inputs):
825
+ grd_texts = batch_per_image['groundings']['texts']
826
+ grd_masks = []
827
+ for idx2, anno_text in enumerate(grd_texts):
828
+ extra.update({'spatial_query_pos_mask': [pos_masks[idx2]], 'spatial_query_neg_mask': [neg_masks[idx2]]})
829
+
830
+ gtext = self.sem_seg_head.predictor.lang_encoder.get_text_token_embeddings([anno_text[0]], name='grounding', token=False, norm=False)
831
+ token_emb = gtext['token_emb']
832
+ tokens = gtext['tokens']
833
+
834
+ grd_emb = token_emb[0][tokens['attention_mask'].bool()[0]]
835
+ non_zero_query_mask = torch.zeros(grd_emb[:,None].shape[:-1], dtype=torch.bool, device=grd_emb.device)
836
+ extra['grounding_tokens'] = grd_emb[:,None]
837
+ extra['grounding_nonzero_mask'] = non_zero_query_mask.t()
838
+
839
+ assert len(images.tensor) == 1, "grounding evaluation only support single batch size now"
840
+ features = self.backbone(images.tensor)
841
+ outputs = self.sem_seg_head(features, extra=extra, task='grounding_eval')
842
+
843
+ pred_gmasks = outputs['pred_gmasks'][idx]
844
+ v_emb = outputs['pred_gtexts'][idx]
845
+ t_emb = gtext['class_emb']
846
+
847
+ t_emb = t_emb / (t_emb.norm(dim=-1, keepdim=True) + 1e-7)
848
+ v_emb = v_emb / (v_emb.norm(dim=-1, keepdim=True) + 1e-7)
849
+
850
+ temperature = self.sem_seg_head.predictor.lang_encoder.logit_scale
851
+ out_prob = vl_similarity(v_emb, t_emb, temperature=temperature)
852
+
853
+ matched_id = out_prob.max(0)[1]
854
+ grd_masks += [pred_gmasks[matched_id,:,:]]
855
+ # grd_masks += [outputs['prev_mask'][0]]
856
+
857
+ mask_pred_results += [torch.cat(grd_masks)]
858
+
859
+ # comment for multi object inference.
860
+ # mask_pred_results = []
861
+ # for idx, batch_per_image in enumerate(batched_inputs):
862
+ # grd_texts = batch_per_image['groundings']['texts']
863
+ # grd_texts = [x[0] for x in grd_texts]
864
+
865
+ # gtext = self.sem_seg_head.predictor.lang_encoder.get_text_token_embeddings(grd_texts, name='grounding', token=False, norm=False)
866
+ # token_emb = gtext['token_emb']
867
+ # tokens = gtext['tokens']
868
+ # query_emb = token_emb[tokens['attention_mask'].bool()]
869
+ # non_zero_query_mask = torch.zeros(query_emb[:,None].shape[:-1], dtype=torch.bool, device=query_emb.device)
870
+
871
+ # extra['grounding_tokens'] = query_emb[:,None]
872
+ # extra['grounding_nonzero_mask'] = non_zero_query_mask.t()
873
+
874
+ # features = self.backbone(images.tensor)
875
+ # outputs = self.sem_seg_head(features, extra=extra, task='grounding_eval')
876
+
877
+ # pred_gmasks = outputs['pred_gmasks'][idx]
878
+ # v_emb = outputs['pred_gtexts'][idx]
879
+ # t_emb = gtext['class_emb']
880
+
881
+ # t_emb = t_emb / (t_emb.norm(dim=-1, keepdim=True) + 1e-7)
882
+ # v_emb = v_emb / (v_emb.norm(dim=-1, keepdim=True) + 1e-7)
883
+
884
+ # temperature = self.sem_seg_head.predictor.lang_encoder.logit_scale
885
+ # out_prob = vl_similarity(v_emb, t_emb, temperature=temperature)
886
+
887
+ # matched_id = out_prob.max(0)[1]
888
+ # mask_pred_results += [pred_gmasks[matched_id,:,:]]
889
+
890
+ for i in range(len(mask_pred_results)):
891
+ # upsample masks
892
+ mask_pred_results[i] = F.interpolate(
893
+ mask_pred_results[i][None,],
894
+ size=(images.tensor.shape[-2], images.tensor.shape[-1]),
895
+ mode="bilinear",
896
+ align_corners=False,
897
+ )[0]
898
+
899
+ processed_results = []
900
+ for mask_pred_result, input_per_image, image_size in zip(
901
+ mask_pred_results, batched_inputs, images.image_sizes
902
+ ):
903
+ height = input_per_image.get("height", image_size[0])
904
+ width = input_per_image.get("width", image_size[1])
905
+ processed_results.append({})
906
+
907
+ mask_pred_result = retry_if_cuda_oom(sem_seg_postprocess)(
908
+ mask_pred_result, image_size, height, width
909
+ )
910
+ processed_results[-1]['grounding_mask'] = mask_pred_result
911
+
912
+ return processed_results
913
+
914
+ def prepare_targets(self, batched_inputs, images):
915
+ h_pad, w_pad = images.tensor.shape[-2:]
916
+ new_targets = []
917
+ for idx, batch_per_image in enumerate(batched_inputs):
918
+ target_dict = {}
919
+ if self.task_switch['mask']:
920
+ targets_per_image = batch_per_image['instances'].to(self.device)
921
+ # pad gt
922
+ gt_masks = targets_per_image.gt_masks.tensor
923
+ padded_masks = torch.zeros((gt_masks.shape[0], h_pad, w_pad), dtype=gt_masks.dtype, device=gt_masks.device)
924
+ padded_masks[:, : gt_masks.shape[1], : gt_masks.shape[2]] = gt_masks
925
+
926
+ gt_boxes = targets_per_image.gt_boxes.tensor
927
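+ # normalize xyxy boxes by the padded image size, then convert to (cx, cy, w, h) in [0, 1]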
+ ratio = torch.tensor([w_pad,h_pad,w_pad,h_pad]).to(gt_boxes.device)[None,:]
928
+ gt_boxes = gt_boxes / ratio
929
+ xc,yc,w,h = (gt_boxes[:,0] + gt_boxes[:,2])/2, (gt_boxes[:,1] + gt_boxes[:,3])/2, gt_boxes[:,2] - gt_boxes[:,0], gt_boxes[:,3] - gt_boxes[:,1]
930
+ gt_boxes = torch.stack([xc,yc,w,h]).permute(1,0)
931
+
932
+ target_dict.update({
933
+ "labels": targets_per_image.gt_classes,
934
+ "is_things": targets_per_image.is_things,
935
+ "masks": padded_masks,
936
+ "boxes": gt_boxes,
937
+ })
938
+
939
+ if self.task_switch['spatial']:
940
+ # prepare targets for spatial query
941
+ target_dict['gt_spatial_masks'] = batch_per_image['spatial_query']['gt_masks']
942
+
943
+ if self.task_switch['grounding']:
944
+ grd_masks = batch_per_image['groundings']['masks']
945
+ grd_texts = batch_per_image['groundings']['texts']
946
+ grd_hash = batch_per_image['groundings']['hash']
947
+ grd_task = batch_per_image['groundings']['mode']
948
+
949
+ if len(grd_masks) == 0:
950
+ padded_masks = None
951
+ else:
952
+ padded_masks = torch.zeros((grd_masks.shape[0], h_pad, w_pad), dtype=grd_masks.dtype, device=grd_masks.device)
953
+ padded_masks[:, : grd_masks.shape[1], : grd_masks.shape[2]] = grd_masks
954
+
955
+ gtext = self.sem_seg_head.predictor.lang_encoder.get_text_token_embeddings(grd_texts, name='grounding', token=False, norm=False)
956
+ token_emb = gtext['token_emb']
957
+ tokens = gtext['tokens']
958
+
959
+ unique_hash_id = np.unique(grd_hash, return_index=True)[1]
960
+ selected_mask = np.zeros(len(grd_hash)).astype(bool)
961
+ selected_mask[unique_hash_id] = True
962
+
963
+ selected_token_emb = token_emb[selected_mask]
964
+ selected_attn_mask = tokens['attention_mask'][selected_mask]
965
+ query_emb = selected_token_emb[selected_attn_mask.bool()]
966
+
967
+ class_idx = tokens['attention_mask'].sum(dim=-1) - 1
968
+ class_idx = torch.stack((torch.arange(len(class_idx), device=class_idx.device), class_idx)).tolist()
969
+ class_emb = token_emb[class_idx]
970
+
971
+ target_dict['grounding_masks'] = padded_masks
972
+ target_dict['grounding_query_embs'] = query_emb
973
+ target_dict['grounding_class_embs'] = class_emb
974
+ target_dict['grounding_hash'] = grd_hash
975
+ target_dict['grounding_task'] = grd_task
976
+
977
+ new_targets.append(target_dict)
978
+ return new_targets
979
+
980
+ def prepare_next_spaital_mask(self, outputs, batched_inputs, mode='best'):
981
+ gt_masks = [batched_inputs[i]['spatial_query']['gt_masks'] for i in range(len(batched_inputs))]
982
+ gt_masks = Spatial_ImageList.from_tensors(gt_masks, self.size_divisibility).tensor
983
+
984
+ pred_masks = (F.interpolate(outputs['prev_mask'], size=gt_masks.shape[-2:], mode='bilinear', align_corners=False).sigmoid() > 0.5)
985
+ prev_masks = nn.utils.rnn.pad_sequence(outputs['spatial_query_pos_mask'], padding_value=False, batch_first=True) | \
986
+ nn.utils.rnn.pad_sequence(outputs['spatial_query_neg_mask'], padding_value=False, batch_first=True)
987
+
988
+ fn = gt_masks & (~(gt_masks & pred_masks)) & (~prev_masks) # fn: False Negative, gt:1, pred:0, prev:0
989
+ fp = (~gt_masks & pred_masks) & (~prev_masks) # fp: False Positive, gt:0, pred:1, prev:0
990
+
991
+ # compute iou between gt and pred
992
+ iou = (gt_masks & pred_masks).sum(list(range(2,len(fn.shape)))) / ((gt_masks | pred_masks).sum(dim=list(range(2,len(fn.shape)))) + 1e-8)
993
+ fn_sum = fn.sum(dim=list(range(2,len(fn.shape))))
994
+ fp_sum = fp.sum(dim=list(range(2,len(fp.shape))))
995
+
996
+ is_postive = fn_sum > fp_sum
997
+ select_mask = torch.zeros_like(fn)
998
+ select_mask[is_postive] = fn[is_postive]
999
+ select_mask[~is_postive] = fp[~is_postive]
1000
+ # is_postive = torch.ones(len(fn_sum), device=torch.cuda.current_device()).bool()
1001
+
1002
+ # conv implementation
1003
+ bs,ns,h,w = select_mask.shape
1004
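+ # distance from each error-region pixel to the region boundary (the padding makes image borders count as boundary); the per-mask argmax is the point deepest inside the error region and becomes the next simulated click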
+ mask_dt = (distance_transform((~F.pad(select_mask, pad=(1, 1, 1, 1), mode='constant', value=0)).float())[:,:,1:-1,1:-1]).reshape(bs*ns,-1)
1005
+ if mode == 'best':
1006
+ max_xy_idx = torch.stack([torch.arange(bs*ns), mask_dt.max(dim=-1)[1].cpu()]).tolist()
1007
+ elif mode == 'best_random':
1008
+ max_xy_idx = torch.stack([torch.arange(bs*ns), torch.cat([(mask_dt[i] > 0).nonzero()[torch.randint(0, len((mask_dt[i] > 0).nonzero()), (1,))][0] for i in range(len(mask_dt))]).cpu()]).tolist()
1009
+ next_mask = torch.zeros(gt_masks.shape, device=torch.cuda.current_device()).bool()
1010
+ next_mask = next_mask.view(bs*ns,-1)
1011
+ next_mask[max_xy_idx] = True
1012
+ next_mask = next_mask.reshape((bs*ns,1,h,w)).float()
1013
+ dilation = 3
1014
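+ # dilate the single-pixel click into a small square by convolving with the all-ones 3x3 kernel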
+ next_mask = F.conv2d(next_mask, self.dilation_kernel, padding=dilation//2).reshape(bs,ns,h,w) > 0
1015
+
1016
+ # zero out the next click for samples whose IoU already exceeds 0.925 (no further correction needed)
1017
+ keep = (iou < 0.925)
1018
+ next_mask = next_mask & keep.view(bs,ns,1,1)
1019
+
1020
+ pos_mask = []
1021
+ neg_mask = []
1022
+ for idx, ip in enumerate(is_postive):
1023
+ mask_len = len(outputs['spatial_query_pos_mask'][idx])
1024
+ pos_mask += [outputs['spatial_query_pos_mask'][idx] | (next_mask[idx][:mask_len] & ip[:mask_len,None,None])]
1025
+ neg_mask += [outputs['spatial_query_neg_mask'][idx] | (next_mask[idx][:mask_len] & (~ip[:mask_len,None,None]))]
1026
+
1027
+ if 'false_positive_mask' in outputs:
1028
+ fp = outputs['false_positive_mask'] | fp
1029
+ return {'spatial_query_pos_mask': pos_mask, 'spatial_query_neg_mask': neg_mask, 'false_positive_mask': fp}
1030
+
1031
+ def semantic_inference(self, mask_cls, mask_pred):
1032
+ mask_cls = F.softmax(mask_cls, dim=-1)[..., :-1]
1033
+ mask_pred = mask_pred.sigmoid()
1034
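+ # combine per-query class probabilities (Q, C) with per-query mask probabilities (Q, H, W) into per-class score maps (C, H, W)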
+ semseg = torch.einsum("qc,qhw->chw", mask_cls, mask_pred)
1035
+ return semseg
1036
+
1037
+ def panoptic_inference(self, mask_cls, mask_pred):
1038
+ scores, labels = F.softmax(mask_cls, dim=-1).max(-1)
1039
+ mask_pred = mask_pred.sigmoid()
1040
+
1041
+ keep = labels.ne(self.sem_seg_head.num_classes) & (scores > self.object_mask_threshold)
1042
+ cur_scores = scores[keep]
1043
+ cur_classes = labels[keep]
1044
+ cur_masks = mask_pred[keep]
1045
+ cur_mask_cls = mask_cls[keep]
1046
+ cur_mask_cls = cur_mask_cls[:, :-1]
1047
+
1048
+ cur_prob_masks = cur_scores.view(-1, 1, 1) * cur_masks
1049
+
1050
+ h, w = cur_masks.shape[-2:]
1051
+ panoptic_seg = torch.zeros((h, w), dtype=torch.int32, device=cur_masks.device)
1052
+ segments_info = []
1053
+
1054
+ current_segment_id = 0
1055
+
1056
+ if cur_masks.shape[0] == 0:
1057
+ # We didn't detect any mask :(
1058
+ return panoptic_seg, segments_info
1059
+ else:
1060
+ # take argmax
1061
+ cur_mask_ids = cur_prob_masks.argmax(0)
1062
+ stuff_memory_list = {}
1063
+ for k in range(cur_classes.shape[0]):
1064
+ pred_class = cur_classes[k].item()
1065
+ isthing = pred_class in self.metadata.thing_dataset_id_to_contiguous_id.values()
1066
+ mask_area = (cur_mask_ids == k).sum().item()
1067
+ original_area = (cur_masks[k] >= 0.5).sum().item()
1068
+ mask = (cur_mask_ids == k) & (cur_masks[k] >= 0.5)
1069
+
1070
+ if mask_area > 0 and original_area > 0 and mask.sum().item() > 0:
1071
+ if mask_area / original_area < self.overlap_threshold:
1072
+ continue
1073
+
1074
+ # merge stuff regions
1075
+ if not isthing:
1076
+ if int(pred_class) in stuff_memory_list.keys():
1077
+ panoptic_seg[mask] = stuff_memory_list[int(pred_class)]
1078
+ continue
1079
+ else:
1080
+ stuff_memory_list[int(pred_class)] = current_segment_id + 1
1081
+
1082
+ current_segment_id += 1
1083
+ panoptic_seg[mask] = current_segment_id
1084
+
1085
+ segments_info.append(
1086
+ {
1087
+ "id": current_segment_id,
1088
+ "isthing": bool(isthing),
1089
+ "category_id": int(pred_class),
1090
+ }
1091
+ )
1092
+
1093
+ return panoptic_seg, segments_info
1094
+
1095
+ def instance_inference(self, mask_cls, mask_pred, box_pred):
1096
+ # mask_pred is already processed to have the same shape as original input
1097
+ image_size = mask_pred.shape[-2:]
1098
+
1099
+ # [Q, K]
1100
+ scores = F.softmax(mask_cls, dim=-1)[:, :-1]
1101
+ labels = torch.arange(self.sem_seg_head.num_classes, device=self.device).unsqueeze(0).repeat(self.num_queries, 1).flatten(0, 1)
1102
+ # scores_per_image, topk_indices = scores.flatten(0, 1).topk(self.num_queries, sorted=False)
1103
+ scores_per_image, topk_indices = scores.flatten(0, 1).topk(self.test_topk_per_image, sorted=False)
1104
+
1105
+ labels_per_image = labels[topk_indices]
1106
+ topk_indices = (topk_indices // self.sem_seg_head.num_classes)
1107
+ # mask_pred = mask_pred.unsqueeze(1).repeat(1, self.sem_seg_head.num_classes, 1).flatten(0, 1)
1108
+ mask_pred = mask_pred[topk_indices]
1109
+ if box_pred is not None:
1110
+ box_pred = box_pred[topk_indices]
1111
+
1112
+ # if this is panoptic segmentation, we only keep the "thing" classes
1113
+ if self.panoptic_on:
1114
+ keep = torch.zeros_like(scores_per_image).bool()
1115
+ for i, lab in enumerate(labels_per_image):
1116
+ keep[i] = lab in self.metadata.thing_dataset_id_to_contiguous_id.values()
1117
+
1118
+ scores_per_image = scores_per_image[keep]
1119
+ labels_per_image = labels_per_image[keep]
1120
+ mask_pred = mask_pred[keep]
1121
+
1122
+ if box_pred is not None:
1123
+ box_pred = box_pred[keep]
1124
+
1125
+ result = Instances(image_size)
1126
+ # mask (before sigmoid)
1127
+ result.pred_masks = (mask_pred > 0).float()
1128
+ # result.pred_boxes = Boxes(torch.zeros(mask_pred.size(0), 4))
1129
+ # Uncomment the following to get boxes from masks (this is slow)
1130
+
1131
+ if box_pred is not None:
1132
+ result.pred_boxes = BitMasks(mask_pred > 0).get_bounding_boxes()
1133
+ else:
1134
+ result.pred_boxes = Boxes(torch.zeros(mask_pred.size(0), 4))
1135
+
1136
+ # calculate average mask prob
1137
+ mask_scores_per_image = (mask_pred.sigmoid().flatten(1) * result.pred_masks.flatten(1)).sum(1) / (result.pred_masks.flatten(1).sum(1) + 1e-6)
1138
+ result.scores = scores_per_image * mask_scores_per_image
1139
+ result.pred_classes = labels_per_image
1140
+
1141
+ return result
1142
+
1143
+ def prepare_targets4query(self, targets, images, topk=5):
1144
+ h_pad, w_pad = images.tensor.shape[-2:]
1145
+ new_targets = []
1146
+ new_queries = []
1147
+ for targets_per_image in targets:
1148
+ # we randomly sample maximally topk concepts
1149
+ unique_target_classes = [k for k in set(targets_per_image.gt_classes.tolist())]
1150
+ selected_target_classes = random.sample(unique_target_classes, min(topk, len(unique_target_classes)))
1151
+ new_targets_per_image = []
1152
+ new_queries_per_image = []
1153
+ for clss in selected_target_classes:
1154
+ indices = (targets_per_image.gt_classes == clss).nonzero().view(-1)
1155
+ # pad gt
1156
+ gt_masks = targets_per_image.gt_masks[indices]
1157
+ padded_masks = torch.zeros((gt_masks.shape[0], h_pad, w_pad), dtype=gt_masks.dtype, device=gt_masks.device)
1158
+ padded_masks[:, : gt_masks.shape[1], : gt_masks.shape[2]] = gt_masks
1159
+
1160
+ # convert class into concept name and then token seq
1161
+ self.sem_seg_head.predictor.lang_encoder.get_text_embeddings([BIOMED_CLASSES[clss]], name='grounding')
1162
+ query = getattr(self.sem_seg_head.predictor.lang_encoder, 'grounding_text_embeddings')
1163
+
1164
+ new_targets.append(
1165
+ {
1166
+ "labels": targets_per_image.gt_classes[indices],
1167
+ "masks": padded_masks,
1168
+ }
1169
+ )
1170
+ new_queries_per_image.append(query)
1171
+ new_queries.append(new_queries_per_image)
1172
+
1173
+ return new_targets, new_queries
1174
+
1175
+
1176
+
1177
+ @register_model
1178
+ def get_seem_model(cfg, **kwargs):
1179
+ return GeneralizedSEEM(cfg)
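Note on the refinement logic near the top of this file: the next simulated click is taken at the pixel of the dominant error region (false negatives vs. false positives) that lies farthest from the region boundary, found with a padded distance transform, then dilated into a small square; samples whose IoU already exceeds 0.925 are dropped. A minimal standalone sketch of that sampling step, using scipy.ndimage as an assumed stand-in for the torch distance transform and F.conv2d dilation used above:

import numpy as np
from scipy.ndimage import distance_transform_edt, binary_dilation

def sample_next_click(error_mask: np.ndarray, dilation: int = 3) -> np.ndarray:
    # error_mask: (H, W) bool array of wrongly predicted pixels (FN or FP region).
    if not error_mask.any():
        return np.zeros_like(error_mask, dtype=bool)
    # Pad with background so border pixels get a finite distance, as in the padded
    # distance transform above, then measure distance to the nearest background pixel.
    padded = np.pad(error_mask, 1, constant_values=False)
    dist = distance_transform_edt(padded)[1:-1, 1:-1]
    click = np.zeros_like(error_mask, dtype=bool)
    click[np.unravel_index(dist.argmax(), dist.shape)] = True
    # Grow the single pixel into a small square, mimicking the 3x3 conv dilation.
    return binary_dilation(click, structure=np.ones((dilation, dilation), dtype=bool))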
modeling/architectures/xdecoder_model.py ADDED
@@ -0,0 +1,937 @@
1
+ # --------------------------------------------------------
2
+ # X-Decoder -- Generalized Decoding for Pixel, Image, and Language
3
+ # Copyright (c) 2022 Microsoft
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # Written by Xueyan Zou ([email protected]), Ziyi Dou, Jianwei Yang
6
+ # --------------------------------------------------------
7
+
8
+ from typing import Tuple
9
+ import random
10
+
11
+ import torch
12
+ from torch import nn
13
+ from torch.nn import functional as F
14
+ import numpy as np
15
+
16
+ from timm.models.layers import trunc_normal_
17
+ from nltk.stem.lancaster import LancasterStemmer
18
+ from detectron2.structures import Boxes, ImageList, Instances, BitMasks, BoxMode
19
+ from detectron2.utils.memory import retry_if_cuda_oom
20
+ from detectron2.data import MetadataCatalog
21
+
22
+ from .build import register_model
23
+ from ..utils import configurable, get_class_names
24
+ from ..vision.backbone import build_backbone, Backbone
25
+ from ..body import build_xdecoder_head
26
+ from ..modules import sem_seg_postprocess, SetCriterion, HungarianMatcher, bbox_postprocess
27
+ from ..language import build_language_encoder
28
+ from ..language.loss import vl_similarity, image_text_contrastive_loss_queue
29
+ from utilities.prompt_engineering import prompt_engineering
30
+ from utilities.constants import COCO_PANOPTIC_CLASSES
31
+
32
+ st = LancasterStemmer()
33
+
34
+
35
+ class GeneralizedXdecoder(nn.Module):
36
+
37
+ @configurable
38
+ def __init__(
39
+ self,
40
+ *,
41
+ backbone: Backbone,
42
+ sem_seg_head: nn.Module,
43
+ criterion: nn.Module,
44
+ losses: dict,
45
+ num_queries: int,
46
+ object_mask_threshold: float,
47
+ overlap_threshold: float,
48
+ metadata,
49
+ task_switch: dict,
50
+ phrase_prob: float,
51
+ size_divisibility: int,
52
+ sem_seg_postprocess_before_inference: bool,
53
+ pixel_mean: Tuple[float],
54
+ pixel_std: Tuple[float],
55
+ # inference
56
+ semantic_on: bool,
57
+ panoptic_on: bool,
58
+ instance_on: bool,
59
+ test_topk_per_image: int,
60
+ train_dataset_name: str,
61
+ retrieval_emsemble: bool,
62
+ backbone_dim: int,
63
+ dim_proj: int,
64
+ ):
65
+ """
66
+ Args:
67
+ backbone: a backbone module, must follow detectron2's backbone interface
68
+ sem_seg_head: a module that predicts semantic segmentation from backbone features
69
+ criterion: a module that defines the loss
70
+ num_queries: int, number of queries
71
+ object_mask_threshold: float, threshold to filter query based on classification score
72
+ for panoptic segmentation inference
73
+ overlap_threshold: overlap threshold used in general inference for panoptic segmentation
74
+ metadata: dataset meta, get `thing` and `stuff` category names for panoptic
75
+ segmentation inference
76
+ size_divisibility: Some backbones require the input height and width to be divisible by a
77
+ specific integer. We can use this to override such requirement.
78
+ sem_seg_postprocess_before_inference: whether to resize the prediction back
79
+ to original input size before semantic segmentation inference or after.
80
+ For high-resolution dataset like Mapillary, resizing predictions before
81
+ inference will cause OOM error.
82
+ pixel_mean, pixel_std: list or tuple with #channels element, representing
83
+ the per-channel mean and std to be used to normalize the input image
84
+ semantic_on: bool, whether to output semantic segmentation prediction
85
+ instance_on: bool, whether to output instance segmentation prediction
86
+ panoptic_on: bool, whether to output panoptic segmentation prediction
87
+ test_topk_per_image: int, instance segmentation parameter, keep topk instances per image
88
+ """
89
+ super().__init__()
90
+ self.backbone = backbone
91
+ self.sem_seg_head = sem_seg_head
92
+ self.criterion = criterion
93
+ self.losses = losses
94
+ self.num_queries = num_queries
95
+ self.overlap_threshold = overlap_threshold
96
+ self.object_mask_threshold = object_mask_threshold
97
+ self.metadata = metadata
98
+ if size_divisibility < 0:
99
+ # use backbone size_divisibility if not set
100
+ size_divisibility = self.backbone.size_divisibility
101
+ self.size_divisibility = size_divisibility
102
+ self.sem_seg_postprocess_before_inference = sem_seg_postprocess_before_inference
103
+ self.register_buffer("pixel_mean", torch.Tensor(pixel_mean).view(-1, 1, 1), False)
104
+ self.register_buffer("pixel_std", torch.Tensor(pixel_std).view(-1, 1, 1), False)
105
+
106
+ # additional args
107
+ self.semantic_on = semantic_on
108
+ self.instance_on = instance_on
109
+ self.panoptic_on = panoptic_on
110
+
111
+ # caption argument
112
+ self.task_switch = task_switch
113
+ self.phrase_prob = phrase_prob
114
+
115
+ self.test_topk_per_image = test_topk_per_image
116
+ self.train_class_names = get_class_names(train_dataset_name)
117
+
118
+ self.retrieval_emsemble = retrieval_emsemble
119
+ # backbone itc loss
120
+ if task_switch['retrieval'] and retrieval_emsemble:
121
+ self.backbone_proj = nn.Parameter(torch.empty(backbone_dim, dim_proj))
122
+ trunc_normal_(self.backbone_proj, std=.02)
123
+
124
+ if not self.semantic_on:
125
+ assert self.sem_seg_postprocess_before_inference
126
+
127
+ @classmethod
128
+ def from_config(cls, cfg):
129
+ enc_cfg = cfg['MODEL']['ENCODER']
130
+ dec_cfg = cfg['MODEL']['DECODER']
131
+
132
+ # Loss parameters:
133
+ deep_supervision = dec_cfg['DEEP_SUPERVISION']
134
+ no_object_weight = dec_cfg['NO_OBJECT_WEIGHT']
135
+
136
+ # loss weights, switcher for task, and top layers to compute loss
137
+ loss_weights = {'mask': {'ce': dec_cfg['CLASS_WEIGHT'], 'dice': dec_cfg['DICE_WEIGHT'], 'bce': dec_cfg['MASK_WEIGHT']},
138
+ 'bbox': {'l1': dec_cfg['BBOX_WEIGHT'], 'giou': dec_cfg['GIOU_WEIGHT']},
139
+ 'caption': dec_cfg['CAPTION_WEIGHT'],
140
+ 'captioning': dec_cfg['CAPTIONING_WEIGHT'],
141
+ 'retrieval': {'decoder': dec_cfg['RETRIEVAL_WEIGHT'], 'backbone': dec_cfg['BACKBONER_WEIGHT']},
142
+ 'grounding': {'ce': dec_cfg['GCLASS_WEIGHT'], 'dice': dec_cfg['GDICE_WEIGHT'], 'bce': dec_cfg['GMASK_WEIGHT']}}
143
+
144
+ task_switch = {'bbox': dec_cfg.get('DETECTION', False),
145
+ 'mask': dec_cfg.get('MASK', True),
146
+ 'caption': dec_cfg['CAPTION'].get('ENABLED', False),
147
+ 'captioning': dec_cfg['CAPTIONING'].get('ENABLED', False),
148
+ 'retrieval': dec_cfg['RETRIEVAL'].get('ENABLED', False),
149
+ 'grounding': dec_cfg['GROUNDING'].get('ENABLED', False)}
150
+
151
+ top_x_layers = {'mask': dec_cfg.get('TOP_MASK_LAYERS', 10),
152
+ 'caption': dec_cfg.get('TOP_CAPTION_LAYERS', 10),
153
+ 'captioning': dec_cfg.get('TOP_CAPTIONING_LAYERS', 10),
154
+ 'retrieval': dec_cfg.get('TOP_RETRIEVAL_LAYERS', 10),
155
+ 'grounding': dec_cfg.get('TOP_GROUNDING_LAYERS', 10),}
156
+
157
+ # build model
158
+ extra = {'task_switch': task_switch}
159
+ backbone = build_backbone(cfg)
160
+ lang_encoder = build_language_encoder(cfg)
161
+ sem_seg_head = build_xdecoder_head(cfg, backbone.output_shape(), lang_encoder, extra)
162
+
163
+ # building criterion
164
+ matcher = HungarianMatcher(
165
+ cost_class=loss_weights['mask']['ce'],
166
+ cost_mask=loss_weights['mask']['bce'],
167
+ cost_dice=loss_weights['mask']['dice'],
168
+ num_points=dec_cfg['TRAIN_NUM_POINTS'],
169
+ )
170
+
171
+ # init weight dict and criterion loss functions.
172
+ losses = {'seg': [], 'vlp': []}
173
+ if task_switch['mask']:
174
+ losses['seg'] += ["labels", "masks"]
175
+ if task_switch['caption']:
176
+ losses['seg'] += ["captions"]
177
+ if task_switch['grounding']:
178
+ losses['seg'] += ["groundings"]
179
+ if task_switch['captioning']:
180
+ losses['vlp'] += ["captionings"]
181
+ if task_switch['retrieval']:
182
+ losses['vlp'] += ["retrievals"]
183
+
184
+ weight_dict = {}
185
+ for key, turn_on in task_switch.items():
186
+ if turn_on:
187
+ if isinstance(loss_weights[key], dict):
188
+ # HACK it should support bbox in the future
189
+ for key_, weight in loss_weights[key].items():
190
+ weight_dict["loss_{}_{}_0".format(key, key_)] = weight # NOTE: hard code for segmentation that has multiple loss
191
+ else:
192
+ weight_dict["loss_{}_0".format(key)] = loss_weights[key]
193
+
194
+ # generate full weight dict and remove not computed layers.
195
+ if deep_supervision:
196
+ dec_layers = dec_cfg['DEC_LAYERS']
197
+ aux_weight_dict = {}
198
+ for i in range(dec_layers - 1):
199
+ for k, v in weight_dict.items():
200
+ if (i+1) > (top_x_layers[k.split('_')[1]] - 1):
201
+ continue
202
+ aux_weight_dict.update({k.replace('_0', f"_{i+1}"): v})
203
+ weight_dict.update(aux_weight_dict)
204
+
205
+ grd_weight = {'text': dec_cfg['GROUNDING']['TEXT_WEIGHT'], 'class': dec_cfg['GROUNDING']['CLASS_WEIGHT']}
206
+ # generate criterion for loss function.
207
+ criterion = SetCriterion(
208
+ sem_seg_head.num_classes,
209
+ matcher=matcher,
210
+ weight_dict=weight_dict,
211
+ top_x_layers=top_x_layers,
212
+ eos_coef=no_object_weight,
213
+ losses=[],
214
+ num_points=dec_cfg['TRAIN_NUM_POINTS'],
215
+ oversample_ratio=dec_cfg['OVERSAMPLE_RATIO'],
216
+ importance_sample_ratio=dec_cfg['IMPORTANCE_SAMPLE_RATIO'],
217
+ grounding_weight=grd_weight,
218
+ )
219
+
220
+ # extra logic
221
+ train_dataset_name = cfg['DATASETS']['TRAIN'][0] # HACK for only one training set.
222
+ phrase_prob = dec_cfg['CAPTION'].get('PHRASE_PROB', 0.5)
223
+
224
+ return {
225
+ "backbone": backbone,
226
+ "sem_seg_head": sem_seg_head,
227
+ "criterion": criterion,
228
+ "losses": losses,
229
+ "num_queries": dec_cfg['NUM_OBJECT_QUERIES'],
230
+ "object_mask_threshold": dec_cfg['TEST']['OBJECT_MASK_THRESHOLD'],
231
+ "overlap_threshold": dec_cfg['TEST']['OVERLAP_THRESHOLD'],
232
+ "metadata": MetadataCatalog.get(cfg['DATASETS']['TRAIN'][0]),
233
+ "size_divisibility": dec_cfg['SIZE_DIVISIBILITY'],
234
+ "sem_seg_postprocess_before_inference": (
235
+ dec_cfg['TEST']['SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE']
236
+ or dec_cfg['TEST']['PANOPTIC_ON']
237
+ or dec_cfg['TEST']['INSTANCE_ON']
238
+ ),
239
+ "pixel_mean": cfg['INPUT']['PIXEL_MEAN'],
240
+ "pixel_std": cfg['INPUT']['PIXEL_STD'],
241
+ "task_switch": task_switch,
242
+ "phrase_prob": phrase_prob,
243
+ # inference
244
+ "semantic_on": dec_cfg['TEST']['SEMANTIC_ON'],
245
+ "instance_on": dec_cfg['TEST']['INSTANCE_ON'],
246
+ "panoptic_on": dec_cfg['TEST']['PANOPTIC_ON'],
247
+ "test_topk_per_image": cfg['COCO']['TEST']['DETECTIONS_PER_IMAGE'],
248
+ "train_dataset_name": train_dataset_name,
249
+ "retrieval_emsemble": dec_cfg['RETRIEVAL']['ENSEMBLE'],
250
+ "backbone_dim": cfg['MODEL']['BACKBONE_DIM'],
251
+ "dim_proj": cfg['MODEL']['DIM_PROJ'],
252
+ }
253
+
254
+ @property
255
+ def device(self):
256
+ return self.pixel_mean.device
257
+
258
+ def forward(self, batched_inputs, mode=None):
259
+ """
260
+ Args:
261
+ batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
262
+ Each item in the list contains the inputs for one image.
263
+ For now, each item in the list is a dict that contains:
264
+ * "image": Tensor, image in (C, H, W) format.
265
+ * "instances": per-region ground truth
266
+ * Other information that's included in the original dicts, such as:
267
+ "height", "width" (int): the output resolution of the model (may be different
268
+ from input resolution), used in inference.
269
+ Returns:
270
+ list[dict]:
271
+ each dict has the results for one image. The dict contains the following keys:
272
+
273
+ * "sem_seg":
274
+ A Tensor that represents the
275
+ per-pixel segmentation predicted by the head.
276
+ The prediction has shape KxHxW that represents the logits of
277
+ each class for each pixel.
278
+ * "panoptic_seg":
279
+ A tuple that represents the panoptic output
280
+ panoptic_seg (Tensor): of shape (height, width) where the values are ids for each segment.
281
+ segments_info (list[dict]): Describe each segment in `panoptic_seg`.
282
+ Each dict contains keys "id", "category_id", "isthing".
283
+ """
284
+ if self.training:
285
+ losses = {}
286
+ if self.task_switch['mask']:
287
+ losses_seg = self.forward_seg(batched_inputs['coco'])
288
+ losses.update(losses_seg)
289
+ if self.task_switch['retrieval'] or self.task_switch['captioning']:
290
+ losses_vlp = self.forward_vlp(batched_inputs['vlp'])
291
+ losses.update(losses_vlp)
292
+ for k in list(losses.keys()):
293
+ if k in self.criterion.weight_dict:
294
+ losses[k] *= self.criterion.weight_dict[k]
295
+ else: # remove this loss if not specified in `weight_dict`
296
+ losses.pop(k)
297
+ return losses
298
+ else:
299
+ if mode == 'retrieval':
300
+ return self.evaluate_retrieval(batched_inputs)
301
+ elif mode == 'captioning':
302
+ return self.evaluate_captioning(batched_inputs)
303
+ elif mode == 'classification':
304
+ return self.evaluate_classification(batched_inputs)
305
+ elif mode == 'grounding_refcoco':
306
+ return self.evaluate_grounding(batched_inputs, mode)
307
+ else:
308
+ return self.evaluate(batched_inputs)
309
+
310
+
311
+ def forward_seg(self, batched_inputs):
312
+ images = [x["image"].to(self.device) for x in batched_inputs]
313
+ images = [(x - self.pixel_mean) / self.pixel_std for x in images]
314
+ images = ImageList.from_tensors(images, self.size_divisibility)
315
+
316
+ self.sem_seg_head.predictor.lang_encoder.get_text_embeddings(self.train_class_names, is_eval=False)
317
+
318
+ extra = {}
319
+ # mask classification target
320
+ if "instances" in batched_inputs[0]:
321
+ # input bounding box is checked to be correct.
322
+ targets = self.prepare_targets(batched_inputs, images)
323
+
324
+ if self.task_switch['grounding']:
325
+ grounding_tokens = [x['grounding_query_embs'] for x in targets] # need to pad for more than one grounding token
326
+ grounding_tokens = nn.utils.rnn.pad_sequence(grounding_tokens)
327
+ extra['grounding_tokens'] = grounding_tokens
328
+
329
+ features = self.backbone(images.tensor)
330
+ outputs = self.sem_seg_head(features, extra=extra)
331
+
332
+ _outputs = {}
333
+ for key, value in outputs.items():
334
+ if key == 'pred_logits':
335
+ _outputs[key] = value[:,:self.num_queries-1]
336
+ elif key == 'pred_masks':
337
+ _outputs[key] = value[:,:self.num_queries-1]
338
+ if self.task_switch['grounding']:
339
+ _outputs['pred_gmasks'] = value[:,self.num_queries:2*self.num_queries-1]
340
+ elif key == 'pred_captions':
341
+ _outputs[key] = value[:,:self.num_queries-1]
342
+ if self.task_switch['grounding']:
343
+ _outputs['pred_gtexts'] = value[:,self.num_queries:2*self.num_queries-1]
344
+ elif key == 'aux_outputs':
345
+ _outputs[key] = []
346
+ for i in range(len(value)):
347
+ _outputs[key] += [{}]
348
+ for _key, _value in value[i].items():
349
+ if _key == 'pred_logits':
350
+ _outputs[key][i][_key] = _value[:,:self.num_queries-1]
351
+ elif _key == 'pred_masks':
352
+ _outputs[key][i][_key] = _value[:,:self.num_queries-1]
353
+ if self.task_switch['grounding']:
354
+ _outputs[key][i]['pred_gmasks'] = _value[:,self.num_queries:2*self.num_queries-1]
355
+ elif _key == 'pred_captions':
356
+ _outputs[key][i][_key] = _value[:,:self.num_queries-1]
357
+ if self.task_switch['grounding']:
358
+ _outputs[key][i]['pred_gtexts'] = _value[:,self.num_queries:2*self.num_queries-1]
359
+ outputs = _outputs
360
+
361
+ extra = {'lang_logit': self.sem_seg_head.predictor.lang_encoder.logit_scale,
362
+ 'class_embeddings': getattr(self.sem_seg_head.predictor.lang_encoder, '{}_text_embeddings'.format('default'))}
363
+
364
+ # bipartite matching-based loss
365
+ self.criterion.losses = self.losses['seg'] # seg criterion losses
366
+ losses = self.criterion(outputs, targets, extra)
367
+
368
+ del outputs
369
+ del _outputs
370
+ return losses
371
+
372
+ def forward_vlp(self, batched_inputs):
373
+ images = [x["image"].to(self.device) for x in batched_inputs]
374
+ images = [(x - self.pixel_mean) / self.pixel_std for x in images]
375
+ images = ImageList.from_tensors(images, self.size_divisibility)
376
+ targets_vlp = self.prepare_vlp_targets(batched_inputs, images.tensor.device)
377
+
378
+ extra = {"token_embedding": self.sem_seg_head.predictor.lang_encoder.lang_encoder.token_embedding,
379
+ "lang_encoder": self.sem_seg_head.predictor.lang_encoder,
380
+ "training": self.training}
381
+
382
+ features = self.backbone(images.tensor)
383
+ outputs = self.sem_seg_head(features, target_queries=None, target_vlp=targets_vlp, task='vlp', extra=extra)
384
+
385
+ for key, value in outputs.items():
386
+ if key == 'pred_captionings':
387
+ outputs[key] = value
388
+ elif key == 'pred_captions':
389
+ # outputs[key] = value[:,-1:]
390
+ outputs[key] = value
391
+ elif key == 'aux_outputs':
392
+ outputs[key] = []
393
+ for i in range(len(value)):
394
+ outputs[key] += [{}]
395
+ for _key, _value in value[i].items():
396
+ if _key == 'pred_captions':
397
+ # outputs[key][i][_key] = _value[:,-1:]
398
+ outputs[key][i][_key] = _value
399
+ elif _key == 'pred_captionings':
400
+ outputs[key][i][_key] = _value
401
+
402
+ self.criterion.losses = self.losses['vlp'] # seg criterion losses
403
+ losses = self.criterion.forward_vlp(outputs, targets_vlp, extra)
404
+ del outputs
405
+
406
+ if self.task_switch['retrieval'] and self.retrieval_emsemble:
407
+ # compute backbone vlp.
408
+ v_emb = features['res5']
409
+ bs,nc,_,_ = v_emb.shape
410
+ v_emb = v_emb.reshape(bs,nc,-1)
411
+ v_emb = F.adaptive_avg_pool1d(v_emb, 1).reshape(bs,nc) @ self.backbone_proj
412
+ t_emb = torch.cat([x['caption_proj'] for x in targets_vlp], dim=0)
413
+ loss_contrast = image_text_contrastive_loss_queue(v_emb, t_emb, self.sem_seg_head.predictor.lang_encoder, None)
414
+ losses['loss_retrieval_backbone_0'] = loss_contrast
415
+ return losses
416
+
417
+ def evaluate(self, batched_inputs):
418
+ images = [x["image"].to(self.device) for x in batched_inputs]
419
+ images = [(x - self.pixel_mean) / self.pixel_std for x in images]
420
+
421
+ images = ImageList.from_tensors(images, self.size_divisibility)
422
+ img_bs = images.tensor.shape[0]
423
+
424
+ targets = targets_grounding = queries_grounding = None
425
+ features = self.backbone(images.tensor)
426
+ outputs = self.sem_seg_head(features, target_queries=queries_grounding)
427
+
428
+ mask_cls_results = outputs["pred_logits"]
429
+ mask_pred_results = outputs["pred_masks"]
430
+ box_pred_results = outputs["pred_boxes"] if self.task_switch['bbox'] else [None for i in range(len(mask_pred_results))]
431
+ caption_pred_results = outputs["pred_captions"] if self.task_switch['caption'] else [None for i in range(len(mask_pred_results))]
432
+
433
+ # upsample masks
434
+ mask_pred_results = F.interpolate(
435
+ mask_pred_results,
436
+ size=(images.tensor.shape[-2], images.tensor.shape[-1]),
437
+ mode="bicubic",
438
+ align_corners=False,
439
+ antialias=True
440
+ )
441
+
442
+ input_size = mask_pred_results.shape[-2:]
443
+ keep_sem_bgd = self.metadata.keep_sem_bgd if hasattr(self.metadata, 'keep_sem_bgd') else False
444
+ del outputs
445
+
446
+ processed_results = []
447
+ for mask_cls_result, mask_pred_result, box_pred_result, caption_pred_result, input_per_image, image_size in zip(
448
+ mask_cls_results, mask_pred_results, box_pred_results, caption_pred_results, batched_inputs, images.image_sizes
449
+ ):
450
+ height = input_per_image.get("height", image_size[0])
451
+ width = input_per_image.get("width", image_size[1])
452
+ processed_results.append({})
453
+
454
+ if self.sem_seg_postprocess_before_inference:
455
+ mask_pred_result = retry_if_cuda_oom(sem_seg_postprocess)(
456
+ mask_pred_result, image_size, height, width
457
+ )
458
+ mask_cls_result = mask_cls_result.to(mask_pred_result)
459
+
460
+ # semantic segmentation inference
461
+ if self.semantic_on:
462
+ r = retry_if_cuda_oom(self.semantic_inference)(mask_cls_result, mask_pred_result, keep_sem_bgd)
463
+ if not self.sem_seg_postprocess_before_inference:
464
+ r = retry_if_cuda_oom(sem_seg_postprocess)(r, image_size, height, width)
465
+ processed_results[-1]["sem_seg"] = r
466
+
467
+ # panoptic segmentation inference
468
+ if self.panoptic_on:
469
+ panoptic_r = retry_if_cuda_oom(self.panoptic_inference)(mask_cls_result, mask_pred_result)
470
+ processed_results[-1]["panoptic_seg"] = panoptic_r
471
+
472
+ # instance segmentation inference
473
+ if self.instance_on:
474
+ if self.task_switch['bbox']:
475
+ box_pred_result = bbox_postprocess(box_pred_result, input_size, image_size, height, width)
476
+ instance_r = retry_if_cuda_oom(self.instance_inference)(mask_cls_result, mask_pred_result, box_pred_result)
477
+ processed_results[-1]["instances"] = instance_r
478
+ if self.task_switch['caption']:
479
+ processed_results[-1]["captions"] = caption_pred_result
480
+ processed_results[-1]["masks"] = mask_pred_result
481
+
482
+ return processed_results
483
+
484
+ def evaluate_retrieval(self, batched_inputs):
485
+ images = [x["image"].to(self.device) for x in batched_inputs]
486
+ images = [(x - self.pixel_mean) / self.pixel_std for x in images]
487
+ images = ImageList.from_tensors(images, self.size_divisibility)
488
+ img_bs = images.tensor.shape[0]
489
+
490
+ targets = targets_grounding = queries_grounding = None
491
+ features = self.backbone(images.tensor)
492
+ outputs = self.sem_seg_head(features, target_queries=queries_grounding)
493
+ v_emb_it = outputs['pred_captions'][:,-1]
494
+
495
+ # compute backbone score
496
+ if self.task_switch['retrieval'] and self.retrieval_emsemble:
497
+ _v_emb_it = features['res5']
498
+ bs,nc,_,_ = _v_emb_it.shape
499
+ _v_emb_it = _v_emb_it.reshape(bs,nc,-1)
500
+ _v_emb_it = F.adaptive_avg_pool1d(_v_emb_it, 1).reshape(bs,nc) @ self.backbone_proj
501
+
502
+ processed_results = []
503
+ for idx, batch_data in enumerate(batched_inputs):
504
+ caption_ids = []
505
+ t_emb_its = []
506
+ processed_results.append({})
507
+ for caption in batch_data['captions']:
508
+ lang_results = self.sem_seg_head.predictor.lang_encoder.get_text_token_embeddings(caption)
509
+ t_emb_it = lang_results['class_emb']
510
+ caption_ids.append(batch_data['image_id'])
511
+ t_emb_its.append(t_emb_it)
512
+
513
+ t_emb_it = torch.cat(t_emb_its, dim=0)
514
+
515
+ image_embeds = [v_emb_it[idx].unsqueeze(0)]
516
+ if self.task_switch['retrieval'] and self.retrieval_emsemble:
517
+ image_embeds += [_v_emb_it[idx].unsqueeze(0)]
518
+ caption_results = {
519
+ 'image_embeds': image_embeds,
520
+ 'text_embeds': t_emb_it,
521
+ 'caption_ids': caption_ids,
522
+ 'image_ids': batch_data['image_id'],
523
+ }
524
+ processed_results[-1]["caption"] = caption_results
525
+
526
+ del features
527
+ return processed_results
528
+
529
+ def evaluate_captioning(self, batched_inputs):
530
+ images = [x["image"].to(self.device) for x in batched_inputs]
531
+ images = [(x - self.pixel_mean) / self.pixel_std for x in images]
532
+ images = ImageList.from_tensors(images, self.size_divisibility)
533
+ img_bs = images.tensor.shape[0]
534
+
535
+ if not hasattr(self, 'start_token'):
536
+ self.start_token = torch.tensor([[49406]*77], device=self.device)
537
+
538
+ targets = targets_grounding = queries_grounding = None
539
+ features = self.backbone(images.tensor)
540
+
541
+ captioning_mask = None
542
+ if 'captioning_mask' in batched_inputs[-1]:
543
+ captioning_mask = torch.cat([x['captioning_mask'] for x in batched_inputs])
544
+
545
+ outputs = self.sem_seg_head(features, target_queries=queries_grounding, task='captioning_infer', extra={'start_token': self.start_token, 'captioning_mask': captioning_mask})
546
+
547
+ processed_results = []
548
+ for idx, batch_data in enumerate(batched_inputs):
549
+ processed_results.append({})
550
+ processed_results[-1]["captioning_token"] = outputs['pred_captionings'][idx]
551
+ processed_results[-1]["captioning_text"] = outputs['pred_texts'][idx].split('.')[0]
552
+ processed_results[-1]["image_id"] = batched_inputs[idx]['image_id']
553
+
554
+ return processed_results
555
+
556
+ def evaluate_classification(self, batched_inputs):
557
+ images = [x["image"].to(self.device) for x in batched_inputs]
558
+ images = [(x - self.pixel_mean) / self.pixel_std for x in images]
559
+ images = ImageList.from_tensors(images, self.size_divisibility)
560
+ img_bs = images.tensor.shape[0]
561
+
562
+ targets = targets_grounding = queries_grounding = None
563
+ features = self.backbone(images.tensor)
564
+ outputs = self.sem_seg_head(features, target_queries=queries_grounding)
565
+
566
+ processed_results = []
567
+ for idx, batch_data in enumerate(batched_inputs):
568
+ processed_results.append({})
569
+ processed_results[-1]["pred_class"] = outputs['pred_logits'][idx,-1]
570
+ return processed_results
571
+
572
+ def evaluate_grounding_baseline(self, batched_inputs, mode):
573
+ images = [x["image"].to(self.device) for x in batched_inputs]
574
+ images = [(x - self.pixel_mean) / self.pixel_std for x in images]
575
+ images = ImageList.from_tensors(images, self.size_divisibility)
576
+ img_bs = images.tensor.shape[0]
577
+
578
+ targets = targets_grounding = queries_grounding = None
579
+ features = self.backbone(images.tensor)
580
+ outputs = self.sem_seg_head(features, target_queries=queries_grounding)
581
+
582
+ mask_pred_results = outputs["pred_masks"]
583
+ caption_pred_results = outputs["pred_captions"] if self.task_switch['caption'] else [None for i in range(len(mask_pred_results))]
584
+
585
+ # upsample masks
586
+ mask_pred_results = F.interpolate(
587
+ mask_pred_results,
588
+ size=(images.tensor.shape[-2], images.tensor.shape[-1]),
589
+ mode="bicubic",
590
+ align_corners=False,
591
+ antialias=True
592
+ )
593
+
594
+ processed_results = []
595
+ for mask_pred_result, caption_pred_result, input_per_image, image_size in zip(
596
+ mask_pred_results, caption_pred_results, batched_inputs, images.image_sizes
597
+ ):
598
+ height = input_per_image.get("height", image_size[0])
599
+ width = input_per_image.get("width", image_size[1])
600
+ processed_results.append({})
601
+
602
+ mask_pred_result = retry_if_cuda_oom(sem_seg_postprocess)(
603
+ mask_pred_result, image_size, height, width
604
+ )[:-1]
605
+
606
+ texts_all = input_per_image['groundings']['texts']
607
+ grd_masks = []
608
+ for texts in texts_all:
609
+ if mode == 'grounding_refcoco':
610
+ self.sem_seg_head.predictor.lang_encoder.get_text_embeddings(texts, name='grounding', prompt=False, is_eval=True)
611
+ elif mode == 'grounding_phrasecut':
612
+ self.sem_seg_head.predictor.lang_encoder.get_text_embeddings(texts, name='grounding', prompt=True, is_eval=False)
613
+ t_emb = getattr(self.sem_seg_head.predictor.lang_encoder, "{}_text_embeddings".format('grounding')).t()
614
+ v_emb = caption_pred_result[:-1]
615
+ v_emb = v_emb / (v_emb.norm(dim=-1, keepdim=True) + 1e-7)
616
+ vt_sim = v_emb @ t_emb
617
+ max_id = vt_sim.max(0)[1][0]
618
+ grd_masks += [mask_pred_result[max_id]]
619
+ processed_results[-1]['grounding_mask'] = torch.stack(grd_masks)
620
+
621
+ return processed_results
622
+
623
+ def evaluate_grounding(self, batched_inputs, mode):
624
+ images = [x["image"].to(self.device) for x in batched_inputs]
625
+ images = [(x - self.pixel_mean) / self.pixel_std for x in images]
626
+ images = ImageList.from_tensors(images, self.size_divisibility)
627
+
628
+ extra = {}
629
+ # mask_pred_results = []
630
+ # for idx, batch_per_image in enumerate(batched_inputs):
631
+ # grd_texts = batch_per_image['groundings']['texts']
632
+ # grd_masks = []
633
+ # for anno_text in grd_texts:
634
+ # gtext = self.sem_seg_head.predictor.lang_encoder.get_text_token_embeddings([anno_text[0]], name='grounding', token=False, norm=False)
635
+ # token_emb = gtext['token_emb']
636
+ # tokens = gtext['tokens']
637
+
638
+ # grd_emb = token_emb[0][tokens['attention_mask'].bool()[0]]
639
+ # extra['grounding_tokens'] = grd_emb[:,None]
640
+
641
+ # assert len(images.tensor) == 1, "grounding evaluation only support single batch size now"
642
+ # features = self.backbone(images.tensor)
643
+ # outputs = self.sem_seg_head(features, extra=extra, task='grounding_eval')
644
+
645
+ # pred_gmasks = outputs['pred_masks'][idx,self.num_queries:2*self.num_queries-1]
646
+ # v_emb = outputs['pred_captions'][idx,self.num_queries:2*self.num_queries-1]
647
+ # t_emb = grd_emb[-1:]
648
+
649
+ # t_emb = t_emb / (t_emb.norm(dim=-1, keepdim=True) + 1e-7)
650
+ # v_emb = v_emb / (v_emb.norm(dim=-1, keepdim=True) + 1e-7)
651
+
652
+ # temperature = self.sem_seg_head.predictor.lang_encoder.logit_scale
653
+ # out_prob = vl_similarity(v_emb, t_emb, temperature=temperature)
654
+
655
+ # matched_id = out_prob.max(0)[1]
656
+ # grd_masks += [pred_gmasks[matched_id,:,:]]
657
+ # mask_pred_results += [torch.cat(grd_masks)]
658
+
659
+ # multi-object inference path (the single-phrase loop above is kept commented out).
660
+ mask_pred_results = []
661
+ for idx, batch_per_image in enumerate(batched_inputs):
662
+ grd_texts = batch_per_image['groundings']['texts']
663
+ grd_texts = [x[0] for x in grd_texts]
664
+
665
+ gtext = self.sem_seg_head.predictor.lang_encoder.get_text_token_embeddings(grd_texts, name='grounding', token=False, norm=False)
666
+ token_emb = gtext['token_emb']
667
+ tokens = gtext['tokens']
668
+ query_emb = token_emb[tokens['attention_mask'].bool()]
669
+ extra['grounding_tokens'] = query_emb[:,None]
670
+
671
+ features = self.backbone(images.tensor)
672
+ outputs = self.sem_seg_head(features, extra=extra, task='grounding_eval')
673
+
674
+ pred_gmasks = outputs['pred_masks'][idx,self.num_queries:2*self.num_queries-1]
675
+ v_emb = outputs['pred_captions'][idx,self.num_queries:2*self.num_queries-1]
676
+ t_emb = gtext['class_emb']
677
+
678
+ t_emb = t_emb / (t_emb.norm(dim=-1, keepdim=True) + 1e-7)
679
+ v_emb = v_emb / (v_emb.norm(dim=-1, keepdim=True) + 1e-7)
680
+
681
+ temperature = self.sem_seg_head.predictor.lang_encoder.logit_scale
682
+ out_prob = vl_similarity(v_emb, t_emb, temperature=temperature)
683
+
684
+ matched_id = out_prob.max(0)[1]
685
+ mask_pred_results += [pred_gmasks[matched_id,:,:]]
686
+
687
+ for i in range(len(mask_pred_results)):
688
+ # upsample masks
689
+ mask_pred_results[i] = F.interpolate(
690
+ mask_pred_results[i][None,],
691
+ size=(images.tensor.shape[-2], images.tensor.shape[-1]),
692
+ mode="bicubic",
693
+ align_corners=False,
694
+ antialias=True
695
+ )[0]
696
+
697
+ processed_results = []
698
+ for mask_pred_result, input_per_image, image_size in zip(
699
+ mask_pred_results, batched_inputs, images.image_sizes
700
+ ):
701
+ height = input_per_image.get("height", image_size[0])
702
+ width = input_per_image.get("width", image_size[1])
703
+ processed_results.append({})
704
+
705
+ mask_pred_result = retry_if_cuda_oom(sem_seg_postprocess)(
706
+ mask_pred_result, image_size, height, width
707
+ )
708
+ processed_results[-1]['grounding_mask'] = mask_pred_result
709
+
710
+ # compute bbox
711
+ # bbox = BitMasks(mask_pred_result > 0).get_bounding_boxes()
712
+ # bbox = BoxMode.convert(bbox.tensor, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
713
+ # processed_results[-1]['grounding_box'] = bbox
714
+
715
+ return processed_results
716
+
717
+ def prepare_vlp_targets(self, batched_inputs, device):
718
+ input_ids = []
719
+ attention_mask = []
720
+ for cnt, x in enumerate(batched_inputs):
721
+ captions = x['captions']
722
+ randid = random.randint(0, len(captions)-1)
723
+ input_ids += x['tokens']['input_ids'][randid:randid+1]
724
+ attention_mask += x['tokens']['attention_mask'][randid:randid+1]
725
+
726
+ input_ids = torch.stack(input_ids)
727
+ attention_mask = torch.stack(attention_mask)
728
+ tokens = {"input_ids": input_ids, "attention_mask": attention_mask}
729
+ lang_results = self.sem_seg_head.predictor.lang_encoder.get_text_token_embeddings(tokens, token=True)
730
+
731
+ target_vlp = []
732
+ for cnt, x in enumerate(batched_inputs):
733
+ target_dict = {}
734
+ target_dict["caption_tokens"] = lang_results['token_emb'][cnt:cnt+1]
735
+ target_dict["caption_proj"] = lang_results['class_emb'][cnt:cnt+1]
736
+ target_dict["caption_tokenids"] = lang_results['tokens']['input_ids'][cnt:cnt+1]
737
+ target_dict["caption_mask"] = lang_results['tokens']['attention_mask'][cnt:cnt+1]
738
+ target_vlp.append(target_dict)
739
+ return target_vlp
740
+
741
+ def prepare_targets(self, batched_inputs, images):
742
+ h_pad, w_pad = images.tensor.shape[-2:]
743
+ new_targets = []
744
+ for idx, batch_per_image in enumerate(batched_inputs):
745
+ targets_per_image = batch_per_image["instances"].to(self.device)
746
+
747
+ # pad gt
748
+ gt_masks = targets_per_image.gt_masks
749
+ padded_masks = torch.zeros((gt_masks.shape[0], h_pad, w_pad), dtype=gt_masks.dtype, device=gt_masks.device)
750
+ padded_masks[:, : gt_masks.shape[1], : gt_masks.shape[2]] = gt_masks
751
+
752
+ gt_boxes = targets_per_image.gt_boxes.tensor
753
+ ratio = torch.tensor([w_pad,h_pad,w_pad,h_pad]).to(gt_boxes.device)[None,:]
754
+ gt_boxes = gt_boxes / ratio
755
+ xc,yc,w,h = (gt_boxes[:,0] + gt_boxes[:,2])/2, (gt_boxes[:,1] + gt_boxes[:,3])/2, gt_boxes[:,2] - gt_boxes[:,0], gt_boxes[:,3] - gt_boxes[:,1]
756
+ gt_boxes = torch.stack([xc,yc,w,h]).permute(1,0)
757
+
758
+ target_dict = {
759
+ "labels": targets_per_image.gt_classes,
760
+ "is_things": targets_per_image.is_things,
761
+ "masks": padded_masks,
762
+ "boxes": gt_boxes
763
+ }
764
+
765
+ if self.task_switch['caption']:
766
+ caption = batch_per_image["captions"]
767
+ caption_noun = batch_per_image["captions_noun"]
768
+ rand_index = random.randint(0, len(caption)-1)
769
+
770
+ text = caption[rand_index]
771
+ nouns = caption_noun[rand_index]
772
+ noun_captions = [prompt_engineering(noun, topk=10000, suffix='.') for noun in nouns] + [text]
773
+
774
+ self.sem_seg_head.predictor.lang_encoder.get_text_embeddings(noun_captions, is_eval=False, name='caption_noun', prompt=False)
775
+ ctext = getattr(self.sem_seg_head.predictor.lang_encoder, '{}_text_embeddings'.format('caption_noun'))
776
+ target_dict["captions"] = ctext
777
+
778
+ target_dict["captions_hash"] = [(hash(st.stem(txt)) % 10**16) for txt in (nouns + [text])]
779
+ target_dict["labels_hash"] = [(hash(st.stem(COCO_PANOPTIC_CLASSES[label_id].replace('-other','').replace('-merged','').replace('-stuff',''))) % 10**16) for label_id in target_dict['labels']]
780
+
781
+ if self.task_switch['grounding']:
782
+ grd_masks = batch_per_image['groundings']['masks']
783
+ grd_texts = batch_per_image['groundings']['texts']
784
+ grd_hash = batch_per_image['groundings']['hash']
785
+ grd_task = batch_per_image['groundings']['mode']
786
+
787
+ if len(grd_masks) == 0:
788
+ padded_masks = None
789
+ else:
790
+ padded_masks = torch.zeros((grd_masks.shape[0], h_pad, w_pad), dtype=grd_masks.dtype, device=grd_masks.device)
791
+ padded_masks[:, : grd_masks.shape[1], : grd_masks.shape[2]] = grd_masks
792
+
793
+ gtext = self.sem_seg_head.predictor.lang_encoder.get_text_token_embeddings(grd_texts, name='grounding', token=False, norm=False)
794
+ token_emb = gtext['token_emb']
795
+ tokens = gtext['tokens']
796
+
797
+ unique_hash_id = np.unique(grd_hash, return_index=True)[1]
798
+ selected_mask = np.zeros(len(grd_hash)).astype(bool)  # np.bool alias removed in modern NumPy
799
+ selected_mask[unique_hash_id] = True
800
+
801
+ selected_token_emb = token_emb[selected_mask]
802
+ selected_attn_mask = tokens['attention_mask'][selected_mask]
803
+ query_emb = selected_token_emb[selected_attn_mask.bool()]
804
+
805
+ class_idx = tokens['attention_mask'].sum(dim=-1) - 1
806
+ class_idx = torch.stack((torch.arange(len(class_idx), device=class_idx.device), class_idx)).tolist()
807
+ class_emb = token_emb[class_idx]
808
+
809
+ target_dict['grounding_masks'] = padded_masks
810
+ target_dict['grounding_query_embs'] = query_emb
811
+ target_dict['grounding_class_embs'] = class_emb
812
+ target_dict['grounding_hash'] = grd_hash
813
+ target_dict['grounding_task'] = grd_task
814
+
815
+ new_targets.append(target_dict)
816
+ return new_targets
817
+
818
+ def semantic_inference(self, mask_cls, mask_pred, keep_sem_bgd=False):
819
+ if keep_sem_bgd:
820
+ mask_cls = F.softmax(mask_cls, dim=-1)
821
+ else:
822
+ mask_cls = F.softmax(mask_cls, dim=-1)[..., :-1]
823
+ mask_pred = mask_pred.sigmoid()
824
+ semseg = torch.einsum("qc,qhw->chw", mask_cls, mask_pred)
825
+ return semseg
826
+
827
+ def panoptic_inference(self, mask_cls, mask_pred):
828
+ scores, labels = F.softmax(mask_cls, dim=-1).max(-1)
829
+ mask_pred = mask_pred.sigmoid()
830
+
831
+ keep = labels.ne(self.sem_seg_head.num_classes) & (scores > self.object_mask_threshold)
832
+ cur_scores = scores[keep]
833
+ cur_classes = labels[keep]
834
+ cur_masks = mask_pred[keep]
835
+ cur_mask_cls = mask_cls[keep]
836
+ cur_mask_cls = cur_mask_cls[:, :-1]
837
+ cur_prob_masks = cur_scores.view(-1, 1, 1) * cur_masks
838
+
839
+ h, w = cur_masks.shape[-2:]
840
+ panoptic_seg = torch.zeros((h, w), dtype=torch.int32, device=cur_masks.device)
841
+ segments_info = []
842
+
843
+ current_segment_id = 0
844
+
845
+ if cur_masks.shape[0] == 0:
846
+ # We didn't detect any mask :(
847
+ return panoptic_seg, segments_info
848
+ else:
849
+ # take argmax
850
+ cur_mask_ids = cur_prob_masks.argmax(0)
851
+ stuff_memory_list = {}
852
+ thing_dataset_id_to_contiguous_id = self.metadata.thing_dataset_id_to_contiguous_id if hasattr(self.metadata, 'thing_dataset_id_to_contiguous_id') else {}
853
+ for k in range(cur_classes.shape[0]):
854
+ pred_class = cur_classes[k].item()
855
+ isthing = pred_class in thing_dataset_id_to_contiguous_id.values()
856
+ mask_area = (cur_mask_ids == k).sum().item()
857
+ original_area = (cur_masks[k] >= 0.5).sum().item()
858
+ mask = (cur_mask_ids == k) & (cur_masks[k] >= 0.5)
859
+
860
+ if mask_area > 0 and original_area > 0 and mask.sum().item() > 0:
861
+ if mask_area / original_area < self.overlap_threshold:
862
+ continue
863
+
864
+ # merge stuff regions
865
+ if not isthing:
866
+ if int(pred_class) in stuff_memory_list.keys():
867
+ panoptic_seg[mask] = stuff_memory_list[int(pred_class)]
868
+ continue
869
+ else:
870
+ stuff_memory_list[int(pred_class)] = current_segment_id + 1
871
+
872
+ current_segment_id += 1
873
+ panoptic_seg[mask] = current_segment_id
874
+
875
+ segments_info.append(
876
+ {
877
+ "id": current_segment_id,
878
+ "isthing": bool(isthing),
879
+ "category_id": int(pred_class),
880
+ }
881
+ )
882
+ return panoptic_seg, segments_info
883
+
884
+ def instance_inference(self, mask_cls, mask_pred, box_pred):
885
+ # mask_pred is already processed to have the same shape as original input
886
+ image_size = mask_pred.shape[-2:]
887
+
888
+ # [Q, K]
889
+ scores = F.softmax(mask_cls, dim=-1)[:, :-1]
890
+ labels = torch.arange(self.sem_seg_head.num_classes, device=self.device).unsqueeze(0).repeat(self.num_queries, 1).flatten(0, 1)
891
+ # scores_per_image, topk_indices = scores.flatten(0, 1).topk(self.num_queries, sorted=False)
892
+ scores_per_image, topk_indices = scores.flatten(0, 1).topk(self.test_topk_per_image, sorted=False)
893
+
894
+ labels_per_image = labels[topk_indices]
895
+ topk_indices = (topk_indices // self.sem_seg_head.num_classes)
896
+ # mask_pred = mask_pred.unsqueeze(1).repeat(1, self.sem_seg_head.num_classes, 1).flatten(0, 1)
897
+ mask_pred = mask_pred[topk_indices]
898
+ if box_pred is not None:
899
+ box_pred = box_pred[topk_indices]
900
+
901
+ # if this is panoptic segmentation, we only keep the "thing" classes
902
+ if self.panoptic_on:
903
+ thing_dataset_id_to_contiguous_id = self.metadata.thing_dataset_id_to_contiguous_id if hasattr(self.metadata, 'thing_dataset_id_to_contiguous_id') else {}
904
+ keep = torch.zeros_like(scores_per_image).bool()
905
+ for i, lab in enumerate(labels_per_image):
906
+ keep[i] = lab in thing_dataset_id_to_contiguous_id.values()
907
+
908
+ scores_per_image = scores_per_image[keep]
909
+ labels_per_image = labels_per_image[keep]
910
+ mask_pred = mask_pred[keep]
911
+
912
+ if box_pred is not None:
913
+ box_pred = box_pred[keep]
914
+
915
+ result = Instances(image_size)
916
+ # mask (before sigmoid)
917
+ result.pred_masks = (mask_pred > 0).float()
918
+ # result.pred_boxes = Boxes(torch.zeros(mask_pred.size(0), 4))
919
+ # Uncomment the following to get boxes from masks (this is slow)
920
+
921
+ if box_pred is not None:
922
+ result.pred_boxes = BitMasks(mask_pred > 0).get_bounding_boxes()
923
+ else:
924
+ result.pred_boxes = Boxes(torch.zeros(mask_pred.size(0), 4))
925
+
926
+ # calculate average mask prob
927
+ mask_scores_per_image = (mask_pred.sigmoid().flatten(1) * result.pred_masks.flatten(1)).sum(1) / (result.pred_masks.flatten(1).sum(1) + 1e-6)
928
+ result.scores = scores_per_image * mask_scores_per_image
929
+ result.pred_classes = labels_per_image
930
+
931
+ return result
932
+
933
+
934
+
935
+ @register_model
936
+ def get_xdecoder_model(cfg, **kwargs):
937
+ return GeneralizedXdecoder(cfg)
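Note on evaluate_grounding above: each referring phrase is matched to one grounding query by comparing L2-normalised query caption embeddings with the phrase embedding (vl_similarity) and taking the argmax over queries, whose mask is then returned. A self-contained sketch of that matching step, assuming vl_similarity amounts to a temperature-scaled dot product (the temperature value here is illustrative):

import torch

def match_phrases_to_masks(v_emb, t_emb, pred_gmasks, temperature=100.0):
    # v_emb: (Q, D) caption embeddings of the grounding queries
    # t_emb: (T, D) one embedding per referring phrase
    # pred_gmasks: (Q, H, W) grounding mask logits
    v = v_emb / (v_emb.norm(dim=-1, keepdim=True) + 1e-7)
    t = t_emb / (t_emb.norm(dim=-1, keepdim=True) + 1e-7)
    logits = temperature * v @ t.t()        # (Q, T) scaled cosine similarities
    matched = logits.max(dim=0).indices     # best query index for each phrase
    return pred_gmasks[matched]             # (T, H, W), one mask per phrase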
modeling/body/__init__.py ADDED
@@ -0,0 +1,10 @@
1
+ from .xdecoder_head import *
2
+ from .build import *
3
+
4
+ def build_xdecoder_head(config, *args, **kwargs):
5
+ model_name = config['MODEL']['HEAD']
6
+ if not is_model(model_name):
7
+ raise ValueError(f'Unknown model: {model_name}')
8
+
9
+ body = model_entrypoints(model_name)(config, *args, **kwargs)
10
+ return body
modeling/body/build.py ADDED
@@ -0,0 +1,13 @@
1
+ _model_entrypoints = {}
2
+
3
+ def register_body(fn):
4
+ module_name_split = fn.__module__.split('.')
5
+ model_name = module_name_split[-1]
6
+ _model_entrypoints[model_name] = fn
7
+ return fn
8
+
9
+ def model_entrypoints(model_name):
10
+ return _model_entrypoints[model_name]
11
+
12
+ def is_model(model_name):
13
+ return model_name in _model_entrypoints
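Note on the registry above: entry points are keyed by the defining module's file name (the last component of fn.__module__), so a head becomes discoverable simply by being imported from modeling/body/__init__.py. A usage sketch; the module, config values, and MyHead class are purely illustrative:

# hypothetical module modeling/body/my_head.py
from .build import register_body

@register_body                      # stored under the key "my_head"
def get_my_head(cfg, input_shape, lang_encoder, extra):
    return MyHead(cfg, input_shape, lang_encoder, extra)   # hypothetical head class

# consumer side, mirroring build_xdecoder_head:
#   cfg = {'MODEL': {'HEAD': 'my_head'}}
#   head = model_entrypoints(cfg['MODEL']['HEAD'])(cfg, input_shape, lang_encoder, extra)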
modeling/body/xdecoder_head.py ADDED
@@ -0,0 +1,126 @@
1
+ # --------------------------------------------------------
2
+ # X-Decoder -- Generalized Decoding for Pixel, Image, and Language
3
+ # Copyright (c) 2022 Microsoft
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # Written by Xueyan Zou ([email protected])
6
+ # --------------------------------------------------------
7
+ # Copyright (c) Facebook, Inc. and its affiliates.
8
+ from typing import Dict
9
+
10
+ from torch import nn
11
+
12
+ from detectron2.layers import ShapeSpec
13
+
14
+ from .build import register_body
15
+ from ..vision.encoder import build_encoder
16
+ from ..interface import build_decoder
17
+ from ..utils import configurable
18
+
19
+
20
+ class XdecoderHead(nn.Module):
21
+
22
+ @configurable
23
+ def __init__(
24
+ self,
25
+ input_shape: Dict[str, ShapeSpec],
26
+ *,
27
+ num_classes: int,
28
+ pixel_decoder: nn.Module,
29
+ loss_weight: float = 1.0,
30
+ ignore_value: int = -1,
31
+ # extra parameters
32
+ transformer_predictor: nn.Module,
33
+ transformer_in_feature: str,
34
+ binary_classes: bool,
35
+ ):
36
+ """
37
+ NOTE: this interface is experimental.
38
+ Args:
39
+ input_shape: shapes (channels and stride) of the input features
40
+ num_classes: number of classes to predict
41
+ pixel_decoder: the pixel decoder module
42
+ loss_weight: loss weight
43
+ ignore_value: category id to be ignored during training.
44
+ transformer_predictor: the transformer decoder that makes prediction
45
+ transformer_in_feature: input feature name to the transformer_predictor
46
+ """
47
+ super().__init__()
48
+
49
+ input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride)
50
+ self.in_features = [k for k, v in input_shape]
51
+ feature_strides = [v.stride for k, v in input_shape]
52
+ feature_channels = [v.channels for k, v in input_shape]
53
+
54
+ self.ignore_value = ignore_value
55
+ self.common_stride = 4
56
+ self.loss_weight = loss_weight
57
+
58
+ self.pixel_decoder = pixel_decoder
59
+ self.predictor = transformer_predictor
60
+ self.transformer_in_feature = transformer_in_feature
61
+
62
+ self.num_classes = num_classes
63
+
64
+ if binary_classes:
65
+ self.num_classes = 1
66
+
67
+ @classmethod
68
+ def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec], lang_encoder: nn.Module, extra: dict):
69
+
70
+ in_features_type = cfg['MODEL']['DECODER']['TRANSFORMER_IN_FEATURE']
71
+ enc_cfg = cfg['MODEL']['ENCODER']
72
+ dec_cfg = cfg['MODEL']['DECODER']
73
+
74
+ # figure out in_channels to transformer predictor
75
+ if in_features_type == "transformer_encoder":
76
+ transformer_predictor_in_channels = enc_cfg['CONVS_DIM']
77
+ elif in_features_type == "pixel_embedding":
78
+ transformer_predictor_in_channels = enc_cfg['MASK_DIM']
79
+ elif in_features_type == "multi_scale_pixel_decoder":
80
+ transformer_predictor_in_channels = enc_cfg['CONVS_DIM']
81
+ else:
82
+ transformer_predictor_in_channels = input_shape[dec_cfg['TRANSFORMER_IN_FEATURE']].channels
83
+
84
+ return {
85
+ "input_shape": {
86
+ k: v for k, v in input_shape.items() if k in enc_cfg['IN_FEATURES']
87
+ },
88
+ "ignore_value": enc_cfg['IGNORE_VALUE'],
89
+ "num_classes": enc_cfg.get('NUM_CLASSES', None),
90
+ "pixel_decoder": build_encoder(cfg, input_shape),
91
+ "loss_weight": enc_cfg['LOSS_WEIGHT'],
92
+ "transformer_in_feature": dec_cfg['TRANSFORMER_IN_FEATURE'],
93
+ "transformer_predictor": build_decoder(
94
+ cfg,
95
+ transformer_predictor_in_channels,
96
+ lang_encoder,
97
+ mask_classification=True,
98
+ extra=extra,
99
+ ),
100
+ "binary_classes": enc_cfg['BINARY_CLASSES']
101
+ }
102
+
103
+ def forward(self, features, mask=None, target_queries=None, target_vlp=None, task='seg', extra={}):
104
+ return self.layers(features, mask, target_queries, target_vlp, task, extra)
105
+
106
+ def layers(self, features, mask=None, target_queries=None, target_vlp=None, task='seg', extra={}):
107
+ mask_features, transformer_encoder_features, multi_scale_features = self.pixel_decoder.forward_features(features)
108
+
109
+ if self.transformer_in_feature == "multi_scale_pixel_decoder":
110
+ predictions = self.predictor(multi_scale_features, mask_features, mask, target_queries, target_vlp, task, extra)
111
+ else:
112
+ if self.transformer_in_feature == "transformer_encoder":
113
+ assert (
114
+ transformer_encoder_features is not None
115
+ ), "Please use the TransformerEncoderPixelDecoder."
116
+ predictions = self.predictor(transformer_encoder_features, mask_features, mask)
117
+ elif self.transformer_in_feature == "pixel_embedding":
118
+ predictions = self.predictor(mask_features, mask_features, mask)
119
+ else:
120
+ predictions = self.predictor(features[self.transformer_in_feature], mask_features, mask)
121
+ return predictions
122
+
123
+
124
+ @register_body
125
+ def get_xdecoder_head(cfg, input_shape, lang_encoder, extra):
126
+ return XdecoderHead(cfg, input_shape, lang_encoder, extra)
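Note on from_config above: the transformer predictor's input width is chosen from the encoder config according to TRANSFORMER_IN_FEATURE. Condensed, the selection reads roughly as the sketch below (config keys as in the code; the helper name is ours):

def predictor_in_channels(cfg, input_shape):
    enc_cfg = cfg['MODEL']['ENCODER']
    dec_cfg = cfg['MODEL']['DECODER']
    feature = dec_cfg['TRANSFORMER_IN_FEATURE']
    if feature in ('transformer_encoder', 'multi_scale_pixel_decoder'):
        return enc_cfg['CONVS_DIM']          # predictor reads pixel-decoder conv features
    if feature == 'pixel_embedding':
        return enc_cfg['MASK_DIM']           # predictor reads per-pixel mask embeddings
    return input_shape[feature].channels     # otherwise a raw backbone feature map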
modeling/interface/__init__.py ADDED
@@ -0,0 +1,13 @@
1
+ from .xdecoder import *
2
+ from .seem_v0 import *
3
+ from .seem_v1 import *
4
+ from .seem_demo import *
5
+ from .build import *
6
+
7
+ def build_decoder(config, *args, **kwargs):
8
+ model_name = config['MODEL']['DECODER']['NAME']
9
+
10
+ if not is_model(model_name):
11
+ raise ValueError(f'Unknown model: {model_name}')
12
+
13
+ return model_entrypoints(model_name)(config, *args, **kwargs)
modeling/interface/build.py ADDED
@@ -0,0 +1,14 @@
1
+ _model_entrypoints = {}
2
+
3
+
4
+ def register_decoder(fn):
5
+ module_name_split = fn.__module__.split('.')
6
+ model_name = module_name_split[-1]
7
+ _model_entrypoints[model_name] = fn
8
+ return fn
9
+
10
+ def model_entrypoints(model_name):
11
+ return _model_entrypoints[model_name]
12
+
13
+ def is_model(model_name):
14
+ return model_name in _model_entrypoints
modeling/interface/modules.py ADDED
@@ -0,0 +1,200 @@
1
+ from typing import Optional
2
+
3
+ import torch
4
+ from torch import nn, Tensor
5
+ from torch.nn import functional as F
6
+
7
+ from timm.models.layers import trunc_normal_
8
+ from detectron2.layers import Conv2d
9
+ import fvcore.nn.weight_init as weight_init
10
+
11
+ from ..utils import MultiheadAttention
12
+
13
+
14
+ class SelfAttentionLayer(nn.Module):
15
+
16
+ def __init__(self, d_model, nhead, dropout=0.0,
17
+ activation="relu", normalize_before=False):
18
+ super().__init__()
19
+ self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
20
+
21
+ self.norm = nn.LayerNorm(d_model)
22
+ self.dropout = nn.Dropout(dropout)
23
+
24
+ self.activation = _get_activation_fn(activation)
25
+ self.normalize_before = normalize_before
26
+
27
+ self._reset_parameters()
28
+
29
+ def _reset_parameters(self):
30
+ for p in self.parameters():
31
+ if p.dim() > 1:
32
+ nn.init.xavier_uniform_(p)
33
+
34
+ def with_pos_embed(self, tensor, pos: Optional[Tensor]):
35
+ return tensor if pos is None else tensor + pos
36
+
37
+ def forward_post(self, tgt,
38
+ tgt_mask: Optional[Tensor] = None,
39
+ tgt_key_padding_mask: Optional[Tensor] = None,
40
+ query_pos: Optional[Tensor] = None):
41
+
42
+ q = k = self.with_pos_embed(tgt, query_pos)
43
+ tgt2 = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask,
44
+ key_padding_mask=tgt_key_padding_mask)[0]
45
+ tgt = tgt + self.dropout(tgt2)
46
+ tgt = self.norm(tgt)
47
+ return tgt
48
+
49
+ def forward_pre(self, tgt,
50
+ tgt_mask: Optional[Tensor] = None,
51
+ tgt_key_padding_mask: Optional[Tensor] = None,
52
+ query_pos: Optional[Tensor] = None):
53
+ tgt2 = self.norm(tgt)
54
+ q = k = self.with_pos_embed(tgt2, query_pos)
55
+ tgt2 = self.self_attn(q, k, value=tgt2, attn_mask=tgt_mask,
56
+ key_padding_mask=tgt_key_padding_mask)[0]
57
+ tgt = tgt + self.dropout(tgt2)
58
+
59
+ return tgt
60
+
61
+ def forward(self, tgt,
62
+ tgt_mask: Optional[Tensor] = None,
63
+ tgt_key_padding_mask: Optional[Tensor] = None,
64
+ query_pos: Optional[Tensor] = None):
65
+ if self.normalize_before:
66
+ return self.forward_pre(tgt, tgt_mask,
67
+ tgt_key_padding_mask, query_pos)
68
+ return self.forward_post(tgt, tgt_mask,
69
+ tgt_key_padding_mask, query_pos)
70
+
71
+
72
+ class CrossAttentionLayer(nn.Module):
73
+
74
+ def __init__(self, d_model, nhead, dropout=0.0,
75
+ activation="relu", normalize_before=False):
76
+ super().__init__()
77
+ self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
78
+
79
+ self.norm = nn.LayerNorm(d_model)
80
+ self.dropout = nn.Dropout(dropout)
81
+
82
+ self.activation = _get_activation_fn(activation)
83
+ self.normalize_before = normalize_before
84
+
85
+ self._reset_parameters()
86
+
87
+ def _reset_parameters(self):
88
+ for p in self.parameters():
89
+ if p.dim() > 1:
90
+ nn.init.xavier_uniform_(p)
91
+
92
+ def with_pos_embed(self, tensor, pos: Optional[Tensor]):
93
+ return tensor if pos is None else tensor + pos
94
+
95
+ def forward_post(self, tgt, memory,
96
+ memory_mask: Optional[Tensor] = None,
97
+ memory_key_padding_mask: Optional[Tensor] = None,
98
+ pos: Optional[Tensor] = None,
99
+ query_pos: Optional[Tensor] = None):
100
+ tgt2, avg_attn = self.multihead_attn(query=self.with_pos_embed(tgt, query_pos),
101
+ key=self.with_pos_embed(memory, pos),
102
+ value=memory, attn_mask=memory_mask,
103
+ key_padding_mask=memory_key_padding_mask)
104
+ tgt = tgt + self.dropout(tgt2)
105
+ tgt = self.norm(tgt)
106
+ return tgt, avg_attn
107
+
108
+ def forward_pre(self, tgt, memory,
109
+ memory_mask: Optional[Tensor] = None,
110
+ memory_key_padding_mask: Optional[Tensor] = None,
111
+ pos: Optional[Tensor] = None,
112
+ query_pos: Optional[Tensor] = None):
113
+ tgt2 = self.norm(tgt)
114
+ tgt2, avg_attn = self.multihead_attn(query=self.with_pos_embed(tgt2, query_pos),
115
+ key=self.with_pos_embed(memory, pos),
116
+ value=memory, attn_mask=memory_mask,
117
+ key_padding_mask=memory_key_padding_mask)
118
+ tgt = tgt + self.dropout(tgt2)
119
+
120
+ return tgt, avg_attn
121
+
122
+ def forward(self, tgt, memory,
123
+ memory_mask: Optional[Tensor] = None,
124
+ memory_key_padding_mask: Optional[Tensor] = None,
125
+ pos: Optional[Tensor] = None,
126
+ query_pos: Optional[Tensor] = None):
127
+ if self.normalize_before:
128
+ return self.forward_pre(tgt, memory, memory_mask,
129
+ memory_key_padding_mask, pos, query_pos)
130
+ return self.forward_post(tgt, memory, memory_mask,
131
+ memory_key_padding_mask, pos, query_pos)
132
+
133
+
134
+ class FFNLayer(nn.Module):
135
+
136
+ def __init__(self, d_model, dim_feedforward=2048, dropout=0.0,
137
+ activation="relu", normalize_before=False):
138
+ super().__init__()
139
+ # Implementation of Feedforward model
140
+ self.linear1 = nn.Linear(d_model, dim_feedforward)
141
+ self.dropout = nn.Dropout(dropout)
142
+ self.linear2 = nn.Linear(dim_feedforward, d_model)
143
+
144
+ self.norm = nn.LayerNorm(d_model)
145
+
146
+ self.activation = _get_activation_fn(activation)
147
+ self.normalize_before = normalize_before
148
+
149
+ self._reset_parameters()
150
+
151
+ def _reset_parameters(self):
152
+ for p in self.parameters():
153
+ if p.dim() > 1:
154
+ nn.init.xavier_uniform_(p)
155
+
156
+ def with_pos_embed(self, tensor, pos: Optional[Tensor]):
157
+ return tensor if pos is None else tensor + pos
158
+
159
+ def forward_post(self, tgt):
160
+ tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
161
+ tgt = tgt + self.dropout(tgt2)
162
+ tgt = self.norm(tgt)
163
+ return tgt
164
+
165
+ def forward_pre(self, tgt):
166
+ tgt2 = self.norm(tgt)
167
+ tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2))))
168
+ tgt = tgt + self.dropout(tgt2)
169
+ return tgt
170
+
171
+ def forward(self, tgt):
172
+ if self.normalize_before:
173
+ return self.forward_pre(tgt)
174
+ return self.forward_post(tgt)
175
+
176
+
177
+ def _get_activation_fn(activation):
178
+ """Return an activation function given a string"""
179
+ if activation == "relu":
180
+ return F.relu
181
+ if activation == "gelu":
182
+ return F.gelu
183
+ if activation == "glu":
184
+ return F.glu
185
+ raise RuntimeError(F"activation should be relu/gelu, not {activation}.")
186
+
187
+
188
+ class MLP(nn.Module):
189
+ """ Very simple multi-layer perceptron (also called FFN)"""
190
+
191
+ def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
192
+ super().__init__()
193
+ self.num_layers = num_layers
194
+ h = [hidden_dim] * (num_layers - 1)
195
+ self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
196
+
197
+ def forward(self, x):
198
+ for i, layer in enumerate(self.layers):
199
+ x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
200
+ return x
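
Note: `modules.py` supplies the transformer building blocks used by the decoders that follow: `SelfAttentionLayer`, `CrossAttentionLayer` (which also returns head-averaged attention weights), `FFNLayer`, and a small `MLP` head, each supporting pre-norm or post-norm residual ordering. Below is a standalone sketch of one decoder round (cross-attention, self-attention, FFN) in the `(sequence, batch, channel)` layout these layers expect; it substitutes `torch.nn.MultiheadAttention` for the repo's custom `MultiheadAttention` import, so only the shapes and residual ordering are representative:

```python
# Sketch of one decoder round; shapes and ordering mirror forward_post above.
import torch
from torch import nn

d_model, nheads, bs = 256, 8, 2
queries = torch.randn(101, bs, d_model)      # object queries (Q, B, C)
memory = torch.randn(60 * 60, bs, d_model)   # flattened image features (HW, B, C)

cross_attn = nn.MultiheadAttention(d_model, nheads)
self_attn = nn.MultiheadAttention(d_model, nheads)
ffn = nn.Sequential(nn.Linear(d_model, 2048), nn.ReLU(), nn.Linear(2048, d_model))
norm1, norm2, norm3 = nn.LayerNorm(d_model), nn.LayerNorm(d_model), nn.LayerNorm(d_model)

# post-norm residual ordering, as in the forward_post methods
x, avg_attn = cross_attn(queries, memory, memory)   # attend queries to image features
queries = norm1(queries + x)
x, _ = self_attn(queries, queries, queries)         # queries attend to each other
queries = norm2(queries + x)
queries = norm3(queries + ffn(queries))             # position-wise feed-forward
print(queries.shape, avg_attn.shape)                # (101, 2, 256) and (2, 101, 3600)
```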
modeling/interface/prototype/__init__.py ADDED
File without changes
modeling/interface/prototype/attention_data_struct_seemdemo.py ADDED
@@ -0,0 +1,265 @@
1
+ # --------------------------------------------------------
2
+ # X-Decoder -- Generalized Decoding for Pixel, Image, and Language
3
+ # Copyright (c) 2022 Microsoft
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # Written by Xueyan Zou ([email protected])
6
+ # --------------------------------------------------------
7
+
8
+ import torch
9
+ import torch.nn as nn
10
+ import torch.nn.functional as F
11
+
12
+ predict_name_matcher = {"predictions_class": ["pred_logits"],
13
+ "predictions_mask":["pred_masks", "pred_gmasks", "pred_smasks"],
14
+ "predictions_caption":["pred_captions", "pred_gtexts"],
15
+ "predictions_maskemb":["pred_maskembs", "pred_smaskembs"],
16
+ "predictions_pos_spatial":["pred_pspatials"],
17
+ "predictions_neg_spatial":["pred_nspatials"],
18
+ "predictions_pos_visual":["pred_pvisuals"],
19
+ "predictions_neg_visual":["pred_nvisuals"]}
20
+
21
+ predict_index_matcher = {"predictions_class": ["queries_object"],
22
+ "predictions_mask":["queries_object", "queries_grounding", "queries_spatial"],
23
+ "predictions_caption": ["queries_object", "queries_grounding"],
24
+ "predictions_maskemb":["queries_object", "queries_spatial"],
25
+ "predictions_pos_spatial":["all"],
26
+ "predictions_neg_spatial":["all"],
27
+ "predictions_pos_visual":["all"],
28
+ "predictions_neg_visual":["all"]}
29
+
30
+ class Variable(object):
31
+ '''
32
+ Store dataset variable for attention
33
+ output: embedding that accumulates during cross/self attention
34
+ pos: positional embedding that is fixed during cross/self attention
35
+ name: name of the variable
36
+ type: type of the variable, e.g. queries, tokens
37
+ attn_mask: attention mask for cross attention
38
+ masking: masking for padding
39
+ '''
40
+ def __init__(self, output, name, _type, pos=None):
41
+ self.output = output
42
+ self.pos = pos
43
+ self.name = name
44
+ self.type = _type
45
+ self.attn_mask = None
46
+ self.masking = None
47
+
48
+ def copy(self,):
49
+ output = self.output.clone() if self.output is not None else None
50
+ pos = self.pos.clone() if self.pos is not None else None
51
+ return Variable(output, self.name, self.type, pos)
52
+
53
+ class AttentionDataStruct(nn.Module):
54
+ '''
55
+ Store dataset structure for cross/self attention
56
+ task_switch: switch for different tasks
57
+
58
+ p_attn_variables: prototype of variables that is used in cross/self attention
59
+ p_self_attn: prototype of variables that is used in self attention
60
+ p_cross_attn: prototype of variables that is used in cross attention
61
+ p_iter: prototype of iteration for different queries
62
+ p_masking: prototype of masking for different tokens
63
+ p_duplication: prototype of duplication for different queries
64
+ '''
65
+ def __init__(self, attn_arch, task_switch):
66
+ super(AttentionDataStruct, self).__init__()
67
+ self.task_switch = task_switch
68
+
69
+ # p stands for prototype
70
+ self.p_attn_variables = attn_arch['VARIABLE']
71
+ self.p_self_attn = attn_arch['SELF_ATTENTION']
72
+ self.p_cross_attn = attn_arch['CROSS_ATTENTION']
73
+ self.p_masking = attn_arch['MASKING']
74
+ self.p_duplication = attn_arch['DUPLICATION']
75
+
76
+ self.num_layers = attn_arch['NUM_LAYERS']
77
+
78
+ def reset(self, flags, task, extra):
79
+ # reset variables
80
+ self.attn_variables = {}
81
+ self.cross_attn_dict = {}
82
+ self.self_attn_dict = {}
83
+ self.duplication_dict = {}
84
+ self.query_index = {}
85
+ self.output = {}
86
+ self.flags = {}
87
+ self.spatial_memory = {}
88
+
89
+ # initialize duplication
90
+ for key, values in self.p_duplication.items():
91
+ for name in values:
92
+ self.duplication_dict["{}_{}".format(key, name)] = self.p_duplication[key][name]
93
+
94
+ # initialize flag
95
+ self.flags = {"object": True}
96
+ self.flags.update(flags)
97
+
98
+ # initialize task
99
+ self.task = task
100
+
101
+ # initialize output
102
+ if self.task_switch['mask']:
103
+ self.output['predictions_class'] = []
104
+ self.output['predictions_mask'] = []
105
+ self.output['predictions_maskemb'] = []
106
+
107
+ if self.task_switch['bbox']:
108
+ self.output['predictions_bbox'] = []
109
+
110
+ if self.task_switch['spatial'] and ('spatial' in self.flags and self.flags['spatial']==True):
111
+ self.output['predictions_pos_spatial'] = []
112
+ self.output['predictions_neg_spatial'] = []
113
+
114
+ if self.task_switch['spatial'] and ('memories_spatial' in self.flags and self.flags['memories_spatial']==True):
115
+ self.spatial_memory['prev_batch_mask'] = extra['prev_mask']
116
+
117
+ if (self.task_switch['grounding'] and ('grounding' in self.flags and self.flags['grounding']==True)) \
118
+ or (self.task_switch['audio'] and ('audio' in self.flags and self.flags['audio']==True)):
119
+ self.output['predictions_caption'] = []
120
+
121
+ if self.task_switch['visual']:
122
+ self.output['predictions_pos_visual'] = []
123
+ self.output['predictions_neg_visual'] = []
124
+
125
+ # initialize cross_attn, whether the variable is used in cross attention
126
+ for key, values in self.p_cross_attn.items():
127
+ for name in values:
128
+ self.cross_attn_dict["{}_{}".format(key, name)] = self.p_cross_attn[key][name]
129
+
130
+ # initialize self_attn, whether the variable is used in self attention, and the interactions between queries
131
+ for key, values in self.p_self_attn.items():
132
+ for name in values:
133
+ self.self_attn_dict["{}_{}".format(key, name)] = self.p_self_attn[key][name]
134
+
135
+ # initialize masking
136
+ self.masking = self.p_masking
137
+
138
+ # initialize query_index
139
+ self.query_index = {"all":[0, None]}
140
+
141
+
142
+ def set(self, name, _type, output=None, pos=None, var=None):
143
+ if var is not None:
144
+ self.attn_variables[name] = var
145
+ elif name in self.duplication_dict:
146
+ assert self.duplication_dict[name] in self.attn_variables, "Duplication variable {} is not initialized yet.".format(name)
147
+ self.attn_variables[name] = self.attn_variables[self.duplication_dict[name]].copy()
148
+ else:
149
+ var = Variable(output, name, _type, pos)
150
+ self.attn_variables[name] = var
151
+
152
+ def set_results(self, results):
153
+ for name in self.cross_attn_name:
154
+ self.attn_variables[name].attn_mask = results['attn_mask'][:,self.query_index[name][0]:self.query_index[name][1]]
155
+ for key in self.output:
156
+ self.output[key].append(results[key])
157
+
158
+ def set_maskings(self, name, masking):
159
+ self.attn_variables[name].masking = masking
160
+
161
+ def cross_attn_variables(self, ):
162
+ cross_attn_name = [key for key, value in self.cross_attn_dict.items()
163
+ if (value==True) and (key in self.attn_variables)
164
+ and ((key not in self.flags) or (key in self.flags and self.flags[key]==True))]
165
+ self.cross_attn_name = cross_attn_name
166
+
167
+ output = torch.cat([self.attn_variables[name].output for name in cross_attn_name])
168
+ pos_emb = torch.cat([self.attn_variables[name].pos for name in cross_attn_name])
169
+
170
+ index = 0
171
+ for name in cross_attn_name:
172
+ self.query_index[name] = [index, index + self.attn_variables[name].output.shape[0]]
173
+ index += self.attn_variables[name].output.shape[0]
174
+ return output, pos_emb
175
+
176
+ def cross_attn_mask(self, size, num_heads):
177
+ attn_mask = torch.cat([self.attn_variables[name].attn_mask for name in self.cross_attn_name], dim=1)
178
+
179
+ # hard code memories_spatial to previous selected mask
180
+ if 'memories_spatial' in self.cross_attn_name:
181
+ memory_attn_mask = self.spatial_memory['prev_batch_mask']
182
+ bs,c,_,_ = memory_attn_mask.shape
183
+ memory_attn_mask = F.interpolate(memory_attn_mask, size, mode='bilinear', align_corners=False)
184
+ memory_attn_mask = (memory_attn_mask.sigmoid().flatten(2).unsqueeze(1).repeat(1, num_heads, 1, 1).flatten(0, 1) < 0.5).bool().detach()
185
+ attn_mask[:,self.query_index['memories_spatial'][0]:self.query_index['memories_spatial'][1]] = memory_attn_mask
186
+
187
+ attn_mask[torch.where(attn_mask.sum(-1) == attn_mask.shape[-1])] = False
188
+ return attn_mask
189
+
190
+ def self_attn(self, bs, num_heads):
191
+ self_attn_name = [key for key, value in self.self_attn_dict.items()
192
+ if len(value)>0 and key in self.attn_variables
193
+ and ((key not in self.flags) or (key in self.flags and self.flags[key]==True))]
194
+ self.self_attn_name = self_attn_name
195
+
196
+ output = torch.cat([self.attn_variables[name].output for name in self_attn_name])
197
+ pos_emb = torch.cat([self.attn_variables[name].pos for name in self_attn_name])
198
+
199
+ index = 0
200
+ for name in self_attn_name:
201
+ self.query_index[name] = [index, index + self.attn_variables[name].output.shape[0]]
202
+ index += self.attn_variables[name].output.shape[0]
203
+
204
+ self_attn_mask = torch.ones((bs, output.shape[0], output.shape[0]), dtype=torch.bool, device=output.device)
205
+ self_attn_pair = []
206
+ # build self_attention mask by query interaction
207
+ for key1, value in self.self_attn_dict.items():
208
+ for key2 in value:
209
+ if key1 not in self_attn_name or key2 not in self_attn_name:
210
+ # exclude the variables that are not used in the current layer
211
+ continue
212
+ if (key1 in self.masking or key2 in self.masking) and (key1 != key2):
213
+ self_attn_pair += [[key1, key2]]
214
+ self_attn_mask[:,self.query_index[key1][0]:self.query_index[key1][1], self.query_index[key2][0]:self.query_index[key2][1]] = False
215
+
216
+ # build self_attention mask by masking, for bidirectional
217
+ for key in self.masking:
218
+ if key in self_attn_name:
219
+ self_attn_mask[:,self.query_index[key][0]:self.query_index[key][1],self.query_index[key][0]:self.query_index[key][1]][self.attn_variables[key].masking] = True
220
+ self_attn_mask[:,self.query_index[key][0]:self.query_index[key][1],self.query_index[key][0]:self.query_index[key][1]].transpose(1,2)[self.attn_variables[key].masking] = True
221
+
222
+ # build self_attention mask by masking, for uni-directional
223
+ for key1, key2 in self_attn_pair:
224
+ if key1 not in self_attn_name or key2 not in self_attn_name:
225
+ # exclude the variables that are not used in the current layer
226
+ continue
227
+ if key1 in self.masking:
228
+ self_attn_mask[:,self.query_index[key1][0]:self.query_index[key1][1],self.query_index[key2][0]:self.query_index[key2][1]][self.attn_variables[key1].masking] = True # HACK, not verified
229
+ if key2 in self.masking:
230
+ self_attn_mask[:,self.query_index[key1][0]:self.query_index[key1][1],self.query_index[key2][0]:self.query_index[key2][1]].transpose(1,2)[self.attn_variables[key2].masking] = True
231
+
232
+ self_attn_mask = self_attn_mask.repeat_interleave(num_heads, dim=0)
233
+ return output, pos_emb, self_attn_mask
234
+
235
+ def update_variables(self, output, mode):
236
+ name_set = self.self_attn_name if mode=='self_attn' else self.cross_attn_name
237
+ for key in name_set:
238
+ self.attn_variables[key].output = output[self.query_index[key][0]:self.query_index[key][1]]
239
+
240
+ def update_spatial_results(self, results):
241
+ v_emb = results['pred_smaskembs']
242
+ pred_smasks = results['pred_smasks']
243
+
244
+ s_emb = results['pred_pspatials']
245
+ pred_logits = v_emb @ s_emb.transpose(1,2)
246
+ logits_idx_y = pred_logits[:,:,0].max(dim=1)[1]
247
+ logits_idx_x = torch.arange(len(logits_idx_y), device=logits_idx_y.device)
248
+ logits_idx = torch.stack([logits_idx_x, logits_idx_y]).tolist()
249
+ pred_masks_pos = pred_smasks[logits_idx][:,None,]
250
+
251
+ extra = {"prev_mask": pred_masks_pos}
252
+ return extra
253
+
254
+ def organize_output(self, ):
255
+ outputs = {}
256
+ outputs['aux_outputs'] = [{} for i in range(self.num_layers)]
257
+
258
+ for key, values in self.output.items():
259
+ for _key, idx_name in zip(predict_name_matcher[key], predict_index_matcher[key]):
260
+ if idx_name not in self.query_index:
261
+ continue
262
+ outputs[_key] = self.output[key][-1][:,self.query_index[idx_name][0]:self.query_index[idx_name][1]]
263
+ for idx, aux_values in enumerate(self.output[key][:-1]):
264
+ outputs['aux_outputs'][idx][_key] = aux_values[:,self.query_index[idx_name][0]:self.query_index[idx_name][1]]
265
+ return outputs
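
Note: `AttentionDataStruct` is bookkeeping for the decoder loop: `reset()` declares which outputs a task produces, `set()` registers query/token variables, `cross_attn_variables()`/`self_attn()` concatenate them along the query dimension and record each variable's slice in `query_index`, `update_variables()`/`set_results()` write layer outputs back, and `organize_output()` splits the stacked predictions per query group. A toy sketch of the central slicing idea (variable names and sizes are illustrative):

```python
# Toy sketch: variables are concatenated for attention, and query_index records
# each variable's slice so the concatenated output can be split back afterwards.
import torch

variables = {
    'queries_object': torch.randn(101, 2, 256),
    'tokens_grounding': torch.randn(12, 2, 256),
}
query_index, start = {}, 0
for name, v in variables.items():
    query_index[name] = [start, start + v.shape[0]]
    start += v.shape[0]

output = torch.cat(list(variables.values()))   # (113, 2, 256), fed to the attention layers
# ... attention layers update `output`, then it is split back per variable ...
per_var = {n: output[s:e] for n, (s, e) in query_index.items()}
print({n: t.shape for n, t in per_var.items()})
```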
modeling/interface/prototype/attention_data_struct_seemv0.py ADDED
@@ -0,0 +1,264 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
+ predict_name_matcher = {"predictions_class": ["pred_logits"],
6
+ "predictions_mask":["pred_masks", "pred_gmasks", "pred_smasks"],
7
+ "predictions_caption":["pred_captions", "pred_gtexts"],
8
+ "predictions_maskemb":["pred_smaskembs"],
9
+ "predictions_pos_spatial":["pred_pspatials"],
10
+ "predictions_neg_spatial":["pred_nspatials"],}
11
+
12
+ predict_index_matcher = {"predictions_class": ["queries_object"],
13
+ "predictions_mask":["queries_object", "queries_grounding", "queries_spatial"],
14
+ "predictions_caption": ["queries_object", "queries_grounding"],
15
+ "predictions_maskemb":["queries_spatial"],
16
+ "predictions_pos_spatial":["all"],
17
+ "predictions_neg_spatial":["all"],}
18
+
19
+ class Variable(object):
20
+ '''
21
+ Store dataset variable for attention
22
+ output: embedding that accumulates during cross/self attention
23
+ pos: positional embedding that is fixed during cross/self attention
24
+ name: name of the variable
25
+ type: type of the variable, e.g. queries, tokens
26
+ attn_mask: attention mask for cross attention
27
+ masking: masking for padding
28
+ '''
29
+ def __init__(self, output, name, _type, pos=None):
30
+ self.output = output
31
+ self.pos = pos
32
+ self.name = name
33
+ self.type = _type
34
+ self.attn_mask = None
35
+ self.masking = None
36
+
37
+ def copy(self,):
38
+ output = self.output.clone() if self.output is not None else None
39
+ pos = self.pos.clone() if self.pos is not None else None
40
+ return Variable(output, self.name, self.type, pos)
41
+
42
+ class AttentionDataStruct(nn.Module):
43
+ '''
44
+ Store dataset structure for cross/self attention
45
+ task_switch: switch for different tasks
46
+
47
+ p_attn_variables: prototype of variables that is used in cross/self attention
48
+ p_self_attn: prototype of variables that is used in self attention
49
+ p_cross_attn: prototype of variables that is used in cross attention
50
+ p_iter: prototype of iteration for different queries
51
+ p_masking: prototype of masking for different tokens
52
+ p_duplication: prototype of duplication for different queries
53
+ '''
54
+ def __init__(self, attn_arch, task_switch):
55
+ super(AttentionDataStruct, self).__init__()
56
+ self.task_switch = task_switch
57
+
58
+ # p stands for prototype
59
+ self.p_attn_variables = attn_arch['VARIABLE']
60
+ self.p_self_attn = attn_arch['SELF_ATTENTION']
61
+ self.p_cross_attn = attn_arch['CROSS_ATTENTION']
62
+ self.p_masking = attn_arch['MASKING']
63
+ self.p_duplication = attn_arch['DUPLICATION']
64
+
65
+ self.num_layers = attn_arch['NUM_LAYERS']
66
+
67
+ def reset(self, flags, task, extra):
68
+ # reset variables
69
+ self.attn_variables = {}
70
+ self.cross_attn_dict = {}
71
+ self.self_attn_dict = {}
72
+ self.duplication_dict = {}
73
+ self.query_index = {}
74
+ self.output = {}
75
+ self.flags = {}
76
+ self.spatial_memory = {}
77
+
78
+ # initialize duplication
79
+ for key, values in self.p_duplication.items():
80
+ for name in values:
81
+ self.duplication_dict["{}_{}".format(key, name)] = self.p_duplication[key][name]
82
+
83
+ # initialize flag
84
+ self.flags = {"object": True}
85
+ self.flags.update(flags)
86
+
87
+ # initialize task
88
+ self.task = task
89
+
90
+ # initialize output
91
+ if self.task_switch['mask']:
92
+ self.output['predictions_class'] = []
93
+ self.output['predictions_mask'] = []
94
+
95
+ if self.task_switch['bbox']:
96
+ self.output['predictions_bbox'] = []
97
+
98
+ if self.task_switch['spatial'] and ('spatial' in self.flags and self.flags['spatial']==True):
99
+ self.output['predictions_maskemb'] = []
100
+ self.output['predictions_pos_spatial'] = []
101
+ self.output['predictions_neg_spatial'] = []
102
+ # self.spatial_memory['spatial_query_mode'] = extra['spatial_query_mode']
103
+
104
+ if self.task_switch['spatial'] and ('memories_spatial' in self.flags and self.flags['memories_spatial']==True):
105
+ self.spatial_memory['prev_batch_mask'] = extra['prev_mask']
106
+
107
+ if self.task_switch['grounding'] and ('grounding' in self.flags and self.flags['grounding']==True):
108
+ self.output['predictions_caption'] = []
109
+
110
+ # initialize cross_attn, whether the variable is used in cross attention
111
+ for key, values in self.p_cross_attn.items():
112
+ for name in values:
113
+ self.cross_attn_dict["{}_{}".format(key, name)] = self.p_cross_attn[key][name]
114
+
115
+ # initialize self_attn, whether the variable is used in self attention, and the interactions between queries
116
+ for key, values in self.p_self_attn.items():
117
+ for name in values:
118
+ self.self_attn_dict["{}_{}".format(key, name)] = self.p_self_attn[key][name]
119
+
120
+ # initialize masking
121
+ self.masking = self.p_masking
122
+
123
+ # initialize query_index
124
+ self.query_index = {"all":[0, None]}
125
+
126
+
127
+ def set(self, name, _type, output=None, pos=None, var=None):
128
+ if var is not None:
129
+ self.attn_variables[name] = var
130
+ elif name in self.duplication_dict:
131
+ assert self.duplication_dict[name] in self.attn_variables, "Duplication variable {} is not initialized yet.".format(name)
132
+ self.attn_variables[name] = self.attn_variables[self.duplication_dict[name]].copy()
133
+ else:
134
+ var = Variable(output, name, _type, pos)
135
+ self.attn_variables[name] = var
136
+
137
+ def set_results(self, results):
138
+ for name in self.cross_attn_name:
139
+ self.attn_variables[name].attn_mask = results['attn_mask'][:,self.query_index[name][0]:self.query_index[name][1]]
140
+ for key in self.output:
141
+ self.output[key].append(results[key])
142
+
143
+ def set_maskings(self, name, masking):
144
+ self.attn_variables[name].masking = masking
145
+
146
+ def cross_attn_variables(self, ):
147
+ cross_attn_name = [key for key, value in self.cross_attn_dict.items()
148
+ if (value==True) and (key in self.attn_variables)
149
+ and ((key not in self.flags) or (key in self.flags and self.flags[key]==True))]
150
+ self.cross_attn_name = cross_attn_name
151
+
152
+ output = torch.cat([self.attn_variables[name].output for name in cross_attn_name])
153
+ pos_emb = torch.cat([self.attn_variables[name].pos for name in cross_attn_name])
154
+
155
+ index = 0
156
+ for name in cross_attn_name:
157
+ self.query_index[name] = [index, index + self.attn_variables[name].output.shape[0]]
158
+ index += self.attn_variables[name].output.shape[0]
159
+ return output, pos_emb
160
+
161
+ def cross_attn_mask(self, size, num_heads):
162
+ attn_mask = torch.cat([self.attn_variables[name].attn_mask for name in self.cross_attn_name], dim=1)
163
+
164
+ # hard code memories_spatial to previous selected mask
165
+ if 'memories_spatial' in self.cross_attn_name:
166
+ memory_attn_mask = self.spatial_memory['prev_batch_mask']
167
+ bs,c,_,_ = memory_attn_mask.shape
168
+ memory_attn_mask = F.interpolate(memory_attn_mask, size, mode='bilinear', align_corners=False)
169
+ memory_attn_mask = (memory_attn_mask.sigmoid().flatten(2).unsqueeze(1).repeat(1, num_heads, 1, 1).flatten(0, 1) < 0.5).bool().detach()
170
+ attn_mask[:,self.query_index['memories_spatial'][0]:self.query_index['memories_spatial'][1]] = memory_attn_mask
171
+
172
+ attn_mask[torch.where(attn_mask.sum(-1) == attn_mask.shape[-1])] = False
173
+ return attn_mask
174
+
175
+ def self_attn(self, bs, num_heads):
176
+ self_attn_name = [key for key, value in self.self_attn_dict.items()
177
+ if len(value)>0 and key in self.attn_variables
178
+ and ((key not in self.flags) or (key in self.flags and self.flags[key]==True))]
179
+ self.self_attn_name = self_attn_name
180
+
181
+ output = torch.cat([self.attn_variables[name].output for name in self_attn_name])
182
+ pos_emb = torch.cat([self.attn_variables[name].pos for name in self_attn_name])
183
+
184
+ index = 0
185
+ for name in self_attn_name:
186
+ self.query_index[name] = [index, index + self.attn_variables[name].output.shape[0]]
187
+ index += self.attn_variables[name].output.shape[0]
188
+
189
+ self_attn_mask = torch.ones((bs, output.shape[0], output.shape[0]), dtype=torch.bool, device=output.device)
190
+ self_attn_pair = []
191
+ # build self_attention mask by query interaction
192
+ for key1, value in self.self_attn_dict.items():
193
+ for key2 in value:
194
+ if key1 not in self_attn_name or key2 not in self_attn_name:
195
+ # exclude the variables that are not used in the current layer
196
+ continue
197
+ if (key1 in self.masking or key2 in self.masking) and (key1 != key2):
198
+ self_attn_pair += [[key1, key2]]
199
+ self_attn_mask[:,self.query_index[key1][0]:self.query_index[key1][1], self.query_index[key2][0]:self.query_index[key2][1]] = False
200
+
201
+ # build self_attention mask by masking, for bidirectional
202
+ for key in self.masking:
203
+ if key in self_attn_name:
204
+ self_attn_mask[:,self.query_index[key][0]:self.query_index[key][1],self.query_index[key][0]:self.query_index[key][1]][self.attn_variables[key].masking] = True
205
+ self_attn_mask[:,self.query_index[key][0]:self.query_index[key][1],self.query_index[key][0]:self.query_index[key][1]].transpose(1,2)[self.attn_variables[key].masking] = True
206
+
207
+ # build self_attention mask by masking, for uni-directional
208
+ for key1, key2 in self_attn_pair:
209
+ if key1 not in self_attn_name or key2 not in self_attn_name:
210
+ # exclude the variables that are not used in the current layer
211
+ continue
212
+ if key1 in self.masking:
213
+ self_attn_mask[:,self.query_index[key1][0]:self.query_index[key1][1],self.query_index[key2][0]:self.query_index[key2][1]][self.attn_variables[key1].masking] = True # HACK, not verified
214
+ if key2 in self.masking:
215
+ self_attn_mask[:,self.query_index[key1][0]:self.query_index[key1][1],self.query_index[key2][0]:self.query_index[key2][1]].transpose(1,2)[self.attn_variables[key2].masking] = True
216
+
217
+ self_attn_mask = self_attn_mask.repeat_interleave(num_heads, dim=0)
218
+ return output, pos_emb, self_attn_mask
219
+
220
+ def update_variables(self, output, mode):
221
+ name_set = self.self_attn_name if mode=='self_attn' else self.cross_attn_name
222
+ for key in name_set:
223
+ self.attn_variables[key].output = output[self.query_index[key][0]:self.query_index[key][1]]
224
+
225
+ def update_spatial_results(self, results):
226
+ v_emb = results['pred_smaskembs']
227
+ pred_smasks = results['pred_smasks']
228
+
229
+ s_emb = results['pred_pspatials']
230
+ pred_logits = v_emb @ s_emb.transpose(1,2)
231
+ logits_idx_y = pred_logits[:,:,0].max(dim=1)[1]
232
+ logits_idx_x = torch.arange(len(logits_idx_y), device=logits_idx_y.device)
233
+ logits_idx = torch.stack([logits_idx_x, logits_idx_y]).tolist()
234
+ pred_masks_pos = pred_smasks[logits_idx][:,None,]
235
+
236
+ # s_emb = results['pred_nspatials']
237
+ # pred_logits = v_emb @ s_emb.transpose(1,2)
238
+ # logits_idx_y = pred_logits[:,:,0].max(dim=1)[1]
239
+ # logits_idx_x = torch.arange(len(logits_idx_y), device=logits_idx_y.device)
240
+ # logits_idx = torch.stack([logits_idx_x, logits_idx_y]).tolist()
241
+ # pred_masks_neg = pred_smasks[logits_idx][:,None,]
242
+ # # clip the negative mask to 0, and then multiply by -1
243
+ # pred_masks_neg = (pred_masks_neg.clip(0) * -1)
244
+ # keep_neg = (s_emb.sum(dim=list(range(1, s_emb.dim()))) != 0).float()
245
+ # pred_masks_neg = pred_masks_neg * keep_neg[:,None,None,None]
246
+ # extra = {"prev_mask": pred_masks_pos + pred_masks_neg}
247
+
248
+ extra = {"prev_mask": pred_masks_pos}
249
+ return extra
250
+
251
+ def organize_output(self, ):
252
+ outputs = {}
253
+ outputs['aux_outputs'] = [{} for i in range(self.num_layers)]
254
+ for key, values in self.output.items():
255
+ for _key, idx_name in zip(predict_name_matcher[key], predict_index_matcher[key]):
256
+ if idx_name not in self.query_index:
257
+ continue
258
+ outputs[_key] = self.output[key][-1][:,self.query_index[idx_name][0]:self.query_index[idx_name][1]]
259
+ for idx, aux_values in enumerate(self.output[key][:-1]):
260
+ outputs['aux_outputs'][idx][_key] = aux_values[:,self.query_index[idx_name][0]:self.query_index[idx_name][1]]
261
+ if self.task == 'spatial' or self.task == 'refimg':
262
+ outputs = self.update_spatial_results(outputs)
263
+ # outputs = self.update_spatial_results(outputs)
264
+ return outputs
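
Note: in this variant `update_spatial_results` selects, per batch element, the spatial mask whose embedding is most similar to the positive click embedding (a dot-product argmax over queries) and feeds it back as `extra['prev_mask']` for the spatial-memory path. A numeric sketch of that selection with toy shapes:

```python
# Numeric sketch of the argmax selection in update_spatial_results (toy shapes).
import torch

bs, nq, d, h, w = 2, 5, 8, 16, 16
v_emb = torch.randn(bs, nq, d)        # pred_smaskembs: one embedding per spatial query
s_emb = torch.randn(bs, 1, d)         # pred_pspatials: positive click embedding
pred_smasks = torch.randn(bs, nq, h, w)

pred_logits = v_emb @ s_emb.transpose(1, 2)    # (bs, nq, 1) similarity scores
best = pred_logits[:, :, 0].max(dim=1)[1]      # best query index per batch item
batch = torch.arange(bs)
prev_mask = pred_smasks[batch, best][:, None]  # (bs, 1, h, w), passed on as extra['prev_mask']
print(prev_mask.shape)
```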
modeling/interface/prototype/attention_data_struct_seemv1.py ADDED
@@ -0,0 +1,302 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
+ predict_name_matcher = {"predictions_class": ["pred_logits"],
6
+ "predictions_mask":["pred_masks", "pred_gmasks", "pred_smasks"],
7
+ "predictions_caption":["pred_captions", "pred_gtexts", "pred_stexts"],
8
+ "predictions_maskemb":["pred_smaskembs"],
9
+ "predictions_pos_spatial":["pred_pspatials"],
10
+ "predictions_neg_spatial":["pred_nspatials"],}
11
+
12
+ predict_index_matcher = {"predictions_class": ["queries_object"],
13
+ "predictions_mask":["queries_object", "queries_grounding", "queries_spatial"],
14
+ "predictions_caption": ["queries_object", "queries_grounding", "queries_spatial"],
15
+ "predictions_maskemb":["queries_spatial"],
16
+ "predictions_pos_spatial":["all"],
17
+ "predictions_neg_spatial":["all"],}
18
+
19
+ class Variable(object):
20
+ '''
21
+ Store dataset variable for attention
22
+ output: embedding that accumulates during cross/self attention
23
+ pos: positional embedding that is fixed during cross/self attention
24
+ name: name of the variable
25
+ type: type of the variable, e.g. queries, tokens
26
+ attn_mask: attention mask for cross attention
27
+ masking: masking for padding
28
+ '''
29
+ def __init__(self, output, name, _type, pos=None):
30
+ self.output = output
31
+ self.pos = pos
32
+ self.name = name
33
+ self.type = _type
34
+ self.attn_mask = None
35
+ self.masking = None
36
+
37
+ def copy(self,):
38
+ output = self.output.clone() if self.output is not None else None
39
+ pos = self.pos.clone() if self.pos is not None else None
40
+ return Variable(output, self.name, self.type, pos)
41
+
42
+ def rand_sample(self, max_len):
43
+ rand_idx = torch.randint(0, len(self.pos), (max_len,))
44
+ self.output = self.output[rand_idx]
45
+ self.pos = self.pos[rand_idx]
46
+ return self
47
+
48
+ class AttentionDataStruct(nn.Module):
49
+ '''
50
+ Store dataset structure for cross/self attention
51
+ task_switch: switch for different tasks
52
+
53
+ p_attn_variables: prototype of variables that is used in cross/self attention
54
+ p_self_attn: prototype of variables that is used in self attention
55
+ p_cross_attn: prototype of variables that is used in cross attention
56
+ p_iter: prototype of iteration for different queries
57
+ p_masking: prototype of masking for different tokens
58
+ p_duplication: prototype of duplication for different queries
59
+ '''
60
+ def __init__(self, attn_arch, task_switch):
61
+ super(AttentionDataStruct, self).__init__()
62
+ self.task_switch = task_switch
63
+
64
+ # p stands for prototype
65
+ self.p_attn_variables = attn_arch['VARIABLE']
66
+ self.p_self_attn = attn_arch['SELF_ATTENTION']
67
+ self.p_cross_attn = attn_arch['CROSS_ATTENTION']
68
+ self.p_masking = attn_arch['MASKING']
69
+ self.p_duplication = attn_arch['DUPLICATION']
70
+
71
+ self.num_layers = attn_arch['NUM_LAYERS']
72
+
73
+ def reset(self, flags, task, extra):
74
+ # reset variables
75
+ self.attn_variables = {}
76
+ self.cross_attn_dict = {}
77
+ self.self_attn_dict = {}
78
+ self.duplication_dict = {}
79
+ self.query_index = {}
80
+ self.output = {}
81
+ self.flags = {}
82
+ self.spatial_memory = {}
83
+ self.extra = {}
84
+
85
+ # initialize duplication
86
+ for key, values in self.p_duplication.items():
87
+ for name in values:
88
+ self.duplication_dict["{}_{}".format(key, name)] = self.p_duplication[key][name]
89
+
90
+ # initialize flag
91
+ self.flags = {"object": True}
92
+ self.flags.update(flags)
93
+
94
+ # initialize task
95
+ self.task = task
96
+
97
+ # initialize output
98
+ if self.task_switch['mask']:
99
+ self.output['predictions_class'] = []
100
+ self.output['predictions_mask'] = []
101
+
102
+ if self.task_switch['bbox']:
103
+ self.output['predictions_bbox'] = []
104
+
105
+ if self.task_switch['spatial'] and ('memories_spatial' in self.flags and self.flags['memories_spatial']==True):
106
+ self.spatial_memory['prev_batch_mask'] = extra['prev_mask']
107
+
108
+ if self.task_switch['grounding'] and ('grounding' in self.flags and self.flags['grounding']==True):
109
+ self.output['predictions_caption'] = []
110
+
111
+ if self.task_switch['spatial'] and ('spatial' in self.flags and self.flags['spatial']==True):
112
+ self.output['predictions_maskemb'] = []
113
+ self.output['predictions_pos_spatial'] = []
114
+ self.output['predictions_neg_spatial'] = []
115
+ self.output['predictions_mask'] = [] if 'predictions_mask' not in self.output else self.output['predictions_mask']
116
+ self.output['predictions_class'] = [] if 'predictions_class' not in self.output else self.output['predictions_class']
117
+ self.output['predictions_caption'] = [] if 'predictions_caption' not in self.output else self.output['predictions_caption']
118
+
119
+ # initialize cross_attn, whether the variable is used in cross attention
120
+ for key, values in self.p_cross_attn.items():
121
+ for name in values:
122
+ self.cross_attn_dict["{}_{}".format(key, name)] = self.p_cross_attn[key][name]
123
+
124
+ # initialize self_attn, whether the variable is used in self attention, and the interactions between queries
125
+ for key, values in self.p_self_attn.items():
126
+ for name in values:
127
+ self.self_attn_dict["{}_{}".format(key, name)] = self.p_self_attn[key][name]
128
+
129
+ # initialize masking
130
+ self.masking = self.p_masking
131
+
132
+ # initialize query_index
133
+ self.query_index = {"all":[0, None]}
134
+
135
+
136
+ def set(self, name, _type, output=None, pos=None, var=None, sample_size=None):
137
+ if var is not None:
138
+ self.attn_variables[name] = var
139
+ elif name in self.duplication_dict:
140
+ assert self.duplication_dict[name] in self.attn_variables, "Duplication variable {} is not initialized yet.".format(name)
141
+ var = self.attn_variables[self.duplication_dict[name]].copy()
142
+ if sample_size is not None:
143
+ var = var.rand_sample(sample_size)
144
+ self.attn_variables[name] = var
145
+ else:
146
+ var = Variable(output, name, _type, pos)
147
+ self.attn_variables[name] = var
148
+
149
+ def set_results(self, results):
150
+ for name in self.cross_attn_name:
151
+ self.attn_variables[name].attn_mask = results['attn_mask'][:,self.query_index[name][0]:self.query_index[name][1]]
152
+ for key in self.output:
153
+ self.output[key].append(results[key])
154
+
155
+ def set_maskings(self, name, masking):
156
+ self.attn_variables[name].masking = masking
157
+
158
+ def set_extra(self, extra):
159
+ self.extra.update(extra)
160
+
161
+ def cross_attn_variables(self, ):
162
+ cross_attn_name = [key for key, value in self.cross_attn_dict.items()
163
+ if (value==True) and (key in self.attn_variables)
164
+ and ((key not in self.flags) or (key in self.flags and self.flags[key]==True))]
165
+ self.cross_attn_name = cross_attn_name
166
+
167
+ output = torch.cat([self.attn_variables[name].output for name in cross_attn_name])
168
+ pos_emb = torch.cat([self.attn_variables[name].pos for name in cross_attn_name])
169
+
170
+ index = 0
171
+ for name in cross_attn_name:
172
+ self.query_index[name] = [index, index + self.attn_variables[name].output.shape[0]]
173
+ index += self.attn_variables[name].output.shape[0]
174
+ return output, pos_emb
175
+
176
+ def cross_attn_mask(self, size, num_heads):
177
+ attn_mask = torch.cat([self.attn_variables[name].attn_mask for name in self.cross_attn_name], dim=1)
178
+
179
+ # hard code memories_spatial to previous selected mask
180
+ if 'memories_spatial' in self.cross_attn_name:
181
+ memory_attn_mask = self.spatial_memory['prev_batch_mask']
182
+ bs,c,_,_ = memory_attn_mask.shape
183
+ memory_attn_mask = F.interpolate(memory_attn_mask, size, mode='bilinear', align_corners=False)
184
+ memory_attn_mask = (memory_attn_mask.sigmoid().flatten(2).unsqueeze(1).repeat(1, num_heads, 1, 1).flatten(0, 1) < 0.5).bool().detach()
185
+ repeat = (self.query_index['memories_spatial'][1] - self.query_index['memories_spatial'][0]) // c
186
+ mem_len = self.query_index['memories_spatial'][1] - self.query_index['memories_spatial'][0]
187
+ probs = torch.tensor([1./repeat for i in range(c)])
188
+ indices = torch.multinomial(probs, num_samples=mem_len, replacement=True).sort()[0]
189
+ attn_mask[:,self.query_index['memories_spatial'][0]:self.query_index['memories_spatial'][1]] = memory_attn_mask[:,indices]
190
+ self.extra['memory_indices'] = indices
191
+
192
+ attn_mask[torch.where(attn_mask.sum(-1) == attn_mask.shape[-1])] = False
193
+ return attn_mask
194
+
195
+ def self_attn(self, bs, num_heads):
196
+ self_attn_name = [key for key, value in self.self_attn_dict.items()
197
+ if len(value)>0 and key in self.attn_variables
198
+ and ((key not in self.flags) or (key in self.flags and self.flags[key]==True))]
199
+ self.self_attn_name = self_attn_name
200
+
201
+ output = torch.cat([self.attn_variables[name].output for name in self_attn_name])
202
+ pos_emb = torch.cat([self.attn_variables[name].pos for name in self_attn_name])
203
+
204
+ index = 0
205
+ for name in self_attn_name:
206
+ self.query_index[name] = [index, index + self.attn_variables[name].output.shape[0]]
207
+ index += self.attn_variables[name].output.shape[0]
208
+
209
+ self_attn_mask = torch.ones((bs, output.shape[0], output.shape[0]), dtype=torch.bool, device=output.device)
210
+ self_attn_pair = []
211
+ # build self_attention mask by query interaction
212
+ for key1, value in self.self_attn_dict.items():
213
+ for key2 in value:
214
+ if key1 not in self_attn_name or key2 not in self_attn_name:
215
+ # exclude the variables that are not used in the current layer
216
+ continue
217
+ if (key1 in self.masking or key2 in self.masking) and (key1 != key2):
218
+ self_attn_pair += [[key1, key2]]
219
+ self_attn_mask[:,self.query_index[key1][0]:self.query_index[key1][1], self.query_index[key2][0]:self.query_index[key2][1]] = False
220
+
221
+ # build self_attention mask by masking, for bidirectional
222
+ for key in self.masking:
223
+ if key in self_attn_name:
224
+ self_attn_mask[:,self.query_index[key][0]:self.query_index[key][1],self.query_index[key][0]:self.query_index[key][1]][self.attn_variables[key].masking] = True
225
+ self_attn_mask[:,self.query_index[key][0]:self.query_index[key][1],self.query_index[key][0]:self.query_index[key][1]].transpose(1,2)[self.attn_variables[key].masking] = True
226
+
227
+ # build self_attention mask by masking, for uni-directional
228
+ for key1, key2 in self_attn_pair:
229
+ if key1 not in self_attn_name or key2 not in self_attn_name:
230
+ # exclude the variables that are not used in the current layer
231
+ continue
232
+ if key1 in self.masking:
233
+ self_attn_mask[:,self.query_index[key1][0]:self.query_index[key1][1],self.query_index[key2][0]:self.query_index[key2][1]][self.attn_variables[key1].masking] = True # HACK, not verified
234
+ if key2 in self.masking:
235
+ self_attn_mask[:,self.query_index[key1][0]:self.query_index[key1][1],self.query_index[key2][0]:self.query_index[key2][1]].transpose(1,2)[self.attn_variables[key2].masking] = True
236
+
237
+ # build self_attention masking for spatial queries
238
+ # spatial query attend with itself
239
+ if 'queries_spatial' in self_attn_name and 'tokens_spatial' in self_attn_name:
240
+ diag_mask = ~(torch.eye(self.extra['spatial_query_number']).repeat_interleave(self.extra['sample_size'],dim=0).repeat_interleave(self.extra['sample_size'],dim=1)).bool()
241
+ self_attn_mask[:,self.query_index['queries_spatial'][0]:self.query_index['queries_spatial'][1],self.query_index['queries_spatial'][0]:self.query_index['queries_spatial'][1]] = diag_mask[None,]
242
+ # spatial query attend with spatial token
243
+ indices = self.extra['spatial_indices'].permute(0,2,1)
244
+ diag_index = torch.arange(self.extra['spatial_query_number'], device=indices.device).repeat_interleave(self.extra['sample_size'],dim=0)[None,:,None]
245
+ diag_mask = ~(indices == diag_index)
246
+ self_attn_mask[:,self.query_index['queries_spatial'][0]:self.query_index['queries_spatial'][1],self.query_index['tokens_spatial'][0]:self.query_index['tokens_spatial'][1]] = diag_mask
247
+ # spatial token attend with itself
248
+ diag_mask = ~(indices == indices.transpose(1,2))
249
+ self_attn_mask[:,self.query_index['tokens_spatial'][0]:self.query_index['tokens_spatial'][1],self.query_index['tokens_spatial'][0]:self.query_index['tokens_spatial'][1]] = diag_mask
250
+
251
+ if 'memory_indices' in self.extra:
252
+ # spatial query attend with memory
253
+ memory_indices = self.extra['memory_indices'][None,None,:]
254
+ diag_index = torch.arange(self.extra['spatial_query_number'], device=memory_indices.device).repeat_interleave(self.extra['sample_size'],dim=0)[None,:,None]
255
+ diag_mask = ~(diag_index == memory_indices)
256
+ self_attn_mask[:,self.query_index['queries_spatial'][0]:self.query_index['queries_spatial'][1],self.query_index['memories_spatial'][0]:self.query_index['memories_spatial'][1]] = diag_mask
257
+ # memory attend with itself
258
+ diag_mask = ~(memory_indices == memory_indices.transpose(1,2))
259
+ self_attn_mask[:,self.query_index['memories_spatial'][0]:self.query_index['memories_spatial'][1],self.query_index['memories_spatial'][0]:self.query_index['memories_spatial'][1]] = diag_mask
260
+
261
+ self_attn_mask = self_attn_mask.repeat_interleave(num_heads, dim=0)
262
+ return output, pos_emb, self_attn_mask
263
+
264
+ def update_variables(self, output, mode):
265
+ name_set = self.self_attn_name if mode=='self_attn' else self.cross_attn_name
266
+ for key in name_set:
267
+ self.attn_variables[key].output = output[self.query_index[key][0]:self.query_index[key][1]]
268
+
269
+ def update_spatial_results(self, results):
270
+ v_emb = results['pred_smaskembs']
271
+ pred_smasks = results['pred_smasks']
272
+
273
+ s_emb = results['pred_pspatials']
274
+ diag_mask = ~(torch.eye(self.extra['spatial_query_number'], device=s_emb.device).repeat_interleave(self.extra['sample_size'],dim=0)).bool()
275
+ offset = torch.zeros_like(diag_mask, device=s_emb.device).float()
276
+ offset.masked_fill_(diag_mask, float("-inf"))
277
+
278
+ pred_logits = v_emb @ s_emb.transpose(1,2) + offset[None,]
279
+ bs,_,ns=pred_logits.shape
280
+ _,_,h,w=pred_smasks.shape
281
+
282
+ logits_idx_y = pred_logits.max(dim=1)[1]
283
+ logits_idx_x = torch.arange(len(logits_idx_y), device=logits_idx_y.device)[:,None].repeat(1, logits_idx_y.shape[1])
284
+ logits_idx = torch.stack([logits_idx_x, logits_idx_y]).view(2,-1).tolist()
285
+ pred_masks_pos = pred_smasks[logits_idx].reshape(bs,ns,h,w)
286
+ extra = {"prev_mask": pred_masks_pos}
287
+ return extra
288
+
289
+ def organize_output(self, ):
290
+ outputs = {}
291
+ outputs['aux_outputs'] = [{} for i in range(self.num_layers)]
292
+ for key, values in self.output.items():
293
+ for _key, idx_name in zip(predict_name_matcher[key], predict_index_matcher[key]):
294
+ if idx_name not in self.query_index:
295
+ continue
296
+ outputs[_key] = self.output[key][-1][:,self.query_index[idx_name][0]:self.query_index[idx_name][1]]
297
+ for idx, aux_values in enumerate(self.output[key][:-1]):
298
+ outputs['aux_outputs'][idx][_key] = aux_values[:,self.query_index[idx_name][0]:self.query_index[idx_name][1]]
299
+ if self.task == 'spatial' or self.task == 'refimg':
300
+ outputs = self.update_spatial_results(outputs)
301
+ # outputs = self.update_spatial_results(outputs)
302
+ return outputs
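
Note: the v1 struct extends the spatial path to multiple simultaneous clicks: spatial queries are replicated per click, and the boolean self-attention mask is made block-diagonal so each click's queries and point tokens only attend within their own group (True means "masked out"). A small sketch of how that block-diagonal mask is built:

```python
# Sketch of the block-diagonal self-attention mask for multi-click spatial queries.
import torch

spatial_query_number, sample_size = 3, 4   # e.g. 3 clicks, 4 replicated queries per click
eye = torch.eye(spatial_query_number)
block = eye.repeat_interleave(sample_size, dim=0).repeat_interleave(sample_size, dim=1)
diag_mask = ~block.bool()                  # (12, 12): True (blocked) outside each click's block
print(diag_mask.int())
```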
modeling/interface/seem_demo.py ADDED
@@ -0,0 +1,397 @@
1
+ # --------------------------------------------------------
2
+ # SEEM -- Segment Everything Everywhere All At Once
3
+ # Licensed under The Apache License 2.0 [see LICENSE for details]
4
+ # Written by Xueyan Zou ([email protected]), Jianwei Yang ([email protected])
5
+ # --------------------------------------------------------
6
+
7
+ import logging
8
+ from typing import Optional
9
+
10
+ import torch
11
+ from torch import nn, Tensor
12
+ from torch.nn import functional as F
13
+
14
+ from timm.models.layers import trunc_normal_
15
+ from detectron2.layers import Conv2d
16
+ import fvcore.nn.weight_init as weight_init
17
+
18
+ from .build import register_decoder
19
+ from .modules import SelfAttentionLayer, CrossAttentionLayer, FFNLayer, MLP
20
+ from .prototype.attention_data_struct_seemdemo import AttentionDataStruct
21
+ from ..utils import rand_sample_plain as rand_sample
22
+ from ..utils import prepare_features, configurable
23
+ from ..modules import PositionEmbeddingSine
24
+ from ..modules.point_features import point_sample
25
+
26
+
27
+ class SEEMDecoder(nn.Module):
28
+
29
+ @configurable
30
+ def __init__(
31
+ self,
32
+ lang_encoder: nn.Module,
33
+ in_channels,
34
+ mask_classification=True,
35
+ *,
36
+ hidden_dim: int,
37
+ dim_proj: int,
38
+ num_queries: int,
39
+ contxt_len: int,
40
+ nheads: int,
41
+ dim_feedforward: int,
42
+ dec_layers: int,
43
+ pre_norm: bool,
44
+ mask_dim: int,
45
+ task_switch: dict,
46
+ enforce_input_project: bool,
47
+ max_spatial_len: int,
48
+ attn_arch: dict,
49
+ ):
50
+ """
51
+ NOTE: this interface is experimental.
52
+ Args:
53
+ in_channels: channels of the input features
54
+ mask_classification: whether to add mask classifier or not
55
+ num_classes: number of classes
56
+ hidden_dim: Transformer feature dimension
57
+ num_queries: number of queries
58
+ nheads: number of heads
59
+ dim_feedforward: feature dimension in feedforward network
60
+ enc_layers: number of Transformer encoder layers
61
+ dec_layers: number of Transformer decoder layers
62
+ pre_norm: whether to use pre-LayerNorm or not
63
+ mask_dim: mask feature dimension
64
+ enforce_input_project: add input project 1x1 conv even if input
65
+ channels and hidden dim is identical
66
+ """
67
+ super().__init__()
68
+ assert mask_classification, "Only support mask classification model"
69
+ self.mask_classification = mask_classification
70
+
71
+ # positional encoding
72
+ N_steps = hidden_dim // 2
73
+ self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True)
74
+
75
+ # define Transformer decoder here
76
+ self.num_heads = nheads
77
+ self.num_layers = dec_layers
78
+ self.contxt_len = contxt_len
79
+ self.transformer_self_attention_layers = nn.ModuleList()
80
+ self.transformer_cross_attention_layers = nn.ModuleList()
81
+ self.transformer_ffn_layers = nn.ModuleList()
82
+
83
+ for _ in range(self.num_layers):
84
+ self.transformer_self_attention_layers.append(
85
+ SelfAttentionLayer(
86
+ d_model=hidden_dim,
87
+ nhead=nheads,
88
+ dropout=0.0,
89
+ normalize_before=pre_norm,
90
+ )
91
+ )
92
+
93
+ self.transformer_cross_attention_layers.append(
94
+ CrossAttentionLayer(
95
+ d_model=hidden_dim,
96
+ nhead=nheads,
97
+ dropout=0.0,
98
+ normalize_before=pre_norm,
99
+ )
100
+ )
101
+
102
+ self.transformer_ffn_layers.append(
103
+ FFNLayer(
104
+ d_model=hidden_dim,
105
+ dim_feedforward=dim_feedforward,
106
+ dropout=0.0,
107
+ normalize_before=pre_norm,
108
+ )
109
+ )
110
+
111
+ self.decoder_norm = nn.LayerNorm(hidden_dim)
112
+
113
+ self.num_queries = num_queries
114
+ # learnable query features
115
+ self.query_feat = nn.Embedding(num_queries, hidden_dim)
116
+ # learnable query p.e.
117
+ self.query_embed = nn.Embedding(num_queries, hidden_dim)
118
+ # learnable positive negative indicator
119
+ self.pn_indicator = nn.Embedding(2, hidden_dim)
120
+
121
+ # level embedding (we always use 3 scales)
122
+ self.num_feature_levels = 3
123
+ self.level_embed = nn.Embedding(self.num_feature_levels, hidden_dim)
124
+ self.input_proj = nn.ModuleList()
125
+
126
+ for _ in range(self.num_feature_levels):
127
+ if in_channels != hidden_dim or enforce_input_project:
128
+ self.input_proj.append(Conv2d(in_channels, hidden_dim, kernel_size=1))
129
+ weight_init.c2_xavier_fill(self.input_proj[-1])
130
+ else:
131
+ self.input_proj.append(nn.Sequential())
132
+
133
+ self.task_switch = task_switch
134
+ self.query_index = {}
135
+
136
+ # output FFNs
137
+ self.lang_encoder = lang_encoder
138
+ if self.task_switch['mask']:
139
+ self.mask_embed = MLP(hidden_dim, hidden_dim, mask_dim, 3)
140
+
141
+ self.class_embed = nn.Parameter(torch.empty(hidden_dim, dim_proj))
142
+ trunc_normal_(self.class_embed, std=.02)
143
+
144
+ if task_switch['bbox']:
145
+ self.bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3)
146
+
147
+ if task_switch['spatial']:
148
+ # spatial query
149
+ self.mask_sptial_embed = nn.ParameterList([nn.Parameter(torch.empty(hidden_dim, hidden_dim)) for x in range(3)])
150
+ trunc_normal_(self.mask_sptial_embed[0], std=.02)
151
+ trunc_normal_(self.mask_sptial_embed[1], std=.02)
152
+ trunc_normal_(self.mask_sptial_embed[2], std=.02)
153
+
154
+ self.max_spatial_len = max_spatial_len
155
+ # spatial memory
156
+ num_spatial_memories = attn_arch['SPATIAL_MEMORIES']
157
+ self.spatial_embed = nn.Embedding(num_spatial_memories, hidden_dim)
158
+ self.spatial_featured = nn.Embedding(num_spatial_memories, hidden_dim)
159
+
160
+ # build AttentionDataStruct
161
+ attn_arch['NUM_LAYERS'] = self.num_layers
162
+ self.attention_data = AttentionDataStruct(attn_arch, task_switch)
163
+
164
+ @classmethod
165
+ def from_config(cls, cfg, in_channels, lang_encoder, mask_classification, extra):
166
+ ret = {}
167
+
168
+ ret["lang_encoder"] = lang_encoder
169
+ ret["in_channels"] = in_channels
170
+ ret["mask_classification"] = mask_classification
171
+
172
+ enc_cfg = cfg['MODEL']['ENCODER']
173
+ dec_cfg = cfg['MODEL']['DECODER']
174
+
175
+ ret["hidden_dim"] = dec_cfg['HIDDEN_DIM']
176
+ ret["dim_proj"] = cfg['MODEL']['DIM_PROJ']
177
+ ret["num_queries"] = dec_cfg['NUM_OBJECT_QUERIES']
178
+ ret["contxt_len"] = cfg['MODEL']['TEXT']['CONTEXT_LENGTH']
179
+
180
+ # Transformer parameters:
181
+ ret["nheads"] = dec_cfg['NHEADS']
182
+ ret["dim_feedforward"] = dec_cfg['DIM_FEEDFORWARD']
183
+
184
+ # NOTE: because we add learnable query features which requires supervision,
185
+ # we add minus 1 to decoder layers to be consistent with our loss
186
+ # implementation: that is, number of auxiliary losses is always
187
+ # equal to number of decoder layers. With learnable query features, the number of
188
+ # auxiliary losses equals number of decoders plus 1.
189
+ assert dec_cfg['DEC_LAYERS'] >= 1
190
+ ret["dec_layers"] = dec_cfg['DEC_LAYERS'] - 1
191
+ ret["pre_norm"] = dec_cfg['PRE_NORM']
192
+ ret["enforce_input_project"] = dec_cfg['ENFORCE_INPUT_PROJ']
193
+ ret["mask_dim"] = enc_cfg['MASK_DIM']
194
+ ret["task_switch"] = extra['task_switch']
195
+ ret["max_spatial_len"] = dec_cfg['MAX_SPATIAL_LEN']
196
+
197
+ # attn data struct
198
+ ret["attn_arch"] = cfg['ATTENTION_ARCH']
199
+
200
+ return ret
201
+
202
+ def forward(self, x, mask_features, mask=None, target_queries=None, target_vlp=None, task='seg', extra={}):
203
+ # x is a list of multi-scale feature
204
+ assert len(x) == self.num_feature_levels; del mask
205
+ spatial_extra_flag = 'spatial_query_pos_mask' in extra.keys() or task == 'refimg'
206
+ grounding_extra_flag = 'grounding_tokens' in extra.keys()
207
+ visual_extra_flag = 'visual_query_pos' in extra.keys()
208
+ audio_extra_flag = 'audio_tokens' in extra.keys()
209
+ spatial_memory_flag = 'prev_mask' in extra.keys()
210
+ flags = {"spatial": spatial_extra_flag, "grounding": grounding_extra_flag, "memories_spatial": spatial_memory_flag, "visual": visual_extra_flag, "audio": audio_extra_flag}
211
+ self.attention_data.reset(flags, task, extra)
212
+
213
+ src, pos, size_list = prepare_features(x, self.num_feature_levels, self.pe_layer, self.input_proj, self.level_embed)
214
+ _, bs, _ = src[0].shape
215
+
216
+ # QxNxC
217
+ query_embed = self.query_embed.weight.unsqueeze(1).repeat(1, bs, 1)
218
+ output = self.query_feat.weight.unsqueeze(1).repeat(1, bs, 1)
219
+ self.attention_data.set('queries_object', 'queries', output, query_embed)
220
+
221
+ if self.task_switch['spatial'] and spatial_extra_flag:
222
+ # get divisor
223
+ _,h,w = extra['spatial_query_pos_mask'][0].shape
224
+ divisor = torch.tensor([h,w], device=output.device)[None,]
225
+
226
+ # Get mean pos spatial query
227
+ non_zero_pos_point = [rand_sample((m.nonzero()[:,1:]/divisor).t(), self.max_spatial_len[-1]).t() for m in extra['spatial_query_pos_mask']]
228
+ non_zero_pos_point = nn.utils.rnn.pad_sequence(non_zero_pos_point, padding_value=-1).permute(1,0,2)
229
+ non_zero_pos_mask = (non_zero_pos_point.sum(dim=-1) < 0)
230
+ spatial_query_pos = point_sample(mask_features, non_zero_pos_point.flip(dims=(2,)).type(mask_features.dtype), align_corners=True)
231
+ spatial_query_pos = torch.stack([x[m].mean(dim=0, keepdim=True) for x, m in zip(spatial_query_pos.transpose(1,2), ~non_zero_pos_mask)]).transpose(0,1).nan_to_num()
232
+
233
+ # Get mean neg spatial query
234
+ non_zero_neg_point = [rand_sample((m.nonzero()[:,1:]/divisor).t(), self.max_spatial_len[-1]).t() for m in extra['spatial_query_neg_mask']]
235
+ non_zero_neg_point = nn.utils.rnn.pad_sequence(non_zero_neg_point, padding_value=-1).permute(1,0,2)
236
+ non_zero_neg_mask = (non_zero_neg_point.sum(dim=-1) < 0)
237
+ spatial_query_neg = point_sample(mask_features, non_zero_neg_point.flip(dims=(2,)).type(mask_features.dtype), align_corners=True)
238
+ spatial_query_neg = torch.stack([x[m].mean(dim=0, keepdim=True) for x, m in zip(spatial_query_neg.transpose(1,2), ~non_zero_neg_mask)]).transpose(0,1).nan_to_num()
239
+
240
+ # merge positive and negative sample points for self attention
241
+
242
+ # Get layerwise spatial query
243
+ src_spatial_queries = []
244
+ src_spatial_maskings = []
245
+ for i in range(len(src)):
246
+ hw,_,dc = src[i].shape
247
+ src_mask_features = src[i].view(size_list[i][0],size_list[i][1],bs,dc)
248
+ src_mask_features = src_mask_features @ self.mask_sptial_embed[i]
249
+
250
+ non_zero_query_point_pos = [rand_sample((m.nonzero()[:,1:]/divisor).t(), self.max_spatial_len[i]).t() for m in extra['spatial_query_pos_mask']]
251
+ non_zero_query_point_neg = [rand_sample((m.nonzero()[:,1:]/divisor).t(), self.max_spatial_len[i]).t() for m in extra['spatial_query_neg_mask']]
252
+ non_zero_query_point = [torch.cat([x,y], dim=0) for x,y in zip(non_zero_query_point_pos, non_zero_query_point_neg)]
253
+
254
+ pos_neg_indicator = [torch.cat([torch.ones(x.shape[0], device=x.device), -torch.ones(y.shape[0], device=y.device)]) for x,y in zip(non_zero_query_point_pos, non_zero_query_point_neg)]
255
+ pos_neg_indicator = nn.utils.rnn.pad_sequence(pos_neg_indicator, padding_value=0)
256
+
257
+ non_zero_query_point = nn.utils.rnn.pad_sequence(non_zero_query_point, padding_value=-1).permute(1,0,2)
258
+ non_zero_query_mask = (non_zero_query_point.sum(dim=-1) < 0)
259
+ non_zero_query_point[non_zero_query_mask] = 0
260
+
261
+ spatial_tokens = point_sample(src_mask_features.permute(2,3,0,1), non_zero_query_point.flip(dims=(2,)).type(src_mask_features.dtype), align_corners=True).permute(2,0,1)
262
+ spatial_tokens[pos_neg_indicator==1] += self.pn_indicator.weight[0:1]
263
+ spatial_tokens[pos_neg_indicator==-1] += self.pn_indicator.weight[1:2]
264
+
265
+ src_spatial_queries += [spatial_tokens]
266
+ src_spatial_maskings += [non_zero_query_mask]
267
+
268
+ if 'refimg' in task:
269
+ output_refimg = {}
270
+ output_refimg['visual_query_pos'] = spatial_query_pos
271
+ output_refimg['visual_query_neg'] = spatial_query_neg
272
+ output_refimg['src_visual_queries'] = src_spatial_queries
273
+ output_refimg['src_visual_maskings'] = src_spatial_maskings
274
+ return output_refimg
275
+
276
+ if task != 'demo':
277
+ # Get object query for spatial index
278
+ self.attention_data.set('queries_spatial', 'queries')
279
+
280
+ if self.task_switch['visual'] and visual_extra_flag:
281
+ visual_query_pos = extra['visual_query_pos']
282
+ visual_query_neg = extra['visual_query_neg']
283
+ src_visual_queries = extra['src_visual_queries']
284
+ src_visual_maskings = extra['src_visual_maskings']
285
+
286
+ if self.task_switch['grounding'] and grounding_extra_flag:
287
+ # Get grounding tokens
288
+ grounding_tokens = extra['grounding_tokens']
289
+ _grounding_tokens = grounding_tokens.detach().clone()
290
+
291
+ self.attention_data.set('tokens_grounding', 'tokens', grounding_tokens, _grounding_tokens)
292
+ self.attention_data.set_maskings('tokens_grounding', extra['grounding_nonzero_mask'])
293
+
294
+ if self.task_switch['audio'] and audio_extra_flag:
295
+ # Get audio tokens
296
+ grounding_tokens = extra['audio_tokens']
297
+ _grounding_tokens = grounding_tokens.detach().clone()
298
+
299
+ self.attention_data.set('tokens_audio', 'tokens', grounding_tokens, _grounding_tokens)
300
+ self.attention_data.set_maskings('tokens_audio', extra['audio_nonzero_mask'])
301
+
302
+ output, query_embed = self.attention_data.cross_attn_variables()
303
+ # prediction heads on learnable query features
304
+ results = self.forward_prediction_heads(output, mask_features, attn_mask_target_size=size_list[0])
305
+ results["predictions_pos_spatial"] = spatial_query_pos.transpose(0,1) if spatial_extra_flag else None
306
+ results["predictions_neg_spatial"] = spatial_query_neg.transpose(0,1) if spatial_extra_flag else None
307
+ results["predictions_pos_visual"] = visual_query_pos.transpose(0,1) if visual_extra_flag else None
308
+ results["predictions_neg_visual"] = visual_query_neg.transpose(0,1) if visual_extra_flag else None
309
+ self.attention_data.set_results(results)
310
+
311
+ for i in range(self.num_layers):
312
+ level_index = i % self.num_feature_levels
313
+ # CROSS ATTENTION
314
+ output, avg_attn = self.transformer_cross_attention_layers[i](
315
+ output, src[level_index],
316
+ memory_mask=self.attention_data.cross_attn_mask(size_list[level_index], self.num_heads),
317
+ memory_key_padding_mask=None, # here we do not apply masking on padded region
318
+ pos=pos[level_index], query_pos=query_embed
319
+ )
320
+ self.attention_data.update_variables(output, 'cross_attn')
321
+
322
+ # SELF ATTENTION
323
+ self_attn_mask = torch.zeros((bs, self.num_queries, self.num_queries), device=query_embed.device).bool() # Default False (attend oq)
324
+ if self.task_switch['spatial'] and spatial_extra_flag:
325
+ # get spatial tokens
326
+ spatial_tokens = src_spatial_queries[level_index]
327
+ _spatial_tokens = spatial_tokens.detach().clone()
328
+
329
+ self.attention_data.set('tokens_spatial', 'tokens', spatial_tokens, _spatial_tokens)
330
+ self.attention_data.set_maskings('tokens_spatial', src_spatial_maskings[level_index])
331
+
332
+ if self.task_switch['visual'] and visual_extra_flag:
333
+ # get visual tokens
334
+ visual_tokens = src_visual_queries[level_index]
335
+ _visual_tokens = visual_tokens.detach().clone()
336
+
337
+ self.attention_data.set('tokens_visual', 'tokens', visual_tokens, _visual_tokens)
338
+ self.attention_data.set_maskings('tokens_visual', src_visual_maskings[level_index])
339
+
340
+ output, query_embed, self_attn_mask = self.attention_data.self_attn(bs, self.num_heads)
341
+ output = self.transformer_self_attention_layers[i](
342
+ output, tgt_mask=self_attn_mask,
343
+ tgt_key_padding_mask=None,
344
+ query_pos=query_embed)
345
+
346
+ # FFN
347
+ output = self.transformer_ffn_layers[i](
348
+ output
349
+ )
350
+
351
+ self.attention_data.update_variables(output, 'self_attn')
352
+ output, query_embed = self.attention_data.cross_attn_variables()
353
+ results = self.forward_prediction_heads(output, mask_features, attn_mask_target_size=size_list[(i + 1) % self.num_feature_levels], layer_id=i)
354
+ results["predictions_pos_spatial"] = spatial_query_pos.transpose(0,1) if spatial_extra_flag else None
355
+ results["predictions_neg_spatial"] = spatial_query_neg.transpose(0,1) if spatial_extra_flag else None
356
+ results["predictions_pos_visual"] = visual_query_pos.transpose(0,1) if visual_extra_flag else None
357
+ results["predictions_neg_visual"] = visual_query_neg.transpose(0,1) if visual_extra_flag else None
358
+ self.attention_data.set_results(results)
359
+
360
+ return self.attention_data.organize_output()
361
+
362
+ def forward_prediction_heads(self, output, mask_features, attn_mask_target_size, layer_id=-1):
363
+ decoder_output = self.decoder_norm(output)
364
+ decoder_output = decoder_output.transpose(0, 1)
365
+ class_embed = decoder_output @ self.class_embed
366
+ outputs_class = self.lang_encoder.compute_similarity(class_embed)
367
+ mask_embed = self.mask_embed(decoder_output)
368
+ outputs_mask = torch.einsum("bqc,bchw->bqhw", mask_embed, mask_features)
369
+
370
+ outputs_bbox = [None for i in range(len(outputs_mask))]
371
+ if self.task_switch['bbox']:
372
+ outputs_bbox = self.bbox_embed(decoder_output)
373
+
374
+ # NOTE: prediction is of higher-resolution
375
+ # [B, Q, H, W] -> [B, Q, H*W] -> [B, h, Q, H*W] -> [B*h, Q, HW]
376
+ attn_mask = F.interpolate(outputs_mask, size=attn_mask_target_size, mode="bilinear", align_corners=False)
377
+
378
+ # must use bool type
379
+ # If a BoolTensor is provided, positions with ``True`` are not allowed to attend while ``False`` values will be unchanged.
380
+ attn_mask = (attn_mask.sigmoid().flatten(2).unsqueeze(1).repeat(1, self.num_heads, 1, 1).flatten(0, 1) < 0.5).bool()
381
+ attn_mask = attn_mask.detach()
382
+
383
+ outputs_caption = class_embed
384
+
385
+ results = {
386
+ "attn_mask": attn_mask,
387
+ "predictions_class": outputs_class,
388
+ "predictions_mask": outputs_mask,
389
+ "predictions_bbox": outputs_bbox,
390
+ "predictions_caption": outputs_caption,
391
+ "predictions_maskemb": mask_embed,
392
+ }
393
+ return results
394
+
395
+ @register_decoder
396
+ def get_seem_interface(cfg, in_channels, lang_encoder, mask_classification, extra):
397
+ return SEEMDecoder(cfg, in_channels, lang_encoder, mask_classification, extra)
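
A minimal sketch of how this decoder gets configured, assuming only the config keys read by from_config above; every value is a hypothetical placeholder, and the real ATTENTION_ARCH section needs more fields in practice than shown here.

# Hypothetical values for illustration only -- not the shipped configuration.
toy_cfg = {
    'MODEL': {
        'DIM_PROJ': 512,
        'TEXT': {'CONTEXT_LENGTH': 77},
        'ENCODER': {'MASK_DIM': 512},
        'DECODER': {
            'HIDDEN_DIM': 512,
            'NUM_OBJECT_QUERIES': 101,
            'NHEADS': 8,
            'DIM_FEEDFORWARD': 2048,
            'DEC_LAYERS': 10,            # from_config passes DEC_LAYERS - 1 to the decoder
            'PRE_NORM': False,
            'ENFORCE_INPUT_PROJ': False,
            'MAX_SPATIAL_LEN': [512, 512, 512],
        },
    },
    'ATTENTION_ARCH': {'SPATIAL_MEMORIES': 32},   # incomplete; the AttentionDataStruct reads further keys
}
# get_seem_interface(toy_cfg, in_channels, lang_encoder, mask_classification,
#                    extra={'task_switch': ...}) would then assemble the decoder.
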
modeling/interface/seem_v0.py ADDED
@@ -0,0 +1,392 @@
 
1
+ # --------------------------------------------------------
2
+ # SEEM -- Segment Everything Everywhere All at Once
3
+ # Licensed under The Apache License 2.0 [see LICENSE for details]
4
+ # Written by Xueyan Zou ([email protected])
5
+ # --------------------------------------------------------
6
+
7
+ import logging
8
+ from typing import Optional
9
+
10
+ import torch
11
+ from torch import nn, Tensor
12
+ from torch.nn import functional as F
13
+
14
+ from timm.models.layers import trunc_normal_
15
+ from detectron2.layers import Conv2d
16
+ import fvcore.nn.weight_init as weight_init
17
+
18
+ from .build import register_decoder
19
+ from .modules import SelfAttentionLayer, CrossAttentionLayer, FFNLayer, MLP
20
+ from .prototype.attention_data_struct_seemv0 import AttentionDataStruct
21
+ from ..utils import rand_sample_plain as rand_sample
22
+ from ..utils import prepare_features, configurable
23
+ from ..modules import PositionEmbeddingSine
24
+ from ..modules.point_features import point_sample
25
+
26
+
27
+ class SEEMDecoder(nn.Module):
28
+
29
+ @configurable
30
+ def __init__(
31
+ self,
32
+ lang_encoder: nn.Module,
33
+ in_channels,
34
+ mask_classification=True,
35
+ *,
36
+ hidden_dim: int,
37
+ dim_proj: int,
38
+ num_queries: int,
39
+ contxt_len: int,
40
+ nheads: int,
41
+ dim_feedforward: int,
42
+ dec_layers: int,
43
+ pre_norm: bool,
44
+ mask_dim: int,
45
+ task_switch: dict,
46
+ enforce_input_project: bool,
47
+ max_spatial_len: int,
48
+ attn_arch: dict,
49
+ ):
50
+ """
51
+ NOTE: this interface is experimental.
52
+ Args:
53
+ in_channels: channels of the input features
54
+ mask_classification: whether to add mask classifier or not
55
+ num_classes: number of classes
56
+ hidden_dim: Transformer feature dimension
57
+ num_queries: number of queries
58
+ nheads: number of heads
59
+ dim_feedforward: feature dimension in feedforward network
60
+ enc_layers: number of Transformer encoder layers
61
+ dec_layers: number of Transformer decoder layers
62
+ pre_norm: whether to use pre-LayerNorm or not
63
+ mask_dim: mask feature dimension
64
+ enforce_input_project: add an input projection 1x1 conv even if input
65
+ channels and hidden dim are identical
66
+ """
67
+ super().__init__()
68
+ assert mask_classification, "Only support mask classification model"
69
+ self.mask_classification = mask_classification
70
+
71
+ # positional encoding
72
+ N_steps = hidden_dim // 2
73
+ self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True)
74
+
75
+ # define Transformer decoder here
76
+ self.num_heads = nheads
77
+ self.num_layers = dec_layers
78
+ self.contxt_len = contxt_len
79
+ self.transformer_self_attention_layers = nn.ModuleList()
80
+ self.transformer_cross_attention_layers = nn.ModuleList()
81
+ self.transformer_ffn_layers = nn.ModuleList()
82
+
83
+ for _ in range(self.num_layers):
84
+ self.transformer_self_attention_layers.append(
85
+ SelfAttentionLayer(
86
+ d_model=hidden_dim,
87
+ nhead=nheads,
88
+ dropout=0.0,
89
+ normalize_before=pre_norm,
90
+ )
91
+ )
92
+
93
+ self.transformer_cross_attention_layers.append(
94
+ CrossAttentionLayer(
95
+ d_model=hidden_dim,
96
+ nhead=nheads,
97
+ dropout=0.0,
98
+ normalize_before=pre_norm,
99
+ )
100
+ )
101
+
102
+ self.transformer_ffn_layers.append(
103
+ FFNLayer(
104
+ d_model=hidden_dim,
105
+ dim_feedforward=dim_feedforward,
106
+ dropout=0.0,
107
+ normalize_before=pre_norm,
108
+ )
109
+ )
110
+
111
+ self.decoder_norm = nn.LayerNorm(hidden_dim)
112
+
113
+ self.num_queries = num_queries
114
+ # learnable query features
115
+ self.query_feat = nn.Embedding(num_queries, hidden_dim)
116
+ # learnable query p.e.
117
+ self.query_embed = nn.Embedding(num_queries, hidden_dim)
118
+
119
+ # level embedding (we always use 3 scales)
120
+ self.num_feature_levels = 3
121
+ self.level_embed = nn.Embedding(self.num_feature_levels, hidden_dim)
122
+ self.input_proj = nn.ModuleList()
123
+
124
+ for _ in range(self.num_feature_levels):
125
+ if in_channels != hidden_dim or enforce_input_project:
126
+ self.input_proj.append(Conv2d(in_channels, hidden_dim, kernel_size=1))
127
+ weight_init.c2_xavier_fill(self.input_proj[-1])
128
+ else:
129
+ self.input_proj.append(nn.Sequential())
130
+
131
+ self.task_switch = task_switch
132
+ self.query_index = {}
133
+
134
+ # output FFNs
135
+ self.lang_encoder = lang_encoder
136
+ self.mask_embed = MLP(hidden_dim, hidden_dim, mask_dim, 3)
137
+ self.class_embed = nn.Parameter(torch.empty(hidden_dim, dim_proj))
138
+ trunc_normal_(self.class_embed, std=.02)
139
+
140
+ if task_switch['bbox']:
141
+ self.bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3)
142
+
143
+ if task_switch['spatial']:
144
+ # spatial query
145
+ self.mask_sptial_embed = nn.ParameterList([nn.Parameter(torch.empty(hidden_dim, hidden_dim)) for x in range(3)])
146
+ trunc_normal_(self.mask_sptial_embed[0], std=.02)
147
+ trunc_normal_(self.mask_sptial_embed[1], std=.02)
148
+ trunc_normal_(self.mask_sptial_embed[2], std=.02)
149
+
150
+ self.max_spatial_len = max_spatial_len
151
+ # spatial memory
152
+ num_spatial_memories = attn_arch['SPATIAL_MEMORIES']
153
+ self.spatial_embed = nn.Embedding(num_spatial_memories, hidden_dim)
154
+ self.spatial_featured = nn.Embedding(num_spatial_memories, hidden_dim)
155
+
156
+ # learnable positive negative indicator
157
+ self.pn_indicator = nn.Embedding(2, hidden_dim)
158
+
159
+ # build AttentionDataStruct
160
+ attn_arch['NUM_LAYERS'] = self.num_layers
161
+ self.attention_data = AttentionDataStruct(attn_arch, task_switch)
162
+
163
+ @classmethod
164
+ def from_config(cls, cfg, in_channels, lang_encoder, mask_classification, extra):
165
+ ret = {}
166
+
167
+ ret["lang_encoder"] = lang_encoder
168
+ ret["in_channels"] = in_channels
169
+ ret["mask_classification"] = mask_classification
170
+
171
+ enc_cfg = cfg['MODEL']['ENCODER']
172
+ dec_cfg = cfg['MODEL']['DECODER']
173
+
174
+ ret["hidden_dim"] = dec_cfg['HIDDEN_DIM']
175
+ ret["dim_proj"] = cfg['MODEL']['DIM_PROJ']
176
+ ret["num_queries"] = dec_cfg['NUM_OBJECT_QUERIES']
177
+ ret["contxt_len"] = cfg['MODEL']['TEXT']['CONTEXT_LENGTH']
178
+
179
+ # Transformer parameters:
180
+ ret["nheads"] = dec_cfg['NHEADS']
181
+ ret["dim_feedforward"] = dec_cfg['DIM_FEEDFORWARD']
182
+
183
+ # NOTE: because the learnable query features require supervision,
184
+ # we subtract 1 from the configured decoder layers to stay consistent with our loss
185
+ # implementation: the number of auxiliary losses always equals the number of
186
+ # decoder layers. With learnable query features, the number of auxiliary
187
+ # losses equals the number of decoder layers plus 1.
188
+ assert dec_cfg['DEC_LAYERS'] >= 1
189
+ ret["dec_layers"] = dec_cfg['DEC_LAYERS'] - 1
190
+ ret["pre_norm"] = dec_cfg['PRE_NORM']
191
+ ret["enforce_input_project"] = dec_cfg['ENFORCE_INPUT_PROJ']
192
+ ret["mask_dim"] = enc_cfg['MASK_DIM']
193
+ ret["task_switch"] = extra['task_switch']
194
+ ret["max_spatial_len"] = dec_cfg['MAX_SPATIAL_LEN']
195
+
196
+ # attn data struct
197
+ ret["attn_arch"] = cfg['ATTENTION_ARCH']
198
+
199
+ return ret
200
+
201
+ def forward(self, x, mask_features, mask=None, target_queries=None, target_vlp=None, task='seg', extra={}):
202
+ # x is a list of multi-scale feature
203
+ assert len(x) == self.num_feature_levels; del mask
204
+ spatial_extra_flag = 'spatial_query_pos_mask' in extra.keys() or task == 'refimg' or 'refimg_tokens' in extra
205
+ grounding_extra_flag = 'grounding_tokens' in extra.keys()
206
+ spatial_memory_flag = 'prev_mask' in extra.keys()
207
+ flags = {"spatial": spatial_extra_flag, "grounding": grounding_extra_flag, "memories_spatial": spatial_memory_flag}
208
+ self.attention_data.reset(flags, task, extra)
209
+
210
+ src, pos, size_list = prepare_features(x, self.num_feature_levels, self.pe_layer, self.input_proj, self.level_embed)
211
+ _, bs, _ = src[0].shape
212
+
213
+ # QxNxC
214
+ query_embed = self.query_embed.weight.unsqueeze(1).repeat(1, bs, 1)
215
+ output = self.query_feat.weight.unsqueeze(1).repeat(1, bs, 1)
216
+ self.attention_data.set('queries_object', 'queries', output, query_embed)
217
+
218
+ if self.task_switch['spatial'] and spatial_extra_flag:
219
+ if 'refimg_tokens' not in extra:
220
+ # get divisor
221
+ _,h,w = extra['spatial_query_pos_mask'][0].shape
222
+ divisor = torch.tensor([h,w], device=output.device)[None,]
223
+
224
+ # Get mean pos spatial query
225
+ non_zero_pos_point = [rand_sample((m.nonzero()[:,1:]/divisor).t(), self.max_spatial_len[-1]).t() for m in extra['spatial_query_pos_mask']]
226
+ non_zero_pos_point = nn.utils.rnn.pad_sequence(non_zero_pos_point, padding_value=-1).permute(1,0,2)
227
+ non_zero_pos_mask = (non_zero_pos_point.sum(dim=-1) < 0)
228
+ spatial_query_pos = point_sample(mask_features, non_zero_pos_point.flip(dims=(2,)).type(mask_features.dtype), align_corners=True)
229
+ spatial_query_pos = torch.stack([x[m].mean(dim=0, keepdim=True) for x, m in zip(spatial_query_pos.transpose(1,2), ~non_zero_pos_mask)]).transpose(0,1).nan_to_num()
230
+
231
+ # Get mean neg spatial query
232
+ non_zero_neg_point = [rand_sample((m.nonzero()[:,1:]/divisor).t(), self.max_spatial_len[-1]).t() for m in extra['spatial_query_neg_mask']]
233
+ non_zero_neg_point = nn.utils.rnn.pad_sequence(non_zero_neg_point, padding_value=-1).permute(1,0,2)
234
+ non_zero_neg_mask = (non_zero_neg_point.sum(dim=-1) < 0)
235
+ spatial_query_neg = point_sample(mask_features, non_zero_neg_point.flip(dims=(2,)).type(mask_features.dtype), align_corners=True)
236
+ spatial_query_neg = torch.stack([x[m].mean(dim=0, keepdim=True) for x, m in zip(spatial_query_neg.transpose(1,2), ~non_zero_neg_mask)]).transpose(0,1).nan_to_num()
237
+
238
+ # merge positive and negative sample points for self attention
239
+ # pos_neg_points = [x|y for x,y in zip(extra['spatial_query_pos_mask'], extra['spatial_query_neg_mask'])]
240
+
241
+ # Get layerwise spatial query
242
+ src_spatial_queries = []
243
+ src_spatial_maskings = []
244
+ for i in range(len(src)):
245
+ hw,_,dc = src[i].shape
246
+ src_mask_features = src[i].view(size_list[i][0],size_list[i][1],bs,dc)
247
+ src_mask_features = src_mask_features @ self.mask_sptial_embed[i]
248
+
249
+ non_zero_query_point_pos = [rand_sample((m.nonzero()[:,1:]/divisor).t(), self.max_spatial_len[i]).t() for m in extra['spatial_query_pos_mask']]
250
+ non_zero_query_point_neg = [rand_sample((m.nonzero()[:,1:]/divisor).t(), self.max_spatial_len[i]).t() for m in extra['spatial_query_neg_mask']]
251
+ non_zero_query_point = [torch.cat([x,y], dim=0) for x,y in zip(non_zero_query_point_pos, non_zero_query_point_neg)]
252
+
253
+ pos_neg_indicator = [torch.cat([torch.ones(x.shape[0], device=x.device), -torch.ones(y.shape[0], device=y.device)]) for x,y in zip(non_zero_query_point_pos, non_zero_query_point_neg)]
254
+ pos_neg_indicator = nn.utils.rnn.pad_sequence(pos_neg_indicator, padding_value=0)
255
+
256
+ non_zero_query_point = nn.utils.rnn.pad_sequence(non_zero_query_point, padding_value=-1).permute(1,0,2)
257
+ non_zero_query_mask = (non_zero_query_point.sum(dim=-1) < 0)
258
+ non_zero_query_point[non_zero_query_mask] = 0
259
+
260
+ spatial_tokens = point_sample(src_mask_features.permute(2,3,0,1), non_zero_query_point.flip(dims=(2,)).type(src_mask_features.dtype), align_corners=True).permute(2,0,1)
261
+ spatial_tokens[pos_neg_indicator==1] += self.pn_indicator.weight[0:1]
262
+ spatial_tokens[pos_neg_indicator==-1] += self.pn_indicator.weight[1:2]
263
+
264
+ src_spatial_queries += [spatial_tokens]
265
+ src_spatial_maskings += [non_zero_query_mask]
266
+
267
+ if 'refimg' in task:
268
+ output_refimg = {}
269
+ output_refimg['spatial_query_pos'] = spatial_query_pos
270
+ output_refimg['spatial_query_neg'] = spatial_query_neg
271
+ output_refimg['src_spatial_queries'] = src_spatial_queries
272
+ output_refimg['src_spatial_maskings'] = src_spatial_maskings
273
+ return output_refimg
274
+ else:
275
+ spatial_query_pos = extra['refimg_tokens']['spatial_query_pos']
276
+ spatial_query_neg = extra['refimg_tokens']['spatial_query_neg']
277
+ src_spatial_queries = extra['refimg_tokens']['src_spatial_queries']
278
+ src_spatial_maskings = extra['refimg_tokens']['src_spatial_maskings']
279
+
280
+ # Get object query for spatial index
281
+ self.attention_data.set('queries_spatial', 'queries')
282
+
283
+ # set spatial memory
284
+ spatial_output = self.spatial_featured.weight.unsqueeze(1).repeat(1, bs, 1)
285
+ spatial_embed = self.spatial_embed.weight.unsqueeze(1).repeat(1, bs, 1)
286
+ self.attention_data.set('memories_spatial', 'memories', spatial_output, spatial_embed)
287
+
288
+ # if 'queries_spatial' in extra:
289
+ # self.attention_data.set('queries_spatial', 'queries', var=extra['queries_spatial'])
290
+
291
+ # if spatial_memory_flag:
292
+ # prev_mask = (extra['prev_mask'].sigmoid() > 0.5).detach()
293
+ # non_zero_query_point = [rand_sample((m.nonzero()[:,1:]/divisor).t(), self.max_spatial_len[-1]).t() for m in prev_mask]
294
+ # non_zero_query_point = nn.utils.rnn.pad_sequence(non_zero_query_point, padding_value=-1).permute(1,0,2)
295
+ # non_zero_query_mask = (non_zero_query_point.sum(dim=-1) < 0)
296
+ # spatial_memory = point_sample(mask_features, non_zero_query_point.flip(dims=(2,)).type(mask_features.dtype), align_corners=True)
297
+ # spatial_memory = torch.stack([x[m].mean(dim=0, keepdim=True) for x, m in zip(spatial_memory.transpose(1,2), ~non_zero_query_mask)]).transpose(0,1).nan_to_num()
298
+
299
+ if self.task_switch['grounding'] and grounding_extra_flag:
300
+ # Get grounding tokens
301
+ grounding_tokens = extra['grounding_tokens']
302
+ _grounding_tokens = grounding_tokens.detach().clone()
303
+
304
+ self.attention_data.set('tokens_grounding', 'tokens', grounding_tokens, _grounding_tokens)
305
+ self.attention_data.set('queries_grounding', 'queries')
306
+ self.attention_data.set_maskings('tokens_grounding', extra['grounding_nonzero_mask'])
307
+
308
+ output, query_embed = self.attention_data.cross_attn_variables()
309
+ # prediction heads on learnable query features
310
+ results = self.forward_prediction_heads(output, mask_features, attn_mask_target_size=size_list[0])
311
+ results["predictions_pos_spatial"] = spatial_query_pos.transpose(0,1) if spatial_extra_flag else None
312
+ results["predictions_neg_spatial"] = spatial_query_neg.transpose(0,1) if spatial_extra_flag else None
313
+ self.attention_data.set_results(results)
314
+
315
+ for i in range(self.num_layers):
316
+ level_index = i % self.num_feature_levels
317
+ # CROSS ATTENTION
318
+ output, avg_attn = self.transformer_cross_attention_layers[i](
319
+ output, src[level_index],
320
+ memory_mask=self.attention_data.cross_attn_mask(size_list[level_index], self.num_heads),
321
+ memory_key_padding_mask=None, # here we do not apply masking on padded region
322
+ pos=pos[level_index], query_pos=query_embed
323
+ )
324
+ self.attention_data.update_variables(output, 'cross_attn')
325
+
326
+ # SELF ATTENTION
327
+ self_attn_mask = torch.zeros((bs, self.num_queries, self.num_queries), device=query_embed.device).bool() # Default False (attend oq)
328
+ if self.task_switch['spatial'] and spatial_extra_flag:
329
+ # get spatial tokens
330
+ spatial_tokens = src_spatial_queries[level_index]
331
+ _spatial_tokens = spatial_tokens.detach().clone()
332
+
333
+ self.attention_data.set('tokens_spatial', 'tokens', spatial_tokens, _spatial_tokens)
334
+ self.attention_data.set_maskings('tokens_spatial', src_spatial_maskings[level_index])
335
+
336
+ output, query_embed, self_attn_mask = self.attention_data.self_attn(bs, self.num_heads)
337
+
338
+ output = self.transformer_self_attention_layers[i](
339
+ output, tgt_mask=self_attn_mask,
340
+ tgt_key_padding_mask=None,
341
+ query_pos=query_embed)
342
+
343
+ # FFN
344
+ output = self.transformer_ffn_layers[i](
345
+ output
346
+ )
347
+
348
+ self.attention_data.update_variables(output, 'self_attn')
349
+ output, query_embed = self.attention_data.cross_attn_variables()
350
+ results = self.forward_prediction_heads(output, mask_features, attn_mask_target_size=size_list[(i + 1) % self.num_feature_levels], layer_id=i)
351
+ results["predictions_pos_spatial"] = spatial_query_pos.transpose(0,1) if spatial_extra_flag else None
352
+ results["predictions_neg_spatial"] = spatial_query_neg.transpose(0,1) if spatial_extra_flag else None
353
+ self.attention_data.set_results(results)
354
+
355
+ return self.attention_data.organize_output()
356
+
357
+ def forward_prediction_heads(self, output, mask_features, attn_mask_target_size, layer_id=-1):
358
+ decoder_output = self.decoder_norm(output)
359
+ decoder_output = decoder_output.transpose(0, 1)
360
+ class_embed = decoder_output @ self.class_embed
361
+ outputs_class = self.lang_encoder.compute_similarity(class_embed)
362
+ mask_embed = self.mask_embed(decoder_output)
363
+ outputs_mask = torch.einsum("bqc,bchw->bqhw", mask_embed, mask_features)
364
+
365
+ outputs_bbox = [None for i in range(len(outputs_mask))]
366
+ if self.task_switch['bbox']:
367
+ outputs_bbox = self.bbox_embed(decoder_output)
368
+
369
+ # NOTE: prediction is of higher-resolution
370
+ # [B, Q, H, W] -> [B, Q, H*W] -> [B, h, Q, H*W] -> [B*h, Q, HW]
371
+ attn_mask = F.interpolate(outputs_mask, size=attn_mask_target_size, mode="bilinear", align_corners=False)
372
+
373
+ # must use bool type
374
+ # If a BoolTensor is provided, positions with ``True`` are not allowed to attend while ``False`` values will be unchanged.
375
+ attn_mask = (attn_mask.sigmoid().flatten(2).unsqueeze(1).repeat(1, self.num_heads, 1, 1).flatten(0, 1) < 0.5).bool()
376
+ attn_mask = attn_mask.detach()
377
+
378
+ outputs_caption = class_embed
379
+
380
+ results = {
381
+ "attn_mask": attn_mask,
382
+ "predictions_class": outputs_class,
383
+ "predictions_mask": outputs_mask,
384
+ "predictions_bbox": outputs_bbox,
385
+ "predictions_caption": outputs_caption,
386
+ "predictions_maskemb": mask_embed,
387
+ }
388
+ return results
389
+
390
+ @register_decoder
391
+ def get_seem_interface(cfg, in_channels, lang_encoder, mask_classification, extra):
392
+ return SEEMDecoder(cfg, in_channels, lang_encoder, mask_classification, extra)
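
A standalone sketch, with made-up shapes rather than repository code, of the attn_mask bookkeeping in forward_prediction_heads above: the predicted masks are resized to the next feature level, passed through a sigmoid, thresholded at 0.5, and repeated across heads so that True marks positions a query may not attend to.

import torch
import torch.nn.functional as F

# Toy shapes only; the point is the tensor bookkeeping, not the values.
B, Q, H, W, num_heads = 2, 5, 32, 32, 8
outputs_mask = torch.randn(B, Q, H, W)       # per-query mask logits
attn_mask_target_size = (16, 16)             # resolution of the next feature level

attn_mask = F.interpolate(outputs_mask, size=attn_mask_target_size,
                          mode="bilinear", align_corners=False)
# [B, Q, h, w] -> [B, Q, h*w] -> [B, heads, Q, h*w] -> [B*heads, Q, h*w];
# True marks positions a query is NOT allowed to attend to in cross-attention.
attn_mask = (attn_mask.sigmoid().flatten(2)
             .unsqueeze(1).repeat(1, num_heads, 1, 1)
             .flatten(0, 1) < 0.5).detach()
print(attn_mask.shape)  # torch.Size([16, 5, 256])
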
modeling/interface/seem_v1.py ADDED
@@ -0,0 +1,389 @@
 
1
+ # --------------------------------------------------------
2
+ # SEEM -- Segment Everything Everywhere All at Once
3
+ # Licensed under The Apache License 2.0 [see LICENSE for details]
4
+ # Written by Xueyan Zou ([email protected])
5
+ # --------------------------------------------------------
6
+
7
+ import logging
8
+ from typing import Optional
9
+
10
+ import torch
11
+ from torch import nn, Tensor
12
+ from torch.nn import functional as F
13
+
14
+ from timm.models.layers import trunc_normal_
15
+ from detectron2.layers import Conv2d
16
+ import fvcore.nn.weight_init as weight_init
17
+
18
+ from .build import register_decoder
19
+ from .modules import SelfAttentionLayer, CrossAttentionLayer, FFNLayer, MLP
20
+ from .prototype.attention_data_struct_seemv1 import AttentionDataStruct
21
+ from ..utils import rand_sample, prepare_features, configurable
22
+ from ..modules import PositionEmbeddingSine
23
+ from ..modules.point_features import point_sample
24
+
25
+
26
+ class SEEMDecoder(nn.Module):
27
+
28
+ @configurable
29
+ def __init__(
30
+ self,
31
+ lang_encoder: nn.Module,
32
+ in_channels,
33
+ mask_classification=True,
34
+ *,
35
+ hidden_dim: int,
36
+ dim_proj: int,
37
+ num_queries: int,
38
+ contxt_len: int,
39
+ nheads: int,
40
+ dim_feedforward: int,
41
+ dec_layers: int,
42
+ pre_norm: bool,
43
+ mask_dim: int,
44
+ task_switch: dict,
45
+ enforce_input_project: bool,
46
+ max_spatial_len: int,
47
+ attn_arch: dict,
48
+ ):
49
+ """
50
+ NOTE: this interface is experimental.
51
+ Args:
52
+ in_channels: channels of the input features
53
+ mask_classification: whether to add mask classifier or not
54
+ num_classes: number of classes
55
+ hidden_dim: Transformer feature dimension
56
+ num_queries: number of queries
57
+ nheads: number of heads
58
+ dim_feedforward: feature dimension in feedforward network
59
+ enc_layers: number of Transformer encoder layers
60
+ dec_layers: number of Transformer decoder layers
61
+ pre_norm: whether to use pre-LayerNorm or not
62
+ mask_dim: mask feature dimension
63
+ enforce_input_project: add an input projection 1x1 conv even if input
64
+ channels and hidden dim are identical
65
+ """
66
+ super().__init__()
67
+ assert mask_classification, "Only support mask classification model"
68
+ self.mask_classification = mask_classification
69
+
70
+ # positional encoding
71
+ N_steps = hidden_dim // 2
72
+ self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True)
73
+
74
+ # define Transformer decoder here
75
+ self.num_heads = nheads
76
+ self.num_layers = dec_layers
77
+ self.contxt_len = contxt_len
78
+ self.transformer_self_attention_layers = nn.ModuleList()
79
+ self.transformer_cross_attention_layers = nn.ModuleList()
80
+ self.transformer_ffn_layers = nn.ModuleList()
81
+
82
+ for _ in range(self.num_layers):
83
+ self.transformer_self_attention_layers.append(
84
+ SelfAttentionLayer(
85
+ d_model=hidden_dim,
86
+ nhead=nheads,
87
+ dropout=0.0,
88
+ normalize_before=pre_norm,
89
+ )
90
+ )
91
+
92
+ self.transformer_cross_attention_layers.append(
93
+ CrossAttentionLayer(
94
+ d_model=hidden_dim,
95
+ nhead=nheads,
96
+ dropout=0.0,
97
+ normalize_before=pre_norm,
98
+ )
99
+ )
100
+
101
+ self.transformer_ffn_layers.append(
102
+ FFNLayer(
103
+ d_model=hidden_dim,
104
+ dim_feedforward=dim_feedforward,
105
+ dropout=0.0,
106
+ normalize_before=pre_norm,
107
+ )
108
+ )
109
+
110
+ self.decoder_norm = nn.LayerNorm(hidden_dim)
111
+
112
+ self.num_queries = num_queries
113
+ # learnable query features
114
+ self.query_feat = nn.Embedding(num_queries, hidden_dim)
115
+ # learnable query p.e.
116
+ self.query_embed = nn.Embedding(num_queries, hidden_dim)
117
+
118
+ # level embedding (we always use 3 scales)
119
+ self.num_feature_levels = 3
120
+ self.level_embed = nn.Embedding(self.num_feature_levels, hidden_dim)
121
+ self.input_proj = nn.ModuleList()
122
+
123
+ for _ in range(self.num_feature_levels):
124
+ if in_channels != hidden_dim or enforce_input_project:
125
+ self.input_proj.append(Conv2d(in_channels, hidden_dim, kernel_size=1))
126
+ weight_init.c2_xavier_fill(self.input_proj[-1])
127
+ else:
128
+ self.input_proj.append(nn.Sequential())
129
+
130
+ self.task_switch = task_switch
131
+ self.query_index = {}
132
+
133
+ # output FFNs
134
+ self.lang_encoder = lang_encoder
135
+ self.mask_embed = MLP(hidden_dim, hidden_dim, mask_dim, 3)
136
+ self.class_embed = nn.Parameter(torch.empty(hidden_dim, dim_proj))
137
+ trunc_normal_(self.class_embed, std=.02)
138
+
139
+ if task_switch['bbox']:
140
+ self.bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3)
141
+
142
+ if task_switch['spatial']:
143
+ # spatial query
144
+ self.mask_sptial_embed = nn.ParameterList([nn.Parameter(torch.empty(hidden_dim, hidden_dim)) for x in range(3)])
145
+ trunc_normal_(self.mask_sptial_embed[0], std=.02)
146
+ trunc_normal_(self.mask_sptial_embed[1], std=.02)
147
+ trunc_normal_(self.mask_sptial_embed[2], std=.02)
148
+
149
+ self.max_spatial_len = max_spatial_len
150
+ # spatial memory
151
+ num_spatial_memories = attn_arch['SPATIAL_MEMORIES']
152
+ self.spatial_embed = nn.Embedding(num_spatial_memories, hidden_dim)
153
+ self.spatial_featured = nn.Embedding(num_spatial_memories, hidden_dim)
154
+
155
+ # learnable positive negative indicator
156
+ self.pn_indicator = nn.Embedding(2, hidden_dim)
157
+
158
+ # build AttentionDataStruct
159
+ attn_arch['NUM_LAYERS'] = self.num_layers
160
+ self.attention_data = AttentionDataStruct(attn_arch, task_switch)
161
+ self.sample_size = attn_arch['QUERY_NUMBER']
162
+
163
+ @classmethod
164
+ def from_config(cls, cfg, in_channels, lang_encoder, mask_classification, extra):
165
+ ret = {}
166
+
167
+ ret["lang_encoder"] = lang_encoder
168
+ ret["in_channels"] = in_channels
169
+ ret["mask_classification"] = mask_classification
170
+
171
+ enc_cfg = cfg['MODEL']['ENCODER']
172
+ dec_cfg = cfg['MODEL']['DECODER']
173
+
174
+ ret["hidden_dim"] = dec_cfg['HIDDEN_DIM']
175
+ ret["dim_proj"] = cfg['MODEL']['DIM_PROJ']
176
+ ret["num_queries"] = dec_cfg['NUM_OBJECT_QUERIES']
177
+ ret["contxt_len"] = cfg['MODEL']['TEXT']['CONTEXT_LENGTH']
178
+
179
+ # Transformer parameters:
180
+ ret["nheads"] = dec_cfg['NHEADS']
181
+ ret["dim_feedforward"] = dec_cfg['DIM_FEEDFORWARD']
182
+
183
+ # NOTE: because the learnable query features require supervision,
184
+ # we subtract 1 from the configured decoder layers to stay consistent with our loss
185
+ # implementation: the number of auxiliary losses always equals the number of
186
+ # decoder layers. With learnable query features, the number of auxiliary
187
+ # losses equals the number of decoder layers plus 1.
188
+ assert dec_cfg['DEC_LAYERS'] >= 1
189
+ ret["dec_layers"] = dec_cfg['DEC_LAYERS'] - 1
190
+ ret["pre_norm"] = dec_cfg['PRE_NORM']
191
+ ret["enforce_input_project"] = dec_cfg['ENFORCE_INPUT_PROJ']
192
+ ret["mask_dim"] = enc_cfg['MASK_DIM']
193
+ ret["task_switch"] = extra['task_switch']
194
+ ret["max_spatial_len"] = dec_cfg['MAX_SPATIAL_LEN']
195
+
196
+ # attn data struct
197
+ ret["attn_arch"] = cfg['ATTENTION_ARCH']
198
+
199
+ return ret
200
+
201
+ def forward(self, x, mask_features, mask=None, target_queries=None, target_vlp=None, task='seg', extra={}):
202
+ # x is a list of multi-scale feature
203
+ assert len(x) == self.num_feature_levels; del mask
204
+ spatial_extra_flag = 'spatial_query_pos_mask' in extra.keys() or task == 'refimg' or 'refimg_tokens' in extra
205
+ grounding_extra_flag = 'grounding_tokens' in extra.keys()
206
+ spatial_memory_flag = 'prev_mask' in extra.keys()
207
+ flags = {"spatial": spatial_extra_flag, "grounding": grounding_extra_flag, "memories_spatial": spatial_memory_flag}
208
+ self.attention_data.reset(flags, task, extra)
209
+
210
+ src, pos, size_list = prepare_features(x, self.num_feature_levels, self.pe_layer, self.input_proj, self.level_embed)
211
+ _,bs,_ = src[0].shape
212
+
213
+ # QxNxC
214
+ query_embed = self.query_embed.weight.unsqueeze(1).repeat(1, bs, 1)
215
+ output = self.query_feat.weight.unsqueeze(1).repeat(1, bs, 1)
216
+ self.attention_data.set('queries_object', 'queries', output, query_embed)
217
+
218
+ if self.task_switch['spatial'] and spatial_extra_flag:
219
+ if 'refimg_tokens' not in extra:
220
+ # get divisor
221
+ c,h,w = extra['spatial_query_pos_mask'][0].shape
222
+ divisor = torch.tensor([1,h,w], device=output.device)[None,]
223
+
224
+ # Get mean pos spatial query
225
+ non_zero_pos_point = [rand_sample(m, divisor, self.max_spatial_len[-1]).t() for m in extra['spatial_query_pos_mask']]
226
+ non_zero_pos_index = [m[:,0:1].long() for m in non_zero_pos_point]
227
+ non_zero_pos_point = nn.utils.rnn.pad_sequence(non_zero_pos_point, padding_value=-1).permute(1,0,2)
228
+ non_zero_pos_index = nn.utils.rnn.pad_sequence(non_zero_pos_index, padding_value=-1).permute(1,0,2)[:,:,0]
229
+ non_zero_pos_mask = (non_zero_pos_point.sum(dim=-1) < 0)
230
+ spatial_query_pos = point_sample(mask_features, non_zero_pos_point[:,:,1:].flip(dims=(2,)).type(mask_features.dtype), align_corners=True)
231
+ num_mask_per_batch = [len(m) for m in extra['spatial_query_pos_mask']]
232
+ spatial_query_pos = nn.utils.rnn.pad_sequence([torch.stack([x[ns==n].mean(dim=0, keepdim=False) if (ns==n).sum() > 0 else -torch.ones((x.shape[1]), device=spatial_query_pos.device) for n in range(mb)]) for x, m, ns, mb in zip(spatial_query_pos.transpose(1,2), ~non_zero_pos_mask, non_zero_pos_index, num_mask_per_batch)], padding_value=-1).nan_to_num()
233
+
234
+ # Get mean neg spatial query
235
+ non_zero_neg_point = [rand_sample(m, divisor, self.max_spatial_len[-1]).t() for m in extra['spatial_query_neg_mask']]
236
+ non_zero_neg_index = [m[:,0:1].long() for m in non_zero_neg_point]
237
+ non_zero_neg_point = nn.utils.rnn.pad_sequence(non_zero_neg_point, padding_value=-1).permute(1,0,2)
238
+ non_zero_neg_index = nn.utils.rnn.pad_sequence(non_zero_neg_index, padding_value=-1).permute(1,0,2)[:,:,0]
239
+ non_zero_neg_mask = (non_zero_neg_point.sum(dim=-1) < 0)
240
+ spatial_query_neg = point_sample(mask_features, non_zero_neg_point[:,:,1:].flip(dims=(2,)).type(mask_features.dtype), align_corners=True)
241
+ num_mask_per_batch = [len(m) for m in extra['spatial_query_neg_mask']]
242
+ spatial_query_neg = nn.utils.rnn.pad_sequence([torch.stack([x[ns==n].mean(dim=0, keepdim=False) if (ns==n).sum() > 0 else -torch.ones((x.shape[1]), device=spatial_query_neg.device) for n in range(mb)]) for x, m, ns, mb in zip(spatial_query_neg.transpose(1,2), ~non_zero_neg_mask, non_zero_neg_index, num_mask_per_batch)], padding_value=-1).nan_to_num()
243
+ # Get layerwise spatial query
244
+ src_spatial_queries = []
245
+ src_spatial_maskings = []
246
+ src_spatial_indices = []
247
+ for i in range(len(src)):
248
+ hw,_,dc = src[i].shape
249
+ src_mask_features = src[i].view(size_list[i][0],size_list[i][1],bs,dc)
250
+ src_mask_features = src_mask_features @ self.mask_sptial_embed[i]
251
+
252
+ non_zero_query_point_pos = [rand_sample(m, divisor, self.max_spatial_len[i]).t() for m in extra['spatial_query_pos_mask']]
253
+ non_zero_query_point_neg = [rand_sample(m, divisor, self.max_spatial_len[i]).t() for m in extra['spatial_query_neg_mask']]
254
+ non_zero_query_point = [torch.cat([x[:,1:],y[:,1:]], dim=0) for x,y in zip(non_zero_query_point_pos, non_zero_query_point_neg)]
255
+ non_zero_query_index = [torch.cat([x[:,0:1],y[:,0:1]], dim=0) for x,y in zip(non_zero_query_point_pos, non_zero_query_point_neg)]
256
+
257
+ pos_neg_indicator = [torch.cat([torch.ones(x.shape[0], device=x.device), -torch.ones(y.shape[0], device=y.device)]) for x,y in zip(non_zero_query_point_pos, non_zero_query_point_neg)]
258
+ pos_neg_indicator = nn.utils.rnn.pad_sequence(pos_neg_indicator, padding_value=0)
259
+
260
+ non_zero_query_point = nn.utils.rnn.pad_sequence(non_zero_query_point, padding_value=-1).permute(1,0,2)
261
+ non_zero_query_index = nn.utils.rnn.pad_sequence(non_zero_query_index, padding_value=-1).permute(1,0,2)
262
+ non_zero_query_mask = (non_zero_query_point.sum(dim=-1) < 0)
263
+ non_zero_query_point[non_zero_query_mask] = 0
264
+
265
+ spatial_tokens = point_sample(src_mask_features.permute(2,3,0,1), non_zero_query_point.flip(dims=(2,)).type(src_mask_features.dtype), align_corners=True).permute(2,0,1)
266
+ spatial_tokens[pos_neg_indicator==1] += self.pn_indicator.weight[0:1]
267
+ spatial_tokens[pos_neg_indicator==-1] += self.pn_indicator.weight[1:2]
268
+
269
+ src_spatial_queries += [spatial_tokens]
270
+ src_spatial_maskings += [non_zero_query_mask]
271
+ src_spatial_indices += [non_zero_query_index]
272
+
273
+ if 'refimg' in task:
274
+ output_refimg = {}
275
+ output_refimg['spatial_query_pos'] = spatial_query_pos
276
+ output_refimg['spatial_query_neg'] = spatial_query_neg
277
+ output_refimg['src_spatial_queries'] = src_spatial_queries
278
+ output_refimg['src_spatial_maskings'] = src_spatial_maskings
279
+ return output_refimg
280
+ else:
281
+ spatial_query_pos = extra['refimg_tokens']['spatial_query_pos']
282
+ spatial_query_neg = extra['refimg_tokens']['spatial_query_neg']
283
+ src_spatial_queries = extra['refimg_tokens']['src_spatial_queries']
284
+ src_spatial_maskings = extra['refimg_tokens']['src_spatial_maskings']
285
+
286
+ # Get object query for spatial index
287
+ self.attention_data.set_extra({"spatial_query_number": len(spatial_query_pos), "sample_size": self.sample_size})
288
+ self.attention_data.set('queries_spatial', 'queries', sample_size=self.sample_size*len(spatial_query_pos))
289
+
290
+ # set spatial memory
291
+ spatial_output = self.spatial_featured.weight.unsqueeze(1).repeat(1, bs, 1)
292
+ spatial_embed = self.spatial_embed.weight.unsqueeze(1).repeat(1, bs, 1)
293
+ self.attention_data.set('memories_spatial', 'memories', spatial_output, spatial_embed)
294
+
295
+ if self.task_switch['grounding'] and grounding_extra_flag:
296
+ # Get grounding tokens
297
+ grounding_tokens = extra['grounding_tokens']
298
+ _grounding_tokens = grounding_tokens.detach().clone()
299
+
300
+ self.attention_data.set('tokens_grounding', 'tokens', grounding_tokens, _grounding_tokens)
301
+ self.attention_data.set('queries_grounding', 'queries')
302
+ self.attention_data.set_maskings('tokens_grounding', extra['grounding_nonzero_mask'])
303
+
304
+ output, query_embed = self.attention_data.cross_attn_variables()
305
+ # prediction heads on learnable query features
306
+ results = self.forward_prediction_heads(output, mask_features, attn_mask_target_size=size_list[0])
307
+ results["predictions_pos_spatial"] = spatial_query_pos.transpose(0,1) if spatial_extra_flag else None
308
+ results["predictions_neg_spatial"] = spatial_query_neg.transpose(0,1) if spatial_extra_flag else None
309
+ self.attention_data.set_results(results)
310
+
311
+ for i in range(self.num_layers):
312
+ level_index = i % self.num_feature_levels
313
+ # CROSS ATTENTION
314
+ output, avg_attn = self.transformer_cross_attention_layers[i](
315
+ output, src[level_index],
316
+ memory_mask=self.attention_data.cross_attn_mask(size_list[level_index], self.num_heads),
317
+ memory_key_padding_mask=None, # here we do not apply masking on padded region
318
+ pos=pos[level_index], query_pos=query_embed
319
+ )
320
+ self.attention_data.update_variables(output, 'cross_attn')
321
+
322
+ # SELF ATTENTION
323
+ self_attn_mask = torch.zeros((bs, self.num_queries, self.num_queries), device=query_embed.device).bool() # Default False (attend oq)
324
+ if self.task_switch['spatial'] and spatial_extra_flag:
325
+ # get spatial tokens
326
+ spatial_tokens = src_spatial_queries[level_index]
327
+ _spatial_tokens = spatial_tokens.detach().clone()
328
+
329
+ self.attention_data.set('tokens_spatial', 'tokens', spatial_tokens, _spatial_tokens)
330
+ self.attention_data.set_maskings('tokens_spatial', src_spatial_maskings[level_index])
331
+ self.attention_data.set_extra({"spatial_indices": src_spatial_indices[level_index]})
332
+
333
+ output, query_embed, self_attn_mask = self.attention_data.self_attn(bs, self.num_heads)
334
+
335
+ output = self.transformer_self_attention_layers[i](
336
+ output, tgt_mask=self_attn_mask,
337
+ tgt_key_padding_mask=None,
338
+ query_pos=query_embed)
339
+
340
+ # FFN
341
+ output = self.transformer_ffn_layers[i](
342
+ output
343
+ )
344
+
345
+ self.attention_data.update_variables(output, 'self_attn')
346
+ output, query_embed = self.attention_data.cross_attn_variables()
347
+ results = self.forward_prediction_heads(output, mask_features, attn_mask_target_size=size_list[(i + 1) % self.num_feature_levels], layer_id=i)
348
+ results["predictions_pos_spatial"] = spatial_query_pos.transpose(0,1) if spatial_extra_flag else None
349
+ results["predictions_neg_spatial"] = spatial_query_neg.transpose(0,1) if spatial_extra_flag else None
350
+ self.attention_data.set_results(results)
351
+
352
+ return self.attention_data.organize_output()
353
+
354
+ def forward_prediction_heads(self, output, mask_features, attn_mask_target_size, layer_id=-1):
355
+ decoder_output = self.decoder_norm(output)
356
+ decoder_output = decoder_output.transpose(0, 1)
357
+ class_embed = decoder_output @ self.class_embed
358
+ outputs_class = self.lang_encoder.compute_similarity(class_embed)
359
+ mask_embed = self.mask_embed(decoder_output)
360
+ outputs_mask = torch.einsum("bqc,bchw->bqhw", mask_embed, mask_features)
361
+
362
+ outputs_bbox = [None for i in range(len(outputs_mask))]
363
+ if self.task_switch['bbox']:
364
+ outputs_bbox = self.bbox_embed(decoder_output)
365
+
366
+ # NOTE: prediction is of higher-resolution
367
+ # [B, Q, H, W] -> [B, Q, H*W] -> [B, h, Q, H*W] -> [B*h, Q, HW]
368
+ attn_mask = F.interpolate(outputs_mask, size=attn_mask_target_size, mode="bilinear", align_corners=False)
369
+
370
+ # must use bool type
371
+ # If a BoolTensor is provided, positions with ``True`` are not allowed to attend while ``False`` values will be unchanged.
372
+ attn_mask = (attn_mask.sigmoid().flatten(2).unsqueeze(1).repeat(1, self.num_heads, 1, 1).flatten(0, 1) < 0.5).bool()
373
+ attn_mask = attn_mask.detach()
374
+
375
+ outputs_caption = class_embed
376
+
377
+ results = {
378
+ "attn_mask": attn_mask,
379
+ "predictions_class": outputs_class,
380
+ "predictions_mask": outputs_mask,
381
+ "predictions_bbox": outputs_bbox,
382
+ "predictions_caption": outputs_caption,
383
+ "predictions_maskemb": mask_embed,
384
+ }
385
+ return results
386
+
387
+ @register_decoder
388
+ def get_seem_interface(cfg, in_channels, lang_encoder, mask_classification, extra):
389
+ return SEEMDecoder(cfg, in_channels, lang_encoder, mask_classification, extra)
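
A small sketch, with illustrative sizes only, of the padding idiom the spatial-query branch above relies on: variable-length point sets are padded with -1 by pad_sequence, and the padding mask is recovered because genuine normalized coordinates never sum to a negative value.

import torch
from torch import nn

# Hypothetical per-image point sets with different lengths, coordinates in [0, 1).
points_per_image = [torch.rand(7, 2), torch.rand(3, 2)]
padded = nn.utils.rnn.pad_sequence(points_per_image, padding_value=-1).permute(1, 0, 2)
pad_mask = padded.sum(dim=-1) < 0      # True exactly on the padded rows
padded[pad_mask] = 0                   # zero out padding before sampling, as in the code above
print(padded.shape, pad_mask.shape)    # torch.Size([2, 7, 2]) torch.Size([2, 7])
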
modeling/interface/xdecoder.py ADDED
@@ -0,0 +1,497 @@
 
1
+ # --------------------------------------------------------
2
+ # X-Decoder -- Generalized Decoding for Pixel, Image, and Language
3
+ # Copyright (c) 2022 Microsoft
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # Written by Xueyan Zou ([email protected])
6
+ # --------------------------------------------------------
7
+
8
+ import logging
9
+ from typing import Optional
10
+
11
+ import torch
12
+ from torch import nn, Tensor
13
+ from torch.nn import functional as F
14
+
15
+ from timm.models.layers import trunc_normal_
16
+ from detectron2.layers import Conv2d
17
+ import fvcore.nn.weight_init as weight_init
18
+
19
+ from .build import register_decoder
20
+ from .modules import SelfAttentionLayer, CrossAttentionLayer, FFNLayer, MLP
21
+ from ..utils import configurable
22
+ from ..modules import PositionEmbeddingSine
23
+
24
+
25
+ class XDecoder(nn.Module):
26
+
27
+ @configurable
28
+ def __init__(
29
+ self,
30
+ lang_encoder: nn.Module,
31
+ in_channels,
32
+ mask_classification=True,
33
+ *,
34
+ hidden_dim: int,
35
+ dim_proj: int,
36
+ num_queries: int,
37
+ contxt_len: int,
38
+ nheads: int,
39
+ dim_feedforward: int,
40
+ dec_layers: int,
41
+ pre_norm: bool,
42
+ mask_dim: int,
43
+ task_switch: dict,
44
+ captioning_step: int,
45
+ enforce_input_project: bool,
46
+ ):
47
+ """
48
+ NOTE: this interface is experimental.
49
+ Args:
50
+ in_channels: channels of the input features
51
+ mask_classification: whether to add mask classifier or not
52
+ num_classes: number of classes
53
+ hidden_dim: Transformer feature dimension
54
+ num_queries: number of queries
55
+ nheads: number of heads
56
+ dim_feedforward: feature dimension in feedforward network
57
+ enc_layers: number of Transformer encoder layers
58
+ dec_layers: number of Transformer decoder layers
59
+ pre_norm: whether to use pre-LayerNorm or not
60
+ mask_dim: mask feature dimension
61
+ enforce_input_project: add an input projection 1x1 conv even if input
62
+ channels and hidden dim are identical
63
+ """
64
+ super().__init__()
65
+ assert mask_classification, "Only support mask classification model"
66
+ self.mask_classification = mask_classification
67
+
68
+ # positional encoding
69
+ N_steps = hidden_dim // 2
70
+ self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True)
71
+
72
+ # define Transformer decoder here
73
+ self.num_heads = nheads
74
+ self.num_layers = dec_layers
75
+ self.contxt_len = contxt_len
76
+ self.transformer_self_attention_layers = nn.ModuleList()
77
+ self.transformer_cross_attention_layers = nn.ModuleList()
78
+ self.transformer_ffn_layers = nn.ModuleList()
79
+
80
+ for _ in range(self.num_layers):
81
+ self.transformer_self_attention_layers.append(
82
+ SelfAttentionLayer(
83
+ d_model=hidden_dim,
84
+ nhead=nheads,
85
+ dropout=0.0,
86
+ normalize_before=pre_norm,
87
+ )
88
+ )
89
+
90
+ self.transformer_cross_attention_layers.append(
91
+ CrossAttentionLayer(
92
+ d_model=hidden_dim,
93
+ nhead=nheads,
94
+ dropout=0.0,
95
+ normalize_before=pre_norm,
96
+ )
97
+ )
98
+
99
+ self.transformer_ffn_layers.append(
100
+ FFNLayer(
101
+ d_model=hidden_dim,
102
+ dim_feedforward=dim_feedforward,
103
+ dropout=0.0,
104
+ normalize_before=pre_norm,
105
+ )
106
+ )
107
+
108
+ self.decoder_norm = nn.LayerNorm(hidden_dim)
109
+
110
+ self.num_queries = num_queries
111
+ # learnable query features
112
+ self.query_feat = nn.Embedding(num_queries, hidden_dim)
113
+ # learnable query p.e.
114
+ self.query_embed = nn.Embedding(num_queries, hidden_dim)
115
+
116
+ # level embedding (we always use 3 scales)
117
+ self.num_feature_levels = 3
118
+ self.level_embed = nn.Embedding(self.num_feature_levels, hidden_dim)
119
+ self.input_proj = nn.ModuleList()
120
+
121
+ for _ in range(self.num_feature_levels):
122
+ if in_channels != hidden_dim or enforce_input_project:
123
+ self.input_proj.append(Conv2d(in_channels, hidden_dim, kernel_size=1))
124
+ weight_init.c2_xavier_fill(self.input_proj[-1])
125
+ else:
126
+ self.input_proj.append(nn.Sequential())
127
+
128
+ self.task_switch = task_switch
129
+
130
+ # output FFNs
131
+ self.lang_encoder = lang_encoder
132
+ if self.task_switch['mask']:
133
+ self.mask_embed = MLP(hidden_dim, hidden_dim, mask_dim, 3)
134
+
135
+ self.class_embed = nn.Parameter(torch.empty(hidden_dim, dim_proj))
136
+ trunc_normal_(self.class_embed, std=.02)
137
+
138
+ if task_switch['bbox']:
139
+ self.bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3)
140
+
141
+ # Caption Project and query
142
+ if task_switch['captioning']:
143
+ self.caping_embed = nn.Parameter(torch.empty(hidden_dim, dim_proj))
144
+ trunc_normal_(self.caping_embed, std=.02)
145
+ self.pos_embed_caping = nn.Embedding(contxt_len, hidden_dim)
146
+ self.captioning_step = captioning_step
147
+
148
+ # register self_attn_mask to avoid information leakage; it covers the interaction between object queries, the class query, and the captioning queries
149
+ self_attn_mask = torch.zeros((1, num_queries + contxt_len, num_queries + contxt_len)).bool()
150
+ self_attn_mask[:, :num_queries, num_queries:] = True # object+class queries do not attend to caption queries.
151
+ self_attn_mask[:, num_queries:, num_queries:] = torch.triu(torch.ones((1, contxt_len, contxt_len)), diagonal=1).bool() # caption queries attend only to previous tokens.
152
+ self_attn_mask[:, :num_queries-1, num_queries-1:num_queries] = True # object queries do not attend to the class query.
153
+ self_attn_mask[:, num_queries-1:num_queries, :num_queries-1] = True # the class query does not attend to object queries.
154
+ self.register_buffer("self_attn_mask", self_attn_mask)
155
+
156
+
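
A toy reconstruction of the mask registered above, with made-up sizes, so the blocked positions can be printed and inspected; it is only a sketch of the same four assignments, not repository code.

import torch

# Toy sizes: 4 queries (3 object + 1 class) and 3 caption tokens.
num_queries, contxt_len = 4, 3
m = torch.zeros((1, num_queries + contxt_len, num_queries + contxt_len)).bool()
m[:, :num_queries, num_queries:] = True                     # queries may not see caption tokens
m[:, num_queries:, num_queries:] = torch.triu(              # caption tokens are causal
    torch.ones((1, contxt_len, contxt_len)), diagonal=1).bool()
m[:, :num_queries-1, num_queries-1:num_queries] = True      # object queries may not see the class query
m[:, num_queries-1:num_queries, :num_queries-1] = True      # the class query may not see object queries
print(m.int()[0])                                            # 1 marks a blocked position
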
157
+ @classmethod
158
+ def from_config(cls, cfg, in_channels, lang_encoder, mask_classification, extra):
159
+ ret = {}
160
+
161
+ ret["lang_encoder"] = lang_encoder
162
+ ret["in_channels"] = in_channels
163
+ ret["mask_classification"] = mask_classification
164
+
165
+ enc_cfg = cfg['MODEL']['ENCODER']
166
+ dec_cfg = cfg['MODEL']['DECODER']
167
+
168
+ ret["hidden_dim"] = dec_cfg['HIDDEN_DIM']
169
+ ret["dim_proj"] = cfg['MODEL']['DIM_PROJ']
170
+ ret["num_queries"] = dec_cfg['NUM_OBJECT_QUERIES']
171
+ ret["contxt_len"] = cfg['MODEL']['TEXT']['CONTEXT_LENGTH']
172
+
173
+ # Transformer parameters:
174
+ ret["nheads"] = dec_cfg['NHEADS']
175
+ ret["dim_feedforward"] = dec_cfg['DIM_FEEDFORWARD']
176
+
177
+ # NOTE: because the learnable query features require supervision,
178
+ # we subtract 1 from the configured decoder layers to stay consistent with our loss
179
+ # implementation: the number of auxiliary losses always equals the number of
180
+ # decoder layers. With learnable query features, the number of auxiliary
181
+ # losses equals the number of decoder layers plus 1.
182
+ assert dec_cfg['DEC_LAYERS'] >= 1
183
+ ret["dec_layers"] = dec_cfg['DEC_LAYERS'] - 1
184
+ ret["pre_norm"] = dec_cfg['PRE_NORM']
185
+ ret["enforce_input_project"] = dec_cfg['ENFORCE_INPUT_PROJ']
186
+ ret["mask_dim"] = enc_cfg['MASK_DIM']
187
+
188
+ ret["task_switch"] = extra['task_switch']
189
+ ret["captioning_step"] = dec_cfg['CAPTIONING'].get('STEP', 50)
190
+
191
+ return ret
192
+
193
+ def forward(self, x, mask_features, mask=None, target_queries=None, target_vlp=None, task='seg', extra={}):
194
+ if task == 'captioning_infer':
195
+ return self.forward_captioning(x, mask_features, mask=mask, target_queries=target_queries, target_vlp=target_vlp, task=task, extra=extra)
196
+ # x is a list of multi-scale feature
197
+ assert len(x) == self.num_feature_levels
198
+ src = []
199
+ pos = []
200
+ size_list = []
201
+
202
+ # disable mask, it does not affect performance
203
+ del mask
204
+ for i in range(self.num_feature_levels):
205
+ size_list.append(x[i].shape[-2:])
206
+ pos.append(self.pe_layer(x[i], None).flatten(2))
207
+ src.append(self.input_proj[i](x[i]).flatten(2) + self.level_embed.weight[i][None, :, None])
208
+
209
+ # flatten NxCxHxW to HWxNxC
210
+ pos[-1] = pos[-1].permute(2, 0, 1)
211
+ src[-1] = src[-1].permute(2, 0, 1)
212
+
213
+ _, bs, _ = src[0].shape
214
+
215
+ # QxNxC
216
+ query_embed = self.query_embed.weight.unsqueeze(1).repeat(1, bs, 1)
217
+ output = self.query_feat.weight.unsqueeze(1).repeat(1, bs, 1)
218
+
219
+ predictions_class = []
220
+ predictions_mask = []
221
+ predictions_bbox = []
222
+ predictions_caption = []
223
+ predictions_captioning = []
224
+
225
+ self_tgt_mask = None
226
+ if self.training and task == 'vlp' and self.task_switch['captioning']:
227
+ # output = torch.cat((output, self.query_feat_caping.weight.unsqueeze(1).repeat(1, bs, 1)), dim=0) # concat object query, class token and caption token.
228
+ caping_lang_embed = torch.cat([caption['caption_tokens'] for caption in target_vlp], dim=0).transpose(0, 1) # language output
229
+ _caping_lang_embed = caping_lang_embed.detach().clone()
230
+ output = torch.cat((output, _caping_lang_embed), dim=0) # concat object query, class token and caption token.
231
+ caping_lang_embed += self.pos_embed_caping.weight.unsqueeze(1).repeat(1, bs, 1)
232
+ query_embed = torch.cat((query_embed, caping_lang_embed), dim=0) # may not add at the beginning.
233
+ self_tgt_mask = self.self_attn_mask.repeat(output.shape[1]*self.num_heads, 1, 1)
234
+ elif (((self.training and task == 'seg') or (task == 'grounding_eval')) and self.task_switch['grounding']):
235
+ self_tgt_mask = self.self_attn_mask[:,:self.num_queries,:self.num_queries].repeat(output.shape[1]*self.num_heads, 1, 1)
236
+ grounding_tokens = extra['grounding_tokens']
237
+ _grounding_tokens = grounding_tokens.detach().clone()
238
+ # initialize with negative attention at the beginning.
239
+ pad_tgt_mask = torch.ones((1, self.num_queries + (self.num_queries-1) + len(grounding_tokens), self.num_queries + (self.num_queries-1) + len(grounding_tokens)), device=self_tgt_mask.device).bool().repeat(output.shape[1]*self.num_heads, 1, 1)
240
+ pad_tgt_mask[:,:self.num_queries,:self.num_queries] = self_tgt_mask
241
+ pad_tgt_mask[:,self.num_queries:,self.num_queries:] = False # grounding tokens can attend to each other
242
+ self_tgt_mask = pad_tgt_mask
243
+ output = torch.cat((output, output[:-1]), dim=0)
244
+ query_embed = torch.cat((query_embed, query_embed[:-1]), dim=0) # also pad the language embedding so the sizes stay consistent
245
+ else:
246
+ self_tgt_mask = self.self_attn_mask[:,:self.num_queries,:self.num_queries].repeat(output.shape[1]*self.num_heads, 1, 1)
247
+
248
+ # prediction heads on learnable query features
249
+ results = self.forward_prediction_heads(output, mask_features, attn_mask_target_size=size_list[0], task=task)
250
+ attn_mask = results["attn_mask"]
251
+ predictions_class.append(results["outputs_class"])
252
+ predictions_mask.append(results["outputs_mask"])
253
+ predictions_bbox.append(results["outputs_bbox"])
254
+ predictions_caption.append(results["outputs_caption"])
255
+ predictions_captioning.append(results["outputs_captionting"])
256
+
257
+ for i in range(self.num_layers):
258
+ level_index = i % self.num_feature_levels
259
+ attn_mask[torch.where(attn_mask.sum(-1) == attn_mask.shape[-1])] = False
260
+
261
+ if self.training and task == 'vlp' and self.task_switch['captioning']:
262
+ attn_mask = torch.cat((attn_mask, torch.zeros_like(attn_mask[:, :self.contxt_len, :])), dim=1)
263
+ # attention: cross-attention first
264
+ output, avg_attn = self.transformer_cross_attention_layers[i](
265
+ output, src[level_index],
266
+ memory_mask=attn_mask,
267
+ memory_key_padding_mask=None, # here we do not apply masking on padded region
268
+ pos=pos[level_index], query_pos=query_embed
269
+ )
270
+
271
+ if (((self.training and task == 'seg') or (task == 'grounding_eval')) and self.task_switch['grounding']):
272
+ output = torch.cat((output, _grounding_tokens), dim=0)
273
+ query_embed = torch.cat((query_embed, grounding_tokens), dim=0)
274
+
275
+ output = self.transformer_self_attention_layers[i](
276
+ output, tgt_mask=self_tgt_mask,
277
+ tgt_key_padding_mask=None,
278
+ query_pos=query_embed
279
+ )
280
+
281
+ # FFN
282
+ output = self.transformer_ffn_layers[i](
283
+ output
284
+ )
285
+
286
+ if ((self.training and task == 'seg') or (task == 'grounding_eval')) and self.task_switch['grounding']:
287
+ _grounding_tokens = output[-len(_grounding_tokens):]
288
+ output = output[:-len(_grounding_tokens)]
289
+ query_embed = query_embed[:-len(_grounding_tokens)]
290
+
291
+ results = self.forward_prediction_heads(output, mask_features, attn_mask_target_size=size_list[(i + 1) % self.num_feature_levels], layer_id=i, task=task)
292
+ attn_mask = results["attn_mask"]
293
+ predictions_class.append(results["outputs_class"])
294
+ predictions_mask.append(results["outputs_mask"])
295
+ predictions_bbox.append(results["outputs_bbox"])
296
+ predictions_caption.append(results["outputs_caption"])
297
+ predictions_captioning.append(results["outputs_captionting"])
298
+
299
+ assert len(predictions_class) == self.num_layers + 1
300
+ if task == 'vlp':
301
+ out = {'pred_captionings': predictions_captioning[-1],
302
+ 'pred_captions': predictions_caption[-1],
303
+ 'aux_outputs': [{'pred_captionings': x, 'pred_captions': y } for x, y in zip(predictions_captioning[:-1], predictions_caption[:-1])]}
304
+ return out
305
+ else:
306
+ out = {
307
+ 'pred_logits': predictions_class[-1],
308
+ 'pred_masks': predictions_mask[-1],
309
+ 'pred_boxes': predictions_bbox[-1],
310
+ 'pred_captions': predictions_caption[-1],
311
+ 'aux_outputs': self._set_aux_loss(
312
+ predictions_class if self.mask_classification else None, predictions_mask, predictions_bbox, predictions_caption
313
+ )
314
+ }
315
+ return out
316
+
317
+ def forward_captioning(self, x, mask_features, mask = None, target_queries = None, target_vlp = None, task='seg', extra={}):
318
+ # x is a list of multi-scale feature
319
+ assert len(x) == self.num_feature_levels
320
+ src = []
321
+ pos = []
322
+ size_list = []
323
+
324
+ # disable mask, it does not affect performance
325
+ del mask
326
+ for i in range(self.num_feature_levels):
327
+ size_list.append(x[i].shape[-2:])
328
+ pos.append(self.pe_layer(x[i], None).flatten(2))
329
+ src.append(self.input_proj[i](x[i]).flatten(2) + self.level_embed.weight[i][None, :, None])
330
+
331
+ # flatten NxCxHxW to HWxNxC
332
+ pos[-1] = pos[-1].permute(2, 0, 1)
333
+ src[-1] = src[-1].permute(2, 0, 1)
334
+
335
+ _, bs, _ = src[0].shape
336
+
337
+ # QxNxC
338
+ query_embed_ = self.query_embed.weight.unsqueeze(1).repeat(1, bs, 1)
339
+ query_feat = self.query_feat.weight.unsqueeze(1).repeat(1, bs, 1)
340
+ caping_lang_token = extra['start_token'].repeat(bs, 1)
341
+ pos_embed_caping = self.pos_embed_caping.weight.unsqueeze(1).repeat(1, bs, 1)
342
+
343
+ # prepare token embedding for evaluation
344
+ token_embs = self.lang_encoder.lang_encoder.token_embedding.weight
345
+ # token_embs = (token_embs / token_embs.norm(dim=-1, keepdim=True) + 1e-7)
346
+
347
+ for cap_idx in range(0, self.captioning_step):
348
+ caping_lang_embed = self.lang_encoder.forward_language_token((caping_lang_token,))[0].transpose(0, 1)
349
+ output = torch.cat((query_feat, caping_lang_embed), dim=0) # concat object query, class token and caption token.
350
+ caping_lang_embed += pos_embed_caping
351
+ query_embed = torch.cat((query_embed_, caping_lang_embed), dim=0) # may not add at the beginning.
352
+ # output = torch.cat((query_feat, query_feat_caping), dim=0) # concat object query, class token and caption token.
353
+
354
+ # prediction heads on learnable query features
355
+ results = self.forward_prediction_heads(output, mask_features, attn_mask_target_size=size_list[0], task=task)
356
+ attn_mask = results["attn_mask"]
357
+
358
+ for i in range(self.num_layers):
359
+ level_index = i % self.num_feature_levels
360
+ attn_mask[torch.where(attn_mask.sum(-1) == attn_mask.shape[-1])] = False
361
+ attn_mask = torch.cat((attn_mask, torch.zeros_like(attn_mask[:, :self.contxt_len, :])), dim=1)
362
+ self_tgt_mask = self.self_attn_mask.repeat(output.shape[1]*self.num_heads, 1, 1)
363
+
364
+ if extra['captioning_mask'] is not None:
365
+ bs,nq,wh = attn_mask.shape
366
+ assert bs==self.num_heads, "Only support single image referring captioning."
367
+ cap_mask = extra['captioning_mask']
368
+ attn_mask = attn_mask.reshape(bs,nq,size_list[i%3][0],size_list[i%3][1])
369
+ cap_mask = F.interpolate(cap_mask[None,].float(), size_list[i%3], mode='nearest').bool()[0,0]
370
+ attn_mask[:,self.num_queries:, cap_mask] = True
371
+ attn_mask = attn_mask.reshape(bs,nq,wh)
372
+
373
+ # attention: cross-attention first
374
+ output, avg_attn = self.transformer_cross_attention_layers[i](
375
+ output, src[level_index],
376
+ memory_mask=attn_mask,
377
+ memory_key_padding_mask=None, # here we do not apply masking on padded region
378
+ pos=pos[level_index], query_pos=query_embed
379
+ )
380
+
381
+ output = self.transformer_self_attention_layers[i](
382
+ output, tgt_mask=self_tgt_mask,
383
+ tgt_key_padding_mask=None,
384
+ query_pos=query_embed
385
+ )
386
+
387
+ # FFN
388
+ output = self.transformer_ffn_layers[i](
389
+ output
390
+ )
391
+
392
+ results = self.forward_prediction_heads(output, mask_features, attn_mask_target_size=size_list[(i + 1) % self.num_feature_levels], layer_id=i, task=task)
393
+ attn_mask = results["attn_mask"]
394
+
395
+ pred_captions_gen = results['outputs_captionting']
396
+ # pred_captions_gen = (pred_captions_gen / pred_captions_gen.norm(dim=-1, keepdim=True) + 1e-7)
397
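+ # Greedy decoding step: project the predicted caption embeddings onto the token-embedding
+ # matrix and take the argmax as the next token, one token per captioning_step iteration.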
+ pred_captions_gen = pred_captions_gen @ token_embs.t()
398
+ caping_lang_token[:,cap_idx+1] = pred_captions_gen[:,cap_idx].max(-1)[1]
399
+
400
+ texts = self.lang_encoder.tokenizer.batch_decode(caping_lang_token, skip_special_tokens=False)
401
+ texts_new = []
402
+
403
+ for x in texts:
404
+ x = x.split('<|endoftext|>')[0]
405
+ x = x.replace('<|endoftext|>','')
406
+ x = x.replace('<|startoftext|>','')
407
+ x = x.strip()
408
+ texts_new.append(x)
409
+
410
+ out = {'pred_captionings': caping_lang_token,
411
+ 'pred_texts': texts_new}
412
+ return out
413
+
414
+
415
+ def forward_prediction_heads(self, output, mask_features, attn_mask_target_size, layer_id=-1, task='seg'):
416
+ decoder_output = self.decoder_norm(output)
417
+ decoder_output = decoder_output.transpose(0, 1)
418
+
419
+ # extract image captioning token from decoder output.
420
+ if self.task_switch['captioning'] and (task == 'vlp' or task == 'captioning_infer'):
421
+ outputs_captionting = decoder_output[:,self.num_queries:] @ self.caping_embed
422
+ else:
423
+ outputs_captionting = None
424
+
425
+ # recompute class token output.
426
+ norm_decoder_output = decoder_output / (decoder_output.norm(dim=-1, keepdim=True) + 1e-7)
427
+ obj_token = norm_decoder_output[:,:self.num_queries-1]
428
+ cls_token = norm_decoder_output[:,self.num_queries-1:self.num_queries]
429
+
430
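+ # Recompute the class token as a similarity-weighted average of the object queries:
+ # cosine similarities (both sides are L2-normalized) are softmaxed and used as weights.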
+ sim = (cls_token @ obj_token.transpose(1,2)).softmax(-1)[:,0,:,None] # TODO include class token.
431
+ cls_token = (sim * decoder_output[:,:self.num_queries-1]).sum(dim=1, keepdim=True)
432
+
433
+ if (((self.training and task == 'seg') or (task == 'grounding_eval')) and self.task_switch['grounding']):
434
+ decoder_output = torch.cat((decoder_output[:,:self.num_queries-1], cls_token, decoder_output[:,self.num_queries:2*self.num_queries-1]), dim=1)
435
+ else:
436
+ decoder_output = torch.cat((decoder_output[:,:self.num_queries-1], cls_token), dim=1)
437
+
438
+ # compute class, mask and bbox.
439
+ class_embed = decoder_output @ self.class_embed
440
+ # HACK do not compute similarity if mask is not on
441
+ outputs_class = self.lang_encoder.compute_similarity(class_embed, fake=(((not self.task_switch['mask']) and self.training)))
442
+
443
+ if self.task_switch['mask']:
444
+ mask_embed = self.mask_embed(decoder_output)
445
+ outputs_mask = torch.einsum("bqc,bchw->bqhw", mask_embed, mask_features)
446
+
447
+ # NOTE: prediction is of higher-resolution
448
+ # [B, Q, H, W] -> [B, Q, H*W] -> [B, h, Q, H*W] -> [B*h, Q, HW]
449
+ attn_mask = F.interpolate(outputs_mask, size=attn_mask_target_size, mode="bicubic", align_corners=False, antialias=True)
450
+
451
+ # must use bool type
452
+ # If a BoolTensor is provided, positions with ``True`` are not allowed to attend while ``False`` values will be unchanged.
453
+ attn_mask = (attn_mask.sigmoid().flatten(2).unsqueeze(1).repeat(1, self.num_heads, 1, 1).flatten(0, 1) < 0.5).bool()
454
+ attn_mask = attn_mask.detach()
455
+
456
+ # NOTE: fill False for cls token (JY)
457
+ attn_mask[:, self.num_queries:self.num_queries+1].fill_(False)
458
+ else:
459
+ outputs_mask = None
460
+ attn_mask = torch.zeros((list(decoder_output.shape[:2]) + [attn_mask_target_size[0]*attn_mask_target_size[1]]), device=decoder_output.device).repeat(self.num_heads, 1, 1).bool()
461
+
462
+ outputs_bbox = [None for i in range(len(decoder_output))]
463
+ if self.task_switch['bbox']:
464
+ outputs_bbox = self.bbox_embed(decoder_output)
465
+
466
+ outputs_caption = None
467
+ if self.task_switch['caption']:
468
+ outputs_caption = class_embed
469
+
470
+
471
+ results = {
472
+ "outputs_class": outputs_class,
473
+ "outputs_mask": outputs_mask,
474
+ "outputs_bbox": outputs_bbox,
475
+ "attn_mask": attn_mask,
476
+ "outputs_caption": outputs_caption,
477
+ "outputs_captionting": outputs_captionting,
478
+ }
479
+ return results
480
+
481
+ @torch.jit.unused
482
+ def _set_aux_loss(self, outputs_class, outputs_seg_masks, outputs_boxes, outputs_captions):
483
+ # this is a workaround to make torchscript happy, as torchscript
484
+ # doesn't support dictionary with non-homogeneous values, such
485
+ # as a dict having both a Tensor and a list.
486
+ if self.mask_classification:
487
+ return [
488
+ {"pred_logits": a, "pred_masks": b, "pred_boxes": c, "pred_captions": d}
489
+ for a, b, c, d in zip(outputs_class[:-1], outputs_seg_masks[:-1], outputs_boxes[:-1], outputs_captions[:-1])
490
+ ]
491
+ else:
492
+ return [{"pred_masks": b} for b in outputs_seg_masks[:-1]]
493
+
494
+
495
+ @register_decoder
496
+ def get_xdecoder_interface(cfg, in_channels, lang_encoder, mask_classification, extra):
497
+ return XDecoder(cfg, in_channels, lang_encoder, mask_classification, extra)
modeling/language/LangEncoder/__init__.py ADDED
@@ -0,0 +1,35 @@
1
+ import os
+
+ from transformers import CLIPTokenizer, CLIPTokenizerFast
2
+ from transformers import AutoTokenizer
3
+
4
+ from .transformer import *
5
+ from .build import *
6
+
7
+
8
+ def build_lang_encoder(config_encoder, tokenizer, verbose, **kwargs):
9
+ model_name = config_encoder['NAME']
10
+
11
+ if not is_lang_encoder(model_name):
12
+ raise ValueError(f'Unknown model: {model_name}')
13
+
14
+ return lang_encoders(model_name)(config_encoder, tokenizer, verbose, **kwargs)
15
+
16
+ def build_tokenizer(config_encoder):
17
+ tokenizer = None
18
+ os.environ['TOKENIZERS_PARALLELISM'] = 'true'
19
+ if config_encoder['TOKENIZER'] == 'clip':
20
+ pretrained_tokenizer = config_encoder.get(
21
+ 'PRETRAINED_TOKENIZER', 'openai/clip-vit-base-patch32'
22
+ )
23
+ tokenizer = CLIPTokenizer.from_pretrained(pretrained_tokenizer)
24
+ tokenizer.add_special_tokens({'cls_token': tokenizer.eos_token})
25
+ elif config_encoder['TOKENIZER'] == 'clip-fast':
26
+ pretrained_tokenizer = config_encoder.get(
27
+ 'PRETRAINED_TOKENIZER', 'openai/clip-vit-base-patch32'
28
+ )
29
+ tokenizer = CLIPTokenizerFast.from_pretrained(pretrained_tokenizer, from_slow=True)
30
+ elif config_encoder['TOKENIZER'] == 'biomed-clip':
31
+ tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext")
32
+ else:
33
+ tokenizer = AutoTokenizer.from_pretrained(config_encoder['TOKENIZER'])
34
+
35
+ return tokenizer
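For orientation, a minimal usage sketch of build_tokenizer (the config dict and the context length of 77 are illustrative assumptions, not values taken from this commit's configs):

    from modeling.language.LangEncoder import build_tokenizer

    text_cfg = {'TOKENIZER': 'biomed-clip'}   # or 'clip' / 'clip-fast' / any HF tokenizer name
    tokenizer = build_tokenizer(text_cfg)
    tokens = tokenizer(['neoplastic cells in breast pathology'],
                       padding='max_length', truncation=True,
                       max_length=77, return_tensors='pt')
    print(tokens['input_ids'].shape)          # e.g. torch.Size([1, 77])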
modeling/language/LangEncoder/build.py ADDED
@@ -0,0 +1,16 @@
1
+ _lang_encoders = {}
2
+
3
+
4
+ def register_lang_encoder(fn):
5
+ module_name_split = fn.__module__.split('.')
6
+ model_name = module_name_split[-1]
7
+
8
+ _lang_encoders[model_name] = fn
9
+
10
+ return fn
11
+
12
+ def lang_encoders(model_name):
13
+ return _lang_encoders[model_name]
14
+
15
+ def is_lang_encoder(model_name):
16
+ return model_name in _lang_encoders
modeling/language/LangEncoder/transformer.py ADDED
@@ -0,0 +1,222 @@
1
+ from collections import OrderedDict
2
+ from typing import Tuple, Union
3
+ import logging
4
+ import os
5
+
6
+ import numpy as np
7
+ import torch
8
+ import torch.nn.functional as F
9
+ from torch import nn
10
+
11
+ from timm.models.layers import DropPath, trunc_normal_
12
+
13
+ from .build import register_lang_encoder
14
+ from utilities.distributed import is_main_process
15
+ from utilities.model import register_norm_module
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
+ @register_norm_module
21
+ class LayerNorm(nn.Module):
22
+ def __init__(self, hidden_size, eps=1e-12):
23
+ """Construct a layernorm module in the TF style (epsilon inside the square root).
24
+ """
25
+ super(LayerNorm, self).__init__()
26
+ self.weight = nn.Parameter(torch.ones(hidden_size))
27
+ self.bias = nn.Parameter(torch.zeros(hidden_size))
28
+ self.variance_epsilon = eps
29
+
30
+ def forward(self, x):
31
+ pdtype = x.dtype
32
+ x = x.float()
33
+ u = x.mean(-1, keepdim=True)
34
+ s = (x - u).pow(2).mean(-1, keepdim=True)
35
+ x = (x - u) / torch.sqrt(s + self.variance_epsilon)
36
+ return self.weight * x.to(pdtype) + self.bias
37
+
38
+
39
+ class QuickGELU(nn.Module):
40
+ def forward(self, x: torch.Tensor):
41
+ return x * torch.sigmoid(1.702 * x)
42
+
43
+
44
+ class ResidualAttentionBlock(nn.Module):
45
+ def __init__(self,
46
+ d_model: int,
47
+ n_head: int,
48
+ attn_mask: torch.Tensor = None,
49
+ drop_path: float = 0.0):
50
+ super().__init__()
51
+
52
+ self.attn = nn.MultiheadAttention(d_model, n_head)
53
+ self.ln_1 = LayerNorm(d_model)
54
+ self.mlp = nn.Sequential(OrderedDict([
55
+ ("c_fc", nn.Linear(d_model, d_model * 4)),
56
+ ("gelu", QuickGELU()),
57
+ ("c_proj", nn.Linear(d_model * 4, d_model))
58
+ ]))
59
+ self.ln_2 = LayerNorm(d_model)
60
+ self.attn_mask = attn_mask
61
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
62
+
63
+ def attention(self, x: torch.Tensor, key_padding_mask: torch.Tensor = None):
64
+ self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) \
65
+ if self.attn_mask is not None else None
66
+
67
+
68
+ return self.attn(
69
+ x, x, x,
70
+ key_padding_mask=key_padding_mask,
71
+ need_weights=False,
72
+ attn_mask=self.attn_mask
73
+ )[0]
74
+
75
+ def forward(self, x: torch.Tensor, key_padding_mask: torch.Tensor = None):
76
+ x = x + self.drop_path(self.attention(self.ln_1(x), key_padding_mask=key_padding_mask))
77
+ x = x + self.drop_path(self.mlp(self.ln_2(x)))
78
+ return x
79
+
80
+
81
+ class Transformer(nn.Module):
82
+ def __init__(self,
83
+ context_length: int,
84
+ vocab_size: int,
85
+ width: int,
86
+ layers: int,
87
+ heads: int,
88
+ drop_path: float = 0.0,
89
+ autogressive: bool =True):
90
+ super().__init__()
91
+
92
+ self.token_embedding = nn.Embedding(vocab_size, width)
93
+
94
+ self.context_length = context_length
95
+ self.positional_embedding = nn.Parameter(
96
+ torch.empty(self.context_length, width)
97
+ )
98
+
99
+ self.width = width
100
+ self.layers = layers
101
+ self.autogressive = autogressive
102
+ attn_mask = self.build_attention_mask() if autogressive else None
103
+ dpr = [x.item() for x in torch.linspace(0, drop_path, layers)] # stochastic depth decay rule
104
+ self.resblocks = nn.ModuleList(
105
+ [
106
+ ResidualAttentionBlock(width, heads, attn_mask, dpr[i])
107
+ for i in range(layers)
108
+ ]
109
+ )
110
+
111
+ self.ln_final = LayerNorm(width)
112
+
113
+ trunc_normal_(self.positional_embedding, std=.02)
114
+ # nn.init.normal_(self.token_embedding, std=.02)
115
+ trunc_normal_(self.token_embedding.weight, std=.02)
116
+ self.apply(self._init_weights)
117
+
118
+ @property
119
+ def dim_out(self):
120
+ return self.width
121
+
122
+ def build_attention_mask(self):
123
+ # lazily create causal attention mask, with full attention between the vision tokens
124
+ # pytorch uses additive attention mask; fill with -inf
125
+ mask = torch.empty(self.context_length, self.context_length)
126
+ mask.fill_(float("-inf"))
127
+ mask.triu_(1) # zero out the lower diagonal
128
+ return mask
129
+
130
+ def _init_weights(self, m):
131
+ if isinstance(m, (nn.Linear, nn.Conv2d)):
132
+ if is_main_process():
133
+ logger.info('=> init weight of Linear/Conv2d from trunc norm')
134
+ trunc_normal_(m.weight, std=0.02)
135
+ if m.bias is not None:
136
+ if is_main_process():
137
+ logger.info('=> init bias of Linear/Conv2d to zeros')
138
+ nn.init.constant_(m.bias, 0)
139
+ elif isinstance(m, (nn.LayerNorm, nn.BatchNorm2d)):
140
+ nn.init.constant_(m.bias, 0)
141
+
142
+ def load_pretrained(self, pretrained='', pretrained_layers=[], verbose=True):
143
+ if os.path.isfile(pretrained):
144
+ pretrained_dict = torch.load(pretrained, map_location='cpu')
145
+ logging.info(f'=> loading pretrained model {pretrained}')
146
+ model_dict = self.state_dict()
147
+ stripped_key = lambda x: x[13:] if x.startswith('lang_encoder.') else x
148
+ pretrained_dict = {
149
+ stripped_key(k): v for k, v in pretrained_dict.items()
150
+ if stripped_key(k) in model_dict.keys()
151
+ }
152
+ need_init_state_dict = {}
153
+ for k, v in pretrained_dict.items():
154
+ need_init = (
155
+ k.split('.')[0] in pretrained_layers
156
+ or pretrained_layers[0] == '*'
157
+ )
158
+ if need_init:
159
+ if verbose:
160
+ logger.info(f'=> init {k} from {pretrained}')
161
+
162
+ if 'positional_embedding' in k and v.size() != model_dict[k].size():
163
+ positional_embedding_pretrained = v
164
+ positional_embedding_current = model_dict[k]
165
+ L1, nH1 = positional_embedding_pretrained.size()
166
+ L2, nH2 = positional_embedding_current.size()
167
+ if nH1 != nH2:
168
+ logger.info(f"Error in loading {k}, passing")
169
+ else:
170
+ if L1 != L2:
171
+ logger.info(
172
+ '=> load_pretrained: resized variant: {} to {}'
173
+ .format((L1, nH1), (L2, nH2))
174
+ )
175
+
176
+ posemb = positional_embedding_pretrained.float()
177
+ posemb_grid = posemb.unsqueeze(dim=0).permute(0, 2, 1)
178
+ posemb_grid = torch.nn.functional.interpolate(posemb_grid, size=L2, mode='linear')
179
+ posemb_grid = posemb_grid.permute(0, 2, 1).squeeze(dim=0)
180
+ v = posemb_grid
181
+
182
+ need_init_state_dict[k] = v
183
+
184
+ self.load_state_dict(need_init_state_dict, strict=False)
185
+
186
+
187
+ @torch.jit.ignore
188
+ def no_weight_decay(self):
189
+ return {
190
+ 'positional_embedding',
191
+ 'token_embedding',
192
+ }
193
+
194
+ def forward(self, input_ids, attention_mask=None):
195
+ key_padding_mask = (attention_mask == 0) if (not self.autogressive and attention_mask is not None) else None
196
+ # key_padding_mask = (input_ids == 0) if not self.autogressive else None
197
+ x = self.token_embedding(input_ids) # [batch_size, n_ctx, d_model]
198
+ x = x + self.positional_embedding
199
+ x = x.permute(1, 0, 2) # NLD -> LND
200
+ for block in self.resblocks:
201
+ x = block(x, key_padding_mask)
202
+ x = x.permute(1, 0, 2) # LND -> NLD
203
+
204
+ x = self.ln_final(x)
205
+
206
+ return {'last_hidden_state': x}
207
+
208
+
209
+ @register_lang_encoder
210
+ def lang_encoder(config_encoder, tokenizer, verbose, **kwargs):
211
+ transformer = Transformer(
212
+ context_length=config_encoder['CONTEXT_LENGTH'],
213
+ vocab_size=tokenizer.vocab_size,
214
+ width=config_encoder['WIDTH'],
215
+ layers=config_encoder['LAYERS'],
216
+ heads=config_encoder['HEADS'],
217
+ autogressive=config_encoder.get('AUTOGRESSIVE', True)
218
+ )
219
+
220
+ if config_encoder.get('LOAD_PRETRAINED', False):
221
+ transformer.load_pretrained(config_encoder['PRETRAINED'], config_encoder.get('PRETRAINED_LAYERS', ['*']))
222
+ return transformer
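The factory above reads a small set of config keys; a hedged example of the expected structure (the values are placeholders, not this commit's actual YAML):

    config_encoder = {
        'CONTEXT_LENGTH': 77,
        'WIDTH': 512,
        'LAYERS': 12,
        'HEADS': 8,
        'AUTOGRESSIVE': True,      # key spelling matches the code above
        'LOAD_PRETRAINED': False,
    }
    # transformer = lang_encoder(config_encoder, tokenizer, verbose=True)
    # out = transformer(input_ids)   # {'last_hidden_state': [B, CONTEXT_LENGTH, WIDTH]}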
modeling/language/__init__.py ADDED
@@ -0,0 +1,10 @@
1
+ from .vlpencoder import *
2
+ from .build import *
3
+
4
+ def build_language_encoder(config, **kwargs):
5
+ model_name = config['MODEL']['TEXT']['ARCH']
6
+
7
+ if not is_model(model_name):
8
+ raise ValueError(f'Unknown model: {model_name}')
9
+
10
+ return model_entrypoints(model_name)(config, **kwargs)
modeling/language/build.py ADDED
@@ -0,0 +1,14 @@
1
+ _model_entrypoints = {}
2
+
3
+
4
+ def register_model(fn):
5
+ module_name_split = fn.__module__.split('.')
6
+ model_name = module_name_split[-1]
7
+ _model_entrypoints[model_name] = fn
8
+ return fn
9
+
10
+ def model_entrypoints(model_name):
11
+ return _model_entrypoints[model_name]
12
+
13
+ def is_model(model_name):
14
+ return model_name in _model_entrypoints
modeling/language/loss.py ADDED
@@ -0,0 +1,232 @@
1
+ # --------------------------------------------------------
2
+ # X-Decoder -- Generalized Decoding for Pixel, Image, and Language
3
+ # Copyright (c) 2022 Microsoft
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # Written by Xueyan Zou ([email protected])
6
+ # --------------------------------------------------------
7
+
8
+ import pickle
9
+ from distutils import log
10
+
11
+ import torch
12
+ import torch.nn.functional as F
13
+ import torch.distributed as dist
14
+
15
+ from einops import rearrange, repeat
16
+ from timm.loss import SoftTargetCrossEntropy
17
+
18
+ soft_cross_entropy = SoftTargetCrossEntropy()
19
+
20
+ def is_dist_initialized():
21
+ return torch.distributed.is_initialized()
22
+
23
+ def get_world_size():
24
+ if is_dist_initialized():
25
+ return torch.distributed.get_world_size()
26
+ return 1
27
+
28
+ def get_rank():
29
+ if is_dist_initialized():
30
+ return dist.get_rank()
31
+ return 0
32
+
33
+ def all_gather_grad(x):
34
+ if get_world_size() > 1:
35
+ all_x = [torch.zeros_like(x) for _ in range(get_world_size())]
36
+ torch.distributed.all_gather(all_x, x)
37
+ all_x[torch.distributed.get_rank()] = x
38
+ x = torch.cat(all_x, dim=0)
39
+ return x
40
+
41
+ def vl_multilabel_contrastive_loss(image_feat, text_feat, temperature=1):
42
+ """
43
+ Args:
44
+ image_feat (torch.Tensor): shape [B, L1, C] # B: batch_size, L1: 1, C: 256
45
+ text_feat (torch.Tensor): shape [B, L2, C] # B:batch_size, L2: number of selected nouns, C: 256
46
+
47
+ Returns:
48
+ """
49
+ # [B, L1, C], L1 = 1
50
+ # image_feat = F.normalize(image_feat, dim=-1)
51
+ # [B, L2, C]
52
+ # text_feat = F.normalize(text_feat, dim=-1)
53
+ # HACK: normalize outside
54
+
55
+ # [B, L1, L2]
56
+ dist_per_img = image_feat @ rearrange(text_feat, 'b l c -> b c l')
57
+ # [B, L2, L1]
58
+ dist_per_text = text_feat @ rearrange(image_feat, 'b l c -> b c l')
59
+
60
+ batch = image_feat.shape[0]
61
+ img_len = image_feat.shape[1]
62
+ text_len = text_feat.shape[1]
63
+ # [B, L1, L2]
64
+ pos_labels_batch_img = rearrange(torch.ones_like(dist_per_text) / dist_per_text.size(1), 'b l2 l1 -> b l1 l2')
65
+ # [B, L2, L1]
66
+ pos_labels_batch_text = rearrange(torch.ones_like(dist_per_img) / dist_per_img.size(1), 'b l1 l2 -> b l2 l1')
67
+
68
+ image_x = rearrange(image_feat, 'b l c -> (b l) c')
69
+ text_x = rearrange(text_feat, 'b l c -> (b l) c')
70
+
71
+ logits_per_img = image_x @ all_gather_grad(text_x).t()
72
+ logits_per_text = text_x @ all_gather_grad(image_x).t()
73
+
74
+ # get label globally
75
+ # [B, L1, B, L2, W]
76
+ labels_per_img = F.one_hot(
77
+ torch.ones(batch, img_len, batch, text_len, dtype=torch.long, device=image_x.device) * get_rank(),
78
+ num_classes=get_world_size()).to(image_x.dtype)
79
+ labels_per_img *= rearrange(pos_labels_batch_img, 'b l1 l2 -> b l1 1 l2 1') * repeat(
80
+ torch.eye(batch, dtype=image_x.dtype, device=image_x.device), 'b1 b2 -> b1 1 b2 1 1')
81
+ # [BxL1, WxBxL2]
82
+ labels_per_img = rearrange(labels_per_img, 'b1 l1 b2 l2 w -> (b1 l1) (w b2 l2)')
83
+ # [B, L2, B, L1, W]
84
+ labels_per_text = F.one_hot(
85
+ torch.ones(batch, text_len, batch, img_len, dtype=torch.long, device=text_x.device) * get_rank(),
86
+ num_classes=get_world_size()).to(text_x.dtype)
87
+ labels_per_text *= rearrange(pos_labels_batch_text, 'b l2 l1 -> b l2 1 l1 1') * repeat(
88
+ torch.eye(batch, dtype=text_x.dtype, device=image_x.device), 'b2 b1 -> b2 1 b1 1 1')
89
+ # [BxL2, WxBxL1]
90
+ labels_per_text = rearrange(labels_per_text, 'b2 l2 b1 l1 w -> (b2 l2) (w b1 l1)')
91
+
92
+ logit_scale = temperature.exp().clamp(max=100)
93
+
94
+ loss_img = soft_cross_entropy(logit_scale * logits_per_img, labels_per_img)
95
+ loss_text = soft_cross_entropy(logit_scale * logits_per_text, labels_per_text)
96
+
97
+ loss = 0.5 * (loss_img + loss_text)
98
+ return loss
99
+
100
+ def vl_contrastive_loss(image_feat, text_feat, temperature=1):
101
+ # if image_id or text_id is None, it should be None across all GPUs
102
+ # image_feat = F.normalize(image_feat, dim=1)
103
+ # text_feat = F.normalize(text_feat, dim=1)
104
+ # handle normalization outside
105
+
106
+ # add the following 4 lines
107
+ image_feat = all_gather_grad(image_feat)
108
+ text_feat = all_gather_grad(text_feat)
109
+
110
+ logits = torch.matmul(image_feat, text_feat.t())
111
+ logit_scale = temperature.exp().clamp(max=100)
112
+
113
+ gt = torch.arange(logits.shape[0], device=logits.device)
114
+ loss1 = F.cross_entropy(logit_scale * logits, gt)
115
+ loss2 = F.cross_entropy(logit_scale * logits.t(), gt)
116
+ return (loss1 + loss2) / 2 # scale it up by the number of GPUs
117
+
118
+
119
+ def all_gather_pickle(data, device):
120
+ """
121
+ Run all_gather on arbitrary picklable data (not necessarily tensors)
122
+ Args:
123
+ data: any picklable object
124
+ Returns:
125
+ list[data]: list of data gathered from each rank
126
+ """
127
+ world_size = get_world_size()
128
+ if world_size == 1:
129
+ return [data]
130
+
131
+ # serialized to a Tensor
132
+ buffer = pickle.dumps(data)
133
+ storage = torch.ByteStorage.from_buffer(buffer)
134
+ tensor = torch.ByteTensor(storage).to(device)
135
+
136
+ # obtain Tensor size of each rank
137
+ local_size = torch.LongTensor([tensor.numel()]).cuda()
138
+ size_list = [torch.LongTensor([0]).cuda() for _ in range(world_size)]
139
+ dist.all_gather(size_list, local_size)
140
+ size_list = [int(size.item()) for size in size_list]
141
+ max_size = max(size_list)
142
+
143
+ # receiving Tensor from all ranks
144
+ # we pad the tensor because torch all_gather does not support
145
+ # gathering tensors of different shapes
146
+ tensor_list = []
147
+ for _ in size_list:
148
+ tensor_list.append(torch.ByteTensor(size=(max_size,)).cuda())
149
+ if local_size != max_size:
150
+ padding = torch.ByteTensor(size=(max_size - local_size,)).cuda()
151
+ tensor = torch.cat((tensor, padding), dim=0)
152
+ dist.all_gather(tensor_list, tensor)
153
+
154
+ data_list = []
155
+ for size, tensor in zip(size_list, tensor_list):
156
+ buffer = tensor.cpu().numpy().tobytes()[:size]
157
+ data_list.append(pickle.loads(buffer))
158
+
159
+ return data_list
160
+
161
+ def all_gather_arbitary_tensor(tensor):
162
+ if get_world_size() > 1:
163
+ device = tensor.device
164
+ tensor_batch = all_gather_pickle(tensor.cpu(), device)
165
+ tensor_batch = [x.to(device) for x in tensor_batch]
166
+ tensor_batch[torch.distributed.get_rank()] = tensor
167
+ tensor_batch = torch.cat(tensor_batch, dim=0)
168
+ else:
169
+ tensor_batch = tensor
170
+ return tensor_batch
171
+
172
+ def ql_contrastive_loss(image_feat, text_feat, temperature=1):
173
+ # add the following 4 lines
174
+ image_feat = all_gather_arbitary_tensor(image_feat)
175
+ text_feat = all_gather_arbitary_tensor(text_feat)
176
+
177
+ logits = torch.matmul(image_feat, text_feat.t())
178
+ logit_scale = temperature.exp().clamp(max=100)
179
+
180
+ gt = torch.arange(logits.shape[0], device=logits.device)
181
+ loss1 = F.cross_entropy(logit_scale * logits, gt)
182
+ loss2 = F.cross_entropy(logit_scale * logits.t(), gt)
183
+ return (loss1 + loss2) / 2 # scale it up by the number of GPUs
184
+
185
+ def vl_similarity(image_feat, text_feat, temperature=1):
186
+ # Only support single GPU for now.
187
+ logits = torch.matmul(image_feat, text_feat.t())
188
+ logits = temperature.exp().clamp(max=100) * logits
189
+ return logits
190
+
191
+ def ql_multi_contrastive_loss(image_feat, text_feat, text_hash, temperature=1):
192
+ # add the following 4 lines
193
+ image_feat = all_gather_arbitary_tensor(image_feat)
194
+ text_feat = all_gather_arbitary_tensor(text_feat)
195
+
196
+ text_hash_batch = all_gather_pickle(text_hash, text_feat.device)
197
+ text_hash_all = torch.cat(text_hash_batch)
198
+
199
+ text_hash_all_unique = torch.unique(text_hash_all).tolist()
200
+ gt = torch.zeros((image_feat.shape[0], len(text_hash_all_unique)), device=text_feat.device)
201
+ text_hash_all = text_hash_all.tolist()
202
+ text_feat_unique = torch.stack([text_feat[text_hash_all.index(txt)] for txt in text_hash_all_unique])
203
+
204
+ for idx, txt in enumerate(text_hash_all):
205
+ gt[idx][text_hash_all_unique.index(txt)] = 1
206
+
207
+ logits = torch.matmul(image_feat, text_feat_unique.t())
208
+ logits = logits*temperature.exp().clamp(max=100)
209
+
210
+ loss_img = soft_cross_entropy(logits, gt)
211
+ loss_text = soft_cross_entropy(logits.t(), gt.t() / gt.t().sum(-1, keepdim=True))
212
+
213
+ loss = 0.7 * loss_img + 0.3 * loss_text
214
+ return loss
215
+
216
+ def image_text_contrastive_loss_queue(image_feat_inp, text_feat_inp, lang_enc, training):
217
+ # add the following 4 lines
218
+ image_feat = all_gather_grad(image_feat_inp.contiguous())
219
+ text_feat = all_gather_grad(text_feat_inp.contiguous())
220
+
221
+ image_feat = image_feat / (image_feat.norm(dim=-1, keepdim=True) + 1e-7)
222
+ text_feat = text_feat / (text_feat.norm(dim=-1, keepdim=True) + 1e-7)
223
+
224
+ temperature = lang_enc.logit_scale
225
+ logits = torch.matmul(image_feat, text_feat.t())
226
+ logit_scale = temperature.exp().clamp(max=100)
227
+
228
+ gt = torch.arange(logits.shape[0], device=logits.device)
229
+ loss1 = F.cross_entropy(logit_scale * logits, gt)
230
+ loss2 = F.cross_entropy(logit_scale * logits.t(), gt)
231
+
232
+ return (loss1 + loss2) / 2 # scale it up by the number of GPUs
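As a single-process sanity check of the similarity helper above, a minimal sketch (shapes and the temperature value are illustrative; temperature is assumed to be a tensor holding a log-scale, as with lang_enc.logit_scale):

    import torch
    import torch.nn.functional as F
    from modeling.language.loss import vl_similarity

    image_feat = F.normalize(torch.randn(4, 512), dim=-1)   # e.g. 4 mask/region embeddings
    text_feat = F.normalize(torch.randn(7, 512), dim=-1)    # e.g. 7 class-text embeddings
    logits = vl_similarity(image_feat, text_feat, temperature=torch.tensor(2.0))
    print(logits.shape)   # torch.Size([4, 7]); row-wise argmax picks the best-matching text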
modeling/language/misc.py ADDED
@@ -0,0 +1,66 @@
1
+ import random
2
+
3
+ import torch
4
+ import nltk
5
+ import numpy as np
6
+
7
+ from utilities.constants import IMAGENET_DEFAULT_TEMPLATES
8
+
9
+ nltk.download('punkt', quiet=True)
10
+ nltk.download('averaged_perceptron_tagger', quiet=True)
11
+
12
+ def get_tag(tokenized, tags):
13
+ if not isinstance(tags, (list, tuple)):
14
+ tags = [tags]
15
+ ret = []
16
+ for (word, pos) in nltk.pos_tag(tokenized):
17
+ for tag in tags:
18
+ if pos == tag:
19
+ ret.append(word)
20
+ return ret
21
+
22
+ def get_noun_phrase(tokenized):
23
+ # Taken from Su Nam Kim Paper...
24
+ grammar = r"""
25
+ NBAR:
26
+ {<NN.*|JJ>*<NN.*>} # Nouns and Adjectives, terminated with Nouns
27
+
28
+ NP:
29
+ {<NBAR>}
30
+ {<NBAR><IN><NBAR>} # Above, connected with in/of/etc...
31
+ """
32
+ chunker = nltk.RegexpParser(grammar)
33
+
34
+ chunked = chunker.parse(nltk.pos_tag(tokenized))
35
+ continuous_chunk = []
36
+ current_chunk = []
37
+
38
+ for subtree in chunked:
39
+ if isinstance(subtree, nltk.Tree):
40
+ current_chunk.append(' '.join([token for token, pos in subtree.leaves()]))
41
+ elif current_chunk:
42
+ named_entity = ' '.join(current_chunk)
43
+ if named_entity not in continuous_chunk:
44
+ continuous_chunk.append(named_entity)
45
+ current_chunk = []
46
+ else:
47
+ continue
48
+
49
+ return continuous_chunk
50
+
51
+ def text_noun_with_prompt_all(text, phrase_prob=0.0, append_text=True):
52
+ tokenized = nltk.word_tokenize(text)
53
+
54
+ if random.random() >= phrase_prob:
55
+ nouns = get_tag(tokenized, ['NN', 'NNS', 'NNP'])
56
+ else:
57
+ nouns = get_noun_phrase(tokenized)
58
+
59
+
60
+ prompt_texts = [np.random.choice(IMAGENET_DEFAULT_TEMPLATES).format(noun) for noun in nouns]
61
+
62
+ if append_text:
63
+ prompt_texts += [text]
64
+ nouns += [text]
65
+
66
+ return prompt_texts, nouns
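A quick illustration of the prompting helper above (the exact outputs vary because the template is sampled at random from IMAGENET_DEFAULT_TEMPLATES and NLTK tagging is heuristic; the input sentence is an assumed example):

    from modeling.language.misc import text_noun_with_prompt_all

    prompts, nouns = text_noun_with_prompt_all('a CT scan of the liver with a small lesion', phrase_prob=0.0)
    # nouns   -> e.g. ['scan', 'liver', 'lesion', 'a CT scan of the liver with a small lesion']
    # prompts -> one randomly chosen template filled with each noun, plus the raw sentence appended last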
modeling/language/vlpencoder.py ADDED
@@ -0,0 +1,206 @@
1
+ # --------------------------------------------------------
2
+ # X-Decoder -- Generalized Decoding for Pixel, Image, and Language
3
+ # Copyright (c) 2022 Microsoft
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # Written by Xueyan Zou ([email protected])
6
+ # --------------------------------------------------------
7
+
8
+ import torch
9
+ from torch import nn
10
+ from torch.nn import functional as F
11
+
12
+ from timm.models.layers import trunc_normal_
13
+
14
+ from .build import register_model
15
+ from ..utils import configurable
16
+ from .LangEncoder import build_tokenizer, build_lang_encoder
17
+ from utilities.prompt_engineering import prompt_engineering, get_prompt_templates
18
+
19
+ from transformers import AutoTokenizer, AutoModel
20
+
21
+ class LanguageEncoder(nn.Module):
22
+
23
+ @configurable
24
+ def __init__(
25
+ self,
26
+ tokenizer,
27
+ tokenizer_type,
28
+ lang_encoder,
29
+ lang_projection,
30
+ max_token_num,
31
+ queue_operator,
32
+ ):
33
+ super().__init__()
34
+ # seg
35
+ self.tokenizer = tokenizer
36
+ self.tokenizer_type = tokenizer_type
37
+ self.lang_encoder = lang_encoder
38
+ self.lang_proj = lang_projection
39
+ self.max_token_num = max_token_num
40
+ self.logit_scale = nn.Parameter(torch.ones([]))
41
+
42
+ # captioning & retrieval
43
+ for key, value in queue_operator.items():
44
+ self.register_buffer(key, value)
45
+
46
+ self.biomed_encoder = AutoModel.from_pretrained("microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext")
47
+
48
+ @classmethod
49
+ def from_config(cls, cfg):
50
+ # build up text encoder for seg
51
+ tokenizer = build_tokenizer(cfg['MODEL']['TEXT'])
52
+ tokenizer_type = cfg['MODEL']['TEXT']['TOKENIZER']
53
+ lang_encoder = build_lang_encoder(cfg['MODEL']['TEXT'], tokenizer, cfg['VERBOSE'])
54
+ max_token_num = cfg['MODEL']['TEXT']['CONTEXT_LENGTH']
55
+
56
+ dim_lang = cfg['MODEL']['TEXT']['WIDTH']
57
+ dim_projection = cfg['MODEL']['DIM_PROJ']
58
+ lang_projection = nn.Parameter(torch.empty(dim_lang, dim_projection))
59
+ trunc_normal_(lang_projection, std=.02)
60
+
61
+ # tested not working better
62
+ queue_operator = {}
63
+
64
+ return {
65
+ "tokenizer": tokenizer,
66
+ "tokenizer_type": tokenizer_type,
67
+ "lang_encoder": lang_encoder,
68
+ "lang_projection": lang_projection,
69
+ "max_token_num": max_token_num,
70
+ "queue_operator": queue_operator,
71
+ }
72
+
73
+ def get_text_embeddings(self, class_names, name='default', is_eval=False, add_bgd=False, prompt=True, norm=True, store_buffer=None):
74
+ if not is_eval:
75
+ if prompt:
76
+ # randomly sample one template
77
+ arbitary_concepts = [
78
+ prompt_engineering(class_names[label].replace('-other','').replace('-merged','').replace('-stuff',''), topk=10000, suffix='.') \
79
+ for label in range(len(class_names))
80
+ ]
81
+ if add_bgd:
82
+ arbitary_concepts.append("A background in coco.")
83
+ else:
84
+ arbitary_concepts = class_names
85
+
86
+ input_ids = []
87
+ attention_masks = []
88
+ for txt in arbitary_concepts:
89
+ tokens = self.tokenizer(
90
+ txt, padding='max_length', truncation=True, max_length=self.max_token_num, return_tensors='pt'
91
+ )
92
+ tokens['input_ids'].squeeze_()
93
+ tokens['attention_mask'].squeeze_()
94
+
95
+ input_ids.append(tokens['input_ids'])
96
+ attention_masks.append(tokens['attention_mask'])
97
+
98
+ arbitary_tokens = torch.stack(input_ids)
99
+ arbitary_attention_masks = torch.stack(attention_masks)
100
+
101
+ text_emb = self.forward_language((arbitary_tokens.cuda(), arbitary_attention_masks.cuda()), norm=norm)
102
+ setattr(self, '{}_text_embeddings'.format(name), text_emb)
103
+ else:
104
+ with torch.no_grad():
105
+ def extract_mean_emb(txts):
106
+ tokens = self.tokenizer(
107
+ txts, padding='max_length', truncation=True, max_length=self.max_token_num, return_tensors='pt'
108
+ )
109
+ clss_embedding = self.forward_language((tokens['input_ids'].cuda(), tokens['attention_mask'].cuda()), norm=norm)
110
+ clss_embedding = clss_embedding.mean(dim=0)
111
+ clss_embedding /= clss_embedding.norm()
112
+ return clss_embedding
113
+
114
+ templates = get_prompt_templates()
115
+ clss_embeddings = []
116
+ if prompt:
117
+ for clss in class_names:
118
+ txts = [template.format(clss.replace('-other','').replace('-merged','').replace('-stuff','')) for template in templates]
119
+ clss_embeddings.append(extract_mean_emb(txts))
120
+ else:
121
+ for clss in class_names:
122
+ clss_embeddings.append(extract_mean_emb([clss]))
123
+
124
+ if add_bgd:
125
+ txts = ["A background in coco."]
126
+ clss_embeddings.append(extract_mean_emb(txts))
127
+
128
+ text_emb = torch.stack(clss_embeddings, dim=0)
129
+ setattr(self, '{}_text_embeddings'.format(name), text_emb)
130
+
131
+ def reset_text_embeddings(self, name='default'):
132
+ pass
133
+
134
+ def get_text_token_embeddings(self, txts, name='default', token=False, norm=False):
135
+ if not token:
136
+ tokens = self.tokenizer(
137
+ txts, padding='max_length', truncation=True, max_length=self.max_token_num, return_tensors='pt'
138
+ )
139
+ tokens = {key: value.cuda() for key, value in tokens.items()}
140
+ else:
141
+ tokens = txts
142
+ token_emb, class_emb = self.forward_language_token((tokens['input_ids'], tokens['attention_mask']), norm=norm)
143
+ ret = {"tokens": tokens,
144
+ "token_emb": token_emb,
145
+ "class_emb": class_emb,}
146
+ setattr(self, '{}_token_embeddings'.format(name), ret)
147
+ return ret
148
+
149
+ def forward_language(self, texts, norm=True):
150
+ if self.tokenizer_type == 'biomed-clip':
151
+ with torch.no_grad(): # Disable gradient calculation
152
+ outputs = self.biomed_encoder(*texts)
153
+ # Extract the last hidden state
154
+ x = outputs['last_hidden_state']
155
+ x = x[:, 0] # Get the [CLS] token's embeddings for all examples
156
+ else:
157
+ x = self.lang_encoder(*texts)
158
+ x = x['last_hidden_state']
159
+
160
+ if self.tokenizer_type == 'clip':
161
+ x = x[torch.arange(x.size(0)), texts[0].argmax(dim=-1)]
162
+ else:
163
+ x = x[:, 0]
164
+
165
+ x = x @ self.lang_proj
166
+ if norm:
167
+ x = x / (x.norm(dim=-1, keepdim=True) + 1e-7)
168
+ return x
169
+
170
+ def forward_language_token(self, texts, norm=False):
171
+ if self.tokenizer_type == 'biomed-clip':
172
+ with torch.no_grad(): # Disable gradient calculation
173
+ outputs = self.biomed_encoder(*texts)
174
+ # Extract the last hidden state
175
+ token_x = outputs['last_hidden_state']
176
+ class_x = token_x[:, 0] # Get the [CLS] token's embeddings for all examples
177
+ else:
178
+ x = self.lang_encoder(*texts)
179
+ token_x = x['last_hidden_state']
180
+
181
+ if self.tokenizer_type == 'clip':
182
+ class_x = token_x[torch.arange(token_x.size(0)), texts[0].argmax(dim=-1)]
183
+ else:
184
+ class_x = token_x[:, 0]
185
+
186
+ class_x = class_x @ self.lang_proj
187
+ token_x = token_x @ self.lang_proj
188
+
189
+ if norm:
190
+ class_x = class_x / (class_x.norm(dim=-1, keepdim=True) + 1e-7)
191
+ token_x = token_x / (token_x.norm(dim=-1, keepdim=True) + 1e-7)
192
+
193
+ return token_x, class_x
194
+
195
+ def compute_similarity(self, v_emb, name='default', fake=False):
196
+ if fake:
197
+ return None
198
+ v_emb = v_emb / (v_emb.norm(dim=-1, keepdim=True) + 1e-7)
199
+ t_emb = getattr(self, '{}_text_embeddings'.format(name))
200
+ output = self.logit_scale.exp() * v_emb @ t_emb.unsqueeze(0).transpose(1, 2)
201
+ return output
202
+
203
+
204
+ @register_model
205
+ def get_language_model(cfg, **kwargs):
206
+ return LanguageEncoder(cfg)
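To make the buffer naming and similarity path above concrete, a hedged sketch of typical use (the class list, DIM_PROJ = 512 and the query count are assumptions; a CUDA device is required because get_text_embeddings moves tokens with .cuda()):

    import torch
    from modeling.language import build_language_encoder

    encoder = build_language_encoder(cfg).cuda().eval()   # cfg as consumed by from_config above

    # registers encoder.default_text_embeddings with shape [num_classes, DIM_PROJ]
    encoder.get_text_embeddings(['tumor', 'liver', 'kidney'], name='default', is_eval=True)

    v_emb = torch.randn(1, 101, 512).cuda()               # e.g. per-query mask embeddings
    logits = encoder.compute_similarity(v_emb, name='default')   # shape [1, 101, 3]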
modeling/modules/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ from .point_features import *
2
+ from .position_encoding import *
3
+ from .postprocessing import *
4
+ from .attention import *
5
+ from .criterion import *
6
+ from .matcher import *
modeling/modules/attention.py ADDED
@@ -0,0 +1,487 @@
1
+ import warnings
2
+ from typing import Optional, Tuple
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ from torch import Tensor
7
+ from torch.nn.init import constant_, xavier_normal_, xavier_uniform_
8
+ from torch.nn.parameter import Parameter
9
+ from torch.overrides import has_torch_function, handle_torch_function
10
+ from torch.nn.functional import pad, linear, softmax, dropout
11
+
12
+
13
+ def multi_head_attention_forward(
14
+ query: Tensor,
15
+ key: Tensor,
16
+ value: Tensor,
17
+ embed_dim_to_check: int,
18
+ num_heads: int,
19
+ in_proj_weight: Tensor,
20
+ in_proj_bias: Tensor,
21
+ bias_k: Optional[Tensor],
22
+ bias_v: Optional[Tensor],
23
+ add_zero_attn: bool,
24
+ dropout_p: float,
25
+ out_proj_weight: Tensor,
26
+ out_proj_bias: Tensor,
27
+ training: bool = True,
28
+ key_padding_mask: Optional[Tensor] = None,
29
+ need_weights: bool = True,
30
+ attn_mask: Optional[Tensor] = None,
31
+ use_separate_proj_weight: bool = False,
32
+ q_proj_weight: Optional[Tensor] = None,
33
+ k_proj_weight: Optional[Tensor] = None,
34
+ v_proj_weight: Optional[Tensor] = None,
35
+ static_k: Optional[Tensor] = None,
36
+ static_v: Optional[Tensor] = None,
37
+ ) -> Tuple[Tensor, Optional[Tensor]]:
38
+ r"""
39
+ Args:
40
+ query, key, value: map a query and a set of key-value pairs to an output.
41
+ See "Attention Is All You Need" for more details.
42
+ embed_dim_to_check: total dimension of the model.
43
+ num_heads: parallel attention heads.
44
+ in_proj_weight, in_proj_bias: input projection weight and bias.
45
+ bias_k, bias_v: bias of the key and value sequences to be added at dim=0.
46
+ add_zero_attn: add a new batch of zeros to the key and
47
+ value sequences at dim=1.
48
+ dropout_p: probability of an element to be zeroed.
49
+ out_proj_weight, out_proj_bias: the output projection weight and bias.
50
+ training: apply dropout if is ``True``.
51
+ key_padding_mask: if provided, specified padding elements in the key will
52
+ be ignored by the attention. This is a binary mask. When the value is True,
53
+ the corresponding value on the attention layer will be filled with -inf.
54
+ need_weights: output attn_output_weights.
55
+ attn_mask: 2D or 3D mask that prevents attention to certain positions. A 2D mask will be broadcasted for all
56
+ the batches while a 3D mask allows to specify a different mask for the entries of each batch.
57
+ use_separate_proj_weight: the function accepts the proj. weights for query, key,
58
+ and value in different forms. If false, in_proj_weight will be used, which is
59
+ a combination of q_proj_weight, k_proj_weight, v_proj_weight.
60
+ q_proj_weight, k_proj_weight, v_proj_weight, in_proj_bias: input projection weight and bias.
61
+ static_k, static_v: static key and value used for attention operators.
62
+
63
+
64
+ Shape:
65
+ Inputs:
66
+ - query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is
67
+ the embedding dimension.
68
+ - key: :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is
69
+ the embedding dimension.
70
+ - value: :math:`(S, N, E)` where S is the source sequence length, N is the batch size, E is
71
+ the embedding dimension.
72
+ - key_padding_mask: :math:`(N, S)` where N is the batch size, S is the source sequence length.
73
+ If a ByteTensor is provided, the non-zero positions will be ignored while the zero positions
74
+ will be unchanged. If a BoolTensor is provided, the positions with the
75
+ value of ``True`` will be ignored while the position with the value of ``False`` will be unchanged.
76
+ - attn_mask: 2D mask :math:`(L, S)` where L is the target sequence length, S is the source sequence length.
77
+ 3D mask :math:`(N*num_heads, L, S)` where N is the batch size, L is the target sequence length,
78
+ S is the source sequence length. attn_mask ensures that position i is allowed to attend the unmasked
79
+ positions. If a ByteTensor is provided, the non-zero positions are not allowed to attend
80
+ while the zero positions will be unchanged. If a BoolTensor is provided, positions with ``True``
81
+ are not allowed to attend while ``False`` values will be unchanged. If a FloatTensor
82
+ is provided, it will be added to the attention weight.
83
+ - static_k: :math:`(N*num_heads, S, E/num_heads)`, where S is the source sequence length,
84
+ N is the batch size, E is the embedding dimension. E/num_heads is the head dimension.
85
+ - static_v: :math:`(N*num_heads, S, E/num_heads)`, where S is the source sequence length,
86
+ N is the batch size, E is the embedding dimension. E/num_heads is the head dimension.
87
+
88
+ Outputs:
89
+ - attn_output: :math:`(L, N, E)` where L is the target sequence length, N is the batch size,
90
+ E is the embedding dimension.
91
+ - attn_output_weights: :math:`(N, L, S)` where N is the batch size,
92
+ L is the target sequence length, S is the source sequence length.
93
+ """
94
+ tens_ops = (query, key, value, in_proj_weight, in_proj_bias, bias_k, bias_v, out_proj_weight, out_proj_bias)
95
+ if has_torch_function(tens_ops):
96
+ return handle_torch_function(
97
+ multi_head_attention_forward,
98
+ tens_ops,
99
+ query,
100
+ key,
101
+ value,
102
+ embed_dim_to_check,
103
+ num_heads,
104
+ in_proj_weight,
105
+ in_proj_bias,
106
+ bias_k,
107
+ bias_v,
108
+ add_zero_attn,
109
+ dropout_p,
110
+ out_proj_weight,
111
+ out_proj_bias,
112
+ training=training,
113
+ key_padding_mask=key_padding_mask,
114
+ need_weights=need_weights,
115
+ attn_mask=attn_mask,
116
+ use_separate_proj_weight=use_separate_proj_weight,
117
+ q_proj_weight=q_proj_weight,
118
+ k_proj_weight=k_proj_weight,
119
+ v_proj_weight=v_proj_weight,
120
+ static_k=static_k,
121
+ static_v=static_v,
122
+ )
123
+ tgt_len, bsz, embed_dim = query.size()
124
+ assert embed_dim == embed_dim_to_check
125
+ # allow MHA to have different sizes for the feature dimension
126
+ assert key.size(0) == value.size(0) and key.size(1) == value.size(1)
127
+
128
+ head_dim = embed_dim // num_heads
129
+ assert head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads"
130
+ scaling = float(head_dim) ** -0.5
131
+
132
+ if not use_separate_proj_weight:
133
+ if (query is key or torch.equal(query, key)) and (key is value or torch.equal(key, value)):
134
+ # self-attention
135
+ q, k, v = linear(query, in_proj_weight, in_proj_bias).chunk(3, dim=-1)
136
+
137
+ elif key is value or torch.equal(key, value):
138
+ # encoder-decoder attention
139
+ # This is inline in_proj function with in_proj_weight and in_proj_bias
140
+ _b = in_proj_bias
141
+ _start = 0
142
+ _end = embed_dim
143
+ _w = in_proj_weight[_start:_end, :]
144
+ if _b is not None:
145
+ _b = _b[_start:_end]
146
+ q = linear(query, _w, _b)
147
+
148
+ if key is None:
149
+ assert value is None
150
+ k = None
151
+ v = None
152
+ else:
153
+
154
+ # This is inline in_proj function with in_proj_weight and in_proj_bias
155
+ _b = in_proj_bias
156
+ _start = embed_dim
157
+ _end = None
158
+ _w = in_proj_weight[_start:, :]
159
+ if _b is not None:
160
+ _b = _b[_start:]
161
+ k, v = linear(key, _w, _b).chunk(2, dim=-1)
162
+
163
+ else:
164
+ # This is inline in_proj function with in_proj_weight and in_proj_bias
165
+ _b = in_proj_bias
166
+ _start = 0
167
+ _end = embed_dim
168
+ _w = in_proj_weight[_start:_end, :]
169
+ if _b is not None:
170
+ _b = _b[_start:_end]
171
+ q = linear(query, _w, _b)
172
+
173
+ # This is inline in_proj function with in_proj_weight and in_proj_bias
174
+ _b = in_proj_bias
175
+ _start = embed_dim
176
+ _end = embed_dim * 2
177
+ _w = in_proj_weight[_start:_end, :]
178
+ if _b is not None:
179
+ _b = _b[_start:_end]
180
+ k = linear(key, _w, _b)
181
+
182
+ # This is inline in_proj function with in_proj_weight and in_proj_bias
183
+ _b = in_proj_bias
184
+ _start = embed_dim * 2
185
+ _end = None
186
+ _w = in_proj_weight[_start:, :]
187
+ if _b is not None:
188
+ _b = _b[_start:]
189
+ v = linear(value, _w, _b)
190
+ else:
191
+ q_proj_weight_non_opt = torch.jit._unwrap_optional(q_proj_weight)
192
+ len1, len2 = q_proj_weight_non_opt.size()
193
+ assert len1 == embed_dim and len2 == query.size(-1)
194
+
195
+ k_proj_weight_non_opt = torch.jit._unwrap_optional(k_proj_weight)
196
+ len1, len2 = k_proj_weight_non_opt.size()
197
+ assert len1 == embed_dim and len2 == key.size(-1)
198
+
199
+ v_proj_weight_non_opt = torch.jit._unwrap_optional(v_proj_weight)
200
+ len1, len2 = v_proj_weight_non_opt.size()
201
+ assert len1 == embed_dim and len2 == value.size(-1)
202
+
203
+ if in_proj_bias is not None:
204
+ q = linear(query, q_proj_weight_non_opt, in_proj_bias[0:embed_dim])
205
+ k = linear(key, k_proj_weight_non_opt, in_proj_bias[embed_dim : (embed_dim * 2)])
206
+ v = linear(value, v_proj_weight_non_opt, in_proj_bias[(embed_dim * 2) :])
207
+ else:
208
+ q = linear(query, q_proj_weight_non_opt, in_proj_bias)
209
+ k = linear(key, k_proj_weight_non_opt, in_proj_bias)
210
+ v = linear(value, v_proj_weight_non_opt, in_proj_bias)
211
+ q = q * scaling
212
+
213
+ if attn_mask is not None:
214
+ assert (
215
+ attn_mask.dtype == torch.float32
216
+ or attn_mask.dtype == torch.float64
217
+ or attn_mask.dtype == torch.float16
218
+ or attn_mask.dtype == torch.uint8
219
+ or attn_mask.dtype == torch.bool
220
+ ), "Only float, byte, and bool types are supported for attn_mask, not {}".format(attn_mask.dtype)
221
+ if attn_mask.dtype == torch.uint8:
222
+ warnings.warn("Byte tensor for attn_mask in nn.MultiheadAttention is deprecated. Use bool tensor instead.")
223
+ attn_mask = attn_mask.to(torch.bool)
224
+
225
+ if attn_mask.dim() == 2:
226
+ attn_mask = attn_mask.unsqueeze(0)
227
+ if list(attn_mask.size()) != [1, query.size(0), key.size(0)]:
228
+ raise RuntimeError("The size of the 2D attn_mask is not correct.")
229
+ elif attn_mask.dim() == 3:
230
+ if list(attn_mask.size()) != [bsz * num_heads, query.size(0), key.size(0)]:
231
+ raise RuntimeError("The size of the 3D attn_mask is not correct.")
232
+ else:
233
+ raise RuntimeError("attn_mask's dimension {} is not supported".format(attn_mask.dim()))
234
+ # attn_mask's dim is 3 now.
235
+
236
+ # convert ByteTensor key_padding_mask to bool
237
+ if key_padding_mask is not None and key_padding_mask.dtype == torch.uint8:
238
+ warnings.warn(
239
+ "Byte tensor for key_padding_mask in nn.MultiheadAttention is deprecated. Use bool tensor instead."
240
+ )
241
+ key_padding_mask = key_padding_mask.to(torch.bool)
242
+
243
+ if bias_k is not None and bias_v is not None:
244
+ if static_k is None and static_v is None:
245
+ k = torch.cat([k, bias_k.repeat(1, bsz, 1)])
246
+ v = torch.cat([v, bias_v.repeat(1, bsz, 1)])
247
+ if attn_mask is not None:
248
+ attn_mask = pad(attn_mask, (0, 1))
249
+ if key_padding_mask is not None:
250
+ key_padding_mask = pad(key_padding_mask, (0, 1))
251
+ else:
252
+ assert static_k is None, "bias cannot be added to static key."
253
+ assert static_v is None, "bias cannot be added to static value."
254
+ else:
255
+ assert bias_k is None
256
+ assert bias_v is None
257
+
258
+ q = q.contiguous().view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1)
259
+ if k is not None:
260
+ k = k.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1)
261
+ if v is not None:
262
+ v = v.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1)
263
+
264
+ if static_k is not None:
265
+ assert static_k.size(0) == bsz * num_heads
266
+ assert static_k.size(2) == head_dim
267
+ k = static_k
268
+
269
+ if static_v is not None:
270
+ assert static_v.size(0) == bsz * num_heads
271
+ assert static_v.size(2) == head_dim
272
+ v = static_v
273
+
274
+ src_len = k.size(1)
275
+
276
+ if key_padding_mask is not None:
277
+ # assert key_padding_mask.size(0) == bsz
278
+ assert key_padding_mask.size(1) == src_len
279
+
280
+ if add_zero_attn:
281
+ src_len += 1
282
+ k = torch.cat([k, torch.zeros((k.size(0), 1) + k.size()[2:], dtype=k.dtype, device=k.device)], dim=1)
283
+ v = torch.cat([v, torch.zeros((v.size(0), 1) + v.size()[2:], dtype=v.dtype, device=v.device)], dim=1)
284
+ if attn_mask is not None:
285
+ attn_mask = pad(attn_mask, (0, 1))
286
+ if key_padding_mask is not None:
287
+ key_padding_mask = pad(key_padding_mask, (0, 1))
288
+
289
+ attn_output_weights = torch.bmm(q, k.transpose(1, 2))
290
+ assert list(attn_output_weights.size()) == [bsz * num_heads, tgt_len, src_len]
291
+
292
+ if attn_mask is not None:
293
+ if attn_mask.dtype == torch.bool:
294
+ attn_output_weights.masked_fill_(attn_mask, float("-inf"))
295
+ else:
296
+ attn_output_weights += attn_mask
297
+
298
+ if key_padding_mask is not None:
299
+ attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)
300
+ attn_output_weights = attn_output_weights.masked_fill(
301
+ key_padding_mask.unsqueeze(1),
302
+ float("-inf"),
303
+ )
304
+ attn_output_weights = attn_output_weights.view(bsz * num_heads, tgt_len, src_len)
305
+
306
+ attn_output_weights = softmax(attn_output_weights, dim=-1)
307
+ attn_output_weights = dropout(attn_output_weights, p=dropout_p, training=training)
308
+
309
+ attn_output = torch.bmm(attn_output_weights, v)
310
+ assert list(attn_output.size()) == [bsz * num_heads, tgt_len, head_dim]
311
+ attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
312
+ attn_output = linear(attn_output, out_proj_weight, out_proj_bias)
313
+
314
+ if need_weights:
315
+ # average attention weights over heads
316
+ attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)
317
+ return attn_output, attn_output_weights.sum(dim=1) / num_heads
318
+ else:
319
+ return attn_output, None
320
+
321
+
322
+ # This class exists solely for Transformer; it has an annotation stating
323
+ # that bias is never None, which appeases TorchScript
324
+ class _LinearWithBias(nn.Linear):
325
+ bias: Tensor # type: ignore
326
+
327
+ def __init__(self, in_features: int, out_features: int) -> None:
328
+ super().__init__(in_features, out_features, bias=True) # type: ignore
329
+
330
+
331
+ class MultiheadAttention(nn.Module):
332
+ r"""Allows the model to jointly attend to information
333
+ from different representation subspaces.
334
+ See `Attention Is All You Need <https://arxiv.org/abs/1706.03762>`_
335
+
336
+ .. math::
337
+ \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O
338
+
339
+ where :math:`head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)`.
340
+
341
+ Args:
342
+ embed_dim: total dimension of the model.
343
+ num_heads: parallel attention heads.
344
+ dropout: a Dropout layer on attn_output_weights. Default: 0.0.
345
+ bias: add bias as module parameter. Default: True.
346
+ add_bias_kv: add bias to the key and value sequences at dim=0.
347
+ add_zero_attn: add a new batch of zeros to the key and
348
+ value sequences at dim=1.
349
+ kdim: total number of features in key. Default: None.
350
+ vdim: total number of features in value. Default: None.
351
+
352
+ Note that if :attr:`kdim` and :attr:`vdim` are None, they will be set
353
+ to :attr:`embed_dim` such that query, key, and value have the same
354
+ number of features.
355
+
356
+ Examples::
357
+
358
+ >>> multihead_attn = nn.MultiheadAttention(embed_dim, num_heads)
359
+ >>> attn_output, attn_output_weights = multihead_attn(query, key, value)
360
+ """
361
+ bias_k: Optional[torch.Tensor]
362
+ bias_v: Optional[torch.Tensor]
363
+
364
+ def __init__(self, embed_dim, num_heads, dropout=0., bias=True, add_bias_kv=False, add_zero_attn=False, kdim=None, vdim=None):
365
+ super(MultiheadAttention, self).__init__()
366
+ self.embed_dim = embed_dim
367
+ self.kdim = kdim if kdim is not None else embed_dim
368
+ self.vdim = vdim if vdim is not None else embed_dim
369
+ self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim
370
+
371
+ self.num_heads = num_heads
372
+ self.dropout = dropout
373
+ self.head_dim = embed_dim // num_heads
374
+ assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
375
+
376
+ if self._qkv_same_embed_dim is False:
377
+ self.q_proj_weight = Parameter(torch.Tensor(embed_dim, embed_dim))
378
+ self.k_proj_weight = Parameter(torch.Tensor(embed_dim, self.kdim))
379
+ self.v_proj_weight = Parameter(torch.Tensor(embed_dim, self.vdim))
380
+ self.register_parameter('in_proj_weight', None)
381
+ else:
382
+ self.in_proj_weight = Parameter(torch.empty(3 * embed_dim, embed_dim))
383
+ self.register_parameter('q_proj_weight', None)
384
+ self.register_parameter('k_proj_weight', None)
385
+ self.register_parameter('v_proj_weight', None)
386
+
387
+ if bias:
388
+ self.in_proj_bias = Parameter(torch.empty(3 * embed_dim))
389
+ else:
390
+ self.register_parameter('in_proj_bias', None)
391
+ self.out_proj = _LinearWithBias(embed_dim, embed_dim)
392
+
393
+ if add_bias_kv:
394
+ self.bias_k = Parameter(torch.empty(1, 1, embed_dim))
395
+ self.bias_v = Parameter(torch.empty(1, 1, embed_dim))
396
+ else:
397
+ self.bias_k = self.bias_v = None
398
+
399
+ self.add_zero_attn = add_zero_attn
400
+
401
+ self._reset_parameters()
402
+
403
+ def _reset_parameters(self):
404
+ if self._qkv_same_embed_dim:
405
+ xavier_uniform_(self.in_proj_weight)
406
+ else:
407
+ xavier_uniform_(self.q_proj_weight)
408
+ xavier_uniform_(self.k_proj_weight)
409
+ xavier_uniform_(self.v_proj_weight)
410
+
411
+ if self.in_proj_bias is not None:
412
+ constant_(self.in_proj_bias, 0.)
413
+ constant_(self.out_proj.bias, 0.)
414
+ if self.bias_k is not None:
415
+ xavier_normal_(self.bias_k)
416
+ if self.bias_v is not None:
417
+ xavier_normal_(self.bias_v)
418
+
419
+ def __setstate__(self, state):
420
+ # Support loading old MultiheadAttention checkpoints generated by v1.1.0
421
+ if '_qkv_same_embed_dim' not in state:
422
+ state['_qkv_same_embed_dim'] = True
423
+
424
+ super(MultiheadAttention, self).__setstate__(state)
425
+
426
+ def forward(self, query: Tensor, key: Tensor, value: Tensor, key_padding_mask: Optional[Tensor] = None,
427
+ need_weights: bool = True, attn_mask: Optional[Tensor] = None) -> Tuple[Tensor, Optional[Tensor]]:
428
+ r"""
429
+ Args:
430
+ query, key, value: map a query and a set of key-value pairs to an output.
431
+ See "Attention Is All You Need" for more details.
432
+ key_padding_mask: if provided, specified padding elements in the key will
433
+ be ignored by the attention. When given a binary mask and a value is True,
434
+ the corresponding value on the attention layer will be ignored. When given
435
+ a byte mask and a value is non-zero, the corresponding value on the attention
436
+ layer will be ignored
437
+ need_weights: output attn_output_weights.
438
+ attn_mask: 2D or 3D mask that prevents attention to certain positions. A 2D mask will be broadcasted for all
439
+ the batches while a 3D mask allows to specify a different mask for the entries of each batch.
440
+
441
+ Shapes for inputs:
442
+ - query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is
443
+ the embedding dimension.
444
+ - key: :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is
445
+ the embedding dimension.
446
+ - value: :math:`(S, N, E)` where S is the source sequence length, N is the batch size, E is
447
+ the embedding dimension.
448
+ - key_padding_mask: :math:`(N, S)` where N is the batch size, S is the source sequence length.
449
+ If a ByteTensor is provided, the non-zero positions will be ignored while the zero
450
+ positions will be unchanged. If a BoolTensor is provided, the positions with the
451
+ value of ``True`` will be ignored while the positions with the value of ``False`` will be unchanged.
452
+ - attn_mask: if a 2D mask: :math:`(L, S)` where L is the target sequence length, S is the
453
+ source sequence length.
454
+
455
+ If a 3D mask: :math:`(N\cdot\text{num\_heads}, L, S)` where N is the batch size, L is the target sequence
456
+ length, S is the source sequence length. ``attn_mask`` ensures that position i is allowed to attend
457
+ the unmasked positions. If a ByteTensor is provided, the non-zero positions are not allowed to attend
458
+ while the zero positions will be unchanged. If a BoolTensor is provided, positions with ``True``
459
+ are not allowed to attend while ``False`` values will be unchanged. If a FloatTensor
460
+ is provided, it will be added to the attention weight.
461
+
462
+ Shapes for outputs:
463
+ - attn_output: :math:`(L, N, E)` where L is the target sequence length, N is the batch size,
464
+ E is the embedding dimension.
465
+ - attn_output_weights: :math:`(N, L, S)` where N is the batch size,
466
+ L is the target sequence length, S is the source sequence length.
467
+ """
468
+ if not self._qkv_same_embed_dim:
469
+ return multi_head_attention_forward(
470
+ query, key, value, self.embed_dim, self.num_heads,
471
+ self.in_proj_weight, self.in_proj_bias,
472
+ self.bias_k, self.bias_v, self.add_zero_attn,
473
+ self.dropout, self.out_proj.weight, self.out_proj.bias,
474
+ training=self.training,
475
+ key_padding_mask=key_padding_mask, need_weights=need_weights,
476
+ attn_mask=attn_mask, use_separate_proj_weight=True,
477
+ q_proj_weight=self.q_proj_weight, k_proj_weight=self.k_proj_weight,
478
+ v_proj_weight=self.v_proj_weight)
479
+ else:
480
+ return multi_head_attention_forward(
481
+ query, key, value, self.embed_dim, self.num_heads,
482
+ self.in_proj_weight, self.in_proj_bias,
483
+ self.bias_k, self.bias_v, self.add_zero_attn,
484
+ self.dropout, self.out_proj.weight, self.out_proj.bias,
485
+ training=self.training,
486
+ key_padding_mask=key_padding_mask, need_weights=need_weights,
487
+ attn_mask=attn_mask)
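
Note: the MultiheadAttention class above reproduces the torch.nn.MultiheadAttention interface with a sequence-first (L, N, E) layout. A minimal usage sketch follows; it is not part of the committed diff, the import path is assumed from this commit's file layout, and the sizes are illustrative only:

    import torch
    from modeling.modules.attention import MultiheadAttention  # assumed import path

    L, S, N, E, H = 6, 10, 2, 64, 8            # target len, source len, batch, embed dim, heads
    mha = MultiheadAttention(embed_dim=E, num_heads=H, dropout=0.1)

    query = torch.randn(L, N, E)
    key = torch.randn(S, N, E)
    value = torch.randn(S, N, E)
    key_padding_mask = torch.zeros(N, S, dtype=torch.bool)   # True would mark padded key positions

    attn_output, attn_weights = mha(query, key, value, key_padding_mask=key_padding_mask)
    assert attn_output.shape == (L, N, E)      # same layout as the query
    assert attn_weights.shape == (N, L, S)     # head-averaged because need_weights defaults to True

As in the stock PyTorch module, a float attn_mask is added to the attention logits, while a bool mask marks positions that are not allowed to attend.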
modeling/modules/criterion.py ADDED
@@ -0,0 +1,874 @@
1
+ # --------------------------------------------------------
2
+ # X-Decoder -- Generalized Decoding for Pixel, Image, and Language
3
+ # Copyright (c) 2022 Microsoft
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # Modified by Xueyan Zou ([email protected])
6
+ # --------------------------------------------------------
7
+
8
+ # Copyright (c) Facebook, Inc. and its affiliates.
9
+ # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/models/detr.py
10
+ """
11
+ MaskFormer criterion.
12
+ """
13
+ import logging
14
+
15
+ import torch
16
+ import torch.nn.functional as F
17
+ from torch import nn
18
+
19
+ from detectron2.utils.comm import get_world_size
20
+ from timm.loss import SoftTargetCrossEntropy
21
+ from .point_features import (
22
+ get_uncertain_point_coords_with_randomness,
23
+ point_sample,
24
+ )
25
+
26
+ from ..language.loss import ql_multi_contrastive_loss, image_text_contrastive_loss_queue, vl_similarity, all_gather_grad
27
+ from ..utils.misc import is_dist_avail_and_initialized, nested_tensor_from_tensor_list, _max_by_axis
28
+ from ..utils import box_ops
29
+
30
+ # from image2html.visualizer import VL
31
+
32
+
33
+ def dice_loss(
34
+ inputs: torch.Tensor,
35
+ targets: torch.Tensor,
36
+ num_masks: float,
37
+ ):
38
+ """
39
+ Compute the DICE loss, similar to generalized IOU for masks
40
+ Args:
41
+ inputs: A float tensor of arbitrary shape.
42
+ The predictions for each example.
43
+ targets: A float tensor with the same shape as inputs. Stores the binary
44
+ classification label for each element in inputs
45
+ (0 for the negative class and 1 for the positive class).
46
+ """
47
+ inputs = inputs.sigmoid()
48
+ inputs = inputs.flatten(1)
49
+ numerator = 2 * (inputs * targets).sum(-1)
50
+ denominator = inputs.sum(-1) + targets.sum(-1)
51
+ loss = 1 - (numerator + 1) / (denominator + 1)
52
+ return loss.sum() / num_masks
53
+
54
+
55
+ dice_loss_jit = torch.jit.script(
56
+ dice_loss
57
+ ) # type: torch.jit.ScriptModule
58
+
59
+
60
+ def sigmoid_ce_loss(
61
+ inputs: torch.Tensor,
62
+ targets: torch.Tensor,
63
+ num_masks: float,
64
+ ):
65
+ """
66
+ Args:
67
+ inputs: A float tensor of arbitrary shape.
68
+ The predictions for each example.
69
+ targets: A float tensor with the same shape as inputs. Stores the binary
70
+ classification label for each element in inputs
71
+ (0 for the negative class and 1 for the positive class).
72
+ Returns:
73
+ Loss tensor
74
+ """
75
+ loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
76
+
77
+ return loss.mean(1).sum() / num_masks
78
+
79
+
80
+ sigmoid_ce_loss_jit = torch.jit.script(
81
+ sigmoid_ce_loss
82
+ ) # type: torch.jit.ScriptModule
83
+
84
+
85
+ def calculate_uncertainty(logits):
86
+ """
87
+ We estimate uncertainty as L1 distance between 0.0 and the logit prediction in 'logits' for the
88
+ foreground class in `classes`.
89
+ Args:
90
+ logits (Tensor): A tensor of shape (R, 1, ...) for class-specific or
91
+ class-agnostic, where R is the total number of predicted masks in all images.
92
+ The values are logits.
93
+ Returns:
94
+ scores (Tensor): A tensor of shape (R, 1, ...) that contains uncertainty scores with
95
+ the most uncertain locations having the highest uncertainty score.
96
+ """
97
+ assert logits.shape[1] == 1
98
+ gt_class_logits = logits.clone()
99
+ return -(torch.abs(gt_class_logits))
100
+
101
+
102
+ class SetCriterion(nn.Module):
103
+ """This class computes the loss for DETR.
104
+ The process happens in two steps:
105
+ 1) we compute hungarian assignment between ground truth boxes and the outputs of the model
106
+ 2) we supervise each pair of matched ground-truth / prediction (supervise class and box)
107
+ """
108
+
109
+ def __init__(self, num_classes, matcher, weight_dict, eos_coef, top_x_layers, losses,
110
+ num_points, oversample_ratio, importance_sample_ratio, grounding_weight):
111
+ """Create the criterion.
112
+ Parameters:
113
+ num_classes: number of object categories, omitting the special no-object category
114
+ matcher: module able to compute a matching between targets and proposals
115
+ weight_dict: dict containing as key the names of the losses and as values their relative weight.
116
+ eos_coef: relative classification weight applied to the no-object category
117
+ losses: list of all the losses to be applied. See get_loss for list of available losses.
118
+ """
119
+ super().__init__()
120
+ self.num_classes = num_classes
121
+ self.matcher = matcher
122
+ self.weight_dict = weight_dict
123
+ self.eos_coef = eos_coef
124
+ self.top_x_layers = top_x_layers
125
+ self.losses = losses
126
+ empty_weight = torch.ones(self.num_classes + 1)
127
+ empty_weight[-1] = self.eos_coef
128
+ self.register_buffer("empty_weight", empty_weight)
129
+
130
+ # pointwise mask loss parameters
131
+ self.num_points = num_points
132
+ self.oversample_ratio = oversample_ratio
133
+ self.importance_sample_ratio = importance_sample_ratio
134
+
135
+ # grounding
136
+ self.grounding_weight = grounding_weight
137
+
138
+ def loss_labels(self, outputs, targets, indices, num_masks, layer_id, extra):
139
+ """Classification loss (NLL)
140
+ targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes]
141
+ """
142
+ if layer_id > self.top_x_layers['mask']:
143
+ return {"loss_mask_ce_0": 0}
144
+
145
+ if indices is None or len(targets) == 0:
146
+ loss_ce = outputs['pred_logits'].sum() * 0.0
147
+ losses = {"loss_mask_ce_0": loss_ce}
148
+ return losses
149
+
150
+ assert "pred_logits" in outputs
151
+ src_logits = outputs["pred_logits"].type(self.empty_weight.dtype)
152
+
153
+ idx = self._get_src_permutation_idx(indices)
154
+ target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)])
155
+ target_classes = torch.full(
156
+ src_logits.shape[:2], self.num_classes, dtype=torch.int64, device=src_logits.device
157
+ )
158
+ target_classes[idx] = target_classes_o
159
+
160
+ if src_logits.shape[2] == self.num_classes+1:
161
+ empty_weight = torch.ones(self.num_classes + 1).to(src_logits.device).type(self.empty_weight.dtype)
162
+ empty_weight[-1] = self.eos_coef
163
+ else:
164
+ empty_weight = torch.ones(self.num_classes + 1000 + 1).to(src_logits.device).type(self.empty_weight.dtype)
165
+ empty_weight[self.num_classes] = self.eos_coef
166
+ loss_ce = F.cross_entropy(src_logits.transpose(1, 2), target_classes, empty_weight)
167
+ losses = {"loss_mask_ce_0": loss_ce}
168
+ return losses
169
+
170
+ def loss_labels_openimage(self, outputs, targets, indices, num_masks, layer_id, extra):
171
+ """Classification loss (NLL)
172
+ targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes]
173
+ """
174
+ if layer_id > self.top_x_layers['mask']:
175
+ return {"loss_openimage_ce_0": 0}
176
+
177
+ assert "pred_captions" in outputs
178
+
179
+ if indices is None or len(targets) == 0 or (len(targets) > 0 and len(targets[0]['labels']) == 0):
180
+ loss_ce = outputs['pred_captions'].sum() * 0.0
181
+ losses = {"loss_openimage_ce_0": loss_ce}
182
+ return losses
183
+
184
+ # compute i2t loss
185
+ loss_openimage_ce = 0
186
+ losses = {}
187
+ for b in range(len(indices)):
188
+ pred_logit = outputs["pred_logits"][b][indices[b][0]]
189
+ gt_logit = torch.zeros_like(pred_logit)
190
+ select_idx = torch.stack((torch.arange(len(indices[b][1])), indices[b][1])).tolist()
191
+ gt_logit[select_idx] = 1
192
+ loss_openimage_ce += torch.sum(-gt_logit * F.log_softmax(pred_logit, dim=-1), dim=-1).mean()
193
+ loss_openimage_ce = loss_openimage_ce / len(indices)
194
+ losses.update({"loss_openimage_ce_0": loss_openimage_ce})
195
+ return losses
196
+
197
+ def loss_itc(self, outputs, targets, indices, num_masks, layer_id, extra):
198
+ if layer_id >= self.top_x_layers['retrieval']:
199
+ return {"loss_retrieval_decoder_0": 0}
200
+ t_emb = torch.cat([x['caption_proj'] for x in targets], dim=0)
201
+ v_emb = outputs['pred_captions'][:,-1]
202
+ loss_contrast = image_text_contrastive_loss_queue(v_emb, t_emb, extra['lang_encoder'], extra['training'])
203
+
204
+ # compute query-token contrastive loss
205
+ ttk_emb = torch.cat([x['caption_tokens'] for x in targets], dim=0)
206
+ ttk_mask = torch.cat([x['caption_mask'] for x in targets], dim=0).float()
207
+ ttk_mask = ttk_mask * torch.cumsum(ttk_mask, dim=1)
208
+ vtk_emb = outputs['pred_captions'][:,:-1]
209
+ keep = torch.cat([x['caption_mask'] for x in targets], dim=0).bool()
210
+
211
+ ttk_emb = ttk_emb / (ttk_emb.norm(dim=-1, keepdim=True) + 1e-7)
212
+ vtk_emb = vtk_emb / (vtk_emb.norm(dim=-1, keepdim=True) + 1e-7)
213
+ logit_scale = extra['lang_encoder'].logit_scale.exp().clamp(max=100)
214
+
215
+ # prepare gt
216
+ gt = (torch.eye(vtk_emb.shape[0]).type_as(ttk_mask).unsqueeze(-1) * ttk_mask.unsqueeze(0).repeat(vtk_emb.shape[0], 1, 1))[:,keep].flatten(1)
217
+ gt = gt / (gt.sum(1, keepdim=True) + 1e-7)
218
+ # compute i2t loss
219
+ logits = logit_scale * (vtk_emb @ ttk_emb[keep].transpose(0, 1)).mean(1)
220
+ loss_contrast_fine_vt = SoftTargetCrossEntropy()(logits, gt)
221
+ # loss_contrast_fine = loss_contrast_fine_vt # i2t only
222
+
223
+ # compute t2i loss
224
+ bs, nq, _ = vtk_emb.shape
225
+ logits = logit_scale * (ttk_emb @ vtk_emb.flatten(0,1).transpose(0, 1)).reshape(bs,-1,bs,nq).mean(dim=-1)[keep,:]
226
+ loss_contrast_fine_tv = SoftTargetCrossEntropy()(logits, gt.t())
227
+ # compute loss
228
+ loss_contrast_fine = (loss_contrast_fine_vt * 0.7 + loss_contrast_fine_tv * 0.3)
229
+
230
+ losses = {"loss_retrieval_decoder_0": loss_contrast + loss_contrast_fine * 0.5}
231
+ return losses
232
+
233
+ def loss_captionings(self, outputs, targets, indices, num_masks, layer_id, extra):
234
+ if layer_id >= self.top_x_layers['captioning']:
235
+ return {"loss_captioning_0": 0}
236
+
237
+ pred_captions_gen = outputs['pred_captionings'][:, :-1]
238
+ token_embs = extra['token_embedding'].weight
239
+ # token_embs = (token_embs / token_embs.norm(dim=-1, keepdim=True) + 1e-7)
240
+ # pred_captions_gen = (pred_captions_gen / pred_captions_gen.norm(dim=-1, keepdim=True) + 1e-7)
241
+ pred_captions_gen = pred_captions_gen @ token_embs.t()
242
+
243
+ # temperature = extra['lang_encoder'].logit_scale
244
+ # logit_scale = temperature.exp().clamp(max=100)
245
+
246
+ target_captions_gen = torch.cat([target['caption_tokenids'] for target in targets], 0)[:, 1:]
247
+ target_captions_gen_mask = torch.cat([target['caption_mask'] for target in targets], 0)[:, 1:]
248
+
249
+ # loss_caption = F.cross_entropy(pred_captions_gen.transpose(1,2) * logit_scale, target_captions_gen, reduction='none')
250
+ loss_caption = F.cross_entropy(pred_captions_gen.transpose(1,2), target_captions_gen, reduction='none')
251
+ loss_caption = (loss_caption * target_captions_gen_mask).sum() / (target_captions_gen_mask.sum() + 1)
252
+ losses = {"loss_captioning_0": loss_caption}
253
+ return losses
254
+
255
+ def loss_captions(self, outputs, targets, indices, num_masks, layer_id, extra):
256
+ if layer_id >= self.top_x_layers['caption']:
257
+ return {"loss_caption_0": 0}
258
+ matched_tokens = [m[0] for m in indices]
259
+ t_emb_class = torch.cat([extra['class_embeddings'][targets[bs]['labels'][m[1]]] for bs, m in enumerate(indices)])
260
+ t_hash_class = torch.cat([torch.tensor(targets[bs]['labels_hash'])[m[1]] for bs, m in enumerate(indices)])
261
+
262
+ # pred_captions denotes all unmatched object queries.
263
+ unmatched_pred_captions = []
264
+ matched_pred_captions = []
265
+ for idx, m in enumerate(matched_tokens):
266
+ unmatched_masks = torch.ones(outputs['pred_captions'].shape[1:-1]).bool()
267
+ matched_masks = torch.zeros(outputs['pred_captions'].shape[1:-1]).bool()
268
+
269
+ unmatched_masks[m] = False
270
+ matched_masks[m] = True
271
+
272
+ unmatched_pred_captions.append(outputs['pred_captions'][idx][unmatched_masks])
273
+ matched_pred_captions.append(outputs['pred_captions'][idx][matched_masks])
274
+
275
+ outputs['unmatched_pred_captions'] = unmatched_pred_captions
276
+ v_emb_class = torch.cat(matched_pred_captions)
277
+ v_emb_class = v_emb_class / (v_emb_class.norm(dim=-1, keepdim=True) + 1e-7)
278
+
279
+ indices = self.matcher(outputs, targets, mode="caption_womask", extra={'temperature':extra['lang_logit']})
280
+ src_idx = self._get_src_permutation_idx(indices)
281
+
282
+ t_emb = torch.cat([t['captions'][indices[bs][1]] for bs,t in enumerate(targets)])
283
+ t_hash = torch.cat([torch.tensor(t['captions_hash'])[indices[bs][1]] for bs,t in enumerate(targets)])
284
+
285
+ unmatched_pred_captions, _ = nested_tensor_from_tensor_list(unmatched_pred_captions).decompose()
286
+ v_emb = unmatched_pred_captions[src_idx]
287
+ v_emb = v_emb / (v_emb.norm(dim=-1, keepdim=True) + 1e-7)
288
+
289
+ loss_contrast = ql_multi_contrastive_loss(torch.cat((v_emb, v_emb_class)), torch.cat((t_emb, t_emb_class)), torch.cat((t_hash, t_hash_class)), temperature=extra['lang_logit'])
290
+ losses = {"loss_caption_0": loss_contrast}
291
+
292
+ return losses
293
+
294
+ def loss_masks(self, outputs, targets, indices, num_masks, layer_id, extra):
295
+ """Compute the losses related to the masks: the focal loss and the dice loss.
296
+ targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w]
297
+ """
298
+ if layer_id >= self.top_x_layers['mask']:
299
+ return {"loss_mask_bce_0": 0, "loss_mask_dice_0": 0}
300
+
301
+ assert "pred_masks" in outputs
302
+ if indices is None or len(targets) == 0:
303
+ loss = outputs['pred_masks'].sum() * 0.0
304
+ losses = {"loss_mask_bce_0": loss, "loss_mask_dice_0": loss}
305
+ return losses
306
+
307
+ src_idx = self._get_src_permutation_idx(indices)
308
+ tgt_idx = self._get_tgt_permutation_idx(indices)
309
+ src_masks = outputs["pred_masks"]
310
+ src_masks = src_masks[src_idx]
311
+ masks = [t["masks"] for t in targets]
312
+ # TODO use valid to mask invalid areas due to padding in loss
313
+ target_masks, valid = nested_tensor_from_tensor_list(masks).decompose()
314
+ target_masks = target_masks.to(src_masks)
315
+ target_masks = target_masks[tgt_idx]
316
+ # No need to upsample predictions as we are using normalized coordinates :)
317
+ # N x 1 x H x W
318
+ src_masks = src_masks[:, None]
319
+ target_masks = target_masks[:, None]
320
+
321
+ with torch.no_grad():
322
+ # sample point_coords
323
+ point_coords = get_uncertain_point_coords_with_randomness(
324
+ src_masks,
325
+ lambda logits: calculate_uncertainty(logits),
326
+ self.num_points,
327
+ self.oversample_ratio,
328
+ self.importance_sample_ratio,
329
+ ).type(src_masks.dtype)
330
+ # get gt labels
331
+ point_labels = point_sample(
332
+ target_masks,
333
+ point_coords,
334
+ align_corners=False,
335
+ ).squeeze(1)
336
+
337
+ point_logits = point_sample(
338
+ src_masks,
339
+ point_coords,
340
+ align_corners=False,
341
+ ).squeeze(1)
342
+
343
+ losses = {
344
+ "loss_mask_bce_0": sigmoid_ce_loss_jit(point_logits, point_labels, num_masks),
345
+ "loss_mask_dice_0": dice_loss_jit(point_logits, point_labels, num_masks),
346
+ }
347
+
348
+ del src_masks
349
+ del target_masks
350
+ return losses
351
+
352
+ def loss_groundings(self, outputs, targets, indices, num_masks, layer_id, extra):
353
+ """Compute the losses related to the masks: the focal loss and the dice loss.
354
+ targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w]
355
+ """
356
+ assert "pred_gmasks" in outputs
357
+ assert "pred_gtexts" in outputs
358
+
359
+ if layer_id >= self.top_x_layers['grounding']:
360
+ return {"loss_grounding_bce_0": 0, "loss_grounding_dice_0": 0, "loss_grounding_ce_0": 0}
361
+
362
+ masks = [t["grounding_masks"] for t in targets]
363
+ if indices is None or None in masks:
364
+ loss = outputs['pred_gmasks'].sum() * 0.0
365
+ return {"loss_grounding_bce_0": loss, "loss_grounding_dice_0": loss, "loss_grounding_ce_0": loss}
366
+
367
+ pred_logits = []
368
+ for b in range(len(indices)):
369
+ t_emb = targets[b]['grounding_class_embs']
370
+ v_emb = outputs["pred_gtexts"][b]
371
+
372
+ t_emb = t_emb / (t_emb.norm(dim=-1, keepdim=True) + 1e-7)
373
+ v_emb = v_emb / (v_emb.norm(dim=-1, keepdim=True) + 1e-7)
374
+
375
+ out_prob = vl_similarity(v_emb, t_emb, temperature=extra['lang_logit'])
376
+ pred_logits += [out_prob]
377
+ outputs['pred_logits'] = pred_logits
378
+
379
+ indices = self.matcher(outputs, targets, mode='grounding', extra={'temperature':extra['lang_logit']})
380
+ src_idx = self._get_src_permutation_idx(indices)
381
+ tgt_idx = self._get_tgt_permutation_idx(indices)
382
+
383
+ src_masks = outputs["pred_gmasks"]
384
+ src_masks = src_masks[src_idx]
385
+ # TODO use valid to mask invalid areas due to padding in loss
386
+ target_masks, valid = nested_tensor_from_tensor_list(masks).decompose()
387
+ target_masks = target_masks.to(src_masks)
388
+ target_masks = target_masks[tgt_idx]
389
+ # No need to upsample predictions as we are using normalized coordinates :)
390
+ # N x 1 x H x W
391
+ src_masks = src_masks[:, None]
392
+ target_masks = target_masks[:, None]
393
+
394
+ with torch.no_grad():
395
+ # sample point_coords
396
+ point_coords = get_uncertain_point_coords_with_randomness(
397
+ src_masks,
398
+ lambda logits: calculate_uncertainty(logits),
399
+ self.num_points,
400
+ self.oversample_ratio,
401
+ self.importance_sample_ratio,
402
+ ).type(src_masks.dtype)
403
+ # get gt labels
404
+ point_labels = point_sample(
405
+ target_masks,
406
+ point_coords,
407
+ align_corners=False,
408
+ ).squeeze(1)
409
+
410
+ point_logits = point_sample(
411
+ src_masks,
412
+ point_coords,
413
+ align_corners=False,
414
+ ).squeeze(1)
415
+
416
+ losses = {
417
+ "loss_grounding_bce_0": sigmoid_ce_loss_jit(point_logits, point_labels, len(src_masks)),
418
+ "loss_grounding_dice_0": dice_loss_jit(point_logits, point_labels, len(src_masks)),
419
+ }
420
+
421
+ # compute query-token contrastive loss
422
+ # ttk_emb = torch.cat([x['caption_tokens'] for x in targets], dim=0)
423
+ # ttk_mask = torch.cat([x['caption_mask'] for x in targets], dim=0).float()
424
+ # ttk_mask = ttk_mask * torch.cumsum(ttk_mask, dim=1)
425
+ # vtk_emb = outputs['pred_captions'][:,:-1]
426
+ # keep = torch.cat([x['caption_mask'] for x in targets], dim=0).bool()
427
+
428
+ # ttk_emb = ttk_emb / (ttk_emb.norm(dim=-1, keepdim=True) + 1e-7)
429
+ # vtk_emb = vtk_emb / (vtk_emb.norm(dim=-1, keepdim=True) + 1e-7)
430
+ # logit_scale = extra['lang_encoder'].logit_scale.exp().clamp(max=100)
431
+
432
+ # # prepare gt
433
+ # gt = (torch.eye(vtk_emb.shape[0]).type_as(ttk_mask).unsqueeze(-1) * ttk_mask.unsqueeze(0).repeat(vtk_emb.shape[0], 1, 1))[:,keep].flatten(1)
434
+ # gt = gt / (gt.sum(1, keepdim=True) + 1e-7)
435
+ # # compute i2t loss
436
+ # logits = logit_scale * (vtk_emb @ ttk_emb[keep].transpose(0, 1)).mean(1)
437
+ # loss_contrast_fine_vt = SoftTargetCrossEntropy()(logits, gt)
438
+ # # loss_contrast_fine = loss_contrast_fine_vt # i2t only
439
+
440
+ # # compute t2i loss
441
+ # bs, nq, _ = vtk_emb.shape
442
+ # logits = logit_scale * (ttk_emb @ vtk_emb.flatten(0,1).transpose(0, 1)).reshape(bs,-1,bs,nq).mean(dim=-1)[keep,:]
443
+ # loss_contrast_fine_tv = SoftTargetCrossEntropy()(logits, gt.t())
444
+ # # compute loss
445
+ # loss_contrast_fine = (loss_contrast_fine_vt * 0.7 + loss_contrast_fine_tv * 0.3)
446
+
447
+ # compute t2i loss
448
+ loss_grd_ce = 0
449
+ for b in range(len(indices)):
450
+ task = targets[b]['grounding_task']
451
+ pred_logit = outputs["pred_logits"][b]
452
+ gt_logit = torch.zeros_like(pred_logit)
453
+ select_idx = torch.stack((indices[b][0], indices[b][1])).tolist()
454
+ gt_logit[select_idx] = 1
455
+ t_hash = torch.tensor(targets[b]['grounding_hash'], device=gt_logit.device)
456
+ hash_table = torch.zeros((len(t_hash), len(t_hash)), device=gt_logit.device)
457
+ for idx in range(0, len(hash_table)):
458
+ hash_table[idx][t_hash==t_hash[idx]] = 1
459
+ hash_table = hash_table / hash_table.sum(-1, keepdim=True)
460
+ gt_logit = gt_logit @ hash_table
461
+ loss_grd_ce += self.grounding_weight[task]*torch.sum(-gt_logit.t() * F.log_softmax(pred_logit.t(), dim=-1), dim=-1).mean()
462
+ loss_grd_ce = loss_grd_ce / len(indices)
463
+ losses.update({"loss_grounding_ce_0": loss_grd_ce})
464
+ del src_masks
465
+ del target_masks
466
+ return losses
467
+
468
+ def loss_spatials(self, outputs, targets, indices, num_masks, layer_id, extra):
469
+ """Compute the losses related to the masks: the focal loss and the dice loss.
470
+ targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w]
471
+ """
472
+ assert "pred_smasks" in outputs
473
+ assert "pred_smaskembs" in outputs
474
+
475
+ if layer_id >= self.top_x_layers['spatial']:
476
+ loss = outputs['pred_smasks'].sum() * 0.0
477
+ loss_grd_ce = outputs["pred_smasks"].sum() * 0.0
478
+ return {"loss_spatial_bce_0": loss, "loss_spatial_dice_0": loss, "loss_spatial_ce_0": loss_grd_ce}
479
+
480
+ gt_masks = [x['gt_spatial_masks'] for x in targets]
481
+ # compute a keep index with batch size to avoid empty gt_masks
482
+ stack_gt_mask = torch.cat(gt_masks)
483
+ bs,_,_ = stack_gt_mask.shape
484
+ stack_gt_mask = stack_gt_mask.view(bs,-1).sum(dim=-1)
485
+ keep = stack_gt_mask > 0 # only keep samples that contain a positive mask
486
+
487
+ if keep.sum() == 0:
488
+ loss = outputs['pred_smasks'].sum() * 0.0
489
+ loss_grd_ce = outputs["pred_smasks"].sum() * 0.0
490
+ return {"loss_spatial_bce_0": loss, "loss_spatial_dice_0": loss, "loss_spatial_ce_0": loss_grd_ce}
491
+
492
+ # mask embedding logits
493
+ v_emb = outputs["pred_smaskembs"] # [bs, nq, 512]
494
+
495
+ # pos mask
496
+ s_emb = outputs["pred_pspatials"] # [bs, ns, 512]
497
+ pred_logits = v_emb @ s_emb.transpose(1,2)
498
+ outputs['pred_pos_logits'] = pred_logits # [bs, nq, 1]
499
+ indices = self.matcher(outputs, targets, mode='spatial', extra={})
500
+ src_idx = self._get_src_permutation_idx(indices)
501
+ tgt_idx = self._get_tgt_permutation_idx(indices)
502
+
503
+ # pos class loss
504
+ pred_logit = torch.cat([o[:len(t['gt_spatial_masks'])] for o,t in zip(outputs["pred_pos_logits"].transpose(1,2), targets)])
505
+ gt_logit = torch.zeros_like(pred_logit)
506
+ gt_logit = gt_logit[keep]
507
+ _src_idx = [torch.arange(keep.sum(), device=src_idx[0].device), src_idx[1][keep.cpu()]]
508
+ gt_logit[_src_idx] = 1
509
+ pred_logit = pred_logit[keep]
510
+ loss_spa_ce_pos = torch.sum(-gt_logit * F.log_softmax(pred_logit, dim=-1), dim=-1).mean()
511
+
512
+ # neg mask
513
+ # s_emb = outputs["pred_nspatials"] # [bs, ns, 512]
514
+ # neg_mask = (s_emb.sum(dim=list(range(1, len(s_emb.shape)))) != 0).float()[keep]
515
+ # pred_logits = v_emb @ s_emb.transpose(1,2)
516
+ # outputs['pred_neg_logits'] = pred_logits # [bs, nq, 1]
517
+ # indices = self.matcher(outputs, targets, mode='spatial_pn', extra=extra)
518
+ # src_idx = self._get_src_permutation_idx(indices)
519
+ # tgt_idx = self._get_tgt_permutation_idx(indices)
520
+ # src_masks_neg = outputs["pred_smasks"][src_idx][keep]
521
+ # src_masks_neg = src_masks_neg*(neg_mask[:,None,None])
522
+ # src_masks_neg = src_masks_neg.clip(0) * (-1)
523
+
524
+ # neg class loss
525
+ # pred_logit = outputs["pred_neg_logits"]
526
+ # gt_logit = torch.zeros_like(pred_logit)
527
+ # gt_logit[src_idx] = 1
528
+ # bs,_,ns = pred_logit[keep].shape
529
+ # pred_logit = pred_logit[keep].transpose(1,2).view(bs*ns,-1)
530
+ # gt_logit = gt_logit[keep].transpose(1,2).view(bs*ns,-1)
531
+ # loss_spa_ce_neg = (torch.sum(-gt_logit * F.log_softmax(pred_logit, dim=-1), dim=-1)*neg_mask).sum() / (neg_mask.sum()+1e-6)
532
+
533
+ # recompute a keep index with matched tgt
534
+ stack_gt_mask = nn.utils.rnn.pad_sequence(gt_masks, padding_value=-1).transpose(0,1)[tgt_idx]
535
+ bs,_,_ = stack_gt_mask.shape
536
+ target_masks = stack_gt_mask
537
+ stack_gt_mask = stack_gt_mask.view(bs,-1).sum(dim=-1)
538
+ keep = stack_gt_mask > 0 # only keep samples that contain a positive mask
539
+ src_masks_pos = outputs["pred_smasks"][src_idx][keep]
540
+
541
+ # TODO use valid to mask invalid areas due to padding in loss
542
+ target_masks = target_masks.to(src_masks_pos)
543
+ target_masks = target_masks[keep]
544
+
545
+ # mul = extra['spatial_query_mode'][keep]
546
+ # src_masks_cur = src_masks_cur.clip(0) * mul[:,None,None]
547
+ # src_masks_cur = src_masks_cur
548
+
549
+ # if neg_mask[0] == 1:
550
+ # import cv2
551
+ # print(src_masks_pos.shape)
552
+ # print(src_masks_neg.shape)
553
+ # print(target_masks.shape)
554
+ # # import pdb; pdb.set_trace()
555
+ # v_pos_mask = (src_masks_pos[0].sigmoid() > 0.5).float().cpu().detach().numpy() * 255
556
+ # v_neg_mask = (_src_masks_neg[0].sigmoid() > 0.5).float().cpu().detach().numpy() * 255
557
+ # v_sum = ((src_masks_pos[0]-_src_masks_neg[0].clip(0)).sigmoid() > 0.5).float().cpu().detach().numpy() * 255
558
+ # v_gt = target_masks[0].float().cpu().detach().numpy() * 255
559
+
560
+ # cv2.imwrite('v_pos_mask.png', v_pos_mask)
561
+ # cv2.imwrite('v_neg_mask.png', v_neg_mask)
562
+ # cv2.imwrite('v_sum.png', v_sum)
563
+ # cv2.imwrite('v_gt.png', v_gt)
564
+ # import pdb; pdb.set_trace()
565
+
566
+ # src_masks = (src_masks_pos + src_masks_neg)[:, None]
567
+ src_masks = src_masks_pos[:, None]
568
+ target_masks = target_masks[:, None]
569
+
570
+ # debug visualization
571
+ # with torch.no_grad():
572
+ # import cv2
573
+ # import numpy as np
574
+
575
+ # v_src_masks = (F.interpolate(src_masks, size=target_masks.shape[-2:], mode='bilinear', align_corners=False).sigmoid() > 0.5).float().cpu().numpy()[:,0] * 255
576
+ # v_target_masks = target_masks.float().cpu().numpy()[:,0] * 255
577
+ # v_masks = np.concatenate([v_src_masks, v_target_masks], axis=2)
578
+
579
+ # for i in range(len(src_masks)):
580
+ # v1 = v_src_masks[i]
581
+ # v2 = v_target_masks[i]
582
+ # v = np.concatenate([v1,v2], axis=1)
583
+ # cv2.imwrite('v{}.png'.format(i), v)
584
+ # import pdb; pdb.set_trace()
585
+
586
+ # visualization
587
+ # VL.step()
588
+ # v_img = batched_inputs[0]['image'].permute(1,2,0).cpu().numpy()
589
+ # VL.add_image(v_img[:,:,::-1])
590
+ # candidate_masks = batched_inputs[0]['spatial_query']['rand_shape'].float().cpu().numpy()
591
+ # gt_masks = batched_inputs[0]['spatial_query']['gt_masks'].float().cpu().numpy()
592
+ # texts = ['cmask' for i in range(len(candidate_masks))]
593
+ # VL.overlay_obj_mask_to_image(v_img[:,:,::-1], candidate_masks, texts)
594
+ # texts = ['gmask' for i in range(len(candidate_masks))]
595
+ # VL.overlay_obj_mask_to_image(v_img[:,:,::-1], gt_masks, texts)
596
+
597
+ # import cv2
598
+ # for i in range(len(src_masks)):
599
+ # visual_src_mask_cur = (src_masks_cur[i].sigmoid()>0.5).detach().float().cpu().numpy() * 255
600
+ # visual_src_mask_mem = (src_masks_mem[i].sigmoid()>0.5).detach().float().cpu().numpy() * 255
601
+ # visual_src_mask = (src_masks[i,0].sigmoid()>0.5).detach().float().cpu().numpy() * 255
602
+ # visual_target_mask = (target_masks[i,0].sigmoid()>0.5).detach().float().cpu().numpy() * 255
603
+
604
+ # cv2.imwrite('visual_src_mask_cur_{}_{}.png'.format(i, mul[i].item()), visual_src_mask_cur)
605
+ # cv2.imwrite('visual_src_mask_mem_{}_{}.png'.format(i, mul[i].item()), visual_src_mask_mem)
606
+ # cv2.imwrite('visual_src_mask_{}_{}.png'.format(i, mul[i].item()), visual_src_mask)
607
+ # cv2.imwrite('visual_target_mask_{}_{}.png'.format(i, mul[i].item()), visual_target_mask)
608
+ # import pdb; pdb.set_trace()
609
+
610
+ with torch.no_grad():
611
+ # sample point_coords
612
+ point_coords = get_uncertain_point_coords_with_randomness(
613
+ src_masks,
614
+ lambda logits: calculate_uncertainty(logits),
615
+ self.num_points,
616
+ self.oversample_ratio,
617
+ self.importance_sample_ratio,
618
+ ).type(src_masks.dtype)
619
+ # get gt labels
620
+ point_labels = point_sample(
621
+ target_masks,
622
+ point_coords,
623
+ align_corners=False,
624
+ ).squeeze(1)
625
+
626
+ point_logits = point_sample(
627
+ src_masks,
628
+ point_coords,
629
+ align_corners=False,
630
+ ).squeeze(1)
631
+
632
+ num_masks = len(src_masks)
633
+ losses = {
634
+ "loss_spatial_bce_0": sigmoid_ce_loss_jit(point_logits, point_labels, num_masks),
635
+ "loss_spatial_dice_0": dice_loss_jit(point_logits, point_labels, num_masks),
636
+ }
637
+
638
+ # losses.update({"loss_spatial_ce_0": loss_spa_ce_pos + loss_spa_ce_neg})
639
+ losses.update({"loss_spatial_ce_0": loss_spa_ce_pos})
640
+
641
+ del src_masks
642
+ del target_masks
643
+ return losses
644
+
645
+ def loss_boxes(self, outputs, targets, indices, num_boxes, layer_id, extra):
646
+ """Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss
647
+ targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]
648
+ The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size.
649
+ """
650
+ if layer_id >= self.top_x_layers['box']:
651
+ return {"loss_bbox_0": 0, "loss_giou_0": 0}
652
+
653
+ assert 'pred_boxes' in outputs
654
+
655
+ if indices is None or len(targets) == 0:
656
+ loss = outputs['pred_boxes'].sum() * 0.0
657
+ losses = {"loss_bbox_0": loss, "loss_giou_0": loss}
658
+ return losses
659
+
660
+ src_idx = self._get_src_permutation_idx(indices)
661
+ tgt_idx = self._get_tgt_permutation_idx(indices)
662
+ src_boxes = outputs["pred_boxes"]
663
+ src_boxes = src_boxes[src_idx].sigmoid()
664
+
665
+ target_boxes = [t['boxes'] for t in targets]
666
+ max_size = _max_by_axis([list(box.shape) for box in target_boxes])
667
+ max_size = [len(target_boxes)] + max_size
668
+ empty_boxes = torch.zeros(max_size).to(src_boxes.device)
669
+ for idx, tar_box in enumerate(target_boxes):
670
+ empty_boxes[idx,:tar_box.shape[0],:] = tar_box
671
+ target_boxes = empty_boxes[tgt_idx]
672
+
673
+ # target_isthings = [t['is_things'] for t in targets]
674
+ # max_size = _max_by_axis([list(lab.shape) for lab in target_isthings])
675
+ # max_size = [len(target_isthings)] + max_size
676
+ # empty_lab = torch.zeros(max_size).to(src_boxes.device)
677
+
678
+ # for idx, tar_thing in enumerate(target_isthings):
679
+ # empty_lab[idx,:tar_thing.shape[0]] = tar_thing
680
+ # target_isthings = empty_lab[tgt_idx]
681
+
682
+ loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction='none')
683
+ losses = {}
684
+ losses['loss_bbox_0'] = loss_bbox.sum() / num_boxes
685
+
686
+ loss_giou = 1 - torch.diag(box_ops.generalized_box_iou(
687
+ box_ops.box_cxcywh_to_xyxy(src_boxes),
688
+ box_ops.box_cxcywh_to_xyxy(target_boxes)))
689
+ losses['loss_giou_0'] = loss_giou.sum() / num_boxes
690
+ return losses
691
+
692
+ def _get_src_permutation_idx(self, indices):
693
+ # permute predictions following indices
694
+ batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)])
695
+ src_idx = torch.cat([src for (src, _) in indices])
696
+ return batch_idx, src_idx
697
+
698
+ def _get_tgt_permutation_idx(self, indices):
699
+ # permute targets following indices
700
+ batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)])
701
+ tgt_idx = torch.cat([tgt for (_, tgt) in indices])
702
+ return batch_idx, tgt_idx
703
+
704
+ def get_loss(self, loss, outputs, targets, indices, num_masks, layer_id, extra):
705
+ loss_map = {
706
+ 'labels': self.loss_labels,
707
+ 'masks': self.loss_masks,
708
+ 'boxes': self.loss_boxes,
709
+ 'captions': self.loss_captions,
710
+ 'retrievals': self.loss_itc,
711
+ 'captionings': self.loss_captionings,
712
+ 'groundings': self.loss_groundings,
713
+ 'labels_openimage': self.loss_labels_openimage,
714
+ 'spatials': self.loss_spatials,
715
+ }
716
+ assert loss in loss_map, f"do you really want to compute {loss} loss?"
717
+ return loss_map[loss](outputs, targets, indices, num_masks, layer_id, extra)
718
+
719
+ def forward(self, outputs, targets, extra=None):
720
+ """This performs the loss computation.
721
+ Parameters:
722
+ outputs: dict of tensors, see the output specification of the model for the format
723
+ targets: list of dicts, such that len(targets) == batch_size.
724
+ The expected keys in each dict depend on the losses applied, see each loss' doc
725
+ """
726
+ outputs_without_aux = {k: v for k, v in outputs.items() if k != "aux_outputs"}
727
+
728
+ # Retrieve the matching between the outputs of the last layer and the targets
729
+ indices = self.matcher(outputs_without_aux, targets)
730
+
731
+ # Compute the average number of target boxes across all nodes, for normalization purposes
732
+ num_masks = sum(len(t["labels"]) for t in targets)
733
+ num_masks = torch.as_tensor(
734
+ [num_masks], dtype=torch.float, device=next(iter(outputs_without_aux.values())).device
735
+ )
736
+ if is_dist_avail_and_initialized():
737
+ torch.distributed.all_reduce(num_masks)
738
+ num_masks = torch.clamp(num_masks / get_world_size(), min=1).item()
739
+
740
+ # Compute all the requested losses
741
+ losses = {}
742
+ for loss in self.losses:
743
+ losses.update(self.get_loss(loss, outputs, targets, indices, num_masks, 0, extra))
744
+
745
+ # In case of auxiliary losses, we repeat this process with the output of each intermediate layer.
746
+ if "aux_outputs" in outputs:
747
+ # NOTE: we reverse the aux_outputs so that the first is the second last layer
748
+ for i, aux_outputs in enumerate(outputs["aux_outputs"][::-1]):
749
+ indices = self.matcher(aux_outputs, targets)
750
+ for loss in self.losses:
751
+ l_dict = self.get_loss(loss, aux_outputs, targets, indices, num_masks, (i+1), extra)
752
+ l_dict = {k.replace('_0', f"_{i+1}"): v for k, v in l_dict.items()}
753
+ losses.update(l_dict)
754
+
755
+ return losses
756
+
757
+ def forward_vlp(self, outputs, targets, extra=None):
758
+ """This performs the loss computation.
759
+ Parameters:
760
+ outputs: dict of tensors, see the output specification of the model for the format
761
+ targets: list of dicts, such that len(targets) == batch_size.
762
+ The expected keys in each dict depend on the losses applied, see each loss' doc
763
+ """
764
+ # Compute all the requested losses
765
+ losses = {}
766
+ num_masks = indices = None
767
+ for loss in self.losses:
768
+ losses.update(self.get_loss(loss, outputs, targets, indices, num_masks, 0, extra))
769
+
770
+ # In case of auxiliary losses, we repeat this process with the output of each intermediate layer.
771
+ if "aux_outputs" in outputs:
772
+ # NOTE: we reverse the aux_outputs so that the first is the second last layer
773
+ for i, aux_outputs in enumerate(outputs["aux_outputs"][::-1]):
774
+ for loss in self.losses:
775
+ l_dict = self.get_loss(loss, aux_outputs, targets, indices, num_masks, (i+1), extra)
776
+ l_dict = {k.replace('_0', f"_{i+1}"): v for k, v in l_dict.items()}
777
+ losses.update(l_dict)
778
+
779
+ return losses
780
+
781
+ def forward_grounding(self, outputs, targets, extra=None):
782
+ """This performs the loss computation.
783
+ Parameters:
784
+ outputs: dict of tensors, see the output specification of the model for the format
785
+ targets: list of dicts, such that len(targets) == batch_size.
786
+ The expected keys in each dict depend on the losses applied, see each loss' doc
787
+ """
788
+ # Compute all the requested losses
789
+ losses = {}
790
+ indices = [[] for i in range(len(targets))]
791
+
792
+ # Compute the average number of target boxes across all nodes, for normalization purposes
793
+ num_masks = sum(len(t["grounding_masks"]) for t in targets) + 1e-7
794
+ num_masks = torch.as_tensor(
795
+ [num_masks], dtype=torch.float, device=next(iter(outputs.values())).device
796
+ )
797
+ if is_dist_avail_and_initialized():
798
+ torch.distributed.all_reduce(num_masks)
799
+ num_masks = torch.clamp(num_masks / get_world_size(), min=1).item()
800
+
801
+ for loss in self.losses:
802
+ losses.update(self.get_loss(loss, outputs, targets, indices, num_masks, 0, extra))
803
+
804
+ # In case of auxiliary losses, we repeat this process with the output of each intermediate layer.
805
+ if "aux_outputs" in outputs:
806
+ # NOTE: we reverse the aux_outputs so that the first is the second last layer
807
+ for i, aux_outputs in enumerate(outputs["aux_outputs"][::-1]):
808
+ for loss in self.losses:
809
+ l_dict = self.get_loss(loss, aux_outputs, targets, indices, num_masks, (i+1), extra)
810
+ l_dict = {k.replace('_0', f"_{i+1}"): v for k, v in l_dict.items()}
811
+ losses.update(l_dict)
812
+
813
+ return losses
814
+
815
+ def forward_openimage(self, outputs, targets, extra=None):
816
+ """This performs the loss computation.
817
+ Parameters:
818
+ outputs: dict of tensors, see the output specification of the model for the format
819
+ targets: list of dicts, such that len(targets) == batch_size.
820
+ The expected keys in each dict depend on the losses applied, see each loss' doc
821
+ """
822
+ neg_class_emb = all_gather_grad(torch.cat([x['neg_class_emb'] for x in targets]))
823
+ neg_hash = all_gather_grad(torch.cat([x['neg_hash'] for x in targets]))
824
+
825
+ extra['neg_class_emb'] = neg_class_emb
826
+ extra['neg_hash'] = neg_hash
827
+ outputs_without_aux = {k: v for k, v in outputs.items() if k != "aux_outputs"}
828
+
829
+ # Retrieve the matching between the outputs of the last layer and the targets
830
+ indices, pred_logits = self.matcher.openimage_forward(outputs_without_aux, targets, extra=extra)
831
+ outputs['pred_logits'] = pred_logits
832
+
833
+ # Compute the average number of target boxes across all nodes, for normalization purposes
834
+ num_masks = sum(len(t["labels"]) for t in targets)
835
+ num_masks = torch.as_tensor(
836
+ [num_masks], dtype=torch.float, device=neg_class_emb.device
837
+ )
838
+ if is_dist_avail_and_initialized():
839
+ torch.distributed.all_reduce(num_masks)
840
+ num_masks = torch.clamp(num_masks / get_world_size(), min=1).item()
841
+
842
+ # Compute all the requested losses
843
+ losses = {}
844
+ for loss in self.losses:
845
+ losses.update(self.get_loss(loss, outputs, targets, indices, num_masks, 0, extra))
846
+
847
+ # In case of auxiliary losses, we repeat this process with the output of each intermediate layer.
848
+ if "aux_outputs" in outputs:
849
+ # NOTE: we reverse the aux_outputs so that the first is the second last layer
850
+ for i, aux_outputs in enumerate(outputs["aux_outputs"][::-1]):
851
+ indices, pred_logits = self.matcher.openimage_forward(aux_outputs, targets, extra=extra)
852
+ aux_outputs['pred_logits'] = pred_logits
853
+ for loss in self.losses:
854
+ l_dict = self.get_loss(loss, aux_outputs, targets, indices, num_masks, (i+1), extra)
855
+ l_dict = {k.replace('_0', f"_{i+1}"): v for k, v in l_dict.items()}
856
+ losses.update(l_dict)
857
+
858
+ return losses
859
+
860
+ def __repr__(self):
861
+ head = "Criterion " + self.__class__.__name__
862
+ body = [
863
+ "matcher: {}".format(self.matcher.__repr__(_repr_indent=8)),
864
+ "losses: {}".format(self.losses),
865
+ "weight_dict: {}".format(self.weight_dict),
866
+ "num_classes: {}".format(self.num_classes),
867
+ "eos_coef: {}".format(self.eos_coef),
868
+ "num_points: {}".format(self.num_points),
869
+ "oversample_ratio: {}".format(self.oversample_ratio),
870
+ "importance_sample_ratio: {}".format(self.importance_sample_ratio),
871
+ ]
872
+ _repr_indent = 4
873
+ lines = [head] + [" " * _repr_indent + line for line in body]
874
+ return "\n".join(lines)
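
Note: a minimal wiring sketch for SetCriterion, not part of the committed diff. The import paths are assumed from this commit's file layout, and every hyper-parameter value below is a placeholder rather than the configuration shipped with the model configs:

    from modeling.modules.matcher import HungarianMatcher   # added in the next file of this commit
    from modeling.modules.criterion import SetCriterion     # this file

    matcher = HungarianMatcher(cost_class=2.0, cost_mask=5.0, cost_dice=5.0, num_points=12544)
    # top_x_layers caps how deep each loss is applied; keys follow the lookups in the code above
    top_x_layers = {'mask': 10, 'box': 10, 'caption': 10, 'captioning': 10,
                    'retrieval': 10, 'grounding': 10, 'spatial': 10}
    criterion = SetCriterion(
        num_classes=133,
        matcher=matcher,
        weight_dict={'loss_mask_ce_0': 2.0, 'loss_mask_bce_0': 5.0, 'loss_mask_dice_0': 5.0},
        eos_coef=0.1,
        top_x_layers=top_x_layers,
        losses=['labels', 'masks'],
        num_points=12544,
        oversample_ratio=3.0,
        importance_sample_ratio=0.75,
        grounding_weight=None,
    )
    # losses = criterion(outputs, targets)
    # outputs needs 'pred_logits' and 'pred_masks' (plus optional 'aux_outputs');
    # targets is a list of per-image dicts with 'labels' and 'masks', as documented in forward().

With losses=['labels', 'masks'], only the Hungarian-matched classification loss and the point-sampled BCE/dice mask losses are computed; the remaining loss_* methods are opted into through the losses list.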
modeling/modules/matcher.py ADDED
@@ -0,0 +1,632 @@
1
+ # --------------------------------------------------------
2
+ # X-Decoder -- Generalized Decoding for Pixel, Image, and Language
3
+ # Copyright (c) 2022 Microsoft
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # Modified by Xueyan Zou ([email protected])
6
+ # --------------------------------------------------------
7
+
8
+ # Copyright (c) Facebook, Inc. and its affiliates.
9
+ # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/models/matcher.py
10
+ """
11
+ Modules to compute the matching cost and solve the corresponding LSAP.
12
+ """
13
+ import warnings
14
+ import torch
15
+ import torch.nn.functional as F
16
+ import numpy as np
17
+ from scipy.optimize import linear_sum_assignment
18
+ from torch import nn
19
+ from torch.cuda.amp import autocast
20
+
21
+ from .point_features import point_sample
22
+ from ..language.loss import vl_similarity
23
+
24
+ def batch_dice_loss(inputs: torch.Tensor, targets: torch.Tensor):
25
+ """
26
+ Compute the DICE loss, similar to generalized IOU for masks
27
+ Args:
28
+ inputs: A float tensor of arbitrary shape.
29
+ The predictions for each example.
30
+ targets: A float tensor with the same shape as inputs. Stores the binary
31
+ classification label for each element in inputs
32
+ (0 for the negative class and 1 for the positive class).
33
+ """
34
+ inputs = inputs.sigmoid()
35
+ inputs = inputs.flatten(1)
36
+ numerator = 2 * torch.einsum("nc,mc->nm", inputs, targets)
37
+ denominator = inputs.sum(-1)[:, None] + targets.sum(-1)[None, :]
38
+ loss = 1 - (numerator + 1) / (denominator + 1)
39
+ return loss
40
+
41
+
42
+ batch_dice_loss_jit = torch.jit.script(
43
+ batch_dice_loss
44
+ ) # type: torch.jit.ScriptModule
45
+
46
+
47
+ def batch_sigmoid_ce_loss(inputs: torch.Tensor, targets: torch.Tensor):
48
+ """
49
+ Args:
50
+ inputs: A float tensor of arbitrary shape.
51
+ The predictions for each example.
52
+ targets: A float tensor with the same shape as inputs. Stores the binary
53
+ classification label for each element in inputs
54
+ (0 for the negative class and 1 for the positive class).
55
+ Returns:
56
+ Loss tensor
57
+ """
58
+ hw = inputs.shape[1]
59
+
60
+ pos = F.binary_cross_entropy_with_logits(
61
+ inputs, torch.ones_like(inputs), reduction="none"
62
+ )
63
+ neg = F.binary_cross_entropy_with_logits(
64
+ inputs, torch.zeros_like(inputs), reduction="none"
65
+ )
66
+
67
+ loss = torch.einsum("nc,mc->nm", pos, targets) + torch.einsum(
68
+ "nc,mc->nm", neg, (1 - targets)
69
+ )
70
+
71
+ return loss / hw
72
+
73
+
74
+ batch_sigmoid_ce_loss_jit = torch.jit.script(
75
+ batch_sigmoid_ce_loss
76
+ ) # type: torch.jit.ScriptModule
77
+
78
+
79
+ class HungarianMatcher(nn.Module):
80
+ """This class computes an assignment between the targets and the predictions of the network
81
+
82
+ For efficiency reasons, the targets don't include the no_object. Because of this, in general,
83
+ there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions,
84
+ while the others are un-matched (and thus treated as non-objects).
85
+ """
86
+
87
+ def __init__(self, cost_class: float = 1, cost_mask: float = 1, cost_dice: float = 1, num_points: int = 0, spatial_cost = None):
88
+ """Creates the matcher
89
+
90
+ Params:
91
+ cost_class: This is the relative weight of the classification error in the matching cost
92
+ cost_mask: This is the relative weight of the focal loss of the binary mask in the matching cost
93
+ cost_dice: This is the relative weight of the dice loss of the binary mask in the matching cost
94
+ """
95
+ super().__init__()
96
+ self.cost_class = cost_class
97
+ self.cost_mask = cost_mask
98
+ self.cost_dice = cost_dice
99
+
100
+ self.num_points = num_points
101
+ self.spatial_cost_class = cost_class
102
+ self.spatial_cost_mask = cost_mask
103
+ self.spatial_cost_dice = cost_dice
104
+ assert cost_class != 0 or cost_mask != 0 or cost_dice != 0, "all costs can't be 0"
105
+
106
+ @torch.no_grad()
107
+ def memory_efficient_forward(self, outputs, targets):
108
+ """More memory-friendly matching"""
109
+ bs, num_queries = outputs["pred_logits"].shape[:2]
110
+
111
+ if bs == 0 or len(targets) == 0:
112
+ return None
113
+
114
+ indices = []
115
+
116
+ # Iterate through batch size
117
+ for b in range(bs):
118
+ out_prob = outputs["pred_logits"][b].softmax(-1) # [num_queries, num_classes]
119
+ tgt_ids = targets[b]["labels"]
120
+
121
+ # Compute the classification cost. Contrary to the loss, we don't use the NLL,
122
+ # but approximate it in 1 - proba[target class].
123
+ # The 1 is a constant that doesn't change the matching, it can be ommitted.
124
+ cost_class = -out_prob[:, tgt_ids]
125
+
126
+ out_mask = outputs["pred_masks"][b] # [num_queries, H_pred, W_pred]
127
+ # gt masks are already padded when preparing target
128
+ tgt_mask = targets[b]["masks"].to(out_mask)
129
+
130
+ out_mask = out_mask[:, None]
131
+ tgt_mask = tgt_mask[:, None]
132
+ # all masks share the same set of points for efficient matching!
133
+ point_coords = torch.rand(1, self.num_points, 2, device=out_mask.device, dtype=tgt_mask.dtype)
134
+ # get gt labels
135
+ tgt_mask = point_sample(
136
+ tgt_mask,
137
+ point_coords.repeat(tgt_mask.shape[0], 1, 1),
138
+ align_corners=False,
139
+ ).squeeze(1)
140
+
141
+ out_mask = point_sample(
142
+ out_mask,
143
+ point_coords.repeat(out_mask.shape[0], 1, 1),
144
+ align_corners=False,
145
+ ).squeeze(1)
146
+
147
+ with autocast(enabled=False):
148
+ out_mask = out_mask.float()
149
+ tgt_mask = tgt_mask.float()
150
+ # Compute the focal loss between masks
151
+ cost_mask = batch_sigmoid_ce_loss_jit(out_mask, tgt_mask)
152
+
153
+ # Compute the dice loss betwen masks
154
+ cost_dice = batch_dice_loss_jit(out_mask, tgt_mask)
155
+
156
+ # Final cost matrix
157
+ C = (
158
+ self.cost_mask * cost_mask
159
+ + self.cost_class * cost_class
160
+ + self.cost_dice * cost_dice
161
+ )
162
+ C = C.reshape(num_queries, -1).cpu()
163
+ if C.isnan().any():
164
+ C[C.isnan()] = 1e6 ### temporary fix
165
+ warnings.warn("NAN in Cost Matrix!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
166
+ raise
167
+ indices.append(linear_sum_assignment(C))
168
+
169
+ return [
170
+ (torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64))
171
+ for i, j in indices
172
+ ]
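A note on the sampling used in the mask costs above: every predicted and ground-truth mask is evaluated at the same `self.num_points` random locations, so `cost_mask` and `cost_dice` are computed on small [N, num_points] tensors instead of full-resolution masks. A minimal sketch of that step with hypothetical shapes (`point_sample` is the detectron2-style helper imported at the top of this file; coordinates are normalized to [0, 1]):

import torch

masks = torch.randn(5, 1, 32, 32)        # 5 masks as [N, 1, H, W]
coords = torch.rand(1, 128, 2)           # 128 shared (x, y) points in [0, 1] x [0, 1]
sampled = point_sample(
    masks, coords.repeat(5, 1, 1), align_corners=False
).squeeze(1)                             # -> [5, 128], one row of sampled mask values per mask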
+
+     @torch.no_grad()
+     def openimage_forward(self, outputs, targets, extra):
+         """More memory-friendly matching"""
+         bs, num_queries = outputs["pred_captions"].shape[:2]
+         if bs == 0 or len(targets) == 0:
+             return None
+
+         neg_class_emb = extra['neg_class_emb']
+         neg_hash = extra['neg_hash']
+         _, unique_indices = np.unique(neg_hash.cpu().numpy(), return_index=True)
+         neg_class_emb = neg_class_emb[unique_indices]
+         neg_hash = neg_hash[unique_indices]
+
+         indices = []
+         pred_logits = []
+         # Iterate through batch size
+         for b in range(bs):
+             _pos_class_emb = targets[b]['pos_class_emb']
+             _pos_hash = targets[b]['pos_hash']
+             _neg_overlap_pos = ~(neg_hash[..., None] == _pos_hash).any(-1)
+             _neg_class_emb = neg_class_emb[_neg_overlap_pos]
+             t_emb = torch.cat((_pos_class_emb, _neg_class_emb))
+             v_emb = outputs["pred_captions"][b]
+             del _pos_class_emb
+             del _neg_class_emb
+
+             t_emb = t_emb / (t_emb.norm(dim=-1, keepdim=True) + 1e-7)
+             v_emb = v_emb / (v_emb.norm(dim=-1, keepdim=True) + 1e-7)
+
+             out_prob = vl_similarity(v_emb, t_emb, temperature=extra['lang_logit'])
+             pred_logits += [out_prob]
+             out_prob = out_prob.softmax(-1)
+             tgt_ids = targets[b]["labels"]
+             # Compute the classification cost. Contrary to the loss, we don't use the NLL,
+             # but approximate it in 1 - proba[target class].
+             # The 1 is a constant that doesn't change the matching, it can be omitted.
+             cost_class = -out_prob[:, tgt_ids]
+
+             out_mask = outputs["pred_masks"][b]  # [num_queries, H_pred, W_pred]
+             # gt masks are already padded when preparing target
+             tgt_mask = targets[b]["masks"].to(out_mask)
+
+             out_mask = out_mask[:, None]
+             tgt_mask = tgt_mask[:, None]
+             # all masks share the same set of points for efficient matching!
+             point_coords = torch.rand(1, self.num_points, 2, device=out_mask.device, dtype=tgt_mask.dtype)
+             # get gt labels
+             tgt_mask = point_sample(
+                 tgt_mask,
+                 point_coords.repeat(tgt_mask.shape[0], 1, 1),
+                 align_corners=False,
+             ).squeeze(1)
+
+             out_mask = point_sample(
+                 out_mask,
+                 point_coords.repeat(out_mask.shape[0], 1, 1),
+                 align_corners=False,
+             ).squeeze(1)
+
+             with autocast(enabled=False):
+                 out_mask = out_mask.float()
+                 tgt_mask = tgt_mask.float()
+                 # Compute the focal loss between masks
+                 cost_mask = batch_sigmoid_ce_loss_jit(out_mask, tgt_mask)
+
+                 # Compute the dice loss between masks
+                 cost_dice = batch_dice_loss_jit(out_mask, tgt_mask)
+
+             # Final cost matrix
+             C = (
+                 self.cost_mask * cost_mask
+                 + self.cost_class * cost_class
+                 + self.cost_dice * cost_dice
+             )
+             C = C.reshape(num_queries, -1).cpu()
+             if C.isnan().any():
+                 C[C.isnan()] = 1e6  # temporary fix
+                 warnings.warn("NaN in cost matrix")
+                 raise FloatingPointError("NaN in cost matrix")
+             indices.append(linear_sum_assignment(C))
+
+         return [
+             (torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64))
+             for i, j in indices
+         ], pred_logits
+
+     @torch.no_grad()
+     def grounding_forward(self, outputs, targets, extra):
+         """More memory-friendly matching"""
+         bs, num_queries = outputs["pred_gmasks"].shape[:2]
+
+         if bs == 0 or len(targets) == 0:
+             return None
+
+         indices = []
+         # Iterate through batch size
+         for b in range(bs):
+             out_prob = outputs["pred_logits"][b]
+             # Compute the classification cost. Contrary to the loss, we don't use the NLL,
+             # but approximate it in 1 - proba[target class].
+             # The 1 is a constant that doesn't change the matching, it can be omitted.
+             cost_class = -out_prob.softmax(dim=0)
+
+             out_mask = outputs["pred_gmasks"][b]  # [num_queries, H_pred, W_pred]
+             # gt masks are already padded when preparing target
+             tgt_mask = targets[b]["grounding_masks"].to(out_mask)
+
+             out_mask = out_mask[:, None]
+             tgt_mask = tgt_mask[:, None]
+
+             # all masks share the same set of points for efficient matching!
+             point_coords = torch.rand(1, self.num_points, 2, device=out_mask.device, dtype=tgt_mask.dtype)
+             # get gt labels
+             tgt_mask = point_sample(
+                 tgt_mask,
+                 point_coords.repeat(tgt_mask.shape[0], 1, 1),
+                 align_corners=False,
+             ).squeeze(1)
+
+             out_mask = point_sample(
+                 out_mask,
+                 point_coords.repeat(out_mask.shape[0], 1, 1),
+                 align_corners=False,
+             ).squeeze(1)
+
+             with autocast(enabled=False):
+                 out_mask = out_mask.float()
+                 tgt_mask = tgt_mask.float()
+                 # Compute the focal loss between masks
+                 cost_mask = batch_sigmoid_ce_loss_jit(out_mask, tgt_mask)
+
+                 # Compute the dice loss between masks
+                 cost_dice = batch_dice_loss_jit(out_mask, tgt_mask)
+
+             # Final cost matrix
+             C = (
+                 self.cost_mask * cost_mask
+                 + self.cost_class * cost_class
+                 + self.cost_dice * cost_dice
+             )
+             C = C.reshape(num_queries, -1).cpu()
+             if C.isnan().any():
+                 C[C.isnan()] = 1e6  # temporary fix
+                 warnings.warn("NaN in cost matrix")
+                 raise FloatingPointError("NaN in cost matrix")
+             indices.append(linear_sum_assignment(C))
+
+         return [
+             (torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64))
+             for i, j in indices
+         ]
+
+     @torch.no_grad()
+     def spatial_forward(self, outputs, targets, extra):
+         """More memory-friendly matching"""
+         bs, num_queries = outputs["pred_smasks"].shape[:2]
+
+         if bs == 0 or len(targets) == 0:
+             return None
+
+         indices = []
+         # Iterate through batch size
+         for b in range(bs):
+             out_mask = outputs["pred_smasks"][b]  # [num_queries, H_pred, W_pred]
+             # gt masks are already padded when preparing target
+             tgt_mask = targets[b]["gt_spatial_masks"].to(out_mask)
+             nd, ns = outputs["pred_pos_logits"][b].shape
+             index_masking = 1 - torch.eye(ns, device=out_mask.device, dtype=tgt_mask.dtype).repeat_interleave(nd // ns, dim=0)
+             neg_masking = torch.zeros((nd, ns), device=out_mask.device, dtype=tgt_mask.dtype)
+             neg_masking.masked_fill_(index_masking.bool(), -float('inf'))
+             pos_masking = torch.zeros((nd, ns), device=out_mask.device, dtype=tgt_mask.dtype)
+             pos_masking.masked_fill_(index_masking.bool(), float('inf'))
+             out_prob = (outputs["pred_pos_logits"][b] + neg_masking)[:, :len(tgt_mask)]  # remove redundant predictions for padding
+             # Compute the classification cost. Contrary to the loss, we don't use the NLL,
+             # but approximate it in 1 - proba[target class].
+             # The 1 is a constant that doesn't change the matching, it can be omitted.
+             cost_class = -out_prob.softmax(dim=0)
+
+             out_mask = out_mask[:, None]
+             tgt_mask = tgt_mask[:, None]
+
+             # all masks share the same set of points for efficient matching!
+             point_coords = torch.rand(1, self.num_points, 2, device=out_mask.device, dtype=tgt_mask.dtype)
+             # get gt labels
+             tgt_mask = point_sample(
+                 tgt_mask,
+                 point_coords.repeat(tgt_mask.shape[0], 1, 1),
+                 align_corners=False,
+             ).squeeze(1)
+
+             out_mask = point_sample(
+                 out_mask,
+                 point_coords.repeat(out_mask.shape[0], 1, 1),
+                 align_corners=False,
+             ).squeeze(1)
+
+             with autocast(enabled=False):
+                 out_mask = out_mask.float()
+                 tgt_mask = tgt_mask.float()
+                 # Compute the focal loss between masks
+                 cost_mask = batch_sigmoid_ce_loss_jit(out_mask, tgt_mask) + pos_masking[:, :len(tgt_mask)]
+                 # Compute the dice loss between masks
+                 cost_dice = batch_dice_loss_jit(out_mask, tgt_mask) + pos_masking[:, :len(tgt_mask)]
+
+             # Final cost matrix
+             C = (
+                 self.spatial_cost_mask * cost_mask
+                 + self.spatial_cost_class * cost_class
+                 + self.spatial_cost_dice * cost_dice
+             )
+             C = C.reshape(num_queries, -1).cpu()
+             if C.isnan().any():
+                 C[C.isnan()] = 1e6  # temporary fix
+                 warnings.warn("NaN in cost matrix")
+                 raise FloatingPointError("NaN in cost matrix")
+             indices.append(linear_sum_assignment(C))
+
+         return [
+             (torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64))
+             for i, j in indices
+         ]
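The masking in `spatial_forward` encodes a grouping constraint between queries and spatial prompts: with `nd` queries and `ns` prompts, each block of `nd // ns` consecutive queries may only match its own prompt; cross-prompt pairs get -inf added to the logits and +inf added to the mask/dice costs. A small sketch of the pattern this produces (this reading of the code is an interpretation, and the sizes are hypothetical):

import torch

nd, ns = 6, 3  # 6 queries grouped over 3 spatial prompts
index_masking = 1 - torch.eye(ns).repeat_interleave(nd // ns, dim=0)
print(index_masking)
# tensor([[0., 1., 1.],
#         [0., 1., 1.],
#         [1., 0., 1.],
#         [1., 0., 1.],
#         [1., 1., 0.],
#         [1., 1., 0.]])
# 1-entries mark query/prompt pairs that are forbidden in the assignment.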
+
+     @torch.no_grad()
+     def spatial_forward_pn(self, outputs, targets, extra):
+         """More memory-friendly matching"""
+         bs, num_queries = outputs["pred_smasks"].shape[:2]
+
+         if bs == 0 or len(targets) == 0:
+             return None
+
+         fp_mask = extra['false_positive_mask']
+         gt_mask = torch.stack([targets[b]["gt_spatial_masks"] for b in range(bs)])
+
+         indices = []
+         # Iterate through batch size
+         for b in range(bs):
+             out_prob = outputs["pred_neg_logits"][b]
+             # Compute the classification cost. Contrary to the loss, we don't use the NLL,
+             # but approximate it in 1 - proba[target class].
+             # The 1 is a constant that doesn't change the matching, it can be omitted.
+             cost_class = -out_prob.softmax(dim=0)
+
+             out_mask = outputs["pred_smasks"][b]  # [num_queries, H_pred, W_pred]
+             tgt_mask = fp_mask[b].to(out_mask)
+             ign_mask = (gt_mask[b] | fp_mask[b]).to(out_mask)
+
+             out_mask = out_mask[:, None]
+             tgt_mask = tgt_mask[:, None]
+             ign_mask = ign_mask[:, None]
+
+             # all masks share the same set of points for efficient matching!
+             point_coords = torch.rand(1, self.num_points, 2, device=out_mask.device, dtype=tgt_mask.dtype)
+
+             # get gt labels
+             tgt_mask = point_sample(
+                 tgt_mask,
+                 point_coords.repeat(tgt_mask.shape[0], 1, 1),
+                 align_corners=False,
+             ).squeeze(1)
+
+             out_mask = point_sample(
+                 out_mask,
+                 point_coords.repeat(out_mask.shape[0], 1, 1),
+                 align_corners=False,
+             ).squeeze(1)
+
+             ign_mask = point_sample(
+                 ign_mask,
+                 point_coords.repeat(ign_mask.shape[0], 1, 1),
+                 align_corners=False,
+             ).squeeze(1)
+
+             with autocast(enabled=False):
+                 out_mask = out_mask.float()
+                 tgt_mask = tgt_mask.float()
+                 ign_mask = ign_mask.float()
+
+                 # Compute the focal loss between masks
+                 cost_mask = batch_sigmoid_ce_loss_jit(out_mask * ign_mask, tgt_mask * ign_mask)
+
+                 # Compute the dice loss between masks
+                 cost_dice = batch_dice_loss_jit(out_mask * ign_mask, tgt_mask * ign_mask)
+
+             # Final cost matrix
+             C = (
+                 self.spatial_cost_mask * cost_mask
+                 + self.spatial_cost_class * cost_class
+                 + self.spatial_cost_dice * cost_dice
+             )
+             C = C.reshape(num_queries, -1).cpu()
+             if C.isnan().any():
+                 C[C.isnan()] = 1e6  # temporary fix
+                 warnings.warn("NaN in cost matrix")
+                 raise FloatingPointError("NaN in cost matrix")
+             indices.append(linear_sum_assignment(C))
+
+         return [
+             (torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64))
+             for i, j in indices
+         ]
+
+     @torch.no_grad()
+     def caption_forward_womask(self, outputs, targets, extra):
+         """More memory-friendly matching"""
+         bs, _ = outputs["pred_logits"].shape[:2]
+
+         if bs == 0 or len(targets) == 0:
+             return None
+
+         indices = []
+         t_emb = torch.cat([t['captions'] for t in targets])
+         v_emb = outputs['unmatched_pred_captions']
+         caption_target_count = np.cumsum([0] + [len(t['captions']) for t in targets])
+
+         # Iterate through batch size
+         for b in range(bs):
+             v_emb[b] = v_emb[b] / (v_emb[b].norm(dim=-1, keepdim=True) + 1e-7)
+             num_queries = len(v_emb[b])
+             out_prob = vl_similarity(v_emb[b][None,], t_emb, temperature=extra['temperature']).softmax(-1)[0]
+             tgt_ids = [idx for idx in range(caption_target_count[b], caption_target_count[b+1])]
+
+             # Compute the classification cost. Contrary to the loss, we don't use the NLL,
+             # but approximate it in 1 - proba[target class].
+             # The 1 is a constant that doesn't change the matching, it can be omitted.
+             cost_class = -out_prob[:, tgt_ids]
+
+             # Final cost matrix
+             C = (self.cost_class * cost_class)
+             C = C.reshape(num_queries, -1).cpu()
+             if C.isnan().any():
+                 C[C.isnan()] = 1e6  # temporary fix
+                 warnings.warn("NaN in cost matrix")
+                 raise FloatingPointError("NaN in cost matrix")
+             indices.append(linear_sum_assignment(C))
+
+         return [
+             (torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64))
+             for i, j in indices
+         ]
+
+     @torch.no_grad()
+     def caption_forward_wmask(self, outputs, targets, extra):
+         """More memory-friendly matching"""
+         bs, _ = outputs["pred_logits"].shape[:2]
+
+         if bs == 0 or len(targets) == 0:
+             return None
+
+         indices = []
+         t_emb = torch.cat([t['captions'] for t in targets])
+         v_emb = outputs['unmatched_pred_captions']
+         caption_target_count = np.cumsum([0] + [len(t['captions']) for t in targets])
+
+         # Iterate through batch size
+         for b in range(bs):
+             v_emb[b] = v_emb[b] / (v_emb[b].norm(dim=-1, keepdim=True) + 1e-7)
+             num_queries = len(v_emb[b])
+
+             out_prob = vl_similarity(v_emb[b][None,], t_emb, temperature=extra['temperature']).softmax(-1)[0]
+             tgt_ids = [idx for idx in range(caption_target_count[b], caption_target_count[b+1])]
+
+             # Compute the classification cost. Contrary to the loss, we don't use the NLL,
+             # but approximate it in 1 - proba[target class].
+             # The 1 is a constant that doesn't change the matching, it can be omitted.
+             cost_class = -out_prob[:, tgt_ids]
+
+             out_mask = outputs["pred_masks"][b]  # [num_queries, H_pred, W_pred]
+             # gt masks are already padded when preparing target
+             tgt_mask = targets[b]["masks"].to(out_mask)
+
+             out_mask = out_mask[:, None]
+             tgt_mask = tgt_mask[:, None]
+             # all masks share the same set of points for efficient matching!
+             point_coords = torch.rand(1, self.num_points, 2, device=out_mask.device, dtype=tgt_mask.dtype)
+             # get gt labels
+             tgt_mask = point_sample(
+                 tgt_mask,
+                 point_coords.repeat(tgt_mask.shape[0], 1, 1),
+                 align_corners=False,
+             ).squeeze(1)
+
+             out_mask = point_sample(
+                 out_mask,
+                 point_coords.repeat(out_mask.shape[0], 1, 1),
+                 align_corners=False,
+             ).squeeze(1)
+
+             with autocast(enabled=False):
+                 out_mask = out_mask.float()
+                 tgt_mask = tgt_mask.float()
+                 # Compute the focal loss between masks
+                 cost_mask = batch_sigmoid_ce_loss_jit(out_mask, tgt_mask)
+
+                 # Compute the dice loss between masks
+                 cost_dice = batch_dice_loss_jit(out_mask, tgt_mask)
+
+             # Final cost matrix
+             C = (
+                 self.cost_mask * cost_mask
+                 + self.cost_class * cost_class
+                 + self.cost_dice * cost_dice
+             )
+             C = C.reshape(num_queries, -1).cpu()
+             if C.isnan().any():
+                 C[C.isnan()] = 1e6  # temporary fix
+                 warnings.warn("NaN in cost matrix")
+                 raise FloatingPointError("NaN in cost matrix")
+             indices.append(linear_sum_assignment(C))
+
+         return [
+             (torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64))
+             for i, j in indices
+         ]
+
+     @torch.no_grad()
+     def forward(self, outputs, targets, mode='default', extra={}):
+         """Performs the matching
+
+         Params:
+             outputs: This is a dict that contains at least these entries:
+                 "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
+                 "pred_masks": Tensor of dim [batch_size, num_queries, H_pred, W_pred] with the predicted masks
+
+             targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
+                 "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
+                           objects in the target) containing the class labels
+                 "masks": Tensor of dim [num_target_boxes, H_gt, W_gt] containing the target masks
+
+         Returns:
+             A list of size batch_size, containing tuples of (index_i, index_j) where:
+                 - index_i is the indices of the selected predictions (in order)
+                 - index_j is the indices of the corresponding selected targets (in order)
+             For each batch element, it holds:
+                 len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
+         """
+         if mode == 'default':
+             return self.memory_efficient_forward(outputs, targets)
+         elif mode == 'grounding':
+             return self.grounding_forward(outputs, targets, extra)
+         elif mode == 'spatial':
+             return self.spatial_forward(outputs, targets, extra)
+         elif mode == 'spatial_pn':
+             return self.spatial_forward_pn(outputs, targets, extra)
+         elif mode == 'caption_womask':
+             return self.caption_forward_womask(outputs, targets, extra)
+         elif mode == 'caption_wmask':
+             return self.caption_forward_wmask(outputs, targets, extra)
+         else:
+             assert False, "Mode {} is not supported.".format(mode)
+
+     def __repr__(self, _repr_indent=4):
+         head = "Matcher " + self.__class__.__name__
+         body = [
+             "cost_class: {}".format(self.cost_class),
+             "cost_mask: {}".format(self.cost_mask),
+             "cost_dice: {}".format(self.cost_dice),
+         ]
+         lines = [head] + [" " * _repr_indent + line for line in body]
+         return "\n".join(lines)