zhangbo2008 committed
Commit 012c9b1 · 1 Parent(s): e06ad17

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. .gitattributes +2 -0
  2. av_hubert/.gitmodules +3 -0
  3. av_hubert/CODE_OF_CONDUCT.md +80 -0
  4. av_hubert/CONTRIBUTING.md +31 -0
  5. av_hubert/LICENSE +159 -0
  6. av_hubert/README.md +164 -0
  7. av_hubert/assets/lipreading.gif +3 -0
  8. av_hubert/avhubert/__init__.py +10 -0
  9. av_hubert/avhubert/clustering/README.md +100 -0
  10. av_hubert/avhubert/clustering/dump_hubert_feature.py +177 -0
  11. av_hubert/avhubert/clustering/dump_km_label.py +99 -0
  12. av_hubert/avhubert/clustering/dump_mfcc_feature.py +117 -0
  13. av_hubert/avhubert/clustering/learn_kmeans.py +147 -0
  14. av_hubert/avhubert/clustering/requirements.txt +6 -0
  15. av_hubert/avhubert/clustering/submit_cluster.py +132 -0
  16. av_hubert/avhubert/conf/av-finetune/base_noise_pt_noise_ft_30h.yaml +121 -0
  17. av_hubert/avhubert/conf/av-finetune/base_noise_pt_noise_ft_433h.yaml +121 -0
  18. av_hubert/avhubert/conf/av-finetune/large_noise_pt_noise_ft_30h.yaml +124 -0
  19. av_hubert/avhubert/conf/av-finetune/large_noise_pt_noise_ft_433h.yaml +124 -0
  20. av_hubert/avhubert/conf/finetune/base_lrs3_30h.yaml +118 -0
  21. av_hubert/avhubert/conf/finetune/base_lrs3_433h.yaml +118 -0
  22. av_hubert/avhubert/conf/finetune/base_vox_30h.yaml +118 -0
  23. av_hubert/avhubert/conf/finetune/base_vox_433h.yaml +118 -0
  24. av_hubert/avhubert/conf/finetune/large_lrs3_30h.yaml +121 -0
  25. av_hubert/avhubert/conf/finetune/large_lrs3_433h.yaml +121 -0
  26. av_hubert/avhubert/conf/finetune/large_vox_30h.yaml +121 -0
  27. av_hubert/avhubert/conf/finetune/large_vox_433h.yaml +121 -0
  28. av_hubert/avhubert/conf/finetune/self_large_vox_30h.yaml +121 -0
  29. av_hubert/avhubert/conf/finetune/self_large_vox_433h.yaml +121 -0
  30. av_hubert/avhubert/conf/pretrain/base_lrs3_iter1.yaml +112 -0
  31. av_hubert/avhubert/conf/pretrain/base_lrs3_iter2.yaml +112 -0
  32. av_hubert/avhubert/conf/pretrain/base_lrs3_iter3.yaml +112 -0
  33. av_hubert/avhubert/conf/pretrain/base_lrs3_iter4.yaml +112 -0
  34. av_hubert/avhubert/conf/pretrain/base_lrs3_iter5.yaml +112 -0
  35. av_hubert/avhubert/conf/pretrain/base_vox_iter1.yaml +113 -0
  36. av_hubert/avhubert/conf/pretrain/base_vox_iter2.yaml +113 -0
  37. av_hubert/avhubert/conf/pretrain/base_vox_iter3.yaml +113 -0
  38. av_hubert/avhubert/conf/pretrain/base_vox_iter4.yaml +112 -0
  39. av_hubert/avhubert/conf/pretrain/base_vox_iter5.yaml +113 -0
  40. av_hubert/avhubert/conf/pretrain/large_lrs3_iter5.yaml +117 -0
  41. av_hubert/avhubert/conf/pretrain/large_vox_iter5.yaml +117 -0
  42. av_hubert/avhubert/conf/pretrain/noise_base_vox_iter5.yaml +115 -0
  43. av_hubert/avhubert/conf/pretrain/noise_large_vox_iter5.yaml +119 -0
  44. av_hubert/avhubert/conf/s2s_decode.yaml +23 -0
  45. av_hubert/avhubert/decoder.py +243 -0
  46. av_hubert/avhubert/hubert.py +779 -0
  47. av_hubert/avhubert/hubert_asr.py +521 -0
  48. av_hubert/avhubert/hubert_criterion.py +169 -0
  49. av_hubert/avhubert/hubert_dataset.py +529 -0
  50. av_hubert/avhubert/hubert_pretraining.py +400 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ av_hubert/assets/lipreading.gif filter=lfs diff=lfs merge=lfs -text
+ av_hubert/fairseq/docs/fairseq.gif filter=lfs diff=lfs merge=lfs -text
av_hubert/.gitmodules ADDED
@@ -0,0 +1,3 @@
+ [submodule "fairseq"]
+ 	path = fairseq
+ 	url = https://github.com/pytorch/fairseq
av_hubert/CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,80 @@
+ # Code of Conduct
+
+ ## Our Pledge
+
+ In the interest of fostering an open and welcoming environment, we as
+ contributors and maintainers pledge to make participation in our project and
+ our community a harassment-free experience for everyone, regardless of age, body
+ size, disability, ethnicity, sex characteristics, gender identity and expression,
+ level of experience, education, socio-economic status, nationality, personal
+ appearance, race, religion, or sexual identity and orientation.
+
+ ## Our Standards
+
+ Examples of behavior that contributes to creating a positive environment
+ include:
+
+ * Using welcoming and inclusive language
+ * Being respectful of differing viewpoints and experiences
+ * Gracefully accepting constructive criticism
+ * Focusing on what is best for the community
+ * Showing empathy towards other community members
+
+ Examples of unacceptable behavior by participants include:
+
+ * The use of sexualized language or imagery and unwelcome sexual attention or
+   advances
+ * Trolling, insulting/derogatory comments, and personal or political attacks
+ * Public or private harassment
+ * Publishing others' private information, such as a physical or electronic
+   address, without explicit permission
+ * Other conduct which could reasonably be considered inappropriate in a
+   professional setting
+
+ ## Our Responsibilities
+
+ Project maintainers are responsible for clarifying the standards of acceptable
+ behavior and are expected to take appropriate and fair corrective action in
+ response to any instances of unacceptable behavior.
+
+ Project maintainers have the right and responsibility to remove, edit, or
+ reject comments, commits, code, wiki edits, issues, and other contributions
+ that are not aligned to this Code of Conduct, or to ban temporarily or
+ permanently any contributor for other behaviors that they deem inappropriate,
+ threatening, offensive, or harmful.
+
+ ## Scope
+
+ This Code of Conduct applies within all project spaces, and it also applies when
+ an individual is representing the project or its community in public spaces.
+ Examples of representing a project or community include using an official
+ project e-mail address, posting via an official social media account, or acting
+ as an appointed representative at an online or offline event. Representation of
+ a project may be further defined and clarified by project maintainers.
+
+ This Code of Conduct also applies outside the project spaces when there is a
+ reasonable belief that an individual's behavior may have a negative impact on
+ the project or its community.
+
+ ## Enforcement
+
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
+ reported by contacting the project team at <[email protected]>. All
+ complaints will be reviewed and investigated and will result in a response that
+ is deemed necessary and appropriate to the circumstances. The project team is
+ obligated to maintain confidentiality with regard to the reporter of an incident.
+ Further details of specific enforcement policies may be posted separately.
+
+ Project maintainers who do not follow or enforce the Code of Conduct in good
+ faith may face temporary or permanent repercussions as determined by other
+ members of the project's leadership.
+
+ ## Attribution
+
+ This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
+ available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
+
+ [homepage]: https://www.contributor-covenant.org
+
+ For answers to common questions about this code of conduct, see
+ https://www.contributor-covenant.org/faq
av_hubert/CONTRIBUTING.md ADDED
@@ -0,0 +1,31 @@
+ # Contributing to av_hubert
+ We want to make contributing to this project as easy and transparent as
+ possible.
+
+ ## Pull Requests
+ We actively welcome your pull requests.
+
+ 1. Fork the repo and create your branch from `main`.
+ 2. If you've added code that should be tested, add tests.
+ 3. If you've changed APIs, update the documentation.
+ 4. Ensure the test suite passes.
+ 5. Make sure your code lints.
+ 6. If you haven't already, complete the Contributor License Agreement ("CLA").
+
+ ## Contributor License Agreement ("CLA")
+ In order to accept your pull request, we need you to submit a CLA. You only need
+ to do this once to work on any of Facebook's open source projects.
+
+ Complete your CLA here: <https://code.facebook.com/cla>
+
+ ## Issues
+ We use GitHub issues to track public bugs. Please ensure your description is
+ clear and has sufficient instructions to be able to reproduce the issue.
+
+ Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe
+ disclosure of security bugs. In those cases, please go through the process
+ outlined on that page and do not file a public issue.
+
+ ## License
+ By contributing to av_hubert, you agree that your contributions will be licensed
+ under the LICENSE file in the root directory of this source tree.
av_hubert/LICENSE ADDED
@@ -0,0 +1,159 @@
+ AV-HuBERT LICENSE AGREEMENT
+
+ This License Agreement (as may be amended in accordance with this License
+ Agreement, “License”), between you (“Licensee” or “you”) and Meta Platforms,
+ Inc. (“Meta” or “we”) applies to your use of any computer program, algorithm,
+ source code, object code, or software that is made available by Meta under this
+ License (“Software”) and any specifications, manuals, documentation, and other
+ written information provided by Meta related to the Software (“Documentation”).
+
+ By clicking “I Accept” below or by using the Software, you agree to the terms
+ of this License. If you do not agree to this License, then you do not have any
+ rights to use the Software or Documentation (collectively, the “Software
+ Products”), and you must immediately cease using the Software Products.
+
+ 1. LICENSE GRANT a. Subject to your compliance with the Documentation and
+ Sections 2, 3, and 5, Meta grants you a non-exclusive, worldwide,
+ non-transferable, non-sublicensable, revocable, royalty free and limited
+ license under Meta’s copyright interests to reproduce, distribute, and create
+ derivative works of the Software solely for your non-commercial research
+ purposes. The foregoing license is personal to you, and you may not assign or
+ sublicense this License or any other rights or obligations under this License
+ without Meta’s prior written consent; any such assignment or sublicense will be
+ void and will automatically and immediately terminate this License.
+
+ b. You may make a reasonable number of copies of the Documentation solely for
+ use in connection with the license to the Software granted above.
+
+ c. The grant of rights expressly set forth in this Section 1 (License Grant)
+ are the complete grant of rights to you in the Software Products, and no other
+ licenses are granted, whether by waiver, estoppel, implication, equity or
+ otherwise. Meta and its licensors reserve all rights not expressly granted by
+ this License.
+
+ 2. RESTRICTIONS
+
+ You will not, and will not permit, assist or cause any third party to:
+
+ a. use, modify, copy, reproduce, create derivative works of, or distribute the
+ Software Products (or any derivative works thereof, works incorporating the
+ Software Products, or any data produced by the Software), in whole or in part,
+ for (i) any commercial or production purposes, (ii) military purposes or in the
+ service of nuclear technology, (iii) purposes of surveillance, including any
+ research or development relating to surveillance, (iv) biometric processing,
+ (v) in any manner that infringes, misappropriates, or otherwise violates any
+ third-party rights, or (vi) in any manner that violates any applicable law,
+ including any privacy or security laws, rules, regulations, directives, or
+ governmental requirements (including the General Data Privacy Regulation
+ (Regulation (EU) 2016/679), the California Consumer Privacy Act, and any and
+ all laws governing the processing of biometric information), as well as all
+ amendments and successor laws to any of the foregoing;
+
+ b. decompile, disassemble, or reverse-engineer the Software, in whole or in
+ part;
+
+ c. alter or remove copyright and other proprietary notices which appear on or
+ in the Software Products;
+
+ d. utilize any equipment, device, software, or other means to circumvent or
+ remove any security or protection used by Meta in connection with the Software,
+ or to circumvent or remove any usage restrictions, or to enable functionality
+ disabled by Meta; or
+
+ e. offer or impose any terms on the Software Products that alter, restrict, or
+ are inconsistent with the terms of this License.
+
+ 3. ATTRIBUTION
+
+ Together with any copies of the Software Products (as well as derivative works
+ thereof or works incorporating the Software Products) that you distribute, you
+ must provide (i) a copy of this License, and (ii) the following attribution
+ notice: “AV-HuBERT is licensed under the AV-HuBERT license, Copyright (c) Meta
+ Platforms, Inc. All Rights Reserved.”
+
+ 4. DISCLAIMERS
+
+ THE SOFTWARE PRODUCTS ARE PROVIDED “AS IS” and “WITH ALL FAULTS” WITH NO
+ WARRANTY OF ANY KIND, EXPRESS OR IMPLIED. META EXPRESSLY DISCLAIMS ALL
+ REPRESENTATIONS AND WARRANTIES, EXPRESS OR IMPLIED, WHETHER BY STATUTE, CUSTOM,
+ USAGE OR OTHERWISE AS TO ANY MATTERS RELATED TO THE SOFTWARE PRODUCTS,
+ INCLUDING BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE, TITLE, SATISFACTORY QUALITY, OR
+ NON-INFRINGEMENT. META MAKES NO WARRANTIES OR REPRESENTATIONS THAT THE SOFTWARE
+ PRODUCTS WILL BE ERROR FREE OR FREE OF VIRUSES OR OTHER HARMFUL COMPONENTS, OR
+ PRODUCE ANY PARTICULAR RESULTS.
+
+ 5. LIMITATION OF LIABILITY
+
+ TO THE FULLEST EXTENT PERMITTED BY LAW, IN NO EVENT WILL META BE LIABLE TO YOU
+ (A) UNDER ANY THEORY OF LIABILITY, WHETHER BASED IN CONTRACT, TORT, NEGLIGENCE,
+ STRICT LIABILITY, WARRANTY, OR OTHERWISE UNDER THIS LICENSE, OR (B) FOR ANY
+ INDIRECT, CONSEQUENTIAL, EXEMPLARY, INCIDENTAL, PUNITIVE OR SPECIAL DAMAGES OR
+ LOST PROFITS, EVEN IF META HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
+ THE SOFTWARE PRODUCTS, THEIR CONSTITUENT COMPONENTS, AND ANY OUTPUT
+ (COLLECTIVELY, “SOFTWARE MATERIALS”) ARE NOT DESIGNED OR INTENDED FOR USE IN
+ ANY APPLICATION OR SITUATION WHERE FAILURE OR FAULT OF THE SOFTWARE MATERIALS
+ COULD REASONABLY BE ANTICIPATED TO LEAD TO SERIOUS INJURY OF ANY PERSON,
+ INCLUDING POTENTIAL DISCRIMINATION OR VIOLATION OF AN INDIVIDUAL’S PRIVACY
+ RIGHTS, OR TO SEVERE PHYSICAL, PROPERTY, OR ENVIRONMENTAL DAMAGE (EACH, A
+ “HIGH-RISK USE”). IF YOU ELECT TO USE ANY OF THE SOFTWARE MATERIALS FOR A
+ HIGH-RISK USE, YOU DO SO AT YOUR OWN RISK. YOU AGREE TO DESIGN AND IMPLEMENT
+ APPROPRIATE DECISION-MAKING AND RISK-MITIGATION PROCEDURES AND POLICIES IN
+ CONNECTION WITH A HIGH-RISK USE SUCH THAT EVEN IF THERE IS A FAILURE OR FAULT
+ IN ANY OF THE SOFTWARE MATERIALS, THE SAFETY OF PERSONS OR PROPERTY AFFECTED BY
+ THE ACTIVITY STAYS AT A LEVEL THAT IS REASONABLE, APPROPRIATE, AND LAWFUL FOR
+ THE FIELD OF THE HIGH-RISK USE.
+
+ 6. TERMINATION; SURVIVAL
+
+ a. This License will automatically terminate upon any breach by you of the
+ terms of this License.
+
+ b. We may terminate this License, in whole or in part, at any time upon notice
+ (including electronic) to you.
+
+ c. The following sections survive termination of this License: 2
+ (Restrictions), 3 (Attribution), 4 (Disclaimers), 5 (Limitation on Liability),
+ 6 (Termination; Survival), 7 (Third Party Materials), 8 (Trademarks), 9
+ (Applicable Law; Dispute Resolution), and 10 (Miscellaneous).
+
+ 7. THIRD PARTY MATERIALS
+
+ The Software Products may contain third-party software or other components
+ (including free and open source software) (all of the foregoing, “Third Party
+ Materials”), which are subject to the license terms of the respective
+ third-party licensors. Your dealings or correspondence with third parties and
+ your use of or interaction with any Third Party Materials are solely between
+ you and the third party. Meta does not control or endorse, and makes no
+ representations or warranties regarding, any Third Party Materials, and your
+ access to and use of such Third Party Materials are at your own risk.
+
+ 8. TRADEMARKS
+
+ Licensee has not been granted any trademark license as part of this License and
+ may not use any name or mark associated with Meta without the prior written
+ permission of Meta, except to the extent necessary to make the reference
+ required by the “ATTRIBUTION” section of this Agreement.
+
+ 9. APPLICABLE LAW; DISPUTE RESOLUTION
+
+ This License will be governed and construed under the laws of the State of
+ California without regard to conflicts of law provisions. Any suit or
+ proceeding arising out of or relating to this License will be brought in the
+ federal or state courts, as applicable, in San Mateo County, California, and
+ each party irrevocably submits to the jurisdiction and venue of such courts.
+
+ 10. MISCELLANEOUS
+
+ If any provision or part of a provision of this License is unlawful, void or
+ unenforceable, that provision or part of the provision is deemed severed from
+ this License, and will not affect the validity and enforceability of any
+ remaining provisions. The failure of Meta to exercise or enforce any right or
+ provision of this License will not operate as a waiver of such right or
+ provision. This License does not confer any third-party beneficiary rights upon
+ any other person or entity. This License, together with the Documentation,
+ contains the entire understanding between you and Meta regarding the subject
+ matter of this License, and supersedes all other written or oral agreements and
+ understandings between you and Meta regarding such subject matter. No change or
+ addition to any provision of this License will be binding unless it is in
+ writing and signed by an authorized representative of both you and Meta.
av_hubert/README.md ADDED
@@ -0,0 +1,164 @@
+ # AV-HuBERT (Audio-Visual Hidden Unit BERT)
+ [Learning Audio-Visual Speech Representation by Masked Multimodal Cluster Prediction](https://arxiv.org/abs/2201.02184)
+
+ [Robust Self-Supervised Audio-Visual Speech Recognition](https://arxiv.org/abs/2201.01763)
+
+ ![lip-reading](assets/lipreading.gif)
+
+ ## Introduction
+ AV-HuBERT is a self-supervised representation learning framework for audio-visual speech. It achieves state-of-the-art results in lip reading, ASR, and audio-visual speech recognition on the LRS3 audio-visual speech benchmark.
+
+ If you find AV-HuBERT useful in your research, please use the following BibTeX entries for citation.
+ ```BibTeX
+ @article{shi2022avhubert,
+     author = {Bowen Shi and Wei-Ning Hsu and Kushal Lakhotia and Abdelrahman Mohamed},
+     title = {Learning Audio-Visual Speech Representation by Masked Multimodal Cluster Prediction},
+     journal = {arXiv preprint arXiv:2201.02184},
+     year = {2022}
+ }
+
+ @article{shi2022avsr,
+     author = {Bowen Shi and Wei-Ning Hsu and Abdelrahman Mohamed},
+     title = {Robust Self-Supervised Audio-Visual Speech Recognition},
+     journal = {arXiv preprint arXiv:2201.01763},
+     year = {2022}
+ }
+ ```
+
+ ## License
+
+ AV-HuBERT LICENSE AGREEMENT
+
+ This License Agreement (as may be amended in accordance with this License
+ Agreement, “License”), between you (“Licensee” or “you”) and Meta Platforms,
+ Inc. (“Meta” or “we”) applies to your use of any computer program, algorithm,
+ source code, object code, or software that is made available by Meta under this
+ License (“Software”) and any specifications, manuals, documentation, and other
+ written information provided by Meta related to the Software (“Documentation”).
+
+ By using the Software, you agree to the terms of [this
+ License](https://github.com/facebookresearch/av_hubert/blob/main/LICENSE). If
+ you do not agree to this License, then you do not have any rights to use the
+ Software or Documentation (collectively, the “Software Products”), and you must
+ immediately cease using the Software Products.
+
+ ## Pre-trained and fine-tuned models
+
+ Please find the checkpoints [here](http://facebookresearch.github.io/av_hubert)
+
+ ## Demo
+ Run our lip-reading demo using Colab: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1bNXkfpHiVHzXQH8WjGhzQ-fsDxolpUjD)
+
+ ## Installation
+ First, create a conda virtual environment and activate it:
+ ```
+ conda create -n avhubert python=3.8 -y
+ conda activate avhubert
+ ```
+ Then, clone this repository and initialize the fairseq submodule:
+ ```
+ git clone https://github.com/facebookresearch/av_hubert.git
+ cd av_hubert
+ git submodule init
+ git submodule update
+ ```
+
+ Lastly, install Fairseq and the other packages:
+ ```
+ pip install -r requirements.txt
+ cd fairseq
+ pip install --editable ./
+ ```
+
+ ## Load a pretrained model
+ ```sh
+ $ cd avhubert
+ $ python
+ >>> import fairseq
+ >>> import hubert_pretraining, hubert
+ >>> ckpt_path = "/path/to/the/checkpoint.pt"
+ >>> models, cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task([ckpt_path])
+ >>> model = models[0]
+ ```
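+
+ Once loaded, the model can extract frame-level features. Below is a minimal sketch, mirroring the `extract_features` call used in `clustering/dump_hubert_feature.py`; here `audio_feats` (shape `(1, F, T)`, stacked log filterbanks) and `video_feats` (shape `(1, 1, T, H, W)`, normalized lip crops) are hypothetical placeholders that you must prepare yourself:
+ ```sh
+ >>> import torch
+ >>> model = model.eval()
+ >>> with torch.no_grad():
+ ...     feature, _ = model.extract_features(
+ ...         source={'audio': audio_feats, 'video': video_feats},
+ ...         padding_mask=None, mask=False, output_layer=12)
+ >>> feature.shape  # (1, T, D): one representation per video frame
+ ```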
+
+ ## Train a new model
+
+ ### Data preparation
+
+ Follow the steps in [`preparation`](avhubert/preparation/) to pre-process:
+ - LRS3 and VoxCeleb2 datasets
+
+ Follow the steps in [`clustering`](avhubert/clustering/) (pre-train only) to create:
+ - `{train,valid}.km` frame-aligned pseudo label files.
+ The `label_rate` is the same as the feature frame rate used for clustering,
+ which is 100Hz for MFCC features and 25Hz for AV-HuBERT features by default.
+
+ ### Pre-train an AV-HuBERT model
+
+ Suppose `{train,valid}.tsv` are saved at `/path/to/data`, `{train,valid}.km`
+ are saved at `/path/to/labels`, the configuration file is saved at `/path/to/conf/conf-name`, and the label rate is 100Hz.
+
+ To train a model, run:
+ ```sh
+ $ cd avhubert
+ $ fairseq-hydra-train --config-dir /path/to/conf/ --config-name conf-name \
+   task.data=/path/to/data task.label_dir=/path/to/label \
+   model.label_rate=100 hydra.run.dir=/path/to/experiment/pretrain/ \
+   common.user_dir=`pwd`
+ ```
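+
+ For later pre-training iterations that use AV-HuBERT-feature clusters, the labels are at 25Hz rather than 100Hz (see the data-preparation note above), so the same command would presumably take `model.label_rate=25` together with the matching iteration config, e.g.:
+ ```sh
+ $ fairseq-hydra-train --config-dir ./conf/pretrain/ --config-name base_lrs3_iter2 \
+   task.data=/path/to/data task.label_dir=/path/to/label \
+   model.label_rate=25 hydra.run.dir=/path/to/experiment/pretrain-iter2/ \
+   common.user_dir=`pwd`
+ ```
+ The label rate must always match the frame rate of the `.km` label files.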
+
+ ### Finetune an AV-HuBERT model with Seq2Seq
+ Suppose `{train,valid}.tsv` are saved at `/path/to/data`, `{train,valid}.wrd`
+ are saved at `/path/to/labels`, and the configuration file is saved at `/path/to/conf/conf-name`.
+
+ To fine-tune a pre-trained HuBERT model at `/path/to/checkpoint`, run:
+ ```sh
+ $ cd avhubert
+ $ fairseq-hydra-train --config-dir /path/to/conf/ --config-name conf-name \
+   task.data=/path/to/data task.label_dir=/path/to/label \
+   task.tokenizer_bpe_model=/path/to/tokenizer model.w2v_path=/path/to/checkpoint \
+   hydra.run.dir=/path/to/experiment/finetune/ common.user_dir=`pwd`
+ ```
+
+ ### Decode an AV-HuBERT model
+ Suppose `test.tsv` and `test.wrd` are the video list and transcripts of
+ the split to be decoded, saved at `/path/to/data`, and the fine-tuned model is
+ saved at `/path/to/checkpoint`.
+
+ #### Seq2Seq decoding
+
+ `task.normalize` needs to be consistent with the value used during fine-tuning.
+ Decoding results will be saved at
+ `/path/to/experiment/decode/s2s/test`.
+
+ ```sh
+ $ cd avhubert
+ $ python -B infer_s2s.py --config-dir ./conf/ --config-name conf-name \
+   dataset.gen_subset=test common_eval.path=/path/to/checkpoint \
+   common_eval.results_path=/path/to/experiment/decode/s2s/test \
+   override.modalities=['video'] common.user_dir=`pwd`
+ ```
+
+ The command above uses the default decoding hyperparameters, which can be found
+ in `conf/s2s_decode.yaml`. `override.modalities` can be set to `['video']` (for lip reading),
+ `['audio']` (for ASR), or `['audio','video']` (for audio-visual speech recognition). These parameters can be
+ configured from the command line. For example, to search with a beam size of
+ 20, append `generation.beam=20` to the command above; see the example after the list below.
+ Important parameters include:
+ - generation.beam
+ - generation.lenpen
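+
+ For instance, a decoding run that overrides both (the values here are illustrative, not tuned recommendations):
+ ```sh
+ $ python -B infer_s2s.py --config-dir ./conf/ --config-name conf-name \
+   dataset.gen_subset=test common_eval.path=/path/to/checkpoint \
+   common_eval.results_path=/path/to/experiment/decode/s2s/test \
+   override.modalities=['video'] common.user_dir=`pwd` \
+   generation.beam=20 generation.lenpen=1.0
+ ```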
+
+ #### Different test set
+ If your test data are stored in a different directory from the training data, append the following to the above command,
+
+ `+override.data=/path/to/test +override.label_dir=/path/to/test`
+
+ where `/path/to/test` contains `test.{tsv,wrd}`. This is useful when you want to test with the fine-tuned checkpoints we provide.
+
+ #### Test in a noisy environment
+ If you want to test your model in a noisy environment, append the following to the above command.
+
+ `+override.noise_wav=/path/to/noise override.noise_prob=1 override.noise_snr={snr}`
+
+ `{snr}` is the signal-to-noise ratio (SNR) and `/path/to/noise` is a folder containing noise manifest files (`/path/to/noise/{valid,test}.tsv`). See [`preparation`](avhubert/preparation/) for setting up this folder.
+
av_hubert/assets/lipreading.gif ADDED

Git LFS Details

  • SHA256: 8cf0498b502e01bd6eb72f0985854a64793a6b4f0513181a8bc474abc3e8e75f
  • Pointer size: 132 Bytes
  • Size of remote file: 1.82 MB
av_hubert/avhubert/__init__.py ADDED
@@ -0,0 +1,10 @@
+ # Copyright (c) Facebook, Inc. and its affiliates.
+ #
+ # This source code is licensed under the MIT license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ from .hubert import *  # noqa
+ from .hubert_asr import *  # noqa
+ from .hubert_dataset import *
+ from .hubert_pretraining import *
+ from .hubert_criterion import *
av_hubert/avhubert/clustering/README.md ADDED
@@ -0,0 +1,100 @@
+ # AV-HuBERT Label Preparation
+
+ This folder contains scripts for preparing AV-HuBERT labels from tsv files. The
+ steps are:
+ 1. feature extraction
+ 2. k-means clustering
+ 3. k-means application
+
+ ## Installation
+ To prepare labels, you need some additional packages:
+ ```
+ pip install -r requirements.txt
+ ```
+
+ ## Data preparation
+
+ A `*.tsv` file is a manifest of the data: its first line is the root directory,
+ and each following line gives, separated by tabs, an utterance id, the video and
+ audio subpaths, and the number of video and audio frames:
+ ```
+ <root-dir>
+ <id-1> <video-path-1> <audio-path-1> <video-number-frames-1> <audio-number-frames-1>
+ <id-2> <video-path-2> <audio-path-2> <video-number-frames-2> <audio-number-frames-2>
+ ...
+ ```
+ See [here](../preparation/) for data preparation for LRS3 and VoxCeleb2.
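+
+ As a sanity check, a manifest can be parsed in a few lines of Python (a minimal sketch written for this README, not a script shipped with the repo):
+ ```
+ with open("train.tsv") as f:
+     root = f.readline().rstrip()            # first line: root directory
+     for line in f:
+         uid, video, audio, nv, na = line.rstrip().split("\t")
+         assert int(nv) > 0 and int(na) > 0  # frame counts must be positive integers
+ ```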
+
+ ## Feature extraction
+
+ ### MFCC feature
+ Suppose the tsv file is at `${tsv_dir}/${split}.tsv`. To extract 39-D
+ mfcc+delta+ddelta features for the 1st iteration AV-HuBERT training, run:
+ ```sh
+ python dump_mfcc_feature.py ${tsv_dir} ${split} ${nshard} ${rank} ${feat_dir}
+ ```
+ This would shard the tsv file into `${nshard}` shards and extract features for the
+ `${rank}`-th shard, where rank is an integer in `[0, nshard-1]`. Features would
+ be saved at `${feat_dir}/${split}_${rank}_${nshard}.{npy,len}`.
+
+
+ ### AV-HuBERT feature
+ To extract features from the `${layer}`-th transformer layer of a trained
+ AV-HuBERT model saved at `${ckpt_path}`, run:
+ ```sh
+ python dump_hubert_feature.py ${tsv_dir} ${split} ${ckpt_path} ${layer} ${nshard} ${rank} ${feat_dir} --user_dir `pwd`/../
+ ```
+ Features would also be saved at `${feat_dir}/${split}_${rank}_${nshard}.{npy,len}`.
+
+ - if out-of-memory, decrease the chunk size with `--max_chunk`
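+
+ Since each invocation processes a single shard, a full extraction is a loop (or a job array) over ranks, e.g.:
+ ```sh
+ for rank in $(seq 0 $((nshard - 1))); do
+   python dump_mfcc_feature.py ${tsv_dir} ${split} ${nshard} ${rank} ${feat_dir}
+ done
+ ```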
+
+
+ ## K-means clustering
+ To fit a k-means model with `${n_clusters}` clusters on 10% of the `${split}` data, run
+ ```sh
+ python learn_kmeans.py ${feat_dir} ${split} ${nshard} ${km_path} ${n_cluster} --percent 0.1
+ ```
+ This saves the k-means model to `${km_path}`.
+
+ - set `--percent -1` to use all data
+ - more kmeans options can be found with `-h` flag
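+
+ The saved model is a scikit-learn `MiniBatchKMeans` object serialized with joblib (this is also how `dump_km_label.py` loads it), so it can be inspected directly:
+ ```
+ import joblib
+ km = joblib.load(km_path)      # trained MiniBatchKMeans model
+ km.cluster_centers_.shape      # (n_clusters, feature_dim)
+ ```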
+
+
+ ## K-means application
+ To apply a trained k-means model `${km_path}` to obtain labels for `${split}`, run
+ ```sh
+ python dump_km_label.py ${feat_dir} ${split} ${km_path} ${nshard} ${rank} ${lab_dir}
+ ```
+ This would extract labels for the `${rank}`-th shard out of `${nshard}` shards
+ and dump them to `${lab_dir}/${split}_${rank}_${nshard}.km`
+
+
+ Finally, merge shards for `${split}` by running
+ ```sh
+ for rank in $(seq 0 $((nshard - 1))); do
+   cat $lab_dir/${split}_${rank}_${nshard}.km
+ done > $lab_dir/${split}.km
+ ```
+ and create a dictionary of cluster indexes by running
+ ```sh
+ for i in $(seq 0 $((n_cluster-1)));do
+   echo $i 10000
+ done > $lab_dir/dict.{mfcc,km}.txt
+ ```
+
+
+ ## Clustering on slurm
+ If you are on slurm, you can combine the above steps (feature extraction + K-means clustering + K-means application) by:
+
+ - MFCC feature cluster:
+ ```sh
+ python submit_cluster.py --tsv ${tsv_dir} --output ${lab_dir} --ncluster ${n_cluster} \
+   --nshard ${nshard} --mfcc --percent 0.1
+ ```
+
+ - AV-HuBERT feature cluster:
+ ```sh
+ python submit_cluster.py --tsv ${tsv_dir} --output ${lab_dir} --ckpt ${ckpt_path} --nlayer ${layer} \
+   --ncluster ${n_cluster} --nshard ${nshard} --percent 0.1
+ ```
+
+
+ This would dump labels to `${lab_dir}/{train,valid}.km`.
av_hubert/avhubert/clustering/dump_hubert_feature.py ADDED
@@ -0,0 +1,177 @@
+ # Copyright (c) Facebook, Inc. and its affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import logging
+ import math
+ import os
+ import sys
+
+ import fairseq
+ import soundfile as sf
+ import torch
+ import torch.nn.functional as F
+ import tqdm
+ from npy_append_array import NpyAppendArray
+ import numpy as np
+ from python_speech_features import logfbank
+ from scipy.io import wavfile
+
+ logging.basicConfig(
+     format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
+     datefmt="%Y-%m-%d %H:%M:%S",
+     level=os.environ.get("LOGLEVEL", "INFO").upper(),
+     stream=sys.stdout,
+ )
+ logger = logging.getLogger("dump_hubert_feature")
+
+
+ class HubertFeatureReader(object):
+     def __init__(self, ckpt_path, layer, max_chunk=1600000, custom_utils=None):
+         (
+             model,
+             cfg,
+             task,
+         ) = fairseq.checkpoint_utils.load_model_ensemble_and_task([ckpt_path])
+         self.model = model[0].eval().cuda()
+         self.task = task
+         self.layer = layer
+         self.max_chunk = max_chunk
+         self.stack_order_audio = self.task.cfg.stack_order_audio
+         image_crop_size, image_mean, image_std = self.task.cfg.image_crop_size, self.task.cfg.image_mean, self.task.cfg.image_std
+         self.transform = custom_utils.Compose([
+             custom_utils.Normalize(0.0, 255.0),
+             custom_utils.CenterCrop((image_crop_size, image_crop_size)),
+             custom_utils.Normalize(image_mean, image_std)])
+
+         self.custom_utils = custom_utils
+         logger.info(f"TASK CONFIG:\n{self.task.cfg}")
+         logger.info(f" max_chunk = {self.max_chunk}")
+         logger.info(f"Transform: {self.transform}")
+
+     def load_feature(self, mix_name, ref_len=None):
+         def stacker(feats, stack_order):
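+             # Zero-pad feats to a multiple of stack_order, then stack every
+             # stack_order consecutive frames: (T, F) -> (T//stack_order, stack_order*F).
+             # With 100Hz filterbanks and stack_order_audio=4, the stacked audio
+             # features land at 25Hz, matching the video frame rate.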
+             feat_dim = feats.shape[1]
+             if len(feats) % stack_order != 0:
+                 res = stack_order - len(feats) % stack_order
+                 res = np.zeros([res, feat_dim]).astype(feats.dtype)
+                 feats = np.concatenate([feats, res], axis=0)
+             feats = feats.reshape((-1, stack_order, feat_dim)).reshape(-1, stack_order*feat_dim)
+             return feats
+         video_fn, audio_fn = mix_name
+         video_feats = self.load_image(video_fn)
+
+         audio_fn = audio_fn.split(':')[0]
+         sample_rate, wav_data = wavfile.read(audio_fn)
+         assert sample_rate == 16_000 and len(wav_data.shape) == 1
+         audio_feats = logfbank(wav_data, samplerate=sample_rate).astype(np.float32)
+         audio_feats = stacker(audio_feats, self.stack_order_audio)
+
+         diff = len(audio_feats) - len(video_feats)
+         if diff < 0:
+             audio_feats = np.concatenate([audio_feats, np.zeros([-diff, audio_feats.shape[-1]], dtype=audio_feats.dtype)])
+         elif diff > 0:
+             audio_feats = audio_feats[:-diff]
+         return video_feats, audio_feats
+
+     def load_image(self, audio_name):
+         feats = self.custom_utils.load_video(audio_name)
+         feats = self.transform(feats)
+         feats = np.expand_dims(feats, axis=-1)
+         return feats
+
+     def get_feats(self, path, ref_len=None):
+         video_feats, audio_feats = self.load_feature(path, ref_len)
+         with torch.no_grad():
+             audio_feats, video_feats = torch.from_numpy(audio_feats.astype(np.float32)).cuda(), torch.from_numpy(video_feats.astype(np.float32)).cuda()
+             if self.task.cfg.normalize:
+                 audio_feats = F.layer_norm(audio_feats, audio_feats.shape[1:])
+             video_feats = video_feats.unsqueeze(dim=0).permute((0, 4, 1, 2, 3)).contiguous()
+             audio_feats = audio_feats.unsqueeze(dim=0).transpose(1, 2)
+             source = {'audio': audio_feats, 'video': video_feats}
+             if self.layer == 0:
+                 ret_conv, output_layer = True, None
+             else:
+                 ret_conv, output_layer = False, self.layer
+             feat, _ = self.model.extract_features(
+                 source=source,
+                 padding_mask=None,
+                 mask=False,
+                 output_layer=output_layer,
+                 ret_conv=ret_conv
+                 # output_layer=self.layer,
+             )
+             return feat.squeeze(dim=0)
+
+
+ def get_path_iterator(tsv, nshard, rank):
+     with open(tsv, "r") as f:
+         root = f.readline().rstrip()
+         lines = [line.rstrip() for line in f]
+     tot = len(lines)
+     shard_size = math.ceil(tot / nshard)
+     start, end = rank * shard_size, min((rank + 1) * shard_size, tot)
+     assert start < end, f"start={start}, end={end}"
+     logger.info(
+         f"rank {rank} of {nshard}, process {end-start} "
+         f"({start}-{end}) out of {tot}"
+     )
+
+     lines = lines[start:end]
+
+     def iterate():
+         for line in lines:
+             items = line.strip().split("\t")
+             # audio_path = f"{items[1]}:{items[0]}"
+             yield (items[1], items[2]+':'+items[0]), int(items[3])
+
+     return iterate, len(lines)
+
+
+ def dump_feature(
+     tsv_dir, split, ckpt_path, layer, nshard, rank, feat_dir, max_chunk, custom_utils=None, **kwargs
+ ):
+     reader = HubertFeatureReader(ckpt_path, layer, max_chunk, custom_utils=custom_utils)
+     generator, num = get_path_iterator(f"{tsv_dir}/{split}.tsv", nshard, rank)
+     iterator = generator()
+
+     feat_path = f"{feat_dir}/{split}_{rank}_{nshard}.npy"
+     leng_path = f"{feat_dir}/{split}_{rank}_{nshard}.len"
+
+     os.makedirs(feat_dir, exist_ok=True)
+     if os.path.exists(feat_path):
+         os.remove(feat_path)
+
+     feat_f = NpyAppendArray(feat_path)
+     with open(leng_path, "w") as leng_f:
+         for path, nsample in tqdm.tqdm(iterator, total=num):
+             feat = reader.get_feats(path, nsample)
+             feat_f.append(feat.cpu().numpy())
+             leng_f.write(f"{len(feat)}\n")
+     logger.info("finished successfully")
+
+
+ if __name__ == "__main__":
+     import argparse
+
+     parser = argparse.ArgumentParser()
+     parser.add_argument("tsv_dir")
+     parser.add_argument("split")
+     parser.add_argument("ckpt_path")
+     parser.add_argument("layer", type=int)
+     parser.add_argument("nshard", type=int)
+     parser.add_argument("rank", type=int)
+     parser.add_argument("feat_dir")
+     parser.add_argument("--max_chunk", type=int, default=1600000)
+     parser.add_argument("--user_dir", type=str, default=None)
+
+     args = parser.parse_args()
+     logger.info(args)
+     fairseq.utils.import_user_module(args)
+     sys.path.append(args.user_dir)
+     import utils as custom_utils
+     kwargs = vars(args)
+     kwargs.update({'custom_utils': custom_utils})
+     dump_feature(**kwargs)
av_hubert/avhubert/clustering/dump_km_label.py ADDED
@@ -0,0 +1,99 @@
+ # Copyright (c) Facebook, Inc. and its affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import logging
+ import os
+ import sys
+
+ import numpy as np
+
+ import joblib
+ import torch
+ import tqdm
+
+ logging.basicConfig(
+     format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
+     datefmt="%Y-%m-%d %H:%M:%S",
+     level=os.environ.get("LOGLEVEL", "INFO").upper(),
+     stream=sys.stdout,
+ )
+ logger = logging.getLogger("dump_km_label")
+
+
+ class ApplyKmeans(object):
+     def __init__(self, km_path):
+         self.km_model = joblib.load(km_path)
+         self.C_np = self.km_model.cluster_centers_.transpose()
+         self.Cnorm_np = (self.C_np ** 2).sum(0, keepdims=True)
+
+         self.C = torch.from_numpy(self.C_np)
+         self.Cnorm = torch.from_numpy(self.Cnorm_np)
+         if torch.cuda.is_available():
+             self.C = self.C.cuda()
+             self.Cnorm = self.Cnorm.cuda()
+
+     def __call__(self, x):
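+         # Squared Euclidean distance from each frame to every centroid,
+         # expanded as ||x||^2 - 2*x.C + ||C||^2; argmin yields the cluster id.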
+         if isinstance(x, torch.Tensor):
+             dist = (
+                 x.pow(2).sum(1, keepdim=True)
+                 - 2 * torch.matmul(x, self.C)
+                 + self.Cnorm
+             )
+             return dist.argmin(dim=1).cpu().numpy()
+         else:
+             dist = (
+                 (x ** 2).sum(1, keepdims=True)
+                 - 2 * np.matmul(x, self.C_np)
+                 + self.Cnorm_np
+             )
+             return np.argmin(dist, axis=1)
+
+
+ def get_feat_iterator(feat_dir, split, nshard, rank):
+     feat_path = f"{feat_dir}/{split}_{rank}_{nshard}.npy"
+     leng_path = f"{feat_dir}/{split}_{rank}_{nshard}.len"
+     with open(leng_path, "r") as f:
+         lengs = [int(line.rstrip()) for line in f]
+         offsets = [0] + np.cumsum(lengs[:-1]).tolist()
+
+     def iterate():
+         feat = np.load(feat_path, mmap_mode="r")
+         assert feat.shape[0] == (offsets[-1] + lengs[-1])
+         for offset, leng in zip(offsets, lengs):
+             yield feat[offset: offset + leng]
+
+     return iterate, len(lengs)
+
+
+ def dump_label(feat_dir, split, km_path, nshard, rank, lab_dir):
+     apply_kmeans = ApplyKmeans(km_path)
+     generator, num = get_feat_iterator(feat_dir, split, nshard, rank)
+     iterator = generator()
+
+     lab_path = f"{lab_dir}/{split}_{rank}_{nshard}.km"
+     os.makedirs(lab_dir, exist_ok=True)
+     with open(lab_path, "w") as f:
+         for feat in tqdm.tqdm(iterator, total=num):
+             # feat = torch.from_numpy(feat).cuda()
+             lab = apply_kmeans(feat).tolist()
+             f.write(" ".join(map(str, lab)) + "\n")
+     logger.info("finished successfully")
+
+
+ if __name__ == "__main__":
+     import argparse
+
+     parser = argparse.ArgumentParser()
+     parser.add_argument("feat_dir")
+     parser.add_argument("split")
+     parser.add_argument("km_path")
+     parser.add_argument("nshard", type=int)
+     parser.add_argument("rank", type=int)
+     parser.add_argument("lab_dir")
+     args = parser.parse_args()
+     logging.info(str(args))
+
+     dump_label(**vars(args))
av_hubert/avhubert/clustering/dump_mfcc_feature.py ADDED
@@ -0,0 +1,117 @@
+ # Copyright (c) Facebook, Inc. and its affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import logging
+ import math
+ import os
+ import sys
+
+ import soundfile as sf
+ import torch
+ import torchaudio
+ import tqdm
+ from npy_append_array import NpyAppendArray
+
+ logging.basicConfig(
+     format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
+     datefmt="%Y-%m-%d %H:%M:%S",
+     level=os.environ.get("LOGLEVEL", "INFO").upper(),
+     stream=sys.stdout,
+ )
+ logger = logging.getLogger("dump_mfcc_feature")
+
+
+ class MfccFeatureReader(object):
+     def __init__(self, sample_rate):
+         self.sample_rate = sample_rate
+
+     def read_audio(self, path, ref_len=None):
+         wav, sr = sf.read(path)
+         assert sr == self.sample_rate, sr
+         if wav.ndim == 2:
+             wav = wav.mean(-1)
+         assert wav.ndim == 1, wav.ndim
+         if ref_len is not None and abs(ref_len - len(wav)) > 160:
+             logging.warning(f"ref {ref_len} != read {len(wav)} ({path})")
+         return wav
+
+     def get_feats(self, path, ref_len=None):
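+         # 13-d Kaldi-compatible MFCCs plus delta and delta-delta features give
+         # the 39-d vectors referred to as "mfcc+delta+ddelta" in the README.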
+         x = self.read_audio(path, ref_len)
+         with torch.no_grad():
+             x = torch.from_numpy(x).float()
+             x = x.view(1, -1)
+
+             mfccs = torchaudio.compliance.kaldi.mfcc(
+                 waveform=x,
+                 sample_frequency=self.sample_rate,
+                 use_energy=False,
+             )  # (time, freq)
+             mfccs = mfccs.transpose(0, 1)  # (freq, time)
+             deltas = torchaudio.functional.compute_deltas(mfccs)
+             ddeltas = torchaudio.functional.compute_deltas(deltas)
+             concat = torch.cat([mfccs, deltas, ddeltas], dim=0)
+             concat = concat.transpose(0, 1).contiguous()  # (time, freq)
+             return concat
+
+
+ def get_path_iterator(tsv, nshard, rank):
+     with open(tsv, "r") as f:
+         root = f.readline().rstrip()
+         lines = [line.rstrip() for line in f]
+     tot = len(lines)
+     shard_size = math.ceil(tot / nshard)
+     start, end = rank * shard_size, min((rank + 1) * shard_size, tot)
+     assert start < end, f"start={start}, end={end}"
+     logger.info(
+         f"rank {rank} of {nshard}, process {end-start} "
+         f"({start}-{end}) out of {tot}"
+     )
+
+     lines = lines[start:end]
+
+     def iterate():
+         for line in lines:
+             _, video_path, wav_path, nsample_video, nsample_wav = line.split("\t")
+             yield f"{root}/{wav_path}", int(nsample_wav)
+
+     return iterate, len(lines)
+
+
+ def dump_feature(tsv_dir, split, nshard, rank, feat_dir, sample_rate=16_000):
+     reader = MfccFeatureReader(sample_rate)
+     generator, num = get_path_iterator(f"{tsv_dir}/{split}.tsv", nshard, rank)
+     iterator = generator()
+
+     feat_path = f"{feat_dir}/{split}_{rank}_{nshard}.npy"
+     leng_path = f"{feat_dir}/{split}_{rank}_{nshard}.len"
+
+     os.makedirs(feat_dir, exist_ok=True)
+     if os.path.exists(feat_path):
+         os.remove(feat_path)
+
+     feat_f = NpyAppendArray(feat_path)
+     with open(leng_path, "w") as leng_f:
+         for path, nsample in tqdm.tqdm(iterator, total=num):
+             feat = reader.get_feats(path, nsample)
+             feat_f.append(feat.cpu().numpy())
+             leng_f.write(f"{len(feat)}\n")
+     logger.info("finished successfully")
+
+
+ if __name__ == "__main__":
+     import argparse
+
+     parser = argparse.ArgumentParser()
+     parser.add_argument("tsv_dir")
+     parser.add_argument("split")
+     parser.add_argument("nshard", type=int)
+     parser.add_argument("rank", type=int)
+     parser.add_argument("feat_dir")
+     parser.add_argument("--sample_rate", type=int, default=16000)
+     args = parser.parse_args()
+     logger.info(args)
+
+     dump_feature(**vars(args))
av_hubert/avhubert/clustering/learn_kmeans.py ADDED
@@ -0,0 +1,147 @@
+ # Copyright (c) Facebook, Inc. and its affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import logging
+ import os
+ import sys
+
+ import numpy as np
+ from sklearn.cluster import MiniBatchKMeans
+
+ import joblib
+
+ logging.basicConfig(
+     format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
+     datefmt="%Y-%m-%d %H:%M:%S",
+     level=os.environ.get("LOGLEVEL", "INFO").upper(),
+     stream=sys.stdout,
+ )
+ logger = logging.getLogger("learn_kmeans")
+
+
+ def get_km_model(
+     n_clusters,
+     init,
+     max_iter,
+     batch_size,
+     tol,
+     max_no_improvement,
+     n_init,
+     reassignment_ratio,
+ ):
+     return MiniBatchKMeans(
+         n_clusters=n_clusters,
+         init=init,
+         max_iter=max_iter,
+         batch_size=batch_size,
+         verbose=1,
+         compute_labels=False,
+         tol=tol,
+         max_no_improvement=max_no_improvement,
+         init_size=None,
+         n_init=n_init,
+         reassignment_ratio=reassignment_ratio,
+     )
+
+
+ def load_feature_shard(feat_dir, split, nshard, rank, percent):
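+     # Load one shard of features; for 0 <= percent <= 1, randomly sample that
+     # fraction of utterances, while percent < 0 memory-maps the full shard.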
+     feat_path = f"{feat_dir}/{split}_{rank}_{nshard}.npy"
+     leng_path = f"{feat_dir}/{split}_{rank}_{nshard}.len"
+     with open(leng_path, "r") as f:
+         lengs = [int(line.rstrip()) for line in f]
+         offsets = [0] + np.cumsum(lengs[:-1]).tolist()
+
+     if percent < 0:
+         return np.load(feat_path, mmap_mode="r")
+     else:
+         nsample = int(np.ceil(len(lengs) * percent))
+         indices = np.random.choice(len(lengs), nsample, replace=False)
+         feat = np.load(feat_path, mmap_mode="r")
+         sampled_feat = np.concatenate(
+             [feat[offsets[i]: offsets[i] + lengs[i]] for i in indices], axis=0
+         )
+         logger.info(
+             (
+                 f"sampled {nsample} utterances, {len(sampled_feat)} frames "
+                 f"from shard {rank}/{nshard}"
+             )
+         )
+         return sampled_feat
+
+
+ def load_feature(feat_dir, split, nshard, seed, percent):
+     assert percent <= 1.0
+     feat = np.concatenate(
+         [
+             load_feature_shard(feat_dir, split, nshard, r, percent)
+             for r in range(nshard)
+         ],
+         axis=0,
+     )
+     logging.info(f"loaded feature with dimension {feat.shape}")
+     return feat
+
+
+ def learn_kmeans(
+     feat_dir,
+     split,
+     nshard,
+     km_path,
+     n_clusters,
+     seed,
+     percent,
+     init,
+     max_iter,
+     batch_size,
+     tol,
+     n_init,
+     reassignment_ratio,
+     max_no_improvement,
+ ):
+     np.random.seed(seed)
+     feat = load_feature(feat_dir, split, nshard, seed, percent)
+     km_model = get_km_model(
+         n_clusters,
+         init,
+         max_iter,
+         batch_size,
+         tol,
+         max_no_improvement,
+         n_init,
+         reassignment_ratio,
+     )
+     km_model.fit(feat)
+     joblib.dump(km_model, km_path)
+
+     inertia = -km_model.score(feat) / len(feat)
+     logger.info("total inertia: %.5f", inertia)
+     logger.info("finished successfully")
+
+
+ if __name__ == "__main__":
+     import argparse
+
+     parser = argparse.ArgumentParser()
+     parser.add_argument("feat_dir", type=str)
+     parser.add_argument("split", type=str)
+     parser.add_argument("nshard", type=int)
+     parser.add_argument("km_path", type=str)
+     parser.add_argument("n_clusters", type=int)
+     parser.add_argument("--seed", default=0, type=int)
+     parser.add_argument(
+         "--percent", default=-1, type=float, help="sample a subset; -1 for all"
+     )
+     parser.add_argument("--init", default="k-means++")
+     parser.add_argument("--max_iter", default=100, type=int)
+     parser.add_argument("--batch_size", default=10000, type=int)
+     parser.add_argument("--tol", default=0.0, type=float)
+     parser.add_argument("--max_no_improvement", default=100, type=int)
+     parser.add_argument("--n_init", default=20, type=int)
+     parser.add_argument("--reassignment_ratio", default=0.0, type=float)
+     args = parser.parse_args()
+     logging.info(str(args))
+
+     learn_kmeans(**vars(args))
av_hubert/avhubert/clustering/requirements.txt ADDED
@@ -0,0 +1,6 @@
+ soundfile
+ joblib
+ sklearn
+ torchaudio==0.10.1
+ npy-append-array==0.9.13
+ submitit==1.4.1
av_hubert/avhubert/clustering/submit_cluster.py ADDED
@@ -0,0 +1,132 @@
+ # Copyright (c) Facebook, Inc. and its affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import os, subprocess
+ import submitit
+ import argparse
+ from argparse import Namespace
+
+ def dump_av_hubert(*args, **kwargs):
+     from dump_hubert_feature import dump_feature
+     import fairseq
+     import sys
+     av_hubert_dir = os.path.join(os.getcwd(), '..')
+     fairseq.utils.import_user_module(Namespace(user_dir=av_hubert_dir))
+     sys.path.append(av_hubert_dir)
+     import utils as custom_utils
+     kwargs.update({'custom_utils': custom_utils})
+     args = args[0]
+     dump_feature(*args, **kwargs)
+     return
+
+
+ def dump_mfcc(*args, **kwargs):
+     from dump_mfcc_feature import dump_feature
+     args = args[0]
+     dump_feature(*args, **kwargs)
+     return
+
+ def run_kmeans(*args, **kwargs):
+     import sys
+     from learn_kmeans import learn_kmeans
+     learn_kmeans(*args, **kwargs)
+     return
+
+ def apply_kmeans(*args, **kwargs):
+     import sys
+     from dump_km_label import dump_label
+     args = args[0]
+     dump_label(*args, **kwargs)
+     return
+
+ def concatenate(*args, **kwargs):
+     from concat import main as concat_fn
+     args = args[0]
+     concat_fn(*args, **kwargs)
+     return
+
+ def main():
+     parser = argparse.ArgumentParser(description='clustering', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+     parser.add_argument('--tsv', type=str, help='tsv dir')
+     parser.add_argument('--output', type=str, help='output dir (labels)')
+     parser.add_argument('--ckpt', type=str, help='checkpoint of last iteration')
+     parser.add_argument('--nlayer', type=int, default=12, help='layer index for clustering')
+     parser.add_argument('--ncluster', type=int, default=500, help='number of clusters')
+     parser.add_argument('--nshard', type=int, default=100, help='number of shards')
+     parser.add_argument('--percent', type=float, default=0.05, help='Percentage for clustering')
+     parser.add_argument('--mfcc', action='store_true', help='extracting MFCC feature')
+     parser.add_argument('--slurm-partition', type=str, help='slurm partitions')
+     args = parser.parse_args()
+     tsv_dir = args.tsv
+     output_dir = args.output
+     km_dir = output_dir
+     feat_dir = output_dir
+     ckpt_path = args.ckpt
+     nlayer = args.nlayer
+     nshard = args.nshard
+     n_clusters = args.ncluster
+     slurm_partition = args.slurm_partition
+     is_mfcc = args.mfcc
+     timeout_min = 240
+     percent = args.percent  # was hard-coded to 0.1, silently ignoring --percent
+     log_folder = "log_submit/%j"
+     km_path = f"{km_dir}/kmeans.mdl"
+     os.makedirs(output_dir, exist_ok=True)
+     ext = submitit.AutoExecutor(folder=log_folder)
+
+     args_array = []
+     if is_mfcc:
+         print("Dump MFCC feature")
+         for rank in range(nshard):
+             args = [tsv_dir, 'train', nshard, rank, output_dir]
+             args_array.append(args)
+         args_array.append([tsv_dir, 'valid', 1, 0, output_dir])
+         ext.update_parameters(timeout_min=60, slurm_partition=slurm_partition, cpus_per_task=1, slurm_array_parallelism=100)
+         jobs = ext.map_array(dump_mfcc, args_array)
+     else:
+         print("Dump AV-Hubert feature")
+         for rank in range(nshard):
+             args = [tsv_dir, 'train', ckpt_path, nlayer, nshard, rank, output_dir, 1600000]
+             args_array.append(args)
+         args_array.append([tsv_dir, 'valid', ckpt_path, nlayer, 1, 0, output_dir, 1600000])
+         ext.update_parameters(timeout_min=60, slurm_partition=slurm_partition, cpus_per_task=1, gpus_per_node=1, slurm_array_parallelism=100)
+         jobs = ext.map_array(dump_av_hubert, args_array)
+     [job.result() for job in jobs]
+
+     print("Learn K-means")
+     percent, batch_size = percent, 20000
+     ext.update_parameters(timeout_min=timeout_min, slurm_partition=slurm_partition, cpus_per_task=8, mem_gb=128)
+     args, kwargs = [feat_dir, 'train', nshard, km_path, n_clusters], vars(Namespace(seed=0, percent=percent, init="k-means++", max_iter=100, batch_size=batch_size, tol=0.0, n_init=20, reassignment_ratio=0.0, max_no_improvement=100))
+     print(args, kwargs)
+     job = ext.submit(run_kmeans, *args, **kwargs)
+     job.result()
+
+     print("Apply K-means")
+     args_array = []
+     for rank in range(nshard):
+         args = [feat_dir, 'train', km_path, nshard, rank, output_dir]
+         args_array.append(args)
+     args_array.append([feat_dir, 'valid', km_path, 1, 0, output_dir])
+     ext.update_parameters(timeout_min=10, slurm_partition=slurm_partition, cpus_per_task=1, slurm_array_parallelism=500)
+     jobs = ext.map_array(apply_kmeans, args_array)
+     [job.result() for job in jobs]
+
+     print("Concatenate labels")
+     cont = f"for rank in $(seq 0 {nshard-1}); do cat {output_dir}/train_${{rank}}_{nshard}.km; done > {output_dir}/train.km"
+     print(cont)
+     subprocess.call(cont, shell=True)
+     cont = f"cp {output_dir}/valid*.km {output_dir}/valid.km"
+     print(cont)
+     subprocess.call(cont, shell=True)
+     with open(f"{output_dir}/dict.km.txt", 'w') as fo:
+         for i in range(n_clusters):
+             fo.write(f"{i} {10000}\n")
+     print(f"Please delete intermediate files to save space: rm {output_dir}/*npy")
+     return
+
+
+ if __name__ == '__main__':
+     main()
av_hubert/avhubert/conf/av-finetune/base_noise_pt_noise_ft_30h.yaml ADDED
@@ -0,0 +1,121 @@
+ # @package _group_
+
+ common:
+   fp16: true
+   log_format: json
+   log_interval: 200
+   tensorboard_logdir: tblog
+   seed: 1337
+   user_dir: ???
+
+ checkpoint:
+   save_interval: 2
+   keep_interval_updates: 1
+   no_epoch_checkpoints: true
+   best_checkpoint_metric: accuracy
+   maximize_best_checkpoint_metric: true
+
+ distributed_training:
+   ddp_backend: c10d
+   find_unused_parameters: true
+   distributed_world_size: 8
+   distributed_port: 29671
+   nprocs_per_node: 8
+
+ task:
+   _name: av_hubert_pretraining
+   is_s2s: true
+   data: ???
+   label_dir: ???
+   tokenizer_bpe_model: ???
+   normalize: true # must be consistent with pre-training
+   labels: ["wrd"]
+   single_target: true
+   fine_tuning: true
+   stack_order_audio: 4
+   tokenizer_bpe_name: sentencepiece
+   max_sample_size: 500
+   modalities: ["video","audio"]
+   image_aug: true
+   pad_audio: true
+   random_crop: false
+   noise_prob: 0.25
+   noise_snr: 0
+   noise_wav: ???
+
+ dataset:
+   num_workers: 6
+   max_tokens: 1000
+   validate_after_updates: 0
+   validate_interval: 2
+   train_subset: train
+   valid_subset: valid
+
+ criterion:
+   _name: label_smoothed_cross_entropy
+   report_accuracy: true
+   label_smoothing: 0.1
+
+ optimization:
+   max_update: 30000
+   lr: [0.001]
+   sentence_avg: true
+   update_freq: [1]
+
+ optimizer:
+   _name: adam
+   adam_betas: (0.9,0.98)
+   adam_eps: 1e-08
+
+ lr_scheduler:
+   _name: tri_stage
+   warmup_steps: 10000
+   hold_steps: 0
+   decay_steps: 20000
+   final_lr_scale: 0.05
+
+ model:
+   _name: av_hubert_seq2seq
+   w2v_path: ???
+   apply_mask: false
+   mask_selection: static
+   mask_length: 10
+   mask_other: 0
+   mask_prob: 0.75
+   mask_channel_selection: static
+   mask_channel_length: 64
+   mask_channel_other: 0
+   mask_channel_prob: 0.5
+   layerdrop: 0.1
+   dropout: 0.0
+   activation_dropout: 0.1
+   attention_dropout: 0.0
+   feature_grad_mult: 1.0
+   decoder_layers: 6
+   decoder_dropout: 0.1
+   decoder_attention_dropout: 0.0
+   decoder_activation_dropout: 0.1
+   freeze_finetune_updates: 24000
+   share_decoder_input_output_embed: true
+   decoder_normalize_before: true
+
+ hydra:
+   job:
+     config:
+       override_dirname:
+         kv_sep: '-'
+         item_sep: '__'
+         exclude_keys:
+           - run
+           - task.data
+           - task.label_dir
+           - model.w2v_path
+           - dataset.train_subset
+           - dataset.valid_subset
+           - criterion.wer_kenlm_model
+           - criterion.wer_lexicon
+   run:
+     dir: ???
+   sweep:
+     dir: ???
+     subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
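A config like the one above is consumed by fairseq's hydra entry point; every "???" field (user_dir, data, label_dir, tokenizer_bpe_model, noise_wav, w2v_path, hydra.run.dir) has to be supplied as a command-line override. A hedged launch sketch from Python, with all paths as placeholders:

    # Hedged sketch: launching the noise-augmented AV fine-tuning config above.
    import subprocess

    subprocess.run([
        "fairseq-hydra-train",
        "--config-dir", "av_hubert/avhubert/conf/av-finetune",
        "--config-name", "base_noise_pt_noise_ft_30h",
        "common.user_dir=/path/to/av_hubert/avhubert",
        "task.data=/path/to/data", "task.label_dir=/path/to/data",
        "task.tokenizer_bpe_model=/path/to/sentencepiece.model",
        "task.noise_wav=/path/to/noise_wav_dir",
        "model.w2v_path=/path/to/pretrained_checkpoint.pt",
        "hydra.run.dir=/path/to/experiment_dir",
    ], check=True)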
av_hubert/avhubert/conf/av-finetune/base_noise_pt_noise_ft_433h.yaml ADDED
@@ -0,0 +1,121 @@
+ # @package _group_
+
+ common:
+   fp16: true
+   log_format: json
+   log_interval: 200
+   tensorboard_logdir: tblog
+   seed: 1337
+   user_dir: ???
+
+ checkpoint:
+   save_interval: 2
+   keep_interval_updates: 1
+   no_epoch_checkpoints: true
+   best_checkpoint_metric: accuracy
+   maximize_best_checkpoint_metric: true
+
+ distributed_training:
+   ddp_backend: c10d
+   find_unused_parameters: true
+   distributed_world_size: 8
+   distributed_port: 29671
+   nprocs_per_node: 8
+
+ task:
+   _name: av_hubert_pretraining
+   is_s2s: true
+   data: ???
+   label_dir: ???
+   tokenizer_bpe_model: ???
+   normalize: true # must be consistent with pre-training
+   labels: ["wrd"]
+   single_target: true
+   fine_tuning: true
+   stack_order_audio: 4
+   tokenizer_bpe_name: sentencepiece
+   max_sample_size: 500
+   modalities: ["video","audio"]
+   image_aug: true
+   pad_audio: true
+   random_crop: false
+   noise_prob: 0.25
+   noise_snr: 0
+   noise_wav: ???
+
+ dataset:
+   num_workers: 6
+   max_tokens: 1000
+   validate_after_updates: 0
+   validate_interval: 2
+   train_subset: train
+   valid_subset: valid
+
+ criterion:
+   _name: label_smoothed_cross_entropy
+   report_accuracy: true
+   label_smoothing: 0.1
+
+ optimization:
+   max_update: 60000
+   lr: [0.001]
+   sentence_avg: true
+   update_freq: [1]
+
+ optimizer:
+   _name: adam
+   adam_betas: (0.9,0.98)
+   adam_eps: 1e-08
+
+ lr_scheduler:
+   _name: tri_stage
+   warmup_steps: 20000
+   hold_steps: 0
+   decay_steps: 40000
+   final_lr_scale: 0.05
+
+ model:
+   _name: av_hubert_seq2seq
+   w2v_path: ???
+   apply_mask: false
+   mask_selection: static
+   mask_length: 10
+   mask_other: 0
+   mask_prob: 0.75
+   mask_channel_selection: static
+   mask_channel_length: 64
+   mask_channel_other: 0
+   mask_channel_prob: 0.5
+   layerdrop: 0.1
+   dropout: 0.0
+   activation_dropout: 0.1
+   attention_dropout: 0.0
+   feature_grad_mult: 1.0
+   decoder_layers: 6
+   decoder_dropout: 0.1
+   decoder_attention_dropout: 0.0
+   decoder_activation_dropout: 0.1
+   freeze_finetune_updates: 48000
+   share_decoder_input_output_embed: true
+   decoder_normalize_before: true
+
+ hydra:
+   job:
+     config:
+       override_dirname:
+         kv_sep: '-'
+         item_sep: '__'
+         exclude_keys:
+           - run
+           - task.data
+           - task.label_dir
+           - model.w2v_path
+           - dataset.train_subset
+           - dataset.valid_subset
+           - criterion.wer_kenlm_model
+           - criterion.wer_lexicon
+   run:
+     dir: ???
+   sweep:
+     dir: ???
+     subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
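The noise_prob / noise_snr pair above says each training utterance is, with probability 0.25, mixed with a noise clip drawn from noise_wav at 0 dB SNR. A hedged sketch of additive mixing at a target SNR (not the repo's implementation):

    import numpy as np

    def mix_at_snr(speech: np.ndarray, noise: np.ndarray, snr_db: float) -> np.ndarray:
        noise = noise[:len(speech)]                   # assume the noise clip is long enough
        p_speech = float(np.mean(speech ** 2))
        p_noise = float(np.mean(noise ** 2)) + 1e-12
        scale = np.sqrt(p_speech / (p_noise * 10 ** (snr_db / 10)))
        return speech + scale * noise                 # speech-to-noise power ratio == snr_db

    rng = np.random.default_rng(0)
    noisy = mix_at_snr(rng.standard_normal(16000), rng.standard_normal(16000), snr_db=0)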
av_hubert/avhubert/conf/av-finetune/large_noise_pt_noise_ft_30h.yaml ADDED
@@ -0,0 +1,124 @@
+ # @package _group_
+
+ common:
+   fp16: true
+   log_format: json
+   log_interval: 200
+   tensorboard_logdir: tblog
+   seed: 1337
+   user_dir: ???
+
+ checkpoint:
+   save_interval: 2
+   keep_interval_updates: 1
+   no_epoch_checkpoints: true
+   best_checkpoint_metric: accuracy
+   maximize_best_checkpoint_metric: true
+
+ distributed_training:
+   ddp_backend: c10d
+   find_unused_parameters: true
+   distributed_world_size: 8
+   distributed_port: 29671
+   nprocs_per_node: 8
+
+ task:
+   _name: av_hubert_pretraining
+   is_s2s: true
+   data: ???
+   label_dir: ???
+   tokenizer_bpe_model: ???
+   normalize: true # must be consistent with pre-training
+   labels: ["wrd"]
+   single_target: true
+   fine_tuning: true
+   stack_order_audio: 4
+   tokenizer_bpe_name: sentencepiece
+   max_sample_size: 500
+   modalities: ["video","audio"]
+   image_aug: true
+   pad_audio: true
+   random_crop: false
+   noise_prob: 0.25
+   noise_snr: 0
+   noise_wav: ???
+
+ dataset:
+   num_workers: 6
+   max_tokens: 1000
+   validate_after_updates: 0
+   validate_interval: 2
+   train_subset: train
+   valid_subset: valid
+
+ criterion:
+   _name: label_smoothed_cross_entropy
+   report_accuracy: true
+   label_smoothing: 0.1
+
+ optimization:
+   max_update: 18000
+   lr: [0.001]
+   sentence_avg: true
+   update_freq: [1]
+
+ optimizer:
+   _name: adam
+   adam_betas: (0.9,0.98)
+   adam_eps: 1e-08
+
+ lr_scheduler:
+   _name: tri_stage
+   warmup_steps: 6000
+   hold_steps: 0
+   decay_steps: 18000
+   final_lr_scale: 0.05
+
+ model:
+   _name: av_hubert_seq2seq
+   w2v_path: ???
+   apply_mask: false
+   mask_selection: static
+   mask_length: 10
+   mask_other: 0
+   mask_prob: 0.75
+   mask_channel_selection: static
+   mask_channel_length: 64
+   mask_channel_other: 0
+   mask_channel_prob: 0.5
+   layerdrop: 0.1
+   dropout: 0.0
+   activation_dropout: 0.1
+   attention_dropout: 0.0
+   feature_grad_mult: 1.0
+   decoder_layers: 9
+   decoder_dropout: 0.1
+   decoder_attention_dropout: 0.0
+   decoder_activation_dropout: 0.1
+   freeze_finetune_updates: 30000
+   share_decoder_input_output_embed: true
+   decoder_normalize_before: true
+   decoder_embed_dim: 1024
+   decoder_ffn_embed_dim: 4096
+   decoder_attention_heads: 8
+
+ hydra:
+   job:
+     config:
+       override_dirname:
+         kv_sep: '-'
+         item_sep: '__'
+         exclude_keys:
+           - run
+           - task.data
+           - task.label_dir
+           - model.w2v_path
+           - dataset.train_subset
+           - dataset.valid_subset
+           - criterion.wer_kenlm_model
+           - criterion.wer_lexicon
+   run:
+     dir: ???
+   sweep:
+     dir: ???
+     subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
av_hubert/avhubert/conf/av-finetune/large_noise_pt_noise_ft_433h.yaml ADDED
@@ -0,0 +1,124 @@
+ # @package _group_
+
+ common:
+   fp16: true
+   log_format: json
+   log_interval: 200
+   tensorboard_logdir: tblog
+   seed: 1337
+   user_dir: ???
+
+ checkpoint:
+   save_interval: 2
+   keep_interval_updates: 1
+   no_epoch_checkpoints: true
+   best_checkpoint_metric: accuracy
+   maximize_best_checkpoint_metric: true
+
+ distributed_training:
+   ddp_backend: c10d
+   find_unused_parameters: true
+   distributed_world_size: 8
+   distributed_port: 29671
+   nprocs_per_node: 8
+
+ task:
+   _name: av_hubert_pretraining
+   is_s2s: true
+   data: ???
+   label_dir: ???
+   tokenizer_bpe_model: ???
+   normalize: true # must be consistent with pre-training
+   labels: ["wrd"]
+   single_target: true
+   fine_tuning: true
+   stack_order_audio: 4
+   tokenizer_bpe_name: sentencepiece
+   max_sample_size: 500
+   modalities: ["video","audio"]
+   image_aug: true
+   pad_audio: true
+   random_crop: false
+   noise_prob: 0.25
+   noise_snr: 0
+   noise_wav: ???
+
+ dataset:
+   num_workers: 6
+   max_tokens: 1000
+   validate_after_updates: 0
+   validate_interval: 2
+   train_subset: train
+   valid_subset: valid
+
+ criterion:
+   _name: label_smoothed_cross_entropy
+   report_accuracy: true
+   label_smoothing: 0.1
+
+ optimization:
+   max_update: 60000
+   lr: [0.001]
+   sentence_avg: true
+   update_freq: [1]
+
+ optimizer:
+   _name: adam
+   adam_betas: (0.9,0.98)
+   adam_eps: 1e-08
+
+ lr_scheduler:
+   _name: tri_stage
+   warmup_steps: 20000
+   hold_steps: 0
+   decay_steps: 40000
+   final_lr_scale: 0.05
+
+ model:
+   _name: av_hubert_seq2seq
+   w2v_path: ???
+   apply_mask: false
+   mask_selection: static
+   mask_length: 10
+   mask_other: 0
+   mask_prob: 0.75
+   mask_channel_selection: static
+   mask_channel_length: 64
+   mask_channel_other: 0
+   mask_channel_prob: 0.5
+   layerdrop: 0.1
+   dropout: 0.0
+   activation_dropout: 0.1
+   attention_dropout: 0.0
+   feature_grad_mult: 1.0
+   decoder_layers: 9
+   decoder_dropout: 0.1
+   decoder_attention_dropout: 0.0
+   decoder_activation_dropout: 0.1
+   freeze_finetune_updates: 48000
+   share_decoder_input_output_embed: true
+   decoder_normalize_before: true
+   decoder_embed_dim: 1024
+   decoder_ffn_embed_dim: 4096
+   decoder_attention_heads: 8
+
+ hydra:
+   job:
+     config:
+       override_dirname:
+         kv_sep: '-'
+         item_sep: '__'
+         exclude_keys:
+           - run
+           - task.data
+           - task.label_dir
+           - model.w2v_path
+           - dataset.train_subset
+           - dataset.valid_subset
+           - criterion.wer_kenlm_model
+           - criterion.wer_lexicon
+   run:
+     dir: ???
+   sweep:
+     dir: ???
+     subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
av_hubert/avhubert/conf/finetune/base_lrs3_30h.yaml ADDED
@@ -0,0 +1,118 @@
+ # @package _group_
+
+ common:
+   fp16: true
+   log_format: json
+   log_interval: 200
+   tensorboard_logdir: tblog
+   seed: 1337
+   user_dir: ???
+
+ checkpoint:
+   save_interval: 2
+   keep_interval_updates: 1
+   no_epoch_checkpoints: true
+   best_checkpoint_metric: accuracy
+   maximize_best_checkpoint_metric: true
+
+ distributed_training:
+   ddp_backend: c10d
+   find_unused_parameters: true
+   distributed_world_size: 8
+   distributed_port: 29671
+   nprocs_per_node: 8
+
+ task:
+   _name: av_hubert_pretraining
+   is_s2s: true
+   data: ???
+   label_dir: ???
+   tokenizer_bpe_model: ???
+   normalize: true # must be consistent with pre-training
+   labels: ["wrd"]
+   single_target: true
+   fine_tuning: true
+   stack_order_audio: 4
+   tokenizer_bpe_name: sentencepiece
+   max_sample_size: 500
+   modalities: ["video"]
+   image_aug: true
+   pad_audio: true
+   random_crop: false
+
+ dataset:
+   num_workers: 6
+   max_tokens: 1000
+   validate_after_updates: 0
+   validate_interval: 2
+   train_subset: train
+   valid_subset: valid
+
+ criterion:
+   _name: label_smoothed_cross_entropy
+   report_accuracy: true
+   label_smoothing: 0.1
+
+ optimization:
+   max_update: 30000
+   lr: [0.001]
+   sentence_avg: true
+   update_freq: [1]
+
+ optimizer:
+   _name: adam
+   adam_betas: (0.9,0.98)
+   adam_eps: 1e-08
+
+ lr_scheduler:
+   _name: tri_stage
+   warmup_steps: 10000
+   hold_steps: 0
+   decay_steps: 20000
+   final_lr_scale: 0.05
+
+ model:
+   _name: av_hubert_seq2seq
+   w2v_path: ???
+   apply_mask: false
+   mask_selection: static
+   mask_length: 10
+   mask_other: 0
+   mask_prob: 0.75
+   mask_channel_selection: static
+   mask_channel_length: 64
+   mask_channel_other: 0
+   mask_channel_prob: 0.5
+   layerdrop: 0.1
+   dropout: 0.0
+   activation_dropout: 0.1
+   attention_dropout: 0.0
+   feature_grad_mult: 1.0
+   decoder_layers: 6
+   decoder_dropout: 0.1
+   decoder_attention_dropout: 0.0
+   decoder_activation_dropout: 0.1
+   freeze_finetune_updates: 30000
+   share_decoder_input_output_embed: true
+   decoder_normalize_before: true
+
+ hydra:
+   job:
+     config:
+       override_dirname:
+         kv_sep: '-'
+         item_sep: '__'
+         exclude_keys:
+           - run
+           - task.data
+           - task.label_dir
+           - model.w2v_path
+           - dataset.train_subset
+           - dataset.valid_subset
+           - criterion.wer_kenlm_model
+           - criterion.wer_lexicon
+   run:
+     dir: ???
+   sweep:
+     dir: ???
+     subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
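The tri_stage scheduler in these fine-tuning configs warms the learning rate up linearly for warmup_steps updates, holds for hold_steps, then decays exponentially over decay_steps to final_lr_scale times the peak. A hedged sketch of that shape, mirroring fairseq's tri_stage semantics as I understand them (init_scale 0.01 is fairseq's default and an assumption here):

    import math

    def tri_stage_lr(step, peak=1e-3, warmup=10000, hold=0, decay=20000,
                     init_scale=0.01, final_scale=0.05):
        if step < warmup:                                 # linear warmup
            init_lr = init_scale * peak
            return init_lr + (peak - init_lr) * step / warmup
        step -= warmup
        if step < hold:                                   # constant at the peak
            return peak
        step -= hold
        if step < decay:                                  # exponential decay
            return peak * math.exp(math.log(final_scale) * step / decay)
        return peak * final_scale                         # floor afterwards

    for s in (0, 5000, 10000, 20000, 30000):
        print(s, round(tri_stage_lr(s), 6))   # 1e-05, 0.000505, 0.001, ~0.000224, 5e-05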
av_hubert/avhubert/conf/finetune/base_lrs3_433h.yaml ADDED
@@ -0,0 +1,118 @@
+ # @package _group_
+
+ common:
+   fp16: true
+   log_format: json
+   log_interval: 200
+   tensorboard_logdir: tblog
+   seed: 1337
+   user_dir: ???
+
+ checkpoint:
+   save_interval: 2
+   keep_interval_updates: 1
+   no_epoch_checkpoints: true
+   best_checkpoint_metric: accuracy
+   maximize_best_checkpoint_metric: true
+
+ distributed_training:
+   ddp_backend: c10d
+   find_unused_parameters: true
+   distributed_world_size: 8
+   distributed_port: 29671
+   nprocs_per_node: 8
+
+ task:
+   _name: av_hubert_pretraining
+   is_s2s: true
+   data: ???
+   label_dir: ???
+   tokenizer_bpe_model: ???
+   normalize: true # must be consistent with pre-training
+   labels: ["wrd"]
+   single_target: true
+   fine_tuning: true
+   stack_order_audio: 4
+   tokenizer_bpe_name: sentencepiece
+   max_sample_size: 500
+   modalities: ["video"]
+   image_aug: true
+   pad_audio: true
+   random_crop: false
+
+ dataset:
+   num_workers: 6
+   max_tokens: 1000
+   validate_after_updates: 0
+   validate_interval: 2
+   train_subset: train
+   valid_subset: valid
+
+ criterion:
+   _name: label_smoothed_cross_entropy
+   report_accuracy: true
+   label_smoothing: 0.1
+
+ optimization:
+   max_update: 120000
+   lr: [0.001]
+   sentence_avg: true
+   update_freq: [1]
+
+ optimizer:
+   _name: adam
+   adam_betas: (0.9,0.98)
+   adam_eps: 1e-08
+
+ lr_scheduler:
+   _name: tri_stage
+   warmup_steps: 40000
+   hold_steps: 0
+   decay_steps: 80000
+   final_lr_scale: 0.05
+
+ model:
+   _name: av_hubert_seq2seq
+   w2v_path: ???
+   apply_mask: false
+   mask_selection: static
+   mask_length: 10
+   mask_other: 0
+   mask_prob: 0.75
+   mask_channel_selection: static
+   mask_channel_length: 64
+   mask_channel_other: 0
+   mask_channel_prob: 0.5
+   layerdrop: 0.1
+   dropout: 0.0
+   activation_dropout: 0.1
+   attention_dropout: 0.0
+   feature_grad_mult: 1.0
+   decoder_layers: 6
+   decoder_dropout: 0.1
+   decoder_attention_dropout: 0.0
+   decoder_activation_dropout: 0.1
+   freeze_finetune_updates: 60000
+   share_decoder_input_output_embed: true
+   decoder_normalize_before: true
+
+ hydra:
+   job:
+     config:
+       override_dirname:
+         kv_sep: '-'
+         item_sep: '__'
+         exclude_keys:
+           - run
+           - task.data
+           - task.label_dir
+           - model.w2v_path
+           - dataset.train_subset
+           - dataset.valid_subset
+           - criterion.wer_kenlm_model
+           - criterion.wer_lexicon
+   run:
+     dir: ???
+   sweep:
+     dir: ???
+     subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
av_hubert/avhubert/conf/finetune/base_vox_30h.yaml ADDED
@@ -0,0 +1,118 @@
+ # @package _group_
+
+ common:
+   fp16: true
+   log_format: json
+   log_interval: 200
+   tensorboard_logdir: tblog
+   seed: 1337
+   user_dir: ???
+
+ checkpoint:
+   save_interval: 2
+   keep_interval_updates: 1
+   no_epoch_checkpoints: true
+   best_checkpoint_metric: accuracy
+   maximize_best_checkpoint_metric: true
+
+ distributed_training:
+   ddp_backend: c10d
+   find_unused_parameters: true
+   distributed_world_size: 8
+   distributed_port: 29671
+   nprocs_per_node: 8
+
+ task:
+   _name: av_hubert_pretraining
+   is_s2s: true
+   data: ???
+   label_dir: ???
+   tokenizer_bpe_model: ???
+   normalize: true # must be consistent with pre-training
+   labels: ["wrd"]
+   single_target: true
+   fine_tuning: true
+   stack_order_audio: 4
+   tokenizer_bpe_name: sentencepiece
+   max_sample_size: 500
+   modalities: ["video"]
+   image_aug: true
+   pad_audio: true
+   random_crop: false
+
+ dataset:
+   num_workers: 6
+   max_tokens: 1000
+   validate_after_updates: 0
+   validate_interval: 2
+   train_subset: train
+   valid_subset: valid
+
+ criterion:
+   _name: label_smoothed_cross_entropy
+   report_accuracy: true
+   label_smoothing: 0.1
+
+ optimization:
+   max_update: 30000
+   lr: [0.001]
+   sentence_avg: true
+   update_freq: [1]
+
+ optimizer:
+   _name: adam
+   adam_betas: (0.9,0.98)
+   adam_eps: 1e-08
+
+ lr_scheduler:
+   _name: tri_stage
+   warmup_steps: 10000
+   hold_steps: 0
+   decay_steps: 20000
+   final_lr_scale: 0.05
+
+ model:
+   _name: av_hubert_seq2seq
+   w2v_path: ???
+   apply_mask: false
+   mask_selection: static
+   mask_length: 10
+   mask_other: 0
+   mask_prob: 0.75
+   mask_channel_selection: static
+   mask_channel_length: 64
+   mask_channel_other: 0
+   mask_channel_prob: 0.5
+   layerdrop: 0.1
+   dropout: 0.0
+   activation_dropout: 0.1
+   attention_dropout: 0.0
+   feature_grad_mult: 1.0
+   decoder_layers: 6
+   decoder_dropout: 0.1
+   decoder_attention_dropout: 0.0
+   decoder_activation_dropout: 0.1
+   freeze_finetune_updates: 24000
+   share_decoder_input_output_embed: true
+   decoder_normalize_before: true
+
+ hydra:
+   job:
+     config:
+       override_dirname:
+         kv_sep: '-'
+         item_sep: '__'
+         exclude_keys:
+           - run
+           - task.data
+           - task.label_dir
+           - model.w2v_path
+           - dataset.train_subset
+           - dataset.valid_subset
+           - criterion.wer_kenlm_model
+           - criterion.wer_lexicon
+   run:
+     dir: ???
+   sweep:
+     dir: ???
+     subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
av_hubert/avhubert/conf/finetune/base_vox_433h.yaml ADDED
@@ -0,0 +1,118 @@
+ # @package _group_
+
+ common:
+   fp16: true
+   log_format: json
+   log_interval: 200
+   tensorboard_logdir: tblog
+   seed: 1337
+   user_dir: ???
+
+ checkpoint:
+   save_interval: 2
+   keep_interval_updates: 1
+   no_epoch_checkpoints: true
+   best_checkpoint_metric: accuracy
+   maximize_best_checkpoint_metric: true
+
+ distributed_training:
+   ddp_backend: c10d
+   find_unused_parameters: true
+   distributed_world_size: 8
+   distributed_port: 29671
+   nprocs_per_node: 8
+
+ task:
+   _name: av_hubert_pretraining
+   is_s2s: true
+   data: ???
+   label_dir: ???
+   tokenizer_bpe_model: ???
+   normalize: true # must be consistent with pre-training
+   labels: ["wrd"]
+   single_target: true
+   fine_tuning: true
+   stack_order_audio: 4
+   tokenizer_bpe_name: sentencepiece
+   max_sample_size: 500
+   modalities: ["video"]
+   image_aug: true
+   pad_audio: true
+   random_crop: false
+
+ dataset:
+   num_workers: 6
+   max_tokens: 1000
+   validate_after_updates: 0
+   validate_interval: 2
+   train_subset: train
+   valid_subset: valid
+
+ criterion:
+   _name: label_smoothed_cross_entropy
+   report_accuracy: true
+   label_smoothing: 0.1
+
+ optimization:
+   max_update: 45000
+   lr: [0.001]
+   sentence_avg: true
+   update_freq: [1]
+
+ optimizer:
+   _name: adam
+   adam_betas: (0.9,0.98)
+   adam_eps: 1e-08
+
+ lr_scheduler:
+   _name: tri_stage
+   warmup_steps: 15000
+   hold_steps: 0
+   decay_steps: 30000
+   final_lr_scale: 0.05
+
+ model:
+   _name: av_hubert_seq2seq
+   w2v_path: ???
+   apply_mask: false
+   mask_selection: static
+   mask_length: 10
+   mask_other: 0
+   mask_prob: 0.75
+   mask_channel_selection: static
+   mask_channel_length: 64
+   mask_channel_other: 0
+   mask_channel_prob: 0.5
+   layerdrop: 0.1
+   dropout: 0.0
+   activation_dropout: 0.1
+   attention_dropout: 0.0
+   feature_grad_mult: 1.0
+   decoder_layers: 6
+   decoder_dropout: 0.1
+   decoder_attention_dropout: 0.0
+   decoder_activation_dropout: 0.1
+   freeze_finetune_updates: 22500
+   share_decoder_input_output_embed: true
+   decoder_normalize_before: true
+
+ hydra:
+   job:
+     config:
+       override_dirname:
+         kv_sep: '-'
+         item_sep: '__'
+         exclude_keys:
+           - run
+           - task.data
+           - task.label_dir
+           - model.w2v_path
+           - dataset.train_subset
+           - dataset.valid_subset
+           - criterion.wer_kenlm_model
+           - criterion.wer_lexicon
+   run:
+     dir: ???
+   sweep:
+     dir: ???
+     subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
av_hubert/avhubert/conf/finetune/large_lrs3_30h.yaml ADDED
@@ -0,0 +1,121 @@
+ # @package _group_
+
+ common:
+   fp16: true
+   log_format: json
+   log_interval: 200
+   tensorboard_logdir: tblog
+   seed: 1337
+   user_dir: ???
+
+ checkpoint:
+   save_interval: 2
+   keep_interval_updates: 1
+   no_epoch_checkpoints: true
+   best_checkpoint_metric: accuracy
+   maximize_best_checkpoint_metric: true
+
+ distributed_training:
+   ddp_backend: c10d
+   find_unused_parameters: true
+   distributed_world_size: 8
+   distributed_port: 29671
+   nprocs_per_node: 8
+
+ task:
+   _name: av_hubert_pretraining
+   is_s2s: true
+   data: ???
+   label_dir: ???
+   tokenizer_bpe_model: ???
+   normalize: true # must be consistent with pre-training
+   labels: ["wrd"]
+   single_target: true
+   fine_tuning: true
+   stack_order_audio: 4
+   tokenizer_bpe_name: sentencepiece
+   max_sample_size: 500
+   modalities: ["video"]
+   image_aug: true
+   pad_audio: true
+   random_crop: false
+
+ dataset:
+   num_workers: 6
+   max_tokens: 1000
+   validate_after_updates: 0
+   validate_interval: 2
+   train_subset: train
+   valid_subset: valid
+
+ criterion:
+   _name: label_smoothed_cross_entropy
+   report_accuracy: true
+   label_smoothing: 0.1
+
+ optimization:
+   max_update: 18000
+   lr: [0.001]
+   sentence_avg: true
+   update_freq: [1]
+
+ optimizer:
+   _name: adam
+   adam_betas: (0.9,0.98)
+   adam_eps: 1e-08
+
+ lr_scheduler:
+   _name: tri_stage
+   warmup_steps: 6000
+   hold_steps: 0
+   decay_steps: 12000
+   final_lr_scale: 0.05
+
+ model:
+   _name: av_hubert_seq2seq
+   w2v_path: ???
+   apply_mask: false
+   mask_selection: static
+   mask_length: 10
+   mask_other: 0
+   mask_prob: 0.75
+   mask_channel_selection: static
+   mask_channel_length: 64
+   mask_channel_other: 0
+   mask_channel_prob: 0.5
+   layerdrop: 0.1
+   dropout: 0.0
+   activation_dropout: 0.1
+   attention_dropout: 0.0
+   feature_grad_mult: 1.0
+   decoder_layers: 9
+   decoder_dropout: 0.1
+   decoder_attention_dropout: 0.0
+   decoder_activation_dropout: 0.1
+   freeze_finetune_updates: 14400
+   share_decoder_input_output_embed: true
+   decoder_normalize_before: true
+   decoder_embed_dim: 1024
+   decoder_ffn_embed_dim: 4096
+   decoder_attention_heads: 8
+
+ hydra:
+   job:
+     config:
+       override_dirname:
+         kv_sep: '-'
+         item_sep: '__'
+         exclude_keys:
+           - run
+           - task.data
+           - task.label_dir
+           - model.w2v_path
+           - dataset.train_subset
+           - dataset.valid_subset
+           - criterion.wer_kenlm_model
+           - criterion.wer_lexicon
+   run:
+     dir: ???
+   sweep:
+     dir: ???
+     subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
av_hubert/avhubert/conf/finetune/large_lrs3_433h.yaml ADDED
@@ -0,0 +1,121 @@
+ # @package _group_
+
+ common:
+   fp16: true
+   log_format: json
+   log_interval: 200
+   tensorboard_logdir: tblog
+   seed: 1337
+   user_dir: ???
+
+ checkpoint:
+   save_interval: 2
+   keep_interval_updates: 1
+   no_epoch_checkpoints: true
+   best_checkpoint_metric: accuracy
+   maximize_best_checkpoint_metric: true
+
+ distributed_training:
+   ddp_backend: c10d
+   find_unused_parameters: true
+   distributed_world_size: 8
+   distributed_port: 29671
+   nprocs_per_node: 8
+
+ task:
+   _name: av_hubert_pretraining
+   is_s2s: true
+   data: ???
+   label_dir: ???
+   tokenizer_bpe_model: ???
+   normalize: true # must be consistent with pre-training
+   labels: ["wrd"]
+   single_target: true
+   fine_tuning: true
+   stack_order_audio: 4
+   tokenizer_bpe_name: sentencepiece
+   max_sample_size: 500
+   modalities: ["video"]
+   image_aug: true
+   pad_audio: true
+   random_crop: false
+
+ dataset:
+   num_workers: 6
+   max_tokens: 1000
+   validate_after_updates: 0
+   validate_interval: 2
+   train_subset: train
+   valid_subset: valid
+
+ criterion:
+   _name: label_smoothed_cross_entropy
+   report_accuracy: true
+   label_smoothing: 0.1
+
+ optimization:
+   max_update: 30000
+   lr: [0.001]
+   sentence_avg: true
+   update_freq: [1]
+
+ optimizer:
+   _name: adam
+   adam_betas: (0.9,0.98)
+   adam_eps: 1e-08
+
+ lr_scheduler:
+   _name: tri_stage
+   warmup_steps: 10000
+   hold_steps: 0
+   decay_steps: 20000
+   final_lr_scale: 0.05
+
+ model:
+   _name: av_hubert_seq2seq
+   w2v_path: ???
+   apply_mask: false
+   mask_selection: static
+   mask_length: 10
+   mask_other: 0
+   mask_prob: 0.75
+   mask_channel_selection: static
+   mask_channel_length: 64
+   mask_channel_other: 0
+   mask_channel_prob: 0.5
+   layerdrop: 0.1
+   dropout: 0.0
+   activation_dropout: 0.1
+   attention_dropout: 0.0
+   feature_grad_mult: 1.0
+   decoder_layers: 9
+   decoder_dropout: 0.1
+   decoder_attention_dropout: 0.0
+   decoder_activation_dropout: 0.1
+   freeze_finetune_updates: 18000
+   share_decoder_input_output_embed: true
+   decoder_normalize_before: true
+   decoder_embed_dim: 1024
+   decoder_ffn_embed_dim: 4096
+   decoder_attention_heads: 8
+
+ hydra:
+   job:
+     config:
+       override_dirname:
+         kv_sep: '-'
+         item_sep: '__'
+         exclude_keys:
+           - run
+           - task.data
+           - task.label_dir
+           - model.w2v_path
+           - dataset.train_subset
+           - dataset.valid_subset
+           - criterion.wer_kenlm_model
+           - criterion.wer_lexicon
+   run:
+     dir: ???
+   sweep:
+     dir: ???
+     subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
av_hubert/avhubert/conf/finetune/large_vox_30h.yaml ADDED
@@ -0,0 +1,121 @@
+ # @package _group_
+
+ common:
+   fp16: true
+   log_format: json
+   log_interval: 200
+   tensorboard_logdir: tblog
+   seed: 1337
+   user_dir: ???
+
+ checkpoint:
+   save_interval: 2
+   keep_interval_updates: 1
+   no_epoch_checkpoints: true
+   best_checkpoint_metric: accuracy
+   maximize_best_checkpoint_metric: true
+
+ distributed_training:
+   ddp_backend: c10d
+   find_unused_parameters: true
+   distributed_world_size: 8
+   distributed_port: 29671
+   nprocs_per_node: 8
+
+ task:
+   _name: av_hubert_pretraining
+   is_s2s: true
+   data: ???
+   label_dir: ???
+   tokenizer_bpe_model: ???
+   normalize: true # must be consistent with pre-training
+   labels: ["wrd"]
+   single_target: true
+   fine_tuning: true
+   stack_order_audio: 4
+   tokenizer_bpe_name: sentencepiece
+   max_sample_size: 500
+   modalities: ["video"]
+   image_aug: true
+   pad_audio: true
+   random_crop: false
+
+ dataset:
+   num_workers: 6
+   max_tokens: 1000
+   validate_after_updates: 0
+   validate_interval: 2
+   train_subset: train
+   valid_subset: valid
+
+ criterion:
+   _name: label_smoothed_cross_entropy
+   report_accuracy: true
+   label_smoothing: 0.1
+
+ optimization:
+   max_update: 30000
+   lr: [0.001]
+   sentence_avg: true
+   update_freq: [1]
+
+ optimizer:
+   _name: adam
+   adam_betas: (0.9,0.98)
+   adam_eps: 1e-08
+
+ lr_scheduler:
+   _name: tri_stage
+   warmup_steps: 10000
+   hold_steps: 0
+   decay_steps: 20000
+   final_lr_scale: 0.05
+
+ model:
+   _name: av_hubert_seq2seq
+   w2v_path: ???
+   apply_mask: false
+   mask_selection: static
+   mask_length: 10
+   mask_other: 0
+   mask_prob: 0.75
+   mask_channel_selection: static
+   mask_channel_length: 64
+   mask_channel_other: 0
+   mask_channel_prob: 0.5
+   layerdrop: 0.1
+   dropout: 0.0
+   activation_dropout: 0.1
+   attention_dropout: 0.0
+   feature_grad_mult: 1.0
+   decoder_layers: 9
+   decoder_dropout: 0.1
+   decoder_attention_dropout: 0.0
+   decoder_activation_dropout: 0.1
+   freeze_finetune_updates: 30000
+   share_decoder_input_output_embed: true
+   decoder_normalize_before: true
+   decoder_embed_dim: 1024
+   decoder_ffn_embed_dim: 4096
+   decoder_attention_heads: 8
+
+ hydra:
+   job:
+     config:
+       override_dirname:
+         kv_sep: '-'
+         item_sep: '__'
+         exclude_keys:
+           - run
+           - task.data
+           - task.label_dir
+           - model.w2v_path
+           - dataset.train_subset
+           - dataset.valid_subset
+           - criterion.wer_kenlm_model
+           - criterion.wer_lexicon
+   run:
+     dir: ???
+   sweep:
+     dir: ???
+     subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
av_hubert/avhubert/conf/finetune/large_vox_433h.yaml ADDED
@@ -0,0 +1,121 @@
+ # @package _group_
+
+ common:
+   fp16: true
+   log_format: json
+   log_interval: 200
+   tensorboard_logdir: tblog
+   seed: 1337
+   user_dir: ???
+
+ checkpoint:
+   save_interval: 2
+   keep_interval_updates: 1
+   no_epoch_checkpoints: true
+   best_checkpoint_metric: accuracy
+   maximize_best_checkpoint_metric: true
+
+ distributed_training:
+   ddp_backend: c10d
+   find_unused_parameters: true
+   distributed_world_size: 8
+   distributed_port: 29671
+   nprocs_per_node: 8
+
+ task:
+   _name: av_hubert_pretraining
+   is_s2s: true
+   data: ???
+   label_dir: ???
+   tokenizer_bpe_model: ???
+   normalize: true # must be consistent with pre-training
+   labels: ["wrd"]
+   single_target: true
+   fine_tuning: true
+   stack_order_audio: 4
+   tokenizer_bpe_name: sentencepiece
+   max_sample_size: 500
+   modalities: ["video"]
+   image_aug: true
+   pad_audio: true
+   random_crop: false
+
+ dataset:
+   num_workers: 6
+   max_tokens: 1000
+   validate_after_updates: 0
+   validate_interval: 2
+   train_subset: train
+   valid_subset: valid
+
+ criterion:
+   _name: label_smoothed_cross_entropy
+   report_accuracy: true
+   label_smoothing: 0.1
+
+ optimization:
+   max_update: 30000
+   lr: [0.001]
+   sentence_avg: true
+   update_freq: [1]
+
+ optimizer:
+   _name: adam
+   adam_betas: (0.9,0.98)
+   adam_eps: 1e-08
+
+ lr_scheduler:
+   _name: tri_stage
+   warmup_steps: 10000
+   hold_steps: 0
+   decay_steps: 20000
+   final_lr_scale: 0.05
+
+ model:
+   _name: av_hubert_seq2seq
+   w2v_path: ???
+   apply_mask: false
+   mask_selection: static
+   mask_length: 10
+   mask_other: 0
+   mask_prob: 0.75
+   mask_channel_selection: static
+   mask_channel_length: 64
+   mask_channel_other: 0
+   mask_channel_prob: 0.5
+   layerdrop: 0.1
+   dropout: 0.0
+   activation_dropout: 0.1
+   attention_dropout: 0.0
+   feature_grad_mult: 1.0
+   decoder_layers: 9
+   decoder_dropout: 0.1
+   decoder_attention_dropout: 0.0
+   decoder_activation_dropout: 0.1
+   freeze_finetune_updates: 30000
+   share_decoder_input_output_embed: true
+   decoder_normalize_before: true
+   decoder_embed_dim: 1024
+   decoder_ffn_embed_dim: 4096
+   decoder_attention_heads: 8
+
+ hydra:
+   job:
+     config:
+       override_dirname:
+         kv_sep: '-'
+         item_sep: '__'
+         exclude_keys:
+           - run
+           - task.data
+           - task.label_dir
+           - model.w2v_path
+           - dataset.train_subset
+           - dataset.valid_subset
+           - criterion.wer_kenlm_model
+           - criterion.wer_lexicon
+   run:
+     dir: ???
+   sweep:
+     dir: ???
+     subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
av_hubert/avhubert/conf/finetune/self_large_vox_30h.yaml ADDED
@@ -0,0 +1,121 @@
+ # @package _group_
+
+ common:
+   fp16: true
+   log_format: json
+   log_interval: 200
+   tensorboard_logdir: tblog
+   seed: 1337
+   user_dir: ???
+
+ checkpoint:
+   save_interval: 2
+   keep_interval_updates: 1
+   no_epoch_checkpoints: true
+   best_checkpoint_metric: accuracy
+   maximize_best_checkpoint_metric: true
+
+ distributed_training:
+   ddp_backend: c10d
+   find_unused_parameters: true
+   distributed_world_size: 32
+   distributed_port: 29671
+   nprocs_per_node: 8
+
+ task:
+   _name: av_hubert_pretraining
+   is_s2s: true
+   data: ???
+   label_dir: ???
+   tokenizer_bpe_model: ???
+   normalize: true # must be consistent with pre-training
+   labels: ["wrd"]
+   single_target: true
+   fine_tuning: true
+   stack_order_audio: 4
+   tokenizer_bpe_name: sentencepiece
+   max_sample_size: 500
+   modalities: ["video"]
+   image_aug: true
+   pad_audio: true
+   random_crop: false
+
+ dataset:
+   num_workers: 6
+   max_tokens: 1000
+   validate_after_updates: 0
+   validate_interval: 2
+   train_subset: train
+   valid_subset: valid
+
+ criterion:
+   _name: label_smoothed_cross_entropy
+   report_accuracy: true
+   label_smoothing: 0.1
+
+ optimization:
+   max_update: 100000
+   lr: [0.001]
+   sentence_avg: true
+   update_freq: [1]
+
+ optimizer:
+   _name: adam
+   adam_betas: (0.9,0.98)
+   adam_eps: 1e-08
+
+ lr_scheduler:
+   _name: tri_stage
+   warmup_steps: 10000
+   hold_steps: 0
+   decay_steps: 90000
+   final_lr_scale: 0.05
+
+ model:
+   _name: av_hubert_seq2seq
+   w2v_path: ???
+   apply_mask: false
+   mask_selection: static
+   mask_length: 10
+   mask_other: 0
+   mask_prob: 0.75
+   mask_channel_selection: static
+   mask_channel_length: 64
+   mask_channel_other: 0
+   mask_channel_prob: 0.5
+   layerdrop: 0.1
+   dropout: 0.0
+   activation_dropout: 0.1
+   attention_dropout: 0.0
+   feature_grad_mult: 1.0
+   decoder_layers: 9
+   decoder_dropout: 0.1
+   decoder_attention_dropout: 0.0
+   decoder_activation_dropout: 0.1
+   freeze_finetune_updates: 80000
+   share_decoder_input_output_embed: true
+   decoder_normalize_before: true
+   decoder_embed_dim: 1024
+   decoder_ffn_embed_dim: 4096
+   decoder_attention_heads: 8
+
+ hydra:
+   job:
+     config:
+       override_dirname:
+         kv_sep: '-'
+         item_sep: '__'
+         exclude_keys:
+           - run
+           - task.data
+           - task.label_dir
+           - model.w2v_path
+           - dataset.train_subset
+           - dataset.valid_subset
+           - criterion.wer_kenlm_model
+           - criterion.wer_lexicon
+   run:
+     dir: ???
+   sweep:
+     dir: ???
+     subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
av_hubert/avhubert/conf/finetune/self_large_vox_433h.yaml ADDED
@@ -0,0 +1,121 @@
+ # @package _group_
+
+ common:
+   fp16: true
+   log_format: json
+   log_interval: 200
+   tensorboard_logdir: tblog
+   seed: 1337
+   user_dir: ???
+
+ checkpoint:
+   save_interval: 2
+   keep_interval_updates: 1
+   no_epoch_checkpoints: true
+   best_checkpoint_metric: accuracy
+   maximize_best_checkpoint_metric: true
+
+ distributed_training:
+   ddp_backend: c10d
+   find_unused_parameters: true
+   distributed_world_size: 32
+   distributed_port: 29671
+   nprocs_per_node: 8
+
+ task:
+   _name: av_hubert_pretraining
+   is_s2s: true
+   data: ???
+   label_dir: ???
+   tokenizer_bpe_model: ???
+   normalize: true # must be consistent with pre-training
+   labels: ["wrd"]
+   single_target: true
+   fine_tuning: true
+   stack_order_audio: 4
+   tokenizer_bpe_name: sentencepiece
+   max_sample_size: 500
+   modalities: ["video"]
+   image_aug: true
+   pad_audio: true
+   random_crop: false
+
+ dataset:
+   num_workers: 6
+   max_tokens: 1000
+   validate_after_updates: 0
+   validate_interval: 2
+   train_subset: train
+   valid_subset: valid
+
+ criterion:
+   _name: label_smoothed_cross_entropy
+   report_accuracy: true
+   label_smoothing: 0.1
+
+ optimization:
+   max_update: 100000
+   lr: [0.001]
+   sentence_avg: true
+   update_freq: [1]
+
+ optimizer:
+   _name: adam
+   adam_betas: (0.9,0.98)
+   adam_eps: 1e-08
+
+ lr_scheduler:
+   _name: tri_stage
+   warmup_steps: 10000
+   hold_steps: 0
+   decay_steps: 90000
+   final_lr_scale: 0.05
+
+ model:
+   _name: av_hubert_seq2seq
+   w2v_path: ???
+   apply_mask: false
+   mask_selection: static
+   mask_length: 10
+   mask_other: 0
+   mask_prob: 0.75
+   mask_channel_selection: static
+   mask_channel_length: 64
+   mask_channel_other: 0
+   mask_channel_prob: 0.5
+   layerdrop: 0.1
+   dropout: 0.0
+   activation_dropout: 0.1
+   attention_dropout: 0.0
+   feature_grad_mult: 1.0
+   decoder_layers: 9
+   decoder_dropout: 0.1
+   decoder_attention_dropout: 0.0
+   decoder_activation_dropout: 0.1
+   freeze_finetune_updates: 80000
+   share_decoder_input_output_embed: true
+   decoder_normalize_before: true
+   decoder_embed_dim: 1024
+   decoder_ffn_embed_dim: 4096
+   decoder_attention_heads: 8
+
+ hydra:
+   job:
+     config:
+       override_dirname:
+         kv_sep: '-'
+         item_sep: '__'
+         exclude_keys:
+           - run
+           - task.data
+           - task.label_dir
+           - model.w2v_path
+           - dataset.train_subset
+           - dataset.valid_subset
+           - criterion.wer_kenlm_model
+           - criterion.wer_lexicon
+   run:
+     dir: ???
+   sweep:
+     dir: ???
+     subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
av_hubert/avhubert/conf/pretrain/base_lrs3_iter1.yaml ADDED
@@ -0,0 +1,112 @@
+ # @package _group_
+
+ common:
+   fp16: true
+   log_format: json
+   log_interval: 200
+   seed: 1337
+   user_dir: ???
+   empty_cache_freq: 10000
+
+ checkpoint:
+   save_interval_updates: 25000
+   keep_interval_updates: 1
+   no_epoch_checkpoints: true
+
+
+ distributed_training:
+   ddp_backend: no_c10d
+   distributed_backend: 'nccl'
+   distributed_world_size: 32
+   distributed_port: 29671
+   nprocs_per_node: 8
+
+ task:
+   _name: av_hubert_pretraining
+   data: ???
+   label_dir: ???
+   labels: ["mfcc"]
+   label_rate: ${model.label_rate}
+   sample_rate: 25
+   max_sample_size: 500
+   min_sample_size: 5
+   pad_audio: true
+   random_crop: false
+   normalize: true
+   stack_order_audio: 4
+   # stack_order: 1
+   input_modality: image
+   image_aug: true
+
+ dataset:
+   num_workers: 6
+   max_tokens: 1000
+   skip_invalid_size_inputs_valid_test: true
+   validate_interval: 5
+   validate_interval_updates: 10000
+
+ criterion:
+   _name: av_hubert
+   pred_masked_weight: 1.0
+   pred_nomask_weight: 0.0
+   loss_weights: [10,]
+
+ optimization:
+   max_update: 400000
+   lr: [0.0005]
+   clip_norm: 10.0
+
+ optimizer:
+   _name: adam
+   adam_betas: (0.9,0.98)
+   adam_eps: 1e-06
+   weight_decay: 0.01
+
+ lr_scheduler:
+   _name: polynomial_decay
+   warmup_updates: 32000
+
+ model:
+   _name: av_hubert
+   label_rate: 100
+   skip_masked: false
+   skip_nomask: false
+   modality_dropout: 0
+   audio_dropout: 0.5
+   modality_fuse: concat
+   selection_type: same_seq
+   masking_type: feature
+   mask_prob_image: 0.8
+   mask_length_image: 10
+   mask_prob_audio: 0.8
+   mask_length_audio: 10
+   extractor_mode: default
+   # conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
+   final_dim: 256
+   encoder_layerdrop: 0.05
+   dropout_input: 0.1
+   dropout_features: 0.1
+   dropout: 0.1
+   attention_dropout: 0.1
+   feature_grad_mult: 0.1
+   untie_final_proj: true
+   activation_dropout: 0.0
+   wav_input: false
+   layer_norm_first: true
+   audio_feat_dim: 104
+
+ hydra:
+   job:
+     config:
+       override_dirname:
+         kv_sep: '-'
+         item_sep: '__'
+         exclude_keys:
+           - run
+           - task.data
+           - task.label_dir
+   run:
+     dir: ???
+   sweep:
+     dir: ???
+     subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
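sample_rate: 25 and stack_order_audio: 4 encode the audio-video rate alignment in these pre-training configs: acoustic features extracted at 100 Hz are stacked four frames at a time to match the 25 fps video stream, which is presumably why audio_feat_dim is 104 (4 x 26-dimensional filterbank frames). A minimal sketch of that reshaping, not the repo's loader:

    import numpy as np

    def stack_frames(feats: np.ndarray, stack_order: int = 4) -> np.ndarray:
        # [T, F] -> [T // stack_order, F * stack_order], dropping remainder frames
        t = (feats.shape[0] // stack_order) * stack_order
        return feats[:t].reshape(-1, stack_order * feats.shape[1])

    audio = np.zeros((100, 26), dtype=np.float32)   # 1 s of 26-dim features at 100 Hz
    print(stack_frames(audio).shape)                # (25, 104): now at the 25 fps video rate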
av_hubert/avhubert/conf/pretrain/base_lrs3_iter2.yaml ADDED
@@ -0,0 +1,112 @@
+ # @package _group_
+
+ common:
+   fp16: true
+   log_format: json
+   log_interval: 200
+   seed: 1337
+   user_dir: ???
+   empty_cache_freq: 10000
+
+ checkpoint:
+   save_interval_updates: 25000
+   keep_interval_updates: 1
+   no_epoch_checkpoints: true
+
+
+ distributed_training:
+   ddp_backend: no_c10d
+   distributed_backend: 'nccl'
+   distributed_world_size: 32
+   distributed_port: 29671
+   nprocs_per_node: 8
+
+ task:
+   _name: av_hubert_pretraining
+   data: ???
+   label_dir: ???
+   labels: ["mfcc"]
+   label_rate: ${model.label_rate}
+   sample_rate: 25
+   max_sample_size: 500
+   min_sample_size: 5
+   pad_audio: true
+   random_crop: false
+   normalize: true
+   stack_order_audio: 4
+   # stack_order: 1
+   input_modality: image
+   image_aug: true
+
+ dataset:
+   num_workers: 6
+   max_tokens: 1000
+   skip_invalid_size_inputs_valid_test: true
+   validate_interval: 5
+   validate_interval_updates: 10000
+
+ criterion:
+   _name: av_hubert
+   pred_masked_weight: 1.0
+   pred_nomask_weight: 0.0
+   loss_weights: [10,]
+
+ optimization:
+   max_update: 400000
+   lr: [0.0005]
+   clip_norm: 10.0
+
+ optimizer:
+   _name: adam
+   adam_betas: (0.9,0.98)
+   adam_eps: 1e-06
+   weight_decay: 0.01
+
+ lr_scheduler:
+   _name: polynomial_decay
+   warmup_updates: 32000
+
+ model:
+   _name: av_hubert
+   label_rate: 25
+   skip_masked: false
+   skip_nomask: false
+   modality_dropout: 0
+   audio_dropout: 0.5
+   modality_fuse: concat
+   selection_type: same_seq
+   masking_type: feature
+   mask_prob_image: 0.8
+   mask_length_image: 10
+   mask_prob_audio: 0.8
+   mask_length_audio: 10
+   extractor_mode: default
+   # conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
+   final_dim: 256
+   encoder_layerdrop: 0.05
+   dropout_input: 0.1
+   dropout_features: 0.1
+   dropout: 0.1
+   attention_dropout: 0.1
+   feature_grad_mult: 0.1
+   untie_final_proj: true
+   activation_dropout: 0.0
+   wav_input: false
+   layer_norm_first: true
+   audio_feat_dim: 104
+
+ hydra:
+   job:
+     config:
+       override_dirname:
+         kv_sep: '-'
+         item_sep: '__'
+         exclude_keys:
+           - run
+           - task.data
+           - task.label_dir
+   run:
+     dir: ???
+   sweep:
+     dir: ???
+     subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
av_hubert/avhubert/conf/pretrain/base_lrs3_iter3.yaml ADDED
@@ -0,0 +1,112 @@
+ # @package _group_
+
+ common:
+   fp16: true
+   log_format: json
+   log_interval: 200
+   seed: 1337
+   user_dir: ???
+   empty_cache_freq: 10000
+
+ checkpoint:
+   save_interval_updates: 25000
+   keep_interval_updates: 1
+   no_epoch_checkpoints: true
+
+
+ distributed_training:
+   ddp_backend: no_c10d
+   distributed_backend: 'nccl'
+   distributed_world_size: 32
+   distributed_port: 29671
+   nprocs_per_node: 8
+
+ task:
+   _name: av_hubert_pretraining
+   data: ???
+   label_dir: ???
+   labels: ["mfcc"]
+   label_rate: ${model.label_rate}
+   sample_rate: 25
+   max_sample_size: 500
+   min_sample_size: 5
+   pad_audio: true
+   random_crop: false
+   normalize: true
+   stack_order_audio: 4
+   # stack_order: 1
+   input_modality: image
+   image_aug: true
+
+ dataset:
+   num_workers: 6
+   max_tokens: 1000
+   skip_invalid_size_inputs_valid_test: true
+   validate_interval: 5
+   validate_interval_updates: 10000
+
+ criterion:
+   _name: av_hubert
+   pred_masked_weight: 1.0
+   pred_nomask_weight: 0.0
+   loss_weights: [10,]
+
+ optimization:
+   max_update: 400000
+   lr: [0.0005]
+   clip_norm: 10.0
+
+ optimizer:
+   _name: adam
+   adam_betas: (0.9,0.98)
+   adam_eps: 1e-06
+   weight_decay: 0.01
+
+ lr_scheduler:
+   _name: polynomial_decay
+   warmup_updates: 32000
+
+ model:
+   _name: av_hubert
+   label_rate: 25
+   skip_masked: false
+   skip_nomask: false
+   modality_dropout: 0
+   audio_dropout: 0.5
+   modality_fuse: concat
+   selection_type: same_seq
+   masking_type: feature
+   mask_prob_image: 0.8
+   mask_length_image: 10
+   mask_prob_audio: 0.8
+   mask_length_audio: 10
+   extractor_mode: default
+   # conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
+   final_dim: 256
+   encoder_layerdrop: 0.05
+   dropout_input: 0.1
+   dropout_features: 0.1
+   dropout: 0.1
+   attention_dropout: 0.1
+   feature_grad_mult: 0.1
+   untie_final_proj: true
+   activation_dropout: 0.0
+   wav_input: false
+   layer_norm_first: true
+   audio_feat_dim: 104
+
+ hydra:
+   job:
+     config:
+       override_dirname:
+         kv_sep: '-'
+         item_sep: '__'
+         exclude_keys:
+           - run
+           - task.data
+           - task.label_dir
+   run:
+     dir: ???
+   sweep:
+     dir: ???
+     subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
av_hubert/avhubert/conf/pretrain/base_lrs3_iter4.yaml ADDED
@@ -0,0 +1,112 @@
+ # @package _group_
+
+ common:
+   fp16: true
+   log_format: json
+   log_interval: 200
+   seed: 1337
+   user_dir: ???
+   empty_cache_freq: 10000
+
+ checkpoint:
+   save_interval_updates: 25000
+   keep_interval_updates: 1
+   no_epoch_checkpoints: true
+
+
+ distributed_training:
+   ddp_backend: no_c10d
+   distributed_backend: 'nccl'
+   distributed_world_size: 32
+   distributed_port: 29671
+   nprocs_per_node: 8
+
+ task:
+   _name: av_hubert_pretraining
+   data: ???
+   label_dir: ???
+   labels: ["mfcc"]
+   label_rate: ${model.label_rate}
+   sample_rate: 25
+   max_sample_size: 500
+   min_sample_size: 5
+   pad_audio: true
+   random_crop: false
+   normalize: true
+   stack_order_audio: 4
+   # stack_order: 1
+   input_modality: image
+   image_aug: true
+
+ dataset:
+   num_workers: 6
+   max_tokens: 1000
+   skip_invalid_size_inputs_valid_test: true
+   validate_interval: 5
+   validate_interval_updates: 10000
+
+ criterion:
+   _name: av_hubert
+   pred_masked_weight: 1.0
+   pred_nomask_weight: 0.0
+   loss_weights: [10,]
+
+ optimization:
+   max_update: 400000
+   lr: [0.0005]
+   clip_norm: 10.0
+
+ optimizer:
+   _name: adam
+   adam_betas: (0.9,0.98)
+   adam_eps: 1e-06
+   weight_decay: 0.01
+
+ lr_scheduler:
+   _name: polynomial_decay
+   warmup_updates: 32000
+
+ model:
+   _name: av_hubert
+   label_rate: 25
+   skip_masked: false
+   skip_nomask: false
+   modality_dropout: 0
+   audio_dropout: 0.5
+   modality_fuse: concat
+   selection_type: same_seq
+   masking_type: feature
+   mask_prob_image: 0.8
+   mask_length_image: 10
+   mask_prob_audio: 0.8
+   mask_length_audio: 10
+   extractor_mode: default
+   # conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
+   final_dim: 256
+   encoder_layerdrop: 0.05
+   dropout_input: 0.1
+   dropout_features: 0.1
+   dropout: 0.1
+   attention_dropout: 0.1
+   feature_grad_mult: 0.1
+   untie_final_proj: true
+   activation_dropout: 0.0
+   wav_input: false
+   layer_norm_first: true
+   audio_feat_dim: 104
+
+ hydra:
+   job:
+     config:
+       override_dirname:
+         kv_sep: '-'
+         item_sep: '__'
+         exclude_keys:
+           - run
+           - task.data
+           - task.label_dir
+   run:
+     dir: ???
+   sweep:
+     dir: ???
+     subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
av_hubert/avhubert/conf/pretrain/base_lrs3_iter5.yaml ADDED
@@ -0,0 +1,112 @@
+ # @package _group_
+
+ common:
+   fp16: true
+   log_format: json
+   log_interval: 200
+   seed: 1337
+   user_dir: ???
+   empty_cache_freq: 10000
+
+ checkpoint:
+   save_interval_updates: 25000
+   keep_interval_updates: 1
+   no_epoch_checkpoints: true
+
+
+ distributed_training:
+   ddp_backend: no_c10d
+   distributed_backend: 'nccl'
+   distributed_world_size: 32
+   distributed_port: 29671
+   nprocs_per_node: 8
+
+ task:
+   _name: av_hubert_pretraining
+   data: ???
+   label_dir: ???
+   labels: ["km"]
+   label_rate: ${model.label_rate}
+   sample_rate: 25
+   max_sample_size: 500
+   min_sample_size: 5
+   pad_audio: true
+   random_crop: false
+   normalize: true
+   stack_order_audio: 4
+   # stack_order: 1
+   input_modality: image
+   image_aug: true
+
+ dataset:
+   num_workers: 6
+   max_tokens: 1000
+   skip_invalid_size_inputs_valid_test: true
+   validate_interval: 5
+   validate_interval_updates: 10000
+
+ criterion:
+   _name: av_hubert
+   pred_masked_weight: 1.0
+   pred_nomask_weight: 0.0
+   loss_weights: [10,]
+
+ optimization:
+   max_update: 400000
+   lr: [0.0005]
+   clip_norm: 10.0
+
+ optimizer:
+   _name: adam
+   adam_betas: (0.9,0.98)
+   adam_eps: 1e-06
+   weight_decay: 0.01
+
+ lr_scheduler:
+   _name: polynomial_decay
+   warmup_updates: 32000
+
+ model:
+   _name: av_hubert
+   label_rate: ???
+   skip_masked: false
+   skip_nomask: false
+   modality_dropout: 0.5
+   audio_dropout: 0.5
+   modality_fuse: concat
+   selection_type: same_seq
+   masking_type: input
+   mask_prob_image: 0.3
+   mask_length_image: 5
+   mask_prob_audio: 0.8
+   mask_length_audio: 10
+   extractor_mode: default
+   # conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
+   final_dim: 256
+   encoder_layerdrop: 0.05
+   dropout_input: 0.1
+   dropout_features: 0.1
+   dropout: 0.1
+   attention_dropout: 0.1
+   feature_grad_mult: 0.1
+   untie_final_proj: true
+   activation_dropout: 0.0
+   wav_input: false
+   layer_norm_first: true
+   audio_feat_dim: 104
+
+ hydra:
+   job:
+     config:
+       override_dirname:
+         kv_sep: '-'
+         item_sep: '__'
+         exclude_keys:
+           - run
+           - task.data
+           - task.label_dir
+   run:
+     dir: ???
+   sweep:
+     dir: ???
+     subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
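Unlike iterations 1-4, this config trains on "km" cluster labels, leaves model.label_rate mandatory ("???") so it must be overridden at launch to match the frame rate of the dumped labels, and switches to input-space masking (masking_type: input) with modality dropout 0.5 and a gentler video mask (prob 0.3, span 5). A rough sketch of such span masking, only an approximation of fairseq's compute_mask_indices:

    import numpy as np

    def span_mask(T: int, mask_prob: float = 0.3, mask_length: int = 5,
                  rng: np.random.Generator = None) -> np.ndarray:
        rng = rng or np.random.default_rng()
        n_spans = max(1, int(mask_prob * T / mask_length))   # expected coverage, pre-overlap
        mask = np.zeros(T, dtype=bool)
        for start in rng.integers(0, T - mask_length + 1, size=n_spans):
            mask[start:start + mask_length] = True
        return mask

    m = span_mask(500, rng=np.random.default_rng(0))
    print(m.mean())   # close to 0.3, slightly lower where spans overlap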
av_hubert/avhubert/conf/pretrain/base_vox_iter1.yaml ADDED
@@ -0,0 +1,113 @@
+# @package _group_
+
+common:
+  fp16: true
+  log_format: json
+  log_interval: 200
+  seed: 1337
+  user_dir: ???
+  empty_cache_freq: 10000
+
+checkpoint:
+  save_interval_updates: 25000
+  keep_interval_updates: 1
+  no_epoch_checkpoints: true
+
+
+distributed_training:
+  ddp_backend: no_c10d
+  distributed_backend: 'nccl'
+  distributed_world_size: 32
+  distributed_port: 29671
+  nprocs_per_node: 8
+
+task:
+  _name: av_hubert_pretraining
+  data: ???
+  label_dir: ???
+  labels: ["mfcc"]
+  label_rate: ${model.label_rate}
+  sample_rate: 25
+  max_sample_size: 2000
+  min_sample_size: 5
+  pad_audio: false
+  random_crop: true
+  normalize: true
+  stack_order_audio: 4
+  # stack_order: 1
+  input_modality: image
+  image_aug: true
+  max_trim_sample_size: 400
+
+dataset:
+  num_workers: 6
+  max_tokens: 1000
+  skip_invalid_size_inputs_valid_test: true
+  validate_interval: 5
+  validate_interval_updates: 10000
+
+criterion:
+  _name: av_hubert
+  pred_masked_weight: 1.0
+  pred_nomask_weight: 0.0
+  loss_weights: [10,]
+
+optimization:
+  max_update: 800000
+  lr: [0.002]
+  clip_norm: 10.0
+
+optimizer:
+  _name: adam
+  adam_betas: (0.9,0.98)
+  adam_eps: 1e-06
+  weight_decay: 0.01
+
+lr_scheduler:
+  _name: polynomial_decay
+  warmup_updates: 64000
+
+model:
+  _name: av_hubert
+  label_rate: 100
+  skip_masked: false
+  skip_nomask: false
+  modality_dropout: 0
+  audio_dropout: 0.5
+  modality_fuse: concat
+  selection_type: same_seq
+  masking_type: feature
+  mask_prob_image: 0.8
+  mask_length_image: 10
+  mask_prob_audio: 0.8
+  mask_length_audio: 10
+  extractor_mode: default
+  # conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
+  final_dim: 256
+  encoder_layerdrop: 0.05
+  dropout_input: 0.1
+  dropout_features: 0.1
+  dropout: 0.1
+  attention_dropout: 0.1
+  feature_grad_mult: 0.1
+  untie_final_proj: true
+  activation_dropout: 0.0
+  wav_input: false
+  layer_norm_first: true
+  audio_feat_dim: 104
+
+hydra:
+  job:
+    config:
+      override_dirname:
+        kv_sep: '-'
+        item_sep: '__'
+        exclude_keys:
+          - run
+          - task.data
+          - task.label_dir
+  run:
+    dir: ???
+  sweep:
+    dir: ???
+    subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
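
This first iteration trains against MFCC-based cluster labels at label_rate: 100, while the model consumes 25 Hz frames (sample_rate: 25), so each feature frame has to be mapped onto the denser label sequence. A rough sketch of the alignment implied by that ratio (mirroring the feat2tar_ratio logic that appears in hubert.py further down; names and shapes here are illustrative):

import torch

label_rate, sample_rate = 100, 25
feat2tar_ratio = label_rate / sample_rate   # 4 labels per feature frame

feat_tsz = 10                               # number of 25 Hz feature frames
labels = torch.arange(40).unsqueeze(0)      # toy 100 Hz label sequence, shape [1, 40]

# pick the label index aligned with each feature frame
target_inds = (torch.arange(feat_tsz).float() * feat2tar_ratio).long()
print(labels[:, target_inds])               # -> indices 0, 4, 8, ..., 36
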
av_hubert/avhubert/conf/pretrain/base_vox_iter2.yaml ADDED
@@ -0,0 +1,113 @@
+# @package _group_
+
+common:
+  fp16: true
+  log_format: json
+  log_interval: 200
+  seed: 1337
+  user_dir: ???
+  empty_cache_freq: 10000
+
+checkpoint:
+  save_interval_updates: 25000
+  keep_interval_updates: 1
+  no_epoch_checkpoints: true
+
+
+distributed_training:
+  ddp_backend: no_c10d
+  distributed_backend: 'nccl'
+  distributed_world_size: 32
+  distributed_port: 29671
+  nprocs_per_node: 8
+
+task:
+  _name: av_hubert_pretraining
+  data: ???
+  label_dir: ???
+  labels: ["km"]
+  label_rate: ${model.label_rate}
+  sample_rate: 25
+  max_sample_size: 2000
+  min_sample_size: 5
+  pad_audio: false
+  random_crop: true
+  normalize: true
+  stack_order_audio: 4
+  # stack_order: 1
+  input_modality: image
+  image_aug: true
+  max_trim_sample_size: 400
+
+dataset:
+  num_workers: 6
+  max_tokens: 1000
+  skip_invalid_size_inputs_valid_test: true
+  validate_interval: 5
+  validate_interval_updates: 10000
+
+criterion:
+  _name: av_hubert
+  pred_masked_weight: 1.0
+  pred_nomask_weight: 0.0
+  loss_weights: [10,]
+
+optimization:
+  max_update: 800000
+  lr: [0.002]
+  clip_norm: 10.0
+
+optimizer:
+  _name: adam
+  adam_betas: (0.9,0.98)
+  adam_eps: 1e-06
+  weight_decay: 0.01
+
+lr_scheduler:
+  _name: polynomial_decay
+  warmup_updates: 64000
+
+model:
+  _name: av_hubert
+  label_rate: 25
+  skip_masked: false
+  skip_nomask: false
+  modality_dropout: 0.5
+  audio_dropout: 0.5
+  modality_fuse: concat
+  selection_type: same_seq
+  masking_type: feature
+  mask_prob_image: 0.8
+  mask_length_image: 10
+  mask_prob_audio: 0.8
+  mask_length_audio: 10
+  extractor_mode: default
+  # conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
+  final_dim: 256
+  encoder_layerdrop: 0.05
+  dropout_input: 0.1
+  dropout_features: 0.1
+  dropout: 0.1
+  attention_dropout: 0.1
+  feature_grad_mult: 0.1
+  untie_final_proj: true
+  activation_dropout: 0.0
+  wav_input: false
+  layer_norm_first: true
+  audio_feat_dim: 104
+
+hydra:
+  job:
+    config:
+      override_dirname:
+        kv_sep: '-'
+        item_sep: '__'
+        exclude_keys:
+          - run
+          - task.data
+          - task.label_dir
+  run:
+    dir: ???
+  sweep:
+    dir: ???
+    subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
av_hubert/avhubert/conf/pretrain/base_vox_iter3.yaml ADDED
@@ -0,0 +1,113 @@
+# @package _group_
+
+common:
+  fp16: true
+  log_format: json
+  log_interval: 200
+  seed: 1337
+  user_dir: ???
+  empty_cache_freq: 10000
+
+checkpoint:
+  save_interval_updates: 25000
+  keep_interval_updates: 1
+  no_epoch_checkpoints: true
+
+
+distributed_training:
+  ddp_backend: no_c10d
+  distributed_backend: 'nccl'
+  distributed_world_size: 32
+  distributed_port: 29671
+  nprocs_per_node: 8
+
+task:
+  _name: av_hubert_pretraining
+  data: ???
+  label_dir: ???
+  labels: ["km"]
+  label_rate: ${model.label_rate}
+  sample_rate: 25
+  max_sample_size: 2000
+  min_sample_size: 5
+  pad_audio: false
+  random_crop: true
+  normalize: true
+  stack_order_audio: 4
+  # stack_order: 1
+  input_modality: image
+  image_aug: true
+  max_trim_sample_size: 400
+
+dataset:
+  num_workers: 6
+  max_tokens: 1000
+  skip_invalid_size_inputs_valid_test: true
+  validate_interval: 5
+  validate_interval_updates: 10000
+
+criterion:
+  _name: av_hubert
+  pred_masked_weight: 1.0
+  pred_nomask_weight: 0.0
+  loss_weights: [10,]
+
+optimization:
+  max_update: 800000
+  lr: [0.002]
+  clip_norm: 10.0
+
+optimizer:
+  _name: adam
+  adam_betas: (0.9,0.98)
+  adam_eps: 1e-06
+  weight_decay: 0.01
+
+lr_scheduler:
+  _name: polynomial_decay
+  warmup_updates: 64000
+
+model:
+  _name: av_hubert
+  label_rate: 25
+  skip_masked: false
+  skip_nomask: false
+  modality_dropout: 0.5
+  audio_dropout: 0.5
+  modality_fuse: concat
+  selection_type: same_seq
+  masking_type: feature
+  mask_prob_image: 0.8
+  mask_length_image: 10
+  mask_prob_audio: 0.8
+  mask_length_audio: 10
+  extractor_mode: default
+  # conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
+  final_dim: 256
+  encoder_layerdrop: 0.05
+  dropout_input: 0.1
+  dropout_features: 0.1
+  dropout: 0.1
+  attention_dropout: 0.1
+  feature_grad_mult: 0.1
+  untie_final_proj: true
+  activation_dropout: 0.0
+  wav_input: false
+  layer_norm_first: true
+  audio_feat_dim: 104
+
+hydra:
+  job:
+    config:
+      override_dirname:
+        kv_sep: '-'
+        item_sep: '__'
+        exclude_keys:
+          - run
+          - task.data
+          - task.label_dir
+  run:
+    dir: ???
+  sweep:
+    dir: ???
+    subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
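
modality_dropout: 0.5 with audio_dropout: 0.5 means roughly half the training updates zero out one input stream before fusion, and the dropped stream is audio or video with equal chance. A small sketch of that sampling logic (consistent with the dropout branch in hubert.py further down; tensor shapes are illustrative):

import numpy as np
import torch

modality_dropout, audio_dropout = 0.5, 0.5
features_audio = torch.randn(2, 768, 50)   # [B, F, T], toy values
features_video = torch.randn(2, 768, 50)

if np.random.random() < modality_dropout:      # drop one modality this update
    if np.random.random() < audio_dropout:     # ...and it is the audio stream
        features_audio = 0 * features_audio
    else:                                      # ...otherwise the video stream
        features_video = 0 * features_video

fused = torch.cat([features_audio, features_video], dim=1)  # modality_fuse: concat
print(fused.shape)                                          # torch.Size([2, 1536, 50])
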
av_hubert/avhubert/conf/pretrain/base_vox_iter4.yaml ADDED
@@ -0,0 +1,112 @@
+# @package _group_
+
+common:
+  fp16: true
+  log_format: json
+  log_interval: 200
+  seed: 1337
+  user_dir: ???
+  empty_cache_freq: 10000
+
+checkpoint:
+  save_interval_updates: 25000
+  keep_interval_updates: 1
+  no_epoch_checkpoints: true
+
+
+distributed_training:
+  ddp_backend: no_c10d
+  distributed_backend: 'nccl'
+  distributed_world_size: 32
+  distributed_port: 29671
+  nprocs_per_node: 8
+
+task:
+  _name: av_hubert_pretraining
+  data: ???
+  label_dir: ???
+  labels: ["km"]
+  label_rate: ${model.label_rate}
+  sample_rate: 25
+  max_sample_size: 2000
+  min_sample_size: 5
+  pad_audio: false
+  random_crop: true
+  normalize: true
+  stack_order_audio: 4
+  # stack_order: 1
+  input_modality: image
+  image_aug: true
+  max_trim_sample_size: 400
+
+dataset:
+  num_workers: 6
+  max_tokens: 1000
+  skip_invalid_size_inputs_valid_test: true
+  validate_interval: 5
+  validate_interval_updates: 10000
+
+criterion:
+  _name: av_hubert
+  pred_masked_weight: 1.0
+  pred_nomask_weight: 0.0
+  loss_weights: [10,]
+
+optimization:
+  max_update: 800000
+  lr: [0.002]
+  clip_norm: 10.0
+
+optimizer:
+  _name: adam
+  adam_betas: (0.9,0.98)
+  adam_eps: 1e-06
+  weight_decay: 0.01
+
+lr_scheduler:
+  _name: polynomial_decay
+  warmup_updates: 64000
+
+model:
+  _name: av_hubert
+  label_rate: 25
+  skip_masked: false
+  skip_nomask: false
+  modality_dropout: 0.5
+  audio_dropout: 0.5
+  modality_fuse: concat
+  masking_type: feature
+  mask_prob_image: 0.8
+  mask_length_image: 10
+  mask_prob_audio: 0.8
+  mask_length_audio: 10
+  extractor_mode: default
+  # conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
+  final_dim: 256
+  encoder_layerdrop: 0.05
+  dropout_input: 0.1
+  dropout_features: 0.1
+  dropout: 0.1
+  attention_dropout: 0.1
+  feature_grad_mult: 0.1
+  untie_final_proj: true
+  activation_dropout: 0.0
+  wav_input: false
+  layer_norm_first: true
+  audio_feat_dim: 104
+
+hydra:
+  job:
+    config:
+      override_dirname:
+        kv_sep: '-'
+        item_sep: '__'
+        exclude_keys:
+          - run
+          - task.data
+          - task.label_dir
+  run:
+    dir: ???
+  sweep:
+    dir: ???
+    subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
av_hubert/avhubert/conf/pretrain/base_vox_iter5.yaml ADDED
@@ -0,0 +1,113 @@
+# @package _group_
+
+common:
+  fp16: true
+  log_format: json
+  log_interval: 200
+  seed: 1337
+  user_dir: ???
+  empty_cache_freq: 10000
+
+checkpoint:
+  save_interval_updates: 25000
+  keep_interval_updates: 1
+  no_epoch_checkpoints: true
+
+
+distributed_training:
+  ddp_backend: no_c10d
+  distributed_backend: 'nccl'
+  distributed_world_size: 32
+  distributed_port: 29671
+  nprocs_per_node: 8
+
+task:
+  _name: av_hubert_pretraining
+  data: ???
+  label_dir: ???
+  labels: ["km"]
+  label_rate: ${model.label_rate}
+  sample_rate: 25
+  max_sample_size: 2000
+  min_sample_size: 5
+  pad_audio: false
+  random_crop: true
+  normalize: true
+  stack_order_audio: 4
+  # stack_order: 1
+  input_modality: image
+  image_aug: true
+  max_trim_sample_size: 400
+
+dataset:
+  num_workers: 6
+  max_tokens: 1000
+  skip_invalid_size_inputs_valid_test: true
+  validate_interval: 5
+  validate_interval_updates: 10000
+
+criterion:
+  _name: av_hubert
+  pred_masked_weight: 1.0
+  pred_nomask_weight: 0.0
+  loss_weights: [10,]
+
+optimization:
+  max_update: 800000
+  lr: [0.002]
+  clip_norm: 10.0
+
+optimizer:
+  _name: adam
+  adam_betas: (0.9,0.98)
+  adam_eps: 1e-06
+  weight_decay: 0.01
+
+lr_scheduler:
+  _name: polynomial_decay
+  warmup_updates: 64000
+
+model:
+  _name: av_hubert
+  label_rate: ???
+  skip_masked: false
+  skip_nomask: false
+  modality_dropout: 0.5
+  audio_dropout: 0.5
+  modality_fuse: concat
+  selection_type: same_seq
+  masking_type: input
+  mask_prob_image: 0.3
+  mask_length_image: 5
+  mask_prob_audio: 0.8
+  mask_length_audio: 10
+  extractor_mode: default
+  # conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
+  final_dim: 256
+  encoder_layerdrop: 0.05
+  dropout_input: 0.1
+  dropout_features: 0.1
+  dropout: 0.1
+  attention_dropout: 0.1
+  feature_grad_mult: 0.1
+  untie_final_proj: true
+  activation_dropout: 0.0
+  wav_input: false
+  layer_norm_first: true
+  audio_feat_dim: 104
+
+hydra:
+  job:
+    config:
+      override_dirname:
+        kv_sep: '-'
+        item_sep: '__'
+        exclude_keys:
+          - run
+          - task.data
+          - task.label_dir
+  run:
+    dir: ???
+  sweep:
+    dir: ???
+    subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
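
The hydra.job.config.override_dirname block in each of these configs controls how command-line overrides are flattened into the sweep subdirectory name: key and value joined by '-', items joined by '__', with run, task.data and task.label_dir excluded. A rough pure-Python imitation of that naming rule (Hydra's real implementation handles more edge cases):

def override_dirname(overrides, kv_sep="-", item_sep="__",
                     exclude_keys=("run", "task.data", "task.label_dir")):
    """Flatten a dict of CLI overrides into a directory-name fragment."""
    items = [f"{k}{kv_sep}{v}" for k, v in sorted(overrides.items())
             if k not in exclude_keys]
    return item_sep.join(items)

name = override_dirname({"optimization.lr": "[0.002]",
                         "task.data": "/path/ignored",
                         "model.label_rate": 25})
print(name)  # model.label_rate-25__optimization.lr-[0.002]
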
av_hubert/avhubert/conf/pretrain/large_lrs3_iter5.yaml ADDED
@@ -0,0 +1,117 @@
+# @package _group_
+
+common:
+  fp16: true
+  log_format: json
+  log_interval: 200
+  seed: 1337
+  user_dir: ???
+  empty_cache_freq: 10000
+
+checkpoint:
+  save_interval_updates: 25000
+  keep_interval_updates: 1
+  no_epoch_checkpoints: true
+
+
+distributed_training:
+  ddp_backend: no_c10d
+  distributed_backend: 'nccl'
+  distributed_world_size: 64
+  distributed_port: 29671
+  nprocs_per_node: 8
+
+task:
+  _name: av_hubert_pretraining
+  data: ???
+  label_dir: ???
+  labels: ["km"]
+  label_rate: ${model.label_rate}
+  sample_rate: 25
+  max_sample_size: 2000
+  min_sample_size: 5
+  pad_audio: false
+  random_crop: true
+  normalize: true
+  stack_order_audio: 4
+  # stack_order: 1
+  input_modality: image
+  image_aug: true
+  max_trim_sample_size: 400
+
+dataset:
+  num_workers: 6
+  max_tokens: 1000
+  skip_invalid_size_inputs_valid_test: true
+  validate_interval: 5
+  validate_interval_updates: 10000
+
+criterion:
+  _name: av_hubert
+  pred_masked_weight: 1.0
+  pred_nomask_weight: 1.0
+  loss_weights: [10,]
+
+optimization:
+  max_update: 400000
+  lr: [0.002]
+  clip_norm: 10.0
+
+optimizer:
+  _name: adam
+  adam_betas: (0.9,0.98)
+  adam_eps: 1e-06
+  weight_decay: 0.01
+
+lr_scheduler:
+  _name: polynomial_decay
+  warmup_updates: 32000
+
+model:
+  _name: av_hubert
+  label_rate: 25
+  skip_masked: false
+  skip_nomask: false
+  modality_dropout: 0.5
+  audio_dropout: 0.5
+  modality_fuse: concat
+  selection_type: same_seq
+  masking_type: input
+  mask_prob_image: 0.3
+  mask_length_image: 5
+  mask_prob_audio: 0.8
+  mask_length_audio: 10
+  extractor_mode: default
+  # conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
+  final_dim: 256
+  encoder_layerdrop: 0.05
+  dropout_input: 0.1
+  dropout_features: 0.1
+  dropout: 0.1
+  attention_dropout: 0.1
+  feature_grad_mult: 0.1
+  untie_final_proj: true
+  activation_dropout: 0.0
+  wav_input: false
+  layer_norm_first: true
+  audio_feat_dim: 104
+  encoder_layers: 24
+  encoder_embed_dim: 1024
+  encoder_ffn_embed_dim: 4096
+  encoder_attention_heads: 16
+
+hydra:
+  job:
+    config:
+      override_dirname:
+        kv_sep: '-'
+        item_sep: '__'
+        exclude_keys:
+          - run
+          - task.data
+          - task.label_dir
+  run:
+    dir: ???
+  sweep:
+    dir: ???
+    subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
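
Unlike the base configs above, the large configs set pred_nomask_weight: 1.0, so the prediction loss is also computed over unmasked frames. A sketch of how these criterion fields would combine into a scalar loss (variable names and toy values are illustrative; the actual reduction lives in hubert_criterion.py):

import torch

pred_masked_weight, pred_nomask_weight = 1.0, 1.0   # large-config values
loss_weights = [10.0]                               # weight on the extra feature penalty

loss_masked = torch.tensor(2.31)    # toy cross-entropy over masked frames
loss_nomask = torch.tensor(1.87)    # toy cross-entropy over unmasked frames
features_pen = torch.tensor(0.04)   # extra loss, e.g. mean squared feature norm

total = (pred_masked_weight * loss_masked
         + pred_nomask_weight * loss_nomask
         + loss_weights[0] * features_pen)
print(total)  # tensor(4.5800)
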
av_hubert/avhubert/conf/pretrain/large_vox_iter5.yaml ADDED
@@ -0,0 +1,117 @@
+# @package _group_
+
+common:
+  fp16: true
+  log_format: json
+  log_interval: 200
+  seed: 1337
+  user_dir: ???
+  empty_cache_freq: 10000
+
+checkpoint:
+  save_interval_updates: 25000
+  keep_interval_updates: 1
+  no_epoch_checkpoints: true
+
+
+distributed_training:
+  ddp_backend: no_c10d
+  distributed_backend: 'nccl'
+  distributed_world_size: 64
+  distributed_port: 29671
+  nprocs_per_node: 8
+
+task:
+  _name: av_hubert_pretraining
+  data: ???
+  label_dir: ???
+  labels: ["km"]
+  label_rate: ${model.label_rate}
+  sample_rate: 25
+  max_sample_size: 2000
+  min_sample_size: 5
+  pad_audio: false
+  random_crop: true
+  normalize: true
+  stack_order_audio: 4
+  # stack_order: 1
+  input_modality: image
+  image_aug: true
+  max_trim_sample_size: 400
+
+dataset:
+  num_workers: 6
+  max_tokens: 1000
+  skip_invalid_size_inputs_valid_test: true
+  validate_interval: 5
+  validate_interval_updates: 10000
+
+criterion:
+  _name: av_hubert
+  pred_masked_weight: 1.0
+  pred_nomask_weight: 1.0
+  loss_weights: [10,]
+
+optimization:
+  max_update: 600000
+  lr: [0.002]
+  clip_norm: 10.0
+
+optimizer:
+  _name: adam
+  adam_betas: (0.9,0.98)
+  adam_eps: 1e-06
+  weight_decay: 0.01
+
+lr_scheduler:
+  _name: polynomial_decay
+  warmup_updates: 48000
+
+model:
+  _name: av_hubert
+  label_rate: ???
+  skip_masked: false
+  skip_nomask: false
+  modality_dropout: 0.5
+  audio_dropout: 0.5
+  modality_fuse: concat
+  selection_type: same_seq
+  masking_type: input
+  mask_prob_image: 0.3
+  mask_length_image: 5
+  mask_prob_audio: 0.8
+  mask_length_audio: 10
+  extractor_mode: default
+  # conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
+  final_dim: 256
+  encoder_layerdrop: 0.05
+  dropout_input: 0.1
+  dropout_features: 0.1
+  dropout: 0.1
+  attention_dropout: 0.1
+  feature_grad_mult: 0.1
+  untie_final_proj: true
+  activation_dropout: 0.0
+  wav_input: false
+  layer_norm_first: true
+  audio_feat_dim: 104
+  encoder_layers: 24
+  encoder_embed_dim: 1024
+  encoder_ffn_embed_dim: 4096
+  encoder_attention_heads: 16
+
+hydra:
+  job:
+    config:
+      override_dirname:
+        kv_sep: '-'
+        item_sep: '__'
+        exclude_keys:
+          - run
+          - task.data
+          - task.label_dir
+  run:
+    dir: ???
+  sweep:
+    dir: ???
+    subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
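
A back-of-the-envelope check on the LARGE encoder size implied by the four fields above (24 layers, 1024-dim embeddings, 4096-dim FFN, 16 heads); this counts only the transformer blocks, ignoring the ResNet front-end, projections, and positional embeddings:

d, ffn, layers = 1024, 4096, 24

attn = 4 * (d * d + d)                     # Q, K, V and output projections with biases
ffn_params = d * ffn + ffn + ffn * d + d   # two FFN linears with biases
norms = 2 * 2 * d                          # two LayerNorms (weight + bias) per block

per_layer = attn + ffn_params + norms
print(f"per layer: {per_layer/1e6:.2f}M, total: {layers*per_layer/1e6:.1f}M")
# per layer: ~12.60M, total: ~302.3M transformer parameters
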
av_hubert/avhubert/conf/pretrain/noise_base_vox_iter5.yaml ADDED
@@ -0,0 +1,115 @@
+# @package _group_
+
+common:
+  fp16: true
+  log_format: json
+  log_interval: 200
+  seed: 1337
+  user_dir: ???
+  empty_cache_freq: 10000
+
+checkpoint:
+  save_interval_updates: 25000
+  keep_interval_updates: 1
+  no_epoch_checkpoints: true
+
+
+distributed_training:
+  ddp_backend: no_c10d
+  distributed_backend: 'nccl'
+  distributed_world_size: 32
+  distributed_port: 29671
+  nprocs_per_node: 8
+
+task:
+  _name: av_hubert_pretraining
+  data: ???
+  label_dir: ???
+  labels: ["km"]
+  label_rate: ${model.label_rate}
+  sample_rate: 25
+  max_sample_size: 2000
+  min_sample_size: 5
+  pad_audio: false
+  random_crop: true
+  normalize: true
+  stack_order_audio: 4
+  input_modality: image
+  image_aug: true
+  max_trim_sample_size: 400
+  noise_prob: 0.25
+  noise_snr: 0
+  noise_wav: ???
+
+dataset:
+  num_workers: 6
+  max_tokens: 1000
+  skip_invalid_size_inputs_valid_test: true
+  validate_interval: 5
+  validate_interval_updates: 10000
+
+criterion:
+  _name: av_hubert
+  pred_masked_weight: 1.0
+  pred_nomask_weight: 0.0
+  loss_weights: [10,]
+
+optimization:
+  max_update: 800000
+  lr: [0.002]
+  clip_norm: 10.0
+
+optimizer:
+  _name: adam
+  adam_betas: (0.9,0.98)
+  adam_eps: 1e-06
+  weight_decay: 0.01
+
+lr_scheduler:
+  _name: polynomial_decay
+  warmup_updates: 64000
+
+model:
+  _name: av_hubert
+  label_rate: ???
+  skip_masked: false
+  skip_nomask: false
+  modality_dropout: 0.5
+  audio_dropout: 0.5
+  modality_fuse: concat
+  selection_type: same_seq
+  masking_type: input
+  mask_prob_image: 0.3
+  mask_length_image: 5
+  mask_prob_audio: 0.8
+  mask_length_audio: 10
+  extractor_mode: default
+  # conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
+  final_dim: 256
+  encoder_layerdrop: 0.05
+  dropout_input: 0.1
+  dropout_features: 0.1
+  dropout: 0.1
+  attention_dropout: 0.1
+  feature_grad_mult: 0.1
+  untie_final_proj: true
+  activation_dropout: 0.0
+  wav_input: false
+  layer_norm_first: true
+  audio_feat_dim: 104
+
+hydra:
+  job:
+    config:
+      override_dirname:
+        kv_sep: '-'
+        item_sep: '__'
+        exclude_keys:
+          - run
+          - task.data
+          - task.label_dir
+  run:
+    dir: ???
+  sweep:
+    dir: ???
+    subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
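
noise_prob: 0.25 with noise_snr: 0 means a quarter of the training utterances get additive noise mixed in at 0 dB, i.e. equal signal and noise power. A generic sketch of SNR-controlled mixing (not the repo's exact augmentation code, which lives in the dataset/wav utilities):

import numpy as np

def add_noise(clean: np.ndarray, noise: np.ndarray, snr_db: float) -> np.ndarray:
    """Mix `noise` into `clean` so the result has the requested SNR in dB."""
    noise = noise[: len(clean)]                       # crop noise to utterance length
    clean_power = np.mean(clean ** 2)
    noise_power = np.mean(noise ** 2) + 1e-12
    scale = np.sqrt(clean_power / (noise_power * 10 ** (snr_db / 10)))
    return clean + scale * noise

rng = np.random.default_rng(0)
mixed = add_noise(rng.standard_normal(16000), rng.standard_normal(16000), snr_db=0)
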
av_hubert/avhubert/conf/pretrain/noise_large_vox_iter5.yaml ADDED
@@ -0,0 +1,119 @@
+# @package _group_
+
+common:
+  fp16: true
+  log_format: json
+  log_interval: 200
+  seed: 1337
+  user_dir: ???
+  empty_cache_freq: 10000
+
+checkpoint:
+  save_interval_updates: 25000
+  keep_interval_updates: 1
+  no_epoch_checkpoints: true
+
+
+distributed_training:
+  ddp_backend: no_c10d
+  distributed_backend: 'nccl'
+  distributed_world_size: 64
+  distributed_port: 29671
+  nprocs_per_node: 8
+
+task:
+  _name: av_hubert_pretraining
+  data: ???
+  label_dir: ???
+  labels: ["km"]
+  label_rate: ${model.label_rate}
+  sample_rate: 25
+  max_sample_size: 2000
+  min_sample_size: 5
+  pad_audio: false
+  random_crop: true
+  normalize: true
+  stack_order_audio: 4
+  input_modality: image
+  image_aug: true
+  max_trim_sample_size: 400
+  noise_prob: 0.25
+  noise_snr: 0
+  noise_wav: ???
+
+dataset:
+  num_workers: 6
+  max_tokens: 1000
+  skip_invalid_size_inputs_valid_test: true
+  validate_interval: 5
+  validate_interval_updates: 10000
+
+criterion:
+  _name: av_hubert
+  pred_masked_weight: 1.0
+  pred_nomask_weight: 1.0
+  loss_weights: [10,]
+
+optimization:
+  max_update: 600000
+  lr: [0.002]
+  clip_norm: 10.0
+
+optimizer:
+  _name: adam
+  adam_betas: (0.9,0.98)
+  adam_eps: 1e-06
+  weight_decay: 0.01
+
+lr_scheduler:
+  _name: polynomial_decay
+  warmup_updates: 48000
+
+model:
+  _name: av_hubert
+  label_rate: ???
+  skip_masked: false
+  skip_nomask: false
+  modality_dropout: 0.5
+  audio_dropout: 0.5
+  modality_fuse: concat
+  selection_type: same_seq
+  masking_type: input
+  mask_prob_image: 0.3
+  mask_length_image: 5
+  mask_prob_audio: 0.8
+  mask_length_audio: 10
+  extractor_mode: default
+  # conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
+  final_dim: 256
+  encoder_layerdrop: 0.05
+  dropout_input: 0.1
+  dropout_features: 0.1
+  dropout: 0.1
+  attention_dropout: 0.1
+  feature_grad_mult: 0.1
+  untie_final_proj: true
+  activation_dropout: 0.0
+  wav_input: false
+  layer_norm_first: true
+  audio_feat_dim: 104
+  encoder_layers: 24
+  encoder_embed_dim: 1024
+  encoder_ffn_embed_dim: 4096
+  encoder_attention_heads: 16
+
+hydra:
+  job:
+    config:
+      override_dirname:
+        kv_sep: '-'
+        item_sep: '__'
+        exclude_keys:
+          - run
+          - task.data
+          - task.label_dir
+  run:
+    dir: ???
+  sweep:
+    dir: ???
+    subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
av_hubert/avhubert/conf/s2s_decode.yaml ADDED
@@ -0,0 +1,23 @@
+common:
+  user_dir: ???
+
+generation:
+  beam: 50
+  max_len_a: 1.0
+  max_len_b: 0
+  lenpen: 1.0
+  lm_weight: 0
+
+common_eval:
+  results_path: ???
+  path: ???
+
+dataset:
+  max_tokens: 1000
+  gen_subset: valid
+  num_workers: 0
+
+override:
+  noise_prob: 0.0
+  noise_snr: 0
+  modalities: ???
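
A quick sketch of reading this decoding config with OmegaConf (the library Hydra and fairseq configs are built on) to pull out the beam-search settings; the path is relative to the repo root, and fields left as '???' are mandatory and must be overridden before use:

from omegaconf import OmegaConf

cfg = OmegaConf.load("av_hubert/avhubert/conf/s2s_decode.yaml")
print(cfg.generation.beam)                                # 50
print(cfg.generation.lenpen)                              # 1.0
print(OmegaConf.is_missing(cfg.override, "modalities"))   # True: still '???'
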
av_hubert/avhubert/decoder.py ADDED
@@ -0,0 +1,243 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from argparse import Namespace
+import contextlib
+import copy
+import math
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from dataclasses import dataclass, field
+from omegaconf import MISSING, II, open_dict
+from typing import Any, Optional
+
+from fairseq import checkpoint_utils, tasks, utils
+from fairseq.dataclass import FairseqDataclass
+from fairseq.dataclass.utils import convert_namespace_to_omegaconf
+from fairseq.tasks import FairseqTask
+from fairseq.models import (
+    BaseFairseqModel,
+    FairseqEncoder,
+    FairseqEncoderDecoderModel,
+    FairseqIncrementalDecoder,
+    register_model,
+)
+# from fairseq.models.wav2vec.wav2vec2 import MASKING_DISTRIBUTION_CHOICES
+from fairseq.modules import (
+    LayerNorm,
+    PositionalEmbedding,
+    TransformerDecoderLayer,
+)
+
+
+class TransformerDecoder(FairseqIncrementalDecoder):
+    """
+    Transformer decoder consisting of *args.decoder_layers* layers. Each layer
+    is a :class:`TransformerDecoderLayer`.
+
+    Args:
+        args (argparse.Namespace): parsed command-line arguments
+        dictionary (~fairseq.data.Dictionary): decoding dictionary
+        embed_tokens (torch.nn.Embedding): output embedding
+        no_encoder_attn (bool, optional): whether to attend to encoder outputs
+            (default: False).
+    """
+
+    def __init__(
+        self,
+        cfg,
+        dictionary,
+        embed_tokens,
+        no_encoder_attn=False,
+    ):
+        super().__init__(dictionary)
+
+        self.dropout = cfg.decoder_dropout
+        self.share_input_output_embed = cfg.share_decoder_input_output_embed
+
+        input_embed_dim = embed_tokens.embedding_dim
+        embed_dim = cfg.decoder_embed_dim
+        self.output_embed_dim = cfg.decoder_embed_dim
+
+        self.layerdrop = cfg.decoder_layerdrop
+
+        padding_idx = embed_tokens.padding_idx
+        self.max_target_positions = cfg.max_target_positions
+
+        self.embed_tokens = embed_tokens
+        # self.embed_scale = math.sqrt(embed_dim)  # todo: try with input_embed_dim
+        self.embed_scale = 1.0 if cfg.no_scale_embedding else math.sqrt(embed_dim)
+
+        self.project_in_dim = (
+            Linear(input_embed_dim, embed_dim, bias=False)
+            if embed_dim != input_embed_dim
+            else None
+        )
+
+        self.embed_positions = (
+            PositionalEmbedding(
+                cfg.max_target_positions,
+                embed_dim,
+                padding_idx,
+                learned=cfg.decoder_learned_pos,
+            )
+            if not cfg.no_token_positional_embeddings
+            else None
+        )
+
+        # TODO: update this when transformer gets converted to dataclass configs
+        transformer_cfg = copy.deepcopy(cfg)
+        # with open_dict(transformer_cfg):
+        transformer_cfg.dropout = transformer_cfg.decoder_dropout
+        transformer_cfg.attention_dropout = (
+            transformer_cfg.decoder_attention_dropout
+        )
+        transformer_cfg.activation_dropout = (
+            transformer_cfg.decoder_activation_dropout
+        )
+
+        self.layers = nn.ModuleList([])
+        self.layers.extend(
+            [
+                TransformerDecoderLayer(transformer_cfg, no_encoder_attn)
+                for _ in range(transformer_cfg.decoder_layers)
+            ]
+        )
+
+        if not self.share_input_output_embed:
+            self.embed_out = nn.Parameter(
+                torch.Tensor(len(dictionary), self.output_embed_dim)
+            )
+            nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim ** -0.5)
+
+        if transformer_cfg.decoder_normalize_before:
+            self.layer_norm = LayerNorm(embed_dim)
+        else:
+            self.layer_norm = None
+
+    def forward(
+        self, prev_output_tokens, encoder_out=None, incremental_state=None, **unused
+    ):
+        """
+        Args:
+            prev_output_tokens (LongTensor): previous decoder outputs of shape
+                `(batch, tgt_len)`, for teacher forcing
+            encoder_out (Tensor, optional): output from the encoder, used for
+                encoder-side attention
+            incremental_state (dict): dictionary used for storing state during
+                :ref:`Incremental decoding`
+
+        Returns:
+            tuple:
+                - the decoder's output of shape `(batch, tgt_len, vocab)`
+                - a dictionary with any model-specific outputs
+        """
+        prev_output_tokens = prev_output_tokens.long()
+        x, extra = self.extract_features(
+            prev_output_tokens, encoder_out, incremental_state
+        )
+        x = self.output_layer(x)
+        return x, extra
+
+    def extract_features(
+        self, prev_output_tokens, encoder_out=None, incremental_state=None, **unused
+    ):
+        """
+        Similar to *forward* but only return features.
+
+        Returns:
+            tuple:
+                - the decoder's features of shape `(batch, tgt_len, embed_dim)`
+                - a dictionary with any model-specific outputs
+        """
+
+        # embed positions
+        positions = (
+            self.embed_positions(
+                prev_output_tokens, incremental_state=incremental_state
+            )
+            if self.embed_positions is not None
+            else None
+        )
+
+        if incremental_state is not None:
+            prev_output_tokens = prev_output_tokens[:, -1:]
+            if positions is not None:
+                positions = positions[:, -1:]
+
+        # embed tokens and positions
+        x = self.embed_scale * self.embed_tokens(prev_output_tokens)
+
+        if self.project_in_dim is not None:
+            x = self.project_in_dim(x)
+
+        if positions is not None:
+            x += positions
+        x = F.dropout(x, p=self.dropout, training=self.training)
+
+        # B x T x C -> T x B x C
+        x = x.transpose(0, 1)
+        attn = None
+
+        inner_states = [x]
+
+        # decoder layers
+        for layer in self.layers:
+            dropout_probability = np.random.random()
+            if not self.training or (dropout_probability > self.layerdrop):
+                x, attn, _ = layer(
+                    x,
+                    encoder_out["encoder_out"] if encoder_out is not None else None,
+                    encoder_out["padding_mask"] if encoder_out is not None else None,
+                    incremental_state,
+                    self_attn_mask=self.buffered_future_mask(x)
+                    if incremental_state is None
+                    else None,
+                )
+                inner_states.append(x)
+
+        if self.layer_norm:
+            x = self.layer_norm(x)
+
+        # T x B x C -> B x T x C
+        x = x.transpose(0, 1)
+
+        return x, {"attn": attn, "inner_states": inner_states}
+
+    def output_layer(self, features, **kwargs):
+        """Project features to the vocabulary size."""
+        # project back to size of vocabulary
+        emb_mat = self.embed_tokens.weight if self.share_input_output_embed else self.embed_out
+        return torch.matmul(features, emb_mat.transpose(0, 1))
+        # if self.share_input_output_embed:
+        #     return F.linear(features, self.embed_tokens.weight)
+        # else:
+        #     return F.linear(features, self.embed_out)
+
+    def max_positions(self):
+        """Maximum output length supported by the decoder."""
+        if self.embed_positions is None:
+            return self.max_target_positions
+        return min(self.max_target_positions, self.embed_positions.max_positions)
+
+    def buffered_future_mask(self, tensor):
+        dim = tensor.size(0)
+        if (
+            not hasattr(self, "_future_mask")
+            or self._future_mask is None
+            or self._future_mask.device != tensor.device
+            or self._future_mask.size(0) < dim
+        ):
+            self._future_mask = torch.triu(
+                utils.fill_with_neg_inf(tensor.new(dim, dim)), 1
+            )
+        return self._future_mask[:dim, :dim]
+
+    def upgrade_state_dict_named(self, state_dict, name):
+        return state_dict
+
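
For illustration, buffered_future_mask above builds the standard causal attention mask: a strictly upper-triangular block of -inf added to the attention scores so position t cannot attend to positions after t. A standalone sketch of the same construction, without the caching:

import torch

def future_mask(dim: int) -> torch.Tensor:
    """Strictly upper-triangular -inf mask, 0 on and below the diagonal."""
    return torch.triu(torch.full((dim, dim), float("-inf")), diagonal=1)

print(future_mask(4))
# tensor([[0., -inf, -inf, -inf],
#         [0.,   0., -inf, -inf],
#         [0.,   0.,   0., -inf],
#         [0.,   0.,   0.,   0.]])
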
av_hubert/avhubert/hubert.py ADDED
@@ -0,0 +1,779 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import os,sys
8
+ import logging
9
+ from typing import Dict, List, Optional, Tuple
10
+
11
+ import numpy as np
12
+
13
+ import torch
14
+ import torch.nn as nn
15
+ from dataclasses import dataclass, field
16
+ from fairseq import utils
17
+ from fairseq.data.data_utils import compute_mask_indices
18
+ from fairseq.data.dictionary import Dictionary
19
+ from fairseq.dataclass import ChoiceEnum, FairseqDataclass
20
+ from fairseq.models import BaseFairseqModel, register_model
21
+ from fairseq.models.wav2vec.wav2vec2 import (
22
+ ConvFeatureExtractionModel,
23
+ TransformerEncoder,
24
+ )
25
+ from fairseq.modules import GradMultiply, LayerNorm
26
+ from copy import deepcopy
27
+
28
+ DBG=True if len(sys.argv) == 1 else False
29
+
30
+ if DBG:
31
+ from hubert_pretraining import (
32
+ AVHubertPretrainingConfig,
33
+ AVHubertPretrainingTask,
34
+ )
35
+ from resnet import ResEncoder
36
+ logging.basicConfig(
37
+ format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
38
+ datefmt="%Y-%m-%d %H:%M:%S",
39
+ level=os.environ.get("LOGLEVEL", "INFO").upper(),
40
+ stream=sys.stdout,
41
+ )
42
+ from utils import compute_mask_indices
43
+ from decoder import TransformerDecoder
44
+
45
+ else:
46
+ from .hubert_pretraining import (
47
+ AVHubertPretrainingConfig,
48
+ AVHubertPretrainingTask,
49
+ )
50
+ from .resnet import ResEncoder
51
+ from .utils import compute_mask_indices
52
+ from .decoder import TransformerDecoder
53
+
54
+ from omegaconf import II
55
+
56
+ logger = logging.getLogger(__name__)
57
+
58
+ EXTRACTOR_MODE_CHOICES = ChoiceEnum(["default", "layer_norm"])
59
+ MASKING_DISTRIBUTION_CHOICES = ChoiceEnum(
60
+ ["static", "uniform", "normal", "poisson"]
61
+ )
62
+
63
+
64
+ @dataclass
65
+ class AVHubertConfig(FairseqDataclass):
66
+ label_rate: int = II("task.label_rate")
67
+ input_modality: str = II("task.input_modality")
68
+ extractor_mode: EXTRACTOR_MODE_CHOICES = field(
69
+ default="default",
70
+ metadata={
71
+ "help": "mode for feature extractor. default has a single group "
72
+ "norm with d groups in the first conv block, whereas layer_norm "
73
+ "has layer norms in every block (meant to use with normalize=True)"
74
+ },
75
+ )
76
+ encoder_layers: int = field(
77
+ default=12, metadata={"help": "num encoder layers in the transformer"}
78
+ )
79
+ encoder_embed_dim: int = field(
80
+ default=768, metadata={"help": "encoder embedding dimension"}
81
+ )
82
+ encoder_ffn_embed_dim: int = field(
83
+ default=3072, metadata={"help": "encoder embedding dimension for FFN"}
84
+ )
85
+ encoder_attention_heads: int = field(
86
+ default=12, metadata={"help": "num encoder attention heads"}
87
+ )
88
+ activation_fn: ChoiceEnum(utils.get_available_activation_fns()) = field(
89
+ default="gelu", metadata={"help": "activation function to use"}
90
+ )
91
+
92
+ # dropouts
93
+ dropout: float = field(
94
+ default=0.1,
95
+ metadata={"help": "dropout probability for the transformer"},
96
+ )
97
+ attention_dropout: float = field(
98
+ default=0.1,
99
+ metadata={"help": "dropout probability for attention weights"},
100
+ )
101
+ activation_dropout: float = field(
102
+ default=0.0,
103
+ metadata={"help": "dropout probability after activation in FFN"},
104
+ )
105
+ encoder_layerdrop: float = field(
106
+ default=0.0,
107
+ metadata={"help": "probability of dropping a tarnsformer layer"},
108
+ )
109
+ dropout_input: float = field(
110
+ default=0.0,
111
+ metadata={"help": "dropout to apply to the input (after feat extr)"},
112
+ )
113
+ dropout_features: float = field(
114
+ default=0.0,
115
+ metadata={
116
+ "help": "dropout to apply to the features (after feat extr)"
117
+ },
118
+ )
119
+
120
+ final_dim: int = field(
121
+ default=0,
122
+ metadata={
123
+ "help": "project final representations and targets to this many "
124
+ "dimensions. set to encoder_embed_dim is <= 0"
125
+ },
126
+ )
127
+ untie_final_proj: bool = field(
128
+ default=False,
129
+ metadata={"help": "use separate projection for each target"},
130
+ )
131
+ layer_norm_first: bool = field(
132
+ default=False,
133
+ metadata={"help": "apply layernorm first in the transformer"},
134
+ )
135
+ conv_feature_layers: str = field(
136
+ default="[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2",
137
+ metadata={
138
+ "help": "string describing convolutional feature extraction "
139
+ "layers in form of a python list that contains "
140
+ "[(dim, kernel_size, stride), ...]"
141
+ },
142
+ )
143
+ conv_bias: bool = field(
144
+ default=False, metadata={"help": "include bias in conv encoder"}
145
+ )
146
+ logit_temp: float = field(
147
+ default=0.1, metadata={"help": "temperature to divide logits by"}
148
+ )
149
+ target_glu: bool = field(
150
+ default=False, metadata={"help": "adds projection + glu to targets"}
151
+ )
152
+ feature_grad_mult: float = field(
153
+ default=1.0,
154
+ metadata={"help": "multiply feature extractor var grads by this"},
155
+ )
156
+
157
+ # masking
158
+ mask_length_audio: int = field(default=10, metadata={"help": "mask length"})
159
+ mask_prob_audio: float = field(
160
+ default=0.65,
161
+ metadata={"help": "probability of replacing a token with mask"},
162
+ )
163
+ mask_length_image: int = field(default=10, metadata={"help": "mask length"})
164
+ mask_prob_image: float = field(
165
+ default=0.65,
166
+ metadata={"help": "probability of replacing a token with mask"},
167
+ )
168
+ mask_selection: MASKING_DISTRIBUTION_CHOICES = field(
169
+ default="static", metadata={"help": "how to choose mask length"}
170
+ )
171
+ mask_other: float = field(
172
+ default=0,
173
+ metadata={
174
+ "help": "secondary mask argument "
175
+ "(used for more complex distributions), "
176
+ "see help in compute_mask_indicesh"
177
+ },
178
+ )
179
+ no_mask_overlap: bool = field(
180
+ default=False, metadata={"help": "whether to allow masks to overlap"}
181
+ )
182
+ mask_min_space: int = field(
183
+ default=1,
184
+ metadata={
185
+ "help": "min space between spans (if no overlap is enabled)"
186
+ },
187
+ )
188
+
189
+ # channel masking
190
+ mask_channel_length: int = field(
191
+ default=10,
192
+ metadata={"help": "length of the mask for features (channels)"},
193
+ )
194
+ mask_channel_prob: float = field(
195
+ default=0.0,
196
+ metadata={"help": "probability of replacing a feature with 0"},
197
+ )
198
+ mask_channel_selection: MASKING_DISTRIBUTION_CHOICES = field(
199
+ default="static",
200
+ metadata={"help": "how to choose mask length for channel masking"},
201
+ )
202
+ mask_channel_other: float = field(
203
+ default=0,
204
+ metadata={
205
+ "help": "secondary mask argument "
206
+ "(used for more complex distributions), "
207
+ "see help in compute_mask_indicesh"
208
+ },
209
+ )
210
+ no_mask_channel_overlap: bool = field(
211
+ default=False,
212
+ metadata={"help": "whether to allow channel masks to overlap"},
213
+ )
214
+ mask_channel_min_space: int = field(
215
+ default=1,
216
+ metadata={
217
+ "help": "min space between spans (if no overlap is enabled)"
218
+ },
219
+ )
220
+
221
+ # positional embeddings
222
+ conv_pos: int = field(
223
+ default=128,
224
+ metadata={
225
+ "help": "number of filters for convolutional positional embeddings"
226
+ },
227
+ )
228
+ conv_pos_groups: int = field(
229
+ default=16,
230
+ metadata={
231
+ "help": "number of groups for convolutional positional embedding"
232
+ },
233
+ )
234
+
235
+ latent_temp: Tuple[float, float, float] = field(
236
+ default=(2, 0.5, 0.999995),
237
+ metadata={"help": "legacy (to be removed)"},
238
+ )
239
+
240
+ # loss computation
241
+ skip_masked: bool = field(
242
+ default=False,
243
+ metadata={"help": "skip computing losses over masked frames"},
244
+ )
245
+ skip_nomask: bool = field(
246
+ default=False,
247
+ metadata={"help": "skip computing losses over unmasked frames"},
248
+ )
249
+ resnet_relu_type: str = field(default='prelu', metadata={"help": 'relu type for resnet'})
250
+ resnet_weights: Optional[str] = field(default=None, metadata={"help": 'resnet weights'})
251
+ sim_type: str = field(default='cosine', metadata={"help": 'similarity type'})
252
+
253
+ sub_encoder_layers: int = field(default=0, metadata={'help': 'number of transformer layers for single modality'})
254
+ audio_feat_dim: int = field(default=-1, metadata={'help': 'audio feature dimension'})
255
+ modality_dropout: float = field(default=0, metadata={'help': 'drop one modality'})
256
+ audio_dropout: float = field(default=0, metadata={'help': 'drop audio feature'})
257
+ modality_fuse: str = field(default='concat', metadata={'help': 'fusing two modalities: add,concat'})
258
+ selection_type : str = field(default='same_other_seq', metadata={'help': 'type of selectig images, same_other_seq: replace masked span with span from another sequence, same_seq: repace masked span with span of the same sequence'})
259
+ masking_type : str = field(default='input', metadata={'help': 'input or feature masking'})
260
+
261
+ decoder_embed_dim: int = field(
262
+ default=768, metadata={"help": "decoder embedding dimension"}
263
+ )
264
+ decoder_ffn_embed_dim: int = field(
265
+ default=3072, metadata={"help": "decoder embedding dimension for FFN"}
266
+ )
267
+ decoder_layers: int = field(
268
+ default=6, metadata={"help": "num of decoder layers"}
269
+ )
270
+ decoder_layerdrop: float = field(
271
+ default=0.0, metadata={"help": "decoder layerdrop chance"}
272
+ )
273
+ decoder_attention_heads: int = field(
274
+ default=4, metadata={"help": "num decoder attention heads"}
275
+ )
276
+ decoder_learned_pos: bool = field(
277
+ default=False,
278
+ metadata={"help": "use learned positional embeddings in the decoder"},
279
+ )
280
+ decoder_normalize_before: bool = field(
281
+ default=False,
282
+ metadata={"help": "apply layernorm before each decoder block"},
283
+ )
284
+ no_token_positional_embeddings: bool = field(
285
+ default=False,
286
+ metadata={
287
+ "help": "if set, disables positional embeddings "
288
+ "(outside self attention)"
289
+ },
290
+ )
291
+ decoder_dropout: float = field(
292
+ default=0.1, metadata={"help": "dropout probability in the decoder"}
293
+ )
294
+ decoder_attention_dropout: float = field(
295
+ default=0.1,
296
+ metadata={
297
+ "help": "dropout probability for attention weights "
298
+ "inside the decoder"
299
+ },
300
+ )
301
+ decoder_activation_dropout: float = field(
302
+ default=0.0,
303
+ metadata={
304
+ "help": "dropout probability after activation in FFN "
305
+ "inside the decoder"
306
+ },
307
+ )
308
+ max_target_positions: int = field(
309
+ default=2048, metadata={"help": "max target positions"}
310
+ )
311
+ share_decoder_input_output_embed: bool = field(
312
+ default=False,
313
+ metadata={"help": "share decoder input and output embeddings"},
314
+ )
315
+ no_scale_embedding: bool = field(default=True, metadata={'help': 'scale embedding'})
316
+
317
+ class SubModel(nn.Module):
318
+ def __init__(self, resnet=None, input_dim=None, cfg=None):
319
+ super().__init__()
320
+ self.resnet = resnet
321
+ self.proj = nn.Linear(input_dim, cfg.encoder_embed_dim)
322
+ self.encoder = TransformerEncoder(cfg) if cfg.encoder_layers > 0 else None
323
+
324
+ def forward(self, x):
325
+ if self.resnet is not None:
326
+ x = self.resnet(x)
327
+ x = self.proj(x.transpose(1, 2))
328
+ if self.encoder is not None:
329
+ x = self.encoder(x)[0].transpose(1, 2)
330
+ else:
331
+ x = x.transpose(1, 2)
332
+ return x
333
+
334
+ @register_model("av_hubert", dataclass=AVHubertConfig)
335
+ class AVHubertModel(BaseFairseqModel):
336
+ def __init__(
337
+ self,
338
+ cfg: AVHubertConfig,
339
+ task_cfg: AVHubertPretrainingConfig,
340
+ dictionaries: List[Dictionary],
341
+ **kwargs
342
+ ) -> None:
343
+ super().__init__()
344
+ logger.info(f"HubertModel Config: {cfg}")
345
+
346
+ feature_ds_rate = 1
347
+ self.feat2tar_ratio = cfg.label_rate * feature_ds_rate / task_cfg.sample_rate
348
+ sub_cfg = deepcopy(cfg)
349
+ sub_cfg.encoder_layers = sub_cfg.sub_encoder_layers
350
+ resnet = ResEncoder(relu_type=cfg.resnet_relu_type, weights=cfg.resnet_weights)
351
+ self.feature_extractor_audio = SubModel(resnet=None, input_dim=cfg.audio_feat_dim, cfg=sub_cfg)
352
+ self.feature_extractor_video = SubModel(resnet=resnet, input_dim=resnet.backend_out, cfg=sub_cfg)
353
+ self.modality_dropout, self.audio_dropout = cfg.modality_dropout, cfg.audio_dropout
354
+ self.modality_fuse = cfg.modality_fuse
355
+ self.encoder_embed_dim = cfg.encoder_embed_dim
356
+ if self.modality_fuse == 'concat':
357
+ self.embed = cfg.encoder_embed_dim * 2
358
+ elif self.modality_fuse == 'add':
359
+ self.embed = cfg.encoder_embed_dim
360
+ self.post_extract_proj = (
361
+ nn.Linear(self.embed, cfg.encoder_embed_dim)
362
+ if self.embed != cfg.encoder_embed_dim
363
+ else None
364
+ )
365
+
366
+ self.mask_prob_image, self.mask_prob_audio = cfg.mask_prob_image, cfg.mask_prob_audio
367
+ self.mask_selection = cfg.mask_selection
368
+ self.mask_other = cfg.mask_other
369
+ self.mask_length_image, self.mask_length_audio = cfg.mask_length_image, cfg.mask_length_audio
370
+ self.no_mask_overlap = cfg.no_mask_overlap
371
+ self.mask_min_space = cfg.mask_min_space
372
+
373
+ self.mask_channel_prob = cfg.mask_channel_prob
374
+ self.mask_channel_selection = cfg.mask_channel_selection
375
+ self.mask_channel_other = cfg.mask_channel_other
376
+ self.mask_channel_length = cfg.mask_channel_length
377
+ self.no_mask_channel_overlap = cfg.no_mask_channel_overlap
378
+ self.mask_channel_min_space = cfg.mask_channel_min_space
379
+
380
+ self.dropout_input = nn.Dropout(cfg.dropout_input)
381
+ self.dropout_features = nn.Dropout(cfg.dropout_features)
382
+
383
+ self.feature_grad_mult = cfg.feature_grad_mult
384
+ self.logit_temp = cfg.logit_temp
385
+ self.skip_masked = cfg.skip_masked
386
+ self.skip_nomask = cfg.skip_nomask
387
+ self.sim_type = cfg.sim_type
388
+ self.selection_type = cfg.selection_type
389
+ self.masking_type = cfg.masking_type
390
+
391
+ final_dim = (
392
+ cfg.final_dim if cfg.final_dim > 0 else cfg.encoder_embed_dim
393
+ )
394
+
395
+ self.mask_emb = nn.Parameter(
396
+ torch.FloatTensor(cfg.audio_feat_dim).uniform_() if self.masking_type == 'input' else torch.FloatTensor(cfg.encoder_embed_dim).uniform_()
397
+ )
398
+
399
+ self.encoder = TransformerEncoder(cfg)
400
+ self.layer_norm = LayerNorm(self.embed)
401
+
402
+ self.target_glu = None
403
+ if cfg.target_glu:
404
+ self.target_glu = nn.Sequential(
405
+ nn.Linear(final_dim, final_dim * 2), nn.GLU()
406
+ )
407
+
408
+ self.untie_final_proj = cfg.untie_final_proj
409
+ if self.untie_final_proj:
410
+ self.final_proj = nn.Linear(
411
+ cfg.encoder_embed_dim, final_dim * len(dictionaries)
412
+ )
413
+ else:
414
+ self.final_proj = nn.Linear(cfg.encoder_embed_dim, final_dim)
415
+
416
+ # modules below are not needed during fine-tuning
417
+ if any([d is None for d in dictionaries]):
418
+ logger.info(
419
+ "cannot find dictionary. assume will be used for fine-tuning"
420
+ )
421
+ else:
422
+ self.num_classes = [len(d) for d in dictionaries]
423
+ self.label_embs_concat = nn.Parameter(
424
+ torch.FloatTensor(sum(self.num_classes), final_dim)
425
+ )
426
+ nn.init.uniform_(self.label_embs_concat)
427
+
428
+ def upgrade_state_dict_named(self, state_dict, name):
429
+ """Upgrade a (possibly old) state dict for new versions of fairseq."""
430
+
431
+ super().upgrade_state_dict_named(state_dict, name)
432
+ return state_dict
433
+
434
+ @classmethod
435
+ def build_model(cls, cfg: AVHubertConfig, task: AVHubertPretrainingTask):
436
+ """Build a new model instance."""
437
+
438
+ kwargs = {}
439
+ model = AVHubertModel(cfg, task.cfg, task.dictionaries, **kwargs)
440
+ return model
441
+
442
+ def apply_input_mask(self, x, padding_mask, target_list):
443
+ B, C, T = x.shape[:3]
444
+ is_audio = True if len(x.shape) == 3 else False
445
+ if is_audio:
446
+ mask_prob, mask_length = self.mask_prob_audio, self.mask_length_audio
447
+ else:
448
+ mask_prob, mask_length = self.mask_prob_image, self.mask_length_image
449
+ if mask_prob > 0:
450
+
451
+ mask_indices, starts, ends, batch_indexes = compute_mask_indices(
452
+ (B, T),
453
+ padding_mask,
454
+ mask_prob,
455
+ mask_length,
456
+ self.mask_selection,
457
+ self.mask_other,
458
+ min_masks=2,
459
+ no_overlap=self.no_mask_overlap,
460
+ min_space=self.mask_min_space,
461
+ )
462
+ mask_indices_np = mask_indices
463
+ mask_indices = torch.from_numpy(mask_indices).to(x.device)
464
+ x = x.transpose(1, 2).contiguous() # [B, T, C, H, W]
465
+ if B == 1:
466
+ x[mask_indices] = 0
467
+ elif is_audio:
468
+ x[mask_indices] = self.mask_emb
469
+ elif self.selection_type == 'same_other_seq':
470
+ perm = (torch.arange(B) + torch.randint(low=1, high=B, size=(1,))) % B
471
+ x_perm = x[perm]
472
+ x[mask_indices] = x_perm[mask_indices]
473
+ elif self.selection_type == 'same_seq':
474
+ batch_indexes_, other_indexes = [], []
475
+ for batch_index, start, end in zip(batch_indexes, starts, ends):
476
+ length = end-start
477
+ other_start = np.setdiff1d(np.arange(T), np.arange(max(0, start-length), end))
478
+ if len(other_start) > 0:
479
+ other_start = np.random.choice(other_start, size=1)
480
+ else:
481
+ other_start = 0
482
+ other_end = other_start + length
483
+ other_indexes.append(np.arange(other_start, other_end).clip(max=T-1))
484
+ batch_indexes_.append(np.zeros([length], dtype=np.int64)+batch_index)
485
+ batch_indexes, other_indexes = np.concatenate(batch_indexes_), np.concatenate(other_indexes)
486
+ x[mask_indices] = x[batch_indexes, other_indexes]
487
+
488
+ x = x.transpose(1, 2).contiguous()
489
+ else:
490
+ mask_indices = None
491
+
492
+ if self.mask_channel_prob > 0:
493
+ logger.info(f"No mask channel prob for input masking")
494
+ return x, mask_indices
495
+
496
+ def apply_feature_mask(self, x, padding_mask, target_list):
497
+ B, T, C = x.shape
498
+ assert self.mask_prob_audio == self.mask_prob_image and self.mask_length_audio == self.mask_length_image, f"masking prob/length for image/audio be same for feature masking"
499
+ mask_prob, mask_length = self.mask_prob_audio, self.mask_length_image
500
+ if mask_prob > 0:
501
+ mask_indices, _, _, _ = compute_mask_indices(
502
+ (B, T),
503
+ padding_mask,
504
+ mask_prob,
505
+ mask_length,
506
+ self.mask_selection,
507
+ self.mask_other,
508
+ min_masks=2,
509
+ no_overlap=self.no_mask_overlap,
510
+ min_space=self.mask_min_space,
511
+ )
512
+ mask_indices = torch.from_numpy(mask_indices).to(x.device)
513
+ x[mask_indices] = self.mask_emb
514
+ else:
515
+ mask_indices = None
516
+
517
+ if self.mask_channel_prob > 0:
518
+ mask_channel_indices, _, _, _ = compute_mask_indices(
519
+ (B, C),
520
+ None,
521
+ self.mask_channel_prob,
522
+ self.mask_channel_length,
523
+ self.mask_channel_selection,
524
+ self.mask_channel_other,
525
+ no_overlap=self.no_mask_channel_overlap,
526
+ min_space=self.mask_channel_min_space,
527
+ )
528
+ mask_channel_indices = (
529
+ torch.from_numpy(mask_channel_indices)
530
+ .to(x.device)
531
+ .unsqueeze(1)
532
+ .expand(-1, T, -1)
533
+ )
534
+ x[mask_channel_indices] = 0
535
+
536
+ return x, mask_indices
537
+
538
+ def forward_features(self, source: torch.Tensor, modality: str) -> torch.Tensor:
539
+ extractor = eval(f"self.feature_extractor_{modality}")
540
+ if self.feature_grad_mult > 0:
541
+ features = extractor(source)
542
+ if self.feature_grad_mult != 1.0:
543
+ features = GradMultiply.apply(features, self.feature_grad_mult)
544
+ else:
545
+ with torch.no_grad():
546
+ features = extractor(source)
547
+ return features
548
+
549
+ def forward_targets(
550
+ self, features: torch.Tensor, mask_indices: torch.Tensor, target_list: List[torch.Tensor],
551
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
552
+ # Trim features to ensure labels exist and then get aligned labels
553
+ feat_tsz = features.size(2)
554
+ targ_tsz = min([t.size(1) for t in target_list])
555
+ if self.feat2tar_ratio * feat_tsz > targ_tsz:
556
+ feat_tsz = int(targ_tsz / self.feat2tar_ratio)
557
+ features = features[..., :feat_tsz]
558
+ if mask_indices is not None:
559
+ mask_indices = mask_indices[..., :feat_tsz]
560
+ target_inds = torch.arange(feat_tsz).float() * self.feat2tar_ratio
561
+ target_list = [t[:, target_inds.long()] for t in target_list]
562
+ return features, mask_indices, target_list
563
+
564
+ def forward_padding_mask(
565
+ self, features: torch.Tensor, padding_mask: torch.Tensor,
566
+ ) -> torch.Tensor:
567
+ extra = padding_mask.size(1) % features.size(1)
568
+ if extra > 0:
569
+ padding_mask = padding_mask[:, :-extra]
570
+ padding_mask = padding_mask.view(
571
+ padding_mask.size(0), features.size(1), -1
572
+ )
573
+ padding_mask = padding_mask.all(-1)
574
+ return padding_mask
575
+
576
+ def compute_logits(self, feats, emb_mat):
577
+ # feats: [B, T, F], emb_mat: [V, F]
578
+ if self.sim_type == 'dot':
579
+ logits = torch.matmul(feats, emb_mat.transpose(0, 1))
580
+ elif self.sim_type == 'cosine':
581
+ batch_size, timesteps, emb_dim = feats.size()
582
+ feats_ = feats.view(-1, emb_dim)
583
+ nom = (feats_.unsqueeze(dim=1) * emb_mat.unsqueeze(dim=0)).sum(dim=-1) # [B*T, V]
584
+ denom = (feats_**2).sum(dim=-1).sqrt().unsqueeze(dim=1) * (emb_mat**2).sum(dim=-1).sqrt().unsqueeze(dim=0) # [B*T, V]
585
+ logits = (nom/denom.clamp(min=1e-6)).view(batch_size, timesteps, -1)
586
+ else:
587
+ raise NotImplementedError
588
+ logits = logits / self.logit_temp
589
+ return logits
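+ # Shape sketch for compute_logits (illustrative sizes, not from this file):
+ # feats [B=2, T=50, F=256] against emb_mat [V=500, F=256] yields logits
+ # [2, 50, 500]; with sim_type='cosine' each raw score lies in [-1, 1], and
+ # dividing by a temperature below 1 (e.g. 0.1) sharpens the distribution
+ # before the cross-entropy in the criterion.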
590
+
591
+ def forward(
592
+ self,
593
+ source: torch.Tensor,
594
+ target_list: Optional[List[torch.Tensor]] = None,
595
+ padding_mask: Optional[torch.Tensor] = None,
596
+ mask: bool = True,
597
+ features_only: bool = False,
598
+ output_layer: Optional[int] = None
599
+ ) -> Dict[str, torch.Tensor]:
600
+ """output layer is 1-based"""
601
+ src_audio, src_video = source['audio'], source['video']
602
+ if mask and self.masking_type == 'input':
603
+ src_video, mask_indices_video = self.apply_input_mask(src_video, padding_mask, target_list)
604
+ src_audio, mask_indices_audio = self.apply_input_mask(src_audio, padding_mask, target_list)
605
+ mask_indices = torch.logical_or(mask_indices_audio, mask_indices_video)
606
+ else:
607
+ src_audio, src_video, mask_indices = src_audio, src_video, None
608
+
609
+ features_audio = self.forward_features(src_audio, modality='audio') # features: [B, F, T]
610
+ features_video = self.forward_features(src_video, modality='video')
611
+ modality_drop_prob, audio_drop_prob = np.random.random(), np.random.random()
612
+ if self.training:
613
+ if modality_drop_prob < self.modality_dropout:
614
+ if audio_drop_prob < self.audio_dropout:
615
+ features_audio = 0 * features_audio
616
+ else:
617
+ features_video = 0 * features_video
618
+ if self.modality_fuse == 'concat':
619
+ features = torch.cat([features_audio, features_video], dim=1)
620
+ elif self.modality_fuse == 'add':
621
+ features = features_audio + features_video
622
+ if target_list is not None:
623
+ features, mask_indices, target_list = self.forward_targets(features, mask_indices, target_list)
624
+
625
+ features_pen = features.float().pow(2).mean()
626
+
627
+ features = features.transpose(1, 2)
628
+ features = self.layer_norm(features)
629
+
630
+ if padding_mask is not None:
631
+ padding_mask = self.forward_padding_mask(features, padding_mask)
632
+
633
+ if self.post_extract_proj is not None:
634
+ features = self.post_extract_proj(features)
635
+
636
+ features = self.dropout_input(features)
637
+ if self.masking_type == 'feature' and mask:
638
+ x, mask_indices = self.apply_feature_mask(features, padding_mask, target_list)
639
+ else:
640
+ x = features
641
+
642
+ # feature: (B, T, D), float
643
+ # target: (B, T), long
644
+ # x: (B, T, D), float
645
+ # padding_mask: (B, T), bool
646
+ # mask_indices: (B, T), bool
647
+ x, _ = self.encoder(
648
+ x,
649
+ padding_mask=padding_mask,
650
+ layer=None if output_layer is None else output_layer - 1
651
+ )
652
+
653
+ if features_only:
654
+ return {"x": x, "padding_mask": padding_mask, "features": features}
655
+
656
+ label_embs_list = self.label_embs_concat.split(self.num_classes, 0)
657
+ proj_x = self.final_proj(x)
658
+ if self.untie_final_proj:
659
+ proj_x_list = proj_x.chunk(len(self.num_classes), dim=-1)
660
+ else:
661
+ proj_x_list = [proj_x for _ in self.num_classes]
662
+ logit_list = [self.compute_logits(proj, emb).view(-1, num_class) for proj, emb, num_class in zip(proj_x_list, label_embs_list, self.num_classes)] # [[B*T, V]]
663
+ mask, unmask = torch.logical_and(mask_indices, ~padding_mask).view(-1), torch.logical_and(~mask_indices, ~padding_mask).view(-1) # [B*T]
664
+ logit_m_list, logit_u_list = [logit[mask] for logit in logit_list], [logit[unmask] for logit in logit_list]
665
+ target_m_list, target_u_list = [target.view(-1)[mask].long() for target in target_list], [target.view(-1)[unmask].long() for target in target_list]
666
+ result = {
667
+ "logit_m_list": logit_m_list,
668
+ "logit_u_list": logit_u_list,
669
+ "target_m_list": target_m_list,
670
+ "target_u_list": target_u_list,
671
+ "padding_mask": padding_mask,
672
+ "features_pen": features_pen,
673
+ }
674
+ return result
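+ # The criterion consumes this dict directly: logit_m_list/target_m_list cover
+ # frames under mask_indices, logit_u_list/target_u_list the remaining
+ # unpadded frames, and features_pen is an L2 penalty on the fused extractor
+ # output (weighted via the criterion's loss_weights).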
675
+
676
+ def extract_features(
677
+ self,
678
+ source: torch.Tensor,
679
+ padding_mask: Optional[torch.Tensor] = None,
680
+ mask: bool = False,
681
+ ret_conv: bool = False,
682
+ output_layer: Optional[int] = None,
683
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
684
+ res = self.forward(
685
+ source,
686
+ padding_mask=padding_mask,
687
+ mask=mask,
688
+ features_only=True,
689
+ output_layer=output_layer,
690
+ )
691
+ feature = res["features"] if ret_conv else res["x"]
692
+ return feature, res["padding_mask"]
693
+
694
+ def extract_finetune(self, source, padding_mask=None, mask=False, ret_conv=False, output_layer=None):
695
+ src_audio, src_video = source['audio'], source['video']
696
+ if mask and self.masking_type == 'input':
697
+ src_video, mask_indices_video = self.apply_input_mask(src_video, padding_mask, target_list=None)
698
+ src_audio, mask_indices_audio = self.apply_input_mask(src_audio, padding_mask, target_list=None)
699
+ mask_indices = torch.logical_or(mask_indices_audio, mask_indices_video) # mask_indices not used in fine-tuning
700
+ else:
701
+ src_audio, src_video, mask_indices = src_audio, src_video, None
702
+
703
+ if src_audio is not None and src_video is None:
704
+ features_audio = self.forward_features(src_audio, modality='audio') # features: [B, F, T]
705
+ features_video = features_audio.new_zeros(features_audio.size(0), self.encoder_embed_dim, features_audio.size(-1))
706
+ elif src_audio is None and src_video is not None:
707
+ features_video = self.forward_features(src_video, modality='video')
708
+ features_audio = features_video.new_zeros(features_video.size(0), self.encoder_embed_dim, features_video.size(-1))
709
+ elif src_audio is not None and src_video is not None:
710
+ features_video = self.forward_features(src_video, modality='video')
711
+ features_audio = self.forward_features(src_audio, modality='audio') # features: [B, F, T]
712
+
713
+ if self.modality_fuse == 'concat':
714
+ features = torch.cat([features_audio, features_video], dim=1)
715
+ elif self.modality_fuse == 'add':
716
+ features = features_audio + features_video
717
+ features_pen = features.float().pow(2).mean()
718
+
719
+ features = features.transpose(1, 2)
720
+ features = self.layer_norm(features)
721
+ unmasked_features = features.clone()
722
+
723
+ if padding_mask is not None:
724
+ padding_mask = self.forward_padding_mask(features, padding_mask)
725
+
726
+ if self.post_extract_proj is not None:
727
+ features = self.post_extract_proj(features)
728
+
729
+ features = self.dropout_input(features)
730
+ unmasked_features = self.dropout_features(unmasked_features)
731
+ x = features
732
+ mask_indices = None
733
+
734
+ # feature: (B, T, D), float
735
+ # target: (B, T), long
736
+ # x: (B, T, D), float
737
+ # padding_mask: (B, T), bool
738
+ # mask_indices: (B, T), bool
739
+ x, _ = self.encoder(
740
+ x,
741
+ padding_mask=padding_mask,
742
+ layer=None if output_layer is None else output_layer - 1
743
+ )
744
+
745
+ return x, padding_mask
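+ # extract_finetune accepts audio-only or video-only input: the absent stream
+ # is replaced by a zero tensor of matching shape before fusion, so the same
+ # fused encoder can be fine-tuned for audio, visual, or audio-visual speech
+ # recognition without architectural changes.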
746
+
747
+
748
+ def get_extra_losses(self, net_output):
749
+ extra_losses = []
750
+ names = []
751
+ if "features_pen" in net_output:
752
+ extra_losses.append(net_output["features_pen"])
753
+ names.append("features_pen")
754
+
755
+ return extra_losses, names
756
+
757
+ def remove_pretraining_modules(self):
758
+ self.target_glu = None
759
+ self.final_proj = None
760
+
761
+ def get_logits(self, net_output, is_masked=True):
762
+ raise NotImplementedError
763
+
764
+ def get_targets(self, net_output, is_masked=True):
765
+ raise NotImplementedError
766
+
767
+ def compute_nce(self, x, pos, negs):
768
+ neg_is_pos = (pos == negs).all(-1)
769
+ pos = pos.unsqueeze(0)
770
+ targets = torch.cat([pos, negs], dim=0)
771
+
772
+ logits = torch.cosine_similarity(
773
+ x.float(), targets.float(), dim=-1
774
+ ).type_as(x)
775
+ logits /= self.logit_temp
776
+ if neg_is_pos.any():
777
+ logits[1:][neg_is_pos] = float("-inf")
778
+ logits = logits.transpose(0, 1) # (num_x, num_cls+1)
779
+ return logits
av_hubert/avhubert/hubert_asr.py ADDED
@@ -0,0 +1,521 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import sys,logging
8
+ import contextlib
9
+ import tempfile
10
+ from argparse import Namespace
11
+ from typing import Any, Optional
12
+
13
+ import torch
14
+ import torch.nn as nn
15
+ from dataclasses import dataclass, field
16
+ from fairseq import checkpoint_utils, tasks, utils
17
+ from fairseq.dataclass import FairseqDataclass
18
+ from fairseq.dataclass.utils import convert_namespace_to_omegaconf
19
+ from fairseq.models import BaseFairseqModel, FairseqEncoder, FairseqEncoderDecoderModel, register_model
20
+ from fairseq.models.hubert.hubert import MASKING_DISTRIBUTION_CHOICES
21
+ from fairseq.tasks import FairseqTask
22
+ from omegaconf import II, MISSING
23
+
24
+ DBG=True if len(sys.argv) == 1 else False
25
+
26
+ if DBG:
27
+ from hubert import AVHubertModel
28
+ from decoder import TransformerDecoder
29
+ else:
30
+ from .hubert import AVHubertModel
31
+ from .decoder import TransformerDecoder
32
+
33
+ logger = logging.getLogger(__name__)
34
+
35
+
36
+ @dataclass
37
+ class AVHubertAsrConfig(FairseqDataclass):
38
+ w2v_path: str = field(
39
+ default=MISSING, metadata={"help": "path to hubert model"}
40
+ )
41
+ no_pretrained_weights: bool = field(
42
+ default=False,
43
+ metadata={"help": "if true, does not load pretrained weights"},
44
+ )
45
+ dropout_input: float = field(
46
+ default=0.0,
47
+ metadata={"help": "dropout to apply to the input (after feat extr)"},
48
+ )
49
+ final_dropout: float = field(
50
+ default=0.0,
51
+ metadata={
52
+ "help": "dropout after transformer and before final projection"
53
+ },
54
+ )
55
+ dropout: float = field(
56
+ default=0.0,
57
+ metadata={"help": "dropout probability inside hubert model"},
58
+ )
59
+ attention_dropout: float = field(
60
+ default=0.0,
61
+ metadata={
62
+ "help": "dropout probability for attention weights "
63
+ "inside hubert model"
64
+ },
65
+ )
66
+ activation_dropout: float = field(
67
+ default=0.0,
68
+ metadata={
69
+ "help": "dropout probability after activation in FFN "
70
+ "inside hubert model"
71
+ },
72
+ )
73
+
74
+ # masking
75
+ apply_mask: bool = field(
76
+ default=False, metadata={"help": "apply masking during fine-tuning"}
77
+ )
78
+ mask_length: int = field(
79
+ default=10, metadata={"help": "length of each mask span"}
80
+ )
81
+ mask_prob: float = field(
82
+ default=0.5,
83
+ metadata={
84
+ "help": "probability of replacing a token with mask "
85
+ "(normalized by length)"
86
+ },
87
+ )
88
+ mask_selection: MASKING_DISTRIBUTION_CHOICES = field(
89
+ default="static", metadata={"help": "how to choose masks"}
90
+ )
91
+ mask_other: float = field(
92
+ default=0,
93
+ metadata={
94
+ "help": "secondary mask argument "
95
+ "(used for more complex distributions), "
96
+ "see help in compute_mask_indices"
97
+ },
98
+ )
99
+ no_mask_overlap: bool = field(
100
+ default=False, metadata={"help": "whether to allow masks to overlap"}
101
+ )
102
+
103
+ # channel masking
104
+ mask_channel_length: int = field(
105
+ default=10,
106
+ metadata={"help": "length of the mask for features (channels)"},
107
+ )
108
+ mask_channel_prob: float = field(
109
+ default=0.0,
110
+ metadata={"help": "probability of replacing a feature with 0"},
111
+ )
112
+ mask_channel_selection: MASKING_DISTRIBUTION_CHOICES = field(
113
+ default="static",
114
+ metadata={"help": "how to choose mask length for channel masking"},
115
+ )
116
+ mask_channel_other: float = field(
117
+ default=0,
118
+ metadata={
119
+ "help": "secondary mask argument "
120
+ "(used for more complex distributions), "
121
+ "see help in compute_mask_indices"
122
+ },
123
+ )
124
+ no_mask_channel_overlap: bool = field(
125
+ default=False,
126
+ metadata={"help": "whether to allow channel masks to overlap"},
127
+ )
128
+ freeze_finetune_updates: int = field(
129
+ default=0,
130
+ metadata={"help": "dont finetune hubert for this many updates"},
131
+ )
132
+ feature_grad_mult: float = field(
133
+ default=0.0,
134
+ metadata={"help": "reset feature grad mult in hubert to this"},
135
+ )
136
+ layerdrop: float = field(
137
+ default=0.0,
138
+ metadata={"help": "probability of dropping a layer in hubert"},
139
+ )
140
+ normalize: bool = II("task.normalize")
141
+ data: str = II("task.data")
142
+
143
+ # this holds the loaded hubert args
144
+ w2v_args: Any = None
145
+
146
+
147
+ @dataclass
148
+ class AVHubertCtcConfig(AVHubertAsrConfig):
149
+ pass
150
+
151
+
152
+ @register_model("av_hubert_ctc", dataclass=AVHubertCtcConfig)
153
+ class AVHubertCtc(BaseFairseqModel):
154
+ def __init__(self, cfg: AVHubertCtcConfig, w2v_encoder: BaseFairseqModel):
155
+ super().__init__()
156
+ self.cfg = cfg
157
+ self.w2v_encoder = w2v_encoder
158
+
159
+ def upgrade_state_dict_named(self, state_dict, name):
160
+ super().upgrade_state_dict_named(state_dict, name)
161
+ return state_dict
162
+
163
+ @classmethod
164
+ def build_model(cls, cfg: AVHubertCtcConfig, task: FairseqTask):
165
+ """Build a new model instance."""
166
+ w2v_encoder = HubertEncoder(cfg, task.target_dictionary)
167
+ return cls(cfg, w2v_encoder)
168
+
169
+ def get_normalized_probs(self, net_output, log_probs):
170
+ """Get normalized probabilities (or log probs) from a net's output."""
171
+
172
+ logits = net_output["encoder_out"]
173
+ if log_probs:
174
+ return utils.log_softmax(logits.float(), dim=-1)
175
+ else:
176
+ return utils.softmax(logits.float(), dim=-1)
177
+
178
+ def get_logits(self, net_output):
179
+ logits = net_output["encoder_out"]
180
+ padding = net_output["encoder_padding_mask"]
181
+ if padding is not None and padding.any():
182
+ padding = padding.T
183
+ # boolean indexing returns a copy, so chained assignment would be a no-op;
+ # build the per-class fill row once and assign it in place
+ fill = logits.new_full((logits.size(-1),), float("-inf"))
+ fill[0] = 0
+ logits[padding] = fill
185
+
186
+ return logits
187
+
188
+ def forward(self, **kwargs):
189
+ x = self.w2v_encoder(**kwargs)
190
+ return x
191
+
192
+
193
+ @dataclass
194
+ class AVHubertSeq2SeqConfig(AVHubertAsrConfig):
195
+ decoder_embed_dim: int = field(
196
+ default=768, metadata={"help": "decoder embedding dimension"}
197
+ )
198
+ decoder_ffn_embed_dim: int = field(
199
+ default=3072, metadata={"help": "decoder embedding dimension for FFN"}
200
+ )
201
+ decoder_layers: int = field(
202
+ default=6, metadata={"help": "num of decoder layers"}
203
+ )
204
+ decoder_layerdrop: float = field(
205
+ default=0.0, metadata={"help": "decoder layerdrop chance"}
206
+ )
207
+ decoder_attention_heads: int = field(
208
+ default=4, metadata={"help": "num decoder attention heads"}
209
+ )
210
+ decoder_learned_pos: bool = field(
211
+ default=False,
212
+ metadata={"help": "use learned positional embeddings in the decoder"},
213
+ )
214
+ decoder_normalize_before: bool = field(
215
+ default=False,
216
+ metadata={"help": "apply layernorm before each decoder block"},
217
+ )
218
+ no_token_positional_embeddings: bool = field(
219
+ default=False,
220
+ metadata={
221
+ "help": "if set, disables positional embeddings "
222
+ "(outside self attention)"
223
+ },
224
+ )
225
+ decoder_dropout: float = field(
226
+ default=0.0, metadata={"help": "dropout probability in the decoder"}
227
+ )
228
+ decoder_attention_dropout: float = field(
229
+ default=0.0,
230
+ metadata={
231
+ "help": "dropout probability for attention weights "
232
+ "inside the decoder"
233
+ },
234
+ )
235
+ decoder_activation_dropout: float = field(
236
+ default=0.0,
237
+ metadata={
238
+ "help": "dropout probability after activation in FFN "
239
+ "inside the decoder"
240
+ },
241
+ )
242
+ max_target_positions: int = field(
243
+ default=2048, metadata={"help": "max target positions"}
244
+ )
245
+ share_decoder_input_output_embed: bool = field(
246
+ default=False,
247
+ metadata={"help": "share decoder input and output embeddings"},
248
+ )
249
+ no_scale_embedding: bool = field(default=True, metadata={'help': 'if set, does not scale embeddings by sqrt(embed_dim)'})
250
+
251
+ class HubertEncoder(FairseqEncoder):
252
+ def __init__(self, cfg: AVHubertAsrConfig, tgt_dict=None):
253
+ self.apply_mask = cfg.apply_mask
254
+
255
+ arg_overrides = {
256
+ "dropout": cfg.dropout,
257
+ "activation_dropout": cfg.activation_dropout,
258
+ "dropout_input": cfg.dropout_input,
259
+ "attention_dropout": cfg.attention_dropout,
260
+ "mask_length": cfg.mask_length,
261
+ "mask_prob": cfg.mask_prob,
262
+ "mask_selection": cfg.mask_selection,
263
+ "mask_other": cfg.mask_other,
264
+ "no_mask_overlap": cfg.no_mask_overlap,
265
+ "mask_channel_length": cfg.mask_channel_length,
266
+ "mask_channel_prob": cfg.mask_channel_prob,
267
+ "mask_channel_selection": cfg.mask_channel_selection,
268
+ "mask_channel_other": cfg.mask_channel_other,
269
+ "no_mask_channel_overlap": cfg.no_mask_channel_overlap,
270
+ "encoder_layerdrop": cfg.layerdrop,
271
+ "feature_grad_mult": cfg.feature_grad_mult,
272
+ }
273
+
274
+ if cfg.w2v_args is None:
275
+ state = checkpoint_utils.load_checkpoint_to_cpu(
276
+ cfg.w2v_path, arg_overrides
277
+ )
278
+ w2v_args = state.get("cfg", None)
279
+ if w2v_args is None:
280
+ w2v_args = convert_namespace_to_omegaconf(state["args"])
281
+ cfg.w2v_args = w2v_args
282
+ else:
283
+ state = None
284
+ w2v_args = cfg.w2v_args
285
+ if isinstance(w2v_args, Namespace):
286
+ cfg.w2v_args = w2v_args = convert_namespace_to_omegaconf(
287
+ w2v_args
288
+ )
289
+
290
+ assert cfg.normalize == w2v_args.task.normalize, (
291
+ "Fine-tuning works best when data normalization is the same. "
292
+ "Please check that --normalize is set or unset for "
293
+ "both pre-training and here"
294
+ )
295
+
296
+ w2v_args.task.data = cfg.data
297
+
298
+ task = tasks.setup_task(w2v_args.task)
299
+ model = task.build_model(w2v_args.model)
300
+
301
+ if state is not None and not cfg.no_pretrained_weights:
302
+ # set strict=False because we omit some modules
303
+ model.load_state_dict(state["model"], strict=False)
304
+
305
+ model.remove_pretraining_modules()
306
+
307
+ super().__init__(task.source_dictionary)
308
+
309
+ d = model.encoder.embedding_dim
310
+
311
+ self.w2v_model = model
312
+
313
+ self.final_dropout = nn.Dropout(cfg.final_dropout)
314
+ self.freeze_finetune_updates = cfg.freeze_finetune_updates
315
+ self.num_updates = 0
316
+
317
+ if tgt_dict is not None:
318
+ self.proj = Linear(d, len(tgt_dict))
319
+ elif getattr(cfg, "decoder_embed_dim", d) != d:
320
+ self.proj = Linear(d, cfg.decoder_embed_dim)
321
+ else:
322
+ self.proj = None
323
+
324
+ def set_num_updates(self, num_updates):
325
+ """Set the number of parameters updates."""
326
+ super().set_num_updates(num_updates)
327
+ self.num_updates = num_updates
328
+
329
+ def forward(self, source, padding_mask, tbc=True, **kwargs):
330
+
331
+ w2v_args = {
332
+ "source": source,
333
+ "padding_mask": padding_mask,
334
+ "mask": self.apply_mask and self.training,
335
+ }
336
+ ft = self.freeze_finetune_updates <= self.num_updates
337
+
338
+ with torch.no_grad() if not ft else contextlib.ExitStack():
339
+ x, padding_mask = self.w2v_model.extract_finetune(**w2v_args)
340
+
341
+ if tbc:
342
+ # B x T x C -> T x B x C
343
+ x = x.transpose(0, 1)
344
+
345
+ x = self.final_dropout(x)
346
+
347
+ if self.proj:
348
+ x = self.proj(x)
349
+
350
+ return {
351
+ "encoder_out": x, # T x B x C
352
+ "encoder_padding_mask": padding_mask, # B x T
353
+ "padding_mask": padding_mask,
354
+ }
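+ # Until freeze_finetune_updates optimizer steps have elapsed, the pretrained
+ # encoder runs under torch.no_grad() and only the projection head trains;
+ # after that, contextlib.ExitStack() acts as a no-op context and gradients
+ # flow through the whole model.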
355
+
356
+ def reorder_encoder_out(self, encoder_out, new_order):
357
+ if encoder_out["encoder_out"] is not None:
358
+ encoder_out["encoder_out"] = encoder_out[
359
+ "encoder_out"
360
+ ].index_select(1, new_order)
361
+ if encoder_out["encoder_padding_mask"] is not None:
362
+ encoder_out["encoder_padding_mask"] = encoder_out[
363
+ "encoder_padding_mask"
364
+ ].index_select(0, new_order)
365
+ return encoder_out
366
+
367
+ def max_positions(self):
368
+ """Maximum input length supported by the encoder."""
369
+ return None
370
+
371
+ def upgrade_state_dict_named(self, state_dict, name):
372
+ return state_dict
373
+
374
+
375
+ class HubertEncoderWrapper(FairseqEncoder):
376
+ def __init__(self, w2v_model):
377
+ super().__init__(None)
378
+ self.w2v_model = w2v_model
379
+
380
+ def forward(self, source, padding_mask, **kwargs):
381
+ w2v_args = {
382
+ "source": source,
383
+ "padding_mask": padding_mask,
384
+ }
385
+
386
+ x, padding_mask = self.w2v_model.extract_finetune(**w2v_args)
387
+ # B x T x C -> T x B x C
388
+ x = x.transpose(0, 1)
389
+
390
+ return {
391
+ "encoder_out": x, # T x B x C
392
+ "encoder_padding_mask": padding_mask, # B x T
393
+ "padding_mask": padding_mask
394
+ }
395
+
396
+ def reorder_encoder_out(self, encoder_out, new_order):
397
+ if encoder_out["encoder_out"] is not None:
398
+ encoder_out["encoder_out"] = encoder_out[
399
+ "encoder_out"
400
+ ].index_select(1, new_order)
401
+ if encoder_out["encoder_padding_mask"] is not None:
402
+ encoder_out["encoder_padding_mask"] = encoder_out[
403
+ "encoder_padding_mask"
404
+ ].index_select(0, new_order)
405
+ if encoder_out["padding_mask"] is not None:
406
+ encoder_out["padding_mask"] = encoder_out[
407
+ "padding_mask"
408
+ ].index_select(0, new_order)
409
+ return encoder_out
410
+
411
+ @register_model("av_hubert_seq2seq", dataclass=AVHubertSeq2SeqConfig)
412
+ class AVHubertSeq2Seq(FairseqEncoderDecoderModel):
413
+ def __init__(self, encoder, decoder, tgt_dict, cfg):
414
+ super().__init__(encoder, decoder)
415
+ self.cfg = cfg
416
+ self.freeze_finetune_updates = cfg.freeze_finetune_updates
417
+
418
+ @classmethod
419
+ def build_model(cls, cfg, task):
420
+ """Build a new model instance."""
421
+
422
+ arg_overrides = {
423
+ "dropout": cfg.dropout,
424
+ "activation_dropout": cfg.activation_dropout,
425
+ "dropout_input": cfg.dropout_input,
426
+ "attention_dropout": cfg.attention_dropout,
427
+ "mask_length": cfg.mask_length,
428
+ "mask_prob": cfg.mask_prob,
429
+ "mask_selection": cfg.mask_selection,
430
+ "mask_other": cfg.mask_other,
431
+ "no_mask_overlap": cfg.no_mask_overlap,
432
+ "mask_channel_length": cfg.mask_channel_length,
433
+ "mask_channel_prob": cfg.mask_channel_prob,
434
+ "mask_channel_selection": cfg.mask_channel_selection,
435
+ "mask_channel_other": cfg.mask_channel_other,
436
+ "no_mask_channel_overlap": cfg.no_mask_channel_overlap,
437
+ "encoder_layerdrop": cfg.layerdrop,
438
+ "feature_grad_mult": cfg.feature_grad_mult,
439
+ }
440
+
441
+ if cfg.w2v_args is None:
442
+ state = checkpoint_utils.load_checkpoint_to_cpu(
443
+ cfg.w2v_path, arg_overrides
444
+ )
445
+ w2v_args = state.get("cfg", None)
446
+ if w2v_args is None:
447
+ w2v_args = convert_namespace_to_omegaconf(state["args"])
448
+ cfg.w2v_args = w2v_args
449
+ else:
450
+ state = None
451
+ w2v_args = cfg.w2v_args
452
+ if isinstance(w2v_args, Namespace):
453
+ cfg.w2v_args = w2v_args = convert_namespace_to_omegaconf(
454
+ w2v_args
455
+ )
456
+
457
+ assert cfg.normalize == w2v_args.task.normalize, (
458
+ "Fine-tuning works best when data normalization is the same. "
459
+ "Please check that --normalize is set or unset for "
460
+ "both pre-training and here"
461
+ )
462
+
463
+ w2v_args.task.data = cfg.data
464
+
465
+ task_pretrain = tasks.setup_task(w2v_args.task)
466
+ if state is not None:
467
+ task_pretrain.load_state_dict(state['task_state'])
468
+
469
+ encoder_ = task_pretrain.build_model(w2v_args.model)
470
+
471
+ encoder = HubertEncoderWrapper(encoder_)
472
+ if state is not None and not cfg.no_pretrained_weights:
473
+ # set strict=False because we omit some modules
474
+ del state['model']['mask_emb']
475
+ encoder.w2v_model.load_state_dict(state["model"], strict=False)
476
+
477
+ encoder.w2v_model.remove_pretraining_modules()
478
+
479
+ src_dict, tgt_dict = task.source_dictionary, task.target_dictionary
480
+
481
+ def build_embedding(dictionary, embed_dim):
482
+ num_embeddings = len(dictionary)
483
+ padding_idx = dictionary.pad()
484
+ emb = Embedding(num_embeddings, embed_dim, padding_idx=padding_idx)
485
+ return emb
486
+
487
+ decoder_embed_tokens = build_embedding(tgt_dict, cfg.decoder_embed_dim)
488
+ decoder = TransformerDecoder(cfg, tgt_dict, decoder_embed_tokens)
489
+
490
+ return AVHubertSeq2Seq(encoder, decoder, tgt_dict, cfg)
491
+
492
+
493
+ def forward(self, **kwargs):
494
+ ft = self.freeze_finetune_updates <= self.num_updates
495
+ with torch.no_grad() if not ft else contextlib.ExitStack():
496
+ output = self.encoder(**kwargs)
497
+ decoder_out = self.decoder(prev_output_tokens=kwargs['prev_output_tokens'], encoder_out=output)
498
+ return decoder_out
499
+
500
+ def upgrade_state_dict_named(self, state_dict, name):
501
+ super().upgrade_state_dict_named(state_dict, name)
502
+ return state_dict
503
+
504
+ def set_num_updates(self, num_updates):
505
+ """Set the number of parameters updates."""
506
+ super().set_num_updates(num_updates)
507
+ self.num_updates = num_updates
508
+
509
+ def Embedding(num_embeddings, embedding_dim, padding_idx):
510
+ m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
511
+ nn.init.normal_(m.weight, mean=0, std=embedding_dim ** -0.5)
512
+ nn.init.constant_(m.weight[padding_idx], 0)
513
+ return m
514
+
515
+
516
+ def Linear(in_features, out_features, bias=True):
517
+ m = nn.Linear(in_features, out_features, bias)
518
+ nn.init.xavier_uniform_(m.weight)
519
+ if bias:
520
+ nn.init.constant_(m.bias, 0.0)
521
+ return m
av_hubert/avhubert/hubert_criterion.py ADDED
@@ -0,0 +1,169 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import math
8
+ import re
9
+ from dataclasses import dataclass, field
10
+ from typing import List, Optional
11
+
12
+ import torch
13
+ import torch.nn.functional as F
14
+ from fairseq import metrics, utils
15
+ from fairseq.criterions import FairseqCriterion, register_criterion
16
+ from fairseq.dataclass import FairseqDataclass
17
+
18
+
19
+ @dataclass
20
+ class AVHubertCriterionConfig(FairseqDataclass):
21
+ pred_masked_weight: float = field(
22
+ default=1.0,
23
+ metadata={"help": "weight for predictive loss for masked frames"},
24
+ )
25
+ pred_nomask_weight: float = field(
26
+ default=0.0,
27
+ metadata={"help": "weight for predictive loss for unmasked frames"},
28
+ )
29
+ loss_weights: Optional[List[float]] = field(
30
+ default=None,
31
+ metadata={"help": "weights for additional loss terms (not first one)"},
32
+ )
33
+ log_keys: List[str] = field(
34
+ default_factory=lambda: [],
35
+ metadata={"help": "output keys to log"},
36
+ )
37
+
38
+
39
+ @register_criterion("av_hubert", dataclass=AVHubertCriterionConfig)
40
+ class AVHubertCriterion(FairseqCriterion):
41
+ def __init__(self, task, pred_masked_weight, pred_nomask_weight, loss_weights=None, log_keys=None):
42
+ super().__init__(task)
43
+ self.pred_masked_weight = pred_masked_weight
44
+ self.pred_nomask_weight = pred_nomask_weight
45
+ self.loss_weights = loss_weights
46
+ self.log_keys = [] if log_keys is None else log_keys
47
+
48
+ def forward(self, model, sample, reduce=True, log_pred=False):
49
+ """Compute the loss for the given sample.
50
+ Returns a tuple with three elements:
51
+ 1) the loss
52
+ 2) the sample size, which is used as the denominator for the gradient
53
+ 3) logging outputs to display while training
54
+ """
55
+ net_output = model(target_list=sample["target_list"], **sample["net_input"])
56
+ loss = 0.
57
+ sample_size = 0
58
+ logging_output = {}
59
+ reduction = "sum" if reduce else "none"
60
+
61
+ loss_m_list = []
62
+ logp_m_list, targ_m_list = net_output['logit_m_list'], net_output['target_m_list']
63
+ for i, (logp_m, targ_m) in enumerate(zip(logp_m_list, targ_m_list)):
64
+ loss_m = F.cross_entropy(logp_m, targ_m, reduction=reduction)
65
+ loss_m_list.append(loss_m)
66
+ logging_output[f"loss_m_{i}"] = loss_m.detach().item()
67
+ if self.pred_masked_weight > 0:
68
+ loss += self.pred_masked_weight * sum(loss_m_list)
69
+ sample_size += targ_m_list[0].numel()
70
+
71
+ loss_u_list = []
72
+ logp_u_list, targ_u_list = net_output['logit_u_list'], net_output['target_u_list']
73
+ for i, (logp_u, targ_u) in enumerate(zip(logp_u_list, targ_u_list)):
74
+ loss_u = F.cross_entropy(logp_u, targ_u, reduction=reduction)
75
+ loss_u_list.append(loss_u)
76
+ logging_output[f"loss_u_{i}"] = loss_u.detach().item()
77
+ if self.pred_nomask_weight > 0:
78
+ loss += self.pred_nomask_weight * sum(loss_u_list)
79
+ sample_size += targ_u_list[0].numel()
80
+
81
+ if self.loss_weights is not None:
82
+ assert hasattr(model, "get_extra_losses")
83
+ extra_losses, names = model.get_extra_losses(net_output)
84
+ if torch.is_tensor(extra_losses):
85
+ extra_losses = [extra_losses]
86
+ names = [names]
87
+ if len(self.loss_weights) == 1 and len(extra_losses) != 1:
88
+ self.loss_weights = [self.loss_weights[0]] * len(extra_losses)
89
+ assert len(extra_losses) == len(self.loss_weights), f"{len(extra_losses)}, {len(self.loss_weights)}"
90
+ for p, n, coef in zip(extra_losses, names, self.loss_weights):
91
+ if coef != 0 and p is not None:
92
+ p = coef * p.float() * sample_size
93
+ loss += p
94
+ logging_output[f"loss_{n}"] = p.item()
95
+
96
+ logging_output = {
97
+ "loss": loss.item() if reduce else loss,
98
+ "ntokens": sample_size,
99
+ "nsentences": sample["id"].numel(),
100
+ "sample_size": sample_size,
101
+ **logging_output,
102
+ }
103
+
104
+ for lk in self.log_keys:
105
+ if lk in net_output:
106
+ logging_output[lk] = float((net_output[lk]))
107
+
108
+ with torch.no_grad():
109
+ for i, logp_m in enumerate(logp_m_list):
110
+ # corr_m, count_m = compute_correct(logp_m)
111
+ if logp_m.numel() == 0:
112
+ corr_m, count_m = 0, 0
113
+ else:
114
+ corr_m, count_m = (logp_m.argmax(dim=-1)==targ_m_list[i]).sum().item(), len(targ_m_list[i])
115
+ logging_output[f"correct_m_{i}"] = corr_m
116
+ logging_output[f"count_m_{i}"] = count_m
117
+
118
+ for i, logp_u in enumerate(logp_u_list):
119
+ if logp_u.numel() == 0:
120
+ corr_u, count_u = 0, 0
121
+ else:
122
+ corr_u, count_u = (logp_u.argmax(dim=-1)==targ_u_list[i]).sum().item(), len(targ_u_list[i])
123
+ logging_output[f"correct_u_{i}"] = corr_u
124
+ logging_output[f"count_u_{i}"] = count_u
125
+
126
+ return loss, sample_size, logging_output
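+ # Loss sketch: total = pred_masked_weight * sum(CE over masked frames)
+ #                    + pred_nomask_weight * sum(CE over unmasked frames)
+ #                    + sum_i loss_weights[i] * extra_losses[i] * sample_size,
+ # where sample_size counts the target frames that entered the CE terms.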
127
+
128
+ @staticmethod
129
+ def reduce_metrics(logging_outputs) -> None:
130
+ """Aggregate logging outputs from data parallel training (copied from normal cross entropy)."""
131
+ loss_sum = sum(log.get("loss", 0) for log in logging_outputs)
132
+ ntokens = sum(log.get("ntokens", 0) for log in logging_outputs)
133
+ sample_size = sum(log.get("sample_size", 0) for log in logging_outputs)
134
+
135
+ metrics.log_scalar("loss", loss_sum / sample_size / math.log(2), sample_size, round=3)
136
+ if sample_size != ntokens:
137
+ metrics.log_scalar("nll_loss", loss_sum / ntokens / math.log(2), ntokens, round=3)
138
+ metrics.log_derived("ppl", lambda meters: utils.get_perplexity(meters["nll_loss"].avg))
139
+ else:
140
+ metrics.log_derived("ppl", lambda meters: utils.get_perplexity(meters["loss"].avg))
141
+
142
+ counts = {}
143
+ for lk in logging_outputs[0].keys():
144
+ if lk.startswith("count_"):
145
+ val = sum(log[lk] for log in logging_outputs)
146
+ metrics.log_scalar(lk, val)
147
+ counts[lk] = val
148
+
149
+ for lk in logging_outputs[0].keys():
150
+ if lk.startswith("loss_"):
151
+ val = sum(log[lk] for log in logging_outputs)
152
+ metrics.log_scalar(lk, val / sample_size / math.log(2), round=3)
153
+ elif lk.startswith("correct_"):
154
+ val = sum(log[lk] for log in logging_outputs)
155
+ metrics.log_scalar(lk, val / counts[re.sub("correct", "count", lk)])
156
+
157
+ @staticmethod
158
+ def aggregate_logging_outputs(logging_outputs):
159
+ """Aggregate logging outputs from data parallel training."""
160
+ raise NotImplementedError()
161
+
162
+ @staticmethod
163
+ def logging_outputs_can_be_summed() -> bool:
164
+ """
165
+ Whether the logging outputs returned by `forward` can be summed
166
+ across workers prior to calling `reduce_metrics`. Setting this
167
+ to True will improve distributed training speed.
168
+ """
169
+ return False
av_hubert/avhubert/hubert_dataset.py ADDED
@@ -0,0 +1,529 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import itertools
8
+ import logging
9
+ import os
10
+ import sys
11
+ import time
12
+ from typing import Any, List, Optional, Union
13
+
14
+ import numpy as np
15
+
16
+ import torch
17
+ import torch.nn.functional as F
18
+ from fairseq.data import data_utils
19
+ from fairseq.data.fairseq_dataset import FairseqDataset
20
+ from python_speech_features import logfbank
21
+ from scipy.io import wavfile
22
+
23
+ DBG=True if len(sys.argv) == 1 else False
24
+
25
+ if DBG:
26
+ import utils as custom_utils
27
+ logging.basicConfig(
28
+ format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
29
+ datefmt="%Y-%m-%d %H:%M:%S",
30
+ level=os.environ.get("LOGLEVEL", "DEBUG").upper(),
31
+ stream=sys.stdout,
32
+ )
33
+ else:
34
+ from . import utils as custom_utils
35
+
36
+ logger = logging.getLogger(__name__)
37
+
38
+
39
+ def load_audio_visual(manifest_path, max_keep, min_keep, frame_rate, label_paths, label_rates, tol=0.1):
40
+ def is_audio_label_aligned(audio_dur, label_durs):
41
+ return all([abs(audio_dur - label_dur)<tol for label_dur in label_durs])
42
+
43
+ n_long, n_short, n_unaligned = 0, 0, 0
44
+ names, inds, sizes = [], [], []
45
+ dur_from_label_list = []
46
+ is_seq_label = any([x==-1 for x in label_rates])
47
+ for label_path, label_rate in zip(label_paths, label_rates):
48
+ label_lengths = [len(line.rstrip().split())/label_rate for line in open(label_path).readlines()]
49
+ dur_from_label_list.append(label_lengths)
50
+ dur_from_label_list = list(zip(*dur_from_label_list))
51
+
52
+ with open(manifest_path) as f:
53
+ root = f.readline().strip()
54
+ for ind, line in enumerate(f):
55
+ items = line.strip().split("\t")
56
+ sz = int(items[-2])
57
+ if min_keep is not None and sz < min_keep:
58
+ n_short += 1
59
+ elif max_keep is not None and sz > max_keep:
60
+ n_long += 1
61
+ elif (not is_seq_label) and (not is_audio_label_aligned(sz/frame_rate, dur_from_label_list[ind])):
62
+ n_unaligned += 1
63
+ else:
64
+ video_path = items[1]
65
+ audio_path = items[2]
66
+ audio_id = items[0]
67
+ names.append((video_path, audio_path+':'+audio_id))
68
+ inds.append(ind)
69
+ sizes.append(sz)
70
+ tot = ind + 1
71
+ logger.info(
72
+ (
73
+ f"max_keep={max_keep}, min_keep={min_keep}, "
74
+ f"loaded {len(names)}, skipped {n_short} short and {n_long} long and {n_unaligned} unaligned, "
75
+ f"longest-loaded={max(sizes)}, shortest-loaded={min(sizes)}"
76
+ )
77
+ )
78
+ return root, names, inds, tot, sizes
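+ # Manifest layout implied by the parsing above: a first line holding the data
+ # root, then one tab-separated record per utterance with items[0] = utterance
+ # id, items[1] = video path, items[2] = audio path, and items[-2] = example
+ # size (used for min/max filtering and label-alignment checks).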
79
+
80
+ def load_label(label_path, inds, tot):
81
+ with open(label_path) as f:
82
+ labels = [line.rstrip() for line in f]
83
+ assert (
84
+ len(labels) == tot
85
+ ), f"number of labels does not match ({len(labels)} != {tot})"
86
+ labels = [labels[i] for i in inds]
87
+ return labels
88
+
89
+
90
+ def load_label_offset(label_path, inds, tot):
91
+ with open(label_path) as f:
92
+ code_lengths = [len(line.encode("utf-8")) for line in f]
93
+ assert (
94
+ len(code_lengths) == tot
95
+ ), f"number of labels does not match ({len(code_lengths)} != {tot})"
96
+ offsets = list(itertools.accumulate([0] + code_lengths))
97
+ offsets = [(offsets[i], offsets[i + 1]) for i in inds]
98
+ return offsets
99
+
100
+
101
+ def verify_label_lengths(
102
+ audio_sizes,
103
+ audio_rate,
104
+ label_path,
105
+ label_rate,
106
+ inds,
107
+ tot,
108
+ tol=0.1, # tolerance in seconds
109
+ ):
110
+ if label_rate < 0:
111
+ logger.info(f"{label_path} is sequence label. skipped")
112
+ return
113
+
114
+ with open(label_path) as f:
115
+ lengths = [len(line.rstrip().split()) for line in f]
116
+ assert len(lengths) == tot
117
+ lengths = [lengths[i] for i in inds]
118
+ num_invalid = 0
119
+ for i, ind in enumerate(inds):
120
+ dur_from_audio = audio_sizes[i] / audio_rate
121
+ dur_from_label = lengths[i] / label_rate
122
+ if abs(dur_from_audio - dur_from_label) > tol:
123
+ logger.warning(
124
+ (
125
+ f"audio and label duration differ too much "
126
+ f"(|{dur_from_audio} - {dur_from_label}| > {tol}) "
127
+ f"in line {ind+1} of {label_path}. Check if `label_rate` "
128
+ f"is correctly set (currently {label_rate}). "
129
+ f"num. of samples = {audio_sizes[i]}; "
130
+ f"label length = {lengths[i]}"
131
+ )
132
+ )
133
+ num_invalid += 1
134
+ if num_invalid > 0:
135
+ logger.warning(
136
+ f"total {num_invalid} (audio, label) pairs with mismatched lengths"
137
+ )
138
+
139
+
140
+ class AVHubertDataset(FairseqDataset):
141
+ def __init__(
142
+ self,
143
+ manifest_path: str,
144
+ sample_rate: float,
145
+ label_paths: List[str],
146
+ label_rates: Union[List[float], float], # -1 for sequence labels
147
+ pad_list: List[str],
148
+ eos_list: List[str],
149
+ label_processors: Optional[List[Any]] = None,
150
+ max_keep_sample_size: Optional[int] = None,
151
+ min_keep_sample_size: Optional[int] = None,
152
+ max_sample_size: Optional[int] = None,
153
+ shuffle: bool = True,
154
+ pad_audio: bool = False,
155
+ normalize: bool = False,
156
+ store_labels: bool = True,
157
+ random_crop: bool = False,
158
+ single_target: bool = False,
159
+ stack_order_audio: int=1,
160
+ skip_verify: bool=False,
161
+ image_mean: float=0,
162
+ image_std: float=1,
163
+ image_crop_size: int=88,
164
+ image_aug: bool=False,
165
+ modalities: Optional[List[str]]=None,
166
+ is_s2s=False,
167
+ noise_fn=None,
168
+ noise_prob=0,
169
+ noise_snr=0,
170
+ noise_num=1
171
+ ):
172
+ self.label_rates = (
173
+ [label_rates for _ in range(len(label_paths))]
174
+ if isinstance(label_rates, int)
175
+ else label_rates
176
+ )
177
+ self.modalities = set(modalities)
178
+ self.audio_root, self.names, inds, tot, self.sizes = load_audio_visual(manifest_path, max_keep_sample_size, min_keep_sample_size, frame_rate=sample_rate, label_paths=label_paths, label_rates=self.label_rates)
179
+ self.sample_rate = sample_rate
180
+ self.stack_order_audio = stack_order_audio
181
+ self.shuffle = shuffle
182
+ self.random_crop = random_crop
183
+
184
+ self.num_labels = len(label_paths)
185
+ self.pad_list = pad_list
186
+ self.eos_list = eos_list
187
+ self.label_processors = label_processors
188
+ self.single_target = single_target
189
+ self.store_labels = store_labels
190
+ self.is_s2s = is_s2s
191
+ self.noise_wav, self.noise_prob, self.noise_snr, self.noise_num = [ln.strip() for ln in open(noise_fn).readlines()] if noise_fn is not None else [], noise_prob, noise_snr, noise_num
192
+
193
+ assert self.single_target == (self.label_rates[0] == -1), f"single target should be equivalent to sequence label (label_rate==-1)"
194
+ if store_labels:
195
+ self.label_list = [load_label(p, inds, tot) for p in label_paths]
196
+ else:
197
+ self.label_paths = label_paths
198
+ self.label_offsets_list = [
199
+ load_label_offset(p, inds, tot) for p in label_paths
200
+ ]
201
+ assert (
202
+ label_processors is None
203
+ or len(label_processors) == self.num_labels
204
+ )
205
+ if not skip_verify:
206
+ for label_path, label_rate in zip(label_paths, self.label_rates):
207
+ verify_label_lengths(self.sizes, self.sample_rate, label_path, label_rate, inds, tot)
208
+ else:
209
+ logger.info(f"Skip label alignment verifying")
210
+
211
+ self.max_sample_size = (
212
+ max_sample_size if max_sample_size is not None else sys.maxsize
213
+ )
214
+ self.pad_audio = pad_audio
215
+ self.normalize = normalize
216
+ if image_aug:
217
+ self.transform = custom_utils.Compose([
218
+ custom_utils.Normalize( 0.0,255.0 ),
219
+ custom_utils.RandomCrop((image_crop_size, image_crop_size)),
220
+ custom_utils.HorizontalFlip(0.5),
221
+ custom_utils.Normalize(image_mean, image_std) ])
222
+ else:
223
+ self.transform = custom_utils.Compose([
224
+ custom_utils.Normalize( 0.0,255.0 ),
225
+ custom_utils.CenterCrop((image_crop_size, image_crop_size)),
226
+ custom_utils.Normalize(image_mean, image_std) ])
227
+ logger.info(f"image transform: {self.transform}")
228
+
229
+ logger.info(
230
+ f"pad_audio={pad_audio}, random_crop={random_crop}, "
231
+ f"normalize={normalize}, max_sample_size={self.max_sample_size}, "
232
+ f"seqs2seq data={self.is_s2s},")
233
+ logger.info(
234
+ f"Noise wav: {noise_fn}->{len(self.noise_wav)} wav, Prob: {self.noise_prob}, SNR: {self.noise_snr}, Number of mixture: {self.noise_num}"
235
+ )
236
+
237
+ def get_label(self, index, label_idx):
238
+ if self.store_labels:
239
+ label = self.label_list[label_idx][index]
240
+ else:
241
+ with open(self.label_paths[label_idx]) as f:
242
+ offset_s, offset_e = self.label_offsets_list[label_idx][index]
243
+ f.seek(offset_s)
244
+ label = f.read(offset_e - offset_s)
245
+
246
+ if self.label_processors is not None:
247
+ label = self.label_processors[label_idx](label)
248
+ return label
249
+
250
+ def get_labels(self, index):
251
+ return [self.get_label(index, i) for i in range(self.num_labels)]
252
+
253
+ def load_feature(self, mix_name):
254
+ """
255
+ Load image and audio feature
256
+ Returns:
257
+ video_feats: numpy.ndarray of shape [T, H, W, 1], audio_feats: numpy.ndarray of shape [T, F]
258
+ """
259
+ def stacker(feats, stack_order):
260
+ """
261
+ Concatenating consecutive audio frames
262
+ Args:
263
+ feats - numpy.ndarray of shape [T, F]
264
+ stack_order - int (number of neighboring frames to concatenate)
265
+ Returns:
266
+ feats - numpy.ndarray of shape [T', F']
267
+ """
268
+ feat_dim = feats.shape[1]
269
+ if len(feats) % stack_order != 0:
270
+ res = stack_order - len(feats) % stack_order
271
+ res = np.zeros([res, feat_dim]).astype(feats.dtype)
272
+ feats = np.concatenate([feats, res], axis=0)
273
+ feats = feats.reshape((-1, stack_order, feat_dim)).reshape(-1, stack_order*feat_dim)
274
+ return feats
275
+ video_fn, audio_fn = mix_name
276
+ if 'video' in self.modalities:
277
+ video_feats = self.load_video(video_fn) # [T, H, W, 1]
278
+ else:
279
+ video_feats = None
280
+ if 'audio' in self.modalities:
281
+ audio_fn = audio_fn.split(':')[0]
282
+ sample_rate, wav_data = wavfile.read(audio_fn)
283
+ assert sample_rate == 16_000 and len(wav_data.shape) == 1
284
+ if np.random.rand() < self.noise_prob:
285
+ wav_data = self.add_noise(wav_data)
286
+ audio_feats = logfbank(wav_data, samplerate=sample_rate).astype(np.float32) # [T, F]
287
+ audio_feats = stacker(audio_feats, self.stack_order_audio) # [T/stack_order_audio, F*stack_order_audio]
288
+ else:
289
+ audio_feats = None
290
+ if audio_feats is not None and video_feats is not None:
291
+ diff = len(audio_feats) - len(video_feats)
292
+ if diff < 0:
293
+ audio_feats = np.concatenate([audio_feats, np.zeros([-diff, audio_feats.shape[-1]], dtype=audio_feats.dtype)])
294
+ elif diff > 0:
295
+ audio_feats = audio_feats[:-diff]
296
+ return video_feats, audio_feats
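+ # stacker shape example (illustrative): audio_feats [T=103, F=26] with
+ # stack_order=4 gets 1 zero frame of padding and becomes [26, 104]
+ # (T' = ceil(103/4) = 26, F' = 4*26 = 104), which matches 25 fps video when
+ # the filterbanks are computed at 100 frames per second.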
297
+
298
+ def load_video(self, audio_name):
299
+ feats = custom_utils.load_video(os.path.join(self.audio_root, audio_name))
300
+ feats = self.transform(feats)
301
+ feats = np.expand_dims(feats, axis=-1)
302
+ return feats
303
+
304
+ def select_noise(self):
305
+ rand_indexes = np.random.randint(0, len(self.noise_wav), size=self.noise_num)
306
+ noise_wav = []
307
+ for x in rand_indexes:
308
+ noise_wav.append(wavfile.read(self.noise_wav[x])[1].astype(np.float32))
309
+ if self.noise_num == 1:
310
+ return noise_wav[0]
311
+ else:
312
+ min_len = min([len(x) for x in noise_wav])
313
+ noise_wav = [x[:min_len] for x in noise_wav]
314
+ noise_wav = np.floor(np.stack(noise_wav).mean(axis=0))
315
+ return noise_wav
316
+
317
+ def add_noise(self, clean_wav):
318
+ clean_wav = clean_wav.astype(np.float32)
319
+ noise_wav = self.select_noise()
320
+ if type(self.noise_snr) == int or type(self.noise_snr) == float:
321
+ snr = self.noise_snr
322
+ elif type(self.noise_snr) == tuple:
323
+ snr = np.random.randint(self.noise_snr[0], self.noise_snr[1]+1)
324
+ clean_rms = np.sqrt(np.mean(np.square(clean_wav), axis=-1))
325
+ if len(clean_wav) > len(noise_wav):
326
+ ratio = int(np.ceil(len(clean_wav)/len(noise_wav)))
327
+ noise_wav = np.concatenate([noise_wav for _ in range(ratio)])
328
+ if len(clean_wav) < len(noise_wav):
329
+ start = 0
330
+ noise_wav = noise_wav[start: start + len(clean_wav)]
331
+ noise_rms = np.sqrt(np.mean(np.square(noise_wav), axis=-1))
332
+ adjusted_noise_rms = clean_rms / (10**(snr/20))
333
+ adjusted_noise_wav = noise_wav * (adjusted_noise_rms / noise_rms)
334
+ mixed = clean_wav + adjusted_noise_wav
335
+
336
+ #Avoid clipping noise
337
+ max_int16 = np.iinfo(np.int16).max
338
+ min_int16 = np.iinfo(np.int16).min
339
+ if mixed.max(axis=0) > max_int16 or mixed.min(axis=0) < min_int16:
340
+ if mixed.max(axis=0) >= abs(mixed.min(axis=0)):
341
+ reduction_rate = max_int16 / mixed.max(axis=0)
342
+ else :
343
+ reduction_rate = min_int16 / mixed.min(axis=0)
344
+ mixed = mixed * (reduction_rate)
345
+ mixed = mixed.astype(np.int16)
346
+ return mixed
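+ # SNR mixing example (illustrative): for snr=5 dB the noise is rescaled so
+ # that clean_rms / adjusted_noise_rms = 10**(5/20) ~= 1.78; the mixture is
+ # then attenuated only if it would clip outside the int16 range.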
347
+
348
+ def __getitem__(self, index):
349
+ video_feats, audio_feats = self.load_feature(self.names[index])
350
+ audio_feats, video_feats = torch.from_numpy(audio_feats.astype(np.float32)) if audio_feats is not None else None, torch.from_numpy(video_feats.astype(np.float32)) if video_feats is not None else None
351
+ if self.normalize and 'audio' in self.modalities:
352
+ with torch.no_grad():
353
+ audio_feats = F.layer_norm(audio_feats, audio_feats.shape[1:])
354
+ labels = self.get_labels(index)
355
+ fid = self.names[index][1].split(':')[1]
356
+ return {"id": index, 'fid': fid, "video_source": video_feats, 'audio_source': audio_feats, "label_list": labels}
357
+
358
+ def __len__(self):
359
+ return len(self.sizes)
360
+
361
+ def crop_to_max_size(self, wav, target_size, start=None):
362
+ size = len(wav)
363
+ diff = size - target_size
364
+ if diff <= 0:
365
+ return wav, 0
366
+ # longer utterances
367
+ if start is None:
368
+ start, end = 0, target_size
369
+ if self.random_crop:
370
+ start = np.random.randint(0, diff + 1)
371
+ end = size - diff + start
372
+ else:
373
+ end = start + target_size
374
+ return wav[start:end], start
375
+
376
+ def collater(self, samples):
377
+ samples = [s for s in samples if s["id"] is not None]
378
+ if len(samples) == 0:
379
+ return {}
380
+
381
+ audio_source, video_source = [s["audio_source"] for s in samples], [s["video_source"] for s in samples]
382
+ if audio_source[0] is None:
383
+ audio_source = None
384
+ if video_source[0] is None:
385
+ video_source = None
386
+ if audio_source is not None:
387
+ audio_sizes = [len(s) for s in audio_source]
388
+ else:
389
+ audio_sizes = [len(s) for s in video_source]
390
+ if self.pad_audio:
391
+ audio_size = min(max(audio_sizes), self.max_sample_size)
392
+ else:
393
+ audio_size = min(min(audio_sizes), self.max_sample_size)
394
+ if audio_source is not None:
395
+ collated_audios, padding_mask, audio_starts = self.collater_audio(audio_source, audio_size)
396
+ else:
397
+ collated_audios, audio_starts = None, None
398
+ if video_source is not None:
399
+ collated_videos, padding_mask, audio_starts = self.collater_audio(video_source, audio_size, audio_starts)
400
+ else:
401
+ collated_videos = None
402
+ targets_by_label = [
403
+ [s["label_list"][i] for s in samples]
404
+ for i in range(self.num_labels)
405
+ ]
406
+ targets_list, lengths_list, ntokens_list = self.collater_label(
407
+ targets_by_label, audio_size, audio_starts
408
+ )
409
+ source = {"audio": collated_audios, "video": collated_videos}
410
+ net_input = {"source": source, "padding_mask": padding_mask}
411
+ batch = {
412
+ "id": torch.LongTensor([s["id"] for s in samples]),
413
+ "net_input": net_input,
414
+ "utt_id": [s['fid'] for s in samples]
415
+ }
416
+
417
+ if self.single_target:
418
+ batch["target_lengths"] = lengths_list[0]
419
+ batch["ntokens"] = ntokens_list[0]
420
+ if self.is_s2s:
421
+ batch['target'], net_input['prev_output_tokens'] = targets_list[0][0], targets_list[0][1]
422
+ else:
423
+ batch["target"] = targets_list[0]
424
+ else:
425
+ batch["target_lengths_list"] = lengths_list
426
+ batch["ntokens_list"] = ntokens_list
427
+ batch["target_list"] = targets_list
428
+ return batch
429
+
430
+ def collater_audio(self, audios, audio_size, audio_starts=None):
431
+ audio_feat_shape = list(audios[0].shape[1:])
432
+ collated_audios = audios[0].new_zeros([len(audios), audio_size]+audio_feat_shape)
433
+ padding_mask = (
434
+ torch.BoolTensor(len(audios), audio_size).fill_(False)
435
+ )
436
+ start_known = audio_starts is not None
437
+ audio_starts = [0 for _ in audios] if not start_known else audio_starts
438
+ for i, audio in enumerate(audios):
439
+ diff = len(audio) - audio_size
440
+ if diff == 0:
441
+ collated_audios[i] = audio
442
+ elif diff < 0:
443
+ assert self.pad_audio
444
+ collated_audios[i] = torch.cat(
445
+ [audio, audio.new_full([-diff]+audio_feat_shape, 0.0)]
446
+ )
447
+ padding_mask[i, diff:] = True
448
+ else:
449
+ collated_audios[i], audio_starts[i] = self.crop_to_max_size(
450
+ audio, audio_size, audio_starts[i] if start_known else None
451
+ )
452
+ if len(audios[0].shape) == 2:
453
+ collated_audios = collated_audios.transpose(1, 2) # [B, T, F] -> [B, F, T]
454
+ else:
455
+ collated_audios = collated_audios.permute((0, 4, 1, 2, 3)).contiguous() # [B, T, H, W, C] -> [B, C, T, H, W]
456
+ return collated_audios, padding_mask, audio_starts
457
+
458
+ def collater_frm_label(
459
+ self, targets, audio_size, audio_starts, label_rate, pad
460
+ ):
461
+ assert label_rate > 0
462
+ s2f = label_rate / self.sample_rate # num label per sample
463
+ frm_starts = [int(round(s * s2f)) for s in audio_starts]
464
+ frm_size = int(round(audio_size * s2f))
465
+ if not self.pad_audio:
466
+ rem_size = [len(t) - s for t, s in zip(targets, frm_starts)]
467
+ frm_size = min(frm_size, *rem_size)
468
+ targets = [t[s: s + frm_size] for t, s in zip(targets, frm_starts)]
469
+ logger.debug(f"audio_starts={audio_starts}")
470
+ logger.debug(f"frame_starts={frm_starts}")
471
+ logger.debug(f"frame_size={frm_size}")
472
+
473
+ lengths = torch.LongTensor([len(t) for t in targets])
474
+ ntokens = lengths.sum().item()
475
+ targets = data_utils.collate_tokens(
476
+ targets, pad_idx=pad, left_pad=False
477
+ )
478
+ return targets, lengths, ntokens
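+ # Frame-label alignment sketch: s2f = label_rate / sample_rate converts a
+ # crop offset measured in samples into label frames, so frame-level targets
+ # are cropped with the same relative offsets as the features and stay
+ # time-aligned after batching.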
479
+
480
+ def collater_seq_label(self, targets, pad):
481
+ lengths = torch.LongTensor([len(t) for t in targets])
482
+ ntokens = lengths.sum().item()
483
+ targets = data_utils.collate_tokens(
484
+ targets, pad_idx=pad, left_pad=False
485
+ )
486
+ return targets, lengths, ntokens
487
+
488
+ def collater_seq_label_s2s(self, targets, pad):
489
+ lengths = torch.LongTensor([len(t) for t in targets])
490
+ ntokens = lengths.sum().item()
491
+ pad, eos = self.label_processors[0].dictionary.pad(), self.label_processors[0].dictionary.eos()
492
+ targets_ = data_utils.collate_tokens(targets, pad_idx=pad, eos_idx=eos, left_pad=False)
493
+ prev_output_tokens = data_utils.collate_tokens(targets, pad_idx=pad, eos_idx=eos, left_pad=False, move_eos_to_beginning=True)
494
+ return (targets_, prev_output_tokens), lengths, ntokens
495
+
496
+ def collater_label(self, targets_by_label, audio_size, audio_starts):
497
+ targets_list, lengths_list, ntokens_list = [], [], []
498
+ itr = zip(targets_by_label, self.label_rates, self.pad_list)
499
+ for targets, label_rate, pad in itr:
500
+ if label_rate == -1:
501
+ if self.is_s2s:
502
+ targets, lengths, ntokens = self.collater_seq_label_s2s(targets, pad)
503
+ else:
504
+ targets, lengths, ntokens = self.collater_seq_label(targets, pad)
505
+ else:
506
+ targets, lengths, ntokens = self.collater_frm_label(
507
+ targets, audio_size, audio_starts, label_rate, pad
508
+ )
509
+ targets_list.append(targets)
510
+ lengths_list.append(lengths)
511
+ ntokens_list.append(ntokens)
512
+ return targets_list, lengths_list, ntokens_list
513
+
514
+ def num_tokens(self, index):
515
+ return self.size(index)
516
+
517
+ def size(self, index):
518
+ if self.pad_audio:
519
+ return self.sizes[index]
520
+ return min(self.sizes[index], self.max_sample_size)
521
+
522
+ def ordered_indices(self):
523
+ if self.shuffle:
524
+ order = [np.random.permutation(len(self))]
525
+ else:
526
+ order = [np.arange(len(self))]
527
+
528
+ order.append(self.sizes)
529
+ return np.lexsort(order)[::-1]
av_hubert/avhubert/hubert_pretraining.py ADDED
@@ -0,0 +1,400 @@
+ # Copyright (c) Facebook, Inc. and its affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import logging
+ import os, glob
+ import sys
+ from typing import Dict, List, Optional, Tuple
+
+ import numpy as np
+
+ from dataclasses import dataclass, field
+ from fairseq import metrics, search
+ from fairseq.data import Dictionary, encoders
+ from fairseq.dataclass.configs import FairseqDataclass
+ from fairseq.tasks import register_task
+ from fairseq.tasks.fairseq_task import FairseqTask
+ # build_generator below references SequenceGeneratorWithAlignment when
+ # print_alignment is set; import it from fairseq so that branch works
+ from fairseq.sequence_generator import SequenceGeneratorWithAlignment
+ from omegaconf import MISSING, II
+ from argparse import Namespace
+
+ # debug mode: running the module directly (no CLI args) uses flat imports
+ DBG = len(sys.argv) == 1
+
+ if DBG:
+     from hubert_dataset import AVHubertDataset
+     from sequence_generator import SequenceGenerator
+ else:
+     from .hubert_dataset import AVHubertDataset
+     from .sequence_generator import SequenceGenerator
+
+ logger = logging.getLogger(__name__)
+
+
+ class LabelEncoder(object):
+     def __init__(self, dictionary: Dictionary) -> None:
+         self.dictionary = dictionary
+
+     def __call__(self, label: str) -> List[str]:
+         return self.dictionary.encode_line(
+             label, append_eos=False, add_if_not_exist=False,
+         )
+
+ class LabelEncoderS2SToken(object):
+     def __init__(self, dictionary: Dictionary, bpe_tokenizer) -> None:
+         self.bpe_tokenizer = bpe_tokenizer
+         self.dictionary = dictionary
+
+     def __call__(self, label: str) -> List[str]:
+         label = self.bpe_tokenizer.encode(label.lower())
+         return self.dictionary.encode_line(
+             label, append_eos=True, add_if_not_exist=False,
+         ).long()
+
+     def decode(self, tok, symbols_ignore=None):
+         tok = self.dictionary.string(tok, extra_symbols_to_ignore=symbols_ignore)
+         if self.bpe_tokenizer:
+             tok = self.bpe_tokenizer.decode(tok)
+         return tok
+
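How the two encoders are used in practice — a minimal sketch with a toy Dictionary (real runs load dict.{label}.txt; see load_dictionaries below):

from fairseq.data import Dictionary

d = Dictionary()
for w in ["HELLO", "WORLD"]:
    d.add_symbol(w)
enc = LabelEncoder(d)
ids = enc("HELLO WORLD")   # tensor of symbol ids; append_eos=False keeps it plain
print(d.string(ids))       # -> "HELLO WORLD"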
+ @dataclass
+ class AVHubertPretrainingConfig(FairseqDataclass):
+     data: str = field(
+         default=MISSING, metadata={"help": "path to data directory"}
+     )
+     labels: List[str] = field(
+         default_factory=lambda: ["ltr"],
+         metadata={
+             "help": (
+                 "extension of the label files to load, frame-level labels for"
+                 " pre-training, and sequence-level label for fine-tuning"
+             )
+         },
+     )
+     label_dir: Optional[str] = field(
+         default=None,
+         metadata={
+             "help": "if set, looks for labels in this directory instead",
+         },
+     )
+     label_rate: int = field(
+         default=-1,
+         metadata={"help": "label frame rate. -1 for sequence label"},
+     )
+
+     sample_rate: int = field(
+         default=16_000,
+         metadata={
+             "help": "target sample rate. audio files will be up/down "
+             "sampled to this rate"
+         },
+     )
+     normalize: bool = field(
+         default=False,
+         metadata={
+             "help": "if set, normalizes input to have 0 mean and unit variance"
+         },
+     )
+     enable_padding: bool = field(
+         default=False,
+         metadata={"help": "pad shorter samples instead of cropping"},
+     )
+     max_sample_size: Optional[int] = field(
+         default=None,
+         metadata={"help": "max sample size to keep in training"},
+     )
+     min_sample_size: Optional[int] = field(
+         default=None,
+         metadata={"help": "min sample size to keep in training"},
+     )
+     max_trim_sample_size: Optional[int] = field(
+         default=II("task.max_sample_size"),
+         metadata={"help": "max sample size to trim to for batching"},
+     )
+     single_target: Optional[bool] = field(
+         default=False,
+         metadata={
+             "help": "if set, AddTargetDatasets outputs same keys "
+             "as AddTargetDataset"
+         },
+     )
+     random_crop: Optional[bool] = field(
+         default=True,
+         metadata={"help": "always crop from the beginning if false"},
+     )
+     pad_audio: Optional[bool] = field(
+         default=False,
+         metadata={"help": "pad audio to the longest one in the batch if true"},
+     )
+     pdb: Optional[bool] = field(
+         default=False,
+         metadata={"help": "drop into pdb on task setup (debugging)"},
+     )
+     stack_order_audio: int = field(
+         default=1,
+         metadata={"help": "concatenate n consecutive audio frames for one step"},
+     )
+     skip_verify: Optional[bool] = field(
+         default=False,
+         metadata={"help": "skip verifying label-audio alignment"},
+     )
+     image_aug: bool = field(default=False, metadata={'help': 'image data augmentation'})
+     image_crop_size: int = field(
+         default=88, metadata={"help": "image ROI size"})
+     image_mean: float = field(
+         default=0.421, metadata={"help": "image mean"})
+     image_std: float = field(
+         default=0.165, metadata={"help": "image std"})
+     modalities: Optional[List[str]] = field(default_factory=lambda: ["audio", "video"], metadata={'help': 'modalities to load'})
+     is_s2s: bool = field(default=False, metadata={'help': 'seq2seq fine-tuning only'})
+     tokenizer_bpe_name: Optional[str] = field(default=None, metadata={'help': 'tokenizer model name'})
+     tokenizer_bpe_model: Optional[str] = field(default=None, metadata={'help': 'tokenizer model path'})
+     noise_wav: Optional[str] = field(default=None, metadata={'help': 'manifest of noise wav files (one wav file path per line)'})
+     noise_prob: float = field(default=0, metadata={'help': 'noise probability'})
+     noise_snr: Optional[str] = field(default='0', metadata={'help': 'noise SNR in audio'})
+     noise_num: int = field(default=1, metadata={'help': 'number of noise wav files to mix'})
+     fine_tuning: bool = field(default=False, metadata={"help": "set to true if fine-tuning AV-Hubert"})
+
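The dataclass doubles as the task's Hydra schema, so its fields are what task.* overrides set in the YAML configs. A minimal sketch of direct construction, with hypothetical paths:

cfg = AVHubertPretrainingConfig(
    data="/path/to/manifests",    # directory holding {split}.tsv
    label_dir="/path/to/labels",  # holds {split}.km and dict.km.txt
    labels=["km"],
    label_rate=25,                # frame-level targets; -1 would mean sequence labels
    modalities=["audio", "video"],
)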
+ @register_task("av_hubert_pretraining", dataclass=AVHubertPretrainingConfig)
+ class AVHubertPretrainingTask(FairseqTask):
+
+     cfg: AVHubertPretrainingConfig
+
+     def __init__(
+         self,
+         cfg: AVHubertPretrainingConfig,
+     ) -> None:
+         super().__init__(cfg)
+
+         logger.info(f"current directory is {os.getcwd()}")
+         logger.info(f"AVHubertPretrainingTask Config {cfg}")
+
+         self.fine_tuning = cfg.fine_tuning
+         if cfg.fine_tuning:
+             self.state.add_factory("target_dictionary", self.load_dictionaries)
+             if cfg.is_s2s:
+                 self.state.add_factory("s2s_tokenizer", self.load_tokenizer)
+         else:
+             self.state.add_factory("dictionaries", self.load_dictionaries)
+
+         self.blank_symbol = "<s>"
+
+     @property
+     def source_dictionary(self) -> Optional[Dictionary]:
+         return None  # self._source_dictionary
+
+     @property
+     def target_dictionary(self) -> Optional[Dictionary]:
+         return self.state.target_dictionary  # self._target_dictionary
+
+     @property
+     def dictionaries(self) -> List[Dictionary]:
+         return self.state.dictionaries
+
+     def load_dictionaries(self):
+         label_dir = self.cfg.data if self.cfg.label_dir is None else self.cfg.label_dir
+         dictionaries = [
+             Dictionary.load(f"{label_dir}/dict.{label}.txt")
+             for label in self.cfg.labels
+         ]
+         return dictionaries[0] if self.cfg.fine_tuning else dictionaries
+
+     def load_tokenizer(self):
+         bpe_args = Namespace(**{'bpe': self.cfg.tokenizer_bpe_name, f"{self.cfg.tokenizer_bpe_name}_model": self.cfg.tokenizer_bpe_model})
+         bpe_tokenizer = encoders.build_bpe(bpe_args)
+         return bpe_tokenizer
+
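What load_tokenizer assembles, spelled out — a minimal sketch assuming tokenizer_bpe_name='sentencepiece' and a hypothetical model path:

from argparse import Namespace
from fairseq.data import encoders

bpe_args = Namespace(bpe="sentencepiece", sentencepiece_model="/path/to/spm.model")
bpe_tokenizer = encoders.build_bpe(bpe_args)
pieces = bpe_tokenizer.encode("hello world")  # subword pieces, model-dependent
text = bpe_tokenizer.decode(pieces)           # should round-trip to "hello world"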
+     @property
+     def s2s_tokenizer(self):
+         return self.state.s2s_tokenizer
+
+     @classmethod
+     def setup_task(
+         cls, cfg: AVHubertPretrainingConfig, **kwargs
+     ) -> "AVHubertPretrainingTask":
+         if cfg.pdb:
+             import pdb
+             pdb.set_trace()
+         return cls(cfg)
+
+     def get_label_dir(self) -> str:
+         if self.cfg.label_dir is None:
+             return self.cfg.data
+         return self.cfg.label_dir
+
+     def load_dataset(self, split: str, **kwargs) -> None:
+         manifest = f"{self.cfg.data}/{split}.tsv"
+         dictionaries = [self.target_dictionary] if self.fine_tuning else self.dictionaries
+         pad_list = [dictionary.pad() for dictionary in dictionaries]
+         eos_list = [dictionary.eos() for dictionary in dictionaries]
+         if not self.cfg.is_s2s:
+             procs = [LabelEncoder(dictionary) for dictionary in dictionaries]
+         else:
+             logger.info("Using tokenizer")
+             bpe_tokenizer = self.s2s_tokenizer
+             procs = [LabelEncoderS2SToken(dictionary, bpe_tokenizer) for dictionary in dictionaries]
+         paths = [
+             f"{self.get_label_dir()}/{split}.{l}" for l in self.cfg.labels
+         ]
+         image_aug = self.cfg.image_aug if split == 'train' else False
+         noise_fn = f"{self.cfg.noise_wav}/{split}.tsv" if self.cfg.noise_wav is not None else None
+         noise_snr = eval(self.cfg.noise_snr)
+         noise_num = self.cfg.noise_num
+         self.datasets[split] = AVHubertDataset(
+             manifest,
+             sample_rate=self.cfg.sample_rate,
+             label_paths=paths,
+             label_rates=self.cfg.label_rate,
+             pad_list=pad_list,
+             eos_list=eos_list,
+             label_processors=procs,
+             max_keep_sample_size=self.cfg.max_sample_size,
+             min_keep_sample_size=self.cfg.min_sample_size,
+             max_sample_size=self.cfg.max_trim_sample_size,
+             pad_audio=self.cfg.pad_audio,
+             normalize=self.cfg.normalize,
+             store_labels=False,
+             random_crop=self.cfg.random_crop,
+             single_target=self.cfg.single_target,
+             stack_order_audio=self.cfg.stack_order_audio,
+             skip_verify=self.cfg.skip_verify,
+             image_mean=self.cfg.image_mean,
+             image_std=self.cfg.image_std,
+             image_crop_size=self.cfg.image_crop_size,
+             image_aug=image_aug,
+             modalities=self.cfg.modalities,
+             is_s2s=self.cfg.is_s2s,
+             noise_fn=noise_fn,
+             noise_prob=self.cfg.noise_prob,
+             noise_snr=noise_snr,
+             noise_num=noise_num
+         )
+
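Because noise_snr is stored as a string and eval()-ed above, both scalar and tuple forms parse — a small sketch (the tuple form assumes the dataset samples an SNR from the range):

assert eval('0') == 0              # fixed 0 dB SNR
assert eval('(-5, 5)') == (-5, 5)  # hypothetical range form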
+     def max_positions(self) -> Tuple[int, int]:
+         return (sys.maxsize, sys.maxsize)
+
+     def filter_indices_by_size(
+         self, indices: np.array, *args, **kwargs
+     ) -> np.array:
+         return indices
+
+     def build_generator(
+         self, models, args, seq_gen_cls=None, extra_gen_cls_kwargs=None, prefix_allowed_tokens_fn=None,
+     ):
+         """
+         Build a :class:`~fairseq.SequenceGenerator` instance for this
+         task.
+         Args:
+             models (List[~fairseq.models.FairseqModel]): ensemble of models
+             args (fairseq.dataclass.configs.GenerationConfig):
+                 configuration object (dataclass) for generation
+             extra_gen_cls_kwargs (Dict[str, Any]): extra options to pass
+                 through to SequenceGenerator
+             prefix_allowed_tokens_fn (Callable[[int, torch.Tensor], List[int]]):
+                 If provided, this function constrains the beam search to
+                 allowed tokens only at each step. The provided function
+                 should take 2 arguments: the batch ID (`batch_id: int`)
+                 and a unidimensional tensor of token ids (`inputs_ids:
+                 torch.Tensor`). It has to return a `List[int]` with the
+                 allowed tokens for the next generation step conditioned
+                 on the previously generated tokens (`inputs_ids`) and
+                 the batch ID (`batch_id`). This argument is useful for
+                 constrained generation conditioned on the prefix, as
+                 described in "Autoregressive Entity Retrieval"
+                 (https://arxiv.org/abs/2010.00904) and
+                 https://github.com/facebookresearch/GENRE.
+         """
+         if getattr(args, "score_reference", False):
+             from fairseq.sequence_scorer import SequenceScorer
+
+             return SequenceScorer(
+                 self.target_dictionary,
+                 compute_alignment=getattr(args, "print_alignment", False),
+             )
+
+         # Choose search strategy. Defaults to Beam Search.
+         sampling = getattr(args, "sampling", False)
+         sampling_topk = getattr(args, "sampling_topk", -1)
+         sampling_topp = getattr(args, "sampling_topp", -1.0)
+         diverse_beam_groups = getattr(args, "diverse_beam_groups", -1)
+         diverse_beam_strength = getattr(args, "diverse_beam_strength", 0.5)
+         match_source_len = getattr(args, "match_source_len", False)
+         diversity_rate = getattr(args, "diversity_rate", -1)
+         constrained = getattr(args, "constraints", False)
+         if prefix_allowed_tokens_fn is None:
+             prefix_allowed_tokens_fn = getattr(args, "prefix_allowed_tokens_fn", None)
+         if (
+             sum(
+                 int(cond)
+                 for cond in [
+                     sampling,
+                     diverse_beam_groups > 0,
+                     match_source_len,
+                     diversity_rate > 0,
+                 ]
+             )
+             > 1
+         ):
+             raise ValueError("Provided Search parameters are mutually exclusive.")
+         assert sampling_topk < 0 or sampling, "--sampling-topk requires --sampling"
+         assert sampling_topp < 0 or sampling, "--sampling-topp requires --sampling"
+
+         if sampling:
+             search_strategy = search.Sampling(
+                 self.target_dictionary, sampling_topk, sampling_topp
+             )
+         elif diverse_beam_groups > 0:
+             search_strategy = search.DiverseBeamSearch(
+                 self.target_dictionary, diverse_beam_groups, diverse_beam_strength
+             )
+         elif match_source_len:
+             # this is useful for tagging applications where the output
+             # length should match the input length, so we hardcode the
+             # length constraints for simplicity
+             search_strategy = search.LengthConstrainedBeamSearch(
+                 self.target_dictionary,
+                 min_len_a=1,
+                 min_len_b=0,
+                 max_len_a=1,
+                 max_len_b=0,
+             )
+         elif diversity_rate > -1:
+             search_strategy = search.DiverseSiblingsSearch(
+                 self.target_dictionary, diversity_rate
+             )
+         elif constrained:
+             search_strategy = search.LexicallyConstrainedBeamSearch(
+                 self.target_dictionary, args.constraints
+             )
+         elif prefix_allowed_tokens_fn:
+             search_strategy = search.PrefixConstrainedBeamSearch(
+                 self.target_dictionary, prefix_allowed_tokens_fn
+             )
+         else:
+             search_strategy = search.BeamSearch(self.target_dictionary)
+
+         extra_gen_cls_kwargs = extra_gen_cls_kwargs or {}
+         if seq_gen_cls is None:
+             if getattr(args, "print_alignment", False):
+                 seq_gen_cls = SequenceGeneratorWithAlignment
+                 extra_gen_cls_kwargs["print_alignment"] = args.print_alignment
+             else:
+                 seq_gen_cls = SequenceGenerator
+
+         return seq_gen_cls(
+             models,
+             self.target_dictionary,
+             beam_size=getattr(args, "beam", 5),
+             max_len_a=getattr(args, "max_len_a", 0),
+             max_len_b=getattr(args, "max_len_b", 200),
+             min_len=getattr(args, "min_len", 1),
+             normalize_scores=(not getattr(args, "unnormalized", False)),
+             len_penalty=getattr(args, "lenpen", 1),
+             unk_penalty=getattr(args, "unkpen", 0),
+             temperature=getattr(args, "temperature", 1.0),
+             match_source_len=getattr(args, "match_source_len", False),
+             no_repeat_ngram_size=getattr(args, "no_repeat_ngram_size", 0),
+             search_strategy=search_strategy,
+             **extra_gen_cls_kwargs,
+         )
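End-to-end, the task's generator is used like any fairseq generator — a minimal sketch with hypothetical task/model/sample objects:

from argparse import Namespace

gen_args = Namespace(beam=5, max_len_a=0, max_len_b=200, min_len=1)
generator = task.build_generator([model], gen_args)   # falls through to BeamSearch
hypos = task.inference_step(generator, [model], sample)
best_tokens = hypos[0][0]["tokens"]                   # top hypothesis, first utterance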